Example #1
0
def getSkippedWordInfo(baseFolder):
    output = []
    splitter = "\n------\n"

    output.append("Greek:\n")
    output.append(
        utils.getContent("output/greek/no_split/top250/chosenWordInfo.txt",
                         False))
    output.append("\nPoetry:")
    output.append(
        utils.getContent(
            "output/greek/no_split/top250+p/chosenWordInfoPoetry.txt", False))
    output.append(splitter)
    output.append("English:\n")
    output.append(
        utils.getContent("output/english/no_split/top250/chosenWordInfo.txt",
                         False))
    output.append("\nPoetry:")
    output.append(
        utils.getContent(
            "output/english/no_split/top250+p/chosenWordInfoPoetry.txt",
            False))
    output.append(splitter)
    output.append("Icelandic:\n")
    output.append(
        utils.getContent("output/icelandic/no_split/top250/chosenWordInfo.txt",
                         False))
    output.append(splitter)

    utils.safeWrite("%s/skippedWords.txt" % baseFolder, "\n".join(output))
Example #2
0
def getTokenColorMap(saveDir, topWords, topName):
    numTops = len(topWords)
    tokenMap = []

    usePrecomputed = True
    if (usePrecomputed):
        fname = "%scolorByIndex.json" % (saveDir)
        colors = utils.getContent(fname, True)
        for c in colors:
            tokenMap.append((c[0], c[1], c[2]))
    else:
        fname = "%s../wordCountData/wordPrincipalComponents_%d.json" % (
            saveDir, topName)
        components = np.array(utils.getContent(fname, True))
        skipFirst = 4
        minVals = np.min(wordPCAFitTransform(components), axis=0)
        valRange = np.max(wordPCAFitTransform(components), axis=0) - minVals
        normalizedComponents = np.round(255 * np.clip(
            (components - minVals) / valRange, 0, 1))

        for i in range(numTops):
            comps = normalizedComponents[i]
            rgb = (int(comps[0]), int(comps[1]), int(comps[2]))
            tokenMap.append(rgb)

    width = 400
    height = 20 * numTops
    im = Image.new("RGB", (width, height), "#FFFFFF")
    # get drawing context
    d = ImageDraw.Draw(im)
    # get a font
    fnt = ImageFont.truetype('fonts/DejaVuSans.ttf', 16)

    includedColors = {}
    colorList = []

    for i in range(numTops):
        rgb = tokenMap[i]

        baseY = 20 * i
        colorValuesText = "(%03d,%03d,%03d) " % rgb

        # keep track of each new color
        if not (colorValuesText in includedColors):
            includedColors[colorValuesText] = True
            colorList.append(rgb)

        text = colorValuesText + topWords[i]
        d.text((50, baseY + 2), text, font=fnt, fill=(0, 0, 0))
        d.rectangle(((10, baseY + 2), (40, baseY + 18)), fill=rgb)

    fname = saveDir + "images/key.png"
    utils.check_and_create_path(fname)
    im.save(fname)

    return tokenMap, colorList
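
# A minimal, self-contained sketch of the min-max normalization used in the
# non-precomputed branch above: each principal-component column is scaled
# into [0, 255] so the first three can serve as R, G, B. The data below is
# made up for illustration.
import numpy as np

components = np.array([[0.2, -1.0, 3.0],
                       [1.0, 0.5, 0.0],
                       [-0.4, 2.0, 1.5]])

minVals = np.min(components, axis=0)
valRange = np.max(components, axis=0) - minVals
rgb = np.round(255 * np.clip((components - minVals) / valRange, 0, 1))

print([tuple(int(v) for v in row) for row in rgb])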
Example #3
0
def getMetricInfo(topStr, comparableTopStr, topNum, poetryNum, comparableNum,
                  simMetrics, baseFolder):
    # Copy full eval files for jensen-shannon
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Books/comparisonInfo.txt %smetric/extraInfo/metricEvaluation_tops.txt"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Books/comparisonInfo.txt %smetric/extraInfo/metricEvaluation_+p.txt"
        % (topStr, baseFolder),
        shell=True)

    # Grab median distance
    fname = "output/greek/no_split/%s/jensen-shannon/metric/Books/comparisonInfo.txt" % (
        topStr)
    metricEvalInfo = utils.getContent(
        fname, False).split("=========")[-2].split("\n")[2:-1]
    sameAuthorRanks = []
    for line in metricEvalInfo:
        sameAuthorRank = line.split("with same author: ")[1].split(".")[0]
        sameAuthorRanks.append(int(sameAuthorRank))

    median = np.median(sameAuthorRanks)

    utils.safeWrite(
        "%smetric/extraInfo/medianForDifferentAuthor.txt" % (baseFolder),
        "Median distance for closest author: %f" % median)

    # Get info on Arrian's Indica
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Books/sims/Arrian.Indica.1.txt %smetric/extraInfo/arrianIndica.txt"
        % (topStr, baseFolder),
        shell=True)

    # Info on book distance
    # Grab this from the best metric
    fname = "output/greek/no_split/%s/jensen-shannon/metric/Books/sims.txt" % (
        topStr)
    allBookSims = utils.getContent(fname, False).split("\n")

    utils.safeWrite("%smetric/lowestSimilarity.txt" % (baseFolder),
                    "Lowest similarity between segments: %s" % allBookSims[-1])

    # Info on top similar authors
    makeTopAuthorTable(topStr, baseFolder)

    # ===============================

    makeMetricEvalTables("", topStr, comparableTopStr, topNum, poetryNum,
                         comparableNum, simMetrics, baseFolder)
Example #4
0
def dota_news(message):
    if intime(message):
        cid = getCID(message)
        content = getContent(message)
        url = "http://api.steampowered.com/ISteamNews/GetNewsForApp/v0002/?appid=570&count=1&maxlength=300&format=json"
        if content != "?":
            request = requests.get(url)
            if request.status_code == 200:
                data = request.json()
                title = data['appnews']['newsitems'][0]['title']
                content = data['appnews']['newsitems'][0]['contents']
                content_nice = content.replace(" - ", "\n - ")
                content_nice = content_nice.replace("*", "\n*")
                content_nice = parser.unescape(content_nice)
                url = data['appnews']['newsitems'][0]['url']
                bot.send_message(
                    cid,
                    u'*{title}*```> \n{content_nice}\n[...]\n```'.format(title=title, content_nice=content_nice)
                    + u'[More info here]({url})'.format(url=url),
                    parse_mode="Markdown",
                    disable_web_page_preview=True)
            else:
                bot.reply_to(
                    message,
                    "`There has been an error, the number {error} to be specific.`"
                        .format(error=request.status_code),
                    parse_mode="Markdown")
        else:
            bot.reply_to(
                message,
                "`Send this command alone and I will show you the last Steam News for Dota2 entry`",
                parse_mode="Markdown")
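
# Standalone sketch of the Steam news fetch used by dota_news, without the
# bot wiring. Only the `requests` package is assumed; the JSON layout
# matches the access pattern above (data['appnews']['newsitems'][0]).
import requests

url = ("http://api.steampowered.com/ISteamNews/GetNewsForApp/v0002/"
       "?appid=570&count=1&maxlength=300&format=json")
response = requests.get(url)
if response.status_code == 200:
    item = response.json()['appnews']['newsitems'][0]
    print(item['title'])
    print(item['url'])
else:
    print("Request failed with status", response.status_code)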
Example #5
0
def loadWCData(saveDir, dataSplit, topName, type=""):
    wcData = utils.getContent(getWCFilename(saveDir, topName, type), True)

    # load author data
    authors = []
    for key in wcData["authors"]:
        a = wcData["authors"][key]
        authorName = a["name"]
        auth = utils.Author(authorName)
        auth.counts = a["counts"]
        auth.totalTokenCount = np.sum(a["counts"])

        authors.append(auth)

    # load book data
    books = []
    for key in wcData["books"]:
        b = wcData["books"][key]
        raw = {"bookText": "", "bookNumber": b["number"]}
        book = utils.Book(raw, b["name"], b["author"])
        book.counts = b["counts"]
        book.numTokens = np.sum(b["counts"])

        books.append(book)

    topWords = wcData["topWords"]

    calculateFrequencies(authors, books, topWords)

    return authors, books, topWords
Example #6
0
def makeTopAuthorTable(topStr, baseFolder):
    # Grab this from the best metric
    fname = "output/greek/no_split/%s/jensen-shannon/metric/Authors/sims.txt" % (
        topStr)
    allAuthorSims = utils.getContent(fname, False).split("\n")

    topAuthorPairs = []

    topAuthorPairs.append("""\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1.2}
  \\begin{tabular}{| r | l | l | l | l |} \\hline
  & \\textbf{Author 1} & \\textbf{Author 2} & \\textbf{Score} & \\textbf{Notes}  \\\\\\hline
""")

    for i, pair in enumerate(allAuthorSims[:10]):
        splt1 = pair.split(" - ")
        sim = splt1[0]
        auths = splt1[1].split(" (")[0].split(", ")
        topAuthorPairs.append("  %.2d & %s & %s & %s & TODO \\\\\\hline" %
                              (i + 1, auths[0], auths[1], sim))

    topAuthorPairs.append("""
  \\end{tabular}
  \\caption{Top author pairs by similarity score according to Jensen-Shannon Similarity.}
  \\label{table:top_author_pairs}
\\end{table}
    """)

    utils.safeWrite("%smetric/topAuthorPairs.tex" % baseFolder,
                    "\n".join(topAuthorPairs))
Example #7
0
def pro_matches(message):
    """Gets recent pro matches, will give a number of matches equal to argument."""

    default_number_of_posts = 5
    posts_max = 20

    if intime(message):
        cid = getCID(message)

        param = getContent(message)
        try:
            param = int(param)
        except ValueError:
            param = 0

        number_of_posts = param if 0 < param <= posts_max else default_number_of_posts

        open_dota_url = 'https://api.opendota.com/api/proMatches'
        response = requests.get(open_dota_url)
        response_json = response.json()  # Array of 100 most recent pro matches
        matches_json = response_json[:number_of_posts]

        matches_text = []
        for match_json in matches_json:
            matches_text.append(match_short_description(match_json))

        message_text = 'Last {number} pro matches:'.format(
            number=number_of_posts)
        for match_text in matches_text:
            message_text = message_text + '\n{match}'.format(match=match_text)

        bot.send_message(cid,
                         message_text,
                         disable_web_page_preview=True,
                         parse_mode="Markdown")
Example #8
0
def getTokenColorMapMultiRun(saveDir, topWords, topName):
    numTops = len(topWords)
    tokenMaps = []

    fnames = ["%scolorByIndex.json" % (saveDir)]

    for file in utils.listFiles("%sextra_runs/" % saveDir):
        fnames.append("%sextra_runs/%s" % (saveDir, file))

    for fname in fnames:
        colors = utils.getContent(fname, True)
        tokenMap = []
        for c in colors:
            tokenMap.append((c[0], c[1], c[2]))
        tokenMaps.append(tokenMap)

    text_end = 0
    rect_width = 12
    rect_margin_h = 5
    rect_height = 5
    rect_top = 1
    rect_bottom = 4

    width = text_end + len(tokenMaps) * (rect_margin_h +
                                         rect_width) + rect_margin_h

    height = rect_height * numTops + 5
    im = Image.new("RGB", (width, height), "#FFFFFF")
    # get drawing context
    d = ImageDraw.Draw(im)
    # get a font
    fnt = ImageFont.truetype('fonts/DejaVuSans.ttf', int(0.8 * rect_height))

    # draw text labels
    for i in range(numTops):

        baseY = rect_height * i

        # text = topWords[i]
        # text_width, _ = d.textsize(text, font=fnt)
        #
        # d.text((text_end - text_width,baseY+rect_top), text, font=fnt, fill=(0, 0, 0))

        rect_right = text_end
        # draw groupings for this word
        for tm in tokenMaps:
            rgb = tm[i]
            rect_left = rect_right + rect_margin_h
            rect_right = rect_left + rect_width
            d.rectangle(((rect_left, baseY + rect_top),
                         (rect_right, baseY + rect_bottom)),
                        fill=rgb)

    fname = saveDir + "images/groupingCompare.png"
    utils.check_and_create_path(fname)
    im.save(fname)
Example #9
0
def computeHash(fname):
    """
	Pattern-like
	"""
    content = getContent(fname)
    # Hash
    hf = hashlib.md5()  # Possibility to change hash function
    hf.update(content)
    h = hf.digest()
    return h
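
# Self-contained sketch of the hashing step above. hashlib works on bytes,
# so string content has to be encoded first; the content here is made up.
import hashlib

content = "example file content".encode("utf-8")
hf = hashlib.md5()
hf.update(content)
print(hf.hexdigest())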
Example #10
0
def getTextCounts(textLocation, saveDir):
    subprocess.run("cp %savailable.json %savailable.json" %
                   (textLocation, saveDir),
                   shell=True)

    available = utils.getContent(textLocation + "available.json", True)
    # For each available text
    for i, o in enumerate(available):
        if (i % 20 == 0):
            print(i, end=" ", flush=True)

        workLocs = o["works"]

        # Process each work
        for w in workLocs:
            t = utils.getContent(w["location"], True)

            booksRaw = t["booksRaw"]
            booksCounts = []
            for b in booksRaw:
                # Strip punctuation (a character class, not a literal sequence)
                rawTokens = re.sub(r'[\.,;:᾽῾\']', "", b["bookText"]).split(" ")
                tokenCounts = {}
                for token in rawTokens:
                    if (token == ""):
                        continue

                    if not (token in tokenCounts):
                        tokenCounts[token] = 1
                    else:
                        tokenCounts[token] += 1

                bookWithCounts = {}
                bookWithCounts["bookNumber"] = b["bookNumber"]
                bookWithCounts["bookTokenCounts"] = tokenCounts
                bookWithCounts["bookText"] = ""

                booksCounts.append(bookWithCounts)

            t["booksRaw"] = booksCounts

            # Remove "texts/" from start
            filename = "textCounts/" + w["location"][6:]
            utils.safeWrite(filename, t, True)
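
# Equivalent token-counting sketch using collections.Counter. Note the
# punctuation pattern is a character class (any one of the listed marks),
# not the literal sequence; the toy text below is made up.
import re
from collections import Counter

bookText = "λόγος, λόγος. ἀνήρ"
rawTokens = re.sub(r'[\.,;:᾽῾\']', "", bookText).split(" ")
tokenCounts = Counter(t for t in rawTokens if t != "")
print(dict(tokenCounts))  # {'λόγος': 2, 'ἀνήρ': 1}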
Example #11
0
def process(logfile):
    """ Take a given log file and return the system name,
        along with all the dated entries in a dictionary
    """

    print('processing log file:', logfile)
    loglines = getContent(
        logfile)  # getContent() returns reversed lines for pop()
    return gather_data(logfile,
                       loglines)  # gather_data() returns sysname, sysobjlist
Example #12
0
def getOverlapInfo(baseFolder):
    output = []
    splitter = "\n------\n"

    output.append("Greek:\n")
    output.append(
        utils.getContent("output/greek/topWordOverlapOverTime.txt", False))
    output.append(splitter)
    output.append("English:\n")
    output.append(
        utils.getContent("output/english/topWordOverlapOverTime.txt", False))
    output.append(splitter)
    output.append("Icelandic:\n")
    output.append(
        utils.getContent("output/icelandic/topWordOverlapOverTime.txt", False))
    output.append(splitter)

    utils.safeWrite("%s/topWordOverlapOverTime.txt" % baseFolder,
                    "\n".join(output))
Example #13
0
def combineTexts(textName, sourceTexts):

    allLines = []
    for source in sourceTexts:
        inFileName = utils.getTextFn(source)
        lines = utils.getContent(inFileName, True)
        allLines.extend(lines)

    jsonDump = json.dumps(allLines)
    outFileName = utils.getTextFn(textName)
    utils.safeWrite(outFileName, jsonDump)
Example #14
0
def makeMLTable(source, norm, filename):
    output = []

    output.append("""\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1.2}
""")

    # No Naive Bayes column when the data is normed, since Naive Bayes
    # cannot handle the resulting negative values
    if norm:
        output.append("  \\begin{tabular}{| r | l | l |} \\hline")
        output.append(
            "  \\textbf{Prediction Task} & \\textbf{Majority Class} & \\textbf{KNN}  \\\\\\hline"
        )
    else:
        output.append("  \\begin{tabular}{| r | l | l | l |} \\hline")
        output.append(
            "  \\textbf{Prediction Task} & \\textbf{Majority Class} & \\textbf{KNN} & \\textbf{Naive Bayes}  \\\\\\hline"
        )

    for t in ["Authors", "Books", "Books_2"]:
        cats = ["genre", "dialect", "timeframe"]
        if (t == "Books"):
            cats.append("author")
        if (t == "Books_2"):
            cats = ["work", "genre", "dialect", "timeframe", "author"]

        for cat in cats:
            fname = source + "res_%s_%s.txt" % (cat, t)
            lines = utils.getContent(fname, False).split("\n")
            maj_class = lines[1].split(" - ")[0].strip()
            knn = lines[2].split(" - ")[0].strip()
            naive_bayes = lines[3].split(" - ")[0].strip()

            t_name = t
            if t_name == "Books":
                t_name = "Segments"
            if t_name == "Books_2":
                t_name = "Segments*"
            if norm:
                output.append(" %s of %s & %s & %s \\\\\\hline" %
                              (cat, t_name, maj_class, knn))
            else:
                output.append(" %s of %s & %s & %s & %s \\\\\\hline" %
                              (cat, t_name, maj_class, knn, naive_bayes))

    output.append("""
  \\end{tabular}
  \\caption{Results of running simple machine learning on the frequency data.}
  \\label{table:ml+p}
\\end{table}
    """)

    utils.safeWrite(filename, "\n".join(output))
Example #15
0
def getAuthorBookCounts(baseFolder):
    ab_counts_output = []
    splitter = "\n------\n"

    ab_counts_output.append("Greek:\n")
    ab_counts_output.append(
        utils.getContent("output/greek/numberOfAuthors_Books.txt", False))
    ab_counts_output.append(
        utils.getContent("output/greek/numberOfTypes_Tokens.txt", False))
    ab_counts_output.append(splitter)
    ab_counts_output.append("English:\n")
    ab_counts_output.append(
        utils.getContent("output/english/numberOfAuthors_Books.txt", False))
    ab_counts_output.append(
        utils.getContent("output/english/numberOfTypes_Tokens.txt", False))
    ab_counts_output.append(splitter)
    ab_counts_output.append("Icelandic:\n")
    ab_counts_output.append(
        utils.getContent("output/icelandic/numberOfAuthors_Books.txt", False))
    ab_counts_output.append(
        utils.getContent("output/icelandic/numberOfTypes_Tokens.txt", False))
    ab_counts_output.append(splitter)

    utils.safeWrite("%s/AuthorBookNumbers.txt" % baseFolder,
                    "\n".join(ab_counts_output))
Example #16
0
def printKeyWords(dataSplit, top, subsetSize, language, saveDirBase):
    topName, _, _ = top
    # calculate save directory based on input parameters
    saveDir = saveDirBase + "%s" % (topName)

    keyWordsDir = saveDir + "/wordImportance/keyWords/"

    # find all the relevant json files
    files = os.listdir(keyWordsDir)
    for f in files:
        if f.endswith(".json"):
            nameCore = f.split(".json")[0]

            # get the word info for this author pair
            words = utils.getContent(keyWordsDir + f, True)

            # get the authors
            authors = nameCore.split("_")
            a1 = authors[0]
            a2 = authors[1]
            print(a1, a2)

            # save dir for new files
            wordsDir = keyWordsDir + nameCore + "/"

            # gather the list of words and print them out along with percentiles
            wordList = []
            out = ["index, percentile, token"]
            for word in words:
                wordList.append("%03d_%s" % (words[word][0] + 1, word))
                out.append("%d, %.2f, %s" %
                           (words[word][0], words[word][1], word))

            utils.safeWrite(wordsDir + "words.txt", "\n".join(out))

            # get the info for each occurrence of the given words
            # associated with these authors
            target = {
                a1: wordList,
                a2: wordList,
            }
            printOccs(wordsDir, target, language)
Example #17
0
def visualizeWordOrder(authors, books, baseSaveDir, topWords, topName):
    baseSaveDir += "textsOnlyTopWords/"

    for numGroups in getWordGroupsRange(len(topWords)):
        if (numGroups == -1):
            print("    part of speech groups...")
            saveDir = "%spos_group/" % (baseSaveDir)
        else:
            print("    %d groups..." % numGroups)
            saveDir = "%s%d_group/" % (baseSaveDir, numGroups)

        # generate color map
        tokenToColor, colorList = getTokenColorMap(saveDir, topWords, topName)

        maxHeight = 0
        bars = {}

        # Create visualizations of each individual word colored by its group.
        for author in authors:
            # get author tokens as a block
            fname = (baseSaveDir + "lists/authors/" +
                     author.getSaveName() + ".json")
            tokens = utils.getContent(fname, True)

            arr = []
            for t in tokens:
                arr.append(tokenToColor[t])

            counts = barsFromRGBArray(arr, author.totalTokenCount, colorList)
            bars[author.authorName] = counts
            mh = np.max(counts)
            if mh > maxHeight:
                maxHeight = mh

            # This is not used for the paper but is rather interesting, as it lets
            # you potentially *see* different books, and certainly lets you see
            # different word usage

            # fname = saveDir + "images/authors_in_order/" + author.getSaveName() + ".png"
            # imageFromRGBArray(arr, fname)
            #
            # # get author's tokens divided by book
            # bookTokensArr = []
            # for book in books:
            #     if book.author == author.authorName:
            #         fname = baseSaveDir + "lists/books/" + book.getSaveName() + ".json"
            #         tokens = utils.getContent(fname, True)
            #         bookArr = []
            #         for t in tokens:
            #             bookArr.append(tokenToColor[t])
            #         bookTokensArr.append(bookArr)
            #
            # fname = saveDir + "images/authors-divided/" + author.getSaveName() + ".png"
            # imageFromRGBArrays(bookTokensArr, fname)

        # Graph word use bar charts now that we know the maximum scale.
        yHeight = (np.ceil(maxHeight * 100.0)) / 100.0
        groupLabels = utils.getContent(saveDir + "groupLabels.json", True)
        title = "Group Frequency"
        # for author in authors:
        #     fname = saveDir + "images/authors_bars/" + author.getSaveName()
        #     graphUtils.wordUseBarChart(bars[author.authorName], colorList, yHeight, groupLabels, title, fname)

        quadList = [
            ("demosthenesHomer",
             ["AeliusAristides", "Demosthenes", "ApolloniusRhodius", "Homer"]),
            ("clementThucydides",
             ["JohnOfDamascus", "ClementOfAlexandria", "Appian",
              "Thucydides"]),
        ]
        for saveName, quad in quadList:
            fname = saveDir + "images/" + saveName
            counts4 = []
            for authorName in quad:
                counts4.append(bars[authorName])
            graphUtils.wordUseBarChart4Up(counts4, colorList, yHeight,
                                          groupLabels, quad, fname)

        #octo = ["ApolloniusRhodius", "Homer", "AeliusAristides", "Demosthenes", "Appian", "Thucydides", "JohnOfDamascus", "ClementOfAlexandria"]
        octo = [
            "Homer", "ApolloniusRhodius", "Demosthenes", "AeliusAristides",
            "Thucydides", "Appian", "ClementOfAlexandria", "JohnOfDamascus"
        ]

        fname = saveDir + "images/dhct"
        counts8 = []
        for authorName in octo:
            counts8.append(bars[authorName])
        graphUtils.wordUseBarChart8Up(counts8, colorList, yHeight, groupLabels,
                                      octo, fname)
        #utils.safeWrite(saveDir+ "textsOnlyTopWords/images/authors/" + author.getSaveName() + ".json", tokens, dumpJSON=True)

        # Group by author
        numGroups = len(colorList)
        groups = []
        for i in range(numGroups):
            groups.append([])
        for author in bars:
            for i in range(numGroups):
                groups[i].append([author, bars[author][i]])

        for i, group in enumerate(groups):
            groupName = groupLabels[i]
            g = sorted(group, key=lambda x: x[1], reverse=True)
            tickLabels = []
            data = []
            dataErr = []
            # for each author
            for a in g:
                tickLabels.append(a[0].replace("Anonymous", "Anon "))
                data.append(a[1])
                dataErr.append(0)

            fname = "byAuthor/%.2d_%s" % (i + 1, groupName)
            graphUtils.authorWordUseHistogram(data, ["Freq"],
                                              tickLabels,
                                              "Word usage for group %s" %
                                              groupName,
                                              "Frequency",
                                              saveDir,
                                              fname,
                                              True,
                                              color=colorList[i])
Example #18
0
def groupWordsMultipleRuns(topName, topWords, baseSaveDir):
    fname = baseSaveDir + "wordCountData/wordCountByText_%s.json" % (topName)
    rawCounts = np.array(utils.getContent(fname, True))
    tokensByItem = np.sum(rawCounts, axis=0)
    data = rawCounts / tokensByItem

    baseSaveDir = baseSaveDir + "textsOnlyTopWords/"

    # cluster data
    for numGroups in getWordGroupsRangeTest(len(topWords)):

        fname = "%s%d_group/colorByIndex.json" % (baseSaveDir, numGroups)
        baseColorsRaw = utils.getContent(fname, True)

        baseColors = []
        colorIndices = {}
        indexToColor = {}
        indexToColorName = [
            "Black", "Yellow", "Purple", "Orange", "Blue", "Red", "Tan",
            "Gray", "Green"
        ]
        numColors = 0

        labelToIndicesBase = {}

        for i, c in enumerate(baseColorsRaw):
            colorString = "%d,%d,%d" % (c[0], c[1], c[2])
            baseColors.append(colorString)

            if not (colorString in colorIndices):
                colorIndices[colorString] = numColors
                indexToColor[numColors] = colorString
                numColors += 1

                labelToIndicesBase[colorString] = [i]
            else:
                labelToIndicesBase[colorString].append(i)

        saveDir = "%s%d_group/extra_runs/" % (baseSaveDir, numGroups)
        print("      calculating extra for %.2d groups..." % numGroups,
              end=" ",
              flush=True)

        # we already ran the 0th run
        for run in range(1, RUNS):
            print(run, end=" ", flush=True)
            # Make deterministic using the group seed offset by the run index
            kmeans = cluster.KMeans(n_clusters=numGroups,
                                    n_init=1000,
                                    random_state=GROUP_SEED + run)
            kmeans.fit(data)
            wordLabels = kmeans.labels_

            # rename groups and keep track of color associated with each word

            # get the indices for each label
            labelToIndices = {}
            maxLabel = -1
            for i in range(len(topWords)):
                label = wordLabels[i]
                if label in labelToIndices:
                    labelToIndices[label].append(i)
                else:
                    labelToIndices[label] = [i]

                if label > maxLabel:
                    maxLabel = label

            # store colors used already
            takenColors = {}

            # this will convert from label to color
            labelToColor = {}

            # store labels already assigned
            takenLabels = {}
            unassignedColors = []

            # Go through each color in original grouping, assign it to the
            # group in this grouping that most closely matches it.
            for i in range(maxLabel + 1):
                labelCounts = np.full((numColors), 0)
                for j in labelToIndicesBase[indexToColor[i]]:
                    labelCounts[wordLabels[j]] += 1

                # find colors with highest overlap
                bestLabels = np.flipud(np.argsort(labelCounts))
                #print(labelCounts)
                #print(bestLabels)
                for j in bestLabels:
                    # If there are no longer any matches
                    if (labelCounts[j] == 0):
                        # print("No valid label for color %s" % indexToColorName[i])
                        unassignedColors.append(i)
                        break

                    # print("trying to assign color %s to best label %d" % (indexToColorName[i], j))
                    if not (j in takenLabels):
                        splt = indexToColor[i].split(",")
                        labelToColor[j] = (int(splt[0]), int(splt[1]),
                                           int(splt[2]))
                        takenLabels[j] = True
                        #print(labelToIndices[j])
                        break

                # print("---")

            # assign remaining colors to labels that weren't matched
            for i in range(maxLabel + 1):
                # ignore taken colors
                if i in labelToColor:
                    continue
                freeColorIndex = unassignedColors[0]
                splt = indexToColor[freeColorIndex].split(",")
                labelToColor[i] = (int(splt[0]), int(splt[1]), int(splt[2]))
                unassignedColors = unassignedColors[1:]
            # print("========")

            colorsUsed = []
            for i in range(len(topWords)):
                colorsUsed.append(labelToColor[wordLabels[i]])

            # save used colors
            fname = "%s/groups_%.3d.json" % (saveDir, run)
            utils.safeWrite(fname, colorsUsed, True)

        print("")
Example #19
0
def groupAndPlotWords(topName,
                      topWords,
                      wordToPOS,
                      baseSaveDir,
                      groupings=None):
    fname = baseSaveDir + "wordCountData/wordCountByText_%s.json" % (topName)
    rawCounts = np.array(utils.getContent(fname, True))
    tokensByItem = np.sum(rawCounts, axis=0)
    data = rawCounts / tokensByItem

    baseSaveDir = baseSaveDir + "textsOnlyTopWords/"

    if groupings is None:
        groupings = getWordGroupsRange(len(topWords))
    #print(data)

    # cluster data
    for numGroups in groupings:
        startOffset = 1

        if (numGroups == -1):
            saveDir = "%spos_group/" % (baseSaveDir)
            print("    calculating for part of speech groups")

            wordLabels = []
            for word in topWords:
                if not (word in wordToPOS):
                    raise Exception(
                        "Word %s not in part of speech dictionary" % word)
                wordLabels.append(wordToPOS[word])
        else:
            saveDir = "%s%d_group/" % (baseSaveDir, numGroups)
            print("    calculating for %d groups" % numGroups)
            # Make deterministic using group seed
            kmeans = cluster.KMeans(n_clusters=numGroups,
                                    n_init=10000,
                                    random_state=GROUP_SEED)
            kmeans.fit(data)
            wordLabels = kmeans.labels_

        # rename groups and keep track of color associated with each word
        labelsSeen = 0
        labelConversion = {}
        target = []
        firstWords = []
        colorsUsed = []
        for i in range(len(topWords)):
            if not (wordLabels[i] in labelConversion):
                labelConversion[wordLabels[i]] = labelsSeen
                labelsSeen += 1
                firstWords.append(topWords[i])
            label = labelConversion[wordLabels[i]]
            target.append(label)
            colorsUsed.append(KELLY_COLORS[startOffset + label])

        # save used colors
        fname = "%s/colorByIndex.json" % (saveDir)
        utils.safeWrite(fname, colorsUsed, True)

        # create labels
        targetLabels = []
        for i in range(numGroups):
            targetLabels.append("Group %d (%s)" % (i + 1, firstWords[i]))

        # group data and colors
        targetList = []
        targetList.append({
            "name": "Word_Groupings_",
            "target": np.array(target),
            "labels": targetLabels
        })

        dataSet = graphUtils.Dataset(data, targetList)

        # Save group labels
        groupLabels = firstWords
        if (numGroups == -1):
            origGroupLabels = [
                "noun", "verb", "adj", "adv", "pron", "article", "prep",
                "conj", "partic"
            ]
            groupLabels = ["", "", "", "", "", "", "", "", ""]
            for i in range(len(origGroupLabels)):
                groupLabels[labelConversion[str(i)]] = origGroupLabels[i]

        utils.safeWrite(saveDir + "groupLabels.json", groupLabels, True)

        # graph the data
        tSNEDir = saveDir + "tSNE/"
        colors = KELLY_COLORS[startOffset:startOffset + numGroups]
        for u in [False]:  # [False, True]:
            graphUtils.tSNE_2D(dataSet,
                               topWords,
                               20.0,
                               True,
                               tSNEDir,
                               True,
                               predefinedColors=colors,
                               verbose=False,
                               useUMAP=u)
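
# Minimal sketch of the deterministic clustering step above: with a fixed
# random_state, repeated KMeans runs reproduce the same groups. Toy
# frequency data below; only numpy and scikit-learn are assumed.
import numpy as np
from sklearn import cluster

data = np.random.RandomState(0).rand(20, 5)
labelsA = cluster.KMeans(n_clusters=3, n_init=10, random_state=42).fit(data).labels_
labelsB = cluster.KMeans(n_clusters=3, n_init=10, random_state=42).fit(data).labels_
assert (labelsA == labelsB).all()
print(labelsA)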
Example #20
0
def getWordUseInfo(topStr, baseFolder):
    # total +p words
    tops = utils.getContent(
        "output/greek/no_split/%s/wordInfo_%s.txt" % (topStr, topStr),
        False).split("\n")[1:]
    poetrys = utils.getContent(
        "output/greek/no_split/top_p/wordInfo_top_p.txt",
        False).split("\n")[1:]
    # Top plus poetry
    totals = utils.getContent(
        "output/greek/no_split/%s+p/wordInfo_%s+p.txt" % (topStr, topStr),
        False).split("\n")[1:]

    numWordsOutput = []
    numWordsOutput.append("Number of Top Words: %d" % len(tops))
    numWordsOutput.append("Number of Poetry Words: %d" % len(poetrys))
    numWordsOutput.append("Total Number of Words: %d" % len(totals))
    utils.safeWrite("%s/wordUse/totalWords.txt" % baseFolder,
                    "\n".join(numWordsOutput))

    # Create Table of words
    topRanks = {}
    poetryRanks = {}

    for i, line in enumerate(tops):
        w = line.split(":")[0]
        topRanks[w] = i + 1

    for i, line in enumerate(poetrys):
        w = line.split(":")[0]
        poetryRanks[w] = i + 1

    rankInfo = []
    for line in totals:
        w = line.split(":")[0]
        topRank = ""
        if w in topRanks:
            topRank = "%d" % topRanks[w]
        poetryRank = ""
        if w in poetryRanks:
            poetryRank = "%d" % poetryRanks[w]

        rankInfo.append((w, topRank, poetryRank))

    rankTableOutput = []
    rankTableOutput.append("""
    \\begin{table}[!hbt]
      \\centering
      \\def\\arraystretch{1}
      \\begin{tabular}{| l | l | l ||| l | l | l ||| l | l | l ||| l | l | l |}
    \\hline

    \\textbf{Token} & \\textbf{A} & \\textbf{P} & \\textbf{Token} & \\textbf{A} & \\textbf{P} & \\textbf{Token} & \\textbf{A} & \\textbf{P} & \\textbf{Token} & \\textbf{A} & \\textbf{P}\\\\\\hline
    """)

    columnHeight = 43
    for i in range(columnHeight):
        cells = []
        for j in range(4):
            index = i + j * columnHeight
            cell = ""
            if (index < len(rankInfo)):
                cell = "%s & %s & %s" % rankInfo[index]

            cells.append(cell)
        rankTableOutput.append("%s \\\\\\hline" % (" & ".join(cells)))

    rankTableOutput.append("""
      \\end{tabular}
      \\caption{List of tokens used, along with their rank in the top 150 tokens found in all texts (\\textbf{A}) and rank in the top 100 tokens found in poetry texts (\\textbf{P}).}
      \\label{table:top_words}
    \\end{table}
    """)

    utils.safeWrite("%swordUse/topWordsTable.tex" % baseFolder,
                    "\n".join(rankTableOutput))
Example #21
0
def loadTexts(splitParameter, subsetSize, textLocation, language, saveDir,
              useTextCounts):
    useSplitParam = splitParameter != -1

    if useTextCounts:
        textLocation = convertToTextCounts(textLocation)
    available = utils.getContent(textLocation + "available.json", True)
    authors = []
    allWorks = []
    books = []
    workTokenLengths = []
    bookTokenLengths = []
    print(len(available), end=" - ", flush=True)

    # For each available text
    for i, o in enumerate(available):
        if (i % 20 == 0):
            print(i, end=" ", flush=True)

        authorName = o["author"]
        # split into two authors if necessary
        if useSplitParam:
            a1 = utils.Author(authorName)
            a2 = utils.Author(authorName + "_2")
        else:
            a = utils.Author(authorName)

        workLocs = o["works"]
        works = []
        authorTokens1 = []
        authorTokens2 = []

        # Process each work
        for w in workLocs:
            allWorks.append(w)
            # if authorName == "Arrian" and w["name"] != "Anabasis":
            #     continue
            location = w["location"]
            if useTextCounts:
                location = convertToTextCounts(location)
            t = utils.Text(location)

            if useSplitParam:
                a1.addWork(t)
                a2.addWork(t)
            else:
                a.addWork(t)

            workTokenLength = 0
            # For each book, process all of its tokens, count them,
            # add them to this author.
            for b in t.books:
                tokens = []
                if not (useTextCounts):
                    # Strip punctuation (a character class, not a literal sequence)
                    rawTokens = re.sub(r'[\.,;:᾽῾\']', "", b.bookText).split(" ")
                    for token in rawTokens:
                        if language == "Greek":
                            token = preprocessTokenGreek(token)

                            token = utils.transformElided(token)
                        if language == "Icelandic":
                            token = preprocessTokenIcelandic(token)

                        if (token == ""):
                            continue

                        tokens.append(token)
                else:
                    tokenCounts = b.bookTokenCounts
                    for token in tokenCounts:
                        cleanToken = token
                        if language == "Greek":
                            cleanToken = preprocessTokenGreek(cleanToken)

                            cleanToken = utils.transformElided(cleanToken)
                        if language == "Icelandic":
                            cleanToken = preprocessTokenIcelandic(cleanToken)

                        if (cleanToken == ""):
                            continue

                        # Add token once per each count. Bit of a hack and the
                        # text will end up out of order, but since the paper
                        # doesn't consider word order this should be fine.
                        for _ in range(tokenCounts[token]):
                            tokens.append(cleanToken)

                b.tokens = tokens
                books.append(b)

                bookTokenLength = len(tokens)
                bookTokenLengths.append(bookTokenLength)
                workTokenLength += bookTokenLength

                if useSplitParam:
                    # add in the tokens from this book as well
                    if (splitParameter == -2):
                        authorTokens1.extend(tokens)
                        authorTokens2.extend(tokens)
                    else:
                        modul = splitParameter * 2
                        t1 = [
                            tokens[i] for i in range(len(tokens))
                            if ((i % modul) < splitParameter)
                        ]
                        t2 = [
                            tokens[i] for i in range(len(tokens))
                            if ((i % modul) >= splitParameter)
                        ]
                        authorTokens1.extend(t1)
                        authorTokens2.extend(t2)
                        a1.bookSplits[len(authorTokens1)] = True
                        a2.bookSplits[len(authorTokens2)] = True
                else:
                    # add in the tokens from this book as well
                    authorTokens1.extend(tokens)
                    a.bookSplits[len(authorTokens1)] = True

            workTokenLengths.append(workTokenLength)

        if useSplitParam:
            if splitParameter == -2:
                half = int(len(authorTokens1) / 2)
                a1.allTokens = authorTokens1[:half]
                a2.allTokens = authorTokens2[half:]
            else:
                a1.allTokens = selectSubset(authorTokens1, subsetSize)
                a2.allTokens = selectSubset(authorTokens2, subsetSize)

            authors.append(a1)
            authors.append(a2)
        else:
            a.allTokens = selectSubset(authorTokens1, subsetSize)

            authors.append(a)

    numProseA = 0
    numPoetryA = 0
    for a in authors:
        if (toGenre(a.authorName) == 0):
            numProseA += 1
        else:
            numPoetryA += 1

    numProseB = 0
    numPoetryB = 0
    for b in books:
        if (toGenre(b.author) == 0):
            numProseB += 1
        else:
            numPoetryB += 1

    print("")
    countInfo = []
    countInfo.append("Number of authors: %d" % len(authors))
    countInfo.append("  prose: %d" % numProseA)
    countInfo.append("  poetry: %d" % numPoetryA)
    countInfo.append("Number of works:    %d" % len(allWorks))
    countInfo.append("Number of segments: %d" % len(books))
    countInfo.append("  prose: %d" % numProseB)
    countInfo.append("  poetry: %d" % numPoetryB)
    countInfo.append("-----")
    countInfo.append("           5%, 25%, 50%, 75%, 95%")
    countInfo.append(
        "works:    %d, %d, %d, %d, %d" %
        tuple(np.percentile(workTokenLengths, [5, 25, 50, 75, 95]).tolist()))
    countInfo.append(
        "segments: %d, %d, %d, %d, %d" %
        tuple(np.percentile(bookTokenLengths, [5, 25, 50, 75, 95]).tolist()))
    countInfoStr = "\n".join(countInfo)
    print(countInfoStr)

    if (saveDir != ""):
        utils.safeWrite(saveDir + "numberOfAuthors_Books.txt", countInfoStr)

    # If true, print all of the loaded texts.
    printLoaded = False

    if printLoaded:
        tab = "  "
        print("Authors:")
        s = []
        for author in authors:
            s.append(tab + str(author))
        print("\n".join(s))
        print("----")

        print("Books:")
        s = []
        for book in books:
            s.append(tab + str(book))
        print("\n".join(s))
        print("----")

    return authors, books
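
# Sketch of the modular interleaving used for the author splits above: with
# splitParameter = 2, tokens alternate between the two halves in blocks of
# two. Toy token list below.
tokens = list(range(10))
splitParameter = 2
modul = splitParameter * 2
t1 = [tokens[i] for i in range(len(tokens)) if (i % modul) < splitParameter]
t2 = [tokens[i] for i in range(len(tokens)) if (i % modul) >= splitParameter]
print(t1)  # [0, 1, 4, 5, 8, 9]
print(t2)  # [2, 3, 6, 7]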
Example #22
0
def visualizeItemData(data, target, names, authornames, typeName, saveDir):
    targetList = []
    targetList.append({"name": "", "target": np.array(target), "labels": []})

    # go through all grouping options
    for fun in genre.labelList:
        targetList.append(fun(authornames))

    # visualize author data
    dataSet = graphUtils.Dataset(np.array(data), targetList)

    # Dummy test data
    testSet = graphUtils.Dataset([], [])
    testNames = []
    saveOutput = True

    perplexity = 20.0

    baseSaveDir = saveDir
    # I have some state carrying over that I can't figure out, so only one of
    # these can run at a time, but umap results look roughly the same as tSNE.
    for u in [False]:
        algorithmName = "tSNE"
        if u:
            algorithmName = "umap"
        print("Visualizing using %s" % algorithmName)

        tSNEDir = baseSaveDir + algorithmName + "/"

        graphUtils.tSNE_2D(dataSet,
                           names,
                           perplexity,
                           saveOutput,
                           tSNEDir,
                           True,
                           useUMAP=u)

        if (typeName == "Authors"):
            #graphUtils.tSNE_2D_2color(dataSet, names, perplexity, saveOutput, tSNEDir, True)

            # load tsne data
            saveDir = tSNEDir
            precalcFilename = saveDir + ("%s_2D_data.txt" % algorithmName)

            precalculated = utils.getContent(precalcFilename, True)
            tsneX = np.array(precalculated["x"], dtype=np.float64)

            # skip first and last target
            dataSet = graphUtils.Dataset(tsneX, targetList[1:-1])

            graphUtils.tSNE_2D_4Up(dataSet,
                                   names,
                                   False,
                                   saveOutput,
                                   saveDir,
                                   "info_no_labels_4Up",
                                   False,
                                   useUMAP=u)

        # Create fourup containing books
        if (typeName == "Books"):
            saveDir = tSNEDir
            precalcFilename = saveDir + ("%s_2D_data.txt" % algorithmName)

            precalculated = utils.getContent(precalcFilename, True)
            tsneX = np.array(precalculated["x"], dtype=np.float64)
            preY = np.array(precalculated["y"], dtype=np.float64)
            names = precalculated["names"]

            #print("Precalculateds loaded")

            targets = []
            targets.append({
                "name": "_",
                "target": preY != 30,
                "labels": ["Demosthenes", "Others"],
                "outlierName": "Demos.speec.59",
                "prettyName": "Speech 59"
            })  # Demosthenes .15
            targets.append({
                "name": "_",
                "target": preY != 55,
                "labels": ["Isocrates", "Others"],
                "outlierName": "Isocr.speec.21",
                "prettyName": "Speech 21"
            })  # Isocrates .10
            targets.append({
                "name": "_",
                "target": preY != 91,
                "labels": ["Xenophon", "Others"],
                "outlierName": "Xenop.hunti.1",
                "prettyName": "Cynegeticus"
            })  #  Xenophon .helle.7
            targets.append({
                "name": "_",
                "target": preY != 76,
                "labels": ["Plato", "Others"],
                "outlierName": "Plato.menex.1",
                "prettyName": "Menexenus"
            })  # Plato .laws.8
            #targets.append(genre.labelList[]())

            dataSet = graphUtils.Dataset(tsneX, targets)

            #graphUtils.clickable_tSNE_2D(dataSet, names, -1, saveDir, False)
            graphUtils.tSNE_2D_4Up(dataSet,
                                   names,
                                   True,
                                   saveOutput,
                                   saveDir,
                                   "outliers4Up",
                                   False,
                                   useUMAP=u)
Example #23
0
def keyAuthorComparisonWithImportance(authors, books, baseSaveDir, splitParam,
                                      topWords):
    makeWordImportanceGraphs = False
    keyAuthData = getKeyAuthorData(authors, books)
    saveDir = baseSaveDir + "wordImportance/"
    allDiffLineData = {}
    allCumulDiffLineData = {}
    allRCumulDiffLineData = {}
    allPercentageLineData = {}

    # load diffs for plotting internal similarities
    allDiffsFilename = baseSaveDir + "dists/diffLists.json"
    allDiffs = utils.getContent(allDiffsFilename, True)

    # For each set of key authors, make necessary visualizations
    for dat in keyAuthData:
        data, _, dataLabels, chartFileName = dat

        print("    %s..." % chartFileName)
        numWords = len(topWords)
        numTexts = len(dataLabels)
        tickLabels = topWords
        distsFilename = baseSaveDir + "dists/" + chartFileName + ".json"
        dists = utils.getContent(distsFilename, True)
        # dists = [
        #     {"name": "D1", "vals": (np.random.random((numWords))*1.5 - 0.5)},
        #     {"name": "D2", "vals": (np.random.random((numWords))*1.5 - 0.5)}
        # ]
        for d in dists:
            d["vals"] = np.array(d["vals"])

        if (makeWordImportanceGraphs):
            graphUtils.wordImportanceComparison(data, dataLabels, tickLabels,
                                                dists, saveDir + "unsorted/",
                                                chartFileName, True)

        # display versions sorted by each metric
        for d in dists:
            sortedSaveDir = saveDir + d["name"] + "-sorted/"
            fname = chartFileName
            sortedInds = np.array(
                list(
                    map(
                        lambda x: x[0],
                        sorted(enumerate(d["vals"]),
                               key=lambda x: x[1][0],
                               reverse=True))))

            data1 = copy.deepcopy(data)
            tickLabels1 = copy.deepcopy(tickLabels)
            wordsUsed = len(topWords)
            # If the similarity metric includes remainder, we have to add it
            if (len(dists[0]["vals"]) == len(data[0]) + 1):
                newData = []
                for row in data1:
                    r = np.append(row, 1 - np.sum(row))
                    newData.append(r)
                data1 = newData

                tickLabels1.append("Remainder")
                wordsUsed += 1

            data2 = list(map(lambda x: np.array(x)[sortedInds], data1))
            tickLabels2 = np.array(tickLabels1)[sortedInds]
            dists2 = copy.deepcopy(dists)
            percentiles = []
            for d2 in dists2:
                d2["vals"] = np.copy(d2["vals"])[sortedInds]

            if (makeWordImportanceGraphs):
                graphUtils.wordImportanceComparison(data2, dataLabels,
                                                    tickLabels2, dists2,
                                                    sortedSaveDir, fname, True)

            # save all words
            if d["name"] == "Jensen-shannon":
                fname = saveDir + "keyWords/" + chartFileName + ".json"
                SimDiff = {}
                for i, val in enumerate(d["vals"][sortedInds]):
                    SimDiff[tickLabels2[i]] = [i, val[1]]
                utils.safeWrite(fname, SimDiff, True)

            # Diff data
            trueDiffs = np.array(
                list(map(lambda x: x[0], d["vals"][sortedInds])))
            y = (chartFileName, trueDiffs)
            y_cumul = (chartFileName, np.cumsum(trueDiffs))
            linesToGraphDiff = [y]
            linesToGraphDiffCumul = [y_cumul]

            # store info for the chart with all authors
            if d["name"] in allDiffLineData:
                allDiffLineData[d["name"]].extend([y])
            else:
                allDiffLineData[d["name"]] = [y]
            if d["name"] in allCumulDiffLineData:
                allCumulDiffLineData[d["name"]].extend([y_cumul])
            else:
                allCumulDiffLineData[d["name"]] = [y_cumul]

            # dif percentile data
            percentiles = list(map(lambda x: x[1], d["vals"][sortedInds]))
            y = (chartFileName, percentiles)
            linesToGraphPct = [y]

            # store info for the chart with all authors
            if d["name"] in allPercentageLineData:
                allPercentageLineData[d["name"]].append(y)
            else:
                allPercentageLineData[d["name"]] = [y]

            if splitParam == -1:
                # get percentiles for internal consistency of second author
                author1 = dataLabels[0]
                author2 = dataLabels[1]

                authorInternalConsistencies = [
                    # ["split5", author1, "-split5"],
                    # ["split-2", author1, "-splitHalf"],

                    # ["split5", author2, "-split5"],
                    # ["split-2", author2, "-splitHalf"]
                ]

                # Gen information comparing consistencies within given authors.
                for aic in authorInternalConsistencies:
                    a2DiffsFilename = baseSaveDir.replace(
                        "no_split",
                        aic[0]) + "dists/%s_%s_2.json" % (aic[1], aic[1])
                    if (utils.fileExists(a2DiffsFilename)):
                        a2Diffs = utils.getContent(a2DiffsFilename, True)
                        diffNums = None
                        for ad in allDiffs:
                            if ad["name"] == d["name"]:
                                diffNums = ad["allDiffs"]

                        a2RawDiffs = None
                        for ad in a2Diffs:
                            if ad["name"] == d["name"]:
                                a2RawDiffs = ad["vals"]

                        if (diffNums is not None and a2RawDiffs is not None):
                            # Add difference data
                            aicName = aic[1] + aic[2]
                            a2SortedInds = np.array(
                                list(
                                    map(
                                        lambda x: int(x[0]),
                                        sorted(enumerate(a2RawDiffs),
                                               key=lambda x: x[1][0],
                                               reverse=True))))
                            trueDiffs = np.array(
                                list(
                                    map(lambda x: x[0],
                                        np.array(a2RawDiffs)[a2SortedInds])))
                            y_diff = (aicName, trueDiffs)
                            y_diff_cumul = (aicName, np.cumsum(trueDiffs))
                            linesToGraphDiff.append(y_diff)
                            linesToGraphDiffCumul.append(y_diff_cumul)

                            # Add Percentile data
                            a2Percentiles = []
                            for rd in a2RawDiffs:
                                index = bisect.bisect_left(diffNums, rd[0])
                                a2Percentiles.append(
                                    (100.0 * index) / len(diffNums))

                            a2Percentiles = sorted(a2Percentiles, reverse=True)
                            y2 = (aicName, a2Percentiles)
                            linesToGraphPct.append(y2)
                    else:
                        print("File does not exist: \"%s\"" % a2DiffsFilename)

            # Create charts showing differences for various authors
            graphUtils.lineChart(range(wordsUsed),
                                 linesToGraphDiff,
                                 True,
                                 sortedSaveDir,
                                 chartFileName + "_diff-chart",
                                 yLim=None)  #[-0.002, 0]
            graphUtils.lineChart(range(wordsUsed),
                                 linesToGraphDiffCumul,
                                 True,
                                 sortedSaveDir,
                                 chartFileName + "_diff-cumul-chart",
                                 yLim=None,
                                 yAdjust=1)  #[-0.002, 0]
            #graphUtils.lineChart(range(wordsUsed), linesToGraphPct, True, sortedSaveDir, chartFileName+"_pct-chart")

            linesToGraphDiffRCumul = []
            for name, c in linesToGraphDiffCumul:
                name = name.replace("-split5", " Local Split")
                name = name.replace("-splitHalf", " Global Split")
                linesToGraphDiffRCumul.append((name, c[-1] - np.array(c)))

            if d["name"] in allRCumulDiffLineData:
                allRCumulDiffLineData[d["name"]].extend(
                    [linesToGraphDiffRCumul])
            else:
                allRCumulDiffLineData[d["name"]] = [linesToGraphDiffRCumul]
            graphUtils.lineChart(range(wordsUsed),
                                 linesToGraphDiffRCumul,
                                 True,
                                 sortedSaveDir,
                                 chartFileName + "_diff-r-cumul-chart",
                                 yLim=None,
                                 yAdjust=1)  #[-0.002, 0]

    for d in dists:
        # 4-Up Chart for these authors
        sortedSaveDir = saveDir + d["name"] + "-sorted/"
        graphUtils.lineChart4Up(range(wordsUsed),
                                allRCumulDiffLineData[d["name"]],
                                True,
                                sortedSaveDir,
                                "4up-r-cumul",
                                yLim=None,
                                yAdjust=1)

    # Create graph charts for all data in a cloud
    graphTypes = [
        ("all-diffs", allDiffLineData, None, 0),
        ("all-diffs-cumul", allCumulDiffLineData, None, 1),
        #("all-pcts", allPercentageLineData, [0, 100], 0)
    ]
    alls = {}
    for graphType, lineList, yLim, adjust in graphTypes:
        medFilename = baseSaveDir + "dists/median-%s.json" % graphType
        med = utils.getContent(medFilename, True)

        alls[graphType] = {}
        for d in med:
            lineList[d["name"]].append(["Median", d["line"]])
            alls[graphType][d["name"]] = d["all"]

        for name in allPercentageLineData:
            sortedSaveDir = baseSaveDir + "wordImportance/" + name + "-sorted/"
            for log in [False]:  #, True]:
                print("  %s..." % graphType)
                graphUtils.lineChart(range(wordsUsed),
                                     lineList[name],
                                     True,
                                     sortedSaveDir,
                                     graphType,
                                     yLim=yLim,
                                     log=log,
                                     yAdjust=adjust)
                print("  %s cloud..." % graphType)
                graphUtils.lineChart(range(wordsUsed),
                                     lineList[name],
                                     True,
                                     sortedSaveDir,
                                     graphType + "-cloud",
                                     yLim=yLim,
                                     allLines=alls[graphType][name],
                                     log=log,
                                     yAdjust=adjust)

    # Create chart showing ignored top words
    n = "Jensen-shannon"
    sortedSaveDir = baseSaveDir + "wordImportance/" + n + "-sorted/"

    # Cumulative
    data = allCumulDiffLineData[n]

    # Add lines
    res = []
    targetSim = -1
    for item in alls["all-diffs-cumul"][n]:
        name, c = item
        # "Aristotle_Pindar" in name or

        #"AeliusAristides_Demosthenes", "DioChrysostom_Plato"
        if ("ApolloniusRhodius_QuintusSmyrnaeus" in name
                or "DioChrysostom_Xenophon" == name):
            res.append((name, "-", 1 + c[-1] - np.array(c)))

        # Lowest of our top authors
        if ("DioChrysostom_Xenophon" == name):
            targetSim = c[-1]

    # add median
    # for item in allCumulDiffLineData[n]:
    #     name, c = item
    #     if ("Median" in name):
    #         res.append((name, "-", 1 + c[-1] - np.array(c)))

    # Add line cloud
    resAll = []
    for item in alls["all-diffs-cumul"][n]:
        name, c = item
        if not ("Hymns_Dionysus" in name or "Euclid" in name):
            n1, n2 = name.replace("Hymns_", "Hymns").split("_")
            n1 = n1.replace("Hymns", "Hymns_")
            n2 = n2.replace("Hymns", "Hymns_")
            centuryDiff = centDiff(genre.toCent(n1), genre.toCent(n2))
            #print("%s, %s: %d" % (n1, n2, centuryDiff))
            if (centuryDiff >= 4):
                # color top sims differently
                color = "k-"

                resAll.append((name, color, 1 + c[-1] - np.array(c)))

    # for name, c in data:
    #     y = c[-1] - np.array(c)
    #     res.append((name, y))

    #resAll = map(lambda n, c: (n, c[-1] - np.array(c)))
    graphUtils.compareWordUsageChart(res,
                                     True,
                                     sortedSaveDir,
                                     "ignoreBestWords",
                                     yLim=None,
                                     allLines=resAll)
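
For reference, the percentile lines graphed above rank each raw difference against the sorted population diffNums via bisect; a minimal standalone sketch of that computation, with the sample data invented for illustration:

import bisect

# Toy stand-in for the sorted population of raw differences (diffNums above).
diffNums = sorted([0.01, 0.02, 0.02, 0.05, 0.09, 0.12])

def percentileRank(value, population):
    # bisect_left counts how many values fall strictly below `value`;
    # scaling by 100 / len(population) turns that count into a percentile rank.
    index = bisect.bisect_left(population, value)
    return (100.0 * index) / len(population)

print(percentileRank(0.05, diffNums))  # 50.0: three of six values lie below 0.05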
Example #24
0
def getCenturyInfo(topStr, baseFolder):
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_overall_no_labels.pdf %scentury/centuriesGreek.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_overall_labels.pdf %scentury/extraInfo/Greek_CenturyOverall_Label.pdf"
        % (topStr, baseFolder),
        shell=True)

    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/simRange.txt %scentury/extraInfo/Greek_SimRange.txt"
        % (topStr, baseFolder),
        shell=True)

    # -------------------------
    # Century similarity data
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels.pdf %scentury/extraInfo/Greek_Century_No_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_labels.pdf %scentury/extraInfo/Greek_Century_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_no_labels.pdf %scentury/extraInfo/Greek+p_Century_No_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_labels.pdf %scentury/extraInfo/Greek+p_Century_Label.pdf"
        % (topStr, baseFolder),
        shell=True)

    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_no_labels.pdf %scentury/extraInfo/Greek_Century_Cutoff_No_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_labels.pdf %scentury/extraInfo/Greek_Century_Cutoff_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_under_9_no_labels.pdf %scentury/extraInfo/Greek+p_Century_Cutoff_No_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_under_9_labels.pdf %scentury/extraInfo/Greek+p_Century_Cutoff_Label.pdf"
        % (topStr, baseFolder),
        shell=True)

    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_no_labels.pdf %scentury/centuriesGreek2.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_no_labels_violin.pdf %scentury/centuriesGreekViolin.pdf"
        % (topStr, baseFolder),
        shell=True)

    subprocess.run(
        "cp output/english/no_split/%s/jensen-shannon/metric/Authors/simRange.txt %scentury/extraInfo/English_SimRange.txt"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/english/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels.pdf %scentury/centuriesEnglish.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/english/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels_violin.pdf %scentury/centuriesEnglishViolin.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/english/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_labels.pdf %scentury/extraInfo/English_Century_Label.pdf"
        % (topStr, baseFolder),
        shell=True)

    subprocess.run(
        "cp output/icelandic/no_split/%s/jensen-shannon/metric/Authors/simRange.txt %scentury/extraInfo/Icelandic_SimRange.txt"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/icelandic/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels.pdf %scentury/centuriesIcelandic.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/icelandic/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels_violin.pdf %scentury/centuriesIcelandicViolin.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/icelandic/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_labels.pdf %scentury/extraInfo/Icelandic_Century_Label.pdf"
        % (topStr, baseFolder),
        shell=True)

    # Get pvalue + other regression information for charts
    greekPval = utils.getContent(
        "output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_pslope.txt"
        % (topStr), False)
    englishPval = utils.getContent(
        "output/english/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_pslope.txt"
        % (topStr), False)
    icelandicPval = utils.getContent(
        "output/icelandic/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_pslope.txt"
        % (topStr), False)

    pvalOutput = []
    pvalOutput.append("Greek:")
    pvalOutput.append(greekPval)
    pvalOutput.append("English:")
    pvalOutput.append(englishPval)
    pvalOutput.append("Icelandic:")
    pvalOutput.append(icelandicPval)

    utils.safeWrite("%scentury/century_pvals.txt" % baseFolder,
                    "\n".join(pvalOutput))
Example #25
0
    # Integrity
    # See with the customer what they want... for the moment just print the different results:
    if integrity.checkHash(fname):
        if integrity.checkIntegrity(fname):
            print("Integrity OK for: " + fname)
        else:
            print("Integrity PROBLEM, the file " + fname + " has been modified")
    else:
        print("No integrity check for: " + fname)
	
#####
#TEST

if __name__ == "__main__" :
	fname = "toto"
	putFile(fname)
	print 'ENCYPHERED : ' +  getContent(fname)
	getFile(fname)
	print 'DECYPHERED : ' +  getContent(fname)
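
The snippet above assumes an `integrity` module that is not shown. A minimal sketch of what such helpers might look like, assuming each file's digest is stored alongside it (the function bodies and the .sha256 convention are assumptions, not the original implementation):

import hashlib
import os

def checkHash(fname):
    # Assumed behavior: report whether a stored digest exists for this file.
    return os.path.exists(fname + ".sha256")

def checkIntegrity(fname):
    # Assumed behavior: compare the file's current SHA-256 to the stored digest.
    with open(fname, "rb") as f:
        digest = hashlib.sha256(f.read()).hexdigest()
    with open(fname + ".sha256") as f:
        return digest == f.read().strip()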
Example #26
0
def gatherFilesFull(topStr, topNum, comparableTopStr, comparableNum,
                    poetryNum):
    baseFolder = "output/full/"

    folders = [
        "",
        "data",
        "genre",
        "metric",
        "metric/extraInfo",
        "century",
        "century/extraInfo",
        "wordUse",
        "wordUse/extraInfo",
        "wordUse/grouping",
    ]
    createFolders(folders, baseFolder)

    # Get info for the data section
    getDataInfo(topStr, baseFolder)

    # Get info for approach section
    getWordUseInfo(topStr, baseFolder)

    # Get genre info
    getGenreInfo(topStr, baseFolder)
    # Gather 4up tsne charts for standard data and data normalized by genre
    # Grab this from the best metric
    subprocess.run(
        "cp output/greek/no_split/%s/Authors/tSNE/info_no_labels_4Up.pdf %sgenre/groupings.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/Books/tSNE/outliers4up.pdf %sgenre/bookOutliers.pdf"
        % (topStr, baseFolder),
        shell=True)

    # Get book tsne charts
    # Grab this from the best metric
    subprocess.run(
        "cp output/greek/no_split/%s/Books/tSNE/tSNE_2D_no_labels.pdf %sgenre/books_tSNE_no_labels.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/Books/tSNE/tSNE_2D_labels.pdf %sgenre/books_tSNE_labels.pdf"
        % (topStr, baseFolder),
        shell=True)
    # To get a look at these, run python3 visualizeBooks

    # Get info for standard and normalized by poetry
    makeMLTable("output/greek/no_split/%s/dataPreds/" % (topStr), False,
                "%sgenre/ml_table.tex" % baseFolder)
    # makeMLTable("output/greek/no_split/%s+p/dataPreds/" % (topStr), False, "%sgenre/ml_table+p.tex" % baseFolder)

    # =========================

    # Get info for results section

    # -----------
    # Metric
    getMetricInfo(topStr, comparableTopStr, topNum, poetryNum, comparableNum,
                  SIM_METRICS, baseFolder)

    makeMetricInternalTables("", topStr, SIM_METRICS, baseFolder)
    makeMetricInternalTables("", topStr + "+p", SIM_METRICS, baseFolder)

    # -----------
    # Century
    # Get information on century comparison
    getCenturyInfo(topStr, baseFolder)
    # Get pvalue + other regression information for charts that are + p
    greekPval = utils.getContent(
        "output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_under_9_pslope.txt"
        % (topStr), False)
    englishPval = utils.getContent(
        "output/english/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_pslope.txt"
        % (topStr), False)

    pvalOutput = []
    pvalOutput.append("Greek:")
    pvalOutput.append(greekPval)
    pvalOutput.append("English:")
    pvalOutput.append(englishPval)

    utils.safeWrite("%scentury/century_pvals+p.txt" % baseFolder,
                    "\n".join(pvalOutput))

    # -------------------------
    # Grab this from the best metric
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/sims.txt %swordUse/authorSims.txt"
        % (topStr, baseFolder),
        shell=True)

    fourCenturiesTables(topStr, SIM_METRICS, baseFolder)

    # get word usage charts and info
    getWordUsageInfo(topStr, baseFolder)
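
createFolders is defined elsewhere; a plausible sketch consistent with how it is called above (implementation assumed):

import os

def createFolders(folders, baseFolder):
    # Create each output subfolder under baseFolder, ignoring ones that already exist.
    for folder in folders:
        os.makedirs(os.path.join(baseFolder, folder), exist_ok=True)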
Example #27
0
def cleanAndCombineFeatures(texts, approach):

    matrix = []

    textNames = []

    featureNames = []

    numTexts = len(texts)
    # for all the texts
    for i in range(numTexts):
        text = texts[i]
        textName = text["textName"]
        divideByBook = text["divideByBook"]
        toBeCombined = text["toBeCombined"]

        if (toBeCombined or textName == "Iliad1" or textName == "Odyssey1"):
            continue

        ofn = generalUtils.getTextFeatureDataOdikonFn(textName, approach)
        tfn = generalUtils.getTextFeatureDataTamnonFn(textName)

        odikonFeaturesRaw = generalUtils.getContent(ofn, True)
        tamnonFeaturesRaw = generalUtils.getContent(tfn, True)

        if (len(odikonFeaturesRaw) != len(tamnonFeaturesRaw)):
            raise Exception("Number of subtexts for " + textName + " do not match")

        # for each set of features (the books plus the overall text)
        for j in range(len(odikonFeaturesRaw)):
            # get the raw features for this subtext
            ro = odikonFeaturesRaw[j]
            rt = tamnonFeaturesRaw[j]

            # determine the names for these two texts and make sure they match
            roString = ro["TextName"] + ": " + ro["SubName"]
            rtString = rt["TextName"] + ": " + rt["SubName"]
            if (roString != rtString):
                raise Exception("Book mismatch! " + roString + " and " + rtString)

            # add the cleaned features to the row
            row = []
            row.extend(cleanRawOdikon(ro, False))
            row.extend(cleanRawTamnon(rt, False))
            matrix.append(row)
            textNames.append(roString)

            # and one time, get the list of feature names.
            if (i == 0 and j == 0):
                featureNames.extend(cleanRawOdikon(ro, True))
                featureNames.extend(cleanRawTamnon(rt, True))

    # output the information.
    print "Number of Features: %d." % len(matrix[0])
    output = {
    "rowNames": textNames,
    "matrix": matrix,
    "featureNames": featureNames
    }
    fName = generalUtils.getFeatureMatrixFn()
    generalUtils.safeWrite(fName, json.dumps(output))
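
For orientation, the JSON written above can be reloaded as a matrix like this (the literal path is hypothetical; the real location comes from generalUtils.getFeatureMatrixFn()):

import json

import numpy as np

with open("featureMatrix.json") as f:  # hypothetical path
    data = json.load(f)

X = np.array(data["matrix"])  # one row per subtext, columns follow featureNames
assert len(data["rowNames"]) == X.shape[0]
print("%d texts x %d features" % X.shape)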
Example #28
0
def makeMetricInternalTables(suffix, topStr, simMetrics, baseFolder):
    metricInternalTables = []
    for simMetric in simMetrics:
        dir, metricName = simMetric

        # skip Jensen-Shannon
        if metricName == "Jensen-Shannon":
            continue

        tableOutput = []
        temp = """
\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1}
  \\begin{tabular}{| l | c | c | c |}
\\hline
        """
        tableOutput.append(temp)

        temp = "\\textbf{Metric Options} & \\textbf{Author} & \\textbf{Work} & \\textbf{Total} \\\\\\hline"
        tableOutput.append(temp)

        workSigReport = []
        authorSigReport = []
        totalSigReport = []

        # & \\textbf{Sim to another work} & \\textbf{Closest to diff author} & \\textbf{Median}

        metricOptions = [("Baseline", "-remainder-smoothed"),
                         ("+1 Smoothing", "-remainder+smoothed"),
                         ("Remainder", "+remainder-smoothed"),
                         ("Both", "+remainder+smoothed")]

        # Get the list of authors and works the metric got correct
        scoreLists = {}
        for _, opt in metricOptions:
            scoreLists[opt] = {}
            name = opt
            # Use Poetry Words
            metricTopStr = topStr

            fname = "output/greek/no_split/%s/%s/metric%s/Books/scores.json" % (
                metricTopStr, dir, opt)
            scores = utils.getContent(fname, True)
            scoreLists[opt] = scores
            scoreLists[opt]["name"] = name

        baseScore = scoreLists["-remainder-smoothed"]
        # baseScores = []
        # for bsi in baseScoreInfo:
        #     baseScoreMetric, baseScoreIndex = bsi
        #     baseScores.append(scoreLists[baseScoreMetric][baseScoreIndex])

        # Create a table of the information using the provided scores
        for optName, opt in metricOptions:
            cell = "\\textbf{%s}" % (optName)

            currentScores = scoreLists[opt]
            authorScores = currentScores["author"]
            workScores = currentScores["work"]
            name = currentScores["name"]
            sameWork = "%.2f%%, (%d/%d)" % (
                100 * np.mean(workScores), np.sum(workScores), len(workScores))
            sameAuth = "%.2f%%, (%d/%d)" % (100 * np.mean(authorScores),
                                            np.sum(authorScores),
                                            len(authorScores))
            all = np.concatenate((workScores, authorScores))
            total = "%.2f%%, (%d/%d)" % (100 * np.mean(all), np.sum(all),
                                         len(all))

            wrk = " & %s" % (sameWork)
            auth = " & %s" % (sameAuth)
            tot = " & %s" % (total)

            # Calculate significance
            a = baseScore["work"]
            b = currentScores["work"]
            work_t, work_p = stats.ttest_rel(a, b)
            workSigReport.append(name)
            # Degrees of freedom
            df = len(b) - 1
            workSig = "  (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                np.mean(b), np.std(b), df, work_t, work_p)
            workSigReport.append(workSig)

            a = baseScore["author"]
            b = currentScores["author"]
            author_t, author_p = stats.ttest_rel(a, b)
            authorSigReport.append(name)
            # Degrees of freedom
            df = len(b) - 1
            authorSig = "  (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                np.mean(b), np.std(b), df, author_t, author_p)
            authorSigReport.append(authorSig)

            a = np.concatenate((baseScore["work"], baseScore["author"]))
            b = np.concatenate(
                (currentScores["work"], currentScores["author"]))
            all_t, all_p = stats.ttest_rel(a, b)
            totalSigReport.append(name)
            # Degrees of freedom
            df = len(b) - 1
            totalSig = "  (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                np.mean(b), np.std(b), df, all_t, all_p)
            totalSigReport.append(totalSig)

            # if (name == bestMetricName or name == baseScore["name"]):
            #     bestMetricSigWork.append("%s vs %s" % (name, baseScore["name"]))
            #     bestMetricSigWork.append(workSig)
            #
            #     bestMetricSigAuthor.append("%s vs %s" % (name, baseScore["name"]))
            #     bestMetricSigAuthor.append(authorSig)

            #print("  Author: t-statistic = %6.3f pvalue = %f" %  stats.ttest_rel(a, b))

            # Significance notes
            if (work_p < 0.01):
                wrk += "\\textbf{†}"
            elif (work_p < 0.05):
                wrk += "\\textbf{*}"
            if (author_p < 0.01):
                auth += "\\textbf{†}"
            elif (author_p < 0.05):
                auth += "\\textbf{*}"
            if (all_p < 0.01):
                tot += "\\textbf{†}"
            elif (all_p < 0.05):
                tot += "\\textbf{*}"

            # wrk += " %.4f" % work_p
            # auth += " %.4f" % author_p
            # tot += " %.4f" % all_p

            cell += "%s%s%s" % (wrk, auth, tot)

            cell = cell.replace("%", "\\%")
            tableOutput.append("%s\\\\\\hline" % cell)

        tableOutput.append("\\end{tabular}")
        tableOutput.append("\\caption{")
        tableOutput.append(
            "How well %s performs with the remainder words and smoothing included. "
            % metricName)
        tableOutput.append(
            "†: Results very significant (p < 0.01) when compared to baseline. "
        )
        tableOutput.append(
            "*: Results significant (p < 0.05) when compared to baseline. ")
        tableOutput.append("}")
        tableOutput.append("\\label{table:metric_options_eval_%s}" % dir)
        tableOutput.append("\\end{table}")

        tableOutput.append("")
        tableOutput.append("")
        metricInternalTables.append("\n".join(tableOutput))
        utils.safeWrite(
            "%smetric/%s_optionsEvalTable%s.tex" %
            (baseFolder, metricName, suffix), "\n".join(tableOutput))

        # sigReport = "Work:\n" + ("\n".join(bestMetricSigWork)) + "\n\n-------------\n\nAuthor:\n" + ("\n".join(bestMetricSigAuthor))
        # utils.safeWrite("%smetric/bestMetricSignificance%s_2.txt" % (baseFolder, suffix), sigReport)

        # utils.safeWrite("%smetric/extraInfo/metricSignificanceReportWork%s_2.txt" % (baseFolder, suffix), "\n".join(workSigReport))
        # utils.safeWrite("%smetric/extraInfo/metricSignificanceReportAuthor%s_2.txt" % (baseFolder, suffix), "\n".join(authorSigReport))
    utils.safeWrite(
        "%smetric/extraInfo/optionsEvalTables%s.tex" % (baseFolder, suffix),
        "\n".join(metricInternalTables))
Example #29
0
def find_match(message):
    if intime(message):
        cid = getCID(message)
        content = getContent(message)

        match_id = message.text
        match_id = match_id.split()[1]
        try:
            match = api.get_match_details(match_id)

            url = match.url
            request = requests.get(url)
            match_data = request.json()

            if content != "?":
                if request.status_code == 200:
                    hero_list = []
                    if match_data['result']['radiant_win']:
                        title = "Radiant!"
                    else:
                        title = "Dire!"

                    url = "http://www.dotabuff.com/matches/" + match_id

                    radiant_content = ""
                    dire_content = ""
                    for player in match_data['result']['players']:
                        if player['player_slot'] < 100:  # radiant
                            for hero in heroes_list:
                                if hero['id'] == player['hero_id']:
                                    hero_list.append(hero['localized_name'])
                                    radiant_content = (radiant_content +
                                        hero['localized_name'] + " " +
                                        str(player['kills']) + "/" +
                                        str(player['deaths']) + "/" +
                                        str(player['assists']) + '\n')
                        else:  # dire
                            for hero in heroes_list:
                                if hero['id'] == player['hero_id']:
                                    hero_list.append(hero['localized_name'])
                                    dire_content = (dire_content +
                                        hero['localized_name'] + " " +
                                        str(player['kills']) + "/" +
                                        str(player['deaths']) + "/" +
                                        str(player['assists']) + '\n')

                    bot.send_message(
                        cid,
                        'Winner:  *{title}* \n _Radiant:_ \n{radiant}\n _Dire:_\n{dire}\n'
                        .format(title=title, radiant=radiant_content, dire=dire_content)
                        + '[Dotabuff link]({url})'.format(url=url),
                        parse_mode="Markdown",
                        disable_web_page_preview=True)
                else:
                    bot.reply_to(
                        message,
                        "`There has been an error, the number {error} to be specific.`".format(error=request.status_code),
                        parse_mode="Markdown")
            else:
                bot.reply_to(message, "`wat`", parse_mode="Markdown")
        except Exception as ex:
            bot.reply_to(
                message,
                "There has been an error, its message is:\n `{error}`".format(error=ex.msg),
                parse_mode="Markdown")
Example #30
0
def makeMetricEvalTables(suffix, topStr, comparableTopStr, topNum, poetryNum,
                         comparableNum, simMetrics, baseFolder):
    baseScoreInfo = [
        ("Cosine", 0),
        ("Burrows' Delta", 0),
    ]

    bestMetricName = "Jensen-Shannon (250)"  #Jensen-Shannon+p
    bestMetricSigWork = []
    bestMetricSigAuthor = []

    evalTableOutput = []
    evalTableOutput.append("""\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1}
  \\begin{tabular}{| l | r | r |}
\\hline
 & \\multicolumn{2}{c|}{\\textbf{Percentage of segments most similar to a segment...}} \\\\

\\textbf{Metric}& \\textbf{from the same work} & \\textbf{by the same author} \\\\\\hline
""")

    sameWorkTableOutput = []
    sameAuthorTableOutput = []
    temp = """\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1}
  \\begin{tabular}{| l | c | c | c |}
\\hline
    """
    sameWorkTableOutput.append(temp)
    sameAuthorTableOutput.append(temp)

    temp = "& & \\textbf{Top %d +} & \\\\" % (topNum)
    sameWorkTableOutput.append(temp)
    sameAuthorTableOutput.append(temp)

    temp = "\\textbf{Metric}& \\textbf{Top %d} & \\textbf{Top %d in Poetry} & \\textbf{Top %d} \\\\\\hline" % (
        topNum, poetryNum, comparableNum)
    sameWorkTableOutput.append(temp)
    sameAuthorTableOutput.append(temp)

    workSigReport = []
    authorSigReport = []

    # & \\textbf{Sim to another work} & \\textbf{Closest to diff author} & \\textbf{Median}

    # Get the list of authors and works the metric got correct
    scoreLists = {}
    for simMetric in simMetrics:
        dir, metricName = simMetric
        scoreLists[metricName] = {}
        for i, params in enumerate([
            (False, False),
            (True, False),
            (False, True),
        ]):
            name = metricName
            addP, comparable = params
            metricTopStr = topStr
            if addP:
                metricTopStr += "+p"
                name += "+p"
            # look at comparable number of non-poetry words
            elif comparable:
                metricTopStr = comparableTopStr
                name += " (%d)" % comparableNum
            else:
                name += " (%d)" % topNum

            fname = "output/greek/no_split/%s/%s/metric/Books/scores.json" % (
                metricTopStr, dir)
            scores = utils.getContent(fname, True)
            scoreLists[metricName][i] = scores
            scoreLists[metricName][i]["name"] = name

    baseScores = []
    for bsi in baseScoreInfo:
        baseScoreMetric, baseScoreIndex = bsi
        baseScores.append(scoreLists[baseScoreMetric][baseScoreIndex])

    # Create a table of the information using the provided scores
    for metricName in scoreLists:
        cell2 = "\\textbf{%s}" % (metricName)
        cell3 = "\\textbf{%s}" % (metricName)
        for i in scoreLists[metricName]:
            currentScores = scoreLists[metricName][i]
            authorScores = currentScores["author"]
            workScores = currentScores["work"]
            name = currentScores["name"]
            sameWork = "%.2f%%" % (100 * np.mean(workScores))
            sameAuth = "%.2f%%" % (100 * np.mean(authorScores))
            # sameWork = "%.2f%%, (%d/%d)" % (100*np.mean(workScores), np.sum(workScores), len(workScores))
            # sameAuth = "%.2f%%, (%d/%d)" % (100*np.mean(authorScores), np.sum(authorScores), len(authorScores))

            # cell = "%s & %s & %s & %s & %s & %s" % (name, sameAuth, sameWork, otherWork, diffAuthClosest, median)
            cell = "%s & %s & %s" % (name, sameWork, sameAuth)
            cell = cell.replace("%", "\\%")
            evalTableOutput.append("%s\\\\\\hline" % cell)

            cell2 += " & %s" % (sameWork)  # work_p
            cell3 += " & %s" % (sameAuth)  # , author_p)

            for j, baseScore in enumerate(baseScores):
                a = baseScore["work"]
                b = currentScores["work"]
                work_t, work_p = stats.ttest_rel(a, b)
                workSigReport.append(name)
                # Degrees of freedom
                df = len(b) - 1
                workSig = "  (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                    np.mean(b), np.std(b), df, work_t, work_p)
                workSigReport.append(workSig)

                a = baseScore["author"]
                b = currentScores["author"]
                author_t, author_p = stats.ttest_rel(a, b)
                authorSigReport.append(name)
                # Degrees of freedom
                df = len(b) - 1
                authorSig = "  (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                    np.mean(b), np.std(b), df, author_t, author_p)
                authorSigReport.append(authorSig)

                if (name == bestMetricName or name == baseScore["name"]):
                    bestMetricSigWork.append("%s vs %s" %
                                             (name, baseScore["name"]))
                    bestMetricSigWork.append(workSig)

                    bestMetricSigAuthor.append("%s vs %s" %
                                               (name, baseScore["name"]))
                    bestMetricSigAuthor.append(authorSig)

                #print("  Author: t-statistic = %6.3f pvalue = %f" %  stats.ttest_rel(a, b))

                # Significance notes
                if (j == 0):
                    if (work_p < 0.01):
                        cell2 += "\\textbf{†}"
                    elif (work_p < 0.05):
                        cell2 += "\\textbf{*}"
                    if (author_p < 0.01):
                        cell3 += "\\textbf{†}"
                    elif (author_p < 0.05):
                        cell3 += "\\textbf{*}"
                else:
                    if (work_p < 0.01):
                        cell2 += "\\textbf{‡}"
                    if (author_p < 0.01):
                        cell3 += "\\textbf{‡}"

        cell2 = cell2.replace("%", "\\%")
        sameWorkTableOutput.append("%s\\\\\\hline" % cell2)

        cell3 = cell3.replace("%", "\\%")
        sameAuthorTableOutput.append("%s\\\\\\hline" % cell3)

    evalTableOutput.append("""
      \\end{tabular}
      \\caption{How well similarity metrics identify whether two segments come from the same work or the same author.}
      \\label{table:metric_eval}
    \\end{table}
    """)

    utils.safeWrite(
        "%smetric/extraInfo/metricEvalTable%s.tex" % (baseFolder, suffix),
        "\n".join(evalTableOutput))

    sameWorkTableOutput.append("\\end{tabular}")
    sameWorkTableOutput.append(
        "\\caption[How well similarity metrics based on a given set of words identify whether two segments come from the same work.]{"
    )
    sameWorkTableOutput.append(
        "How well similarity metrics based on a given set of words identify whether two segments come from the same work. \\newline"
    )
    sameWorkTableOutput.append(
        "†: Results very significant (p < 0.01) when compared to %s. \\newline"
        % baseScores[0]["name"])
    sameWorkTableOutput.append(
        "*: Results significant (p < 0.05) when compared to %s. \\newline" %
        baseScores[0]["name"])
    sameWorkTableOutput.append(
        "‡: Results very significant (p < 0.01) when compared to %s. " %
        baseScores[1]["name"])
    sameWorkTableOutput.append("}")
    sameWorkTableOutput.append("\\label{table:metric_eval_work}")
    sameWorkTableOutput.append("\\end{table}")

    utils.safeWrite("%smetric/sameWorkEvalTable%s.tex" % (baseFolder, suffix),
                    "\n".join(sameWorkTableOutput))

    sameAuthorTableOutput.append("\\end{tabular}")
    sameAuthorTableOutput.append(
        "\\caption[How well similarity metrics based on a given set of words identify whether two segments come from the same author.]{"
    )
    sameAuthorTableOutput.append(
        "How well similarity metrics based on a given set of words identify whether two segments come from the same author. \\newline"
    )
    sameAuthorTableOutput.append(
        "†: Results very significant (p < 0.01) when compared to %s. \\newline"
        % baseScores[0]["name"])
    sameAuthorTableOutput.append(
        "*: Results significant (p < 0.05) when compared to %s. \\newline" %
        baseScores[0]["name"])
    sameAuthorTableOutput.append(
        "‡: Results very significant (p < 0.01) when compared to %s. " %
        baseScores[1]["name"])
    sameAuthorTableOutput.append("}")
    sameAuthorTableOutput.append("\\label{table:metric_eval_author}")
    sameAuthorTableOutput.append("\\end{table}")

    utils.safeWrite(
        "%smetric/sameAuthorEvalTable%s.tex" % (baseFolder, suffix),
        "\n".join(sameAuthorTableOutput))

    sigReport = "Work:\n" + (
        "\n".join(bestMetricSigWork)) + "\n\n-------------\n\nAuthor:\n" + (
            "\n".join(bestMetricSigAuthor))
    utils.safeWrite(
        "%smetric/bestMetricSignificance%s.txt" % (baseFolder, suffix),
        sigReport)
    # utils.safeWrite("%smetric/bestMetricSignificanceWork%s.txt" % (baseFolder, suffix), "\n".join(bestMetricSigWork))
    # utils.safeWrite("%smetric/bestMetricSignificanceAuthor%s.txt" % (baseFolder, suffix), "\n".join(bestMetricSigAuthor))

    utils.safeWrite(
        "%smetric/extraInfo/metricSignificanceReportWork%s.txt" %
        (baseFolder, suffix), "\n".join(workSigReport))
    utils.safeWrite(
        "%smetric/extraInfo/metricSignificanceReportAuthor%s.txt" %
        (baseFolder, suffix), "\n".join(authorSigReport))
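
Both this function and makeMetricInternalTables above read per-metric scores.json files; judging from how they are used, they hold parallel arrays of 0/1 outcomes, as in this toy example (structure inferred, not confirmed):

import numpy as np

# Inferred shape of output/.../metric/Books/scores.json:
scores = {"work": [1, 0, 1, 1], "author": [1, 1, 0, 1]}

workScores = np.array(scores["work"])
print("%.2f%%, (%d/%d)" % (100 * np.mean(workScores),
                           np.sum(workScores), len(workScores)))
# -> 75.00%, (3/4)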
Example #31
0
RAW_FOLDER = "rawTexts/"
PARSED_FOLDER = "texts/"


# given a location, convert it from XML to the format we want
def convertBook(loc):
    filename = loc.replace(RAW_FOLDER, "")
    newLoc = PARSED_FOLDER + filename
    t = utils.XMLText(loc)
    res = t.convertFromXML()
    utils.safeWrite(newLoc, res, True)
    return newLoc, res["booksRaw"]


# get the available texts and count them up
available = utils.getContent(RAW_FOLDER + "available.json", True)

numTexts = 0
for o in available:
    workLocs = o["works"]
    for w in workLocs:
        numTexts += 1

# Parse each book
i = 1
allBooks = []
for o in available:
    workLocs = o["works"]
    for w in workLocs:
        if (i % 20 == 0):
            print("%d out of %d (%.2f%%)" % (i, numTexts,
Example #32
0
def fourCenturiesTables(topStr, simMetrics, baseFolder):
    comparisonOutput = []
    topSimsToExamine = 100

    # Grab this from the best metric
    authorSims = utils.getContent(
        "output/greek/no_split/%s/jensen-shannon/metric/Authors/sims.txt" %
        (topStr), False).split("\n")
    topDistantSims = []
    topDistantAuthors = {}
    for i, sim in enumerate(authorSims):
        centuries_apart = int(sim.split("(")[-1].split(" ")[0])
        if (centuries_apart >= 4 and i < topSimsToExamine):
            topDistantSims.append(sim)
            topDistantAuthors[sim[11:]] = {}

        authors = " (".join(sim.split(" - ")[1].split(" (")[:-1])
        if authors == "Isocrates, Lysias" or authors == "Plato, Xenophon" or authors == "AratusSolensis, Callimachus" or authors == "Herodotus, Thucydides":
            comparisonOutput.append("Rank %d: %s" % (i + 1, sim))

    fourCenturiesApartOutput = []
    fourCenturiesApartOutput.append(
        "%d of the top %d are at least 4 centuries apart." %
        (len(topDistantSims), topSimsToExamine))
    fourCenturiesApartOutput.append("---")
    fourCenturiesApartOutput.extend(topDistantSims)

    utils.safeWrite("%swordUse/fourCenturiesApart.txt" % baseFolder,
                    "\n".join(fourCenturiesApartOutput))

    # Comparison to English and Icelandic
    numGreek = len(authorSims)
    fracGreek = topSimsToExamine / numGreek
    numDistantGreek = len(topDistantSims)

    englishSims = utils.getContent(
        "output/english/no_split/%s/jensen-shannon/metric/Authors/sims.txt" %
        (topStr), False).split("\n")
    numEnglish = len(englishSims)
    topSimsEnglish = int(np.ceil(numEnglish * fracGreek))
    fracEnglish = topSimsEnglish / numEnglish
    numDistantEnglish = 0
    num2English = 0
    for sim in englishSims[:topSimsEnglish]:
        centuries_apart = int(sim.split("(")[-1].split(" ")[0])
        if (centuries_apart >= 2):
            num2English += 1
        if (centuries_apart >= 4):
            numDistantEnglish += 1

    iceSims = utils.getContent(
        "output/icelandic/no_split/%s/jensen-shannon/metric/Authors/sims.txt" %
        (topStr), False).split("\n")
    numIcelandic = len(iceSims)
    topSimsIcelandic = int(np.ceil(numIcelandic * fracGreek))
    fracIcelandic = topSimsIcelandic / numIcelandic
    numDistantIcelandic = 0
    for sim in iceSims[:topSimsIcelandic]:
        centuries_apart = int(sim.split("(")[-1].split(" ")[0])
        if (centuries_apart >= 4):
            numDistantIcelandic += 1

    comparisonOutput.append("\n=========\n")
    comparisonOutput.append("Top similar pairs")
    comparisonOutput.append("Greek:")
    comparisonOutput.append("  examining top %d of %d pairs (%.2f%%)" %
                            (topSimsToExamine, numGreek, 100 * fracGreek))
    comparisonOutput.append(
        "  %d (%.2f%%) are at least 4 centuries apart" %
        (numDistantGreek, 100 * numDistantGreek / topSimsToExamine))
    comparisonOutput.append("English:")
    comparisonOutput.append("  examining top %d of %d pairs (%.2f%%)" %
                            (topSimsEnglish, numEnglish, 100 * fracEnglish))
    comparisonOutput.append(
        "  %d (%.2f%%) are at least 4 centuries apart" %
        (numDistantEnglish, 100 * numDistantEnglish / topSimsEnglish))
    comparisonOutput.append("  %d (%.2f%%) are at least 2 centuries apart" %
                            (num2English, 100 * num2English / topSimsEnglish))
    comparisonOutput.append("Icelandic:")
    comparisonOutput.append(
        "  examining top %d of %d pairs (%.2f%%)" %
        (topSimsIcelandic, numIcelandic, 100 * fracIcelandic))
    comparisonOutput.append(
        "  %d (%.2f%%) are at least 4 centuries apart" %
        (numDistantIcelandic, 100 * numDistantIcelandic / topSimsIcelandic))

    utils.safeWrite("%swordUse/fourApartComparisonInfo.txt" % baseFolder,
                    "\n".join(comparisonOutput))

    # Table
    for simMetric in simMetrics:
        dir, name = simMetric
        # "" or "+p" depending on which is better
        metricSims = utils.getContent(
            "output/greek/no_split/%s/%s/metric/Authors/sims.txt" %
            (topStr, dir), False).split("\n")
        for i, sim in enumerate(metricSims):
            pairName = sim[11:]
            if pairName in topDistantAuthors:
                topDistantAuthors[pairName][dir] = i + 1

    # prepare values for coloring table cells
    maxVal = 0
    minVal = 1000000

    for authorPair in topDistantAuthors:
        for simDir, _ in simMetrics:
            val = topDistantAuthors[authorPair][simDir]
            minVal = min(minVal, val)
            maxVal = max(maxVal, val)

    pairRankOutput = []
    pairRankOutputSimple = []
    pairRankOutput.append("""
    \\begin{table}[!bt]
      \\centering
      \\def\\arraystretch{1}
      \\begin{tabular}{| l | c | c | c | c | c | c |}
    \\hline
    & \\multicolumn{5}{c|}{\\textbf{Rank according to}} \\\\
    & \\textbf{Jensen-} & \\textbf{Burrows'} & & & & \\\\
    \\textbf{Authors} & \\textbf{Shannon} & \\textbf{Delta} & \\textbf{Min-Max} & \\textbf{Manhattan} & \\textbf{Canberra} & \\textbf{Cosine} \\\\\\hline
    """)
    pairRankOutputSimple.append("%s,%s,%s,%s,%s,%s,%s" %
                                ("Authors", "Jensen-Shannon", "Burrow's Delta",
                                 "Min-Max", "Manhattan", "Canberra", "Cosine"))
    authorConvert = {
        "ApolloniusRhodius": "Apollonius",
        "DionysiusOfHalicarnassus": "Dionysius",
        "EusebiusOfCaesarea": "Eusebius",
        "ClementOfAlexandria": "Clement",
        "BasilBishopOfCaesarea": "Basil",
        "Anonymous(Hymns_Aphrodite)": "Hymns Aphrodite",
        "Anonymous(Hymns_Apollo)": "Hymns Apollo",
        "Anonymous(Hymns_Demeter)": "Hymns Demeter",
        "Anonymous(Hymns_Hermes)": "Hymns Hermes",
        "Anonymous(Hymns_Rest)": "Hymns Rest",
    }
    for authorPair in topDistantAuthors:
        pair = "(".join(authorPair.split(" (")[:-1])
        pairSplit = pair.split(", ")
        author1 = pairSplit[0]
        author2 = pairSplit[1]

        if author1 in authorConvert:
            author1 = authorConvert[author1]
        if author2 in authorConvert:
            author2 = authorConvert[author2]

        pairName = author1 + ", " + author2
        cell = "%s &" % pairName
        cellSimple = "%s," % re.sub(", ", "/", pairName)
        firstVal = None
        for simDir, _ in simMetrics:
            val = topDistantAuthors[authorPair][simDir]

            cutoff = 100
            if (val < cutoff):
                r, g, b = colorConvert(minVal, cutoff, val, COLOR_ORANGE,
                                       COLOR_GRAY)
            else:
                r, g, b = colorConvert(cutoff, maxVal, val, COLOR_GRAY,
                                       COLOR_BLUE)
            cell += "\\cellcolor[rgb]{%.3f,%.3f,%.3f} " % (r, g, b)

            if (firstVal is None):
                firstVal = val
                cell += "%d & " % (val)
                cellSimple += "%d," % (val)
            else:
                cell += "%d (%+d) & " % (val, firstVal - val)
                rel = "(%d)" % (firstVal - val)
                cellSimple += "%d %s," % (val, rel)
        cell = cell[:-2]
        pairRankOutput.append("%s\\\\\\hline" % cell)
        pairRankOutputSimple.append(cellSimple)
    pairRankOutput.append("""
      \\end{tabular}
      \\caption{Rank of these pair's similarity by different metrics.}
      \\label{table:pair_rank}
    \\end{table}
    """)

    utils.safeWrite("%swordUse/pairRankTable.tex" % baseFolder,
                    "\n".join(pairRankOutput))
    utils.safeWrite("%swordUse/pairRankTableSimple.csv" % baseFolder,
                    "\n".join(pairRankOutputSimple))