def getSkippedWordInfo(baseFolder):
    output = []
    splitter = "\n------\n"

    output.append("Greek:\n")
    output.append(
        utils.getContent("output/greek/no_split/top250/chosenWordInfo.txt",
                         False))
    output.append("\nPoetry:")
    output.append(
        utils.getContent(
            "output/greek/no_split/top250+p/chosenWordInfoPoetry.txt", False))
    output.append(splitter)

    output.append("English:\n")
    output.append(
        utils.getContent("output/english/no_split/top250/chosenWordInfo.txt",
                         False))
    output.append("\nPoetry:")
    output.append(
        utils.getContent(
            "output/english/no_split/top250+p/chosenWordInfoPoetry.txt",
            False))
    output.append(splitter)

    output.append("Icelandic:\n")
    output.append(
        utils.getContent("output/icelandic/no_split/top250/chosenWordInfo.txt",
                         False))
    output.append(splitter)

    utils.safeWrite("%s/skippedWords.txt" % baseFolder, "\n".join(output))
def getTokenColorMap(saveDir, topWords, topName):
    numTops = len(topWords)
    tokenMap = []

    usePrecomputed = True
    if (usePrecomputed):
        fname = "%scolorByIndex.json" % (saveDir)
        colors = utils.getContent(fname, True)
        for c in colors:
            tokenMap.append((c[0], c[1], c[2]))
    else:
        fname = "%s../wordCountData/wordPrincipalComponents_%d.json" % (
            saveDir, topName)
        components = np.array(utils.getContent(fname, True))

        skipFirst = 4
        minVals = np.min(wordPCAFitTransform(components), axis=0)
        valRange = np.max(wordPCAFitTransform(components), axis=0) - minVals
        normalizedComponents = np.round(255 * np.clip(
            (components - minVals) / valRange, 0, 1))

        for i in range(numTops):
            comps = normalizedComponents[i]
            rgb = (int(comps[0]), int(comps[1]), int(comps[2]))
            tokenMap.append(rgb)

    width = 400
    height = 20 * numTops
    im = Image.new("RGB", (width, height), "#FFFFFF")

    # get drawing context
    d = ImageDraw.Draw(im)
    # get a font
    fnt = ImageFont.truetype('fonts/DejaVuSans.ttf', 16)

    includedColors = {}
    colorList = []
    for i in range(numTops):
        rgb = tokenMap[i]
        baseY = 20 * i
        colorValuesText = "(%03d,%03d,%03d) " % rgb

        # keep track of each new color
        if not (colorValuesText in includedColors):
            includedColors[colorValuesText] = True
            colorList.append(rgb)

        text = colorValuesText + topWords[i]
        d.text((50, baseY + 2), text, font=fnt, fill=(0, 0, 0))
        d.rectangle(((10, baseY + 2), (40, baseY + 18)), fill=rgb)

    fname = saveDir + "images/key.png"
    utils.check_and_create_path(fname)
    im.save(fname)

    return tokenMap, colorList
def getMetricInfo(topStr, comparableTopStr, topNum, poetryNum, comparableNum,
                  simMetrics, baseFolder):
    # Copy full eval files for jensen-shannon
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Books/comparisonInfo.txt %smetric/extraInfo/metricEvaluation_tops.txt"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Books/comparisonInfo.txt %smetric/extraInfo/metricEvaluation_+p.txt"
        % (topStr, baseFolder),
        shell=True)

    # Grab median distance
    fname = "output/greek/no_split/%s/jensen-shannon/metric/Books/comparisonInfo.txt" % (
        topStr)
    metricEvalInfo = utils.getContent(
        fname, False).split("=========")[-2].split("\n")[2:-1]
    sameAuthorRanks = []
    for i, line in enumerate(metricEvalInfo):
        sameAuthorRank = line.split("with same author: ")[1].split(".")[0]
        sameAuthorRanks.append(int(sameAuthorRank))

    median = np.median(sameAuthorRanks)
    utils.safeWrite(
        "%smetric/extraInfo/medianForDifferentAuthor.txt" % (baseFolder),
        "Median distance for closest author: %f" % median)

    # get info on the indica
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Books/sims/Arrian.Indica.1.txt %smetric/extraInfo/arrianIndica.txt"
        % (topStr, baseFolder),
        shell=True)

    # Info on book distance
    # Grab this from the best metric
    fname = "output/greek/no_split/%s/jensen-shannon/metric/Books/sims.txt" % (
        topStr)
    allBookSims = utils.getContent(fname, False).split("\n")
    utils.safeWrite("%smetric/lowestSimilarity.txt" % (baseFolder),
                    "Lowest similarity between segments: %s" % allBookSims[-1])

    # Info on top similar authors
    makeTopAuthorTable(topStr, baseFolder)

    # ===============================

    makeMetricEvalTables("", topStr, comparableTopStr, topNum, poetryNum,
                         comparableNum, simMetrics, baseFolder)
def dota_news(message):
    if intime(message):
        cid = getCID(message)
        content = getContent(message)
        url = "http://api.steampowered.com/ISteamNews/GetNewsForApp/v0002/?appid=570&count=1&maxlength=300&format=json"
        request = requests.get(url)
        data = request.json()
        if content != "?":
            if request.status_code == 200:
                title = data['appnews']['newsitems'][0]['title']
                content = data['appnews']['newsitems'][0]['contents']
                content_nice = content.replace(" - ", "\n - ")
                content_nice = content_nice.replace("*", "\n*")
                content_nice = parser.unescape(content_nice)
                url = data['appnews']['newsitems'][0]['url']
                bot.send_message(
                    cid,
                    u'*{title}*```> \n{content_nice}\n[...]\n```'.format(
                        title=title, content_nice=content_nice) +
                    u'[More info here]({url})'.format(url=url),
                    parse_mode="Markdown",
                    disable_web_page_preview=True)
            else:
                bot.reply_to(
                    message,
                    "`There has been an error, the number {error} to be specific.`"
                    .format(error=request.status_code),
                    parse_mode="Markdown")
        else:
            bot.reply_to(
                message,
                "`Send this command alone and I will show you the last Steam News for Dota2 entry`",
                parse_mode="Markdown")
def loadWCData(saveDir, dataSplit, topName, type=""):
    wcData = utils.getContent(getWCFilename(saveDir, topName, type), True)

    # load author data
    authors = []
    for key in wcData["authors"]:
        a = wcData["authors"][key]
        authorName = a["name"]
        auth = utils.Author(authorName)
        auth.counts = a["counts"]
        auth.totalTokenCount = np.sum(a["counts"])
        authors.append(auth)

    # load book data
    books = []
    for key in wcData["books"]:
        b = wcData["books"][key]
        raw = {"bookText": "", "bookNumber": b["number"]}
        book = utils.Book(raw, b["name"], b["author"])
        book.counts = b["counts"]
        book.numTokens = np.sum(b["counts"])
        books.append(book)

    topWords = wcData["topWords"]

    calculateFrequencies(authors, books, topWords)

    return authors, books, topWords
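# A minimal sketch (not part of the original source) of the word-count JSON
# structure that loadWCData appears to expect, inferred from the key accesses
# above. The author, book, and count values below are hypothetical placeholders.
EXAMPLE_WC_DATA = {
    "authors": {
        "ExampleAuthor": {"name": "ExampleAuthor", "counts": [120, 95, 40]},
    },
    "books": {
        "ExampleAuthor.Work.1": {
            "number": 1,
            "name": "Work.1",
            "author": "ExampleAuthor",
            "counts": [12, 9, 4],
        },
    },
    "topWords": ["token_a", "token_b", "token_c"],
}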
def makeTopAuthorTable(topStr, baseFolder):
    # Grab this from the best metric
    fname = "output/greek/no_split/%s/jensen-shannon/metric/Authors/sims.txt" % (
        topStr)
    allAuthorSims = utils.getContent(fname, False).split("\n")

    topAuthorPairs = []
    topAuthorPairs.append("""\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1.2}
  \\begin{tabular}{| r | l | l | l | l |} \\hline
    & \\textbf{Author 1} & \\textbf{Author 2} & \\textbf{Score} & \\textbf{Notes} \\\\\\hline
""")
    for i, pair in enumerate(allAuthorSims[:10]):
        splt1 = pair.split(" - ")
        sim = splt1[0]
        auths = splt1[1].split(" (")[0].split(", ")
        topAuthorPairs.append("    %.2d & %s & %s & %s & TODO \\\\\\hline" %
                              (i + 1, auths[0], auths[1], sim))
    topAuthorPairs.append("""  \\end{tabular}
  \\caption{Top author pairs by similarity score according to Jensen-Shannon Similarity.}
  \\label{table:top_author_pairs}
\\end{table}
""")

    utils.safeWrite("%smetric/topAuthorPairs.tex" % baseFolder,
                    "\n".join(topAuthorPairs))
def pro_matches(message):
    """Gets recent pro matches, will give a number of matches equal to argument."""
    default_number_of_posts = 5
    posts_max = 20
    if intime(message):
        cid = getCID(message)
        param = getContent(message)
        try:
            param = int(param)
        except ValueError:
            param = 0
        number_of_posts = param if 0 < param <= posts_max else default_number_of_posts

        open_dota_url = 'https://api.opendota.com/api/proMatches'
        response = requests.get(open_dota_url)
        response_json = response.json()  # Array of 100 most recent pro matches
        matches_json = response_json[:number_of_posts]

        matches_text = []
        for match_json in matches_json:
            matches_text.append(match_short_description(match_json))

        message_text = 'Last {number} pro matches:'.format(
            number=number_of_posts)
        for match_text in matches_text:
            message_text = message_text + '\n{match}'.format(match=match_text)

        bot.send_message(cid,
                         message_text,
                         disable_web_page_preview=True,
                         parse_mode="Markdown")
def getTokenColorMapMultiRun(saveDir, topWords, topName):
    numTops = len(topWords)

    tokenMaps = []
    fnames = ["%scolorByIndex.json" % (saveDir)]
    for file in utils.listFiles("%sextra_runs/" % saveDir):
        fnames.append("%sextra_runs/%s" % (saveDir, file))

    for fname in fnames:
        colors = utils.getContent(fname, True)
        tokenMap = []
        for c in colors:
            tokenMap.append((c[0], c[1], c[2]))
        tokenMaps.append(tokenMap)

    text_end = 0
    rect_width = 12
    rect_margin_h = 5
    rect_height = 5
    rect_top = 1
    rect_bottom = 4

    width = text_end + len(tokenMaps) * (rect_margin_h + rect_width) + rect_margin_h
    height = rect_height * numTops + 5
    im = Image.new("RGB", (width, height), "#FFFFFF")

    # get drawing context
    d = ImageDraw.Draw(im)
    # get a font
    fnt = ImageFont.truetype('fonts/DejaVuSans.ttf', int(0.8 * rect_height))

    includedColors = {}
    colorList = []

    # draw text labels
    for i in range(numTops):
        baseY = rect_height * i

        # text = topWords[i]
        # text_width, _ = d.textsize(text, font=fnt)
        #
        # d.text((text_end - text_width, baseY + rect_top), text, font=fnt, fill=(0, 0, 0))

        rect_right = text_end
        # draw groupings for this word
        for tm in tokenMaps:
            rgb = tm[i]
            rect_left = rect_right + rect_margin_h
            rect_right = rect_left + rect_width
            d.rectangle(((rect_left, baseY + rect_top),
                         (rect_right, baseY + rect_bottom)),
                        fill=rgb)

    fname = saveDir + "images/groupingCompare.png"
    utils.check_and_create_path(fname)
    im.save(fname)
def computeHash(fname):
    """ Pattern-like """
    content = getContent(fname)

    # Hash
    hf = hashlib.md5()  # Possibility to change hash function
    hf.update(content)
    h = hf.digest()

    return h
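# A minimal usage sketch (not from the original source): comparing a stored
# digest against a freshly computed one as an integrity check. It assumes
# getContent(fname) returns bytes, since hashlib.md5().update() requires bytes;
# the helper name below is hypothetical.
def exampleIntegrityCheck(fname, storedHash):
    # Recompute the digest and compare it to the previously stored value.
    return computeHash(fname) == storedHash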
def getTextCounts(textLocation, saveDir):
    subprocess.run("cp %savailable.json %savailable.json" %
                   (textLocation, saveDir),
                   shell=True)
    available = utils.getContent(textLocation + "available.json", True)

    # For each available text
    for i, o in enumerate(available):
        if (i % 20 == 0):
            print(i, end=" ", flush=True)
        workLocs = o["works"]

        # Process each work
        for w in workLocs:
            t = utils.getContent(w["location"], True)
            booksRaw = t["booksRaw"]
            booksCounts = []
            for b in booksRaw:
                rawTokens = re.sub(r'\.,;:᾽῾\'', "", b["bookText"]).split(" ")
                tokenCounts = {}
                for token in rawTokens:
                    if (token == ""):
                        continue
                    if not (token in tokenCounts):
                        tokenCounts[token] = 1
                    else:
                        tokenCounts[token] += 1

                bookWithCounts = {}
                bookWithCounts["bookNumber"] = b["bookNumber"]
                bookWithCounts["bookTokenCounts"] = tokenCounts
                bookWithCounts["bookText"] = ""
                booksCounts.append(bookWithCounts)

            t["booksRaw"] = booksCounts

            # Remove "texts/" from start
            filename = "textCounts/" + w["location"][6:]
            utils.safeWrite(filename, t, True)
def process(logfile):
    """ Take a given log file and return the system name, along with
    all the dated entries in a dictionary """
    print('processing log file:', logfile)
    loglines = getContent(logfile)  # getContent() returns reversed lines for pop()
    return gather_data(logfile, loglines)  # gather_data() returns sysname, sysobjlist
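# A minimal usage sketch (not from the original source), assuming process()
# returns (sysname, sysobjlist) as the inline comments above suggest; the
# directory name and ".log" suffix are hypothetical.
def exampleProcessAll(logDir):
    import os
    results = {}
    for entry in os.listdir(logDir):
        if entry.endswith(".log"):
            sysname, sysobjlist = process(os.path.join(logDir, entry))
            results[sysname] = sysobjlist
    return results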
def getOverlapInfo(baseFolder):
    output = []
    splitter = "\n------\n"

    output.append("Greek:\n")
    output.append(
        utils.getContent("output/greek/topWordOverlapOverTime.txt", False))
    output.append(splitter)

    output.append("English:\n")
    output.append(
        utils.getContent("output/english/topWordOverlapOverTime.txt", False))
    output.append(splitter)

    output.append("Icelandic:\n")
    output.append(
        utils.getContent("output/icelandic/topWordOverlapOverTime.txt", False))
    output.append(splitter)

    utils.safeWrite("%s/topWordOverlapOverTime.txt" % baseFolder,
                    "\n".join(output))
def combineTexts(textName, sourceTexts):
    allLines = []
    for source in sourceTexts:
        inFileName = utils.getTextFn(source)
        lines = utils.getContent(inFileName, True)
        allLines.extend(lines)

    jsonDump = json.dumps(allLines)
    outFileName = utils.getTextFn(textName)
    utils.safeWrite(outFileName, jsonDump)
def makeMLTable(source, norm, filename):
    output = []
    output.append("""\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1.2}
""")
    # No naive bayes if normed due to negative data
    if norm:
        output.append("  \\begin{tabular}{| r | l | l |} \\hline")
        output.append(
            "    \\textbf{Prediction Task} & \\textbf{Majority Class} & \\textbf{KNN} \\\\\\hline"
        )
    else:
        output.append("  \\begin{tabular}{| r | l | l | l |} \\hline")
        output.append(
            "    \\textbf{Prediction Task} & \\textbf{Majority Class} & \\textbf{KNN} & \\textbf{Naive Bayes} \\\\\\hline"
        )

    for t in ["Authors", "Books", "Books_2"]:
        cats = ["genre", "dialect", "timeframe"]
        if (t == "Books"):
            cats.append("author")
        if (t == "Books_2"):
            cats = ["work", "genre", "dialect", "timeframe", "author"]
        for cat in cats:
            fname = source + "res_%s_%s.txt" % (cat, t)
            lines = utils.getContent(fname, False).split("\n")
            maj_class = lines[1].split(" - ")[0].strip()
            knn = lines[2].split(" - ")[0].strip()
            naive_bayes = lines[3].split(" - ")[0].strip()

            t_name = t
            if t_name == "Books":
                t_name = "Segments"
            if t_name == "Books_2":
                t_name = "Segments*"

            if norm:
                output.append("    %s of %s & %s & %s \\\\\\hline" %
                              (cat, t_name, maj_class, knn))
            else:
                output.append("    %s of %s & %s & %s & %s \\\\\\hline" %
                              (cat, t_name, maj_class, knn, naive_bayes))

    output.append("""  \\end{tabular}
  \\caption{Results of running simple machine learning on the frequency data.}
  \\label{table:ml+p}
\\end{table}
""")

    utils.safeWrite(filename, "\n".join(output))
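# A minimal sketch (not part of the original source) of the per-task result
# file that makeMLTable appears to parse: line 0 is a header, and lines 1-3
# hold the majority-class, KNN, and Naive Bayes results, each followed by
# " - " and extra detail. The label wording and numbers below are hypothetical.
EXAMPLE_RES_FILE = "\n".join([
    "Predicting genre of Authors",
    "0.61 - majority class baseline",
    "0.74 - k-nearest neighbors",
    "0.69 - naive bayes",
])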
def getAuthorBookCounts(baseFolder):
    ab_counts_output = []
    splitter = "\n------\n"

    ab_counts_output.append("Greek:\n")
    ab_counts_output.append(
        utils.getContent("output/greek/numberOfAuthors_Books.txt", False))
    ab_counts_output.append(
        utils.getContent("output/greek/numberOfTypes_Tokens.txt", False))
    ab_counts_output.append(splitter)

    ab_counts_output.append("English:\n")
    ab_counts_output.append(
        utils.getContent("output/english/numberOfAuthors_Books.txt", False))
    ab_counts_output.append(
        utils.getContent("output/english/numberOfTypes_Tokens.txt", False))
    ab_counts_output.append(splitter)

    ab_counts_output.append("Icelandic:\n")
    ab_counts_output.append(
        utils.getContent("output/icelandic/numberOfAuthors_Books.txt", False))
    ab_counts_output.append(
        utils.getContent("output/icelandic/numberOfTypes_Tokens.txt", False))
    ab_counts_output.append(splitter)

    utils.safeWrite("%s/AuthorBookNumbers.txt" % baseFolder,
                    "\n".join(ab_counts_output))
def printKeyWords(dataSplit, top, subsetSize, language, saveDirBase):
    topName, _, _ = top

    # calculate save directory based on input parameters
    saveDir = saveDirBase + "%s" % (topName)
    keyWordsDir = saveDir + "/wordImportance/keyWords/"

    # find all the relevant json files
    files = os.listdir(keyWordsDir)
    for f in files:
        if f[-5:] == ".json":
            nameCore = f.split(".json")[0]

            # get the word info for this author pair
            words = utils.getContent(keyWordsDir + f, True)

            # get the authors
            authors = nameCore.split("_")
            a1 = authors[0]
            a2 = authors[1]
            print(a1, a2)

            # save dir for new files
            wordsDir = keyWordsDir + nameCore + "/"

            # gather the list of words and print them out along with percentiles
            wordList = []
            out = ["index, percentile, token"]
            for word in words:
                wordList.append("%03d_%s" % (words[word][0] + 1, word))
                out.append("%d, %.2f, %s" %
                           (words[word][0], words[word][1], word))
            utils.safeWrite(wordsDir + "words.txt", "\n".join(out))

            # get the info for each occurrence of the given words
            # associated with these authors
            target = {
                a1: wordList,
                a2: wordList,
            }

            printOccs(wordsDir, target, language)
def visualizeWordOrder(authors, books, baseSaveDir, topWords, topName):
    baseSaveDir += "textsOnlyTopWords/"

    for numGroups in getWordGroupsRange(len(topWords)):
        if (numGroups == -1):
            print("  part of speech groups...")
            saveDir = "%spos_group/" % (baseSaveDir)
        else:
            print("  %d groups..." % numGroups)
            saveDir = "%s%d_group/" % (baseSaveDir, numGroups)

        # generate color map
        tokenToColor, colorList = getTokenColorMap(saveDir, topWords, topName)

        maxHeight = 0
        bars = {}

        # Create visualizations of each individual word colored by its group.
        for author in authors:
            # get author tokens as a block
            fname = baseSaveDir + "lists/authors/" + author.getSaveName() + ".json"
            tokens = utils.getContent(fname, True)
            arr = []
            for t in tokens:
                arr.append(tokenToColor[t])

            counts = barsFromRGBArray(arr, author.totalTokenCount, colorList)
            bars[author.authorName] = counts
            mh = np.max(counts)
            if mh > maxHeight:
                maxHeight = mh

            # This is not used for the paper but is rather interesting, as it lets
            # you potentially *see* different books, and certainly lets you see
            # different word usage
            # fname = saveDir + "images/authors_in_order/" + author.getSaveName() + ".png"
            # imageFromRGBArray(arr, fname)
            #
            # # get author's tokens divided by book
            # bookTokensArr = []
            # for book in books:
            #     if book.author == author.authorName:
            #         fname = baseSaveDir + "lists/books/" + book.getSaveName() + ".json"
            #         tokens = utils.getContent(fname, True)
            #         bookArr = []
            #         for t in tokens:
            #             bookArr.append(tokenToColor[t])
            #         bookTokensArr.append(bookArr)
            #
            # fname = saveDir + "images/authors-divided/" + author.getSaveName() + ".png"
            # imageFromRGBArrays(bookTokensArr, fname)

        # Graph word use bar charts now that we know the maximum scale.
        yHeight = (np.ceil(maxHeight * 100.0)) / 100.0
        groupLabels = utils.getContent(saveDir + "groupLabels.json", True)
        title = "Group Frequency"

        # for author in authors:
        #     fname = saveDir + "images/authors_bars/" + author.getSaveName()
        #     graphUtils.wordUseBarChart(bars[author.authorName], colorList, yHeight, groupLabels, title, fname)

        quadList = [
            ("demosthenesHomer",
             ["AeliusAristides", "Demosthenes", "ApolloniusRhodius", "Homer"]),
            ("clementThucydides",
             ["JohnOfDamascus", "ClementOfAlexandria", "Appian", "Thucydides"]),
        ]
        for saveName, quad in quadList:
            fname = saveDir + "images/" + saveName
            counts4 = []
            for authorName in quad:
                counts4.append(bars[authorName])
            graphUtils.wordUseBarChart4Up(counts4, colorList, yHeight,
                                          groupLabels, quad, fname)

        # octo = ["ApolloniusRhodius", "Homer", "AeliusAristides", "Demosthenes", "Appian", "Thucydides", "JohnOfDamascus", "ClementOfAlexandria"]
        octo = [
            "Homer", "ApolloniusRhodius", "Demosthenes", "AeliusAristides",
            "Thucydides", "Appian", "ClementOfAlexandria", "JohnOfDamascus"
        ]
        fname = saveDir + "images/dhct"
        counts8 = []
        for authorName in octo:
            counts8.append(bars[authorName])
        graphUtils.wordUseBarChart8Up(counts8, colorList, yHeight, groupLabels,
                                      octo, fname)

        # utils.safeWrite(saveDir + "textsOnlyTopWords/images/authors/" + author.getSaveName() + ".json", tokens, dumpJSON=True)

        # Group by author
        numGroups = len(colorList)
        groups = []
        for i in range(numGroups):
            groups.append([])
        for author in bars:
            for i in range(numGroups):
                groups[i].append([author, bars[author][i]])

        for i, group in enumerate(groups):
            groupName = groupLabels[i]
            g = sorted(group, key=lambda x: x[1], reverse=True)

            tickLabels = []
            data = []
            dataErr = []
            # for each author
            for a in g:
                tickLabels.append(a[0].replace("Anonymous", "Anon "))
                data.append(a[1])
                dataErr.append(0)

            fname = "byAuthor/%.2d_%s" % (i + 1, groupName)
            graphUtils.authorWordUseHistogram(data, ["Freq"], tickLabels,
                                              "Word usage for group %s" % groupName,
                                              "Frequency", saveDir, fname, True,
                                              color=colorList[i])
def groupWordsMultipleRuns(topName, topWords, baseSaveDir):
    fname = baseSaveDir + "wordCountData/wordCountByText_%s.json" % (topName)
    rawCounts = np.array(utils.getContent(fname, True))
    tokensByItem = np.sum(rawCounts, axis=0)
    data = rawCounts / tokensByItem

    baseSaveDir = baseSaveDir + "textsOnlyTopWords/"

    # cluster data
    for numGroups in getWordGroupsRangeTest(len(topWords)):
        fname = "%s%d_group/colorByIndex.json" % (baseSaveDir, numGroups)
        baseColorsRaw = utils.getContent(fname, True)
        baseColors = []
        colorIndices = {}
        indexToColor = {}
        indexToColorName = [
            "Black", "Yellow", "Purple", "Orange", "Blue", "Red", "Tan",
            "Gray", "Green"
        ]
        numColors = 0
        labelToIndicesBase = {}
        for i, c in enumerate(baseColorsRaw):
            colorString = "%d,%d,%d" % (c[0], c[1], c[2])
            baseColors.append(colorString)
            if not (colorString in colorIndices):
                colorIndices[colorString] = numColors
                indexToColor[numColors] = colorString
                numColors += 1
                labelToIndicesBase[colorString] = [i]
            else:
                labelToIndicesBase[colorString].append(i)

        saveDir = "%s%d_group/extra_runs/" % (baseSaveDir, numGroups)
        print("  calculating extra for %.2d groups..." % numGroups,
              end=" ",
              flush=True)

        # we already ran the 0th run
        for run in range(1, RUNS):
            print(run, end=" ", flush=True)
            startOffset = 1

            # Make deterministic using group seed
            # 10000
            kmeans = cluster.KMeans(n_clusters=numGroups,
                                    n_init=1000,
                                    random_state=GROUP_SEED + run)
            kmeans.fit(data)
            wordLabels = kmeans.labels_

            # rename groups and keep track of color associated with each word

            # get the indices for each label
            labelToIndices = {}
            maxLabel = -1
            for i in range(len(topWords)):
                label = wordLabels[i]
                if label in labelToIndices:
                    labelToIndices[label].append(i)
                else:
                    labelToIndices[label] = [i]
                if label > maxLabel:
                    maxLabel = label

            # store colors used already
            takenColors = {}
            # this will convert from label to color
            labelToColor = {}
            # store labels already assigned
            takenLabels = {}
            unassignedColors = []

            # Go through each color in original grouping, assign it to the
            # group in this grouping that most closely matches it.
            for i in range(maxLabel + 1):
                labelCounts = np.full((numColors), 0)
                for j in labelToIndicesBase[indexToColor[i]]:
                    labelCounts[wordLabels[j]] += 1

                # find colors with highest overlap
                bestLabels = np.flipud(np.argsort(labelCounts))
                # print(labelCounts)
                # print(bestLabels)
                for j in bestLabels:
                    # If there are no longer any matches
                    if (labelCounts[j] == 0):
                        # print("No valid label for color %s" % indexToColorName[i])
                        unassignedColors.append(i)
                        break
                    # print("trying to assign color %s to best label %d" % (indexToColorName[i], j))
                    if not (j in takenLabels):
                        splt = indexToColor[i].split(",")
                        labelToColor[j] = (int(splt[0]), int(splt[1]),
                                           int(splt[2]))
                        takenLabels[j] = True
                        # print(labelToIndices[j])
                        break
                # print("---")

            # assign labels that aren't taken
            for i in range(maxLabel + 1):
                # ignore taken colors
                if i in labelToColor:
                    continue

                freeColorIndex = unassignedColors[0]
                splt = indexToColor[freeColorIndex].split(",")
                labelToColor[i] = (int(splt[0]), int(splt[1]), int(splt[2]))
                unassignedColors = unassignedColors[1:]

            # print("========")

            colorsUsed = []
            for i in range(len(topWords)):
                colorsUsed.append(labelToColor[wordLabels[i]])

            # save used colors
            fname = "%s/groups_%.3d.json" % (saveDir, run)
            utils.safeWrite(fname, colorsUsed, True)

        print("")
def groupAndPlotWords(topName, topWords, wordToPOS, baseSaveDir,
                      groupings=None):
    fname = baseSaveDir + "wordCountData/wordCountByText_%s.json" % (topName)
    rawCounts = np.array(utils.getContent(fname, True))
    tokensByItem = np.sum(rawCounts, axis=0)
    data = rawCounts / tokensByItem

    baseSaveDir = baseSaveDir + "textsOnlyTopWords/"

    if (groupings == None):
        groupings = getWordGroupsRange(len(topWords))

    # print(data)

    # cluster data
    for numGroups in groupings:
        startOffset = 1
        if (numGroups == -1):
            saveDir = "%spos_group/" % (baseSaveDir)
            print("  calculating for part of speech groups")
            wordLabels = []
            for word in topWords:
                if not (word in wordToPOS):
                    raise Exception(
                        "Word %s not in part of speech dictionary" % word)
                wordLabels.append(wordToPOS[word])
        else:
            saveDir = "%s%d_group/" % (baseSaveDir, numGroups)
            print("  calculating for %d groups" % numGroups)

            # Make deterministic using group seed
            kmeans = cluster.KMeans(n_clusters=numGroups,
                                    n_init=10000,
                                    random_state=GROUP_SEED)
            kmeans.fit(data)
            wordLabels = kmeans.labels_

        # rename groups and keep track of color associated with each word
        labelsSeen = 0
        labelConversion = {}
        target = []
        firstWords = []
        colorsUsed = []
        for i in range(len(topWords)):
            if not (wordLabels[i] in labelConversion):
                labelConversion[wordLabels[i]] = labelsSeen
                labelsSeen += 1
                firstWords.append(topWords[i])
            label = labelConversion[wordLabels[i]]
            target.append(label)
            colorsUsed.append(KELLY_COLORS[startOffset + label])

        # save used colors
        fname = "%s/colorByIndex.json" % (saveDir)
        utils.safeWrite(fname, colorsUsed, True)

        # create labels
        targetLabels = []
        for i in range(numGroups):
            targetLabels.append("Group %d (%s)" % (i + 1, firstWords[i]))

        # group data and colors
        targetList = []
        targetList.append({
            "name": "Word_Groupings_",
            "target": np.array(target),
            "labels": targetLabels
        })
        dataSet = graphUtils.Dataset(data, targetList)

        # Save group labels
        groupLabels = firstWords
        if (numGroups == -1):
            origGroupLabels = [
                "noun", "verb", "adj", "adv", "pron", "article", "prep",
                "conj", "partic"
            ]
            groupLabels = ["", "", "", "", "", "", "", "", ""]
            for i in range(len(origGroupLabels)):
                groupLabels[labelConversion[str(i)]] = origGroupLabels[i]
        utils.safeWrite(saveDir + "groupLabels.json", groupLabels, True)

        # graph the data
        tSNEDir = saveDir + "tSNE/"
        colors = KELLY_COLORS[startOffset:startOffset + numGroups]
        for u in [False]:  # [False, True]:
            graphUtils.tSNE_2D(dataSet,
                               topWords,
                               20.0,
                               True,
                               tSNEDir,
                               True,
                               predefinedColors=colors,
                               verbose=False,
                               useUMAP=u)
def getWordUseInfo(topStr, baseFolder):
    # total +p words
    tops = utils.getContent(
        "output/greek/no_split/%s/wordInfo_%s.txt" % (topStr, topStr),
        False).split("\n")[1:]
    poetrys = utils.getContent(
        "output/greek/no_split/top_p/wordInfo_top_p.txt",
        False).split("\n")[1:]
    # Top plus poetry
    totals = utils.getContent(
        "output/greek/no_split/%s+p/wordInfo_%s+p.txt" % (topStr, topStr),
        False).split("\n")[1:]

    numWordsOutput = []
    numWordsOutput.append("Number of Top Words: %d" % len(tops))
    numWordsOutput.append("Number of Poetry Words: %d" % len(poetrys))
    numWordsOutput.append("Total Number of Words: %d" % len(totals))

    utils.safeWrite("%s/wordUse/totalWords.txt" % baseFolder,
                    "\n".join(numWordsOutput))

    # Create Table of words
    topRanks = {}
    poetryRanks = {}
    for i, line in enumerate(tops):
        w = line.split(":")[0]
        topRanks[w] = i + 1
    for i, line in enumerate(poetrys):
        w = line.split(":")[0]
        poetryRanks[w] = i + 1

    rankInfo = []
    for line in totals:
        w = line.split(":")[0]
        topRank = ""
        if w in topRanks:
            topRank = "%d" % topRanks[w]
        poetryRank = ""
        if w in poetryRanks:
            poetryRank = "%d" % poetryRanks[w]
        rankInfo.append((w, topRank, poetryRank))

    rankTableOutput = []
    rankTableOutput.append("""
\\begin{table}[!hbt]
  \\centering
  \\def\\arraystretch{1}
  \\begin{tabular}{| l | l | l ||| l | l | l ||| l | l | l ||| l | l | l |} \\hline
    \\textbf{Token} & \\textbf{A} & \\textbf{P} &
    \\textbf{Token} & \\textbf{A} & \\textbf{P} &
    \\textbf{Token} & \\textbf{A} & \\textbf{P} &
    \\textbf{Token} & \\textbf{A} & \\textbf{P}\\\\\\hline
""")
    columnHeight = 43
    for i in range(columnHeight):
        cells = []
        for j in range(4):
            index = i + j * columnHeight
            cell = ""
            if (index < len(rankInfo)):
                cell = "%s & %s & %s" % rankInfo[index]
            cells.append(cell)
        rankTableOutput.append("%s \\\\\\hline" % (" & ".join(cells)))
    rankTableOutput.append("""
  \\end{tabular}
  \\caption{List of tokens used, along with their rank in the top 150 tokens found in all texts (\\textbf{A}) and rank in the top 100 tokens found in poetry texts (\\textbf{P}).}
  \\label{table:top_words}
\\end{table}
""")

    utils.safeWrite("%swordUse/topWordsTable.tex" % baseFolder,
                    "\n".join(rankTableOutput))
def loadTexts(splitParameter, subsetSize, textLocation, language, saveDir,
              useTextCounts):
    useSplitParam = splitParameter != -1

    if useTextCounts:
        textLocation = convertToTextCounts(textLocation)

    available = utils.getContent(textLocation + "available.json", True)

    authors = []
    allWorks = []
    books = []
    workTokenLengths = []
    bookTokenLengths = []

    print(len(available), end=" - ", flush=True)
    # For each available text
    for i, o in enumerate(available):
        if (i % 20 == 0):
            print(i, end=" ", flush=True)
        authorName = o["author"]

        # split into two authors if necessary
        if useSplitParam:
            a1 = utils.Author(authorName)
            a2 = utils.Author(authorName + "_2")
        else:
            a = utils.Author(authorName)

        workLocs = o["works"]
        works = []
        authorTokens1 = []
        authorTokens2 = []

        # Process each work
        for w in workLocs:
            allWorks.append(w)
            # if authorName == "Arrian" and w["name"] != "Anabasis":
            #     continue
            location = w["location"]
            if useTextCounts:
                location = convertToTextCounts(location)
            t = utils.Text(location)
            if useSplitParam:
                a1.addWork(t)
                a2.addWork(t)
            else:
                a.addWork(t)

            workTokenLength = 0
            # For each book, process all of its tokens, count them,
            # add them to this author.
            for b in t.books:
                tokens = []
                if not (useTextCounts):
                    rawTokens = re.sub(r'\.,;:᾽῾\'', "",
                                       b.bookText).split(" ")
                    for token in rawTokens:
                        if language == "Greek":
                            token = preprocessTokenGreek(token)
                            token = utils.transformElided(token)
                        if language == "Icelandic":
                            token = preprocessTokenIcelandic(token)
                        if (token == ""):
                            continue
                        tokens.append(token)
                else:
                    tokenCounts = b.bookTokenCounts
                    for token in tokenCounts:
                        cleanToken = token
                        if language == "Greek":
                            cleanToken = preprocessTokenGreek(cleanToken)
                            cleanToken = utils.transformElided(cleanToken)
                        if language == "Icelandic":
                            cleanToken = preprocessTokenIcelandic(cleanToken)
                        if (cleanToken == ""):
                            continue
                        # Add token once per each count. Bit of a hack and the
                        # text will end up out of order, but since the paper
                        # doesn't consider word order this should be fine.
                        for i in range(tokenCounts[token]):
                            tokens.append(cleanToken)

                b.tokens = tokens
                books.append(b)
                bookTokenLength = len(tokens)
                bookTokenLengths.append(bookTokenLength)
                workTokenLength += bookTokenLength

                if useSplitParam:
                    # add in the tokens from this book as well
                    if (splitParameter == -2):
                        authorTokens1.extend(tokens)
                        authorTokens2.extend(tokens)
                    else:
                        modul = splitParameter * 2
                        t1 = [
                            tokens[i] for i in range(len(tokens))
                            if ((i % modul) < splitParameter)
                        ]
                        t2 = [
                            tokens[i] for i in range(len(tokens))
                            if ((i % modul) >= splitParameter)
                        ]
                        authorTokens1.extend(t1)
                        authorTokens2.extend(t2)

                    a1.bookSplits[len(authorTokens1)] = True
                    a2.bookSplits[len(authorTokens2)] = True
                else:
                    # add in the tokens from this book as well
                    authorTokens1.extend(tokens)
                    a.bookSplits[len(authorTokens1)] = True

            workTokenLengths.append(workTokenLength)

        if useSplitParam:
            if splitParameter == -2:
                half = int(len(authorTokens1) / 2)
                a1.allTokens = authorTokens1[:half]
                a2.allTokens = authorTokens2[half:]
            else:
                a1.allTokens = selectSubset(authorTokens1, subsetSize)
                a2.allTokens = selectSubset(authorTokens2, subsetSize)

            authors.append(a1)
            authors.append(a2)
        else:
            a.allTokens = selectSubset(authorTokens1, subsetSize)
            authors.append(a)

    numProseA = 0
    numPoetryA = 0
    for a in authors:
        if (toGenre(a.authorName) == 0):
            numProseA += 1
        else:
            numPoetryA += 1

    numProseB = 0
    numPoetryB = 0
    for b in books:
        if (toGenre(b.author) == 0):
            numProseB += 1
        else:
            numPoetryB += 1

    print("")

    countInfo = []
    countInfo.append("Number of authors: %d" % len(authors))
    countInfo.append("  prose: %d" % numProseA)
    countInfo.append("  poetry: %d" % numPoetryA)
    countInfo.append("Number of works: %d" % len(allWorks))
    countInfo.append("Number of segments: %d" % len(books))
    countInfo.append("  prose: %d" % numProseB)
    countInfo.append("  poetry: %d" % numPoetryB)
    countInfo.append("-----")
    countInfo.append("          5%, 25%, 50%, 75%, 95%")
    countInfo.append(
        "works: %d, %d, %d, %d, %d" %
        tuple(np.percentile(workTokenLengths, [5, 25, 50, 75, 95]).tolist()))
    countInfo.append(
        "segments: %d, %d, %d, %d, %d" %
        tuple(np.percentile(bookTokenLengths, [5, 25, 50, 75, 95]).tolist()))

    countInfoStr = "\n".join(countInfo)
    print(countInfoStr)
    if (saveDir != ""):
        utils.safeWrite(saveDir + "numberOfAuthors_Books.txt", countInfoStr)

    # If true, print all of the loaded texts.
    printLoaded = False
    if printLoaded:
        tab = "  "
        print("Authors:")
        s = []
        for author in authors:
            s.append(tab + str(author))
        print("\n".join(s))
        print("----")
        print("Books:")
        s = []
        for book in books:
            s.append(tab + str(book))
        print("\n".join(s))
        print("----")

    return authors, books
def visualizeItemData(data, target, names, authornames, typeName, saveDir):
    targetList = []
    targetList.append({"name": "", "target": np.array(target), "labels": []})
    # go through all grouping options
    for fun in genre.labelList:
        targetList.append(fun(authornames))

    # visualize author data
    dataSet = graphUtils.Dataset(np.array(data), targetList)
    # Dummy test data
    testSet = graphUtils.Dataset([], [])
    testNames = []

    saveOutput = True
    perplexity = 20.0
    baseSaveDir = saveDir

    # I have some state carrying over that I can't figure out, so only one of
    # these can run at a time, but umap results look roughly the same as tSNE.
    for u in [False]:
        algorithmName = "tSNE"
        if u:
            algorithmName = "umap"
        print("Visualizing using %s" % algorithmName)

        tSNEDir = baseSaveDir + algorithmName + "/"
        graphUtils.tSNE_2D(dataSet, names, perplexity, saveOutput, tSNEDir,
                           True, useUMAP=u)

        if (typeName == "Authors"):
            # graphUtils.tSNE_2D_2color(dataSet, names, perplexity, saveOutput, tSNEDir, True)

            # load tsne data
            saveDir = tSNEDir
            precalcFilename = saveDir + ("%s_2D_data.txt" % algorithmName)
            precalculated = utils.getContent(precalcFilename, True)
            tsneX = np.array(precalculated["x"], dtype=np.float64)

            # skip first and last target
            dataSet = graphUtils.Dataset(tsneX, targetList[1:-1])
            graphUtils.tSNE_2D_4Up(dataSet, names, False, saveOutput, saveDir,
                                   "info_no_labels_4Up", False, useUMAP=u)

        # Create fourup containing books
        if (typeName == "Books"):
            saveDir = tSNEDir
            precalcFilename = saveDir + ("%s_2D_data.txt" % algorithmName)
            precalculated = utils.getContent(precalcFilename, True)
            tsneX = np.array(precalculated["x"], dtype=np.float64)
            preY = np.array(precalculated["y"], dtype=np.float64)
            names = precalculated["names"]
            # print("Precalculateds loaded")

            targets = []
            targets.append({
                "name": "_",
                "target": preY != 30,
                "labels": ["Demosthenes", "Others"],
                "outlierName": "Demos.speec.59",
                "prettyName": "Speech 59"
            })  # Demosthenes .15
            targets.append({
                "name": "_",
                "target": preY != 55,
                "labels": ["Isocrates", "Others"],
                "outlierName": "Isocr.speec.21",
                "prettyName": "Speech 21"
            })  # Isocrates .10
            targets.append({
                "name": "_",
                "target": preY != 91,
                "labels": ["Xenophon", "Others"],
                "outlierName": "Xenop.hunti.1",
                "prettyName": "Cynegeticus"
            })  # Xenophon .helle.7
            targets.append({
                "name": "_",
                "target": preY != 76,
                "labels": ["Plato", "Others"],
                "outlierName": "Plato.menex.1",
                "prettyName": "Menexenus"
            })  # Plato .laws.8
            # targets.append(genre.labelList[]())

            dataSet = graphUtils.Dataset(tsneX, targets)
            # graphUtils.clickable_tSNE_2D(dataSet, names, -1, saveDir, False)
            graphUtils.tSNE_2D_4Up(dataSet, names, True, saveOutput, saveDir,
                                   "outliers4Up", False, useUMAP=u)
def keyAuthorComparisonWithImportance(authors, books, baseSaveDir, splitParam,
                                      topWords):
    makeWordImportanceGraphs = False

    keyAuthData = getKeyAuthorData(authors, books)
    saveDir = baseSaveDir + "wordImportance/"

    allDiffLineData = {}
    allCumulDiffLineData = {}
    allRCumulDiffLineData = {}
    allPercentageLineData = {}

    # load diffs for plotting internal similarities
    allDiffsFilename = baseSaveDir + "dists/diffLists.json"
    allDiffs = utils.getContent(allDiffsFilename, True)

    # For each set of key authors, make necessary visualizations
    for dat in keyAuthData:
        data, _, dataLabels, chartFileName = dat
        print("  %s..." % chartFileName)
        numWords = len(topWords)
        numTexts = len(dataLabels)
        tickLabels = topWords

        distsFilename = baseSaveDir + "dists/" + chartFileName + ".json"
        dists = utils.getContent(distsFilename, True)
        # dists = [
        #     {"name": "D1", "vals": (np.random.random((numWords))*1.5 - 0.5)},
        #     {"name": "D2", "vals": (np.random.random((numWords))*1.5 - 0.5)}
        # ]
        for d in dists:
            d["vals"] = np.array(d["vals"])

        if (makeWordImportanceGraphs):
            graphUtils.wordImportanceComparison(data, dataLabels, tickLabels,
                                                dists, saveDir + "unsorted/",
                                                chartFileName, True)

        # display versions sorted by each metric
        for d in dists:
            sortedSaveDir = saveDir + d["name"] + "-sorted/"
            fname = chartFileName

            sortedInds = np.array(
                list(
                    map(
                        lambda x: x[0],
                        sorted(enumerate(d["vals"]),
                               key=lambda x: x[1][0],
                               reverse=True))))

            data1 = copy.deepcopy(data)
            tickLabels1 = copy.deepcopy(tickLabels)
            wordsUsed = len(topWords)

            # If the similarity metric includes remainder, we have to add it
            if (len(dists[0]["vals"]) == len(data[0]) + 1):
                newData = []
                for row in data1:
                    r = np.append(row, 1 - np.sum(row))
                    newData.append(r)
                data1 = newData
                tickLabels1.append("Remainder")
                wordsUsed += 1

            data2 = list(map(lambda x: np.array(x)[sortedInds], data1))
            tickLabels2 = np.array(tickLabels1)[sortedInds]
            dists2 = copy.deepcopy(dists)
            percentiles = []
            for d2 in dists2:
                d2["vals"] = np.copy(d2["vals"])[sortedInds]

            if (makeWordImportanceGraphs):
                graphUtils.wordImportanceComparison(data2, dataLabels,
                                                    tickLabels2, dists2,
                                                    sortedSaveDir, fname, True)

            # save all words
            if d["name"] == "Jensen-shannon":
                fname = saveDir + "keyWords/" + chartFileName + ".json"
                SimDiff = {}
                for i, val in enumerate(d["vals"][sortedInds]):
                    if (True):
                        SimDiff[tickLabels2[i]] = [i, val[1]]
                utils.safeWrite(fname, SimDiff, True)

            # Diff data
            trueDiffs = np.array(
                list(map(lambda x: x[0], d["vals"][sortedInds])))

            y = (chartFileName, trueDiffs)
            y_cumul = (chartFileName, np.cumsum(trueDiffs))

            linesToGraphDiff = [y]
            linesToGraphDiffCumul = [y_cumul]

            # store info for the chart with all authors
            if d["name"] in allDiffLineData:
                allDiffLineData[d["name"]].extend([y])
            else:
                allDiffLineData[d["name"]] = [y]

            if d["name"] in allCumulDiffLineData:
                allCumulDiffLineData[d["name"]].extend([y_cumul])
            else:
                allCumulDiffLineData[d["name"]] = [y_cumul]

            # diff percentile data
            percentiles = list(map(lambda x: x[1], d["vals"][sortedInds]))
            y = (chartFileName, percentiles)
            linesToGraphPct = [y]

            # store info for the chart with all authors
            if d["name"] in allPercentageLineData:
                allPercentageLineData[d["name"]].append(y)
            else:
                allPercentageLineData[d["name"]] = [y]

            if splitParam == -1:
                # get percentiles for internal consistency of second author
                author1 = dataLabels[0]
                author2 = dataLabels[1]
                authorInternalConsistencies = [
                    # ["split5", author1, "-split5"],
                    # ["split-2", author1, "-splitHalf"],
                    # ["split5", author2, "-split5"],
                    # ["split-2", author2, "-splitHalf"]
                ]

                # Generate information comparing consistencies within given authors.
                for aic in authorInternalConsistencies:
                    a2DiffsFilename = baseSaveDir.replace(
                        "no_split", aic[0]) + "dists/%s_%s_2.json" % (aic[1],
                                                                      aic[1])
                    if (utils.fileExists(a2DiffsFilename)):
                        a2Diffs = utils.getContent(a2DiffsFilename, True)
                        diffNums = None
                        for ad in allDiffs:
                            if ad["name"] == d["name"]:
                                diffNums = ad["allDiffs"]
                        a2RawDiffs = None
                        for ad in a2Diffs:
                            if ad["name"] == d["name"]:
                                a2RawDiffs = ad["vals"]

                        if (diffNums != None and a2RawDiffs != None):
                            # Add difference data
                            aicName = aic[1] + aic[2]
                            a2SortedInds = np.array(
                                list(
                                    map(
                                        lambda x: int(x[0]),
                                        sorted(enumerate(a2RawDiffs),
                                               key=lambda x: x[1][0],
                                               reverse=True))))
                            trueDiffs = np.array(
                                list(
                                    map(lambda x: x[0],
                                        np.array(a2RawDiffs)[a2SortedInds])))

                            y_diff = (aicName, trueDiffs)
                            y_diff_cumul = (aicName, np.cumsum(trueDiffs))

                            linesToGraphDiff.append(y_diff)
                            linesToGraphDiffCumul.append(y_diff_cumul)

                            # Add Percentile data
                            a2Percentiles = []
                            for rd in a2RawDiffs:
                                index = bisect.bisect_left(diffNums, rd[0])
                                a2Percentiles.append(
                                    (100.0 * index) / len(diffNums))

                            a2Percentiles = sorted(a2Percentiles, reverse=True)
                            y2 = (aicName, a2Percentiles)
                            linesToGraphPct.append(y2)
                    else:
                        print("File does not exist: \"%s\"" % a2DiffsFilename)

            # Create charts showing differences for various authors
            graphUtils.lineChart(range(wordsUsed),
                                 linesToGraphDiff,
                                 True,
                                 sortedSaveDir,
                                 chartFileName + "_diff-chart",
                                 yLim=None)  # [-0.002, 0]
            graphUtils.lineChart(range(wordsUsed),
                                 linesToGraphDiffCumul,
                                 True,
                                 sortedSaveDir,
                                 chartFileName + "_diff-cumul-chart",
                                 yLim=None,
                                 yAdjust=1)  # [-0.002, 0]
            # graphUtils.lineChart(range(wordsUsed), linesToGraphPct, True, sortedSaveDir, chartFileName+"_pct-chart")

            linesToGraphDiffRCumul = []
            for name, c in linesToGraphDiffCumul:
                name = name.replace("-split5", " Local Split")
                name = name.replace("-splitHalf", " Global Split")
                linesToGraphDiffRCumul.append((name, c[-1] - np.array(c)))

            if d["name"] in allRCumulDiffLineData:
                allRCumulDiffLineData[d["name"]].extend(
                    [linesToGraphDiffRCumul])
            else:
                allRCumulDiffLineData[d["name"]] = [linesToGraphDiffRCumul]

            graphUtils.lineChart(range(wordsUsed),
                                 linesToGraphDiffRCumul,
                                 True,
                                 sortedSaveDir,
                                 chartFileName + "_diff-r-cumul-chart",
                                 yLim=None,
                                 yAdjust=1)  # [-0.002, 0]

    for d in dists:
        # 4-Up Chart for these authors
        sortedSaveDir = saveDir + d["name"] + "-sorted/"
        graphUtils.lineChart4Up(range(wordsUsed),
                                allRCumulDiffLineData[d["name"]],
                                True,
                                sortedSaveDir,
                                "4up-r-cumul",
                                yLim=None,
                                yAdjust=1)

    # Create graph charts for all data in a cloud
    graphTypes = [
        ("all-diffs", allDiffLineData, None, 0),
        ("all-diffs-cumul", allCumulDiffLineData, None, 1),
        # ("all-pcts", allPercentageLineData, [0, 100], 0)
    ]
    alls = {}
    for graphType, lineList, yLim, adjust in graphTypes:
        medFilename = baseSaveDir + "dists/median-%s.json" % graphType
        med = utils.getContent(medFilename, True)
        alls[graphType] = {}
        for d in med:
            lineList[d["name"]].append(["Median", d["line"]])
            alls[graphType][d["name"]] = d["all"]

        for name in allPercentageLineData:
            sortedSaveDir = baseSaveDir + "wordImportance/" + name + "-sorted/"
            for log in [False]:  # , True]:
                print("  %s..." % graphType)
                graphUtils.lineChart(range(wordsUsed),
                                     lineList[name],
                                     True,
                                     sortedSaveDir,
                                     graphType,
                                     yLim=yLim,
                                     log=log,
                                     yAdjust=adjust)
                print("  %s cloud..." % graphType)
                graphUtils.lineChart(range(wordsUsed),
                                     lineList[name],
                                     True,
                                     sortedSaveDir,
                                     graphType + "-cloud",
                                     yLim=yLim,
                                     allLines=alls[graphType][name],
                                     log=log,
                                     yAdjust=adjust)

    # Create chart showing ignored top words
    n = "Jensen-shannon"
    sortedSaveDir = baseSaveDir + "wordImportance/" + n + "-sorted/"

    # Cumulative
    data = allCumulDiffLineData[n]

    # Add lines
    res = []
    targetSim = -1
    for item in alls["all-diffs-cumul"][n]:
        name, c = item
        # "Aristotle_Pindar" in name or
        # "AeliusAristides_Demosthenes", "DioChrysostom_Plato"
        if ("ApolloniusRhodius_QuintusSmyrnaeus" in name
                or "DioChrysostom_Xenophon" == name):
            res.append((name, "-", 1 + c[-1] - np.array(c)))
        # Lowest of our top authors
        if ("DioChrysostom_Xenophon" == name):
            targetSim = c[-1]

    # add median
    # for item in allCumulDiffLineData[n]:
    #     name, c = item
    #     if ("Median" in name):
    #         res.append((name, "-", 1 + c[-1] - np.array(c)))

    # Add line cloud
    resAll = []
    for item in alls["all-diffs-cumul"][n]:
        name, c = item
        if not ("Hymns_Dionysus" in name or "Euclid" in name):
            n1, n2 = name.replace("Hymns_", "Hymns").split("_")
            n1 = n1.replace("Hymns", "Hymns_")
            n2 = n2.replace("Hymns", "Hymns_")
            centuryDiff = centDiff(genre.toCent(n1), genre.toCent(n2))
            # print("%s, %s: %d" % (n1, n2, centuryDiff))
            if (centuryDiff >= 4):
                # color top sims differently
                color = "k-"
                resAll.append((name, color, 1 + c[-1] - np.array(c)))

    # for name, c in data:
    #     y = c[-1] - np.array(c)
    #     res.append((name, y))
    # resAll = map(lambda n, c: (n, c[-1] - np.array(c)))

    graphUtils.compareWordUsageChart(res,
                                     True,
                                     sortedSaveDir,
                                     "ignoreBestWords",
                                     yLim=None,
                                     allLines=resAll)
def getCenturyInfo(topStr, baseFolder):
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_overall_no_labels.pdf %scentury/centuriesGreek.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_overall_labels.pdf %scentury/extraInfo/Greek_CenturyOverall_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/simRange.txt %scentury/extraInfo/Greek_SimRange.txt"
        % (topStr, baseFolder),
        shell=True)

    # -------------------------
    # Century similarity data
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels.pdf %scentury/extraInfo/Greek_Century_No_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_labels.pdf %scentury/extraInfo/Greek_Century_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_no_labels.pdf %scentury/extraInfo/Greek+p_Century_No_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_labels.pdf %scentury/extraInfo/Greek+p_Century_Label.pdf"
        % (topStr, baseFolder),
        shell=True)

    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_no_labels.pdf %scentury/extraInfo/Greek_Century_Cutoff_No_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_labels.pdf %scentury/extraInfo/Greek_Century_Cutoff_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_under_9_no_labels.pdf %scentury/extraInfo/Greek+p_Century_Cutoff_No_Label.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_under_9_labels.pdf %scentury/extraInfo/Greek+p_Century_Cutoff_Label.pdf"
        % (topStr, baseFolder),
        shell=True)

    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_no_labels.pdf %scentury/centuriesGreek2.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_no_labels_violin.pdf %scentury/centuriesGreekViolin.pdf"
        % (topStr, baseFolder),
        shell=True)

    subprocess.run(
        "cp output/english/no_split/%s/jensen-shannon/metric/Authors/simRange.txt %scentury/extraInfo/English_SimRange.txt"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/english/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels.pdf %scentury/centuriesEnglish.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/english/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels_violin.pdf %scentury/centuriesEnglishViolin.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/english/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_labels.pdf %scentury/extraInfo/English_Century_Label.pdf"
        % (topStr, baseFolder),
        shell=True)

    subprocess.run(
        "cp output/icelandic/no_split/%s/jensen-shannon/metric/Authors/simRange.txt %scentury/extraInfo/Icelandic_SimRange.txt"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/icelandic/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels.pdf %scentury/centuriesIcelandic.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/icelandic/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_no_labels_violin.pdf %scentury/centuriesIcelandicViolin.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/icelandic/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_labels.pdf %scentury/extraInfo/Icelandic_Century_Label.pdf"
        % (topStr, baseFolder),
        shell=True)

    # Get pvalue + other regression information for charts
    greekPval = utils.getContent(
        "output/greek/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_under_9_pslope.txt"
        % (topStr), False)
    englishPval = utils.getContent(
        "output/english/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_pslope.txt"
        % (topStr), False)
    icelandicPval = utils.getContent(
        "output/icelandic/no_split/%s/jensen-shannon/metric/Authors/century_sims_genre_pslope.txt"
        % (topStr), False)

    pvalOutput = []
    pvalOutput.append("Greek:")
    pvalOutput.append(greekPval)
    pvalOutput.append("English:")
    pvalOutput.append(englishPval)
    pvalOutput.append("Icelandic:")
    pvalOutput.append(icelandicPval)

    utils.safeWrite("%scentury/century_pvals.txt" % baseFolder,
                    "\n".join(pvalOutput))
# Integrity
# See with customer what he wants... for the moment just print the different results:
if integrity.checkHash(fname):
    if integrity.checkIntegrity(fname):
        print "Integrity OK for : " + fname
    else:
        print "Integrity PROBLEM, the file " + fname + " has been modified"
else:
    print "No integrity check for : " + fname


#####
# TEST
if __name__ == "__main__":
    fname = "toto"
    putFile(fname)
    print 'ENCYPHERED : ' + getContent(fname)
    getFile(fname)
    print 'DECYPHERED : ' + getContent(fname)
####################################
def gatherFilesFull(topStr, topNum, comparableTopStr, comparableNum,
                    poetryNum):
    baseFolder = "output/full/"
    folders = [
        "",
        "data",
        "genre",
        "metric",
        "metric/extraInfo",
        "century",
        "century/extraInfo",
        "wordUse",
        "wordUse/extraInfo",
        "wordUse/grouping",
    ]
    createFolders(folders, baseFolder)

    # Get info for the data section
    getDataInfo(topStr, baseFolder)

    # Get info for approach section
    getWordUseInfo(topStr, baseFolder)

    # Get genre info
    getGenreInfo(topStr, baseFolder)

    # Gather 4up tsne charts for standard data and data normalized by genre
    # Grab this from the best metric
    subprocess.run(
        "cp output/greek/no_split/%s/Authors/tSNE/info_no_labels_4Up.pdf %sgenre/groupings.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/Books/tSNE/outliers4up.pdf %sgenre/bookOutliers.pdf"
        % (topStr, baseFolder),
        shell=True)

    # Get book tsne charts
    # Grab this from the best metric
    subprocess.run(
        "cp output/greek/no_split/%s/Books/tSNE/tSNE_2D_no_labels.pdf %sgenre/books_tSNE_no_labels.pdf"
        % (topStr, baseFolder),
        shell=True)
    subprocess.run(
        "cp output/greek/no_split/%s/Books/tSNE/tSNE_2D_labels.pdf %sgenre/books_tSNE_labels.pdf"
        % (topStr, baseFolder),
        shell=True)
    # To get a look at these, run python3 visualizeBooks

    # Get info for standard and normalized by poetry
    makeMLTable("output/greek/no_split/%s/dataPreds/" % (topStr), False,
                "%sgenre/ml_table.tex" % baseFolder)
    # makeMLTable("output/greek/no_split/%s+p/dataPreds/" % (topStr), False, "%sgenre/ml_table+p.tex" % baseFolder)

    # =========================
    # Get info for results section

    # -----------
    # Metric
    getMetricInfo(topStr, comparableTopStr, topNum, poetryNum, comparableNum,
                  SIM_METRICS, baseFolder)

    makeMetricInternalTables("", topStr, SIM_METRICS, baseFolder)
    makeMetricInternalTables("", topStr + "+p", SIM_METRICS, baseFolder)

    # -----------
    # Century

    # Get information on century comparison
    getCenturyInfo(topStr, baseFolder)

    # Get pvalue + other regression information for charts that are + p
    greekPval = utils.getContent(
        "output/greek/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_under_9_pslope.txt"
        % (topStr), False)
    englishPval = utils.getContent(
        "output/english/no_split/%s+p/jensen-shannon/metric/Authors/century_sims_genre_pslope.txt"
        % (topStr), False)

    pvalOutput = []
    pvalOutput.append("Greek:")
    pvalOutput.append(greekPval)
    pvalOutput.append("English:")
    pvalOutput.append(englishPval)

    utils.safeWrite("%scentury/century_pvals+p.txt" % baseFolder,
                    "\n".join(pvalOutput))

    # -------------------------

    # Grab this from the best metric
    subprocess.run(
        "cp output/greek/no_split/%s/jensen-shannon/metric/Authors/sims.txt %swordUse/authorSims.txt"
        % (topStr, baseFolder),
        shell=True)

    fourCenturiesTables(topStr, SIM_METRICS, baseFolder)

    # get word usage charts and info
    getWordUsageInfo(topStr, baseFolder)
def cleanAndCombineFeatures(texts, approach):
    matrix = []
    textNames = []
    featureNames = []

    numTexts = len(texts)
    # for all the texts
    for i in range(numTexts):
        text = texts[i]
        textName = text["textName"]
        divideByBook = text["divideByBook"]
        toBeCombined = text["toBeCombined"]

        if (toBeCombined or textName == "Iliad1" or textName == "Odyssey1"):
            continue

        ofn = generalUtils.getTextFeatureDataOdikonFn(textName, approach)
        tfn = generalUtils.getTextFeatureDataTamnonFn(textName)
        odikonFeaturesRaw = generalUtils.getContent(ofn, True)
        tamnonFeaturesRaw = generalUtils.getContent(tfn, True)

        if (len(odikonFeaturesRaw) != len(tamnonFeaturesRaw)):
            raise Exception("Number of subtexts for " + textName +
                            " do not match")

        # for each set of features (the books plus the overall text)
        for j in range(len(odikonFeaturesRaw)):
            # get the raw features for this subtext
            ro = odikonFeaturesRaw[j]
            rt = tamnonFeaturesRaw[j]

            # determine the names for these two texts and make sure they match
            roString = ro["TextName"] + ": " + ro["SubName"]
            rtString = rt["TextName"] + ": " + rt["SubName"]
            if (roString != rtString):
                raise Exception("Book mismatch! " + roString + " and " +
                                rtString)

            # add the cleaned features to the row
            row = []
            row.extend(cleanRawOdikon(ro, False))
            row.extend(cleanRawTamnon(rt, False))

            matrix.append(row)
            textNames.append(roString)

            # and one time, get the list of feature names.
            if (i == 0 and j == 0):
                featureNames.extend(cleanRawOdikon(ro, True))
                featureNames.extend(cleanRawTamnon(rt, True))

    # output the information.
    print "Number of Features: %d." % len(matrix[0])

    output = {
        "rowNames": textNames,
        "matrix": matrix,
        "featureNames": featureNames
    }
    fName = generalUtils.getFeatureMatrixFn()
    generalUtils.safeWrite(fName, json.dumps(output))
def makeMetricInternalTables(suffix, topStr, simMetrics, baseFolder):
    metricInternalTables = []
    for simMetric in simMetrics:
        dir, metricName = simMetric
        # skip Jensen-Shannon
        if metricName == "Jensen-Shannon":
            continue

        tableOutput = []
        temp = """
\\begin{table}[!bt]
  \\centering
  \\def\\arraystretch{1}
  \\begin{tabular}{| l | c | c | c |} \\hline
"""
        tableOutput.append(temp)
        temp = "\\textbf{Metric Options} & \\textbf{Author} & \\textbf{Work} & \\textbf{Total} \\\\\\hline"
        tableOutput.append(temp)

        workSigReport = []
        authorSigReport = []
        totalSigReport = []

        # & \\textbf{Sim to another work} & \\textbf{Closest to diff author} & \\textbf{Median}

        metricOptions = [("Baseline", "-remainder-smoothed"),
                         ("+1 Smoothing", "-remainder+smoothed"),
                         ("Remainder", "+remainder-smoothed"),
                         ("Both", "+remainder+smoothed")]

        # Get the list of authors and works the metric got correct
        scoreLists = {}
        for _, opt in metricOptions:
            scoreLists[opt] = {}
            name = opt

            # Use Poetry Words
            metricTopStr = topStr

            fname = "output/greek/no_split/%s/%s/metric%s/Books/scores.json" % (
                metricTopStr, dir, opt)
            scores = utils.getContent(fname, True)
            scoreLists[opt] = scores
            scoreLists[opt]["name"] = name

        baseScore = scoreLists["-remainder-smoothed"]
        # baseScores = []
        # for bsi in baseScoreInfo:
        #     baseScoreMetric, baseScoreIndex = bsi
        #     baseScores.append(scoreLists[baseScoreMetric][baseScoreIndex])

        # Create a table of the information using the provided scores
        for optName, opt in metricOptions:
            cell = "\\textbf{%s}" % (optName)

            currentScores = scoreLists[opt]
            authorScores = currentScores["author"]
            workScores = currentScores["work"]
            name = currentScores["name"]

            sameWork = "%.2f%%, (%d/%d)" % (100 * np.mean(workScores),
                                            np.sum(workScores),
                                            len(workScores))
            sameAuth = "%.2f%%, (%d/%d)" % (100 * np.mean(authorScores),
                                            np.sum(authorScores),
                                            len(authorScores))
            all = np.concatenate((workScores, authorScores))
            total = "%.2f%%, (%d/%d)" % (100 * np.mean(all), np.sum(all),
                                         len(all))

            wrk = " & %s" % (sameWork)
            auth = " & %s" % (sameAuth)
            tot = " & %s" % (total)

            # Calculate significance
            a = baseScore["work"]
            b = currentScores["work"]
            work_t, work_p = stats.ttest_rel(a, b)
            workSigReport.append(name)
            # Degrees of freedom
            df = len(b) - 1
            workSig = "  (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                np.mean(b), np.std(b), df, work_t, work_p)
            workSigReport.append(workSig)

            a = baseScore["author"]
            b = currentScores["author"]
            author_t, author_p = stats.ttest_rel(a, b)
            authorSigReport.append(name)
            # Degrees of freedom
            df = len(b) - 1
            authorSig = "  (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                np.mean(b), np.std(b), df, author_t, author_p)
            authorSigReport.append(authorSig)

            a = np.concatenate((baseScore["work"], baseScore["author"]))
            b = np.concatenate(
                (currentScores["work"], currentScores["author"]))
            all_t, all_p = stats.ttest_rel(a, b)
            totalSigReport.append(name)
            # Degrees of freedom
            df = len(b) - 1
            totalSig = "  (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                np.mean(b), np.std(b), df, all_t, all_p)
            totalSigReport.append(totalSig)

            # if (name == bestMetricName or name == baseScore["name"]):
            #     bestMetricSigWork.append("%s vs %s" % (name, baseScore["name"]))
            #     bestMetricSigWork.append(workSig)
            #
            #     bestMetricSigAuthor.append("%s vs %s" % (name, baseScore["name"]))
            #     bestMetricSigAuthor.append(authorSig)

            # print("  Author: t-statistic = %6.3f pvalue = %f" % stats.ttest_rel(a, b))

            # Significance notes
            if (work_p < 0.01):
                wrk += "\\textbf{†}"
            elif (work_p < 0.05):
                wrk += "\\textbf{*}"

            if (author_p < 0.01):
                auth += "\\textbf{†}"
            elif (author_p < 0.05):
                auth += "\\textbf{*}"

            if (all_p < 0.01):
                tot += "\\textbf{†}"
            elif (all_p < 0.05):
                tot += "\\textbf{*}"

            # wrk += " %.4f" % work_p
            # auth += " %.4f" % author_p
            # tot += " %.4f" % all_p

            cell += "%s%s%s" % (wrk, auth, tot)
            cell = cell.replace("%", "\\%")
            tableOutput.append("%s\\\\\\hline" % cell)

        tableOutput.append("\\end{tabular}")
        tableOutput.append("\\caption{")
        tableOutput.append(
            "How well %s performs with the remainder words and smoothing included. "
            % metricName)
        tableOutput.append(
            "†: Results very significant (p < 0.01) when compared to baseline. "
        )
        tableOutput.append(
            "*: Results significant (p < 0.05) when compared to baseline. ")
        tableOutput.append("}")
        tableOutput.append("\\label{table:metric_options_eval_%s}" % dir)
        tableOutput.append("\\end{table}")
        tableOutput.append("")
        tableOutput.append("")

        metricInternalTables.append("\n".join(tableOutput))

        utils.safeWrite(
            "%smetric/%s_optionsEvalTable%s.tex" %
            (baseFolder, metricName, suffix), "\n".join(tableOutput))

    # sigReport = "Work:\n" + ("\n".join(bestMetricSigWork)) + "\n\n-------------\n\nAuthor:\n" + ("\n".join(bestMetricSigAuthor))
    # utils.safeWrite("%smetric/bestMetricSignificance%s_2.txt" % (baseFolder, suffix), sigReport)

    # utils.safeWrite("%smetric/extraInfo/metricSignificanceReportWork%s_2.txt" % (baseFolder, suffix), "\n".join(workSigReport))
    # utils.safeWrite("%smetric/extraInfo/metricSignificanceReportAuthor%s_2.txt" % (baseFolder, suffix), "\n".join(authorSigReport))

    utils.safeWrite(
        "%smetric/extraInfo/optionsEvalTables%s.tex" % (baseFolder, suffix),
        "\n".join(metricInternalTables))
def find_match(message):
    if intime(message):
        cid = getCID(message)
        content = getContent(message)
        match_id = message.text
        match_id = match_id.split()[1]
        try:
            match = api.get_match_details(match_id)
            url = match.url
            request = requests.get(url)
            match_data = request.json()
            if content != "?":
                if request.status_code == 200:
                    hero_list = []
                    if match_data['result']['radiant_win']:
                        title = "Radiant!"
                    else:
                        title = "Dire!"
                    url = "http://www.dotabuff.com/matches/" + match_id
                    radiant_content = ""
                    dire_content = ""
                    for player in match_data['result']['players']:
                        if player['player_slot'] < 100:  # radiant
                            for hero in heroes_list:
                                if hero['id'] == player['hero_id']:
                                    hero_list.append(hero['localized_name'])
                                    radiant_content = (
                                        radiant_content +
                                        hero['localized_name'] + " " +
                                        str(player['kills']) + "/" +
                                        str(player['deaths']) + "/" +
                                        str(player['assists']) + '\n')
                        else:  # dire
                            for hero in heroes_list:
                                if hero['id'] == player['hero_id']:
                                    hero_list.append(hero['localized_name'])
                                    dire_content = (
                                        dire_content +
                                        hero['localized_name'] + " " +
                                        str(player['kills']) + "/" +
                                        str(player['deaths']) + "/" +
                                        str(player['assists']) + '\n')
                    bot.send_message(
                        cid,
                        'Winner: *{title}* \n _Radiant:_ \n{radiant}\n _Dire:_\n{dire}\n'
                        .format(title=title,
                                radiant=radiant_content,
                                dire=dire_content) +
                        '[Dotabuff link]({url})'.format(url=url),
                        parse_mode="Markdown",
                        disable_web_page_preview=True)
                else:
                    bot.reply_to(
                        message,
                        "`There has been an error, the number {error} to be specific.`"
                        .format(error=request.status_code),
                        parse_mode="Markdown")
            else:
                bot.reply_to(message, "`wat`", parse_mode="Markdown")
        except Exception as ex:
            bot.reply_to(
                message,
                "There has been an error, its message is:\n `{error}`".format(
                    error=ex),
                parse_mode="Markdown")
def makeMetricEvalTables(suffix, topStr, comparableTopStr, topNum, poetryNum,
                         comparableNum, simMetrics, baseFolder):
    baseScoreInfo = [
        ("Cosine", 0),
        ("Burrows' Delta", 0),
    ]
    bestMetricName = "Jensen-Shannon (250)"  #Jensen-Shannon+p
    bestMetricSigWork = []
    bestMetricSigAuthor = []

    evalTableOutput = []
    evalTableOutput.append("""\\begin{table}[!bt]
\\centering
\\def\\arraystretch{1}
\\begin{tabular}{| l | r | r |}
\\hline
 & \\multicolumn{2}{c|}{\\textbf{Percentage of segments most similar to a segment...}} \\\\
\\textbf{Metric}& \\textbf{from the same work} & \\textbf{by the same author} \\\\\\hline
""")

    sameWorkTableOutput = []
    sameAuthorTableOutput = []
    temp = """\\begin{table}[!bt]
\\centering
\\def\\arraystretch{1}
\\begin{tabular}{| l | c | c | c |}
\\hline
"""
    sameWorkTableOutput.append(temp)
    sameAuthorTableOutput.append(temp)

    temp = "& & \\textbf{Top %d +} & \\\\" % (topNum)
    sameWorkTableOutput.append(temp)
    sameAuthorTableOutput.append(temp)
    temp = "\\textbf{Metric}& \\textbf{Top %d} & \\textbf{Top %d in Poetry} & \\textbf{Top %d} \\\\\\hline" % (
        topNum, poetryNum, comparableNum)
    sameWorkTableOutput.append(temp)
    sameAuthorTableOutput.append(temp)

    workSigReport = []
    authorSigReport = []

    # & \\textbf{Sim to another work} & \\textbf{Closest to diff author} & \\textbf{Median}

    # Get the list of authors and works the metric got correct
    scoreLists = {}
    for simMetric in simMetrics:
        dir, metricName = simMetric
        scoreLists[metricName] = {}
        for i, params in enumerate([
            (False, False),
            (True, False),
            (False, True),
        ]):
            name = metricName
            addP, comparable = params
            metricTopStr = topStr
            if addP:
                metricTopStr += "+p"
                name += "+p"
            # look at comparable number of non-poetry words
            elif comparable:
                metricTopStr = comparableTopStr
                name += " (%d)" % comparableNum
            else:
                name += " (%d)" % topNum

            fname = "output/greek/no_split/%s/%s/metric/Books/scores.json" % (
                metricTopStr, dir)
            scores = utils.getContent(fname, True)

            scoreLists[metricName][i] = scores
            scoreLists[metricName][i]["name"] = name

    baseScores = []
    for bsi in baseScoreInfo:
        baseScoreMetric, baseScoreIndex = bsi
        baseScores.append(scoreLists[baseScoreMetric][baseScoreIndex])

    # Create a table of the information using the provided scores
    for metricName in scoreLists:
        cell2 = "\\textbf{%s}" % (metricName)
        cell3 = "\\textbf{%s}" % (metricName)
        for i in scoreLists[metricName]:
            currentScores = scoreLists[metricName][i]
            authorScores = currentScores["author"]
            workScores = currentScores["work"]
            name = currentScores["name"]

            sameWork = "%.2f%%" % (100 * np.mean(workScores))
            sameAuth = "%.2f%%" % (100 * np.mean(authorScores))
            # sameWork = "%.2f%%, (%d/%d)" % (100*np.mean(workScores), np.sum(workScores), len(workScores))
            # sameAuth = "%.2f%%, (%d/%d)" % (100*np.mean(authorScores), np.sum(authorScores), len(authorScores))

            # cell = "%s & %s & %s & %s & %s & %s" % (name, sameAuth, sameWork, otherWork, diffAuthClosest, median)
            cell = "%s & %s & %s" % (name, sameWork, sameAuth)
            cell = cell.replace("%", "\\%")
            evalTableOutput.append("%s\\\\\\hline" % cell)

            cell2 += " & %s" % (sameWork)  # work_p
            cell3 += " & %s" % (sameAuth)  # , author_p)

            for j, baseScore in enumerate(baseScores):
                a = baseScore["work"]
                b = currentScores["work"]
                work_t, work_p = stats.ttest_rel(a, b)
                workSigReport.append(name)
                # Degrees of freedom
                df = len(b) - 1
                workSig = " (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                    np.mean(b), np.std(b), df, work_t, work_p)
                workSigReport.append(workSig)

                a = baseScore["author"]
                b = currentScores["author"]
                author_t, author_p = stats.ttest_rel(a, b)
                authorSigReport.append(name)
                # Degrees of freedom
                df = len(b) - 1
                authorSig = " (M=%.3f, SD=%.3f) t(%d)=%.3f, p=%.3e" % (
                    np.mean(b), np.std(b), df, author_t, author_p)
                authorSigReport.append(authorSig)

                if (name == bestMetricName or name == baseScore["name"]):
                    bestMetricSigWork.append("%s vs %s" %
                                             (name, baseScore["name"]))
                    bestMetricSigWork.append(workSig)

                    bestMetricSigAuthor.append("%s vs %s" %
                                               (name, baseScore["name"]))
                    bestMetricSigAuthor.append(authorSig)

                #print("  Author: t-statistic = %6.3f pvalue = %f" % stats.ttest_rel(a, b))

                # Significance notes
                if (j == 0):
                    if (work_p < 0.01):
                        cell2 += "\\textbf{†}"
                    elif (work_p < 0.05):
                        cell2 += "\\textbf{*}"

                    if (author_p < 0.01):
                        cell3 += "\\textbf{†}"
                    elif (author_p < 0.05):
                        cell3 += "\\textbf{*}"
                else:
                    if (work_p < 0.01):
                        cell2 += "\\textbf{‡}"
                    if (author_p < 0.01):
                        cell3 += "\\textbf{‡}"

        cell2 = cell2.replace("%", "\\%")
        sameWorkTableOutput.append("%s\\\\\\hline" % cell2)
        cell3 = cell3.replace("%", "\\%")
        sameAuthorTableOutput.append("%s\\\\\\hline" % cell3)

    evalTableOutput.append("""
\\end{tabular}
\\caption{How well similarity metrics identify whether two segments come from the same work or the same author.}
\\label{table:metric_eval}
\\end{table}
""")
    utils.safeWrite(
        "%smetric/extraInfo/metricEvalTable%s.tex" % (baseFolder, suffix),
        "\n".join(evalTableOutput))

    sameWorkTableOutput.append("\\end{tabular}")
    sameWorkTableOutput.append(
        "\\caption[How well similarity metrics based on a given set of words identify whether two segments come from the same work.]{"
    )
    sameWorkTableOutput.append(
        "How well similarity metrics based on a given set of words identify whether two segments come from the same work. \\newline"
    )
    sameWorkTableOutput.append(
        "†: Results very significant (p < 0.01) when compared to %s. \\newline"
        % baseScores[0]["name"])
    sameWorkTableOutput.append(
        "*: Results significant (p < 0.05) when compared to %s. \\newline" %
        baseScores[0]["name"])
    sameWorkTableOutput.append(
        "‡: Results very significant (p < 0.01) when compared to %s. " %
        baseScores[1]["name"])
    sameWorkTableOutput.append("}")
    sameWorkTableOutput.append("\\label{table:metric_eval_work}")
    sameWorkTableOutput.append("\\end{table}")
    utils.safeWrite("%smetric/sameWorkEvalTable%s.tex" % (baseFolder, suffix),
                    "\n".join(sameWorkTableOutput))

    sameAuthorTableOutput.append("\\end{tabular}")
    sameAuthorTableOutput.append(
        "\\caption[How well similarity metrics based on a given set of words identify whether two segments come from the same author.]{"
    )
    sameAuthorTableOutput.append(
        "How well similarity metrics based on a given set of words identify whether two segments come from the same author. \\newline"
    )
    sameAuthorTableOutput.append(
        "†: Results very significant (p < 0.01) when compared to %s. \\newline"
        % baseScores[0]["name"])
    sameAuthorTableOutput.append(
        "*: Results significant (p < 0.05) when compared to %s. \\newline" %
        baseScores[0]["name"])
    sameAuthorTableOutput.append(
        "‡: Results very significant (p < 0.01) when compared to %s. " %
        baseScores[1]["name"])
    sameAuthorTableOutput.append("}")
    sameAuthorTableOutput.append("\\label{table:metric_eval_author}")
    sameAuthorTableOutput.append("\\end{table}")
    utils.safeWrite(
        "%smetric/sameAuthorEvalTable%s.tex" % (baseFolder, suffix),
        "\n".join(sameAuthorTableOutput))

    sigReport = "Work:\n" + (
        "\n".join(bestMetricSigWork)) + "\n\n-------------\n\nAuthor:\n" + (
            "\n".join(bestMetricSigAuthor))
    utils.safeWrite(
        "%smetric/bestMetricSignificance%s.txt" % (baseFolder, suffix),
        sigReport)
    # utils.safeWrite("%smetric/bestMetricSignificanceWork%s.txt" % (baseFolder, suffix), "\n".join(bestMetricSigWork))
    # utils.safeWrite("%smetric/bestMetricSignificanceAuthor%s.txt" % (baseFolder, suffix), "\n".join(bestMetricSigAuthor))

    utils.safeWrite(
        "%smetric/extraInfo/metricSignificanceReportWork%s.txt" %
        (baseFolder, suffix), "\n".join(workSigReport))
    utils.safeWrite(
        "%smetric/extraInfo/metricSignificanceReportAuthor%s.txt" %
        (baseFolder, suffix), "\n".join(authorSigReport))
RAW_FOLDER = "rawTexts/"
PARSED_FOLDER = "texts/"


# given a location, convert it from XML to the format we want
def convertBook(loc):
    filename = loc.replace(RAW_FOLDER, "")
    newLoc = PARSED_FOLDER + filename

    t = utils.XMLText(loc)
    res = t.convertFromXML()
    utils.safeWrite(newLoc, res, True)
    return newLoc, res["booksRaw"]


# get the available texts and count them up
available = utils.getContent(RAW_FOLDER + "available.json", True)
numTexts = 0
for o in available:
    workLocs = o["works"]
    for w in workLocs:
        numTexts += 1

# Parse each book
i = 1
allBooks = []
for o in available:
    workLocs = o["works"]
    for w in workLocs:
        if (i % 20 == 0):
            print("%d out of %d (%.2f%%)" %
                  (i, numTexts, 100 * i / numTexts))
def fourCenturiesTables(topStr, simMetrics, baseFolder):
    comparisonOutput = []

    topSimsToExamine = 100

    # Grab this from the best metric
    authorSims = utils.getContent(
        "output/greek/no_split/%s/jensen-shannon/metric/Authors/sims.txt" %
        (topStr), False).split("\n")

    topDistantSims = []
    topDistantAuthors = {}
    for i, sim in enumerate(authorSims):
        centuries_apart = int(sim.split("(")[-1].split(" ")[0])
        if (centuries_apart >= 4 and i < topSimsToExamine):
            topDistantSims.append(sim)
            topDistantAuthors[sim[11:]] = {}

        authors = " (".join(sim.split(" - ")[1].split(" (")[:-1])
        if authors == "Isocrates, Lysias" or authors == "Plato, Xenophon" or authors == "AratusSolensis, Callimachus" or authors == "Herodotus, Thucydides":
            comparisonOutput.append("Rank %d: %s" % (i + 1, sim))

    fourCenturiesApartOutput = []
    fourCenturiesApartOutput.append(
        "%d of the top %d are at least 4 centuries apart." %
        (len(topDistantSims), topSimsToExamine))
    fourCenturiesApartOutput.append("---")
    fourCenturiesApartOutput.extend(topDistantSims)
    utils.safeWrite("%swordUse/fourCenturiesApart.txt" % baseFolder,
                    "\n".join(fourCenturiesApartOutput))

    # Comparison to English and Icelandic
    numGreek = len(authorSims)
    fracGreek = topSimsToExamine / numGreek
    numDistantGreek = len(topDistantSims)

    englishSims = utils.getContent(
        "output/english/no_split/%s/jensen-shannon/metric/Authors/sims.txt" %
        (topStr), False).split("\n")
    numEnglish = len(englishSims)
    topSimsEnglish = int(np.ceil(numEnglish * fracGreek))
    fracEnglish = topSimsEnglish / numEnglish
    numDistantEnglish = 0
    num2English = 0
    for sim in englishSims[:topSimsEnglish]:
        centuries_apart = int(sim.split("(")[-1].split(" ")[0])
        if (centuries_apart >= 2):
            num2English += 1
        if (centuries_apart >= 4):
            numDistantEnglish += 1

    iceSims = utils.getContent(
        "output/icelandic/no_split/%s/jensen-shannon/metric/Authors/sims.txt" %
        (topStr), False).split("\n")
    numIcelandic = len(iceSims)
    topSimsIcelandic = int(np.ceil(numIcelandic * fracGreek))
    fracIcelandic = topSimsIcelandic / numIcelandic
    numDistantIcelandic = 0
    for sim in iceSims[:topSimsIcelandic]:
        centuries_apart = int(sim.split("(")[-1].split(" ")[0])
        if (centuries_apart >= 4):
            numDistantIcelandic += 1

    comparisonOutput.append("\n=========\n")
    comparisonOutput.append("Top similar pairs")
    comparisonOutput.append("Greek:")
    comparisonOutput.append("  examining top %d of %d pairs (%.2f%%)" %
                            (topSimsToExamine, numGreek, 100 * fracGreek))
    comparisonOutput.append(
        "  %d (%.2f%%) are at least 4 centuries apart" %
        (numDistantGreek, 100 * numDistantGreek / topSimsToExamine))
    comparisonOutput.append("English:")
    comparisonOutput.append("  examining top %d of %d pairs (%.2f%%)" %
                            (topSimsEnglish, numEnglish, 100 * fracEnglish))
    comparisonOutput.append(
        "  %d (%.2f%%) are at least 4 centuries apart" %
        (numDistantEnglish, 100 * numDistantEnglish / topSimsEnglish))
    comparisonOutput.append("  %d (%.2f%%) are at least 2 centuries apart" %
                            (num2English, 100 * num2English / topSimsEnglish))
    comparisonOutput.append("Icelandic:")
    comparisonOutput.append(
        "  examining top %d of %d pairs (%.2f%%)" %
        (topSimsIcelandic, numIcelandic, 100 * fracIcelandic))
    comparisonOutput.append(
        "  %d (%.2f%%) are at least 4 centuries apart" %
        (numDistantIcelandic, 100 * numDistantIcelandic / topSimsIcelandic))

    utils.safeWrite("%swordUse/fourApartComparisonInfo.txt" % baseFolder,
                    "\n".join(comparisonOutput))

    # Table
    for simMetric in simMetrics:
        dir, name = simMetric
        # "" or "+p" depending on which is better
        metricSims = utils.getContent(
            "output/greek/no_split/%s/%s/metric/Authors/sims.txt" %
            (topStr, dir), False).split("\n")

        for i, sim in enumerate(metricSims):
            pairName = sim[11:]
            if pairName in topDistantAuthors:
                topDistantAuthors[pairName][dir] = i + 1

    # prepare values for coloring table cells
    maxVal = 0
    minVal = 1000000
    for authorPair in topDistantAuthors:
        for simDir, _ in simMetrics:
            val = topDistantAuthors[authorPair][simDir]
            minVal = min(minVal, val)
            maxVal = max(maxVal, val)

    pairRankOutput = []
    pairRankOutputSimple = []
    pairRankOutput.append("""
\\begin{table}[!bt]
\\centering
\\def\\arraystretch{1}
\\begin{tabular}{| l | c | c | c | c | c | c |}
\\hline
 & \\multicolumn{6}{c|}{\\textbf{Rank according to}} \\\\
 & \\textbf{Jensen-} & \\textbf{Burrows'} & & & & \\\\
\\textbf{Authors} & \\textbf{Shannon} & \\textbf{Delta} & \\textbf{Min-Max} & \\textbf{Manhattan} & \\textbf{Canberra} & \\textbf{Cosine} \\\\\\hline
""")
    pairRankOutputSimple.append(
        "%s,%s,%s,%s,%s,%s,%s" %
        ("Authors", "Jensen-Shannon", "Burrows' Delta", "Min-Max",
         "Manhattan", "Canberra", "Cosine"))

    authorConvert = {
        "ApolloniusRhodius": "Apollonius",
        "DionysiusOfHalicarnassus": "Dionysius",
        "EusebiusOfCaesarea": "Eusebius",
        "ClementOfAlexandria": "Clement",
        "BasilBishopOfCaesarea": "Basil",
        "Anonymous(Hymns_Aphrodite)": "Hymns Aphrodite",
        "Anonymous(Hymns_Apollo)": "Hymns Apollo",
        "Anonymous(Hymns_Demeter)": "Hymns Demeter",
        "Anonymous(Hymns_Hermes)": "Hymns Hermes",
        "Anonymous(Hymns_Rest)": "Hymns Rest",
    }

    for authorPair in topDistantAuthors:
        pair = "(".join(authorPair.split(" (")[:-1])
        pairSplit = pair.split(", ")
        author1 = pairSplit[0]
        author2 = pairSplit[1]
        if author1 in authorConvert:
            author1 = authorConvert[author1]
        if author2 in authorConvert:
            author2 = authorConvert[author2]
        pairName = author1 + ", " + author2

        cell = "%s &" % pairName
        cellSimple = "%s," % re.sub(", ", "/", pairName)
        firstVal = None
        for simDir, _ in simMetrics:
            val = topDistantAuthors[authorPair][simDir]

            cutoff = 100
            if (val < cutoff):
                r, g, b = colorConvert(minVal, cutoff, val, COLOR_ORANGE,
                                       COLOR_GRAY)
            else:
                r, g, b = colorConvert(cutoff, maxVal, val, COLOR_GRAY,
                                       COLOR_BLUE)
            cell += "\\cellcolor[rgb]{%.3f,%.3f,%.3f} " % (r, g, b)

            if (firstVal is None):
                firstVal = val
                cell += "%d & " % (val)
                cellSimple += "%d," % (val)
            else:
                cell += "%d (%+d) & " % (val, firstVal - val)
                rel = "(%d)" % (firstVal - val)
                cellSimple += "%d %s," % (val, rel)
        cell = cell[:-2]

        pairRankOutput.append("%s\\\\\\hline" % cell)
        pairRankOutputSimple.append(cellSimple)

    pairRankOutput.append("""
\\end{tabular}
\\caption{Rank of these pairs' similarity by different metrics.}
\\label{table:pair_rank}
\\end{table}
""")
    utils.safeWrite("%swordUse/pairRankTable.tex" % baseFolder,
                    "\n".join(pairRankOutput))
    utils.safeWrite("%swordUse/pairRankTableSimple.csv" % baseFolder,
                    "\n".join(pairRankOutputSimple))
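# colorConvert and the COLOR_ORANGE / COLOR_GRAY / COLOR_BLUE constants used in
# fourCenturiesTables are defined elsewhere in the project. A minimal sketch,
# assuming colorConvert linearly interpolates between two RGB triples
# (components in [0, 1]) based on where val falls between minVal and maxVal,
# which is how the cell-coloring code above appears to use it. The helper name
# and the constant values below are placeholders for illustration only.
def _colorConvertSketch(minVal, maxVal, val, colorA, colorB):
    span = float(maxVal - minVal)
    t = 0.0 if span == 0 else min(max((val - minVal) / span, 0.0), 1.0)
    return tuple(a + t * (b - a) for a, b in zip(colorA, colorB))

_COLOR_ORANGE_EXAMPLE = (1.0, 0.6, 0.2)
_COLOR_GRAY_EXAMPLE = (0.85, 0.85, 0.85)
_COLOR_BLUE_EXAMPLE = (0.5, 0.7, 1.0)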