def get_sim_pair(corpus, target_word1, target_word2, year, results_dir):
    """Compute the cosine similarity of two words in one year and record it.

    The embedding for ``year`` is loaded from ``corpus`` and the similarity
    of ``target_word1`` and ``target_word2`` is computed. A tab-separated
    row (``word1-word2``, year, cosine) is appended to
    ``<results_dir><word1>-<word2>-cosines.tsv`` unless an identical row is
    already present. The cosine is printed as well.

    ``results_dir`` is combined with the file name by plain string
    concatenation, so it has to end with a path separator.
    """
    pair_label = target_word1 + '-' + target_word2
    results_path = results_dir + pair_label + '-cosines.tsv'

    # range(year, year + 10, 10) holds exactly one value: ``year`` itself.
    embedding_series = SequentialEmbedding.load(corpus,
                                                range(year, year + 10, 10))
    year_embedding = embedding_series.get_embed(year)
    cos = year_embedding.similarity(target_word1, target_word2)

    known_rows = []
    if os.path.isfile(results_path):
        print('file exists')
        with open(results_path) as infile:
            known_rows = infile.read().split('\n')

    with open(results_path, 'a') as outfile:
        row = '\t'.join([pair_label, str(year), str(cos)]) + '\n'
        if row.strip() not in known_rows:
            outfile.write(row)
        else:
            print('result already there')
    print(cos)
def outputSimilarities():
    """
    outputs similarity scores between stim word, high, low, incongruent

    Word lists come from parseTxtFile(); lists with fewer than four entries
    are skipped. For every remaining list the 1990 similarities between the
    stim word (entry 0) and entries 1-3 are written to
    "stimWordSimScores.txt" (overwriting it), one line per year, followed by
    a blank line per stim word.
    """
    wordPairsList = parseTxtFile()
    print(wordPairsList)
    real_embeddings = SequentialEmbedding.load("../embeddings/eng-all_sgns",
                                               [1990])
    with open("stimWordSimScores.txt", "w") as resultFile:
        for wordList in wordPairsList:
            if len(wordList) < 4:
                continue
            stimWord = wordList[0]
            # Entries 1..3 are (high, low, incongruent). All similarities are
            # fetched before anything is written for this stim word.
            simsPerWord = [
                (otherWord, real_embeddings.get_time_sims(stimWord, otherWord))
                for otherWord in wordList[1:4]
            ]
            for otherWord, timeSims in simsPerWord:
                # .items() works on Python 2 and 3; .iteritems() is Python 2
                # only and raises AttributeError on Python 3.
                for year, sim in timeSims.items():
                    simStr = "{year:d}, cosine similarity={sim:0.2f}".format(
                        year=year, sim=sim)
                    resultFile.write(
                        str(stimWord) + " " + str(otherWord) + " " + simStr
                        + "\n")
            resultFile.write("\n")
def histword_similarity(target_word, attribute_word):
    """Print the cosine similarity of two words for each loaded decade.

    Loads the fiction embeddings for range(1900, 2000, 10) and prints one
    "<year>, cosine similarity=<value>" line per entry returned by
    get_time_sims(target_word, attribute_word).
    """
    fiction_embeddings = SequentialEmbedding.load(
        "embeddings/eng-fiction-all_sgns", range(1900, 2000, 10))
    time_sims = fiction_embeddings.get_time_sims(target_word, attribute_word)
    # .items() instead of the Python-2-only .iteritems(), so this runs on
    # both interpreter lines (the function already uses print() calls).
    for year, sim in time_sims.items():
        print("{year:d}, cosine similarity={sim:0.2f}".format(year=year,
                                                              sim=sim))
def getSimScores(self, dataFile, resultFile):
    """Write the per-decade similarities of every parsed (stim, word) pair.

    Embeddings for range(1930, 1990, 10) are loaded once. For each entry of
    self.parsedLines the first two items are taken as stim and word, their
    lower-cased forms are passed to get_time_sims, and the result is handed
    to self.writeToOutput together with the original-case words and
    resultFile. dataFile is not used by this method.
    """
    decades = range(1930, 1990, 10)
    real_embeddings = SequentialEmbedding.load("../embeddings/eng-all_sgns",
                                               decades)
    for parsedLine in self.parsedLines:
        stim = parsedLine[0]
        word = parsedLine[1]
        sims = real_embeddings.get_time_sims(stim.lower(), word.lower())
        self.writeToOutput(sims, stim, word, resultFile)
def outputSimilaritiesAveraged():
    """Write averaged-embedding similarity scores and save a bar chart.

    For every word list from parseTxtFile() with at least four entries, the
    stim word (entry 0) and the high/low/incongruent words (entries 1-3) are
    each turned into an averaged embedding via getAveragedEmbedding, and the
    stim word is compared with the other three via getSimScore. The result
    strings are appended to "stimWordsIntervals/1900_avg.txt" and the mean
    score per category is saved as a bar chart in
    "stimWordsIntervals/1900_avg.png".

    If no word list qualifies, nothing is plotted.
    """
    wordPairsList = parseTxtFile()
    startYear, endYear = 1900, 1990
    embeddings = SequentialEmbedding.load("../embeddings/eng-all_sgns",
                                          range(startYear, endYear, 10))

    def averagedEmbed(word):
        # Sub-embeddings of a single word, reduced by getAveragedEmbedding.
        subembeds = embeddings.get_subembeds([word])
        return getAveragedEmbedding(subembeds, startYear, endYear)

    # One score list per category, in the order high, low, incon.
    valsByCategory = [[], [], []]
    with open("stimWordsIntervals/" + str(startYear) + "_avg.txt",
              "a") as resultFile:
        for wordList in wordPairsList:
            if len(wordList) < 4:
                continue
            stimWord = wordList[0]
            otherWords = wordList[1:4]
            stimWordAvgEmbed = averagedEmbed(stimWord)
            sims = [getSimScore(stimWordAvgEmbed, averagedEmbed(word))
                    for word in otherWords]
            for vals, word, sim in zip(valsByCategory, otherWords, sims):
                vals.append(sim)
                resultFile.write(
                    getResultStr(stimWord, word, startYear, endYear, sim))
            resultFile.write("\n")

    if not valsByCategory[0]:
        # No usable word list: avoid dividing by zero below.
        return

    barHeights = [sum(vals) / len(vals) for vals in valsByCategory]
    barNames = ["1900 high", "1900 low", "1900 incon"]
    save_str = "stimWordsIntervals/" + str(startYear) + "_avg.png"
    # Select the figure BEFORE drawing. Previously plt.figure(0) came after
    # plt.bar, so savefig could write a different (empty) figure than the
    # one the bars were drawn on.
    plt.figure(0)
    plt.bar(barNames, barHeights, color='blue')
    plt.savefig(save_str)
def load_embeddings(filename=None):
    """Return the embeddings for ``filename``, loading them on first use.

    Args:
        filename: embedding location; falls back to the module-level
            EMBEDDING when empty or None.

    Returns:
        The object produced by SequentialEmbedding.load for
        range(1840, 2000, 10). It is stored in EMBED_CACHE under
        ``filename`` and returned from there on later calls.

    The cache lookup and the load both happen while ``embed_lock`` is held.
    """
    if not filename:
        filename = EMBEDDING
    with embed_lock:
        # print() calls with a single pre-formatted argument behave the same
        # on Python 2 and Python 3; the former print statements were a
        # SyntaxError on Python 3.
        print("LOADING EMBEDDINGS %s" % filename)
        start = time.time()
        if filename in EMBED_CACHE:
            return EMBED_CACHE[filename]
        print("THIS MIGHT TAKE A WHILE...")
        embeddings = SequentialEmbedding.load(filename,
                                              range(1840, 2000, 10))
        print("LOAD EMBEDDINGS TOOK %s" % (time.time() - start))
        EMBED_CACHE[filename] = embeddings
        return embeddings
def load_embeddings(filename=None, start_year=1840, end_year=2000, step=10):
    """Return the embeddings for a file and year range, loading on first use.

    Args:
        filename: embedding location; falls back to the module-level
            EMBEDDING when empty or None.
        start_year: first year passed to range().
        end_year: exclusive upper bound passed to range().
        step: year increment passed to range().

    Returns:
        The object produced by SequentialEmbedding.load for
        range(start_year, end_year, step). Results are kept in EMBED_CACHE.

    The cache lookup and the load both happen while ``embed_lock`` is held.
    """
    if not filename:
        filename = EMBEDDING
    # The key has to include every argument that changes what gets loaded.
    # Keying on filename + step alone returned embeddings for a stale year
    # range when only start_year or end_year differed between calls.
    cache_key = (filename, start_year, end_year, step)
    with embed_lock:
        print("LOADING EMBEDDINGS %s" % filename)
        start = time.time()
        if cache_key in EMBED_CACHE:
            return EMBED_CACHE[cache_key]
        print("THIS MIGHT TAKE A WHILE...")
        embeddings = SequentialEmbedding.load(
            filename, range(start_year, end_year, step))
        print("LOAD EMBEDDINGS TOOK %s" % (time.time() - start))
        EMBED_CACHE[cache_key] = embeddings
        return embeddings
def load_embeddings(filename=None):
    """Fetch the embeddings for ``filename`` from EMBED_CACHE, loading once.

    An empty or missing ``filename`` is replaced by the module-level
    EMBEDDING. On a cache miss the embeddings for range(1840, 2000, 10) are
    loaded through SequentialEmbedding.load, timed, and stored in
    EMBED_CACHE. Everything after the default substitution runs while
    ``embed_lock`` is held.
    """
    if not filename:
        filename = EMBEDDING
    with embed_lock:
        print("LOADING EMBEDDINGS %s" % filename)
        start = time.time()
        if filename not in EMBED_CACHE:
            print("THIS MIGHT TAKE A WHILE...")
            decades = range(1840, 2000, 10)
            loaded = SequentialEmbedding.load(filename, decades)
            elapsed = time.time() - start
            print("LOAD EMBEDDINGS TOOK %s" % elapsed)
            EMBED_CACHE[filename] = loaded
            return loaded
        return EMBED_CACHE[filename]
def evaluate_diachronic_accuracy(embedding_path, word_pairs_path, start_year,
                                 end_year, year_inc):
    """Return the fraction of word pairs with a significant time correlation.

    Word pairs are read with ioutils.load_word_pairs, which yields two
    parallel sequences. For each pair the similarities over time are passed
    to compute_spear_corr; element 0 of its result is printed as the
    correlation and element 1 as the p-value. A pair counts as significant
    when the p-value is <= 0.05.

    Args:
        embedding_path: location handed to SequentialEmbedding.load.
        word_pairs_path: location handed to ioutils.load_word_pairs.
        start_year: first year of the range.
        end_year: last year of the range (inclusive; end_year + 1 is the
            range bound).
        year_inc: year increment.

    Returns:
        Number of significant pairs divided by the number of pairs (float).

    Raises:
        IndexError: if the word-pair file yields no pairs.
    """
    word_pairs_1, word_pairs_2 = ioutils.load_word_pairs(word_pairs_path)
    embeddings = SequentialEmbedding.load(
        embedding_path, range(start_year, end_year + 1, year_inc))
    stat_sig_count = 0
    pairs_len = len(word_pairs_1)
    # Single-argument print() calls produce the same text on Python 2 and 3;
    # the former print statements do not parse on Python 3.
    print("Getting similarities for {}".format(word_pairs_1[0]))
    print("Correlation \t p-value")
    print("-----------------------------")
    for p1, p2 in zip(word_pairs_1, word_pairs_2):
        time_sims = embeddings.get_time_sims(p1, p2)
        spear_corr = compute_spear_corr(time_sims)
        print("{corr:0.7f}\t{p:0.7f}".format(corr=spear_corr[0],
                                             p=spear_corr[1]))
        if spear_corr[1] <= 0.05:
            stat_sig_count += 1
    return stat_sig_count * 1.0 / pairs_len
def pltHistograms():
    """Save one cosine-similarity histogram per decade from 1800 to 1990.

    For every word list from parseTxtFile() with at least four entries, the
    similarities between the stim word (entry 0) and the high, low and
    incongruent words (entries 1-3) are grouped by year. Each year gets a
    figure with three overlaid histograms (red = high, blue = low,
    green = incongruent) saved to "histograms/<year>".
    """
    startDecade, endDecade = 1800, 2000
    years = list(range(startDecade, endDecade, 10))
    yearHigh = {year: [] for year in years}
    yearLow = {year: [] for year in years}
    yearIncon = {year: [] for year in years}

    # Parsed once; the earlier second call only shadowed an unused local.
    wordPairsList = parseTxtFile()
    real_embeddings = SequentialEmbedding.load(
        "../embeddings/eng-all_sgns", range(startDecade, endDecade, 10))

    for wordList in wordPairsList:
        if len(wordList) < 4:
            continue
        stimWord = wordList[0]
        high, low, incongruent = wordList[1], wordList[2], wordList[3]
        time_sim_high = real_embeddings.get_time_sims(stimWord, high)
        time_sim_low = real_embeddings.get_time_sims(stimWord, low)
        time_sim_incon = real_embeddings.get_time_sims(stimWord, incongruent)
        for buckets, time_sims in ((yearHigh, time_sim_high),
                                   (yearLow, time_sim_low),
                                   (yearIncon, time_sim_incon)):
            # .items() runs on Python 2 and 3; .iteritems() is Python 2 only.
            for sim_year, sim in time_sims.items():
                buckets[sim_year].append(sim)

    for figCount, year in enumerate(years):
        plt.figure(figCount)
        plt.title("cosine sim histogram in year " + str(year))
        plt.xlabel("cosine sim scores")
        plt.ylabel("frequency of score")
        plt.hist(yearHigh[year], color='r')
        plt.hist(yearLow[year], color='b')
        plt.hist(yearIncon[year], color='g')
        plt.savefig("histograms/" + str(year))
        # Release the figure once it is on disk; otherwise all 20 figures
        # stay open until the process exits.
        plt.close()
def evaluate_by_hw(target, ref, t, gold, embeddingfile):
    """Check whether the similarity trend of two words matches a gold sign.

    The similarities of ``target`` and ``ref`` are taken for every decade
    from ``t`` rounded down to its decade up to 1990, and correlated with
    the years using Spearman's rho.

    Args:
        target: first word.
        ref: second word.
        t: start year; rounded down to a multiple of 10.
        gold: expected direction; converted with int() and compared by sign.
        embeddingfile: location handed to SequentialEmbedding.load.

    Returns:
        dict with keys "correct" (1 if sign(rho) == sign(gold), else 0),
        "sig" (1 if p <= 0.05, else 0), "p" and "corr" (rho). When rho is
        NaN, both "correct" and "sig" are NaN.
    """
    gold = int(gold)
    # round off to nearest smaller decade
    offset = int(t / 10) * 10
    embeddings = SequentialEmbedding.load(embeddingfile,
                                          range(1800, 2000, 10))
    time_sims = embeddings.get_time_sims(target, ref)

    # Plain lists: on Python 3 dict.keys()/.values() are views, which
    # spearmanr cannot turn into numeric arrays. Using separate names also
    # stops the ``t`` parameter from being rebound to a dict.
    years = list(range(offset, 2000, 10))
    sims = [time_sims[y] for y in years]
    rho, p = scipy.stats.spearmanr(years, sims)

    sig = 1 if p <= 0.05 else 0
    if np.isnan(rho):
        correct = float('nan')
        sig = float('nan')
    elif np.sign(rho) == np.sign(gold):
        correct = 1
    else:
        correct = 0
    return {"correct": correct, 'sig': sig, "p": p, "corr": rho}
def outputSimilaritiesAveraged():
    """Append 1990 phrase-level similarity scores to "stimWordSimScores.txt".

    Each key of multipleContentPairMap is a space-separated stim phrase and
    each value unpacks into (high, low, incon). The phrase is split into
    words, phraseSimScores is evaluated against each of the three words with
    the 1990 embeddings, and three result lines plus a blank line are
    appended per phrase.
    """
    real_embeddings = SequentialEmbedding.load("../embeddings/eng-all_sgns",
                                               [1990])
    with open("stimWordSimScores.txt", "a") as resultFile:
        for stimPhrase in multipleContentPairMap:
            stimWords = stimPhrase.split(' ')
            high, low, incon = multipleContentPairMap[stimPhrase]
            comparisonWords = (high, low, incon)
            # All three scores are computed before any line is written.
            scores = [phraseSimScores(stimWords, word, real_embeddings)
                      for word in comparisonWords]
            for word, score in zip(comparisonWords, scores):
                simStr = "{year:d}, cosine similarity={sim:0.2f}".format(
                    year=1990, sim=float(score))
                line = " ".join([str(stimPhrase), str(word), simStr]) + "\n"
                resultFile.write(line)
            resultFile.write("\n")
def plotIndividualLines(startDecade, endDecade, averaged=False,
                        contextRelevant=False):
    """Save one line graph per word list showing three similarity series.

    For every word list from parseTxtFile() with at least four entries, the
    similarities over time between the stim word (entry 0) and the high, low
    and incongruent words (entries 1-3) are plotted in green, red and
    magenta against the decades in range(startDecade, endDecade, 10).

    Args:
        startDecade: first decade to load and plot.
        endDecade: exclusive upper bound of the decades.
        averaged: when true, each plotted value is the mean of the values
            already plotted for that series and the new similarity; graphs
            go to "averagedLineGraphs/". Takes precedence over
            contextRelevant.
        contextRelevant: when true, each similarity is divided by the
            denominator from getContextSimScoreDenom for that word list and
            year (0.0 when the denominator is 0); graphs go to
            "contextRelevantLineGraphs/".

    With both flags false the raw similarities are plotted and saved under
    "individualLineGraphs/".
    """
    wordPairsList = parseTxtFile()
    years = list(range(startDecade, endDecade, 10))
    real_embeddings = SequentialEmbedding.load(
        "../embeddings/eng-all_sgns", range(startDecade, endDecade, 10))

    stimWordDenomMap = {}
    if contextRelevant:
        stimWordDenomMap = getContextSimScoreDenom(
            startDecade, endDecade, wordPairsList, real_embeddings)

    def buildSeries(timeSims, triplet):
        # Turn one year -> similarity mapping into the list that is plotted.
        vals = []
        # .items() runs on Python 2 and 3; .iteritems() is Python 2 only.
        for simYear, sim in timeSims.items():
            if averaged:
                vals.append((sum(vals) + sim) / (len(vals) + 1))
            elif contextRelevant:
                denom = stimWordDenomMap[triplet][simYear]
                vals.append(sim / denom if denom != 0 else 0.0)
            else:
                vals.append(sim)
        return vals

    if averaged:
        folder = "averagedLineGraphs/"
    elif contextRelevant:
        folder = "contextRelevantLineGraphs/"
    else:
        folder = "individualLineGraphs/"

    for wordList in wordPairsList:
        if len(wordList) < 4:
            continue
        stimWord = wordList[0]
        high, low, incongruent = wordList[1], wordList[2], wordList[3]
        time_sim_high = real_embeddings.get_time_sims(stimWord, high)
        time_sim_low = real_embeddings.get_time_sims(stimWord, low)
        time_sim_incon = real_embeddings.get_time_sims(stimWord, incongruent)
        # Key into stimWordDenomMap; only looked up when contextRelevant.
        triplet = (stimWord, high, low, incongruent)

        highVals = buildSeries(time_sim_high, triplet)
        lowVals = buildSeries(time_sim_low, triplet)
        inconVals = buildSeries(time_sim_incon, triplet)

        plt.figure()
        plt.plot(years, highVals, 'g')
        plt.plot(years, lowVals, 'r')
        plt.plot(years, inconVals, 'm')
        plotTitle = (stimWord + " " + "(" + high + ", " + low + ", "
                     + incongruent + ", " + ")")
        plotFileName = (folder + stimWord + "_" + high + "_" + low + "_"
                        + incongruent)
        plt.title(plotTitle)
        plt.savefig(plotFileName)
        plt.close()
from representations.sequentialembedding import SequentialEmbedding

"""
Example showing how to load a series of historical embeddings and compute
similarities over time.

Warning that loading all the embeddings into main memory can take a lot of RAM
"""

if __name__ == "__main__":
    # Decades 1950-1990 of the fiction embeddings.
    fiction_embeddings = SequentialEmbedding.load(
        "embeddings/eng-fiction-all_sgns", range(1950, 2000, 10))
    time_sims = fiction_embeddings.get_time_sims("lesbian", "gay")
    # print() with one argument and .items() behave the same on Python 2 and
    # Python 3; the print statement / .iteritems() form was Python 2 only.
    print("Similarity between gay and lesbian drastically increases from 1950s to the 1990s:")
    for year, sim in time_sims.items():
        print("{year:d}, cosine similarity={sim:0.2f}".format(year=year,
                                                              sim=sim))
#!/usr/bin/env python
# Constructs a list of neighbor words for each target term.
from representations.sequentialembedding import SequentialEmbedding
import pandas as pd
import numpy as np

# Output file and embedding location.
save_as = "lingua_technica_neighbors.csv"
embedding_path = "embeddings/all-eng"

# Load the decade embeddings for 1850 through 1990.
print("loading embeddings...")
embeddings = SequentialEmbedding.load(embedding_path, range(1850, 2000, 10))

# The set of target words to query neighbors for
targets = [
    "motor",
    "engine",
    "computer",
    "laser",
    "rocket",
    "radar",
    "microwave",
]

# The number of neighbors to query for each word
n = 100

# The start and end year for the range of decades to query
start_year, end_year = 1900, 2000
from representations.sequentialembedding import SequentialEmbedding

"""
Example showing how to load a series of historical embeddings and compute
similarities over time.

Warning that loading all the embeddings into main memory can take a lot of RAM
"""

if __name__ == "__main__":
    # Every decade from 1840 through 1990 of the fiction embeddings.
    fiction_embeddings = SequentialEmbedding.load(
        "embeddings/eng-fiction-all_sgns",
        list(range(1840, 2000, 10)),
    )
    # One "<year>, cosine similarity=<value>" line per returned entry.
    time_sims = fiction_embeddings.get_time_sims("happy", "gay")
    for year, sim in time_sims.items():
        line = "{year:d}, cosine similarity={sim:0.2f}".format(year=year,
                                                               sim=sim)
        print(line)
"human body": ["leg", "chest", "gyms"], "place live": ["apartment", "tent", "domino"], "four footed animal": ["cat", "pig", "bus"], "part tree": ["branch", "twig", "plaza"], "room house": ["bedroom", "attic", "brooms"], "kitchen tool": ["fork", "mixer", "text"], "four footed animal": ["dog", "mouse", "pink"], "part bicycle": ["wheel", "lock", "store"], "musical instrument": ["flute", "harp", "snap"], "type bread": ["white", "french", "ruins"], "room house": ["bathroom", "study", "zombie"], "farm animal": ["pig", "mule", "pipe"], "thing read": ["newspaper", "letter", "vulture"], "carpenter tool": ["hammer", "ruler", "box"]} yearEmbeddingMap = {1940: SequentialEmbedding.load("../embeddings/eng-all_sgns", [1940]), 1950: SequentialEmbedding.load("../embeddings/eng-all_sgns", [1950]), 1960: SequentialEmbedding.load("../embeddings/eng-all_sgns", [1960]), 1970: SequentialEmbedding.load("../embeddings/eng-all_sgns", [1970]), 1980: SequentialEmbedding.load("../embeddings/eng-all_sgns", [1980]), 1990: SequentialEmbedding.load("../embeddings/eng-all_sgns", [1990])} # returns list of similarity scores for phrases from 1940 - 1990 #def phraseSimScoresOverTime(stimWords, word): def phraseSimScores(stimWords, word, real_embeddings): stimWordVectors = [] for sWord in stimWords: sEmbed = real_embeddings.get_subembeds([sWord]) try:
def get_sim_neighbors_svd(corpus, target_word1, target_word2, year1, year2, n,
                          results_dir):
    """Two options: either 2 different years and 1 target word or the same
    year and 2 target words.

    In both modes the ``n`` nearest neighbors of the two word/year
    combinations are collected, second-order vectors are built over the
    union of those neighbors, and their cosine is computed. The paired
    neighbor words are written to
    ``<results_dir>/neighbors/<w1>-<w2>-<y1>-<y2>.tsv`` (overwritten) and
    the cosine is appended to a ``cosines-...tsv`` file in ``results_dir``
    unless the same row is already present. The cosine is printed as well.

    Args:
        corpus: location handed to the embedding loader.
        target_word1: first word.
        target_word2: second word (equal to target_word1 in two-year mode).
        year1: first year.
        year2: second year (equal to year1 in two-word mode).
        n: number of nearest neighbors per word.
        results_dir: existing directory for the output files.

    Raises:
        ValueError: if the arguments fit neither mode (same word with the
            same year, or different words with different years).
    """
    same_word = target_word1 == target_word2
    same_year = year1 == year2
    if same_word == same_year:
        # Neither branch below would run; previously this surfaced later as
        # an unbound-name error, after the output directory had been made.
        raise ValueError(
            'expected either one word with two different years or two '
            'different words with one year')

    neighbors_dir = os.path.join(results_dir, 'neighbors')
    if not os.path.isdir(neighbors_dir):
        os.mkdir(neighbors_dir)
    words_path = os.path.join(
        neighbors_dir,
        target_word1 + '-' + target_word2 + '-' + str(year1) + '-'
        + str(year2) + '.tsv')

    if same_word:
        results_cosine = 'cosines-' + target_word1 + '-n-' + str(n) + '.tsv'
        embedds = SequentialSVDEmbedding.load(corpus,
                                              range(year1, year2 + 10, 10))
        embedd_year1 = embedds.get_embed(year1)
        embedd_year2 = embedds.get_embed(year2)
        neighbors1 = get_nearest_neighbors(embedd_year1, target_word1, n)
        neighbors2 = get_nearest_neighbors(embedd_year2, target_word1, n)
        union = get_union(neighbors1, neighbors2)
        filtered_union = filter_union(union, embedd_year1, embedd_year2,
                                      target_word1)
        vec1 = get_second_order_vector(embedd_year1, filtered_union,
                                       target_word1)
        vec2 = get_second_order_vector(embedd_year2, filtered_union,
                                       target_word1)
    else:
        results_cosine = ('cosines-' + target_word1 + '-' + target_word2
                          + '-n-' + str(n) + '.tsv')
        # NOTE(review): this mode loads SequentialEmbedding while the
        # two-year mode loads SequentialSVDEmbedding - confirm intended.
        embedds = SequentialEmbedding.load(corpus,
                                           range(year1, year2 + 10, 10))
        embedd_year = embedds.get_embed(year1)
        neighbors1 = get_nearest_neighbors(embedd_year, target_word1, n)
        neighbors2 = get_nearest_neighbors(embedd_year, target_word2, n)
        union = get_union(neighbors1, neighbors2)
        vec1 = get_second_order_vector(embedd_year, union, target_word1)
        vec2 = get_second_order_vector(embedd_year, union, target_word2)

    neighbor_words1 = get_nearest_neighbor_words(neighbors1)
    neighbor_words2 = get_nearest_neighbor_words(neighbors2)
    cos = get_cosine(vec1, vec2)

    # One path for both the duplicate check and the append. Previously the
    # check used results_dir+name and the write results_dir+'/'+name, which
    # differ whenever results_dir has no trailing separator.
    cosine_path = os.path.join(results_dir, results_cosine)
    if os.path.isfile(cosine_path):
        print('file exists')
        with open(cosine_path) as infile:
            existing_results = infile.read().split('\n')
    else:
        existing_results = []

    with open(words_path, 'w') as outfile1:
        for word1, word2 in zip(neighbor_words1, neighbor_words2):
            outfile1.write(word1 + '\t' + word2 + '\n')

    with open(cosine_path, 'a') as outfile2:
        result = (target_word1 + '-' + target_word2 + '\t' + str(year1) + '-'
                  + str(year2) + '\t' + str(cos) + '\n')
        if result.strip() in existing_results:
            print('result already there')
        else:
            outfile2.write(result)
    print(cos)
def outputAndPlotSimilaritiesRange():
    """Append long-range and short-range similarity averages per start year.

    For each start year from 1900 to 1980 (step 10) the embeddings for
    range(year, 2000, 10) are loaded. For every word list from
    parseTxtFile() with at least four entries, the similarities over time
    between the stim word (entry 0) and the high, low and incongruent words
    (entries 1-3) are reduced by computeAvgs to a (long, short) pair. The
    means of those values across all word lists are appended to
    "stimWordsIntervals/rangeSimScores.txt".

    Despite the name, no plot is produced. If no word list has four
    entries the function returns without loading embeddings or writing.
    """
    validWordLists = [wordList for wordList in parseTxtFile()
                      if len(wordList) >= 4]
    if not validWordLists:
        # Nothing to average; avoids a division by zero further down.
        return

    def mean(values):
        return sum(values) / len(values)

    for year in [1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980]:
        highLongRangeVals, lowLongRangeVals, inconLongRangeVals = [], [], []
        highShortRangeVals, lowShortRangeVals, inconShortRangeVals = [], [], []
        real_embeddings = SequentialEmbedding.load(
            "../embeddings/eng-all_sgns", range(year, 2000, 10))

        for wordList in validWordLists:
            stimWord = wordList[0]
            high, low, incongruent = wordList[1], wordList[2], wordList[3]
            time_sim_high = real_embeddings.get_time_sims(stimWord, high)
            time_sim_low = real_embeddings.get_time_sims(stimWord, low)
            time_sim_incon = real_embeddings.get_time_sims(stimWord,
                                                           incongruent)
            high_long_avg, high_short_avg = computeAvgs(time_sim_high)
            low_long_avg, low_short_avg = computeAvgs(time_sim_low)
            incon_long_avg, incon_short_avg = computeAvgs(time_sim_incon)
            highLongRangeVals.append(high_long_avg)
            highShortRangeVals.append(high_short_avg)
            lowLongRangeVals.append(low_long_avg)
            lowShortRangeVals.append(low_short_avg)
            inconLongRangeVals.append(incon_long_avg)
            inconShortRangeVals.append(incon_short_avg)

        avg_high_long = mean(highLongRangeVals)
        avg_low_long = mean(lowLongRangeVals)
        avg_incon_long = mean(inconLongRangeVals)
        avg_high_short = mean(highShortRangeVals)
        avg_low_short = mean(lowShortRangeVals)
        avg_incon_short = mean(inconShortRangeVals)

        with open("stimWordsIntervals/rangeSimScores.txt", "a") as f:
            f.write(str(year) + ", 1990 high average: "
                    + str(avg_high_long) + "\n")
            f.write(str(year) + ", 1990 low average: "
                    + str(avg_low_long) + "\n")
            f.write(str(year) + ", 1990 incongruent average: "
                    + str(avg_incon_long) + "\n")
            f.write("1980, 1990 high average: " + str(avg_high_short) + "\n")
            f.write("1980, 1990 low average: " + str(avg_low_short) + "\n")
            f.write("1980, 1990 incongruent average "
                    + str(avg_incon_short) + "\n")
            f.write("\n")
# clics3_dict = np.load("clics_1_languages.npy",allow_pickle = True) # clics3_dict = clics3_dict[0] # non_word counts: # 891 total # 532 remove bracket # 427 remove OR (105 words are first word in "OR" clause) # transfer the clics3 dictionary to concepts expressed in only 1 english word. # transferable, filtered = transfer_to_one_word(clics3_dict) # count is for the reduandent word # dict_one_word,count = dict_to_one_word(clics3_dict, transferable) # np.save("clics_2_dict",[dict_one_word],allow_pickle=True) #get word histwords embeddings embds = SequentialEmbedding.load("embeddings/eng-all_sgns/sgns", range(1890, 2000, 10)) year_s = 1890 year_e = 1990 # rate = calcualte_overlap_rate(embds,dict_one_word,1990,n=100) # clics_2_languages #1990 rate mean = 0.18535519972 #1890 rate mean = 0.21535519972 #clics_1_languages # 1990 rate mean = 0.08764466776061722 # 1890 rate mean = 0.09757857460610882 #----------------7.23.2020----------------------------------- # untransfered word list
def makeLineGraphs(startDecade, endDecade, dumpData=False):
    """Plot every similarity series plus the per-decade averages.

    For every word list from parseTxtFile() with at least four entries, the
    similarities over time between the stim word (entry 0) and the high, low
    and incongruent words (entries 1-3) are drawn in green, red and magenta.
    The per-decade mean of each category is drawn on top as a thick black
    line and the figure is shown with plt.show().

    Args:
        startDecade: first decade to load and plot.
        endDecade: exclusive upper bound of the decades.
        dumpData: when true, additionally append every series to
            "highPairsCosSimScores.txt", "lowPairsCosSimScores.txt" and
            "incongruentPairsCosSimScores.txt". Off by default, which
            matches the earlier behavior where the dump was commented out.

    If no word list has four entries, nothing is plotted.
    """
    wordPairsList = parseTxtFile()
    real_embeddings = SequentialEmbedding.load(
        "../embeddings/eng-all_sgns", range(startDecade, endDecade, 10))
    highPairs, lowPairs, inconPairs = {}, {}, {}

    def writeToFileFromDict(startDecade, pairsDictList, stim, word, fileName):
        # Append one "stim, word year score" line per decade, starting at
        # startDecade and assuming consecutive 10-year steps.
        with open(fileName, "a") as f:
            currentYear = startDecade
            for score in pairsDictList:
                writeStr = (stim + ", " + word + " " + str(currentYear) + " "
                            + str(score) + "\n")
                f.write(writeStr)
                currentYear += 10
            f.write("\n")

    for i, wordList in enumerate(wordPairsList):
        if len(wordList) < 4:
            continue
        stimWord = wordList[0]
        high, low, incongruent = wordList[1], wordList[2], wordList[3]
        time_sim_high = real_embeddings.get_time_sims(stimWord, high)
        time_sim_low = real_embeddings.get_time_sims(stimWord, low)
        time_sim_incon = real_embeddings.get_time_sims(stimWord, incongruent)
        # .items() runs on Python 2 and 3; .iteritems() is Python 2 only.
        highPairs[i] = [sim for _, sim in time_sim_high.items()]
        lowPairs[i] = [sim for _, sim in time_sim_low.items()]
        inconPairs[i] = [sim for _, sim in time_sim_incon.items()]

        if dumpData:
            writeToFileFromDict(startDecade, highPairs[i], stimWord, high,
                                "highPairsCosSimScores.txt")
            writeToFileFromDict(startDecade, lowPairs[i], stimWord, low,
                                "lowPairsCosSimScores.txt")
            writeToFileFromDict(startDecade, inconPairs[i], stimWord,
                                incongruent,
                                "incongruentPairsCosSimScores.txt")

    if not highPairs:
        # No usable word list: the averages below would divide by zero.
        return

    mpl.style.use('default')
    years = list(range(startDecade, endDecade, 10))

    def decadeAverages(pairs):
        # Mean across all word lists at each decade position.
        return [
            sum(series[pos] for series in pairs.values()) / len(pairs)
            for pos in range(len(years))
        ]

    highAvgs = decadeAverages(highPairs)
    lowAvgs = decadeAverages(lowPairs)
    inconAvgs = decadeAverages(inconPairs)

    for series in highPairs.values():
        plt.plot(years, series, 'g')
    for series in lowPairs.values():
        plt.plot(years, series, 'r')
    for series in inconPairs.values():
        plt.plot(years, series, 'm')
    plt.plot(years, highAvgs, 'k', linewidth=2.0)
    plt.plot(years, lowAvgs, 'k', linewidth=2.0)
    plt.plot(years, inconAvgs, 'k', linewidth=2.0)
    plt.title("Changes in similarity scores over time (extended)")
    plt.xlabel("Years")
    plt.ylabel("Similarity Scores")
    plt.show()
def outputAndPlotSimilarities():
    """Save a bar chart per start year comparing it with the 1980/1990 base.

    The base values come from the embeddings for [1980, 1990]: for every
    word list from parseTxtFile() with at least four entries, all
    similarities between the stim word (entry 0) and the high, low and
    incongruent words (entries 1-3) are collected. The same is done for
    [year, 1990] with each start year from 1900 to 1980 (step 10); those
    similarities are also passed to writeToFile. The six means (year
    high/low/incon, base high/low/incon) are printed and saved as a bar
    chart in "stimWordsIntervals/<year>".

    If no word list has four entries, nothing is loaded or plotted.
    """
    validWordLists = [wordList for wordList in parseTxtFile()
                      if len(wordList) >= 4]
    if not validWordLists:
        # Nothing to average; avoids a division by zero further down.
        return

    def collectSims(embedding, writeYear=None):
        # Gather all high/low/incon similarities; when writeYear is given,
        # also hand each word list's mappings to writeToFile.
        highVals, lowVals, inconVals = [], [], []
        for wordList in validWordLists:
            stimWord = wordList[0]
            high, low, incongruent = wordList[1], wordList[2], wordList[3]
            time_sim_high = embedding.get_time_sims(stimWord, high)
            time_sim_low = embedding.get_time_sims(stimWord, low)
            time_sim_incon = embedding.get_time_sims(stimWord, incongruent)
            # .items() runs on Python 2 and 3; .iteritems() is Python 2 only.
            highVals.extend(sim for _, sim in time_sim_high.items())
            lowVals.extend(sim for _, sim in time_sim_low.items())
            inconVals.extend(sim for _, sim in time_sim_incon.items())
            if writeYear is not None:
                writeToFile(time_sim_high, writeYear, stimWord, high)
                writeToFile(time_sim_low, writeYear, stimWord, low)
                writeToFile(time_sim_incon, writeYear, stimWord, incongruent)
        return highVals, lowVals, inconVals

    def mean(values):
        return sum(values) / len(values)

    base_embedding = SequentialEmbedding.load("../embeddings/eng-all_sgns",
                                              [1980, 1990])
    highVal8090, lowVal8090, inconVal8090 = collectSims(base_embedding)
    high8090Avg = mean(highVal8090)
    low8090Avg = mean(lowVal8090)
    incon8090Avg = mean(inconVal8090)

    years = [1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980]
    for figureCount, year in enumerate(years):
        real_embeddings = SequentialEmbedding.load(
            "../embeddings/eng-all_sgns", [year, 1990])
        yearHighVals, yearLowVals, yearInconVals = collectSims(
            real_embeddings, writeYear=year)

        barNames = (str(year) + " high", str(year) + " low",
                    str(year) + " incon", "1990 high", "1990 low",
                    "1990 incon")
        barHeights = [
            mean(yearHighVals), mean(yearLowVals), mean(yearInconVals),
            high8090Avg, low8090Avg, incon8090Avg
        ]
        print(barHeights)
        plt.figure(figureCount)
        plt.bar(barNames, barHeights)
        save_str = "stimWordsIntervals/" + str(year)
        plt.savefig(save_str)
        # Release the figure once it is on disk.
        plt.close()
# value: list of CN words, corresponding to the EN word
from_EN_to_CN = {}
for group, group_df in df.groupby("EN"):
    cn_words = group_df["CN"].values
    from_EN_to_CN[group] = list(cn_words)
print(from_EN_to_CN)
"""
output
{'Independence': ['个性', '自己', '独特'],
 'Interdependence': ['屈服', '共享', '社区', '和谐'],
 'Need': ['需要', '必要', '前提'],
 'Status & Achievement': ['级别', '地位', '声望', '声誉']}
"""

"""
histwords
"""
from representations.sequentialembedding import SequentialEmbedding
ch_embeddings = SequentialEmbedding.load("embeddings/CN", range(1950, 2000, 10))

# Collect the similarity over time for every CN word pair that belongs to two
# different EN concepts (A & B, A & C, B & C, and so on).
data = []
from itertools import combinations, product
# 1. every unordered pair of EN concepts
for A, B in combinations(from_EN_to_CN.keys(), 2):
    # 2. every combination of their CN words
    for A_word, B_word in product(from_EN_to_CN[A], from_EN_to_CN[B]):
        # 3. one row per year: [label, year, similarity]
        time_sims = ch_embeddings.get_time_sims(A_word, B_word)
        for year, sim in time_sims.items():
            label = "%s-%s: %s-%s" % (A, B, A_word, B_word)
            data.append([label, year, sim])
if neighbour != word and fiction_embeddings.get_sim( 1990, word, neighbour) > 0.25: neighbours_list.append(neighbour) for neighbour_of_neighbour in fiction_embeddings.get_seq_closest( neighbour, 1990): if neighbour_of_neighbour not in neighbours_list and \ fiction_embeddings.get_sim(1990, neighbour, neighbour_of_neighbour) > 0.25 and \ neighbour_of_neighbour != word: neighbours_list.append(neighbour_of_neighbour) if len(neighbours_list): print(word + ' ' + str(len(neighbours_list))) dict_neighbours_1990[word] = neighbours_list if __name__ == "__main__": fiction_embeddings = SequentialEmbedding.load( "embeddings/eng-fiction-all_sgns", range(1900, 2000, 90)) vocab1900 = sorted( util.load_pickle('embeddings/eng-fiction-all_sgns/1900-vocab.pkl')) #construct_neighbours_list(vocab1900) ''' for word in tqdm(vocab1900[5000:]): list1900 = [] for neighbour in fiction_embeddings.get_seq_closest(word, 1900): if neighbour != word and fiction_embeddings.get_sim(1900, word, neighbour) > 0.4: list1900.append(neighbour) if len(list1900): dict_neighbours_1900[word] = list1900 ''' pool = ThreadPool(8) pool.map(res, vocab1900[5000:99965]) pool.close()
from representations.sequentialembedding import SequentialEmbedding

"""
Example showing how to load a series of historical embeddings and compute
similarities over time.

Warning that loading all the embeddings into main memory can take a lot of RAM
"""

if __name__ == "__main__":
    # Walk through 1800, 1820, ..., 1980, loading a single year at a time:
    # range(year, year + 20, 20) contains only ``year``. For each one, print
    # the year and the neighbour set (n=5) of "awful".
    for year in range(1800, 2000, 20):
        single_year = range(year, year + 20, 20)
        fiction_embeddings = SequentialEmbedding.load("eng-all_sgns",
                                                      single_year)
        time_sims = fiction_embeddings.get_seq_neighbour_set("awful", n=5)
        print(year)
        print(time_sims)