示例#1
0
def get_sim_pair(corpus, target_word1, target_word2, year, results_dir):
    """Compute the similarity of two words for one year and log it to a TSV.

    The embedding for ``year`` is loaded from ``corpus`` and
    ``similarity(target_word1, target_word2)`` is evaluated on it.  The row
    ``<word1>-<word2>\\t<year>\\t<cos>`` is appended to
    ``results_dir + '<word1>-<word2>-cosines.tsv'`` unless an identical row
    is already in that file.  The similarity is printed at the end.

    ``results_dir`` is concatenated directly with the file name, so it has
    to carry its own trailing path separator.
    """
    pair_label = '-'.join([target_word1, target_word2])
    out_path = results_dir + pair_label + '-cosines.tsv'

    # range(year, year + 10, 10) contains only `year` itself.
    year_embedding = SequentialEmbedding.load(
        corpus, range(year, year + 10, 10)).get_embed(year)
    cos = year_embedding.similarity(target_word1, target_word2)

    known_rows = []
    if os.path.isfile(out_path):
        print('file exists')
        with open(out_path) as infile:
            known_rows = infile.read().split('\n')

    row = '\t'.join([pair_label, str(year), str(cos)]) + '\n'
    with open(out_path, 'a') as outfile:
        if row.strip() not in known_rows:
            outfile.write(row)
        else:
            print('result already there')

    print(cos)
def outputSimilarities():
    """
    Write similarity scores between each stimulus word and its high, low
    and incongruent partner words to "stimWordSimScores.txt".

    Word lists come from parseTxtFile(); lists with fewer than four entries
    are skipped.  Only the 1990 embedding is loaded, and the output file is
    overwritten on every call.  For each stimulus word the high, low and
    incongruent lines are written in that order, followed by a blank line.
    """
    wordPairsList = parseTxtFile()
    print(wordPairsList)
    real_embeddings = SequentialEmbedding.load("../embeddings/eng-all_sgns", [1990])

    with open("stimWordSimScores.txt", "w") as resultFile:
        for wordList in wordPairsList:
            if len(wordList) < 4:
                continue
            stimWord = wordList[0]
            # Entries 1..3 are the high, low and incongruent partners.
            # All similarities are looked up before anything is written.
            partnerSims = [(partner,
                            real_embeddings.get_time_sims(stimWord, partner))
                           for partner in wordList[1:4]]

            for partner, time_sims in partnerSims:
                # items() works on Python 2 and 3; iteritems() is Python 2 only.
                for year, sim in time_sims.items():
                    simStr = "{year:d}, cosine similarity={sim:0.2f}".format(year=year,sim=sim)
                    resultFile.write(
                        str(stimWord) + " " + str(partner) + " " + simStr + "\n")

            resultFile.write("\n")
示例#3
0
def histword_similarity(target_word, attribute_word):
    """Print the similarity between two words for each loaded decade.

    Loads "embeddings/eng-fiction-all_sgns" for the years 1900, 1910, ...,
    1990 and prints one "<year>, cosine similarity=<value>" line per entry
    returned by get_time_sims().
    """
    fiction_embeddings = SequentialEmbedding.load(
        "embeddings/eng-fiction-all_sgns", range(1900, 2000, 10))
    time_sims = fiction_embeddings.get_time_sims(target_word, attribute_word)
    # items() works on Python 2 and 3; iteritems() was removed in Python 3.
    for year, sim in time_sims.items():
        print("{year:d}, cosine similarity={sim:0.2f}".format(year=year,
                                                              sim=sim))
示例#4
0
 def getSimScores(self, dataFile, resultFile):
     """Write similarity-over-time results for every parsed (stim, word) line.

     Loads "../embeddings/eng-all_sgns" for the years 1930, 1940, ..., 1980,
     looks up the lower-cased pair with get_time_sims() and hands the result
     to self.writeToOutput() together with the original-case words and
     ``resultFile``.  ``dataFile`` is not used here; the pairs are read from
     ``self.parsedLines``.
     """
     decades = range(1930, 1990, 10)
     embeddingSeries = SequentialEmbedding.load(
         "../embeddings/eng-all_sgns", decades)
     for parsedLine in self.parsedLines:
         stim = parsedLine[0]
         word = parsedLine[1]
         sims = embeddingSeries.get_time_sims(stim.lower(), word.lower())
         self.writeToOutput(sims, stim, word, resultFile)
def outputSimilaritiesAveraged():
    """Score stimulus words against partners using time-averaged embeddings.

    For every word list from parseTxtFile() with at least four entries
    (stimulus, high, low, incongruent), the sub-embeddings of each word are
    averaged with getAveragedEmbedding() over startYear..endYear and the
    partners are scored against the stimulus word with getSimScore().

    Side effects:
      * one getResultStr() line per pair, plus a blank line per stimulus
        word, is appended to "stimWordsIntervals/<startYear>_avg.txt";
      * a bar chart of the mean high/low/incongruent scores is saved to
        "stimWordsIntervals/<startYear>_avg.png" (skipped when no word list
        qualified).
    """
    wordPairsList = parseTxtFile()
    startYear, endYear = 1900, 1990
    embeddings = SequentialEmbedding.load("../embeddings/eng-all_sgns",
                                          range(startYear, endYear, 10))

    def averagedEmbed(word):
        # Average one word's sub-embeddings over the configured interval.
        return getAveragedEmbedding(embeddings.get_subembeds([word]),
                                    startYear, endYear)

    highVals, lowVals, inconVals = [], [], []
    with open("stimWordsIntervals/" + str(startYear) + "_avg.txt",
              "a") as resultFile:
        for wordList in wordPairsList:
            if len(wordList) < 4:
                continue
            stimWord = wordList[0]
            stimWordAvgEmbed = averagedEmbed(stimWord)

            # Entries 1..3 are the high, low and incongruent partners.
            for partner, vals in zip(wordList[1:4],
                                     (highVals, lowVals, inconVals)):
                sim = getSimScore(stimWordAvgEmbed, averagedEmbed(partner))
                vals.append(sim)
                resultFile.write(
                    getResultStr(stimWord, partner, startYear, endYear, sim))
            resultFile.write("\n")

    if not highVals:
        # No complete word list: there is nothing to average or plot.
        return

    barHeights = [sum(vals) / len(vals)
                  for vals in (highVals, lowVals, inconVals)]
    barNames = [str(startYear) + " high", str(startYear) + " low",
                str(startYear) + " incon"]
    # Select the figure BEFORE drawing so the bars land on the figure that
    # is saved, and draw once after the loop so bars are not stacked on
    # top of each other for every stimulus word.
    plt.figure(0)
    plt.bar(barNames, barHeights, color='blue')
    plt.savefig("stimWordsIntervals/" + str(startYear) + "_avg.png")
示例#6
0
def load_embeddings(filename=None):
    """Return the embeddings for ``filename``, loading them at most once.

    ``filename`` falls back to the module-level EMBEDDING when empty or
    None.  Results are memoised in EMBED_CACHE under ``filename``; the
    lookup and the load both happen while holding ``embed_lock``.  The
    years 1840, 1850, ..., 1990 are loaded.
    """
    if not filename:
        filename = EMBEDDING

    with embed_lock:
        # Single-argument print(...) produces the same output on Python 2
        # and Python 3, unlike the print statement.
        print("LOADING EMBEDDINGS %s" % filename)
        start = time.time()

        if filename in EMBED_CACHE:
            return EMBED_CACHE[filename]

        print("THIS MIGHT TAKE A WHILE...")

        embeddings = SequentialEmbedding.load(filename, range(1840, 2000, 10))
        print("LOAD EMBEDDINGS TOOK %s" % (time.time() - start))

        EMBED_CACHE[filename] = embeddings
        return embeddings
示例#7
0
def load_embeddings(filename=None, start_year=1840, end_year=2000, step=10):
    """Return the embeddings for a file and year range, loading them once.

    Args:
        filename: embedding location; falls back to the module-level
            EMBEDDING when empty or None.
        start_year, end_year, step: arguments of the ``range`` of years
            passed to SequentialEmbedding.load (``end_year`` is exclusive).

    Returns:
        The object returned by SequentialEmbedding.load, memoised in
        EMBED_CACHE.  The lookup and the load happen under ``embed_lock``.
    """
    if not filename:
        filename = EMBEDDING

    # Key on the whole request.  Keying on filename + str(step) alone
    # returned a cached object for a *different* year range, and string
    # concatenation can collide (e.g. "a1" + "0" vs "a" + "10").
    # NOTE(review): confirm nothing else reads EMBED_CACHE by a string key.
    cache_key = (filename, start_year, end_year, step)

    with embed_lock:
        print("LOADING EMBEDDINGS %s" % filename)
        start = time.time()

        if cache_key in EMBED_CACHE:
            return EMBED_CACHE[cache_key]

        print("THIS MIGHT TAKE A WHILE...")
        embeddings = SequentialEmbedding.load(
            filename, range(start_year, end_year, step))
        print("LOAD EMBEDDINGS TOOK %s" % (time.time() - start))

        EMBED_CACHE[cache_key] = embeddings
        return embeddings
示例#8
0
def load_embeddings(filename=None):
    """Fetch the embeddings for ``filename`` through EMBED_CACHE.

    An empty or missing ``filename`` is replaced by EMBEDDING.  While
    holding ``embed_lock`` the cache is consulted first; on a miss the
    years 1840, 1850, ..., 1990 are loaded, the elapsed time is printed and
    the result is stored in the cache before being returned.
    """
    filename = filename or EMBEDDING

    with embed_lock:
        print("LOADING EMBEDDINGS %s" % filename)
        began = time.time()

        if filename not in EMBED_CACHE:
            print("THIS MIGHT TAKE A WHILE...")

            loaded = SequentialEmbedding.load(
                filename, range(1840, 2000, 10))
            print("LOAD EMBEDDINGS TOOK %s" % (time.time() - began))

            EMBED_CACHE[filename] = loaded
            return loaded

        return EMBED_CACHE[filename]
示例#9
0
def evaluate_diachronic_accuracy(embedding_path, word_pairs_path, start_year,
                                 end_year, year_inc):
    """Return the share of word pairs with a significant similarity trend.

    Word pairs are read with ioutils.load_word_pairs(), embeddings are
    loaded for ``range(start_year, end_year + 1, year_inc)``, and for each
    pair compute_spear_corr() is applied to the pair's similarities over
    time.  Element 0 of its result is printed as the correlation and
    element 1 as the p-value.

    Returns:
        The fraction (float) of pairs whose p-value is <= 0.05.
    """
    word_pairs_1, word_pairs_2 = ioutils.load_word_pairs(word_pairs_path)
    embeddings = SequentialEmbedding.load(
        embedding_path, range(start_year, end_year + 1, year_inc))
    stat_sig_count = 0
    pairs_len = len(word_pairs_1)
    # Each print takes a single pre-built string so the output is identical
    # on Python 2 and Python 3 (the print statement is Python 2 only).
    print("Getting similarities for %s" % (word_pairs_1[0],))
    print("Correlation \t p-value")
    print("-----------------------------")
    for i in range(pairs_len):
        p1 = word_pairs_1[i]
        p2 = word_pairs_2[i]
        time_sims = embeddings.get_time_sims(p1, p2)
        spear_corr = compute_spear_corr(time_sims)
        print("{corr:0.7f}\t{p:0.7f}".format(corr=spear_corr[0],
                                             p=spear_corr[1]))
        if spear_corr[1] <= 0.05:
            stat_sig_count += 1
    # "* 1.0" forces float division on Python 2.
    return stat_sig_count * 1.0 / pairs_len
示例#10
0
def pltHistograms():
    """Save one similarity histogram per decade from 1800 through 1990.

    For every word list from parseTxtFile() with at least four entries
    (stimulus, high, low, incongruent), the per-year similarities of the
    stimulus word with each partner are collected.  For each decade a
    figure with three overlaid histograms (high = red, low = blue,
    incongruent = green) is saved to "histograms/<year>".
    """
    startDecade, endDecade = 1800, 2000
    years = list(range(startDecade, endDecade, 10))
    yearHigh = {year: [] for year in years}
    yearLow = {year: [] for year in years}
    yearIncon = {year: [] for year in years}

    real_embeddings = SequentialEmbedding.load("../embeddings/eng-all_sgns", years)
    # Parse the word list once; a second, discarded call was redundant.
    wordPairsList = parseTxtFile()
    for wordList in wordPairsList:
        if len(wordList) < 4:
            continue

        stimWord = wordList[0]
        # Entries 1..3 are the high, low and incongruent partners.
        partnerSims = [real_embeddings.get_time_sims(stimWord, partner)
                       for partner in wordList[1:4]]
        for bucket, time_sims in zip((yearHigh, yearLow, yearIncon),
                                     partnerSims):
            # items() works on Python 2 and 3; iteritems() is Python 2 only.
            for sim_year, sim in time_sims.items():
                bucket[sim_year].append(sim)

    # Walk the ordered list of years so figure numbers follow the decades
    # instead of depending on dict iteration order.
    for figCount, year in enumerate(years):
        plt.figure(figCount)
        plt.title("cosine sim histogram in year " + str(year))
        plt.xlabel("cosine sim scores")
        plt.ylabel("frequency of score")
        plt.hist(yearHigh[year], color='r')
        plt.hist(yearLow[year], color='b')
        plt.hist(yearIncon[year], color='g')
        plt.savefig("histograms/" + str(year))
示例#11
0
def evaluate_by_hw(target, ref, t, gold, embeddingfile):
    """Check whether the similarity trend of two words matches a gold sign.

    Embeddings for the years 1800, 1810, ..., 1990 are loaded from
    ``embeddingfile`` and the similarities of ``target`` and ``ref`` from
    the decade containing ``t`` up to 1990 are rank-correlated against the
    years with Spearman's rho.

    Args:
        target, ref: the two words to compare.
        t: a year; it is truncated to a multiple of ten to get the first
            decade considered.
        gold: expected direction of change; converted with int() and only
            its sign is used.
        embeddingfile: location handed to SequentialEmbedding.load.

    Returns:
        dict with keys "correct" (1 if sign(rho) == sign(gold), else 0),
        "sig" (1 if p <= 0.05, else 0), "p" and "corr" (rho).  When rho is
        NaN both "correct" and "sig" are NaN.
    """
    gold = int(gold)
    # Truncate t to the start of its decade (the original author notes this
    # mirrors an R script, get_correlations).
    offset = int(t / 10) * 10
    embeddings = SequentialEmbedding.load(embeddingfile, range(1800, 2000, 10))
    time_sims = embeddings.get_time_sims(target, ref)

    # Plain lists in chronological order: dict views are not accepted as
    # array input on Python 3, and `t` is no longer overwritten.
    decades = list(range(offset, 2000, 10))
    sims = [time_sims[y] for y in decades]
    rho, p = scipy.stats.spearmanr(decades, sims)

    if np.isnan(rho):
        correct = float('nan')
        sig = float('nan')
    else:
        sig = 1 if p <= 0.05 else 0
        correct = 1 if np.sign(rho) == np.sign(gold) else 0
    return {"correct": correct, 'sig': sig, "p": p, "corr": rho}
def outputSimilaritiesAveraged():
    """Append 1990 phrase-level similarity scores to "stimWordSimScores.txt".

    Each key of multipleContentPairMap is a space-separated stimulus phrase
    mapped to a (high, low, incongruent) triple.  Every partner word is
    scored against the phrase's words with phraseSimScores() on the 1990
    embedding; one line per partner is written in high, low, incongruent
    order, followed by a blank line.
    """
    real_embeddings = SequentialEmbedding.load("../embeddings/eng-all_sgns", [1990])
    lineTemplate = "{year:d}, cosine similarity={sim:0.2f}"

    with open("stimWordSimScores.txt", "a") as resultFile:
        for stimPhrase in multipleContentPairMap:
            stimWords = stimPhrase.split(' ')
            high, low, incon = multipleContentPairMap[stimPhrase]

            # Score all three partners first, then format and write them.
            scored = [(partner,
                       phraseSimScores(stimWords, partner, real_embeddings))
                      for partner in (high, low, incon)]

            for partner, score in scored:
                simStr = lineTemplate.format(year=1990, sim=float(score))
                resultFile.write(
                    str(stimPhrase) + " " + str(partner) + " " + simStr + "\n")
            resultFile.write("\n")
示例#13
0
def _transformSimSeries(time_sims, averaged, contextRelevant, denomMap, triplet):
    """Turn a year -> similarity mapping into the list of values to plot."""
    vals = []
    # items() works on Python 2 and 3; iteritems() is Python 2 only.
    for sim_year, sim in time_sims.items():
        if averaged:
            # Running mean over the values emitted so far (which are
            # themselves running means, not the raw similarities).
            vals.append((sum(vals) + sim) / (len(vals) + 1))
        elif contextRelevant:
            denom = denomMap[triplet][sim_year]
            # A zero denominator maps to 0.0 instead of dividing.
            vals.append(sim / denom if denom != 0 else 0.0)
        else:
            vals.append(sim)
    return vals


def plotIndividualLines(startDecade, endDecade, averaged=False, contextRelevant=False):
    """Save one line graph per stimulus word with its three partner curves.

    For every word list from parseTxtFile() with at least four entries
    (stimulus, high, low, incongruent), the similarities over
    ``range(startDecade, endDecade, 10)`` are plotted: high in green, low
    in red, incongruent in magenta.

    Args:
        startDecade, endDecade: year range (end exclusive, step 10).
        averaged: plot a running mean instead of the raw values; takes
            precedence over ``contextRelevant``.
        contextRelevant: divide each value by the per-year denominator from
            getContextSimScoreDenom() (0.0 when that denominator is 0).

    The figure is saved under "averagedLineGraphs/",
    "contextRelevantLineGraphs/" or "individualLineGraphs/" as
    "<stim>_<high>_<low>_<incongruent>" and then closed.
    """
    wordPairsList = parseTxtFile()
    years = list(range(startDecade, endDecade, 10))
    real_embeddings = SequentialEmbedding.load("../embeddings/eng-all_sgns", range(startDecade, endDecade, 10))
    stimWordDenomMap = {}
    if contextRelevant == True:
        stimWordDenomMap = getContextSimScoreDenom(startDecade, endDecade, wordPairsList, real_embeddings)

    if averaged:
        outDir = "averagedLineGraphs/"
    elif contextRelevant:
        outDir = "contextRelevantLineGraphs/"
    else:
        outDir = "individualLineGraphs/"

    # NOTE(review): these accumulators are filled but never read in the
    # visible code; kept in case the function continues elsewhere.
    allHighVals, allLowVals, allInconVals = [], [], []
    for wordList in wordPairsList:
        if len(wordList) < 4:
            continue
        stimWord = wordList[0]
        high, low, incongruent = wordList[1], wordList[2], wordList[3]
        time_sim_high = real_embeddings.get_time_sims(stimWord, high)
        time_sim_low = real_embeddings.get_time_sims(stimWord, low)
        time_sim_incon = real_embeddings.get_time_sims(stimWord, incongruent)
        # Key into stimWordDenomMap; only consulted when contextRelevant.
        triplet = (stimWord, high, low, incongruent)

        highVals = _transformSimSeries(time_sim_high, averaged, contextRelevant,
                                       stimWordDenomMap, triplet)
        lowVals = _transformSimSeries(time_sim_low, averaged, contextRelevant,
                                      stimWordDenomMap, triplet)
        inconVals = _transformSimSeries(time_sim_incon, averaged, contextRelevant,
                                        stimWordDenomMap, triplet)

        plt.figure()
        plt.plot(years, highVals, 'g')
        plt.plot(years, lowVals, 'r')
        plt.plot(years, inconVals, 'm')
        # The title no longer ends in a dangling ", " before ")".
        plt.title(stimWord + " (" + high + ", " + low + ", " + incongruent + ")")
        plt.savefig(outDir + stimWord + "_" + high + "_" + low + "_" + incongruent)
        plt.close()

        allHighVals.append(highVals)
        allLowVals.append(lowVals)
        allInconVals.append(inconVals)
示例#14
0
from representations.sequentialembedding import SequentialEmbedding

"""
Example showing how to load a series of historical embeddings and compute similarities over time.
Warning that loading all the embeddings into main memory can take a lot of RAM
"""

if __name__ == "__main__":
    fiction_embeddings = SequentialEmbedding.load("embeddings/eng-fiction-all_sgns", range(1950, 2000, 10))
    time_sims = fiction_embeddings.get_time_sims("lesbian", "gay")
    # Single-argument print(...) and items() behave the same on Python 2 and
    # Python 3; the print statement and iteritems() are Python 2 only.
    print("Similarity between gay and lesbian drastically increases from 1950s to the 1990s:")
    for year, sim in time_sims.items():
        print("{year:d}, cosine similarity={sim:0.2f}".format(year=year,sim=sim))
#!/usr/bin/env python

# Constructs a list of neighbor words for each target term.

from representations.sequentialembedding import SequentialEmbedding
import pandas as pd
import numpy as np

# Presumably the name of the CSV file the neighbor lists are written to;
# it is not used in the visible part of the script -- TODO confirm.
save_as = "lingua_technica_neighbors.csv"
# Location handed to SequentialEmbedding.load below.
embedding_path = "embeddings/all-eng"

print("loading embeddings...")
# Load the embeddings for the years 1850, 1860, ..., 1990.
# NOTE(review): this starts at 1850 while start_year below is 1900 --
# confirm whether the earlier decades are needed.
embeddings = SequentialEmbedding.load(embedding_path, range(1850, 2000, 10))

# The set of target words to query neighbors for
targets = [
       "motor",
       "engine",
       "computer",
       "laser",
       "rocket",
       "radar",
       "microwave"
]

# The number of neighbors to query for each word
n = 100
# The start year for the range of decades to query
start_year = 1900
# The end year for the range of decades to query
end_year = 2000
示例#16
0
文件: example.py 项目: cheapmon/wot
from representations.sequentialembedding import SequentialEmbedding
"""
Example showing how to load a series of historical embeddings and compute similarities over time.
Warning that loading all the embeddings into main memory can take a lot of RAM
"""

if __name__ == "__main__":
    # Years 1840, 1850, ..., 1990 of the fiction embeddings.
    decades = list(range(1840, 2000, 10))
    fiction_embeddings = SequentialEmbedding.load(
        "embeddings/eng-fiction-all_sgns", decades)
    line_template = "{year:d}, cosine similarity={sim:0.2f}"
    # One output line per (year, similarity) entry for "happy" vs "gay".
    for year, sim in fiction_embeddings.get_time_sims("happy", "gay").items():
        print(line_template.format(year=year, sim=sim))
                          "human body": ["leg", "chest", "gyms"],
                          "place live": ["apartment", "tent", "domino"],
                          "four footed animal": ["cat", "pig", "bus"],
                          "part tree": ["branch", "twig", "plaza"],
                          "room house": ["bedroom", "attic", "brooms"],
                          "kitchen tool": ["fork", "mixer", "text"],
                          "four footed animal": ["dog", "mouse", "pink"],
                          "part bicycle": ["wheel", "lock", "store"],
                          "musical instrument": ["flute", "harp", "snap"],
                          "type bread": ["white", "french", "ruins"],
                          "room house": ["bathroom", "study", "zombie"],
                          "farm animal": ["pig", "mule", "pipe"],
                          "thing read": ["newspaper", "letter", "vulture"],
                          "carpenter tool": ["hammer", "ruler", "box"]}

# One separately loaded single-year embedding for each of 1940, 1950, ...,
# 1990, keyed by that year.
yearEmbeddingMap = {
    decade: SequentialEmbedding.load("../embeddings/eng-all_sgns", [decade])
    for decade in range(1940, 2000, 10)
}

# returns list of similarity scores for phrases from 1940 - 1990
#def phraseSimScoresOverTime(stimWords, word):


def phraseSimScores(stimWords, word, real_embeddings):
    stimWordVectors = []
    for sWord in stimWords:
        sEmbed = real_embeddings.get_subembeds([sWord])
        try:
示例#18
0
def get_sim_neighbors_svd(corpus, target_word1, target_word2, year1, year2, n, results_dir):
    """Compare two settings through the neighbours of the target word(s).

    Exactly one of two modes must apply:
      * two different years and one target word (target_word1 ==
        target_word2): the word's ``n`` nearest neighbours in year1 and
        year2 are compared;
      * the same year and two different target words: the two words'
        ``n`` nearest neighbours in that year are compared.

    In both modes a second-order vector is built for each side over the
    union of the neighbour sets and the cosine between the two vectors is
    computed and printed.

    Side effects:
      * the paired neighbour words are written (overwriting) to
        ``<results_dir>/neighbors/<w1>-<w2>-<y1>-<y2>.tsv``;
      * the row ``<w1>-<w2>\\t<y1>-<y2>\\t<cos>`` is appended to the
        ``cosines-...-n-<n>.tsv`` file in ``results_dir`` unless it is
        already present there.

    Raises:
        ValueError: if the arguments match neither mode.
    """
    across_years = (year1 != year2) and (target_word1 == target_word2)
    across_words = (year1 == year2) and (target_word1 != target_word2)
    if not (across_years or across_words):
        # Without this check the code fell through both branches and
        # failed later with a NameError on vec1.
        raise ValueError(
            'expected either one target word with two different years, or '
            'two different target words with the same year')

    # os.path.join is used for every path so that the file checked for
    # existing results is the same file that is appended to, with or
    # without a trailing separator on results_dir.
    neighbors_dir = os.path.join(results_dir, 'neighbors')
    if not os.path.isdir(neighbors_dir):
        os.mkdir(neighbors_dir)

    pair_label = target_word1+'-'+target_word2
    years_label = str(year1)+'-'+str(year2)
    words_path = os.path.join(neighbors_dir, pair_label+'-'+years_label+'.tsv')

    if across_years:
        results_cosine = 'cosines-'+target_word1+'-n-'+str(n)+'.tsv'
        embedds = SequentialSVDEmbedding.load(corpus, range(year1, year2+10, 10))
        embedd_year1 = embedds.get_embed(year1)
        embedd_year2 = embedds.get_embed(year2)

        neighbors_year1 = get_nearest_neighbors(embedd_year1, target_word1, n)
        neighbors_year2 = get_nearest_neighbors(embedd_year2, target_word1, n)

        union = get_union(neighbors_year1, neighbors_year2)

        filtered_union = filter_union(union, embedd_year1, embedd_year2, target_word1)
        vec1 = get_second_order_vector(embedd_year1, filtered_union, target_word1)
        vec2 = get_second_order_vector(embedd_year2, filtered_union, target_word1)

        neighbor_words1 = get_nearest_neighbor_words(neighbors_year1)
        neighbor_words2 = get_nearest_neighbor_words(neighbors_year2)

    else:
        results_cosine = 'cosines-'+target_word1+'-'+target_word2+'-n-'+str(n)+'.tsv'

        # NOTE(review): this branch loads SequentialEmbedding while the
        # other one loads SequentialSVDEmbedding -- confirm that the
        # difference is intended in a function named *_svd.
        embedds = SequentialEmbedding.load(corpus, range(year1, year2+10, 10))
        embedd_year = embedds.get_embed(year1)

        neighbors_word1 = get_nearest_neighbors(embedd_year, target_word1, n)
        neighbors_word2 = get_nearest_neighbors(embedd_year, target_word2, n)

        union = get_union(neighbors_word1, neighbors_word2)

        vec1 = get_second_order_vector(embedd_year, union, target_word1)
        vec2 = get_second_order_vector(embedd_year, union, target_word2)

        neighbor_words1 = get_nearest_neighbor_words(neighbors_word1)
        neighbor_words2 = get_nearest_neighbor_words(neighbors_word2)

    cos = get_cosine(vec1, vec2)

    cosine_path = os.path.join(results_dir, results_cosine)
    if os.path.isfile(cosine_path):
        print('file exists')
        with open(cosine_path) as infile:
            existing_results = infile.read().split('\n')
    else:
        existing_results = []

    with open(words_path, 'w') as outfile1:
        for word1, word2 in zip(neighbor_words1, neighbor_words2):
            outfile1.write(word1+'\t'+word2+'\n')

    result = pair_label+'\t'+years_label+'\t'+str(cos)+'\n'
    with open(cosine_path, 'a') as outfile2:
        if result.strip() in existing_results:
            print('result already there')
        else:
            outfile2.write(result)
    print(cos)
示例#19
0
def outputAndPlotSimilaritiesRange():
    """Append running long/short-range similarity averages to a text file.

    For each start year 1900, 1910, ..., 1980 the embeddings from that year
    through 1990 are loaded.  For every word list from parseTxtFile() with
    at least four entries (stimulus, high, low, incongruent) the three
    similarity series are reduced with computeAvgs() to a (long, short)
    pair.  After each word list, the means of all values gathered so far
    for that start year are appended to
    "stimWordsIntervals/rangeSimScores.txt".
    """
    def _mean(values):
        return sum(values) / len(values)

    wordPairsList = parseTxtFile()
    for year in range(1900, 1990, 10):
        # Index 0 = high, 1 = low, 2 = incongruent.
        longRangeVals = [[], [], []]
        shortRangeVals = [[], [], []]
        real_embeddings = SequentialEmbedding.load(
            "../embeddings/eng-all_sgns", range(year, 2000, 10))
        for wordList in wordPairsList:
            if len(wordList) < 4:
                continue
            stimWord = wordList[0]
            # Look up all three series before reducing any of them.
            simSeries = [real_embeddings.get_time_sims(stimWord, partner)
                         for partner in wordList[1:4]]
            for idx, series in enumerate(simSeries):
                longAvg, shortAvg = computeAvgs(series)
                longRangeVals[idx].append(longAvg)
                shortRangeVals[idx].append(shortAvg)

            highLong, lowLong, inconLong = [_mean(v) for v in longRangeVals]
            highShort, lowShort, inconShort = [_mean(v) for v in shortRangeVals]

            yearStr = str(year)
            reportLines = [
                yearStr + ", 1990 high average: " + str(highLong) + "\n",
                yearStr + ", 1990 low average: " + str(lowLong) + "\n",
                yearStr + ", 1990 incongruent average: " + str(inconLong) + "\n",
                "1980, 1990 high average: " + str(highShort) + "\n",
                "1980, 1990 low average: " + str(lowShort) + "\n",
                "1980, 1990 incongruent average " + str(inconShort) + "\n",
                "\n",
            ]
            with open("stimWordsIntervals/rangeSimScores.txt", "a") as f:
                f.write("".join(reportLines))

            #barHeights = [avg_high_long, avg_low_long, avg_incon_long,
            #              avg_high_short, avg_low_short, avg_incon_short]

            #plt.figure()
            #plt.bar(barNames, barHeights)
            #save_str = "stimWordsIntervals/" + str(year) + "_range"
            #plt.savefig(save_str)
            #figureCount += 1


#outputAndPlotSimilaritiesRange()
示例#20
0
# clics3_dict = np.load("clics_1_languages.npy",allow_pickle = True)
# clics3_dict = clics3_dict[0]

# non_word counts:
# 891 total
# 532 remove bracket
# 427 remove OR (105 words are first word in "OR" clause)

# transfer the clics3 dictionary to concepts expressed in only 1 english word.
# transferable, filtered = transfer_to_one_word(clics3_dict)
# count is for the redundant word
# dict_one_word,count = dict_to_one_word(clics3_dict, transferable)

# np.save("clics_2_dict",[dict_one_word],allow_pickle=True)
# Load the histwords embeddings for the years 1890, 1900, ..., 1990.
embds = SequentialEmbedding.load("embeddings/eng-all_sgns/sgns",
                                 range(1890, 2000, 10))
# First and last year of the loaded range.
year_s = 1890
year_e = 1990

# rate = calcualte_overlap_rate(embds,dict_one_word,1990,n=100)

# clics_2_languages
#1990 rate mean = 0.18535519972
#1890 rate mean = 0.21535519972

#clics_1_languages
# 1990 rate mean = 0.08764466776061722
# 1890 rate mean = 0.09757857460610882

#----------------7.23.2020-----------------------------------
# untransfered word list
def makeLineGraphs(startDecade, endDecade):
    """Show all per-pair similarity curves over time plus their averages.

    For every word list from parseTxtFile() with at least four entries
    (stimulus, high, low, incongruent), the similarities over
    ``range(startDecade, endDecade, 10)`` are plotted: high pairs in green,
    low pairs in red, incongruent pairs in magenta.  The per-year average
    of each group is drawn on top as a thick black line and the figure is
    displayed with plt.show().

    If no word list qualifies, a message is printed and nothing is
    plotted.
    """
    wordPairsList = parseTxtFile()
    real_embeddings = SequentialEmbedding.load("../embeddings/eng-all_sgns", range(startDecade, endDecade, 10))
    highPairs, lowPairs, inconPairs = {}, {}, {}

    def writeToFileFromDict(startDecade, pairsDictList, stim, word, fileName):
        """Append one "<stim>, <word> <year> <score>" line per score."""
        with open(fileName, "a") as f:
            currentYear = startDecade
            for score in pairsDictList:
                writeStr = stim + ", " + word + " " + str(currentYear) + " "  + str(score) + "\n"
                f.write(writeStr)
                currentYear += 10
            f.write("\n")

    for i, wordList in enumerate(wordPairsList):
        if len(wordList) < 4:
            continue
        stimWord = wordList[0]
        high, low, incongruent = wordList[1], wordList[2], wordList[3]
        time_sim_high = real_embeddings.get_time_sims(stimWord, high)
        time_sim_low = real_embeddings.get_time_sims(stimWord, low)
        time_sim_incon = real_embeddings.get_time_sims(stimWord, incongruent)

        # values() works on Python 2 and 3; iteritems() is Python 2 only.
        highPairs[i] = list(time_sim_high.values())
        lowPairs[i] = list(time_sim_low.values())
        inconPairs[i] = list(time_sim_incon.values())

        # dump data to a text file
        # leave this commented out if you already have data files generated
        #writeToFileFromDict(startDecade, highPairs[i], stimWord, high, "highPairsCosSimScores.txt")
        #writeToFileFromDict(startDecade, lowPairs[i], stimWord, low, "lowPairsCosSimScores.txt")
        #writeToFileFromDict(startDecade, inconPairs[i], stimWord, incongruent, "incongruentPairsCosSimScores.txt")

    if not highPairs:
        # Averaging would divide by zero when no word list has four entries.
        print("makeLineGraphs: no complete word lists, nothing to plot")
        return

    mpl.style.use('default')

    years = list(range(startDecade, endDecade, 10))

    def yearlyAverages(pairs):
        # Mean across all pairs, computed separately for each year index.
        return [sum(series[idx] for series in pairs.values()) / len(pairs)
                for idx in range(len(years))]

    highAvgs = yearlyAverages(highPairs)
    lowAvgs = yearlyAverages(lowPairs)
    inconAvgs = yearlyAverages(inconPairs)

    for series in highPairs.values():
        plt.plot(years, series, 'g')
    for series in lowPairs.values():
        plt.plot(years, series, 'r')
    for series in inconPairs.values():
        plt.plot(years, series, 'm')

    plt.plot(years, highAvgs, 'k', linewidth = 2.0)
    plt.plot(years, lowAvgs, 'k', linewidth=2.0)
    plt.plot(years, inconAvgs, 'k', linewidth=2.0)

    plt.title("Changes in similarity scores over time (extended)")
    plt.xlabel("Years")
    plt.ylabel("Similarity Scores")
    plt.show()
示例#22
0
def outputAndPlotSimilarities():
    """Save one bar chart per decade comparing average similarity scores.

    Rows come from ``parseTxtFile()``.  Every row with at least four entries
    is read as ``[stimWord, high, low, incongruent, ...]``; shorter rows are
    skipped.

    A baseline is computed once from the embeddings loaded for
    ``[1980, 1990]``.  Then, for each decade from 1900 to 1980, the
    embeddings for ``[decade, 1990]`` are loaded, the three similarity
    series of every row are handed to ``writeToFile`` and pooled, and a
    six-bar chart (decade high/low/incon averages followed by the baseline
    averages) is printed and saved as ``stimWordsIntervals/<decade>``.

    Each average pools every value returned by ``get_time_sims`` for the
    loaded embedding (presumably one value per loaded year -- confirm
    against ``SequentialEmbedding``).

    NOTE(review): the baseline bars are labelled "1990 ..." although they
    average the values obtained from the ``[1980, 1990]`` embedding --
    confirm that this labelling is intended.

    If no row has at least four entries, a notice is printed and nothing is
    loaded or plotted (previously this ended in ``ZeroDivisionError``).
    """
    embeddingsPath = "../embeddings/eng-all_sgns"

    # Filter once so both passes below iterate over the same usable rows.
    wordRows = [row for row in parseTxtFile() if len(row) >= 4]
    if not wordRows:
        print("No word list with at least 4 entries; nothing to plot.")
        return

    # Baseline: similarities from the 1980/1990 embeddings.
    base_embedding = SequentialEmbedding.load(embeddingsPath, [1980, 1990])
    highVal8090, lowVal8090, inconVal8090 = [], [], []
    for wordList in wordRows:
        stimWord, high, low, incongruent = wordList[:4]
        highVal8090 += _simValues(base_embedding.get_time_sims(stimWord, high))
        lowVal8090 += _simValues(base_embedding.get_time_sims(stimWord, low))
        inconVal8090 += _simValues(
            base_embedding.get_time_sims(stimWord, incongruent))

    # The baseline does not depend on the decade, so average it only once.
    baselineAvgs = [
        _mean(highVal8090), _mean(lowVal8090), _mean(inconVal8090)
    ]

    # Decades 1900, 1910, ..., 1980; the figure number is the loop index.
    for figureCount, year in enumerate(range(1900, 1990, 10)):
        real_embeddings = SequentialEmbedding.load(embeddingsPath,
                                                   [year, 1990])
        yearHighVals, yearLowVals, yearInconVals = [], [], []
        for wordList in wordRows:
            stimWord, high, low, incongruent = wordList[:4]
            time_sim_high = real_embeddings.get_time_sims(stimWord, high)
            time_sim_low = real_embeddings.get_time_sims(stimWord, low)
            time_sim_incon = real_embeddings.get_time_sims(
                stimWord, incongruent)

            yearHighVals += _simValues(time_sim_high)
            yearLowVals += _simValues(time_sim_low)
            yearInconVals += _simValues(time_sim_incon)

            writeToFile(time_sim_high, year, stimWord, high)
            writeToFile(time_sim_low, year, stimWord, low)
            writeToFile(time_sim_incon, year, stimWord, incongruent)

        barNames = (str(year) + " high", str(year) + " low",
                    str(year) + " incon", "1990 high", "1990 low",
                    "1990 incon")
        barHeights = [
            _mean(yearHighVals), _mean(yearLowVals), _mean(yearInconVals)
        ] + baselineAvgs

        print(barHeights)
        plt.figure(figureCount)
        plt.bar(barNames, barHeights)
        plt.savefig("stimWordsIntervals/" + str(year))


def _simValues(time_sims):
    """Return the similarity values of a year -> similarity mapping as a list.

    Iterates with ``items()`` instead of the Python 2-only ``iteritems()``
    so the code also runs on Python 3.
    """
    return [sim for _, sim in time_sims.items()]


def _mean(values):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    return sum(values) / len(values)
# Map every EN word to the list of CN words found in its rows of `df`
# (`df` is defined earlier in the script and is grouped by its "EN" column).
from_EN_to_CN = {
    en_word: list(rows["CN"].values) for en_word, rows in df.groupby("EN")
}

print(from_EN_to_CN)
"""
output
{'Independence': ['个性', '自己', '独特'], 'Interdependence': ['屈服', '共享', '社区', '和谐'], 'Need': ['需要', '必要', '前提'], 'Status & Achievement': ['级别', '地位', '声望', '声誉']}
"""
"""
histwords
"""

from itertools import combinations, product

from representations.sequentialembedding import SequentialEmbedding

# Embeddings for the decades 1950, 1960, 1970, 1980 and 1990.
ch_embeddings = SequentialEmbedding.load("embeddings/CN",
                                         range(1950, 2000, 10))

# Rows of [pair label, year, similarity] for every cross-group CN word pair.
data = []

# Take each unordered pair of EN words, then every CN word of the first
# combined with every CN word of the second.
for en_a, en_b in combinations(from_EN_to_CN, 2):
    for cn_a, cn_b in product(from_EN_to_CN[en_a], from_EN_to_CN[en_b]):
        pair_label = "%s-%s: %s-%s" % (en_a, en_b, cn_a, cn_b)
        sims_by_year = ch_embeddings.get_time_sims(cn_a, cn_b)
        data.extend(
            [pair_label, year, sim] for year, sim in sims_by_year.items())
示例#24
0
        if neighbour != word and fiction_embeddings.get_sim(
                1990, word, neighbour) > 0.25:
            neighbours_list.append(neighbour)
            for neighbour_of_neighbour in fiction_embeddings.get_seq_closest(
                    neighbour, 1990):
                if neighbour_of_neighbour not in neighbours_list and \
                                fiction_embeddings.get_sim(1990, neighbour, neighbour_of_neighbour) > 0.25 and \
                                neighbour_of_neighbour != word:
                    neighbours_list.append(neighbour_of_neighbour)
    if len(neighbours_list):
        print(word + ' ' + str(len(neighbours_list)))
        dict_neighbours_1990[word] = neighbours_list


if __name__ == "__main__":
    # With a step of 90 the range yields exactly two years: 1900 and 1990.
    # `fiction_embeddings` stays a module-level name because the worker code
    # above reads it as a global.
    fiction_embeddings = SequentialEmbedding.load(
        "embeddings/eng-fiction-all_sgns",
        range(1900, 2000, 90),
    )

    # Sorted 1900 vocabulary, loaded from its pickle file.
    raw_vocab_1900 = util.load_pickle(
        'embeddings/eng-fiction-all_sgns/1900-vocab.pkl')
    vocab1900 = sorted(raw_vocab_1900)

    # Disabled alternative entry point:
    # construct_neighbours_list(vocab1900)
    '''
    for word in tqdm(vocab1900[5000:]):
        list1900 = []
        for neighbour in fiction_embeddings.get_seq_closest(word, 1900):
            if neighbour != word and fiction_embeddings.get_sim(1900, word, neighbour) > 0.4:
                list1900.append(neighbour)
        if len(list1900):
            dict_neighbours_1900[word] = list1900
    '''

    # Run `res` over vocabulary entries 5000..99964 using a pool created
    # with 8 workers (presumably threads -- confirm which ThreadPool is
    # imported).
    target_words = vocab1900[5000:99965]
    workers = ThreadPool(8)
    workers.map(res, target_words)
    workers.close()
示例#25
0
from representations.sequentialembedding import SequentialEmbedding
"""
Example showing how to load a series of historical embeddings and compute similarities over time.
Warning that loading all the embeddings into main memory can take a lot of RAM
"""

if __name__ == "__main__":
    # Earlier variant of this example, kept for reference: it loaded the
    # decades 1950-1990 in one go and printed per-year results.
    # fiction_embeddings = SequentialEmbedding.load("embeddings/eng-fiction-all_sgns", range(1950, 2000, 10))
    # fiction_embeddings = SequentialEmbedding.load("eng-all_sgns", range(1950, 2000, 10))
    # # time_sims = fiction_embeddings.get_time_sims("lesbian", "gay")
    # time_sims = fiction_embeddings.get_seq_neighbour_set("gay",n=2)
    # print ("Similarity between gay and lesbian drastically increases from 1950s to the 1990s:")
    # for year, sim in iter(time_sims.items()):
    #     print ("{year:d}, cosine similarity={sim:0.2f}".format(year=year,sim=sim))

    # Visit every 20th year from 1800 through 1980.
    for year in range(1800, 2000, 20):
        # range(year, year + 20, 20) contains only `year`, so a single
        # year's embedding is loaded per iteration.
        fiction_embeddings = SequentialEmbedding.load(
            "eng-all_sgns", range(year, year + 20, 20))
        # fiction_embeddings = SequentialEmbedding.load("fre-all", range(year, year+20, 20))
        # NOTE(review): despite its name, `time_sims` holds the result of
        # get_seq_neighbour_set -- presumably the n=5 nearest neighbours of
        # "awful" for the loaded year; confirm against SequentialEmbedding.
        time_sims = fiction_embeddings.get_seq_neighbour_set("awful", n=5)
        print(year)
        print(time_sims)