def syntacticWordRelations(wordIndexMap, embeddings, maxWords=10):
    """Score embeddings on the syntactic word-relations benchmark.

    Every benchmark row holds four words ``(word0, word1, word2, word3)``.
    A row is evaluated only when all four words are keys of
    ``wordIndexMap``.  It counts as a success unless one of the first
    ``maxWords`` entries of ``embeddings`` has a cosine similarity to
    ``word2`` that lies strictly closer to ``similarity(word0, word1)``
    than ``similarity(word2, word3)`` does.

    The rows are read once from
    ``res/Syntactic-Word-Relations/questions-words.txt`` and cached in the
    module-level ``syntacticWordData``.

    Args:
        wordIndexMap: mapping from word to its index in ``embeddings``.
        embeddings: indexable and sliceable collection of word vectors.
        maxWords: how many leading embeddings compete against ``word3``.

    Returns:
        The fraction of evaluated rows that succeeded, or ``numpy.nan``
        when no row could be evaluated.

    Raises:
        IOError/OSError: if the benchmark file cannot be read.
    """
    global syntacticWordData
    if syntacticWordData is None:
        syntWordRelFilePath = "res/Syntactic-Word-Relations/questions-words.txt"
        # Build into a local list and publish it only after a complete read,
        # so a failed load is retried instead of being cached as "no data".
        relations = []
        with open(syntWordRelFilePath, "r") as relationsFile:
            for line in relationsFile:
                if line.startswith(":"):
                    continue
                words = line.lower().split()
                # Blank or malformed rows are skipped rather than crashing
                # the four-way unpacking further down.
                if len(words) != 4:
                    continue
                relations.append(tuple(words))
        syntacticWordData = relations

    scores = []
    for word0, word1, word2, word3 in syntacticWordData:
        if not all(word in wordIndexMap for word in (word0, word1, word2, word3)):
            continue

        word0Embedding = embeddings[wordIndexMap[word0]]
        word1Embedding = embeddings[wordIndexMap[word1]]
        word2Embedding = embeddings[wordIndexMap[word2]]
        word3Embedding = embeddings[wordIndexMap[word3]]

        similarity01 = vectors.cosineSimilarity(word0Embedding, word1Embedding)
        similarity23 = vectors.cosineSimilarity(word2Embedding, word3Embedding)
        minSimilarityDelta = abs(similarity01 - similarity23)

        # any() stops at the first candidate that beats word3, exactly like
        # the explicit loop-with-break it replaces.
        beaten = any(
            abs(similarity01 - vectors.cosineSimilarity(word2Embedding, embedding)) < minSimilarityDelta
            for embedding in embeddings[:maxWords]
        )
        scores.append(not beaten)

    if not scores:
        return numpy.nan

    return float(sum(scores)) / len(scores)
def syntacticWordRelations(wordIndexMap, embeddings, maxWords=10):
    """Compute the syntactic word-relations metric for a set of embeddings.

    The benchmark file lists rows of four words.  For each row whose words
    are all present in ``wordIndexMap`` the function compares
    ``cos(word0, word1)`` with ``cos(word2, word3)``.  The row scores 1
    unless some vector among ``embeddings[:maxWords]`` yields a similarity
    to ``word2`` that is strictly closer to ``cos(word0, word1)``.

    Rows are loaded lazily from
    ``res/Syntactic-Word-Relations/questions-words.txt`` and kept in the
    module-level ``syntacticWordData`` for later calls.

    Args:
        wordIndexMap: mapping from word to its index in ``embeddings``.
        embeddings: indexable and sliceable collection of word vectors.
        maxWords: number of leading embeddings used as competitors.

    Returns:
        Mean row score as a float, or ``numpy.nan`` if no row was usable.

    Raises:
        IOError/OSError: if the benchmark file cannot be read.
    """
    global syntacticWordData
    if syntacticWordData is None:
        syntWordRelFilePath = 'res/Syntactic-Word-Relations/questions-words.txt'
        loaded = []
        # The handle is not called ``file`` so the builtin is not shadowed.
        with open(syntWordRelFilePath, 'r') as handle:
            for rawLine in handle:
                if rawLine.startswith(':'):
                    continue
                tokens = rawLine.lower().split()
                if len(tokens) == 4:
                    loaded.append(tuple(tokens))
                # Rows without exactly four words (e.g. blank lines) are
                # ignored; they used to abort the whole load.
        # Assign the cache last: an exception above leaves it as None so the
        # next call retries instead of silently scoring against no data.
        syntacticWordData = loaded

    scores = []
    for quadruple in syntacticWordData:
        if any(word not in wordIndexMap for word in quadruple):
            continue

        word0Embedding, word1Embedding, word2Embedding, word3Embedding = [
            embeddings[wordIndexMap[word]] for word in quadruple
        ]

        similarity01 = vectors.cosineSimilarity(word0Embedding, word1Embedding)
        similarity23 = vectors.cosineSimilarity(word2Embedding, word3Embedding)
        minSimilarityDelta = abs(similarity01 - similarity23)

        score = True
        for embedding in embeddings[:maxWords]:
            similarity2N = vectors.cosineSimilarity(word2Embedding, embedding)
            if abs(similarity01 - similarity2N) < minSimilarityDelta:
                # A competing vector fits the relation better than word3.
                score = False
                break
        scores.append(score)

    if not scores:
        return numpy.nan

    return float(sum(scores)) / len(scores)
def simLex999(wordIndexMap, embeddings):
    """Score embeddings against the SimLex-999 word-pair ratings.

    For every benchmark pair whose two words are keys of ``wordIndexMap``
    the cosine similarity of their embeddings is computed.  The result is
    the mean of the Pearson and Spearman correlation coefficients between
    those similarities and the ``SimLex999`` column of the benchmark.

    The benchmark is read once from ``res/SimLex-999/SimLex-999.txt``
    (tab-separated) and cached in the module-level ``simLex999Data``.

    Args:
        wordIndexMap: mapping from word to its index in ``embeddings``.
        embeddings: indexable collection of word vectors.

    Returns:
        Mean of the two correlation coefficients, or ``numpy.nan`` when
        fewer than two benchmark pairs are covered by ``wordIndexMap``.
    """
    global simLex999Data
    if simLex999Data is None:
        simLex999FilePath = 'res/SimLex-999/SimLex-999.txt'
        data = pandas.read_csv(simLex999FilePath, sep='\t')
        # Assigned in one step after a successful read, so a failed load is
        # not cached as an empty benchmark.
        simLex999Data = list(zip(data['word1'], data['word2'], data['SimLex999']))

    targetScores = []
    scores = []
    for word0, word1, targetScore in simLex999Data:
        if word0 not in wordIndexMap or word1 not in wordIndexMap:
            continue
        word0Embedding = embeddings[wordIndexMap[word0]]
        word1Embedding = embeddings[wordIndexMap[word1]]
        scores.append(vectors.cosineSimilarity(word0Embedding, word1Embedding))
        targetScores.append(targetScore)

    # A correlation needs at least two samples; scipy.stats.pearsonr rejects
    # shorter input, so a single covered pair is reported as "no result".
    if len(scores) < 2:
        return numpy.nan

    # The second value returned by both functions is a p-value; it is unused.
    pearson, _ = scipy.stats.pearsonr(scores, targetScores)
    spearman, _ = scipy.stats.spearmanr(scores, targetScores)

    return numpy.mean([pearson, spearman])
def buildEmbeddingsTree(indexMap, embeddings, comparator=None):
    """Cluster the embeddings hierarchically and show the dendrogram.

    A pairwise dissimilarity matrix is computed with ``comparator`` (the
    diagonal is fixed to 0), normalised by its largest entry, condensed
    with ``squareform`` and passed to ``linkage``.  The resulting tree is
    drawn with matplotlib and displayed via ``plt.show()``.

    Args:
        indexMap: mapping whose keys are names/paths; the part after the
            last ``'/'`` of each key is used as a leaf label.
        embeddings: indexable collection of vectors, one per leaf.
        comparator: optional ``f(a, b)`` returning the dissimilarity of two
            embeddings.  When omitted, the Euclidean distance plus
            ``1 / (2 + 2 * cosineSimilarity)`` is used.

    Returns:
        None.  The function only produces a plot.
    """
    embeddingsCount = len(embeddings)
    embeddingIndices = numpy.arange(0, embeddingsCount)

    # Only fall back to the default when the caller passed nothing; the
    # argument used to be overwritten unconditionally and therefore ignored.
    if comparator is None:
        comparator = lambda a, b: vectors.euclideanDistance(a, b) + 1 / (2 + 2*vectors.cosineSimilarity(a, b))

    # A real list (not a lazy map object) is required because the values are
    # consumed twice: once by max() and once by numpy.reshape().
    comparisons = [
        comparator(embeddings[x], embeddings[y]) if x != y else 0
        for x, y in itertools.product(embeddingIndices, embeddingIndices)
    ]
    maxComparison = max(comparisons)
    comparisons = numpy.reshape(comparisons, (embeddingsCount, embeddingsCount)) / maxComparison

    # NOTE(review): squareform validates its input by default, so a custom
    # comparator presumably has to be symmetric - confirm with scipy docs.
    comparisons = ssd.squareform(comparisons)
    links = linkage(comparisons)

    fig, ax = plt.subplots()
    fig.subplots_adjust(right=0.8)

    # NOTE(review): labels are sorted alphabetically, so they line up with
    # the leaves only if the embeddings follow the same order - confirm
    # against the callers.
    names = sorted(name.split('/')[-1] for name in indexMap)

    dendrogram(
        links,
        leaf_rotation=90.,
        leaf_font_size=8.,
        orientation='right',
        labels=names,
        show_contracted=True,
        show_leaf_counts=True)

    plt.show()
def wordSimilarity353(wordIndexMap, embeddings):
    """Score embeddings against the WordSimilarity-353 word-pair ratings.

    For every benchmark pair whose two words are keys of ``wordIndexMap``
    the cosine similarity of their embeddings is computed.  The metric is
    the mean of the Pearson and Spearman correlation coefficients between
    those similarities and the ``Score`` column of the benchmark.

    The benchmark is read once from ``res/WordSimilarity-353/combined.csv``
    and cached in the module-level ``wordSimilarity353Data``.

    Args:
        wordIndexMap: mapping from word to its index in ``embeddings``.
        embeddings: indexable collection of word vectors.

    Returns:
        Mean of the two correlation coefficients, or ``numpy.nan`` when
        fewer than two benchmark pairs are covered by ``wordIndexMap``.
    """
    global wordSimilarity353Data
    if wordSimilarity353Data is None:
        wordSimilarity353FilePath = 'res/WordSimilarity-353/combined.csv'
        data = pandas.read_csv(wordSimilarity353FilePath)
        # The cache is populated in a single assignment after the read has
        # succeeded; a failing read therefore leaves it unset for a retry.
        wordSimilarity353Data = list(zip(data['Word1'], data['Word2'], data['Score']))

    scores = []
    targetScores = []
    for word0, word1, targetScore in wordSimilarity353Data:
        if word0 not in wordIndexMap or word1 not in wordIndexMap:
            continue
        word0Embedding = embeddings[wordIndexMap[word0]]
        word1Embedding = embeddings[wordIndexMap[word1]]
        scores.append(vectors.cosineSimilarity(word0Embedding, word1Embedding))
        targetScores.append(targetScore)

    # scipy.stats.pearsonr needs at least two samples, so one covered pair
    # is treated the same way as none.
    if len(scores) < 2:
        return numpy.nan

    # The second value returned by both functions is a p-value; it is unused.
    pearson, _ = scipy.stats.pearsonr(scores, targetScores)
    spearman, _ = scipy.stats.spearmanr(scores, targetScores)

    return numpy.mean([pearson, spearman])
def innerSelect(self):
    """Select the entries that best complete the analogy held by this object.

    The word tables are used when ``self.leftMaster`` is a key of
    ``self.wordIndexMap``; otherwise the file tables are used.  Each
    candidate embedding is added to the right-master embedding and the sum
    is compared (cosine similarity) with ``leftMaster + leftSlave``.

    Returns:
        A tuple ``(names, embeddings, scores)`` holding the ``self.size``
        highest-scoring entries, ordered by ascending score.
    """
    lookup, table = self.wordIndexMap, self.wordEmbeddings
    if self.leftMaster not in self.wordIndexMap:
        lookup, table = self.fileIndexMap, self.fileEmbeddings

    leftMasterVector = table[lookup[self.leftMaster]]
    leftSlaveVector = table[lookup[self.leftSlave]]
    rightMasterVector = table[lookup[self.rightMaster]]
    target = leftMasterVector + leftSlaveVector

    # NOTE(review): pairing relies on the map's iteration order matching the
    # row order of the embeddings - confirm how the index map is built.
    paired = list(zip(lookup.items(), table))
    labels = [entry[0] for entry, _ in paired]
    similarities = [
        vectors.cosineSimilarity(target, rightMasterVector + row)
        for _, row in paired
    ]

    # argsort is ascending, so the best matches are the trailing indices.
    best = np.argsort(similarities)[-self.size:]

    return np.asarray(labels)[best], table[best], np.asarray(similarities)[best]
def similarity(left, right, wordVocabulary, embeddings):
    """Return the cosine similarity between the embeddings of two words.

    Args:
        left: first word; must be a key of ``wordVocabulary``.
        right: second word; must be a key of ``wordVocabulary``.
        wordVocabulary: mapping from word to an ``(index, frequency)`` pair.
        embeddings: indexable collection of word vectors.

    Returns:
        Whatever ``vectors.cosineSimilarity`` yields for the two vectors.

    Raises:
        KeyError: if either word is missing from ``wordVocabulary``.
    """
    # Only the index of each vocabulary entry is needed; the frequency is
    # unpacked solely to keep the two-element shape check.
    leftIndex, _ = wordVocabulary[left]
    rightIndex, _ = wordVocabulary[right]

    return vectors.cosineSimilarity(embeddings[leftIndex], embeddings[rightIndex])
def similarity(left, right, wordVocabulary, embeddings):
    """Return the cosine similarity between the embeddings of two words.

    Args:
        left: first word; must be a key of ``wordVocabulary``.
        right: second word; must be a key of ``wordVocabulary``.
        wordVocabulary: mapping from word to its index in ``embeddings``.
        embeddings: indexable collection of word vectors.

    Returns:
        Whatever ``vectors.cosineSimilarity`` yields for the two vectors.

    Raises:
        KeyError: if either word is missing from ``wordVocabulary``.
    """
    leftIndex, rightIndex = wordVocabulary[left], wordVocabulary[right]
    embeddingPair = (embeddings[leftIndex], embeddings[rightIndex])

    return vectors.cosineSimilarity(*embeddingPair)
def innerExecute(self, names, embeddings, scores):
    """Log the words closest to the mean of the given embeddings.

    The centroid of ``embeddings`` is compared (cosine similarity) with
    every vector in ``self.wordEmbeddings``; the ``self.size`` best-matching
    words are written to the log as a comma-separated list.

    Args:
        names: not used; the words are taken from ``self.wordIndexMap``.
        embeddings: vectors whose mean along axis 0 forms the query.
        scores: not used; similarities are recomputed here.

    Returns:
        None.
    """
    centroid = np.mean(embeddings, axis=0)

    # NOTE(review): relies on the index map iterating in the same order as
    # the rows of self.wordEmbeddings - confirm how the map is built.
    vocabulary = [pair[0] for pair in self.wordIndexMap.items()]
    similarities = [
        vectors.cosineSimilarity(centroid, wordEmbedding)
        for wordEmbedding in self.wordEmbeddings
    ]

    # Descending order, truncated to the requested number of tags.
    ranking = np.argsort(similarities)[::-1][:self.size]
    bestTags = np.asarray(vocabulary)[ranking]

    log.info('Best tags: {0}', ', '.join(bestTags))
def compareEmbeddings(indexMap, embeddingsList, comparator=None, annotate=False, axisLabels=True):
    """Plot a filled-contour map of pairwise embedding comparisons.

    Every ordered pair of indices below ``len(indexMap)`` is compared with
    ``comparator``.  Diagonal cells start as NaN and are then replaced by
    the mean of the positive values in their 3x3 neighbourhood.  The matrix
    is drawn with ``plt.contourf`` and shown via ``plt.show()``.

    Args:
        indexMap: mapping from file path to the index of its embedding.
        embeddingsList: indexable collection of vectors.
        comparator: optional ``f(a, b)`` returning a similarity value;
            defaults to cosine similarity plus inverse Euclidean distance.
        annotate: when true, each cell is annotated with its value times 100.
        axisLabels: when true, ticks are labelled with the base file names
            (text before the first ``'.'``).

    Returns:
        None.  The function only produces a plot.
    """
    count = len(indexMap)
    positions = numpy.arange(0, count)
    pairs = list(itertools.product(positions, positions))
    xx, yy = zip(*pairs)

    if comparator is None:
        def comparator(a, b):
            return vectors.cosineSimilarity(a, b) + 1 / vectors.euclideanDistance(a, b)

    def compare(pair):
        # Self-comparisons are left undefined here and patched up below.
        first, second = pair
        if first == second:
            return numpy.nan
        return comparator(embeddingsList[first], embeddingsList[second])

    comparisons = numpy.reshape([compare(pair) for pair in pairs], (count, count))

    rowCount, columnCount = comparisons.shape
    nanRows, nanColumns = numpy.where(numpy.isnan(comparisons))
    for x, y in zip(nanRows, nanColumns):
        # Window of up to 3x3 cells around (x, y), clipped at the borders.
        neighbours = comparisons[max(x - 1, 0):min(x + 2, rowCount),
                                 max(y - 1, 0):min(y + 2, columnCount)]
        # The "> 0" mask also drops the NaN cells that are still unfilled.
        comparisons[x, y] = numpy.mean(neighbours[neighbours > 0])

    fig, ax = plt.subplots()
    fig.subplots_adjust(bottom=0.2)

    if axisLabels:
        fileNames = []
        indices = []
        for filePath in indexMap.keys():
            fileNames.append(os.path.basename(filePath).split('.')[0])
            indices.append(indexMap[filePath])

        plt.xticks(indices, fileNames, size='small', rotation='vertical')
        plt.yticks(indices, fileNames, size='small')

    plt.contourf(comparisons)

    if annotate:
        for x, y, value in zip(xx, yy, comparisons.flatten()):
            plt.annotate('{0:.1f}'.format(value*100), (x, y))

    plt.show()
def rubensteinGoodenough(wordIndexMap, embeddings):
    """Score embeddings against the Rubenstein-Goodenough word-pair ratings.

    For every benchmark pair whose two words are keys of ``wordIndexMap``
    the cosine similarity of their embeddings is computed.  The metric is
    the mean of the Pearson and Spearman correlation coefficients between
    those similarities and the benchmark ratings.

    The benchmark is read once from ``res/RG/EN-RG-65.txt`` (one
    tab-separated ``word0, word1, score`` triple per line) and cached in
    the module-level ``rubensteinGoodenoughData``.

    Args:
        wordIndexMap: mapping from word to its index in ``embeddings``.
        embeddings: indexable collection of word vectors.

    Returns:
        Mean of the two correlation coefficients, or ``numpy.nan`` when
        fewer than two benchmark pairs are covered by ``wordIndexMap``.

    Raises:
        ValueError: if a non-blank line does not hold exactly three
            tab-separated fields or its score is not a number.
    """
    global rubensteinGoodenoughData
    if rubensteinGoodenoughData is None:
        rubensteinGoodenoughFilePath = 'res/RG/EN-RG-65.txt'
        # Collect into a local list and publish it only once the whole file
        # has been parsed, so a failure does not leave a partial cache.
        pairs = []
        with open(rubensteinGoodenoughFilePath) as ratingsFile:
            for line in ratingsFile:
                line = line.strip()
                if not line:
                    # A blank (e.g. trailing) line carries no pair.
                    continue
                word0, word1, targetScore = line.split('\t')
                pairs.append((word0, word1, float(targetScore)))
        rubensteinGoodenoughData = pairs

    scores = []
    targetScores = []
    for word0, word1, targetScore in rubensteinGoodenoughData:
        if word0 not in wordIndexMap or word1 not in wordIndexMap:
            continue
        word0Embedding = embeddings[wordIndexMap[word0]]
        word1Embedding = embeddings[wordIndexMap[word1]]
        scores.append(vectors.cosineSimilarity(word0Embedding, word1Embedding))
        targetScores.append(targetScore)

    # scipy.stats.pearsonr needs at least two samples, so one covered pair
    # is treated the same way as none.
    if len(scores) < 2:
        return numpy.nan

    # The second value returned by both functions is a p-value; it is unused.
    pearson, _ = scipy.stats.pearsonr(scores, targetScores)
    spearman, _ = scipy.stats.spearmanr(scores, targetScores)

    return numpy.mean([pearson, spearman])
def rubensteinGoodenough(wordIndexMap, embeddings):
    """Compute the Rubenstein-Goodenough similarity metric for embeddings.

    Benchmark pairs whose two words are both keys of ``wordIndexMap`` are
    scored with the cosine similarity of their embeddings.  The function
    returns the mean of the Pearson and Spearman correlation coefficients
    between these similarities and the benchmark ratings.

    The benchmark lines (tab-separated ``word0, word1, score``) are loaded
    lazily from ``res/RG/EN-RG-65.txt`` and kept in the module-level
    ``rubensteinGoodenoughData``.

    Args:
        wordIndexMap: mapping from word to its index in ``embeddings``.
        embeddings: indexable collection of word vectors.

    Returns:
        Mean of the two correlation coefficients, or ``numpy.nan`` when
        fewer than two benchmark pairs are covered by ``wordIndexMap``.

    Raises:
        ValueError: if a non-blank line does not hold exactly three
            tab-separated fields or its score is not a number.
    """
    global rubensteinGoodenoughData
    if rubensteinGoodenoughData is None:
        rubensteinGoodenoughFilePath = "res/RG/EN-RG-65.txt"
        loaded = []
        # The handle is not called ``file`` so the builtin is not shadowed.
        with open(rubensteinGoodenoughFilePath) as handle:
            for rawLine in handle:
                fields = rawLine.strip()
                if fields:
                    word0, word1, targetScore = fields.split("\t")
                    loaded.append((word0, word1, float(targetScore)))
                # Blank lines are skipped; they used to break the unpacking.
        # Assign the cache last so that an exception above leaves it unset
        # and the next call retries the load.
        rubensteinGoodenoughData = loaded

    covered = [
        (word0, word1, targetScore)
        for word0, word1, targetScore in rubensteinGoodenoughData
        if word0 in wordIndexMap and word1 in wordIndexMap
    ]

    # Correlations are undefined for fewer than two samples and
    # scipy.stats.pearsonr rejects such input.
    if len(covered) < 2:
        return numpy.nan

    scores = [
        vectors.cosineSimilarity(embeddings[wordIndexMap[word0]], embeddings[wordIndexMap[word1]])
        for word0, word1, _ in covered
    ]
    targetScores = [targetScore for _, _, targetScore in covered]

    # The second value returned by both functions is a p-value; it is unused.
    pearson, _ = scipy.stats.pearsonr(scores, targetScores)
    spearman, _ = scipy.stats.spearmanr(scores, targetScores)

    return numpy.mean([pearson, spearman])
def innerExecute(self, selector):
    """Print the ten words closest to a signed sum of word embeddings.

    Each operand of ``selector`` is an ``(operator, word)`` pair; the
    embedding of ``word`` is negated when ``operator`` is ``'-'``.  The
    operand embeddings are summed and compared (cosine similarity) with the
    embeddings in the index window returned by
    ``ExplainFunction.getSurroundingIndices``; operand words themselves are
    excluded from the candidates.

    Args:
        selector: object exposing ``operands``, an iterable of
            ``(operator, word)`` pairs whose words are keys of
            ``self.wordIndexMap``.

    Returns:
        None.  The ten best ``(word, score)`` pairs are printed.
    """
    words = []
    embeddings = []

    for operator, word in selector.operands:
        embedding = self.wordEmbeddings[self.wordIndexMap[word]]

        if operator == '-':
            embedding = embedding * (-1)

        words.append(word)
        embeddings.append(embedding)

    minIndex, maxIndex = ExplainFunction.getSurroundingIndices(words, self.wordIndexMap, 5000)

    # Sum out of place.  ``result += embedding`` would add into the row
    # taken from self.wordEmbeddings and permanently corrupt the stored
    # embedding of the first operand.
    result = embeddings[0]
    for embedding in embeddings[1:]:
        result = result + embedding

    scores = []
    for index in range(minIndex, maxIndex):
        word = self.wordIndexItems[index][0]

        if word not in words:
            embedding = self.wordEmbeddings[index]
            score = vectors.cosineSimilarity(result, embedding)
            scores.append((word, score))

        log.progress('Looking for closest matches: {0:.3f}%.', index - minIndex + 1, maxIndex - minIndex)

    log.lineBreak()

    scores = sorted(scores, key=lambda s: s[1], reverse=True)
    for score in scores[:10]:
        # Function-call form prints the same text under Python 2 and 3.
        print(score)
def satQuestions(wordIndexMap, embeddings):
    """Score embeddings on the SAT analogy questions.

    A question consists of a stem word pair, several choice word pairs and
    the index of the correct choice.  Questions containing a word that is
    not a key of ``wordIndexMap`` are skipped.  For the others, the choice
    whose cosine similarity is closest to the stem's cosine similarity is
    selected and compared with the correct choice.

    The questions are parsed once from
    ``res/SAT-Questions/SAT-package-V3.txt`` and cached in the module-level
    ``satQuestionsData`` as lists of the form
    ``[stem0, stem1, choice0a, choice0b, ..., correctChoiceIndex]``.

    Args:
        wordIndexMap: mapping from word to its index in ``embeddings``.
        embeddings: indexable collection of word vectors.

    Returns:
        Fraction of answered questions that were answered correctly, or
        ``numpy.nan`` when no question could be answered.

    Raises:
        TypeError: if the line following a question's choices is not a
            single character (raised by ``ord``).
    """
    global satQuestionsData
    if satQuestionsData is None:
        satQuestionsFilePath = 'res/SAT-Questions/SAT-package-V3.txt'
        maxLineLength = 50
        aCode = ord('a')
        # Raw string keeps the backslashes literal; compiled once because
        # the same pattern is applied to every candidate line.
        pairPattern = re.compile(r'(?P<word0>[\w-]+)\s(?P<word1>[\w-]+)\s[nvar]:[nvar]')

        # Parsed into a local list and published only after the whole file
        # has been read, so a failure does not leave a partial cache.
        questions = []
        with open(satQuestionsFilePath) as satFile:
            line = satFile.readline()
            while line != '':
                # Only lines shorter than maxLineLength may start a question.
                match = pairPattern.match(line) if len(line) < maxLineLength else None
                if match:
                    satQuestion = [match.group('word0'), match.group('word1')]

                    line = satFile.readline()
                    match = pairPattern.match(line)
                    while match:
                        satQuestion.append(match.group('word0'))
                        satQuestion.append(match.group('word1'))
                        line = satFile.readline()
                        match = pairPattern.match(line)

                    # The first non-matching line holds the answer letter.
                    satQuestion.append(ord(line.strip()) - aCode)
                    questions.append(satQuestion)

                line = satFile.readline()
        satQuestionsData = questions

    scores = []
    for satQuestion in satQuestionsData:
        questionWords = satQuestion[:-1]
        if any(word not in wordIndexMap for word in questionWords):
            continue

        stemWord0, stemWord1 = questionWords[:2]
        stemSimilarity = vectors.cosineSimilarity(
            embeddings[wordIndexMap[stemWord0]], embeddings[wordIndexMap[stemWord1]])

        correctChoiceIndex = satQuestion[-1]
        choices = questionWords[2:]

        choiceSimilarityDeltas = []
        # Choices are stored flat, so even/odd slices rebuild the pairs.
        for choiceWord0, choiceWord1 in zip(choices[0::2], choices[1::2]):
            choiceSimilarity = vectors.cosineSimilarity(
                embeddings[wordIndexMap[choiceWord0]], embeddings[wordIndexMap[choiceWord1]])
            choiceSimilarityDeltas.append(abs(stemSimilarity - choiceSimilarity))

        choiceIndex = numpy.argmin(choiceSimilarityDeltas)
        scores.append(int(choiceIndex == correctChoiceIndex))

    if not scores:
        return numpy.nan

    return float(sum(scores)) / len(scores)
def satQuestions(wordIndexMap, embeddings):
    """Compute the SAT analogy-question accuracy for a set of embeddings.

    Every question has a stem word pair, a number of choice word pairs and
    the index of the correct choice.  A question is answered only if all of
    its words are keys of ``wordIndexMap``; the answer is the choice whose
    cosine similarity differs least from the stem's cosine similarity.

    Questions are loaded lazily from
    ``res/SAT-Questions/SAT-package-V3.txt`` and kept in the module-level
    ``satQuestionsData`` as flat lists
    ``[stem0, stem1, choice0a, choice0b, ..., correctChoiceIndex]``.

    Args:
        wordIndexMap: mapping from word to its index in ``embeddings``.
        embeddings: indexable collection of word vectors.

    Returns:
        Share of answered questions that were answered correctly, or
        ``numpy.nan`` if none could be answered.

    Raises:
        TypeError: if the line following a question's choices is not a
            single character (raised by ``ord``).
    """
    global satQuestionsData
    if satQuestionsData is None:
        satQuestionsFilePath = 'res/SAT-Questions/SAT-package-V3.txt'
        maxLineLength = 50
        aCode = ord('a')
        # One compiled raw-string pattern replaces three identical non-raw
        # literals; the regular expression itself is unchanged.
        wordPair = re.compile(r'(?P<word0>[\w-]+)\s(?P<word1>[\w-]+)\s[nvar]:[nvar]')

        parsedQuestions = []
        # The handle is not called ``file`` so the builtin is not shadowed.
        with open(satQuestionsFilePath) as handle:
            line = handle.readline()
            while line != '':
                if len(line) < maxLineLength:
                    match = wordPair.match(line)
                    if match:
                        satQuestion = [match.group('word0'), match.group('word1')]

                        # Consume consecutive choice lines.
                        line = handle.readline()
                        match = wordPair.match(line)
                        while match:
                            satQuestion.extend((match.group('word0'), match.group('word1')))
                            line = handle.readline()
                            match = wordPair.match(line)

                        # The line that ended the choices is the answer letter.
                        correctChoiceIndex = ord(line.strip()) - aCode
                        satQuestion.append(correctChoiceIndex)
                        parsedQuestions.append(satQuestion)

                line = handle.readline()
        # Assign the cache last: an exception above leaves it as None so a
        # later call retries instead of scoring a truncated question set.
        satQuestionsData = parsedQuestions

    def pairSimilarity(word0, word1):
        # Cosine similarity between the embeddings of two vocabulary words.
        return vectors.cosineSimilarity(embeddings[wordIndexMap[word0]], embeddings[wordIndexMap[word1]])

    scores = []
    for satQuestion in satQuestionsData:
        if any(word not in wordIndexMap for word in satQuestion[:-1]):
            continue

        stemSimilarity = pairSimilarity(satQuestion[0], satQuestion[1])
        correctChoiceIndex = satQuestion[-1]
        choices = satQuestion[2:-1]

        # Choices are stored flat, so even/odd slices rebuild the pairs;
        # this also avoids the Python 2-only xrange.
        choiceSimilarityDeltas = [
            abs(stemSimilarity - pairSimilarity(choiceWord0, choiceWord1))
            for choiceWord0, choiceWord1 in zip(choices[0::2], choices[1::2])
        ]

        choiceIndex = numpy.argmin(choiceSimilarityDeltas)
        scores.append(int(choiceIndex == correctChoiceIndex))

    if not scores:
        return numpy.nan

    return float(sum(scores)) / len(scores)