示例#1
0
def syntacticWordRelations(wordIndexMap, embeddings, maxWords=10):
    """Score embeddings on the four-word relations of questions-words.txt.

    For every quadruple (word0, word1, word2, word3) whose words are all in
    ``wordIndexMap``, the cosine similarity of (word0, word1) is compared with
    that of (word2, word3).  The quadruple scores 1 unless one of the first
    ``maxWords`` rows of ``embeddings``, paired with word2, gives a similarity
    that is strictly closer to the (word0, word1) similarity than word3 does.

    Args:
        wordIndexMap: mapping from word to its row index in ``embeddings``.
        embeddings: indexable and sliceable collection of embedding vectors.
        maxWords: number of leading embeddings tried as competitors of word3.

    Returns:
        The mean score over all usable quadruples, or ``numpy.nan`` when no
        quadruple is fully covered by ``wordIndexMap``.

    Raises:
        ValueError: if a non-header, non-blank line of the data file does not
            contain exactly four words.
    """
    global syntacticWordData

    if syntacticWordData is None:
        syntWordRelFilePath = "res/Syntactic-Word-Relations/questions-words.txt"

        # Parse into a local list and publish it only after the whole file was
        # read: if opening or parsing fails, the cache stays None and the next
        # call retries instead of silently working on empty or partial data.
        quadruples = []
        with open(syntWordRelFilePath, "r") as file:
            for line in file:
                if line.startswith(":"):
                    continue  # lines starting with ":" are not word data

                words = line.lower().split()
                if not words:
                    continue  # tolerate blank lines

                # Unpacking enforces exactly four words per line.
                word0, word1, word2, word3 = words
                quadruples.append((word0, word1, word2, word3))

        syntacticWordData = quadruples

    scores = []
    for quadruple in syntacticWordData:
        if any(word not in wordIndexMap for word in quadruple):
            continue

        embedding0, embedding1, embedding2, embedding3 = (
            embeddings[wordIndexMap[word]] for word in quadruple
        )

        similarity01 = vectors.cosineSimilarity(embedding0, embedding1)
        similarity23 = vectors.cosineSimilarity(embedding2, embedding3)
        minSimilarityDelta = abs(similarity01 - similarity23)

        # The quadruple fails as soon as one candidate beats word3; any()
        # stops at that first candidate.
        beaten = any(
            abs(similarity01 - vectors.cosineSimilarity(embedding2, candidate)) < minSimilarityDelta
            for candidate in embeddings[:maxWords]
        )
        scores.append(not beaten)

    if not scores:
        return numpy.nan

    return float(sum(scores)) / len(scores)
示例#2
0
def syntacticWordRelations(wordIndexMap, embeddings, maxWords=10):
    """Evaluate embeddings against the word relations in questions-words.txt.

    Each data line holds four words.  A quadruple is used only when all four
    words occur in ``wordIndexMap``.  Its score is 1 when none of the first
    ``maxWords`` embeddings, compared with the third word, yields a cosine
    similarity strictly closer to similarity(word0, word1) than
    similarity(word2, word3) is; otherwise it is 0.

    Args:
        wordIndexMap: mapping from word to its row index in ``embeddings``.
        embeddings: indexable and sliceable collection of embedding vectors.
        maxWords: how many leading embeddings are tested as alternatives.

    Returns:
        Fraction of usable quadruples that scored 1, or ``numpy.nan`` if no
        quadruple was usable.

    Raises:
        ValueError: if a non-header, non-blank data line does not consist of
            exactly four words.
    """
    global syntacticWordData

    if syntacticWordData is None:
        syntWordRelFilePath = 'res/Syntactic-Word-Relations/questions-words.txt'

        # The module-level cache is assigned only after a complete, successful
        # parse; a failed load therefore leaves it at None (and is retried on
        # the next call) rather than at an empty or half-filled list.
        loaded = []
        with open(syntWordRelFilePath, 'r') as file:
            for line in file:
                if line.startswith(':'):
                    continue  # lines starting with ':' carry no word data

                words = line.lower().split()
                if not words:
                    continue  # skip blank lines

                # Exactly four words are required; anything else raises.
                word0, word1, word2, word3 = words
                loaded.append((word0, word1, word2, word3))

        syntacticWordData = loaded

    scores = []
    for words in syntacticWordData:
        if not all(word in wordIndexMap for word in words):
            continue

        vector0, vector1, vector2, vector3 = [
            embeddings[wordIndexMap[word]] for word in words
        ]

        similarity01 = vectors.cosineSimilarity(vector0, vector1)
        similarity23 = vectors.cosineSimilarity(vector2, vector3)
        minSimilarityDelta = abs(similarity01 - similarity23)

        score = True
        for other in embeddings[:maxWords]:
            similarityDelta = abs(similarity01 - vectors.cosineSimilarity(vector2, other))
            if similarityDelta < minSimilarityDelta:
                # Another embedding fits the relation better than word3.
                score = False
                break

        scores.append(score)

    if not scores:
        return numpy.nan

    return float(sum(scores)) / len(scores)
示例#3
0
def simLex999(wordIndexMap, embeddings):
    """Correlate embedding similarities with the SimLex-999 target scores.

    For every (word1, word2, SimLex999) row whose two words are both in
    ``wordIndexMap``, the cosine similarity of their embeddings is computed.
    The metric is the mean of the Pearson and Spearman correlation
    coefficients between those similarities and the target scores.

    Args:
        wordIndexMap: mapping from word to its row index in ``embeddings``.
        embeddings: indexable collection of embedding vectors.

    Returns:
        The averaged correlation, or ``numpy.nan`` when fewer than two word
        pairs are covered by ``wordIndexMap``.
    """
    global simLex999Data

    if simLex999Data is None:
        simLex999FilePath = 'res/SimLex-999/SimLex-999.txt'
        data = pandas.read_csv(simLex999FilePath, sep='\t')

        # Assign the cache only after the file was read successfully, so a
        # failed load is retried instead of leaving an empty list behind.
        simLex999Data = list(zip(data['word1'], data['word2'], data['SimLex999']))

    targetScores = []
    scores = []
    for word0, word1, targetScore in simLex999Data:
        if word0 not in wordIndexMap or word1 not in wordIndexMap:
            continue

        word0Embedding = embeddings[wordIndexMap[word0]]
        word1Embedding = embeddings[wordIndexMap[word1]]

        targetScores.append(targetScore)
        scores.append(vectors.cosineSimilarity(word0Embedding, word1Embedding))

    # A correlation needs at least two observations.
    if len(scores) < 2:
        return numpy.nan

    pearson, _ = scipy.stats.pearsonr(scores, targetScores)
    spearman, _ = scipy.stats.spearmanr(scores, targetScores)

    return numpy.mean([pearson, spearman])
示例#4
0
def buildEmbeddingsTree(indexMap, embeddings, comparator=None):
    """Plot a dendrogram of the pairwise dissimilarities between embeddings.

    Args:
        indexMap: mapping whose keys are '/'-separated names; the last path
            component of each key is used as a leaf label.
        embeddings: indexable collection of embedding vectors.
        comparator: optional callable ``(a, b) -> number`` giving the
            dissimilarity of two embeddings.  When omitted, the Euclidean
            distance plus ``1 / (2 + 2 * cosineSimilarity)`` is used.
            NOTE(review): the matrix is passed to ``squareform``, so a custom
            comparator presumably has to be symmetric -- confirm.

    Returns:
        None.  The figure is shown with ``plt.show()``.
    """
    embeddingsCount = len(embeddings)

    # Honor a caller-supplied comparator; fall back to the default only when
    # none was given.
    if comparator is None:
        def comparator(a, b):
            return vectors.euclideanDistance(a, b) + 1 / (2 + 2*vectors.cosineSimilarity(a, b))

    # A real list (not a one-shot iterator) because it is consumed twice:
    # once by max() and once by numpy.reshape().
    comparisons = [
        comparator(embeddings[x], embeddings[y]) if x != y else 0
        for x, y in itertools.product(range(embeddingsCount), repeat=2)
    ]

    # Scale by the largest comparison.
    # NOTE(review): this divides by zero when every comparison is 0 (e.g. a
    # single embedding) -- confirm whether that input can occur.
    maxComparison = max(comparisons)
    comparisons = numpy.reshape(comparisons, (embeddingsCount, embeddingsCount)) / maxComparison
    comparisons = ssd.squareform(comparisons)
    links = linkage(comparisons)

    fig, ax = plt.subplots()
    fig.subplots_adjust(right=0.8)

    # NOTE(review): labels are ordered alphabetically, not by embedding index;
    # presumably the two orders coincide -- verify against the caller.
    names = sorted(name.split('/')[-1] for name, _ in indexMap.items())
    dendrogram(
        links,
        leaf_rotation=90.,
        leaf_font_size=8.,
        orientation='right',
        labels=names,
        show_contracted=True,
        show_leaf_counts=True)

    plt.show()
示例#5
0
def wordSimilarity353(wordIndexMap, embeddings):
    """Correlate embedding similarities with the WordSimilarity-353 scores.

    For every (Word1, Word2, Score) row whose two words are both in
    ``wordIndexMap``, the cosine similarity of their embeddings is computed.
    The metric is the mean of the Pearson and Spearman correlation
    coefficients between those similarities and the target scores.

    Args:
        wordIndexMap: mapping from word to its row index in ``embeddings``.
        embeddings: indexable collection of embedding vectors.

    Returns:
        The averaged correlation, or ``numpy.nan`` when fewer than two word
        pairs are covered by ``wordIndexMap``.
    """
    global wordSimilarity353Data

    if wordSimilarity353Data is None:
        wordSimilarity353FilePath = 'res/WordSimilarity-353/combined.csv'
        data = pandas.read_csv(wordSimilarity353FilePath)

        # Publish the cache only once loading succeeded; a failed read must
        # not leave an empty list that later calls would treat as "no data".
        wordSimilarity353Data = list(zip(data['Word1'], data['Word2'], data['Score']))

    scores = []
    targetScores = []
    for word0, word1, targetScore in wordSimilarity353Data:
        if word0 not in wordIndexMap or word1 not in wordIndexMap:
            continue

        word0Embedding = embeddings[wordIndexMap[word0]]
        word1Embedding = embeddings[wordIndexMap[word1]]

        targetScores.append(targetScore)
        scores.append(vectors.cosineSimilarity(word0Embedding, word1Embedding))

    # A correlation needs at least two observations.
    if len(scores) < 2:
        return numpy.nan

    pearson, _ = scipy.stats.pearsonr(scores, targetScores)
    spearman, _ = scipy.stats.spearmanr(scores, targetScores)

    return numpy.mean([pearson, spearman])
示例#6
0
def simLex999(wordIndexMap, embeddings):
    """Compute the SimLex-999 metric for a set of word embeddings.

    The cosine similarity of each covered (word1, word2) pair is correlated
    with the pair's SimLex999 value; the result is the mean of the Pearson
    and Spearman coefficients.  Pairs with a word missing from
    ``wordIndexMap`` are ignored.

    Args:
        wordIndexMap: mapping from word to its row index in ``embeddings``.
        embeddings: indexable collection of embedding vectors.

    Returns:
        Mean of the two correlation coefficients, or ``numpy.nan`` when
        fewer than two pairs are covered.
    """
    global simLex999Data

    if simLex999Data is None:
        simLex999FilePath = 'res/SimLex-999/SimLex-999.txt'
        data = pandas.read_csv(simLex999FilePath, sep='\t')

        # The cache is written after the read, never before: if read_csv
        # raises, the global stays None and the next call tries again.
        simLex999Data = list(zip(data['word1'], data['word2'], data['SimLex999']))

    coveredPairs = [
        (word0, word1, targetScore)
        for word0, word1, targetScore in simLex999Data
        if word0 in wordIndexMap and word1 in wordIndexMap
    ]

    # Correlation coefficients are undefined for fewer than two samples.
    if len(coveredPairs) < 2:
        return numpy.nan

    targetScores = [targetScore for _, _, targetScore in coveredPairs]
    scores = [
        vectors.cosineSimilarity(embeddings[wordIndexMap[word0]], embeddings[wordIndexMap[word1]])
        for word0, word1, _ in coveredPairs
    ]

    pearson, _ = scipy.stats.pearsonr(scores, targetScores)
    spearman, _ = scipy.stats.spearmanr(scores, targetScores)

    return numpy.mean([pearson, spearman])
示例#7
0
    def innerSelect(self):
        """Select the entries that best complete the left pair on the right.

        The word map and word embeddings are used when ``self.leftMaster`` is
        a key of ``self.wordIndexMap``; otherwise the file map and file
        embeddings are used.  Every entry is scored by the cosine similarity
        between (leftMaster + leftSlave) and (rightMaster + entry).

        Returns:
            A ``(names, embeddings, scores)`` triple restricted to the last
            ``self.size`` positions of the ascending score order.
        """
        useWords = self.leftMaster in self.wordIndexMap
        indexMap = self.wordIndexMap if useWords else self.fileIndexMap
        embeddings = self.wordEmbeddings if useWords else self.fileEmbeddings

        leftMaster = embeddings[indexMap[self.leftMaster]]
        leftSlave = embeddings[indexMap[self.leftSlave]]
        rightMaster = embeddings[indexMap[self.rightMaster]]
        leftVector = leftMaster + leftSlave

        # NOTE(review): names are paired with embeddings by the iteration
        # order of indexMap, so that order presumably matches the row order
        # of the embeddings -- confirm.
        names, scores = [], []
        for (name, _), candidate in zip(indexMap.items(), embeddings):
            names.append(name)
            scores.append(vectors.cosineSimilarity(leftVector, rightMaster + candidate))

        best = np.argsort(scores)[-self.size:]

        return np.asarray(names)[best], embeddings[best], np.asarray(scores)[best]
示例#8
0
def similarity(left, right, wordVocabulary, embeddings):
    """Return the cosine similarity between the embeddings of two words.

    ``wordVocabulary[word]`` must unpack into two values; the first is used
    as the row index into ``embeddings`` and the second is ignored.
    """
    leftIndex, _ = wordVocabulary[left]
    rightIndex, _ = wordVocabulary[right]

    return vectors.cosineSimilarity(embeddings[leftIndex], embeddings[rightIndex])
示例#9
0
def similarity(left, right, wordVocabulary, embeddings):
    """Return the cosine similarity between the embeddings of two words.

    ``wordVocabulary`` maps each word directly to its index in
    ``embeddings``.
    """
    # Resolve both indices first, then both embeddings.
    indices = [wordVocabulary[word] for word in (left, right)]
    leftEmbedding, rightEmbedding = [embeddings[index] for index in indices]

    return vectors.cosineSimilarity(leftEmbedding, rightEmbedding)
示例#10
0
    def innerExecute(self, names, embeddings, scores):
        """Log the words whose embeddings are closest to the mean embedding.

        Args:
            names: not read; rebuilt from ``self.wordIndexMap``.
            embeddings: embeddings whose mean (over axis 0) is the reference.
            scores: not read; recomputed for all of ``self.wordEmbeddings``.

        The ``self.size`` words with the highest cosine similarity to the
        mean embedding are logged, best first.
        """
        meanEmbedding = np.mean(embeddings, axis=0)

        # Built as a list rather than with map(): on Python 3 a map object
        # would become a 0-d object array and could not be indexed below.
        # NOTE(review): names are matched to scores by position, so the
        # iteration order of wordIndexMap presumably follows the row order of
        # wordEmbeddings -- confirm.
        names = [name for name, _ in self.wordIndexMap.items()]
        scores = [
            vectors.cosineSimilarity(meanEmbedding, wordEmbedding)
            for wordEmbedding in self.wordEmbeddings
        ]

        # Descending score order, truncated to the requested count.
        indices = np.argsort(scores)[::-1][:self.size]
        names = np.asarray(names)[indices]

        log.info('Best tags: {0}', ', '.join(names))
示例#11
0
def compareEmbeddings(indexMap, embeddingsList, comparator=None, annotate=False, axisLabels=True):
    """Draw a filled contour plot of all pairwise embedding comparisons.

    Args:
        indexMap: mapping from file path to the index of its embedding; its
            length determines how many embeddings are compared.
        embeddingsList: indexable collection of embedding vectors.
        comparator: optional callable ``(a, b) -> number``.  Defaults to the
            cosine similarity plus the inverse Euclidean distance.
            NOTE(review): the default divides by the distance, so two
            identical embeddings at different indices would divide by zero.
        annotate: when true, write each cell's value (times 100, one decimal)
            at its index pair.
        axisLabels: when true, label both axes with the extension-less base
            names of the file paths.

    Returns:
        None.  The figure is shown with ``plt.show()``.
    """
    embeddingsCount = len(indexMap)
    xy = list(itertools.product(range(embeddingsCount), repeat=2))

    if comparator is None:
        comparator = lambda a, b: vectors.cosineSimilarity(a, b) + 1 / vectors.euclideanDistance(a, b)

    # A list comprehension instead of map(): numpy.reshape needs a real
    # sequence, which map() only provides on Python 2.
    comparisons = [
        comparator(embeddingsList[x], embeddingsList[y]) if x != y else numpy.nan
        for x, y in xy
    ]
    comparisons = numpy.reshape(comparisons, (embeddingsCount, embeddingsCount))

    # Replace every NaN (diagonal) cell by the mean of the strictly positive
    # values in its 3x3 neighbourhood, clipped at the matrix border.  Cells
    # are filled in order, so already-filled cells feed into later ones.
    rowCount, columnCount = comparisons.shape
    nanxx, nanyy = numpy.where(numpy.isnan(comparisons))
    for x, y in zip(nanxx, nanyy):
        neighbours = comparisons[max(x - 1, 0):min(x + 2, rowCount),
                                 max(y - 1, 0):min(y + 2, columnCount)]
        neighbours = neighbours[neighbours > 0]
        comparisons[x, y] = numpy.mean(neighbours)

    fig, ax = plt.subplots()
    fig.subplots_adjust(bottom=0.2)

    if axisLabels:
        filePaths = list(indexMap.keys())
        fileNames = [os.path.basename(filePath).split('.')[0] for filePath in filePaths]
        indices = [indexMap[filePath] for filePath in filePaths]

        plt.xticks(indices, fileNames, size='small', rotation='vertical')
        plt.yticks(indices, fileNames, size='small')

    plt.contourf(comparisons)

    if annotate:
        for (x, y), c in zip(xy, comparisons.flatten()):
            c = '{0:.1f}'.format(c*100)
            plt.annotate(c, (x, y))

    plt.show()
示例#12
0
def rubensteinGoodenough(wordIndexMap, embeddings):
    """Correlate embedding similarities with the scores in EN-RG-65.txt.

    Each data line holds ``word0<TAB>word1<TAB>score``.  For every pair whose
    two words are both in ``wordIndexMap``, the cosine similarity of their
    embeddings is computed.  The metric is the mean of the Pearson and
    Spearman correlation coefficients between similarities and scores.

    Args:
        wordIndexMap: mapping from word to its row index in ``embeddings``.
        embeddings: indexable collection of embedding vectors.

    Returns:
        The averaged correlation, or ``numpy.nan`` when fewer than two word
        pairs are covered by ``wordIndexMap``.

    Raises:
        ValueError: if a non-blank data line does not have three
            tab-separated fields or its score is not a number.
    """
    global rubensteinGoodenoughData

    if rubensteinGoodenoughData is None:
        rubensteinGoodenoughFilePath = 'res/RG/EN-RG-65.txt'

        # Collect into a local list and publish it only after a complete
        # parse, so a missing or malformed file cannot leave an empty or
        # partial cache behind for later calls.
        loaded = []
        with open(rubensteinGoodenoughFilePath) as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines

                word0, word1, targetScore = line.split('\t')
                loaded.append((word0, word1, float(targetScore)))

        rubensteinGoodenoughData = loaded

    scores = []
    targetScores = []
    for word0, word1, targetScore in rubensteinGoodenoughData:
        if word0 not in wordIndexMap or word1 not in wordIndexMap:
            continue

        word0Embedding = embeddings[wordIndexMap[word0]]
        word1Embedding = embeddings[wordIndexMap[word1]]

        targetScores.append(targetScore)
        scores.append(vectors.cosineSimilarity(word0Embedding, word1Embedding))

    # A correlation needs at least two observations.
    if len(scores) < 2:
        return numpy.nan

    pearson, _ = scipy.stats.pearsonr(scores, targetScores)
    spearman, _ = scipy.stats.spearmanr(scores, targetScores)

    return numpy.mean([pearson, spearman])
示例#13
0
def rubensteinGoodenough(wordIndexMap, embeddings):
    """Compute the EN-RG-65 similarity metric for a set of word embeddings.

    The data file is read as tab-separated ``word0, word1, score`` lines.
    The cosine similarity of each covered word pair is correlated with the
    pair's score; the result is the mean of the Pearson and Spearman
    coefficients.  Pairs with a word missing from ``wordIndexMap`` are
    ignored.

    Args:
        wordIndexMap: mapping from word to its row index in ``embeddings``.
        embeddings: indexable collection of embedding vectors.

    Returns:
        Mean of the two correlation coefficients, or ``numpy.nan`` when
        fewer than two pairs are covered.

    Raises:
        ValueError: if a non-blank data line does not have three
            tab-separated fields or its score is not a number.
    """
    global rubensteinGoodenoughData

    if rubensteinGoodenoughData is None:
        rubensteinGoodenoughFilePath = "res/RG/EN-RG-65.txt"

        with open(rubensteinGoodenoughFilePath) as file:
            # Blank lines are dropped here instead of crashing the parser.
            rows = [line.strip().split("\t") for line in file if line.strip()]

        # Built completely before the global is touched: any error above or
        # in the unpacking below leaves the cache at None for a later retry.
        rubensteinGoodenoughData = [
            (word0, word1, float(targetScore))
            for word0, word1, targetScore in rows
        ]

    coveredPairs = [
        (word0, word1, targetScore)
        for word0, word1, targetScore in rubensteinGoodenoughData
        if word0 in wordIndexMap and word1 in wordIndexMap
    ]

    # Correlation coefficients are undefined for fewer than two samples.
    if len(coveredPairs) < 2:
        return numpy.nan

    targetScores = [targetScore for _, _, targetScore in coveredPairs]
    scores = [
        vectors.cosineSimilarity(embeddings[wordIndexMap[word0]], embeddings[wordIndexMap[word1]])
        for word0, word1, _ in coveredPairs
    ]

    pearson, _ = scipy.stats.pearsonr(scores, targetScores)
    spearman, _ = scipy.stats.spearmanr(scores, targetScores)

    return numpy.mean([pearson, spearman])
示例#14
0
    def innerExecute(self, selector):
        """Print the ten words closest to a signed sum of word embeddings.

        Args:
            selector: object whose ``operands`` yields ``(operator, word)``
                pairs; the embedding of a word whose operator is ``'-'`` is
                negated before summing.

        Words with an index in the range returned by
        ``ExplainFunction.getSurroundingIndices`` (operand words excluded)
        are scored by cosine similarity to the sum, and the ten highest
        ``(word, score)`` pairs are printed.

        Raises:
            IndexError: if ``selector.operands`` is empty.
        """
        words = []
        embeddings = []

        for operator, word in selector.operands:
            embedding = self.wordEmbeddings[self.wordIndexMap[word]]
            if operator == '-':
                embedding = embedding * (-1)

            words.append(word)
            embeddings.append(embedding)

        minIndex, maxIndex = ExplainFunction.getSurroundingIndices(words, self.wordIndexMap, 5000)

        # Out-of-place addition on purpose: embeddings[0] may be taken
        # straight from self.wordEmbeddings (assumed to be a NumPy array, so
        # a row is a view), and "+=" would overwrite that stored embedding.
        result = embeddings[0]
        for embedding in embeddings[1:]:
            result = result + embedding

        scores = []
        for index in range(minIndex, maxIndex):
            word = self.wordIndexItems[index][0]

            if word not in words:
                embedding = self.wordEmbeddings[index]
                score = vectors.cosineSimilarity(result, embedding)
                scores.append((word, score))

            log.progress('Looking for closest matches: {0:.3f}%.',
                         index - minIndex + 1,
                         maxIndex - minIndex)

        log.lineBreak()

        scores = sorted(scores, key=lambda s: s[1], reverse=True)

        # print(x) with a single argument gives the same output on Python 2
        # and Python 3.
        for score in scores[:10]:
            print(score)
示例#15
0
def wordSimilarity353(wordIndexMap, embeddings):
    """Compute the WordSimilarity-353 metric for a set of word embeddings.

    The cosine similarity of each covered (Word1, Word2) pair is correlated
    with the pair's Score; the result is the mean of the Pearson and
    Spearman coefficients.  Pairs with a word missing from ``wordIndexMap``
    are ignored.

    Args:
        wordIndexMap: mapping from word to its row index in ``embeddings``.
        embeddings: indexable collection of embedding vectors.

    Returns:
        Mean of the two correlation coefficients, or ``numpy.nan`` when
        fewer than two pairs are covered.
    """
    global wordSimilarity353Data

    if wordSimilarity353Data is None:
        wordSimilarity353FilePath = 'res/WordSimilarity-353/combined.csv'
        data = pandas.read_csv(wordSimilarity353FilePath)

        # The cache is written after the read, never before: if read_csv
        # raises, the global stays None and the next call tries again.
        wordSimilarity353Data = list(zip(data['Word1'], data['Word2'], data['Score']))

    coveredPairs = [
        (word0, word1, targetScore)
        for word0, word1, targetScore in wordSimilarity353Data
        if word0 in wordIndexMap and word1 in wordIndexMap
    ]

    # Correlation coefficients are undefined for fewer than two samples.
    if len(coveredPairs) < 2:
        return numpy.nan

    targetScores = [targetScore for _, _, targetScore in coveredPairs]
    scores = [
        vectors.cosineSimilarity(embeddings[wordIndexMap[word0]], embeddings[wordIndexMap[word1]])
        for word0, word1, _ in coveredPairs
    ]

    pearson, _ = scipy.stats.pearsonr(scores, targetScores)
    spearman, _ = scipy.stats.spearmanr(scores, targetScores)

    return numpy.mean([pearson, spearman])
示例#16
0
def _readSatQuestions(filePath, maxLineLength=50):
    """Parse the SAT question file into a list of flat question records.

    Each record is ``[stemWord0, stemWord1, choiceWord0, choiceWord1, ...,
    correctChoiceIndex]``, where the index is the answer letter's offset
    from ``'a'``.
    """
    wordPairPattern = re.compile(r'(?P<word0>[\w-]+)\s(?P<word1>[\w-]+)\s[nvar]:[nvar]')
    aCode = ord('a')
    questions = []

    with open(filePath) as satFile:
        line = satFile.readline()
        while line != '':
            # Only a short line can start a question (the stem pair).
            match = wordPairPattern.match(line) if len(line) < maxLineLength else None
            if match:
                question = [match.group('word0'), match.group('word1')]

                # Every directly following word-pair line is one choice.
                line = satFile.readline()
                match = wordPairPattern.match(line)
                while match:
                    question.append(match.group('word0'))
                    question.append(match.group('word1'))

                    line = satFile.readline()
                    match = wordPairPattern.match(line)

                # The first non-pair line after the choices is read as the
                # single-letter answer.
                question.append(ord(line.strip()) - aCode)
                questions.append(question)

            line = satFile.readline()

    return questions


def satQuestions(wordIndexMap, embeddings):
    """Score embeddings on the SAT analogy questions.

    A question is used only when its stem words and all choice words are in
    ``wordIndexMap``.  The predicted answer is the choice pair whose cosine
    similarity is closest to the stem pair's cosine similarity; the question
    scores 1 when that prediction equals the recorded answer, else 0.

    Args:
        wordIndexMap: mapping from word to its row index in ``embeddings``.
        embeddings: indexable collection of embedding vectors.

    Returns:
        Fraction of usable questions answered correctly, or ``numpy.nan``
        when no question is usable.
    """
    global satQuestionsData

    if satQuestionsData is None:
        satQuestionsFilePath = 'res/SAT-Questions/SAT-package-V3.txt'

        # Assigned only after a complete parse, so a failed load leaves the
        # cache at None instead of an empty or partial list.
        satQuestionsData = _readSatQuestions(satQuestionsFilePath)

    scores = []
    for satQuestion in satQuestionsData:
        words = satQuestion[:-1]
        correctChoiceIndex = satQuestion[-1]

        if any(word not in wordIndexMap for word in words):
            continue

        stemWord0, stemWord1 = words[:2]
        stemSimilarity = vectors.cosineSimilarity(
            embeddings[wordIndexMap[stemWord0]], embeddings[wordIndexMap[stemWord1]])

        choices = words[2:]
        choiceSimilarityDeltas = []
        for i in range(0, len(choices), 2):
            choiceWord0, choiceWord1 = choices[i], choices[i + 1]
            choiceSimilarity = vectors.cosineSimilarity(
                embeddings[wordIndexMap[choiceWord0]], embeddings[wordIndexMap[choiceWord1]])
            choiceSimilarityDeltas.append(abs(stemSimilarity - choiceSimilarity))

        if not choiceSimilarityDeltas:
            continue  # a stem without choices cannot be answered

        # Pick the answer once, after ALL choices were compared, so that each
        # question contributes exactly one score.
        choiceIndex = numpy.argmin(choiceSimilarityDeltas)
        scores.append(int(choiceIndex == correctChoiceIndex))

    if not scores:
        return numpy.nan

    return float(sum(scores)) / len(scores)
示例#17
0
def _readSatQuestions(filePath, maxLineLength=50):
    """Read SAT analogy questions from *filePath*.

    Returns a list with one flat list per question:
    ``[stemWord0, stemWord1, choiceWord0, choiceWord1, ..., answerIndex]``,
    ``answerIndex`` being the answer letter's distance from ``'a'``.
    """
    wordPairPattern = re.compile(
        r'(?P<word0>[\w-]+)\s(?P<word1>[\w-]+)\s[nvar]:[nvar]')
    aCode = ord('a')
    questions = []

    with open(filePath) as file:
        line = file.readline()
        while line != '':
            # A question starts at a short line holding a word pair.
            match = None
            if len(line) < maxLineLength:
                match = wordPairPattern.match(line)

            if match:
                question = [match.group('word0'), match.group('word1')]

                # Consecutive word-pair lines after the stem are the choices.
                line = file.readline()
                match = wordPairPattern.match(line)
                while match:
                    question.extend(
                        [match.group('word0'), match.group('word1')])

                    line = file.readline()
                    match = wordPairPattern.match(line)

                # The line that ended the run of choices is read as the
                # single-letter answer.
                question.append(ord(line.strip()) - aCode)
                questions.append(question)

            line = file.readline()

    return questions


def satQuestions(wordIndexMap, embeddings):
    """Compute the SAT-questions accuracy for a set of word embeddings.

    Questions with any stem or choice word missing from ``wordIndexMap`` are
    skipped.  For the others, the choice pair whose cosine similarity differs
    least from the stem pair's cosine similarity is taken as the answer, and
    the question counts as correct when it matches the recorded answer index.

    Args:
        wordIndexMap: mapping from word to its row index in ``embeddings``.
        embeddings: indexable collection of embedding vectors.

    Returns:
        Share of correctly answered questions among the usable ones, or
        ``numpy.nan`` when no question is usable.
    """
    global satQuestionsData

    if satQuestionsData is None:
        satQuestionsFilePath = 'res/SAT-Questions/SAT-package-V3.txt'

        # The global is set from the finished result only; an exception while
        # loading leaves it at None so that the load is retried later.
        satQuestionsData = _readSatQuestions(satQuestionsFilePath)

    def pairSimilarity(word0, word1):
        return vectors.cosineSimilarity(embeddings[wordIndexMap[word0]],
                                        embeddings[wordIndexMap[word1]])

    scores = []
    for satQuestion in satQuestionsData:
        if any(word not in wordIndexMap for word in satQuestion[:-1]):
            continue

        stemSimilarity = pairSimilarity(satQuestion[0], satQuestion[1])
        correctChoiceIndex = satQuestion[-1]

        choices = satQuestion[2:-1]
        choiceSimilarityDeltas = [
            abs(stemSimilarity - pairSimilarity(choices[i], choices[i + 1]))
            for i in range(0, len(choices), 2)
        ]

        if not choiceSimilarityDeltas:
            continue  # nothing to choose from

        # Exactly one score per question, decided after every choice has
        # been compared with the stem.
        choiceIndex = numpy.argmin(choiceSimilarityDeltas)
        scores.append(int(choiceIndex == correctChoiceIndex))

    if not scores:
        return numpy.nan

    return float(sum(scores)) / len(scores)