Example #1
import codecs

import numpy as np

# Lexicon and Embedding are project classes and are assumed to be importable
# (the docstring in Example #2 places Lexicon in data.Lexicon).


def load_embedding(opts, paddingSym):
    if opts["lexicon"]:
        emb = np.load(opts["word_embedding"])

        lexicon = Lexicon(unknownSymbol=None)
        with codecs.open(opts["lexicon"]) as f:
            for l in f:
                lexicon.put(l.strip())

        lexicon.setUnknown("UUUKNNN")
        paddingId = lexicon.getLexiconIndex(paddingSym)
        embedding = Embedding(lexicon, emb, paddingIdx=paddingId)
    elif opts["word_embedding"]:
        # TODO: allow using embeddings together with other representations
        lexicon, embedding = Embedding.fromFile(opts['word_embedding'],
                                                'UUUKNNN',
                                                hasHeader=False,
                                                paddingSym=paddingSym)

    return lexicon, embedding
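
A minimal usage sketch for load_embedding. Only the option keys ("lexicon", "word_embedding") and the call shape are taken from the code above; the file paths and the "<pad>" token are hypothetical, and the padding symbol is presumably expected to appear in the lexicon file so that getLexiconIndex can resolve it.

# Hypothetical options; only the keys are read by load_embedding above.
opts = {
    "lexicon": "data/lexicon.txt",              # one token per line
    "word_embedding": "data/word_vectors.npy",  # .npy matrix for this branch
}
lexicon, embedding = load_embedding(opts, paddingSym="<pad>")

# With "lexicon" unset, the word2vec text branch (Embedding.fromFile) is used:
opts = {"lexicon": None, "word_embedding": "data/word_vectors.w2v.txt"}
lexicon, embedding = load_embedding(opts, paddingSym="<pad>")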
Example #2
    @staticmethod
    def fromFile(file,
                 unknownSymbol,
                 lexiconName=None,
                 hasHeader=True,
                 paddingSym=None):
        """
        Creates  a lexicon and a embedding from word2vec file.
        :param file: path of file
        :param unknownSymbol: the string that represents the unknown words.
        :return: (data.Lexicon.Lexicon, Embedding)
        """
        log = logging.getLogger(__name__)
        fVec = codecs.open(file, 'r', 'utf-8')

        # Read the number of words in the dictionary and the embedding size
        if hasHeader:
            nmWords, embeddingSizeStr = fVec.readline().strip().split(" ")
            embeddingSize = int(embeddingSizeStr)
        else:
            embeddingSize = None

        lexicon = Lexicon(unknownSymbol, lexiconName)
        # The empty list at index 0 is a placeholder for the unknown symbol's vector.
        # At the end, it is replaced by a vector found in the w2v file or by a random vector.
        vectors = [[]]
        nmEmptyWords = 0

        for line in fVec:
            splitLine = line.rstrip().split(u' ')
            word = splitLine[0]

            if len(word) == 0:
                log.warning(
                    "An empty string was found in the embedding file; this embedding will be thrown out."
                )
                nmEmptyWords += 1
                continue

            vec = [float(num) for num in splitLine[1:]]

            if word == unknownSymbol:
                if len(vectors[0]) != 0:
                    raise Exception("A unknown symbol was already inserted.")

                vectors[0] = vec
            else:
                lexicon.put(word)
                vectors.append(vec)

        expected_size = lexicon.getLen() - 1 + nmEmptyWords

        # Infer the embedding size from the last vector read when the file has no header.
        if embeddingSize is None:
            embeddingSize = len(vectors[-1])

        if len(vectors[0]) == 0:
            vectors[0] = generateVector(embeddingSize)
            expected_size += 1

        if hasHeader:
            if int(nmWords) != expected_size:
                raise Exception(
                    "The size of the lexicon is different from the number of vectors.")

        if paddingSym is None:
            paddingIdx = None
        else:
            if not lexicon.exist(paddingSym):
                paddingIdx = lexicon.put(paddingSym)
                vectors.append([0.0] * embeddingSize)
            else:
                paddingIdx = lexicon.getLexiconIndex(paddingSym)

        fVec.close()

        return lexicon, Embedding(lexicon, vectors, paddingIdx=paddingIdx)
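
A sketch of the plain-text word2vec format that fromFile parses, followed by a call in the hasHeader=False style of Example #1. The file name, the "<pad>" token, and the toy words are made up for illustration; only "UUUKNNN" comes from the examples above.

import codecs

# Toy embedding file in the format read above: one "<word> <v1> ... <vN>" line
# per word; a "<num_words> <dim>" header line is only required when hasHeader=True.
with codecs.open("toy_vectors.txt", "w", "utf-8") as f:
    f.write("UUUKNNN 0.0 0.0 0.0 0.0\n")  # vector reserved for unknown words
    f.write("hello 0.1 0.2 0.3 0.4\n")
    f.write("world 0.5 0.6 0.7 0.8\n")
    f.write("<pad> 0.0 0.0 0.0 0.0\n")    # padding symbol already in the file

lexicon, embedding = Embedding.fromFile("toy_vectors.txt",
                                        "UUUKNNN",
                                        hasHeader=False,
                                        paddingSym="<pad>")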