import codecs
import logging

import numpy as np

# Project-internal dependencies. The Lexicon path follows the docstring of
# fromFile below; the module paths for Embedding and generateVector are
# assumptions and may need adjusting to the actual package layout:
# from data.Lexicon import Lexicon
# from data.Embedding import Embedding, generateVector


def load_embedding(opts, paddingSym):
    if opts["lexicon"]:
        # A plain-text lexicon (one word per line) plus a NumPy matrix of
        # vectors whose rows align with that lexicon.
        emb = np.load(opts["word_embedding"])
        lexicon = Lexicon(unknownSymbol=None)

        with codecs.open(opts["lexicon"]) as f:
            for l in f:
                lexicon.put(l.strip())

        lexicon.setUnknown("UUUKNNN")
        paddingId = lexicon.getLexiconIndex(paddingSym)
        embedding = Embedding(lexicon, emb, paddingIdx=paddingId)
    elif opts["word_embedding"]:
        # TODO: allow embeddings to be used together with other representations
        lexicon, embedding = Embedding.fromFile(opts['word_embedding'], 'UUUKNNN',
                                                hasHeader=False, paddingSym=paddingSym)
    else:
        # Fail loudly instead of returning unbound names when neither
        # source is configured.
        raise ValueError("opts must set either 'lexicon' or 'word_embedding'")

    return lexicon, embedding
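
# Usage sketch (illustrative; the file names and padding symbol are
# hypothetical, not part of the original module):
#
#     # Separate lexicon file plus a matrix saved with np.save:
#     opts = {"lexicon": "lexicon.txt", "word_embedding": "vectors.npy"}
#     lexicon, embedding = load_embedding(opts, paddingSym="</s>")
#
#     # word2vec-style text file only, no separate lexicon:
#     opts = {"lexicon": None, "word_embedding": "vectors.txt"}
#     lexicon, embedding = load_embedding(opts, paddingSym="</s>")
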
# Referenced above as Embedding.fromFile, so in the original module this is
# presumably a @staticmethod of the Embedding class.
def fromFile(file, unknownSymbol, lexiconName=None, hasHeader=True, paddingSym=None):
    """
    Creates a lexicon and an embedding from a word2vec-style text file.

    :param file: path of the file
    :param unknownSymbol: the string that represents unknown words
    :param lexiconName: optional name of the created lexicon
    :param hasHeader: whether the first line contains the number of words
        and the embedding size
    :param paddingSym: optional padding symbol; added with a zero vector
        if it does not appear in the file
    :return: (data.Lexicon.Lexicon, Embedding)
    """
    log = logging.getLogger(__name__)
    fVec = codecs.open(file, 'r', 'utf-8')

    # Read the number of words in the dictionary and the embedding size
    if hasHeader:
        nmWords, embeddingSizeStr = fVec.readline().strip().split(" ")
        embeddingSize = int(embeddingSizeStr)
    else:
        embeddingSize = None

    lexicon = Lexicon(unknownSymbol, lexiconName)

    # The empty list at index 0 is a placeholder for the unknown symbol's
    # vector. It is replaced later, either by a vector found in the file or
    # by a randomly generated one.
    vectors = [[]]
    nmEmptyWords = 0

    for line in fVec:
        splitLine = line.rstrip().split(u' ')
        word = splitLine[0]

        if len(word) == 0:
            log.warning(
                "Found an empty string in the embedding file. "
                "This vector will be thrown out.")
            nmEmptyWords += 1
            continue

        vec = [float(num) for num in splitLine[1:]]

        if word == unknownSymbol:
            if len(vectors[0]) != 0:
                raise Exception("The unknown symbol was already inserted.")
            vectors[0] = vec
        else:
            lexicon.put(word)
            vectors.append(vec)

    expected_size = lexicon.getLen() - 1 + nmEmptyWords

    if len(vectors[0]) == 0:
        # The file did not provide a vector for the unknown symbol:
        # generate a random one.
        if embeddingSize is None:
            embeddingSize = len(vectors[-1])
        vectors[0] = generateVector(embeddingSize)
        expected_size += 1

    # Sanity check: with a header, the declared word count must match the
    # number of vectors read (counting discarded empty words).
    if hasHeader:
        if int(nmWords) != expected_size:
            raise Exception(
                "The size of the lexicon is different from the number of vectors")

    if paddingSym is None:
        paddingIdx = None
    else:
        if not lexicon.exist(paddingSym):
            paddingIdx = lexicon.put(paddingSym)
            # len(vectors[0]) is used instead of embeddingSize so this also
            # works when there was no header and the unknown vector came
            # from the file itself (embeddingSize would still be None then).
            vectors.append([0.0] * len(vectors[0]))
        else:
            paddingIdx = lexicon.getLexiconIndex(paddingSym)

    fVec.close()

    return lexicon, Embedding(lexicon, vectors, paddingIdx=paddingIdx)
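
# Usage sketch (illustrative). A headerless word2vec-style text file read
# by fromFile looks like this, one word followed by its vector components
# per line (the contents are made up):
#
#     the 0.1 0.2 0.3 0.4
#     UUUKNNN 0.0 0.1 0.0 0.1
#     cat 0.5 0.4 0.3 0.2
#
# The file name and padding symbol below are hypothetical; in the original
# module this would be invoked as Embedding.fromFile, mirroring
# load_embedding above.
if __name__ == "__main__":
    lexicon, embedding = fromFile("vectors.txt", "UUUKNNN",
                                  hasHeader=False, paddingSym="</s>")
    print(lexicon.getLen(), "entries in the lexicon")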