def load_vectors_from_csv(self, fname, vocab_size=973265, vector_size=100):
        print("Loading vectors from file: {0}".format(fname))
        result = keyedvectors.KeyedVectors(vector_size)
        result.syn0 = zeros((vocab_size, vector_size), dtype=REAL)
        result.vector_size = vector_size
        counts = None

        def add_word(word, weights):
            word_id = len(result.vocab)
            if word in result.vocab:
                print("duplicate word '%s' in %s, ignoring all but first",
                      word, fname)
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                result.vocab[word] = keyedvectors.Vocab(
                    index=word_id, count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = keyedvectors.Vocab(index=word_id,
                                                        count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                print("vocabulary file is incomplete: '%s' is missing", word)
                result.vocab[word] = keyedvectors.Vocab(index=word_id,
                                                        count=None)
            result.syn0[word_id] = weights
            result.index2word.append(word)

        with codecs.open(fname, "r", "utf-8") as fin:
            i = 0
            for line in fin:
                i += 1
                if i == 1:  # skip the header line
                    continue
                parts = line.strip().split(",")
                word, weights = parts[1], [REAL(x) for x in parts[2:]]
                add_word(word, weights)
                if i % 100000 == 0:
                    print(i, "word vectors loaded so far ...")
        print(i - 1, "word vectors loaded!")
        return result
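For reference, a minimal self-contained sketch of the row layout this CSV loader expects: column 0 a row index, column 1 the word, then `vector_size` float components. The sample line below is made up.

import numpy as np

REAL = np.float32

# A made-up row in the "<index>,<word>,<c1>,...,<cN>" layout parsed above.
sample = "0,king,0.12,-0.43,0.88"
parts = sample.strip().split(",")
word, weights = parts[1], [REAL(x) for x in parts[2:]]
print(word, np.asarray(weights))  # king [ 0.12 -0.43  0.88]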
def _load_word2vec_format(cls,
                          fname,
                          fvocab=None,
                          binary=False,
                          encoding='utf8',
                          unicode_errors='strict',
                          limit=None,
                          datatype=REAL):
    """Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    Parameters
    ----------
    fname : str
        The file path to the saved word2vec-format file.
    fvocab : str
        Optional file path to the vocabulary. Word counts are read from `fvocab`,
        if set (this is the file generated by the `-save-vocab` flag of the original C tool).
    binary : bool
        If True, the data is in the binary word2vec format.
    encoding : str
        If you trained the C model using non-utf8 encoding for words, specify that
        encoding in `encoding`.
    unicode_errors : str
        default 'strict', is a string suitable to be passed as the `errors`
        argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
        file may include word tokens truncated in the middle of a multibyte unicode character
        (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.
    limit : int
        Sets a maximum number of word-vectors to read from the file. The default,
        None, means read all.
    datatype : :class:`numpy.float*`
        (Experimental) Can coerce dimensions to a non-default float type (such
        as np.float16) to save memory. (Such types may result in much slower bulk operations
        or incompatibility with optimized routines.)

    Returns
    -------
    :obj:`cls`
        Returns the loaded model as an instance of :class:`cls`.

    """
    from gensim.models.keyedvectors import Vocab
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s", fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s", fname)
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        # throws for invalid file format
        vocab_size, vector_size = (int(x) for x in header.split())
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls(vector_size)
        result.vector_size = vector_size
        result.vectors = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            word_id = len(result.vocab)
            if word in result.vocab:
                logger.warning(
                    "duplicate word '%s' in %s, ignoring all but first", word,
                    fname)
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id,
                                           count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                logger.warning(
                    "vocabulary file is incomplete: '%s' is missing", word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.vectors[word_id] = weights
            result.index2word.append(word)

        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError(
                            "unexpected end of input; is count incorrect or file otherwise damaged?"
                        )
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word),
                                        encoding=encoding,
                                        errors=unicode_errors)
                weights = fromstring(fin.read(binary_len), dtype=REAL)
                add_word(word, weights)
        else:
            for line_no in xrange(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError(
                        "unexpected end of input; is count incorrect or file otherwise damaged?"
                    )
                parts = utils.to_unicode(line.rstrip(),
                                         encoding=encoding,
                                         errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError(
                        "invalid vector on line %s (is this really the text format?)"
                        % line_no)
                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                add_word(word, weights)
    if result.vectors.shape[0] != len(result.vocab):
        logger.info(
            "duplicate words detected, shrinking matrix size from %i to %i",
            result.vectors.shape[0], len(result.vocab))
        result.vectors = ascontiguousarray(result.vectors[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.vectors.shape

    logger.info("loaded %s matrix from %s", result.vectors.shape, fname)
    return result
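This private helper backs gensim's public `KeyedVectors.load_word2vec_format`; a typical call looks like the following sketch (the file path is a placeholder).

from gensim.models import KeyedVectors

# "vectors.bin" is a placeholder path; limit caps how many vectors are read.
kv = KeyedVectors.load_word2vec_format("vectors.bin", binary=True, limit=500000)
print(kv.most_similar("king", topn=3))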
Example #3
    def load_word2vec_format(
            cls,
            fname,
            fvocab=None,
            binary=False,
            encoding='utf8',
            unicode_errors='strict',
            limit=None,
            datatype=REAL):
        """
        Load the input-hidden weight matrix from the original C word2vec-tool format.
        Note that the information stored in the file is incomplete (the binary tree is missing),
        so while you can query for word similarity etc., you cannot continue training
        with a model loaded this way.
        `binary` is a boolean indicating whether the data is in binary word2vec format.
        Word counts are read from `fvocab` filename, if set (this is the file generated
        by `-save-vocab` flag of the original C tool).
        If you trained the C model using non-utf8 encoding for words, specify that
        encoding in `encoding`.
        `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors`
        argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
        file may include word tokens truncated in the middle of a multibyte unicode character
        (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.
        `limit` sets a maximum number of word-vectors to read from the file. The default,
        None, means read all.
        `datatype` (experimental) can coerce dimensions to a non-default float type (such
        as np.float16) to save memory. (Such types may result in much slower bulk operations
        or incompatibility with optimized routines.)
        """
        counts = None
        if fvocab is not None:
            print("loading word counts from %s" % fvocab)
            counts = {}
            with utils.smart_open(fvocab) as fin:
                for line in fin:
                    word, count = utils.to_unicode(line).strip().split()
                    counts[word] = int(count)

        print("loading projection weights from %s" % fname)
        with utils.smart_open(fname) as fin:
            header = utils.to_unicode(fin.readline(), encoding=encoding)
            # throws for invalid file format
            vocab_size, vector_size = (int(x) for x in header.split())
            if limit:
                vocab_size = min(vocab_size, limit)
            result = cls()
            result.vector_size = vector_size
            result.syn0 = zeros((vocab_size, vector_size), dtype=datatype)

            def add_word(word, weights):
                word_id = len(result.vocab)
                # print("word id: %d, word: %s, weights: %s" % (word_id, word, weights))
                if word in result.vocab:
                    print(
                        "duplicate word '%s' in %s, ignoring all but first" %
                        (word, fname))
                    return
                if counts is None:
                    # most common scenario: no vocab file given. just make up
                    # some bogus counts, in descending order
                    result.vocab[word] = Vocab(
                        index=word_id, count=vocab_size - word_id)
                elif word in counts:
                    # use count from the vocab file
                    result.vocab[word] = Vocab(
                        index=word_id, count=counts[word])
                else:
                    # vocab file given, but word is missing -- set count to
                    # None (TODO: or raise?)
                    print(
                        "vocabulary file is incomplete: '%s' is missing" %
                        word)
                    result.vocab[word] = Vocab(index=word_id, count=None)
                result.syn0[word_id] = weights
                result.index2word.append(word)

            if binary:
                binary_len = dtype(REAL).itemsize * vector_size
                for _ in xrange(vocab_size):
                    # mixed text and binary: read text first, then binary
                    word = []
                    while True:
                        ch = fin.read(1)
                        if ch == b' ':
                            break
                        if ch == b'':
                            raise EOFError(
                                "unexpected end of input; is count incorrect or file otherwise damaged?")
                        # ignore newlines in front of words (some binary files
                        # have)
                        if ch != b'\n':
                            word.append(ch)
                    word = utils.to_unicode(
                        b''.join(word), encoding=encoding, errors=unicode_errors)
                    weights = fromstring(fin.read(binary_len), dtype=REAL)
                    add_word(word, weights)
            else:
                for line_no in xrange(vocab_size):
                    line = fin.readline()
                    if line == b'':
                        raise EOFError(
                            "unexpected end of input; is count incorrect or file otherwise damaged?")
                    parts = utils.to_unicode(
                        line.rstrip(),
                        encoding=encoding,
                        errors=unicode_errors).split(" ")
                    if len(parts) != vector_size + 1:
                        raise ValueError(
                            "invalid vector on line %s (is this really the text format?)" %
                            line_no)
                    word, weights = parts[0], [REAL(x) for x in parts[1:]]
                    add_word(word, weights)
        if result.syn0.shape[0] != len(result.vocab):
            print(
                "duplicate words detected, shrinking matrix size from %i to %i" %
                (result.syn0.shape[0], len(result.vocab)))
            result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)])
        assert (len(result.vocab), vector_size) == result.syn0.shape
        # Build a KDTree over the vectors for fast nearest-neighbour queries:
        # http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KDTree.html#sklearn.neighbors.KDTree
        result.kdt = KDTree(result.syn0, leaf_size=10, metric="euclidean")
        print("loaded %s matrix from %s" % (result.syn0.shape, fname))
        return result
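Once loaded, the attached KDTree answers nearest-neighbour queries directly. A brief usage sketch, assuming `result` came from the loader above and actually contains the (arbitrary) query word:

# sklearn's KDTree.query takes a 2-D array and returns (distances, indices).
word_id = result.vocab["king"].index  # "king" is an arbitrary example word
dist, ind = result.kdt.query(result.syn0[word_id].reshape(1, -1), k=5)
print([result.index2word[i] for i in ind[0]], dist[0])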
Example #4
File: generic.py Project: zxlzr/REL
    def load_word2emb(self,
                      file_name,
                      batch_size=5000,
                      limit=np.inf,
                      reset=False):
        self.seen = set()
        if reset:
            self.clear()

        batch = []
        start = time()

        # Loop over file.
        with utils.open(file_name, "rb") as fin:
            # Read the vocabulary and vector sizes from the header.
            header = utils.to_unicode(fin.readline(), encoding="utf-8")
            # throws for invalid file format
            vocab_size, vector_size = (int(x) for x in header.split())
            if limit < vocab_size:
                vocab_size = limit

            for line_no in range(vocab_size):
                line = fin.readline()
                if line == b"":
                    raise EOFError(
                        "unexpected end of input; is count incorrect or file otherwise damaged?"
                    )

                parts = utils.to_unicode(line.rstrip(),
                                         encoding="utf-8",
                                         errors="strict").split(" ")

                if len(parts) != vector_size + 1:
                    raise ValueError(
                        "invalid vector on line %s (is this really the text format?)"
                        % line_no)

                word, vec = parts[0], np.array([REAL(x) for x in parts[1:]])

                if word in self.seen:
                    continue

                self.seen.add(word)
                batch.append((word, vec))

                if "ENTITY/" in word:
                    self.avg_cnt["entity"]["cnt"] += 1
                    self.avg_cnt["entity"]["sum"] += vec
                else:
                    self.avg_cnt["word"]["cnt"] += 1
                    self.avg_cnt["word"]["sum"] += vec

                if len(batch) == batch_size:
                    print("Another {}".format(batch_size), line_no,
                          time() - start)
                    start = time()
                    self.insert_batch_emb(batch)
                    batch.clear()

        for x in ['entity', 'word']:
            if self.avg_cnt[x]["cnt"] > 0:
                batch.append((
                    "#{}/UNK#".format(x.upper()),
                    self.avg_cnt[x]["sum"] / self.avg_cnt[x]["cnt"],
                ))
                print("Added #{}/UNK#".format(x.upper()))

        if batch:
            self.insert_batch_emb(batch)
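The `#ENTITY/UNK#` and `#WORD/UNK#` rows appended at the end are simply the running averages accumulated in `avg_cnt`. A self-contained sketch of that bookkeeping, with made-up vectors:

import numpy as np

# Running-average bookkeeping mirroring the avg_cnt logic above.
avg = {"cnt": 0, "sum": np.zeros(3)}
for vec in (np.array([1.0, 0.0, 2.0]), np.array([3.0, 2.0, 0.0])):
    avg["cnt"] += 1
    avg["sum"] += vec
unk_vector = avg["sum"] / avg["cnt"]
print(unk_vector)  # [2. 1. 1.]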
Example #5
import numpy as np
from numpy import ascontiguousarray, dtype

REAL = np.float32  # the original C word2vec tool stores 4-byte floats


def load_vector_bin(fname, binary=True):
    encoding = 'utf-8'
    unicode_errors = 'strict'

    with open(fname, 'rb') as fin:
        header = str(fin.readline(), encoding=encoding)
        # throws for invalid file format
        vocab_size, vector_size = (int(x) for x in header.split())
        vectors = np.zeros((vocab_size, vector_size))

        vocab = {}

        def add_word(word, weights):
            word_id = len(vocab)
            if word in vocab:
                print("duplicate word '%s' in %s, ignoring all but first",
                      word, fname)
                return
            vocab[word] = word_id
            vectors[word_id] = weights

        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in range(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError(
                            "unexpected end of input; is count incorrect or file otherwise damaged?"
                        )
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                word = str(b''.join(word),
                           encoding=encoding,
                           errors=unicode_errors)

                # np.frombuffer replaces the deprecated binary mode of np.fromstring
                weights = np.frombuffer(fin.read(binary_len), dtype=REAL)
                add_word(word, weights)
        else:
            for line_no in range(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError(
                        "unexpected end of input; is count incorrect or file otherwise damaged?"
                    )
                parts = str(line.rstrip(),
                            encoding=encoding,
                            errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError(
                        "invalid vector on line %s (is this really the text format?)"
                        % line_no)
                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                add_word(word, weights)
    if vectors.shape[0] != len(vocab):
        print("duplicate words detected, shrinking matrix size from %i to %i",
              vectors.shape[0], len(vocab))
        vectors = ascontiguousarray(vectors[:len(vocab)])
    assert (len(vocab), vector_size) == vectors.shape

    print("loaded %s matrix from %s", vectors.shape, fname)
    return vectors
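A minimal usage sketch ("vectors.bin" is a placeholder path). Note that the word-to-row `vocab` mapping is built locally but never returned, so callers get only the raw matrix; returning `(vocab, vectors)` would be the obvious extension if the mapping is needed.

vectors = load_vector_bin("vectors.bin", binary=True)
print(vectors.shape)  # (vocab_size, vector_size)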