Example #1
def save_word2vec(fname, vocab, vectors, binary=False):
    """Store the weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.
    Parameters
    ----------
    fname : str
        The file path used to save the vectors in.
    vocab : dict
        The vocabulary of words.
    vectors : numpy.array
        The vectors to be stored.
    binary : bool, optional
        If True, the data will be saved in binary word2vec format, else in plain text.
    """
    if not (vocab or vectors):
        raise RuntimeError("no input")
    vector_size = vectors.shape[1]
    logging.info("storing %sx%s projection weights into %s", len(vocab),
                 vector_size, fname)
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (len(vocab), vector_size)))
        for el in sorted(vocab.keys()):
            row = vectors[vocab[el]]
            if binary:
                row = row.astype(np.float32)
                fout.write(utils.to_utf8(el) + b" " + row.tostring())
            else:
                fout.write(
                    utils.to_utf8("%s %s\n" %
                                  (el, ' '.join(repr(val) for val in row))))
Example #2
def write_log(f, commit, changes):
    for change in changes:
        f.write(change.type.upper()[0] + '      ')
        path = '/dev/null'
        if change.old.path and change.new.path:
            path = change.new.path
        elif change.old.path:
            path = change.old.path
        elif change.new.path:
            path = change.new.path

        f.write(to_utf8(path.strip() + '\n'))

    if changes:
        f.write('\n')

    f.write('commit %s\n' % commit.id)
    raw_lines = commit.as_raw_string().splitlines()

    # tab the message like git
    tab = False
    for line in raw_lines:
        if not tab and not line:
            tab = True

        if tab and line:
            f.write('    ')

        if line:
            f.write(to_utf8(line))

        f.write('\n')

    f.write('\n')
Example #3
def save_mini_fasttext_format(model, fname, words_dict, binary=False):
    """
    Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.

     `fname` is the file used to save the vectors in
     `words_dict` is the collection of words whose vectors should be written
     `binary` is an optional boolean indicating whether the data is to be saved
     in binary word2vec format (default: False)
    """

    # count only the words that will actually be written, so the header matches the body
    total_vec = sum(1 for word in model.vocab if word in words_dict)
    vector_size = model.syn0.shape[1]
    print("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
    assert (len(model.vocab), vector_size) == model.syn0.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        for word, vocab in sorted(iteritems(model.vocab), key=lambda item: -item[1].count):
            if word in words_dict:
                row = model.syn0[vocab.index]
                if binary:
                    fout.write(utils.to_utf8(word) + b" " + row.tostring())
                else:
                    fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
Example #4
def word2vec2tsv(word2vec_model, tensor_filename, vocab=None, binary=False):
    """Only visualize the input vocab (or the whole model vocabulary if none is given).
    Keyword Arguments:
    model -- the word2vec model whose vectors are exported
    vocab -- the list of tokens to visualize
    """
    from gensim.utils import to_utf8
    model = word2vec_model
    if not os.path.exists(tensor_filename):
        logger.info("create dir %s" % tensor_filename)
        os.makedirs(tensor_filename)

    outfiletsv = complete_dir_path(tensor_filename) + 'tensor.tsv'
    outfiletsvmeta = complete_dir_path(tensor_filename) + 'metadata.tsv'

    if vocab:
        # partition rather than removing items from the list while iterating over it
        absent_vocab = [token for token in vocab if token not in model]
        vocab = [token for token in vocab if token in model]
        logger.debug("absent vocabulary in the model %s" % absent_vocab)
    else:
        vocab = model.index2word

    # write tensor values and metadata; binary mode keeps the utf-8 bytes intact
    with open(outfiletsv, 'wb') as file_vector:
        with open(outfiletsvmeta, 'wb') as file_metadata:
            for word in vocab:
                file_metadata.write(to_utf8(word) + to_utf8('\n'))
                vector_row = '\t'.join(str(x) for x in model[word])
                file_vector.write(to_utf8(vector_row + '\n'))
    return vocab
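The tensor.tsv/metadata.tsv pair written above matches the layout the TensorBoard Embedding Projector loads. A hedged call sketch (the model and tokens are hypothetical):

kept = word2vec2tsv(model, "projector_out", vocab=["apple", "banana"])
print("exported %d tokens" % len(kept))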
Example #5
def review_cleaner(corpus, logging_filename, preprocessed_file,
                   review_tag_field, review_field, review_id, delimiter):
    logger = logging.getLogger(__name__)
    logging.basicConfig(filename=logging_filename,
                        filemode='w',
                        format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Opening the files for reading
    infile = smart_open(corpus, 'rb')
    outfile = smart_open(preprocessed_file, 'wb')

    # Calling the Tokenizer class
    tok = Tokenizer(preserve_case=False)

    # Iterating over the review file
    for line_num, line in enumerate(infile):
        if line_num % 100 == 0:
            logger.info("processing line number {}".format(line_num))

        line = to_unicode(line)
        cols_list = line.strip().split(delimiter)

        # sanity check on the expected column count
        if len(cols_list) != 11:
            logger.info("incorrect format of line {}".format(line_num))
            continue

        # reading the input_id, review_tag and review
        input_id = cols_list[0:review_id + 1]
        input_tag = cols_list[review_tag_field]
        review = cols_list[review_field]

        # tokenizing the tag and review
        tok_tag = tok.tokenize(input_tag)
        tok_review = tok.tokenize(review)

        new_tag = " ".join(tok_tag)
        new_review = " ".join(tok_review)

        # replace 2 or more consecutive periods with a single period
        regex = r"\.{2,}"
        replace_regex = r"."

        new_tag = re.sub(regex, replace_regex, new_tag)
        new_review = re.sub(regex, replace_regex, new_review)

        input_id = to_unicode(delimiter.join(input_id))

        # writing the review tag to the output file
        outfile.write(to_utf8(delimiter.join([input_id, new_tag])))
        outfile.write(b"\n")

        # writing the review to the output file
        outfile.write(to_utf8(delimiter.join([input_id, new_review])))
        outfile.write(b"\n")

    infile.close()
    outfile.close()
Example #6
    def write_headers(self, num_docs, num_terms, num_nnz):
        """Write headers to file.

        Parameters
        ----------
        num_docs : int
            Number of documents in corpus.
        num_terms : int
            Number of terms in corpus.
        num_nnz : int
            Number of non-zero elements in corpus.

        """
        self.fout.write(MmWriter.HEADER_LINE)

        if num_nnz < 0:
            # we don't know the matrix shape/density yet, so only log a general line
            logger.info("saving sparse matrix to %s", self.fname)
            self.fout.write(utils.to_utf8(' ' * 50 + '\n'))  # 48 digits must be enough for everybody
        else:
            logger.info(
                "saving sparse %sx%s matrix with %i non-zero entries to %s",
                num_docs, num_terms, num_nnz, self.fname
            )
            self.fout.write(utils.to_utf8('%s %s %s\n' % (num_docs, num_terms, num_nnz)))
        self.last_docno = -1
        self.headers_written = True
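For reference, the two header lines this method emits look like the following (the first is MmWriter.HEADER_LINE; the second is either 50 reserved blanks to be patched once the true counts are known, or the actual counts, shown here for a hypothetical 3x5 matrix with 4 non-zero entries):

%%MatrixMarket matrix coordinate real general
3 5 4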
Example #7
def save_word2vec_format(vectors, vocab, save_filename, fvocab=None, binary=True):
    """
    Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.
     `vectors` is the numpy 2d array of context vectors to save.
     `vocab` is the vocabulary for the vectors.
     `save_filename` is the file used to save the vectors in
     `fvocab` is an optional file used to save the vocabulary
     `binary` is an optional boolean indicating whether the data is to be saved
     in binary word2vec format (default: True)
    """
    if fvocab is not None:
        # logger.info("storing vocabulary in %s" % (fvocab))
        with utils.smart_open(fvocab, 'wb') as vout:
            # use a distinct name so the `vocab` dict isn't shadowed; it is iterated again below
            for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, vocab_.count)))
    # logger.info("storing %sx%s projection weights into %s" % (len(self.vocab), self.vector_size, fname))
    # assert (len(vocab), self.vector_size) == self.syn0.shape
    with utils.smart_open(save_filename, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % vectors.shape))
        # store in sorted order: most frequent words at the top
        for word, voc in sorted(iteritems(vocab), key=lambda item: -item[1].count):
            row = vectors[voc.index]
            if binary:
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
Example #8
    def save_word2vec_format(self, fname, fvocab=None, binary=False):
        """
        Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.

        """
        if fvocab is not None:
            logger.info("Storing vocabulary in %s" % (fvocab))
            with utils.smart_open(fvocab, 'wb') as vout:
                for word, vocab in sorted(iteritems(self.vocab),
                                          key=lambda item: -item[1].count):
                    vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
        logger.info("storing %sx%s projection weights into %s" %
                    (len(self.vocab), self.layer1_size, fname))
        assert (len(self.vocab), self.layer1_size) == self.syn0.shape
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8("%s %s\n" % self.syn0.shape))
            # store in sorted order: most frequent words at the top
            for word, vocab in sorted(iteritems(self.vocab),
                                      key=lambda item: -item[1].count):
                row = self.syn0[vocab.index]
                if binary:
                    fout.write(utils.to_utf8(word) + b" " + row.tostring())
                else:
                    fout.write(
                        utils.to_utf8("%s %s\n" %
                                      (word, ' '.join("%f" % val
                                                      for val in row))))
Example #9
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the List-of-words format.

        This function is automatically called by `LowCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in List-Of-Words format into %s" % fname)
        truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8('%i\n' % len(corpus)))
            for doc in corpus:
                words = []
                for wordid, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] *
                                 int(value))
                offsets.append(fout.tell())
                fout.write(utils.to_utf8('%s\n' % ' '.join(words)))

        if truncated:
            logger.warning(
                "List-of-words format can only save vectors with "
                "integer elements; %i float entries were truncated to integer value"
                % truncated)
        return offsets
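For a toy corpus of two bag-of-words documents, the resulting List-of-Words file looks like this (the first line is the document count; each word is repeated int(value) times):

2
human computer computer
graph trees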
Example #10
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the LDA-C format.

        There are actually two files saved: `fname` and `fname.vocab`, where
        `fname.vocab` is the vocabulary file.

        This function is automatically called by `BleiCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        else:
            num_terms = 1 + max([-1] + id2word.keys())

        logger.info("storing corpus in Blei's LDA-C format into %s" % fname)
        with utils.smart_open(fname, 'wb') as fout:
            offsets = []
            for doc in corpus:
                doc = list(doc)
                offsets.append(fout.tell())
                parts = ["%i:%s" % p for p in doc if abs(p[1]) > 1e-7]
                fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

        # write out vocabulary, in a format compatible with Blei's topics.py script
        fname_vocab = fname + '.vocab'
        logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in xrange(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        return offsets
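As a worked example of the LDA-C lines this produces, a document [(0, 1), (2, 2), (4, 1)] is written as the line below, where the leading 3 is the number of entries and fname.vocab holds one word per id:

3 0:1 2:2 4:1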
Example #11
    def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None):
        """
        Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.

         `fname` is the file used to save the vectors in
         `fvocab` is an optional file used to save the vocabulary
         `binary` is an optional boolean indicating whether the data is to be saved
         in binary word2vec format (default: False)
         `total_vec` is an optional parameter to explicitly specify total no. of vectors
         (in case word vectors are appended with document vectors afterwards)

        """
        if total_vec is None:
            total_vec = len(self.vocab)
        vector_size = self.syn0.shape[1]
        if fvocab is not None:
            logger.info("storing vocabulary in %s", fvocab)
            with utils.smart_open(fvocab, 'wb') as vout:
                for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                    vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
        logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
        assert (len(self.vocab), vector_size) == self.syn0.shape
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
            # store in sorted order: most frequent words at the top
            for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                row = self.syn0[vocab.index]
                if binary:
                    fout.write(utils.to_utf8(word) + b" " + row.tostring())
                else:
                    fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
Example #12
    def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None):
        """
        Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.

         `fname` is the file used to save the vectors in
         `fvocab` is an optional file used to save the vocabulary
         `binary` is an optional boolean indicating whether the data is to be saved
         in binary word2vec format (default: False)
         `total_vec` is an optional parameter to explicitly specify total no. of vectors
         (in case word vectors are appended with document vectors afterwards)

        """
        if total_vec is None:
            total_vec = len(self.vocab)
        vector_size = self.syn0.shape[1]
        if fvocab is not None:
            logger.info("storing vocabulary in %s", fvocab)
            with utils.open(fvocab, 'wb') as vout:
                for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                    vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
        logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
        assert (len(self.vocab), vector_size) == self.syn0.shape
        with utils.open(fname, 'wb') as fout:
            fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
            # store in sorted order: most frequent words at the top
            for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                row = self.syn0[vocab.index]
                if binary:
                    fout.write(utils.to_utf8(word) + b" " + row.tostring())
                else:
                    fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
Example #13
    def save_to_txt(self, path, node_list_path):
        columns = ["entity_id", "cover_text"]
        self.vertext = pd.read_csv(node_list_path,
                                   sep='\t',
                                   index_col=0,
                                   header=None,
                                   comment='#')
        self.vertext.columns = columns
        vector_size = self.model.wv.vectors.shape[1]
        total_vec = len(self.model.wv.vocab)
        with utils.smart_open(path + "/" + self.name + ".txt", 'wb') as fout:
            fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
            # store in sorted order: most frequent words at the top
            for word, vocab_ in sorted(iteritems(self.model.wv.vocab),
                                       key=lambda item: -item[1].count):

                word_label = self.vertext.loc[self.vertext['entity_id'] ==
                                              word].values
                if (len(word_label) > 0):
                    word_label = word_label[0][1]
                    row = self.model.wv.vectors[vocab_.index]
                    fout.write(
                        utils.to_utf8("%s %s\n" %
                                      (word_label, ' '.join("%f" % val
                                                            for val in row))))
Example #14
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the List-of-words format.

        This function is automatically called by `LowCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in List-Of-Words format into %s" % fname)
        truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8('%i\n' % len(corpus)))
            for doc in corpus:
                words = []
                for wordid, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] * int(value))
                offsets.append(fout.tell())
                fout.write(utils.to_utf8('%s\n' % ' '.join(words)))

        if truncated:
            logger.warning("List-of-words format can only save vectors with "
                            "integer elements; %i float entries were truncated to integer value" %
                            truncated)
        return offsets
Example #15
def save_word2vec_format(fname, vocab, vectors, binary=False):
    """Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.
        Parameters
        ----------
        fname : str
            The file path used to save the vectors in
        vocab : dict
            The vocabulary of words with their ranks
        vectors : numpy.array
            The vectors to be stored
        binary : bool
            If True, the data will be saved in binary word2vec format, else in plain text.
        """
    if not (vocab or vectors):
        raise RuntimeError('no input')
    total_vec = len(vocab)
    vector_size = vectors.shape[1]
    print('storing %dx%d projection weights into %s' %
          (total_vec, vector_size, fname))
    assert (len(vocab), vector_size) == vectors.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8('%s %s\n' % (total_vec, vector_size)))
        position = 0
        for element in sorted(vocab, key=lambda word: vocab[word]):
            row = vectors[position]
            if binary:
                row = row.astype(real)
                fout.write(utils.to_utf8(element) + b" " + row.tostring())
            else:
                fout.write(
                    utils.to_utf8('%s %s\n' %
                                  (element, ' '.join(repr(val)
                                                     for val in row))))
            position += 1
Example #16
    def save_as_text(self, fname, sort_by_word=True):
        """Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.

        Parameters
        ----------
        fname : str
            Path to output file.
        sort_by_word : bool, optional
            Sort words in lexicographical order before writing them out?

        Notes
        -----
        Format::

            num_docs
            id_1[TAB]word_1[TAB]document_frequency_1[NEWLINE]
            id_2[TAB]word_2[TAB]document_frequency_2[NEWLINE]
            ....
            id_k[TAB]word_k[TAB]document_frequency_k[NEWLINE]

        This text format is great for corpus inspection and debugging. As plaintext, it's also easily portable
        to other tools and frameworks. For better performance and to store the entire object state,
        including collected corpus statistics, use :meth:`~gensim.corpora.dictionary.Dictionary.save` and
        :meth:`~gensim.corpora.dictionary.Dictionary.load` instead.

        See Also
        --------
        :meth:`~gensim.corpora.dictionary.Dictionary.load_from_text`
            Load :class:`~gensim.corpora.dictionary.Dictionary` from text file.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>> from gensim.test.utils import get_tmpfile
            >>>
            >>> tmp_fname = get_tmpfile("dictionary")
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>>
            >>> dct = Dictionary(corpus)
            >>> dct.save_as_text(tmp_fname)
            >>>
            >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
            >>> assert dct.token2id == loaded_dct.token2id

        """
        logger.info("saving dictionary mapping to %s", fname)
        with utils.open(fname, 'wb') as fout:
            numdocs_line = "%d\n" % self.num_docs
            fout.write(utils.to_utf8(numdocs_line))
            if sort_by_word:
                for token, tokenid in sorted(iteritems(self.token2id)):
                    line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                    fout.write(utils.to_utf8(line))
            else:
                for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
                    line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                    fout.write(utils.to_utf8(line))
Example #17
    def save(self,
             np2vec_model_file="np2vec.model",
             binary=False,
             word2vec_format=True):
        """
        Save the np2vec model.

        Args:
            np2vec_model_file (str): the file containing the np2vec model to load
            binary (bool): boolean indicating whether the np2vec model to load is in binary format
            word2vec_format(bool): boolean indicating whether to save the model in original
            word2vec format.
        """
        if self.word_embedding_type == "fasttext" and self.word_ngrams == 1:
            if not binary:
                logger.error(
                    "if word_embedding_type is fasttext and word_ngrams is 1, "
                    "binary should be set to True.")
                sys.exit(1)  # exit with a non-zero status when the configuration is invalid
            # not relevant to prune fasttext subword model
            self.model.save(np2vec_model_file)
        else:
            # prune non NP terms
            if self.prune_non_np:
                logger.info("pruning np2vec model")
                total_vec = 0
                vector_size = self.model.vector_size
                for word in self.model.wv.vocab.keys():
                    if self.is_marked(word) and len(word) > 1:
                        total_vec += 1
                logger.info(
                    "storing %sx%s projection weights for NP's into %s",
                    total_vec,
                    vector_size,
                    np2vec_model_file,
                )
                with smart_open(np2vec_model_file, "wb") as fout:
                    fout.write(
                        utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
                    # store NP vectors in sorted order: most frequent NP's at the top
                    for word, vocab in sorted(iteritems(self.model.wv.vocab),
                                              key=lambda item: -item[1].count):
                        if self.is_marked(word) and len(
                                word) > 1:  # discard empty marked np's
                            embedding_vec = self.model.wv.syn0[vocab.index]
                            if binary:
                                fout.write(
                                    utils.to_utf8(word) + b" " +
                                    embedding_vec.tostring())
                            else:
                                fout.write(
                                    utils.to_utf8("%s %s\n" % (word, " ".join(
                                        "%f" % val for val in embedding_vec))))
                if not word2vec_format:
                    # pylint: disable=attribute-defined-outside-init
                    self.model = KeyedVectors.load_word2vec_format(
                        np2vec_model_file, binary=binary)
            if not word2vec_format:
                self.model.save(np2vec_model_file)
Example #18
    def save_as_text(self, fname, sort_by_word=True):
        """Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.

        Parameters
        ----------
        fname : str
            Path to output file.
        sort_by_word : bool, optional
            Sort words in lexicographical order before writing them out?

        Notes
        -----
        Format::

            num_docs
            id_1[TAB]word_1[TAB]document_frequency_1[NEWLINE]
            id_2[TAB]word_2[TAB]document_frequency_2[NEWLINE]
            ....
            id_k[TAB]word_k[TAB]document_frequency_k[NEWLINE]

        This text format is great for corpus inspection and debugging. As plaintext, it's also easily portable
        to other tools and frameworks. For better performance and to store the entire object state,
        including collected corpus statistics, use :meth:`~gensim.corpora.dictionary.Dictionary.save` and
        :meth:`~gensim.corpora.dictionary.Dictionary.load` instead.

        See Also
        --------
        :meth:`~gensim.corpora.dictionary.Dictionary.load_from_text`
            Load :class:`~gensim.corpora.dictionary.Dictionary` from text file.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>> from gensim.test.utils import get_tmpfile
            >>>
            >>> tmp_fname = get_tmpfile("dictionary")
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>>
            >>> dct = Dictionary(corpus)
            >>> dct.save_as_text(tmp_fname)
            >>>
            >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
            >>> assert dct.token2id == loaded_dct.token2id

        """
        logger.info("saving dictionary mapping to %s", fname)
        with utils.smart_open(fname, 'wb') as fout:
            numdocs_line = "%d\n" % self.num_docs
            fout.write(utils.to_utf8(numdocs_line))
            if sort_by_word:
                for token, tokenid in sorted(iteritems(self.token2id)):
                    line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                    fout.write(utils.to_utf8(line))
            else:
                for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
                    line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                    fout.write(utils.to_utf8(line))
Example #19
    def save_as_text(self, fname, sort_by_word=True):
        """Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.

        Parameters
        ----------
        fname : str
            Path to output file.
        sort_by_word : bool, optional
            if True - sort by word in lexicographical order.

        Notes
        -----
        Format::

            num_docs
            id_1[TAB]word_1[TAB]document_frequency_1[NEWLINE]
            id_2[TAB]word_2[TAB]document_frequency_2[NEWLINE]
            ....
            id_k[TAB]word_k[TAB]document_frequency_k[NEWLINE]

        Warnings
        --------
        Text format should be used for corpus inspection. Use :meth:`~gensim.corpora.dictionary.Dictionary.save` and
        :meth:`~gensim.corpora.dictionary.Dictionary.load` to store in binary format (pickle) for better performance.

        See Also
        --------
        :meth:`~gensim.corpora.dictionary.Dictionary.load_from_text`

        Examples
        --------
        >>> from gensim.corpora import Dictionary
        >>> from gensim.test.utils import get_tmpfile
        >>>
        >>> tmp_fname = get_tmpfile("dictionary")
        >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
        >>>
        >>> dct = Dictionary(corpus)
        >>> dct.save_as_text(tmp_fname)
        >>>
        >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
        >>> assert dct.token2id == loaded_dct.token2id

        """
        logger.info("saving dictionary mapping to %s", fname)
        with utils.smart_open(fname, 'wb') as fout:
            numdocs_line = "%d\n" % self.num_docs
            fout.write(utils.to_utf8(numdocs_line))
            if sort_by_word:
                for token, tokenid in sorted(iteritems(self.token2id)):
                    line = "%i\t%s\t%i\n" % (tokenid, token,
                                             self.dfs.get(tokenid, 0))
                    fout.write(utils.to_utf8(line))
            else:
                for tokenid, freq in sorted(iteritems(self.dfs),
                                            key=lambda item: -item[1]):
                    line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                    fout.write(utils.to_utf8(line))
Example #20
    def save_word_embedding(self, words, embeddings, out_path):
        with smart_open(out_path, 'wb') as fout:
            fout.write(to_utf8("%s %s\n" % embeddings.shape))

            for index, word in enumerate(words):
                row = embeddings[index]
                fout.write(
                    to_utf8("%s %s\n" % (word, ' '.join("%f" % val
                                                        for val in row))))
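A hedged usage sketch for the method above (toy arrays; `trainer` stands in for whatever object carries this method):

import numpy as np

words = ["alpha", "beta", "gamma"]
embeddings = np.random.rand(3, 4).astype(np.float32)  # 3 words, 4 dimensions
trainer.save_word_embedding(words, embeddings, "toy_embeddings.txt")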
Example #21
    def save_as_text(self, fname, sort_by_word=True):
        """Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.

        Parameters
        ----------
        fname : str
            Path to output file.
        sort_by_word : bool, optional
            if True - sort by word in lexicographical order.

        Notes
        -----
        Format::

            num_docs
            id_1[TAB]word_1[TAB]document_frequency_1[NEWLINE]
            id_2[TAB]word_2[TAB]document_frequency_2[NEWLINE]
            ....
            id_k[TAB]word_k[TAB]document_frequency_k[NEWLINE]

        Warnings
        --------
        Text format should be used for corpus inspection. Use :meth:`~gensim.corpora.dictionary.Dictionary.save` and
        :meth:`~gensim.corpora.dictionary.Dictionary.load` to store in binary format (pickle) for better performance.

        See Also
        --------
        :meth:`~gensim.corpora.dictionary.Dictionary.load_from_text`

        Examples
        --------
        >>> from gensim.corpora import Dictionary
        >>> from gensim.test.utils import get_tmpfile
        >>>
        >>> tmp_fname = get_tmpfile("dictionary")
        >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
        >>>
        >>> dct = Dictionary(corpus)
        >>> dct.save_as_text(tmp_fname)
        >>>
        >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
        >>> assert dct.token2id == loaded_dct.token2id

        """
        logger.info("saving dictionary mapping to %s", fname)
        with utils.smart_open(fname, 'wb') as fout:
            numdocs_line = "%d\n" % self.num_docs
            fout.write(utils.to_utf8(numdocs_line))
            if sort_by_word:
                for token, tokenid in sorted(iteritems(self.token2id)):
                    line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                    fout.write(utils.to_utf8(line))
            else:
                for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
                    line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                    fout.write(utils.to_utf8(line))
Example #22
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Save a corpus in the LDA-C format.

        Notes
        -----
        There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, float)
            Input corpus in BoW format.
        id2word : dict of (str, str), optional
            Mapping id -> word for `corpus`.
        metadata : bool, optional
            THIS PARAMETER WILL BE IGNORED.

        Returns
        -------
        list of int
            Offsets for each line in file (in bytes).

        """
        if id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        elif id2word:
            num_terms = 1 + max(id2word)
        else:
            num_terms = 0

        logger.info("storing corpus in Blei's LDA-C format into %s", fname)
        with utils.open(fname, 'wb') as fout:
            offsets = []
            for doc in corpus:
                doc = list(doc)
                offsets.append(fout.tell())
                parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
                fout.write(
                    utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

        # write out vocabulary, in a format compatible with Blei's topics.py script
        fname_vocab = utils.smart_extension(fname, '.vocab')
        logger.info("saving vocabulary of %i words to %s", num_terms,
                    fname_vocab)
        with utils.open(fname_vocab, 'wb') as fout:
            for featureid in range(num_terms):
                fout.write(
                    utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        return offsets
Example #23
def save_syn1neg_vectors(model, fname='syn1.vectors', binary=False):
    # Save output vectors

    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % model.syn1neg.shape))
        # store in sorted order: most frequent words at the top
        for word, vocab in sorted(iteritems(model.vocab), key=lambda item: -item[1].count):
            row = model.syn1neg[vocab.index]
            if binary:
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
Example #24
def _save_word2vec_format(fname,
                          vocab,
                          vectors,
                          fvocab=None,
                          binary=False,
                          total_vec=None):
    """Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.

        Parameters
        ----------
        fname : str
            The file path used to save the vectors in
        vocab : dict
            The vocabulary of words
        vectors : numpy.array
            The vectors to be stored
        fvocab : str
            Optional file path used to save the vocabulary
        binary : bool
            If True, the data will be saved in binary word2vec format, else it will be saved in plain text.
        total_vec :  int
            Optional parameter to explicitly specify total no. of vectors
            (in case word vectors are appended with document vectors afterwards)

        """
    if not (vocab or vectors):
        raise RuntimeError("no input")
    if total_vec is None:
        total_vec = len(vocab)
    vector_size = vectors.shape[1]
    if fvocab is not None:
        logger.info("storing vocabulary in %s", fvocab)
        with utils.smart_open(fvocab, 'wb') as vout:
            for word, vocab_ in sorted(iteritems(vocab),
                                       key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, vocab_.count)))
    logger.info("storing %sx%s projection weights into %s", total_vec,
                vector_size, fname)
    assert (len(vocab), vector_size) == vectors.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        for word, vocab_ in sorted(iteritems(vocab),
                                   key=lambda item: -item[1].count):
            row = vectors[vocab_.index]
            if binary:
                row = row.astype(REAL)
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(
                    utils.to_utf8("%s %s\n" %
                                  (word, ' '.join(repr(val) for val in row))))
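Files written by this helper can normally be read back with gensim's standard loader; a short round-trip sketch (the file name is hypothetical):

from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format("vectors.txt", binary=False)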
Example #25
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Save a corpus in the GibbsLda++ format.

        Warnings
        --------
        This function is automatically called by :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize`,
        don't call it directly, call :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize` instead.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, int)
            Corpus in BoW format.
        id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
            Mapping between word_ids (integers) and words (strings).
            If not provided, the mapping is constructed directly from `corpus`.
        metadata : bool, optional
            THIS PARAMETER WILL BE IGNORED.

        Return
        ------
        list of int
            List of offsets in resulting file for each document (in bytes),
            can be used for :meth:`~gensim.corpora.lowcorpus.LowCorpus.docbyoffset`

        """
        if id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in List-Of-Words format into %s" % fname)
        truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8('%i\n' % len(corpus)))
            for doc in corpus:
                words = []
                for wordid, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] *
                                 int(value))
                offsets.append(fout.tell())
                fout.write(utils.to_utf8('%s\n' % ' '.join(words)))

        if truncated:
            logger.warning(
                "List-of-words format can only save vectors with integer elements; "
                "%i float entries were truncated to integer value", truncated)
        return offsets
Example #26
    def write_headers(self, num_docs, num_terms, num_nnz):
        self.fout.write(MmWriter.HEADER_LINE)

        if num_nnz < 0:
            # we don't know the matrix shape/density yet, so only log a general line
            logger.info("saving sparse matrix to %s" % self.fname)
            self.fout.write(utils.to_utf8(' ' * 50 + '\n')) # 48 digits must be enough for everybody
        else:
            logger.info("saving sparse %sx%s matrix with %i non-zero entries to %s" %
                         (num_docs, num_terms, num_nnz, self.fname))
            self.fout.write(utils.to_utf8('%s %s %s\n' % (num_docs, num_terms, num_nnz)))
        self.last_docno = -1
        self.headers_written = True
Example #27
    def convert_input(self, corpus, time_slices):
        """
        Serialize documents in LDA-C format to a temporary text file.

        """
        logger.info("serializing temporary corpus to %s", self.fcorpustxt())
        # write out the corpus in a file format that DTM understands:
        corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus)

        with utils.smart_open(self.ftimeslices(), 'wb') as fout:
            fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n"))
            for sl in time_slices:
                fout.write(utils.to_utf8(str(sl) + "\n"))
Example #28
    def convert_input(self, corpus, time_slices):
        """
        Serialize documents in LDA-C format to a temporary text file.

        """
        logger.info("serializing temporary corpus to %s" % self.fcorpustxt())
        # write out the corpus in a file format that DTM understands:
        corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus)

        with utils.smart_open(self.ftimeslices(), 'wb') as fout:
            fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n"))
            for sl in time_slices:
                fout.write(utils.to_utf8(str(sl) + "\n"))
Example #29
def save_word2vec_format(page_vec, p2v_path=p2v_path, binary=True):
    total_vec = len(page_vec)
    vector_size = _len  # _len: module-level vector dimensionality defined elsewhere in the source
    with utils.open(p2v_path, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, voctor_size)))
        for word, row in page_vec.items():
            if binary:
                row = row.astype(REAL)
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(
                    utils.to_utf8("%s %s\n" %
                                  (word, ' '.join(repr(val) for val in row))))
Example #30
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Save a corpus in the GibbsLda++ format.

        Warnings
        --------
        This function is automatically called by :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize`,
        don't call it directly, call :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize` instead.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, int)
            Corpus in BoW format.
        id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
            Mapping between word_ids (integers) and words (strings).
            If not provided, the mapping is constructed directly from `corpus`.
        metadata : bool, optional
            THIS PARAMETER WILL BE IGNORED.

        Return
        ------
        list of int
            List of offsets in resulting file for each document (in bytes),
            can be used for :meth:`~gensim.corpora.lowcorpus.LowCorpus.docbyoffset`

        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in List-Of-Words format into %s" % fname)
        truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8('%i\n' % len(corpus)))
            for doc in corpus:
                words = []
                for wordid, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] * int(value))
                offsets.append(fout.tell())
                fout.write(utils.to_utf8('%s\n' % ' '.join(words)))

        if truncated:
            logger.warning(
                "List-of-words format can only save vectors with integer elements; "
                "%i float entries were truncated to integer value", truncated
            )
        return offsets
Example #31
    def save_word2vec_format(self,
                             fname,
                             doctag_vec=False,
                             word_vec=True,
                             prefix='*dt_',
                             fvocab=None,
                             binary=False):
        """
        Store the input-hidden weight matrix.

         `fname` is the file used to save the vectors in
         `doctag_vec` is an optional boolean indicating whether to store document vectors
         `word_vec` is an optional boolean indicating whether to store word vectors
         (if both doctag_vec and word_vec are True, then both vectors are stored in the same file)
         `prefix` to uniquely identify doctags from word vocab, and avoid collision
         in case of repeated string in doctag and word vocab
         `fvocab` is an optional file used to save the vocabulary
         `binary` is an optional boolean indicating whether the data is to be saved
         in binary word2vec format (default: False)

        """
        total_vec = len(self.wv.vocab) + len(self.docvecs)
        # save word vectors
        if word_vec:
            if not doctag_vec:
                total_vec = len(self.wv.vocab)
            KeyedVectors.save_word2vec_format(self.wv, fname, fvocab, binary,
                                              total_vec)
        # save document vectors
        if doctag_vec:
            with utils.smart_open(fname, 'ab') as fout:
                if not word_vec:
                    total_vec = len(self.docvecs)
                    logger.info("storing %sx%s projection weights into %s",
                                total_vec, self.vector_size, fname)
                    fout.write(
                        utils.to_utf8("%s %s\n" %
                                      (total_vec, self.vector_size)))
                # store as in input order
                for i in range(len(self.docvecs)):
                    doctag = u"%s%s" % (prefix,
                                        self.docvecs.index_to_doctag(i))
                    row = self.docvecs.doctag_syn0[i]
                    if binary:
                        fout.write(
                            utils.to_utf8(doctag) + b" " + row.tostring())
                    else:
                        fout.write(
                            utils.to_utf8("%s %s\n" %
                                          (doctag, ' '.join("%f" % val
                                                            for val in row))))
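A hedged call sketch for the method above (a trained legacy Doc2Vec model is assumed):

# store word vectors and doctag vectors in one file; doctags get the '*dt_' prefix
model.save_word2vec_format("docs_and_words.w2v", doctag_vec=True, word_vec=True, binary=False)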
Example #32
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Save a corpus in the LDA-C format.

        Notes
        -----
        There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, float)
            Input corpus in BoW format.
        id2word : dict of (str, str), optional
            Mapping id -> word for `corpus`.
        metadata : bool, optional
            THIS PARAMETER WILL BE IGNORED.

        Returns
        -------
        list of int
            Offsets for each line in file (in bytes).

        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        elif id2word:
            num_terms = 1 + max(id2word)
        else:
            num_terms = 0

        logger.info("storing corpus in Blei's LDA-C format into %s", fname)
        with utils.smart_open(fname, 'wb') as fout:
            offsets = []
            for doc in corpus:
                doc = list(doc)
                offsets.append(fout.tell())
                parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
                fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

        # write out vocabulary, in a format compatible with Blei's topics.py script
        fname_vocab = utils.smart_extension(fname, '.vocab')
        logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in range(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        return offsets
Example #33
    def convert_dictionary_to_words2vec(self, fname):
        """ we need dictioany of word2vec format to be able to use words2vec.most_similar() function for printing n_top words of resp. topic """
        vocab = self.words2vec_ny
        vectors = self.words2vec_ny.values()
        vector_size = self.word_vec_size
        total_vec = len(self.words2vec_ny)

        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
            for word, row in vocab.items():
                fout.write(
                    utils.to_utf8("%s %s\n" %
                                  (word, ' '.join(repr(val) for val in row))))
        words2vec2 = KeyedVectors.load_word2vec_format(fname)
        return words2vec2
Example #34
File: ldamallet.py  Project: wpli/gensim
    def convert_input(self, corpus, infer=False):
        """
        Serialize documents (lists of unicode tokens) to a temporary text file,
        then convert that text file to MALLET format `outfile`.

        """
        logger.info("serializing temporary corpus to %s" % self.fcorpustxt())
        # write out the corpus in a file format that MALLET understands: one document per line:
        # document id[SPACE]label (not used)[SPACE]whitespace delimited utf8-encoded tokens
        with utils.smart_open(self.fcorpustxt(), 'wb') as fout:
            for docno, doc in enumerate(corpus):
                if self.id2word:
                    tokens = sum(([self.id2word[tokenid]] * int(cnt)
                                  for tokenid, cnt in doc), [])
                else:
                    tokens = sum(([str(tokenid)] * int(cnt)
                                  for tokenid, cnt in doc), [])
                fout.write(
                    utils.to_utf8("%s 0 %s\n" % (docno, ' '.join(tokens))))

        # convert the text file above into MALLET's internal format
        cmd = self.mallet_path + " import-file --keep-sequence --remove-stopwords --token-regex '\S+' --input %s --output %s"
        if infer:
            cmd += ' --use-pipe-from ' + self.fcorpusmallet()
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + '.infer')
        else:
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
        logger.info("converting temporary corpus to MALLET format with %s" %
                    cmd)
        call(cmd, shell=True)
Example #35
    def save_as_text(self, fname):
        """Save this HashDictionary to a text file.

        Parameters
        ----------
        fname : str
            Path to output file.

        Notes
        -----
        The format is:
        `id[TAB]document frequency of this id[TAB]tab-separated set of words in UTF8 that map to this id[NEWLINE]`.


        Examples
        --------
        >>> from gensim.corpora import HashDictionary
        >>> from gensim.test.utils import get_tmpfile
        >>>
        >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
        >>> data = HashDictionary(corpus)
        >>> data.save_as_text(get_tmpfile("dictionary_in_text_format"))

        """
        logger.info("saving HashDictionary mapping to %s" % fname)
        with utils.smart_open(fname, 'wb') as fout:
            for tokenid in self.keys():
                words = sorted(self[tokenid])
                if words:
                    words_df = [(word, self.dfs_debug.get(word, 0)) for word in words]
                    words_df = ["%s(%i)" % item for item in sorted(words_df, key=lambda x: -x[1])]
                    words_df = '\t'.join(words_df)
                    fout.write(utils.to_utf8("%i\t%i\t%s\n" % (tokenid, self.dfs.get(tokenid, 0), words_df)))
Example #36
    def convert_input(self, corpus, infer=False):
        """
        Serialize documents (lists of unicode tokens) to a temporary text file,
        then convert that text file to MALLET format `outfile`.

        """
        logger.info("serializing temporary corpus to %s" % self.fcorpustxt())
        # write out the corpus in a file format that MALLET understands: one document per line:
        # document id[SPACE]label (not used)[SPACE]whitespace delimited utf8-encoded tokens
        with utils.smart_open(self.fcorpustxt(), "wb") as fout:
            for docno, doc in enumerate(corpus):
                if self.id2word:
                    tokens = sum(([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc), [])
                else:
                    tokens = sum(([str(tokenid)] * int(cnt) for tokenid, cnt in doc), [])
                fout.write(utils.to_utf8("%s 0 %s\n" % (docno, " ".join(tokens))))

        # convert the text file above into MALLET's internal format
        cmd = (
            self.mallet_path
            + r" import-file --preserve-case --keep-sequence --remove-stopwords --token-regex '\S+' --input %s --output %s"
        )
        if infer:
            cmd += " --use-pipe-from " + self.fcorpusmallet()
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + ".infer")
        else:
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
        logger.info("converting temporary corpus to MALLET format with %s" % cmd)
        call(cmd, shell=True)
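For a toy corpus, the intermediate text file written above looks like this (one document per line: document id, an unused label, then the tokens repeated by count):

0 0 human computer computer
1 0 graph trees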
Example #37
def main():
    tokens_c = Counter()
    articles = 0
    pool = Pool(cpus)
    entites = get_embedded_entities(temp_path)
    model = load_model(wordvector_path)
    idfdict = load_idf(idf_path)
    mapping = load_ID_mapping(mapping_path)

    with utils.open(temp_path, 'ab') as fout:
        tbar = tqdm.tqdm(total=len(mapping))  # one tick per entry in the ID mapping
        for _id, tokens in pool.imap_unordered(process_line,
                                               get_lines(wikidata_path)):
            articles += 1
            tbar.update(1)
            tokens_c.update(tokens)
            wikeID = get_wikidataID(_id, mapping)
            if wikeID in entites:
                continue
            else:
                vec = get_p2v(tokens, model, idfdict)
                fout.write(
                    utils.to_utf8("%s %s\n" %
                                  (wikeID, ' '.join(repr(val)
                                                    for val in vec))))

    pool.terminate()
    print("Finished, Done %d articles.", articles)
Example #38
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save an existing `corpus` to disk.

        Some formats also support saving the dictionary (`feature_id->word` mapping),
        which can in this case be provided by the optional `id2word` parameter.

        >>> MmCorpus.save_corpus('file.mm', corpus)

        Some corpora also support an index of where each document begins, so
        that the documents on disk can be accessed in O(1) time (see the
        `corpora.IndexedCorpus` base class). In this case, `save_corpus` is automatically
        called internally by `serialize`, which does `save_corpus` plus saves the index
        at the same time, so you want to store the corpus with::

        >>> MmCorpus.serialize('file.mm', corpus) # stores index as well, allowing random access to individual documents

        Calling `serialize()` is preferred to calling `save_corpus()`.

        """
        raise NotImplementedError('cannot instantiate abstract base class')

        # example code:
        logger.info("converting corpus to ??? format: %s", fname)
        with utils.smart_open(fname, 'wb') as fout:
            for doc in corpus:  # iterate over the document stream
                fmt = str(doc)  # format the document appropriately...
                fout.write(utils.to_utf8("%s\n" % fmt))  # serialize the formatted document to disk
    def write_vector(self, docno, vector):
        """Write a single sparse vector to the file.

        Parameters
        ----------
        docno : int
            Number of document.
        vector : list of (int, float)
            Vector in BoW format.

        Returns
        -------
        (int, int)
            Highest term id in the vector and the number of entries written; (-1, 0) if the vector is empty.

        """
        assert self.headers_written, "must write Matrix Market file headers before writing data!"
        assert self.last_docno < docno, "documents %i and %i not in sequential order!" % (
            self.last_docno, docno)
        vector = sorted((i, w) for i, w in vector
                        if abs(w) > 1e-12)  # ignore near-zero entries
        for termid, weight in vector:  # write term ids in sorted order
            # +1 because MM format starts counting from 1
            self.fout.write(
                utils.to_utf8("%i %i %s\n" % (docno + 1, termid + 1, weight)))
        self.last_docno = docno
        return (vector[-1][0], len(vector)) if vector else (-1, 0)
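
# For reference, a toy sketch of the Matrix Market coordinate layout this
# writer emits; the header counts are assumed to be known up front here.
docs = [[(0, 1.0), (2, 0.5)], [(1, 2.0)]]
num_docs, num_terms = 2, 3
num_nnz = sum(len(d) for d in docs)

with open("corpus.mm", "w", encoding="utf8") as fout:
    fout.write("%%MatrixMarket matrix coordinate real general\n")
    fout.write("%i %i %i\n" % (num_docs, num_terms, num_nnz))
    for docno, doc in enumerate(docs):
        for termid, weight in sorted(doc):
            fout.write("%i %i %s\n" % (docno + 1, termid + 1, weight))  # ids are 1-based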
    def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
        """Save a corpus in the SVMlight format.

        The SVMlight `<target>` class tag is taken from the `labels` array, or set to 0 for all documents
        if `labels` is not supplied.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, float)
            Corpus in BoW format.
        id2word : dict of (str, str), optional
            Mapping id -> word.
        labels : list or False
            SVMlight `<target>` class tags, one per document, or False if not supplied.
        metadata : bool
            ARGUMENT WILL BE IGNORED.

        Returns
        -------
        list of int
            Offsets for each line in file (in bytes).

        """
        logger.info("converting corpus to SVMlight format: %s", fname)

        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            for docno, doc in enumerate(corpus):
                label = labels[docno] if labels else 0  # target class is 0 by default
                offsets.append(fout.tell())
                fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label)))
        return offsets
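
# `SvmLightCorpus.doc2line` is not shown above; this is a hedged
# re-implementation of the line layout it is assumed to produce
# (1-based feature ids, "<target> <feature>:<value> ..."):
def doc2line(doc, label=0):
    pairs = ' '.join("%i:%s" % (termid + 1, weight) for termid, weight in doc)
    return "%s %s\n" % (label, pairs)

# e.g. doc2line([(0, 1.0), (3, 0.5)], label=1) -> "1 1:1.0 4:0.5\n"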
Example #41
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save an existing `corpus` to disk.

        Some formats also support saving the dictionary (`feature_id->word` mapping),
        which can in this case be provided by the optional `id2word` parameter.

        >>> MmCorpus.save_corpus('file.mm', corpus)

        Some corpora also support an index of where each document begins, so
        that the documents on disk can be accessed in O(1) time (see the
        `corpora.IndexedCorpus` base class). In this case, `save_corpus` is automatically
        called internally by `serialize`, which does `save_corpus` plus saves the index
        at the same time, so you want to store the corpus with::

        >>> MmCorpus.serialize('file.mm', corpus) # stores index as well, allowing random access to individual documents

        Calling `serialize()` is preferred to calling `save_corpus()`.

        """
        raise NotImplementedError('cannot instantiate abstract base class')

        # example code:
        logger.info("converting corpus to ??? format: %s" % fname)
        with utils.smart_open(fname, 'wb') as fout:
            for doc in corpus:  # iterate over the document stream
                fmt = str(doc)  # format the document appropriately...
                fout.write(utils.to_utf8("%s\n" % fmt))  # serialize the formatted document to disk
Example #42
    def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
        """
        Save a corpus in the UCI Bag-of-Words format.

        There are actually two files saved: `fname` and `fname.vocab`, where
        `fname.vocab` is the vocabulary file.

        This function is automatically called by `UciCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        else:
            num_terms = 1 + max([-1] + id2word.keys())

        # write out vocabulary
        fname_vocab = fname + '.vocab'
        logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in xrange(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        logger.info("storing corpus in UCI Bag-of-Words format: %s" % fname)

        return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
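
# Illustrative sketch of the UCI Bag-of-Words docword layout that
# `UciWriter.write_corpus` (not shown here) is assumed to emit: three
# header lines (docs, terms, non-zeros), then 1-based "docID wordID count".
corpus = [[(0, 2), (1, 1)], [(1, 3)]]
num_docs, num_terms = 2, 2
num_nnz = sum(len(doc) for doc in corpus)

with open("docword.txt", "w", encoding="utf8") as fout:
    fout.write("%i\n%i\n%i\n" % (num_docs, num_terms, num_nnz))
    for docno, doc in enumerate(corpus):
        for termid, cnt in doc:
            fout.write("%i %i %i\n" % (docno + 1, termid + 1, cnt))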
Example #43
    def save_corpus(fname, corpus, id2word=None, progress_cnt=10000):
        """
        Save a corpus in the UCI Bag-of-Words format.

        There are actually two files saved: `fname` and `fname.vocab`, where
        `fname.vocab` is the vocabulary file.

        This function is automatically called by `UciCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        else:
            num_terms = 1 + max([-1] + id2word.keys())

        # write out vocabulary
        fname_vocab = fname + '.vocab'
        logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in xrange(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        logger.info("storing corpus in UCI Bag-of-Words format: %s" % fname)

        return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
Example #44
    def save(self, np2vec_model_file='np2vec.model', binary=False):
        """
        Save the np2vec model.

        Args:
            np2vec_model_file (str): path of the file to save the np2vec model to
            binary (bool): whether to save the model in binary word2vec format
        """
        if self.word_embedding_type == 'fasttext' and self.word_ngrams == 1:
            if not binary:
                logger.error(
                    "if word_embedding_type is fasttext and word_ngrams is 1, "
                    "binary should be set to True.")
                sys.exit(1)
            # not relevant to prune fasttext subword model
            self.model.save(np2vec_model_file)
        else:
            # prune non NP terms
            logger.info('pruning np2vec model')
            total_vec = 0
            vector_size = self.model.vector_size
            for word in self.model.wv.vocab.keys():
                if self.is_marked(word):
                    total_vec += 1
            logger.info(
                "storing %sx%s projection weights for NP's into %s" %
                (total_vec, vector_size, np2vec_model_file))
            with utils.smart_open(np2vec_model_file, 'wb') as fout:
                fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
                # store NP vectors in sorted order: most frequent NP's at the top
                for word, vocab in sorted(
                        iteritems(
                            self.model.wv.vocab), key=lambda item: -item[1].count):
                    if self.is_marked(word):
                        embedding_vec = self.model.wv.syn0[vocab.index]
                        if binary:
                            fout.write(utils.to_utf8(word) + b" " + embedding_vec.tostring())
                        else:
                            fout.write(utils.to_utf8(
                                "%s %s\n" % (word, ' '.join("%f" % val for val in embedding_vec))))
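
# Assuming gensim is installed, the pruned text-format file written above
# should load back with the standard word2vec reader (usage sketch):
from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format('np2vec.model', binary=False)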
Example #45
    def restricted_hash(self, token):
        """
        Calculate the id of the given token. Also keep track of which words were
        mapped to which ids, for debugging purposes.
        """
        h = self.myhash(utils.to_utf8(token)) % self.id_range
        if self.debug:
            self.token2id[token] = h
            self.id2token.setdefault(h, set()).add(token)
        return h
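
# The same hashing trick as a standalone sketch; gensim's HashDictionary
# defaults to zlib.adler32, which is assumed here for illustration.
import zlib

def restricted_hash(token, id_range=32000):
    # distinct tokens may collide on the same id -- that is accepted by design
    return zlib.adler32(token.encode('utf8')) % id_range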
Example #46
    def prep_text(self, p_num, sentences, outfile):
        output = open(outfile, 'wb')
        # distribute text files
        for i, sen in enumerate(sentences):
            sentence = [utils.any2utf8(w) for w in sen]
            for word_a, word_b in zip(sentence, sentence[1:]):
                word_a = re.sub("[^a-zA-Z]+", "", word_a)
                word_b = re.sub("[^a-zA-Z]+", "", word_b)
                if not word_a:
                    continue
                phrase = word_a + "_" + word_b
                if phrase in self.vocab:
                    output.write(utils.to_utf8(self.vocab[phrase] + ' '))
                else:
                    output.write(utils.to_utf8(self.vocab[word_a] + ' '))
            if i % 10000 == 0:
                logger.info("PROGRESS: at sentence #%i", i)
        logger.info("PROGRESS: at sentence #%i", i)
        output.close()
Example #47
def _save_word2vec_format(fname, vocab, vectors, fvocab=None, binary=False, total_vec=None):
        """Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.

        Parameters
        ----------
        fname : str
            The file path used to save the vectors in
        vocab : dict
            The vocabulary of words
        vectors : numpy.array
            The vectors to be stored
        fvocab : str
            Optional file path used to save the vocabulary
        binary : bool
            If True, the data will be saved in binary word2vec format, else it will be saved in plain text.
        total_vec :  int
            Optional parameter to explicitly specify total no. of vectors
            (in case word vectors are appended with document vectors afterwards)

        """
        if not (vocab or vectors):
            raise RuntimeError("no input")
        if total_vec is None:
            total_vec = len(vocab)
        vector_size = vectors.shape[1]
        if fvocab is not None:
            logger.info("storing vocabulary in %s", fvocab)
            with utils.smart_open(fvocab, 'wb') as vout:
                for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
                    vout.write(utils.to_utf8("%s %s\n" % (word, vocab_.count)))
        logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
        assert (len(vocab), vector_size) == vectors.shape
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
            # store in sorted order: most frequent words at the top
            for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
                row = vectors[vocab_.index]
                if binary:
                    row = row.astype(REAL)
                    fout.write(utils.to_utf8(word) + b" " + row.tostring())
                else:
                    fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row))))
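
# A hedged sketch of parsing the plain-text layout written above
# (first line "count dim", then one "word v1 ... vdim" line per word):
import numpy as np

def read_word2vec_text(fname):
    vectors = {}
    with open(fname, 'r', encoding='utf8') as fin:
        total_vec, vector_size = map(int, fin.readline().split())
        for line in fin:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = np.array(parts[1:], dtype=np.float32)
    assert len(vectors) == total_vec
    return vectors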
Example #48
    def save_as_text(self, fname, sort_by_word=True):
        """
        Save this Dictionary to a text file, in format:
        `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`. Sorted by word,
        or by decreasing word frequency.

        Note: the text format should be used for corpus inspection only. Use `save`/`load`
        to store in binary format (pickle) for improved performance.
        """
        logger.info("saving dictionary mapping to %s" % fname)
        with utils.smart_open(fname, 'wb') as fout:
            if sort_by_word:
                for token, tokenid in sorted(iteritems(self.token2id)):
                    line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                    fout.write(utils.to_utf8(line))
            else:
                for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
                    line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                    fout.write(utils.to_utf8(line))
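
# Round-trip usage sketch, assuming a recent gensim (Dictionary provides
# the matching `load_from_text` reader for this layout):
from gensim.corpora import Dictionary

docs = [["human", "interface", "computer"], ["survey", "computer"]]
d = Dictionary(docs)
d.save_as_text('/tmp/dict.txt')            # id[TAB]word[TAB]df lines
d2 = Dictionary.load_from_text('/tmp/dict.txt')
assert d.token2id == d2.token2id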
Example #49
    def update_headers(self, num_docs, num_terms, num_nnz):
        """Update headers with actual values."""
        offset = 0
        values = [utils.to_utf8(str(n)) for n in [num_docs, num_terms, num_nnz]]

        for value in values:
            if len(value) > len(self.FAKE_HEADER):
                raise ValueError('Invalid header: value too large!')
            self.fout.seek(offset)
            self.fout.write(value)
            offset += len(self.FAKE_HEADER)
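
# The reserve-then-overwrite trick this relies on, as a standalone sketch:
# write fixed-width placeholders first, stream the data, then seek back and
# fill in the real counters (a 10-byte slot width is assumed here).
FAKE_HEADER = b' ' * 10

with open('data.bin', 'wb') as f:
    f.write(FAKE_HEADER * 3)             # reserve room for three counters
    f.write(b'...payload...')            # stream the actual data
    for slot, n in enumerate([5, 7, 42]):
        f.seek(slot * len(FAKE_HEADER))
        f.write(b'%d' % n)               # must be shorter than the slot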
Example #50
    def convert_input(self, corpus, time_slices):
        """Convert corpus into LDA-C format by :class:`~gensim.corpora.bleicorpus.BleiCorpus` and save to temp file.
        Path to temporary file produced by :meth:`~gensim.models.wrappers.dtmmodel.DtmModel.ftimeslices`.

        Parameters
        ----------
        corpus : iterable of iterable of (int, float)
            Corpus in BoW format.
        time_slices : list of int
            Sequence of timestamps.

        """
        logger.info("serializing temporary corpus to %s", self.fcorpustxt())
        # write out the corpus in a file format that DTM understands:
        corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus)

        with utils.smart_open(self.ftimeslices(), 'wb') as fout:
            fout.write(utils.to_utf8(str(len(time_slices)) + "\n"))
            for sl in time_slices:
                fout.write(utils.to_utf8(str(sl) + "\n"))
Example #51
    def corpus2mallet(self, corpus, file_like):
        """
        Write out `corpus` in a file format that MALLET understands: one document per line:

          document id[SPACE]label (not used)[SPACE]whitespace delimited utf8-encoded tokens[NEWLINE]
        """
        for docno, doc in enumerate(corpus):
            if self.id2word:
                tokens = sum(([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc), [])
            else:
                tokens = sum(([str(tokenid)] * int(cnt) for tokenid, cnt in doc), [])
            file_like.write(utils.to_utf8("%s 0 %s\n" % (docno, ' '.join(tokens))))
Example #52
def Perform_LDA(i):
    NewFile = []
    tokens = [utils.to_utf8(token) for token in utils.tokenize(' '.join(i), lower=True, errors='ignore')]
    for tagged in nltk.pos_tag(tokens):
        if findPOS(tagged) == '':
            NewFile.append(tagged[0])
        else:
            NewFile.append(wordnet_lemmatizer.lemmatize(tagged[0], pos=findPOS(tagged)))
    texts = [[word.encode('utf-8')] for word in NewFile]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    return corpus, models.ldamodel.LdaModel(corpus=corpus, passes=30, id2word=dictionary, num_topics=3)
Example #53
    def save_as_text(self, fname, sort_by_word=True):
        """
        Save this Dictionary to a text file, in the format:
        `id[TAB]fid[TAB]sid[TAB]document frequency[NEWLINE]`.
        The underlying `_unidict` is saved alongside as a usual gensim dictionary.
        """
        self._unidict.save_as_text(fname + '.index', sort_by_word)
        with utils.smart_open(fname, 'wb') as fout:
            # no word to display in bidict
            for fid_sid, bid in sorted(iteritems(self.fid_sid2bid)):
                line = "%i\t%i\t%i\t%i\n" % (bid, fid_sid[0], fid_sid[1], self.dfs.get(bid, 0))
                fout.write(utils.to_utf8(line))
Example #54
    def save_word2vec_format(self, fname, fvocab=None, binary=False):
        """
        Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.
        """
        if fvocab is not None:
            logger.info("storing vocabulary in %s" % fvocab)
            with utils.smart_open(fvocab, 'wb') as vout:
                for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                    vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
        logger.info("storing %sx%s projection weights into %s" % (len(self.vocab), self.layer1_size, fname))
        assert (len(self.vocab), self.layer1_size) == self.syn0.shape
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8("%s %s\n" % self.syn0.shape))
            # store in sorted order: most frequent words at the top
            for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                row = self.syn0[vocab.index]
                if binary:
                    fout.write(utils.to_utf8(word) + b" " + row.tostring())
                else:
                    fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
Example #55
    def write_vector(self, docno, vector):
        """
        Write a single sparse vector to the file.

        Sparse vector is any iterable yielding (field id, field value) pairs.
        """
        assert self.headers_written, "must write Matrix Market file headers before writing data!"
        assert self.last_docno < docno, "documents %i and %i not in sequential order!" % (self.last_docno, docno)
        vector = sorted((i, w) for i, w in vector if abs(w) > 1e-12) # ignore near-zero entries
        for termid, weight in vector: # write term ids in sorted order
            self.fout.write(utils.to_utf8("%i %i %s\n" % (docno + 1, termid + 1, weight))) # +1 because MM format starts counting from 1
        self.last_docno = docno
        return (vector[-1][0], len(vector)) if vector else (-1, 0)
Example #56
    def test_loadFromText(self):
        """`Dictionary` can be loaded from textfile."""
        tmpf = get_tmpfile('load_dict_test.txt')
        no_num_docs_serialization = to_utf8("2\n1\tprvé\t1\n2\tslovo\t2\n")
        with open(tmpf, "wb") as file:
            file.write(no_num_docs_serialization)

        d = Dictionary.load_from_text(tmpf)
        self.assertEqual(d.token2id[u"prvé"], 1)
        self.assertEqual(d.token2id[u"slovo"], 2)
        self.assertEqual(d.dfs[1], 1)
        self.assertEqual(d.dfs[2], 2)
        self.assertEqual(d.num_docs, 2)
Example #57
    def build_vocab(self, sentences):
        """
        Build vocabulary from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of utf8 strings.

        """
        logger.info("collecting all words and their counts")
        sentence_no, vocab = -1, {}
        total_words = lambda: sum(v.count for v in vocab.itervalues())
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % 10000 == 0:
                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                    (sentence_no, total_words(), len(vocab)))
            for word in sentence:
                if word in vocab:
                    vocab[word].count += 1
                else:
                    vocab[word] = Vocab(count=1)
        logger.info("collected %i word types from a corpus of %i words and %i sentences" %
            (len(vocab), total_words(), sentence_no + 1))

        # assign a unique index to each word
        self.vocab, self.index2word = {}, []
        for word, v in vocab.iteritems():
            if v.count >= self.min_count and v.count <= self.max_count:
                try:
                    utils.to_utf8(word)
                    v.index = len(self.vocab)
                    self.index2word.append(word)
                    self.vocab[word] = v
                    self.vocab[word].count_power = pow(v.count, self.c_power)
                    self.vocab[word].count_power_2 = pow(v.count, 2*self.c_power)
                except UnicodeError:
                    pass  # skip words that cannot be encoded as utf8
        logger.info("total %i word types after removing those with count < %s or count > %s" %
            (len(self.vocab), self.min_count, self.max_count))

        # add info about each word's Huffman encoding
        self.create_binary_tree()
        self.reset_weights()
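
# The counting pass above is equivalent to this collections.Counter sketch:
from collections import Counter

def count_words(sentences):
    counts = Counter()
    for sentence in sentences:
        counts.update(sentence)
    return counts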
Example #58
    def doc2bow(self, document, allow_update=False, return_missing=False):
        """
        Convert `document` (a list of words) into the bag-of-words format = list
        of `(token_id, token_count)` 2-tuples. Each word is assumed to be a
        **tokenized and normalized** utf-8 encoded string. No further preprocessing
        is done on the words in `document`; apply tokenization, stemming etc. before
        calling this method.

        If `allow_update` is set, then also update dictionary in the process: create
        ids for new words. At the same time, update document frequencies -- for
        each word appearing in this document, increase its document frequency (`self.dfs`)
        by one.

        If `allow_update` is **not** set, this function is `const`, aka read-only.
        """
        result = {}
        missing = {}
        if isinstance(document, string_types):
            raise TypeError("doc2bow expects an array of utf8 tokens on input, not a string")
        document = sorted(utils.to_utf8(token) for token in document)
        # construct (word, frequency) mapping. In Python 3 this could be done
        # simply with Counter(), but here we use itertools.groupby() for the job
        for word_norm, group in itertools.groupby(document):
            frequency = len(list(group)) # how many times does this word appear in the input document
            tokenid = self.token2id.get(word_norm, None)
            if tokenid is None:
                # first time we see this token (~normalized form)
                if return_missing:
                    missing[word_norm] = frequency
                if not allow_update: # if we aren't allowed to create new tokens, continue with the next unique token
                    continue
                tokenid = len(self.token2id)
                self.token2id[word_norm] = tokenid # new id = number of ids made so far; NOTE this assumes there are no gaps in the id sequence!

            # update how many times a token appeared in the document
            result[tokenid] = frequency

        if allow_update:
            self.num_docs += 1
            self.num_pos += len(document)
            self.num_nnz += len(result)
            # increase document count for each unique token that appeared in the document
            for tokenid in iterkeys(result):
                self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

        # return tokenids, in ascending id order
        result = sorted(iteritems(result))
        if return_missing:
            return result, missing
        else:
            return result
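
# The Counter-based equivalent alluded to in the comment above (a sketch
# that only counts known tokens and skips the dictionary-update path):
from collections import Counter

def doc2bow_simple(document, token2id):
    counts = Counter(token2id[token] for token in document if token in token2id)
    return sorted(counts.items())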