Example #1
    def save(self, fname, *args, **kwargs):
        """Save model to file.

        Parameters
        ----------
        fname : str
            Path to output file.

        """
        if os.path.exists(self._model_filename):
            # Vowpal Wabbit uses its own binary model file, read this into
            # variable before serialising this object - keeps all data
            # self contained within a single serialised file
            logger.debug("Reading model bytes from '%s'", self._model_filename)
            with utils.smart_open(self._model_filename, 'rb') as fhandle:
                self._model_data = fhandle.read()

        if os.path.exists(self._topics_filename):
            logger.debug("Reading topic bytes from '%s'", self._topics_filename)
            with utils.smart_open(self._topics_filename, 'rb') as fhandle:
                self._topics_data = fhandle.read()

        if 'ignore' not in kwargs:
            kwargs['ignore'] = frozenset(['_topics', 'tmp_dir'])

        super(LdaVowpalWabbit, self).save(fname, *args, **kwargs)
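The call itself is a one-liner once the wrapper has been trained; a hedged sketch (the vw path, corpus, dictionary, and output file are illustrative, and the pre-4.0 gensim.models.wrappers API is assumed):

from gensim.models.wrappers import LdaVowpalWabbit  # removed in gensim 4.0

# `corpus` is any BoW corpus and `dictionary` a gensim Dictionary built beforehand (assumptions).
lda_vw = LdaVowpalWabbit('/usr/bin/vw', corpus=corpus, num_topics=20, id2word=dictionary)
lda_vw.save('/tmp/lda_vw.model')  # packs the VW binary model into the single saved file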
Example #2
    def load(cls, fname, *args, **kwargs):
        """Load model from `fname`.

        Parameters
        ----------
        fname : str
            Path to file with :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`.

        """
        lda_vw = super(LdaVowpalWabbit, cls).load(fname, *args, **kwargs)
        lda_vw._init_temp_dir(prefix=lda_vw.tmp_prefix)

        if lda_vw._model_data:
            # Vowpal Wabbit operates on its own binary model file - deserialise
            # to file at load time, making it immediately ready for use
            logger.debug("Writing model bytes to '%s'", lda_vw._model_filename)
            with utils.smart_open(lda_vw._model_filename, 'wb') as fhandle:
                fhandle.write(lda_vw._model_data)
            lda_vw._model_data = None  # no need to keep in memory after this

        if lda_vw._topics_data:
            logger.debug("Writing topic bytes to '%s'", lda_vw._topics_filename)
            with utils.smart_open(lda_vw._topics_filename, 'wb') as fhandle:
                fhandle.write(lda_vw._topics_data)
            lda_vw._topics_data = None

        return lda_vw
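Loading is symmetric; a hedged sketch under the same pre-4.0 wrapper API assumption (the path is illustrative):

from gensim.models.wrappers import LdaVowpalWabbit  # removed in gensim 4.0

lda_vw = LdaVowpalWabbit.load('/tmp/lda_vw.model')  # unpacks the VW binary model into a fresh temp dir
topics = lda_vw.show_topics(num_topics=5, num_words=10)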
Example #3
    def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None):
        """
        Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.

         `fname` is the file used to save the vectors in
         `fvocab` is an optional file used to save the vocabulary
         `binary` is an optional boolean indicating whether the data is to be saved
         in binary word2vec format (default: False)
         `total_vec` is an optional parameter to explicitly specify total no. of vectors
         (in case word vectors are appended with document vectors afterwards)

        """
        if total_vec is None:
            total_vec = len(self.vocab)
        vector_size = self.syn0.shape[1]
        if fvocab is not None:
            logger.info("storing vocabulary in %s", fvocab)
            with utils.smart_open(fvocab, 'wb') as vout:
                for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                    vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count)))
        logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
        assert (len(self.vocab), vector_size) == self.syn0.shape
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
            # store in sorted order: most frequent words at the top
            for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count):
                row = self.syn0[vocab.index]
                if binary:
                    fout.write(utils.to_utf8(word) + b" " + row.tostring())
                else:
                    fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
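For context, this method is normally reached through a trained model's keyed vectors; a hedged round-trip sketch (toy corpus and output paths are illustrative):

from gensim.models import Word2Vec  # assumes gensim is installed

model = Word2Vec([['hello', 'world'], ['hello', 'gensim']], min_count=1)  # toy corpus
model.wv.save_word2vec_format('/tmp/vectors.txt', binary=False)  # plain-text C format
model.wv.save_word2vec_format('/tmp/vectors.bin', binary=True)   # binary C format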
Example #4
	def get_texts(self):
		total_docs = 0
		if os.path.isdir( self.input ):
			# Read two levels of files
			filenames = glob.glob('{}/*'.format(self.input))
			for filename in filenames:
				if os.path.isdir(filename):
					filenames += glob.glob('{}/*'.format(filename))
			for filename in filenames:
				if not os.path.isdir( filename ):
					with utils.smart_open( filename ) as f:
						docId = filename
						docContent = u' '.join(f.read().decode('utf-8', 'ignore').splitlines())
						tokens = self.tokenRegex.findall(docContent)
						tokens = [token.lower().encode('utf-8') for token in tokens if token not in STOPWORDS]
						yield tokens
						self.docIds.append(docId)
						total_docs += 1
		else:
			with utils.smart_open(self.input) as f:
				for line in f:
					docId, docContent = line.decode('utf-8', 'ignore').rstrip('\n').split('\t')
					tokens = self.tokenRegex.findall(docContent)
					tokens = [token.lower().encode('utf-8') for token in tokens if token not in STOPWORDS]
					yield tokens
					self.docIds.append(docId)
					total_docs += 1
		self.length = total_docs
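The two-level directory walk above can be reproduced on its own; a small hedged sketch (the directory name is hypothetical):

import glob
import os

paths = glob.glob('{}/*'.format('corpus_dir'))  # hypothetical top-level directory
for path in list(paths):
    if os.path.isdir(path):
        paths += glob.glob('{}/*'.format(path))  # pull second-level entries into the same list
files = [path for path in paths if not os.path.isdir(path)]  # keep only regular files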
Example #5
    def test_corpus_summarization(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # Generate the corpus.
        sentences = text.split("\n")
        tokens = [sentence.split() for sentence in sentences]
        dictionary = Dictionary(tokens)
        corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

        # Extract the most important documents.
        selected_documents = summarize_corpus(corpus)

        # Compare the selected documents to the reference summary.
        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.summ.txt"), mode="r") as f:
            summary = f.read()
            summary = summary.split('\n')

        # Each sentence in the document selection has to be in the model summary.
        for doc_number, document in enumerate(selected_documents):
            # Retrieves all words from the document.
            words = [dictionary[token_id] for (token_id, count) in document]

            # Asserts that all of them are in a sentence from the model reference.
            self.assertTrue(any(all(word in sentence for word in words) for sentence in summary))
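summarize_corpus itself comes from the old gensim.summarization package (removed in gensim 4.0); a hedged usage sketch, assuming a BoW `corpus` built as in the test above:

from gensim.summarization import summarize_corpus  # pre-4.0 gensim

# `corpus` is a list of BoW documents, e.g. built with Dictionary.doc2bow as above (assumption).
selected = summarize_corpus(corpus, ratio=0.2)  # keep roughly the top 20% of documents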
Example #6
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the LDA-C format.

        There are actually two files saved: `fname` and `fname.vocab`, where
        `fname.vocab` is the vocabulary file.

        This function is automatically called by `BleiCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        else:
            num_terms = 1 + max([-1] + id2word.keys())

        logger.info("storing corpus in Blei's LDA-C format into %s" % fname)
        with utils.smart_open(fname, 'wb') as fout:
            offsets = []
            for doc in corpus:
                doc = list(doc)
                offsets.append(fout.tell())
                parts = ["%i:%s" % p for p in doc if abs(p[1]) > 1e-7]
                fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

        # write out vocabulary, in a format compatible with Blei's topics.py script
        fname_vocab = fname + '.vocab'
        logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in xrange(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        return offsets
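As the docstring notes, `serialize` is the intended entry point; a hedged sketch with a toy corpus (paths are illustrative):

from gensim.corpora import BleiCorpus, Dictionary

texts = [['human', 'interface'], ['survey', 'user', 'interface']]
dictionary = Dictionary(texts)
bow = [dictionary.doc2bow(text) for text in texts]
BleiCorpus.serialize('/tmp/corpus.lda-c', bow, id2word=dictionary)  # writes the corpus plus a vocabulary file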
Example #7
 def testLineSentenceWorksWithNormalFile(self):
     """Does LineSentence work with a file object argument, rather than filename?"""
     with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
         with utils.smart_open(datapath('head500.noblanks.cor')) as fin:
             sentences = word2vec.LineSentence(fin)
             for words in sentences:
                 self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #8
 def testPathLineSentences(self):
     """Does PathLineSentences work with a path argument?"""
     with utils.smart_open(os.path.join(datapath('PathLineSentences'), '1.txt')) as orig1,\
     utils.smart_open(os.path.join(datapath('PathLineSentences'), '2.txt.bz2')) as orig2:
         sentences = word2vec.PathLineSentences(datapath('PathLineSentences'))
         orig = orig1.readlines() + orig2.readlines()
         orig_counter = 0  # to go through orig while matching PathLineSentences
         for words in sentences:
             self.assertEqual(words, utils.to_unicode(orig[orig_counter]).split())
             orig_counter += 1
Example #9
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Save a corpus in the LDA-C format.

        Notes
        -----
        There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, float)
            Input corpus in BoW format.
        id2word : dict of (int, str), optional
            Mapping id -> word for `corpus`.
        metadata : bool, optional
            THIS PARAMETER WILL BE IGNORED.

        Returns
        -------
        list of int
            Offsets for each line in file (in bytes).

        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        elif id2word:
            num_terms = 1 + max(id2word)
        else:
            num_terms = 0

        logger.info("storing corpus in Blei's LDA-C format into %s", fname)
        with utils.smart_open(fname, 'wb') as fout:
            offsets = []
            for doc in corpus:
                doc = list(doc)
                offsets.append(fout.tell())
                parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
                fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

        # write out vocabulary, in a format compatible with Blei's topics.py script
        fname_vocab = utils.smart_extension(fname, '.vocab')
        logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in range(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        return offsets
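Reading a serialized LDA-C corpus back is symmetric; a hedged sketch (the path points at whatever `serialize` wrote earlier):

from gensim.corpora import BleiCorpus

loaded = BleiCorpus('/tmp/corpus.lda-c')  # picks up the accompanying vocabulary file
for doc in loaded:
    print(doc)  # list of (word_id, weight) pairs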
Example #10
    def test_text_summarization(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # Makes a summary of the text.
        generated_summary = summarize(text)

        # To be compared to the method reference.
        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.summ.txt"), mode="r") as f:
            summary = f.read()

        self.assertEqual(generated_summary, summary)
Example #11
    def test_text_keywords_pos(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # calculate keywords using only certain parts of speech
        generated_keywords_nnvbjj = keywords(text, pos_filter=['NN', 'VB', 'JJ'], ratio=0.3, split=True)

        # To be compared to the reference.
        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f:
            kw = f.read().strip().split("\n")

        self.assertEqual({str(x) for x in generated_keywords_nnvbjj}, {str(x) for x in kw})
Example #12
    def test_text_keywords(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # calculate keywords
        generated_keywords = keywords(text, split=True)

        # To be compared to the reference.
        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f:
            kw = f.read().strip().split("\n")

        self.assertEqual({str(x) for x in generated_keywords}, {str(x) for x in kw})
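The `keywords` function exercised by these tests also lives in the old gensim.summarization package; a hedged sketch (the toy text is illustrative and real input should be much longer):

from gensim.summarization import keywords  # pre-4.0 gensim

text = "Graph-based ranking algorithms can extract keywords and key phrases from raw text by building a word co-occurrence graph and scoring its vertices."
print(keywords(text, ratio=0.5, split=True))  # list of keyword strings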
Example #13
    def __init__(self, input):
        """

        Parameters
        ----------
        input : str
            Path to file in UCI format.

        """

        logger.info('Initializing corpus reader from %s', input)

        self.input = input

        with utils.smart_open(self.input) as fin:
            self.num_docs = self.num_terms = self.num_nnz = 0
            try:
                self.num_docs = int(next(fin).strip())
                self.num_terms = int(next(fin).strip())
                self.num_nnz = int(next(fin).strip())
            except StopIteration:
                pass

        logger.info(
            "accepted corpus with %i documents, %i features, %i non-zero entries",
            self.num_docs, self.num_terms, self.num_nnz
        )
Example #14
    def __iter__(self):
        for source in sources:
            with utils.smart_open(source) as fin:
                title = ''
                conference = ''
                field = ''
                abstract = ''
                for line in fin.readlines():
                    line = line.decode("utf-8")
                    if line == '\r\n':
                        yield LabeledSentence(
                            preprocess_string(title),
                            tags=[conference, field])
                        title = ''
                        conference = ''
                        field = ''

                    if line.startswith('#*'):
                        title = str(line).strip()[2:]

                    if line.startswith('#c'):
                        conference = str(line).strip()[2:]

                    if line.startswith('#f'):
                        field = str(line).strip()[2:]
Example #15
    def docbyoffset(self, offset):
        """Return document at file offset `offset` (in bytes)"""
        # empty documents are not stored explicitly in MM format, so the index marks
        # them with a special offset, -1.
        if offset == -1:
            return []
        if isinstance(self.input, string_types):
            fin = utils.smart_open(self.input)
        else:
            fin = self.input

        fin.seek(offset) # works for gzip/bz2 input, too
        previd, document = -1, []
        for line in fin:
            docid, termid, val = line.split()
            if not self.transposed:
                termid, docid = docid, termid
            docid, termid, val = int(docid) - 1, int(termid) - 1, float(val) # -1 because matrix market indexes are 1-based => convert to 0-based
            assert previd <= docid, "matrix columns must come in ascending order"
            if docid != previd:
                if previd >= 0:
                    return document
                previd = docid

            document.append((termid, val,)) # add another field to the current document
        return document
Example #16
    def __init__(self, fname, fname_vocab=None):
        """
        Parameters
        ----------
        fname : str
            Path to corpus in UCI format.
        fname_vocab : str, optional
            Path to vocab.

        Examples
        --------
        >>> from gensim.corpora import UciCorpus
        >>> from gensim.test.utils import datapath
        >>>
        >>> corpus = UciCorpus(datapath('testcorpus.uci'))
        >>> for document in corpus:
        ...     pass

        """
        IndexedCorpus.__init__(self, fname)
        UciReader.__init__(self, fname)

        if fname_vocab is None:
            fname_vocab = utils.smart_extension(fname, '.vocab')

        self.fname = fname
        with utils.smart_open(fname_vocab) as fin:
            words = [word.strip() for word in fin]
        self.id2word = dict(enumerate(words))

        self.transposed = True
Example #17
    def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
        """
        Save a corpus in the UCI Bag-of-Words format.

        There are actually two files saved: `fname` and `fname.vocab`, where
        `fname.vocab` is the vocabulary file.

        This function is automatically called by `UciCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        else:
            num_terms = 1 + max([-1] + id2word.keys())

        # write out vocabulary
        fname_vocab = fname + '.vocab'
        logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in xrange(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        logger.info("storing corpus in UCI Bag-of-Words format: %s" % fname)

        return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
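Here too `serialize` is the preferred entry point; a hedged sketch mirroring the Blei example above (paths are illustrative):

from gensim.corpora import Dictionary, UciCorpus

texts = [['human', 'interface'], ['survey', 'user', 'interface']]
dictionary = Dictionary(texts)
bow = [dictionary.doc2bow(text) for text in texts]
UciCorpus.serialize('/tmp/corpus.uci', bow, id2word=dictionary)  # writes the BoW file plus its vocabulary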
Example #18
 def testPathLineSentencesOneFile(self):
     """Does PathLineSentences work with a single file argument?"""
     test_file = os.path.join(datapath('PathLineSentences'), '1.txt')
     with utils.smart_open(test_file) as orig:
         sentences = word2vec.PathLineSentences(test_file)
         for words in sentences:
             self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #19
    def _load_vw_topics(self):
        """Read topics file generated by Vowpal Wabbit, convert to numpy array.

        Output consists of many header lines, followed by a number of lines
        of:
        <word_id> <topic_1_gamma> <topic_2_gamma> ...
        """
        topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)

        with utils.smart_open(self._topics_filename) as topics_file:
            found_data = False

            for line in topics_file:
                # look for start of data
                if not found_data:
                    if line.startswith(b'0 ') and b':' not in line:
                        found_data = True
                    else:
                        continue

                fields = line.split()
                word_id = int(fields[0])

                # output contains entries for 2**b terms, where b was set
                # by the '-b' option, ignore anything past num_terms
                if word_id >= self.num_terms:
                    break

                topics[:, word_id] = fields[1:]

        # normalise to probability distribution
        self._topics = topics / topics.sum(axis=1, keepdims=True)
Example #20
    def test_get_offsets_and_start_doctags_win(self):
        # Each line takes 7 bytes (including '\n' character which is actually '\r\n' on Windows)
        lines = ['line1\n', 'line2\n', 'line3\n', 'line4\n', 'line5\n']
        tmpf = get_tmpfile('gensim_doc2vec.tst')

        with utils.smart_open(tmpf, 'wb', encoding='utf8') as fout:
            for line in lines:
                fout.write(utils.any2unicode(line))

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 1)
        self.assertEqual(offsets, [0])
        self.assertEqual(start_doctags, [0])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 2)
        self.assertEqual(offsets, [0, 14])
        self.assertEqual(start_doctags, [0, 2])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 3)
        self.assertEqual(offsets, [0, 7, 21])
        self.assertEqual(start_doctags, [0, 1, 3])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 4)
        self.assertEqual(offsets, [0, 7, 14, 21])
        self.assertEqual(start_doctags, [0, 1, 2, 3])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 5)
        self.assertEqual(offsets, [0, 7, 14, 21, 28])
        self.assertEqual(start_doctags, [0, 1, 2, 3, 4])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 6)
        self.assertEqual(offsets, [0, 0, 7, 14, 14, 21])
        self.assertEqual(start_doctags, [0, 0, 1, 2, 2, 3])
Example #21
 def docbyoffset(self, offset):
     """
     Return the document stored at file position `offset`.
     """
     with utils.smart_open(self.fname) as f:
         f.seek(offset)
         return self.line2doc(f.readline())
Example #22
    def __init__(self, fname, fname_vocab=None):
        """
        Initialize the corpus from a file.

        `fname_vocab` is the file with vocabulary; if not specified, it defaults to
        `fname.vocab`.
        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s" % fname)

        if fname_vocab is None:
            fname_base, _ = path.splitext(fname)
            fname_dir = path.dirname(fname)
            for fname_vocab in [
                fname + '.vocab',
                fname + '/vocab.txt',
                fname_base + '.vocab',
                fname_dir + '/vocab.txt',
            ]:
                if path.exists(fname_vocab):
                    break
            else:
                raise IOError('BleiCorpus: could not find vocabulary file')


        self.fname = fname
        with utils.smart_open(fname_vocab) as fin:
            words = [utils.to_unicode(word).rstrip() for word in fin]
        self.id2word = dict(enumerate(words))
        self.length = None
Example #23
 def to_array(self):
     self.sentences = []
     for source, prefix in self.sources.items():
         with utils.smart_open(source) as fin:
             for item_no, line in enumerate(fin):
                 self.sentences.append(TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
     return self.sentences
Example #24
    def docbyoffset(self, offset):
        """Get the document stored in file by `offset` position.

        Parameters
        ----------
        offset : int
            Offset (in bytes) to begin of document.

        Returns
        -------
        list of (int, int)
            Document in BoW format (+"document_id" and "lang" if metadata=True).

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.test.utils import datapath
            >>> from gensim.corpora import MalletCorpus
            >>>
            >>> data = MalletCorpus(datapath("testcorpus.mallet"))
            >>> data.docbyoffset(1)  # end of first line
            [(3, 1), (4, 1)]
            >>> data.docbyoffset(4)  # start of second line
            [(4, 1)]

        """
        with utils.smart_open(self.fname) as f:
            f.seek(offset)
            return self.line2doc(f.readline())
Example #25
 def to_array(self):
     for source, prefix in self.sources.items():
         with utils.smart_open(source) as fin:
             for item_no, line in enumerate(fin):
                 self.sentences.append(
                     LabeledSentence(words=utils.to_unicode(line).split(), tags=[prefix + '_%s' % str(item_no)]))
     return self.sentences
Example #26
    def convert_input(self, corpus, infer=False):
        """
        Serialize documents (lists of unicode tokens) to a temporary text file,
        then convert that text file to MALLET format `outfile`.

        """
        logger.info("serializing temporary corpus to %s" % self.fcorpustxt())
        # write out the corpus in a file format that MALLET understands: one document per line:
        # document id[SPACE]label (not used)[SPACE]whitespace delimited utf8-encoded tokens
        with utils.smart_open(self.fcorpustxt(), "wb") as fout:
            for docno, doc in enumerate(corpus):
                if self.id2word:
                    tokens = sum(([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc), [])
                else:
                    tokens = sum(([str(tokenid)] * int(cnt) for tokenid, cnt in doc), [])
                fout.write(utils.to_utf8("%s 0 %s\n" % (docno, " ".join(tokens))))

        # convert the text file above into MALLET's internal format
        cmd = (
            self.mallet_path
            + " import-file --preserve-case --keep-sequence --remove-stopwords --token-regex '\S+' --input %s --output %s"
        )
        if infer:
            cmd += " --use-pipe-from " + self.fcorpusmallet()
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet() + ".infer")
        else:
            cmd = cmd % (self.fcorpustxt(), self.fcorpusmallet())
        logger.info("converting temporary corpus to MALLET format with %s" % cmd)
        call(cmd, shell=True)
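convert_input is called internally when the MALLET wrapper is trained; a hedged end-to-end sketch with the pre-4.0 wrapper API (the mallet path, corpus, and dictionary are assumptions):

from gensim.models.wrappers import LdaMallet  # removed in gensim 4.0

mallet_path = '/opt/mallet/bin/mallet'  # illustrative; point this at a local MALLET install
lda = LdaMallet(mallet_path, corpus=corpus, num_topics=10, id2word=dictionary)
print(lda.show_topics(num_topics=3, num_words=5))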
Example #27
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save an existing `corpus` to disk.

        Some formats also support saving the dictionary (`feature_id->word` mapping),
        which can in this case be provided by the optional `id2word` parameter.

        >>> MmCorpus.save_corpus('file.mm', corpus)

        Some corpora also support an index of where each document begins, so
        that the documents on disk can be accessed in O(1) time (see the
        `corpora.IndexedCorpus` base class). In this case, `save_corpus` is automatically
        called internally by `serialize`, which does `save_corpus` plus saves the index
        at the same time, so you want to store the corpus with::

        >>> MmCorpus.serialize('file.mm', corpus) # stores index as well, allowing random access to individual documents

        Calling `serialize()` is preferred to calling `save_corpus()`.

        """
        raise NotImplementedError('cannot instantiate abstract base class')

        # example code:
        logger.info("converting corpus to ??? format: %s", fname)
        with utils.smart_open(fname, 'wb') as fout:
            for doc in corpus:  # iterate over the document stream
                fmt = str(doc)  # format the document appropriately...
                fout.write(utils.to_utf8("%s\n" % fmt))  # serialize the formatted document to disk
Example #28
def write_corpus_as_vw(corpus, filename):
    """Covert `corpus` to  Vowpal Wabbit format and save it to `filename`.

    Parameters
    ----------
    corpus : iterable of list of (int, int)
        Collection of texts in BoW format.
    filename : str
        Path to output file.

    Returns
    -------
    int
        Number of lines in `filename`.

    """
    logger.debug("Writing corpus to: %s", filename)

    corpus_size = 0
    with utils.smart_open(filename, 'wb') as corpus_file:
        for line in corpus_to_vw(corpus):
            corpus_file.write(line.encode('utf-8') + b'\n')
            corpus_size += 1

    return corpus_size
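A hedged usage sketch with a toy BoW corpus (the output path is illustrative):

corpus = [[(0, 2), (3, 1)], [(1, 1)]]  # two toy documents in BoW format
num_lines = write_corpus_as_vw(corpus, '/tmp/corpus.vw')
print(num_lines)  # 2, one VW line per document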
Example #29
    def load_word_topics(self):
        """Load words X topics matrix from :meth:`gensim.models.wrappers.ldamallet.LdaMallet.fstate` file.

        Returns
        -------
        numpy.ndarray
            Matrix words X topics.

        """
        logger.info("loading assigned topics from %s", self.fstate())
        word_topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float64)
        if hasattr(self.id2word, 'token2id'):
            word2id = self.id2word.token2id
        else:
            word2id = revdict(self.id2word)

        with utils.smart_open(self.fstate()) as fin:
            _ = next(fin)  # header
            self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
            assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
            _ = next(fin)  # noqa:F841 beta
            for lineno, line in enumerate(fin):
                line = utils.to_unicode(line)
                doc, source, pos, typeindex, token, topic = line.split(" ")
                if token not in word2id:
                    continue
                tokenid = word2id[token]
                word_topics[int(topic), tokenid] += 1.0
        return word_topics
Example #30
    def _predict(self, chunk):
        """Run given chunk of documents against currently trained model.

        Parameters
        ----------
        chunk : iterable of list of (int, int)
            Sequence of documents in BoW format.

        Returns
        -------
        predictions : ndarray
            Matrix of topic predictions, one row per document in `chunk`.
        vw_data : dict
            Vowpal Wabbit data.

        """
        corpus_size = write_corpus_as_vw(chunk, self._corpus_filename)

        cmd = self._get_vw_predict_command(corpus_size)
        vw_data = _parse_vw_output(_run_vw_command(cmd))
        vw_data['corpus_size'] = corpus_size

        predictions = numpy.zeros((corpus_size, self.num_topics), dtype=numpy.float32)

        with utils.smart_open(self._predict_filename) as fhandle:
            for i, line in enumerate(fhandle):
                predictions[i, :] = line.split()

        predictions = predictions / predictions.sum(axis=1, keepdims=True)

        return predictions, vw_data
Example #31
 def __init__(self, fname):
     self.fname = fname
     if fname.endswith(".gz") or fname.endswith('.bz2'):
         raise NotImplementedError("compressed output not supported with MmWriter")
     self.fout = utils.smart_open(self.fname, 'wb+') # open for both reading and writing
     self.headers_written = False
Example #32
 def _calculate_num_docs(self):
     with utils.smart_open(self.fname) as fin:
         result = sum([1 for x in fin])
     return result
Example #33
 def my_fake_header(self, num_docs, num_terms, num_nnz):
     self.fout.close()
     self.fout = utils.smart_open(self.fname, 'r+b')
     super(MyMmWriter, self).fake_headers(num_docs, num_terms, num_nnz)
     self.fout.close()
     self.fout = utils.smart_open(self.fname, 'ab+')
Example #34
File: lowcorpus.py Project: wpli/gensim
 def _calculate_num_docs(self):
     # the first line in input data is the number of documents (integer). throws exception on bad input.
     with utils.smart_open(self.fname) as fin:
         result = int(fin.readline())
     return result
Example #35
 def __iter__(self):
     for source, prefix in self.sources.items():
         with utils.smart_open(source) as fin:
             for item_no, line in enumerate(fin):
                 yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
Example #36
    def read_doctopics(self, fname, eps=1e-6, renorm=True):
        """Get document topic vectors from MALLET's "doc-topics" format, as sparse gensim vectors.

        Parameters
        ----------
        fname : str
            Path to input file with document topics.
        eps : float, optional
            Threshold for probabilities.
        renorm : bool, optional
            If True - explicitly re-normalize distribution.

        Raises
        ------
        RuntimeError
            If any line is in an invalid format.

        Yields
        ------
        list of (int, float)
            LDA vectors for document.

        """
        mallet_version = self.get_version(self.mallet_path)
        with utils.smart_open(fname) as fin:
            for lineno, line in enumerate(fin):
                if lineno == 0 and line.startswith(b"#doc "):
                    continue  # skip the header line if it exists

                parts = line.split()[2:]  # skip "doc" and "source" columns

                # the MALLET doctopic format changed in 2.0.8 to exclude the id,
                # this handles the file differently dependent on the pattern
                if len(parts) == 2 * self.num_topics:
                    doc = [
                        (int(id_), float(weight))
                        for id_, weight in zip(*[iter(parts)] * 2)
                        if abs(float(weight)) > eps
                    ]
                elif len(parts) == self.num_topics and mallet_version != '2.0.7':
                    doc = [
                        (id_, float(weight))
                        for id_, weight in enumerate(parts)
                        if abs(float(weight)) > eps
                    ]
                else:
                    if mallet_version == "2.0.7":
                        """

                            1   1   0   1.0780612802674239  30.005575655428533364   2   0.005575655428533364
                            2   2   0   0.9184413079632608  40.009062076892971008   3   0.009062076892971008
                            In the above example there is a mix of the above if and elif statement.
                            There are neither `2*num_topics` nor `num_topics` elements.
                            It has 2 formats 40.009062076892971008 and 0   1.0780612802674239
                            which cannot be handled by above if elif.
                            Also, there are some topics are missing(meaning that the topic is not there)
                            which is another reason why the above if elif fails even when the `mallet`
                            produces the right results

                        """
                        count = 0
                        doc = []
                        if len(parts) > 0:
                            while count < len(parts):
                                """
                                if section is to deal with formats of type 2 0.034
                                so if count reaches index of 2 and since int(2) == float(2) so if block is executed
                                now  there is one extra element afer 2, so count + 1 access should not give an error

                                else section handles  formats of type 20.034
                                now count is there on index of 20.034 since float(20.034) != int(20.034) so else block
                                is executed

                                """
                                if float(parts[count]) == int(parts[count]):
                                    if float(parts[count + 1]) > eps:
                                        doc.append((int(parts[count]), float(parts[count + 1])))
                                    count += 2
                                else:
                                    if float(parts[count]) - int(parts[count]) > eps:
                                        doc.append((int(parts[count]) % 10, float(parts[count]) - int(parts[count])))
                                    count += 1
                    else:
                        raise RuntimeError(
                            "invalid doc topics format at line %i in %s" %
                            (lineno + 1, fname))

                if renorm:
                    # explicitly normalize weights to sum up to 1.0, just to be sure...
                    total_weight = float(sum([weight for _, weight in doc]))
                    if total_weight:
                        doc = [(id_, float(weight) / total_weight)
                               for id_, weight in doc]
                yield doc
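A hedged sketch of consuming these vectors from a trained wrapper (the `lda` instance and the `fdoctopics` helper follow the pre-4.0 LdaMallet API shown earlier):

# `lda` is a trained LdaMallet instance (assumption); fdoctopics() returns the doc-topics file MALLET wrote.
for doc_topics in lda.read_doctopics(lda.fdoctopics()):
    print(doc_topics)  # sparse list of (topic_id, probability) pairs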
Example #37
def load_word2vec_format(cls=gensim.models.KeyedVectors,
                         fname='',
                         fvocab=None,
                         binary=False,
                         encoding='utf8',
                         unicode_errors='strict',
                         limit=None,
                         datatype=REAL):
    """


        Load the input-hidden weight matrix from the original C word2vec-tool format.
        Note that the information stored in the file is incomplete (the binary tree is missing),
        so while you can query for word similarity etc., you cannot continue training
        with a model loaded this way.
        `binary` is a boolean indicating whether the data is in binary word2vec format.
        `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
        Word counts are read from `fvocab` filename, if set (this is the file generated
        by `-save-vocab` flag of the original C tool).
        If you trained the C model using non-utf8 encoding for words, specify that
        encoding in `encoding`.
        `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors`
        argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
        file may include word tokens truncated in the middle of a multibyte unicode character
        (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.
        `limit` sets a maximum number of word-vectors to read from the file. The default,
        None, means read all.
        `datatype` (experimental) can coerce dimensions to a non-default float type (such
        as np.float16) to save memory. (Such types may result in much slower bulk operations
        or incompatibility with optimized routines.)
        """
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s", fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s", fname)
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = map(
            int, header.split())  # throws for invalid file format
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls()
        result.vector_size = vector_size
        result.syn0 = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            word_id = len(result.vocab)
            if word in result.vocab:
                logger.warning(
                    "duplicate word '%s' in %s, ignoring all but first", word,
                    fname)
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id,
                                           count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                logger.warning(
                    "vocabulary file is incomplete: '%s' is missing", word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.syn0[word_id] = weights
            result.index2word.append(word)

        if binary:
            # TODO: delegate
            pass
        else:
            for line_no in xrange(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError(
                        "unexpected end of input; is count incorrect or file otherwise damaged?"
                    )
                if '"' in utils.to_unicode(line,
                                           encoding=encoding,
                                           errors=unicode_errors):
                    line = utils.to_unicode(line,
                                            encoding=encoding,
                                            errors=unicode_errors)
                    label = line.split('"', 1)[1].rsplit('"')[0].strip()
                    other = line.rsplit('"', 1)[1].strip().split(' ')
                    parts = [label] + other
                else:
                    parts = utils.to_unicode(line.rstrip(),
                                             encoding=encoding,
                                             errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError(
                        "invalid vector on line %s (is this really the text format?)"
                        % (line_no))

                word, weights = parts[0], list(map(REAL, parts[1:]))
                add_word(word, weights)

    if result.syn0.shape[0] != len(result.vocab):
        logger.info(
            "duplicate words detected, shrinking matrix size from %i to %i",
            result.syn0.shape[0], len(result.vocab))
        result.syn0 = ascontiguousarray(result.syn0[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.syn0.shape

    logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
    return result
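In current gensim the supported entry point for this format is KeyedVectors; a hedged sketch (path and limit are illustrative):

from gensim.models import KeyedVectors  # assumes a reasonably recent gensim

kv = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True, limit=500000)
print(kv.most_similar('hello', topn=3))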
Example #38
def load_word2vec_format(fname,
                         fvocab=None,
                         binary=False,
                         norm_only=True,
                         encoding='utf8'):
    """
    !!! Code modified from gensim.models.Word2Vec.load_word2vec_format: original version cannot
    load files created by original C word2vec if vocabulary contains words which are not correct
    Unicode byte sequences - this could happen due to corpora encoding issues. !!!

    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    `binary` is a boolean indicating whether the data is in binary word2vec format.
    `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by `-save-vocab` flag of the original C tool).

    If you trained the C model using non-utf8 encoding for words, specify that
    encoding in `encoding`.

    """
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s" % (fvocab))
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s" % (fname))
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = map(
            int, header.split())  # throws for invalid file format
        result = Word2Vec(size=vector_size)
        result.syn0 = zeros((vocab_size, vector_size), dtype=REAL)
        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for line_no in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                try:
                    word = utils.to_unicode(b''.join(word), encoding=encoding)
                except UnicodeDecodeError as e:
                    logger.warning(
                        "Couldn't convert whole word to unicode: trying to convert first %d bytes only ..."
                        % e.start)
                    word = utils.to_unicode(b''.join(word[:e.start]),
                                            encoding=encoding)
                    logger.warning("... first %d bytes converted to '%s'" %
                                   (e.start, word))

                if counts is None:
                    result.vocab[word] = Vocab(index=line_no,
                                               count=vocab_size - line_no)
                elif word in counts:
                    result.vocab[word] = Vocab(index=line_no,
                                               count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.vocab[word] = Vocab(index=line_no, count=None)
                result.index2word.append(word)
                result.syn0[line_no] = fromstring(fin.read(binary_len),
                                                  dtype=REAL)
        else:
Example #39
 def load_binary_data(self, encoding='utf8'):
     """Loads data from the output binary file created by FastText training"""
     with utils.smart_open(self.file_name, 'rb') as f:
         self.load_model_params(f)
         self.load_dict(f, encoding=encoding)
         self.load_vectors(f)
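load_binary_data is normally driven by the wrapper's higher-level loader; a hedged sketch with the pre-4.0 wrapper API (the model prefix is illustrative):

from gensim.models.wrappers import FastText  # removed in gensim 4.0

model = FastText.load_fasttext_format('/tmp/ft_model')  # expects the .bin/.vec files written by fastText
print(model.wv['example'])  # vectors are available even for out-of-vocabulary words via character n-grams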
Example #40
    def load_word2vec_format(cls, fname, fvocab=None, binary=False, norm_only=True):
        """
        Load the input-hidden weight matrix from the original C word2vec-tool format.

        Note that the information stored in the file is incomplete (the binary tree is missing),
        so while you can query for word similarity etc., you cannot continue training
        with a model loaded this way.

        `binary` is a boolean indicating whether the data is in binary word2vec format.
        `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
        Word counts are read from `fvocab` filename, if set (this is the file generated
        by `-save-vocab` flag of the original C tool).
        """
        counts = None
        if fvocab is not None:
            logger.info("loading word counts from %s" % (fvocab))
            counts = {}
            with utils.smart_open(fvocab) as fin:
                for line in fin:
                    word, count = utils.to_unicode(line).strip().split()
                    counts[word] = int(count)

        logger.info("loading projection weights from %s" % (fname))
        with utils.smart_open(fname) as fin:
            header = utils.to_unicode(fin.readline())
            vocab_size, layer1_size = map(int, header.split())  # throws for invalid file format
            result = Word2Vec(size=layer1_size)
            result.syn0 = zeros((vocab_size, layer1_size), dtype=REAL)
            if binary:
                binary_len = dtype(REAL).itemsize * layer1_size
                for line_no in xrange(vocab_size):
                    # mixed text and binary: read text first, then binary
                    word = []
                    while True:
                        ch = fin.read(1)
                        if ch == b' ':
                            break
                        if ch != b'\n':  # ignore newlines in front of words (some binary files have newline, some don't)
                            word.append(ch)
                    word = utils.to_unicode(b''.join(word))
                    if counts is None:
                        result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                    elif word in counts:
                        result.vocab[word] = Vocab(index=line_no, count=counts[word])
                    else:
                        logger.warning("vocabulary file is incomplete")
                        result.vocab[word] = Vocab(index=line_no, count=None)
                    result.index2word.append(word)
                    result.syn0[line_no] = fromstring(fin.read(binary_len), dtype=REAL)
            else:
                for line_no, line in enumerate(fin):
                    parts = utils.to_unicode(line).split()
                    if len(parts) != layer1_size + 1:
                        raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                    word, weights = parts[0], map(REAL, parts[1:])
                    if counts is None:
                        result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                    elif word in counts:
                        result.vocab[word] = Vocab(index=line_no, count=counts[word])
                    else:
                        logger.warning("vocabulary file is incomplete")
                        result.vocab[word] = Vocab(index=line_no, count=None)
                    result.index2word.append(word)
                    result.syn0[line_no] = weights
        logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
        result.init_sims(norm_only)
        return result
Example #41
    def enseemble_results_extra(self, questions, topn):
        """Return a list of the results from an accuracy test."""
        ok_vocab = self.get_vocabulary()
        new_vocab = [(w, self.model.wv.vocab[w]) for w in ok_vocab]
        new_vocab = {w.upper(): v for w, v in new_vocab}
        new_vocab = dict(new_vocab)

        results = []
        for line_no, line in enumerate(utils.smart_open(questions)):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line)
            if line.startswith(': '):
                continue
            else:

                try:
                    a, b, c, expected = [word.upper() for word in line.split()]
                except ValueError:
                    logger.info("skipping invalid line #%i in %s", line_no,
                                questions)
                    continue
                if a not in new_vocab or b not in new_vocab or c not in new_vocab or expected not in new_vocab:
                    """if a not in new_vocab:
                        print("Dont know: " + a)
                    if b not in new_vocab:
                        print("Dont know: " + b)
                    if c not in new_vocab:
                        print("Dont know: " + c)
                    if expected not in new_vocab:
                        print("Dont know: " + expected)
                    """
                    logger.debug("skipping line #%i with OOV words: %s",
                                 line_no, line.strip())
                    results.append(None)
                    continue

                original_vocab = self.get_vocabulary()
                self.set_vocabulary(new_vocab)
                ignore = {a, b, c}  # input words to be ignored
                #print('topn')
                #print(topn)
                # find the most likely prediction, ignoring OOV words and input words
                sims = self.most_similar(positive_words=[b, c],
                                         negative_words=[a],
                                         topn=topn)
                # print("sims")
                #print(sims)
                self.set_vocabulary(original_vocab)
                inner_results = []
                for predict in sims:
                    predicted = predict[0]
                    predicted = predicted.upper()
                    predicted_tuple = (predicted, predict[1])
                    #print(predicted_tuple)
                    inner_results.append(predicted_tuple)
                    #print(predicted)
                results.append(inner_results)
        #print(results)

        return results
Example #42
    def special_danish_accuracy(self, questions):
        """
        Compute accuracy of the model. `questions` is a filename where lines are
        4-tuples of words, split into sections by ": SECTION NAME" lines.
        See questions-words.txt in
        https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip
        for an example.

        The accuracy is reported (=printed to log and returned as a list) for each
        section separately, plus there's one aggregate summary at the end.

        Use `restrict_vocab` to ignore all questions containing a word not in the first `restrict_vocab`
        words (default 30,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
        In case `case_insensitive` is True, the first `restrict_vocab` words are taken first, and then
        case normalization is performed.

        Use `case_insensitive` to convert all words in questions and vocab to their uppercase form before
        evaluating the accuracy (default True). Useful in case of case-mismatch between training tokens
        and question words. In case of multiple case variants of a single word, the vector for the first
        occurrence (also the most frequent if vocabulary is sorted) is taken.

        This method corresponds to the `compute-accuracy` script of the original C word2vec.

        """
        ok_vocab = self.get_vocabulary()
        print("ok vocab")
        #print(ok_vocab)
        new_vocab = [(w, self.model.wv.vocab[w]) for w in ok_vocab]
        print("not dict")
        #new_vocab = [w.upper() for w in ok_vocab]
        #print(new_vocab)
        new_vocab = {w.upper(): v for w, v in new_vocab}
        new_vocab = dict(new_vocab)
        #print(new_vocab)

        sections, section = [], None
        wrong_predictions = []
        for line_no, line in enumerate(utils.smart_open(questions)):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    self.log_accuracy(section)
                section = {
                    'section': line.lstrip(': ').strip(),
                    'correct': [],
                    'incorrect': []
                }
            else:
                if not section:
                    raise ValueError(
                        "missing section header before line #%i in %s" %
                        (line_no, questions))
                try:
                    a, b, c, d, e, expected = [
                        word.upper() for word in line.split()
                    ]
                except ValueError:
                    logger.info("skipping invalid line #%i in %s", line_no,
                                questions)
                    continue
                if a not in new_vocab or b not in new_vocab or c not in new_vocab or d not in new_vocab or e not in new_vocab or expected not in new_vocab:
                    #print('not in vocab')
                    logger.debug("skipping line #%i with OOV words: %s",
                                 line_no, line.strip())
                    continue

                original_vocab = self.get_vocabulary()
                self.set_vocabulary(new_vocab)
                ignore = {a, b, c, d, e}  # input words to be ignored

                # find the most likely prediction, ignoring OOV words and input words
                sims = self.most_similar(positive_words=[c, d, e],
                                         negative_words=[a, b])
                #print("sims")
                #print(sims)
                self.set_vocabulary(original_vocab)

                predicted = sims[0][0]
                predicted = predicted.upper()
                #print(predicted)
                if predicted == expected:
                    section['correct'].append((a, b, c, d, e, expected))
                else:
                    wrong_message = a + " " + b + " " + c + " " + d + " " + e + ", predicted: " + predicted + ", should have been: " + expected
                    section['incorrect'].append((a, b, c, d, e, expected))
                    wrong_predictions.append(wrong_message)
        if section:
            # store the last section, too
            sections.append(section)
            self.log_accuracy(section)

        total = {
            'section': 'total',
            'correct': sum((s['correct'] for s in sections), []),
            'incorrect': sum((s['incorrect'] for s in sections), []),
        }
        self.log_accuracy(total)
        sections.append(total)
        print(wrong_predictions)
        return sections
Example #43
data_prefix = data_path.split('/')[-1].split('.')[0]

""" Loading trained Doc2Vec model """
windowsize = int(sys.argv[1])
dimension = int(sys.argv[4])
nepoch = int(sys.argv[2])
mode = sys.argv[3]
name_tuple = ( data_prefix.strip('DATA').lower(), windowsize, nepoch )
model = Doc2Vec.load('./models/' + mode + '/' + str(dimension) + 'd' + '/semeval-%s-lc-ns-%dw-%de.d2v' % name_tuple)

nsamp = 0
sqerr = 0.0
nsqerr = 0.0
sentences = []
with utils.smart_open(data_path) as fin:
    for item_no, line in enumerate(fin):
        sentences.append(line)
        words = preprocessor(line)
        model_v = model.docvecs[ data_prefix + '_%s' % item_no ]
        infer_v = model.infer_vector(words)
        sim = dot(model_v, infer_v)
        sqerr += ( ( 1 - sim ) * ( 1 - sim ) )
        model_v /= norm(model_v)
        infer_v /= norm(infer_v)
        sim = dot(model_v, infer_v)
        nsqerr += ( ( 1 - sim ) * ( 1 - sim ) )
        nsamp += 1

rsqerr = 0.0
rnsqerr = 0.0
Example #44
0
    def intersect_word2vec_format(self,
                                  fname,
                                  lockf=0.0,
                                  binary=False,
                                  encoding='utf8',
                                  unicode_errors='strict'):
        """
            Merge the input-hidden weight matrix from the original C word2vec-tool format
            given, where it intersects with the current vocabulary. (No words are added to the
            existing vocabulary, but intersecting words adopt the file's weights, and
            non-intersecting words are left alone.)

            `binary` is a boolean indicating whether the data is in binary word2vec format.

            `lockf` is a lock-factor value to be set for any imported word-vectors; the
            default value of 0.0 prevents further updating of the vector during subsequent
            training. Use 1.0 to allow further training updates of merged vectors.
            """
        overlap_count = 0
        logger.info("loading projection weights from %s" % (fname))
        with utils.smart_open(fname) as fin:
            header = utils.to_unicode(fin.readline(), encoding=encoding)
            vocab_size, vector_size = map(
                int, header.split())  # throws for invalid file format
            if vector_size != self.vector_size:
                # TOCONSIDER: maybe mismatched vectors are still useful enough to merge (truncating/padding)?
                raise ValueError("incompatible vector size %d in file %s" %
                                 (vector_size, fname))
            if binary:
                binary_len = dtype(REAL).itemsize * vector_size
                for line_no in xrange(vocab_size):
                    # mixed text and binary: read text first, then binary
                    word = []
                    while True:
                        ch = fin.read(1)
                        if ch == b' ':
                            break
                        if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                            word.append(ch)
                    word = utils.to_unicode(b''.join(word),
                                            encoding=encoding,
                                            errors=unicode_errors)
                    weights = fromstring(fin.read(binary_len), dtype=REAL)
                    if word in self.wv.vocab:
                        overlap_count += 1
                        self.wv.syn0[self.wv.vocab[word].index] = weights
                        # lock-factor: 0.0 stops further changes to this vector
                        self.syn0_lockf[self.wv.vocab[word].index] = lockf
            else:
                for line_no, line in enumerate(fin):
                    parts = utils.to_unicode(line.rstrip(),
                                             encoding=encoding,
                                             errors=unicode_errors).split(" ")
                    if len(parts) != vector_size + 1:
                        raise ValueError(
                            "invalid vector on line %s (is this really the text format?)"
                            % (line_no))
                    word, weights = parts[0], list(map(REAL, parts[1:]))
                    if word in self.wv.vocab:
                        overlap_count += 1
                        self.wv.syn0[self.wv.vocab[word].index] = weights
        logger.info("merged %d vectors into %s matrix from %s" %
                    (overlap_count, self.wv.syn0.shape, fname))
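
# A minimal usage sketch for the method above (assumptions: an older, pre-4.0
# gensim release where Word2Vec still exposes intersect_word2vec_format(),
# `size=` and `model.iter`; the GoogleNews .bin path is only a placeholder for
# any pretrained 300-dimensional word2vec file):
from gensim.models import Word2Vec

sentences = [['human', 'computer', 'interface'], ['graph', 'trees', 'survey']]
model = Word2Vec(size=300, min_count=1)
model.build_vocab(sentences)
# seed overlapping words with the pretrained weights; lockf=1.0 lets them keep training
model.intersect_word2vec_format('GoogleNews-vectors-negative300.bin', lockf=1.0, binary=True)
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)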
Example #45
0
# imports assumed by this snippet (wn = NLTK's WordNet interface)
from itertools import product

from nltk.corpus import wordnet as wn
from scipy import stats
from gensim import utils


def evaluate_synsets(emb_model, pairs, flag_emb, flag_nv, our_logger, delimiter='\t', dummy4unknown=False):
    ok_vocab = [(w, emb_model.vocab[w]) for w in emb_model.index2word]
    ok_vocab = dict(ok_vocab)

    similarity_gold = []
    similarity_model = []
    oov = 0

    original_vocab = emb_model.vocab
    emb_model.vocab = ok_vocab

    for line_no, line in enumerate(utils.smart_open(pairs)):
        line = utils.to_unicode(line)
        if line.startswith('#'):
            # May be a comment
            continue
        else:
            try:
                a, b, sim = [word for word in line.split(delimiter)]
                sim = float(sim)
            except (ValueError, TypeError):
                our_logger.info('Skipping invalid line #%d in %s', line_no, pairs)
                continue

            # Finding correct synsets
            if flag_nv:
                synsets_a = wn.synsets(a.strip(), 'n')
                synsets_b = wn.synsets(b.strip(), 'n')
            else:
                synsets_a = wn.synsets(a.strip(), 'v')
                synsets_b = wn.synsets(b.strip(), 'v')

            if not synsets_a or not synsets_b:
                oov += 1
                if dummy4unknown:
                    our_logger.debug('Zero similarity for line #%d with words with no synsets: %s',
                                     line_no, line.strip())
                    similarity_model.append(0.0)
                    similarity_gold.append(sim)
                    continue
                else:
                    our_logger.debug('Skipping line #%d with words with no synsets: %s',
                                     line_no, line.strip())
                    continue

            best_pair = None
            best_sim = 0.0
            for pair in product(synsets_a, synsets_b):
                if flag_emb:
                    possible_similarity = emb_model.similarity(pair[0].lemmas()[0].key(), pair[1].lemmas()[0].key())
                else:
                    possible_similarity = emb_model.similarity(pair[0].name(), pair[1].name())
                if possible_similarity > best_sim:
                    best_pair = pair
                    best_sim = possible_similarity
            our_logger.debug('Original words: %s', line.strip())
            our_logger.debug('Synsets chosen: %s with similarity %f', best_pair, best_sim)
            similarity_model.append(best_sim)  # Similarity from the model
            similarity_gold.append(sim)  # Similarity from the dataset

    emb_model.vocab = original_vocab
    spearman = stats.spearmanr(similarity_gold, similarity_model)
    pearson = stats.pearsonr(similarity_gold, similarity_model)
    if dummy4unknown:
        oov_ratio = float(oov) / len(similarity_gold) * 100
    else:
        oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100

    our_logger.debug('Pearson correlation coefficient against %s: %f with p-value %f',
                     pairs, pearson[0], pearson[1])
    our_logger.debug(
        'Spearman rank-order correlation coefficient against %s: %f with p-value %f',
        pairs, spearman[0], spearman[1])
    our_logger.debug('Pairs with unknown words: %d', oov)
    return pearson, spearman, oov_ratio
Example #46
0
from gensim import utils
import json
import re
import logging
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s',
                    level=logging.INFO)

# iterate over the plain text data we just created
output = open('/data/yechen/bert/wiki.en.article.txt', 'w', encoding='utf8')
exclude_sections = {
    'See also', 'References', 'Further reading', 'External links', 'Sources',
    'Bibliography'
}
with utils.smart_open(
        '/data/yechen/bert/enwiki-20201101-pages-articles-multistream.json.gz',
        'rb') as f:
    numart = 0
    numsec = 0
    for line in f:
        output_text = ''
        numart = numart + 1
        article = json.loads(line)
        section_titles = article['section_titles']
        if not section_titles:
            continue
        for i, section_text in enumerate(article['section_texts']):
            if section_titles[i] in exclude_sections:
                continue
            numsec = numsec + 1
Example #47
0
def read_file(path):
    with utils.smart_open(path) as fin:
        return fin.read()

    def _get_text_from_test_data(self, file):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
        with utils.smart_open(os.path.join(pre_path, file), mode="r") as f:
            return f.read()
Example #49
0
    def load_binary_data(self, model_binary_file):
        """Loads data from the output binary file created by FastText training."""
        with utils.smart_open(model_binary_file, 'rb') as f:
            self.load_model_params(f)
            self.load_dict(f)
            self.load_vectors(f)

    def save_as_text(self, fname, sort_by_word=True):
        """Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.

        Parameters
        ----------
        fname : str
            Path to output file.
        sort_by_word : bool, optional
            Sort words in lexicographical order before writing them out?

        Notes
        -----
        Format::

            num_docs
            id_1[TAB]word_1[TAB]document_frequency_1[NEWLINE]
            id_2[TAB]word_2[TAB]document_frequency_2[NEWLINE]
            ....
            id_k[TAB]word_k[TAB]document_frequency_k[NEWLINE]

        This text format is great for corpus inspection and debugging. As plaintext, it's also easily portable
        to other tools and frameworks. For better performance and to store the entire object state,
        including collected corpus statistics, use :meth:`~gensim.corpora.dictionary.Dictionary.save` and
        :meth:`~gensim.corpora.dictionary.Dictionary.load` instead.

        See Also
        --------
        :meth:`~gensim.corpora.dictionary.Dictionary.load_from_text`
            Load :class:`~gensim.corpora.dictionary.Dictionary` from text file.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import Dictionary
            >>> from gensim.test.utils import get_tmpfile
            >>>
            >>> tmp_fname = get_tmpfile("dictionary")
            >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
            >>>
            >>> dct = Dictionary(corpus)
            >>> dct.save_as_text(tmp_fname)
            >>>
            >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
            >>> assert dct.token2id == loaded_dct.token2id

        """
        logger.info("saving dictionary mapping to %s", fname)
        with utils.smart_open(fname, 'wb') as fout:
            numdocs_line = "%d\n" % self.num_docs
            fout.write(utils.to_utf8(numdocs_line))
            if sort_by_word:
                for token, tokenid in sorted(iteritems(self.token2id)):
                    line = "%i\t%s\t%i\n" % (tokenid, token,
                                             self.dfs.get(tokenid, 0))
                    fout.write(utils.to_utf8(line))
            else:
                for tokenid, freq in sorted(iteritems(self.dfs),
                                            key=lambda item: -item[1]):
                    line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                    fout.write(utils.to_utf8(line))
Example #51
0
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Save a corpus in the Mallet format.

        Warnings
        --------
        This function is automatically called by :meth:`gensim.corpora.malletcorpus.MalletCorpus.serialize`,
        don't call it directly, call :meth:`gensim.corpora.malletcorpus.MalletCorpus.serialize` instead.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, int)
            Corpus in BoW format.
        id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
            Mapping between word_ids (integers) and words (strings).
            If not provided, the mapping is constructed directly from `corpus`.
        metadata : bool, optional
            If True, each `corpus` item is expected to be a `(doc, (doc_id, doc_lang))` 2-tuple,
            and the stored document id and language are taken from that metadata instead of
            being generated.

        Returns
        -------
        list of int
            List of offsets in resulting file for each document (in bytes),
            can be used for :meth:`~gensim.corpora.malletcorpus.MalletCorpus.docbyoffset`.

        Notes
        -----
        The document id will be generated by enumerating the corpus.
        That is, it will range between 0 and number of documents in the corpus.

        Since Mallet has a language field in the format, this defaults to the string '__unknown__'.
        If the language needs to be saved, post-processing will be required.

        """
        if id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in Mallet format into %s", fname)

        truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            for doc_id, doc in enumerate(corpus):
                if metadata:
                    doc_id, doc_lang = doc[1]
                    doc = doc[0]
                else:
                    doc_lang = '__unknown__'

                words = []
                for wordid, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] *
                                 int(value))
                offsets.append(fout.tell())
                fout.write(
                    utils.to_utf8('%s %s %s\n' %
                                  (doc_id, doc_lang, ' '.join(words))))

        if truncated:
            logger.warning(
                "Mallet format can only save vectors with integer elements; "
                "%i float entries were truncated to integer value", truncated)

        return offsets
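
# A minimal usage sketch (assumption: serialize() is the public entry point that
# calls save_corpus() under the hood, as the warning above says; the output path
# and the toy corpus are placeholders):
from gensim.corpora.malletcorpus import MalletCorpus

toy_corpus = [[(0, 2), (1, 1)], [(1, 1), (2, 3)]]
id2word = {0: 'human', 1: 'computer', 2: 'interface'}
MalletCorpus.serialize('/tmp/toy_corpus.mallet', toy_corpus, id2word=id2word)
# read the documents back in BoW form
for doc in MalletCorpus('/tmp/toy_corpus.mallet'):
    print(doc)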
Example #52
0
    def evaluate_word_pairs(self,
                            pairs,
                            delimiter='\t',
                            restrict_vocab=300000,
                            case_insensitive=True,
                            dummy4unknown=False):
        """
        Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where
        lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter`.
        An example dataset is included in Gensim (test/test_data/wordsim353.tsv). More datasets can be found at
        http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html or https://www.cl.cam.ac.uk/~fh295/simlex.html.

        The model is evaluated using Pearson correlation coefficient and Spearman rank-order correlation coefficient
        between the similarities from the dataset and the similarities produced by the model itself.
        The results are printed to log and returned as a triple (pearson, spearman, ratio of pairs with unknown words).

        Use `restrict_vocab` to ignore all word pairs containing a word not in the first `restrict_vocab`
        words (default 300,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
        If `case_insensitive` is True, the first `restrict_vocab` words are taken, and then case normalization
        is performed.

        Use `case_insensitive` to convert all words in the pairs and vocab to their uppercase form before
        evaluating the model (default True). Useful when you expect case-mismatch between training tokens
        and words pairs in the dataset. If there are multiple case variants of a single word, the vector for the first
        occurrence (also the most frequent if vocabulary is sorted) is taken.

        Use `dummy4unknown=True` to produce zero-valued similarities for pairs with out-of-vocabulary words.
        Otherwise (default False), these pairs are skipped entirely.
        """
        ok_vocab = [(w, self.vocab[w])
                    for w in self.index2word[:restrict_vocab]]
        ok_vocab = dict((w.upper(), v) for w, v in reversed(
            ok_vocab)) if case_insensitive else dict(ok_vocab)

        similarity_gold = []
        similarity_model = []
        oov = 0

        original_vocab = self.vocab
        self.vocab = ok_vocab

        for line_no, line in enumerate(utils.smart_open(pairs)):
            line = utils.to_unicode(line)
            if line.startswith('#'):
                # May be a comment
                continue
            else:
                try:
                    if case_insensitive:
                        a, b, sim = [
                            word.upper() for word in line.split(delimiter)
                        ]
                    else:
                        a, b, sim = [word for word in line.split(delimiter)]
                    sim = float(sim)
                except ValueError:
                    logger.info('skipping invalid line #%d in %s', line_no,
                                pairs)
                    continue
                if a not in ok_vocab or b not in ok_vocab:
                    oov += 1
                    if dummy4unknown:
                        similarity_model.append(0.0)
                        similarity_gold.append(sim)
                        continue
                    else:
                        logger.debug('skipping line #%d with OOV words: %s',
                                     line_no, line.strip())
                        continue
                similarity_gold.append(sim)  # Similarity from the dataset
                similarity_model.append(self.similarity(
                    a, b))  # Similarity from the model
        self.vocab = original_vocab
        spearman = stats.spearmanr(similarity_gold, similarity_model)
        pearson = stats.pearsonr(similarity_gold, similarity_model)
        oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100

        logger.debug(
            'Pearson correlation coefficient against %s: %f with p-value %f',
            pairs, pearson[0], pearson[1])
        logger.debug(
            'Spearman rank-order correlation coefficient against %s: %f with p-value %f',
            pairs, spearman[0], spearman[1])
        logger.debug('Pairs with unknown words: %d' % oov)
        self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs)
        return pearson, spearman, oov_ratio
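
# A minimal usage sketch (assumptions: a gensim release where KeyedVectors exposes
# evaluate_word_pairs() and ships wordsim353.tsv in its test data, as the docstring
# above notes; 'vectors.bin' is a placeholder path):
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

kv = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)
pearson, spearman, oov_ratio = kv.evaluate_word_pairs(datapath('wordsim353.tsv'))
print("Spearman rho=%.3f (p=%.3g), %.1f%% OOV" % (spearman[0], spearman[1], oov_ratio))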
    def testLineSentenceWorksWithCompressedFile(self):
        """Does LineSentence work with a compressed file object argument?"""
        with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
            sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2')))
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #54
0
def _load_word2vec_format(cls,
                          fname,
                          fvocab=None,
                          binary=False,
                          encoding='utf8',
                          unicode_errors='strict',
                          limit=None,
                          datatype=REAL):
    """Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    Parameters
    ----------
    fname : str
        The file path to the saved word2vec-format file.
    fvocab : str, optional
        File path to the vocabulary. Word counts are read from `fvocab` filename, if set
        (this is the file generated by `-save-vocab` flag of the original C tool).
    binary : bool, optional
        If True, the data is in the binary word2vec format.
    encoding : str, optional
        If you trained the C model using non-utf8 encoding for words, specify that encoding in `encoding`.
    unicode_errors : str, optional
        default 'strict', is a string suitable to be passed as the `errors`
        argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
        file may include word tokens truncated in the middle of a multibyte unicode character
        (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.
    limit : int, optional
        Sets a maximum number of word-vectors to read from the file. The default,
        None, means read all.
    datatype : type, optional
        (Experimental) Can coerce dimensions to a non-default float type (such as `np.float16`) to save memory.
        Such types may result in much slower bulk operations or incompatibility with optimized routines.

    Returns
    -------
    object
        Returns the loaded model as an instance of :class:`cls`.

    """
    from gensim.models.keyedvectors import Vocab
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s", fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s", fname)
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split()
                                   )  # throws for invalid file format
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls(vector_size)
        result.vector_size = vector_size
        result.vectors = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            word_id = len(result.vocab)
            if word in result.vocab:
                logger.warning(
                    "duplicate word '%s' in %s, ignoring all but first", word,
                    fname)
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id,
                                           count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                logger.warning(
                    "vocabulary file is incomplete: '%s' is missing", word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.vectors[word_id] = weights
            result.index2word.append(word)

        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError(
                            "unexpected end of input; is count incorrect or file otherwise damaged?"
                        )
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word),
                                        encoding=encoding,
                                        errors=unicode_errors)
                with utils.ignore_deprecation_warning():
                    # TODO use frombuffer or something similar
                    weights = fromstring(fin.read(binary_len),
                                         dtype=REAL).astype(datatype)
                add_word(word, weights)
        else:
            for line_no in xrange(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError(
                        "unexpected end of input; is count incorrect or file otherwise damaged?"
                    )
                parts = utils.to_unicode(line.rstrip(),
                                         encoding=encoding,
                                         errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError(
                        "invalid vector on line %s (is this really the text format?)"
                        % line_no)
                word, weights = parts[0], [datatype(x) for x in parts[1:]]
                add_word(word, weights)
    if result.vectors.shape[0] != len(result.vocab):
        logger.info(
            "duplicate words detected, shrinking matrix size from %i to %i",
            result.vectors.shape[0], len(result.vocab))
        result.vectors = ascontiguousarray(result.vectors[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.vectors.shape

    logger.info("loaded %s matrix from %s", result.vectors.shape, fname)
    return result
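
# A minimal usage sketch of the public wrapper that delegates to this helper
# (assumption: KeyedVectors.load_word2vec_format forwards `limit` and `datatype`;
# 'vectors.bin' is a placeholder path):
import numpy as np
from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format('vectors.bin', binary=True,
                                        limit=500000, datatype=np.float32)
print(kv.vectors.shape)  # at most (500000, vector_size)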
    def __iter__(self):
        with utils.smart_open(self.source) as fin:
            for item_no, line in enumerate(fin):
                text = line.rsplit(None, 1)[0]
                yield TaggedDocument(text.split(), [item_no])
Example #56
0
    def accuracy(self,
                 questions,
                 restrict_vocab=30000,
                 most_similar=most_similar,
                 case_insensitive=True):
        """
        Compute accuracy of the model. `questions` is a filename where lines are
        4-tuples of words, split into sections by ": SECTION NAME" lines.
        See questions-words.txt in https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip for an example.

        The accuracy is reported (=printed to log and returned as a list) for each
        section separately, plus there's one aggregate summary at the end.

        Use `restrict_vocab` to ignore all questions containing a word not in the first `restrict_vocab`
        words (default 30,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
        In case `case_insensitive` is True, the first `restrict_vocab` words are taken first, and then
        case normalization is performed.

        Use `case_insensitive` to convert all words in questions and vocab to their uppercase form before
        evaluating the accuracy (default True). Useful in case of case-mismatch between training tokens
        and question words. In case of multiple case variants of a single word, the vector for the first
        occurrence (also the most frequent if vocabulary is sorted) is taken.

        This method corresponds to the `compute-accuracy` script of the original C word2vec.

        """
        ok_vocab = [(w, self.vocab[w])
                    for w in self.index2word[:restrict_vocab]]
        ok_vocab = dict((w.upper(), v) for w, v in reversed(
            ok_vocab)) if case_insensitive else dict(ok_vocab)

        sections, section = [], None
        for line_no, line in enumerate(utils.smart_open(questions)):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    self.log_accuracy(section)
                section = {
                    'section': line.lstrip(': ').strip(),
                    'correct': [],
                    'incorrect': []
                }
            else:
                if not section:
                    raise ValueError(
                        "missing section header before line #%i in %s" %
                        (line_no, questions))
                try:
                    if case_insensitive:
                        a, b, c, expected = [
                            word.upper() for word in line.split()
                        ]
                    else:
                        a, b, c, expected = [word for word in line.split()]
                except ValueError:
                    logger.info("skipping invalid line #%i in %s" %
                                (line_no, questions))
                    continue
                if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                    logger.debug("skipping line #%i with OOV words: %s" %
                                 (line_no, line.strip()))
                    continue

                original_vocab = self.vocab
                self.vocab = ok_vocab
                ignore = set([a, b, c])  # input words to be ignored
                predicted = None
                # find the most likely prediction, ignoring OOV words and input words
                sims = most_similar(self,
                                    positive=[b, c],
                                    negative=[a],
                                    topn=False,
                                    restrict_vocab=restrict_vocab)
                self.vocab = original_vocab
                for index in matutils.argsort(sims, reverse=True):
                    predicted = (self.index2word[index].upper()
                                 if case_insensitive else self.index2word[index])
                    if predicted in ok_vocab and predicted not in ignore:
                        if predicted != expected:
                            logger.debug("%s: expected %s, predicted %s",
                                         line.strip(), expected, predicted)
                        break
                if predicted == expected:
                    section['correct'].append((a, b, c, expected))
                else:
                    section['incorrect'].append((a, b, c, expected))
        if section:
            # store the last section, too
            sections.append(section)
            self.log_accuracy(section)

        total = {
            'section': 'total',
            'correct': sum((s['correct'] for s in sections), []),
            'incorrect': sum((s['incorrect'] for s in sections), []),
        }
        self.log_accuracy(total)
        sections.append(total)
        return sections
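
# A minimal usage sketch (assumptions: an older, pre-4.0 gensim release where
# KeyedVectors still exposes accuracy(); both file paths are placeholders):
from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)
sections = kv.accuracy('questions-words.txt', restrict_vocab=30000, case_insensitive=True)
total = sections[-1]
n_ok, n_bad = len(total['correct']), len(total['incorrect'])
print("analogy accuracy: %.1f%%" % (100.0 * n_ok / (n_ok + n_bad)))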
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(clean_tweet(line), [prefix + '_%s' % item_no])

    def testLineSentenceWorksWithFilename(self):
        """Does LineSentence work with a filename argument?"""
        with utils.smart_open(datapath('lee_background.cor')) as orig:
            sentences = word2vec.LineSentence(datapath('lee_background.cor'))
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #59
0
    def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar):
        """
        Compute accuracy of the model. `questions` is a filename where lines are
        4-tuples of words, split into sections by ": SECTION NAME" lines.
        See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example.

        The accuracy is reported (=printed to log and returned as a list) for each
        section separately, plus there's one aggregate summary at the end.

        Use `restrict_vocab` to ignore all questions containing a word whose frequency
        is not in the top-N most frequent words (default top 30,000).

        This method corresponds to the `compute-accuracy` script of the original C word2vec.

        """
        ok_vocab = dict(sorted(iteritems(self.vocab),
                               key=lambda item: -item[1].count)[:restrict_vocab])
        ok_index = set(v.index for v in itervalues(ok_vocab))

        sections, section = [], None
        for line_no, line in enumerate(utils.smart_open(questions)):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    self.log_accuracy(section)
                section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
            else:
                if not section:
                    raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
                try:
                    a, b, c, expected = [word.lower() for word in line.split()]  # TODO assumes vocabulary preprocessing uses lowercase, too...
                except ValueError:
                    logger.info("skipping invalid line #%i in %s" % (line_no, questions))
                    continue
                if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                    logger.debug("skipping line #%i with OOV words: %s" % (line_no, line.strip()))
                    continue

                ignore = set(self.vocab[v].index for v in [a, b, c])  # indexes of words to ignore
                predicted = None
                # find the most likely prediction, ignoring OOV words and input words
                for index in argsort(most_similar(self, positive=[b, c], negative=[a], topn=False))[::-1]:
                    if index in ok_index and index not in ignore:
                        predicted = self.index2word[index]
                        if predicted != expected:
                            logger.debug("%s: expected %s, predicted %s" % (line.strip(), expected, predicted))
                        break
                if predicted == expected:
                    section['correct'].append((a, b, c, expected))
                else:
                    section['incorrect'].append((a, b, c, expected))
        if section:
            # store the last section, too
            sections.append(section)
            self.log_accuracy(section)

        total = {
            'section': 'total',
            'correct': sum(len(s['correct']) for s in sections),
            'incorrect': sum(len(s['incorrect']) for s in sections)
        }
        self.log_accuracy(total)
        sections.append(total)
        return sections
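
# A minimal usage sketch for this older variant (assumption: a very old gensim
# Word2Vec that still exposes accuracy(); note that here the 'total' entry holds
# integer counts rather than lists of tuples; paths are placeholders):
from gensim.models import Word2Vec

model = Word2Vec.load('my_word2vec.model')
sections = model.accuracy('questions-words.txt', restrict_vocab=30000)
total = sections[-1]
print("%d correct, %d incorrect" % (total['correct'], total['incorrect']))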
Example #60
0
def load_word2vec_format(fname, fvocab=None, binary=False, norm_only=True, encoding='utf8'):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.
    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.
    `binary` is a boolean indicating whether the data is in binary word2vec format.
    `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by `-save-vocab` flag of the original C tool).
    If you trained the C model using non-utf8 encoding for words, specify that
    encoding in `encoding`.
    """
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s" % (fvocab))
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s" % (fname))
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = map(int, header.split())  # throws for invalid file format
        result = Word2Vec(size=vector_size)
        result.wv.syn0 = zeros((vocab_size, vector_size), dtype=REAL)
        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for line_no in range(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                try:
                    word = utils.to_unicode(b''.join(word), encoding=encoding)
                except UnicodeDecodeError as e:
                    logger.warning(
                        "Couldn't convert whole word to unicode: trying to convert first %d bytes only ..." % e.start)
                    word = utils.to_unicode(b''.join(word[:e.start]), encoding=encoding)
                    logger.warning("... first %d bytes converted to '%s'" % (e.start, word))

                word = word.replace('_NOUN', '').replace('_VERB', '').replace('_ADJ', '')
                if counts is None:
                    result.wv.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                elif word in counts:
                    result.wv.vocab[word] = Vocab(index=line_no, count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.wv.vocab[word] = Vocab(index=line_no, count=None)
                result.wv.index2word.append(word)
                result.wv.syn0[line_no] = fromstring(fin.read(binary_len), dtype=REAL)
        else:
            for line_no, line in enumerate(fin):
                parts = utils.to_unicode(line[:-1], encoding=encoding).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                word, weights = parts[0], list(map(REAL, parts[1:]))
                word = word.replace('_NOUN', '').replace('_VERB', '').replace('_ADJ', '')
                if counts is None:
                    result.wv.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                elif word in counts:
                    result.wv.vocab[word] = Vocab(index=line_no, count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.wv.vocab[word] = Vocab(index=line_no, count=None)
                result.wv.index2word.append(word)
                result.wv.syn0[line_no] = weights
    logger.info("loaded %s matrix from %s" % (result.wv.syn0.shape, fname))
    result.init_sims(norm_only)
    return result
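
# A minimal usage sketch for the loader above (assumption: the file holds binary
# word2vec vectors whose words carry _NOUN/_VERB/_ADJ suffixes, e.g. trained on a
# POS-tagged corpus; 'tagged-vectors.bin' is a placeholder path):
model = load_word2vec_format('tagged-vectors.bin', binary=True, norm_only=True)
print(model.wv.syn0.shape)
print(model.wv.most_similar('apple', topn=3))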