Example #1
    def __iter__(self):
        """Iterate through the lines in the source."""
        try:
            # Assume it is a file-like object and try treating it as such
            # Things that don't have seek will trigger an exception
            self.source.seek(0)
            for item_no, line in enumerate(self.source):
                # yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
                try:
                    tagged_document = TaggedDocument(
                        utils.to_unicode(line).split(), [item_no])
                except UnicodeDecodeError as e:
                    print(e)
                    continue
                yield tagged_document
        except AttributeError:
            # If it didn't work like a file, use it as a string filename
            with utils.smart_open(self.source) as fin:
                for item_no, line in enumerate(fin):
                    if len(line) > 100:
                        # yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
                        try:
                            tagged_document = TaggedDocument(
                                utils.to_unicode(line).split(), [item_no])
                        except UnicodeDecodeError as e:
                            print(e)
                            continue
                        yield tagged_document
                    else:
                        continue
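A minimal usage sketch for an iterator like the one above: assume it lives in a hypothetical `TaggedLineDocuments` class whose constructor stores `self.source`, and note that the `vector_size`/`epochs` parameter names belong to newer gensim releases (older ones used `size`/`iter`).

from gensim.models.doc2vec import Doc2Vec

documents = TaggedLineDocuments('corpus.txt')  # hypothetical wrapper class with the __iter__ shown above
model = Doc2Vec(documents, vector_size=100, min_count=2, epochs=10)
vector = model.infer_vector(['human', 'computer', 'interaction'])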
Example #2
    def __init__(self, input, transposed=True):
        """
        Initialize the matrix reader.

        The `input` refers to a file on local filesystem, which is expected to
        be in the sparse (coordinate) Matrix Market format. Documents are assumed
        to be rows of the matrix (and document features are columns).

        `input` is either a string (file path) or a file-like object that supports
        `seek()` (e.g. gzip.GzipFile, bz2.BZ2File).
        """
        logger.info("initializing corpus reader from %s" % input)
        self.input, self.transposed = input, transposed
        with utils.file_or_filename(self.input) as lines:
            try:
                header = utils.to_unicode(next(lines)).strip()
                if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                    raise ValueError("File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                                    (self.input, header))
            except StopIteration:
                pass

            self.num_docs = self.num_terms = self.num_nnz = 0
            for lineno, line in enumerate(lines):
                line = utils.to_unicode(line)
                if not line.startswith('%'):
                    self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
                    if not self.transposed:
                        self.num_docs, self.num_terms = self.num_terms, self.num_docs
                    break

        logger.info("accepted corpus with %i documents, %i features, %i non-zero entries" %
                     (self.num_docs, self.num_terms, self.num_nnz))
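For comparison, reading a Matrix Market file through gensim's public corpus class looks roughly like this; the file path is a placeholder.

from gensim.corpora import MmCorpus

corpus = MmCorpus('corpus.mm')  # hypothetical path to a Matrix Market file
print(corpus.num_docs, corpus.num_terms, corpus.num_nnz)
for doc in corpus:  # each doc is a list of (term_id, weight) tuples
    pass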
Example #3
import h5py
from numpy import dtype, float32, fromstring
from gensim import utils


def main(filename):
    # Derive the output HDF5 filename from the input name
    fn = filename.split('.')
    fn[-1] = 'hdf5'
    hdf5_filename = '.'.join(fn)

    bin_f = utils.smart_open(filename)
    hdf5_f = h5py.File(hdf5_filename, 'w')

    # word2vec binary header: "<vocab_size> <vector_size>"
    header = utils.to_unicode(bin_f.readline(), encoding='utf8')
    vocab_size, vector_size = map(int, header.split())
    binary_len = dtype(float32).itemsize * vector_size

    for line_no in range(vocab_size):
        word = []

        while True:
            ch = bin_f.read(1)
            if ch == b' ':
                break
            if ch != b'\n':
                word.append(ch)

        word = utils.to_unicode(b''.join(word), encoding='utf8', errors='strict')
        vector = fromstring(bin_f.read(binary_len), dtype=float32)

        if word[:3] == '/en':
            w = word[4:]
            hdf5_f.create_dataset(w, data=vector)
            print(w)

    bin_f.close()
    hdf5_f.close()
Example #4
 def __iter__(self):
     """Iterate through the lines in the source."""
     try:
         # Assume it is a file-like object and try treating it as such
         # Things that don't have seek will trigger an exception
         self.source.seek(0)
         for line in self.source:
             yield utils.to_unicode(line).split()
     except AttributeError:
         # If it didn't work like a file, use it as a string filename
         with utils.smart_open(self.source) as fin:
             for line in fin:
                 yield utils.to_unicode(line).split()
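A sketch of how such a line iterator is usually consumed, assuming it is wrapped in a hypothetical `MySentences` class that sets `self.source`:

from gensim.models import Word2Vec

sentences = MySentences('corpus.txt')  # hypothetical class containing the __iter__ above
model = Word2Vec(sentences, min_count=5, workers=4)
print(model.wv.most_similar('computer', topn=3))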
Example #5
 def __iter__(self):
     """Iterate through the lines in the source."""
     try:
         # Assume it is a file-like object and try treating it as such
         # Things that don't have seek will trigger an exception
         self.source.seek(0)
         for item_no, line in enumerate(self.source):
             yield LabeledSentence(utils.to_unicode(line).split(), ['SENT_%s' % item_no] + self.topics_name )
     except AttributeError:
         # If it didn't work like a file, use it as a string filename
         with utils.smart_open(self.source) as fin:
             for item_no, line in enumerate(fin):
                 yield LabeledSentence(utils.to_unicode(line).split(), ['SENT_%s' % item_no] + self.topics_name )
 def to_array(self):
     self.sentences = []
     for source, prefix in self.sources.items():
         with utils.smart_open(source) as fin:
             for item_no, line in enumerate(fin):
                 self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [source])) # prefix + '_%s' % item_no
     return self.sentences
Example #7
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'], lemmatize=False, deacc=True):
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
Example #8
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the List-of-words format.

        This function is automatically called by `LowCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in List-Of-Words format into %s" % fname)
        truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8('%i\n' % len(corpus)))
            for doc in corpus:
                words = []
                for wordid, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] * int(value))
                offsets.append(fout.tell())
                fout.write(utils.to_utf8('%s\n' % ' '.join(words)))

        if truncated:
            logger.warning("List-of-words format can only save vectors with "
                            "integer elements; %i float entries were truncated to integer value" %
                            truncated)
        return offsets
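As the docstring says, this is normally reached through `serialize`; a minimal sketch with a toy BoW corpus and gensim's temporary-file helper (available in recent gensim versions):

from gensim.corpora import LowCorpus
from gensim.test.utils import get_tmpfile

bow_corpus = [[(0, 1), (1, 2)], [(1, 1)]]  # toy corpus in BoW format
fname = get_tmpfile('corpus.low')
LowCorpus.serialize(fname, bow_corpus, id2word={0: 'human', 1: 'computer'})
loaded = LowCorpus(fname)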
Example #9
    def sort_embeddings(self, vocab_file):
        """Sort embeddings according to word frequency.

        Parameters
        ----------
        vocab_file : str
            Path to file with vocabulary.

        """
        counts = {}
        vocab_size = len(self.vocab)
        prev_syn0 = copy.deepcopy(self.syn0)
        prev_vocab = copy.deepcopy(self.vocab)
        self.index2word = []

        # sort embeddings using frequency sorted vocab file in wordrank
        with utils.smart_open(vocab_file) as fin:
            for index, line in enumerate(fin):
                word, count = utils.to_unicode(line).strip(), vocab_size - index
                # store word with it's count in a dict
                counts[word] = int(count)
                # build new index2word with frequency sorted words
                self.index2word.append(word)
        assert len(self.index2word) == vocab_size, 'mismatch between vocab sizes'

        for word_id, word in enumerate(self.index2word):
            self.syn0[word_id] = prev_syn0[prev_vocab[word].index]
            self.vocab[word].index = word_id
            self.vocab[word].count = counts[word]
def strip_short(sentence, minsize=3):
    '''Split `sentence` on whitespace and re-join only the words whose length is >= `minsize`.'''
    sentence = utils.to_unicode(sentence)
    return " ".join(e for e in sentence.split() if len(e) >= minsize)
Example #11
    def load_from_text(fname):
        """
        Load a previously stored Dictionary from a text file.
        Mirror function to `save_as_text`.
        """
        result = Bidictionary()
        # restore _unidict as gensim dictionary
        result._unidict = corpora.Dictionary.load_from_text(fname + '.index')

        with utils.smart_open(fname) as f:
            for lineno, line in enumerate(f):
                line = utils.to_unicode(line)
                try:
                    bid, fid, sid, docfreq = line[:-1].split('\t')
                    fid_sid = (int(fid), int(sid))
                except Exception:
                    raise ValueError("invalid line in dictionary file %s: %s"
                                     % (fname, line.strip()))
                bid = int(bid)
                if fid_sid in result.fid_sid2bid:
                    raise KeyError('token %s is defined as ID %d and as ID %d' % (fid_sid, bid, result.fid_sid2bid[fid_sid]))
                result.fid_sid2bid[fid_sid] = bid
                result.dfs[bid] = int(docfreq)

        return result
Example #12
 def load_from_text(fname):
     """
     Load a previously stored Dictionary from a text file.
     Mirror function to `save_as_text`.
     """
     result = Dictionary()
     with utils.smart_open(fname) as f:
         for lineno, line in enumerate(f):
             line = utils.to_unicode(line)
             if lineno == 0:
                 if line.strip().isdigit():
                     # Older versions of save_as_text may not write num_docs on first line.
                     result.num_docs = int(line.strip())
                     continue
                 else:
                     logging.warning("Text does not contain num_docs on the first line.")
             try:
                 wordid, word, docfreq = line[:-1].split('\t')
             except Exception:
                 raise ValueError("invalid line in dictionary file %s: %s"
                                  % (fname, line.strip()))
             wordid = int(wordid)
             if word in result.token2id:
                 raise KeyError('token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word]))
             result.token2id[word] = wordid
             result.dfs[wordid] = int(docfreq)
     return result
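A round-trip sketch pairing `load_from_text` with `save_as_text`, using gensim's test helper for a temporary path:

from gensim.corpora import Dictionary
from gensim.test.utils import get_tmpfile

dct = Dictionary([['human', 'computer', 'interface'], ['computer', 'survey']])
tmp_fname = get_tmpfile('dictionary.txt')
dct.save_as_text(tmp_fname)
loaded = Dictionary.load_from_text(tmp_fname)
assert loaded.token2id == dct.token2id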
Example #13
def strip_non_alphanum(s):
    """Remove non-alphabetic characters from `s` using :const:`~gensim.parsing.preprocessing.RE_NONALPHA`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string with alphabetic characters only.

    Notes
    -----
    Word characters - alphanumeric & underscore.

    Examples
    --------
    >>> from gensim.parsing.preprocessing import strip_non_alphanum
    >>> strip_non_alphanum("if-you#can%read$this&then@this#method^works")
    u'if you can read this then this method works'

    """
    s = utils.to_unicode(s)
    return RE_NONALPHA.sub(" ", s)
Example #14
def strip_short(s, minsize=3):
    """Remove words with length lesser than `minsize` from `s`.

    Parameters
    ----------
    s : str
    minsize : int, optional

    Returns
    -------
    str
        Unicode string without short words.

    Examples
    --------
    >>> from gensim.parsing.preprocessing import strip_short
    >>> strip_short("salut les amis du 59")
    u'salut les amis'
    >>>
    >>> strip_short("one two three four five six seven eight nine ten", minsize=5)
    u'three seven eight'

    """
    s = utils.to_unicode(s)
    return " ".join(e for e in s.split() if len(e) >= minsize)
Example #15
    def from_corpus(corpus, id2word=None):
        """Create :class:`~gensim.corpora.dictionary.Dictionary` from an existing corpus.

        Parameters
        ----------
        corpus : iterable of iterable of (int, number)
            Corpus in BoW format.
        id2word : dict of (int, object)
            Mapping id -> word. If None, the mapping `id2word[word_id] = str(word_id)` will be used.

        Notes
        -----
        This can be useful if you only have a term-document BOW matrix (represented by `corpus`), but not the original
        text corpus. This method will scan the term-document count matrix for all word ids that appear in it,
        then construct :class:`~gensim.corpora.dictionary.Dictionary` which maps each `word_id -> id2word[word_id]`.
        `id2word` is an optional dictionary that maps the `word_id` to a token.
        In case `id2word` isn't specified the mapping `id2word[word_id] = str(word_id)` will be used.

        Returns
        -------
        :class:`~gensim.corpora.dictionary.Dictionary`
            Inferred dictionary from corpus.

        Examples
        --------
        >>> from gensim.corpora import Dictionary
        >>>
        >>> corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []]
        >>> dct = Dictionary.from_corpus(corpus)
        >>> len(dct)
        3

        """
        result = Dictionary()
        max_id = -1
        for docno, document in enumerate(corpus):
            if docno % 10000 == 0:
                logger.info("adding document #%i to %s", docno, result)
            result.num_docs += 1
            result.num_nnz += len(document)
            for wordid, word_freq in document:
                max_id = max(wordid, max_id)
                result.num_pos += word_freq
                result.dfs[wordid] = result.dfs.get(wordid, 0) + 1

        if id2word is None:
            # make sure length(result) == get_max_id(corpus) + 1
            result.token2id = {unicode(i): i for i in xrange(max_id + 1)}
        else:
            # id=>word mapping given: simply copy it
            result.token2id = {utils.to_unicode(token): idx for idx, token in iteritems(id2word)}
        for idx in itervalues(result.token2id):
            # make sure all token ids have a valid `dfs` entry
            result.dfs[idx] = result.dfs.get(idx, 0)

        logger.info(
            "built %s from %i documents (total %i corpus positions)",
            result, result.num_docs, result.num_pos
        )
        return result
Example #16
def stem_text(text):
    """
    Return lowercase and (porter-)stemmed version of string `text`.
    """
    text = utils.to_unicode(text)
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.split())
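A quick illustrative call; the exact stems depend on the Porter stemmer bundled with gensim, so no output is asserted here.

from gensim.parsing.preprocessing import stem_text

print(stem_text("While it is quite useful to be able to search a large collection of documents"))
# prints the lower-cased, Porter-stemmed version of the sentence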
Example #17
 def to_array(self):
     for source, prefix in self.sources.items():
         with utils.smart_open(source) as fin:
             for item_no, line in enumerate(fin):
                 self.sentences.append(
                     LabeledSentence(words=utils.to_unicode(line).split(), tags=[prefix + '_%s' % str(item_no)]))
     return self.sentences
Example #18
 def testPathLineSentencesOneFile(self):
     """Does PathLineSentences work with a single file argument?"""
     test_file = os.path.join(datapath('PathLineSentences'), '1.txt')
     with utils.smart_open(test_file) as orig:
         sentences = word2vec.PathLineSentences(test_file)
         for words in sentences:
             self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #19
 def testLineSentenceWorksWithNormalFile(self):
     """Does LineSentence work with a file object argument, rather than filename?"""
     with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
         with utils.smart_open(datapath('head500.noblanks.cor')) as fin:
             sentences = word2vec.LineSentence(fin)
             for words in sentences:
                 self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #20
 def line2doc(self, line):
     parts = utils.to_unicode(line).split()
     if int(parts[0]) != len(parts) - 1:
         raise ValueError("invalid format in %s: %s" % (self.fname, repr(line)))
     doc = [part.rsplit(':', 1) for part in parts[1:]]
     doc = [(int(p1), float(p2)) for p1, p2 in doc]
     return doc
Example #21
    def __getitem__(self, sentence):
        """
        Convert the input tokens `sentence` (=list of unicode strings) into phrase
        tokens (=list of unicode strings, where detected phrases are joined by u'_').

        If `sentence` is an entire corpus (iterable of sentences rather than a single
        sentence), return an iterable that converts each of the corpus' sentences
        into phrases on the fly, one after another.

        Example::

          >>> sentences = Text8Corpus(path_to_corpus)
          >>> bigram = Phrases(sentences, min_count=5, threshold=100)
          >>> for sentence in phrases[sentences]:
          ...     print(u' '.join(sentence))
            he refuted nechaev other anarchists sometimes identified as pacifist anarchists advocated complete
            nonviolence leo_tolstoy

        """
        warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class")
        try:
            is_single = not sentence or isinstance(sentence[0], string_types)
        except:
            is_single = False
        if not is_single:
            # if the input is an entire corpus (rather than a single sentence),
            # return an iterable stream.
            return self._apply(sentence)

        s, new_s = [utils.any2utf8(w) for w in sentence], []
        last_bigram = False
        vocab = self.vocab
        threshold = self.threshold
        delimiter = self.delimiter
        min_count = self.min_count
        for word_a, word_b in zip(s, s[1:]):
            if word_a in vocab and word_b in vocab:
                bigram_word = delimiter.join((word_a, word_b))
                if bigram_word in vocab and not last_bigram:
                    pa = float(vocab[word_a])
                    pb = float(vocab[word_b])
                    pab = float(vocab[bigram_word])
                    score = (pab - min_count) / pa / pb * len(vocab)
                    # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
                    #     bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
                    if score > threshold:
                        new_s.append(bigram_word)
                        last_bigram = True
                        continue

            if not last_bigram:
                new_s.append(word_a)
            last_bigram = False

        if s:  # add last word skipped by previous loop
            last_token = s[-1]
            if not last_bigram:
                new_s.append(last_token)

        return [utils.to_unicode(w) for w in new_s]
Example #22
    def __getitem__(self, sentence):
        """
        Convert the input tokens `sentence` (=list of unicode strings) into phrase
        tokens (=list of unicode strings, where detected phrases are joined by u'_',
        or by another configured delimiter character).

        If `sentence` is an entire corpus (iterable of sentences rather than a single
        sentence), return an iterable that converts each of the corpus' sentences
        into phrases on the fly, one after another.

        """
        is_single, sentence = _is_single(sentence)
        if not is_single:
            # if the input is an entire corpus (rather than a single sentence),
            # return an iterable stream.
            return self._apply(sentence)

        delimiter = self.delimiter
        bigrams = self.analyze_sentence(
            sentence,
            threshold=self.threshold,
            common_terms=self.common_terms,
            scorer=None)  # we will use our score_item function redefinition
        new_s = []
        for words, score in bigrams:
            if score is not None:
                words = delimiter.join(words)
            new_s.append(words)
        return [utils.to_unicode(w) for w in new_s]
Example #23
    def __init__(self, fname, fname_vocab=None):
        """
        Initialize the corpus from a file.

        `fname_vocab` is the file with vocabulary; if not specified, it defaults to
        `fname.vocab`.
        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s" % fname)

        if fname_vocab is None:
            fname_base, _ = path.splitext(fname)
            fname_dir = path.dirname(fname)
            for fname_vocab in [
                        fname + '.vocab',
                        fname + '/vocab.txt',
                        fname_base + '.vocab',
                        fname_dir + '/vocab.txt',
                        ]:
                if path.exists(fname_vocab):
                    break
            else:
                raise IOError('BleiCorpus: could not find vocabulary file')

        self.fname = fname
        with utils.smart_open(fname_vocab) as fin:
            words = [utils.to_unicode(word).rstrip() for word in fin]
        self.id2word = dict(enumerate(words))
        self.length = None
Example #24
    def tokenize(self, document):
        """
        Break text into sentences and each sentence into a list of single words
        Ignore any token that falls into the stopwords set.
        """
        # use sentence tokenizer sent_tokenize from nltk package
        sentences = sent_tokenize(utils.to_unicode(document.lower()))

        # create stemmer of class SnowballStemmer
        stemmer = SnowballStemmer("english")

        for sentence in sentences:
            words = list(utils.tokenize(self.cleanse_text(sentence)))

            if self.remove_stopwords:
                words = [word for word in words if word not in self.en_stopwords]

            if self.stemming:
                words = [stemmer.stem(t) for t in words]

            yield words
Example #25
    def line2doc(self, line):
        """Covert line into document in BoW format.

        Parameters
        ----------
        line : str
            Line from input file.

        Returns
        -------
        list of (int, int)
            Document in BoW format (+"document_id" and "lang" if metadata=True).

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.test.utils import datapath
            >>> from gensim.corpora import MalletCorpus
            >>>
            >>> corpus = MalletCorpus(datapath("testcorpus.mallet"))
            >>> corpus.line2doc("en computer human interface")
            [(3, 1), (4, 1)]

        """
        split_line = utils.to_unicode(line).strip().split(None, 2)
        docid, doclang = split_line[0], split_line[1]
        words = split_line[2] if len(split_line) >= 3 else ''

        doc = super(MalletCorpus, self).line2doc(words)

        if self.metadata:
            return doc, (docid, doclang)
        else:
            return doc
 def to_array(self):
     self.sentences = []
     for source, prefix in self.sources.items():
         with utils.smart_open(source) as fin:
             for item_no, line in enumerate(fin):
                 self.sentences.append(TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
     return self.sentences
Example #27
def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
    """Tokenize a piece of text from wikipedia.

    Set `token_min_len`, `token_max_len` as character length (not bytes!) thresholds for individual tokens.

    Parameters
    ----------
    content : str
        String without markup (see :func:`~gensim.corpora.wikicorpus.filter_wiki`).
    token_min_len : int
        Minimal token length.
    token_max_len : int
        Maximal token length.
    lower : bool
         If True - convert `content` to lower case.

    Returns
    -------
    list of str
        List of tokens from `content`.

    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [
        utils.to_unicode(token) for token in utils.tokenize(content, lower=lower, errors='ignore')
        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
    ]
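An illustrative call of the function above (also exposed as `gensim.corpora.wikicorpus.tokenize`), assuming the default length thresholds; tokens shorter than `token_min_len`, longer than `token_max_len`, or starting with '_' are dropped.

from gensim.corpora.wikicorpus import tokenize

print(tokenize("Anarchism is a political philosophy that advocates self-governed societies"))
# roughly: ['anarchism', 'is', 'political', 'philosophy', 'that', 'advocates', 'self', 'governed', 'societies']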
Example #28
    def load_word_topics(self):
        """Load words X topics matrix from :meth:`gensim.models.wrappers.ldamallet.LdaMallet.fstate` file.

        Returns
        -------
        numpy.ndarray
            Matrix words X topics.

        """
        logger.info("loading assigned topics from %s", self.fstate())
        word_topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float64)
        if hasattr(self.id2word, 'token2id'):
            word2id = self.id2word.token2id
        else:
            word2id = revdict(self.id2word)

        with utils.smart_open(self.fstate()) as fin:
            _ = next(fin)  # header
            self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
            assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
            _ = next(fin)  # noqa:F841 beta
            for lineno, line in enumerate(fin):
                line = utils.to_unicode(line)
                doc, source, pos, typeindex, token, topic = line.split(" ")
                if token not in word2id:
                    continue
                tokenid = word2id[token]
                word_topics[int(topic), tokenid] += 1.0
        return word_topics
    def to_array(self):
        self.sentences = []
        for email in self.emails:
#             taggedDoc = TaggedDocument(utils.to_unicode(email.subject + ' ' + email.one_line).split(), ['email_%s' % email.id])
            taggedDoc = TaggedDocument(utils.to_unicode(email.subject).split(), ['email_%s' % email.id])
            self.sentences.append(taggedDoc)
        return self.sentences
Example #30
    def test_switch_id2word(self):
        fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
        corpus = self.corpus_class(fname)
        if hasattr(corpus, 'id2word'):
            firstdoc = next(iter(corpus))
            testdoc = set((to_unicode(corpus.id2word[x]), y) for x, y in firstdoc)

            self.assertEqual(testdoc, {('computer', 1), ('human', 1), ('interface', 1)})

            d = corpus.id2word
            d[0], d[1] = d[1], d[0]
            corpus.id2word = d

            firstdoc2 = next(iter(corpus))
            testdoc2 = set((to_unicode(corpus.id2word[x]), y) for x, y in firstdoc2)
            self.assertEqual(testdoc2, {('computer', 1), ('human', 1), ('interface', 1)})
Example #31
def keywords(text,
             ratio=0.2,
             words=None,
             split=False,
             scores=False,
             pos_filter=('NN', 'JJ'),
             lemmatize=False,
             deacc=True):
    """Get most ranked words of provided text and/or its combinations.

    Parameters
    ----------
    text : str
        Input text.
    ratio : float, optional
        If no "words" option is selected, the number of sentences is reduced by the provided ratio,
        else, the ratio is ignored.
    words : int, optional
        Number of returned words.
    split : bool, optional
        If True, return the keywords as a list instead of a single newline-joined string.
    scores : bool, optional
        If True, return the score along with each keyword.
    pos_filter : tuple, optional
        Part of speech filters.
    lemmatize : bool, optional
        If True - lemmatize words.
    deacc : bool, optional
        If True - remove accentuation.

    Returns
    -------
    result: list of (str, float)
        If `scores`, keywords with scores **OR**
    result: list of str
        If `split`, keywords only **OR**
    result: str
        Keywords, joined by newlines.

    """
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    if not graph.edges():
        return _format_results([], [], split, scores)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio,
                                       words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
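A hedged usage sketch of this function as shipped in `gensim.summarization` (present in gensim 3.x, removed in 4.0); the sample text mirrors the kind of paragraph the docstring has in mind, and no particular output is guaranteed.

from gensim.summarization import keywords

text = (
    "Challenges in natural language processing frequently involve speech recognition, "
    "natural language understanding, natural language generation (frequently from formal, "
    "machine-readable logical forms), connecting language and machine perception, dialog "
    "systems, or some combination thereof."
)
print(keywords(text, words=3, split=True))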
Example #32
def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, threshold=0.0):
    """Extract keywords from text using the Montemurro and Zanette entropy algorithm. [1]_

    Parameters
    ----------
    text: str
        Document for summarization.
    blocksize: int, optional
        Size of blocks to use in analysis.
    scores: bool, optional
        Whether to return score with keywords.
    split: bool, optional
        Whether to return results as list.
    weighted: bool, optional
        Whether to weight scores by word frequency.
        False can be useful for shorter texts, and allows automatic thresholding.
    threshold: float or 'auto', optional
        Minimum score for returned keywords,  'auto' calculates the threshold as n_blocks / (n_blocks + 1.0) + 1e-8,
        use 'auto' with `weighted=False`.

    Returns
    -------
    results: str
        newline separated keywords if `split` == False **OR**
    results: list(str)
        list of keywords if `scores` == False **OR**
    results: list(tuple(str, float))
        list of (keyword, score) tuples if `scores` == True

    Results are returned in descending order of score regardless of the format.

    Note
    ----
    This algorithm looks for keywords that contribute to the structure of the
    text on scales of `blocksize` words or larger. It is suitable for extracting
    keywords representing the major themes of long texts.

    References
    ----------
    .. [1] Marcello A Montemurro, Damian Zanette, "Towards the quantification of the semantic information encoded in
           written language". Advances in Complex Systems, Volume 13, Issue 2 (2010), pp. 135-153,
           DOI: 10.1142/S0219525910002530, https://arxiv.org/abs/0907.1558

    """
    text = to_unicode(text)
    words = [word for word in _tokenize_by_word(text)]
    vocab = sorted(set(words))
    word_counts = numpy.array(
        [
            [words[i:i + blocksize].count(word) for word in vocab]
            for i in range(0, len(words), blocksize)
        ]
    ).astype('d')
    n_blocks = word_counts.shape[0]
    totals = word_counts.sum(axis=0)
    n_words = totals.sum()
    p = word_counts / totals
    log_p = numpy.log2(p)
    h = numpy.nan_to_num(p * log_p).sum(axis=0)
    analytic = __analytic_entropy(blocksize, n_blocks, n_words)
    h += analytic(totals).astype('d')
    if weighted:
        h *= totals / n_words
    if threshold == 'auto':
        threshold = n_blocks / (n_blocks + 1.0) + 1.0e-8
    weights = [(word, score) for (word, score) in zip(vocab, h) if score > threshold]
    weights.sort(key=lambda x: -x[1])
    result = weights if scores else [word for (word, score) in weights]
    if not (scores or split):
        result = '\n'.join(result)
    return result
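A sketch of calling it with the automatic thresholding the docstring recommends (`weighted=False` plus `threshold='auto'`); the input file is hypothetical, and the function lives in `gensim.summarization.mz_entropy` only in pre-4.0 gensim.

from gensim.summarization.mz_entropy import mz_keywords

with open('long_document.txt') as f:  # hypothetical long text file
    text = f.read()
for word, score in mz_keywords(text, blocksize=1024, scores=True, weighted=False, threshold='auto')[:10]:
    print(word, score)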
Example #33
 def testLineSentenceWorksWithCompressedFile(self):
     """Does LineSentence work with a compressed file object argument?"""
     with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
         sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2')))
         for words in sentences:
             self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #34
 def testLineSentenceWorksWithFilename(self):
     """Does LineSentence work with a filename argument?"""
     with utils.smart_open(datapath('lee_background.cor')) as orig:
         sentences = word2vec.LineSentence(datapath('lee_background.cor'))
         for words in sentences:
             self.assertEqual(words, utils.to_unicode(orig.readline()).split())
def custom_tokenizer(content, token_min_len=2, token_max_len=15, lower=True):
    return [
        to_unicode(token.lower()) if lower else to_unicode(token) for token in content.split()
        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
    ]
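Such a tokenizer is presumably meant to be plugged into `WikiCorpus`; in gensim versions that expose a `tokenizer_func` argument, that would look roughly like the sketch below (the dump path is a placeholder).

from gensim.corpora import WikiCorpus

wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2',  # hypothetical Wikipedia dump
                  tokenizer_func=custom_tokenizer)
for tokens in wiki.get_texts():
    pass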
Example #36
    def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar):
        """
        Compute accuracy of the model. `questions` is a filename where lines are
        4-tuples of words, split into sections by ": SECTION NAME" lines.
        See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example.

        The accuracy is reported (=printed to log and returned as a list) for each
        section separately, plus there's one aggregate summary at the end.

        Use `restrict_vocab` to ignore all questions containing a word whose frequency
        is not in the top-N most frequent words (default top 30,000).

        This method corresponds to the `compute-accuracy` script of the original C word2vec.

        """
        ok_vocab = dict(sorted(iteritems(self.vocab),
                               key=lambda item: -item[1].count)[:restrict_vocab])
        ok_index = set(v.index for v in itervalues(ok_vocab))

        sections, section = [], None
        for line_no, line in enumerate(utils.smart_open(questions)):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    self.log_accuracy(section)
                section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
            else:
                if not section:
                    raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
                try:
                    a, b, c, expected = [word.lower() for word in line.split()]  # TODO assumes vocabulary preprocessing uses lowercase, too...
                except:
                    logger.info("skipping invalid line #%i in %s" % (line_no, questions))
                    continue
                if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                    logger.debug("skipping line #%i with OOV words: %s" % (line_no, line.strip()))
                    continue

                ignore = set(self.vocab[v].index for v in [a, b, c])  # indexes of words to ignore
                predicted = None
                # find the most likely prediction, ignoring OOV words and input words
                for index in argsort(most_similar(self, positive=[b, c], negative=[a], topn=False))[::-1]:
                    if index in ok_index and index not in ignore:
                        predicted = self.index2word[index]
                        if predicted != expected:
                            logger.debug("%s: expected %s, predicted %s" % (line.strip(), expected, predicted))
                        break
                if predicted == expected:
                    section['correct'].append((a, b, c, expected))
                else:
                    section['incorrect'].append((a, b, c, expected))
        if section:
            # store the last section, too
            sections.append(section)
            self.log_accuracy(section)

        total = {
            'section': 'total',
            'correct': sum((s['correct'] for s in sections), []),
            'incorrect': sum((s['incorrect'] for s in sections), []),
        }
        self.log_accuracy(total)
        sections.append(total)
        return sections
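A usage sketch against older gensim releases that still expose `accuracy` on the model; the toy corpus below will simply skip every question as out-of-vocabulary, so a real evaluation needs a model trained on a large corpus and a local copy of the analogy test set (path hypothetical).

from gensim.models import Word2Vec
from gensim.test.utils import common_texts

model = Word2Vec(common_texts, min_count=1)  # toy model, for illustration only
sections = model.accuracy('questions-words.txt')  # hypothetical path to the analogy test set
total = sections[-1]
print(len(total['correct']), len(total['incorrect']))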
Example #37
def remove_stopwords(s):
    s = utils.to_unicode(s)
    return " ".join(w for w in s.split() if w not in STOPWORDS)
Example #38
def strip_multiple_whitespaces(s):
    s = utils.to_unicode(s)
    return RE_WHITESPACE.sub(" ", s)
Example #39
def split_alphanum(s):
    s = utils.to_unicode(s)
    s = RE_AL_NUM.sub(r"\1 \2", s)
    return RE_NUM_AL.sub(r"\1 \2", s)
Example #40
def strip_non_alphanum(s):
    s = utils.to_unicode(s)
    return RE_NONALPHA.sub(" ", s)
Example #41
def strip_numeric(s):
    s = utils.to_unicode(s)
    return RE_NUMERIC.sub("", s)
Example #42
def add_additional_papers():
    """Add additional papers for which full text from Arxiv is not present.

    Care is taken that, while adding references to THESE papers, the references are in the set of
    papers stored in the allmagpaperids set (otherwise there would be additional papers in the
    reference part of the concatenated contexts which are not among the files in the text).
    ALSO NOTE that allmagpaperids contains all papers which either cite or are cited so far, while
    inarxiv_papers_set contains the set of papers which are in arxiv (citing).
    A set difference (allmagpaperids - inarxiv_papers_set) gives the set of mag_ids for which we
    get additional text."""
    with open('Pickles/inarxiv_papers_set.pickle', 'wb') as picc:
        pickle.dump(inarxiv_papers_set, picc)
    with open('Pickles/allmagpapers_en_magcontexts.pickle', 'wb') as picc2:
        pickle.dump(allmagpaperids, picc2)
    additional_mag_ids = allmagpaperids - inarxiv_papers_set
    for paperid in tqdm(additional_mag_ids):
        pcur.execute(magonly_query, (paperid, paperid))
        # Get paperid, contexts, abstract, title, refids of current paper id
        for row in pcur:
            # row is a dict with keys:
            # dict_keys(['paperid', 'papertitle', 'abstract', 'contexts', 'referenceids'])
            paperid = row.get('paperid')
            # Get all contexts and reference ids (delimiters set in the pSQL query)
            contexts = row.get('contexts').replace('\n', ' ')
            referenceids = row.get('referenceids')
            title = clean_text(row.get('papertitle'))
            abstract = clean_text(row.get('abstract'))
            print(title)
            # Get a single string for all the contexts
            if contexts is not None and referenceids is not None:
                contexts = contexts.split(' ||--|| ')
                referenceids = referenceids.split(',')
                contexts_with_refs = []
                # Go through context, refid pairs, one at a time
                for context, referenceid in zip(contexts, referenceids):
                    # VERY VERY IMPORTANT: check if the referenceid is not present in the allmagpaperids set,
                    # IGNORE IT! DESIGN DECISION: the other choice is to have a LOT of passes. 
                    if referenceid in allmagpaperids:
                        contextlist = clean_text(context).split()
                        # Insert the reference id as the MIDDLE word of the context
                        # NOTE, when multiple reference ids are present, only 1 is inserted. Mag issue.
                        # In the eg. nips file, it's like this: this paper uses our previous work on weight space 
                        # probabilities =-=nips05_0451-=- =-=nips05_0507-=-. 
                        index_to_insert = len(contextlist) // 2
                        value_to_insert = docid_prefix + referenceid + docid_suffix
                        # Add the ref id with the prefix and suffix
                        contextlist.insert(index_to_insert, value_to_insert)
                        # Insert the context with ref id into the contexts_with_refs list
                        contexts_with_refs.append(' '.join(contextlist))
                    # else: do nothing, next iteration
                # After all the contexts are iterated over, join them into a single string.
                contexts_concatenated = ' '.join(contexts_with_refs)
            else:
                contexts_concatenated = ''
                # Do not write these to file????? OR 
            # Concatenate the paperid, title, abstract and the contexts together.
            content = "{} {} {} {}\n".format(paperid, title, abstract, contexts_concatenated)
            content = to_unicode(content)
            # Get rid of empty files
            #parts = content.split()
            #parts = [ word for word in parts if not all(letter in string.punctuation for letter in word)]
            #print(parts)
            #content = ' '.join(parts)
            if content.strip() != '':
                fulltext_file.write(content)
                print("Written file for {}".format(paperid))
Example #43
def strip_tags(s):
    s = utils.to_unicode(s)
    return RE_TAGS.sub("", s)
Example #44
def strip_punctuation(s):
    s = utils.to_unicode(s)
    return RE_PUNCT.sub(" ", s)
Example #45
 def __iter__(self):
     for source, prefix in self.sources.items():
         with utils.smart_open(source) as fin:
             for item_no, line in enumerate(fin):
                 #item, line = line.split(" : ")
                 yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
Example #46
def preprocess_string(s, filters=DEFAULT_FILTERS):
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s.split()
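In gensim, the default filter chain combines lowercasing with the helpers shown in the surrounding examples (strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text); a quick illustrative call, without asserting the exact output:

from gensim.parsing.preprocessing import preprocess_string

print(preprocess_string("<i>Hello</i> <b>World</b> 9!"))
# tags, punctuation, numbers and stopwords stripped, remaining words stemmed, e.g. ['hello', 'world']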
Example #47
def get_raw_text_and_links_from_markup(raw):
    if raw is None:
        raw = ''
    text = utils.to_unicode(raw, 'utf-8', errors='ignore')
    text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
    return __remove_markup(text)
Example #48
 def constructLabeledSentences(data):
     sentences=[]
     for index, row in data.iteritems():
         sentences.append(LabeledSentence(utils.to_unicode(row).split(), ['Text' + '_%s' % str(index)]))
     return sentences
Example #49
    def accuracy(self,
                 questions,
                 restrict_vocab=30000,
                 most_similar=most_similar,
                 case_insensitive=True):
        """
        Compute accuracy of the model. `questions` is a filename where lines are
        4-tuples of words, split into sections by ": SECTION NAME" lines.
        See questions-words.txt in https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip for an example.

        The accuracy is reported (=printed to log and returned as a list) for each
        section separately, plus there's one aggregate summary at the end.

        Use `restrict_vocab` to ignore all questions containing a word not in the first `restrict_vocab`
        words (default 30,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
        In case `case_insensitive` is True, the first `restrict_vocab` words are taken first, and then
        case normalization is performed.

        Use `case_insensitive` to convert all words in questions and vocab to their uppercase form before
        evaluating the accuracy (default True). Useful in case of case-mismatch between training tokens
        and question words. In case of multiple case variants of a single word, the vector for the first
        occurrence (also the most frequent if vocabulary is sorted) is taken.

        This method corresponds to the `compute-accuracy` script of the original C word2vec.

        """
        ok_vocab = [(w, self.vocab[w])
                    for w in self.index2word[:restrict_vocab]]
        ok_vocab = dict((w.upper(), v) for w, v in reversed(
            ok_vocab)) if case_insensitive else dict(ok_vocab)

        sections, section = [], None
        for line_no, line in enumerate(utils.smart_open(questions)):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    self.log_accuracy(section)
                section = {
                    'section': line.lstrip(': ').strip(),
                    'correct': [],
                    'incorrect': []
                }
            else:
                if not section:
                    raise ValueError(
                        "missing section header before line #%i in %s" %
                        (line_no, questions))
                try:
                    if case_insensitive:
                        a, b, c, expected = [
                            word.upper() for word in line.split()
                        ]
                    else:
                        a, b, c, expected = [word for word in line.split()]
                except:
                    logger.info("skipping invalid line #%i in %s" %
                                (line_no, questions))
                    continue
                if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                    logger.debug("skipping line #%i with OOV words: %s" %
                                 (line_no, line.strip()))
                    continue

                original_vocab = self.vocab
                self.vocab = ok_vocab
                ignore = set([a, b, c])  # input words to be ignored
                predicted = None
                # find the most likely prediction, ignoring OOV words and input words
                sims = most_similar(self,
                                    positive=[b, c],
                                    negative=[a],
                                    topn=False,
                                    restrict_vocab=restrict_vocab)
                self.vocab = original_vocab
                for index in matutils.argsort(sims, reverse=True):
                    predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index]
                    if predicted in ok_vocab and predicted not in ignore:
                        if predicted != expected:
                            logger.debug("%s: expected %s, predicted %s",
                                         line.strip(), expected, predicted)
                        break
                if predicted == expected:
                    section['correct'].append((a, b, c, expected))
                else:
                    section['incorrect'].append((a, b, c, expected))
        if section:
            # store the last section, too
            sections.append(section)
            self.log_accuracy(section)

        total = {
            'section': 'total',
            'correct': sum((s['correct'] for s in sections), []),
            'incorrect': sum((s['incorrect'] for s in sections), []),
        }
        self.log_accuracy(total)
        sections.append(total)
        return sections
Example #50
    def load_word2vec_format(cls, fname, fvocab=None, binary=False, norm_only=True):
        """
        Load the input-hidden weight matrix from the original C word2vec-tool format.

        Note that the information stored in the file is incomplete (the binary tree is missing),
        so while you can query for word similarity etc., you cannot continue training
        with a model loaded this way.

        `binary` is a boolean indicating whether the data is in binary word2vec format.
        `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
        Word counts are read from `fvocab` filename, if set (this is the file generated
        by `-save-vocab` flag of the original C tool).
        """
        counts = None
        if fvocab is not None:
            logger.info("loading word counts from %s" % (fvocab))
            counts = {}
            with utils.smart_open(fvocab) as fin:
                for line in fin:
                    word, count = utils.to_unicode(line).strip().split()
                    counts[word] = int(count)

        logger.info("loading projection weights from %s" % (fname))
        with utils.smart_open(fname) as fin:
            header = utils.to_unicode(fin.readline())
            vocab_size, layer1_size = map(int, header.split())  # throws for invalid file format
            result = Word2Vec(size=layer1_size)
            result.syn0 = zeros((vocab_size, layer1_size), dtype=REAL)
            if binary:
                binary_len = dtype(REAL).itemsize * layer1_size
                for line_no in xrange(vocab_size):
                    # mixed text and binary: read text first, then binary
                    word = []
                    while True:
                        ch = fin.read(1)
                        if ch == b' ':
                            break
                        if ch != b'\n':  # ignore newlines in front of words (some binary files have newline, some don't)
                            word.append(ch)
                    word = utils.to_unicode(b''.join(word))
                    if counts is None:
                        result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                    elif word in counts:
                        result.vocab[word] = Vocab(index=line_no, count=counts[word])
                    else:
                        logger.warning("vocabulary file is incomplete")
                        result.vocab[word] = Vocab(index=line_no, count=None)
                    result.index2word.append(word)
                    result.syn0[line_no] = fromstring(fin.read(binary_len), dtype=REAL)
            else:
                for line_no, line in enumerate(fin):
                    parts = utils.to_unicode(line).split()
                    if len(parts) != layer1_size + 1:
                        raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                    word, weights = parts[0], map(REAL, parts[1:])
                    if counts is None:
                        result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                    elif word in counts:
                        result.vocab[word] = Vocab(index=line_no, count=counts[word])
                    else:
                        logger.warning("vocabulary file is incomplete")
                        result.vocab[word] = Vocab(index=line_no, count=None)
                    result.index2word.append(word)
                    result.syn0[line_no] = weights
        logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
        result.init_sims(norm_only)
        return result
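A usage sketch against the older gensim API shown here; current gensim moves this to `KeyedVectors.load_word2vec_format`, and the vector file path is a placeholder.

from gensim.models import Word2Vec

model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
print(model['computer'][:5])  # raw vector for a word
print(model.most_similar('computer', topn=3))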
Example #51
def get_words(text):
    tokenizer = nltk.tokenize.TweetTokenizer()
    words = tokenizer.tokenize(utils.to_unicode(text))
    return words
Example #52
    def evaluate_word_pairs(self,
                            pairs,
                            delimiter='\t',
                            restrict_vocab=300000,
                            case_insensitive=True,
                            dummy4unknown=False):
        """
        Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where
        lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter`.
        An example dataset is included in Gensim (test/test_data/wordsim353.tsv). More datasets can be found at
        http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html or https://www.cl.cam.ac.uk/~fh295/simlex.html.

        The model is evaluated using Pearson correlation coefficient and Spearman rank-order correlation coefficient
        between the similarities from the dataset and the similarities produced by the model itself.
        The results are printed to log and returned as a triple (pearson, spearman, ratio of pairs with unknown words).

        Use `restrict_vocab` to ignore all word pairs containing a word not in the first `restrict_vocab`
        words (default 300,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
        If `case_insensitive` is True, the first `restrict_vocab` words are taken, and then case normalization
        is performed.

        Use `case_insensitive` to convert all words in the pairs and vocab to their uppercase form before
        evaluating the model (default True). Useful when you expect case-mismatch between training tokens
        and words pairs in the dataset. If there are multiple case variants of a single word, the vector for the first
        occurrence (also the most frequent if vocabulary is sorted) is taken.

        Use `dummy4unknown=True` to produce zero-valued similarities for pairs with out-of-vocabulary words.
        Otherwise (default False), these pairs are skipped entirely.
        """
        ok_vocab = [(w, self.vocab[w])
                    for w in self.index2word[:restrict_vocab]]
        ok_vocab = dict((w.upper(), v) for w, v in reversed(
            ok_vocab)) if case_insensitive else dict(ok_vocab)

        similarity_gold = []
        similarity_model = []
        oov = 0

        original_vocab = self.vocab
        self.vocab = ok_vocab

        for line_no, line in enumerate(utils.smart_open(pairs)):
            line = utils.to_unicode(line)
            if line.startswith('#'):
                # May be a comment
                continue
            else:
                try:
                    if case_insensitive:
                        a, b, sim = [
                            word.upper() for word in line.split(delimiter)
                        ]
                    else:
                        a, b, sim = [word for word in line.split(delimiter)]
                    sim = float(sim)
                except:
                    logger.info('skipping invalid line #%d in %s', line_no,
                                pairs)
                    continue
                if a not in ok_vocab or b not in ok_vocab:
                    oov += 1
                    if dummy4unknown:
                        similarity_model.append(0.0)
                        similarity_gold.append(sim)
                        continue
                    else:
                        logger.debug('skipping line #%d with OOV words: %s',
                                     line_no, line.strip())
                        continue
                similarity_gold.append(sim)  # Similarity from the dataset
                similarity_model.append(self.similarity(
                    a, b))  # Similarity from the model
        self.vocab = original_vocab
        spearman = stats.spearmanr(similarity_gold, similarity_model)
        pearson = stats.pearsonr(similarity_gold, similarity_model)
        if dummy4unknown:
            oov_ratio = float(oov) / len(similarity_gold) * 100
        else:
            oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100

        logger.debug(
            'Pearson correlation coefficient against %s: %f with p-value %f',
            pairs, pearson[0], pearson[1])
        logger.debug(
            'Spearman rank-order correlation coefficient against %s: %f with p-value %f',
            pairs, spearman[0], spearman[1])
        logger.debug('Pairs with unknown words: %d', oov)
        self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs)
        return pearson, spearman, oov_ratio
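
For orientation, a minimal usage sketch of the evaluation method above, assuming `model` is an already trained gensim KeyedVectors instance and using the bundled wordsim353 file mentioned in the docstring:

from gensim.test.utils import datapath

# `model` is assumed to be a trained KeyedVectors (or Word2Vec .wv) instance; it is not defined in this snippet.
pearson, spearman, oov_ratio = model.evaluate_word_pairs(datapath('wordsim353.tsv'))
print('Pearson r = %.4f (p = %.4g)' % pearson)      # pearson is a (statistic, p-value) pair
print('Spearman rho = %.4f (p = %.4g)' % spearman)  # spearman unpacks the same way
print('%.1f%% of pairs contained out-of-vocabulary words' % oov_ratio)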
Exemplo n.º 53
0
    def parseInstanceFromSentence(self, sentence_str):
        words_str = utils.to_unicode(sentence_str).split(' ')
        return array([self.getWordEmbeddingFromString(word_str) for word_str in words_str]).T
Exemplo n.º 54
0
    def __getitem__(self, sentence):
        """Convert the input tokens `sentence` into tokens where detected bigrams are joined by a selected delimiter.

        If `sentence` is an entire corpus (iterable of sentences rather than a single
        sentence), return an iterable that converts each of the corpus' sentences
        into phrases on the fly, one after another.

        Parameters
        ----------
        sentence : {list of str, iterable of list of str}
            Sentence or text corpus.

        Returns
        -------
        {list of str, :class:`gensim.interfaces.TransformedCorpus`}
            `sentence` with detected phrase bigrams merged together, or a streamed corpus of such sentences
            if the input was a corpus.

        Examples
        --------
        >>> from gensim.test.utils import datapath
        >>> from gensim.models.word2vec import Text8Corpus
        >>> from gensim.models.phrases import Phrases, Phraser
        >>>
        >>> # Create the corpus
        >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
        >>>
        >>> # Train the detector:
        >>> phrases = Phrases(sentences, min_count=1, threshold=1)
        >>> # Input is a list of unicode strings:
        >>> sent = [u'trees', u'graph', u'minors']
        >>> # Both tokens appear in the corpus at least twice, and the phrase score is higher than threshold = 1:
        >>> print(phrases[sent])
        [u'trees_graph', u'minors']
        >>>
        >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
        >>> phrases = Phrases(sentences, min_count=1, threshold=1)
        >>> phraser = Phraser(phrases)  # for speedup
        >>>
        >>> sent = [[u'trees', u'graph', u'minors'], [u'graph', u'minors']]
        >>> for phrase in phraser[sent]:
        ...     pass

        """
        warnings.warn(
            "For a faster implementation, use the gensim.models.phrases.Phraser class"
        )

        delimiter = self.delimiter  # delimiter used for lookup

        is_single, sentence = _is_single(sentence)
        if not is_single:
            # if the input is an entire corpus (rather than a single sentence),
            # return an iterable stream.
            return self._apply(sentence)

        bigrams = self.analyze_sentence(
            sentence,
            threshold=self.threshold,
            common_terms=self.common_terms,
            scorer=ft.partial(
                self.scoring,
                len_vocab=float(len(self.vocab)),
                min_count=float(self.min_count),
                corpus_word_count=float(self.corpus_word_count),
            ),
        )
        new_s = []
        for words, score in bigrams:
            if score is not None:
                words = delimiter.join(words)
            new_s.append(words)

        return [utils.to_unicode(w) for w in new_s]
Exemplo n.º 55
0
def _load_word2vec_format(cls,
                          fname,
                          fvocab=None,
                          binary=False,
                          encoding='utf8',
                          unicode_errors='strict',
                          limit=None,
                          datatype=REAL):
    """Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    Parameters
    ----------
    fname : str
        The file path to the saved word2vec-format file.
    fvocab : str, optional
        File path to the vocabulary. Word counts are read from `fvocab`, if set
        (this is the file generated by the `-save-vocab` flag of the original C tool).
    binary : bool, optional
        If True, the data is assumed to be in the binary word2vec format.
    encoding : str, optional
        If you trained the C model using non-utf8 encoding for words, specify that encoding in `encoding`.
    unicode_errors : str, optional
        default 'strict', is a string suitable to be passed as the `errors`
        argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
        file may include word tokens truncated in the middle of a multibyte unicode character
        (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.
    limit : int, optional
        Sets a maximum number of word-vectors to read from the file. The default,
        None, means read all.
    datatype : type, optional
        (Experimental) Can coerce dimensions to a non-default float type (such as `np.float16`) to save memory.
        Such types may result in much slower bulk operations or incompatibility with optimized routines.

    Returns
    -------
    object
        Returns the loaded model as an instance of :class:`cls`.

    """
    from gensim.models.keyedvectors import Vocab
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s", fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s", fname)
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls(vector_size)
        result.vector_size = vector_size
        result.vectors = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            word_id = len(result.vocab)
            if word in result.vocab:
                logger.warning(
                    "duplicate word '%s' in %s, ignoring all but first", word,
                    fname)
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id,
                                           count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                logger.warning(
                    "vocabulary file is incomplete: '%s' is missing", word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.vectors[word_id] = weights
            result.index2word.append(word)

        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in range(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError(
                            "unexpected end of input; is count incorrect or file otherwise damaged?"
                        )
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have them)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word),
                                        encoding=encoding,
                                        errors=unicode_errors)
                with utils.ignore_deprecation_warning():
                    # TODO use frombuffer or something similar
                    weights = fromstring(fin.read(binary_len),
                                         dtype=REAL).astype(datatype)
                add_word(word, weights)
        else:
            for line_no in range(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError(
                        "unexpected end of input; is count incorrect or file otherwise damaged?"
                    )
                parts = utils.to_unicode(line.rstrip(),
                                         encoding=encoding,
                                         errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError(
                        "invalid vector on line %s (is this really the text format?)"
                        % line_no)
                word, weights = parts[0], [datatype(x) for x in parts[1:]]
                add_word(word, weights)
    if result.vectors.shape[0] != len(result.vocab):
        logger.info(
            "duplicate words detected, shrinking matrix size from %i to %i",
            result.vectors.shape[0], len(result.vocab))
        result.vectors = ascontiguousarray(result.vectors[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.vectors.shape

    logger.info("loaded %s matrix from %s", result.vectors.shape, fname)
    return result
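
As a usage illustration (not part of the original snippet), the public KeyedVectors.load_word2vec_format wrapper around a loader like the one above can be called as follows; the file path and lookup word are placeholders:

from gensim.models import KeyedVectors

# 'vectors.bin' is a hypothetical path to a file in the original C word2vec binary format.
kv = KeyedVectors.load_word2vec_format('vectors.bin', binary=True, limit=500000)
print(kv.vector_size)  # dimensionality read from the file header
print(kv['king'][:5])  # first five components of one vector, assuming 'king' is in the vocabulary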
Exemplo n.º 56
0
    def __iter__(self):
        for source, prefix in self.sources.items():
            with open(path.join(mypath + '/tutorial', source), 'r', encoding="UTF-8") as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
Exemplo n.º 57
0
punctuation = u''':!),.:;?]}¢'"、。〉》」』】〕〗〞︰︱︳﹐、﹒﹔﹕﹖﹗﹚﹜﹞!),.:;?|}︴︶︸︺︼︾﹀﹂﹄﹏、~¢々‖•·ˇˉ―--′’”([{£¥'"‵〈《「『【〔〖([{£¥〝︵︷︹︻︽︿﹁﹃﹙﹛﹝({“‘-—_…'''
RE_PUNCT = re.compile(r'([%s])+' % re.escape(punctuation + string.punctuation),
                      re.UNICODE)
RE_TAGS = re.compile(r"<([^>]+)>", re.UNICODE)
RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
RE_NONALPHA = re.compile(r"\W", re.UNICODE)
RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE)
RE_NONSENSE = re.compile(
    r"[^!#$%&'()*+,-./:;<=>?@[\]^_`{|}~\u4e00-\u9fa5a-zA-Z0-9\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\n\s]+",
    re.UNICODE)
RE_L = re.compile("\u000C", re.UNICODE)

pdf_path = '/home/weiwu/share/deep_learning/docs/1_macro_economy/'
for (root, dirs, files) in os.walk(pdf_path):
    for filename in files:
        file_path = join(root, filename)
        if file_path.endswith('.pdf'):
            print(file_path)
            fout = file_path[:-3] + 'txt'
            try:
                p2t(file_path, fout, None)
            except Exception:
                continue
            with open(fout, 'r') as fp:
                text = fp.read()
                s = utils.to_unicode(text)
                text = s.replace('\n\n', '')
            with open(fout, 'w') as f:
                f.write(text)
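
For illustration only, the regular expressions defined above can be chained like this on a made-up string (the sample text is invented, not taken from the processed PDFs):

sample = u'Page\u000C 12: GDP grew 6.5% in 2017!! See <b>details</b>.'
cleaned = RE_L.sub(' ', sample)            # remove form-feed characters left over from PDF extraction
cleaned = RE_TAGS.sub(' ', cleaned)        # strip HTML-like tags
cleaned = RE_NUMERIC.sub(' ', cleaned)     # drop digit runs
cleaned = RE_PUNCT.sub(' ', cleaned)       # collapse ASCII and CJK punctuation
cleaned = RE_WHITESPACE.sub(' ', cleaned)  # squeeze repeated whitespace
print(cleaned.strip())                     # e.g. 'Page GDP grew in See details'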
Exemplo n.º 58
0
def cleantext(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s
Exemplo n.º 59
0
def strip_short(s, minsize=3):
    s = utils.to_unicode(s)
    return " ".join(e for e in s.split() if len(e) >= minsize)
Exemplo n.º 60
0
    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
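
To close, a minimal sketch of how an iterator like the one above is typically consumed; the class name, file path, and hyperparameters below are placeholders rather than part of the original snippet:

from gensim import utils
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


class TaggedLineSources(object):
    """Stream one TaggedDocument per line from a {file path: tag prefix} mapping."""

    def __init__(self, sources):
        self.sources = sources

    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield TaggedDocument(utils.to_unicode(line).split(), ['%s_%s' % (prefix, item_no)])


documents = TaggedLineSources({'train-pos.txt': 'TRAIN_POS'})  # hypothetical corpus file
model = Doc2Vec(documents, vector_size=100, min_count=2, epochs=10)
print(model.docvecs['TRAIN_POS_0'][:5])  # trained vector for the first tagged document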