def __iter__(self):
    """Iterate through the lines in the source, yielding one `TaggedDocument` per line.

    `self.source` is first probed as a seekable file-like object; if it has no
    `seek`, it is used as a filename and opened with `utils.smart_open`.
    Each document holds the whitespace-split unicode tokens of a line, tagged
    with that line's index. Lines raising `UnicodeDecodeError` are printed and skipped.
    """
    def tag_line(item_no, line):
        # Returns None for undecodable lines so that the caller can skip them.
        try:
            return TaggedDocument(utils.to_unicode(line).split(), [item_no])
        except UnicodeDecodeError as e:
            print(e)
            return None

    try:
        # Only the seek() probe is guarded: an AttributeError raised later, while
        # lines are being processed, must not trigger the filename fallback.
        self.source.seek(0)
    except AttributeError:
        # It didn't work like a file, so use it as a string filename.
        with utils.smart_open(self.source) as fin:
            for item_no, line in enumerate(fin):
                # NOTE(review): only this filename branch drops lines whose length
                # is 100 or less; the file-object branch keeps them. Kept as-is.
                if len(line) <= 100:
                    continue
                tagged_document = tag_line(item_no, line)
                if tagged_document is not None:
                    yield tagged_document
    else:
        for item_no, line in enumerate(self.source):
            tagged_document = tag_line(item_no, line)
            if tagged_document is not None:
                yield tagged_document
def __init__(self, input, transposed=True):
    """Initialize the matrix reader.

    The `input` refers to a file on local filesystem, which is expected to be in
    the sparse (coordinate) Matrix Market format. Documents are assumed to be rows
    of the matrix (and document features are columns).

    `input` is either a string (file path) or a file-like object that supports
    `seek()` (e.g. gzip.GzipFile, bz2.BZ2File).

    If `transposed` is false, the document and term counts read from the size
    line are swapped.

    Raises `ValueError` when the first line is not a
    "matrix coordinate real general" Matrix Market header.
    """
    # Lazy logging arguments: safe even when `input` is a tuple, and the message
    # is only formatted when INFO is enabled.
    logger.info("initializing corpus reader from %s", input)
    self.input, self.transposed = input, transposed
    with utils.file_or_filename(self.input) as lines:
        try:
            header = utils.to_unicode(next(lines)).strip()
            if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                raise ValueError(
                    "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                    (self.input, header))
        except StopIteration:
            # Empty input: deliberately accepted, all counts stay at zero.
            pass

        self.num_docs = self.num_terms = self.num_nnz = 0
        for line in lines:
            line = utils.to_unicode(line)
            if not line.startswith('%'):
                # The first non-comment line carries the three matrix dimensions.
                self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
                if not self.transposed:
                    self.num_docs, self.num_terms = self.num_terms, self.num_docs
                break

    logger.info(
        "accepted corpus with %i documents, %i features, %i non-zero entries",
        self.num_docs, self.num_terms, self.num_nnz)
def main(filename):
    """Copy the '/en'-prefixed vectors of a binary word-vector file into an HDF5 file.

    The input starts with a text header line "<vocab_size> <vector_size>", followed
    by `vocab_size` records of the form "<word> <float32 * vector_size>". For every
    word starting with '/en', a dataset named `word[4:]` is created in the HDF5 file
    and the name is printed. The output path is `filename` with its extension
    replaced by '.hdf5'.

    Raises `EOFError` if the input ends in the middle of a word.
    """
    import os
    from numpy import frombuffer

    # splitext only touches the final path component, so dots in directory
    # names (or a missing extension) no longer corrupt the output path.
    hdf5_filename = os.path.splitext(filename)[0] + '.hdf5'

    with utils.smart_open(filename) as bin_f:
        hdf5_f = h5py.File(hdf5_filename, 'w')
        try:
            header = utils.to_unicode(bin_f.readline(), encoding='utf8')
            vocab_size, vector_size = map(int, header.split())
            binary_len = dtype(float32).itemsize * vector_size
            for _ in range(vocab_size):
                word = []
                while True:
                    ch = bin_f.read(1)
                    if not ch:
                        # read() returns b'' forever at EOF; without this check the
                        # loop would never terminate on a truncated file.
                        raise EOFError("unexpected end of input while reading a word from %s" % filename)
                    if ch == b' ':
                        break
                    if ch != b'\n':
                        # Newlines separating records are not part of the word.
                        word.append(ch)
                word = utils.to_unicode(b''.join(word), encoding='utf8', errors='strict')
                # frombuffer replaces the deprecated binary mode of fromstring.
                vector = frombuffer(bin_f.read(binary_len), dtype=float32)
                if word[:3] == '/en':
                    # presumably the prefix is '/en/' (4 characters) - TODO confirm
                    w = word[4:]
                    hdf5_f.create_dataset(w, data=vector)
                    print(w)
        finally:
            hdf5_f.close()
def __iter__(self):
    """Iterate through the lines in the source, yielding each as a list of unicode tokens.

    `self.source` is first probed as a seekable file-like object; if it has no
    `seek`, it is used as a filename and opened with `utils.smart_open`.
    """
    try:
        # Only the seek() probe is guarded: an AttributeError raised later, while
        # lines are being processed, must not trigger the filename fallback.
        self.source.seek(0)
    except AttributeError:
        # It didn't work like a file, so use it as a string filename.
        with utils.smart_open(self.source) as fin:
            for line in fin:
                yield utils.to_unicode(line).split()
    else:
        for line in self.source:
            yield utils.to_unicode(line).split()
def __iter__(self):
    """Iterate through the lines in the source, yielding one `LabeledSentence` per line.

    Each sentence holds the whitespace-split unicode tokens of a line and the
    labels ``['SENT_<line index>'] + self.topics_name``.

    `self.source` is first probed as a seekable file-like object; if it has no
    `seek`, it is used as a filename and opened with `utils.smart_open`.
    """
    def label_line(item_no, line):
        return LabeledSentence(utils.to_unicode(line).split(), ['SENT_%s' % item_no] + self.topics_name)

    try:
        # Only the seek() probe is guarded: an AttributeError raised later, while
        # lines are being processed, must not trigger the filename fallback.
        self.source.seek(0)
    except AttributeError:
        # It didn't work like a file, so use it as a string filename.
        with utils.smart_open(self.source) as fin:
            for item_no, line in enumerate(fin):
                yield label_line(item_no, line)
    else:
        for item_no, line in enumerate(self.source):
            yield label_line(item_no, line)
def to_array(self):
    """Rebuild `self.sentences` from all files in `self.sources` and return it.

    Every line becomes a `LabeledSentence` of its whitespace-split unicode tokens,
    labelled with the path of the file it came from (the per-source prefix is
    not used here).
    """
    self.sentences = []
    for source, _prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            self.sentences.extend(
                LabeledSentence(utils.to_unicode(line).split(), [source]) for line in fin
            )
    return self.sentences
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'),
             lemmatize=False, deacc=True):
    """Extract keywords from `text` by ranking the nodes of a word graph with PageRank.

    `ratio` and `words` are forwarded to `_extract_tokens` to select the top lemmas;
    `pos_filter` is forwarded to `_get_words_for_graph`; `deacc` is forwarded to
    `_clean_text_by_word`. If `lemmatize` is true, each lemma maps to a single
    word, otherwise `_lemmas_to_words` supplies the mapping. `split` and `scores`
    control the output format produced by `_format_results`.

    When the graph has no edges, an empty result is returned.
    """
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # There is nothing to rank without edges; return an empty result instead of
    # running PageRank on an empty graph.
    if not graph.edges():
        return _format_results([], [], split, scores)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Save a corpus in the List-of-words format.

    This function is automatically called by `LowCorpus.serialize`; don't
    call it directly, call `serialize` instead.

    The first line written is the number of documents; each following line is one
    document with every word repeated `int(value)` times. If `id2word` is None,
    the mapping is built with `utils.dict_from_corpus`. `metadata` is accepted
    but not used here.

    Returns the list of byte offsets at which each document line starts.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)

    # Lazy logging arguments: the message is only formatted when it is emitted.
    logger.info("storing corpus in List-Of-Words format into %s", fname)
    truncated = 0
    offsets = []
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8('%i\n' % len(corpus)))
        for doc in corpus:
            words = []
            for wordid, value in doc:
                # Count the values that lose information when cast to int.
                if abs(int(value) - value) > 1e-6:
                    truncated += 1
                words.extend([utils.to_unicode(id2word[wordid])] * int(value))
            offsets.append(fout.tell())
            fout.write(utils.to_utf8('%s\n' % ' '.join(words)))

    if truncated:
        logger.warning(
            "List-of-words format can only save vectors with "
            "integer elements; %i float entries were truncated to integer value",
            truncated)
    return offsets
def sort_embeddings(self, vocab_file):
    """Reorder the embeddings to follow the word order of `vocab_file`.

    Parameters
    ----------
    vocab_file : str
        Path to file with vocabulary, one word per line. The word on line `i`
        gets index `i` and count `len(vocab) - i`.

    """
    expected_size = len(self.vocab)
    # Snapshots are needed because rows and indices are overwritten in place below.
    old_vectors = copy.deepcopy(self.syn0)
    old_vocab = copy.deepcopy(self.vocab)
    word_counts = {}
    self.index2word = []

    # rebuild index2word in the order given by the vocab file
    with utils.smart_open(vocab_file) as fin:
        for position, raw_line in enumerate(fin):
            token = utils.to_unicode(raw_line).strip()
            word_counts[token] = int(expected_size - position)
            self.index2word.append(token)

    assert len(self.index2word) == expected_size, 'mismatch between vocab sizes'

    for new_index, token in enumerate(self.index2word):
        self.syn0[new_index] = old_vectors[old_vocab[token].index]
        entry = self.vocab[token]
        entry.index = new_index
        entry.count = word_counts[token]
def strip_short(sentence, minsize=3):
    """Drop the whitespace-separated words of `sentence` shorter than `minsize`.

    The remaining words are joined with single spaces and returned as unicode.
    """
    words = utils.to_unicode(sentence).split()
    kept = [word for word in words if len(word) >= minsize]
    return " ".join(kept)
def load_from_text(fname):
    """Load a previously stored Bidictionary from a text file.

    Mirror function to `save_as_text`. The underlying dictionary is restored
    from ``fname + '.index'``; every line of `fname` must hold four tab-separated
    fields: ``bid``, ``fid``, ``sid`` and ``docfreq``.

    Raises `ValueError` for a malformed line and `KeyError` when the same
    ``(fid, sid)`` pair is defined twice.
    """
    result = Bidictionary()
    # restore _unidict as gensim dictionary
    result._unidict = corpora.Dictionary.load_from_text(fname + '.index')
    with utils.smart_open(fname) as f:
        for line in f:
            line = utils.to_unicode(line)
            try:
                # Strip only the line terminator: blindly dropping the last character
                # would cut a digit off `docfreq` when the final line has no newline.
                bid, fid, sid, docfreq = line.rstrip('\r\n').split('\t')
                fid_sid = (int(fid), int(sid))
            except ValueError:
                raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
            bid = int(bid)
            if fid_sid in result.fid_sid2bid:
                raise KeyError(
                    'token %s is defined as ID %d and as ID %d' % (fid_sid, bid, result.fid_sid2bid[fid_sid]))
            result.fid_sid2bid[fid_sid] = bid
            result.dfs[bid] = int(docfreq)
    return result
def load_from_text(fname):
    """Load a previously stored Dictionary from a text file.

    Mirror function to `save_as_text`. The first line may hold the number of
    documents; every other line must hold three tab-separated fields:
    ``wordid``, ``word`` and ``docfreq``.

    Raises `ValueError` for a malformed line and `KeyError` when the same
    word is defined twice.
    """
    result = Dictionary()
    with utils.smart_open(fname) as f:
        for lineno, line in enumerate(f):
            line = utils.to_unicode(line)
            if lineno == 0:
                if line.strip().isdigit():
                    # Older versions of save_as_text may not write num_docs on first line.
                    result.num_docs = int(line.strip())
                    continue
                else:
                    logging.warning("Text does not contain num_docs on the first line.")
            try:
                # Strip only the line terminator: blindly dropping the last character
                # would cut a digit off `docfreq` when the final line has no newline.
                wordid, word, docfreq = line.rstrip('\r\n').split('\t')
            except ValueError:
                raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
            wordid = int(wordid)
            if word in result.token2id:
                raise KeyError('token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word]))
            result.token2id[word] = wordid
            result.dfs[wordid] = int(docfreq)
    return result
def strip_non_alphanum(s):
    """Replace every match of :const:`~gensim.parsing.preprocessing.RE_NONALPHA` in `s` with a space.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string after the substitution.

    Examples
    --------
    >>> from gensim.parsing.preprocessing import strip_non_alphanum
    >>> strip_non_alphanum("if-you#can%read$this&then@this#method^works")
    u'if you can read this then this method works'

    """
    text = utils.to_unicode(s)
    return RE_NONALPHA.sub(" ", text)
def strip_short(s, minsize=3):
    """Drop the words of `s` that are shorter than `minsize` characters.

    Parameters
    ----------
    s : str
    minsize : int, optional
        Minimal length of the words that are kept.

    Returns
    -------
    str
        Unicode string with the remaining words joined by single spaces.

    Examples
    --------
    >>> from gensim.parsing.preprocessing import strip_short
    >>> strip_short("salut les amis du 59")
    u'salut les amis'
    >>>
    >>> strip_short("one two three four five six seven eight nine ten", minsize=5)
    u'three seven eight'

    """
    words = utils.to_unicode(s).split()
    long_enough = [word for word in words if len(word) >= minsize]
    return " ".join(long_enough)
def from_corpus(corpus, id2word=None):
    """Build a :class:`~gensim.corpora.dictionary.Dictionary` from an existing BoW corpus.

    Parameters
    ----------
    corpus : iterable of iterable of (int, number)
        Corpus in BoW format.
    id2word : dict of (int, object)
        Mapping id -> word. If None, every id from 0 up to the largest id seen
        in `corpus` is mapped to its own string form.

    Notes
    -----
    Useful when only a term-document BoW matrix (`corpus`) is available, without
    the original texts. The corpus is scanned once to collect document
    frequencies and counters; every id in the final mapping is guaranteed an
    entry in `dfs` (zero when the id never occurred).

    Returns
    -------
    :class:`~gensim.corpora.dictionary.Dictionary`
        Inferred dictionary from corpus.

    Examples
    --------
    >>> from gensim.corpora import Dictionary
    >>>
    >>> corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []]
    >>> dct = Dictionary.from_corpus(corpus)
    >>> len(dct)
    3

    """
    dictionary = Dictionary()
    highest_id = -1
    for docno, document in enumerate(corpus):
        if docno % 10000 == 0:
            logger.info("adding document #%i to %s", docno, dictionary)
        dictionary.num_docs += 1
        dictionary.num_nnz += len(document)
        for wordid, word_freq in document:
            highest_id = max(wordid, highest_id)
            dictionary.num_pos += word_freq
            dictionary.dfs[wordid] = dictionary.dfs.get(wordid, 0) + 1

    if id2word is not None:
        # id=>word mapping given: simply copy it
        dictionary.token2id = dict(
            (utils.to_unicode(token), idx) for idx, token in iteritems(id2word)
        )
    else:
        # make sure length(result) == get_max_id(corpus) + 1
        dictionary.token2id = dict((unicode(i), i) for i in xrange(highest_id + 1))

    # make sure all token ids have a valid `dfs` entry
    for idx in itervalues(dictionary.token2id):
        dictionary.dfs[idx] = dictionary.dfs.get(idx, 0)

    logger.info(
        "built %s from %i documents (total %i corpus positions)",
        dictionary, dictionary.num_docs, dictionary.num_pos
    )
    return dictionary
def stem_text(text):
    """Return `text` with every whitespace-separated word replaced by its Porter stem.

    The stems are joined with single spaces.
    """
    tokens = utils.to_unicode(text).split()
    stemmer = PorterStemmer()
    stems = [stemmer.stem(token) for token in tokens]
    return ' '.join(stems)
def to_array(self):
    """Rebuild `self.sentences` from all files in `self.sources` and return it.

    Every line becomes a `LabeledSentence` whose words are the whitespace-split
    unicode tokens of the line and whose single tag is ``'<prefix>_<line index>'``.
    """
    # Start from an empty list so that repeated calls do not accumulate duplicates.
    self.sentences = []
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                self.sentences.append(
                    LabeledSentence(words=utils.to_unicode(line).split(), tags=[prefix + '_%s' % item_no]))
    return self.sentences
def testPathLineSentencesOneFile(self):
    """Does PathLineSentences work with a single file argument?"""
    test_file = os.path.join(datapath('PathLineSentences'), '1.txt')
    with utils.smart_open(test_file) as orig:
        sentences = word2vec.PathLineSentences(test_file)
        # Compare each sentence with the next raw line of the same file.
        for words in sentences:
            expected = utils.to_unicode(orig.readline()).split()
            self.assertEqual(words, expected)
def testLineSentenceWorksWithNormalFile(self):
    """Does LineSentence work with a file object argument, rather than filename?"""
    corpus_path = datapath('head500.noblanks.cor')
    with utils.smart_open(corpus_path) as orig:
        with utils.smart_open(corpus_path) as fin:
            sentences = word2vec.LineSentence(fin)
            # Compare each sentence with the next raw line read independently.
            for words in sentences:
                expected = utils.to_unicode(orig.readline()).split()
                self.assertEqual(words, expected)
def line2doc(self, line):
    """Convert a line of the form ``N id1:val1 ... idN:valN`` into a list of (int, float) pairs.

    Raises `ValueError` when the leading count does not match the number of
    remaining fields.
    """
    fields = utils.to_unicode(line).split()
    declared_length = int(fields[0])
    if declared_length != len(fields) - 1:
        raise ValueError("invalid format in %s: %s" % (self.fname, repr(line)))
    # Split on the last colon only.
    pairs = [field.rsplit(':', 1) for field in fields[1:]]
    return [(int(term_id), float(weight)) for term_id, weight in pairs]
def __getitem__(self, sentence):
    """Convert the input tokens `sentence` (=list of unicode strings) into phrase tokens
    (=list of unicode strings, where detected phrases are joined by the delimiter).

    If `sentence` is an entire corpus (iterable of sentences rather than a single
    sentence), return an iterable that converts each of the corpus' sentences
    into phrases on the fly, one after another.

    Two adjacent words are merged when both words and their joined form are in
    the vocabulary, the previous pair was not merged, and
    ``(count(ab) - min_count) / count(a) / count(b) * len(vocab)`` exceeds the threshold.

    Example::

      >>> sentences = Text8Corpus(path_to_corpus)
      >>> bigram = Phrases(sentences, min_count=5, threshold=100)
      >>> for sentence in bigram[sentences]:
      ...     print(u' '.join(sentence))
        he refuted nechaev other anarchists sometimes identified as pacifist anarchists advocated complete
        nonviolence leo_tolstoy

    """
    warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class")
    try:
        is_single = not sentence or isinstance(sentence[0], string_types)
    except Exception:
        # Not indexable (e.g. a generator): treat as a corpus. `Exception` rather
        # than a bare except, so KeyboardInterrupt/SystemExit still propagate.
        is_single = False
    if not is_single:
        # if the input is an entire corpus (rather than a single sentence),
        # return an iterable stream.
        return self._apply(sentence)

    s, new_s = [utils.any2utf8(w) for w in sentence], []
    last_bigram = False
    vocab = self.vocab
    threshold = self.threshold
    delimiter = self.delimiter
    min_count = self.min_count
    for word_a, word_b in zip(s, s[1:]):
        if word_a in vocab and word_b in vocab:
            bigram_word = delimiter.join((word_a, word_b))
            if bigram_word in vocab and not last_bigram:
                pa = float(vocab[word_a])
                pb = float(vocab[word_b])
                pab = float(vocab[bigram_word])
                score = (pab - min_count) / pa / pb * len(vocab)
                if score > threshold:
                    new_s.append(bigram_word)
                    last_bigram = True
                    continue

        # word_a was already emitted as the tail of the previous bigram otherwise
        if not last_bigram:
            new_s.append(word_a)
        last_bigram = False

    if s:  # add last word skipped by previous loop
        last_token = s[-1]
        if not last_bigram:
            new_s.append(last_token)

    return [utils.to_unicode(w) for w in new_s]
def __getitem__(self, sentence):
    """Turn the tokens of `sentence` into phrase tokens.

    Detected phrases are joined by the configured delimiter; the result is a
    list of unicode strings. If `sentence` is an entire corpus (iterable of
    sentences rather than a single sentence), an iterable that converts each
    sentence on the fly is returned instead.
    """
    is_single, sentence = _is_single(sentence)
    if not is_single:
        # a whole corpus was passed: hand back a lazily converted stream
        return self._apply(sentence)

    joiner = self.delimiter
    # scorer=None: the score_item redefinition of this class is used instead
    analyzed = self.analyze_sentence(
        sentence,
        threshold=self.threshold,
        common_terms=self.common_terms,
        scorer=None)
    # Items with a score are word groups that have to be joined into one token.
    tokens = [joiner.join(words) if score is not None else words for words, score in analyzed]
    return [utils.to_unicode(token) for token in tokens]
def __init__(self, fname, fname_vocab=None):
    """Initialize the corpus from a file.

    `fname_vocab` is the file with vocabulary, one word per line. If it is not
    given, the first existing file among `fname.vocab`, `fname/vocab.txt`,
    `<fname without extension>.vocab` and `<directory of fname>/vocab.txt` is used;
    `IOError` is raised when none of them exists.
    """
    IndexedCorpus.__init__(self, fname)
    logger.info("loading corpus from %s" % fname)

    if fname_vocab is None:
        base_name = path.splitext(fname)[0]
        dir_name = path.dirname(fname)
        candidates = (
            fname + '.vocab',
            fname + '/vocab.txt',
            base_name + '.vocab',
            dir_name + '/vocab.txt',
        )
        fname_vocab = next((candidate for candidate in candidates if path.exists(candidate)), None)
        if fname_vocab is None:
            raise IOError('BleiCorpus: could not find vocabulary file')

    self.fname = fname
    with utils.smart_open(fname_vocab) as fin:
        vocabulary = [utils.to_unicode(word).rstrip() for word in fin]
    # word id = position of the word in the vocabulary file
    self.id2word = dict(enumerate(vocabulary))
    self.length = None
def tokenize(self, document):
    """Yield the word list of every sentence in `document`.

    The lower-cased document is split into sentences with `sent_tokenize`; each
    sentence is cleansed with `self.cleanse_text` and tokenized. Stopwords are
    dropped when `self.remove_stopwords` is set and the words are stemmed with an
    English `SnowballStemmer` when `self.stemming` is set.
    """
    lowered = utils.to_unicode(document.lower())
    sentences = sent_tokenize(lowered)
    stemmer = SnowballStemmer("english")
    for sentence in sentences:
        tokens = list(utils.tokenize(self.cleanse_text(sentence)))
        if self.remove_stopwords:
            tokens = [token for token in tokens if token not in self.en_stopwords]
        if self.stemming:
            tokens = [stemmer.stem(token) for token in tokens]
        yield tokens
def line2doc(self, line):
    """Convert a line into a document in BoW format.

    The first two whitespace-separated fields of the line are the document id
    and the language; the rest (possibly empty) holds the words, which are
    converted by the parent class.

    Parameters
    ----------
    line : str
        Line from input file.

    Returns
    -------
    list of (int, int)
        Document in BoW format; a ``(document, (docid, doclang))`` pair if
        `self.metadata` is set.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.test.utils import datapath
        >>> from gensim.corpora import MalletCorpus
        >>>
        >>> corpus = MalletCorpus(datapath("testcorpus.mallet"))
        >>> corpus.line2doc("en computer human interface")
        [(3, 1), (4, 1)]

    """
    fields = utils.to_unicode(line).strip().split(None, 2)
    docid = fields[0]
    doclang = fields[1]
    text = fields[2] if len(fields) > 2 else ''

    document = super(MalletCorpus, self).line2doc(text)
    if not self.metadata:
        return document
    return document, (docid, doclang)
def to_array(self):
    """Rebuild `self.sentences` from all files in `self.sources` and return it.

    Every line becomes a `TaggedDocument` of its whitespace-split unicode tokens,
    tagged with ``'<prefix>_<line index>'``.
    """
    self.sentences = []
    for source, prefix in self.sources.items():
        with utils.smart_open(source) as fin:
            self.sentences.extend(
                TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
                for item_no, line in enumerate(fin)
            )
    return self.sentences
def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
    """Tokenize a piece of text from wikipedia.

    Tokens come from :func:`utils.tokenize`; those starting with an underscore
    or whose character length (not bytes!) is outside
    ``[token_min_len, token_max_len]`` are dropped.

    Parameters
    ----------
    content : str
        String without markup (see :func:`~gensim.corpora.wikicorpus.filter_wiki`).
    token_min_len : int
        Minimal token length.
    token_max_len : int
        Maximal token length.
    lower : bool
        Passed on to :func:`utils.tokenize` as its `lower` argument.

    Returns
    -------
    list of str
        List of tokens from `content`.

    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    accepted = []
    for token in utils.tokenize(content, lower=lower, errors='ignore'):
        if not (token_min_len <= len(token) <= token_max_len):
            continue
        if token.startswith('_'):
            continue
        accepted.append(utils.to_unicode(token))
    return accepted
def load_word_topics(self):
    """Load the topic/word assignment counts from the
    :meth:`gensim.models.wrappers.ldamallet.LdaMallet.fstate` file.

    As a side effect, `self.alpha` is set from the second line of the file.

    Returns
    -------
    numpy.ndarray
        Matrix of shape ``(num_topics, num_terms)``; entry ``[topic, word_id]``
        counts the tokens of that word assigned to that topic.

    """
    logger.info("loading assigned topics from %s", self.fstate())
    counts = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float64)
    word2id = self.id2word.token2id if hasattr(self.id2word, 'token2id') else revdict(self.id2word)

    with utils.smart_open(self.fstate()) as fin:
        next(fin)  # header
        alpha_fields = next(fin).split()[2:]
        self.alpha = numpy.array([float(val) for val in alpha_fields])
        assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
        next(fin)  # beta
        for raw_line in fin:
            # every remaining line must hold exactly six space-separated fields
            _doc, _source, _pos, _typeindex, token, topic = utils.to_unicode(raw_line).split(" ")
            if token in word2id:
                counts[int(topic), word2id[token]] += 1.0
    return counts
def to_array(self):
    """Rebuild `self.sentences` with one `TaggedDocument` per email and return it.

    Each document holds the whitespace-split unicode tokens of the email subject
    and is tagged ``'email_<id>'``.
    """
    self.sentences = []
    self.sentences.extend(
        TaggedDocument(utils.to_unicode(email.subject).split(), ['email_%s' % email.id])
        for email in self.emails
    )
    return self.sentences
def test_switch_id2word(self):
    """The first document must map to the same words before and after ids 0 and 1
    are swapped in `id2word` (only checked for corpora that expose `id2word`)."""
    fname = datapath('testcorpus.' + self.file_extension.lstrip('.'))
    corpus = self.corpus_class(fname)
    if not hasattr(corpus, 'id2word'):
        return

    def first_doc_words():
        # (word, weight) pairs of the first document, resolved through id2word
        document = next(iter(corpus))
        return set((to_unicode(corpus.id2word[word_id]), weight) for word_id, weight in document)

    self.assertEqual(first_doc_words(), {('computer', 1), ('human', 1), ('interface', 1)})

    mapping = corpus.id2word
    mapping[0], mapping[1] = mapping[1], mapping[0]
    corpus.id2word = mapping

    self.assertEqual(first_doc_words(), {('computer', 1), ('human', 1), ('interface', 1)})
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'),
             lemmatize=False, deacc=True):
    """Get the highest ranked words of the provided text and/or their combinations.

    Parameters
    ----------
    text : str
        Input text.
    ratio : float, optional
        Passed to `_extract_tokens` together with `words` to select the top lemmas.
    words : int, optional
        Number of returned words.
    split : bool, optional
        Output format flag, passed to `_format_results`.
    scores : bool, optional
        Output format flag, passed to `_format_results`.
    pos_filter : tuple, optional
        Part of speech filters.
    lemmatize : bool, optional
        If True, each lemma is mapped to a single word only.
    deacc : bool, optional
        If True - remove accentuation.

    Returns
    -------
    result: list of (str, float)
        If `scores`, keywords with scores **OR**
    result: list of str
        If `split`, keywords only **OR**
    result: str
        Keywords, joined by endl.

    """
    # word -> lemma mapping
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    word_sequence = list(_tokenize_by_word(text))

    # build the word graph and connect it
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, word_sequence)
    del word_sequence  # not needed any more

    _remove_unreachable_nodes(graph)

    # nothing to rank without edges
    if not graph.edges():
        return _format_results([], [], split, scores)

    # PageRank yields lemma -> score
    lemma_scores = _pagerank(graph)
    top_lemmas = _extract_tokens(graph.nodes(), lemma_scores, ratio, words)

    # many variations of the same word could pollute the results
    if lemmatize:
        lemma_to_words = {unit.token: [word] for word, unit in iteritems(tokens)}
    else:
        lemma_to_words = _lemmas_to_words(tokens)

    scored_keywords = _get_keywords_with_score(top_lemmas, lemma_to_words)

    # text.split() keeps numbers and punctuation marks, so separated concepts are not combined
    combined = _get_combined_keywords(scored_keywords, text.split())

    return _format_results(scored_keywords, combined, split, scores)
def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, threshold=0.0):
    """Extract keywords from text using the Montemurro and Zanette entropy algorithm. [1]_

    Parameters
    ----------
    text: str
        Document for summarization.
    blocksize: int, optional
        Size of blocks to use in analysis.
    scores: bool, optional
        Whether to return score with keywords.
    split: bool, optional
        Whether to return results as list.
    weighted: bool, optional
        Whether to weight scores by word frequency.
        False can be useful for shorter texts, and allows automatic thresholding.
    threshold: float or 'auto', optional
        Minimum score for returned keywords, 'auto' calculates the threshold as
        n_blocks / (n_blocks + 1.0) + 1e-8, use 'auto' with `weighted=False`.

    Returns
    -------
    results: list(tuple(str, float))
        list of (keyword, score) tuples if `scores` is true **OR**
    results: list(str)
        list of keywords if `scores` is false and `split` is true **OR**
    results: str
        newline separated keywords if both `scores` and `split` are false

    Results are returned in descending order of score regardless of the format.

    Note
    ----
    This algorithm looks for keywords that contribute to the structure of the
    text on scales of `blocksize` words or larger. It is suitable for extracting
    keywords representing the major themes of long texts.

    References
    ----------
    .. [1] Marcello A Montemurro, Damian Zanette, "Towards the quantification of the semantic information encoded in
           written language". Advances in Complex Systems, Volume 13, Issue 2 (2010), pp. 135-153,
           DOI: 10.1142/S0219525910002530, https://arxiv.org/abs/0907.1558

    """
    from collections import Counter

    text = to_unicode(text)
    words = list(_tokenize_by_word(text))
    vocab = sorted(set(words))
    # One Counter per block: each block is scanned once instead of once per
    # vocabulary word. A Counter returns 0 for words absent from the block.
    block_counters = [Counter(words[i:i + blocksize]) for i in range(0, len(words), blocksize)]
    word_counts = numpy.array(
        [[counter[word] for word in vocab] for counter in block_counters]
    ).astype('d')
    n_blocks = word_counts.shape[0]
    totals = word_counts.sum(axis=0)
    n_words = totals.sum()
    p = word_counts / totals
    log_p = numpy.log2(p)
    # Zero counts give 0 * -inf = nan; nan_to_num turns those terms into 0.
    h = numpy.nan_to_num(p * log_p).sum(axis=0)
    analytic = __analytic_entropy(blocksize, n_blocks, n_words)
    h += analytic(totals).astype('d')
    if weighted:
        h *= totals / n_words
    if threshold == 'auto':
        threshold = n_blocks / (n_blocks + 1.0) + 1.0e-8
    weights = [(word, score) for (word, score) in zip(vocab, h) if score > threshold]
    weights.sort(key=lambda x: -x[1])
    result = weights if scores else [word for (word, score) in weights]
    if not (scores or split):
        result = '\n'.join(result)
    return result
def testLineSentenceWorksWithCompressedFile(self):
    """Does LineSentence work with a compressed file object argument?"""
    with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
        compressed = bz2.BZ2File(datapath('head500.noblanks.cor.bz2'))
        sentences = word2vec.LineSentence(compressed)
        # Each sentence must match the next line of the uncompressed copy.
        for words in sentences:
            expected = utils.to_unicode(orig.readline()).split()
            self.assertEqual(words, expected)
def testLineSentenceWorksWithFilename(self):
    """Does LineSentence work with a filename argument?"""
    corpus_path = datapath('lee_background.cor')
    with utils.smart_open(corpus_path) as orig:
        sentences = word2vec.LineSentence(corpus_path)
        # Each sentence must match the next raw line of the same file.
        for words in sentences:
            expected = utils.to_unicode(orig.readline()).split()
            self.assertEqual(words, expected)
def custom_tokenizer(content, token_min_len=2, token_max_len=15, lower=True):
    """Split `content` on whitespace and return the accepted tokens as unicode.

    A token is accepted when its length lies in ``[token_min_len, token_max_len]``
    and it does not start with an underscore. Accepted tokens are lower-cased
    when `lower` is true.
    """
    accepted = []
    for token in content.split():
        if not (token_min_len <= len(token) <= token_max_len):
            continue
        if token.startswith('_'):
            continue
        accepted.append(to_unicode(token.lower() if lower else token))
    return accepted
def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar):
    """Compute accuracy of the model.

    `questions` is a filename where lines are 4-tuples of words, split into
    sections by ": SECTION NAME" lines.
    See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example.

    The accuracy is reported (=printed to log and returned as a list) for each
    section separately, plus there's one aggregate summary at the end.

    Use `restrict_vocab` to ignore all questions containing a word whose frequency
    is not in the top-N most frequent words (default top 30,000).

    Lines that do not consist of exactly four words are logged and skipped.
    Raises `ValueError` when a question appears before the first section header.

    This method corresponds to the `compute-accuracy` script of the original C word2vec.

    """
    ok_vocab = dict(sorted(iteritems(self.vocab), key=lambda item: -item[1].count)[:restrict_vocab])
    ok_index = set(v.index for v in itervalues(ok_vocab))

    sections, section = [], None
    # The context manager closes the questions file even if an error occurs.
    with utils.smart_open(questions) as fin:
        for line_no, line in enumerate(fin):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    self.log_accuracy(section)
                section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
                continue

            if not section:
                raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
            try:
                # TODO assumes vocabulary preprocessing uses lowercase, too...
                a, b, c, expected = [word.lower() for word in line.split()]
            except ValueError:
                logger.info("skipping invalid line #%i in %s", line_no, questions)
                # Without this, the words of the previous question (or undefined
                # names) would be evaluated for the invalid line.
                continue
            if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                logger.debug("skipping line #%i with OOV words: %s", line_no, line.strip())
                continue

            ignore = set(self.vocab[v].index for v in [a, b, c])  # indexes of words to ignore
            predicted = None
            # find the most likely prediction, ignoring OOV words and input words
            for index in argsort(most_similar(self, positive=[b, c], negative=[a], topn=False))[::-1]:
                if index in ok_index and index not in ignore:
                    predicted = self.index2word[index]
                    if predicted != expected:
                        logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted)
                    break
            if predicted == expected:
                section['correct'].append((a, b, c, expected))
            else:
                section['incorrect'].append((a, b, c, expected))

    if section:
        # store the last section, too
        sections.append(section)
        self.log_accuracy(section)

    total = {
        'section': 'total',
        'correct': [question for s in sections for question in s['correct']],
        'incorrect': [question for s in sections for question in s['incorrect']],
    }
    self.log_accuracy(total)
    sections.append(total)
    return sections
def remove_stopwords(s):
    """Return `s` as unicode without the whitespace-separated words found in `STOPWORDS`.

    The remaining words are joined with single spaces.
    """
    words = utils.to_unicode(s).split()
    kept = [word for word in words if word not in STOPWORDS]
    return " ".join(kept)
def strip_multiple_whitespaces(s):
    """Return `s` as unicode with every match of `RE_WHITESPACE` replaced by a single space."""
    text = utils.to_unicode(s)
    return RE_WHITESPACE.sub(" ", text)
def split_alphanum(s):
    """Return `s` as unicode with a space inserted between the two groups of
    every `RE_AL_NUM` match and then of every `RE_NUM_AL` match."""
    text = utils.to_unicode(s)
    after_al_num = RE_AL_NUM.sub(r"\1 \2", text)
    return RE_NUM_AL.sub(r"\1 \2", after_al_num)
def strip_non_alphanum(s):
    """Return `s` as unicode with every match of `RE_NONALPHA` replaced by a space."""
    text = utils.to_unicode(s)
    return RE_NONALPHA.sub(" ", text)
def strip_numeric(s):
    """Return `s` as unicode with every match of `RE_NUMERIC` removed."""
    text = utils.to_unicode(s)
    return RE_NUMERIC.sub("", text)
def add_additional_papers():
    """ Add additional papers for which full text from Arxiv is not present.

    Care is taken that while adding references to THESE papers, these references
    should be in the set of papers stored in the allmagpaperids set (otherwise,
    there will be additional papers in the reference part of the concat contexts
    which are not in the files in the text).

    ALSO NOTE that allmagpaperids contains all papers which either cite or are
    cited so far; inarxiv_papers_set contains the set of papers which are in
    arxiv (citing). A set difference (allmagpaperids - inarxiv_papers_set) gives
    the set of mag_ids for which we get additional text.

    Side effects: pickles both sets under 'Pickles/', runs `magonly_query` on
    `pcur` once per additional id, and appends one line per returned row to
    `fulltext_file`.
    """
    with open('Pickles/inarxiv_papers_set.pickle', 'wb') as picc:
        pickle.dump(inarxiv_papers_set, picc)
    with open('Pickles/allmagpapers_en_magcontexts.pickle', 'wb') as picc2:
        pickle.dump(allmagpaperids, picc2)

    additional_mag_ids = allmagpaperids - inarxiv_papers_set
    for mag_id in tqdm(additional_mag_ids):
        pcur.execute(magonly_query, (mag_id, mag_id))
        # Get paperid, contexts, abstract, title, refids of current paper id
        for row in pcur:
            # row is a dict with keys:
            # dict_keys(['paperid', 'papertitle', 'abstract', 'contexts', 'referenceids'])
            paperid = row.get('paperid')
            # Get all contexts and reference ids (delimiters set in the pSQL query).
            # Either may be None; newlines are only normalised once we know
            # contexts is a string (calling .replace on None would raise).
            contexts = row.get('contexts')
            referenceids = row.get('referenceids')
            title = clean_text(row.get('papertitle'))
            abstract = clean_text(row.get('abstract'))
            print(title)

            # Get a single string for all the contexts
            if contexts is not None and referenceids is not None:
                context_parts = contexts.replace('\n', ' ').split(' ||--|| ')
                reference_parts = referenceids.split(',')
                contexts_with_refs = []
                # Go through context, refid pairs, one at a time
                for context, referenceid in zip(context_parts, reference_parts):
                    # VERY VERY IMPORTANT: if the referenceid is not present in the
                    # allmagpaperids set, IGNORE IT!
                    # DESIGN DECISION: the other choice is to have a LOT of passes.
                    if referenceid not in allmagpaperids:
                        continue
                    contextlist = clean_text(context).split()
                    # Insert the reference id as the MIDDLE word of the context.
                    # NOTE: when multiple reference ids are present, only 1 is inserted (MAG issue).
                    # In the e.g. nips file, it's like this: this paper uses our previous work
                    # on weight space probabilities =-=nips05_0451-=- =-=nips05_0507-=-.
                    index_to_insert = len(contextlist) // 2
                    # Add the ref id with the prefix and suffix
                    value_to_insert = docid_prefix + referenceid + docid_suffix
                    contextlist.insert(index_to_insert, value_to_insert)
                    contexts_with_refs.append(' '.join(contextlist))
                # After all the contexts are iterated over, make them a string.
                contexts_concatenated = ' '.join(contexts_with_refs)
            else:
                contexts_concatenated = ''

            # Concatenate the paperid, title, abstract and the contexts together.
            content = "{} {} {} {}\n".format(paperid, title, abstract, contexts_concatenated)
            content = to_unicode(content)
            # Get rid of empty files
            if content.strip() != '':
                fulltext_file.write(content)
                print("Written file for {}".format(paperid))
def strip_tags(s):
    """Return `s` (converted to unicode) with every RE_TAGS match removed."""
    text = utils.to_unicode(s)
    untagged = RE_TAGS.sub("", text)
    return untagged
def strip_punctuation(s):
    """Return `s` (converted to unicode) with each RE_PUNCT match replaced by a space."""
    text = utils.to_unicode(s)
    unpunctuated = RE_PUNCT.sub(" ", text)
    return unpunctuated
def __iter__(self):
    """Yield one LabeledSentence per line of every file in `self.sources`.

    `self.sources` maps a source path to a label prefix; each line is tagged
    with the single label '<prefix>_<line index within its file>'.
    """
    for src_path, label_prefix in self.sources.items():
        with utils.smart_open(src_path) as handle:
            for position, raw_line in enumerate(handle):
                words = utils.to_unicode(raw_line).split()
                labels = [label_prefix + '_%s' % position]
                yield LabeledSentence(words, labels)
def preprocess_string(s, filters=DEFAULT_FILTERS):
    """Convert `s` to unicode, apply each callable in `filters` in order, and
    return the whitespace-split tokens of the final result.
    """
    text = utils.to_unicode(s)
    for apply_filter in filters:
        text = apply_filter(text)
    tokens = text.split()
    return tokens
def get_raw_text_and_links_from_markup(raw):
    """Decode `raw` markup and pass it through `__remove_markup`.

    `raw` may be None, in which case it is treated as the empty string.
    The input is decoded as UTF-8 (undecodable bytes are ignored) and HTML
    entities are decoded before the markup is removed. Returns whatever
    `__remove_markup` returns for the decoded text.
    """
    # identity test: `== None` would invoke a custom __eq__ on `raw`
    if raw is None:
        raw = ''
    text = utils.to_unicode(raw, 'utf-8', errors='ignore')
    text = utils.decode_htmlentities(text)  # '&nbsp;' --> '\xa0'
    return __remove_markup(text)
def constructLabeledSentences(data):
    """Build a list of LabeledSentence objects from `data.iteritems()`.

    Each (index, row) pair becomes one sentence whose words are the
    whitespace-split unicode form of `row` and whose only label is
    'Text_<index>'.
    """
    return [
        LabeledSentence(utils.to_unicode(row).split(), ['Text' + '_%s' % str(index)])
        for index, row in data.iteritems()
    ]
def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True):
    """
    Compute accuracy of the model.

    `questions` is a filename where lines are 4-tuples of words, split into
    sections by ": SECTION NAME" lines.
    See questions-words.txt in
    https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip
    for an example.

    The accuracy is reported (=printed to log and returned as a list) for each
    section separately, plus there's one aggregate summary at the end.

    Use `restrict_vocab` to ignore all questions containing a word not in the first
    `restrict_vocab` words (default 30,000). This may be meaningful if you've sorted
    the vocabulary by descending frequency. In case `case_insensitive` is True, the
    first `restrict_vocab` words are taken first, and then case normalization is
    performed.

    Use `case_insensitive` to convert all words in questions and vocab to their
    uppercase form before evaluating the accuracy (default True). Useful in case of
    case-mismatch between training tokens and question words. In case of multiple
    case variants of a single word, the vector for the first occurrence (also the
    most frequent if vocabulary is sorted) is taken.

    `self.vocab` is temporarily replaced by the restricted vocabulary while
    `most_similar` runs, and is restored afterwards even if `most_similar` raises.

    Returns a list of dicts with keys 'section', 'correct' and 'incorrect'; the
    last entry is the aggregate 'total' section.

    Raises ValueError if a question line appears before any section header.

    This method corresponds to the `compute-accuracy` script of the original C word2vec.

    """
    ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
    # reversed() so that, among case variants, the earliest word wins in the dict
    ok_vocab = dict((w.upper(), v) for w, v in reversed(ok_vocab)) if case_insensitive else dict(ok_vocab)

    sections, section = [], None
    with utils.smart_open(questions) as fin:
        for line_no, line in enumerate(fin):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    self.log_accuracy(section)
                section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
                continue

            if not section:
                raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
            try:
                if case_insensitive:
                    a, b, c, expected = [word.upper() for word in line.split()]
                else:
                    a, b, c, expected = [word for word in line.split()]
            except ValueError:
                # the line does not hold exactly four words
                logger.info("skipping invalid line #%i in %s" % (line_no, questions))
                continue
            if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                logger.debug("skipping line #%i with OOV words: %s" % (line_no, line.strip()))
                continue

            # Temporarily expose only the restricted vocabulary to most_similar;
            # always put the real vocabulary back, even on error.
            original_vocab = self.vocab
            self.vocab = ok_vocab
            try:
                sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab)
            finally:
                self.vocab = original_vocab

            ignore = set([a, b, c])  # input words to be ignored
            predicted = None
            # find the most likely prediction, ignoring OOV words and input words
            for index in matutils.argsort(sims, reverse=True):
                predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index]
                if predicted in ok_vocab and predicted not in ignore:
                    if predicted != expected:
                        logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted)
                    break
            if predicted == expected:
                section['correct'].append((a, b, c, expected))
            else:
                section['incorrect'].append((a, b, c, expected))

    if section:
        # store the last section, too
        sections.append(section)
        self.log_accuracy(section)

    total = {
        'section': 'total',
        'correct': sum((s['correct'] for s in sections), []),
        'incorrect': sum((s['incorrect'] for s in sections), []),
    }
    self.log_accuracy(total)
    sections.append(total)
    return sections
def load_word2vec_format(cls, fname, fvocab=None, binary=False, norm_only=True):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    `binary` is a boolean indicating whether the data is in binary word2vec format.
    `norm_only` is passed on to `init_sims()` once the matrix has been loaded.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by `-save-vocab` flag of the original C tool). Without `fvocab`, descending
    placeholder counts (`vocab_size - index`) are assigned.

    Raises ValueError for a malformed header or text vector line, and EOFError if a
    binary file ends in the middle of a word.

    """
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s" % (fvocab))
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s" % (fname))
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline())
        vocab_size, layer1_size = map(int, header.split())  # throws for invalid file format
        result = Word2Vec(size=layer1_size)
        result.syn0 = zeros((vocab_size, layer1_size), dtype=REAL)

        def vocab_entry(word, index):
            """Build the Vocab record for `word` stored at row `index`."""
            if counts is None:
                return Vocab(index=index, count=vocab_size - index)
            if word in counts:
                return Vocab(index=index, count=counts[word])
            logger.warning("vocabulary file is incomplete")
            return Vocab(index=index, count=None)

        if binary:
            binary_len = dtype(REAL).itemsize * layer1_size
            for line_no in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        # read() keeps returning b'' at EOF; without this check a
                        # truncated file would loop forever
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have newline, some don't)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word))
                result.vocab[word] = vocab_entry(word, line_no)
                result.index2word.append(word)
                result.syn0[line_no] = fromstring(fin.read(binary_len), dtype=REAL)
        else:
            for line_no, line in enumerate(fin):
                parts = utils.to_unicode(line).split()
                if len(parts) != layer1_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                # materialise the weights: a lazy map() object cannot be assigned to a matrix row
                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                result.vocab[word] = vocab_entry(word, line_no)
                result.index2word.append(word)
                result.syn0[line_no] = weights
    logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
    result.init_sims(norm_only)
    return result
def get_words(text):
    """Tokenize `text` (converted to unicode) with a fresh nltk TweetTokenizer
    and return the resulting tokens.
    """
    tweet_tokenizer = nltk.tokenize.TweetTokenizer()
    return tweet_tokenizer.tokenize(utils.to_unicode(text))
def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000,
                        case_insensitive=True, dummy4unknown=False):
    """
    Compute correlation of the model with human similarity judgments.

    `pairs` is a filename of a dataset where lines are 3-tuples, each consisting
    of a word pair and a similarity value, separated by `delimiter`. Lines
    starting with '#' are treated as comments and skipped.
    An example dataset is included in Gensim (test/test_data/wordsim353.tsv). More
    datasets can be found at http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html
    or https://www.cl.cam.ac.uk/~fh295/simlex.html.

    The model is evaluated using Pearson correlation coefficient and Spearman
    rank-order correlation coefficient between the similarities from the dataset
    and the similarities produced by the model itself.
    The results are printed to log and returned as a triple
    (pearson, spearman, ratio of pairs with unknown words).

    Use `restrict_vocab` to ignore all word pairs containing a word not in the first
    `restrict_vocab` words (default 300,000). This may be meaningful if you've sorted
    the vocabulary by descending frequency. If `case_insensitive` is True, the first
    `restrict_vocab` words are taken, and then case normalization is performed.

    Use `case_insensitive` to convert all words in the pairs and vocab to their
    uppercase form before evaluating the model (default True). Useful when you
    expect case-mismatch between training tokens and words pairs in the dataset.
    If there are multiple case variants of a single word, the vector for the first
    occurrence (also the most frequent if vocabulary is sorted) is taken.

    Use `dummy4unknown=True` to produce zero-valued similarities for pairs with
    out-of-vocabulary words. Otherwise (default False), these pairs are skipped
    entirely.

    `self.vocab` is replaced by the restricted vocabulary while the file is
    processed and is restored afterwards, even if reading or scoring raises.
    """
    ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
    # reversed() so that, among case variants, the earliest word wins in the dict
    ok_vocab = dict((w.upper(), v) for w, v in reversed(ok_vocab)) if case_insensitive else dict(ok_vocab)

    similarity_gold = []
    similarity_model = []
    oov = 0

    original_vocab = self.vocab
    self.vocab = ok_vocab
    try:
        with utils.smart_open(pairs) as fin:
            for line_no, line in enumerate(fin):
                line = utils.to_unicode(line)
                if line.startswith('#'):
                    # May be a comment
                    continue
                try:
                    if case_insensitive:
                        a, b, sim = [word.upper() for word in line.split(delimiter)]
                    else:
                        a, b, sim = [word for word in line.split(delimiter)]
                    sim = float(sim)
                except ValueError:
                    # wrong number of fields, or a non-numeric similarity value
                    logger.info('skipping invalid line #%d in %s', line_no, pairs)
                    continue
                if a not in ok_vocab or b not in ok_vocab:
                    oov += 1
                    if dummy4unknown:
                        similarity_model.append(0.0)
                        similarity_gold.append(sim)
                    else:
                        logger.debug('skipping line #%d with OOV words: %s', line_no, line.strip())
                    continue
                similarity_gold.append(sim)  # Similarity from the dataset
                similarity_model.append(self.similarity(a, b))  # Similarity from the model
    finally:
        # never leave the model with the restricted vocabulary installed
        self.vocab = original_vocab

    spearman = stats.spearmanr(similarity_gold, similarity_model)
    pearson = stats.pearsonr(similarity_gold, similarity_model)
    oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100

    logger.debug(
        'Pearson correlation coefficient against %s: %f with p-value %f',
        pairs, pearson[0], pearson[1])
    logger.debug(
        'Spearman rank-order correlation coefficient against %s: %f with p-value %f',
        pairs, spearman[0], spearman[1])
    logger.debug('Pairs with unknown words: %d' % oov)
    self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs)
    return pearson, spearman, oov_ratio
def parseInstanceFromSentence(self, sentence_str):
    """Split `sentence_str` (converted to unicode) on single spaces, look up each
    piece with `self.getWordEmbeddingFromString`, and return the transposed
    array of the results.
    """
    tokens = utils.to_unicode(sentence_str).split(' ')
    embeddings = [self.getWordEmbeddingFromString(token) for token in tokens]
    return array(embeddings).T
def __getitem__(self, sentence):
    """Merge detected bigram phrases in `sentence` using `self.delimiter`.

    A warning recommending the Phraser class is emitted on every call.

    Parameters
    ----------
    sentence : {list of str, iterable of list of str}
        A single tokenized sentence, or a whole corpus of such sentences.

    Returns
    -------
    {list of str, result of `self._apply`}
        For a single sentence: its tokens, converted to unicode, where every
        item that `analyze_sentence` returned with a non-None score has been
        joined by the delimiter. For a corpus: whatever `self._apply(sentence)`
        returns, so the transformation is applied to each sentence in turn.

    Examples
    --------
    >>> from gensim.test.utils import datapath
    >>> from gensim.models.word2vec import Text8Corpus
    >>> from gensim.models.phrases import Phrases
    >>>
    >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
    >>> phrases = Phrases(sentences, min_count=1, threshold=1)
    >>> print(phrases[[u'trees', u'graph', u'minors']])
    [u'trees_graph', u'minors']

    """
    warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class")

    joiner = self.delimiter  # delimiter used when gluing a phrase together

    single, sentence = _is_single(sentence)
    if not single:
        # the input is an entire corpus (rather than a single sentence):
        # hand back an iterable stream
        return self._apply(sentence)

    scored_items = self.analyze_sentence(
        sentence,
        threshold=self.threshold,
        common_terms=self.common_terms,
        scorer=ft.partial(
            self.scoring,
            len_vocab=float(len(self.vocab)),
            min_count=float(self.min_count),
            corpus_word_count=float(self.corpus_word_count),
        ),
    )
    # items with a score are phrases and get joined; the rest pass through unchanged
    merged = [
        joiner.join(item) if score is not None else item
        for item, score in scored_items
    ]
    return [utils.to_unicode(token) for token in merged]
def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
                          limit=None, datatype=REAL):
    """Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    Parameters
    ----------
    fname : str
        The file path to the saved word2vec-format file.
    fvocab : str, optional
        File path to the vocabulary. Word counts are read from `fvocab` filename, if set
        (this is the file generated by `-save-vocab` flag of the original C tool).
        Note that this file is decoded with the default arguments of
        `utils.to_unicode`; `encoding` and `unicode_errors` are not applied to it.
    binary : bool, optional
        If True, the data is read as binary word2vec format; otherwise as text.
    encoding : str, optional
        If you trained the C model using non-utf8 encoding for words, specify that
        encoding in `encoding`.
    unicode_errors : str, optional
        default 'strict', is a string suitable to be passed as the `errors`
        argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
        file may include word tokens truncated in the middle of a multibyte unicode character
        (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.
    limit : int, optional
        Sets a maximum number of word-vectors to read from the file. Any falsy value
        (the default None, but also 0) means read all.
    datatype : type, optional
        (Experimental) Can coerce dimensions to a non-default float type (such as `np.float16`)
        to save memory. Such types may result in much slower bulk operations or
        incompatibility with optimized routines.

    Returns
    -------
    object
        Returns the loaded model as an instance of :class:`cls`.

    Raises
    ------
    EOFError
        If the file ends before the announced number of vectors has been read.
    ValueError
        If the header or a text-format vector line is malformed.

    """
    from gensim.models.keyedvectors import Vocab
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s", fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            # each line is expected to hold exactly "<word> <count>"
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s", fname)
    with utils.smart_open(fname) as fin:
        # the header line holds "<vocab_size> <vector_size>"
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls(vector_size)
        result.vector_size = vector_size
        result.vectors = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            # The next free row is the number of words accepted so far, so
            # duplicates (which are skipped) leave no gap in the matrix.
            word_id = len(result.vocab)
            if word in result.vocab:
                logger.warning(
                    "duplicate word '%s' in %s, ignoring all but first", word,
                    fname)
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id,
                                           count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                logger.warning(
                    "vocabulary file is incomplete: '%s' is missing", word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.vectors[word_id] = weights
            result.index2word.append(word)

        if binary:
            # vectors are stored as REAL regardless of `datatype`; conversion happens after reading
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in range(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError(
                            "unexpected end of input; is count incorrect or file otherwise damaged?"
                        )
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                with utils.ignore_deprecation_warning():
                    # TODO use frombuffer or something similar
                    weights = fromstring(fin.read(binary_len), dtype=REAL).astype(datatype)
                add_word(word, weights)
        else:
            for line_no in range(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError(
                        "unexpected end of input; is count incorrect or file otherwise damaged?"
                    )
                # fields are separated by single spaces: the word, then vector_size numbers
                parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError(
                        "invalid vector on line %s (is this really the text format?)"
                        % line_no)
                word, weights = parts[0], [datatype(x) for x in parts[1:]]
                add_word(word, weights)
    # skipped duplicates leave unused rows at the end of the matrix; trim them
    if result.vectors.shape[0] != len(result.vocab):
        logger.info(
            "duplicate words detected, shrinking matrix size from %i to %i",
            result.vectors.shape[0], len(result.vocab))
        result.vectors = ascontiguousarray(result.vectors[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.vectors.shape

    logger.info("loaded %s matrix from %s", result.vectors.shape, fname)
    return result
def __iter__(self):
    """Yield one LabeledSentence per line of every file in `self.sources`.

    `self.sources` maps a file name to a label prefix. Each file is opened as
    UTF-8 text from the 'tutorial' directory under `mypath`, and every line is
    tagged with the single label '<prefix>_<line index within its file>'.
    """
    for file_name, label_prefix in self.sources.items():
        full_path = path.join(mypath + '/tutorial', file_name)
        with open(full_path, 'r', encoding="UTF-8") as handle:
            for position, raw_line in enumerate(handle):
                words = utils.to_unicode(raw_line).split()
                labels = [label_prefix + '_%s' % position]
                yield LabeledSentence(words, labels)
# Extra (mostly CJK / full-width) punctuation to strip in addition to string.punctuation.
punctuation = u''':!),.:;?]}¢'"、。〉》」』】〕〗〞︰︱︳﹐、﹒﹔﹕﹖﹗﹚﹜﹞!),.:;?|}︴︶︸︺︼︾﹀﹂﹄﹏、~¢々‖•·ˇˉ―--′’”([{£¥'"‵〈《「『【〔〖([{£¥〝︵︷︹︻︽︿﹁﹃﹙﹛﹝({“‘-—_…'''

# Pre-compiled patterns used by the text-cleaning helpers.
RE_PUNCT = re.compile(r'([%s])+' % re.escape(punctuation + string.punctuation), re.UNICODE)
RE_TAGS = re.compile(r"<([^>]+)>", re.UNICODE)
RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
RE_NONALPHA = re.compile(r"\W", re.UNICODE)
RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE)
RE_NONSENSE = re.compile(
    r"[^!#$%&'()*+,-./:;<=>?@[\]^_`{|}~\u4e00-\u9fa5a-zA-Z0-9\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\n\s]+",
    re.UNICODE)
RE_L = re.compile("\u000C", re.UNICODE)  # form-feed character

pdf_path = '/home/weiwu/share/deep_learning/docs/1_macro_economy/'

# Convert every PDF below `pdf_path` to a sibling .txt file, then rewrite that
# file with all blank-line separators ('\n\n') removed.
for (root, dirs, files) in os.walk(pdf_path):
    for filename in files:
        file_path = join(root, filename)
        if not file_path.endswith('.pdf'):
            continue
        print(file_path)
        fout = file_path[:-3] + 'txt'
        try:
            p2t(file_path, fout, None)
        except Exception as err:
            # Best effort: report the PDF that could not be converted and move on.
            # Unlike a bare `except:`, this lets Ctrl-C / SystemExit stop the run.
            print('skipping %s: %s' % (file_path, err))
            continue
        with open(fout, 'r') as fp:
            text = fp.read()
        s = utils.to_unicode(text)
        text = s.replace('\n\n', '')
        with open(fout, 'w') as f:
            f.write(text)
def cleantext(s):
    """Convert `s` to unicode, lower-case it, and apply every callable in the
    module-level `filters` sequence in order; return the resulting string.
    """
    # Decode first, then lower-case: lower-casing a byte string only folds
    # ASCII letters, so non-ASCII upper-case characters would survive.
    s = utils.to_unicode(s).lower()
    for f in filters:
        s = f(s)
    return s
def strip_short(s, minsize=3):
    """Return `s` (converted to unicode) keeping only the whitespace-separated
    tokens that are at least `minsize` characters long, joined by single spaces.
    """
    words = utils.to_unicode(s).split()
    long_enough = [word for word in words if len(word) >= minsize]
    return " ".join(long_enough)
def __iter__(self):
    """Yield one TaggedDocument per line of every file in `self.sources`.

    `self.sources` maps a source path to a tag prefix; each line is tagged
    with the single tag '<prefix>_<line index within its file>'.
    """
    for src_path, tag_prefix in self.sources.items():
        with utils.smart_open(src_path) as handle:
            for position, raw_line in enumerate(handle):
                words = utils.to_unicode(raw_line).split()
                tags = [tag_prefix + '_%s' % position]
                yield TaggedDocument(words, tags)