def load_trie(self, trie_cache_file):
    '''
    Load a prebuilt trie from the cache file, or build a new one from the word list and cache it
    :return: the loaded or newly built trie
    '''
    trie = None
    if os.path.isfile(trie_cache_file):
        print('Start loading trie from %s' % trie_cache_file)
        with open(trie_cache_file, 'rb') as f:
            trie = pickle.load(f)
    else:
        print('Trie not found, creating %s' % trie_cache_file)
        listwords = []
        dict_files = [self.wordlist]
        for dict_file in dict_files:
            print(dict_file)
            with open(dict_file, 'r', encoding='utf8') as dict_f:
                for line in dict_f:
                    tokens = nlp.preprocessText(line)
                    if len(tokens) > 0:
                        listwords.append(tokens)
        trie = MyTrie(listwords)
        with open(trie_cache_file, 'wb') as f:
            pickle.dump(trie, f)
    return trie
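# A standalone sketch of the load-or-build caching pattern used by load_trie above;
# the dict returned by build_fn is only a stand-in for the project's MyTrie, and the
# file name in the commented example call is hypothetical.
import os
import pickle

def _demo_load_or_build(cache_file, build_fn):
    # return the cached object if it exists, otherwise build it and cache it
    if os.path.isfile(cache_file):
        with open(cache_file, 'rb') as f:
            return pickle.load(f)
    obj = build_fn()
    with open(cache_file, 'wb') as f:
        pickle.dump(obj, f)
    return obj

# example: _demo_load_or_build('wordlist_cache.pkl', lambda: {'neural': 1, 'network': 2})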
def get_Topics_npFilter(self, topicdocs):
    ptopicdocs = nlp.preprocessed_docs(topicdocs)
    docs = [' '.join(doc.tokens) for doc in ptopicdocs]
    if not self.trained:
        self.train()
    matrix = self.model.transform(docs).todense()
    topic_dic = {}
    for i, doci in enumerate(ptopicdocs):
        chunks = self.npchunk(doci.sentences)
        # map each non-zero tf-idf score back to its vocabulary word
        temptokens = zip(matrix[i].tolist()[0], itertools.count())
        temptokens1 = []
        tfidf_dic = {}
        for (x, y) in temptokens:
            if x > 0.0:
                tfidf_dic[self.i2w[y]] = x
        # keep only the noun-phrase chunks that appear in the tf-idf vocabulary
        for chunk in chunks:
            if chunk in tfidf_dic:
                temptokens1.append((tfidf_dic[chunk],
                                    ' '.join(nlp.preprocessText(chunk, stemming=False,
                                                                stopwords_removal=False))))
        topic_dic[doci.id] = temptokens1
    return topic_dic
def getConcepts(filename):
    gold_list = set()
    with open(filename) as f:
        for line in f:
            gold_list.add(' '.join(nlp.preprocessText(line.replace("\n", "").strip().lower())))
    return list(gold_list)
def extract(self, testset, OUTPUT_FOL=None):
    # OUTPUT_FOL must be a valid path despite the keyword default
    if not os.path.exists(OUTPUT_FOL):
        os.makedirs(OUTPUT_FOL)
    for doc in testset:
        tokens = nlp.preprocessText(doc.text)
        vec = self.vectorize(tokens).tolist()
        with open(os.path.join(OUTPUT_FOL, doc.id + ".txt.phrases"), 'wb') as f_:
            pickle.dump(vec, f_)
def train_D2V(self, ldocuments):
    '''
    Train a Doc2Vec model on the given documents, save it, and optionally dump one vector per document
    '''
    document_dict = {}
    id2num_dict = {}
    documents = []
    for doc in ldocuments:
        doc_num = len(document_dict)
        id2num_dict[doc.id] = doc_num
        words = nlp.preprocessText(doc.text)
        tagged_doc = TaggedDocument(words=words, tags=[doc_num])
        document_dict[doc.id] = (doc_num, tagged_doc)
        documents.append(tagged_doc)

    # d2v_model = Doc2Vec(size=self.config['d2v_vector_length'], window=self.config['d2v_window_size'],
    #                     min_count=self.config['d2v_min_count'], workers=4, alpha=0.025, min_alpha=0.025)
    # use fixed hyperparameters instead of the values from self.config
    d2v_model = Doc2Vec(size=300, window=5, min_count=3, workers=10, iter=30)
    d2v_model.build_vocab(documents)
    if self.pretrained_w2v_path:
        # seed the vocabulary with pretrained word vectors (binary or text format)
        if self.pretrained_w2v_path.endswith('bin'):
            d2v_model.intersect_word2vec_format(self.pretrained_w2v_path, binary=True)
        else:
            d2v_model.intersect_word2vec_format(self.pretrained_w2v_path, binary=False)

    # for epoch in range(20):
    #     print('D2V training epoch = %d' % epoch)
    d2v_model.train(documents, total_examples=len(documents))
    # d2v_model.alpha -= 0.002  # decrease the learning rate
    # d2v_model.min_alpha = d2v_model.alpha  # fix the learning rate, no decay

    # store the model to mmap-able files
    d2v_model.save(self.model_path)
    if self.output_path is not None:
        if not os.path.exists(self.output_path):
            os.makedirs(self.output_path)
        for doc_id, (doc_num, _) in document_dict.items():
            with open(os.path.join(self.output_path, doc_id + ".txt.phrases"), 'wb') as f_:
                pickle.dump(d2v_model.docvecs[doc_num].tolist(), f_)
    return d2v_model
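# A standalone sketch of the Doc2Vec training step in train_D2V, written against the
# gensim 4.x API (vector_size/epochs rather than the older size/iter used above);
# the toy corpus and hyperparameters are illustrative only.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def _demo_doc2vec():
    corpus = [TaggedDocument(words=['keyphrase', 'extraction', 'task'], tags=[0]),
              TaggedDocument(words=['document', 'embedding', 'model'], tags=[1])]
    model = Doc2Vec(vector_size=50, window=2, min_count=1, workers=1, epochs=10)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
    return model.dv[0]  # vector of the first tagged document (docvecs in gensim < 4)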
def scan(self, sentence, min_length=1, max_length=5):
    keyword_list = []
    tokens = nlp.preprocessText(sentence)
    ngrams = []
    for i in range(min_length, max_length + 1):
        ngrams += nltk.ngrams(tokens, i)
    for ngram in ngrams:
        if self.search(' '.join(ngram)):
            keyword_list.append(' '.join(ngram))
    return keyword_list
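# A standalone sketch of the n-gram enumeration used by scan above: nltk.ngrams yields
# tuples of consecutive tokens, which are joined back into candidate phrases.
import nltk

def _demo_ngrams(tokens, min_length=1, max_length=3):
    candidates = []
    for n in range(min_length, max_length + 1):
        candidates += [' '.join(gram) for gram in nltk.ngrams(tokens, n)]
    return candidates

# _demo_ngrams(['deep', 'neural', 'network']) ->
# ['deep', 'neural', 'network', 'deep neural', 'neural network', 'deep neural network']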
def npchunk(self, doc):
    npchunklist = []
    for sen in doc:
        ichunklist = list(nlp_spacy(sen).noun_chunks)
        ichunklist = [nlp.preprocessText(str(ichunk.text)) for ichunk in ichunklist]
        ichunklist = [ichunk for ichunk in ichunklist if len(ichunk) > 0]
        # ichunklistt = [' '.join(ichunk) for ichunk in ichunklist if len(ichunk) <= 3 and len(ichunk) > 0]
        for ichunk in ichunklist:
            if len(ichunk) <= 3:
                npchunklist.append(' '.join(ichunk))
            else:
                # split long noun chunks into overlapping trigrams
                for nc in nltk.ngrams(ichunk, 3):
                    npchunklist.append(' '.join(nc))
    return list(set(npchunklist))
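# A standalone sketch of the noun-phrase chunking used by npchunk above, assuming an
# English spaCy pipeline such as en_core_web_sm is installed; nlp_spacy in the method
# above is this project's preloaded spaCy object.
import spacy

def _demo_noun_chunks(sentence):
    nlp_en = spacy.load('en_core_web_sm')
    return [chunk.text for chunk in nlp_en(sentence).noun_chunks]

# _demo_noun_chunks('The recurrent neural network learns long sentences.')
# typically yields ['The recurrent neural network', 'long sentences']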
def getGlobalngrams(grams, documents, threshold):
    singlecorpus = ""
    for doc in documents:
        singlecorpus += ' ' + doc.text + '\n'
    ncorpus = ' '.join(nlp.preprocessText(singlecorpus))
    tf = TfidfVectorizer(analyzer='word', ngram_range=grams, stop_words=nlp.stopwords)
    tfidf_matrix = tf.fit_transform([ncorpus])
    feature_names = tf.get_feature_names()
    doc = tfidf_matrix.todense()
    temptokens = zip(doc.tolist()[0], itertools.count())
    temptokens = [(x, y) for (x, y) in temptokens if x > threshold]
    tokindex = heapq.nlargest(len(temptokens), temptokens)
    global1grams = dict([(feature_names[y], x) for (x, y) in tokindex])
    topindex = [(feature_names[y], x) for (x, y) in tokindex]
    with open('data/file' + str(grams[0]) + ".txt", 'w') as f:
        for key in global1grams:
            # scores are floats, so convert before writing
            f.write(key + "," + str(global1grams[key]) + "\n")
    return global1grams, topindex
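# A standalone sketch of the corpus-level tf-idf scoring used by getGlobalngrams above:
# the whole corpus is treated as a single document, and only n-grams whose score exceeds
# a threshold are kept. Written against recent scikit-learn, where get_feature_names()
# has become get_feature_names_out().
from sklearn.feature_extraction.text import TfidfVectorizer

def _demo_global_ngrams(texts, ngram_range=(1, 2), threshold=0.1):
    corpus = ' '.join(texts)
    tf = TfidfVectorizer(analyzer='word', ngram_range=ngram_range)
    scores = tf.fit_transform([corpus]).toarray()[0]
    names = tf.get_feature_names_out()
    ranked = sorted(zip(scores, names), reverse=True)
    return [(name, score) for score, name in ranked if score > threshold]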