def expand(self, query):
    """Return *query* extended with terms from its LDA topic mixture.

    The query is lower-cased and tokenized, projected into latent topic
    space, and each topic contributes its top ``k * weight`` terms.  The
    collected terms are deduplicated and appended to the original query.
    """
    topic_mix = self.lda.tokens2latent(tokenize(query.lower()))
    term_pool = set()
    for topic_id, weight in topic_mix:
        shown = self.lda.model.show_topic(topicid=topic_id,
                                          topn=round(self.k * weight))
        # show_topic yields (score, term) pairs; keep only the terms.
        term_pool.update(entry[1] for entry in shown)
    return query + " " + " ".join(term_pool)
def expand(self, query):
    """Expand *query* with terms drawn from its LDA topic distribution.

    The query is lower-cased, tokenized, and projected into LDA topic
    space; for each latent topic the top ``k * weight`` topic terms are
    collected, deduplicated, and appended to the original query string.

    :param query: free-text query string
    :returns: the original query followed by the expansion terms,
        space-separated
    """
    tokens = tokenize(query.lower())
    latent = self.lda.tokens2latent(tokens)
    extra_terms = []
    for topic_id, weight in latent:
        # FIX: round() returns a float on Python 2, but show_topic()
        # expects an integer term count — coerce with int().
        topn = self.lda.model.show_topic(topicid=topic_id,
                                         topn=int(round(self.k * weight)))
        extra_terms += [entry[1] for entry in topn]
    # Deduplicate; order after set() is arbitrary, which is harmless
    # for bag-of-words retrieval.
    extra_terms = list(set(extra_terms))
    return query + " " + " ".join(extra_terms)
def create_corpus(): data_folder = os.path.join(*[os.path.dirname(__file__), 'data', 'corpora']) docs = [] count = 1 max_count = 50000 for case in CaseReportLibrary(): # lower case all text (1) text = case.get_text() tokens = tokenize(text) docs.append(tokens) count += 1 if count % 100 == 0: print count,"/",max_count if count >= max_count: break dictionary = corpora.Dictionary(docs) corpus = [dictionary.doc2bow(doc) for doc in docs] dictionary.save(os.path.join(data_folder, 'raw.dict')) corpora.MmCorpus.serialize(os.path.join(data_folder, 'raw.mm'), corpus)
def create_corpus():
    """Build and persist a gensim corpus from the case-report library.

    Tokenizes up to 50,000 case reports, then writes the vocabulary to
    ``data/corpora/raw.dict`` and the bag-of-words corpus to
    ``data/corpora/raw.mm`` (both relative to this module).
    """
    data_folder = os.path.join(os.path.dirname(__file__), 'data', 'corpora')
    docs = []
    max_count = 50000
    # FIX: count started at 1 and was incremented before the limit
    # check, so only 49,999 documents were processed and the progress
    # print ran one ahead; count documents actually appended instead.
    count = 0
    for case in CaseReportLibrary():
        # FIX: lower-case before tokenizing, as the original comment
        # intended — keeps corpus terms consistent with the lower-cased
        # queries used by expand().
        text = case.get_text().lower()
        docs.append(tokenize(text))
        count += 1
        if count % 100 == 0:
            # Same "N / M" output as the old Py2 print statement, but
            # valid on both Python 2 and 3.
            print("%d / %d" % (count, max_count))
        if count >= max_count:
            break
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    dictionary.save(os.path.join(data_folder, 'raw.dict'))
    corpora.MmCorpus.serialize(os.path.join(data_folder, 'raw.mm'), corpus)