def tfidf_model(self):
    print('Logging Info - Get Tf-idf model...')
    tfidf_model_path = os.path.join(FEATURE_DIR, '{}_tfidf.model'.format(self.genre))
    dict_path = os.path.join(FEATURE_DIR, '{}_tfidf.dict'.format(self.genre))
    if os.path.exists(tfidf_model_path):
        dictionary = pickle_load(dict_path)
        tfidf_model = TfidfModel.load(tfidf_model_path)
    else:
        corpus = [text.split() for text in
                  self.train_data['premise'] + self.train_data['hypothesis'] +
                  self.dev_data['premise'] + self.dev_data['hypothesis'] +
                  self.test_data['premise'] + self.test_data['hypothesis']]
        dictionary = corpora.Dictionary(corpus)
        corpus = [dictionary.doc2bow(text) for text in corpus]
        tfidf_model = TfidfModel(corpus)
        del corpus
        tfidf_model.save(tfidf_model_path)
        pickle_dump(dict_path, dictionary)
    return dictionary, tfidf_model
class TFIDF(object):
    def __init__(self, max_workers):
        self.max_workers = max_workers
        self.log = logging.getLogger('tfidf_model')
        self.model, self.dictionary = None, None

    # @timed
    def train(self, preprocessed_docs):
        self.log.info('Building dictionary')
        self.dictionary = Dictionary(preprocessed_docs)
        self.log.info('Dictionary built with %d words. Building corpus', len(self.dictionary))
        corpus = [self.dictionary.doc2bow(line) for line in preprocessed_docs]  # convert dataset to BoW format
        self.log.info('Built corpus')
        self.model = TfidfModel(corpus)

    def save_model(self, model_path):
        self.log.info('Saving TFIDF model to file: %s', model_path)
        self.model.save(model_path)

    def save_dictionary(self, dict_path):
        self.log.info('Saving dictionary to file: %s', dict_path)
        self.dictionary.save(dict_path)

    def build_corpus(self, doc_list, dictionary):
        return list(map(dictionary.doc2bow, doc_list))

    @staticmethod
    def with_url_handling(max_workers):
        return TFIDF(max_workers)
def build_tfid_model(dictionary, corpus, should_rebuild):
    tfid = None
    # DEBUG
    should_rebuild = True
    if not should_rebuild:
        try:
            print('Loading TFID Model backup...')
            tfid_file = utils.get_file_path(cfg.TFID_BACKUP)
            print('TFID file = {}'.format(tfid_file))
            # Load the previously saved tf-idf model
            tfid = TfidfModel.load(tfid_file)
        except Exception as exc:
            utils.print_exception_details('Building TFID Model', exc)
    else:
        print('Building TFID Model...')
        tfid = TfidfModel(corpus)
        print('Done!')
        # Save Model Structures
        TFID_FILE = utils.get_file_path(cfg.TFID_BACKUP)
        tfid.save(TFID_FILE)
    return tfid
def make_tfidf(target_posts):
    df_data = pd.read_csv(os.path.join(CONFIG.DATASET_PATH, target_posts, 'posts.csv'),
                          index_col=0, header=None, encoding='utf-8-sig')
    with open(os.path.join(CONFIG.DATASET_PATH, target_posts, 'word_idx.json'), "r", encoding='utf-8') as f:
        word_idx = json.load(f)
    with open(os.path.join(CONFIG.DATASET_PATH, target_posts, 'corpus.txt'), 'r', encoding='utf-8') as f:
        data = f.read()
    text_data = [data.split()]
    print("making documents...")
    dct = Dictionary(text_data)
    documents = [dct.doc2bow(value[1].split()) for index, value in df_data.iterrows()]
    print("embedding started")
    embedding_model = TfidfModel(documents, id2word=word_idx[0])
    model_name = "TFIDF_" + target_posts + ".model"
    embedding_model.save(os.path.join(CONFIG.EMBEDDING_PATH, model_name))
    print("embedding completed")
    corpus_tfidf = embedding_model[documents]
    d = {}
    for doc in corpus_tfidf:
        for id, value in doc:
            word = word_idx[0][str(id)]
            d[word] = value
    sorted_d = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_d[:10])
    dictionary_list = []
    sorted_d = sorted_d[:3000]
    for word, value in sorted_d:
        dictionary_list.append(word)
    with open(os.path.join(CONFIG.DATASET_PATH, target_posts, 'dictionary_list.p'), 'wb') as f:
        cPickle.dump(dictionary_list, f)
def create_tfidf_from_papers(
    path_to_jsonl_index: Path = BIOPAPERS_JSON_PATH,
    path_to_bow: Path = BOW_PATH,
    outfile: Path = TFIDF_VECTORIZER,
) -> TfidfModel:
    """
    Creates TFIDF model from BOW corpora.

    Parameters
    ----------
    path_to_jsonl_index: Path
        Path to json lines index
    path_to_bow: Path
        Path to Bag of Words Dictionary
    outfile: Path
        Path to TFIDF vectorizer

    Returns
    -------
    tfidf_model: TfidfModel
        Gensim TFIDF Model
    """
    # Load dictionary
    dictionary = Dictionary.load(str(path_to_bow))
    # Load corpus generator
    corpus = BiopapersCorpus(dictionary, path_to_jsonl_index)
    # Train TFIDF
    tfidf_model = TfidfModel(corpus)
    # Save TFIDF model to file:
    tfidf_model.save(str(outfile))
    return tfidf_model
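# Hedged usage sketch for the function above (the query string is illustrative and
# the reuse of BOW_PATH / TFIDF_VECTORIZER is an assumption, not part of the original):
# the saved dictionary and model can be loaded back to weight a tokenized query.
#
#   dictionary = Dictionary.load(str(BOW_PATH))
#   tfidf_model = TfidfModel.load(str(TFIDF_VECTORIZER))
#   query_bow = dictionary.doc2bow("gene expression in zebrafish".lower().split())
#   query_tfidf = tfidf_model[query_bow]  # list of (token_id, tf-idf weight) pairs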
def get_corpus(docs):
    print("Building corpus ...")
    tfidf_model = None
    # load corpus from disk
    if ARGS.load_corpus:
        corpus = MmCorpus(ARGS.path_corpus)
    else:
        corpus = [dictionary.doc2bow(doc) for doc in docs]
        # serialize corpus to disk to prevent memory problems if corpus gets too large
        MmCorpus.serialize(ARGS.save_dir + '/corpora/corpus_bow.mm', corpus)
        corpus = MmCorpus(ARGS.save_dir + '/corpora/corpus_bow.mm')
    if ARGS.corpus_type == "TFIDF":
        tfidf_model = TfidfModel(corpus)
        tfidf_model.save(ARGS.save_dir + "/models/tfidf_model.mm")
        corpus = tfidf_model[corpus]
        # serialize corpus to disk to prevent memory problems if corpus gets too large
        MmCorpus.serialize(ARGS.save_dir + '/corpora/corpus_tfidf.mm', corpus)
        corpus = MmCorpus(ARGS.save_dir + '/corpora/corpus_tfidf.mm')
    return corpus, tfidf_model
def tfidf(dataframe, max_words=None):
    """Returns a tf-idf model for documents stored in a DataFrame.

    Precomputed models are read from file if previously cached, or generated
    then cached otherwise.

    Parameters
    ----------
    dataframe : Pandas DataFrame
        The DataFrame containing the documents to process.
    max_words : int, optional
        The maximum number of words stored by the model; if None, the full
        vocabulary is kept.

    Returns
    -------
    model : Gensim TfidfModel
        tf-idf model for documents stored in the DataFrame.
    """
    suffix = '_{}'.format(max_words) if max_words else ''
    filename = 'caches/models/tfidf{}.model'.format(suffix)
    if not os.path.isfile(filename):
        if max_words:
            dictionary = hashdictionary_corpus(dataframe, id_range=max_words)
        else:
            dictionary = dictionary_corpus(dataframe)
        tfidf_model = TfidfModel(dictionary=dictionary)
        tfidf_model.save(filename)
    else:
        tfidf_model = TfidfModel.load(filename)
    return tfidf_model
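# Hedged usage sketch for tfidf() above; the DataFrame layout and the input format
# expected by dictionary_corpus()/hashdictionary_corpus() are assumptions.
#
#   df = pd.DataFrame({'text': [['first', 'document'], ['second', 'document']]})
#   model = tfidf(df, max_words=100000)  # cached under caches/models/tfidf_100000.model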
def train_lda():
    """
    Usage: python Wechat_LDA.py wechat.csv
    """
    with open(sys.argv[1], 'r') as wx:
        for f in wx:
            seg = jieba.cut(f)
            seg = [word for word in seg if word not in stopwords]
            with codecs.open('wechat_seg.txt', encoding='utf-8', mode='ab') as wx_seg:
                # write one segmented document per line
                wx_seg.write(' '.join(seg) + '\n')

    # pass the file name so LineSentence can re-open the file for each pass
    dictionary = corpora.Dictionary(LineSentence('wechat_seg.txt'))
    corpus = [dictionary.doc2bow(text) for text in LineSentence('wechat_seg.txt')]
    tfidf_model = TfidfModel(corpus, id2word=dictionary, normalize=True)
    tfidf_model.save('wechat_seg.txt.tfidf_model')
    # corpora.MmCorpus.serialize('wechat_seg.txt.tfidf_model.mm', tfidf_model[corpus])

    lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=100, workers=cpu_count() - 1)
    lda_model.save('wechat_lda_model.pkl')

    topics = []
    for doc in corpus:
        topics.append(lda_model[doc])
    counts = np.zeros(100)
    for top_doc in topics:
        for ti, _ in top_doc:
            counts[ti] += 1
    words = lda_model.show_topic(counts.argmax(), 64)
    with open('top_words.txt', 'w') as tw:
        writer = UnicodeWriter(tw)
        for w in words:
            writer.writerow((w[0], int(float(w[1]) * 1000)))
def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating simple wiki serialized corpus')
    # Download the raw file if we do not have it already
    if not os.path.isfile(WIKIFILE):
        # Get the file
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)
    i = 0
    article_dict = {}
    for text in wiki.get_texts(meta=True):
        url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
        article_dict[i] = (url_string.format(text[0]), text[1])
        i += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')

    # Now run LSI
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
def tfidf_w2v_top5w(all_docs_prepro):
    # TFIDF MODEL
    exists = os.path.isfile('embedding/models/tfidf_all.model')
    if exists:
        print('Tfidf embedding model already existing')
    else:
        dct = Dictionary(all_docs_prepro)  # fit dictionary
        corpus = [dct.doc2bow(line) for line in all_docs_prepro]  # convert corpus to BoW format
        model_tfidf = TfidfModel(corpus)
        word_path = 'embedding/models/tfidf_all.model'
        model_tfidf.save(word_path)

    # WORD2VEC MODEL
    exists = os.path.isfile('embedding/models/word2vec_all.model')
    if exists:
        print('Word2vec embedding model already existing')
    else:
        print('Training word2vec on all answers')
        word_path = "embedding/models/word2vec_all.model"
        word_tempfile = get_tmpfile(word_path)
        word_model = Word2Vec(all_docs_prepro, size=128, window=5, min_count=1, workers=4)
        word_model.save(word_path)
def train(file=DATA_FILE, type=JSON):
    delete_previous_models()
    faq_df = get_dataframe(os.path.join(DATA_DIR, file), type=type)
    faq_df = clean_data(faq_df)
    faq_df[PROCESSED_QUESTION] = faq_df[CLEAN_QUESTION].apply(preprocess)
    faq_df[PROCESSED_ANSWER] = faq_df[CLEAN_ANSWER].apply(preprocess)
    print('Preprocessing Done')
    if DEBUG:
        print(faq_df.head())
    for mode in modes:
        model = modes[mode]
        dictionary = corpora.Dictionary(faq_df[model.column])
        dictionary.save(os.path.join(MODEL_DIR, model.dictionary))
        corpus = faq_df[model.column].map(dictionary.doc2bow)
        if DEBUG:
            print(f'{model.corpus} generated')
            print(corpus.head())
        corpora.MmCorpus.serialize(os.path.join(MODEL_DIR, model.corpus), corpus)
        tfidf_model = TfidfModel(corpus)
        if DEBUG:
            print(f'{model.tfidf} generated')
        tfidf_model.save(os.path.join(MODEL_DIR, model.tfidf))
        tfidf = tfidf_model[corpus]
        lda_model = LdaMulticore(corpus=tfidf, id2word=dictionary, num_topics=30)
        lda_model.save(os.path.join(MODEL_DIR, model.model))
        if DEBUG:
            print(f'{model.model} generated')
            print(lda_model.print_topics(5))
    print('Training completed')
def main(argv):
    inputfile = ''
    outputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print('training_model.py -i <inputfile> -o <outputfile>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('test.py -i <inputfile> -o <outputfile>')
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg

    # Read messages from log file
    messages = [line for line in open(inputfile)]
    print('Log file contains {} lines'.format(len(messages)))

    tokens = Tokens(messages)
    tokenized = Tokens.clean_tokenized(tokens.pyonmttok(Tokens.TOKENIZER, messages))
    dct = Dictionary(tokenized)
    corpus = [dct.doc2bow(line) for line in tokenized]
    tfidf = TfidfModel(corpus, normalize=True)
    tfidf.save(outputfile)
def main():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    parser = OptionParser()
    parser.add_option('-f', '--corpus-file')
    parser.add_option('-p', '--parse-procs', default=1, type=int)
    parser.add_option('-s', '--sublexicalize-procs', default=1, type=int)
    parser.add_option('-t', '--tfidf-model')
    parser.add_option('-v', '--vocabulary')
    parser.add_option('-m', '--model-file')
    opts, args = parser.parse_args()

    corpus_fn = opts.corpus_file or sys.exit()
    n_proc_parse = opts.parse_procs
    n_proc_sublex = opts.sublexicalize_procs
    vocab_fn = opts.vocabulary
    tfidf_fn = opts.tfidf_model
    model_fn = opts.model_file or sys.exit()

    with BZ2File(corpus_fn) as f:
        corpus = SublexicalizedCorpus(WikiCorpus(corpus_fn, processes=n_proc_parse, dictionary=Dictionary()),
                                      order=(3, 6), clean_func=normalize_whitespace,
                                      n_proc=n_proc_sublex, create_dictionary=False)

        if vocab_fn and os.path.exists(vocab_fn):
            logging.info("Loading vocabulary from %s" % vocab_fn)
            vocab = Dictionary.load(vocab_fn)
        else:
            logging.info("Creating vocabulary")
            start = time.clock()
            vocab = Dictionary(corpus.get_texts())
            end = time.clock()
            logging.info("Vocabulary created in %d seconds" % (end - start))
            if vocab_fn:
                logging.info("Saving dictionary to %s" % vocab_fn)
                vocab.save(vocab_fn)

        corpus.dictionary = vocab
        corpus.dictionary.filter_extremes(no_below=5, no_above=.8)
        corpus.dictionary.compactify()

        if tfidf_fn and os.path.exists(tfidf_fn):
            logging.info("Reading TF-IDF model from %s" % tfidf_fn)
            tfidf = TfidfModel.load(tfidf_fn)
        else:
            logging.info("Creating TF-IDF model")
            tfidf = TfidfModel(corpus)
            if tfidf_fn:
                logging.info("Saving TF-IDF model to %s" % tfidf_fn)
                tfidf.save(tfidf_fn)

        bow_corpus = (tfidf[art] for art in corpus)

        model = LsiModel(corpus=bow_corpus, num_topics=10, id2word=corpus.dictionary)
        model.save(model_fn)
def train_tfidf(inpath=config.path_train_cut):
    train_df = pd.read_csv(inpath, sep="\t", header=None,
                           names=["id", "s1", "s2", "label"], encoding="utf-8")
    tfidf_txt = train_df["s1"].tolist() + train_df["s2"].tolist()
    texts = [tokenize(text) for text in tfidf_txt]

    # remove words that appear only once
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    documents = [[token for token in text if frequency[token] > 1] for text in texts]

    dictionary = Dictionary(documents)
    dictionary.save_as_text("./model/words.dic")
    # dictionary = Dictionary.load_from_text("./model/words.dic")

    class MyCorpus(object):
        def __iter__(self):
            for doc in documents:
                yield dictionary.doc2bow(doc)

    corpus = MyCorpus()
    MmCorpus.serialize("./model/corpus.mm", corpus)
    # corpus = MmCorpus("./model/corpus.mm")
    tfidf = TfidfModel(corpus)
    tfidf.save("./model/tf_idf.model")
def convertBowTfidf(self, folderName, dataBow, dictionary):
    print("start building tfidf...")
    tfidf = TfidfModel(dataBow, id2word=dictionary, normalize=True)
    tfidf.save("data.tfidfModel")
    corpora.MmCorpus.serialize("dataTfidf.mm", tfidf[dataBow], progress_cnt=10000)
    print("data.tfidfModel and dataTfidf.mm have been saved.")
def trainModelM2(self, sampleUtterances_tokens, outpath):
    dictionary = corpora.Dictionary.load(os.path.join(outpath, "dictionary.dict"))
    corpus = [dictionary.doc2bow(line) for line in sampleUtterances_tokens]
    model = TfidfModel(corpus)
    index = similarities.MatrixSimilarity(model[corpus])
    model.save(os.path.join(outpath, "m2.model"))
    index.save(os.path.join(outpath, "m2.index"))
def create_model_tfidf_model(documents, model_name, matrix_name, dic_name):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    tfidfmodel = TfidfModel(corpus)
    index = MatrixSimilarity(tfidfmodel[corpus], num_features=len(dictionary))
    index.save(matrix_name)
    tfidfmodel.save(model_name)
    dictionary.save(dic_name)
    return tfidfmodel, index, dictionary
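# A minimal query sketch against the artifacts returned by create_model_tfidf_model
# above; the helper name and the query tokens are illustrative, not from the original.
def query_similar(query_tokens, tfidfmodel, index, dictionary, topn=5):
    # weight the query with the trained tf-idf model and rank it against the index
    query_bow = dictionary.doc2bow(query_tokens)
    sims = index[tfidfmodel[query_bow]]
    return sorted(enumerate(sims), key=lambda item: item[1], reverse=True)[:topn]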
class EmbedReplace(object):
    def __init__(self, sample_path, wv_path):
        self.samples = read_sample(sample_path)
        self.refs = [sample.split('<sep>')[1].split() for sample in self.samples]
        self.wv = KeyedVectors.load_word2vec_format(wv_path, binary=False)

        if os.path.exists('saved/tfidf.model'):
            self.tfidf_model = TfidfModel.load('saved/tfidf.model')
            self.dct = Dictionary.load('saved/tfidf.dict')
            self.corpus = [self.dct.doc2bow(doc) for doc in self.refs]
        else:
            self.dct = Dictionary(self.refs)
            self.corpus = [self.dct.doc2bow(doc) for doc in self.refs]
            self.tfidf_model = TfidfModel(self.corpus)
            self.dct.save('saved/tfidf.dict')
            self.tfidf_model.save('saved/tfidf.model')
        self.vocab_size = len(self.dct.token2id)

    def vectorize(self, docs, vocab_size):
        return matutils.corpus2dense(docs, vocab_size)

    def extract_keywords(self, dct, tfidf, threshold=0.2, topk=5):
        tfidf = sorted(tfidf, key=lambda x: x[1], reverse=True)
        return list(islice([dct[w] for w, score in tfidf if score > threshold], topk))

    def replace(self, token_list, doc):
        keywords = self.extract_keywords(self.dct, self.tfidf_model[doc])
        num = int(len(token_list) * 0.3)
        new_tokens = token_list.copy()
        while num == int(len(token_list) * 0.3):
            indexes = np.random.choice(len(token_list), num)
            for index in indexes:
                token = token_list[index]
                if isChinese(token) and token not in keywords and token in self.wv:
                    new_tokens[index] = self.wv.most_similar(token, topn=1)[0][0]
                    num -= 1
        return ' '.join(new_tokens)

    def generate_samples(self, write_path):
        replaced = []
        count = 0
        for sample, token_list, doc in zip(self.samples, self.refs, self.corpus):
            count += 1
            if count % 100 == 0:
                print(count)
                write_samples(replaced, write_path)
                replaced = []
            replaced.append(sample.split('<sep>')[0] + '<sep>' + self.replace(token_list, doc))
def get_tfidf(self, path):
    path = path + '.tfidf'
    if not os.path.exists(path):
        tfidf_model = TfidfModel(self.corpus, smartirs='ntc')
        tfidf_model.save(path)
        # re-weight the corpus with the tf-idf model
        self.corpus = tfidf_model[self.corpus]
    else:
        tfidf_model = TfidfModel.load(path)
    return tfidf_model
def lsa_twitter(cased_tokens):
    """ Latent Semantic Analysis on a random sampling of twitter search results
    for words listed in cased_tokens """

    # Only 5 of these tokens are saved for a no_below=2 filter:
    #   PyCons NLPS #PyCon2016 #NaturalLanguageProcessing #naturallanguageprocessing
    if cased_tokens is None:
        cased_tokens = ('PyConOpenSpaces PyCon PyCon2017 PyCon2018 PyCon2016 PyCon2015 OpenSpace PyconTutorial ' +
                        'NLP NaturalLanguageProcessing NLPInAction NaturalLanguageProcessingInAction NLPIA Twote Twip'
                        ).split()
        cased_tokens += [s + 's' for s in cased_tokens]
        cased_tokens += 'TotalGood TotalGoods HobsonLane Hob Hobs TotalGood.com ' \
                        'www.TotalGood.com http://www.TotalGood.com https://www.TotalGood.com'.split()

    allcase_tokens = cased_tokens + [s.lower() for s in cased_tokens]
    allcase_tokens += [s.title() for s in cased_tokens]
    allcase_tokens += [s.upper() for s in cased_tokens]
    KEEP_TOKENS = allcase_tokens + ['#' + s for s in allcase_tokens]

    # takes 15 minutes and 10GB of RAM for 500k tweets if you keep all 20M unique tokens/names/URLs
    vocab_path = os.path.join(BIGDATA_PATH, 'vocab939370.pkl')
    if os.path.isfile(vocab_path):
        print('Loading vocab: {} ...'.format(vocab_path))
        vocab = Dictionary.load(vocab_path)
        print(' len(vocab) loaded: {}'.format(len(vocab.dfs)))
    else:
        tweets_path = os.path.join(BIGDATA_PATH, 'tweets.csv.gz')
        print('Loading tweets: {} ...'.format(tweets_path))
        tweets = read_csv(tweets_path)
        tweets = np.array(tweets.text.str.split())
        with gzip.open(os.path.join(BIGDATA_PATH, 'tweets.txt.gz'), 'w') as f:
            for tokens in tweets:
                f.write((' '.join(tokens) + '\n').encode('utf-8'))
        # tweets['text'] = tweets.text.apply(lambda s: eval(s).decode('utf-8'))
        # tweets['user'] = tweets.user.apply(lambda s: eval(s).decode('utf-8'))
        # tweets.to_csv('tweets.csv.gz', compression='gzip')
        print('Computing vocab from {} tweets...'.format(len(tweets)))
        vocab = Dictionary(tweets)
        vocab.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=KEEP_N, keep_tokens=set(KEEP_TOKENS))
        print(' len(vocab) after filtering: {}'.format(len(vocab.dfs)))

    # no time at all, just a bookkeeping step, doesn't actually compute anything
    tfidf = TfidfModel(id2word=vocab, dictionary=vocab)
    tfidf.save(os.path.join(BIGDATA_PATH, 'tfidf{}.pkl'.format(len(vocab.dfs))))

    tweets = [vocab.doc2bow(tw) for tw in tweets]
    json.dump(tweets, gzip.open(os.path.join(BIGDATA_PATH, 'tweet_bows.json.gz'), 'w'))
    gc.collect()

    # LSA is a more useful name than LSI
    lsa = LsiModel(tfidf[tweets], num_topics=200, id2word=vocab, extra_samples=100, power_iters=2)

    return lsa
def make_corpus():
    corpus = MyCorpus()
    tfidf_model = TfidfModel(corpus)
    corpus_idf = tfidf_model[corpus]
    num_terms = 400
    lsi_model = LsiModel(corpus_idf, id2word=corpus.dictionary, num_topics=num_terms)
    # corpora.MmCorpus.serialize('wiki_en_corpus.mm', corpus)  # store to disk, for later use
    corpus.dictionary.save(os.path.join(HERE, "sogou.dict"))  # store the dictionary, for future reference
    tfidf_model.save(os.path.join(HERE, "sogou.model"))
    lsi_model.save(os.path.join(HERE, "sogou.lsi"))
    print("save dictionary and tfidf model")
def trainModelM2(tests, detector_tokens, sampleUtterances_tokens, productid, outpath):
    dictionary = corpora.Dictionary.load(os.path.join(outpath, "dictionary.dict"))
    corpus = [dictionary.doc2bow(line) for line in sampleUtterances_tokens]
    model = TfidfModel(corpus)
    index = similarities.MatrixSimilarity(model[corpus])
    for test in tests:
        if not testModelForSearch(model, dictionary, index, test):
            return
    model.save(os.path.join(outpath, "m2.model"))
    index.save(os.path.join(outpath, "m2.index"))
def _tfidf(corpus, dictionary):
    tfidf_file_name = get_tfidf_file_name(CORPUS_FILES["label"])
    try:
        tfidf = TfidfModel.load(tfidf_file_name)
    except FileNotFoundError:
        corpus_numeric = [dictionary.doc2bow(document) for document in corpus]
        tfidf = TfidfModel(corpus=corpus_numeric)
        print("File does not exist - creating the tfidf model")
        create_file_and_folders_if_not_exist(tfidf_file_name)
        tfidf.save(tfidf_file_name)
    return tfidf
def make_corpus():
    corpus = MyCorpus()
    tfidf_model = TfidfModel(corpus)
    corpus_idf = tfidf_model[corpus]
    num_terms = 400
    lsi_model = LsiModel(corpus_idf, id2word=corpus.dictionary, num_topics=num_terms)
    # corpora.MmCorpus.serialize('wiki_en_corpus.mm', corpus)  # store to disk, for later use
    corpus.dictionary.save(os.path.join(HERE, 'sogou.dict'))  # store the dictionary, for future reference
    tfidf_model.save(os.path.join(HERE, 'sogou.model'))
    lsi_model.save(os.path.join(HERE, 'sogou.lsi'))
    print('save dictionary and tfidf model')
class Cos():
    def __init__(self):
        self.tfidf = {}
        self.dict = Dictionary()

    def init(self, traindata, dict_path, tfidf_path):
        self.dict = Dictionary(traindata)  # fit dictionary
        corpus = [self.dict.doc2bow(line) for line in traindata]  # convert corpus to BoW format
        self.tfidf = TfidfModel(corpus)  # fit model
        self.dict.save(dict_path)
        self.tfidf.save(tfidf_path)

    def load(self, dict_path, tfidf_path):
        self.dict = Dictionary.load(dict_path)
        self.tfidf = TfidfModel.load(tfidf_path)
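# Hedged sketch of scoring two token lists with the Cos class above; it assumes
# `from gensim import matutils`, and the example sentences and file names are made up.
#
#   cos = Cos()
#   cos.init(train_tokens, 'cos.dict', 'cos.tfidf')
#   vec1 = cos.tfidf[cos.dict.doc2bow('how to train a tfidf model'.split())]
#   vec2 = cos.tfidf[cos.dict.doc2bow('training a tfidf model'.split())]
#   similarity = matutils.cossim(vec1, vec2)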
def main(dataset_path):
    if not os.path.exists('../data/retriever/paragraph-ids.txt'):
        print('Writing paragraph ID to file...')
        with open('../data/retriever/paragraph-ids.txt', 'w') as f:
            for paragraph_id in load_ids(dataset_path):
                f.write(paragraph_id + '\n')

    dictionary_path = '../data/retriever/dct.pkl'
    if not os.path.exists(dictionary_path):
        print('Creating dictionary...')
        st = time.time()
        dct = Dictionary(load_paragraphs(dataset_path), prune_at=5000000)
        dct.save(dictionary_path, pickle_protocol=3)
        et = time.time()
        print(f'\rFinished creating dictionary in {et - st}s.')
    else:
        print('Loading dictionary...')
        dct = Dictionary.load(dictionary_path)
        print('Dictionary loaded.')

    tfidf_path = '../data/retriever/tfidf.pkl'
    if not os.path.exists(tfidf_path):
        print('Creating model...')
        st = time.time()
        corpus = map(dct.doc2bow, load_paragraphs(dataset_path))
        model = TfidfModel(corpus)
        model.save(tfidf_path, pickle_protocol=3)
        et = time.time()
        print(f'\rFinished creating model in {et - st}s.')
    else:
        print('Loading model...')
        model = TfidfModel.load(tfidf_path)
        print('Model loaded.')

    index_path = '../data/retriever/indexes/master-index'
    if not os.path.exists(index_path):
        print('Creating index...')
        st = time.time()
        corpus = map(dct.doc2bow, load_paragraphs(dataset_path))
        index = Similarity('../data/retriever/indexes/index', model[corpus], len(dct))
        index.save(index_path)
        et = time.time()
        print(f'\rFinished creating index in {et - st}s.')
        print('Done')
    else:
        print('Nothing to do. Exiting...')
def build_dict_model(path):
    allFiles = glob.glob(path + "/*.tsv")
    _list = [pd.read_csv(f, header=None, delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
             for f in allFiles]
    frame = pd.concat(_list)
    myhash = str(random.getrandbits(8))
    # column 3 holds the text field
    tokens = [word_tokenize(str(row)) for row in frame.iloc[:, 3]]
    dictionary = Dictionary(tokens)
    dictionary.save("ressources/dictionary" + myhash)
    tfidf_model = TfidfModel([dictionary.doc2bow(t) for t in tokens], id2word=dictionary)
    tfidf_model.save("ressources/tfidf_model" + myhash)
def build_tfidf(corpus_dir, model_filename):
    stemmer = nltk.stem.PorterStemmer()
    corpus = PlaintextCorpusReader(corpus_dir, r'.*\.txt$')  # a memory-friendly iterator
    dictionary = corpora.Dictionary()
    bigram_transformer = Phrases(TextCorpus(corpus))
    for myfile in corpus.fileids():
        try:
            chunks = bigram_transformer[[word.lower() for word in corpus.words(myfile)]]
            dictionary.add_documents([[stemmer.stem(chunk) for chunk in chunks]])
        except Exception as e:
            print('Warning error in file:', myfile)
    model = TfidfModel(BowCorpus(corpus, dictionary, bigram_transformer), id2word=dictionary)
    model.save(model_filename)
def loadTfidfModel(self, type='offline'):
    '''
    Load the tf-idf model; build and cache it if it does not exist yet.
    '''
    filePath = self.cachePath + '%s_tfidf_%s.model' % (self.name, type)
    if os.path.isfile(filePath):
        tfidfModel = SaveLoad.load(filePath)
    else:
        startTime = datetime.now()
        if type not in self.dictionary:
            self.loadDictionary(type)
        tfidfModel = TfidfModel(dictionary=self.dictionary[type])
        # tfidfModel = makeTfidfModel(self.dictionary)
        tfidfModel.save(filePath)
        print('train tfidfModel time:', datetime.now() - startTime)
    self.tfidfModel[type] = tfidfModel
    return tfidfModel
def process():
    # read all the text files in the directory and build a corpus
    corpus = TextDirectoryCorpus("C://Users//Kumar Abhijeet//Project/Preprocess_data//JD//")
    # save word-id dictionary
    # corpus.dictionary.save_as_text('wordids_JD2.txt')
    # save matrix market format vectors
    MmCorpus.serialize('JD_bow.mm', corpus)
    # load word-id dictionary
    id2word = Dictionary.load('foobar.txtdic')
    # load matrix market format vectors
    mm = MmCorpus('JD_bow.mm')
    # train tfidf
    tfidf = TfidfModel(mm, id2word=id2word, normalize=True)
    # save tfidf model
    tfidf.save('tfidf_JD.model')
    # save tfidf vectors in matrix market format
    MmCorpus.serialize('tfidf_JD.mm', tfidf[mm])
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    pickle.dump(docindex, open(outp + '_docindex.p', 'wb'))

    lemmatize = True  # 'lemma' in program
    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)

    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)

    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
def tfidf(model_name):
    """ tf-idf """
    print("prepare data.")
    os.chdir("data")
    set_data(mode="doc")
    sentences = bow_read_docs(folder_name="tmp_file")
    dic = Dictionary(sentences)
    # drop words that occur in fewer than 20 documents or in more than 30% of the documents
    dic.filter_extremes(no_below=20, no_above=0.3)
    bow_corpus = [dic.doc2bow(d) for d in sentences]
    print("train model.")
    model = TfidfModel(bow_corpus)
    print("save model.")
    os.chdir("..")
    model.save("model/%s" % model_name)
class TFIDF(object):
    def __init__(self, max_workers, preprocessor=None):
        self.max_workers = max_workers
        self.log = logging.getLogger('tfidf_model')
        self.preprocessor = preprocessor if preprocessor is not None else Preprocessor(max_workers=max_workers)
        self.model, self.dictionary = None, None

    # @timed
    def train(self, doc_list):
        self.log.info('TFIDF.train called. Starting preprocessing %d documents', len(doc_list))
        preprocessed_docs = self.preprocessor.process_docs(doc_list)
        self.log.info('Preprocessing ended. Building dictionary')
        self.dictionary = Dictionary(preprocessed_docs)
        self.log.info('Dictionary built with %d words. Building corpus', len(self.dictionary))
        corpus = [self.dictionary.doc2bow(line) for line in preprocessed_docs]  # convert dataset to BoW format
        self.log.info('Built corpus')
        self.model = TfidfModel(corpus)

    def save_model(self, model_path):
        self.log.info('Saving TFIDF model to file: %s', model_path)
        self.model.save(model_path)

    def save_dictionary(self, dict_path):
        self.log.info('Saving dictionary to file: %s', dict_path)
        self.dictionary.save(dict_path)

    def build_corpus(self, doc_list, dictionary):
        return list(map(dictionary.doc2bow, doc_list))

    @staticmethod
    def with_url_handling(max_workers):
        return TFIDF(max_workers, preprocessor=WithUrlPreprocessor(max_workers=max_workers))
def main():
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # load back the id->word mapping directly from file
    fin = path.join(datadir, "reuters21578.dict.txt")
    vocabulary = Dictionary.load_from_text(fin)

    # load the corpus
    fin = path.join(datadir, "reuters21578.mm")
    mm = MmCorpus(fin)

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=vocabulary, normalize=True)

    # save the TfidfModel instance to file
    fout = path.join(datadir, "reuters21578.tfidf.model")
    tfidf.save(fout)

    # save TF-IDF vectors in matrix market format
    fout = path.join(datadir, "reuters21578.tfidf.mm")
    MmCorpus.serialize(fout, tfidf[mm], progress_cnt=10000)
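# Minimal follow-up sketch (same file names as above, assumed layout): the saved model
# and the serialized tf-idf corpus can be loaded back for downstream models such as LSI.
#
#   tfidf = TfidfModel.load(path.join(datadir, "reuters21578.tfidf.model"))
#   mm_tfidf = MmCorpus(path.join(datadir, "reuters21578.tfidf.mm"))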
def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating speech serialized corpus')
    # Create the speech corpus, it is inside the rawfile as a json format:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as f:
        speech_dict = json.load(f)
    with open(RAWIDS, 'r') as f:
        id_dict = json.load(f)
    # We also need to make sure that the article ids are saved in the correct
    # format so that the gensimple engine can understand it, like this:
    # "int": ["url", "title"],
    texts = []
    article_dict = {}
    counter = 0
    for key, value in speech_dict.items():
        texts.append([token for token in value['text']])
        article_dict[str(counter)] = [value['url'], id_dict[key]['title']]
        counter += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    corpus = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, corpus)
    print('Speech serialized corpus created')

    # Now run LSI on TF-IDF
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
def build_tfidf_model(data_directory, corpus_path, wiki_text_output_path, model_output_path,
                      multiwords=True, druid_cutoff_score=0.3):
    stemmer = nltk.stem.PorterStemmer()
    tokenid_dictionary = corpora.Dictionary()

    if not exists(wiki_text_output_path):
        logger.info('Converting ' + str(corpus_path) + ' into plain text file: ' + wiki_text_output_path)
        # Convert Wikipedia XML dump into .txt format
        wikidump2text.convert(corpus_path, wiki_text_output_path)
    else:
        logger.info('Found ' + wiki_text_output_path + ', not converting from the raw bz2 file.')

    # Load Multiword Expressions as Dictionary
    stopwords_path = join(data_directory, 'stopwords_en.txt')
    if multiwords:
        druid_path = join(data_directory, 'druid_en.bz2')
        druid_dict = druid.DruidDictionary(druid_path, stopwords_path, cutoff_score=druid_cutoff_score)
        logger.info('Loaded Druid with cutoff ' + str(druid_cutoff_score))
    else:
        druid_dict = None

    logger.info("Building tfidf model...")
    start_time = time.time()

    if multiwords:
        logger.info('Using druid_en.bz2 in ' + data_directory + ' as multiword dictionary.')
        articles = TextCorpus(wiki_text_output_path, druid_dict, multiwords=True)  # a memory-friendly iterator
    else:
        logger.info('Using no multiword dictionary, just single words')
        articles = TextCorpus(wiki_text_output_path, None, multiwords=False)

    tokenid_dictionary.add_documents(articles)

    model = TfidfModel(BowCorpus(wiki_text_output_path, druid_dict, tokenid_dictionary, multiwords=multiwords),
                       id2word=tokenid_dictionary)
    model.save(model_output_path)

    logger.info("Finished building tfidf model. Time needed: " + str(time.time() - start_time))
def data_directory():
    return os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data')


corpus_dir = os.path.join(data_directory(), 'audio_transcripts')
model_filename = os.path.join(data_directory(), 'conversation.tfidf')

stemmer = nltk.stem.PorterStemmer()
corpus = PlaintextCorpusReader(corpus_dir, r'.*\.txt$')  # a memory-friendly iterator
dictionary = corpora.Dictionary()


# Train bigram transformer
class TextCorpus(object):
    def __iter__(self):
        for file in corpus.fileids():
            yield [word.lower() for word in corpus.words(file)]


bigram_transformer = Phrases(TextCorpus())

for file in corpus.fileids():
    chunks = bigram_transformer[[word.lower() for word in corpus.words(file)]]
    dictionary.add_documents([[stemmer.stem(chunk) for chunk in chunks]])


class BowCorpus(object):
    def __iter__(self):
        for file in corpus.fileids():
            chunks = bigram_transformer[[word.lower() for word in corpus.words(file)]]
            yield dictionary.doc2bow([stemmer.stem(chunk) for chunk in chunks])


model = TfidfModel(BowCorpus(), id2word=dictionary)
model.save(model_filename)
if os.path.exists(f_bow):
    corpus = TextCorpus.load(f_bow)
else:
    corpus = TextCorpus(f_corpus)
    # corpus.save(f_bow)
    # filter dictionary
    corpus.dictionary.filter_extremes(no_below=0, no_above=1, keep_n=voc_size)
    corpus.dictionary.save(f_dict)
    corpus.save(f_bow)

# tf-idf model
if os.path.exists(f_tfidf):
    tfidf = TfidfModel.load(f_tfidf)
else:
    tfidf = TfidfModel(corpus, id2word=corpus.dictionary)
    tfidf.save(f_tfidf)

# TRAINING

# lsa model
if not os.path.exists(f_lsa):
    lsa = LsiModel(tfidf[corpus], id2word=corpus.dictionary, num_topics=lsa_dim)
    lsa.save(f_lsa)


# word2vec model
class MyCorpus():
    def __iter__(self):
        for d in corpus.get_texts():
            yield [w for w in d if w in corpus.dictionary.token2id]


if not os.path.exists(f_w2v):
    w2v = Word2Vec(MyCorpus(), size=w2v_dim, min_count=1, window=5)
class TfidfVectorizer():
    """ Transform text to tf-idf representation """

    def __init__(self):
        self.base_path = os.path.dirname(__file__)
        self.dictionary_path = os.path.join(self.base_path, "dictionary")
        self.tf_idf_model_path = os.path.join(self.base_path, "tfidf")

        self.stemmer = NepStemmer()
        self.tf_idf_model = None

    def get_tokens(self, document):
        if not self.stemmer:
            raise Exception("Stemmer not available")
        return self.stemmer.get_stems(document)

    def construct_model(self, documents):
        logging.basicConfig(
            format='%(asctime)s:%(levelname)s:%(message)s',
            level=logging.INFO
        )

        logging.info("Obtaining word tokens")
        tokens = [self.get_tokens(document) for document in documents]
        # self.tf_idf_model = TfidfModel(tokens)

        logging.info("Constructing dictionary")
        self.dictionary = Dictionary(tokens)
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
        self.dictionary.compactify()
        self.dictionary.save(self.dictionary_path)

        logging.info("Constructing TF-IDF model")
        self.tf_idf_model = TfidfModel(dictionary=self.dictionary)
        self.tf_idf_model.save(self.tf_idf_model_path)

    def load_data(self):
        if not self.tf_idf_model:
            if not os.path.exists(self.tf_idf_model_path):
                raise Exception('TF-IDF model file not found')
            self.dictionary = Dictionary.load(self.dictionary_path)
            self.tf_idf_model = TfidfModel.load(self.tf_idf_model_path)
            # number of features needed to densify sparse vectors
            self.no_of_features = len(self.tf_idf_model.idfs)

    def doc2vector(self, document):
        """ Returns the sparse tf-idf vector for given document """
        tokens = self.get_tokens(document)
        bag_of_words = self.dictionary.doc2bow(tokens)
        return self.tf_idf_model[bag_of_words]

    def obtain_feature_vector(self, document):
        """ Returns a single dense tf-idf vector for a given document """
        self.load_data()
        tf_idf_vector = matutils.sparse2full(
            self.doc2vector(document),
            self.no_of_features
        ).reshape(1, -1)
        return tf_idf_vector

    def obtain_feature_matrix(self, documents):
        """ Returns the tf-idf dense matrix for the given documents """
        self.load_data()
        input_matrix_sparse = [self.doc2vector(x) for x in documents]
        no_of_features = len(self.tf_idf_model.idfs)
        input_matrix = matutils.corpus2dense(input_matrix_sparse, no_of_features).transpose()
        return input_matrix
def main(train, model, dic):
    logging.basicConfig(level=logging.INFO)
    corpus = SentenceDocCorpus(train)
    tfidf = TfidfModel(corpus)
    tfidf.save(model)
    corpus.dictionary.save(dic)
if opts.scaling == 'tfidf':
    scaling = 'tfidf'
elif not opts.scaling:
    scaling = None
else:
    raise ValueError("Only tfidf scaling is supported")

word_model = opts.word_model

if word_model:
    logging.info("Building word model")
    corpus = LimitCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), word_limit)
else:
    corpus = SublexicalizedCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), order=order, word_limit=word_limit)

voc = Dictionary(corpus)
voc.filter_extremes(no_below=cutoff)
voc.compactify()

bow_corpus = (voc.doc2bow(art) for art in corpus)

tfidf = None

if scaling == 'tfidf':
    tfidf = TfidfModel(bow_corpus)
    bow_corpus = (tfidf[voc.doc2bow(art)] for art in corpus)

model = LsiModel(corpus=bow_corpus, num_topics=num_topics, id2word=voc)
model.save(model_fn)

if tfidf:
    tfidf.save(model_fn + '.tfidf')
# Remove stop words (additional removal of common words used in spoken language)
stop_ids = []
with open(stop_words_file, 'r') as infile:
    for line in infile:
        try:
            stop_ids.append(wiki.dictionary.token2id[line.lower().strip()])
        except KeyError:
            continue
wiki.dictionary.filter_tokens(bad_ids=stop_ids)

# save dictionary and bag-of-words (term-document frequency matrix)
MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')

# load back the id->word mapping directly from file
# this seems to save more memory, compared to keeping the wiki.dictionary object from above
dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
del wiki

# initialize corpus reader and word->id mapping
mm = MmCorpus(outp + '_bow.mm')

# build tfidf
tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
tfidf.save(outp + '.tfidf_model')

# save tfidf vectors in matrix market format
MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

logger.info("finished running %s" % program)
# In[42]:
tfidf.num_docs

# In[43]:
tfidf.num_nnz

# In[44]:
tfidf.save(os.path.join(DATA_PATH, 'tfidf'))

# In[45]:
tfidf2 = TfidfModel.load(os.path.join(DATA_PATH, 'tfidf'))

# In[46]:
tfidf2.num_nnz
class TextCorpus(object):
    def __init__(self, filename):
        self.corpus = codecs.open(filename, 'r', encoding='utf-8')

    def __iter__(self):
        # One line contains one wiki article.
        for line in self.corpus:
            ngrams = druid_dict.find_ngrams(line.lower().split())
            yield [stemmer.stem(token) for token in ngrams]


articles = TextCorpus(wiki_text_output_path)  # a memory-friendly iterator
dictionary.add_documents(articles)


class BowCorpus(object):
    def __init__(self, filename):
        self.corpus = codecs.open(filename, 'r', encoding='utf-8')

    def __iter__(self):
        for line in self.corpus:
            ngrams = druid_dict.find_ngrams(line.lower().split())
            stemmed_article = [stemmer.stem(token) for token in ngrams]
            yield dictionary.doc2bow(stemmed_article)


model = TfidfModel(BowCorpus(wiki_text_output_path), id2word=dictionary)
model.save(model_output_path)

logger.info("Finished building tfidf model. Time needed: " + str(time.time() - start_time))
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
logger.info("running %s" % ' '.join(sys.argv))

# check and process input arguments
if len(sys.argv) < 3:
    print(globals()['__doc__'] % locals())
    sys.exit(1)
inp, outp = sys.argv[1:3]

if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE

online = 'online' in program
lemmatize = 'lemma' in program
debug = 'nodebug' not in program

wiki = WikiCorpus(inp, lemmatize=lemmatize)  # takes about 9h on a macbook pro, for 3.5m articles (june 2011)

# only keep the most frequent words (out of total ~8.2m unique tokens)
wiki.dictionary.filter_extremes(no_below=20, keep_n=DEFAULT_DICT_SIZE)
wiki.dictionary.save_as_text(outp + '_wordids.txt')

# load back the id->word mapping directly from file
# this seems to save more memory, compared to keeping the wiki.dictionary object from above
dictionary = Dictionary.load_from_text(outp + '_wordids.txt')

# build tfidf, ~50min
tfidf = TfidfModel(wiki, normalize=True)
tfidf.save('tfidf_all_words')

logger.info("finished running %s" % program)
        self.dictionary.save_as_text("wiki_en_wordids.txt")

    def __iter__(self):
        for tokens in iter_documents():
            yield self.dictionary.doc2bow(tokens)


corpus = MyCorpus()  # create a dictionary
corpora.MmCorpus.serialize("wiki_en_corpus.mm", corpus)  # store to disk, for later use
# for vector in corpus:  # convert each document to a bag-of-word vector
#     print(vector)

print("Create models")

tfidf_model = TfidfModel(corpus)
tfidf_model.save("wiki_en_tfidf.model")

# lsi_model = LsiModel(corpus)
# topic_id = 0
# for topic in lsi_model.show_topics():
#     topic_id += 1
#     print("TOPIC (LSI) " + str(topic_id) + " : " + topic)
# lsi_model.print_topic(20, topn=10)
# corpus_lsi = lsi_model[corpus]

corpus_tfidf = tfidf_model[corpus]

lsi_model_2 = LsiModel(corpus_tfidf, id2word=corpus.dictionary, num_topics=300)
# corpus_lsi_2 = lsi_model_2[corpus]