def main(argv):
    """Build a gensim dictionary plus bag-of-words corpus and write both to disk.

    The tokenised documents come from ``make_token_list()``. The dictionary is
    saved to ``dictionary.dict`` and the bag-of-words corpus is dumped as JSON
    to ``corpus.json`` (both relative to the current working directory).

    Args:
        argv: Command-line arguments; accepted but not used by this function.
    """
    documents = make_token_list()
    vocabulary = Dictionary(documents)

    bow_corpus = []
    for tokens in documents:
        bow_corpus.append(vocabulary.doc2bow(tokens))

    vocabulary.save('dictionary.dict')
    with open('corpus.json', 'w') as handle:
        json.dump(bow_corpus, handle)
def main():
    """Build a lower-cased token vocabulary from a wiki dump and save it.

    Command-line options:
        -d/--wiki-dump: dump location handed to ``get_dump_gen``.
        -l/--limit: optional integer limit handed to ``get_dump_gen``.
        -p/--num-procs: worker count for ``get_dump_gen`` and the spaCy pipe
            (default 1).
        -o/--out: output file stem (default ``vocab``).

    Writes ``<out>.vocab`` via ``Dictionary.save`` and ``<out>.txt`` via
    ``Dictionary.save_as_text``.
    """
    arg_parser = ArgumentParser()
    arg_parser.add_argument('-d', '--wiki-dump')
    arg_parser.add_argument('-l', '--limit', default=None, type=int)
    arg_parser.add_argument('-p', '--num-procs', default=1, type=int)
    arg_parser.add_argument('-o', '--out', default='vocab')
    options = arg_parser.parse_args()

    articles = get_dump_gen(options.wiki_dump,
                            limit=options.limit,
                            n_procs=options.num_procs)
    nlp = spacy.en.English()

    def normalized_tokens(doc):
        # Skip whitespace-only tokens; lower-case and trim the rest.
        kept = []
        for token in doc:
            if token.text.strip() != "":
                kept.append(token.text.lower().strip())
        return kept

    # Everything below is lazy: articles are only pulled while the
    # Dictionary consumes the generator.
    article_texts = (art['article.text'] for art in articles)
    parsed_docs = nlp.pipe(article_texts,
                           n_threads=options.num_procs,
                           parse=False,
                           tag=False,
                           entity=False)
    vocab = Dictionary(normalized_tokens(doc) for doc in parsed_docs)

    vocab.save('%s.vocab' % options.out)
    vocab.save_as_text('%s.txt' % options.out)
class LdaMalletHandler:
    """Train, persist, reload and query a gensim ``LdaMallet`` topic model.

    Every model lives in its own ``ldamodels/<model_name>/`` directory,
    relative to the current working directory.
    """

    def __init__(self, mallet_path):
        """Store the MALLET path that is handed to ``LdaMallet``."""
        self.mallet_path = mallet_path

    def _reset_retriever(self):
        """Drop the cached ``DocumentRetriever`` so it is rebuilt on next use."""
        self.__dict__.pop('doc_retriever', None)

    def _retriever(self):
        """Return the ``DocumentRetriever`` for the current model, creating it lazily."""
        if not hasattr(self, 'doc_retriever'):
            self.doc_retriever = DocumentRetriever(self.model.fdoctopics())
        return self.doc_retriever

    def run_model(self, model_name, corpus, **kwargs):
        """Train a new model on ``corpus`` under ``ldamodels/<model_name>/``.

        Args:
            model_name: Name of the model; also its directory name.
            corpus: Tokenised documents. It is iterated twice (once to build
                the dictionary, once to build the bag-of-words corpus), so a
                one-shot generator will not work.
            **kwargs: Forwarded unchanged to ``LdaMallet``.
        """
        self.model_name = model_name
        self.dictionary = Dictionary(corpus)
        corpus_bow = [self.dictionary.doc2bow(text) for text in corpus]
        os.makedirs("ldamodels/" + model_name, exist_ok=True)
        self.model = LdaMallet(self.mallet_path,
                               corpus_bow,
                               id2word=self.dictionary,
                               prefix="./ldamodels/" + model_name + "/",
                               **kwargs)
        # A retriever built for a previous model would point at stale
        # doc-topics data, so force a rebuild.
        self._reset_retriever()

    def save_model(self):
        """Save the current model and dictionary into the model directory."""
        self.model.save("ldamodels/" + self.model_name + "/model.model")
        self.dictionary.save("ldamodels/" + self.model_name + "/dict.dict")

    def load_model(self, model_name):
        """Load a model and dictionary previously written by ``save_model``."""
        self.model_name = model_name
        self.dictionary = corpora.Dictionary.load(
            "ldamodels/" + self.model_name + "/dict.dict")
        self.model = LdaMallet.load(
            "ldamodels/" + self.model_name + "/model.model")
        # The stored path may belong to another machine; use ours.
        self.model.mallet_path = self.mallet_path
        # Same reason as in run_model: the cached retriever is model-specific.
        self._reset_retriever()

    def doc_topics(self, doc_idx):
        """Return the retriever's topics for the training document ``doc_idx``."""
        return self._retriever().doc_topics(doc_idx)

    def ext_doc_topics(self, ext_doc):
        """Infer topics for an unseen tokenised document.

        Returns:
            The model's output entries for the document, sorted by their
            second element (the weight) in descending order.
        """
        doc_bow = self.dictionary.doc2bow(ext_doc)
        doc_topics = self.model[doc_bow]
        doc_topics.sort(key=lambda x: x[1], reverse=True)
        return doc_topics

    def ext_doc_n_most_similar(self, ext_doc, n=5, metric='cosine'):
        """Return the ``n`` training documents most similar to an unseen document.

        The unseen document is reduced to its topic weights, which are passed
        to ``DocumentRetriever.n_most_similar`` together with ``metric``.
        """
        retriever = self._retriever()
        doc_bow = self.dictionary.doc2bow(ext_doc)
        topics = [topic[1] for topic in self.model[doc_bow]]
        return retriever.n_most_similar(topics, n=n, metric=metric)

    def n_most_representative(self, topic, n=3):
        """Return the ``n`` training documents closest to a pure ``topic`` vector."""
        retriever = self._retriever()
        # One-hot topic distribution: all weight on the requested topic.
        topics = np.zeros(self.model.num_topics)
        topics[topic] = 1
        return retriever.n_most_similar(topics, n=n)

    def get_string_topics(self, num_topics=-1, num_words=10):
        """Return each topic as a space-separated string of its top words.

        Args:
            num_topics: Number of topics to render; ``-1`` means all topics
                of the model.
            num_words: Number of words per topic.
        """
        if num_topics == -1:
            num_topics = self.model.num_topics
        string_topics = []
        for topic in self.model.print_topics(num_topics=num_topics,
                                             num_words=num_words):
            # The words are the double-quoted parts of the description, i.e.
            # every odd-indexed piece after splitting on '"'.
            quoted = topic[1].split("\"")[1::2]
            string_topics.append(" ".join(quoted))
        return string_topics
class Corpus(object):
    """Train/valid/test splits encoded as index tensors over one ``Dictionary``.

    Attributes set by ``__init__``:
        dictionary: the gensim ``Dictionary`` (loaded or freshly built).
        train, valid, test: ``torch.LongTensor`` of token indices for
            ``train.txt``, ``valid.txt`` and ``test.txt`` under ``path``.
    """

    def __init__(self, path, dict_path):
        """Load or build the dictionary and tokenize the three splits.

        Args:
            path: Directory containing ``train.txt``, ``valid.txt`` and
                ``test.txt``.
            dict_path: Optional dictionary file. If it exists it is loaded
                and left unchanged; otherwise the dictionary is built from
                the splits and, when ``dict_path`` is truthy, saved there.
        """
        self.dictionary = Dictionary()
        add_to_dict = True
        if dict_path and os.path.exists(dict_path):
            print('loading dictionary')
            self.dictionary = self.dictionary.load(dict_path)
            add_to_dict = False
        self.train = self.tokenize(os.path.join(path, 'train.txt'), add_to_dict)
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'), add_to_dict)
        self.test = self.tokenize(os.path.join(path, 'test.txt'), add_to_dict)
        if dict_path and not os.path.exists(dict_path):
            self.dictionary.save(dict_path)

    def tokenize(self, path, add_to_dict):
        """Tokenizes a text file.

        Each ``'\\n'``-separated line is split on whitespace and terminated
        with an ``'<eos>'`` token; all lines are concatenated into a single
        token sequence.

        Args:
            path: Text file to read.
            add_to_dict: When true, the tokens are added to the dictionary
                before being converted to indices.

        Returns:
            ``torch.LongTensor`` with the result of ``Dictionary.doc2idx``
            for the whole file.
        """
        assert os.path.exists(path)
        # Context manager so the handle is closed deterministically.
        with open(path) as handle:
            sentences = handle.read().split('\n')
        all_words = list(
            chain.from_iterable(sent.split() + ['<eos>'] for sent in sentences))
        if add_to_dict:
            self.dictionary.add_documents([all_words])
        return torch.LongTensor(self.dictionary.doc2idx(all_words))
def create_dictionary(analyzed_items_path, dictionary_path=None):
    """Build a ``Dictionary`` from the documents under ``analyzed_items_path``.

    Args:
        analyzed_items_path: Location passed to ``iter_docs`` to obtain the
            documents.
        dictionary_path: Optional path; when truthy the dictionary is saved
            there.

    Returns:
        The newly built ``Dictionary``.
    """
    vocabulary = Dictionary(iter_docs(analyzed_items_path))
    if not dictionary_path:
        return vocabulary
    vocabulary.save(dictionary_path)
    return vocabulary
def build_dictionary():
    """Build, filter and save the dictionary for the wiki articles file.

    Every line of ``wiki_index.ARTICLES_FILE`` is treated as one document:
    it is lower-cased and split on whitespace. Tokens that occur in fewer
    than 2 documents or in more than 50% of the documents are removed before
    the dictionary is saved to ``DICTIONARY_FILE``.

    Returns:
        The filtered ``Dictionary``.
    """
    dictionary = Dictionary()
    # Context manager: the original iterated over a bare open() and never
    # closed the handle.
    with open(wiki_index.ARTICLES_FILE) as articles:
        for line in articles:
            dictionary.add_documents([line.lower().split()])
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    dictionary.save(DICTIONARY_FILE)
    return dictionary
def build_dictionaries_from_splits(splits_template, n, save_pickle_tup=None): ''' Builds all 3 dictionaries from splits. If provided, `save_pickle_tup` must be a 3-tuple of the picklefile names in the following order: (title, body, tags) If `save_pickle_tup[i]` is None, the corresponding dictionary will not be saved. ''' utitledict, ubodydict, utagdict = Dictionary(), Dictionary(), Dictionary() for eid in xrange(n): for row in row_stream(splits_template % eid): ID, title, body, tags = row utitledict.doc2bow(title.split(), allow_update=True) ubodydict.doc2bow(body.split(), allow_update=True) utagdict.doc2bow(tags.split(), allow_update=True) assert ubodydict.num_docs == utitledict.num_docs == utagdict.num_docs print "Before filtering..." print "utitledict:", utitledict print "ubodydict:", ubodydict print "utagdict:", utagdict if save_pickle_tup: assert len(save_pickle_tup) == 3 if save_pickle_tup[0]: print "saving utitledict..." utitledict.save(save_pickle_tup[0]) if save_pickle_tup[1]: print "saving ubodydict..." ubodydict.save(save_pickle_tup[1]) if save_pickle_tup[2]: print "saving utagdict..." utagdict.save(save_pickle_tup[2]) return (utitledict, ubodydict, utagdict)
def lda_train(train_data, part, save_root):
    """Tokenize one column, train an LDA model on it and export topic rows.

    Pipeline (each mapping step runs in a process pool with a progress bar):
        1. ``tokenize`` every text in ``train_data[part]``.
        2. Build a ``Dictionary`` and save it as ``<save_root>/dict``.
        3. Convert the documents with ``doc2bow_unit``.
        4. Train ``LdaMulticore`` (7 workers) and save it as
           ``<save_root>/model``.
        5. Map ``get_document_topics_unit`` over the documents and write the
           rows, prefixed with the ``id`` column, to ``<save_root>/train.csv``.

    Args:
        train_data: Table-like object indexable by ``'id'`` and ``part``.
        part: Name of the text column to model.
        save_root: Directory receiving ``dict``, ``model`` and ``train.csv``.

    Returns:
        Tuple ``(text_dictionary, lda_model)``.
    """
    ids = list(train_data['id'])
    docs = list(train_data[part])

    def mapped(pool, func, items):
        # Ordered parallel map with a progress bar sized to the input.
        progress = tqdm.tqdm(pool.imap(func, items),
                             total=len(items),
                             ncols=100)
        return list(progress)

    with Pool() as pool:
        docs = mapped(pool, tokenize, docs)

    text_dictionary = Dictionary(docs)
    text_dictionary.save(os.path.join(save_root, 'dict'))

    with Pool(initializer=make_dictionary_global,
              initargs=(text_dictionary, )) as pool:
        docs = mapped(pool, doc2bow_unit, docs)

    lda_model = LdaMulticore(docs, workers=7)
    lda_model.save(os.path.join(save_root, 'model'))

    with Pool(initializer=make_model_global,
              initargs=(lda_model, )) as pool:
        rows = mapped(pool, get_document_topics_unit, docs)

    topics = pd.DataFrame(rows, columns=['topics', 'topic_num'])
    topics.insert(0, 'id', ids)
    topics.to_csv(os.path.join(save_root, 'train.csv'), index=False)
    return text_dictionary, lda_model
def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """Create the word dictionary and the TF-IDF corpus for ``src``.

    Files written into ``out_dir``:
        cables_wordids.pickle -- the filtered ``Dictionary``.
        cables_tfidf.mm       -- the TF-IDF weighted corpus.
    ``cables_bow.mm`` is read back from ``out_dir``; it is presumably written
    by ``CorpusHandler`` during the second pass (confirm against that class).

    Args:
        src: Source handed to ``handle_source``.
        out_dir: Output directory.
        no_below: ``no_below`` argument for ``Dictionary.filter_extremes``.
        keep_words: ``keep_n`` argument for ``Dictionary.filter_extremes``.
    """
    def in_out_dir(filename):
        return os.path.join(out_dir, filename)

    # Could be set to something like pred.origin_filter(pred.origin_germany)
    predicate = None

    # 1. Create word dict
    dct = Dictionary()
    dict_pass = create_filter(DictionaryHandler(dct))
    handle_source(src, dict_pass, predicate)
    dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    dct.save(in_out_dir('cables_wordids.pickle'))

    # 2. Reiterate through the cables and create the vector space
    corpus_pass = create_filter(
        CorpusHandler(out_dir, dct=dct, allow_dict_updates=False))
    handle_source(src, corpus_pass, predicate)

    # 3. Load corpus
    bow_corpus = MmCorpus(in_out_dir('cables_bow.mm'))

    # 4. Create TF-IDF model
    tfidf_model = TfidfModel(bow_corpus, id2word=dct, normalize=True)

    # 5. Save the TF-IDF model
    MmCorpus.serialize(in_out_dir('cables_tfidf.mm'),
                       tfidf_model[bow_corpus],
                       progress_cnt=10000)
def build_dictionary(self):
    """Build the board's ``Dictionary`` from its phrase files and save it.

    Documents are streamed through ``ReadThreads`` (file type ``'phrases'``);
    each item is reduced to the whitespace-split second argument of the
    return callback. The dictionary is saved as ``<board>.dictionary``.

    Returns:
        The newly built ``Dictionary``.
    """
    def split_second(x, y):
        # Only the second callback argument carries the text to tokenise.
        return y.split()

    phrase_docs = ReadThreads(self.board,
                              input_dir=self.input_dir,
                              file_type='phrases',
                              return_func=split_second)
    vocabulary = Dictionary(phrase_docs)
    target = f'{self.board}.dictionary'
    vocabulary.save(target)
    return vocabulary
def getDictionary(word_corpus, useSavedTill):
    """Return the gensim dictionary, loading it from disk when allowed.

    Args:
        word_corpus: Mapping whose values are the tokenised documents; only
            used when the dictionary has to be rebuilt.
        useSavedTill: Stage marker; when it is at least
            ``USESAVED.dictionary`` the saved dictionary is loaded instead of
            being rebuilt.

    Returns:
        The loaded or newly created (and saved) ``Dictionary``.
    """
    if useSavedTill >= USESAVED.dictionary:
        common_logger.info("loading dictionary from file")
        return Dictionary.load(file_lda_gensim_dictionary)

    common_logger.info("Creating dictionary from corpus")
    fresh = Dictionary(word_corpus.values())
    common_logger.info("saving dictionary")
    fresh.save(file_lda_gensim_dictionary)
    return fresh
def build_dictionary_from_splits(splits_template, column, n, save_pickle=None): ''' Build dictionary from splits. If `save_pickle` is provided, then save. ''' unfiltered_dict = Dictionary() for eid in xrange(n): unfiltered_dict.add_documents(csv_isolator("../../data/proc_Train_%d.csv" % eid, column)) print "Before filtering,", unfiltered_dict if save_pickle: print "\nsaving..." unfiltered_dict.save(save_pickle) return unfiltered_dict
class TextCorpus(gensim.corpora.TextCorpus):
    """A corpus class which makes some minor extensions to the Gensim
    `TextCorpus` implementation:

    - Support loading of pre-built dictionary
    """

    def __init__(self, input=None, dictionary=None, dictionary_save_path=None,
                 pre_tokenized=False, lowercase=False):
        """Set up the corpus and obtain its dictionary.

        `dictionary` is used as-is when given. Otherwise a new `Dictionary`
        is created and, if `input` is given, filled from `get_texts()`.
        When `dictionary_save_path` is given the dictionary is saved there.
        """
        # NOTE(review): super() is anchored at gensim's TextCorpus, so that
        # class's own __init__ is skipped -- looks deliberate; confirm.
        super(gensim.corpora.TextCorpus, self).__init__()
        self.input = input
        self.metadata = False
        self.pre_tokenized = pre_tokenized
        self.lowercase = lowercase

        if dictionary is not None:
            self.dictionary = dictionary
        else:
            self.dictionary = Dictionary()
            if input is None:
                logging.warning("No input document stream provided; "
                                "assuming dictionary will be "
                                "initialized in some other way.")
            else:
                self.dictionary.add_documents(self.get_texts())

        if dictionary_save_path is not None:
            self.dictionary.save(dictionary_save_path)

    def get_texts(self):
        """Yield one document per input line and record the line count.

        Pre-tokenized lines are yielded as unicode strings (byte strings are
        decoded as UTF-8); other lines are passed through
        `gensim.utils.tokenize`. `self.length` is set once the stream has
        been fully consumed.
        """
        seen = 0
        # Input should have one document (sentence, for the word2vec case) per line
        for line in getstream(self.input):
            seen += 1
            if not self.pre_tokenized:
                yield gensim.utils.tokenize(line, lowercase=self.lowercase)
                continue
            if not isinstance(line, unicode):
                line = unicode(line, encoding='utf8', errors='strict')
            yield line
        self.length = seen
def init_dictionary(self, save=True):
    """Build the gensim dictionary from the gzipped sentence corpus.

    The corpus file is ``self.path`` + the first truthy value of
    ``params['dictionary__corpus_file']``, ``params['corpus_file']`` or
    ``'sentences.txt.gz'``. Each non-empty line is a space-separated
    sentence; an empty line ends the current document.

    Token ids are assigned by descending corpus frequency, so the most
    frequent token gets id 0.

    Args:
        save: When true, the dictionary is also saved to
            ``self.path + 'dictionary.pkl'``.

    Side effects:
        Sets ``self.dictionary``.
    """
    import gzip
    from collections import Counter

    corpus_file = (self.params.get('dictionary__corpus_file')
                   or self.params.get('corpus_file')
                   or 'sentences.txt.gz')
    doc_id = 0
    num_pos = 0
    num_nnz = 0
    cfs = Counter()  # token -> total number of occurrences
    dfs = Counter()  # token -> number of documents containing it
    unique = set()   # distinct tokens of the document being read

    # The gzip handle is managed explicitly: the original rebound the name
    # to the tqdm wrapper, so its close() never reached the file itself.
    with gzip.open(self.path + corpus_file, 'rt', encoding='utf8') as raw:
        progress = tqdm(raw, 'dictionary', self.sentences_cnt)
        try:
            for line in progress:
                line = line.strip()
                if not line:
                    # end of document
                    dfs.update(unique)
                    num_nnz += len(unique)
                    doc_id += 1
                    unique = set()
                    continue
                tokens = line.split(' ')
                cfs.update(tokens)
                num_pos += len(tokens)
                unique.update(tokens)
        finally:
            progress.close()

    if unique:
        # The file did not end with a blank line: count the last document
        # too instead of silently dropping it from the statistics.
        dfs.update(unique)
        num_nnz += len(unique)
        doc_id += 1

    token2id = {t: i for i, (t, cnt) in enumerate(cfs.most_common())}

    dictionary = GensimDictionary()
    dictionary.num_pos = num_pos
    dictionary.num_nnz = num_nnz
    dictionary.num_docs = doc_id
    dictionary.token2id = token2id
    # Fill the existing cfs/dfs mappings in place, keyed by token id.
    for t, i in token2id.items():
        dictionary.cfs[i] = cfs[t]
        dictionary.dfs[i] = dfs[t]

    if save:
        dictionary.save(self.path + 'dictionary.pkl')
    self.dictionary = dictionary
class LDAembedding(InputEmbedding):
    """Input embedding that represents a text by its LDA topic distribution."""

    def __init__(self, workdir="./embedding-models", name="lda-embedding"):
        """
        Creates a vocabulary when ``pretrain`` is called.

        :param workdir: directory holding the saved model and dictionary
        :param name: name used in the saved file names
        """
        super(LDAembedding, self).__init__(workdir=workdir, name=name)
        self._normalizer = TweetNormalisation()

    def _load(self):
        """Load the saved LDA model and dictionary from the work directory.

        :return: ``False`` when no saved model exists, ``True`` after a
            successful load. (Success used to return ``None``, which is
            indistinguishable from failure in a truthiness check.)
        """
        modeldir = self._workdir.joinpath("ldamodel_{}".format(self._name))
        if not modeldir.exists():
            return False
        self._lda = LdaMulticore.load(str(modeldir))
        self._dictionary = Dictionary.load(
            str(self._workdir.joinpath("dictionary_{}.gz".format(self._name))))
        return True

    def pretrain(self, texts: typing.Iterable[typing.Text]):
        """Train the dictionary and a 50-topic LDA model, then save both.

        :param texts: raw texts; each is normalised and whitespace-split
        """
        texts = [self._normalizer(text).split() for text in tqdm(texts)]
        self._dictionary = Dictionary(texts, prune_at=200000)
        corpus = [self._dictionary.doc2bow(text) for text in tqdm(texts)]
        self._lda = LdaMulticore(corpus=corpus,
                                 id2word=self._dictionary,
                                 workers=15,
                                 num_topics=50)
        self._dictionary.save(
            str(self._workdir.joinpath("dictionary_{}.gz".format(self._name))))
        self._lda.save(
            str(self._workdir.joinpath("ldamodel_{}".format(self._name))))

    def get_train_data(self, texts: typing.Iterable[typing.Text]) -> np.array:
        """Return the stacked topic-probability vectors of ``texts``.

        :param texts: raw texts; each is normalised and whitespace-split
        :return: array with one row per text, one column per topic
        """
        def topic_vector(bow):
            # minimum_probability=0 keeps every topic, so all rows have the
            # same length and can be stacked.
            pairs = self._lda.get_document_topics(bow, minimum_probability=0)
            return np.array([v for _, v in pairs])

        return np.stack([
            topic_vector(
                self._dictionary.doc2bow(self._normalizer(text).split()))
            for text in texts
        ])
def main():
    """Create a vocabulary of lower-cased tokens from a wiki dump.

    Options: ``-d/--wiki-dump`` (dump location), ``-l/--limit`` (optional int),
    ``-p/--num-procs`` (int, default 1) and ``-o/--out`` (output stem,
    default ``vocab``). The vocabulary is written to ``<out>.vocab`` and, in
    text form, to ``<out>.txt``.
    """
    parser = ArgumentParser()
    option_table = (
        (('-d', '--wiki-dump'), {}),
        (('-l', '--limit'), {'default': None, 'type': int}),
        (('-p', '--num-procs'), {'default': 1, 'type': int}),
        (('-o', '--out'), {'default': 'vocab'}),
    )
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    opts = parser.parse_args()

    dump_loc, limit = opts.wiki_dump, opts.limit
    n_procs, out_fn = opts.num_procs, opts.out

    dump_gen = get_dump_gen(dump_loc, limit=limit, n_procs=n_procs)
    nlp = spacy.en.English()

    def token_lists():
        # Lazily turn each parsed article into its non-blank, lower-cased
        # and trimmed token texts.
        texts = (art['article.text'] for art in dump_gen)
        for doc in nlp.pipe(texts, n_threads=n_procs, parse=False,
                            tag=False, entity=False):
            yield [token.text.lower().strip()
                   for token in doc
                   if token.text.strip() != ""]

    vocab = Dictionary(token_lists())
    vocab.save('%s.vocab' % out_fn)
    vocab.save_as_text('%s.txt' % out_fn)
help='File name to give the dictionary upon saving') args = parser.parse_args() input_path = args.input_path output_name = args.output_name CHUNK_SIZE = args.chunk_size # Stream in documents from path rdr = lmd.Reader(input_path) gnr = rdr.stream_data(get_meta=True) # Build a dictionary out of the validation documents dictionary = Dictionary() docs = rdr.stream_data(threaded=True) doc_chunks = chunks(docs, size=CHUNK_SIZE) # Progress in chunks for chunk in doc_chunks: print("Adding ", CHUNK_SIZE, " docs") tokenized = [[ tok.lower_ for tok in doc if not tok.is_stop and tok.is_alpha ] for doc in tokenizer.pipe( [item for item in chunk if language(item) == 'en'], batch_size=CHUNK_SIZE)] dictionary.add_documents(tokenized) # Keep only 2**16 most frequent tokens dictionary.filter_extremes(keep_n=2**16) dictionary.compactify() dictionary.save(output_name)
def __init__(self,
             path='test.json',
             tf_idf=True,
             dic_below=15,
             dic_above=0.9,
             dic_keep=80000,
             new=False):
    """Load cached preprocessing results or build them from ``path``.

    When ``dictionary.gensim`` exists and ``new`` is false, the data, the
    processed documents, the dictionary and the corpus are loaded from the
    working directory. Otherwise the documents are segmented with ``jb_cut``
    and the dictionary and corpus are rebuilt; they are written back to the
    cache files only when ``new`` is false.

    Args:
        path: JSON input file.
        tf_idf: Whether ``self.corpus`` is the TF-IDF weighted corpus
            instead of the plain bag-of-words corpus.
        dic_below: ``no_below`` for ``Dictionary.filter_extremes``.
        dic_above: ``no_above`` for ``Dictionary.filter_extremes``.
        dic_keep: ``keep_n`` for ``Dictionary.filter_extremes``.
        new: Force a rebuild and skip writing the dictionary/corpus caches.
    """
    if os.path.isfile('dictionary.gensim') and not new:
        # Cached run: load everything. Every file is opened through a
        # context manager so no handle is left dangling.
        with open(path, 'rb') as inp:
            self.data = pd.DataFrame(json.load(inp))
        with open('pro_docs.json', 'rb') as _inp:
            self.processed_docs = json.load(_inp)
        self.dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
        corpus_file = 'corpus.pkl_tfidf' if tf_idf else 'corpus.pkl'
        # Pickle files are only safe to load when they come from a trusted
        # source (here: files written by a previous run of this code).
        with open(corpus_file, 'rb') as cached:
            self.corpus = pickle.load(cached)
        return

    # NOTE(review): unlike the cached branch, this branch never sets
    # self.data -- confirm whether callers rely on it after a rebuild.

    # Use jieba to cut the documents into words; the result is a list of
    # token lists, e.g. [[...], [word1, word2, word3, ...], [...]].
    jb = jb_cut(path)
    processed_docs = jb.process()
    self.processed_docs = processed_docs  # used for training
    with open("pro_docs.json", 'w', encoding="utf-8") as outp:
        outp.write(
            json.dumps(self.processed_docs, indent=4, ensure_ascii=False))

    # Build and filter the dictionary from the token lists.
    dictionary = Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=dic_below,
                               no_above=dic_above,
                               keep_n=dic_keep)
    self.dictionary = dictionary
    if not new:
        dictionary.save('dictionary.gensim')

    # Bag-of-words corpus: a list (documents) of lists of tuples.
    corpus = [dictionary.doc2bow(text) for text in processed_docs]
    if not new:
        with open('corpus.pkl', 'wb') as out:
            pickle.dump(corpus, out)

    if tf_idf:
        tfidf_model = models.TfidfModel(corpus)
        corpus = tfidf_model[corpus]
        if not new:
            with open('corpus.pkl_tfidf', 'wb') as out:
                pickle.dump(corpus, out)
    self.corpus = corpus
def main(args):
    """Build the context and item vocabularies and their pretrained embeddings.

    Steps:
        1. Load the spaCy pipeline, the special tokens and the pretrained
           keyed-vector model selected by ``args.model_type``.
        2. Read ``<data_dir>/<task>/<mode>/data.csv`` and drop every row
           whose (description, title) pair occurs more than once.
        3. If ``args.select_context_name`` is set: build the context
           vocabulary, save it together with the gensim dictionary
           (``args.lda_vocab_file``) and create its embeddings.
        4. If ``args.select_item_name`` is set: build the item vocabulary,
           save it and create its embeddings.

    Raises:
        FileNotFoundError: if the embedding file does not exist.
        ValueError: if ``args.model_type`` is neither ``'orig'`` nor ``'ft'``.
    """
    # set path variable
    data_path = os.path.join(args.data_dir, args.task)
    embed_path = f'{args.model_type}_{args.embed_type}_{args.corpora}_{args.embed_dim}d.kv'
    embed_path = os.path.join('./process/model/embed_model', embed_path)

    # load model for nlp pipeline & embed
    spacy.require_gpu()
    en_nlp = spacy.load(args.nlp_type)
    special_tokens = load_special_tokens(args)

    logger.info(f'loading pretrain embed model from {embed_path}')
    if not os.path.isfile(embed_path):
        raise FileNotFoundError('Embed file path incorrect!')
    if args.model_type == 'orig':
        kv_model = keyedvectors.KeyedVectors.load(embed_path)
    elif args.model_type == 'ft':
        kv_model = fasttext.FastTextKeyedVectors.load(embed_path)
    else:
        # Fail here with a clear message; kv_model used to stay unbound and
        # only blew up later when the embeddings were created.
        raise ValueError(
            f"Unsupported model_type {args.model_type!r}; "
            "expected 'orig' or 'ft'")

    # read data
    df = pd.read_csv(os.path.join(data_path, args.mode, 'data.csv'),
                     encoding=args.encode_format)
    df = df.drop_duplicates(subset=['description', 'title'], keep=False)
    logger.info(f'Read data from {os.path.join(data_path,args.mode)} success!')
    logger.info(f'df shape:{df.shape}')
    logger.info(f'Special token: {special_tokens} nums: {len(special_tokens)}')

    # build corpus using nlp pipeline
    logger.info('****Start to build vocab****')
    if args.select_context_name:
        logger.info(
            f'Start to build context vocab for {args.select_context_name}!')
        # text prepare
        context_data = combine_data(df, args.select_context_name).tolist()
        context_corpus = corpus_process(context_data, en_nlp)

        # build lda model vocab: flatten each document's sentences
        documents = [[w for sent in doc for w in sent]
                     for doc in context_corpus]
        context_dict = Dictionary(documents)
        context_dict.filter_extremes(1, 1, args.max_context_vocab)

        # add special token
        context_vocab = list(
            set(context_dict.token2id).difference(set(special_tokens)))
        if args.spec_first:
            context_vocab = special_tokens + context_vocab
        else:
            context_vocab += special_tokens
        # NOTE(review): token2id is overwritten directly; presumably the
        # dictionary's other id-based tables no longer match -- confirm this
        # is acceptable for the consumers of args.lda_vocab_file.
        context_dict.token2id.update(
            {w: w_id for w_id, w in enumerate(context_vocab)})
        logger.info(f'context vocab num : {len(context_vocab)}')
        logger.info(f'top 15 context vocab : {context_vocab[:15]}')

        save_vocab_file(context_vocab, data_path, args.context_vocab_file)
        context_dict.save(args.lda_vocab_file)

        logger.info('Build pretrain embed model..')
        # build embed model
        logger.info('Create item_vocab pretrain embed!')
        create_embeds(context_dict.token2id, kv_model, data_path, args)

    if args.select_item_name:
        logger.info(
            f'Start to build string vocab for {args.select_item_name}!')
        # data prepare
        item_data = df[args.select_item_name].values.tolist()
        item_corpus = list(map(text_clean, item_data))  # clean text

        # build title vocab: lower-cased alphabetic tokens only
        item_corpus = [[
            w.lower() for w in doc.split()
            if (w not in string.punctuation) and w.isalpha()
        ] for doc in item_corpus]
        item_vocab, item_token2id = build_vocab(item_corpus, special_tokens,
                                                args.max_item_size,
                                                args.min_item_freq,
                                                args.spec_first)
        logger.info(f'item vocab size : {len(item_vocab)}')
        logger.info(f'top 15 item vocab : {item_vocab[:15]}')

        # save item vocab
        save_vocab_file(item_vocab, data_path, args.str_vocab_file)
        logger.info('Start to create item_vocab embed')
        # create item embed vector
        create_embeds(item_token2id, kv_model, data_path, args)
for word in sentence: for c in word: char_set.append(c) with open('char_set.pkl', 'wb') as f: pickle.dump(set(char_set), f) else: with open('char_set.pkl', 'rb') as f: char_set = pickle.load(f) # In[329]: if 1 == 0: vocabulary = Dictionary(x_tokenized) vocabulary.save('voca') else: vocabulary = Dictionary.load('voca') word_to_idx = {vocabulary[idx]: idx for idx in range(len(vocabulary))} idx_to_word = {idx: vocabulary[idx] for idx in range(len(vocabulary))} char_to_idx = {c: idx for idx, c in enumerate(char_set)} idx_to_char = {idx: c for idx, c in enumerate(char_set)} def hf(word): return word_to_idx[word] # In[83]:
documents = [ "Human machine interface for lab abc computer applications", "A survey of user opinion of computer system response time", "The EPS user interface management system", "System and human system engineering testing of EPS", "Relation of user perceived response time to error measurement", "The generation of random binary unordered trees", "The intersection graph of paths in trees", "Graph minors IV Widths of trees and well quasi ordering", "Graph minors A survey" ] # gensim texts = [[word for word in document.lower().split()] for document in documents] dictionary = Dictionary(texts) dictionary.save('/tmp/deerwester.dict') print(dictionary) print(dictionary.token2id) new_doc = "Human computer interaction" new_vec = dictionary.doc2bow(new_doc.lower().split()) print(new_vec) corpus = [dictionary.doc2bow(text) for text in texts] print(corpus) MmCorpus.serialize('/tmp/corpus.mm', corpus) corpus = MmCorpus('/tmp/corpus.mm') print(corpus) print(list(corpus)) # sklearn vec = CountVectorizer(min_df=1, stop_words=None,
cleanDatafile = arguments['--in'] dictfile = arguments['--out-dict'] ldafile = arguments['--out-model'] ldavectorfile = arguments['--out-topics'] print('Loading data...') rawfile = open(cleanDatafile, 'rb').read() encodeInfo = chardet.detect(rawfile[:50000]) sentences = SentenceIterator(cleanDatafile, encoding=encodeInfo['encoding'], row2record=lambda row, index: row[1].split()) # Create a corpus from a list of texts print('Creating dictionary...') cases_dict = Dictionary(sentences) cases_dict.save(dictfile) # Train the model on the corpus. print('Training LDA topic model...') cases_corpus = SentenceIterator(cleanDatafile, encoding=encodeInfo['encoding'], row2record=lambda row, index: cases_dict.doc2bow(row[1].split())) lda = LdaModel(cases_corpus, num_topics=N_TOPICS) lda.save(ldafile) print('Building LDA topics...') idx_documents = SentenceIterator(cleanDatafile, encoding=encodeInfo['encoding'], row2record=lambda row, index: (row[0], row[1])) with open(ldavectorfile, 'w') as csvfile: writer = csv.writer(csvfile, quoting=csv.QUOTE_NONNUMERIC)
def train_model(corpus_path, dic_conf, lda_conf): logging.info('Loading corpus from file {}'.format(corpus_path)) corpus = FastTextCorpus(corpus_path, bufsize=20000000, length=5926250) # corpus = LineSentence(corpus_path, 10000000) print '-' * 80 if lda_conf["build_dict"]: logging.info("Building dictionary ...") dic = Dictionary(corpus) dic.filter_extremes(no_below=dic_conf["min_tf"], no_above=dic_conf["max_df"], keep_n=dic_conf["vocab_size"]) dic.compactify() logging.info("Saving dictionary ...") dic.save(dic_conf["dic"]) else: logging.info("Loading dictionary ..") dic = Dictionary.load(dic_conf["dic"]) bow = IntCorpus(corpus, dic) l = len(bow) print l tfMod = TfidfModel.load(lda_conf["tfmod"]) #save corpus to disk for later usage # logging.info("Saving corpus to disk ...") # MmCorpus.serialize("data/corpus.mm", bow) # bow = MmCorpus("data/large_corpus.mm") print '-' * 80 if lda_conf["new"]: logging.info("Training new lda model") logging.info("Loading defined keywords ...") keywords = {} topics = [] with codecs.open(lda_conf["kw_file"], "r", "utf-8") as f: for l in f: sp = l.strip().split(':') topic = int(sp[0]) topics.append(sp[1]) kws = sp[2].split(',') for kw in kws: if kw not in keywords: keywords[kw] = set([topic]) else: keywords[kw].add(topic) #keywords[kw.lower()] = topic logging.info("Number of defined keywords: {}".format(len(keywords))) if lda_conf["threads"] <= 1: model = LdaModelNew(corpus=bow, id2word=dic, iterations=lda_conf["iterations"], num_topics=lda_conf["num_topics"], passes=lda_conf["passes"], chunksize=lda_conf["chunksize"], defined_kws=keywords, alpha='auto', eval_every=lda_conf["eval_every"]) else: logging.info("Training model using mutlicore lda version") model = LdaMulticoreNew(corpus=bow, id2word=dic, workers=lda_conf["threads"], iterations=lda_conf["iterations"], num_topics=lda_conf["num_topics"], passes=lda_conf["passes"], defined_kws=keywords, alpha='symmetric', chunksize=lda_conf["chunksize"], eval_every=lda_conf["eval_every"], 
tfMod=tfMod, topic_names=topics) else: logging.info("Training ldamodel implemented in gensim") model = LdaModelOld(corpus=bow, id2word=dic, iterations=lda_conf["iterations"], num_topics=lda_conf["num_topics"], passes=lda_conf["passes"], chunksize=lda_conf["chunksize"], alpha='auto', eval_every=lda_conf["eval_every"]) logging.info('Saving lda model to {}'.format(lda_conf["model_path"])) model.save(lda_conf["model_path"]) logging.info('Saving model done!')
logging.info('no calculated files found, recomputing...') logging.info('loading files...') with open(data_dark_file, 'r') as f1, open(data_clean_file, 'r') as f2: logging.info('loading dark text...') dark_text = [line.split() for line in f1.readlines()] logging.info('loading clean text...') clean_text = [line.split() for line in f2.readlines()] logging.info('load file done') if os.path.exists(dict_file): dictionary = Dictionary.load(dict_file) else: logging.info('creating the dictionary...') dictionary = Dictionary(dark_text) dictionary.add_documents(clean_text) dictionary.save(dict_file) dictionary = filter_dict(args.vocab_size, dictionary, get_keep_tokens(dictionary)) logging.info('dictionary created') logging.info('building neighbor unigrams...') if os.path.exists(file_unigram_dark) and os.path.exists( file_unigram_dark_all): unigram_dark = np.load(file_unigram_dark) unigram_dark_all = np.load(file_unigram_dark_all) else: unigram_dark, unigram_dark_all = get_neighbor_unigram( dictionary, dark_text, args.num_neighbors) np.save(file_unigram_dark, unigram_dark)
class my_LDA(object):
    """Three-step LDA pipeline: raw texts -> token lists -> BoW -> model.

    Call ``text_to_low``, then ``low_to_bow``, then ``run_lda``.
    """

    def __init__(self):
        self.low_corpus = list()  # tokenised documents ("list of words")
        self.bow_corpus = list()  # bag-of-words form of low_corpus
        self.dictionary = None    # gensim Dictionary, set by low_to_bow

    @classmethod
    def tokenize_text(cls, text):
        """Normalise, tokenise, filter and lemmatise one text.

        Returns:
            List of lemmatised, lower-cased alphabetic tokens with English
            stopwords removed.
        """
        # NOTE(review): as written, the last replace swaps a space for a
        # space (a no-op) -- confirm what it was meant to replace.
        text = unicodedata.normalize("NFKD", text).replace("\n", " ").replace(
            "\t", "").replace(" ", " ")
        # tokenize text as English
        text_tokenized = word_tokenize(text, language='English')
        # convert to lowercase and remove punctuation
        text_tokenized = [
            word.lower() for word in text_tokenized if word.isalpha()
        ]
        # Look the stopwords up once per text instead of once per token.
        stop_words = set(stopwords.words('english'))
        # lemmatize token
        text_tokenized = [
            wordlemmatizer.lemmatize(word) for word in text_tokenized
            if word not in stop_words
        ]
        return text_tokenized

    def callback(self, text_tokenized):
        """Keep a tokenised document only if it has more than 10 tokens."""
        if len(text_tokenized) > 10:
            self.low_corpus.append(text_tokenized)

    def text_to_low(self, texts):
        """
        convert texts to list of words, concurrency-enabled
        args:
            text: list of string
        """
        logging.info('running text_to_low')
        pool = multiprocessing.Pool(os.cpu_count())
        for text in texts:
            pool.apply_async(self.tokenize_text,
                             args=(text, ),
                             callback=self.callback)
        pool.close()
        pool.join()

    def low_to_bow(self):
        """
        list of words to bag of words

        Raises:
            ValueError: if ``text_to_low`` has not produced any document.
        """
        logging.info('running low_to_bow')
        if not self.low_corpus:
            raise ValueError("Run text_to_low First")
        self.dictionary = Dictionary(self.low_corpus)
        self.dictionary.filter_extremes(no_below=15, no_above=0.9)
        self.bow_corpus = [
            self.dictionary.doc2bow(doc) for doc in self.low_corpus
        ]

    def run_lda(self):
        """Train a 10-topic LDA model and save model, dictionary and corpus.

        Everything is written below ``data/LDA/my_LDA/``.

        Raises:
            ValueError: if ``low_to_bow`` has not been run.
        """
        logging.info('running run_lda')
        if not self.bow_corpus or self.dictionary is None:
            raise ValueError("Run low_to_bow First")
        lda_model = models.LdaMulticore(self.bow_corpus,
                                        alpha=0.001,
                                        num_topics=10,
                                        id2word=self.dictionary,
                                        workers=os.cpu_count())
        # Make sure the target directory exists so the freshly trained
        # model is not lost to a missing-directory error.
        os.makedirs("data/LDA/my_LDA", exist_ok=True)
        lda_model.save("data/LDA/my_LDA/topic_model.model")
        self.dictionary.save("data/LDA/my_LDA/dictionary.dict")
        corpora.MmCorpus.serialize("data/LDA/my_LDA/corpus.mm",
                                   self.bow_corpus)
class text_corpus(object):
    """Streaming corpus over a TSV file with one ``<id>\\t<text>`` pair per line.

    Lines that do not split into exactly two tab-separated fields are
    skipped. Limited reads (``n_examples`` other than -1) consume the shared
    handle ``self.fin`` and therefore continue where the previous limited
    read stopped; call ``reset_docs`` to start from the top again.
    """

    def __init__(self, tsv_path, n_examples=100000):
        """Open the TSV file, count its lines and set up empty phrasers."""
        print("Getting %s iterator..." % tsv_path)
        self.n_examples = n_examples
        self.document_path = tsv_path
        self.fin = open(self.document_path, 'rb')
        # Count lines through a context manager so the handle is closed.
        with open(tsv_path) as counter:
            self.instances = sum(1 for line in counter)
        self.bigram = Phraser(Phrases())
        self.trigram = Phraser(Phrases())

    def __iter__(self):
        """Yield a ``TaggedDocument`` for up to ``self.n_examples`` documents."""
        for i, doc in self.indexed_docs(self.n_examples):
            yield TaggedDocument(self.process(doc), [i])

    def process(self, text):
        """Tokenize ``text`` and apply the bigram, then the trigram phraser."""
        return self.trigram[self.bigram[tokenize(text)]]

    def docs(self, n_examples=None):
        """Yield processed documents; ``None`` means ``self.n_examples``."""
        if n_examples is None:
            n_examples = self.n_examples
        for _, doc in self.indexed_docs(n_examples):
            yield self.process(doc)

    def reset_docs(self):
        """Reopen the shared handle so limited reads restart at the first line."""
        self.fin.close()
        self.fin = open(self.document_path, 'rb')

    @staticmethod
    def _parse_line(line):
        """Return ``(id, text)`` for a two-field line, or ``None`` if malformed."""
        fields = line.decode('utf-8', errors='replace').strip().split('\t')
        if len(fields) != 2:
            return None
        return fields[0], fields[1]

    def indexed_docs(self, n_examples=-1):
        """Yield ``(id, text)`` pairs from the TSV file.

        Args:
            n_examples: ``-1`` reads the whole file through a fresh handle;
                any other value yields at most that many pairs from the
                shared handle ``self.fin``.
        """
        if n_examples == -1:
            with open(self.document_path, 'rb') as fin:
                for line in fin:
                    record = self._parse_line(line)
                    if record is not None:
                        yield record
            return

        current_example = 0
        for line in self.fin:
            if current_example >= n_examples:
                # A plain return ends the generator; raising StopIteration
                # here turns into RuntimeError under PEP 479.
                return
            record = self._parse_line(line)
            if record is None:
                continue
            current_example += 1
            yield record

    def get_phraser(self, directory, sensitivity=3):
        """Train (or load from ``directory``) the bigram and trigram phrasers.

        Missing phrasers are trained on the full corpus and saved as
        ``bigrams.pkl`` / ``trigrams.pkl``; existing files are loaded.
        """
        if not os.path.isdir(directory):
            os.makedirs(directory)

        print("\t\tGetting bigram detector...")
        if not os.path.isfile(directory + '/bigrams.pkl'):
            self.bigram = Phraser(
                Phrases(self.docs(n_examples=-1),
                        min_count=2,
                        threshold=sensitivity,
                        max_vocab_size=2000000))
            self.bigram.save(directory + '/bigrams.pkl')
        else:
            self.bigram = Phraser.load(directory + '/bigrams.pkl')

        print("\t\tGetting trigram detector...")
        if not os.path.isfile(directory + '/trigrams.pkl'):
            self.trigram = Phraser(
                Phrases(self.bigram[self.docs(n_examples=-1)],
                        min_count=2,
                        threshold=sensitivity + 1,
                        max_vocab_size=2000000))
            self.trigram.save(directory + '/trigrams.pkl')
        else:
            self.trigram = Phraser.load(directory + '/trigrams.pkl')

    def load_phraser(self, directory):
        """Load both phrasers from ``directory``."""
        print("\tLoading gram detector...")
        self.bigram = Phraser.load(directory + '/bigrams.pkl')
        self.trigram = Phraser.load(directory + '/trigrams.pkl')

    def get_dictionary(self, directory, keep=100000):
        """Build, filter and save the dictionary, or load it if already saved.

        A new dictionary is built from the full corpus, reduced with
        ``filter_extremes(no_below=3, no_above=0.5, keep_n=keep)`` and
        written to ``dictionary.dict`` and ``word_list.tsv`` in ``directory``.
        """
        if not os.path.isdir(directory):
            os.makedirs(directory)
        if not os.path.isfile(directory + '/dictionary.dict'):
            print("\tBuilding dictionary...")
            self.dictionary = Dictionary(self.docs(n_examples=-1),
                                         prune_at=2000000)
            print("\tFiltering dictionary extremes...")
            self.dictionary.filter_extremes(no_below=3,
                                            no_above=0.5,
                                            keep_n=keep)
            print("\tSaving dictionary...")
            self.dictionary.save(directory + '/dictionary.dict')
            self.dictionary.save_as_text(directory + '/word_list.tsv')
        else:
            self.load_dictionary(directory)

    def get_word_ids(self):
        """Return ``{index: word}`` over the distinct words of the full corpus."""
        word_list = set()
        for doc in self.docs(n_examples=-1):
            word_list.update(doc)
        return dict(enumerate(word_list))

    def load_dictionary(self, directory):
        """Load ``dictionary.dict`` from ``directory`` into ``self.dictionary``."""
        print("\tLoading dictionary...")
        self.dictionary = Dictionary.load(directory + '/dictionary.dict')
class Similarities(object):
    """
    Builds a gensim Dictionary and an LSI similarity index from documents
    stored in MongoDB, and finds similar documents across collections.
    """

    def __init__(self, mongo_conn_rec, stopwords=None):
        """
        :param mongo_conn_rec: connection record passed to MongoClientContext
        :param stopwords: optional iterable of words dropped when tokenizing
        """
        self._stopwords = set(stopwords) if stopwords is not None else set()
        self._mongo_connection_record = mongo_conn_rec
        self._lsi_mapping = dict()
        self._sim_index = None
        self._dictionary = None
        self._lsimodel = None
        self._run_transformers()

    @staticmethod
    def logger():
        """
        Logger used by this class.
        :return: Returns the logging.Logger named 'openews.language'.
        """
        return logging.getLogger('openews.language')

    @property
    def considerable_doc_property(self):
        """
        The document property to use for training. This is the actual data we take from the MongoDB documents
        to parse and train.
        :return: str
        """
        return 'title'

    @property
    def dictionary_file(self):
        """
        The filename to use when serializing gensim.corpora.dictionary.Dictionary to disk.
        :return: str
        """
        return "openews.processors.dict"

    @property
    def dictionary(self):
        """
        The used Dictionary.
        :return: gensim.corpora.dictionary.Dictionary
        """
        return self._dictionary

    @property
    def lsi_model(self):
        """
        The used LSI model.
        :return: gensim.models.lsimodel.LsiModel
        """
        return self._lsimodel

    @property
    def similarity_index(self):
        """
        The similarity index instance
        :return: gensim.similarities.docsim.MatrixSimilarity
        """
        return self._sim_index

    @property
    def similarity_threshold(self):
        """
        The similarity threshold, read from the server configuration. Anything above or equal to this value
        will be considered a similar document.
        :return: float
        """
        return server_app.config['SIMILARITY_THRESHOLD']

    @property
    def lsi_index_mapping(self):
        """
        A mapping between the LSI model index (key) and a tuple of
        (collection the document is in, document).
        :return: dict
        """
        return self._lsi_mapping

    @staticmethod
    def _create_resource_path(resource_file):
        """
        Creates an absolute path to resource_file inside the system's temp directory.
        :param resource_file: str
        :return: str
        """
        return os.path.join(tempfile.gettempdir(), resource_file)

    def _resource_exists(self, resource_file):
        """
        Checks if resource_file exists in the system's temp directory.
        :param resource_file: str
        :return: bool
        """
        return os.path.isfile(self._create_resource_path(resource_file))

    def _run_transformers(self):
        """
        Runs all the transformer methods, providing the MongoDB client context instance.
        """
        with MongoClientContext(self._mongo_connection_record) as client:
            self._create_dictionary(client)
            self._create_lsi_similarity_index(client)

    def _create_dictionary(self, mongo_client):
        """
        Creates the gensim Dictionary (gensim.corpora.dictionary.Dictionary) or loads it if it already exists
        and sets the object's dictionary property.
        :param mongo_client: server.db.MongoClientContext
        """
        from gensim.corpora.dictionary import Dictionary

        dictionary_path = self._create_resource_path(self.dictionary_file)
        if self._resource_exists(self.dictionary_file):
            self.logger().debug("Dictionary file found, loading it [%s]", dictionary_path)
            self._dictionary = Dictionary.load(dictionary_path)
            return

        self.logger().debug("Dictionary file not found, creating a new Dictionary file")
        self._dictionary = Dictionary()
        documents = [
            self.tokenize_sentence(doc[self.considerable_doc_property])
            for collection in mongo_client.scrappers_collections()
            for doc in collection.find()
        ]
        self.logger().debug("Adding %d documents to dictionary (will skip existing ones)", len(documents))
        self._dictionary.add_documents(documents)
        self._dictionary.save(dictionary_path)

    def _create_lsi_similarity_index(self, mongo_client):
        """
        Creates a Similarity index based on LSI model from the available dictionary.
        Sets the object's lsi_model and similarity_index object properties.
        :param mongo_client: server.db.MongoClientContext
        """
        from gensim.models import LsiModel
        from gensim.similarities import MatrixSimilarity

        self._lsi_mapping.clear()
        bow_corpus = []
        pairs = ((c, di) for c in mongo_client.scrappers_collections() for di in c.find())
        for idx, tp in enumerate(pairs):
            # Position in the corpus doubles as the document's LSI index.
            self._lsi_mapping[idx] = tp
            bow_corpus.append(self.sentence_to_bow(tp[1][self.considerable_doc_property]))
        self._lsimodel = LsiModel(bow_corpus, id2word=self.dictionary)
        self._sim_index = MatrixSimilarity(self._lsimodel[bow_corpus])

    def calculate_similarities(self):
        """
        Find / calculate similarities between documents in the index.
        Returns a defaultdict with the key as the LSI index and the value is a list of tuples with the following values
        (LSI model Index, similarity value - numpy.float32).
        An empty defaultdict is returned when the index mapping is empty.
        The matches found are also printed to stdout.
        :return: defaultdict(list)
        """
        similarities = defaultdict(list)
        if not self.lsi_index_mapping:
            # Honour the documented return type instead of returning None.
            return similarities

        for idx, tp in sorted(self.lsi_index_mapping.items(), key=itemgetter(0)):
            sentence = tp[1][self.considerable_doc_property]
            bow = self.sentence_to_bow(sentence)
            latent_space_vector = self.lsi_model[bow]
            sim_vector = self.similarity_index[latent_space_vector]
            sorted_mapped_vector = sorted(enumerate(sim_vector), key=itemgetter(1))
            for sit in sorted_mapped_vector:
                # Skip the document itself, weak matches, and documents from the same collection.
                if sit[0] == idx or sit[1] < self.similarity_threshold:
                    continue
                if tp[0].name == self.lsi_index_mapping[sit[0]][0].name:
                    continue
                # A document that already has its own entry is not listed again under another one.
                if sit[0] not in similarities:
                    similarities[idx].append(sit)

        for main_idx, matches in similarities.items():
            main_sentence = self.lsi_index_mapping[main_idx][1][self.considerable_doc_property]
            print("[%s] %s:" % (self.lsi_index_mapping[main_idx][0].name, main_sentence))
            for sm in matches:
                print("\t[%f][%s]: %s" % (sm[1], self._lsi_mapping[sm[0]][0].name,
                                          self.lsi_index_mapping[sm[0]][1][self.considerable_doc_property]))
        return similarities

    def store_similarities(self, update=False):
        """
        Stores the similarities to the database (not implemented yet: only opens the client context).
        :param update: True to update existing, False to delete and add new items
        """
        with MongoClientContext(self._mongo_connection_record) as client:
            pass

    def tokenize_sentence(self, sentence):
        """
        Tokenize a sentence: split it with word_tokenize, lower-case every token and drop
        stopwords and single punctuation characters.
        :param sentence: str
        :return: a list
        """
        excluded = set(chain(self._stopwords, string.punctuation))
        lowered = (w.lower() for w in word_tokenize(sentence))
        return [w for w in lowered if w not in excluded]

    def sentence_to_bow(self, sentence):
        """
        Transforms a string sentence to a VSM bag-of-words representation.
        :param sentence: str
        :return: list of tuples
        """
        return self.dictionary.doc2bow(self.tokenize_sentence(sentence))
def train_LDA(base_path, table_paths, batch_size, limit, use_dictionary=False, **kwargs):
    """Train an LDA model batch by batch and save it under ``LDA_CACHE``.

    Pass 1 loads or builds the dictionary; pass 2 creates the model from the
    first batch of ``corpus_iter`` and updates it with every later batch.

    Args:
        base_path: Forwarded to ``corpus_iter``.
        table_paths: Forwarded to ``corpus_iter``; also shown in the summary.
        batch_size: Forwarded to ``corpus_iter``; also shown in the summary.
        limit: Forwarded to ``corpus_iter``.
        use_dictionary: When ``True`` (or the string ``'True'``, which was the
            only accepted spelling before), load the cached dictionary
            instead of rebuilding it.
        **kwargs: Must contain ``tn`` (number of topics), ``thr`` and ``num``
            (both only reported in the summary).  All of them are forwarded
            to ``dic2name`` and ``corpus_iter``.

    Raises:
        KeyError: If ``tn``, ``thr`` or ``num`` is missing from ``kwargs``.
        StopIteration: If ``corpus_iter`` yields no batch at all.
    """
    model_name = dic2name(kwargs)
    print("Model: ", model_name)
    topic_num = kwargs['tn']
    dictionary_path = join(LDA_CACHE, 'dictionary_{}'.format(model_name))

    # Pass 1 get the dictionary
    # The default is the boolean False, so the boolean True has to work too.
    if use_dictionary is True or use_dictionary == 'True':
        dic = Dictionary.load(dictionary_path)
    else:
        dic = Dictionary([])
        batches = corpus_iter(base_path, table_paths, batch_size, limit, **kwargs)
        for b, corpus in enumerate(batches):
            dic.add_documents(corpus)
            print('Dictionary batch {}: current dic size {}'.format(b, len(dic)))
        # save dictionary
        dic.save(dictionary_path)
    print("Dictionary size", len(dic))

    # Pass 2 train LDA
    whole_corpus = corpus_iter(base_path, table_paths, batch_size, limit, **kwargs)
    first_batch = next(whole_corpus)
    first_bow = [dic.doc2bow(text, allow_update=False) for text in first_batch]
    lda = LdaModel(first_bow, id2word=dic, num_topics=topic_num, minimum_probability=0.0)
    batch_no = 0
    print('LDA update batch {}'.format(batch_no))
    for batch in whole_corpus:
        batch_bow = [dic.doc2bow(text, allow_update=False) for text in batch]
        lda.update(batch_bow)
        batch_no += 1
        print('LDA update batch {}'.format(batch_no))

    # Save model to disk.
    temp_file = join(LDA_CACHE, "model_{}".format(model_name))
    lda.save(temp_file)

    # Adjacent literals instead of a backslash continuation inside the string,
    # which leaked the source indentation into the printed message.
    print(
        "Training from {} done. Batch_size: {}, long str tokenization threshold: {}, "
        "numerical representations: {}.\nTotal size of dictionary: {}".format(
            table_paths, batch_size, kwargs['thr'], kwargs['num'], len(dic)))
class GensimBOW(BaseEstimator, TransformerMixin):
    """
    Custom sklearn transformer to convert tokenized, preprocessed data to
    bag-of-words representation.
    """

    def __init__(self, id2word_path=None, use_sparse_representation=False):
        """
        Parameters
        ----------
        id2word_path : str
            Path to location of gensim id2word dict.
            If specified, the model will load and use this object
            as its id2word dict.
        use_sparse_representation: Boolean (default=False)
            When True, a sparse representation of the array is returned.
            Use this when feeding into a gensim model.
            When False, the full array is returned.
            Use this if feeding into sklearn estimator.
        """
        self.id2word = None
        self.use_sparse_representation = use_sparse_representation
        if id2word_path:
            self._load(id2word_path=id2word_path)

    def _load(self, id2word_path):
        """
        Loads the gensim id2word dict from ``id2word_path``.

        Parameters
        ----------
        id2word_path: str
            File-path the saved dictionary is read from.

        Raises
        ------
        IOError
            If ``id2word_path`` does not exist.
        """
        from gensim.corpora.dictionary import Dictionary
        if not os.path.exists(id2word_path):
            raise IOError(
                'The provided file path to id2word_path was not found. '
                'Please ensure that the argument is the correct path.')
        # ``load`` is a classmethod; no throwaway instance is needed.
        self.id2word = Dictionary.load(id2word_path)

    def save(self, id2word_path):
        """
        Saves self.id2word to id2word_path.

        Parameters
        ----------
        id2word_path: str
            File-path designating where self.id2word should be saved.

        Raises
        ------
        AttributeError
            If the transformer has not been fit or loaded yet.
        """
        # Compare with None: a fitted but empty Dictionary is falsy.
        if self.id2word is None:
            raise AttributeError('Nothing to save yet, please run .fit first.')
        self.id2word.save(id2word_path)

    def fit(self, documents, labels=None):
        """
        Creates map between words and their integer ids,
        storing it as `self.id2word`.

        Parameters
        ----------
        documents: iterable
            List of documents; each document a list of preprocessed tokens.
        labels:
            Optional list of same size as documents,
            specifying label for each document. Unused.

        Returns
        -------
        self, so that ``fit(...).transform(...)`` and
        ``TransformerMixin.fit_transform`` work.
        """
        from gensim.corpora.dictionary import Dictionary
        self.id2word = Dictionary(documents)
        return self

    def transform(self, documents):
        """
        Converts a collection of words to its bag-of-words representation.

        Parameters
        ----------
        documents: iterable
            List of documents. Each document must be a list of tokens.

        Returns
        -------
        list: vectorized representation of each document; sparse
            (id, count) pairs when `use_sparse_representation` is True,
            full arrays otherwise.

        Raises
        ------
        AttributeError
            If the transformer has not been fit or loaded yet.
        """
        from gensim.matutils import sparse2full
        if self.id2word is None:
            raise AttributeError('Must have a fit id2word in order'
                                 ' to call transform.')

        vectors = []
        for document in documents:
            docbow = self.id2word.doc2bow(document)
            if self.use_sparse_representation:
                vectors.append(docbow)
            else:
                vectors.append(sparse2full(docbow, len(self.id2word)))
        return vectors
def train_lda(path):
    """Fit a three-topic LDA model on five built-in sentences and persist it.

    The sentences are cleaned with ``rm_special_chars``, lower-cased, split
    on whitespace and stripped of stop words.  The model is saved to
    ``datapath(path + 'lda_model')`` and reloaded from there; the dictionary
    is saved to ``path + 'dict'``.

    Args:
        path: String prefix for both output locations.

    Returns:
        Tuple ``(lda, dictionary)`` with the reloaded model and the
        dictionary it was trained with.
    """
    raw_sentences = [
        "Amazon sells many things ",
        "Apple is releasing a new product ",
        "Microsoft announces Nokia acquisition ",
        'Julie loves me more than Linda loves me ',
        'Jane likes me more than Julie loves me',
    ]
    cleaned_sentences = [rm_special_chars(s) for s in raw_sentences]

    stop_words = frozenset([
        'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there',
        'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they',
        'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into',
        'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who',
        'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below',
        'are', 'we', 'these', 'your', 'his', 'through', "don't", 'nor', 'me',
        'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our',
        'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she',
        'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and',
        'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then',
        'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not',
        'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too',
        'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom',
        'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it',
        'how', 'further', 'was', 'here', 'than',
    ])

    texts = []
    for sentence in cleaned_sentences:
        kept = [token for token in sentence.lower().split()
                if token not in stop_words]
        texts.append(kept)

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(tokens) for tokens in texts]

    lda = LdaModel(corpus=corpus,
                   id2word=dictionary,
                   num_topics=3,
                   update_every=1,
                   random_state=100,
                   chunksize=100,
                   passes=20,
                   alpha='auto')

    # Round-trip through disk so the caller receives the persisted model.
    model_file = datapath(path + 'lda_model')
    lda.save(model_file)
    lda = LdaModel.load(model_file)

    dictionary.save(path + 'dict')
    return lda, dictionary
def create_vocab(tweets):
    """Build a gensim ``Dictionary`` from ``tweets`` and save it.

    Args:
        tweets: Iterable of tokenized documents handed to
            ``Dictionary.add_documents``.

    Returns:
        The populated ``Dictionary``; it is also written to the file
        ``vocab_sentiment`` in the current working directory.
    """
    print("Building vocabulary...")
    vocabulary = Dictionary()
    vocabulary.add_documents(tweets)
    vocabulary.save('vocab_sentiment')
    return vocabulary
class GensimTfidf(BaseEstimator, TransformerMixin):
    """
    Custom sklearn transformer to convert tokenized,
    preprocessed data to tf-idf representation.
    """

    def __init__(self, tfidf_path=None, dictionary_path=None,
                 use_sparse_representation=False):
        """
        Instantiate GensimTfidf object. If loading previously fit Dictionary and
        TfidfModel, you must specify a path to both the Dictionary and the TfidfModel.

        Parameters
        ----------
        tfidf_path : str
            Path to location of saved gensim TfidfModel.
            If specified, the model will load and use this object
            as its TfidfModel.
        dictionary_path : str
            Path to location of saved gensim Dictionary.
            If specified, the model will load and use this object
            as its Dictionary.
        use_sparse_representation: Boolean (default=False)
            When True, a sparse representation of the array is returned.
            Use this when feeding into a gensim model.
            When False, the full array is returned.
            Use this if feeding into sklearn estimator.

        Raises
        ------
        AttributeError
            If only one of the two paths is given.
        """
        self.use_sparse_representation = use_sparse_representation
        self.dictionary = None
        self.tfidf = None

        # if both paths specified, load object
        if tfidf_path and dictionary_path:
            self._load(tfidf_path=tfidf_path, dictionary_path=dictionary_path)
        elif tfidf_path or dictionary_path:
            raise AttributeError(
                'If loading pre-fit Dictionary and TfidfModel,'
                ' both must be specified, not just one.')

    def _load(self, tfidf_path, dictionary_path):
        """
        Loads the gensim TfidfModel from `tfidf_path` and
        the gensim Dictionary from `dictionary_path`.

        Parameters
        ----------
        tfidf_path: str
            File-path the saved TfidfModel is read from.
        dictionary_path: str
            File-path the saved Dictionary is read from.

        Raises
        ------
        IOError
            If either path does not exist.
        """
        from gensim.models import TfidfModel
        from gensim.corpora.dictionary import Dictionary
        if not os.path.exists(tfidf_path):
            raise IOError(
                'The provided file path to the TfidfModel was not found. '
                'Please ensure that the argument is the correct path.')
        if not os.path.exists(dictionary_path):
            raise IOError(
                'The provided file path to the Dictionary was not found. '
                'Please ensure that the argument is the correct path.')
        # ``load`` is a classmethod; no throwaway instances are needed.
        self.tfidf = TfidfModel.load(tfidf_path)
        self.dictionary = Dictionary.load(dictionary_path)

    def save(self, tfidf_path, dictionary_path):
        """
        Saves objects from fit process: gensim.TfidfModel to `tfidf_path`
        and gensim.Dictionary to `dictionary_path`.

        Parameters
        ----------
        tfidf_path: str
            File-path designating where self.tfidf should be saved.
        dictionary_path: str
            File-path designating where self.dictionary should be saved.

        Raises
        ------
        AttributeError
            If either self.tfidf or self.dictionary does not exist yet.
        """
        # Compare with None: a fitted but empty Dictionary is falsy.
        if self.tfidf is None or self.dictionary is None:
            raise AttributeError('Nothing to save yet, please run .fit first.')
        self.tfidf.save(tfidf_path)
        self.dictionary.save(dictionary_path)

    def fit(self, documents, labels=None):
        """
        Fits a gensim TfidfModel to documents.

        Parameters
        ----------
        documents: iterable
            List of documents. Each document must be a list of preprocessed tokens.
        labels: iterable
            Optional list of same size as documents,
            specifying label for each document. Unused.

        Returns
        -------
        self
        """
        from gensim.models import TfidfModel
        from gensim.corpora.dictionary import Dictionary
        # The documents are walked twice (dictionary, then bag-of-words), so
        # a one-shot iterable such as a generator must be materialized first.
        documents = list(documents)
        self.dictionary = Dictionary(documents)
        self.tfidf = TfidfModel(
            [self.dictionary.doc2bow(doc) for doc in documents],
            id2word=self.dictionary)
        return self

    def transform(self, documents):
        """
        Returns a vectorized embedding of each document in documents.

        Parameters
        ----------
        documents: iterable
            List of documents. Each document must be a list of tokens.

        Returns
        -------
        list: vectorized documents; sparse (id, weight) pairs when
            `use_sparse_representation` is True, full arrays otherwise.

        Raises
        ------
        AttributeError
            If the transformer has not been fit or loaded yet.
        """
        from gensim.matutils import sparse2full
        if self.dictionary is None:
            raise AttributeError('Must have a fit vocab in order'
                                 ' to call transform.')

        vectors = []
        for document in documents:
            vec = self.tfidf[self.dictionary.doc2bow(document)]
            if self.use_sparse_representation:
                vectors.append(vec)
            else:
                vectors.append(sparse2full(vec, len(self.dictionary)))
        return vectors
# path = 'C:\\Users\\okigboo\\Desktop\\PythonDataScience\\tweeter\\' os.chdir(path) data = pd.read_csv('nyt.csv') text_clean = [] for text in data['News_content']: text_clean.append(pptext(text).split()) print(text_clean[:3]) dictionary = Dictionary(text_clean) corpus = [dictionary.doc2bow(text) for text in text_clean] pickle.dump(corpus, open('topicModels//corpus2.pkl', 'wb')) dictionary.save('topicModels//dictionary2.gensim') ldamodel = LdaModel(corpus, num_topics=15, id2word=dictionary, passes=15) ldamodel.save('topicModels//model15.gensim') topics = ldamodel.print_topics(num_words=4) for topic in topics: print(topic) # Visaulization works only on Jupyter Notebook # type jupyter notebook or jupyter console dictionary = Dictionary.load('topicModels//dictionary2.gensim') corpus = pickle.load(open('topicModels//corpus2.pkl', 'rb')) ldamd = LdaModel.load('topicModels//model15.gensim') lda_display = pyLDAvis.gensim.prepare(ldamd, corpus,
def saveGensim(self, topic):
    """Write a gensim dictionary and Matrix Market corpus for ``topic``.

    ``None`` regenerates the four built-in topics ('movie', 'celebrity',
    'syria', 'ufo').  'movie' is an alias of 'movie_reviews' and 'celebrity'
    an alias of 'bieber'.  Output goes to
    ``<this file's directory>/james_data/<topic>/``; existing output files
    are removed first.
    """
    if topic is None:
        # generate all
        for builtinTopic in ('movie', 'celebrity', 'syria', 'ufo'):
            self.saveGensim(builtinTopic)
        return

    # Resolve the public aliases to the names used on disk.
    if topic == 'movie':
        topic = 'movie_reviews'
    elif topic == 'celebrity':
        topic = 'bieber'

    if topic == 'movie_reviews':
        count = 100
        posDocs = self.movieReviews('positive', count)
        negDocs = self.movieReviews('negative', count)
    else:
        posDocs = self.getArticlesHelper('positive', topic)
        negDocs = self.getArticlesHelper('negative', topic)

    listOfTokens = []  # dictionary input
    docs = []  # corpus input
    for docGroup in (posDocs, negDocs):
        for rawDoc in docGroup:
            processed = self.processDocForGensim(rawDoc)
            listOfTokens.append(self.tokensFromText(processed))
            docs.append(processed)

    topicDir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'james_data', topic)
    dictionaryPath = os.path.join(topicDir, 'gensim_dictionary.txt')
    corpusPath = os.path.join(topicDir, 'gensim_corpus.mm')
    corpusTempPath = corpusPath + '.tmp'

    # Clear leftovers from a previous run.
    for stalePath in (dictionaryPath, corpusPath, corpusTempPath):
        if os.path.exists(stalePath):
            os.remove(stalePath)

    # make destination files if they don't exist
    for placeholderPath in (dictionaryPath, corpusPath):
        with open(placeholderPath, 'w') as f:
            f.write(' ')

    # save dictionary and corpus
    Dictionary(listOfTokens).save(dictionaryPath)
    with open(corpusTempPath, 'w') as f:
        f.write('\n'.join(docs))
    MmCorpus.save_corpus(corpusPath, TextCorpus(corpusTempPath))
    return
] test_texts = [ text_to_word_sequence(data['text']) for data in tqdm(imdb_dataset(test=True)) ] test_labels = [ sentiment[data['sentiment']] for data in imdb_dataset(test=True) ] # test = imdb_dataset(test=True) all_texts = np.concatenate((train_texts, test_texts)).tolist() vocabulary = Dictionary(documents=all_texts) vocabulary.save('imdb_vocabulary') train_x = np.asarray([ np.asarray(vocabulary.doc2idx(doc), dtype=np.int32) + 1 for doc in tqdm(train_texts) ]) train_y = np.asarray(train_labels, dtype=np.int32) test_x = np.asarray([ np.asarray(vocabulary.doc2idx(doc), dtype=np.int32) + 1 for doc in tqdm(test_texts) ]) test_y = np.asarray(test_labels, dtype=np.int32) np.save('train_x.npy', train_x) np.save('train_y', train_y)
) else: langlinks_fname = sys.argv[1] in_dict_fnames = sys.argv[2:-1] num_langs = len(in_dict_fnames) out_dict_fname = sys.argv[-1] with open(langlinks_fname) as langlinks_file: langlinks = csv.reader(langlinks_file) lang_names = next(langlinks) # Read header row out_dict = Dictionary() id_offset = 0 for in_dict_fname, lang_name in zip(in_dict_fnames, lang_names): in_dict = Dictionary.load(in_dict_fname) for token, old_id in in_dict.token2id.items(): df = in_dict.dfs[old_id] new_id = old_id + id_offset new_token = '{}#{}'.format(lang_name, token) out_dict.token2id[new_token] = new_id out_dict.dfs[new_id] = df out_dict.num_docs += in_dict.num_docs out_dict.num_pos += in_dict.num_pos out_dict.num_nnz += in_dict.num_nnz id_offset += len(in_dict) out_dict.save(out_dict_fname)
# Build per-abstract lemma lists, map them to dictionary ids, and persist
# both the resulting corpus (pickle) and the gensim dictionary.
lt = LoopTimer(update_after=10, avg_length=1000, target=target)
for abstract_id, row in infoDF.iterrows():
    # Each abstract's annotations are read from "<abstract_id>.spacy".
    doc = Doc(vocab).from_disk(
        os.path.join(path_to_annotations, f"{abstract_id}.spacy"))
    doc = replace_cluster_in_doc(doc, replace_dic, sorted_mentions, nlp)
    # The same document is tokenized twice: split by sentence, and flat.
    lemma_s_list.append(doc_2_token(doc, split_sentences=True))
    lemma_d_list.append(doc_2_token(doc, split_sentences=False))
    abstract_id_list.append(abstract_id)
    # NOTE(review): `breaker` is not read within this span - confirm it is
    # used further down before removing it.
    breaker = lt.update(f"Create Pandas - {len(lemma_d_list)}")

# The dictionary is built from the flat (document-level) token lists only.
dictionary = Dictionary(lemma_d_list)
id_d_list = [dictionary.doc2idx(document) for document in lemma_d_list]
id_s_list = [[dictionary.doc2idx(sentence) for sentence in document]
             for document in lemma_s_list]

# Parallel lists: position i in every list refers to the same abstract.
corpus = {
    "abstract_id": abstract_id_list,
    "lemma_sentence": lemma_s_list,
    "lemma_document": lemma_d_list,
    "lemma_id_sentence": id_s_list,
    "lemma_id_document": id_d_list
}
with open(os.path.join(path_to_pandas, corpus_file_name), "wb") as handle:
    pickle.dump(corpus, handle)
dictionary.save(os.path.join(path_to_pandas, dictionary_file_name))
def saveWords(words, wordfile):
    """Build a gensim ``Dictionary`` from ``words`` and save it to ``wordfile``.

    Args:
        words: Iterable of tokenized documents passed to ``Dictionary``.
        wordfile: Destination path handed to ``Dictionary.save``.
    """
    from gensim.corpora.dictionary import Dictionary

    # Named ``vocabulary`` rather than ``dict`` so the builtin is not shadowed.
    vocabulary = Dictionary(words)
    vocabulary.save(wordfile)
# NOTE(review): the next statement is the tail of a definition that starts
# above this chunk; it is kept exactly as found.
db.hset('idlookup', index, postid)


class RedisCorpus(object):
    """Iterable corpus that yields one vector per post id."""

    def __init__(self, postids):
        """Store the post ids to iterate over and remember how many there are."""
        self.postids = postids
        self.numPosts = len(self.postids)

    def __iter__(self):
        """Yield ``corpusOfPost(postid, force=True)`` for every stored post id.

        Before each yield the running position and the post id are passed to
        ``addCorpusMap``; progress is printed every 100 posts.
        """
        count = 0
        for postid in self.postids:
            if count % 100 == 0:
                print "Wrote %d out of %d to corpus: %s" % (count, self.numPosts, time.strftime("%H:%M:%S"))
            # Called before the yield, so the call happens even if the
            # consumer stops right after receiving this post.
            addCorpusMap(count, postid)
            count += 1
            yield corpusOfPost(postid, force=True)


def buildCorpus():
    """
    Returns a corpus object that contains sparse vectors from every post.
    """
    postids = getPostids()
    corpus = RedisCorpus(postids)
    return corpus


if __name__ == "__main__":
    # Rebuild the dictionary, save it, then stream the corpus to disk in
    # Blei's LDA-C format.
    buildDictionary(force=True)
    globalDict.save(dictName)
    corpus = buildCorpus()
    BleiCorpus.serialize('redditcorpus.lda-c', corpus)
class CableCorpus(BaseCorpus):
    """\
    The cable corpus consists of several files which are written into a directory.

    * a dictionary with a ``<word id> <word> <frequency>`` mapping saved under
      "wordids.pickle"
    * a JSON file with a ``<cable reference id> <document number>`` mapping
      under "id2docid.json"
    * a `Matrix Market format <http://math.nist.gov/MatrixMarket/formats.html>`
      vector space model file "bow.mm"

    All file names carry the configured prefix ("cables_" by default).

    CAUTION: The corpus overrides any existing files with the same file name
    in the specified directory.

    By default, the corpus creates the word dictionary and the vector space
    model, which may lead to a cluttered vector space model. To filter certain
    words, the corpus may be initialized with a pre-generated word dictionary.
    To make the dictionary immutable, the property ``allow_dict_updates``
    should be set to ``False`` (updates are allowed by default). The resulting
    vector space model then contains only words which are in the word
    dictionary.

    Example to reduce the clutter::

        corpus = CableCorpus('/my/directory/')
        # Add some texts here
        corpus.add_text('ref-1', u'bla bla bla')
        corpus.add_text('ref-2', u'bla bla blub')
        ...
        corpus.dct.filter_extremes()
        corpus.close()

        from gensim.corpora.dictionary import Dictionary
        # Load the previously created dict (written by ``close()``)
        dct = Dictionary.load('/my/directory/cables_wordids.pickle')
        # Create another corpus with the previously created word dict
        corpus = CableCorpus('/my/directory/', dct, allow_dict_updates=False)
        # Add some texts
        ....
        corpus.close()
    """

    def __init__(self, path, dct=None, tokenizer=None, allow_dict_updates=True, prefix=None):
        """\
        Initializes the cable corpus.

        `path`
            Directory where the generated files are stored.
        `dct`
            An existing `gensim.corpora.dictionary.Dictionary`.
            If it's ``None`` (default) a dictionary will be created.
        `tokenizer`
            A function to tokenize/normalize/clean-up/remove stop words from strings.
            If it's ``None`` (default), a default function will be used to tokenize
            texts.
        `allow_dict_updates`
            Indicates if unknown words should be added to the dictionary (default ``True``).
        `prefix`
            A prefix for the generated file names.

        Raises ``IOError`` if `path` is not an existing directory.
        """
        super(CableCorpus, self).__init__(tokenizer)
        if not os.path.isdir(path):
            raise IOError('Expected a directory path')
        self.dct = Dictionary() if dct is None else dct
        self._path = path
        self._prefix = prefix or 'cables_'
        self._mw = IncrementalMmWriter(os.path.join(path, self._prefix + 'bow.mm'))
        self.allow_dict_updates = allow_dict_updates
        self._cables = []

    def add_words(self, reference_id, words):
        """\
        Adds the bag-of-words vector of `words` to the vector space model and
        remembers `reference_id` as the identifier of that document.
        """
        self._cables.append(reference_id)
        self._mw.add_vector(self.dct.doc2bow(words, self.allow_dict_updates))

    def close(self):
        """\
        Closes the matrix writer and writes the dictionary and the
        reference id -> document number mapping into the corpus directory.
        """
        self._mw.close()
        self.dct.save(os.path.join(self._path, self._prefix + 'wordids.pickle'))
        json_filename = os.path.join(self._path, self._prefix + 'id2docid.json')
        # Document numbers follow insertion order, starting at 0.
        id2docid = dict((ref_id, doc_no) for doc_no, ref_id in enumerate(self._cables))
        # Text mode and a context manager: ``json.dump`` writes ``str``, so a
        # binary handle fails on Python 3, and the handle must be closed to
        # guarantee the mapping is flushed to disk.
        with open(json_filename, 'w') as json_file:
            json.dump(id2docid, json_file)