def update_model(subreddit, text_array):
    debug = False
    # Path to the dictionary
    path_to_dict = r'../models/wikidump_wordids/wikidump_wordids.txt'
    # Path to the folder full of models pertaining to certain subreddits
    path_to_models_folder = '../models/' + subreddit + '/'
    # Path to where we're gonna save the model
    path_to_model = path_to_models_folder + 'model.model'
    path_to_load = path_to_models_folder + 'model.model'

    # Load model
    model = gensim.models.LdaModel.load(path_to_load)

    # Loading in dictionary
    dct = Dictionary()
    dct.add_documents([text_array])
    id2word = dct

    # Cleaning up the text
    common_corpus = [clean_text(text) for text in text_array]

    # Debug print
    if debug:
        print(common_corpus)

    # Training the new model
    model.update(common_corpus)

    # Save the new model
    model.save(path_to_model)
def extract_dictionary(document_paths, max_words):
    """
    Extracts a gensim Dictionary object from a set of documents.

    Parameters
    ----------
    document_paths : [str]
        List of document paths that make up the corpus.
    max_words : int
        Maximum number of words to keep in the dictionary.

    Returns
    -------
    dictionary : gensim.corpora.Dictionary
        Extracted dictionary (or tokenizer).
    """
    print("Extracting dictionary from corpus")
    dictionary = Dictionary(prune_at=None)
    preprocessor = TextPreprocessor()
    for document_path in tqdm(document_paths):
        with open(document_path, "r") as f:
            document = f.read()
        document = preprocessor.clean_sentence(document, alphabetic_only=True)
        words = preprocessor.tokenize_text(document)
        dictionary.add_documents([words])
        if len(dictionary) > max_words:
            start = time()
            dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=int(max_words * 0.9))
            print("Dictionary filtered in {} seconds".format(time() - start))
    return dictionary
class MyCorpus(object):
    def __init__(self, input_file, K):
        self.K = K
        self.input_file = input_file
        self.dictionary = Dictionary()
        with open(input_file, "rt") as f:
            for line in f:
                self.dictionary.add_documents([line.split()])
        self.dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=K)

    def __iter__(self):
        with open(self.input_file, "rt") as f:
            for line in f:
                yield self.dictionary.doc2bow(line.rstrip().split())

    def __str__(self):
        s = "MyCorpus(" + str(self.dictionary.num_docs) + " documents, "
        s += str(len(self.dictionary.keys())) + " features, "
        s += str(self.dictionary.num_nnz) + " non-zero entries)"
        return s

    def __repr__(self):
        return "MyCorpus('" + self.input_file + "', " + str(self.K) + ")"
class CorpusMUC(corpora.TextCorpus):
    def __init__(self):
        super(CorpusMUC, self).__init__()
        self.stopwords = NLTKStopwords.words('english')
        self.stopwords.extend(['``', ',', '(', ')', '.'])
        self.msgs = MUCmessages()
        self.dictionary = Dictionary()
        self.dictionary.add_documents(self.get_texts())

    def get_texts(self):
        """
        Parse documents from the .cor file provided in the constructor.
        Lowercase each document and ignore some stopwords.

        .cor format: one document per line, words separated by whitespace.
        """
        for doc in self.msgs:
            document = [
                word
                for word in [
                    word_tokenize(sentence)
                    for sentence in wordpunct_tokenize(doc[1]['content'].lower())
                ]
            ]
            yield [
                str(word[0])
                for word in document
                if str(word[0]) not in self.stopwords
            ]

    def __len__(self):
        """Define this so we can use `len(corpus)`."""
        if 'length' not in self.__dict__:
            logger.info("caching corpus size (calculating number of documents)")
            self.length = sum(1 for doc in self.get_texts())
        return self.length
def __init__(self, movie_tags):
    """
    movie_tags: dict {item_id => tags}
    """
    self.item_vectors = {}
    self.id_to_idx = {}
    self.idx_to_id = []

    dictionary = Dictionary()
    for item, tags in movie_tags.items():
        dictionary.add_documents([tags])
    for item, tags in movie_tags.items():
        self.item_vectors[item] = dictionary.doc2bow(tags)

    data = []
    row_ind = []
    col_ind = []
    i = 0
    for item, tags in self.item_vectors.items():
        for (col, count) in tags:
            data.append(count)
            row_ind.append(i)
            col_ind.append(col)
        self.id_to_idx[item] = i
        self.idx_to_id.append(item)
        i += 1
    self.item_vectors = csr_matrix((data, (row_ind, col_ind)),
                                   shape=(i, dictionary.num_pos))
def create_dictionary(doc_dict):
    # create gensim dictionary by using python dictionary as input
    dic = Dictionary()
    for doc in doc_dict:
        dic.add_documents([[word for (word, _) in doc_dict[doc]]])
    dic.save(dic_path)
    return dic
def _generate_vocabulary(self):
    vocab = Dictionary()
    session = DBSession()

    i = 0
    for question in session.query(Question).yield_per(self.yield_per):
        i += 1
        if i % self.print_per == 0:
            logger.info('Processed %d / %d questions :: %d unique tokens'
                        % (i, self.n_questions, vocab.num_docs))
        strings = [question.title, question.content] if question.content is not None else [question.title]
        vocab.add_documents([CorpusDictionary.tokenize(s) for s in strings])

    i = 0
    for answer in session.query(Answer).yield_per(self.yield_per):
        i += 1
        if i % self.print_per == 0:
            logger.info('Processed %d / %d answers :: %d unique tokens'
                        % (i, self.n_answers, vocab.num_docs))
        vocab.add_documents([CorpusDictionary.tokenize(answer.content)])

    # commit and close the session
    session.commit()
    session.close()

    return vocab
def dic_creation(ids):
    for phase in ['train', 'val']:
        print(phase)
        with open('data/preprocessed_{}.pickle'.format(phase), 'rb') as preprocessed:
            preprocessed_dict = pickle.load(preprocessed)

        article_txts = []
        for id_ in tqdm(ids[phase]):
            article_txts.append(preprocessed_dict[id_]['article_text'])
        # article_txts = list(map(lambda id_: preprocessed_dict[id_]['article_text'], ids[phase]))
        print("loaded text")

        article_tokens = []
        for txt in tqdm(article_txts):
            article_tokens.append(word_tokenize(txt))
        # article_tokens = list(map(lambda txt: word_tokenize(txt), article_txts))
        print("tokenized")

        if phase == 'train':
            dct = Dictionary(article_tokens)
        else:
            with open('data/dict.pickle', 'rb') as handle:
                dct = pickle.load(handle)
            dct.add_documents(article_tokens)
        print("dict processed")

        # save dict
        with open('data/dict.pickle', 'wb') as handle:
            pickle.dump(dct, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print("saved")
def create_dic():
    txt_in = get_in()
    txt_out = get_out()
    dic = Dictionary(txt_in + txt_out + [[EXTRA]])
    dic.add_documents([[START, STOP, EXTRA]])
    dic.save(DIC_NAME)
    return dic
def trainModel():
    """ Train a model """
    if args.mode == 'Random':
        return args.topics, 0

    # need to train on dump
    files = [
        f"{args.input}/{f}" for f in os.listdir(args.input)
        if os.path.isfile(os.path.join(args.input, f))
    ]

    if args.mode == 'LDA':
        # create dictionary
        with open(files[0], "r", encoding='utf-8') as f:
            dct = Dictionary([' '.join(f.readlines()).split()])
        for filename in files[1:]:
            with open(filename, "r", encoding='utf-8') as f:
                dct.add_documents([' '.join(f.readlines()).split()])

        # create corpus
        corpus = []
        for filename in files:
            with open(filename, "r", encoding='utf-8') as f:
                corpus.append(dct.doc2bow(' '.join(f.readlines()).split()))

        lda = LdaModel(corpus, num_topics=args.topics)
        lda.save("./models/LDAdump.model")
        dct.save("./models/LDAdump.dct")
        return lda, dct

    if args.mode == 'loadLDA':
        return LdaModel.load("./models/LDAdump.model"), Dictionary.load("./models/LDAdump.dct")
def preprocess(segments, dct=None, bigram=None):
    processed_segments = []
    for seg in segments:
        processed_seg = []
        for word in seg:
            if True in [word.is_space, word.is_stop, word.is_punct]:
                continue
            word = word.lemma_
            word = word.lower()
            processed_seg.append(word)
        processed_segments.append(processed_seg)

    if bigram is None:
        phrases = Phrases(processed_segments, min_count=3, threshold=3)
        bigram = Phraser(phrases)
    processed_segments = bigram[processed_segments]

    if dct is None:
        dct = Dictionary(processed_segments)
    else:
        dct.add_documents(processed_segments)

    return [dct.doc2bow(line) for line in processed_segments], dct, processed_segments, bigram
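# Minimal usage sketch (not part of the original source): it assumes the segments
# are spaCy-tokenized documents and shows how the dct/bigram returned from the
# training call are passed back in so a validation set is encoded with the same
# vocabulary. Requires spaCy and its en_core_web_sm model to be installed.
import spacy

nlp = spacy.load("en_core_web_sm")
train_segments = [nlp(t) for t in ["cats chase small mice", "dogs chase cats"]]
val_segments = [nlp(t) for t in ["mice fear cats"]]

train_bow, dct, train_tokens, bigram = preprocess(train_segments)
val_bow, _, _, _ = preprocess(val_segments, dct=dct, bigram=bigram)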
def build_tag_vectors(tag_directory_path):
    """Loads tag files, builds sparse vectors for each song

    Parameters
    ----------
    tag_directory_path : String, path of directory containing tags

    Returns
    -------
    id_vec_mapping : dict (song id => list[tuple(tagId, count)])
    dictionary : gensim Dictionary containing all tags and ids
    """
    dictionary = Dictionary()
    for f in listdir(tag_directory_path):
        with open(tag_directory_path + "/" + f, 'r') as tags:
            tokens = tags.read().split(sep=' ')
            dictionary.add_documents([tokens])
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    dictionary.compactify()

    id_vec_mapping = {}
    for f in listdir(tag_directory_path):
        song_id = f[0:-4]
        with open(tag_directory_path + "/" + f, 'r') as tags:
            tokens = tags.read().split(sep=' ')
            sparse_vec = dictionary.doc2bow(tokens)
            add_to_dictionary(id_vec_mapping, (song_id, sparse_vec))
    return id_vec_mapping, dictionary
class KeywordDict():
    # Load dictionary_<name>.bin if it exists; otherwise build a new one and save it.
    def __init__(self, name="default", phraser=None):
        self.name = "dictionary_" + name + ".bin"
        self.phraser = phraser
        if self.name in os.listdir(dir_dictionary):
            with open(dir_dictionary + self.name, 'rb') as dic:
                self.get_dict = pickle.load(dic)
            print("keyword dictionary loaded")
        else:
            print("dictionary not exists")
            print("start building...")
            self.build_dictionary()
            self.save()

    # Build the dictionary from the cleaned news files.
    def build_dictionary(self):
        self.get_dict = Dictionary()
        tickers = [
            i for i in os.listdir(dir_cleaned_news) if i.endswith(".csv")
        ]
        for ticker in tickers:
            df = pd.read_csv(dir_cleaned_news + ticker, index_col=0)
            self.get_dict.add_documents(tokenizer(df['content'], self.phraser))
            print(ticker + " added")
        print("done")

    # Save the dictionary.
    def save(self):
        with open(dir_dictionary + self.name, "wb") as dic:
            pickle.dump(self.get_dict, dic)
def preprocess(documents, stem=False, vocab_size=10000, oov_token="<OOV>", oov_id=-1):
    """Preprocess documents.

    Args:
        documents: An array of strings, each string representing a document.
        stem: (bool) Whether to use a stemmer. Defaults to False.

    Returns:
        (gensim Dictionary, tokenized documents)
    """
    porter_stemmer = PorterStemmer()

    def process_document(doc):
        tokens = word_tokenize(doc)
        tokens = [token.lower() for token in tokens if token.isalpha()]
        if stem:
            tokens = [porter_stemmer.stem(token) for token in tokens]
        return tokens

    tokenized_docs = list(map(process_document, documents))
    dictionary = Dictionary(tokenized_docs)
    dictionary.filter_extremes(no_below=5, no_above=0.8, keep_n=vocab_size)
    # Add OOV token to dictionary
    dictionary.add_documents([[oov_token]])
    return dictionary, tokenized_docs
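# Minimal usage sketch (toy inputs, not from the original module): after
# preprocessing, unseen tokens can be mapped onto the "<OOV>" entry that was
# appended to the dictionary, e.g. via Dictionary.doc2idx.
dictionary, tokenized_docs = preprocess(["The cat sat on the mat.", "Dogs bark loudly."])
oov_id = dictionary.token2id["<OOV>"]
ids = dictionary.doc2idx(["cat", "unseen_word"], unknown_word_index=oov_id)
print(ids)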
def testFilterTokens(self):
    self.maxDiff = 10000
    d = Dictionary(self.texts)

    removed_word = d[0]
    d.filter_tokens([0])

    expected = {
        'computer': 0,
        'eps': 8,
        'graph': 10,
        'human': 1,
        'interface': 2,
        'minors': 11,
        'response': 3,
        'survey': 4,
        'system': 5,
        'time': 6,
        'trees': 9,
        'user': 7
    }
    del expected[removed_word]
    self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))

    expected[removed_word] = len(expected)
    d.add_documents([[removed_word]])
    self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))
def create_dictionary(dataset, texts):
    dictionary = Dictionary([])
    for text in texts:
        dictionary.add_documents([text])
    dictionary.save_as_text('../dataset_files/dictionary.txt')
    return dictionary
def _bow(table, input_col, add_words=None, no_below=1, no_above=0.8, keep_n=10000):
    word_list = table[input_col].tolist()
    dictionary = Dictionary(word_list)
    if add_words is not None:
        dictionary.add_documents([add_words])
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n, keep_tokens=None)

    params = {
        'Input Column': input_col,
        'Minimum Number of Occurrence': no_below,
        'Maximum Fraction of Occurrence': no_above,
        'Keep N most Frequent': keep_n
    }

    empty_description = ''
    if len(list(dictionary.dfs.values())) == 0:
        out_table = pd.DataFrame([], columns=['token', 'document_frequency'])
        empty_description = 'Out table is empty since parameter "Minimum Number of Occurrence" is greater than the maximum of document frequency.'
    else:
        out_table = pd.DataFrame.from_dict(dictionary.token2id, orient='index').drop([0], axis=1)
        out_table.insert(loc=0, column='token', value=dictionary.token2id.keys())
        token_cnt = sorted(dictionary.dfs.items(), key=operator.itemgetter(0))
        dfs_list = []
        for i in range(len(dictionary.dfs)):
            dfs_list.append(token_cnt[i][1])
        out_table['document_frequency'] = dfs_list

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    |# Bag of Words Result
    |### Parameters
    |
    | {display_params}
    |
    | {description}
    |""".format(display_params=dict2MD(params), description=empty_description)))

    model = _model_dict('bow')
    model['dict_table'] = out_table
    model['dictionary'] = dictionary
    model['add_words'] = add_words
    model['_repr_brtc_'] = rb.get()

    return {'model': model, 'out_table': out_table}
def load_dictionary(self, filepath):
    dictionary = Dictionary()
    with open(filepath, "rb") as f:
        for line in f.readlines():
            # example = SampleTrainingExample(line)
            # context = example.context
            dictionary.add_documents([[word.lower() for word in line.split()]])
    return dictionary
def get_coherence(config, topicvec, docwords, glove_vectors):
    """Calculate UMass and w2v (GloVe) coherence scores."""
    # Clean docwords
    docs = []
    for word_list in docwords:
        doc = word_list[0]
        docs.append(doc)

    # Clean topics
    byte_topics = topicvec.printTopWordsInTopics(topicvec.docs_theta, True)
    word_topics = []
    one_word = []
    for topic in byte_topics:
        new_topic = []
        for word in topic:
            if type(word) != str:
                word = word.decode()
            new_topic.append(word)
            one = [word]
            one_word.append(one)
        word_topics.append(new_topic)

    # Get dictionary
    vocab_dict = Dictionary(docs[1:])
    # Make sure words in topics are in the dictionary
    vocab_dict.add_documents(one_word)

    # Get corpus
    corpus = [vocab_dict.doc2bow(doc) for doc in docs]

    # Calculate UMass coherence score
    # The closer to 0, the more coherent
    cm = CoherenceModel(topics=word_topics, corpus=corpus, dictionary=vocab_dict,
                        coherence='u_mass')
    umass_coherence = cm.get_coherence()

    # Calculate GloVe coherence score
    # Ranges between 0 and 1
    # The closer to 1, the better
    cm = CoherenceModel(topics=word_topics, corpus=corpus, dictionary=vocab_dict,
                        coherence='c_w2v', keyed_vectors=glove_vectors)
    glove_coherence = cm.get_coherence()

    # Log coherence score and other metrics
    results_dict = OrderedDict([('num_topics', config['K']),
                                ('alpha0', config['alpha0']),
                                ('alpha1', config['alpha1']),
                                ('delta', config['iniDelta']),
                                ('umass', umass_coherence),
                                ('glove', glove_coherence)])
    return results_dict
def build_vocabulary_and_corpus():
    '''
    Build the vocabularies and stem sequences for each type of entities.
    '''
    # Vocabulary (same for questions and answers)
    v = Dictionary()
    # Stemmer.
    stemmer = PorterStemmer()
    # Tokenizer.
    tokenizer = TweetTokenizer()
    # Read indexes
    user_index, question_index, answer_index, comment_index = read_indexes()
    # Question, answer
    q = {}
    a = {}
    # Read entities.
    with open(entity_path, 'rb') as obj:
        entities = pickle.load(obj)
    # Browse questions and answers to first build the vocabulary.
    for e in entities:
        # Question or answer.
        if e['type'] == 'Q' or e['type'] == 'A':
            # String content.
            title = str(e['title']).encode('utf-8').lower()
            content = str(e['content']).encode('utf-8').lower()
            # Tokenize
            d = tokenizer.tokenize(title + content)
            # Stem words
            d = [stemmer.stem(s) for s in d]
            # Process vocabulary.
            v.add_documents([d])
            # Question
            if e['type'] == 'Q':
                q[question_index[e['id']]] = d
            # Answer
            if e['type'] == 'A':
                a[answer_index[e['id']]] = d
    # Write question corpus.
    with open(os.path.join(data_path, 'q.corpus'), 'wb') as f:
        pickle.dump(q, f)
    # Write answer corpus.
    with open(os.path.join(data_path, 'a.corpus'), 'wb') as f:
        pickle.dump(a, f)
    # Write to analyse.
    v.filter_extremes(no_below=1000, keep_n=10000)
    v.compactify()
    v.save(os.path.join(data_path, "raw_vocabulary.gensim"))
class LoadCorpora(Component, TextCorpus):
    """
    Load corpus: input is an array with a list of json files
    """

    def __init__(self, input_files=None):
        """Redefine the gensim's TextCorpus init method"""
        super().__init__()
        self.input = input_files
        self.dictionary = Dictionary(prune_at=5000000)
        self.metadata = False
        if input_files is not None:
            self.dictionary.add_documents(self.get_texts(), prune_at=5000000)
        else:
            self.logger.warning(
                "No input document stream provided; assuming "
                "dictionary will be initialized some other way.")

    def get_texts(self):
        """
        Iterate through documents: yield each token of each document
        """
        if not isinstance(self.input, list):
            raise ConfigError('Input argument is not a List')
        for filename in self.input:  # each file
            with open(filename, 'r') as stream:
                for line in stream:  # each line
                    doc = json.loads(line)
                    yield doc['content'].split()  # split on each word

    def __iter__(self):
        """
        Iterate through documents: yield the bow representation of each document
        """
        if not isinstance(self.input, list):
            raise ConfigError('Input argument is not a List')
        for filename in self.input:  # each file
            with open(filename, 'r') as stream:
                for line in stream:  # each line
                    doc = json.loads(line)
                    yield self.dictionary.doc2bow(doc['content'].split())

    def save(self):
        """Override abstract method"""
        return

    def save_corpus(self, fname, corpus, id2word=None, metadata=False):
        """Override abstract method"""
        return
def create_dict(self, corpus_file):
    dictionary = Dictionary()
    with open(corpus_file, "rb") as infile:
        lines = infile.readlines()  # read all lines from the file
        for line in lines:
            doc = line.split()  # doc as bag of words (bow) of tokens in this line
            dictionary.add_documents([doc])
    return dictionary
def test_save_load():
    dct = Dictionary()
    docs = ['一种 大头菜 自然风', '风 主要 包括 大头菜 风', '架 主要 包括 底座 支柱']
    docs_token_list = get_token_lists_of_docs(docs)
    dct.add_documents(docs_token_list)
    corpus = [dct.doc2bow(['大头菜', '风', '底座'])]
    print('corpus to save is {}'.format(corpus))

    save_path = 'resources/corpus/test_corpus.mm'
    save2disk(save_path, corpus)
    load_corpus = load_from_disk(save_path)
    print('load corpus is {}'.format(load_corpus))
def get_dictionary(documents: Dict[int, List[str]]) -> Dictionary:
    if os.path.exists(DICTIONARY_FILE_NAME):
        print(f"loading dictionary from {DICTIONARY_FILE_NAME}")
        gensim_dict = Dictionary.load(DICTIONARY_FILE_NAME)
    else:
        print("creating dictionary")
        gensim_dict = Dictionary()
        gensim_dict.add_documents(documents.values())
        gensim_dict.compactify()
        print(f"saving dictionary to {DICTIONARY_FILE_NAME}")
        gensim_dict.save(DICTIONARY_FILE_NAME)
    return gensim_dict
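# Hypothetical call (toy data; names taken from the snippet above): documents maps
# ids to token lists; the first run builds and saves the dictionary, later runs
# load it from DICTIONARY_FILE_NAME instead of rebuilding it.
docs = {1: ["solar", "panel", "output"], 2: ["wind", "turbine", "output"]}
gensim_dict = get_dictionary(docs)
print(len(gensim_dict), gensim_dict.token2id)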
def testFilterKeepTokens_keepn(self):
    # keep_tokens should also work if the keep_n parameter is used, but only
    # to keep a maximum of n (so if keep_n < len(keep_tokens) the tokens to keep
    # are still getting removed to reduce the size to keep_n!)
    d = Dictionary(self.texts)
    # Note: there are four tokens with freq 3, all the others have frequency 2
    # in self.texts. In order to make the test result deterministic, we add
    # 2 tokens of frequency one
    d.add_documents([['worda'], ['wordb']])
    # this should keep the four tokens with freq 3 and the one we want to keep
    d.filter_extremes(keep_n=5, no_below=0, no_above=1.0, keep_tokens=['worda'])
    expected = {'graph', 'trees', 'system', 'user', 'worda'}
    self.assertEqual(set(d.token2id.keys()), expected)
class Text2BowTransformer(TransformerMixin, BaseEstimator):
    """
    Base Text2Bow module
    """

    def __init__(self, prune_at=2000000, tokenizer=tokenize):
        """
        Sklearn wrapper for Text2Bow model.
        """
        self.gensim_model = None
        self.prune_at = prune_at
        self.tokenizer = tokenizer

    def fit(self, X, y=None):
        """
        Fit the model according to the given training data.
        """
        tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), X))
        self.gensim_model = Dictionary(documents=tokenized_docs, prune_at=self.prune_at)
        return self

    def transform(self, docs):
        """
        Return the BOW format for the input documents.
        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # input as python lists
        check = lambda x: [x] if isinstance(x, string_types) else x
        docs = check(docs)
        tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), docs))
        X = [[] for _ in range(0, len(tokenized_docs))]
        for k, v in enumerate(tokenized_docs):
            bow_val = self.gensim_model.doc2bow(v)
            X[k] = bow_val
        return X

    def partial_fit(self, X):
        if self.gensim_model is None:
            self.gensim_model = Dictionary(prune_at=self.prune_at)

        tokenized_docs = list(map(lambda x: list(self.tokenizer(x)), X))
        self.gensim_model.add_documents(tokenized_docs)
        return self
class Text2BowTransformer(TransformerMixin, BaseEstimator):
    """
    Base Text2Bow module
    """

    def __init__(self, prune_at=2000000, tokenizer=tokenize):
        """
        Sklearn wrapper for Text2Bow model.
        """
        self.gensim_model = None
        self.prune_at = prune_at
        self.tokenizer = tokenizer

    def fit(self, X, y=None):
        """
        Fit the model according to the given training data.
        """
        tokenized_docs = [list(self.tokenizer(x)) for x in X]
        self.gensim_model = Dictionary(documents=tokenized_docs, prune_at=self.prune_at)
        return self

    def transform(self, docs):
        """
        Return the BOW format for the input documents.
        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # input as python lists
        check = lambda x: [x] if isinstance(x, string_types) else x
        docs = check(docs)
        tokenized_docs = [list(self.tokenizer(x)) for x in docs]
        X = [[] for _ in range(0, len(tokenized_docs))]
        for k, v in enumerate(tokenized_docs):
            bow_val = self.gensim_model.doc2bow(v)
            X[k] = bow_val
        return X

    def partial_fit(self, X):
        if self.gensim_model is None:
            self.gensim_model = Dictionary(prune_at=self.prune_at)

        tokenized_docs = [list(self.tokenizer(x)) for x in X]
        self.gensim_model.add_documents(tokenized_docs)
        return self
def create_comment_dictionary(filter_every_n=1000, max_iter=20, from_db=True, get_data_func=None):
    comment_dict = Dictionary()
    text_gen = data_preprocessor(max_iter=max_iter, from_db=from_db, get_data_func=get_data_func)
    n_iter = 0
    for _, stemmed_text, _ in text_gen:
        comment_dict.add_documents([stemmed_text])
        n_iter += 1
        if n_iter % filter_every_n == 0:
            comment_dict.filter_extremes()
    return comment_dict
def build_vocab():
    start = time.time()
    test_path = os.path.join(config.DATA_PATH, 'test.csv')
    train_path = os.path.join(config.DATA_PATH, 'train.csv')
    normalized_text_path = os.path.join(config.PROCESSED_PATH, 'normalized_comments.txt')
    bigram_path = os.path.join(config.PROCESSED_PATH, 'bigram')
    bigram_comments_path = os.path.join(config.PROCESSED_PATH, 'bigram_comments.txt')

    if config.PROCESSED_PATH not in os.listdir(config.DATA_PATH):
        try:
            os.mkdir(config.PROCESSED_PATH)
        except OSError:
            pass

    vocab = {}
    train_df = read_file(train_path)
    test_df = read_file(test_path)

    print('tokenizing vocab file')
    texts = np.concatenate([train_df.comment_text.fillna('N/A').values,
                            test_df.comment_text.fillna('N/A').values])
    with open(normalized_text_path, 'w') as f:
        processed_text = parallelize_dataframe(texts, tokenizer)
        for line in processed_text:
            f.write(line + '\n')
    gc.collect()

    lines = LineSentence(normalized_text_path)
    bigram = Phrases(lines)
    bigram.save(bigram_path)
    phraser = Phraser(bigram)

    with open(bigram_comments_path, 'w', encoding='utf_8') as f:
        for comment in lines:
            comm = u' '.join(phraser[comment])
            f.write(comm + '\n')

    comments = LineSentence(bigram_comments_path)
    bigram_dict = Dictionary(comments)
    bigram_dict.filter_extremes(no_below=config.THRESHOLD)
    bigram_dict.save_as_text(config.VOCAB_PATH)
    bigram_dict.add_documents([['<pad>']])

    with open(os.path.join(config.ROOT, 'src', 'config.py'), 'a') as f:
        f.write('VOCAB_SIZE = {}'.format(len(bigram_dict)))
    print('time passed: {} minutes'.format((time.time() - start) / 60))
def build_hdp_vec(docs, targets, dct=None, hdp=None):
    docs = [[str(o) for o in one] for one in docs]
    if dct is None:
        # train set
        dct = Dictionary(docs)
        for one in docs:
            dct.add_documents([[str(o) for o in one]])
    corpus = [dct.doc2bow(o) for o in docs]
    if hdp is None:
        # train
        hdp = HdpModel(corpus, dct)
    v = [hdp[o] for o in corpus]
    v_d = matutils.corpus2dense(v, num_terms=len(dct.token2id)).T
    return corpus, v_d, targets, dct, hdp
def get_data_tokenizer(fromdate, todate):
    print('Starting get and save data from mysql-server into local folder....')
    fromdate = fromdate + ' 00:00:00'
    todate = todate + ' 23:59:59'

    connection = my_connection.getConnection()
    cursor = connection.cursor()
    query = ('SELECT id, vntokenizer, catid FROM news WHERE create_time BETWEEN '
             + '\'' + fromdate + '\' AND \'' + todate + '\';')
    print(query)
    cursor.execute(query)
    rows = cursor.fetchall()

    count = 0
    token_dictionary = Dictionary()
    data = dict()
    for row in rows:
        id = row[0]
        tokenizer = row[1]
        catid = row[2]
        if tokenizer is not None:
            tokenizer = tokenizer.lower()
            count += 1
            print(count)
            print(tokenizer)
            token_list = tokenizer.split(' ')
            valid_token_list = list()
            for token in token_list:
                if my_util.check_valid_token(token):
                    valid_token_list.append(token)
            token_dictionary.add_documents([valid_token_list])
            if catid == my_catid:
                data[id] = valid_token_list
    my_connection.closeConnection(connection)

    # save dictionary and data into text file
    token_dictionary.save_as_text('..' + parameter.FILE_DICTIONARY)
    fb = open('..' + parameter.FILE_DATA, 'wb')
    pickle.dump(data, fb)
    fb.close()
    print('Done get and save data from mysql-server!')
class MyCorpus(object):
    def __init__(self, filename, max_vocab_size=2000000):
        self.filename = filename
        self.max_vocab_size = max_vocab_size
        self.dictionary = Dictionary()
        self._build_dict()

    def _build_dict(self):
        with open(self.filename, "rt") as f:
            for line in f:
                doc = line.rstrip().split()
                self.dictionary.add_documents([doc])

    def __iter__(self):
        with open(self.filename, "rt") as f:
            for sentence in f:
                yield sentence.rstrip().split()
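# Usage sketch (assumes "corpus.txt" holds one whitespace-tokenized document per
# line): the corpus streams token lists, and its dictionary turns them into
# bag-of-words vectors without loading the whole file into memory.
corpus = MyCorpus("corpus.txt")
bows = [corpus.dictionary.doc2bow(tokens) for tokens in corpus]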
def main(path, epochs):
    with open(path) as f:
        ds = csv.DictReader(f)
        tweets = (d['tweet_text'] for d in ds)
        tweets = list(tweets)

    dct = Dictionary()
    with Pool() as pool:
        dct.add_documents(pool.map(preprocess, tweets))
    dct.filter_extremes(no_below=10, no_above=0.5)

    model = SimpleEmbedder(dct, dims=100)
    model.load_docs(tweets[-100000:])
    for i in range(int(epochs)):
        loss = model._epoch()
        print(f'Epoch {i} loss: {loss[1]}')
    np.save('embeddings', model.embeddings)
def compile_vocab(docs, limit=1e6, verbose=0, tokenizer=Tokenizer(stem=None, lower=None, strip=None)):
    """Get the set of words used anywhere in a sequence of documents and assign an integer id

    This vectorizer is much faster than the scikit-learn version (and only requires low/constant RAM ?).

    >>> gen = ('label: ' + chr(ord('A') + i % 3)*3 for i in range(11))
    >>> d = compile_vocab(gen, verbose=0)
    >>> d
    <gensim.corpora.dictionary.Dictionary ...>
    >>> print(d)
    Dictionary(4 unique tokens: [u'AAA', u'BBB', u'CCC', u'label'])
    >>> sorted(d.token2id.values())
    [0, 1, 2, 3]
    >>> sorted(d.token2id.keys())
    [u'AAA', u'BBB', u'CCC', u'label']
    """
    tokenizer = make_tokenizer(tokenizer)
    d = Dictionary()
    try:
        limit = min(limit, docs.count())
        docs = docs.iterator()
    except (AttributeError, TypeError):
        pass
    for i, doc in enumerate(docs):
        # if isinstance(doc, (tuple, list)) and len(doc) == 2 and isinstance(doc[1], int):
        #     doc, score = docs
        try:
            # in case docs is a values() queryset (dicts of records in a DB table)
            doc = doc.values()
        except AttributeError:
            # doc already is a values_list
            if not isinstance(doc, str):
                doc = ' '.join([str(v) for v in doc])
            else:
                doc = str(doc)
        if i >= limit:
            break
        d.add_documents([list(tokenizer(doc))])
        if verbose and not i % 100:
            log.info('{}: {}'.format(i, repr(d)[:120]))
    return d
def __buildVectors(self, dataset_file):
    lines = 0
    dct = Dictionary()
    tmp_file = TemporaryFile(mode='w+t', encoding='utf-8')
    for doc_idx, (document, lines) in enumerate(self.__buildDocument(dataset_file)):
        dct.add_documents([document])
        tmp_file.write(' '.join(document) + '\n')
        if doc_idx % 500 == 499:
            G.log.debug('%d', doc_idx)
    if dct.num_docs < self.__LeastDocuments:
        # too few tokens or too few documents; clustering is not worthwhile
        tmp_file.close()
        raise UserWarning('Too few records[%d]' % dct.num_docs)

    # drop low-frequency words and compact the dictionary
    num_token = len(dct)
    no_below = int(min(self.__NoBelow, int(dct.num_docs / 50)))
    dct.filter_extremes(no_below=no_below, no_above=0.999, keep_n=self.__KeepN)
    dct.compactify()
    G.log.info(
        'Dictionary[%d tokens, reduced from %d] built with [%s]. '
        '[%d]records(%d lines, %d words) in %s',
        len(dct), num_token, self.__ruleSet[0], dct.num_docs, lines, dct.num_pos, dataset_file)
    if len(dct) < self.__LeastTokens:
        # dictionary has too few tokens; re-sample with the next rule set
        G.log.info('Too few tokens[%d], Re-sample with next RuleSet].' % (len(dct)))
        tmp_file.close()
        return None, None

    # build the tf-idf bag-of-words and document vectors
    tfidf_model = TfidfModel(dictionary=dct, normalize=False)
    vectors = np.zeros((dct.num_docs, len(dct)))
    tmp_file.seek(0)
    for doc_idx, new_line in enumerate(tmp_file):
        # tfidf_model[...] yields [(id, tf-idf), ...] with ids in ascending order
        for (word_idx, tf_idf_value) in tfidf_model[dct.doc2bow(new_line.split())]:
            vectors[doc_idx, word_idx] = tf_idf_value
    G.log.info('[%d*%d]Vectors built, %.2f%% non-zeros.'
               % (dct.num_docs, len(dct), dct.num_nnz * 100 / len(dct) / dct.num_docs))
    tmp_file.close()
    return dct, vectors
class DictionaryLearner(object):
    '''Learn a gensim dictionary from all available documents.'''

    def __init__(self, n=4):
        '''Initialize a DictionaryLearner instance using vocabulary of ngrams of size `n`.'''
        self._ngram = NgramTransformer(n)
        self._dictionary = Dictionary()

    def fit(self, documentstorage, filter_extremes=True):
        '''Fit a dictionary using documents from the given documentstorage.'''
        for document in documentstorage.load_iterator(u''):
            text_document = document.text
            ngrams = self._ngram.transform([text_document])
            self._dictionary.add_documents(ngrams)
        if filter_extremes:
            self._dictionary.filter_extremes()

    def get(self):
        return self._dictionary
class Text2BowTransformer(TransformerMixin, BaseEstimator):
    """
    Base Text2Bow module
    """

    def __init__(self, prune_at=2000000, tokenizer=tokenize):
        """
        Sklearn wrapper for Text2Bow model.
        """
        self.gensim_model = None
        self.prune_at = prune_at
        self.tokenizer = tokenizer

    def fit(self, X, y=None):
        """
        Fit the model according to the given training data.
        """
        tokenized_docs = [list(self.tokenizer(x)) for x in X]
        self.gensim_model = Dictionary(documents=tokenized_docs, prune_at=self.prune_at)
        return self

    def transform(self, docs):
        """
        Return the BOW format for the input documents.
        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # input as python lists
        if isinstance(docs, string_types):
            docs = [docs]
        tokenized_docs = (list(self.tokenizer(doc)) for doc in docs)
        return [self.gensim_model.doc2bow(doc) for doc in tokenized_docs]

    def partial_fit(self, X):
        if self.gensim_model is None:
            self.gensim_model = Dictionary(prune_at=self.prune_at)

        tokenized_docs = [list(self.tokenizer(x)) for x in X]
        self.gensim_model.add_documents(tokenized_docs)
        return self
class SeriesCorpus(TextCorpus):
    def __init__(self, series, vocab=None, stem=False, bigram=None, labels=True):
        """ Create a corpus that returns one row at a time out of a Pandas Series"""
        self.series = series
        self.metadata = False
        if vocab is not None:
            vocab = set(vocab)
        self.vocab = vocab
        self.labels = labels
        self.kwargs = dict(stem=stem, bigram=bigram)
        logging.info("Building SeriesCorpus")
        self.dictionary = Dictionary()
        self.dictionary.add_documents(self.get_texts())

    def __iter__(self):
        if self.labels:
            for index, line in zip(self.series.index, self.series.values):
                label = ['SENT_%s' % str(index)]
                ls = LabeledSentence(line.split(' '), label)
                yield ls
        else:
            for index, line in zip(self.series.index, self.series.values):
                yield line.split(' ')

    def line_iter(self, line):
        if self.vocab is not None:
            for word in line.split(' '):
                if word in self.vocab:
                    yield word
        else:
            for word in line.split(' '):
                yield word

    def get_texts(self):
        logging.info("Iterating SeriesCorpus")
        for lineno, line in enumerate(self.series.values):
            if self.metadata:
                yield self.line_iter(line), (lineno,)
            else:
                yield self.line_iter(line)
class Text2BowTransformer(TransformerMixin, BaseEstimator):
    """Base Text2Bow module, wraps :class:`~gensim.corpora.dictionary.Dictionary`.

    For more information please have a look to `Bag-of-words model
    <https://en.wikipedia.org/wiki/Bag-of-words_model>`_.
    """

    def __init__(self, prune_at=2000000, tokenizer=tokenize):
        """
        Parameters
        ----------
        prune_at : int, optional
            Total number of unique words. Dictionary will keep not more than `prune_at` words.
        tokenizer : callable (str -> list of str), optional
            A callable to split a document into a list of each terms, default is :func:`gensim.utils.tokenize`.
        """
        self.gensim_model = None
        self.prune_at = prune_at
        self.tokenizer = tokenizer

    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : iterable of str
            A collection of documents used for training the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.text2bow.Text2BowTransformer`
            The trained model.
        """
        tokenized_docs = [list(self.tokenizer(x)) for x in X]
        self.gensim_model = Dictionary(documents=tokenized_docs, prune_at=self.prune_at)
        return self

    def transform(self, docs):
        """Get the BOW format for the `docs`.

        Parameters
        ----------
        docs : {iterable of str, str}
            A collection of documents to be transformed.

        Returns
        -------
        iterable of list (int, int) 2-tuples.
            The BOW representation of each document.
        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # input as python lists
        if isinstance(docs, string_types):
            docs = [docs]
        tokenized_docs = (list(self.tokenizer(doc)) for doc in docs)
        return [self.gensim_model.doc2bow(doc) for doc in tokenized_docs]

    def partial_fit(self, X):
        """Train model over a potentially incomplete set of documents.

        This method can be used in two ways:
            1. On an unfitted model in which case the dictionary is initialized and trained on `X`.
            2. On an already fitted model in which case the dictionary is **expanded** by `X`.

        Parameters
        ----------
        X : iterable of str
            A collection of documents used to train the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.text2bow.Text2BowTransformer`
            The trained model.
        """
        if self.gensim_model is None:
            self.gensim_model = Dictionary(prune_at=self.prune_at)

        tokenized_docs = [list(self.tokenizer(x)) for x in X]
        self.gensim_model.add_documents(tokenized_docs)
        return self
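# Usage sketch with toy inputs (not from the source): fit() builds the Dictionary,
# transform() returns bag-of-words tuples, and partial_fit() grows the vocabulary
# with additional documents.
t2b = Text2BowTransformer()
t2b.fit(["human interface computer", "graph of trees"])
print(t2b.transform(["human computer survey"]))
t2b.partial_fit(["minors graph survey"])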
class MyCorpus(object):
    '''
    Corpus class for streaming review documents
    '''
    def __init__(self, file_list, file_dir, dictionary=None, mindf=MINDF, maxdf=MAXDF,
                 maxwords=MAXWORDS, cluster_words=CLUSTER_WORDS, cluster_ul=CLUSTER_UL):
        self.file_list = file_list          # list of cuisine text files
        self.file_dir = file_dir            # directory of cuisine text files
        self.maxwords = maxwords            # maximum number of words to keep after building dictionary from clusters
        self.cluster_words = cluster_words  # maximum number of words to keep from each cluster
        self.cluster_ul = cluster_ul        # upper proportion of reviews to limit for cluster processing
        self.mindf = mindf                  # minimum number of documents to keep word
        self.maxdf = maxdf                  # max proportion of documents to keep word
        self.agglomerate = True             # return clusters as single documents (True) or return single reviews (False)
        if dictionary:
            self.dictionary = dictionary
        else:
            self.dictionary = Dictionary()
            self._build_dict()

    def __str__(self):
        return "<MyCorpus at " + str(hex(id(self))) + ">"

    def __repr__(self):
        return self.__str__()

    def _build_dict(self):
        for filename in self.file_list:
            dictionary = dict()
            num_reviews = 0
            with open(os.path.join(self.file_dir, filename), "rt") as f:
                for line in f:
                    num_reviews += 1
                    words = line[REVIEW_INDEX:].split()
                    for word in set(words):
                        if word not in dictionary:
                            dictionary[word] = 1
                        else:
                            dictionary[word] += 1
            doc = [item for item in dictionary.items()
                   if dictionary[item[0]] > 2 and dictionary[item[0]] / num_reviews < self.cluster_ul]
            doc.sort(key=lambda x: -x[1])
            doc = [word for word, f in doc]
            self.dictionary.add_documents([doc[:self.cluster_words]])
            print("%s added to corpus dictionary!" % (filename,))
        self.dictionary.filter_extremes(self.mindf, self.maxdf, self.maxwords)
        self.dictionary.save("cuisine_dictionary.gensimDict")

    def __iter__(self):
        '''
        Iterates through cuisines by combining all reviews for each cuisine into a
        single processed document. Also stores the length of each processed document
        '''
        if self.agglomerate:
            for filename in self.file_list:
                with open(os.path.join(self.file_dir, filename), "rt") as f:
                    doc = " ".join([line[REVIEW_INDEX:].rstrip() for line in f])
                    yield self.dictionary.doc2bow(doc.split())
        else:
            reviewIDs = set()
            for filename in self.file_list:
                with open(os.path.join(self.file_dir, filename), "rt") as f:
                    for line in f:
                        id = line[:RATING_INDEX - 1]
                        if id not in reviewIDs:
                            reviewIDs.update([id])
                            doc = line[REVIEW_INDEX:].rstrip()
                            yield self.dictionary.doc2bow(doc.split())
def main():
    parser = ArgumentParser(
        description="wrapper script for churning datasets of wiki or elasticsearch kind through gensim to produce topic models please see gensim documentation for more information"
    )
    parser.add_argument("-ds", "--dataset", default="wiki", help="What kind of dataset to use. (wiki,es,file)")
    parser.add_argument("-d", "--dump-file", help="Wiki: bz2 dump file with wiki in it")
    parser.add_argument("-l", "--limit", help="Wiki: How many documents to extract from wiki")
    parser.add_argument("--model-id", default="model", help="Filename for created model.")
    parser.add_argument("--model-type", default="lsi", help="Model type (lsi, lda, word2vec, hdp, vocabulary).")
    parser.add_argument("--n-topics", default=10, help="Number of topics to model.")
    parser.add_argument("--n-passes", default=1, help="Number of passes for LDA model.")
    parser.add_argument("--w2v-size", default=100, help="size of Word2Vec context.")
    parser.add_argument("--w2v-window", default=5, help="window for Word2Vec.")
    parser.add_argument("-q", "--query", default=None, help="Elasticsearch: Query to use to fetch documents")
    parser.add_argument("--index", help="Elasticsearch: index to read from.")
    parser.add_argument("--doc_type", default="doc", help="Elasticsearch: data type in index.")
    parser.add_argument("--data-dir", help="Directory to save the generated models and vocabularies into.")
    parser.add_argument("--vocab", help="Prebuilt Vocabulary file. Use this to avoid having to generate one.")

    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ["lsi", "lda", "word2vec", "hdp", "vocabulary"]:
        logging.error("Invalid model type %s" % model_type)
        parser.print_usage()
        exit(-1)
    logging.info("Using model type %s" % model_type)

    dump_fn = opts.dump_file
    limit = int(opts.limit) if opts.limit else None

    data_type = opts.dataset.lower()
    if data_type not in ["es", "wiki", "file"]:
        logging.error("Invalid dataset type %s" % data_type)
        parser.print_usage()
        exit(-1)

    if not dump_fn and data_type in ["wiki"]:
        logging.error("--dump-file required for wiki dataset")
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == "es" and index is None:
        logging.error(
            "Please be kind to at least specify the index you want to fetch from elasticsearch using the --index parameter"
        )
        sys.exit(1)

    n_topics = int(opts.n_topics)
    n_passes = int(opts.n_passes)
    logging.info("Using %d topics." % n_topics)

    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = "%s_%s_%d" % (model_id, model_type, n_topics)
    if data_dir:
        model_fn = "%s/%s" % (data_dir, model_fn)
    if model_type == "word2vec":
        w2v_size = int(opts.w2v_size)
        w2v_window = int(opts.w2v_window)
        model_fn = "%s_w_%s_s_%s" % (model_fn, w2v_window, w2v_size)
    logging.info("Writing models to %s." % model_fn)

    if data_type == "es":
        logging.info("Using data type %s with index %s, doc_type %s query %s"
                     % (data_type, index, doc_type, query))
        dataset = ElasticsearchDataset(read_index=index, read_doc_type=doc_type,
                                       query=query, normalize_func=normalize_es)
    elif data_type == "wiki":
        logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit))
        dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_wiki)
    elif data_type == "file":
        logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit))
        dataset = FileDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_file)

    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words("norwegian"))
    if not vocab_file or model_type == "vocabulary":
        vocab.add_documents([get_tokenized(page, sw) for page in dataset])
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + ".vocab")
    else:
        vocab = Dictionary.load(vocab_file)
    if model_type == "vocabulary":
        return

    tfidf = TfidfModel(dictionary=vocab)

    if model_type == "lsi":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus], num_topics=n_topics, id2word=vocab)
    elif model_type == "lda":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus], num_topics=n_topics, passes=n_passes, id2word=vocab)
    elif model_type == "word2vec":
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == "hdp":
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)

    logging.info(model)
    model.save(model_fn)
class YahooDictionary:
    def __init__(self, source_file, vocab_size=20000, max_ans_len=1000, max_sub_len=100,
                 max_cont_len=500, dict_file_name=''):
        assert os.path.exists(source_file), 'The file "%s" was not found' % source_file

        self.source_file = source_file
        self.vocab = Dictionary()
        self.vocab_size = vocab_size

        print('Creating XML tree...')
        tree = ET.parse(source_file)
        self.root = tree.getroot()

        # maximum lengths for everything
        self.max_ans_len = max_ans_len
        self.max_sub_len = max_sub_len
        self.max_cont_len = max_cont_len

        print('Creating dictionary...')
        self._create_dictionary()

    @staticmethod
    def tokenize(text):
        return gensim.utils.tokenize(text, to_lower=True)

    def _create_dictionary(self):
        categories = set()

        # create dictionary
        for vespaadd in self.root.iter('vespaadd'):
            doc = vespaadd.find('document')
            subject_text = YahooDictionary.tokenize(doc.find('subject').text)
            content_text = YahooDictionary.tokenize(doc.find('content').text)
            self.vocab.add_documents([subject_text, content_text], prune_at=self.vocab_size)

            # category
            categories.add(doc.find('cat').text)

            # answers
            answers = [YahooDictionary.tokenize(answer.text)
                       for answer in doc.find('nbestanswers').getchildren()]
            self.vocab.add_documents(answers, prune_at=self.vocab_size)

        self.cat_to_idx = dict((c, i + 1) for i, c in enumerate(categories))
        self.idx_to_cat = dict((i + 1, c) for i, c in enumerate(categories))

    def get_docs(self):
        all_answers = []
        all_subjects = []
        all_contents = []
        all_categories = []

        # encode every document
        for vespaadd in self.root.iter('vespaadd'):
            doc = vespaadd.find('document')

            # subject and content
            subject_text_iter = YahooDictionary.tokenize(doc.find('subject').text)
            content_text_iter = YahooDictionary.tokenize(doc.find('content').text)
            subject_enc = [self.vocab.token2id[x]
                           for x in itertools.islice(subject_text_iter, self.max_sub_len)]
            content_enc = [self.vocab.token2id[x]
                           for x in itertools.islice(content_text_iter, self.max_cont_len)]

            # category index
            category = self.cat_to_idx[doc.find('cat').text]

            # answers
            answers = [YahooDictionary.tokenize(answer.text)
                       for answer in doc.find('nbestanswers').getchildren()]
            for answer in answers:
                answer_enc = [self.vocab.token2id[x]
                              for x in itertools.islice(answer, self.max_ans_len)]
                all_categories.append(category)
                all_subjects.append(subject_enc)
                all_contents.append(content_enc)
                all_answers.append(answer_enc)

        return pad_sequences(all_answers, self.max_ans_len), \
            pad_sequences(all_subjects, self.max_sub_len), \
            pad_sequences(all_contents, self.max_cont_len), \
            np.array(all_categories)