def get_tfidf_model():
    if os.path.isfile(TFIDF_FILE):
        return TfidfModel.load(TFIDF_FILE)
    else:
        model = TfidfModel(get_corpus(), get_dictionary())
        model.save(TFIDF_FILE)
        return model
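# Usage sketch for the cached loader above (an assumption-laden example: it
# relies on the same TFIDF_FILE constant and the get_corpus()/get_dictionary()
# helpers the function itself calls):
tfidf = get_tfidf_model()
dictionary = get_dictionary()
bow = dictionary.doc2bow("some new document".split())
print(tfidf[bow])  # [(token_id, tf-idf weight), ...]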
class TFIDFmodel(object):

    def __init__(self):
        self.inner_model = None

        # load dictionary and corpus
        vocabulary = "raw"
        corpora_folder = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'data', 'corpora')
        self.dictionary = corpora.Dictionary.load(
            os.path.join(corpora_folder, "%s.dict" % (vocabulary,)))
        self.corpus = corpora.MmCorpus(
            os.path.join(corpora_folder, "%s.mm" % (vocabulary,)))

        # parameters
        self.dataset = "CASEREPORT"

        # data file path
        models_folder = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'data', 'models')
        filename = "TFIDF_%s" % (self.dataset,)
        self.filepath = os.path.join(models_folder, filename)

        # load the model if it was trained before, otherwise train and save it
        if os.path.isfile(self.filepath):
            logging.info("found data file %s" % (self.filepath,))
            self.inner_model = TfidfModel.load(self.filepath)
        else:
            self.inner_model = TfidfModel(corpus=self.corpus)
            self.inner_model.save(self.filepath)

    def __contains__(self, item):
        return item in self.inner_model
def cal_tfidf(documents, topk=10) -> List:
    """
    Train a tf-idf model and extract the top-scoring words per document.
    :param documents: documents to train on
    :param topk: how many of the highest-scoring words to return per document;
                 if topk exceeds the number of words found, all words are returned
    :return: a list of (word, tf-idf score) pairs per document
    """
    # split each document into a list of tokens
    docs = [[word for word in document.split(' ')] for document in documents]
    # build the dictionary
    dictionary = corpora.Dictionary(docs)
    # build the bag-of-words representation
    docs_bow = [dictionary.doc2bow(doc) for doc in docs]
    if os.path.isfile(tfidfmodel):
        model = TfidfModel.load(tfidfmodel)
    else:
        model = TfidfModel(docs_bow)
        model.save(tfidfmodel)
    # vectorize the documents
    docs_vector = list(model[docs_bow])
    # sort every document vector by score and keep the top topk entries
    docs_sort_vector = [
        sorted(doc, key=lambda x: x[1], reverse=True)[:topk]
        for doc in docs_vector
    ]
    # map token ids back to words; the result pairs each (here Chinese) word
    # with its tf-idf score
    docs_sort_chinese = [[(dictionary[vec[0]], vec[1]) for vec in doc]
                         for doc in docs_sort_vector]
    return docs_sort_chinese
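# Minimal usage sketch for cal_tfidf, assuming the module-level `tfidfmodel`
# path it reads/writes points somewhere writable; documents are plain
# space-separated strings:
docs = [
    "the cat sat on the mat",
    "the dog chased the cat",
]
print(cal_tfidf(docs, topk=3))  # top-3 (word, score) pairs per document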
def buildTfidfModel(corpus):
    print('get tfidf model...')
    if not os.path.exists(modelpath + 'tfidf.model'):
        # build the tf-idf model from the corpus
        tfidf = TfidfModel(corpus)
        tfidf.save(modelpath + 'tfidf.model')
    else:
        tfidf = TfidfModel.load(modelpath + 'tfidf.model')
    print('done')
    return tfidf
class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):

    def __init__(self, dirpath=".", tofull=False):
        """
        Pass in a directory that holds the lexicon in corpus.dict and the
        TFIDF model in tfidf.model (for now).

        Set tofull = True if the next thing is a Scikit-Learn estimator,
        otherwise keep False if the next thing is a Gensim model.
        """
        self._lexicon_path = os.path.join(dirpath, "corpus.dict")
        self._tfidf_path = os.path.join(dirpath, "tfidf.model")
        self.lexicon = None
        self.tfidf = None
        self.tofull = tofull
        self.load()

    def load(self):
        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)
        if os.path.exists(self._tfidf_path):
            self.tfidf = TfidfModel.load(self._tfidf_path)

    def save(self):
        self.lexicon.save(self._lexicon_path)
        self.tfidf.save(self._tfidf_path)

    def fit(self, documents, labels=None):
        self.lexicon = Dictionary(documents)
        self.tfidf = TfidfModel(
            [self.lexicon.doc2bow(doc) for doc in documents],
            id2word=self.lexicon)
        self.save()
        return self

    def transform(self, documents):
        def generator():
            for document in documents:
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    # sparse2full needs the target length to densify
                    yield sparse2full(vec, len(self.lexicon))
                else:
                    yield vec
        return list(generator())
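# Because the vectorizer implements fit/transform, it can sit inside a
# scikit-learn Pipeline. A sketch under assumptions: `tokenized_docs` is a
# list of token lists and a dense downstream estimator is used (hence
# tofull=True):
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vect', GensimTfidfVectorizer(dirpath=".", tofull=True)),
    ('clf', SGDClassifier()),
])
# pipeline.fit(tokenized_docs, labels)
# predictions = pipeline.predict(new_tokenized_docs)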
def main():
    parser = ArgumentParser()
    parser.add_argument('-e', '--encoding')
    parser.add_argument('-o', '--output-file')
    args = parser.parse_args()
    encoding = args.encoding
    output_fn = args.output_file
    if not output_fn:
        sys.exit(-1)

    if encoding:
        sys.stdout = codecs.getwriter(encoding)(sys.stdout)
        sys.stdin = codecs.getreader(encoding)(sys.stdin)

    texts = (line.split() for line in sys.stdin)

    logging.info('Creating vocabulary ...')
    vocab = Dictionary(texts)
    logging.info('Saving vocabulary to %s ...' % (output_fn + '.bz2'))
    vocab.save(output_fn)

    logging.info('Compressing vocabulary ...')
    with open(output_fn, 'rb') as input:
        with bz2.BZ2File(output_fn + '.bz2', 'wb', compresslevel=9) as output:
            copyfileobj(input, output)
    os.remove(output_fn)

    logging.info('Creating IDF model ...')
    tfidf = TfidfModel(dictionary=vocab)
    logging.info('Saving IDF model to %s ...' % (output_fn + '.tfidf.bz2'))
    tfidf.save(output_fn + '.tfidf')

    logging.info('Compressing IDF model ...')
    with open(output_fn + '.tfidf', 'rb') as input:
        with bz2.BZ2File(output_fn + '.tfidf.bz2', 'wb',
                         compresslevel=9) as output:
            copyfileobj(input, output)
    os.remove(output_fn + '.tfidf')
class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):

    def __init__(self, dirpath=".", tofull=False):
        """Gensim vectorizer."""
        self._lexicon_path = os.path.join(dirpath, "corpus.dict")
        self._tfidf_path = os.path.join(dirpath, "tfidf.model")
        self.lexicon = None
        self.tfidf = None
        self.tofull = tofull
        self.load()

    def load(self):
        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)
        if os.path.exists(self._tfidf_path):
            self.tfidf = TfidfModel.load(self._tfidf_path)

    def save(self):
        self.lexicon.save(self._lexicon_path)
        self.tfidf.save(self._tfidf_path)

    def fit(self, documents, labels=None):
        # only train when no model was loaded from disk
        if self.lexicon is None or self.tfidf is None:
            inputDocuments = list(documents)
            self.lexicon = Dictionary(inputDocuments)
            self.tfidf = TfidfModel(
                [self.lexicon.doc2bow(doc) for doc in inputDocuments],
                id2word=self.lexicon)
            self.save()
        return self

    def transform(self, documents):
        returnDocs = []
        for document in documents:
            vec = self.tfidf[self.lexicon.doc2bow(document)]
            if self.tofull:
                returnDocs.append(sparse2full(vec, len(self.lexicon)))
            else:
                returnDocs.append(vec)
        return returnDocs
def getTopic(file_name, topic_type="lda", topics_num=5, topics_words=5):
    """
    Build a topic model.
    :param file_name:
    :param topic_type: lda or lsi
    :param topics_num:
    :param topics_words:
    :return:
    """
    texts = list()
    f = codecs.open(file_name, 'r', encoding='utf-8')
    for line in f:
        tt_texts = list()
        line = line.strip()
        words = jieba.cut(line, cut_all=False)
        t_texts = list(words)
        for text in t_texts:
            if len(text.strip()) > 1:
                tt_texts.append(text)
        texts.append(tt_texts)

    # drop words that appear only once (currently disabled)
    frequency = defaultdict(int)
    """for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]"""

    # build the dictionary; among other things it stores
    # dictionary.token2id (word -> id) and dictionary.dfs (word frequencies)
    dictionary = corpora.Dictionary(texts)
    dictionary.save('deerwester.dict')  # store the dictionary, for future reference
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('deerwester.mm', corpus)  # store to disk, for later use

    tfidf = TfidfModel(corpus)
    tfidf_corpus = tfidf[corpus]
    tfidf.save('foo.tfidf_model')
    """ Loading the models back:
    dictionary = corpora.Dictionary.load('mydict.dic')
    corpus = corpora.MmCorpus('lsi_corpus.mm')
    model = LsiModel.load('model.lsi')
    model2 = LdaModel.load('model.lda')
    TfidfModel.load('foo.tfidf_model')"""

    if topic_type == "lsi":
        lsi = LsiModel(corpus=tfidf_corpus, id2word=dictionary)
        lsi_corpus = lsi[tfidf_corpus]
        lsi.save('model.lsi')
        corpora.MmCorpus.serialize('lsi_corpus.mm', lsi_corpus)
        return lsi.print_topics(num_topics=topics_num, num_words=topics_words)

    if topic_type == "lda":
        lda = LdaModel(corpus=tfidf_corpus, id2word=dictionary)
        lda_corpus = lda[tfidf_corpus]
        lda.save('model.lda')
        corpora.MmCorpus.serialize('lda_corpus.mm', lda_corpus)
        return lda.print_topics(num_topics=topics_num, num_words=topics_words)
class TextProcessor:

    def __init__(self, n_users, n_samples, n_dims):
        self.nUsers, self.nSamples, self.nDims = n_users, n_samples, n_dims
        self.tfIdfModel = self.lsiModel = self.ldaModel = self.w2vModel = self.dictionary = None

        self.dictPath = conf.get_filename_via_tpl(
            'model', model_type='tfidf', n_users=n_users,
            n_samples=n_samples, model_filename='dict')
        self.tfIdfPath = conf.get_filename_via_tpl(
            'model', model_type='tfidf', n_users=n_users,
            n_samples=n_samples, model_filename='tfidf')
        self.lsiPath = conf.get_filename_via_tpl(
            'model', model_type='lsi', n_users=n_users, n_samples=n_samples,
            n_dims=n_dims, model_filename='lsi_model')
        self.ldaPath = conf.get_filename_via_tpl(
            'model', model_type='lda', n_users=n_users, n_samples=n_samples,
            n_dims=n_dims, model_filename='lda_model')
        self.w2vPath = conf.get_filename_via_tpl(
            'model', model_type='w2v', n_users=n_users, n_samples=n_samples,
            n_dims=n_dims, model_filename='w2vmodel')
        self.w2vVecPath = conf.get_filename_via_tpl(
            'model', model_type='w2v', n_users=n_users, n_samples=n_samples,
            n_dims=n_dims, model_filename='vec.txt')

    def load_model(self, model_type):
        model = None
        try:
            if model_type == 'tfidf':
                model = TfidfModel.load(self.tfIdfPath, mmap='r')
                self.tfIdfModel = model
            elif model_type == 'lsi':
                model = LsiModel.load(self.lsiPath, mmap='r')
                self.lsiModel = model
            elif model_type == 'lda':
                model = LdaModel.load(self.ldaPath, mmap='r')
                self.ldaModel = model
            elif model_type == 'w2v':
                model = Word2Vec.load(self.w2vPath, mmap='r')
                self.w2vModel = model
            else:
                logger.error('Model type error. Unexpected %s' % model_type)
                return None

            if self.dictionary is None and os.path.exists(self.dictPath):
                self.dictionary = corpora.Dictionary.load(self.dictPath)

            logger.info('%s model loaded completely.' % model_type)
        except IOError:
            logger.error(
                'The %s model doesn\'t exist. Please train the model before loading it.'
                % model_type)
        finally:
            return model

    def tf_idf_transform(self, doc):
        """Perform tf-idf transformation on doc."""
        self.dictionary = corpora.Dictionary(doc)
        corpus = [self.dictionary.doc2bow(text) for text in doc]
        self.tfIdfModel = TfidfModel(corpus)

        conf.mk_dir(self.tfIdfPath)
        self.dictionary.save(self.dictPath)
        logger.info('Dictionary has been saved in %s.' % self.dictPath)

        self.tfIdfModel.save(self.tfIdfPath)
        logger.info('TF-IDF model has been saved in %s.' % self.tfIdfPath)

        tfidf_corpus = self.tfIdfModel[corpus]
        tfidf_corpus_path = conf.get_filename_via_tpl(
            'tfidf', n_users=self.nUsers, postfix='mm',
            n_samples=self.nSamples)
        corpora.MmCorpus.serialize(tfidf_corpus_path, tfidf_corpus)
        logger.info('TF-IDF corpus with a shape of %s has been saved in %s.'
                    % (np.array(tfidf_corpus).shape, tfidf_corpus_path))
        return tfidf_corpus

    def lsi_transform(self, corpus_tf_idf):
        logger.info('Training lsi model with a n_dims of %d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        self.lsiModel = LsiModel(corpus=corpus_tf_idf,
                                 num_topics=self.nDims,
                                 id2word=self.dictionary)
        conf.mk_dir(self.lsiPath)
        self.lsiModel.save(self.lsiPath)
        logger.info('Lsi model has been saved in %s.' % self.lsiPath)

        lsi_corpus = self.lsiModel[corpus_tf_idf]
        lsi_corpus_path = conf.get_filename_via_tpl(
            'lsi', n_users=self.nUsers, n_samples=self.nSamples,
            n_dims=self.nDims, postfix='mm')
        conf.mk_dir(lsi_corpus_path)
        corpora.MmCorpus.serialize(lsi_corpus_path, lsi_corpus)
        logger.info('Lsi corpus with a shape of %s has been saved in %s.'
                    % (np.array(lsi_corpus).shape, lsi_corpus_path))
        return lsi_corpus

    def lda_transform(self, corpus_tf_idf, train_separated=False, is_update=False):
        """
        Init an lda model, then fit it with corpus_tf_idf and transform it.
        :param corpus_tf_idf: corpus which has been transformed into a tf-idf matrix.
        :param train_separated: whether to train on the whole corpus at once
               or on parts of it separately.
        :param is_update: whether to construct a new model or update an existing one.
        :return: lda corpus.
        """
        logger.info('Training lda model with a n_dims of %d...' % self.nDims)
        if self.dictionary is None and os.path.exists(self.dictPath):
            self.dictionary = corpora.Dictionary.load(self.dictPath)

        if is_update:
            # An ldaModel was trained before; update it with the new corpus.
            if self.ldaModel is None:
                self.load_model('lda')
            self.ldaModel.update(corpus_tf_idf)
            logger.info('Lda model has been updated successfully.')
            return self.ldaModel[corpus_tf_idf]

        if train_separated:
            # corpus = []
            # spacing = 10000
            # for i in range(int(len(corpus_tf_idf)/spacing)):
            #     corpus.append(corpus_tf_idf[i*spacing: i])
            # self.ldaModel = LdaModel()
            pass

        self.ldaModel = LdaModel(corpus=corpus_tf_idf,
                                 num_topics=self.nDims,
                                 id2word=self.dictionary)
        conf.mk_dir(self.ldaPath)
        self.ldaModel.save(self.ldaPath)
        logger.info('Lda model has been saved in %s.' % self.ldaPath)

        lda_corpus = self.ldaModel[corpus_tf_idf]
        lda_corpus_path = conf.get_filename_via_tpl(
            'lda', n_users=self.nUsers, n_samples=self.nSamples,
            n_dims=self.nDims, postfix='mm')
        conf.mk_dir(lda_corpus_path)
        corpora.MmCorpus.serialize(lda_corpus_path, lda_corpus)
        logger.info('Lda corpus with a shape of %s has been saved in %s.'
                    % (np.array(lda_corpus).shape, lda_corpus_path))
        return lda_corpus

    def w2v_transform(self, sentences):
        """
        Perform word2vec on texts and obtain a w2v model.
        :param sentences: each sentence is a list of words of a text.
        :return: w2v model.
        """
        logger.info('Training w2v model with a dim of %d...' % self.nDims)
        self.w2vModel = Word2Vec(sentences, size=self.nDims, min_count=0)

        conf.mk_dir(self.w2vPath)
        self.w2vModel.save(self.w2vPath)
        self.w2vModel.wv.save_word2vec_format(self.w2vVecPath, binary=False)

        # Construct the w2v corpus: sum the word vectors of each sentence.
        w2v_corpus = []
        for sen in sentences:
            vec = [0] * self.nDims
            if len(sen) > 0:
                for word in sen:
                    vec = list(map(lambda m, n: m + n, vec,
                                   self.w2vModel[word]))
            w2v_corpus.append(vec)

        w2v_corpus_path = conf.get_filename_via_tpl(
            'w2v', n_users=self.nUsers, n_samples=self.nSamples,
            n_dims=self.nDims)
        conf.mk_dir(w2v_corpus_path)
        with open(w2v_corpus_path, 'w') as fp:
            csv_writer = csv.writer(fp)
            for line in w2v_corpus:
                csv_writer.writerow(line)
        logger.info('W2v corpus has been saved in %s.' % w2v_corpus_path)
        return w2v_corpus

    def load_corpus(self, model_type, dense=False):
        corpus = None
        try:
            if model_type == 'tfidf':
                corpus = corpora.MmCorpus(
                    conf.get_filename_via_tpl('tfidf', n_users=self.nUsers,
                                              postfix='mm',
                                              n_samples=self.nSamples))
            elif model_type in ['lsi', 'lda']:
                corpus = corpora.MmCorpus(
                    conf.get_filename_via_tpl(model_type, n_users=self.nUsers,
                                              n_samples=self.nSamples,
                                              n_dims=self.nDims, postfix='mm'))
            elif model_type == 'w2v':
                corpus = np.loadtxt(
                    conf.get_filename_via_tpl(model_type, n_users=self.nUsers,
                                              n_samples=self.nSamples,
                                              n_dims=self.nDims),
                    dtype=np.float, delimiter=',')
            logger.info('%s corpus with a shape of %s has been loaded.'
                        % (model_type, np.array(corpus).shape))

            if dense and model_type in ['tfidf', 'lsi', 'lda']:
                corpus = matutils.corpus2dense(corpus, self.nDims,
                                               self.nSamples * self.nUsers,
                                               dtype=np.float).T
            else:
                corpus = np.array(corpus)
        except Exception as e:
            raise e
        return corpus

    @staticmethod
    def corpus2dense(corpus, n_terms, n_docs=conf.N_SAMPLES, dtype=np.float):
        return matutils.corpus2dense(corpus, n_terms, n_docs, dtype).T

    def load_vec(self, vec_type):
        logger.info('Loading %s vectors...' % vec_type)
        try:
            corpus_vec = self.load_corpus(vec_type, True)
        except Exception as e:
            raise e
        data = []
        for i in range(self.nUsers):
            data.append(corpus_vec[i * self.nSamples:(i + 1) * self.nSamples])
        data = np.array(data, dtype=np.float)
        return data
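# End-to-end usage sketch for TextProcessor; the conf module, its file-name
# templates, and the tokenized `texts` (a list of token lists) are
# assumptions carried over from the class above:
tp = TextProcessor(n_users=10, n_samples=100, n_dims=50)
tfidf_corpus = tp.tf_idf_transform(texts)    # BOW -> tf-idf, saves dict + model
lsi_corpus = tp.lsi_transform(tfidf_corpus)  # tf-idf -> 50-dim LSI space
lda_corpus = tp.lda_transform(tfidf_corpus)  # tf-idf -> 50-dim LDA space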
class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):

    def __init__(self, dirpath=".", type='tfidf', tofull=False, vec_size=100):
        """
        Pass in a directory that holds the lexicon in corpus.dict and the
        TFIDF model in tfidf.model (for now).

        Set tofull = True if the next thing is a Scikit-Learn estimator,
        otherwise keep False if the next thing is a Gensim model.
        """
        self._type = type
        self._lexicon_path = os.path.join(dirpath, "corpus.dict")
        self._model_path = os.path.join(dirpath, type + ".model")
        self.lexicon = None
        self.model = None
        self.tofull = tofull
        self._nfeat = vec_size
        self.load()

    def load(self):
        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)
        if os.path.exists(self._model_path):
            self.model = TfidfModel.load(self._model_path)

    def save(self):
        self.lexicon.save(self._lexicon_path)
        self.model.save(self._model_path)

    def fit(self, documents, labels=None):
        if self._type == "tfidf":
            self.lexicon = Dictionary(documents)
            self.model = TfidfModel(
                [self.lexicon.doc2bow(doc) for doc in documents],
                id2word=self.lexicon)
            self.save()
        return self

    def transform(self, documents):
        if self._type == "doc2vec":
            taggeddoc = [
                TaggedDocument(words, ['d{}'.format(idx)])
                for idx, words in enumerate(documents)
            ]
            # keep the trained Doc2Vec model so its document vectors are
            # available below
            self.model = Doc2Vec(taggeddoc, vector_size=self._nfeat,
                                 window=2, min_count=1, workers=4)
            docvec_mat = self.model.docvecs.vectors_docs
        else:
            if self._type == "count":
                docvecs = [
                    self.lexicon.doc2bow(document) for document in documents
                ]
            elif self._type == "ohe":
                docvecs = [[(token[0], 1)
                            for token in self.lexicon.doc2bow(document)]
                           for document in documents]
            else:
                # tf-idf weights from the fitted model
                docvecs = [
                    self.model[self.lexicon.doc2bow(document)]
                    for document in documents
                ]
            docvecs = [
                sparse2full(docvec, len(self.lexicon)) for docvec in docvecs
            ]
            docvec_mat = sp.csr_matrix(docvecs, dtype=np.float64)
        return docvec_mat
import json
import os

from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from textblob import TextBlob


class JsonCorpus(object):
    def __iter__(self):
        data = json.load(open('data/nasa.json'))
        desc = [
            TextBlob(dataset['description'].lower()).tokens
            for dataset in data['dataset']
        ]
        self.dictionary = Dictionary(desc)
        for d in desc:
            yield self.dictionary.doc2bow(d)


def score(text, tfidf, dictionary):
    return tfidf[dictionary.doc2bow(TextBlob(text.lower()).tokens)]


if __name__ == '__main__':
    if os.path.exists('tfidf.pkl') and os.path.exists('nasa_dictionary.pkl'):
        tfidf = TfidfModel.load('tfidf.pkl')
        dictionary = Dictionary.load('nasa_dictionary.pkl')
    else:
        corpus = JsonCorpus()
        # one full pass builds corpus.dictionary as a side effect of __iter__
        bow_corpus = list(corpus)
        corpus.dictionary.save('nasa_dictionary.pkl')
        dictionary = corpus.dictionary
        tfidf = TfidfModel(bow_corpus, dictionary=dictionary)
        tfidf.save('tfidf.pkl')

    print(score('project completed', tfidf=tfidf, dictionary=dictionary))
def gerar_modelo(self, modelo):
    '''
    Trains the selected model and saves it. Then builds the similarity
    matrix for the transformed corpus.
    Parameters:
        modelo (str) --> model name: "tfidf", "tfidf_pivot", "lsi", "lda"
                         or "doc2vec"
    Returns: None
    '''
    # Check whether the model has been implemented
    if modelo not in self._modelos:
        print(f'The model "{modelo}" has not been implemented.')
        return

    # Define the file names
    arq_model = os.path.join(self.corpus._pastas['modelos'],
                             f'{self.corpus._link_nome}.{self._exts[modelo]}')
    arq_index = os.path.join(self.corpus._pastas['indices'],
                             f'{self.corpus._link_nome}_{modelo}.idx')

    # Build the requested model
    if modelo == 'tfidf':
        corpus_train = self.corpus.corpus(tipo='bow')
        num_features = self.corpus.num_tokens
        model = TfidfModel(corpus=corpus_train,
                           id2word=self.corpus.dicionario())
    elif modelo == 'tfidf_pivot':
        corpus_train = self.corpus.corpus(tipo='bow')
        num_features = self.corpus.num_tokens
        model = TfidfModel(corpus=corpus_train,
                           id2word=self.corpus.dicionario(),
                           smartirs='nfu',
                           pivot=self.corpus.num_tokens / self.corpus.num_docs)
    elif modelo == 'lda':
        corpus_train = self.corpus.corpus(tipo='bow')
        num_features = self._modelos[modelo]['num_topics']
        model = LdaModel(corpus=corpus_train,
                         id2word=self.corpus.dicionario(),
                         num_topics=num_features)
    elif modelo == 'lsi':
        corpus_train = self.corpus.corpus(tipo='tfidf')
        num_features = self._modelos[modelo]['num_topics']
        model = LsiModel(corpus=corpus_train,
                         id2word=self.corpus.dicionario(),
                         num_topics=num_features)
    elif modelo == 'doc2vec':
        corpus_train = self.corpus.corpus(tipo='tagged')
        num_features = self._modelos[modelo]['vector_size']
        model = Doc2Vec(vector_size=num_features,
                        workers=mp.cpu_count() // 2,
                        alpha=self._modelos[modelo]['alpha'],
                        min_alpha=self._modelos[modelo]['min_alpha'])
        # Build the corpus vocabulary to train the Doc2Vec model
        model.build_vocab(corpus_train)
        # Train the Doc2Vec model
        model.train(corpus_train,
                    total_examples=model.corpus_count,
                    epochs=model.epochs)
    else:
        print(f'The model "{modelo}" has not been implemented.')
        return

    # Save the trained model
    model.save(self._arqs['modelos'][modelo])

    # Define the corpus for the similarity matrix
    if modelo == 'doc2vec':
        corpus = Doc2VecCorpus(model)
    else:
        corpus = model[corpus_train]

    # Build the index from the transformed corpus
    index = Similarity(output_prefix=self._arqs['indices'][modelo],
                       corpus=corpus,
                       num_features=num_features)

    # Save the index
    index.save(self._arqs['indices'][modelo])
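# Usage sketch: `modelos` stands for whatever object exposes gerar_modelo
# (the enclosing class is not shown in this snippet, so the name is
# hypothetical):
modelos.gerar_modelo('tfidf')        # train and save the tf-idf model + index
modelos.gerar_modelo('tfidf_pivot')  # pivoted-normalization variant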
if doTrain:
    comments_dictionary = Dictionary(docs)
    comments_dictionary.filter_extremes(no_below=10, no_above=0.3)
    comments_dictionary.compactify()
    comments_dictionary.save(FLAGS.dictFile)
else:
    print("Loading dictionary...")
    comments_dictionary = Dictionary.load(FLAGS.dictFile)

print("Converting to BOW vectors...")
comments_corpus = [comments_dictionary.doc2bow(d) for d in docs]

model_tfidf = None
if doTrain:
    print("Creating tfidf model...")
    model_tfidf = TfidfModel(comments_corpus)
    model_tfidf.save(FLAGS.tfidfFile)
else:
    print("Loading tfidf model...")
    model_tfidf = TfidfModel.load(FLAGS.tfidfFile)

print("Converting to tfidf vectors...")
comments_tfidf = model_tfidf[comments_corpus]
comments_vecs = np.vstack(
    [sparse2full(c, len(comments_dictionary)) for c in comments_tfidf])

chi2_features = None
if doTrain:
    # Find the most discriminative words for any of the labels
    print("Finding discriminative features...")
    labels = np.array(data['any'])
    model_fpr = SelectFpr(chi2, alpha=0.025)
def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    working_corpus = path.join(base_path, p['corpus_path'], p['corpus_name'])
    human_data_file = path.join(base_path, p['human_data_file'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(base_path, p['corpus_path'],
                                           p['dict_name']))
    dictionary.save(path.join(output_dir, p['dict_name']))
    logger.info(dictionary)

    logger.info('loading corpus')
    corpus_bow = MmCorpus(working_corpus)

    logger.info("create preprocessing model and save it to disk")
    if p['pre_model'] == 'tfidf':
        pre_model = TfidfModel(corpus_bow, id2word=dictionary, normalize=True)
    elif p['pre_model'] == 'log_ent':
        pre_model = LogEntropyModel(corpus_bow,
                                    id2word=dictionary, normalize=True)
    else:
        raise ValueError('model parameter %s not known' % p['pre_model'])
    pre_model.save(os.path.join(output_dir, p['pre_model_extension']))

    logger.info('initialize LSI model')
    lsi = models.LsiModel(pre_model[corpus_bow],
                          id2word=dictionary, num_topics=p['num_topics'])
    lsi.save(os.path.join(output_dir, p['lsi_extension']))
    logger.info('finished --> lsi model saved to: %s' %
                os.path.join(output_dir, p['lsi_extension']))

    # check for correlation with lee human data
    logger.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text,
                                        allow_update=False,
                                        return_missing=False)
                     for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (LSI)')
    corpus_lsi = lsi[pre_model[bow_lee_texts]]

    # compute pairwise similarity matrix of transformed corpus
    sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            sim_matrix[i, j] = matutils.cossim(par1, par2)
    sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(human_data_file)
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    # compute correlations
    cor = np.corrcoef(sim_vector, human_sim_vector)
    logger.info("correlation with lee human data: %f" % cor[0, 1])

    # elapsed time, not start minus now (which would be negative)
    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
def prepare_text_for_fitting(full_texts, sentences, nlp, **kwargs):
    # Grab and parse the chapters/sentences from the input corpus
    chapters = full_texts.split('\n\n\n\n\n\n')
    p_chapters = [
        tokenize(nlp(chapter_return(chapter))) for chapter in chapters
    ]
    p_sentences = [tokenize(nlp(sentence)) for sentence in sentences]

    # Create gensim dictionaries and carefully filter the high/low
    # occurring words.
    text_dict = Dictionary(p_chapters)
    sentence_dict = Dictionary(p_sentences)
    text_dict.filter_extremes(no_below=4, no_above=0.22)
    print(len(text_dict))
    text_dict.compactify()
    text_dict[0]  # force construction of the id2token mapping

    # Get the bag-of-words representation for every word in each chapter
    chap_corpus = [text_dict.doc2bow(c) for c in p_chapters]

    # The GloVe vector representation of each word in all of the chapters
    tf_idf_glove = np.vstack(
        [nlp(text_dict[i]).vector for i in range(len(text_dict))])

    # Create a normed set of the vectors for easy similarity scoring
    normed_vecs = copy.deepcopy(tf_idf_glove)
    for i, nv in enumerate(normed_vecs):
        normed_vecs[i] = nv / np.linalg.norm(nv)

    # Get the bag-of-words rep. for each applicable sentence.
    # If a word is not in the dictionary, grab and weight the most similar
    # available word.
    sent_corpus = [
        get_sent_bow(s, text_dict, nlp, preload=normed_vecs)
        for s in p_sentences
    ]

    # Could use atn or ntn as well as ltn
    if os.path.isfile('tf_idf_sent_mat_samp4.pckl'):
        sent_vecs = pickle.load(open('tf_idf_sent_mat_samp4.pckl', 'rb'))
    else:
        # Create a TF-IDF model for the text as a whole
        model_tfidf = TfidfModel(chap_corpus, id2word=text_dict,
                                 smartirs='ltn')
        model_tfidf.save('tfidf_model_samp4')
        # Apply the model to each word in the applicable sentences
        sent_tfidf = model_tfidf[sent_corpus]
        # Unpack each TF-IDF vector
        sent_vecs = np.vstack(
            [sparse2full(c, len(text_dict)) for c in sent_tfidf])
        pickle.dump(sent_vecs, open('tf_idf_sent_mat_samp4.pckl', 'wb'))

    if os.path.isfile('glove_sent_mat_samp4.pckl'):
        sent_glove_mat = pickle.load(open('glove_sent_mat_samp4.pckl', 'rb'))
    else:
        # Weight the GloVe vector representation by the TF-IDF values
        sent_glove_mat = np.dot(sent_vecs, tf_idf_glove)
        pickle.dump(sent_glove_mat, open('glove_sent_mat_samp4.pckl', 'wb'))

    if os.path.isfile('sent_w2v_mat_samp4.pckl'):
        sent_w2v_mat = pickle.load(open('sent_w2v_mat_samp4.pckl', 'rb'))
    else:
        # Create a 250-element Word2Vec modeller
        model_w2v = Word2Vec(p_chapters, size=250, window=7)
        # Train it over 10 epochs
        model_w2v.train(p_chapters, total_examples=model_w2v.corpus_count,
                        epochs=10)
        model_w2v.init_sims()
        model_w2v.save('word2vec_model_samp4')

        # Collect the dict. IDs for the intersection of the w2v and
        # text vocabularies.
        ids = []
        for k in model_w2v.wv.vocab:
            try:
                ids.append(text_dict.token2id[k])
            except KeyError:
                pass

        # Deal with the id numbers being off: renumber the kept ids 0..N-1
        renum_dict = dict(zip(sorted(ids), range(len(ids))))
        # Create the new, smaller subset dictionary, keyed by the new ids so
        # it stays aligned with the renumbered corpus
        filt_dict = {renum_dict[old_id]: text_dict[old_id]
                     for old_id in sorted(ids)}

        # Subset the corpus accordingly
        filt_sent_corp = []
        for i in range(len(p_sentences)):
            corp_ = []
            for p in sent_corpus[i]:
                if p[0] in renum_dict:
                    corp_.append((renum_dict[p[0]], p[1]))
            filt_sent_corp.append(corp_)

        # New, smaller TF-IDF model over the filtered corpus
        tfidf_w2v = TfidfModel(filt_sent_corp, id2word=filt_dict,
                               smartirs='ltn')
        sent_w2v_tfidf = tfidf_w2v[filt_sent_corp]
        # Appropriate TF-IDF vectors
        w2v_tfidf_vecs = np.vstack(
            [sparse2full(c, len(filt_dict)) for c in sent_w2v_tfidf])
        # Collect all of the appropriate word2vec vectors, in new-id order so
        # the rows line up with the TF-IDF columns
        w2v_vecs = np.array(
            [model_w2v.wv[filt_dict[i]] for i in range(len(filt_dict))])
        w2v_vecs.shape = (len(filt_dict), 250)
        sent_w2v_mat = np.dot(w2v_tfidf_vecs, w2v_vecs)
        pickle.dump(sent_w2v_mat, open('sent_w2v_mat_samp4.pckl', 'wb'))

    return sent_vecs, sent_glove_mat, sent_w2v_mat
def construct_tfidf_model(self, model_path):
    model = TfidfModel(self.corpus)
    model.save(model_path)
    return model
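# Usage sketch (hypothetical `builder` object exposing the self.corpus
# attribute assumed by the method above):
tfidf = builder.construct_tfidf_model('models/tfidf.model')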
class JsonCorpus(object):
    def __iter__(self):
        data = json.load(open('../data/nasa.json'))
        desc = [
            TextBlob(dataset['description'].lower()).tokens
            for dataset in data['dataset']
        ]
        self.dictionary = Dictionary(desc)
        for d in desc:
            yield self.dictionary.doc2bow(d)


def score(text, tfidf, dictionary):
    return tfidf[dictionary.doc2bow(TextBlob(text.lower()).tokens)]


if __name__ == '__main__':
    if (os.path.exists('../data/tfidf.pkl')
            and os.path.exists('../data/nasa_dictionary.pkl')):
        tfidf = TfidfModel.load('../data/tfidf.pkl')
        dictionary = Dictionary.load('../data/nasa_dictionary.pkl')
    else:
        corpus = JsonCorpus()
        # one full pass builds corpus.dictionary as a side effect of __iter__
        bow_corpus = list(corpus)
        corpus.dictionary.save('../data/nasa_dictionary.pkl')
        dictionary = corpus.dictionary
        tfidf = TfidfModel(bow_corpus, dictionary=dictionary)
        tfidf.save('../data/tfidf.pkl')

    print(score('project completed', tfidf=tfidf, dictionary=dictionary))
# create a dictionary of words from our keywords
dictionary = corpora.Dictionary(processed_docs)
dictionary.save(path + 'dim_items_terms.dict')  # saved

# Creating and saving the corpus: a bag of words for each document
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
corpora.MmCorpus.serialize(path + 'dim_items_terms.mm', corpus)  # saved

# --- Creating the TF-IDF matrix
# step 1 -- initialize (train) the tf-idf transformation model on the corpus
tfidf = TfidfModel(corpus)
tfidf.save(path + 'dim_items_terms.tfidf')  # saved (save returns None)
tfidf_corpus = tfidf[corpus]

# checking the weights assigned by the tf-idf matrix ------
sorted_tfidf_weights = sorted(tfidf[corpus[0]],
                              key=lambda w: w[1], reverse=True)
for term_id, weight in sorted_tfidf_weights[:5]:
    print(dictionary.get(term_id), weight)

# computing cosine similarity from the tf-idf matrix ------
sims = Similarity('path1', tfidf[corpus], num_features=len(dictionary))
sims.save(path + '_saved_sims.similarity')

app = Flask(__name__)