def __init__(self, modelpackagepath, packageFiles):
    for key in packageFiles.keys():
        packageFiles[key] = absPath(os.path.join(modelpackagepath, packageFiles[key]))
    self.packageFiles = packageFiles
    self.models = {"dictionary": None, "m1Model": None, "m1Index": None,
                   "m2Model": None, "m2Index": None, "detectors": None,
                   "sampleUtterances": None, "mappings": None, "modelInfo": None}
    try:
        with open(self.packageFiles["modelInfo"], "r") as fp:
            self.models["modelInfo"] = ModelInfo(json.loads(fp.read()))
            fp.close()
    except:
        self.models["modelInfo"] = ModelInfo({})
    try:
        self.models["dictionary"] = corpora.Dictionary.load(self.packageFiles["dictionaryFile"])
    except:
        raise ModelFileLoadFailed("Failed to load dictionary from file " + self.packageFiles["dictionaryFile"])
    try:
        self.models["m1Model"] = TfidfModel.load(self.packageFiles["m1ModelFile"])
    except:
        raise ModelFileLoadFailed("Failed to load model from file " + self.packageFiles["m1ModelFile"])
    try:
        self.models["m1Index"] = similarities.MatrixSimilarity.load(self.packageFiles["m1IndexFile"])
    except:
        raise ModelFileLoadFailed("Failed to load index from file " + self.packageFiles["m1IndexFile"])
    try:
        self.models["m2Model"] = TfidfModel.load(self.packageFiles["m2ModelFile"])
        self.models["m2Model"] = None
        gc.collect()
    except:
        raise ModelFileLoadFailed("Failed to load model from file " + self.packageFiles["m2ModelFile"])
    try:
        self.models["m2Index"] = similarities.MatrixSimilarity.load(self.packageFiles["m2IndexFile"])
        self.models["m2Index"] = None
        gc.collect()
    except:
        raise ModelFileLoadFailed("Failed to load index from file " + self.packageFiles["m2IndexFile"])
    try:
        with open(self.packageFiles["detectorsFile"], "r") as f:
            self.models["detectors"] = json.loads(f.read())
            f.close()
    except:
        raise ModelFileLoadFailed("Failed to parse json from file " + self.packageFiles["detectorsFile"])
    if self.models["modelInfo"].detectorContentSplitted:
        try:
            with open(self.packageFiles["mappingsFile"], "r") as f:
                self.models["mappings"] = json.loads(f.read())
                f.close()
        except:
            raise ModelFileLoadFailed("Failed to parse json from file " + self.packageFiles["mappingsFile"])
    try:
        with open(self.packageFiles["sampleUtterancesFile"], "r") as f:
            self.models["sampleUtterances"] = json.loads(f.read())
            f.close()
        self.models["sampleUtterances"] = None
        gc.collect()
    except:
        raise ModelFileLoadFailed("Failed to parse json from file " + self.packageFiles["sampleUtterancesFile"])
def make_scores_for_sample():
    doc2vec_model = doc2vec.Doc2Vec.load('doc2vec_weigths')
    logging.info('doc2vec loaded')
    tfidf_unigram_model = TfidfModel.load('tfidf_unigram')
    logging.info('tfidf unigram loaded')
    tfidf_bigram_model = TfidfModel.load('tfidf_bigram')
    logging.info('tfidf bigram loaded')
    d1 = corpora.Dictionary.load('./dict_1.gensim')
    logging.info('dict1 loaded')
    d2 = corpora.Dictionary.load('./dict_2.gensim')
    logging.info('dict2 loaded')

    queries = pd.read_csv('./queries_norm.tsv', sep='\t', header=None, index_col=0)
    sample = pd.read_csv('./sample.csv', sep=',').sort_values(by=['DocumentId'])

    with open('./submission.csv', 'w') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(['QueryId', 'DocumentId', 'Score'])
        for idx, row in tqdm(sample.iterrows()):
            query_id = row['QueryId']
            doc_id = row['DocumentId']
            doc2vec_score = doc2vec_model.docvecs.similarity('DOC_%d' % doc_id, 'QUERY_%d' % query_id)

            doc = get_doc(doc_id)
            query = str(queries.loc[query_id])
            doc_title = str(doc[1])
            doc_content = str(doc[2])

            doc_title_words = doc_title.split()
            doc_content_words = doc_content.split()
            query_words = query.split()

            doc_title_bigrams = d2.doc2bow(list(map(lambda x: '\t'.join(x), zip(doc_title_words[:-1], doc_title_words[1:]))))
            doc_content_bigrams = d2.doc2bow(list(map(lambda x: '\t'.join(x), zip(doc_content_words[:-1], doc_content_words[1:]))))
            query_bigrams = d2.doc2bow(list(map(lambda x: '\t'.join(x), zip(query_words[:-1], query_words[1:]))))

            doc_title_words = d1.doc2bow(doc_title_words)
            doc_content_words = d1.doc2bow(doc_content_words)
            query_words = d1.doc2bow(query_words)

            doc_title_words = tfidf_unigram_model[doc_title_words]
            doc_content_words = tfidf_unigram_model[doc_content_words]
            query_words = tfidf_unigram_model[query_words]

            doc_title_bigrams = tfidf_bigram_model[doc_title_bigrams]
            doc_content_bigrams = tfidf_bigram_model[doc_content_bigrams]
            query_bigrams = tfidf_bigram_model[query_bigrams]

            tfidf_title_score_uni = matutils.cossim(doc_title_words, query_words)
            tfidf_content_score_uni = matutils.cossim(doc_content_words, query_words)
            tfidf_title_score_bi = matutils.cossim(doc_title_bigrams, query_bigrams)
            tfidf_content_score_bi = matutils.cossim(doc_content_bigrams, query_bigrams)

            score = (2 * tfidf_content_score_bi + 2 * tfidf_title_score_uni +
                     tfidf_content_score_uni + 0.5 * doc2vec_score) / 5.5
            writer.writerow([query_id, doc_id, score])
def make_lda_model():
    tfidf_model = TfidfModel.load((output_dir / 'tfidf_model.pkl').as_posix())
    lda_model = LdaModel(
        nmf_iterator(CONTENT_FILES,
                     Dict.load((output_dir / 'dict.pkl').as_posix()),
                     tfidf_model),
        num_topics=TOPIC_NUM)
    lda_model.save((output_dir / 'lda_model.pkl').as_posix())
def load_tfidf_model(path):
    '''
    Loads a TF-IDF model from file

    :param path: the path
    :type path: string
    '''
    model = TfidfModel.load(path)
    return model
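# Minimal usage sketch for load_tfidf_model above; the file paths and the tokens are
# hypothetical, and the Dictionary must be the one the tf-idf model was trained with.
from gensim.corpora import Dictionary

model = load_tfidf_model('models/tfidf_all.model')      # hypothetical path
dictionary = Dictionary.load('models/dictionary.dict')  # hypothetical path
bow = dictionary.doc2bow(['load', 'a', 'tfidf', 'model'])
print(model[bow])  # list of (term_id, tf-idf weight) pairs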
def tfidf_w2v_top5w(all_docs_prepro, id_dict):
    with open('../code/similarity/mappings/map_w2v_tfidf_5w.pkl', 'rb') as fp:
        Classes = pickle.load(fp)
    mapping = Classes['mapping']

    print('Loading Word2vec model')
    model_path = 'embedding/models/word2vec_all.model'
    model_w2v = Word2Vec.load(model_path)

    print('Loading Tfidf model')
    model_path = 'embedding/models/tfidf_all.model'
    model_tfidf = TfidfModel.load(model_path)

    dct = Dictionary(all_docs_prepro)
    corpus = [dct.doc2bow(line) for line in all_docs_prepro]

    mean_ticket_ques = top5_average('ticket_ques', corpus=corpus, dct=dct,
                                    model_w2v=model_w2v, model_tfidf=model_tfidf,
                                    id_dict=id_dict, all_docs_prepro=all_docs_prepro)

    return (mean_ticket_ques, mapping)
def tfidf_w2v_top5w(all_docs_prepro, id_dict, thresh):
    print('Loading Word2vec model')
    model_path = 'embedding/models/word2vec_all.model'
    model_w2v = Word2Vec.load(model_path)

    print('Loading Tfidf model')
    model_path = 'embedding/models/tfidf_all.model'
    model_tfidf = TfidfModel.load(model_path)

    dct = Dictionary(all_docs_prepro)
    corpus = [dct.doc2bow(line) for line in all_docs_prepro]

    mean_ticket_ans = top5_average(dat='ticket_ans', corpus=corpus, dct=dct,
                                   model_w2v=model_w2v, model_tfidf=model_tfidf,
                                   id_dict=id_dict, all_docs_prepro=all_docs_prepro)
    mean_faq_ans = top5_average(dat='faq_ans', corpus=corpus, dct=dct,
                                model_w2v=model_w2v, model_tfidf=model_tfidf,
                                id_dict=id_dict, all_docs_prepro=all_docs_prepro)

    output = compute_sim(mean_ticket_ans, mean_faq_ans, thresh)
    with open("../code/similarity/mappings/map_w2v_tfidf_5w.pkl", "wb") as fp:
        pickle.dump(output, fp)
def main():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    parser = OptionParser()
    parser.add_option('-f', '--corpus-file')
    parser.add_option('-p', '--parse-procs', default=1, type=int)
    parser.add_option('-s', '--sublexicalize-procs', default=1, type=int)
    parser.add_option('-t', '--tfidf-model')
    parser.add_option('-v', '--vocabulary')
    parser.add_option('-m', '--model-file')
    opts, args = parser.parse_args()

    corpus_fn = opts.corpus_file or sys.exit()
    n_proc_parse = opts.parse_procs
    n_proc_sublex = opts.sublexicalize_procs
    vocab_fn = opts.vocabulary
    tfidf_fn = opts.tfidf_model
    model_fn = opts.model_file or sys.exit()

    with BZ2File(corpus_fn) as f:
        corpus = SublexicalizedCorpus(WikiCorpus(corpus_fn, processes=n_proc_parse, dictionary=Dictionary()),
                                      order=(3, 6), clean_func=normalize_whitespace,
                                      n_proc=n_proc_sublex, create_dictionary=False)

        if vocab_fn and os.path.exists(vocab_fn):
            logging.info("Loading vocabulary from %s" % vocab_fn)
            vocab = Dictionary.load(vocab_fn)
        else:
            logging.info("Creating vocabulary")
            start = time.clock()
            vocab = Dictionary(corpus.get_texts())
            end = time.clock()
            logging.info("Vocabulary created in %d seconds" % (end - start))

            if vocab_fn:
                logging.info("Saving dictionary to %s" % vocab_fn)
                vocab.save(vocab_fn)

        corpus.dictionary = vocab
        corpus.dictionary.filter_extremes(no_below=5, no_above=.8)
        corpus.dictionary.compactify()

        if tfidf_fn and os.path.exists(tfidf_fn):
            logging.info("Reading TF-IDF model from %s" % tfidf_fn)
            tfidf = TfidfModel.load(tfidf_fn)
        else:
            logging.info("Creating TF-IDF model")
            tfidf = TfidfModel(corpus)

            if tfidf_fn:
                logging.info("Saving TF-IDF model to %s" % tfidf_fn)
                tfidf.save(tfidf_fn)

        bow_corpus = (tfidf[art] for art in corpus)

        model = LsiModel(corpus=bow_corpus, num_topics=10, id2word=corpus.dictionary)
        model.save(model_fn)
def main(JDK, url, title, query):
    dictionary = corpora.Dictionary.load('./TFIDF_Word2Vec/data/tfidf-w2v_dictionary.dict')
    tfidf = TfidfModel.load('./TFIDF_Word2Vec/data/tfidf.model')
    word2vec = gensim.models.keyedvectors.Word2VecKeyedVectors.load('./TFIDF_Word2Vec/data/word2vec.model')
    tfidf_w2v_model = models.keyedvectors.Word2VecKeyedVectors.load('./TFIDF_Word2Vec/data/tfidf-w2v.model')

    query_vec = get_tfidf_w2v_vec(query, dictionary, tfidf, word2vec)
    full_entity_score_vec = tfidf_w2v_model.similar_by_vector(query_vec, topn=False)
    sort_sims = sorted(enumerate(full_entity_score_vec), key=lambda item: -item[1])

    result = []
    for i in range(10):
        dic = {
            'url': url[sort_sims[i][0]].strip('\n'),
            'JDK': JDK[sort_sims[i][0]].strip('\n'),
            'title': title[sort_sims[i][0]].strip('\n'),
            'score': sort_sims[i][1]
        }
        result.append(dic)
    return result
def loadmodel(self, nameprefix):
    """ Load the topic model with the given prefix of the file paths.

    Given the prefix of the file paths, load the corresponding topic model.
    The files include a JSON (.json) file that specifies various parameters,
    a gensim dictionary (.gensimdict), and a topic model (.gensimmodel).
    If weighing is applied, load also the tf-idf model (.gensimtfidf).

    :param nameprefix: prefix of the file paths
    :return: None
    :type nameprefix: str
    """
    # load the JSON file (parameters)
    parameters = json.load(open(nameprefix + '.json', 'rb'))
    self.nb_topics = parameters['nb_topics']
    self.toweigh = parameters['toweigh']
    self.algorithm = parameters['algorithm']
    self.classlabels = parameters['classlabels']

    # load the dictionary
    self.dictionary = Dictionary.load(nameprefix + '.gensimdict')

    # load the topic model
    self.topicmodel = gensim_topic_model_dict[self.algorithm].load(nameprefix + '.gensimmodel')

    # load the similarity matrix
    self.matsim = MatrixSimilarity.load(nameprefix + '.gensimmat')

    # load the tf-idf model
    if self.toweigh:
        self.tfidf = TfidfModel.load(nameprefix + '.gensimtfidf')

    # flag
    self.trained = True
def tfidf(dataframe, max_words=None):
    """Returns a tf-idf model for documents stored in a DataFrame.

    Precomputed models are read from file if previously cached, or generated
    then cached otherwise.

    Parameters
    ----------
    dataframe : Pandas DataFrame
        The DataFrame containing the documents to process.
    max_words : int (default is 2000000)
        The maximum number of words stored by the model.

    Returns
    -------
    model : Gensim TfidfModel
        tf-idf model for documents stored in the DataFrame.
    """
    suffix = '_{}'.format(max_words) if max_words else ''
    filename = 'caches/models/tfidf{}.model'.format(suffix)
    if not os.path.isfile(filename):
        if max_words:
            dictionary = hashdictionary_corpus(dataframe, id_range=max_words)
        else:
            dictionary = dictionary_corpus(dataframe)
        tfidf_model = TfidfModel(dictionary=dictionary)
        tfidf_model.save(filename)
    else:
        tfidf_model = TfidfModel.load(filename)
    return tfidf_model
def tfidf_model(self):
    print('Logging Info - Get Tf-idf model...')
    tfidf_model_path = os.path.join(FEATURE_DIR, '{}_tfidf.model').format(self.genre)
    dict_path = os.path.join(FEATURE_DIR, '{}_tfidf.dict').format(self.genre)
    if os.path.exists(tfidf_model_path):
        dictionary = pickle_load(dict_path)
        tfidf_model = TfidfModel.load(tfidf_model_path)
    else:
        corpus = [
            text.split() for text in
            self.train_data['premise'] + self.train_data['hypothesis'] +
            self.dev_data['premise'] + self.dev_data['hypothesis'] +
            self.test_data['premise'] + self.test_data['hypothesis']
        ]
        dictionary = corpora.Dictionary(corpus)
        corpus = [dictionary.doc2bow(text) for text in corpus]
        tfidf_model = TfidfModel(corpus)
        del corpus
        tfidf_model.save(tfidf_model_path)
        pickle_dump(dict_path, dictionary)
    return dictionary, tfidf_model
def main():
    COURSE_NAME_STUBS = [
        "agile-planning-for-software-products",
        "client-needs-and-software-requirements",
        "design-patterns",
        "introduction-to-software-product-management",
        "object-oriented-design",
        "reviews-and-metrics-for-software-improvements",
        "service-oriented-architecture",
        "software-architecture",
        "software-processes-and-agile-practices",
        "software-product-management-capstone",
    ]

    for course_name in COURSE_NAME_STUBS:
        results_fp = os.path.join(DIR_PATH, "data", "eval.{}.pkl".format(course_name))
        course_results = None
        with open(results_fp, "rb") as rf:
            course_results = load(rf)

        tfidf_fp = os.path.join(DIR_PATH, "data", "tfidf.{}.pkl".format(course_name))
        # with open(tfidf_fp, "rb") as tfidf_f:
        tfidf_model = TfidfModel.load(tfidf_fp)
        idf_vec_size = len(tfidf_model.idfs)

        analyze_course_results(course_name, course_results, idf_vec_size)
def load(conf: Configuration, force: Optional[bool] = False, persist: Optional[bool] = True) -> "TFIDFRanker":
    model_path = conf.path_models + 'vsm_tfidf/' + conf.get_desc() + '/'
    if force or (not os.path.exists(model_path)) \
            or (not os.path.isfile(model_path + 'corpus.mm')) \
            or (not os.path.isfile(model_path + 'tfidf.model')):
        utils.mk_dir_if_not_exists(model_path)
        dataset = TFIDFRanker.extractor.load_dataset(conf=conf)
        dictionary = corpora.Dictionary([Ranker.get_text(conf, data) for (index, data) in dataset.iterrows()])
        bow_corpus = [(dictionary.doc2bow(Ranker.get_text(conf, data)), data['filename'])
                      for (index, data) in dataset.iterrows()]
        bow_corpus, names = map(list, zip(*bow_corpus))
        index_mapping = TFIDFRanker.build_index_mapping(names)
        corpora.MmCorpus.serialize(model_path + 'corpus.mm', bow_corpus)
        mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
        tfidf_model = TfidfModel(mm_corpus)
        tfidf_index = SparseMatrixSimilarity(tfidf_model[mm_corpus], num_features=mm_corpus.num_terms)
        ranker = TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                             model=tfidf_model, index=tfidf_index,
                             index_mapping=index_mapping, conf=conf)
        ranker.persist(model_path)
        logging.info('TFIDFRanker : initialized')
        logging.info('TFIDFRanker : model : {}'.format(tfidf_model))
        logging.info('TFIDFRanker : index : {}'.format(tfidf_index))
        return ranker
    else:
        dictionary = corpora.Dictionary.load(model_path + 'dict.dictionary')
        mm_corpus = corpora.MmCorpus(model_path + 'corpus.mm')
        tfidf_model = TfidfModel.load(model_path + 'tfidf.model')
        tfidf_index = SparseMatrixSimilarity.load(model_path + 'tfidf.index')
        with open(model_path + 'index_mapping.pickle', mode='rb') as file:
            index_mapping = pickle.load(file)
        logging.info('TFIDFRanker : initialized')
        return TFIDFRanker(dictionary=dictionary, bow_corpus=mm_corpus,
                           model=tfidf_model, index=tfidf_index,
                           index_mapping=index_mapping, conf=conf)
def __init__(self):
    self.host = 'localhost'
    self.port = 3306
    self.user = '******'
    self.password = '******'
    self.db = 'gaojiruangong'
    self.charset = 'utf8'
    db = pymysql.Connect(host=self.host, port=self.port, user=self.user,
                         passwd=self.password, db=self.db, charset=self.charset)
    cursor = db.cursor()
    query_sql = "SELECT id, api FROM apisamplecode"
    cursor.execute(query_sql)
    results = cursor.fetchall()

    all_api_name_set = set()
    for item in results:
        delete_left_brackets_api_name = item[1].split('(')[0]
        all_api_name_set.add(delete_left_brackets_api_name)
        api_name = delete_left_brackets_api_name.split('.')[-1].lower()
        api_id = item[0]
        if api_name in self.api_name_2_id.keys():
            self.api_name_2_id[api_name].append(api_id)
        else:
            self.api_name_2_id[api_name] = []
            self.api_name_2_id[api_name].append(api_id)
    self.all_qualified_api_name = list(all_api_name_set)

    self.dictionary = corpora.Dictionary.load(ROOT_DIR + '/output/model/tfidf/tfidf_dictionary.dict')
    self.index = similarities.Similarity.load(ROOT_DIR + '/output/model/tfidf/tfidf_index.index')
    self.tfidf = TfidfModel.load(ROOT_DIR + '/output/model/tfidf/tfidf.model')
def __init__(self):
    self.stopwords = stopwords.words('english')
    # Lemmatizer
    self.lmtzr = WordNetLemmatizer()
    # Stemmer
    self.stemmer = PorterStemmer()
    self.word2vec_model = None
    self.words = re.compile(r"\w+", re.I)
    try:
        self.bigrams = Phrases.load('slm/app/cached_models/bigrams.gensim')
    except:
        self.bigrams = None
    try:
        self.trigrams = Phrases.load('slm/app/cached_models/trigrams.gensim')
    except:
        self.trigrams = None
    try:
        self.dictionary = corpora.Dictionary.load('slm/app/cached_models/dictionary.dict')
    except:
        self.dictionary = None
    try:
        self.tfidf = TfidfModel.load('slm/app/cached_models/tfidf.gensim')
    except:
        self.tfidf = None
def __init__(self, model_prefix=None, num_best=None):
    self.model_prefix = model_prefix
    self.num_best = num_best
    if self.model_prefix is None:
        raise ValueError("model_prefix must be specified")

    logger.info("ESA: Loading word dictionary...")
    self.dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')

    logger.info("ESA: Loading document name map...")
    self.article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

    logger.info("ESA: Loading TF-IDF model...")
    self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

    logger.info("ESA: Loading similarity index...")
    self.similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')

    # logger.info("ESA: Preloading reverse indexes...")
    # self.similarity_index.preload_reverse_index()

    logger.info("ESA: Finished loading model files.")
def __init__(self):
    self.words = [["ewhor", "e-whor"], ["stresser", "booter"], [" rat "], ["crypt", "fud"]]
    # with open("data/models/word2vec.modelFile", "rb") as f:
    #     self.d2v_model = pickle.load(f)
    self.tfidf_model = TfidfModel.load("data/models/1tfidf.modelFile")
    self.dct = Dictionary.load("data/models/1tfidf.dct")
def init(self, system, subclass):
    conn = self.data_processor.connect_db(self.conf.db_host, self.conf.db_database,
                                          self.conf.db_user, self.conf.db_pass)
    # Load the vocabulary and the model
    t = time.time()
    logger.debug("Initializing model loading for [%s-%s]", system, subclass)

    dic_name = "dictionary_" + system + "_" + subclass + ".dic"
    dictionary = Dictionary.load(self.model_dir + "/" + dic_name)
    logger.debug("Loaded dictionary: %s", dic_name)
    logger.debug("The bag of words contains %d terms in total", len(dictionary.keys()))

    model_name = "tfidf_" + system + "_" + subclass + ".model"
    model = TfidfModel.load(self.model_dir + "/" + model_name)
    logger.debug("Loaded TF-IDF model: %s", model_name)

    df_train = pd.read_sql(
        "select * from monitor_cluster_dbscan where business_system_code='{}' and rule_type_code='{}'"
        .format(system, subclass), conn)

    # Fit a KNN model on this category's data, then use it for prediction
    knn = self.get_KNN_model(df_train, dictionary, model)
    duration(t, "Trained a KNN model on tf-idf vectors built from the dictionary and this category's data")

    if knn is not None:
        key = system + "-" + subclass
        value = {'model': model, 'dictionary': dictionary, 'knn': knn}
        self.models[key] = value
def train(self):
    if not os.path.exists(os.path.join(DATA_ANSWER_PATH, 'tfidf.model')):
        traindata = p.load(open(CORPUS_PATH, 'rb'))
        for qid in self.trainset:
            duplicates = self.trainset[qid]['duplicates']
            for duplicate in duplicates:
                question = duplicate['rel_question']['tokens']
                traindata.append(question)
                rel_comments = duplicate['rel_comments']
                for rel_comment in rel_comments:
                    q2 = rel_comment['tokens']
                    traindata.append(q2)

        self.dict = Dictionary(traindata)  # fit dictionary
        corpus = [self.dict.doc2bow(line) for line in traindata]  # convert corpus to BoW format
        self.tfidf = TfidfModel(corpus)  # fit model
        self.dict.save(os.path.join(DATA_ANSWER_PATH, 'dict.model'))
        self.tfidf.save(os.path.join(DATA_ANSWER_PATH, 'tfidf.model'))
    else:
        self.dict = Dictionary.load(os.path.join(DATA_ANSWER_PATH, 'dict.model'))
        self.tfidf = TfidfModel.load(os.path.join(DATA_ANSWER_PATH, 'tfidf.model'))
def __init__(self, dictionary_path, corpus_path, tfidf_path, corpus_tfidf_path,
             tfidf_index_sim_path, lsi_path, lsi_index_path, stopwords_path,
             tweet_corpus_path):
    self.dictionary = gensim.corpora.Dictionary.load(dictionary_path)
    self.corpus = MmCorpusMeta(corpus_path, id2word=self.dictionary, metadata=True)
    self.tweet_corpus = MmCorpusMeta(tweet_corpus_path, id2word=self.dictionary, metadata=True)
    self.tfidf = TfidfModel.load(tfidf_path)
    self.corpus_tfidf = gensim.utils.unpickle(corpus_tfidf_path)
    self.tfidf_index = gensim.similarities.MatrixSimilarity.load(tfidf_index_sim_path)
    self.lsi = LsiModel.load(lsi_path)
    self.lsi_index = gensim.similarities.MatrixSimilarity.load(lsi_index_path)
    with open(stopwords_path) as f:
        self.stopwords = json.load(f)
    self.tdidf_tweets = self.tfidf[self.tweet_corpus]
    self.lsi_tweets = self.lsi[self.tdidf_tweets]
    self.sim_tweets = gensim.similarities.MatrixSimilarity(self.lsi_tweets)
    print("loaded")
def __get_tfidf_model(self):
    if os.path.exists(os.path.join(self.out_dir, 'tfidf.model')):
        tfidf_model = TfidfModel.load(os.path.join(self.out_dir, 'tfidf.model'))
    else:
        raise FileNotFoundError('"tfidf.model" file not found!')
    return tfidf_model
def _load_model(self, model_name):
    self.logger.warn('Loading DocumentRetriever models...')
    model_dir = Path('./model')
    self.dct = Dictionary.load(str(model_dir / f'{model_name}.dict'))
    self.tfidf = TfidfModel.load(str(model_dir / f'{model_name}.tfidf'))
    self.nlp = spacy.load('en_core_web_md')
    self.embeddings = KeyedVectors.load(str(model_dir / 'wiki-news-300d-1M-subword'))
def load_data(self):
    if not self.tf_idf_model:
        if not os.path.exists(self.tf_idf_model_path):
            raise Exception('TF-IDF model file not found')
        self.dictionary = Dictionary.load(self.dictionary_path)
        self.tf_idf_model = TfidfModel.load(self.tf_idf_model_path)
def load_source():
    # Load the dictionary
    dictionary = corpora.Dictionary.load("./Model/dictionary.dic")
    # Load the TF-IDF model
    tfidf_vectors = TfidfModel.load("./Model/tfidf_vectors.model")
    # Load the corpus
    # corpus = corpora.MmCorpus('/corpus.mm')
    return dictionary, tfidf_vectors
def remove_duplicate_code(sample_codes, descriptions):
    Threshold = 0.9
    dictionary = corpora.Dictionary.load('./output/test/tfidf_dictionary.dict')
    index = similarities.Similarity.load('./output/test/tfidf_index.index')
    tfidf = TfidfModel.load('./output/test/tfidf.model')

    remove_code_index = []
    print(len(descriptions))
    for i in range(len(descriptions)):
        vec_bow = dictionary.doc2bow(descriptions[i])
        vec_tfidf = tfidf[vec_bow]
        sims = index[vec_tfidf]
        sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
        for j in range(len(sort_sims)):
            if sort_sims[j][1] < Threshold:
                break
            else:
                if sample_codes[i]['API'] == sample_codes[sort_sims[j][0]]['API'] and i != sort_sims[j][0]:
                    if abs(70 - len(sample_codes[i]['Description'].strip().split(' '))) < \
                            abs(70 - len(sample_codes[sort_sims[j][0]]['Description'].strip().split(' '))):
                        # vec_bow1 = dictionary.doc2bow([sample_codes[i]['Code']])
                        # vec_tfidf1 = tfidf[vec_bow1]
                        # sims1 = index[vec_tfidf1]
                        #
                        # vec_bow2 = dictionary.doc2bow([sample_codes[sort_sims[j][0]]['Code']])
                        # vec_tfidf2 = tfidf[vec_bow2]
                        # sims2 = index[vec_tfidf2]
                        # if sims1[i] > sims2[sort_sims[j][0]]:
                        # if tfidf.similarity(sample_codes[i]['Code'], sample_codes[i]['Description']) > tfidf.similarity(sample_codes[sort_sims[j][0]]['Code'], sample_codes[sort_sims[j][0]]['Description']):
                        remove_code_index.append(sort_sims[j][0])
                    else:
                        remove_code_index.append(i)
        print(i)

    sample_codes_index = set(range(len(sample_codes)))
    remove_code_index = set(remove_code_index)
    index = list(sample_codes_index - remove_code_index)
    sample_codes = [sample_codes[i] for i in index]

    # Save the fully qualified API name, the sample code, and the text description
    save_file = []
    save_path = "RemoveDuplicateSampleCode.json"
    for sample_code in sample_codes:
        json_save = {}
        json_save['API'] = sample_code['API']
        json_save['Code'] = sample_code['Code']
        json_save['Description'] = sample_code['Description']
        save_file.append(json_save)
    with open(OUTPUT_DIR + '/' + save_path, 'w', encoding='utf-8') as json_file:
        json.dump(save_file, json_file, indent=4)
def fit(self, raw_documents, y=None):
    self.analyzer_func = self.build_analyzer()
    self.model = LsiModel.load(self.model_fn)
    if os.path.exists(self.model_fn + '.tfidf'):
        self.tfidf = TfidfModel.load(self.model_fn + '.tfidf')
    return self
def load(self):
    """ load the corpora created by `make_corpus.py` """
    self.corpus = MmCorpus(self.corpus_file)
    self.dictionary = Dictionary.load_from_text(self.dict_file)
    self.titles = load_titles(self.title_file)
    self.tfidf_model = TfidfModel.load(self.tfidf_model_file)
    self.index = MatrixSimilarity(self.tfidf_model[self.corpus])
def generate_model(dictionary, bow_corpus, corpus_path):
    try:
        tfidf = TfidfModel.load(corpus_path + 'wiki-tfidf.model')
        print('tfidf model loaded')
    except:
        tfidf = TfidfModel(bow_corpus, dictionary)
        tfidf._smart_save(corpus_path + 'wiki-tfidf.model')
    return tfidf
def get_tfidf(self, path):
    path = path + '.tfidf'
    if not os.path.exists(path):
        tfidf_model = TfidfModel(self.corpus, smartirs='ntc')
        tfidf_model.save(path)
        # reweight the corpus with the tf-idf model
        self.corpus = tfidf_model[self.corpus]
    else:
        tfidf_model = TfidfModel.load(path)
    return tfidf_model
def load(self, dir_path):
    dir_path = Path(dir_path)
    vocab_path = str(dir_path / self.VOCAB_FNAME)
    model_path = str(dir_path / self.TFIDF_FNAME)
    index_path = str(dir_path / self.INDEX_FNAME)
    self.vocab = Dictionary.load(vocab_path)
    self.model = TfidfModel.load(model_path)
    self.index = SparseMatrixSimilarity.load(index_path)
def __load_from_disk(self, path):
    """
    Function that is used internally to load and set-up the class state

    :param path: Location from where the class internal state should be loaded
    :return: None, side-effect on the class on which this is called
    """
    # Read config
    with open(os.path.join(path, 'config.json')) as f:
        params = jsonpickle.decode(f.read())
    self.net_size_in_days = params['net_size_in_days']
    self.min_tok_len = params['min_tok_len']
    self.undersample_multiplicity = params['undersample_multiplicity']
    self.prediction_threshold = params['prediction_threshold']
    self.use_sim_cs = params['use_sim_cs']
    self.use_sim_j = params['use_sim_j']
    self.use_sim_d = params['use_sim_d']
    self.use_social = params['use_social']
    self.use_temporal = params['use_temporal']
    self.use_file = params['use_file']
    self.use_pr_only = params['use_pr_only']
    self.use_issue_only = params['use_issue_only']
    self.predictions_between_updates = params['predictions_between_updates']
    name = params['name']

    try:
        with open(os.path.join(path, name, 'repository_data.json')) as f:
            self.repository_obj = jsonpickle.decode(f.read())
        with open(os.path.join(path, name, 'truth_data.json')) as f:
            self.truth = jsonpickle.decode(f.read())
    except FileNotFoundError:
        pass
    try:
        with open(os.path.join(path, name, 'fingerprint_data.json')) as f:
            self.fingerprint = jsonpickle.decode(f.read())
    except FileNotFoundError:
        pass
    try:
        self.dictionary = Dictionary.load_from_text(os.path.join(path, 'tfidf', 'term2id.txt'))
        self.model = TfidfModel.load(os.path.join(path, 'tfidf', 'model.tfidf'))
        with open(os.path.join(path, name, 'stopwords_data.json')) as f:
            self.stopwords = jsonpickle.decode(f.read())
    except FileNotFoundError:
        pass
    try:
        self.clf = pickle.load(open(os.path.join(path, 'clf_model', 'model.p'), 'rb'))
    except FileNotFoundError:
        pass
    try:
        self.feature_generator = pickle.load(open(os.path.join(path, 'feature_generator', 'gen.p'), 'rb'))
    except FileNotFoundError:
        pass
def __init__(self):
    self.dictionary = Dictionary.load(app.config["RCMDR_DICT"])
    self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"])
    self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"])
    self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"])
    self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"])
    self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"])
    self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"])
    self.job_labels = {
        int(k): v
        for k, v in (line.split("=")
                     for line in open(app.config["RCMDR_JOB_LABELS"]).read().strip().split("\n"))
    }
def _tfidf(corpus, dictionary):
    tfidf_file_name = get_tfidf_file_name(CORPUS_FILES["label"])
    try:
        tfidf = TfidfModel.load(tfidf_file_name)
    except FileNotFoundError:
        corpus_numeric = [dictionary.doc2bow(document) for document in corpus]
        tfidf = TfidfModel(corpus=corpus_numeric)
        print("File does not exist - creating the tfidf model")
        create_file_and_folders_if_not_exist(tfidf_file_name)
        tfidf.save(tfidf_file_name)
    return tfidf
# corpus.save(f_bow)
else:
    # models will be trained on your own corpus
    if os.path.exists(f_bow):
        corpus = TextCorpus.load(f_bow)
    else:
        corpus = TextCorpus(f_corpus)
        # corpus.save(f_bow)
        # filter dictionary
        corpus.dictionary.filter_extremes(no_below=0, no_above=1, keep_n=voc_size)
        corpus.dictionary.save(f_dict)
        corpus.save(f_bow)

    # tf-idf model
    if os.path.exists(f_tfidf):
        tfidf = TfidfModel.load(f_tfidf)
    else:
        tfidf = TfidfModel(corpus, id2word=corpus.dictionary)
        tfidf.save(f_tfidf)

    # TRAINING
    # lsa model
    if not os.path.exists(f_lsa):
        lsa = LsiModel(tfidf[corpus], id2word=corpus.dictionary, num_topics=lsa_dim)
        lsa.save(f_lsa)

    # word2vec model
    class MyCorpus():
        def __iter__(self):
            for d in corpus.get_texts():
# print('Saved dictionary')

print('Creating LogEntropy TF-IDF and regular TF-IDF matrices and models')
BOW_corpus = MmCorpus('../data/wiki_en_vocab200k')  # Resurrect BOW corpus

# log_entropy = LogEntropyModel(BOW_corpus)
# log_entropy.save('../models/logEntropy.model')  # already provided
log_entropy = LogEntropyModel.load('../models/logEntropy.model')
corpora.mmcorpus.MmCorpus.serialize('../data/log_entropy_matrix',
                                    log_entropy[BOW_corpus])
print('Saved LogEntropy TF-IDF matrix')

# tfidf = TfidfModel(BOW_corpus)
# tfidf.save('../models/tfidf.model')  # already provided
tfidf = TfidfModel.load('../models/tfidf.model')
# NOTE: this writes to the same path as the log-entropy matrix above.
corpora.mmcorpus.MmCorpus.serialize('../data/log_entropy_matrix',
                                    tfidf[BOW_corpus])
print('Saved TF-IDF matrix')

print('Creating Similarity Index')
logent_corpus = MmCorpus('../data/log_entropy_matrix')
num_feat = len(wiki.dictionary.keys())
index = Similarity('../data/logEntropyShards/logEntropySimilarity',
                   logent_corpus, num_features=num_feat)
index.save('../data/logEntropyShards/logEntropySimilarityIndex')
print('Saved Shards and similarity index')

print('Getting list of titles...')
def scorer(model, dic):
    tfidf = TfidfModel.load(model)
    dictionary = Dictionary.load(dic)

    def score(words):
        return tfidf[dictionary.doc2bow(words)]

    return score
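# Usage sketch for the scorer factory above; the file paths and tokens are hypothetical,
# and any TfidfModel/Dictionary pair saved with gensim's .save() works.
score = scorer('models/tfidf.model', 'models/dictionary.dict')
print(score(['quick', 'brown', 'fox']))  # sparse (term_id, weight) pairs for tokens known to the dictionary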
if len(sys.argv) < 2:
    print(inspect.cleandoc(__doc__) % locals())
    sys.exit(1)

model_prefix = sys.argv[1]

logger.info("running %s" % ' '.join(sys.argv))

logger.info("Loading word dictionary...")
dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')
logger.debug(dictionary)

logger.info("Loading document name map...")
article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

logger.info("Loading tf-idf model...")
tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

logger.info("Loading similarity index...")
similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')
similarity_index.use_reverse_index = True

logger.info("Finished loading model files.")

mismatches = 0
for doc_idx in range(0, len(similarity_index)):
    logger.info("Checking doc: %d %s" % (doc_idx, article_dict[doc_idx]))
    rev_doc = scipy.sparse.dok_matrix((1, len(dictionary)), dtype=np.float64)
    fwd_doc = similarity_index.vector_by_id(doc_idx)
    for feature_id, val in enumerate(fwd_doc.toarray().flatten()):
        if val == 0:
            continue
        feat_rev_docs = similarity_index.docs_by_feature_id(feature_id).toarray().flatten()
if len(sys.argv) < 3:
    print(inspect.cleandoc(__doc__) % locals())
    sys.exit(1)

input_file, output_prefix = sys.argv[1:3]

logger.info("running %s" % ' '.join(sys.argv))

logger.info("Loading word dictionary...")
dictionary = Dictionary.load_from_text(output_prefix + '_wordids.txt.bz2')
logger.debug(dictionary)

logger.info("Loading document name map...")
article_dict = utils.unpickle(output_prefix + '_bow.mm.metadata.cpickle')

logger.info("Loading tf-idf model...")
tfidf = TfidfModel.load(output_prefix + '.tfidf_model')

logger.info("Loading similarity index...")
similarity_index = Similarity.load(output_prefix + '_similarity.index', mmap='r')
similarity_index.use_reverse_index = True
similarity_index.preload_reverse_index()

logger.info("Finished loading model files.")

logger.info("Processing input documents...")

try:
    infile = open(input_file, 'r')
except IOError:
    print('cannot open %s' % (input_file,))
    sys.exit(1)
# What about the raw, unprocessed unicode tweet text itself?

# In[6]:

import gzip

with gzip.open(os.path.join(DATA_PATH, 'datetimes.csv.gz'), 'rb') as f:
    nums = pd.read_csv(f, engine='python', encoding='utf-8')
with gzip.open(os.path.join(DATA_PATH, 'text.csv.gz'), 'rb') as f:
    corpus = pd.DataFrame.from_csv(f, encoding='utf8')

# Now load previously compiled vocabulary and TFIDF matrix (transformation)

# In[11]:

tfidf = TfidfModel.load(os.path.join(DATA_PATH, 'tfidf'))
tfidf.num_docs

# In[17]:

bows = pd.Series(vocab.doc2bow(toks) for toks in corpus.tokens)
bows

# This would make a nice, compact sparse matrix representation of our entire corpus...
# Which would mean we could do more in RAM at once.
# Left as an exercise. (check out `scipy.sparse.coo_matrix`)

# In[18]:
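# A possible sketch of the exercise hinted at above: pack the (term_id, count) pairs in
# `bows` into a single scipy.sparse.coo_matrix, assuming `bows` and `vocab` exist as in
# the cells above. One row per tweet, one column per vocabulary term.
import numpy as np
from scipy.sparse import coo_matrix

rows, cols, vals = [], [], []
for doc_idx, bow in enumerate(bows):
    for term_id, count in bow:
        rows.append(doc_idx)
        cols.append(term_id)
        vals.append(count)

corpus_matrix = coo_matrix((np.array(vals), (np.array(rows), np.array(cols))),
                           shape=(len(bows), len(vocab)))
corpus_matrix.shape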