Example #1
from gensim.similarities import Similarity
from gensim.test.utils import get_tmpfile


def train_and_save_indexer(corpus, dct, file_name='model_100_indexer.model'):
    # Build a sharded similarity index over the bag-of-words corpus and persist it.
    index_temp = get_tmpfile("index")
    indexer = Similarity(output_prefix=index_temp,
                         corpus=corpus,
                         num_features=len(dct),
                         num_best=6)
    indexer.save(file_name)
    return indexer
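
# Usage sketch (not part of the original example): assumes `corpus` and `dct`
# are the bag-of-words corpus and Dictionary used above. With num_best=6 the
# indexer returns up to six (document_id, cosine_similarity) pairs per query.
indexer = train_and_save_indexer(corpus, dct)
query_tokens = "example query text".split()   # stand-in for a real tokenized query
query_bow = dct.doc2bow(query_tokens)
for doc_id, score in indexer[query_bow]:
    print(doc_id, score)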
Example #2
class LSM:
    def __init__(self, model_name, index):
        self.model_name = model_name

        if self.model_name == 'LSI':
            self.model_file = lsi_model_file
            self.corpora_file = lsi_corpora_file
            self.similarity_file = lsi_sim_file
            self.num_topics = LSI_TOPICS
        elif self.model_name == 'LDA':
            self.model_file = lda_model_file
            self.corpora_file = lda_corpora_file
            self.similarity_file = lda_sim_file
            self.num_topics = LDA_TOPICS

        if not os.path.isfile(mm_corpus_file) or not os.path.isfile(dict_file):
            self.corpus = CorpusConnector(index)
            corpora.MmCorpus.serialize(mm_corpus_file, self.corpus)
            self.corpus.save_dict()
            self.dictionary = self.corpus.dictionary
        else:
            self.dictionary = corpora.Dictionary.load(dict_file)
            self.corpus = corpora.MmCorpus(mm_corpus_file)

        self.model = None
        self.corpora = None
        self.similarity_index = None

    def create_model(self):
        if not os.path.isfile(self.model_file):
            if self.model_name == 'LSI':
                self.model = lsimodel.LsiModel(corpus=self.corpus,
                                               id2word=self.dictionary,
                                               num_topics=self.num_topics)
            else:
                self.model = ldamodel.LdaModel(corpus=self.corpus,
                                               num_topics=self.num_topics,
                                               id2word=self.dictionary)
            self.model.save(self.model_file)

            self.corpora = self.model[self.corpus]
            corpora.MmCorpus.serialize(self.corpora_file, self.corpora)
        else:
            self.corpora = gensim.corpora.MmCorpus(self.corpora_file)
            if self.model_name == 'LSI':
                self.model = gensim.models.LsiModel.load(self.model_file)
            else:
                self.model = gensim.models.LdaModel.load(self.model_file)

    def create_similarity_index(self):
        if not os.path.isfile(self.similarity_file):
            self.similarity_index = Similarity('./LSM/', self.corpora,
                                               self.num_topics)
            self.similarity_index.save(self.similarity_file)
        else:
            self.similarity_index = Similarity.load(self.similarity_file)
Example #3
def main():
  orig_qns = [qn.strip() for qn in open('data/questions.txt')]
  aug = [qn.strip() for qn in open('data/augmented.txt')]
  all_qns = []
  for idx, qn in tqdm(enumerate(orig_qns)):
    all_qns.append(qn)
    if aug[idx] != qn:
      all_qns.append(aug[idx])
  print("Combined original questions and augmented questions")
  pickle.dump(all_qns, open("precompute/questions.pkl", 'wb'))

  qns = pickle.load(open("precompute/questions.pkl", 'rb'))
  documents = []
  for qn in tqdm(qns):
    document = get_similar.preprocess_text(qn)
    if len(document) < 1:
      document = ['UNK']
    documents.append(document)

  print(f"Finished preprocessing {len(documents)} questions")
  pickle.dump(documents, open("precompute/documents.pkl", "wb"))
  print("Saved tokens to documents.pkl")
  documents = pickle.load(open("precompute/documents.pkl", "rb"))
  
  dct = corpora.Dictionary(documents)
  pickle.dump(dct, open("precompute/dct.pkl", 'wb'))
  dct.save('precompute/dct.dict')
  dct = corpora.Dictionary.load('precompute/dct.dict')
  
  corpus = [dct.doc2bow(doc) for doc in tqdm(documents)]
  pickle.dump(corpus, open("precompute/corpus.pkl", 'wb'))
  print("Corpus generated")

  tfidf = models.TfidfModel(corpus, smartirs='bfn')
  pickle.dump(tfidf, open("precompute/tfidf_model.pkl", 'wb'))
  corpus_tfidf = tfidf[corpus]
  pickle.dump(corpus_tfidf, open("precompute/corpus_tfidf.pkl", 'wb'))
  print("tfidf generated")

  index_temp = get_tmpfile("index")
  index = Similarity(index_temp, corpus_tfidf, num_features=len(dct), num_best=100)
  index.save("precompute/similarities.pkl")
  print("Similarity index saved")

  PIPE = subprocess.PIPE
  #NLU = subprocess.Popen(['rasa', 'train', '--data', ' nlu-train-data', '--fixed-model-name', 'model', '-vv', 'nlu'], stdout=PIPE, stderr=PIPE)
  NLU = subprocess.Popen(['rasa', 'train', 'nlu', '-u', 'nlu-train-data', '--config', 'config.yml', '--fixed-model-name', 'model'])
  NLU.wait()
  print("Rasa NLU trained")
Example #4
def main(dataset_path):
    if not os.path.exists('../data/retriever/paragraph-ids.txt'):
        print('Writing paragraph ID to file...')
        with open('../data/retriever/paragraph-ids.txt', 'w') as f:
            for paragraph_id in load_ids(dataset_path):
                f.write(paragraph_id + '\n')

    dictionary_path = '../data/retriever/dct.pkl'
    if not os.path.exists(dictionary_path):
        print('Creating dictionary...')
        st = time.time()
        dct = Dictionary(load_paragraphs(dataset_path), prune_at=5000000)
        dct.save(dictionary_path, pickle_protocol=3)
        et = time.time()
        print(f'\rFinished creating dictionary in {et - st}s.')
    else:
        print('Loading dictionary...')
        dct = Dictionary.load(dictionary_path)
        print('Dictionary loaded.')

    tfidf_path = '../data/retriever/tfidf.pkl'
    if not os.path.exists(tfidf_path):
        print('Creating model...')
        st = time.time()
        corpus = map(dct.doc2bow, load_paragraphs(dataset_path))
        model = TfidfModel(corpus)
        model.save(tfidf_path, pickle_protocol=3)
        et = time.time()
        print(f'\rFinished creating model in {et - st}s.')
    else:
        print('Loading model...')
        model = TfidfModel.load(tfidf_path)
        print('Model loaded.')

    index_path = '../data/retriever/indexes/master-index'
    if not os.path.exists(index_path):
        print('Creating index...')
        st = time.time()
        corpus = map(dct.doc2bow, load_paragraphs(dataset_path))
        index = Similarity('../data/retriever/indexes/index', model[corpus],
                           len(dct))
        index.save(index_path)
        et = time.time()
        print(f'\rFinished creating index in {et - st}s.')
        print('Done')
    else:
        print('Nothing to do. Exiting...')
Example #5
def create_sim_matrix(tfidf, corpus, dictionary, outputDir):
    """"
    Creates a Gensim simiariry matrix for document similarity comparison and saves it
    
    tfidf (Gensim tfidf model): Gensim tfidf model
    corpus (Gensim corpus object): Gensim corpus
    dictionary (Gensim dictionary object): Gensim dictionary
    outputDir (string): Location to save matrix
    """
    indicesFile = outputDir + 'indices'
    simFile = outputDir + 'Index'
    sims = Similarity(indicesFile,
                      tfidf[corpus],
                      num_features=len(dictionary))
    sims.close_shard()
    sims.save(simFile)
    print('Similarity matrix created and stored at: ' + simFile)
Example #6
def fit_model(data, n_topics, iterations, passes, min_prob, eval_every, n_best,
              min_df, max_df, preserved_words):
    dt = cur_date()
    output_folder = "lda_%stopics_%s" % (n_topics, dt)
    os.makedirs(output_folder, exist_ok=True)
    os.makedirs("%s/separate" % output_folder, exist_ok=True)

    logging.info("creating corpus...")
    dictionary, corpus = make_corpus(list(data.values()), min_df, max_df,
                                     preserved_words, output_folder)
    # generate LDA model
    logging.info("training model...")
    lda = LdaModel(corpus,
                   num_topics=n_topics,
                   id2word=dictionary,
                   iterations=iterations,
                   passes=passes,
                   minimum_probability=min_prob,
                   eval_every=eval_every)
    logging.info("saving model...")
    lda.save('saved/lda_%s_%s.serialized' % (n_topics, dt))
    # print(lda.print_topics(num_topics=n_topics, num_words=4))

    # save all-vs-all pairwise similarities
    logging.info("creating index...")
    index = Similarity('./sim_index',
                       lda[corpus],
                       num_features=n_topics,
                       num_best=n_best + 1)
    paths = list(data.keys())
    logging.info("write all similarities to result file")
    with open('%s/similarities.txt' % output_folder, 'w') as res_file:
        with open('%s/similarities_summary.txt' % output_folder,
                  'w',
                  encoding='utf-8') as res_file_sum:
            for i, similarities in enumerate(index):
                cur_fname = get_filename(paths[i])
                top_similar = [(paths[s[0]], s[1]) for s in similarities
                               if s[0] != i]
                res_file.write('%s: %s\n' %
                               (cur_fname, [(get_filename(p), c)
                                            for (p, c) in top_similar]))

                res_file_sum.write('%s: %s\n' %
                                   (cur_fname, get_title(paths[i])))
                for sim in top_similar:
                    res_file_sum.write(
                        '%s: %s\n' % (get_filename(sim[0]), get_title(sim[0])))
                res_file_sum.write('-' * 100 + '\n')

                # for each doc we make a separate file that contains the list of similar docs
                with open('%s/separate/%s.txt' %
                          (output_folder, cur_fname.split('.')[0]), 'w') as sep_res:
                    for sim in top_similar:
                        sep_res.write('%s\n' % get_filename(sim[0]))

    logging.info("save index")
    index.save('saved/lda_index_%s.index' % dt)

    # save topic - words matrix
    with open("%s/topic_words.txt" % output_folder, 'w',
              encoding='utf-8') as f:
        for topic_words in lda.print_topics(lda.num_topics):
            f.write("#%s: %s\n" % (topic_words[0], topic_words[1]))

    # save document - topics matrix
    with open("%s/document_topics.txt" % output_folder, 'w') as f:
        for i, topics in enumerate(lda[corpus]):
            f.write("#%s: %s\n" % (get_filename(paths[i]), topics))

    # save dictionary
    dictionary.save_as_text("%s/dictionary.txt" % output_folder)
Example #7
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(output_prefix + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(output_prefix + '_bow.mm')

    # build tfidf, ~50min
    logger.info(">>> Building TF-IDF model ...")
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(output_prefix + '.tfidf_model')

    """
    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    logger.info(">>> Serializing TF-IDF vectors ...")
    MmCorpus.serialize(output_prefix + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
    del tfidf
    tfidf_corpus = MmCorpus(output_prefix + '_tfidf.mm')
    """

    logger.info(">>> Generating similarity index ...")
    #similarity_index = Similarity(output_prefix + "_similarity_index", tfidf_corpus, len(dictionary))
    similarity_index = Similarity(output_prefix + "_similarity_index", tfidf[mm], len(dictionary),
                                  use_reverse_index=True)
    del tfidf

    logger.info(">>> Serializing similarity index ...")
    similarity_index.save(output_prefix + '_similarity.index')

    logger.info(">>> Finished running %s" % program)
Example #8
import logging

import gensim
from gensim.similarities import Similarity, MatrixSimilarity

# from pgfin_timing import Timer

from pgfin_helpers import tokenize


logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore


# load the corpora

print "\n    Loading corpora.\n"
# tfidf_corpus = gensim.corpora.MmCorpus('./data/pgfintestdata20_tfidf.mm')
# lsi_corpus = gensim.corpora.MmCorpus('./data/pgfintestdata20_lsa.mm')
# tfidf_corpus = gensim.corpora.MmCorpus('./data/pgfin_tfidf.mm')
lsi_corpus = gensim.corpora.MmCorpus('./data/pgfin_lsa.mm')
# print(tfidf_corpus)
# print(lsi_corpus)

print "\n    Start similarity index.\n"
index = Similarity('./data/pgfin_index', lsi_corpus, num_features=lsi_corpus.num_terms)
index.save('./data/pgfin_index.index')  # save to disk
# print index
index_dense = MatrixSimilarity(lsi_corpus, num_features=lsi_corpus.num_terms)
index_dense.save('./data/pgfin_matrixindex.index')  # save to disk
# print index_dense
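
# Note (not from the original example): Similarity builds a sharded, disk-backed
# index suited to corpora that do not fit in RAM, while MatrixSimilarity keeps a
# single dense matrix in memory. Both answer queries the same way; `query_lsi`
# below is an assumed LSI query vector (e.g. lsi_model[dictionary.doc2bow(tokens)])
# that this snippet does not construct.
# sims_sharded = index[query_lsi]
# sims_dense = index_dense[query_lsi]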
Example #9
def initialise_query_structures(corpus, dictionary):
    index_tmpfile = get_tmpfile("index")
    index = Similarity(index_tmpfile, corpus=corpus, num_features=len(dictionary))
    index.save('../NLP_DocumentSimilarity/corpus/patent_query_index.index')
    return index
Example #10
#tfidf = TfidfModel(BOW_corpus)
#tfidf.save('../models/tfidf.model') #already provided
tfidf = TfidfModel.load('../models/tfidf.model')
corpora.mmcorpus.MmCorpus.serialize('../data/log_entropy_matrix',
                                    tfidf[BOW_corpus])

print('Saved LogEntropy TF-IDF matrix')

print('Creating Similarity Index')
logent_corpus = MmCorpus('../data/log_entropy_matrix')
num_feat = len(wiki.dictionary.keys())
index = Similarity('../data/logEntropyShards/logEntropySimilarity',
                   logent_corpus, num_features=num_feat)

index.save('../data/logEntropyShards/logEntropySimilarityIndex')
print('Saved Shards and similarity index')

print('Getting list of titles...')
bz2_wiki = bz2.BZ2File(wiki_file, "r")
extract = corpora.wikicorpus.extract_pages(bz2_wiki)
i = 0
matches = open('../data/title_matches.txt', 'a')
for title, doc, z in extract:
    wiki_filt = corpora.wikicorpus.filter_wiki(doc)
    doc_token = corpora.wikicorpus.tokenize(wiki_filt)
    bowbow = diction.doc2bow(doc_token)
    if bowbow == BOW_corpus[i]:
        i += 1
        print(unidecode(title), file=matches)
        if i % 100000 == 0: