def load_model(self, model_type):
        model = None
        try:
            if model_type == 'tfidf':
                model = TfidfModel.load(self.tfIdfPath, mmap='r')
                self.tfIdfModel = model
            elif model_type == 'lsi':
                model = LsiModel.load(self.lsiPath, mmap='r')
                self.lsiModel = model
            elif model_type == 'lda':
                model = LdaModel.load(self.ldaPath, mmap='r')
                self.ldaModel = model
            elif model_type == 'w2v':
                model = Word2Vec.load(self.w2vPath, mmap='r')
                self.w2vModel = model
            else:
                logger.error('Unexpected model type: %s' % model_type)
                return None

            if self.dictionary is None and os.path.exists(self.dictPath):
                self.dictionary = corpora.Dictionary.load(self.dictPath)

            logger.info('%s model loaded completely.' % model_type)
        except IOError:
            logger.error(
                "The %s model doesn't exist. Please train the model before loading it."
                % model_type)
        return model
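
A hedged usage sketch (the owning class, its attribute names, and the `tokens` variable are assumptions based on the method above, not from the source):

# svc = SimilarityService()          # hypothetical owner of load_model()
# tfidf = svc.load_model('tfidf')    # returns None when the model file is missing
# if tfidf is not None and svc.dictionary is not None:
#     vec = tfidf[svc.dictionary.doc2bow(tokens)]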
Example #2
    def load(self, path='default'):
        """
        :param path: directory that holds the trained model files.
        :return:
        """
        if path == 'default':
            path = 'model'
        for file in os.listdir(path):
            if file.endswith('.model'):
                self.model_name = file.split('.')[0]
        if self.model_name == 'lda':
            self.model = LdaModel.load(os.path.join(path, 'lda.model'))
        elif self.model_name == 'lsi':
            self.model = LsiModel.load(os.path.join(path, 'lsi.model'))
        elif self.model_name == 'hdp':
            self.model = HdpModel.load(os.path.join(path, 'hdp.model'))

        with open(os.path.join(path, 'original_data.pickle'), 'rb') as f:
            self.original_data = pickle.load(f)
        with open(os.path.join(path, 'text.pickle'), 'rb') as f:
            self.text = pickle.load(f)
        with open(os.path.join(path, 'token.pickle'), 'rb') as f:
            self.token = pickle.load(f)
        with open(os.path.join(path, 'corpus.pickle'), 'rb') as f:
            self.corpus = pickle.load(f)

        path = os.path.join(path, 'result')
        with open(os.path.join(path, 'topic_key.pickle'), 'rb') as f:
            self.topic_key = pickle.load(f)
        with open(os.path.join(path, 'doc_topic.pickle'), 'rb') as f:
            self.doc_topic = pickle.load(f)
        with open(os.path.join(path, 'topic_doc.pickle'), 'rb') as f:
            self.topic_doc = pickle.load(f)
        with open(os.path.join(path, 'topic_sent.pickle'), 'rb') as f:
            self.topic_sent = pickle.load(f)

        self.id2word = self.model.id2word
        if self.model_name == 'hdp':
            self.num_topics = self.topic_doc.shape[0]
        else:
            self.num_topics = self.model.num_topics
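
A hedged usage sketch (the wrapper class name is an assumption, not from the source):

# tm = TopicModelWrapper()   # hypothetical class that owns load()
# tm.load('model')           # expects an lda/lsi/hdp .model file plus the pickled data
# print(tm.model_name, tm.num_topics)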
Example #3
def representation(self):
    if not self.model:
        print("LOAD MODEL...")
        self.model = LsiModel.load(
            os.path.join(self.preprocessor.source.path,
                         self.preprocessor.source.info + '.model'))
        self.dictionary = Dictionary.load(
            os.path.join(self.preprocessor.source.path,
                         self.preprocessor.source.info + '.dic'))
Example #4
def __init__(self):
    self.dictionary = Dictionary.load(app.config["RCMDR_DICT"])
    self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"])
    self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"])
    self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"])
    self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"])
    self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"])
    self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"])
    with open(app.config["RCMDR_JOB_LABELS"]) as f:
        self.job_labels = {
            int(k): v
            for k, v in (line.split("=") for line in f.read().strip().split("\n"))
        }
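
Judging from the parse above, the job-labels file holds one "id=label" pair per line; an illustrative (not source-provided) example:

# 1=Software Engineer
# 2=Data Analyst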
Example #5
def __getitem__(self, modelo):
    '''
    Returns the corresponding model.
    Parameters:
        modelo (str) --> model selector; one of "tfidf", "tfidf_pivot", "lsi", "lda" or "doc2vec"
    Returns: the requested model, if it exists
    '''
    if not os.path.isfile(self._arqs['modelos'][modelo]):
        print(f'The model "{modelo}" has not been implemented or built.')
        return None
    if modelo in ['tfidf', 'tfidf_pivot']:
        model = TfidfModel.load(self._arqs['modelos'][modelo])
    elif modelo == 'lsi':
        model = LsiModel.load(self._arqs['modelos'][modelo])
    elif modelo == 'lda':
        model = LdaModel.load(self._arqs['modelos'][modelo])
    elif modelo == 'doc2vec':
        model = Doc2Vec.load(self._arqs['modelos'][modelo])
    return model
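
A hedged usage sketch (the container object is an assumption based on __getitem__ above):

# store = ModelStore(...)   # hypothetical object exposing __getitem__
# lsi = store['lsi']        # prints a warning and returns None if the file is absent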
Example #6
def main():
    try:
        dictionary = Dictionary.load_from_text("dictionary.txt")
    except (IOError, OSError):
        dictionary = Dictionary(rcv1_train)
        dictionary.filter_extremes()
        dictionary.save_as_text("dictionary.txt")

    class RCV1BowCorpus(object):
        def __iter__(self):
            for document in rcv1_train:
                yield dictionary.doc2bow(document)

    ln.debug("Training model on %s documents" % len(rcv1_train))
    try:
        vector_model = LsiModel.load("lsi_model")
    except (IOError, OSError):
        vector_model = LsiModel(corpus=RCV1BowCorpus(),
                                num_topics=100,
                                id2word=dictionary)
        vector_model.save("lsi_model")

    def get_lsi_features(text):
        """
        Must return either numpy array or dictionary
        """
        res = vector_model[dictionary.doc2bow(text)]
        return dict(res)

    def get_bow_features(text):
        return dict(dictionary.doc2bow(text))

    clf = train_classifier(train_samples=rcv1_train,
                           train_targets=rcv1_train_target,
                           get_features=get_lsi_features,
                           classifier="sgd")

    evaluate_classifier(clf,
                        rcv1_test,
                        rcv1_test_target,
                        get_features=get_lsi_features)
Example #7
    def get_lsa_model(self, n_topics=50, recalculate=False, from_scratch=True):

        filepath = self.paths.get_lsa_filepath(n_topics)

        if not os.path.isfile(filepath) or recalculate:

            if not from_scratch:
                raise ValueError('No LSA file exists but from_scratch is False')

            trigram_dictionary = self.lda_builder.get_corpus_dict()
            trigram_bow_corpus = self.lda_builder.get_trigram_bow_corpus(trigram_dictionary)

            print('Building LSA model...')
            lsi = LsiModel(trigram_bow_corpus, id2word=trigram_dictionary, num_topics=n_topics)

            lsi.save(filepath)
            print('LSA model (n_topics={}) written to {}'.format(n_topics, filepath))
        else:
            print('Loading LSA model (n_topics={})...'.format(n_topics))
            lsi = LsiModel.load(filepath)

        return lsi
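
A hedged usage sketch (the builder object is an assumption based on the attributes used above):

# builder = LsaBuilder(...)                  # hypothetical owner of get_lsa_model()
# lsi = builder.get_lsa_model(n_topics=50)   # loads the cached model, or trains and saves it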
Example #8
File: lsi.py Project: ilanfri/kpmg1
def main():

    start_time = time.time()

    rootdir = os.getcwd()
    foldername = 'lsi_output'
    folderpath = os.path.join(rootdir, foldername)
    if not os.path.exists(folderpath) or args.force:
        topics, lsi = createLsiModelforCorpus(args.corpus, args.dict, args.ntopics)
    else:
        os.chdir(folderpath)
        lsimodelfile = str(args.corpus).replace('.mm', '') + '_lsi.model'
        topicsfile = str(args.corpus).replace('.mm', '') + '_lsi_topics.pkl'
        modelpath = os.path.join(folderpath, lsimodelfile)
        topicspath = os.path.join(folderpath, topicsfile)
        lsi = LsiModel.load(modelpath)
        with open(topicspath, 'rb') as f:  # pickle files must be opened in binary mode
            topics = pickle.load(f)
        with open('lsi_corpus_topics.txt', 'w') as f:
            f.write(str(topics))
        os.chdir(rootdir)
        
    pp.pprint(lsi.show_topics(num_topics=args.ntopics, num_words=10, log=False, formatted=True))

    corpus = corpora.MmCorpus(args.corpus)

    if args.query != -1:
        queryresult = lsi[corpus[args.query]]
        sortedqueryresult = sorted(list(queryresult), key=lambda query: abs(query[1]), reverse=True)
        print("\nSimilarity of document number {0} in corpus with corpus topics:".format(args.query))
        pp.pprint(sortedqueryresult)

    
    # Generate topic probability-document matrix, along with vector containing most probable topic (assumed to be the label) for each document
    #os.chdir(folderpath)
    outlabel_name = 'lsi_document_labels_{0}.txt'.format(str(args.corpus).replace('.mm', ''))

    outtopic_name = 'lsi_topic_vectors_{0}.txt'.format(str(args.corpus).replace('.mm', ''))

    outlabelpath = os.path.join(folderpath, outlabel_name)
    outtopicpath = os.path.join(folderpath, outtopic_name)
    if not os.path.exists(outlabelpath) or not os.path.exists(outtopicpath):

        with open(outtopic_name, 'w') as outtopic, open(outlabel_name, 'w') as outlabel:

            for idx, doc in enumerate(corpus):

                tops = lsi[doc]
                doc_tops = []
                for j in range(args.ntopics):
                    search = [v[1] for v in tops if v[0] == j]

                    if len(search) > 0:
                        doc_tops.append(search[0])
                    else:
                        doc_tops.append(0.)

                most_important = doc_tops.index(max(doc_tops))
                outlabel.write('{0}\n'.format(most_important))
                outtopic.write('\t'.join(str(d) for d in doc_tops) + '\n')

        # move the fresh outputs into the output folder (skip when they already exist there)
        shutil.move(outlabel_name, folderpath)
        shutil.move(outtopic_name, folderpath)


    #os.chdir(rootdir)
 
    end_time = time.time()
    runtime = end_time - start_time
    print("\nRuntime: {0} seconds\n".format(runtime))
Example #9
from datetime import datetime
from date_extractor import month_to_number
from gensim.corpora import Dictionary
from gensim.models.lsimodel import LsiModel
from nltk.corpus import stopwords as nltk_stopwords
from os.path import dirname, realpath

try:
    path_to_directory_of_this_file = dirname(realpath(__file__))

    stopwords = []
    with open(path_to_directory_of_this_file + "/stopwords.txt", encoding="utf-8") as f:
        stopwords.extend([word for word in f.read().split("\n") if word and not word.startswith("#")])
    stopwords = set(stopwords)

    lsi = LsiModel.load(path_to_directory_of_this_file + "/model")

    dictionary = Dictionary.load(path_to_directory_of_this_file + "/dictionary")
except Exception as e:
    print("Exception trying to load LSI index. You can most likely ignore this:", e)

def run(text):

    try:

        words = text.lower().replace("#"," ").replace("_"," ").replace("("," ").replace(")"," ").replace("/"," ").replace(":"," ").replace("."," ").split()
        words = [word for word in words if len(word) > 3 and word not in stopwords]

        if words:
            probabilities = lsi[dictionary.doc2bow(words)]
            if probabilities:
Example #10
def get_matrix_similarity(tweet, data):
    '''
    in:
      tweet:
        list of lemmatized strings from tweet body. Output of preprocessing
      data:
        Dict containing filenames of assoc. files
        format:
          dataset:
            filename of dataset csv file
          dict:
            filename of dictionary file
          model:
            filename of lsa model
          output_name:
            name to be used for all output files generated
          directory:
              directory name where outputs will be saved. If null, output_name is used
    '''
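    # Illustrative shape of `data` (file names are assumptions, not from the source):
    # data = {
    #     'dataset': 'tweets.csv',        # read from ./data/dataset/
    #     'dict': 'tweets.dict',          # read from ./data/dicts/
    #     'model': 'lsa_tweets.model',    # read from ./data/models/
    #     'output_name': 'run1',
    #     'directory': 'run1',            # optional; output_name is used when absent
    # }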

    df = pd.read_csv('./data/dataset/%s' % data['dataset'])
    lemmas_list = []

    for lemmas in df['lemmas']:
        lemmas = str(lemmas)
        lemmas = lemmas.replace('[', '').replace(']', '').replace(',', '').replace("'", '')
        lemmas_list.append(lemmas.split())

    dictionary = corpora.Dictionary.load('./data/dicts/%s' % data['dict'])
    clean_doc = [dictionary.doc2bow(text) for text in lemmas_list]

    lsi = LsiModel.load('./data/models/%s' % data['model'])

    index = similarities.MatrixSimilarity(lsi[clean_doc])

    make_dir('./data/similarities/')
    directory = ''
    if 'directory' in data:
        make_dir('./data/similarities/%s' % data['directory'])
        directory = data['directory']
    else:
        make_dir('./data/similarities/%s' % data['output_name'])
        directory = data['output_name']

    data['directory'] = directory

    data['filename'] = []
    for counter, tw in enumerate(tweet):
        corpus = lsi[dictionary.doc2bow(tw)]

        with open('./data/similarities/%s/similarities_%i.txt' %
                  (directory, counter), 'w+') as file:
            for doc in sorted(enumerate(index[corpus]),
                              key=lambda item: -item[1]):
                file.write(str(doc) + '\n')

        data['filename'].append('similarities_%i.txt' % counter)

    return data
Example #11
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    result_path = path.join(base_path, p['result_path'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    # load model and corpus
    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(result_path,
                                           p['run'], p['dict_extension']))

    model_path = path.join(result_path, p['run'], p['lsi_ext'])
    logger.info('load model from: %s' % model_path)
    lsi = LsiModel.load(model_path)
    pre = SaveLoad.load(path.join(result_path, p['run'], p['pre_model_ext']))

    logger.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text,
                                        allow_update=False,
                                        return_missing=False)
                    for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (only pre model)')
    corpus_pre = pre[bow_lee_texts]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(path.join(base_path, p['human_data_file']))
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    max_topics = lsi.num_topics

    logger.info("iterate from %d to %d dimensions (stepsize: %d)" %
                (p['min_dim'], max_topics, p['dim_step']))

    iter_range = range(p['min_dim'], max_topics, p['dim_step'])
    res = np.zeros(len(iter_range))
    for k, l in enumerate(iter_range):

        # do the lower dimensionality transformation
        lsi.num_topics = l
        corpus_lsi = lsi[corpus_pre]

        # compute pairwise similarity matrix of transformed corpus
        sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                sim_matrix[i, j] = matutils.cossim(par1, par2)
        sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

        # compute correlations
        cor = np.corrcoef(sim_vector, human_sim_vector)
        logger.info("step %d: correlation with lee data: %f" % (k, cor[0, 1]))
        res[k] = cor[0, 1]

    plt.figure()
    plt.plot(iter_range, res)
    plt.savefig(path.join(output_dir, 'cor_plot.' + p['plot_extension']))
    plt.close()
    np.save(path.join(output_dir, 'model_dim_res.npy'), res)

    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
Example #12
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    model_path = path.join(base_path,
                           p['result_path'],
                           p['model_label'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # train the model on the small marketing corpus
    preprocess = []

    if 'stoplist' in p.as_dict():
        stoplist = open(path.join(base_path, p['stoplist']), encoding='utf-8').readlines()
        stoplist = [s.strip().lower() for s in stoplist]
        def remove_stopwords(sentence):
            return [word for word in sentence if word not in stoplist]
        preprocess.append(remove_stopwords)

    if 'stemmer' in p.as_dict():
        stemmer = Stemmer.Stemmer(p['stemmer'])
        preprocess.append(stemmer.stemWords)

    if not p['model_label']:
        cor = TextFilesCorpus(path.join(base_path, p['corpus_path']),
                              no_below=p['no_below'],
                              no_above=p['no_above'],
                              preprocess=preprocess)
        dictionary = cor.dictionary

        pre = LogEntropyModel(cor, id2word=dictionary, normalize=True)
        lsi = LsiModel(pre[cor], id2word=dictionary, num_topics=p['num_topics'])
    else:
        dictionary = Dictionary.load(path.join(model_path, p['dict_name']))
        pre = SaveLoad.load(path.join(model_path, 'pre.model'))
        lsi = LsiModel.load(path.join(model_path, 'lsi.model'))
        lsi.num_topics = p['num_topics']

    test_cor_path = path.join(base_path, p['test_cor_path'])
    test_answers, gold_answers, ratings = [], [], []


    flist = glob.glob(path.join(test_cor_path, 'corpus_3', '*.txt'))
    for file in flist:
        match = re.search(r'data3_(\d)_\d+.txt', file)
        ratings.append(int(match.group(1)))
        with open(file) as f:
            doc = ' '.join(line.strip() for line in f)
            doc = utils.tokenize(doc, lower=True)
            for func in preprocess:
                doc = func(doc)
            corpus = lsi[pre[dictionary.doc2bow(doc)]]
            test_answers.append(corpus)
    flist = glob.glob(path.join(test_cor_path, 'corpus_3_golden', '*.txt'))
    for file in flist:
        with open(file) as f:
            doc = ' '.join(line.strip() for line in f)
            doc = utils.tokenize(doc, lower=True)
            for func in preprocess:
                doc = func(doc)
            corpus = lsi[pre[dictionary.doc2bow(doc)]]
            gold_answers.append(corpus)


    sim = MatrixSimilarity(test_answers)[gold_answers]
    mean_sim = np.mean(sim, axis=0)
    print('pearsons corrcoef: %f' % np.corrcoef(ratings, mean_sim)[0, 1])
    print('spearmans r: %f with p: %f' % stats.spearmanr(ratings, mean_sim))
Example #14
matrices = {}

logging.info('load the articles pickle')
with open(results_path + "sparql_wiki.pickle", 'rb') as f:
    articles = pickle.load(f)

logging.info('load the dictionary')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

logging.info('load the log_ent model')
log_ent = LogEntropyModel.load(results_path + norm_model)

logging.info('load the LSI model')
lsi = LsiModel.load(results_path + trans_model)

for key in articles.keys():

    logging.info('current term: %s' % key)

    term_list = articles[key].keys()
    text_list = [dictionary.doc2bow(article['text'], allowUpdate=False, returnMissingWords=False) 
            for article in articles[key].values()]
    sim_matrix = np.zeros((len(text_list), len(text_list)))

    logging.info('transform the textlist')
    text_list = lsi[log_ent[text_list]]

    logging.info('compute similarity matrix')
    for i, par1 in enumerate(text_list):