def load_model(self, model_type):
        model = None
        try:
            if model_type == 'tfidf':
                model = TfidfModel.load(self.tfIdfPath, mmap='r')
                self.tfIdfModel = model
            elif model_type == 'lsi':
                model = LsiModel.load(self.lsiPath, mmap='r')
                self.lsiModel = model
            elif model_type == 'lda':
                model = LdaModel.load(self.ldaPath, mmap='r')
                self.ldaModel = model
            elif model_type == 'w2v':
                model = Word2Vec.load(self.w2vPath, mmap='r')
                self.w2vModel = model
            else:
                logger.error('Unexpected model type: %s' % model_type)
                return None

            if self.dictionary is None and os.path.exists(self.dictPath):
                self.dictionary = corpora.Dictionary.load(self.dictPath)

            logger.info('%s model loaded successfully.' % model_type)
        except IOError:
            logger.error(
                "The %s model doesn't exist. Please train the model before loading it."
                % model_type)
        return model
Example No. 2
def get_tfidf_model():
    if os.path.isfile(TFIDF_FILE):
        return TfidfModel.load(TFIDF_FILE)
    else:
        model = TfidfModel(get_corpus(), get_dictionary())
        model.save(TFIDF_FILE)
        return model
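Called twice, the helper above trains and saves on the first run and reuses the cached file afterwards. A small sketch, assuming TFIDF_FILE, get_corpus() and get_dictionary() come from the same module:

model = get_tfidf_model()
bow = get_dictionary().doc2bow(['sample', 'tokens'])
print(model[bow])  # sparse list of (token_id, tf-idf weight) pairs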
Example No. 3
def cal_tfidf(documents, topk=10) -> List:
    """
    Train a TF-IDF model.
    :param documents: documents to train on
    :param topk: number of top-scoring words to keep per document; if topk
                 exceeds the number of words found, all words are returned
    :return:
    """
    # split each document into a list of tokens
    docs = [[word for word in document.split(' ')] for document in documents]
    # build the dictionary
    dictionary = corpora.Dictionary(docs)
    # build the bag-of-words representation
    docs_bow = [dictionary.doc2bow(doc) for doc in docs]
    if os.path.isfile(tfidfmodel):
        model = TfidfModel.load(tfidfmodel)
    else:
        model = TfidfModel(docs_bow)
        model.save(tfidfmodel)
    # convert the documents to TF-IDF vectors
    docs_vector = list(model[docs_bow])
    # sort each document vector by score and keep the top-k entries
    docs_sort_vector = [
        sorted(doc, key=lambda x: x[1], reverse=True)[:topk]
        for doc in docs_vector
    ]
    # map token ids back to words; docs_sort_chinese pairs each word with its TF-IDF score
    docs_sort_chinese = [[(dictionary[vec[0]], vec[1]) for vec in doc]
                         for doc in docs_sort_vector]
    return docs_sort_chinese
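A quick call sketch for cal_tfidf, assuming the module-level tfidfmodel path is defined: each whitespace-tokenized document comes back as its top-k (word, score) pairs.

docs = ['the cat sat on the mat', 'the dog chased the cat']
for ranked in cal_tfidf(docs, topk=3):
    print(ranked)  # [(word, tf-idf score), ...] sorted by score, highest first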
Example No. 4
    def transformModel(modelType, inputModel="", dictionary=""):

        # check if using the default dict or a location passed as a parameter
        if dictionary == "":
            dictionary = corpora.Dictionary.load('dictionaries/testNewsgroupsDictionary.dict')
            print(dictionary)
            #sys.exit(1)
        else:
            fileName = 'dictionaries/' + str(dictionary)
            dictionary = corpora.Dictionary.load(fileName)

        # use the default stored model; mm format
        if inputModel == "":
            inputModel = TfidfModel.load("models/testNewsgroups.tfidf_model")
            #print(inputModel)
        else:
            fileName = 'models/' + str(inputModel)
            corpus = corpora.MmCorpus(fileName)
            inputModel = models.TfidfModel(corpus)

        # create model handlers
        if modelType == "":
            print("Choose the output model for the selected input file:\n"
                  " 1 -> LSI model\n 2 -> LDA model\n 3 -> LogEntropy model\n"
                  " Pass it as the third parameter")
            sys.exit(1)
        elif modelType == 1:
            model = models.LsiModel(inputModel, id2word=dictionary)
        elif modelType == 2:
            model = models.LdaModel(inputModel, id2word=dictionary)
        elif modelType == 3:
            model = models.LogEntropyModel(inputModel, id2word=dictionary)
        else:
            errorMessage("Something went wrong with the type identifier")
        return model
Example No. 5
    def __init__(self):
        self.inner_model = None

        # load dictionary and corpus
        vocabulary = "raw"
        corpora_folder = os.path.join(*[
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'data', 'corpora'
        ])
        self.dictionary = corpora.Dictionary.load(
            os.path.join(corpora_folder, "%s.dict" % (vocabulary, )))
        self.corpus = corpora.MmCorpus(
            os.path.join(corpora_folder, "%s.mm" % (vocabulary, )))

        # parameters
        self.dataset = "CASEREPORT"

        # data file path
        models_folder = os.path.join(*[
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'data', 'models'
        ])
        filename = "TFIDF_%s" % (self.dataset, )
        self.filepath = os.path.join(models_folder, filename)
        model_exists = os.path.isfile(self.filepath)

        if model_exists:
            logging.info("found data file %s" % (self.filepath, ))
            self.inner_model = TfidfModel.load(self.filepath)
        else:
            self.inner_model = TfidfModel(corpus=self.corpus)
            self.inner_model.save(self.filepath)
Example No. 6
def buildTfidfModel(corpus):
    print('get tfidf model...')
    if not os.path.exists(modelpath + 'tfidf.model'):
        # construct the tf-idf model
        tfidf = TfidfModel(corpus)
        tfidf.save(modelpath + 'tfidf.model')
    else:
        tfidf = TfidfModel.load(modelpath + 'tfidf.model')
    print('done')
    return tfidf
Example No. 7
    def __getitem__(self, modelo):
        '''
        Returns the corresponding model.
        Parameters:
            modelo (str) --> Model identifier: "tfidf", "tfidf_pivot", "lsi", "lda" or "doc2vec"
        Returns: the requested model, if it exists
        '''
        if not os.path.isfile(self._arqs['modelos'][modelo]):
            print(f'The model "{modelo}" has not been implemented or built.')
            return None
        if modelo in ['tfidf', 'tfidf_pivot']:
            model = TfidfModel.load(self._arqs['modelos'][modelo])
        elif modelo == 'lsi':
            model = LsiModel.load(self._arqs['modelos'][modelo])
        elif modelo == 'lda':
            model = LdaModel.load(self._arqs['modelos'][modelo])
        elif modelo == 'doc2vec':
            model = Doc2Vec.load(self._arqs['modelos'][modelo])
        return model
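An access sketch for the __getitem__ above, assuming modelos (a hypothetical name) is an instance of the surrounding class with its _arqs['modelos'] paths configured:

tfidf = modelos['tfidf']  # returns None, with a message, if the model file is missing
lsi = modelos['lsi']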
Example No. 8
def init():
    # initialize some global variables
    global dictionary
    global tfidf
    global accusation_list
    global law_list

    dictionary = corpora.Dictionary.load(modelpath + 'dictionary.model')
    tfidf = TfidfModel.load(modelpath + 'tfidf.model')

    fin = open(lawPath, 'r')
    line = fin.readline()
    while line:
        line = line.split()
        law_list.append([int(line[0]), int(line[1])])
        line = fin.readline()
    fin.close()
    for i, v in enumerate(law_list):
        law_dic[str(v)] = i
        tobe_law[i] = v
Example No. 9
    def __init__(self, analyzed_items_path=None, dictionary_path=None,
                 corpus_path=None, tfidf_model_path=None):
        if dictionary_path:
            self.dictionary = Dictionary.load(dictionary_path)
        else:
            self.dictionary = None

        if analyzed_items_path:
            self.analyzed_items_path = analyzed_items_path
        else:
            self.analyzed_items_path = None

        if corpus_path:
            self.corpus = MmCorpus(corpus_path)
        else:
            self.corpus = None

        if tfidf_model_path:
            self.tfidf_model = TfidfModel.load(tfidf_model_path)
        else:
            self.tfidf_model = None
Example No. 12
import os

import nltk
import pandas as pd
from flask import Flask, render_template

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# nltk.download('words')
words = set(nltk.corpus.words.words())

from gensim.models.tfidfmodel import TfidfModel
from gensim import similarities, models, corpora, utils
from gensim.test.utils import datapath, get_tmpfile

os.chdir(r'K:\DS project')
#============================================
path = os.getcwd()
df = pd.read_csv('df3.csv')
dictionary = utils.SaveLoad.load(path + '\\Ds projectdim_items_terms.dict')
corpus = corpora.MmCorpus(path + '\\Ds projectdim_items_terms.mm')
tfidf = TfidfModel.load(fname=path + '\\Ds projectdim_items_terms.tfidf')
# load the similarity index (assuming it was saved with gensim's similarities.Similarity)
sims = similarities.Similarity.load(path + '\\Ds projectdim_items_terms.similarity')

app = Flask(__name__)


@app.route("/")
def projectname():
    return render_template("name.HTML")


@app.route("/intro")
def intro():
    return render_template("1stpage.HTML", methods=["POST"])
Example No. 13
import json
import os

from textblob import TextBlob
from gensim.corpora import Dictionary
from gensim.models import TfidfModel


class JsonCorpus(object):
    def __iter__(self):
        data = json.load(open('data/nasa.json'))

        desc = [
            TextBlob(dataset['description'].lower()).tokens
            for dataset in data['dataset']
        ]

        self.dictionary = Dictionary(desc)

        for d in desc:
            yield self.dictionary.doc2bow(d)


def score(text, tfidf, dictionary):
    return tfidf[dictionary.doc2bow(TextBlob(text.lower()).tokens)]


if __name__ == '__main__':
    if os.path.exists('tfidf.pkl') and os.path.exists('nasa_dictionary.pkl'):
        tfidf = TfidfModel.load('tfidf.pkl')
        dictionary = Dictionary.load('nasa_dictionary.pkl')
    else:
        corpus = JsonCorpus()
        # building the model iterates the corpus, which also populates corpus.dictionary
        tfidf = TfidfModel(corpus)
        corpus.dictionary.save('nasa_dictionary.pkl')
        dictionary = corpus.dictionary
        tfidf.save('tfidf.pkl')

    print(score('project completed', tfidf=tfidf, dictionary=dictionary))
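A follow-up scoring sketch for the cached objects above: score returns a sparse list of (token_id, weight) pairs, and the ids map back to words through the dictionary.

for token_id, weight in score('mars climate data', tfidf, dictionary):
    print(dictionary[token_id], weight)  # tokens outside the corpus vocabulary are dropped by doc2bow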
Example No. 14
class JsonCorpus(object):
    def __iter__(self):
        data = json.load(open('../data/nasa.json'))

        desc = [
            TextBlob(dataset['description'].lower()).tokens
            for dataset in data['dataset']
        ]

        self.dictionary = Dictionary(desc)

        for d in desc:
            yield self.dictionary.doc2bow(d)


def score(text, tfidf, dictionary):
    return tfidf[dictionary.doc2bow(TextBlob(text.lower()).tokens)]


if __name__ == '__main__':
    if os.path.exists('../data/tfidf.pkl') and os.path.exists(
            '../data/nasa_dictionary.pkl'):
        tfidf = TfidfModel.load('../data/tfidf.pkl')
        dictionary = Dictionary.load('../data/nasa_dictionary.pkl')
    else:
        corpus = JsonCorpus()
        # building the model iterates the corpus, which also populates corpus.dictionary
        tfidf = TfidfModel(corpus)
        corpus.dictionary.save('../data/nasa_dictionary.pkl')
        dictionary = corpus.dictionary
        tfidf.save('../data/tfidf.pkl')

    print(score('project completed', tfidf=tfidf, dictionary=dictionary))
Example No. 15
    def load_tfidf_model(self, filename='../data/models/tfidf_model'):
        self.tfidf_model = TfidfModel.load(filename)
Example No. 16

    if doTrain:
        # opening of this snippet reconstructed from the else-branch below
        comments_dictionary = Dictionary(docs)
        comments_dictionary.save(FLAGS.dictFile)
    else:
        print("Loading dictionary...")
        comments_dictionary = Dictionary.load(FLAGS.dictFile)

    print("Converting to BOW vectors...")
    comments_corpus = [comments_dictionary.doc2bow(d) for d in docs]

    model_tfidf = None
    if doTrain:
        print("Creating tfidf model...")
        model_tfidf = TfidfModel(comments_corpus)
        model_tfidf.save(FLAGS.tfidfFile)
    else:
        print("Loading tfidf model...")
        model_tfidf = TfidfModel.load(FLAGS.tfidfFile)

    print("Converting to tfidf vectors...")
    comments_tfidf = model_tfidf[comments_corpus]
    comments_vecs = np.vstack(
        [sparse2full(c, len(comments_dictionary)) for c in comments_tfidf])

    chi2_features = None
    if doTrain:
        # Find the most discriminative words for any of the labels
        print("Finding discriminative features...")
        labels = np.array(data['any'])
        model_fpr = SelectFpr(chi2, alpha=0.025)
        model_fpr.fit(comments_vecs, labels)
        chi2_features = model_fpr.get_support(indices=True)
        np.save(FLAGS.chi2File, chi2_features)
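The snippet saves the selected chi-squared feature indices only in the training path; a hedged sketch of the complementary load path, assuming the same FLAGS object:

    if not doTrain:
        chi2_features = np.load(FLAGS.chi2File)
    # keep only the discriminative columns of the dense tf-idf matrix
    comments_vecs = comments_vecs[:, chi2_features]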
Example No. 17
def train_model(corpus_path, dic_conf, lda_conf):
    logging.info('Loading corpus from file {}'.format(corpus_path))
    corpus = FastTextCorpus(corpus_path, bufsize=20000000, length=5926250)
    # corpus = LineSentence(corpus_path, 10000000)
    print('-' * 80)
    if lda_conf["build_dict"]:
        logging.info("Building dictionary ...")
        dic = Dictionary(corpus)
        dic.filter_extremes(no_below=dic_conf["min_tf"],
                            no_above=dic_conf["max_df"],
                            keep_n=dic_conf["vocab_size"])
        dic.compactify()
        logging.info("Saving dictionary ...")
        dic.save(dic_conf["dic"])
    else:
        logging.info("Loading dictionary ..")
        dic = Dictionary.load(dic_conf["dic"])

    bow = IntCorpus(corpus, dic)
    l = len(bow)
    print(l)

    tfMod = TfidfModel.load(lda_conf["tfmod"])
    #save corpus to disk for later usage
    # logging.info("Saving corpus to disk ...")
    # MmCorpus.serialize("data/corpus.mm", bow)
    # bow = MmCorpus("data/large_corpus.mm")

    print('-' * 80)
    if lda_conf["new"]:
        logging.info("Training new lda model")
        logging.info("Loading defined keywords ...")
        keywords = {}
        topics = []
        with codecs.open(lda_conf["kw_file"], "r", "utf-8") as f:
            for l in f:
                sp = l.strip().split(':')
                topic = int(sp[0])
                topics.append(sp[1])
                kws = sp[2].split(',')
                for kw in kws:
                    if kw not in keywords:
                        keywords[kw] = set([topic])
                    else:
                        keywords[kw].add(topic)
                    #keywords[kw.lower()] = topic

        logging.info("Number of defined keywords: {}".format(len(keywords)))
        if lda_conf["threads"] <= 1:
            model = LdaModelNew(corpus=bow,
                                id2word=dic,
                                iterations=lda_conf["iterations"],
                                num_topics=lda_conf["num_topics"],
                                passes=lda_conf["passes"],
                                chunksize=lda_conf["chunksize"],
                                defined_kws=keywords,
                                alpha='auto',
                                eval_every=lda_conf["eval_every"])
        else:
            logging.info("Training model using mutlicore lda version")
            model = LdaMulticoreNew(corpus=bow,
                                    id2word=dic,
                                    workers=lda_conf["threads"],
                                    iterations=lda_conf["iterations"],
                                    num_topics=lda_conf["num_topics"],
                                    passes=lda_conf["passes"],
                                    defined_kws=keywords,
                                    alpha='symmetric',
                                    chunksize=lda_conf["chunksize"],
                                    eval_every=lda_conf["eval_every"],
                                    tfMod=tfMod,
                                    topic_names=topics)

    else:
        logging.info("Training ldamodel implemented in gensim")
        model = LdaModelOld(corpus=bow,
                            id2word=dic,
                            iterations=lda_conf["iterations"],
                            num_topics=lda_conf["num_topics"],
                            passes=lda_conf["passes"],
                            chunksize=lda_conf["chunksize"],
                            alpha='auto',
                            eval_every=lda_conf["eval_every"])

    logging.info('Saving lda model to {}'.format(lda_conf["model_path"]))
    model.save(lda_conf["model_path"])
    logging.info('Saving model done!')
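A call sketch for train_model, with illustrative configuration values; every key is taken from the function body above, while the paths are placeholders:

dic_conf = {"min_tf": 5, "max_df": 0.5, "vocab_size": 100000, "dic": "data/dic.bin"}
lda_conf = {"build_dict": True, "new": False, "tfmod": "data/tfidf.model",
            "kw_file": "data/keywords.txt", "threads": 1, "iterations": 100,
            "num_topics": 50, "passes": 1, "chunksize": 2000,
            "eval_every": 10, "model_path": "data/lda.model"}
train_model("data/corpus.txt", dic_conf, lda_conf)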