Example #1
def get_tfidf_model():
    if os.path.isfile(TFIDF_FILE):
        return TfidfModel.load(TFIDF_FILE)
    else:
        model = TfidfModel(get_corpus(), get_dictionary())
        model.save(TFIDF_FILE)
        return model
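The helpers this snippet relies on (TFIDF_FILE, get_corpus, get_dictionary) are not shown; below is a minimal sketch of what they could look like, with an illustrative path and toy data.

# Minimal sketch of the assumed helpers; the path and the toy documents are illustrative only.
import os
from gensim import corpora
from gensim.models import TfidfModel

TFIDF_FILE = "data/models/tfidf.model"  # hypothetical location of the cached model

_DOCS = [["human", "machine", "interface"],
         ["survey", "of", "user", "opinion"],
         ["graph", "of", "trees"]]

def get_dictionary():
    # token -> id mapping built from the tokenized documents
    return corpora.Dictionary(_DOCS)

def get_corpus():
    # bag-of-words representation of every document
    dictionary = get_dictionary()
    return [dictionary.doc2bow(doc) for doc in _DOCS]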
Example #2
    def __init__(self):
        self.inner_model = None

        # load dictionary and corpus
        vocabulary = "raw"
        corpora_folder = os.path.join(*[
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'data', 'corpora'
        ])
        self.dictionary = corpora.Dictionary.load(
            os.path.join(corpora_folder, "%s.dict" % (vocabulary, )))
        self.corpus = corpora.MmCorpus(
            os.path.join(corpora_folder, "%s.mm" % (vocabulary, )))

        # parameters
        self.dataset = "CASEREPORT"

        # data file path
        models_folder = os.path.join(*[
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'data', 'models'
        ])
        filename = "TFIDF_%s" % (self.dataset, )
        self.filepath = os.path.join(models_folder, filename)
        model_exists = os.path.isfile(self.filepath)

        if model_exists:
            logging.info("found data file %s" % (self.filepath, ))
            self.inner_model = TfidfModel.load(self.filepath)
        else:
            self.inner_model = TfidfModel(corpus=self.corpus)
            self.inner_model.save(self.filepath)
Example #3
    def tf_idf_transform(self, doc):
        """
        Perform tf-idf transformation on doc.
        """
        self.dictionary = corpora.Dictionary(doc)
        corpus = [self.dictionary.doc2bow(text) for text in doc]
        self.tfIdfModel = TfidfModel(corpus)

        conf.mk_dir(self.tfIdfPath)

        self.dictionary.save(self.dictPath)
        logger.info('Dictionary has been saved in %s.' % self.dictPath)

        self.tfIdfModel.save(self.tfIdfPath)
        logger.info('TF-IDF model has been saved in %s.' % self.tfIdfPath)

        tfidf_corpus = self.tfIdfModel[corpus]
        tfidf_corpus_path = conf.get_filename_via_tpl('tfidf',
                                                      n_users=self.nUsers,
                                                      postfix='mm',
                                                      n_samples=self.nSamples)
        corpora.MmCorpus.serialize(tfidf_corpus_path, tfidf_corpus)
        logger.info('TF-IDF corpus with a shape of %s has been saved in %s.' %
                    (np.array(tfidf_corpus).shape, tfidf_corpus_path))

        return tfidf_corpus
Example #4
def cal_tfidf(documents, topk=10) -> List:
    """
    Train a tf-idf model.
    :param documents: documents to train on
    :param topk: number of top-scoring words to extract per document; if topk
                 exceeds the number of words found, all words are returned
    :return:
    """
    # split each document into a list of words
    docs = [[word for word in document.split(' ')] for document in documents]
    # build the dictionary
    dictionary = corpora.Dictionary(docs)
    # build the bag-of-words representation
    docs_bow = [dictionary.doc2bow(doc) for doc in docs]
    if os.path.isfile(tfidfmodel):
        model = TfidfModel.load(tfidfmodel)
    else:
        model = TfidfModel(docs_bow)
        model.save(tfidfmodel)
    # compute the tf-idf vector of each document
    docs_vector = list(model[docs_bow])
    # sort each document vector by score and keep the top k entries
    docs_sort_vector = [
        sorted(doc, key=lambda x: x[1], reverse=True)[:topk]
        for doc in docs_vector
    ]
    # map token ids back to words; docs_sort_chinese is a list of (word, tf-idf score) pairs per document
    docs_sort_chinese = [[(dictionary[vec[0]], vec[1]) for vec in doc]
                         for doc in docs_sort_vector]
    return docs_sort_chinese
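A hedged usage sketch for cal_tfidf; the module-level tfidfmodel path it checks is not shown in the snippet, so a hypothetical value is assumed here.

# Hedged usage sketch; `tfidfmodel` is the save path the function above assumes.
tfidfmodel = "tfidf.model"  # hypothetical path

documents = ["猫 喜欢 吃 鱼", "狗 喜欢 啃 骨头", "猫 和 狗 都 是 宠物"]
top_words = cal_tfidf(documents, topk=2)
# top_words[i] holds at most two (word, tf-idf score) pairs for document i
print(top_words)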
Example #5
class TFIDFmodel(object):
    def __init__(self):
        self.inner_model = None

        # load dictionary and corpus
        vocabulary = "raw"
        corpora_folder = os.path.join(*[os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'corpora'])
        self.dictionary = corpora.Dictionary.load(os.path.join(corpora_folder, "%s.dict" % (vocabulary,)))
        self.corpus = corpora.MmCorpus(os.path.join(corpora_folder, "%s.mm" % (vocabulary,)))

        # parameters
        self.dataset = "CASEREPORT"

        # data file path
        models_folder = os.path.join(*[os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'models'])
        filename = "TFIDF_%s" % (self.dataset, )
        self.filepath = os.path.join(models_folder, filename)
        model_exists = os.path.isfile(self.filepath)

        if model_exists:
            logging.info("found data file %s" % (self.filepath, ))
            self.inner_model = TfidfModel.load(self.filepath)
        else:
            self.inner_model = TfidfModel(corpus=self.corpus)
            self.inner_model.save(self.filepath)

    def __contains__(self, item):
        return item in self.inner_model
Example #6
    def load(self):

        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)

        if os.path.exists(self._tfidf_path):
            self.tfidf = TfidfModel.load(self._tfidf_path)
Example #7
 def fit(self, documents, labels=None):
     self.lexicon = Dictionary(documents)
     self.tfidf = TfidfModel(
         [self.lexicon.doc2bow(doc) for doc in documents],
         id2word=self.lexicon)
     self.save()
     return self
Example #8
def buildTfidfModel(corpus):
    print('get tfidf model...')
    if not os.path.exists(modelpath + 'tfidf.model'):
        # build the tf-idf model
        tfidf = TfidfModel(corpus)
        tfidf.save(modelpath + 'tfidf.model')
    else:
        tfidf = TfidfModel.load(modelpath + 'tfidf.model')
    print('done')
    return tfidf
Example #9
 def fit(self, documents, labels=None):
     if self.lexicon is None or self.tfidf is None:
         inputDocuments = list(documents)
         self.lexicon = Dictionary(inputDocuments)
         self.tfidf = TfidfModel(
             [self.lexicon.doc2bow(doc) for doc in inputDocuments],
             id2word=self.lexicon)
         self.save()
         return self
     else:
         return self
Example #10
class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, dirpath=".", tofull=False):
        """
        Pass in a directory that holds the lexicon in corpus.dict and the
        TFIDF model in tfidf.model (for now).

        Set tofull = True if the next thing is a Scikit-Learn estimator
        otherwise keep False if the next thing is a Gensim model.
        """
        self._lexicon_path = os.path.join(dirpath, "corpus.dict")
        self._tfidf_path = os.path.join(dirpath, "tfidf.model")

        self.lexicon = None
        self.tfidf = None
        self.tofull = tofull

        self.load()

    def load(self):

        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)

        if os.path.exists(self._tfidf_path):
            self.tfidf = TfidfModel.load(self._tfidf_path)

    def save(self):
        self.lexicon.save(self._lexicon_path)
        self.tfidf.save(self._tfidf_path)

    def fit(self, documents, labels=None):
        self.lexicon = Dictionary(documents)
        self.tfidf = TfidfModel(
            [self.lexicon.doc2bow(doc) for doc in documents],
            id2word=self.lexicon)
        self.save()
        return self

    def transform(self, documents):
        def generator():
            for document in documents:
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    yield sparse2full(vec, len(self.lexicon))
                else:
                    yield vec

        return list(generator())
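A hedged usage sketch: with tofull=True the transformer yields dense vectors, so it can sit in front of a scikit-learn estimator (for example inside a Pipeline); with tofull=False the output stays in Gensim's sparse (id, weight) format.

# Hedged usage sketch: documents are lists of tokens; dirpath="." means the fitted
# lexicon and TF-IDF model are persisted as ./corpus.dict and ./tfidf.model.
docs = [["good", "movie"], ["bad", "plot"], ["great", "acting", "good", "movie"]]

vectorizer = GensimTfidfVectorizer(dirpath=".", tofull=True)
vectorizer.fit(docs)
X = vectorizer.transform(docs)   # list of dense numpy arrays, one per document
print(len(X), len(X[0]))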
Example #11
    def __init__(self,
                 docs,
                 strip_diac=True,
                 num_option=OPTION_GROUP,
                 usr_option=OPTION_GROUP,
                 url_option=OPTION_GROUP,
                 emo_option=OPTION_GROUP,
                 lc=True,
                 del_dup1=True,
                 token_list=[-1],
                 lang=None,
                 **kwargs):
        self.strip_diac = strip_diac
        self.num_option = num_option
        self.usr_option = usr_option
        self.url_option = url_option
        self.emo_option = emo_option
        self.emoclassifier = EmoticonClassifier()
        self.lc = lc
        self.del_dup1 = del_dup1
        self.token_list = token_list

        if lang:
            self.lang = LangDependency(lang)
        else:
            self.lang = None

        self.kwargs = {k: v for k, v in kwargs.items() if k[0] != '_'}

        docs = [self.tokenize(d) for d in docs]
        self.dictionary = corpora.Dictionary(docs)
        corpus = [self.dictionary.doc2bow(d) for d in docs]
        self.model = TfidfModel(corpus)
Example #12
def calculate_embedding(corpus: Corpus,
                        *,
                        rank=2,
                        svd_dims=50,
                        perplexity=30,
                        seed=0):
    """ Calculate a document embedding that assigns each document in the
    corpus an N-d position based on word usage.

    :returns: A list of N-d tuples for the documents in the corpus.
    """
    from gensim.matutils import corpus2dense  # used below to densify the tf-idf corpus
    from gensim.models.tfidfmodel import TfidfModel
    from sklearn.decomposition import TruncatedSVD
    from sklearn.manifold import TSNE

    dic = corpus.dictionary
    freqs = corpus.frequencies
    tfidf = corpus2dense(TfidfModel(dictionary=dic)[freqs], len(dic)).T

    if svd_dims is not None:
        svd = TruncatedSVD(n_components=svd_dims, random_state=seed)
        components = svd.fit_transform(tfidf)
    else:
        components = tfidf

    model = TSNE(rank,
                 metric='cosine',
                 square_distances=True,
                 perplexity=perplexity,
                 random_state=seed)
    return model.fit_transform(components)
Example #13
    def transformModel(modelType, inputModel="", dictionary=""):

        #check if using default dict or location passed as parameter
        if dictionary == "":
            dictionary = corpora.Dictionary.load('dictionaries/testNewsgroupsDictionary.dict')
            print(dictionary)
            #sys.exit(1)
        else:
            fileName = 'dictionaries/'+str(dictionary)
            dictionary = corpora.Dictionary.load(fileName)
            
        #use default stored model; mm format
        if inputModel == "":
            inputModel = TfidfModel.load("models/testNewsgroups.tfidf_model")
            #print inputModel
        else:
            fileName = 'models/'+str(inputModel)
            corpus = corpora.MmCorpus(fileName)
            inputModel = models.TfidfModel(corpus)
    
        #create model handlers
        if modelType == "":
            print "Chose output model for selected input file: \n 1 -> LSI model\n 2 -> LDA model\n 3 -> LogEntropy model\n Pass it as the third parameter"
            sys.exit(1)    
        elif modelType == 1:
            model = models.LsiModel(inputModel,id2word=dictionary)
        elif modelType == 2:
            model = models.LdaModel(inputModel,id2word=dictionary)
        elif modelType == 3:
            model = models.LogEntropyModel(inputModel,id2word=dictionary)
        else:
            errorMessage("Something went wrong with the type identificator")
        return model
Example #14
def tf_idf_weight(spacy_contexts):
    """
    @param spacy_contexts Spacy-fied contexts

    Returns list of Dicts, each dictionary corresponds to one document and
    contains words and their tf-idf weights
    """
    docs_dict = Dictionary(spacy_contexts)
    docs_dict.compactify()

    docs_corpus = [docs_dict.doc2bow(doc) for doc in spacy_contexts]

    model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
    docs_tfidf = model_tfidf[docs_corpus]

    # Now generate a list of dicts with k,v = "word": tfidf_frequency
    # each dict contains words from one document (sentence)
    doc_tfidf_dicts = []

    for doc in docs_tfidf:
        d = dict()
        for term, freq in doc:
            d[docs_dict[term]] = freq

        doc_tfidf_dicts.append(d)

    return doc_tfidf_dicts
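A hedged usage sketch; despite the parameter name, the contexts are expected to be iterables of string tokens (the spaCy tokenization step is assumed to happen upstream).

# Hedged usage sketch: each context is a list of string tokens.
contexts = [["protein", "binds", "receptor"],
            ["kinase", "phosphorylates", "substrate"]]
weights = tf_idf_weight(contexts)
print(weights[0])   # {word: tf-idf weight} for the first context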
Example #15
def gensim_similarity(data_c):
    """
    使用Gensim包计算相似度:
        词频
            COUNT
            LDA
            LSI
        Tfidf:
            TFIDF
            LDA
            LSI
    """
    # 合并获取词袋
    data_c['s1'] = data_c['s1'].apply(lambda text: list(text))
    data_c['s2'] = data_c['s2'].apply(lambda text: list(text))
    data_c_all = data_c['s1'].append(data_c['s2'], ignore_index=True).to_frame(name='s')

    # 构建词典
    print("starting create dic....")
    dic = corpora.Dictionary(data_c['s1'].values)
    dic.add_documents(data_c['s2'].values)

    print("文档数:", dic.num_docs)
    print("starting create count bow...")
    data_c['s1'] = data_c['s1'].apply(lambda text: dic.doc2bow(text))
    data_c['s2'] = data_c['s2'].apply(lambda text: dic.doc2bow(text))
    data_c_all['s'] = data_c_all['s'].apply(lambda text: dic.doc2bow(text))

    # cps1 = [dic.doc2bow(text) for text in list(data_c['s1'].values)]
    # cps2 = [dic.doc2bow(text) for text in list(data_c['s2'].values)]

    cps1 = list(data_c['s1'])
    cps2 = list(data_c['s2'])
    cps = list(data_c_all['s'])

    # count-based similarity between s1 and s2
    print("starting count similarity....")
    sm = similarities.SparseMatrixSimilarity(corpus=cps1, num_features=10000)
    count_sm = np.diag(sm[cps2])

    # LDA / LSI similarity on term counts
    count_lda_sm = lda_similarity(cps, cps1, cps2, dic)
    # count_lsi_sm= lsi_similarity(cps,cps1,cps2,dic)

    # tf-idf similarity between s1 and s2
    print("starting tfidf similarity....")
    tfidf = TfidfModel(corpus=cps, id2word=dic)
    cps1_tfidf = tfidf[cps1]
    cps2_tfidf = tfidf[cps2]
    cps_tfidf = tfidf[cps]

    # TF-IDF similarity between s1 and s2
    sm = similarities.SparseMatrixSimilarity(corpus=cps1_tfidf, num_features=10000)
    tfidf_sm = np.diag(sm[cps2_tfidf])

    # LDA / LSI similarity on the tf-idf vectors
    tfidf_lda_sm = lda_similarity(cps_tfidf, cps1_tfidf, cps2_tfidf, dic)
    tfidf_lsi_sm = lsi_similarity(cps_tfidf, cps1_tfidf, cps2_tfidf, dic)

    return count_sm, count_lda_sm, tfidf_sm, tfidf_lda_sm, tfidf_lsi_sm
Example #16
def get_lda_feature():
    doc_train = pd.read_csv(id_content_path)
    documents = doc_train['content'].apply(lambda x: x.split(' '))
    #    build the word <-> id mapping dictionary
    dictionary = corpora.Dictionary(documents)
    #    convert each document into a bag-of-words list of (id, count) tuples
    ds_df = [dictionary.doc2bow(document) for document in documents]
    #    train the tf-idf model on the corpus term frequencies
    tfidf_model = TfidfModel(ds_df)
    #    transform the documents into tf-idf vectors
    ds_tfidf = tfidf_model[ds_df]
    #    number of topics
    n = 60
    #    build the LDA model on the tf-idf corpus with the chosen number of topics
    lda_model = LdaModel(ds_tfidf, num_topics=n, passes=10, random_state=12)
    vec_size = (len(documents), n)
    lda_feature = np.zeros(vec_size)
    i = 0

    for doc in ds_tfidf:
        topics = lda_model.get_document_topics(doc, minimum_probability=0.01)
        for topic in topics:
            num_topic = topic[0]
            prob = round(topic[1], 5)
            lda_feature[i, num_topic] = prob
        i += 1

    f_names = get_lda_feacture_name(n)
    pd.DataFrame(lda_feature, columns=f_names).to_csv(id_content_lda_path,
                                                      index=0)
Example #17
 def get_tfidf(self):
     docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
     model_tfidf = TfidfModel(docs_corpus, id2word=self.docs_dict)
     docs_tfidf = model_tfidf[docs_corpus]
     docs_vecs = np.vstack(
         [sparse2full(c, len(self.docs_dict)) for c in docs_tfidf])
     return docs_vecs
Example #18
def get_tfidf_model(path="data/swiki.json",
                    save_path="data/swiki_dict.txt",
                    stem=False):
    """
    :param path:
    :param save_path:
    :return:
    """
    texts = map(lambda x: _preprocess_text(x, stem=stem),
                _load_json_list("data/swiki.json"))

    def _get_swiki_dictionary():
        dict_file = os.path.join(BASE_DIR, save_path)
        if os.path.exists(dict_file):
            dictionary = corpora.Dictionary.load_from_text(dict_file)
        else:
            dictionary = corpora.Dictionary(texts)
            dictionary.save_as_text(dict_file)
        return dictionary

    dct = _get_swiki_dictionary()

    bow_texts = map(dct.doc2bow, texts)
    tfidf = TfidfModel(bow_texts)
    return dct, tfidf
Example #19
    def load_model(self, model_type):
        model = None
        try:
            if model_type == 'tfidf':
                model = TfidfModel.load(self.tfIdfPath, mmap='r')
                self.tfIdfModel = model
            elif model_type == 'lsi':
                model = LsiModel.load(self.lsiPath, mmap='r')
                self.lsiModel = model
            elif model_type == 'lda':
                model = LdaModel.load(self.ldaPath, mmap='r')
                self.ldaModel = model
            elif model_type == 'w2v':
                model = Word2Vec.load(self.w2vPath, mmap='r')
                self.w2vModel = model
            else:
                logger.error('Model type error. Unexpected %s' % model_type)
                return None

            if self.dictionary is None and os.path.exists(self.dictPath):
                self.dictionary = corpora.Dictionary.load(self.dictPath)

            logger.info('%s model loaded completely.' % model_type)
        except IOError:
            logger.error(
                'The %s model doesn\'t exist. Please train the model before loading it.'
                % model_type)
        finally:
            return model
Example #20
def lda(documents, topicNum):
	texts = [[word for word in document.split(' ')] for document in documents]
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+str(len(texts)))
	dictionary = corpora.Dictionary(texts)
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+' get corpus..')
	corpusD = [dictionary.doc2bow(text) for text in texts]

	#id2word = dictionary.id2word
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+' tfidf Model...')
	tfidf = TfidfModel(corpusD)
	corpus_tfidf = tfidf[corpusD]
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+' train lda Model...')
	ldaModel = gensim.models.ldamulticore.LdaMulticore(corpus_tfidf, workers = 8, num_topics=topicNum, chunksize=8000, passes=10, random_state = 12)
	#ldaModel = gensim.models.ldamodel.LdaModel(corpus=corpusD, num_topics=topicNum, update_every=1, chunksize=8000, passes=10)
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+' get lda feature...')
	ldaFeature = np.zeros((len(texts), topicNum))
	i = 0

	for doc in corpus_tfidf:
		topic = ldaModel.get_document_topics(doc, minimum_probability = 0.01)
		
		for t in topic:
			 ldaFeature[i, t[0]] = round(t[1],5)
		i = i + 1
		if i%1000 == 1:
			print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+str(i))

	return ldaFeature
Example #21
    def load(self):

        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)

        if os.path.exists(self._tfidf_path):
            self.tfidf = TfidfModel.load(self._tfidf_path)
Example #22
def getLsiFeature(documents, topicNum):
    '''
     Function:
         generate lsi features by training lsi model
     Input:
         documents: list of preprocessed sentences
         topicNum: output vector dimension
     Output:
         lsi features(DataFrame format)
    '''
    # get corpus
#     LogInfo(' Get corpus...')
    texts = [[word for word in document.split(' ')] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpusD = [dictionary.doc2bow(text) for text in texts]
    
    # train lsi model
#     LogInfo(' Train LSI model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]
    model = LsiModel(corpusD, num_topics=topicNum, chunksize=8000, extra_samples = 100)#, distributed=True)#, sample = 1e-5, iter = 10,seed = 1)

    # generate lsi features
    LogInfo(' Generate LSI features...')
    lsiFeature = np.zeros((len(texts), topicNum))
    i = 0
    for doc in corpusD:
        topic = model[doc]
        for t in topic:
             lsiFeature[i, t[0]] = round(t[1],5)
        i = i + 1
    colName = getColName(topicNum, "qlsi")
    lsiFeature = pd.DataFrame(lsiFeature, columns = colName)
    return lsiFeature
Example #23
def getLdaFeature(documents, topicNum):
    '''
     Function:
         generate lda features by training lda model
     Input:
         documents: list of preprocessed sentences
         topicNum: output vector dimension
     Output:
         lda features(DataFrame format)
    '''
    # get corpus
#     LogInfo(' Get corpus...')
    texts = [[word for word in document.split(' ')] for document in documents]
    dictionary = corpora.Dictionary(texts)    
    corpusD = [dictionary.doc2bow(text) for text in texts]

    # train lda model
#     LogInfo(' Train LDA model...')
    tfidf = TfidfModel(corpusD)
    corpus_tfidf = tfidf[corpusD]
#     ldaModel = gensim.models.ldamulticore.LdaMulticore(corpus_tfidf, workers = 8, num_topics=topicNum, chunksize=8000, passes=10, random_state = 12)
    ldaModel = LdaModel(corpus_tfidf, num_topics=topicNum, chunksize=8000, passes=10, random_state = 12)
    # generate lda features
    LogInfo(' Generate LDA features...')
    ldaFeature = np.zeros((len(texts), topicNum))
    i = 0
    for doc in corpus_tfidf:
        topic = ldaModel.get_document_topics(doc, minimum_probability = 0.01)
        for t in topic:
             ldaFeature[i, t[0]] = round(t[1],5)
        i = i + 1
    colName = getColName(topicNum, "qlda")
    ldaFeature = pd.DataFrame(ldaFeature, columns = colName)
    return ldaFeature
Example #24
def lsi(documents, topicNum):
	texts = [[word for word in document.split(' ')] for document in documents]
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+str(len(texts)))
	dictionary = corpora.Dictionary(texts)
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+' get corpus..')
	corpusD = [dictionary.doc2bow(text) for text in texts]
	print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+' tfidf Model...')
	tfidf = TfidfModel(corpusD)
	corpus_tfidf = tfidf[corpusD]

	model = LsiModel(corpusD, num_topics=topicNum, chunksize=8000, extra_samples = 100)#, distributed=True)#, sample = 1e-5, iter = 10,seed = 1)

	lsiFeature = np.zeros((len(texts), topicNum))
	print('translate...')
	i = 0

	for doc in corpusD:
		topic = model[doc]
		
		for t in topic:
			 lsiFeature[i, t[0]] = round(t[1],5)
		i = i + 1
		if i%1000 == 1:
			print(str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+str(i))

	return lsiFeature
Example #25
 def loadCorpus(self, mmfile, dictfile, doctuplesfile=None):
     self.corpus = corpora.MmCorpus(mmfile)
     self.dictionary = corpora.Dictionary.load(dictfile)
     if doctuplesfile != None:
         with open(doctuplesfile, 'rb') as docpicklef:
             self.doctuples = pickle.load(docpicklef)
     if self.toweight:
         self.tfidf = TfidfModel(self.corpus)
Example #26
class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, dirpath=".", tofull=False):
        """
        Gensim vectorizer
        """
        self._lexicon_path = os.path.join(dirpath, "corpus.dict")
        self._tfidf_path = os.path.join(dirpath, "tfidf.model")

        self.lexicon = None
        self.tfidf = None
        self.tofull = tofull

        self.load()

    def load(self):
        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)

        if os.path.exists(self._tfidf_path):
            self.tfidf = TfidfModel.load(self._tfidf_path)

    def save(self):
        self.lexicon.save(self._lexicon_path)
        self.tfidf.save(self._tfidf_path)

    def fit(self, documents, labels=None):
        if self.lexicon is None or self.tfidf is None:
            inputDocuments = list(documents)
            self.lexicon = Dictionary(inputDocuments)
            self.tfidf = TfidfModel(
                [self.lexicon.doc2bow(doc) for doc in inputDocuments],
                id2word=self.lexicon)
            self.save()
            return self
        else:
            return self

    def transform(self, documents):
        returnDocs = []
        for document in documents:
            vec = self.tfidf[self.lexicon.doc2bow(document)]
            if self.tofull:
                returnDocs.append(sparse2full(vec, len(self.lexicon)))
            else:
                returnDocs.append(vec)
        return returnDocs
Example #27
def compute_tfidf():
    from gensim.models.tfidfmodel import TfidfModel

    keys, unstem_map, paragraph_lengths, int2word, word2int = compute_all_words(
    )
    with time_code('compute_tfidf'):
        tfidf = TfidfModel(corpus, smartirs='ltc', id2word=int2word)
    return tfidf
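For reference, a hedged toy example of the same SMART "ltc" scheme (logarithmic term frequency, idf, cosine normalization) on a self-contained corpus; the corpus and names below are illustrative only.

# Hedged toy example of SMART "ltc" weighting on a tiny corpus.
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel

docs = [["cat", "sat", "mat"], ["dog", "sat", "log"], ["cat", "dog"]]
dct = Dictionary(docs)
bow = [dct.doc2bow(d) for d in docs]
ltc = TfidfModel(bow, smartirs="ltc", id2word=dct)
print(ltc[bow[0]])   # (term_id, weight) pairs with unit L2 norm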
Example #28
class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):

    def __init__(self, dirpath=".", tofull=False):
        """
        Pass in a directory that holds the lexicon in corpus.dict and the
        TFIDF model in tfidf.model (for now).

        Set tofull = True if the next thing is a Scikit-Learn estimator
        otherwise keep False if the next thing is a Gensim model.
        """
        self._lexicon_path = os.path.join(dirpath, "corpus.dict")
        self._tfidf_path = os.path.join(dirpath, "tfidf.model")

        self.lexicon = None
        self.tfidf = None
        self.tofull = tofull

        self.load()

    def load(self):

        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)

        if os.path.exists(self._tfidf_path):
            self.tfidf = TfidfModel.load(self._tfidf_path)

    def save(self):
        self.lexicon.save(self._lexicon_path)
        self.tfidf.save(self._tfidf_path)

    def fit(self, documents, labels=None):
        self.lexicon = Dictionary(documents)
        self.tfidf = TfidfModel([self.lexicon.doc2bow(doc) for doc in documents], id2word=self.lexicon)
        self.save()
        return self

    def transform(self, documents):
        def generator():
            for document in documents:
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    yield sparse2full(vec, len(self.lexicon))
                else:
                    yield vec
        return list(generator())
Example #29
def main():
    parser = ArgumentParser()
    parser.add_argument('-e', '--encoding')
    parser.add_argument('-o', '--output-file')
    args = parser.parse_args()

    encoding = args.encoding
    output_fn = args.output_file

    if not output_fn:
        sys.exit(-1)

    if encoding:
        sys.stdout = codecs.getwriter(encoding)(sys.stdout)
        sys.stdin = codecs.getreader(encoding)(sys.stdin)

    texts = (line.split() for line in sys.stdin)

    logging.info('Creating vocabulary ...')
    vocab = Dictionary(texts)

    logging.info('Saving vocabulary to %s ...' % (output_fn + '.bz2'))
    vocab.save(output_fn)

    logging.info('Compressing vocabulary ...')

    with open(output_fn, 'rb') as input:
        with bz2.BZ2File(output_fn + '.bz2', 'wb', compresslevel=9) as output:
            copyfileobj(input, output)

    os.remove(output_fn)

    logging.info('Creating IDF model ...')
    tfidf = TfidfModel(dictionary=vocab)

    logging.info('Saving IDF model to %s ...' % (output_fn + '.tfidf.bz2'))
    tfidf.save(output_fn + '.tfidf')

    logging.info('Compressing IDF model ...')

    with open(output_fn + '.tfidf', 'rb') as input:
        with bz2.BZ2File(output_fn + '.tfidf.bz2', 'wb',
                         compresslevel=9) as output:
            copyfileobj(input, output)

    os.remove(output_fn + '.tfidf')
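A hedged sketch of how the compressed artifacts written above might be consumed later; 'vocab' stands in for whatever --output-file value was passed.

# Hedged usage sketch: decompress and load the vocabulary and the IDF-only model,
# then weight a new tokenized line. 'vocab' is a hypothetical --output-file value.
import bz2
import shutil
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel

for name in ('vocab', 'vocab.tfidf'):
    with bz2.BZ2File(name + '.bz2', 'rb') as src, open(name, 'wb') as dst:
        shutil.copyfileobj(src, dst)

vocab = Dictionary.load('vocab')
tfidf = TfidfModel.load('vocab.tfidf')
print(tfidf[vocab.doc2bow("some new tokenized text".split())])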
Example #30
def predict_on_group(model,
                     docs_data,
                     word2vec_model300,
                     length=5) -> 'pd.DataFrame of type :  pair_id  || target':
    """ 
    Parameters:

        model -- model object with methods train and predict

        docs_data -- pandas Data Frame with fields pair_id, content, target
        word2vec_model300 -- w2v model (object)


    Returns:

       pd.DataFrame of type : { pair_id  || target }   with predicted target for each pair_id

    """

    dictionary = corpora.Dictionary()
    for i in docs_data.content:
        try:
            dictionary.add_documents([i])
        except:
            dictionary.add_documents([['a']])

    docs_data['vector'] = docs_data.content.apply(doc_opti,
                                                  args=(dictionary, ))
    # except:
    #     docs_data['vector'] = docs_data.content.apply(dictionary.doc2bow)

    corpus = []
    for line in docs_data.content:
        try:
            if math.isnan(line): line = ["мимо"]
        except:
            pass
        corpus = corpus + [dictionary.doc2bow(line)]

    similarity_matrix = word2vec_model300.similarity_matrix(
        dictionary,
        tfidf=TfidfModel(corpus, dictionary=dictionary),
        threshold=0.0,
        exponent=2.0,
        nonzero_limit=100)

    docs_data['dist_vec'] = docs_data.vector.apply(make_dist_vec,
                                                   args=(docs_data.vector,
                                                         similarity_matrix))

    features = [str(i) for i in range(length)]

    for i in range(length):
        docs_data[str(i)] = docs_data.dist_vec.apply(take_S, args=(i, ))

    docs_data['target'] = model.predict(np.array(docs_data[features]))

    return docs_data[['pair_id', 'target']]
Example #31
def train_model_on_group(model, docs_data, word2vec_model300, length=5):
    """ 
    Parameters:

        model -- model object with methods train and predict

        docs_data -- pandas Data Frame with fields pair_id, content, target
        word2vec_model300 -- w2v model (object)


    Returns:

         model trained on data


    """

    dictionary = corpora.Dictionary()
    for i in docs_data.content:
        try:
            dictionary.add_documents([i])
        except:
            dictionary.add_documents([['a']])

    docs_data['vector'] = docs_data.content.apply(doc_opti,
                                                  args=(dictionary, ))

    corpus = []
    for line in docs_data.content:
        try:
            if math.isnan(line): line = ["мимо"]
        except:
            pass
        corpus = corpus + [dictionary.doc2bow(line)]

    similarity_matrix = word2vec_model300.similarity_matrix(
        dictionary,
        tfidf=TfidfModel(corpus, dictionary=dictionary),
        threshold=0.0,
        exponent=2.0,
        nonzero_limit=100)

    docs_data['dist_vec'] = docs_data.vector.apply(make_dist_vec,
                                                   args=(docs_data.vector,
                                                         similarity_matrix))

    features = [str(i) for i in range(length)]

    for i in range(length):
        docs_data[str(i)] = docs_data.dist_vec.apply(take_S, args=(i, ))

    print(docs_data.head())

    model = model.fit(docs_data[features], docs_data['target'])

    print(model.score(docs_data[features], docs_data['target']))

    return model
Example #32
def main():
    parser = ArgumentParser()
    parser.add_argument('-e', '--encoding')
    parser.add_argument('-o', '--output-file')
    args = parser.parse_args()

    encoding = args.encoding
    output_fn = args.output_file

    if not output_fn:
        sys.exit(-1)

    if encoding:
        sys.stdout = codecs.getwriter(encoding)(sys.stdout)
        sys.stdin = codecs.getreader(encoding)(sys.stdin)

    texts = (line.split() for line in sys.stdin)

    logging.info('Creating vocabulary ...')
    vocab = Dictionary(texts)

    logging.info('Saving vocabulary to %s ...' % (output_fn + '.bz2'))
    vocab.save(output_fn)

    logging.info('Compressing vocabulary ...')

    with open(output_fn, 'rb') as input:
        with bz2.BZ2File(output_fn + '.bz2', 'wb', compresslevel=9) as output:
            copyfileobj(input, output)

    os.remove(output_fn)

    logging.info('Creating IDF model ...')
    tfidf = TfidfModel(dictionary=vocab)

    logging.info('Saving IDF model to %s ...' % (output_fn + '.tfidf.bz2'))
    tfidf.save(output_fn + '.tfidf')

    logging.info('Compressing IDF model ...')

    with open(output_fn + '.tfidf', 'rb') as input:
        with bz2.BZ2File(output_fn + '.tfidf.bz2', 'wb', compresslevel=9) as output:
            copyfileobj(input, output)

    os.remove(output_fn + '.tfidf')
Example #33
def corpus_vec(docs, model, corpus, size = DEFAULT_SAMPLE_SIZE):
    """ Creates a NxD array of document vectors for each document in a list"""

    tfidf = TfidfModel(corpus)
    N,D = len(docs), model.wv.syn0.shape[1]
    arr = np.empty((N, D))
    for i in range(N):
        arr[i,:] = doc_vec(docs[i], model, corpus, size, tfidf)
    return arr
Example #34
    def __init__(self,
                 docs,
                 num_option=OPTION_GROUP,
                 usr_option=OPTION_GROUP,
                 url_option=OPTION_GROUP,
                 emo_option=OPTION_GROUP,
                 lc=True,
                 del_dup=True,
                 del_punc=False,
                 del_diac=True,
                 token_list=[-1],
                 token_min_filter=-1,
                 token_max_filter=1.0,
                 tfidf=True,
                 **kwargs):
        self.del_diac = del_diac
        self.num_option = num_option
        self.usr_option = usr_option
        self.url_option = url_option
        self.emo_option = emo_option
        self.lc = lc
        self.del_dup = del_dup
        self.del_punc = del_punc
        self.token_list = token_list
        self.token_min_filter = token_min_filter
        self.token_max_filter = token_max_filter
        self.tfidf = tfidf
        self.kwargs = {k: v for k, v in kwargs.items() if k[0] != '_'}

        if emo_option == OPTION_NONE:
            self.emo_map = None
        else:
            # self.emo_map = get_compiled_map(os.path.join(os.path.dirname(__file__), 'resources', 'emoticons.json'))
            self.emo_map = EmoticonClassifier()

        docs = [self.tokenize(d) for d in docs]
        self.dictionary = corpora.Dictionary(docs)
        corpus = [self.dictionary.doc2bow(d) for d in docs]
        if self.token_min_filter != 1 or self.token_max_filter != 1.0:
            if self.token_min_filter < 0:
                self.token_min_filter = abs(self.token_min_filter)
            else:
                self.token_min_filter = int(
                    len(corpus) * self.token_min_filter)

            if self.token_max_filter < 0:
                self.token_max_filter = abs(
                    self.token_max_filter) / len(corpus)

            self.dictionary.filter_extremes(no_below=self.token_min_filter,
                                            no_above=self.token_max_filter,
                                            keep_n=None)

        if self.tfidf:
            self.model = TfidfModel(corpus)
        else:
            self.model = None
Example #35
 def __init__(self, documents):
     self.documents = documents
     self.texts = [[word for word in document.lower().split()]
                   for document in documents]
     self.dictionary = corpora.Dictionary(self.texts)
     self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
     self.tfidf = TfidfModel(self.corpus)
     self._make_random_indexing()
     print "initialized!"
Example #36
 def tf_idf(dataSeg_save):
     corpus = pd.read_csv(dataSeg_save,header=None)[0]
     texts = [sentence.split(' ') for sentence in corpus]
     dictionary = corpora.Dictionary(texts)
     corpus = [dictionary.doc2bow(text) for text in texts]
     tf_idf_model = TfidfModel(corpus, normalize=False)
     word_tf_tdf = list(tf_idf_model[corpus])
     # print('dictionary:', dictionary.token2id)
     # print('term counts:', corpus)
     # print('tf-idf weights:', word_tf_tdf)
     return word_tf_tdf,dictionary.token2id
Example #37
    def __init__(self, analyzed_items_path=None, dictionary_path=None,
                 corpus_path=None, tfidf_model_path=None):
        if dictionary_path:
            self.dictionary = Dictionary.load(dictionary_path)
        else:
            self.dictionary = None

        if analyzed_items_path:
            self.analyzed_items_path = analyzed_items_path
        else:
            self.analyzed_items_path = None

        if corpus_path:
            self.corpus = MmCorpus(corpus_path)
        else:
            self.corpus = None

        if tfidf_model_path:
            self.tfidf_model = TfidfModel.load(tfidf_model_path)
        else:
            self.tfidf_model = None
Example #38
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    working_corpus = path.join(base_path, p['corpus_path'], p['corpus_name'])
    human_data_file = path.join(base_path, p['human_data_file'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(base_path,
                                           p['corpus_path'],
                                           p['dict_name']))
    Dictionary.save(dictionary, path.join(output_dir, p['dict_name']))
    logger.info(dictionary)

    logger.info('loading corpus')
    corpus_bow = MmCorpus(working_corpus)

    logger.info("create preprocessing model and save it to disk")
    if p['pre_model'] == 'tfidf':
        pre_model = TfidfModel(corpus_bow, id2word=dictionary, normalize=True)
    elif p['pre_model'] == 'log_ent':
        pre_model = LogEntropyModel(corpus_bow,
                                    id2word=dictionary, normalize=True)
    else:
        raise ValueError('model parameter %s not known' % p['pre_model'])
    pre_model.save(os.path.join(output_dir, p['pre_model_extension']))

    logger.info('initialize LSI model')
    lsi = models.LsiModel(pre_model[corpus_bow],
                          id2word=dictionary, num_topics=p['num_topics'])
    lsi.save(os.path.join(output_dir, p['lsi_extension']))
    logger.info('finished --> lsi model saved to: %s' %
                os.path.join(output_dir, p['lsi_extension']))

    # check for correlation with lee human data
    logger.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text,
                                        allow_update=False,
                                        return_missing=False)
                    for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (LSI)')
    corpus_lsi = lsi[pre_model[bow_lee_texts]]

    # # compute pairwise similarity matrix of transformed corpus
    sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            sim_matrix[i, j] = matutils.cossim(par1, par2)
    sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(human_data_file)
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    # compute correlations
    cor = np.corrcoef(sim_vector, human_sim_vector)
    logger.info("correlation with lee human data: %f" % cor[0, 1])

    dif = start - datetime.now()
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
Example #39
    def construct_tfidf_model(self, model_path):
        model = TfidfModel(self.corpus)
        model.save(model_path)

        return model
Example #40
 def fit(self, documents, labels=None):
     self.lexicon = Dictionary(documents)
     self.tfidf = TfidfModel([self.lexicon.doc2bow(doc) for doc in documents], id2word=self.lexicon)
     self.save()
     return self
Example #41
	def load_tfidf_model(self, filename='../data/models/tfidf_model'):
		self.tfidf_model = TfidfModel.load(filename)
Example #42
import json
import os

from textblob import TextBlob
from gensim.corpora import Dictionary
from gensim.models import TfidfModel


class JsonCorpus(object):
    def __iter__(self):
        data = json.load(open('data/nasa.json'))

        desc = [TextBlob(dataset['description'].lower()).tokens for dataset in data['dataset']]

        self.dictionary = Dictionary(desc)

        for d in desc:
            yield self.dictionary.doc2bow(d)


def score(text, tfidf, dictionary):
    return tfidf[dictionary.doc2bow(TextBlob(text.lower()).tokens)]


if __name__ == '__main__':
    if os.path.exists('tfidf.pkl') and os.path.exists('nasa_dictionary.pkl'):
        tfidf = TfidfModel.load('tfidf.pkl')
        dictionary = Dictionary.load('nasa_dictionary.pkl')
    else:
        corpus = JsonCorpus()
        bow = list(corpus)  # one full pass populates corpus.dictionary
        corpus.dictionary.save('nasa_dictionary.pkl')
        dictionary = corpus.dictionary
        tfidf = TfidfModel(bow, dictionary=corpus.dictionary)
        tfidf.save('tfidf.pkl')

    print(score('project completed', tfidf=tfidf, dictionary=dictionary))