Example #1
def genCorpus(corpus_name, f_inputs=[], word_count=20, word_doc_freq=0.3):
    """
    Function
        generate corpus (bow representation of documents, id2words) for topic models(e.g. LDA)
    Input Args
        f_inputs
            type: list of filenames
            format: ['file1', 'file2', ...]
            desc: files to be converted to corpus
        corpus_name
            type: str
            desc: corpus path to be stored
    Return
        gensim_docBow
            type: list
            format: [ [doc1 bag of words],
                      [doc2 bag of words],
                     ]
        id2word
            type: gensim.corpora.Dictionary
    """
    if not path.exists(corpus_name):
        docs = SentenceIter(*f_inputs)
        wordDict = dictionary.Dictionary(documents=docs, prune_at=None)
        wordDict.filter_extremes(
            no_below=word_count, no_above=word_doc_freq
        )  # drop tokens appearing in fewer than word_count documents or in more than word_doc_freq (fraction) of documents
        corpus_docBow = [wordDict.doc2bow(doc) for doc in docs]
        with open(corpus_name, 'wb') as fo:
            pickle.dump([corpus_docBow, wordDict], fo)
    else:
        with open(corpus_name, 'rb') as fi:
            corpus_docBow, wordDict = pickle.load(fi)
    return corpus_docBow, wordDict
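A minimal usage sketch (hedged): the file names below are hypothetical, and SentenceIter is assumed to yield one tokenized document per input file.

# Hypothetical call; 'corpus.pkl', 'a.txt' and 'b.txt' are illustrative names.
corpus_docBow, wordDict = genCorpus('corpus.pkl', f_inputs=['a.txt', 'b.txt'])
print(len(wordDict), 'tokens kept;', len(corpus_docBow), 'documents vectorized')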
Example #2
    def fit(self, training, training_info):
        # store training sets
        self.training = training
        self.training_info = training_info

        print("creating train tokens")
        train_tokens = training_info["tokens"].apply(
            lambda tokens: tokens.split(" ")).values.tolist()
        print("creating train dict")
        train_my_dict = dictionary.Dictionary(train_tokens)
        print("creating train corpus")
        train_corpus = [train_my_dict.doc2bow(token) for token in train_tokens]
        print("training Lsi model")
        if os.path.isfile('temp/model.lsi') and self.use_pretrained_model:
            self.lsi = models.LsiModel.load('temp/model.lsi')
        else:
            self.lsi = models.LsiModel(train_corpus,
                                       id2word=train_my_dict,
                                       num_topics=500)
            self.lsi.save('temp/model.lsi')
        print("creating train Lsi matrix")
        self.lsi_train_matrix = np.array(
            [self.lsi[document] for document in train_corpus])

        self.address_books = create_address_books(training, training_info)
        self.mids_sender_recipient = create_dictionary_mids(
            training, training_info)
Example #3
File: tfidf.py Project: mr-ma/sip-ml
 def __init__(self, dataframe):
     self.dataframe = dataframe
     self.tokens = dictionary.Dictionary()
     # Retrieve tokens from documents, populating the tokens dictionary
     self.dataframe = self.dataframe.apply(self.get_tokens)
     #print(tabulate(self.dataframe, headers='keys', tablefmt='psql'))
     print(self.dataframe)
     self.dataframe.apply(self.dump_tokens)
Example #4
    def __init__(self, docs):
        self.documents = docs
        self.tokens = dictionary.Dictionary()
        # Retrieve tokens from documents, populating the tokens dictionary

        # Get tokens from the documents and add them to the corpus
        for doc in self.documents:
            content = [[word for word in open(doc).read().lower().split() if word not in [",","%","(",")",",",":","\n","$"]]]
            self.tokens.add_documents(content)
        print ("[*] Retrieved %s tokens from %s documents in the corpus" % (len(self.tokens), len(self.documents)))
Example #5
 def testBuild(self):
     d = dictionary.Dictionary(self.texts)
     expected = {0: 2, 1: 2, 2: 2, 3: 2, 4: 2, 5: 3, 6: 2, 7: 3, 8: 2, 9: 3, 10: 3, 11: 2}
     self.assertEqual(d.docFreq, expected)
     expected = {'computer': 0, 'eps': 8, 'graph': 10, 'human': 1, 'interface': 2,
                 'minors': 11, 'response': 3, 'survey': 4, 'system': 5,
                 'time': 6, 'trees': 9, 'user': 7}
     self.assertEqual(d.token2id, expected)
     expected = dict((v, k) for k, v in expected.items())
     self.assertEqual(d.id2token, expected)
Example #6
 def __init__(self, docs):
     self.documents = docs
     self.tokens = dictionary.Dictionary()
     # Retrieve tokens from documents, populating the tokens dictionary
     for doc in self.documents:
         content = []
         for c in [
                 word for word in open(doc).read().lower().split()
                 if word not in [",", "%", "(", ")", ",", ":", "\n", "$"]
         ]:
             if re.match(r'^[A-Za-z0-9();<>:./-]+$', c):  # hyphen placed last so it matches literally
                 content.append(c)
         self.tokens.add_documents([content])
     print "[*] Retrieved %s tokens from %s documents in the corpus" % (len(
         self.tokens), len(self.documents))
Example #7
def filter_tokenlist():
    id2token_dictionary = dictionary.Dictionary.load_from_text(corpus_path)
    token2id = {}
    print(len(id2token_dictionary))
    tokenTuples = [
        tuple(map(int, i.split(' '))) for i in open(feature_mapping_path)
    ]
    # with open('/home/miriam/malwaredetection/utils/source_codes/token_filtered_new.txt', 'w+') as fh:
    for i, token in id2token_dictionary.items():
        if re.match(r'^[A-Za-z0-9();<>:./-]+$', token):  # hyphen placed last so it matches literally
            # fh.write(token.encode('utf-8') + '\n')
            token2id[token] = i
    tokens_filtered = dictionary.Dictionary()
    tokens_filtered.token2id = token2id
    print(tokens_filtered)
    print()
Example #8
def vectorize(ori_data, train_file_path, test_file_path):
    train_data = ori_data['train']
    corpus = []
    for label, data in train_data:
        corpus.append(data)
    dic = dictionary.Dictionary(corpus)
    corpus = [dic.doc2bow(doc) for doc in corpus]
    lda = ldamodel.LdaModel(corpus, passes=40, num_topics=80)
    #training data
    fp = open(train_file_path, 'w')
    for label, data in train_data:
        vec = lda[dic.doc2bow(data)]
        fp.write("%s %s\n" % (label, sparse(vec)))
    fp.close()
    fp = open(test_file_path, 'w')
    for label, data in ori_data['test']:
        vec = lda[dic.doc2bow(data)]
        fp.write("%s %s\n" % (label, sparse(vec)))
    fp.close()
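The sparse helper used here (and in Example #9 below) is not shown; a minimal sketch, assuming it flattens a gensim sparse vector into "index:value" pairs:

# Assumed helper, not part of the original snippet.
def sparse(vec):
    return " ".join("%d:%f" % (idx, value) for idx, value in vec)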
Example #9
def vectorize(ori_data, train_file_path, test_file_path):
    train_data = ori_data['train']
    corpus = []
    for label, data in train_data:
        corpus.append(data)
    dic = dictionary.Dictionary(corpus)
    corpus = [dic.doc2bow(doc) for doc in corpus]
    tfidf = tfidfmodel.TfidfModel(corpus)
    #training data
    fp = open(train_file_path, 'w')
    for label, data in train_data:
        vec = tfidf[dic.doc2bow(data)]
        fp.write("%s %s\n" % (label, sparse(vec)))
    fp.close()
    fp = open(test_file_path, 'w')
    for label, data in ori_data['test']:
        vec = tfidf[dic.doc2bow(data)]
        fp.write("%s %s\n" % (label, sparse(vec)))
    fp.close()
Example #10
def get_tfidf(files, update):
    if update:
        tokenizer, lemmatizer, stop = RegexpTokenizer(r'[a-z]{4,}'), WordNetLemmatizer(), set(stopwords.words('english'))
        bug_list = bugs_exceeding_count(files, 10)
        replacements = get_replacements(bug_list)
        with open(files.replacements, 'w') as outf:
            json.dump(replacements, outf)
        with open(files.abstracts) as documents:
            word_dict = dictionary.Dictionary(
                get_words(id_abstract, replacements, tokenizer, lemmatizer) for id_abstract in documents)
            stop_ids = [word_dict.token2id[w] for w in stop if w in word_dict.token2id]
            word_dict.filter_tokens(bad_ids=stop_ids)  # get rid of stop words (filter_tokens expects ids, not strings)
            word_dict.filter_extremes()  # filter out very rare and overly common tokens (gensim defaults)
            documents.seek(0) # reset to beginning of file iterator
            corpus = (word_dict.doc2bow(get_words(id_abstract, replacements, tokenizer, lemmatizer)) for
                      id_abstract in documents)
            mmcorpus.MmCorpus.serialize(files.corpus, corpus)
        tfidf_model = models.tfidfmodel.TfidfModel(dictionary=word_dict, id2word=word_dict)
        tfidf_model.save(files.tfidf)
    tfidf_model = models.tfidfmodel.TfidfModel.load(files.tfidf)
    return tfidf_model
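A hedged follow-up sketch: reloading the serialized corpus and applying the returned model to each bag-of-words document (it reuses the files object from above and assumes the model and corpus have already been saved).

tfidf_model = get_tfidf(files, update=False)
for bow in mmcorpus.MmCorpus(files.corpus):  # the corpus serialized during the update pass
    weights = tfidf_model[bow]               # [(token_id, tf-idf weight), ...]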
Example #11
def similarity(folder_path, stop_word_file, user_dict, cos_result_file,
               sim_result_file):
    print("start time is :",
          time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    files = load_files(folder_path)
    words = [cut(file, stop_word_file, user_dict) for file in files]

    # Build the dictionary and the bag-of-words corpus
    doc_dict = dictionary.Dictionary(words)
    doc_corpus = [doc_dict.doc2bow(item) for item in words]

    # Get the feature count from token2id (the number of keys in the dictionary)
    tfidf, index = tfidf_calc(doc_corpus, len(doc_dict.token2id.keys()))
    log = idx_out(files, index)

    # sims = simhash(words, doc_corpus)
    sims = simhash_tfidf(words, tfidf)
    log1 = sim_out(files, sims)

    write_file(cos_result_file, log)
    write_file(sim_result_file, log1)
    print("end time is :", time.strftime("%Y-%m-%d %H:%M:%S",
                                         time.localtime()))
Example #12
from gensim.utils import to_unicode
from gensim.interfaces import TransformedCorpus
from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus,
                            ucicorpus, malletcorpus, textcorpus, indexedcorpus, dictionary)
from gensim.models import (tfidfmodel, word2vec, ldamodel)

print('start')
train_set=[]
for line in open('articles.txt'):
	items = line.strip().split('\t', 1)
	if len(items) < 2:
		continue
	words = items[1].strip().split(' ')
	train_set.append(words)

print('construct dict')
dic = dictionary.Dictionary(train_set)
print('doc2bow')
corpus = [dic.doc2bow(text) for text in train_set]
print('tfidf')
tfidf = tfidfmodel.TfidfModel(corpus)
print('tfidf corpus')
corpus_tfidf = tfidf[corpus]
print('lda model')
lda = ldamodel.LdaModel(corpus_tfidf, id2word=dic, num_topics=1000, iterations=1300, alpha=0.15, eta=0.01)
print('lda corpus')
corpus_lda = lda[corpus_tfidf]

lda.save('lda_model')
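A short follow-up sketch: loading the saved model back and printing a few topics.

lda = ldamodel.LdaModel.load('lda_model')
for topic_id, topic in lda.show_topics(num_topics=5, num_words=8):
    print(topic_id, topic)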

Example #13
 def testFilter(self):
     d = dictionary.Dictionary(self.texts)
     d.filterExtremes(noBelow = 2, noAbove = 1.0, keepN = 4)
     expected = {0: 3, 1: 3, 2: 3, 3: 3}
     self.assertEqual(d.docFreq, expected)
Example #14
 def get_dic(self):
     """
     Creates a gensim dictionary and return it
     :return: Gensim dictionary
     """
     return dictionary.Dictionary(self.tokens)
Example #15
import logging
import os
import sys
from glob import iglob

logging.basicConfig(filename='build_corpus_and_dictionary.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)), os.path.pardir))

from gensim.corpora import dictionary

import settings
from models import tokenizer
from models import corpus


punct_tags = list(',.:)"') + ['CD', 'IN']

pt_tokenizer = tokenizer.PretaggedTokenizer(stopword_list=None, filter_tags=punct_tags)
lj_corpus = corpus.LazyJSONCorpus(tokenizer=pt_tokenizer, dictionary=None, path_to_text="tagged")

glob_pattern = os.path.join(settings.PROC_DIR, '*.json')
#glob_pattern = os.path.join(settings.PROC_DIR, '60182*.json')
lj_corpus.glob_documents(glob_pattern)
with open(os.path.join(settings.PERSIST_DIR, 'document_index'), 'w') as fout:
    for floc in iglob(glob_pattern):
        doc_id = os.path.basename(floc).split('.')[0]
        fout.write(doc_id+'\n')

my_dict = dictionary.Dictionary(lj_corpus)


lj_corpus.dictionary = my_dict

my_dict.save(os.path.join(settings.PERSIST_DIR, 'my_dict'))
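Later runs can reload the persisted dictionary instead of rebuilding it:

my_dict = dictionary.Dictionary.load(os.path.join(settings.PERSIST_DIR, 'my_dict'))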
Example #16
    def predict(self, test, test_info):

        print("creating test tokens")
        test_tokens = test_info["tokens"].apply(
            lambda tokens: tokens.split(" ")).values.tolist()
        print("creating test dictionnary")
        test_my_dict = dictionary.Dictionary(test_tokens)
        print("creating test corpus")
        test_corpus = [test_my_dict.doc2bow(token) for token in test_tokens]
        print("creating lsi test matrix")
        lsi_test_matrix = np.array([self.lsi[doc] for doc in test_corpus])

        print("prediction per sender")
        predictions_per_sender = {}
        for nb_done, row in test.iterrows():
            print("Progression: %f" % (nb_done / len(test)))
            # retrieve sender attributes
            sender = row[0]
            mids_sender = self.training[self.training["sender"] ==
                                        sender]["mids"].values[0]
            mids_sender = np.array(mids_sender.split(" "), dtype=int)
            position_mails_training = self.training_info[
                self.training_info["mid"].isin(mids_sender)].index.values
            lsi_mails_sender = self.lsi_train_matrix[position_mails_training]

            index = similarities.MatrixSimilarity(lsi_mails_sender)

            # This dictionary is used to recover the positions in the
            # training set from the position in the similarity matrix
            dict_Sim_Training = dict(
                zip(position_mails_training,
                    range(len(position_mails_training))))

            # get IDs of the emails for which recipient prediction is needed
            mids_predict = np.array(row[1].split(" "), dtype=int)

            # initialize list to store predictions
            lsi_preds = []

            for mid_predict in mids_predict:
                # get the position of current mail in test_info dataset
                position_mail_predict = test_info[test_info["mid"] ==
                                                  mid_predict].index.values
                lsi_mail_predict = lsi_test_matrix[position_mail_predict]
                #sims: similarity score ordered by untrained document id
                sims = index[lsi_mail_predict][0]

                scores = []
                for recipient, nb_occurrences in self.address_books[sender]:
                    mids_recipient = self.mids_sender_recipient[(sender,
                                                                 recipient)]
                    positions_mids_recipient = self.training_info[
                        self.training_info["mid"].isin(
                            mids_recipient)].index.values
                    ind_sim = np.array([
                        dict_Sim_Training[ind]
                        for ind in positions_mids_recipient
                    ])
                    similarities_recipient = sims[ind_sim]
                    scores.append((recipient, similarities_recipient.mean()))

                # sort the scores and get the 10 recipients with higher scores
                prediction = [
                    recipient for recipient, score in sorted(
                        scores, key=lambda elt: elt[1], reverse=True)
                    [:self.nb_recipients_to_predict]
                ]

                lsi_preds.append(prediction)

            predictions_per_sender[sender] = [mids_predict, lsi_preds]

        return predictions_per_sender
Example #17
def prune_dict(docs, lower=0.1, upper=0.9):
    dicti = dictionary.Dictionary(docs)
    lower *= len(docs)
    dicti.filter_extremes(no_above=upper, no_below=int(lower))
    return dicti
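A brief usage example with illustrative token lists:

docs = [['human', 'computer', 'interface'],
        ['graph', 'trees'],
        ['graph', 'minors', 'trees']]
pruned = prune_dict(docs, lower=0.1, upper=0.9)
print(pruned.token2id)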
Example #18
def corpus(words):
    doc_dict = dictionary.Dictionary(words)
    doc_corpus = [doc_dict.doc2bow(doc) for doc in words]  # one bag-of-words vector per tokenized document
    return doc_corpus
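For instance, with two illustrative tokenized documents:

words = [['human', 'computer', 'interface'],
         ['survey', 'user', 'computer', 'system']]
print(corpus(words))  # one bag-of-words vector per document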