Example #1
def build_dictionary(db):
    dictionary = Dictionary()
    for article in db.articles.find():
        dictionary.doc2bow(article['clean_text'], allow_update=True)
    # print dictionary
    # dictionary.save('data/cnn.dict') # store the dictionary, for future reference
    return dictionary
class TermFrequency(object):
    """ Computes a term frequency distance_matrix
    """
    def __init__(self, documents):
        logging.log(logging.INFO, "Creating Term Frequency")
        
        self.id2Word = Dictionary(documents)
        self.num_unique_words = len(self.id2Word)
        self.distance_matrix = self.to_term_frequency_matrix(documents)

    def to_term_frequency_vector(self, document):
        return self.id2Word.doc2bow(document)


    def to_binary_vector(self, document):
        tf = self.id2Word.doc2bow(document)
        vect = sparse2full(tf, len(self.id2Word.keys()))
        return np.array(vect > 0, dtype=int)  # converts to binary

    def to_term_frequency_matrix(self, documents):
        return [self.to_term_frequency_vector(d) for d in documents]

    def binary_matrix(self):
        """ Turns a regular tf distance_matrix into a binary distance_matrix """
        def get_binary_data(val):
            if val <= 0:
                return 0
            return 1
       
        full_matrix = MatrixHelper.gensim_to_python_mdarray(self.distance_matrix, self.num_unique_words)
        return [[get_binary_data(cell)
                for cell in row]
                for row in full_matrix]
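
A minimal usage sketch for the class above, assuming logging, numpy (as np), gensim's Dictionary and matutils.sparse2full are imported as in the snippet; the token lists are hypothetical:

# hypothetical pre-tokenised documents
docs = [["cat", "sat", "mat"], ["cat", "cat", "dog"]]
tf = TermFrequency(docs)
print(tf.to_term_frequency_vector(["cat", "dog"]))  # sparse list of (token_id, count) pairs
print(tf.to_binary_vector(["cat", "cat", "dog"]))   # dense 0/1 numpy vector over the vocabulary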
Example #3
class LDA(Step):

    def __init__(self, num_topics):
        self._model = None
        self._dictionary = None
        self._n_topics = num_topics

    def fit(self, filename):
        contents = [x for _, x in Reader(filename)]
        self._dictionary = Dictionary(contents)
        corpus = [self._dictionary.doc2bow(text) for text in contents]
        self._model = LdaModel(corpus, num_topics=self._n_topics)

    def transform(self, filename):
        uuids, vectors = self._transform(filename)
        return uuids, vectors

    def _transform(self, filename):
        vectors = []
        uuids = []
        for uuid, tokens in Reader(filename):
            bow = self._dictionary.doc2bow(tokens)
            lda_probs = {dim: prob for dim, prob in self._model[bow]}
            lda_vec = [lda_probs.get(i, 0) for i in range(self._n_topics)]
            vectors.append(lda_vec)
            uuids.append(uuid)
        return uuids, np.array(vectors)

    @classmethod
    def _read(cls, filename):
        for uuid, tokens in Reader(filename):
            yield ' '.join(tokens)
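
A hedged usage sketch of the step above; Reader(filename) yielding (uuid, tokens) pairs is an assumption carried over from the snippet, and the file names are placeholders:

step = LDA(num_topics=10)
step.fit("train.tokens")                        # hypothetical tokenised training file
uuids, vectors = step.transform("test.tokens")  # hypothetical test file
print(vectors.shape)                            # (number_of_documents, 10)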
Example #4
File: G8.py  Project: lum4chi/IR
def do_ir2(db, param):
    print('Computing IR2', db, param, '...')

    def words(text):
        stopwords = set(nltk.corpus.stopwords.words('english'))
        return [w for w in nltk.word_tokenize(text.lower()) if w not in string.punctuation and w not in stopwords]

    class BigramsCorpus:
        def __init__(self, db, collection):
            self.client = MongoClient()[db][collection]

        def __iter__(self):
            for doc in self.client.find():
                yield [doc['_id']]

        def __len__(self):
            return self.client.count()

    bigram_corpus = BigramsCorpus('cordis', 'bi_grams')
    bigrams = Dictionary(bigram_corpus)

    project ={'$project': {'_id': 0, 'title': 1, 'reference': 1}}
    a = [project]
    project_corpus = MongoCorpus('cordis', 'projects', aggregate=a)

    n = max(bigrams.keys())
    dataset = []

    for doc in project_corpus:
        temp = bigrams.doc2bow([' '.join(x) for x in nltk.bigrams(words(doc['title']))])
        x = [0]*(n+1)
        for bi, _ in temp:
            x[bi] = 1
        dataset.append(x)

    alg = KMeans(n_clusters=int(param))
    alg.fit(dataset)

    clusters = defaultdict(list)
    for i, doc in enumerate(project_corpus):
        temp = bigrams.doc2bow([' '.join(x) for x in nltk.bigrams(words(doc['title']))])
        x = [0]*(n+1)
        for bi, _ in temp:
            x[bi] = 1
        p = alg.predict([x])
        clusters[p[0]].append(doc['reference'])

    mongo_clusters = []
    for k, v in clusters.items():
        # cast the numpy cluster label to a plain int, otherwise Mongo raises:
        # InvalidDocument: Cannot encode object: 0
        mongo_clusters.append({'cluster': int(k), 'projects': v})

    print(mongo_clusters)
    # Save to a Mongo collection
    mongo = MongoClient()['g8']['ir2']
    mongo.insert_many(mongo_clusters)
    print('Done!')
    def test_doc2bow(self):
        d = Dictionary([["žluťoučký"], ["žluťoučký"]])

        # pass a utf8 string
        self.assertEqual(d.doc2bow(["žluťoučký"]), [(0, 1)])

        # doc2bow must raise a TypeError if passed a string instead of array of strings by accident
        self.assertRaises(TypeError, d.doc2bow, "žluťoučký")

        # unicode must be converted to utf8
        self.assertEqual(d.doc2bow([u'\u017elu\u0165ou\u010dk\xfd']), [(0, 1)])
Example #6
def build_corpora(db):
    dictionary = Dictionary()
    corpus = []
    for article in db.articles.find():
        text = article['clean_text']
        dictionary.doc2bow(text, allow_update=True)
    dictionary.filter_extremes()
    for article in db.articles.find():
        text = article['clean_text']
        corpus.append(dictionary.doc2bow(text))
    gensim.corpora.MmCorpus.serialize('data/corpus.mm', corpus)
    dictionary.save('data/cnn.dict')
    return corpus, dictionary
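
The serialized artifacts can later be streamed back without rebuilding them; a small sketch using the paths written by build_corpora:

from gensim.corpora import Dictionary, MmCorpus

dictionary = Dictionary.load('data/cnn.dict')
corpus = MmCorpus('data/corpus.mm')  # lazily streamed from disk
print(len(dictionary), 'unique tokens,', len(corpus), 'documents')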
class MyCorpus(object):
    def __init__(self, input_file, K):
        self.K = K
        self.input_file = input_file
        self.dictionary = Dictionary()
        with open(input_file, "rt") as f:
            for line in f:
                self.dictionary.add_documents([line.split()])
        self.dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=K)

    def __iter__(self):
        with open(self.input_file, "rt") as f:
            for line in f:
                yield self.dictionary.doc2bow(line.rstrip().split())

    def __str__(self):
        s = "MyCorpus(" + str(self.dictionary.num_docs) + " documents, "
        s += str(len(self.dictionary.keys())) + " features, "
        s += str(self.dictionary.num_nnz) + " non-zero entries)"
        return s

    def __repr__(self):
        return "MyCorpus('" + self.input_file + "', " + str(self.K) + ")"
Example #8
    def similarity_matrix(self):
        """Test similarity_matrix returns expected results."""

        corpus = [["government", "denied", "holiday"], ["holiday", "slowing", "hollingworth"]]
        dictionary = Dictionary(corpus)
        corpus = [dictionary.doc2bow(document) for document in corpus]

        # checking symmetry and the existence of ones on the diagonal
        similarity_matrix = self.similarity_matrix(corpus, dictionary).todense()
        self.assertTrue((similarity_matrix.T == similarity_matrix).all())
        self.assertTrue((np.diag(similarity_matrix) == 1).all())

        # checking that thresholding works as expected
        similarity_matrix = self.similarity_matrix(corpus, dictionary, threshold=0.45).todense()
        self.assertEqual(18, np.sum(similarity_matrix == 0))

        # checking that exponent works as expected
        similarity_matrix = self.similarity_matrix(corpus, dictionary, exponent=1.0).todense()
        self.assertAlmostEqual(9.5788956, np.sum(similarity_matrix))

        # checking that nonzero_limit works as expected
        similarity_matrix = self.similarity_matrix(corpus, dictionary, nonzero_limit=4).todense()
        self.assertEqual(4, np.sum(similarity_matrix == 0))

        similarity_matrix = self.similarity_matrix(corpus, dictionary, nonzero_limit=3).todense()
        self.assertEqual(20, np.sum(similarity_matrix == 0))
def main():
    collection_name = "nips"
    years = range(2010, 2015)  # 2010 .. 2014
    n_topics = 10

    corpus_paths = ["data/{}-{}.dat".format(collection_name, y)
                    for y in years]
    all_corpus = []
    year2corpus = {}
    for year, path in zip(years, corpus_paths):
        corpus = list(load_line_corpus(path))
        all_corpus.append(proc_corpus(corpus))
        year2corpus[year] = corpus

    all_corpus = list(itertools.chain.from_iterable(all_corpus))

    dictionary = Dictionary(all_corpus)
    all_corpus = [dictionary.doc2bow(doc)
                  for doc in all_corpus]

    import pdb
    pdb.set_trace()

    # print all_corpus
    model = LdaModel(all_corpus, num_topics=n_topics,
                     id2word=dictionary,
                     eval_every=10, passes=100)
    print(model.show_topics())
Example #10
def process_text(corpus, stoplist=None, bigrams=None, trigrams=None, keep_all=False, no_below=10, no_above=0.8):
    """
    Extracts text data from the corpus
    Cleans and tokenizes text data
    Computes most frequent phrases, creates a dictionary and converts the corpus to a BOW model
    :param corpus:
    :return: processed corpus with phrases, dictionary and BOW corpus
    """

    logging.info("Cleaned and tokenzed dataset")
    text_dataset = clean_and_tokenize(corpus, stoplist=stoplist, keep_all=keep_all)

    if bigrams is not None:
        bi_grams = Phrases(text_dataset, threshold=bigrams, min_count=no_below)
        text_dataset = bi_grams[text_dataset]
    elif trigrams is not None:
        bi_grams = Phrases(text_dataset, min_count=no_below)  # bigrams is None in this branch, so use the default threshold for the intermediate pass
        tri_grams = Phrases(bi_grams[text_dataset], threshold=trigrams)
        text_dataset = tri_grams[bi_grams[text_dataset]]

    dictionary = Dictionary(text_dataset)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    bow_corpus = [dictionary.doc2bow(text) for text in text_dataset]

    return text_dataset, dictionary, bow_corpus
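
A hedged call sketch: clean_and_tokenize and Phrases come from the snippet's own context, and the raw strings, stoplist and thresholds here are placeholders:

raw_docs = ["first raw document text", "second raw document text"]
texts, dictionary, bow_corpus = process_text(raw_docs, stoplist={"the", "a"},
                                             no_below=1, no_above=1.0)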
Example #11
    def test_from_corpus(self):
        """build `Dictionary` from an existing corpus"""

        documents = ["Human machine interface for lab abc computer applications",
                "A survey of user opinion of computer system response time",
                "The EPS user interface management system",
                "System and human system engineering testing of EPS",
                "Relation of user perceived response time to error measurement",
                "The generation of random binary unordered trees",
                "The intersection graph of paths in trees",
                "Graph minors IV Widths of trees and well quasi ordering",
                "Graph minors A survey"]
        stoplist = set('for a of the and to in'.split())
        texts = [[word for word in document.lower().split() if word not in stoplist]
                for document in documents]

        # remove words that appear only once
        all_tokens = sum(texts, [])
        tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
        texts = [[word for word in text if word not in tokens_once]
                for text in texts]
        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        dictionary_from_corpus = Dictionary.from_corpus(corpus)

        # we have to compare values because, when creating a dictionary from a corpus,
        # information about the words themselves is lost
        dict_token2id_vals = sorted(dictionary.token2id.values())
        dict_from_corpus_vals = sorted(dictionary_from_corpus.token2id.values())
        self.assertEqual(dict_token2id_vals, dict_from_corpus_vals)
        self.assertEqual(dictionary.dfs, dictionary_from_corpus.dfs)
        self.assertEqual(dictionary.num_docs, dictionary_from_corpus.num_docs)
        self.assertEqual(dictionary.num_pos, dictionary_from_corpus.num_pos)
        self.assertEqual(dictionary.num_nnz, dictionary_from_corpus.num_nnz)
    def produce(self):        

        print('Getting src docs')
        docs = []
        doctokens = [] # aka Gensim's "text"
        stopwords = nltk.corpus.stopwords.words('english')
        for doc in self.src_doc_generator():
            (doc_id,doc_label,doc_str) = doc
            docs.append(doc)
            doctokens.append([token for token in nltk.word_tokenize(doc_str) if token not in stopwords])
            if len(docs) % 1000 == 0: print(len(docs))
                
        print('Creating the dictionary')
        dictionary = Dictionary(doctokens)
        #dictionary.compactify()
        #dictionary.filter_extremes(keep_n=None)
        if self.dictfile:
            dictionary.save_as_text(self.dictfile+'.dict', sort_by_word=True)

        with self.dbi as db:

            print('Creating WORD') # aka Gensim's "dictionary"
            db.create_table('word')
            for word_id, word_str in dictionary.iteritems():
                db.cur.execute('INSERT INTO word (word_id, word_str) VALUES (?,?)',(word_id,word_str))
            
            print('Creating DOC and DOCWORD')
            db.create_table('doc')
            db.create_table('docword')
            for doc_idx, doc in enumerate(docs):
                db.cur.execute('INSERT INTO doc (doc_index,doc_id,doc_label,doc_str ) VALUES (?,?,?,?)',(doc_idx,doc[0],doc[1],doc[2]))
                doc_id = doc[0]
                for word_id, word_count in (dictionary.doc2bow(doctokens[doc_idx])):
                    word_str = dictionary.get(word_id) # Is this valid? I believe it is.
                    db.cur.execute('INSERT INTO docword (doc_index,doc_id,word_id,word_str,word_count) VALUES (?,?,?,?,?)',(doc_idx,doc_id,word_id,word_str,word_count))
Example #13
File: util.py  Project: Badodon/FFNN
def load_data(fname):
    
    print('input file name:', fname)

    target = []  # labels
    source = []  # document vectors

    # build the list of documents
    document_list = []
    for l in open(fname, 'r').readlines():
        sample = l.strip().split(' ', 1)
        label = sample[0]
        target.append([label])  # label
        word_list = preprocess_string(sample[1])  # stopword removal, stemming
        document_list.append(word_list)  # word list per document

    # build the dictionary, dropping very rare and very frequent words
    dct = Dictionary(document_list)
    dct.filter_extremes(no_below=3, no_above=0.6)

    # vectorize each document as a dense BOW vector
    for doc in document_list:
        tmp = dct.doc2bow(doc)  # e.g. [(4, 1), (23, 1), ..., (119, 2)]
        dense = list(matutils.corpus2dense([tmp], num_terms=len(dct)).T[0])
        source.append(dense)

    dataset = {}
    dataset['target'] = np.array(target)    
    dataset['source'] = np.array(source)    

    return dataset #, max_len, width
Example #14
    def test_corpus_summarization(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
            text = f.read()

        # Generate the corpus.
        sentences = text.split("\n")
        tokens = [sentence.split() for sentence in sentences]
        dictionary = Dictionary(tokens)
        corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

        # Extract the most important documents.
        selected_documents = summarize_corpus(corpus)

        # They are compared to the method reference.
        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.summ.txt"), mode="r") as f:
            summary = f.read()
            summary = summary.split('\n')

        # Each sentence in the document selection has to be in the model summary.
        for doc_number, document in enumerate(selected_documents):
            # Retrieves all words from the document.
            words = [dictionary[token_id] for (token_id, count) in document]

            # Asserts that all of them are in a sentence from the model reference.
            self.assertTrue(any(all(word in sentence for word in words) for sentence in summary))
Example #15
def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
    print('Building dictionary...')
    dictionary = Dictionary(docs)
    # remove stopwords
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    # get ids for short words, len(word) <= 3
    shortword_ids = [tokenid for tokenid, word in dictionary.iteritems() if len(word.split('/')[0]) <= 3]
    dictionary.filter_tokens(shortword_ids)
    dictionary.compactify()
    # remove words that appear only once
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()
    # filter extreme values
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus
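
The returned pair plugs straight into a topic model; a minimal sketch, assuming docs is a list of token lists and LdaModel is imported from gensim.models:

from gensim.models import LdaModel

dictionary, corpus = prep_corpus(docs)  # docs: list of token lists (assumed)
lda = LdaModel(corpus, id2word=dictionary, num_topics=20, passes=5)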
Example #16
def bag_of_words(lemma):
    "Takes in lemmatised words and returns a bow."
    # Create a bag of words from the dictionary
    dictionary = Dictionary(lemma)
    dictionary.save('text.dict')
    # doc2bow produces plain term counts; TF-IDF weighting is a separate model built on top of this bow
    bow = [dictionary.doc2bow(l) for l in lemma]
    return (bow, dictionary)
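
The TF-IDF weighting mentioned in the comment is that separate model; a sketch, assuming lemma is a list of token lists:

from gensim.models import TfidfModel

bow, dictionary = bag_of_words(lemma)        # lemma: list of token lists (assumed)
tfidf = TfidfModel(bow, id2word=dictionary)
weighted_corpus = [tfidf[doc] for doc in bow]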
Example #17
class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):

    def __init__(self, dirpath=".", tofull=False):
        """
        Pass in a directory that holds the lexicon in corpus.dict and the
        TFIDF model in tfidf.model (for now).

        Set tofull = True if the next thing is a Scikit-Learn estimator
        otherwise keep False if the next thing is a Gensim model.
        """
        self._lexicon_path = os.path.join(dirpath, "corpus.dict")
        self._tfidf_path = os.path.join(dirpath, "tfidf.model")

        self.lexicon = None
        self.tfidf = None
        self.tofull = tofull

        self.load()

    def load(self):

        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)

        if os.path.exists(self._tfidf_path):
            self.tfidf = TfidfModel.load(self._tfidf_path)

    def save(self):
        self.lexicon.save(self._lexicon_path)
        self.tfidf.save(self._tfidf_path)

    def fit(self, documents, labels=None):
        self.lexicon = Dictionary(documents)
        self.tfidf = TfidfModel([self.lexicon.doc2bow(doc) for doc in documents], id2word=self.lexicon)
        self.save()
        return self

    def transform(self, documents):
        def generator():
            for document in documents:
                vec = self.tfidf[self.lexicon.doc2bow(document)]
                if self.tofull:
                    yield sparse2full(vec, len(self.lexicon))  # sparse2full needs the vector length
                else:
                    yield vec
        return list(generator())
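
A usage sketch for the vectorizer above; the tokenised documents are hypothetical and the working directory must be writable because fit() calls save():

docs = [["hello", "world"], ["hello", "gensim"]]
vectorizer = GensimTfidfVectorizer(dirpath=".", tofull=True)
vectorizer.fit(docs)
vectors = vectorizer.transform(docs)  # dense numpy vectors because tofull=True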
Example #18
 def test_run(self, data):
     dictionary = Dictionary(data)
     dictionary.filter_extremes(no_above=0.5)
     bags_of_words = [ dictionary.doc2bow(t) for t in data]
     #This can take a while to run:
     lda = LdaModel(bags_of_words, id2word = dictionary, num_topics=30, passes=2)
     results = self.assemble_topics(lda)
     return results
Example #19
 def run(self, data):
     wordlists = [corpus.tokenized_contents for corpus in data]
     dictionary = Dictionary(wordlists)
     # dictionary.filter_extremes(no_above=0.5)
     bags_of_words = [ dictionary.doc2bow(t) for t in wordlists]
     #This can take a while to run:
     lda = LdaModel(bags_of_words, id2word = dictionary, num_topics=30, passes=10)
     return self.assemble_topics(lda)
Example #20
def to_corpus(documents):
    """
    Make into a corpus
    @documents:list[list[tuple[str,int]]] of bows
    @returns Dictionary, Corpus
    """
    d = Dictionary()
    corpus = [d.doc2bow(doc, allow_update=True) for doc in documents]
    return d, corpus
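
A tiny usage sketch, with hypothetical token lists:

d, corpus = to_corpus([["bank", "river", "bank"], ["money", "bank"]])
print(corpus)  # per-document lists of (token_id, count) pairs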
Example #21
    def test_low_distinct_words_corpus_summarization_is_empty_list(self):
        text = self._get_text_from_test_data("testlowdistinctwords.txt")

        # Generate the corpus.
        sentences = text.split("\n")
        tokens = [sentence.split() for sentence in sentences]
        dictionary = Dictionary(tokens)
        corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

        self.assertEqual(summarize_corpus(corpus), [])
Example #22
class JsonCorpus(object):
    def __iter__(self):
        data = json.load(open('data/nasa.json'))

        desc = [TextBlob(dataset['description'].lower()).tokens for dataset in data['dataset']]

        self.dictionary = Dictionary(desc)

        for d in desc:
            yield self.dictionary.doc2bow(d)
Example #23
 def setUp(self):
     texts = [[u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently', u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior'],[u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily', u'programming', u'interaction', u'designers', u'engineers', u'leadership', u'teams', u'teams', u'crews', u'responsibilities', u'engineering', u'quality', u'functional', u'functional', u'teams', u'organizing', u'prioritizing', u'technical', u'decisions', u'engineering', u'participates', u'participates', u'reviews', u'participates', u'hiring', u'conducting', u'interviews'],[u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams', u'crews', u'facilitate', u'engineering', u'departments', u'deadlines', u'milestones', u'typically', u'spends', u'designing', u'developing', u'updating', u'bugs', u'mentoring', u'engineers', u'define', u'schedules', u'milestones', u'participating'],[ u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines', u'knowledge', u'skills', u'knowledge', u'knowledge', u'xcode', u'scripting', u'debugging', u'skills', u'skills', u'knowledge', u'disciplines', u'animation', u'networking', u'expertise', u'competencies', u'oral', u'skills', u'management', u'skills', u'proven', u'effectively', u'teams', u'deadline', u'environment', u'bachelor', u'minimum', u'shipped', u'leadership', u'teams', u'location', u'resumes', u'jobs', u'candidates', u'openings', u'jobs'], 
     [u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage', u'utility', u'retail', u'customers', u'engineering', u'consultant', u'maryland', u'summary', u'technical', u'technology', u'departments', u'expertise', u'maximizing', u'output', u'reduces', u'operating', u'participates', u'areas', u'engineering', u'conducts', u'testing', u'solve', u'supports', u'environmental', u'understands', u'objectives', u'operates', u'responsibilities', u'handles', u'complex', u'engineering', u'aspects', u'monitors', u'quality', u'proficiency', u'optimization', u'recommendations', u'supports', u'personnel', u'troubleshooting', u'commissioning', u'startup', u'shutdown', u'supports', u'procedure', u'operating', u'units', u'develops', u'simulations', u'troubleshooting', u'tests', u'enhancing', u'solving', u'develops', u'estimates', u'schedules', u'scopes', u'understands', u'technical', u'management', u'utilize', u'routine', u'conducts', u'hazards', u'utilizing', u'hazard', u'operability', u'methodologies', u'participates', u'startup', u'reviews', u'pssr', u'participate', u'teams', u'participate', u'regulatory', u'audits', u'define', u'scopes', u'budgets', u'schedules', u'technical', u'management', u'environmental', u'awareness', u'interfacing', u'personnel', u'interacts', u'regulatory', u'departments', u'input', u'objectives', u'identifying', u'introducing', u'concepts', u'solutions', u'peers', u'customers', u'coworkers', u'knowledge', u'skills', u'engineering', u'quality', u'engineering'], [u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies', u'knowledge', u'engineering', u'techniques', u'disciplines', u'leadership', u'skills', u'proven', u'engineers', u'oral', u'skills', u'technical', u'skills', u'analytically', u'solve', u'complex', u'interpret', u'proficiency', u'simulation', u'knowledge', u'applications', u'manipulate', u'applications', u'engineering'],[u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment', u'proven', u'skills', u'effectively', u'multiple', u'tasks', u'planning', u'organizational', u'management', u'skills', u'rigzone', u'jobs', u'developer', u'exceptional', u'strategies', u'junction', u'exceptional', u'strategies', u'solutions', u'solutions', u'biggest', u'insurers', u'operates', u'investment'], [u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually', u'electrical', u'developments', u'institutional', u'utilities', u'technical', u'experts', u'relationships', u'credibility', u'contractors', u'utility', u'customers', u'customer', u'relationships', u'consistently', u'innovations', u'profile', u'construct', u'envision', u'dynamic', u'complex', u'electrical', u'management', u'grad', u'internship', u'electrical', u'engineering', u'infrastructures', u'engineers', u'documented', u'management', u'engineering', u'quality', u'engineering', u'electrical', u'engineers', u'complex', u'distribution', u'grounding', u'estimation', u'testing', u'procedures', u'voltage', u'engineering'],[u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification', u'electrical', u'voltage', u'cabling', u'electrical', u'engineering', u'candidates', u'electrical', u'internships', u'oral', u'skills', u'organizational', u'prioritization', u'skills', u'skills', u'excel', u'cadd', u'calculation', u'autocad', u'mathcad', u'skills', u'skills', u'customer', u'relationships', u'solving', u'ethic', u'motivation', u'tasks', u'budget', u'affirmative', u'diversity', u'workforce', u'gender', u'orientation', u'disability', 
u'disabled', u'veteran', u'vietnam', u'veteran', u'qualifying', u'veteran', u'diverse', u'candidates', u'respond', u'developing', u'workplace', u'reflects', u'diversity', u'communities', u'reviews', u'electrical', u'contracting', u'southwest', u'electrical', u'contractors'], [u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating', u'idexx', u'integrated', u'hardware', u'entails', u'planning', u'debug', u'validation', u'engineers', u'validation', u'methodologies', u'healthcare', u'platforms', u'brightest', u'solve', u'challenges', u'innovation', u'technology', u'idexx', u'intern', u'idexx', u'interns', u'supplement', u'interns', u'teams', u'roles', u'competitive', u'interns', u'idexx', u'interns', u'participate', u'internships', u'mentors', u'seminars', u'topics', u'leadership', u'workshops', u'relevant', u'planning', u'topics', u'intern', u'presentations', u'mixers', u'applicants', u'ineligible', u'laboratory', u'compliant', u'idexx', u'laboratories', u'healthcare', u'innovation', u'practicing', u'veterinarians', u'diagnostic', u'technology', u'idexx', u'enhance', u'veterinarians', u'efficiency', u'economically', u'idexx', u'worldwide', u'diagnostic', u'tests', u'tests', u'quality', u'headquartered', u'idexx', u'laboratories', u'employs', u'customers', u'qualifications', u'applicants', u'idexx', u'interns', u'potential', u'demonstrated', u'portfolio', u'recommendation', u'resumes', u'marketing', u'location', u'americas', u'verification', u'validation', u'schedule', u'overtime', u'idexx', u'laboratories', u'reviews', u'idexx', u'laboratories', u'nasdaq', u'healthcare', u'innovation', u'practicing', u'veterinarians'], [u'location', u'duration', u'temp', u'verification', u'validation', u'tester', u'verification', u'validation', u'middleware', u'specifically', u'testing', u'applications', u'clinical', u'laboratory', u'regulated', u'environment', u'responsibilities', u'complex', u'hardware', u'testing', u'clinical', u'analyzers', u'laboratory', u'graphical', u'interfaces', u'complex', u'sample', u'sequencing', u'protocols', u'developers', u'correction', u'tracking', u'tool', u'timely', u'troubleshoot', u'testing', u'functional', u'manual', u'automated', u'participate', u'ongoing'],[u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation', u'corrections', u'monitor', u'implementation', u'recurrence', u'operating', u'statistical', u'quality', u'testing', u'global', u'multi', u'teams', u'travel', u'skills', u'concepts', u'waterfall', u'agile', u'methodologies', u'debugging', u'skills', u'complex', u'automated', u'instrumentation', u'environment', u'hardware', u'mechanical', u'components', u'tracking', u'lifecycle', u'management', u'quality', u'organize', u'define', u'priorities', u'organize', u'supervision', u'aggressive', u'deadlines', u'ambiguity', u'analyze', u'complex', u'situations', u'concepts', u'technologies', u'verbal', u'skills', u'effectively', u'technical', u'clinical', u'diverse', u'strategy', u'clinical', u'chemistry', u'analyzer', u'laboratory', u'middleware', u'basic', u'automated', u'testing', u'biomedical', u'engineering', u'technologists', u'laboratory', u'technology', u'availability', u'click', u'attach'], [u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology', u'solutions', u'subsidiary', u'asrc', u'engineering', u'technology', u'contracts'], [u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel', u'allows', u'solutions', u'complex', u'aeronautics', u'aviation', 
u'management', u'aviation', u'engineering', u'hughes', u'technical', u'technical', u'aviation', u'evaluation', u'engineering', u'management', u'technical', u'terminal', u'surveillance', u'programs', u'currently', u'scientist', u'travel', u'responsibilities', u'develops', u'technology', u'modifies', u'technical', u'complex', u'reviews', u'draft', u'conformity', u'completeness', u'testing', u'interface', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'skills', u'travel', u'programming', u'linux', u'environment', u'cisco', u'knowledge', u'terminal', u'environment', u'clearance', u'clearance', u'input', u'output', u'digital', u'automatic', u'terminal', u'management', u'controller', u'termination', u'testing', u'evaluating', u'policies', u'procedure', u'interface', u'installation', u'verification', u'certification', u'core', u'avionic', u'programs', u'knowledge', u'procedural', u'testing', u'interfacing', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'missions', u'asrc', u'subsidiaries', u'affirmative', u'employers', u'applicants', u'disability', u'veteran', u'technology', u'location', u'airport', u'bachelor', u'schedule', u'travel', u'contributor', u'management', u'asrc', u'reviews'], [u'technical', u'solarcity', u'niche', u'vegas', u'overview', u'resolving', u'customer', u'clients', u'expanding', u'engineers', u'developers', u'responsibilities', u'knowledge', u'planning', u'adapt', u'dynamic', u'environment', u'inventive', u'creative', u'solarcity', u'lifecycle', u'responsibilities', u'technical', u'analyzing', u'diagnosing', u'troubleshooting', u'customers', u'ticketing', u'console', u'escalate', u'knowledge', u'engineering', u'timely', u'basic', u'phone', u'functionality', u'customer', u'tracking', u'knowledgebase', u'rotation', u'configure', u'deployment', u'sccm', u'technical', u'deployment', u'deploy', u'hardware', u'solarcity', u'bachelor', u'knowledge', u'dell', u'laptops', u'analytical', u'troubleshooting', u'solving', u'skills', u'knowledge', u'databases', u'preferably', u'server', u'preferably', u'monitoring', u'suites', u'documentation', u'procedures', u'knowledge', u'entries', u'verbal', u'skills', u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance', u'vacation', u'savings', u'referral', u'eligibility', u'equity', u'performers', u'solarcity', u'affirmative', u'diversity', u'workplace', u'applicants', u'orientation', u'disability', u'veteran', u'careerrookie'], [u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition', u'networking', u'capabilities', u'classified', u'customer', u'motivated', u'develops', u'tests', u'innovative', u'solutions', u'minimal', u'supervision', u'paced', u'environment', u'enjoys', u'assignments', u'interact', u'multi', u'disciplined', u'challenging', u'focused', u'embedded', u'developments', u'spanning', u'engineering', u'lifecycle', u'specification', u'enhancement', u'applications', u'embedded', u'freescale', u'applications', u'android', u'platforms', u'interface', u'customers', u'developers', u'refine', u'specifications', u'architectures'],[u'java', u'programming', u'scripts', u'python', u'debug', u'debugging', u'emulators', u'regression', u'revisions', u'specialized', u'setups', u'capabilities', u'subversion', u'technical', u'documentation', u'multiple', u'engineering', u'techexpousa', u'reviews'], [u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology', u'resource', 
u'framework', u'schema', u'technologies', u'hadoop', u'warehouse', u'oracle', u'relational', u'artifacts', u'models', u'dictionaries', u'models', u'interface', u'specifications', u'documentation', u'harmonization', u'mappings', u'aligned', u'coordinate', u'technical', u'peer', u'reviews', u'stakeholder', u'communities', u'impact', u'domains', u'relationships', u'interdependencies', u'models', u'define', u'analyze', u'legacy', u'models', u'corporate', u'databases', u'architectural', u'alignment', u'customer', u'expertise', u'harmonization', u'modeling', u'modeling', u'consulting', u'stakeholders', u'quality', u'models', u'storage', u'agile', u'specifically', u'focus', u'modeling', u'qualifications', u'bachelors', u'accredited', u'modeler', u'encompass', u'evaluation', u'skills', u'knowledge', u'modeling', u'techniques', u'resource', u'framework', u'schema', u'technologies', u'unified', u'modeling', u'technologies', u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills', u'interpersonal', u'skills', u'customers', u'clearance', u'applicants', u'eligibility', u'classified', u'clearance', u'polygraph', u'techexpousa', u'solutions', u'partnership', u'solutions', u'integration'], [u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex', u'diverse', u'intensive', u'analytics', u'algorithm', u'manipulation', u'management', u'documented', u'individually', u'reviews', u'tests', u'components', u'adherence', u'resolves', u'utilizes', u'methodologies', u'environment', u'input', u'components', u'hardware', u'offs', u'reuse', u'cots', u'gots', u'synthesis', u'components', u'tasks', u'individually', u'analyzes', u'modifies', u'debugs', u'corrects', u'integrates', u'operating', u'environments', u'develops', u'queries', u'databases', u'repositories', u'recommendations', u'improving', u'documentation', u'develops', u'implements', u'algorithms', u'functional', u'assists', u'developing', u'executing', u'procedures', u'components', u'reviews', u'documentation', u'solutions', u'analyzing', u'conferring', u'users', u'engineers', u'analyzing', u'investigating', u'areas', u'adapt', u'hardware', u'mathematical', u'models', u'predict', u'outcome', u'implement', u'complex', u'database', u'repository', u'interfaces', u'queries', u'bachelors', u'accredited', u'substituted', u'bachelors', u'firewalls', u'ipsec', u'vpns', u'technology', u'administering', u'servers', u'apache', u'jboss', u'tomcat', u'developing', u'interfaces', u'firefox', u'internet', u'explorer', u'operating', u'mainframe', u'linux', u'solaris', u'virtual', u'scripting', u'programming', u'oriented', u'programming', u'ajax', u'script', u'procedures', u'cobol', u'cognos', u'fusion', u'focus', u'html', u'java', u'java', u'script', u'jquery', u'perl', u'visual', u'basic', u'powershell', u'cots', u'cots', u'oracle', u'apex', u'integration', u'competitive', u'package', u'bonus', u'corporate', u'equity', u'tuition', u'reimbursement', u'referral', u'bonus', u'holidays', u'insurance', u'flexible', u'disability', u'insurance'], [u'technologies', u'disability', u'accommodation', u'recruiter', u'techexpousa'],
     ['bank','river','shore','water'],['river','water','flow','fast','tree'],['bank','water','fall','flow'],['bank','bank','water','rain','river'],
     ['river','water','mud','tree'],['money','transaction','bank','finance'],
     ['bank','borrow','money'], ['bank','finance'], ['finance','money','sell','bank'],['borrow','sell'],['bank','loan','sell']]
     # initializing using own LDA sufficient statistics so that we get same results each time.
     sstats = numpy.loadtxt(datapath('sstats_test.txt'))
     dictionary = Dictionary(texts)
     corpus = [dictionary.doc2bow(text) for text in texts]
     self.ldaseq = ldaseqmodel.LdaSeqModel(corpus = corpus , id2word= dictionary, num_topics=2, time_slice=[10, 10, 11], initialize='own', sstats=sstats)
    def testMallet2ModelOn20NewsGroups(self):
        corpus = [simple_preprocess(doc["data"]) for doc in api.load("20-newsgroups")]
        dictionary = Dictionary(corpus)

        corpus = [dictionary.doc2bow(text) for text in corpus]

        lda_mallet_model = ldamallet.LdaMallet(
            self.mallet_path, corpus=corpus,
            num_topics=20, id2word=dictionary, iterations=500)

        lda_gensim_model = ldamallet.malletmodel2ldamodel(lda_mallet_model, iterations=1000)
        self.assertEqual(lda_mallet_model.show_topics(20, 50), lda_gensim_model.show_topics(20, 50))
Example #25
    def test_corpus_summarization_is_not_empty_list_on_short_input_text(self):
        text = self._get_text_from_test_data("testsummarization_unrelated.txt")

        # Keeps the first 8 sentences to make the text shorter.
        sentences = text.split('\n')[:8]

        # Generate the corpus.
        tokens = [sentence.split() for sentence in sentences]
        dictionary = Dictionary(tokens)
        corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

        self.assertNotEqual(summarize_corpus(corpus), [])
 def test_patch_with_special_tokens(self):
     special_tokens = {'pad': 0, 'space': 1, 'quake': 3}
     corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
     d = Dictionary(corpus)
     self.assertEqual(len(d.token2id), 5)
     d.patch_with_special_tokens(special_tokens)
     self.assertEqual(d.token2id['pad'], 0)
     self.assertEqual(d.token2id['space'], 1)
     self.assertEqual(d.token2id['quake'], 3)
     self.assertEqual(len(d.token2id), 8)
     self.assertNotIn((0, 1), d.doc2bow(corpus[0]))
     self.assertIn((0, 1), d.doc2bow(['pad'] + corpus[0]))
     corpus_with_special_tokens = [["máma", "mele", "maso"], ["ema", "má", "máma", "space"]]
     d = Dictionary(corpus_with_special_tokens)
     self.assertEqual(len(d.token2id), 6)
     self.assertNotEqual(d.token2id['space'], 1)
     d.patch_with_special_tokens(special_tokens)
     self.assertEqual(len(d.token2id), 8)
     self.assertEqual(max(d.token2id.values()), 7)
     self.assertEqual(d.token2id['space'], 1)
     self.assertNotIn((1, 1), d.doc2bow(corpus_with_special_tokens[0]))
     self.assertIn((1, 1), d.doc2bow(corpus_with_special_tokens[1]))
Example #27
    def test_low_distinct_words_corpus_summarization_is_none(self):
        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

        with utils.smart_open(os.path.join(pre_path, "testlowdistinctwords.txt"), mode="r") as f:
            text = f.read()

        # Generate the corpus.
        sentences = text.split("\n")
        tokens = [sentence.split() for sentence in sentences]
        dictionary = Dictionary(tokens)
        corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

        self.assertTrue(summarize_corpus(corpus) is None)
Example #28
    def test_from_corpus(self):
        """build `Dictionary` from an existing corpus"""

        documents = [
            "Human machine interface for lab abc computer applications",
            "A survey of user opinion of computer system response time",
            "The EPS user interface management system",
            "System and human system engineering testing of EPS",
            "Relation of user perceived response time to error measurement",
            "The generation of random binary unordered trees",
            "The intersection graph of paths in trees",
            "Graph minors IV Widths of trees and well quasi ordering",
            "Graph minors A survey"
        ]
        stoplist = set('for a of the and to in'.split())
        texts = [
            [word for word in document.lower().split() if word not in stoplist]
            for document in documents]

        # remove words that appear only once
        all_tokens = sum(texts, [])
        tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
        texts = [[word for word in text if word not in tokens_once] for text in texts]

        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]

        # Create dictionary from corpus without a token map
        dictionary_from_corpus = Dictionary.from_corpus(corpus)

        dict_token2id_vals = sorted(dictionary.token2id.values())
        dict_from_corpus_vals = sorted(dictionary_from_corpus.token2id.values())
        self.assertEqual(dict_token2id_vals, dict_from_corpus_vals)
        self.assertEqual(dictionary.dfs, dictionary_from_corpus.dfs)
        self.assertEqual(dictionary.num_docs, dictionary_from_corpus.num_docs)
        self.assertEqual(dictionary.num_pos, dictionary_from_corpus.num_pos)
        self.assertEqual(dictionary.num_nnz, dictionary_from_corpus.num_nnz)

        # Create dictionary from corpus with an id=>token map
        dictionary_from_corpus_2 = Dictionary.from_corpus(corpus, id2word=dictionary)

        self.assertEqual(dictionary.token2id, dictionary_from_corpus_2.token2id)
        self.assertEqual(dictionary.dfs, dictionary_from_corpus_2.dfs)
        self.assertEqual(dictionary.num_docs, dictionary_from_corpus_2.num_docs)
        self.assertEqual(dictionary.num_pos, dictionary_from_corpus_2.num_pos)
        self.assertEqual(dictionary.num_nnz, dictionary_from_corpus_2.num_nnz)

        # Ensure Sparse2Corpus is compatible with from_corpus
        bow = gensim.matutils.Sparse2Corpus(scipy.sparse.rand(10, 100))
        dictionary = Dictionary.from_corpus(bow)
        self.assertEqual(dictionary.num_docs, 100)
Example #29
	def score_messages_by_text(self, deviation_threshold=2):
		"""
		Method calculates token_score parameter for self.messages.

		Args:
			deviation_threshold (int): number of standard deviations that distinguishes core tokens from average tokens
		"""
		texts = [x['tokens'] for x in self.messages.values()]
		if not sum([bool(x) for x in texts]) or len(set([frozenset(x) for x in texts])) == 1:
			for k in self.messages.keys():
				self.messages[k]['token_score'] = 0
			return
		dictionary = Dictionary(texts)
		corpus = [dictionary.doc2bow(text) for text in texts]
		tfidf = TfidfModel(corpus, id2word=dictionary)
		index = MatrixSimilarity(tfidf[corpus])
		try:
			scores = index[dictionary.doc2bow(self.cores[deviation_threshold])]
		except IndexError:
			error('Index error in token scoring for event {}'.format(self.id))
			scores = [0]*len(self.messages.values())
		for msg, score in zip(self.messages.values(), scores):
			msg['token_score'] = float(score)
Example #30
def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
  print('Building dictionary...')
  dictionary = Dictionary(docs)
  stopwords = nltk_stopwords().union(additional_stopwords)
  stopword_ids = map(dictionary.token2id.get, stopwords)
  dictionary.filter_tokens(stopword_ids)
  dictionary.compactify()
  dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
  dictionary.compactify()

  print('Building corpus...')
  corpus = [dictionary.doc2bow(doc) for doc in docs]

  return dictionary, corpus
Example #31
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return [token for token in stemmed_tokens
            if len(token) > 2]  # skip short tokens


dataset = [text2tokens(txt) for txt in newsgroups['data']]  # convert each document to a list of tokens

from gensim.corpora import Dictionary
dictionary = Dictionary(documents=dataset, prune_at=None)
dictionary.filter_extremes(
    no_below=5, no_above=0.3,
    keep_n=None)  # use the Dictionary to drop irrelevant tokens
dictionary.compactify()

d2b_dataset = [dictionary.doc2bow(doc) for doc in dataset]  # convert each token list to a bag-of-words representation

###############################################################################
#
# Second, fit two LDA models.
# ---------------------------
#

from gensim.models import LdaMulticore
num_topics = 15

lda_fst = LdaMulticore(
    corpus=d2b_dataset,
    num_topics=num_topics,
    id2word=dictionary)

    elif preDictTag != None:
        dct = Dictionary.load('{}{}.dict'.format(corpora_path, preDictTag))

    #### Step 2, apply Tf-IDF representation ####
    bow_corpus = []
    meta_wf = open("{}{}-Meta.csv".format(corpora_path, fileTag), "w")
    meta_wf.write("position_index,id_str,created_time\n")

    # use Timer to print elapsed time
    with Timer():
        for each_collection in collections:
            print("Transforming the corpus for {}".format(each_collection))
            file_path = f"{corpora_path}{each_collection}-raw-corpus.tsv"
            for i, a_tweet in enumerate(TweetRawCorpusStream(file_path)):
                # gensim's Dictionary.doc2bow will ignore words that are not in dictionary by default
                bow_per_doc = dct.doc2bow(a_tweet.tokens_str.split(","))
                if len(bow_per_doc) > 4:
                    timestamp = a_tweet.created_at
                    id_str = a_tweet.id_str
                    meta_wf.write("{},{},{}\n".format(len(bow_corpus), id_str,
                                                      timestamp))
                    bow_corpus.append(bow_per_doc)
    meta_wf.close()

    tfidf_model = TfidfModel(bow_corpus)  # fit model
    tfidf_corpus = tfidf_model[bow_corpus]

    #### Step 3, export model ####
    if preDictTag == None:
        dct.save('{}{}.dict'.format(corpora_path, fileTag))
    MmCorpus.serialize('{}{}-tf-idf.mm'.format(corpora_path, fileTag),
Example #33
                    all_questions_dict[group_id][
                        'questions_and_choice_tok'].append(question_tok)

num_question_groups = len(all_questions_dict)
all_questions = []
for question_group_id in range(num_question_groups):
    all_questions.append([])
    for each_question in all_questions_dict[question_group_id][
            'questions_and_choice_tok']:
        all_questions[question_group_id] += each_question

#print(all_questions)

dct = Dictionary(all_questions)
corpus = [dct.doc2bow(line) for line in all_questions]
##print(corpus)
model = TfidfModel(corpus)

with open(output_tsv, 'w') as fo:
    output_line = 'question\ttop3words\tchoices\ttop3wordsIncuChoice\n'
    fo.write(output_line)
    for question_group_id, each_question_group in enumerate(corpus):
        vector = model[each_question_group]
        #print(all_questions[question_group_id])
        sorted_by_second = sorted(vector, key=lambda tup: tup[1], reverse=True)
        sorted_by_second = [[dct[word], score]
                            for word, score in sorted_by_second]
        #print(sorted_by_second)
        for question_id in range(
                len(all_questions_dict[question_group_id]['questions'])):
Example #34
def compute_keywords(X_train_raw,
                     n_keywords=20,
                     k_topics=50,
                     alpha=0.3,
                     eta=1,
                     niter=200,
                     ismain=False,
                     news_index=[],
                     news_name=[]):
    print('compute_keywords\n')
    stopwordsFilePath = '../stopwords_es_ES_enh.txt'
    my_punctuation = '!"#$%&\'()*+,-./:;<=>?@[]^_`{|}~'

    # X_train_raw = [article.content for article in newspaper]
    processed_docs, stopwords = doc_processing(X_train_raw,
                                               stopwordsFilePath,
                                               doc=True)
    id2word = Dictionary(processed_docs)
    # dictionary.filter_extremes(no_below=1, no_above=0.6, keep_n=None)
    corpus = [id2word.doc2bow(text) for text in processed_docs]

    # Topic modeling using LDA
    ldamodel = LdaModel(corpus=corpus,
                        num_topics=k_topics,
                        alpha=alpha,
                        eta=eta,
                        id2word=id2word,
                        iterations=niter)
    num_words = 50
    ldatopics = ldamodel.top_topics(corpus, num_words=num_words)

    topic_word_matrix = ldamodel.expElogbeta
    word2id = id2word.token2id

    top_words = []
    for topic in ldatopics:
        word_in_topic = []
        for wid in range(num_words):
            word_in_topic.append(topic[0][wid][1])
        top_words.append(word_in_topic)

    #### top_words == ldatopics

    # Create a topic-document matrix:
    d = ldamodel.get_document_topics(corpus)
    topic_doc_matrix = np.zeros((k_topics, len(d)), dtype=np.float16)
    topic_x_doc = np.zeros((1, len(d)), dtype=np.int8)
    list_topic_x_doc = []
    for n, doc in enumerate(d):
        aux = np.reshape(doc, (len(doc), 2))
        topics = aux[:, 0]
        list_topic_x_doc.append(topics)
        if len(topics) > 0:
            topic_x_doc[0, n] = len(topics)
            for i in topics:
                topic_doc_matrix[int(i),
                                 n] = aux[int(np.nonzero(aux == int(i))[0])][1]

    all_keywords = []
    all_weigths = []
    idnewspaper = 0
    for n in range(len(X_train_raw)):
        tokens_in_file = tokenize(X_train_raw[n], deacc=False)
        aux = []
        for word in tokens_in_file:
            aux.append(word)

        aux = Dictionary([aux])
        word2id_in_file = aux.token2id

        topic_filewords_matrix = np.zeros(
            (topic_x_doc[0, n], len(word2id_in_file)), dtype=float)

        for k_top in range(topic_x_doc[0, n]):
            topic = int(list_topic_x_doc[n][k_top])
            for word in word2id_in_file:
                if len(
                        word
                ) > 1 and word not in stopwords and word not in my_punctuation and word in word2id:
                    topic_filewords_matrix[
                        k_top, word2id_in_file[word]] += topic_word_matrix[
                            topic, word2id[word]]
                    # topic_sentence_matrix[k_top, s] += topic_doc_matrix[topic, n] * topic_word_matrix[topic, word2id[word]]

            topic_filewords_matrix[k_top, :] *= topic_doc_matrix[topic, n]

        # Sum over columns to get the accumulated weight of each word across all of the document's topics
        weight_words = np.sum(topic_filewords_matrix, axis=0)
        doc_keywords, keyword_weight = get_keywords(n_keywords, weight_words,
                                                    word2id_in_file)
        all_keywords.append(doc_keywords)
        all_weigths.append(keyword_weight /
                           np.sum(keyword_weight))  # normalize the weights

        if ismain:
            if n == news_index[idnewspaper]:
                save_obj(all_keywords,
                         'data/%s.keyw.pkl' % news_name[idnewspaper])
                save_obj(all_weigths,
                         'data/%s.weight.pkl' % news_name[idnewspaper])
                all_keywords = []
                all_weigths = []
                idnewspaper += 1

    if not ismain:
        return all_keywords, all_weigths, word2id, id2word.id2token
Example #35
# remove words that appear only rarely (disabled below)
#all_tokens = sum(texts, [])
#tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) < (len(dictionary))*0.02)
#texts = [[word for word in text if word not in tokens_once]
#        for text in texts]
#stemmed_all=texts
csv.register_dialect('myDialect',
                     delimiter=',',
                     quoting=csv.QUOTE_ALL)
with open('Lemmatized_documents.csv', 'w', newline='') as file:
    writer = csv.writer(file, dialect='myDialect')
    writer.writerows(stemmed_all)
mydict = corpora.Dictionary()
#dtm
doc_term_matrix = [dictionary.doc2bow(doc) for doc in stemmed_all]
mycorpus = [mydict.doc2bow(doc, allow_update=True) for doc in stemmed_all]
csv.register_dialect('myDialect',
                     delimiter=',',
                     quoting=csv.QUOTE_ALL)
with open('Document_term_matrix.csv', 'w', newline='') as file:
    writer = csv.writer(file, dialect='myDialect')
    writer.writerows(doc_term_matrix)
#print(mycorpus)

word_counts = [[(mydict[id], count) for id, count in line] for line in mycorpus]

texts = stemmed_all
data_lemmatized = stemmed_all
Example #36
# Don't add the new tweets because we want to recreate the same dictionary

# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of all the documents from all the years.
dictionaryAll = Dictionary(justTweetsAll)

# Filter out words that occur in fewer than 20 documents.
# The no_above=0.5 filter was removed because those frequent words are search terms I want to keep.
dictionaryAll.filter_extremes(no_below=20)

#################################################################################################################

# Bag-of-words representation of the documents.
corpusAll = [dictionaryAll.doc2bow(doc) for doc in justTweetsAll]
corpus2018 = [dictionaryAll.doc2bow(doc) for doc in justTweets2018]
corpus2019 = [dictionaryAll.doc2bow(doc) for doc in justTweets2019]
corpus2020 = [dictionaryAll.doc2bow(doc) for doc in justTweets2020]
corpusNEW = [dictionaryAll.doc2bow(doc) for doc in justTweetsNEW]

# Let's see how many tokens and documents we have to train on
print('Number of unique tokens: %d' % len(dictionaryAll))
print('Number of documents: %d' % len(corpusAll))

##################################################################################################################

#monte carlo simulation to identify number of topics

from gensim.models import LdaModel
from gensim.models import CoherenceModel
Example #37
sentences = df_sentences['Article_sentence_nouns_cleaned'].to_list()

# Read in list in list (=1 sentences 1 doc)
# sentences = MakeListInLists(sentences)

# Create a dictionary representation of the documents
dict_nouns = Dictionary(sentences)

# Display
# pp.pprint(dict_nouns.token2id)

# Filter out words that occur in fewer than 4 documents, or in more than 40% of the documents
dict_nouns.filter_extremes(no_below=4, no_above=0.4)

# Bag-of-words representation of the documents
corpus_nouns = [dict_nouns.doc2bow(doc) for doc in sentences]

# Make a index to word dictionary
temp = dict_nouns[0]  # This is only to "load" the dictionary
id2word_nouns = dict_nouns.id2token

# Display
pp.pprint(id2word_nouns)

# Display results of Corpus
# print(corpus_nouns)
# print('Number of unique tokens: {}'.format(len(dict_nouns)))
# print('Number of documents: {}'.format(len(corpus_nouns)))

# TODO: save corpus and dictionary to disk and load them back
# save to path_lda_data
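
A hedged sketch for the TODO above; 'path_lda_data/' stands in for the real output directory and must already exist:

from gensim.corpora import MmCorpus

dict_nouns.save('path_lda_data/dict_nouns.dict')
MmCorpus.serialize('path_lda_data/corpus_nouns.mm', corpus_nouns)

# ...and loading them back later
dict_nouns = Dictionary.load('path_lda_data/dict_nouns.dict')
corpus_nouns = MmCorpus('path_lda_data/corpus_nouns.mm')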
Example #38
minimum_count_for_link = 4
word_window = 5

corpus = []
for x in d:
    thing = d[x]["text"].lower().translate(
        str.maketrans('', '', string.punctuation))
    tfiltered = list(filter(lambda w: w not in s, thing.split()))

    #tfiltered = map(lambda x: lemmatizer.lemmatize(x), tfiltered)
    #tfiltered = list(tfiltered)
    corpus.append(tfiltered)

dct = Dictionary(corpus)
bow_corpus = [dct.doc2bow(line) for line in corpus]
term_doc_mat = corpus2csc(bow_corpus)

from collections import OrderedDict

document = corpus
names = dct.values()

occurrences = OrderedDict(
    (name, OrderedDict((name, 0) for name in names)) for name in names)

# Find the co-occurrences:
for l in document:
    for i in range(len(l)):
        print(l[i - word_window:i] + l[i + word_window:])
        for item in l[i - word_window:i] + l[i + word_window:]:
Example #39
def prepare():
    with open(dataset_path) as file:
        lines = file.readlines()

    step = 65000

    with open(out_put, 'w') as f:
        for i in range(0, len(lines), step):
            count_line = 0
            dataset = []
            label = []
            print(count_line, len(lines))
            for line in lines:

                if i <= count_line < i + step:
                    if type_ != 'test':
                        dataset.append(line.split(',')[1].split(' '))
                        label.append(line.split(',')[2])
                    else:
                        dataset.append(line.split(',')[1].split(' '))
                        label.append(line.split(',')[0])
                count_line += 1

            from gensim.models import TfidfModel
            from gensim.corpora import Dictionary

            dct = Dictionary(dataset)
            corpus = [dct.doc2bow(line)
                      for line in dataset]  # convert corpus to BoW format
            model = TfidfModel(corpus)  # fit model
            # vec = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1)

            id2token = {}
            for (k, v) in dct.token2id.items():
                id2token[v] = k

            # ver_rs=[]
            # vs=['520477','816903','995362','920327','1226448','1025743','990423',
            #                              '133940','1071452','876555','323159','572782','105283','166959',
            #                              '235896','554251','','1267351','1224594','201789','824446','263278']
            # for v in vs:
            #     print(v,dct.token2id[v])
            dataset_after_tfidf = []
            for i in range(len(dataset)):
                vector = model[
                    corpus[i]]  # apply model to the first corpus document
                ver_rs = []
                for v in vector:
                    (id2, score) = v
                    if score > 0.01:
                        ver_rs.append(id2token[id2])
                d_temp = []
                for d in dataset[i]:
                    if type_ != 'test':
                        if d in ver_rs or d == '816903':
                            d_temp.append(d)
                    else:
                        if d in ver_rs:
                            d_temp.append(d)
                # print(len(dataset[i]),len(d_temp))
                dataset_after_tfidf.append(' '.join(d_temp))

            if type_ != 'test':
                # f.writelines(dataset_after_tfidf[i]+'__label__'+label[i]+'\n')
                for i in range(len(dataset_after_tfidf)):
                    count = 0
                    new_line = []
                    for t in dataset_after_tfidf[i].split(' '):
                        if t != '816903' or count < 15:
                            if t == '816903':
                                count += 1
                            if t not in [
                                    '520477', '816903', '995362', '920327',
                                    '1226448', '1025743', '990423', '133940',
                                    '1071452', '876555', '323159', '572782',
                                    '105283', '166959', '235896', '554251', '',
                                    '1267351', '1224594', '201789', '824446',
                                    '263278'
                            ]:
                                new_line.append(t)
                        else:
                            # print(count)
                            count = 0
                            f.writelines(' '.join(new_line) + ' __label__' +
                                         str(label[i]) + '\n')
                            new_line = []
            else:
                for i in range(len(dataset_after_tfidf)):
                    f.writelines(label[i] + ',' + dataset_after_tfidf[i] +
                                 '\n')

            print('dd')
Exemplo n.º 40
0
        lines = f.readlines()
        for sentence in lines:
            words = sentence.decode('utf8').split(" ")
            sentence_segment = []
            for word in words:
                if word.strip() != '':
                    sentence_segment.append(word.strip())
            corpus_list.append(sentence_segment)
    return corpus_list


code_dataset = getCorpus("frcorpus/code%d.dat" % REPO_ID)
text_dataset = getCorpus("frcorpus/text%d.dat" % REPO_ID)
code_dct = Dictionary(code_dataset)
text_dct = Dictionary(text_dataset)
code_corpus = [code_dct.doc2bow(line)
               for line in code_dataset]  # convert corpus to BoW format
text_corpus = [text_dct.doc2bow(line)
               for line in text_dataset]  # convert corpus to BoW format
code_model = TfidfModel(code_corpus)
code_model.save("frcorpus/code%d.model" % REPO_ID)
text_model = TfidfModel(text_corpus)
text_model.save("frcorpus/text%d.model" % REPO_ID)


def read_data(path):
    res = []
    filelist = os.listdir(path)
    for i in range(0, len(filelist)):
        filepath = os.path.join(path, filelist[i])
        logging.info("Loaded the file:" + filepath)
total_examples = model_2.corpus_count

#Updating the model on our data based on the pretrained model
model_2.build_vocab([list(glove_model.vocab.keys())], update=True)
model_2.intersect_word2vec_format("word2vec.txt", binary=False, lockf=1.0)
model_2.train(data['content'], total_examples=total_examples, epochs=model_2.iter)

# Getting sentence vector through weighted average of word vector and tf idf score 
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

# Basically a vocabulary of the words in our dataset
dct = Dictionary(data['content']) 
 
#Creating corpus for every tweet
corpus = [dct.doc2bow(line) for line in data['content']]  # convert corpus to BoW format

#Fitting the tfidf model on the corpus
model_tfidf = TfidfModel(corpus)

#Initializing an empty list for sentence vector 
sent_vec=[]

# sent_vec = sum over the words in the tweet of (tf-idf weight * word vector) / sum of tf-idf weights
for i in range(len(data['content'])):
    weighted = np.zeros(200)
    sum_tfidf = 0
    tfidf_scores = dict(model_tfidf[corpus[i]])  # token_id -> tf-idf weight for this tweet
    for word in set(data['content'][i]):
        # look the word up by its dictionary id so the tf-idf weight matches the word vector
        score = tfidf_scores.get(dct.token2id[word], 0)
        weighted = weighted + score * model_2[word]
        sum_tfidf += score
    sent_vec.append(weighted / sum_tfidf)
module_path = os.path.dirname(__file__)  # needed because sample data files are located in the same folder


def datapath(fname):
    return os.path.join(module_path, 'test_data', fname)


# set up vars used in testing ("Deerwester" from the web tutorial)
texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system', 'response', 'time'],
         ['eps', 'user', 'interface', 'system'],
         ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'],
         ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'],
         ['graph', 'minors', 'survey']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]


def testfile():
    # temporary data will be stored to this file
    return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')


class TestLsiModel(unittest.TestCase, basetmtests.TestBaseTopicModel):
    def setUp(self):
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
        self.model = lsimodel.LsiModel(self.corpus, num_topics=2)

    def testTransform(self):
        """Test lsi[vector] transformation."""
        # create the transformation model
Exemplo n.º 43
0
class Parent(object):
    def __init__(self,
                 hotel_description,
                 user_description,
                 hotel_attributes,
                 user_attributes,
                 incremental=False,
                 num_incremental=4):
        '''hotel_description = path of file
        user_description = path of file
        hotel_attributes = path of file
        user_attributes = path of file
        are list of lists, where each list corresponds to a different element that has been preprocessed and tokenized
        incremental indicates the use of the incremental user method
         num_incremental indicates the number of sentences per user query'''
        self.hotel_description = hotel_description
        self.user_description = user_description
        self.hotel_attributes = hotel_attributes
        self.user_attributes = user_attributes
        self.clean_hotel_description = self.preprocess(
            self.hotel_description, 'testorcone')  #preprocessed
        self.clean_user_description = self.preprocess(
            self.user_description, 'richiestautente')  #preprocessed
        self.clean_hotel_attributes = self.load_hotel_attributes(
        )  #preprocessed
        self.clean_user_attributes = self.load_user_attributes(
            self.user_attributes, 'richiestautente')  #preprocessed
        self.dictionary = Dictionary(self.clean_hotel_description)
        self.vocab_new = dict()
        for k, v in self.dictionary.token2id.items():
            self.vocab_new[k] = v
        self.incremental = incremental
        if incremental == True:
            self.incremental_user = self.write_user_queriesXsentence(
                num_incremental)
            self.clean_user_description = self.preprocess(
                "./datasets/incremental_user/queries" + str(num_incremental) +
                ".txt", 'richiestautente')

        #make a switch to the user query part so that user description becomes that

    'Functions for loading and preprocessing files'

    def loader_docs(self, file, stop_word):
        # Load a text file, dividing it in different strings depending on stopword
        f = open(file, 'r')
        txt = f.readlines()
        f.close()

        l = ' '
        txt = [txt[i].strip() for i in range(len(txt))]
        txt = l.join(txt)
        txt = txt.split(stop_word)
        txt.pop()

        return txt

    def description_to_words(self, raw_review):
        # The input is a single string, and the output is a tokenized list preprocessed

        tokenizer = RegexpTokenizer(r'\w+')
        docs = raw_review.lower()  # Convert to lowercase.
        doc = tokenizer.tokenize(docs)  # tokenize string

        # Remove stop words
        stops = set(stopwords.words("english"))
        doc = [w for w in doc if not w in stops]
        # Remove words of only one character
        doc = [token for token in doc if len(token) > 1]
        # Stem words
        snowball = SnowballStemmer('english')
        doc = [snowball.stem(token) for token in doc]
        return doc

    def get_bigrams(self, clean_docs):  # in place operation

        # Add bigrams to docs (only ones that appear 20 times or more).
        bigram = Phrases(clean_docs, min_count=20)
        for idx in range(len(clean_docs)):
            for token in bigram[clean_docs[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    clean_docs[idx].append(token)

        return clean_docs

    def preprocess(self, file, stop_word, activate_grams='Yes'):
        #preprocess txt file

        docs = self.loader_docs(file, stop_word)
        clean_docs = []

        for i in range(0, len(docs)):
            clean_docs.append(self.description_to_words(docs[i]))

        if activate_grams == 'Yes':
            clean_docs = self.get_bigrams(clean_docs)
        return clean_docs

    def load_hotel_attributes(self):
        #specific preprocess for hotel attributes
        clean_hotel_attributes = self.preprocess(self.hotel_attributes,
                                                 'nuovohotelinarrivo', 'NO')
        clean_hotel_attributes = [[
            token for token in doc if token != 'attributinuovi'
        ] for doc in clean_hotel_attributes]
        return clean_hotel_attributes

    def load_user_attributes(self, file, stop_word):
        # specific preprocess for user attributes
        f = open(file, 'r')
        txt = f.readlines()
        f.close()
        snowball = SnowballStemmer('english')
        stops = set(stopwords.words("english"))
        #removing unnacesary parts of user_attributes
        stops.add('go')
        stops.add('play')
        stops.add('center')
        stops.add('centre')
        stops.add('nearby')
        stops.add('service')
        stops.add('do')
        stops.add('spot')
        tokenizer = RegexpTokenizer(r'\w+')

        txt = [txt[i].strip() for i in range(len(txt))]
        txt = [token for token in txt if token != '']
        txt = [token for token in txt if token != 'new_sentence']
        users = []
        user = []
        for i in range(len(txt)):
            if txt[i] == stop_word:
                users.append(user)
                user = []
            else:
                attribute = txt[i].lower()
                attribute = tokenizer.tokenize(attribute)
                attribute = [w for w in attribute if not w in stops]
                attribute = [token for token in attribute if len(token) > 1]
                attribute = [snowball.stem(token) for token in attribute]
                l = ' '
                attribute = l.join(attribute)
                user.append(attribute)
        return users

    def preprocess_special(self):
        'creates files of preprocessed hotel description and attributes used for evaluation'

        f = open('./datasets/pp/hotel_attributes.txt', 'w')
        for txt in self.clean_hotel_attributes:
            l = ' '
            txt = [txt[i].strip() for i in range(len(txt))]
            txt = l.join(txt)
            f.write(txt + '\n')
        f.close()
        f = open('./datasets/pp/hotel_descriptions.txt', 'w')
        for txt in self.clean_hotel_description:
            l = ' '
            txt = [txt[i].strip() for i in range(len(txt))]
            txt = l.join(txt)
            f.write(txt + '\n')
        f.close()
        f = open('./datasets/pp/user_queries.txt', 'w')
        for txt in self.clean_user_description:
            l = ' '
            txt = [txt[i].strip() for i in range(len(txt))]
            txt = l.join(txt)
            f.write(txt + '\n')
        f.close()

    def incremental_loader_docs(self, file):
        # Load a text file, dividing it in different strings for incremental user
        f = open(file, 'r')
        txt = f.readlines()
        f.close()
        hotels = []
        hotel = []
        for i in txt:
            line = i.strip()
            if line == 'richiestautente':
                hotels.append(hotel)
                hotel = []
            else:
                hotel.append(line)

        return hotels

    'General functions'

    def get_corpus(self):
        'get the Bag Of Words representation for the hotel_description and user_queries'
        corpus = [
            self.dictionary.doc2bow(doc)
            for doc in self.clean_hotel_description
        ]
        BOW_user_queries = [
            self.dictionary.doc2bow(doc) for doc in self.clean_user_description
        ]

        return corpus, BOW_user_queries

    def accuracy_query2hotel(self, hotel, user):
        # This function calculates how many attributes are satisfied in an hotel description
        # inputs: hotel_description index and user_description index
        # output: attributes satisfied in hotel description over total number of attributes

        tokenizer = RegexpTokenizer(r'\w+')
        count = 0
        f = open("./datasets/pp/hotel_descriptions.txt", 'r')
        clean_hotel_description = []
        for line in f:
            print(line.split(' '))
            clean_hotel_description.append(line.split(' '))
        f.close()
        f = open("./datasets/pp/hotel_attributes.txt", 'r')
        clean_hotel_attributes = []
        for line in f:
            clean_hotel_attributes.append(line.split(' '))
        f.close()
        for user_attribute in self.clean_user_attributes[user]:
            part_count = 0
            user_attribute = tokenizer.tokenize(user_attribute)
            for part_attribute in user_attribute:

                if part_attribute in clean_hotel_description[
                        hotel] or part_attribute in clean_hotel_attributes[
                            hotel]:
                    part_count += 1
            if part_count == len(user_attribute):
                count += 1

        return count / len(self.clean_user_attributes[user])

    def make_accuracy_array(self, queryXhotel, num_best, bol=True):
        'for each user query it computes the accuracy of the num_best most similar hotels'
        self.preprocess_special()
        accuracy_array = np.zeros((len(self.clean_user_description), num_best))
        for i in range(np.shape(queryXhotel)[0]):
            ordered = matutils.argsort(queryXhotel[i], topn=num_best, reverse=bol)

            if self.incremental == False:
                for j in range(num_best):
                    accuracy = self.accuracy_query2hotel(ordered[j], i)
                    accuracy_array[i][j] = accuracy
            else:
                for j in range(num_best):
                    accuracy = self.accuracy_query2hotel(
                        ordered[j], self.incremental_user[i])
                    accuracy_array[i][j] = accuracy
        return accuracy_array

    def get_overall_accuracy(self, accuracy_array, num_best=5):
        #It computes the average accuracy for each best hotel
        count = 0
        for i in range(num_best):
            overall_accuracy = np.sum(
                accuracy_array[:, i]) / accuracy_array.shape[0]
            print('recall ' + str(i + 1) + ' ' + str(overall_accuracy))
            count += overall_accuracy
        return count

    def get_accuracy_array(self, hotel_match_X_query, num_best):
        '''for each user query it computes the accuracy of the num_best most similar hotels; works with gensim's
        cosine similarity and word mover's distance modules'''
        self.preprocess_special()
        accuracy_array = np.zeros((len(hotel_match_X_query), num_best))
        for i in range(len(hotel_match_X_query)):
            if self.incremental == False:
                for j in range(num_best):
                    accuracy = self.accuracy_query2hotel(
                        hotel_match_X_query[i][j][0], i)
                    accuracy_array[i][j] = accuracy
            else:
                for j in range(num_best):
                    accuracy = self.accuracy_query2hotel(
                        hotel_match_X_query[i][j][0], self.incremental_user[i])
                    accuracy_array[i][j] = accuracy
        return accuracy_array

    'Similarity functions'

    #A series of functions that interacts with the gensim modules

    def Jaccard_similiarity(self,
                            corpus,
                            corpus_model_user_description,
                            num_best=5):
        'for each user query it computes the Jaccard coefficient with respect to each hotel'
        length = len(corpus_model_user_description)
        queryXhotel = np.zeros((length, len(corpus)))

        for i in range(length):
            for j in range(len(corpus)):
                queryXhotel[i][j] = jaccard(corpus_model_user_description[i],
                                            corpus[j])

        #np.save('jaccard_similiarity', queryXhotel)
        accuracy_array = self.make_accuracy_array(queryXhotel,
                                                  num_best,
                                                  bol=False)

        return accuracy_array

    def cosine_similarity(self,
                          corpus,
                          corpus_model_user_description,
                          num_best=5):
        # corpus can be for example corpus_tfidf
        # corpus_model_user_description can be for example tfidf_user_queries:
        # num_best refers to the number of best hotels that will be considered
        #USES cosine similarity as implemented in Gensim
        index = similarities.MatrixSimilarity(corpus, num_best=num_best)
        hotel_match_X_query = []
        for query in corpus_model_user_description:  #tfidf_user_queries:
            sims = index[query]
            hotel_match_X_query.append(sims)
        accuracy_array = self.get_accuracy_array(hotel_match_X_query, num_best)
        return accuracy_array

    def Hellinger_similiarity(self,
                              corpus,
                              corpus_model_user_description,
                              num_best=5):
        'implements Hellinger similarity using gensim modules'
        length = len(corpus_model_user_description)
        queryXhotel = np.zeros((length, len(corpus)))
        print('It takes some time')
        for i in range(length):
            for j in range(len(corpus)):
                queryXhotel[i][j] = hellinger(corpus_model_user_description[i],
                                              corpus[j])
            print(i)
        #np.save('hellinger_similiarity', queryXhotel)
        accuracy_array = self.make_accuracy_array(queryXhotel,
                                                  num_best,
                                                  bol=False)  #true?
        return accuracy_array

    def WMD_similiarity(self,
                        corpus,
                        w2v_model,
                        corpus_model_user_description,
                        num_best=5):
        'Word mover distance similarity'
        index = similarities.WmdSimilarity(corpus, w2v_model, num_best)
        hotel_match_X_query = []
        for query in corpus_model_user_description:  #tfidf_user_queries::20
            sims = index[query]
            hotel_match_X_query.append(sims)
        accuracy_array = self.get_accuracy_array(hotel_match_X_query, num_best)
        return accuracy_array

    'second experiment'

    # The second experiment analyzes the performance of the recommendation system on user queries
    # containing different numbers of attributes

    def divide_query_per_num_attribute(self):
        'The queries are divided based on the number of attributes they contain'
        length_4 = []
        length_5 = []
        length_6 = []
        length_7 = []
        length_8 = []
        length_9 = []

        for i in range(len(self.clean_user_attributes)):
            length = len(self.clean_user_attributes[i])
            if length == 4:
                length_4.append(i)
            elif length == 5:
                length_5.append(i)
            elif length == 6:
                length_6.append(i)
            elif length == 7:
                length_7.append(i)
            elif length == 8:
                length_8.append(i)
            elif length == 9:
                length_9.append(i)
            else:
                print('Error: no list for a query with ' + str(length) + ' attributes')

        return length_4, length_5, length_6, length_7, length_8, length_9

    def get_accuracy_based_attributes(self,
                                      accuracy_array,
                                      length_array,
                                      num_best=1):
        'the accuracy is calculated for each user query based on the amount of attributes'

        for i in range(len(length_array)):
            numpy_length = np.array(length_array[i])
            sliced_array = accuracy_array[numpy_length]
            accuracy = 0
            for j in range(num_best):
                accuracy += np.sum(sliced_array[:, j]) / sliced_array.shape[0]
            accuracy = accuracy / float(num_best)
            print('accuracy for ' + str(i + 4) + ' attributes ' +
                  str(accuracy))

    'incremental user'

    def write_user_queriesXsentence(self, numb_sentences):
        'it divides the user queries based on the number of sentences'
        # input is the number of sentences the query should have
        # output is a list with the indexes of the user queries that satisfy the constraint, plus a text file containing all such queries
        f = open(
            "./datasets/incremental_user/queries" + str(numb_sentences) +
            ".txt", 'w')
        hotels = self.incremental_loader_docs(self.user_description)
        indexes = []
        for i in range(len(hotels)):
            if len(hotels[i]) > (numb_sentences - 1):
                indexes.append(i)
                for j in range(numb_sentences):
                    sentence = hotels[i][j]
                    f.write(sentence + '\n')
                f.write('richiestautente\n')
        f.close()
        return indexes
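# A hypothetical usage sketch of the class above (the file paths and the tf-idf scoring
# pipeline are assumptions, not part of the original snippet): build the BoW corpora,
# fit a TfidfModel and score the user queries with gensim's cosine similarity.
from gensim.models import TfidfModel

parent = Parent('./datasets/hotel_descriptions.txt',
                './datasets/user_queries.txt',
                './datasets/hotel_attributes.txt',
                './datasets/user_attributes.txt')
corpus, bow_user_queries = parent.get_corpus()
tfidf = TfidfModel(corpus)
accuracy_array = parent.cosine_similarity(tfidf[corpus],
                                          [tfidf[q] for q in bow_user_queries])
print(parent.get_overall_accuracy(accuracy_array))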
Exemplo n.º 44
0
=============================================================================
tri-grammed tokenized article: {}

""".format(docs[1], docs_tokens[1], docs_phrased[1]))

# %% get corpus & dictionary to use for further nlp analysis
"""
I suggest preparing the dictionary and the corpus `once and for all' -- that is,
dumping them to files that will eventually be loaded back for further analysis.
"""
# get dictionary and write it to a file
"""
a dictionary is a mapping between words and their integer ids. See Gensim 
documentation here: https://radimrehurek.com/gensim/corpora/dictionary.html
"""
pr_dictionary = Dictionary(docs_phrased)
pr_dictionary.save("/tmp/pr_dictionary.dict")
# get corpus and write it to a file
"""
as per the Gensim documentation, it is possible to convert a document into the
bag-of-words format (a list of (token_id, token_count) tuples) via doc2bow
"""
pr_corpus = [pr_dictionary.doc2bow(doc) for doc in docs_phrased]
"""
Gensim offers several utilities to write a corpus of text to a file. 
Personally, I prefer the Matrix Market format [1]

[1]: https://math.nist.gov/MatrixMarket/formats.html
"""
corpora.MmCorpus.serialize("/tmp/pr_corpus.mm", pr_corpus)
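"""
a short sketch of the `further analysis' step mentioned above (assumed, not in the
original snippet): load the serialized dictionary and Matrix Market corpus back
"""
reloaded_dictionary = Dictionary.load("/tmp/pr_dictionary.dict")
reloaded_corpus = corpora.MmCorpus("/tmp/pr_corpus.mm")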
Exemplo n.º 45
0
import sys

from gensim.corpora import Dictionary
from gensim.models.ldaseqmodel import LdaSeqModel
from gensim.models import word2vec

data_file = sys.argv[1]
topic_num = int(sys.argv[2])

sentences = list(word2vec.LineSentence(data_file))

dic = Dictionary(sentences)

corpus = [dic.doc2bow(s) for s in sentences]

ldaseq = LdaSeqModel(corpus = corpus, id2word = dic, num_topics = topic_num, time_slice = [len(corpus)])

print('topic,item,prob')

for i, ts in enumerate(ldaseq.print_topics(top_terms = 10)):
  for t in ts:
    print(f'{i},{t[0]},{t[1]}')
Exemplo n.º 46
0
        lines = f.readlines()
        for sentence in lines:
            words = sentence.decode('utf8').split(" ")
            sentence_segment = []
            for word in words:
                if word.strip() != '':
                    sentence_segment.append(word.strip())
            corpus_list.append(sentence_segment)
    return corpus_list


code_dataset = getCorpus("frcorpus/code%d.dat" % REPO_ID)
text_dataset = getCorpus("frcorpus/text%d.dat" % REPO_ID)
code_dct = Dictionary(code_dataset)
text_dct = Dictionary(text_dataset)
code_corpus = [code_dct.doc2bow(line) for line in code_dataset]  # convert corpus to BoW format
text_corpus = [text_dct.doc2bow(line) for line in text_dataset]  # convert corpus to BoW format
code_model = TfidfModel(code_corpus)
code_model.save("frcorpus/code%d.model" % REPO_ID)
text_model = TfidfModel(text_corpus)
text_model.save("frcorpus/text%d.model" % REPO_ID)


def read_data(path):
    res = []
    filelist = os.listdir(path)
    for i in range(0, len(filelist)):
        filepath = os.path.join(path, filelist[i])
        logging.info("Loaded the file:"+filepath)
        if os.path.isfile(filepath):
            file = open(filepath, 'rb')
Exemplo n.º 47
0
def preprocess(sentence):
    return [w for w in sentence.lower().split() if w not in stop_words]

sentence_obama = preprocess(sentence_obama)
sentence_president = preprocess(sentence_president)
sentence_orange = preprocess(sentence_orange)

###############################################################################
# Next, we will build a dictionary and a TF-IDF model, and we will convert the
# sentences to the bag-of-words format.
#
from gensim.corpora import Dictionary
documents = [sentence_obama, sentence_president, sentence_orange]
dictionary = Dictionary(documents)

sentence_obama = dictionary.doc2bow(sentence_obama)
sentence_president = dictionary.doc2bow(sentence_president)
sentence_orange = dictionary.doc2bow(sentence_orange)

from gensim.models import TfidfModel
documents = [sentence_obama, sentence_president, sentence_orange]
tfidf = TfidfModel(documents)

sentence_obama = tfidf[sentence_obama]
sentence_president = tfidf[sentence_president]
sentence_orange = tfidf[sentence_orange]

###############################################################################
# Now, as mentioned earlier, we will be using some downloaded pre-trained
# embeddings. We load these into a Gensim Word2Vec model class and we build
# a term similarity matrix using the embeddings.
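###############################################################################
# The embedding-loading code is cut off in this snippet; a minimal sketch of what it
# could look like follows. The use of gensim.downloader and the
# 'word2vec-google-news-300' model name are assumptions, not taken from the original.
import gensim.downloader as api
from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex

w2v_model = api.load('word2vec-google-news-300')
similarity_index = WordEmbeddingSimilarityIndex(w2v_model)
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)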
Exemplo n.º 48
0
"""
Automated tests for checking transformation algorithms (the models package).
"""

import logging
import unittest

from gensim.corpora import mmcorpus, Dictionary
from gensim.models import hdpmodel
from gensim.test import basetmtests
from gensim.test.utils import datapath, common_texts

import numpy as np

dictionary = Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]


class TestHdpModel(unittest.TestCase, basetmtests.TestBaseTopicModel):
    def setUp(self):
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
        self.class_ = hdpmodel.HdpModel
        self.model = self.class_(corpus,
                                 id2word=dictionary,
                                 random_state=np.random.seed(0))

    def testTopicValues(self):
        """
        Check show topics method
        """
        results = self.model.show_topics()[0]
Exemplo n.º 49
0
def create_bow(data):
    dct = Dictionary(data)
    dct.filter_extremes(no_below=20)
    bow = [dct.doc2bow(doc) for doc in data]
    return dct, bow
Exemplo n.º 50
0
    corpus = sohu_corpus(fname=os.path.join(training_file_path, 'neg_1.txt'),
                         dic=dictionary)

    # save dictionary
    # dictionary.save(os.path.join(training_file_path, '07_11_dictionary.dict'))
    MmCorpus.serialize(os.path.join(training_file_path, '07_11_corpus_12.mm'),
                       corpus)
    # dictionary = Dictionary.load(os.path.join(training_file_path, '07_11_dictionary.dict'))
    corpus_tfidf_mm = MmCorpus(
        os.path.join(training_file_path, '07_11_corpus_12.mm'))

    training_src_data = sogou_corpus_file(
        os.path.join(training_file_path, 'neg_1.txt'))
    training_src = []
    for each_file in training_src_data:
        training_src.append(each_file)

    # convert counts to tfidf
    tfidf = TfidfModel(corpus=corpus_tfidf_mm)

    index = MatrixSimilarity(tfidf[corpus_tfidf_mm])

    sims = index[tfidf[dictionary.doc2bow(['阳台', '打死'])]]
    print('doc2bow:')
    print(dictionary.doc2bow(['阳台']))
    print('tfidf:')
    print(tfidf[dictionary.doc2bow(['阳台'])])
    print(u'Similar documents:\n')
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    for num, value in sims[:3]:
        print(str(num) + '\t' + str(value) + '\t' + training_src[num] + '\n')
Exemplo n.º 51
0
import gensim
import json
from gensim.corpora import Dictionary
from gensim.matutils import corpus2csc

#model = gensim.models.Word2Vec.load("./models/word2vec/size-256_min-count-2_epoch-50_examples-total_window-15_sentences/word2vec_size-100_window-5_min-count-1_workers-4.model")


def read_list_corpus(list_corp, tokens_only=False):
    for i, paragraph in enumerate(list_corp):
        if tokens_only:
            yield gensim.utils.simple_preprocess(paragraph[0])
        else:
            yield gensim.models.doc2vec.TaggedDocument(
                gensim.utils.simple_preprocess(paragraph[0]), [i])


model = gensim.models.TfidfModel.load("./models/tfidf/sentences/tfidf")
dataset = json.load(open("./datasets/dataset_paragraphs.json"))
dataset = list(read_list_corpus(dataset, tokens_only=True))
dct = Dictionary(dataset)
bow_corpus = [dct.doc2bow(line) for line in dataset]
term_doc_mat = corpus2csc(bow_corpus)
print(dir(term_doc_mat))
print(term_doc_mat.get_shape())
Exemplo n.º 52
0
class Word2VecWmdRelaxSimilarity(Word2VecSimilarityBase):
    def __init__(
        self,
        cut_off=0.2,
        cleanup_urls=True,
        nltk_tokenizer=False,
        confidence_threshold=0.8,
    ):
        super().__init__(
            cleanup_urls=cleanup_urls,
            nltk_tokenizer=nltk_tokenizer,
            confidence_threshold=confidence_threshold,
        )
        self.dictionary = Dictionary(self.corpus)
        self.tfidf = TfidfModel(dictionary=self.dictionary)

    def search_similar_bugs(self, query):
        query_id = query["id"]  # keep the bug id before `query` is reassigned below
        query = self.text_preprocess(self.get_text(query))
        words = [
            word for word in set(chain(query, *self.corpus)) if word in self.w2vmodel.wv
        ]
        indices, words = zip(
            *sorted(
                (
                    (index, word)
                    for (index, _), word in zip(self.dictionary.doc2bow(words), words)
                )
            )
        )
        query = dict(self.tfidf[self.dictionary.doc2bow(query)])
        query = [
            (new_index, query[dict_index])
            for new_index, dict_index in enumerate(indices)
            if dict_index in query
        ]
        documents = [
            dict(self.tfidf[self.dictionary.doc2bow(document)])
            for document in self.corpus
        ]
        documents = [
            [
                (new_index, document[dict_index])
                for new_index, dict_index in enumerate(indices)
                if dict_index in document
            ]
            for document in documents
        ]
        embeddings = np.array(
            [self.w2vmodel.wv[word] for word in words], dtype=np.float32
        )
        nbow = dict(
            (
                (index, list(chain([None], zip(*document))))
                for index, document in enumerate(documents)
                if document != []
            )
        )
        nbow["query"] = tuple([None] + list(zip(*query)))
        distances = WMD(embeddings, nbow, vocabulary_min=1).nearest_neighbors("query")

        return [
            self.bug_ids[distance[0]]
            for distance in distances
            if self.bug_ids[distance[0]] != query_id
        ]

    def get_distance(self, query1, query2):
        query1 = self.text_preprocess(self.get_text(query1))
        query2 = self.text_preprocess(self.get_text(query2))

        words = [
            word
            for word in set(chain(query1, query2, *self.corpus))
            if word in self.w2vmodel.wv
        ]
        indices, words = zip(
            *sorted(
                (
                    (index, word)
                    for (index, _), word in zip(self.dictionary.doc2bow(words), words)
                )
            )
        )
        query1 = dict(self.tfidf[self.dictionary.doc2bow(query1)])
        query2 = dict(self.tfidf[self.dictionary.doc2bow(query2)])

        query1 = [
            (new_index, query1[dict_index])
            for new_index, dict_index in enumerate(indices)
            if dict_index in query1
        ]
        query2 = [
            (new_index, query2[dict_index])
            for new_index, dict_index in enumerate(indices)
            if dict_index in query2
        ]
        embeddings = np.array(
            [self.w2vmodel.wv[word] for word in words], dtype=np.float32
        )
        nbow = {}
        nbow["query1"] = tuple([None] + list(zip(*query1)))
        nbow["query2"] = tuple([None] + list(zip(*query2)))
        distances = WMD(embeddings, nbow, vocabulary_min=1).nearest_neighbors("query1")

        return distances[0][1]
Exemplo n.º 53
0
print(time.time() - start)
# Create the term similarity matrix.
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary,
                                               tfidf)
print(time.time() - start)
output_file = open('semantic_comparison.json', 'w+')
output = []

for x in range(0, 300):
    query_string = qs[random.randrange(len(qs))]  # pick a random question
    query = preprocess(query_string)

    # Compute Soft Cosine Measure between the query and the documents.
    # From: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb
    query_tf = tfidf[dictionary.doc2bow(query)]

    index = SoftCosineSimilarity(
        tfidf[[dictionary.doc2bow(document) for document in corpus]],
        similarity_matrix)

    doc_similarity_scores = index[query_tf]

    # Output the sorted similarity scores and documents
    sorted_indexes = np.argsort(doc_similarity_scores)[::-1]
    output_obj = {'question': query_string}
    answer_array = []
    ticks = 0
    for idx in sorted_indexes:
        ticks += 1
        if ticks == 5:
Exemplo n.º 54
0
import time

time_start = time.time()
# Data preprocessing
with open("toutiao_cat_data.txt", "r", encoding="utf-8") as f:
    # with open("test.txt","r",encoding="utf-8") as f:
    data = []
    for line in f.readlines():
        line = line.strip()  # strip surrounding whitespace
        line = ','.join(line.split("_!_")[3:])  # split on the "_!_" delimiter and drop the first three fields, which contain no text content
        data.append(jieba.lcut(line))

# Text vectorization
dictionary = Dictionary(data)  # map each word to an id and count how often it appears across the documents
dictionary.filter_n_most_frequent(200)  # filter out the 200 most frequent words
corpus = [dictionary.doc2bow(text) for text in data]  # convert to bag-of-words vectors

# Train the model
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)  # 10 topics specified

# Get the topic-word distributions
topic_list = lda.print_topics(20)
# print(topic_list)
for i in topic_list:
    print(i)


def pre(data):
    'Get the topic distribution of a single document'
    print(data)
    doc_bow = dictionary.doc2bow(data)  # convert the document to BoW
Exemplo n.º 55
0
class DocDataset(Dataset):
    def __init__(self,taskname,txtPath=None,lang="zh",tokenizer=None,stopwords=None,no_below=5,no_above=0.0134,hasLable=False,rebuild=False,use_tfidf=False):
        cwd = os.getcwd()
        txtPath = os.path.join(cwd,'data',f'{taskname}_lines.txt') if txtPath==None else txtPath
        tmpDir = os.path.join(cwd,'data',taskname)
        self.txtLines = [line.strip('\n') for line in open(txtPath,'r',encoding='utf-8')]
        self.vob = [ [line.strip('\n')] for line in open("./data/topic_model_vocab.txt",'r',encoding='utf-8')]
        self.dictionary = None
        self.bows,self.docs = None,None
        self.use_tfidf = use_tfidf
        self.tfidf,self.tfidf_model = None,None
        if not os.path.exists(tmpDir):
            os.mkdir(tmpDir)
        if not rebuild and os.path.exists(os.path.join(tmpDir,'corpus.mm')):
            # print("here exit")
            # exit()
            self.bows = gensim.corpora.MmCorpus(os.path.join(tmpDir,'corpus.mm'))
            if self.use_tfidf:
                self.tfidf = gensim.corpora.MmCorpus(os.path.join(tmpDir,'tfidf.mm'))
            self.dictionary = Dictionary.load_from_text(os.path.join(tmpDir,'dict.txt'))
            self.docs = pickle.load(open(os.path.join(tmpDir,'docs.pkl'),'rb'))
            self.dictionary.id2token = {v:k for k,v in self.dictionary.token2id.items()} # because id2token is empty by default, it is a bug.
        else:
            if stopwords==None:
                stopwords = set([l.strip('\n').strip() for l in open(os.path.join(cwd,'data','stopwords.txt'),'r',encoding='utf-8')])
            # self.txtLines is the list of string, without any preprocessing.
            # self.texts is the list of list of tokens.
            print('Tokenizing ...')
            if tokenizer is None:
                tokenizer = globals()[LANG_CLS[lang]](stopwords=stopwords)
            self.docs = tokenizer.tokenize(self.txtLines)
           # print("self.docs", len(self.docs))
            self.docs = [line for line in self.docs if line!=[]]
            #print("self.docs", len(self.docs))
            # build dictionary
            print("self.vob", len(self.vob), self.vob[0])
            self.dictionary = Dictionary(self.vob)
            # #self.dictionary.filter_n_most_frequent(remove_n=20)
            # self.dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=2000)  # use Dictionary to remove un-relevant tokens
            # self.dictionary.compactify()
            self.dictionary.id2token = {v:k for k,v in self.dictionary.token2id.items()} # because id2token is empty by default, it is a bug.
            # convert to BOW representation
            self.bows, _docs = [],[]
            for doc in self.docs:
                _bow = self.dictionary.doc2bow(doc)
                if _bow!=[]:
                    _docs.append(list(doc))
                    self.bows.append(_bow)
           
            self.docs = _docs
            print("bow", len(self.bows), self.bows[0], len(self.bows[0]))
            if self.use_tfidf==True:
                self.tfidf_model = TfidfModel(self.bows)
                self.tfidf = [self.tfidf_model[bow] for bow in self.bows]
            # serialize the corpus, dictionary and tokenized docs to disk
            gensim.corpora.MmCorpus.serialize(os.path.join(tmpDir,'corpus.mm'), self.bows)
            self.dictionary.save_as_text(os.path.join(tmpDir,'dict.txt'))
            pickle.dump(self.docs,open(os.path.join(tmpDir,'docs.pkl'),'wb'))
            if self.use_tfidf:
                gensim.corpora.MmCorpus.serialize(os.path.join(tmpDir,'tfidf.mm'),self.tfidf)
        self.vocabsize = len(self.dictionary)
        self.numDocs = len(self.bows)
        print(f'Processed {len(self.bows)} documents.')
        
    def __getitem__(self,idx):
        bow = torch.zeros(self.vocabsize)
        if self.use_tfidf:
            item = list(zip(*self.tfidf[idx]))
        else:
            item = list(zip(*self.bows[idx])) # bow = [[token_id1,token_id2,...],[freq1,freq2,...]]
        bow[list(item[0])] = torch.tensor(list(item[1])).float()
        txt = self.docs[idx]
        return txt,bow
    
    def __len__(self):
        return self.numDocs
    
    def collate_fn(self,batch_data):
        texts,bows = list(zip(*batch_data))
        return texts,torch.stack(bows,dim=0)

    def __iter__(self):
        for doc in self.docs:
            yield doc

    def show_dfs_topk(self,topk=20):
        ndoc = len(self.docs)
        dfs_topk = sorted([(self.dictionary.id2token[k],fq) for k,fq in self.dictionary.dfs.items()],key=lambda x: x[1],reverse=True)[:topk]
        for i,(word,freq) in enumerate(dfs_topk):
            print(f'{i+1}:{word} --> {freq}/{ndoc} = {(1.0*freq/ndoc):>.13f}')
        return dfs_topk

    def show_cfs_topk(self,topk=20):
        ntokens = sum([v for k,v in self.dictionary.cfs.items()])
        cfs_topk = sorted([(self.dictionary.id2token[k],fq) for k,fq in self.dictionary.cfs.items()],key=lambda x: x[1],reverse=True)[:topk]
        for i,(word,freq) in enumerate(cfs_topk):
            print(f'{i+1}:{word} --> {freq}/{ntokens} = {(1.0*freq/ntokens):>.13f}')
    
    def topk_dfs(self,topk=20):
        ndoc = len(self.docs)
        dfs_topk = self.show_dfs_topk(topk=topk)
        return 1.0*dfs_topk[-1][-1]/ndoc
Exemplo n.º 56
0
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaModel

pos_com = pd.read_csv('./data/pos_com.csv', header=None, index_col=0)
neg_com = pd.read_csv('./data/neg_com.csv', header=None, index_col=0)

# Positive reviews
pos_com.columns = ['comment']
mid = list(pos_com['comment'].str.split(' '))
dictionary = Dictionary(mid)
bow = [dictionary.doc2bow(com) for com in mid]
# Build the model
pos_model = LdaModel(corpus=bow, id2word=dictionary, num_topics=3)
pos_model.print_topic(0)
pos_model.print_topic(1)
pos_model.print_topic(2)

# Negative reviews
neg_com.columns = ['comment']
mid = list(neg_com['comment'].str.split(' '))
dictionary = Dictionary(mid)
bow = [dictionary.doc2bow(com) for com in mid]
# Build the model
neg_model = LdaModel(corpus=bow, id2word=dictionary, num_topics=3)
neg_model.print_topic(0)
neg_model.print_topic(1)
neg_model.print_topic(2)
Exemplo n.º 57
0
def start(num_topics, kind):
    data = loader.load_data(kind)
    df = pd.DataFrame(data)
    cleaner.clean(df)

    nlps = {
        'it': spacy.load('it_core_news_lg'),
        'en': spacy.load('en_core_web_lg'),
        'fr': spacy.load('fr'),
        'de': spacy.load('de')
    }

    tokenizers = {
        'it': Tokenizer(nlps['it'].vocab),
        'en': Tokenizer(nlps['en'].vocab),
        'fr': Tokenizer(nlps['fr'].vocab),
        'de': Tokenizer(nlps['de'].vocab)
    }

    # Customize stop words by adding to the default list
    stop_words = []
    stop_words += nlps['it'].Defaults.stop_words
    stop_words += nlps['en'].Defaults.stop_words
    stop_words += nlps['fr'].Defaults.stop_words
    stop_words += nlps['de'].Defaults.stop_words
    stop_words += s.ALL_STOPWORDS
    stop_words = set(stop_words)

    # ALL_STOP_WORDS = spacy + gensim + wordcloud
    ALL_STOP_WORDS = stop_words.union(SW).union(stopwords)

    cleaner.remove_stopwords(df, tokenizers, ALL_STOP_WORDS)
    cleaner.lemmas(df, nlps)

    tok.tokenize_text(df)

    # Create a id2word dictionary
    id2word = Dictionary(df['lemma_tokens'])
    print(len(id2word))

    # Filtering Extremes
    id2word.filter_extremes(no_below=2, no_above=.99)
    print(len(id2word))

    # Creating a corpus object
    corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

    # Instantiating a Base LDA model
    base_model = LdaMulticore(corpus=corpus,
                              num_topics=num_topics,
                              id2word=id2word,
                              workers=12,
                              passes=5)

    # Filtering for words
    words = [re.findall(r'"([^"]*)"', t[1]) for t in base_model.print_topics()]

    # Create Topics
    topics = [' '.join(t[0:10]) for t in words]

    # Getting the topics
    for id, t in enumerate(topics):
        print(f"------ Topic {id} ------")
        print(t, end="\n\n")

    # Compute Perplexity
    # a measure of how good the model is. lower the better
    base_perplexity = base_model.log_perplexity(corpus)
    print('\nPerplexity: ', base_perplexity)

    # Compute Coherence Score
    coherence_model = CoherenceModel(model=base_model,
                                     texts=df['lemma_tokens'],
                                     dictionary=id2word,
                                     coherence='c_v')
    coherence_lda_model_base = coherence_model.get_coherence()
    print('\nCoherence Score: ', coherence_lda_model_base)

    lda_display = pyLDAvis.gensim.prepare(base_model, corpus, id2word)
    d = pyLDAvis.display(lda_display)

    today = date.today()
    directory_path = f"/home/marco/Scrivania/tirocinio-unicredit/lda-html/{kind}/{today}/"
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)
    f = open(
        f"/home/marco/Scrivania/tirocinio-unicredit/lda-html/{kind}/{today}/{num_topics}.html",
        'w')
    f.write(d.data)
    f.close()

    vectorizer = CountVectorizer()
    data_vectorized = vectorizer.fit_transform(df['lemmas_back_to_text'])

    # Define Search Param
    search_params = {
        'n_components': [10, 15, 20, 25, 30],
        'learning_decay': [.5, .7, .9]
    }

    # Init the Model
    lda = LatentDirichletAllocation()

    # Init Grid Search Class
    model = GridSearchCV(lda, param_grid=search_params)

    # Do the Grid Search
    model.fit(data_vectorized)

    # Best Model
    best_lda_model = model.best_estimator_

    # Model Parameters
    print("Best Model's Params: ", model.best_params_)

    # Log Likelihood Score
    print("Best Log Likelihood Score: ", model.best_score_)

    # Perplexity
    print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))
Exemplo n.º 58
0
def lda_topic_model(input_filename, keyword, size, *, num_topics,
                    iterations=50, passes=1, chunksize=2000, eval_every=10,
                    verbose=False, gamma_threshold=0.001, filter_no_below=5,
                    filter_no_above=0.5, filter_keep_n=100000,
                    open_browser=True):
    cl.section('LDA Topic Model Training')
    cl.info('Keyword: %s' % keyword)
    cl.info('Data size: %d' % size)
    cl.info('Number of topics: %d' % num_topics)
    cl.info('Iterations: %d' % iterations)
    cl.info('Passes: %d' % passes)
    cl.info('Chunk size: %d' % chunksize)
    cl.info('Eval every: %s' % eval_every)
    cl.info('Verbose: %s' % verbose)
    cl.info('Gamma Threshold: %f' % gamma_threshold)
    cl.info('Filter no below: %d' % filter_no_below)
    cl.info('Filter no above: %f' % filter_no_above)
    cl.info('Filter keep n: %d' % filter_keep_n)

    assert re.fullmatch(r'[-_0-9a-zA-Z+]+', keyword)

    input_filename = data_source_file(input_filename)
    description = '%s-%d-%d-%dx%d-%s' % (keyword, size, num_topics, iterations,
                                         passes, time.strftime('%Y%m%d%H%M%S'))

    if verbose:
        log_filename = log_file('ldalog-%s.log' % description)
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.DEBUG, filename=log_filename)
        cl.info('Writing logs into file: %s' % log_filename)

    with TimeMeasure('load_preprocessed_text'):
        preprocessed_texts = file_read_json(input_filename)
        preprocessed_texts = [item[1] for item in preprocessed_texts]

    with TimeMeasure('gen_dict_corpus'):
        cl.progress('Generating dictionary and corpus...')

        dictionary = Dictionary(preprocessed_texts, prune_at=None)
        dictionary.filter_extremes(no_below=filter_no_below,
                                   no_above=filter_no_above,
                                   keep_n=filter_keep_n)
        dictionary.compactify()

        corpus = [dictionary.doc2bow(text) for text in preprocessed_texts]

        corpusfilename = model_file('ldacorpus-%s.json' % description)
        file_write_json(corpusfilename, corpus)
        cl.success('Corpus saved as: %s' % corpusfilename)

    with TimeMeasure('training'):
        cl.progress('Performing training...')

        with NoConsoleOutput():
            ldamodel = LdaMulticore(corpus, workers=N_WORKERS,
                                    id2word=dictionary, num_topics=num_topics,
                                    iterations=iterations, passes=passes,
                                    chunksize=chunksize, eval_every=eval_every,
                                    gamma_threshold=gamma_threshold,
                                    alpha='symmetric', eta='auto')

        cl.success('Training finished.')

    with TimeMeasure('save_model'):
        modelfilename = 'ldamodel-%s' % description
        ldamodel.save(model_file(modelfilename))
        cl.success('Model saved as: %s' % modelfilename)

    with TimeMeasure('measure_coherence'):
        cl.progress('Measuring topic coherence...')
        measure_coherence(ldamodel, preprocessed_texts, corpus, dictionary)

    with TimeMeasure('vis_save'):
        cl.progress('Preparing visualization...')
        vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
        htmlfilename = 'ldavis-%s.html' % description
        htmlfilename = report_file(htmlfilename)
        pyLDAvis.save_html(vis, htmlfilename)
        cl.success('Visualized result saved in file: %s' % htmlfilename)

    if open_browser:
        open_html_in_browser(htmlfilename)
Exemplo n.º 59
0
def _build_corpus(sentences):
    split_tokens = [sentence.token.split() for sentence in sentences]
    dictionary = Dictionary(split_tokens)
    return [dictionary.doc2bow(token) for token in split_tokens]
Exemplo n.º 60
0
# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
data_lemmatized[:10]
data_words2 = list(sent_to_words(data_lemmatized))
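# `lemmatization` and `sent_to_words` are defined earlier in the original notebook and
# are not shown in this snippet; a minimal sketch of such helpers, assuming spaCy's
# 'en_core_web_sm' pipeline and gensim's simple_preprocess:
import spacy
from gensim.utils import simple_preprocess

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Return one lemmatized string per tokenized document, keeping only the allowed POS tags."""
    return [' '.join(tok.lemma_ for tok in nlp(' '.join(doc)) if tok.pos_ in allowed_postags)
            for doc in texts]

def sent_to_words(sentences):
    """Tokenize each (lemmatized) string into a list of lowercase tokens."""
    for sentence in sentences:
        yield simple_preprocess(str(sentence), deacc=True)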

from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(data_words2)
# Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)


corpus = [dictionary.doc2bow(doc) for doc in data_words2]

vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             max_df=0.6,                        
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform(data_lemmatized)

# Materialize the sparse data
data_dense = data_vectorized.todense()