Example #1
import pandas as pd
from gensim import corpora
from gensim.models import ldaseqmodel


def main():
    data = get_data()
    df = manage_data(data)

    ############ VERY TIME CONSUMING!!! ####################
    ############ Get fuzzy scores - write to csv ###########
    df_combos(df)

    scores = pd.read_csv('similarity scores.csv')
    scores = scores.loc[scores['score'] > 87]
    ids = list(scores.id2.unique())

    # drop the near-duplicates flagged by the fuzzy matching
    df['id'] = df.index + 2
    df = df.loc[~df['id'].isin(ids)]
    
    plot_journal_counts(df)
    plot_yearly_counts(df)

    ########## DTM MODELS #####################
    df = df.sort_values(by='Year', ascending=True)

    # per-period document counts for the sequential model
    times = get_time_slice(df)

    doc_processed = df['abstract3'].map(preprocess)

    
    dictionary = corpora.Dictionary(doc_processed)
    # prepare a document-term matrix
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_processed]

    ldaseq = ldaseqmodel.LdaSeqModel(corpus=doc_term_matrix, id2word=dictionary,
                                     time_slice=times, num_topics=25, chain_variance=0.05)

    #### Relabel the time slices with human-readable period names
    times = ["1990-1992", "1993-1995", "1996-1998", "1999-2001", "2002-2004", "2005-2007", "2008-2010",
             "2011-2013", "2014-2016", "2017-2019"]


    ### Write the topics to a spreadsheet
    # full, twenty, twentyfive
    full = make_topics_time(times, ldaseq, number="twentyfive")
    full['time period'] = full.apply(fix_times, axis=1)
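
Across these examples, time_slice is a list of per-period document counts over a chronologically sorted corpus. The helper get_time_slice used above is not shown in the snippet; a minimal sketch of what it might do, assuming df has a numeric Year column and using the same 3-year bins as the period labels above:

import pandas as pd

def get_time_slice(df, start=1990, end=2019, step=3):
    # hypothetical reconstruction: count documents per 3-year bin,
    # matching the "1990-1992" ... "2017-2019" labels in the example
    edges = list(range(start, end + 2, step))
    counts = pd.cut(df['Year'], bins=edges, right=False).value_counts(sort=False)
    return list(counts)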
Example #2
    def run(self, document_collection, topic_count=2, time_group=(10, 10, 11)):
        """document_collection must already be sorted in time_slice order."""
        dictionary = Dictionary(document_collection)
        corpus = [dictionary.doc2bow(text) for text in document_collection]
        ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus,
                                         id2word=dictionary,
                                         num_topics=topic_count,
                                         time_slice=list(time_group))

        topics = ldaseq.print_topics(1)
        for topic in topics:
            print("TOPIC " + str(topic))

        return ldaseq
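
A hedged usage sketch for the method above; the class name TopicRunner and the toy documents are made up for illustration, and the collection must already be sorted by time:

docs = [['bank', 'river', 'water'], ['money', 'bank', 'loan']] * 16  # 32 toy docs
model = TopicRunner().run(docs[:31], topic_count=2, time_group=[10, 10, 11])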
Example #3
    def sequence_lda_topic_modeling(self, tokenized_sentence_list, time_slice,
                                    num_topics):
        self.time_slice = time_slice
        texts = [[
            word for word in document.split(" ")
            if word not in self.__get_stopwords()
        ] for document in tokenized_sentence_list]
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        self.ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus,
                                              id2word=dictionary,
                                              time_slice=self.time_slice,
                                              num_topics=num_topics,
                                              em_max_iter=10)
        sequence_topic_list = []
        for num in range(num_topics):
            sequence_topic_list.append(
                (num, self.ldaseq.print_topic_times(topic=num)))
        return sequence_topic_list
Example #4
import time
from os.path import join

import pandas as pd
from gensim.corpora import mmcorpus, Dictionary
from gensim.models import ldaseqmodel


def bruteforce_lda_entire_corpus():
    # This does not work: it reports "Bus error" on Slurm,
    # probably because it needs far too much RAM/compute.
    corpus = mmcorpus.MmCorpus(join(corpus_folder, 'in_corpus.mm'))
    slice_df = pd.read_csv(join(d_folder, '..', 'notes', 'month_linects.txt'),
                           sep=' ',
                           names=['mth', 'ct'],
                           index_col=False)
    slices = list(slice_df.ct)
    # pad the last slice so the slice counts sum to the corpus size
    slices[-1] += corpus.num_docs - sum(slices)
    dictionary = Dictionary.load(join(corpus_folder, 'in_corpus_dict.dict'))
    print('loaded corpus and dictionary')
    t1 = time.time()
    ldaseqmodel.LdaSeqModel(corpus=corpus,
                            id2word=dictionary,
                            time_slice=slices,
                            num_topics=500)
    t2 = time.time()
    print(f"# seconds = {int(t2-t1)}")
Example #5
 def setUp(self):
     texts = [
         [u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently', u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior'],
         [u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily', u'programming', u'interaction', u'designers', u'engineers', u'leadership', u'teams', u'teams', u'crews', u'responsibilities', u'engineering', u'quality', u'functional', u'functional', u'teams', u'organizing', u'prioritizing', u'technical', u'decisions', u'engineering', u'participates', u'participates', u'reviews', u'participates', u'hiring', u'conducting', u'interviews'],
         [u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams', u'crews', u'facilitate', u'engineering', u'departments', u'deadlines', u'milestones', u'typically', u'spends', u'designing', u'developing', u'updating', u'bugs', u'mentoring', u'engineers', u'define', u'schedules', u'milestones', u'participating'],
         [u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines', u'knowledge', u'skills', u'knowledge', u'knowledge', u'xcode', u'scripting', u'debugging', u'skills', u'skills', u'knowledge', u'disciplines', u'animation', u'networking', u'expertise', u'competencies', u'oral', u'skills', u'management', u'skills', u'proven', u'effectively', u'teams', u'deadline', u'environment', u'bachelor', u'minimum', u'shipped', u'leadership', u'teams', u'location', u'resumes', u'jobs', u'candidates', u'openings', u'jobs'],
         [u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage', u'utility', u'retail', u'customers', u'engineering', u'consultant', u'maryland', u'summary', u'technical', u'technology', u'departments', u'expertise', u'maximizing', u'output', u'reduces', u'operating', u'participates', u'areas', u'engineering', u'conducts', u'testing', u'solve', u'supports', u'environmental', u'understands', u'objectives', u'operates', u'responsibilities', u'handles', u'complex', u'engineering', u'aspects', u'monitors', u'quality', u'proficiency', u'optimization', u'recommendations', u'supports', u'personnel', u'troubleshooting', u'commissioning', u'startup', u'shutdown', u'supports', u'procedure', u'operating', u'units', u'develops', u'simulations', u'troubleshooting', u'tests', u'enhancing', u'solving', u'develops', u'estimates', u'schedules', u'scopes', u'understands', u'technical', u'management', u'utilize', u'routine', u'conducts', u'hazards', u'utilizing', u'hazard', u'operability', u'methodologies', u'participates', u'startup', u'reviews', u'pssr', u'participate', u'teams', u'participate', u'regulatory', u'audits', u'define', u'scopes', u'budgets', u'schedules', u'technical', u'management', u'environmental', u'awareness', u'interfacing', u'personnel', u'interacts', u'regulatory', u'departments', u'input', u'objectives', u'identifying', u'introducing', u'concepts', u'solutions', u'peers', u'customers', u'coworkers', u'knowledge', u'skills', u'engineering', u'quality', u'engineering'],
         [u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies', u'knowledge', u'engineering', u'techniques', u'disciplines', u'leadership', u'skills', u'proven', u'engineers', u'oral', u'skills', u'technical', u'skills', u'analytically', u'solve', u'complex', u'interpret', u'proficiency', u'simulation', u'knowledge', u'applications', u'manipulate', u'applications', u'engineering'],
         [u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment', u'proven', u'skills', u'effectively', u'multiple', u'tasks', u'planning', u'organizational', u'management', u'skills', u'rigzone', u'jobs', u'developer', u'exceptional', u'strategies', u'junction', u'exceptional', u'strategies', u'solutions', u'solutions', u'biggest', u'insurers', u'operates', u'investment'],
         [u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually', u'electrical', u'developments', u'institutional', u'utilities', u'technical', u'experts', u'relationships', u'credibility', u'contractors', u'utility', u'customers', u'customer', u'relationships', u'consistently', u'innovations', u'profile', u'construct', u'envision', u'dynamic', u'complex', u'electrical', u'management', u'grad', u'internship', u'electrical', u'engineering', u'infrastructures', u'engineers', u'documented', u'management', u'engineering', u'quality', u'engineering', u'electrical', u'engineers', u'complex', u'distribution', u'grounding', u'estimation', u'testing', u'procedures', u'voltage', u'engineering'],
         [u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification', u'electrical', u'voltage', u'cabling', u'electrical', u'engineering', u'candidates', u'electrical', u'internships', u'oral', u'skills', u'organizational', u'prioritization', u'skills', u'skills', u'excel', u'cadd', u'calculation', u'autocad', u'mathcad', u'skills', u'skills', u'customer', u'relationships', u'solving', u'ethic', u'motivation', u'tasks', u'budget', u'affirmative', u'diversity', u'workforce', u'gender', u'orientation', u'disability', u'disabled', u'veteran', u'vietnam', u'veteran', u'qualifying', u'veteran', u'diverse', u'candidates', u'respond', u'developing', u'workplace', u'reflects', u'diversity', u'communities', u'reviews', u'electrical', u'contracting', u'southwest', u'electrical', u'contractors'],
         [u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating', u'idexx', u'integrated', u'hardware', u'entails', u'planning', u'debug', u'validation', u'engineers', u'validation', u'methodologies', u'healthcare', u'platforms', u'brightest', u'solve', u'challenges', u'innovation', u'technology', u'idexx', u'intern', u'idexx', u'interns', u'supplement', u'interns', u'teams', u'roles', u'competitive', u'interns', u'idexx', u'interns', u'participate', u'internships', u'mentors', u'seminars', u'topics', u'leadership', u'workshops', u'relevant', u'planning', u'topics', u'intern', u'presentations', u'mixers', u'applicants', u'ineligible', u'laboratory', u'compliant', u'idexx', u'laboratories', u'healthcare', u'innovation', u'practicing', u'veterinarians', u'diagnostic', u'technology', u'idexx', u'enhance', u'veterinarians', u'efficiency', u'economically', u'idexx', u'worldwide', u'diagnostic', u'tests', u'tests', u'quality', u'headquartered', u'idexx', u'laboratories', u'employs', u'customers', u'qualifications', u'applicants', u'idexx', u'interns', u'potential', u'demonstrated', u'portfolio', u'recommendation', u'resumes', u'marketing', u'location', u'americas', u'verification', u'validation', u'schedule', u'overtime', u'idexx', u'laboratories', u'reviews', u'idexx', u'laboratories', u'nasdaq', u'healthcare', u'innovation', u'practicing', u'veterinarians'],
         [u'location', u'duration', u'temp', u'verification', u'validation', u'tester', u'verification', u'validation', u'middleware', u'specifically', u'testing', u'applications', u'clinical', u'laboratory', u'regulated', u'environment', u'responsibilities', u'complex', u'hardware', u'testing', u'clinical', u'analyzers', u'laboratory', u'graphical', u'interfaces', u'complex', u'sample', u'sequencing', u'protocols', u'developers', u'correction', u'tracking', u'tool', u'timely', u'troubleshoot', u'testing', u'functional', u'manual', u'automated', u'participate', u'ongoing'],
         [u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation', u'corrections', u'monitor', u'implementation', u'recurrence', u'operating', u'statistical', u'quality', u'testing', u'global', u'multi', u'teams', u'travel', u'skills', u'concepts', u'waterfall', u'agile', u'methodologies', u'debugging', u'skills', u'complex', u'automated', u'instrumentation', u'environment', u'hardware', u'mechanical', u'components', u'tracking', u'lifecycle', u'management', u'quality', u'organize', u'define', u'priorities', u'organize', u'supervision', u'aggressive', u'deadlines', u'ambiguity', u'analyze', u'complex', u'situations', u'concepts', u'technologies', u'verbal', u'skills', u'effectively', u'technical', u'clinical', u'diverse', u'strategy', u'clinical', u'chemistry', u'analyzer', u'laboratory', u'middleware', u'basic', u'automated', u'testing', u'biomedical', u'engineering', u'technologists', u'laboratory', u'technology', u'availability', u'click', u'attach'],
         [u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology', u'solutions', u'subsidiary', u'asrc', u'engineering', u'technology', u'contracts'],
         [u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel', u'allows', u'solutions', u'complex', u'aeronautics', u'aviation', u'management', u'aviation', u'engineering', u'hughes', u'technical', u'technical', u'aviation', u'evaluation', u'engineering', u'management', u'technical', u'terminal', u'surveillance', u'programs', u'currently', u'scientist', u'travel', u'responsibilities', u'develops', u'technology', u'modifies', u'technical', u'complex', u'reviews', u'draft', u'conformity', u'completeness', u'testing', u'interface', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'skills', u'travel', u'programming', u'linux', u'environment', u'cisco', u'knowledge', u'terminal', u'environment', u'clearance', u'clearance', u'input', u'output', u'digital', u'automatic', u'terminal', u'management', u'controller', u'termination', u'testing', u'evaluating', u'policies', u'procedure', u'interface', u'installation', u'verification', u'certification', u'core', u'avionic', u'programs', u'knowledge', u'procedural', u'testing', u'interfacing', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'missions', u'asrc', u'subsidiaries', u'affirmative', u'employers', u'applicants', u'disability', u'veteran', u'technology', u'location', u'airport', u'bachelor', u'schedule', u'travel', u'contributor', u'management', u'asrc', u'reviews'],
         [u'technical', u'solarcity', u'niche', u'vegas', u'overview', u'resolving', u'customer', u'clients', u'expanding', u'engineers', u'developers', u'responsibilities', u'knowledge', u'planning', u'adapt', u'dynamic', u'environment', u'inventive', u'creative', u'solarcity', u'lifecycle', u'responsibilities', u'technical', u'analyzing', u'diagnosing', u'troubleshooting', u'customers', u'ticketing', u'console', u'escalate', u'knowledge', u'engineering', u'timely', u'basic', u'phone', u'functionality', u'customer', u'tracking', u'knowledgebase', u'rotation', u'configure', u'deployment', u'sccm', u'technical', u'deployment', u'deploy', u'hardware', u'solarcity', u'bachelor', u'knowledge', u'dell', u'laptops', u'analytical', u'troubleshooting', u'solving', u'skills', u'knowledge', u'databases', u'preferably', u'server', u'preferably', u'monitoring', u'suites', u'documentation', u'procedures', u'knowledge', u'entries', u'verbal', u'skills', u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance', u'vacation', u'savings', u'referral', u'eligibility', u'equity', u'performers', u'solarcity', u'affirmative', u'diversity', u'workplace', u'applicants', u'orientation', u'disability', u'veteran', u'careerrookie'],
         [u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition', u'networking', u'capabilities', u'classified', u'customer', u'motivated', u'develops', u'tests', u'innovative', u'solutions', u'minimal', u'supervision', u'paced', u'environment', u'enjoys', u'assignments', u'interact', u'multi', u'disciplined', u'challenging', u'focused', u'embedded', u'developments', u'spanning', u'engineering', u'lifecycle', u'specification', u'enhancement', u'applications', u'embedded', u'freescale', u'applications', u'android', u'platforms', u'interface', u'customers', u'developers', u'refine', u'specifications', u'architectures'],
         [u'java', u'programming', u'scripts', u'python', u'debug', u'debugging', u'emulators', u'regression', u'revisions', u'specialized', u'setups', u'capabilities', u'subversion', u'technical', u'documentation', u'multiple', u'engineering', u'techexpousa', u'reviews'],
         [u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology', u'resource', u'framework', u'schema', u'technologies', u'hadoop', u'warehouse', u'oracle', u'relational', u'artifacts', u'models', u'dictionaries', u'models', u'interface', u'specifications', u'documentation', u'harmonization', u'mappings', u'aligned', u'coordinate', u'technical', u'peer', u'reviews', u'stakeholder', u'communities', u'impact', u'domains', u'relationships', u'interdependencies', u'models', u'define', u'analyze', u'legacy', u'models', u'corporate', u'databases', u'architectural', u'alignment', u'customer', u'expertise', u'harmonization', u'modeling', u'modeling', u'consulting', u'stakeholders', u'quality', u'models', u'storage', u'agile', u'specifically', u'focus', u'modeling', u'qualifications', u'bachelors', u'accredited', u'modeler', u'encompass', u'evaluation', u'skills', u'knowledge', u'modeling', u'techniques', u'resource', u'framework', u'schema', u'technologies', u'unified', u'modeling', u'technologies', u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills', u'interpersonal', u'skills', u'customers', u'clearance', u'applicants', u'eligibility', u'classified', u'clearance', u'polygraph', u'techexpousa', u'solutions', u'partnership', u'solutions', u'integration'],
         [u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex', u'diverse', u'intensive', u'analytics', u'algorithm', u'manipulation', u'management', u'documented', u'individually', u'reviews', u'tests', u'components', u'adherence', u'resolves', u'utilizes', u'methodologies', u'environment', u'input', u'components', u'hardware', u'offs', u'reuse', u'cots', u'gots', u'synthesis', u'components', u'tasks', u'individually', u'analyzes', u'modifies', u'debugs', u'corrects', u'integrates', u'operating', u'environments', u'develops', u'queries', u'databases', u'repositories', u'recommendations', u'improving', u'documentation', u'develops', u'implements', u'algorithms', u'functional', u'assists', u'developing', u'executing', u'procedures', u'components', u'reviews', u'documentation', u'solutions', u'analyzing', u'conferring', u'users', u'engineers', u'analyzing', u'investigating', u'areas', u'adapt', u'hardware', u'mathematical', u'models', u'predict', u'outcome', u'implement', u'complex', u'database', u'repository', u'interfaces', u'queries', u'bachelors', u'accredited', u'substituted', u'bachelors', u'firewalls', u'ipsec', u'vpns', u'technology', u'administering', u'servers', u'apache', u'jboss', u'tomcat', u'developing', u'interfaces', u'firefox', u'internet', u'explorer', u'operating', u'mainframe', u'linux', u'solaris', u'virtual', u'scripting', u'programming', u'oriented', u'programming', u'ajax', u'script', u'procedures', u'cobol', u'cognos', u'fusion', u'focus', u'html', u'java', u'java', u'script', u'jquery', u'perl', u'visual', u'basic', u'powershell', u'cots', u'cots', u'oracle', u'apex', u'integration', u'competitive', u'package', u'bonus', u'corporate', u'equity', u'tuition', u'reimbursement', u'referral', u'bonus', u'holidays', u'insurance', u'flexible', u'disability', u'insurance'],
         [u'technologies', u'disability', u'accommodation', u'recruiter', u'techexpousa'],
         ['bank', 'river', 'shore', 'water'],
         ['river', 'water', 'flow', 'fast', 'tree'],
         ['bank', 'water', 'fall', 'flow'],
         ['bank', 'bank', 'water', 'rain', 'river'],
         ['river', 'water', 'mud', 'tree'],
         ['money', 'transaction', 'bank', 'finance'],
         ['bank', 'borrow', 'money'],
         ['bank', 'finance'],
         ['finance', 'money', 'sell', 'bank'],
         ['borrow', 'sell'],
         ['bank', 'loan', 'sell']
     ]
     # initializing using own LDA sufficient statistics so that we get same results each time.
     sstats = np.loadtxt(datapath('sstats_test.txt'))
     dictionary = Dictionary(texts)
     corpus = [dictionary.doc2bow(text) for text in texts]
     self.ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, num_topics=2,
                                           time_slice=[10, 10, 11], initialize='own', sstats=sstats)
Example #6
def trigramz(date_1, date_2):
    """We then grab the time slice and then we can run the ldaseq with the given time_slice"""

    # grab the time_slice of the documents that we need
    time_sliced = time_slice(date_1, date_2)
    print(time_sliced)
    if time_sliced == 'not in dictionary':
        raise ValueError('dates must be contained in text timeframe')
    trigram_dictionary = phrases()
    print(trigram_dictionary)
    trigram_bow_corpus = B_O_wCreator(trigram_reviews_filepath, trigram_dictionary)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # lda = LdaMulticore(trigram_bow_corpus,
        #                    num_topics=50,
        #                    id2word=trigram_dictionary,
        #                    workers=3)

        ldaseq = ldaseqmodel.LdaSeqModel(corpus=trigram_bow_corpus,
                                         id2word=trigram_dictionary,
                                         time_slice=time_sliced,
                                         num_topics=50)

    ldaseq.save(lda_model_filepaths)
    print(ldaseq)
Example #7
    # Count word frequencies
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    # Only keep words that appear more than once
    processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
    dictionary = corpora.Dictionary(processed_corpus)
    bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

    # Model
    num_topics = 5
    ldaseq = ldaseqmodel.LdaSeqModel(corpus=bow_corpus, id2word=dictionary, time_slice=time_slice, num_topics=num_topics)

    # Write the topics for each time slice to a CSV
    import csv
    word_num = 12
    out_path = "topic"
    with open(out_path + ".csv", "w") as csvfile:
        writer = csv.writer(csvfile)
        header = ["topic_" + str(i + 1) for i in range(num_topics)]
        writer.writerow(header)
        # one row per time slice, rendered by print_topics
        for j in range(len(time_slice)):
            writer.writerow(ldaseq.print_topics(j, word_num))

    # Visualising Dynamic Topic Models
    doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(time=0, corpus=bow_corpus)
Example #8
            decay=0.6,
            offset=0.8,
            passes=10,
            iterations=400,
            eval_every=10,
            model_file=model_file,
            only_viz=config.DO_NOT_COMPUTE_COHERENCE)
        print('topic: ' + str(num_topics) +
              ', time_slice: ' + ' '.join(str(i) + ':' + str(j)
                                          for i, j in enumerate(time_slices)) +
              ', c_v: ' + str(round(c_v, 4)) +
              ', u_mass: ' + str(round(u_mass, 4)))

        dyn_model = ldaseqmodel.LdaSeqModel(initialize='ldamodel',
                                            lda_model=model,
                                            time_slice=time_slices,
                                            corpus=corpus,
                                            id2word=dictionary,
                                            num_topics=num_topics,
                                            passes=10,
                                            random_state=config.SEED)
        filename = config.TRAIN_PARAMETERS[section][3]
        dyn_model.save(filename)

        for t in range(len(time_slices)):
            doc_topic, topic_term, doc_lengths, term_frequency, vocab = dyn_model.dtm_vis(
                time=t, corpus=corpus)
            prepared = pyLDAvis.prepare(topic_term_dists=topic_term,
                                        doc_topic_dists=doc_topic,
                                        doc_lengths=doc_lengths,
                                        vocab=vocab,
                                        term_frequency=term_frequency)
Example #9
                                            per_word_topics=True)

# In[54]:

#seqmodel
from gensim.models import ldaseqmodel
from gensim.corpora import Dictionary, bleicorpus
import numpy
from gensim.matutils import hellinger

# In[60]:

time_slice = [50, 145, 116, 81, 76, 19, 43, 64, 57, 65, 55, 16, 7]
#time_slice=[350]
ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus_new,
                                 id2word=id2word,
                                 time_slice=time_slice,
                                 num_topics=20)

# In[61]:

import pickle

with open('ldaseq20.dat', "wb") as file:
    pickle.dump(ldaseq, file)
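
Pickling works here, but LdaSeqModel also inherits gensim's save/load helpers; a minimal sketch with a hypothetical filename:

ldaseq.save('ldaseq20.model')
ldaseq = ldaseqmodel.LdaSeqModel.load('ldaseq20.model')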

# In[62]:

ldaseq.print_topics(time=0)
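
hellinger is imported above but never used in this snippet; a sketch of one common use, comparing the topic mixtures of two documents (the indices 0 and 1 are arbitrary):

print(hellinger(ldaseq.doc_topics(0), ldaseq.doc_topics(1)))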

# In[30]:

# Print the keywords in the topics
print(lda_model.print_topics())
Example #10
File: dtm_p.py  Project: elara7/dtm
# 读取时间段

t = open(main_path + 'corpus/dtm_o/time_series.txt', 'r')
time_series = [int(i) for i in t.read().split()]
t.close()
# build the model

model_gen = ldaseqmodel.LdaSeqModel(
    corpus=corpus,
    time_slice=time_series,
    id2word=corpus.dictionary,
    alphas=para['alpha'],
    num_topics=para['num_topics'],
    initialize=para['initialize'],
    sstats=None,
    lda_model=None,
    obs_variance=para['obs_variance'],
    chain_variance=para['top_chain_var'],
    passes=10,
    random_state=None,
    lda_inference_max_iter=para['lda_inference_max_iter'],
    em_min_iter=para['em_min_iter'],
    em_max_iter=para['em_max_iter'],
    chunksize=100)

# model_gen = LdaSeqModel(corpus = corpus, time_slice=time_series, id2word = dictionary, num_topics = num_topics)
print('model training finished')
model_gen.save(main_path + 'result/dtm_o_' + sys.platform + '_topic_' +
               str(para['num_topics']) + '.model')
print('model saved')
#model1 = DtmModel.load('topic1.model')
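
The commented-out load above targets the old DtmModel wrapper; the model saved here can be reloaded with LdaSeqModel itself. A sketch rebuilding the same path as the save call:

model_path = (main_path + 'result/dtm_o_' + sys.platform + '_topic_' +
              str(para['num_topics']) + '.model')
model_loaded = ldaseqmodel.LdaSeqModel.load(model_path)
print(model_loaded.print_topics(time=0))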

Example #11
def generate_timeslice_data(index_arr):
    # turn cumulative document indices into per-slice document counts
    arr = list(index_arr.values())
    diff_arr = []
    for i in range(1, len(arr)):
        diff_arr.append(arr[i] - arr[i - 1])
    return diff_arr
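
A small worked sketch of the helper above; the input dict is hypothetical and assumes time_index_arr maps each period to the cumulative start index of its documents, plus a final end boundary:

boundaries = {'2016': 0, '2017': 40, '2018': 95, 'end': 130}  # hypothetical cumulative indices
print(generate_timeslice_data(boundaries))  # -> [40, 55, 35]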


# In[4]:


time_slices = generate_timeslice_data(time_index_arr)


# In[ ]:


ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=id2word, time_slice=time_slices, num_topics=35)


Example #12
def run(n, folder, verbose=True):
    # documents is a list of lists, where each nested list has the words from one document
    documents = load_documents(folder)

    # remove common words
    stoplist = set(
        'for a an of the and or to in from on is are can we'.split())
    documents = [[word for word in document if word not in stoplist]
                 for document in documents]

    # remove words that appear only once
    from collections import defaultdict
    frequency = defaultdict(int)
    for document in documents:
        for word in document:
            frequency[word] += 1
    documents = [[word for word in document if frequency[word] > 1]
                 for document in documents]


    # use only the first n words per document
    documents = [document[:n] for document in documents]

    class DTMcorpus(textcorpus.TextCorpus):
        def get_texts(self):
            return self.input

        def __len__(self):
            return len(self.input)

    corpus = DTMcorpus(documents)

    first_half = len(documents) // 2
    second_half = len(documents) - first_half
    time_slice = [first_half,
                  second_half]  # the documents split into 2 time slices

    if verbose:
        # activate logging
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)

    # run
    ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus,
                                     id2word=corpus.dictionary,
                                     time_slice=time_slice,
                                     num_topics=5)

    # Visualizing dynamic topic models
    import pyLDAvis

    doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(
        time=0, corpus=corpus)
    vis_dtm = pyLDAvis.prepare(topic_term_dists=topic_term,
                               doc_topic_dists=doc_topic,
                               doc_lengths=doc_lengths,
                               vocab=vocab,
                               term_frequency=term_frequency)

    # For ipython notebook:
    # pyLDAvis.display(vis_dtm)

    # This works best for me (then view dtm.html in a browser)
    with open("dtm3.html", "w") as f:
        pyLDAvis.save_html(vis_dtm, f)

    return ("dtm3.html saved.")
Example #13
model_corpus_17 = tfidfmodel[ldacorpus_17]

print(sent_big_17[0])
print(ldacorpus_17[0])
print(model_corpus_17[0])

time_slice = [3323, 3324, 7485, 7486]
#Time slots I am considering
#First time slot represents the first half of Book1
#Second time slot represents the second half of Book1
#Third time slot represents the first half of Book7
#Fourth time slot represents the second half of Book7

#Run the LdaSeq model, which lets us track how the topics change over time.
ldaseq = ldaseqmodel.LdaSeqModel(corpus=ldacorpus_17,
                                 id2word=dictionary_big_17,
                                 time_slice=time_slice,
                                 num_topics=3)

# In[43]:

#Look at the three different topics at time 0 (corresponding to the first book)
ldaseq.print_topics(time=0)

# In[44]:

#Look at the three different topics at time 1 (corresponding to Book7)
ldaseq.print_topics(time=1)

# In[45]:
Example #14
    normalized = " ".join(word for word in punc_free.split() if len(word) > 4)
    return normalized


# In[5]:

doc_clean = [clean(d).split() for d in docs]
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc)
                   for doc in doc_clean]  #generating a document-term matrix

# In[6]:

#running the model on the data
ldaseq = ldaseqmodel.LdaSeqModel(doc_term_matrix,
                                 id2word=dictionary,
                                 time_slice=time_slice,
                                 num_topics=5)

# In[7]:

#printing the topics generated
ldaseq.print_topics(time=0)

# In[8]:

#printing evolution of topics in time slices
ldaseq.print_topic_times(topic=0)

# In[27]:

#testing the trained model on a document in the corpus
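
The snippet cuts off at this comment; a minimal sketch of the usual next step, mirroring the doc_topics call used in Example #16 (document index 0 is arbitrary):

print(ldaseq.doc_topics(0))  # topic proportions of the first training document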
Example #15
    def __len__(self):
        return len(self.input)


corpus = DTMcorpus(documents)

time_slice = [3, 7]  # 10 documents split into 2 time slices

# activate logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# run
ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus,
                                 id2word=corpus.dictionary,
                                 time_slice=time_slice,
                                 num_topics=5)

# Visualizing dynamic topic models
import pyLDAvis

doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(
    time=0, corpus=corpus)
vis_dtm = pyLDAvis.prepare(topic_term_dists=topic_term,
                           doc_topic_dists=doc_topic,
                           doc_lengths=doc_lengths,
                           vocab=vocab,
                           term_frequency=term_frequency)
Example #16
File: DTM.py  Project: Peizhi-Wang/MyReport
dictionary.compactify()       # remove the id gaps left after filtering words
dictionary.save('C:/Users/WCH/Desktop/try-DTM/news_dictionary')  # save the dictionary
# load the documents and build the corpus
class MyCorpus(object):
    def __iter__(self):
        for line in open('C:/Users/WCH/Desktop/try-DTM/testceshi.txt'):
            yield dictionary.doc2bow(line.lower().split())
corpus_memory_friendly = MyCorpus()
corpus = [vector for vector in corpus_memory_friendly]  # materialize the streamed documents as a corpus
corpora.BleiCorpus.serialize('C:/Users/WCH/Desktop/try-DTM/news_corpus', corpus)  # store the corpus in Blei's lda-c format

dictionary = Dictionary.load('C:/Users/WCH/Desktop/try-DTM/news_dictionary')
corpus = bleicorpus.BleiCorpus('C:/Users/WCH/Desktop/try-DTM/news_corpus')
time_slice = [1000, 1000, 1000, 821]   # time slices for this corpus: four periods with 1000, 1000, 1000 and 821 news items
num_topics = 5  # number of topics, here 5
ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, time_slice=time_slice, num_topics=num_topics)  # train on the corpus, dictionary and parameters

print ("########################################################")
corpusTopic = ldaseq.print_topics(time=0)  # 输出指定时期主题分布,此处第一个时期主题分布
print ("corpusTopic")
print(corpusTopic)
topicEvolution = ldaseq.print_topic_times(topic=0) # 查询指定主题在不同时期的演变,此处为第一个主题的
print ("topicEvolution")
print(topicEvolution)
doc = ldaseq.doc_topics(0) # 查询指定文档的主题分布,此处为第一篇文档的主题分布

print ("########################################################")
corpusTopic = ldaseq.print_topics(time=1)  # 输出指定时期主题分布,此处第一个时期主题分布
print ("corpusTopic")
print(corpusTopic)
topicEvolution = ldaseq.print_topic_times(topic=1) # 查询指定主题在不同时期的演变,此处为第一个主题的
Example #17
        WordCloud(background_color='white',
                  colormap='brg',
                  min_font_size=4,
                  max_words=80,
                  scale=1).fit_words(dict(LDAmain.show_topic(t, 200))))
    plt.axis("off")
    plt.title("Topic #" + str(t))
    plt.show()
    fig.savefig("visuals/LDAmaintop_%s.png" % t)

#-----------------------4. Dynamic topic modelling-------------------------------
time_slice = list_mon  # monthly document counts used as the time slices
ldaseq = ldaseqmodel.LdaSeqModel(initialize='ldamodel',
                                 lda_model=LDAmain,
                                 num_topics=5,
                                 corpus=corpus,
                                 id2word=dictionary,
                                 time_slice=time_slice,
                                 chain_variance=0.05)
ldaseq.print_topics(time=0, top_terms=20)
ldaseq.print_topic_times(topic=0, top_terms=20)
ldaseq.save("models/ldaseqmain")

#exporting results
np.savetxt("topics/dtmtop0file.csv",
           ldaseq.print_topic_times(topic=0),
           delimiter=",",
           fmt='%s')
np.savetxt("topics/dtmtop1file.csv",
           ldaseq.print_topic_times(topic=1),
           delimiter=",",
Example #18
dictionary.filter_extremes(
    no_below=10, no_above=0.25
)  #Filter out words that appear in fewer than 10 documents or in more than 25% of all documents

#Create the bag of words for all documents
bag_of_words = [dictionary.doc2bow(abstract) for abstract in abstracts]

print('- Read and preprocessed the dataset!')

########################## DYNAMIC TOPIC MODELING ##########################

#Build the model
print('- Training the model')
start_time = time.time()  #start timing
ldaseq = ldaseqmodel.LdaSeqModel(corpus=bag_of_words,
                                 id2word=dictionary,
                                 time_slice=time_slices_2years_interval,
                                 num_topics=8)
print('- Model finished running in', round((time.time() - start_time) / 60),
      'min(s)')
#Save the model
path = datapath('dynamic_model_code')
ldaseq.save(path)

########################## EVALUATION ##########################
coherence = ldaseq.dtm_coherence(time=0)
temp = CoherenceModel(topics=coherence,
                      corpus=bag_of_words,
                      dictionary=dictionary,
                      coherence='u_mass')
print("u_mass = ", temp.get_coherence())
temp = CoherenceModel(topics=coherence,