def corpusframe_test(dm_path=DOCMODELS_PATH, max=1000):
    print('Testing corpusframe from dicts')
    timer = Timer()
    dl = [dm.to_dict() for i, dm in enumerate(DocModel.docmodel_generator(dm_path)) if i < max]
    df = pd.DataFrame(dl).set_index('id')
    print(f'Done making df from dicts, size: {df.memory_usage(index=True).sum() / (1024 ** 2)} mbs')
    timer.step()
    print(df)
def docmodel_tolist_speed_test(dm_path=DOCMODELS_PATH):
    print('Testing docmodel tolist speed')
    timer = Timer()
    r = [dm.metadata_to_dict() for dm in DocModel.docmodel_generator(dm_path)]
    print(f'Dm tolist speed test done. Made list with ids from {len(r)} docmodels in {timer.get_run_time()}')
def docmodel_read_speed_test(dm_path=DOCMODELS_PATH, max=1000000):
    print('Testing docmodel read speed')
    timer = Timer()
    for i, dm in enumerate(DocModel.docmodel_generator(dm_path)):
        t = dm.to_dict()  # result is discarded; we only time the read/convert step
        if (i + 1) == max:
            break
    print(f'Dm read speed test done. Read {max} docmodels in {timer.get_run_time()}')
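# Usage sketch (added for illustration, not part of the original pipeline): the three
# helpers above can be run back to back against the configured docmodels directory.
def run_docmodel_benchmarks(dm_path=DOCMODELS_PATH):
    corpusframe_test(dm_path, max=1000)
    docmodel_tolist_speed_test(dm_path)
    docmodel_read_speed_test(dm_path, max=10000)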
def extract_and_tag_docmodel_texts(path):
    """Loads and updates all DocModels in a dir by extracting and tagging abstracts and texts."""
    timer = Timer()
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
    print(f'Starting to extract and tag texts from docmodels at {path}...')
    for i, dm in enumerate(DocModel.docmodel_generator(path)):
        dm.extract_abstract(TRASH_SECTIONS)
        dm.extract_text(TRASH_SECTIONS)
        dm.treetag_abstract(tagger)
        dm.treetag_text(tagger)
        dm.save_to_pickle()
        if (i + 1) % 10000 == 0:
            print(f'Processed {i + 1} docmodels...')
    print(f'Done! Processing time: {timer.get_run_time()}')
def create_docmodels_from_xml_corpus(srs_path, save_path, extract_metadata=True):
    """Reads XMLs and creates DocModel objects. Extracts metadata if asked to, which should usually be the case."""
    timer = Timer()
    print(f'Starting to parse xml files at {srs_path}...')
    for i, filename in enumerate(os.listdir(srs_path)):
        try:
            DocModel(filename, ET.parse(srs_path / filename), save_path,
                     extract_metadata_on_init=extract_metadata)
        except Exception as e:
            print(f'Error on {filename}: {e}')
        if (i + 1) % 10000 == 0:
            print(f'Parsed {i + 1} files...')
    print(f'Done! Parsing time: {timer.get_run_time()}')
    print(f'Save path: {save_path}')
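# Illustrative sketch of the corpus-building order implied by the two functions above:
# first parse the raw XML files into pickled DocModels, then extract and tag their texts.
# The source directory used here is hypothetical; the real one comes from config.
def build_corpus_example():
    from pathlib import Path
    xml_dir = Path('data/xml_corpus')  # hypothetical source dir, adjust to the real corpus location
    create_docmodels_from_xml_corpus(xml_dir, DOCMODELS_PATH)
    extract_and_tag_docmodel_texts(DOCMODELS_PATH)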
def generate_metadata_from_mappings(docmodels_path, generate_doctype_cats=True,
                                    generate_primary_subjects=True, generate_secondary_subjects=True):
    """Generate 'subjects' and 'doctype cats' metadata from mappings loaded from CSV."""
    timer = Timer()
    if generate_doctype_cats:
        with open(DOCTYPE_CATS_CSV_PATH, newline='') as cd_csv:
            doctype_cats_mapping = {n[0]: n[1] for n in csv.reader(cd_csv)}
        print(doctype_cats_mapping)
    if generate_primary_subjects:
        with open(PRIMARY_SUBJECTS_CSV_PATH, newline='') as cd_csv:
            primary_subjects_mapping = {n[0]: [n[i] for i in range(1, len(n)) if n[i] != '']
                                        for n in csv.reader(cd_csv)}
        print(primary_subjects_mapping)
    if generate_secondary_subjects:
        with open(SECONDARY_SUBJECTS_CSV_PATH, newline='') as cd_csv:
            secondary_subjects_mapping = {n[0]: [n[i] for i in range(1, len(n)) if n[i] != '']
                                          for n in csv.reader(cd_csv)}
        print(secondary_subjects_mapping)
    for dm in DocModel.docmodel_generator(docmodels_path):
        if generate_doctype_cats:
            dm.extract_doctype_cat(doctype_cats_mapping)
        if generate_primary_subjects:
            dm.extract_primary_subjects(primary_subjects_mapping)
        if generate_secondary_subjects:
            dm.extract_secondary_subjects(secondary_subjects_mapping)
        dm.save_to_pickle(docmodels_path / dm.filename)
    print(f'Done extracting metadata from csvs. Parsing time: {timer.get_run_time()}')
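# Illustrative sketch (added; the row contents are made up, not taken from the real CSVs):
# how the subject-mapping comprehension above turns CSV rows into {key: [non-empty values]}.
# The doctype CSV follows the simpler pattern {first column: second column}.
def _subject_mapping_example():
    import csv, io
    sample = io.StringIO('Some Journal,biology,ecology,\nOther Journal,physics,,\n')
    mapping = {n[0]: [n[i] for i in range(1, len(n)) if n[i] != ''] for n in csv.reader(sample)}
    print(mapping)  # {'Some Journal': ['biology', 'ecology'], 'Other Journal': ['physics']}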
def lda_topics_main(name, topics, a, b, min_tokens, max_word_frequence, min_word_occs,
                    loops=100, decay=0.9, rnd=2112, learn_method='batch'):
    timer = Timer()
    param_dict = {'name': name, 'n_topics': topics, 'a': a, 'b': b, 'min_tokens': min_tokens,
                  'max_word_freq': max_word_frequence, 'min_word_occ': min_word_occs,
                  'loops': loops, 'decay': decay}

    # Define the path where the model files will go.
    # Creates a new dir if it does not exist, else old files will be overwritten.
    # Based on BASE_ANALYSIS_PATH, from config.
    path = BASE_ANALYSIS_PATH / 'LDA' / name
    path.mkdir(exist_ok=True)

    # Load docterm df
    # df = pickle.load(open(CORPUSFRAMES_PATH / 'abs_nva_docterm_corpusframe.p', 'rb'))
    # One-step preprocess: loads, filters and normalizes the abs nva docterm df.
    df = load_nva_abs_docterm_df(
        filtered_index_list=filter_metadata_index_list(min_text_tokens=2000),
        min_tokens=min_tokens,
        min_word_occs=min_word_occs,
        max_word_freq=max_word_frequence,
        log_norm=True)

    # Manual filtering steps replaced by the preprocessing above:
    # Only keep docs with enough tokens
    # df = df[(df.sum(axis=1) >= min_tokens)]
    # Remove words that occur in too few docs (by default, words in fewer than 10 docs)
    # and words that are in more than x% of the docs (by default, more than 30% of docs)
    # df = df.drop((df > 0).sum()[lambda x: (x < min_word_occs) | (x > max_word_frequence * len(df))].index, axis=1)
    # print(df)

    param_dict['num_docs'], param_dict['num_words'] = df.shape

    # tfidf or log normalization, one or both can be commented out
    # df = tfidf_docterm_df(df)
    # df = df.apply(lambda x: np.log(x + 1))

    # Fit model and save as pickle
    lda_model = make_and_train_lda_model(df, topics, loops, decay, a, b, learn_method, rnd)
    pickle.dump(lda_model, open(path / 'lda_model.p', 'wb'))
    # lda_model = pickle.load(open(path / 'lda_model.p', 'rb'))

    # Make topic-words df (index = topics, cols = words)
    topic_words_df = pd.DataFrame(lda_model.components_,
                                  index=[f'topic_{i}' for i in range(topics)],
                                  columns=df.columns)
    # Transform to get doc x topic distribution
    doc_topics_df = pd.DataFrame(lda_model.transform(df),
                                 columns=[f'topic_{i}' for i in range(topics)],
                                 index=df.index)

    # Normalize both, so that the sum of each row = 1. Can be commented out.
    topic_words_df = topic_words_df.apply(lambda x: x / topic_words_df.sum(axis=1))
    doc_topics_df = doc_topics_df.apply(lambda x: x / doc_topics_df.sum(axis=1))

    # Optional: add col with main topic for each doc / main word for each topic
    # doc_topics_df['main_topic'] = doc_topics_df.idxmax(axis=1)
    # topic_words_df['top_word'] = topic_words_df.idxmax(axis=1)

    # Optional: reduce to 3d with UMAP and add coords to doc_topics_df and topic_words_df
    # reducer = UMAP(n_components=3)
    # doc_topics_df[['x', 'y', 'z']] = reducer.fit_transform(doc_topics_df)
    # topic_words_df[['x', 'y', 'z']] = reducer.fit_transform(topic_words_df)

    # Save results!
    pickle.dump(doc_topics_df, open(path / 'doc_topics_df.p', 'wb'))
    pickle.dump(topic_words_df, open(path / 'topic_words_df.p', 'wb'))

    res = lda_model.components_
    names = df.columns
    param_dict['log likelihood'] = lda_model.score(df)
    param_dict['perplexity'] = lda_model.perplexity(df)

    # Print and save to txt the top words for each topic.
    # Does the same job as the csv output but is uglier and clunkier;
    # still, it's nice to have the results printed and it works.
    resdict = {}
    for num, topic in enumerate(res):
        top_word_ids = topic.argsort()[:-19:-1]  # indices of the 18 highest-weighted words
        resdict[num] = [names[i] for i in top_word_ids]
        # print([df.columns[topic.argsort()[i]] for i in range(10)])
    topics_str = '\n'.join([f'{key} - {", ".join(words)}' for key, words in resdict.items()])
    param_str = '\n'.join([f'{key} - {value}' for key, value in param_dict.items()])

    with open(path / 'topics.txt', 'wb') as f:
        f.write(topics_str.encode('utf-8'))
    with open(path / 'params.txt', 'wb') as f:
        f.write(param_str.encode('utf-8'))
    with open(path / 'topic_probs.csv', 'wb') as f:
        f.write(topics_to_csv(topic_words_df, 15, param_dict).encode('utf-8'))

    print(param_str)
    print('\n')
    print(topics_str)
    print(f'Done running lda main! Run time: {timer.get_run_time()}')
    print('\n\n\n')
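# Illustrative call (added sketch; the hyperparameter values are made up, not tuned values
# from the original experiments): trains one LDA model and writes its outputs under
# BASE_ANALYSIS_PATH / 'LDA' / 'example_run'.
def lda_example_run():
    lda_topics_main(name='example_run', topics=20, a=0.1, b=0.01,
                    min_tokens=100, max_word_frequence=0.3, min_word_occs=10,
                    loops=100, decay=0.9)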
print()
pickle.dump(wc, open(CORPUSFRAMES_PATH / 'text_word_counts_filtered_ids_df.p', 'wb'))

# tf = df['primary_subjects'].apply(pd.Series)
# print(tf.unique())
# pf = make_source_subjects_df(PRIMARY_SUBJECTS_CSV_PATH)
# sf = make_source_subjects_df(SECONDARY_SUBJECTS_CSV_PATH)
# pickle.dump(pf, open(CORPUSFRAMES_PATH / 'primary_subjects_df.p', 'wb'))
# pickle.dump(sf, open(CORPUSFRAMES_PATH / 'secondary_subjects_df.p', 'wb'))

assert False  # crude stop: the corpusframe main below only runs if this guard is removed

corpusframe_fct = make_metadata_corpusframe
save_name = 'metadata_corpusframe_2.p'
print('Running corpusframe main.')
print(f'Making corpusframe with {corpusframe_fct.__name__}, saving as {save_name}')
assert input('Enter \'Y\' to continue...').lower() == 'y'
timer = Timer()
df = corpusframe_fct()
pickle.dump(df, open(CORPUSFRAMES_PATH / save_name, 'wb'))
print(f'Done pickling! Run time: {timer.get_run_time()}')
print(f'Df using: {df.memory_usage(deep=True).sum() / (1024 ** 2)} mbs')
print(df.dtypes)
print(df)