Example #1
def corpusframe_test(dm_path=DOCMODELS_PATH, max=1000):
    print('Testing corpusframe from dicts')
    timer = Timer()
    dl = [
        dm.to_dict()
        for i, dm in enumerate(DocModel.docmodel_generator(dm_path)) if i < max
    ]
    df = pd.DataFrame(dl).set_index('id')
    print(
        f'Done making df from dicts, size: {df.memory_usage(index=True).sum() / (1024 ** 2):.2f} MB'
    )
    timer.step()
    print(df)
Example #2
def docmodel_tolist_speed_test(dm_path=DOCMODELS_PATH):
    print('Testing docmodel tolist speed')
    timer = Timer()
    r = [dm.metadata_to_dict() for dm in DocModel.docmodel_generator(dm_path)]
    print(
        f'Dm tolist speed test done. Made list with ids from {len(r)} docmodels in {timer.get_run_time()}'
    )
Example #3
def docmodel_read_speed_test(dm_path=DOCMODELS_PATH, max=1000000):
    print('Testing docmodel read speed')
    timer = Timer()
    count = 0
    for i, dm in enumerate(DocModel.docmodel_generator(dm_path)):
        dm.to_dict()  # result discarded; we only time the read and dict conversion
        count = i + 1
        if count == max:
            break
    print(
        f'Dm read speed test done. Read {count} docmodels in {timer.get_run_time()}'
    )
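The three benchmarks above can be chained into a small driver. A minimal sketch, assuming all three functions live in the same module and DOCMODELS_PATH comes from the shared config:

if __name__ == '__main__':
    # Hypothetical driver: run the DocModel benchmarks back to back on the default corpus path.
    docmodel_read_speed_test(max=10000)   # raw unpickle + to_dict() throughput
    docmodel_tolist_speed_test()          # metadata-only extraction over the whole corpus
    corpusframe_test(max=10000)           # DataFrame construction from the first 10k dicts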
Example #4
def extract_and_tag_docmodel_texts(path):
    """Loads and updates all DocModels in a dir by extracting and tagging abstracts and texts"""

    timer = Timer()
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
    print(f'Starting to extract and tag texts from docmodels at {path}...')
    for i, dm in enumerate(DocModel.docmodel_generator(path)):
        dm.extract_abstract(TRASH_SECTIONS)
        dm.extract_text(TRASH_SECTIONS)
        dm.treetag_abstract(tagger)
        dm.treetag_text(tagger)
        dm.save_to_pickle()
        if (i + 1) % 10000 == 0:
            print(f'Processed {i+1} docmodels...')
    print(f'Done! Processing time: {timer.get_run_time()}')
Example #5
def create_docmodels_from_xml_corpus(srs_path,
                                     save_path,
                                     extract_metadata=True):
    """Reads XMLs and create DocModel objects. Extracts metadata if asked to, which should usually be the case."""

    timer = Timer()
    print(f'Starting to parse xml files at {srs_path}...')
    for i, filename in enumerate(os.listdir(srs_path)):
        try:
            DocModel(filename,
                     ET.parse(srs_path / filename),
                     save_path,
                     extract_metadata_on_init=extract_metadata)
        except Exception:
            print(f'Error on {filename}')
        if (i + 1) % 10000 == 0:
            print(f'Parsed {i+1} files...')
    print(f'Done! Parsing time: {timer.get_run_time()}')
    print(f'Save path: {save_path}')
Example #6
def generate_metadata_from_mappings(docmodels_path,
                                    generate_doctype_cats=True,
                                    generate_primary_subjects=True,
                                    generate_secondary_subjects=True):
    """Generate 'subjects' and 'doctype cats' metadata from mappings loaded from CSV"""

    timer = Timer()

    if generate_doctype_cats:
        with open(DOCTYPE_CATS_CSV_PATH, newline='') as cd_csv:
            doctype_cats_mapping = {n[0]: n[1] for n in csv.reader(cd_csv)}
            print(doctype_cats_mapping)
    if generate_primary_subjects:
        with open(PRIMARY_SUBJECTS_CSV_PATH, newline='') as cd_csv:
            primary_subjects_mapping = {
                n[0]: [n[i] for i in range(1, len(n)) if n[i] != '']
                for n in csv.reader(cd_csv)
            }
            print(primary_subjects_mapping)
    if generate_secondary_subjects:
        with open(SECONDARY_SUBJECTS_CSV_PATH, newline='') as cd_csv:
            secondary_subjects_mapping = {
                n[0]: [n[i] for i in range(1, len(n)) if n[i] != '']
                for n in csv.reader(cd_csv)
            }
            print(secondary_subjects_mapping)

    for dm in DocModel.docmodel_generator(docmodels_path):
        if generate_doctype_cats:
            dm.extract_doctype_cat(doctype_cats_mapping)
        if generate_primary_subjects:
            dm.extract_primary_subjects(primary_subjects_mapping)
        if generate_secondary_subjects:
            dm.extract_secondary_subjects(secondary_subjects_mapping)
        dm.save_to_pickle(docmodels_path / dm.filename)
    print(
        f'Done extracting metadata from csvs. Parsing time: {timer.get_run_time()}'
    )
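Taken together, the three helpers above form a corpus-building pipeline: parse the source XML files into DocModel pickles, extract and TreeTag the texts, then attach the CSV-derived metadata. A minimal sketch of that ordering, where SOURCE_XML_PATH is an assumed config constant not shown in the source:

def build_corpus(srs_path=SOURCE_XML_PATH, save_path=DOCMODELS_PATH):
    # Hypothetical driver; SOURCE_XML_PATH is an assumption, the rest comes from the snippets above.
    create_docmodels_from_xml_corpus(srs_path, save_path, extract_metadata=True)
    extract_and_tag_docmodel_texts(save_path)
    generate_metadata_from_mappings(save_path)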
Example #7
def lda_topics_main(name,
                    topics,
                    a,
                    b,
                    min_tokens,
                    max_word_frequence,
                    min_word_occs,
                    loops=100,
                    decay=0.9,
                    rnd=2112,
                    learn_method='batch'):
    timer = Timer()
    param_dict = {
        'name': name,
        'n_topics': topics,
        'a': a,
        'b': b,
        'min_tokens': min_tokens,
        'max_word_freq': max_word_frequence,
        'min_word_occ': min_word_occs,
        'loops': loops,
        'decay': decay
    }

    # Define the path where the model files will go
    # Creates the directory if it does not exist; otherwise old files will be overwritten
    # Based on BASE_ANALYSIS_PATH, from config.
    path = BASE_ANALYSIS_PATH / 'LDA' / name
    path.mkdir(exist_ok=True)

    # Load docterm df
    # df = pickle.load(open(CORPUSFRAMES_PATH / 'abs_nva_docterm_corpusframe.p', 'rb'))

    # One step preprocess. Loads, filters and normalizes abs nva docterm df
    df = load_nva_abs_docterm_df(
        filtered_index_list=filter_metadata_index_list(min_text_tokens=2000),
        min_tokens=min_tokens,
        min_word_occs=min_word_occs,
        max_word_freq=max_word_frequence,
        log_norm=True)

    # Only keep docs with enough tokens
    # df = df[(df.sum(axis=1) >= min_tokens)]

    # Remove words that occur in too few docs (by default, removes words that happen in less than 10 docs)
    # and
    # Remove words that are in more than x% of the docs (by default, remove words that are in more than 30% of docs)
    # df = df.drop((df > 0).sum()[lambda x: (x < min_word_occs) | (x > max_word_frequence*len(df))].index, axis=1)

    # print(df)

    param_dict['num_docs'], param_dict['num_words'] = df.shape

    # tfidf or log normalization, one or both can be commented out
    #df = tfidf_docterm_df(df)
    # df = df.apply(lambda x: np.log(x + 1))

    # Fit model and save as pickle
    lda_model = make_and_train_lda_model(df, topics, loops, decay, a, b,
                                         learn_method, rnd)
    pickle.dump(lda_model, open(path / 'lda_model.p', 'wb'))
    # lda_model = pickle.load(open(path / 'lda_model.p', 'rb'))

    # Make topic words df (index = topics, cols = words)
    topic_words_df = pd.DataFrame(lda_model.components_,
                                  index=[f'topic_{i}' for i in range(topics)],
                                  columns=df.columns)

    # Transform to get doc x topic dist, normalized
    doc_topics_df = pd.DataFrame(lda_model.transform(df),
                                 columns=[f'topic_{i}' for i in range(topics)],
                                 index=df.index)

    # Normalize both, so that the sum of each row = 1. Can be commented out
    topic_words_df = topic_words_df.apply(
        lambda x: x / topic_words_df.sum(axis=1))
    doc_topics_df = doc_topics_df.apply(
        lambda x: x / doc_topics_df.sum(axis=1))

    # add col with main topic for each doc
    # add col with main word for each topic
    # doc_topics_df['main_topic'] = doc_topics_df.idxmax(axis=1)
    # topic_words_df['top_word'] = topic_words_df.idxmax(axis=1)

    # Reduce to 3d with UMAP and add x/y/z cols to doc_topics_df
    # Reduce to 3d with UMAP and add x/y/z cols to topic_words_df
    # reducer = UMAP(n_components=3)
    # doc_topics_df[['x', 'y', 'z']] = reducer.fit_transform(doc_topics_df)
    # topic_words_df[['x', 'y', 'z']] = reducer.fit_transform(topic_words_df)

    # Save results!
    pickle.dump(doc_topics_df, open(path / 'doc_topics_df.p', 'wb'))
    pickle.dump(topic_words_df, open(path / 'topic_words_df.p', 'wb'))

    res = lda_model.components_
    names = df.columns
    param_dict['log likelihood'] = lda_model.score(df)
    param_dict['perplexity'] = lda_model.perplexity(df)

    # Print and save to txt the top words for each topic
    # Covers the same data as the csv export below, but it's handy to have the results printed too
    resdict = {}
    for num, topic in enumerate(res):
        top_word_ids = topic.argsort()[:-19:-1]  # indices of the 18 highest-weight words
        resdict[num] = [names[i] for i in top_word_ids]
        # print([df.columns[topic.argsort()[i]] for i in range(10)])
    topics_str = '\n'.join([
        f'{key} - {", ".join(words)}' for key, words in resdict.items()
    ])
    param_str = '\n'.join(
        [f'{key} - {value}' for key, value in param_dict.items()])

    with open(path / 'topics.txt', 'wb') as f:
        f.write(topics_str.encode('utf-8'))

    with open(path / 'params.txt', 'wb') as f:
        f.write(param_str.encode('utf-8'))

    with open(path / 'topic_probs.csv', 'wb') as f:
        f.write(topics_to_csv(topic_words_df, 15, param_dict).encode('utf-8'))

    print(param_str)
    print('\n')
    print(topics_str)
    print(f'Done running lda main! Run time: {timer.get_run_time()}')
    print('\n\n\n')
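A hypothetical invocation of lda_topics_main follows; every value is illustrative only (the 0.3 frequency cap and 10-occurrence floor echo the defaults mentioned in the filtering comments above), not settings taken from the source:

# Illustrative call; all values are assumptions, not the authors' actual settings.
lda_topics_main(name='lda_test_50_topics',
                topics=50,
                a=0.1,                    # presumably the LDA document-topic prior
                b=0.01,                   # presumably the LDA topic-word prior
                min_tokens=250,           # drop docs with fewer tokens
                max_word_frequence=0.3,   # drop words appearing in >30% of docs
                min_word_occs=10,         # drop words appearing in <10 docs
                loops=100)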
Example #8
    print()
    pickle.dump(
        wc, open(CORPUSFRAMES_PATH / 'text_word_counts_filtered_ids_df.p',
                 'wb'))

    # tf = df['primary_subjects'].apply(pd.Series)
    # print(tf.unique())

    # pf = make_source_subjects_df(PRIMARY_SUBJECTS_CSV_PATH)
    # sf = make_source_subjects_df(SECONDARY_SUBJECTS_CSV_PATH)
    # pickle.dump(pf, open(CORPUSFRAMES_PATH / 'primary_subjects_df.p', 'wb'))
    # pickle.dump(sf, open(CORPUSFRAMES_PATH / 'secondary_subjects_df.p', 'wb'))

    # Deliberate early stop: the corpusframe-building code below is currently disabled
    assert False
    corpusframe_fct = make_metadata_corpusframe
    save_name = 'metadata_corpusframe_2.p'

    print('Running corpusframe main.')
    print(
        f'Making corpusframe with dict {corpusframe_fct.__name__}, saving as {save_name} '
    )
    assert input('Enter \'Y\' to continue...').lower() == 'y'

    timer = Timer()
    df = corpusframe_fct()
    pickle.dump(df, open(CORPUSFRAMES_PATH / save_name, 'wb'))
    print(f'Done pickling! Run time: {timer.get_run_time()}')
    print(f'Df using: {df.memory_usage(deep=True).sum() / (1024 ** 2):.2f} MB')
    print(df.dtypes)
    print(df)