import logging
import os
from typing import Optional, Set

import nltk
import pandas
import tqdm

from util.chart import AxisLabel, DistributionChart
from util.distribution import filter_distribution_if_in, plot_top_word_distribution, word_distribution_per_topics
from util.keyword import split_keywords, Topic
from util.log import init_local_logger, log_execution
from util.natural_language import lemmatize

# Module-private logger, created with logging.INFO and handed to the
# @log_execution decorator on main() below. This line needs `import logging`
# among the module imports; without it the module fails with NameError on import.
_logger = init_local_logger(logging.INFO)


@log_execution(_logger)
def main(*, papers_details_mendeley_path: str, relevant_papers_path: str):
    """Run the per-topic word distribution over the Mendeley keywords.

    Both arguments are keyword-only paths to CSV files loaded with pandas.

    :param papers_details_mendeley_path: CSV read into the papers-details
        frame; its first column is used as the index.
    :param relevant_papers_path: CSV read into the relevant-papers frame
        (default index).
    """
    def output_path(tag):
        # Output files are named ``distribution_of_mendeley_keywords.<tag>``
        # and live under ``phase1_2/generated``.
        directory = os.path.join('phase1_2', 'generated')
        return os.path.join(directory, f'distribution_of_mendeley_keywords.{tag}')

    details_frame: pandas.DataFrame = pandas.read_csv(
        papers_details_mendeley_path,
        index_col=0,
    )
    relevant_frame: pandas.DataFrame = pandas.read_csv(relevant_papers_path)

    # The path builder is passed as a plain callable: callback protocols
    # currently are not supported by PyCharm :(
    # NOTE(review): `outputs` is not used in the visible part of this function.
    outputs = word_distribution_per_topics(
        output_path,
        count_lemmas,
        details_frame,
        relevant_frame,
    )
# Example #2
# 0
import math
import os
import traceback
from dataclasses import dataclass
from enum import Enum
from typing import Tuple

import pandas

import bibtexparser

from util.bibtex_entry import id_tag_of

from util.log import init_local_logger, log_execution

# Module-private logger (judging by its name), created by init_local_logger
# with level=logging.INFO; it is also handed to the @log_execution decorator
# on main() below.
_logger = init_local_logger(level=logging.INFO)


class ForumType(Enum):
    """Closed set of forum kinds, each carrying a lowercase string value.

    NOTE(review): presumably the kind of venue a bibliography entry was
    published in; its usage is not visible in this part of the file, so
    confirm against the callers.
    """
    # Member values are plain strings, so ForumType('journal') resolves to
    # ForumType.JOURNAL and ForumType('conference') to ForumType.CONFERENCE.
    JOURNAL = 'journal'
    CONFERENCE = 'conference'


@log_execution(_logger)
def main(*, raw_papers_path: str) -> Tuple[str, ...]:
    parser = bibtexparser.bparser.BibTexParser(common_strings=True, ignore_nonstandard_types=False)
    with open(raw_papers_path, 'r', encoding='utf-8') as bibfile:
        bibliography: bibtexparser.bibdatabase.BibDatabase = bibtexparser.load(bibfile, parser)

    papers = pandas.DataFrame()
    for entry in bibliography.get_entry_list():