def parse_and_preprocess_src(data_source, corpus_destination, preprocess=True):
    """Load a raw data source, optionally preprocess it, and save it as a Corpus.

    The loader is selected by looking for a known marker substring in the
    lower-cased ``data_source``. Sources that contain none of the markers are
    loaded with ``DataHandler.get_abstracts``.

    Args:
        data_source: String locating the raw data; it is handed unchanged to
            the selected ``DataHandler`` loader.
        corpus_destination: Target passed to ``Corpus.save_corpus``.
        preprocess: When true, ``Preprocessor.preprocess`` is run on the
            loaded documents before the corpus is built.

    Note:
        The corpus language is read from the first loaded document, so a
        loader that returns no documents makes that lookup fail.
    """
    # Markers are literal text, so plain substring checks are used; treating
    # "unv1.0-tei" as a regular expression would let "." match any character.
    source_key = data_source.lower()

    if "bundestag" in source_key:
        name = "bundestag"
        raw_corpus = DataHandler.get_bundestag_speeches(directory=data_source)
    elif "sustainability" in source_key:
        name = "sustainability"
        raw_corpus = DataHandler.get_sustainability_data(path=data_source)
    elif "unv1.0-tei" in source_key:
        name = "united_nations"
        raw_corpus = DataHandler.get_un_texts(directory=data_source)
    elif "state_of_the_union" in source_key:
        name = "state_of_the_union"
        raw_corpus = DataHandler.get_state_of_the_union(directory=data_source)
    else:
        # Fallback: anything unrecognised is treated as the abstracts data set.
        name = "abstracts"
        raw_corpus = DataHandler.get_abstracts(path=data_source)

    # All documents of one source are assumed to share the first document's
    # language -- TODO confirm against the DataHandler loaders.
    language = raw_corpus[0].language
    print('loaded', len(raw_corpus), 'documents')

    if preprocess:
        Preprocessor.preprocess(raw_corpus, language=language)
        print('preprocessed', len(raw_corpus), 'documents')

    corpus = Corpus(source=raw_corpus, language=language, name=name)
    print('parsed', len(corpus.get_documents(as_list=True)), 'documents to a Corpus')
    corpus.save_corpus(corpus_destination)
def _pair_author_tokens(raw_author):
    """Convert a comma-separated author string into a list of paired names.

    The string is split on commas and the stripped tokens are consumed two at
    a time, producing ``'<second token>. <first token>'`` for each pair.
    Values that are not strings are returned unchanged.
    """
    if not isinstance(raw_author, str):
        return raw_author
    tokens = [token.strip() for token in raw_author.split(',')]
    # zip stops at the shorter slice, so an unpaired trailing token is dropped.
    return [
        f'{second}. {first}'
        for first, second in zip(tokens[::2], tokens[1::2])
    ]


def cleaning_authors(config, overwrite=False):
    """Normalise the ``author`` attribute of every document to a list.

    For each enabled corpus the documents are visited and their author value
    is rewritten: NaN floats become ``None``, Bundestag authors are wrapped in
    a one-element list, and authors of the other corpora are split into
    paired names. Finally three counters are printed: documents skipped
    because their language is not ``"English"``, documents with more than one
    author, and documents with at most one author.

    Args:
        config: Mapping whose ``config["corpora"][<corpus name>]`` entry is the
            corpus location used for loading and saving.
        overwrite: When false, the existing corpus file is first renamed to
            the path returned by ``create_new_filepath_uncleaned``.
    """
    corpus_names = [
        "bundestag_corpus",
        # Currently disabled:
        # "sustainability_corpus",
        # "abstract_corpus"
    ]
    languages = [Language.DE, Language.EN, Language.EN]

    wrong_language_count = 0
    multi_author_count = 0
    single_author_count = 0

    # Names and languages are matched by position.
    for corpus_name, language in zip(corpus_names, languages):
        corpus_path = config["corpora"][corpus_name]
        corpus = Corpus(source=corpus_path, language=language, name=corpus_name)

        for document in corpus.get_documents():
            raw_author = document.author
            if not raw_author:
                continue

            # NaN is truthy, so it has to be detected explicitly.
            if isinstance(raw_author, float) and np.isnan(raw_author):
                document.author = None
                continue

            if corpus_name == "bundestag_corpus":
                authors = [raw_author]
            elif corpus_name == "sustainability_corpus":
                authors = _pair_author_tokens(raw_author)
            else:
                if document.language != "English":
                    wrong_language_count += 1
                    continue
                authors = _pair_author_tokens(raw_author)

            if len(authors) > 1:
                multi_author_count += 1
                print(raw_author, authors)
            else:
                single_author_count += 1
            document.author = authors

        if not overwrite:
            backup_path = create_new_filepath_uncleaned(corpus_path)
            os.rename(src=corpus_path, dst=backup_path)
        corpus.save_corpus(corpus_path)

    print(wrong_language_count, multi_author_count, single_author_count)
def cleaning_punctuation(config, overwrite=False):
    """Run ``remove_punctuation`` over each configured corpus and save it.

    Args:
        config: Mapping whose ``config["corpora"][<corpus name>]`` entry is the
            corpus location used for loading and saving.
        overwrite: When false, the existing corpus file is first renamed to
            the path returned by ``create_new_filepath_uncleaned``.
    """
    corpus_languages = (
        ("bundestag_corpus", Language.DE),
        ("sustainability_corpus", Language.EN),
        ("abstract_corpus", Language.EN),
    )

    for corpus_name, language in corpus_languages:
        corpus_path = config["corpora"][corpus_name]
        corpus = Corpus(source=corpus_path, language=language, name=corpus_name)
        remove_punctuation(corpus)

        if not overwrite:
            # Move the previous file aside before writing the cleaned corpus.
            backup_path = create_new_filepath_uncleaned(corpus_path)
            os.rename(src=corpus_path, dst=backup_path)
        corpus.save_corpus(corpus_path)
def cleaning_un(config, overwrite=True):
    """Drop undated United Nations documents and store dates as integers.

    The corpus is loaded, rebuilt from only those documents whose ``date`` is
    truthy, and each remaining date is converted with ``int``. The corpus size
    is printed before and after the conversion.

    Args:
        config: Mapping whose ``config["corpora"]["united_nations_corpus"]``
            entry is the corpus location used for loading and saving.
        overwrite: When false, the existing corpus file is first renamed to
            the path returned by ``create_new_filepath_uncleaned``.
    """
    corpus_path = config["corpora"]["united_nations_corpus"]

    # NOTE(review): the UN corpus is opened with Language.DE -- confirm this
    # is the intended language.
    loaded = Corpus(source=corpus_path,
                    language=Language.DE,
                    name="united_nations_corpus")
    dated_documents = [doc for doc in loaded.get_documents() if doc.date]
    corpus = Corpus(source=dated_documents,
                    language=loaded.language,
                    name=loaded.name)
    print("1", len(corpus))

    # int() raises for dates that are not integer-convertible.
    for doc in corpus.get_documents():
        doc.date = int(doc.date)
    print("2", len(corpus))

    if not overwrite:
        backup_path = create_new_filepath_uncleaned(corpus_path)
        os.rename(src=corpus_path, dst=backup_path)
    corpus.save_corpus(corpus_path)
def cleaning_bundestag(config, overwrite=True):
    """Drop undated Bundestag documents and store dates as integers.

    The corpus is loaded, rebuilt from only those documents whose ``date`` is
    truthy, and each remaining date is converted with ``int``. The corpus size
    is printed before and after the conversion.

    Args:
        config: Mapping whose ``config["corpora"]["bundestag_corpus"]`` entry
            is the corpus location used for loading and saving.
        overwrite: When false, the existing corpus file is first renamed to
            the path returned by ``create_new_filepath_uncleaned``.
    """
    corpus_path = config["corpora"]["bundestag_corpus"]

    loaded = Corpus(source=corpus_path,
                    language=Language.DE,
                    name="bundestag_corpus")
    dated_documents = [doc for doc in loaded.get_documents() if doc.date]
    corpus = Corpus(source=dated_documents,
                    language=loaded.language,
                    name=loaded.name)
    print("1", len(corpus))

    # int() raises for dates that are not integer-convertible.
    for doc in corpus.get_documents():
        doc.date = int(doc.date)
    print("2", len(corpus))

    if not overwrite:
        backup_path = create_new_filepath_uncleaned(corpus_path)
        os.rename(src=corpus_path, dst=backup_path)
    corpus.save_corpus(corpus_path)
def cleaning_abstracts(config, overwrite=True):
    """Keep only abstracts dated with a four-digit year and store it as int.

    The corpus is loaded and its size printed, then rebuilt from the documents
    whose ``date`` is truthy and whose text form is exactly four decimal
    digits. Each remaining date is converted with ``int`` and the new size is
    printed before saving.

    Args:
        config: Mapping whose ``config["corpora"]["abstract_corpus"]`` entry
            is the corpus location used for loading and saving.
        overwrite: When false, the existing corpus file is first renamed to
            the path returned by ``create_new_filepath_uncleaned``.
    """
    corpus_path = config["corpora"]["abstract_corpus"]

    corpus = Corpus(source=corpus_path,
                    language=Language.EN,
                    name="abstract_corpus")
    print("1", len(corpus))

    # The date is checked through its string form so non-string dates (e.g.
    # an int year) do not raise AttributeError. isdecimal() is used instead of
    # isnumeric() because it accepts only digits that int() can parse, so
    # every document that passes the filter converts cleanly below.
    dated_documents = [
        doc for doc in corpus.get_documents()
        if doc.date
        and len(str(doc.date)) == 4
        and str(doc.date).isdecimal()
    ]
    corpus = Corpus(dated_documents, name=corpus.name, language=Language.EN)

    for doc in corpus.get_documents():
        doc.date = int(doc.date)
    print("2", len(corpus))

    if not overwrite:
        backup_path = create_new_filepath_uncleaned(corpus_path)
        os.rename(src=corpus_path, dst=backup_path)
    corpus.save_corpus(corpus_path)