import codecs
import glob
import os
import shutil
from collections import Counter

from bokeh.plotting import figure, output_file, save

import metadata
import preprocessing


def main():
    #### PREPROCESSING ###################################
    # obtain a dict with the metadata on each text:
    meta = metadata.metadata_dict()

    # convert the articles in the original xml files to plain text:
    preprocessing.parse_secondary_dbnl(max_documents=60000)

    # frog the plain articles:
    # preprocessing.frog_articles()

    """
    #### WIKIFICATION ###################################
    # construct the wikifier:
    wikifier = Wikifier()

    # collect relevant page_ids for your categories:
    wikifier.relevant_page_ids(fresh=False)
    page_ids = wikifier.page_ids[:100]

    # collect ids of pages that backlink to your relevant pages:
    wikifier.backlinking_pages(page_ids=page_ids, fresh=False)

    # collect all mentions of the target pages in the backlinks
    # (this will take a while!):
    wikifier.mentions_from_backlinks(fresh=False)

    # turn the collected mentions into a matrix (vectorization):
    input_dim, output_dim = wikifier.vectorize_wiki_mentions(fresh=False,
                                                             max_features=500)

    # optimize a classifier to label new mentions:
    # dev_acc, test_acc = wikifier.classifier(input_dim=input_dim, output_dim=output_dim,
    #                                         fresh=True, test=True, nb_epochs=2)

    # train the final classifier on all data:
    wikifier.classifier(input_dim=input_dim, output_dim=output_dim,
                        fresh=False, test=False, nb_epochs=1)

    ######## (the following is specific to the dbnl data) ########
    # collect all unique NEs in the corpus
    # and get the pages which the wikipedia search interface links them to:
    wikifier.extract_unique_nes(fresh=False, max_documents=100,
                                max_words_per_doc=150)

    # use the trained wikifier to disambiguate the NEs in the corpus:
    wikifier.disambiguate_nes(max_documents=100, max_words_per_doc=150)
    """
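# The wikification block in main() is currently disabled (wrapped in a string
# literal), and the Wikifier class itself is not part of this section. The
# skeleton below only restates the interface that main() relies on; method
# bodies and return types are inferred from usage, not from the actual
# implementation:

class Wikifier:
    """Interface implied by main() above (sketch, not the real implementation)."""

    def __init__(self):
        self.page_ids = []  # ids of Wikipedia pages considered relevant

    def relevant_page_ids(self, fresh=False):
        """Collect page ids for the target categories; fills self.page_ids."""
        raise NotImplementedError

    def backlinking_pages(self, page_ids=None, fresh=False):
        """Collect ids of pages that link back to the relevant pages."""
        raise NotImplementedError

    def mentions_from_backlinks(self, fresh=False):
        """Harvest mentions of the target pages from the backlinking pages."""
        raise NotImplementedError

    def vectorize_wiki_mentions(self, fresh=False, max_features=500):
        """Turn the mentions into a matrix; returns (input_dim, output_dim)."""
        raise NotImplementedError

    def classifier(self, input_dim, output_dim, fresh=False, test=False, nb_epochs=1):
        """Train the mention classifier; with test=True, returns (dev_acc, test_acc)."""
        raise NotImplementedError

    def extract_unique_nes(self, fresh=False, max_documents=100, max_words_per_doc=150):
        """Collect unique NEs and their candidate wiki pages (dbnl-specific)."""
        raise NotImplementedError

    def disambiguate_nes(self, max_documents=100, max_words_per_doc=150):
        """Disambiguate the corpus NEs with the trained classifier (dbnl-specific)."""
        raise NotImplementedError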
def parse_secondary_dbnl(max_documents=100):
    """
    Parses all xml-files under the ../texts directory.
    Only considers files with:
        - genre = 'sec - letterkunde'
        - subgenre = 'tijdschrift / jaarboek'
        - 1945 < date < 2002
    Additionally, only articles recognized as Dutch ('nl') are included.
    All individual 'chapters' (i.e. articles) are saved separately
    under ../workspace/periodicals.
    """
    year_counts = Counter()

    # get metadata:
    metadata_dict = metadata.metadata_dict()

    # keep track:
    document_cnt = 0  # nb of documents (i.e. 'journal issues')
    article_cnt = 0   # nb of chapters (i.e. 'articles/reviews')

    # initialize directories:
    if not os.path.isdir('../workspace'):
        os.mkdir('../workspace')
    if not os.path.isdir('../figures'):
        os.mkdir('../figures')
    if os.path.isdir('../workspace/periodicals'):
        shutil.rmtree('../workspace/periodicals')
    os.mkdir('../workspace/periodicals')

    # iterate over the full texts which we have:
    for filepath in glob.glob('../texts/*.xml'):
        # remove trailing "_01":
        text_id = os.path.splitext(os.path.basename(filepath))[0][:-3]

        # see whether we have all the necessary metadata for the text:
        try:
            title = metadata_dict[text_id]['title']
            date = metadata_dict[text_id]['year']
            genre = metadata_dict[text_id]['genre']
            subgenre = metadata_dict[text_id]['subgenre']
        except KeyError:
            continue

        # limit to post-war studies on literature in periodicals
        # (check for unknown years first, to avoid comparing str to int):
        if genre == 'sec - letterkunde' and \
           subgenre == 'tijdschrift / jaarboek' and \
           date != "???" and 1945 < date < 2002:
            print(">>>", title)

            # collect the individual articles in the issue:
            articles = xml_to_articles(filepath)
            if articles:
                for idx, article in enumerate(articles):
                    new_filepath = '../workspace/periodicals/'
                    new_filepath += text_id + "-" + str(idx + 1) + \
                                    '-' + str(date) + '.txt'
                    with codecs.open(new_filepath, 'w', 'utf-8') as f:
                        f.write(article)

                    # update stats:
                    article_cnt += 1
                    year_counts[date] += 1

            # update cnts:
            document_cnt += 1
            if document_cnt >= max_documents:
                break

    print('nb issues parsed:', document_cnt)
    print('nb individual articles extracted:', article_cnt)

    # visualize distribution over time:
    cnts = sorted(year_counts.items())
    output_file('../figures/nb_articles_yearly.html')
    p = figure(plot_width=1200, plot_height=400,
               x_axis_label='year', y_axis_label='nb articles')
    p.line([y for y, _ in cnts], [c for _, c in cnts], line_width=2)
    save(p)
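# xml_to_articles() is called above but not defined in this section. Below is a
# minimal sketch of what such a helper could look like, assuming a TEI-like DBNL
# layout (one <div> per article, no namespaces) and using langdetect to keep only
# articles recognized as Dutch; both the tag name and the langdetect dependency
# are assumptions here, not the project's confirmed choices.

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from lxml import etree


def xml_to_articles(filepath):
    """Split one DBNL xml file into plain-text articles (sketch)."""
    try:
        tree = etree.parse(filepath)
    except etree.XMLSyntaxError:
        return None

    articles = []
    for div in tree.iter('div'):  # assumption: one <div> per article/chapter
        text = ' '.join(div.itertext()).strip()
        if not text:
            continue
        try:
            if detect(text) == 'nl':  # keep only Dutch-language articles
                articles.append(text)
        except LangDetectException:  # raised on undetectable input
            continue
    return articles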
# A second variant of parse_secondary_dbnl(): identical pipeline, but it targets
# issues published after 2001 and writes to ../workspace/periodicals_tmp.
def parse_secondary_dbnl(max_documents=100):
    """
    Parses all xml-files under the ../texts directory.
    Only considers files with:
        - genre = 'sec - letterkunde'
        - subgenre = 'tijdschrift / jaarboek'
        - date > 2001
    Additionally, only articles recognized as Dutch ('nl') are included.
    All individual 'chapters' (i.e. articles) are saved separately
    under ../workspace/periodicals_tmp.
    """
    year_counts = Counter()

    # get metadata:
    metadata_dict = metadata.metadata_dict()

    # keep track:
    document_cnt = 0  # nb of documents (i.e. 'journal issues')
    article_cnt = 0   # nb of chapters (i.e. 'articles/reviews')

    # initialize directories:
    if not os.path.isdir('../workspace'):
        os.mkdir('../workspace')
    if not os.path.isdir('../figures'):
        os.mkdir('../figures')
    if os.path.isdir('../workspace/periodicals_tmp'):
        shutil.rmtree('../workspace/periodicals_tmp')
    os.mkdir('../workspace/periodicals_tmp')

    # iterate over the full texts which we have:
    for filepath in glob.glob('../texts/*.xml'):
        # remove trailing "_01":
        text_id = os.path.splitext(os.path.basename(filepath))[0][:-3]

        # see whether we have all the necessary metadata for the text:
        try:
            title = metadata_dict[text_id]['title']
            date = metadata_dict[text_id]['year']
            genre = metadata_dict[text_id]['genre']
            subgenre = metadata_dict[text_id]['subgenre']
        except KeyError:
            continue

        # limit to recent (post-2001) studies on literature in periodicals
        # (check for unknown years first, to avoid comparing str to int):
        if genre == 'sec - letterkunde' and \
           subgenre == 'tijdschrift / jaarboek' and \
           date != "???" and date > 2001:
            print(">>>", title)

            # collect the individual articles in the issue:
            articles = xml_to_articles(filepath)
            if articles:
                for idx, article in enumerate(articles):
                    new_filepath = '../workspace/periodicals_tmp/'
                    new_filepath += text_id + "-" + str(idx + 1) + \
                                    '-' + str(date) + '.txt'
                    with codecs.open(new_filepath, 'w', 'utf-8') as f:
                        f.write(article)

                    # update stats:
                    article_cnt += 1
                    year_counts[date] += 1

            # update cnts:
            document_cnt += 1
            if document_cnt >= max_documents:
                break

    print('nb issues parsed:', document_cnt)
    print('nb individual articles extracted:', article_cnt)

    # visualize distribution over time:
    cnts = sorted(year_counts.items())
    output_file('../figures/nb_articles_yearly.html')
    p = figure(plot_width=1200, plot_height=400,
               x_axis_label='year', y_axis_label='nb articles')
    p.line([y for y, _ in cnts], [c for _, c in cnts], line_width=2)
    save(p)
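# Standard entry point, assuming this module is meant to be run directly:
# main() above then drives the full preprocessing (and, once re-enabled,
# wikification) pipeline.
if __name__ == '__main__':
    main()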