def parse_txt(in_dir, mappings, out_dir):
    """
    Iterate over a directory of Banken text files and parse each volume
    to a JSON object written into out_dir.

    :param in_dir: directory containing the .txt volumes
    :param mappings: dict keyed by volume id -> {"AUTHOR", "TITLE", "PUBDATE"}
    :param out_dir: directory receiving one <id>.json file per volume
    """
    for subdir, dirs, files in os.walk(in_dir):
        for txt_f in tqdm.tqdm(files):
            if txt_f[0] == ".":
                continue  # skip hidden files such as .DS_Store
            id_str = txt_f[:-4]  # strip the ".txt" extension
            obj = Parsed()
            try:
                # keep the try narrow: only the metadata lookup is
                # allowed to fail silently, not parsing/serialization
                maps = mappings[id_str]
                obj.a = maps["AUTHOR"]
                obj.t = maps["TITLE"]
                obj.y = maps["PUBDATE"]
            except KeyError:
                # no metadata for this volume -- skip it (best-effort,
                # matching the original behavior)
                continue
            # os.path.join works whether or not in_dir ends with '/'
            with open(os.path.join(in_dir, txt_f), 'r',
                      encoding='utf-8') as txt_in:
                for line in txt_in:
                    add_content(line, obj, 'swedish')
            with open(os.path.join(out_dir, id_str + '.json'), 'w',
                      encoding='utf-8') as out:
                out.write(build_json(obj))
def parse_txt(in_dir, ids, out_dir):
    """
    Iterate over a directory of Gutenberg text files and parse each
    volume to a JSON object written into out_dir.

    Only text between the START/END "OF THIS PROJECT GUTENBERG EBOOK"
    markers is captured; author/title/year are looked up from the id on
    the 'Posting Date' line.

    :param in_dir: directory containing the .txt volumes
    :param ids: reference table consumed by match_pub_info
    :param out_dir: directory receiving one <name>.json file per volume
    """
    for subdir, dirs, files in os.walk(in_dir):
        for txt_f in tqdm.tqdm(files):
            if txt_f[0] == ".":
                continue  # skip hidden files
            reading = False
            obj = Parsed()
            # os.path.join works whether or not in_dir ends with '/'
            with open(os.path.join(in_dir, txt_f), 'r',
                      encoding='utf-8') as txt_in:
                for line in txt_in:
                    if 'Posting Date' in line:
                        idno = get_idno(line)
                        pub_info = match_pub_info(idno, ids)
                        obj.a, obj.t, obj.y = \
                            pub_info[1], pub_info[2], pub_info[3]
                    if 'START OF THIS PROJECT GUTENBERG EBOOK' in line:
                        reading = True
                    if 'END OF THIS PROJECT GUTENBERG EBOOK' in line:
                        reading = False
                    # capture body text, excluding the START marker line
                    if reading and \
                            'START OF THIS PROJECT GUTENBERG EBOOK' not in line:
                        add_content(line, obj, 'german')
            with open(os.path.join(out_dir, txt_f[:-4] + '.json'), 'w',
                      encoding='utf-8') as out:
                out.write(build_json(obj))
def _parse_files(self, doc, subdir):
    """
    Parse an individual XML volume *doc*, writing <doc>.json into
    self.output_dir.

    The file is looked up in self.input_dir first, falling back to
    *subdir* when it is not found there.
    """
    try:
        f = open("{0}/{1}".format(self.input_dir, doc), 'r')
    except FileNotFoundError:
        f = open("{0}/{1}".format(subdir, doc), 'r')
    # 'with' guarantees the handle is closed even if parsing or the
    # metadata lookup below raises (the original leaked it on error)
    with f:
        tree = BeautifulSoup(f.read(), 'xml')
    obj = Parsed()
    self.get_text(tree, obj)
    pub_info = self.mapping[doc[:-4]]  # metadata keyed by name sans ".xml"
    obj.a = pub_info["author"]
    obj.t = pub_info["title"]
    obj.y = pub_info["pub_date"]
    with open("{0}/{1}.json".format(self.output_dir, doc[:-4]), 'w',
              encoding='utf-8') as out:
        out.write(build_json(obj))
def parse_txt(in_dir, mappings, out_dir):
    """
    Iterate over a directory of Runeberg volume directories and parse
    each volume to a JSON object written into out_dir.

    Each volume directory contains a 'title' file (whose content is the
    metadata key) and a 'Pages/' subdirectory of per-page text files.

    :param in_dir: root directory of Runeberg volume directories
    :param mappings: dict keyed by title string -> {"AUTHOR", "TITLE", "PUBDATE"}
    :param out_dir: directory receiving the per-volume .json files
    """
    for _root, dirs, _files in os.walk(in_dir):
        for vol in tqdm.tqdm(dirs):
            if vol == "" or vol[0] == ".":
                continue  # skip hidden / empty directory names
            obj = Parsed()
            try:
                with open("{}/{}/title".format(in_dir, vol), 'r') as title_f:
                    id_str = title_f.read()
                maps = mappings[id_str]
            except (KeyError, FileNotFoundError):
                # no title file or no metadata entry -- skip the volume
                # (FileNotFoundError previously crashed the whole run)
                continue
            obj.a = maps["AUTHOR"]
            obj.t = maps["TITLE"]
            obj.y = maps["PUBDATE"]
            # fresh loop names: the original rebound subdir/dirs/files
            # here, shadowing the outer walk's variables
            pages_dir = "{}/{}/Pages/".format(in_dir, vol)
            for _pdir, _pdirs, page_files in os.walk(pages_dir):
                for text_f in page_files:
                    if text_f != "whole-page-ok.lst" and text_f[0] != ".":
                        with open("{}/{}/Pages/{}".format(in_dir, vol, text_f),
                                  'r') as txt_in:
                            for line in txt_in:
                                add_content(line, obj, 'swedish')
            # NOTE(review): vol[:-4] drops the last 4 chars of the
            # directory name -- confirm this matches the naming scheme
            with open(out_dir + vol[:-4] + '.json', 'w',
                      encoding='utf-8') as out:
                out.write(build_json(obj))
def parse_threaded(xml_doc, input_doc, output_doc, csv_in):
    """
    Parse a single XML volume (one worker's unit of work) into a JSON
    file under output_doc.

    :param xml_doc: filename of the XML volume
    :param input_doc: input directory path (expected to end with '/')
    :param output_doc: output directory path (expected to end with '/')
    :param csv_in: CSV file mapping filenames to publication dates
    """
    refs = get_pub_dates(csv_in)
    tree = ET.parse(input_doc + xml_doc)
    root = tree.getroot()
    obj = Parsed()
    get_text(root, obj)
    text = "".join(obj.c)
    # only emit a JSON file for volumes that actually contain text
    if text != "":
        try:
            with open(output_doc + xml_doc[:-4] + '.json', 'w',
                      encoding='utf-8') as out:
                get_title_and_author(root, obj)
                get_publication_info(root, obj)
                get_isbn(root, obj)
                # assumes every xml_doc appears in the CSV; a missing
                # entry raises KeyError -- TODO confirm that is intended
                obj.y = refs[xml_doc]
                doc_type(root, obj)
                get_chapters(root, obj)
                out.write(build_json(obj))
        except IOError:
            # best-effort: skip volumes whose output cannot be written
            pass
def parse_files(in_dir, out_dir, htids, language):
    """
    Walk the leaf directories of in_dir, match each .xml file's
    HathiTrust id against *htids*, and build a JSON volume from the
    sibling .zip of page text files.

    :param in_dir: root directory of the HathiTrust tree
    :param htids: dict htid -> (author, title, year)
    :param out_dir: directory receiving one <htid>.json per matched volume
    :param language: language passed to add_content for stemming/filtering
    """
    for folder, subfolders, files in os.walk(in_dir):
        if subfolders:
            continue  # only leaf directories hold the xml/zip pairs
        for xml_file in files:
            if xml_file[-4:] != ".xml":
                continue
            # test if htid is in the set of htids; store it and build
            # the file if true
            htid_test = test_file_htid(htids, folder, xml_file)
            if not htid_test[0]:
                continue
            htid = htid_test[1]
            obj = Parsed()
            # replace periods for file-naming
            obj.h = htid.replace(".", "_")
            try:
                obj.a = htids[htid][0]
                obj.t = htids[htid][1]
                obj.y = htids[htid][2]
            except KeyError:
                # best-effort: still build the volume, just without
                # metadata (original behavior)
                print("File with HTID {0} not found in CSV reference file."
                      .format(htid))
            for zip_file in files:
                if zip_file[-4:] == ".zip":
                    # os.path.join works whether or not folder ends with '/'
                    with zipfile.ZipFile(os.path.join(folder, zip_file),
                                         'r') as zf:
                        for txt_file in zf.namelist():
                            if txt_file[-4:] == ".txt":
                                text = zf.read(txt_file).decode('utf-8')
                                add_content(text, obj, language)
            with open(os.path.join(out_dir, str(obj.h) + ".json"), 'w',
                      encoding='utf-8') as out:
                out.write(build_json(obj))
def main():
    """
    Command-line entry point: parse a directory of XML volumes into
    JSON files, taking publication metadata from a CSV reference file.

    Flags: -i input directory, -o output directory, -csv metadata file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", metavar='in-directory', action="store",
                        help="input directory argument")
    parser.add_argument("-o", help="output directory argument",
                        action="store")
    parser.add_argument("-csv", help="csv file with publication dates",
                        action="store")
    try:
        args = parser.parse_args()
    except IOError:
        fail("IOError")
    build_out(args.o)
    if args.csv is not None:
        ids = parse_csv(args.csv)
    else:
        fail("Please specify input csv file path")
    for subdir, dirs, files in os.walk(args.i):
        for xmldoc in tqdm.tqdm(files):
            if xmldoc[0] == ".":
                continue  # skip hidden files
            # os.path.join works whether or not args.i ends with '/'
            tree = ET.parse(os.path.join(args.i, xmldoc))
            root = tree.getroot()
            base_url = get_id(root)
            obj = Parsed()
            get_text(root, obj)
            # only write volumes that produced some text content
            if len(obj.c) > 0:
                pub_info = get_pub_info(ids, base_url)
                obj.a, obj.t, obj.y = pub_info[0], pub_info[1], pub_info[2]
                with open(os.path.join(args.o, xmldoc[:-4] + '.json'), 'w',
                          encoding='utf-8') as out:
                    out.write(build_json(obj))
def build_json(file: Parsed):
    """
    Construct the JSON string that represents one volume in a corpus.

    Metadata fields the parser left unset are replaced with readable
    placeholder strings, newlines inside metadata are flattened to
    spaces, and the chapter list is passed through filter_chapters
    before serialization.
    """
    # fill in placeholder text for metadata that was never found
    placeholders = (
        ('t', "No title listed"),
        ('a', "No author listed"),
        ('p', "No publisher listed"),
        ('d', "No document type"),
        ('h', "No HTID for this file"),
    )
    for attr, fallback in placeholders:
        if getattr(file, attr) is None:
            setattr(file, attr, fallback)
    # the ISBN field defaults to the empty string rather than None
    if file.i == '':
        file.i = "No ISBN listed"
    # metadata strings must be single-line in the serialized output
    for attr in ('t', 'a', 'p', 'd'):
        setattr(file, attr, getattr(file, attr).replace("\n", " "))
    file.ch = filter_chapters(file.ch)
    payload = {
        'Title': file.t,
        'Author': file.a,
        'Publisher': file.p,
        'Date': file.y,
        'ISBN': file.i,
        'Document Type': file.d,
        'List of chapters': file.ch,
        'HTID': file.h,
        'Text': file.c,
        'Stemmed': file.cstem,
        'Filtered': file.tx,
        'Filtered Stemmed': file.txstem,
        'Full Sentences': file.c_sent,
        'Filtered Sentences': file.tx_sent,
        'Stemmed Sentences': file.cstem_sent,
        'Filtered Stemmed Sentences': file.txstem_sent,
        'URL': file.url
    }
    return json.dumps(payload, sort_keys=True, indent=4,
                      separators=(',', ': '), ensure_ascii=False)
def add_bs_xml_content(text: str, f: Parsed, lang: str):
    """
    Add content to a Parsed object from BeautifulSoup XML parser output.

    Records sentence-level and full-text forms (raw, stemmed, filtered,
    and filtered+stemmed) on *f*.
    """
    # sentence-level forms: split on sentence-ending punctuation
    for raw_sentence in re.split("[.!?]", text):
        cleaned = clean_text(raw_sentence)
        if len(cleaned) <= 1:
            continue  # ignore empty / one-token fragments
        f.add_content_sent(" ".join(cleaned))
        f.add_stemmed_sent(" ".join(stem_text(cleaned, lang)))
        filtered = filter_text(cleaned, lang)
        if len(filtered) > 1:
            f.add_filtered_sent(" ".join(filtered))
            f.add_filtered_stemmed_sent(" ".join(stem_text(filtered, lang)))
    # full-text forms
    all_tokens = clean_text(text)
    f.add_content(all_tokens)
    f.add_stemmed(stem_text(all_tokens, lang))
    all_filtered = filter_text(all_tokens, lang)
    f.add_filtered(all_filtered)
    f.add_filtered_stemmed(stem_text(all_filtered, lang))
def add_xml_content(root, file: Parsed, language: str):
    """
    Transform the text of an XML element (its .text plus .tail) into
    raw/filtered/stemmed forms and add them to a file object.

    :param root: ElementTree element whose .text/.tail are read
    :param file: Parsed object accumulating the volume's content
    :param language: language used for stemming and stop-word filtering
    """
    text = ''
    # compare against None directly: the old str(...) != 'None' check
    # wrongly skipped elements whose text is the literal string "None"
    if root.text is not None:
        text += root.text
    if root.tail is not None:
        text += ' ' + root.tail
    if text != '':
        # split after sentence-ending punctuation, keeping delimiters
        sentences = re.split('(?<=[.!?]) +', text)
        for sentence in sentences:
            sentence = clean_text(sentence)
            if len(sentence) > 1:
                file.add_content_sent(" ".join(sentence))
                sentence_stemmed = stem_text(sentence, language)
                file.add_stemmed_sent(" ".join(sentence_stemmed))
                sentence_filtered = filter_text(sentence, language)
                if len(sentence_filtered) > 1:
                    file.add_filtered_sent(" ".join(sentence_filtered))
                    sentence_filtered_stemmed = stem_text(
                        sentence_filtered, language)
                    file.add_filtered_stemmed_sent(
                        " ".join(sentence_filtered_stemmed))
        text_list = clean_text(text)  # full text
        file.add_content(text_list)
        # stem the full text
        stemmed = stem_text(text_list, language)
        file.add_stemmed(stemmed)
        # filter the unstemmed full text
        filtered = filter_text(text_list, language)
        file.add_filtered(filtered)
        # stem the filtered text
        filtered_stemmed = stem_text(filtered, language)
        file.add_filtered_stemmed(filtered_stemmed)
def add_content(text: str, file: Parsed, language: str):
    """
    Transform *text* into raw/filtered/stemmed forms, at both sentence
    and full-text granularity, and add them to *file*.
    """
    # sentence-level forms: split after sentence-ending punctuation
    for raw in re.split('(?<=[.!?]) +', text):
        tokens = clean_text(raw)
        if len(tokens) <= 1:
            continue  # ignore empty / one-token fragments
        file.add_content_sent(" ".join(tokens))
        file.add_stemmed_sent(" ".join(stem_text(tokens, language)))
        filtered_tokens = filter_text(tokens, language)
        if len(filtered_tokens) > 1:
            file.add_filtered_sent(" ".join(filtered_tokens))
            file.add_filtered_stemmed_sent(
                " ".join(stem_text(filtered_tokens, language)))
    # full-text forms
    all_tokens = clean_text(text)
    file.add_content(all_tokens)
    file.add_stemmed(stem_text(all_tokens, language))
    all_filtered = filter_text(all_tokens, language)
    file.add_filtered(all_filtered)
    file.add_filtered_stemmed(stem_text(all_filtered, language))