def create_word_mappings(saf, alignments, lowercase, out_dir):
    create_dirs(out_dir)

    alignment_data = json.load(alignments)
    aligned1 = alignment_data['gs']
    aligned2 = alignment_data['ocr']

    saf = json.load(saf)
    if lowercase:
        words = [w['word'].lower() for w in saf['tokens']]
        aligned1 = [c.lower() for c in aligned1]
        aligned2 = [c.lower() for c in aligned2]
    else:
        words = [w['word'] for w in saf['tokens']]

    wb = find_word_boundaries(words, aligned1)

    doc_id = remove_ext(alignments.name)

    res = {'gs': [], 'ocr': [], 'doc_id': []}
    for s, e in wb:
        w1 = u''.join(aligned1[s:e])
        w2 = u''.join(aligned2[s:e])

        res['gs'].append(w1.strip())
        res['ocr'].append(w2.strip())
        res['doc_id'].append(doc_id)

    # Use pandas DataFrame to create the csv, so commas and quotes are
    # properly escaped.
    df = pd.DataFrame(res)
    out_file = out_file_name(out_dir, doc_id, ext='csv')
    df.to_csv(out_file, encoding='utf-8')
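
# Hypothetical usage sketch for create_word_mappings (not part of the original
# module; the file names and output directory are made up). Both the SAF and
# alignments arguments appear to be opened file handles, since the function
# calls json.load() on them and reads alignments.name for the document id.
import codecs

with codecs.open('doc-0001.json', encoding='utf-8') as saf_file, \
        codecs.open('doc-0001.alignments.json', encoding='utf-8') as alignments_file:
    create_word_mappings(saf_file, alignments_file, lowercase=True,
                         out_dir='word_mappings')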
def ocrevaluation_extract(in_file, out_dir):
    create_dirs(out_dir)

    tables = []
    write = False

    (fd, tmpfile) = tempfile.mkstemp()
    with codecs.open(tmpfile, 'w', encoding='utf-8') as tmp:
        for line in in_file:
            if line.startswith('<h2>General'):
                write = True
            if line.startswith('<h2>Difference'):
                write = False
            if line.startswith('<h2>Error'):
                write = True

            if write:
                tmp.write(line)

    with codecs.open(tmpfile, encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'lxml')

    tables = soup.find_all('table')
    assert len(tables) == 2

    os.remove(tmpfile)

    doc = remove_ext(in_file.name)

    t = tables[0]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]

    # 'transpose' table_data
    lines = {}
    for data in table_data:
        for i, entry in enumerate(data):
            if i not in lines.keys():
                # add doc id to data line (but not to header)
                if i != 0:
                    lines[i] = [doc]
                else:
                    lines[i] = ['']
            lines[i].append(entry)

    out_file = os.path.join(out_dir, '{}-global.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for i in range(len(lines.keys())):
            f.write(u','.join(lines[i]))
            f.write(u'\n')

    t = tables[1]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]

    out_file = os.path.join(out_dir, '{}-character.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for data in table_data:
            f.write(u'"{}",'.format(data[0]))
            f.write(u','.join(data[1:]))
            f.write(u'\n')
def clin2018st_extract_text(json_file, out_dir):
    create_dirs(out_dir)

    corrections = {}
    gs_text = []
    text_with_errors = []

    text = json.load(json_file)
    for w in text['corrections']:
        span = w['span']
        # TODO: fix 'after'
        if 'after' in w.keys():
            print('Found "after" in {}.'.format(
                os.path.basename(json_file.name)))
        for i, w_id in enumerate(span):
            corrections[w_id] = {}
            if i == 0:
                corrections[w_id]['text'] = w['text']
            else:
                corrections[w_id]['text'] = u''
            corrections[w_id]['last'] = False
            if i == (len(span) - 1):
                corrections[w_id]['last'] = True

    for w in text['words']:
        w_id = w['id']
        gs_text.append(w['text'])
        if w_id in corrections.keys():
            text_with_errors.append(corrections[w_id]['text'])
        else:
            text_with_errors.append(w['text'])
        if w['space']:
            gs_text.append(u' ')
            text_with_errors.append(u' ')

    gs_file = remove_ext(json_file.name)
    gs_file = os.path.join(out_dir, '{}-gs.txt'.format(gs_file))
    with codecs.open(gs_file, 'wb', encoding='utf-8') as f:
        f.write(u''.join(gs_text))

    err_file = remove_ext(json_file.name)
    err_file = os.path.join(out_dir, '{}-errors.txt'.format(err_file))
    with codecs.open(err_file, 'wb', encoding='utf-8') as f:
        f.write(u''.join(text_with_errors))
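
# Illustrative shape of the CLIN2018 shared-task JSON that
# clin2018st_extract_text expects, inferred only from the keys accessed
# above ('words' with id/text/space, 'corrections' with span/text and an
# optional 'after'); the ids and strings are made up, not taken from the
# task data.
example_clin2018_input = {
    'words': [
        {'id': 'w1', 'text': 'example', 'space': True},
        {'id': 'w2', 'text': 'sentence', 'space': False},
    ],
    'corrections': [
        # 'span' lists the word ids the entry covers; its 'text' replaces
        # the first word of the span in the '-errors.txt' output.
        {'span': ['w2'], 'text': 'sentense'},
    ],
}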
def ocrevaluation_extract(in_file, out_dir):
    create_dirs(out_dir)

    soup = BeautifulSoup(in_file, 'lxml')

    tables = []
    for header in soup.find_all('h2'):
        if (header.text == 'General results'
                or header.text.startswith('Error rate')):
            tables.append(header.find_next('table'))
    assert len(tables) == 2

    doc = remove_ext(in_file.name)

    t = tables[0]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]

    # 'transpose' table_data
    lines = {}
    for data in table_data:
        for i, entry in enumerate(data):
            if i not in lines.keys():
                # add doc id to data line (but not to header)
                if i != 0:
                    lines[i] = [doc]
                else:
                    lines[i] = ['doc_id']
            lines[i].append(entry.replace(',', '.'))

    out_file = os.path.join(out_dir, '{}-global.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for i in range(len(lines.keys())):
            f.write(u';'.join(lines[i]))
            f.write(u'\n')

    t = tables[1]
    table_data = [[cell.text.replace(',', '.') for cell in row('td')]
                  for row in t('tr')]

    out_file = os.path.join(out_dir, '{}-character.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for data in table_data:
            f.write(u'"{}";'.format(data[0]))
            f.write(u';'.join(data[1:]))
            f.write(u'\n')
def ocrevaluation_extract(in_file, out_dir):
    create_dirs(out_dir)

    soup = BeautifulSoup(in_file, 'lxml')
    tables = soup.find_all('table')
    assert len(tables) == 3

    doc = remove_ext(in_file.name)

    # global measures: table[0]
    t = tables[0]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]

    # 'transpose' table_data
    lines = {}
    for data in table_data:
        for i, entry in enumerate(data):
            if i not in lines.keys():
                # add doc id to data line (but not to header)
                if i != 0:
                    lines[i] = [doc]
                else:
                    lines[i] = ['']
            lines[i].append(entry)

    out_file = os.path.join(out_dir, '{}-global.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for i in range(len(lines.keys())):
            f.write(u','.join(lines[i]))
            f.write(u'\n')

    # character measures: table[2]
    t = tables[2]
    table_data = [[cell.text for cell in row('td')] for row in t('tr')]

    out_file = os.path.join(out_dir, '{}-character.csv'.format(doc))
    with codecs.open(out_file, 'wb', encoding='utf-8') as f:
        for data in table_data:
            f.write(u'"{}",'.format(data[0]))
            f.write(u','.join(data[1:]))
            f.write(u'\n')
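
# Hypothetical invocation of ocrevaluation_extract (the file name is made up;
# the same call applies to any of the variants above). The in_file argument
# looks like an opened HTML report from the ocrevalUAtion tool, since the
# function parses it with BeautifulSoup and reads in_file.name for the
# document id.
import codecs

with codecs.open('doc-0001.html', encoding='utf-8') as report:
    ocrevaluation_extract(report, out_dir='ocrevaluation')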
def test_remove_ext_full_path():
    fname = '/home/jvdzwaan/data/test.txt'
    assert remove_ext(fname) == 'test'
def test_remove_ext_no_ext():
    fname = 'test'
    assert remove_ext(fname) == 'test'
def test_remove_ext_filename():
    fname = 'test.txt'
    assert remove_ext(fname) == 'test'
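
# The three tests above pin down the expected behaviour of remove_ext: strip
# the directory part and the extension from a path. A minimal sketch that
# satisfies them could look like this; it is named remove_ext_sketch because
# it is not necessarily the project's own definition.
import os


def remove_ext_sketch(fname):
    """Return the file name without directory and without extension."""
    base = os.path.basename(fname)
    return os.path.splitext(base)[0]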
def safar_add_metadata(in_file, in_file_meta, max_len, out_dir):
    """Add metadata from a csv file to a SAFAR XML file.
    """
    create_dirs(out_dir)

    analysis_tag = None
    total_words = None
    markers = b'<markers></markers>'

    # check whether the analysis_tag should be stemmer_analysis
    with codecs.open(in_file, 'r', encoding='utf-8') as xml_file:
        for line in xml_file:
            if re.search('morphology_analysis', line):
                analysis_tag = 'morphology_analysis'
            elif re.search('stemmer_analysis', line):
                analysis_tag = 'stemmer_analysis'

            m = re.search(r'total_words="(\d+)"', line)
            if m:
                total_words = m.group(1)

            if analysis_tag is not None and total_words is not None:
                break

    # Extract the words and markers
    click.echo('Extracting tokens')
    (fd, tmpfile) = tempfile.mkstemp()
    with codecs.open(tmpfile, 'wb') as words:
        context = etree.iterparse(in_file, events=('end', ),
                                  tag=('word', 'markers'), huge_tree=True)
        context = tqdm(context, total=int(total_words))
        for event, elem in context:
            if elem.tag == 'word':
                # Setting method to html (instead of xml) fixes problems
                # with writing Arabic characters in the value attribute of
                # the word element.
                words.write(
                    etree.tostring(elem, encoding='utf-8', method='html'))
            elif elem.tag == 'markers':
                markers = etree.tostring(elem, encoding='utf-8')

            # make iteration over context fast and consume less memory
            # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
    del context

    # Get the metadata
    md = pd.read_csv(in_file_meta, sep=',|;', index_col='BookURI',
                     encoding='utf-8')
    # make sure the index type is string
    if six.PY2:
        md.index = md.index.map(unicode)
    else:
        md.index = md.index.map(str)

    if '-' in os.path.basename(in_file):
        uri = os.path.basename(in_file).split('-', 1)[0]
    else:
        uri = remove_ext(in_file)

    try:
        md = md.loc[uri]

        metadata = [u'<metadata>']
        for key in md.keys()[1:]:  # skip over order (the old index)
            val = md[key]
            if isinstance(val, six.string_types):
                val = smart_strip(val)
            val = escape(val)
            # Make sure the values aren't too long, because
            # BlackLab doesn't allow values that are too long in dropdowns.
            # The default value of 94 was set empirically. It seems the
            # lengths of strings are calculated differently in Java (the
            # max length in Java is 256).
            if len(val) >= max_len:
                val = 'X ' + val[:max_len - 2]
            metadata.append(u'<meta name="{}">{}</meta>'.format(key, val))
        metadata.append(u'<meta name="{}">{}</meta>'.format('BookURI', uri))
        metadata.append(u'</metadata>')
        metadata = u'\n'.join(metadata)
    except KeyError:
        metadata = u'<metadata></metadata>'

    # Write output
    click.echo('Writing output')
    xml_out = out_file_name(out_dir, in_file)
    with codecs.open(xml_out, 'wb') as f:
        f.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
        f.write(b'<document>\n')
        f.write(metadata.encode('utf-8'))
        tag = ' <{} total_words="{}">\n'.format(analysis_tag, total_words)
        f.write(tag.encode('utf-8'))

        with codecs.open(tmpfile, 'rb') as words_file:
            for line in tqdm(words_file):
                f.write(line)

        f.write(' </{}>\n'.format(analysis_tag).encode('utf-8'))
        f.write(markers)
        f.write(b'</document>\n')

    os.remove(tmpfile)
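
# Hypothetical call to safar_add_metadata (paths are made up for
# illustration): in_file is a path to a SAFAR analysis XML file, in_file_meta
# a csv with a BookURI column, and max_len caps the length of metadata values
# (94 is the empirical default mentioned in the comments above).
safar_add_metadata('0123-analysis.xml', 'metadata.csv', max_len=94,
                   out_dir='out')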