def process_file(filename, outdir):
    """Collect the citations of *filename* and write them out into *outdir*.

    Every ``(key, ref, val)`` triple produced by ``read_citations`` is stored
    in a ``CitationSet`` as ``format_ref(ref)`` followed by the untexified,
    decoded value.  Three files are then written, all named after the input
    file's base name without its extension: ``<name>_map.txt`` (via
    ``write_mapfile``), ``<name>.xml`` (via ``write_nlm``) and ``<name>.txt``
    (via ``write_txt``).
    """
    collected = CitationSet()
    for entry in read_citations(filename):
        key, ref, val = entry
        prefix = format_ref(ref)
        body = untexify(decode_utf(val))
        collected.add(key, prefix + body)

    # Base name of the input without directory and extension.
    stem = os.path.splitext(os.path.basename(filename))[0]

    map_path = os.path.join(outdir, '%s_map.txt' % stem)
    collected.write_mapfile(map_path)

    nlm_path = os.path.join(outdir, '%s.xml' % stem)
    collected.write_nlm(nlm_path)

    txt_path = os.path.join(outdir, '%s.txt' % stem)
    collected.write_txt(txt_path)
def name_parser(s):
    """Wrap the parts of a personal name in <given-names>/<surname> tags.

    The input is passed through ``decode_utf`` and ``untexify`` and then split
    into tokens: abbreviations (one ASCII letter followed by a dot), words of
    two or more word characters/hyphens, and whatever lies between them.
    Tokens are classified scanning right to left:

    * an abbreviation is always ``given-names``;
    * a word is ``surname`` if it is the rightmost word or if the name
      contains any abbreviation, otherwise it is ``given-names``;
    * whitespace-only tokens are ``whitespace``, everything else ``other``.

    Neighbouring tokens with the same tag are merged (also across an
    intervening whitespace token), only the rightmost surname run keeps the
    ``surname`` tag, and the result is rendered with ``whitespace``/``other``
    text emitted verbatim.

    Args:
        s: the raw name string.

    Returns:
        The tagged name as a single string; an empty string when the input
        yields no tokens.
    """
    # NOTE(review): this file appears to define name_parser a second time
    # further down; that later definition is the one bound at import time.
    abbrv_name = r'(((?<=\s)|\A)[A-Za-z]\.)'
    long_string = r'((?:\w|[-])(?:\w|[-])+)'
    # Compile once instead of re-parsing the patterns for every token.
    abbrv_re = re.compile(abbrv_name, flags=re.UNICODE)
    long_re = re.compile(long_string, flags=re.UNICODE)
    blank_re = re.compile(r'\A\s*\Z', flags=re.UNICODE)

    def name_tokens(tokens):
        """Tag each token, deciding surname vs. given name right to left."""
        had_abbrv = any(abbrv_re.match(tok) for tok in tokens)
        had_last = False
        tagged = []
        for token in reversed(tokens):
            if abbrv_re.match(token):
                tag = 'given-names'
            elif long_re.match(token):
                if not had_last or had_abbrv:
                    had_last = True
                    tag = 'surname'
                else:
                    tag = 'given-names'
            elif blank_re.match(token):
                tag = 'whitespace'
            else:
                tag = 'other'
            tagged.append((tag, token))
        tagged.reverse()
        return tagged

    def merge(named_tokens):
        """Concatenate directly adjacent tokens that carry the same tag."""
        merged = [named_tokens[0]]
        for tag, text in named_tokens[1:]:
            last_tag, last_text = merged[-1]
            if tag == last_tag:
                merged[-1] = (tag, last_text + text)
            else:
                merged.append((tag, text))
        return merged

    def supermerge(merged):
        """Join same-tag tokens that are separated only by whitespace."""
        supermerged = [merged[0]]
        i = 1
        while i < len(merged):
            # The bounds check keeps a trailing whitespace token from
            # indexing past the end of the list.
            if (merged[i][0] == 'whitespace'
                    and i + 1 < len(merged)
                    and merged[i + 1][0] == supermerged[-1][0]):
                supermerged[-1] = (
                    merged[i + 1][0],
                    supermerged[-1][1] + merged[i][1] + merged[i + 1][1])
                i += 2
            else:
                supermerged.append(merged[i])
                i += 1
        return supermerged

    def unify_surname(supermerged):
        """Keep only the rightmost surname; demote the rest to given names."""
        renamed = []
        had_surname = False
        for tag, text in reversed(supermerged):
            if tag == 'surname' and had_surname:
                renamed.append(('given-names', text))
            else:
                if tag == 'surname':
                    had_surname = True
                renamed.append((tag, text))
        renamed.reverse()
        return renamed

    def list2xml(renamed):
        """Render tagged tokens; whitespace/other text is emitted as is."""
        result = []
        for tag, text in renamed:
            if tag in ('whitespace', 'other'):
                result.append(text)
            else:
                result.append('<%s>%s</%s>' % (tag, text, tag))
        return result

    s = untexify(decode_utf(s))
    pieces = flatten([long_re.split(part) for part in abbrv_re.split(s)])
    # Materialise the tokens: they are scanned twice and reversed, which a
    # one-shot filter() iterator (Python 3) does not support.
    tokens = [piece for piece in pieces if len(piece) > 0]
    if not tokens:
        # Nothing to tag; the merge helpers require at least one token.
        return ''

    named_tokens = name_tokens(tokens)
    merged = merge(named_tokens)
    supermerged = supermerge(merged)
    renamed = unify_surname(supermerged)
    renamed = supermerge(renamed)
    result = list2xml(renamed)
    return ''.join(result)
def get_mixed_citations(filename, parsing_fun):
    """Yield ``(no, text)`` pairs for every citation read from *filename*.

    For each ``(no, ref, val)`` triple from ``read_citations`` the text is
    ``format_ref(ref)`` followed by the untexified result of applying
    *parsing_fun* to the decoded value.
    """
    for record in read_citations(filename):
        no, ref, val = record
        prefix = format_ref(ref)
        parsed = parsing_fun(decode_utf(val))
        yield (no, prefix + untexify(parsed))
def name_parser(s):
    """Wrap the parts of a personal name in <given-names>/<surname> tags.

    The input is passed through ``decode_utf`` and ``untexify`` and then split
    into tokens: abbreviations (one ASCII letter followed by a dot), words of
    two or more word characters/hyphens, and whatever lies between them.
    Tokens are classified scanning right to left:

    * an abbreviation is always ``given-names``;
    * a word is ``surname`` if it is the rightmost word or if the name
      contains any abbreviation, otherwise it is ``given-names``;
    * whitespace-only tokens are ``whitespace``, everything else ``other``.

    Neighbouring tokens with the same tag are merged (also across an
    intervening whitespace token), only the rightmost surname run keeps the
    ``surname`` tag, and the result is rendered with ``whitespace``/``other``
    text emitted verbatim.

    Args:
        s: the raw name string.

    Returns:
        The tagged name as a single string; an empty string when the input
        yields no tokens.
    """
    # NOTE(review): an earlier definition of name_parser appears higher up in
    # this file; this later one replaces it at import time.
    abbrv_name = r'(((?<=\s)|\A)[A-Za-z]\.)'
    long_string = r'((?:\w|[-])(?:\w|[-])+)'
    # Compile once instead of re-parsing the patterns for every token.
    abbrv_re = re.compile(abbrv_name, flags=re.UNICODE)
    long_re = re.compile(long_string, flags=re.UNICODE)
    blank_re = re.compile(r'\A\s*\Z', flags=re.UNICODE)

    def name_tokens(tokens):
        """Tag each token, deciding surname vs. given name right to left."""
        had_abbrv = any(abbrv_re.match(tok) for tok in tokens)
        had_last = False
        tagged = []
        for token in reversed(tokens):
            if abbrv_re.match(token):
                tag = 'given-names'
            elif long_re.match(token):
                if not had_last or had_abbrv:
                    had_last = True
                    tag = 'surname'
                else:
                    tag = 'given-names'
            elif blank_re.match(token):
                tag = 'whitespace'
            else:
                tag = 'other'
            tagged.append((tag, token))
        tagged.reverse()
        return tagged

    def merge(named_tokens):
        """Concatenate directly adjacent tokens that carry the same tag."""
        merged = [named_tokens[0]]
        for tag, text in named_tokens[1:]:
            last_tag, last_text = merged[-1]
            if tag == last_tag:
                merged[-1] = (tag, last_text + text)
            else:
                merged.append((tag, text))
        return merged

    def supermerge(merged):
        """Join same-tag tokens that are separated only by whitespace."""
        supermerged = [merged[0]]
        i = 1
        while i < len(merged):
            # The bounds check keeps a trailing whitespace token from
            # indexing past the end of the list.
            if (merged[i][0] == 'whitespace'
                    and i + 1 < len(merged)
                    and merged[i + 1][0] == supermerged[-1][0]):
                supermerged[-1] = (
                    merged[i + 1][0],
                    supermerged[-1][1] + merged[i][1] + merged[i + 1][1])
                i += 2
            else:
                supermerged.append(merged[i])
                i += 1
        return supermerged

    def unify_surname(supermerged):
        """Keep only the rightmost surname; demote the rest to given names."""
        renamed = []
        had_surname = False
        for tag, text in reversed(supermerged):
            if tag == 'surname' and had_surname:
                renamed.append(('given-names', text))
            else:
                if tag == 'surname':
                    had_surname = True
                renamed.append((tag, text))
        renamed.reverse()
        return renamed

    def list2xml(renamed):
        """Render tagged tokens; whitespace/other text is emitted as is."""
        result = []
        for tag, text in renamed:
            if tag in ('whitespace', 'other'):
                result.append(text)
            else:
                result.append('<%s>%s</%s>' % (tag, text, tag))
        return result

    s = untexify(decode_utf(s))
    pieces = flatten([long_re.split(part) for part in abbrv_re.split(s)])
    # Materialise the tokens: they are scanned twice and reversed, which a
    # one-shot filter() iterator (Python 3) does not support.
    tokens = [piece for piece in pieces if len(piece) > 0]
    if not tokens:
        # Nothing to tag; the merge helpers require at least one token.
        return ''

    named_tokens = name_tokens(tokens)
    merged = merge(named_tokens)
    supermerged = supermerge(merged)
    renamed = unify_surname(supermerged)
    renamed = supermerge(renamed)
    result = list2xml(renamed)
    return ''.join(result)
def get_mixed_citations(filename, parsing_fun):
    """Generate ``(no, text)`` tuples from the citations in *filename*.

    ``read_citations`` supplies ``(no, ref, val)`` triples.  The text of each
    result is the formatted reference (``format_ref``) concatenated with the
    decoded value after it has been run through *parsing_fun* and
    ``untexify``.
    """
    for no, ref, val in read_citations(filename):
        formatted_ref = format_ref(ref)
        decoded = decode_utf(val)
        marked_up = untexify(parsing_fun(decoded))
        yield (no, formatted_ref + marked_up)