示例#1
0
def process_file(filename, outdir):
    """Collect the citations of *filename* and write them out into *outdir*.

    Every ``(key, ref, val)`` triple produced by ``read_citations`` is stored
    in a ``CitationSet`` under ``key``; the stored text is ``format_ref(ref)``
    followed by ``untexify(decode_utf(val))``.

    Three files are then written, all named after the input file's base name
    with its extension removed: ``<name>_map.txt``, ``<name>.xml`` and
    ``<name>.txt``.
    """
    collected = CitationSet()
    for entry in read_citations(filename):
        key, ref, val = entry
        text = format_ref(ref) + untexify(decode_utf(val))
        collected.add(key, text)

    # Base name of the input without directory and extension.
    stem, _extension = os.path.splitext(os.path.basename(filename))

    def target(pattern):
        # Build the output path for one of the three file-name patterns.
        return os.path.join(outdir, pattern % stem)

    collected.write_mapfile(target('%s_map.txt'))
    collected.write_nlm(target('%s.xml'))
    collected.write_txt(target('%s.txt'))
def process_file(filename, outdir):
    """Read the citations in *filename* and dump them as three files in *outdir*.

    Each ``(key, ref, val)`` triple yielded by ``read_citations`` is added to
    a ``CitationSet`` as ``format_ref(ref) + untexify(decode_utf(val))``.
    The set is then written as ``<name>_map.txt``, ``<name>.xml`` and
    ``<name>.txt``, where ``<name>`` is the input file's base name without
    its extension.
    """
    citation_set = CitationSet()
    for key, ref, val in read_citations(filename):
        formatted = format_ref(ref) + untexify(decode_utf(val))
        citation_set.add(key, formatted)

    # Strip the directory part first, then the extension.
    base_name = os.path.basename(filename)
    test_id = os.path.splitext(base_name)[0]

    map_path = os.path.join(outdir, '%s_map.txt' % test_id)
    citation_set.write_mapfile(map_path)

    nlm_path = os.path.join(outdir, '%s.xml' % test_id)
    citation_set.write_nlm(nlm_path)

    txt_path = os.path.join(outdir, '%s.txt' % test_id)
    citation_set.write_txt(txt_path)
def name_parser(s):
    """Wrap the parts of a personal name in ``<surname>``/``<given-names>`` tags.

    The input is passed through ``decode_utf`` and ``untexify`` and then split
    into tokens.  Each token receives one of four tags:

    * ``given-names`` -- an initial (one ASCII letter followed by a dot), or
      a word that was not selected as the surname;
    * ``surname`` -- a run of two or more word characters / hyphens;
    * ``whitespace`` -- a run of whitespace characters;
    * ``other`` -- everything else.

    When the name contains no initials, only the last word is tagged as the
    surname and earlier words become given names.  When initials are present,
    every word starts out as a surname part; words separated only by
    whitespace are joined, and of the resulting surname groups only the last
    one keeps the tag while earlier groups are re-tagged as given names.

    Args:
        s: the raw name string.

    Returns:
        The name with surname and given-name parts wrapped in their tags;
        whitespace and ``other`` tokens are copied through unchanged.  An
        input that yields no tokens (e.g. the empty string) gives ``''``.
    """
    abbrv_name = r'(((?<=\s)|\A)[A-Za-z]\.)'
    long_string = r'((?:\w|[-])(?:\w|[-])+)'

    def name_tokens(tokens):
        """Pair every token with its tag, keeping the tokens in input order."""
        labelled = []
        had_abbrv = any(
            re.match(abbrv_name, token, flags=re.UNICODE) for token in tokens)
        had_last = False
        # Scan right-to-left so the last word is the first one encountered.
        for token in reversed(tokens):
            if re.match(abbrv_name, token, flags=re.UNICODE):
                labelled.append(('given-names', token))
            elif re.match(long_string, token, flags=re.UNICODE):
                # With initials present every word counts as a surname part;
                # without them only the last word does.
                if not had_last or had_abbrv:
                    had_last = True
                    labelled.append(('surname', token))
                else:
                    labelled.append(('given-names', token))
            elif re.match(r'\A\s*\Z', token, flags=re.UNICODE):
                labelled.append(('whitespace', token))
            else:
                labelled.append(('other', token))
        labelled.reverse()
        return labelled

    def merge(named_tokens):
        """Concatenate directly adjacent tokens that carry the same tag."""
        merged = []
        for tag, string in named_tokens:
            if merged and tag == merged[-1][0]:
                merged[-1] = (tag, merged[-1][1] + string)
            else:
                merged.append((tag, string))
        return merged

    def supermerge(merged):
        """Join same-tag neighbours that are separated by one whitespace token."""
        if not merged:
            return []
        supermerged = [merged[0]]
        i = 1
        while i < len(merged):
            # The bounds check matters: a trailing whitespace token has no
            # successor and must simply be copied through.
            bridges_same_tag = (
                merged[i][0] == 'whitespace'
                and i + 1 < len(merged)
                and merged[i + 1][0] == supermerged[-1][0])
            if bridges_same_tag:
                supermerged[-1] = (
                    merged[i + 1][0],
                    supermerged[-1][1] + merged[i][1] + merged[i + 1][1])
                i += 2
            else:
                supermerged.append(merged[i])
                i += 1
        return supermerged

    def unify_surname(supermerged):
        """Keep only the last surname entry; re-tag earlier ones as given names."""
        renamed = []
        had_surname = False
        for tag, string in reversed(supermerged):
            if tag == 'surname' and had_surname:
                renamed.append(('given-names', string))
            else:
                if tag == 'surname':
                    had_surname = True
                renamed.append((tag, string))
        renamed.reverse()
        return renamed

    def list2xml(renamed):
        """Render tagged tokens; whitespace and 'other' are emitted verbatim."""
        # NOTE(review): token text is inserted without XML escaping -- confirm
        # whether callers expect that.
        result = []
        for tag, string in renamed:
            if tag in ('whitespace', 'other'):
                result.append(string)
            else:
                result.append('<%s>%s</%s>' % (tag, string, tag))
        return result

    s = untexify(decode_utf(s))
    # Split on initials first, then split every piece on long words.  Both
    # patterns contain capture groups, so the separators stay in the output.
    pieces = flatten([re.split(long_string, part, flags=re.UNICODE)
                      for part in re.split(abbrv_name, s, flags=re.UNICODE)])
    # Materialise as a real list: the tokens are traversed more than once and
    # reversed, which a one-shot iterator (Python 3 ``filter``) cannot do.
    tokens = [piece for piece in pieces if piece]
    if not tokens:
        return ''

    named_tokens = name_tokens(tokens)
    merged = merge(named_tokens)
    supermerged = supermerge(merged)
    renamed = unify_surname(supermerged)
    # Re-tagging can create new same-tag neighbours, so join once more.
    renamed = supermerge(renamed)
    result = list2xml(renamed)

    return ''.join(result)
def get_mixed_citations(filename, parsing_fun):
    """Yield ``(no, text)`` for every citation read from *filename*.

    ``read_citations`` supplies ``(no, ref, val)`` triples.  For each one,
    ``val`` is passed through ``decode_utf``, then ``parsing_fun``, then
    ``untexify``, and the result is appended to ``format_ref(ref)``.
    """
    for record in read_citations(filename):
        number, ref, raw = record
        prefix = format_ref(ref)
        body = untexify(parsing_fun(decode_utf(raw)))
        yield (number, prefix + body)
def name_parser(s):
    """Wrap the parts of a personal name in ``<surname>``/``<given-names>`` tags.

    The input is passed through ``decode_utf`` and ``untexify`` and then split
    into tokens.  Each token receives one of four tags:

    * ``given-names`` -- an initial (one ASCII letter followed by a dot), or
      a word that was not selected as the surname;
    * ``surname`` -- a run of two or more word characters / hyphens;
    * ``whitespace`` -- a run of whitespace characters;
    * ``other`` -- everything else.

    When the name contains no initials, only the last word is tagged as the
    surname and earlier words become given names.  When initials are present,
    every word starts out as a surname part; words separated only by
    whitespace are joined, and of the resulting surname groups only the last
    one keeps the tag while earlier groups are re-tagged as given names.

    Args:
        s: the raw name string.

    Returns:
        The name with surname and given-name parts wrapped in their tags;
        whitespace and ``other`` tokens are copied through unchanged.  An
        input that yields no tokens (e.g. the empty string) gives ``''``.
    """
    abbrv_name = r'(((?<=\s)|\A)[A-Za-z]\.)'
    long_string = r'((?:\w|[-])(?:\w|[-])+)'

    def name_tokens(tokens):
        """Pair every token with its tag, keeping the tokens in input order."""
        labelled = []
        had_abbrv = any(
            re.match(abbrv_name, token, flags=re.UNICODE) for token in tokens)
        had_last = False
        # Scan right-to-left so the last word is the first one encountered.
        for token in reversed(tokens):
            if re.match(abbrv_name, token, flags=re.UNICODE):
                labelled.append(('given-names', token))
            elif re.match(long_string, token, flags=re.UNICODE):
                # With initials present every word counts as a surname part;
                # without them only the last word does.
                if not had_last or had_abbrv:
                    had_last = True
                    labelled.append(('surname', token))
                else:
                    labelled.append(('given-names', token))
            elif re.match(r'\A\s*\Z', token, flags=re.UNICODE):
                labelled.append(('whitespace', token))
            else:
                labelled.append(('other', token))
        labelled.reverse()
        return labelled

    def merge(named_tokens):
        """Concatenate directly adjacent tokens that carry the same tag."""
        merged = []
        for tag, string in named_tokens:
            if merged and tag == merged[-1][0]:
                merged[-1] = (tag, merged[-1][1] + string)
            else:
                merged.append((tag, string))
        return merged

    def supermerge(merged):
        """Join same-tag neighbours that are separated by one whitespace token."""
        if not merged:
            return []
        supermerged = [merged[0]]
        i = 1
        while i < len(merged):
            # The bounds check matters: a trailing whitespace token has no
            # successor and must simply be copied through.
            bridges_same_tag = (
                merged[i][0] == 'whitespace'
                and i + 1 < len(merged)
                and merged[i + 1][0] == supermerged[-1][0])
            if bridges_same_tag:
                supermerged[-1] = (
                    merged[i + 1][0],
                    supermerged[-1][1] + merged[i][1] + merged[i + 1][1])
                i += 2
            else:
                supermerged.append(merged[i])
                i += 1
        return supermerged

    def unify_surname(supermerged):
        """Keep only the last surname entry; re-tag earlier ones as given names."""
        renamed = []
        had_surname = False
        for tag, string in reversed(supermerged):
            if tag == 'surname' and had_surname:
                renamed.append(('given-names', string))
            else:
                if tag == 'surname':
                    had_surname = True
                renamed.append((tag, string))
        renamed.reverse()
        return renamed

    def list2xml(renamed):
        """Render tagged tokens; whitespace and 'other' are emitted verbatim."""
        # NOTE(review): token text is inserted without XML escaping -- confirm
        # whether callers expect that.
        result = []
        for tag, string in renamed:
            if tag in ('whitespace', 'other'):
                result.append(string)
            else:
                result.append('<%s>%s</%s>' % (tag, string, tag))
        return result

    s = untexify(decode_utf(s))
    # Split on initials first, then split every piece on long words.  Both
    # patterns contain capture groups, so the separators stay in the output.
    pieces = flatten([re.split(long_string, part, flags=re.UNICODE)
                      for part in re.split(abbrv_name, s, flags=re.UNICODE)])
    # Materialise as a real list: the tokens are traversed more than once and
    # reversed, which a one-shot iterator (Python 3 ``filter``) cannot do.
    tokens = [piece for piece in pieces if piece]
    if not tokens:
        return ''

    named_tokens = name_tokens(tokens)
    merged = merge(named_tokens)
    supermerged = supermerge(merged)
    renamed = unify_surname(supermerged)
    # Re-tagging can create new same-tag neighbours, so join once more.
    renamed = supermerge(renamed)
    result = list2xml(renamed)

    return ''.join(result)
def get_mixed_citations(filename, parsing_fun):
    """Generate ``(no, text)`` pairs from the citations stored in *filename*.

    Each ``(no, ref, val)`` triple from ``read_citations`` is turned into
    ``format_ref(ref)`` concatenated with
    ``untexify(parsing_fun(decode_utf(val)))``.
    """
    for no, ref, val in read_citations(filename):
        reference_part = format_ref(ref)
        parsed_value = parsing_fun(decode_utf(val))
        yield (no, reference_part + untexify(parsed_value))