Example #1
def refextract_text():
    """Run refextract on a piece of text."""
    if current_app.config.get("FEATURE_FLAG_ENABLE_REFEXTRACT_SERVICE"):
        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json"
        }
        data = {
            "journal_kb_data": create_journal_dict(),
            "text": request.json["text"]
        }
        response = requests.post(
            f"{current_app.config['REFEXTRACT_SERVICE_URL']}/extract_references_from_text",
            headers=headers,
            data=orjson.dumps(data),
        )
        if response.status_code != 200:
            return jsonify({"message": "Can not extract references"}, 500)
        extracted_references = response.json()["extracted_references"]
    else:
        extracted_references = extract_references_from_string(
            request.json["text"],
            override_kbs_files={"journals": create_journal_dict()},
            reference_format="{title},{volume},{page}",
        )
    deduplicated_extracted_references = dedupe_list(extracted_references)
    references = map_refextract_to_schema(deduplicated_extracted_references)
    match_result = match_references(references)
    return jsonify(match_result.get("matched_references"))
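For context, refextract_text() above is a Flask view that reads request.json["text"] and returns the matched references as JSON. A minimal client-side sketch of calling such an endpoint follows; the URL is hypothetical and depends on how the view is registered, and is not taken from the example.

import requests

# Hypothetical route; substitute the path under which refextract_text() is registered.
REFEXTRACT_TEXT_URL = "https://inspirehep.example/api/editor/refextract/text"

payload = {"text": "[1] S. Weinberg, Phys. Rev. Lett. 19 (1967) 1264."}
response = requests.post(REFEXTRACT_TEXT_URL, json=payload)
response.raise_for_status()
print(response.json())  # list of matched references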
Example #2
def extract_references_from_text(text, source=None, custom_kbs_file=None):
    """Extract references from text and return in INSPIRE format."""
    extracted_references = extract_references_from_string(
        text,
        override_kbs_files=get_refextract_kbs_path(),
        reference_format=u'{title},{volume},{page}',
    )

    return map_refextract_to_schema(extracted_references, source=source)
Example #3
def refextract_text():
    """Run refextract on a piece of text."""
    extracted_references = extract_references_from_string(
        request.json['text'],
        override_kbs_files=get_refextract_kbs_path(),
        reference_format=u'{title},{volume},{page}')
    references = map_refextract_to_schema(extracted_references)

    return jsonify(references)
Example #4
def extract_references_from_text(text, source=None, custom_kbs_file=None):
    """Extract references from text and return in INSPIRE format."""
    with local_refextract_kbs_path() as kbs_path:
        extracted_references = extract_references_from_string(
            text,
            override_kbs_files=kbs_path,
            reference_format=u'{title},{volume},{page}',
        )

    return map_refextract_to_schema(extracted_references, source=source)
Example #5
def refextract_text():
    """Run refextract on a piece of text."""
    extracted_references = extract_references_from_string(
        request.json["text"],
        override_kbs_files={"journals": create_journal_dict()},
        reference_format="{title},{volume},{page}",
    )
    references = map_refextract_to_schema(extracted_references)
    match_result = match_references(references)
    return jsonify(match_result.get("matched_references"))
Example #6
def refextract_text():
    """Run refextract on a piece of text."""
    with local_refextract_kbs_path() as kbs_path:
        extracted_references = extract_references_from_string(
            request.json['text'],
            override_kbs_files=kbs_path,
            reference_format=u'{title},{volume},{page}'
        )
    references = map_refextract_to_schema(extracted_references)

    return jsonify(references)
Example #7
def refextract_text():
    """Run refextract on a piece of text."""
    with local_refextract_kbs_path() as kbs_path:
        extracted_references = extract_references_from_string(
            request.json["text"],
            override_kbs_files=kbs_path,
            reference_format="{title},{volume},{page}",
        )
    references = map_refextract_to_schema(extracted_references)
    match_result = match_references(references)
    return jsonify(match_result.get("matched_references"))
Example #8
def get_references(rl):
    refs = []
    #convert individual references
    for ref in rl.find_all('ref'):
        (lt, refno) = ('', '')
        for label in ref.find_all('label'):
            lt = label.text.strip()
            lt = re.sub(r'\W', '', lt)
            if re.search(r'\[', lt):
                refno = '%s ' % (lt)
            else:
                refno = '[%s] ' % (lt)
        #journal and preprint
        for mc in ref.find_all(
                'element-citation',
                attrs={'publication-type': ['journal', 'preprint']}):
            (title, authors, pbn, doi, arxiv) = ('', [], '', '', '')
            #authors
            for nametag in mc.find_all('name'):
                name = ''
                for gn in nametag.find_all('given-names'):
                    name = gn.text.strip()
                for sn in nametag.find_all('surname'):
                    name += ' ' + sn.text.strip()
                authors.append(name)
            #title
            for at in mc.find_all('article-title'):
                #title = at.text.strip()
                title = cleanformulas(at)
            #pubnote
            for source in mc.find_all('source'):
                pbn = source.text.strip()
            for volume in mc.find_all('volume'):
                pbn += ' ' + volume.text.strip()
            for issue in mc.find_all('issue'):
                pbn += ', No. ' + issue.text.strip()
            for year in mc.find_all('year'):
                pbn += ' (%s) ' % (year.text.strip())
            for fpage in mc.find_all('fpage'):
                pbn += ' ' + fpage.text.strip()
            for lpage in mc.find_all('lpage'):
                pbn += '-' + lpage.text.strip()
            for fpage in mc.find_all('elocation-id'):
                pbn += ' ' + fpage.text.strip()
            #refextract on pbn to normalize it
            repbn = extract_references_from_string(
                pbn,
                override_kbs_files={
                    'journals':
                    '/opt/invenio/etc/docextract/journal-titles-inspire.kb'
                },
                reference_format="{title},{volume},{page}")
            if repbn:
                if 'journal_reference' in repbn[0]:
                    #print ' [refextract] normalize "%s" to "%s"' % (pbn, repbn[0]['journal_reference'])
                    pbn = repbn[0]['journal_reference']
            else:
                for comment in mc.find_all('comment'):
                    pbn = comment.text.strip()
            #DOI
            for pi in mc.find_all('pub-id', attrs={'pub-id-type': 'doi'}):
                doi = pi.text.strip()
            #arXiv
            for el in mc.find_all('ext-link', attrs={'ext-link-type':
                                                     'arxiv'}):
                arxiv = el.text.strip()
                if re.search(r'^\d\d\d\d\.\d\d\d\d', arxiv):
                    arxiv = 'arXiv:' + arxiv
            #all together
            if doi:
                reference = [('x', refno + '%s: %s, %s, DOI: %s' %
                              (', '.join(authors), title, pbn, doi))]
                if arxiv:
                    reference.append(('r', arxiv))
                reference.append(('a', 'doi:' + doi))
                if lt:
                    reference.append(('o', re.sub(r'\D', '', lt)))
            else:
                reference = [
                    ('x',
                     refno + '%s: %s, %s' % (', '.join(authors), title, pbn))
                ]
                if arxiv:
                    reference.append(('r', arxiv))
            refs.append(reference)
        #book
        for mc in ref.find_all(
                'element-citation',
                attrs={'publication-type': ['confproc', 'book']}):
            (atitle, btitle, editors, authors, pbn, bpbn,
             doi) = ('', '', [], [], '', '', '')
            #authors/editors
            for pg in mc.find_all('person-group'):
                for nametag in mc.find_all('name'):
                    name = ''
                    for gn in nametag.find_all('given-names'):
                        name = gn.text.strip()
                    for sn in nametag.find_all('surname'):
                        name += ' ' + sn.text.strip()
                    if pg['person-group-type'] == 'author':
                        authors.append(name)
                    elif pg['person-group-type'] == 'editor':
                        editors.append(name)
            #title
            for at in mc.find_all('article-title'):
                atitle = cleanformulas(at)
                #atitle = at.text.strip()
            #book title
            for source in mc.find_all('source'):
                btitle = cleanformulas(source)
                #btitle = source.text.strip()
            for source in mc.find_all('conf-name'):
                btitle += ' ' + cleanformulas(source)
            #book pubnote
            for publishername in mc.find_all('publisher-name'):
                bpbn = publishername.text.strip() + ', '
            for publisherloc in mc.find_all('publisher-loc'):
                bpbn += publisherloc.text.strip() + ', '
            for year in mc.find_all('year'):
                bpbn += year.text.strip()
            #pubnote
            for fpage in mc.find_all('fpage'):
                pbn += ' ' + fpage.text.strip()
            for lpage in mc.find_all('lpage'):
                pbn += '-' + lpage.text.strip()
            #all together
            if atitle:
                refs.append([('x', refno + '%s: %s, pages %s in: %s: %s, %s' %
                              (', '.join(authors), atitle, pbn,
                               ', '.join(editors), btitle, bpbn))])
            else:
                refs.append([
                    ('x',
                     refno + '%s: %s, %s' % (', '.join(authors), btitle, bpbn))
                ])
        #other
        for mc in ref.find_all('mixed-citation',
                               attrs={'publication-type': 'other'}):
            (doi, recid, arxiv) = ('', '', '')
            #INSPIRE links
            inspirelink = ''
            for el in mc.find_all('ext-link', attrs={'ext-link-type': 'uri'}):
                if el.has_attr('xlink:href'):
                    link = el['xlink:href']
                    if re.search('inspirehep.net.*IRN', link):
                        irn = re.sub(r'.*\D', '', link)
                        #inspire2 for recid in search_pattern(p='970__a:SPIRES-' + irn):
                        #inspire2    inspirelink += ', https://old.inspirehep.net/record/%i' % (recid)
                        #inspire2 el.decompose()
                    elif re.search('inspirehep.net.*recid', link):
                        recid = re.sub(r'.*\D', '', link)
                        inspirelink += ', https://old.inspirehep.net/record/%s' % (
                            recid)
                        el.decompose()
                    elif re.search('inspirehep.net', link):
                        el.decompose()
                    elif re.search('arxiv.org', link):
                        arxiv = re.sub(' ', '', el.text.strip())
                        arxiv = re.sub(r'^\[', '', arxiv)
                        arxiv = re.sub(r'(\d)\]$', r'\1', arxiv)
                        if re.search(r'^\d{4}\.\d', arxiv):
                            arxiv = 'arXiv:' + arxiv
                        elif re.search(r'ar[xX]iv:[a-z\-]+/\d', arxiv):
                            arxiv = arxiv[6:]
                        el.decompose()
            #missing spaces?
            for bold in mc.find_all('bold'):
                bt = bold.text.strip()
                bold.replace_with(' %s ' % (bt))
            #DOI
            for pi in mc.find_all('pub-id', attrs={'pub-id-type': 'doi'}):
                doi = pi.text.strip()
                pi.replace_with(', DOI: %s' % (doi))
            #all together
            reference = [('x', refno + cleanformulas(mc))]
            #reference = [('x', refno + mc.text.strip())]
            if doi:
                reference.append(('a', 'doi:' + doi))
            if recid:
                reference.append(('0', str(recid)))
            if arxiv:
                reference.append(('r', arxiv))
            if doi or recid or arxiv:
                if lt:
                    reference.append(('o', re.sub(r'\D', '', lt)))
            refs.append(reference)
    return refs
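For orientation, each entry appended to refs in get_references() above is a list of (MARC subfield code, value) tuples. A purely hypothetical sketch of one such entry follows; the actual values come from the parsed XML.

# Hypothetical shape of a single element of the list returned by get_references().
example_reference = [
    ('x', '[1] A. Author: Some title, Phys.Rev.D 100 (2019) 012345, DOI: 10.1000/example'),
    ('r', 'arXiv:1901.01234'),     # arXiv identifier
    ('a', 'doi:10.1000/example'),  # DOI
    ('o', '1'),                    # reference label
]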
Example #9
def refextract_url():
    """Run refextract on a URL."""
    extracted_references = extract_references_from_url(request.json['url'])
    references = map_refextract_to_schema(extracted_references)

    return jsonify(references)
Example #10
def refextract_text():
    """Run refextract on a piece of text."""
    extracted_references = extract_references_from_string(request.json['text'])
    references = map_refextract_to_schema(extracted_references)

    return jsonify(references)
Example #11
def get_references(url, clean='jacow'):
    from functools import cmp_to_key
    from refextract import extract_references_from_string
    filename = url.split('/')[-1]
    if os.path.isfile('%s/%s_clean.txt' % (tmppath, filename[:-4])):
        controlfile = codecs.EncodedFile(
            codecs.open('%s/%s_clean.txt' % (tmppath, filename[:-4])), 'utf8')
        fulltext = controlfile.read()
        fulltext = fulltext.decode("utf-8")
        controlfile.close()
    else:
        if not os.path.isfile('%s/%s.txt' % (tmppath, filename[:-4])):
            if not os.path.isfile('%s/%s' % (tmppath, filename)):
                os.system('wget -q -O %s%s %s' % (tmppath, filename, url))
            os.system('/usr/bin/pdftotext %s%s' % (tmppath, filename))

        infile = codecs.EncodedFile(
            codecs.open('%s/%s.txt' % (tmppath, filename[:-4])), 'utf8')
        fulltext = infile.readlines()
        fulltext = [line.decode("utf-8") for line in fulltext]
        if clean == 'jacow':
            fulltext = clean_fulltext_jacow(fulltext, verbose=1)
        elif clean == 'moriond':
            fulltext = clean_fulltext_moriond(fulltext)
        elif clean == 'linebreaks':
            fulltext = '\n'.join(fulltext) + '\n'
            fulltext = clean_linebreaks(fulltext)
        else:
            fulltext = '\n'.join(fulltext) + '\n'
        fulltext = get_reference_section(fulltext)
        infile.close()

        if '[2]' in fulltext:
            lines = fulltext.split('\n')
            lines.sort(key=cmp_to_key(by_number))
            fulltext = '\n'.join(lines)
            last_number = 0
            errors = ''
            for line in lines:
                number = re_number.search(line)
                if number:
                    this_number = int(number.group(1))
                    if not this_number - last_number == 1:
                        errors += '%s: [%s] followed by [%s]\n' % (
                            filename[:-4], last_number, this_number)
                    last_number = this_number
                elif last_number:
                    errors += '%s: No number for %s\n' % (filename[:-4],
                                                          line[:30])
            if errors:
                with open('%s/%s.log' % (publisherdatapath, filename[:-4]),
                          mode='w') as reflog_file:
                    reflog_file.write(errors)

        controlfile = codecs.EncodedFile(
            codecs.open('%s/%s_clean.txt' % (tmppath, filename[:-4]),
                        mode='wb'), 'utf8')
        controlfile.write(fulltext.encode("utf-8"))
        controlfile.close()

    refs = extract_references_from_string(
        fulltext,
        is_only_references=False,
        override_kbs_files={
            'journals': '/opt/invenio/etc/docextract/journal-titles-inspire.kb'
        },
        reference_format="{title},{volume},{page}")
    references = []

    #mappings for references in JSON to MARC
    mappings = {
        'doi': 'a',
        'collaborations': 'c',
        'document_type': 'd',
        'author': 'h',
        'isbn': 'i',
        'texkey': 'k',
        'misc': 'm',
        'journal_issue': 'n',
        'label': 'o',
        'linemarker': 'o',
        'reportnumber': 'r',
        'journal_reference': 's',
        'title': 't',
        'urls': 'u',
        'url': 'u',
        'raw_ref': 'x',
        # 'journal_title': None,
        # 'journal_volume': None,
        # 'journal_page': None,
        # 'journal_year': None,
        # 'publisher': None,
        'year': 'y'
    }

    for ref in refs:
        entryaslist = [('9', 'refextract')]
        for key in ref.keys():
            if key in mappings:
                for entry in ref[key]:
                    entryaslist.append((mappings[key], entry))
            # else:
            #     print('no mapping for', key)
        references.append(entryaslist)
    return references
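The mappings dict above converts refextract's JSON keys into MARC subfield codes. A minimal sketch of that conversion applied to a hypothetical refextract result follows; the sample values are made up for illustration and the mapping is trimmed to a few keys.

# Hypothetical refextract output for one reference; real output depends on
# the input text and the journal knowledge base.
sample_ref = {
    'author': ['A. Author'],
    'journal_reference': ['Phys.Rev.D,100,012345'],
    'year': ['2019'],
    'raw_ref': ['[1] A. Author, Phys.Rev.D 100 (2019) 012345'],
}
mappings = {'author': 'h', 'journal_reference': 's', 'year': 'y', 'raw_ref': 'x'}

entry = [('9', 'refextract')]
for key, values in sample_ref.items():
    if key in mappings:
        for value in values:
            entry.append((mappings[key], value))
# entry now holds tuples such as ('h', 'A. Author') and ('s', 'Phys.Rev.D,100,012345').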