Пример #1
0
def refextract_url():
    """Run refextract on a URL.

    Expects a JSON request body with a ``url`` key.  When the
    ``FEATURE_FLAG_ENABLE_REFEXTRACT_SERVICE`` flag is set, extraction is
    delegated to the remote refextract service; otherwise it runs locally
    via ``extract_references_from_url``.  The extracted references are
    deduplicated, mapped to the schema, matched, and the matched
    references are returned as JSON.
    """
    if current_app.config.get("FEATURE_FLAG_ENABLE_REFEXTRACT_SERVICE"):
        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
        }
        data = {
            "journal_kb_data": create_journal_dict(),
            "url": request.json["url"],
        }
        # NOTE(review): no timeout is set, so this call can hang
        # indefinitely if the service stalls — consider timeout=.
        response = requests.post(
            f"{current_app.config['REFEXTRACT_SERVICE_URL']}/extract_references_from_url",
            headers=headers,
            data=orjson.dumps(data),
        )
        if response.status_code != 200:
            # BUG FIX: the status code must be returned alongside the
            # response, not passed as a second positional argument to
            # jsonify() — that would serialize the 500 into the JSON
            # payload and still answer with HTTP 200.
            return jsonify({"message": "Can not extract references"}), 500
        extracted_references = response.json()["extracted_references"]
    else:
        extracted_references = extract_references_from_url(
            request.json["url"],
            override_kbs_files={"journals": create_journal_dict()},
            reference_format="{title},{volume},{page}",
        )
    # Extraction can yield duplicate entries; drop them before mapping.
    deduplicated_extracted_references = dedupe_list(extracted_references)
    references = map_refextract_to_schema(deduplicated_extracted_references)
    match_result = match_references(references)
    return jsonify(match_result.get("matched_references"))
Пример #2
0
def refextract_url():
    """Extract references from the URL given in the JSON request body.

    Runs refextract locally against the configured knowledge bases and
    returns the schema-mapped references as JSON.
    """
    target_url = request.json['url']
    raw_references = extract_references_from_url(
        target_url,
        override_kbs_files=get_refextract_kbs_path(),
        reference_format=u'{title},{volume},{page}',
    )
    return jsonify(map_refextract_to_schema(raw_references))
Пример #3
0
def refextract_url():
    """Run refextract on the URL supplied in the JSON request body.

    Extracts references using the journal knowledge base, maps them to
    the schema, matches them, and returns the matched references.
    """
    kbs_override = {"journals": create_journal_dict()}
    raw_references = extract_references_from_url(
        request.json["url"],
        override_kbs_files=kbs_override,
        reference_format="{title},{volume},{page}",
    )
    mapped_references = map_refextract_to_schema(raw_references)
    matched = match_references(mapped_references)
    return jsonify(matched.get("matched_references"))
Пример #4
0
def refextract_url():
    """Extract references from the URL in the request payload.

    Uses a temporary local copy of the knowledge-base files for the
    duration of the extraction, then returns the schema-mapped
    references as JSON.
    """
    with local_refextract_kbs_path() as kbs_path:
        raw_references = extract_references_from_url(
            request.json['url'],
            override_kbs_files=kbs_path,
            reference_format=u'{title},{volume},{page}',
        )
    return jsonify(map_refextract_to_schema(raw_references))
Пример #5
0
def refextract_url():
    """Run refextract on the URL from the JSON request body.

    Extracts references using a local copy of the knowledge-base files,
    maps them to the schema, matches them, and returns only the matched
    references.
    """
    with local_refextract_kbs_path() as kbs_path:
        raw_references = extract_references_from_url(
            request.json["url"],
            override_kbs_files=kbs_path,
            reference_format="{title},{volume},{page}",
        )
    mapped = map_refextract_to_schema(raw_references)
    match_result = match_references(mapped)
    return jsonify(match_result.get("matched_references"))
Пример #6
0
def explore_references(url, author, title):
    """Recursively explore the reference tree rooted at the PDF at *url*.

    Extracts references from the PDF, discards entries that have no
    ``'author'`` field, adds a graph node for each remaining reference's
    author and an edge from the current document's node, then — if the
    user agrees — locates a PDF for each reference and recurses into it.

    Args:
        url: PDF URL to extract references from.
        author: label of the graph node that new edges originate from.
        title: title of the current document (received from the
            recursive call but not used directly here — TODO confirm
            it is intentionally unused).
    """
    raw_references = extract_references_from_url(url)
    # Keep only references that carry an 'author' field.  This replaces
    # the original delete-while-iterating-over-reversed() dance, which
    # relied on deletions happening in strictly decreasing index order
    # and also mislabeled the condition ("no offer" -> "no author").
    references = [ref for ref in raw_references if 'author' in ref]

    for reference in references:
        true_author = parse(reference['author'][0])
        next_title = get_title(reference)
        add_node(true_author, next_title, False)
        # Connect the current document's node to this reference's node.
        add_edge(
            nodes.index(
                next(node for node in nodes if node["Label"] == author)),
            nodes.index(
                next(node for node in nodes if node["Label"] == true_author)),
        )

    # Ask whether to continue descending the reference tree.
    if not get_yes_no("Explore all valid references?"):
        return  # user chose to stop at this node

    for reference in references:
        true_author = parse(reference['author'][0])
        next_title = get_title(reference)
        # Build search queries for the reference and try to resolve a
        # PDF URL from them (google search and web scrape).
        queries = get_queries(reference)
        new_url = find_pdf(queries)
        if new_url != "FAILURE":
            explore_references(new_url, true_author, next_title)
        else:
            print("NO PDF FOUND FOR THIS DOCUMENT, MOVING ON")
Пример #7
0
def main():
    """CLI entry point: extract references from a PDF and save as JSON.

    usage: extractrefs <pdf_path> [dst_path]

    *pdf_path* may be a local file or an http(s) URL; *dst_path*
    defaults to the input path with its ``.pdf`` suffix replaced by
    ``.json``.
    """
    if len(sys.argv) < 2:
        print('usage: extractrefs <pdf_path> [dst_path]')
        return

    pdf_path = sys.argv[1]
    # BUG FIX: `assert` is stripped under `python -O`, so it must not be
    # used for input validation; check explicitly instead.
    if not pdf_path.endswith('.pdf'):
        print('usage: extractrefs <pdf_path> [dst_path]')
        return
    # BUG FIX: str.replace('.pdf', '.json') would also rewrite a '.pdf'
    # occurring mid-path (e.g. 'a.pdf.backup.pdf'); only swap the final
    # extension, which the check above guarantees is present.
    default_dst = pdf_path[:-len('.pdf')] + '.json'
    dst_path = sys.argv[2] if len(sys.argv) > 2 else default_dst

    if pdf_path.startswith(('http://', 'https://')):
        refs = refextract.extract_references_from_url(pdf_path)
    else:
        refs = refextract.extract_references_from_file(pdf_path)

    with open(dst_path, 'w') as f:
        json.dump(refs, f, indent=4)
    print('saved refs to %s' % dst_path)
Пример #8
0
from refextract import extract_references_from_url

# Demo: fetch the reference list of an arXiv PDF and show the first entry.
pdf_url = 'https://arxiv.org/pdf/1503.07589.pdf'
references = extract_references_from_url(pdf_url)
print(references[0])
Пример #9
0
def refextract_url():
    """Extract references from the URL in the JSON request body.

    Runs refextract with its default knowledge bases and returns the
    schema-mapped references as JSON.
    """
    target_url = request.json['url']
    extracted = extract_references_from_url(target_url)
    return jsonify(map_refextract_to_schema(extracted))