Python extract_references_from_file示例，refextract.extract_references_from_file Python示例

示例#1

0

显示文件

文件： backwards_search.py 项目： tstehr/refextract

def get_references_for_entry(entry):
    """Extract the references of the file attached to a bibliography entry.

    The path is the second ``:``-separated component of
    ``entry.fields["file"]`` prefixed with ``/``. If the full text is not
    available there, extraction is retried once on the path returned by
    ``closest_filename``.

    Returns:
        ``(entry, references)`` on success; ``None`` when the entry has no
        ``"file"`` field or when both attempts fail.
    """
    if "file" not in entry.fields:
        return None

    path_component = entry.fields["file"].split(':')[1]
    filename = '/' + path_component
    try:
        found = extract_references_from_file(filename)
    except FullTextNotAvailableError:
        # First path failed: retry once with the closest matching file name.
        filename = closest_filename(filename)
        try:
            found = extract_references_from_file(filename)
        except FullTextNotAvailableError:
            print("file not found " + filename, file=sys.stderr)
            return None
    return entry, found

示例#2

0

显示文件

文件： test_extract_references.py 项目： julio-navarro-lara/biblio

def extract_citations_from_pdf(name_of_file):
    """Return the raw reference strings of a document as a single string.

    Every field of every extracted reference is also printed to stdout as a
    debugging aid, and the final string is printed before it is returned.

    Args:
        name_of_file: path handed to ``extract_references_from_file``.

    Returns:
        The result of ``array_to_semicolon_separated`` when more than one
        citation was found, the single raw citation when exactly one was
        found, or the literal ``"No references to extract"`` otherwise.
    """
    references = extract_references_from_file(name_of_file)

    # print(x) with exactly one argument behaves the same on Python 2 and 3;
    # the print statement used previously is a syntax error on Python 3.
    print("**************")

    array_of_citations = []

    for element in references:
        for key in element:
            print(key)
            print(element[key])
            print("---")

        # Only the first raw_ref entry of each reference is kept.
        array_of_citations.append(element['raw_ref'][0])
        print("**************")

    if len(array_of_citations) > 1:
        string_of_citations = array_to_semicolon_separated(array_of_citations)
    elif len(array_of_citations) == 1:
        string_of_citations = array_of_citations[0]
    else:
        string_of_citations = "No references to extract"

    print(string_of_citations)
    return string_of_citations

示例#3

0

显示文件

def extract_references(filepath):
    """Extract references from PDF and return in INSPIRE format.

    Args:
        filepath: path handed to ``extract_references_from_file``.

    Returns:
        A list with one dict per extracted reference. Each dict holds only
        the mapped fields whose value is truthy; list values are joined into
        a single comma-separated string.
    """
    references = extract_references_from_file(
        filepath,
        reference_format="{title},{volume},{page}",
        # override_kbs_files={
        #    'journals': get_mappings_from_kbname(['REFEXTRACT_KB_NAME'])
        # }
    )

    # (output key, refextract key), in the order the output dict is filled.
    field_map = (
        ("journal_pubnote", "journal_reference"),
        ("year", "year"),
        ("collaboration", "collaboration"),
        ("title", "title"),
        ("misc", "misc"),
        ("number", "linemarker"),
        ("authors", "author"),
        ("isbn", "isbn"),
        ("doi", "doi"),
        ("report_number", "reportnumber"),
        ("publisher", "publisher"),
        ("recid", "recid"),
    )

    mapped_references = []
    for ref in references.get('references') or []:
        # Build the dict directly instead of deleting empty keys while
        # iterating over it: on Python 3 that raises
        # "RuntimeError: dictionary changed size during iteration".
        reference = {}
        for target_key, source_key in field_map:
            value = ref.get(source_key)
            if not value:
                continue
            if isinstance(value, list):
                value = ",".join(value)
            reference[target_key] = value
        mapped_references.append(reference)
    return mapped_references

示例#4

0

显示文件

文件： tasks.py 项目： liamkirsh/inspire-next

def extract_references(filepath):
    """Extract references from PDF and return in INSPIRE format.

    Args:
        filepath: path handed to ``extract_references_from_file``.

    Returns:
        A list with one dict per extracted reference. Each dict holds only
        the mapped fields whose value is truthy; list values are joined into
        a single comma-separated string.
    """
    references = extract_references_from_file(
        filepath,
        reference_format="{title},{volume},{page}",
        # override_kbs_files={
        #    'journals': get_mappings_from_kbname(['REFEXTRACT_KB_NAME'])
        # }
    )

    # (output key, refextract key), in the order the output dict is filled.
    field_map = (
        ("journal_pubnote", "journal_reference"),
        ("year", "year"),
        ("collaboration", "collaboration"),
        ("title", "title"),
        ("misc", "misc"),
        ("number", "linemarker"),
        ("authors", "author"),
        ("isbn", "isbn"),
        ("doi", "doi"),
        ("report_number", "reportnumber"),
        ("publisher", "publisher"),
        ("recid", "recid"),
    )

    mapped_references = []
    for ref in references.get('references') or []:
        # Build the dict directly instead of deleting empty keys while
        # iterating over it: on Python 3 that raises
        # "RuntimeError: dictionary changed size during iteration".
        reference = {}
        for target_key, source_key in field_map:
            value = ref.get(source_key)
            if not value:
                continue
            if isinstance(value, list):
                value = ",".join(value)
            reference[target_key] = value
        mapped_references.append(reference)
    return mapped_references

示例#5

0

显示文件

文件： pdfCollector.py 项目： lamvin/preprints-refs-collector

def parse_pdfs(platform,parser="refextract"):
    """Extract references from the PDFs of one platform.

    With ``parser="cermine"`` the CERMINE jar is run through ``execute`` on
    ``../data/pdf/<platform>`` and its output is echoed. With
    ``parser="refextract"`` every ``.pdf`` in ``data/pdf/<platform>`` whose ID
    has no matching file in ``data/json/<platform>`` is parsed and the result
    is dumped to ``data/json/<platform>/<ID>.json``. Any other parser value
    does nothing.

    NOTE(review): the two branches use different base directories (``..``
    vs. the current directory), and the ``*_pdfs_parsed.txt`` file is opened
    for append but never written to -- confirm both are intended.
    """
    pdf_path = os.path.join("..","data","pdf",platform)
    if parser == "cermine":
        jar_path = 'cermine-impl-1.13-jar-with-dependencies.jar'
        command = ['java', '-cp', jar_path, 'pl.edu.icm.cermine.ContentExtractor',
                   '-path', pdf_path,
                   '-outputs','jats'
                   ]
        for path in execute(command):
            print(path, end="")
        return
    if parser != 'refextract':
        return

    dir_entries = np.array(os.listdir(os.path.join("data","pdf",platform)))
    pdf_names = dir_entries[np.char.endswith(dir_entries,'.pdf')]
    json_names = np.array(os.listdir(os.path.join("data","json",platform)))
    # JSON IDs drop the last dot-separated component, PDF IDs the last two.
    done_ids = np.array(['.'.join(x.split('.')[:-1]) for x in json_names])
    pdf_ids = np.array(['.'.join(x.split('.')[:-2]) for x in pdf_names])
    pending = ~np.isin(pdf_ids,done_ids)
    todo_names = pdf_names[pending]
    todo_ids = pdf_ids[pending]
    total = len(todo_names)
    with open(os.path.join("data","meta",platform+"_pdfs_parsed.txt"),'a'):
        for position, (pdf_name, pdf_id) in enumerate(zip(todo_names, todo_ids), start=1):
            print("Extracting refs {}: {}/{}.".format(platform,position,total))
            references = extract_references_from_file(os.path.join("data","pdf",platform,pdf_name))
            with open(os.path.join("data","json",platform,pdf_id+".json"),'w') as json_out:
                json.dump(references,json_out)

示例#6

0

显示文件

文件： ref_extractor.py 项目： bvignau/SLR_Helper

def Extract_Ref_From_PDF(path):
    """Return one ``Reference`` object per reference extracted from ``path``.

    Each extracted item is passed to ``Reference.create_Ref`` on a fresh
    ``Reference`` instance; the instances are returned in extraction order.
    """
    extracted = extract_references_from_file(path)
    collected = []
    for raw_reference in extracted:
        item = Reference()
        item.create_Ref(raw_reference)
        collected.append(item)
    return collected

示例#7

0

显示文件

文件： refextract.py 项目： rikirenz/inspire-next

def extract_references(filepath, source=None, custom_kbs_file=None):
    """Extract references from PDF and return in INSPIRE format.

    Args:
        filepath: path handed to ``extract_references_from_file``.
        source: forwarded to ``map_refextract_to_schema``.
        custom_kbs_file: forwarded to refextract as ``override_kbs_files``.

    Returns:
        Whatever ``map_refextract_to_schema`` returns for the extracted data.
    """
    refextract_options = {
        'override_kbs_files': custom_kbs_file,
        'reference_format': u'{title},{volume},{page}',
    }
    raw_references = extract_references_from_file(filepath, **refextract_options)
    return map_refextract_to_schema(raw_references, source=source)

示例#8

0

显示文件

def countCitations(directory, min_distance_ratio):
    """Print which references are shared between the PDFs of a directory.

    For every ``.pdf`` file directly inside ``directory`` the references are
    loaded from a sibling ``.json`` cache file when one exists, otherwise
    extracted with refextract and written to that cache (best effort).
    References that have both an ``author`` and a ``year`` are reduced to
    ``(raw text without "[n]" markers, first year)`` pairs and handed to
    ``createDistanceMatrix``. A reference is reported when at least one entry
    of its row in that matrix exceeds ``min_distance_ratio``.

    Args:
        directory: directory scanned (non-recursively) for PDF files.
        min_distance_ratio: threshold applied to the distance matrix.

    Returns:
        None; the summary is printed to stdout.
    """
    # A real list rather than a lazy filter(): len(file_paths) is needed for
    # the summary below and raises TypeError on a Python 3 filter object.
    candidates = (join(directory, f) for f in listdir(directory))
    file_paths = [p for p in candidates if isfile(p) and p.endswith(".pdf")]

    references = []
    for filepath in file_paths:
        print("Extracting references from %s" % filepath)
        json_file_path = splitext(filepath)[0] + ".json"

        if isfile(json_file_path):
            # load json if available
            with open(json_file_path, 'r') as f:
                extracted_refs = json.load(f)["references"]
        else:
            extracted_refs = extract_references_from_file(filepath)
            dict_object = dict(references=extracted_refs)
            try:
                # The context manager closes the handle even if dumping
                # fails; it was previously left open.
                with open(json_file_path, 'w') as file_object:
                    # Save references data into the JSON file
                    json.dump(dict_object, file_object)
            except Exception:
                # Caching is best effort: a failed save must not abort the run.
                print("Failed to save json file")

        for ref in extracted_refs:
            # remove entries without year and author
            if 'author' not in ref or 'year' not in ref:
                continue
            # remove bracketed citation numbers (e.g. [1])
            text = re.sub(r"\[\d+\]", "", ref['raw_ref'][0]).strip()
            references.append((text, ref['year'][0]))

    print("Calculating distances...")
    distances = createDistanceMatrix(references)

    # Zero out every entry at or below the threshold, then count survivors
    # per row.
    mask = np.where((distances > min_distance_ratio), distances, 0)
    counts = (mask > 0).sum(axis=1)

    i_indices = np.where(counts > 0)[0]

    result = []
    for i in i_indices:
        count = counts[i] + 1
        title = references[i][0]
        result.append((count, title))

    result.sort(key=lambda x: x[0], reverse=True)

    print("\n==========================")
    print("Number of papers: %i" % len(file_paths))
    print("Total references found: %i\n" % len(references))
    for count, title in result:
        print("Cited by %i papers: %s" % (count, title))

示例#9

0

显示文件

文件： refextract.py 项目： harunurhan/inspire-next

def extract_references_from_pdf(filepath, source=None, custom_kbs_file=None):
    """Extract references from PDF and return in INSPIRE format.

    The knowledge-base path comes from ``local_refextract_kbs_path()``, which
    is only held open for the duration of the extraction. ``custom_kbs_file``
    is accepted but not used by this function.

    Returns:
        Whatever ``map_refextract_to_schema`` returns for the extracted data.
    """
    with local_refextract_kbs_path() as kbs_path:
        refextract_options = {
            'override_kbs_files': kbs_path,
            'reference_format': u'{title},{volume},{page}',
        }
        raw_references = extract_references_from_file(
            filepath, **refextract_options)

    # Schema mapping happens after the knowledge-base context has exited.
    return map_refextract_to_schema(raw_references, source=source)

示例#10

0

显示文件

def extract_references_from_pdf(filepath, source=None, custom_kbs_file=None):
    """Extract references from PDF and return in INSPIRE format.

    The knowledge-base path comes from ``local_refextract_kbs_path()``, which
    is only held open for the duration of the extraction. ``custom_kbs_file``
    is accepted but not used by this function.

    Returns:
        Whatever ``map_refextract_to_schema`` returns for the extracted data.
    """
    with local_refextract_kbs_path() as kbs_path:
        refextract_options = {
            'override_kbs_files': kbs_path,
            'reference_format': u'{title},{volume},{page}',
        }
        raw_references = extract_references_from_file(
            filepath, **refextract_options)

    # Schema mapping happens after the knowledge-base context has exited.
    return map_refextract_to_schema(raw_references, source=source)

示例#11

0

显示文件

def find_reference_list(path):
    """Return the de-duplicated reference list refextract finds in a file.

    Keyword arguments:
    path -- path of the file to examine; passed directly to
            extract_references_from_file

    If extraction fails, a message is printed and an empty list is passed to
    remove_duplicate_refs instead.
    """
    references = []
    try:
        references = extract_references_from_file(path)
    except Exception:
        # Not a bare ``except:`` -- Ctrl-C (KeyboardInterrupt) and SystemExit
        # must still propagate instead of being reported as a bad PDF.
        print('Could not read PDF file')
    return remove_duplicate_refs(references)

示例#12

0

显示文件

文件： refextract.py 项目： fschwenn/inspire-next

def extract_references(filepath):
    """Extract references from PDF and return in INSPIRE format.

    Returns:
        A list with one dict per entry of the ``'references'`` key of the
        refextract result (empty when that key is missing or empty). Each
        dict carries the JSON dump of the raw reference under ``raw_refs``
        and the mapped fields under ``reference``.
    """
    extracted = extract_references_from_file(
        filepath,
        reference_format=u"{title},{volume},{page}",
    )

    def _to_inspire(ref):
        # Map a single refextract reference onto the output layout.
        raw_entry = {
            'position': '',
            'schema': '',
            'source': '',
            'value': json.dumps(ref),
        }
        publication_info = {
            'year': ref.get('year'),
            'journal_title': ref.get('journal_title'),
            'journal_volume': ref.get('journal_volume'),
            'page_start': ref.get('journal_page'),
            'artid': ref.get('journal_reference'),
        }
        mapped_fields = {
            'authors': ref.get('author'),
            'book_series': ref.get('book_series'),
            'collaboration': ref.get('collaboration'),
            'dois': ref.get('dois'),
            'imprint': ref.get('imprint'),
            'misc': ref.get('misc'),
            'number': ref.get('linemarker'),
            'persistent_identifiers': ref.get('persistent_identifiers'),
            'publication_info': publication_info,
            'texkey': ref.get('texkey'),
            'titles': ref.get('titles'),
            'urls': ref.get('urls'),
        }
        return {
            'curated_relation': False,
            'raw_refs': [raw_entry],
            'record': {'$ref': ''},
            'reference': mapped_fields,
        }

    return [_to_inspire(ref) for ref in extracted.get('references') or []]

示例#13

0

显示文件

文件： tasks.py 项目： michamos/inspire-next

def extract_references(filepath):
    """Extract references from PDF and return in INSPIRE format.

    Returns:
        A list with one dict per entry of the ``'references'`` key of the
        refextract result (empty when that key is missing or empty). Each
        dict carries the JSON dump of the raw reference under ``raw_refs``
        and the mapped fields under ``reference``.
    """
    extracted = extract_references_from_file(
        filepath,
        reference_format="{title},{volume},{page}",
    )

    mapped_references = []
    for ref in extracted.get('references') or []:
        raw_entry = {
            'position': '',
            'schema': '',
            'source': '',
            'value': json.dumps(ref),
        }
        publication_info = {
            'year': ref.get('year'),
            'journal_title': ref.get('journal_title'),
            'journal_volume': ref.get('journal_volume'),
            'page_start': ref.get('journal_page'),
            'artid': ref.get('journal_reference'),
        }
        mapped_fields = {
            'authors': ref.get('author'),
            'book_series': ref.get('book_series'),
            'collaboration': ref.get('collaboration'),
            'dois': ref.get('dois'),
            'imprint': ref.get('imprint'),
            'misc': ref.get('misc'),
            'number': ref.get('linemarker'),
            'persistent_identifiers': ref.get('persistent_identifiers'),
            'publication_info': publication_info,
            'texkey': ref.get('texkey'),
            'titles': ref.get('titles'),
            'urls': ref.get('urls'),
        }
        mapped_references.append({
            'curated_relation': False,
            'raw_refs': [raw_entry],
            'record': {'$ref': ''},
            'reference': mapped_fields,
        })

    return mapped_references

示例#14

0

显示文件

文件： sameref.py 项目： gitferry/sameref

def read_files_from_path(directory_path):
    """Print the titles that are referenced by more than one entry.

    Every ``.pdf`` file directly inside ``directory_path`` is run through
    refextract. For each reference that has a ``misc`` field, the title
    returned by ``extract_title`` for its first ``misc`` entry is counted.
    Titles counted more than once are printed as ``<title> <count>``.

    Args:
        directory_path: directory to scan; a trailing separator is optional.

    Returns:
        None; results are printed to stdout.
    """
    title_counts = {}
    for file_name in os.listdir(directory_path):
        if not file_name.endswith('.pdf'):
            continue
        # os.path.join instead of string concatenation, so the directory no
        # longer needs to end with a path separator.
        references = extract_references_from_file(
            os.path.join(directory_path, file_name))
        for reference_item in references:
            # ``in`` replaces dict.has_key(), which was removed in Python 3.
            if 'misc' in reference_item:
                title = extract_title(reference_item['misc'][0])
                title_counts[title] = title_counts.get(title, 0) + 1

    # items() and a %-formatted print() call work on Python 2 and 3 and give
    # the same "<title> <count>" line as the old ``print k, v`` statement.
    for title, count in title_counts.items():
        if count > 1:
            print("%s %s" % (title, count))

示例#15

0

显示文件

文件： extractrefs.py 项目： larocs/attention_dl

def main():
    """Command-line entry point: dump the references of a PDF to a JSON file.

    Reads ``sys.argv``. The first argument is a local path or an
    ``http://`` / ``https://`` URL ending in ``.pdf``. The optional second
    argument is the output path; it defaults to the input with its trailing
    ``.pdf`` replaced by ``.json``.

    Raises:
        AssertionError: if the input does not end in ``.pdf``.
    """
    if len(sys.argv) < 2:
        print('usage: extractrefs <pdf_path> [dst_path]')
        return

    pdf_path = sys.argv[1]
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    # Without it a non-.pdf input would leave dst_path equal to pdf_path and
    # the input file would be overwritten.
    if not pdf_path.endswith('.pdf'):
        raise AssertionError('expected a path ending in .pdf: %r' % pdf_path)

    if len(sys.argv) > 2:
        dst_path = sys.argv[2]
    else:
        # Swap only the trailing suffix; str.replace would also rewrite any
        # earlier '.pdf' inside the path (e.g. a directory named 'x.pdf.d').
        dst_path = pdf_path[:-len('.pdf')] + '.json'

    # NOTE(review): for a URL input without an explicit dst_path, dst_path is
    # still derived from the URL -- confirm that is intended.
    if pdf_path.startswith(('http://', 'https://')):
        refs = refextract.extract_references_from_url(pdf_path)
    else:
        refs = refextract.extract_references_from_file(pdf_path)

    with open(dst_path, 'w') as f:
        json.dump(refs, f, indent=4)
    print('saved refs to %s' % dst_path)

示例#16

0

显示文件

文件： build_bib.py 项目： saitiku/AutoGoogleScolarJumpStart

def get_refs(file_path):
    """Yield the lower-cased title of each titled reference in a PDF file.

    References whose joined ``raw_ref`` text contains ``"http"`` are skipped,
    as are references without a ``title`` field. Title parts are joined with
    single spaces before lower-casing.
    """
    print("File:", file_path)
    extracted = refextract.extract_references_from_file(str(file_path))

    for ref in extracted:
        raw_text = "".join(ref['raw_ref'])
        if "http" in raw_text:
            # Treated as a link rather than a citable work.
            continue

        title_parts = ref.get('title')
        if title_parts is None:
            continue

        yield " ".join(title_parts).lower()

示例#17

0

显示文件

文件： routes.py 项目： Tubbz-alt/arxiv-references

def extract() -> tuple:
    """Handle a request for reference extraction for a POSTed PDF.

    Returns:
        A ``(response, status)`` pair: 400 when no ``file`` part was uploaded
        or ``handle_upload`` rejects it with ``ValueError``, 500 with an
        explanation when refextract raises, and 200 with the extracted
        references otherwise. The uploaded file is cleaned up in every case
        once it has been stored.
    """
    logger = getLogger()
    if 'file' not in request.files:
        return jsonify({'explanation': 'No file found'}), HTTP_400_BAD_REQUEST

    try:
        filepath = handle_upload(request.files['file'])
    except ValueError as e:
        # A plain ValueError has no ``msg`` attribute; fall back to its
        # string form so the 400 response is not replaced by AttributeError.
        explanation = getattr(e, 'msg', str(e))
        return jsonify({'explanation': explanation}), HTTP_400_BAD_REQUEST

    try:
        response_data = extract_references_from_file(filepath)
        status = HTTP_200_OK
    except Exception as e:
        response_data = {'explanation': 'refextract failed: %s' % e}
        status = HTTP_500_INTERNAL_SERVER_ERROR
    finally:
        try:
            cleanup_upload(filepath)
        except IOError as e:
            # Pass the values as logging arguments. The previous
            # ``'... %s: %s' % filepath, e`` applied ``%`` to filepath alone
            # and raised TypeError inside this handler.
            logger.warning('Could not remove file %s: %s', filepath, e)

    return jsonify(response_data), status

示例#18

0

显示文件

def test_get_number_header_lines_does_not_crash_on_final_empty_page(pdf_files):
    """Extraction on the fifth fixture PDF completes and yields a truthy result."""
    extracted = extract_references_from_file(pdf_files[4])
    assert extracted

示例#19

0

显示文件

from refextract import extract_references_from_file
from os import path
from glob import glob


def find_ext(dr, ext):
    """Return the paths directly inside ``dr`` that match ``*.<ext>``."""
    pattern = "*.{suffix}".format(suffix=ext)
    return glob(path.join(dr, pattern))


# Collect every PDF waiting in the input folder.
list_of_pdfs = find_ext("PDFS_to_PROCESS", "pdf")

# Append the raw text of every extracted reference, one per line, to a single
# text file.
for pdf in list_of_pdfs:
    reference = extract_references_from_file(pdf)
    with open('drop_these_into_anystyledotio.txt', 'a') as biblio:
        for dictn in reference:
            raw = dictn['raw_ref']
            try:
                biblio.write("{}\n".format(str("".join(raw))))
            except UnicodeEncodeError:
                # Fallback: write the list form when the joined text cannot
                # be encoded.
                biblio.write("{}\n".format(raw))
            # The former ``else:`` branch wrote ``raw`` a second time after
            # every successful write, duplicating each reference as a list
            # repr; it has been removed.
            # r = dict((k, v[0]) for k, v in dictn.iteritems() if v)
            # biblio.write("{}\n".format(r))

示例#20

0

显示文件

文件： mapper.py 项目： Jfaler/Citation-Mapper

'''
Programmer: Justin Faler
Date: 4/8/2020
Description: PDF Citation Map
'''

import numpy as np
import networkx as nx
import matplotlib as mpl
import pdfminer, re, sys, os, time, csv
import matplotlib.pyplot as plt
from refextract import extract_references_from_file

# Dark background for every figure created afterwards.
plt.rcParams["figure.facecolor"] = "#333333"

# Single definition of the input document, so the extraction call and the
# derived name below cannot drift apart.
pdf_path = '/home/user/Documents/Books/example.pdf'

citation_map = nx.DiGraph(directed=True)
references = extract_references_from_file(pdf_path)

# Presumably drawing options for the graph; their use is not visible here.
options = {
    'node_color': '#36DBCA',
    'node_size': 150,
    'width': 0.2,
    'alpha': 1,
    'arrowstyle': '-|>',
    'arrowsize': 15,
}

# File name of the PDF without its directory and extension.
base = os.path.basename(pdf_path)
name = os.path.splitext(base)[0]

i = 1

示例#21

0

显示文件

文件： referenceExtractor.py 项目： dataMethodMan/keyPhraseExtraction

import pdfx

# pdf = pdfx.PDFx("/Users/stephenbradshaw/Documents/codingTest/AutomaticKeyphraseExtraction-master/data/1/1.pdf")
# metadata = pdf.get_metadata()
#
# refs = pdf.get_references()
# ref_dict = pdf.get_references_as_dict()
#
#
# print(len(ref_dict))
# for k , v in ref_dict.items():
#     print(k ,  v)
# print(10*"=")
#
# metadata = pdf.get_metadata()
# print(metadata)
#
# print(10*"-=-")
# print(pdf.get_references_count())

from refextract import extract_references_from_file
# Extract the reference list of the sample document and print it.
reference = extract_references_from_file(
    "/Users/stephenbradshaw/Documents/codingTest/AutomaticKeyphraseExtraction-master/data/1/1.pdf"
)
# The result is bound to ``reference``; printing ``references`` raised
# NameError.
print(reference)