示例#1
0
def get_references_for_entry(entry):
    """Extract the references of the document attached to ``entry``.

    The path is built from the second ``:``-separated component of
    ``entry.fields["file"]``, prefixed with ``/``.  If extraction raises
    ``FullTextNotAvailableError``, one retry is made with the path
    returned by ``closest_filename``.

    Returns:
        ``(entry, references)`` on success; ``None`` when the entry has no
        ``"file"`` field or when both attempts fail (a message is then
        printed to stderr).
    """
    fields = entry.fields
    if "file" not in fields:
        return None

    filename = '/' + fields["file"].split(':')[1]
    try:
        references = extract_references_from_file(filename)
    except FullTextNotAvailableError:
        # Second and last attempt, using the closest matching file name.
        filename = closest_filename(filename)
        try:
            references = extract_references_from_file(filename)
        except FullTextNotAvailableError:
            print("file not found " + filename, file=sys.stderr)
            return None

    return entry, references
def extract_citations_from_pdf(name_of_file):
    """Extract the raw citation strings of a document and print a dump.

    Every key/value pair of every extracted reference is printed, then the
    first ``raw_ref`` entry of each reference is collected.

    Args:
        name_of_file: path handed to ``extract_references_from_file``.

    Returns:
        The single citation when exactly one was found, the result of
        ``array_to_semicolon_separated`` when several were found, or the
        string ``"No references to extract"`` when none were found.  The
        returned value is also printed.
    """
    separator = "**************"
    references = extract_references_from_file(name_of_file)

    # Single-argument print(...) produces the same output on Python 2 and
    # Python 3, unlike the print statement which is a SyntaxError on 3.
    print(separator)

    array_of_citations = []

    for element in references:
        for key in element:
            print(key)
            print(element[key])
            print("---")

        array_of_citations.append(element['raw_ref'][0])
        print(separator)

    if len(array_of_citations) > 1:
        string_of_citations = array_to_semicolon_separated(array_of_citations)
    elif len(array_of_citations) == 1:
        string_of_citations = array_of_citations[0]
    else:
        string_of_citations = "No references to extract"

    print(string_of_citations)
    return string_of_citations
示例#3
0
def extract_references(filepath):
    """Extract references from PDF and return in INSPIRE format.

    Args:
        filepath: path handed to ``extract_references_from_file``.

    Returns:
        A list with one dict per entry of the ``'references'`` item of the
        extraction result.  Each dict holds only the fields that had a
        truthy value; list values are joined with ``","``.  The list is
        empty when the result has no (or an empty) ``'references'`` item.
    """
    references = extract_references_from_file(
        filepath,
        reference_format="{title},{volume},{page}",
        # override_kbs_files={
        #    'journals': get_mappings_from_kbname(['REFEXTRACT_KB_NAME'])
        # }
    )

    # (output key, refextract key) pairs, listed in output order.
    field_map = (
        ("journal_pubnote", 'journal_reference'),
        ("year", 'year'),
        ("collaboration", 'collaboration'),
        ("title", 'title'),
        ("misc", 'misc'),
        ("number", 'linemarker'),
        ("authors", 'author'),
        ("isbn", 'isbn'),
        ("doi", 'doi'),
        ("report_number", 'reportnumber'),
        ("publisher", 'publisher'),
        ("recid", 'recid'),
    )

    mapped_references = []
    for ref in references.get('references') or ():
        # Build the cleaned dict directly instead of deleting keys from a
        # dict while iterating over it (RuntimeError on Python 3).
        reference = {}
        for target_key, source_key in field_map:
            value = ref.get(source_key)
            if not value:
                continue
            if isinstance(value, list):
                value = ",".join(value)
            reference[target_key] = value
        mapped_references.append(reference)
    return mapped_references
示例#4
0
def extract_references(filepath):
    """Extract references from PDF and return in INSPIRE format.

    Args:
        filepath: path handed to ``extract_references_from_file``.

    Returns:
        A list of dicts, one per entry under the ``'references'`` key of
        the extraction result.  Fields with a falsy value are left out and
        list values are joined with ``","``.
    """
    references = extract_references_from_file(
        filepath,
        reference_format="{title},{volume},{page}",
        # override_kbs_files={
        #    'journals': get_mappings_from_kbname(['REFEXTRACT_KB_NAME'])
        # }
    )
    mapped_references = []
    extracted = references.get('references')
    if not extracted:
        return mapped_references

    for ref in extracted:
        candidate = {
            "journal_pubnote": ref.get('journal_reference'),
            "year": ref.get('year'),
            "collaboration": ref.get('collaboration'),
            "title": ref.get('title'),
            "misc": ref.get('misc'),
            "number": ref.get('linemarker'),
            "authors": ref.get('author'),
            "isbn": ref.get('isbn'),
            "doi": ref.get('doi'),
            "report_number": ref.get('reportnumber'),
            "publisher": ref.get('publisher'),
            "recid": ref.get('recid'),
        }

        # A new dict is filled rather than deleting from `candidate` during
        # iteration, which raises RuntimeError on Python 3.
        reference = {}
        for key, value in candidate.items():
            if not value:
                continue
            reference[key] = ",".join(value) if isinstance(value, list) else value
        mapped_references.append(reference)
    return mapped_references
def parse_pdfs(platform,parser="refextract"):
    """Extract references from the PDFs stored for ``platform``.

    * ``parser="cermine"``: runs the CERMINE ``ContentExtractor`` jar over
      ``../data/pdf/<platform>`` through ``execute`` and echoes whatever
      it yields.
    * ``parser="refextract"``: for every ``.pdf`` in
      ``data/pdf/<platform>`` whose ID is not yet present in
      ``data/json/<platform>``, calls ``extract_references_from_file`` and
      dumps the result to ``data/json/<platform>/<ID>.json``.  A PDF's ID
      is its file name minus the last two dot-separated parts; a JSON
      file's ID is its name minus the last part.
    * any other value: nothing happens.
    """
    pdf_path = os.path.join("..","data","pdf",platform)

    if parser == "cermine":
        jar_path = 'cermine-impl-1.13-jar-with-dependencies.jar'
        command = ['java', '-cp', jar_path, 'pl.edu.icm.cermine.ContentExtractor',
                   '-path', pdf_path,
                   '-outputs','jats'
                   ]
        for path in execute(command):
            print(path, end="")
        return

    if parser != 'refextract':
        return

    pdf_dir = os.path.join("data","pdf",platform)
    json_dir = os.path.join("data","json",platform)

    dir_entries = np.array(os.listdir(pdf_dir))
    pdf_files = dir_entries[np.char.endswith(dir_entries,'.pdf')]
    json_names = np.array(os.listdir(json_dir))
    parsed_json_IDs = np.array(['.'.join(name.split('.')[:-1]) for name in json_names])
    parsed_IDs = np.array(['.'.join(name.split('.')[:-2]) for name in pdf_files])

    # True for every PDF that has no JSON output yet.
    pending = ~np.isin(parsed_IDs,parsed_json_IDs)
    files_parse = pdf_files[pending]
    files_parse_ID = parsed_IDs[pending]
    nb_files = len(files_parse)

    # The meta file is opened in append mode (creating it if missing);
    # nothing in this function writes to it.
    with open(os.path.join("data","meta",platform+"_pdfs_parsed.txt"),'a'):
        for position, (pdf_name, pdf_id) in enumerate(zip(files_parse, files_parse_ID), 1):
            print("Extracting refs {}: {}/{}.".format(platform,position,nb_files))
            references = extract_references_from_file(os.path.join(pdf_dir,pdf_name))
            with open(os.path.join(json_dir,pdf_id+".json"),'w') as json_out:
                json.dump(references,json_out)
示例#6
0
def Extract_Ref_From_PDF(path):
    """Return one ``Reference`` object per reference extracted from ``path``.

    Each extracted item is passed to ``Reference.create_Ref`` on a freshly
    constructed ``Reference``; the objects are returned in extraction order.
    """
    collected = []
    for raw_reference in extract_references_from_file(path):
        parsed = Reference()
        parsed.create_Ref(raw_reference)
        collected.append(parsed)
    return collected
示例#7
0
def extract_references(filepath, source=None, custom_kbs_file=None):
    """Extract references from PDF and return in INSPIRE format.

    Args:
        filepath: path handed to ``extract_references_from_file``.
        source: forwarded to ``map_refextract_to_schema``.
        custom_kbs_file: forwarded as ``override_kbs_files``.

    Returns:
        Whatever ``map_refextract_to_schema`` returns for the extraction
        result.
    """
    refextract_options = {
        'override_kbs_files': custom_kbs_file,
        'reference_format': u'{title},{volume},{page}',
    }
    extracted_references = extract_references_from_file(
        filepath, **refextract_options)
    return map_refextract_to_schema(extracted_references, source=source)
示例#8
0
def countCitations(directory, min_distance_ratio):
    """Print how often similar references occur across the PDFs in a folder.

    For each ``.pdf`` file directly inside ``directory`` the references are
    loaded from a sibling ``.json`` cache file when one exists; otherwise
    they are extracted with ``extract_references_from_file`` and the cache
    file is written (best effort).  References lacking ``author`` or
    ``year`` are dropped.  A matrix from ``createDistanceMatrix`` is then
    thresholded with ``min_distance_ratio`` and a summary is printed.

    Args:
        directory: folder to scan (not recursive).
        min_distance_ratio: matrix entries must be strictly greater than
            this value to count as a match.

    Returns:
        None; all results are printed.
    """
    candidates = [
        join(directory, f) for f in listdir(directory)
        if isfile(join(directory, f))
    ]
    # A real list (not a lazy filter object) so that it can be iterated
    # below and still be measured with len() in the summary on Python 3.
    file_paths = [p for p in candidates if p.endswith(".pdf")]

    # Matches every bracketed number such as "[1]" in the raw reference.
    marker_pattern = re.compile(r"\[\d+\]")

    references = []
    for filepath in file_paths:
        print("Extracting references from %s" % filepath)
        json_file_path = splitext(filepath)[0] + ".json"

        if isfile(json_file_path):
            # load json if available
            with open(json_file_path, 'r') as f:
                extracted_refs = json.load(f)["references"]
        else:
            extracted_refs = extract_references_from_file(filepath)
            dict_object = dict(references=extracted_refs)
            try:
                # Save references data into the JSON file; the context
                # manager guarantees the handle is closed.
                with open(json_file_path, 'w') as file_object:
                    json.dump(dict_object, file_object)
            except Exception:
                # Caching is best effort: carry on with the in-memory data.
                print("Failed to save json file")

        # Keep only entries with both author and year, reduced to
        # (raw reference without bracketed numbers, first year value).
        references.extend(
            (marker_pattern.sub("", ref['raw_ref'][0]).strip(), ref['year'][0])
            for ref in extracted_refs
            if 'author' in ref and 'year' in ref
        )

    print("Calculating distances...")
    distances = createDistanceMatrix(references)

    mask = np.where((distances > min_distance_ratio), distances, 0)
    counts = (mask > 0).sum(axis=1)

    i_indices = np.where(counts > 0)[0]

    result = []
    for i in i_indices:
        # +1 on top of the number of matching entries in row i.
        count = counts[i] + 1
        title = references[i][0]
        result.append((count, title))

    result.sort(key=lambda x: x[0], reverse=True)

    print("\n==========================")
    print("Number of papers: %i" % len(file_paths))
    print("Total references found: %i\n" % len(references))
    for count, title in result:
        print("Cited by %i papers: %s" % (count, title))
示例#9
0
def extract_references_from_pdf(filepath, source=None, custom_kbs_file=None):
    """Extract references from PDF and return in INSPIRE format.

    Extraction runs inside the ``local_refextract_kbs_path`` context and
    uses the path it provides as ``override_kbs_files``.  Mapping with
    ``map_refextract_to_schema`` happens after the context has exited.

    Args:
        filepath: path handed to ``extract_references_from_file``.
        source: forwarded to ``map_refextract_to_schema``.
        custom_kbs_file: accepted but not used by this function.
    """
    reference_format = u'{title},{volume},{page}'
    with local_refextract_kbs_path() as kbs_path:
        extracted_references = extract_references_from_file(
            filepath,
            override_kbs_files=kbs_path,
            reference_format=reference_format,
        )

    mapped = map_refextract_to_schema(extracted_references, source=source)
    return mapped
示例#10
0
def extract_references_from_pdf(filepath, source=None, custom_kbs_file=None):
    """Extract references from PDF and return in INSPIRE format.

    The knowledge-base path yielded by ``local_refextract_kbs_path`` is
    passed to ``extract_references_from_file`` as ``override_kbs_files``;
    the result is converted with ``map_refextract_to_schema`` once the
    context manager has been left.

    Note: ``custom_kbs_file`` is part of the signature but is not read.
    """
    with local_refextract_kbs_path() as kbs_path:
        refextract_kwargs = dict(
            override_kbs_files=kbs_path,
            reference_format=u'{title},{volume},{page}',
        )
        extracted_references = extract_references_from_file(
            filepath, **refextract_kwargs)

    return map_refextract_to_schema(extracted_references, source=source)
示例#11
0
def find_reference_list(path):
    """Return the de-duplicated reference list refextract finds in a file.

    Extraction is best effort: if ``extract_references_from_file`` raises,
    a message is printed and an empty list is de-duplicated instead.

    Keyword arguments:
    path -- path handed to ``extract_references_from_file``

    Returns whatever ``remove_duplicate_refs`` returns for the extracted
    references.
    """
    references = []
    try:
        references = extract_references_from_file(path)
    except Exception:
        # Not a bare ``except:`` so that KeyboardInterrupt and SystemExit
        # still propagate instead of being reported as an unreadable PDF.
        print('Could not read PDF file')
    return remove_duplicate_refs(references)
示例#12
0
def extract_references(filepath):
    """Extract references from PDF and return in INSPIRE format.

    Returns a list with one record per entry under the ``'references'``
    key of the extraction result (empty when that key is missing or
    falsy).  Each record carries the JSON dump of the raw entry plus a
    ``'reference'`` dict filled with ``ref.get(...)`` lookups, so absent
    fields appear as ``None``.
    """
    extraction = extract_references_from_file(
        filepath,
        reference_format=u"{title},{volume},{page}",
    )
    mapped_references = []
    found = extraction.get('references')
    if not found:
        return mapped_references

    for ref in found:
        raw_entry = {
            'position': '',
            'schema': '',
            'source': '',
            'value': json.dumps(ref),
        }
        publication_info = {
            'year': ref.get('year'),
            'journal_title': ref.get('journal_title'),
            'journal_volume': ref.get('journal_volume'),
            'page_start': ref.get('journal_page'),
            'artid': ref.get('journal_reference'),
        }
        details = {
            'authors': ref.get('author'),
            'book_series': ref.get('book_series'),
            'collaboration': ref.get('collaboration'),
            'dois': ref.get('dois'),
            'imprint': ref.get('imprint'),
            'misc': ref.get('misc'),
            'number': ref.get('linemarker'),
            'persistent_identifiers': ref.get('persistent_identifiers'),
            'publication_info': publication_info,
            'texkey': ref.get('texkey'),
            'titles': ref.get('titles'),
            'urls': ref.get('urls'),
        }
        mapped_references.append({
            'curated_relation': False,
            'raw_refs': [raw_entry],
            'record': {'$ref': ''},
            'reference': details,
        })

    return mapped_references
示例#13
0
def extract_references(filepath):
    """Extract references from PDF and return in INSPIRE format.

    Every entry under the ``'references'`` key of the extraction result is
    turned into a record holding the JSON-serialised raw entry and a
    ``'reference'`` dict populated via ``ref.get(...)`` (missing fields
    become ``None``).  An empty list is returned when there are no
    entries.
    """

    def _to_record(ref):
        # One output record for a single extracted reference.
        return {
            'curated_relation': False,
            'raw_refs': [{
                'position': '',
                'schema': '',
                'source': '',
                'value': json.dumps(ref),
            }],
            'record': {'$ref': ''},
            'reference': {
                'authors': ref.get('author'),
                'book_series': ref.get('book_series'),
                'collaboration': ref.get('collaboration'),
                'dois': ref.get('dois'),
                'imprint': ref.get('imprint'),
                'misc': ref.get('misc'),
                'number': ref.get('linemarker'),
                'persistent_identifiers': ref.get('persistent_identifiers'),
                'publication_info': {
                    'year': ref.get('year'),
                    'journal_title': ref.get('journal_title'),
                    'journal_volume': ref.get('journal_volume'),
                    'page_start': ref.get('journal_page'),
                    'artid': ref.get('journal_reference'),
                },
                'texkey': ref.get('texkey'),
                'titles': ref.get('titles'),
                'urls': ref.get('urls'),
            },
        }

    references = extract_references_from_file(
        filepath,
        reference_format="{title},{volume},{page}",
    )
    entries = references.get('references')
    if not entries:
        return []
    return [_to_record(ref) for ref in entries]
示例#14
0
def read_files_from_path(directory_path):
    """Print every reference title that occurs more than once in a folder.

    Each ``.pdf`` file directly inside ``directory_path`` is passed to
    ``extract_references_from_file``.  For every reference that has a
    ``'misc'`` entry, ``extract_title`` is applied to its first ``'misc'``
    value and the resulting title is counted.  Titles counted more than
    once are printed as ``"<title> <count>"``.

    Args:
        directory_path: folder to scan; a trailing separator is optional.

    Returns:
        None.
    """
    reference_counts = {}
    for file_name in os.listdir(directory_path):
        if not file_name.endswith('.pdf'):
            continue
        # os.path.join supplies the separator that plain string
        # concatenation dropped when directory_path had no trailing slash.
        pdf_path = os.path.join(directory_path, file_name)
        for reference_item in extract_references_from_file(pdf_path):
            if 'misc' not in reference_item:
                continue
            title = extract_title(reference_item['misc'][0])
            reference_counts[title] = reference_counts.get(title, 0) + 1

    # `in`, .items() and single-argument print(...) behave the same on
    # Python 2 and Python 3, unlike has_key/iteritems/print statements.
    for title, count in reference_counts.items():
        if count > 1:
            print("%s %s" % (title, count))
示例#15
0
def main():
    """Command-line entry point: dump the references of a PDF to JSON.

    Usage: ``extractrefs <pdf_path> [dst_path]``.  ``pdf_path`` may be a
    local path or an ``http(s)://`` URL and must end in ``.pdf``.  When
    ``dst_path`` is omitted, the ``.pdf`` suffix is swapped for ``.json``;
    for a URL only the last path component is used, so the output lands
    in the current directory.

    Raises:
        AssertionError: if ``pdf_path`` does not end in ``.pdf``.
    """
    if len(sys.argv) < 2:
        print('usage: extractrefs <pdf_path> [dst_path]')
        return

    pdf_path = sys.argv[1]
    # Explicit raise instead of ``assert`` so the check survives ``-O``.
    if not pdf_path.endswith('.pdf'):
        raise AssertionError(
            'expected a path or URL ending in .pdf, got %r' % pdf_path)

    is_url = pdf_path.startswith(('http://', 'https://'))

    if len(sys.argv) > 2:
        dst_path = sys.argv[2]
    else:
        # Strip only the trailing suffix; str.replace would also rewrite
        # any earlier ".pdf" occurring inside the path.
        stem = pdf_path[:-len('.pdf')]
        if is_url:
            # A URL is not a writable local path: keep the file name only.
            stem = stem.rsplit('/', 1)[-1]
        dst_path = stem + '.json'

    if is_url:
        refs = refextract.extract_references_from_url(pdf_path)
    else:
        refs = refextract.extract_references_from_file(pdf_path)

    with open(dst_path, 'w') as f:
        json.dump(refs, f, indent=4)
    print('saved refs to %s' % dst_path)
def get_refs(file_path):
    """Yield the lower-cased title of each reference in a PDF file.

    This is a generator: the ``File:`` line is printed and the references
    are extracted when iteration starts.  References whose joined
    ``raw_ref`` contains ``"http"`` are skipped, as are references without
    a ``title`` entry.  The parts of each remaining title are joined with
    single spaces and lower-cased.
    """
    print("File:", file_path)

    for ref in refextract.extract_references_from_file(str(file_path)):
        # Link-like references are ignored.
        raw_text = "".join(ref['raw_ref'])
        if "http" in raw_text:
            continue

        title_parts = ref.get('title', None)
        if title_parts is None:
            # No title was read for this reference.
            continue

        yield " ".join(title_parts).lower()
示例#17
0
def extract() -> tuple:
    """Handle a request for reference extraction for a POSTed PDF.

    Returns:
        A ``(response, status)`` tuple:

        * ``HTTP_400_BAD_REQUEST`` when the request has no ``file`` part or
          ``handle_upload`` raises ``ValueError``;
        * ``HTTP_200_OK`` with the extraction result on success;
        * ``HTTP_500_INTERNAL_SERVER_ERROR`` when extraction raises.

        The uploaded file is cleaned up after extraction in every case; a
        failed cleanup is logged as a warning and does not alter the
        response.
    """
    logger = getLogger()
    if 'file' not in request.files:
        return jsonify({'explanation': 'No file found'}), HTTP_400_BAD_REQUEST

    try:
        filepath = handle_upload(request.files['file'])
    except ValueError as e:
        # A plain ValueError has no ``msg`` attribute; fall back to str(e)
        # so the 400 response is produced instead of an AttributeError.
        explanation = getattr(e, 'msg', str(e))
        return jsonify({'explanation': explanation}), HTTP_400_BAD_REQUEST

    try:
        response_data = extract_references_from_file(filepath)
        status = HTTP_200_OK
    except Exception as e:
        response_data = {'explanation': 'refextract failed: %s' % e}
        status = HTTP_500_INTERNAL_SERVER_ERROR
    finally:
        try:
            cleanup_upload(filepath)
        except IOError as e:
            # Pass the values as logging arguments: ``'...' % filepath, e``
            # applied % to filepath alone and raised TypeError.
            logger.warning('Could not remove file %s: %s', filepath, e)

    return jsonify(response_data), status
示例#18
0
def test_get_number_header_lines_does_not_crash_on_final_empty_page(pdf_files):
    """Extraction from the fifth ``pdf_files`` entry returns a truthy result."""
    extracted = extract_references_from_file(pdf_files[4])
    assert extracted
示例#19
0
from refextract import extract_references_from_file
from os import path
from glob import glob


def find_ext(dr, ext):
    """Return the paths directly inside ``dr`` that match ``*.<ext>``."""
    pattern = path.join(dr, "*.{}".format(ext))
    matches = glob(pattern)
    return matches


# Collect every PDF in the input folder and append the raw text of each
# extracted reference, one per line, to a single output file.
list_of_pdfs = find_ext("PDFS_to_PROCESS", "pdf")

for pdf in list_of_pdfs:
    reference = extract_references_from_file(pdf)
    with open('drop_these_into_anystyledotio.txt', 'a') as biblio:
        for dictn in reference:
            raw = dictn['raw_ref']
            try:
                biblio.write("{}\n".format(str("".join(raw))))
            except UnicodeEncodeError:
                # Fall back to the list's repr when the joined text cannot
                # be encoded.  There is deliberately no ``else`` branch: it
                # wrote every successfully written reference a second time.
                biblio.write("{}\n".format(raw))
            # r = dict((k, v[0]) for k, v in dictn.iteritems() if v)
            # biblio.write("{}\n".format(r))
示例#20
0
'''
Programmer: Justin Faler
Date: 4/8/2020
Description: PDF Citation Map
'''

import numpy as np
import networkx as nx
import matplotlib as mpl
import pdfminer, re, sys, os, time, csv
import matplotlib.pyplot as plt
from refextract import extract_references_from_file

# Dark background for every figure.
plt.rcParams["figure.facecolor"] = "#333333"

# Drawing options (node and arrow appearance).
options = dict(
    node_color='#36DBCA',
    node_size=150,
    width=0.2,
    alpha=1,
    arrowstyle='-|>',
    arrowsize=15,
)

# Graph that will hold the citation map, and the references of the
# source document.
citation_map = nx.DiGraph(directed=True)
references = extract_references_from_file(
    '/home/user/Documents/Books/example.pdf')

# File name of the source document, with and without its extension.
base = os.path.basename('/home/user/Documents/Books/example.pdf')
name = os.path.splitext(base)[0]

i = 1
import pdfx

# pdf = pdfx.PDFx("/Users/stephenbradshaw/Documents/codingTest/AutomaticKeyphraseExtraction-master/data/1/1.pdf")
# metadata = pdf.get_metadata()
#
# refs = pdf.get_references()
# ref_dict = pdf.get_references_as_dict()
#
#
# print(len(ref_dict))
# for k , v in ref_dict.items():
#     print(k ,  v)
# print(10*"=")
#
# metadata = pdf.get_metadata()
# print(metadata)
#
# print(10*"-=-")
# print(pdf.get_references_count())

from refextract import extract_references_from_file
# Extract the reference list of the sample document and show it.
reference = extract_references_from_file(
    "/Users/stephenbradshaw/Documents/codingTest/AutomaticKeyphraseExtraction-master/data/1/1.pdf"
)
# Print the value bound just above; `references` (plural) is not assigned
# by this script.
print(reference)