def get_references_for_entry(entry):
    """Return ``(entry, references)`` for an entry that has an attached file.

    The path is the second ``:``-separated component of
    ``entry.fields["file"]`` with a leading ``/`` prepended. If extraction
    raises ``FullTextNotAvailableError``, one retry is made with the path
    returned by ``closest_filename``.

    Returns ``None`` when the entry has no ``"file"`` field, or when the
    retry also raises ``FullTextNotAvailableError`` (a message is then
    written to stderr).
    """
    if "file" not in entry.fields:
        return None

    pdf_path = '/' + entry.fields["file"].split(':')[1]
    try:
        refs = extract_references_from_file(pdf_path)
    except FullTextNotAvailableError:
        # First attempt failed: retry once with whatever closest_filename()
        # suggests for this path.
        pdf_path = closest_filename(pdf_path)
        try:
            refs = extract_references_from_file(pdf_path)
        except FullTextNotAvailableError:
            print("file not found " + pdf_path, file=sys.stderr)
            return None

    return entry, refs
def extract_citations_from_pdf(name_of_file):
    """Extract the raw citation strings of a file and return them as one string.

    Every reference returned by ``extract_references_from_file`` is dumped to
    stdout (key, then value, for each field), and the first element of its
    ``'raw_ref'`` entry is collected.

    Returns:
        * the result of ``array_to_semicolon_separated`` when more than one
          citation was collected,
        * the single citation when exactly one was collected,
        * the literal ``"No references to extract"`` otherwise.

    The returned string is also printed.
    """
    references = extract_references_from_file(name_of_file)

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("**************")
    array_of_citations = []
    for element in references:
        # Debug dump of every field of this reference.
        for key in element:
            print(key)
            print(element[key])
        print("---")
        array_of_citations.append(element['raw_ref'][0])
    print("**************")

    if len(array_of_citations) > 1:
        string_of_citations = array_to_semicolon_separated(array_of_citations)
    elif len(array_of_citations) == 1:
        string_of_citations = array_of_citations[0]
    else:
        string_of_citations = "No references to extract"

    print(string_of_citations)
    return string_of_citations
def extract_references(filepath):
    """Extract references from PDF and return in INSPIRE format.

    Calls ``extract_references_from_file`` with the
    ``"{title},{volume},{page}"`` reference format and reads the list stored
    under the ``'references'`` key of its result.

    Returns a list with one dict per extracted reference. Each dict holds only
    the fields that had a truthy value; list values are joined with ``","``.
    An empty list is returned when there are no references.
    """
    references = extract_references_from_file(
        filepath,
        reference_format="{title},{volume},{page}",
        # override_kbs_files={
        #     'journals': get_mappings_from_kbname(['REFEXTRACT_KB_NAME'])
        # }
    )

    # (output key, refextract key), in the order the output dict is filled.
    field_map = (
        ("journal_pubnote", "journal_reference"),
        ("year", "year"),
        ("collaboration", "collaboration"),
        ("title", "title"),
        ("misc", "misc"),
        ("number", "linemarker"),
        ("authors", "author"),
        ("isbn", "isbn"),
        ("doi", "doi"),
        ("report_number", "reportnumber"),
        ("publisher", "publisher"),
        ("recid", "recid"),
    )

    mapped_references = []
    for ref in references.get('references') or []:
        # Build the dict directly instead of deleting empty keys while
        # iterating over it, which raises RuntimeError on Python 3.
        reference = {}
        for out_key, src_key in field_map:
            value = ref.get(src_key)
            if not value:
                continue
            if isinstance(value, list):
                value = ",".join(value)
            reference[out_key] = value
        mapped_references.append(reference)
    return mapped_references
def extract_references(filepath):
    """Extract references from PDF and return in INSPIRE format.

    Calls ``extract_references_from_file`` with the
    ``"{title},{volume},{page}"`` reference format and reads the list stored
    under the ``'references'`` key of its result.

    Returns a list with one dict per extracted reference. Each dict holds only
    the fields that had a truthy value; list values are joined with ``","``.
    An empty list is returned when there are no references.
    """
    references = extract_references_from_file(
        filepath,
        reference_format="{title},{volume},{page}",
        # override_kbs_files={
        #     'journals': get_mappings_from_kbname(['REFEXTRACT_KB_NAME'])
        # }
    )

    # (output key, refextract key), in the order the output dict is filled.
    field_map = (
        ("journal_pubnote", "journal_reference"),
        ("year", "year"),
        ("collaboration", "collaboration"),
        ("title", "title"),
        ("misc", "misc"),
        ("number", "linemarker"),
        ("authors", "author"),
        ("isbn", "isbn"),
        ("doi", "doi"),
        ("report_number", "reportnumber"),
        ("publisher", "publisher"),
        ("recid", "recid"),
    )

    mapped_references = []
    for ref in references.get('references') or []:
        # Build the dict directly instead of deleting empty keys while
        # iterating over it, which raises RuntimeError on Python 3.
        reference = {}
        for out_key, src_key in field_map:
            value = ref.get(src_key)
            if not value:
                continue
            if isinstance(value, list):
                value = ",".join(value)
            reference[out_key] = value
        mapped_references.append(reference)
    return mapped_references
def parse_pdfs(platform, parser="refextract"):
    """Extract references from the PDFs of *platform* with the chosen parser.

    Args:
        platform: name of the sub-directory under the ``pdf``/``json``
            data directories; also the prefix of the meta file name.
        parser: ``"cermine"`` or ``"refextract"`` (default). Any other value
            makes the function a no-op.

    With ``"cermine"``, the CERMINE ``ContentExtractor`` jar is run through
    the ``execute`` helper on ``../data/pdf/<platform>`` and everything the
    helper yields is printed.

    With ``"refextract"``, every ``*.pdf`` in ``data/pdf/<platform>`` whose ID
    has no matching ``<ID>.json`` in ``data/json/<platform>`` is passed to
    ``extract_references_from_file`` and the result is dumped to that JSON
    file. A PDF's ID is its file name with the last two dot-separated
    components removed; a JSON file's ID drops only the last one.
    """
    if parser == "cermine":
        pdf_path = os.path.join("..", "data", "pdf", platform)
        jar_path = 'cermine-impl-1.13-jar-with-dependencies.jar'
        command = [
            'java', '-cp', jar_path,
            'pl.edu.icm.cermine.ContentExtractor',
            '-path', pdf_path,
            '-outputs', 'jats',
        ]
        for path in execute(command):
            print(path, end="")
    elif parser == 'refextract':
        pdf_dir = os.path.join("data", "pdf", platform)
        json_dir = os.path.join("data", "json", platform)

        # Plain list/set filtering: unlike np.char.endswith on a directory
        # listing, this also works when a directory is empty.
        pdf_files = [n for n in os.listdir(pdf_dir) if n.endswith('.pdf')]
        parsed_json_ids = {
            '.'.join(n.split('.')[:-1]) for n in os.listdir(json_dir)
        }

        pending = []
        for pdf_name in pdf_files:
            # NOTE(review): two components are stripped, so names are
            # presumably of the form "<ID>.<something>.pdf" - confirm.
            file_id = '.'.join(pdf_name.split('.')[:-2])
            if file_id not in parsed_json_ids:
                pending.append((pdf_name, file_id))

        nb_files = len(pending)
        meta_path = os.path.join("data", "meta", platform + "_pdfs_parsed.txt")
        # The meta file is opened in append mode (created if missing) for the
        # duration of the run; nothing is written to it here.
        with open(meta_path, 'a'):
            for i, (pdf_name, file_id) in enumerate(pending, start=1):
                print("Extracting refs {}: {}/{}.".format(platform, i, nb_files))
                references = extract_references_from_file(
                    os.path.join(pdf_dir, pdf_name))
                json_path = os.path.join(json_dir, file_id + ".json")
                with open(json_path, 'w') as json_file:
                    json.dump(references, json_file)
def Extract_Ref_From_PDF(path):
    """Return a list of ``Reference`` objects built from the file at *path*.

    One ``Reference`` is created per item returned by
    ``extract_references_from_file`` and populated via ``create_Ref``.
    """
    parsed = []
    for raw_item in extract_references_from_file(path):
        item = Reference()
        item.create_Ref(raw_item)
        parsed.append(item)
    return parsed
def extract_references(filepath, source=None, custom_kbs_file=None):
    """Extract references from PDF and return in INSPIRE format.

    Args:
        filepath: file handed to ``extract_references_from_file``.
        source: forwarded to ``map_refextract_to_schema``.
        custom_kbs_file: forwarded to refextract as ``override_kbs_files``.

    Returns:
        Whatever ``map_refextract_to_schema`` produces for the extracted
        references.
    """
    refextract_options = {
        'override_kbs_files': custom_kbs_file,
        'reference_format': u'{title},{volume},{page}',
    }
    raw_references = extract_references_from_file(filepath, **refextract_options)
    return map_refextract_to_schema(raw_references, source=source)
def countCitations(directory, min_distance_ratio):
    """Print how often similar references are cited across the PDFs of a directory.

    For every ``*.pdf`` file directly inside *directory*, references are
    loaded from the sibling ``.json`` cache if it exists. Otherwise they are
    extracted with ``extract_references_from_file`` and the cache is written
    on a best-effort basis. Only references with both ``'author'`` and
    ``'year'`` are kept, as ``(raw text without "[n]" markers, year)`` tuples.

    The tuples are passed to ``createDistanceMatrix``. For each reference, the
    number of entries in its row that exceed *min_distance_ratio* is counted,
    and references with at least one such entry are reported as cited by
    ``count + 1`` papers, in descending order of count.

    Args:
        directory: directory to scan (not recursive).
        min_distance_ratio: threshold applied to the distance matrix values.
    """
    # A real list (not a lazy filter object) so len() works on Python 3.
    candidate_paths = [join(directory, f) for f in listdir(directory)]
    file_paths = [
        p for p in candidate_paths if isfile(p) and p.endswith(".pdf")
    ]

    # Matches every "[<digits>]" marker in the raw reference text.
    citation_marker = re.compile(r"\[\d+\]")

    references = []
    for filepath in file_paths:
        print("Extracting references from %s" % filepath)
        json_file_path = splitext(filepath)[0] + ".json"
        if isfile(json_file_path):
            # load json if available
            with open(json_file_path, 'r') as f:
                extracted_refs = json.load(f)["references"]
        else:
            extracted_refs = extract_references_from_file(filepath)
            try:
                # Save references data into the JSON file; the context
                # manager makes sure the handle is closed.
                with open(json_file_path, 'w') as file_object:
                    json.dump(dict(references=extracted_refs), file_object)
            except Exception:
                # Caching is best-effort; carry on with the in-memory data.
                print("Failed to save json file")

        # Keep only entries with year and author, stripped of "[n]" markers.
        references += [
            (citation_marker.sub("", x['raw_ref'][0]).strip(), x['year'][0])
            for x in extracted_refs
            if 'author' in x and 'year' in x
        ]

    print("Calculating distances...")
    distances = createDistanceMatrix(references)
    mask = np.where((distances > min_distance_ratio), distances, 0)
    counts = (mask > 0).sum(axis=1)
    i_indices = np.where(counts > 0)[0]

    result = []
    for i in i_indices:
        count = counts[i] + 1
        title = references[i][0]
        result.append((count, title))
    result.sort(key=lambda x: x[0], reverse=True)

    print("\n==========================")
    print("Number of papers: %i" % len(file_paths))
    print("Total references found: %i\n" % len(references))
    for count, title in result:
        print("Cited by %i papers: %s" % (count, title))
def extract_references_from_pdf(filepath, source=None, custom_kbs_file=None):
    """Extract references from PDF and return in INSPIRE format.

    refextract is run while the ``local_refextract_kbs_path`` context is
    active, with the path it yields used as ``override_kbs_files``. The result
    is converted with ``map_refextract_to_schema`` after the context exits.

    ``custom_kbs_file`` is accepted but not used by this function.
    """
    reference_format = u'{title},{volume},{page}'
    with local_refextract_kbs_path() as kbs_path:
        raw_references = extract_references_from_file(
            filepath,
            override_kbs_files=kbs_path,
            reference_format=reference_format,
        )

    return map_refextract_to_schema(raw_references, source=source)
def extract_references_from_pdf(filepath, source=None, custom_kbs_file=None):
    """Extract references from PDF and return in INSPIRE format.

    Args:
        filepath: file handed to ``extract_references_from_file``.
        source: forwarded to ``map_refextract_to_schema``.
        custom_kbs_file: accepted but not used by this function; the
            knowledge-base path comes from ``local_refextract_kbs_path``.

    Returns:
        The output of ``map_refextract_to_schema`` for the extracted
        references.
    """
    with local_refextract_kbs_path() as kbs_path:
        options = {
            'override_kbs_files': kbs_path,
            'reference_format': u'{title},{volume},{page}',
        }
        found = extract_references_from_file(filepath, **options)

    # The schema mapping runs after the knowledge-base context has exited.
    return map_refextract_to_schema(found, source=source)
def find_reference_list(path):
    """Return the de-duplicated reference list refextract finds in a file.

    Keyword arguments:
    path -- path passed to ``extract_references_from_file``

    If extraction fails, a message is printed and an empty list is handed to
    ``remove_duplicate_refs``, so the function still returns that helper's
    result.
    """
    try:
        references = extract_references_from_file(path)
    except Exception:
        # Not a bare ``except:``, so KeyboardInterrupt and SystemExit
        # still propagate.
        print('Could not read PDF file')
        references = []
    return remove_duplicate_refs(references)
def extract_references(filepath):
    """Extract references from PDF and return in INSPIRE format.

    Runs ``extract_references_from_file`` with the
    ``{title},{volume},{page}`` reference format and converts every item under
    the result's ``'references'`` key into a dict with the keys
    ``curated_relation``, ``raw_refs``, ``record`` and ``reference``.
    Fields missing from the extracted item end up as ``None``. Returns an
    empty list when nothing was extracted.
    """
    extracted = extract_references_from_file(
        filepath,
        reference_format=u"{title},{volume},{page}",
    )

    def to_inspire(ref):
        # The untouched refextract item is kept as JSON in raw_refs.
        raw_entry = {
            'position': '',
            'schema': '',
            'source': '',
            'value': json.dumps(ref),
        }
        publication_info = {
            'year': ref.get('year'),
            'journal_title': ref.get('journal_title'),
            'journal_volume': ref.get('journal_volume'),
            'page_start': ref.get('journal_page'),
            'artid': ref.get('journal_reference'),
        }
        return {
            'curated_relation': False,
            'raw_refs': [raw_entry],
            'record': {'$ref': ''},
            'reference': {
                'authors': ref.get('author'),
                'book_series': ref.get('book_series'),
                'collaboration': ref.get('collaboration'),
                'dois': ref.get('dois'),
                'imprint': ref.get('imprint'),
                'misc': ref.get('misc'),
                'number': ref.get('linemarker'),
                'persistent_identifiers': ref.get('persistent_identifiers'),
                'publication_info': publication_info,
                'texkey': ref.get('texkey'),
                'titles': ref.get('titles'),
                'urls': ref.get('urls'),
            },
        }

    items = extracted.get('references')
    if not items:
        return []
    return [to_inspire(ref) for ref in items]
def extract_references(filepath):
    """Extract references from PDF and return in INSPIRE format.

    Each item under the ``'references'`` key of the refextract result becomes
    one dict holding the raw item (as JSON, in ``raw_refs``) and a
    ``reference`` sub-dict filled from the item's fields. Absent fields are
    stored as ``None``. Returns ``[]`` when there are no references.
    """
    result = extract_references_from_file(
        filepath,
        reference_format="{title},{volume},{page}",
    )

    mapped_references = []
    for ref in result.get('references') or []:
        field = ref.get  # shorthand for the repeated lookups below

        raw_refs = [{
            'position': '',
            'schema': '',
            'source': '',
            'value': json.dumps(ref),
        }]
        body = {
            'authors': field('author'),
            'book_series': field('book_series'),
            'collaboration': field('collaboration'),
            'dois': field('dois'),
            'imprint': field('imprint'),
            'misc': field('misc'),
            'number': field('linemarker'),
            'persistent_identifiers': field('persistent_identifiers'),
            'publication_info': {
                'year': field('year'),
                'journal_title': field('journal_title'),
                'journal_volume': field('journal_volume'),
                'page_start': field('journal_page'),
                'artid': field('journal_reference'),
            },
            'texkey': field('texkey'),
            'titles': field('titles'),
            'urls': field('urls'),
        }
        mapped_references.append({
            'curated_relation': False,
            'raw_refs': raw_refs,
            'record': {'$ref': ''},
            'reference': body,
        })
    return mapped_references
def read_files_from_path(directory_path):
    """Print the titles that occur more than once across a directory's PDFs.

    Every ``*.pdf`` file in *directory_path* is run through
    ``extract_references_from_file``. For each reference that has a
    ``'misc'`` field, ``extract_title`` is applied to its first element and
    the resulting title is counted. Titles with a count above one are printed
    as ``"<title> <count>"``.

    Returns ``None``.
    """
    refrence_dic = {}
    for file_name in os.listdir(directory_path):
        if not file_name.endswith('.pdf'):
            continue
        # os.path.join also works when directory_path has no trailing
        # separator; plain concatenation did not.
        references = extract_references_from_file(
            os.path.join(directory_path, file_name))
        for reference_item in references:
            if 'misc' not in reference_item:
                continue
            title = extract_title(reference_item['misc'][0])
            refrence_dic[title] = refrence_dic.get(title, 0) + 1

    for title, count in refrence_dic.items():
        if count > 1:
            # Same "<k> <v>" output as the Python 2 ``print k, v``.
            print("%s %s" % (title, count))
def main():
    """Extract the references of a PDF (local path or URL) into a JSON file.

    Command line: ``extractrefs <pdf_path> [dst_path]``.

    * With no arguments, the usage line is printed and the function returns.
    * ``pdf_path`` must end in ``.pdf``; otherwise ``AssertionError`` is
      raised.
    * ``dst_path`` defaults to ``pdf_path`` with its ``.pdf`` suffix replaced
      by ``.json``.
    * Paths starting with ``http://`` or ``https://`` go through
      ``refextract.extract_references_from_url``; all others go through
      ``refextract.extract_references_from_file``.
    """
    if len(sys.argv) < 2:
        print('usage: extractrefs <pdf_path> [dst_path]')
        return

    pdf_path = sys.argv[1]
    # Explicit raise instead of ``assert`` so the check survives ``python -O``;
    # the exception type is unchanged.
    if not pdf_path.endswith('.pdf'):
        raise AssertionError('pdf_path must end with .pdf: %r' % (pdf_path,))

    if len(sys.argv) > 2:
        dst_path = sys.argv[2]
    else:
        # Swap only the trailing suffix; str.replace would also rewrite any
        # earlier ".pdf" inside the path.
        dst_path = pdf_path[:-len('.pdf')] + '.json'

    if pdf_path.startswith(('http://', 'https://')):
        refs = refextract.extract_references_from_url(pdf_path)
    else:
        refs = refextract.extract_references_from_file(pdf_path)

    with open(dst_path, 'w') as f:
        json.dump(refs, f, indent=4)
    print('saved refs to %s' % dst_path)
def get_refs(file_path):
    """Yield the lower-cased title of each reference found in *file_path*.

    This is a generator: the ``File:`` line is printed and refextract is run
    on ``str(file_path)`` when iteration starts. References whose joined
    ``'raw_ref'`` text contains ``"http"`` are skipped, as are references
    without a ``'title'`` field. Title parts are joined with single spaces.
    """
    print("File:", file_path)
    found = refextract.extract_references_from_file(str(file_path))

    for entry in found:
        raw_text = "".join(entry['raw_ref'])
        if "http" in raw_text:
            # Link-style reference: no usable title.
            continue

        title_parts = entry.get('title')
        if title_parts is None:
            continue
        yield " ".join(title_parts).lower()
def extract() -> tuple:
    """Handle a request for reference extraction for a POSTed PDF.

    Returns a ``(response, status)`` tuple:

    * 400 when the request has no ``'file'`` part or ``handle_upload`` raises
      ``ValueError``;
    * 200 with the output of ``extract_references_from_file`` on success;
    * 500 with an explanation when extraction raises.

    The uploaded file is passed to ``cleanup_upload`` whether or not
    extraction succeeded; an ``IOError`` during cleanup is logged as a warning
    and does not change the response.
    """
    logger = getLogger()
    if 'file' not in request.files:
        return jsonify({'explanation': 'No file found'}), HTTP_400_BAD_REQUEST

    try:
        filepath = handle_upload(request.files['file'])
    except ValueError as e:
        # A plain ValueError has no ``msg`` attribute; fall back to str(e)
        # so this path returns the 400 instead of raising AttributeError.
        explanation = getattr(e, 'msg', str(e))
        return jsonify({'explanation': explanation}), HTTP_400_BAD_REQUEST

    try:
        response_data = extract_references_from_file(filepath)
        status = HTTP_200_OK
    except Exception as e:
        response_data = {'explanation': 'refextract failed: %s' % e}
        status = HTTP_500_INTERNAL_SERVER_ERROR
    finally:
        try:
            cleanup_upload(filepath)
        except IOError as e:
            # Pass the values as logging arguments. The previous
            # ``'...%s: %s' % filepath, e`` applied ``%`` to filepath alone
            # and raised TypeError.
            logger.warning('Could not remove file %s: %s', filepath, e)

    return jsonify(response_data), status
def test_get_number_header_lines_does_not_crash_on_final_empty_page(pdf_files):
    """Extraction from the fifth ``pdf_files`` entry must give a truthy result."""
    sample_pdf = pdf_files[4]
    extracted = extract_references_from_file(sample_pdf)
    assert extracted
from glob import glob
from os import path

from refextract import extract_references_from_file


def find_ext(dr, ext):
    """Return the paths in directory *dr* that match ``*.<ext>``."""
    return glob(path.join(dr, "*.{}".format(ext)))


list_of_pdfs = find_ext("PDFS_to_PROCESS", "pdf")

# Append the raw text of every extracted reference, one per line, to a single
# output file.
for pdf in list_of_pdfs:
    reference = extract_references_from_file(pdf)
    with open('drop_these_into_anystyledotio.txt', 'a') as biblio:
        for dictn in reference:
            raw = dictn['raw_ref']
            try:
                biblio.write("{}\n".format(str("".join(raw))))
            except UnicodeEncodeError:
                # Fall back to the list representation when the joined text
                # cannot be encoded.
                biblio.write("{}\n".format(raw))
            # No ``else`` branch: it ran after every successful write and
            # emitted each reference a second time.
'''
Programmer: Justin Faler
Date: 4/8/2020
Description: PDF Citation Map
'''
import csv
import os
import re
import sys
import time

import matplotlib as mpl
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pdfminer
from refextract import extract_references_from_file

# Document whose references are extracted; also used to derive ``name``.
PDF_PATH = '/home/user/Documents/Books/example.pdf'

plt.rcParams["figure.facecolor"] = "#333333"

citation_map = nx.DiGraph(directed=True)
references = extract_references_from_file(PDF_PATH)

# Drawing options - presumably passed to a networkx draw call later in the
# script (not visible in this fragment).
options = {
    'node_color': '#36DBCA',
    'node_size': 150,
    'width': 0.2,
    'alpha': 1,
    'arrowstyle': '-|>',
    'arrowsize': 15,
}

# File name of the PDF without directory and extension.
base = os.path.basename(PDF_PATH)
name = os.path.splitext(base)[0]
i = 1
import pdfx
from refextract import extract_references_from_file

# Earlier pdfx-based exploration, kept for reference:
# pdf = pdfx.PDFx("/Users/stephenbradshaw/Documents/codingTest/AutomaticKeyphraseExtraction-master/data/1/1.pdf")
# metadata = pdf.get_metadata()
#
# refs = pdf.get_references()
# ref_dict = pdf.get_references_as_dict()
#
# print(len(ref_dict))
# for k , v in ref_dict.items():
#     print(k , v)
# print(10*"=")
#
# metadata = pdf.get_metadata()
# print(metadata)
#
# print(10*"-=-")
# print(pdf.get_references_count())

reference = extract_references_from_file(
    "/Users/stephenbradshaw/Documents/codingTest/AutomaticKeyphraseExtraction-master/data/1/1.pdf"
)
# Print the variable that was actually assigned; ``references`` was undefined
# and raised NameError.
print(reference)