def process_pdf():
    """Process an uploaded PDF with GROBID and re-link its paragraphs.

    Reads the uploaded file from the Flask request, runs it through the
    GROBID service, then replaces every span-bearing paragraph with the
    sentence-level entries produced by the rule-based linker.

    Returns:
        The (mutated) GROBID result as a JSON-compatible dict.
    """
    file = request.files['input']
    grobid = grobid_client_generic(config_path="./config.json")
    # Context manager guarantees the temp file is flushed to disk before
    # GROBID opens it by name, and is removed afterwards.  The original
    # neither flushed nor closed it, so GROBID could read a partial or
    # empty file and the handle leaked.
    with NamedTemporaryFile() as tf:
        tf.write(file.read())
        tf.flush()
        result_text = grobid.process_pdf(
            tf.name,
            'processPDF',
            params={'disableLinking': 'true'},
            headers={'Accept': 'application/json'})
    result_json = json.loads(result_text)

    new_paragraphs = []
    for paragraph in result_json['paragraphs']:
        if 'spans' not in paragraph:
            # Nothing to link; keep the paragraph untouched.
            new_paragraphs.append(paragraph)
            continue
        # The linker splits a paragraph into sentence-level entries.
        new_paragraphs.extend(RuleBasedLinker().process_paragraph(paragraph))

    result_json['paragraphs'] = new_paragraphs
    return result_json
def __init__(self, config_path, verbose=False):
    """Load JSON configuration and initialise the GROBID client.

    Args:
        config_path: Path to a JSON configuration file.
        verbose: When True, print progress information to stdout.
    """
    self.verbose = verbose
    # Context manager closes the config file; the original leaked the
    # handle via open(config_path).read().  json.load reads the stream
    # directly, same result as loads(read()).
    with open(config_path) as config_file:
        self.config = json.load(config_file)
    if verbose:
        print("Configuration: ", self.config)
    self.grobid_client = grobid_client_generic()
    self.grobid_client.set_config(self.config, ping=True)
    if verbose:
        print("Checking indexes")
    self.ensure_indexes()
    if verbose:
        print("Init completed.")
def run_linking_crf(paragraphs):
    """Run the CRF linker over *paragraphs* and collect predicted links.

    For each paragraph, its annotated spans are serialised back into the
    paragraph text as inline tags (e.g. ``<material id='3'>...</material>``)
    and sent to the 'linker' service.  Paragraphs lacking either a
    <material> or a <tcValue> span are skipped, since a link needs both.

    Returns:
        A flat list of predicted links from all processed paragraphs.
    """
    predicted_links = []
    for paragraph in paragraphs:
        # Hoist the span lookup; the original recomputed
        # "paragraph['spans'] if 'spans' in paragraph else []" four times.
        spans = paragraph.get('spans', [])
        if not any(span['type'] == "<material>" for span in spans) \
                or not any(span['type'] == "<tcValue>" for span in spans):
            continue
        output_text = ""
        offset = 0
        for span in spans:
            # Plain text between the previous span and this one.
            output_text += escape(paragraph['text'][offset:span['offsetStart']])
            offset = span['offsetStart']
            # Open tag carries the span id, e.g. "<material id='3'>".
            output_text += span['type'].replace(
                ">", " id='" + str(span['id']) + "'>")
            if span['text'].endswith(" "):
                # Keep a trailing space outside the closing tag so the
                # tagged text stays well-formed.
                output_text += escape(span['text'][0:-1]) \
                    + span['type'].replace("<", "</") + " "
            else:
                output_text += escape(span['text']) \
                    + span['type'].replace("<", "</")
            offset += len(span['text'])
        output_text += escape(paragraph['text'][offset:])
        output = json.loads(
            grobid_client_generic().process_text(output_text, 'linker'))
        predicted_links.extend(extract_predicted_links(output[0]))
    return predicted_links
# Script to extract superconductor and materials name from PDFs
import argparse
import csv
import json
import os
import re
import sys  # used by the CLI section below (sys.exit on invalid --output)
import traceback
from difflib import SequenceMatcher
from pathlib import Path

from grobid_client_generic import grobid_client_generic

grobid_client = grobid_client_generic(config_path='./config.json')


def decode(response_string):
    """Parse a JSON response string.

    Returns:
        The decoded object, or a human-readable error string when the
        input is not valid JSON (ValueError, which covers
        json.JSONDecodeError) or not a string-like object (TypeError).
    """
    try:
        return json.loads(response_string)
    except ValueError as e:
        return "Value error: " + str(e)
    except TypeError as te:
        return "Type error: " + str(te)


def process_file(source_path, type="pdf"):
    output_classes = []
    output_classes_from_materials = []
    materials = []
    materials_from_abstract = []
    materials_from_body = []
    materials_from_keywords = []
# NOTE(review): mid-section of the CLI entry point — the argparse setup above
# and the directory walk below are cut off at this chunk's boundaries.
# sys.exit(-1) here requires "import sys" at the top of the file — verify.
help="Output format.") parser.add_argument("--task", default='processPDF', choices=['processPDF', 'processPDF_disableLinking'], help="Tasks to be executed.") args = parser.parse_args() input_path = args.input output_path = args.output recursive = args.recursive format = args.format config = args.config task = args.task grobid_client = grobid_client_generic(config_path=config) if os.path.isdir(input_path): if not os.path.isdir(output_path): print("--output should specify always a directory") sys.exit(-1) path_list = [] if recursive: for root, dirs, files in os.walk(input_path): # Manage to create the directories for dir in dirs: abs_path_dir = os.path.join(root, dir) abs_output_path = abs_path_dir.replace( str(input_path), str(output_path)) if not os.path.exists(abs_output_path):