def process_pdf():
    """Process an uploaded PDF with GROBID and re-link its paragraphs.

    Reads the uploaded file from the Flask request, runs it through the
    GROBID service, then replaces every span-bearing paragraph with the
    sentence-level entries produced by the rule-based linker.

    Returns:
        The (mutated) GROBID result as a JSON-compatible dict.
    """
    file = request.files['input']
    grobid = grobid_client_generic(config_path="./config.json")
    # Context manager guarantees the temp file is flushed to disk before
    # GROBID opens it by name, and is removed afterwards.  The original
    # neither flushed nor closed it, so GROBID could read a partial or
    # empty file and the handle leaked.
    with NamedTemporaryFile() as tf:
        tf.write(file.read())
        tf.flush()
        result_text = grobid.process_pdf(
            tf.name,
            'processPDF',
            params={'disableLinking': 'true'},
            headers={'Accept': 'application/json'})
    result_json = json.loads(result_text)

    new_paragraphs = []
    for paragraph in result_json['paragraphs']:
        if 'spans' not in paragraph:
            # Nothing to link; keep the paragraph untouched.
            new_paragraphs.append(paragraph)
            continue
        # The linker splits a paragraph into sentence-level entries.
        new_paragraphs.extend(RuleBasedLinker().process_paragraph(paragraph))

    result_json['paragraphs'] = new_paragraphs
    return result_json
def __init__(self, config_path, verbose=False):
    """Load JSON configuration and initialise the GROBID client.

    Args:
        config_path: Path to a JSON configuration file.
        verbose: When True, print progress information to stdout.
    """
    self.verbose = verbose
    # Context manager closes the config file; the original leaked the
    # handle via open(config_path).read().  json.load reads the stream
    # directly, same result as loads(read()).
    with open(config_path) as config_file:
        self.config = json.load(config_file)
    if verbose:
        print("Configuration: ", self.config)
    self.grobid_client = grobid_client_generic()
    self.grobid_client.set_config(self.config, ping=True)
    if verbose:
        print("Checking indexes")
    self.ensure_indexes()
    if verbose:
        print("Init completed.")
def run_linking_crf(paragraphs):
    """Run the CRF linker over *paragraphs* and collect predicted links.

    For each paragraph, its annotated spans are serialised back into the
    paragraph text as inline tags (e.g. ``<material id='3'>...</material>``)
    and sent to the 'linker' service.  Paragraphs lacking either a
    <material> or a <tcValue> span are skipped, since a link needs both.

    Returns:
        A flat list of predicted links from all processed paragraphs.
    """
    predicted_links = []
    for paragraph in paragraphs:
        # Hoist the span lookup; the original recomputed
        # "paragraph['spans'] if 'spans' in paragraph else []" four times.
        spans = paragraph.get('spans', [])
        if not any(span['type'] == "<material>" for span in spans) \
                or not any(span['type'] == "<tcValue>" for span in spans):
            continue
        output_text = ""
        offset = 0
        for span in spans:
            # Plain text between the previous span and this one.
            output_text += escape(paragraph['text'][offset:span['offsetStart']])
            offset = span['offsetStart']
            # Open tag carries the span id, e.g. "<material id='3'>".
            output_text += span['type'].replace(
                ">", " id='" + str(span['id']) + "'>")
            if span['text'].endswith(" "):
                # Keep a trailing space outside the closing tag so the
                # tagged text stays well-formed.
                output_text += escape(span['text'][0:-1]) \
                    + span['type'].replace("<", "</") + " "
            else:
                output_text += escape(span['text']) \
                    + span['type'].replace("<", "</")
            offset += len(span['text'])
        output_text += escape(paragraph['text'][offset:])
        output = json.loads(
            grobid_client_generic().process_text(output_text, 'linker'))
        predicted_links.extend(extract_predicted_links(output[0]))
    return predicted_links
# Script to extract superconductor and materials name from PDFs
import argparse
import csv
import json
import os
import re
import sys  # used by the CLI section below (sys.exit on invalid --output)
import traceback
from difflib import SequenceMatcher
from pathlib import Path

from grobid_client_generic import grobid_client_generic

grobid_client = grobid_client_generic(config_path='./config.json')


def decode(response_string):
    """Parse a JSON response string.

    Returns:
        The decoded object, or a human-readable error string when the
        input is not valid JSON (ValueError, which covers
        json.JSONDecodeError) or not a string-like object (TypeError).
    """
    try:
        return json.loads(response_string)
    except ValueError as e:
        return "Value error: " + str(e)
    except TypeError as te:
        return "Type error: " + str(te)


def process_file(source_path, type="pdf"):
    output_classes = []
    output_classes_from_materials = []
    materials = []
    materials_from_abstract = []
    materials_from_body = []
    materials_from_keywords = []
# NOTE(review): mid-section of the CLI entry point — the argparse setup above
# and the directory walk below are cut off at this chunk's boundaries.
# sys.exit(-1) here requires "import sys" at the top of the file — verify.
help="Output format.") parser.add_argument("--task", default='processPDF', choices=['processPDF', 'processPDF_disableLinking'], help="Tasks to be executed.") args = parser.parse_args() input_path = args.input output_path = args.output recursive = args.recursive format = args.format config = args.config task = args.task grobid_client = grobid_client_generic(config_path=config) if os.path.isdir(input_path): if not os.path.isdir(output_path): print("--output should specify always a directory") sys.exit(-1) path_list = [] if recursive: for root, dirs, files in os.walk(input_path): # Manage to create the directories for dir in dirs: abs_path_dir = os.path.join(root, dir) abs_output_path = abs_path_dir.replace( str(input_path), str(output_path)) if not os.path.exists(abs_output_path):