Example #1
import srsly
from numpy.testing import assert_array_equal
from spacy.language import Language
from spacy.tests.util import assert_docs_equal
from spacy.tokens import Doc
from spacy_transformers.data_classes import TransformerData


def test_serialize_transformer_data():
    data = {"x": TransformerData.empty()}
    bytes_data = srsly.msgpack_dumps(data)
    new_data = srsly.msgpack_loads(bytes_data)
    assert isinstance(new_data["x"], TransformerData)

    nlp = Language()
    trf = nlp.add_pipe(
        "transformer",
        config={
            "model": {
                "name": "distilbert-base-uncased",
                "transformer_config": {
                    "output_attentions": True
                },
            }
        },
    )
    nlp.initialize()
    doc = nlp("This is a test.")
    b = doc.to_bytes()
    reloaded_doc = Doc(nlp.vocab)
    reloaded_doc.from_bytes(b)
    assert_docs_equal(doc, reloaded_doc)
    for key in doc._.trf_data.model_output:
        assert_array_equal(doc._.trf_data.model_output[key],
                           reloaded_doc._.trf_data.model_output[key])
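
A DocBin can persist many such docs, transformer output included. A minimal
sketch, assuming the same initialized `nlp` pipeline as in the test above
(store_user_data=True is needed so the doc._.trf_data extension survives):

from spacy.tokens import DocBin

# Sketch: round-trip docs together with their transformer output.
doc_bin = DocBin(store_user_data=True)  # keep doc._.trf_data (user data)
doc_bin.add(nlp("This is a test."))
restored = list(
    DocBin(store_user_data=True)
    .from_bytes(doc_bin.to_bytes())
    .get_docs(nlp.vocab)
)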
Example #2

def test_transformer_model_tobytes():
    nlp = Language()
    trf = nlp.add_pipe("transformer")
    nlp.initialize()
    trf_bytes = trf.to_bytes()

    nlp2 = Language()
    trf2 = nlp2.add_pipe("transformer")
    nlp2.initialize()
    trf2.from_bytes(trf_bytes)
Example #3
def test_initialized_transformer_todisk():
    nlp = Language()
    trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
    nlp.initialize()
    with make_tempdir() as d:
        trf.to_disk(d)
        nlp2 = Language()
        trf2 = nlp2.add_pipe("transformer", config=DEFAULT_CONFIG)
        trf2.from_disk(d)
Example #4
def test_initialized_transformer_tobytes():
    nlp = Language()
    trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
    nlp.initialize()
    trf_bytes = trf.to_bytes()

    nlp2 = Language()
    trf2 = nlp2.add_pipe("transformer", config=DEFAULT_CONFIG)
    trf2.from_bytes(trf_bytes)
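
Examples #3 and #4 (and #7 below) import DEFAULT_CONFIG from the
spacy-transformers test suite rather than defining it. A plausible stand-in,
assuming the TransformerModel.v3 architecture and strided-spans span getter
that spacy-transformers registers; the exact values in the suite may differ:

# Hypothetical stand-in for the DEFAULT_CONFIG used by the tests above.
DEFAULT_CONFIG = {
    "model": {
        "@architectures": "spacy-transformers.TransformerModel.v3",
        "name": "distilbert-base-uncased",
        "tokenizer_config": {"use_fast": True},
        "get_spans": {
            "@span_getters": "spacy-transformers.strided_spans.v1",
            "window": 128,
            "stride": 96,
        },
    }
}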
Example #5
def get_raw_doc_generator(
    abstract_generator: Iterator[Tuple[str, pipe_pubmed.AbstractMetadata]],
    nlp: spacy.Language,
) -> Generator[Tuple[spacy.tokens.Doc, pipe_pubmed.AbstractMetadata], None,
               None]:
    """Get tuples of (abstract_text, metadata) and yield tuples of (Doc, metadata) where Doc is the spacy doc corresponding to the original text.

    Args:
        abstract_generator (Iterator[Tuple[str, pipe_pubmed.AbstractMetadata]]): iterator of tuples where the first element is abstract text and second element is a
        dict with abstract metadata
        nlp (spacy.Language): Spacy Language object

    Yields:
        Generator[Tuple[spacy.tokens.Doc, pipe_pubmed.AbstractMetadata], None, None]: Generator that yields tuples of
            1. spacy docs containing the abstract text
            2. the same metadata as in the tuples received as argument
    """
    yield from nlp.pipe(abstract_generator, as_tuples=True, n_process=-1)
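
A hypothetical usage sketch, with plain dicts standing in for
pipe_pubmed.AbstractMetadata (n_process=-1 asks spaCy to use all CPU cores):

import spacy

nlp = spacy.blank("en")  # stand-in; the real project pipeline may differ
abstracts = iter([
    ("Aspirin reduces fever.", {"pmid": "12345"}),    # invented metadata
    ("Ibuprofen relieves pain.", {"pmid": "67890"}),
])
for doc, metadata in get_raw_doc_generator(abstracts, nlp):
    print(doc.text, metadata["pmid"])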
Example #6

def reader(nlp: Language):
    annots = {"cats": {"POS": 1.0, "NEG": 0.0}}  # inlined; the original test defined this in the enclosing scope
    doc = nlp.make_doc("This is an example")
    return [Example.from_dict(doc, annots)]
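
Readers like this are usually registered so that a training config can refer
to them by name. A minimal sketch, using the hypothetical registry name
"my_reader.v1":

from spacy.language import Language
from spacy.training import Example
from spacy.util import registry


@registry.readers("my_reader.v1")  # hypothetical name
def create_reader():
    annots = {"cats": {"POS": 1.0, "NEG": 0.0}}

    def reader(nlp: Language):
        doc = nlp.make_doc("This is an example")
        return [Example.from_dict(doc, annots)]

    return reader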
Example #7
def test_transformer_pipeline_tobytes():
    nlp = Language()
    nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
    nlp.initialize()
    assert nlp.pipe_names == ["transformer"]
    nlp_bytes = nlp.to_bytes()

    nlp2 = Language()
    nlp2.add_pipe("transformer", config=DEFAULT_CONFIG)
    nlp2.from_bytes(nlp_bytes)
    assert nlp2.pipe_names == ["transformer"]
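
The same round trip works on disk. A minimal sketch, reusing the
make_tempdir helper and DEFAULT_CONFIG from the examples above:

nlp = Language()
nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
nlp.initialize()
with make_tempdir() as d:
    nlp.to_disk(d)
    nlp2 = Language()
    nlp2.add_pipe("transformer", config=DEFAULT_CONFIG)
    nlp2.from_disk(d)
    assert nlp2.pipe_names == ["transformer"]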
Example #8
def pdf_reader(
    pdf_path: str,
    nlp: spacy.Language,
    pdf_parser: BaseParser = pdfminer.PdfminerParser,
    verbose: bool = False,
    **kwargs: Any,
) -> spacy.tokens.Doc:
    """Convert a PDF document to a spaCy Doc object.

    Args:
        pdf_path: Path to a PDF file.
        nlp: A spaCy Language object with a loaded pipeline. For example
            `spacy.load("en_core_web_sm")`.
        pdf_parser: The parser used to convert the PDF file to text. Read the
            docs for more details. Defaults to pdfminer.PdfminerParser.
        verbose: If True, details will be printed to the terminal. Defaults
            to False.
        **kwargs: Arbitrary keyword arguments.

    Returns:
        A spaCy Doc object with the custom extensions.

    Examples:
        By default, pdfminer is used to extract text from the PDF.

        >>> import spacy
        >>> from spacypdfreader import pdf_reader
        >>>
        >>> nlp = spacy.load("en_core_web_sm")
        >>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp)

        To be more explicit import `PdfminerParser` and pass it into the
        `pdf_reader` function.

        >>> import spacy
        >>> from spacypdfreader import pdf_reader
        >>> from spacypdfreader.parsers.pdfminer import PdfminerParser
        >>>
        >>> nlp = spacy.load("en_core_web_sm")
        >>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp, PdfminerParser)

        Alternative parsers can be used as well such as pytesseract.

        >>> import spacy
        >>> from spacypdfreader import pdf_reader
        >>> from spacypdfreader.parsers.pytesseract import PytesseractParser
        >>>
        >>> nlp = spacy.load("en_core_web_sm")
        >>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp, PytesseractParser)

        For more fine tuning you can pass in additional parameters to
        pytesseract.

        >>> import spacy
        >>> from spacypdfreader import pdf_reader
        >>> from spacypdfreader.parsers.pytesseract import PytesseractParser
        >>>
        >>> nlp = spacy.load("en_core_web_sm")
        >>> params = {"nice": 1}
        >>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp, PytesseractParser, **params)
    """
    if verbose:
        console.print(f"PDF to text engine: [blue bold]{pdf_parser.name}[/]...")

    pdf_path = os.path.normpath(pdf_path)
    num_pages = _get_number_of_pages(pdf_path)

    # Convert pdf to text.
    if verbose:
        console.print(f"Extracting text from {num_pages} pdf pages...")
    texts = []
    for page_num in range(1, num_pages + 1):
        parser = pdf_parser(pdf_path, page_num)
        text = parser.pdf_to_text(**kwargs)
        texts.append(text)

    # Convert text to spaCy Doc objects.
    if verbose:
        console.print("Converting text to [blue bold]spaCy[/] Doc...")

    docs = list(nlp.pipe(texts))
    for idx, doc in enumerate(docs):
        page_num = idx + 1
        for token in doc:
            token._.page_number = page_num

    combined_doc = Doc.from_docs(docs)
    combined_doc._.pdf_file_name = pdf_path

    if verbose:
        console.print(":white_check_mark: [green]Complete!")

    return combined_doc
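
pdf_reader relies on the custom extensions token._.page_number and
doc._.pdf_file_name, which spacypdfreader sets up when it is imported. An
assumed minimal stand-in for that registration:

from spacy.tokens import Doc, Token

# Assumption: the real package performs an equivalent setup on import.
if not Token.has_extension("page_number"):
    Token.set_extension("page_number", default=None)
if not Doc.has_extension("pdf_file_name"):
    Doc.set_extension("pdf_file_name", default=None)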
Example #9
def convert_record(nlp: spacy.Language, text: str, label: dict):
    """Convert a record from the tsv into a spaCy Doc object."""
    doc = nlp.make_doc(text)
    doc.cats = label
    return doc
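
A sketch of how such converted docs are typically collected into training
data for spacy train, with invented rows standing in for the TSV contents:

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
rows = [("great movie", {"POS": 1.0, "NEG": 0.0})]  # hypothetical TSV rows
db = DocBin()
for text, label in rows:
    db.add(convert_record(nlp, text, label))
db.to_disk("./train.spacy")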