import srsly
from numpy.testing import assert_array_equal
from spacy.language import Language
from spacy.tests.util import assert_docs_equal
from spacy.tokens import Doc
from spacy_transformers.data_classes import TransformerData


def test_serialize_transformer_data():
    # spacy-transformers registers msgpack hooks for its dataclasses, so the
    # roundtrip below preserves the TransformerData type.
    data = {"x": TransformerData.empty()}
    bytes_data = srsly.msgpack_dumps(data)
    new_data = srsly.msgpack_loads(bytes_data)
    assert isinstance(new_data["x"], TransformerData)

    nlp = Language()
    trf = nlp.add_pipe(
        "transformer",
        config={
            "model": {
                "name": "distilbert-base-uncased",
                "transformer_config": {"output_attentions": True},
            }
        },
    )
    nlp.initialize()
    doc = nlp("This is a test.")
    b = doc.to_bytes()
    reloaded_doc = Doc(nlp.vocab)
    reloaded_doc.from_bytes(b)
    assert_docs_equal(doc, reloaded_doc)
    for key in doc._.trf_data.model_output:
        assert_array_equal(
            doc._.trf_data.model_output[key],
            reloaded_doc._.trf_data.model_output[key],
        )
def test_transformer_model_tobytes():
    nlp = Language()
    trf = nlp.add_pipe("transformer")
    nlp.initialize()
    trf_bytes = trf.to_bytes()

    nlp2 = Language()
    trf2 = nlp2.add_pipe("transformer")
    nlp2.initialize()
    trf2.from_bytes(trf_bytes)
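# The tests below reference a DEFAULT_CONFIG dict defined elsewhere in their
# test module. A plausible stand-in based on the transformer pipe's documented
# default model settings (an assumption, not necessarily the module's value):
DEFAULT_CONFIG = {
    "model": {
        "@architectures": "spacy-transformers.TransformerModel.v3",
        "name": "roberta-base",
        "tokenizer_config": {"use_fast": True},
    }
}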
from spacy.util import make_tempdir


def test_initialized_transformer_todisk():
    nlp = Language()
    trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
    nlp.initialize()
    with make_tempdir() as d:
        trf.to_disk(d)
        nlp2 = Language()
        trf2 = nlp2.add_pipe("transformer", config=DEFAULT_CONFIG)
        trf2.from_disk(d)
def test_initialized_transformer_tobytes():
    nlp = Language()
    trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
    nlp.initialize()
    trf_bytes = trf.to_bytes()

    nlp2 = Language()
    trf2 = nlp2.add_pipe("transformer", config=DEFAULT_CONFIG)
    trf2.from_bytes(trf_bytes)
from typing import Generator, Iterator, Tuple

import spacy

import pipe_pubmed  # project-local module providing AbstractMetadata


def get_raw_doc_generator(
    abstract_generator: Iterator[Tuple[str, pipe_pubmed.AbstractMetadata]],
    nlp: spacy.Language,
) -> Generator[Tuple[spacy.tokens.Doc, pipe_pubmed.AbstractMetadata], None, None]:
    """Take tuples of (abstract_text, metadata) and yield tuples of (Doc, metadata),
    where Doc is the spaCy doc corresponding to the original text.

    Args:
        abstract_generator (Iterator[Tuple[str, pipe_pubmed.AbstractMetadata]]):
            Iterator of tuples where the first element is the abstract text and
            the second element is a dict with abstract metadata.
        nlp (spacy.Language): spaCy Language object.

    Yields:
        Generator[Tuple[spacy.tokens.Doc, pipe_pubmed.AbstractMetadata], None, None]:
            Generator that yields tuples of
            1. spaCy docs containing the abstract text
            2. the same metadata as in the tuples received as argument
    """
    # as_tuples=True makes nlp.pipe pass the metadata through unchanged;
    # n_process=-1 uses all available cores.
    yield from nlp.pipe(abstract_generator, as_tuples=True, n_process=-1)
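# A minimal usage sketch for get_raw_doc_generator, assuming
# pipe_pubmed.AbstractMetadata is a plain metadata dict; the sample texts and
# the "pmid" key are hypothetical.
import spacy

nlp = spacy.load("en_core_web_sm")
abstracts = iter(
    [
        ("Aspirin reduces fever in adults.", {"pmid": "12345"}),
        ("Ibuprofen relieves mild pain.", {"pmid": "67890"}),
    ]
)
for doc, meta in get_raw_doc_generator(abstracts, nlp):
    print(meta["pmid"], len(doc))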
from spacy.language import Language
from spacy.training import Example


def reader(nlp: Language):
    # `annots` is an annotation dict defined in the enclosing scope.
    doc = nlp.make_doc("This is an example")
    return [Example.from_dict(doc, annots)]
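# A callable with reader's signature fits spaCy's corpus-reader pattern, so it
# can be registered and referenced from the [corpora] block of a training
# config (@readers = "example_reader.v1"). A minimal sketch; the registry name
# and annotation dict are hypothetical.
import spacy
from spacy.language import Language
from spacy.training import Example

annots = {"cats": {"POSITIVE": 1.0}}  # hypothetical annotations


@spacy.registry.readers("example_reader.v1")  # hypothetical registry name
def create_reader():
    def reader(nlp: Language):
        doc = nlp.make_doc("This is an example")
        return [Example.from_dict(doc, annots)]

    return reader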
def test_transformer_pipeline_tobytes():
    nlp = Language()
    nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
    nlp.initialize()
    assert nlp.pipe_names == ["transformer"]
    nlp_bytes = nlp.to_bytes()

    nlp2 = Language()
    nlp2.add_pipe("transformer", config=DEFAULT_CONFIG)
    nlp2.from_bytes(nlp_bytes)
    assert nlp2.pipe_names == ["transformer"]
import os
from typing import Any

import spacy
from spacy.tokens import Doc

# `console` (a rich console), `BaseParser`, the `pdfminer` parser module, and
# `_get_number_of_pages` are helpers from elsewhere in the spacypdfreader package.


def pdf_reader(
    pdf_path: str,
    nlp: spacy.Language,
    pdf_parser: BaseParser = pdfminer.PdfminerParser,
    verbose: bool = False,
    **kwargs: Any,
) -> spacy.tokens.Doc:
    """Convert a PDF document to a spaCy Doc object.

    Args:
        pdf_path: Path to a PDF file.
        nlp: A spaCy Language object with a loaded pipeline. For example
            `spacy.load("en_core_web_sm")`.
        pdf_parser: The parser to convert the PDF file to text. Read the docs
            for more details. Defaults to `pdfminer.PdfminerParser`.
        verbose: If True, details will be printed to the terminal. By default,
            False.
        **kwargs: Arbitrary keyword arguments passed on to the parser.

    Returns:
        A spaCy Doc object with the custom extensions.

    Examples:
        By default pdfminer is used to extract text from the PDF.

        >>> import spacy
        >>> from spacypdfreader import pdf_reader
        >>>
        >>> nlp = spacy.load("en_core_web_sm")
        >>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp)

        To be more explicit, import `PdfminerParser` and pass it into the
        `pdf_reader` function.

        >>> import spacy
        >>> from spacypdfreader import pdf_reader
        >>> from spacypdfreader.parsers.pdfminer import PdfminerParser
        >>>
        >>> nlp = spacy.load("en_core_web_sm")
        >>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp, PdfminerParser)

        Alternative parsers can be used as well, such as pytesseract.

        >>> import spacy
        >>> from spacypdfreader import pdf_reader
        >>> from spacypdfreader.parsers.pytesseract import PytesseractParser
        >>>
        >>> nlp = spacy.load("en_core_web_sm")
        >>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp, PytesseractParser)

        For more fine-tuning you can pass additional parameters to pytesseract.

        >>> import spacy
        >>> from spacypdfreader import pdf_reader
        >>> from spacypdfreader.parsers.pytesseract import PytesseractParser
        >>>
        >>> nlp = spacy.load("en_core_web_sm")
        >>> params = {"nice": 1}
        >>> doc = pdf_reader("tests/data/test_pdf_01.pdf", nlp, PytesseractParser, **params)
    """
    if verbose:
        console.print(f"PDF to text engine: [blue bold]{pdf_parser.name}[/]...")

    pdf_path = os.path.normpath(pdf_path)
    num_pages = _get_number_of_pages(pdf_path)

    # Convert pdf to text, one page at a time.
    if verbose:
        console.print(f"Extracting text from {num_pages} pdf pages...")
    texts = []
    for page_num in range(1, num_pages + 1):
        parser = pdf_parser(pdf_path, page_num)
        text = parser.pdf_to_text(**kwargs)
        texts.append(text)

    # Convert text to spaCy Doc objects, tagging each token with its page.
    if verbose:
        console.print("Converting text to [blue bold]spaCy[/] Doc...")
    docs = list(nlp.pipe(texts))
    for idx, doc in enumerate(docs):
        page_num = idx + 1
        for token in doc:
            token._.page_number = page_num
    combined_doc = Doc.from_docs(docs)
    combined_doc._.pdf_file_name = pdf_path

    if verbose:
        console.print(":white_check_mark: [green]Complete!")

    return combined_doc
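# pdf_reader assigns to the page_number and pdf_file_name custom extensions,
# which spacypdfreader registers on import. If wiring up a similar function
# yourself, the registration would look roughly like this sketch (not the
# package's exact code):
from spacy.tokens import Doc, Token

if not Token.has_extension("page_number"):
    Token.set_extension("page_number", default=None)
if not Doc.has_extension("pdf_file_name"):
    Doc.set_extension("pdf_file_name", default=None)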
import spacy


def convert_record(nlp: spacy.Language, text: str, label: dict):
    """Convert a record from the TSV into a spaCy Doc object."""
    doc = nlp.make_doc(text)
    doc.cats = label
    return doc
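# Docs produced by convert_record are typically collected into a DocBin for
# textcat training. A minimal sketch; the file name and two-column TSV layout
# are hypothetical.
import csv

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin()
with open("records.tsv", newline="", encoding="utf-8") as f:
    for text, label in csv.reader(f, delimiter="\t"):
        doc_bin.add(convert_record(nlp, text, {label: 1.0}))
doc_bin.to_disk("records.spacy")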