def test_should_convert_multiple_article_authors_of_single_reference(
        self, scienceparse_jats_xslt):
    """Both article authors of one reference should appear as <name>
    entries within the citation's person-group, preserving order."""
    authors = [AUTHOR_1, AUTHOR_2]
    author_names = [
        '%s %s' % (author['first-name'], author['last-name'])
        for author in authors
    ]
    jats = etree.fromstring(scienceparse_jats_xslt({
        'references': [
            extend_dict(REFERENCE_1, {'authors': author_names})
        ]
    }))
    element_citation = _get_item(
        _get_item(_get_item(jats, 'back/ref-list'), 'ref'),
        'element-citation'
    )
    person_group = _get_item(element_citation, 'person-group')
    persons = person_group.xpath('name')
    assert len(persons) == 2
    for person, expected_author in zip(persons, authors):
        assert _get_text(person, 'surname') == expected_author['last-name']
        assert _get_text(person, 'given-names') == expected_author['first-name']
def add_read_source_to_extracted_xml_pipeline_steps(p, opt, get_pipeline_output_file):
    """Add pipeline steps that produce extracted XML, either from a list of
    pre-annotated LXML files or by converting/annotating PDFs.

    Args:
        p: the root Beam pipeline object.
        opt: parsed pipeline options (reads lxml_file_list, lxml_file_column,
            limit, use_grobid, grobid_url, start_grobid_service).
        get_pipeline_output_file: callable mapping (source_url, ext) to an
            output file path; passed through to the PDF branch.

    Returns:
        A PCollection of dicts carrying DataProps.EXTRACTED_XML
        (DataProps.STRUCTURED_DOCUMENT is removed after extraction).
    """
    if opt.lxml_file_list:
        # Branch 1: LXML documents already exist; just read and load them.
        lxml_urls = p | ReadFileList(
            opt.lxml_file_list, column=opt.lxml_file_column, limit=opt.limit)
        annotated_lxml = (
            lxml_urls |
            PreventFusion() |
            "ReadLxmlContent" >> TransformAndCount(
                MapOrLog(
                    lambda url: {
                        DataProps.SOURCE_FILENAME: url,
                        DataProps.STRUCTURED_DOCUMENT: load_structured_document(url)
                    },
                    error_count=MetricCounters.READ_LXML_ERROR
                ),
                MetricCounters.FILES
            )
        )
        # No tag scope restriction when reading pre-annotated LXML.
        extract_tag_scope = None
    else:
        # Branch 2: start from PDFs; that helper also decides the tag scope
        # (CV or CRF) used during extraction below.
        annotated_lxml, extract_tag_scope = add_read_pdfs_to_annotated_lxml_pipeline_steps(
            p, opt, get_pipeline_output_file)
    # Convert the structured document to XML, dropping the (large)
    # structured document from the item afterwards.
    extracted_xml = (
        annotated_lxml |
        "ExtractToXml" >> MapOrLog(
            lambda v: remove_keys_from_dict(
                extend_dict(v, {
                    DataProps.EXTRACTED_XML: extract_annotated_structured_document_to_xml(
                        v[DataProps.STRUCTURED_DOCUMENT],
                        tag_scope=extract_tag_scope
                    )
                }),
                keys_to_remove={DataProps.STRUCTURED_DOCUMENT}
            ),
            error_count=MetricCounters.EXTRACT_TO_XML_ERROR
        )
    )
    if opt.use_grobid:
        # Optionally post-process the extracted XML through Grobid.
        enhancer = GrobidXmlEnhancer(
            opt.grobid_url, start_service=opt.start_grobid_service)
        extracted_xml = (
            extracted_xml |
            "GrobidEnhanceXml" >> MapOrLog(
                lambda v: extend_dict(v, {
                    DataProps.EXTRACTED_XML: enhancer(v[DataProps.EXTRACTED_XML])
                }),
                error_count=MetricCounters.GROBID_ERROR
            )
        )
    return extracted_xml
def test_should_use_process_header_if_includes_only_contains_header(
        self, config, args, grobid_service_instance):
    """When no grobid action is set and only TITLE is requested,
    the header-only Grobid endpoint should be chosen."""
    args.grobid_action = None
    step_input = extend_dict(
        PDF_INPUT, {StepDataProps.INCLUDES: {FieldNames.TITLE}})
    _run_pipeline(config, args, step_input)
    grobid_service_instance.assert_called_with(
        (PDF_INPUT['filename'], PDF_INPUT['content']),
        path=GrobidApiPaths.PROCESS_HEADER_DOCUMENT)
def test_should_use_process_full_text_if_includes_only_contains_references(
        self, config, args, grobid_service_instance):
    """When no grobid action is set and only REFERENCES is requested,
    the full-text Grobid endpoint should be chosen."""
    args.grobid_action = None
    step_input = extend_dict(
        PDF_INPUT, {StepDataProps.INCLUDES: {FieldNames.REFERENCES}})
    _run_pipeline(config, args, step_input)
    grobid_service_instance.assert_called_with(
        (PDF_INPUT['filename'], PDF_INPUT['content']),
        path=GrobidApiPaths.PROCESS_FULL_TEXT_DOCUMENT)
def test_should_convert_venue_as_source(self, scienceparse_jats_xslt):
    """The reference 'venue' field should map to the JATS <source> element."""
    xml = scienceparse_jats_xslt({
        'references': [extend_dict(REFERENCE_1, {'venue': VALUE_1})]
    })
    jats = etree.fromstring(xml)
    ref = _get_item(_get_item(jats, 'back/ref-list'), 'ref')
    element_citation = _get_item(ref, 'element-citation')
    assert _get_text(element_citation, 'source') == VALUE_1
def test_should_convert_single_page_no(self, grobid_jats_xslt):
    """A single page number should populate both <fpage> and <lpage>."""
    tei = _tei(references=[
        _reference(**extend_dict(REFERENCE_1, page='page1'))
    ])
    jats = etree.fromstring(grobid_jats_xslt(tei))
    citation = _get_item(
        _get_item(_get_item(jats, 'back/ref-list'), 'ref'),
        'element-citation'
    )
    assert _get_text(citation, 'fpage') == 'page1'
    assert _get_text(citation, 'lpage') == 'page1'
def add_read_pdfs_to_grobid_xml_pipeline_steps(p, opt):
    """Build pipeline steps that read PDF content and attach the XML
    produced by the Grobid service for each item."""
    grobid_transformer = grobid_service(
        opt.grobid_url, opt.grobid_action,
        start_service=opt.start_grobid_service)

    def _add_extracted_xml(item):
        # the transformer returns a pair; the second element is the XML
        extracted = grobid_transformer(
            (item[DataProps.SOURCE_FILENAME], item[DataProps.PDF_CONTENT]))[1]
        return extend_dict(item, {DataProps.EXTRACTED_XML: extracted})

    pdf_content = (
        p |
        PdfUrlSource(opt) |
        PreventFusion() |
        ReadPdfContent()
    )
    return pdf_content | "Grobid" >> MapOrLog(
        _add_extracted_xml, error_count=MetricCounters.GROBID_ERROR)
def test_should_convert_year_and_month(self, grobid_jats_xslt):
    """Year and month from the reference should map to <year> and <month>."""
    reference = _reference(**extend_dict(REFERENCE_1, year='2001', month='02'))
    jats = etree.fromstring(grobid_jats_xslt(_tei(references=[reference])))
    citation = _get_item(
        _get_item(_get_item(jats, 'back/ref-list'), 'ref'),
        'element-citation'
    )
    assert _get_text(citation, 'year') == '2001'
    assert _get_text(citation, 'month') == '02'
def wrapper(x):
    """Apply `step` to item `x` when its 'type' is supported, merging the
    step's output over the input dict; otherwise pass `x` through unchanged.

    NOTE(review): relies on the enclosing scope for `step`,
    `supported_types`, `processed_counter` and `ignored_counter`.
    """
    data_type = x['type']
    if data_type not in supported_types:
        # guard clause: unsupported type, count and pass through untouched
        get_logger().debug(
            'skipping step %s, %s not in supported types (%s)',
            step, data_type, supported_types
        )
        ignored_counter.inc()
        return x
    # fixed typo in log message: 'excuting' -> 'executing'
    get_logger().debug(
        'executing step %s: %s (%s)', step, x.keys(), data_type)
    # the step's returned dict is merged over the input item
    result = extend_dict(x, step(x))
    get_logger().debug(
        'result of step %s: %s (%s)',
        step, result.keys(), result.get('type'))
    processed_counter.inc()
    return result
def test_should_only_return_article_title_at_different_levels(
        self, grobid_jats_xslt, title_level):
    """The article title should be emitted as <article-title> regardless
    of the TEI title level."""
    reference = _reference(**extend_dict(
        REFERENCE_1,
        article_title=ARTICLE_TITLE_1,
        title_level=title_level))
    jats = etree.fromstring(grobid_jats_xslt(_tei(references=[reference])))
    citation = _get_item(
        _get_item(_get_item(jats, 'back/ref-list'), 'ref'),
        'element-citation'
    )
    assert _get_text(citation, 'article-title') == ARTICLE_TITLE_1
def configure_pipeline(p, opt, pipeline, config):
    """Wire up the full Beam pipeline: read inputs, determine their MIME
    type, apply each configured step in order, then log and save results.

    Args:
        p: the root Beam pipeline object.
        opt: parsed pipeline options (reads base_data_path, output_path,
            output_suffix).
        pipeline: object providing get_steps(config, opt).
        config: configuration passed through to the pipeline's steps.
    """
    # Maps a source URL + extension to the corresponding output file path.
    get_pipeline_output_file = lambda source_url, ext: get_output_file(
        source_url, opt.base_data_path, opt.output_path, ext
    )
    steps = pipeline.get_steps(config, opt)
    LOGGER.info('steps: %s', steps)
    input_data = (
        p |
        FileUrlSource(opt) |
        PreventFusion() |
        ReadFileContent() |
        # guess_type returns (type, encoding); only the type is kept
        "Determine Type" >> beam.Map(lambda d: extend_dict(d, {
            DataProps.TYPE: mimetypes.guess_type(d[DataProps.SOURCE_FILENAME])[0]
        }))
    )
    result = input_data
    # Chain each step's transform onto the growing pipeline.
    for step in steps:
        LOGGER.debug('step: %s', step)
        result |= get_step_transform(step)
    # Side branch: log every result item (keys and type only).
    _ = (
        result |
        beam.Map(lambda x: LOGGER.info('result: %s (%s)', x.keys(), x[DataProps.TYPE]))
    )
    # Side branch: persist each item's content to its output file.
    _ = (
        result |
        "WriteOutput" >> TransformAndLog(
            beam.Map(lambda v: save_file_content(
                get_pipeline_output_file(
                    v[DataProps.SOURCE_FILENAME], opt.output_suffix
                ),
                encode_if_text_type(v[DataProps.CONTENT])
            )),
            log_fn=lambda x: get_logger().info('saved output to: %s', x)
        )
    )
def test_should_only_return_article_title_even_if_collection_title_exists(
        self, grobid_jats_xslt):
    """If both article and collection titles exist, only the article
    title should be emitted as <article-title>."""
    reference = _reference(**extend_dict(
        REFERENCE_1,
        article_title=ARTICLE_TITLE_1,
        collection_title=COLLECTION_TITLE_1))
    jats = etree.fromstring(grobid_jats_xslt(_tei(references=[reference])))
    citation = _get_item(
        _get_item(_get_item(jats, 'back/ref-list'), 'ref'),
        'element-citation'
    )
    assert _get_text(citation, 'article-title') == ARTICLE_TITLE_1
def test_should_fallback_to_collection_title_if_article_title_does_not_exist(
        self, grobid_jats_xslt):
    """Without an article title, the collection title should be used
    as <article-title>."""
    reference = _reference(**extend_dict(
        REFERENCE_1,
        article_title=None,
        collection_title=COLLECTION_TITLE_1))
    jats = etree.fromstring(grobid_jats_xslt(_tei(references=[reference])))
    citation = _get_item(
        _get_item(_get_item(jats, 'back/ref-list'), 'ref'),
        'element-citation'
    )
    assert _get_text(citation, 'article-title') == COLLECTION_TITLE_1
def test_should_convert_multiple_collection_authors_of_single_reference(
        self, grobid_jats_xslt):
    """Both collection authors should become <name> entries in the
    person-group, preserving order."""
    authors = [AUTHOR_1, AUTHOR_2]
    reference = _reference(
        **extend_dict(REFERENCE_1, collection_authors=authors))
    jats = etree.fromstring(grobid_jats_xslt(_tei(references=[reference])))
    citation = _get_item(
        _get_item(_get_item(jats, 'back/ref-list'), 'ref'),
        'element-citation'
    )
    persons = _get_item(citation, 'person-group').xpath('name')
    assert len(persons) == 2
    for person, expected_author in zip(persons, authors):
        assert _get_text(person, 'surname') == expected_author['last-name']
        assert _get_text(person, 'given-names') == expected_author['first-name']
def add_read_pdfs_to_annotated_lxml_pipeline_steps(p, opt, get_pipeline_output_file):
    """Add pipeline steps that convert PDFs to structured LXML documents and
    optionally annotate them using a computer-vision model and/or a CRF model.

    Args:
        p: the root Beam pipeline object.
        opt: parsed pipeline options (reads pages, cv_model_export_dir,
            image_width, image_height, save_cv_output, crf_model,
            save_annot_lxml).
        get_pipeline_output_file: callable mapping (source_url, ext) to an
            output file path, used when saving intermediate outputs.

    Returns:
        Tuple of (lxml_content PCollection, extract_tag_scope) where
        extract_tag_scope is CRF_TAG_SCOPE if a CRF model was applied,
        else CV_TAG_SCOPE if CV was applied, else None.
    """
    page_range = opt.pages
    # cv_enabled is truthy when a CV model export directory is configured
    cv_enabled = opt.cv_model_export_dir
    extract_tag_scope = None
    pdf_urls = p | PdfUrlSource(opt)
    # Base step: PDF bytes -> structured document (LXML).
    lxml_content = (
        pdf_urls |
        PreventFusion() |
        ReadPdfContent() |
        "ConvertPdfToLxml" >> MapOrLog(
            lambda v: extend_dict(v, {
                DataProps.STRUCTURED_DOCUMENT: convert_pdf_bytes_to_structured_document(
                    v[DataProps.PDF_CONTENT],
                    path=v[DataProps.SOURCE_FILENAME],
                    page_range=page_range
                )
            }),
            # log and skip items whose conversion fails
            log_fn=lambda e, v: (get_logger().warning(
                'caught exception (ignoring item): %s, pdf: %s',
                e, v[DataProps.SOURCE_FILENAME], exc_info=e
            )),
            error_count=MetricCounters.CONVERT_PDF_TO_LXML_ERROR
        )
    )
    if cv_enabled:
        # Only scale images when both dimensions are configured.
        image_size = (
            (opt.image_width, opt.image_height)
            if opt.image_width and opt.image_height
            else None
        )
        inference_model_wrapper = InferenceModelWrapper(opt.cv_model_export_dir)
        # Render PDF pages to PNG, then run the CV model over them;
        # large intermediate payloads are removed after each step.
        cv_predictions = (
            lxml_content |
            "ConvertPdfToPng" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        DataProps.PDF_PNG_PAGES: list(pdf_bytes_to_png_pages(
                            v[DataProps.PDF_CONTENT],
                            dpi=90,  # not used if the image is scaled
                            image_size=image_size,
                            page_range=page_range
                        ))
                    }),
                    keys_to_remove={DataProps.PDF_CONTENT}
                ),
                error_count=MetricCounters.CONVERT_PDF_TO_PNG_ERROR
            ) |
            "ComputerVisionPrediction" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        DataProps.CV_PREDICTION_PNG_PAGES:
                            inference_model_wrapper(v[DataProps.PDF_PNG_PAGES]),
                        DataProps.COLOR_MAP:
                            inference_model_wrapper.get_color_map()
                    }),
                    keys_to_remove={DataProps.PDF_PNG_PAGES}
                ),
                error_count=MetricCounters.CV_PREDICTION_ERROR
            )
        )
        if opt.save_cv_output:
            # Side branch: persist the CV prediction images as PNGs.
            _ = (
                cv_predictions |
                "SaveComputerVisionOutput" >> TransformAndLog(
                    beam.Map(lambda v: save_pages(
                        get_pipeline_output_file(
                            v[DataProps.SOURCE_FILENAME], OutputExt.CV_PNG
                        ),
                        '.png',
                        [
                            image_data_to_png(image_data)
                            for image_data in v[DataProps.CV_PREDICTION_PNG_PAGES]
                        ]
                    )),
                    log_fn=lambda x: get_logger().info('saved cv output: %s', x)
                )
            )
        # Annotate the structured document using the CV predictions.
        # NOTE(review): keys_to_remove names PDF_PNG_PAGES, which was already
        # removed by the ComputerVisionPrediction step above; the
        # CV_PREDICTION_PNG_PAGES / COLOR_MAP payloads remain on the item —
        # confirm whether those were the intended keys to drop here.
        cv_annotated_lxml = (
            cv_predictions |
            "AnnotateLxmlUsingCvPrediction" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(v, {
                        DataProps.STRUCTURED_DOCUMENT:
                            annotate_structured_document_using_predicted_image_data(
                                v[DataProps.STRUCTURED_DOCUMENT],
                                v[DataProps.CV_PREDICTION_PNG_PAGES],
                                v[DataProps.COLOR_MAP],
                                tag_scope=CV_TAG_SCOPE
                            )
                    }),
                    keys_to_remove={DataProps.PDF_PNG_PAGES}
                ),
                error_count=MetricCounters.ANNOTATE_USING_PREDICTION_ERROR
            )
        )
        lxml_content = cv_annotated_lxml
        extract_tag_scope = CV_TAG_SCOPE
    if opt.crf_model:
        # CRF annotation runs after (and on top of) any CV annotation;
        # its tag scope wins when both are enabled.
        model = load_crf_model(opt.crf_model)
        crf_annotated_lxml = (
            lxml_content |
            "AnnotateLxmlUsingCrfPrediction" >> MapOrLog(
                lambda v: extend_dict(v, {
                    DataProps.STRUCTURED_DOCUMENT:
                        predict_and_annotate_structured_document(
                            v[DataProps.STRUCTURED_DOCUMENT],
                            model
                        )
                }),
                error_count=MetricCounters.ANNOTATE_USING_PREDICTION_ERROR
            )
        )
        lxml_content = crf_annotated_lxml
        extract_tag_scope = CRF_TAG_SCOPE
    if opt.save_annot_lxml:
        # Side branch: persist the annotated structured document.
        _ = (
            lxml_content |
            "SaveAnnotLxml" >> TransformAndLog(
                beam.Map(lambda v: save_structured_document(
                    get_pipeline_output_file(
                        v[DataProps.SOURCE_FILENAME],
                        get_annot_lxml_ext(
                            crf_enabled=opt.crf_model,
                            cv_enabled=cv_enabled
                        )
                    ),
                    v[DataProps.STRUCTURED_DOCUMENT]
                )),
                log_fn=lambda x: get_logger().info('saved annoted lxml to: %s', x)
            )
        )
    return lxml_content, extract_tag_scope