def test_parse_pubmed_caption():
    """
    Check that figure captions parsed from a PubMed XML file come back
    as a list of dicts with the expected number of entries
    """
    nxml_path = os.path.join("data", "pone.0046493.nxml")
    parsed = pp.parse_pubmed_caption(nxml_path)

    # Container type first, then the type of an individual entry.
    assert isinstance(parsed, list)
    first_entry = parsed[0]
    assert isinstance(first_entry, dict)

    n_captions = len(parsed)
    assert (
        n_captions == 4
    ), "Expected number of figures/captions to have a length of 4"
def merge(self):
    """Parse every PubMed / PMC file under ``self.pubmed_path`` into one file.

    ``self.output_filename`` is opened for writing (truncating any existing
    content) and two passes are made over ``self.pubmed_path``:

    * ``*.xml`` files are parsed with ``pmp.parse_medline_xml`` and written
      with the ``'pubmed_abstract'`` tag. A failure here propagates to the
      caller.
    * ``*.nxml`` files are parsed three ways (article abstract, figure
      captions, paragraphs). Each of these is best-effort: if parsing or
      writing one section fails, a message is printed and processing
      continues with the next section / file.

    ``self.recursive`` is forwarded to ``glob.glob`` to control whether the
    ``**`` part of the pattern descends into sub-directories.
    """
    print('PubMed path:', self.pubmed_path)

    # One entry per PMC section: (parser call, then the three string
    # arguments forwarded to ``self.write_dicts``). The parser calls are
    # wrapped in lambdas so that they only run inside the ``try`` below.
    # NOTE(review): the strings look like (text key, title key, source tag)
    # -- confirm against ``write_dicts``.
    pmc_sections = (
        # OA abstract
        (lambda path: [pmp.parse_pubmed_xml(path)],
         'abstract', 'full_title', 'pmc_oa_abstract'),
        # OA image caption
        (lambda path: pmp.parse_pubmed_caption(path),
         'fig_caption', 'fig_label', 'pmc_oa_image-caption'),
        # OA paragraph
        (lambda path: pmp.parse_pubmed_paragraph(path, all_paragraph=True),
         'text', 'reference_ids', 'pmc_oa_paragraph'),
    )

    with open(self.output_filename, mode='w', newline='\n') as ofile:
        # PubMed
        medline_pattern = os.path.join(self.pubmed_path, '**/*.xml')
        for filename in glob.glob(medline_pattern, recursive=self.recursive):
            print('file:', filename)
            dicts_out = pmp.parse_medline_xml(filename)
            self.write_dicts(dicts_out, 'abstract', ofile, 'title', 'pubmed_abstract')

        # PMC
        pmc_pattern = os.path.join(self.pubmed_path, '**/*.nxml')
        for filename in glob.glob(pmc_pattern, recursive=self.recursive):
            print('file:', filename)
            for parse, text_key, title_key, source_tag in pmc_sections:
                try:
                    dicts_out = parse(filename)
                    self.write_dicts(dicts_out, text_key, ofile, title_key, source_tag)
                except Exception as exc:
                    # Stay best-effort, but do not hide the failure and do
                    # not swallow KeyboardInterrupt / SystemExit the way a
                    # bare ``except:`` would.
                    print('skipped:', source_tag, filename, repr(exc))
def parse_oa_xml(xml_file, output_file, mode):
    """Parse one PubMed open access XML file and dump the result as JSON.

    Args:
        xml_file: Path of the XML file handed to ``pubmed_parser``.
        output_file: Path of the JSON file to write; it is opened in ``'w'``
            mode, so existing content is replaced.
        mode: Which part of the article to extract. One of ``'paper'``,
            ``'paragraphs'``, ``'references'``, ``'tables'`` or
            ``'figures'``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    supported_modes = ('paper', 'paragraphs', 'references', 'tables', 'figures')
    # Reject an unknown mode up front. Previously this fell through every
    # branch, truncated ``output_file`` and then failed with an
    # UnboundLocalError on ``dicts_out``.
    if mode not in supported_modes:
        raise ValueError(
            'Unsupported mode %r; expected one of: %s'
            % (mode, ', '.join(supported_modes))
        )

    # For open access
    import pubmed_parser as pp

    if mode == 'paper':
        dicts_out = pp.parse_pubmed_xml(xml_file)
    elif mode == 'paragraphs':
        dicts_out = pp.parse_pubmed_paragraph(xml_file, all_paragraph=True)
    elif mode == 'references':
        dicts_out = pp.parse_pubmed_references(xml_file)
    elif mode == 'tables':
        dicts_out = pp.parse_pubmed_table(xml_file, return_xml=False)
    else:  # mode == 'figures'
        dicts_out = pp.parse_pubmed_caption(xml_file)

    # DateEncoder is supplied to json.dump as the encoder class.
    with open(output_file, 'w') as fp:
        json.dump(dicts_out, fp, cls=DateEncoder)
def extract(self, tar_buffer):
    """Collect image bytes and figure captions from a tar archive.

    Args:
        tar_buffer: Seekable binary file object holding a tar archive. It is
            rewound to the start before being read.

    Returns:
        An ``(imgs, caption)`` tuple. ``imgs`` maps each image member's base
        file name, without its extension, to that member's raw bytes.
        ``caption`` is the result of ``pp.parse_pubmed_caption`` applied to
        the decoded text of the ``.nxml`` member (the last one, if the
        archive has several), or ``None`` when there is no such member.
    """
    # The dot sits outside the group so it is required for every extension;
    # with it inside, only ``.gif`` needed a dot and a name like
    # ``notes_png`` was treated as an image.
    image_name = re.compile(r'.*\.(gif|jpe?g|tiff?|png|webp|bmp)$')
    text_name = re.compile(r'.*\.nxml$')

    imgs = {}
    # Default so that an archive without an .nxml member returns cleanly
    # instead of raising UnboundLocalError.
    caption = None

    tar_buffer.seek(0)
    # Context manager closes the TarFile handle once reading is done.
    with tarfile.open(fileobj=tar_buffer) as tar:
        for mem in tar.getmembers():
            if image_name.match(mem.name):
                img_ref = os.path.splitext(os.path.basename(mem.name))[0]
                imgs[img_ref] = tar.extractfile(mem).read()
            elif text_name.match(mem.name):
                text = tar.extractfile(mem).read().decode('utf-8')
                caption = pp.parse_pubmed_caption(text)

    return imgs, caption