def get_abstract_information(pubmed_id):
    """
    Looks up a PubMed article and returns its title with its abstract chunks.

    The article XML tree is fetched with get_pubmed_article_xml_tree and
    handed to parse_article_xml_tree, which yields the title and the
    abstract XML tree. A unicode title is passed through
    convert_unicode_to_ascii before being returned.

    Returns a (title, chunks) tuple, where chunks is the result of
    split_abstract, or an empty list when the abstract tree is falsy.
    """
    xml_tree = get_pubmed_article_xml_tree(pubmed_id)
    title, abstract_xml_tree = parse_article_xml_tree(xml_tree)

    if isinstance(title, unicode):
        title = convert_unicode_to_ascii(title)

    # NOTE(review): this is a truthiness test, not an "is None" test. If
    # parse_article_xml_tree returns an ElementTree Element, an element
    # without children is falsy as well -- confirm that is intended.
    if not abstract_xml_tree:
        return (title, [])

    return (title, split_abstract(abstract_xml_tree))
def split_abstract(abstract_xml_tree):
    """
    Splits an abstract XML tree into individual chunks, if they exist.

    Preserves the background/methods/etc format of some papers
    (eg pmid 24885308): when an AbstractText element carries a "Label"
    attribute, its chunk is prefixed with "<Label>: ".

    The text of a chunk is the concatenation of all text inside the
    AbstractText element, including text nested in inline child elements
    and the text that follows them. An element without any text yields an
    empty string rather than None.

    Unicode chunk text is passed through convert_unicode_to_ascii.

    Returns a list with one string per AbstractText element, in the order
    the elements are visited by abstract_xml_tree.iter().
    """
    abstract_chunks = []
    for child in abstract_xml_tree.iter("AbstractText"):
        # child.text only holds the text *before* the first nested element
        # (and is None when there is none), which truncates abstracts that
        # contain inline markup. itertext() walks the whole subtree instead.
        text = "".join(child.itertext())
        if isinstance(text, unicode):
            text = convert_unicode_to_ascii(text)

        section_name = child.get("Label")
        if section_name is not None:
            text = "{0}: {1}".format(section_name, text)

        abstract_chunks.append(text)
    return abstract_chunks