Example #1
def pmc2txt(xml_in, pmcid, job_size):
	pubmed_out = pp.parse_pubmed_xml(xml_in)
	ft_out = pp.parse_pubmed_paragraph(xml_in, all_paragraph=False)
	cnt=0
	bcnt = 0

	print('PMC2Txt', xml_in)

	pmcid_no = pmcid.replace('PMC', '')
	sub_dir = 'pmcinput/%d' % (int(pmcid_no) % job_size)

	full_text = ''

	for paragraph in ft_out:
		if 'text' in paragraph:
			full_text += paragraph['text']

	full_text = u2a_convert(pmcid, full_text, 'fulltext')

	if not os.path.exists(sub_dir):
		os.makedirs(sub_dir)

	f_tmp_in_fn = '%s/%s.txt' % (sub_dir, pmcid)
	f_tmp_in = open(f_tmp_in_fn, 'w')

	#text = '%s|t|%s\n%s|a|%s\n' % (pmid, title, pmid, abstract) #PubTator Format
	#text = '%s %s' % (title, abstract) # PWTEES FORMAT
	f_tmp_in.write(full_text)
	f_tmp_in.close()
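The commented-out lines above reference the PubTator format; for context, a minimal sketch of what such a record looks like, with hypothetical identifier and text values:

# PubTator format: a |t| title line and an |a| abstract line per document.
# All values below are hypothetical placeholders.
doc_id = 'PMC1234567'
title = 'A placeholder title'
abstract = 'A placeholder abstract.'
pubtator_record = '%s|t|%s\n%s|a|%s\n' % (doc_id, title, doc_id, abstract)
print(pubtator_record)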
Example #2
def pmc2txt(xml_in, pmcid, job_size, dest_dir):
    try:
        pubmed_out = pp.parse_pubmed_xml(xml_in)
        ft_out = pp.parse_pubmed_paragraph(xml_in, all_paragraph=False)
    except Exception:
        print('Error in parsing nxml file %s' % xml_in)
        return

    cnt = 0
    bcnt = 0

    #print 'PMC2Txt', xml_in

    pmcid_no = pmcid.replace('PMC', '')
    sub_dir = '%s/%d' % (dest_dir, int(pmcid_no) % job_size)

    full_text = ''

    for paragraph in ft_out:
        if 'text' in paragraph:
            full_text += paragraph['text']

    full_text = u2a_convert(pmcid, full_text, 'fulltext')

    if not os.path.exists(sub_dir):
        os.makedirs(sub_dir)

    f_tmp_in_fn = '%s/%s.txt' % (sub_dir, pmcid)
    f_tmp_in = open(f_tmp_in_fn, 'w')

    f_tmp_in.write(full_text)
    f_tmp_in.close()
Example #3
def download_paper(PMCID, get_paragraphs=True):
    if PMCID is None:
        return None
    handle = Entrez.efetch(db="pmc", rettype="full", retmode="xml", id=PMCID)
    xml = handle.read()
    try:
        refs = pp.parse_pubmed_references(xml)
    except AttributeError:  # failed to find paper:
        return None
    if refs is not None:
        refs_dict = {
            i['ref_id']: (i['pmid_cited'] if i['pmid_cited'] else None)
            for i in refs
        }
        paragraphs = pp.parse_pubmed_paragraph(xml)
        references = refs  # reuse the references already parsed above
        ref_to_paragraphs = defaultdict(list)
        for i, paragraph in enumerate(paragraphs):
            if 'reference_ids' in paragraph:
                paragraph['reference_ids'] = [
                    refs_dict.get(i, None) for i in paragraph['reference_ids']
                ]
                for ref in paragraph['reference_ids']:
                    if ref:
                        ref_to_paragraphs[ref].append(i)
        return {
            'paragraph': paragraphs,
            'references': references,
            'refs_to_paragraphs': ref_to_paragraphs
        }
    else:
        return None
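A minimal usage sketch for download_paper, assuming Biopython is installed and the imports the function relies on are in place; the contact address and PMCID are placeholders (NCBI requires Entrez.email to be set):

from collections import defaultdict

import pubmed_parser as pp
from Bio import Entrez

Entrez.email = 'your.name@example.org'  # placeholder contact address
paper = download_paper('PMC3539452')    # placeholder PMCID
if paper is not None:
    print(len(paper['paragraph']), 'paragraphs parsed')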
Example #4
def pubmed_parser(path_xml):
    ar = pp.parse_pubmed_xml(path_xml)
    if not isinstance(path_xml, str):
        path_xml.seek(0)
    paragraph_dicts = pp.parse_pubmed_paragraph(path_xml)
    paragraphs = []
    for p in paragraph_dicts:
        del p['pmc']
        del p['pmid']
        paragraphs.append(p)
    ar['paragraphs'] = paragraphs
    num(ar, 'publication_year')
    try:
        ar['publication_date'] = datetime.datetime.strptime(
            ar['publication_date'], "%d-%m-%Y")
    except ValueError:
        try:
            print(ar['publication_date'])
            # assume error in 'day' and retry with the first day of the month
            ar['publication_date'] = datetime.datetime.strptime(
                "01" + ar['publication_date'][2:], "%d-%m-%Y")
        except ValueError:
            # a workaround, until we have a robust parser
            ar['publication_date'] = datetime.datetime(2000, 1, 1)
    return ar
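num() is not defined in this snippet; a plausible sketch, assuming it simply coerces a metadata field to an integer in place:

def num(record, key):
    # Assumed helper: cast record[key] to int, leaving the value untouched
    # if it is missing or not numeric.
    try:
        record[key] = int(record[key])
    except (KeyError, TypeError, ValueError):
        pass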
Example #5
def extract_content_from_file(filepath: str = None) -> List[str]:
    """
    Return the textual content of a PMC OpenAccess Article

    :param filepath: nxml filepath
    :type filepath: str

    :return: list of text chunks
    :rtype: List[str]
    """

    all_parts = list()

    try:
        metadata = pp.parse_pubmed_xml(filepath)

        if metadata.get("full_title") is not None:
            all_parts.append(metadata.get("full_title").strip("\n "))

        if metadata.get("abstract") is not None:
            all_parts.append(metadata.get("abstract").strip("\n "))
    except TypeError:
        pass

    try:
        paragraphs = pp.parse_pubmed_paragraph(filepath)

        for par in paragraphs:
            if par.get("text") is not None:
                all_parts.append(par.get("text").strip("\n "))
    except TypeError:
        pass

    return all_parts
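A short usage sketch (the file path is a placeholder):

chunks = extract_content_from_file('path/to/article.nxml')  # placeholder path
print(len(chunks), 'text chunks extracted')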
Example #6
def extract_text(file_path):
    abstract = pp.parse_pubmed_xml(file_path)['abstract']
    body = pp.parse_pubmed_paragraph(file_path)
    body_text = ''
    for i in body:
        body_text += i['text'] + ' '
    with open('../data/text/' + file_path[file_path.rfind('/') + 1:],
              'w') as myfile:
        myfile.write(abstract + ' ' + body_text)
Example #7
def test_parse_pubmed_paragraph():
    """
    Test parsing captions and figure ID from a PubMed XML file
    """
    paragraphs = pp.parse_pubmed_paragraph(
        os.path.join("data", "pone.0046493.nxml"))
    assert isinstance(paragraphs, list)
    assert isinstance(paragraphs[0], dict)
    assert len(paragraphs) == 29, "Expected number of paragraphs to be 29"
    assert (len(paragraphs[0]["reference_ids"]) == 11
            ), "Expected number of references in the first paragraph to be 11"
Example #8
def pmc2redis(xml_in, pmcid, redis_server):
    pubmed_out = pp.parse_pubmed_xml(xml_in)
    ft_out = pp.parse_pubmed_paragraph(xml_in, all_paragraph=False)

    r = redis.StrictRedis(host='%s' % redis_server, port=6379, db=0)
    pipe = r.pipeline()

    print('PMC2Redis', xml_in)

    title = pubmed_out['title'].replace('\n', ' ')
    title = u2a_convert(pmcid, title, 'title')

    abstract = ''
    if pubmed_out['abstract'] is not None:
        abstract = pubmed_out['abstract'].replace('\n', ' ')
        abstract = u2a_convert(pmcid, abstract, 'abstract')
    else:
        print('Cannot find abstract for PMCID %s' % pmcid)

    full_text = ''

    for paragraph in ft_out:
        if 'text' in paragraph:
            full_text += paragraph['text']

    full_text = u2a_convert(pmcid, full_text, 'fulltext')

    # affiliation: corresponding author's affiliation
    # authors: authors, each separated by ;
    # mesh_terms: list of MeSH terms, each separated by ;
    # keywords: list of keywords, each separated by ;
    # pubdate: Publication date. Defaults to year information only.
    year = pubmed_out['pubdate']
    author = pubmed_out['author']
    keywords = pubmed_out['keywords']
    mesh_terms = pubmed_out['mesh_terms']
    affiliation = pubmed_out['affiliation']
    journal = pubmed_out['journal']

    pipe.set('%s:title' % pmcid, '%s' % title)
    pipe.set('%s:abstract' % pmcid, '%s' % abstract)
    pipe.set('%s:fulltext' % pmcid, '%s' % full_text)
    pipe.set('%s:pubtator' % pmcid,
             '%s|t|%s\n%s|a|%s' % (pmcid, title, pmcid, abstract))
    pipe.set('%s:pubdate' % pmcid, year)
    pipe.set('%s:author' % pmcid, author)
    pipe.set('%s:mesh_terms' % pmcid, mesh_terms)
    pipe.set('%s:keywords' % pmcid, keywords)
    pipe.set('%s:affiliation' % pmcid, affiliation)
    pipe.set('%s:journal' % pmcid, journal)

    pipe.execute()
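Once stored, the fields can be read back by key; a minimal sketch, with placeholder host and PMCID:

import redis

r = redis.StrictRedis(host='localhost', port=6379, db=0,
                      decode_responses=True)  # placeholder host
print(r.get('PMC1234567:title'))  # placeholder PMCID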
Example #9
def pmc2pubtator(xml_in, pmcid, job_size, dest_dir):
    try:
        pubmed_out = pp.parse_pubmed_xml(xml_in)
        ft_out = pp.parse_pubmed_paragraph(xml_in, all_paragraph=False)
    except Exception:
        print('Error in parsing nxml file %s' % xml_in)
        return -1

    cnt = 0
    bcnt = 0

    #print 'PMC2Txt', xml_in

    pmcid_no = pmcid.replace('PMC', '')

    full_text = ''

    for paragraph in ft_out:
        if 'text' in paragraph:
            full_text += paragraph['text']

    full_text = u2a_convert(pmcid, full_text, 'fulltext')

    pmcnumber = pubmed_out['pmc']

    title = pubmed_out['full_title']
    abstract = pubmed_out['abstract']
    ttle = u2a_convert(pmcid, title, 'title')
    abst = u2a_convert(pmcid, abstract, 'abstract')

    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    f_tmp_in_fn = '%s/%s.txt' % (dest_dir, pmcid)
    if os.path.exists(f_tmp_in_fn):
        with open(f_tmp_in_fn) as f_tmp_in:
            if len(f_tmp_in.readlines()) > 0:
                return 1  # already converted; the with block closes the file

    f_tmp_in = open(f_tmp_in_fn, 'w')

    f_tmp_in.write(pmcnumber + '|t|' + ttle.strip())
    f_tmp_in.write('\n')
    f_tmp_in.write(pmcnumber + '|a|' + abst.strip() + full_text.strip())
    f_tmp_in.write('\n')
    f_tmp_in.write('\n')

    f_tmp_in.close()
    return 1
Example #10
    def get_words(self):
        pubmed_dict = parser.parse_pubmed_xml(self.filename)
        text = pubmed_dict['full_title'] + ' ' + pubmed_dict['abstract']
        pubmed_paras_dict = parser.parse_pubmed_paragraph(self.filename)
        for paras in pubmed_paras_dict:
            text = text + paras['text']
        # encode() maps the unicode string to ASCII, replacing XML entity
        # character references with '?' symbols; decode() then converts the
        # byte string back to a regular string for later processing
        # (strip(punctuation) fails otherwise); replace() swaps every '?'
        # for a space before the text is split into words.
        text = text.encode('ascii',
                           'replace').decode('ascii').replace('?', ' ')
        return text
Example #11
    def index_document(self, doc):
        parsed_doc = pp.parse_pubmed_paragraph(doc, all_paragraph=True)
        pmid = None
        pmc = None
        ref_ids = []
        full_text = []

        for para in parsed_doc:
            if not pmid and 'pmid' in para:
                pmid = para['pmid']
            if not pmc and 'pmc' in para:
                pmc = para['pmc']

            section = para['section']
            text = para['text']

            # We want pure text, not headings
            if section and text:
                clean_text = pattern.sub('', unidecode.unidecode(para['text']))
                full_text.append(clean_text)

            if para.get('reference_ids'):
                ref_ids.extend(para['reference_ids'])

        full_text = '\n\n'.join(full_text)

        _id = None

        if pmid is None:
            pmid = -1
        else:
            _id = "pmid" + str(pmid)

        if pmc is None:
            pmc = -1
        else:
            _id = "pmc" + str(pmc)

        doc_to_insert = {
            "pmid": pmid,
            "pmc": pmc,
            "ref_ids": ref_ids,
            "text": full_text,
        }

        if _id:
            doc_to_insert['_id'] = _id

        self.col.insert_one(doc_to_insert)
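pattern and unidecode are assumed to exist at module level; a plausible setup, where pattern strips bracketed citation markers (the exact regex is an assumption):

import re

import unidecode

# Assumed module-level regex: drop bracketed citation markers such as [1]
# or [2, 3] before indexing the text.
pattern = re.compile(r'\[\d+(?:,\s*\d+)*\]')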
Example #12
    def merge(self):
        print('PubMed path:', self.pubmed_path)

        with open(self.output_filename, mode='w', newline='\n') as ofile:

            # PubMed
            for filename in glob.glob(os.path.join(self.pubmed_path,
                                                   '**/*.xml'),
                                      recursive=self.recursive):
                print('file:', filename)
                dicts_out = pmp.parse_medline_xml(filename)

                self.write_dicts(dicts_out, 'abstract', ofile, 'title',
                                 'pubmed_abstract')

            # PMC
            for filename in glob.glob(os.path.join(self.pubmed_path,
                                                   '**/*.nxml'),
                                      recursive=self.recursive):
                print('file:', filename)

                # OA abstract
                try:
                    dicts_out = [pmp.parse_pubmed_xml(filename)]
                    self.write_dicts(dicts_out, 'abstract', ofile,
                                     'full_title', 'pmc_oa_abstract')
                except Exception:
                    pass

                # OA image caption
                try:
                    dicts_out = pmp.parse_pubmed_caption(filename)
                    self.write_dicts(dicts_out, 'fig_caption', ofile,
                                     'fig_label', 'pmc_oa_image-caption')
                except Exception:
                    pass

                # OA Paragraph
                try:
                    dicts_out = pmp.parse_pubmed_paragraph(filename,
                                                           all_paragraph=True)
                    self.write_dicts(dicts_out, 'text', ofile, 'reference_ids',
                                     'pmc_oa_paragraph')
                except Exception:
                    pass
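write_dicts() is not shown here; a hypothetical sketch consistent with the calls above, writing one tagged, tab-separated line per parsed record:

    def write_dicts(self, dicts_out, text_key, ofile, extra_key, tag):
        # Hypothetical helper: emit one tagged line per record.
        for d in dicts_out:
            text = d.get(text_key)
            if text:
                ofile.write('%s\t%s\t%s\n' % (tag, d.get(extra_key, ''), text))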
Example #13
def extractFullText(full_path, paper_dir, nxml_file):
    """
	Extract full text from nxml_file and write as .txt to paper_dir
	"""
    # Run parser to get full text for all paragraphs
    paragraph_dict_list = pp.parse_pubmed_paragraph(f'{full_path}/{nxml_file}',
                                                    all_paragraph=True)

    # Concat paragraphs into full text
    full_text = ''
    for paper_dict in paragraph_dict_list:
        full_text += ('\n\n' + paper_dict['text'])

    # Write out full text
    with open(f'{full_path}/{paper_dir}_fullText.txt', 'w') as text_file:
        text_file.write(full_text)

    print('\nFull text written!')
Example #14
def parse_oa_xml(xml_file, output_file, mode):
    """Import pubmed open access XML file into prophet database."""
    # For open access
    import pubmed_parser as pp

    if mode == 'paper':
        dicts_out = pp.parse_pubmed_xml(xml_file)
    elif mode == 'paragraphs':
        dicts_out = pp.parse_pubmed_paragraph(xml_file, all_paragraph=True)
    elif mode == 'references':
        dicts_out = pp.parse_pubmed_references(xml_file)
    elif mode == 'tables':
        dicts_out = pp.parse_pubmed_table(xml_file, return_xml=False)
    elif mode == 'figures':
        dicts_out = pp.parse_pubmed_caption(xml_file)
    else:
        raise ValueError('Unsupported mode: %s' % mode)

    with open(output_file, 'w') as fp:
        json.dump(dicts_out, fp, cls=DateEncoder)
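DateEncoder is assumed to be defined elsewhere; a minimal sketch of such an encoder, serializing date objects that the json module cannot handle natively:

import datetime
import json

class DateEncoder(json.JSONEncoder):
    # Assumed encoder: render date/datetime values as ISO 8601 strings.
    def default(self, obj):
        if isinstance(obj, (datetime.date, datetime.datetime)):
            return obj.isoformat()
        return super().default(obj)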
Example #15
def build_case_report_json(xml_path: str) -> dict:
    """Makes and returns a JSON object from pubmed XML files
    Args:
        xml_path (str): path to input XML file
    """
    pubmed_xml = pp.parse_pubmed_xml(xml_path)
    pubmed_paragraph = pp.parse_pubmed_paragraph(xml_path)
    pubmed_references = pp.parse_pubmed_references(xml_path)
    subjects = pubmed_get_subjects(pubmed_xml)
    keywords = get_keywords(subjects)
    article_type = get_article_type(subjects)

    case_report = {
        "pmID": pubmed_xml.get("pmid"),
        "doi": pubmed_xml.get("doi"),
        "title": pubmed_xml.get("full_title"),
        "messages": [],
        "source_files": [],
        "modifications": [],
        "normalizations": [],
        # ctime            : 1351154734.5055847,
        "text": pubmed_get_text(pubmed_paragraph),
        "entities": [],
        "attributes": [],
        # date : { type: Date, default: Date.now }
        "relations": [],
        "triggers": [],
        "events": [],
        "equivs": [],
        "comments": [],
        # sentence_offsets     : [],
        # token_offsets    : [],
        "action": None,
        "abstract": pubmed_xml.get("abstract"),
        "authors": pubmed_get_authors(pubmed_xml),
        "keywords": keywords,
        "introduction": None,
        "discussion": None,
        "references": [],
        "journal": pubmed_xml.get("journal"),
        "article_type": article_type,  # For filtering.
    }

    return case_report
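pubmed_get_text() and the other helpers are not shown; a plausible sketch of the text helper, joining the parsed paragraph texts:

def pubmed_get_text(pubmed_paragraph):
    # Assumed helper: concatenate paragraph texts into one string.
    return ' '.join(p.get('text', '') for p in (pubmed_paragraph or []))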
Example #16
def do_processing(path, source):
    """
    Do the processing for a single file.

    Parameters
    ----------
    path : str
        The path to the file.
    source : {'pubmed', 'springer', 'arxiv'}
        The original source academic repository.

    Returns
    -------
    dict
        A dictionary with text, source, and filename.
    """

    # for PubMed, just use the pubmed parser library
    if source == 'pubmed':
        doc = pp.parse_pubmed_paragraph(path, all_paragraph=True)

    # springer and arXiv have similar parsing rules, but for XML vs HTML
    elif source == 'springer':
        soup = lxml.html.parse(path)
        remove_namespace(soup)
        paragraphs = soup.xpath("//body//p[@class='Para']")
        doc = []
        for paragraph in paragraphs:
            doc.append({'text': stringify_children(paragraph)})

    elif source == 'arxiv':
        soup = etree.parse(path)
        remove_namespace(soup)
        paragraphs = soup.xpath("//body//p|//body//formula")
        doc = []
        for paragraph in paragraphs:
            doc.append({'text': stringify_children(paragraph)})

    # do the actual cleaning
    cleaned = clean_doc(doc)

    return {'text': cleaned, 'file': path, 'source': source}
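remove_namespace() and stringify_children() are assumed helpers; a plausible sketch of the former using lxml (one common pattern, not necessarily the author's):

from lxml import etree

def remove_namespace(tree):
    # Assumed helper: strip namespace prefixes from element tags so that
    # plain xpath expressions such as "//body//p" match.
    for node in tree.iter():
        if isinstance(node.tag, str) and '}' in node.tag:
            node.tag = node.tag.split('}', 1)[1]
    etree.cleanup_namespaces(tree.getroot())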
Example #17
def parse_pubmed_article(xml_path, section_keywords):
    """
    Parse pubmed xml file into human-readable format

    :param xml_path: path to xml file to parse
    :param section_keywords: keyword strings to filter pubmed article sections
    :return: dictionary file containing relevant fields
    """
    body = pp.parse_pubmed_paragraph(xml_path)
    if check_for_section(body, section_keywords):
        metadata = pp.parse_pubmed_xml(xml_path)
        pubmed_dict = build_pubmed_dict(metadata, body, section_keywords)
        return pubmed_dict
    else:
        metadata = pp.parse_pubmed_xml(xml_path)
        return {
            'pmid': metadata['pmid'],
            'pmcid': metadata['pmc'],
            'title': metadata['full_title'],
            'journal': metadata['journal'],
            'parsed': False
        }
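check_for_section() and build_pubmed_dict() are not shown; a hypothetical sketch of the check, matching paragraph section names against the keywords:

def check_for_section(body, section_keywords):
    # Hypothetical helper: True if any paragraph's section name contains
    # one of the given keywords (case-insensitive).
    for paragraph in body:
        section = (paragraph.get('section') or '').lower()
        if any(kw.lower() in section for kw in section_keywords):
            return True
    return False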
Example #18
                            file['file'])) as r:
        with open('article_files/%s.tar.gz' % file['PMID'], 'wb') as f:
            shutil.copyfileobj(r, f)

# For every tar.gz file, extract it into the "article_files" directory, then locate and parse its .nxml file
print('Extracting archives and parsing nxml files...')
parsed_paragraphs = []
os.chdir('article_files')
for file in tar_gz_files:
    tar = tarfile.open('%s.tar.gz' % file['PMID'])
    tar.extractall()

    dirname = tar.getnames()[0]

    nxml_file = glob.glob('%s/*.nxml' % dirname)[0]
    parsed_file = pp.parse_pubmed_paragraph(nxml_file, all_paragraph=False)

    full_text = []

    for element in parsed_file:
        full_text.append(element['text'])

    parsed_paragraphs.append({'PMID': file['PMID'], 'Full_Text': full_text})

    tar.close()

# Add a new column to our dataframe and save it as CSV
print('Exporting new dataset...')

full_text_column_values = []
for ind in df.index:
Example #19
    def process_paper(self, file, db):
        """
        Loads a pdf file in the folder, and extracts its content into an XML file, as well as into the mongodb
        database
        :param file: the name of the paper to be processed
        :param db: mongo db
        :return:
        """

        try:
            xml, t = self.get_xml(file)

            if t == 'grobid':

                NS = {'tei': 'http://www.tei-c.org/ns/1.0'}

                result = grobid_mapping.tei_to_dict(xml)

                mongo_set_dict = dict()

                if 'abstract' in result:
                    mongo_set_dict["content.abstract"] = result["abstract"]
                if 'notes' in result:
                    mongo_set_dict["content.notes"] = result["notes"]
                if 'fulltext' in result:
                    mongo_set_dict["content.fulltext"] = result["fulltext"]
                    with open(cfg.folder_content_xml + file + ".txt",
                              'w') as f:
                        print(result["fulltext"], file=f)  # write to the file
                if 'chapters' in result:
                    mongo_set_dict["content.chapters"] = result["chapters"]

                mongo_result = db.publications.update_one(
                    {'_id': file[:10]}, {'$set': mongo_set_dict},
                    upsert=True)
                print(mongo_result)

                logging.info("Processed " + file + ' with new xml')

            if t == 'pmc':
                filename_xml = cfg.source_xml + file[:-4] + ".nxml"
                meta = pp.parse_pubmed_xml(filename_xml)
                ref = pp.parse_pubmed_references(filename_xml)
                article_text = pp.parse_pubmed_paragraph(filename_xml,
                                                         all_paragraph=True)

                result = dict()

                fulltext = []
                for par in article_text:
                    fulltext.append(par['text'])

                result['title'] = meta['full_title']
                result['authors'] = meta['author_list']
                result['journal'] = meta['journal']
                result['year'] = meta['publication_year']
                result['type'] = meta['subjects']
                result['domain'] = 'biomedical'
                result['license'] = 'open_access'
                result['content.abstract'] = meta['abstract']
                result['content.keywords'] = meta['keywords']
                result['content.references'] = ref
                result['content.fulltext'] = ''.join(fulltext)

                translator = str.maketrans('', '', string.punctuation)

                chapters = defaultdict(list)
                for par in article_text:
                    section = par['section']
                    section = section.translate(translator)
                    chapters[section].append(par['text'])

                chapters_par = []
                for key in chapters:
                    chapter_paragraphs = {
                        'paragraphs': chapters[key],
                        'title': key
                    }
                    chapters_par.append([chapter_paragraphs])

                result['content.chapters'] = chapters_par

                mongo_result = db.publications.update_one(
                    {'_id': 'PMC_' + meta['pmc']}, {'$set': result},
                    upsert=True)
                print(mongo_result)

                logging.info("Processed " + file + ' with original nxml')

        except Exception:
            logging.exception('Cannot process paper %s', file)
Example #20
def parse_pubmed(src):
    """Parse pubmed xml article data and return metadata and text."""
    metadata = pubmed_parser.parse_pubmed_xml(src)
    text = pubmed_parser.parse_pubmed_paragraph(src, all_paragraph=True)
    text = ' '.join(' '.join([x['text'] for x in text]).split())
    return metadata, text
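A short usage sketch (the path is a placeholder):

import pubmed_parser

metadata, text = parse_pubmed('path/to/article.nxml')  # placeholder path
print(metadata['full_title'])
print(text[:200])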