Exemplo n.º 1
0
def pubmed_parser(path_xml):
    """Parse a PubMed OA XML file (path or file-like object) into a dict.

    Returns the metadata dict from ``pp.parse_pubmed_xml`` extended with a
    ``paragraphs`` list and ``publication_date`` normalized to a datetime.
    """
    ar = pp.parse_pubmed_xml(path_xml)
    # A file-like input was consumed by the first parse; rewind before re-reading.
    if not isinstance(path_xml, str):
        path_xml.seek(0)
    paragraphs = []
    for p in pp.parse_pubmed_paragraph(path_xml):
        # pmc/pmid are already present at the article level; drop the
        # per-paragraph duplicates (pop avoids KeyError if a key is absent).
        p.pop('pmc', None)
        p.pop('pmid', None)
        paragraphs.append(p)
    ar['paragraphs'] = paragraphs
    num(ar, 'publication_year')
    try:
        ar['publication_date'] = datetime.datetime.strptime(
            ar['publication_date'], "%d-%m-%Y")
    except ValueError:
        try:
            print(ar['publication_date'])
            # assume error in 'day' and retry with the first day of the month
            # NOTE(review): assumes a 2-char day prefix — confirm upstream format
            ar['publication_date'] = datetime.datetime.strptime(
                "01" + ar['publication_date'][2:], "%d-%m-%Y")
        except ValueError:
            # a workaround, until we have a robust parser
            ar['publication_date'] = datetime.datetime(2000, 1, 1)
    return ar
Exemplo n.º 2
0
def pmc2txt(xml_in, pmcid, job_size):
	pubmed_out = pp.parse_pubmed_xml(xml_in)
	ft_out = pp.parse_pubmed_paragraph(xml_in, all_paragraph=False)
	cnt=0
	bcnt = 0

	print 'PMC2Txt', xml_in

	pmcid_no = pmcid.replace('PMC', '')
	sub_dir = 'pmcinput/%d' % (int(pmcid_no) % job_size)

	full_text = ''

	for paragraph in ft_out:
		if 'text' in paragraph:
			full_text += paragraph['text']

	full_text = u2a_convert(pmcid, full_text, 'fulltext')

	if not os.path.exists(sub_dir):
		os.makedirs(sub_dir)

	f_tmp_in_fn = '%s/%s.txt' % (sub_dir,pmid)
	f_tmp_in = open(f_tmp_in_fn, 'w')

	#text = '%s|t|%s\n%s|a|%s\n' % (pmid, title, pmid, abstract) #PubTator Format
	#text = '%s %s' % (title, abstract) # PWTEES FORMAT
	f_tmp_in.write(full_text)
	f_tmp_in.close()
Exemplo n.º 3
0
def parse_single_nxml(fpath, fname):
    """parse a particular file in the path

    Args:
        fpath - the whole path of the file
        fname - the file name

    Return:
        text - a text formulated in a special format, or None when the
        article has no abstract
    """
    parsed_dict = pp.parse_pubmed_xml(fpath)

    abstract = parsed_dict["abstract"]
    if not abstract:
        return None

    full_title = parsed_dict["full_title"]
    title = full_title if len(full_title) > 1 else EMPTY_TITLE

    stem = fname[:-5]  # strip the ".nxml" extension
    pieces = [
        START_TOKEN + "{}\n".format(stem),
        "{}\n".format(title.strip()),
        abstract + "\n",
    ]
    return "".join(pieces)
Exemplo n.º 4
0
def extract_content_from_file(filepath: str = None) -> List[str]:
    """
    Return the textual content of a PMC OpenAccess Article

    :param filepath: nxml filepath
    :type filepath: str

    :return: list of text chunks
    :rtype: List[str]
    """
    chunks = list()

    try:
        metadata = pp.parse_pubmed_xml(filepath)

        title = metadata.get("full_title")
        if title is not None:
            chunks.append(title.strip("\n "))

        abstract = metadata.get("abstract")
        if abstract is not None:
            chunks.append(abstract.strip("\n "))
    except TypeError:
        # best-effort: unusable input yields no metadata chunks
        pass

    try:
        for paragraph in pp.parse_pubmed_paragraph(filepath):
            text = paragraph.get("text")
            if text is not None:
                chunks.append(text.strip("\n "))
    except TypeError:
        pass

    return chunks
Exemplo n.º 5
0
def pmc2txt(xml_in, pmcid, job_size, dest_dir):
    """Extract the full text of a PMC nxml file into a sharded txt file.

    Output path: <dest_dir>/<int(pmcid without 'PMC') % job_size>/<pmcid>.txt
    Prints a message and returns early when parsing fails.
    """
    try:
        pubmed_out = pp.parse_pubmed_xml(xml_in)
        ft_out = pp.parse_pubmed_paragraph(xml_in, all_paragraph=False)
    # BUG FIX: bare name `Error` is undefined — the handler itself raised
    # NameError instead of handling the parse failure; catch Exception.
    except Exception as e:
        print 'Error in parsing nxml file %s' % xml_in
        return

    pmcid_no = pmcid.replace('PMC', '')
    sub_dir = '%s/%d' % (dest_dir, int(pmcid_no) % job_size)

    full_text = ''
    for paragraph in ft_out:
        if 'text' in paragraph:
            full_text += paragraph['text']

    full_text = u2a_convert(pmcid, full_text, 'fulltext')

    if not os.path.exists(sub_dir):
        os.makedirs(sub_dir)

    f_tmp_in_fn = '%s/%s.txt' % (sub_dir, pmcid)
    f_tmp_in = open(f_tmp_in_fn, 'w')
    f_tmp_in.write(full_text)
    f_tmp_in.close()
Exemplo n.º 6
0
    def query_data(fin_loc):
        """Parse one PubMed XML file; dump it as JSON when it has an abstract."""
        # 450 out of 610 abstractions are available
        parsed = pp.parse_pubmed_xml(fin_loc)

        # some publications don't have an abstract; filter by string length
        if len(parsed["abstract"]) >= 5:
            out_name = os.path.join(fout_path, f"{parsed['pmid']}.json")
            with open(out_name, "w") as out_f:
                json.dump(parsed, out_f)
        else:
            # record which articles were skipped for lack of an abstract
            with open(f"{fout_path}/missing_abstract", "a+") as f:
                f.write(parsed["pmid"] + ":     " + parsed["abstract"])
Exemplo n.º 7
0
def extract_text(file_path):
    """Write abstract + body text of an nxml file to ../data/text/<basename>."""
    parsed = pp.parse_pubmed_xml(file_path)
    paragraphs = pp.parse_pubmed_paragraph(file_path)
    body_text = ''.join(par['text'] + ' ' for par in paragraphs)
    out_name = file_path[file_path.rfind('/') + 1:]
    with open('../data/text/' + out_name, 'w') as out_file:
        out_file.write(parsed['abstract'] + ' ' + body_text)
Exemplo n.º 8
0
def test_parse_pubmed_xml():
    """
    Test parsing metadata from a PubMed XML file
    """
    xml_path = os.path.join("data", "pone.0046493.nxml")
    parsed = pp.parse_pubmed_xml(xml_path)
    assert isinstance(parsed, dict)
    # abstract and title must be non-empty
    for key in ("abstract", "full_title"):
        assert len(parsed.get(key)) > 0
    # identifiers of the known fixture article
    assert parsed.get("pmc") == "3460867"
    assert parsed.get("doi") == "10.1371/journal.pone.0046493"
Exemplo n.º 9
0
def test_parse_pubmed_xml():
    """
    Test parsing metadata from a PubMed XML file
    """
    parsed_xml = pp.parse_pubmed_xml(os.path.join("data", "pone.0046493.nxml"))
    assert isinstance(parsed_xml, dict)
    # abstract and title of the fixture article must be non-empty
    assert len(parsed_xml.get("abstract")) > 0
    assert len(parsed_xml.get("full_title")) > 0
    # known identifiers of the fixture article
    assert parsed_xml.get("pmc") == "3460867"
    assert parsed_xml.get("doi") == "10.1371/journal.pone.0046493"
    # subjects come back as one '; '-joined string of all subject headings
    assert parsed_xml.get("subjects") == "Research Article; Biology; Biochemistry; Enzymes; Enzyme Metabolism; Lipids; Fatty Acids; Glycerides; Lipid Metabolism; Neutral Lipids; Metabolism; Lipid Metabolism; Proteins; Globular Proteins; Protein Classes; Recombinant Proteins; Biotechnology; Microbiology; Bacterial Pathogens; Bacteriology; Emerging Infectious Diseases; Host-Pathogen Interaction; Microbial Growth and Development; Microbial Metabolism; Microbial Pathogens; Microbial Physiology; Proteomics; Sequence Analysis; Spectrometric Identification of Proteins"
Exemplo n.º 10
0
def extractAbstract(full_path, paper_dir, nxml_file):
    """Extract abstract from nxml_file and write as .txt to paper_dir."""
    # Run parser on the article file
    parsed = pp.parse_pubmed_xml(f'{full_path}/{nxml_file}')

    # Write the abstract out next to the paper directory
    abstract_path = f'{full_path}/{paper_dir}_abstract.txt'
    with open(abstract_path, 'w') as text_file:
        text_file.write(parsed['abstract'])

    print('\nAbstract written!')
Exemplo n.º 11
0
def pmc2redis(xml_in, pmcid, redis_server):
    """Parse a PMC nxml file and store its fields in Redis (Python 2 code).

    All keys are namespaced by pmcid ('<pmcid>:title', '<pmcid>:fulltext', ...),
    including a PubTator-formatted '<pmcid>:pubtator' entry.
    """
    pubmed_out = pp.parse_pubmed_xml(xml_in)
    ft_out = pp.parse_pubmed_paragraph(xml_in, all_paragraph=False)

    # Single pipeline so all SETs below go out in one round trip.
    r = redis.StrictRedis(host='%s' % redis_server, port=6379, db=0)
    pipe = r.pipeline()

    print 'PMC2Redis', xml_in

    # NOTE(review): pubmed_parser's parse_pubmed_xml usually exposes
    # 'full_title' (not 'title'); confirm these keys against the version in use.
    title = pubmed_out['title'].encode('utf-8').replace('\n', ' ')
    title = u2a_convert(pmcid, title, 'title')

    abstract = ''
    if pubmed_out['abstract'] is not None:
        abstract = pubmed_out['abstract'].encode('utf-8').replace('\n', ' ')
        abstract = u2a_convert(pmcid, abstract, 'abstract')
    else:
        print 'Cannot find abstract for PMCID %s' % pmcid

    # Concatenate every paragraph that carries text into one string.
    full_text = ''

    for paragraph in ft_out:
        if 'text' in paragraph:
            full_text += paragraph['text']

    full_text = u2a_convert(pmcid, full_text, 'fulltext')

    # affiliation: corresponding author's affiliation
    # authors: authors, each separated by ;
    # mesh_terms: list of MeSH terms, each separated by ;
    # keywords: list of keywords, each separated by ;
    # pubdate: Publication date. Defaults to year information only.
    year = pubmed_out['pubdate']
    author = pubmed_out['author']
    keywords = pubmed_out['keywords']
    mesh_terms = pubmed_out['mesh_terms']
    affiliation = pubmed_out['affiliation']
    journal = pubmed_out['journal']

    pipe.set('%s:title' % pmcid, '%s' % title)
    pipe.set('%s:abstract' % pmcid, '%s' % abstract)
    pipe.set('%s:fulltext' % pmcid, '%s' % full_text)
    # PubTator format: '<id>|t|<title>' newline '<id>|a|<abstract>'
    pipe.set('%s:pubtator' % pmcid,
             '%s|t|%s\n%s|a|%s' % (pmcid, title, pmcid, abstract))
    pipe.set('%s:pubdate' % pmcid, year)
    pipe.set('%s:author' % pmcid, author)
    pipe.set('%s:mesh_terms' % pmcid, mesh_terms)
    pipe.set('%s:keywords' % pmcid, keywords)
    pipe.set('%s:affiliation' % pmcid, affiliation)
    pipe.set('%s:journal' % pmcid, journal)

    pipe.execute()
Exemplo n.º 12
0
def parse_pubmed_article(xml_path, section_keywords):
    """
    Parse pubmed xml file into human-readable format

    :param xml_path: path to xml file to parse
    :param section_keywords: keyword strings to filter pubmed article sections
    :return: dictionary file containing relevant fields
    """
    body = pp.parse_pubmed_paragraph(xml_path)
    # Parse the metadata once up front — both branches below need it
    # (the original duplicated this call in each branch).
    metadata = pp.parse_pubmed_xml(xml_path)
    if check_for_section(body, section_keywords):
        return build_pubmed_dict(metadata, body, section_keywords)
    # No matching section: return minimal identifying metadata only.
    return {
        'pmid': metadata['pmid'],
        'pmcid': metadata['pmc'],
        'title': metadata['full_title'],
        'journal': metadata['journal'],
        'parsed': False
    }
 def get_words(self):
     """Return title + abstract + all paragraph text of self.filename as one
     ASCII-clean string (non-ASCII and '?' characters become spaces)."""
     # NOTE(review): `words` is never used below — dead local.
     words = []
     pubmed_dict = parser.parse_pubmed_xml(self.filename)
     text = pubmed_dict['full_title'] + ' ' + pubmed_dict['abstract']
     pubmed_paras_dict = parser.parse_pubmed_paragraph(self.filename)
     for paras in pubmed_paras_dict:
         text = text + paras['text']
     # encodes the unicode string to ascii and replaces the xml entity character references
     # with '?' symbols. decode() then converts this byte string to a regular string for later
     # processing - strip(punctuation) fails otherwise. replace() gets rid of all '?' symbols and
     # replaces with a space. Later the text is split into words.
     text = text.encode('ascii',
                        'replace').decode('ascii').replace('?', ' ')
     return text
Exemplo n.º 14
0
def pmc2pubtator(xml_in, pmcid, job_size, dest_dir):
    """Convert a PMC nxml file to PubTator format under dest_dir.

    Writes '<dest_dir>/<pmcid>.txt' with a '|t|' title line and a '|a|'
    abstract+fulltext line. Returns 1 on success (or when a non-empty output
    already exists) and -1 on parse failure.
    """
    try:
        pubmed_out = pp.parse_pubmed_xml(xml_in)
        ft_out = pp.parse_pubmed_paragraph(xml_in, all_paragraph=False)
    # BUG FIX: bare name `Error` is undefined — the handler itself raised
    # NameError instead of handling the parse failure; catch Exception.
    except Exception as e:
        print 'Error in parsing nxml file %s ' % xml_in
        return -1

    full_text = ''
    for paragraph in ft_out:
        if 'text' in paragraph:
            full_text += paragraph['text']

    full_text = u2a_convert(pmcid, full_text, 'fulltext')

    pmcnumber = pubmed_out['pmc']
    ttle = u2a_convert(pmcid, pubmed_out['full_title'], 'title')
    abst = u2a_convert(pmcid, pubmed_out['abstract'], 'abstract')

    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

    f_tmp_in_fn = '%s/%s.txt' % (dest_dir, pmcid)
    # Skip rewriting when a non-empty output file already exists.
    if os.path.exists(f_tmp_in_fn):
        with open(f_tmp_in_fn) as f_tmp_in:
            if len(f_tmp_in.readlines()) > 0:
                return 1

    f_tmp_in = open(f_tmp_in_fn, 'w')

    f_tmp_in.write(pmcnumber + '|t|' + ttle.strip())
    f_tmp_in.write('\n')
    f_tmp_in.write(pmcnumber + '|a|' + abst.strip() + full_text.strip())
    f_tmp_in.write('\n')
    f_tmp_in.write('\n')

    f_tmp_in.close()
    return 1
Exemplo n.º 15
0
def main():
    """Walk directory_path_chunk, parse each .nxml file, extract records."""
    try:
        for subdir, dirs, files in os.walk(directory_path_chunk):
            for file in files:
                if not file.endswith('.nxml'):
                    continue
                print(file)
                filename = os.path.join(subdir, file)
                dict_out = pp.parse_pubmed_xml(filename)
                xml_json = json.dumps(dict_out, ensure_ascii=False)
                document_info = parse_document(dict_out)
                parse_scientist(dict_out, document_info)
    except Exception:
        # deliberate best-effort: silently abort the walk on any error
        pass
Exemplo n.º 16
0
def process_file(date_update, fraction=0.01):
    """Process unzipped Pubmed Open-Access folder to parquet file

    Parses a random `fraction` of the nxml paths with pubmed_parser on Spark
    and writes article, author, and affiliation parquet files to save_dir.
    """
    print("Process Pubmed Open-Access file to parquet with fraction = %s" %
          str(fraction))
    date_update_str = date_update.strftime("%Y_%m_%d")
    # NOTE(review): list-form subprocess.call does not expand shell globs and
    # the pattern lacks the save_dir prefix — this likely removes nothing.
    if glob(os.path.join(save_dir, 'pubmed_oa_*.parquet')):
        subprocess.call(['rm', '-rf', 'pubmed_oa_*.parquet'
                         ])  # remove if folder still exist

    path_all = pp.list_xml_path(unzip_dir)
    # When sampling, keep the selected paths in their original order.
    if fraction < 1:
        n_sample = int(fraction * len(path_all))
        rand_index = random.sample(range(len(path_all)), n_sample)
        rand_index.sort()
        path_sample = [path_all[i] for i in rand_index]
    else:
        path_sample = path_all

    path_rdd = sc.parallelize(path_sample,
                              numSlices=10000)  # use only example path
    # One Row per article: parsed metadata fields plus the source file name.
    parse_results_rdd = path_rdd.map(
        lambda x: Row(file_name=os.path.basename(x), **pp.parse_pubmed_xml(x)))
    pubmed_oa_df = parse_results_rdd.toDF()
    pubmed_oa_df_sel = pubmed_oa_df[[
        'full_title', 'abstract', 'doi', 'file_name', 'pmc', 'pmid',
        'publication_year', 'publisher_id', 'journal', 'subjects'
    ]]
    pubmed_oa_df_sel.write.parquet(os.path.join(
        save_dir, 'pubmed_oa_%s.parquet' % date_update_str),
                                   mode='overwrite')

    # Author table: flatten per-article author lists, dropping empty results.
    parse_name_rdd = parse_results_rdd.map(lambda x: parse_name(x)).\
        filter(lambda x: x is not None).\
        flatMap(lambda xs: [x for x in xs])
    parse_name_df = parse_name_rdd.toDF()
    parse_name_df.write.parquet(os.path.join(
        save_dir, 'pubmed_oa_author_%s.parquet' % date_update_str),
                                mode='overwrite')

    # Affiliation table, built the same way.
    parse_affil_rdd = parse_results_rdd.map(lambda x: parse_affiliation(x)).\
        filter(lambda x: x is not None).\
        flatMap(lambda xs: [x for x in xs])
    parse_affil_df = parse_affil_rdd.toDF()
    # change to parse_affil_df
    parse_affil_df.write.parquet(os.path.join(
        save_dir, 'pubmed_oa_affiliation_%s.parquet' % date_update_str),
                                 mode='overwrite')
    print('Finished parsing Pubmed Open-Access subset')
Exemplo n.º 17
0
def extract(path):
    """Build a token-annotated doc for an article whose pmid is in `pmids`."""
    article = pp.parse_pubmed_xml(path)
    pmid = article['pmid']
    title = article['full_title']
    abstract = article['abstract']
    # Only articles in the requested pmid set are kept.
    if pmid not in pmids:
        return None
    title_toks = [tok.text for tok in nlp(title)]
    abstract_toks = [tok.text for tok in nlp(abstract)]
    return {
        'pmid': pmid,
        'title': title,
        'abstract': abstract,
        'toks': {
            'title': title_toks,
            'abstract': abstract_toks,
        }
    }
Exemplo n.º 18
0
    def merge(self):
        """Concatenate abstracts, image captions, and paragraphs from all
        PubMed (*.xml) and PMC OA (*.nxml) files under self.pubmed_path into
        self.output_filename via self.write_dicts."""
        print('PubMed path:', self.pubmed_path)

        with open(self.output_filename, mode='w', newline='\n') as ofile:

            # PubMed (MEDLINE) XML: abstracts only.
            for filename in glob.glob(os.path.join(self.pubmed_path,
                                                   '**/*.xml'),
                                      recursive=self.recursive):
                print('file:', filename)
                dicts_out = pmp.parse_medline_xml(filename)

                self.write_dicts(dicts_out, 'abstract', ofile, 'title',
                                 'pubmed_abstract')

            # PMC OA nxml: abstract, image captions, and paragraphs.
            # BUG FIX below: the bare `except:` clauses also swallowed
            # SystemExit/KeyboardInterrupt; narrowed to Exception while
            # keeping the deliberate best-effort behavior.
            for filename in glob.glob(os.path.join(self.pubmed_path,
                                                   '**/*.nxml'),
                                      recursive=self.recursive):
                print('file:', filename)

                # OA abstract
                try:
                    dicts_out = [pmp.parse_pubmed_xml(filename)]
                    self.write_dicts(dicts_out, 'abstract', ofile,
                                     'full_title', 'pmc_oa_abstract')
                except Exception:
                    pass

                # OA image caption
                try:
                    dicts_out = pmp.parse_pubmed_caption(filename)
                    self.write_dicts(dicts_out, 'fig_caption', ofile,
                                     'fig_label', 'pmc_oa_image-caption')
                except Exception:
                    pass

                # OA Paragraph
                try:
                    dicts_out = pmp.parse_pubmed_paragraph(filename,
                                                           all_paragraph=True)
                    self.write_dicts(dicts_out, 'text', ofile, 'reference_ids',
                                     'pmc_oa_paragraph')
                except Exception:
                    pass
Exemplo n.º 19
0
def parse_oa_xml(xml_file, output_file, mode):
    """Import pubmed open access XML file into prophet database.

    :param xml_file: path to the nxml input file
    :param output_file: path of the JSON file to write
    :param mode: one of 'paper', 'paragraphs', 'references', 'tables', 'figures'
    :raises ValueError: on an unrecognized mode
    """
    # For open access
    import pubmed_parser as pp

    if mode == 'paper':
        dicts_out = pp.parse_pubmed_xml(xml_file)
    elif mode == 'paragraphs':
        dicts_out = pp.parse_pubmed_paragraph(xml_file, all_paragraph=True)
    elif mode == 'references':
        dicts_out = pp.parse_pubmed_references(xml_file)
    elif mode == 'tables':
        dicts_out = pp.parse_pubmed_table(xml_file, return_xml=False)
    elif mode == 'figures':
        dicts_out = pp.parse_pubmed_caption(xml_file)
    else:
        # BUG FIX: an unknown mode previously left dicts_out undefined and
        # surfaced later as a confusing NameError; fail fast instead.
        raise ValueError("Unknown mode: %r" % (mode,))

    with open(output_file, 'w') as fp:
        json.dump(dicts_out, fp, cls=DateEncoder)
Exemplo n.º 20
0
def build_case_report_json(xml_path: str) -> dict:
    """Makes and returns a JSON object from pubmed XML files
    Args:
        xml_path (str): path to input XML file
    Returns:
        dict: case-report record with parsed metadata, text, and empty
        annotation containers to be filled by later pipeline stages.
    """
    pubmed_xml = pp.parse_pubmed_xml(xml_path)
    pubmed_paragraph = pp.parse_pubmed_paragraph(xml_path)
    # NOTE(review): parsed references are never stored below ("references"
    # stays []) — confirm whether pubmed_references should populate it.
    pubmed_references = pp.parse_pubmed_references(xml_path)
    subjects = pubmed_get_subjects(pubmed_xml)
    keywords = get_keywords(subjects)
    article_type = get_article_type(subjects)

    case_report = {
        "pmID": pubmed_xml.get("pmid"),
        "doi": pubmed_xml.get("doi"),
        "title": pubmed_xml.get("full_title"),
        "messages": [],
        "source_files": [],
        "modifications": [],
        "normalizations": [],
        # ctime            : 1351154734.5055847,
        "text": pubmed_get_text(pubmed_paragraph),
        "entities": [],
        "attributes": [],
        # date : { type: Date, default: Date.now }
        "relations": [],
        "triggers": [],
        "events": [],
        "equivs": [],
        "comments": [],
        # sentence_offsets     : [],
        # token_offsets    : [],
        "action": None,
        "abstract": pubmed_xml.get("abstract"),
        "authors": pubmed_get_authors(pubmed_xml),
        "keywords": keywords,
        "introduction": None,
        "discussion": None,
        "references": [],
        "journal": pubmed_xml.get("journal"),
        "article_type": article_type,  # For filtering.
    }

    return case_report
Exemplo n.º 21
0
def BibParser(item):
    '''
    Function is designed to take a nxml file with file path, parse it, and then return a list containing the
    specific values of interest.
    :param item: string containing the path to a nxml file
    :return: list of specific information parsed from the nxml file
    '''
    # parse the article into a metadata dictionary
    parsed = pp.parse_pubmed_xml(item)

    # publication info of interest
    pmid = parsed['pmid']
    pmcid = parsed['pmc']
    title = parsed['full_title'].encode('utf8')
    doi = parsed['doi']
    date = parsed['publication_year']
    journal = parsed['journal']
    # drop the last field of each author entry
    authors = [entry[:-1] for entry in parsed['author_list']]

    return pmcid, pmid, title, doi, date, journal, authors
Exemplo n.º 22
0
def process_file(date_update, fraction=0.01):
    """Process unzipped Pubmed Open-Access folder to parquet file

    Parses a random `fraction` of the nxml paths with pubmed_parser on Spark
    and writes article, author, and affiliation parquet files to save_dir.
    """
    print("Process Pubmed Open-Access file to parquet with fraction = %s" % str(fraction))
    date_update_str = date_update.strftime("%Y_%m_%d")
    # NOTE(review): list-form subprocess.call does not expand shell globs and
    # the pattern lacks the save_dir prefix — confirm this cleanup works.
    if glob(os.path.join(save_dir, 'pubmed_oa_*.parquet')):
        subprocess.call(['rm', '-rf', 'pubmed_oa_*.parquet']) # remove if folder still exist

    path_all = pp.list_xml_path(unzip_dir)
    if fraction < 1:
        n_sample = int(fraction * len(path_all))
        rand_index = random.sample(range(len(path_all)), n_sample)
        rand_index.sort()
        path_sample = [path_all[i] for i in rand_index]
    else:
        path_sample = path_all

    path_rdd = sc.parallelize(path_sample, numSlices=10000) # use only example path
    parse_results_rdd = path_rdd.map(lambda x: Row(file_name=os.path.basename(x), **pp.parse_pubmed_xml(x)))
    pubmed_oa_df = parse_results_rdd.toDF()
    pubmed_oa_df_sel = pubmed_oa_df[['full_title', 'abstract', 'doi',
                                     'file_name', 'pmc', 'pmid',
                                     'publication_year', 'publisher_id',
                                     'journal', 'subjects']]
    pubmed_oa_df_sel.write.parquet(os.path.join(save_dir, 'pubmed_oa_%s.parquet' % date_update_str),
                                   mode='overwrite')

    parse_name_rdd = parse_results_rdd.map(lambda x: parse_name(x)).\
        filter(lambda x: x is not None).\
        flatMap(lambda xs: [x for x in xs])
    parse_name_df = parse_name_rdd.toDF()
    parse_name_df.write.parquet(os.path.join(save_dir, 'pubmed_oa_author_%s.parquet' % date_update_str),
                                mode='overwrite')

    parse_affil_rdd = parse_results_rdd.map(lambda x: parse_affiliation(x)).\
        filter(lambda x: x is not None).\
        flatMap(lambda xs: [x for x in xs])
    parse_affil_df = parse_affil_rdd.toDF()
    # BUG FIX: the affiliation parquet was written from parse_name_df,
    # duplicating the author table; write parse_affil_df instead (matching
    # the corrected sibling copy of this function).
    parse_affil_df.write.parquet(os.path.join(save_dir, 'pubmed_oa_affiliation_%s.parquet' % date_update_str),
                                 mode='overwrite')
    print('Finished parsing Pubmed Open-Access subset')
    def process_paper(self, file, db):
        """
        Loads a pdf file in the folder, and extracts its content into an XML file, as well as into the mongodb
        database
        :param file: the name of the paper to be processed
        :param db: mongo db
        :return:
        """

        try:
            xml, t = self.get_xml(file)

            if t == 'grobid':

                NS = {'tei': 'http://www.tei-c.org/ns/1.0'}

                result = grobid_mapping.tei_to_dict(xml)

                # NOTE(review): mongo_set_dict is built but never used — the
                # update below stores `result` directly; confirm intent.
                mongo_set_dict = dict()

                if 'abstract' in result:
                    mongo_set_dict["content.abstract"] = result["abstract"]
                if 'notes' in result:
                    mongo_set_dict["content.notes"] = result["notes"]
                if 'fulltext' in result:
                    mongo_set_dict["content.fulltext"] = result["fulltext"]
                    # BUG FIX: the file was opened for writing but the text was
                    # printed to stdout and never written; direct it into f.
                    with open(cfg.folder_content_xml + file + ".txt",
                              'w') as f:
                        print(result["fulltext"], file=f)
                if 'chapters' in result:
                    mongo_set_dict["content.chapters"] = result["chapters"]

                mongo_result = db.publications.update_one({'_id': file[:10]},
                                                          {'$set': result},
                                                          upsert=True)
                print(mongo_result)

                logging.info("Processed " + file + ' with new xml')

            if t == 'pmc':
                filename_xml = cfg.source_xml + file[:-4] + ".nxml"
                meta = pp.parse_pubmed_xml(filename_xml)
                ref = pp.parse_pubmed_references(filename_xml)
                article_text = pp.parse_pubmed_paragraph(filename_xml,
                                                         all_paragraph=True)

                result = dict()

                fulltext = []
                for par in article_text:
                    fulltext.append(par['text'])

                result['title'] = meta['full_title']
                result['authors'] = meta['author_list']
                result['journal'] = meta['journal']
                result['year'] = meta['publication_year']
                result['type'] = meta['subjects']
                result['domain'] = 'biomedical'
                result['license'] = 'open_access'
                result['content.abstract'] = meta['abstract']
                result['content.keywords'] = meta['keywords']
                result['content.references'] = ref
                result['content.fulltext'] = ''.join(fulltext)

                # Strip punctuation from section names so they are safe keys.
                translator = str.maketrans('', '', string.punctuation)

                chapters = defaultdict(list)
                for par in article_text:
                    section = par['section']
                    section = section.translate(translator)
                    chapters[section].append(par['text'])

                chapters_par = []
                for key in chapters:
                    chapter_paragraphs = {
                        'paragraphs': chapters[key],
                        'title': key
                    }
                    chapters_par.append([chapter_paragraphs])

                result['content.chapters'] = chapters_par

                mongo_result = db.publications.update_one(
                    {'_id': 'PMC_' + meta['pmc']}, {'$set': result},
                    upsert=True)
                print(mongo_result)

                logging.info("Processed " + file + ' with original nxml')

        except Exception:
            # BUG FIX: logging.exception was called with a stray positional
            # argument and no %s placeholder (formatting error in the log
            # handler); exception() also sets exc_info itself.
            logging.exception('Cannot process paper %s', file)
Exemplo n.º 24
0
def parse_pubmed(src):
    """Parse pubmed xml article data and return metadata and text."""
    metadata = pubmed_parser.parse_pubmed_xml(src)
    paragraphs = pubmed_parser.parse_pubmed_paragraph(src, all_paragraph=True)
    joined = ' '.join(par['text'] for par in paragraphs)
    # collapse all runs of whitespace to single spaces
    text = ' '.join(joined.split())
    return metadata, text
Exemplo n.º 25
0
import pubmed_parser as pp


def walk(path='./sample'):
    """Yield the path of every file under *path* whose name ends with 'xml'."""
    for parent_dir, _, names in os.walk(path):
        for name in names:
            if name.endswith('xml'):
                yield os.path.join(parent_dir, name)


if __name__ == '__main__':
    corpus = []

    # Read text: collect one abstract per XML file found by walk().
    for path in walk():
        doc = pp.parse_pubmed_xml(path)
        text = doc['abstract']
        corpus.append(text)

    ################################################
    # Example with n-grams for n in [1, 2, 3].
    ################################################
    print('\n\n\nExample with n-grams for n in [1, 2, 3].')
    vectorizer = TfidfVectorizer(ngram_range=(1, 3))
    X = vectorizer.fit_transform(corpus)
    ngrams = vectorizer.get_feature_names()

    # Count how many n-grams of each length (1, 2, 3 words) were produced.
    print('# of n-grams:')
    print(collections.Counter([len(x.split()) for x in ngrams]))
    # Counter({3: 618, 2: 550, 1: 295})
Exemplo n.º 26
0
"""
Must install pubmed_parser.

    pip install git+git://github.com/titipata/pubmed_parser.git

Source: https://github.com/titipata/pubmed_parser
"""

import os
import pubmed_parser as pp

path = './sample/Am_J_Speech_Lang_Pathol/PMC6802915.nxml'

# Parse the sample article's metadata into a dict.
dict_out = pp.parse_pubmed_xml(path)

# Show which metadata fields the parser returns.
print(dict_out.keys())