Example #1
import os
import pprint

import pubmed_parser as pp


def read_one_from_web():
    # Fetch and parse a single PubMed record by its PMID.
    pmid = "11250746"
    d = pp.parse_xml_web(pmid, save_xml=True)

    # Pull the raw XML out of the result so only the parsed fields get printed.
    xml = d["xml"]
    d.pop("xml", None)

    pprint.pprint(d)
    print()

    # Save the raw XML for later reuse.
    os.makedirs("xml", exist_ok=True)
    with open("xml/{}.xml".format(pmid), "wb") as f:
        f.write(xml)
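The PMID is hard-coded inside the helper, so trying it is a single call that prints the parsed fields and writes xml/11250746.xml:

if __name__ == "__main__":
    read_one_from_web()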
Example #2
import os
import time

import numpy as np
import pandas as pd
import pubmed_parser as pp
from tqdm import tqdm


def download_api(ids, outfile, save_dir):
    # If more than 1000 ids, split into batches of at most 1000 each.
    if len(ids) > 1000:
        chunks = np.array_split(ids, int(np.ceil(len(ids) / 1000)))
    else:
        chunks = [ids]

    # Run in chunks.
    print("Downloading from API in {} chunks".format(len(chunks)))
    i = 1
    for chunk in chunks:
        # Lists to store parsed articles and citation pairs.
        articles = []
        citations = []
        for pmid in tqdm(chunk):
            try:
                # Article metadata; sleep to stay under the API rate limit.
                time.sleep(0.25)
                articles.append(pp.parse_xml_web(pmid, save_xml=False))

                # Outgoing citations (PMIDs cited by this article).
                time.sleep(0.25)
                d_ = pp.parse_outgoing_citation_web(pmid, id_type='PMID')
                if d_:
                    for bpmid in d_['pmid_cited']:
                        citations.append({
                            'apmid': d_['doc_id'],
                            'bpmid': bpmid
                        })
            except TypeError:
                pass

        # Write each chunk to pickle files.
        df_data = pd.DataFrame(articles)
        df_data.to_pickle(os.path.join(save_dir, 'df_data_' + str(i) + '.pkl'))
        df_ref = pd.DataFrame(citations)
        df_ref.to_pickle(os.path.join(save_dir, 'df_ref_' + str(i) + '.pkl'))

        # Log the PMIDs that were successfully downloaded.
        if len(df_data) != 0:
            for pmid in df_data['pmid'].tolist():
                outfile.write(pmid + '\n')
        i += 1
    print("All done!")
Example #3
import os

import pandas as PD
import pubmed_parser as PP


def make_csv(catfile):
    headers = [
        'pmid', 'title', 'abstract', 'journal', 'affiliation', 'authors',
        'year', 'keywords', 'category'
    ]
    csv_db = PD.DataFrame(columns=headers)

    # Read the labelled PMIDs and their categories.
    labelled = PD.read_csv(catfile)
    pmids = labelled.loc[:, 'pubmed_id']
    pmid_cat = labelled.loc[:, 'paper_category']

    # Fetch each article from PubMed and append one row per paper.
    for n, c in zip(pmids, pmid_cat):
        paper = PP.parse_xml_web(n)
        paper_row = [
            n, paper['title'], paper['abstract'], paper['journal'],
            paper['affiliation'], paper['authors'], paper['year'],
            paper['keywords'], c
        ]
        csv_db.loc[len(csv_db)] = paper_row

    outfile_name = os.path.splitext(catfile)[0] + '_processed.csv'
    csv_db.to_csv(outfile_name, index=False)
    return outfile_name
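With the labels file in hand, the pipeline reduces to one call; labels.csv is an illustrative file name for a CSV with pubmed_id and paper_category columns:

processed = make_csv('labels.csv')
print(PD.read_csv(processed).head())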
Example #4
import pandas as PD
import pubmed_parser as PP


def make_csv_(catfile, outfile_name):
    headers = [
        'pmid', 'title', 'abstract', 'journal', 'affiliation', 'authors',
        'year', 'keywords', 'category'
    ]
    csv_db = PD.DataFrame(columns=headers)

    # Here catfile is already a DataFrame with 'id' and 'label' columns.
    labelled = catfile
    pmids = labelled.loc[:, 'id']
    pmid_cat = labelled.loc[:, 'label']

    # Fetch each article and report progress every 10 papers.
    counter = 0
    for n, c in zip(pmids, pmid_cat):
        paper = PP.parse_xml_web(n)
        paper_row = [
            n, paper['title'], paper['abstract'], paper['journal'],
            paper['affiliation'], paper['authors'], paper['year'],
            paper['keywords'], c
        ]
        csv_db.loc[len(csv_db)] = paper_row
        counter += 1
        if counter % 10 == 0:
            print(counter, end=' ')

    csv_db.to_csv(outfile_name, index=False)
    return outfile_name
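Unlike make_csv above, this variant takes an already-loaded DataFrame instead of a file path; a sketch with placeholder category labels:

labelled = PD.DataFrame({
    'id': ['11250746', '27231887'],          # PMIDs from the examples above
    'label': ['category_a', 'category_b'],   # placeholder labels
})
make_csv_(labelled, 'labelled_processed.csv')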
Example #5
import pubmed_parser
import streamlit as st

# BIO tag-to-index map; entries before 'B-Intervention' are omitted in this excerpt.
tag2idx = {
    # ...
    'B-Intervention': 7,
    'I-Intervention': 8,
    'B-Comparator': 9,
    'I-Comparator': 10,
    'B-Outcome': 11,
    'I-Outcome': 12
}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

#%% App
st.header('PICO extraction for in vivo abstracts')
# pmid = st.text_input('Input one PMID: ', 23326526)
pmid = st.text_input('Input one PMID: ', 27231887)

try:
    xml = pubmed_parser.parse_xml_web(pmid)
    title = xml['title']
    text = xml['abstract']
    if text == "":
        st.text("No abstract available!")
    else:
        # st.write(title)
        # st.write(text)
        # Split the abstract into sentences, then extract PICO phrases;
        # sent_detect, pico_extract, and tup2dict are defined elsewhere in the app.
        text = sent_detect(text, sent_pth)
        tup, _, _ = pico_extract(text, pth_path, idx2tag)
        res = tup2dict(tup)

        st.write("### Extracted PICO text ###")
except Exception:
    st.text("Failed to fetch or parse this PMID.")
Example #6
import pubmed_parser as pp


def pmid2text(pmid):
    # Return the title and abstract for a given PMID.
    outputs = pp.parse_xml_web(pmid, save_xml=False)
    return outputs["title"], outputs["abstract"]
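Usage is a single call; the PMID below is the one from Example #1:

title, abstract = pmid2text('11250746')
print(title)
print(abstract[:200])  # first 200 characters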
Example #7
    def get_article_info_dict_from_pubmed(self):
        """
        :param store_file_name:
        :param pmid_list:
        :return: a dictionary whose key is pmid, value is information of referred article,
            value is also a dictionary, including keys below:
            title : title
            abstract : abstract
            journal : journal
            affiliation : affiliation of first author
            authors : string of authors, separated by ;
            year : Publication year
            keywords : keywords or MESH terms of the article

        store path: 'articles_info_' + store_file_name
        """
        print('------------------------------------------------------------')
        print('Get article info:')

        def clean(s):
            # str.replace returns a new string, so the result must be reassigned.
            for ch in '*¶†§-<>#':
                s = s.replace(ch, ' ')
            s = s.strip()
            if s == '':
                return ''
            # Drop a leading comma or period left over from splitting.
            if s[0] == ',' or s[0] == '.':
                s = s[1:]
            return s

        begin_time = time.time()
        store_file_name = self.store_file_name + 'articles' + '.npy'
        article_info_dict = dict()
        if os.path.exists(store_file_name):
            # np.save wraps the dict in a 0-d object array; .flat[0] unwraps it.
            article_info_dict = np.load(store_file_name,
                                        allow_pickle=True).flat[0]
            if not self.update:
                return article_info_dict

        delete_list = article_info_dict.keys() - self.pmid_list
        add_list = self.pmid_list - article_info_dict.keys()
        print("need to delete %d articles" % len(delete_list))
        print("need to add %d articles" % len(add_list))
        for pmid in delete_list:
            del article_info_dict[pmid]
        i = 0
        for pmid in add_list:
            # Checkpoint progress every 20 articles.
            if i % 20 == 0:
                np.save(store_file_name, article_info_dict)
                print('\n%d new articles are saved' % i)
            try:
                article_info = pp.parse_xml_web(pmid, save_xml=False)
            except KeyboardInterrupt:
                # Save progress before exiting with return code -1.
                np.save(store_file_name, article_info_dict)
                exit(-1)
            except Exception:
                print('\nerror in ', pmid)
            else:

                # Build a cleaned author list from the ';'-separated string.
                temp_list = []
                for item in article_info['authors'].split(';'):
                    item = clean(item)
                    if item == '':
                        continue
                    temp_list.append(item)
                article_info['authors'] = temp_list

                # Same cleaning for the ';'-separated affiliations.
                temp_list = []
                for item in article_info['affiliation'].split(';'):
                    item = clean(item)
                    if item == '':
                        continue
                    temp_list.append(item)
                article_info['affiliation'] = temp_list

                # Keywords are ';'-separated and may contain ','-separated sublists.
                temp_list = []
                for item in article_info['keywords'].split(';'):
                    item = clean(item)
                    if item == '':
                        continue
                    if "," in item:
                        temp_list += [p.strip() for p in item.split(',')]
                    else:
                        temp_list.append(item.strip().lower())
                article_info['keywords'] = temp_list

                article_info_dict[pmid] = article_info
                i += 1
                print(i, end=' ')

        np.save(store_file_name, article_info_dict)
        print('\n%d new articles are saved' % i)
        print('Get article info done, used time:', time.time() - begin_time)
        print('------------------------------------------------------------')
        return article_info_dict
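The method belongs to a larger class, so calling it requires an instance; obj below is a hypothetical object providing store_file_name, pmid_list, and update. The returned dict can then be consumed directly:

# obj is hypothetical; it must define store_file_name, pmid_list, and update
info = obj.get_article_info_dict_from_pubmed()
for pmid, article in info.items():
    print(pmid, article['year'], article['title'])
    print('; '.join(article['authors']))  # authors were normalized to a list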