Example #1
    def test_article_by_pmid(self):
        pmid = '4'
        fetch = PubMedFetcher()
        article = fetch.article_by_pmid(pmid)
        assert str(article.pmid) == pmid

        pmid = '25763451'
        fetch = PubMedFetcher()
        article = fetch.article_by_pmid(pmid)
        assert str(article.pmid) == pmid
Example #3
    def processPMID(self, description, document, text):
        """Extract PMID/PMCID references from the description, write the
        cleaned paragraphs into the document, and append the cited articles."""

        pmid = re.compile(r'PMID *(\d+)')
        list_pmid = pmid.findall(description)
        description = re.sub(r'\[PMID *\d+\]', '', description)
        pmcid = re.compile(r'PMCID *(\d+)')
        list_pmcid = pmcid.findall(description)
        description = re.sub(r'\[PMCID *\d+\]', '', description)
        # The description encodes line breaks as a literal backslash-n sequence.
        para = description.split(r'\n')
        for para_str in para:
            p = document.add_paragraph('\t')
            p.add_run(para_str)
        # The sentence reads: "Based on testing your genetic loci and using
        # internationally recognized reference systems such as PubMed, we
        # conclude that <text>."
        std_str = u"我们通过检测您的基因位点,使用PUBMED等国际公认参考系统,我们认为" + text + u"。"
        p = document.add_paragraph('\t')
        p.add_run(std_str)

        fetch = PubMedFetcher()
        for pmid in list_pmid:
            # http://www.ncbi.nlm.nih.gov/pubmed/26471457
            pm = fetch.article_by_pmid(pmid)
            title = pm.title
            title = re.sub(r'\.', '', title)
            citation = '. '.join([title, pm.journal])
            p = document.add_paragraph()
            p.add_run(citation).italic = True

        for pmcid in list_pmcid:
            pm = fetch.article_by_pmcid(pmcid)
            title = pm.title
            title = re.sub(r'\.', '', title)
            citation = '. '.join([title, pm.journal])
            p = document.add_paragraph()
            p.add_run(citation).italic = True
Example #4
class TestPubMedArticle(unittest.TestCase):
    def setUp(self):
        self.fetch = PubMedFetcher()

    def tearDown(self):
        pass

    def test_random_efetch(self):
        pmid = str(random.randint(22222222, 23333333))
        try:
            article = self.fetch.article_by_pmid(pmid)
            if article is not None:
                assert article.pmid == pmid
                assert article.title is not None
        except InvalidPMID:
            # This random PMID was invalid (which is totally OK); retry with a new one.
            self.test_random_efetch()

    def test_init1(self):
        """
        Test on the xml returned by eutils
        """
        article = PubMedArticle(xml_str1)
        assert str(article.pmid) == '4'

    def test_init2(self):
        """
        Test on the xml downloaded from medline
        """
        article = PubMedArticle(xml_str2)
        assert str(article.pmid) == '23697015'

    def test_to_dict(self):
        article = PubMedArticle(xml_str1)
        self.assertTrue(isinstance(article.to_dict(), dict))
Example #5
def consultametapub():
    fetch = PubMedFetcher()
    if not request.json:
        abort(400)
    pmid = request.json['id']
    article = fetch.article_by_pmid(pmid)
    return jsonify(output=article.title)
Example #6
    def downloadAbstract(self, keywords, file_name, max_return=1000000):
        fetcher = PubMedFetcher(cachedir=self.cache_dir, api_key=self.api_key)
        pmids = fetcher.pmids_for_query(keywords, retmax=max_return)
        
        corpus = ET.Element('corpus')
        keywords_item = ET.SubElement(corpus, 'keywords')
        keywords_item.text = keywords
        
        for pmid in pmids:
            print(pmid)
            doc = fetcher.article_by_pmid(pmid)
            title_str = self.removeHtmlTags(doc.title)
            abstract_str = self.removeHtmlTags(doc.abstract)
            
            if abstract_str == '':
                continue
            
            doc_item = ET.SubElement(corpus, 'article')
            doc_item.set('id', pmid)
            
            title_item = ET.SubElement(doc_item, 'title')
            title_item.text = title_str

            abstract_item = ET.SubElement(doc_item, 'abstract')
            abstract_item.text = abstract_str
            
        corpus_in_string = ET.tostring(corpus)
        with open(file_name, 'wb') as xml_file:
            xml_file.write(corpus_in_string)
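Example #6 assumes a removeHtmlTags helper on the same class. A minimal sketch of what such a helper might look like, assuming a regex-based tag stripper (the body below is an illustration, not the original implementation):

import re

def removeHtmlTags(self, text):
    # Strip simple markup such as <i> or <sup> from titles and abstracts;
    # return an empty string when the field is missing (e.g. no abstract),
    # which is what the abstract_str == '' check above relies on.
    if text is None:
        return ''
    return re.sub(r'<[^>]+>', '', text)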
Example #7
def search(entry):
    fetch = PubMedFetcher()
    try:
        article = fetch.article_by_pmid(entry['pmid'])
    except Exception:
        try:
            article = fetch.article_by_pmcid(entry['pmcid'])
        except Exception:
            try:
                article = fetch.article_by_doi(entry['doi'])
            except Exception:
                try:
                    pmids = fetch.pmids_for_citation(authors=entry['author'], journal=entry['journal'], year=entry['year'], volume=entry['volume'])
                    # pmids2 = fetch.pmids_for_query(entry['title'])
                    article = fetch.article_by_pmid(pmids[0])
                except Exception:
                    return None
    return article
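The nested try/except blocks in Example #7 can be flattened into a data-driven fallback chain. A sketch of the same logic (the fetcher methods come from the example; the loop structure is an assumption):

def search(entry):
    fetch = PubMedFetcher()
    # Try each identifier in order of reliability: PMID, then PMCID, then DOI.
    for key, method in (('pmid', fetch.article_by_pmid),
                        ('pmcid', fetch.article_by_pmcid),
                        ('doi', fetch.article_by_doi)):
        try:
            return method(entry[key])
        except Exception:
            continue
    # Last resort: resolve the citation fields to a PMID.
    try:
        pmids = fetch.pmids_for_citation(authors=entry['author'],
                                         journal=entry['journal'],
                                         year=entry['year'],
                                         volume=entry['volume'])
        return fetch.article_by_pmid(pmids[0])
    except Exception:
        return None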
Example #8
    def crawl_chem_abstract(self, keyword, retmax=300):
        fetch = PubMedFetcher()
        self.progress_bar_value.emit(self.count)

        pmids = fetch.pmids_for_query(keyword, retmax=retmax)

        self.textBrowser_value.emit("Scanning Iteration : " + str(retmax))
        self.textBrowser_value.emit("Expected Running Time : " + str(retmax * 2) + " seconds.")

        self.textBrowser_value.emit("PMID Scan Done!")

        json_dicts = []
        self.textBrowser_value.emit("Crawling Paper Info..")

        for i in range(len(pmids)):
            pmid = pmids[i]
            try:
                if int(i / len(pmids) * 100) > self.count:
                    self.count = int(i / len(pmids) * 100)
                    self.progress_bar_value.emit(self.count)

                try:
                    article = fetch.article_by_pmid(pmid)
                except Exception:
                    self.textBrowser_value.emit("Error reading " + str(pmid))
                    continue

                chemical = article.chemicals
                if not chemical:
                    continue

                abstract = article.abstract
                if not abstract:
                    continue
                abstract = abstract.replace(",", "*")
                if "\t" in abstract or "\n" in abstract:
                    abstract = abstract.replace("\t", " ")
                    abstract = abstract.replace("\n", " ")

                title = article.title
                if not title:
                    continue
                elif "\t" in title or "\n" in title:
                    title = title.replace("\t", " ")
                    title = title.replace("\n", " ")

                chemical["title"] = title
                chemical["abstract"] = abstract

                json_dicts.append(chemical)
            except Exception:
                continue

        self.textBrowser_value.emit("Progress Done!")
        return json_dicts
Example #9
def keyword_query(keywords=sys.argv[1],
                  savepath=sys.argv[2],
                  start_date=None,
                  end_date=None,
                  num_of_articles=1000):
    """
	keyword_query takes in a keyword string or list of keywords, and outputs 
	a dataframe with article meta data that matches the keyword query.

	**NOTE**: Long queries (~1000+ articles) will take > 5 minutes. 
	Thus, it is advisable to add additional keywords and filters to constrain the 
	search space.

	:param keywords:         A string or a list of keywords to query.
	:param savepath:         A string denoting the full path to save the file in.
	:param start_date:       A string denoting the start date.
	:param end_date:         A string denoting the end date.
	:param num_of_articles:  An integer denoting the maximum number of articles.

	:return df:              A pandas dataframe of the query.
	"""

    fetch = PubMedFetcher()

    # Get PMIDs using query
    pmids = fetch.pmids_for_query(query=keywords,
                                  since=start_date,
                                  until=end_date,
                                  retmax=num_of_articles)
    print("Number of PMIDs with search query: " + str(len(pmids)))

    # Get abstracts based on keyword search.
    # The query saves to a dictionary, using the PMID as the key.
    abstracts = {}
    for pmid in pmids:
        article = fetch.article_by_pmid(pmid)
        abstracts[pmid] = [
            article.title, article.abstract, article.journal, article.year,
            article.authors
        ]

    # Save the dictionary as a dataframe
    df = pd.DataFrame.from_dict(
        abstracts,
        orient='index',
        columns=['Title', 'Abstract', 'Journal', 'Year', 'Authors'])

    # Save the dataframe
    df.index.name = 'PMID'
    df.to_csv(savepath)

    return df
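A hedged usage sketch for keyword_query (the query string, dates, and path are placeholders; the YYYY/MM/DD date format is an assumption about what pmids_for_query accepts):

# Fetch up to 200 matching articles from 2019 and save them as CSV.
df = keyword_query(keywords='breast neoplasm',
                   savepath='results.csv',
                   start_date='2019/01/01',
                   end_date='2019/12/31',
                   num_of_articles=200)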
Example #10
    def pmid_article(ref, user=None):
        article_array = []
        if user:
            if user.email is not None:
                for pmid in Reference.pmid(ref):
                    url = "http://www.ncbi.nlm.nih.gov/pubmed/" + str(pmid)
                    lookup = PubMedLookup(url, user.email)
                    publication = Publication(lookup)
                    article_array.append(publication)

        fetch = PubMedFetcher()
        for pmid in Reference.pmid(ref):
            article = fetch.article_by_pmid(pmid)
            article_array.append(article)
                
        return article_array
Example #12
    def __init__(self, pmid):
        self.pmid = pmid
        fetch = PubMedFetcher(email='*****@*****.**')
        article = fetch.article_by_pmid(pmid)
        self.title = article.title
        self.journal = article.journal
        self.authors = article.authors
        # pm_cited - which papers cited the current paper
        try:
            self.pm_cited = fetch.related_pmids(pmid)['citedin']
        except Exception:
            self.pm_cited = None
        self.h_index = self.get_H_index() + 1
        # self.h_index = 1
        # pm_cite - which papers are cited by the current paper
        self.pm_cite = []
        print("create paper with pmid " + pmid)
Example #13
def fetch_pubmed(pub_id, id_type="pmid"):
    """
    Fetches and formats pub data from pubmed.
    """
    pm = PubMedFetcher()
    if id_type == 'doi':
        try:
            result = pm.article_by_doi(pub_id)
        except (AttributeError, MetaPubError, EutilsNCBIError):
            return None
    elif id_type == "pmid":
        try:
            result = pm.article_by_pmid(pub_id)
        except (AttributeError, InvalidPMID, EutilsNCBIError):
            return None
    elif id_type == "pmc":
        try:
            result = pm.article_by_pmcid('PMC' + str(pub_id))
        except (AttributeError, MetaPubError, EutilsNCBIError):
            return None
    result = result.to_dict()

    # Set link using DOI
    if result.get('doi'):
        result['url'] = "http://dx.doi.org/" + result.get('doi')
    else:
        result['url'] = result.get('url')

    # Provide PDF if possible
    if result.get('pmc'):
        result['pdf_url'] = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{result['pmc']}/pdf"
    

    out = {"pub_title": result.get('title'),
           "pub_authors": result.get('authors'),
           "pub_abstract": result.get('abstract'),
           "pub_doi": result.get('doi'),
           "pub_pmid": result.get('pmid'),
           "pub_pmc": pub_id if id_type == 'pmc' else None,
           "pub_url": result.get('url'),
           "pub_pdf_url": result.get('pdf_url') or 'searching',
           "pub_journal": result.get('journal'),
           "pub_date": result['history'].get('pubmed')}
    return out
Example #14
def get_info_by_PMID(PMID: str) -> Dict:
    '''This function takes a PMID str, requests information about the corresponding
    article via metapub and checks if all necessary information has been retrieved.'''
    article_dict = {}
    fetch = PubMedFetcher()
    try:
        article = fetch.article_by_pmid(PMID)
        # Save information in Dict; getattr() replaces the original eval() call.
        for info in dir(article):
            if info[0] != '_':
                article_dict[info] = getattr(article, info)
    except MetaPubError:
        pass
    #if contains_minimal_information(article_dict):
    # Add data retrieval info to the dict and return it
    article_dict = add_retrieval_information(article_dict, 'MetaPub', 'PMID',
                                             PMID)
    return article_dict
Example #15
def filter_results(results, words_in_title, limit):
    fetch = PubMedFetcher(email='*****@*****.**')
    filtered_results = []
    counter = 0
    for paper in results:
        pmid = paper.split('/')[-1].split('\n')[0]
        article = fetch.article_by_pmid(pmid)
        include = True
        for words in words_in_title:
            include = False
            for word in words:
                if word.strip().lower() in article.title.lower():
                    include = True
                    break
            if not include:
                break
        if include:
            filtered_results.append(paper)
            counter += 1
        if counter == limit:
            return filtered_results

    return filtered_results
Example #16
def get_reference_from_pmid_by_metapub(pmid: str) -> dict:
    fetch = PubMedFetcher(cachedir=cache)
    reference = None
    try:
        time.sleep(0.34)
        article = fetch.article_by_pmid(pmid)
        reference = {'journal':article.journal,
                     'authors': article.authors,
                     'issue':article.issue,
                     'first_page':article.first_page,
                     'last_page': article.last_page,
                     'volume':article.volume,
                     'year': str(article.year),
                     'abstract': replace_characters(article.abstract),
                     'title': replace_characters(article.title),
                     'doi': article.doi,
                     'pmid': article.pmid
                     }
    except Exception:
        print('*** Bad PMID:',pmid)

    return reference
Example #17
def measure_similarity_abstracts(nlp, pmid):
    def scrape_related_abstracts(pm_id):
        related_ids = scrape_related_ids(pm_id)

        if len(related_ids) > 8:
            related_ids = related_ids[:8]

        abstracts = []

        for related in related_ids:
            starter = 'https://pubmed.ncbi.nlm.nih.gov/'
            link = starter + related

            data = requests.get(link).text
            soup = BeautifulSoup(data, 'html.parser')
            abstract_header = soup.find('div', {'id': 'en-abstract'})
            try:
                abstract = str(abstract_header.p.string).strip()
                abstracts.append(abstract)
            except AttributeError:
                # Page had no parsable abstract section; skip it.
                pass

        return abstracts

    fetch = PubMedFetcher()
    exemplary = fetch.article_by_pmid(pmid).abstract

    doc1 = nlp(exemplary)

    scores = []

    for abstract in scrape_related_abstracts(pmid):
        doc2 = nlp(abstract)
        scores.append(doc1.similarity(doc2))

    return mean(scores)
Example #18
def crawl_chem_json(keyword, retmax=1000):
    fetch = PubMedFetcher()

    pmids = fetch.pmids_for_query(keyword, retmax=retmax)
    print("PMID scan Done!")

    json_dicts = []
    print("Crawling Paper Info..")

    for pmid in tqdm(pmids):
        try:
            article = fetch.article_by_pmid(pmid)
        except Exception:
            print("Error reading " + str(pmid))
            continue

        chemical = article.chemicals
        if not chemical:
            continue

        json_dicts.append(chemical)

    print("Process Done!")
    return json_dicts
Example #19
    def crawl_chem_json(self, keyword, retmax=300):
        fetch = PubMedFetcher()

        pmids = fetch.pmids_for_query(keyword, retmax=retmax)

        self.textBrowser_value.emit("Scanning Iteration : " + str(retmax))
        self.textBrowser_value.emit("Expected Running Time : " + str(retmax * 2) + " seconds.")

        self.textBrowser_value.emit("PMID Scan Done!")
        self.progress_bar_value.emit(self.count)

        json_dicts = []
        self.textBrowser_value.emit("Crawling Paper Info..")

        for i in range(len(pmids)):
            pmid = pmids[i]
            try:
                if int(i / len(pmids) * 100) > self.count:
                    self.count = int(i / len(pmids) * 100)
                    self.progress_bar_value.emit(self.count)
                try:
                    article = fetch.article_by_pmid(pmid)
                except Exception:
                    self.textBrowser_value.emit("Error reading " + str(pmid))
                    continue

                chemical = article.chemicals
                if not chemical:
                    continue

                json_dicts.append(chemical)
            except Exception:
                continue

        self.textBrowser_value.emit("Progress Done!")
        return json_dicts
Example #20
from __future__ import absolute_import, print_function, unicode_literals

import logging
from metapub import PubMedFetcher

logging.getLogger('eutils').setLevel(logging.DEBUG)
logging.getLogger('metapub').setLevel(logging.DEBUG)

fetch = PubMedFetcher()
pmbook = fetch.article_by_pmid('20301577')

print(pmbook.title)
print(pmbook.abstract)
print(pmbook.year)
Example #21
    def search(source="PubMed",
               level="basic",
               db="PubMed",
               query=None,
               unlabeled_string=None,
               affiliation=None,
               article_identifier=None,
               all_fields=None,
               author=None,
               author_identifier=None,
               book=None,
               corporate_author=None,
               create_date=None,
               completion_date=None,
               conflict_of_interest=None,
               ec_rn_number=None,
               editor=None,
               entrez_date=None,
               filter_citations=None,
               first_author_name=None,
               full_author_name=None,
               full_investigator_name=None,
               grant_number=None,
               investigator=None,
               isbn=None,
               issue=None,
               journal=None,
               language=None,
               last_author=None,
               location_id=None,
               mesh_date=None,
               mesh_major_topic=None,
               mesh_subheadings=None,
               mesh_terms=None,
               modification_date=None,
               nlm_unique_id=None,
               other_term=None,
               owner=None,
               pagination=None,
               personal_name_as_subject=None,
               pharmacological_action=None,
               place_of_publication=None,
               pmid=None,
               publisher=None,
               publication_date=None,
               publication_type=None,
               retmax=None,
               retmode=None,
               secondary_source_id=None,
               sort=None,
               subset=None,
               supplementary_concept=None,
               text_words=None,
               title=None,
               title_abstract=None,
               transliterated_title=None,
               uid=None,
               volume=None,
               raw=False,
               exact=False,
               user=None):

        if source.lower() in ["pubmed"] and level.lower() == "complex":

            return eutils_search(
                db=db,
                retmode=retmode,
                retmax=retmax,
                sort=sort,
                unlabeled_string=unlabeled_string,
                affiliation=affiliation,
                article_identifier=article_identifier,
                all_fields=all_fields,
                author=author,
                author_identifier=author_identifier,
                book=book,
                corporate_author=corporate_author,
                create_date=create_date,
                completion_date=completion_date,
                conflict_of_interest=conflict_of_interest,
                ec_rn_number=ec_rn_number,
                editor=editor,
                entrez_date=entrez_date,
                filter_citations=filter_citations,
                first_author_name=first_author_name,
                full_author_name=full_author_name,
                full_investigator_name=full_investigator_name,
                grant_number=grant_number,
                investigator=investigator,
                isbn=isbn,
                issue=issue,
                journal=journal,
                language=language,
                last_author=last_author,
                location_id=location_id,
                mesh_date=mesh_date,
                mesh_major_topic=mesh_major_topic,
                mesh_subheadings=mesh_subheadings,
                mesh_terms=mesh_terms,
                modification_date=modification_date,
                nlm_unique_id=nlm_unique_id,
                other_term=other_term,
                owner=owner,
                pagination=pagination,
                personal_name_as_subject=personal_name_as_subject,
                pharmacological_action=pharmacological_action,
                place_of_publication=place_of_publication,
                pmid=pmid,
                publisher=publisher,
                publication_date=publication_date,
                publication_type=publication_type,
                secondary_source_id=secondary_source_id,
                subset=subset,
                supplementary_concept=supplementary_concept,
                text_words=text_words,
                title=title,
                title_abstract=title_abstract,
                transliterated_title=transliterated_title,
                uid=uid,
                volume=volume,
                raw=raw,
                exact=exact)

        elif source.lower() in ["pubmed"] and level.lower() == "basic":

            # Use 'unlabeled_string' or 'query' here.
            # This function already takes completed
            # PubMed queries as strings (with
            # various connectors and constructors).
            if unlabeled_string:

                fetch = PubMedFetcher()
                pubmed_id_list = fetch.pmids_for_query(unlabeled_string)
                ref_list = []
                for pubmed_id in pubmed_id_list:
                    article = fetch.article_by_pmid(
                        pubmed_id)  # Need a faster way to get titles...
                    temp_ref = Reference(identifier=str(pubmed_id),
                                         identifier_type="PubMed ID",
                                         source="PubMed",
                                         name=article.title)
                    ref_list.append(temp_ref)
                return ref_list
            elif query:

                # This is where the basic reference
                # search redirects for now, but it
                # is relatively slow.
                fetch = PubMedFetcher()
                pubmed_id_list = fetch.pmids_for_query(query)
                ref_list = []
                for pubmed_id in pubmed_id_list:
                    try:
                        article = fetch.article_by_pmid(
                            pubmed_id)  # Need a faster way to get titles...
                        temp_ref = Reference(identifier=str(pubmed_id),
                                             identifier_type="PubMed ID",
                                             source="PubMed",
                                             name=article.title)
                        ref_list.append(temp_ref)
                    except metapub.exceptions.InvalidPMID:
                        print("An invalid PMID error occurred.")
                        # Fall back to a bare reference without a title.
                        temp_ref = Reference(identifier=str(pubmed_id),
                                             identifier_type="PubMed ID",
                                             source="PubMed")
                        ref_list.append(temp_ref)
                return ref_list

        elif source.lower() in ["google", "google scholar"]:
            return google_scholar_search(unlabeled_string)

        elif source.lower() in ["openlibrary"]:
            return openlibrary_search(unlabeled_string)
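The "# Need a faster way to get titles..." comment above can be addressed by batching: Biopython's Entrez.efetch (used in Example #24) accepts a comma-separated PMID list, so all titles arrive in one request instead of one round trip per PMID. A hedged sketch (the function name and email are placeholders, not from the original):

from Bio import Entrez

def titles_for_pmids(pmid_list):
    # One efetch call for the whole batch instead of one request per PMID.
    Entrez.email = "*****@*****.**"
    handle = Entrez.efetch(db="pubmed", id=",".join(pmid_list), retmode="xml")
    records = Entrez.read(handle)
    return {str(rec['MedlineCitation']['PMID']):
            str(rec['MedlineCitation']['Article']['ArticleTitle'])
            for rec in records['PubmedArticle']}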
Example #22
for key in file_annotations:
    tool = file_annotations[key]

    if 'identifiers' in tool and ('keywords' not in tool
                                  or len(tool['keywords']) == 0):
        identifiers = tool['identifiers']
        for identifier in identifiers:
            try:
                if 'doi' in identifier:
                    doi = identifier.replace('doi:', '')
                    pubmedid = doi2pmid(doi)
                    print('doi: ' + doi + ' --> ' + 'pmid: ' + str(pubmedid))
                    if pubmedid is not None:
                        fetch = PubMedFetcher()
                        article = fetch.article_by_pmid(pubmedid)
                        if article.mesh is not None:
                            keywords = []
                            if 'keywords' in tool:
                                keywords = tool['keywords']
                            for keyword_key in article.mesh:
                                keyword = article.mesh[keyword_key]
                                if keyword['descriptor_name'] not in top_words:
                                    keywords.append(keyword['descriptor_name'])
                            keywords = list(dict.fromkeys(keywords))
                            tool['keywords'] = keywords
                        print(article.mesh)
            except Exception as e:
                print('Error doi --' + doi)

    tools[key] = tool
Example #23
import eutils
from metapub import PubMedFetcher
fetch = PubMedFetcher()

# get the first 1000 pmids matching "breast neoplasm" keyword search
pmids = fetch.pmids_for_query('breast neoplasm', retmax=1000)

# get abstract for each article:
abstracts = {}
for pmid in pmids:
    abstracts[pmid] = fetch.article_by_pmid(pmid).abstract
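For large result sets it may help to reuse metapub's on-disk cache and pace the requests, as Examples #6 and #16 do with cachedir and time.sleep. A hedged variant of the loop above (the cache directory path is a placeholder):

import time

fetch = PubMedFetcher(cachedir='./.cache')  # repeat runs hit the cache, not NCBI

abstracts = {}
for pmid in pmids:
    time.sleep(0.34)  # stay under NCBI's ~3 requests/second guideline
    abstracts[pmid] = fetch.article_by_pmid(pmid).abstract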
Example #24
def main(folder_name, query):
    initial_location = os.getcwd()
    working_directory = folder_name
    if os.path.exists(working_directory) is True:
        os.chdir(working_directory)
        # Check if ActiveSite directory exists
        if os.path.exists("RelevantPapers") is False:
            os.mkdir("RelevantPapers")
        os.chdir("./RelevantPapers")
    else:
        os.mkdir(working_directory)
        os.chdir(working_directory)
        os.mkdir("RelevantPapers")
        os.chdir("./RelevantPapers")

# Check if the files from the Blast&Modeller exist
    if query == 'YES':
        if os.path.exists("../Blast&Modeller") is True:
            shutil.copy("../Blast&Modeller/query.fasta", "./query.fasta")
        else:
            print(
                "The query file doesn't exists! Make sure you have run the BLAST&Modeller and the query.fasta is ok!"
            )
    else:
        with open("query.fasta", "w+") as f:
            protein_sequence = query
            f.write(">query" + "\n" + protein_sequence)

# Using the query sequence copied or indicated by the user, perform a blast to identify the uniprot identifier
    try:
        query = SeqIO.read(open("query.fasta"), format="fasta")
        print("Blast search running online... This migth take a while.")
        result_handle = NCBIWWW.qblast("blastp",
                                       "swissprot",
                                       query,
                                       auto_format="XML",
                                       matrix_name="BLOSUM62",
                                       expect=0.0001,
                                       word_size="6",
                                       gapcosts="11 1",
                                       alignments=10)
        print("Blast succesfull!")
        with open("blast_result.xml", "w+") as blast_result:
            blast_result.write(result_handle.read())
    except BaseException as ex:
        print("Some error occurred: " + str(ex))
        time.sleep(5)
        quit()

# Check if blast was successful
    try:
        blastup = SearchIO.read("blast_result.xml", "blast-xml")
    except BaseException as ex:
        print("Some error occurred during your blast search.\n" + str(ex))
        time.sleep(5)
        quit()

# Extract uniprotID and the protein name
    uniprot_id = blastup[0].id.split("|")[1][0:6]
    if uniprot_id.isalpha():
        uniprot_id = input(
            "It seems there is some problem finding the Uniprot ID of your protein. "
            "If you have it, please enter it. Else, press enter to exit.")
    else:
        print("Uniprot id found (" + str(uniprot_id) +
              "). Extracting information...")
    handle = urllib.request.urlopen("https://www.uniprot.org/uniprot/" +
                                    str(uniprot_id) + ".xml")
    record = SeqIO.read(handle, "uniprot-xml")
    with open("uniprot_papers.txt", "w+") as papers1:
        for papers in record.annotations["references"]:
            papers1.write(str(papers) + "\n")

    protein = []

    for info in record.annotations:
        if info == "submittedName_fullName":
            protein.append(record.annotations[info])
        elif info == "recommendedName_fullName":
            protein.append(record.annotations[info])
        elif info == "alternativeName_fullName":
            protein.append(record.annotations[info])


# Use the protein name, term by term, as keywords and add immob*
    keywords = []
    flat_keywords = []
    for names in protein:
        for name in names:
            keywords.append(name.split(" "))

    for list_1 in keywords:
        for a in list_1:
            flat_keywords.append(a)

    keywords = ""
    for words in flat_keywords:
        keywords += (str(words) + " OR ")

    keywords = keywords[0:-3]
    keywords += "AND immob*"

    # Search on pubmed using Entrez from Biopython
    print("Search on PubMed database is going to start with: \"" + keywords +
          "\" as the keywords.")
    Entrez.email = "*****@*****.**"
    handle1 = Entrez.esearch(db="pubmed",
                             sort="relevance",
                             retmax="20",
                             retmode="xml",
                             term=keywords)
    article_list = Entrez.read(handle1)
    article_id_list = article_list["IdList"]

    uniprot_articles = []
    with open("uniprot_papers.txt", "r") as papers1:
        for line in papers1:
            if "pubmed id" in line:
                uniprot_articles.append(line[11:-1])
    # Drop empty entries without mutating the list while indexing it.
    uniprot_articles = [a for a in uniprot_articles if a != ""]

    # Use metapub package to retrieve the information and write it in a csv file
    print("Retrieving the information...")
    with open("relevant_papers.csv", "w", newline="",
              encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(
            ["Number", "Article ID", "Title", "Year", "Link", "DOI"])
        fetcher = PubMedFetcher()
        for i in range(len(article_id_list)):
            src = fetcher.article_by_pmid(article_id_list[i])
            number = i + 1
            article_id = article_id_list[i]
            title = src.title
            year = src.year
            link = "https://pubmed.ncbi.nlm.nih.gov/" + article_id_list[i]
            DOI = src.doi
            writer.writerow([number, article_id, title, year, link, DOI])
        for i in range(len(uniprot_articles)):
            up_src = fetcher.article_by_pmid(uniprot_articles[i])
            number = "Uniprot" + str(i + 1)
            article_id = uniprot_articles[i]
            title = up_src.title
            year = up_src.year
            link = "https://pubmed.ncbi.nlm.nih.gov/" + uniprot_articles[i]
            DOI = up_src.doi
            writer.writerow([number, article_id, title, year, link, DOI])

    print(
        "\tFinished running the Reference retrieval module!\n You can find your result files in "
        + str(working_directory) +
        " in the RelevantPapers folder.\n The papers are organized in the csv file named "
        "\"relevant_papers.csv\"!")
    os.chdir(initial_location)
Example #25
    hostname = urlparse(url).hostname
    write_one_mapping(hostname, jrnl)

# PII based
for jrnl, url in misc_pii.simple_formats_pii.items():
    hostname = urlparse(url).hostname
    write_one_mapping(hostname, jrnl)

# BIOCHEMSOC (VIP format)
for jrnl, value in biochemsoc.biochemsoc_journals.items():
    write_one_mapping(value['host'], jrnl)

# AAAS (VIP format)

# dummy pma for formatting
pma = fetch.article_by_pmid(27095592)
for jrnl, value in aaas.aaas_journals.items():
    hostname = urlparse(aaas.aaas_format.format(ja=value['ja'],
                                                a=pma)).hostname
    write_one_mapping(hostname, jrnl)

# One-offs we know about
write_one_mapping('joponline.org', 'J Periodontol')
write_one_mapping('medicinabuenosaires.com', 'Medicina (B Aires)')

fh.write('}\n')

# More complicated reversals...

# JAMA?
Example #26
def psearch(pmid):
    fetch = PubMedFetcher()
    ret = fetch.article_by_pmid(pmid)
    print(ret.to_dict())

df = pd.read_excel('Journals_PMID.xlsx', dtype=str)
with open('pmlist.txt', 'w') as wrtf:
    for i, column in enumerate(df):
        pmids = df[column].tolist()
        for j, pmid in enumerate(pmids):
            if str(pmid) != 'nan':
                wrtf.write(str(pmid) + '\n')
                print('Journal: ' + str(i) + ' | Abs: ' + str(j))

fetch = PubMedFetcher()  # needed at module level; psearch's fetch is local to that function
with open('abs.txt', 'w') as wrtf:
    pmids = [line.rstrip('\n') for line in open('pmlist.txt')]
    print('Total: ' + str(len(pmids)) + ' papers')
    for j, pmid in enumerate(pmids):
        try:
            download = fetch.article_by_pmid(pmid)
            if download.abstract and download.journal and download.year:
                wrtf.write(download.journal + '-!!-' + str(download.year) +
                           '-##-' + download.abstract + '\n')
                print(' | Abs: ' + str(j) + ' downloaded for: ' + pmid)
        except Exception:
            print('download fail for: ' + ' | Abs: ' + str(j) + ' pmid: ' +
                  pmid)
Example #28
def main():
    '''
    Collects all .ris citation files from the publications folder
    and generates a Publications.md in the wiki folder
    containing all important information.
    '''

    #Collect .ris files
    ris_files = []
    for ris in glob.glob(
        os.path.join(
            'publications',
            '**',
            '*.ris'   
        ),
        recursive=True
    ):
        ris_files.append(ris)

    #Extract information from ris files
    #and store it in a dictionary
    publications_dict = {}
    all_ris_doi = set()
    for fullpath in ris_files:
        head, ris = os.path.split(fullpath)
        subfolder = os.path.basename(head)
        if subfolder not in publications_dict.keys():
            publications_dict[subfolder] = {}
        with open(fullpath, 'r') as in_file:
            tmp_dict = {
                'Authors': []
            }
            doi = None
            for line in in_file:
                l = line.strip()
                if l[:2] in ['A1', 'AU']:
                    tmp_dict['Authors'].append(
                        l.split('  - ')[1]
                    )
                elif l[:2] in ['T1', 'TI']:
                    title = l.split('  - ')[1].replace('<em>', '').replace('</em>', '')
                    tmp_dict['Title'] = title
                elif l[:2] in ['Y1', 'DA','PY']:
                    year = int(l.split('  - ')[1].split('/')[0])
                    tmp_dict['Year'] = year
                elif l[:2] in ['JO', 'JF', 'T2']:
                    tmp_dict['Journal'] = l.split('  - ')[1]
                elif l[:2] in ['VL']:
                    tmp_dict['Volume'] = l.split('  - ')[1]
                elif l[:2] in ['IS']:
                    tmp_dict['Issue'] = l.split('  - ')[1]
                elif l[:2] in ['UR']:
                    tmp_dict['URL'] = l.split('  - ')[1]
                elif l[:2] in ['N2', 'AB']:
                    tmp_dict['Abstract'] = l.split('  - ')[1]
                elif l[:2] in ['DO', 'M3', 'N1']:
                    doi_line = l.split('  - ')[1].replace('doi:', '')
                    doi = '/'.join(doi_line.split('/')[-2:])
                    tmp_dict['DOI'] = doi
            for k in ['Title', 'Authors', 'Year', 'Journal', 'URL', 'DOI']:
                if k not in tmp_dict.keys():
                    print('''
                        {0} is required but could not be found
                        for {1}
                    '''.format(
                            k,
                            fullpath
                        )
                    )
                    sys.exit(1)
            for k in ['Volume', 'Issue', 'Abstract']:
                if k not in tmp_dict.keys():
                    tmp_dict[k] = ''
            publications_dict[subfolder][doi] = tmp_dict
            publications_dict[subfolder][doi]['Authors'] = '; '.join(tmp_dict['Authors'])
            citation_file = 'https://github.com/halophiles/halowiki//tree/master/publications/{0}/{1}'.format(
                subfolder,
                ris
            )
            publications_dict[subfolder][doi]['Citation'] = citation_file
            all_ris_doi.add(doi)

    #Fetching publications from PubMed
    #and store their info in the same dict
    pm_fetch = PubMedFetcher()
    hfx_pmids = pm_fetch.pmids_for_query('Haloferax volcanii')
    known_problems = [
        '29906440',
        '29888297',
        '29038254',
        '28660233',
        '25954264',
        '24240572',
    ]
    for pmid in hfx_pmids:
        if pmid in known_problems:
            continue
        try:
            article = pm_fetch.article_by_pmid(pmid)
            doi = '/'.join(article.doi.split('/')[-2:])
            tmp_dict = {}
            tmp_dict['Authors'] = '; '.join(article.authors)
            tmp_dict['Title'] = article.title.replace('<em>', '').replace('</em>', '')
            tmp_dict['Year'] = int(article.year)
            tmp_dict['Journal'] = article.journal
            tmp_dict['Volume'] = article.volume
            tmp_dict['Issue'] = article.issue
            tmp_dict['URL'] = article.url
            tmp_dict['Abstract'] = article.abstract.replace('~', '')
            tmp_dict['DOI'] = doi
            tmp_dict['Citation'] = ''
        except Exception:
            print('unsuccessful for {0}'.format(pmid))
            continue
        if doi in all_ris_doi:
            continue
        publications_dict.setdefault('Others', {})[doi] = tmp_dict

    #Write markdown file for wiki
    #based on info in dict
    output_filename = os.path.join(
        'wiki',
        'Publications.md'
    )
    total_pubs = 0
    with open(output_filename, 'w', encoding="utf-8") as out_file:
        print('# Publications [ ](# )', file=out_file)
        print('', file=out_file)
        for subheading in sorted(publications_dict.keys()):
            print(' * [{0}](#{1})'.format(
                subheading.replace ('_', ' '),
                subheading.replace(' ', '-').lower()
            ), file=out_file)
        print('', file=out_file)
        for subheading in sorted(publications_dict.keys()):
            print('## {0}'.format(subheading.replace ('_', ' ')), file=out_file)
            print('', file=out_file)
            pub_list = []
            for pub in publications_dict[subheading].keys():
                try:
                    # 'Authors' was joined into one string above, so take the
                    # first name by splitting, not by indexing the string.
                    publications_dict[subheading][pub]['Lead Author'] = publications_dict[subheading][pub]['Authors'].split('; ')[0]
                    pub_list.append(publications_dict[subheading][pub])
                except Exception:
                    print(pub)
                    print(publications_dict[subheading][pub]['Authors'])
            for pub in sorted(
                pub_list,
                key=itemgetter('Year', 'Lead Author'),
                reverse=True,
            ):
                total_pubs += 1
                print(
'''*{Title}*<br/>
{Authors}<br/>
**{Year}**<br/>
{Journal} {Volume}({Issue})<br/>
{DOI}
<details>
<summary>Abstract and Links</summary>

[Link to Publication]({URL})<br/>
[Citation]({Citation})<br/>
{Abstract}<br/>
</details><br/>

---
'''.format(**pub), file=out_file)

            print(
'''[Go to top of page](# )<br/>
----''',
                file=out_file
            )
    print('Total Number of Publications written to Publications.md:')
    print(total_pubs)
Example #29
from metapub import PubMedFetcher

fetch = PubMedFetcher()

print "Get paper information by PMID"
article = fetch.article_by_pmid('21931568')
print article.title
print article.journal, article.year, article.volume, article.issue
print article.authors

print '\nGet paper information by PMCID'

article = fetch.article_by_pmcid(2674488)
print article.title
print article.journal, article.year, article.volume, article.issue
print article.authors
import logging
import sys

from metapub.convert import pmid2doi
from metapub.exceptions import InvalidPMID

DEBUG = True

####
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("eutils").setLevel(logging.INFO)
####

fetch = PubMedFetcher()

if __name__ == '__main__':
    try:
        filename = sys.argv[1]
    except IndexError:
        print(
            'Supply a filename containing a list of PMIDs as argument to this script.'
        )
        sys.exit()

    pmids = open(filename, 'r').readlines()
    for pmid in [item.strip() for item in pmids if item.strip() != '']:
        try:
            pma = fetch.article_by_pmid(pmid)
            doi = pmid2doi(pmid) or ''
            print(','.join([pmid, doi, pma.title]))
            print('')

        except InvalidPMID:
            print(pmid, ',,INVALID')
Example #31
def crawl_abstract(keyword, outfile=None, max_iter=1000, has_chem_only=False):
    fetch = PubMedFetcher()

    pmids = fetch.pmids_for_query(keyword, retmax=max_iter)
    print("PMID scan Done!")

    if not outfile:
        outfile = "[Crawling Results]" + keyword + ".tsv"

    o_file = open(outfile, 'w', encoding="utf8")

    header = "PMID\tAuthors\tYear\tTitle\tAbstract\tURL\tCitation\tChemicals\n"
    o_file.write(header)

    print("Crawling Paper Info..")

    for pmid in tqdm(pmids):
        article = fetch.article_by_pmid(pmid)
        if not article:
            continue

        authors = article.authors_str
        if not authors:
            continue
        elif "\t" in authors or "\n" in authors:
            authors = remove_escape(authors)

        year = article.year
        if not year:
            continue
        elif "\t" in year or "\n" in year:
            year = remove_escape(year)

        title = article.title
        if not title:
            continue
        elif "\t" in title or "\n" in title:
            title = remove_escape(title)

        abstract = article.abstract
        if not abstract:
            continue
        elif "\t" in abstract or "\n" in abstract:
            abstract = remove_escape(abstract)

        url = article.url
        if not url:
            continue
        elif "\t" in url or "\n" in url:
            url = remove_escape(url)

        citation = article.citation
        if not citation:
            continue
        elif "\t" in citation or "\n" in citation:
            citation = remove_escape(citation)

        chemical = article.chemicals
        if not chemical:
            if has_chem_only:
                continue
            chemical = "None"
        else:
            chemical = str(chemical).replace("\'", "\"")
            if "\t" in chemical or "\n" in chemical:
                chemical = remove_escape(chemical)

        o_file.write(pmid + "\t")
        o_file.write(authors + "\t")
        o_file.write(year + "\t")
        o_file.write(title + "\t")
        o_file.write(abstract + "\t")
        o_file.write(url + "\t")
        o_file.write(citation + "\t")
        o_file.write(chemical + "\n")

    o_file.close()
    print("Process Done!")
    print("Result is saved in <" + outfile + ">.")