Example #1
import unittest

from metapub import PubMedFetcher


class TestPubmedFetcher(unittest.TestCase):
    def setUp(self):
        self.fetch = PubMedFetcher()

    def tearDown(self):
        pass

    def test_pmids_for_query(self):
        params = {'journal': 'PLoS One', 'year': 2013, 'author': 'McMurry AJ'}

        pmids = self.fetch.pmids_for_query(**params)
        assert len(pmids) == 1
        assert pmids[0] == '23533569'

        # this pubmed ID was deleted
        params = {
            'TA': 'Journal of Neural Transmission',
            'pdat': 2014,
            'vol': 121,
            'aulast': 'Freitag'
        }

        pmids = self.fetch.pmids_for_query(**params)
        assert len(pmids) == 0

    def test_medical_genetics_query(self):
        # we presume that the results for a fixed year prior to this one will not change.
        results = self.fetch.pmids_for_medical_genetics_query(
            'Brugada Syndrome', 'diagnosis', debug=True, year=2013)
        assert '24775617' in results

    def test_clinical_query(self):
        # we presume that the results for a fixed year prior to this one will not change.
        results = self.fetch.pmids_for_clinical_query(
            'Global developmental delay',
            'etiology',
            'narrow',
            debug=True,
            year=2013)
        assert results[0] == '24257216'
        assert results[1] == '24123848'
        assert results[2] == '24089199'

    def test_specified_return_slice(self):
        pmids = self.fetch.pmids_for_query(since='2015/3/1', retmax=1000)
        assert len(pmids) == 1000

        pmids = self.fetch.pmids_for_query(since='2015/3/1',
                                           retstart=200,
                                           retmax=500)
        assert len(pmids) == 500

    def test_pmc_only(self):
        params = {'mesh': 'breast neoplasm'}
        stuff = self.fetch.pmids_for_query(since='2015/1/1',
                                           until='2015/3/1',
                                           pmc_only=True,
                                           **params)
        print(stuff)
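
To run the test class above directly, a standard unittest entry point can be appended (a minimal sketch; nothing here is metapub-specific):

if __name__ == '__main__':
    unittest.main()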
Example #2
    def downloadAbstract(self, keywords, file_name, max_return=1000000):
        fetcher = PubMedFetcher(cachedir=self.cache_dir, api_key=self.api_key)
        # retmax expects an integer, so avoid the float literal 1e+6.
        pmids = fetcher.pmids_for_query(keywords, retmax=max_return)
        
        corpus = ET.Element('corpus')
        keywords_item = ET.SubElement(corpus, 'keywords')
        keywords_item.text = keywords
        
        for pmid in pmids:
            print(pmid)
            # article_by_pmid already performs the E-utilities fetch, so the
            # redundant private _eutils_article_by_pmid call is dropped.
            doc = fetcher.article_by_pmid(pmid)
            title_str = self.removeHtmlTags(doc.title)
            abstract_str = self.removeHtmlTags(doc.abstract or '')
            
            if abstract_str == '':
                continue
            
            doc_item = ET.SubElement(corpus, 'article')
            doc_item.set('id', pmid)
            
            title_item = ET.SubElement(doc_item, 'title')
            title_item.text = title_str

            abstract_item = ET.SubElement(doc_item, 'abstract')
            abstract_item.text = abstract_str
            
        corpus_in_string = ET.tostring(corpus)
        with open(file_name, 'wb') as xml_file:
            xml_file.write(corpus_in_string)
Example #3
    def crawl_chem_abstract(self, keyword, retmax=300):
        fetch = PubMedFetcher()
        self.progress_bar_value.emit(self.count)

        pmids = fetch.pmids_for_query(keyword, retmax=retmax)

        self.textBrowser_value.emit("Scanning Iteration : " + str(retmax))
        self.textBrowser_value.emit("Expected Running Time : " + str(retmax * 2) + " seconds.")

        self.textBrowser_value.emit("PMID Scan Done!")

        json_dicts = []
        self.textBrowser_value.emit("Crawling Paper Info..")

        for i in range(len(pmids)):
            pmid = pmids[i]
            try:
                if int(i / len(pmids) * 100) > self.count:
                    self.count = int(i / len(pmids) * 100)
                    self.progress_bar_value.emit(self.count)

                try:
                    article = fetch.article_by_pmid(pmid)
                except Exception:
                    self.textBrowser_value.emit("Error reading " + str(pmid))
                    continue

                chemical = article.chemicals
                if not chemical:
                    continue

                # Check for a missing abstract *before* transforming it;
                # calling .replace() on a None abstract raises AttributeError.
                abstract = article.abstract
                if not abstract:
                    continue
                abstract = abstract.replace(",", "*")
                abstract = abstract.replace("\t", " ").replace("\n", " ")

                title = article.title
                if not title:
                    continue
                elif "\t" in title or "\n" in title:
                    title = title.replace("\t", " ")
                    title = title.replace("\n", " ")

                chemical["title"] = title
                chemical["abstract"] = abstract

                json_dicts.append(chemical)
            except Exception:
                continue

        self.textBrowser_value.emit("Progress Done!")
        return json_dicts
Example #4
import sys

import pandas as pd
from metapub import PubMedFetcher


# NOTE: the defaults below read sys.argv at import time, so this module
# expects command-line arguments when loaded as a script.
def keyword_query(keywords=sys.argv[1],
                  savepath=sys.argv[2],
                  start_date=None,
                  end_date=None,
                  num_of_articles=1000):
    """
	keyword_query takes in a keyword string or list of keywords, and outputs 
	a dataframe with article meta data that matches the keyword query.

	**NOTE**: Long queries (~1000+ articles) will take > 5 minutes. 
	Thus, it is advisable to add additional keywords and filters to constrain the 
	search space.

	:param keywords:         A string or a list of keywords to query.
	:param savepath:         A string denoting the full path to save the file in.
	:param start_date:       A string denoting the start date.
	:param end_date:         A string denoting the end date.
	:param num_of_articles:  An integer denoting the maximum number of articles.

	:return df:              A pandas dataframe of the query.
	"""

    fetch = PubMedFetcher()

    # Get PMIDs using query
    pmids = fetch.pmids_for_query(query=keywords,
                                  since=start_date,
                                  until=end_date,
                                  retmax=num_of_articles)
    print("Number of PMIDs with search query: " + str(len(pmids)))

    # Get abstracts based on keyword search.
    # The query saves to a dictionary, using the PMID as the key.
    abstracts = {}
    for pmid in pmids:  # avoid shadowing the builtin id()
        article = fetch.article_by_pmid(pmid)
        abstracts[pmid] = [
            article.title, article.abstract, article.journal, article.year,
            article.authors
        ]

    # Save the dictionary as a dataframe
    df = pd.DataFrame.from_dict(
        abstracts,
        orient='index',
        columns=['Title', 'Abstract', 'Journal', 'Year', 'Authors'])

    # Save the dataframe
    df.index.name = 'PMID'
    df.to_csv(savepath)

    return df
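
A plausible invocation that bypasses the sys.argv defaults (the query string and output path are placeholders):

df = keyword_query('breast neoplasm',
                   'breast_neoplasm_articles.csv',
                   start_date='2015/01/01',
                   end_date='2015/06/30',
                   num_of_articles=200)
print(df.head())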
Example #5
import spacy

from metapub import PubMedFetcher


def measure_all_from_query(query):
    fetch = PubMedFetcher()
    pm_ids = fetch.pmids_for_query(query)

    # Keep at most the first eight results; slicing is safe on short lists.
    pm_ids = pm_ids[:8]

    scores = []
    nlp = spacy.load('en_core_sci_lg')
    for pmid in pm_ids:
        scores.append((pmid, measure_similarity_abstracts(nlp, pmid)))

    return scores
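
An illustrative call, assuming the en_core_sci_lg model is installed and measure_similarity_abstracts is defined elsewhere in the module:

for pmid, score in measure_all_from_query('brugada syndrome'):
    print(pmid, score)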
Example #6
def search(request):
    ctx = {
        'query_saved': None,
        'saved_pmids': [],
        'total_saved_queries': SearchStash.objects.filter(user=request.user).count(),
    }
    f = PubMedFetcher()
    initial = {}
    query_saved = None
    try:
        query_saved = SearchStash.objects.get(search_used=request.GET.get('q'))
    except SearchStash.DoesNotExist:
        # Narrowed from a bare except to the lookup failure it is meant to swallow.
        pass
    else:
        ctx['saved_pmids'] = [pub.pmid for pub in query_saved.pmids.all()]
        ctx['query_saved'] = query_saved

    if not ctx['query_saved'] and request.GET.get('q'):
        messages.add_message(request, messages.INFO, '<strong>Note:</strong> You must click "Save Query" above to start capturing publications for this query.')
    if request.GET.get('q', None):
        keywords = request.GET.get('q', None)
        initial['q'] = request.GET.get('q')
        pmids = f.pmids_for_query(query=keywords, retmax=100)
        pmid_list = []
        for pmid in pmids:
            new_pmid = Publication.objects.get_or_create(pmid=pmid)[0]
            row = {
                'pmid': new_pmid.pmid,
            }
            pmid_list.append(row)
        ctx['keywords'] = keywords
        ctx['pmids'] = pmids
        ctx['pmid_list'] = pmid_list
        ctx['result_count'] = len(pmids)
    form = PubMedForm(initial=initial)
    ctx['form'] = form
    return render(request, 'lum/search.html', ctx)
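
Wiring the view into a URLconf might look like the following sketch (module paths and URL names are placeholders):

from django.urls import path

from . import views

urlpatterns = [
    path('search/', views.search, name='search'),
]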
Example #7
    def crawl_chem_json(self, keyword, retmax=300):
        fetch = PubMedFetcher()

        pmids = fetch.pmids_for_query(keyword, retmax=retmax)

        self.textBrowser_value.emit("Scanning Iteration : " + str(retmax))
        self.textBrowser_value.emit("Expected Running Time : " + str(retmax * 2) + " seconds.")

        self.textBrowser_value.emit("PMID Scan Done!")
        self.progress_bar_value.emit(self.count)

        json_dicts = []
        self.textBrowser_value.emit("Crawling Paper Info..")

        for i in range(len(pmids)):
            pmid = pmids[i]
            try:
                if int(i / len(pmids) * 100) > self.count:
                    self.count = int(i / len(pmids) * 100)
                    self.progress_bar_value.emit(self.count)
                try:
                    article = fetch.article_by_pmid(pmid)
                except Exception:
                    self.textBrowser_value.emit("Error reading " + str(pmid))
                    continue

                chemical = article.chemicals
                if not chemical:
                    continue

                json_dicts.append(chemical)
            except Exception:
                continue

        self.textBrowser_value.emit("Progress Done!")
        return json_dicts
Example #8
from metapub import PubMedFetcher
from tqdm import tqdm


def crawl_chem_json(keyword, retmax=1000):
    fetch = PubMedFetcher()

    pmids = fetch.pmids_for_query(keyword, retmax=retmax)
    print("PMID scan Done!")

    json_dicts = []
    print("Crawling Paper Info..")

    for pmid in tqdm(pmids):
        try:
            article = fetch.article_by_pmid(pmid)
        except Exception:
            print("Error reading " + str(pmid))
            continue

        chemical = article.chemicals
        if not chemical:
            continue

        json_dicts.append(chemical)

    print("Process Done!")
    return json_dicts
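
A quick way to exercise the function and persist the result (the query and filename are arbitrary):

import json

chems = crawl_chem_json('aspirin', retmax=50)
with open('aspirin_chemicals.json', 'w') as f:
    json.dump(chems, f, indent=2)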
Example #9
# coding: utf-8

# In[14]:

get_ipython().system('pip install metapub')

from metapub import PubMedFetcher
from metapub import MedGenFetcher
from metapub import ClinVarFetcher

# In[15]:

fetch = PubMedFetcher()

# get the first 5 pmids matching "breast neoplasm" keyword search
pmids = fetch.pmids_for_query('breast neoplasm', retmax=5)

# get abstract for each article:
for pmid in pmids:
    article = fetch.article_by_pmid(pmid)
    print(article.title)
    print(article.journal, article.year, article.volume, article.issue)
    print(article.authors)

# In[16]:
'''
The MedGen (medical genetics) database is a clinical dictionary linking medical 
concepts across multiple medical ontologies and dictionaries such as OMIM and SNOMED.
'''

fetch = MedGenFetcher()
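
The notebook cell breaks off here. Based on metapub's MedGen demo scripts, a typical continuation might look like the sketch below; treat the method and attribute names (uids_by_term, concept_by_uid, concept.name) as assumptions rather than something confirmed by this snippet.

# Sketch (assumed API): map a clinical term to MedGen concept UIDs,
# then fetch each concept record.
uids = fetch.uids_by_term('breast neoplasm')
for uid in uids[:5]:
    concept = fetch.concept_by_uid(uid)
    print(uid, concept.name)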
Example #10
import sys

from metapub import PubMedFetcher

try:
    query = sys.argv[1]
except IndexError:
    print("supply query in quotation marks as argument to this script.")
    sys.exit()

fetch = PubMedFetcher()

filename = 'PMID_lists/%s_all.txt' % query.replace(' ', '_')
pmid_file = open(filename, 'w')
print(filename)

retstart = 0
retmax = 500
while True:
    pmids = fetch.pmids_for_query(query,
                                  pmc_only=False,
                                  retstart=retstart,
                                  retmax=retmax)
    if len(pmids) > 0:
        pmid_file.write('\n')
        pmid_file.write('\n'.join(pmids))
        # Advance by the page size; E-utilities retstart is 0-based, so
        # adding retmax + 1 would skip one record per page.
        retstart = retstart + retmax
    else:
        break

pmid_file.close()

filename = 'PMID_lists/%s_pmc_only.txt' % query.replace(' ', '_')
pmid_file = open(filename, 'w')
print(filename)

retstart = 0
retmax = 500
while True:
    # Mirrors the loop above, restricted to articles available in PubMed
    # Central (the source was truncated here; restored on that assumption).
    pmids = fetch.pmids_for_query(query,
                                  pmc_only=True,
                                  retstart=retstart,
                                  retmax=retmax)
    if len(pmids) > 0:
        pmid_file.write('\n')
        pmid_file.write('\n'.join(pmids))
        retstart = retstart + retmax
    else:
        break

pmid_file.close()
Example #11
    return [t + " " + diagnosis for t in l]


from metapub.exceptions import MetaPubError


def get_abstract_from_pmid(pmid, fetcher):
    """Get abstract of an article given its pmid."""
    try:
        return fetcher.article_by_pmid(pmid).abstract
    except MetaPubError:
        # Catch metapub's own error type specifically, not a bare except.
        return None


depression_queries = make_queries(
    "depression", "drug_lists/depression/depression_targets.txt")
fetch = PubMedFetcher()
depression_pmids = [fetch.pmids_for_query(q) for q in depression_queries]
with open("temp/depression_pmids.json", "w") as dpjson:
    dpjson.write(json.dumps(depression_pmids))
# Not sure this part works yet
depression_abstracts = [[get_abstract_from_pmid(p, fetch) for p in l]
                        for l in depression_pmids]
with open("temp/depression_abstracts.json", "w") as dajson:
    dajson.write(json.dumps(depression_abstracts))

schizophrenia_queries = make_queries(
    "schizophrenia", "drug_lists/schizophrenia/schizophrenia_targets.txt")
fetch = PubMedFetcher()
schizophrenia_pmids = [fetch.pmids_for_query(q) for q in schizophrenia_queries]
with open("temp/schizophrenia_pmids.json", "w") as spjson:
    spjson.write(json.dumps(schizophrenia_pmids))
schizophrenia_abstracts = [[get_abstract_from_pmid(p, fetch) for p in l]
                           for l in schizophrenia_pmids]
# (Completed to mirror the depression block above; the source was cut off.)
with open("temp/schizophrenia_abstracts.json", "w") as sajson:
    sajson.write(json.dumps(schizophrenia_abstracts))
Example #12
    def search(source="PubMed",
               level="basic",
               db="PubMed",
               query=None,
               unlabeled_string=None,
               affiliation=None,
               article_identifier=None,
               all_fields=None,
               author=None,
               author_identifier=None,
               book=None,
               corporate_author=None,
               create_date=None,
               completion_date=None,
               conflict_of_interest=None,
               ec_rn_number=None,
               editor=None,
               entrez_date=None,
               filter_citations=None,
               first_author_name=None,
               full_author_name=None,
               full_investigator_name=None,
               grant_number=None,
               investigator=None,
               isbn=None,
               issue=None,
               journal=None,
               language=None,
               last_author=None,
               location_id=None,
               mesh_date=None,
               mesh_major_topic=None,
               mesh_subheadings=None,
               mesh_terms=None,
               modification_date=None,
               nlm_unique_id=None,
               other_term=None,
               owner=None,
               pagination=None,
               personal_name_as_subject=None,
               pharmacological_action=None,
               place_of_publication=None,
               pmid=None,
               publisher=None,
               publication_date=None,
               publication_type=None,
               retmax=None,
               retmode=None,
               secondary_source_id=None,
               sort=None,
               subset=None,
               supplementary_concept=None,
               text_words=None,
               title=None,
               title_abstract=None,
               transliterated_title=None,
               uid=None,
               volume=None,
               raw=False,
               exact=False,
               user=None):

        if source.lower() in ["pubmed"] and level.lower() == "complex":

            return eutils_search(
                db=db,
                retmode=retmode,
                retmax=retmax,
                sort=sort,
                unlabeled_string=unlabeled_string,
                affiliation=affiliation,
                article_identifier=article_identifier,
                all_fields=all_fields,
                author=author,
                author_identifier=author_identifier,
                book=book,
                corporate_author=corporate_author,
                create_date=create_date,
                completion_date=completion_date,
                conflict_of_interest=conflict_of_interest,
                ec_rn_number=ec_rn_number,
                editor=editor,
                entrez_date=entrez_date,
                filter_citations=filter_citations,
                first_author_name=first_author_name,
                full_author_name=full_author_name,
                full_investigator_name=full_investigator_name,
                grant_number=grant_number,
                investigator=investigator,
                isbn=isbn,
                issue=issue,
                journal=journal,
                language=language,
                last_author=last_author,
                location_id=location_id,
                mesh_date=mesh_date,
                mesh_major_topic=mesh_major_topic,
                mesh_subheadings=mesh_subheadings,
                mesh_terms=mesh_terms,
                modification_date=modification_date,
                nlm_unique_id=nlm_unique_id,
                other_term=other_term,
                owner=owner,
                pagination=pagination,
                personal_name_as_subject=personal_name_as_subject,
                pharmacological_action=pharmacological_action,
                place_of_publication=place_of_publication,
                pmid=pmid,
                publisher=publisher,
                publication_date=publication_date,
                publication_type=publication_type,
                secondary_source_id=secondary_source_id,
                subset=subset,
                supplementary_concept=supplementary_concept,
                text_words=text_words,
                title=title,
                title_abstract=title_abstract,
                transliterated_title=transliterated_title,
                uid=uid,
                volume=volume,
                raw=raw,
                exact=exact)

        elif source.lower() in ["pubmed"] and level.lower() == "basic":

            # Use 'unlabeled_string' or 'query' here.
            # This function already takes completed
            # PubMed queries as strings (with
            # various connectors and constructors).
            if unlabeled_string:

                fetch = PubMedFetcher()
                pubmed_id_list = fetch.pmids_for_query(unlabeled_string)
                ref_list = []
                for pubmed_id in pubmed_id_list:
                    article = fetch.article_by_pmid(
                        pubmed_id)  # Need a faster way to get titles...
                    temp_ref = Reference(identifier=str(pubmed_id),
                                         identifier_type="PubMed ID",
                                         source="PubMed",
                                         name=article.title)
                    ref_list.append(temp_ref)
                return ref_list
            elif query:

                # This is where the basic reference
                # search redirects for now, but it
                # is relatively slow.
                fetch = PubMedFetcher()
                pubmed_id_list = fetch.pmids_for_query(query)
                ref_list = []
                for pubmed_id in pubmed_id_list:
                    try:
                        article = fetch.article_by_pmid(
                            pubmed_id)  # Need a faster way to get titles...
                    except metapub.exceptions.InvalidPMID:
                        print("An invalid PMID error occurred.")
                        temp_ref = Reference(identifier=str(pubmed_id),
                                             identifier_type="PubMed ID",
                                             source="PubMed")
                        ref_list.append(temp_ref)
                    else:
                        # Build the titled Reference only after a successful
                        # fetch, and append exactly one Reference per PMID.
                        temp_ref = Reference(identifier=str(pubmed_id),
                                             identifier_type="PubMed ID",
                                             source="PubMed",
                                             name=article.title)
                        ref_list.append(temp_ref)
                return ref_list

        elif source.lower() in ["google", "google scholar"]:
            return google_scholar_search(unlabeled_string)

        elif source.lower() in ["openlibrary"]:
            return openlibrary_search(unlabeled_string)
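
A minimal illustrative call of the basic PubMed path above (the query string is arbitrary, and search is assumed to be importable as a plain function):

refs = search(source="PubMed", level="basic",
              query='"breast neoplasms"[MeSH] AND 2015[PDAT]')
for ref in refs[:5]:
    print(ref.identifier, ref.name)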
Example #13
def main():
    '''
    Collects all .ris citation files from the publications folder
    and generates a Publications.md in the wiki folder
    containing all important information.
    '''

    #Collect .ris files
    ris_files = []
    for ris in glob.glob(
        os.path.join(
            'publications',
            '**',
            '*.ris'   
        ),
        recursive=True
    ):
        ris_files.append(ris)

    #Extract information from ris files
    #and store it in a dictionary
    publications_dict = {}
    all_ris_doi = set()
    for fullpath in ris_files:
        head, ris = os.path.split(fullpath)
        subfolder = os.path.basename(head)
        if subfolder not in publications_dict.keys():
            publications_dict[subfolder] = {}
        with open(fullpath, 'r') as in_file:
            tmp_dict = {
                'Authors': []
            }
            doi = None
            for line in in_file:
                l = line.strip()
                if l[:2] in ['A1', 'AU']:
                    tmp_dict['Authors'].append(
                        l.split('  - ')[1]
                    )
                elif l[:2] in ['T1', 'TI']:
                    title = l.split('  - ')[1].replace('<em>', '').replace('</em>', '')
                    tmp_dict['Title'] = title
                elif l[:2] in ['Y1', 'DA','PY']:
                    year = int(l.split('  - ')[1].split('/')[0])
                    tmp_dict['Year'] = year
                elif l[:2] in ['JO', 'JF', 'T2']:
                    tmp_dict['Journal'] = l.split('  - ')[1]
                elif l[:2] in ['VL']:
                    tmp_dict['Volume'] = l.split('  - ')[1]
                elif l[:2] in ['IS']:
                    tmp_dict['Issue'] = l.split('  - ')[1]
                elif l[:2] in ['UR']:
                    tmp_dict['URL'] = l.split('  - ')[1]
                elif l[:2] in ['N2', 'AB']:
                    tmp_dict['Abstract'] = l.split('  - ')[1]
                elif l[:2] in ['DO', 'M3', 'N1']:
                    doi_line = l.split('  - ')[1].replace('doi:', '')
                    doi = '/'.join(doi_line.split('/')[-2:])
                    tmp_dict['DOI'] = doi
            for k in ['Title', 'Authors', 'Year', 'Journal', 'URL', 'DOI']:
                if k not in tmp_dict.keys():
                    print('{0} is required but could not be found '
                          'for {1}'.format(k, fullpath))
                    sys.exit(1)
            for k in ['Volume', 'Issue', 'Abstract']:
                if k not in tmp_dict.keys():
                    tmp_dict[k] = ''
            publications_dict[subfolder][doi] = tmp_dict
            publications_dict[subfolder][doi]['Authors'] = '; '.join(tmp_dict['Authors'])
            citation_file = 'https://github.com/halophiles/halowiki//tree/master/publications/{0}/{1}'.format(
                subfolder,
                ris
            )
            publications_dict[subfolder][doi]['Citation'] = citation_file
            all_ris_doi.add(doi)

    #Fetching publications from PubMed
    #and store their info in the same dict
    pm_fetch = PubMedFetcher()
    hfx_pmids = pm_fetch.pmids_for_query('Haloferax volcanii')
    known_problems = [
        '29906440',
        '29888297',
        '29038254',
        '28660233',
        '25954264',
        '24240572',
    ]
    for pmid in hfx_pmids:
        if pmid in known_problems:
            continue
        try:
            article = pm_fetch.article_by_pmid(pmid)
            doi = '/'.join(article.doi.split('/')[-2:])
            tmp_dict = {}
            tmp_dict['Authors'] = '; '.join(article.authors)
            tmp_dict['Title'] = article.title.replace('<em>', '').replace('</em>', '')
            tmp_dict['Year'] = int(article.year)
            tmp_dict['Journal'] = article.journal
            tmp_dict['Volume'] = article.volume
            tmp_dict['Issue'] = article.issue
            tmp_dict['URL'] = article.url
            tmp_dict['Abstract'] = article.abstract.replace('~', '')
            tmp_dict['DOI'] = doi
            tmp_dict['Citation'] = ''
        except Exception:
            print('unsuccessful for {0}'.format(pmid))
            continue
        if doi in all_ris_doi:
            continue
        # 'Others' may not exist as a subfolder, so create it on first use.
        publications_dict.setdefault('Others', {})[doi] = tmp_dict

    #Write markdown file for wiki
    #based on info in dict
    output_filename = os.path.join(
        'wiki',
        'Publications.md'
    )
    total_pubs = 0
    with open(output_filename, 'w', encoding="utf-8") as out_file:
        print('# Publications [ ](# )', file=out_file)
        print('', file=out_file)
        for subheading in sorted(publications_dict.keys()):
            print(' * [{0}](#{1})'.format(
                subheading.replace('_', ' '),
                subheading.replace(' ', '-').lower()
            ), file=out_file)
        print('', file=out_file)
        for subheading in sorted(publications_dict.keys()):
            print('## {0}'.format(subheading.replace('_', ' ')), file=out_file)
            print('', file=out_file)
            pub_list = []
            for pub in publications_dict[subheading].keys():
                try:
                    # 'Authors' was joined into a '; '-separated string above,
                    # so split it to recover the lead author rather than
                    # taking the first character of the string.
                    publications_dict[subheading][pub]['Lead Author'] = \
                        publications_dict[subheading][pub]['Authors'].split('; ')[0]
                    pub_list.append(publications_dict[subheading][pub])
                except Exception:
                    print(pub)
                    print(publications_dict[subheading][pub]['Authors'])
            for pub in sorted(
                pub_list,
                key=itemgetter('Year', 'Lead Author'),
                reverse=True,
            ):
                total_pubs += 1
                print(
'''*{Title}*<br/>
{Authors}<br/>
**{Year}**<br/>
{Journal} {Volume}({Issue})<br/>
{DOI}
<details>
<summary>Abstract and Links</summary>

[Link to Publication]({URL})<br/>
[Citation]({Citation})<br/>
{Abstract}<br/>
</details><br/>

---
'''.format(**pub), file=out_file)

            print(
'''[Go to top of page](# )<br/>
----''',
                file=out_file
            )
    print('Total Number of Publications written to Publications.md:')
    print(total_pubs)
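
For reference, a minimal RIS record of the shape the parser above expects (tag and value separated by '  - '; all values illustrative):

TY  - JOUR
A1  - Doe, Jane
T1  - An example title
PY  - 2019/01/15
JO  - Example Journal
VL  - 12
IS  - 3
UR  - https://example.org/article
N2  - An example abstract.
DO  - doi:10.1000/xyz123
ER  -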
Example #14
import numpy as np
#import json
#import objectpath
import webbrowser
import html2text
from ast import literal_eval
import urllib.request
from bs4 import BeautifulSoup
import re
import scholarly

from metapub import PubMedFetcher

#stage I
#to run the first section type the command as python cp.py > trial.txt
fetch = PubMedFetcher()
counter = 0
# get the first 2 pmids matching the "fragile sites" keyword search
pmids = fetch.pmids_for_query('fragile sites', retmax=2)
print(pmids)

retmax = 20
#in order to store data in a relevant format
print('[')
for pmid in pmids:
    counter += 1
    #print (pmid)
    article = fetch.article_by_pmid(pmid)
    title = article.title
    #print (title)

    search_query = scholarly.search_pubs_query(title)
    print(next(search_query))
    #print (res)
Example #15
from metapub import PubMedFetcher
from tqdm import tqdm


def crawl_abstract(keyword, outfile=None, max_iter=1000, has_chem_only=False):
    fetch = PubMedFetcher()

    pmids = fetch.pmids_for_query(keyword, retmax=max_iter)
    print("PMID scan Done!")

    if not outfile:
        outfile = "[Crawling Results]" + keyword + ".tsv"

    o_file = open(outfile, 'w', encoding="utf8")

    header = "PMID\tAuthors\tYear\tTitle\tAbstract\tURL\tCitation\tChemicals\n"
    o_file.write(header)

    print("Crawling Paper Info..")

    for pmid in tqdm(pmids):
        try:
            article = fetch.article_by_pmid(pmid)
        except Exception:
            # article_by_pmid raises on bad PMIDs rather than returning None.
            continue
        if not article:
            continue

        authors = article.authors_str
        if not authors:
            continue
        elif "\t" in authors or "\n" in authors:
            authors = remove_escape(authors)

        year = article.year
        if not year:
            continue
        elif "\t" in year or "\n" in year:
            year = remove_escape(year)

        title = article.title
        if not title:
            continue
        elif "\t" in title or "\n" in title:
            title = remove_escape(title)

        abstract = article.abstract
        if not abstract:
            continue
        elif "\t" in abstract or "\n" in abstract:
            abstract = remove_escape(abstract)

        url = article.url
        if not url:
            continue
        elif "\t" in url or "\n" in url:
            url = remove_escape(url)

        citation = article.citation
        if not citation:
            continue
        elif "\t" in citation or "\n" in citation:
            citation = remove_escape(citation)

        chemical = article.chemicals
        if not chemical:
            if has_chem_only:
                continue
            chemical = "None"
        else:
            chemical = str(chemical).replace("\'", "\"")
            if "\t" in chemical or "\n" in chemical:
                chemical = remove_escape(chemical)

        o_file.write(pmid + "\t")
        o_file.write(authors + "\t")
        o_file.write(year + "\t")
        o_file.write(title + "\t")
        o_file.write(abstract + "\t")
        o_file.write(url + "\t")
        o_file.write(citation + "\t")
        o_file.write(chemical + "\n")

    o_file.close()
    print("Process Done!")
    print("Result is saved in <" + outfile + ">.")
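
A plausible invocation (keyword and limit are arbitrary):

crawl_abstract('aspirin', max_iter=50, has_chem_only=True)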
Example #16
from __future__ import absolute_import, print_function, unicode_literals

from metapub import PubMedFetcher
fetch = PubMedFetcher()
params = {
    'jtitle': 'American Journal of Medical Genetics',
    'year': 1996,
    'volume': 61,
    'author1_lastfm': 'Hegmann'
}

stuff = fetch.pmids_for_query(**params)
print(params)
print(stuff)

# the following article was deleted from pubmed (or changed such that this
# set of parameters no longer returns an article)
params = {
    'TA': 'Journal of Neural Transmission',
    'pdat': 2014,
    'vol': 121,
    'aulast': 'Freitag'
}
stuff = fetch.pmids_for_query(**params)

print(params)
print(stuff)

#params = { 'mesh': 'breast neoplasm' }
#stuff = fetch.pmids_for_query(since='2014', until='2015/3/1', pmc_only=True, **params)