Example #1
import eutils

# `db` is assumed to be a pymongo database handle (with `gene_detail` and
# `summary` collections) defined outside this excerpt.
def main(taxon):
    query = db.gene_detail.find(
        {
            "entrezgene": {
                "$ne": ""
            },
            "Taxonomy_Id": taxon
        }, {
            "ensembl_gene_id": 1,
            "entrezgene": 1
        })
    ec = eutils.Client()
    for result in query:
        entrez_id = int(result["entrezgene"])
        try:
            gene = ec.efetch(db='gene', id=entrez_id)
        except (eutils.exceptions.EutilsNCBIError,
                eutils.exceptions.EutilsRequestError):
            print(str(result["entrezgene"]) + "\t" +
                  result["ensembl_gene_id"] + "\tnot found")
            continue
        detail = gene.entrezgenes[0]
        summary = detail.summary
        synonyms = detail.synonyms
        if not summary and not synonyms:
            continue
        record = {
            'ensembl_gene_id': result["ensembl_gene_id"],
            'summary': summary,
            'synonyms': synonyms
        }
        db.summary.insert_one(record)
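
A minimal sketch of how this excerpt might be wired up, assuming a local
MongoDB instance; the database name and taxon are made up:

    from pymongo import MongoClient

    db = MongoClient()["genedb"]  # hypothetical database name
    main(9606)                    # NCBI Taxonomy ID for human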
Example #2
import pandas as pd

import eutils

# `db` is assumed to be a pymongo database handle defined outside this excerpt.
def main(inputfile):
    genelist = pd.read_csv(inputfile)
    ec = eutils.Client()  # create the client once, not on every loop iteration
    for ensembl_id in genelist["gene_ID"]:
        esr = ec.esearch(db='gene', term=ensembl_id)
        if esr.count == 0:
            print(ensembl_id + " can't be matched to a gene id")
            continue
        if esr.count > 1:
            print(ensembl_id + " matches multiple gene ids; using the first")
        entrez_id = esr.ids[0]
        gene = ec.efetch(db='gene', id=entrez_id)
        detail = gene.entrezgenes[0]
        summary = detail.summary
        synonyms = detail.synonyms
        record = {
            'ensembl_gene_id': ensembl_id,
            'summary': summary,
            'synonyms': synonyms
        }
        db.summary.insert_one(record)
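
A usage sketch, assuming a CSV file with a `gene_ID` column of Ensembl gene
IDs (the file name is made up):

    main("ensembl_gene_ids.csv")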
Example #3
import argparse

import eutils


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ESummary', epilog='')
    parser.add_argument('db', help='Database to use')
    parser.add_argument('--id_list', help='list of ids')
    parser.add_argument('--id', help='Comma separated individual IDs')
    parser.add_argument('--history_file', help='Filter existing history')
    parser.add_argument('--user_email', help="User email")
    parser.add_argument('--admin_email', help="Admin email")
    args = parser.parse_args()

    c = eutils.Client(history_file=args.history_file, user_email=args.user_email, admin_email=args.admin_email)

    merged_ids = c.parse_ids(args.id_list, args.id, args.history_file)

    payload = {
        'db': args.db,
    }

    if args.history_file is not None:
        payload.update(c.get_history())
    else:
        payload['id'] = ','.join(merged_ids)

    print(c.summary(**payload))
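
Assuming the script is saved as esummary.py (a made-up name), it could be
invoked along these lines:

    python esummary.py pubmed --id 14316043 --user_email you@example.org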
Example #4
import argparse

import eutils


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='ECitMatch', epilog='')
    parser.add_argument('--file', type=argparse.FileType('r'), help='Tabular file containing citations to search')

    parser.add_argument('--key', nargs='*', help='Citation Key')
    parser.add_argument('--journal_title', nargs='*', help='Journal Title')
    parser.add_argument('--year', nargs='*', help='Year')
    parser.add_argument('--volume', nargs='*', help='Volume')
    parser.add_argument('--first_page', nargs='*', help='First Page')
    parser.add_argument('--author_name', nargs='*', help='Author name')

    # Emails
    parser.add_argument('--user_email', help="User email")
    parser.add_argument('--admin_email', help="Admin email")
    args = parser.parse_args()

    c = eutils.Client(user_email=args.user_email, admin_email=args.admin_email)

    citations = []
    if args.file is None:
        for key, journal, year, volume, first_page, author_name in \
                zip(args.key, args.journal_title, args.year, args.volume, args.first_page, args.author_name):
            citations.append({
                'key': key,
                'journal': journal,
                'year': year,
                'volume': volume,
                'first_page': first_page,
                'author_name': author_name,
            })
    else:
        for line in args.file:
            # The excerpt is truncated here; as a sketch, parse one
            # tab-separated citation per line (the column order is an
            # assumption) and collect it like the branch above.
            (key, journal, year, volume, first_page, author_name) = \
                line.strip().split('\t')
            citations.append({
                'key': key,
                'journal': journal,
                'year': year,
                'volume': volume,
                'first_page': first_page,
                'author_name': author_name,
            })
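
The excerpt stops before the collected citations are submitted. A minimal,
hypothetical sketch of posting them straight to NCBI's ECitMatch endpoint with
requests, using the documented bdata format
journal|year|volume|first_page|author|key|:

    import requests

    ECITMATCH_URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/ecitmatch.cgi'

    def citmatch(citations):
        # one pipe-delimited citation string per line, per the ECitMatch docs
        bdata = '\r'.join(
            '|'.join([c['journal'], c['year'], c['volume'],
                      c['first_page'], c['author_name'], c['key']]) + '|'
            for c in citations)
        r = requests.get(ECITMATCH_URL,
                         params={'db': 'pubmed', 'retmode': 'xml',
                                 'bdata': bdata})
        r.raise_for_status()
        return r.text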
Example #5
import re
import sys
import urllib.parse

import requests

import eutils
import gnomics.objects.reference

# eutils_param_process() is a helper defined elsewhere in the same module;
# it presumably URL-encodes a value and tags it with an Entrez search field.


def eutils_search(db="PubMed",
                  exact=False,
                  raw=False,
                  retmode=None,
                  retmax=None,
                  sort=None,
                  unlabeled_string=None,
                  affiliation=None,
                  article_identifier=None,
                  all_fields=None,
                  author=None,
                  author_identifier=None,
                  book=None,
                  corporate_author=None,
                  create_date=None,
                  completion_date=None,
                  conflict_of_interest=None,
                  ec_rn_number=None,
                  editor=None,
                  entrez_date=None,
                  filter_citations=None,
                  first_author_name=None,
                  full_author_name=None,
                  full_investigator_name=None,
                  grant_number=None,
                  investigator=None,
                  isbn=None,
                  issue=None,
                  journal=None,
                  language=None,
                  last_author=None,
                  location_id=None,
                  mesh_date=None,
                  mesh_major_topic=None,
                  mesh_subheadings=None,
                  mesh_terms=None,
                  modification_date=None,
                  nlm_unique_id=None,
                  other_term=None,
                  owner=None,
                  pagination=None,
                  personal_name_as_subject=None,
                  pharmacological_action=None,
                  place_of_publication=None,
                  pmid=None,
                  publisher=None,
                  publication_date=None,
                  publication_type=None,
                  secondary_source_id=None,
                  subset=None,
                  supplementary_concept=None,
                  text_words=None,
                  title=None,
                  title_abstract=None,
                  transliterated_title=None,
                  uid=None,
                  volume=None):

    ref_set = []
    result_set = []

    if not exact:

        base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"

        if db is not None:
            base_url = base_url + "db=" + str(db) + "&"
        if retmode is not None:
            base_url = base_url + "retmode=" + str(retmode) + "&"
        if retmax is not None:
            base_url = base_url + "retmax=" + str(retmax) + "&"
        else:
            base_url = base_url + "retmax=" + str(100000) + "&"
        if sort is not None:
            base_url = base_url + "sort=" + str(sort) + "&"

        term_url = "term="
        if unlabeled_string is not None:
            term_url = eutils_param_process(unlabeled_string)
        if affiliation is not None:
            term_url = term_url + eutils_param_process(affiliation, "ad")
        if article_identifier is not None:
            term_url = term_url + eutils_param_process(article_identifier,
                                                       "aid")
        if all_fields is not None:
            term_url = term_url + eutils_param_process(
                all_fields, "all", raw=raw)
        if author is not None:
            term_url = term_url + eutils_param_process(author, "au")
        if author_identifier is not None:
            term_url = term_url + eutils_param_process(author_identifier,
                                                       "auid")
        if book is not None:
            term_url = term_url + eutils_param_process(book, "book")
        if corporate_author is not None:
            term_url = term_url + eutils_param_process(corporate_author, "cn")
        if create_date is not None:
            term_url = term_url + eutils_param_process(create_date, "crdt")
        if completion_date is not None:
            term_url = term_url + eutils_param_process(completion_date, "dcom")
        if conflict_of_interest is not None:
            term_url = term_url + eutils_param_process(conflict_of_interest,
                                                       "cois")
        if ec_rn_number is not None:
            term_url = term_url + eutils_param_process(ec_rn_number, "rn")
        if editor is not None:
            term_url = term_url + eutils_param_process(editor, "ed")
        if entrez_date is not None:
            term_url = term_url + eutils_param_process(entrez_date, "edat")
        if filter_citations is not None:
            term_url = term_url + eutils_param_process(filter_citations,
                                                       "filter")
        if first_author_name is not None:
            term_url = term_url + eutils_param_process(first_author_name,
                                                       "iau")
        if full_author_name is not None:
            term_url = term_url + eutils_param_process(full_author_name, "fau")
        if full_investigator_name is not None:
            term_url = term_url + eutils_param_process(full_investigator_name,
                                                       "fir")
        if grant_number is not None:
            term_url = term_url + eutils_param_process(grant_number, "gr")
        if investigator is not None:
            term_url = term_url + eutils_param_process(investigator, "ir")
        if isbn is not None:
            term_url = term_url + eutils_param_process(isbn, "isbn")
        if issue is not None:
            term_url = term_url + eutils_param_process(issue, "ip")
        if journal is not None:
            term_url = term_url + eutils_param_process(journal, "ta")
        if language is not None:
            term_url = term_url + eutils_param_process(language, "la")
        if last_author is not None:
            term_url = term_url + eutils_param_process(last_author, "lastau")
        if location_id is not None:
            term_url = term_url + eutils_param_process(location_id, "lid")
        if mesh_date is not None:
            term_url = term_url + eutils_param_process(mesh_date, "mhda")
        if mesh_major_topic is not None:
            term_url = term_url + eutils_param_process(mesh_major_topic,
                                                       "majr")
        if mesh_subheadings is not None:
            term_url = term_url + eutils_param_process(mesh_subheadings, "sh")
        if mesh_terms is not None:
            term_url = term_url + eutils_param_process(mesh_terms, "mh")
        if modification_date is not None:
            term_url = term_url + eutils_param_process(modification_date, "lr")
        if nlm_unique_id is not None:
            term_url = term_url + eutils_param_process(nlm_unique_id, "jid")
        if other_term is not None:
            term_url = term_url + eutils_param_process(other_term, "ot")
        if owner is not None:
            print("The 'owner' field is not supported; ignoring it.")
        if pagination is not None:
            term_url = term_url + eutils_param_process(pagination, "pg")
        if personal_name_as_subject is not None:
            term_url = term_url + eutils_param_process(
                personal_name_as_subject, "ps")
        if pharmacological_action is not None:
            term_url = term_url + eutils_param_process(pharmacological_action,
                                                       "pa")
        if place_of_publication is not None:
            term_url = term_url + eutils_param_process(place_of_publication,
                                                       "pl")
        if pmid is not None:
            term_url = term_url + eutils_param_process(pmid, "pmid")
        if publisher is not None:
            term_url = term_url + eutils_param_process(publisher, "pubn")
        if publication_date is not None:
            term_url = term_url + eutils_param_process(publication_date, "dp")
        if publication_type is not None:
            term_url = term_url + eutils_param_process(publication_type, "pt")
        if secondary_source_id is not None:
            term_url = term_url + eutils_param_process(secondary_source_id,
                                                       "si")
        if subset is not None:
            term_url = term_url + eutils_param_process(subset, "sb")
        if supplementary_concept is not None:
            term_url = term_url + eutils_param_process(supplementary_concept,
                                                       "nm")
        if text_words is not None:
            term_url = term_url + eutils_param_process(text_words, "tw")
        if title is not None:
            term_url = term_url + eutils_param_process(title, "ti")
        if title_abstract is not None:
            term_url = term_url + eutils_param_process(title_abstract, "tiab")
        if transliterated_title is not None:
            term_url = term_url + eutils_param_process(transliterated_title,
                                                       "tt")
        if uid is not None:
            term_url = term_url + eutils_param_process(uid, "pmid")
        if volume is not None:
            term_url = term_url + eutils_param_process(volume, "vi")

        if term_url[-1] == "+":
            term_url = term_url[:-1]
        if term_url[-1] == "&":
            term_url = term_url[:-1]

        print(base_url + "term=" + term_url)

        r = requests.get(base_url + "term=" + term_url,
                         headers={"Content-Type": "application/json"})

        if not r.ok:
            r.raise_for_status()
            sys.exit()

        decoded = r.json()
        result_set.append(decoded)

        if retmax is None:
            if "count" in decoded["esearchresult"]:
                count_set = int(decoded['esearchresult']['count']) - 100000

                while count_set > 0:
                    # advance by one full page (retmax was set to 100000 above)
                    retstart = (int(decoded['esearchresult']['retstart'])
                                + 100000)

                    r = requests.get(
                        base_url + "term=" + term_url + "&" + "retstart=" +
                        str(retstart),
                        headers={"Content-Type": "application/json"})

                    if not r.ok:
                        r.raise_for_status()
                        sys.exit()

                    decoded = r.json()
                    result_set.append(decoded)
                    count_set = count_set - 100000
            else:
                print("An error occurred")
                return []

        ec = eutils.Client()
        ref_set = []
        id_set = []
        for dec_set in result_set:
            for pmid in dec_set["esearchresult"]["idlist"]:
                id_set.append(pmid)

        pmset = ec.efetch(db="pubmed", id=id_set)
        for pm in pmset:
            title = pm.title
            authors = pm.authors
            journal = pm.jrnl
            volume = pm.volume
            issue = pm.issue
            year = pm.year
            pages = pm.pages
            pmid = pm.pmid
            doi = pm.doi
            pmc = pm.pmc

            temp_ref = gnomics.objects.reference.Reference(
                identifier=pmid,
                identifier_type="PubMed ID",
                source="Entrez Programming Utilities",
                language=None,
                name=title)

            if doi is not None:
                gnomics.objects.reference.Reference.add_identifier(
                    temp_ref,
                    identifier=doi,
                    identifier_type="DOI",
                    source="Entrez Programming Utilities",
                    language=None,
                    name=title)

            if pmc is not None:
                gnomics.objects.reference.Reference.add_identifier(
                    temp_ref,
                    identifier=pmc,
                    identifier_type="PMC ID",
                    source="Entrez Programming Utilities",
                    language=None,
                    name=title)

            ref_set.append(temp_ref)

    else:
        base_url = "https://www.ncbi.nlm.nih.gov/pubmed/?"

        ext_url = ""
        if unlabeled_string:
            ext_url = ext_url + unlabeled_string
        elif all_fields:
            ext_url = ext_url + all_fields

        ext_url = "term=" + str(
            urllib.parse.quote_plus('("' + ext_url.replace('"', "'")))
        r = requests.get(base_url + ext_url,
                         headers={"Content-Type": "application/json"})

        if "PMID:" in r.text:
            pmid_html = re.findall(r'<dt>PMID:</dt>.{1,}?[\d].{1,}?</dd>',
                                   r.text)
            pmid = re.findall(r'\d+', pmid_html[0])

            temp_ref = gnomics.objects.reference.Reference(
                identifier=pmid[0],
                identifier_type="PubMed ID",
                source="PubMed",
                language=None)

            ref_set.append(temp_ref)

    return ref_set
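
A minimal usage sketch; the query values are made up:

    refs = eutils_search(db="pubmed",
                         author="Smith J",
                         publication_date="2020",
                         retmax=20)
    for ref in refs:
        print(ref)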
Example #6
import csv
import time

import lxml.etree
import pandas as pd
import requests

import eutils


def eutils_from_df(input_df, chunksize, output_csv):
    """
    Retrieves PubMed article content via the E-Utilities API for the set of
    PMIDs contained in a pandas DataFrame, and saves it to a CSV file.

    Args:
        input_df: DataFrame containing the PMIDs of interest in a 'PMID' column
        chunksize: number of PMIDs to pass to the API per request
        output_csv: filename of the CSV file to which article content is saved

    Returns:
        A DataFrame of all article data successfully retrieved from the
        database. As a side effect, each article is appended as a row of
        output_csv, with columns corresponding to the fields retrieved via the
        efetch client:
            'PMID', 'Year', 'Title', 'Abstract', 'Authors', 'Journal',
            'Volume', 'Issue', 'Pages', 'DOI', 'PMC'
    """

    # Specifies names for output csv column headers
    fieldnames = [
        'PMID',
        'Year',
        'Title',
        'Abstract',
        'Authors',
        'Journal',
        'Volume',
        'Issue',
        'Pages',
        'DOI',
        'PMC',
    ]

    # Creates a generator yielding successive chunks of rows from the input dataframe
    pm_chunks_gen = (input_df[i:i + chunksize]
                     for i in range(0, len(input_df), chunksize))

    # Initialises empty list for compilation of article dictionaries into single container
    pm_article_list = []

    # Initialise eutils client to access NCBI E-Utilities API
    ec = eutils.Client()

    # Open the CSV file; each PubMed ID's downloaded data is appended as a new
    # row with the specified column names
    with open(output_csv, 'a') as datafile:
        writer = csv.DictWriter(
            datafile,
            fieldnames=fieldnames,
        )
        writer.writeheader()

        # Converts each chunk of PubMed IDs from the dataframe to a list
        for chunk_count, chunk in enumerate(pm_chunks_gen):
            try:
                index_list = list(chunk.index.values)
                chunk_list = list(chunk['PMID'])
                print('Chunk No. ' + str(chunk_count))

                # Passes chunk of PubMed IDs to E-Utilities API
                # Returns iterator object containing key data for each PubMed ID
                pm_article_set = iter(ec.efetch(
                    db='pubmed',
                    id=chunk_list,
                ))

                # Assigns each PubMed ID an index value
                # Iterates over pm_article_set to access data for each individual PubMed ID
                for id_index, id_value in enumerate(chunk_list):
                    print(index_list[id_index], id_value)
                    try:
                        # For each PMID index/value pair, iterates through article set
                        # Aggregates key article attributes for each PubMed ID into dictionary
                        pm_article = next(pm_article_set)
                        pm_article_content = dict(
                            PMID=str(pm_article.pmid),
                            Year=str(pm_article.year),
                            Title=str(pm_article.title),
                            Abstract=str(pm_article.abstract),
                            Authors=str(pm_article.authors),
                            Journal=str(pm_article.jrnl),
                            Volume=str(pm_article.volume),
                            Issue=str(pm_article.issue),
                            Pages=str(pm_article.pages),
                            DOI=str(pm_article.doi),
                            PMC=str(pm_article.pmc),
                        )

                        print(pm_article_content)
                        print(pm_article.pmid +
                              ' - Download from Entrez complete')

                        # Saves dictionary as new item in list for later construction of dataframe
                        pm_article_list.append(pm_article_content)
                        print(pm_article.pmid + ' - Save to list complete')

                        # Writes dictionary to new row of csv file for future reference
                        writer.writerow(pm_article_content)
                        print(pm_article.pmid +
                              ' - Write Data to CSV Complete')

                    # Except statements for content errors
                    except (
                            StopIteration,
                            TypeError,
                            NameError,
                            ValueError,
                            lxml.etree.XMLSyntaxError,
                            eutils.exceptions.EutilsNCBIError,
                    ) as e1:
                        print('Error: ' + str(e1))
                        continue
                    # Except statements for network/connection errors
                    except (
                            TimeoutError,
                            RuntimeError,
                            ConnectionError,
                            ConnectionResetError,
                            eutils.exceptions.EutilsRequestError,
                            requests.exceptions.ConnectionError,
                    ) as e2:
                        print('Error: ' + str(e2))
                        time.sleep(10)
                        continue

            except StopIteration:
                print('All downloads complete')
                break

    # Save list of dictionaries to dataframe & write to CSV file
    pm_article_df = pd.DataFrame.from_records(
        pm_article_list,
        columns=fieldnames,
    )
    print('Save to DataFrame complete')
    return pm_article_df
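
A usage sketch with made-up PMIDs:

    pmid_df = pd.DataFrame({'PMID': [31452104, 31437182, 30971806]})
    articles = eutils_from_df(pmid_df, chunksize=2,
                              output_csv='pubmed_articles.csv')
    print(articles.head())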
Example #7
import eutils


def client():
    return eutils.Client()
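
A usage sketch for this factory, with a made-up search term:

    ec = client()
    esr = ec.esearch(db='gene', term='BRCA1')
    print(esr.count)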
Example #8
    # Excerpt from the body of a PubmedLoader class (its base class is defined
    # elsewhere): create one shared eutils client per loader instance.
    def __init__(self, *args, **kwargs):
        super(PubmedLoader, self).__init__(*args, **kwargs)
        self.client = eutils.Client()