import csv
import datetime
import gzip
import os
import re
import zipfile
import xml.etree.ElementTree as ET
from urllib.request import urlopen

import requests
import wget
from bs4 import BeautifulSoup
from requests.auth import HTTPBasicAuth

# Version is the project's local version-tracking helper; the import path below
# is assumed, since that module is not shown in this file.
from version import Version


class Entrez:
    def __init__(self, download_path):
        self.online_version = None
        self.get_online_version()
        self.version = Version('Entrez',
                               version=self.online_version,
                               download_path=download_path)
        self.logged_version = self.version.last_logged_version()
        self.download_path = download_path
        # self.http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())

    def is_current(self):
        """Returns True if local versions of Entrez files are up-to-date."""
        return self.version.is_current()

    def get_online_version(self):
        # This assumes that if gene2accession needs updating, so will other Entrez files.
        r = requests.get('http://ftp.ncbi.nlm.nih.gov/gene/DATA/',
                         stream=True,
                         allow_redirects=True)
        bsObj = BeautifulSoup(r.text, "html.parser")
        for link in bsObj.find_all('a'):
            if link.get('href') == 'gene2accession.gz':
                self.online_version = datetime.datetime.strptime(
                    link.next.next.split()[0],
                    '%d-%b-%Y').strftime('%d-%B-%Y')
                break

    def download_file(self, url, local_filename):
        # NOTE the stream=True parameter
        r = requests.get(url, stream=True, allow_redirects=True)
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)

    def extract(self, file):
        with gzip.open(file, 'rb') as rf:
            with open(
                    os.path.join(self.download_path,
                                 file.rsplit('.', 1)[0] + '.human'),
                    'w') as wf:
                for i, line in enumerate(rf):
                    line_ascii = line.decode('utf-8')
                    if i == 0:
                        wf.write(line_ascii)
                    else:
                        species = line_ascii.split()[0]
                        if species == '9606':  # Grab human only
                            wf.write(line_ascii)

    def download_files(self):
        """Download and extract the gene2accession and gene_info files"""
        print('Downloading Entrez Accessions...')
        print('gene2accession:')
        g2a_filename = os.path.join(self.download_path, 'gene2accession.gz')
        self.download_file(
            "http://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz",
            g2a_filename)
        print('\ngene_info:')
        gi_filename = os.path.join(self.download_path, 'gene_info.gz')
        self.download_file(
            "http://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz", gi_filename)
        print('\ninteractions:')
        ia_filename = os.path.join(self.download_path, 'interactions.gz')
        self.download_file(
            "http://ftp.ncbi.nlm.nih.gov/gene/GeneRIF/interactions.gz",
            ia_filename)
        print('\nExtracting Entrez Accessions...')
        print('gene_info...')
        self.extract(gi_filename)
        print('gene2accession...')
        self.extract(g2a_filename)
        print('interactions...')
        self.extract(ia_filename)
        os.remove(g2a_filename)
        os.remove(gi_filename)
        os.remove(ia_filename)

    def parse(self):
        self.rows = []
        with open(os.path.join(self.download_path, 'gene_info.human')) as f:
            fieldnames = [
                'tax_id', 'entrez_id', 'entrez_gene_symbol', 'locus_tag',
                'entrez_gene_synonyms', 'dbXrefs', 'chromosome', 'map_loc',
                'description', 'type', 'sym_from_auth', 'full_from_auth',
                'nom_status', 'other_designations', 'mod_date'
            ]
            reader = csv.DictReader(f, delimiter='\t', fieldnames=fieldnames)
            for i, row in enumerate(reader):
                if i == 0:
                    continue
                ensembl = set()
                for key, value in row.items():
                    if value == '-':
                        row[key] = 'N/A'
                dbXrefs = row['dbXrefs'].split('|')
                for xRef in dbXrefs:
                    if xRef == 'N/A':
                        continue
                    source, label = xRef.split(':', 1)
                    if source == 'Ensembl':
                        ensembl.add(label)
                row['ensembl_ids'] = '|'.join(ensembl) or 'N/A'
                self.rows.append(row)

    def write(self):
        filename = os.path.join(self.download_path, 'entrez_genes.tsv')
        with open(filename, 'w') as f:
            fieldnames = [
                'entrez_id', 'entrez_gene_symbol', 'entrez_gene_synonyms',
                'ensembl_ids', 'description'
            ]
            writer = csv.DictWriter(f,
                                    delimiter='\t',
                                    fieldnames=fieldnames,
                                    extrasaction='ignore')
            writer.writeheader()
            writer.writerows(self.rows)
        self.version.write_log()

    def update(self):
        if not self.is_current():
            self.download_files()
            self.parse()
            self.write()
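# Usage sketch (an assumption, not part of the original module): the updater is
# driven entirely through update(), which compares the logged version against the
# NCBI directory listing before downloading. 'data' is an illustrative path only.
#
#     entrez = Entrez(download_path='data')
#     entrez.update()   # downloads, filters to tax_id 9606, writes entrez_genes.tsv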
class DrugBank(object):
    def __init__(self, username, password, download_path, tsv_file):
        self.online_version = None
        self.get_online_version()
        self.version = Version('DrugBank',
                               version=self.online_version,
                               download_path=download_path)
        self.logged_version = self.version.last_logged_version()
        self.interactions = self.drug_info = None
        self.username = username
        self.password = password
        self.download_path = download_path
        self.tsv_file = tsv_file

    def is_current(self):
        """Returns True if local versions of DrugBank files are up-to-date."""
        return self.version.is_current()

    def get_online_version(self):
        print('Checking DrugBank Version...')
        html = requests.get('http://www.drugbank.ca/downloads')
        bsObj = BeautifulSoup(html.text, "html.parser")
        r = re.compile(r'Version ([\d\.]+)')
        match = r.search(bsObj.h1.text)
        if match:
            self.online_version = match.group(1)
        else:
            raise ValueError('Error loading online version.')

    def download_file(self, url, local_filename):
        # NOTE the stream=True parameter
        r = requests.get(url,
                         stream=True,
                         allow_redirects=True,
                         auth=HTTPBasicAuth(self.username, self.password))
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)

    def download_files(self):
        print('Downloading DrugBank XML...')
        filename = os.path.join(self.download_path, 'drugbank.zip')
        self.download_file(
            'https://www.drugbank.ca/releases/5-1-7/downloads/all-full-database',
            filename)
        print('\nExtracting DrugBank XML...')
        zfile = zipfile.ZipFile(filename)
        zfile.extract('full database.xml', self.download_path)
        os.remove(filename)
        e = Entrez(self.download_path)
        e.update()

    def parse(self):
        print('Parsing Entrez...')
        symbol_to_info = dict()
        hgnc_id_to_info = dict()
        entrez_to_info = dict()
        sources = set()
        with open(os.path.join(self.download_path, 'gene_info.human')) as f:
            c = csv.reader(f, delimiter='\t')
            for i, line in enumerate(c):
                if i == 0:
                    continue
                if line[0] != '9606':
                    continue
                gene_symbol = line[2]
                entrez_id = line[1]
                symbol_to_info[gene_symbol] = {
                    'Entrez': entrez_id,
                    'Symbol': gene_symbol
                }
                if line[5] == '-':
                    continue
                synonyms = line[5].split('|')
                for synonym in synonyms:
                    (source, accession) = synonym.split(':', 1)
                    symbol_to_info[gene_symbol][source] = accession
                    sources.add(source)
                if 'HGNC' in symbol_to_info[gene_symbol]:
                    hgnc_id = symbol_to_info[gene_symbol]['HGNC']
                    hgnc_id_to_info[hgnc_id] = symbol_to_info[gene_symbol]
                entrez_to_info[entrez_id] = symbol_to_info[gene_symbol]
        uniprot_to_entrez = dict()
        r = re.compile(
            r'[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}'
        )  # regex from: http://www.uniprot.org/help/accession_numbers
        with open(os.path.join(self.download_path,
                               'gene2accession.human')) as f:
            c = csv.reader(f, delimiter='\t')
            for i, line in enumerate(c):
                if i == 0:
                    continue
                if line[0] != '9606':
                    continue
                uniprot_id = line[5].split('.', 1)[0]
                if not r.match(uniprot_id):
                    continue
                entrez_id = line[1]
                uniprot_to_entrez[uniprot_id] = entrez_id
        print('Parsing DrugBank XML...')
        ns = {'entry': 'http://www.drugbank.ca'}
        tree = ET.parse(os.path.join(self.download_path, 'full database.xml'))
        drugbank = tree.getroot()
        drugs = drugbank.findall('entry:drug', ns)
        interactions = dict()
        drug_info = dict()
        uniprot_fail = uniprot_success = 0
        hgnc_fail = hgnc_success = 0
        no_info = info = no_ensembl = 0
        total = 0
        for drug in drugs:
            drug_id = drug.find('entry:drugbank-id', ns).text
            drug_name = drug.find('entry:name', ns).text
            synonyms = drug.find('entry:synonyms', ns)
            drug_synonyms = set()
            for synonym in synonyms:
                language = synonym.get('language')
                if language == '' or language == 'English':
                    drug_synonyms.add(synonym.text)
            external_identifiers = drug.find('entry:external-identifiers', ns)
            chembl_id = ''
            for external_identifier in external_identifiers:
                resource = external_identifier.find('entry:resource', ns).text
                if resource == 'ChEMBL':
                    chembl_id = external_identifier.find(
                        'entry:identifier', ns).text
            drug_cas_number = drug.find('entry:cas-number', ns).text
            drug_brands = set()
            for product in drug.find('entry:products', ns):
                drug_brands.add(product.find('entry:name', ns).text)
            for int_brand in drug.find('entry:international-brands', ns):
                drug_brands.add(int_brand.find('entry:name', ns).text)
            drug_type = drug.get('type')
            drug_groups = set()
            for group in drug.find('entry:groups', ns):
                drug_groups.add(group.text)
            drug_categories = set()
            for category in drug.find('entry:categories', ns):
                drug_categories.add(
                    category.find('entry:category', ns).text.lower())
            targets = drug.find('entry:targets', ns)
            if len(targets) == 0:
                continue
            drug_info[drug_id] = (drug_name, tuple(sorted(drug_synonyms)),
                                  drug_cas_number, tuple(sorted(drug_brands)),
                                  drug_type, tuple(sorted(drug_groups)),
                                  tuple(sorted(drug_categories)), chembl_id)
            for target in targets:
                organism = target.find('entry:organism', ns).text
                if organism != 'Humans':
                    continue
                gene_id = target.find('entry:id', ns).text
                known_action = target.find('entry:known-action', ns).text
                target_actions = set()
                for action in target.find('entry:actions', ns):
                    target_actions.add(action.text)
                gene_symbol = hgnc_gene_acc = uniprot_id = entrez_id = ensembl_id = None
                pmids = set()
                references = target.find('entry:references', ns)
                articles = references.find('entry:articles',
                                           ns).findall('entry:article', ns)
                for article in articles:
                    pmids.add(article.find('entry:pubmed-id', ns).text)
                pmids = tuple(pmids)
                polypeptide = target.find('entry:polypeptide', ns)
                synonyms = None
                if polypeptide is not None:
                    gene_symbol = polypeptide.find('entry:gene-name', ns).text
                    for identifier in polypeptide.find(
                            'entry:external-identifiers', ns):
                        if identifier.find(
                                'entry:resource', ns
                        ).text == 'HUGO Gene Nomenclature Committee (HGNC)':
                            hgnc_gene_acc = identifier.find(
                                'entry:identifier', ns).text
                            # Some identifiers are incorrectly labeled by DrugBank
                            r = re.compile(r'^\d+$')
                            if hgnc_gene_acc.startswith('GNC:'):
                                hgnc_gene_acc = 'H' + hgnc_gene_acc
                            elif r.match(hgnc_gene_acc):
                                hgnc_gene_acc = 'HGNC:' + hgnc_gene_acc
                            try:
                                synonyms = hgnc_id_to_info[hgnc_gene_acc]
                            except KeyError:
                                hgnc_fail += 1
                            else:
                                entrez_id = synonyms['Entrez']
                                try:
                                    ensembl_id = synonyms['Ensembl']
                                except KeyError:
                                    no_ensembl += 1
                                hgnc_success += 1
                        elif identifier.find('entry:resource',
                                             ns).text == 'UniProtKB':
                            uniprot_id = identifier.find(
                                'entry:identifier', ns).text
                    if not synonyms:
                        try:
                            entrez_id = uniprot_to_entrez[uniprot_id]
                            synonyms = entrez_to_info[entrez_id]
                        except KeyError:
                            uniprot_fail += 1
                        else:
                            uniprot_success += 1
                if not synonyms:
                    try:
                        synonyms = symbol_to_info[gene_symbol]
                    except KeyError:
                        no_info += 1
                    else:
                        entrez_id = synonyms['Entrez']
                        ensembl_id = synonyms['Ensembl']
                        info += 1
                interaction_tuple = (gene_id, known_action,
                                     tuple(sorted(target_actions)),
                                     gene_symbol, uniprot_id, entrez_id,
                                     ensembl_id, pmids)
                total += 1
                try:
                    interactions[drug_id].append(interaction_tuple)
                except KeyError:
                    interactions[drug_id] = [interaction_tuple]
        self.interactions = interactions
        self.drug_info = drug_info

    def write(self):
        print('Writing to .tsv...')
        i = 0
        no_ensembl = no_entrez = total = 0
        header = ('count', 'drug_id', 'drug_name', 'drug_synonyms',
                  'drug_cas_number', 'drug_brands', 'drug_type', 'drug_groups',
                  'drug_categories', 'chembl_id', 'gene_id', 'known_action',
                  'target_actions', 'gene_symbol', 'uniprot_id', 'entrez_id',
                  'ensembl_id', 'pmid')
        with open(self.tsv_file, 'w') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerow(header)
            for drug in sorted(self.interactions):
                for interaction in self.interactions[drug]:
                    i += 1
                    data = (i, drug) + self.drug_info[drug] + interaction
                    out = list()
                    for datum in data:
                        if isinstance(datum, tuple):
                            datum = ';'.join(str(x) for x in datum)
                        datum = str(datum).replace("\t", '')
                        if not datum or datum == 'None':
                            datum = 'N/A'
                        out.append(datum)
                    # Some small number of rows contain tabs within text.
                    if out[14] == 'N/A':
                        no_entrez += 1
                    if out[15] == 'N/A':
                        no_ensembl += 1
                    writer.writerow(out)
        with open('tmp/version', 'w') as version_file:
            version_file.write(self.online_version)

    def update(self):
        if not self.is_current():
            self.download_files()
            self.parse()
            self.write()
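# Usage sketch (an assumption; credentials and paths below are illustrative and
# not defined elsewhere in this module). DrugBank downloads require an account,
# hence the HTTP basic auth in download_file().
#
#     db = DrugBank(os.environ['DRUGBANK_USER'], os.environ['DRUGBANK_PASS'],
#                   download_path='data',
#                   tsv_file='data/DrugBankInteractions.tsv')
#     db.update()   # fetches the 5.1.7 release, joins targets to Entrez, writes the TSV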
class GO:
    def __init__(self):
        # These were assembled from the original DGIdb identifiers
        self.dgidb_go_terms = [
            {"short_name": "Kinase", "full_go_name": "KinaseActivity", "human_readable": "Kinase", "go_id": "GO0016301"},
            {"short_name": "TyrosineKinase", "full_go_name": "ProteinTyrosineKinaseActivity", "human_readable": "Tyrosine Kinase", "go_id": "GO0004713"},
            {"short_name": "SerineThreonineKinase", "full_go_name": "ProteinSerineThreonineKinaseActivity", "human_readable": "Serine Threonine Kinase", "go_id": "GO0004674"},
            {"short_name": "ProteinPhosphatase", "full_go_name": "PhospoproteinPhosphataseActivity", "human_readable": "Protein Phosphatase", "go_id": "GO0004721"},
            {"short_name": "GProteinCoupledReceptor", "full_go_name": "GpcrActivity", "human_readable": "G Protein Coupled Receptor", "go_id": "GO0004930"},
            {"short_name": "NeutralZincMetallopeptidases", "full_go_name": "MetallopeptidaseActivity", "human_readable": "Neutral Zinc Metallopeptidase", "go_id": "GO0008237"},
            {"short_name": "ABCTransporter", "full_go_name": "ABCTransporterActivity", "human_readable": "ABC Transporter", "go_id": "GO0042626"},
            {"short_name": "RNADirectedDNAPolymerase", "full_go_name": "RNADirectedDnaPolymeraseActivity", "human_readable": "RNA Directed DNA Polymerase", "go_id": "GO0003964"},
            {"short_name": "Transporter", "full_go_name": "TransporterActivity", "human_readable": "Transporter", "go_id": "GO0005215"},
            {"short_name": "IonChannel", "full_go_name": "IonChannelActivity", "human_readable": "Ion Channel", "go_id": "GO0005216"},
            {"short_name": "NuclearHormoneReceptor", "full_go_name": "LigandDependentNuclearReceptorActivity", "human_readable": "Nuclear Hormone Receptor", "go_id": "GO0004879"},
            {"short_name": "LipidKinase", "full_go_name": "LipidKinaseActivity", "human_readable": "Lipid Kinase", "go_id": "GO0001727"},
            {"short_name": "Phospholipase", "full_go_name": "PhospholipaseActivity", "human_readable": "Phospholipase", "go_id": "GO0004620"},
            {"short_name": "ProteaseInhibitorActivity", "full_go_name": "PeptidaseInhibitorActivity", "human_readable": "Protease Inhibitor", "go_id": "GO0030414"},
            {"short_name": "DNARepair", "full_go_name": "DnaRepair", "human_readable": "DNA Repair", "go_id": "GO0006281"},
            {"short_name": "CellSurface", "full_go_name": "CellSurface", "human_readable": "Cell Surface", "go_id": "GO0009986"},
            {"short_name": "ExternalSideOfPlasmaMembrane", "full_go_name": "ExternalSideOfPlasmaMembrane", "human_readable": "External Side Of Plasma Membrane", "go_id": "GO0009897"},
            {"short_name": "GrowthFactor", "full_go_name": "GrowthFactorActivity", "human_readable": "Growth Factor", "go_id": "GO0008083"},
            {"short_name": "HormoneActivity", "full_go_name": "HormoneActivity", "human_readable": "Hormone Activity", "go_id": "GO0005179"},
            {"short_name": "TumorSuppressor", "full_go_name": "RegulationOfCellCycle", "human_readable": "Tumor Suppressor", "go_id": "GO0051726"},
            {"short_name": "TranscriptionFactorBinding", "full_go_name": "TranscriptionFactorBinding", "human_readable": "Transcription Factor Binding", "go_id": "GO0008134"},
            {"short_name": "TranscriptionFactorComplex", "full_go_name": "TranscriptionFactorComplex", "human_readable": "Transcription Factor Complex", "go_id": "GO0005667"},
            {"short_name": "HistoneModification", "full_go_name": "HistoneModification", "human_readable": "Histone Modification", "go_id": "GO0016570"},
            {"short_name": "DrugMetabolism", "full_go_name": "DrugMetabolism", "human_readable": "Drug Metabolism", "go_id": "GO0017144"},
            {"short_name": "DrugResistance", "full_go_name": "ResponseToDrug", "human_readable": "Drug Resistance", "go_id": "GO0042493"},
            {"short_name": "ProteaseActivity", "full_go_name": "PeptidaseActivity", "human_readable": "Protease", "go_id": "GO0008233"},
        ]
        # TODO: Use http://www.ebi.ac.uk/QuickGO/GHistory#info=2 for versioning.
        self.version = Version('GO', append_date=True)
        self.logged_version = self.version.last_logged_version()
        self.rows = []

    def is_current(self):
        """Returns True if local versions of GO files are up-to-date."""
        return self.version.is_current()

    def download_files(self):
        """Download the QuickGO annotation TSV for each DGIdb GO term."""
        go_ids = [
            ':'.join((x['go_id'][:2], x['go_id'][2:]))
            for x in self.dgidb_go_terms
        ]
        url = 'http://www.ebi.ac.uk/QuickGO/GAnnotation?format=tsv&limit=-1&gz=false&tax=9606&goid='
        os.makedirs('data/GO', exist_ok=True)
        for go_id in go_ids:
            file = 'data/GO/' + go_id.replace(':', '') + '.tsv'
            try:
                os.remove(file)
            except FileNotFoundError:
                pass
            wget.download(url + go_id, out=file)

    def parse(self):
        self.rows = []
        go_ids = [
            ':'.join((x['go_id'][:2], x['go_id'][2:]))
            for x in self.dgidb_go_terms
        ]
        category_lookup = {
            x['go_id']: x['human_readable'].upper()
            for x in self.dgidb_go_terms
        }
        for go_id in go_ids:
            temp = {}
            category = category_lookup[go_id.replace(':', '')]
            file = 'data/GO/' + go_id.replace(':', '') + '.tsv'
            with open(file, 'r') as f:
                reader = csv.DictReader(f, delimiter='\t')
                for row in reader:
                    if row['Symbol'] == '-' or ' ' in row['Symbol']:
                        continue
                    try:
                        temp[row['Symbol']].add(row['ID'])
                    except KeyError:
                        temp[row['Symbol']] = set((row['ID'],))
            for symbol in temp:
                row = {
                    'Symbol': symbol,
                    'Category': category,
                    'IDs': '|'.join(temp[symbol])
                }
                self.rows.append(row)

    def write(self):
        fieldnames = ['Symbol', 'IDs', 'Category']
        with open('data/go.human.tsv', 'w') as f:
            writer = csv.DictWriter(f,
                                    delimiter='\t',
                                    fieldnames=fieldnames,
                                    extrasaction='ignore')
            writer.writeheader()
            for row in self.rows:
                writer.writerow(row)
        self.version.write_log()

    def update(self):
        if not self.is_current():
            # self.download_files()
            self.parse()
            self.write()
class Entrez:
    def __init__(self):
        self.online_version = None
        self.get_online_version()
        self.version = Version('Entrez', version=self.online_version)
        self.logged_version = self.version.last_logged_version()

    def is_current(self):
        """Returns True if local versions of Entrez files are up-to-date."""
        return self.version.is_current()

    def get_online_version(self):
        # This assumes that if gene2accession needs updating, so will other Entrez files.
        html = urlopen('http://ftp.ncbi.nlm.nih.gov/gene/DATA/')
        bsObj = BeautifulSoup(html.read(), "html.parser")
        a = bsObj.hr.find('a', {"href": "gene2accession.gz"})
        self.online_version = datetime.datetime.strptime(
            a.next.next.split()[0], '%d-%b-%Y').strftime('%d-%B-%Y')

    @staticmethod
    def extract(file):
        with gzip.open(file, 'rb') as rf:
            with open('data/' + file.rsplit('.', 1)[0] + '.human', 'w') as wf:
                for i, line in enumerate(rf):
                    line_ascii = line.decode('utf-8')
                    if i == 0:
                        wf.write(line_ascii)
                    else:
                        species = line_ascii.split()[0]
                        if species == '9606':  # Grab human only
                            wf.write(line_ascii)

    @staticmethod
    def download_files():
        """Download and extract the gene2accession and gene_info files"""
        print('Downloading Entrez Accessions...')
        print('gene2accession:')
        g2a_filename = wget.download(
            "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2accession.gz")
        print('\ngene_info:')
        gi_filename = wget.download(
            "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz")
        print('\ninteractions:')
        ia_filename = wget.download(
            "ftp://ftp.ncbi.nlm.nih.gov/gene/GeneRIF/interactions.gz")
        print('\nExtracting Entrez Accessions...')
        print('gene_info...')
        Entrez.extract("gene_info.gz")
        print('gene2accession...')
        Entrez.extract("gene2accession.gz")
        print('interactions...')
        Entrez.extract("interactions.gz")
        os.remove(g2a_filename)
        os.remove(gi_filename)
        os.remove(ia_filename)

    def parse(self):
        self.rows = []
        with open('data/gene_info.human') as f:
            fieldnames = [
                'tax_id', 'entrez_id', 'entrez_gene_symbol', 'locus_tag',
                'entrez_gene_synonyms', 'dbXrefs', 'chromosome', 'map_loc',
                'description', 'type', 'sym_from_auth', 'full_from_auth',
                'nom_status', 'other_designations', 'mod_date'
            ]
            reader = csv.DictReader(f, delimiter='\t', fieldnames=fieldnames)
            for i, row in enumerate(reader):
                if i == 0:
                    continue
                ensembl = set()
                for key, value in row.items():
                    if value == '-':
                        row[key] = 'N/A'
                dbXrefs = row['dbXrefs'].split('|')
                for xRef in dbXrefs:
                    if xRef == 'N/A':
                        continue
                    source, label = xRef.split(':', 1)
                    if source == 'Ensembl':
                        ensembl.add(label)
                row['ensembl_ids'] = '|'.join(ensembl) or 'N/A'
                self.rows.append(row)

    def write(self):
        with open('data/entrez_genes.tsv', 'w') as f:
            fieldnames = [
                'entrez_id', 'entrez_gene_symbol', 'entrez_gene_synonyms',
                'ensembl_ids', 'description'
            ]
            writer = csv.DictWriter(f,
                                    delimiter='\t',
                                    fieldnames=fieldnames,
                                    extrasaction='ignore')
            writer.writeheader()
            writer.writerows(self.rows)
        self.version.write_log()

    def update(self):
        if not self.is_current():
            self.download_files()
            self.parse()
            self.write()
class DrugBank():
    def __init__(self):
        self.online_version = None
        self.get_online_version()
        self.version = Version('DrugBank', version=self.online_version)
        self.logged_version = self.version.last_logged_version()
        self.interactions = self.drug_info = None

    def is_current(self):
        """Returns True if local versions of DrugBank files are up-to-date."""
        return self.version.is_current()

    def get_online_version(self):
        print('Checking DrugBank Version...')
        html = urlopen('http://www.drugbank.ca/downloads')
        bsObj = BeautifulSoup(html.read(), "html.parser")
        r = re.compile(r'Version ([\d\.]+)')
        match = r.search(bsObj.h1.text)
        if match:
            self.online_version = match.group(1)
        else:
            raise ValueError('Error loading online version.')

    @staticmethod
    def download_files():
        print('Downloading DrugBank XML...')
        filename = wget.download(
            "http://www.drugbank.ca/system/downloads/current/drugbank.xml.zip")
        print('\nExtracting DrugBank XML...')
        zfile = zipfile.ZipFile(filename)
        zfile.extract('drugbank.xml', 'data')
        os.remove(filename)
        e = Entrez()
        e.update()

    def parse(self):
        print('Parsing Entrez...')
        symbol_to_info = dict()
        hgnc_id_to_info = dict()
        entrez_to_info = dict()
        sources = set()
        with open('data/gene_info.human') as f:
            c = csv.reader(f, delimiter='\t')
            for i, line in enumerate(c):
                if i == 0:
                    continue
                if line[0] != '9606':
                    continue
                gene_symbol = line[2]
                entrez_id = line[1]
                symbol_to_info[gene_symbol] = {
                    'Entrez': entrez_id,
                    'Symbol': gene_symbol
                }
                if line[5] == '-':
                    continue
                synonyms = line[5].split('|')
                for synonym in synonyms:
                    (source, accession) = synonym.split(':', 1)
                    symbol_to_info[gene_symbol][source] = accession
                    sources.add(source)
                if 'HGNC' in symbol_to_info[gene_symbol]:
                    hgnc_id = symbol_to_info[gene_symbol]['HGNC']
                    hgnc_id_to_info[hgnc_id] = symbol_to_info[gene_symbol]
                entrez_to_info[entrez_id] = symbol_to_info[gene_symbol]
        uniprot_to_entrez = dict()
        r = re.compile(
            r'[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}'
        )  # regex from: http://www.uniprot.org/help/accession_numbers
        with open('data/gene2accession.human') as f:
            c = csv.reader(f, delimiter='\t')
            for i, line in enumerate(c):
                if i == 0:
                    continue
                if line[0] != '9606':
                    continue
                uniprot_id = line[5].split('.', 1)[0]
                if not r.match(uniprot_id):
                    continue
                entrez_id = line[1]
                uniprot_to_entrez[uniprot_id] = entrez_id
        print('Parsing DrugBank XML...')
        ns = {'entry': 'http://www.drugbank.ca'}
        tree = ET.parse('data/drugbank.xml')
        drugbank = tree.getroot()
        drugs = drugbank.findall('entry:drug', ns)
        interactions = dict()
        drug_info = dict()
        uniprot_fail = uniprot_success = 0
        hgnc_fail = hgnc_success = 0
        no_info = info = no_ensembl = 0
        total = 0
        for drug in drugs:
            drug_id = drug.find('entry:drugbank-id', ns).text
            drug_name = drug.find('entry:name', ns).text
            synonyms = drug.find('entry:synonyms', ns)
            drug_synonyms = set()
            for synonym in synonyms:
                language = synonym.get('language')
                if language == '' or language == 'English':
                    drug_synonyms.add(synonym.text)
            drug_cas_number = drug.find('entry:cas-number', ns).text
            drug_brands = set()
            for product in drug.find('entry:products', ns):
                drug_brands.add(product.find('entry:name', ns).text)
            for int_brand in drug.find('entry:international-brands', ns):
                drug_brands.add(int_brand.find('entry:name', ns).text)
            drug_type = drug.get('type')
            drug_groups = set()
            for group in drug.find('entry:groups', ns):
                drug_groups.add(group.text)
            drug_categories = set()
            for category in drug.find('entry:categories', ns):
                drug_categories.add(
                    category.find('entry:category', ns).text.lower())
            targets = drug.find('entry:targets', ns)
            if len(targets) == 0:
                continue
            drug_info[drug_id] = (drug_name, tuple(sorted(drug_synonyms)),
                                  drug_cas_number, tuple(sorted(drug_brands)),
                                  drug_type, tuple(sorted(drug_groups)),
                                  tuple(sorted(drug_categories)))
            for target in targets:
                organism = target.find('entry:organism', ns).text
                if organism != 'Human':
                    continue
                gene_id = target.find('entry:id', ns).text
                known_action = target.find('entry:known-action', ns).text
                target_actions = set()
                for action in target.find('entry:actions', ns):
                    target_actions.add(action.text)
                gene_symbol = hgnc_gene_acc = uniprot_id = entrez_id = ensembl_id = None
                raw_refs = target.find('entry:references', ns).text
                refs_regex = re.compile(
                    r'"Pubmed":http://www.ncbi.nlm.nih.gov/pubmed/(\d+)')
                references = set()
                try:
                    for string in raw_refs.split('#'):
                        match = refs_regex.search(string)
                        if match:
                            references.add(match.group(1))
                except AttributeError:
                    pass
                references = tuple(references)
                polypeptide = target.find('entry:polypeptide', ns)
                synonyms = None
                if polypeptide is not None:
                    gene_symbol = polypeptide.find('entry:gene-name', ns).text
                    for identifier in polypeptide.find(
                            'entry:external-identifiers', ns):
                        if identifier.find(
                                'entry:resource', ns
                        ).text == 'HUGO Gene Nomenclature Committee (HGNC)':
                            hgnc_gene_acc = identifier.find(
                                'entry:identifier', ns).text
                            # Some identifiers are incorrectly labeled by DrugBank
                            r = re.compile(r'^\d+$')
                            if hgnc_gene_acc.startswith('GNC:'):
                                hgnc_gene_acc = 'H' + hgnc_gene_acc
                            elif r.match(hgnc_gene_acc):
                                hgnc_gene_acc = 'HGNC:' + hgnc_gene_acc
                            try:
                                synonyms = hgnc_id_to_info[hgnc_gene_acc]
                            except KeyError:
                                hgnc_fail += 1
                            else:
                                entrez_id = synonyms['Entrez']
                                try:
                                    ensembl_id = synonyms['Ensembl']
                                except KeyError:
                                    no_ensembl += 1
                                hgnc_success += 1
                        elif identifier.find('entry:resource',
                                             ns).text == 'UniProtKB':
                            uniprot_id = identifier.find(
                                'entry:identifier', ns).text
                    if not synonyms:
                        try:
                            entrez_id = uniprot_to_entrez[uniprot_id]
                            synonyms = entrez_to_info[entrez_id]
                        except KeyError:
                            uniprot_fail += 1
                        else:
                            uniprot_success += 1
                if not synonyms:
                    try:
                        synonyms = symbol_to_info[gene_symbol]
                    except KeyError:
                        no_info += 1
                    else:
                        entrez_id = synonyms['Entrez']
                        ensembl_id = synonyms['Ensembl']
                        info += 1
                interaction_tuple = (gene_id, known_action,
                                     tuple(sorted(target_actions)),
                                     gene_symbol, uniprot_id, entrez_id,
                                     ensembl_id, references)
                total += 1
                try:
                    interactions[drug_id].append(interaction_tuple)
                except KeyError:
                    interactions[drug_id] = [interaction_tuple]
        self.interactions = interactions
        self.drug_info = drug_info

    def write(self):
        print('Writing to .tsv...')
        i = 0
        no_ensembl = no_entrez = total = 0
        header = ('count', 'drug_id', 'drug_name', 'drug_synonyms',
                  'drug_cas_number', 'drug_brands', 'drug_type', 'drug_groups',
                  'drug_categories', 'gene_id', 'known_action',
                  'target_actions', 'gene_symbol', 'uniprot_id', 'entrez_id',
                  'ensembl_id', 'pmid')
        with open('data/DrugBankInteractions.tsv', 'w') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerow(header)
            for drug in sorted(self.interactions):
                for interaction in self.interactions[drug]:
                    i += 1
                    data = (i, drug) + self.drug_info[drug] + interaction
                    out = list()
                    for datum in data:
                        if isinstance(datum, tuple):
                            datum = ','.join(datum)
                        datum = str(datum).replace("\t", '')
                        if not datum or datum == 'None':
                            datum = 'N/A'
                        out.append(datum)
                    # Some small number of rows contain tabs within text.
                    if out[14] == 'N/A':
                        no_entrez += 1
                    if out[15] == 'N/A':
                        no_ensembl += 1
                    writer.writerow(out)
        self.version.write_log()

    def update(self):
        if not self.is_current():
            self.download_files()
            self.parse()
            self.write()
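# End-to-end sketch for the legacy (wget/urlopen-based) importers defined above;
# at this point in the module, DrugBank and Entrez name those legacy classes.
# Running this hits NCBI and DrugBank directly, so treat it as an illustration only.
if __name__ == '__main__':
    drugbank = DrugBank()
    drugbank.update()  # download_files() also triggers Entrez().update()
    GO().update()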