def pull_pantherfamily():
    """Fetch the PANTHER human sequence classification file over FTP.

    The remote file ``PTHR16.0_human`` is requested from ftp.pantherdb.org
    and handed to ``pull_via_ftp`` with ``family.csv`` under the
    ``PANTHERFAMILY`` prefix as the output file name.
    """
    host = 'ftp.pantherdb.org'
    remote_dir = '/sequence_classifications/current_release/PANTHER_Sequence_Classification_files/'
    remote_file = 'PTHR16.0_human'
    pull_via_ftp(
        host,
        remote_dir,
        remote_file,
        outfilename=f'{PANTHERFAMILY}/family.csv',
    )
def pull_pubchem():
    """Fetch the PubChem compound "Extras" files over FTP.

    Each remote file is requested from ftp.ncbi.nlm.nih.gov and passed to
    ``pull_via_ftp`` with an output name of ``PUBCHEM/<remote file name>``.
    """
    host = 'ftp.ncbi.nlm.nih.gov'
    remote_dir = '/pubchem/Compound/Extras'
    for remote_name in ('CID-MeSH', 'CID-Synonym-filtered.gz', 'CID-Title.gz'):
        pull_via_ftp(host, remote_dir, remote_name, outfilename=f'PUBCHEM/{remote_name}')
def pull_ncbigene(filenames):
    """Fetch the requested NCBI Gene data files over FTP.

    Args:
        filenames: iterable of file names found in ``/gene/DATA`` on
            ftp.ncbi.nih.gov.

    Each file is requested with ``decompress_data=False`` and an output
    name of ``NCBIGene/<file name>``, so the remote name (including any
    compression suffix) is preserved locally.
    """
    # The previous version also assigned an unused HTTPS ``remotedir`` local;
    # the download has always gone through FTP, so that dead assignment is gone.
    for fn in filenames:
        pull_via_ftp(
            'ftp.ncbi.nih.gov',
            '/gene/DATA',
            fn,
            decompress_data=False,
            outfilename=f'NCBIGene/{fn}',
        )
def pull_chebi():
    """Fetch the ChEBI structure file and accession table over FTP.

    Two files are requested from ftp.ebi.ac.uk:

    * ``ChEBI_complete.sdf.gz`` with ``decompress_data=True``, output name
      ``CHEBI/ChEBI_complete.sdf``.
    * ``database_accession.tsv`` with the default decompression setting,
      output name ``CHEBI/database_accession.tsv``.
    """
    host = 'ftp.ebi.ac.uk'
    # (remote directory, remote file, keyword arguments) for each download.
    downloads = (
        (
            '/pub/databases/chebi/SDF/',
            'ChEBI_complete.sdf.gz',
            {'decompress_data': True, 'outfilename': 'CHEBI/ChEBI_complete.sdf'},
        ),
        (
            '/pub/databases/chebi/Flat_file_tab_delimited/',
            'database_accession.tsv',
            {'outfilename': 'CHEBI/database_accession.tsv'},
        ),
    )
    for remote_dir, remote_file, options in downloads:
        pull_via_ftp(host, remote_dir, remote_file, **options)
def pull_chembl(moleculefilename):
    """Fetch the latest ChEMBL RDF molecule file and the CCO ontology file.

    Args:
        moleculefilename: path whose final ``/``-separated component is used
            as the local file name for the molecule download (placed under
            ``CHEMBLCOMPOUND/``).

    Nothing is downloaded when ``get_latest_chembl_name()`` returns ``None``.
    """
    # NOTE(review): both downloads are kept under the guard below; confirm
    # that the cco.ttl download is not meant to run unconditionally.
    fname = get_latest_chembl_name()
    if fname is not None:
        # fname should be like chembl_28.0_molecule.ttl.gz
        # pull_via_ftp is going to add the download_dir, so this is a hack
        # until pull_via_ftp is nicer.
        oname = 'CHEMBLCOMPOUND/' + moleculefilename.split('/')[-1]
        pull_via_ftp(
            'ftp.ebi.ac.uk',
            '/pub/databases/chembl/ChEMBL-RDF/latest/',
            fname,
            decompress_data=True,
            outfilename=oname,
        )
        pull_via_ftp(
            'ftp.ebi.ac.uk',
            '/pub/databases/chembl/ChEMBL-RDF/latest/',
            'cco.ttl.gz',
            decompress_data=True,
            outfilename='CHEMBL/cco.ttl',
        )
def pull_pubchem_labels():
    """Download PubChem compound titles and write them as a labels file.

    ``CID-Title.gz`` is fetched over FTP and read as latin-1 text. For each
    tab-separated line, the first two fields are written to the local
    ``labels`` file for ``PUBCHEM.COMPOUND`` as
    ``PUBCHEM.COMPOUND:<field 0>\\t<field 1>``.
    """
    print('LABEL PUBCHEM')
    f_name = 'CID-Title.gz'
    # pull_via_ftp returns the name of the local compressed file it wrote.
    compressed_path = pull_via_ftp(
        'ftp.ncbi.nlm.nih.gov',
        '/pubchem/Compound/Extras/',
        f_name,
        outfilename=f_name,
    )
    labels_path = make_local_name('labels', subpath='PUBCHEM.COMPOUND')
    with open(labels_path, 'w') as outf, \
            gzip.open(compressed_path, mode='rt', encoding='latin-1') as inf:
        for raw_line in inf:
            x = raw_line.strip().split('\t')
            outf.write(f'PUBCHEM.COMPOUND:{x[0]}\t{x[1]}\n')
def test_pull_gzip_to_memory():
    """Pull a gzipped file over FTP and get its decompressed text in memory."""
    text = pull_via_ftp(
        'ftp.ncbi.nlm.nih.gov',
        'gene/DATA/',
        'gene_group.gz',
        decompress_data=True,
    )
    rows = text.split('\n')
    header = rows[0]
    assert len(rows) > 1000
    assert header.startswith('#tax_id')
def test_pull_gzip_to_compressed_file():
    """Pull a gzipped file into a local file that stays gzip-compressed."""
    target_name = 'test_gz.gz'
    local_path = pull_via_ftp(
        'ftp.ncbi.nlm.nih.gov',
        'gene/DATA/',
        'gene_group.gz',
        outfilename=target_name,
    )
    # The file on disk must still be readable as gzip.
    with gzip.open(local_path, 'rt') as handle:
        rows = handle.read().split('\n')
    header = rows[0]
    assert len(rows) > 1000
    assert header.startswith('#tax_id')
def test_pull_text_to_file():
    """Pull a plain-text file over FTP into a local file and read it back."""
    target_name = 'test_text'
    local_path = pull_via_ftp(
        'ftp.ncbi.nlm.nih.gov',
        'gene/DATA/',
        'stopwords_gene',
        outfilename=target_name,
    )
    with open(local_path, 'r') as handle:
        rows = handle.read().split('\n')
    first_row = rows[0]
    assert len(rows) > 100
    assert first_row == 'a'
def pull_pubchem_synonyms():
    """Download PubChem filtered synonyms and write them as a synonyms file.

    ``CID-Synonym-filtered.gz`` is fetched over FTP and read as latin-1
    text. Lines whose second tab-separated field starts with ``CHEBI`` or
    ``SCHEMBL`` are skipped; every other line becomes a
    ``hasRelatedSynonym`` row in the local ``synonyms`` file for
    ``PUBCHEM.COMPOUND``.
    """
    f_name = 'CID-Synonym-filtered.gz'
    compressed_path = pull_via_ftp(
        'ftp.ncbi.nlm.nih.gov',
        '/pubchem/Compound/Extras/',
        f_name,
        outfilename=f_name,
    )
    synonyms_path = make_local_name('synonyms', subpath='PUBCHEM.COMPOUND')
    skipped_prefixes = ('CHEBI', 'SCHEMBL')
    with open(synonyms_path, 'w') as outf, \
            gzip.open(compressed_path, mode='rt', encoding='latin-1') as inf:
        for raw_line in inf:
            x = raw_line.strip().split('\t')
            if not x[1].startswith(skipped_prefixes):
                outf.write(f'PUBCHEM.COMPOUND:{x[0]}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{x[1]}\n')
def pull_hgnc():
    """Download the HGNC complete set and write label and synonym files.

    The JSON document is pulled into memory and parsed. For every entry in
    ``response.docs``:

    * the labels file gets ``<hgnc_id>\\t<symbol>``;
    * the synonyms file gets the gene ``name`` as an exact synonym, followed
      by every ``alias_symbol`` and then every ``alias_name`` (when those
      keys are present) as related synonyms.
    """
    raw_json = pull_via_ftp(
        'ftp.ebi.ac.uk',
        '/pub/databases/genenames/new/json',
        'hgnc_complete_set.json',
    )
    documents = loads(raw_json)['response']['docs']
    labels_path = make_local_name('labels', subpath='HGNC')
    synonyms_path = make_local_name('synonyms', subpath='HGNC')
    with open(labels_path, 'w') as lfile, open(synonyms_path, 'w') as sfile:
        for gene in documents:
            hgnc_id = gene['hgnc_id']
            symbol = gene['symbol']
            lfile.write(f'{hgnc_id}\t{symbol}\n')
            name = gene['name']
            sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasExactSynonym\t{name}\n')
            # Alias symbols first, then alias names; either key may be absent.
            for alias_key in ('alias_symbol', 'alias_name'):
                for asym in gene.get(alias_key, ()):
                    sfile.write(f'{hgnc_id}\thttp://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym\t{asym}\n')
def pull_ncbitaxon():
    """Fetch the NCBI taxonomy dump over FTP.

    ``taxdump.tar.gz`` is requested with ``decompress_data=True`` and an
    output name of ``taxdump.tar`` under the ``NCBITAXON`` prefix.
    """
    tar_name = f'{NCBITAXON}/taxdump.tar'
    pull_via_ftp(
        'ftp.ncbi.nih.gov',
        '/pub/taxonomy',
        'taxdump.tar.gz',
        decompress_data=True,
        outfilename=tar_name,
    )
def pull_hgnc():
    """Fetch the HGNC complete-set JSON over FTP.

    The remote ``hgnc_complete_set.json`` is requested from ftp.ebi.ac.uk
    with an output name of ``HGNC/hgnc_complete_set.json``.
    """
    remote_file = 'hgnc_complete_set.json'
    pull_via_ftp(
        'ftp.ebi.ac.uk',
        '/pub/databases/genenames/new/json',
        remote_file,
        outfilename='HGNC/hgnc_complete_set.json',
    )
def pull_hgncfamily():
    """Fetch the HGNC gene-family table over FTP.

    ``family.csv`` is requested from ftp.ebi.ac.uk with an output name of
    ``family.csv`` under the ``HGNCFAMILY`` prefix.
    """
    remote_dir = '/pub/databases/genenames/new/csv/genefamily_db_tables'
    pull_via_ftp(
        'ftp.ebi.ac.uk',
        remote_dir,
        'family.csv',
        outfilename=f'{HGNCFAMILY}/family.csv',
    )
def pull_mesh():
    """Fetch the MeSH RDF N-Triples dump over FTP.

    ``mesh.nt.gz`` is requested with ``decompress_data=True`` and an output
    name of ``MESH/mesh.nt``.
    """
    host = 'ftp.nlm.nih.gov'
    remote_dir = '/online/mesh/rdf'
    pull_via_ftp(
        host,
        remote_dir,
        'mesh.nt.gz',
        decompress_data=True,
        outfilename='MESH/mesh.nt',
    )
def test_pull_text_to_memory():
    """Pull a plain-text file over FTP straight into memory as a string."""
    text = pull_via_ftp('ftp.ncbi.nlm.nih.gov', 'gene/DATA/', 'stopwords_gene')
    rows = text.split('\n')
    first_row = rows[0]
    assert len(rows) > 100
    assert first_row == 'a'