def main():
    """Rebuild the HGNC gene annotation from the genenames.org download page.

    Reads the column identifiers offered by the HGNC download page, requests
    a text export containing all of them, and loads the parsed rows into the
    module-level ``genes`` object, which is then saved and has its mappings
    built.
    """
    os.chdir(path)

    # Pull every column identifier out of the download page markup.
    page = HTML(url="http://www.genenames.org/cgi-bin/hgnc_downloads.cgi")
    columns = page.find_between("</td> <td>", "</td>", '"', all=True)
    print("Number of attributes: %s" % len(columns))

    # Compose the export URL: fixed prefix, one ";col=<id>" per column
    # (e.g. ;col=gd_hgnc_id;col=gd_app_sym;col=gd_app_name;...), fixed suffix.
    column_key = ";col="
    prefix = "http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=Core+Data"
    selection = column_key + column_key.join(columns)
    suffix = ";status=Approved;status=Approved+Non-Human;status=Entry+Withdrawn;status_opt=3;=on;where=;order_by=gd_app_sym_sort;limit=;format=text;submit=submit;.cgifields=;.cgifields=status;.cgifields=chr"
    export_url = prefix + selection + suffix

    hgnc_file = File(name="hgnc.txt", url=export_url, path=path)
    rows = hgnc_file.parse(printing=False, header=True)

    # Label the collection as human HGNC data before loading the rows.
    genes.name = "HGNC"
    genes.key = "hgnc"
    genes.taxid = 9606
    genes.addData(rows)
    genes.save()
    genes.buildMappings()
def main():
    """Fetch the full HGNC table and load it into the ``genes`` collection.

    The set of exported columns is not hard-coded: it is taken from the
    attribute identifiers found on the HGNC download page, so the export
    covers whatever columns that page currently lists.
    """
    os.chdir(path)

    # Step 1: discover the column identifiers listed on the download page.
    download_page = HTML(url="http://www.genenames.org/cgi-bin/hgnc_downloads.cgi")
    attribute_ids = download_page.find_between(
        "</td> <td>", "</td>", '"', all=True
    )
    print("Number of attributes: %s" % len(attribute_ids))

    # Step 2: build the export URL from its three pieces. The middle piece
    # looks like ";col=gd_hgnc_id;col=gd_app_sym;col=gd_app_name;...".
    separator = ";col="
    url_parts = (
        "http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=Core+Data",
        separator + separator.join(attribute_ids),
        ";status=Approved;status=Approved+Non-Human;status=Entry+Withdrawn;status_opt=3;=on;where=;order_by=gd_app_sym_sort;limit=;format=text;submit=submit;.cgifields=;.cgifields=status;.cgifields=chr",
    )
    request_url = url_parts[0] + url_parts[1] + url_parts[2]

    # Step 3: parse the export and hand the records to the gene collection.
    source = File(name="hgnc.txt", url=request_url, path=path)
    records = source.parse(printing=False, header=True)

    genes.name = "HGNC"
    genes.key = "hgnc"
    genes.taxid = 9606
    genes.addData(records)
    genes.save()
    genes.buildMappings()
def main(interactions=False, download=True, parse=True, withdrawn=True, cleanup=True):
    """Download MGI report files and build the gene annotation and mapping tables.

    Args:
        interactions: Also download the MGI protein-interaction FTP directory
            into ``path``.
        download: Fetch the MGI report files listed in this function.
        parse: Load the report files into the module-level ``genes`` object.
        withdrawn: Read ``MRK_List1.rpt`` when true, ``MRK_List2.rpt`` otherwise.
        cleanup: Remove the downloaded report files (and the interaction
            download, if one was made) once processing is done.

    TODO:
    - Inspect and eventually use the interaction file, else discard it from
      this module.
    - Check whether other information from MGI, such as homology or
      phenotypes, is worth integrating.
    """
    # NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    # source; confirm that the final keep/remove/save/buildMappings calls are
    # meant to run unconditionally.
    os.chdir(path)
    genes.name = 'MGI'
    genes.key = 'mgi'
    folder = Folder(path)

    if interactions:
        ftp = FTP(
            url='ftp://ftp.informatics.jax.org/pub/protein-interaction-data/',
            path=path)
        ftp.download(path)

    if download:
        url = "ftp://ftp.informatics.jax.org/pub/reports/"
        files = [
            "MRK_List1.rpt",
            "MRK_List2.rpt",
            "MGI_Coordinate.rpt",
            "MRK_Sequence.rpt",
            "MRK_SwissProt_TrEMBL.rpt",
            "MRK_VEGA.rpt",
            "MRK_ENSEMBL.rpt",
            "MGI_EntrezGene.rpt",
        ]
        # Not in the list: MPheno_OBO.ontology, VOC_MammalianPhenotype.rpt,
        # MGI_PhenotypicAllele.rpt, HMD_HumanPhenotype.rpt
        for report_name in files:
            report = File(url=url + report_name)  # automatically does f.download()
            # The parsed rows are not used here; the call is kept in case
            # parse() has side effects.
            report.parse(header=True, printing=False)
            folder.downloads.append(report.name)

    if parse:
        folder.update()

        filename = "MRK_List1.rpt" if withdrawn else "MRK_List2.rpt"
        data = folder[filename].parse(header=True, printing=False)
        genes.addData(data, key='mgi', taxid=10090)

        # Coordinate rows are added one at a time, after change_keys() and
        # with the taxid set on each row.
        data = folder["MGI_Coordinate.rpt"].parse(header=True, printing=False)
        for row in data:
            row = change_keys(row)
            row['taxid'] = 10090
            genes.add(row)

        data = folder['MRK_Sequence.rpt'].parse(header=True, printing=False)
        genes.addData(data, key='mgi', taxid=10090)

        # This report is parsed with an explicit list of column names.
        header = "mgi symbol status name cm_position chromosome type "\
                 "secondary_accession_ids id synonyms feature_types start "\
                 "stop strand biotypes".split()
        data = folder["MGI_EntrezGene.rpt"].parse(header=header, printing=False)
        genes.addData(data, key="mgi", taxid=10090)

        # Call form prints the same value on Python 2 and is valid on Python 3.
        print(len(genes))

    if cleanup:
        if interactions:
            ftp.remove(confirm=False)
        # Iterate over a snapshot in case folder.remove() edits the list.
        for downloaded in list(folder.downloads):
            folder.remove(downloaded)

    genes.keep("category", "Gene")
    genes.remove("name", "withdrawn")
    genes.save()
    genes.buildMappings()
def main(interactions=False, download=True, parse=True, withdrawn=True, cleanup=True):
    """Perform the download of interaction and annotation files from MGI.

    Builds a gene annotation file and mapping tables in the module-level
    ``genes`` object.

    Args:
        interactions: Download the MGI protein-interaction FTP directory too.
        download: Fetch the report files from the MGI reports FTP directory.
        parse: Parse the report files and add their rows to ``genes``.
        withdrawn: Select ``MRK_List1.rpt`` (true) or ``MRK_List2.rpt`` (false)
            as the marker list to load.
        cleanup: Remove downloaded files after processing.

    TODO:
    - Inspect and eventually use the interaction file, else discard it from
      this module.
    - Also check whether other information from MGI, such as homology or
      phenotypes, is worth integrating.
    """
    # NOTE(review): indentation was lost in the source this was restored from;
    # verify that the closing keep/remove/save/buildMappings steps belong at
    # function level rather than inside the cleanup branch.
    os.chdir(path)
    genes.name = "MGI"
    genes.key = "mgi"
    folder = Folder(path)

    if interactions:
        ftp = FTP(
            url="ftp://ftp.informatics.jax.org/pub/protein-interaction-data/",
            path=path,
        )
        ftp.download(path)

    if download:
        url = "ftp://ftp.informatics.jax.org/pub/reports/"
        files = [
            "MRK_List1.rpt",
            "MRK_List2.rpt",
            "MGI_Coordinate.rpt",
            "MRK_Sequence.rpt",
            "MRK_SwissProt_TrEMBL.rpt",
            "MRK_VEGA.rpt",
            "MRK_ENSEMBL.rpt",
            "MGI_EntrezGene.rpt",
        ]
        # Not in the list: MPheno_OBO.ontology, VOC_MammalianPhenotype.rpt,
        # MGI_PhenotypicAllele.rpt, HMD_HumanPhenotype.rpt
        for name in files:
            # Separate names for the filename and the File object, so the
            # loop variable is not rebound mid-iteration.
            fetched = File(url=url + name)  # automatically does f.download()
            # Return value unused; call retained in case parse() has side effects.
            fetched.parse(header=True, printing=False)
            folder.downloads.append(fetched.name)

    if parse:
        folder.update()

        if withdrawn:
            filename = "MRK_List1.rpt"
        else:
            filename = "MRK_List2.rpt"
        data = folder[filename].parse(header=True, printing=False)
        genes.addData(data, key="mgi", taxid=10090)

        data = folder["MGI_Coordinate.rpt"].parse(header=True, printing=False)
        for entry in data:
            entry = change_keys(entry)
            entry["taxid"] = 10090
            genes.add(entry)

        data = folder["MRK_Sequence.rpt"].parse(header=True, printing=False)
        genes.addData(data, key="mgi", taxid=10090)

        # Column names are supplied explicitly for this report.
        header = (
            "mgi symbol status name cm_position chromosome type "
            "secondary_accession_ids id synonyms feature_types start "
            "stop strand biotypes".split()
        )
        data = folder["MGI_EntrezGene.rpt"].parse(header=header, printing=False)
        genes.addData(data, key="mgi", taxid=10090)

        # print(...) with one argument behaves the same on Python 2 and 3,
        # unlike the bare print statement.
        print(len(genes))

    if cleanup:
        if interactions:
            ftp.remove(confirm=False)
        # Walk a copy of the list in case folder.remove() modifies it.
        for leftover in list(folder.downloads):
            folder.remove(leftover)

    genes.keep("category", "Gene")
    genes.remove("name", "withdrawn")
    genes.save()
    genes.buildMappings()