pickles_dir) # Use GENCODE datasource to get list of all possible genes gencode_ds_loc = expanduser(args.gencode_ds) gencode_ds = DatasourceFactory.createDatasource( configFilename=gencode_ds_loc, leafDir=os.path.dirname(gencode_ds_loc)) # Use simple_uniprot TSV to get the uniprot_entry_names # Create the transcript to uniprot info mappings. But take less RAM. Given a gene, get the uniprot record. uniprotDS = GenericTranscriptDatasource(src_file=uniprot_tsv, title="UniProt", version="2014_12", geneColumnName="gene") # key is the uniprot_entry_name from the uniprotDS muts = generateTranscriptMuts(gencode_ds, uniprotDS) swissKeys = swiss_data.keys() tremblKeys = trembl_data.keys() featureTypeToAnnotation = { "SITE": "site", "VARIANT": "natural_variation", "COMPBIAS": "region", "REGION": "region", "DOMAIN": "region", "CONFLICT": "experimental_info" } featureTypes = featureTypeToAnnotation.keys() ctr = 0 numTranscriptsNotInUniprot = 0
# TODO: Remove hardcoded paths # TODO: Reduce code duplication swiss_data = parseWithShove(uniprot_swiss_fname, parse_uniprot_data, "/bulk/pickles/") trembl_data = parseWithShove(uniprot_trembl_fname, parse_uniprot_data, "/bulk/pickles/") # Use GAF datasource to get list of all possible genes gafDS = Gaf(gaf_file, gaf_transcript_file) # Use simple_uniprot TSV to get the uniprot_entry_names # Create the gene to uniprot info mappings. But take less RAM. Given a gene, get the uniprot record. uniprotDS = Generic_Gene_DataSource(src_file=uniprot_tsv, title="UniProt", version="2011_09", geneColumnName="gene") # key is the uniprot_entry_name from the uniprotDS muts = generateTranscriptMuts(gafDS, uniprotDS) swissKeys = swiss_data.keys() tremblKeys = trembl_data.keys() featureTypeToAnnotation = {"SITE":"site", "VARIANT":"natural_variation", "COMPBIAS":"region" , "REGION":"region", "DOMAIN":"region", "CONFLICT":"experimental_info"} featureTypes = featureTypeToAnnotation.keys() ctr = 0 numTranscriptsNotInUniprot = 0 uniprotEntryNameKey = 'UniProt_uniprot_entry_name' for m in muts: ctr += 1 if (ctr % 1000) == 0: print(str(ctr)) if m[uniprotEntryNameKey] in swissKeys:
# TODO: Reduce code duplication
swiss_data = parse_with_shove(
    uniprot_swiss_fname, parse_uniprot_data, pickles_dir)
trembl_data = parse_with_shove(
    uniprot_trembl_fname, parse_uniprot_data, pickles_dir)

# Use the GENCODE datasource to get the list of all possible genes.
gencode_ds_loc = expanduser(args.gencode_ds)
gencode_ds = DatasourceFactory.createDatasource(
    configFilename=gencode_ds_loc,
    leafDir=os.path.dirname(gencode_ds_loc))

# Use the simple_uniprot TSV to get the uniprot_entry_names.
# Build the transcript-to-UniProt info mapping while using less RAM:
# given a gene, get the UniProt record.
uniprotDS = GenericTranscriptDatasource(
    src_file=uniprot_tsv,
    title="UniProt",
    version="2014_12",
    geneColumnName="gene")

# The key is the uniprot_entry_name taken from uniprotDS.
muts = generateTranscriptMuts(gencode_ds, uniprotDS)

swissKeys = swiss_data.keys()
tremblKeys = trembl_data.keys()

# Maps a UniProt feature type to the annotation name it is reported under.
featureTypeToAnnotation = {
    "SITE": "site",
    "VARIANT": "natural_variation",
    "COMPBIAS": "region",
    "REGION": "region",
    "DOMAIN": "region",
    "CONFLICT": "experimental_info",
}
featureTypes = featureTypeToAnnotation.keys()

ctr = 0
numTranscriptsNotInUniprot = 0
uniprotEntryNameKey = 'UniProt_uniprot_entry_name'

# Bookkeeping used to avoid handling the same transcript more than once.
txs_already_processed = set()
records_already_processed = set()
num_txs_with_same_data = 0

for m in muts:
    # Skip any mutation whose transcript has already been handled.
    if m['transcript_id'] in txs_already_processed:
        continue