def main(opts):
    """Build residue graphs per tumor type and write their components.

    For every tumor type found in the multiple testing file, rows whose
    q-value is at or below ``opts['q_value']`` are selected, the matching
    annotation file is read and grouped by PDB structure, and a graph for
    that tumor type is updated from each structure that can be read.  Every
    update is also applied to one graph shared by all tumor types; its
    components are appended to the output under the label 'REF'.

    Parameters
    ----------
    opts : dict
        Options read here: 'pdb_info', 'multiple_testing', 'q_value',
        'annotation_dir', 'radius' and 'output'.

    Notes
    -----
    Structures missing from the PDB info, or whose file cannot be read, are
    skipped.  Annotation rows whose chain / PDB residue cannot be parsed are
    skipped after printing 'int error'.
    """
    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # read in multiple testing file; its first row is the header
    mtc = read_delim(opts['multiple_testing'])
    header = mtc.pop(0)
    ttype_ix = header.index('Tumor Type')
    qval_ix = header.index('q-value')
    gene_ix = header.index('HUGO Symbol')
    tx_ix = header.index('Sequence Ontology Transcript')
    res_ix = header.index('CRAVAT Res')

    # iterate through each tumor type
    output = []
    gene2graph_all = {}  # graph for the combined tumor types
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))

        gene2graph = {}  # graph for this individual tumor type

        # get the significant residues for the tumor type
        mtc_ttype = [
            m for m in mtc
            if m[ttype_ix] == ttype and float(m[qval_ix]) <= opts['q_value']
        ]
        significant_res = {
            (m[gene_ix], m[tx_ix], int(m[res_ix])) for m in mtc_ttype
        }

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        annotation, col_pos = read_mupit_file(annotation_file,
                                              significant_res)
        pdb_ix = col_pos['pdb']
        anot_gene_ix = col_pos['gene']
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']

        # groupby only merges adjacent rows, so sort by structure first
        annotation.sort(key=lambda x: x[pdb_ix])
        for pdb_id, grp in it.groupby(annotation, lambda x: x[pdb_ix]):
            # fringe case: annotation refers to a structure we know nothing of
            if pdb_id not in pdb_info:
                print('skipping ' + pdb_id)
                continue

            # copy so that popping 'path' leaves pdb_info untouched; the
            # remaining values are extended into one flat list of chains
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            struct_chains = []
            for chains in struct_info.values():
                struct_chains.extend(chains)

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            cog = pstruct.calc_center_of_geometry(struct, struct_chains)

            # map (chain, pdb residue) -> (gene, transcript, residue)
            signif_struct_info = {}
            for s in grp:
                try:
                    tmp_pos = (s[col_pos['chain']],
                               int(s[col_pos['pdb_res']]))
                except (KeyError, IndexError, TypeError, ValueError):
                    # malformed row: stay best-effort, but no longer swallow
                    # KeyboardInterrupt / SystemExit like a bare except does
                    print('int error')
                    continue
                signif_struct_info[tmp_pos] = (s[anot_gene_ix],
                                               s[anot_tx_ix],
                                               s[anot_res_ix])

            # update the graph to reflect info from the current structure
            gene2graph = update_graph(gene2graph, cog, signif_struct_info,
                                      struct, opts['radius'])

            # update graph for the combined cross-tumor type regions
            gene2graph_all = update_graph(gene2graph_all, cog,
                                          signif_struct_info, struct,
                                          opts['radius'])

        # format the results into the output list
        output += retrieve_components(gene2graph, ttype)
        logger.info('Finished {0}'.format(ttype))

    # update output to contain cross-tumor type reference regions
    output += retrieve_components(gene2graph_all, 'REF')

    # write output as tab-delimited lines
    # NOTE(review): 'wb' with str writes only works on Python 2
    with open(opts['output'], 'wb') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')
    logger.info('Finished Successfully!!!')
def main(opts):
    """Build residue graphs per tumor type and write their components.

    For every tumor type found in the multiple testing file, rows whose
    q-value is at or below ``opts['q_value']`` are selected, the matching
    annotation file is read and grouped by PDB structure, and a graph for
    that tumor type is updated from each structure that can be read.  For
    tumor types outside a fixed exclusion list the same update is also
    applied to one graph shared across tumor types; its components are
    appended to the output under the label 'REF'.

    Parameters
    ----------
    opts : dict
        Options read here: 'pdb_info', 'multiple_testing', 'q_value',
        'annotation_dir', 'radius' and 'output'.

    Notes
    -----
    Structures missing from the PDB info, or whose file cannot be read, are
    skipped.  Annotation rows whose chain / PDB residue cannot be parsed are
    skipped after printing 'int error'.
    """
    # tumor types that never contribute to the combined 'REF' graph
    banned_ttypes = frozenset(['COAD', 'READ', 'PANCAN12', 'CHOL',
                               'SARC', 'TGCT', 'THYM', 'UVM'])

    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # read in multiple testing file; its first row is the header
    mtc = read_delim(opts['multiple_testing'])
    header = mtc.pop(0)
    ttype_ix = header.index('Tumor Type')
    qval_ix = header.index('q-value')
    gene_ix = header.index('HUGO Symbol')
    tx_ix = header.index('Sequence Ontology Transcript')
    res_ix = header.index('CRAVAT Res')

    # iterate through each tumor type
    output = []
    gene2graph_all = {}  # graph for the combined tumor types
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))

        gene2graph = {}  # graph for this individual tumor type
        in_reference = ttype not in banned_ttypes

        # get the significant residues for the tumor type
        mtc_ttype = [
            m for m in mtc
            if m[ttype_ix] == ttype and float(m[qval_ix]) <= opts['q_value']
        ]
        significant_res = {
            (m[gene_ix], m[tx_ix], int(m[res_ix])) for m in mtc_ttype
        }

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        annotation, col_pos = read_mupit_file(annotation_file,
                                              significant_res)
        pdb_ix = col_pos['pdb']
        anot_gene_ix = col_pos['gene']
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']

        # groupby only merges adjacent rows, so sort by structure first
        annotation.sort(key=lambda x: x[pdb_ix])
        for pdb_id, grp in it.groupby(annotation, lambda x: x[pdb_ix]):
            # an annotation may name a structure absent from the PDB info;
            # skip it instead of aborting the whole run with a KeyError
            if pdb_id not in pdb_info:
                print('skipping ' + pdb_id)
                continue

            # copy so that popping 'path' leaves pdb_info untouched; the
            # remaining values are extended into one flat list of chains
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            struct_chains = []
            for chains in struct_info.values():
                struct_chains.extend(chains)

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            cog = pstruct.calc_center_of_geometry(struct, struct_chains)

            # map (chain, pdb residue) -> (gene, transcript, residue)
            signif_struct_info = {}
            for s in grp:
                try:
                    tmp_pos = (s[col_pos['chain']],
                               int(s[col_pos['pdb_res']]))
                except (KeyError, IndexError, TypeError, ValueError):
                    # malformed row: stay best-effort, but no longer swallow
                    # KeyboardInterrupt / SystemExit like a bare except does
                    print('int error')
                    continue
                signif_struct_info[tmp_pos] = (s[anot_gene_ix],
                                               s[anot_tx_ix],
                                               s[anot_res_ix])

            # update the graph to reflect info from the current structure
            gene2graph = update_graph(gene2graph, cog, signif_struct_info,
                                      struct, opts['radius'])

            # update graph for the combined cross-tumor type regions
            if in_reference:
                gene2graph_all = update_graph(gene2graph_all, cog,
                                              signif_struct_info, struct,
                                              opts['radius'])

        # format the results into the output list
        output += retrieve_components(gene2graph, ttype)
        logger.info('Finished {0}'.format(ttype))

    # update output to contain cross-tumor type reference regions
    output += retrieve_components(gene2graph_all, 'REF')

    # write output as tab-delimited lines
    # NOTE(review): 'wb' with str writes only works on Python 2
    with open(opts['output'], 'wb') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')
    logger.info('Finished Successfully!!!')
def main(opts):
    """Build a residue graph per structure and write its components.

    Residue-level rows are read from ``opts['input']`` and per-tumor-type
    p-value thresholds from ``opts['significance']``.  For every tumor type
    that has a threshold, rows whose hotspot p-value is at or below it define
    the significant (structure, chain, residue) triples.  The matching
    annotation file is grouped by PDB structure; for each readable structure
    the annotated positions are split into significant and non-significant
    ones, a fresh graph is built from them, and the components returned for
    that structure are appended to the output.

    Parameters
    ----------
    opts : dict
        Options read here: 'pdb_info', 'input', 'significance',
        'annotation_dir', 'radius' and 'output'.

    Notes
    -----
    Tumor types without a threshold are skipped, as are structures missing
    from the PDB info or whose file cannot be read.  Annotation rows whose
    chain / PDB residue cannot be parsed are skipped silently.
    """
    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # use external module to separate out the residues in the hotspot.py
    # output onto separate lines
    mtc = read_residue_info(opts['input'])
    pval_thresholds = read_thresholds(opts['significance'])

    # first row is the header
    header = mtc.pop(0)
    ttype_ix = header.index('Tumor Type')
    struct_ix = header.index('Structure')
    # not used below; lookup kept so a missing 'Model' column still raises
    model_ix = header.index('Model')
    chain_ix = header.index('Chain')
    res_ix = header.index('Mutation Residues')
    pval_ix = header.index('Hotspot P-value')

    # iterate through each tumor type
    output = []
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))

        # if there is no pval threshold, nothing is significant
        if ttype not in pval_thresholds:
            continue
        threshold = pval_thresholds[ttype]

        # get the significant residues for the tumor type
        mtc_ttype = [
            m for m in mtc
            if m[ttype_ix] == ttype and float(m[pval_ix]) <= threshold
        ]
        significant_res = [(m[struct_ix], m[chain_ix], int(m[res_ix]))
                           for m in mtc_ttype]
        # set copy for the per-row membership test below; the list itself is
        # still what gets handed to read_mupit_file
        significant_lookup = set(significant_res)

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        all_annotation, col_pos = read_mupit_file(annotation_file,
                                                  significant_res)
        pdb_ix = col_pos['pdb']
        # not used below; lookup kept so a missing 'gene' key still raises
        anot_gene_ix = col_pos['gene']
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']

        # groupby only merges adjacent rows, so sort by structure first
        all_annotation.sort(key=lambda x: x[pdb_ix])
        for pdb_id, grp in it.groupby(all_annotation, lambda x: x[pdb_ix]):
            # an annotation may name a structure absent from the PDB info;
            # skip it instead of aborting the whole run with a KeyError
            if pdb_id not in pdb_info:
                print('skipping ' + pdb_id)
                continue

            # each structure gets its own, initially empty, graph
            struct2graph = {}

            # copy so that popping 'path' leaves pdb_info untouched; the
            # remaining values are extended into one flat list of chains
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            struct_chains = []
            for chains in struct_info.values():
                struct_chains.extend(chains)

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            all_cogs = pstruct.calc_center_of_geometry(struct, struct_chains)

            # split annotated positions into significant / non-significant
            signif_struct_info = {}
            non_signif_struct_info = {}
            for s in grp:
                try:
                    chain = s[col_pos['chain']]
                    pdb_res = int(s[col_pos['pdb_res']])
                except (KeyError, IndexError, TypeError, ValueError):
                    # malformed row: stay best-effort, but no longer swallow
                    # KeyboardInterrupt / SystemExit like a bare except does
                    continue
                tmp_pos = (chain, pdb_res)
                info = (s[pdb_ix], s[anot_tx_ix], int(s[anot_res_ix]))
                if (s[pdb_ix], chain, pdb_res) in significant_lookup:
                    signif_struct_info[tmp_pos] = info
                else:
                    non_signif_struct_info[tmp_pos] = info

            # update the graph to reflect info from the current structure
            struct2graph, signif_res_neighbours = update_graph(
                struct2graph, all_cogs, signif_struct_info,
                non_signif_struct_info, struct, opts['radius'])

            # format the results into the output list
            output += retrieve_components(struct2graph, ttype, all_cogs,
                                          opts['radius'],
                                          signif_res_neighbours)

        logger.info('Finished {0}'.format(ttype))

    # write output as tab-delimited lines
    # NOTE(review): 'wb' with str writes only works on Python 2
    with open(opts['output'], 'wb') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')
    logger.info('Finished Successfully!!!')