def main(opts):
    """Build graphs of significant residues per tumor type and write them out.

    Rows of the multiple testing file with a q-value at or below
    ``opts['q_value']`` define the significant residues of each tumor type.
    For every tumor type the matching ``mupit_mutations_<tumor type>`` file
    in ``opts['annotation_dir']`` is read, its rows are grouped by PDB
    structure, and each structure is used to update both the tumor type
    specific graph and (unless the tumor type is banned) a graph shared by
    all tumor types, which is reported under the label ``'REF'``.

    Parameters
    ----------
    opts : dict
        Options. Keys read here: ``'pdb_info'``, ``'multiple_testing'``,
        ``'q_value'``, ``'annotation_dir'``, ``'radius'`` and ``'output'``.

    Notes
    -----
    Output rows are written tab-delimited to ``opts['output']``.
    Structures named in the annotation file but absent from the PDB info
    file are skipped, as are structures that could not be read.
    """
    # tumor types that never contribute to the combined ('REF') graph;
    # defined once instead of being rebuilt for every structure
    banned_ttypes = ['COAD', 'READ', 'PANCAN12', 'CHOL',
                     'SARC', 'TGCT', 'THYM', 'UVM']

    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # read in multiple testing file
    mtc = read_delim(opts['multiple_testing'])
    header = mtc.pop(0)
    ttype_ix = header.index('Tumor Type')
    qval_ix = header.index('q-value')
    gene_ix = header.index('HUGO Symbol')
    tx_ix = header.index('Sequence Ontology Transcript')
    res_ix = header.index('CRAVAT Res')

    # iterate through each tumor type
    output = []
    gene2graph_all = {}  # graphs for combined tumor types
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))
        in_reference = ttype not in banned_ttypes

        # graph for an individual tumor type, initially empty
        gene2graph = {}

        # get the significant residues for the tumor type
        mtc_ttype = [m for m in mtc
                     if m[ttype_ix] == ttype
                     and float(m[qval_ix]) <= opts['q_value']]
        significant_res = {(m[gene_ix], m[tx_ix], int(m[res_ix]))
                           for m in mtc_ttype}

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        annotation, col_pos = read_mupit_file(annotation_file,
                                              significant_res)
        pdb_ix = col_pos['pdb']
        anot_gene_ix = col_pos['gene']
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']
        anot_chain_ix = col_pos['chain']
        anot_pdb_res_ix = col_pos['pdb_res']

        # groupby only merges adjacent rows, so sort by structure first
        annotation.sort(key=lambda x: x[pdb_ix])
        for pdb_id, grp in it.groupby(annotation, lambda x: x[pdb_ix]):
            # fringe case: annotated structure missing from the PDB info
            if pdb_id not in pdb_info:
                print('skipping ' + pdb_id)
                continue

            # copy so that popping 'path' leaves pdb_info untouched for the
            # next tumor type that references the same structure
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            struct_chains = []
            for d in struct_info:
                struct_chains.extend(struct_info[d])

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            cog = pstruct.calc_center_of_geometry(struct, struct_chains)

            # map (chain, pdb residue number) -> (gene, transcript, residue)
            signif_struct_info = {}
            for s in grp:
                try:
                    tmp_pos = (s[anot_chain_ix], int(s[anot_pdb_res_ix]))
                except (ValueError, TypeError, IndexError):
                    # residue number is not an integer (or the row is
                    # truncated); such rows cannot be placed on the structure
                    print('int error')
                    continue
                signif_struct_info[tmp_pos] = (s[anot_gene_ix],
                                               s[anot_tx_ix],
                                               s[anot_res_ix])

            # update the graph to reflect info from the current structure
            gene2graph = update_graph(gene2graph, cog, signif_struct_info,
                                      struct, opts['radius'])

            # update graph for the combined cross-tumor type regions
            if in_reference:
                gene2graph_all = update_graph(gene2graph_all, cog,
                                              signif_struct_info, struct,
                                              opts['radius'])

        # format the results into the output list
        output += retrieve_components(gene2graph, ttype)
        logger.info('Finished {0}'.format(ttype))

    # update output to contain cross-tumor type reference regions
    output += retrieve_components(gene2graph_all, 'REF')

    # write output; text mode because the lines are str, not bytes
    with open(opts['output'], 'w') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')
    logger.info('Finished Successfully!!!')
def main(opts):
    """Build graphs of significant residues per tumor type and write them out.

    Rows of the multiple testing file with a q-value at or below
    ``opts['q_value']`` define the significant residues of each tumor type.
    For every tumor type the matching ``mupit_mutations_<tumor type>`` file
    in ``opts['annotation_dir']`` is read, its rows are grouped by PDB
    structure, and each structure is used to update both the tumor type
    specific graph and a graph shared by all tumor types, which is reported
    under the label ``'REF'``.

    Parameters
    ----------
    opts : dict
        Options. Keys read here: ``'pdb_info'``, ``'multiple_testing'``,
        ``'q_value'``, ``'annotation_dir'``, ``'radius'`` and ``'output'``.

    Notes
    -----
    Output rows are written tab-delimited to ``opts['output']``.
    Structures named in the annotation file but absent from the PDB info
    file are skipped, as are structures that could not be read.
    """
    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # read in multiple testing file
    mtc = read_delim(opts['multiple_testing'])
    header = mtc.pop(0)
    ttype_ix = header.index('Tumor Type')
    qval_ix = header.index('q-value')
    gene_ix = header.index('HUGO Symbol')
    tx_ix = header.index('Sequence Ontology Transcript')
    res_ix = header.index('CRAVAT Res')

    # iterate through each tumor type
    output = []
    gene2graph_all = {}  # graphs for combined tumor types
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))

        # graph for an individual tumor type, initially empty
        gene2graph = {}

        # get the significant residues for the tumor type
        mtc_ttype = [m for m in mtc
                     if m[ttype_ix] == ttype
                     and float(m[qval_ix]) <= opts['q_value']]
        significant_res = {(m[gene_ix], m[tx_ix], int(m[res_ix]))
                           for m in mtc_ttype}

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        annotation, col_pos = read_mupit_file(annotation_file,
                                              significant_res)
        pdb_ix = col_pos['pdb']
        anot_gene_ix = col_pos['gene']
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']
        anot_chain_ix = col_pos['chain']
        anot_pdb_res_ix = col_pos['pdb_res']

        # groupby only merges adjacent rows, so sort by structure first
        annotation.sort(key=lambda x: x[pdb_ix])
        for pdb_id, grp in it.groupby(annotation, lambda x: x[pdb_ix]):
            # fringe case: annotated structure missing from the PDB info
            if pdb_id not in pdb_info:
                print('skipping ' + pdb_id)
                continue

            # copy so that popping 'path' leaves pdb_info untouched for the
            # next tumor type that references the same structure
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            struct_chains = []
            for d in struct_info:
                struct_chains.extend(struct_info[d])

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            cog = pstruct.calc_center_of_geometry(struct, struct_chains)

            # map (chain, pdb residue number) -> (gene, transcript, residue)
            signif_struct_info = {}
            for s in grp:
                try:
                    tmp_pos = (s[anot_chain_ix], int(s[anot_pdb_res_ix]))
                except (ValueError, TypeError, IndexError):
                    # residue number is not an integer (or the row is
                    # truncated); such rows cannot be placed on the structure
                    print('int error')
                    continue
                signif_struct_info[tmp_pos] = (s[anot_gene_ix],
                                               s[anot_tx_ix],
                                               s[anot_res_ix])

            # update the graph to reflect info from the current structure
            gene2graph = update_graph(gene2graph, cog, signif_struct_info,
                                      struct, opts['radius'])

            # update graph for the combined cross-tumor type regions
            gene2graph_all = update_graph(gene2graph_all, cog,
                                          signif_struct_info, struct,
                                          opts['radius'])

        # format the results into the output list
        output += retrieve_components(gene2graph, ttype)
        logger.info('Finished {0}'.format(ttype))

    # update output to contain cross-tumor type reference regions
    output += retrieve_components(gene2graph_all, 'REF')

    # write output; text mode because the lines are str, not bytes
    with open(opts['output'], 'w') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')
    logger.info('Finished Successfully!!!')
def main(opts):
    """Compute mutation density p-values for residues of every PDB structure.

    For each structure listed in the annotation file, the mutations mapped
    to it are stratified by tumor type. For every requested tumor type the
    observed mutation density of each mutated residue is compared against a
    simulated null distribution, and one output row per (structure, tumor
    type) is written as tab-delimited text to ``opts['output']``.

    Parameters
    ----------
    opts : dict
        Options. Keys read here: ``'annotation'``, ``'mutations'``,
        ``'log_level'``, ``'radius'``, ``'tumor_type'`` (a tumor type name
        or ``'EVERY'``), ``'num_simulations'``, ``'seed'``,
        ``'stop_criterion'``, ``'output'`` and ``'error_pdb'``.

    Notes
    -----
    Structures for which ``utils.read_structure`` returns ``None`` are
    skipped and, if ``opts['error_pdb']`` is set, their IDs are written to
    that file one per line.
    """
    # read in data
    logger.info('Reading in annotations . . .')
    pdb_info = utils.read_pdb_info(opts['annotation'])
    logger.info('Finished reading in annotations.')
    logger.info('Reading in mutations . . .')
    mutations = utils.read_mutations(opts['mutations'])
    logger.info('Finished reading in mutations.')

    # iterate over each structure
    logger.info('Running of PDB structures . . .')
    output = []
    error_pdb_structs = []  # IDs of structures that could not be read
    quiet = opts['log_level'] != "DEBUG"  # flag indicating pdb warnings
    for structure_id in pdb_info:
        print(structure_id)

        # get pdb info; copy so popping 'path' does not alter pdb_info
        struct_info = pdb_info[structure_id].copy()
        pdb_path = struct_info.pop('path')

        # read in structure
        structure = utils.read_structure(pdb_path, structure_id, quiet=quiet)
        if structure is None:
            # remember the failure so it can be reported via opts['error_pdb']
            error_pdb_structs.append(structure_id)
            continue

        # make a list of all chain letters in structure
        struct_chains = []
        for chains in struct_info.values():
            struct_chains.extend(chains)

        # skip structure if no mutations
        structure_mutations = mutations.get(structure_id, [])
        if not structure_mutations:
            continue

        # separate out mutation info
        ttypes, mres, mcount, mchains = zip(*structure_mutations)

        # stratify mutations by their tumor type: ttype_ixs maps each
        # tumor type to the list of indices of its mutations
        unique_ttypes = list(set(ttypes))
        ttype_ixs = {
            t: [i for i, mut_ttype in enumerate(ttypes) if mut_ttype == t]
            for t in unique_ttypes
        }

        # obtain relevant info from structure
        (mut_res_centers_of_geometry,
         mut_res_mutation_counts,
         all_res_centers_of_geometry,
         models) = get_structure_info(structure, mchains, mres, mcount,
                                      struct_chains, ttype_ixs)
        if not all_res_centers_of_geometry:
            logger.error('No available center of geometries for {0}'.format(
                structure_id))
            continue

        # get neighbours for all residues
        neighbors = find_neighbors(all_res_centers_of_geometry,
                                   opts['radius'])

        # iterate through each tumour type
        for tumour in unique_ttypes:
            # skip tumor types if not one specified
            if opts['tumor_type'] not in (tumour, 'EVERY'):
                continue

            # draw information for the specific tumour type
            t_mut_res_mutation_counts = mut_res_mutation_counts[tumour]
            mut_density = src.mutations.mutation_density(
                t_mut_res_mutation_counts, neighbors)
            max_obs_dens = max(mut_density.values()) if mut_density else 0

            # count total mutations in structure while avoiding double
            # counting due to the same chain being on multiple models:
            # a residue contributes nothing once an earlier key had the same
            # chain (k[2]) but a different model (k[1])
            seen_model_chain = []
            total_mutations = 0
            for k in t_mut_res_mutation_counts:
                already_counted = any(
                    k[1] != prev_model and k[2] == prev_chain
                    for prev_model, prev_chain in seen_model_chain)
                if not already_counted:
                    total_mutations += t_mut_res_mutation_counts[k]
                seen_model_chain.append((k[1], k[2]))

            # generate empirical null distribution
            sim_null_dist = sim.generate_null_dist(
                structure_id, models, struct_info,
                all_res_centers_of_geometry, total_mutations,
                opts['num_simulations'], opts['seed'], neighbors,
                opts['stop_criterion'], max_obs_dens)

            # get a list of lists format for compute p values function
            mut_list = [[res_id, mut_density[res_id]]
                        for res_id in mut_density]
            if not t_mut_res_mutation_counts:
                print("here")

            # compute p-values for observed densities; the second return
            # value is not needed here
            obs_pvals, _ = sim.compute_pvals(mut_list, sim_null_dist)

            # residue keys are indexed as [1] model, [2] chain and
            # [3][1] residue number, matching the header columns below
            output.append([
                structure_id,
                tumour,
                ','.join([str(o[0][1]) for o in mut_list]),
                ','.join([str(o[0][2]) for o in mut_list]),
                ','.join([str(o[0][3][1]) for o in mut_list]),
                ','.join([str(t_mut_res_mutation_counts[o[0]])
                          for o in mut_list]),
                ','.join([str(o[1]) for o in mut_list]),
                ','.join(map(str, obs_pvals)),
            ])

    # write output to file
    output = [[
        'Structure', 'Tumor Type', 'Model', 'Chain', 'Mutation Residues',
        'Residue Mutation Count', 'Mutation Density', 'Hotspot P-value',
    ]] + output
    with open(opts['output'], 'w') as handle:
        csv.writer(handle, delimiter='\t',
                   lineterminator='\n').writerows(output)

    # if user specified to log failed reading of pdbs
    if opts['error_pdb'] and error_pdb_structs:
        with open(opts['error_pdb'], 'w') as handle:
            for bad_pdb in error_pdb_structs:
                handle.write(bad_pdb + '\n')

    print("NUM_MODEL_DIFF: " + str(sim.NUM_MODEL_DIFF))
    print("NUM_CHAIN_DIFF: " + str(sim.NUM_CHAIN_DIFF))
    print("STRUCT_MODEL_DIFF: " + str(sim.STRUCT_MODEL_DIFF))
    print("STRUCT_CHAIN_DIFF: " + str(sim.STRUCT_CHAIN_DIFF))
    logger.info('Finished successfully!')
def main(opts):
    """Group significant hotspot residues into regions, one structure at a time.

    Residue-level hotspot results are read from ``opts['input']`` and, for
    every tumor type that has a p-value threshold, rows with a p-value at or
    below that threshold are treated as significant. The tumor type's
    ``mupit_mutations_<tumor type>`` file is then grouped by PDB structure;
    each structure gets its own graph, built from its significant and
    non-significant residues, and the resulting rows are written
    tab-delimited to ``opts['output']``.

    Parameters
    ----------
    opts : dict
        Options. Keys read here: ``'pdb_info'``, ``'input'``,
        ``'significance'``, ``'annotation_dir'``, ``'radius'`` and
        ``'output'``.

    Notes
    -----
    Tumor types without a threshold are skipped entirely. Structures named
    in the annotation file but absent from the PDB info file are skipped,
    as are structures that could not be read.
    """
    # read in the PDB info file
    pdb_info = utils.read_pdb_info(opts['pdb_info'])

    # use external module to separate out the residues in the hotspot.py
    # output onto separate lines
    mtc = read_residue_info(opts['input'])
    pval_thresholds = read_thresholds(opts['significance'])

    header = mtc.pop(0)
    ttype_ix = header.index('Tumor Type')
    struct_ix = header.index('Structure')
    # not used below; the lookup is kept so that a header without a
    # 'Model' column still fails here
    model_ix = header.index('Model')
    chain_ix = header.index('Chain')
    res_ix = header.index('Mutation Residues')
    pval_ix = header.index('Hotspot P-value')

    # iterate through each tumor type
    output = []
    uniq_ttypes = set(m[ttype_ix] for m in mtc)
    for ttype in uniq_ttypes:
        logger.info('Working on {0} . . .'.format(ttype))

        # if there is no pval threshold, nothing is significant
        if ttype not in pval_thresholds:
            continue

        # get the significant residues for the tumor type
        mtc_ttype = [m for m in mtc
                     if m[ttype_ix] == ttype
                     and float(m[pval_ix]) <= pval_thresholds[ttype]]
        significant_res = [(m[struct_ix], m[chain_ix], int(m[res_ix]))
                           for m in mtc_ttype]

        # read annotation file
        annotation_file = os.path.join(opts['annotation_dir'],
                                       'mupit_mutations_' + ttype)
        all_annotation, col_pos = read_mupit_file(annotation_file,
                                                  significant_res)
        # set copy for constant-time membership tests in the row loop;
        # built after the call above so it reflects the list as used there
        significant_lookup = set(significant_res)

        pdb_ix = col_pos['pdb']
        anot_gene_ix = col_pos['gene']  # not used below
        anot_tx_ix = col_pos['tx']
        anot_res_ix = col_pos['res']
        anot_chain_ix = col_pos['chain']
        anot_pdb_res_ix = col_pos['pdb_res']

        # groupby only merges adjacent rows, so sort by structure first
        all_annotation.sort(key=lambda x: x[pdb_ix])
        for pdb_id, grp in it.groupby(all_annotation, lambda x: x[pdb_ix]):
            # fringe case: annotated structure missing from the PDB info
            if pdb_id not in pdb_info:
                print('skipping ' + pdb_id)
                continue

            # initialize the graph to empty
            struct2graph = {}

            # copy so that popping 'path' leaves pdb_info untouched for the
            # next tumor type that references the same structure
            struct_info = pdb_info[pdb_id].copy()
            pdb_path = struct_info.pop('path')
            struct_chains = []
            for d in struct_info:
                struct_chains.extend(struct_info[d])

            struct = utils.read_structure(pdb_path, pdb_id)
            if struct is None:
                continue  # skip if pdb file not found

            # calculate the centers of geometry
            all_cogs = pstruct.calc_center_of_geometry(struct, struct_chains)

            # split this structure's residues into significant and
            # non-significant, keyed by (chain, pdb residue number)
            signif_struct_info = {}
            non_signif_struct_info = {}
            for s in grp:
                try:
                    tmp_pos = (s[anot_chain_ix], int(s[anot_pdb_res_ix]))
                except (ValueError, TypeError, IndexError):
                    # residue number is not an integer (or the row is
                    # truncated); such rows cannot be placed on the structure
                    continue
                res_info = (s[pdb_ix], s[anot_tx_ix], int(s[anot_res_ix]))
                if (s[pdb_ix], tmp_pos[0], tmp_pos[1]) in significant_lookup:
                    signif_struct_info[tmp_pos] = res_info
                else:
                    non_signif_struct_info[tmp_pos] = res_info

            # update the graph to reflect info from the current structure
            struct2graph, signif_res_neighbours = update_graph(
                struct2graph, all_cogs, signif_struct_info,
                non_signif_struct_info, struct, opts['radius'])

            # format the results into the output list
            output += retrieve_components(struct2graph, ttype, all_cogs,
                                          opts['radius'],
                                          signif_res_neighbours)

        logger.info('Finished {0}'.format(ttype))

    # write output; text mode because the lines are str, not bytes
    with open(opts['output'], 'w') as handle:
        for line in output:
            handle.write('\t'.join(line) + '\n')
    logger.info('Finished Successfully!!!')
def main(opts):
    """Compute mutation density p-values for residues of every PDB structure.

    For each structure listed in the annotation file, the mutations mapped
    to it are stratified by tumor type. For every requested tumor type the
    observed mutation density of each mutated residue is compared against a
    simulated null distribution, and one output row per (structure, tumor
    type) is written as tab-delimited text to ``opts['output']``.

    Parameters
    ----------
    opts : dict
        Options. Keys read here: ``'annotation'``, ``'mutations'``,
        ``'log_level'``, ``'radius'``, ``'tumor_type'`` (a tumor type name
        or ``'EVERY'``), ``'num_simulations'``, ``'seed'``,
        ``'stop_criterion'``, ``'output'`` and ``'error_pdb'``.

    Notes
    -----
    Structures for which ``utils.read_structure`` returns ``None`` are
    skipped and, if ``opts['error_pdb']`` is set, their IDs are written to
    that file one per line.
    """
    # read in data
    logger.info('Reading in annotations . . .')
    pdb_info = utils.read_pdb_info(opts['annotation'])
    logger.info('Finished reading in annotations.')
    logger.info('Reading in mutations . . .')
    mutations = utils.read_mutations(opts['mutations'])
    logger.info('Finished reading in mutations.')

    # iterate over each structure
    logger.info('Running of PDB structures . . .')
    output = []
    error_pdb_structs = []  # IDs of structures that could not be read
    quiet = opts['log_level'] != "DEBUG"  # flag indicating pdb warnings
    for structure_id in pdb_info:
        print(structure_id)

        # get pdb info; copy so popping 'path' does not alter pdb_info
        struct_info = pdb_info[structure_id].copy()
        pdb_path = struct_info.pop('path')

        # read in structure
        structure = utils.read_structure(pdb_path, structure_id, quiet=quiet)
        if structure is None:
            # remember the failure so it can be reported via opts['error_pdb']
            error_pdb_structs.append(structure_id)
            continue

        # make a list of all chain letters in structure
        struct_chains = []
        for chains in struct_info.values():
            struct_chains.extend(chains)

        # skip structure if no mutations
        structure_mutations = mutations.get(structure_id, [])
        if not structure_mutations:
            continue

        # separate out mutation info
        ttypes, mres, mcount, mchains = zip(*structure_mutations)

        # stratify mutations by their tumor type: ttype_ixs maps each
        # tumor type to the list of indices of its mutations
        unique_ttypes = list(set(ttypes))
        ttype_ixs = {
            t: [i for i, mut_ttype in enumerate(ttypes) if mut_ttype == t]
            for t in unique_ttypes
        }

        # obtain relevant info from structure
        (mut_res_centers_of_geometry,
         mut_res_mutation_counts,
         all_res_centers_of_geometry,
         models) = get_structure_info(structure, mchains, mres, mcount,
                                      struct_chains, ttype_ixs)
        if not all_res_centers_of_geometry:
            logger.error('No available center of geometries for {0}'.format(
                structure_id))
            continue

        # get neighbours for all residues
        neighbors = find_neighbors(all_res_centers_of_geometry,
                                   opts['radius'])

        # iterate through each tumour type
        for tumour in unique_ttypes:
            # skip tumor types if not one specified
            if opts['tumor_type'] not in (tumour, 'EVERY'):
                continue

            # draw information for the specific tumour type
            t_mut_res_mutation_counts = mut_res_mutation_counts[tumour]
            mut_density = src.mutations.mutation_density(
                t_mut_res_mutation_counts, neighbors)
            max_obs_dens = max(mut_density.values()) if mut_density else 0

            # count total mutations in structure while avoiding double
            # counting due to the same chain being on multiple models:
            # a residue contributes nothing once an earlier key had the same
            # chain (k[2]) but a different model (k[1])
            seen_model_chain = []
            total_mutations = 0
            for k in t_mut_res_mutation_counts:
                already_counted = any(
                    k[1] != prev_model and k[2] == prev_chain
                    for prev_model, prev_chain in seen_model_chain)
                if not already_counted:
                    total_mutations += t_mut_res_mutation_counts[k]
                seen_model_chain.append((k[1], k[2]))

            # generate empirical null distribution
            sim_null_dist = sim.generate_null_dist(
                structure_id, models, struct_info,
                all_res_centers_of_geometry, total_mutations,
                opts['num_simulations'], opts['seed'], neighbors,
                opts['stop_criterion'], max_obs_dens)

            # get a list of lists format for compute p values function
            mut_list = [[res_id, mut_density[res_id]]
                        for res_id in mut_density]
            if not t_mut_res_mutation_counts:
                print("here")

            # compute p-values for observed densities; the second return
            # value is not needed here
            obs_pvals, _ = sim.compute_pvals(mut_list, sim_null_dist)

            # residue keys are indexed as [1] model, [2] chain and
            # [3][1] residue number, matching the header columns below
            output.append([
                structure_id,
                tumour,
                ','.join([str(o[0][1]) for o in mut_list]),
                ','.join([str(o[0][2]) for o in mut_list]),
                ','.join([str(o[0][3][1]) for o in mut_list]),
                ','.join([str(t_mut_res_mutation_counts[o[0]])
                          for o in mut_list]),
                ','.join([str(o[1]) for o in mut_list]),
                ','.join(map(str, obs_pvals)),
            ])

    # write output to file
    output = [[
        'Structure', 'Tumor Type', 'Model', 'Chain', 'Mutation Residues',
        'Residue Mutation Count', 'Mutation Density', 'Hotspot P-value',
    ]] + output
    with open(opts['output'], 'w') as handle:
        csv.writer(handle, delimiter='\t',
                   lineterminator='\n').writerows(output)

    # if user specified to log failed reading of pdbs
    if opts['error_pdb'] and error_pdb_structs:
        with open(opts['error_pdb'], 'w') as handle:
            for bad_pdb in error_pdb_structs:
                handle.write(bad_pdb + '\n')

    print("NUM_MODEL_DIFF: " + str(sim.NUM_MODEL_DIFF))
    print("NUM_CHAIN_DIFF: " + str(sim.NUM_CHAIN_DIFF))
    print("STRUCT_MODEL_DIFF: " + str(sim.STRUCT_MODEL_DIFF))
    print("STRUCT_CHAIN_DIFF: " + str(sim.STRUCT_CHAIN_DIFF))
    logger.info('Finished successfully!')