def extract_snvs(fanns_db, data_path, logger=None):
    logger = logger or logging.getLogger("perf-cosmic")

    snvs = dict()

    logger.info("Reading mutations ...")

    progress = RatedProgress(logger, name="mutations")

    with tsv.open(data_path, "r") as df:
        columns = [
            "Genome-wide screen",
            "Mutation Description",
            "Mutation CDS",
            "Mutation AA",
            "Mutation GRCh37 genome position",
            "Mutation GRCh37 strand",
            "Accession Number",
            "ID_sample"]

        total_rows = queried_rows = dbfound_rows = 0

        for fields in tsv.rows(df, columns=columns, header=True):
            total_rows += 1

            wide_screen, mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields

            # Only missense substitutions are considered
            if mut_desc != "Substitution - Missense":
                continue

            queried_rows += 1

            for row in get_transcripts(fanns_db, mut_cds, mut_aa, mut_pos, mut_strand, acc, logger):
                dbfound_rows += 1

                key = tuple([row[c] for c in ["protein", "aa_pos", "aa_ref", "aa_alt"]])
                if key not in snvs:
                    snvs[key] = snv = dict(
                        transcript=row["transcript"],
                        symbol=row["xrefs"]["symbol"],
                        msamples=set(),
                        wsamples=set())
                else:
                    snv = snvs[key]

                # Samples are accumulated separately for genome-wide screens
                if wide_screen == "y":
                    snv["wsamples"].add(sample_id)
                else:
                    snv["msamples"].add(sample_id)

            progress.update()

    progress.log_totals()

    logger.info("Counting the number of samples per mutation ...")

    for data in snvs.itervalues():
        data["msamples"] = len(data["msamples"])
        data["wsamples"] = len(data["wsamples"])

    logger.info("Total: total_rows={}, queried_rows={}, found_rows={}, protein_changes={}".format(
        total_rows, queried_rows, dbfound_rows, len(snvs)))

    return snvs
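# A minimal sketch of how the mapping returned by extract_snvs is shaped and
# consumed (illustrative values only; the COSMIC file path is hypothetical):
#
#   snvs = extract_snvs(fanns_db, "CosmicMutantExport.tsv")
#   for (protein, aa_pos, aa_ref, aa_alt), snv in snvs.items():
#       # snv["msamples"] / snv["wsamples"] are sample counts after the final
#       # loop above converts the sample-id sets to their lengths.
#       print protein, aa_pos, aa_ref, aa_alt, snv["transcript"], snv["symbol"], snv["msamples"], snv["wsamples"]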
def main():
    parser = argparse.ArgumentParser(
        description="Export SNV's")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()

    parser.add_argument("dest_path", metavar="DEST",
                        help="The destination file. Use - for standard output.")

    args, logger = cmd.parse_args("export-snvs")

    db = cmd.open_db()

    logger.info("Exporting SNV's ...")

    total_count = 0
    total_start_time = time.time()

    try:
        progress = RatedProgress(logger, name="SNVs")

        rows_count = 0
        with tsv.open(args.dest_path, "w") as f:
            for snv in db.snvs():
                rows_count += 1
                tsv.write_line(f, snv["chr"], snv["start"], snv["start"], snv["strand"],
                               "{}>{}".format(snv["ref"], snv["alt"]), "S")
                progress.update()

        logger.info("Finished. Total rows = {}, elapsed_time = {}".format(rows_count, progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def main():
    parser = argparse.ArgumentParser(description="Map score values")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()
    cmd.add_transform_args()

    parser.add_argument(
        "--skip-empty-scores", dest="skip_empty_scores", action="store_true", default=False,
        help="Skip transformation for empty scores")

    args, logger = cmd.parse_args("scores-transform")

    db = cmd.open_db()

    try:
        transforms = cmd.get_transforms()
        predictors = transforms.keys()

        logger.info("Transforming scores ...")

        progress = RatedProgress(logger, name="SNVs")

        rows_count = updated_count = 0
        for row in db.query_scores(predictors=predictors):
            rows_count += 1

            scores = row["scores"]
            upd_scores = {}

            for predictor in transforms:
                score = scores[predictor]
                if args.skip_empty_scores and score is None:
                    continue

                prev_score = score
                for name, func in transforms[predictor]:
                    try:
                        score = func(score)
                    except:
                        raise Exception("Error transforming the {} score {} with {}".format(predictor, score, name))

                if prev_score != score:
                    upd_scores[predictor] = score

            if len(upd_scores) > 0:
                db.update_scores(row["id"], upd_scores)
                updated_count += 1

            progress.update()

        progress.log_totals()

        logger.info("Commit ...")
        db.commit()

        logger.info("Finished. Total rows = {}, updated rows = {}, elapsed time = {}".format(
            rows_count, updated_count, progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def create_datasets(snvs, cgc_path, tdrivers_path, pdrivers_path, output_prefix, logger=None):
    logger = logger or logging.getLogger("perf-cosmic")

    prefix = output_prefix or "cosmic-"

    logger.info("Loading CGC genes ...")
    cgc_genes = set()
    with open(cgc_path, "r") as f:
        for line in f:
            cgc_genes.add(line.rstrip("\n"))

    logger.info("Loading TD drivers ...")
    tdrivers = set()
    with open(tdrivers_path, "r") as f:
        for line in f:
            tdrivers.add(line.rstrip("\n").split("\t")[0])

    logger.info("Loading PD drivers ...")
    pdrivers = set()
    with open(pdrivers_path, "r") as f:
        for line in f:
            pdrivers.add(line.rstrip("\n").split("\t")[0])

    logger.info("Creating datasets ...")

    progress = RatedProgress(logger, name="mutations")

    with Dataset(prefix + "1") as rec1,\
         Dataset(prefix + "2") as rec2,\
         Dataset(prefix + "4") as rec4,\
         Dataset(prefix + "CGC") as cgc,\
         Dataset(prefix + "noCGC") as nocgc,\
         Dataset(prefix + "TD") as td,\
         Dataset(prefix + "noTD") as notd,\
         Dataset(prefix + "PD") as pd,\
         Dataset(prefix + "noPD") as nopd:

        for (protein, aa_pos, aa_ref, aa_alt), snv in snvs.items():
            num_samples = len(snv["samples"])

            line = "\t".join([str(v) for v in [protein, aa_pos, aa_ref, aa_alt]])

            symbol = snv["symbol"] or ""
            if isinstance(symbol, basestring):
                symbol = set([symbol])
            elif isinstance(symbol, list):
                symbol = set(symbol)

            # Recurrence datasets
            if num_samples == 1:
                rec1.write(line)
            if num_samples >= 2:
                rec2.write(line)
            if num_samples >= 4:
                rec4.write(line)

            # Driver gene datasets and their non-recurrent complements
            if len(symbol & cgc_genes) > 0:
                cgc.write(line)
            elif num_samples == 1:
                nocgc.write(line)

            if len(symbol & tdrivers) > 0:
                td.write(line)
            elif num_samples == 1:
                notd.write(line)

            if len(symbol & pdrivers) > 0:
                pd.write(line)
            elif num_samples == 1:
                nopd.write(line)

            progress.update()

    progress.log_totals()

    logger.info("Datasets: {}".format(", ".join(["{}={}".format(os.path.basename(d.name), d.size) for d in [
        rec1, rec2, rec4, cgc, nocgc, td, notd, pd, nopd]])))
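# A minimal usage sketch (the gene-list paths below are hypothetical). Note
# that create_datasets reads snv["samples"] as a set of sample ids, i.e. the
# structure built inline in the perf-cosmic main() further down, so an snvs
# mapping would look like:
#
#   snvs = {
#       ("ENSP00000269305", 175, "R", "H"): dict(
#           transcript="ENST00000269305", symbol="TP53",
#           samples=set(["S1", "S2"])),
#   }
#   create_datasets(snvs, "cgc_genes.txt", "td_drivers.tsv", "pd_drivers.tsv", "cosmic-")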
def main():
    parser = argparse.ArgumentParser(
        description="Fetch Condel scores")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()

    parser.add_argument("muts_path", metavar="SNVS_PATH",
                        help="SNV's to check. Use - for standard input.")

    parser.add_argument("out_path", metavar="OUTPUT_PATH",
                        help="The results path. Use - for standard output.")

    cmd.add_selected_predictors_args()
    cmd.add_selected_annotations_args()
    cmd.add_selected_columns_args()

    args, logger = cmd.parse_args("fetch")

    db = cmd.open_db()

    predictors = cmd.get_selected_predictors()
    annotations = cmd.get_selected_annotations()
    columns = cmd.get_selected_columns()

    logger.info("Reading {} ...".format(args.muts_path if args.muts_path != "-" else "from standard input"))

    try:
        progress = RatedProgress(logger, name="SNVs")

        with tsv.open(args.muts_path) as f:
            with tsv.open(args.out_path, "w") as wf:
                tsv.write_line(wf, "ID", *[c.upper() for c in columns]
                               + [a.upper() for a in annotations] + predictors)

                hit = fail = 0

                mut = DnaAndProtMutationParser()
                for line_num, line in enumerate(f, start=1):
                    line = line.rstrip(" \n\r")
                    if len(line) == 0 or line.startswith("#"):
                        continue

                    try:
                        mut.parse(line)
                    except PrematureEnd:
                        logger.error("Missing fields at line {}".format(line_num))
                        fail += 1
                        continue
                    except UnexpectedToken as ex:
                        logger.error("Unexpected field '{}' at line {}".format(ex.args[0], line_num))
                        fail += 1
                        continue

                    exists = False
                    for row in query_mutation(logger, db, mut, annotations, predictors):
                        exists = True

                        ann = row["annotations"]
                        scores = row["scores"]

                        tsv.write_line(wf, mut.identifier, *[row[c] for c in columns]
                                       + [ann[a] for a in annotations] + [scores[p] for p in predictors])

                        """
                        if logger.isEnabledFor(logging.DEBUG):
                            logger.debug("  --> {} {} {} {} {} {} {} {} {} {}".format(
                                row["chr"], row["start"], row["ref"], row["alt"],
                                row["transcript"], row["protein"], row["aa_pos"], row["aa_ref"], row["aa_alt"],
                                mut.identifier or "*"))
                        """

                    progress.update()

                    if exists:
                        hit += 1
                    else:
                        fail += 1

        progress.log_totals()

        logger.info("Finished. total={}, hits={}, fails={}, elapsed={}".format(
            hit + fail, hit, fail, progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()
def main():
    parser = argparse.ArgumentParser(
        description="Update scores in the database")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()

    parser.add_argument("source_path", metavar="SOURCE",
                        help="The source file. Use - for standard input.")

    cmd.add_selected_predictors_args()

    parser.add_argument("--update-predictors", dest="update_predictors", action="store_true", default=False,
                        help="Update the predictors.")

    parser.add_argument("--ignore-errors", dest="ignore_errors", action="store_true", default=False,
                        help="When there are errors in the input file, report them but continue processing.")

    args, logger = cmd.parse_args("update")

    db = cmd.open_db()

    predictors = cmd.get_selected_predictors(check_missing=False)

    try:
        progress = RatedProgress(logger, name="SNVs")

        total_lines = 0

        logger.info("Reading {} ...".format(args.source_path if args.source_path != "-" else "from standard input"))

        with tsv.open(args.source_path) as f:
            # Parse header
            hdr_line = f.readline()
            hdr = dict([(name, index) for index, name in enumerate(hdr_line.rstrip("\n").split("\t"))])

            db_predictors = set([p["id"] for p in db.predictors()])

            if len(predictors) == 0:
                predictors = [name for name in hdr if name in db_predictors]
                if len(predictors) == 0:
                    raise Exception("No input file header column matches the available predictors in the database. Please specify them using -p.")

            logger.info("Predictors: {}".format(", ".join(predictors)))

            for predictor in filter(lambda p: p not in db_predictors, predictors):
                logger.info("Creating predictor {} ...".format(predictor))
                db.add_predictor(predictor, FannsDb.SOURCE_PREDICTOR_TYPE)

            use_genome_coords = len(set(hdr.keys()) & set(GENOME_COORD_COLUMNS)) > 0
            use_protein_coords = len(set(hdr.keys()) & set(PROTEIN_COORD_COLUMNS)) > 0

            if not use_genome_coords and not use_protein_coords:
                raise Exception("No coordinate columns found. "
                                "Use {} for genomic coordinates or {} for protein coordinates.".format(
                                    GENOME_COORD_COLUMNS, PROTEIN_COORD_COLUMNS))
            elif use_genome_coords and use_protein_coords:
                logger.warn("Both genomic and protein coordinate columns found. Using genomic coordinates by default.")

            if use_genome_coords:
                coord_column_names = [n for n in hdr if n in set(GENOME_COORD_COLUMNS)]
                coord_column_types = [GENOME_COORD_COLUMN_TYPE[n] for n in coord_column_names]
                #get_rows = db.get_transcripts_by_dna
            elif use_protein_coords:
                coord_column_names = [n for n in hdr if n in set(PROTEIN_COORD_COLUMNS)]
                coord_column_types = [PROTEIN_COORD_COLUMN_TYPE[n] for n in coord_column_names]
                #get_rows = db.get_transcripts_by_protein

            coord_column_indices = [hdr[n] for n in coord_column_names]
            score_indices = [hdr[n] for n in predictors]
            max_column_index = max(coord_column_indices + score_indices)

            for line_num, line in enumerate(f, start=2):
                fields = line.rstrip("\n").split("\t")

                if len(fields) <= max_column_index:
                    logger.error("Missing columns for line {}: {}".format(line_num, " ".join(fields)))
                    if not args.ignore_errors:
                        raise Exception("Missing columns for line {}".format(line_num))
                    continue

                try:
                    coords = dict([(name.lower(), type_cast(fields[index])) for name, type_cast, index in zip(
                        coord_column_names, coord_column_types, coord_column_indices)])

                    scores = dict([(p, score_value(fields[i])) for p, i in zip(predictors, score_indices)])
                except Exception as ex:
                    logger.error("{} at line {}: {}".format(str(ex), line_num, " ".join(fields)))
                    if not args.ignore_errors:
                        raise
                    continue

                try:
                    for row in db.query_scores(fields=[], **coords):
                        db.update_scores(row["id"], scores)
                except Exception as ex:
                    logger.error("Error updating SNV at line {}: {}".format(line_num, str(ex)))
                    logger.error("SNV: {}".format(", ".join(["{}={}".format(k, v) for k, v in coords.items()])))
                    if not args.ignore_errors:
                        raise

                progress.update()

        progress.log_totals()

        logger.info("Finalizing database ...")

        if args.update_predictors:
            logger.info("Updating predictors ...")
            db.update_predictors()

        logger.info("Committing ...")
        db.commit()

        logger.info("Finished successfully. Elapsed time: {}".format(progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def main():
    parser = argparse.ArgumentParser(
        description="Filter for the longest transcript")

    cmd = DefaultCommandHelper(parser)

    parser.add_argument("len_path", metavar="PATH",
                        help="The tsv file containing the transcript lengths")

    parser.add_argument("data_path", metavar="PATH",
                        help="The data file")

    parser.add_argument("out_path", metavar="PATH",
                        help="Output file. Use - for standard output.")

    parser.add_argument("-k", "--key", dest="key", metavar="KEY", default="PROTEIN,AA_POS,AA_REF,AA_ALT",
                        help="List of columns that form the key. Default: PROTEIN,AA_POS,AA_REF,AA_ALT")

    args, logger = cmd.parse_args("filter-transcript")

    try:
        logger.info("Loading transcripts length ...")

        trslen = defaultdict(int)
        with tsv.open(args.len_path) as f:
            for name, length in tsv.rows(f):
                trslen[name] = int(length)

        logger.info("Filtering {} ...".format(os.path.basename(args.data_path)))

        total_count = filter_count = 0

        progress = RatedProgress(logger, name="mutations")

        key_columns = args.key.split(",")
        with tsv.open(args.data_path, "r") as df, tsv.open(args.out_path, "w") as of:
            hdr_line = df.readline()
            of.write(hdr_line)
            _, hdr = tsv.header_from_line(hdr_line)
            key_indices = [hdr[name] for name in key_columns]
            trs_index = hdr["TRANSCRIPT"]

            last_key = None
            longest = (0, "")

            for line in df:
                total_count += 1

                fields = line.rstrip("\n").split("\t")
                key = tuple([fields[index] for index in key_indices])
                trs = fields[trs_index]

                tl = trslen[trs]

                if last_key != key:
                    # Flush the longest transcript of the previous key
                    if last_key is not None:
                        of.write(longest[1])
                        filter_count += 1
                    longest = (tl, line)
                    last_key = key
                elif tl > longest[0]:
                    longest = (tl, line)

                progress.update()

            # Flush the last key
            if last_key is not None:
                filter_count += 1
                of.write(longest[1])

        progress.log_totals()

        logger.info("Finished. in={}, out={}, filtered={}, elapsed={}".format(
            total_count, filter_count, total_count - filter_count, progress.elapsed_time))
    except:
        cmd.handle_error()

    return 0
def main():
    parser = argparse.ArgumentParser(
        description="Calculate TransFIC labels")

    cmd = Command.withtraits(DbTrait, PredictorsInDbTrait)(parser)

    cmd.add_db_args()

    parser.add_argument("cutoffs_path", metavar="CUTOFFS",
                        help="File containing the cutoffs")

    cmd.add_selected_predictors_args()

    parser.add_argument("-u", "--updated-predictors", dest="updated_predictors", metavar="NAMES",
                        help="Updated predictor names")

    args, logger = cmd.parse_args("calc-label")

    db = cmd.open_db()

    try:
        logger.info("Loading state ...")

        state = load_weights(args.cutoffs_path)

        avail_predictors, stats = [state[k] for k in ["predictors", "stats"]]

        predictors = cmd.get_selected_predictors(default_all=True)

        missing_predictors = [p for p in predictors if p not in set(avail_predictors)]
        if len(missing_predictors) > 0:
            raise Exception("Missing cutoff stats for predictors: {}".format(", ".join(missing_predictors)))

        if args.updated_predictors is not None:
            updated_predictor_names = args.updated_predictors.split(",")
            if len(predictors) != len(updated_predictor_names):
                raise Exception("The number of selected predictors does not match the number of predictor names to update")
            updated_predictors = dict([(p, u) for p, u in zip(predictors, updated_predictor_names)])
        else:
            updated_predictors = dict([(p, "{}_LABEL".format(p)) for p in predictors])

        # create predictors in the database if required
        db_predictors = set([p["id"] for p in db.predictors()])
        for predictor, updated_predictor in updated_predictors.items():
            if updated_predictor not in db_predictors:
                logger.info("Creating predictor {} ...".format(updated_predictor))
                db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=[predictor])

        cutoffs = {}
        for predictor in predictors:
            cutoff_low_mid, cutoff_mid_high = [stats[predictor][v] for v in ["cutoff_low_mid", "cutoff_mid_high"]]
            logger.info("{}: cutoffs: low_mid={}, mid_high={}".format(predictor, cutoff_low_mid, cutoff_mid_high))
            cutoffs[predictor] = (cutoff_low_mid, cutoff_mid_high)

        logger.info("Calculating ...")

        progress = RatedProgress(logger, name="SNVs")

        for num_rows, row in enumerate(db.query_scores(predictors=predictors), start=1):
            scores = row["scores"]

            uscores = {}
            for predictor in predictors:
                score = scores[predictor]
                if score is None:
                    continue

                cutoff_low_mid, cutoff_mid_high = cutoffs[predictor]
                updated_predictor = updated_predictors[predictor]

                # 0.0 = low, 1.0 = mid, 2.0 = high impact label
                uscores[updated_predictor] = 0.0 if score < cutoff_low_mid else 1.0 if score < cutoff_mid_high else 2.0

            if len(uscores) > 0:
                db.update_scores(row["id"], uscores)

            progress.update()

        db.commit()
    except:
        cmd.handle_error()

    return 0
def main():
    parser = argparse.ArgumentParser(
        description="Import scores into the database")

    cmd = Command.withtraits(DbTrait, PredictorsTrait)(parser)

    cmd.add_db_args()

    parser.add_argument("source_path", metavar="SOURCE",
                        help="The source file. Use - for standard input.")

    #TODO: which are the coordinate columns

    cmd.add_selected_predictors_args()

    parser.add_argument("--skip-empty-scores", dest="skip_empty_scores", action="store_true", default=False,
                        help="Skip SNV's where all the scores are empty")

    parser.add_argument("--skip-update-predictors", dest="skip_update_predictors", action="store_true", default=False,
                        help="Skip the update of the predictors.")

    parser.add_argument("--skip-create-index", dest="skip_create_index", action="store_true", default=False,
                        help="Skip the creation of the database indices.")

    parser.add_argument("--ignore-errors", dest="ignore_errors", action="store_true", default=False,
                        help="When there are errors in the input file, report them but continue processing.")

    args, logger = cmd.parse_args("import")

    db = cmd.open_db()

    try:
        progress = RatedProgress(logger, name="SNVs")

        total_lines = 0

        logger.info("Reading {} ...".format(args.source_path if args.source_path != "-" else "from standard input"))

        with tsv.open(args.source_path) as f:
            # Parse header
            hdr_line = f.readline()
            hdr = {}
            for index, name in enumerate(hdr_line.rstrip("\n").split("\t")):
                hdr[name] = index

            # Predictors to update from the user selection and source availability
            db_predictors = set([p["id"] for p in db.predictors()])
            src_predictors = [name for name in hdr if name not in COORD_COLUMNS]
            predictors = cmd.get_selected_predictors(available_predictors=src_predictors)
            for predictor in predictors:
                if predictor not in db_predictors:
                    logger.info("Creating non existing predictor: {}".format(predictor))
                    db.add_predictor(predictor, FannsDb.SOURCE_PREDICTOR_TYPE)

            logger.info("Predictors: {}".format(", ".join(predictors)))

            all_columns = COORD_COLUMNS + predictors
            types = COORD_TYPES + ([score_value] * len(predictors))

            missing_columns = [name for name in all_columns if name not in hdr]
            if len(missing_columns) > 0:
                raise Exception("The following columns are missing: {}".format(", ".join(missing_columns)))

            columns = [hdr[name] for name in all_columns]
            max_column = max(columns)

            for line_num, line in enumerate(f, start=2):
                fields = line.rstrip("\n").split("\t")

                if len(fields) <= max_column:
                    logger.error("Missing columns for line {}: {}".format(line_num, " ".join(fields)))
                    if not args.ignore_errors:
                        raise Exception("Missing columns for line {}".format(line_num))
                    continue

                try:
                    fields = [type_cast(fields[index]) for type_cast, index in zip(types, columns)]
                except Exception as ex:
                    logger.error("{} at line {}: {}".format(str(ex), line_num, " ".join(fields)))
                    if not args.ignore_errors:
                        raise
                    continue

                (chr, strand, start, ref, alt, transcript,
                 aa_pos, aa_ref, aa_alt, protein) = fields[:10]

                scores = fields[10:]

                if args.skip_empty_scores and sum([0 if s is None else 1 for s in scores]) == 0:
                    continue

                try:
                    db.add_snv(
                        chr=chr, strand=strand, start=start, ref=ref, alt=alt,
                        transcript=transcript, protein=protein,
                        aa_pos=aa_pos, aa_ref=aa_ref, aa_alt=aa_alt,
                        scores=dict(zip(predictors, scores)))
                except Exception as ex:
                    logger.error("Error importing SNV at line {}: {}".format(line_num, str(ex)))
                    if not args.ignore_errors:
                        raise

                progress.update()

            total_lines += line_num

        progress.log_totals()

        logger.info("Finalizing database ...")

        if not args.skip_update_predictors:
            logger.info("Updating predictors ...")
            db.update_predictors()

        logger.info("Committing ...")
        db.commit()

        if not args.skip_create_index:
            logger.info("Creating indices ...")
            db.create_indices()

        logger.info("Finished successfully. Elapsed time: {}".format(progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
def main():
    parser = argparse.ArgumentParser(
        description="Export Scores")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()

    parser.add_argument("dest_path", metavar="OUTPUT_PATH",
                        help="The output file. Use - for standard output.")

    cmd.add_selected_predictors_args()
    cmd.add_selected_annotations_args()
    cmd.add_selected_columns_args()

    parser.add_argument("--json", dest="to_json", action="store_true", default=False,
                        help="Export the results in json format")

    parser.add_argument("--sample", dest="sample", type=int, metavar="PCT",
                        help="Export a random sample of PCT %%")

    parser.add_argument("--start", dest="start", type=int, metavar="N",
                        help="Start to export from the SNV number N")

    parser.add_argument("--limit", dest="limit", type=int, metavar="N",
                        help="Limit the number of SNVs to export to N")

    args, logger = cmd.parse_args("export")

    db = cmd.open_db()

    predictors = cmd.get_selected_predictors()
    annotations = cmd.get_selected_annotations()
    columns = cmd.get_selected_columns()

    logger.info("Exporting ...")

    random.seed(time.time())

    total_count = 0
    total_start_time = time.time()

    try:
        progress = RatedProgress(logger, name="SNVs")

        to_json = args.to_json
        sample = args.sample
        start = args.start or 0
        limit = args.limit

        doc = None
        last_pos = None
        rows_count = 0
        snvs_count = 0
        with tsv.open(args.dest_path, "w") as f:
            if not to_json:
                tsv.write_line(f, *[c.upper() for c in columns]
                               + [a.upper() for a in annotations] + predictors)

            for row in db.query_scores(predictors=predictors, maps=annotations):
                if not to_json:
                    if start > 0:
                        start -= 1
                        continue

                    if sample is not None and random.randint(1, 100) > sample:
                        continue

                pos = (row["chr"], row["strand"], row["start"], row["ref"], row["alt"])
                if last_pos != pos:
                    if to_json:
                        if start > 0:
                            start -= 1
                            continue

                        if limit is not None and snvs_count >= limit:
                            if doc is not None:
                                json.dump(doc, f)
                                f.write("\n")
                            break

                    snvs_count += 1

                rows_count += 1

                ann = row["annotations"]
                scores = row["scores"]

                if to_json:
                    tdoc = dict([(k, row[k]) for k in ["transcript", "protein", "aa_pos", "aa_ref", "aa_alt"]]
                                + [(k, scores[k]) for k in predictors])

                    if pos != last_pos:
                        if doc is not None:
                            if sample is None or random.randint(1, 100) <= sample:
                                json.dump(doc, f)
                                f.write("\n")
                            else:
                                snvs_count -= 1

                        doc = dict([(k, row[k]) for k in ["chr", "strand", "start", "ref", "alt"]]
                                   + [("transcripts", [tdoc])])
                    else:
                        doc["transcripts"] += [tdoc]
                else:
                    tsv.write_line(f, *[row[c] for c in columns]
                                   + [ann[a] for a in annotations] + [scores[p] for p in predictors])

                progress.update()

                last_pos = pos

                if not to_json and limit is not None and rows_count >= limit:
                    break

        progress.log_totals()

        logger.info("Finished. Total rows = {}, SNVs = {}, elapsed_time = {}".format(
            rows_count, snvs_count, progress.elapsed_time))
    except:
        return cmd.handle_error()
    finally:
        db.close()

    return 0
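# Shape of one line of the --json export, reconstructed from the document
# building logic above (field values and the predictor name are illustrative):
#
#   {"chr": "17", "strand": "+", "start": 7578406, "ref": "C", "alt": "T",
#    "transcripts": [{"transcript": "ENST00000269305", "protein": "ENSP00000269305",
#                     "aa_pos": 175, "aa_ref": "R", "aa_alt": "H",
#                     "SIFT": 0.01}]}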
def main():
    parser = argparse.ArgumentParser(
        description="Generate datasets needed to evaluate performance from Cosmic mutations")

    cmd = DefaultCommandHelper(parser)

    cmd.add_db_args()

    parser.add_argument("data_path", metavar="PATH",
                        help="The CosmicMutantExport tsv file")

    parser.add_argument("cgc_path", metavar="PATH",
                        help="The list of CGC genes")

    parser.add_argument("drivers_path", metavar="PATH",
                        help="The list of CHASM drivers (drivers.tmps)")

    parser.add_argument("-o", dest="prefix", metavar="PREFIX",
                        help="Output prefix.")

    args, logger = cmd.parse_args("perf-cosmic")

    prefix = args.prefix or "cosmic-"

    fanns_db = cmd.open_db()

    try:
        snvs = dict()

        logger.info("Counting the number of samples per mutation ...")

        with tsv.open(args.data_path, "r") as df:
            columns = [
                #"Genome-wide screen",
                "Mutation Description",
                "Mutation CDS",
                "Mutation AA",
                "Mutation GRCh37 genome position",
                "Mutation GRCh37 strand",
                "Accession Number",
                "ID_sample"]

            total_rows = queried_rows = 0

            for fields in tsv.rows(df, columns=columns, header=True):
                total_rows += 1

                #wide_screen, mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields
                mut_desc, mut_cds, mut_aa, mut_pos, mut_strand, acc, sample_id = fields
                wide_screen = "y"

                if wide_screen != "y" or mut_desc != "Substitution - Missense":
                    continue

                queried_rows += 1
                for row in get_transcripts(fanns_db, mut_cds, mut_aa, mut_pos, mut_strand, acc, logger):
                    key = tuple([row[c] for c in ["protein", "aa_pos", "aa_ref", "aa_alt"]])
                    if key not in snvs:
                        symbol = row["xrefs"]["symbol"]
                        snvs[key] = dict(
                            transcript=row["transcript"],
                            symbol=symbol,
                            samples=set([sample_id]))
                    else:
                        snvs[key]["samples"].add(sample_id)

        logger.info("Total: total_rows={}, queried_rows={}, protein_changes={}".format(
            total_rows, queried_rows, len(snvs)))

        logger.info("Loading CGC genes ...")
        cgc_genes = set()
        with open(args.cgc_path, "r") as f:
            for line in f:
                cgc_genes.add(line.rstrip("\n"))

        logger.info("Loading CHASM drivers ...")
        drivers = set()
        with open(args.drivers_path, "r") as f:
            for line in f:
                drivers.add(line.rstrip("\n").split("\t")[0])

        logger.info("Creating datasets ...")

        progress = RatedProgress(logger, name="mutations")

        with Dataset(prefix + "1") as rec1,\
             Dataset(prefix + "2") as rec2,\
             Dataset(prefix + "4") as rec4,\
             Dataset(prefix + "CGC") as cgc,\
             Dataset(prefix + "noCGC") as nocgc,\
             Dataset(prefix + "D") as drv,\
             Dataset(prefix + "O") as nodrv:

            for (protein, aa_pos, aa_ref, aa_alt), snv in snvs.items():
                num_samples = len(snv["samples"])

                line = "\t".join([str(v) for v in [protein, aa_pos, aa_ref, aa_alt]])

                if num_samples == 1:
                    rec1.write(line)
                if num_samples >= 2:
                    rec2.write(line)
                if num_samples >= 4:
                    rec4.write(line)

                symbol = snv["symbol"]
                if symbol is not None and ((isinstance(symbol, basestring) and symbol in cgc_genes)
                                           or len(set(symbol) & cgc_genes) > 0):
                    cgc.write(line)
                elif num_samples == 1:
                    nocgc.write(line)

                if snv["transcript"] in drivers:
                    drv.write(line)
                elif num_samples == 1:
                    nodrv.write(line)

                progress.update()

        progress.log_totals()

        logger.info("Datasets: {}".format(", ".join(["{}={}".format(os.path.basename(d.name), d.size) for d in [
            rec1, rec2, rec4, cgc, nocgc, drv, nodrv]])))
    except:
        cmd.handle_error()

    return 0
def main():
    parser = argparse.ArgumentParser(description="Calculate TransFIC for the selected scores")

    cmd = Command.withtraits(DbTrait, PredictorsInDbTrait, TransformsTrait)(parser)

    cmd.add_db_args()

    parser.add_argument(
        "feature_name", metavar="FEATURE_COLUMN",
        help="The column name with the features. It can be transcript, protein or any of the available annotations.")

    parser.add_argument("blt_path", metavar="BLT_PATH",
                        help="The baseline tolerance statistics.")

    cmd.add_selected_predictors_args()

    parser.add_argument(
        "-u", "--updated-predictors", dest="updated_predictors", metavar="NAMES",
        help="Updated predictor names")

    cmd.add_transform_args()

    args, logger = cmd.parse_args("calc")

    db = cmd.open_db()

    # initialize feature selection
    db_annotations = [a["id"] for a in db.maps()]
    if args.feature_name not in set(["transcript", "protein"] + db_annotations):
        logger.error("Feature name not available in the database: {}".format(args.feature_name))
        logger.error("Available annotations: {}".format(", ".join(db_annotations)))
        exit(-1)

    if args.feature_name.lower() in ["transcript", "protein"]:
        annotations = None
        feature_getter = lambda row: row[args.feature_name]
    else:
        annotations = [args.feature_name]
        feature_getter = lambda row: row["annotations"][args.feature_name]

    # predictors, transforms, and updated_predictors
    predictors = cmd.get_selected_predictors(default_all=True)

    transforms = cmd.get_transforms()

    if args.updated_predictors is not None:
        updated_predictor_names = args.updated_predictors.split(",")
        if len(predictors) != len(updated_predictor_names):
            logger.error("The number of selected predictors does not match the number of predictor names to update")
            exit(-1)
        updated_predictors = dict([(p, u) for p, u in zip(predictors, updated_predictor_names)])
    else:
        updated_predictors = dict([(p, "TFIC_{}".format(p)) for p in predictors])

    # create predictors in the database if required
    db_predictors = set([p["id"] for p in db.predictors()])
    for predictor, updated_predictor in updated_predictors.items():
        if updated_predictor not in db_predictors:
            logger.info("Creating predictor {} ...".format(updated_predictor))
            db.add_predictor(updated_predictor, FannsDb.CALCULATED_PREDICTOR_TYPE, source=[predictor])

    try:
        logger.info("Loading baseline tolerance statistics ...")

        with tsv.open(args.blt_path) as f:
            doc = json.load(f)
            blt_predictors = doc["predictors"]
            features = doc["features"]
            blt_stats = doc["blt"]
            num_predictors = len(blt_predictors)

        logger.info("  Predictors: {}".format(", ".join(blt_predictors)))
        logger.info("  Features: {}".format(len(features)))

        logger.info("Calculating ...")

        progress = RatedProgress(logger, name="SNVs")

        rows_count = updated_count = 0
        for row in db.query_scores(predictors=predictors, maps=annotations):
            rows_count += 1

            scores = row["scores"]

            feature = feature_getter(row)
            if feature not in blt_stats:
                continue

            feature_stats = blt_stats[feature]

            tfic_scores = calculate_tfic(predictors, updated_predictors, feature_stats, scores, transforms)

            if len(tfic_scores) > 0:
                db.update_scores(row["id"], tfic_scores)
                updated_count += 1

            progress.update()

        progress.log_totals()

        logger.info("Commit ...")
        db.commit()

        logger.info("Finished. Total rows = {}, updated rows = {}, elapsed_time = {}".format(
            rows_count, updated_count, progress.elapsed_time))
    except:
        cmd.handle_error()

    return 0
def fetch_iter(db, muts_path, maps=None, predictors=None, muts_header=False, state=None, logger=None):
    """
    Iterator that fetches scores from the database for the mutations in a file.

    :param db: FannsDb interface.
    :param muts_path: The input path for mutations.
    :param maps: Map transcript/protein ensembl identifiers with external identifiers (swissprot_id, ...)
    :param predictors: Predictors for which to obtain the scores.
    :param muts_header: Whether the muts_path has a header or not.
    :param state: The state of the iteration: hits, fails.
    :param logger: Logger to use. If not specified a new one is created.
    """

    def query_mutation(logger, db, mut, maps, predictors):
        if mut.coord == Mutation.GENOMIC:
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug("  Querying {} {} {} {} {} {} {} ...".format(
                    mut.chr, mut.start, mut.end or "*", mut.ref or "*", mut.alt, mut.strand or "*",
                    mut.identifier or "*"))

            for row in db.query_scores(chr=mut.chr, start=mut.start,
                                       ref=mut.ref, alt=mut.alt, strand=mut.strand,
                                       predictors=predictors, maps=maps):
                yield row

        elif mut.coord == Mutation.PROTEIN:
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug("  Querying {} {} {} {} {} ...".format(
                    mut.protein, mut.start, mut.ref or "*", mut.alt, mut.identifier or "*"))

            for row in db.query_scores(protein=mut.protein, aa_pos=mut.start,
                                       aa_ref=mut.ref, aa_alt=mut.alt,
                                       predictors=predictors, maps=maps):
                yield row
        else:
            logger.warn("Unknown coordinates system: {}".format(mut.line))

    if logger is None:
        logger = logging.getLogger("fannsdb.fetch")

    state = state if state is not None else {}
    state[STATE_HITS] = state[STATE_FAILS] = 0

    maps = maps if maps is not None else []
    predictors = predictors if predictors is not None else []

    logger.info("Reading {} ...".format(os.path.basename(muts_path) if muts_path != "-" else "from standard input"))

    progress = RatedProgress(logger, name="SNVs")

    with tsv.open(muts_path) as f:
        if muts_header:
            tsv.skip_comments_and_empty(f)  # this returns the first non empty nor comment line (the header)

        mutparser = DnaAndProtMutationParser()
        for line_num, line in enumerate(f, start=1):
            line = line.rstrip(" \n\r")
            if len(line) == 0 or line.startswith("#"):
                continue

            try:
                mut = mutparser.parse(line)
            except PrematureEnd:
                logger.error("Missing fields at line {}".format(line_num))
                state[STATE_FAILS] += 1
                continue
            except UnexpectedToken as ex:
                logger.error("Unexpected field '{}' at line {}".format(ex.args[0], line_num))
                state[STATE_FAILS] += 1
                continue

            state.update({
                STATE_LINE_NUM: line_num,
                STATE_LINE: line,
                STATE_MUTATION: mut})

            exists = False
            for row in query_mutation(logger, db, mut, maps, predictors):
                exists = True

                yield row

            progress.update()

            if exists:
                state[STATE_HITS] += 1
            else:
                state[STATE_FAILS] += 1

    progress.log_totals()

    hits, fails = [state[k] for k in [STATE_HITS, STATE_FAILS]]
    logger.info("Finished. total={}, hits={}, fails={}, elapsed={}".format(
        hits + fails, hits, fails, progress.elapsed_time))
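# A minimal usage sketch of fetch_iter, assuming an already opened FannsDb
# instance; the mutations path and predictor names are hypothetical:
#
#   state = {}
#   for row in fetch_iter(db, "mutations.tsv", predictors=["SIFT", "PPH2"], state=state):
#       print row["chr"], row["start"], row["scores"]
#   print state[STATE_HITS], state[STATE_FAILS]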