def main(): parser = argparse.ArgumentParser( description="Update predictors min, max and count") cmd = DefaultCommandHelper(parser) cmd.add_db_args() cmd.add_selected_predictors_args() args, logger = cmd.parse_args("pred-update") db = cmd.open_db() try: predictors = cmd.get_selected_predictors(default_all=True) logger.info("Updating predictors ...") start_time = datetime.now() db.update_predictors(predictors) db.commit() logger.info("Finished. elapsed={}".format(datetime.now() - start_time)) except: return cmd.handle_error() finally: db.close() return 0
def main(): parser = argparse.ArgumentParser( description="Fetch Condel scores") cmd = DefaultCommandHelper(parser) cmd.add_db_args() parser.add_argument("muts_path", metavar="SNVS_PATH", help="SNV's to check. Use - for standard input.") parser.add_argument("out_path", metavar="OUTPUT_PATH", help="The results path. Use - for standard output.") cmd.add_selected_predictors_args() cmd.add_selected_annotations_args() cmd.add_selected_columns_args() args, logger = cmd.parse_args("fetch") db = cmd.open_db() predictors = cmd.get_selected_predictors() annotations = cmd.get_selected_annotations() columns = cmd.get_selected_columns() logger.info("Reading {} ...".format(args.muts_path if args.muts_path != "-" else "from standard input")) try: progress = RatedProgress(logger, name="SNVs") with tsv.open(args.muts_path) as f: with tsv.open(args.out_path, "w") as wf: tsv.write_line(wf, "ID", *[c.upper() for c in columns] + [a.upper() for a in annotations] + predictors) hit = fail = 0 mut = DnaAndProtMutationParser() for line_num, line in enumerate(f, start=1): line = line.rstrip(" \n\r") if len(line) == 0 or line.startswith("#"): continue try: mut.parse(line) except PrematureEnd: logger.error("Missing fields at line {}".format(line_num)) fail += 1 continue except UnexpectedToken as ex: logger.error("Unexpected field '{}' at line {}".format(ex.args[0], line_num)) fail += 1 continue exists = False for row in query_mutation(logger, db, mut, annotations, predictors): exists = True ann = row["annotations"] scores = row["scores"] tsv.write_line(wf, mut.identifier, *[row[c] for c in columns] + [ann[a] for a in annotations] + [scores[p] for p in predictors]) """ if logger.isEnabledFor(logging.DEBUG): logger.debug(" --> {} {} {} {} {} {} {} {} {} {}".format( row["chr"], row["start"], row["ref"], row["alt"], row["transcript"], row["protein"], row["aa_pos"], row["aa_ref"], row["aa_alt"], mut.identifier or "*")) """ progress.update() if exists: hit += 1 else: fail += 1 progress.log_totals() logger.info("Finished. total={}, hits={}, fails={}, elapsed={}".format(hit + fail, hit, fail, progress.elapsed_time)) except: return cmd.handle_error() finally: db.close()
def main(): parser = argparse.ArgumentParser( description="Update scores in the database") cmd = DefaultCommandHelper(parser) cmd.add_db_args() parser.add_argument("source_path", metavar="SOURCE", help="The source file. Use - for standard input.") cmd.add_selected_predictors_args() parser.add_argument("--update-predictors", dest="update_predictors", action="store_true", default=False, help="Update of the predictors.") parser.add_argument("--ignore-errors", dest="ignore_errors", action="store_true", default=False, help="When errors on the input file, report them but continue processing the input.") args, logger = cmd.parse_args("update") db = cmd.open_db() predictors = cmd.get_selected_predictors(check_missing=False) try: progress = RatedProgress(logger, name="SNVs") total_lines = 0 logger.info("Reading {} ...".format(args.source_path if args.source_path != "-" else "from standard input")) with tsv.open(args.source_path) as f: # Parse header hdr_line = f.readline() hdr = dict([(name, index) for index, name in enumerate(hdr_line.rstrip("\n").split("\t"))]) db_predictors = set([p["id"] for p in db.predictors()]) if len(predictors) == 0: predictors = [name for name in hdr if name in db_predictors] if len(predictors) == 0: raise Exception("Any input file header match the available predictors in the database. Please specify them using -p.") logger.info("Predictors: {}".format(", ".join(predictors))) for predictor in filter(lambda p: p not in db_predictors, predictors): logger.info("Creating predictor {} ...".format(predictor)) db.add_predictor(predictor, FannsDb.SOURCE_PREDICTOR_TYPE) use_genome_coords = len(set(hdr.keys()) & set(GENOME_COORD_COLUMNS)) > 0 use_protein_coords = len(set(hdr.keys()) & set(PROTEIN_COORD_COLUMNS)) > 0 if not use_genome_coords and not use_protein_coords: raise Exception("No coordinate columns found. " "Use {} for genomic coordinates or {} for protein coordinates.".format( GENOME_COORD_COLUMNS, PROTEIN_COORD_COLUMNS)) elif use_genome_coords and use_protein_coords: logger.warn("Both, genomic and protein coordinates columns found. 
Using genomic coordinates by default") if use_genome_coords: coord_column_names = [n for n in hdr if n in set(GENOME_COORD_COLUMNS)] coord_column_types = [GENOME_COORD_COLUMN_TYPE[n] for n in coord_column_names] #get_rows = db.get_transcripts_by_dna elif use_protein_coords: coord_column_names = [n for n in hdr if n in set(PROTEIN_COORD_COLUMNS)] coord_column_types = [PROTEIN_COORD_COLUMN_TYPE[n] for n in coord_column_names] #get_rows = db.get_transcripts_by_protein coord_column_indices = [hdr[n] for n in coord_column_names] score_indices = [hdr[n] for n in predictors] max_column_index = max(coord_column_indices + score_indices) for line_num, line in enumerate(f, start=2): fields = line.rstrip("\n").split("\t") if len(fields) < max_column_index: log.error("Missing columns for line {}: {}".format(line_num, " ".join(fields))) if not args.ignore_errors: raise try: coords = dict([(name.lower(), type_cast(fields[index])) for name, type_cast, index in zip( coord_column_names, coord_column_types, coord_column_indices)]) scores = dict([(p, score_value(fields[i])) for p, i in zip(predictors, score_indices)]) except Exception as ex: logger.error("{} at line {}: {}".format(str(ex), line_num, " ".join(fields))) if not args.ignore_errors: raise try: for row in db.query_scores(fields=[], **coords): db.update_scores(row["id"], scores) except Exception as ex: logger.error("Error updating SNV at line {}: {}".format(line_num, str(ex))) logger.error("SNV: {}".format(", ".join(["{}={}".format(k, v) for k, v in coords.items()]))) if not args.ignore_errors: raise progress.update() progress.log_totals() logger.info("Finalizing database ...") if args.update_predictors: logger.info("Updating predictors ...") db.update_predictors() logger.info("Committing ...") db.commit() logger.info("Finished successfully. Elapsed time: {}".format(progress.elapsed_time)) except: return cmd.handle_error() finally: db.close() return 0
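# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the update command above). It shows how
# the header is mapped to column indices and how one data line becomes the
# coords/scores dicts passed to db.query_scores()/db.update_scores(). The
# column names CHR/START/REF/ALT, the predictor name SIFT and the casts used
# here are hypothetical stand-ins for GENOME_COORD_COLUMNS,
# GENOME_COORD_COLUMN_TYPE and score_value().
def _example_update_line_parsing():
	hdr_line = "CHR\tSTART\tREF\tALT\tSIFT\n"
	hdr = dict((name, index) for index, name in enumerate(hdr_line.rstrip("\n").split("\t")))

	coord_column_names = ["CHR", "START", "REF", "ALT"]
	coord_column_types = [str, int, str, str]
	coord_column_indices = [hdr[n] for n in coord_column_names]
	predictors = ["SIFT"]
	score_indices = [hdr[p] for p in predictors]

	line = "1\t12345\tA\tC\t0.07\n"
	fields = line.rstrip("\n").split("\t")
	coords = dict((name.lower(), cast(fields[index]))
				  for name, cast, index in zip(coord_column_names, coord_column_types, coord_column_indices))
	scores = dict((p, float(fields[i])) for p, i in zip(predictors, score_indices))
	return coords, scores  # ({'chr': '1', 'start': 12345, 'ref': 'A', 'alt': 'C'}, {'SIFT': 0.07})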
def main(): parser = argparse.ArgumentParser( description="Export Scores") cmd = DefaultCommandHelper(parser) cmd.add_db_args() parser.add_argument("dest_path", metavar="OUTPUT_PATH", help="The output file. Use - for standard output.") cmd.add_selected_predictors_args() cmd.add_selected_annotations_args() cmd.add_selected_columns_args() parser.add_argument("--json", dest="to_json", action="store_true", default=False, help="Export the results in json format") parser.add_argument("--sample", dest="sample", type=int, metavar="PCT", help="Export a random sample of PCT %%") parser.add_argument("--start", dest="start", type=int, metavar="N", help="Start to export from the SNV number N") parser.add_argument("--limit", dest="limit", type=int, metavar="N", help="Limit the number of SNVs to export to N") args, logger = cmd.parse_args("export") db = cmd.open_db() predictors = cmd.get_selected_predictors() annotations = cmd.get_selected_annotations() columns = cmd.get_selected_columns() logger.info("Exporting ...") random.seed(time.time()) total_count = 0 total_start_time = time.time() try: progress = RatedProgress(logger, name="SNVs") to_json = args.to_json sample = args.sample start = args.start or 0 limit = args.limit doc = None last_pos = None rows_count = 0 snvs_count = 0 with tsv.open(args.dest_path, "w") as f: if not to_json: tsv.write_line(f, *[c.upper() for c in columns] + [a.upper() for a in annotations] + predictors) for row in db.query_scores(predictors=predictors, maps=annotations): if not to_json: if start > 0: start -= 1 continue if sample is not None and random.randint(1, 100) > sample: continue pos = (row["chr"], row["strand"], row["start"], row["ref"], row["alt"]) if last_pos != pos: if to_json: if start > 0: start -= 1 continue if limit is not None and snvs_count >= limit: if doc is not None: json.dump(doc, f) f.write("\n") break snvs_count += 1 rows_count += 1 ann = row["annotations"] scores = row["scores"] if to_json: tdoc = dict([(k,row[k]) for k in ["transcript", "protein", "aa_pos", "aa_ref", "aa_alt"]] + [(k,scores[k]) for k in predictors]) if pos != last_pos: if doc is not None: if sample is None or random.randint(1, 100) <= sample: json.dump(doc, f) f.write("\n") else: snvs_count -= 1 doc = dict([(k, row[k]) for k in ["chr", "strand", "start", "ref", "alt"]] + [("transcripts", [tdoc])]) else: doc["transcripts"] += [tdoc] else: tsv.write_line(f, *[row[c] for c in columns] + [ann[a] for a in annotations] + [scores[p] for p in predictors]) progress.update() last_pos = pos if not to_json and limit is not None and rows_count >= limit: break progress.log_totals() logger.info("Finished. Total rows = {}, SNVs = {}, elapsed_time = {}".format(rows_count, snvs_count, progress.elapsed_time)) except: return cmd.handle_error() finally: db.close() return 0