def test_database(self):
    if find_cmd("sqlite3"):
        with make_workdir() as workdir:
            arg = namedtuple('args', 'debug print_debug json')
            args = arg(True, True, "../../data/examples/seqcluster.json")
            initialize_logger(".", args.debug, args.print_debug)
            logger = mylog.getLogger(__name__)
            logger.info(args)
            logger.info("Reading data")
            data = load_data(args.json)
            logger.info("Create database")
            make_database(data)
def test_database(self):
    if find_cmd("sqlite3"):
        mod_dir = os.path.dirname(inspect.getfile(seqcluster)).replace("seqcluster/", "")
        os.chdir(os.path.join(mod_dir, "data/examples"))
        arg = namedtuple('args', 'debug print_debug json')
        args = arg(True, True, "seqcluster.json")
        initialize_logger(".", args.debug, args.print_debug)
        logger = mylog.getLogger(__name__)
        logger.info(args)
        logger.info("Reading data")
        data = load_data(args.json)
        logger.info("Create database")
        make_database(data)
def report(args):
    """
    Create report in html format
    """
    logger.info("Reading sequences")
    data = load_data(args.json)
    logger.info("Create profile")
    data = make_profile(data, os.path.join(args.out, "profiles"), args)
    logger.info("Create database")
    make_database(data, "seqcluster.db", args.out)
    logger.info("Done. Download https://github.com/lpantano/seqclusterViz/archive/master.zip to browse the output.")
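# A minimal sketch (not from the source) of driving report() directly, mirroring
# the namedtuple pattern used in the tests. Only args.json and args.out are read
# by name above; make_profile() also receives the full args object, so a real run
# may need extra fields. The paths here are assumptions for illustration.
from collections import namedtuple

arg = namedtuple('args', 'json out')
report(arg("seqcluster.json", "report_out"))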
def test_predict(self):
    if find_cmd("tRNAscan-SE"):
        mod_dir = os.path.dirname(inspect.getfile(seqcluster)).replace("seqcluster/", "")
        os.chdir(os.path.join(mod_dir, "data/examples"))
        out_dir = os.path.join(mod_dir, "data/examples/predictions")
        arg = namedtuple('args', 'debug print_debug MIN_SEQ json reference')
        args = arg(True, True, 1, "seqcluster.json", "../genomes/genome.fa")
        initialize_logger(".", args.debug, args.print_debug)
        logger = mylog.getLogger(__name__)
        logger.info(args)
        logger.info("Reading data")
        data = load_data(args.json)
        logger.info("Start prediction")
        make_predictions(data, out_dir, args)
def test_predict(self):
    if find_cmd("tRNAscan-SE"):
        mod_dir = os.path.dirname(inspect.getfile(seqcluster)).replace("seqcluster/", "")
        os.chdir(os.path.join(mod_dir, "data/examples"))
        out_dir = os.path.join(mod_dir, "data/examples/test_out_predictions")
        arg = namedtuple('args', 'debug print_debug MIN_SEQ json reference')
        args = arg(True, True, 1, "seqcluster.json", "../genomes/genome.fa")
        initialize_logger(".", args.debug, args.print_debug)
        logger = mylog.getLogger(__name__)
        logger.info(args)
        logger.info("Reading data")
        data = load_data(args.json)
        logger.info("Start prediction")
        is_tRNA(data, out_dir, args)
def explore(args):
    """Create mapping of sequences of two clusters
    """
    logger.info("Reading sequences")
    data = load_data(args.json)
    logger.info("Get sequences from json")
    c1, c2 = args.names.split(",")
    seqs, names = get_sequences_from_cluster(c1, c2, data[0])
    loci = get_precursors_from_cluster(c1, c2, data[0])
    logger.info("Map all sequences to all loci")
    print("%s" % (loci))
    map_to_precursors(seqs, names, loci, os.path.join(args.out, "map.tsv"), args)
    # map_sequences_w_bowtie(sequences, precursors)
    logger.info("Plot sequences on loci")
    # get_matrix_position()
    # plot_sequences()
    logger.info("Done")
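# A minimal sketch (not from the source) of calling explore(); args.json,
# args.names and args.out are the fields read above. args.names is a
# comma-separated pair of cluster ids; map_to_precursors() also receives the
# full args object, so a real run may need extra fields. Values are assumptions.
from collections import namedtuple

arg = namedtuple('args', 'json names out')
explore(arg("seqcluster.json", "1,2", "explore_out"))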
def predictions(args):
    """
    Create predictions of clusters
    """
    logger.info(args)
    logger.info("Reading sequences")
    out_file = os.path.abspath(os.path.splitext(args.json)[0] + "_prediction.json")
    data = load_data(args.json)
    out_dir = os.path.abspath(safe_dirs(os.path.join(args.out, "predictions")))
    logger.info("Make predictions")
    data = is_tRNA(data, out_dir, args)
    if args.coral:
        logger.info("Make CoRaL predictions")
        run_coral(data, out_dir, args)
    write_data(data[0], out_file)
    logger.info("Done")
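# A minimal sketch (not from the source) of calling predictions(). args.json,
# args.out and args.coral are read directly above; reference and MIN_SEQ mirror
# the args built in test_predict and are assumptions about what is_tRNA() needs.
from collections import namedtuple

arg = namedtuple('args', 'json out coral reference MIN_SEQ')
predictions(arg("seqcluster.json", "pred_out", False, "../genomes/genome.fa", 1))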
def cluster(args):
    """
    Creating clusters
    """
    args = _check_args(args)
    read_stats_file = op.join(args.dir_out, "read_stats.tsv")
    if file_exists(read_stats_file):
        os.remove(read_stats_file)
    bam_file, seq_obj = _clean_alignment(args)

    logger.info("Parsing matrix file")
    seqL, y, l = parse_ma_file(seq_obj, args.ffile)
    # y, l = _total_counts(seqL.keys(), seqL)
    logger.info("counts after: %s" % sum(y.values()))
    logger.info("# sequences after: %s" % l)
    # dict views must be materialized with list() for DataFrame under Python 3
    dt = pd.DataFrame({'sample': list(y.keys()), 'counts': list(y.values())})
    dt['step'] = 'aligned'
    dt.to_csv(read_stats_file, sep="\t", index=False, header=False, mode='a')

    if len(seqL.keys()) < 10:
        logger.error("It seems you have low coverage. Please check your fastq files have enough sequences.")
        raise ValueError("So few sequences.")

    logger.info("Cleaning bam file")
    y, l = _total_counts(list(seqL.keys()), seqL)
    logger.info("counts after: %s" % sum(y.values()))
    logger.info("# sequences after: %s" % l)
    dt = pd.DataFrame({'sample': list(y.keys()), 'counts': list(y.values())})
    dt['step'] = 'cleaned'
    dt.to_csv(read_stats_file, sep="\t", index=False, header=False, mode='a')

    clusL = _create_clusters(seqL, bam_file, args)
    y, l = _total_counts(list(clusL.seq.keys()), clusL.seq, aligned=True)
    logger.info("counts after: %s" % sum(y.values()))
    logger.info("# sequences after: %s" % l)
    dt = pd.DataFrame({'sample': list(y.keys()), 'counts': list(y.values())})
    dt['step'] = 'clusters'
    dt.to_csv(read_stats_file, sep="\t", index=False, header=False, mode='a')

    logger.info("Solving multi-mapping events in the network of clusters")
    clusLred = _cleaning(clusL, args.dir_out)
    y, l = _total_counts(clusLred.clus, seqL)
    logger.info("counts after: %s" % sum(y.values()))
    logger.info("# sequences after: %s" % l)
    dt = pd.DataFrame({'sample': list(y.keys()), 'counts': list(y.values())})
    dt['step'] = 'meta-cluster'
    dt.to_csv(read_stats_file, sep="\t", index=False, header=False, mode='a')
    logger.info("Clusters up to %s" % (len(clusLred.clus.keys())))

    if args.show:
        logger.info("Creating sequences alignment to precursor")
        clusLred = show_seq(clusLred, args.index)

    clusLred = peak_calling(clusLred)
    clusLred = _annotate(args, clusLred)

    logger.info("Creating json and count matrix")
    json_file = _create_json(clusLred, args)
    logger.info("Output file in: %s" % args.dir_out)

    if args.db:
        name = args.db + ".db"
        logger.info("Create database: database/" + name)
        data = load_data(json_file)
        out_dir = op.join(args.dir_out, "database")
        make_database(data, name, out_dir)
    logger.info("Finished")
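# A minimal sketch (assumptions, not from the source) for inspecting the
# read_stats.tsv that cluster() appends to. The file is written with
# header=False, so the column names are supplied manually here; the order
# (sample, counts, step) assumes insertion-ordered DataFrame columns.
import pandas as pd

stats = pd.read_csv("read_stats.tsv", sep="\t",
                    names=["sample", "counts", "step"])
# one row per sample and step -> wide table of counts per pipeline step
print(stats.pivot(index="sample", columns="step", values="counts"))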