def extract_process_kmers(name):
    """Extract k-mers from genomic sequence and run initial processing.

    Load project arguments and produce three files:
        extract k-mers from the genome: <name>/<name>_kmers.txt.gz
        shuffle all extracted k-mers: <name>/<name>_kmers_shuffled.txt.gz
        count occurrences of k-mers: <name>/<name>_kmers_counts.txt.gz

    Args:
        name: project name, used to get project args and in all output
    """
    util.print_log('start extract_process_kmers()')
    util.print_log('load arguments...')
    args = util.load_args(name)
    util.print_args(args)
    util.print_log('done')

    util.print_log('load FASTA...')
    util.print_log('load from %s' % args['fasta'])
    fasta = load_fasta(args['fasta'])
    util.print_log('done')

    util.print_log('extract k-mers...')
    kmers_filename = '%s/%s_kmers.txt.gz' % (name, name)
    allpams = [args['pam']] + args['altpam']
    util.print_log('write in file %s' % kmers_filename)
    genome = extract_kmers(name=name, fasta=fasta, length=args['length'],
                           pams=allpams, pampos=args['pampos'],
                           filename=kmers_filename,
                           chroms=args['chrom'],
                           minchrlen=args['minchrlen'],
                           processes=args['processes'])
    sys.stdout.write('genome: %s' % genome)
    util.print_log('save genome info')
    args['genome'] = genome
    util.save_args(args)
    util.print_log('calculate k-mer statistics')
    print_stats_kmers(kmers_filename, gnupath=args['gnupath'])
    util.print_log('done')

    util.print_log('shuffle k-mers...')
    kmers_shuffled_filename = '%s/%s_kmers_shuffled.txt.gz' % (name, name)
    util.print_log('write in file %s' % kmers_shuffled_filename)
    shuffle_kmers(fileinput=kmers_filename,
                  fileoutput=kmers_shuffled_filename,
                  gnupath=args['gnupath'])
    util.print_log('done')

    util.print_log('count k-mers...')
    count_filename = '%s/%s_kmers_counts.txt.gz' % (name, name)
    util.print_log('write in file %s' % count_filename)
    sort_count_kmers(fileinput=kmers_filename,
                     fileoutput=count_filename,
                     mincount=args['maxoffpos'],
                     gnupath=args['gnupath'])
    util.print_log('done')

    return True
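# --- Usage sketch (not part of the module above) ---------------------------
# A minimal, assumed example of running just this extraction step directly.
# It presumes the project directory 'myproject/' exists and that
# util.save_args() was already called for it (main() below normally does
# both); the bare 'import kmers' mirrors the kmers.extract_process_kmers()
# call in main() and is an assumption about the module layout.
import kmers

kmers.extract_process_kmers('myproject')
# Per the docstring, this writes:
#   myproject/myproject_kmers.txt.gz
#   myproject/myproject_kmers_shuffled.txt.gz
#   myproject/myproject_kmers_counts.txt.gz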
def main():
    # user inputs
    args = arg_parser()
    args_dict = args.__dict__

    # tidy PAM and chrom args
    args_dict['altpam'] = [s.upper() for s in args_dict['altpam'].split(',')]
    args_dict['altpam'] = [s.strip() for s in args_dict['altpam'] if s]
    args_dict['pam'] = args_dict['pam'].upper()
    if args_dict['chrom']:
        if os.path.isfile(args_dict['chrom']):
            chroms = open(args_dict['chrom']).read().split(',')
        else:
            chroms = args_dict['chrom'].split(',')
        chroms = [c.strip() for c in chroms]
        chroms = [c for c in chroms if c]
    else:
        chroms = []
    args_dict['chrom'] = chroms

    util.print_log('save arguments...')
    util.print_args(args_dict)
    util.save_args(args_dict)
    util.print_log('done')

    # main
    util.print_log2('start extract_process_kmers()')
    kmers.extract_process_kmers(args_dict['name'])
    util.print_log2('start analyze_guides()')
    kmers_trie = guides.analyze_guides(args_dict['name'])
    util.print_log2('start produce_bams_main()')
    bamdata.produce_bams_main(kmers_trie, args_dict['name'])
    util.print_log2('processer done.')
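# Assumed entry point: only main() appears in this excerpt, so the usual
# module-level guard is sketched here rather than taken from the source.
if __name__ == '__main__':
    main()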
        val_loaders, device, fig_dir=eval_img_path)


if __name__ == "__main__":
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    if not os.path.exists(plot_path):
        os.makedirs(plot_path)
    if not os.path.exists(chkpt_path):
        os.makedirs(chkpt_path)

    trainer.run(dataloaders["train"], max_epochs=args.max_epochs)

    if args.data_kwargs.get("batch_size", None) is None:
        args.data_kwargs["batch_size"] = dataloaders["train"].batch_size

    logger.save(os.path.join(log_path, "val_log.csv"))
    save_args(args, os.path.join(log_path, "args.csv"))
    save_model(
        network,
        network._get_name(),
        epoch=args.max_epochs,
        score_name="val_loss",
        score_value=logger.log["val_loss"][-1],
        tstamp=tstamp,
        save_dir=chkpt_path,
    )
#
# train_cnn_cvae_script.py ends here
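# --- Follow-up sketch (assumptions flagged) ---------------------------------
# Reload the validation log written by logger.save() above to inspect how
# val_loss evolved over training. Assumes the CSV has a 'val_loss' column,
# as suggested by logger.log["val_loss"]; the file name matches the save
# call, but the exact CSV layout is an assumption.
import os
import pandas as pd

val_log = pd.read_csv(os.path.join(log_path, "val_log.csv"))
print(val_log["val_loss"].tail())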
parser.add_argument(
    "--n_per_file",
    default=100000,
    type=int,
    help="How many demos to load per pickle file",
)
parser.add_argument(
    "--exp_dir",
    default="exp/debug",
    help="Path to exp dir",
)
parser.add_argument("--epochs", default=20, type=int)
parser.add_argument("--cuda", action="store_true")
args = parser.parse_args()

os.makedirs(args.exp_dir, exist_ok=True)
util.save_args(args, args.exp_dir)

demos = load_demos(*args.demos, n_per_file=args.n_per_file)
val_size = int(len(demos) * 0.1)
test_size = int(len(demos) * 0.1)
dsets = torch.utils.data.random_split(
    demos, [len(demos) - val_size - test_size, val_size, test_size]
)


def to_dl(d):
    return torch.utils.data.DataLoader(
        d, batch_size=100, pin_memory=True, num_workers=4, collate_fn=demo_collate
    )


dataloaders = {
    "train": to_dl(dsets[0]),