Example #1
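# Note: this snippet assumes a module-level `import sys` plus project-local
# helpers (util, load_fasta, extract_kmers, print_stats_kmers, shuffle_kmers,
# sort_count_kmers) defined elsewhere in the same package.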
def extract_process_kmers(name):
    """Extract k-mers from genomic sequence and run initial processing.

    Load project arguments and produce three files:
    extract k-mers from the genome: <name>/<name>_kmers.txt.gz
    shuffle all extracted k-mers: <name>/<name>_kmers_shuffled.txt.gz
    count occurrences of k-mers: <name>/<name>_kmers_counts.txt.gz

    Args:
    name: project name, used to get project args and in all output
    """
    util.print_log('start extract_process_kmers()')
    util.print_log('load arguments...')
    args = util.load_args(name)
    util.print_args(args)
    util.print_log('done')

    util.print_log('load FASTA...')
    util.print_log('load from %s' % args['fasta'])
    fasta = load_fasta(args['fasta'])
    util.print_log('done')

    util.print_log('extract k-mers...')
    kmers_filename = '%s/%s_kmers.txt.gz' % (name, name)
    allpams = [args['pam']] + args['altpam']
    util.print_log('write in file %s' % kmers_filename)
    genome = extract_kmers(name=name,
                           fasta=fasta,
                           length=args['length'],
                           pams=allpams,
                           pampos=args['pampos'],
                           filename=kmers_filename,
                           chroms=args['chrom'],
                           minchrlen=args['minchrlen'],
                           processes=args['processes'])
    sys.stdout.write('genome: %s\n' % genome)
    util.print_log('save genome info')
    args['genome'] = genome
    util.save_args(args)
    util.print_log('calculate k-mer statistics')
    print_stats_kmers(kmers_filename, gnupath=args['gnupath'])
    util.print_log('done')

    util.print_log('shuffle k-mers...')
    kmers_shuffled_filename = '%s/%s_kmers_shuffled.txt.gz' % (name, name)
    util.print_log('write in file %s' % kmers_shuffled_filename)
    shuffle_kmers(fileinput=kmers_filename,
                  fileoutput=kmers_shuffled_filename,
                  gnupath=args['gnupath'])
    util.print_log('done')

    util.print_log('count k-mers...')
    count_filename = '%s/%s_kmers_counts.txt.gz' % (name, name)
    util.print_log('write in file %s' % count_filename)
    sort_count_kmers(fileinput=kmers_filename,
                     fileoutput=count_filename,
                     mincount=args['maxoffpos'],
                     gnupath=args['gnupath'])
    util.print_log('done')
    return True
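The function above chains three stages: extract k-mers adjacent to a PAM, shuffle them, and count occurrences. As a rough, self-contained illustration of the extraction stage alone, here is a minimal sketch of scanning a sequence for fixed-length k-mers flanked by a PAM; the function name iter_kmers_with_pam and the simplified IUPAC handling (only N as a wildcard) are assumptions of this sketch, not part of the project code above.

def iter_kmers_with_pam(seq, length, pams, pampos='end'):
    """Yield (position, kmer) for every k-mer adjacent to one of the PAMs.

    'N' in a PAM matches any base; pampos says whether the PAM sits at the
    'start' or 'end' of the k-mer, mirroring the pampos argument above.
    """
    seq = seq.upper()

    def matches(site, pam):
        # site must be full-length; 'N' is a wildcard, other letters exact
        return len(site) == len(pam) and all(
            p == 'N' or p == s for p, s in zip(pam, site))

    for i in range(len(seq) - length + 1):
        kmer = seq[i:i + length]
        for pam in pams:
            if pampos == 'end':
                site = seq[i + length:i + length + len(pam)]
            else:
                site = seq[max(i - len(pam), 0):i]
            if matches(site, pam):
                yield i, kmer
                break

# usage: 20-mers followed by an NGG PAM, as in Cas9 guide design
for pos, kmer in iter_kmers_with_pam('ACGTACGGTT' * 4, 20, ['NGG']):
    print(pos, kmer)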
Example #2
def main():

    # user inputs
    args = arg_parser()
    args_dict = vars(args)
    # tidy PAM and chromosome arguments
    args_dict['altpam'] = [s.upper() for s in args_dict['altpam'].split(',')]
    args_dict['altpam'] = [s.strip() for s in args_dict['altpam'] if s]
    args_dict['pam'] = args_dict['pam'].upper()
    if args_dict['chrom']:
        # chromosome names may be given inline or as a file containing
        # a comma-separated list
        if os.path.isfile(args_dict['chrom']):
            with open(args_dict['chrom']) as f:
                chroms = f.read().split(',')
        else:
            chroms = args_dict['chrom'].split(',')
        chroms = [c.strip() for c in chroms]
        chroms = [c for c in chroms if c]
    else:
        chroms = []
    args_dict['chrom'] = chroms

    util.print_log('save arguments...')
    util.print_args(args_dict)
    util.save_args(args_dict)
    util.print_log('done')

    # main
    util.print_log2('start extract_process_kmers()')
    kmers.extract_process_kmers(args_dict['name'])

    util.print_log2('start analyze_guides()')
    kmers_trie = guides.analyze_guides(args_dict['name'])

    util.print_log2('start produce_bams_main()')
    bamdata.produce_bams_main(kmers_trie, args_dict['name'])

    util.print_log2('processer done.')
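The chrom handling in main() accepts either a literal comma-separated list of chromosome names or a path to a file containing one. A standalone version of that normalisation logic, under the assumption-only name parse_chrom_arg:

import os

def parse_chrom_arg(value):
    """Turn a comma-separated string, or a path to a file containing one,
    into a clean list of chromosome names."""
    if not value:
        return []
    if os.path.isfile(value):
        with open(value) as f:
            value = f.read()
    return [c.strip() for c in value.split(',') if c.strip()]

# usage
print(parse_chrom_arg('chr1, chr2,,chrX '))  # ['chr1', 'chr2', 'chrX']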
Example #3
[snippet truncated at the top by the source page; only the tail of an
evaluation/plotting call survives, passing val_loaders, device, and
fig_dir=eval_img_path]

if __name__ == "__main__":
    # create output directories if they do not exist yet
    os.makedirs(log_path, exist_ok=True)
    os.makedirs(plot_path, exist_ok=True)
    os.makedirs(chkpt_path, exist_ok=True)

    trainer.run(dataloaders["train"], max_epochs=args.max_epochs)

    if args.data_kwargs.get("batch_size", None) is None:
        args.data_kwargs["batch_size"] = dataloaders["train"].batch_size
    logger.save(os.path.join(log_path, "val_log.csv"))
    save_args(args, os.path.join(log_path, "args.csv"))

    save_model(
        network,
        network._get_name(),
        epoch=args.max_epochs,
        score_name="val_loss",
        score_value=logger.log["val_loss"][-1],
        tstamp=tstamp,
        save_dir=chkpt_path,
    )

# train_cnn_cvae_script.py ends here
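save_model above is project-specific and not shown here; its call site suggests a checkpoint saved under a name built from the model name, epoch, validation score, and timestamp. A minimal PyTorch sketch of a helper with that shape (the filename pattern is this sketch's own choice, not the project's):

import os
import torch

def save_model(network, name, epoch, score_name, score_value, tstamp, save_dir):
    """Save the model's state_dict under a descriptive, sortable filename."""
    fname = '%s_%s_epoch%03d_%s%.4f.pt' % (
        name, tstamp, epoch, score_name, score_value)
    path = os.path.join(save_dir, fname)
    torch.save(network.state_dict(), path)
    return path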
Example #4
    parser.add_argument(
        "--n_per_file",
        default=100000,
        type=int,
        help="How many demos to load per pickle file",
    )
    parser.add_argument(
        "--exp_dir", default="exp/debug", help="Path to exp dir",
    )
    parser.add_argument("--epochs", default=20, type=int)
    parser.add_argument("--cuda", action="store_true")

    args = parser.parse_args()

    os.makedirs(args.exp_dir, exist_ok=True)
    util.save_args(args, args.exp_dir)

    demos = load_demos(*args.demos, n_per_file=args.n_per_file)
    # hold out 10% each for validation and test; the rest is training data
    val_size = int(len(demos) * 0.1)
    test_size = int(len(demos) * 0.1)
    dsets = torch.utils.data.random_split(
        demos, [len(demos) - val_size - test_size, val_size, test_size]
    )

    def to_dl(d):
        return torch.utils.data.DataLoader(
            d, batch_size=100, pin_memory=True, num_workers=4, collate_fn=demo_collate
        )

    dataloaders = {
        "train": to_dl(dsets[0]),