Example #1
def extract_process_kmers(name):
    """Extract k-mers from genomic sequence and run initial processing.

    Load project arguments and produce three files:
    extract k-mers from the genome: <name>/<name>_kmers.txt.gz
    shuffle all extracted k-mers: <name>/<name>_kmers_shuffled.txt.gz
    count occurrences of k-mers: <name>/<name>_kmers_counts.txt.gz

    Args:
    name: project name; used to load project args and to name all output files
    """
    util.print_log('start extract_process_kmers()')
    util.print_log('load arguments...')
    args = util.load_args(name)
    util.print_args(args)
    util.print_log('done')

    util.print_log('load FASTA...')
    util.print_log('load from %s' % args['fasta'])
    fasta = load_fasta(args['fasta'])
    util.print_log('done')

    util.print_log('extract k-mers...')
    kmers_filename = '%s/%s_kmers.txt.gz' % (name, name)
    allpams = [args['pam']] + args['altpam']
    util.print_log('write in file %s' % kmers_filename)
    genome = extract_kmers(name=name,
                           fasta=fasta,
                           length=args['length'],
                           pams=allpams,
                           pampos=args['pampos'],
                           filename=kmers_filename,
                           chroms=args['chrom'],
                           minchrlen=args['minchrlen'],
                           processes=args['processes'])
    sys.stdout.write('genome: %s\n' % genome)
    util.print_log('save genome info')
    args['genome'] = genome
    util.save_args(args)
    util.print_log('calculate k-mer statistics')
    print_stats_kmers(kmers_filename, gnupath=args['gnupath'])
    util.print_log('done')

    util.print_log('shuffle k-mers...')
    kmers_shuffled_filename = '%s/%s_kmers_shuffled.txt.gz' % (name, name)
    util.print_log('write in file %s' % kmers_shuffled_filename)
    shuffle_kmers(fileinput=kmers_filename,
                  fileoutput=kmers_shuffled_filename,
                  gnupath=args['gnupath'])
    util.print_log('done')

    util.print_log('count k-mers...')
    count_filename = '%s/%s_kmers_counts.txt.gz' % (name, name)
    util.print_log('write in file %s' % count_filename)
    sort_count_kmers(fileinput=kmers_filename,
                     fileoutput=count_filename,
                     mincount=args['maxoffpos'],
                     gnupath=args['gnupath'])
    util.print_log('done')
    return True
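
All of these snippets hinge on a util.load_args/util.save_args pair that round-trips a plain dict of project arguments (the exact signature varies: Examples #3 and #4 pass a path, Example #8 passes nothing). A minimal sketch of the contract used here, assuming the args live in a hypothetical <name>/<name>_args.json file, which is not necessarily the real project layout:

import json
import os

def load_args(name):
    # hypothetical storage path, for illustration only
    with open(os.path.join(name, name + '_args.json')) as f:
        return json.load(f)

def save_args(args):
    # assumes the project name is stored inside the args dict, since
    # Example #1 calls util.save_args(args) with no explicit name
    name = args['name']
    with open(os.path.join(name, name + '_args.json'), 'w') as f:
        json.dump(args, f, indent=2)
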
Example #2
def produce_bams_main(kmers_trie, name):
    """Produce BAM file with all guideRNAs and info about their off-targets.

    Run after all files and trie were generated
    by kmers.extract_process_kmers() and guides.analyze_guides()

    Produce files:
    sorted BAM file with off-target info: <name>/<name>_guides.bam
    index for the BAM file with off-target info: <name>/<name>_guides.bam.bai
    also, BAM file and index for all guideRNAs without any off-target info
    (produced much faster):
        <name>/<name>_guides_nooff.bam
        <name>/<name>_guides_nooff.bam.bai

    Args:
    kmers_trie: trie.trie object as produced by guides.analyze_guides()
    name: project name; used to load project args and to name all output files
    """
    util.print_log('start produce_bams_main()')
    util.print_log('load arguments...')
    args = util.load_args(name)
    util.print_args(args)
    util.print_log('done')

    util.print_log('produce SAM file with guideRNAs only (no off-targets)...')
    # guides_filename = '%s/%s_guides.txt.gz' % (name, name)
    # parts = 256
    n = args['greateroffdist']
    parts = 4 ** n

    guides_dir = '%s/classifiedfiles/guides' % name
    guides_filenames = ['%s/%s.txt.gz' % (guides_dir, i) for i in range(parts)]
    util.print_log('read guides from %s' % guides_dir)
    produce_bam_custom(kmers_trie=kmers_trie, name=name, label='nooff',
                       guides_filename=guides_filenames,
                       args=args, offdist=-1,  # -1 for no off-targets
                       maxoffcount=args['maxoffcount'],
                       processes=args['processes'],
                       n=n,
                       parts=parts)
    util.print_log('done')

    if args['offdist'] != -1:
        util.print_log('produce SAM file with guideRNAs'
                       ' and off-target info...')
        # guides_filename = '%s/%s_guides.txt.gz' % (name, name)
        util.print_log('read guides from %s' % guides_dir)
        produce_bam_custom(kmers_trie=kmers_trie, name=name, 
                           label='offdist%s' % args['offdist'],
                           guides_filename=guides_filenames,
                           args=args, offdist=args['offdist'],
                           maxoffcount=args['maxoffcount'],
                           processes=args['processes'],
                           n=n,
                           parts=parts)
        util.print_log('done')
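
The parts = 4 ** n pattern recurs in Examples #2, #6, and #7: guideRNAs are pre-classified into one file per possible length-n key over the DNA alphabet {A, C, G, T}, so the default n = 4 gives the 256 buckets behind the commented-out parts = 256. A hypothetical helper showing how a k-mer could map to such a bucket (the project's actual classification key may differ):

def bucket_index(kmer, n):
    # interpret the first n bases as a base-4 number; keying on the
    # k-mer prefix is an illustration, not the project's actual scheme
    base = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    idx = 0
    for ch in kmer[:n]:
        idx = idx * 4 + base[ch]
    return idx

print(bucket_index('ACGTACGT', 2))  # 1, i.e. bucket file 1.txt.gz of 16
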
Example #3
    if str(args.load_from)[:9] == "Tokenizer":
        print("resume training the tokenizer..." + args.load_from)
        CHECKPOINTS = CHECKPOINTS_TOKENIZER  # path to the folder that stores the trained model, if any
        if args.over_write == 1:
            args.save_to = args.load_from  # overwrite the weights
            print("overwrite the weights to ", args.save_to)

    # if starting to train a new model (with or without downloading the LM)
    elif str(args.load_from)[:2] == "LM":
        CHECKPOINTS = CHECKPOINTS_LM
        print("download the language model from " + args.load_from)

    # get the network structure from the loaded model
    imported_model = os.path.join(CHECKPOINTS_TOKENIZER, args.load_from)
    args_dict = util.load_args(imported_model)
    args.char_embedding_size = args_dict["char_embedding_size"]
    args.hidden_dim = args_dict["hidden_dim"]
    args.layer_num = args_dict["layer_num"]
    # learning_rate = args_dict["learning_rate"]
    args.clip_grad = args_dict["clip_grad"]
    args.sequence_length = args_dict["sequence_length"]
    args.batchSize = args_dict["batchSize"]
    args.lstm_num_direction = args_dict["lstm_num_direction"]
    # sgd_momentum = args_dict["sgd_momentum"]
    args.len_lines_per_chunk = args_dict["len_lines_per_chunk"]
    args.optim = args_dict["optim"]
    print("get the network structure from the loaded model...")

# set the default note
if args.add_note is None:
    args.add_note = str(args.dataset) + " , " + str(
        args.learning_rate) + ", epoch " + str(args.epoch)
Example #4
# 1 for saving a resumed model to the old model, 0 for saving it as a new model
parser.add_argument("--over_write", type=int, default=0)
parser.add_argument("--add_note", type=str)

args = parser.parse_args()
args_dict = vars(args)

train = True
print()
if args.load_from is None:
    print("===========start training a language model===========")
else:
    print("===========resume the training of " + str(args.load_from) +
          "===========")
    json_path = os.path.join(CHECKPOINTS_LM, args.load_from)
    args_dict = util.load_args(json_path)
    args.char_embedding_size = args_dict["char_embedding_size"]
    args.hidden_dim = args_dict["hidden_dim"]
    args.layer_num = args_dict["layer_num"]
    args.clip_grad = args_dict["clip_grad"]
    args.sequence_length = args_dict["sequence_length"]
    args.batchSize = args_dict["batchSize"]
    args.lstm_num_direction = args_dict["lstm_num_direction"]
    args.len_lines_per_chunk = args_dict["len_lines_per_chunk"]
    # args.optim = args_dict["optim"]
    print("set up the network structure...")

# set a default note
if args.add_note is None:
    args.add_note = str(args.dataset) + " , " + str(
        args.learning_rate) + ", epoch " + str(args.epoch)
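
Note how the resume branch copies each saved field back onto the argparse namespace one by one; util.load_args rebinds args_dict to the saved dict rather than mutating args in place. A more compact equivalent (hypothetical; it assumes the saved dict and the namespace share key names, as they do in the listing above):

RESTORED_KEYS = ["char_embedding_size", "hidden_dim", "layer_num",
                 "clip_grad", "sequence_length", "batchSize",
                 "lstm_num_direction", "len_lines_per_chunk"]
for key in RESTORED_KEYS:
    setattr(args, key, args_dict[key])
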
Example #5
def main(args):

    # cfg_file = os.path.join(args.example_config_path, args.primitive) + ".yaml"
    cfg = get_vae_defaults()
    # cfg.merge_from_file(cfg_file)
    cfg.freeze()

    batch_size = args.batch_size
    dataset_size = args.total_data_size

    if args.experiment_name is None:
        experiment_name = args.model_name
    else:
        experiment_name = args.experiment_name

    if not os.path.exists(os.path.join(args.log_dir, experiment_name)):
        os.makedirs(os.path.join(args.log_dir, experiment_name))

    description_txt = input('Please enter experiment notes: \n')
    if isinstance(description_txt, str):
        with open(
                os.path.join(args.log_dir, experiment_name,
                             experiment_name + '_description.txt'), 'w') as f:
            f.write(description_txt)

    writer = SummaryWriter(os.path.join(args.log_dir, experiment_name))

    # torch_seed = np.random.randint(low=0, high=1000)
    # np_seed = np.random.randint(low=0, high=1000)
    torch_seed = 0
    np_seed = 0

    torch.manual_seed(torch_seed)
    np.random.seed(np_seed)

    trained_model_path = os.path.join(args.model_path, args.model_name)
    if not os.path.exists(trained_model_path):
        os.makedirs(trained_model_path)

    if args.task == 'contact':
        if args.start_rep == 'keypoints':
            start_dim = 24
        elif args.start_rep == 'pose':
            start_dim = 7

        if args.goal_rep == 'keypoints':
            goal_dim = 24
        elif args.goal_rep == 'pose':
            goal_dim = 7

        if args.skill_type == 'pull':
            # + 7 because single arm palm pose
            input_dim = start_dim + goal_dim + 7
        else:
            # + 14 because both arms palm pose
            input_dim = start_dim + goal_dim + 14
        output_dim = 7
        decoder_input_dim = start_dim + goal_dim

        vae = VAE(input_dim,
                  output_dim,
                  args.latent_dimension,
                  decoder_input_dim,
                  hidden_layers=cfg.ENCODER_HIDDEN_LAYERS_MLP,
                  lr=args.learning_rate)
    elif args.task == 'goal':
        if args.start_rep == 'keypoints':
            start_dim = 24
        elif args.start_rep == 'pose':
            start_dim = 7

        if args.goal_rep == 'keypoints':
            goal_dim = 24
        elif args.goal_rep == 'pose':
            goal_dim = 7

        input_dim = start_dim + goal_dim
        output_dim = goal_dim
        decoder_input_dim = start_dim
        vae = GoalVAE(input_dim,
                      output_dim,
                      args.latent_dimension,
                      decoder_input_dim,
                      hidden_layers=cfg.ENCODER_HIDDEN_LAYERS_MLP,
                      lr=args.learning_rate)
    elif args.task == 'transformation':
        input_dim = args.input_dimension
        output_dim = args.output_dimension
        decoder_input_dim = args.input_dimension - args.output_dimension
        vae = GoalVAE(input_dim,
                      output_dim,
                      args.latent_dimension,
                      decoder_input_dim,
                      hidden_layers=cfg.ENCODER_HIDDEN_LAYERS_MLP,
                      lr=args.learning_rate)
    else:
        raise ValueError('training task not recognized')

    if torch.cuda.is_available():
        vae.encoder.cuda()
        vae.decoder.cuda()

    if args.start_epoch > 0:
        start_epoch = args.start_epoch
        num_epochs = args.num_epochs
        fname = os.path.join(
            trained_model_path,
            args.model_name + '_epoch_%d.pt' % args.start_epoch)
        torch_seed, np_seed = load_seed(fname)
        load_net_state(vae, fname)
        load_opt_state(vae, fname)
        args = load_args(fname)
        args.start_epoch = start_epoch
        args.num_epochs = num_epochs
        torch.manual_seed(torch_seed)
        np.random.seed(np_seed)

    data_dir = args.data_dir
    data_loader = DataLoader(data_dir=data_dir)

    data_loader.create_random_ordering(size=dataset_size)

    dataset = data_loader.load_dataset(start_rep=args.start_rep,
                                       goal_rep=args.goal_rep,
                                       task=args.task)

    total_loss = []
    start_time = time.time()
    print('Saving models to: ' + trained_model_path)
    kl_weight = 1.0
    print('Starting on epoch: ' + str(args.start_epoch))

    for epoch in range(args.start_epoch, args.start_epoch + args.num_epochs):
        print('Epoch: ' + str(epoch))
        epoch_total_loss = 0
        epoch_kl_loss = 0
        epoch_pos_loss = 0
        epoch_ori_loss = 0
        epoch_recon_loss = 0
        kl_coeff = 1 - kl_weight
        kl_weight = args.kl_anneal_rate * kl_weight
        print('KL coeff: ' + str(kl_coeff))
        for i in range(0, dataset_size, batch_size):
            vae.optimizer.zero_grad()

            input_batch, decoder_input_batch, target_batch = \
                data_loader.sample_batch(dataset, i, batch_size)
            input_batch = to_var(torch.from_numpy(input_batch))
            decoder_input_batch = to_var(torch.from_numpy(decoder_input_batch))

            z, recon_mu, z_mu, z_logvar = vae.forward(input_batch,
                                                      decoder_input_batch)
            kl_loss = vae.kl_loss(z_mu, z_logvar)

            if args.task == 'contact':
                output_r, output_l = recon_mu
                if args.skill_type == 'grasp':
                    target_batch_right = to_var(
                        torch.from_numpy(target_batch[:, 0]))
                    target_batch_left = to_var(
                        torch.from_numpy(target_batch[:, 1]))

                    pos_loss_right = vae.mse(output_r[:, :3],
                                             target_batch_right[:, :3])
                    ori_loss_right = vae.rotation_loss(
                        output_r[:, 3:], target_batch_right[:, 3:])

                    pos_loss_left = vae.mse(output_l[:, :3],
                                            target_batch_left[:, :3])
                    ori_loss_left = vae.rotation_loss(output_l[:, 3:],
                                                      target_batch_left[:, 3:])

                    pos_loss = pos_loss_left + pos_loss_right
                    ori_loss = ori_loss_left + ori_loss_right
                elif args.skill_type == 'pull':
                    target_batch = to_var(
                        torch.from_numpy(target_batch.squeeze()))

                    #TODO add flags for when we're training both arms
                    # output = recon_mu[0]  # right arm is index [0]
                    # output = recon_mu[1]  # left arm is index [1]

                    pos_loss_right = vae.mse(output_r[:, :3],
                                             target_batch[:, :3])
                    ori_loss_right = vae.rotation_loss(output_r[:, 3:],
                                                       target_batch[:, 3:])

                    pos_loss = pos_loss_right
                    ori_loss = ori_loss_right

            elif args.task == 'goal':
                target_batch = to_var(torch.from_numpy(target_batch.squeeze()))
                output = recon_mu
                if args.goal_rep == 'pose':
                    pos_loss = vae.mse(output[:, :3], target_batch[:, :3])
                    ori_loss = vae.rotation_loss(output[:, 3:],
                                                 target_batch[:, 3:])
                elif args.goal_rep == 'keypoints':
                    pos_loss = vae.mse(output, target_batch)
                    ori_loss = torch.zeros_like(pos_loss)

            elif args.task == 'transformation':
                target_batch = to_var(torch.from_numpy(target_batch.squeeze()))
                output = recon_mu
                pos_loss = vae.mse(output[:, :3], target_batch[:, :3])
                ori_loss = vae.rotation_loss(output[:, 3:],
                                             target_batch[:, 3:])

            recon_loss = pos_loss + ori_loss

            loss = kl_coeff * kl_loss + recon_loss
            loss.backward()
            vae.optimizer.step()

            epoch_total_loss = epoch_total_loss + loss.data
            epoch_kl_loss = epoch_kl_loss + kl_loss.data
            epoch_pos_loss = epoch_pos_loss + pos_loss.data
            epoch_ori_loss = epoch_ori_loss + ori_loss.data
            epoch_recon_loss = epoch_recon_loss + recon_loss.data

            writer.add_scalar('loss/train/ori_loss', ori_loss.data, i)
            writer.add_scalar('loss/train/pos_loss', pos_loss.data, i)
            writer.add_scalar('loss/train/kl_loss', kl_loss.data, i)

            if (i // batch_size) % args.batch_freq == 0:
                if args.skill_type == 'pull' or args.task == 'goal' or args.task == 'transformation':
                    print(
                        'Train Epoch: %d [%d/%d (%f)]\tLoss: %f\tKL: %f\tPos: %f\t Ori: %f'
                        % (epoch, i, dataset_size,
                           100.0 * i / dataset_size / batch_size, loss.item(),
                           kl_loss.item(), pos_loss.item(), ori_loss.item()))
                elif args.skill_type == 'grasp' and args.task == 'contact':
                    print(
                        'Train Epoch: %d [%d/%d (%f)]\tLoss: %f\tKL: %f\tR Pos: %f\t R Ori: %f\tL Pos: %f\tL Ori: %f'
                        % (epoch, i, dataset_size, 100.0 * i / dataset_size /
                           batch_size, loss.item(), kl_loss.item(),
                           pos_loss_right.item(), ori_loss_right.item(),
                           pos_loss_left.item(), ori_loss_left.item()))
        print(' --average loss: ')
        print(epoch_total_loss / (dataset_size / batch_size))
        loss_dict = {
            'epoch_total': epoch_total_loss / (dataset_size / batch_size),
            'epoch_kl': epoch_kl_loss / (dataset_size / batch_size),
            'epoch_pos': epoch_pos_loss / (dataset_size / batch_size),
            'epoch_ori': epoch_ori_loss / (dataset_size / batch_size),
            'epoch_recon': epoch_recon_loss / (dataset_size / batch_size)
        }
        total_loss.append(loss_dict)

        if epoch % args.save_freq == 0:
            print('\n--Saving model\n')
            print('time: ' + str(time.time() - start_time))

            save_state(net=vae,
                       torch_seed=torch_seed,
                       np_seed=np_seed,
                       args=args,
                       fname=os.path.join(
                           trained_model_path,
                           args.model_name + '_epoch_' + str(epoch) + '.pt'))

            np.savez(os.path.join(
                trained_model_path,
                args.model_name + '_epoch_' + str(epoch) + '_loss.npz'),
                     loss=np.asarray(total_loss))

    print('Done!')
    save_state(net=vae,
               torch_seed=torch_seed,
               np_seed=np_seed,
               args=args,
               fname=os.path.join(
                   trained_model_path,
                   args.model_name + '_epoch_' + str(epoch) + '.pt'))
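
The KL weight in this loop follows a geometric annealing schedule: kl_weight shrinks by a factor of kl_anneal_rate each epoch, so kl_coeff = 1 - kl_weight climbs from 0 toward 1 and the KL penalty is phased in gradually. A standalone illustration (the rate is an example value, not the script's default):

kl_weight = 1.0
kl_anneal_rate = 0.9  # example value only
for epoch in range(5):
    kl_coeff = 1 - kl_weight
    print(epoch, round(kl_coeff, 3))  # 0.0, 0.1, 0.19, 0.271, 0.344
    kl_weight = kl_anneal_rate * kl_weight
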
Example #6
def analyze_guides(name):
    """Analyze k-mers and find all candidate guideRNAs and their off-targets.

    Load project arguments, build and analyze a trie, find guideRNAs.
    Run after all files were generated by kmers.extract_process_kmers()

    Produce files:
    tries with all k-mers, whose values store a good/bad candidate
        guideRNA label and coordinates in the genome:
        <name>/kmers_tries/<name>_trie<i>.dat
    intermediate files with candidate guideRNA k-mers used as keys
        to the tries: <name>/<name>_triekeys_v?.txt.gz
    final list of guideRNAs: <name>/<name>_guides.txt.gz

    Args:
    name: project name; used to load project args and to name all output files

    Return:
    trie.trie object with all k-mers, their coordinates in the genome,
    and labels of good and bad candidate guideRNAs
    """
    # parts = 256

    util.print_log('start analyze_guides()')
    util.print_log('load arguments...')
    args = util.load_args(name)
    util.print_args(args)
    util.print_log('done')
    n = args['greateroffdist']
    parts = 4 ** n

    blacklist_dir = '%s/blacklist' % name
    if os.path.exists(blacklist_dir):
        util.print_log('blacklist directory already exists\n')
    else:
        os.mkdir(blacklist_dir)
        util.print_log('blacklist directory made\n')

    # in order to store classified files
    classified_dir = '%s/classifiedfiles' % name
    if os.path.exists(classified_dir):
        util.print_log('classifiedfiles directory already exists\n')
    else:
        os.makedirs('%s/kmers' % classified_dir)
        os.makedirs('%s/triekeys_v1' % classified_dir)
        os.makedirs('%s/triekeys_v2' % classified_dir)
        os.makedirs('%s/guides' % classified_dir)
        os.makedirs('%s/tempfiles' % classified_dir)
        util.print_log('classifiedfiles directory made\n')

    kmers_tries_dir = '%s/kmers_tries' % name
    if os.path.exists(kmers_tries_dir):
        util.print_log('kmers_tries directory already exists\n')
    else:
        os.mkdir(kmers_tries_dir)
        util.print_log('kmers_tries directory made\n')

    util.print_log('construct trie...')
    kmers_filename = '%s/%s_kmers_shuffled.txt.gz' % (name, name)
    util.print_log('load k-mers from %s' % kmers_filename)
    genome = args['genome']
    goodkeysfile = '%s/%s_triekeys_v1.txt.gz' % (name, name) \
                   if args['altpam'] else ''
    badkeysfile = ('%s/blacklist/%s_nonCandidate_triekeys_with_altpams'
                   '.txt.gz' % (name, name)) if args['altpam'] else ''
    if goodkeysfile:
        util.print_log('print candidate guideRNAs to %s' % goodkeysfile)

    tempdir = '%s/classifiedfiles/tempfiles' % name
    triekeys_v1_dir = '%s/classifiedfiles/triekeys_v1' % name
    triekeys_v1_filenames = ['%s/keys%s.txt.gz' % (triekeys_v1_dir, i)
                             for i in range(parts)]
    kmers_dir = '%s/classifiedfiles/kmers' % name
    kmers_filenames = ['%s/kmers%s.txt.gz' % (kmers_dir, i)
                       for i in range(parts)]

    kmers_trie = build_kmers_trie(kmers_filename, genome, name,
                                  altpam=args['altpam'], pampos=args['pampos'],
                                  maxcount=args['maxoffpos'],
                                  goodkeysfile=goodkeysfile,
                                  badkeysfile=badkeysfile, tempdir=tempdir,
                                  triekeys_v1_filenames=triekeys_v1_filenames,
                                  kmers_filenames=kmers_filenames,
                                  processes=args['processes'], n=n,
                                  parts=parts)
    util.print_log('done')


    util.print_log('label as bad guideRNAs multimapping k-mers in trie...')
    # keysinputfile = goodkeysfile if goodkeysfile else kmers_filename
    keysoutputfile = '%s/%s_triekeys_v2.txt.gz' % (name, name)
    nonCandidatekeysoutputfile = '%s/%s/%s_nonCandidate_triekeys_targetSites_with_multiple_perfect_hits.txt.gz' %\
                                 (name,'blacklist',name)
    util.print_log('read keys from %s and write to %s'
                   % (triekeys_v1_dir, keysoutputfile))

    triekeys_v2_dir = '%s/classifiedfiles/triekeys_v2' % name
    triekeys_v2_filenames = ['%s/keys%s.txt.gz' % (triekeys_v2_dir, i)
                             for i in range(parts)]

    filter_keys_trie(tempdir, kmers_trie, triekeys_v1_filenames, triekeys_v2_filenames, keysoutputfile,
                     nonCandidatekeysoutputfile, args['processes'], n, parts)
    util.print_log('done')


    util.print_log('assign correct counts to multimapping k-mers in trie...')
    count_filename = '%s/%s_kmers_counts.txt.gz' % (name, name)
    util.print_log('read counts from %s' % count_filename)
    kmers_trie = label_multimapping(kmers_trie, count_filename, n)
    util.print_log('done')


    util.print_log('label as bad guideRNAs k-mers in trie few mismatches away'
                   ' from other k-mers...')
    sim = args['sim'] - 1
    util.print_log('label as bad k-mers with other k-mer at distance <=%s'
                   % sim)
    keysfile = '%s/%s_triekeys_v2.txt.gz' % (name, name)
    util.print_log('read keys from %s' % keysfile)
    filter_trie_mismatch_similarity(tempdir, name, kmers_trie, sim,
                                    triekeys_v2_filenames, args['processes'],
                                    n, parts)
    util.print_log('done')


    util.print_log('produce list of good guideRNAs...')
    keysinputfile = keysfile
    keysoutputfile = '%s/%s_guides.txt.gz' % (name, name)
    nonCandidatekeysoutputfile = ('%s/blacklist/%s_nonCandidate_guides_with'
                                  '_mismatch_neighbors.txt.gz' % (name, name))
    util.print_log('read keys from %s and write to %s'
                   % (keysinputfile, keysoutputfile))

    guides_dir = '%s/classifiedfiles/guides' % name
    guides_filenames = ['%s/%s.txt.gz' % (guides_dir, i) for i in range(parts)]

    filter_keys_trie(tempdir, kmers_trie, triekeys_v2_filenames, guides_filenames, keysoutputfile,
                     nonCandidatekeysoutputfile, args['processes'], n, parts)
    util.print_log('done')

    badkeysfiles = ['%s/badkeys%s.txt.gz' % (tempdir, i) for i in range(parts)]
    for i in range(parts):
        if os.path.exists(badkeysfiles[i]):
            os.remove(badkeysfiles[i])

    util.print_log('save tries...')
    trie_filename = ['%s/kmers_tries/%s_trie%s.dat' % (name, name, i)
                     for i in range(parts)]
    # util.print_log('save in file %s' % trie_filename)
    save_trie(kmers_trie, trie_filename, parts)
    util.print_log('done')

    return kmers_trie
Example #7
def main():
    p = argparse.ArgumentParser(description='Produce BAM file with guideRNA'
                                            ' database from precomputed trie'
                                            ' and list of guideRNAs',
                                formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    p.add_argument('-n', dest='name', required=True,
                   help='project name, load previously saved arguments'
                        ' and save additional output')
    p.add_argument('--label', dest='label', required=True,
                   help='use in file name of output database for this run')
    p.add_argument('-g', dest='guidesfile', default='',
                   help='name of file with guideRNAs for which to compute'
                        ' BAM database; may be gzipped (.gz);'
                        ' if not provided, use all candidate guideRNAs'
                        ' found in the project')
    p.add_argument('-d', dest='offdist', type=int, default=3,
                   help='maximum Hamming distance to consider from guideRNA'
                        ' to its off-target;'
                        ' off-target is an alternative occurrence (with any'
                        ' PAM) of this guideRNA in the genome at Hamming' 
                        ' distance at most this number (including PAM);'
                        ' use -1 for omitting any off-target info in resulting'
                        ' BAM (works much faster)')
    p.add_argument('-k', dest='greateroffdist', type=int, default=4,
                   help='a number greater than offdist, used for preprocessed'
                        ' data (the length of the key for classifying'
                        ' guideRNAs)')
    p.add_argument('--maxoffcount', dest='maxoffcount', type=int, default=1000,
                   help='maximum number of off-targets to store for'
                        ' a guideRNA in a resulting BAM library;'
                        ' ignore if OFFDIST is -1')
    p.add_argument('-t', dest='processes', type=int, default=1,
                   help='how many processes to use; do not specify more'
                        ' than you have on your system;'
                        ' currently not implemented')
    args = p.parse_args()
    sam_args_dict = args.__dict__
    name = sam_args_dict['name']
    guides_filename = sam_args_dict['guidesfile']

    n = sam_args_dict['greateroffdist']
    parts = 4 ** n
    # parts = 256
    if not guides_filename:
        # guides_filename = '%s/%s_guides.txt.gz' % (name, name)
        guides_dir = '%s/classifiedfiles/guides' % name
        guides_filename = ['%s/%s.txt.gz' % (guides_dir, i) for i in range(parts)]

    util.print_log('local script arguments:')
    util.print_args(sam_args_dict)
    util.print_log('load main arguments...')
    args = util.load_args(name)
    util.print_args(args)
    util.print_log('done')

    # main
    trie_filename = ['%s/kmers_tries/%s_trie%s.dat' % (name, name, i)
                     for i in range(parts)]
    kmers_trie = guides.load_restore_trie(name, trie_filename, n, parts)
    produce_bam_custom(kmers_trie=kmers_trie,
                       name=name,
                       label=sam_args_dict['label'],
                       guides_filename=guides_filename,
                       args=args,
                       offdist=sam_args_dict['offdist'],
                       maxoffcount=sam_args_dict['maxoffcount'],
                       processes=sam_args_dict['processes'],
                       n=n,
                       parts=parts)
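
This script closes the loop with Example #6: load_restore_trie reads back the same <name>/kmers_tries/<name>_trie<i>.dat files that analyze_guides saved. A hypothetical invocation (the script name is assumed; the flags are the ones defined above):

python produce_bam_custom_main.py -n myproject --label run1 -d 3 -k 4
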
Example #8
import sys
import pathlib
import argparse
import numpy as np
import pandas as pd
from scipy.stats import describe

from cmapPy.math import fast_corr
from pycytominer.cyto_utils import infer_cp_features

from util import diffuse_wells, load_args

# Define command arguments
args = load_args()

data_dir = args.data_dir
output_dir = args.output_dir
profile_file = args.profile_file
diffusion = args.diffusion
mirror = args.mirror
drop_same_position = args.drop_same_position
l1000 = args.l1000

# Load common compounds
common_file = pathlib.Path(
    "..",
    "..",
    "..",
    "6.paper_figures",
    "data",
    "significant_compounds_by_threshold_both_assays.tsv.gz",