def extract_process_kmers(name):
    """Extract k-mers from genomic sequence and run initial processing.

    Load project arguments and produce three files:
    extract k-mers from the genome: <name>/<name>_kmers.txt.gz
    shuffle all extracted k-mers: <name>/<name>_kmers_shuffled.txt.gz
    count occurrences of k-mers: <name>/<name>_kmers_counts.txt.gz

    Args:
    name: project name, used to get project args and in all output
    """
    util.print_log('start extract_process_kmers()')
    util.print_log('load arguments...')
    args = util.load_args(name)
    util.print_args(args)
    util.print_log('done')

    util.print_log('load FASTA...')
    util.print_log('load from %s' % args['fasta'])
    fasta = load_fasta(args['fasta'])
    util.print_log('done')

    util.print_log('extract k-mers...')
    kmers_filename = '%s/%s_kmers.txt.gz' % (name, name)
    allpams = [args['pam']] + args['altpam']
    util.print_log('write in file %s' % kmers_filename)
    genome = extract_kmers(name=name, fasta=fasta, length=args['length'],
                           pams=allpams, pampos=args['pampos'],
                           filename=kmers_filename, chroms=args['chrom'],
                           minchrlen=args['minchrlen'],
                           processes=args['processes'])
    sys.stdout.write('genome: %s' % genome)
    util.print_log('save genome info')
    args['genome'] = genome
    util.save_args(args)
    util.print_log('calculate k-mer statistics')
    print_stats_kmers(kmers_filename, gnupath=args['gnupath'])
    util.print_log('done')

    util.print_log('shuffle k-mers...')
    kmers_shuffled_filename = '%s/%s_kmers_shuffled.txt.gz' % (name, name)
    util.print_log('write in file %s' % kmers_shuffled_filename)
    shuffle_kmers(fileinput=kmers_filename,
                  fileoutput=kmers_shuffled_filename,
                  gnupath=args['gnupath'])
    util.print_log('done')

    util.print_log('count k-mers...')
    count_filename = '%s/%s_kmers_counts.txt.gz' % (name, name)
    util.print_log('write in file %s' % count_filename)
    sort_count_kmers(fileinput=kmers_filename, fileoutput=count_filename,
                     mincount=args['maxoffpos'],
                     gnupath=args['gnupath'])
    util.print_log('done')

    return True
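
# Hedged sketch (assumption, not part of the original module): sort_count_kmers()
# is not shown in this excerpt; given the gnupath argument it plausibly shells
# out to GNU sort/uniq. A Python 3 approximation of "count occurrences of each
# k-mer and keep those seen at least mincount times" could look like the
# function below; the input format (k-mer in the first whitespace-separated
# column) and the filtering rule are assumptions.
def example_count_kmers(fileinput, fileoutput, mincount):
    """Illustrative only; the project's sort_count_kmers() may differ."""
    import collections
    import gzip
    counts = collections.Counter()
    with gzip.open(fileinput, 'rt') as f:
        for line in f:
            if not line.strip():
                continue
            counts[line.split()[0]] += 1
    with gzip.open(fileoutput, 'wt') as out:
        for kmer, count in sorted(counts.items()):
            if count >= mincount:
                out.write('%s\t%d\n' % (kmer, count))
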
def produce_bams_main(kmers_trie, name):
    """Produce BAM file with all guideRNAs and info about their off-targets.

    Run after all files and trie were generated by
    kmers.extract_process_kmers() and guides.analyze_guides().

    Produce files:
    sorted BAM file with off-target info: <name>/<name>_guides.bam
    index for the BAM file with off-target info: <name>/<name>_guides.bam.bai
    also, BAM file and index for all guideRNAs without any off-target info
    (produced much faster):
    <name>/<name>_guides_nooff.bam
    <name>/<name>_guides_nooff.bam.bai

    Args:
    kmers_trie: trie.trie object as produced by guides.analyze_guides()
    name: project name, used to get project args and in all output
    """
    util.print_log('start produce_bams_main()')
    util.print_log('load arguments...')
    args = util.load_args(name)
    util.print_args(args)
    util.print_log('done')

    util.print_log('produce SAM file with guideRNAs only (no off-targets)...')
    n = args['greateroffdist']
    parts = 4 ** n
    guides_dir = '%s%s' % (name, '/classifiedfiles/guides')
    guides_filenames = ['%s/%s.txt.gz' % (guides_dir, i)
                        for i in range(parts)]
    util.print_log('read guides from %s' % guides_dir)
    produce_bam_custom(kmers_trie=kmers_trie, name=name, label='nooff',
                       guides_filename=guides_filenames, args=args,
                       offdist=-1,  # -1 for no off-targets
                       maxoffcount=args['maxoffcount'],
                       processes=args['processes'],
                       n=n, parts=parts)
    util.print_log('done')

    if args['offdist'] != -1:
        util.print_log('produce SAM file with guideRNAs'
                       ' and off-target info...')
        util.print_log('read guides from %s' % guides_dir)
        produce_bam_custom(kmers_trie=kmers_trie, name=name,
                           label='offdist%s' % args['offdist'],
                           guides_filename=guides_filenames, args=args,
                           offdist=args['offdist'],
                           maxoffcount=args['maxoffcount'],
                           processes=args['processes'],
                           n=n, parts=parts)
        util.print_log('done')
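
# Hedged sketch (assumption): the guides are spread over parts = 4 ** n files,
# which suggests indexing by a DNA prefix of length n. The mapping below is a
# minimal illustration only; produce_bam_custom() may partition guides
# differently.
def example_prefix_to_part(seq, n):
    """Map the first n bases of a sequence to an index in [0, 4 ** n)."""
    base_to_digit = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    index = 0
    for base in seq[:n].upper():
        index = index * 4 + base_to_digit[base]
    return index

# e.g. example_prefix_to_part('ACGTACGT', 4) == 27
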
if str(args.load_from)[:9] == "Tokenizer":
    print("resume training the tokenizer..." + args.load_from)
    # path to the folder that stores the trained model, if any
    CHECKPOINTS = CHECKPOINTS_TOKENIZER
    if args.over_write == 1:
        args.save_to = args.load_from  # overwrite the weights
        print("overwrite the weights to ", args.save_to)
# if starting to train a new model (with or without downloading an LM)
elif str(args.load_from)[:2] == "LM":
    CHECKPOINTS = CHECKPOINTS_LM
    print("download the language model from " + args.load_from)

# get the network structure from the loaded model
if True:
    imported_model = os.path.join(CHECKPOINTS_TOKENIZER, args.load_from)
    args_dict = util.load_args(imported_model)
    args.char_embedding_size = args_dict["char_embedding_size"]
    args.hidden_dim = args_dict["hidden_dim"]
    args.layer_num = args_dict["layer_num"]
    # learning_rate = args_dict["learning_rate"]
    args.clip_grad = args_dict["clip_grad"]
    args.sequence_length = args_dict["sequence_length"]
    args.batchSize = args_dict["batchSize"]
    args.lstm_num_direction = args_dict["lstm_num_direction"]
    # sgd_momentum = args_dict["sgd_momentum"]
    args.len_lines_per_chunk = args_dict["len_lines_per_chunk"]
    args.optim = args_dict["optim"]
    print("get the network structure from the loaded model...")

# set the default note
if args.add_note is None:
# 1 to overwrite the old model when resuming, 0 to save it as a new model
parser.add_argument("--over_write", type=int, default=0)
parser.add_argument("--add_note", type=str)

args = parser.parse_args()
args_dict = vars(args)
train = True
print()

if args.load_from is None:
    print("===========start training a language model===========")
else:
    print("===========resume the training of " + str(args.load_from) + "===========")
    json_path = os.path.join(CHECKPOINTS_LM, args.load_from)
    args_dict = util.load_args(json_path)
    args.char_embedding_size = args_dict["char_embedding_size"]
    args.hidden_dim = args_dict["hidden_dim"]
    args.layer_num = args_dict["layer_num"]
    args.clip_grad = args_dict["clip_grad"]
    args.sequence_length = args_dict["sequence_length"]
    args.batchSize = args_dict["batchSize"]
    args.lstm_num_direction = args_dict["lstm_num_direction"]
    args.len_lines_per_chunk = args_dict["len_lines_per_chunk"]
    # args.optim = args_dict["optim"]
    print("set up the network structure...")

# set a default note
if args.add_note is None:
    args.add_note = str(args.dataset) + " , " + str(args.learning_rate) \
        + ", epoch " + str(args.epoch)
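
# Hedged sketch (assumption): util.load_args() is not shown in this excerpt.
# Based on how its return value is indexed above, it plausibly reads a saved
# JSON dictionary of hyperparameters; a minimal stand-in could look like this.
def example_load_args(json_path):
    """Illustrative only; the real util.load_args() may behave differently."""
    import json
    with open(json_path) as f:
        return json.load(f)
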
def main(args):
    # cfg_file = os.path.join(args.example_config_path, args.primitive) + ".yaml"
    cfg = get_vae_defaults()
    # cfg.merge_from_file(cfg_file)
    cfg.freeze()

    batch_size = args.batch_size
    dataset_size = args.total_data_size

    if args.experiment_name is None:
        experiment_name = args.model_name
    else:
        experiment_name = args.experiment_name

    if not os.path.exists(os.path.join(args.log_dir, experiment_name)):
        os.makedirs(os.path.join(args.log_dir, experiment_name))

    description_txt = raw_input('Please enter experiment notes: \n')
    if isinstance(description_txt, str):
        with open(os.path.join(args.log_dir, experiment_name,
                               experiment_name + '_description.txt'),
                  'wb') as f:
            f.write(description_txt)

    writer = SummaryWriter(os.path.join(args.log_dir, experiment_name))

    # torch_seed = np.random.randint(low=0, high=1000)
    # np_seed = np.random.randint(low=0, high=1000)
    torch_seed = 0
    np_seed = 0

    torch.manual_seed(torch_seed)
    np.random.seed(np_seed)

    trained_model_path = os.path.join(args.model_path, args.model_name)
    if not os.path.exists(trained_model_path):
        os.makedirs(trained_model_path)

    if args.task == 'contact':
        if args.start_rep == 'keypoints':
            start_dim = 24
        elif args.start_rep == 'pose':
            start_dim = 7

        if args.goal_rep == 'keypoints':
            goal_dim = 24
        elif args.goal_rep == 'pose':
            goal_dim = 7

        if args.skill_type == 'pull':
            # + 7 because single arm palm pose
            input_dim = start_dim + goal_dim + 7
        else:
            # + 14 because both arms palm pose
            input_dim = start_dim + goal_dim + 14
        output_dim = 7
        decoder_input_dim = start_dim + goal_dim

        vae = VAE(input_dim, output_dim, args.latent_dimension,
                  decoder_input_dim,
                  hidden_layers=cfg.ENCODER_HIDDEN_LAYERS_MLP,
                  lr=args.learning_rate)
    elif args.task == 'goal':
        if args.start_rep == 'keypoints':
            start_dim = 24
        elif args.start_rep == 'pose':
            start_dim = 7

        if args.goal_rep == 'keypoints':
            goal_dim = 24
        elif args.goal_rep == 'pose':
            goal_dim = 7

        input_dim = start_dim + goal_dim
        output_dim = goal_dim
        decoder_input_dim = start_dim

        vae = GoalVAE(input_dim, output_dim, args.latent_dimension,
                      decoder_input_dim,
                      hidden_layers=cfg.ENCODER_HIDDEN_LAYERS_MLP,
                      lr=args.learning_rate)
    elif args.task == 'transformation':
        input_dim = args.input_dimension
        output_dim = args.output_dimension
        decoder_input_dim = args.input_dimension - args.output_dimension

        vae = GoalVAE(input_dim, output_dim, args.latent_dimension,
                      decoder_input_dim,
                      hidden_layers=cfg.ENCODER_HIDDEN_LAYERS_MLP,
                      lr=args.learning_rate)
    else:
        raise ValueError('training task not recognized')

    if torch.cuda.is_available():
        vae.encoder.cuda()
        vae.decoder.cuda()

    if args.start_epoch > 0:
        start_epoch = args.start_epoch
        num_epochs = args.num_epochs
        fname = os.path.join(
            trained_model_path,
            args.model_name + '_epoch_%d.pt' % args.start_epoch)
        torch_seed, np_seed = load_seed(fname)
        load_net_state(vae, fname)
        load_opt_state(vae, fname)
        args = load_args(fname)
        args.start_epoch = start_epoch
        args.num_epochs = num_epochs
        torch.manual_seed(torch_seed)
        np.random.seed(np_seed)

    data_dir = args.data_dir
    data_loader = DataLoader(data_dir=data_dir)
    data_loader.create_random_ordering(size=dataset_size)

    dataset = data_loader.load_dataset(start_rep=args.start_rep,
                                       goal_rep=args.goal_rep,
                                       task=args.task)

    total_loss = []
    start_time = time.time()

    print('Saving models to: ' + trained_model_path)
    kl_weight = 1.0
    print('Starting on epoch: ' + str(args.start_epoch))

    for epoch in range(args.start_epoch, args.start_epoch + args.num_epochs):
        print('Epoch: ' + str(epoch))
        epoch_total_loss = 0
        epoch_kl_loss = 0
        epoch_pos_loss = 0
        epoch_ori_loss = 0
        epoch_recon_loss = 0

        kl_coeff = 1 - kl_weight
        kl_weight = args.kl_anneal_rate * kl_weight
        print('KL coeff: ' + str(kl_coeff))

        for i in range(0, dataset_size, batch_size):
            vae.optimizer.zero_grad()

            input_batch, decoder_input_batch, target_batch = \
                data_loader.sample_batch(dataset, i, batch_size)

            input_batch = to_var(torch.from_numpy(input_batch))
            decoder_input_batch = to_var(torch.from_numpy(decoder_input_batch))

            z, recon_mu, z_mu, z_logvar = vae.forward(input_batch,
                                                      decoder_input_batch)

            kl_loss = vae.kl_loss(z_mu, z_logvar)

            if args.task == 'contact':
                output_r, output_l = recon_mu
                if args.skill_type == 'grasp':
                    target_batch_right = to_var(
                        torch.from_numpy(target_batch[:, 0]))
                    target_batch_left = to_var(
                        torch.from_numpy(target_batch[:, 1]))

                    pos_loss_right = vae.mse(output_r[:, :3],
                                             target_batch_right[:, :3])
                    ori_loss_right = vae.rotation_loss(
                        output_r[:, 3:], target_batch_right[:, 3:])

                    pos_loss_left = vae.mse(output_l[:, :3],
                                            target_batch_left[:, :3])
                    ori_loss_left = vae.rotation_loss(
                        output_l[:, 3:], target_batch_left[:, 3:])

                    pos_loss = pos_loss_left + pos_loss_right
                    ori_loss = ori_loss_left + ori_loss_right
                elif args.skill_type == 'pull':
                    target_batch = to_var(
                        torch.from_numpy(target_batch.squeeze()))
                    # TODO add flags for when we're training both arms
                    # output = recon_mu[0]  # right arm is index [0]
                    # output = recon_mu[1]  # left arm is index [1]
                    pos_loss_right = vae.mse(output_r[:, :3],
                                             target_batch[:, :3])
                    ori_loss_right = vae.rotation_loss(output_r[:, 3:],
                                                       target_batch[:, 3:])
                    pos_loss = pos_loss_right
                    ori_loss = ori_loss_right
            elif args.task == 'goal':
                target_batch = to_var(torch.from_numpy(target_batch.squeeze()))
                output = recon_mu
                if args.goal_rep == 'pose':
                    pos_loss = vae.mse(output[:, :3], target_batch[:, :3])
                    ori_loss = vae.rotation_loss(output[:, 3:],
                                                 target_batch[:, 3:])
                elif args.goal_rep == 'keypoints':
                    pos_loss = vae.mse(output, target_batch)
                    ori_loss = torch.zeros(pos_loss.shape)
            elif args.task == 'transformation':
                target_batch = to_var(torch.from_numpy(target_batch.squeeze()))
                output = recon_mu
                pos_loss = vae.mse(output[:, :3], target_batch[:, :3])
                ori_loss = vae.rotation_loss(output[:, 3:], target_batch[:, 3:])

            recon_loss = pos_loss + ori_loss
            loss = kl_coeff * kl_loss + recon_loss

            loss.backward()
            vae.optimizer.step()

            epoch_total_loss = epoch_total_loss + loss.data
            epoch_kl_loss = epoch_kl_loss + kl_loss.data
            epoch_pos_loss = epoch_pos_loss + pos_loss.data
            epoch_ori_loss = epoch_ori_loss + ori_loss.data
            epoch_recon_loss = epoch_recon_loss + recon_loss.data

            writer.add_scalar('loss/train/ori_loss', ori_loss.data, i)
            writer.add_scalar('loss/train/pos_loss', pos_loss.data, i)
            writer.add_scalar('loss/train/kl_loss', kl_loss.data, i)

            if (i / batch_size) % args.batch_freq == 0:
                if args.skill_type == 'pull' or args.task == 'goal' or args.task == 'transformation':
                    print(
                        'Train Epoch: %d [%d/%d (%f)]\tLoss: %f\tKL: %f\tPos: %f\t Ori: %f' % (
                            epoch, i, dataset_size,
                            100.0 * i / dataset_size / batch_size,
                            loss.item(), kl_loss.item(),
                            pos_loss.item(), ori_loss.item()))
                elif args.skill_type == 'grasp' and args.task == 'contact':
                    print(
                        'Train Epoch: %d [%d/%d (%f)]\tLoss: %f\tKL: %f\tR Pos: %f\t R Ori: %f\tL Pos: %f\tL Ori: %f' % (
                            epoch, i, dataset_size,
                            100.0 * i / dataset_size / batch_size,
                            loss.item(), kl_loss.item(),
                            pos_loss_right.item(), ori_loss_right.item(),
                            pos_loss_left.item(), ori_loss_left.item()))

        print(' --average loss: ')
        print(epoch_total_loss / (dataset_size / batch_size))
        loss_dict = {
            'epoch_total': epoch_total_loss / (dataset_size / batch_size),
            'epoch_kl': epoch_kl_loss / (dataset_size / batch_size),
            'epoch_pos': epoch_pos_loss / (dataset_size / batch_size),
            'epoch_ori': epoch_ori_loss / (dataset_size / batch_size),
            'epoch_recon': epoch_recon_loss / (dataset_size / batch_size)
        }
        total_loss.append(loss_dict)

        if epoch % args.save_freq == 0:
            print('\n--Saving model\n')
            print('time: ' + str(time.time() - start_time))
            save_state(net=vae, torch_seed=torch_seed, np_seed=np_seed,
                       args=args,
                       fname=os.path.join(
                           trained_model_path,
                           args.model_name + '_epoch_' + str(epoch) + '.pt'))
            np.savez(os.path.join(
                trained_model_path,
                args.model_name + '_epoch_' + str(epoch) + '_loss.npz'),
                loss=np.asarray(total_loss))

    print('Done!')
    save_state(net=vae, torch_seed=torch_seed, np_seed=np_seed,
               args=args,
               fname=os.path.join(
                   trained_model_path,
                   args.model_name + '_epoch_' + str(epoch) + '.pt'))
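
# The epoch loop above anneals the KL term: kl_coeff = 1 - kl_weight and
# kl_weight is multiplied by args.kl_anneal_rate once per epoch, so with a
# rate below 1 the coefficient ramps from 0 toward 1. A standalone
# illustration (the default rate here is arbitrary, not taken from the code):
def example_kl_schedule(num_epochs, kl_anneal_rate=0.9):
    """Return the per-epoch KL coefficients implied by the loop above."""
    kl_weight = 1.0
    coeffs = []
    for _ in range(num_epochs):
        coeffs.append(1 - kl_weight)
        kl_weight = kl_anneal_rate * kl_weight
    return coeffs

# example_kl_schedule(5) -> approximately [0.0, 0.1, 0.19, 0.271, 0.344]
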
def analyze_guides(name):
    """Analyze k-mers and find all candidate guideRNAs and their off-targets.

    Load project arguments, build and analyze a trie, find guideRNAs.
    Run after all files were generated by kmers.extract_process_kmers().

    Produce files:
    trie with all k-mers, values store label for good or bad candidate
    guideRNA and coordinates in the genome: <name>/<name>_trie.dat
    intermediate files with candidate guideRNA k-mers used as keys
    to the trie: <name>/<name>_triekeys_v?.txt.gz
    final list of guideRNAs: <name>/<name>_guides.txt.gz

    Args:
    name: project name, used to get project args and in all output

    Return:
    trie.trie object with all k-mers, their coordinates in the genome,
    and labels of good and bad candidate guideRNAs
    """
    util.print_log('start analyze_guides()')
    util.print_log('load arguments...')
    args = util.load_args(name)
    util.print_args(args)
    util.print_log('done')

    n = args['greateroffdist']
    parts = 4 ** n

    if os.path.exists('%s%s' % (name, '/blacklist')):
        util.print_log('blacklist directory already exists \n')
    else:
        os.mkdir('%s%s' % (name, '/blacklist'))
        util.print_log('blacklist directory made \n')

    # in order to store classified files
    if os.path.exists('%s%s' % (name, '/classifiedfiles')):
        util.print_log('classifiedfiles directory already exists \n')
    else:
        os.makedirs('%s%s' % (name, '/classifiedfiles/kmers'))
        os.makedirs('%s%s' % (name, '/classifiedfiles/triekeys_v1'))
        os.makedirs('%s%s' % (name, '/classifiedfiles/triekeys_v2'))
        os.makedirs('%s%s' % (name, '/classifiedfiles/guides'))
        os.makedirs('%s%s' % (name, '/classifiedfiles/tempfiles'))
        util.print_log('classifiedfiles directory made \n')

    if os.path.exists('%s%s' % (name, '/kmers_tries')):
        util.print_log('kmers_tries directory already exists \n')
    else:
        os.mkdir('%s%s' % (name, '/kmers_tries'))
        util.print_log('kmers_tries directory made \n')

    util.print_log('construct trie...')
    kmers_filename = '%s/%s_kmers_shuffled.txt.gz' % (name, name)
    util.print_log('load k-mers from %s' % kmers_filename)
    genome = args['genome']
    goodkeysfile = ('%s/%s_triekeys_v1.txt.gz' % (name, name)
                    if args['altpam'] else '')
    badkeysfile = ('%s/%s/%s_nonCandidate_triekeys_with_altpams.txt.gz'
                   % (name, 'blacklist', name) if args['altpam'] else '')
    if goodkeysfile:
        util.print_log('print candidate guideRNAs to %s' % goodkeysfile)
    tempdir = '%s%s' % (name, '/classifiedfiles/tempfiles')
    triekeys_v1_dir = '%s%s' % (name, '/classifiedfiles/triekeys_v1')
    triekeys_v1_filenames = ['%s/keys%s.txt.gz' % (triekeys_v1_dir, i)
                             for i in range(parts)]
    kmers_dir = '%s%s' % (name, '/classifiedfiles/kmers')
    kmers_filenames = ['%s/kmers%s.txt.gz' % (kmers_dir, i)
                       for i in range(parts)]
    kmers_trie = build_kmers_trie(kmers_filename, genome, name,
                                  altpam=args['altpam'],
                                  pampos=args['pampos'],
                                  maxcount=args['maxoffpos'],
                                  goodkeysfile=goodkeysfile,
                                  badkeysfile=badkeysfile,
                                  tempdir=tempdir,
                                  triekeys_v1_filenames=triekeys_v1_filenames,
                                  kmers_filenames=kmers_filenames,
                                  processes=args['processes'],
                                  n=n, parts=parts)
    util.print_log('done')

    util.print_log('label as bad guideRNAs multimapping k-mers in trie...')
    keysoutputfile = '%s/%s_triekeys_v2.txt.gz' % (name, name)
    nonCandidatekeysoutputfile = \
        '%s/%s/%s_nonCandidate_triekeys_targetSites_with_multiple_perfect_hits.txt.gz' \
        % (name, 'blacklist', name)
    util.print_log('read keys from %s and write to %s'
                   % (triekeys_v1_dir, keysoutputfile))
    triekeys_v2_dir = '%s%s' % (name, '/classifiedfiles/triekeys_v2')
    triekeys_v2_filenames = ['%s/keys%s.txt.gz' % (triekeys_v2_dir, i)
                             for i in range(parts)]
    filter_keys_trie(tempdir, kmers_trie, triekeys_v1_filenames,
                     triekeys_v2_filenames, keysoutputfile,
                     nonCandidatekeysoutputfile,
                     args['processes'], n, parts)
    util.print_log('done')

    util.print_log('assign correct counts to multimapping k-mers in trie...')
    count_filename = '%s/%s_kmers_counts.txt.gz' % (name, name)
    util.print_log('read counts from %s' % count_filename)
    kmers_trie = label_multimapping(kmers_trie, count_filename, n)
    util.print_log('done')

    util.print_log('label as bad guideRNAs k-mers in trie few mismatches away'
                   ' from other k-mers...')
    sim = args['sim'] - 1
    util.print_log('label as bad k-mers with other k-mer at distance <=%s'
                   % sim)
    keysfile = '%s/%s_triekeys_v2.txt.gz' % (name, name)
    util.print_log('read keys from %s' % keysfile)
    filter_trie_mismatch_similarity(tempdir, name, kmers_trie,
                                    args['sim'] - 1, triekeys_v2_filenames,
                                    args['processes'], n, parts)
    util.print_log('done')

    util.print_log('produce list of good guideRNAs...')
    keysinputfile = keysfile
    keysoutputfile = '%s/%s_guides.txt.gz' % (name, name)
    nonCandidatekeysoutputfile = \
        '%s/%s/%s_nonCandidate_guides_with_mismatch_neighbors.txt.gz' \
        % (name, 'blacklist', name)
    util.print_log('read keys from %s and write to %s'
                   % (keysinputfile, keysoutputfile))
    guides_dir = '%s%s' % (name, '/classifiedfiles/guides')
    guides_filenames = ['%s/%s.txt.gz' % (guides_dir, i)
                        for i in range(parts)]
    filter_keys_trie(tempdir, kmers_trie, triekeys_v2_filenames,
                     guides_filenames, keysoutputfile,
                     nonCandidatekeysoutputfile,
                     args['processes'], n, parts)
    util.print_log('done')

    badkeysfiles = ['%s/badkeys%s.txt.gz' % (tempdir, i)
                    for i in range(parts)]
    for i in range(parts):
        if os.path.exists(badkeysfiles[i]):
            os.remove(badkeysfiles[i])

    util.print_log('save tries...')
    trie_filename = ['%s/%s/%s_trie%s.dat' % (name, 'kmers_tries', name, i)
                     for i in range(parts)]
    save_trie(kmers_trie, trie_filename, parts)
    util.print_log('done')

    return kmers_trie
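
# Hypothetical driver (not in the original code): the docstrings above state
# the intended order -- kmers.extract_process_kmers(), then analyze_guides(),
# then produce_bams_main(). In the original project these functions live in
# separate modules; here they are called as defined in this excerpt.
def example_run_pipeline(project_name='myguides'):
    """Illustrative only: run the documented three-step pipeline."""
    extract_process_kmers(project_name)          # extract, shuffle, count k-mers
    kmers_trie = analyze_guides(project_name)    # build trie, filter guideRNAs
    produce_bams_main(kmers_trie, project_name)  # write BAM databases
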
def main():
    p = argparse.ArgumentParser(
        description='Produce BAM file with guideRNA database'
                    ' from precomputed trie and list of guideRNAs',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    p.add_argument('-n', dest='name', default='myguides', required=True,
                   help='project name, load previously saved arguments'
                        ' and save additional output')
    p.add_argument('--label', dest='label', default='test', required=True,
                   help='use in file name of output database for this run')
    p.add_argument('-g', dest='guidesfile', default='',
                   help='name of file with guideRNAs for which to compute'
                        ' BAM database; may be gzipped (.gz);'
                        ' if not provided, use all candidate guideRNAs'
                        ' found in the project')
    p.add_argument('-d', dest='offdist', type=int, default=3,
                   help='maximum Hamming distance to consider from guideRNA'
                        ' to its off-target;'
                        ' off-target is an alternative occurrence (with any'
                        ' PAM) of this guideRNA in the genome at Hamming'
                        ' distance at most this number (including PAM);'
                        ' use -1 for omitting any off-target info in resulting'
                        ' BAM (works much faster)')
    p.add_argument('-k', dest='greateroffdist', type=int, default=4,
                   help='a number greater than offdist used for preprocessed'
                        ' data (the length of key for classifying guideRNAs)')
    p.add_argument('--maxoffcount', dest='maxoffcount', type=int, default=1000,
                   help='maximum number of off-targets to store for'
                        ' a guideRNA in a resulting BAM library;'
                        ' ignored if OFFDIST is -1')
    p.add_argument('-t', dest='processes', type=int, default=1,
                   help='how many processes to use; do not specify more'
                        ' than you have on your system;'
                        ' currently not implemented')
    args = p.parse_args()
    sam_args_dict = args.__dict__

    name = sam_args_dict['name']
    guides_filename = sam_args_dict['guidesfile']
    n = sam_args_dict['greateroffdist']
    parts = 4 ** n
    if not guides_filename:
        guides_dir = '%s%s' % (name, '/classifiedfiles/guides')
        guides_filename = ['%s/%s.txt.gz' % (guides_dir, i)
                           for i in range(parts)]

    util.print_log('local script arguments:')
    util.print_args(sam_args_dict)
    util.print_log('load main arguments...')
    args = util.load_args(name)
    util.print_args(args)
    util.print_log('done')

    # main
    trie_filename = ['%s/%s/%s_trie%s.dat' % (name, 'kmers_tries', name, i)
                     for i in range(parts)]
    kmers_trie = guides.load_restore_trie(name, trie_filename, n, parts)
    produce_bam_custom(kmers_trie=kmers_trie, name=name,
                       label=sam_args_dict['label'],
                       guides_filename=guides_filename, args=args,
                       offdist=sam_args_dict['offdist'],
                       maxoffcount=sam_args_dict['maxoffcount'],
                       processes=sam_args_dict['processes'],
                       n=n, parts=parts)
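
# Example invocation (the script filename is not shown in this excerpt, so
# "produce_bams.py" below is a placeholder):
#
#     python produce_bams.py -n myguides --label offdist3 -d 3 -k 4 \
#         --maxoffcount 1000 -t 1
#
# Note that -n and --label set both required=True and a default; argparse never
# uses the default for a required option, so both must be supplied explicitly.
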
import sys
import pathlib
import argparse

import numpy as np
import pandas as pd
from scipy.stats import describe

from cmapPy.math import fast_corr
from pycytominer.cyto_utils import infer_cp_features

from util import diffuse_wells, load_args

# Define command arguments
args = load_args()
data_dir = args.data_dir
output_dir = args.output_dir
profile_file = args.profile_file
diffusion = args.diffusion
mirror = args.mirror
drop_same_position = args.drop_same_position
l1000 = args.l1000

# Load common compounds
common_file = pathlib.Path(
    "..", "..", "..", "6.paper_figures", "data",
    "significant_compounds_by_threshold_both_assays.tsv.gz",