def __init__(self, config_file, include_vcf_features=False):
    """Initialise the trainer: read run config, set up output dirs and locate
    (or build) the pickled JARVIS training data.

    Parameters:
        config_file: path to the run configuration file.
        include_vcf_features: whether VCF-derived features are included
            downstream (stored only; not used in this constructor).

    NOTE(review): this method reads the free names `predict_on_test_set` and
    `test_indexes` — presumably module-level globals; confirm they are defined
    before instantiation.
    """
    self.config_file = config_file
    self.include_vcf_features = include_vcf_features

    config_params = custom_utils.get_config_params(config_file)
    self.win_len = config_params['win_len']
    #self.win_len = int(config_params['win_len'] / 2)

    # Sets up self.out_dir / self.ml_data_dir (defined outside this view).
    # NOTE(review): method name looks misspelled ('ouput') — defined elsewhere,
    # so it cannot be renamed here without breaking the class.
    self.init_ouput_dirs()
    print(self.out_dir)

    # Default pickle; when predicting on a held-out slice, key the file name
    # by the first/last test indexes instead.
    jarvis_pkl_file = self.ml_data_dir + '/jarvis_data.pkl'
    if predict_on_test_set:
        jarvis_pkl_file = self.ml_data_dir + '/jarvis_data.' + str(test_indexes[0]) + '_' + str(test_indexes[1]) + '.pkl'
    self.data_dict_file = jarvis_pkl_file

    # @anchor -- REDUNDANT check: the prepare_data.py module is already called
    # prior to training, so this branch normally never fires.
    if not os.path.exists(self.data_dict_file):
        print("\nPreparing training data - calling JarvisDataPreprocessing object...")
        # NOTE(review): these kwargs do not match the JarvisDataPreprocessing
        # __init__ signature visible elsewhere in this codebase
        # (config_file, input_features, chrom, NTHREADS) — confirm which
        # version of the class this file imports.
        data_preprocessor = JarvisDataPreprocessing(config_file, predict_on_test_set=predict_on_test_set, test_indexes=test_indexes)

        # Extract raw sequences from input variant windows and combine with
        # the original feature set.
        additional_features_df, filtered_onehot_seqs = data_preprocessor.compile_feature_table_incl_raw_seqs()

        # Merge data, transform into a form appropriate for DNN training and
        # save into file.
        self.data_dict_file = data_preprocessor.transform_and_save_data(additional_features_df, filtered_onehot_seqs)

    print(self.data_dict_file)
def __init__(self, config_file, genomic_class):
    """Set up per-genomic-class plotting state: the run config is loaded,
    metric tables start empty, and a colour palette (default + "Paired")
    is prepared both as RGB tuples and as hex strings."""
    # Load run parameters (kept for parity with the other initialisers).
    config_params = custom_utils.get_config_params(config_file)

    self.genomic_class = genomic_class
    self.tables_per_metric = {}

    # Extended palette: seaborn's default colours followed by "Paired".
    palette = sns.color_palette() + sns.color_palette("Paired")
    self.current_palette = palette
    self.hex_colors = [matplotlib.colors.to_hex(color) for color in palette]
def __init__(self, config_file, input_features, chrom, NTHREADS=20):
    """Initialise preprocessing state from the run config and create the
    output directory tree used by downstream JARVIS steps.

    Parameters:
        config_file: path to the run configuration file.
        input_features: identifier/path of the input feature table (stored).
        chrom: chromosome this instance processes (used in dir naming).
        NTHREADS: worker count for parallel steps (default 20).
    """
    print("Initialising new JarvisDataPreprocessing object...")
    self.input_features = input_features
    self.chrom = chrom
    self.NTHREADS = NTHREADS

    # ==== Read config parameters ====
    config_params = custom_utils.get_config_params(config_file)
    self.hg_version = config_params['hg_version']
    print('\n\nhg_version:', self.hg_version)
    # hg build -> GRCh release suffix, used to locate the reference FASTA/2bit.
    self.grch = {'hg19': '37', 'hg38': '38'}

    pathogenic_set = config_params['pathogenic_set']
    benign_set = config_params['benign_set']
    self.patho_benign_sets = pathogenic_set + '_' + benign_set
    self.win_len = config_params['win_len']
    #self.win_len = int(config_params['win_len'] / 2)
    self.Y_label = config_params['Y_label']

    # ==== Define dir structure ====
    out_dir = custom_utils.create_out_dir(config_file)
    self.ml_data_dir = out_dir + '/ml_data'
    self.seq_out_dir = self.ml_data_dir + '/raw_seq'
    self.feature_tables_dir = self.ml_data_dir + '/clinvar_feature_tables'
    self.jarvis_predictions_dir = self.ml_data_dir + '/jarvis_predictions'
    self.jarvis_predictions_per_chr_dir = self.jarvis_predictions_dir + '/chr' + str(self.chrom)

    # exist_ok=True replaces the race-prone exists()-then-makedirs() pattern
    # (TOCTOU) and creates any missing parents in one call.
    for dir_path in (self.ml_data_dir, self.seq_out_dir,
                     self.feature_tables_dir, self.jarvis_predictions_dir,
                     self.jarvis_predictions_per_chr_dir):
        os.makedirs(dir_path, exist_ok=True)

    # Specify input (static) files
    self.human_ref_genome_2bit = ('../' + self.hg_version + '/homo_sapiens_GRCh'
                                  + self.grch[self.hg_version] + '_FASTA/hsa'
                                  + self.grch[self.hg_version] + '.2bit')
if __name__ == '__main__':
    # Script entry point: per-chromosome run driven by CLI arguments.
    startTime = datetime.now()

    # CLI: <chrom> <config_file> <single_nt_offset>
    args = sys.argv
    chrom = args[1]
    config_file = args[2]  #'config.yaml'
    single_nt_offset = int(args[3])  # 1 to (win_len-1)

    # Read run parameters from config file and store into a dictionary
    config_params = get_config_params(config_file)
    print(config_params)

    hg_version = config_params['hg_version']
    # hg build -> GRCh release suffix used elsewhere for reference-file paths.
    grch = {'hg19': '37', 'hg38': '38'}

    # Build a genomic_class -> file-path map from a tab-separated listing
    # referenced by the config; each row is (class, path, <ignored 3rd field>).
    # NOTE(review): rows with a field count other than 3 will raise ValueError
    # on the unpacking below — presumably the listing is machine-generated.
    genomic_classes_files = {}
    print('cwd:', os.getcwd())
    with open(config_params['genomic_classes']) as fh:
        for line in fh:
            line = line.rstrip()
            genomic_class, cur_path, _ = line.split('\t')
            genomic_classes_files[genomic_class] = cur_path

    # ==================== Initialisation ====================
# ---- CLI arguments and run flags for the training driver ----
# CLI: <config_file> <input_features> <genomic_classes,csv> <fixed_cv 0|1> <cv_repeats>
config_file = sys.argv[1]
input_features = sys.argv[2]
genomic_classes = sys.argv[3]  # comma-separated
genomic_classes = genomic_classes.split(',')
use_fixed_cv_batches = bool(int(sys.argv[4]))
cv_repeats = int(sys.argv[5])

include_vcf_features = False
test_indexes = []

# ----------------------
train_with_cv = False  # get generalised performance with cross-validation

# e.g. 'dir/config.coding.yaml' -> 'coding'.
# Raw string fixes the invalid '\.' escape in the regex literal
# (DeprecationWarning since Python 3.6, SyntaxWarning in newer releases).
config_suffix = re.split(r"\.", config_file.split('/')[-1])[1]
print('config_suffix:', config_suffix)

run_params = custom_utils.get_config_params(config_file)

# [Note]: predict_on_test_set is __redundant__ and __deprecated__
#predict_on_test_set = bool(run_params['predict_on_test_set'])
predict_on_test_set = False

# -- Compatible only with: train_with_cv = True
# *************************************
use_pathogenicity_trained_model = True
use_conservation_trained_model = False
# *************************************

# sanity check: a pathogenicity-trained model requires cross-validated training
if use_pathogenicity_trained_model:
    train_with_cv = True