def main():
    """Parse the command line and compute summary statistics across cell types.

    Positional arguments read from sys.argv:
        1. cg_dir: input directory, validated with helper.check_dir_exist.
        2. out_dir: output directory, prepared with helper.make_dir.
        3. num_chromHMM_model: integer parsed by
           helper.get_command_line_integer.
        4. num_score_bins: integer parsed by helper.get_command_line_integer.
        5. cell_type_list_fn: line-separated file listing the cell types.

    usage() (defined elsewhere in this file) is called when the number of
    arguments is wrong.
    """
    if len(sys.argv) != 6:
        usage()
    cg_dir = sys.argv[1]
    helper.check_dir_exist(cg_dir)
    out_dir = sys.argv[2]
    helper.make_dir(out_dir)
    num_chromHMM_model = helper.get_command_line_integer(sys.argv[3])
    num_score_bins = helper.get_command_line_integer(sys.argv[4])
    cell_type_list_fn = sys.argv[5]
    # Validate the file BEFORE reading it; the original read the list first,
    # so the existence check could never protect the read.
    helper.check_file_exist(cell_type_list_fn)
    ct_list = helper.get_list_from_line_seperated_file(cell_type_list_fn)
    print("Done getting command line arguments")
    calculate_summary_staistics_across_ct(cg_dir, out_dir, num_chromHMM_model,
                                          num_score_bins, ct_list)
    print("Done!")
def main():
    """Train a model on sampled segmentation data and predict one cell type.

    Positional arguments read from sys.argv:
        1. train_segment_fn: file with the training segmentation data.
        2. all_ct_segment_folder: folder where the segmentation data of all
           cell types are combined, stored in files corresponding to
           different regions in the genome.
        3. predict_outDir: output folder for predictions (created through
           helper.make_dir).
        4. response_ct: cell type to predict.
        5. num_chromHMM_state: positive integer.
        6. num_train_ct: positive integer; must equal the number of
           trailing cell type arguments.
        7. train_mode: passed through to the training/prediction functions.
        8+. the cell types used to train the model.

    usage() (defined elsewhere in this file) is called on any invalid input.
    """
    num_mandatory_args = 8
    if len(sys.argv) < num_mandatory_args:
        usage()
    train_segment_fn = sys.argv[1]
    helper.check_file_exist(train_segment_fn)
    all_ct_segment_folder = sys.argv[2]
    if not os.path.isdir(all_ct_segment_folder):
        print("all_ct_segment_folder IS NOT VALID: " + all_ct_segment_folder)
        usage()
    predict_outDir = sys.argv[3]
    helper.make_dir(predict_outDir)
    response_ct = sys.argv[4]
    # Catch only the conversion error (a bare except would also swallow
    # KeyboardInterrupt/SystemExit) and check positivity explicitly, because
    # assert statements are stripped when Python runs with -O.
    try:
        num_chromHMM_state = int(sys.argv[5])
        num_train_ct = int(sys.argv[6])
        counts_are_valid = num_chromHMM_state > 0 and num_train_ct > 0
    except ValueError:
        counts_are_valid = False
    if not counts_are_valid:
        print("num_chromHMM_state or num_train_ct is not valid")
        usage()
    train_mode = sys.argv[7]
    if len(sys.argv) != (num_train_ct + num_mandatory_args):
        print("num_train_ct is different from the number of arguments passed into the program")
        usage()
    print("Done getting command line arguments")
    # the rest of the arguments are the cell types that we use to train the model
    train_cell_types = sys.argv[num_mandatory_args:]

    # 1. Get the data of predictors and response for training
    Xtrain_segment_df, Y_df = get_XY_segmentation_data(
        train_cell_types, response_ct, num_chromHMM_state, train_segment_fn,
        train_mode)
    print("Done getting one hot data")
    print(Xtrain_segment_df.head())
    print("")
    print(Y_df.head())

    # 2. Get the regression machine
    regression_machine = train_model(Xtrain_segment_df, Y_df,
                                     num_chromHMM_state, train_mode)
    print("Done training")

    # 3. Based on the machine just created, process training data and then
    # predict the segmentation at each position for the response_ct
    predict_segmentation(all_ct_segment_folder, regression_machine,
                         predict_outDir, train_cell_types, response_ct,
                         num_chromHMM_state, train_mode)
    print("Done predicting whole genome")
def main():
    """Parse the command line and sample genome positions.

    Positional arguments read from sys.argv:
        1. cell_type_folder: existing directory handed to
           sample_genome_positions.
        2. ct_fn: existing file from which the cell types of interest are
           read.
        3. output_fn: output file name handed to sample_genome_positions.

    usage() (defined elsewhere in this file) is called on invalid input.
    """
    if len(sys.argv) != 4:
        usage()
    cell_type_folder = sys.argv[1]
    if not os.path.isdir(cell_type_folder):
        print("cell_type_folder DOES NOT EXIST")
        usage()
    ct_fn = sys.argv[2]
    helper.check_file_exist(ct_fn)
    # list of cell types of interest, example: ['E003', 'E004']
    ct_list = get_cell_types_of_interest(ct_fn)
    output_fn = sys.argv[3]
    # Prepare the folder of the OUTPUT file. The original passed ct_fn here,
    # an input file that was just verified to exist, so the output location
    # was never prepared.
    # NOTE(review): assumes helper.create_folder_for_file creates the parent
    # folder of the given path - confirm against helper.
    helper.create_folder_for_file(output_fn)
    print("Done getting command line arguments")
    # select regions on the genome that we will sample from
    # (per the original note, the call yields a dataframe of 3 columns:
    # "chromosome", "start_bp", 'end_bp'; the value is not used here)
    sample_genome_positions(cell_type_folder, ct_list, output_fn)
def main():
    """Train a multinomial logistic regression and predict one cell type.

    Positional arguments read from sys.argv:
        1. train_segment_fn: file with the training data.
        2. all_ct_posterior_folder: folder where the data of all cell types
           are combined, stored in files corresponding to different regions
           in the genome.
        3. predict_outDir: output folder for predictions (created through
           helper.make_dir).
        4. response_ct: cell type to predict.
        5. num_chromHMM_state: positive integer.
        6. num_train_ct: positive integer; must equal the number of
           trailing cell type arguments.
        7+. the cell types used to train the model.

    usage() (defined elsewhere in this file) is called on any invalid input.
    """
    num_mandatory_args = 7
    if len(sys.argv) < num_mandatory_args:
        usage()
    train_segment_fn = sys.argv[1]
    helper.check_file_exist(train_segment_fn)
    all_ct_posterior_folder = sys.argv[2]
    helper.check_dir_exist(all_ct_posterior_folder)
    predict_outDir = sys.argv[3]
    helper.make_dir(predict_outDir)
    response_ct = sys.argv[4]
    # Catch only the conversion error (a bare except would also swallow
    # KeyboardInterrupt/SystemExit) and check positivity explicitly, because
    # assert statements are stripped when Python runs with -O.
    try:
        num_chromHMM_state = int(sys.argv[5])
        num_train_ct = int(sys.argv[6])
        counts_are_valid = num_chromHMM_state > 0 and num_train_ct > 0
    except ValueError:
        counts_are_valid = False
    if not counts_are_valid:
        print("num_chromHMM_state or num_train_ct is not valid")
        usage()
    if len(sys.argv) != (num_train_ct + num_mandatory_args):
        print("num_train_ct is different from the number of arguments passed into the program")
        usage()
    print("Done getting command line arguments")
    # the rest of the arguments are the cell types that we use to train the model
    train_cell_types = sys.argv[num_mandatory_args:]

    # 1. Get the data of predictors and response for training
    # Xtrain_segment_df: example colnames: 'E047_S16', 'E047_S17' -->
    #   posterior probabilities of each of the state in each cell type that
    #   are used to train
    # Y_df --> example colnames 'E047' --> state numbers 1 --> 18 of each
    #   position used to train data for the response cell type
    Xtrain_segment_df, Y_df = get_XY_segmentation_data(
        train_cell_types, response_ct, num_chromHMM_state, train_segment_fn)
    print("Done getting one hot data")
    print(Xtrain_segment_df.head())
    print("")
    print(Y_df.head())

    # 2. Get the regression machine
    regression_machine = train_multinomial_logistic_regression(
        Xtrain_segment_df, Y_df, num_chromHMM_state)
    print("Done training")

    # 3. Based on the machine just created, process training data and then
    # predict the segmentation at each position for the response_ct
    predict_segmentation(all_ct_posterior_folder, regression_machine,
                         predict_outDir, train_cell_types, response_ct,
                         num_chromHMM_state)
    print("Done predicting whole genome")
def main():
    """Entry point: build the average state assignment matrix for a group.

    Expects exactly six command line arguments, in this order: input
    directory, output directory, state annotation file, cell type list file,
    number of chromHMM states, and an IGV track name. The IGV export step is
    currently disabled, so the annotation table and track name are parsed
    but not consumed.
    """
    argv = sys.argv
    if len(argv) != 7:
        usage()

    # Input / output locations.
    input_dir = argv[1]
    helper.check_dir_exist(input_dir)
    output_dir = argv[2]
    helper.make_dir(output_dir)

    # State annotation table.
    annotation_path = argv[3]
    helper.check_file_exist(annotation_path)
    state_annot_df = read_state_annot_fn(annotation_path)

    # Cell types to aggregate over.
    cell_type_list_path = argv[4]
    helper.check_file_exist(cell_type_list_path)
    cell_types = helper.get_list_from_line_seperated_file(cell_type_list_path)

    n_states = helper.get_command_line_integer(argv[5])
    igv_track_name = argv[6]
    print("Done getting command line arguments")

    get_average_state_assign_matrix(input_dir, cell_types, n_states,
                                    output_dir)
    print("Done getting the representative state semgentation for the cellg group")

    draw_genome_pos_list = ['chr5_15']
    # Disabled IGV export step:
    # create_igv_format_bed(output_dir, state_annot_df, draw_genome_pos_list, igv_track_name)
    print("Done!")
def main():
    """Entry point: run cross validation for one held-out cell type.

    Expects exactly six command line arguments, in this order: sampled
    training data file, output directory, folder with the data of all cell
    types, number of chromHMM states, the cell type to validate, and the
    file listing all cell types.
    """
    argv = sys.argv
    if len(argv) != 7:
        usage()

    sampled_train_path = argv[1]
    helper.check_file_exist(sampled_train_path)

    output_dir = argv[2]
    helper.make_dir(output_dir)

    posterior_dir = argv[3]
    helper.check_dir_exist(posterior_dir)

    n_states = helper.get_command_line_integer(argv[4])
    held_out_ct = argv[5]
    ct_list_path = argv[6]
    print("Done getting command line arguments")

    # get all cell types
    training_cts = get_all_train_ct_list(ct_list_path, held_out_ct)
    print(training_cts)

    # call all cell types
    call_cross_validation_functions(held_out_ct, training_cts, output_dir,
                                    sampled_train_path, posterior_dir,
                                    n_states)
def main():
    """Entry point: run segmentation-based cross validation for one cell type.

    Expects exactly seven command line arguments, in this order: sampled
    training data file, output directory, folder with the segmentation data
    of all cell types, number of chromHMM states, the cell type to validate,
    the training mode, and the file listing all cell types.
    """
    argv = sys.argv
    if len(argv) != 8:
        usage()

    sampled_train_path = argv[1]
    helper.check_file_exist(sampled_train_path)

    output_dir = argv[2]
    helper.make_dir(output_dir)

    segment_dir = argv[3]
    helper.check_dir_exist(segment_dir)

    n_states = helper.get_command_line_integer(argv[4])
    held_out_ct = argv[5]
    mode = argv[6]
    ct_list_path = argv[7]
    print("Done getting command line arguments")

    # get the list of all genomic positions used to segment the genome for
    # our model training (we exclude chromosome Y in all analysis)
    genomic_positions = get_genomic_positions_list(segment_dir)

    # get all cell types
    training_cts = get_all_train_ct_list(ct_list_path, held_out_ct)

    # call all cell types
    call_cross_validation_functions(held_out_ct, training_cts, output_dir,
                                    sampled_train_path, segment_dir,
                                    n_states, genomic_positions, mode)