def prepare_dataset_try3(): dataset_nm = 'dataset_try3' print 'Preparing %s' % (dataset_nm) featurized_data = init_featurized_data() # Components... dataset = _data.load_dataset('DisLib-mES-controladj', exp_subset='longdup_series', exp_subset_col='Designed Name') for exp in dataset.keys(): if exp not in _config.d.HIGHREP_DISLIB_EXPS_NMS: del dataset[exp] prepare_library_dataset(dataset, featurized_data) dataset = _data.load_dataset('Lib1-mES-controladj') # Remove VO spacers from lib 1 for vo_spacer_idx in range(1872, 1961 + 1): vo_spacer_exp = str(vo_spacer_idx) del dataset[vo_spacer_exp] # Remove low rep spacers from lib1 for exp in dataset.keys(): if int(exp) not in _config.d.HIGHREP_LIB1_EXPS: del dataset[exp] print len(dataset) prepare_library_dataset(dataset, featurized_data) pickle_featurized_data(featurized_data, dataset_nm) return
def prepare_statistics(data_nm1, data_nm2): # Input: Dataset # Output: Uniformly processed dataset, requiring minimal processing for plotting but ideally enabling multiple plots # In this case: Distribution of frequencies of indels for each position in 20 bp window around cutsite. Can plot mean, median, etc, difference, etc. # Calculate statistics associated with each experiment by name alldf_dict = defaultdict(list) # If Library, subset VO spacers dataset1 = _data.load_dataset(data_nm1) dataset2 = _data.load_dataset(data_nm2) if dataset1 is None or dataset2 is None: return # Find shared exps and iterate through them, passing both shared exps together to calc_statistics shared_exps = set(dataset1.keys()) & set(dataset2.keys()) if len(shared_exps) == 0: print 'ERROR: No shared exps' timer = util.Timer(total = len(shared_exps)) for exp in shared_exps: d1 = dataset1[exp] d2 = dataset2[exp] calc_statistics(d1, d2, exp, alldf_dict) timer.update() # Return a dataframe where columns are positions and rows are experiment names, values are frequencies alldf = pd.DataFrame(alldf_dict) return alldf
def prepare_statistics(data_nm):
  # Input: a dataset name.
  # Output: a uniformly processed DataFrame of per-experiment statistics,
  # enabling multiple plots with minimal further processing.
  # Lib1/VO datasets are subset to VO spacers; DisLib datasets are subset
  # to clinical exps with iterated-editing targets removed.
  alldf_dict = defaultdict(list)

  # Bug fix: initialize dataset so an unmatched data_nm returns None
  # instead of raising UnboundLocalError at the check below.
  dataset = None
  if 'Lib1' in data_nm or 'VO' in data_nm:
    dataset = _data.load_dataset(data_nm, exp_subset = 'vo_spacers', exp_subset_col = 'Designed Name')
  if 'DisLib' in data_nm:
    dataset = _data.load_dataset(data_nm, exp_subset = 'clin', exp_subset_col = 'Designed Name')
    # Remove data with iterated editing
    dlwt = _config.d.DISLIB_WT
    for idx, row in dlwt.iterrows():
      if row['wt_repairable'] == 'iterwt':
        del dataset[row['name']]
  if dataset is None:
    return

  timer = util.Timer(total = len(dataset))
  for exp in dataset.keys():
    df = dataset[exp]
    calc_statistics(df, exp, alldf_dict)
    timer.update()

  # Columns are statistics, rows are experiment names.
  alldf = pd.DataFrame(alldf_dict)
  return alldf
def prepare_dataset_try4(): dataset_nm = 'dataset_try4' print 'Preparing %s' % (dataset_nm) featurized_data = init_featurized_data() # Load dislib, longdups dataset = _data.load_dataset('DisLib-mES-controladj', exp_subset='longdup_series', exp_subset_col='Designed Name') for exp in dataset.keys(): if exp not in _config.d.HIGHREP_DISLIB_EXPS_NMS: del dataset[exp] prepare_library_dataset(dataset, featurized_data) # Load dislib, clin data dataset = _data.load_dataset('DisLib-mES-controladj', exp_subset='clin', exp_subset_col='Designed Name') # Remove data with iterated editing dlwt = _config.d.DISLIB_WT for idx, row in dlwt.iterrows(): if row['wt_repairable'] == 'iterwt': del dataset[row['name']] for exp in dataset.keys(): if exp not in _config.d.HIGHREP_DISLIB_EXPS_NMS: del dataset[exp] print len(dataset) prepare_library_dataset(dataset, featurized_data) # Load Lib1 data dataset = _data.load_dataset('Lib1-mES-controladj') # Remove VO spacers from lib 1 for vo_spacer_idx in range(1872, 1961 + 1): vo_spacer_exp = str(vo_spacer_idx) del dataset[vo_spacer_exp] # Remove low rep spacers from lib1 for exp in dataset.keys(): if int(exp) not in _config.d.HIGHREP_LIB1_EXPS: del dataset[exp] print len(dataset) prepare_library_dataset(dataset, featurized_data) pickle_featurized_data(featurized_data, dataset_nm) return
def prepare_statistics(data_nm): # Input: Dataset # Output: Uniformly processed dataset, requiring minimal processing for plotting but ideally enabling multiple plots # In this case: Distribution of frequencies of indels for each position in 20 bp window around cutsite. Can plot mean, median, etc, difference, etc. # Calculate statistics associated with each experiment by name alldf_dict = defaultdict(list) dataset = _data.load_dataset(data_nm) if dataset is None: return timer = util.Timer(total=len(dataset)) for exp in dataset: df = dataset[exp] calc_statistics(df, exp, alldf_dict) timer.update() # Return a dataframe where columns are positions and rows are experiment names, values are frequencies alldf = pd.DataFrame(alldf_dict) col_order = [ '_Experiment', 'Editing Rate', '0gt Frequency', 'Ngt Frequency', '-10', '-9', '-8', '-7', '-6', '-5', '-4', '-3', '-2', '-1', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10' ] if len(col_order) != len(alldf.columns): print 'ERROR: Will drop columns' alldf = alldf[col_order] return alldf
def prepare_statistics(data_nm):
  # Input: a dataset name (subset to VO spacers).
  # Output: DataFrame of per-experiment statistics aggregated over 100
  # random-seed model folds loaded from the e_ins_modeling output dir.
  alldf_dict = defaultdict(list)

  dataset = _data.load_dataset(data_nm, exp_subset = 'vo_spacers', exp_subset_col = 'Designed Name')
  if dataset is None:
    return

  e_dir = '/cluster/mshen/prj/mmej_figures/out/e_ins_modeling/'
  timer = util.Timer(total = 100)
  for rs in range(100):
    prefix = e_dir + 'len_%s_%s' % (data_nm, rs)
    # Bug fix: use `with` so the pickle files are closed promptly instead
    # of leaking 3 file handles per iteration (300 over the full loop).
    with open(prefix + '_testexps.pkl') as f:
      test_exps = pickle.load(f)
    with open(prefix + '_model.pkl') as f:
      rate_model = pickle.load(f)
    with open(prefix + '_bp.pkl') as f:
      bp_model = pickle.load(f)
    for exp in test_exps:
      df = dataset[exp]
      calc_statistics(df, exp, rate_model, bp_model, alldf_dict, rs, data_nm)
    timer.update()

  # Columns are statistics, rows are (experiment, seed) observations.
  alldf = pd.DataFrame(alldf_dict)
  return alldf
def prepare_dataset_try1(): dataset_nm = 'dataset_try1' print 'Preparing %s' % (dataset_nm) featurized_data = init_featurized_data() # Components... dataset = _data.load_dataset('DisLib-mES-controladj', exp_subset='longdup_series', exp_subset_col='Designed Name') prepare_library_dataset(dataset, featurized_data) dataset = _data.load_dataset('Lib1-mES-controladj') # Remove VO spacers from lib 1 for vo_spacer_idx in range(1872, 1961 + 1): vo_spacer_exp = str(vo_spacer_idx) del dataset[vo_spacer_exp] print len(dataset) prepare_library_dataset(dataset, featurized_data) pickle_featurized_data(featurized_data, dataset_nm) return
def prepare_statistics():
  # Compare two fixed Dislib biological replicates experiment-by-experiment
  # and return a DataFrame of the resulting statistics (columns are
  # statistics, rows are experiments).
  alldf_dict = defaultdict(list)

  nm_rep1 = '1207-mESC-Dislib-Cas9-Tol2-Biorep1-r1-controladj'
  nm_rep2 = '1207-mESC-Dislib-Cas9-Tol2-Biorep1-r2-controladj'
  dataset1 = _data.load_dataset(nm_rep1)
  dataset2 = _data.load_dataset(nm_rep2)

  # Iterate rep1's experiments; each is expected in rep2 as well.
  for exp in dataset1.keys():
    calc_statistics(dataset1[exp], dataset2[exp], exp, alldf_dict)

  return pd.DataFrame(alldf_dict)
def prepare_statistics(data_nm):
  # Input: a dataset name.
  # Output: DataFrame of per-experiment statistics (columns are statistics,
  # rows are experiments); None when the dataset fails to load.
  alldf_dict = defaultdict(list)

  dataset = _data.load_dataset(data_nm)
  if dataset is None:
    return

  timer = util.Timer(total = len(dataset))
  for exp, df in dataset.items():
    calc_statistics(df, exp, alldf_dict)
    timer.update()

  return pd.DataFrame(alldf_dict)
def data_preload(nm):
  # Load dataset `nm` for its side effects only; the result is discarded.
  # NOTE(review): presumably _data.load_dataset caches internally, making
  # this a cache warm-up — confirm against _data's implementation.
  # Fix: dropped the unused local binding (`l2_data`).
  _data.load_dataset(nm)
  return