def predict(seq, aa_cut=0, percent_peptide=0, model=None, model_file=None):
    # assumes module-level imports: pickle, pandas, plus this package's feat and util modules
    assert not (model is None and model_file is None), "you have to specify either a model or a model_file"

    if model is None:
        try:
            with open(model_file, 'rb') as f:
                model, learn_options = pickle.load(f)
        except IOError:
            raise Exception("could not find model stored to file %s" % model_file)
    else:
        model, learn_options = model

    learn_options["V"] = 2

    # Y, feature_sets, target_genes, learn_options, num_proc = setup(test=False, order=2, learn_options=learn_options, data_file=test_filename)
    # inputs, dim, dimsum, feature_names = pd.concatenate_feature_sets(feature_sets)

    Xdf = pandas.DataFrame(columns=[u'30mer', u'Strand'], data=[[seq, 'NA']])
    gene_position = pandas.DataFrame(columns=[u'Percent Peptide', u'Amino Acid Cut position'],
                                     data=[[percent_peptide, aa_cut]])
    feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(), gene_position)
    inputs, dim, dimsum, feature_names = util.concatenate_feature_sets(feature_sets)

    # call to scikit-learn, returns a vector of predicted values
    return model.predict(inputs)[0]
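# A minimal usage sketch for the single-sequence predict() above. The 30mer and the
# model path are hypothetical placeholders; the pickle is assumed to hold a
# (model, learn_options) tuple, which is what the loading code above unpacks.
score = predict(
    "TGGAGGCTGCTTTACCCGCTGTGGGGGCGC",  # 30mer: 4 nt context + 20 nt guide + NGG PAM + 3 nt context
    aa_cut=30,
    percent_peptide=50,
    model_file="saved_models/V3_model_full.pickle",  # hypothetical path
)
print(score)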
def setup(test=False, order=1, learn_options=None, data_file=None, pam_audit=True, length_audit=True):
    num_proc = shared_setup(learn_options, order, test)

    assert "testing_non_binary_target_name" in learn_options.keys(), "need this in order to get metrics, though used to be not needed, so you may newly see this error"
    if learn_options["testing_non_binary_target_name"] not in ['ranks', 'raw', 'thrs']:
        raise Exception('learn_options["testing_non_binary_target_name"] must be in ["ranks", "raw", "thrs"]')

    Xdf, Y, gene_position, target_genes = load_data.from_file(data_file, learn_options)
    learn_options['all_genes'] = target_genes

    if test:
        learn_options["order"] = 1

    if 'convert_30mer_to_31mer' in learn_options and learn_options['convert_30mer_to_31mer'] is True:
        print("WARNING!!! converting 30 mer to 31 mer (and then cutting off first nucleotide to go back to 30mer with a right shift)")
        for i in range(Xdf.shape[0]):
            Xdf['30mer'].iloc[i] = util.convert_to_thirty_one(Xdf.iloc[i]["30mer"], Xdf.index.values[i][1], Xdf.iloc[i]["Strand"])
        # to_keep = Xdf['30mer'].isnull() == False
        # Xdf = Xdf[to_keep]
        # gene_position = gene_position[to_keep]
        # Y = Y[to_keep]
        Xdf["30mer"] = Xdf["30mer"].apply(lambda x: x[1:])  # chop off the first nucleotide

    if 'left_right_guide_ind' in learn_options and learn_options['left_right_guide_ind'] is not None:
        seq_start, seq_end, expected_length = learn_options['left_right_guide_ind']
        Xdf['30mer'] = Xdf['30mer'].apply(lambda seq: seq[seq_start:seq_end])

    feature_sets = feat.featurize_data(Xdf, learn_options, Y, gene_position, pam_audit=pam_audit, length_audit=length_audit)
    np.random.seed(learn_options['seed'])

    return Y, feature_sets, target_genes, learn_options, num_proc
def predict(seq, aa_cut=0, percent_peptide=0, model=None, model_file=None):
    assert not (model is None and model_file is None), "you have to specify either a model or a model_file"

    if model is None:
        try:
            with open(model_file, 'rb') as f:
                model, learn_options = pickle.load(f)
        except IOError:
            raise Exception("could not find model stored to file %s" % model_file)
    else:
        model, learn_options = model

    learn_options["V"] = 2

    # Y, feature_sets, target_genes, learn_options, num_proc = setup(test=False, order=2, learn_options=learn_options, data_file=test_filename)
    # inputs, dim, dimsum, feature_names = pd.concatenate_feature_sets(feature_sets)

    Xdf = pandas.DataFrame(columns=[u'30mer', u'Strand'], data=[[seq, 'NA']])
    gene_position = pandas.DataFrame(columns=[u'Percent Peptide', u'Amino Acid Cut position'],
                                     data=[[percent_peptide, aa_cut]])
    feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(), gene_position)
    inputs, dim, dimsum, feature_names = concatenate_feature_sets(feature_sets)

    # call to scikit-learn, returns a vector of predicted values
    return model.predict(inputs)[0]
def get_proximal_5mer_feature(data_df):
    # get_5mer is assumed to be defined elsewhere in this module
    proximal_5mer = data_df['30mer'].apply(get_5mer)
    proximal_5mer.name = "proximal_5mers"
    proximal_5mer = pd.DataFrame(proximal_5mer)
    proximal_5mer_counts = proximal_5mer.groupby(["proximal_5mers"]).size().reset_index()
    proximal_5mer = proximal_5mer.merge(proximal_5mer_counts, on="proximal_5mers")
    proximal_5mer = proximal_5mer.rename(columns={0: 'proximal_5mer_counts'})
    return proximal_5mer


if __name__ == '__main__':
    feature_df = pd.read_csv("../../../../../results/cleaned_c_elegans_30mers_energies.csv")
    # learn_options is assumed to be defined at module level (e.g., imported alongside featurize_data)
    features = featurize_data(feature_df, learn_options=learn_options, Y=feature_df, gene_position=feature_df)
    features['proximal_5mer'] = get_proximal_5mer_feature(feature_df)
    inputs, dim, dimsum, feature_names = concatenate_feature_sets(features)
    doench_df = pd.DataFrame(inputs, columns=feature_names)
    feature_df = feature_df.join(doench_df)
    feature_df = feature_df.drop(axis=1, labels=['sgRNA', 'Gene target', '30mer', 'WormsInjected', 'SuccessfulInjections'])
    feature_df = pd.get_dummies(feature_df).dropna(axis=0)
    if any(feature_df.columns.duplicated()):
        feature_df = feature_df.loc[:, ~feature_df.columns.duplicated()]
    feature_df = feature_df.rename(columns={"SuccessRate": "target"})
    print(feature_df.shape)
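# An illustrative sketch of get_proximal_5mer_feature() on a toy DataFrame. The 30mers
# are made-up placeholders, and this stand-in get_5mer simply takes the 5 bases just
# 5' of the PAM; the module's real get_5mer (not shown above) may differ.
def get_5mer(seq):
    # placeholder: last 5 protospacer bases of a 30mer (4 nt context + 20 nt guide + PAM + 3 nt)
    return seq[19:24]

toy_df = pd.DataFrame({'30mer': [
    "TGGAGGCTGCTTTACCCGCTGTGGGGGCGC",
    "ACAGCTGATCTCCAGATATGACCATGGGTT",
    "TGGAGGCTGCTTTACCCGCTGTGGGGGCGC",
]})
print(get_proximal_5mer_feature(toy_df))  # one row per guide: its proximal 5-mer and that 5-mer's count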
def setup(test=False, order=1, learn_options=None, data_file=None):
    if 'num_proc' not in learn_options.keys():
        learn_options['num_proc'] = None
    if 'num_thread_per_proc' not in learn_options.keys():
        learn_options['num_thread_per_proc'] = None

    num_proc = local_multiprocessing.configure(TEST=test, num_proc=learn_options["num_proc"],
                                               num_thread_per_proc=learn_options["num_thread_per_proc"])
    learn_options["num_proc"] = num_proc

    learn_options["order"] = order  # gets used many places in code, not just here

    if "cv" not in learn_options.keys():
        # if no CV preference is specified, use leave-one-gene-out
        learn_options["cv"] = "gene"

    if "normalize_features" not in learn_options.keys():
        # default to normalizing features
        learn_options["normalize_features"] = True

    if "weighted" not in learn_options.keys():
        learn_options['weighted'] = None

    if "all pairs" not in learn_options.keys():
        learn_options["all pairs"] = False

    if "include_known_pairs" not in learn_options.keys():
        learn_options["include_known_pairs"] = False

    if "include_gene_guide_feature" not in learn_options.keys():
        learn_options["include_gene_guide_feature"] = 0  # used as window size, so 0 is none

    # these should default to true to match experiments before they were options:
    if "gc_features" not in learn_options.keys():
        learn_options["gc_features"] = True
    if "nuc_features" not in learn_options.keys():
        learn_options["nuc_features"] = True

    if 'train_genes' not in learn_options.keys():
        learn_options["train_genes"] = None
    if 'test_genes' not in learn_options.keys():
        learn_options["test_genes"] = None

    if "num_proc" not in learn_options:
        learn_options["num_proc"] = None
    if "num_thread_per_proc" not in learn_options:
        learn_options["num_thread_per_proc"] = None
    if 'seed' not in learn_options:
        learn_options['seed'] = 1
    if "flipV1target" not in learn_options:
        learn_options["flipV1target"] = False
    if 'num_genes_remove_train' not in learn_options:
        learn_options['num_genes_remove_train'] = None
    if "include_microhomology" not in learn_options:
        learn_options["include_microhomology"] = False

    assert "testing_non_binary_target_name" in learn_options.keys(), "need this in order to get metrics, though used to be not needed, so you may newly see this error"
    if learn_options["testing_non_binary_target_name"] not in ['ranks', 'raw', 'thrs']:
        raise Exception('learn_options["testing_non_binary_target_name"] must be in ["ranks", "raw", "thrs"]')

    Xdf, Y, gene_position, target_genes = load_data.from_file(data_file, learn_options)
    learn_options['all_genes'] = target_genes

    if test:
        learn_options["order"] = 1

    feature_sets = feat.featurize_data(Xdf, learn_options, Y, gene_position)
    np.random.seed(learn_options['seed'])

    return Y, feature_sets, target_genes, learn_options, num_proc
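# A hedged usage sketch for the setup() above. The data-file path and the learn_options
# values are placeholders; only "testing_non_binary_target_name" is checked explicitly here
# (everything else falls back to the defaults set above), though load_data.from_file may
# expect additional keys depending on the data file.
learn_options = {"testing_non_binary_target_name": "ranks"}
Y, feature_sets, target_genes, learn_options, num_proc = setup(
    test=False,
    order=2,
    learn_options=learn_options,
    data_file="path/to/training_data.csv",  # hypothetical path
)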
def predict(seq, aa_cut=None, percent_peptide=None, model=None, model_file=None, pam_audit=True, length_audit=False, learn_options_override=None):
    """
    if pam_audit==False, then it will not check for GG in the expected position
    this is useful if predicting on PAM mismatches, such as with off-target
    """
    # assumes module-level imports: os, pickle, numpy as np, pandas, plus this package's feat/util helpers and override_learn_options
    print("predict function running")
    # assert not (model is None and model_file is None), "you have to specify either a model or a model_file"

    assert isinstance(seq, np.ndarray), "Please ensure seq is a numpy array"
    assert len(seq[0]) > 0, "Make sure that seq is not empty"
    assert isinstance(seq[0], str), "Please ensure input sequences are in string format, i.e. 'AGAG' rather than ['A' 'G' 'A' 'G'] or alternate representations"

    if aa_cut is not None:
        assert len(aa_cut) > 0, "Make sure that aa_cut is not empty"
        assert isinstance(aa_cut, np.ndarray), "Please ensure aa_cut is a numpy array"
        assert np.all(np.isreal(aa_cut)), "amino-acid cut position needs to be a real number"

    if percent_peptide is not None:
        assert len(percent_peptide) > 0, "Make sure that percent_peptide is not empty"
        assert isinstance(percent_peptide, np.ndarray), "Please ensure percent_peptide is a numpy array"
        assert np.all(np.isreal(percent_peptide)), "percent_peptide needs to be a real number"

    if model_file is None:
        azimuth_saved_model_dir = os.path.join(os.path.dirname(__file__), 'saved_models')
        if np.any(percent_peptide == -1) or (percent_peptide is None and aa_cut is None):
            print("No model file specified, using V3_model_nopos")
            model_name = 'V3_model_nopos.pickle'
        else:
            print("No model file specified, using V3_model_full")
            model_name = 'V3_model_full.pickle'
        model_file = os.path.join(azimuth_saved_model_dir, model_name)

    if model is None:
        with open(model_file, 'rb') as f:
            model, learn_options = pickle.load(f)
        print(model_file)
        print(learn_options)
    else:
        model, learn_options = model

    learn_options["V"] = 2
    learn_options = override_learn_options(learn_options_override, learn_options)

    # Y, feature_sets, target_genes, learn_options, num_proc = setup(test=False, order=2, learn_options=learn_options, data_file=test_filename)
    # inputs, dim, dimsum, feature_names = pd.concatenate_feature_sets(feature_sets)

    Xdf = pandas.DataFrame(columns=[u'30mer', u'Strand'], data=list(zip(seq, ['NA' for x in range(len(seq))])))

    if np.all(percent_peptide != -1) and (percent_peptide is not None and aa_cut is not None):
        gene_position = pandas.DataFrame(columns=[u'Percent Peptide', u'Amino Acid Cut position'],
                                         data=list(zip(percent_peptide, aa_cut)))
    else:
        gene_position = pandas.DataFrame(columns=[u'Percent Peptide', u'Amino Acid Cut position'],
                                         data=list(zip(np.ones(seq.shape[0]) * -1, np.ones(seq.shape[0]) * -1)))

    feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(), gene_position, pam_audit=pam_audit, length_audit=length_audit)
    inputs, dim, dimsum, feature_names = util.concatenate_feature_sets(feature_sets)

    # call to scikit-learn, returns a vector of predicted values
    preds = model.predict(inputs)

    # also check that predictions are not 0/1 from a classifier.predict() (instead of predict_proba() or decision_function())
    assert not np.all(np.isin(preds, [0, 1])), "model returned only 0s and 1s"

    return preds
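# A minimal sketch of calling the array-based predict() above with no explicit model_file,
# assuming the bundled V3 pickles sit in saved_models/ next to this module. The 30mers are
# placeholders; leaving aa_cut and percent_peptide as None selects V3_model_nopos.
seqs = np.array([
    "TGGAGGCTGCTTTACCCGCTGTGGGGGCGC",
    "ACAGCTGATCTCCAGATATGACCATGGGTT",
])
scores = predict(seqs, aa_cut=None, percent_peptide=None)
print(scores)  # one predicted activity score per 30mer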