def setup(test=False, order=1, learn_options=None, data_file=None, pam_audit=True, length_audit=True):
    """Load the data file and build the feature sets used for model comparison.

    Args:
        test: forwarded to ``shared_setup``; when truthy ``learn_options["order"]``
            is also forced to 1 once the data has been loaded.
        order: forwarded to ``shared_setup``.
        learn_options: dict of options, updated in place (``'all_genes'`` is set,
            ``'order'`` may be overwritten). Must contain
            ``"testing_non_binary_target_name"`` (one of ``'ranks'``, ``'raw'``,
            ``'thrs'``) and ``'seed'``. Optional keys read here are
            ``'convert_30mer_to_31mer'`` and ``'left_right_guide_ind'``.
        data_file: forwarded to ``azimuth.load_data.from_file``.
        pam_audit: forwarded to ``feat.featurize_data``.
        length_audit: forwarded to ``feat.featurize_data``.

    Returns:
        The tuple ``(Y, feature_sets, target_genes, learn_options, num_proc)``.

    Raises:
        AssertionError: if ``"testing_non_binary_target_name"`` is missing.
        Exception: if ``"testing_non_binary_target_name"`` has an unsupported value.
    """
    num_proc = shared_setup(learn_options, order, test)

    assert "testing_non_binary_target_name" in learn_options, "need this in order to get metrics, though used to be not needed, so you may newly see this error"
    if learn_options["testing_non_binary_target_name"] not in ['ranks', 'raw', 'thrs']:
        raise Exception('learn_options["testing_non_binary_target_name"] must be in ["ranks", "raw", "thrs"]')

    Xdf, Y, gene_position, target_genes = azimuth.load_data.from_file(data_file, learn_options)
    learn_options['all_genes'] = target_genes

    if test:
        learn_options["order"] = 1

    if learn_options.get('convert_30mer_to_31mer') is True:
        print("WARNING!!! converting 30 mer to 31 mer (and then cutting off first nucleotide to go back to 30mer with a right shift)")
        # Build the converted column in one pass and assign it as a whole.
        # The previous per-row ``Xdf['30mer'].iloc[i] = ...`` was chained
        # assignment, which pandas may apply to a temporary copy.
        converted = [
            azimuth.util.convert_to_thirty_one(thirty_mer, index_entry[1], strand)
            for thirty_mer, index_entry, strand in zip(Xdf["30mer"], Xdf.index.values, Xdf["Strand"])
        ]
        Xdf["30mer"] = converted
        Xdf["30mer"] = Xdf["30mer"].apply(lambda x: x[1:])  # chop the first nucleotide

    if learn_options.get('left_right_guide_ind') is not None:
        # The third entry is unpacked only so that a wrongly sized setting
        # still fails loudly; it is not used here.
        seq_start, seq_end, _expected_length = learn_options['left_right_guide_ind']
        Xdf['30mer'] = Xdf['30mer'].apply(lambda seq: seq[seq_start:seq_end])

    feature_sets = feat.featurize_data(Xdf, learn_options, Y, gene_position, pam_audit=pam_audit, length_audit=length_audit)
    np.random.seed(learn_options['seed'])

    return Y, feature_sets, target_genes, learn_options, num_proc
def setup(test=False, order=1, learn_options=None, data_file=None, pam_audit=True):
    """Load the data file and build the feature sets used for model comparison.

    Args:
        test: forwarded to ``shared_setup``; when truthy ``learn_options["order"]``
            is also forced to 1 once the data has been loaded.
        order: forwarded to ``shared_setup``.
        learn_options: dict of options, updated in place (``'all_genes'`` is set,
            ``'order'`` may be overwritten). Must contain
            ``"testing_non_binary_target_name"`` (one of ``'ranks'``, ``'raw'``,
            ``'thrs'``) and ``'seed'``. ``'convert_30mer_to_31mer'`` is optional.
        data_file: forwarded to ``azimuth.load_data.from_file``.
        pam_audit: forwarded positionally to ``feat.featurize_data``.

    Returns:
        The tuple ``(Y, feature_sets, target_genes, learn_options, num_proc)``.

    Raises:
        AssertionError: if ``"testing_non_binary_target_name"`` is missing.
        Exception: if ``"testing_non_binary_target_name"`` has an unsupported value.
    """
    num_proc = shared_setup(learn_options, order, test)

    assert "testing_non_binary_target_name" in learn_options, "need this in order to get metrics, though used to be not needed, so you may newly see this error"
    if learn_options["testing_non_binary_target_name"] not in ['ranks', 'raw', 'thrs']:
        raise Exception('learn_options["testing_non_binary_target_name"] must be in ["ranks", "raw", "thrs"]')

    Xdf, Y, gene_position, target_genes = azimuth.load_data.from_file(data_file, learn_options)
    learn_options['all_genes'] = target_genes

    if test:
        learn_options["order"] = 1

    if learn_options.get('convert_30mer_to_31mer') is True:
        print("WARNING!!! converting 30 mer to 31 mer (and then cutting off first nucleotide to go back to 30mer with a right shift)")
        # Assign the converted column as a whole: the previous per-row
        # ``Xdf['30mer'].iloc[i] = ...`` was chained assignment, which pandas
        # may apply to a temporary copy instead of Xdf.
        converted = [
            azimuth.util.convert_to_thirty_one(thirty_mer, index_entry[1], strand)
            for thirty_mer, index_entry, strand in zip(Xdf["30mer"], Xdf.index.values, Xdf["Strand"])
        ]
        Xdf["30mer"] = converted
        Xdf["30mer"] = Xdf["30mer"].apply(lambda x: x[1:])  # chop the first nucleotide

    feature_sets = feat.featurize_data(Xdf, learn_options, Y, gene_position, pam_audit)
    np.random.seed(learn_options['seed'])

    return Y, feature_sets, target_genes, learn_options, num_proc
def predict(seq, aa_cut=-1, percent_peptide=-1, model=None, model_file=None, pam_audit=True):
    """Featurize the given sequences and return the model's predictions.

    If pam_audit==False, then it will not check for GG in the expected position;
    this is useful if predicting on PAM mismatches, such as with off-target.

    Args:
        seq: sized iterable of sequence strings; stored in the ``'30mer'`` column.
        aa_cut: amino acid cut positions, one per sequence, or -1 when unknown.
        percent_peptide: percent peptide values, one per sequence, or -1 when
            unknown. Any -1 entry selects the position-free default model and
            fills both position columns with -1.
        model: optional ``(model, learn_options)`` pair; takes precedence over
            loading from ``model_file``.
        model_file: path of a pickled ``(model, learn_options)`` pair. When None,
            one of the models in azimuth's ``saved_models`` directory is chosen.
        pam_audit: forwarded to ``feat.featurize_data``.

    Returns:
        Whatever ``model.predict`` returns for the concatenated feature matrix.
    """
    # Debug trace kept from the original; formatted explicitly so the output
    # is the same under Python 2 and Python 3.
    print("%s %s" % (aa_cut, percent_peptide))

    if model_file is None:
        azimuth_saved_model_dir = os.path.join(os.path.dirname(azimuth.__file__), 'saved_models')
        if np.any(percent_peptide == -1) or (percent_peptide is None and aa_cut is None):
            print("No model file specified, using V3_model_nopos")
            model_name = 'V3_model_nopos.pickle'
        else:
            print("No model file specified, using V3_model_full")
            model_name = 'V3_model_full.pickle'
        model_file = os.path.join(azimuth_saved_model_dir, model_name)

    if model is None:
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # model_file at trusted files.
        with open(model_file, 'rb') as f:
            model, learn_options = pickle.load(f)
    else:
        model, learn_options = model

    learn_options["V"] = 2

    # len() instead of seq.shape[0] so that plain lists work on every path.
    num_seqs = len(seq)

    # zip() is materialised so the DataFrame constructor receives a list on
    # Python 3 as well, where zip() is a one-shot iterator.
    Xdf = pandas.DataFrame(columns=[u'30mer', u'Strand'],
                           data=list(zip(seq, ['NA'] * num_seqs)))

    if np.all(percent_peptide != -1) and (percent_peptide is not None and aa_cut is not None):
        position_rows = list(zip(percent_peptide, aa_cut))
    else:
        unknown = np.ones(num_seqs) * -1
        position_rows = list(zip(unknown, unknown))
    gene_position = pandas.DataFrame(columns=[u'Percent Peptide', u'Amino Acid Cut position'],
                                     data=position_rows)

    feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(), gene_position, pam_audit)
    inputs, dim, dimsum, feature_names = azimuth.util.concatenate_feature_sets(feature_sets)

    # call to scikit-learn, returns a vector of predicted values
    return model.predict(inputs)
def predict(seq, aa_cut=-1, percent_peptide=-1, model=None, model_file=None, pam_audit=True):
    """Validate the inputs, featurize the sequences and return predictions.

    If pam_audit==False, then it will not check for GG in the expected position;
    this is useful if predicting on PAM mismatches, such as with off-target.

    Args:
        seq: numpy array of sequence strings. A single ``str`` is also accepted
            and treated as one sequence.
        aa_cut: numpy array of amino acid cut positions, or the integer -1 when
            unknown.
        percent_peptide: numpy array of percent peptide values, or the integer
            -1 when unknown. Any -1 entry selects the position-free default
            model and fills both position columns with -1.
        model: optional ``(model, learn_options)`` pair; takes precedence over
            loading from ``model_file``.
        model_file: path of a pickled ``(model, learn_options)`` pair. When None,
            one of the models in azimuth's ``saved_models`` directory is chosen.
        pam_audit: forwarded to ``feat.featurize_data``.

    Returns:
        Whatever ``model.predict`` returns for the concatenated feature matrix.

    Raises:
        AssertionError: if an argument has an unsupported type.
    """
    import numbers  # local import so the block does not depend on file-level imports

    try:
        text_types = (str, unicode)  # Python 2
    except NameError:
        text_types = (str,)  # Python 3 has no separate unicode type

    assert isinstance(seq, (str, np.ndarray)), "Please ensure seq is a numpy array"
    if isinstance(seq, str):
        # A bare string passed validation before, but was then iterated
        # character by character and has no .shape; treat it as one sequence.
        seq = np.array([seq])
    if len(seq) > 0:
        assert isinstance(seq[0], text_types), "Please ensure input sequences are in string format, i.e. 'AGAG' rather than ['A' 'G' 'A' 'G'] or alternate representations"

    # numbers ABCs are used instead of (int, long): ``long`` does not exist on
    # Python 3 and NumPy scalars do not always subclass the builtin int.
    assert isinstance(aa_cut, (numbers.Integral, np.ndarray)), "Please ensure aa_cut is a numpy array"
    if isinstance(aa_cut, np.ndarray) and len(aa_cut) > 0:
        assert isinstance(aa_cut[0], numbers.Real), "amino-acid cut position needs to be a real number"

    assert isinstance(percent_peptide, (numbers.Integral, np.ndarray)), "Please ensure percent_peptide is a numpy array"
    if isinstance(percent_peptide, np.ndarray) and len(percent_peptide) > 0:
        assert isinstance(percent_peptide[0], numbers.Real), "percent_peptide needs to be a real number"

    # Debug trace kept from the original; formatted explicitly so the output
    # is the same under Python 2 and Python 3.
    print("%s %s" % (aa_cut, percent_peptide))

    if model_file is None:
        azimuth_saved_model_dir = os.path.join(os.path.dirname(azimuth.__file__), 'saved_models')
        if np.any(percent_peptide == -1) or (percent_peptide is None and aa_cut is None):
            print("No model file specified, using V3_model_nopos")
            model_name = 'V3_model_nopos.pickle'
        else:
            print("No model file specified, using V3_model_full")
            model_name = 'V3_model_full.pickle'
        model_file = os.path.join(azimuth_saved_model_dir, model_name)

    if model is None:
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # model_file at trusted files.
        with open(model_file, 'rb') as f:
            model, learn_options = pickle.load(f)
    else:
        model, learn_options = model

    learn_options["V"] = 2

    num_seqs = len(seq)

    # zip() is materialised so the DataFrame constructor receives a list on
    # Python 3 as well, where zip() is a one-shot iterator.
    Xdf = pandas.DataFrame(columns=[u'30mer', u'Strand'],
                           data=list(zip(seq, ['NA'] * num_seqs)))

    if np.all(percent_peptide != -1) and (percent_peptide is not None and aa_cut is not None):
        position_rows = list(zip(percent_peptide, aa_cut))
    else:
        unknown = np.ones(num_seqs) * -1
        position_rows = list(zip(unknown, unknown))
    gene_position = pandas.DataFrame(columns=[u'Percent Peptide', u'Amino Acid Cut position'],
                                     data=position_rows)

    feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(), gene_position, pam_audit)
    inputs, dim, dimsum, feature_names = azimuth.util.concatenate_feature_sets(feature_sets)

    # call to scikit-learn, returns a vector of predicted values
    return model.predict(inputs)
def extract_features(Xdf, Y, gene_position, conservation_scores, order=2):
    """Featurize the guides and return a single float32 feature matrix.

    Args:
        Xdf: forwarded to ``featurize_data``.
        Y: forwarded to ``featurize_data``; its ``'score_drug_gene_rank'``
            column becomes the target vector.
        gene_position: forwarded to ``featurize_data``.
        conservation_scores: added to the feature sets under the key
            ``'conservation_scores'``. Its index is overwritten in place with
            the index of the ``'_nuc_pd_Order1'`` feature set.
        order: stored as ``learn_options['order']``.

    Returns:
        The tuple ``(combined_features, y, genes, feature_names)``, where
        ``combined_features`` and ``y`` are float32 and ``genes`` is taken
        from index level 1 of the feature sets, renumbered from 0.
    """
    learn_options = {
        'nuc_features': True,
        'num_proc': 1,
        'order': order,
        'gc_features': True,
        'include_pi_nuc_feat': True,
        "include_gene_position": True,
        "include_NGGX_interaction": True,
        "include_Tm": True,
        'include_known_pairs': False,
        'include_microhomology': False,
        'ignore_gene_level_for_inner_loop': True,  # <- what?
        "include_strand": False,
        "include_gene_feature": False,
        "include_gene_guide_feature": 0,
        "include_gene_effect": False,
        "include_drug": False,
        "include_sgRNAscore": False,
        "normalize_features": False
    }

    features = featurize_data(Xdf, learn_options, Y, gene_position,
                              pam_audit=True, length_audit=True)

    # Align the conservation scores with the other feature sets before they
    # are concatenated.
    conservation_scores.index = features['_nuc_pd_Order1'].index
    features['conservation_scores'] = conservation_scores

    # .values replaces Series.as_matrix(), which was removed in pandas 1.0.
    y = Y['score_drug_gene_rank'].astype('float32').values

    # we need the genes associated to the features to do cv data selection
    gene_level = features['conservation_scores'].index.get_level_values(1)
    genes = gene_level.to_series().reset_index(drop=True)

    combined_features, dim, dimsum, feature_names = concatenate_feature_sets(features)
    combined_features = combined_features.astype('float32')

    return combined_features, y, genes, feature_names
def predict(seq, aa_cut=None, percent_peptide=None, model=None, model_file=None, pam_audit=True, length_audit=False, learn_options_override=None):
    """Validate the inputs, featurize the sequences and return predictions.

    Args:
        seq: numpy array of 30 nt sequences.
        aa_cut: numpy array of amino acid cut positions (optional).
        percent_peptide: numpy array of percent peptide (optional).
        model: ``(model, learn_options)`` pair to use for prediction (optional).
            Takes precedence over ``model_file`` and over the packaged defaults.
        model_file: file name of pickled model to use for prediction (optional).
            When both ``model`` and ``model_file`` are None, one of the models
            packaged under ``saved_models`` is loaded.
        pam_audit: check PAM of each sequence.
        length_audit: check length of each sequence.
        learn_options_override: a dictionary indicating which learn_options to
            override (optional).

    Returns:
        a numpy array of predictions.

    Raises:
        AssertionError: if an input fails validation, or if every prediction
            is exactly 0 or 1.
    """
    assert isinstance(seq, (np.ndarray)), "Please ensure seq is a numpy array"
    # Check the array itself first: indexing seq[0] on an empty array would
    # raise IndexError before the assertion message could be shown.
    assert len(seq) > 0 and len(seq[0]) > 0, "Make sure that seq is not empty"
    assert isinstance(seq[0], str), "Please ensure input sequences are in string format, i.e. 'AGAG' rather than ['A' 'G' 'A' 'G'] or alternate representations"

    if aa_cut is not None:
        assert len(aa_cut) > 0, "Make sure that aa_cut is not empty"
        assert isinstance(aa_cut, (np.ndarray)), "Please ensure aa_cut is a numpy array"
        assert np.all(np.isreal(aa_cut)), "amino-acid cut position needs to be a real number"

    if percent_peptide is not None:
        assert len(percent_peptide) > 0, "Make sure that percent_peptide is not empty"
        assert isinstance(percent_peptide, (np.ndarray)), "Please ensure percent_peptide is a numpy array"
        assert np.all(np.isreal(percent_peptide)), "percent_peptide needs to be a real number"

    # NOTE(review): pickle.load can execute arbitrary code; only load trusted
    # model files.
    if model is not None:
        # A caller-supplied model always wins. Previously it was silently
        # replaced by the packaged default whenever model_file was None.
        model, learn_options = model
    elif model_file is not None:
        with open(model_file, 'rb') as f:
            model, learn_options = pickle.load(f, encoding='bytes')
    else:
        if np.any(percent_peptide == -1) or (percent_peptide is None and aa_cut is None):
            print("No model file specified, using V3_model_nopos")
            model_name = 'V3_model_nopos.pickle'
        else:
            print("No model file specified, using V3_model_full")
            model_name = 'V3_model_full.pickle'
        model_file = os.path.join('saved_models', model_name)
        print(model_file)
        with pkg_resources.resource_stream(__package__, model_file) as f:
            model, learn_options = pickle.load(f, encoding='bytes')

    learn_options["V"] = 2
    learn_options = override_learn_options(learn_options_override, learn_options)

    num_seqs = len(seq)
    Xdf = pandas.DataFrame(columns=['30mer', 'Strand'],
                           data=list(zip(seq, ['NA'] * num_seqs)))

    if np.all(percent_peptide != -1) and (percent_peptide is not None and aa_cut is not None):
        position_rows = list(zip(percent_peptide, aa_cut))
    else:
        unknown = np.ones(num_seqs) * -1
        position_rows = list(zip(unknown, unknown))
    gene_position = pandas.DataFrame(columns=['Percent Peptide', 'Amino Acid Cut position'],
                                     data=position_rows)

    feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(), gene_position,
                                       pam_audit=pam_audit, length_audit=length_audit)
    inputs, dim, dimsum, feature_names = azimuth.util.concatenate_feature_sets(feature_sets)

    # call to scikit-learn, returns a vector of predicted values
    preds = model.predict(inputs)

    # also check that predictions are not 0/1 from a classifier.predict()
    # (instead of predict_proba() or decision_function()); only the distinct
    # values need to be inspected.
    unique_preds = np.unique(preds)
    assert any(value not in (0, 1) for value in unique_preds), "model returned only 0s and 1s"
    return preds
def predict(seq, aa_cut=-1, percent_peptide=-1, model=None, model_file=None, pam_audit=True, length_audit=False, learn_options_override=None):
    """Validate the inputs, featurize the sequences and return predictions.

    If pam_audit==False, then it will not check for GG in the expected position;
    this is useful if predicting on PAM mismatches, such as with off-target.

    Args:
        seq: numpy array of sequence strings.
        aa_cut: numpy array of amino acid cut positions; None or the scalar -1
            (the default) mean "not supplied".
        percent_peptide: numpy array of percent peptide values; None or the
            scalar -1 (the default) mean "not supplied". Any -1 entry selects
            the position-free default model and fills both position columns
            with -1.
        model: optional ``(model, learn_options)`` pair; takes precedence over
            loading from ``model_file``.
        model_file: path of a pickled ``(model, learn_options)`` pair. When None,
            one of the models in azimuth's ``saved_models`` directory is chosen.
        pam_audit: forwarded to ``feat.featurize_data``.
        length_audit: forwarded to ``feat.featurize_data``.
        learn_options_override: forwarded to ``override_learn_options``.

    Returns:
        The predictions returned by ``model.predict``.

    Raises:
        AssertionError: if an input fails validation, or if every prediction
            is exactly 0 or 1.
    """
    assert isinstance(seq, (np.ndarray)), "Please ensure seq is a numpy array"
    # Check the array itself first: indexing seq[0] on an empty array would
    # raise IndexError before the assertion message could be shown.
    assert len(seq) > 0 and len(seq[0]) > 0, "Make sure that seq is not empty"
    assert isinstance(seq[0], str), "Please ensure input sequences are in string format, i.e. 'AGAG' rather than ['A' 'G' 'A' 'G'] or alternate representations"

    # The scalar -1 default is a sentinel, not data. It used to reach
    # len(-1) below and raise TypeError on a call that relied on the defaults.
    aa_cut_is_sentinel = not hasattr(aa_cut, '__len__') and aa_cut == -1
    if aa_cut is not None and not aa_cut_is_sentinel:
        assert isinstance(aa_cut, (np.ndarray)), "Please ensure aa_cut is a numpy array"
        assert len(aa_cut) > 0, "Make sure that aa_cut is not empty"
        assert np.all(np.isreal(aa_cut)), "amino-acid cut position needs to be a real number"

    percent_peptide_is_sentinel = not hasattr(percent_peptide, '__len__') and percent_peptide == -1
    if percent_peptide is not None and not percent_peptide_is_sentinel:
        assert isinstance(percent_peptide, (np.ndarray)), "Please ensure percent_peptide is a numpy array"
        assert len(percent_peptide) > 0, "Make sure that percent_peptide is not empty"
        assert np.all(np.isreal(percent_peptide)), "percent_peptide needs to be a real number"

    if model_file is None:
        azimuth_saved_model_dir = os.path.join(os.path.dirname(azimuth.__file__), 'saved_models')
        if np.any(percent_peptide == -1) or (percent_peptide is None and aa_cut is None):
            print("No model file specified, using V3_model_nopos")
            model_name = 'V3_model_nopos.pickle'
        else:
            print("No model file specified, using V3_model_full")
            model_name = 'V3_model_full.pickle'
        model_file = os.path.join(azimuth_saved_model_dir, model_name)

    if model is None:
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # model_file at trusted files.
        with open(model_file, 'rb') as f:
            model, learn_options = pickle.load(f)
    else:
        model, learn_options = model

    learn_options["V"] = 2
    learn_options = override_learn_options(learn_options_override, learn_options)

    num_seqs = len(seq)

    # zip() is materialised so the DataFrame constructor receives a list on
    # Python 3 as well, where zip() is a one-shot iterator.
    Xdf = pandas.DataFrame(columns=[u'30mer', u'Strand'],
                           data=list(zip(seq, ['NA'] * num_seqs)))

    if np.all(percent_peptide != -1) and (percent_peptide is not None and aa_cut is not None):
        position_rows = list(zip(percent_peptide, aa_cut))
    else:
        unknown = np.ones(num_seqs) * -1
        position_rows = list(zip(unknown, unknown))
    gene_position = pandas.DataFrame(columns=[u'Percent Peptide', u'Amino Acid Cut position'],
                                     data=position_rows)

    feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(), gene_position,
                                       pam_audit=pam_audit, length_audit=length_audit)
    inputs, dim, dimsum, feature_names = azimuth.util.concatenate_feature_sets(feature_sets)

    # call to scikit-learn, returns a vector of predicted values
    preds = model.predict(inputs)

    # also check that predictions are not 0/1 from a classifier.predict()
    # (instead of predict_proba() or decision_function()); only the distinct
    # values need to be inspected.
    unique_preds = np.unique(preds)
    assert any(value not in (0, 1) for value in unique_preds), "model returned only 0s and 1s"
    return preds
def predict(seq, aa_cut=-1, percent_peptide=-1, model=None, model_file=None, pam_audit=True, length_audit=False, learn_options_override=None):
    """Validate the inputs, featurize the sequences and return predictions.

    If pam_audit==False, then it will not check for GG in the expected position;
    this is useful if predicting on PAM mismatches, such as with off-target.

    Args:
        seq: numpy array of sequence strings.
        aa_cut: numpy array of amino acid cut positions; None or the scalar -1
            (the default) mean "not supplied".
        percent_peptide: numpy array of percent peptide values; None or the
            scalar -1 (the default) mean "not supplied". Any -1 entry selects
            the position-free default model and fills both position columns
            with -1.
        model: optional ``(model, learn_options)`` pair; takes precedence over
            loading from ``model_file``.
        model_file: path of a pickled ``(model, learn_options)`` pair. When None,
            one of the models in azimuth's ``saved_models`` directory is chosen.
        pam_audit: forwarded to ``feat.featurize_data``.
        length_audit: forwarded to ``feat.featurize_data``.
        learn_options_override: forwarded to ``override_learn_options``.

    Returns:
        The predictions returned by ``model.predict``.

    Raises:
        AssertionError: if an input fails validation, or if every prediction
            is exactly 0 or 1.
    """
    assert isinstance(seq, (np.ndarray)), "Please ensure seq is a numpy array"
    # len(seq) is tested before seq[0] is touched, so an empty array produces
    # the assertion message rather than an IndexError.
    assert len(seq) > 0 and len(seq[0]) > 0, "Make sure that seq is not empty"
    assert isinstance(seq[0], str), "Please ensure input sequences are in string format, i.e. 'AGAG' rather than ['A' 'G' 'A' 'G'] or alternate representations"

    # Validate the optional position arrays. The scalar -1 default is a
    # sentinel for "not supplied"; it previously fell into len(-1) and raised
    # TypeError on any call that relied on the defaults.
    for label, values, real_message in (
            ("aa_cut", aa_cut, "amino-acid cut position needs to be a real number"),
            ("percent_peptide", percent_peptide, "percent_peptide needs to be a real number")):
        if values is None:
            continue
        if not hasattr(values, '__len__') and values == -1:
            continue
        assert isinstance(values, (np.ndarray)), "Please ensure %s is a numpy array" % label
        assert len(values) > 0, "Make sure that %s is not empty" % label
        assert np.all(np.isreal(values)), real_message

    if model_file is None:
        azimuth_saved_model_dir = os.path.join(os.path.dirname(azimuth.__file__), 'saved_models')
        if np.any(percent_peptide == -1) or (percent_peptide is None and aa_cut is None):
            print("No model file specified, using V3_model_nopos")
            model_name = 'V3_model_nopos.pickle'
        else:
            print("No model file specified, using V3_model_full")
            model_name = 'V3_model_full.pickle'
        model_file = os.path.join(azimuth_saved_model_dir, model_name)

    if model is None:
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # model_file at trusted files.
        with open(model_file, 'rb') as f:
            model, learn_options = pickle.load(f)
    else:
        model, learn_options = model

    learn_options["V"] = 2
    learn_options = override_learn_options(learn_options_override, learn_options)

    num_seqs = len(seq)

    # zip() is materialised so the DataFrame constructor receives a list on
    # Python 3 as well, where zip() is a one-shot iterator.
    Xdf = pandas.DataFrame(columns=[u'30mer', u'Strand'],
                           data=list(zip(seq, ['NA'] * num_seqs)))

    if np.all(percent_peptide != -1) and (percent_peptide is not None and aa_cut is not None):
        position_rows = list(zip(percent_peptide, aa_cut))
    else:
        unknown = np.ones(num_seqs) * -1
        position_rows = list(zip(unknown, unknown))
    gene_position = pandas.DataFrame(columns=[u'Percent Peptide', u'Amino Acid Cut position'],
                                     data=position_rows)

    feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(), gene_position,
                                       pam_audit=pam_audit, length_audit=length_audit)
    inputs, dim, dimsum, feature_names = azimuth.util.concatenate_feature_sets(feature_sets)

    # call to scikit-learn, returns a vector of predicted values
    preds = model.predict(inputs)

    # also check that predictions are not 0/1 from a classifier.predict()
    # (instead of predict_proba() or decision_function()); only the distinct
    # values need to be inspected.
    unique_preds = np.unique(preds)
    assert any(value not in (0, 1) for value in unique_preds), "model returned only 0s and 1s"
    return preds