def calculateScore(seq, model):
    """Return the Rule Set 2 on-target score of ``seq`` as a string.

    Args:
        seq: Target sequence. Characters 25-26 (0-based) must be "GG", i.e.
            the sequence must carry an NGG PAM at that position.
            NOTE(review): the length of ``seq`` is not validated here.
        model: Model object forwarded unchanged to
            ``model_comparison.predict``.

    Returns:
        ``str(score)`` when the PAM is NGG; otherwise ``None`` after a notice
        has been written to stderr.
    """
    if seq[25:27] != 'GG':
        # sys.stderr.write() works on both Python 2 and Python 3, whereas the
        # ``print >> sys.stderr`` statement raises TypeError on Python 3.
        sys.stderr.write(
            'Calculates on-target scores for sgRNAs with NGG PAM only.\n')
        return None

    # The two -1 arguments fill the amino-acid-cut / percent-peptide slots;
    # presumably -1 means "not available" -- confirm against predict().
    score = model_comparison.predict(seq, -1, -1, model)
    return str(score)
def main():
    """Score the test data set and write the predictions as one CSV row.

    Command-line arguments:
        sys.argv[1]: model file handed to ``model_comparison.predict``.
        sys.argv[2]: path of the CSV file to create (overwritten if present).

    Earlier revisions hard-coded 'saved_models/PAM_nopos.pickle' as the model
    and "NGGXX.csv" as the output file.
    """
    g, cp, pp = testdata.get_test_data("data/testdata.xlsx")
    results = model_comparison.predict(g, cp, pp, model_file=sys.argv[1])

    # The context manager guarantees the row is flushed and the handle is
    # closed even if writerow() raises.
    with open(sys.argv[2], "w") as out_file:
        csv.writer(out_file).writerow(results)
def calcFusiDoench(seqs):
    """Score 30-mers with the pickled V3 "nopos" model.

    Input is a 30mer: 4bp 5', 20bp guide, 3bp PAM, 3bp 3'.
    Based on source code sent by John Doench.

    Args:
        seqs: Iterable of sequence strings.

    Returns:
        A list with one entry per input sequence:
        ``int(round(100 * score))``, or -1 if the sequence contains an "N".
        Sequences whose characters 25-26 are not "GG" are NOT rejected; those
        two characters are overwritten with "GG" before scoring.

    Option dump recorded with the original code (apparently the
    learn_options the model was trained with -- TODO confirm):

    {'include_strand': False, 'weighted': None, 'num_thread_per_proc': None,
     'extra pairs': False, 'gc_features': True,
     'test_genes': array([u'CD5', u'CD45', u'THY1', u'H2-K', u'CD28',
         u'CD43', 'CD33', 'CD13', 'CD15', u'HPRT1', u'CCDC101', u'MED12',
         u'TADA2B', u'TADA1', u'CUL3', u'NF1', u'NF2'], dtype=object),
     'testing_non_binary_target_name': 'ranks',
     'train_genes': array([u'CD5', u'CD45', u'THY1', u'H2-K', u'CD28',
         u'CD43', 'CD33', 'CD13', 'CD15', u'HPRT1', u'CCDC101', u'MED12',
         u'TADA2B', u'TADA1', u'CUL3', u'NF1', u'NF2'], dtype=object),
     'cv': 'gene', 'adaboost_alpha': 0.5, 'all pairs': False,
     'binary target name': 'score_drug_gene_threshold',
     'normalize_features': False, 'nuc_features': True,
     'include_gene_effect': False, 'num_genes_remove_train': None,
     'include_gene_guide_feature': 0, 'include_known_pairs': False,
     'include_gene_feature': False, 'training_metric': 'spearmanr',
     'num_proc': 8, 'include_drug': False, 'include_microhomology': False,
     'V': 3, 'include_Tm': True, 'adaboost_loss': 'ls',
     'rank-transformed target name': 'score_drug_gene_rank',
     'include_pi_nuc_feat': True, 'include_sgRNAscore': False,
     'flipV1target': False, 'include_NGGX_interaction': True, 'seed': 1,
     'NDGC_k': 10, 'raw target name': None,
     'all_genes': array([u'CD5', u'CD45', u'THY1', u'H2-K', u'CD28',
         u'CD43', 'CD33', 'CD13', 'CD15', u'HPRT1', u'CCDC101', u'MED12',
         u'TADA2B', u'TADA1', u'CUL3', u'NF1', u'NF2'], dtype=object),
     'order': 2, 'include_gene_position': False}
    """
    # Both positional inputs are fixed at 0 for the "nopos" model.
    aa_cut = 0
    per_peptide = 0

    model_path = join(fusiDir, 'saved_models/V3_model_nopos.pickle')
    # Pickles are binary data: open with 'rb' and let the context manager
    # close the handle instead of leaking it.
    with open(model_path, 'rb') as f:
        # if this fails, install sklearn like this:
        #   pip install scikit-learn==0.16.1
        model = pickle.load(f)

    res = []
    for seq in seqs:
        if "N" in seq:
            res.append(-1)  # can't do Ns
            continue

        if seq[25:27] != "GG":
            # Score non-NGG targets as if their PAM were NGG. Item assignment
            # on a list keeps the IndexError for sequences that are too short.
            bases = list(seq)
            bases[25] = "G"
            bases[26] = "G"
            seq = "".join(bases)

        score = model_comparison.predict(seq, aa_cut, per_peptide, model=model)
        res.append(int(round(100 * score)))
    return res
def get_rs2_score(seq, model_file):
    """Compute the Rule Set 2 on-target score for one 30-mer.

    Args:
        seq: Target sequence; it is upper-cased before use and must be exactly
            30 characters long with "GG" at positions 25-26 (0-based).
        model_file: Path to the pickled model.

    Returns:
        The value returned by ``model_comparison.predict``, or ``None`` when
        the sequence is not a 30-mer (message on stdout) or does not carry an
        NGG PAM (message on stderr).

    Raises:
        Exception: if the model file cannot be opened or unpickled.
    """
    seq = seq.upper()
    if len(seq) != 30:
        print("Please enter a 30mer sequence.")
        return None

    # NOTE: pickle.load() can execute arbitrary code -- only pass model files
    # from a trusted source.
    try:
        with open(model_file, 'rb') as f:
            model = pickle.load(f)
    except Exception:
        # ``except Exception`` (not a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate instead of being reported as a model error.
        raise Exception(
            "could not find model stored to file %s. Perhaps the scikit-learn package is of another version." % model_file)

    if seq[25:27] != 'GG':
        # sys.stderr.write() works on Python 2 and 3; the former
        # ``print >> sys.stderr`` statement raises TypeError on Python 3.
        sys.stderr.write(
            'Calculates on-target scores for sgRNAs with NGG PAM only.\n')
        return None

    # The two -1 arguments fill the amino-acid-cut / percent-peptide slots;
    # presumably -1 means "not available" -- confirm against predict().
    return model_comparison.predict(seq, -1, -1, model=model)
def calcFusiDoench(seqs):
    """Score 30-mers with the pickled V3 "nopos" model.

    Input is a 30mer: 4bp 5', 20bp guide, 3bp PAM, 3bp 3'.
    Based on source code sent by John Doench.

    Args:
        seqs: Iterable of sequence strings.

    Returns:
        A list with one entry per input sequence:
        ``int(round(100 * score))``, or -1 if the sequence still contains an
        "N" after its PAM has been normalised. Sequences whose characters
        25-26 are not "GG" are NOT rejected; those two characters are
        overwritten with "GG" before scoring.

    Option dump recorded with the original code (apparently the
    learn_options the model was trained with -- TODO confirm):

    {'include_strand': False, 'weighted': None, 'num_thread_per_proc': None,
     'extra pairs': False, 'gc_features': True,
     'test_genes': array([u'CD5', u'CD45', u'THY1', u'H2-K', u'CD28',
         u'CD43', 'CD33', 'CD13', 'CD15', u'HPRT1', u'CCDC101', u'MED12',
         u'TADA2B', u'TADA1', u'CUL3', u'NF1', u'NF2'], dtype=object),
     'testing_non_binary_target_name': 'ranks',
     'train_genes': array([u'CD5', u'CD45', u'THY1', u'H2-K', u'CD28',
         u'CD43', 'CD33', 'CD13', 'CD15', u'HPRT1', u'CCDC101', u'MED12',
         u'TADA2B', u'TADA1', u'CUL3', u'NF1', u'NF2'], dtype=object),
     'cv': 'gene', 'adaboost_alpha': 0.5, 'all pairs': False,
     'binary target name': 'score_drug_gene_threshold',
     'normalize_features': False, 'nuc_features': True,
     'include_gene_effect': False, 'num_genes_remove_train': None,
     'include_gene_guide_feature': 0, 'include_known_pairs': False,
     'include_gene_feature': False, 'training_metric': 'spearmanr',
     'num_proc': 8, 'include_drug': False, 'include_microhomology': False,
     'V': 3, 'include_Tm': True, 'adaboost_loss': 'ls',
     'rank-transformed target name': 'score_drug_gene_rank',
     'include_pi_nuc_feat': True, 'include_sgRNAscore': False,
     'flipV1target': False, 'include_NGGX_interaction': True, 'seed': 1,
     'NDGC_k': 10, 'raw target name': None,
     'all_genes': array([u'CD5', u'CD45', u'THY1', u'H2-K', u'CD28',
         u'CD43', 'CD33', 'CD13', 'CD15', u'HPRT1', u'CCDC101', u'MED12',
         u'TADA2B', u'TADA1', u'CUL3', u'NF1', u'NF2'], dtype=object),
     'order': 2, 'include_gene_position': False}
    """
    # Both positional inputs are fixed at 0 for the "nopos" model.
    aa_cut = 0
    per_peptide = 0

    model_path = join(fusiDir, "saved_models/V3_model_nopos.pickle")
    # Pickles are binary data: open with "rb" and let the context manager
    # close the handle instead of leaking it.
    with open(model_path, "rb") as f:
        # if this fails, install sklearn like this:
        #   pip install scikit-learn==0.16.1
        model = pickle.load(f)

    res = []
    for seq in seqs:
        if seq[25:27] != "GG":
            # Score non-NGG targets as if their PAM were NGG. Item assignment
            # on a list keeps the IndexError for sequences that are too short.
            bases = list(seq)
            bases[25] = "G"
            bases[26] = "G"
            seq = "".join(bases)

        # Checked only after the PAM overwrite, so an "N" at positions 25-26
        # is tolerated while an "N" anywhere else yields -1.
        if "N" in seq:
            res.append(-1)
            continue

        score = model_comparison.predict(seq, aa_cut, per_peptide, model=model)
        res.append(int(round(100 * score)))
    return res
help='Amino acid cut position of sgRNA') parser.add_argument('--per-peptide', type=float, default=None, help='Percentage of protein cut by sgRNA') return parser if __name__ == '__main__': args = get_parser().parse_args() seq = args.seq.upper() if len(seq)!=30: print "Please enter a 30mer sequence." sys.exit(1) aa_cut = args.aa_cut per_peptide = args.per_peptide model_file_1 = '../saved_models/V3_model_nopos.pickle' model_file_2 = '../saved_models/V3_model_full.pickle' if (aa_cut == None) or (per_peptide == None): model_file = model_file_1 else: model_file = model_file_2 try: with open(model_file, 'rb') as f: model= pickle.load(f) except: raise Exception("could not find model stored to file %s" % model_file) if seq[25:27] == 'GG': score = model_comparison.predict(seq, aa_cut, per_peptide, model=model) print 'Rule set 2 score: %.4f'% (score) else: print >> sys.stderr, 'Calculates on-target scores for sgRNAs with NGG PAM only.'
def _predict_construct_rows(start, stop):
    """Run ``predict`` on rows ``start:stop`` of the module-level ``table``.

    Each 'Construct IDs' entry ``x`` is rewritten to
    ``x[1:26] + 'G' + x[27:31]``: the first character is dropped, character
    26 is replaced by 'G', and everything from index 31 on is discarded.
    The 'Amino Acid' and 'Pct Pep' columns of the same rows are passed along
    as float arrays.
    """
    rows = slice(start, stop)
    guides = table['Construct IDs'][rows].apply(
        lambda x: x[1:26] + 'G' + x[27:31])
    return predict(np.asarray(guides.values, dtype=str),
                   np.asarray(table["Amino Acid"][rows].values, dtype=float),
                   np.asarray(table["Pct Pep"][rows].values, dtype=float))


# Per-gene evaluation: Spearman correlation between the predictions and the
# "PctRank" column. The row ranges are hard-coded per gene.
# Single-argument print(...) behaves identically on Python 2 and 3.
NF1_PREDICT = _predict_construct_rows(3366, 4212)
NF1_SCORE = np.asarray(table["PctRank"][3366:4212])
print("NF1")
print(scipy.stats.spearmanr(NF1_PREDICT, NF1_SCORE))

NF2_PREDICT = _predict_construct_rows(4212, 4515)
NF2_SCORE = np.asarray(table["PctRank"][4212:4515])
print("NF2")
print(scipy.stats.spearmanr(NF2_PREDICT, NF2_SCORE))

"""
for i in xrange(7):
    NUDT5_PREDICT = predict(np.asarray(table['Construct IDs'].apply(lambda x: x[i:25+i] + 'GG' + x[27+i:30+i]).values, dtype=str),
                            np.asarray(table["Amino Acid"].values, dtype=float),
                            np.asarray(table["Pct Pep"].values, dtype=float))
    NUDT5_SCORE = np.asarray(table["LFC"].values, dtype=float)
    # print "NUDT5"
    print i
    print scipy.stats.spearmanr(NUDT5_PREDICT, NUDT5_SCORE)
"""

PELP1_PREDICT = _predict_construct_rows(4606, 6925)
PELP1_SCORE = np.asarray(table["PctRank"][4606:6925])
print("PELP1")
print(scipy.stats.spearmanr(PELP1_PREDICT, PELP1_SCORE))

TFRC_PREDICT = _predict_construct_rows(6925, 7465)