def main():
    """Benchmark three classifiers on every target file and print the results.

    Reads ``targets.txt`` as UTF-8. Empty lines and lines starting with ``#``
    are skipped; every other line is split on tabs and its second field is
    used as a data file name (a line with fewer than two fields raises
    ``IndexError``).

    For each classifier (decision tree, RBF SVM, SGD) every combination of
    span (5 and 25) and file is predicted repeatedly through
    ``predictor.prediction(file, span)``, which must return a
    ``(prediction, score)`` pair. Individual results and per-combination
    averages are printed as they are produced; once all classifiers have run,
    the averaged predictions and scores are printed again as tab-separated
    rows, one pair of rows per classifier.
    """
    target_file = 'targets.txt'
    # Use a context manager so the file is closed deterministically, and strip
    # only the newline: slicing with [:-1] would also cut the last character
    # of a final line that has no trailing newline.
    with open(target_file, 'r', encoding='utf-8') as handle:
        lines = [line.rstrip('\n') for line in handle]
    split = [
        line.split('\t') for line in lines
        if line and not line.startswith('#')
    ]
    files = [fields[1] for fields in split]
    spans = [5, 25]

    def do_prediction(predictor, count):
        """Average ``count`` predictions for every (span, file) combination.

        Returns two lists (averaged predictions, averaged scores) ordered by
        span first and file second.
        """
        results = []
        score_results = []
        for s in spans:
            for f in files:
                predicts = []
                scores = []
                for _ in range(count):
                    predict, score = predictor.prediction(f, s)
                    print("%d 日, %s, Prediction: %f, Score: %f"
                          % (s, f, predict, score))
                    predicts.append(predict)
                    scores.append(score)
                ave_predict = np.average(predicts)
                ave_score = np.average(scores)
                print("Average: %d 日, %s, Prediction: %f, Score: %f"
                      % (s, f, ave_predict, ave_score))
                results.append(ave_predict)
                score_results.append(ave_score)
        return results, score_results

    print("Decision Tree")
    dt_preds, dt_scores = do_prediction(
        Predictor.Predictor(tree.DecisionTreeClassifier()), 10)
    print("SVM")
    svm_preds, svm_scores = do_prediction(
        Predictor.Predictor(svm.SVC(kernel='rbf')), 5)
    print("SGD")
    sgd_preds, sgd_scores = do_prediction(
        Predictor.Predictor(lm.SGDClassifier()), 10)

    # Final summary: one tab-separated row of predictions and one of scores
    # per classifier, in the same order the classifiers were run.
    summary = (
        ("Decision Tree", dt_preds, dt_scores),
        ("SVM", svm_preds, svm_scores),
        ("SGD", sgd_preds, sgd_scores),
    )
    for label, preds, scores in summary:
        print(label)
        print('\t'.join(str(r) for r in preds))
        print('\t'.join(str(r) for r in scores))
def clue():
    """Generate a clue for the board sent in the request body.

    The request body is parsed as JSON. Element 0 supplies the
    ``invalid_guesses`` and ``decay`` entries; all remaining elements are
    passed on as the board. The response is built with ``jsonify`` and
    carries the chosen clue together with the scores rounded by ``round``.
    """
    payload = json.loads(request.data)
    # Slice the board before reading the header element, as the original did.
    board = payload[1:]
    header = payload[0]
    ids_to_score_path = 'static/numpy/ids_to_score.npy'
    excluded = set(header['invalid_guesses'])
    decay = header['decay']

    guesser = Predictor(board, ids_to_score_path, excluded, decay)
    best_clue, raw_scores = guesser.get_best_guess_and_scores()
    rounded_scores = list(map(round, raw_scores))
    return jsonify(clue=best_clue, scores=rounded_scores)
class PredictorTest(unittest.TestCase):
    """Unit tests for ``Predictor`` driven by the ``test_dataset.csv`` fixture."""

    def setUp(self):
        """Load the CSV fixture and build a fresh predictor for every test."""
        self.features = pd.read_csv('test_dataset.csv', sep=',')
        self.preditor = Predictor(self.features)

    def test_load_model(self):
        """``_load_model('lm.joblib')`` returns a ``BaseEstimator`` instance."""
        self.assertIsInstance(self.preditor._load_model('lm.joblib'),
                              BaseEstimator)

    def test_predict(self):
        """``predict`` returns a list with one non-negative value per row."""
        predictions = self.preditor.predict()
        self.assertIsInstance(predictions, list)
        # The number of predictions must match the number of input rows.
        # Compare by value: assertIs checks object identity, which for ints
        # only holds by accident for small cached values.
        self.assertEqual(len(predictions), self.features.shape[0])
        # Every prediction must be non-negative.
        self.assertGreaterEqual(min(predictions), 0)
def main():
    """Run the RL training step and report averaged comparison metrics.

    Uses the module-level ``config_file``. The generator is loaded with the
    unbiased weights, a ``Predictor`` is built for ``'kor'``, and
    ``policy_gradient`` is run once. ``compare_models`` is then called 20
    times and the mean of each returned metric is printed.
    """
    config, _ = load_config(config_file)

    # The Sequential instance is replaced immediately; the call is kept so
    # the sequence of constructor calls stays exactly as before.
    generator = Sequential()
    generator = Model(config)
    generator.model.load_weights(config.model_name_unbiased)

    kor_predictor = Predictor(config, 'kor')
    agent = Reinforcement(generator, kor_predictor, config)

    # policy_gradient returns four values; none of them is used afterwards.
    (_rewards_qed, _rewards_kor, _rewards_total,
     _previous_weights) = agent.policy_gradient()

    # Per-run metrics returned by compare_models.
    difs_kor = []    # KOR difference
    difs_qed = []    # QED difference
    divs = []        # diversity
    perc_valid = []  # validity
    uniqs = []       # uniqueness

    for run in range(20):
        print("\nGeneration test:{}".format(run))
        outcome = agent.compare_models(config.n_to_generate, True)
        dif_qed, dif_kor, valid, div, unique = outcome
        difs_qed.append(dif_qed)
        difs_kor.append(dif_kor)
        perc_valid.append(valid)
        divs.append(div)
        uniqs.append(unique)

    report = (
        ("\nMean value difference for KOR: ", difs_kor),
        ("Mean value difference for QED: ", difs_qed),
        ("Mean value diversity: ", divs),
        ("Mean value validity: ", perc_valid),
        ("Mean value uniqueness: ", uniqs),
    )
    for label, values in report:
        print(label + str(np.mean(values)))
def violin_plot(pred_identifier):
    """Draw a split violin plot comparing the two generators.

    Loads ``configReinforce.json``, builds a biased and an unbiased
    generator, calls ``generate2file`` once per generator with 100 as the
    count argument, and plots the data frame returned by
    ``properties_violin`` for the two ``Generated/*.smi`` files.

    pred_identifier: property name forwarded to ``properties_violin``; a
        ``Predictor`` for ``'kor'`` is created only when it equals
        ``'pIC50'``, otherwise ``None`` is passed to ``generate2file``.
    """
    config, _ = load_config('configReinforce.json')

    predictor = Predictor(config, 'kor') if pred_identifier == 'pIC50' else None

    # Each Sequential instance is replaced immediately; the calls are kept so
    # the sequence of constructor calls stays exactly as before.
    biased = Sequential()
    biased = Model(config)
    unbiased = Sequential()
    unbiased = Model(config)

    biased.model.load_weights(config.model_name_biased + ".h5")
    unbiased.model.load_weights(config.model_name_unbiased)

    # Unbiased generator first, then the biased one.
    generate2file(predictor, unbiased, config, 100, True)
    generate2file(predictor, biased, config, 100, False)

    plot_settings = {
        "axes.facecolor": "white",
        "axes.grid": False,
        'axes.labelsize': 20,
        'figure.figsize': (20.0, 10.0),
        'xtick.labelsize': 15,
        'ytick.labelsize': 15,
    }
    sns.set(rc=plot_settings)

    property_files = [
        'Generated/generated_prop_original.smi',
        'Generated/generated_prop_biased.smi',
    ]
    set_labels = ['Original generator', 'Fine-tuned generator']
    frame = properties_violin(property_files, set_labels, pred_identifier)

    sns.violinplot(x='Property', y='Value', hue='Sets', data=frame,
                   linewidth=1, split=True, bw=1, legend=False)
    sns.despine(left=True)
    plt.ylim([-3, 15])
def main():
    """Evaluate the generator for the configured property.

    Uses the module-level ``config_file`` and ``property_identifier``. No
    ``Predictor`` is created for ``'sas'`` or ``'qed'``. ``test_generator``
    is called once with iteration 85, then ``compare_models`` is called 20
    times and the mean difference, diversity and validity are printed.
    """
    config, _ = load_config(config_file)

    # The Sequential instance is replaced immediately; the call is kept so
    # the sequence of constructor calls stays exactly as before.
    generator = Sequential()
    generator = Model(config)
    generator.model.load_weights(config.model_name_unbiased)

    if property_identifier in ('sas', 'qed'):
        # These two properties are handled without a Predictor model.
        predictor = None
    else:
        predictor = Predictor(config, property_identifier)

    agent = Reinforcement(generator, predictor, config, property_identifier)

    # Six values are returned; none of them is used afterwards.
    (_smiles, _predictions, _valid, _unique, _div,
     _desirable) = agent.test_generator(config.n_to_generate, 85, False)

    # Per-run metrics returned by compare_models.
    difs = []
    divs = []
    perc_valid = []
    for run in range(20):
        print("BIASED GENERATION: {}".format(run))
        outcome = agent.compare_models(config.n_to_generate, True)
        dif, div, valid, _uniq, _desirable = outcome
        difs.append(dif)
        divs.append(div)
        perc_valid.append(valid)

    report = (
        ("Mean value difference: ", difs),
        ("Mean value diversity: ", divs),
        ("Mean value validity: ", perc_valid),
    )
    for label, values in report:
        print(label + str(np.mean(values)))
def main():
    """Train the generator with RL and compare it with the unbiased model.

    Uses the module-level ``config_file`` and ``property_identifier``. No
    ``Predictor`` is created for ``'sas'``. ``test_generator`` is called
    before training (iteration 0) and after ``policy_gradient`` (iteration
    25); both prediction sets go to ``plot_evolution``. ``compare_models``
    is then called 10 times.
    """
    config, _ = load_config(config_file)

    # The Sequential instance is replaced immediately; the call is kept so
    # the sequence of constructor calls stays exactly as before.
    generator = Sequential()
    generator = Model(config)
    generator.model.load_weights(config.model_name_unbiased)

    # The 'sas' property is handled without a Predictor model.
    predictor = (Predictor(config, property_identifier)
                 if property_identifier != 'sas' else None)

    agent = Reinforcement(generator, predictor, config, property_identifier)

    # Generation before and after the RL training step.
    _smiles_before, predictions_before = agent.test_generator(
        config.n_to_generate, 0, True)
    agent.policy_gradient()
    _smiles_after, predictions_after = agent.test_generator(
        config.n_to_generate, 25, False)

    plot_evolution(predictions_before, predictions_after)

    # Repeated direct comparison of the original and biased models.
    for run in range(10):
        print("BIASED GENERATION: {}".format(run))
        agent.compare_models(config.n_to_generate, True)
from flask import Flask, request, json
from flask_restful import Resource, Api
from flask_cors import CORS
import csv
from prediction import Predictor
import base64

# Flask application object; CORS is applied to the whole app.
app = Flask(__name__)
CORS(app)
# A single Predictor instance is created when this module is imported.
predictor = Predictor()


@app.route('/save', methods=["POST"])
def save():
    """Handle POST /save: parse the JSON body and pass it to ``writeToCSV``.

    Returns ``json_response("Success")`` when parsing, the ``"YEAR"`` lookup
    and ``writeToCSV`` all complete, otherwise ``json_response("Failure")``.
    """
    error = ''  # NOTE(review): never read in the visible code
    try:
        data = json.loads(request.data)
        print(data["YEAR"])
        writeToCSV(data)
        return json_response("Success")
    except Exception as e:
        # Any exception raised above (invalid JSON, missing "YEAR" key, or an
        # error from writeToCSV) is printed and reported to the client as
        # "Failure".
        print(e)
        return json_response("Failure")


@app.route('/predict', methods=["POST"])
def predict():
def main():
    """Main routine: prepare the data, train the models and evaluate them.

    Reads the enclosing-scope names ``config_file``, ``property_identifier``,
    ``model_type``, ``descriptor`` and ``searchParameters``.

    Steps, in order:
      1. Load the configuration and pass the checkpoint directory to
         ``directories``.
      2. Read SMILES strings and labels, then convert them either to ECFP
         (``SMILES_2_ECFP``) or to padded, tokenized integer sequences.
      3. When ``searchParameters`` is truthy, iterate over a fixed
         hyper-parameter grid and call ``Model`` for each combination after
         the first 264.
      4. Split the data, build the cross-validation folds and train one model
         per fold (``Model`` for ``'dnn'``, ``build_models`` otherwise).
      5. Build a ``Predictor``, predict two example SMILES strings and print
         the metrics returned by ``predictor.evaluator``.
    """
    # load model configurations
    config = load_config(config_file, property_identifier)
    directories([config.checkpoint_dir])

    # Load the table of possible tokens
    token_table = tokens_table().table

    # Read and extract smiles and labels from the csv file
    smiles_raw, labels_raw = reading_csv(config, property_identifier)

    if model_type != 'dnn' or descriptor == 'ECFP':
        # Transformation of data from SMILES strings to ECFP
        data_ecfp = SMILES_2_ECFP(smiles_raw)
    else:
        # Pad each SMILES string with spaces until reaching the size of the
        # largest molecule
        smiles_padded, padd = pad_seq(smiles_raw, token_table, 0)
        config.paddSize = padd

        # Compute the dictionary that makes the correspondence between each
        # token and unique integers
        tokenDict = smilesDict(token_table)

        # Tokenize - transform the SMILES strings into lists of tokens
        tokens = tokenize(smiles_padded, token_table)

        # Transforms each token to the respective integer, according to the
        # previously computed dictionary
        smiles_int = smiles2idx(tokens, tokenDict)

    if searchParameters:
        # NOTE(review): smiles_int is only bound in the else-branch above, so
        # this branch raises NameError when model_type != 'dnn' or
        # descriptor == 'ECFP' - confirm the search is SMILES-only.
        # Split data into training, validation and testing sets.
        data = data_division(config, smiles_int, labels_raw, False)

        # Normalize the label
        data, data_aux = normalize(data)

        # Drop Rate
        drop_rate = [0.1, 0.3, 0.5]
        # Batch size
        batch_size = [16, 32]
        # Learning Rate
        learning_rate = [0.001, 0.0001, 0.01]
        # Number of cells
        number_units = [64, 128, 256]
        # Activation function
        activation = ['linear', 'softmax', 'relu']
        # Memory cell
        rnn = ['LSTM', 'GRU']
        epochs = [100]
        counter = 0
        for dr in drop_rate:
            for bs in batch_size:
                for lr in learning_rate:
                    for nu in number_units:
                        for act in activation:
                            for nn in rnn:
                                for ep in epochs:
                                    # NOTE(review): ep only appears in this
                                    # identifier; it is never written to
                                    # config.
                                    param_identifier = [
                                        str(dr) + "_" + str(bs) + "_" + str(lr) + "_" +
                                        str(nu) + "_" + nn + "_" + act + "_" + str(ep)
                                    ]
                                    counter += 1
                                    # The first 264 combinations are skipped
                                    # (presumably to resume an interrupted
                                    # search - TODO confirm).
                                    if counter > 264:
                                        print("\nTesting this parameters: ")
                                        print(param_identifier)
                                        config.dropout = dr
                                        config.batch_size = bs
                                        config.lr = lr
                                        config.n_units = nu
                                        config.activation_rnn = act
                                        config.rnn = nn
                                        Model(config, data, searchParameters, descriptor)

    if model_type == 'dnn' and descriptor == 'SMILES':
        # Data splitting and Cross-Validation for the SMILES-based neural
        # network
        data_rnn_smiles = data_division(config, smiles_int, labels_raw, True,
                                        model_type, descriptor)
        x_test = data_rnn_smiles[2]
        y_test = data_rnn_smiles[3]
        data_cv = cv_split(data_rnn_smiles, config)
    elif model_type == 'dnn' and descriptor == 'ECFP':
        # Data splitting and Cross-Validation for the ECFP-based neural network
        data_rnn_ecfp = data_division(config, data_ecfp, labels_raw, True,
                                      model_type, descriptor)
        x_test = data_rnn_ecfp[2]
        y_test = data_rnn_ecfp[3]
        data_cv = cv_split(data_rnn_ecfp, config)
    else:
        # Data splitting, cross-validation and grid-search for the other
        # standard QSAR models
        data_otherQsar = data_division(config, data_ecfp, labels_raw, True,
                                       model_type, descriptor)
        x_test = data_otherQsar[2]
        y_test = data_otherQsar[3]
        data_cv = cv_split(data_otherQsar, config)
        best_params = grid_search(data_otherQsar, model_type)

    i = 0  # fold counter, also used in the saved model name
    utils = []  # one normalisation helper (data_aux) per fold
    metrics = []  # overwritten by predictor.evaluator below
    for split in data_cv:
        print('\nCross validation, fold number ' + str(i) + ' in progress...')
        data_i = []
        train, val = split

        if model_type != 'dnn' or descriptor == 'ECFP':
            X_train = data_ecfp.iloc[train, :]
            y_train = np.array(labels_raw)[train]
            X_val = data_ecfp.iloc[val, :]
            y_val = np.array(labels_raw)[val]
            y_train = y_train.reshape(-1, 1)
            y_val = y_val.reshape(-1, 1)
        else:
            X_train = smiles_int[train]
            y_train = np.array(labels_raw)[train]
            X_val = smiles_int[val]
            y_val = np.array(labels_raw)[val]
            y_train = y_train.reshape(-1, 1)
            y_val = y_val.reshape(-1, 1)

        # Fold data layout: [X_train, y_train, x_test, y_test, X_val, y_val]
        data_i.append(X_train)
        data_i.append(y_train)
        data_i.append(x_test)
        data_i.append(y_test)
        data_i.append(X_val)
        data_i.append(y_val)

        data_i, data_aux = normalize(data_i)
        utils.append(data_aux)
        config.model_name = "model" + str(i)

        if model_type == 'dnn':
            Model(config, data_i, False, descriptor)
        else:
            build_models(data_i, model_type, config, best_params)
        i += 1

    # Model's evaluation with two example SMILES strings
    predictor = Predictor(config, token_table, model_type, descriptor)
    list_ss = [
        "CC(=O)Nc1cccc(C2(C)CCN(CCc3ccccc3)CC2C)c1",
        "CN1CCC23CCCCC2C1Cc1ccc(O)cc13"
    ]  # 5.96 and 8.64 (presumably reference values for these two - TODO confirm)
    prediction = predictor.predict(list_ss, utils)
    print(prediction)

    # Model's evaluation with the test set
    # NOTE(review): data_i here is the data of the last cross-validation fold.
    metrics = predictor.evaluator(data_i)

    if model_type == 'dnn':
        print("\n\nMean_squared_error: ", metrics[0], "\nQ_squared: ",
              metrics[1], "\nRoot mean squared: ", metrics[2], "\nCCC: ",
              metrics[3])
    else:
        print("\n\nMean_squared_error: ", metrics[0], "\nQ_squared: ",
              metrics[1])
def main():
    """Command-line entry point: run the stage named by ``pipeline_type``.

    Recognised stages are ``analysis``, ``model_selection``, ``training``
    and ``prediction``; any other value does nothing beyond parsing the
    arguments and creating the ``FeatureExtractor``.
    """
    args = get_parser().parse_args()
    feature_extractor = FeatureExtractor()
    stage = args.pipeline_type

    if stage == "analysis":
        preprocessor = TextPreProcessor(
            stop_words_file_path=args.stopwords_file_path)
        analyser = DataAnalyser(input_file=args.input_file_path,
                                text_preprocessor=preprocessor)
        analyser.get_data_distribution(plot_bar=args.plot_bar)
        analyser.get_word_weights(word_thresh=args.word_thresh)
        if args.word_cloud:
            analyser.generate_word_cloud()
        return

    if stage == "model_selection":
        preprocessor = TextPreProcessor(
            stop_words_file_path=args.stopwords_file_path)
        frame = load_training_data(args.train_file_path)
        frame["sentence"] = frame["sentence"].map(preprocessor.process)
        features = feature_extractor.get_features_for_training(
            frame["sentence"], args.vectorizer)
        labels = frame["class"]
        # One boolean switch per candidate classifier, forwarded unchanged.
        classifier_flags = {
            flag: getattr(args, flag)
            for flag in (
                "use_svm",
                "use_naive_bayes",
                "use_random_forest",
                "use_logistic_regression",
                "use_xgboost",
                "use_gradient_boosting",
            )
        }
        apply_cross_validation(
            features=features,
            labels=labels,
            k_folds=args.kfolds,
            plot_cv_graph=True,
            **classifier_flags
        )
        return

    if stage == "training":
        trainer = Trainer(
            train_file_path=args.train_file_path,
            val_file_path=args.val_file_path,
            stop_words_file_path=args.stopwords_file_path,
            model_name=args.best_model,
            feature_extractor=feature_extractor,
        )
        train_frame = load_training_data(args.train_file_path)
        trainer.train(
            train_frame,
            split_test_size=args.split_size,
            vectorizer_name=args.vectorizer,
            get_classification_report=args.get_classification_report,
            get_confusion_matrix=args.get_confusion_matrix,
        )
        validation_frame = load_validation_data(args.val_file_path)
        trainer.validate(validation_frame, vectorizer_name=args.vectorizer)
        if args.model_check_point_path:
            trainer.save_trained_model(args.model_check_point_path)
        return

    if stage == "prediction":
        # Without a stop-words file the Predictor is built with its defaults.
        if args.stopwords_file_path:
            predictor = Predictor(stop_words_file=args.stopwords_file_path)
        else:
            predictor = Predictor()
        if args.input_file_path:
            predictor.predict_csv(args.input_file_path, args.output_file_path,
                                  args.model_path)
        if args.test_input:
            model, vectorizer = predictor.unpickle_the_model(args.model_path)
            predictor.predict(args.test_input, model, vectorizer)
def setUp(self):
    """Load the CSV fixture and build the predictor under test."""
    dataset = pd.read_csv('test_dataset.csv', sep=',')
    self.features = dataset
    self.preditor = Predictor(dataset)
def index(request):
    """Render the landing page with news articles, events and recommendations.

    News articles are fetched from newsapi.org. A failed or malformed
    response is logged and yields an empty article list, so the page still
    renders. For authenticated users the view also collects the keys of
    their group ratings above 3.0 and the movies returned by
    ``Predictor().predictTop``, attaching a poster path where one can be
    extracted from the stored ``images`` text. Movies are paginated six per
    page; a non-integer page falls back to page 1 and an out-of-range page
    to the last page.

    request: the incoming HTTP request; the ``page`` query parameter selects
        the page of recommended movies.

    Returns the rendered ``landing/index.html`` response.
    """
    import logging

    url = 'https://newsapi.org/v2/everything'
    # NOTE(review): the API key is committed in source; consider loading it
    # from settings or the environment instead.
    params = {'q': 'Movie', 'apiKey': '1007b5cbd3c14bedbc1d0308289852e8'}
    try:
        # A timeout keeps a slow news service from blocking the landing page.
        r = requests.get(url=url, params=params, timeout=10)
        # Error payloads carry no 'articles' key; treat them as "no news".
        articles = r.json().get('articles', [])
    except (requests.RequestException, ValueError):
        logging.getLogger(__name__).warning(
            "Could not fetch news articles", exc_info=True)
        articles = []

    rating_filter = {}
    movies = []
    movie_data_dd = defaultdict(dict)
    movie_data_ddd = []

    if request.user.is_authenticated():
        user_id = request.user.id
        grm = group_rating_matrix()
        ratings = grm.group_ratings(user_id)
        pr = Predictor()
        predictions = pr.predictTop(user_id)

        # items() works on both Python 2 and 3, unlike iteritems().
        rating_filter = [
            key for key, value in ratings.items() if value > 3.0
        ]

        for x in predictions:
            movie = MovieData.objects.get(movieid=x)
            if Movies.objects.filter(pk=x).exists():
                movie_data_d = Movies.objects.get(pk=x)
                # Pull the first poster path out of the stored images text.
                m = re.search("'path': u'(.+?)',", movie_data_d.images)
                if m:
                    movie_data_dd[x]['path'] = m.group(1)
            movies.append(movie)

    page = request.GET.get('page', 1)
    paginator = Paginator(movies, 6)
    try:
        movies = paginator.page(page)
    except PageNotAnInteger:
        movies = paginator.page(1)
    except EmptyPage:
        movies = paginator.page(paginator.num_pages)

    # Attach poster data after the page is resolved so the fallback pages get
    # it too (previously only the non-fallback path did).
    for movie in movies:
        if movie.movieid in movie_data_dd:
            movie.data = movie_data_dd[movie.movieid]

    events = Event.objects.all()
    form2 = CustomAuthForm()
    return render(request, 'landing/index.html', {
        'form2': form2,
        'events': events,
        'ratings': rating_filter,
        'movies': movies,
        'movie_data': movie_data_ddd,
        'articles': articles,
    })
def main():
    """Main routine: prepare the data, train the classifiers and evaluate.

    Reads the enclosing-scope names ``config_file``, ``property_identifier``,
    ``model_type``, ``descriptor`` and ``searchParameters``.

    Steps, in order:
      1. Load the configuration and pass the checkpoint directory to
         ``directories``.
      2. Read SMILES strings and labels, print the label sum, then pad,
         tokenize and integer-encode the SMILES strings.
      3. When ``searchParameters`` is truthy, iterate over a fixed
         hyper-parameter grid and call ``Model`` for each combination after
         the first 304.
      4. Split the data, build the cross-validation folds and, for
         ``'dnn'``, train one ``Model`` per fold.
      5. Build a ``Predictor`` and print the five metrics returned by
         ``predictor.evaluator``.

    NOTE(review): only the ``'dnn'`` + ``'SMILES'`` path has all of its
    inputs bound in the visible code; see the notes on the other branches.
    """
    # load model configurations
    config = load_config(config_file, property_identifier)
    directories([config.checkpoint_dir])

    # Load the table of possible tokens
    token_table = tokens_table().table

    # Read and extract smiles and labels from the csv file
    smiles_raw, labels_raw = reading_csv(config, property_identifier)
    print("BBB+: ", np.sum(labels_raw))

    # Disabled: Morgan fingerprint computation and SMOTE resampling.
    # mols = [Chem.MolFromSmiles(x) for x in smiles_raw]
    #
    # morgan_fp = [Chem.GetMorganFingerprintAsBitVect(x, 2, nBits = 2048) for x in mols]
    #
    # # convert the RDKit explicit vectors into numpy arrays
    # morg_fp_np = []
    # for fp in morgan_fp:
    #     arr = np.zeros((1,))
    #     DataStructs.ConvertToNumpyArray(fp, arr)
    #     morg_fp_np.append(arr)
    #
    # x_morg = morg_fp_np
    #
    # x_morg_rsmp, y_morg_rsmp = SMOTE().fit_resample(x_morg, labels_raw)

    # Pad each SMILES string with spaces until reaching the size of the
    # largest molecule
    smiles_padded, padd = pad_seq(smiles_raw, token_table, 0)
    config.paddSize = padd

    # Compute the dictionary that makes the correspondence between each token
    # and unique integers
    tokenDict = smilesDict(token_table)

    # Tokenize - transform the SMILES strings into lists of tokens
    tokens = tokenize(smiles_padded, token_table)

    # Transforms each token to the respective integer, according to the
    # previously computed dictionary
    smiles_int = smiles2idx(tokens, tokenDict)

    if searchParameters:
        # Split data into training, validation and testing sets.
        data = data_division(config, smiles_int, labels_raw, False, model_type,
                             descriptor)

        # Normalize the label
        data, data_aux = normalize(data)

        # Drop Rate
        drop_rate = [0.1, 0.3, 0.5]
        # Batch size
        batch_size = [16, 32]
        # Learning Rate
        learning_rate = [0.001, 0.0001, 0.01]
        # Number of cells
        number_units = [64, 128, 256]
        # Activation function
        activation = ['linear', 'softmax', 'relu']
        # Memory cell
        rnn = ['LSTM', 'GRU']
        epochs = [100]
        counter = 0
        for dr in drop_rate:
            for bs in batch_size:
                for lr in learning_rate:
                    for nu in number_units:
                        for act in activation:
                            for nn in rnn:
                                for ep in epochs:
                                    # NOTE(review): ep only appears in this
                                    # identifier; it is never written to
                                    # config.
                                    param_identifier = [
                                        str(dr) + "_" + str(bs) + "_" + str(lr) + "_" +
                                        str(nu) + "_" + nn + "_" + act + "_" + str(ep)
                                    ]
                                    counter += 1
                                    # The first 304 combinations are skipped
                                    # (presumably to resume an interrupted
                                    # search - TODO confirm).
                                    if counter > 304:
                                        print("\nTesting this parameters: ")
                                        print(param_identifier)
                                        config.dropout = dr
                                        config.batch_size = bs
                                        config.lr = lr
                                        config.n_units = nu
                                        config.activation_rnn = act
                                        config.rnn = nn
                                        Model(config, data, searchParameters, descriptor)

    if model_type == 'dnn' and descriptor == 'SMILES':
        # Data splitting and Cross-Validation for the SMILES-based neural
        # network
        data_rnn_smiles = data_division(config, smiles_int, labels_raw, True,
                                        model_type, descriptor)
        x_test = data_rnn_smiles[2]
        y_test = data_rnn_smiles[3]
        data_cv = cv_split(data_rnn_smiles, config)
    elif model_type == 'dnn' and descriptor == 'ECFP':
        # Data splitting and Cross-Validation for the ECFP-based neural network
        # NOTE(review): x_morg_rsmp and y_morg_rsmp are only assigned in the
        # disabled block above, so this branch raises NameError unless they
        # exist in an enclosing scope - confirm.
        data_rnn_ecfp = data_division(config, x_morg_rsmp, y_morg_rsmp, True,
                                      model_type, descriptor)
        x_test = data_rnn_ecfp[2]
        y_test = data_rnn_ecfp[3]
        data_cv = cv_split(data_rnn_ecfp, config)
    else:
        # Data splitting, cross-validation and grid-search for the other
        # standard QSAR models
        # NOTE(review): data_ecfp is not assigned anywhere in this function -
        # confirm it is provided by an enclosing scope.
        data_otherQsar = data_division(config, data_ecfp, labels_raw, True,
                                       model_type, descriptor)
        x_test = data_otherQsar[2]
        y_test = data_otherQsar[3]
        data_cv = cv_split(data_otherQsar, config)
        best_params = grid_search(data_otherQsar, model_type)

    i = 0  # fold counter, also used in the saved model name
    # utils = []
    metrics = []  # overwritten by predictor.evaluator below
    for split in data_cv:
        print('\nCross validation, fold number ' + str(i) + ' in progress...')
        data_i = []
        train, val = split

        if model_type != 'dnn' or descriptor == 'ECFP':
            # NOTE(review): data_rnn_ecfp is only bound in the 'dnn' + 'ECFP'
            # branch above; for model_type != 'dnn' this raises NameError.
            X_train = data_rnn_ecfp[0][train]
            y_train = np.array(data_rnn_ecfp[1])[train]
            X_val = data_rnn_ecfp[0][val]
            y_val = np.array(data_rnn_ecfp[1])[val]
            y_train = y_train.reshape(-1, 1)
            y_val = y_val.reshape(-1, 1)
        else:
            X_train = data_rnn_smiles[0][train]
            y_train = np.array(data_rnn_smiles[1])[train]
            X_val = data_rnn_smiles[0][val]
            y_val = np.array(data_rnn_smiles[1])[val]
            # X_train = smiles_int[train]
            # y_train = np.array(labels_raw)[train]
            # X_val = smiles_int[val]
            # y_val = np.array(labels_raw)[val]
            y_train = y_train.reshape(-1, 1)
            y_val = y_val.reshape(-1, 1)

        # Fold data layout: [X_train, y_train, x_test, y_test, X_val, y_val]
        data_i.append(X_train)
        data_i.append(y_train)
        data_i.append(x_test)
        data_i.append(y_test)
        data_i.append(X_val)
        data_i.append(y_val)

        # data_i,data_aux = normalize(data_i)
        # utils.append(data_aux)
        config.model_name = "model" + str(i)

        if model_type == 'dnn':
            Model(config, data_i, False, descriptor)
        # else:
        #     build_models(data_i,model_type,config,best_params)
        i += 1

    # Model's evaluation with two example SMILES strings
    predictor = Predictor(config, token_table, model_type, descriptor)
    # list_ss = ["NC(=O)c1cccc(OC2CC3CCC(C2)N3C2(c3ccccc3)CC2)c1","CN(C)C(CNC(CN)Cc1ccc(O)cc1)Cc1ccc(O)cc1"]  # 3.85 and 1.73
    # prediction = predictor.predict(list_ss,utils)
    # print(prediction)

    # Model's evaluation with the test set
    # NOTE(review): data_i here is the data of the last cross-validation fold.
    metrics = predictor.evaluator(data_i)
    print("\n\nAccuracy: ", metrics[0], "\nAUC: ", metrics[1],
          "\nSpecificity: ", metrics[2], "\nSensitivity: ", metrics[3],
          "\nMCC: ", metrics[4])