예제 #1
0
def main():
    """Benchmark three classifiers over every target file and span.

    Reads ``targets.txt`` (tab-separated; blank lines and lines starting with
    ``#`` are ignored) and takes the second column of each row as the file to
    predict on. For every (span, file) pair each classifier is run several
    times; the individual and averaged predictions/scores are printed, and a
    tab-separated summary per classifier is printed at the end.

    Raises:
        IndexError: if a non-comment row has no second tab-separated column.
    """
    target_file = 'targets.txt'

    # Close the file deterministically, and strip only the line terminator:
    # slicing with [:-1] removed a real character from a final line that had
    # no trailing newline.
    with open(target_file, 'r', encoding='utf-8') as handle:
        lines = [line.rstrip('\n') for line in handle]
    split = [
        line.split('\t') for line in lines
        if line and not line.startswith('#')
    ]
    files = [fields[1] for fields in split]
    spans = [5, 25]

    def do_prediction(predictor, count):
        """Run ``predictor`` ``count`` times per (span, file) pair.

        Returns two parallel lists ordered span-major, file-minor: the
        averaged predictions and the averaged scores.
        """
        results = []
        score_results = []
        for s in spans:
            for f in files:
                predicts = []
                scores = []
                for _ in range(count):
                    predict, score = predictor.prediction(f, s)
                    print("%d 日, %s, Prediction: %f, Score: %f" %
                          (s, f, predict, score))
                    predicts.append(predict)
                    scores.append(score)

                ave_predict = np.average(predicts)
                ave_score = np.average(scores)
                print("Average: %d 日, %s, Prediction: %f, Score: %f" %
                      (s, f, ave_predict, ave_score))
                results.append(ave_predict)
                score_results.append(ave_score)
        return results, score_results

    print("Decision Tree")
    dt_preds, dt_scores = do_prediction(
        Predictor.Predictor(tree.DecisionTreeClassifier()), 10)
    print("SVM")
    svm_preds, svm_scores = do_prediction(
        Predictor.Predictor(svm.SVC(kernel='rbf')), 5)
    print("SGD")
    sgd_preds, sgd_scores = do_prediction(
        Predictor.Predictor(lm.SGDClassifier()), 10)

    # Summary: one title line, one row of predictions, one row of scores.
    summaries = (
        ("Decision Tree", dt_preds, dt_scores),
        ("SVM", svm_preds, svm_scores),
        ("SGD", sgd_preds, sgd_scores),
    )
    for title, preds, scores in summaries:
        print(title)
        print('\t'.join(str(r) for r in preds))
        print('\t'.join(str(r) for r in scores))
예제 #2
0
def clue():
    """
    Compute the best clue for the posted board and return it as JSON.

    The request body is parsed as a JSON sequence: element 0 carries the
    options ('invalid_guesses' and 'decay'), every following element is
    passed on as the board. The response holds the clue and the scores
    rounded with round().
    """
    payload = json.loads(request.data)
    board = payload[1:]
    options = payload[0]

    predictor = Predictor(
        board,
        'static/numpy/ids_to_score.npy',
        set(options['invalid_guesses']),
        options['decay'],
    )
    best_clue, raw_scores = predictor.get_best_guess_and_scores()

    return jsonify(clue=best_clue, scores=list(map(round, raw_scores)))
예제 #3
0
class PredictorTest(unittest.TestCase):
    """Tests for ``Predictor`` built from the rows of ``test_dataset.csv``."""

    def setUp(self):
        """Load the feature table and construct the predictor under test."""
        self.features = pd.read_csv('test_dataset.csv', sep=',')
        self.preditor = Predictor(self.features)

    def test_load_model(self):
        """``_load_model('lm.joblib')`` returns a ``BaseEstimator`` instance."""
        self.assertIsInstance(self.preditor._load_model('lm.joblib'),
                              BaseEstimator)

    def test_predict(self):
        """``predict`` returns a list with one non-negative value per row."""
        predictions = self.preditor.predict()
        self.assertIsInstance(predictions, list)
        # The number of predictions must match the number of input rows.
        # assertEqual, not assertIs: identity comparison of two ints only
        # succeeds for small cached values and fails for larger datasets.
        self.assertEqual(len(predictions), self.features.shape[0])
        # Every prediction must be greater than or equal to zero.
        self.assertGreaterEqual(min(predictions), 0)
예제 #4
0
def main():
    """
    Run the RL policy-gradient step for the KOR predictor, then call
    compare_models 20 times and print the mean of each returned statistic.
    """
    # Configuration comes from the module-level `config_file`.
    config, _exp_time = load_config(config_file)

    # Generator: the Sequential() instance is replaced right away by the
    # project Model wrapper, whose weights are then loaded.
    generator = Sequential()
    generator = Model(config)
    generator.model.load_weights(config.model_name_unbiased)

    # Predictor for the 'kor' property.
    kor_predictor = Predictor(config, 'kor')

    # Reinforcement Learning (RL) object tying generator and predictor together.
    agent = Reinforcement(generator, kor_predictor, config)

    # RL training step; the four returned values are not used afterwards.
    _rewards_qed, _rewards_kor, _rewards, _previous_weights = \
        agent.policy_gradient()

    # Each run contributes one (KOR diff, QED diff, diversity, validity,
    # uniqueness) record.
    records = []
    for run in range(20):
        print("\nGeneration test:" + str(run))
        dif_qed, dif_kor, valid, div, unique = agent.compare_models(
            config.n_to_generate, True)
        records.append((dif_kor, dif_qed, div, valid, unique))

    difs_kor, difs_qed, divs, perc_valid, uniqs = zip(*records)

    print("\nMean value difference for KOR: " + str(np.mean(difs_kor)))
    print("Mean value difference for QED: " + str(np.mean(difs_qed)))
    print("Mean value diversity: " + str(np.mean(divs)))
    print("Mean value validity: " + str(np.mean(perc_valid)))
    print("Mean value uniqueness: " + str(np.mean(uniqs)))
예제 #5
0
def violin_plot(pred_identifier):
    """
    Draw a split violin plot comparing the 'Original generator' and the
    'Fine-tuned generator' for the property named by `pred_identifier`.

    A Predictor is only built when `pred_identifier` is 'pIC50'; for any
    other value None is handed to generate2file instead.
    """
    cfg, _exp_time = load_config('configReinforce.json')

    # Only the pIC50 property needs the predictor model.
    predictor = Predictor(cfg, 'kor') if pred_identifier == 'pIC50' else None

    # Both generators start as Sequential() and are immediately replaced by
    # the project Model wrapper before their weights are loaded.
    biased = Sequential()
    biased = Model(cfg)
    unbiased = Sequential()
    unbiased = Model(cfg)
    biased.model.load_weights(cfg.model_name_biased + ".h5")
    unbiased.model.load_weights(cfg.model_name_unbiased)

    # Unbiased generator first (flag True), then the biased one (flag False).
    for generator, flag in ((unbiased, True), (biased, False)):
        generate2file(predictor, generator, cfg, 100, flag)

    plt.figure(figsize=(7, 6))

    theme = {
        "axes.facecolor": "white",
        "axes.grid": False,
        'axes.labelsize': 20,
        'figure.figsize': (20.0, 10.0),
        'xtick.labelsize': 15,
        'ytick.labelsize': 15
    }
    sns.set(rc=theme)

    # presumably these files are the ones written by generate2file above
    # -- TODO confirm
    sources = [
        'Generated/generated_prop_original.smi',
        'Generated/generated_prop_biased.smi'
    ]
    set_names = ['Original generator', 'Fine-tuned generator']
    df = properties_violin(sources, set_names, pred_identifier)

    violin_options = dict(x='Property',
                          y='Value',
                          hue='Sets',
                          data=df,
                          linewidth=1,
                          split=True,
                          bw=1,
                          legend=False)
    sns.violinplot(**violin_options)
    sns.despine(left=True)
    plt.ylim([-3, 15])
예제 #6
0
def main():
    """
    Main routine: call test_generator once with iteration index 85, then
    call compare_models 20 times and print the mean difference, diversity
    and validity.
    """
    # Configuration comes from the module-level `config_file`.
    config, _exp_time = load_config(config_file)

    # Generator: Sequential() is replaced right away by the Model wrapper.
    generator = Sequential()
    generator = Model(config)
    generator.model.load_weights(config.model_name_unbiased)

    # 'sas' and 'qed' are handled without a Predictor model.
    needs_predictor = property_identifier not in ('sas', 'qed')
    predictor = (Predictor(config, property_identifier)
                 if needs_predictor else None)

    # Reinforcement Learning object.
    agent = Reinforcement(generator, predictor, config, property_identifier)

    # Generation with iteration index 85; the six results are not used below.
    (_smiles, _prediction, _valid, _unique, _div,
     _perc_desirable) = agent.test_generator(config.n_to_generate, 85, False)

    # Repeated comparison runs; each contributes one
    # (difference, diversity, validity) record.
    records = []
    for run in range(20):
        print("BIASED GENERATION: " + str(run))
        dif, div, valid, _perc_uniq, _desirable = agent.compare_models(
            config.n_to_generate, True)
        records.append((dif, div, valid))

    difs, divs, perc_valid = zip(*records)
    print("Mean value difference: " + str(np.mean(difs)))
    print("Mean value diversity: " + str(np.mean(divs)))
    print("Mean value validity: " + str(np.mean(perc_valid)))
예제 #7
0
def main():
    """
    Main routine: generate with the unbiased model, run the RL policy
    gradient, generate again, plot both prediction sets and finally call
    compare_models ten times.
    """
    # Configuration comes from the module-level `config_file`.
    config, _exp_time = load_config(config_file)

    # Generator: Sequential() is replaced right away by the Model wrapper.
    generator = Sequential()
    generator = Model(config)
    generator.model.load_weights(config.model_name_unbiased)

    # The 'sas' property is handled without a Predictor model.
    predictor = None
    if property_identifier != 'sas':
        predictor = Predictor(config, property_identifier)

    agent = Reinforcement(generator, predictor, config, property_identifier)

    # Generation before training (iteration index 0).
    _smiles_before, prediction_before = agent.test_generator(
        config.n_to_generate, 0, True)

    # RL training of the generator.
    agent.policy_gradient()

    # Generation after training (iteration index 25).
    _smiles_after, prediction_after = agent.test_generator(
        config.n_to_generate, 25, False)

    plot_evolution(prediction_before, prediction_after)

    # Repeated direct comparisons of the original and biased models.
    for run in range(10):
        print("BIASED GENERATION: " + str(run))
        agent.compare_models(config.n_to_generate, True)
예제 #8
0
from flask import Flask, request, json
from flask_restful import Resource, Api
from flask_cors import CORS
import csv
from prediction import Predictor
import base64

# Flask application object; the routes below are registered on it.
# CORS(app) attaches the flask_cors extension to the application.
app = Flask(__name__)
CORS(app)

# Single Predictor instance created once, when this module is imported.
predictor = Predictor()


@app.route('/save', methods=["POST"])
def save():
    """Store the posted JSON record via writeToCSV and report the outcome.

    Responds with json_response("Success") when parsing, the "YEAR" lookup
    and the CSV write all succeed; any exception is printed and answered
    with json_response("Failure").
    """
    try:
        record = json.loads(request.data)
        print(record["YEAR"])
        writeToCSV(record)
        outcome = json_response("Success")
    except Exception as exc:
        print(exc)
        outcome = json_response("Failure")
    return outcome


@app.route('/predict', methods=["POST"])
def predict():
예제 #9
0
파일: main.py 프로젝트: yupliu/DiverseDRL
def main():
    """
    Main routine: train and evaluate a property predictor end to end.

    Relies on names that are not defined in this function and are expected
    from the enclosing module: `config_file`, `property_identifier`,
    `model_type`, `descriptor` and `searchParameters`.

    Steps:
      1. load the configuration and the SMILES/label data;
      2. encode the molecules (ECFP, or padded and tokenized SMILES);
      3. optionally run a grid search over the network hyper-parameters;
      4. split the data and train one model per cross-validation fold;
      5. print the predictions for two example SMILES and the metrics
         returned by `predictor.evaluator`.
    """

    # load model configurations
    config = load_config(config_file, property_identifier)
    directories([config.checkpoint_dir])

    # Load the table of possible tokens
    token_table = tokens_table().table

    # Read and extract smiles and labels from the csv file
    smiles_raw, labels_raw = reading_csv(config, property_identifier)

    # Exactly one of `data_ecfp` / `smiles_int` is assigned below, depending
    # on the model type and descriptor.
    if model_type != 'dnn' or descriptor == 'ECFP':
        # Transformation of data from SMILES strings to ECFP
        data_ecfp = SMILES_2_ECFP(smiles_raw)
    else:

        # Padd each SMILES string with spaces until reaching the size of the largest molecule
        smiles_padded, padd = pad_seq(smiles_raw, token_table, 0)
        config.paddSize = padd

        # Compute the dictionary that makes the correspondence between each token and unique integers
        tokenDict = smilesDict(token_table)

        # Tokenize - transform the SMILES strings into lists of tokens
        tokens = tokenize(smiles_padded, token_table)

        # Transforms each token to the respective integer, according to the previously computed dictionary
        smiles_int = smiles2idx(tokens, tokenDict)

    # NOTE(review): `smiles_int` is only assigned in the SMILES branch above,
    # so this grid search raises NameError when the ECFP branch was taken.
    # data_division is also called with four arguments here but six
    # everywhere else in this function -- confirm against its signature.
    if searchParameters:
        # Split data into training, validation and testing sets.
        data = data_division(config, smiles_int, labels_raw, False)

        # Normalize the label
        data, data_aux = normalize(data)

        # Drop Rate
        drop_rate = [0.1, 0.3, 0.5]
        # Batch size
        batch_size = [16, 32]
        # Learning Rate
        learning_rate = [0.001, 0.0001, 0.01]
        # Number of cells
        number_units = [64, 128, 256]
        # Activation function
        activation = ['linear', 'softmax', 'relu']
        # Memory cell
        rnn = ['LSTM', 'GRU']
        epochs = [100]
        # 3 * 2 * 3 * 3 * 3 * 2 * 1 = 324 combinations in total.
        counter = 0
        for dr in drop_rate:
            for bs in batch_size:
                for lr in learning_rate:
                    for nu in number_units:
                        for act in activation:
                            for nn in rnn:
                                for ep in epochs:

                                    # One-element list holding the label of
                                    # this combination (only printed).
                                    param_identifier = [
                                        str(dr) + "_" + str(bs) + "_" +
                                        str(lr) + "_" + str(nu) + "_" + nn +
                                        "_" + act + "_" + str(ep)
                                    ]
                                    counter += 1
                                    # Only combinations after the 264th are
                                    # run; presumably this resumes an
                                    # interrupted search -- TODO confirm.
                                    if counter > 264:
                                        print("\nTesting this parameters: ")
                                        print(param_identifier)
                                        # `ep` appears only in the label; no
                                        # epoch value is written to config
                                        # here.
                                        config.dropout = dr
                                        config.batch_size = bs
                                        config.lr = lr
                                        config.n_units = nu
                                        config.activation_rnn = act
                                        config.rnn = nn
                                        Model(config, data, searchParameters,
                                              descriptor)

    # In every branch, elements 2 and 3 of the data_division result are kept
    # as the test inputs and test labels.
    if model_type == 'dnn' and descriptor == 'SMILES':
        # Data splitting and Cross-Validation for the SMILES-based neural network
        data_rnn_smiles = data_division(config, smiles_int, labels_raw, True,
                                        model_type, descriptor)
        x_test = data_rnn_smiles[2]
        y_test = data_rnn_smiles[3]
        data_cv = cv_split(data_rnn_smiles, config)

    elif model_type == 'dnn' and descriptor == 'ECFP':
        # Data splitting and Cross-Validation for the ECFP-based neural network
        data_rnn_ecfp = data_division(config, data_ecfp, labels_raw, True,
                                      model_type, descriptor)
        x_test = data_rnn_ecfp[2]
        y_test = data_rnn_ecfp[3]
        data_cv = cv_split(data_rnn_ecfp, config)
    else:
        # Data splitting, cross-validation and grid-search for the other standard QSAR models
        data_otherQsar = data_division(config, data_ecfp, labels_raw, True,
                                       model_type, descriptor)
        x_test = data_otherQsar[2]
        y_test = data_otherQsar[3]
        data_cv = cv_split(data_otherQsar, config)
        # `best_params` is only assigned on this path; it is read in the fold
        # loop below only when model_type is not 'dnn'.
        best_params = grid_search(data_otherQsar, model_type)

    i = 0  # fold index, also used to name the model of each fold
    utils = []  # one `data_aux` value from normalize() per fold
    metrics = []  # placeholder; rebound to the evaluator result below
    for split in data_cv:
        print('\nCross validation, fold number ' + str(i) + ' in progress...')
        data_i = []
        # Each split unpacks into the training and validation indices.
        train, val = split

        if model_type != 'dnn' or descriptor == 'ECFP':
            # `.iloc` is used here, so data_ecfp is presumably a pandas
            # DataFrame -- TODO confirm.
            X_train = data_ecfp.iloc[train, :]
            y_train = np.array(labels_raw)[train]
            X_val = data_ecfp.iloc[val, :]
            y_val = np.array(labels_raw)[val]
            y_train = y_train.reshape(-1, 1)
            y_val = y_val.reshape(-1, 1)

        else:
            X_train = smiles_int[train]
            y_train = np.array(labels_raw)[train]
            X_val = smiles_int[val]
            y_val = np.array(labels_raw)[val]
            y_train = y_train.reshape(-1, 1)
            y_val = y_val.reshape(-1, 1)

        # Fold data layout: [X_train, y_train, x_test, y_test, X_val, y_val]
        data_i.append(X_train)
        data_i.append(y_train)
        data_i.append(x_test)
        data_i.append(y_test)
        data_i.append(X_val)
        data_i.append(y_val)

        data_i, data_aux = normalize(data_i)

        utils.append(data_aux)

        config.model_name = "model" + str(i)

        if model_type == 'dnn':
            Model(config, data_i, False, descriptor)
        else:
            build_models(data_i, model_type, config, best_params)

        i += 1

    # Model's evaluation with two example SMILES strings
    predictor = Predictor(config, token_table, model_type, descriptor)
    list_ss = [
        "CC(=O)Nc1cccc(C2(C)CCN(CCc3ccccc3)CC2C)c1",
        "CN1CCC23CCCCC2C1Cc1ccc(O)cc13"
    ]  # 5.96 and 8.64 (presumably the reference values for these two SMILES -- TODO confirm)
    prediction = predictor.predict(list_ss, utils)
    print(prediction)

    # Model's evaluation with the test set
    # `data_i` here is the data of the last cross-validation fold.
    metrics = predictor.evaluator(data_i)

    if model_type == 'dnn':
        print("\n\nMean_squared_error: ", metrics[0], "\nQ_squared: ",
              metrics[1], "\nRoot mean squared: ", metrics[2], "\nCCC: ",
              metrics[3])
    else:
        print("\n\nMean_squared_error: ", metrics[0], "\nQ_squared: ",
              metrics[1])
def main():
    """Command-line entry point: run the pipeline chosen by --pipeline_type.

    Supported values of ``args.pipeline_type`` are "analysis",
    "model_selection", "training" and "prediction"; any other value does
    nothing after the feature extractor has been built.
    """
    args = get_parser().parse_args()
    feature_extractor = FeatureExtractor()
    mode = args.pipeline_type

    if mode == "analysis":
        preprocessor = TextPreProcessor(
            stop_words_file_path=args.stopwords_file_path)
        analyser = DataAnalyser(input_file=args.input_file_path,
                                text_preprocessor=preprocessor)
        analyser.get_data_distribution(plot_bar=args.plot_bar)
        analyser.get_word_weights(word_thresh=args.word_thresh)
        if args.word_cloud:
            analyser.generate_word_cloud()
        return

    if mode == "model_selection":
        preprocessor = TextPreProcessor(
            stop_words_file_path=args.stopwords_file_path)
        frame = load_training_data(args.train_file_path)
        # Clean every sentence before extracting features from it.
        frame["sentence"] = frame["sentence"].map(preprocessor.process)
        sentence_features = feature_extractor.get_features_for_training(
            frame["sentence"], args.vectorizer)
        apply_cross_validation(
            features=sentence_features,
            labels=frame["class"],
            k_folds=args.kfolds,
            use_svm=args.use_svm,
            use_naive_bayes=args.use_naive_bayes,
            use_random_forest=args.use_random_forest,
            use_logistic_regression=args.use_logistic_regression,
            use_xgboost=args.use_xgboost,
            use_gradient_boosting=args.use_gradient_boosting,
            plot_cv_graph=True,
        )
        return

    if mode == "training":
        trainer = Trainer(
            train_file_path=args.train_file_path,
            val_file_path=args.val_file_path,
            stop_words_file_path=args.stopwords_file_path,
            model_name=args.best_model,
            feature_extractor=feature_extractor,
        )
        train_frame = load_training_data(args.train_file_path)
        trainer.train(
            train_frame,
            split_test_size=args.split_size,
            vectorizer_name=args.vectorizer,
            get_classification_report=args.get_classification_report,
            get_confusion_matrix=args.get_confusion_matrix,
        )
        val_frame = load_validation_data(args.val_file_path)
        trainer.validate(val_frame, vectorizer_name=args.vectorizer)
        # Saving is optional and only happens when a checkpoint path is given.
        if args.model_check_point_path:
            trainer.save_trained_model(args.model_check_point_path)
        return

    if mode != "prediction":
        return

    # Use the Predictor's own defaults unless a stop-words file was supplied.
    if args.stopwords_file_path:
        predictor = Predictor(stop_words_file=args.stopwords_file_path)
    else:
        predictor = Predictor()

    # Batch prediction from a CSV file and/or a single ad-hoc input.
    if args.input_file_path:
        predictor.predict_csv(args.input_file_path, args.output_file_path,
                              args.model_path)
    if args.test_input:
        model, vectorizer = predictor.unpickle_the_model(args.model_path)
        predictor.predict(args.test_input, model, vectorizer)
예제 #11
0
 def setUp(self):
     """Read 'test_dataset.csv' and build the Predictor under test from it."""
     dataset = pd.read_csv('test_dataset.csv', sep=',')
     self.features = dataset
     self.preditor = Predictor(dataset)
예제 #12
0
def index(request):
    """Render the landing page.

    The page always receives the news articles for the query 'Movie', all
    events and a login form. For an authenticated user it additionally
    receives the keys of that user's group ratings above 3.0 and the
    paginated (6 per page) movies returned by ``Predictor.predictTop``.

    Args:
        request: the incoming Django request; ``?page=`` selects the page.

    Returns:
        The rendered 'landing/index.html' response.
    """
    import logging  # local import: keeps this block self-contained

    url = 'https://newsapi.org/v2/everything'
    # NOTE(review): the API key is hard-coded in source control; it should be
    # moved to settings / an environment variable and rotated.
    params = {'q': 'Movie', 'apiKey': '1007b5cbd3c14bedbc1d0308289852e8'}

    # A bounded timeout keeps a slow news API from blocking the worker, and a
    # failed or malformed response degrades to "no articles" instead of
    # breaking the whole landing page.
    try:
        r = requests.get(url=url, params=params, timeout=10)
        articles = r.json().get('articles', [])
    except (requests.RequestException, ValueError) as exc:
        logging.getLogger(__name__).warning("News lookup failed: %s", exc)
        articles = []

    # Kept as {} for anonymous users so the template context is unchanged.
    rating_filter = {}
    movies = []
    movie_data_dd = defaultdict(dict)
    # Never filled in this view; passed to the template as 'movie_data'.
    movie_data_ddd = []

    # `is_authenticated` is a method on old Django versions and a plain
    # boolean attribute on newer ones; support both.
    is_authenticated = request.user.is_authenticated
    if callable(is_authenticated):
        is_authenticated = is_authenticated()

    if is_authenticated:
        user_id = request.user.id

        grm = group_rating_matrix()
        ratings = grm.group_ratings(user_id)

        pr = Predictor()
        predictions = pr.predictTop(user_id)

        # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
        rating_filter = [key for key, value in ratings.items() if value > 3.0]

        for x in predictions:
            movie = MovieData.objects.get(movieid=x)
            if Movies.objects.filter(pk=x).exists():
                movie_data_d = Movies.objects.get(pk=x)

                # NOTE(review): the pattern expects a Python 2 style repr
                # (u'...') in `images` -- confirm the stored format.
                m = re.search("'path': u'(.+?)',", movie_data_d.images)
                if m:
                    movie_data_dd[x]['path'] = m.group(1)

            movies.append(movie)

    page = request.GET.get('page', 1)

    paginator = Paginator(movies, 6)
    try:
        movies = paginator.page(page)

        # Attach the extracted image path to the movies on this page.
        for movie in movies:
            if movie.movieid in movie_data_dd:
                movie.data = movie_data_dd[movie.movieid]

    except PageNotAnInteger:
        movies = paginator.page(1)
    except EmptyPage:
        movies = paginator.page(paginator.num_pages)

    events = Event.objects.all()
    form2 = CustomAuthForm()
    return render(request, 'landing/index.html', {'form2':form2, 'events':events, 'ratings':rating_filter, 'movies':movies, 'movie_data': movie_data_ddd, 'articles':articles})
예제 #13
0
def main():
    """
    Main routine: train and evaluate a classifier on SMILES data end to end.

    Relies on names that are not defined in this function and are expected
    from the enclosing module: `config_file`, `property_identifier`,
    `model_type`, `descriptor` and `searchParameters`.

    Steps:
      1. load the configuration and the SMILES/label data;
      2. pad, tokenize and integer-encode the SMILES strings;
      3. optionally run a grid search over the network hyper-parameters;
      4. split the data and train one model per cross-validation fold
         (only when model_type is 'dnn');
      5. print the five metrics returned by `predictor.evaluator`.
    """

    # load model configurations
    config = load_config(config_file, property_identifier)
    directories([config.checkpoint_dir])

    # Load the table of possible tokens
    token_table = tokens_table().table

    # Read and extract smiles and labels from the csv file
    smiles_raw, labels_raw = reading_csv(config, property_identifier)

    # Sum of the labels; presumably the number of positive (BBB+) samples
    # when labels are 0/1 -- TODO confirm.
    print("BBB+: ", np.sum(labels_raw))

    # Disabled: Morgan fingerprints + SMOTE resampling. Note that the
    # `x_morg_rsmp` / `y_morg_rsmp` names used further down are only assigned
    # inside this commented-out section.
    #    mols = [Chem.MolFromSmiles(x) for x in smiles_raw]
    #
    #    morgan_fp = [Chem.GetMorganFingerprintAsBitVect(x, 2, nBits = 2048) for x in mols]
    #
    #
    #    # convert the RDKit explicit vectors into numpy arrays
    #    morg_fp_np = []
    #    for fp in morgan_fp:
    #      arr = np.zeros((1,))
    #      DataStructs.ConvertToNumpyArray(fp, arr)
    #      morg_fp_np.append(arr)
    #
    #
    #    x_morg = morg_fp_np
    #
    #    x_morg_rsmp, y_morg_rsmp = SMOTE().fit_resample(x_morg, labels_raw)

    # Padd each SMILES string with spaces until reaching the size of the largest molecule
    smiles_padded, padd = pad_seq(smiles_raw, token_table, 0)
    config.paddSize = padd

    # Compute the dictionary that makes the correspondence between each token and unique integers
    tokenDict = smilesDict(token_table)

    # Tokenize - transform the SMILES strings into lists of tokens
    tokens = tokenize(smiles_padded, token_table)

    # Transforms each token to the respective integer, according to the previously computed dictionary
    smiles_int = smiles2idx(tokens, tokenDict)

    if searchParameters:
        # Split data into training, validation and testing sets.
        data = data_division(config, smiles_int, labels_raw, False, model_type,
                             descriptor)

        # Normalize the label
        data, data_aux = normalize(data)

        # Drop Rate
        drop_rate = [0.1, 0.3, 0.5]
        # Batch size
        batch_size = [16, 32]
        # Learning Rate
        learning_rate = [0.001, 0.0001, 0.01]
        # Number of cells
        number_units = [64, 128, 256]
        # Activation function
        activation = ['linear', 'softmax', 'relu']
        # Memory cell
        rnn = ['LSTM', 'GRU']
        epochs = [100]
        # 3 * 2 * 3 * 3 * 3 * 2 * 1 = 324 combinations in total.
        counter = 0
        for dr in drop_rate:
            for bs in batch_size:
                for lr in learning_rate:
                    for nu in number_units:
                        for act in activation:
                            for nn in rnn:
                                for ep in epochs:

                                    # One-element list holding the label of
                                    # this combination (only printed).
                                    param_identifier = [
                                        str(dr) + "_" + str(bs) + "_" +
                                        str(lr) + "_" + str(nu) + "_" + nn +
                                        "_" + act + "_" + str(ep)
                                    ]
                                    counter += 1
                                    # Only combinations after the 304th are
                                    # run; presumably this resumes an
                                    # interrupted search -- TODO confirm.
                                    if counter > 304:
                                        print("\nTesting this parameters: ")
                                        print(param_identifier)
                                        # `ep` appears only in the label; no
                                        # epoch value is written to config
                                        # here.
                                        config.dropout = dr
                                        config.batch_size = bs
                                        config.lr = lr
                                        config.n_units = nu
                                        config.activation_rnn = act
                                        config.rnn = nn
                                        Model(config, data, searchParameters,
                                              descriptor)

    # In every branch, elements 2 and 3 of the data_division result are kept
    # as the test inputs and test labels.
    if model_type == 'dnn' and descriptor == 'SMILES':
        # Data splitting and Cross-Validation for the SMILES-based neural network
        data_rnn_smiles = data_division(config, smiles_int, labels_raw, True,
                                        model_type, descriptor)
        x_test = data_rnn_smiles[2]
        y_test = data_rnn_smiles[3]
        data_cv = cv_split(data_rnn_smiles, config)

    elif model_type == 'dnn' and descriptor == 'ECFP':
        # Data splitting and Cross-Validation for the ECFP-based neural network
        # NOTE(review): `x_morg_rsmp` and `y_morg_rsmp` are not assigned in
        # this function (their assignment is commented out above), so this
        # branch raises NameError unless they exist at module level.
        data_rnn_ecfp = data_division(config, x_morg_rsmp, y_morg_rsmp, True,
                                      model_type, descriptor)
        x_test = data_rnn_ecfp[2]
        y_test = data_rnn_ecfp[3]
        data_cv = cv_split(data_rnn_ecfp, config)
    else:
        # Data splitting, cross-validation and grid-search for the other standard QSAR models
        # NOTE(review): `data_ecfp` is not assigned anywhere in this function
        # -- confirm it is provided at module level.
        data_otherQsar = data_division(config, data_ecfp, labels_raw, True,
                                       model_type, descriptor)
        x_test = data_otherQsar[2]
        y_test = data_otherQsar[3]
        data_cv = cv_split(data_otherQsar, config)
        # Not read again in this function: the build_models call that used it
        # is commented out in the fold loop below.
        best_params = grid_search(data_otherQsar, model_type)

    i = 0  # fold index, also used to name the model of each fold
    #    utils = []
    metrics = []  # placeholder; rebound to the evaluator result below
    for split in data_cv:
        print('\nCross validation, fold number ' + str(i) + ' in progress...')
        data_i = []
        # Each split unpacks into the training and validation indices.
        train, val = split

        # NOTE(review): this condition is also true for every non-'dnn'
        # model type, but `data_rnn_ecfp` is only assigned in the
        # dnn + ECFP branch above -- NameError for other model types.
        if model_type != 'dnn' or descriptor == 'ECFP':
            X_train = data_rnn_ecfp[0][train]
            y_train = np.array(data_rnn_ecfp[1])[train]
            X_val = data_rnn_ecfp[0][val]
            y_val = np.array(data_rnn_ecfp[1])[val]
            y_train = y_train.reshape(-1, 1)
            y_val = y_val.reshape(-1, 1)

        else:
            X_train = data_rnn_smiles[0][train]
            y_train = np.array(data_rnn_smiles[1])[train]
            X_val = data_rnn_smiles[0][val]
            y_val = np.array(data_rnn_smiles[1])[val]

            #            X_train = smiles_int[train]
            #            y_train = np.array(labels_raw)[train]
            #            X_val = smiles_int[val]
            #            y_val = np.array(labels_raw)[val]
            y_train = y_train.reshape(-1, 1)
            y_val = y_val.reshape(-1, 1)

        # Fold data layout: [X_train, y_train, x_test, y_test, X_val, y_val]
        data_i.append(X_train)
        data_i.append(y_train)
        data_i.append(x_test)
        data_i.append(y_test)
        data_i.append(X_val)
        data_i.append(y_val)

        #        data_i,data_aux = normalize(data_i)

        #        utils.append(data_aux)

        config.model_name = "model" + str(i)

        # Only 'dnn' models are trained here; the branch for the other model
        # types is commented out below.
        if model_type == 'dnn':
            Model(config, data_i, False, descriptor)


#        else:
#            build_models(data_i,model_type,config,best_params)

        i += 1

    # Model's evaluation with two example SMILES strings
    predictor = Predictor(config, token_table, model_type, descriptor)
    #    list_ss = ["NC(=O)c1cccc(OC2CC3CCC(C2)N3C2(c3ccccc3)CC2)c1","CN(C)C(CNC(CN)Cc1ccc(O)cc1)Cc1ccc(O)cc1"] #3.85 and 1.73
    #    prediction = predictor.predict(list_ss,utils)
    #    print(prediction)

    # Model's evaluation with the test set
    # `data_i` here is the data of the last cross-validation fold.
    metrics = predictor.evaluator(data_i)

    print("\n\nAccuracy: ", metrics[0], "\nAUC: ", metrics[1],
          "\nSpecificity: ", metrics[2], "\nSensitivity: ", metrics[3],
          "\nMCC: ", metrics[4])