Example #1
     model = tf.keras.models.load_model(path)
     
     l_start = time.time()
     log_every = 200
     for n in range(n_data):
         # report progress every `log_every` data points
         if n % log_every == 0 and n != 0:
             l_end = time.time()
             l_time = l_end - l_start
             print(f'current data: {n}')
             print(f'{log_every} data done in: {l_time:.05} seconds')
             l_start = time.time()
             
         # one-hot encode the input string and add a batch dimension
         s = s_data[n]
         s = one_hot_encode(list(s), s_n_chars)
         s = np.expand_dims(s, axis=0)
         
         # p (the protein input) is prepared earlier in the script, outside this excerpt
         logit = model.predict([p, s])
         pred = tf.nn.sigmoid(logit)
         fpred = float(pred.numpy())
         
         # accumulate this model's prediction under its SMILES key
         if all_smi[n] in all_y_preds:
             all_y_preds[all_smi[n]].append(fpred)
         else:
             all_y_preds[all_smi[n]] = [fpred]
 
     hp.save_pkl(f'{savepath}chunk_{chunk_id}_model_{i}_all_y_preds.pkl', all_y_preds)
     
 end = time.time()
 print(f'PREDICTIONS DONE in {end - start:.05} seconds')
 ####################################
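
# `one_hot_encode` is defined outside this excerpt. A minimal sketch of what it
# is assumed to do here: map each character to an index and return a
# (sequence_length, n_chars) matrix. The CHAR_TO_IDX table below is a
# hypothetical placeholder, not the project's actual vocabulary.
import numpy as np

CHAR_TO_IDX = {c: i for i, c in enumerate('#()+-123456789=BCFINOPS[]clnors')}

def one_hot_encode(chars, n_chars):
    """Return a (len(chars), n_chars) one-hot matrix for a list of characters."""
    encoded = np.zeros((len(chars), n_chars), dtype=np.float32)
    for pos, char in enumerate(chars):
        encoded[pos, CHAR_TO_IDX[char]] = 1.0
    return encoded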
Example #2
        warnings.warn(
            'It seems that you are using a very small fraction of your data for training'
        )
    ####################################

    ####################################
    # start processing
    data = hp.read_with_pd(filename)

    # random split: the first t_split fraction of the shuffled indices goes to training
    all_idx = np.arange(len(data))
    np.random.shuffle(all_idx)
    cut = int(len(all_idx) * t_split)

    idx_train = all_idx[:cut]
    idx_val = all_idx[cut:]
    if verbose:
        print(f'N data used for training: {len(idx_train)}')
        print(f'N data used for validation: {len(idx_val)}')

    # create the partitions for the data generator
    partition = {}
    partition['train'] = idx_train
    partition['val'] = idx_val

    hp.save_pkl(f'{savepath}partitions.pkl', partition)

    end = time.time()
    print(f'Train-val split for Swiss-Prot DONE in {end - start:.04} seconds')
    ####################################
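
# The `hp` helper module is not part of this excerpt. Minimal sketches of the
# two calls used above, assuming read_with_pd returns one entry per line of the
# input file and save_pkl is a plain pickle dump; names and behaviour beyond
# that are assumptions.
import pickle
import pandas as pd

def read_with_pd(filename):
    """Read a one-column text file and return its rows as a list of strings."""
    return pd.read_csv(filename, header=None)[0].tolist()

def save_pkl(path, obj):
    """Pickle `obj` to `path`."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)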
Example #3
         temp_all_clean[pair].append(score)
 
 
 print('Starting the second pass')
 all_clean = []
 n_removed = 0
 n_same_duplicate = 0
 for k, v in temp_all_clean.items():
     # single entry for this pair: keep it
     if len(v) == 1:
         datapoint = (k[0], k[1], v[0])
         all_clean.append(datapoint)
     # duplicated pair, but with the same binary activity: keep it once
     elif len(set(v)) == 1:
         datapoint = (k[0], k[1], v[0])
         all_clean.append(datapoint)
         n_same_duplicate += 1
     # duplicated pair with conflicting binary activity: drop it
     else:
         n_removed += 1
         
 print(f'N remaining data points: {len(all_clean)}')
 print(f'Duplicates removed because of inconsistent binary activity: {n_removed}')
 print(f'Duplicated entries with consistent binary activity, kept once: {n_same_duplicate}')
       
 os.makedirs(savepath, exist_ok=True)
 hp.save_pkl(f'{savepath}all_clean.pkl', all_clean)
 ####################################
 
 
 end = time.time()
 print(f'BindingDB extraction DONE in {end - start:.04} seconds')
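
# Only the last line of the first pass appears in this excerpt. A minimal
# sketch of how `temp_all_clean` is assumed to be built: group each
# (compound, protein) pair and collect every binary activity label observed
# for it. `raw_records` and its column layout are hypothetical placeholders,
# not the script's actual BindingDB parsing.
from collections import defaultdict

raw_records = [
    ('CCO', 'MKTAYIAK', 1),   # hypothetical rows: (SMILES, protein sequence, activity)
    ('CCO', 'MKTAYIAK', 1),
    ('CCN', 'MKTAYIAK', 0),
]

temp_all_clean = defaultdict(list)
for smiles, sequence, score in raw_records:
    pair = (smiles, sequence)
    temp_all_clean[pair].append(score)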
Example #4
    if ngpu > 1:
        with strategy.scope():
            seqmodel = SeqModel(vocab_size, max_len_model, layers, dropouts,
                                trainables, lr, batchnorm)
    else:
        seqmodel = SeqModel(vocab_size, max_len_model, layers, dropouts,
                            trainables, lr, batchnorm)

    if config.getboolean('RESTART', 'restart'):
        # Load the pretrained model
        path_model = config['RESTART'].get('path_model')
        if path_model is None:
            raise ValueError(
                'No path to a model to load for the restart was provided'
            )
        seqmodel.model = tf.keras.models.load_model(path_model)

    # note: fit_generator is deprecated in TF 2.x; Model.fit accepts generators directly
    history = seqmodel.model.fit_generator(
        generator=tr_generator,
        validation_data=val_generator,
        use_multiprocessing=True,
        epochs=epochs,
        callbacks=[checkpointer, lr_reduction, early_stopper],
        workers=num_workers,
        verbose=2)

    hp.save_pkl(f'{save_path}history', history.history)
    end = time.time()
    print(f'TRAINING DONE in {end - start:.05} seconds')
    ####################################
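
# checkpointer, lr_reduction and early_stopper are defined outside this
# excerpt. A minimal sketch of how they could be set up with standard
# tf.keras callbacks; the monitored metric, file name and patience values
# are assumptions, not the project's actual settings.
import tensorflow as tf

checkpointer = tf.keras.callbacks.ModelCheckpoint(
    filepath=f'{save_path}best_model.h5',  # hypothetical checkpoint name
    monitor='val_loss', save_best_only=True)
lr_reduction = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)
early_stopper = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True)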
Example #5
        
        # init the pretrained TAPE protein BERT model and tokenizer
        model = ProteinBertModel.from_pretrained('bert-base')
        tokenizer = TAPETokenizer(vocab='iupac') 
        
        # embed every unique protein in parallel
        results = Parallel(n_jobs=nworkers)(
            delayed(get_PROTrepr)(i, x, model, tokenizer, f_savepath)
            for i, x in enumerate(unique_protein)
        )
        
        all_indices = {}
        for x in results:
            i = x[0]
            prot = x[1]
            _all_idx = unique_prot_to_idx[prot]
            for _idx in _all_idx:
                all_indices[_idx] = i
        
        hp.save_pkl(f'{savepath}all_indices_{rpr}.pkl', all_indices)
        
        # z-normalise the saved representations
        z_norma = True
        if z_norma:
            rep_dim = 768  # hidden size of the TAPE BERT-base model
            mean, std = do_z_norma(f'{f_savepath}', rep_dim)
            hp.save_pkl(f'{savepath}z_norma_param_{rpr}.pkl', {'mean':mean, 'std':std})
        
    elif rpr=='clm':
        from src import helper_clm as hp_clm
        from keras.models import load_model

        max_len_model = 100 + 2
        pad_char = 'A'
        start_char = 'G'
        end_char = 'E'
        tr_all_id = []
        for id_ in tr_cluster_idx:
            tr_all_id.extend(all_clusters_id[str(id_)])

        val_all_id = []
        for id_ in val_cluster_idx:
            val_all_id.extend(all_clusters_id[str(id_)])

        assert len(tr_all_id) + len(val_all_id) == len(all_ids)

        # get all the data id from the protein index
        for id_ in tr_all_id:
            seq = data_id_to_seq[id_]
            indices = [idx for idx, x in enumerate(all_protein) if x == seq]
            data_id_tr.extend(indices)

        for id_ in val_all_id:
            seq = data_id_to_seq[id_]
            indices = [idx for idx, x in enumerate(all_protein) if x == seq]
            data_id_val.extend(indices)

        assert len(data_id_tr) + len(data_id_val) == len(all_protein)

        partition = {}
        partition['train'] = data_id_tr
        partition['validation'] = data_id_val

        hp.save_pkl(f'{savepath}/CV_partition_{i}.pkl', partition)
        i += 1
    ####################################

    end = time.time()
    print(f'CV folds DONE in {end - start:.04} seconds')
    start = time.time()

    ####################################
    # get back parameters
    args = vars(parser.parse_args())

    datapath = args['datapath']
    savepath = args['savepath']
    verbose = args['verbose']
    ####################################

    ####################################
    # start processing
    with open(f'{datapath}/clusterRes_cluster.tsv') as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter="\t")
        # each row of the MMseqs2 cluster file: (cluster representative, member id)
        for i, line in enumerate(tsvreader):
            cluster_repr = line[0]
            in_cluster = line[1]

            if cluster_repr in all_clusters_id:
                all_clusters_id[cluster_repr].append(in_cluster)
            else:
                all_clusters_id[cluster_repr] = [in_cluster]

    hp.save_pkl(f'{savepath}/data_id_to_seq.pkl', data_id_to_seq)
    hp.save_pkl(f'{savepath}/all_clusters_id.pkl', all_clusters_id)

    end = time.time()
    print(f'Data process from MMseqs2 DONE in {end - start:.04} seconds')
    ####################################
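
# `get_PROTrepr` is defined outside this excerpt. A minimal sketch of what it
# is assumed to do with the TAPE model and tokenizer initialised above: embed
# one protein sequence, mean-pool over residues, save the vector under
# f_savepath, and return (index, sequence) so the caller can build
# `all_indices`. The file naming and the mean-pooling are assumptions.
import numpy as np
import torch

def get_PROTrepr(i, sequence, model, tokenizer, f_savepath):
    token_ids = torch.tensor([tokenizer.encode(sequence)])
    with torch.no_grad():
        sequence_output = model(token_ids)[0]          # (1, seq_len, 768)
    representation = sequence_output.mean(dim=1).squeeze(0).numpy()
    np.save(f'{f_savepath}prot_{i}.npy', representation)  # hypothetical file name
    return (i, sequence)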