# --- NN hyperparameters ---
max_len = 1500    # maximum sequence length fed to the network
sample_n = 10     # sequences drawn per annotation group (training sample below)
embed_size = 256
batch_size = 100
epochs = 100
cnn_fun_path = 'scripts/python/tmr/'
seq_type = 'aa'       # amino-acid sequences — TODO confirm against cf helpers
num_letters = 26      # alphabet size used for encoding
seq_resize = True

#%%
# Generate datasets for fitting — only when training a fresh model.
# NOTE(review): `new_model`, `cf`, `data_path`, and `pd` are assumed to be
# defined earlier in the file; confirm they are in scope here.
if new_model:  # idiomatic truth test instead of `== True`
    seq_df = cf.load_seq_dataframe(data_path)
    uniq_anno = seq_df.annotation.unique()
    num_classes = len(uniq_anno)
    # Map each unique annotation label to an integer class id (ydata).
    annotation_ydata_df = pd.DataFrame({
        'ydata': range(num_classes),
        'annotation': uniq_anno,
    })
    seq_df = pd.merge(seq_df, annotation_ydata_df, on='annotation')
    # Split clustered rows from noise rows; Cluster == -1 is presumably a
    # DBSCAN/HDBSCAN-style noise label — verify against the clustering step.
    seq_cluster = seq_df.loc[seq_df['Cluster'] > -1]
    seq_cluster_noise = seq_df.loc[seq_df['Cluster'] == -1]
    seq_cluster_a = seq_cluster

#%%
# Generate training data for annotation/cluster datasets.
## annotations: draw `sample_n` sequences from every annotation class
train_a = seq_cluster_a.groupby(['annotation']).sample(n=sample_n)
# Output locations for the density-sampling (KDE) workflow.
# all_save_path='data/density_sample/all_data'
tmp_save_path = 'data/density_sample/tmp/'
final_save_path = 'data/density_sample/KDE'
# n_thres=26

# Sampling parameters
rep = 3                       # number of repetitions per setting
val_sample = range(1, 11, 1)  # start on 18
test_n = 5                    # test per cluster
# n_thres=26
# sample_rate=0.98

#%%
# Generate datasets for fitting.
# NOTE(review): `cf`, `data_path`, and `pd` are assumed to be defined
# earlier in the file.
seq_df = cf.load_seq_dataframe(data_path)
uniq_anno = seq_df.annotation.unique()
num_classes = len(uniq_anno)
# Map each unique annotation label to an integer class id (ydata).
annotation_ydata_df = pd.DataFrame({
    'ydata': range(num_classes),
    'annotation': uniq_anno,
})
seq_df = pd.merge(seq_df, annotation_ydata_df, on='annotation')
# seq_df=seq_df.groupby(['annotation','Cluster']).filter(lambda x: x['id'].count()>n_thres)
# n_sample=round(min(seq_df.groupby(['annotation'])['id'].count())*max_sample_rate)*2
# seq_df=seq_df.groupby(['annotation']).sample(n_sample)
seq_df = seq_df.reset_index(drop=True)
# Remember each row's original position before any later resampling.
seq_df['o_index'] = seq_df.index

for s in val_sample:
    # (s - 0.2*s)/0.2 == 4*s: presumably the matching training-set size for
    # an 80/20 train/validation split given s validation samples — confirm.
    train_sample = round((s - 0.2 * s) / 0.2)