def Datset_Generate(self):
    # Build train/dev/inference datasets and wrap them in dataloaders.
    token_dict = yaml.load(open(self.hp.Token_Path), Loader=yaml.Loader)

    train_dataset = Dataset(
        token_dict=token_dict,
        pattern_path=self.hp.Train.Train_Pattern.Path,
        metadata_file=self.hp.Train.Train_Pattern.Metadata_File)
    dev_dataset = Dataset(
        token_dict=token_dict,
        pattern_path=self.hp.Train.Eval_Pattern.Path,
        metadata_file=self.hp.Train.Eval_Pattern.Metadata_File)
    inference_dataset = Inference_Dataset(
        token_dict=token_dict,
        pattern_paths=self.hp.Train.Inference_Pattern_in_Train)

    if self.gpu_id == 0:  # log only from the main process
        logging.info('The number of train patterns = {}.'.format(len(train_dataset)))
        logging.info('The number of development patterns = {}.'.format(len(dev_dataset)))
        logging.info('The number of inference patterns = {}.'.format(len(inference_dataset)))

    collater = Collater(token_dict=token_dict)
    inference_collater = Inference_Collater(token_dict=token_dict)

    self.dataloader_dict = {}
    self.dataloader_dict['Train'] = torch.utils.data.DataLoader(
        dataset=train_dataset,
        sampler=torch.utils.data.DistributedSampler(train_dataset, shuffle=True)
            if self.hp.Use_Multi_GPU else
            torch.utils.data.RandomSampler(train_dataset),
        collate_fn=collater,
        batch_size=self.hp.Train.Batch_Size,
        num_workers=self.hp.Train.Num_Workers,
        pin_memory=True)
    self.dataloader_dict['Dev'] = torch.utils.data.DataLoader(
        dataset=dev_dataset,
        sampler=torch.utils.data.DistributedSampler(dev_dataset, shuffle=True)
            if self.num_gpus > 1 else
            torch.utils.data.RandomSampler(dev_dataset),
        collate_fn=collater,
        batch_size=self.hp.Train.Batch_Size,
        num_workers=self.hp.Train.Num_Workers,
        pin_memory=True)
    self.dataloader_dict['Inference'] = torch.utils.data.DataLoader(
        dataset=inference_dataset,
        sampler=torch.utils.data.SequentialSampler(inference_dataset),
        collate_fn=inference_collater,
        batch_size=self.hp.Inference_Batch_Size or self.hp.Train.Batch_Size,
        num_workers=self.hp.Train.Num_Workers,
        pin_memory=True)
def Dataset_Generate(self):
    train_dataset = Dataset(
        pattern_path=self.hp.Train.Train_Pattern.Path,
        metadata_file=self.hp.Train.Train_Pattern.Metadata_File,
        pattern_per_speaker=self.hp.Train.Batch.Train.Pattern_per_Speaker)
    dev_dataset = Dataset(
        pattern_path=self.hp.Train.Eval_Pattern.Path,
        metadata_file=self.hp.Train.Eval_Pattern.Metadata_File,
        pattern_per_speaker=self.hp.Train.Batch.Eval.Pattern_per_Speaker)
    inference_dataset = Dataset(
        pattern_path=self.hp.Train.Eval_Pattern.Path,
        metadata_file=self.hp.Train.Eval_Pattern.Metadata_File,
        pattern_per_speaker=self.hp.Train.Batch.Eval.Pattern_per_Speaker,
        num_speakers=50)  # maximum number TensorBoard can display

    logging.info('The number of train speakers = {}.'.format(len(train_dataset)))
    logging.info('The number of development speakers = {}.'.format(len(dev_dataset)))

    collater = Collater(
        min_frame_length=self.hp.Train.Frame_Length.Min,
        max_frame_length=self.hp.Train.Frame_Length.Max)
    inference_collater = Inference_Collater(
        samples=self.hp.Train.Inference.Samples,
        frame_length=self.hp.Train.Inference.Frame_Length,
        overlap_length=self.hp.Train.Inference.Overlap_Length)

    self.dataloader_dict = {}
    self.dataloader_dict['Train'] = torch.utils.data.DataLoader(
        dataset=train_dataset,
        sampler=torch.utils.data.DistributedSampler(train_dataset, shuffle=True)
            if self.hp.Use_Multi_GPU else
            torch.utils.data.RandomSampler(train_dataset),
        collate_fn=collater,
        batch_size=self.hp.Train.Batch.Train.Speaker,
        num_workers=self.hp.Train.Num_Workers,
        pin_memory=True)
    self.dataloader_dict['Dev'] = torch.utils.data.DataLoader(
        dataset=dev_dataset,
        sampler=torch.utils.data.DistributedSampler(dev_dataset, shuffle=True)
            if self.num_gpus > 1 else
            torch.utils.data.RandomSampler(dev_dataset),
        collate_fn=collater,
        batch_size=self.hp.Train.Batch.Eval.Speaker,
        num_workers=self.hp.Train.Num_Workers,
        pin_memory=True)
    self.dataloader_dict['Inference'] = torch.utils.data.DataLoader(
        dataset=inference_dataset,
        shuffle=True,
        collate_fn=inference_collater,
        batch_size=self.hp.Train.Batch.Eval.Speaker,
        num_workers=self.hp.Train.Num_Workers,
        pin_memory=True)
def distribution_user_view():
    """View the distribution of distances for positive and negative samples."""
    dataset = Dataset()
    data = dataset.data
    label = dataset.label
    data = data.reshape((len(data), -1))

    data_negative = []
    data_positive = []
    for i in range(len(label)):
        item = data[i]
        if label[i] == 0.0:
            for j in range(len(item)):
                if item[j] != -1.0:  # skip sentinel -1.0 entries
                    data_negative.append(float(item[j]))
        else:
            for j in range(len(item)):
                if item[j] != -1.0:
                    data_positive.append(float(item[j]))

    print(type(data_positive[0]))
    sns.kdeplot(data_positive, color='r')
    sns.kdeplot(data_negative, color='b')
    plt.show()
def load_ds(query_params):
    # Load a cached Dataset for this query if one exists; otherwise run the
    # query, build the Dataset, and cache it for next time.
    pickle_path = '../outputs/pickles/' + query_params.query_name
    if query_params.query_name in os.listdir('../outputs/pickles/'):
        print('loading saved ds')
        with open(pickle_path, 'rb') as f:
            data = pickle.load(f)
    else:
        print('generating new ds')
        data = Dataset(execute_query(query_params.queries))
        with open(pickle_path, 'wb') as f:
            pickle.dump(data, f)
    return data
def data_init(self):
    print("\nData init")
    # self.dataset = TCGA_Dataset(self.config)
    self.dataset = Dataset(self.config)
    generator = Generator(self.config, self.dataset)
    self.train_generator = generator.generate()
    self.X_val, self.y_val = self.dataset.convert_to_arrays(
        self.dataset._partition[0]['val'],
        self.dataset._partition[1]['val'],
        phase='val',
        size=self.config.sampling_size_val)
    self.X_test, self.y_test = self.dataset.convert_to_arrays(
        self.dataset._partition[0]['test'],
        self.dataset._partition[1]['test'],
        phase='test',
        size=self.config.sampling_size_test)
    self.y_test = self.patch_to_image(self.y_test, proba=False)
def Datset_Generate(self):
    train_Dataset = Dataset(
        pattern_path=hp.Train.Train_Pattern.Path,
        metadata_file=hp.Train.Train_Pattern.Metadata_File,
        accumulated_dataset_epoch=hp.Train.Train_Pattern.Accumulated_Dataset_Epoch,
        mel_length_min=hp.Train.Train_Pattern.Mel_Length.Min,
        mel_length_max=hp.Train.Train_Pattern.Mel_Length.Max,
        text_length_min=hp.Train.Train_Pattern.Text_Length.Min,
        text_length_max=hp.Train.Train_Pattern.Text_Length.Max,
        use_cache=hp.Train.Use_Pattern_Cache)
    dev_Dataset = Dataset(
        pattern_path=hp.Train.Eval_Pattern.Path,
        metadata_file=hp.Train.Eval_Pattern.Metadata_File,
        mel_length_min=hp.Train.Eval_Pattern.Mel_Length.Min,
        mel_length_max=hp.Train.Eval_Pattern.Mel_Length.Max,
        text_length_min=hp.Train.Eval_Pattern.Text_Length.Min,
        text_length_max=hp.Train.Eval_Pattern.Text_Length.Max,
        use_cache=hp.Train.Use_Pattern_Cache)
    inference_Dataset = Inference_Dataset(
        pattern_path=hp.Train.Inference_Pattern_File_in_Train)

    logging.info('The number of train patterns = {}.'.format(
        len(train_Dataset) // hp.Train.Train_Pattern.Accumulated_Dataset_Epoch))
    logging.info('The number of development patterns = {}.'.format(len(dev_Dataset)))
    logging.info('The number of inference patterns = {}.'.format(len(inference_Dataset)))

    collater = Collater()
    inference_Collater = Inference_Collater()

    self.dataLoader_Dict = {}
    self.dataLoader_Dict['Train'] = torch.utils.data.DataLoader(
        dataset=train_Dataset,
        shuffle=True,
        collate_fn=collater,
        batch_size=hp.Train.Batch_Size,
        num_workers=hp.Train.Num_Workers,
        pin_memory=True)
    self.dataLoader_Dict['Dev'] = torch.utils.data.DataLoader(
        dataset=dev_Dataset,
        shuffle=True,
        collate_fn=collater,
        batch_size=hp.Train.Batch_Size,
        num_workers=hp.Train.Num_Workers,
        pin_memory=True)
    self.dataLoader_Dict['Inference'] = torch.utils.data.DataLoader(
        dataset=inference_Dataset,
        shuffle=False,
        collate_fn=inference_Collater,
        batch_size=hp.Inference_Batch_Size or hp.Train.Batch_Size,
        num_workers=hp.Train.Num_Workers,
        pin_memory=True)

    if hp.Mode in ['PE', 'GR']:
        self.dataLoader_Dict['Prosody_Check'] = torch.utils.data.DataLoader(
            dataset=Prosody_Check_Dataset(
                pattern_path=hp.Train.Train_Pattern.Path,
                metadata_file=hp.Train.Train_Pattern.Metadata_File,
                mel_length_min=hp.Train.Train_Pattern.Mel_Length.Min,
                mel_length_max=hp.Train.Train_Pattern.Mel_Length.Max,
                use_cache=hp.Train.Use_Pattern_Cache),
            shuffle=False,
            collate_fn=Prosody_Check_Collater(),
            batch_size=hp.Train.Batch_Size,
            num_workers=hp.Train.Num_Workers,
            pin_memory=True)
print(device)
kwargs = {'num_workers': 8, 'pin_memory': True} if args.cuda else {}

print('Loading the dataset...')
path = '/home/john/Desktop/Dissertation/data/labels_PCA'
with open(path, 'rb') as f:
    labels_dict = pickle.load(f)
labels_ID = list(labels_dict.keys())

path = '/home/john/Desktop/Dissertation/data/Dataset_1.npy'
df_train = np.load(path)
labels = np.array(list(labels_dict.values()))

print('Creating DataLoader...')
train_dataset = Dataset(data=df_train, labels=labels)
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           **kwargs)

train_loss = []
model = RICA(df_train.shape[1], n_clusters=256, penalty=1.2).to(device)
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3, momentum=0.9)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.5)

for epoch in range(1, args.epochs + 1):
    print('Executing Epoch...', epoch)
    train_loss.append(train(epoch, model, optimizer, scheduler))

path = '/home/john/Desktop/Dissertation/TrainingError/RICA_ADAM_loss'
with open(path, 'wb') as f:
    pickle.dump(train_loss, f)  # assumed from context: persist the per-epoch loss history
import pandas as pd
import numpy as np
from Datasets import Dataset
from vectorization import Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from cotraining import CotClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

# Data paths
train_dir = "train/"
test_dir = "test/"
directory = "/home/goksenin/Desktop/GRADUATION PROJECT/Programming/"

dataset = Dataset(directory)
n_train = 3375
n_test = 1125
X_train, Y_train = dataset.get_set(train_dir)
X_test, Y_test = dataset.get_set(test_dir)

# -1: unlabeled, 0: non-relative, 1: relative
y_train = np.asarray(Y_train)
y_train[n_train // 4:] = -1

####### FEATURE EXTRACTION
# getting related documents for feature extraction
relative_index = [i for i, y_i in enumerate(Y_train) if y_i == 1]
related_data = []
for index in relative_index:
    related_data.append(X_train[index])
def data_init(self):
    print("\nData init")
    self.dataset = Dataset(self.config)
    self.generator = Generator(self.config, self.dataset)