Example #1
    def Datset_Generate(self):
        with open(self.hp.Token_Path, 'r') as f:
            token_dict = yaml.load(f, Loader=yaml.Loader)

        train_dataset = Dataset(
            token_dict=token_dict,
            pattern_path=self.hp.Train.Train_Pattern.Path,
            metadata_file=self.hp.Train.Train_Pattern.Metadata_File)
        dev_dataset = Dataset(
            token_dict=token_dict,
            pattern_path=self.hp.Train.Eval_Pattern.Path,
            metadata_file=self.hp.Train.Eval_Pattern.Metadata_File)
        inference_dataset = Inference_Dataset(
            token_dict=token_dict,
            pattern_paths=self.hp.Train.Inference_Pattern_in_Train)

        if self.gpu_id == 0:
            logging.info('The number of train patterns = {}.'.format(
                len(train_dataset)))
            logging.info('The number of development patterns = {}.'.format(
                len(dev_dataset)))
            logging.info('The number of inference patterns = {}.'.format(
                len(inference_dataset)))

        collater = Collater(token_dict=token_dict)
        inference_collater = Inference_Collater(token_dict=token_dict)

        self.dataloader_dict = {}
        self.dataloader_dict['Train'] = torch.utils.data.DataLoader(
            dataset=train_dataset,
            sampler=torch.utils.data.DistributedSampler(train_dataset, shuffle=True)
                    if self.hp.Use_Multi_GPU else
                    torch.utils.data.RandomSampler(train_dataset),
            collate_fn=collater,
            batch_size=self.hp.Train.Batch_Size,
            num_workers=self.hp.Train.Num_Workers,
            pin_memory=True)
        self.dataloader_dict['Dev'] = torch.utils.data.DataLoader(
            dataset=dev_dataset,
            sampler=torch.utils.data.DistributedSampler(dev_dataset, shuffle=True)
                    if self.num_gpus > 1 else
                    torch.utils.data.RandomSampler(dev_dataset),
            collate_fn=collater,
            batch_size=self.hp.Train.Batch_Size,
            num_workers=self.hp.Train.Num_Workers,
            pin_memory=True)
        self.dataloader_dict['Inference'] = torch.utils.data.DataLoader(
            dataset=inference_dataset,
            sampler=torch.utils.data.SequentialSampler(inference_dataset),
            collate_fn=inference_collater,
            batch_size=self.hp.Inference_Batch_Size
            or self.hp.Train.Batch_Size,
            num_workers=self.hp.Train.Num_Workers,
            pin_memory=True)
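
One caveat about the DistributedSampler branch used above (a general PyTorch convention, not something shown in this example): with shuffle=True the sampler only produces a new order each epoch if its epoch counter is advanced. A minimal sketch, assuming a hypothetical train_loader built like the 'Train' loader above and a hypothetical num_epochs:

# Sketch only: advance the DistributedSampler epoch so each epoch sees a new order.
for epoch in range(num_epochs):
    if isinstance(train_loader.sampler, torch.utils.data.DistributedSampler):
        train_loader.sampler.set_epoch(epoch)
    for batch in train_loader:
        pass  # training step goes here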
Example #2
    def Dataset_Generate(self):
        train_dataset = Dataset(
            pattern_path=self.hp.Train.Train_Pattern.Path,
            metadata_file=self.hp.Train.Train_Pattern.Metadata_File,
            pattern_per_speaker=self.hp.Train.Batch.Train.Pattern_per_Speaker)
        dev_dataset = Dataset(
            pattern_path=self.hp.Train.Eval_Pattern.Path,
            metadata_file=self.hp.Train.Eval_Pattern.Metadata_File,
            pattern_per_speaker=self.hp.Train.Batch.Eval.Pattern_per_Speaker)
        inference_dataset = Dataset(
            pattern_path=self.hp.Train.Eval_Pattern.Path,
            metadata_file=self.hp.Train.Eval_Pattern.Metadata_File,
            pattern_per_speaker=self.hp.Train.Batch.Eval.Pattern_per_Speaker,
            num_speakers=50,  #Maximum number by tensorboard.
        )
        logging.info('The number of train speakers = {}.'.format(
            len(train_dataset)))
        logging.info('The number of development speakers = {}.'.format(
            len(dev_dataset)))

        collater = Collater(min_frame_length=self.hp.Train.Frame_Length.Min,
                            max_frame_length=self.hp.Train.Frame_Length.Max)
        inference_collater = Inference_Collater(
            samples=self.hp.Train.Inference.Samples,
            frame_length=self.hp.Train.Inference.Frame_Length,
            overlap_length=self.hp.Train.Inference.Overlap_Length)

        self.dataloader_dict = {}
        self.dataloader_dict['Train'] = torch.utils.data.DataLoader(
            dataset=train_dataset,
            sampler=torch.utils.data.DistributedSampler(train_dataset, shuffle=True)
                    if self.hp.Use_Multi_GPU else
                    torch.utils.data.RandomSampler(train_dataset),
            collate_fn=collater,
            batch_size=self.hp.Train.Batch.Train.Speaker,
            num_workers=self.hp.Train.Num_Workers,
            pin_memory=True)
        self.dataloader_dict['Dev'] = torch.utils.data.DataLoader(
            dataset=dev_dataset,
            sampler=torch.utils.data.DistributedSampler(dev_dataset, shuffle=True)
                    if self.num_gpus > 1 else
                    torch.utils.data.RandomSampler(dev_dataset),
            collate_fn=collater,
            batch_size=self.hp.Train.Batch.Eval.Speaker,
            num_workers=self.hp.Train.Num_Workers,
            pin_memory=True)
        self.dataloader_dict['Inference'] = torch.utils.data.DataLoader(
            dataset=inference_dataset,
            shuffle=True,
            collate_fn=inference_collater,
            batch_size=self.hp.Train.Batch.Eval.Speaker,
            num_workers=self.hp.Train.Num_Workers,
            pin_memory=True)
Example #3
import seaborn as sns
import matplotlib.pyplot as plt


def distribution_user_view():
    """
    View the distribution of distances for negative and positive samples.
    """

    dataset = Dataset()
    data = dataset.data
    label = dataset.label
    data = data.reshape((len(data), -1))

    data_negative = []
    data_positive = []
    for i in range(len(label)):
        item = data[i]
        if label[i] == 0.0:
            for j in range(len(item)):
                if item[j] != -1.0:
                    data_negative.append(float(item[j]))
        else:
            for j in range(len(item)):
                if item[j] != -1.0:
                    data_positive.append(float(item[j]))
    print(type(data_positive[0]))

    sns.kdeplot(
        data_positive,
        color='r',
    )
    sns.kdeplot(
        data_negative,
        color='b',
    )
    plt.show()
Example #4
import os
import pickle


def load_ds(query_params):
    # Reuse a cached Dataset pickle when one exists; otherwise build it and cache it.
    if query_params.query_name in os.listdir('../outputs/pickles/'):
        print('loading saved ds')
        with open('../outputs/pickles/' + query_params.query_name, 'rb') as f:
            data = pickle.load(f)
    else:
        print('generating new ds')
        data = Dataset(execute_query(query_params.queries))
        with open('../outputs/pickles/' + query_params.query_name, 'wb') as f:
            pickle.dump(data, f)
    return data
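
A minimal usage sketch for the caching helper above; the query_name and queries values here are hypothetical stand-ins, and the real query_params object comes from the surrounding project:

from types import SimpleNamespace

# Hypothetical parameters; load_ds caches the built Dataset under ../outputs/pickles/.
query_params = SimpleNamespace(
    query_name='example_query',
    queries=['SELECT * FROM events'],
)
data = load_ds(query_params)  # loads the pickle if it already exists, builds it otherwise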
Example #5
    def data_init(self):

        print("\nData init")
        #self.dataset = TCGA_Dataset(self.config)
        self.dataset = Dataset(self.config)

        generator = Generator(self.config, self.dataset)
        self.train_generator = generator.generate()

        self.X_val, self.y_val = self.dataset.convert_to_arrays(
            self.dataset._partition[0]['val'],
            self.dataset._partition[1]['val'],
            phase='val',
            size=self.config.sampling_size_val)

        self.X_test, self.y_test = self.dataset.convert_to_arrays(
            self.dataset._partition[0]['test'],
            self.dataset._partition[1]['test'],
            phase='test',
            size=self.config.sampling_size_test)

        self.y_test = self.patch_to_image(self.y_test, proba=False)
Example #6
    def Datset_Generate(self):
        train_Dataset = Dataset(
            pattern_path=hp.Train.Train_Pattern.Path,
            metadata_file=hp.Train.Train_Pattern.Metadata_File,
            accumulated_dataset_epoch=hp.Train.Train_Pattern.Accumulated_Dataset_Epoch,
            mel_length_min=hp.Train.Train_Pattern.Mel_Length.Min,
            mel_length_max=hp.Train.Train_Pattern.Mel_Length.Max,
            text_length_min=hp.Train.Train_Pattern.Text_Length.Min,
            text_length_max=hp.Train.Train_Pattern.Text_Length.Max,
            use_cache=hp.Train.Use_Pattern_Cache)
        dev_Dataset = Dataset(
            pattern_path=hp.Train.Eval_Pattern.Path,
            metadata_file=hp.Train.Eval_Pattern.Metadata_File,
            mel_length_min=hp.Train.Eval_Pattern.Mel_Length.Min,
            mel_length_max=hp.Train.Eval_Pattern.Mel_Length.Max,
            text_length_min=hp.Train.Eval_Pattern.Text_Length.Min,
            text_length_max=hp.Train.Eval_Pattern.Text_Length.Max,
            use_cache=hp.Train.Use_Pattern_Cache)
        inference_Dataset = Inference_Dataset(
            pattern_path=hp.Train.Inference_Pattern_File_in_Train)
        logging.info('The number of train patterns = {}.'.format(
            len(train_Dataset) //
            hp.Train.Train_Pattern.Accumulated_Dataset_Epoch))
        logging.info('The number of development patterns = {}.'.format(
            len(dev_Dataset)))
        logging.info('The number of inference patterns = {}.'.format(
            len(inference_Dataset)))

        collater = Collater()
        inference_Collater = Inference_Collater()

        self.dataLoader_Dict = {}
        self.dataLoader_Dict['Train'] = torch.utils.data.DataLoader(
            dataset=train_Dataset,
            shuffle=True,
            collate_fn=collater,
            batch_size=hp.Train.Batch_Size,
            num_workers=hp.Train.Num_Workers,
            pin_memory=True)
        self.dataLoader_Dict['Dev'] = torch.utils.data.DataLoader(
            dataset=dev_Dataset,
            shuffle=True,
            collate_fn=collater,
            batch_size=hp.Train.Batch_Size,
            num_workers=hp.Train.Num_Workers,
            pin_memory=True)
        self.dataLoader_Dict['Inference'] = torch.utils.data.DataLoader(
            dataset=inference_Dataset,
            shuffle=False,
            collate_fn=inference_Collater,
            batch_size=hp.Inference_Batch_Size or hp.Train.Batch_Size,
            num_workers=hp.Train.Num_Workers,
            pin_memory=True)

        if hp.Mode in ['PE', 'GR']:
            self.dataLoader_Dict['Prosody_Check'] = torch.utils.data.DataLoader(
                    dataset=Prosody_Check_Dataset(
                        pattern_path=hp.Train.Train_Pattern.Path,
                        metadata_file=hp.Train.Train_Pattern.Metadata_File,
                        mel_length_min=hp.Train.Train_Pattern.Mel_Length.Min,
                        mel_length_max=hp.Train.Train_Pattern.Mel_Length.Max,
                        use_cache=hp.Train.Use_Pattern_Cache),
                    shuffle=False,
                    collate_fn=Prosody_Check_Collater(),
                    batch_size=hp.Train.Batch_Size,
                    num_workers=hp.Train.Num_Workers,
                    pin_memory=True)
Example #7
    print(device)

    kwargs = {'num_workers': 8, 'pin_memory': True} if args.cuda else {}

    print('Loading the dataset...')
    path = '/home/john/Desktop/Dissertation/data/labels_PCA'
    with open(path, 'rb') as f:
        labels_dict = pickle.load(f)
    labels_ID = list(labels_dict.keys())

    path = '/home/john/Desktop/Dissertation/data/Dataset_1.npy'
    df_train = np.load(path)
    labels = np.array(list(labels_dict.values()))

    print('Creating DataLoader...')
    train_dataset = Dataset(data=df_train, labels=labels)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               **kwargs)

    train_loss = []
    model = RICA(df_train.shape[1], n_clusters=256, penalty=1.2).to(device)
    optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3, momentum=0.9)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.5)
    for epoch in range(1, args.epochs + 1):
        print('Executing Epoch...', epoch)
        train_loss.append(train(epoch, model, optimizer, scheduler))

    path = '/home/john/Desktop/Dissertation/TrainingError/RICA_ADAM_loss'
    with open(path, 'wb') as f:
Example #8
import pandas as pd
import numpy as np
from Datasets import Dataset
from vectorization import Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from cotraining import CotClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report



# Data paths
train_dir = "train/"
test_dir = "test/"
directory = "/home/goksenin/Desktop/GRADUATION PROJECT/Programming/"
dataset = Dataset(directory)
n_train = 3375
n_test = 1125
X_train, Y_train = dataset.get_set(train_dir)
X_test, Y_test = dataset.get_set(test_dir)

# -1: unlabeled, 0: non-relative, 1: relative
y_train = np.asarray(Y_train)
y_train[n_train//4: ] = -1

####### FEATURE EXTRACTION
# Getting related documents for feature extraction
relative_index = [i for i, y_i in enumerate(Y_train) if y_i == 1]
related_data = []
for index in relative_index:
    related_data.append(X_train[index])
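
A rough continuation sketch (an assumption on my part, not part of the original file): since TfidfVectorizer is already imported, the related documents collected above could be used to fit the vectorizer before transforming both splits, assuming X_train and X_test hold raw document strings. The max_features and stop_words values below are illustrative choices only:

# Sketch only: fit TF-IDF on the related documents, then vectorize both splits.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf.fit(related_data)
X_train_vec = tfidf.transform(X_train).toarray()  # dense arrays suit GaussianNB
X_test_vec = tfidf.transform(X_test).toarray()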
Example #9
    def data_init(self):
        print("\nData init")
        self.dataset = Dataset(self.config)
        self.generator = Generator(self.config, self.dataset)