Example #1
    def _set_dataset(self):
        if self._dataset is not None:
            return

        self._seed()

        self._dataset = Dataset(
            filename=self.args.datafile,
            folder=self.args.dataroot,
            transformer=self.importer["transformer"],
            normalize=self.args.normalize,
        )

        if self.args.verbose:
            print("dataset loaded, {} classes in total".format(
                self._dataset.num_classes))
            print("train_shape = {}, test_shape = {}".format(
                self._dataset.train.X.shape, self._dataset.test.X.shape))

        self._dataset.filter(labels=self.args.labels)
        if self.args.balance:
            self._dataset.balance()
        self._dataset.sample(train_size=self.args.size,
                             test_size=self.args.size)

        if self.args.verbose:
            print("dataset downsampled, {} classes in total".format(
                self._dataset.num_classes))
            print("train_shape = {}, test_shape = {}".format(
                self._dataset.train.X.shape, self._dataset.test.X.shape))
Example #2
    def __init__(self):
        self.val_inc_set = parameters.full_val_inc_set
        self.net_income_dict = None
        self.count_dict = None
        self.val_inc_count = None
        self.dataset = Dataset()
        self.initial_price = None
        self.return_ratio_dict = None
        self.full_inc_set = dict()
        self.full_count_dict = dict()
Example #3
def main():
    BATCH_SIZE = 32
    NUM_EPOCH = 12
    LR = 0.001
    CLIP = 1
    STEP_SIZE = 4
    GAMMA = 0.1
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    ENC_HID_DIM = 512
    DEC_HID_DIM = 512
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5

    device = torch.device('cuda')

    dataset = Dataset()
    train_data, valid_data, test_data = dataset.build_dataset()
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device)

    INPUT_DIM = len(dataset.SRC.vocab)
    OUTPUT_DIM = len(dataset.TRG.vocab)
    SRC_PAD_IDX = dataset.SRC.vocab.stoi[dataset.SRC.pad_token]
    TRG_PAD_IDX = dataset.TRG.vocab.stoi[dataset.TRG.pad_token]

    encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM,
                      ENC_DROPOUT)
    attention = Attention(ENC_HID_DIM, DEC_HID_DIM)
    decoder = Decoder(DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, OUTPUT_DIM,
                      DEC_DROPOUT, attention)
    model = Seq2Seq(encoder, decoder, SRC_PAD_IDX, device)
    model.apply(init_weight)
    model.to(device)
    optimizer = Adam(model.parameters(), lr=LR)
    criterion = CrossEntropyLoss(ignore_index=TRG_PAD_IDX).to(device)
    scheduler = StepLR(optimizer, STEP_SIZE, GAMMA)

    min_valid_loss = 1e10

    for e in range(NUM_EPOCH):
        print("Epoch: {}".format(e + 1))
        train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
        print("Train loss: {}".format(train_loss))
        valid_loss = evaluate(model, valid_iterator, criterion)
        print("Valid loss: {}".format(valid_loss))

        if valid_loss < min_valid_loss:
            torch.save(model.state_dict(), "best_model.pt")
            min_valid_loss = valid_loss
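
The loop above calls train and evaluate helpers that are not shown in this example. Below is a minimal sketch of what train might look like for this encoder-decoder setup, assuming each batch exposes .src and .trg tensors and that model(src, trg) returns logits shaped [trg_len, batch_size, output_dim] (these interface details are assumptions, not taken from the original source); evaluate would be the same loop under torch.no_grad() without the backward and optimizer steps:

import torch

def train(model, iterator, optimizer, criterion, clip):
    # One pass over the training iterator with teacher forcing and gradient clipping.
    model.train()
    epoch_loss = 0
    for batch in iterator:
        src, trg = batch.src, batch.trg
        optimizer.zero_grad()
        output = model(src, trg)
        # drop the <sos> position and flatten for CrossEntropyLoss
        output = output[1:].reshape(-1, output.shape[-1])
        trg = trg[1:].reshape(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)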
Example #4
def main(fpath):
    ENC_EMB_DIM = 256
    DEC_EMB_DIM = 256
    ENC_HID_DIM = 512
    DEC_HID_DIM = 512
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5

    device = torch.device('cuda')
    dataset = Dataset()
    INPUT_DIM = len(dataset.SRC.vocab)
    OUTPUT_DIM = len(dataset.TRG.vocab)
    SRC_PAD_IDX = dataset.SRC.vocab.stoi[dataset.SRC.pad_token]

    encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM,
                      ENC_DROPOUT)
    attention = Attention(ENC_HID_DIM, DEC_HID_DIM)
    decoder = Decoder(DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, OUTPUT_DIM,
                      DEC_DROPOUT, attention)
    model = Seq2Seq(encoder, decoder, SRC_PAD_IDX, device)
    model.load_state_dict(torch.load("best_model.pt"))
    model.to(device)
    with open(fpath, "r") as f:
        sentences = f.readlines()

    translate_sentence(model, sentences, dataset.SRC, dataset.TRG, device)
Example #5
class Worktable:
    def __init__(self):
        self.dataset = Dataset()
        self.dataset.get_data()

    def plot_price(self):
        time_axis = [2014.0 + 1.0 / 24 + i * 1.0 / 12 for i in range(72)]
        for inc_id in list(self.dataset.val_inc_set):
            price_list = self.dataset.price_dict[inc_id]
            plt.plot(time_axis, price_list)
            plt.xlabel('years')
            plt.ylabel('price per share')
            plt.xlim((2014, 2020))
            plt.title(f'{inc_id}')
            plt.savefig(f'figure/price/{inc_id}.png')
            plt.clf()
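
The time axis in plot_price places each of the 72 monthly prices at the midpoint of its month, covering January 2014 through December 2019. A quick standalone check of the endpoints (not part of the original class):

time_axis = [2014.0 + 1.0 / 24 + i * 1.0 / 12 for i in range(72)]
print(time_axis[0])   # ~2014.042, mid-January 2014
print(time_axis[-1])  # ~2019.958, mid-December 2019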
Example #6
    def test_build_search_session(self):
        importer = ReflexiveImporter("neural_net_adam")
        dataset = Dataset(folder="../dataset")
        self.session = SearchSession(importer.model,
                                     importer.param_dist,
                                     dataset,
                                     n_iter=1,
                                     cv=3)
Example #7
class Worktable:
    def __init__(self):
        self.dataset = Dataset()
        self.dataset.get_data()
        self.volatility = None
    
    def cal_volatility(self):
        # we use e as the base when calculating the log return
        self.volatility = {}
        for inc_id in list(self.dataset.val_inc_set):
            log_return_list = []
            price_list = self.dataset.price_dict[inc_id]
            assert len(price_list) == 72
            for t in range(len(price_list) - 1):
                log_return = math.log(price_list[t + 1]) - math.log(price_list[t])
                log_return_list.append(log_return)
            assert len(log_return_list) == 71
            return_mean = sum(log_return_list) / len(log_return_list)
            vol = 0
            for r in log_return_list:
                vol += (r - return_mean) ** 2
            vol = (vol / (len(log_return_list) - 1)) ** 0.5
            self.volatility[inc_id] = vol
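
For each company, cal_volatility computes the sample standard deviation (n - 1 in the denominator) of the 71 month-over-month log returns. A hypothetical NumPy equivalent for a single price series, shown only to make the formula explicit:

import numpy as np

def volatility(price_list):
    # Sample standard deviation (ddof=1) of monthly log returns,
    # matching the manual loop in cal_volatility above.
    log_returns = np.diff(np.log(price_list))
    return log_returns.std(ddof=1)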
Example #8
def main():
    """Load data, train network, visualize results."""
    data_dir = 'data/'
    trainset = loadmat(data_dir + 'train_32x32.mat')
    testset = loadmat(data_dir + 'test_32x32.mat')
    dataset = Dataset(trainset, testset)

    tf.reset_default_graph()
    dcgan = DCGAN(dataset)

    losses, samples = dcgan.train()

    # samples, losses = dcgan.load_pickle_data()

    dcgan.view_samples(-1, samples)
    dcgan.visualize_loss(losses)
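
This snippet relies on imports it does not show; DCGAN and Dataset are project-local classes, while the rest can reasonably be assumed to be:

import tensorflow as tf        # TensorFlow 1.x API (tf.reset_default_graph, etc.)
from scipy.io import loadmat   # loads the train_32x32.mat / test_32x32.mat files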
Example #9
    def build(self, datadir, test_only=False):
        self.logger.info("Building trainer class %s" % self.__class__.__name__)
        self.logger.info("Loading data from [%s]..." % (datadir))
        self.dataset = Dataset.load_ds(datadir, test_only)
        self.logger.info(str(self.dataset))

        # build model, loss, optimizer
        self.logger.info("Constructing model with hparams:\n%s" %
                         (json.dumps(self.config['Model'], indent=4)))

        self._build_models()

        self.logger.info('Constructing optimizer: %s' %
                         self.config['Trainer']['optimizer'])
        optimizer = getattr(torch.optim, self.config['Trainer']['optimizer'])
        self._opt = optimizer(self._model.parameters(),
                              self.config['Trainer']['lr'])
        params = [(name, p.shape)
                  for name, p in self._model.named_parameters()]
        self.logger.debug('Optimizing parameters: %s' % str(params))
Example #10
def train_and_evaluate(model, epochs, batches, gpus=[], dual=False, plot_history=False, plot_model=False):
    import keras, tensorflow as tf
    from keras import utils

    if len(gpus) > 0:
        os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(gpus)

        config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
        sess = tf.Session(config=config)
        keras.backend.set_session(sess)
        keras.backend.get_session().run(tf.global_variables_initializer())

    if plot_model:
        if dual:
            utils.plot_model(model, to_file='dual_model.png', show_shapes=True)
        else:
            utils.plot_model(model, to_file='single_model.png', show_shapes=True)

    fetcher = DataFetcher()
    current_epochs = 0
    history = None

    if dual:
        data_type = 'split'
    else:
        data_type = 'stack'

    for samples in fetcher.fetch_inf(type=data_type):
        if current_epochs >= epochs:
            break

        if dual:
            (x_train1, x_train2, y_train), (x_test1, x_test2, y_test) = samples

            history = model.fit(
                [x_train1, x_train2], y_train,
                batch_size=batches,
                epochs=EPOCHS_BATCH + current_epochs,
                initial_epoch=current_epochs,
                verbose=1,
                validation_data=([x_test1, x_test2], y_test),
            )
            model.save(DUAL_MODEL_NAME)
        else:
            (x_train, y_train), (x_test, y_test) = samples

            history = model.fit(
                x_train, y_train,
                batch_size=batches,
                epochs=EPOCHS_BATCH + current_epochs,
                initial_epoch=current_epochs,
                verbose=1,
                validation_data=(x_test, y_test),
            )
            model.save(SINGLE_MODEL_NAME)

        current_epochs += EPOCHS_BATCH

    if plot_history:
        import matplotlib.pyplot as plt

        # Plot training & validation accuracy values
        plt.plot(history.history['acc'])
        plt.plot(history.history['val_acc'])
        plt.title('Model accuracy')
        plt.ylabel('Accuracy')
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Test'], loc='upper left')
        plt.show()

        # Plot training & validation loss values
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('Model loss')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Test'], loc='upper left')
        plt.show()

    dataset = Dataset()
    dataset.load(number=0)

    if dual:
        (x_train1, x_train2, y_train), (x_test1, x_test2, y_test) = dataset.data(type='split')
        score = model.evaluate([x_test1, x_test2], y_test, verbose=0)
        model.save(DUAL_MODEL_NAME)
    else:
        (x_train, y_train), (x_test, y_test) = dataset.data(type='stack')
        score = model.evaluate(x_test, y_test, verbose=0)
        model.save(SINGLE_MODEL_NAME)

    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
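
train_and_evaluate also depends on module-level constants that are not part of this snippet. Plausible placeholder definitions (the values and file names here are assumptions, not from the original project):

EPOCHS_BATCH = 5                        # epochs to run per chunk of fetched data
SINGLE_MODEL_NAME = 'single_model.h5'   # checkpoint path for the single-input model
DUAL_MODEL_NAME = 'dual_model.h5'       # checkpoint path for the dual-input model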
Example #11
    BATCH = 16
    START_LR = 1e-3
    STOP_LR = 1e-4
    DECAY_OVER = 400000


args.parse_args()

with open(args.CONFIG, "r") as config:
    config = yaml.safe_load(config)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = DDSP(**config["model"]).to(device)

dataset = Dataset(config["preprocess"]["out_dir"])

dataloader = torch.utils.data.DataLoader(
    dataset,
    args.BATCH,
    True,
    drop_last=True,
)

mean_loudness, std_loudness = mean_std_loudness(dataloader)
config["data"]["mean_loudness"] = mean_loudness
config["data"]["std_loudness"] = std_loudness

writer = SummaryWriter(path.join(args.ROOT, args.NAME), flush_secs=20)

with open(path.join(args.ROOT, args.NAME, "config.yaml"), "w") as out_config:
Example #12
def main(args):
    # set up logs and device
    args.save_dir = get_save_dir(args.save_dir, args.name)
    log = get_logger(args.save_dir, args.name)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')

    # set random seed
    log.info(f'Using random seed {args.seed}...')
    set_seeds(args.seed)

    # create dataset using torchtext
    log.info(f'Build data fields and {args.bert_variant} tokenizer...')
    dataset = Dataset(args.bert_variant)
    TEXT, LABEL = dataset.get_fields()

    # train:valid:test = 17500:7500:25000
    log.info('Build IMDb dataset using torchtext.datasets...')
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    train_data, valid_data = train_data.split(
        random_state=random.seed(args.seed))

    # iterators
    train_iterator, valid_iterator, test_iterator = dataset.get_iterators(
        train_data, valid_data, test_data, args.batch_size, device)

    # build LABEL vocabulary
    LABEL.build_vocab(train_data)

    # define model
    log.info('Building model...')
    model = BERTSentiment(args.bert_variant, args.hidden_dim, args.output_dim,
                          args.n_layers, args.bidirectional, args.dropout)

    # optimizer
    optimizer = optim.Adam(model.parameters())

    # criterion
    criterion = nn.BCEWithLogitsLoss()

    # place model and criterion on device
    model = model.to(device)
    criterion = criterion.to(device)

    # train set and validation set
    best_valid_loss = float('inf')
    for epoch in range(args.num_epochs):

        start_time = time.time()

        log.info(f'Training, epoch = {epoch}...')
        train_loss, train_acc = train(model, train_iterator, optimizer,
                                      criterion)

        log.info(f'Evaluating, epoch = {epoch}...')
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            log.info(f'Saving best model...')
            best_valid_loss = valid_loss
            torch.save(model.state_dict(),
                       f'{args.save_dir}/{args.model_name}')

    log.info('Model trained and evaluated...')

    # test set
    log.info('Testing...')
    model.load_state_dict(torch.load(f'{args.save_dir}/{args.model_name}'))
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
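
The epoch_time helper used above is not shown; a minimal sketch consistent with how it is called (start_time and end_time are time.time() timestamps):

def epoch_time(start_time, end_time):
    # Split an elapsed wall-clock interval into whole minutes and remaining seconds.
    elapsed = end_time - start_time
    mins = int(elapsed / 60)
    secs = int(elapsed - mins * 60)
    return mins, secs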
Example #13
    def __init__(self):
        self.dataset = Dataset()
        self.dataset.get_data()
        self.volatility = None
Example #14
import os
from preprocess import Dataset
from global_utils import dump, JsonMetricQueueWriter
from .search_session import SearchSession
from .sklearn_args import SklearnSessionParser, SklearnSessionArgs
from reflexive_import import ReflexiveImporter

if __name__ == '__main__':
    parser = SklearnSessionParser()
    args = SklearnSessionArgs(parser)

    dataset = Dataset(args.datafile, args.dataroot)
    dataset.filter(args.labels)
    if args.balance:
        dataset.balance()
    dataset.sample(args.size)

    importer = ReflexiveImporter(module_name=args.model,
                                 var_list=["model", "parameter_distribution"],
                                 alias_list=["model", "param"])
    session = SearchSession(importer["model"], importer["param"], dataset,
                            args.n_iter, args.cv)
    session.report_args()

    # tune (search for) hyper-parameters
    session.fit()
    session.report_best()
    session.report_result()
    dump(session.search_results, os.path.join(args.output,
                                              "search-results.pkl"))
Example #15
class BaseSessionBuilder:
    def __init__(self, args: TorchSessionArgs):
        self.args = args
        if self.args.verbose:
            print(self.args)

        self.importer = ReflexiveImporter(
            module_name=self.args.model,
            var_list=[
                "builder_class", "model_args", "model_kwargs", "transformer"
            ],
            package_name="pytorch_models",
        )

        self._dataset = None
        self._model = None
        self._device = None
        self._writer = None
        self._session = None
        self._set_device()

        self.static_model_kwargs = dict(
            pretrained_path=self.args.pretrained,
            device=self._device,
        )

    def _seed(self):
        if self.args.seed is not None:
            np.random.seed(self.args.seed)
            if self.args.verbose:
                print("setting numpy random seed to {}".format(self.args.seed))
        elif self.args.verbose:
            print("no random seed specified for numpy")

    def _set_dataset(self):
        if self._dataset is not None:
            return

        self._seed()

        self._dataset = Dataset(
            filename=self.args.datafile,
            folder=self.args.dataroot,
            transformer=self.importer["transformer"],
            normalize=self.args.normalize,
        )

        if self.args.verbose:
            print("dataset loaded, {} classes in total".format(
                self._dataset.num_classes))
            print("train_shape = {}, test_shape = {}".format(
                self._dataset.train.X.shape, self._dataset.test.X.shape))

        self._dataset.filter(labels=self.args.labels)
        if self.args.balance:
            self._dataset.balance()
        self._dataset.sample(train_size=self.args.size,
                             test_size=self.args.size)

        if self.args.verbose:
            print("dataset downsampled, {} classes in total".format(
                self._dataset.num_classes))
            print("train_shape = {}, test_shape = {}".format(
                self._dataset.train.X.shape, self._dataset.test.X.shape))

    def _set_model(self):
        if self._model is not None:
            return

        self._set_dataset()

        builder_class = self.importer["builder_class"]  # type: callable
        model_args = self.importer["model_args"]  # type: tuple
        model_kwargs = self.importer["model_kwargs"]  # type: dict
        model_kwargs.update(self.static_model_kwargs)
        model_kwargs.update(dict(num_classes=self._dataset.num_classes))

        model_builder = builder_class(*model_args, **model_kwargs)
        self._model = model_builder()

        if self.args.verbose:
            print("using model", self._model)

    def _set_device(self):
        if self._device is not None:
            return

        self._device = torch.device(
            "cuda" if self.args.cuda and torch.cuda.is_available() else "cpu")
        if self.args.verbose:
            print("using device: {}".format(self._device))

    def _set_writer(self):
        if self._writer is not None:
            return

        self._writer = SummaryWriter(log_dir=self.args.logdir)
        if self.args.verbose:
            print("logging summaries at", self._writer.log_dir)

    def _set_session(self):
        if self._session is not None:
            return

        self._set_dataset()
        self._set_model()
        self._set_device()
        self._set_writer()

    @property
    def dataset(self):
        self._set_dataset()
        return self._dataset

    @property
    def model(self):
        self._set_model()
        return self._model

    @property
    def device(self):
        self._set_device()
        return self._device

    @property
    def writer(self):
        self._set_writer()
        return self._writer

    @property
    def session(self):
        self._set_session()
        return self._session

    def __call__(self, *args, **kwargs):
        return self.session
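
A possible driver for BaseSessionBuilder, mirroring the sklearn script in Example #14; the TorchSessionParser name here is an assumption based on that example and on the TorchSessionArgs type hint above:

parser = TorchSessionParser()   # hypothetical parser, analogous to SklearnSessionParser
args = TorchSessionArgs(parser)

builder = BaseSessionBuilder(args)
model = builder.model           # lazily loads the dataset and builds the model
writer = builder.writer         # SummaryWriter logging to args.logdir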
Example #16
    def __init__(self):
        self.dataset = Dataset()
        self.dataset.get_data()
class RDataset:
    def __init__(self):
        self.val_inc_set = parameters.full_val_inc_set
        self.net_income_dict = None
        self.count_dict = None
        self.val_inc_count = None
        self.dataset = Dataset()
        self.initial_price = None
        self.return_ratio_dict = None
        self.full_inc_set = dict()
        self.full_count_dict = dict()

    def get_data(self):
        self.dataset.get_data()
        self.initial_price = self.dataset.initial_price
        self.net_income_dict = {}
        self.count_dict = {}
        for y in range(2014, 2020):
            for q in [2, 4]:
                #with open(f"financial_report/U_{y}Q{q}.csv", encoding='big5-hkscs') as f:
                with open(f"financial_report/U_{y}Q{q}.csv") as f:
                    lines = f.readlines()
                    err = 0
                    #for i in range(38,len(lines)):
                    for i in range(len(lines)):
                        line = lines[i].strip()
                        char_list = line.split(',')
                        if char_list[1] and char_list[1][0] == '(':
                            char_list[1] = char_list[1][1:-1]
                        try:
                            inc_id = int(char_list[0])
                            net_income = float(char_list[1])
                            #using full dict to check if company has full data
                            self.full_inc_set[inc_id] = self.full_inc_set.get(
                                inc_id, [])
                            self.full_inc_set[inc_id].append(net_income)
                            self.full_count_dict[
                                inc_id] = self.full_count_dict.get(inc_id,
                                                                   0) + 1
                            if inc_id in self.val_inc_set:
                                self.net_income_dict[
                                    inc_id] = self.net_income_dict.get(
                                        inc_id, [])
                                self.net_income_dict[inc_id].append(net_income)
                                self.count_dict[inc_id] = self.count_dict.get(
                                    inc_id, 0) + 1
                        except:
                            err += 1
        '''        
        #print the number
        print('full_count_dict:', self.full_count_dict)
        count_list = [0 for i in range(13)]
        for v in self.full_count_dict.values():
            count_list[v] +=1
        print('count_list:',count_list)
        new_parameters = set()
        for inc,v in self.full_count_dict.items():
            if v >= 10 and inc>=1000:
                print(v)
                new_parameters.add(inc)
        print('new_parameters:', new_parameters)
        print(done)
        '''

        self.val_inc_count = 0
        for inc, c in self.count_dict.items():
            if c == 12:
                self.val_inc_count += 1

        #compute income_sum_dict
        self.income_sum_dict = {}
        for inc_id in list(self.val_inc_set):
            print(len(self.net_income_dict[inc_id]))
            assert len(self.net_income_dict[inc_id]) >= 10
            self.income_sum_dict[inc_id] = sum(self.net_income_dict[inc_id])

        print('net_income_dict:', self.net_income_dict)
        print('num of inc:', len(self.net_income_dict.keys()))
        print('count_dict:', self.count_dict)
        print('val_inc_count:', self.val_inc_count)
        print('val_inc_set:', self.val_inc_set)
        print('initial_price:', self.initial_price)
        print('income_sum_dict:', self.income_sum_dict)

    def cal_return_ratio(self):
        self.return_ratio_dict = {}
        for inc_id in list(self.val_inc_set):
            assert len(self.net_income_dict[inc_id]) >= 10
            income_sum = sum(self.net_income_dict[inc_id]) / len(
                self.net_income_dict[inc_id])
            initial_price = self.initial_price[inc_id]
            return_ratio = income_sum / initial_price
            self.return_ratio_dict[inc_id] = return_ratio

        print('return_ratio_dict:', self.return_ratio_dict)
        print('max_return', max(self.return_ratio_dict.values()))

    def cal_volatility(self):
        self.dataset.cal_volatility()
        print('volatility:', self.dataset.volatility)

    def plot_scatter(self):
        x = []
        y = []
        for inc in list(self.val_inc_set):
            volatility = self.dataset.volatility[inc]
            return_ratio = self.return_ratio_dict[inc]
            x.append(volatility)
            y.append(return_ratio)
        plt.scatter(x, y)
        plt.xlabel('volatility')
        plt.ylabel('mean P2E ratio')  # Chinese??
        plt.title('mean P2E ratio vs. volatility')
        plt.savefig('figure/scatter/new_scatter.png')
        plt.clf()

    def get_corrcoef(self):
        x = []
        y = []
        for inc in list(self.val_inc_set):
            volatility = self.dataset.volatility[inc]
            return_ratio = self.return_ratio_dict[inc]
            if 0.05 <= volatility <= 0.15 and 0 <= return_ratio <= 2:  #remove outliers
                x.append(volatility)
                y.append(return_ratio)
        x = np.array(x)
        y = np.array(y)
        self.corrcoef = np.corrcoef(x, y)[0][1]

        print('correlation coefficient:', self.corrcoef)

    def get_index_data(self):
        self.debt_ratio_dict = dict()
        self.d2n_ratio_dict = dict()
        self.report_score_dict = dict()
        self.cash_ratio_dict = dict()
        self.quick_ratio_dict = dict()
        self.current_ratio_dict = dict()
        self.ipm_dict = dict()
        self.cash_flow_ratio_dict = dict()
        self.dict_list = [
            None, self.debt_ratio_dict, self.d2n_ratio_dict,
            self.report_score_dict, self.cash_ratio_dict,
            self.quick_ratio_dict, self.current_ratio_dict, self.ipm_dict,
            self.cash_flow_ratio_dict, None
        ]
        with open('index/new_mean.csv', encoding='utf-8') as f:
            lines = f.readlines()
            for line in lines[1:]:
                char_list = line.strip().split(',')
                inc_id = int(char_list[0])
                for index in range(1, 10):
                    try:
                        self.dict_list[index][inc_id] = float(char_list[index])
                    except:
                        continue

    def plot_index_scatter_and_get_corrcoef(self):
        self.xlabel_list = [
            None, 'debt ratio', 'debt-to-net worth ratio',
            'financial report score', 'cash ratio', 'quick ratio',
            'current ratio', 'ipm', 'cash_flow_ratio', 'stability'
        ]
        for index in range(1, 10):
            x_list = []
            y_list = []
            x_dict = self.dict_list[index]
            if x_dict is None:
                continue
            y_dict = self.return_ratio_dict
            for inc in list(self.val_inc_set):
                if inc not in x_dict or inc not in y_dict:
                    print(f'incomplete data :{inc}')
                else:
                    if -1000 <= x_dict[inc] <= 1000:
                        x_list.append(x_dict[inc])
                        y_list.append(y_dict[inc])
            xlabel = self.xlabel_list[index]
            plt.scatter(x_list, y_list)
            plt.xlabel(xlabel)
            plt.ylabel('mean P2E ratio')
            plt.title(f'mean P2E ratio vs. {xlabel}')
            plt.savefig(f'figure/scatter/{xlabel}.png')
            plt.clf()
            #get corrcoef
            x_array = np.array(x_list)
            y_array = np.array(y_list)
            corrcoef = np.corrcoef(x_array, y_array)[0][1]
            with open(f'figure/corrcoef/{xlabel}.txt', 'w+') as f:
                f.write(f'corrcoef: {corrcoef}\n')
class TestDataset(unittest.TestCase):
    def setUp(self):
        self.dataset = Dataset(folder="../dataset")
        self.n_train = len(self.dataset.train)
        self.n_test = len(self.dataset.test)

    def test_sample_size(self):
        self.assertEqual(self.dataset.train.X.shape[0], self.dataset.train.y.shape[0])
        self.assertEqual(self.dataset.test.X.shape[0], self.dataset.test.y.shape[0])

    def test_dimension_size(self):
        self.assertEqual(self.dataset.train.X.shape[1], self.dataset.test.X.shape[1])
        self.assertEqual(len(self.dataset.train.y.shape), 1)
        self.assertEqual(len(self.dataset.test.y.shape), 1)

    def test_type(self):
        self.assertIsInstance(self.dataset.mapping, dict)
        self.assertIsInstance(self.dataset.train.X, np.ndarray)
        self.assertIsInstance(self.dataset.train.y, np.ndarray)
        self.assertIsInstance(self.dataset.test.X, np.ndarray)
        self.assertIsInstance(self.dataset.test.y, np.ndarray)

    def test_sample_train(self):
        self.dataset.sample_train(0.5)
        self.assertAlmostEqual(len(self.dataset.train), self.n_train * 0.5, delta=1)
        self.dataset.reset_train()

        self.dataset.sample_test(0.3)
        self.assertAlmostEqual(len(self.dataset.test), self.n_test * 0.3, delta=1)
        self.dataset.reset_test()

        self.dataset.sample_train(0.2).sample_train(0.5)
        self.assertAlmostEqual(len(self.dataset.train), self.n_train * 0.2 * 0.5, delta=1)
        self.dataset.reset_train()

        self.dataset.sample_test(0.9).sample_test(0.9)
        self.assertAlmostEqual(len(self.dataset.test), self.n_test * 0.9 * 0.9, delta=1)
        self.dataset.reset_test()

        self.dataset.sample_test(3.0)
        self.assertAlmostEqual(len(self.dataset.test), self.n_test, delta=1)
        self.dataset.reset_test()

        self.dataset.sample_train(1000)
        self.assertAlmostEqual(len(self.dataset.train), 1000, delta=1)
        self.dataset.reset_train()

        self.dataset.sample_train(10000000000)
        self.assertAlmostEqual(len(self.dataset.train), self.n_train, delta=1)
        self.dataset.reset_train()

        self.dataset.sample_test(3534)
        self.assertAlmostEqual(len(self.dataset.test), 3534, delta=1)
        self.dataset.reset_test()
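
A minimal way to run the test case above, assuming the usual import unittest at the top of the test module:

if __name__ == '__main__':
    unittest.main()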