Example #1
def main():
    """
    """
    # data = credit.dataset_31_credit_g()
    data = wine.wine_quality_red_csv()
    print(data.columns)
    # column = data['volatile_acidity'].values.reshape(-1, 1)
    # column = data[].values.reshape(-1, 1)
    X, y = data['volatile_acidity'].values.reshape(-1, 1), data['class']
    X_train, X_test, y_train, y_test = split(X,
                                             y,
                                             test_size=0.2,
                                             random_state=0)
    sets = split(X_test, y_test, test_size=.5, random_state=0)
    X_first_half, X_second_half, y_first_half, y_second_half = sets
    # print(X_first_half.shape, X_second_half.shape)
    # X_train, X_test, y_train, y_test = split(X, y,
    #                                          test_size=0.2,
    #                                          random_state=0)

    pipeline = WineQualityPipeline()
    classifier = RandomForest()
    model = pipeline.with_estimator(classifier).fit(X_train, y_train)
    # prediction = model.predict(X_test)
    # pipeline = CreditGPipeline()
    shift_detector = SklearnDataShiftDetector(model, n_bins=30)
    shift_detector.iteration(X_first_half)
    new_second_half = deepcopy(X_second_half)
    mask = np.logical_and(X_second_half > .4, X_second_half < 1.)
    new_second_half[mask] *= 3.
    plt.plot(range(X_first_half.shape[0]), X_first_half, 'go')
    plt.plot(range(new_second_half.shape[0]), new_second_half, 'r^')
    plt.show()
    shift_detector.iteration(new_second_half)
    print(shift_detector.data_is_shifted())
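Across these examples, `split` is shorthand for scikit-learn's `train_test_split` (Example #8 below shows the import explicitly). A minimal, self-contained sketch of the aliased two-stage split used in Example #1, with toy arrays standing in for the wine columns:

from sklearn.model_selection import train_test_split as split
import numpy as np

X = np.arange(20).reshape(-1, 1)   # stand-in for the 'volatile_acidity' column
y = np.arange(20) % 2              # stand-in for the 'class' labels
X_train, X_test, y_train, y_test = split(X, y, test_size=0.2, random_state=0)
X_first_half, X_second_half, y_first_half, y_second_half = split(
    X_test, y_test, test_size=0.5, random_state=0)
print(X_train.shape, X_first_half.shape, X_second_half.shape)  # (16, 1) (2, 1) (2, 1)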
Example #2
def load_data(task, seed, val_size, test_size):
    """Load dataset
    Task: graph classification on benchmark datasets consisting of various graphs we want to classify.

    Returns:
    - train_dataset: PyG dataset
    - val_dataset: PyG dataset
    - test_dataset: PyG dataset
    - test_idx: indices of the original dataset used for testing - to recover original data.
    """
    path = "../data/TUDataset"

    if task == 'mutag':
        dataset = 'Mutagenicity'
        dataset = TUDataset(path,
                            dataset,
                            transform=T.NormalizeFeatures(),
                            cleaned=True)
    elif task == 'enzymes':
        dataset = 'ENZYMES'
        dataset = TUDataset(path,
                            dataset,
                            transform=T.NormalizeFeatures(),
                            cleaned=True)
    elif task == 'proteins':
        dataset = 'PROTEINS'
        dataset = TUDataset(path,
                            dataset,
                            transform=T.NormalizeFeatures(),
                            cleaned=True)
    else:
        NameError(f"task {args.task} not allowed")

    print(f'Dataset: {dataset}:')
    print('====================')
    print(f'Number of graphs: {len(dataset)}')
    print(f'Number of features: {dataset.num_features}')
    print(f'Number of classes: {dataset.num_classes}')

    indices = [i for i in range(len(dataset))]
    train_idx, test_idx = split(indices,
                                random_state=seed,
                                test_size=test_size)
    train_dataset = dataset[train_idx]
    test_dataset = dataset[test_idx]

    indices = [i for i in range(len(train_dataset))]
    train_idx, val_idx = split(indices,
                               random_state=seed,
                               test_size=val_size / (1 - test_size))
    val_dataset = train_dataset[val_idx]
    train_dataset = train_dataset[train_idx]
    print(f'Number of training graphs: {len(train_dataset)}')
    print(f'Number of val graphs: {len(val_dataset)}')
    print(f'Number of test graphs: {len(test_dataset)}')

    return train_dataset, val_dataset, test_dataset, test_idx
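The inner split above uses `test_size=val_size / (1 - test_size)` so that the validation set is `val_size` of the original dataset, not of the reduced training set. A quick check with hypothetical numbers (600 graphs, test_size=0.2, val_size=0.1):

from sklearn.model_selection import train_test_split as split

indices = list(range(600))
train_idx, test_idx = split(indices, random_state=0, test_size=0.2)  # 480 / 120
train_idx, val_idx = split(train_idx, random_state=0,
                           test_size=0.1 / (1 - 0.2))                # 0.125 of 480
print(len(train_idx), len(val_idx), len(test_idx))                   # 420 60 120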
Example #3
def main():
    parser = argparse.ArgumentParser(
        description='Preprocess a directory of XML files')
    parser.add_argument('--indir',
                        type=str,
                        default='../eltec/ELTeC-eng',
                        help='location of the original Eltec XML-files')
    parser.add_argument('--outdir',
                        type=str,
                        default='data/raw',
                        help='location of the train, dev, and test split')
    parser.add_argument('--seed', type=int, default=12345, help='random seed')
    parser.add_argument('--train_size',
                        type=float,
                        default=.9,
                        help='Proportion of training data')

    args = parser.parse_args()
    print(args)

    filenames = glob.glob(os.sep.join((f'{args.indir}/level1', '**', '*.xml')),
                          recursive=True)

    train, rest = split(filenames,
                        train_size=args.train_size,
                        shuffle=True,
                        random_state=args.seed)
    dev, test = split(rest,
                      train_size=0.5,
                      shuffle=True,
                      random_state=args.seed)

    print(f'# train files: {len(train)}')
    print(f'# dev files: {len(dev)}')
    print(f'# test files: {len(test)}')

    if not os.path.exists(args.outdir):
        os.mkdir(args.outdir)

    out_folder = f'{args.outdir}/{os.path.basename(args.indir)}'
    try:
        shutil.rmtree(out_folder)
    except FileNotFoundError:
        pass
    os.mkdir(out_folder)

    with open(f'{out_folder}/train.txt', 'w') as f:
        for fn in train:
            f.write(plain_text(fn) + '\n')

    with open(f'{out_folder}/dev.txt', 'w') as f:
        for fn in dev:
            f.write(plain_text(fn) + '\n')

    with open(f'{out_folder}/test.txt', 'w') as f:
        for fn in test:
            f.write(plain_text(fn) + '\n')
Example #4
def partition(characteristics, categories, output, batch, training_percentage = 100, validation_percentage = 0, flag = False):

	train_percent = training_percentage / 100
	validate_percent = validation_percentage / 100
	test_percent = round(1 - (train_percent + validate_percent), 1)
	if ((output == 1) & (isinstance(categories[0], np.ndarray) == False) & (flag == False)): categories = categories.reshape((len(categories), -1))
	training_x, training_y = characteristics, categories
	train_x, train_y = torch.FloatTensor(training_x), training_y
	if (output > 1): train_y = torch.LongTensor(training_y)
	elif (output == 1): train_y = torch.FloatTensor(training_y)
	train = group(train_x, train_y)
	trainer = loader(dataset = train, batch_size = batch, shuffle = True, num_workers = 2)
	tester, validater = None, None

	if (test_percent > 0):

		training_x, testing_x, training_y, testing_y = split(characteristics, categories, test_size = test_percent, random_state = np.random.randint(1, 100), shuffle = True, stratify = categories)
		train_x = torch.FloatTensor(training_x)
		if (output > 1): train_y = torch.LongTensor(training_y)
		elif (output == 1): train_y = torch.FloatTensor(training_y)
		else: train_y = training_y.float()
		train = group(train_x, train_y)
		trainer = loader(dataset = train, batch_size = batch, shuffle = True, num_workers = 2)
		test_x = torch.FloatTensor(testing_x)
		if (output > 1): test_y = torch.LongTensor(testing_y)
		elif (output == 1): test_y = torch.FloatTensor(testing_y)
		else: test_y = testing_y.float()
		test = group(test_x, test_y)
		tester = loader(dataset = test, batch_size = batch, shuffle = True, num_workers = 2)
		duplicate = validate_percent
		validate_percent = (validation_percentage*len(characteristics) / len(train_x)) / 100

	if (validate_percent > 0):

		training_x, validation_x, training_y, validation_y = split(training_x, training_y, test_size = validate_percent, random_state = np.random.randint(1, 100), shuffle = True, stratify = training_y)
		train_x = torch.FloatTensor(training_x)
		if (output > 1): train_y = torch.LongTensor(training_y)
		elif (output == 1): train_y = torch.FloatTensor(training_y)
		else: train_y = training_y.float()
		train = group(train_x, train_y)
		trainer = loader(dataset = train, batch_size = batch, shuffle = True, num_workers = 2)
		validate_x = torch.FloatTensor(validation_x)
		if (output > 1): validate_y = torch.LongTensor(validation_y)
		elif (output == 1): validate_y = torch.FloatTensor(validation_y)
		else: validate_y = validation_y.float()
		validate = group(validate_x, validate_y)
		validater = loader(dataset = validate, batch_size = batch, shuffle = True, num_workers = 2)

	return trainer, tester, validater
Example #5
def kfold_validate(model, k, kwargs):
  """
  This function does something similar to k-fold validation. We train and test
  our model k times, by randomly splitting our entire data set into three parts
  (train, dev and test) and return the average of the K runs.
  Args:
      model (str): What kind of model to use. It can be either lstm or cnn
      k (int): Number of iterations over which to average
      kwargs (dict): The parameters that define the model
  
  Returns:
      dict: A dictionary of results containing the keys precision, recall and
        fscore.
  """
  p_1 = 0.0
  r_1 = 0.0
  f_1 = 0.0
  train_data = ATEDataProcessor(kwargs["train_file"], **kwargs)
  test_data = ATEDataProcessor(kwargs["test_file"],
                               pos_id=get_count(train_data.annotated_sentences),
                               **kwargs)
  sentences = train_data.annotated_sentences + test_data.annotated_sentences
  for i in range(k):
    print("Run number: {}".format(i))
    train_set, test_set = split(sentences, test_size=0.2, random_state=42)
    train_set, dev_set = split(train_set, test_size=kwargs["test_size"], 
                               random_state=42)
    train = DataIterator(train_set, **kwargs)
    dev = DataIterator(dev_set, **kwargs)
    test = DataIterator(test_set, **kwargs)
    if model == "lstm":
      model = LSTMNetwork(**kwargs)
    elif model == "cnn":
      model = CNNNetwork(max_sentence_length=train_data.max_sentence_len,
                         **kwargs)
    model.build()
    model.train(train, dev)
    results = model.evaluate(test)
    p_1 += float(results["p_1"])
    r_1 += float(results["r_1"])
    f_1 += float(results["f_1"])
    model.close_session()
  print("p_1: {}\nr_1: {}\nf_1: {}".format(p_1/k, r_1/k, f_1/k))
  return {
    "precision": p_1/k,
    "recall": r_1/k,
    "fscore": f_1/k
  }
Example #6
def train_save(epochs=30, jsonfile="model.json", weightfile="model.h5"):
    X, y, classes = load_cifar10()
    nclasses = len(classes)
    # layers of my model
    layers = [("Conv2D", {"filters":32, "kernel_size":(3,3), "strides":(1,1), "activation":"relu", "input_shape":X.shape[1:], "padding":"same"}),
              ("BatchNormalization", {}),
              ("Conv2D", {"filters":32, "kernel_size":(3,3), "strides":(1,1), "activation":"relu", "padding":"same"}),
              ("MaxPooling2D", {"pool_size":(2,2), "padding":"same"}),
              ("BatchNormalization", {}),
              ("Dropout", {"rate":0.5}),
              ("Conv2D", {"filters":64, "kernel_size":(3,3), "strides":(1,1), "activation":"relu", "padding":"same"}),
              ("BatchNormalization", {}),
              ("Conv2D", {"filters":64, "kernel_size":(3,3), "strides":(1,1), "activation":"relu", "padding":"same"}),
              ("BatchNormalization", {}),
              ("MaxPooling2D", {"pool_size":(2,2), "padding":"same"}),
              ("Dropout", {"rate":0.5}),
              ("Flatten", {}),
              ("Dense", {"units":64, "activation":"relu"}),
              ("Dropout", {"rate":0.5}),
              ("Dense", {"units":nclasses, "activation":"softmax"})]
    # train the model
    model = create_model(layers)
    Xtrain, Xtest, ytrain, ytest = split(X, y, test_size=0.2)
    model.fit(Xtrain, ytrain, batch_size=256, epochs=epochs, validation_data = (Xtest, ytest))
    # save the model
    model_json = model.to_json()
    with open(jsonfile, "w") as json_file:
        json_file.write(model_json)
    # save the weights (LARGE FILE)
    model.save_weights(weightfile)
    return model
Example #7
 def __init__(self,
              data,
              test_size=0.3,
              weight=False,
              negative=NEGATIVE,
              positive=POSITIVE,
              oversample=False,
              undersample=False):
     data_train, data_test = split(data,
                                   test_size=test_size,
                                   random_state=0)
     if oversample: data_train = Dataset.oversample(data_train, 'Income')
     if undersample: data_train = Dataset.undersample(data_train, 'Income')
     if weight:
         positive_size = (data_train['Income'] == positive).sum()
         negative_size = data_train.shape[0] - positive_size
         ratio = negative_size / positive_size
         data_train = Dataset.weight(data_train, 'Income', {
             negative: 1.0,
             positive: ratio
         })
         data_test = Dataset.weight(data_test, 'Income', {
             negative: 1.0,
             positive: 1.0
         })
     self.y_train = data_train.pop('Income')
     self.x_train = data_train
     self.y_test = data_test.pop('Income')
     self.x_test = data_test
Example #8
File: ml.py  Project: nadiiaaii/mystic
def traintest(x, y, test_size=None, random_state=None):
    """get train-test split of data from archive

    Inputs:
        x: 2D input data array of shape (pts, nx)
        y: 1D output data array of shape (pts,)
        test_size: float, fraction of data to use for test, in [0,1] (default: None, which returns full copies)
        random_state: int, seed for splitting data

    Returns:
        xtrain,xtest,ytrain,ytest: arrays of training and test data

    For example:
      >>> x = [[5.1, 3.5, 1.4, 0.2], [4.9, 3.0, 1.4, 0.2], [4.7, 3.2, 1.3, 0.2], [4.6, 3.1, 1.5, 0.2], [5.0, 3.6, 1.4, 0.2]]
      >>> y = [0, 1, 0, 0, 2]
      >>> 
      >>> xx,xt,yy,yt = traintest(x, y, test_size=.4)
      >>> len(xt)/len(x) == len(yt)/len(y) == .4
      True
      >>> len(xx)/len(x) == len(yy)/len(y) == 1-.4
      True
      >>> 
      >>> xx,xt,yy,yt = traintest(x, y)
      >>> len(yy) == len(yt) == len(y)
      True
      >>> len(xx) == len(xt) == len(x)
      True
    """
    # build train/test data
    xx = np.array(x)
    yy = np.array(y)
    if test_size is None:
        return xx, xx, yy, yy
    from sklearn.model_selection import train_test_split as split
    return split(xx, yy, test_size=test_size, random_state=random_state)
Example #9
def train_evaluate(ga_individual_solution):

    # Decode the Genetic Algorithm solution to get the window size and number of bits
    window_size_bits = BitArray(ga_individual_solution[0:6])
    num_units_bits = BitArray(ga_individual_solution[6:])
    window_size = window_size_bits.uint
    num_of_units = num_units_bits.uint
    print('\nWindow Size: ', window_size, ', Num of Units: ', num_of_units)

    # Return fitness score of 100 if window_size or num_unit is zero
    if window_size == 0 or num_of_units == 0:
        return 100,

    # Segment the train_data based on new window_size;
    # Split the dataset into train set(80) and validation set(20)
    X_data, Y_data = prepare_dataset(train_data, window_size)
    X_train, X_val, y_train, y_val = split(X_data,
                                           Y_data,
                                           test_size=0.20,
                                           random_state=1120)

    # Design an LSTM model to train on training data and predict on validation data
    input_ph = Input(shape=(window_size, 1))
    x = LSTM(num_of_units, input_shape=(window_size, 1))(input_ph)
    predicted_values = Dense(1, activation='tanh')(x)
    model = Model(inputs=input_ph, outputs=predicted_values)
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X_train, y_train, epochs=5, batch_size=20, shuffle=True)
    y_pred = model.predict(X_val)

    # Calculate the RMSE score as fitness score for GA
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    print('Validation RMSE: ', rmse, '\n')

    return rmse,
Example #10
File: train.py  Project: gpilania/roost
def main():

    dataset = CompositionData(data_path=args.data_path, fea_path=args.fea_path)
    orig_atom_fea_len = dataset.atom_fea_dim

    if args.test_path:
        print("using independent test set: {}".format(args.test_path))
        train_set = dataset
        test_set = CompositionData(data_path=args.test_path,
                                   fea_path=args.fea_path)
    else:
        print("using {} of training set as test set".format(args.test_size))
        indices = list(range(len(dataset)))
        train_idx, test_idx = split(indices,
                                    random_state=args.seed,
                                    test_size=args.test_size)

        train_set = torch.utils.data.Subset(dataset, train_idx[0::args.sample])
        test_set = torch.utils.data.Subset(dataset, test_idx)

    if not os.path.isdir("models/"):
        os.makedirs("models/")

    if not os.path.isdir("runs/"):
        os.makedirs("runs/")

    if not os.path.isdir("results/"):
        os.makedirs("results/")

    ensemble(args.data_id, train_set, test_set, args.ensemble,
             orig_atom_fea_len)
Example #11
    def load_data(self, val_set_size=0.20):

        (X, Y), (x_test, y_test) = mnist.load_data()

        # Splitting Data into train and validation sets
        # x_train, x_val, y_train, y_val = split(X, Y, test_size=val_set_size, shuffle=True)
        x_train, x_val, y_train, y_val = split(X,
                                               Y,
                                               test_size=val_set_size,
                                               random_state=self.seed,
                                               shuffle=True)

        self.y_train = y_train
        self.y_test = y_test
        self.y_val = y_val

        # Flattening Data
        x_train = tf.reshape(tf.Variable(x_train, dtype=tf.float32),
                             [len(x_train), 784])
        x_val = tf.reshape(tf.Variable(x_val, dtype=tf.float32),
                           [len(x_val), 784])
        x_test = tf.reshape(tf.Variable(x_test, dtype=tf.float32),
                            [len(x_test), 784])

        # Normalizing Data
        self.x_train = x_train / 255
        self.x_test = x_test / 255
        self.x_val = x_val / 255
Example #12
def main():
    #setup util functions
    DEBUG = False
    PRINT = True
    util = Util(DEBUG, PRINT)
    #easy access to print funcs
    uprt = util.outPrt
    uerr = util.errPrt

    #read in data from csv
    data = pd.read_csv("./btc.csv")

    #print columns
    uprt("Columns", data.columns.values.tolist())

    #pull out target column
    y = data["PriceUSD"]
    X = data.loc[:, data.columns != "PriceUSD"]

    #split dataset
    xtrain, xtest, ytrain, ytest = split(X, y)
    #print split data sets
    uprt("\nxtrain:", xtrain)
    uprt("\nytrain:", ytrain)
    uprt("\nxtest:", xtest)
    uprt("\nytest:", ytest)
Example #13
    def train_evaluate(self, ga_individual_solution):
        # Decode GA solution to integer for window_size and num_units
        window_size_bits = BitArray(ga_individual_solution[0:6])
        num_units_bits = BitArray(ga_individual_solution[6:])
        window_size = window_size_bits.uint
        num_units = num_units_bits.uint
        print('\nWindow Size: ', window_size, ', Num of Units: ', num_units)

        # Return fitness score of 100 if window_size or num_unit is zero
        if window_size*num_units == 0:
            return 100,

        # Segment the train_data based on new window_size; split into train and validation (80/20)
        X, Y = prepare_dataset(self.train_data, window_size)
        X_train, X_val, y_train, y_val = split(X, Y, test_size=0.20, random_state=1120)

        # Train LSTM model and predict on validation set
        inputs = Input(shape=(window_size, 1))
        x = LSTM(num_units, input_shape=(window_size, 1))(inputs)
        predictions = Dense(1, activation='linear')(x)
        model = Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer='adam', loss='mean_squared_error')
        model.fit(X_train, y_train, epochs=5, batch_size=10, shuffle=True)
        y_pred = model.predict(X_val)

        # Calculate the RMSE score as fitness score for GA
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        self.generations_rmse.append(rmse)
        print('Validation RMSE: ', rmse, '\n')
        return rmse,
Example #14
def extract_data(data_dir, train_csv, dev_csv, class_txt):
    class_dict = {}
    class_idx = 0
    audio_train, label_train, audio_dev, label_dev = [], [], [], []
    classes = os.listdir(data_dir)

    for idx in classes:
        class_dict[idx] = class_idx
        class_path = os.path.join(data_dir, idx)
        files = os.listdir(class_path)
        path = [os.path.abspath(os.path.join(class_path, fn)) for fn in files]
        X_train, X_dev, y_train, y_dev = split(path,
                                               [class_idx] * (len(files)),
                                               test_size=0.2)
        audio_train += X_train
        label_train += y_train
        audio_dev += X_dev
        label_dev += y_dev
        class_idx += 1

    extract_csv(audio_train, label_train, train_csv)
    extract_csv(audio_dev, label_dev, dev_csv)

    f = open(class_txt, 'w')
    for idx in class_dict:
        f.write(f'{idx}\t{class_dict[idx]}\n')
    f.close()
Example #15
def main():
    """
    """
    # data = credit.dataset_31_credit_g()
    data = wine.wine_quality_red_csv()
    print(data.shape)
    print(data.columns)

    target = "class"
    X, y = data[[col for col in data.columns if col != target]], data[target]
    X_train, X_test, y_train, y_test = split(X,
                                             y,
                                             test_size=0.2,
                                             random_state=0)

    # pipeline = CreditGPipeline()
    pipeline = WineQualityPipeline()
    classifier = RandomForest(size=40)
    model = pipeline.with_estimator(classifier).fit(X_train, y_train)

    prediction = model.predict(X_test)
    print(accuracy_score(y_test, prediction))

    suite = TestSuite()
    automated_suite = AutomatedTestSuite()
    data_profile = DataFrameProfiler().on(X_train)
    pipeline_profile = SklearnPipelineProfiler().on(model)

    suite.add(Test().is_complete(
        data_profile.for_column('volatile_acidity')).is_in_range(
            data_profile.for_column('alcohol')))

    warnings = suite.on(X_test)

    print("*** TEST_SUITE, X_TEST")
    if warnings and (len(warnings) != 0):
        print("======= WARNINGS =======")
        for warn in warnings:
            print(warn)

    error_generator = ExplicitMissingValues()
    corrupted_X_test = error_generator.run(X_test, ['volatile_acidity'])

    warnings = suite.on(corrupted_X_test)

    print("*** TEST_SUITE, CORRUPTED_X_TEST")
    if warnings and (len(warnings) != 0):
        print("======= WARNINGS =======")
        for warn in warnings:
            print(warn)

    tests, warnings = (automated_suite.with_profiles(
        data_profile, pipeline_profile).run(corrupted_X_test))

    print("*** AUTOMATED_TEST_SUITE, CORRUPTED_X_TEST")
    if warnings and (len(warnings) != 0):
        print("======= WARNINGS =======")
        for warn in warnings:
            print(warn)
Example #16
def transform_data(df: pd.DataFrame):
    train, test = split(df, test_size=0.35, random_state=45)
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(train['Review'])
    X_test = vectorizer.transform(test['Review'])
    Y_train = train['Class']
    Y_test = test['Class']
    return X_train, Y_train, X_test, Y_test
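A minimal usage sketch for the function above, with a made-up two-column frame (it assumes the pandas and scikit-learn imports the snippet relies on). Note that the vectorizer is fitted on the training reviews only, so the test rows are transformed with the training vocabulary:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split as split

df = pd.DataFrame({
    'Review': ["great product", "terrible service", "loved it",
               "would not buy", "works fine", "broke quickly"],
    'Class': [1, 0, 1, 0, 1, 0],
})
X_train, Y_train, X_test, Y_test = transform_data(df)
print(X_train.shape, X_test.shape)  # the 6 reviews are split roughly 65/35 into train and test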
Example #17
def train_evaluate(ga_individual_solution):
    # Decode GA solution to integer for window_size and num_units
    window_size = BitArray(ga_individual_solution[0:4]).uint
    num_units = BitArray(ga_individual_solution[4:8]).uint
    dropout_key = BitArray(ga_individual_solution[8:]).uint
    dropout_rate = dropout_dict[dropout_key]
    print('\nWindow Size: ', window_size, ', Num of Units: ', num_units,
          ', Dropout Rate: ', dropout_rate)
    print(ga_individual_solution[0:4], ga_individual_solution[4:8],
          ga_individual_solution[8:])
    print(BitArray(ga_individual_solution[0:4]))
    #print(list('{BitArray(ga_individual_solution[0:4])}'.format(6)))

    # Set minimum sizes for window and units to 1
    if window_size == 0:
        ga_individual_solution[0:4] = [0, 0, 0, 1]
        window_size = 1
    if num_units == 0:
        ga_individual_solution[4:8] = [0, 0, 0, 1]
        num_units = 1
    print('\nWindow Size: ', window_size, ', Num of Units: ', num_units,
          ', Dropout Rate: ', dropout_rate)
    print(ga_individual_solution[0:4], ga_individual_solution[4:8],
          ga_individual_solution[8:])

    # Segment the train_data based on new window_size; split into train and validation (80/20)
    X, Y = prepare_dataset(train_data, window_size)
    X_train, X_val, y_train, y_val = split(X,
                                           Y,
                                           test_size=0.20,
                                           random_state=1120)

    # Train LSTM model and predict on validation set
    model = Sequential()
    model.add(LSTM(num_units, input_shape=(window_size, 1)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    history = model.fit(X_train,
                        y_train,
                        epochs=4,
                        batch_size=10,
                        shuffle=True)
    y_pred = model.predict(X_val)

    plt.plot(history.history['loss'], label='train')

    # Calculate the RMSE score as fitness score for GA
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    adjusted_rmse = .1 * ((num_units / 16) +
                          ((np.square(num_units) * window_size) /
                           (np.power(16, 3)))) + rmse
    print('Validation RMSE: ', rmse, '\nValidation MSE: ', mse, 'adjusted: ',
          adjusted_rmse)
    print()
    return adjusted_rmse,
Example #18
def impData(sPath, sCat, fP):
    reviews = []

    for i, filename in enumerate(os.listdir(sPath)):
        reviews.insert(i, [open(sPath + filename).read(), sCat])

    #Create training & test sets
    lTrain, lTest = split(reviews, train_size=fP)

    return lTrain, lTest
Example #19
def modelar_dados(teste):
	### FUNCTION TO SHAPE THE DATA AND SPLIT IT INTO TRAINING AND TEST SETS ACCORDING TO THE TAM_TESTES PARAMETER DEFINED IN THE CONSTANTS ###

	dataset = pd.read_csv('leaf.csv', header = None)
	
	classes = dataset[dataset.columns[0]].values

	dataset.drop([0], axis = 1, inplace = True)
	
	ind_t, ind_te, cl_t, cl_te = split(dataset, classes, test_size = teste, shuffle = True)

	return ind_t, ind_te, cl_t, cl_te
Example #20
 def __init__(self, text, Y, train_size=.85):
     self.model_builders = {'dtc': dtc, 'rfc': rfc}
     steps = ['tfidf', 'feature_engineering', 'lda', 'model']
     self.pipeline_dic = {step: None for step in steps}
     self.text_train, self.text_test, self.Y_train, self.Y_test = split(
         text, Y, train_size=train_size, stratify=Y)
     self.keep_tfidf = lambda tfidf_dic: (tfidf_dic == self.pipeline_dic[
         'tfidf'])
     self.keep_features = lambda features_dic: (features_dic == self.
                                                pipeline_dic['features'])
     self.prob_info = lambda prob: -prob * np.log(prob)
     self.pipeline_dic = {step: "Default" for step in steps}
     self.train_size = train_size
Example #21
def split_dataset(data, target_feature, hp):
    train, val = hp['train_ratio'], hp['val_ratio']
    test, target = hp['test_ratio'], hp['target_ratio']
    denominator = np.sum([train, val, test, target])
    train /= denominator
    val /= denominator
    test /= denominator
    target /= denominator
    random_state = hp['random_state']
    X = data[[col for col in data.columns if col != target_feature]]
    y = data[target_feature]
    X_rest, X_train, y_rest, y_train = split(X,
                                             y,
                                             test_size=train,
                                             random_state=random_state)
    X_rest, X_target, y_rest, y_target = split(X_rest,
                                               y_rest,
                                               test_size=target / (1. - train),
                                               random_state=random_state)
    X_val, X_test, y_val, y_test = split(X_rest,
                                         y_rest,
                                         test_size=test / (val + test),
                                         random_state=random_state)
    return X_train, y_train, X_val, y_val, X_test, y_test, X_target, y_target
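Each call above re-expresses its ratio relative to whatever remains after the previous split, so the final partitions match the normalised hp ratios. A worked check with hypothetical ratios train/val/test/target = 0.6/0.1/0.2/0.1 (already summing to 1):

train, val, test, target = 0.6, 0.1, 0.2, 0.1
first = train                    # 1st split: X_train gets 60% of the data, 40% remains
second = target / (1.0 - train)  # 2nd split: 0.25 of the remaining 40% -> X_target = 10%, 30% remains
third = test / (val + test)      # 3rd split: 2/3 of the remaining 30% -> X_test = 20%, X_val = 10%
print(first, second, round(third, 3))   # 0.6 0.25 0.667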
Example #22
 def setUp(self):
     self.resource_folder = get_resource_path()
     self.pipeline = CreditGPipeline()
     # data = credit.dataset_31_credit_g()
     data = pd.read_csv(os.path.join(self.resource_folder, 'data',
                                     'credit-g/dataset_31_credit-g.csv'))
     target = 'class'
     # I guess it will work only if the target value is the last one.
     self.features = [col for col in data.columns if col != target]
     X = data[self.features]
     y = data[target]
     sets = split(X, y, test_size=0.2, random_state=0)
     self.X_train, self.X_test, self.y_train, self.y_test = sets
     self.data_profile = DataFrameProfiler().on(self.X_train)
     self.automated_suite = AutomatedTestSuite()
Example #23
def train_roost(args,
                model_name,
                csv_train,
                csv_val=None,
                val_frac=0.0,
                resume=False,
                transfer=None,
                fine_tune=None):

    args.data_path = f'data/datasets/{csv_train}'
    args.val_size = val_frac

    dataset = CompositionData(data_path=args.data_path, fea_path=args.fea_path)
    orig_atom_fea_len = dataset.atom_fea_dim
    args.fea_len = orig_atom_fea_len

    if resume:
        args.resume = resume
    else:
        if transfer is not None:
            args.transfer = transfer
        elif fine_tune is not None:
            args.fine_tune = fine_tune

    if csv_val is None:
        indices = list(range(len(dataset)))
        train_idx, val_idx = split(indices,
                                   random_state=args.seed,
                                   test_size=args.val_size)
        train_set = torch.utils.data.Subset(dataset, train_idx[0::args.sample])
        val_set = torch.utils.data.Subset(dataset, val_idx)
    else:
        train_set = dataset
        val_set = CompositionData(data_path=f'data/datasets/{csv_val}',
                                  fea_path=args.fea_path)

    if not os.path.isdir("models/"):
        os.makedirs("models/")

    if not os.path.isdir("runs/"):
        os.makedirs("runs/")

    if not os.path.isdir("results/"):
        os.makedirs("results/")

    ensemble(model_name, args.fold_id, train_set, val_set, args.ensemble,
             orig_atom_fea_len, args)
Example #24
def train_model(best_window_size, best_num_units):
    # Decode GA solution to integer for window_size and num_units

    train_data = DATA
    split_point = SPLIT_POINT

    window_size = best_window_size
    num_units = best_num_units

    #     window_size =  200
    #     num_units = 150

    print('\nWindow Size: ', window_size, ', Num of Units: ', num_units)

    # Return fitness score of 100 if window_size or num_unit is zero
    if window_size == 0 or num_units == 0:
        return 0,

    print("DEBUG: The data size is : ", len(train_data[0]))
    mp_prep = MultiProcessPreprocessor(train_data, window_size)
    X,Y = mp_prep.mp_preprocessor()
    del mp_prep
    # X,Y = prepare_dataset(train_data,window_size)
    print("DEBUG: The X size is : ", len(X))

    X_train, X_val, y_train, y_val = split(X, Y, test_size = 0.20, random_state = 1120)

    inputs = Input(shape=(window_size, 3))
    x = LSTM(num_units, input_shape=(window_size, 3))(inputs)
    ##
    x = Dense(200, activation='relu')(x)
    x = Dense(100, activation='relu')(x)
    x = Dense(50, activation='relu')(x)
    ##
    predictions = Dense(6, activation='softmax')(x)
    opt = optimizers.SGD(lr=0.01, momentum=0.9)
    model = Model(inputs=inputs, outputs=predictions)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=10, batch_size=20, shuffle=True)

    _, train_acc = model.evaluate(X_train, y_train, verbose=0)
    _, test_acc = model.evaluate(X_val, y_val, verbose=0)
    print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))

    return model, test_acc
Example #25
def train_test_split(data, test_ratio=0.5, n_splits=10, best_split=True):
    """ 
    This function splits the data into two using stratified sampling in ratios as determined by test_ratio.
    The strata is constructed out of creating quartile splits on the target variable, i.e., sales_volume.
    If the best_split is True, The split which yields the minimum differen in the means of target is 
    returned. Else, the last split is returned as it is. 
    Note the number of splits are determined by n_splits"""

    # Object for stratified sampling
    split_obj = split(n_splits=n_splits,
                      test_size=test_ratio,
                      random_state=180)
    # Discretizing the target volume to guide stratified sampling
    data['categories'] = pd.qcut(data['sales_volume'],
                                 4,
                                 labels=["low", "low mid", 'high mid', "high"])

    # The best split is the one that yields the least difference in mean sales_volume between folds.
    least_diff_in_means = None
    # Use a separate name for the chosen indices so the best_split flag is not overwritten.
    best_indices = None, None
    # Looping over each split
    for idx_train, idx_test in split_obj.split(data, data['categories']):
        train = data.iloc[idx_train]
        test = data.iloc[idx_test]

        diff_in_means = abs(train.sales_volume.mean() -
                            test.sales_volume.mean())
        # Update the chosen split if best_split=True and either this is the
        # first split or it improves on the best one so far.
        if best_split and ((least_diff_in_means is None) or
                           (least_diff_in_means > diff_in_means)):
            least_diff_in_means = diff_in_means
            best_indices = idx_train, idx_test

    if best_indices[0] is None:
        best_indices = idx_train, idx_test

    del data['categories']
    idx_train, idx_test = best_indices

    train = data.iloc[idx_train]
    test = data.iloc[idx_test]

    return train, test
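Unlike the other snippets, `split` here is not `train_test_split` but a splitter class constructed with `n_splits`, `test_size` and `random_state` and iterated via `.split(X, y)`. A minimal sketch assuming it aliases scikit-learn's `StratifiedShuffleSplit` (the data frame and values are made up for illustration):

import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit as split

data = pd.DataFrame({"sales_volume": range(40), "feature": range(40)})
data['categories'] = pd.qcut(data['sales_volume'], 4,
                             labels=["low", "low mid", 'high mid', "high"])
split_obj = split(n_splits=3, test_size=0.5, random_state=180)
for idx_train, idx_test in split_obj.split(data, data['categories']):
    print(len(idx_train), len(idx_test))  # 20 20, with each quartile split evenly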
Example #26
def data_split(x, y, i):
    # split x and each y[j]; test_size = i (default 0.25), the remaining data is used for training
    # random_state fixed at 123
    j = 0
    x_train = []
    x_test = []
    y_train = []
    y_test = []

    while j < 3:
        x_tr, x_te, y_tr, y_te = split(x, y[j], test_size=i, random_state=123)
        j += 1

        x_train.append(x_tr)
        x_test.append(x_te)
        y_train.append(y_tr)
        y_test.append(y_te)

    return x_train, x_test, y_train, y_test
Example #27
    def pred_score(X, y):
        # split the data into train and test, and train iem
        X_train, X_test, y_train, y_test = split(X, y, test_size=0.5)
        rgs.fit(X_train, y_train)

        # predict the response in test cases:
        pred_iem = rgs.predict(X_test)
        err_iem = [
            circ_dist(pred_iem[i_], y_test[i_]) for i_ in range(len(pred_iem))
        ]

        # score the test cases:
        u = np.sum([err_iem[i_]**2 for i_ in range(len(err_iem))])
        v = 90**2
        score_rgs = 1 - u / v

        # compute reconstruction:
        recon = rgs._predict_direction_responses(X_test)

        # score classifier and get average reconstructions per category:
        cat_predict = []
        for i_tr in range(0, len(pred_iem)):
            #             targ_dist = np.abs(pred_iem[i_tr] - targ_directions)
            targ_dist = [
                np.abs(circ_dist(pred_iem[i_tr], targ_directions[i_targ]))
                for i_targ in range(len(targ_directions))
            ]
            targ_match = np.argmin(targ_dist)
            cat_predict.append(targ_directions[targ_match])

        cls_hits = [cat_predict[i_] == y_test[i_] for i_ in range(len(y_test))]
        score_cls = np.sum(cls_hits) / len(cls_hits)

        recon_cat = np.empty((3, 180))
        for i_cat in range(0, len(targ_directions)):
            d_cat = [
                recon[:, x] for x in range(0, len(cat_predict))
                if cat_predict[x] == targ_directions[i_cat]
            ]
            recon_cat[i_cat, :] = np.mean(d_cat, 0)

        return (score_rgs, score_cls, recon_cat)
Example #28
def train():
    columns = [f'Unnamed: {i}' for i in range(1,6)]
    for index,column in enumerate(columns):
        ''' Get Dataset '''
        features, labels = ImagePreprocessor.createDataset(column, grayscale = False, normalize = True)
        features = np.reshape(features, (len(features), 128, 128, 3))
        ''' onehot for multiclass '''
        if index == 1: labels = onehot(labels)
        else: labels = [0 if label == '-1' else int(label) for label in labels]
        ''' split train and test sets '''
        features, testX, labels, testY = split(features, labels, test_size = 0.2)
        model = createModel(multiclass = (column == 'Unnamed: 1'))
        ''' Stop after 2 epochs if val loss doesn't decrease, reduce LR when val loss stops decreasing, save model on epoch end '''
        callbacks = [EarlyStopping(monitor = 'val_loss', restore_best_weights= True, patience = 2), ReduceLROnPlateau(monitor = 'val_loss', factor = 0.2, patience = 1)]
        callbacks.append(ModelCheckpoint(f'models/Task{index+1}.h5'))
        ''' loss function different for multi and binary '''
        loss = 'categorical_crossentropy' if index == 1 else 'binary_crossentropy'
        ''' adam optimizer, validate on the split, class weighting for underrepresented classes '''
        model.compile(optimizer = 'adam', loss = loss, metrics = ['accuracy'])
        history = model.fit(features, labels, batch_size = 32, epochs = 20, validation_data = (testX, testY), callbacks = callbacks, class_weight = 'auto')
Example #29
    def __prepare_train_data(self, X, y, sample_weight):
        self.__text_field = Field(lower=True)
        self.__label_field = Field(sequential=False)
        self.__text_field.tokenize = self.__tokenize
        sample_weight = None if sample_weight is None else list(sample_weight)
        sw = [1 for yi in y] if sample_weight is None else sample_weight
        s = y if Counter(y).most_common()[-1][1] > 1 else None
        X_t, X_d, y_t, y_d, w_t, _ = split(X,
                                           y,
                                           sw,
                                           shuffle=True,
                                           stratify=s,
                                           random_state=self.random_state,
                                           train_size=self.split_ratio)
        fields = [("text", self.__text_field), ("label", self.__label_field)]
        examples = [[X_t[i], y_t[i]] for i in range(len(X_t))]
        examples = [Example.fromlist(example, fields) for example in examples]
        weights = compute_sample_weight(self.class_weight, y_t)
        weights = [weights[i] * w_t[i] for i in range(len(y_t))]
        min_weight = min(weights)
        weights = [int(round(weight / min_weight)) for weight in weights]

        for i in range(len(X_t)):
            Xi = [X_t[i] for j in range(weights[i] - 1)]
            examples += [Example.fromlist([x, y_t[i]], fields) for x in Xi]

        train_data = Dataset(examples, fields)
        dev_data = [[X_d[i], y_d[i]] for i in range(len(X_d))]
        dev_data = [Example.fromlist(example, fields) for example in dev_data]
        dev_data = Dataset(dev_data, fields)

        self.__text_field.build_vocab(train_data,
                                      dev_data,
                                      vectors=self.vectors)
        self.__label_field.build_vocab(train_data, dev_data)

        batch_sizes = (self.batch_size, len(dev_data))
        return Iterator.splits((train_data, dev_data),
                               batch_sizes=batch_sizes,
                               sort_key=lambda ex: len(ex.text),
                               repeat=False)
Example #30
def decision_tree_main(cost_mode, max_depth):
    iris = load_iris()
    data = iris.get("data")
    target = iris.get("target")
    train_d, test_d, train_t, test_t = split(data,
                                             target,
                                             test_size=0.3,
                                             random_state=0)

    d_tree = DecisionTree(max_depth=int(max_depth),
                          target_num=len(set(target)))
    d_tree.fit(train_d, train_t, cost_mode)
    pred_list = d_tree.predict(test_d)
    # accuracy calculation
    accuracy = 0
    for pred, answer in zip(pred_list, test_t):
        if pred == answer:
            accuracy += 1
    accuracy = float(accuracy / len(pred_list))
    print("max depth : ", max_depth, "accuracy : ", accuracy)
    return accuracy