示例#1
0
def eval_input_fn(params):
    """Input pipeline for evaluation: the WN18 validation split, unmodified."""
    # Load the WN18 knowledge-graph dataset with reversed triples included.
    source = Data(dataset='WN18', reverse=True)
    # Hand the validation inputs/targets back without batching or shuffling.
    return source.get_inputs_and_targets()
示例#2
0
class kddNShot:
    """Thin wrapper around a Data generator for batched train/test sampling."""

    def __init__(self, path, batch_size):
        # All data handling is delegated to the Data generator.
        self.data_generator = Data(path, batch_size)
        # Normalized adjacency matrix, computed once at construction.
        self.norm_adj = self.data_generator.get_adj_mat()

    def next(self):
        """Return the next (training) sample batch from the generator."""
        return self.data_generator.sample()

    def get_test(self):
        """Return a sample batch drawn from the test split."""
        return self.data_generator.sample('test')
示例#3
0
def predict_input_fn(params):
    """Input pipeline for prediction: first 10 WN18 test examples, batched."""
    size = params["batch_size"]

    # WN18 with reversed triples; pull the evaluation inputs/targets.
    source = Data(dataset='WN18', reverse=True)
    examples = source.get_inputs_and_targets()

    # Only the leading 10 samples are used for making predictions.
    return examples.take(10).batch(size)
示例#4
0
def train_input_fn(params):
    """train_input_fn defines the input pipeline used for training."""
    # Load WN18 (with reversed triples) and pull the training split.
    source = Data(dataset='WN18', reverse=True)
    examples = source.get_inputs_and_targets(training=True)

    # Shuffle with a 1000-element buffer and repeat indefinitely so the
    # trainer controls the number of steps.
    return examples.shuffle(buffer_size=1000).repeat()
示例#5
0
def make_data(dataset_name):
    """Build train/test splits for the given dataset.

    Args:
        dataset_name: name of the dataset; only 'sst' is supported.

    Returns:
        Tuple (x_train, y_train, x_train_raw, x_test, y_test, x_test_raw).

    Raises:
        ValueError: if dataset_name is not supported. (The original code
            fell through to the return and crashed with UnboundLocalError.)
    """
    if dataset_name != 'sst':
        raise ValueError('Unsupported dataset: {!r}'.format(dataset_name))

    dataset = Data(dataset_name)
    texts, labels = load_sst_train()

    # Lowercase the raw texts before converting tokens to integer ids.
    x_train = dataset.lt_to_int([t.lower() for t in texts])
    y_train = labels
    x_train_raw = texts

    # The Data object's validation split serves as the test set.
    x_test = dataset.x_val
    y_test = dataset.y_val
    x_test_raw = dataset.x_val_raw

    return x_train, y_train, x_train_raw, x_test, y_test, x_test_raw
示例#6
0
    def init(self):
        """Read the GUI controls, split the data, and build the MLP model.

        Reports input errors through the text browser instead of raising;
        on success stores the model in self.MLP and sets self.mode.
        """
        # Dataset / preprocessing choices come straight from the combo boxes.
        dataset = self.comboBox.currentText()
        data_process = self.comboBox_2.currentText()
        test_size = float(self.doubleSpinBox.value())
        data, label = Data().load(dataset, data_process)
        if data is None:
            # Data.load signals failure as (None, message).
            self.textBrowser.setText("[Error]: input error" + str(label))
            return
        self.train_data, self.test_data, self.train_label, self.test_label = train_test_split(
            data, label, test_size=test_size)
        n_inputs = len(data[0])
        # Distinct class labels determine the size of the output layer.
        self.label = list(set(label))
        self.n_outputs = len(self.label)
        n_hidden_layer = int(self.spinBox.value())
        hidden_layer = self.lineEdit.text().split()
        if len(hidden_layer) == 0:
            self.textBrowser.setText("[Error]: input error")
            return
        try:
            # Keep at most n_hidden_layer sizes; each token must parse as int.
            hidden_layer = list(map(int, hidden_layer[:n_hidden_layer]))
        except Exception as e:
            self.textBrowser.setText("[Error]: input error, " + str(e))
            return

        self.MLP = MLP_Model(n_inputs, hidden_layer, self.label)
        # Fixed typo in user-facing message: "sucesss" -> "success".
        self.textBrowser.setText("[Success]: init success")
        self.mode = 'init'
示例#7
0
 def evaluate(self, data, xs, selected):
     """Accuracy of the original text model on the transformed inputs."""
     # Rewrite the inputs using the currently selected feature mask.
     transformed = self.predict(xs, selected)
     reference = Data(data, False)
     base_model = TextModel(data)
     base_preds = base_model.predict(transformed)
     # Fraction of argmax predictions that agree with the stored
     # original-model predictions on the validation set.
     agreement = np.argmax(base_preds, axis=-1) == np.argmax(
         reference.pred_val, axis=-1)
     return np.mean(agreement)
示例#8
0
def L2X(args):
    """Run the L2X (Learning to Explain) Gumbel feature selector.

    Builds (or trains) a Gumbel selector for args.data and, when not in
    training mode, returns per-feature importance scores for the
    validation set together with the elapsed prediction time.
    """
    from build_gumbel_selector import Gumbel_Selection, Gumbel_Selection_Char

    # The character-CNN model ('agccnn') needs the character-level selector.
    if args.data == 'agccnn':
        gumbel_selector = Gumbel_Selection_Char(args.num_feats, args.data,
                                                args.train, args.original,
                                                args.mask)

    else:
        gumbel_selector = Gumbel_Selection(
            args.num_feats,
            args.data,
            args.train,
            args.original,
            args.mask,
        )
    if args.train:
        # Training mode: nothing to score yet.
        return None, None
    else:
        if args.train_score:
            # Additionally dump scores for the train and val splits of the
            # training data to '<data>/results/'.
            dataset = Data(args.data, True)
            scores_val = gumbel_selector.predict(dataset.x_val)
            np.save(
                '{}/results/scores-val-{}-{}-original{}-mask{}.npy'.format(
                    args.data, args.method, args.num_feats, args.original,
                    args.mask), scores_val)
            scores_train = gumbel_selector.predict(dataset.x_train)
            np.save(
                '{}/results/scores-train-{}-{}-original{}-mask{}.npy'.format(
                    args.data, args.method, args.num_feats, args.original,
                    args.mask), scores_train)

        # Score the held-out validation set and report wall-clock time.
        dataset = Data(args.data, False)
        st = time.time()
        scores = gumbel_selector.predict(dataset.x_val)
        print('Time spent is {}'.format(time.time() - st))
        return scores, [time.time() - st]
    def prepare_data(self, data_fields, wv_size=600):
        """Load, clean and word2vec-vectorize the test tweets.

        Stores the resulting vectors and dataframes on self for later use.
        wv_size is the word2vec embedding dimensionality.
        NOTE: Python 2 code (print statement at the end).
        """
        test_data = Data(self.file_name, self.file_path)
        test_df = test_data.csv_df(data_fields)
        # make a copy of the original tweets for later use
        original_df = test_df.copy()

        # pre-process data(same as how we trained)
        test_data.pre_process(test_df)

        # then convert using word2vec
        model = test_data.build_wordvec(size=wv_size, verbose=False)
        # take a look of the max_len of testing. although we still have to use max_len from train
        # NOTE(review): max_len_test is computed but never used afterwards.
        max_len_test = test_data.max_len(test_df)
        data = test_data.convert2vec(test_df,
                                     self.max_len_train,
                                     model,
                                     name='test_' + self.file_name)
        test_data.save_vec(data, name='test_' + self.file_name)

        # Expose the prepared artifacts for downstream prediction code.
        self.data = data
        self.test_data = test_data
        self.test_df = test_df
        self.original_df = original_df
        print ">>>Done preparing data.<<<\n"
def gumbel(args):
    """Apply a trained Gumbel transformer to produce perturbed inputs.

    For k = 1..num_feats, keeps the k highest-scoring features of each
    validation example and lets the transformer rewrite the input under
    that mask. Returns (changed_xs, [elapsed_seconds]) in inference mode,
    or (None, None) when args.train is set.
    """
    from build_gumbel_transformer import Gumbel_Transform, Gumbel_Transform_Char
    # Character-level model gets its dedicated transformer class.
    if args.data == 'agccnn':
        gumbel_transform = Gumbel_Transform_Char(args.data, args.num_feats,
                                                 args.method, args.train,
                                                 args.original, args.mask)
    else:
        gumbel_transform = Gumbel_Transform(args.data, args.num_feats,
                                            args.max_words, args.method,
                                            args.train, args.original,
                                            args.mask)
    if not args.train:
        dataset = Data(args.data)
        # Load precomputed importance scores produced by the chosen method.
        if args.method == 'L2X':

            scores = np.load(
                '{}/results/scores-{}-{}-original{}-mask{}.npy'.format(
                    args.data, args.method, args.num_feats, args.original,
                    args.mask))

        elif args.method == 'leave_one_out':
            scores = np.load('{}/results/scores-{}.npy'.format(
                args.data, args.method))

        changed_xs = []
        st = time.time()

        # NOTE: Python 2 `xrange`; this file mixes py2/py3 idioms.
        for k in xrange(1, args.num_feats + 1):
            selected_index = np.argsort(
                scores, axis=-1)[:, -k:]  # indices of largest k score.
            # Binary mask with 1.0 at the top-k positions of each row.
            selected = np.zeros(scores.shape)
            selected[np.expand_dims(np.arange(len(scores)), axis=-1),
                     selected_index] = 1.0
            changed_x = gumbel_transform.predict(dataset.x_val, selected)

            changed_xs.append(changed_x)
        changed_xs = np.array(changed_xs)
        # Reorder from [num_feats, num_examples, ...] to
        # [num_examples, num_feats, ...].
        changed_xs = np.swapaxes(changed_xs, 0, 1)
        return changed_xs, [time.time() - st]
    return None, None
示例#11
0
文件: bow.py 项目: kae-mihara/LS-Tree
def train_bow(dataset_name):
    """Train a bag-of-words + logistic-regression baseline and pickle it.

    Fits a CountVectorizer (top 20k features) on the raw training texts,
    trains a multinomial logistic regression, reports train/test accuracy,
    and saves both the vectorizer and the classifier under
    '<dataset_name>bow/'.
    """
    data_model = dataset_name + 'bow'

    dataset = Data(dataset_name)
    # Labels are stored one-hot; argmax recovers the class indices.
    x_train, y_train = dataset.x_train_raw, np.argmax(dataset.y_train, axis=1)
    x_test, y_test = dataset.x_val_raw, np.argmax(dataset.y_val, axis=1)

    print('Fitting transform...')
    vectorizer = CountVectorizer(max_features=20000)
    x_train_bow = vectorizer.fit_transform(x_train)

    x_test_bow = vectorizer.transform(x_test)

    print('Fitting logistic regression...')
    clf = LogisticRegression(random_state=0,
                             solver='lbfgs',
                             multi_class='multinomial')
    clf.fit(x_train_bow, y_train)

    print('Making prediction...')
    # (Removed unused `pred_test = clf.predict_proba(...)`; `score` already
    # predicts internally.)
    acc_train = clf.score(x_train_bow, y_train)
    acc_test = clf.score(x_test_bow, y_test)
    print('The training accuracy is {}; the test accuracy is {}.'.format(
        acc_train, acc_test))

    print('Save model to pickle...')

    # makedirs with exist_ok avoids the listdir-then-mkdir race of the
    # original and is correct regardless of the current working directory.
    os.makedirs(data_model, exist_ok=True)

    with open('{}/vectorizer.pkl'.format(data_model), 'wb') as f:
        pkl.dump(vectorizer, f)

    with open('{}/clf.pkl'.format(data_model), 'wb') as f:
        pkl.dump(clf, f)
示例#12
0
def create_original_predictions(args):
    """Save the original model's prediction probabilities for val/train.

    Writes pred_val.npy / pred_train.npy under '<args.data>/data', prints
    the resulting accuracies, and (for word-level models) also dumps the
    embedding matrix.
    """
    dataset = Data(args.data, True)
    model = TextModel(args.data, False)
    pred_val = model.predict(dataset.x_val, verbose=True)
    pred_train = model.predict(dataset.x_train, verbose=True)

    # makedirs with exist_ok replaces the racy listdir-then-mkdir check.
    os.makedirs('{}/data'.format(args.data), exist_ok=True)

    np.save('{}/data/pred_val.npy'.format(args.data), pred_val)
    np.save('{}/data/pred_train.npy'.format(args.data), pred_train)

    # Accuracy = fraction of argmax predictions matching the one-hot labels.
    acc_val = np.mean(
        np.argmax(pred_val, axis=1) == np.argmax(dataset.y_val, axis=1))
    acc_train = np.mean(
        np.argmax(pred_train, axis=1) == np.argmax(dataset.y_train, axis=1))
    print('The validation accuracy is {}.'.format(acc_val))
    print('The training accuracy is {}.'.format(acc_train))

    if args.data != 'agccnn':
        # The character model has no word-embedding matrix to export.
        np.save('{}/data/embedding_matrix.npy'.format(args.data),
                model.emb_weights)
示例#13
0
        yhat = model.predict(X_test, batch_size=minibatch_size, verbose=1)
        yhat = [1 if x > 0.5 else -1 for x in yhat]
        print('Test accuracy: ' + str(accuracy_score(Y_test, yhat)))


#  MAIN
# .
# .
# .
# .
# .
# .
# .
# .

# Load the engineered feature set.
# NOTE(review): newsTimeToMarket=0 presumably means news is assumed to affect
# prices immediately — confirm against Data.load_data.
Data.load_data(momentum_window=30, X_window_average=30, newsTimeToMarket=0)

(X_train, Y_train), (X_test, Y_test) = Data.get_train_test_set()

# Convert the splits to float32 tensors for TensorFlow.
test_x = tf.convert_to_tensor(X_test, dtype=tf.float32)
train_x = tf.convert_to_tensor(X_train, dtype=tf.float32)

train_y = tf.convert_to_tensor(Y_train, dtype=tf.float32)
test_y = tf.convert_to_tensor(Y_test, dtype=tf.float32)

# Report dataset dimensions for a quick sanity check.
print('.........................')
print("number of training examples = " + str(train_x.shape[0]))
print("number of test examples = " + str(test_x.shape[0]))
print("X_train shape: " + str(train_x.shape))
print("Y_train shape: " + str(train_y.shape))
print("X_test shape: " + str(test_x.shape))
示例#14
0
from load_data import Data
from model import Model
from train import Train


if __name__ == "__main__":

    # Full data pipeline: load -> augment -> split -> print summary.
    data = Data()
    data.load()
    data.data_augment()
    data.data_splitting()
    data.print()
    dataset = data.get_dataset()
    testset = data.get_testset()
    
    # Input shape comes from the first dataset tensor; the second argument
    # is presumably the number of output classes (50) — confirm in Model.
    models = Model(dataset[0].shape[1:], 50)
    m = models.ResNet()
    m.summary()

    # train = Train(m, dataset, testset, 50, 32, 'adam', 'sparse_categorical_crossentropy')
    train = Train(m, dataset, testset, 100, 200, 'adam', 'categorical_crossentropy')
    train.training()
    train.evaluate()

    


# tensorboard --logdir logs/scalars --port=7000
示例#15
0
    # Propagate CLI options onto the shared config object.
    config.pdata = args.pdata
    config.data_opt = args.data_opt
    print(args.bk)

    dataset = args.dataset
    data_dir = "data/%s/" % dataset
    torch.backends.cudnn.deterministic = True

    # For reproducibility
    seed = 20
    np.random.seed(seed)
    torch.manual_seed(seed)
    # NOTE(review): `torch.cuda.is_available` is missing its call parentheses,
    # so this tests the function object (always truthy) rather than calling
    # it. Harmless in practice (cuda.manual_seed_all is a no-op without CUDA)
    # but almost certainly unintended — should be `torch.cuda.is_available()`.
    if torch.cuda.is_available:
        torch.cuda.manual_seed_all(seed)
    d = Data(data_dir=data_dir,
             reverse=True,
             subset_percentage=args.pdata,
             data_opt=args.data_opt)
    experiment = Experiment(num_iterations=args.num_iterations,
                            batch_size=args.batch_size,
                            learning_rate=args.lr,
                            decay_rate=args.dr,
                            ent_vec_dim=args.edim,
                            rel_vec_dim=args.rdim,
                            cuda=args.cuda,
                            input_dropout=args.input_dropout,
                            hidden_dropout1=args.hidden_dropout1,
                            hidden_dropout2=args.hidden_dropout2,
                            label_smoothing=args.label_smoothing,
                            bk=args.bk)
    # Background-knowledge runs get a distinct checkpoint file name.
    path = 'model_state.pts'
    if args.bk: path = 'model_state_sym.pts'
示例#16
0
def main(config: argparse.Namespace) -> None:
    """Run the experiment suite described by config.config_file.

    Loads the Yelp/Geneea data, optionally dumps per-feature mutual
    information, then repeatedly trains and evaluates every configured
    classifier over cross-validation chunks (optionally tracing learning
    curves), and finally plots the aggregated statistics.
    """
    with open(config.config_file, 'r') as cfg:
        # NOTE(review): yaml.load without an explicit Loader is deprecated
        # and unsafe on untrusted input; prefer yaml.safe_load here.
        experiments: dict = yaml.load(cfg)

    print('loading data')
    data = Data(config.yelp_file, config.geneea_file)
    data.print(f'Processing file {config.config_file}')

    print('generating samples')
    datasize: int = data.generate_sample(experiments['config']['chunks'],
                                         LikeTypeEnum.USEFUL)

    stats: DataGraph = DataGraph('', 'number of instances', 'percentage')

    # texts_tokenized = (self._tokenize(row.text) for index, row
    #                    in self.data.iterrows())
    # words_freqs = nltk.FreqDist(w.lower() for tokens in texts_tokenized
    #                             for w in tokens)
    #
    # # TODO statistics
    # # for x in all_words:
    # # print(all_words[x])
    #
    # # self.print('total number of words:', sum(all_words.values()))
    # # self.print('unique words:', len(all_words))
    # # self.print('words present only once:',
    # # sum(c for c in all_words.values() if c == 1))
    # # all_words.plot(30)
    #
    # # only the right frequencies
    # self.gram_words = words_freqs.copy()
    # for w, count in words_freqs.items():
    #     if count > 200 or count == 20:
    #         # TODO Measure
    #         del self.gram_words[w]
    #
    # self.gram_words = frozenset(self.gram_words.keys())

    # calculate mutual information of all features if wanted
    # and dump it into text files
    if experiments['config']['mi']:
        for x in FeatureSetEnum:
            # N-gram feature sets are skipped (too many features for MI).
            if x == FeatureSetEnum.BIGRAMS or \
                    x == FeatureSetEnum.TRIGRAMS or \
                    x == FeatureSetEnum.FOURGRAMS:
                continue
            if x == FeatureSetEnum.UNIGRAMS:  # TODO REMOVE
                continue
            # get data
            data.set_statfile(f'mi_{x}')
            data.print(f'Mutual Information of {x}.')
            train = data.get_feature_dict(SampleTypeEnum.TRAIN, {x})
            test = data.get_feature_dict(SampleTypeEnum.TEST, {x})
            instances = train + test

            # get matrix
            matrix_convertor = featurematrixconversion.Preprocessor({})
            vector_instances = matrix_convertor.process(
                instances, SampleTypeEnum.TRAIN)

            # calculate mutual info
            matrix_gen, labels_gen = zip(*vector_instances)
            matrix = sparse.vstack(matrix_gen)
            labels = list(labels_gen)
            mi = mutual_info_classif(matrix, labels)

            # dump data
            for f_name, f_mi in zip(matrix_convertor.all_fs, mi):
                data.print(f'{f_name}	{f_mi}')

        data.set_statfile(f'statistics')

    first_run: bool = True

    # One iteration per cross-validation dataset; prepare_next_dataset()
    # at the bottom advances to the next chunk until exhausted.
    while True:
        train_size: int \
            = int(datasize - datasize / experiments['config']['chunks'])
        train_size_log: int = int(ceil(log2(train_size)) + 1)

        data.max_tfidf = experiments['config']['max_tfidf']
        data.max_ngrams = experiments['config']['max_ngrams']

        for ex in experiments['tasks']:
            # convert features to set:
            features: Set[FeatureSetEnum] \
                = {FeatureSetEnum[f] for f in ex['features']}
            train_set = data.get_feature_dict(SampleTypeEnum.TRAIN, features,
                                              ex['extra_data'])
            test_set = data.get_feature_dict(SampleTypeEnum.TEST, features,
                                             ex['extra_data'])

            # Log the raw feature count once, before preprocessing.
            if first_run:
                unique_features: set = set()
                for inst in train_set:
                    unique_features = unique_features.union(set(
                        inst[0].keys()))
                data.print(
                    f'Number of unique features for {ex["name"]}: {len(unique_features)}'
                )
                unique_features = set()

            # Learning curves train on growing powers-of-two subsets;
            # otherwise a single pass on the full training set.
            l_curves = experiments['config']['l_curves']
            start_size: int = 1 if l_curves \
                else train_size_log-1

            for t_size in map(lambda x: min(2**x, train_size),
                              range(start_size, train_size_log)):
                if l_curves:
                    train_set_copy = train_set[:t_size]
                    test_set_copy = test_set[:]
                else:
                    train_set_copy = train_set
                    test_set_copy = test_set

                # preprocess data
                for pp in ex['preprocessing']:
                    prep: PreprocessorBase \
                        = getattr(preprocessors, pp).Preprocessor(ex['config'])
                    train_set_copy = prep.process(train_set_copy,
                                                  SampleTypeEnum.TRAIN)
                    test_set_copy = prep.process(test_set_copy,
                                                 SampleTypeEnum.TEST)

                # Log the feature count again after preprocessing (only for
                # dict-like feature instances).
                if first_run and hasattr(train_set[0][0], 'keys'):
                    unique_features: set = set()
                    for inst in train_set:
                        unique_features = unique_features.union(
                            set(inst[0].keys()))
                    data.print(
                        f'Number of unique features after preprocessing for {ex["name"]}: {len(unique_features)}'
                    )
                    unique_features = set()

                cls: ClassifierBase \
                    = getattr(classifiers, ex['classificator']).Classifier(ex['config'])
                cls.train(train_set_copy)

                evaluation: dict \
                    = compute_evaluation_scores(cls, test_set_copy, LikeTypeEnum.USEFUL)

                stats.add_points(len(train_set_copy), ex['name'], evaluation)

                # For learning curves also record training-set performance.
                if l_curves:
                    evaluation: dict \
                        = compute_evaluation_scores(cls, train_set_copy, LikeTypeEnum.USEFUL)

                    stats.add_points(len(train_set_copy),
                                     ex['name'] + '-train', evaluation)

                first_run = False

        if not data.prepare_next_dataset():
            break

    # aggregate results here
    for g in experiments['graphs']:
        stats.name = g['name']
        stats.set_view(g['data'])
        data.plot(stats)
示例#17
0
                ranks.append(rank + 1)

                for hits_level in range(10):

                    if rank <= hits_level:
                        hits[hits_level].append(1.0)
                    else:
                        hits[hits_level].append(0.0)

        logger.info('Hits @10: {0}'.format(np.mean(hits[9])))
        logger.info('Hits @3: {0}'.format(np.mean(hits[2])))
        logger.info('Hits @1: {0}'.format(np.mean(hits[0])))
        logger.info('Mean rank: {0}'.format(np.mean(ranks)))
        logger.info('Mean reciprocal rank: {0}'.format(
            np.mean(1. / np.array(ranks))))


if __name__ == '__main__':

    # Load the WN18 dataset with reversed triples included.
    data = Data(dataset='WN18', reverse=True)

    # Initialise the model from the entity/relation vocabulary sizes.
    hypER = HyperER(len(data.entities), len(data.relations))

    # Initialise the trainer and run training + evaluation.
    trainer = Train(hypER, data, num_epoch=100)
    trainer.train_and_eval()
    # trainer.evaluate()
    # trainer.test()
示例#18
0
 def __init__(self, path, batch_size):
     """Wrap a Data generator and cache its normalized adjacency matrix."""
     self.data_generator = Data(path, batch_size)
     self.norm_adj = self.data_generator.get_adj_mat()
	def __init__(self, data, train = False):
		"""Build the text-classification graph for the given dataset name.

		In inference mode (train=False) the model is split in two — token
		ids -> embeddings, embeddings -> predictions — so gradients can be
		taken w.r.t. the embedding input; weights are loaded from disk.
		In training mode a single end-to-end model is built and trained.
		"""
		self.data = data
		if data in ['imdbcnn']:

			# Model hyperparameters for the IMDB CNN.
			filters = 250 
			hidden_dims = 250
			self.embedding_dims = 50
			self.maxlen = 400
			self.num_classes = 2
			self.num_words = 20002
			self.type = 'word'
			if not train:
				K.set_learning_phase(0)

			X_ph = Input(shape=(self.maxlen,), dtype='int32')
			emb_layer = Embedding(self.num_words, self.embedding_dims,
				input_length=self.maxlen, name = 'embedding_1')
			emb_out = emb_layer(X_ph) 

			if train:
				preds = construct_original_network(emb_out, data)	

			else: 
				# Inference graph takes embeddings directly so gradients can
				# be computed w.r.t. them.
				emb_ph = Input(shape=(self.maxlen,self.embedding_dims), dtype='float32')   

				preds = construct_original_network(emb_ph, data) 


			if not train:
				model1 = Model(X_ph, emb_out)
				model2 = Model(emb_ph, preds) 
				pred_out = model2(model1(X_ph))  
				pred_model = Model(X_ph, pred_out) 
				pred_model.compile(loss='categorical_crossentropy',
							  optimizer='adam',
							  metrics=['accuracy']) 
				self.pred_model = pred_model 
				# Per-class gradients of the predictions w.r.t. the
				# embedding input.
				grads = []
				for c in range(self.num_classes):
					grads.append(tf.gradients(preds[:,c], emb_ph))

				grads = tf.concat(grads, axis = 0)  
				# [num_classes, batchsize, maxlen, embedding_dims]

				# Gradient-times-input attribution scores.
				approxs = grads * tf.expand_dims(emb_ph, 0) 
				# [num_classes, batchsize, maxlen, embedding_dims]
				self.sess = K.get_session()  
				self.grads = grads 
				self.approxs = approxs
				self.input_ph = X_ph
				self.emb_out = emb_out
				self.emb_ph = emb_ph
				weights_name = 'original.h5'#[i for i in os.listdir('imdblstm/models/') if i.startswith('original')][0]
				model1.load_weights('{}/models/{}'.format(data, weights_name), 
					by_name=True)
				model2.load_weights('{}/models/{}'.format(data, weights_name), 
					by_name=True)  
				print('Model constructed.')
				# For validating the data. 
				# Zero out the padding-token embedding (row 0).
				emb_weights = emb_layer.get_weights() 
				emb_weights[0][0] = np.zeros(50)
				emb_layer.set_weights(emb_weights)
			else:
				pred_model = Model(X_ph, preds)
				
				pred_model.compile(loss='categorical_crossentropy',
							  optimizer='adam',
							  metrics=['accuracy']) 
				self.pred_model = pred_model
				from load_data import Data
				dataset = Data(self.data)
				self.train(dataset) 
				print('Training is done.') 
示例#20
0
def main(args):
    """CLI entry point: run perceptron, kNN, or decision-tree experiments.

    Loads the a4a and iris datasets (train + test) and dispatches on
    args.algorithm, sweeping the learning rate (perceptron) or k (kNN)
    while reporting misclassification numbers and timings.
    """
    a4a = Data("a4a", A4A_FEATURES)
    a4a_testing = Data("a4a.t", A4A_FEATURES)
    iris = Data("iris.scale", IRIS_FEATURES)
    iris_testing = Data("iris.t", IRIS_FEATURES)

    if not args.algorithm:
        raise AssertionError("Please specify which ML Algorithm you would like to use with -a. Exiting...")

    if args.algorithm == 'perceptron' or args.algorithm == 'Perceptron':

        # Can specify max_lrate and max_epochs with -l and -e
        if not args.lrate:
            max_lrate = 0.1
        else:
            max_lrate = args.lrate

        if not args.epochs:
            epochs = 1000
        else:
            epochs = args.epochs

        if args.verbose:
            print("Beginning perceptron categorization... \n")
            
        for lrate in np.arange(0, max_lrate, 0.001):

            # --------------------------------------------------------------------- IRIS ----------------------------------------------------------------- #
            init_w_iris = [1 for _ in range(IRIS_FEATURES)]
            
            if args.verbose:
                print("\nIRIS:\n")
            """
                Perceptron makes two passes for multi-classification.
                First pass sets datapoints with labels 2 or 3 as -1, and classifies a point as either 1, or 2/3.
                Second pass distinguishes between 2 and 3 by comparing them alone.
                This is kind of messy tbh.
            """
            # Relabel classes 2/3 as -1 for the first (1 vs rest) pass and
            # collect them for the second (2 vs 3) pass.
            iris_y_second_pass = []
            iris_x_second_pass = []
            for i in range(len(iris.y)):
                if iris.y[i] == 1:
                    continue        
                elif iris.y[i] == 2:
                    iris.y[i] = -1
                    iris_x_second_pass.append(iris.x[i])
                    iris_y_second_pass.append(-1)
                elif iris.y[i] == 3:
                    iris.y[i] = -1
                    iris_x_second_pass.append(iris.x[i])
                    iris_y_second_pass.append(1)

            p2 = Perceptron(iris_testing.x[0], init_w_iris, bias=1)
            p2.train_weights(iris.x, iris.y, lrate=lrate, epochs=epochs, verbose=args.verbose)

            p3 = Perceptron(iris_testing.x[0], init_w_iris, bias=1)
            p3.train_weights(iris_x_second_pass, iris_y_second_pass, lrate=lrate, epochs=epochs, verbose=args.verbose)

            iris_error = 0

            iris_start_time = time.time()

            # NOTE(review): despite the name, iris_error counts *matching*
            # predictions (it increments on agreement), and the +10 in the
            # denominator below looks ad hoc — confirm intended metric.
            for j in range(len(iris_testing.x)):
                p2.set_x(iris_testing.x[j])
                prediction = p2.predict()
                if args.verbose:
                    print(f"Prediction for {iris_testing.x[j]}: {prediction}. Recorded classification is {iris_testing.y[j]}")

                if prediction == 1 and iris_testing.y[j] == 1:
                    iris_error += 1
                elif prediction == -1 and (iris_testing.y[j] == 2 or iris_testing.y[j] == 3):
                    iris_error += 1 

            # Second pass: distinguish class 2 (-1) from class 3 (+1).
            for k in range(len(iris_testing.x)):
                if iris_testing.y[k] != 1:
                    p3.set_x(iris_testing.x[k])
                    prediction = p3.predict()
                    if args.verbose:
                        print(f"Prediction for {iris_testing.x[k]}: {prediction}. Recorded classification is {iris_testing.y[k]}")

                    if iris_testing.y[k] == 2 and prediction == -1:
                        iris_error += 1
                    elif iris_testing.y[k] == 3 and prediction == 1:
                        iris_error += 1

            iris_error = iris_error / ( len(iris_testing.y) + 10 )


            iris_total_time = time.time() - iris_start_time

            # --------------------------------------------------------------------- A4A ------------------------------------------------------------------ #
            init_w_a4a = [1 for _ in range(A4A_FEATURES)]

            if args.verbose:
                print("\nA4A:\n")

            p = Perceptron(a4a_testing.x[0], init_w_a4a, bias=1)
            p.train_weights(a4a.x, a4a.y, lrate=lrate, epochs=epochs, verbose=args.verbose)    
            
            a4a_error = 0

            a4a_start_time = time.time()

            # NOTE(review): a4a_error likewise counts correct predictions,
            # so the printed "misclassification error" is actually accuracy.
            for i in range(len(a4a_testing.x)):
                p.set_x(a4a_testing.x[i])
                prediction = p.predict()
                #if args.verbose:
                    #print(f"Prediction for {a4a_testing.x[i]}: {prediction}. Recorded classification is {a4a_testing.y[i]}")
                
                if prediction == a4a_testing.y[i]:
                    a4a_error += 1

            a4a_error = a4a_error / len(a4a_testing.y)
            a4a_total_time = time.time() - a4a_start_time

            if args.verbose:
                print(f"Iris misclassification error: {iris_error}\na4a misclassification error: {a4a_error}\n")
                print(f"Iris classification time: {iris_total_time}\na4a classification time: {a4a_total_time}")

    elif args.algorithm == 'kNN' or args.algorithm == 'knn':

        if args.verbose:
            print("Beginning k-Nearest Neighbors categorization... \n")

        # can specify k and distance with -k and -d
        if not args.k:
            max_k=25
        else:
            max_k = args.k

        if not args.distance:
            distance_metric = 'euclidean'
        else:
            distance_metric = args.distance

        for k in range(1, max_k):

            # --------------------------------------------------------------------- IRIS ----------------------------------------------------------------- #
            iris_knn = kNN(iris.x, iris.y)
            iris_error = 0

            iris_start_time = time.time()

            for i in range(len(iris_testing.x)):
                y = iris_knn.classify(new_x=iris_testing.x[i], k=k, distance_metric=distance_metric, verbose=args.verbose)
                if args.verbose:
                    print(f"Prediction for {iris_testing.x[i]}: {y}. Recorded classification is {iris_testing.y[i]}")

                if y == iris_testing.y[i]:
                    iris_error += 1

            iris_error = iris_error / len(iris_testing.y)

            iris_total_time = time.time() - iris_start_time

            # --------------------------------------------------------------------- A4A ------------------------------------------------------------------ #
            
            a4a_knn = kNN(a4a.x, a4a.y)
            a4a_error = 0

            a4a_start_time = time.time()

            for j in range(len(a4a_testing.x)):
                y = a4a_knn.classify(new_x=a4a_testing.x[j], k=k, distance_metric=distance_metric, verbose=args.verbose)
                if args.verbose:
                    print(f"Prediction for {a4a_testing.x[j]}: {y}. Recorded classification is {a4a_testing.y[j]}")

                if y == a4a_testing.y[j]:
                    a4a_error += 1

            a4a_error = a4a_error / len(a4a_testing.y)

            a4a_total_time = time.time() - a4a_start_time

            if args.verbose:
                print(f"Iris misclassification error: {iris_error}\na4a misclassification error: {a4a_error}\n")
                print(f"Iris classification time: {iris_total_time}\na4a classification time: {a4a_total_time}")

    elif args.algorithm == 'decision' or args.algorithm == 'tree' or args.algorithm == 'decision_tree':
        
        # --------------------------------------------------------------------- IRIS ----------------------------------------------------------------- #
        iris_dt = DecisionTree(iris.x, iris.y)

        #left_x, left_y, right_x, right_y = iris_dt.split(0, 0)
    

        # --------------------------------------------------------------------- A4A ------------------------------------------------------------------ #
        a4a_dt = DecisionTree(a4a.x, a4a.y)
        

        print("This part made optional, therefore not implemented for the sake of time. ")

    return
示例#21
0
文件: main.py 项目: zhifei1993/RCN
def str2bool(value):
    """Convert a command-line string to a real boolean.

    ``argparse`` with ``type=bool`` is a classic trap: ``bool(s)`` is True
    for ANY non-empty string, so ``--cuda False`` would silently enable
    CUDA.  This converter accepts the common true/false spellings and
    rejects anything else with a clear error message.
    """
    if isinstance(value, bool):  # argparse passes defaults through untouched
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % value)


def main():
    """Entry point: parse hyper-parameters, fix the RNG seed, load the
    knowledge-graph dataset and train/evaluate the ComConV model."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default="ComConV",
                        help='ComConV')
    parser.add_argument(
        '--dataset',
        type=str,
        default="countries/countries_S3",
        help=
        'FB15k, FB15k-237, WN18, WN18RR, YAGO3-10, countries/countries_S1, ...'
    )
    # NOTE: the boolean flags below use str2bool instead of type=bool;
    # type=bool would treat any non-empty string (even "False") as True.
    parser.add_argument('--cuda',
                        type=str2bool,
                        default=False,
                        help='use cuda or not')
    parser.add_argument('--get_best_results',
                        type=str2bool,
                        default=True,
                        help='get best results or not')
    parser.add_argument('--get_complex_results',
                        type=str2bool,
                        default=False,
                        help='get complex results or not')
    parser.add_argument('--num_to_eval',
                        type=int,
                        default=5,
                        help='number to evaluate')

    # learning parameters
    parser.add_argument('--learning_rate',
                        type=float,
                        default=1e-1,
                        help='learning rate')
    parser.add_argument('--batch_size',
                        type=int,
                        default=128,
                        help='batch size')
    parser.add_argument('--num_iterations',
                        type=int,
                        default=1500,
                        help='iterations number')
    parser.add_argument('--optimizer_method',
                        type=str,
                        default="RAdam",
                        help='optimizer method')
    parser.add_argument('--decay_rate',
                        type=float,
                        default=1.0,
                        help='decay rate')
    parser.add_argument('--label_smoothing',
                        type=float,
                        default=0.1,
                        help='label smoothing')

    # convolution parameters
    parser.add_argument('--ent_vec_dim',
                        type=int,
                        default=200,
                        help='entity vector dimension')
    parser.add_argument('--rel_vec_dim',
                        type=int,
                        default=200,
                        help='relation vector dimension')
    parser.add_argument('--input_dropout',
                        type=float,
                        default=0.2,
                        help='input dropout')
    parser.add_argument('--feature_map_dropout',
                        type=float,
                        default=0.2,
                        help='feature map dropout')
    parser.add_argument('--hidden_dropout',
                        type=float,
                        default=0.3,
                        help='hidden dropout')
    parser.add_argument('--filt_h', type=int, default=2, help='filter height')
    parser.add_argument('--filt_w', type=int, default=5, help='filter width')
    parser.add_argument('--in_channels',
                        type=int,
                        default=1,
                        help='in channels')
    parser.add_argument('--out_channels',
                        type=int,
                        default=36,
                        help='out channels')

    args = parser.parse_args()
    dataset = args.dataset
    data_dir = "data/%s/" % dataset
    print(args)

    # Fix the random seed so that every training run is reproducible.
    seed = 777
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

    data = Data(data_dir=data_dir, reverse=True)
    run = RunModel(data,
                   modelname=args.model_name,
                   optimizer_method=args.optimizer_method,
                   num_iterations=args.num_iterations,
                   batch_size=args.batch_size,
                   learning_rate=args.learning_rate,
                   decay_rate=args.decay_rate,
                   ent_vec_dim=args.ent_vec_dim,
                   rel_vec_dim=args.rel_vec_dim,
                   cuda=args.cuda,
                   input_dropout=args.input_dropout,
                   hidden_dropout=args.hidden_dropout,
                   feature_map_dropout=args.feature_map_dropout,
                   in_channels=args.in_channels,
                   out_channels=args.out_channels,
                   filt_h=args.filt_h,
                   filt_w=args.filt_w,
                   label_smoothing=args.label_smoothing,
                   num_to_eval=args.num_to_eval,
                   get_best_results=args.get_best_results,
                   get_complex_results=args.get_complex_results,
                   regular_method="",
                   regular_rate=1e-4)
    run.train_and_eval()
示例#22
0
    # pkl_file = './datasets/dblp10000/VertexClustering.pkl'
    # attr1_file = './datasets/dblp10000/gender.txt'
    # # attr2_file = './datasets/dblp10000/prolific.txt'
    # attr3_file = './datasets/dblp10000/topic.txt'

    # edges_file = './datasets/dblp84170/edgelist_py.txt'
    # pkl_file = './datasets/dblp84170/VertexClustering.pkl'
    # attr1_file = './datasets/dblp84170/prolific.txt'
    # attr2_file = './datasets/dblp84170/topic.txt'

    # edges_file = './datasets/AmazonLarge/edgelist_py.txt'
    # pkl_file = './datasets/AmazonLarge/VertexClustering.pkl'
    # attr1_file = './datasets/AmazonLarge/avg_rating.txt'
    # attr2_file = './datasets/AmazonLarge/sales_rank.txt'

    data = Data()
    data.read_graph(edges_file)

    data.load_clusters(pkl_file)
    # data.detect_clusters(stru_method='lpa')
    # data.detect_clusters(stru_method='infomap')

    data.read_attr(attr1_file)
    data.read_attr(attr2_file)
    data.read_attr(attr3_file)

    coho = Cohomo(data)
    del data
    coho.init_attr_weight()
    coho.update_attr_weight()
# Paths to the pre-built numpy feature/label arrays.
# NOTE(review): assumes `file_path` and `np` are defined earlier in the
# file -- confirm against the full script.
data_file = file_path + 'data/sports-600.npy'
label_file = file_path + 'data/labels.npy'

data = np.load(data_file)    # 3-D array: shape[1] -> n_chunks, shape[2] -> chunk_size (see below)
label = np.load(label_file)  # labels for the samples in `data`

# load original tweets
# ---------------------------------------------------------------------------------
# Mapping from sport hashtag to its integer class id (5 classes).
sports_dic = {
    'basketball': 1,
    'hockey': 2,
    'baseball': 3,
    'tennis': 4,
    'volleyball': 5
}
sp_data = Data(sports_dic, file_path)
sp_df = sp_data.csv_df(['text'])  # load data
# Strip the label-revealing hashtags (e.g. '#tennis') from the raw text
# before pre-processing.
rm_hashtags = ['#' + s for s in sports_dic.keys()]
sp_data.pre_process(sp_df, rm_list=rm_hashtags)  # pre-process data
sp_df.drop(['tokenized'], axis=1, inplace=True)  # drop intermediate column
# ---------------------------------------------------------------------------------

# set up lstm structure
n_classes = 5                # one class per sport in sports_dic
hm_epochs = 20               # training epochs
batch_size = 50
chunk_size = data.shape[2]   # per-step input size (presumably features per time step)
n_chunks = data.shape[1]     # presumably time steps per sample -- confirm data layout
rnn_size = 300               # LSTM hidden units
# height x width
示例#24
0
# Algo config
num_folds = 7  # number of stratified CV folds

# File config
VERSION = 4
MODEL_NAME = 'lgbm'
OUTPUT_FOLDER = 'model_outputs/{}_{}/'.format(MODEL_NAME, VERSION)
OUTPUT_FILENAME = OUTPUT_FOLDER + MODEL_NAME

# Create the output directory on first run.
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)

if __name__ == "__main__":

    print('Loading Data...')
    df = Data().read_data()
    df_x = df.drop(['target'], axis=1)  # feature columns only
    df_y = df[['target']]               # label column
    features = list(df_x.columns.values)

    # shuffle=False keeps the original row order: the split is positional,
    # not random.  NOTE(review): intentional? verify against the data source.
    X_train, X_test, Y_train, Y_test = train_test_split(df_x,
                                                        df_y,
                                                        shuffle=False,
                                                        train_size=0.8)

    # Free the full frame before allocating the fold arrays below.
    del df
    gc.collect()

    folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=47)
    oof = np.zeros(X_train.shape[0])     # out-of-fold predictions, one per train row
    getVal = np.zeros(X_train.shape[0])
示例#25
0
                    type=float,
                    default=0.5,
                    help='Dropout rate (1 - keep probability).')
args = parser.parse_args()
# Echo every parsed hyper-parameter into the training log.
for arg in vars(args):
    print('{0} = {1}'.format(arg, getattr(args, arg)))
torch.manual_seed(args.seed)  # reproducible weight init / sampling
# training on the first GPU if not on CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Training on device = {}'.format(device))
"""
===========================================================================
Loading data
===========================================================================
"""
# Load the graph dataset via the project-local Data class.
data = Data(path=args.data_path, dataset=args.dataset, split=args.split)
print('Loaded {0} dataset with {1} nodes and {2} edges'.format(
    args.dataset, data.n_node, data.n_edge))
feature = data.feature.to(device)  # node feature matrix, moved to the device
label = data.label.to(device)      # node labels, moved to the device
# Wrap the train/val/test node-index splits so DataLoader can batch them.
train = Dataset(data.idx_train)
val = Dataset(data.idx_val)
test = Dataset(data.idx_test)
train_loader = DataLoader(dataset=train, batch_size=args.batch_size)
val_loader = DataLoader(dataset=val, batch_size=args.batch_size)
test_loader = DataLoader(dataset=test, batch_size=args.batch_size)
# Neighborhood sampler over the adjacency matrix for the chosen aggregator.
sampler = Sampler(data.adj, args.aggregator)
"""
===========================================================================
Training
===========================================================================
示例#26
0
                bestNumLSTMUnits = numLSTMUnits

        print('BEST: ( num_lstm_units: ' + str(bestNumLSTMUnits) + ')')


#  MAIN
# .
# .
# .
# .
# .
# .
# .
# .

# Build the dataset from the news stream.
# NOTE(review): parameter semantics inferred from names only (news items
# per hour, momentum window, news time-to-market) -- confirm in Data.
Data.load_data(news_per_hour=10, momentum_window=30, newsTimeToMarket=20)

(X_train, Y_train), (X_test, Y_test) = Data.get_train_test_set()

# Convert the numpy splits to float32 TensorFlow tensors.
test_x = tf.convert_to_tensor(np.asarray(X_test), dtype=tf.float32)
train_x = tf.convert_to_tensor(np.asarray(X_train), dtype=tf.float32)

train_y = tf.convert_to_tensor(np.asarray(Y_train), dtype=tf.float32)
test_y = tf.convert_to_tensor(np.asarray(Y_test), dtype=tf.float32)

# Log the dataset dimensions before building the model.
print('.........................')
print("number of training examples = " + str(train_x.shape[0]))
print("number of test examples = " + str(test_x.shape[0]))
print("X_train shape: " + str(train_x.shape))
print("Y_train shape: " + str(train_y.shape))
print("X_test shape: " + str(test_x.shape))
示例#27
0
     '--dataset',
     type=str,
     default="FB15k-237",
     nargs="?",
     help='Which dataset to use: FB15k, FB15k-237, WN18 or WN18RR')
 args = parser.parse_args()
 model_name = args.algorithm
 dataset = args.dataset
 data_dir = "data/%s/" % dataset
 torch.backends.cudnn.deterministic = True
 seed = 42
 np.random.seed(seed)
 torch.manual_seed(seed)
 if torch.cuda.is_available:
     torch.cuda.manual_seed_all(seed)
 d = Data(data_dir=data_dir, reverse=True)
 experiment = Experiment(model_name,
                         num_iterations=800,
                         batch_size=128,
                         learning_rate=0.001,
                         decay_rate=0.99,
                         ent_vec_dim=200,
                         rel_vec_dim=200,
                         cuda=True,
                         input_dropout=0.2,
                         hidden_dropout=0.3,
                         feature_map_dropout=0.2,
                         in_channels=1,
                         out_channels=32,
                         filt_h=1,
                         filt_w=9,
示例#28
0
    parser.add_argument('--model', type=str, default="p2v-l", nargs="?",
                    help='Which model to use: p2v-l or p2v-p')
    parser.add_argument('--num_iters', type=int, default=100, nargs="?",
                    help='Number of iterations')
    parser.add_argument('--lr', type=float, default=0.1, nargs="?",
                    help='Initial learning rate')
    parser.add_argument('--dr', type=float, default=0.98, nargs="?",
                    help='Decay rate')
    parser.add_argument('--batch_size', type=int, default=10000, nargs="?",
                    help='Batch size')
    parser.add_argument('--num_neg', type=int, default=5, nargs="?",
                    help='Number of negative samples per each positive sample')
    parser.add_argument('--dim', type=int, default=200, nargs="?",
                    help='Embeddings dimensionality')
    parser.add_argument('--w_reg', type=float, default=0.5, nargs="?",
                    help='Regularization coefficient for W')
    parser.add_argument('--c_reg', type=float, default=0.5, nargs="?",
                    help='Regularization coefficient for C')
    parser.add_argument('--cuda', type=bool, default=True, nargs="?",
                    help='Whether to use cuda (GPU) or not (CPU)')

    args = parser.parse_args()
    d = Data(data_dir="data/", fname=args.dataset, min_occurrences=args.min_occurrences, 
             window_size=args.window_size, subsample=args.subsample, t=args.threshold,
             cutoff=args.cutoff)
    
    experiment = Experiment(args.model, num_iterations=args.num_iters, learning_rate=args.lr, 
                    batch_size=args.batch_size, corrupt_size=args.num_neg, decay_rate=args.dr, 
                    embeddings_dim=args.dim, w_reg=args.w_reg, c_reg=args.c_reg, cuda=args.cuda)
    experiment.train_and_eval()
    
示例#29
0
                        help="Entity embedding dimensionality.")
    parser.add_argument("--rdim", type=int, default=200, nargs="?",
                        help="Relation embedding dimensionality.")
    parser.add_argument("--cuda", type=bool, default=True, nargs="?",
                        help="Whether to use cuda (GPU) or not (CPU).")
    parser.add_argument("--input_dropout", type=float, default=0.3, nargs="?",
                        help="Input layer dropout.")
    parser.add_argument("--hidden_dropout1", type=float, default=0.4, nargs="?",
                        help="Dropout after the first hidden layer.")
    parser.add_argument("--hidden_dropout2", type=float, default=0.5, nargs="?",
                        help="Dropout after the second hidden layer.")
    parser.add_argument("--label_smoothing", type=float, default=0.1, nargs="?",
                        help="Amount of label smoothing.")

    args = parser.parse_args()
    dataset = args.dataset
    data_dir = "data/%s/" % dataset
    torch.backends.cudnn.deterministic = True
    seed = 20
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available:
        torch.cuda.manual_seed_all(seed)
    d = Data(data_dir=data_dir, reverse=False)
    experiment = Experiment(num_iterations=args.num_iterations, batch_size=args.batch_size, learning_rate=args.lr,
                            decay_rate=args.dr, ent_vec_dim=args.edim, rel_vec_dim=args.rdim, cuda=args.cuda,
                            input_dropout=args.input_dropout, hidden_dropout1=args.hidden_dropout1,
                            hidden_dropout2=args.hidden_dropout2, label_smoothing=args.label_smoothing)
    experiment.train_and_eval()

示例#30
0
	parser.add_argument('--data', type = str, 
		choices = ['imdbcnn'], default = 'imdbcnn') 
	parser.add_argument('--num_neighbors', type = int, default = 4) 
	parser.add_argument('--train', action='store_true')
	parser.add_argument('--original', action='store_true')
	parser.add_argument('--max_order', type = int, default = 16)

	args = parser.parse_args()
	dict_a = vars(args)   
	if args.method == 'train':
		model = TextModel(args.data, train = True)


	else:
		print('Loading dataset...') 
		dataset = Data(args.data)

		print('Creating model...')
		model = TextModel(args.data) 

		dict_a.update({'dataset': dataset, 'model': model})

	if args.data not in os.listdir('./'):	
		os.mkdir(args.data)
	if 'results' not in os.listdir('./{}'.format(args.data)):
		os.mkdir('{}/results'.format(args.data))

	if args.method in ['localshapley','connectedshapley']:
		dict_a.update({'regression': False})
		scores = lcshapley(args)