def evaluate(inp, client, l, DBlocation):
    """Parse one console command and dispatch it to the matching mpd action.

    Args:
        inp: Raw command line, e.g. "play 3", "search -f foo", "q".
        client: Connected mpd client object.
        l: Result list from a previous search (consumed by the 'add' command).
        DBlocation: Path to the mpd database, forwarded to util.mpdsearch.

    Returns:
        The (possibly updated) search-result list ``l``.
    """
    inp = inp.split(' ')
    status = client.status()
    # Strip a trailing empty token left by input like "p " (trailing space).
    if len(inp) > 1 and not str(inp[1]):
        inp.pop()
    if inp[0] == 'p' or 'play' == (inp[0]):
        try:
            if not status['state'] == 'stop':
                # Already playing/paused: bare "p" toggles pause, an index jumps.
                if len(inp) == 1:
                    util.pause(client)
                else:
                    util.play(client, int(inp[1]))
            else:
                # Stopped: bare "p" starts from the top of the playlist.
                if len(inp) == 1:
                    util.play(client, 0)
                else:
                    util.play(client, int(inp[1]))
        except Exception:
            # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
            # are no longer swallowed; bad indices (ValueError) and mpd command
            # errors still land here.
            print('mpd error: bad song index')
    elif inp[0] == 'pause':
        util.pause(client)
    elif inp[0] == 'next' or inp[0] == 'n':
        util.next(client)
    elif inp[0] == 'previous' or inp[0] == 'ps':
        util.previous(client)
    elif inp[0] == 'stop':
        util.stop(client)
    elif inp[0] == 'pl' or inp[0] == 'playlist':
        util.print_playlist(client)
    elif inp[0] == 'update' or inp[0] == 'u':
        util.update(client)
    elif inp[0] == 'clear':
        util.clear(client)
    elif inp[0] == 'random':
        util.mpdrandom(client, inp[1])
    elif inp[0] == 'shuffle':
        util.shuffle(client)
    elif inp[0] == 'consume':
        util.consume(client, inp[1])
    elif inp[0] == 'swap':
        util.swap(client, int(inp[1]) - 1, int(inp[2]) - 1)
    elif inp[0] == 'single':
        util.single(client, inp[1])
    elif inp[0] == 'search' or inp[0] == 's':
        # '-f'/'--filter' switches the search into filter mode; the two
        # previous branches differed only in that final boolean.
        use_filter = '-f' in inp or '--filter' in inp
        l = util.mpdsearch(inp[1], inp, DBlocation, use_filter)
    elif inp[0] == 'a' or inp[0] == 'add':
        if l:
            for line in l:
                client.add(line)
        else:
            print('You have to search first!')
    elif inp[0] == 'q' or inp[0] == 'quit':
        quit()
    return l
def __init__(self, classified_data_list, kernel, svm_constructor=LibSvmClassifier, ensemble_size=3):
    """Build an ensemble of SVMs, each trained on its own shuffled sample.

    Each member classifier gets an independent shuffled copy of the data,
    truncated via util.take_n, so the ensemble members differ.
    """
    def _build_member():
        sample = classified_data_list.copy()
        util.shuffle(sample)
        # presumably trims the shuffled copy down to 10 rows in place — TODO confirm
        util.take_n(sample, 10)
        return svm_constructor(sample, kernel)

    self._classifiers = [_build_member() for _ in range(ensemble_size)]
def add(self, rect):
    """Place *rect* into the rack (or the extras pile) and re-layout.

    While nothing is selected and a shuffle credit remains, the rack is
    also shuffled and one credit is spent.
    """
    if len(self.pieces) >= 7:
        # Rack is full; overflow pieces wait in the extras pile.
        self.extras.append(rect)
    elif self.selected is not None:
        self.pieces.append(rect)
    else:
        # No selection: drop the new piece at a random rack position.
        self.pieces.insert(randint(0, len(self.pieces)), rect)
    if self.selected is None and self.shuffle >= 1:
        shuffle(self.pieces)
        self.shuffle -= 1
    self.realign()
# Load the train/val/test splits for `dataset` from data_dir, separate off the
# last column as labels, and shuffle the train and val sets (test is left in
# file order). Returns (train_x, train_y, val_x, val_y, test_x, test_y).
# NOTE(review): the files are opened in TEXT mode but read with np.load, which
# expects a binary .npy stream — confirm these aren't meant to be np.loadtxt.
def load_data(): with open(os.path.join(data_dir, dataset, 'train.txt')) as f: train_data = np.load(f) with open(os.path.join(data_dir, dataset, 'val.txt')) as f: val_data = np.load(f) with open(os.path.join(data_dir, dataset, 'test.txt')) as f: test_data = np.load(f) train_x, train_y = train_data[:,:-1], train_data[:,-1] val_x, val_y = val_data[:,:-1], val_data[:,-1] test_x, test_y = test_data[:,:-1], test_data[:,-1] train_x, train_y = util.shuffle(train_x, train_y) val_x, val_y = util.shuffle(val_x, val_y) return train_x, train_y, val_x, val_y, test_x, test_y
# Train a multi_classifier for kwargs['epoch'] epochs: each epoch reshuffles
# the negatives, concatenates them with the positives, shuffles the merged set,
# and runs one training epoch; finally prints held-out pos/neg test accuracy
# and saves the model.
# NOTE(review): with indentation collapsed it is unclear whether the two
# "test_pos"/"test_neg" print() calls run every epoch or once after the loop —
# confirm before restructuring.
def classify(sess, pos_data, neg_data, pos_label, neg_label, pos_data_test, neg_data_test, pos_label_test, neg_label_test, **kwargs): net = multi_classifier(sess, **kwargs) for i in range(kwargs['epoch']): train_neg_data, train_neg_label = util.shuffle(neg_data, neg_label) trainData = np.concatenate((pos_data, train_neg_data)) trainLabel = np.concatenate((pos_label, train_neg_label)) trainData, trainLabel = util.shuffle(trainData, trainLabel) net.train_epoch(sess, trainData, trainLabel, **kwargs) # print(kwargs['trainNum'], "train", net.train_epoch(sess, trainData, trainLabel, **kwargs)) # print(kwargs['trainNum'], "test", net.test(sess, testData, testLabel)) # print(net.inference(sess, testData)) print(kwargs['trainNum'], "test_pos", net.test(sess, pos_data_test, pos_label_test)) print(kwargs['trainNum'], "test_neg", net.test(sess, neg_data_test, neg_label_test)) net.save_model(sess, **kwargs)
def knn(trainData, trainLabel, testData, testLabel, **kwargs):
    """k-NN over 10 reshuffles of the training data.

    Args:
        trainData/trainLabel: training set (normalized in place here).
        testData/testLabel: held-out evaluation set.
        **kwargs: 'n_neighbors', 'weights', 'p', 'PCA' (bool), 'n_components'.

    Returns:
        (best predictions on the test set, best accuracy across the 10 runs).
    """
    print(kwargs)
    trainData = util.normalization(trainData)
    testData = util.normalization(testData)
    acc_list = []
    ret = []
    acc_max = 0
    for i in range(10):
        trainData_shuffle, trainLabel_shuffle = util.shuffle(
            trainData, trainLabel)
        neigh = KNeighborsClassifier(n_neighbors=kwargs['n_neighbors'],
                                     weights=kwargs['weights'], p=kwargs['p'])
        if kwargs['PCA']:
            pca = PCA(n_components=kwargs['n_components'])
            trainData_shuffle = pca.fit_transform(trainData_shuffle)
            neigh.fit(trainData_shuffle, trainLabel_shuffle)
            eval_data = pca.transform(testData)
        else:
            neigh.fit(trainData_shuffle, trainLabel_shuffle)
            eval_data = testData
        acc_i = neigh.score(eval_data, testLabel)
        # BUG FIX: the best model/predictions were previously tracked only in
        # the PCA branch; with PCA disabled the function always returned
        # ret=[] and acc_max=0.
        if acc_i > acc_max:
            acc_max = acc_i
            ret = neigh.predict(eval_data)
        print("%d acc: " % i, acc_i)
        acc_list.append(acc_i)
    acc = np.mean(np.array(acc_list))
    print("KNN accuracy: ", acc)
    return ret, acc_max
# Python 2. k-fold cross-validation driver: shuffles (x_full, y_full), runs
# batch gradient descent on each fold, prints per-fold validation/training/full
# error rates, optionally plots the NLL curve, and returns the per-fold betas
# (shape (k, feature_count)) — the averaging line is left commented out, so
# callers receive ALL fold betas, not an averaged one.
def calc_cross_validated_beta(x_full, y_full, lam, step_size, iterations, weight_step, k, use_nll, plot_nll): #shuffle x_full and y_full so we can crossvalidate feature_count = len(x_full[0]) x_full, y_full = util.shuffle(x_full, y_full, to_numpy_array = True) beta_all = np.zeros(shape=(k, feature_count)) validation_error_rates = np.empty(shape=k) nll = [None]*k for i in xrange(k): x_train, x_test = extract_fold(x_full, i, k) y_train, y_test = extract_fold(y_full, i, k) #This alters beta_all and possibly nll nll[i], beta_all[i] = run_batch_gradient_descent(x_train, y_train, lam, step_size, iterations, weight_step, use_nll = use_nll) test_labels_calc = calc_labels(x_test, beta_all[i]) validation_error_rates[i] = calc_error_rate(test_labels_calc, y_test) print 'cross-validation error rate', validation_error_rates[i] training_labels = calc_labels(x_train, beta_all[i]) print 'training error rate', calc_error_rate(training_labels, y_train), full_labels = calc_labels(x_full, beta_all[i]) print 'full', calc_error_rate(full_labels, y_full) if plot_nll and use_nll: plot_nll_data(nll[i], 'derp') #Take the average beta among all betas calculated during cross-validation #beta = np.sum(beta_all, axis=0)/float(len(beta_all)) if use_nll: for i in xrange(len(validation_error_rates)): print i, nll[i][-1], validation_error_rates[i] print 'avg error rate', np.mean(validation_error_rates) return beta_all
# Scheduled job: log in to Instagram, like posts under each of the account's
# tags, record the like count on the job, schedule the next run, and commit
# via `session`.
# NOTE(review): `tags = shuffle(tag_names)` — if this is random.shuffle it
# returns None and the for-loop would raise; presumably a helper that returns
# the shuffled list. Confirm which `shuffle` is imported.
# NOTE(review): if Insta() itself raises, `insta` is unbound and the trailing
# insta.driver.quit() raises NameError — consider a guard/finally.
def like(job, session=None): count = 0 try: insta = Insta() insta.login(username=job.i_user.username, password=job.i_user.get_password()) time.sleep(1) # get users tags and shuffles them tag_names = [str(tag) for tag in job.i_user.tags] tags = shuffle(tag_names) for tag in tags: insta.search(tag) count += insta.like_tag(tag) time.sleep(5) except Exception as e: job.error = '{}: {}'.format(type(e), e) job.count = count job.finish() # new run for jobs new_job = schedule_next_job(job, rando_hour()) session.add(new_job) session.commit() insta.driver.quit() return job
# Python 2. Duplicate of the other calc_cross_validated_beta in this file
# (only whitespace differs): shuffles the data, runs gradient descent per
# fold, prints validation/training/full error rates, and returns the per-fold
# betas unaveraged. Consider deleting one of the two copies.
def calc_cross_validated_beta(x_full, y_full, lam, step_size, iterations, weight_step, k, use_nll, plot_nll): #shuffle x_full and y_full so we can crossvalidate feature_count = len(x_full[0]) x_full, y_full = util.shuffle(x_full, y_full, to_numpy_array=True) beta_all = np.zeros(shape=(k, feature_count)) validation_error_rates = np.empty(shape=k) nll = [None] * k for i in xrange(k): x_train, x_test = extract_fold(x_full, i, k) y_train, y_test = extract_fold(y_full, i, k) #This alters beta_all and possibly nll nll[i], beta_all[i] = run_batch_gradient_descent(x_train, y_train, lam, step_size, iterations, weight_step, use_nll=use_nll) test_labels_calc = calc_labels(x_test, beta_all[i]) validation_error_rates[i] = calc_error_rate(test_labels_calc, y_test) print 'cross-validation error rate', validation_error_rates[i] training_labels = calc_labels(x_train, beta_all[i]) print 'training error rate', calc_error_rate(training_labels, y_train), full_labels = calc_labels(x_full, beta_all[i]) print 'full', calc_error_rate(full_labels, y_full) if plot_nll and use_nll: plot_nll_data(nll[i], 'derp') #Take the average beta among all betas calculated during cross-validation #beta = np.sum(beta_all, axis=0)/float(len(beta_all)) if use_nll: for i in xrange(len(validation_error_rates)): print i, nll[i][-1], validation_error_rates[i] print 'avg error rate', np.mean(validation_error_rates) return beta_all
def bayes(trainData, trainLabel, testData, testLabel, **kwargs):
    """Gaussian naive Bayes over 10 reshuffles of the training data.

    Args:
        trainData/trainLabel: training set (normalized here).
        testData/testLabel: held-out evaluation set.
        **kwargs: 'PCA' (bool), 'n_components'.

    Returns:
        (best predictions on the test set, best accuracy across the 10 runs).
    """
    print(kwargs)
    trainData = util.normalization(trainData)
    testData = util.normalization(testData)
    acc_list = []
    acc_max = 0
    ret = []
    for i in range(10):
        trainData_shuffle, trainLabel_shuffle = util.shuffle(
            trainData, trainLabel)
        clf = GaussianNB()
        if kwargs['PCA']:
            pca = PCA(n_components=kwargs['n_components'])
            trainData_shuffle = pca.fit_transform(trainData_shuffle)
            clf.fit(trainData_shuffle, trainLabel_shuffle)
            eval_data = pca.transform(testData)
        else:
            clf.fit(trainData_shuffle, trainLabel_shuffle)
            eval_data = testData
        acc_i = clf.score(eval_data, testLabel)
        # BUG FIX: best accuracy/predictions were previously tracked only in
        # the PCA branch; with PCA disabled this returned ret=[] and acc_max=0.
        if acc_i > acc_max:
            acc_max = acc_i
            ret = clf.predict(eval_data)
        print("%d acc: " % i, acc_i)
        acc_list.append(acc_i)
    acc = np.mean(np.array(acc_list))
    print("Naive Bayes accuracy: ", acc)
    return ret, acc_max
def train(train_imgs, model, sess):
    """Run EPOCH_NUM epochs over train_imgs, periodically dumping sample outputs.

    Every other epoch the first five outputs of the first batch are written to
    OUT_IMG_DIR_INT as JPEGs; step losses are logged every 100 steps.
    Returns the (trained) model.
    """
    n_train = len(train_imgs)
    for epoch in range(EPOCH_NUM):
        shuffled = util.shuffle(train_imgs)
        epoch_loss = 0.
        for step, start in enumerate(range(0, n_train, BATCH_SIZE)):
            end = min(start + BATCH_SIZE, n_train)
            batch = shuffled[start:end]
            outs, batch_loss, _ = sess.run(
                [model.outs, model.losses, model.optim],
                feed_dict={model.inputs: util.get_input_imgs(batch),
                           model.labels: batch})
            # Weight by batch size so the epoch average is per-sample.
            epoch_loss += batch_loss * (end - start)
            if epoch % 2 == 0 and step == 0:
                # Snapshot the first batch's first five outputs.
                for i, img in enumerate(util.arrays2imgs(outs[:5])):
                    cv2.imwrite(OUT_IMG_DIR_INT + 'e%d_%d.jpg' % (epoch, i), img)
            if step % 100 == 0:
                print('.. Step %5d, loss: %.5f' % (step, batch_loss))
        print('Epoch %3d >> avg_loss: %.5f' % (epoch, epoch_loss / n_train))
    return model
def testRBM(opts): """show how to use RBM to do classification""" # read data data = np.load(opts.feature) label = np.load(opts.label) # set the nodes of hidden layers nHid = 1000 # shuffle data and label [data, label] = util.shuffle(data, label) # decide how many samples to be used as training set percent = float(opts.trainPercent) nCase = data.shape[0] nTrain = int(nCase * percent) nTest = nCase - nTrain # split data and label into train dataset and test dataset trainData = data[0:nTrain, :] trainLabel = label[0:nTrain, :] example = data[nTrain:, :] testLabel = label[nTrain:, :] p = {"maxEpoch": opts.maxEpoch} m = rbmFit.rbmFit(trainData, nHid, trainLabel, isSaveModel=True, name=opts.model, **p) [trainR, F1] = rbmPredict.rbmPredict(m, trainData) [testR, F2] = rbmPredict.rbmPredict(m, example) trainK = 0 for x in range(nTrain): if trainLabel[x] != trainR[x]: trainK = trainK + 1 testK = 0 for x in range(nTest): if testLabel[x] != testR[x]: testK = testK + 1 print "---------------------------------------" print "train classification rate : %f " % (1 - trainK * 1.0 / nTrain) print "test classification rate : %f " % (1 - testK * 1.0 / nTest) print "---------------------------------------" if options.isSaveResult: result = shelve.open(options.resultName) result["nHid"] = nHid result["maxEpoch"] = options.maxEpoch result["trainAcc"] = 1 - trainK * 1.0 / nTrain result["testAcc"] = 1 - testK * 1.0 / nTest result.close()
# Set up a fresh game: matchmaking mode, empty player registry
# ({name: {'hand': [...], 'scored_accident': [...]}}), and draw piles built by
# shuffling the daily accident/item card maps into deques.
# NOTE(review): if `shuffle` here is random.shuffle it returns None and
# deque(None) would raise TypeError; presumably a helper that returns a
# shuffled list — confirm the import.
def __init__(self): self.gamemode = 'matchmake' # players -> {name: {'hand':list of id of item_card in hand, 'scored_accident': list of id of got accident_card}} self.players = {} self.accident_map = daily_accidents self.item_map = daily_items self.accidents = deque(shuffle(self.accident_map)) self.items = deque(shuffle(self.item_map)) self.onfield_accident = None self.onfield_item = [] self.now_use_item_num = None self.item_pool = [] self.accident_pool = []
# Python 2. Prepares train/validation splits for the decision-tree runner.
# Without external test data: optionally shuffles, then carves a validation
# slice out of x_train per args['validate'] (positive: head slice held out;
# zero: validate on the training set itself; negative: tail slice, useful for
# checking training-set error). With test data: validates on it directly.
# args['tree_size'] == 0 defaults to the full training-set size.
def run_decision_trees(x_train, y_train, x_test, y_test, args): #Allow validation without external testing data if x_test is None or np.array_equal(x_test, x_train): if args['shuffle']: x_train, y_train = util.shuffle(x_train, y_train, to_numpy_array=True) print 'no test data found' if args['validate'] > 0: validation_size = args['validate'] crashes = x_train[validation_size:] labels = y_train[validation_size:] crashes_validate = x_train[:validation_size] labels_validate = y_train[:validation_size] elif args['validate'] == 0: crashes = x_train labels = y_train crashes_validate = crashes labels_validate = labels else: #Check the training set error rate - useful for debugging validation_size = args['validate'] crashes = x_train[-validation_size:] labels = y_train[-validation_size:] crashes_validate = crashes labels_validate = labels x_test = crashes_validate else: if args['shuffle']: x_train, y_train = util.shuffle(x_train, y_train, to_numpy_array=True) crashes = x_train labels = y_train crashes_validate = x_test labels_validate = y_test if args['tree_size'] == 0: args['tree_size'] = len(crashes) decision_trees.do_stuff(crashes, labels, crashes_validate, labels_validate, x_test, args)
# Cross-validation runner: shuffle the per-class datasets, build train/test
# fold pairs, train a classifier on each fold's train splits, and accumulate
# a confusion matrix (self.confusions[predicted][ground_truth]) over the test
# splits.
# NOTE(review): with indentation collapsed it is unclear whether the fold
# loops sit inside the `for _ in range(10)` shuffle loop (10 CV repetitions)
# or after it (10 shuffles, then a single CV pass) — confirm before
# restructuring.
def run(self): for _ in range(10): util.shuffle(self.classified_data_list) for i in range(0, self.folds): self.train_test_pairs[i] = [ _split_train_test(d, i, self.folds) for d in self.classified_data_list ] for i in range(0, self.folds): classifier = self._get_classifier( [train_set for (train_set, _) in self.train_test_pairs[i]]) test_set = [(self.train_test_pairs[i][j][1], j) for j in range(0, self.num_class)] for (rows, gt) in test_set: for d in rows: class_of_data = classifier.classify(d) self.confusions[class_of_data][gt] += 1
# Python 2. Dispatches on args['method']:
#  - 'logistic-plot': plot batch gradient descent on the training data.
#  - 'logistic': standardize both splits, obtain beta via cross-validation or
#    load it from a pickle, optionally store it, average the per-fold betas,
#    then print training/testing error rates and write test labels.
#  - 'logistic-sklearn': fit sklearn LogisticRegression on shuffled,
#    standardized data and print the test score.
def run_logistic_regression(x_train, y_train, x_test, y_test, args): print 'data loaded' if args['method'] == 'logistic-plot': logistic_regression.plot_batch_gradient_descent( x_train, y_train, lam=args['lambda'], step_size=args['step_size'], iterations=args['iterations'], weight_step=False) elif args['method'] == 'logistic': x_train = logistic_regression.standardize_data(x_train) x_test = logistic_regression.standardize_data(x_test) if args['beta_file'] is None: beta = logistic_regression.calc_cross_validated_beta( x_train, y_train, lam=args['lambda'], step_size=args['step_size'], iterations=args['iterations'], weight_step=False, k=args['k'], use_nll=args['use_nll'], plot_nll=args['plot_nll']) else: inputfile = open(args['beta_file'], 'rb') beta = cPickle.load(inputfile) #Save beta if args['store_beta']: beta_dumpfile = open( 'beta{0}{1}.pkl'.format(datetime.now().hour, datetime.now().minute), 'wb') cPickle.dump(beta, beta_dumpfile) beta = np.sum(beta, axis=0) / float(len(beta)) training_labels = logistic_regression.calc_labels(x_train, beta) training_error = logistic_regression.calc_error_rate( training_labels, y_train) print 'training error rate', training_error testing_labels = logistic_regression.calc_labels(x_test, beta) testing_error = logistic_regression.calc_error_rate( testing_labels, y_test) print 'testing error rate', testing_error logistic_regression.write_labels(x_test, beta) elif args['method'] == 'logistic-sklearn': x_train = logistic_regression.standardize_data(x_train) x_test = logistic_regression.standardize_data(x_test) x_train, y_train = util.shuffle(x_train, y_train, to_numpy_array=True) logistic = LogisticRegression() logistic.fit(x_train, y_train) print logistic.score(x_test, y_test)
def get(self, id):
    """Return questionnaire *id*, optionally with a shuffled question subset.

    Query args: 'shuffle' (nonzero enables shuffling) and 'size' (number of
    questions to keep; defaults to all of them).
    """
    questionnaire = db.get_questionnaire(id)
    self.req_parser = reqparse.RequestParser()
    self.req_parser.add_argument('shuffle', default=0, type=int, required=False)
    self.req_parser.add_argument('size', default=len(questionnaire['questions']), type=int, required=False)
    self.args = self.req_parser.parse_args()
    if self.args['shuffle'] != 0:
        # Replace the ordered list with a shuffled sample of 'size' questions.
        questionnaire['questions'] = util.shuffle(
            questionnaire['questions'], self.args['size'])
    return questionnaire
def testRBM(opts) : """show how to use RBM to do classification""" # read data data = np.load(opts.feature) label = np.load(opts.label) # set the nodes of hidden layers nHid = 1000 # shuffle data and label [data, label] = util.shuffle(data, label) # decide how many samples to be used as training set percent = float(opts.trainPercent) nCase = data.shape[0] nTrain = int(nCase * percent) nTest = nCase - nTrain # split data and label into train dataset and test dataset trainData = data[0:nTrain, :] trainLabel = label[0:nTrain, :] example = data[nTrain:, :] testLabel = label[nTrain:, :] p = {"maxEpoch" : opts.maxEpoch} m = rbmFit.rbmFit(trainData, nHid, trainLabel, isSaveModel=True, name=opts.model, **p) [trainR, F1] = rbmPredict.rbmPredict(m, trainData) [testR, F2] = rbmPredict.rbmPredict(m, example) trainK = 0 for x in range(nTrain) : if trainLabel[x] != trainR[x] : trainK = trainK + 1 testK = 0 for x in range(nTest) : if testLabel[x] != testR[x] : testK = testK+1 print "---------------------------------------" print "train classification rate : %f " % (1-trainK*1.0/nTrain) print "test classification rate : %f " % (1-testK*1.0/nTest) print "---------------------------------------" if options.isSaveResult : result = shelve.open(options.resultName) result["nHid"] = nHid result["maxEpoch"] = options.maxEpoch result["trainAcc"] = 1-trainK*1.0/nTrain result["testAcc"] = 1-testK*1.0/nTest result.close()
# Python 2. Duplicate of the other run_decision_trees in this file (only
# whitespace differs): builds train/validation splits per args['validate']
# (positive: head slice held out; zero: validate on the training set;
# negative: tail slice for debugging), then delegates to
# decision_trees.do_stuff. Consider deleting one of the two copies.
def run_decision_trees(x_train, y_train, x_test, y_test, args): #Allow validation without external testing data if x_test is None or np.array_equal(x_test,x_train): if args['shuffle']: x_train, y_train = util.shuffle(x_train, y_train, to_numpy_array = True) print 'no test data found' if args['validate']>0: validation_size = args['validate'] crashes = x_train[validation_size:] labels = y_train[validation_size:] crashes_validate = x_train[:validation_size] labels_validate = y_train[:validation_size] elif args['validate']==0: crashes = x_train labels = y_train crashes_validate = crashes labels_validate = labels else: #Check the training set error rate - useful for debugging validation_size = args['validate'] crashes = x_train[-validation_size:] labels = y_train[-validation_size:] crashes_validate = crashes labels_validate = labels x_test = crashes_validate else: if args['shuffle']: x_train, y_train = util.shuffle(x_train, y_train, to_numpy_array = True) crashes = x_train labels = y_train crashes_validate = x_test labels_validate = y_test if args['tree_size']==0: args['tree_size'] = len(crashes) decision_trees.do_stuff(crashes, labels, crashes_validate, labels_validate, x_test, args)
def train_novel_classifier(sess, trainData, trainLabel, testData, testLabel, **kwargs):
    """Train the novel-class classifier for kwargs['epoch'] epochs.

    Reshuffles the training data each epoch, logs per-epoch train and test
    metrics, and saves the model when done.
    """
    net = multi_classifier(sess, test=False, **kwargs)
    for epoch in range(kwargs['epoch']):
        trainData, trainLabel = util.shuffle(trainData, trainLabel)
        train_metric = net.train_epoch(sess, trainData, trainLabel, **kwargs)
        print(epoch, "train", train_metric)
        print(epoch, "test", net.test(sess, testData, testLabel))
    net.save_model(sess, **kwargs)
# Mix-node main loop: receive UDP packets forever, classifying each by source
# address — packets from the known next hop are responses, anything else is a
# mix fragment from a client (whose address is learned on first contact).
# Once STORE_LIMIT packets accumulate in either direction, the batch is
# shuffled (to break timing/order correlation) and each packet is
# link-encrypted and forwarded.
# NOTE(review): ChannelMid.requests/responses are class-level queues shared by
# all channel instances — confirm single-threaded access.
def run(self): while True: # listen for packets packet, addr = self.incoming.recvfrom(UDP_MTU) # if the src addr of the last packet is the same as the addr of the # next hop, then this packet is a response, otherwise a mix fragment if addr == self.next_addr: self.handle_response(packet) else: if self.mix_addr is None: self.mix_addr = addr self.handle_mix_fragment(packet) # send out requests if len(ChannelMid.requests) >= STORE_LIMIT: # mix packets before sending shuffle(ChannelMid.requests) # send STORE_LIMIT packets for _ in range(STORE_LIMIT): # use bound socket to send packets packet = ChannelMid.requests.pop() enc_packet = self.request_link_encryptor.encrypt(packet) print(self, "Data/Init", "->", len(enc_packet)) self.incoming.sendto(enc_packet, self.next_addr) # send out responses if len(ChannelMid.responses) >= STORE_LIMIT: # mix packets before sending shuffle(ChannelMid.responses) # send STORE_LIMIT packets for _ in range(STORE_LIMIT): packet = ChannelMid.responses.pop() enc_packet = self.response_link_encryptor.encrypt(packet) print(self, "Data/Init", "<-", len(enc_packet)) self.incoming.sendto(enc_packet, self.mix_addr)
# Python 2. Smoke test: print util.shuffle's output (and its length) for a
# few short strings of increasing size. No assertions — manual inspection only.
def test_shuffle(self): print util.shuffle('ae') print util.shuffle('ate') f = util.shuffle('aest') print len(f), f f = util.shuffle('etaelehoyr') print len(f)
def prepare_data(df):
    """Shuffle df, fit a MinMax scaler, split 75/25, and return scaled splits.

    Returns (X_train_scaled, Y_train, X_test_scaled, Y_test).
    """
    shuffled = u.shuffle(df, 999)  # fixed seed for reproducibility
    X_all, _ = u.xy(shuffled)
    # NOTE(review): the scaler is fit on ALL rows (train + test) — possible
    # train/test leakage; confirm this is intentional.
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(X_all)
    train_df, test_df = u.split(shuffled, 0.75)
    X_train, Y_train = u.xy(train_df)
    X_test, Y_test = u.xy(test_df)
    return scaler.transform(X_train), Y_train, scaler.transform(X_test), Y_test
# Python 2. Load pickled review data/targets, apply one seeded permutation to
# both lists (the same index permutation keeps data and targets aligned), and
# split 60/40 into train and all-test subsets before delegating to
# DataGatherer.__init__.
# NOTE(review): `train_size` is assigned but never used — subset() is called
# with the literal 0.6 instead.
def __init__(self): alldata, alltargets = [], [] with open('./data/review_data') as f: alldata = pickle.load(f) with open('./data/review_targets') as f: alltargets = pickle.load(f) p = range(len(alldata)) random.seed(0) random.shuffle(p) shuffle = lambda l: [l[p[i]] for i in range(len(p))] alldata = shuffle(alldata) alltargets = shuffle(alltargets) train_size = 0.6 self.train_data, self.train_target = \ subset(alldata, alltargets, 0, 0.6) self.alltest_data, self.alltest_target = \ subset(alldata, alltargets, 0.6, 1) self.num_classes = 3 DataGatherer.__init__(self)
def train_base_classifier(sess, trainData, trainLabel, trainIndex, **kwargs):
    """Train one one-vs-rest base classifier per index list in trainIndex.

    For each class i, rows listed in trainIndex[i] become positives (label 1)
    and every other row a negative (label 0); the merged, shuffled set is fed
    to classify(), and TF variables are reused across iterations.
    """
    for i, indexlist in enumerate(trainIndex):
        pos_data = trainData[indexlist]
        pos_label = [1] * len(indexlist)
        # Everything not in indexlist is a negative example.
        neg_indices = [j for j in range(len(trainData)) if j not in indexlist]
        neg_data = trainData[neg_indices]
        neg_label = [0] * len(neg_indices)
        data, label = util.shuffle(np.concatenate((pos_data, neg_data)),
                                   np.concatenate((pos_label, neg_label)))
        classify(sess, data, label, None, None, trainNum=i, test=False, **kwargs)
        # Share weights between the per-class classifiers.
        tf.get_variable_scope().reuse_variables()
def svm(trainData, trainLabel, testData, testLabel, **kwargs):
    """Train a LinearSVC on shuffled data and report test accuracy.

    Returns:
        (best predictions on the test set, best accuracy across the runs).

    NOTE: the original body contained a second, SVC-based evaluation loop
    placed AFTER the return statement — unreachable dead code — which has
    been removed. Restore it before the return if it was ever intended to run.
    """
    print(kwargs)
    linearSVC_clf = LinearSVC()
    acc_list = []
    acc_max = 0
    ret = []
    shuffleTimes = 1
    for i in range(shuffleTimes):
        print(i + 1, '/', shuffleTimes)
        trainData, trainLabel = util.shuffle(trainData, trainLabel)
        linearSVC_clf.fit(trainData, trainLabel)
        acc_i = linearSVC_clf.score(testData, testLabel)
        acc_list.append(acc_i)
        print("LinearSVC accuracy: ", np.mean(np.array(acc_list)))
        if acc_i > acc_max:
            acc_max = acc_i
            ret = linearSVC_clf.predict(testData)
    acc = np.mean(np.array(acc_list))
    print("LinearSVC accuracy: ", acc)
    return ret, acc_max
def next_mixed(self):
    """Build a batch mixing batch_size//N samples from each of the N sources.

    Fills self.mixed with randomly chosen samples per source, labels each
    slice with its source index, then applies one global permutation so the
    sources are interleaved. Returns (self.mixed, self.label).
    """
    sample = next(self)
    per_src = self.arg.batch_size // self.N
    for src in range(self.N):
        picks = torch.randperm(self.arg.batch_size)[:per_src]
        self.mixed[src * per_src:(src + 1) * per_src].copy_(sample[src][picks])
        self.label[src * per_src:(src + 1) * per_src].fill_(src)
    perm = util.shuffle(self.arg.batch_size)
    self.mixed = self.mixed[perm]
    self.label = self.label[perm]
    return self.mixed, self.label
# Close registration: switch the game to 'ongoing', fix a shuffled player
# order (deque capped at the player count), set the first turn player, and
# build the Japanese announcement message listing the order and each player's
# dealt items.
# NOTE(review): assumes `shuffle` returns the shuffled sequence — if it is
# random.shuffle (returns None), deque(None) raises; confirm the import.
def __close_register(self): self.gamemode = 'ongoing' self.player_order = deque(shuffle(self.players), maxlen=len(self.players)) self.turn_player = self.player_order[0] msg = '登録を締め切ったわ,順番はこんな感じね\n' for idx, name in enumerate(self.player_order): ordermsg = str(idx + 1) + ': ' + name + '\n' msg += ordermsg msg += '準備できたら手番のプレイヤーは`/turn`と入力するのよ\n' deal_item_msg = '\n'.join( [self.__deal_items(player) for player in self.player_order]) msg += deal_item_msg return msg
def fineTune(sess, trainData, trainLabel, testData, testLabel, **kwargs):
    """Fine-tune the default network for 20 epochs, tracking the best test run.

    Returns (inference output of the best epoch, best test accuracy).
    """
    net = network.Network(sess, model="default", **kwargs)
    best_acc = 0
    best_inference = []
    for _ in range(20):
        trainData, trainLabel = util.shuffle(trainData, trainLabel)
        print(network.train_epoch(net, sess, trainData, trainLabel, **kwargs))
        acc = net.test(sess, testData, testLabel)[1]
        print(acc)
        if acc > best_acc:
            best_acc = acc
            # Keep the inference output from the best-scoring epoch.
            best_inference = net.inference(sess, testData)[0]
    print(best_acc)
    return best_inference, best_acc
def prepare_data(df):
    """Shuffle, split 75/25, max-abs scale, and append a bias column of ones.

    Returns (X_train, Y_train, X_test, Y_test) with the bias column appended
    to both feature matrices.
    """
    shuffled = u.shuffle(df, 999)  # fixed seed for reproducibility
    train_df, test_df = u.split(shuffled, 0.75)
    X_train, Y_train = u.xy(train_df)
    X_test, Y_test = u.xy(test_df)

    def _scale_with_bias(X):
        # Scale each feature into [-1, 1], then append a constant bias term.
        X = preprocessing.maxabs_scale(X)
        return np.hstack((X, np.ones((X.shape[0], 1))))

    return _scale_with_bias(X_train), Y_train, _scale_with_bias(X_test), Y_test
# Python 2. Duplicate of the other run_logistic_regression in this file (only
# whitespace differs): dispatches on args['method'] to plot gradient descent,
# run/reload a cross-validated logistic regression (averaging the per-fold
# betas before scoring), or fit sklearn's LogisticRegression on shuffled,
# standardized data. Consider deleting one of the two copies.
def run_logistic_regression(x_train, y_train, x_test, y_test, args): print 'data loaded' if args['method'] == 'logistic-plot': logistic_regression.plot_batch_gradient_descent(x_train, y_train, lam=args['lambda'], step_size = args['step_size'], iterations = args['iterations'], weight_step = False) elif args['method'] == 'logistic': x_train = logistic_regression.standardize_data(x_train) x_test = logistic_regression.standardize_data(x_test) if args['beta_file'] is None: beta = logistic_regression.calc_cross_validated_beta(x_train, y_train, lam=args['lambda'], step_size = args['step_size'], iterations=args['iterations'], weight_step = False, k=args['k'], use_nll = args['use_nll'], plot_nll = args['plot_nll']) else: inputfile = open(args['beta_file'], 'rb') beta = cPickle.load(inputfile) #Save beta if args['store_beta']: beta_dumpfile = open('beta{0}{1}.pkl'.format( datetime.now().hour, datetime.now().minute), 'wb') cPickle.dump(beta, beta_dumpfile) beta = np.sum(beta, axis=0)/float(len(beta)) training_labels = logistic_regression.calc_labels(x_train, beta) training_error = logistic_regression.calc_error_rate(training_labels, y_train) print 'training error rate', training_error testing_labels = logistic_regression.calc_labels(x_test, beta) testing_error = logistic_regression.calc_error_rate(testing_labels, y_test) print 'testing error rate', testing_error logistic_regression.write_labels(x_test, beta) elif args['method'] == 'logistic-sklearn': x_train = logistic_regression.standardize_data(x_train) x_test = logistic_regression.standardize_data(x_test) x_train, y_train = util.shuffle(x_train, y_train, to_numpy_array = True) logistic = LogisticRegression() logistic.fit(x_train, y_train) print logistic.score(x_test, y_test)
def linearReg(trainData, trainLabel, testData, testLabel, **kwargs):
    """Fit linear regression on 10 reshuffles; return the mean test R^2 score.

    Shuffling does not change the least-squares solution, but the loop mirrors
    the evaluation structure used by the other classifiers in this module.
    """
    print(kwargs)
    model = linear_model.LinearRegression()
    trainData = util.normalization(trainData)
    testData = util.normalization(testData)
    scores = []
    for i in range(10):
        X, y = util.shuffle(trainData, trainLabel)
        model.fit(X, y)
        score = model.score(testData, testLabel)
        print("%d acc: " % i, score)
        scores.append(score)
    acc = np.mean(np.array(scores))
    print("Linear Regression accuracy: ", acc)
    return acc
def decisionTree(trainData, trainLabel, testData, testLabel, **kwargs):
    """Train a fresh decision tree on 10 reshuffles; return mean test accuracy."""
    print(kwargs)
    trainData = util.normalization(trainData)
    testData = util.normalization(testData)
    scores = []
    for i in range(10):
        X, y = util.shuffle(trainData, trainLabel)
        # A new tree each round so runs are independent.
        tree = DecisionTreeClassifier()
        tree.fit(X, y)
        score = tree.score(testData, testLabel)
        scores.append(score)
        print("%d acc, %.3f" % (i, score))
    acc = np.mean(np.array(scores))
    print("Decision Tree accuracy: ", acc)
    return acc
# Scheduled job: log in to Instagram, follow users found under each of the
# account's tags (stopping early when insta.follow reports finished), persist
# a Following row per new follow, record the count, and schedule the next run.
# NOTE(review): as in like(), `tags = shuffle(tag_names)` must return a list
# (random.shuffle returns None) — confirm which `shuffle` is imported.
# NOTE(review): if Insta() itself raises, `insta` is unbound and the trailing
# insta.driver.quit() raises NameError — consider a guard/finally.
def follow(job, session=None): new_follows = [] count = 0 try: insta = Insta() insta.login(username=job.i_user.username, password=job.i_user.get_password()) time.sleep(1) # get users tags and shuffles them tag_names = [str(tag) for tag in job.i_user.tags] tags = shuffle(tag_names) for tag in tags: insta.search(tag) users, finished = insta.follow(tag) count += len(users) new_follows += users if finished is True: break time.sleep(5) except Exception as e: job.error = '{}: {}'.format(type(e), e) if len(new_follows) > 0: for user in new_follows: f = Following() f.timestamp = time.time() f.i_user = job.i_user f.other_user = user session.add(f) session.commit() job.count = count job.finish() # new run for jobs new_job = schedule_next_job(job, 1.5 * rando_hour()) session.add(new_job) session.commit() insta.driver.quit() return job
# Training loop for the KDD99 anomaly model: resets the global step, loads the
# dataset, then per epoch shuffles the training data and runs batched
# optimizer steps (logging TF summaries per update); every 10 epochs evaluates
# accuracy/precision/recall/F1 and appends a CSV row to step.txt; saves a
# single checkpoint at the end.
# NOTE(review): `output_save_step` is assigned but unused in the visible body.
def train(self): log.infov("Training Starts!") output_save_step = 1000 self.session.run(self.global_step.assign(0)) # reset global step from data_loader import load_kdd99 x_train, x_test, y_train, y_test = load_kdd99('kdd_cup.npz', self.config.seed) n_updates = 0 with open(self.res_dir + "/step.txt", 'w') as f: for e in range(1, 1 + self.config.n_epochs): x_train, y_train = shuffle(x_train, y_train) n_train = len(x_train) max_batches = n_train // self.config.batch_size #if n_train % self.config.batch_size != 0: max_batches+=1 for x_batch, y_batch in tqdm(iter_data( x_train, y_train, size=self.config.batch_size), total=max_batches): step, summary, loss, step_time = self.run_single_step( x_batch) self.summary_writer.add_summary(summary, global_step=n_updates) n_updates += 1 #if n_updates % 100 == 0: # eng, eng_chk = self.session.run([self.model.energy, self.model.energy_check], feed_dict=self.model.get_feed_dict(x_batch)) # print(np.mean(eng), np.mean(eng_chk)) if e % 10 == 0: accuracy, precision, recall, f_score = self.evaluate( x_train, y_train, x_test, y_test) f.write(self.filepath + ',' + repr(e) + ',' + repr(accuracy) + ',' + repr(precision) + ',' + repr(recall) + ',' + repr(f_score) + '\n') f.flush() # save model at the end self.saver.save(self.session, os.path.join(self.res_dir, 'model'), global_step=step)
def get_shuffled_seeds(num_participants):
    """Return bucket-randomized seeds for a num_participants tournament.

    Participants are grouped into buckets by projected placement; only the
    order WITHIN each bucket is randomized, so overall seeding strength is
    preserved while matchups still vary between tournaments.

    Args:
        num_participants: The number of participants in the tournament.

    Returns:
        A list of seeds: for a given seed X, index X - 1 holds that
        participant's randomized tournament seed.
    """
    randomized_buckets = [util.shuffle(bucket)
                          for bucket in _get_buckets(num_participants)]
    # Buckets run from last place to first place; reverse so the flattened
    # seeds go from first to last.
    return util.flatten(reversed(randomized_buckets))
# Python 2. End-to-end DBN classification demo: shuffle features/labels, split
# train/test by opts.trainPercent, fit a DBN with hidden layers [5000, 2000]
# (BB layer 1), predict both splits, print classification rates, and
# optionally shelve the run's parameters, accuracies, and predictions.
def testDBN(opts) : """show how to use DBN to do classification""" # read data data = np.load(opts.feature) label = np.load(opts.label) # set the nodes of hidden layers nHid = [5000, 2000] # shuffle data and label [data, label] = util.shuffle(data, label) # decide how many samples to be used as training set percent = float(opts.trainPercent) nCase = data.shape[0] nTrain = int(nCase * percent) nTest = nCase - nTrain # split data and label into train dataset and test dataset trainData = data[0:nTrain, :] trainLabel = label[0:nTrain, :] example = data[nTrain:, :] testLabel = label[nTrain:, :] # set parameters # layer1 p1 = {"maxEpoch" : opts.maxEpoch, "modelType" : "BB"} # layer2 p2 = {"maxEpoch" : opts.maxEpoch} p = {"layer1" : p1, "layer2" : p2} # train the DBN model model = DBNFit.DBNFit(trainData, trainLabel, nHid, name=opts.model, isSingleDBN=True, **p) # do prediction for training set and testing set [trainR, F1] = DBNPredict.DBNPredict(model, trainData, isSingleDBN=True) [testR, F2] = DBNPredict.DBNPredict(model, example, isSingleDBN=True) # calculate classification accuracy trainK = 0 for x in range(nTrain) : if trainLabel[x] != trainR[x] : trainK = trainK+1 testK = 0 for x in range(nTest) : if testLabel[x] != testR[x] : testK = testK+1 print "---------------------------------------" print "train classification rate : %f " % (1 - trainK*1.0/nTrain) print "test classification rate : %f " % (1 - testK*1.0/nTest) print "---------------------------------------" if opts.isSaveResult : result = shelve.open(opts.resultName) result["nHid"] = nHid result["maxEpoch"] = opts.maxEpoch result["trainPercent"] = opts.trainPercent result["trainAcc"] = 1-trainK*1.0/nTrain result["testAcc"] = 1-testK*1.0/nTest result["trainLabel"] = trainLabel result["trainR"] = trainR result["testLabel"] = testLabel result["testR"] = testR result.close()
def main(param=None):
    """Train a context-window GRU tagger on the SemEval relation-extraction
    data, with early stopping on validation F1 (scored via conlleval) and
    halving learning-rate decay when validation F1 stops improving.

    param: optional dict of hyperparameters; a default set is used if None.
    """
    if not param:
        param = {'lr': 0.0970806646812754,
                 'verbose': 1,
                 'decay': True,  # decay on the learning rate if improvement stops
                 'win': 7,  # number of words in the context window
                 'nhidden': 200,  # number of hidden units
                 'seed': 345,
                 'emb_dimension': 50,  # dimension of word embedding
                 'nepochs': 100,  # 60 is recommended
                 'savemodel': False}
    print param

    # Working directory for conlleval temp files and best-model output.
    folder = "RelationExtraction"
    if not os.path.exists(folder):
        os.mkdir(folder)

    # load dataset (pre-pickled SemEval splits plus word/label vocabularies)
    pickle_file = 'semeval.pkl'
    with open(pickle_file, 'rb') as f:
        save = pickle.load(f)
        train_dataset = save['train_dataset']
        train_labels = save['train_labels']
        test_dataset = save['test_dataset']
        test_labels = save['test_labels']
        dic = save['dicts']
        del save  # hint to help gc free up memory
        print('Training set', train_dataset.shape, train_labels.shape)
        print('Test set', test_dataset.shape, test_labels.shape)

    # Convert every sentence to an int32 index array.
    train_dataset = [np.array(x, dtype=np.int32) for x in train_dataset]
    train_labels = [np.array(x, dtype=np.int32) for x in train_labels]
    x_test = [np.array(x, dtype=np.int32) for x in test_dataset]
    y_test = [np.array(x, dtype=np.int32) for x in test_labels]

    # Fixed train/validation split.
    # NOTE(review): sample 7200 is skipped — train ends at 7200 (exclusive)
    # but validation starts at 7201; likely an off-by-one. Confirm intent.
    x_train = train_dataset[0:7200]
    y_train = train_labels[0:7200]
    x_valid = train_dataset[7201:8000]
    y_valid = train_labels[7201:8000]

    # Index <-> word / label lookup tables for decoding predictions.
    w2idx, labels2idx = dic['words2idx'], dic['labels2idx']
    idx2w = dict((v, k) for k, v in w2idx.iteritems())
    idx2la = dict((v, k) for k, v in labels2idx.iteritems())

    vocsize = len(idx2w)
    nclasses = len(idx2la)
    nsentences = len(x_train)

    # Decode ground truth once; conlleval compares label strings.
    groundtruth_valid = [map(lambda x: idx2la[x], y) for y in y_valid]
    words_valid = [map(lambda x: idx2w[x], w) for w in x_valid]
    groundtruth_test = [map(lambda x: idx2la[x], y) for y in y_test]
    words_test = [map(lambda x: idx2w[x], w) for w in x_test]

    # instanciate the model
    np.random.seed(param['seed'])
    random.seed(param['seed'])
    rnn = GRUTheano(word_dim=param['emb_dimension'],
                    window_context_size=param['win'],
                    vocab_size=vocsize,
                    num_labels=nclasses,
                    hidden_dim=param['nhidden'])
    #rnn = RNNSLU_LSTM(hidden_dim=param['nhidden'], num_labels=nclasses, vocab_size=vocsize, word_dim=param['emb_dimension'], window_context_size=param['win'])

    # train with early stopping on validation set
    best_f1 = -np.inf
    param['clr'] = param['lr']  # clr = current (possibly decayed) learning rate
    for e in xrange(param['nepochs']):
        # shuffle
        # NOTE(review): the same seed is passed every epoch, so if shuffle()
        # reseeds from it, every epoch sees the identical ordering — confirm.
        shuffle([x_train, y_train], param['seed'])

        param['ce'] = e  # ce = current epoch
        tic = timeit.default_timer()
        for i, (x, y) in enumerate(zip(x_train, y_train)):
            rnn.train(x, y, param['win'], param['clr'])
            # Trailing commas + '\r' produce an in-place progress line.
            print '[learning] epoch %i >> %2.2f%%' % (
                e, (i + 1) * 100. / nsentences),
            print 'completed in %.2f (sec) <<\r' % (timeit.default_timer() - tic),
            sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [map(lambda x: idx2la[x],
                                rnn.classify(np.asarray(
                                    contextwin(x, param['win'])).astype('int32')))
                            for x in x_test]
        predictions_valid = [map(lambda x: idx2la[x],
                                 rnn.classify(np.asarray(
                                     contextwin(x, param['win'])).astype('int32')))
                             for x in x_valid]

        # evaluation // compute the accuracy using conlleval.pl
        res_test = conlleval(predictions_test, groundtruth_test,
                             words_test, folder + '/current.test.txt', folder)
        res_valid = conlleval(predictions_valid, groundtruth_valid,
                              words_valid, folder + '/current.valid.txt', folder)

        if res_valid['f1'] > best_f1:
            # New best on validation: snapshot the model and the metrics.
            if param['savemodel']:
                rnn.save(folder)
            best_rnn = copy.deepcopy(rnn)
            best_f1 = res_valid['f1']
            if param['verbose']:
                print('NEW BEST: epoch', e,
                      'valid F1', res_valid['f1'],
                      'best test F1', res_test['f1'])
            param['vf1'], param['tf1'] = res_valid['f1'], res_test['f1']
            param['vp'], param['tp'] = res_valid['p'], res_test['p']
            param['vr'], param['tr'] = res_valid['r'], res_test['r']
            param['be'] = e  # be = best epoch
            subprocess.call(['mv', folder + '/current.test.txt',
                             folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt',
                             folder + '/best.valid.txt'])
        else:
            if param['verbose']:
                print ''

        # learning rate decay if no improvement in 10 epochs
        if param['decay'] and abs(param['be'] - param['ce']) >= 10:
            param['clr'] *= 0.5
            rnn = best_rnn  # restart from the best snapshot after decaying
        if param['clr'] < 1e-5:
            break

    print('BEST RESULT: epoch', param['be'],
          'valid F1', param['vf1'],
          'best test F1', param['tf1'],
          'with the model', folder)
def main(data_dir, active_participant_counter, bag_size, held_out_bag_size, test_bag_size, M, N, K, \
        clf_name, eta, kernel, cv_method, cv, n_iter, n_jobs, n_trials, verbose, save, description):
    """
    Leave-one-participant-out evaluation of single-instance and
    multiple-instance classifiers, with cross-validated hyperparameter search.

    @param data_dir : The directory in which the data is located. The directory should contain
        a load_data.py script with a load_data() method, which returns the feature
        representation of the dataset.
    @param active_participant_counter : Index of the held-out test participant.
    @param bag_size : The size of the training bags. Use -1 for sessions.
    @param held_out_bag_size : The size of the training bags from the held-out participant.
        Use -1 for sessions.
    @param test_bag_size : The size of the test bags. Use -1 for sessions.
    @param M : The number of labeled training instances.
    @param N : The number of labeled training bags.
    @param K : The number of labeled training instances from the held-out participant.
    @param clf_name : Classifier; one of 'SVM', 'LinearSVC', 'RF', 'SIL', 'LinearSIL',
        'sMIL', 'sbMIL', 'MIForest' or 'misvm'.
    @param eta : If the classifier used is sbMIL, eta is the expected density of positive
        instances in positive bags, between 0.0 and 1.0.
    @param kernel : If a non-linear SVM-based classifier is used, the kernel can be
        specified, i.e. 'rbf', 'linear_av', etc.
    @param cv_method : The search method for cross-validation; either 'grid' or 'randomized'.
    @param cv : Number of cross-validation folds.
    @param n_iter : If the cross-validation search method is 'randomized', then n_iter is
        the number of randomly sampled parameter tuples.
    @param n_jobs : The number of jobs, -1 for full parallelization.
    @param n_trials : Number of trials, in case randomness is introduced in each trial.
    @param verbose : Indicates the level of detail to be displayed during run-time.
    @param save : The path of the file where results are stored.
    @param description : Description of the evaluation to be saved with the results.
    """
    # Load the dataset via the project-specific load_data.py in data_dir.
    sys.path.insert(0, data_dir)
    from load_data import load_data
    dataset = load_data(data_dir)
    X = dataset['data']['X']
    Y = dataset['data']['Y']
    session_start = dataset['data']['sessions']['start']
    session_labels = dataset['data']['sessions']['labels']
    print data_dir
    print dataset['description']

    # Instantiate the requested classifier.
    if clf_name == 'RF':
        clf = RandomForestClassifier(n_estimators=185, verbose=(verbose>1))
    elif clf_name == 'SVM':
        clf = SVC(kernel=kernel, verbose=(verbose>1))
    elif clf_name == 'SIL':
        clf = misvm.SIL(kernel=kernel, C=1.0, verbose=(verbose>1))
    elif clf_name == 'MIForest':
        clf = MIForest(n_estimators=50, directory="miforest", prefix="eating")
    elif clf_name == 'sMIL':
        clf = misvm.sMIL(kernel=kernel, C=1.0, verbose=(verbose>1))
    elif clf_name == 'sbMIL':
        clf = misvm.sbMIL(kernel=kernel, eta=eta, C=1.0, verbose=(verbose>1))
    elif clf_name == 'misvm':
        clf = misvm.MISVM(kernel=kernel, C=1.0, verbose=(verbose>1))
    elif clf_name == 'LinearSIL':
        clf = misvm.LinearSIL(C=1.0)
    elif clf_name == 'LinearSVC':
        clf = LinearSVC(C=1.0)

    #class weights are determined by a Farey sequence to make sure that redundant pairs,
    #i.e. (1,1) = (2,2), (2,3) = (4,6), etc. are not included.
    class_weights = [{1 : i, -1 : j} for (i,j) in farey(25)[1:]] #ignore first value where i=0
    class_weights.extend([{1 : j, -1 : i} for (i,j) in farey(25)[1:]]) #swap i and j, ignore first value
    # Hyperparameter search ranges (powers of 2 for C and gamma).
    C_array = np.logspace(-5, 15, 21, base=2).tolist()
    gamma_array = np.logspace(-15, 3, 19, base=2).tolist()
    eta_array = np.linspace(0,1,9).tolist()
    n_estimators_array = [25,50,75,100,125,150]

    # Only include the parameters that apply to the chosen classifier.
    param_grid = {}
    if clf_name in {'RF', 'MIForest'}:
        param_grid.update({'n_estimators' : n_estimators_array})
    if clf_name in {'SIL', 'sMIL', 'sbMIL', 'RF', 'SVM', 'LinearSVC'}:
        param_grid.update({'class_weight' : class_weights})
    if clf_name in {'SIL', 'sMIL', 'sbMIL', 'misvm', 'SVM', 'LinearSIL', 'LinearSVC'}:
        param_grid.update({'C' : C_array})
    if clf_name in {'SIL', 'sMIL', 'sbMIL', 'misvm', 'SVM'} and kernel == 'rbf':
        param_grid.update({'gamma' : gamma_array})
    if clf_name == 'sbMIL':
        param_grid.update({'eta' : eta_array})

    # Bookkeeping saved alongside the results.
    data_params = {"Number of Training Bags": N,
                   "Number of Single-Instance Bags" : M,
                   "Test Participant": active_participant_counter}
    cv_params = {"K-Fold": cv, "Method": cv_method,
                 "Parameter Grid" : param_grid, "Number of Iterations": n_iter}
    params = { "Bag Size": bag_size, \
               "Data": data_params, \
               "Classifier": str(clf), \
               "Number of Trials": n_trials, \
               "CV": cv_params \
             }
    # Confusion matrices accumulate (+=) across trials.
    results = { "Confusion Matrix" : {"Training" : np.zeros((2,2)), "Test" : np.zeros((2,2))}, \
                "Precision": {"Training" : 0.0, "Test" : 0.0}, \
                "Recall": {"Training" : 0.0, "Test" : 0.0}, \
                "F1 Score": {"Training" : 0.0, "Test" : 0.0, "Validation" : 0.0} \
              }

    participant_indices = range(len(X))
    n_si_participants = 5
    n_bag_participants = len(X) - n_si_participants - 1

    if verbose:
        pprint_header("Train Model for Participant: " +
                      str(active_participant_counter + (active_participant_counter>=13) + 1))

    for T in xrange(1,n_trials+1): #allow multiple trials to account for randomness
        pprint_header("Trial: " + str(T))

        #indices for participants in training data; skip active participant counter:
        train_indices = participant_indices[:active_participant_counter] + \
                        participant_indices[active_participant_counter+1:]
        si_participant_indices = train_indices[:n_si_participants]
        # NOTE(review): the slice below starts at n_si_participants+1, which
        # drops one participant between the SI group and the bag group
        # (a sibling function uses n_si_participants) — confirm intended.
        bag_participant_indices = train_indices[n_si_participants+1:n_si_participants+n_bag_participants+1]

        #single-instance training data:
        X_SI = np.vstack([X[k] for k in si_participant_indices])
        Y_SI = np.hstack([Y[k] for k in si_participant_indices])

        #bag-level training data:
        X_B = np.vstack([X[k] for k in bag_participant_indices])
        Y_B = np.hstack([Y[k] for k in bag_participant_indices])

        #test data
        X_test = X[active_participant_counter]
        Y_test = Y[active_participant_counter]

        #convert to bags:
        if clf_name in MIL:
            # Each single instance becomes a singleton bag.
            X_SI = [X_SI[k:k+1, :] for k in xrange(len(X_SI))]
            Y_SI = [max(Y_SI[k:k+1]) for k in xrange(len(Y_SI))]

            if bag_size == -1:
                X_B, Y_B, _ = single_instances_to_sessions(X, Y, session_labels, session_start, bag_participant_indices)
            else:
                X_B = [X_B[k:k+bag_size, :] for k in xrange(0, len(X_B), bag_size)]
                Y_B = [max(Y_B[k:k+bag_size]) for k in xrange(0, len(Y_B), bag_size)]

            if held_out_bag_size == -1:
                X_T, Y_T, Y_si = single_instances_to_sessions(X, Y, session_labels, session_start, [active_participant_counter])
            else:
                # NOTE(review): slices are bag_size wide but the stride is
                # held_out_bag_size; when the two differ, bags overlap or
                # skip instances — likely both should use held_out_bag_size.
                X_T = [X_test[k:k+bag_size, :] for k in xrange(0,len(X_test), held_out_bag_size)]
                Y_si = [Y_test[k:k+bag_size] for k in xrange(0,len(Y_test), held_out_bag_size)]
                Y_T = [max(y_t) for y_t in Y_si]

            X_T, Y_T, Y_si = shuffle(X_T, Y_T, Y_si)

            # convert remaining bags back to test instances
            X_test = []
            Y_test = []
            for i, (x_t, y_si) in enumerate(zip(X_T, Y_si)[K:]):
                for (x,y) in zip(x_t, y_si):
                    X_test.append(x)
                    Y_test.append(y)

            X_test = [np.asarray(X_test)[k:k+test_bag_size, :] for k in xrange(0, len(X_test), test_bag_size)]
            Y_test = [max(Y_test[k:k+test_bag_size]) for k in xrange(0, len(Y_test), test_bag_size)]
        else: # standard supervised learning case
            # NOTE(review): the second assignment is a duplicate of the
            # first; it almost certainly should be Y_T = Y_test[:K] — as
            # written, Y_T is undefined in this branch and the later
            # `Y_train += Y_T[:K]` (when K > 0) would raise NameError.
            X_T = X_test[:K]
            X_T = X_test[:K]
            X_test = X_test[K:]
            Y_test = Y_test[K:]

        # Negative M/N mean "use everything available".
        if N < 0:
            N=len(X_B)
        if M < 0:
            M=len(X_SI)

        X_SI, Y_SI = shuffle(X_SI, Y_SI)
        X_B, Y_B = shuffle(X_B, Y_B)
        X_test, Y_test = shuffle(X_test, Y_test)

        #combine into single training data set with mixed bags and single-instances
        X_train = []
        Y_train = []
        if M > 0:
            X_train += X_SI[:M]
            Y_train += Y_SI[:M]
        if K > 0:
            X_train += X_T[:K]
            Y_train += Y_T[:K]
        if N > 0:
            X_train += X_B[:N]
            Y_train += Y_B[:N]

        # CV folds keep instance-level and bag-level data on fixed sides.
        if test_bag_size > 1:
            cv_iterator = mil_train_test_split(X_SI[:M], X_T[:K] + X_B[:N], cv)
        else:
            cv_iterator = mil_train_test_split(X_SI[:M] + X_T[:K], X_B[:N], cv)

        if clf_name in MIL:
            print ("Total number of bags : %d" %len(X_train))
            print ("Feature Dimensionality: %d " %X_train[0].shape[1])
        else:
            print ("Total number of instances : %d" %len(X_train))
            print("Feature Dimensionality %d " %len(X_train[0]))
        sys.stdout.flush()

        # Hyperparameter search over param_grid.
        if cv_method == 'grid':
            gs = GridSearchCV(clf, param_grid, scoring=score, cv=cv_iterator,
                              verbose=verbose, n_jobs = n_jobs)
        elif cv_method == 'randomized': #scoring='f1_weighted'
            gs = RandomizedSearchCV(clf, param_distributions=param_grid, scoring=score,
                                    cv=cv, n_jobs = n_jobs, n_iter=n_iter, verbose=verbose)

        t0 = time()
        gs = gs.fit(X_train, Y_train)
        tf = time()
        print("Time elapsed: %0.2f seconds." %(tf-t0))
        print("Best params: ")
        print(gs.best_params_)
        print("Best F1-score on training data: %0.2f%%" %(100*gs.best_score_))
        results['F1 Score']['Validation'] += gs.best_score_

        if clf_name == 'MIForest':
            #for MIForest, we need to pass in Y as well
            #check training accuracy to start:
            y_pred = 2*np.greater(gs.best_estimator_.predict(X_train, Y_train),0)-1
        else:
            #check training accuracy to start:
            y_pred = 2*np.greater(gs.best_estimator_.predict(X_train),0)-1

        conf = confusion_matrix(Y_train, y_pred, [-1,+1])
        print("Confusion matrix on the training data:")
        print(conf)
        results['Confusion Matrix']['Training'] += conf

        if clf_name == 'MIForest':
            y_pred = 2*np.greater(gs.best_estimator_.predict(X_test, Y_test),0)-1
        else:
            y_pred = 2*np.greater(gs.best_estimator_.predict(X_test),0)-1
        conf = confusion_matrix(Y_test, y_pred, [-1,+1])
        print("Confusion matrix on the test data:")
        print(conf)
        results['Confusion Matrix']['Test'] += conf

    # Summarize metrics from the accumulated confusion matrices.
    pprint_header("Results")
    conf = results['Confusion Matrix']['Training']
    avg_precision, avg_recall, avg_fscore = accuracy_precision_recall_fscore(conf)[1][1]
    results['F1 Score']['Training'] = avg_fscore
    results['Precision']['Training'] = avg_precision
    results['Recall']['Training'] = avg_recall
    print("Average Precision on the training data: %0.2f%%" %(100*avg_precision))
    print("Average Recall on the training data: %0.2f%%" %(100*avg_recall))
    print("Average F1 Score on the training data: %0.2f%%\n" %(100*avg_fscore))

    conf = results['Confusion Matrix']['Test']
    avg_precision, avg_recall, avg_fscore = accuracy_precision_recall_fscore(conf)[1][1]
    results['F1 Score']['Test'] = avg_fscore
    results['Precision']['Test'] = avg_precision
    results['Recall']['Test'] = avg_recall
    print("Average Precision on the test data: %0.2f%%" %(100*avg_precision))
    print("Average Recall on the test data: %0.2f%%" %(100*avg_recall))
    print("Average F1 Score on the test data: %0.2f%%\n" %(100*avg_fscore))

    if save != 'none':
        print("Saving results to %s ..." %save)
        evaluation = {"Parameters" : params, "Results" : results}
        with open(save, 'wb') as f:
            pickle.dump(evaluation, f)
def rbmFit(X, numHid, y, isSaveModel=False, name=None, **kwargs) :
    """
    Train a classification RBM (visible + label units) on the GPU via cudamat.

    X              ... data. should be binary, or in [0,1] interpreted as
                   ... probabilities
    numHid         ... number of hidden units
    y              ... List of discrete labels

    nClass          number of classes
    method          CD or SML
    eta             learning rate
    momentum        momentum for smoothness amd to prevent overfitting
                    NOTE: momentum is not recommended with SML
    maxepoch        # of epochs: each is a full pass through train data
    avglast         how many epochs before maxepoch to start averaging
                    before. Procedure suggested for faster convergence by
                    Kevin Swersky in his MSc thesis
    batchsize       The number of training instances per batch
    verbose         For printing progress

    model.weight         The weights of the connections
    model.biasH          The biases of the hidden layer
    model.biasV          The biases of the visible layer
    model.weightlabel    ... The weights on labels layer
    model.biasLabel      ... The biases on labels layer
    errors               The errors in reconstruction at each epoch
    """
    # Merge caller kwargs with defaults.
    arg = util.processOptions(kwargs, \
            nClass = np.unique(y).size, \
            method = "CD", \
            eta = 0.1, \
            momentum = 0.5,\
            maxEpoch = 500, \
            avgLast = 0, \
            penalty = 0, \
            batchSize = 100, \
            verbose = True)
    [nClass, method, eta, momentum, maxEpoch, avgLast, penalty, batchSize, verbose] = [\
            arg["nClass"],\
            arg["method"],\
            arg["eta"],\
            arg["momentum"],\
            arg["maxEpoch"],\
            arg["avgLast"],\
            arg["penalty"],\
            arg["batchSize"],\
            arg["verbose"] ]

    if verbose :
        print "Processing data ..."

    # from which step, we start to compute the average
    # avgStart = maxEpoch - avgLast

    # for weight decay use
    # oldPenalty = penalty

    # numCases : number of example
    # numDims : the length of each example
    # each row is an example
    [numCases, numDims] = list(X.shape)

    numVis = numDims
    uniqueLabel = np.unique(y)
    numBatch = util.ceil(numCases, batchSize)

    # One-hot encode the labels.
    y = util.matrixLabel(y)

    # shuffle data and label
    data = copy.deepcopy(X)
    [data, label] = util.shuffle(data, y)

    # init CUDA
    cm.cublas_init()
    cm.CUDAMatrix.init_random(100)
    deviceData = cm.CUDAMatrix(cm.reformat(data))
    deviceLabel = cm.CUDAMatrix(cm.reformat(label))

    # init weights
    weight = cm.CUDAMatrix(0.1*np.random.randn(numVis,numHid))
    biasV = cm.CUDAMatrix(np.zeros((1, numVis)))
    biasH = cm.CUDAMatrix(np.zeros((1, numHid)))
    weightLabel = cm.CUDAMatrix(0.1*np.random.randn(nClass, numHid))
    biasLabel = cm.CUDAMatrix(np.zeros((1,nClass)))

    # init weight update (momentum buffers)
    weightInc = cm.CUDAMatrix(np.zeros((numVis,numHid)))
    biasVInc = cm.CUDAMatrix(np.zeros((1,numVis)))
    biasHInc = cm.CUDAMatrix(np.zeros((1,numHid)))
    weightLabelInc = cm.CUDAMatrix(np.zeros((nClass, numHid)))
    biasLabelInc = cm.CUDAMatrix(np.zeros((1,nClass)))

    #init temporary storage
    visActP = cm.empty((batchSize, numVis))
    hidActP = cm.empty((batchSize, numHid))
    hidState = cm.empty((batchSize, numHid))

    for epoch in range(maxEpoch) :
        error = []

        for batch in range(numBatch) :
            # train each data batch
            if batchSize*(batch+1) > numCases :
                # Final partial batch: re-size the temporaries to fit.
                # NOTE(review): batchSize is overwritten here and never
                # restored, so every batch of every later epoch uses the
                # shrunken size while numBatch stays fixed — part of the
                # data is then never visited. Confirm/fix.
                visTrue = deviceData.get_row_slice(batchSize*batch, numCases)
                labelTrue = deviceLabel.get_row_slice(batchSize*batch, numCases)
                batchSize = visTrue.shape[0]
                visActP = cm.empty((batchSize, numVis))
                hidActP = cm.empty((batchSize, numHid))
                hidState = cm.empty((batchSize, numHid))
            else :
                visTrue = deviceData.get_row_slice(batchSize*batch, batchSize*(batch+1))
                labelTrue = deviceLabel.get_row_slice(batchSize*batch, batchSize*(batch+1))
                batchSize = visTrue.shape[0]

            visActP.assign(visTrue)

            #apply momentum
            # NOTE(review): the last two lines scale the *parameters*
            # (weightLabel, biasLabel) by momentum instead of their update
            # buffers (weightLabelInc, biasLabelInc) — this looks like a
            # bug; compare with the weightInc/biasVInc/biasHInc lines.
            weightInc.mult(momentum)
            biasVInc.mult(momentum)
            biasHInc.mult(momentum)
            weightLabel.mult(momentum)
            biasLabel.mult(momentum)

            # positive phase
            cm.dot(visActP, weight, target = hidActP)
            hidActP.add_dot(labelTrue, weightLabel)
            hidActP.add_row_vec(biasH)
            hidActP.apply_sigmoid()

            # Accumulate positive-phase statistics.
            weightInc.add_dot(visActP.T, hidActP)
            biasVInc.add_sums(visActP, axis=0)
            biasHInc.add_sums(hidActP, axis=0)
            weightLabelInc.add_dot(labelTrue.T, hidActP)
            biasLabelInc.add_sums(labelTrue, axis=0)

            # Sample binary hidden states from their probabilities.
            hidState.fill_with_rand()
            hidState.less_than(hidActP, target=hidActP)

            if cmp(method, "SML") == 0 :
                if np.logical_and(np.equal(epoch,1), np.equal(batch,1)) :
                    pass # here does not need in practical use
            elif cmp(method, "CD") == 0 :
                pass

            # negative phase (reconstruct visibles and labels)
            cm.dot(hidActP, weight.T, target = visActP)
            visActP.add_row_vec(biasV)
            visActP.apply_sigmoid()

            cm.dot(hidActP, weightLabel.T, target = labelTrue)
            labelTrue.add_row_vec(biasLabel)
            labelTrue = util.softmax(labelTrue)

            # another positive phase (on the reconstruction)
            cm.dot(visActP, weight, target = hidActP)
            hidActP.add_dot(labelTrue, weightLabel)
            hidActP.add_row_vec(biasH)
            hidActP.apply_sigmoid()

            # Subtract negative-phase statistics.
            weightInc.subtract_dot(visActP.T, hidActP)
            biasVInc.add_sums(visActP, axis=0, mult=-1)
            biasHInc.add_sums(hidActP, axis=0, mult=-1)
            weightLabelInc.subtract_dot(labelTrue.T, hidActP)
            biasLabelInc.add_sums(labelTrue, axis=0, mult=-1)

            # update weights and bias
            weight.add_mult(weightInc, eta/batchSize)
            biasV.add_mult(biasVInc, eta/batchSize)
            biasH.add_mult(biasHInc, eta/batchSize)
            weightLabel.add_mult(weightLabelInc, eta/batchSize)
            biasLabel.add_mult(biasLabelInc, eta/batchSize)

            # calculate reconstruction error
            visTrue.subtract(visActP)
            error.append(visTrue.euclid_norm()**2)

            # free memory
            visTrue.free_device_memory()
            labelTrue.free_device_memory()

        if verbose :
            print "Epoch %d/%d, reconstruction error is %f " % (epoch+1, maxEpoch, sum(error))

    # save rbm model (copy learned parameters back to host memory)
    weight.copy_to_host()
    biasV.copy_to_host()
    biasH.copy_to_host()
    weightLabel.copy_to_host()
    biasLabel.copy_to_host()
    model_ = m.rbmModel(weight.numpy_array, biasV.numpy_array, biasH.numpy_array, \
            weightLabel = weightLabel.numpy_array,\
            biasLabel = biasLabel.numpy_array, labels = uniqueLabel)

    # free device memory
    deviceData.free_device_memory()
    deviceLabel.free_device_memory()
    weight.free_device_memory()
    biasV.free_device_memory()
    biasH.free_device_memory()
    weightLabel.free_device_memory()
    biasLabel.free_device_memory()
    weightInc.free_device_memory()
    biasVInc.free_device_memory()
    biasHInc.free_device_memory()
    weightLabelInc.free_device_memory()
    biasLabelInc.free_device_memory()
    hidActP.free_device_memory()
    visActP.free_device_memory()
    hidState.free_device_memory()

    cm.shutdown()

    if isSaveModel :
        modelList = []
        modelList.append(model_)
        model = np.array(modelList)
        np.save(name,model)

    return model_
if __name__ == '__main__':
    # Load pickled DMOZ features (X) and targets (y).
    print "Opening data files..."
    X, y = [], []
    with open('./data/dmoz_data') as f:
        X = pickle.load(f)
    with open('./data/dmoz_targets') as f:
        y = pickle.load(f)
    print y[:20]

    print "Shuffling..."
    # Build one fixed (seeded) permutation p and apply it to both y (a list)
    # and X (an indexable matrix) so rows and targets stay aligned.
    p = range(len(y))
    random.seed(0)
    random.shuffle(p)
    shuffle = lambda l: [l[p[i]] for i in range(len(p))]
    y = shuffle(y)
    X = X[p]

    print "Loading data..."
    # Fractional splits: 20% labeled, 40% unlabeled, 20% validation, 20% test.
    labeled_data, labeled_target = \
        subset_matrix(X, y, 0, 0.2)
    unlabeled_data, unlabeled_target = \
        subset_matrix(X, y, 0.2, 0.6)
    validate_data, validate_target = \
        subset_matrix(X, y, 0.6, 0.8)
    test_data, test_target = \
        subset_matrix(X, y, 0.8, 1)

    # Drop references — presumably to release memory; TODO confirm these
    # names are used elsewhere, as they are not assigned above.
    X_labeled = X_unlabeled = X_validate = X_test = None

    # NOTE: this definition is truncated in this excerpt; its body continues
    # beyond the visible source.
    def dump (data, target, fname):
def main(data_dir, data_file, bag_size, active_participant_counter, M, N, seed=None, shuffle_bags = False, shuffle_si = False, K=0, K_max=0, held_out_b=1, shuffle_heldout = True):
    """Build a train/test split for a held-out participant and pickle it.

    Collects single-instance training data from the first participants,
    bag-level training data from the rest, and (optionally) K bags from the
    held-out participant; the held-out participant's remaining data becomes
    the test set. The resulting dict is written to data_file and returned.
    """
    #data_dir = '../data/eating_detection_inertial_ubicomp2015/'
    #data_dir = '../data/smoking-data/'
    #data_file = "data_p0.pickle"
    sys.path.insert(0, data_dir)
    from load_data import load_data
    dataset = load_data(data_dir)
    X = dataset['data']['X']
    Y = dataset['data']['Y']
    session_start = dataset['data']['sessions']['start']
    session_labels = dataset['data']['sessions']['labels']

    participant_indices = range(len(X))
    n_si_participants = 5
    n_bag_participants = len(X) - n_si_participants - 1

    #indices for participants in training data; skip active participant counter:
    train_indices = participant_indices[:active_participant_counter] + participant_indices[active_participant_counter+1:]
    si_participant_indices = train_indices[:n_si_participants]
    bag_participant_indices = train_indices[n_si_participants:n_si_participants+n_bag_participants+1]

    #single-instance training data:
    X_SI = []
    Y_SI = []
    for p in si_participant_indices:
        x = X[p]
        y = Y[p]
        if shuffle_si:
            x, y = shuffle(seed, x, y)
        # Keep at most M labeled instances per SI participant.
        X_SI.append(x[:M])
        Y_SI.append(y[:M])
#        X_SI.append(X[active_participant_counter][:K])
#        Y_SI.append(Y[active_participant_counter][:K])

    #bag-level training data:
    X_B = []
    Y_B = []
    for p in bag_participant_indices:
        if bag_size == -1:
            # bag_size == -1 means one bag per recorded session.
            x, y, _ = single_instances_to_sessions(X[p], Y[p], session_labels[p], session_start[p])
        else:
            # Fixed-size bags; a bag is positive if any instance is positive.
            x = [X[p][k:k+bag_size, :] for k in xrange(0, len(X[p]), bag_size)]
            y = [max(Y[p][k:k+bag_size]) for k in xrange(0, len(Y[p]), bag_size)]
        if shuffle_bags:
            x, y = shuffle(seed, x,y)
        # Keep at most N bags per bag participant.
        X_B.append(x[:N])
        Y_B.append(y[:N])

    if K_max > 0:
        # Take bags from the held-out participant too.
        if held_out_b == -1:
            x, y, si_labels = single_instances_to_sessions(X[active_participant_counter], Y[active_participant_counter], session_labels[active_participant_counter], session_start[active_participant_counter])
        else:
            x = [X[active_participant_counter][k:k+held_out_b, :]
                 for k in xrange(0, len(X[active_participant_counter]), held_out_b)]
            #y = [max(Y[active_participant_counter][k:k+held_out_b]) for k in xrange(0, min(K*held_out_b,len(Y[active_participant_counter])), held_out_b)]
            si_labels = [Y[active_participant_counter][k:k+held_out_b] for k in xrange(0, len(Y[active_participant_counter]), held_out_b)]
            y = [max(y_i) for y_i in si_labels]
        if shuffle_heldout:
            x, y, si_labels = shuffle(seed, x, y, si_labels)
        print("len(x): %d" %len(x))
        # First K held-out bags join the training bags.
        X_B.append(x[:K])
        Y_B.append(y[:K])

        # Everything after the first K_max held-out instances becomes test data.
        X_test = []
        Y_test = []
        if held_out_b == -1:
            # Session bags vary in length: find the first bag whose
            # cumulative instance count reaches K_max.
            starts = np.cumsum([len(x[l]) for l in range(len(x))])
            K_start = np.argmax(starts >= K_max)
            print("start[K_start]: %d" %starts[K_start])
        else:
            # NOTE(review): under Python 2, K_max / held_out_b is integer
            # division, so np.ceil is a no-op here — likely should be
            # float division. Confirm.
            K_start = int(np.ceil(K_max / held_out_b))
        for k in range(K_start, len(x)):
            X_test.extend([x[k][j] for j in range(x[k].shape[0])])
            Y_test.extend([si_labels[k][j] for j in range(si_labels[k].shape[0])])
        if held_out_b==-1 and K_max > starts[K_start-1]:
            # Trim the partial session so exactly K_max instances are excluded.
            X_test = X_test[K_max - starts[K_start-1]:]
            Y_test = Y_test[K_max - starts[K_start-1]:]
    else:
        #test data:
        X_test = X[active_participant_counter]
        Y_test = Y[active_participant_counter]
        #X_test, Y_test = shuffle(X_test, Y_test)

    ##convert to bags:
    #if clf_name in MIL:
    #    X_SI = [X_SI[k:k+1, :] for k in xrange(len(X_SI))]
    #    Y_SI = [max(Y_SI[k:k+1]) for k in xrange(len(Y_SI))]
    #
    #    if bag_size == -1:
    #        X_B, Y_B, _ = single_instances_to_sessions(X, Y, session_labels, session_start, bag_participant_indices)
    #    else:
    #        X_B = [X_B[k:k+bag_size, :] for k in xrange(0, len(X_B), bag_size)]
    #        Y_B = [max(Y_B[k:k+bag_size]) for k in xrange(0, len(Y_B), bag_size)]
    #
    #    if held_out_bag_size == -1:
    #        X_T, Y_T, Y_si = single_instances_to_sessions(X, Y, session_labels, session_start, [active_participant_counter])
    #    else:
    #        X_T = [X_test[k:k+bag_size, :] for k in xrange(0,len(X_test), held_out_bag_size)]
    #        Y_si = [Y_test[k:k+bag_size] for k in xrange(0,len(Y_test), held_out_bag_size)]
    #        Y_T = [max(y_t) for y_t in Y_si]
    #
    #    X_T, Y_T, Y_si = shuffle(X_T, Y_T, Y_si)
    #
    #    # convert remaining bags back to test instances
    #    X_test = []
    #    Y_test = []
    #    for i, (x_t, y_si) in enumerate(zip(X_T, Y_si)[K:]):
    #        for (x,y) in zip(x_t, y_si):
    #            X_test.append(x)
    #            Y_test.append(y)
    #
    #    X_test = [np.asarray(X_test)[k:k+test_bag_size, :] for k in xrange(0, len(X_test), test_bag_size)]
    #    Y_test = [max(Y_test[k:k+test_bag_size]) for k in xrange(0, len(Y_test), test_bag_size)]
    #
    #else: # standard supervised learning case
    #    X_T = X_test[:K]
    #    X_T = X_test[:K]
    #    X_test = X_test[K:]
    #    Y_test = Y_test[K:]
    #
    #if N < 0:
    #    N=len(X_B)
    #
    #if M < 0:
    #    M=len(X_SI)
    #
    #X_SI, Y_SI = shuffle(X_SI, Y_SI)
    #X_B, Y_B = shuffle(X_B, Y_B)
    #X_test, Y_test = shuffle(X_test, Y_test)

    # Package the split and persist it.
    data = {}
    data['training'] = {'instance' : {'X' : X_SI, 'Y' : Y_SI}, 'bag' : {'X' : X_B, 'Y' : Y_B}}
    data['test'] = {'X' : X_test, 'Y' : Y_test}

    with open(data_file, 'wb') as f:
        pickle.dump(data, f)

    return data
def main(data_file, clf_str, cv_method, n_iter, n_jobs, verbose, save, description):
    """Cross-validate a classifier on a pickled bag/instance split, retrain
    with the best parameters on the full training set, and report test-set
    precision/recall/F1; optionally pickle the results to `save`.
    """
    # Load the split produced by the data-preparation step.
    with open(data_file, 'rb') as f:
        data = pickle.load(f)
    X_SI = data['training']['instance']['X']
    Y_SI = data['training']['instance']['Y']
    X_B = data['training']['bag']['X']
    Y_B = data['training']['bag']['Y']

    X_train = []
    Y_train = []
    X_val = []
    Y_val = []
    X_SI_val = []
    Y_SI_val = []
    X_B_val = []
    Y_B_val = []
    # Single-instance data: all of it goes to training; a shuffled copy per
    # participant is kept for the validation folds.
    for p in range(len(X_SI)):
        X_train.extend(X_SI[p])
        Y_train.extend(Y_SI[p])
        #l = min(100,int(np.ceil(0.5*len(X_SI[p]))))
        x,y = shuffle(None, X_SI[p], Y_SI[p])
        X_val.extend(x)
        Y_val.extend(y)
        X_SI_val.append(x)
        Y_SI_val.append(y)
    n_single_instances = len(X_train)

    #for class weights:
#    N1 = np.sum(np.greater(Y_train, 0))
#    N0 = np.sum(np.less(Y_train, 0))

    # Bag data: all of it goes to training; 25% per participant is kept for
    # the validation folds.
    for p in range(len(X_B)):
        X_train.extend(X_B[p])
        Y_train.extend(Y_B[p])
        l = int(np.ceil(0.25*len(X_B[p])))
        x,y = shuffle(None, X_B[p], Y_B[p])
        X_val.extend(x[:l])
        Y_val.extend(y[:l])
        X_B_val.append(x[:l])
        Y_B_val.append(y[:l])
    n_bags = len(X_train) - n_single_instances

    X_test = data['test']['X']
    Y_test = data['test']['Y']

    # Build the classifier and its hyperparameter grid from the CLI string.
    clf_name, clf_params = parse_clf(clf_str)
#    if N0 + N1 == 0:
#        clf_params['class_weight'] = {1 : 0.9, -1 : 0.1}
#    else:
#        clf_params['class_weight'] = {1 : N0/(N0 + N1), -1 : N1/(N0 + N1)}
#    print clf_params['class_weight']
    clf = get_clf_by_name(clf_name, **clf_params)
    param_grid = get_param_grid_by_clf(clf_name, clf_params.get("kernel", "linear"))

    results = { "Confusion Matrix" : {"Training" : np.zeros((2,2)), "Test" : np.zeros((2,2))}, \
                "Precision": {"Training" : 0.0, "Test" : 0.0}, \
                "Recall": {"Training" : 0.0, "Test" : 0.0}, \
                "F1 Score": {"Training" : 0.0, "Test" : 0.0, "Validation" : 0.0} \
              }

    # Folds keep single-instance and bag data appropriately separated.
    cv_iterator = mil_train_test_split(X_SI_val, X_B_val, Y_SI_val, Y_B_val)
    pprint_header("Number of bags : %d Number of single instances: %d Number of test instances: %d" %(n_bags, n_single_instances, len(Y_test)))

    # Hyperparameter search on the validation data only (refit=False).
    if cv_method == 'grid':
        gs = GridSearchCV(clf, param_grid, scoring=score, cv=cv_iterator,
                          verbose=verbose, n_jobs = n_jobs, refit=False)
    elif cv_method == 'randomized':
        gs = RandomizedSearchCV(clf, param_distributions=param_grid, scoring=score,
                                cv=cv_iterator, n_jobs = n_jobs, n_iter=n_iter,
                                verbose=verbose, refit=False)

    t0 = time()
    gs = gs.fit(X_val, Y_val)
    tf = time()

    print("Best parameters set found on development set:\n")
    print(gs.best_params_)
    print("\nGrid scores on development set:\n")
    for params, mean_score, scores in gs.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))

    # Refit manually on the full training set with the best parameters.
    clf.set_params(**gs.best_params_)
    clf.fit(X_train, Y_train)
    print("\nDetailed classification report:\n")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.\n")
    # Map raw decision values to {-1, +1} labels.
    y_true, y_pred = Y_test, 2*np.greater(clf.predict(X_test),0)-1
    print(classification_report(y_true, y_pred))
    print("\nTime elapsed: %0.2f seconds." %(tf-t0))

#    if clf_name == 'MIForest': #for MIForest, we need to pass in Y as well
#        #check training accuracy to start:
#        y_pred = 2*np.greater(gs.best_estimator_.predict(X_train, Y_train),0)-1
#    else: #for MIForest, we need to pass in Y as well
#        #check training accuracy to start:
#        y_pred = 2*np.greater(gs.best_estimator_.predict(X_train),0)-1
#
#    conf = confusion_matrix(Y_train, y_pred, [-1,+1])
#    print("Confusion matrix on the training data:")
#    print(conf)
#    results['Confusion Matrix']['Training'] = conf
#
#    precision, recall, fscore = accuracy_precision_recall_fscore(conf)[1][1]
#    results['F1 Score']['Training'] = fscore
#    results['Precision']['Training'] = precision
#    results['Recall']['Training'] = recall
#    if clf_name == 'MIForest':
#        y_pred = 2*np.greater(gs.best_estimator_.predict(X_test, Y_test),0)-1
#    else:
#        y_pred = 2*np.greater(gs.best_estimator_.predict(X_test),0)-1
#
    conf = confusion_matrix(y_true, y_pred, [-1,+1])
    print("Confusion matrix on the test data:")
    print(conf)
    results['Confusion Matrix']['Test'] = conf

    precision, recall, fscore = accuracy_precision_recall_fscore(conf)[1][1]
    results['F1 Score']['Test'] = fscore
    results['Precision']['Test'] = precision
    results['Recall']['Test'] = recall
    print("Precision on the test data: %0.2f%%" %(100*precision))
    print("Recall on the test data: %0.2f%%" %(100*recall))
    print("F1 Score on the test data: %0.2f%%\n" %(100*fscore))

    evaluation = {"Description": description, "Results" : results}
    if save != 'none':
        print("Saving results to %s ..." %save)
        with open(save, 'wb') as f:
            pickle.dump(evaluation, f)

    return evaluation