def load_data(config, cache_byte_size, files_config_key, **kwargs):
  """
  :param Config config:
  :param int cache_byte_size:
  :param str files_config_key: such as "train" or "dev"
  :param kwargs: passed on to init_dataset() or init_dataset_via_str()
  :rtype: (Dataset,int)
  :returns the dataset, and the cache byte size left over if we cache the whole dataset.
  """
  if not config.bool_or_other(files_config_key, None):
    return None, 0
  kwargs = kwargs.copy()
  kwargs.setdefault("name", files_config_key)
  if config.is_typed(files_config_key) and isinstance(config.typed_value(files_config_key), dict):
    config_opts = config.typed_value(files_config_key)
    assert isinstance(config_opts, dict)
    kwargs.update(config_opts)
    if 'cache_byte_size' not in config_opts:
      if kwargs.get('class', None) == 'HDFDataset':
        kwargs["cache_byte_size"] = cache_byte_size
    Dataset.kwargs_update_from_config(config, kwargs)
    data = init_dataset(kwargs)
  else:
    config_str = config.value(files_config_key, "")
    data = init_dataset_via_str(config_str, config=config, cache_byte_size=cache_byte_size, **kwargs)
  cache_leftover = 0
  if isinstance(data, HDFDataset):
    cache_leftover = data.definite_cache_leftover
  return data, cache_leftover
def load_data(config, cache_byte_size, files_config_key, **kwargs):
  """
  :type config: Config
  :type cache_byte_size: int
  :type files_config_key: str
  :rtype: (Dataset,int)
  :returns the dataset, and the cache byte size left over if we cache the whole dataset.
  """
  if not config.has(files_config_key):
    return None, 0
  if config.is_typed(files_config_key) and isinstance(config.typed_value(files_config_key), dict):
    new_kwargs = config.typed_value(files_config_key)
    assert isinstance(new_kwargs, dict)
    kwargs.update(new_kwargs)
    if 'cache_byte_size' not in new_kwargs:
      if kwargs.get('class', None) == 'HDFDataset':
        kwargs["cache_byte_size"] = cache_byte_size
    Dataset.kwargs_update_from_config(config, kwargs)
    data = init_dataset(kwargs)
  else:
    config_str = config.value(files_config_key, "")
    data = init_dataset_via_str(config_str, config=config, cache_byte_size=cache_byte_size, **kwargs)
  cache_leftover = 0
  if isinstance(data, HDFDataset):
    cache_leftover = data.definite_cache_leftover
  return data, cache_leftover
def test_read_data_points(self):
    set = Dataset()
    set.read_data_points("flueaeg.txt")
    data = set.get_data
    self.assertEqual(data[100.0], 16.6)
    # here we should see an error printed
    set.read_data_points("findes-ikke.txt")
def changeURLs(dataLines):
    changedUrlsLines = []
    dataset = Dataset(dataLines)
    for line in dataset.dataLines:
        columns = dataset.getColumns(line)
        columns['url'] = Brand.brandsUrls[columns['brand']]
        changedUrlsLines.append(dataset.getLine(columns))
    return changedUrlsLines
def fixSizeTypes(dataLines):
    fixedDataLines = []
    dataset = Dataset(dataLines)
    for line in dataset.dataLines:
        columns = dataset.getColumns(line)
        columns['size_type'] = columns['size_type'].replace(" ", "_")
        fixedDataLines.append(dataset.getLine(columns))
    return fixedDataLines
def __init__(self, id=None, drawing=None, posX=0, posY=0, x1=0, y1=0, x2=0, y2=0, pen=None, brush=None):
    Dataset.__init__(self, id)
    self.drawing = drawing
    self.posX = posX
    self.posY = posY
    self.x1 = x1
    self.y1 = y1
    self.x2 = x2
    self.y2 = y2
    self.pen = pen
    self.brush = brush
def train_set_loss_vars_for_cur_batches(self):
  """
  Called via Engine.SeqTrainParallelControl.
  """
  assert self.train_have_loss_for_cur_batches()
  # See EngineUtil.assign_dev_data for reference.
  from Dataset import Dataset
  n_time, n_batch = Dataset.index_shape_for_batches(self.train_batches)
  n_output_dim = self.output_layer.attrs['n_out']
  output_loss = numpy.zeros((n_batch,), "float32")
  output_hat_y = numpy.zeros((n_time, n_batch, n_output_dim), "float32")
  offset_slice = 0
  for batch in self.train_batches:
    for seq in batch.seqs:
      o = seq.batch_frame_offset
      q = seq.batch_slice + offset_slice
      l = seq.frame_length
      # input-data, input-index will also be set in this loop. That is data-key "data".
      for k in [self.output_target]:
        if l[k] == 0:
          continue
        loss, hat_y = self.get_loss_and_hat_y(seq.seq_idx)
        assert seq.seq_start_frame[k] < hat_y.shape[0]
        assert seq.seq_end_frame[k] <= hat_y.shape[0]
        output_loss[q] += loss * float(l[k]) / hat_y.shape[0]
        output_hat_y[o[k]:o[k] + l[k], q] = hat_y[seq.seq_start_frame[k]:seq.seq_end_frame[k]]
  self.output_var_loss.set_value(output_loss)
  self.output_var_hat_y.set_value(output_hat_y)
def mergeSynonymousSizeTypes(dataLines):
    mergedDataLines = []
    dataset = Dataset(dataLines)
    for line in dataset.dataLines:
        columns = dataset.getColumns(line)
        if columns['size_type'] in SizeType.mergedSizeTypes:
            columns['size_type'] = SizeType.mergedSizeTypes[columns['size_type']]
        mergedDataLines.append(dataset.getLine(columns))
    return mergedDataLines
def clone():
    print 'Enter name for new dataset:'
    dsname = raw_input()
    os.system('mkdir %s' % dsname)
    ds = Dataset()
    db = getDB()
    cur = db.cursor(MySQLdb.cursors.DictCursor)
    cur.execute("SELECT * FROM Answers WHERE isRetrieved=1")
    for row in cur.fetchall():
        ds.X.append(row['answer'])
        ds.Y.append(row['author'])
        ds.ts = max([ds.ts, row['updated_at']])
    with open('%s/data' % dsname, 'w') as f:
        cPickle.dump(ds, f)
    print 'Dataset cloned'
def getXAtMaxIm(dataset: Dataset):
    data = dataset.getPlane()
    curMax = data[0][0]  # running maximum seen so far
    bestCycle = 0
    for cycle in range(len(data)):
        if max(data[cycle]) > curMax:
            curMax = max(data[cycle])
            bestCycle = cycle
    return bestCycle
def doUpperCase(dataLines):
    upperCaseLines = []
    dataset = Dataset(dataLines)
    for line in dataset.dataLines:
        columns = dataset.getColumns(line)
        columns['size_type'] = columns['size_type'].upper()
        columns['label'] = columns['label'].upper()
        columns['brand'] = columns['brand'].upper()
        columns['clothe_category'] = columns['clothe_category'].upper()
        columns['size_category'] = columns['size_category'].upper()
        columns['gender'] = columns['gender'].upper()
        upperCaseLines.append(dataset.getLine(columns))
    return upperCaseLines
def get_all_dataset(self, idmodel):
    con = lite.connect(self.name)
    with con:
        con.row_factory = lite.Row
        cur = con.cursor()
        cur.execute("SELECT * FROM dataset where idmodel=:idmodel", {'idmodel': idmodel})
        rows = cur.fetchall()
        dataset = Dataset.from_db_rows(rows, self.setup)
    return dataset
def main():
    global X
    global Y
    ds = Dataset.open('quora')
    X, Y = ds.X, ds.Y
    # Z = [re.findall(r"[\w']+", x) for x in X]
    # Z = [filter(None, x.split('.')) for x in X]
    # Z = ["".join(s) for s in Z]
    # Z = [z.split(' ') for z in Z]
    # Z = [[len(s) for s in z] for z in Z]
    # feature = []
    # for a in Z:
    #     wordLenDist = [0]*100
    #     for ln in a:
    #         wordLenDist[ln] += 1
    #     feature.append(wordLenDist)
    feature = []
    tokenizer = RegexpTokenizer(r'\w+')
    for x in X:
        All = len(nltk.word_tokenize(x))
        numPunctuation = All - len(tokenizer.tokenize(x))
        numWords = All - numPunctuation
        ff = [numPunctuation, numWords]
        feature.append(ff)
    X = feature
    Z = zip(X, Y)
    shuffle(Z)
    (X, Y) = zip(*Z)
    si = 0
    acc = 0.0
    cnt = 0
    while si < len(X):
        Xe = X[si:si+50]
        Ye = Y[si:si+50]
        X1 = X[:si] + X[si+50:]
        Y1 = Y[:si] + Y[si+50:]
        acc += train_chunk(X1, Y1, Xe, Ye)
        cnt += 1
        si += 50
    print 'Accuracy: %f' % (acc/cnt)
def benchmark(lstm_unit, use_gpu):
  """
  :param str lstm_unit: e.g. "LSTMBlock", one of LstmCellTypes
  :param bool use_gpu:
  :return: runtime in seconds of the training itself, excluding initialization
  :rtype: float
  """
  device = {True: "GPU", False: "CPU"}[use_gpu]
  key = "%s:%s" % (device, lstm_unit)
  print(">>> Start benchmark for %s." % key)
  config = Config()
  config.update(make_config_dict(lstm_unit=lstm_unit, use_gpu=use_gpu))
  dataset_kwargs = config.typed_value("train")
  Dataset.kwargs_update_from_config(config, dataset_kwargs)
  dataset = init_dataset(dataset_kwargs)
  engine = Engine(config=config)
  engine.init_train_from_config(config=config, train_data=dataset)
  print(">>> Start training now for %s." % key)
  start_time = time.time()
  engine.train()
  runtime = time.time() - start_time
  print(">>> Runtime of %s: %s" % (key, hms_fraction(runtime)))
  engine.finalize()
  return runtime
def parse_csv_file(filename, optimized_indexes=[]):
    """ Parse a csv file and return the corresponding dataset """
    data = []
    with open(filename, 'r') as csvFile:
        fields = csvFile.readline().rstrip().split(SEPARATOR)
        if len(optimized_indexes) != 0:
            fields = [fields[i] for i in range(len(fields)) if i in optimized_indexes]
        for row in csvFile:
            data.append(parse_csv_line(row, optimized_indexes))
    return Dataset(fields, data)
def train_network(path_to_db, model, frame_size, rescale, validation_set_percentage, batch_size,
                  epochs, optimizer, loss='categorical_crossentropy', metrics=['acc']):
    data_set = Dataset(path_to_db)
    train, validation, test = data_set.get_train_val_test_sets(
        size=frame_size,
        rescale=rescale,
        validaion_set_percentage=validation_set_percentage,
        batch_size=batch_size)

    # configure the model for training:
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

    history = model.fit_generator(train,
                                  steps_per_epoch=100,
                                  epochs=epochs,
                                  validation_data=validation,
                                  validation_steps=50)
    return history
def load_HP_dataset(noise=False):
    fin = open('/media/heyue/8d1c3fac-68d3-4428-af91-bc478fbdd541/ClusterResearch/clusterQNet/data/HP_model_5_feature.txt', 'r')
    lines = fin.read().splitlines()
    data_num = len(lines) / 5
    dataset = Dataset()
    for i in xrange(data_num):
        dataset.imageNameList.append(['data/HP/' + lines[i * 5]])
        dataset.rect.append([0, 0, 100, 100])
        dataset.imgID.append(int(lines[i * 5 + 4]))
        dataset.feature.append(map(float, lines[i * 5 + 2].split()))
        dataset.size += 1
        if dataset.imgID[-1] == 1 and noise == False:
            dataset.imageNameList.pop()
            dataset.rect.pop()
            dataset.imgID.pop()
            dataset.feature.pop()
            dataset.size -= 1
    dataset.computeAffinity()
    # dataset.computeQuality()
    dataset.Quality = [1.0 for i in xrange(dataset.size)]
    return dataset
def run(hidden, inNodes, output, dataset, iterations, percent):
    d = Dataset(dataset)
    # print d.trainTestSplit(percent)
    trainX, testX, trainy, testy = d.trainTestSplit(percent)
    # print trainy
    inNodes = trainX.shape[1]
    output = trainy.shape[1]
    network = Network(hidden, inNodes, output)
    network.train(trainX, trainy, iterations)
    print network
    predictionsTrain = network.predict(trainX)
    predictionsTest = network.predict(testX)
    print "Total Training Error:", network.getError(trainX, trainy)
    print "Total Testing Error:", network.getError(testX, testy)
def mainAdaline():
    arquivoDTest = open("Dataset2/dtest2.txt", "r")
    arquivoXTest = open("Dataset2/xtest2.txt", "r")
    datasetTest = Dataset(arquivoDTest, arquivoXTest)
    arquivoDTrain = open("Dataset2/dtrain2.txt", "r")
    arquivoXTrain = open("Dataset2/xtrain2.txt", "r")
    datasetTrain = Dataset(arquivoDTrain, arquivoXTrain)
    redeAdalineTrain = Adaline(datasetTrain.definindoEntradas(), datasetTrain.definindoValoresDesejados(),
                               taxa_aprendizado, precisao)
    redeAdalineAnd = Adaline([[1, 1], [0, 0], [1, 0], [0, 1]], [1, -1, -1, -1], 0.25, 0.0001)
    redeAdalineAnd.treinamento_online()
    # redeAdalineTrain.treinamento_online()
    redeAdalineAnd.conferirRespostas(redeAdalineAnd.x, redeAdalineAnd.w, redeAdalineAnd.w0,
                                     redeAdalineAnd.d, len(redeAdalineAnd.x))
def __init__(self, filename='./text8.zip', word_num=200, batch_size=8, skip_window=2, num_skips=2,
             embed_dim=10, epoch=100, lr=0.025, neg_cnt=5, outfile='./skip_gram', dictfile='./word_dict'):
    """Init this word2vec model"""
    # params about dataset
    self.batch_size = batch_size
    self.skip_window = skip_window
    self.num_skips = num_skips
    # params about skip gram
    self.embed_num = word_num
    self.embed_dim = embed_dim
    # params about learning
    self.epoch = epoch
    self.lr = lr
    self.neg_cnt = neg_cnt
    # dataset
    self.dataset = Dataset(filename, word_num)
    if not os.path.exists(dictfile):
        pickle.dump(self.dataset.word_dict, open(dictfile, 'wb'))
    # skip gram
    self.outfile = outfile
    if os.path.exists(outfile):
        self.skip_gram = pickle.load(open(self.outfile, 'rb'))
    else:
        self.skip_gram = SkipGram(word_num, embed_dim)
    # optimizer
    self.optimizer = optim.SGD(self.skip_gram.parameters(), lr=self.lr)
def train_one_batch(self, sess, x, y_, accuracy, train_step, train_feed_dict, test_feed_dict):
    tf.summary.scalar('accuracy', accuracy)
    merged = tf.summary.merge_all()
    sess.run(tf.global_variables_initializer())
    dataset = Dataset(input_file_path=self.data_path, max_sample_records=self.max_sample_records)
    # Enable full tracing and run-metadata collection for profiling the session runs below
    run_opts = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_opts_metadata = tf.RunMetadata()
    train_batches = dataset.get_batches(train=True)
    batch = next(train_batches)
    images, labels = process_data(batch)
    train_feed_dict[x] = images
    train_feed_dict[y_] = labels
    for epoch in range(self.epochs):
        train_step.run(feed_dict=train_feed_dict)
        train_summary, train_accuracy = sess.run([merged, accuracy], feed_dict=train_feed_dict,
                                                 options=run_opts, run_metadata=run_opts_metadata)
        test_summary, test_accuracy = sess.run([merged, accuracy], feed_dict=train_feed_dict,
                                               options=run_opts, run_metadata=run_opts_metadata)
        message = "epoch: {0}, training accuracy: {1}, validation accuracy: {2}"
        print(message.format(epoch, train_accuracy, test_accuracy))
def getDataset(data, path, evalMode):
    # if data in ["ml-1m", "yelp", "pinterest-20"]:
    if data in ["brightkite", "fsq11", "yelp"]:
        columns = ['uid', 'iid', 'rating', 'hour', 'day', 'month', 'timestamp']
        train = pd.read_csv(path + "data/%sTrain" % data, names=columns, sep="\t")
        test = pd.read_csv(path + "data/%sTest" % data, names=columns, sep="\t")
        df = train.append(test)
        df.sort_values(["uid", "timestamp"], inplace=True)
        dataset = Dataset(df, evalMode)
    elif data in ["ml-1m", "yelp-he"]:
        names = ["uid", "iid", "rating", "timestamp"]
        data = "yelp" if data == "yelp-he" else data
        train = pd.read_csv(path + "data/%s.train.rating" % data, sep="\t", names=names)
        test = pd.read_csv(path + "data/%s.test.rating" % data, sep="\t", names=names)
        df = train.append(test)
        dataset = Dataset(df, evalMode)
    elif data in ["beauty", "steam", "video", "ml-sas"]:
        names = ["uid", "iid"]
        if data == "beauty":
            df = pd.read_csv(path + "data/Beauty.txt", sep=" ", names=names)
        elif data == "steam":
            df = pd.read_csv(path + "data/Steam.txt", sep=" ", names=names)
        elif data == "video":
            df = pd.read_csv(path + "data/Video.txt", sep=" ", names=names)
        else:
            df = pd.read_csv(path + "data/ml-1m.txt", sep=" ", names=names)
        dataset = Dataset(df, evalMode)
    elif data == "test":
        columns = ["uid", "timestamp", "lat", "lng", "iid"]
        df = pd.read_csv(path + "data/brightkite.txt", names=columns, sep="\t", nrows=10000)
        dataset = Dataset(df, evalMode)
    return dataset
def __init__(self, data_path, train_val_test=(0.8, 0.1, 0.1)):
    """
    Initialises a 'YOLO_Dataset' object by calling the superclass initialiser.

    The difference between a YOLO_Dataset object and a Dataset object is the annotation.
    The YOLO_Dataset object will therefore override the self.annotations_path and
    self.annotation_list attributes such that the building labels are in XML format.
    """
    assert (train_val_test[0] + train_val_test[1] + train_val_test[2]) == 1, \
        'Train, val and test percentages should add to 1'
    assert train_val_test[0] > 0 and train_val_test[1] > 0 and train_val_test[2] > 0, \
        'Train, val and test percentages should be positive'
    Dataset.__init__(self, data_path)
    self.train_val_test = train_val_test
    self.train_path = self.data_path + '/yolo/train'
    self.val_path = self.data_path + '/yolo/val'
    self.test_path = self.data_path + '/yolo/test'

    if not os.path.isdir(self.data_path + '/yolo'):
        print("Creating directory to store YOLO formatted dataset.")
        os.mkdir(self.data_path + '/yolo')

    # Create train, validation, test directories, each with an images and annotations
    # sub-directory
    for directory in [self.train_path, self.val_path, self.test_path]:
        if not os.path.isdir(directory):
            os.mkdir(directory)
        if not os.path.isdir(directory + '/images'):
            os.mkdir(directory + '/images')
        if not os.path.isdir(directory + '/annotations'):
            os.mkdir(directory + '/annotations')
def __init__(self, input_shape, num_classes, learning_rate, clients_num):
    self.graph = tf.Graph()
    self.sess = tf.Session(graph=self.graph)

    # Call the create function to build the computational graph of AlexNet
    net = AlexNet(input_shape, num_classes, learning_rate, self.graph)
    self.model = FedModel(*net)

    # initialize
    with self.graph.as_default():
        self.sess.run(tf.global_variables_initializer())

    # Load Cifar-10 dataset
    # NOTE: len(self.dataset.train) == clients_num
    self.dataset = Dataset(tf.keras.datasets.cifar10.load_data, split=clients_num)
def __init__(self, input_shape, num_classes, learning_rate, clients_num):
    self.graph = tf.Graph()
    self.sess = tf.Session(graph=self.graph)

    # Build the AlexNet network
    net = AlexNet(input_shape, num_classes, learning_rate, self.graph)
    # net = vgg_net(input_shape, num_classes, learning_rate, self.graph)
    self.model = FedModel(*net)

    # Initialize
    with self.graph.as_default():
        self.sess.run(tf.global_variables_initializer())

    # Load the data
    # Split the dataset according to the number of training clients
    self.dataset = Dataset(tf.keras.datasets.cifar10.load_data, split=clients_num)
def onRequestedCall(self):
    # List of storage we will work with
    mainStorage = self.storageController.getStorageSpace(14)
    qBox1Storage = self.storageController.getStorageSpace(13)
    qBox2Storage = self.storageController.getStorageSpace(12)

    # Creating and storing datasets as initialization
    dataset1 = Dataset(1, ONE_GIGA_BYTE)
    dataset2 = Dataset(2, ONE_GIGA_BYTE*10)
    dataset3 = Dataset(3, ONE_GIGA_BYTE*30)
    dataset4 = Dataset(4, ONE_GIGA_BYTE*30)
    dataset5 = Dataset(5, ONE_GIGA_BYTE*30)
    dataset6 = Dataset(6, ONE_GIGA_BYTE*30)
    dataset7 = Dataset(7, ONE_GIGA_BYTE*30)
    dataset8 = Dataset(8, ONE_GIGA_BYTE*30)
    mainStorage.storeDataset(dataset1)
    mainStorage.storeDataset(dataset2)
    mainStorage.storeDataset(dataset3)
    mainStorage.storeDataset(dataset4)
    mainStorage.storeDataset(dataset5)
    mainStorage.storeDataset(dataset6)
    mainStorage.storeDataset(dataset7)
    mainStorage.storeDataset(dataset8)

    # Moving the datasets and notifying on finish
    self.storageController.doDataTransfer(dataset1, mainStorage, [qBox1Storage])
    self.storageController.doDataTransfer(dataset2, mainStorage, [qBox2Storage])
    self.storageController.doDataTransfer(dataset3, mainStorage, [qBox2Storage])
    self.storageController.doDataTransfer(dataset4, mainStorage, [qBox2Storage])
    self.storageController.doDataTransfer(dataset5, mainStorage, [qBox2Storage])
    self.storageController.doDataTransfer(dataset6, mainStorage, [qBox2Storage])
    self.storageController.doDataTransfer(dataset7, mainStorage, [qBox2Storage])
    self.storageController.doDataTransfer(dataset6, mainStorage, [qBox1Storage])
    self.storageController.doDataTransfer(dataset7, mainStorage, [qBox1Storage])
    self.bs.notify_submission_finished()
def train(args, train_data_path):
    print("use_gpu:{}, NeuMF:{}, epochs:{}, batch_size:{}, num_factors:{}, num_neg:{}, lr:{}, model_dir:{}, layers:{}".format(
        args.use_gpu, args.NeuMF, args.epochs, args.batch_size, args.num_factors,
        args.num_neg, args.lr, args.model_dir, args.layers))

    dataset = Dataset(args.path + args.dataset)
    testRatings, testNegatives = dataset.testRatings, dataset.testNegatives

    train_data_generator = utils.Dataset()
    train_reader = fluid.io.batch(train_data_generator.train(train_data_path, True),
                                  batch_size=args.batch_size)

    inputs = utils.input_data(True)
    if args.GMF:
        model = GMF()
        loss, pred = model.net(inputs, args.num_users, args.num_items, args.num_factors)
    elif args.MLP:
        model = MLP()
        loss, pred = model.net(inputs, args.num_users, args.num_items, args.layers)
    elif args.NeuMF:
        model = NeuMF()
        loss, pred = model.net(inputs, args.num_users, args.num_items, args.num_factors, args.layers)

    optimizer = fluid.optimizer.AdamOptimizer(args.lr)
    optimizer.minimize(loss)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    loader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=args.batch_size, iterable=True)
    loader.set_sample_list_generator(train_reader, places=place)

    for epoch in range(args.epochs):
        for batch_id, data in enumerate(loader()):
            begin = time.time()
            loss_val = exe.run(program=fluid.default_main_program(),
                               feed=data,
                               fetch_list=[loss.name],
                               return_numpy=True)
            end = time.time()
            logger.info("epoch: {}, batch_id: {}, batch_time: {:.5f}s, loss: {:.5f}".format(
                epoch, batch_id, end - begin, np.array(loss_val)[0][0]))

        save_dir = "%s/epoch_%d" % (args.model_dir, epoch)
        feed_var_names = ["user_input", "item_input"]
        fetch_vars = [pred]
        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
def Main():
    parser = argparse.ArgumentParser(description='Dataset computing procedure. v1.0')
    parser.add_argument('-d', dest='dataset', type=str, required=True,
                        help='input dataset')
    args = parser.parse_args()
    dataset = Dataset.load(args.dataset)
    df = res_count(dataset) / (sum(res_count(dataset)['O'])) * 100
    df.columns = ['Helix', 'Strand', 'Coil', 'Overall']
    df = df.plot(kind='bar', rot=0, colormap='coolwarm_r', edgecolor='black')
    df.set(xlabel="Residues", ylabel="Residue Frequency (%)")
    df.yaxis.set_major_formatter(mtick.PercentFormatter())
    # df.plot(kind='bar', figsize=(6, 6), alpha=0.90, rot=0)
    plt.show()
def hanging_query_count(QY_dataset, start_time, end_time, params={}):
    """
    Count the number of hanging queries which started in the given time window.

    Args:
        QY_dataset: (Dataset) the query dataset
        start_time: (pd.Timestamp) the start time of the window
        end_time: (pd.Timestamp) the end time of the window
        params: (dict) optional parameters

    Returns:
        a Dataset of counts
    """
    if QY_dataset.raw:
        logging.error("Data must be preprocessed before counts")
        raise ValueError
    time_range = Time_Range(start_time, end_time)
    if time_range.start_column != "start_date":
        logging.error("Query data can only be binned by date")
        raise ValueError
    left_bool = QY_dataset.dataset["OpenDate"] <= time_range.start_time
    right_bool = QY_dataset.dataset["CloseDate"] > time_range.end_time
    in_range = QY_dataset.dataset[left_bool & right_bool]
    site_counts = in_range.groupby("Site")["STUDYID"].count()
    if "site_list" in params:
        site_counts = site_counts.reindex(params["site_list"], fill_value=0)
    site_count_frame = series_to_frame(site_counts, time_range, "hanging_query_count")
    out = Dataset(
        dataset=site_count_frame,
        params={
            "count": True,
            "raw": False,
            "time_format": time_range.type,
        },
    )
    return out
def main():
    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str, default='Data/ml-1m',
                        help='Path to the dataset')
    parser.add_argument('--epochs', type=int, default=128,
                        help='Number of training epochs')
    parser.add_argument('--embedding_dim', type=int, default=8,
                        help='Embedding dimensions, the first dimension will be '
                             'used for the bias.')
    parser.add_argument('--regularization', type=float, default=0.0,
                        help='L2 regularization for user and item embeddings.')
    parser.add_argument('--negatives', type=int, default=8,
                        help='Number of random negatives per positive example.')
    parser.add_argument('--learning_rate', type=float, default=0.001,
                        help='SGD step size.')
    parser.add_argument('--stddev', type=float, default=0.1,
                        help='Standard deviation for initialization.')
    args = parser.parse_args()

    # Load the dataset
    dataset = Dataset(args.data)
    train_pos_pairs = np.column_stack(dataset.trainMatrix.nonzero())
    test_ratings, test_negatives = (dataset.testRatings, dataset.testNegatives)
    print('Dataset: #user=%d, #item=%d, #train_pairs=%d, #test_pairs=%d' % (
        dataset.num_users, dataset.num_items, train_pos_pairs.shape[0], len(test_ratings)))

    # Initialize the model
    model = MFModel(dataset.num_users, dataset.num_items,
                    args.embedding_dim - 1, args.regularization, args.stddev)

    # Train and evaluate model
    hr, ndcg = evaluate(model, test_ratings, test_negatives, K=10)
    print('Epoch %4d:\t HR=%.4f, NDCG=%.4f\t' % (0, hr, ndcg))
    for epoch in range(args.epochs):
        # Training
        _ = model.fit(train_pos_pairs, learning_rate=args.learning_rate,
                      num_negatives=args.negatives)

        # Evaluation
        hr, ndcg = evaluate(model, test_ratings, test_negatives, K=10)
        print('Epoch %4d:\t HR=%.4f, NDCG=%.4f\t' % (epoch + 1, hr, ndcg))
def generateTimeExchangeData(self):
    self.is_generating_time_exchange_data = True
    i = 1
    for source_storage_key, source_storage in self.storage_dict.items():
        self.time_exchange_data_job_dict[source_storage_key] = TimeExchangeData(
            source_storage_key, source_storage.id, source_storage.id)
        self.time_exchange_data_job_dict[source_storage_key].time = 0
        dataset_id = "time_exchange_data" + str(i)
        ds = Dataset(dataset_id, 1000000)
        i += 1
        source_storage.addDataset(ds)
        for target_storage_key, target_storage in self.storage_dict.items():
            if target_storage_key != source_storage_key:
                self.moveDataset(dataset_id, source_storage_key, target_storage_key)
    self.is_generating_time_exchange_data = False
def Main():
    parser = argparse.ArgumentParser(
        description='Vincenzo 12/11/2019: SOV computation. v1.1')
    parser.add_argument('-d', dest='dataset', type=str, required=True,
                        help='dataset object to be loaded for the training')
    parser.add_argument('-t', '--type', dest='genre', type=str, required=True,
                        help='specify the type of prediction to be tested: SVM or GOR')
    args = parser.parse_args()
    sse_list = ['H', 'E', '-']
    dataset = Dataset.load(args.dataset)
    genre = args.genre
    per_structure_dict = sov_multi(dataset, genre)[1]
    for key in per_structure_dict.keys():
        print('%s : %s' % (key, per_structure_dict[key]))
    return print('Mean SOV : %s' % (sov_multi(dataset, genre)[0]))
def Main():
    parser = argparse.ArgumentParser(
        description='Vincenzo 21/11/2019: SVM output decoding. v1.1')
    parser.add_argument('-d', dest='dataset', type=str, required=True,
                        help='dataset object to be loaded for the encoding')
    parser.add_argument('-i', '--in', dest='infile', type=str, required=True,
                        help='specify an input file containing the predictions.')
    parser.add_argument('-o', '--out', dest='out', type=str, required=True,
                        help='specify an output file.')
    args = parser.parse_args()
    data = Dataset.load(args.dataset)
    infile = open(args.infile).readlines()
    prediction = testing(data, infile)
    prediction.dump(args.out + '.dat')
    return print('SVM decoding successfully saved at %r' % (args.out + '.dat'))
class Tests(unittest.TestCase):
    db = Dataset.loadFrom('medium20000_10_shuffled_0.3obstacles.pkl')

    def setUp(self):
        pass

    def test_1(self):
        user = User()
        user.createNetwork(model='custom_mlp:2,800,0,0.3')
        user.loadNetworkState("model_customMlp2_800_0_0.3__10by10___30percentsAccuracy.npz")
        saveDir = "games"
        # for i in [0, 1, 2]:  # range(Tests.db.X_test):
        for i in range(Tests.db.X_test.shape[0]):
            sample = Tests.db.X_test[i]
            world, start, goal = Dataset.greyScaleSampleToGridWorld(sample)
            found, blocked, records = user.play(world, start, goal)
            gameName = ""
            if found:
                gameName += "success"
            elif blocked:
                gameName += "blocked"
            else:
                gameName += "timeout"
            gameName += "_" + str(i).zfill(6)
            finalDir = saveDir + "/" + gameName + "/"
            # os.makedirs(finalDir)
            user.saveRecordsToFiles(world, start, goal, found, blocked, records, finalDir + gameName)
        # for i in range(Tests.db.X_train.shape[0]):
        #     user.predictSample(Tests.db.X_train[i])
        # world, start, goal = Dataset.greyScaleSampleToGridWorld(Tests.db.X_train[10])
        # user.predictSample(user.makeSample(world, start, goal))
        # user.predictSample(Tests.db.X_train[0])
        # user.predictSample(Tests.db.X_train[1])
        # user.predictSample(Tests.db.X_train[2])
        # user.predictSample(Tests.db.X_train[3])
        # user.predictSample(Tests.db.X_train[:2])
        # found, blocked, records = user.play(world, start, goal)
        # user.saveRecordsToFiles(world, start, goal, found, blocked, records, "games/test1/")
        pass
def dev():
    generator = Generator(10)
    print("Generating...")
    samples, labels = generator.generate(20000)
    print("Done generating. Shuffling...")
    Generator.shuffle_in_unison_scary(samples, labels)
    print("Done shuffling. Splitting...")
    db = Dataset()
    db.init(samples, labels, 17000, 1500)
    print("Done splitting. Saving...")
    fileName = 'medium20000_10_shuffled_0.3obstacles.pkl'
    db.saveTo(fileName)
    print("Done saving to", fileName)
def __init__(self, device):
    super(RawEmbedding, self).__init__(device=device)
    self.indexer = Indexer(special_tokens={'<s>': 0, '<unk>': 1, '<pad>': 2, '<\s>': 3, '<mask>': 4},
                           with_del_stopwords=self.with_del_stopwords)
    datasets = Dataset().get_instance()
    sentences = [pairs[0] for pairs in datasets['train']]
    self.indexer.count_word_in_text(sentences)
    self.indexer.add_sentences(sentences)
    self.embedding_dim = 100
    self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                  embedding_dim=self.embedding_dim,
                                  padding_idx=self.indexer.padding_index)
    self.embedding.to(device)
def test_network(FLAGS):
    x_test = Dataset.get_user("dataset/npy/", FLAGS.user)
    data = x_test.flatten("C")

    client = pyhe_client.HESealClient(
        FLAGS.hostname,
        FLAGS.port,
        FLAGS.batch_size,
        {FLAGS.tensor_name: (FLAGS.encrypt_data_str, data)},
    )

    results = np.round(client.get_results(), 2)
    y_pred_reshape = np.array(results).reshape(FLAGS.batch_size, 9)
    with np.printoptions(precision=3, suppress=True):
        print(y_pred_reshape)
    y_pred = y_pred_reshape.argmax(axis=1)
    print("y_pred", y_pred)
def fit(self, sess, datapath, batch_size=50, epoch=5, max_len=500):
    dataset = Dataset(sess, datapath, batch_size, '\t', max_len=max_len, epoch=epoch)
    sess.run(tf.initialize_all_variables())
    for steps, (c, ws, lens) in enumerate(dataset):
        feed = {
            self._x: ws,
            self._y: c,
            self._lens: lens,
            self._vocab: self._wv,
            self._dropout: self._dr
        }
        loss, acc, _ = sess.run([self._loss, self._acc, self._train_op], feed_dict=feed)
        yield steps, loss, acc
def play(self, world, start, goal, verbose=False):
    found = False
    blocked = False
    stepsCount = 0
    records = []
    while not found and not blocked and stepsCount < 30:
        if start.col == goal.col and start.line == goal.line:
            # if the goal has been reached
            found = True
            if verbose:
                print("Step", stepsCount, ": Reached the goal in", start)
        else:
            stepsCount += 1
            sample = Dataset.gridWorldToGreyScaleSample(world, start, goal)
            actions = self.predictSample(sample)
            chosenAction = None
            for direction in actions:
                nextPosition = self.getNextPosition(world, start, direction)
                if nextPosition is not None:
                    chosenAction = direction
                    break
            if (nextPosition is None) or (start.col == nextPosition.col and start.line == nextPosition.line):
                # if there is no valid next position, or if the next position is the same as the current position
                blocked = True
                if verbose:
                    print("Step", stepsCount, ": Blocked in", start)
            if verbose:
                print("Step", stepsCount, ": From", start, "move", chosenAction.name,
                      "to", nextPosition, "(goal is in ", goal, ")")
            records.append((start, chosenAction, nextPosition))
            start = nextPosition
    return found, blocked, records
def main():
    global X
    global Y
    global auth_to_id
    ds = Dataset.open('quora')
    # (X, Y) = ([x.split('.') for x in ds.X], ds.Y)
    # X = ([sum([len(filter(None, y.split(' '))) for y in x])/len(x) for x in X])
    # X = zip(X, [len(filter(None, x.split('\n'))) for x in ds.X])
    (X, Y) = (ds.X, ds.Y)
    mx = 0
    for auth in Y:
        if auth not in auth_to_id:
            auth_to_id[auth] = mx
            mx += 1
    getTagsforAll(X)
    # print [x for x in ds.X if len(filter(None, x.split('\n'))) > 1]
    # print [(x, y) for (x, y) in X if y > 1]
    # X = [[x, y] for (x, y) in X]
    Z = zip(X, Y)
    Z = pred_shuffle(Z)
    (X, Y) = zip(*Z)
    si = 0
    acc = 0.0
    cnt = 0
    while si < len(X):
        print "doing iteration ", cnt
        Xe = X[si:si+50]
        Ye = Y[si:si+50]
        X1 = X[:si] + X[si+50:]
        Y1 = Y[:si] + Y[si+50:]
        train, pred = gen_feature_vector(X1, Y1, Xe)
        acc += train_chunk(train, Y1, pred, Ye)
        cnt += 1
        si += 50
    print 'Accuracy: %f' % (acc/cnt)
def evaluateMAP(trace_link_candidates, k, dataset: Dataset, reverse_compare=False):
    """
    trace_link_candidates: List of TraceLink objects
    """
    if not trace_link_candidates:
        text = "No Trace Link candidates!"
        log.info(text)
        return 0, 0
    req_dict = _build_req_dict_for_map(trace_link_candidates, dataset, reverse_compare)
    map = Util.calculate_mean_average_precision(req_dict, k, dataset.num_reqs(), reverse_compare)
    return map, len(trace_link_candidates)
def Main():
    parser = argparse.ArgumentParser(
        description='Dataset computing procedure. v1.0')
    parser.add_argument('-d', dest='dataset', type=str, required=True,
                        help='input dataset')
    args = parser.parse_args()
    dataset = Dataset.load(args.dataset)
    df = res_count(dataset).T
    colors = plt.get_cmap('coolwarm_r')
    labels = ['Helix', 'Strand', 'Coil']
    temp = np.array((df['H'], df['E'], df['-']))
    sizes = [i for i in (temp / np.sum(temp)) * 100]
    # print sizes
    # create a circle for the center of the plot
    my_circle = plt.Circle((0, 0), 0.5, fc='white', edgecolor='black')
    # compute the pie plot and give color names
    fig1, ax1 = plt.subplots()
    ax1.set_prop_cycle(
        "color", [colors(1. * i / len(sizes)) for i in range(len(sizes))])
    plt.pie(df, wedgeprops={"edgecolor": "k",
                            'linewidth': 1,
                            'linestyle': 'solid',
                            'antialiased': True})
    plt.legend(loc='lower right',
               labels=['%s = %1.1f%%' % (l, s) for l, s in zip(labels, sizes)],
               prop={'size': 9},
               bbox_to_anchor=(1, 0),
               bbox_transform=fig1.transFigure)
    p = plt.gcf()
    ax1.axis('equal')
    # plt.title('Composition in Secondary Structure', fontname="Times New Roman", fontweight="bold")
    # add the white circle in the middle
    p.gca().add_artist(my_circle)
    plt.show()
def runITHSExperiment(k):
    print('run experiments for ' + Config.NETWORK)
    allFoldK = pickle.load(open('allFold_2019.pickle', "rb"))
    allTestData = Util.readData('sentences_all_2019.pickle')
    allTestData = allTestData.iloc[0:4000000, :]
    allSentences = pickle.load(open('sentences_2019.pickle', "rb"))
    [training_df, classes] = getFullTrainingData(allFoldK[k])
    test_df = allTestData
    sentences = allSentences[allSentences['id'].isin(training_df['SentenceId'])]
    training_df = training_df.rename(columns={'SentenceId': 'id'})
    training_df = pd.merge(training_df, sentences, on='id')
    training_df, validation_df = train_test_split(
        training_df,
        test_size=Config.TRAIN_TEST_RATIO,
        stratify=training_df[classes].values,
        random_state=42)
    dataset = Dataset(training_df, validation_df, test_df, classes, None, k, preTrain=False)
    model = Network()
    if os.path.isfile("model_iths.h5"):
        model.loadModel("model_iths.h5")
    else:
        model.build(dataset)
        model.run(dataset)
        model.saveModel("model_iths.h5")
    ids, preds = model.predict(dataset)
    weights = model.getActivationWeights(dataset)
    pickle.dump((ids, preds, weights), open('preds_all.pickle', "wb"))
def main():
    num_cases = 150
    num_learn = 1000
    bn = BayesianNet()
    print('Adjacency matrix: \n' + str(bn.dag))
    dag = np.zeros((5, 5))
    for i in range(num_learn):
        data = Dataset(bn, num_cases)
        dag += sl.k2(data.dataset, data.ordered_array, 2)
    for i in range(len(dag)):
        for j in range(len(dag)):
            if dag[i][j] < (num_learn / 2):
                dag[i][j] = 0
            else:
                dag[i][j] = 1
    print('Learned structure: \n' + str(dag))
def expandDataset(buf, label=-1, limit=''):
    list = []
    parts = buf.split(',')
    for part in parts:
        if part.endswith('.bents'):
            list.append(Dataset(part))
        else:
            for name in Dataset.names():
                if datasetWildcardMatch(part, name):
                    ds = Dataset(name)
                    ds.restrictTo(label)
                    ds.limitTo(limit)
                    list.append(ds)
    return list
def main():
    global X
    global Y
    ds = Dataset.open('quora')
    X, Y = ds.X, ds.Y
    # Z = [re.findall(r"[\w']+", x) for x in X]
    Z = [filter(None, x.split('.')) for x in X]
    Z = ["".join(s) for s in Z]
    Z = [z.split(' ') for z in Z]
    Z = [[len(s) for s in z] for z in Z]
    feature = []
    for a in Z:
        wordLenDist = [0] * 100
        for ln in a:
            wordLenDist[ln] += 1
        feature.append(wordLenDist)
    X = feature
    Z = zip(X, Y)
    shuffle(Z)
    (X, Y) = zip(*Z)
    # X = [i for i in range(len(X))]
    si = 0
    acc = 0.0
    cnt = 0
    while si < len(X):
        Xe = X[si:si+50]
        Ye = Y[si:si+50]
        X1 = X[:si] + X[si+50:]
        Y1 = Y[:si] + Y[si+50:]
        acc += train_chunk(X1, Y1, Xe, Ye)
        cnt += 1
        si += 50
    print 'Accuracy: %f' % (acc/cnt)
def load_MV_dataset(num):
    fin = open('data/Foreign_Movie_model_5_feature.txt', 'r')
    lines = fin.read().splitlines()
    data_num = len(lines) / 2
    dataset = Dataset()
    data_num = num
    for i in xrange(data_num):
        dataset.imageNameList.append(['data/Foreign_Movie_Face/' + lines[2*i]])
        dataset.rect.append([0, 0, 178, 218])
        dataset.imgID.append(1)
        dataset.feature.append(map(float, lines[2*i+1].split()))
        dataset.size += 1
    dataset.computeAffinity()
    dataset.computeQuality()
    return dataset
def get_benchmarking_data(self, idmodel, it=-1):
    """
    Returns a Dataset instance with the data used for learning up to iteration it
    """
    con = lite.connect(self.name)
    cues = []
    obs = []
    nobs = defaultdict(int)
    with con:
        con.row_factory = lite.Row
        cur = con.cursor()
        if it < 0:
            cur.execute("SELECT * FROM benchmark_data WHERE idmodel=:idmodel",
                        {'idmodel': idmodel})
        else:
            cur.execute("SELECT * FROM benchmark_data WHERE idmodel=:idmodel AND it<=:it",
                        {'idmodel': idmodel, 'it': it})
        rows = cur.fetchall()
        dataset = Dataset.from_db_rows(rows, self.setup)
    return dataset
def main():
    global X
    global Y
    ds = Dataset.open('quora')
    (X, Y) = (get_tagged_text(ds.X), ds.Y)
    XX = []
    YY = []
    for (auth, ans) in zip(Y, X):
        if len(nltk.word_tokenize(ans)) > 200:
            XX.append(ans)
            YY.append(auth)
    (X, Y) = (XX, YY)
    Z = zip(X, Y)
    shuffle(Z)
    (X, Y) = zip(*Z)
    Xe = X[-50:]
    Ye = Y[-50:]
    X = X[:-50]
    Y = Y[:-50]
    count_vect = CountVectorizer(input='content', ngram_range=(2, 3), min_df=0.2, max_df=1.0)
    X_train_counts = count_vect.fit_transform(X)
    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    clf = MultinomialNB().fit(X_train_tf, Y)
    clf2 = KNeighborsClassifier(n_neighbors=5).fit(X_train_tf, Y)
    X_new_counts = count_vect.transform(Xe)
    X_new_tfidf = tf_transformer.transform(X_new_counts)
    Yd = clf.predict(X_new_tfidf)
    istats(Y)
    print ''
    stats(Ye, Yd)
def test_1(self):
    user = User()
    user.createNetwork(model="custom_mlp:2,800,0,0.3")
    user.loadNetworkState("model_customMlp2_800_0_0.3__10by10___30percentsAccuracy.npz")
    saveDir = "games"
    # for i in [0, 1, 2]:  # range(Tests.db.X_test):
    for i in range(Tests.db.X_test.shape[0]):
        sample = Tests.db.X_test[i]
        world, start, goal = Dataset.greyScaleSampleToGridWorld(sample)
        found, blocked, records = user.play(world, start, goal)
        gameName = ""
        if found:
            gameName += "success"
        elif blocked:
            gameName += "blocked"
        else:
            gameName += "timeout"
        gameName += "_" + str(i).zfill(6)
        finalDir = saveDir + "/" + gameName + "/"
        # os.makedirs(finalDir)
        user.saveRecordsToFiles(world, start, goal, found, blocked, records, finalDir + gameName)
    # for i in range(Tests.db.X_train.shape[0]):
    #     user.predictSample(Tests.db.X_train[i])
    # world, start, goal = Dataset.greyScaleSampleToGridWorld(Tests.db.X_train[10])
    # user.predictSample(user.makeSample(world, start, goal))
    # user.predictSample(Tests.db.X_train[0])
    # user.predictSample(Tests.db.X_train[1])
    # user.predictSample(Tests.db.X_train[2])
    # user.predictSample(Tests.db.X_train[3])
    # user.predictSample(Tests.db.X_train[:2])
    # found, blocked, records = user.play(world, start, goal)
    # user.saveRecordsToFiles(world, start, goal, found, blocked, records, "games/test1/")
    pass
def runner(PATH_DATA, RATIO_TEST_DATA, RATIO_SPECIFICITY, RATIO_CONFIDENCE, EXPERIMENTS, fe, setting_name):
    results = []
    errors = Counter()
    qtypes = QuestionTypes()
    for e in range(1, EXPERIMENTS + 1):
        start = time.time()
        dataset = Dataset(PATH_DATA)
        dataset.load()
        invprob = InverseProbabilities(dataset)
        index = Index(invprob)
        train = [
            # (bow(fe, label, RATIO_SPECIFICITY, prob_filter=invprob) + bow(fe, text, prob_filter=invprob), label, mark)
            (bow(fe, text, RATIO_SPECIFICITY, prob_filter=invprob), label, mark)
            for text, label, mark in dataset.train()
        ]
        train = train * 4
        test = [
            (bow(fe, label, RATIO_SPECIFICITY, prob_filter=invprob), label, mark)
            # (bow(fe, text, RATIO_SPECIFICITY, prob_filter=invprob), label, mark)
            for text, label, mark in dataset.test() if mark
        ][:int(len(train) * RATIO_TEST_DATA)]
        test += [
            (bow(fe, label, RATIO_SPECIFICITY, prob_filter=invprob), label, mark)
            # (bow(fe, text, RATIO_SPECIFICITY, prob_filter=invprob), label, mark)
            for text, label, mark in dataset.test() if not mark
        ][:len(test)]
        for tbow, label, mark in train:
            index.update(tbow)
            index.add(label)
        tp, tn, fp, fn, prec, rec, f, duration = 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0
        marked = sum([1 for _, _, mark in test if mark])
        for tbow, label, mark in test:
            qtypes.increment(label)
            expectation = sum([
                invprob[w]
                for w in set(bow(fe, label, RATIO_SPECIFICITY, prob_filter=invprob))
            ])
            matches = index(tbow)
            if not matches and not mark:
                tn += 1
                continue
            elif not matches and mark:
                fn += 1
                errors[('fn', '', label)] += 1
                qtypes.update('fn', None, label)
                continue
            best_match = matches[0]
            guess = best_match[2]
            sim = best_match[0]
            ratio = sim / (expectation + 0.1)
            if ratio <= RATIO_CONFIDENCE:
                if not mark:
                    tn += 1
                    continue
                else:
                    fn += 1
                    errors[('fn', '', label)] += 1
                    qtypes.update('fn', None, label)
                    continue
            else:
                if mark and guess == label:
                    tp += 1
                else:
                    fp += 1
                    _qtype = '_'.join(guess.lower().split()[:2])
                    errors[('fp', guess, label)] += 1
                    qtypes.update('fp', guess, label)
        duration = time.time() - start
        if tp:
            prec = tp / float(tp + fp)
            rec = tp / float(tp + fn)
            f = f1(prec, rec)
        else:
            prec, rec, f = 0.0, 0.0, 0.0
        vector = (e, _r(tp), _r(tn), _r(fp), _r(fn), _r(prec), _r(rec), _r(f), _r(duration))
        results.append(vector)
        print '%d, tp: %d, tn: %d, fp: %d, fn: %d, all: %d, prec: %.2f, rec: %.2f, f1: %.2f, time=%.2f' % (
            e, tp, tn, fp, fn, sum([tp, tn, fp, fn]), prec, rec, f, duration)
        precs, recs, fs = zip(*results)[-4:-1]
        print e, avg(precs), avg(recs), avg(fs)
        print '---'
    if not results:
        return None
    cols = columns(results)
    columns_int = [avg(col) for col in cols[:4]]
    columns_float = [_r(avg(col)) for col in cols[4:]]
    summary_row = [tuple(['all'] + columns_int + columns_float)]
    create_folder(RESULTS_FOLDER)
    to_csv(
        RESULTS_KEYS + results + summary_row,
        '%ssecond_task.%s.results.csv' % (RESULTS_FOLDER, setting_name)
    )
    to_csv(
        [tuple([f] + list(key)) for key, f in errors.most_common()],
        '%ssecond_task.%s.errors.csv' % (RESULTS_FOLDER, setting_name)
    )
    to_csv(
        qtypes.dump(),
        '%ssecond_task.error.%s.question_types.csv' % (RESULTS_FOLDER, setting_name)
    )
    return summary_row[0]
folds = ['2-fold', '5-fold', 'N-fold']
for ds in alcohol_datasets:
    train_data_all = ds[0].data
    test_data = ds[1].data
    # Accuracy for 20%, 50%, 80% and 100% of the data.
    # Each subset will have
    train_accuracy = [[np.zeros(num_k_values), np.zeros(num_k_values), np.zeros(num_k_values)],
                      [np.zeros(num_k_values), np.zeros(num_k_values), np.zeros(num_k_values)],
                      [np.zeros(num_k_values), np.zeros(num_k_values), np.zeros(num_k_values)],
                      [np.zeros(num_k_values), np.zeros(num_k_values), np.zeros(num_k_values)]]
    best_k_and_ds = [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]
    for it in range(5):
        train_data_20, t = Dataset.getRandomPercent(train_data_all, 0.2)
        train_data_50, t = Dataset.getRandomPercent(train_data_all, 0.5)
        train_data_80, t = Dataset.getRandomPercent(train_data_all, 0.8)
        all_training_data = [train_data_20, train_data_50, train_data_80, train_data_all]
        # Only run on train_data_all once.
        if it > 0:
            all_training_data = all_training_data[:-1]
        for val in range(len(all_training_data)):
            for k in k_values:
                print str(it) + ": Training on: " + labels[val] + "for k value: " + str(k) + " for " + ds[0].name
                # Do 2-5-N Fold Cross Validation.
                cv_2 = Dataset.getkPartitions(all_training_data[val], 2)
                cv_5 = Dataset.getkPartitions(all_training_data[val], 5)
from Dataset import Dataset
from Method import Method
from Evaluation import Evaluation
from Result import Result
from Setting import Setting
import numpy as np

# --------------------------- run the exp ----------------------------
if 1:
    k = 5
    fold = 10
    dataset = Dataset('', '')
    dataset.file_folder_path = '../data/input/'
    method = Method('', '')
    method.k = k
    evaluation = Evaluation('')
    result = Result('', '')
    result.k = k
    setting = Setting('', '', dataset, method, result, evaluation)
    setting.fold = fold
    setting.load_classify_save()

if 1:
    fold = 10
    k = 5
    result = Result('', '')
                    log = '%s %s,' % (log, close_word)
                print(log)
    end = time.time()
    print('Training took %.2f sec\n' % (end - start))
    final_embeddings = normalized_embeddings.eval()
    return final_embeddings


if __name__ == '__main__':
    print('Loading the dataset... ')
    start = time.time()
    data = Dataset('Text8', reformatted=True, verbose=True)
    data, count, dictionary, reverse_dictionary = data.load()
    end = time.time()
    print('Loading the dataset took %.2f sec.\n' % (end - start))
    print('Most common words (+UNK)', count[:5])
    print('Sample data', data[:10])
    print('data:', [reverse_dictionary[di] for di in data[:8]])

    embedding_size = 128  # Dimension of the embedding vector.
    skip_window = 1       # How many words to consider left and right.
    num_skips = 2         # How many times to reuse an input to generate a label.
    valid_size = 16       # Random set of words to evaluate similarity on.
    valid_window = 100    # Only pick dev samples in the head of the distribution.
#!/usr/bin/python
# CIS 521 Homework 7: Learning Machine Learning
# Cory Rivera (rcor) and Sam Panzer (panzers)

from numpy import *
from Dataset import Dataset

d = Dataset("comp.sys.ibm.pc.hardware.txt", "rec.sport.baseball.txt", cutoff=10)
# d = Dataset("comp.sys.mac.hardware.txt", "comp.sys.ibm.pc.hardware.txt", cutoff=2000)
(Xtrain, Ytrain, Xtest, Ytest) = d.getTrainAndTestSets(0.8, seed=1)
wordlist = d.getWordList()


def trainNaiveBayes(X, Y):
    # First, count frequencies given the category.
    # Each row is a post, and each column is a word.
    # To count the number of words from every post, sum up the values from each
    # column for a given category.

    # Flattens Y so that it is easier to iterate over
    yFlat = Y.flatten()
    yPos = yFlat == 1
    yNeg = yFlat == -1

    # X.shape[1] returns number of columns for a given matrix
    numColumns = X.shape[1]

    # Indexing with a boolean array like yOne only checks indices that are True
def get_random_dataset(self, idmodel, nexp, max_stimuli=0, max_inhibitors=0):
    rows = self.get_population_rows(idmodel, max_stimuli, max_inhibitors)
    exps = random.sample(rows, nexp)
    dataset = Dataset.from_db_rows(exps, self.setup)
    return dataset
def load_dataset():
    db = Dataset.loadFrom('medium20000_10_shuffled_0.3obstacles.pkl')
    # db = Dataset.loadFrom('medium20000_10_shuffled.pkl')
    # db = Dataset.loadFrom('large200000_10_shuffled.pkl')
    return db.X_train, db.y_train, db.X_val, db.y_val, db.X_test, db.y_test
# Test code for detecting the heart region
import detectHeartRegion as dhr
from matplotlib import pyplot
from matplotlib import cm
from Dataset import Dataset

d = Dataset("C:\\Kaggle\\train\\27", "27")
d.load()
(num_slices, num_times, width, height) = d.images.shape
rois, circles = dhr.detect_heart_region(d.images)

# plot the ROI in each slice at time 0
numSlicesToDisplay = 10
pyplot.figure(1)
pyplot.subplots_adjust(left=0.1, hspace=0.1, wspace=0)
numslicesPerRow = 2
numRows = numSlicesToDisplay / numslicesPerRow
index = 1
for slice in range(numSlicesToDisplay):
    pyplot.subplot(numRows, 2 * numslicesPerRow, index)
    pyplot.imshow(d.images[slice][0], cmap=cm.Greys_r)
    index = index + 1
    pyplot.subplot(numRows, 2 * numslicesPerRow, index)
    pyplot.imshow(rois[slice], cmap=cm.Greys_r)
    index = index + 1