Example no. 1
def load_data(config, cache_byte_size, files_config_key, **kwargs):
  """
  :param Config config:
  :param int cache_byte_size:
  :param str files_config_key: such as "train" or "dev"
  :param kwargs: passed on to init_dataset() or init_dataset_via_str()
  :rtype: (Dataset,int)
  :returns: the dataset, and the cache byte size left over if we cache the whole dataset.
  """
  if not config.bool_or_other(files_config_key, None):
    return None, 0
  kwargs = kwargs.copy()
  kwargs.setdefault("name", files_config_key)
  if config.is_typed(files_config_key) and isinstance(config.typed_value(files_config_key), dict):
    config_opts = config.typed_value(files_config_key)
    assert isinstance(config_opts, dict)
    kwargs.update(config_opts)
    if 'cache_byte_size' not in config_opts:
      if kwargs.get('class', None) == 'HDFDataset':
        kwargs["cache_byte_size"] = cache_byte_size
    Dataset.kwargs_update_from_config(config, kwargs)
    data = init_dataset(kwargs)
  else:
    config_str = config.value(files_config_key, "")
    data = init_dataset_via_str(config_str, config=config, cache_byte_size=cache_byte_size, **kwargs)
  cache_leftover = 0
  if isinstance(data, HDFDataset):
    cache_leftover = data.definite_cache_leftover
  return data, cache_leftover
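A minimal usage sketch for the helper above, assuming `config` is an already-parsed Config object whose "train" entry names an HDF dataset or a dataset dict (the variable names here are hypothetical):

# Hypothetical call: `config` is assumed to exist and to contain a "train" entry.
train_data, cache_left = load_data(config, cache_byte_size=1024 ** 3, files_config_key="train")
if train_data is not None:
    print("train dataset: %r, cache bytes left over: %i" % (train_data, cache_left))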
Example no. 2
def load_data(config, cache_byte_size, files_config_key, **kwargs):
  """
  :type config: Config
  :type cache_byte_size: int
  :type files_config_key: str
  :param kwargs: passed on to init_dataset() or init_dataset_via_str()
  :rtype: (Dataset,int)
  :returns: the dataset, and the cache byte size left over if we cache the whole dataset.
  """
  if not config.has(files_config_key):
    return None, 0
  if config.is_typed(files_config_key) and isinstance(config.typed_value(files_config_key), dict):
    new_kwargs = config.typed_value(files_config_key)
    assert isinstance(new_kwargs, dict)
    kwargs.update(new_kwargs)
    if 'cache_byte_size' not in new_kwargs:
      if kwargs.get('class', None) == 'HDFDataset':
        kwargs["cache_byte_size"] = cache_byte_size
    Dataset.kwargs_update_from_config(config, kwargs)
    data = init_dataset(kwargs)
  else:
    config_str = config.value(files_config_key, "")
    data = init_dataset_via_str(config_str, config=config, cache_byte_size=cache_byte_size, **kwargs)
  cache_leftover = 0
  if isinstance(data, HDFDataset):
    cache_leftover = data.definite_cache_leftover
  return data, cache_leftover
Example no. 3
    def test_read_data_points(self):
        dataset = Dataset()
        dataset.read_data_points("flueaeg.txt")

        data = dataset.get_data

        self.assertEqual(data[100.0], 16.6)

        # here we should see an error printed
        dataset.read_data_points("findes-ikke.txt")
Example no. 4
def changeURLs(dataLines):

	changedUrlsLines = []

	dataset = Dataset(dataLines)

	for line in dataset.dataLines:

		columns = dataset.getColumns(line)
		columns['url'] = Brand.brandsUrls[ columns['brand'] ]
		changedUrlsLines.append( dataset.getLine(columns) )

	return changedUrlsLines
Example no. 5
def fixSizeTypes(dataLines):
		
	fixedDataLines = []

	dataset = Dataset(dataLines)

	for line in dataset.dataLines:

		columns = dataset.getColumns(line)
		columns['size_type'] = columns['size_type'].replace(" ", "_")
		fixedDataLines.append( dataset.getLine(columns) )

	return fixedDataLines
Example no. 6
 def __init__(self, id=None, drawing=None,
              posX=0, posY=0,
              x1=0, y1=0, x2=0, y2=0,
              pen=None, brush=None):
     Dataset.__init__(self, id)
     self.drawing = drawing
     self.posX = posX
     self.posY = posY
     self.x1 = x1
     self.y1 = y1
     self.x2 = x2
     self.y2 = y2
     self.pen = pen
     self.brush = brush
Example no. 7
 def train_set_loss_vars_for_cur_batches(self):
   """
   Called via Engine.SeqTrainParallelControl.
   """
   assert self.train_have_loss_for_cur_batches()
   # See EngineUtil.assign_dev_data for reference.
   from Dataset import Dataset
   n_time, n_batch = Dataset.index_shape_for_batches(self.train_batches)
   n_output_dim = self.output_layer.attrs['n_out']
   output_loss = numpy.zeros((n_batch,), "float32")
   output_hat_y = numpy.zeros((n_time, n_batch, n_output_dim), "float32")
   offset_slice = 0
   for batch in self.train_batches:
     for seq in batch.seqs:
       o = seq.batch_frame_offset
       q = seq.batch_slice + offset_slice
       l = seq.frame_length
       # input-data, input-index will also be set in this loop. That is data-key "data".
       for k in [self.output_target]:
         if l[k] == 0: continue
         loss, hat_y = self.get_loss_and_hat_y(seq.seq_idx)
         assert seq.seq_start_frame[k] < hat_y.shape[0]
         assert seq.seq_end_frame[k] <= hat_y.shape[0]
         output_loss[q] += loss * float(l[k]) / hat_y.shape[0]
         output_hat_y[o[k]:o[k] + l[k], q] = hat_y[seq.seq_start_frame[k]:seq.seq_end_frame[k]]
   self.output_var_loss.set_value(output_loss)
   self.output_var_hat_y.set_value(output_hat_y)
Example no. 8
def mergeSynonymousSizeTypes(dataLines):
		
	mergedDataLines = []

	dataset = Dataset(dataLines)

	for line in dataset.dataLines:

		columns = dataset.getColumns(line)

		if columns['size_type'] in SizeType.mergedSizeTypes:
			columns['size_type'] = SizeType.mergedSizeTypes[ columns['size_type'] ]

		mergedDataLines.append( dataset.getLine(columns) )

	return mergedDataLines
Example no. 9
def clone():
	print 'Enter name for new dataset:'
	dsname = raw_input()

	os.system('mkdir %s' % dsname)
	ds = Dataset()
	db = getDB()
	cur = db.cursor(MySQLdb.cursors.DictCursor)

	cur.execute("SELECT * FROM Answers WHERE isRetrieved=1")
	for row in cur.fetchall():
		ds.X.append(row['answer'])
		ds.Y.append(row['author'])
		ds.ts = max([ds.ts, row['updated_at']])

	with open('%s/data' % dsname, 'w') as f:
		cPickle.dump(ds, f)
	print 'Dataset cloned'
Example no. 10
def getXAtMaxIm(dataset: Dataset):
    data = dataset.getPlane()
    curMax = data[0][0]
    bestCycle = 0
    for cycle in range(len(data)):
        if max(data[cycle]) > curMax:
            curMax = max(data[cycle])
            bestCycle = cycle
    return bestCycle
Example no. 11
def doUpperCase(dataLines):
		
	upperCaseLines = []

	dataset = Dataset(dataLines)

	for line in dataset.dataLines:

		columns = dataset.getColumns(line)

		columns['size_type'] = columns['size_type'].upper()
		columns['label'] = columns['label'].upper()
		columns['brand'] = columns['brand'].upper()
		columns['clothe_category'] = columns['clothe_category'].upper()
		columns['size_category'] = columns['size_category'].upper()
		columns['gender'] = columns['gender'].upper()

		upperCaseLines.append( dataset.getLine(columns) )

	return upperCaseLines
Example no. 12
    def get_all_dataset(self, idmodel):
        con = lite.connect(self.name)

        with con:
            con.row_factory = lite.Row
            cur = con.cursor()
            cur.execute("SELECT * FROM dataset where idmodel=:idmodel", {'idmodel': idmodel})
                    
            rows = cur.fetchall()
            dataset = Dataset.from_db_rows(rows, self.setup)
            
        return dataset   
Example no. 13
def main():
	global X
	global Y

	ds = Dataset.open('quora')
	X,Y = ds.X,ds.Y


	# Z = [re.findall(r"[\w']+", x) for x in X]
	# Z = [filter(None, x.split('.')) for x in X]
	# Z = ["".join(s) for s in Z]
	# Z = [z.split(' ') for z in Z]
	# Z = [[len(s) for s in z] for z in Z]

	# feature = []
	# for a in Z:
	# 	wordLenDist = [0]*100
	# 	for ln in a:
	# 			wordLenDist[ln]+=1
	# 	feature.append(wordLenDist)

	feature = []
	tokenizer = RegexpTokenizer(r'\w+')
	for x in X:
		All = len(nltk.word_tokenize(x))
		numPunctuation = All - len(tokenizer.tokenize(x))
		numWords = All - numPunctuation
		ff = [numPunctuation, numWords]
		feature.append(ff)


	X = feature
	Z = zip(X, Y)
	shuffle(Z)
	(X, Y) = zip(*Z)


	si=0
	acc = 0.0
	cnt = 0
	while si<len(X):
		Xe = X[si:si+50]
		Ye = Y[si:si+50]
		X1 = X[:si] + X[si+50:]
		Y1 = Y[:si] + Y[si+50:]
		acc += train_chunk(X1, Y1, Xe, Ye)
		cnt += 1
		si += 50

	print 'Accuracy: %f' % (acc/cnt)
Example no. 14
def benchmark(lstm_unit, use_gpu):
  """
  :param str lstm_unit: e.g. "LSTMBlock", one of LstmCellTypes
  :param bool use_gpu:
  :return: runtime in seconds of the training itself, excluding initialization
  :rtype: float
  """
  device = {True: "GPU", False: "CPU"}[use_gpu]
  key = "%s:%s" % (device, lstm_unit)
  print(">>> Start benchmark for %s." % key)
  config = Config()
  config.update(make_config_dict(lstm_unit=lstm_unit, use_gpu=use_gpu))
  dataset_kwargs = config.typed_value("train")
  Dataset.kwargs_update_from_config(config, dataset_kwargs)
  dataset = init_dataset(dataset_kwargs)
  engine = Engine(config=config)
  engine.init_train_from_config(config=config, train_data=dataset)
  print(">>> Start training now for %s." % key)
  start_time = time.time()
  engine.train()
  runtime = time.time() - start_time
  print(">>> Runtime of %s: %s" % (key, hms_fraction(runtime)))
  engine.finalize()
  return runtime
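A hedged usage sketch for benchmark(), assuming the surrounding script provides make_config_dict and that "LSTMBlock" is one of the supported LstmCellTypes mentioned in the docstring:

# Hypothetical invocation: time a CPU run of the LSTMBlock unit.
runtime_cpu = benchmark("LSTMBlock", use_gpu=False)
print("CPU runtime: %.2f sec" % runtime_cpu)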
Example no. 15
def parse_csv_file(filename, optimized_indexes=[]):
    """
    Parse a csv file and return the corresponding dataset
    """
    data = []
    with open(filename, 'r') as csvFile:
        fields = csvFile.readline().rstrip().split(SEPARATOR)
        if len(optimized_indexes) != 0:
            fields = [
                fields[i] for i in range(len(fields)) if i in optimized_indexes
            ]

        for row in csvFile:
            data.append(parse_csv_line(row, optimized_indexes))
        return Dataset(fields, data)
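A small usage sketch, assuming SEPARATOR and parse_csv_line are defined in the same module and "measurements.csv" is a hypothetical file whose first row holds the column names:

# Hypothetical file name; only the columns at indexes 0 and 2 are kept.
dataset = parse_csv_file("measurements.csv", optimized_indexes=[0, 2])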
Example no. 16
 def train_network(path_to_db,
                   model,
                   frame_size,
                   rescale,
                   validation_set_percentage,
                   batch_size,
                   epochs,
                   optimizer,
                   loss='categorical_crossentropy',
                   metrics=['acc']):
     data_set = Dataset(path_to_db)
     train, validation, test = data_set.get_train_val_test_sets(
         size=frame_size,
         rescale=rescale,
         validaion_set_percentage=validation_set_percentage,
         batch_size=batch_size)
     # configure the model for training:
     model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
     history = model.fit_generator(train,
                                   steps_per_epoch=100,
                                   epochs=epochs,
                                   validation_data=validation,
                                   validation_steps=50)
     return history
Example no. 17
def load_HP_dataset(noise=False):
    fin = open(
        '/media/heyue/8d1c3fac-68d3-4428-af91-bc478fbdd541/ClusterResearch/clusterQNet/data/HP_model_5_feature.txt',
        'r')
    lines = fin.read().splitlines()
    data_num = len(lines) / 5
    dataset = Dataset()

    for i in xrange(data_num):
        dataset.imageNameList.append(['data/HP/' + lines[i * 5]])
        dataset.rect.append([0, 0, 100, 100])
        dataset.imgID.append(int(lines[i * 5 + 4]))
        dataset.feature.append(map(float, lines[i * 5 + 2].split()))
        dataset.size += 1
        if dataset.imgID[-1] == 1 and noise == False:
            dataset.imageNameList.pop()
            dataset.rect.pop()
            dataset.imgID.pop()
            dataset.feature.pop()
            dataset.size -= 1
    dataset.computeAffinity()
    #dataset.computeQuality()
    dataset.Quality = [1.0 for i in xrange(dataset.size)]
    return dataset
Example no. 18
def run(hidden, inNodes, output, dataset, iterations, percent):

    d = Dataset(dataset)

    # print d.trainTestSplit(percent)

    trainX, testX, trainy, testy = d.trainTestSplit(percent)

    # print trainy

    inNodes = trainX.shape[1]
    output = trainy.shape[1]

    network = Network(hidden, inNodes, output)

    network.train(trainX, trainy, iterations)

    print network

    predictionsTrain = network.predict(trainX)
    predictionsTest = network.predict(testX)

    print "Total Training Error:", network.getError(trainX, trainy)
    print "Total Testing Error:", network.getError(testX, testy)
Example no. 19
def mainAdaline():

    arquivoDTest = open("Dataset2/dtest2.txt", "r")
    arquivoXTest = open("Dataset2/xtest2.txt", "r")
    datasetTest = Dataset(arquivoDTest, arquivoXTest)

    arquivoDTrain = open("Dataset2/dtrain2.txt", "r")
    arquivoXTrain = open("Dataset2/xtrain2.txt", "r")
    datasetTrain = Dataset(arquivoDTrain, arquivoXTrain)

    redeAdalineTrain = Adaline(datasetTrain.definindoEntradas(),
                               datasetTrain.definindoValoresDesejados(),
                               taxa_aprendizado, precisao)

    redeAdalineAnd = Adaline([[1, 1], [0, 0], [1, 0], [0, 1]], [1, -1, -1, -1],
                             0.25, 0.0001)

    redeAdalineAnd.treinamento_online()

    # redeAdalineTrain.treinamento_online()

    redeAdalineAnd.conferirRespostas(redeAdalineAnd.x, redeAdalineAnd.w,
                                     redeAdalineAnd.w0, redeAdalineAnd.d,
                                     len(redeAdalineAnd.x))
Example no. 20
 def __init__(self,
              filename='./text8.zip',
              word_num=200,
              batch_size=8,
              skip_window=2,
              num_skips=2,
              embed_dim=10,
              epoch=100,
              lr=0.025,
              neg_cnt=5,
              outfile='./skip_gram',
              dictfile='./word_dict'):
     """Init this word2vec model"""
     # params about dataset
     self.batch_size = batch_size
     self.skip_window = skip_window
     self.num_skips = num_skips
     # params about skip gram
     self.embed_num = word_num
     self.embed_dim = embed_dim
     # params about learning
     self.epoch = epoch
     self.lr = lr
     self.neg_cnt = neg_cnt
     # dataset
     self.dataset = Dataset(filename, word_num)
     if (not os.path.exists(dictfile)):
         pickle.dump(self.dataset.word_dict, open(dictfile, 'wb'))
     # skip gram
     self.outfile = outfile
     if (os.path.exists(outfile)):
         self.skip_gram = pickle.load(open(self.outfile, 'rb'))
     else:
         self.skip_gram = SkipGram(word_num, embed_dim)
     # optimizer
     self.optimizer = optim.SGD(self.skip_gram.parameters(), lr=self.lr)
Example no. 21
    def train_one_batch(self, sess, x, y_, accuracy, train_step, train_feed_dict, test_feed_dict):

        tf.summary.scalar('accuracy', accuracy)
        merged = tf.summary.merge_all()
        sess.run(tf.global_variables_initializer())
        dataset = Dataset(input_file_path=self.data_path, max_sample_records=self.max_sample_records)

        # Collect full-trace profiling metadata for the sess.run calls below
        run_opts = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_opts_metadata = tf.RunMetadata()

        train_batches = dataset.get_batches(train=True)
        batch = next(train_batches)
        images, labels = process_data(batch)
        train_feed_dict[x] = images
        train_feed_dict[y_] = labels
        for epoch in range(self.epochs):
            train_step.run(feed_dict=train_feed_dict)
            train_summary, train_accuracy = sess.run([merged, accuracy], feed_dict=train_feed_dict,
                                                     options=run_opts, run_metadata=run_opts_metadata)
            test_summary, test_accuracy = sess.run([merged, accuracy], feed_dict=train_feed_dict,
                                                   options=run_opts, run_metadata=run_opts_metadata)
            message = "epoch: {0}, training accuracy: {1}, validation accuracy: {2}"
            print(message.format(epoch, train_accuracy, test_accuracy))
Example no. 23
def getDataset(data, path, evalMode):
    # if data in ["ml-1m", "yelp", "pinterest-20"]:
    if data in ["brightkite", "fsq11", "yelp"]:
        columns = ['uid', 'iid', 'rating', 'hour', 'day', 'month', 'timestamp']
        train = pd.read_csv(path + "data/%sTrain" % data, names=columns, sep="\t")
        test = pd.read_csv(path + "data/%sTest" % data, names=columns, sep="\t")
        df = train.append(test)
        df.sort_values(["uid", "timestamp"], inplace=True)
        dataset = Dataset(df, evalMode)

    elif data in ["ml-1m", "yelp-he"]:
        names = ["uid", "iid", "rating", "timestamp"]
        data = "yelp" if data == "yelp-he" else data
        train = pd.read_csv(path + "data/%s.train.rating" % data, sep="\t", names=names)
        test = pd.read_csv(path + "data/%s.test.rating" % data, sep="\t", names=names)
        df = train.append(test)
        dataset = Dataset(df, evalMode)

    elif data in ["beauty", "steam", "video", "ml-sas"]:
        names = ["uid", "iid"]
        if data == "beauty":
            df = pd.read_csv(path + "data/Beauty.txt", sep=" ", names=names)
        elif data == "steam":
            df = pd.read_csv(path + "data/Steam.txt", sep=" ", names=names)
        elif data == "video":
            df = pd.read_csv(path + "data/Video.txt", sep=" ", names=names)
        else:
            df = pd.read_csv(path + "data/ml-1m.txt", sep=" ", names=names)
        dataset = Dataset(df, evalMode)

    elif data == "test":
        columns = ["uid", "timestamp", "lat", "lng", "iid"]
        df = pd.read_csv(path + "data/brightkite.txt", names=columns, sep="\t", nrows=10000)
        dataset = Dataset(df, evalMode)

    return dataset
Example no. 24
    def __init__(self, data_path, train_val_test=(0.8, 0.1, 0.1)):
        """
        Initialises a 'YOLO_Dataset' object by calling the superclass initialiser.

        The difference between a YOLO_Dataset object and a Dataset object is the annotation.
        The YOLO_Dataset object will therefore override the self.annotations_path and
        self.annotation_list attributes such that the building labels are in XML format.
        """
        assert (train_val_test[0] + train_val_test[1] + train_val_test[2]
                ) == 1, 'Train, val and test percentages should add to 1'
        assert train_val_test[0] > 0 and train_val_test[
            1] > 0 and train_val_test[
                2] > 0, 'Train, val and test percentages should be positive'

        Dataset.__init__(self, data_path)

        self.train_val_test = train_val_test
        self.train_path = self.data_path + '/yolo/train'
        self.val_path = self.data_path + '/yolo/val'
        self.test_path = self.data_path + '/yolo/test'

        if not os.path.isdir(self.data_path + '/yolo'):
            print(f"Creating directory to store YOLO formatted dataset.")
            os.mkdir(self.data_path + '/yolo')

        # Create train, validation, test directories, each with an images and annotations
        # sub-directory
        for directory in [self.train_path, self.val_path, self.test_path]:
            if not os.path.isdir(directory):
                os.mkdir(directory)

            if not os.path.isdir(directory + '/images'):
                os.mkdir(directory + '/images')

            if not os.path.isdir(directory + '/annotations'):
                os.mkdir(directory + '/annotations')
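A usage sketch for the initialiser above, assuming the class is called YOLO_Dataset as in its docstring and that '/path/to/data' is a hypothetical dataset root:

# Hypothetical path; the split directories are created under <data_path>/yolo if missing.
yolo_ds = YOLO_Dataset('/path/to/data', train_val_test=(0.8, 0.1, 0.1))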
Example no. 25
    def __init__(self, input_shape, num_classes, learning_rate, clients_num):
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)

        # Call the create function to build the computational graph of AlexNet
        net = AlexNet(input_shape, num_classes, learning_rate, self.graph)
        self.model = FedModel(*net)

        # initialize
        with self.graph.as_default():
            self.sess.run(tf.global_variables_initializer())

        # Load Cifar-10 dataset
        # NOTE: len(self.dataset.train) == clients_num
        self.dataset = Dataset(tf.keras.datasets.cifar10.load_data,
                               split=clients_num)
Example no. 26
    def __init__(self, input_shape, num_classes, learning_rate, clients_num):
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)

        # Build the AlexNet network
        net = AlexNet(input_shape, num_classes, learning_rate, self.graph)
        #net = vgg_net(input_shape, num_classes, learning_rate, self.graph)
        self.model = FedModel(*net)

        # Initialize
        with self.graph.as_default():
            self.sess.run(tf.global_variables_initializer())

        # Load the data
        # Split the dataset according to the number of training clients
        self.dataset = Dataset(tf.keras.datasets.cifar10.load_data,split=clients_num)
Example no. 27
    def onRequestedCall(self):
        # List of storage we will work with
        mainStorage = self.storageController.getStorageSpace(14)
        qBox1Storage = self.storageController.getStorageSpace(13)
        qBox2Storage = self.storageController.getStorageSpace(12)

        # Creating and storing a dataset as initialization
        dataset1 = Dataset(1, ONE_GIGA_BYTE)
        dataset2 = Dataset(2, ONE_GIGA_BYTE*10)
        dataset3 = Dataset(3, ONE_GIGA_BYTE*30)
        dataset4 = Dataset(4, ONE_GIGA_BYTE*30)
        dataset5 = Dataset(5, ONE_GIGA_BYTE*30)
        dataset6 = Dataset(6, ONE_GIGA_BYTE*30)
        dataset7 = Dataset(7, ONE_GIGA_BYTE*30)
        dataset8 = Dataset(8, ONE_GIGA_BYTE*30)
        mainStorage.storeDataset(dataset1)
        mainStorage.storeDataset(dataset2)
        mainStorage.storeDataset(dataset3)
        mainStorage.storeDataset(dataset4)
        mainStorage.storeDataset(dataset5)
        mainStorage.storeDataset(dataset6)
        mainStorage.storeDataset(dataset7)
        mainStorage.storeDataset(dataset8)

        # Moving the dataset and notify on finish
        self.storageController.doDataTransfer(dataset1, mainStorage, [qBox1Storage])

        self.storageController.doDataTransfer(dataset2, mainStorage, [qBox2Storage])

        self.storageController.doDataTransfer(dataset3, mainStorage, [qBox2Storage])

        self.storageController.doDataTransfer(dataset4, mainStorage, [qBox2Storage])

        self.storageController.doDataTransfer(dataset5, mainStorage, [qBox2Storage])

        self.storageController.doDataTransfer(dataset6, mainStorage, [qBox2Storage])

        self.storageController.doDataTransfer(dataset7, mainStorage, [qBox2Storage])

        self.storageController.doDataTransfer(dataset6, mainStorage, [qBox1Storage])

        self.storageController.doDataTransfer(dataset7, mainStorage, [qBox1Storage])

        self.bs.notify_submission_finished()
Example no. 28
def train(args, train_data_path):
    print("use_gpu:{}, NeuMF:{}, epochs:{}, batch_size:{}, num_factors:{}, num_neg:{}, lr:{}, model_dir:{}, layers:{}".format(
        args.use_gpu, args.NeuMF, args.epochs, args.batch_size, args.num_factors, args.num_neg, args.lr, args.model_dir, args.layers))
    dataset = Dataset(args.path + args.dataset)
    testRatings, testNegatives = dataset.testRatings, dataset.testNegatives

    train_data_generator = utils.Dataset()
    train_reader = fluid.io.batch(train_data_generator.train(train_data_path, True), batch_size=args.batch_size)
    
    inputs = utils.input_data(True)
    if args.GMF:
        model = GMF()
        loss, pred = model.net(inputs, args.num_users, args.num_items, args.num_factors)
    elif args.MLP:
        model = MLP()
        loss, pred = model.net(inputs, args.num_users, args.num_items, args.layers)
    elif args.NeuMF:
        model = NeuMF()
        loss, pred = model.net(inputs, args.num_users, args.num_items, args.num_factors, args.layers)

    optimizer = fluid.optimizer.AdamOptimizer(args.lr)
    optimizer.minimize(loss)
    
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    loader = fluid.io.DataLoader.from_generator(
        feed_list=inputs, capacity=args.batch_size, iterable=True)
    loader.set_sample_list_generator(train_reader, places=place)
    
    for epoch in range(args.epochs):

        for batch_id, data in enumerate(loader()):
            begin = time.time()
            loss_val = exe.run(program=fluid.default_main_program(),
                    feed=data,
                    fetch_list=[loss.name],
                    return_numpy=True)
            end = time.time()
            logger.info("epoch: {}, batch_id: {}, batch_time: {:.5f}s, loss: {:.5f}".format(epoch, batch_id, end - begin, np.array(loss_val)[0][0]))

        save_dir = "%s/epoch_%d" % (args.model_dir, epoch)
        feed_var_names = ["user_input", "item_input"]
        fetch_vars = [pred]
        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
Example no. 29
def Main():
    parser = argparse.ArgumentParser(description='Dataset computing procedure. v1.0')
    parser.add_argument('-d',dest='dataset',type=str,required=True,\
        help='input dataset')

    args = parser.parse_args()

    dataset = Dataset.load(args.dataset)

    df = res_count(dataset)/(sum(res_count(dataset)['O']))*100
    df.columns = ['Helix','Strand','Coil','Overall']
    df = df.plot(kind='bar', rot=0, colormap='coolwarm_r', edgecolor='black')
    df.set(xlabel="Residues", ylabel="Residue Frequency (%)")
    df.yaxis.set_major_formatter(mtick.PercentFormatter())

    #df.plot(kind='bar',figsize=(6,6),alpha=0.90,rot=0)
    plt.show()
Example no. 30
def hanging_query_count(QY_dataset, start_time, end_time, params={}):
    """
    Count the number of hanging queries which started in the given time window.

    Args:
        QY_dataset: (Dataset) the query dataset
        start_time: (pd.Timestamp) the start time of the window
        end_time: (pd.Timestamp) the end time of the window
        params: (dict) optional parameters

    Returns:
        a Dataset of counts

    """
    if QY_dataset.raw:
        logging.error("Data must be preprocessed before counts")
        raise ValueError

    time_range = Time_Range(start_time, end_time)

    if time_range.start_column != "start_date":
        logging.error("Query data can only be binned by date")
        raise ValueError

    left_bool = QY_dataset.dataset["OpenDate"] <= time_range.start_time
    right_bool = QY_dataset.dataset["CloseDate"] > time_range.end_time

    in_range = QY_dataset.dataset[left_bool & right_bool]

    site_counts = in_range.groupby("Site")["STUDYID"].count()

    if "site_list" in params:
        site_counts = site_counts.reindex(params["site_list"], fill_value=0)

    site_count_frame = series_to_frame(site_counts, time_range,
                                       "hanging_query_count")
    out = Dataset(
        dataset=site_count_frame,
        params={
            "count": True,
            "raw": False,
            "time_format": time_range.type
        },
    )
    return out
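A usage sketch for hanging_query_count(), assuming pandas is imported as pd and qy_dataset is a hypothetical, already-preprocessed query Dataset (so QY_dataset.raw is False):

# Hypothetical one-week window; "site_list" zero-fills sites with no hanging queries.
counts = hanging_query_count(qy_dataset,
                             start_time=pd.Timestamp("2020-01-01"),
                             end_time=pd.Timestamp("2020-01-08"),
                             params={"site_list": ["site_a", "site_b"]})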
Example no. 31
def main():
  # Command line arguments
  parser = argparse.ArgumentParser()
  parser.add_argument('--data', type=str, default='Data/ml-1m',
                      help='Path to the dataset')
  parser.add_argument('--epochs', type=int, default=128,
                      help='Number of training epochs')
  parser.add_argument('--embedding_dim', type=int, default=8,
                      help='Embedding dimensions, the first dimension will be '
                           'used for the bias.')
  parser.add_argument('--regularization', type=float, default=0.0,
                      help='L2 regularization for user and item embeddings.')
  parser.add_argument('--negatives', type=int, default=8,
                      help='Number of random negatives per positive examples.')
  parser.add_argument('--learning_rate', type=float, default=0.001,
                      help='SGD step size.')
  parser.add_argument('--stddev', type=float, default=0.1,
                      help='Standard deviation for initialization.')
  args = parser.parse_args()

  # Load the dataset
  dataset = Dataset(args.data)
  train_pos_pairs = np.column_stack(dataset.trainMatrix.nonzero())
  test_ratings, test_negatives = (dataset.testRatings, dataset.testNegatives)
  print('Dataset: #user=%d, #item=%d, #train_pairs=%d, #test_pairs=%d' % (
      dataset.num_users, dataset.num_items, train_pos_pairs.shape[0],
      len(test_ratings)))

  # Initialize the model
  model = MFModel(dataset.num_users, dataset.num_items,
                  args.embedding_dim-1, args.regularization, args.stddev)

  # Train and evaluate model
  hr, ndcg = evaluate(model, test_ratings, test_negatives, K=10)
  print('Epoch %4d:\t HR=%.4f, NDCG=%.4f\t'
        % (0, hr, ndcg))
  for epoch in range(args.epochs):
    # Training
    _ = model.fit(train_pos_pairs, learning_rate=args.learning_rate,
                  num_negatives=args.negatives)

    # Evaluation
    hr, ndcg = evaluate(model, test_ratings, test_negatives, K=10)
    print('Epoch %4d:\t HR=%.4f, NDCG=%.4f\t'
          % (epoch+1, hr, ndcg))
Example no. 32
    def generateTimeExchangeData(self):
        self.is_generating_time_exchange_data = True
        i = 1

        for source_storage_key, source_storage in self.storage_dict.items():
            self.time_exchange_data_job_dict[source_storage_key] = TimeExchangeData(source_storage_key, source_storage.id, source_storage.id)
            self.time_exchange_data_job_dict[source_storage_key].time = 0

            dataset_id = "time_exchange_data" + str(i)
            ds = Dataset(dataset_id, 1000000)
            i += 1
            source_storage.addDataset(ds)

            for target_storage_key, target_storage in self.storage_dict.items():
                if target_storage_key != source_storage_key:
                    self.moveDataset(dataset_id, source_storage_key, target_storage_key)

        self.is_generating_time_exchange_data = False
Example no. 33
def Main():
    parser = argparse.ArgumentParser(
        description='Vincenzo 12/11/2019: SOV computation. v1.1')
    parser.add_argument('-d',dest='dataset',type=str,required=True,\
        help='dataset object to be loaded for the training')
    parser.add_argument('-t','--type',dest='genre',type=str,required=True,\
        help='specify the type of prediction to be tested: SVM or GOR')

    args = parser.parse_args()

    sse_list = ['H', 'E', '-']
    dataset = Dataset.load(args.dataset)
    genre = args.genre

    per_structure_dict = sov_multi(dataset, genre)[1]
    for key in per_structure_dict.keys():
        print('%s : %s' % (key, per_structure_dict[key]))
    return print('Mean SOV : %s' % (sov_multi(dataset, genre)[0]))
Example no. 34
def Main():
    parser = argparse.ArgumentParser(
        description='Vincenzo 21/11/2019: SVM output decoding. v1.1')
    parser.add_argument('-d',dest='dataset',type=str,required=True,\
        help='dataset object to be loaded for the encoding')
    parser.add_argument('-i','--in',dest='infile',type=str,required=True,\
        help='specify an input file containing the predictions.')
    parser.add_argument('-o','--out',dest='out',type=str,required=True,\
        help='specify an output file.')

    args = parser.parse_args()
    data = Dataset.load(args.dataset)
    infile = open(args.infile).readlines()

    prediction = testing(data, infile)

    prediction.dump(args.out + '.dat')
    return print('SVM-decoding successfully saved at %r' % (args.out + '.dat'))
Example no. 35
class Tests(unittest.TestCase):
    db = Dataset.loadFrom('medium20000_10_shuffled_0.3obstacles.pkl')

    def setUp(self):
        pass

    def test_1(self):
        user = User()
        user.createNetwork(model='custom_mlp:2,800,0,0.3')
        user.loadNetworkState(
            "model_customMlp2_800_0_0.3__10by10___30percentsAccuracy.npz")

        saveDir = "games"
        # for i in [0, 1, 2]: #range(Tests.db.X_test):
        for i in range(Tests.db.X_test.shape[0]):
            sample = Tests.db.X_test[i]
            world, start, goal = Dataset.greyScaleSampleToGridWorld(sample)
            found, blocked, records = user.play(world, start, goal)
            gameName = ""
            if found:
                gameName += "success"
            elif blocked:
                gameName += "blocked"
            else:
                gameName += "timeout"
            gameName += "_" + str(i).zfill(6)
            finalDir = saveDir + "/" + gameName + "/"
            # os.makedirs(finalDir)
            user.saveRecordsToFiles(world, start, goal, found, blocked,
                                    records, finalDir + gameName)

        # for i in range(Tests.db.X_train.shape[0]):
        #     user.predictSample(Tests.db.X_train[i])

        # world, start, goal = Dataset.greyScaleSampleToGridWorld(Tests.db.X_train[10])
        # user.predictSample(user.makeSample(world, start, goal))
        # user.predictSample(Tests.db.X_train[0])
        # user.predictSample(Tests.db.X_train[1])
        # user.predictSample(Tests.db.X_train[2])
        # user.predictSample(Tests.db.X_train[3])
        # user.predictSample(Tests.db.X_train[:2])
        # found, blocked, records = user.play(world, start, goal)
        # user.saveRecordsToFiles(world, start, goal, found, blocked, records, "games/test1/")
        pass
Example no. 36
def dev():
    generator = Generator(10)
    print("Generating...")
    samples, labels = generator.generate(20000)
    print("Done generating. Shuffling...")
    Generator.shuffle_in_unison_scary(samples, labels)
    print("Done shuffling. Splitting...")
    db = Dataset()
    db.init(samples, labels, 17000, 1500)
    print("Done splitting. Saving...")
    fileName = 'medium20000_10_shuffled_0.3obstacles.pkl'
    db.saveTo(fileName)
    print("Done saving to", fileName)
Example no. 37
 def __init__(self, device):
     super(RawEmbedding, self).__init__(device=device)
     self.indexer = Indexer(special_tokens={
         '<s>': 0,
         '<unk>': 1,
         '<pad>': 2,
         '<\s>': 3,
         '<mask>': 4
     },
                            with_del_stopwords=self.with_del_stopwords)
     datasets = Dataset().get_instance()
     sentences = [pairs[0] for pairs in datasets['train']]
     self.indexer.count_word_in_text(sentences)
     self.indexer.add_sentences(sentences)
     self.embedding_dim = 100
     self.embedding = nn.Embedding(num_embeddings=len(self.indexer),
                                   embedding_dim=self.embedding_dim,
                                   padding_idx=self.indexer.padding_index)
     self.embedding.to(device)
Example no. 38
def test_network(FLAGS):
    x_test = Dataset.get_user("dataset/npy/", FLAGS.user)
    data = x_test.flatten("C")

    client = pyhe_client.HESealClient(
        FLAGS.hostname,
        FLAGS.port,
        FLAGS.batch_size,
        {FLAGS.tensor_name: (FLAGS.encrypt_data_str, data)},
    )

    results = np.round(client.get_results(), 2)

    y_pred_reshape = np.array(results).reshape(FLAGS.batch_size, 9)
    with np.printoptions(precision=3, suppress=True):
        print(y_pred_reshape)

    y_pred = y_pred_reshape.argmax(axis=1)
    print("y_pred", y_pred)
Example no. 39
 def fit(self, sess, datapath, batch_size=50, epoch=5, max_len=500):
     dataset = Dataset(sess,
                       datapath,
                       batch_size,
                       '\t',
                       max_len=max_len,
                       epoch=epoch)
     sess.run(tf.initialize_all_variables())
     for steps, (c, ws, lens) in enumerate(dataset):
         feed = {
             self._x: ws,
             self._y: c,
             self._lens: lens,
             self._vocab: self._wv,
             self._dropout: self._dr
         }
         loss, acc, _ = sess.run([self._loss, self._acc, self._train_op],
                                 feed_dict=feed)
         yield steps, loss, acc
Example no. 40
 def play(self, world, start, goal, verbose=False):
     found = False
     blocked = False
     stepsCount = 0
     records = []
     while not found and not blocked and stepsCount < 30:
         if start.col == goal.col and start.line == goal.line:  # if the goal has been reached
             found = True
             if verbose:
                 print("Step", stepsCount, ": Reached the goal in", start)
         else:
             stepsCount += 1
             sample = Dataset.gridWorldToGreyScaleSample(world, start, goal)
             actions = self.predictSample(sample)
             chosenAction = None
             for direction in actions:
                 nextPosition = self.getNextPosition(world, start, direction)
                 if nextPosition is not None:
                     chosenAction = direction
                     break
             if (nextPosition is None) or (
                 start.col == nextPosition.col and start.line == nextPosition.line
             ):  # if there is not any valid next position, or if next position is the same as current position
                 blocked = True
                 if verbose:
                     print("Step", stepsCount, ": Blocked in", start)
              elif verbose:
                 print(
                     "Step",
                     stepsCount,
                     ": From",
                     start,
                     "move",
                     chosenAction.name,
                     "to",
                     nextPosition,
                     "(goal is in ",
                     goal,
                     ")",
                 )
             records.append((start, chosenAction, nextPosition))
             start = nextPosition
     return found, blocked, records
Example no. 41
def main():
	global X
	global Y
	global auth_to_id

	ds = Dataset.open('quora')
	# (X, Y) = ([x.split('.') for x in ds.X], ds.Y)
	#X = ([sum([len( filter(None, y.split(' ')) ) for y in x])/len(x) for x in X])
	#X = zip(X, [len( filter(None, x.split('\n')) ) for x in ds.X])
	(X, Y) = (ds.X, ds.Y)

	mx = 0
	for auth in Y:
		if auth not in auth_to_id:
			auth_to_id[auth] = mx
			mx+=1

	getTagsforAll(X)
	# print [x for x in ds.X if len( filter(None, x.split('\n')) ) > 1]

	# print [(x, y) for (x, y) in X if y > 1]
	#X = [[x, y] for (x, y) in X]



	Z = zip(X, Y)
	Z = pred_shuffle(Z)
	(X, Y) = zip(*Z)
	si=0
	acc = 0.0
	cnt = 0
	while si<len(X):
		print "doing iteration ", cnt
		Xe = X[si:si+50]
		Ye = Y[si:si+50]
		X1 = X[:si] + X[si+50:]
		Y1 = Y[:si] + Y[si+50:]
		train, pred  = gen_feature_vector(X1, Y1, Xe)
		acc += train_chunk(train, Y1, pred, Ye)
		cnt += 1
		si += 50

	print 'Accuracy: %f' % (acc/cnt)
Example no. 42
def evaluateMAP(trace_link_candidates,
                k,
                dataset: Dataset,
                reverse_compare=False):
    """
    trace_link_candidates: List of TraceLink-objects
    """
    if not trace_link_candidates:
        text = "No Trace Link candidates!"
        log.info(text)
        return 0, 0

    req_dict = _build_req_dict_for_map(trace_link_candidates, dataset,
                                       reverse_compare)
    map = Util.calculate_mean_average_precision(req_dict, k,
                                                dataset.num_reqs(),
                                                reverse_compare)

    return map, len(trace_link_candidates)
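A usage sketch for evaluateMAP(), assuming candidates is a hypothetical list of TraceLink objects and dataset is the Dataset they were computed against:

# Hypothetical call: mean average precision over the top-10 candidates per requirement.
map_score, num_candidates = evaluateMAP(candidates, k=10, dataset=dataset)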
Example no. 43
def Main():
    parser = argparse.ArgumentParser(
        description='Dataset computing procedure. v1.0')
    parser.add_argument('-d',dest='dataset',type=str,required=True,\
        help='input dataset')

    args = parser.parse_args()

    dataset = Dataset.load(args.dataset)

    df = res_count(dataset).T
    colors = plt.get_cmap('coolwarm_r')
    labels = ['Helix', 'Strand', 'Coil']
    temp = np.array((df['H'], df['E'], df['-']))
    sizes = [i for i in (temp / np.sum(temp)) * 100]
    #print sizes
    # create a circle for the center of the plot
    my_circle = plt.Circle((0, 0), 0.5, fc='white', edgecolor='black')
    # compute the pie plot and give the color names
    fig1, ax1 = plt.subplots()
    ax1.set_prop_cycle(
        "color", [colors(1. * i / len(sizes)) for i in range(len(sizes))])
    plt.pie(df,
            wedgeprops={
                "edgecolor": "k",
                'linewidth': 1,
                'linestyle': 'solid',
                'antialiased': True
            })
    plt.legend(loc='lower right',
               labels=['%s = %1.1f%%' % (l, s) for l, s in zip(labels, sizes)],
               prop={'size': 9},
               bbox_to_anchor=(1, 0),
               bbox_transform=fig1.transFigure)
    p = plt.gcf()

    ax1.axis('equal')
    #plt.title('Composition in Secondary Structure', fontname="Times New Roman", fontweight="bold")

    # add the white circle in the middle
    p.gca().add_artist(my_circle)
    plt.show()
Example no. 44
def runITHSExperiment(k):

    print('run experiments for ' + Config.NETWORK)

    allFoldK = pickle.load(open('allFold_2019.pickle', "rb"))
    allTestData = Util.readData('sentences_all_2019.pickle')
    allTestData = allTestData.iloc[0:4000000, :]
    allSentences = pickle.load(open('sentences_2019.pickle', "rb"))

    [training_df, classes] = getFullTrainingData(allFoldK[k])
    test_df = allTestData
    sentences = allSentences[allSentences['id'].isin(
        training_df['SentenceId'])]
    training_df = training_df.rename(columns={'SentenceId': 'id'})
    training_df = pd.merge(training_df, sentences, on='id')

    training_df, validation_df = train_test_split(
        training_df,
        test_size=Config.TRAIN_TEST_RATIO,
        stratify=training_df[classes].values,
        random_state=42)

    dataset = Dataset(training_df,
                      validation_df,
                      test_df,
                      classes,
                      None,
                      k,
                      preTrain=False)

    model = Network()
    if (os.path.isfile("model_iths.h5")):
        model.loadModel("model_iths.h5")

    else:
        model.build(dataset)
        model.run(dataset)
        model.saveModel("model_iths.h5")

    ids, preds = model.predict(dataset)
    weights = model.getActivationWeights(dataset)
    pickle.dump((ids, preds, weights), open('preds_all.pickle', "wb"))
Example no. 45
def main():
    num_cases = 150
    num_learn = 1000

    bn = BayesianNet()
    print('Adjacency matrix: \n' + str(bn.dag))

    dag = np.zeros((5, 5))
    for i in range(num_learn):
        data = Dataset(bn, num_cases)
        dag += sl.k2(data.dataset, data.ordered_array, 2)

    for i in range(len(dag)):
        for j in range(len(dag)):
            if dag[i][j] < (num_learn/2):
                dag[i][j] = 0
            else:
                dag[i][j] = 1

    print('Learned structure: \n' + str(dag))
Example no. 46
def expandDataset(buf, label=-1, limit=''):
    list = []

    parts = buf.split(',')
    for part in parts:
        if part.endswith('.bents'):
            list.append(Dataset(part))
        else:
            for name in Dataset.names():
                if datasetWildcardMatch(part, name):
                    ds = Dataset(name)
                    ds.restrictTo(label)
                    ds.limitTo(limit)
                    list.append(ds)
    return list
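A usage sketch for expandDataset(), with a hypothetical specification string mixing an explicit .bents file and a wildcard matched against Dataset.names():

# Hypothetical arguments; label and limit only restrict the wildcard-matched datasets.
datasets = expandDataset("extra.bents,train_*", label=1, limit='')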
Example no. 47
def main():
	global X
	global Y

	ds = Dataset.open('quora')
	X,Y = ds.X,ds.Y


	#Z = [re.findall(r"[\w']+", x) for x in X]
	Z = [filter(None, x.split('.')) for x in X]
	Z = ["".join(s) for s in Z]
	Z = [z.split(' ') for z in Z]
	Z = [[len(s) for s in z] for z in Z]

	feature = []
	for a in Z:
		wordLenDist = [0]*100
		for ln in a:
				wordLenDist[ln]+=1
		feature.append(wordLenDist)

	X = feature
	Z = zip(X, Y)
	shuffle(Z)
	(X, Y) = zip(*Z)

	# X = [i for i in range(len(X))]

	si=0
	acc = 0.0
	cnt = 0
	while si<len(X):
		Xe = X[si:si+50]
		Ye = Y[si:si+50]
		X1 = X[:si] + X[si+50:]
		Y1 = Y[:si] + Y[si+50:]
		acc += train_chunk(X1, Y1, Xe, Ye)
		cnt += 1
		si += 50

	print 'Accuracy: %f' % (acc/cnt)
Example no. 48
def load_MV_dataset(num):
    fin=open('data/Foreign_Movie_model_5_feature.txt','r')
    lines=fin.read().splitlines()
    data_num=len(lines)/2
    dataset=Dataset()
    data_num=num
    for i in xrange(data_num):
        dataset.imageNameList.append(['data/Foreign_Movie_Face/'+lines[2*i]])
        dataset.rect.append([0,0,178,218])
        dataset.imgID.append(1)
        dataset.feature.append(map(float,lines[2*i+1].split()))
        dataset.size+=1
    dataset.computeAffinity()
    dataset.computeQuality()
    return dataset
Example no. 49
 def get_benchmarking_data(self, idmodel, it=-1):
     """
     Returns a Dataset instance with the data used for learning up to iteration it
     """
     con = lite.connect(self.name)
     
     cues = []
     obs = []
     nobs = defaultdict(int)
     
     with con:
         con.row_factory = lite.Row
         cur = con.cursor()
         if it < 0:
             cur.execute("SELECT * FROM benchmark_data WHERE idmodel=:idmodel", {'idmodel': idmodel})
         else:
             cur.execute("SELECT * FROM benchmark_data WHERE idmodel=:idmodel AND it<=:it", {'idmodel': idmodel, 'it': it})
     
         rows = cur.fetchall()
         dataset = Dataset.from_db_rows(rows, self.setup)
 
     return dataset
Example no. 50
def main():
	global X
	global Y
	ds = Dataset.open('quora')
	(X, Y) = (get_tagged_text(ds.X), ds.Y)

	XX=[]
	YY=[]
	for (auth, ans) in zip(Y, X):
	 	if len(nltk.word_tokenize(ans)) > 200:
	 		XX.append(ans)
	 		YY.append(auth)
	(X, Y) = (XX, YY)

	Z = zip(X, Y)
	shuffle(Z)
	(X, Y) = zip(*Z)
	Xe = X[-50:]
	Ye = Y[-50:]
	X = X[:-50]
	Y = Y[:-50]

	count_vect = CountVectorizer(input='content',ngram_range=(2,3), min_df=0.2, max_df=1.0)
	X_train_counts = count_vect.fit_transform(X)
	tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
	X_train_tf = tf_transformer.transform(X_train_counts)

	clf = MultinomialNB().fit(X_train_tf, Y)
	clf2 = KNeighborsClassifier(n_neighbors=5).fit(X_train_tf, Y)

	X_new_counts = count_vect.transform(Xe)
	X_new_tfidf = tf_transformer.transform(X_new_counts)
	Yd = clf.predict(X_new_tfidf)

	istats(Y)
	print ''
	stats(Ye, Yd)
Example no. 51
    def test_1(self):
        user = User()
        user.createNetwork(model="custom_mlp:2,800,0,0.3")
        user.loadNetworkState("model_customMlp2_800_0_0.3__10by10___30percentsAccuracy.npz")

        saveDir = "games"
        # for i in [0, 1, 2]: #range(Tests.db.X_test):
        for i in range(Tests.db.X_test.shape[0]):
            sample = Tests.db.X_test[i]
            world, start, goal = Dataset.greyScaleSampleToGridWorld(sample)
            found, blocked, records = user.play(world, start, goal)
            gameName = ""
            if found:
                gameName += "success"
            elif blocked:
                gameName += "blocked"
            else:
                gameName += "timeout"
            gameName += "_" + str(i).zfill(6)
            finalDir = saveDir + "/" + gameName + "/"
            # os.makedirs(finalDir)
            user.saveRecordsToFiles(world, start, goal, found, blocked, records, finalDir + gameName)

        # for i in range(Tests.db.X_train.shape[0]):
        #     user.predictSample(Tests.db.X_train[i])

        # world, start, goal = Dataset.greyScaleSampleToGridWorld(Tests.db.X_train[10])
        # user.predictSample(user.makeSample(world, start, goal))
        # user.predictSample(Tests.db.X_train[0])
        # user.predictSample(Tests.db.X_train[1])
        # user.predictSample(Tests.db.X_train[2])
        # user.predictSample(Tests.db.X_train[3])
        # user.predictSample(Tests.db.X_train[:2])
        # found, blocked, records = user.play(world, start, goal)
        # user.saveRecordsToFiles(world, start, goal, found, blocked, records, "games/test1/")
        pass
Example no. 52
def runner(
        PATH_DATA,
        RATIO_TEST_DATA,
        RATIO_SPECIFICITY,
        RATIO_CONFIDENCE,
        EXPERIMENTS,
        fe,
        setting_name
    ):

    results = []
    errors = Counter()
    qtypes = QuestionTypes()
    for e in range(1, EXPERIMENTS + 1):

        start = time.time()
        dataset = Dataset(PATH_DATA)
        dataset.load()

        invprob = InverseProbabilities(dataset)
        index = Index(invprob)

        train = [
    #         (bow(fe, label, RATIO_SPECIFICITY, prob_filter=invprob) + bow(fe, text, prob_filter=invprob), label, mark)
            (bow(fe, text, RATIO_SPECIFICITY, prob_filter=invprob), label, mark)
            for text, label, mark in dataset.train()
        ]
        train = train * 4

        test = [
            (bow(fe, label, RATIO_SPECIFICITY, prob_filter=invprob), label, mark)
#             (bow(fe, text, RATIO_SPECIFICITY, prob_filter=invprob), label, mark)
            for text, label, mark in dataset.test()
            if mark
        ][:int(len(train) * RATIO_TEST_DATA)]

        test += [
            (bow(fe, label, RATIO_SPECIFICITY, prob_filter=invprob), label, mark)
#             (bow(fe, text, RATIO_SPECIFICITY, prob_filter=invprob), label, mark)
            for text, label, mark in dataset.test()
            if not mark
        ][:len(test)]

        for tbow, label, mark in train:
            index.update(tbow)
            index.add(label)

        tp, tn, fp, fn, prec, rec, f, duration = 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0
        marked = sum([1 for _, _, mark in test if mark])
        for tbow, label, mark in test:
            qtypes.increment(label)
            expectation = sum([
                invprob[w]
                for w in set(bow(fe, label, RATIO_SPECIFICITY, prob_filter=invprob))
            ])
            matches = index(tbow)

            if not matches and not mark:
                tn += 1
                continue
            elif not matches and mark:
                fn += 1
                errors[('fn', '', label)] += 1
                qtypes.update('fn', None, label)
                continue

            best_match = matches[0]
            guess = best_match[2]
            sim = best_match[0]
            ratio = sim / (expectation + 0.1)

            if ratio <= RATIO_CONFIDENCE:
                if not mark:
                    tn += 1
                    continue
                else:
                    fn += 1
                    errors[('fn', '', label)] += 1
                    qtypes.update('fn', None, label)
                    continue
            else:
                if mark and guess == label:
                    tp += 1
                else:
                    fp += 1
                    _qtype = '_'.join(guess.lower().split()[:2])
                    errors[('fp', guess, label)] += 1
                    qtypes.update('fp', guess, label)

            duration = time.time() - start
            if tp:
                prec = tp / float(tp + fp)
                rec = tp / float(tp + fn)
                f = f1(prec, rec)
            else:
                prec, rec, f = 0.0, 0.0, 0.0

        vector = (e, _r(tp), _r(tn), _r(fp), _r(fn),
                  _r(prec), _r(rec), _r(f), _r(duration))
        results.append(vector)

        print '%d, tp: %d, tn: %d, fp: %d, fn: %d, all: %d, prec: %.2f, rec: %.2f, f1: %.2f, time=%.2f' % (e, tp, tn, fp, fn, sum([tp, tn, fp, fn]), prec, rec, f, duration)
        precs, recs, fs = zip(*results)[-4:-1]
        print e, avg(precs), avg(recs), avg(fs)
        print '---'

    if not results:
        return None

    cols = columns(results)
    columns_int = [avg(col) for col in cols[:4]]
    columns_float = [_r(avg(col)) for col in cols[4:]]
    summary_row = [
        tuple(['all'] + columns_int + columns_float)
    ]

    create_folder(RESULTS_FOLDER)
    to_csv(
        RESULTS_KEYS + results + summary_row,
        '%ssecond_task.%s.results.csv' % (RESULTS_FOLDER, setting_name)
    )

    to_csv(
        [tuple([f] + list(key)) for key, f in errors.most_common()],
        '%ssecond_task.%s.errors.csv' % (RESULTS_FOLDER, setting_name)
    )

    to_csv(
        qtypes.dump(),
        '%ssecond_task.error.%s.question_types.csv' % (RESULTS_FOLDER, setting_name)
    )

    return summary_row[0]
Example no. 53
folds=['2-fold', '5-fold', 'N-fold']

for ds in alcohol_datasets:
    train_data_all = ds[0].data
    test_data = ds[1].data

    # Accuracy for get 20%, 50%, 80% and 100% of the data.
    # Each subset will have 
    train_accuracy = [[np.zeros(num_k_values), np.zeros(num_k_values), np.zeros(num_k_values)],
                      [np.zeros(num_k_values), np.zeros(num_k_values), np.zeros(num_k_values)],
                      [np.zeros(num_k_values), np.zeros(num_k_values), np.zeros(num_k_values)],
                      [np.zeros(num_k_values), np.zeros(num_k_values), np.zeros(num_k_values)]]
    best_k_and_ds = [[0,0,0],[0,0,0],[0,0,0],[0,0,0]]

    for it in range(5):
        train_data_20, t = Dataset.getRandomPercent(train_data_all, 0.2)
        train_data_50, t = Dataset.getRandomPercent(train_data_all, 0.5)
        train_data_80, t = Dataset.getRandomPercent(train_data_all, 0.8)
        all_training_data = [train_data_20,
                             train_data_50,
                             train_data_80,
                             train_data_all]
        # Only run on train_data_all once.
        if it > 0:
            all_training_data = all_training_data[:-1]
        for val in range(len(all_training_data)):
            for k in k_values:
                print str(it) + ": Training on: " + labels[val] + " for k value: " + str(k) + " for " + ds[0].name
                # Do 2-5-N Fold Cross Validation.
                cv_2 = Dataset.getkPartitions(all_training_data[val], 2)
                cv_5 = Dataset.getkPartitions(all_training_data[val], 5)
Example no. 54
from Dataset import Dataset
from Method import Method
from Evaluation import Evaluation
from Result import Result
from Setting import Setting
import numpy as np

#--------------------------- run the exp ----------------------------
if 1:
    k = 5
    fold = 10
    dataset = Dataset('', '')
    dataset.file_folder_path = '../data/input/'
    
    method = Method('', '')
    method.k = k
    
    evaluation = Evaluation('')
    
    result = Result('', '')
    result.k = k
    
    setting = Setting('', '', dataset, method, result, evaluation)
    setting.fold = fold
    setting.load_classify_save()

if 1:
    fold = 10
    k = 5
    
    result = Result('', '')
Example no. 55
                            log = '%s %s,' % (log, close_word)
                        print (log)
        
            end = time.time()   
            print ('Training took %.2f sec\n' % (end - start))
            
            final_embeddings = normalized_embeddings.eval()
    
    return final_embeddings
    


if __name__ == '__main__':
    print ('Loading the dataset... ')
    start = time.time()
    data = Dataset('Text8', reformatted=True, verbose=True)
    data, count, dictionary, reverse_dictionary = data.load()
    end = time.time() 
    print ('Loading the dataset took %.2f sec.\n' % (end - start))
        
    print ('Most common words (+UNK)', count[:5])
    print ('Sample data', data[:10])
    
    print('data:', [reverse_dictionary[di] for di in data[:8]])
        
    
    embedding_size = 128 # Dimension of the embedding vector.
    skip_window = 1 # How many words to consider left and right.
    num_skips = 2 # How many times to reuse an input to generate a label.    
    valid_size = 16 # Random set of words to evaluate similarity on.
    valid_window = 100 # Only pick dev samples in the head of the distribution.
Example no. 56
#!/usr/bin/python

# CIS 521 Homework 7: Learning Machine Learning
# Cory Rivera (rcor) and Sam Panzer (panzers)

from numpy import *
from Dataset import Dataset

d = Dataset("comp.sys.ibm.pc.hardware.txt", 
"rec.sport.baseball.txt", cutoff=10)

#d = Dataset("comp.sys.mac.hardware.txt", "comp.sys.ibm.pc.hardware.txt", cutoff=2000)
(Xtrain, Ytrain, Xtest, Ytest) = d.getTrainAndTestSets(0.8, seed=1)
wordlist = d.getWordList()

def trainNaiveBayes(X, Y):
  # First, count frequencies given the category

  # Each row is a post, and each column is a word
  # To count the number of words from every post, sum up the values from each
  # column for a given category

  # Flattens Y so that it is easier to iterate over
  yFlat = Y.flatten()
  yPos = yFlat == 1
  yNeg = yFlat == -1

  # X.shape[1] returns number of columns for a given matrix
  numColumns = X.shape[1]

  # Indexing with a boolean array like yPos only checks indices that are True
Example no. 57
 def get_random_dataset(self, idmodel, nexp, max_stimuli=0, max_inhibitors=0):
     rows = self.get_population_rows(idmodel, max_stimuli, max_inhibitors)
     exps = random.sample(rows, nexp)
     dataset = Dataset.from_db_rows(exps, self.setup)
         
     return dataset
Example no. 58
def load_dataset():
    db = Dataset.loadFrom('medium20000_10_shuffled_0.3obstacles.pkl')
    # db = Dataset.loadFrom('medium20000_10_shuffled.pkl')
    # db = Dataset.loadFrom('large200000_10_shuffled.pkl')
    return db.X_train, db.y_train, db.X_val, db.y_val, db.X_test, db.y_test
Example no. 59
# Test code for detect heart region
import detectHeartRegion as dhr

from matplotlib import pyplot
from matplotlib import cm
from Dataset import Dataset


d = Dataset("C:\\Kaggle\\train\\27", "27");
d.load();
(num_slices, num_times, width,height) = d.images.shape

rois,circles = dhr.detect_heart_region(d.images);


#plot roi in each slice at time 0
numSlicesToDisplay = 10;
pyplot.figure(1);
pyplot.subplots_adjust(left=0.1,hspace=0.1,wspace=0);

numslicesPerRow = 2;
numRows = numSlicesToDisplay/numslicesPerRow;
index = 1;
for slice in range(numSlicesToDisplay):
    pyplot.subplot(numRows,2 * numslicesPerRow, index );
    pyplot.imshow(d.images[slice][0],cmap=cm.Greys_r);
    index = index + 1;
    
    pyplot.subplot(numRows,2 * numslicesPerRow, index) ;
    pyplot.imshow(rois[slice],cmap=cm.Greys_r);
    index = index + 1;