Example #1
    def execute_command(self):
        args = self.parser.parse_args()

        d = DataProvider()
        data = d.extract_data()
        x = CalculationLogic()
        if args.task == 't1':
            if args.district and args.year:
                if args.gender:
                    print(x.task1(data, args.year, args.district, args.gender))
                else:
                    print(x.task1(data, args.year, args.district))
        elif args.task == 't2':
            if args.district:
                if args.gender:
                    print(x.task2(data, args.district, args.gender))
                else:
                    print(x.task2(data, args.district))
        elif args.task == 't3':
            if args.year:
                if args.gender:
                    print(x.task3(data, args.year, args.gender))
                else:
                    print(x.task3(data, args.year))
        elif args.task == 't4':
            if args.gender:
                print(x.task4(data, args.gender))
            else:
                print(x.task4(data))
        elif args.task == 't5':
            if args.district and args.district2:
                if args.gender:
                    print(x.task5(data, args.district, args.district2, args.gender))
                else:
                    print(x.task5(data, args.district, args.district2))
Example #2
def train(args):
    if not os.path.isdir('./%s' % args.output):
        os.makedirs('./%s' % args.output)

    if args.cont:
        try:
            model = torch.load(args.cont)
            print('load success')
        except Exception:
            model = OwnModel()
    else:
        model = OwnModel()
    dp = DataProvider(args.dataset)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=0.9,
                                weight_decay=1e-5)
    crit = torch.nn.CrossEntropyLoss()
    mbsize = 1024
    for epoch in range(args.max_iter):
        odata, olabel = dp.train_iter(mbsize)
        data = Variable(torch.from_numpy(odata))
        label = Variable(torch.from_numpy(olabel))
        lr = get_lr(model.iter)
        pred = model(data)
        pred = pred.contiguous().view(-1, 2)
        loss = crit(pred, label)
        optimizer.zero_grad()
        for group in optimizer.param_groups:
            group['lr'] = lr
        loss.backward()
        optimizer.step()
        print('iter:%s loss:%s' % (epoch, loss.data.numpy()), end='\r')
        if epoch % 10 == 0:
            torch.save(model, './%s/model.pkl' % args.output)
Example #3
def validate(s2s, n_samples):
    bsize = 1  # batch size
    vgen = DataProvider(n_samples, mlen, batch_size=bsize)

    correct = 0
    correct_elements = 0
    total = 0
    total_elements = 0

    for _ in range(n_samples):
        batch, slen, _d_inputs, _d_seqlen, _targets_e, _targets_d = vgen.next()
        inp = []
        for i, b1 in enumerate(batch):
            for j in range(slen[i]):
                inp.append(chr(b1[j].index(1)))
        e_results, results = s2s.do_inference(batch, slen, vgen)
        pred = [chr(np.argmax(e_results))]
        for i, result in enumerate(results):
            for j, res in enumerate(result):  # for each seq in a mini batch
                pred.append(chr(np.argmax(res)))
        pred = pred[:-1]  # ignore the end char
        print "Inp: ", inp
        print "Prd: ", pred
        for c1, c2 in zip(inp, pred):
            if c1 == c2:
                correct_elements += 1
            total_elements += 1
        if inp == pred:
            correct += 1
        total += 1

    print "EXACT match validation accuracy: ", (float(correct) /
                                                total) * 100, "%"
    print "Elementwise match validation accuracy: ", (
        float(correct_elements) / total_elements) * 100, "%"
Example #4
def train(config, network_spec=None):
    data_provider = DataProvider(config.db)
    env = StockEnvironment(data_provider, config, 0)
    if config.overwrite_agent:
        agent = overwrite_agent(env, network_spec, config)
    else:
        agent = load_agent(config, env, network_spec)

    mlflow.log_param("agent", "tensorforce.agents.DQNAgent")
    for key in config.agent_specs:
        mlflow.log_param(key, config.agent_specs[key])

    runner = Runner(agent=agent, environment=env)
    offset = 20000
    num_episodes = 20
    step = 0
    while data_provider.has_data_key(offset + config.max_step_per_episode):
        runner.run(num_episodes=num_episodes)
        offset = offset + config.max_step_per_episode
        env.offset = offset
        agent.save(config.agent_dir, config.agent_name)
        if step % 10 == 0:
            evaluate(config, data_provider,
                     offset - config.max_step_per_episode, agent)
        step += 1
    return agent, env
Example #5
    def __init__(self, uriList, streamType, moreVars, expandPatterns=True):
        DataProvider.__init__(self, streamType, moreVars)
        self.expandPatterns = expandPatterns
        if "expandPatterns" in moreVars:
            if self.expandPatterns in ("False", "false"):
                self.expandPatterns = False
            del moreVars["expandPatterns"]
        
        if self.expandPatterns:
            fileNamePatterns = []
            for f in uriList:
                if f.startswith("file://"):
                    fileNamePatterns.append(f[7:])
                else:
                    fileNamePatterns.append(f)
        else:
            fileNamePatterns = uriList
        if len(fileNamePatterns) == 0:
            raise NoFilesSpecified()
        self.myFileNames = list(expandFiles(fileNamePatterns, shouldOpen=False, checkPattern=self.expandPatterns))

        if not self._streamType:
            colonPos = self.myFileNames[0].rfind(":")
            # no colon, or only a Windows drive letter ("C:\..."), means there is no stream-type prefix
            if colonPos == -1 or (colonPos == 1 and len(self.myFileNames[0]) >= 3 and self.myFileNames[0][2] == '\\'):
                name = self.myFileNames[0]
            else:
                name = self.myFileNames[0][0:colonPos]
            ext = os.path.splitext(name)[1]
            if ext == ".gz":
                # try to get prev extension before gzip
                ext = os.path.splitext(os.path.splitext(name)[0])[1]
            self._streamType = io_targets.getTypeByExtension(ext)
            if not self._streamType:
                raise UnknownExtensionType(name)
Example #6
 def __init__(self, rss_url, max_news=2):
     """
     .ctor
     :return:
     """
     DataProvider.__init__(self, max_news=max_news)
     self.rss_url = rss_url
     self.max_new_article = max_news
Example #7
def main():
    # read data
    dp = DataProvider()
    dp.read_data("train.csv")

    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)
    create_model(dp)
Example #8
def preparation():
    """Prepare databases and start background tasks."""
    # Kill old processes if running
    kill_bg_servers()
    time.sleep(10)

    data_dir = config.DATA_DIR

    log.info("Removing Databases.")
    with contextlib.suppress(FileNotFoundError):
        # Remove Bloom Filter
        os.remove(data_dir + config.BLOOM_FILE)
        # Remove Databases
        os.remove(data_dir + config.KEYSERVER_DB)
        os.remove(data_dir + config.STORAGE_DB)

    # Add User
    log.info("Prepare User DB.")
    db.main(UserType.CLIENT, ['testuser', 'password', '-a'], no_print=True)
    db.main(UserType.OWNER, ['testprovider', 'password', '-a'], no_print=True)
    log.info("Starting Background Servers.")
    subprocess.run([f"{config.WORKING_DIR}src/allStart.sh", "eval"])
    time.sleep(10)
    # Create data provider client
    d = DataProvider('testprovider')
    d.set_password('password')

    # Check that servers are really online
    tries = 0
    done = False
    while not done:
        try:
            if tries >= 1:
                # Try to start servers again.
                kill_bg_servers()
                time.sleep(10)
                subprocess.run(
                    [f"{config.WORKING_DIR}src/allStart.sh", "eval"])
                time.sleep(10)
                tries = 0
            # Check Key Server
            d.get_token(ServerType.KeyServer)
            # Check celery
            r = d.get(d.KEYSERVER.replace('provider', 'celery'))
            if r.content != b"True":
                raise RuntimeError("Celery of keyserver not started.")
            # Check Storage Server
            d.get_token(ServerType.StorageServer)
            # Check celery
            r = d.get(d.STORAGESERVER.replace('provider', 'celery'))
            if r.content != b"True":
                raise RuntimeError("Celery of storage-server not started.")
            # Success
            done = True
        except Exception as e:
            log.error(f"Server not up, yet. Try: {tries}. Error: {str(e)}")
            tries += 1
            time.sleep(5)
Example #9
def rnn():
    data_provider = DataProvider(data_dir, BATCH_SIZE, SEQUENCE_LENGTH)
    model = RNNModel(data_provider.vocabulary_size,
                     batch_size=BATCH_SIZE,
                     sequence_length=SEQUENCE_LENGTH,
                     hidden_layer_size=HIDDEN_LAYER_SIZE,
                     cells_size=CELLS_SIZE)

    with tf.Session() as sess:

        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(tensorboard_dir)
        writer.add_graph(sess.graph)
        sess.run(tf.global_variables_initializer())

        # Forward pass and one backward pass of all the training examples
        epoch = 0
        temp_losses = []
        smooth_losses = []

        while True:
            sess.run(
                tf.assign(model.learning_rate,
                          LEARNING_RATE * (DECAY_RATE**epoch)))
            data_provider.reset_batch_pointer()
            state = sess.run(model.initial_state)
            for batch in range(data_provider.batches_size):
                inputs, targets = data_provider.next_batch()
                feed = {model.input_data: inputs, model.targets: targets}
                for index, (c, h) in enumerate(model.initial_state):
                    feed[c] = state[index].c
                    feed[h] = state[index].h

                # Iteration is the number of times batch data has passed
                # through the neural network - both forward and backwards
                # propagation
                iteration = epoch * data_provider.batches_size + batch
                summary, loss, state, _ = sess.run(
                    [summaries, model.cost, model.final_state, model.train_op],
                    feed)
                writer.add_summary(summary, iteration)
                temp_losses.append(loss)

                if iteration % SAMPLING_FREQUENCY == 0:
                    sample_model(sess, data_provider, iteration)

                if iteration % LOGGING_FREQUENCY == 0:
                    smooth_loss = np.mean(temp_losses)
                    smooth_losses.append(smooth_loss)
                    temp_losses = []
                    plot(smooth_losses, "iterations (thousands)", "loss")
                    print('{{"metric": "iteration", "value": {}}}'.format(
                        iteration))
                    print('{{"metric": "epoch", "value": {}}}'.format(epoch))
                    print('{{"metric": "loss", "value": {}}}'.format(
                        smooth_loss))
            epoch += 1
Example #10
def main():
    # read data
    dp = DataProvider()
    xgb_model = XGBClassifier()
    dp.read_data("train.csv")
    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)
    # create_xgbmodel(dp,xgb_model,device="gpu")
    opt = Optimizer()
    tune_with_TPE(dp, xgb_model, opt)
Example #11
File: fc4.py Project: msuryadeekshith/fc4
def show_patches():
    from data_provider import DataProvider
    dp = DataProvider(True, ['g0'])
    dp.set_batch_size(10)
    while True:
        batch = dp.get_batch()
        for img in batch[0]:
            #img = img / np.mean(img, axis=(0, 1))[None, None, :]
            img = img / img.max()
            cv2.imshow("Input", np.power(img, 1 / 2.2))
            cv2.waitKey(0)
Example #12
    def test_epoch_complete(self):
        # check if every element of the dataset is really seen at the end
        provider = DataProvider('final_data', 8)
        dataset_img = [img.tostring() for img in provider.images]

        while provider.next_batch_available():
            batch_img, _ = provider.get_batch()
            for img in batch_img:
                if img.tostring() in dataset_img:
                    dataset_img.remove(img.tostring())
        self.assertEqual(len(dataset_img), 0)
Example #13
def main(parameters):
    data_provider = DataProvider()
    train_loader, test_loader = data_provider.get_data_loaders(**parameters)

    writer = SummaryWriter()

    trainer_type = parameters["experiment"] + "net"
    trainer = TrainerFactory.create_trainer(trainer_type, train_loader, test_loader, writer, **parameters)
    trainer.run(parameters["epochs"])

    writer.close()
Example #14
    def test_get_casted_dataframe(self, mock_method):
        date_column = pd.date_range(start=datetime.datetime.today(), periods=4)
        mock_method.side_effect = [
            pd.DataFrame([
                '    min    ', 'asdasdasd0', '          ciao', 'ciao          '
            ],
                         dtype='object'),
            pd.DataFrame(['UD', '   O', 'P   ', '   TS    '], dtype='object'),
            pd.DataFrame([0, 1, 1, 1]),
            pd.DataFrame((([np.nan] * 3) + [0.24]), dtype='float64'),
            pd.DataFrame(date_column)
        ]

        # Create a test dataframe, identical to self.dp.df but with strings only
        data = {
            'col1':
            ['    min    ', 'asdasdasd0', '          ciao', 'ciao          '],
            'col2': ['UD', '   O', 'P   ', '   TS    '],
            'col3': ['0', '1', '1', '1'],
            'col4': ([np.nan] * 3) + ['0.24'],
            'col5':
            date_column.strftime("%Y-%m-%d")  # Cast date to string
        }
        df = pd.DataFrame(data)

        # Create a test DataProvider with the strings-only dataframe
        test_dp = DataProvider(df=df,
                               column_types={
                                   0: 'object',
                                   1: 'object',
                                   2: 'int',
                                   3: 'float',
                                   4: 'date'
                               },
                               column_constraints={
                                   0: False,
                                   1: False,
                                   2: True,
                                   3: False,
                                   4: False
                               })

        # Perform the casting
        casted_df = test_dp.get_casted_dataframe()

        self.assertEqual(casted_df.dtypes.tolist(), [
            np.dtype('O'),
            np.dtype('O'),
            np.dtype('int64'),
            np.dtype('float64'),
            np.dtype('<M8[ns]')
        ])
Example #15
    def __init__(self,
                 part='2',
                 img_width=28,
                 filter_width=28,
                 num_filters=2,
                 num_classes=2,
                 alpha=.01,
                 activation_function='sigmoid',
                 relu_alpha=0,
                 sig_lambdas=(1, 1, 1),
                 subset_size=1,
                 tanh_lambda=1):

        self.part = part
        if self.part == '2':
            self.filter_width = 28
            self.num_filters = 2
            num_classes = 2
            train_dir = '../data/part2/train/*'
            test_dir = '../data/part2/train/*'

        if self.part == '3a' or part == '3b':
            self.filter_width = 7
            self.num_filters = 16
            num_classes = 10
            train_dir = '../data/part3/train/*'
            test_dir = '../data/part3/train/*'

        self.img_width = img_width

        self.output_dim = num_classes
        self.alpha = alpha
        self.activation_function = activation_function

        self.relu_alpha = relu_alpha
        self.sig_lambdas = sig_lambdas
        self.tanh_lambda = tanh_lambda

        # computed properties
        self.conv_mat_H = np.power((img_width - self.filter_width + 1),
                                   2)  # number of kernel positions
        self.conv_mat_S = img_width - self.filter_width  # space between kernel and the edge of the image
        self.conv_output_dim = self.conv_mat_H * self.num_filters

        # create data provider to feed in data
        self.dp = DataProvider(train_dir, test_dir, num_classes, subset_size)

        if part == '2' or part == '3a':
            self.init_weights_3A()
        else:
            self.init_weights_3B()
Example #16
 def __init__(self, cfg):
     DataProvider.__init__(self, cfg)
     # Load training images (path) and labels
     train_path = os.path.join(Paths.data_path, 'cell/labels/train.csv')
     test_path = os.path.join(Paths.data_path, 'cell/labels/test.csv')
     data_type = {'image_name': str, 'label': int}
     self._train_df = pd.read_csv(train_path, dtype=data_type)
     self._test_df = pd.read_csv(test_path, dtype=data_type)
     self._train_list = list(self._train_df.index)
     # random.shuffle(self._train_list)
     self._test_list = list(self._test_df.index)
     self._test_size = len(self._test_list)
     self._train_index = 0
     self._test_index = 0
Example #17
    def test_get_column_constraints_is_respected_NotImplemented(self):
        # Create a DataProvider instance
        data = {
            # duplicating the last value of the first column
            'col1': ['222365896', '522559845', '333652214', '522559845'],
            'col2': ['UD', '   O', 'P   ', '   TS    ']
        }
        df = pd.DataFrame(data)
        col_types = {0: 'object', 1: 'object'}
        dp = DataProvider(df, col_types, column_constraints=NotImplemented)

        # Check the expected values
        duplicated_values = dp.get_column_constraints_is_respected()
        pd.testing.assert_series_equal(duplicated_values,
                                       pd.Series([], dtype='object'))
Example #18
def main():
    dp = DataProvider()
    test_data = dp.get_test_data()
    model_name = "rando:0%reg_a:0%max_d:0%subsa:1%boost:gbtree%nthre:8%colsa:1%learn:0.025%scale:5.2872645858027125%max_d:3%missi:None%gamma:0%base_:0.5%colsa:1%min_c:2%seed:100%n_job:1%silen:0%n_est:800%reg_l:1%objec:binary:logistic%"
    path = "/home/msaffarm/KaggleChallenges/SafeDriverPred/xgbModel/trainedModels/" + model_name
    model = get_model(path)
    test_ids = test_data[["id"]].to_numpy()
    test_data.drop(["id"], axis=1, inplace=True)
    preds = model.get_booster().predict(xgb.DMatrix(test_data))
    final_pred = np.concatenate([test_ids.reshape(-1, 1), preds.reshape(-1, 1)], axis=1)
    final_pred_df = pd.DataFrame(final_pred, columns=["id", "target"])
    final_pred_df["id"] = final_pred_df["id"].astype(int)
    print(final_pred_df)

    final_pred_df.to_csv("predictions.csv", index=False)
Example #19
def main():
    # parse config
    config_file = sys.argv[1]
    config = Config(config_file)

    # setup logger
    setup_logging(config.working_dir)

    # encoding func
    encoding_func = ENCODING_METHOD_MAP[config.encoding_method]
    encoding_func2 = ENCODING_METHOD_MAP[config.encoding_method2]
    log_to_file('Encoding method2', config.encoding_method2)

    data_provider = []
    for p in range(config.base_model_count):
        temp_provider = DataProvider(
             encoding_func,
             encoding_func2,
             config.data_file,
             config.test_file,
             config.batch_size,
             max_len_hla=config.max_len_hla,
             max_len_pep=config.max_len_pep,
             model_count=config.model_count
        )
        data_provider.append(temp_provider)
 
    log_to_file('max_len_hla', data_provider[0].max_len_hla)
    log_to_file('max_len_pep', data_provider[0].max_len_pep)
    
    test(config, data_provider[0])
Example #20
def setup(trello_key, trello_secret, board_id, out, delimiter, card_extractors,
          filters):
    # validate inputs

    if not trello_key or not trello_secret:
        raise click.BadParameter('trello_secret and trello_key are required')

    if not board_id:
        raise click.BadParameter('board_id is required')

    trello_client = TrelloClient(
        api_key=trello_key,
        api_secret=trello_secret,
    )
    data_provider = DataProvider(Board(
        trello_client,
        board_id=board_id,
    ))
    print(data_provider.board.name)  # TODO: add logging
    database = DataBase(delimiter=delimiter)
    runner = Runner(data_provider,
                    database,
                    card_extractors_parameter=[
                        Parameter(x.strip())
                        for x in card_extractors.split(',')
                    ],
                    filters=[Parameter(x.strip())
                             for x in filters.split(',')] if filters else [])
    runner.run()
    database.export(out)
Example #21
def show_patches():
    from data_provider import DataProvider
    # dp = DataProvider(False, ['s0'])
    dp = DataProvider(True, ['s0'])
    dp.set_batch_size(10)
    while True:
        batch = dp.get_batch()
        imgs = batch[0]
        illums = batch[2]
        for i in range(len(imgs)):
            #img = img / np.mean(img, axis=(0, 1))[None, None, :]
            img = imgs[i] / imgs[i].max()
            illum = illums[i]
            print('illum: ', illum)
            cv2.imshow("Input", np.power(img, 1 / 2.2))
            cv2.waitKey(0)
Example #22
def main():
    # parse config
    config_file = sys.argv[1]
    config = Config(config_file)

    # setup logger
    setup_logging(config.working_dir)

    # encoding func
    encoding_func = ENCODING_METHOD_MAP[config.encoding_method]
    encoding_func2 = ENCODING_METHOD_MAP[config.encoding_method2]
    log_to_file('Encoding method2', config.encoding_method2)

    data_provider = []
    for p in range(config.base_model_count):
        temp_provider = DataProvider(encoding_func,
                                     encoding_func2,
                                     config.data_file,
                                     config.test_file,
                                     config.batch_size,
                                     max_len_hla=config.max_len_hla,
                                     max_len_pep=config.max_len_pep,
                                     model_count=config.model_count)
        data_provider.append(temp_provider)

    log_to_file('Training samples', len(data_provider[0].train_samples[0]))
    log_to_file('Val samples', len(data_provider[0].validation_samples[0]))
    log_to_file('Training steps', data_provider[0].train_steps())
    log_to_file('Val steps', data_provider[0].val_steps())
    log_to_file('Batch size', data_provider[0].batch_size)
    log_to_file('max_len_hla', data_provider[0].max_len_hla)
    log_to_file('max_len_pep', data_provider[0].max_len_pep)

    for p in range(config.base_model_count):
        train(config, data_provider[p], p)
Example #23
    def show_images() -> None:
        random.seed(10)

        dp = DataProvider.load_from_folder(dataset_folder)

        nn = NeuralNet(sizes=[784, 128, 10], epochs=10)
        nn.train(dp.get_train_x(), dp.get_hot_encoded_train_y(),
                 dp.get_test_x(), dp.get_hot_encoded_test_y())

        properly_classified, misclassified = nn.get_properly_classified_and_misclassified_images(
            dp.get_test_x(), dp.get_hot_encoded_test_y())

        print('properly classified')
        for img in properly_classified[:5]:
            plt.imshow(img.reshape(28, 28), cmap=cm.binary)
            plt.show()

        print('misclassified')
        for img in misclassified[:5]:
            plt.imshow(img.reshape(28, 28), cmap=cm.binary)
            plt.show()
Example #24
    def test_invalid_formatting(self):

        invalid_values = DataProvider.get_data("generic.invalid_values.json")

        for value in invalid_values["text-fields"]:
            AddAssetPage.complete_form(self.driver,
                                       "asset_form.valid_asset.0.json")
            AddAssetPage.set_text_fields(self.driver, value)

            # Submission should not be successful
            assert "/add" in self.driver.current_url
            # Validation alert should be visible
            assert AddAssetPage.is_validation_message_displayed(self.driver)

        for value in invalid_values["non-http-urls"]:
            AddAssetPage.complete_form(self.driver,
                                       "asset_form.valid_asset.0.json")
            AddAssetPage.set_url_fields(self.driver, value)

            # Submission should not be successful
            assert "/add" in self.driver.current_url
            # Validation alert should be visible
            assert AddAssetPage.is_validation_message_displayed(self.driver)

        for value in invalid_values["lists"]:
            AddAssetPage.complete_form(self.driver,
                                       "asset_form.valid_asset.0.json")
            AddAssetPage.set_list_fields(self.driver, value)

            # Submission should not be successful
            assert "/add" in self.driver.current_url
            # Validation alert should be visible
            assert AddAssetPage.is_validation_message_displayed(self.driver)
Example #25
    def test_get_column_constraints_is_respected_strings(self):
        # Create a DataProvider instance
        data = {
            # duplicating the last value of the first column
            'col1': ['222365896', '522559845', '333652214', '522559845'],
            'col2': ['UD', '   O', 'P   ', '   TS    ']
        }
        df = pd.DataFrame(data)
        col_types = {0: 'object', 1: 'object'}
        col_constraints = {0: True, 1: False}
        dp = DataProvider(df, col_types, col_constraints)

        # Check the expected values
        duplicated_values = dp.get_column_constraints_is_respected()
        pd.testing.assert_series_equal(duplicated_values,
                                       pd.Series([False, False, False, True]))
        self.assertEqual(duplicated_values.sum(), 1)
Example #26
    def test_get_column_constraints_is_respected_multicolumn(self):
        # Create a DataProvider instance
        data = {
            'col1': ['222365896', '522559845', '522559845', '522559845'],
            'col2': ['UD', 'GO', 'PN', 'GO'],
            'col3': [1, 2, 3, 4]
        }
        df = pd.DataFrame(data)
        col_types = {0: 'object', 1: 'object', 2: 'int'}
        col_constraints = {0: True, 1: True, 2: False}
        dp = DataProvider(df, col_types, col_constraints)

        # Check the expected values
        duplicated_values = dp.get_column_constraints_is_respected()
        pd.testing.assert_series_equal(duplicated_values,
                                       pd.Series([False, False, False, True]))
        self.assertEqual(duplicated_values.sum(), 1)
Example #27
    def test_get_casted_column_for_type_date(self):
        s = pd.Series(['05/11/2020', '05/12/2020', '05/13/2020', '05/14/2020'],
                      dtype='object')
        casted_s = DataProvider.get_casted_column_for_type(s, 'date')

        date_series = pd.date_range(start='05/11/2020', periods=4)
        pd.testing.assert_frame_equal(casted_s, pd.DataFrame(date_series))
        self.assertEqual(casted_s.dtypes.tolist(), [np.dtype('<M8[ns]')])
Example #28
def main():
    x = tf.placeholder(tf.float32, [batch_size, 512, 512, 3])
    y = tf.placeholder(tf.float32, [None, 3])
    out = M.test_architecture2(x)
    dp = DataProvider(True, ['g0'])
    dp.set_batch_size(batch_size)
    angular_loss = angular_error_fn(out, y)
    nr_step = 100
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, "tf_log/model.ckpt")
        for epoch in range(0, nr_epochs):
            for step in range(0, nr_step):
                batch = dp.get_batch()
                feed_x = batch[0]
                feed_y = batch[2]
                ans, angular_error = sess.run([out, angular_loss], feed_dict={x: feed_x, y: feed_y})
                print(str(step) + " Angular_error: " + str(angular_error))
                print(ans[0])
                print(feed_y[0])
                img = feed_x[0] / feed_x[0].max()
                #cv2.imshow("Input", np.power(img, 1 / 2.2))
                #cv2.waitKey(0)
                cv2.imwrite("data/inference/" + str(step) + "_img_input.png", 255*np.power(img, 1 / 2.2))
                img_gt = sp.apply_gt(img, feed_y[0])
                cv2.imwrite("data/inference/" + str(step) + "_img_gt.png", 255*np.power(img_gt, 1 / 2.2))
                img_pred = sp.apply_gt(img, ans[0])
                cv2.imwrite("data/inference/" + str(step) + "_img_pred.png", 255*np.power(img_pred, 1 / 2.2))
        dp.stop()
Example #29
    def example4():
        """
        Neural net with 2 hidden layers 100 epochs test
        """
        dp = DataProvider.load_from_folder(dataset_folder)

        nn = NeuralNet2(sizes=[784, 128, 64, 10], epochs=100)
        nn.train(dp.get_train_x(), dp.get_hot_encoded_train_y(),
                 dp.get_test_x(), dp.get_hot_encoded_test_y())
Example #30
    def run_validation(cls, model):
        _, gen_val = DataProvider.get_generators()

        print('Evaluating model...')
        result = model.evaluate_generator(gen_val, use_multiprocessing=True, workers=4)

        print('Results:')
        for idx, metric in enumerate(model.metrics_names):
            print(f'\t{metric}: {result[idx]}')
Example #31
def show_patches():
    from data_provider import DataProvider
    dp = DataProvider(True, ['g0'])
    dp.set_batch_size(1)
    while True:
        batch = dp.get_batch()
        images = batch[0]
        labels = batch[2]
        for i in range(len(images)):
            img = images[i]
            gt = labels[i]
            #img = img / np.mean(img, axis=(0, 1))[None, None, :]
            img = img / img.max()
            cv2.imshow("Input", np.power(img, 1 / 2.2))
            cv2.waitKey(0)
            img = apply_gt(img, gt)
            cv2.imshow("Corrected", np.power(img, 1 / 2.2))
            cv2.waitKey(0)
Example #32
    def test_add_nlp_tags(self):

        payload = DataProvider.get_payload("POST_asset_add.tag_payload.0.json")

        # Request is made twice to check for tag duplication
        response_one = requests.post(ADD_ENDPOINT, data=payload)
        response_two = requests.post(ADD_ENDPOINT, data=payload)

        # Getting POS annotations
        nlp_response = requests.post(NLP_ENDPOINT,
                                     data=payload["asset_purpose"])

        tags_to_store = [
            "NN", "NNS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "JJ", "JJR",
            "JJS", "RB", "RBR", "RBS"
        ]
        expected_tags = []
        for sentence in nlp_response.json()["sentences"]:
            for token in sentence["tokens"]:
                if token["pos"] in tags_to_store:
                    expected_tags.append(token["lemma"])

        # Checking if tags were added
        found_count = 0
        for doc in self.db.tags.find({'type': 'NLP'}):
            if doc["value"] in expected_tags:
                found_count += 1

        # Higher number means duplicates
        # Lower number means not generated
        self.assertEqual(
            len(expected_tags), found_count,
            "NLP tags not extracted correctly: Found " + str(found_count) +
            "; Expected: " + str(len(expected_tags)))

        # Should find one if assets get tagged
        result = self.db.tags.find({
            'type': 'NLP',
            'tagged': {
                "$in": [response_one.json()["asset_id"]]
            }
        }).count()
        self.assertEqual(
            1, result,
            "Tag has been generated but asset has not been tagged: Found:" +
            str(result) + "; Expected: 1")
        result = self.db.tags.find({
            'type': 'NLP',
            'tagged': {
                "$in": [response_two.json()["asset_id"]]
            }
        }).count()
        self.assertEqual(
            1, result,
            "Tag has been generated but asset has not been tagged: Found:" +
            str(result) + "; Expected: 1")
Example #33
miner_globals.addAggregator("segments", "aggregate.Segments", "segments(start, size) returns aggregate.Segments object")
miner_globals.addAggregator("rate", "aggregate.Rate", "rate(period)(value) gets the rates of the value over defined period")
miner_globals.addAggregator("rateIf", "aggregate.RateIf", "rateIf(period)(cond, exp) gets the rates of the value over defined period filtered by the condition")

miner_globals.addTargetToClassMapping("csv", "io_targets.iCSV", "io_targets.oCSV", "comma separated value text (unicode=True flag preserves unicode indication in output)")
miner_globals.addTargetToClassMapping("pickle", "io_targets.iPickle", "io_targets.oPickle", "python object native serialization format")
miner_globals.addTargetToClassMapping("stdout", None, "io_targets.oStdout", "dumps user friendly formatted output to stdout")
miner_globals.addTargetToClassMapping("less", None, "io_targets.oLess", "dumps user friendly formatted output to less pager")
miner_globals.addTargetToClassMapping("log", "io_targets.iLog", "io_targets.oLog", "Processes text file by splitting it to words. Created record is (line, words, NR).\nFS= may specify alternative regular regular expression for splitting.")
miner_globals.addTargetToClassMapping("raw", "io_targets.iRaw", "io_targets.oRaw", "Processes text file without splitting into words. Record is (line,).")
miner_globals.addTargetToClassMapping("json", "io_targets.iJson", "io_targets.oJson", "Reads json files to 'obj' variable or writes all variables to json list")
miner_globals.addTargetToClassMapping("tsv", "io_targets.iTsv", "io_targets.oTsv", "tab separated value text")

miner_globals.addExtensionToTargetMapping(".csv", "csv")
miner_globals.addExtensionToTargetMapping(".tsv", "tsv")
miner_globals.addExtensionToTargetMapping(".pic", "pickle")
miner_globals.addExtensionToTargetMapping(".txt", "stdout")
miner_globals.addExtensionToTargetMapping(".log", "log")
miner_globals.addExtensionToTargetMapping(".json", "json")
miner_globals.addExtensionToTargetMapping("stdout", "csv")

DataProvider.registerDataProvider("file", FileDataProvider)
DataProvider.registerDataProvider("repository", RepositoryDataProvider)

import m.db
import m.db.sqlite_engine
sqliteEngine = m.db.sqlite_engine.SQLiteEngine()
m.db.registerEngine("file.db", sqliteEngine)
m.db.registerEngine("file.sqlite", sqliteEngine)

Example #34
 def setUp(self):
     self.data_provider = DataProvider(5, genre_dataset_size=40)
Example #35
class TestDataProvider(unittest.TestCase):

    def setUp(self):
        self.data_provider = DataProvider(5, genre_dataset_size=40)

    def test_get_output_shape(self):
        self.assertEqual(self.data_provider.get_output_shape(), (5, ))

    def test_setup(self):
        self.data_provider.setup()

        training_data = self.data_provider.get_all_training_data()
        test_data = self.data_provider.get_test_data()
        # Check proportions of training and test sets
        self.assertEqual(training_data.shape[0], 180)
        self.assertEqual(test_data.shape[0], 20)

        ids = numpy.array([])
        for example in training_data:
            ids = numpy.append(ids, example['id'])
        for example in test_data:
            ids = numpy.append(ids, example['id'])
        # Check that the union of training and test examples gives the entire dataset
        numpy.testing.assert_array_equal(numpy.sort(ids), numpy.sort(numpy.array(list(range(40)) * 5)))

    def test_get_next_batch(self):
        self.data_provider.setup()

        genres_count = numpy.zeros((5, ), dtype=int)
        for i in range(18):
            batch = self.data_provider.get_next_batch()
            for example in batch:
                genres_count[numpy.argmax(example['out'])] += 1
        numpy.testing.assert_array_equal(genres_count, numpy.array([36, 36, 36, 36, 36]))

    def test_get_all_training_data(self):
        self.data_provider.setup()

        training_data = self.data_provider.get_all_training_data()
        ids = numpy.array([])
        for example in training_data:
            # Check training examples have ids from the dataset range
            self.assertIn(example['id'], range(40))
            ids = numpy.append(ids, example['id'])

    def test_get_test_data(self):
        self.data_provider.setup()

        test_data = self.data_provider.get_test_data()
        ids = numpy.array([])
        for example in test_data:
            # Check test examples have ids from the dataset range
            self.assertIn(example['id'], range(40))
            ids = numpy.append(ids, example['id'])

    def test_get_test_data_for_genre(self):
        self.data_provider.setup()

        test_data_genre = self.data_provider.get_test_data_for_genre('classical')
        ids = numpy.array([])
        for example in test_data_genre:
            ids = numpy.append(ids, example['id'])
        # Check test set for 1 genre does not contain duplicates
        numpy.testing.assert_array_equal(numpy.unique(ids), ids)
        # Check size of test set per genre
        self.assertEqual(ids.shape[0], 4)

    def test_reset(self):
        self.data_provider.setup()

        for i in range(18):
            batch = self.data_provider.get_next_batch()
            self.assertIsNotNone(batch)

        batch = self.data_provider.get_next_batch()
        self.assertIsNone(batch)

        self.data_provider.reset()

        for i in range(18):
            batch = self.data_provider.get_next_batch()
            self.assertIsNotNone(batch)
Example #36
        print('username=[%s] password=[%s]' % (username, password))
        status, account = data_provider.get_account(username, password)
        result = {"status": status, "account": account}
        self.write(json.dumps(result, ensure_ascii=False))
 
class LinkerManagerRequestHandler(tornado.web.RequestHandler):
    #@tornado.web.authenticated
    def post(self):
        print('LinkerManager post %s' % self.request.uri)
        act = self.get_argument('act')
        if act == 'get_linkers':
            return self.get_linkers()
    
    def get_linkers(self):
        data = self.request.body
        print('request body is [%s]' % data)
        bee_id = self.get_body_argument('bee', '')
        status, linkers = data_provider.get_linkers(bee_id)
        result = {"status": status, "linkers": linkers}
        self.write(json.dumps(result, ensure_ascii=False))

data_provider = DataProvider()

if __name__ == "__main__":
    cfg = configparser.ConfigParser()
    cfg.read(sys.argv[1])
    data_provider.init(cfg)
    server = tornado.httpserver.HTTPServer(MyApplication())
    server.listen(80)
    tornado.ioloop.IOLoop.instance().start()