Example #1
    def test_build_schema(self):
        illegal_col_regex = re.compile(r'\W|[A-Z]')

        for dataset_name in self.TEST_DATASETS:
            dataset = Dataset.create(self.test_dataset_ids[dataset_name])
            Dataset.build_schema(dataset,
                    self.test_data[dataset_name].dtypes)

            # get dataset with new schema
            dataset = Dataset.find_one(self.test_dataset_ids[dataset_name])

            for key in [CREATED_AT, SCHEMA, UPDATED_AT]:
                self.assertTrue(key in dataset.keys())

            df_columns = self.test_data[dataset_name].columns.tolist()
            seen_columns = []

            for column_name, column_attributes in dataset[SCHEMA].items():
                # check column_name is unique
                self.assertFalse(column_name in seen_columns)
                seen_columns.append(column_name)

                # check column name is only legal chars
                self.assertFalse(illegal_col_regex.search(column_name))
                # check it has the required attributes
                self.assertTrue(SIMPLETYPE in column_attributes)
                self.assertTrue(OLAP_TYPE in column_attributes)
                self.assertTrue(LABEL in column_attributes)

                # check label is an original column
                self.assertTrue(column_attributes[LABEL] in df_columns)
                df_columns.remove(column_attributes[LABEL])

            # ensure every original column is accounted for in the stored schema
            self.assertTrue(len(df_columns) == 0)
Example #2
 def get(self):
     Dataset.deleteAll()
     Dumpfile.deleteAll()
     with open("predefined_datasets.yaml", 'r') as f:
         for dataset in yaml.safe_load(f):
             Dataset(name=dataset['name'], voidURI=dataset['voidURI']).put()

     return webapp2.redirect('/datasets')
Example #3
def import_dataset(_file, dataset):
    """
    For reading a URL and saving the corresponding dataset.
    """
    dframe = read_csv(_file)
    Dataset.build_schema(dataset, dframe.dtypes)
    Observation.save(dframe, dataset)
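A minimal usage sketch for import_dataset, not from the original project: the CSV path is hypothetical, and Dataset.create is the same helper used in the other examples.

import uuid

dataset = Dataset.create(uuid.uuid4().hex)
import_dataset('/tmp/sample.csv', dataset)  # hypothetical local CSV path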
Example #4
 def setUp(self):
     TestBase.setUp(self)
     self.dataset = Dataset.save(self.test_dataset_ids['good_eats.csv'])
     Dataset.build_schema(self.dataset,
             self.test_data['good_eats.csv'].dtypes)
     self.formula = 'rating'
     self.name = 'test'
Example #5
 def test_update(self):
     for dataset_name in self.TEST_DATASETS:
         dataset = Dataset.create(self.test_dataset_ids[dataset_name])
         self.assertFalse('field' in dataset)
         Dataset.update(dataset, {'field': {'key': 'value'}})
         dataset = Dataset.find_one(self.test_dataset_ids[dataset_name])
         self.assertTrue('field' in dataset)
         self.assertEqual(dataset['field'], {'key': 'value'})
Example #6
 def test_find(self):
     for dataset_name in self.TEST_DATASETS:
         record = Dataset.save(self.test_dataset_ids[dataset_name])
         cursor = Dataset.find(self.test_dataset_ids[dataset_name])
         rows = [x for x in cursor]
         self.assertTrue(isinstance(cursor, Cursor))
         self.assertEqual(record, rows[0])
         self.assertEqual(record, Dataset.find_one(
                     self.test_dataset_ids[dataset_name]))
Example #7
 def test_POST_remove_summary(self):
     Datasets().GET(self.dataset_id, mode=MODE_SUMMARY)
     dataset = Dataset.find_one(self.dataset_id)
     self.assertTrue(isinstance(dataset[STATS], dict))
     self.assertTrue(isinstance(dataset[STATS][ALL], dict))
     self._post_formula()
     # [STATS][ALL] should be removed
     dataset = Dataset.find_one(self.dataset_id)
     self.assertEqual(dataset[STATS].get(ALL), None)
Example #8
 def test_delete(self):
     for dataset_name in self.TEST_DATASETS:
         record = Dataset.save(self.test_dataset_ids[dataset_name])
         records = [x for x in
                 Dataset.find(self.test_dataset_ids[dataset_name])]
         self.assertNotEqual(records, [])
         Dataset.delete(self.test_dataset_ids[dataset_name])
         records = [x for x in
                 Dataset.find(self.test_dataset_ids[dataset_name])]
         self.assertEqual(records, [])
Example #9
    def DELETE(self, dataset_id):
        """
        Delete observations (i.e. the dataset) with hash *dataset_id* from mongo
        """
        dataset = Dataset.find_one(dataset_id)
        result = None

        if dataset:
            Dataset.delete(dataset_id)
            Observation.delete(dataset)
            result = {SUCCESS: 'deleted dataset: %s' % dataset_id}
        return dump_or_error(result, 'id not found')
Example #10
 def update(cls, dframe, dataset):
     """
     Update *dataset* by overwriting all observations with the given
     *dframe*.
     """
     previous_dtypes = cls.find(dataset, as_df=True).dtypes.to_dict()
     new_dtypes = dframe.dtypes.to_dict().items()
     cols_to_add = dict([(name, dtype) for name, dtype in
                 new_dtypes if name not in previous_dtypes])
     Dataset.update_schema(dataset, cols_to_add)
     cls.delete(dataset)
     cls.save(dframe, dataset)
     return cls.find(dataset, as_df=True)
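A hedged sketch of refreshing a dataset's observations with this update method, assuming it lives on the Observation model as the surrounding calls suggest; the replacement CSV path and dataset_id are illustrative only.

from pandas import read_csv

dataset = Dataset.find_one(dataset_id)          # dataset_id assumed to be known
new_dframe = read_csv('/tmp/updated_rows.csv')  # hypothetical replacement data
updated_dframe = Observation.update(new_dframe, dataset)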
Example #11
    def combined_dataset(cls, ids, window_length):
        dataset = Dataset.empty()
        for id in ids:
            session = cls.from_api(id)
            windows = list(session.window_gen(window_length=window_length))
            dataset = dataset + session.dataset(windows)

        return dataset
Example #12
 def get(self, datasetID):
     dataset = Dataset.get_by_id(long(datasetID))   
     crawl = Crawl(dataset=dataset, status='QUEUED')
     crawl.put()
     
     # Queue the crawl immediately
     crawl.queue(5)
     return webapp2.redirect('/datasets/' + datasetID)
Example #13
 def __init__(self, exchange, period_start: datetime, period_end=None, interval=60, *args, **kwargs):
     self.exchange = exchange
     self.interval = interval
     self.period_start = period_start
     self.period_end = period_end
     self.start = datetime.now()
     self.dataset = Dataset().create(data={
         'exchange': '/api/exchanges/' + self.exchange.name.lower(),
         'periodStart': self.period_start,
         'periodEnd': self.period_end,
         'candleSize': 60,
         'currency': '/api/currencies/' + self.exchange.currency.lower(),
         'asset': '/api/currencies/' + self.exchange.asset.lower(),
     })
Example #14
 def get(self, datasetID):
     dataset = Dataset.get_by_id(long(datasetID))
     for crawl in Crawl.all().filter('dataset =', dataset).run():
         crawl.delete()
         
     for dump in Dumpfile.all().filter('dataset =', dataset).run():
         dump.delete()
     
     dataset.delete()
     logging.info('Deleted dataset ' + datasetID)
     return webapp2.redirect('/datasets')
Example #15
    def dataset(self, windows, remove_seconds=0):
        if len(windows) == 0:
            return Dataset.empty()

        n_samples = len(windows)
        n_channels = len(self.ch_names)
        window_length = np.shape(windows)[1]

        X = np.empty([n_samples, n_channels, window_length])
        y = np.empty([n_samples], dtype=np.int8)

        for i, window in enumerate(windows):
            X[i] = window[:, 0:n_channels].T
            y[i] = int(max(window[:, -1]))

        if remove_seconds > 0:
            change_points = []
            action_labels = []
            for i in range(1, len(y), 1):
                if y[i] != y[i - 1]:
                    change_points.append(i)
                    action_labels.append(np.max(y[i - 1:i + 1]))

            remove_distance = (250 * remove_seconds) / window_length
            keep_indices = []
            for i in range(len(y)):
                label = y[i]
                if label == 0:
                    viable = True
                    for point in change_points:
                        if np.abs(i - point) <= remove_distance:
                            viable = False
                    if viable:
                        keep_indices.append(i)
                else:
                    keep_indices.append(i)

            X = X[keep_indices]
            y = y[keep_indices]

        return Dataset(X, y, self.person_id, self.id)
Example #16
def start_dataset_creating():
    start = datetime.datetime.now()
    dataset_name = 'dataset_' + str(time.time()).replace('.', '')
    dataset_dir = os.path.join(current_app.config['DATASET_DIR'], dataset_name)
    os.makedirs(dataset_dir, exist_ok=True)

    dataset = Dataset(name=dataset_name,
                      path=dataset_dir,
                      dt_start=start,
                      status=DatasetStatus.start,
                      type=DatasetType.top_one)
    db.session.add(dataset)
    db.session.commit()

    collector = DatasetCollector(dataset_model=dataset)

    try:
        # TODO: add dataset parameters
        collector.create_doctor_item_base_matrix()
        collector.create_datasets_for_catboost(min_appts=10)
    except Exception as e:
        traceback.print_exc()
        dataset.status = DatasetStatus.fail
        dataset.error = str(e)
    else:
        dataset.status = DatasetStatus.end
    finally:
        dataset.dt_end = datetime.datetime.now()
        db.session.add(dataset)
        db.session.commit()

    return redirect(url_for('dataset.main'))
Example #17
 def setUp(self):
     TestBase.setUp(self)
     self.dataset = Dataset.save(self.test_dataset_ids['good_eats.csv'])
     dframe = self.test_data['good_eats.csv']
     Dataset.build_schema(self.dataset, dframe.dtypes)
     Observation.save(dframe, self.dataset)
     self.calculations = [
         'rating',
         'gps',
         'amount + gps_alt',
         'amount - gps_alt',
         'amount + 5',
         'amount - gps_alt + 2.5',
         'amount * gps_alt',
         'amount / gps_alt',
         'amount * gps_alt / 2.5',
         'amount + gps_alt * gps_precision',
         '(amount + gps_alt) * gps_precision',
         'amount = 2',
         '10 < amount',
         '10 < amount + gps_alt',
         'not amount = 2',
         'not(amount = 2)',
         'amount = 2 and 10 < amount',
         'amount = 2 or 10 < amount',
         'not not amount = 2 or 10 < amount',
         'not amount = 2 or 10 < amount',
         '(not amount = 2) or 10 < amount',
         'not(amount = 2 or 10 < amount)',
         'amount ^ 3',
         '(amount + gps_alt) ^ 2 + 100',
         '-amount',
         '-amount < gps_alt - 100',
         'rating in ["delectible"]',
         'risk_factor in ["low_risk"]',
         'amount in ["9.0", "2.0", "20.0"]',
         '(risk_factor in ["low_risk"]) and (amount in ["9.0", "20.0"])',
     ]
     self.places = 5
Example #18
def parse_datasets(res):
    soup = BeautifulSoup(res.text, 'html.parser')
    views = soup.findAll('div', {'class': 'views-row'})
    datasets = []
    for view in views:
        div = view.find('div', {'class': 'views-field-body'})
        if div is not None:
            el = div.find('a')
            datasetid = el['href'].replace("/download/content/", "")
            name = el.text
            datasets.append(Dataset(datasetid, name))

    return datasets
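A short sketch of feeding parse_datasets, assuming the listing page is fetched with requests; the URL is a placeholder.

import requests

res = requests.get('https://example.org/datasets')  # placeholder listing URL
for ds in parse_datasets(res):
    print(ds)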
Example #19
File: io.py Project: asseym/bamboo
def create_dataset_from_csv(csv_file):
    """
    Create a dataset from the uploaded .csv file.
    """
    dataset_id = uuid.uuid4().hex
    dataset = Dataset.create(dataset_id)

    # write out to a named tempfile so pandas read_csv
    # can open the uploaded data by file name
    with tempfile.NamedTemporaryFile() as tmpfile:
        tmpfile.write(read_uploaded_file(csv_file))
        tmpfile.flush()  # make sure the data is on disk before it is read back
        import_dataset(tmpfile.name, dataset)

    return {ID: dataset_id}
Example #20
    def GET(self, dataset_id, mode=False, query='{}', select=None,
            group=ALL):
        """
        Return data set for hash *dataset_id*.
        Execute query *query* in mongo if passed.
        If summary is passed return summary statistics for data set.
        If group is passed group the summary, if summary is false group is
        ignored.
        """
        dataset = Dataset.find_one(dataset_id)
        result = None

        try:
            if dataset:
                if mode == MODE_INFO:
                    result = Dataset.schema(dataset)
                elif mode == MODE_SUMMARY:
                    result = summarize(dataset, query, select, group)
                else:
                    return mongo_to_json(Observation.find(dataset, query,
                                select))
        except JSONError, e:
            result = {ERROR: e.__str__()}

        return dump_or_error(result, 'id not found')
Example #21
    def full_dataset_gen(cls, window_length, count=1, sessions=None):

        if sessions is None:
            Print.info("Fetching sessions")
            sessions = Session.fetch_all(only_real=True,
                                         include_timeframes=True)

        for _ in range(count):
            dataset = Dataset.empty()
            for session in sessions:
                windows = list(session.window_gen(window_length=window_length))
                dataset = dataset + session.dataset(windows=windows)

            yield dataset
Example #22
 def sample(self, n):
     """Sample a batch size n of experience"""
     if len(self.memory) < n:
         raise IndexError('Size of memory ({}) is less than requested sample ({})'.format(len(self), n))
     else:
         scores = [x[1] for x in self.memory]
         sample = np.random.choice(len(self), size=n, replace=False, p=scores / np.sum(scores))
         sample = [self.memory[i] for i in sample]
         smiles = [x[0] for x in sample]
         scores = [x[1] for x in sample]
         prior_likelihood = [x[2] for x in sample]
     tokenized = [self.voc.tokenize(smile) for smile in smiles]
     encoded = [self.voc.encode(tokenized_i) for tokenized_i in tokenized]
     encoded = Dataset.collate_fn(encoded)
     return encoded, np.array(scores), np.array(prior_likelihood)
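Assuming experience is an already-populated instance of the memory class this method belongs to, a call might look like this (the batch size is arbitrary):

encoded, scores, prior_likelihood = experience.sample(32)
print(scores.mean(), prior_likelihood.mean())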
Example #23
def summarize(dataset, query, select, group):
    """
    Return a summary for the rows/values filtered by *query* and *select*
    and grouped by *group* or the overall summary if no group is specified.
    """
    # narrow list of observations via query/select
    dframe = Observation.find(dataset, query, select, as_df=True)

    # do not allow group by numeric types
    # TODO check schema for valid groupby columns once included
    _type = dframe.dtypes.get(group)
    if group != ALL and (_type is None or _type.type != np.object_):
        return {ERROR: "group: '%s' is not categorical." % group}

    # check cached stats for group and update as necessary
    stats = dataset.get(STATS, {})
    if not stats.get(group):
        stats = {ALL: summarize_df(dframe)} if group == ALL \
                else summarize_with_groups(dframe, stats, group)
        Dataset.update(dataset, {STATS: stats})
    stats_to_return = stats.get(group)

    return dict_from_mongo(stats_to_return if group == ALL else {group:
            stats_to_return})
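A minimal sketch of calling summarize directly, mirroring the GET handler in Example #20; dataset_id is assumed to be a known hash and ALL is the group constant used above.

dataset = Dataset.find_one(dataset_id)
overall_stats = summarize(dataset, query='{}', select=None, group=ALL)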
Example #24
 def _test_summary_no_group(self, results):
     result_keys = results.keys()
     print result_keys
     print self.test_data[self._file_name].columns.tolist()
     self.assertEqual(len(result_keys), self.NUM_COLS)
     columns = [col for col in
             self.test_data[self._file_name].columns.tolist()
             if col not in MONGO_RESERVED_KEYS]
     dataset = Dataset.find_one(self.dataset_id)
     labels_to_slugs = build_labels_to_slugs(dataset)
     for col in columns:
         slug = labels_to_slugs[col]
         self.assertTrue(slug in result_keys,
                 'col (slug): %s in: %s' % (slug, result_keys))
         self.assertTrue(SUMMARY in results[slug].keys())
Example #25
def generate_yaml_from_netCDF(nc_path, product_name, product_description,
                              no_data):
    # Extract data from netCDF file
    dataset = xarray.load_dataset(nc_path)

    measurements = []
    dims = list(dataset.sizes)  # dimension names
    for var in dataset:
        ds_var = dataset[var].variable
        if "units" in ds_var.attrs.keys():
            measurements.append(
                Measurement(
                    var,
                    ds_var.dtype.name,
                    ds_var.attrs["units"],
                    no_data,
                    Path(nc_path).name,
                ))

    # Classes generation
    dataset = Dataset(
        product_name,
        dataset.longitude.data,
        dataset.latitude.data,
        measurements,
    )

    product = Product(
        product_name,
        product_description,
        measurements=measurements,
        storage_driver="NetCDF CF",
        storage_dimension_order=dims,
    )

    # YAML configuration
    yaml.emitter.Emitter.process_tag = lambda self, *args, **kw: None
    CWD = os.path.dirname(__file__)
    # Product generation
    with open(os.path.join(CWD, "./tests/product_generated.yaml"), "w") as f:
        yaml.dump(product, f, sort_keys=False)

    # Dataset generation
    data = yaml.dump(dataset, sort_keys=False)
    data = data.replace("'%", "")
    data = data.replace("%'", "")
    with open(os.path.join(CWD, "./tests/dataset_generated.yaml"), "w") as f:
        f.write(data)
Example #26
 def get(self, datasetID):
     startIn = self.request.get('start').split(':')
     if len(startIn) == 2:
         logging.info('Queuing harvest in ' + startIn[0] + ' hours ' + startIn[1] + ' minutes')
         seconds = int(startIn[0]) * 3600 + int(startIn[1]) * 60
         dataset = Dataset.get_by_id(long(datasetID))
         
         # TODO: store 'interval' param in dataset object (if any)
         
         crawl = Crawl(dataset=dataset, status='QUEUED')
         crawl.put()
         crawl.queue(seconds)
         return webapp2.redirect('/datasets/' + datasetID)        
     else:
         # TODO: decent error handling
         logging.info('Invalid crawl time: ' + self.request.get('start'))
         return webapp2.redirect('/datasets/' + datasetID + '?error=true')
Example #27
 def initiate_from_file(self, fname, scoring_function, Prior):
     """Adds experience from a file with SMILES
        Needs a scoring function and an RNN to score the sequences"""
     with open(fname, 'r') as f:
         smiles = []
         for line in f:
             smile = line.split()[0]
             if Chem.MolFromSmiles(smile):
                 smile = Chem.MolToSmiles(Chem.MolFromSmiles(smile), isomericSmiles=False)
                 smiles.append(smile)
     scores = scoring_function(smiles)
     tokenized = [self.voc.tokenize(smile) for smile in smiles]
     encoded = [self.voc.encode(tokenized_i) for tokenized_i in tokenized]
     encoded = Dataset.collate_fn(encoded)
     prior_likelihood, _ = Prior.likelihood(encoded.long())
     prior_likelihood = prior_likelihood.data.cpu().numpy()
     new_experience = zip(smiles, scores, prior_likelihood)
     self.add_experience(new_experience)
Example #28
File: io.py Project: asseym/bamboo
def create_dataset_from_url(url, allow_local_file=False):
    """
    Load a URL, read from a CSV, create a dataset and return the unique ID.
    """
    _file = None

    try:
        _file = open_data_file(url, allow_local_file)
    except (IOError, urllib2.HTTPError):
        # error reading file/url, return
        pass

    if not _file:
        # could not get a file handle
        return {ERROR: 'could not get a filehandle for: %s' % url}

    dataset_id = uuid.uuid4().hex
    dataset = Dataset.create(dataset_id)
    import_dataset(_file, dataset)

    return {ID: dataset_id}
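A hedged usage sketch; the URL is a placeholder and, per the code above, the returned dict is keyed by either ID or ERROR.

result = create_dataset_from_url('http://example.com/data.csv')  # placeholder URL
new_dataset_id = result.get(ID)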
Example #29
    def __init__(self, exchange: Exchange, timeout=60, *args, **kwargs):
        super().__init__(exchange, timeout, *args, **kwargs)
        self.buy_price = 0
        self.sell_price = 0
        self.stop_loss = 0

        self.market_delta = 0

        self.advised = False
        self.waiting_order = False
        self.fulfilled_orders = []
        self.last_price = 0
        # create a dataset for the session
        self.dataset = Dataset().create(
            data={
                'exchange': self.exchange.name.lower(),
                'periodStart': datetime.now(),
                'candleSize': 60,
                'currency': self.exchange.currency,
                'asset': self.exchange.asset
            })
Example #30
    def __init__(self,
                 exchange: Exchange,
                 period_start: datetime,
                 period_end=None,
                 interval=60):
        self.launchedAt = datetime.now()
        # Try to find dataset
        dataset = Dataset().get({
            "exchange": exchange.name.lower(),
            "currency": exchange.currency.lower(),
            "asset": exchange.asset.lower(),
            "periodStart": period_start,
            "periodEnd": period_end,
            "candleSize": interval
        })
        if dataset and len(dataset) > 0:
            print(dataset)
            print(dataset[0])
            print("Dataset found: " + dataset[0]['uuid'])
            price = Price()
            for prices in price.query('get', {"dataset": dataset[0]['uuid']}):
                for price in prices:
                    print(price)
                    newPrice = Price()
                    newPrice.populate(price)
                    exchange.strategy.set_price(newPrice)
                    exchange.strategy.run()
        else:
            print("Dataset not found, external API call to " + exchange.name)
            for price in exchange.historical_symbol_ticker_candle(
                    period_start, period_end, interval):
                exchange.strategy.set_price(price)
                exchange.strategy.run()

        execution_time = datetime.now() - self.launchedAt
        print('Execution time: ' + str(execution_time.total_seconds()) +
              ' seconds')
        sys.exit(0)
Example #31
    config = yaml.safe_load(f)

# load logger
lc = config['environment']['log_config']
logging.config.fileConfig(lc)
logs = logging.getLogger()

# load device config
cuda = config['environment']['cuda']
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# load dataloader
it = config['test']['image_root']
bs = 1
iz = None
data = Dataset(it, iz, cuda)
loader = DataLoader(data, bs, cuda)

# load color transform network
net_col = col.Generator(2)
net_col = nn.DataParallel(net_col)
net_col = net_col.cuda() if cuda else net_col

# load temporal constraint network
net_tem = tem.Generator(64)
net_tem = nn.DataParallel(net_tem)
net_tem = net_tem.cuda() if cuda else net_tem

# load pretrained models
# col_gen.load_state_dict(torch.load(test['load_pretrain_model'][0], map_location='cpu'))
# tem_gen.load_state_dict(torch.load(test['load_pretrain_model'][1], map_location='cpu'))
Example #32
elif mode == 'live':
    exchange.start_symbol_ticker_socket(exchange.get_symbol())

elif mode == 'backtest':
    period_start = config('PERIOD_START')
    period_end = config('PERIOD_END')

    print("Backtest period from {} to {} with {} seconds candlesticks.".format(
        period_start, period_end, interval))

    # Try to find dataset
    dataset = Dataset().query(
        'get', {
            "exchange": '/api/exchanges/' + exchange.name.lower(),
            "currency": '/api/currencies/' + currency.lower(),
            "asset": '/api/currencies/' + asset.lower(),
            "period_start": period_start,
            "period_end": period_end,
            "candleSize": interval
        })

    if dataset and len(dataset) > 0:
        print(dataset[0])
        price = Price()
        for price in price.query('get', {"dataset": dataset[0]['uuid']}):
            newPrice = Price()
            newPrice.populate(price)
            exchange.strategy.set_price(newPrice)
            exchange.strategy.run()
    else:
        print("Dataset not found, external API call to " + exchange.name)
Example #33
 def get(self):
     datasets = Dataset.all()
     self.render_response('datasets/datasets_listall.html', **{'datasets':datasets})
Example #34
 def post(self):
     dataset = Dataset(name=self.request.get('name'),
                       voidURI=self.request.get('voidURI'))
     dataset.put()
     
     return webapp2.redirect('/datasets')
Example #35
        except ParseError, err:
            # do not save record, return error
            return {ERROR: err}

        record = {
            DATASET_ID: dataset[DATASET_ID],
            cls.FORMULA: formula,
            cls.NAME: name,
        }
        cls.collection.insert(record)

        # invalidate summary ALL since we have a new column
        stats = dataset.get(STATS)
        if stats:
            del stats[ALL]
            del dataset[STATS]
            Dataset.update(dataset, {STATS: stats})

        # call remote calculate and pass calculation id
        calculate_column.delay(dataset, dframe, formula, name)
        return mongo_remove_reserved_keys(record)

    @classmethod
    def find(cls, dataset):
        """
        Return the calculations for given *dataset*.
        """
        return [mongo_remove_reserved_keys(record) for record in cls.collection.find({
            DATASET_ID: dataset[DATASET_ID],
        })]
Example #36
 def test_save(self):
     for dataset_name in self.TEST_DATASETS:
         record = Dataset.save(self.test_dataset_ids[dataset_name])
         self.assertTrue(isinstance(record, dict))
         self.assertTrue('_id' in record.keys())
Example #37
 def get(self, datasetID):
     dataset = Dataset.get_by_id(long(datasetID))
     self.response.write(json.dumps(dataset.toJSON()))
Example #38
 def get(self, datasetID):
     dataset = Dataset.get_by_id(long(datasetID))
     self.render_response('datasets/datasets_show.html', **{'dataset':dataset})
Example #39
 def test_create(self):
     for dataset_name in self.TEST_DATASETS:
         dataset = Dataset.create(self.test_dataset_ids[dataset_name])
         self.assertTrue(isinstance(dataset, dict))
Example #40
from models.dataset import Dataset

# Creating a dataset of dimension 2 in input and 3 in output
dset = Dataset(2, 3)

# Adding datapoints
dset.add_xy([0.0, 1.0], [ 1.0, 2.0, 0.0])
dset.add_xy([1.0, 0.0], [ 0.0, 0.0, 2.0])
dset.add_xy([2.0,-1.0], [-1.0,-2.0, 4.0])

# Nearest-neighbor query on the input space, requesting 2 neighbors
dset.nn_x([0.2, 0.5], 2)
# Nearest-neighbor query on the output space, requesting 1 neighbor
dist, index = dset.nn_y([1.0, 1.0, 1.0], 1)

# Retrieving the nearest output of [1.0, 1.0, 1.0]
print dset.get_y(index[0])
# Retrieving the nearest datapoint
print dset.get_xy(index[0])
Example #41
    def _test_calculator(self, delay=True):
        dframe = Observation.find(self.dataset, as_df=True)

        columns = dframe.columns.tolist()
        start_num_cols = len(columns)
        added_num_cols = 0

        column_labels_to_slugs = build_labels_to_slugs(self.dataset)
        label_list, slugified_key_list = [list(ary) for ary in
                zip(*column_labels_to_slugs.items())]

        for idx, formula in enumerate(self.calculations):
            name = 'test-%s' % idx
            if delay:
                task = calculate_column.delay(self.dataset, dframe,
                        formula, name)
                # test that task has completed
                self.assertTrue(task.ready())
                self.assertTrue(task.successful())
            else:
                task = calculate_column(self.dataset, dframe,
                        formula, name)

            column_labels_to_slugs = build_labels_to_slugs(self.dataset)

            unslug_name = name
            name = column_labels_to_slugs[unslug_name]

            # test that updated dataframe persisted
            dframe = Observation.find(self.dataset, as_df=True)
            self.assertTrue(name in dframe.columns)

            # test new number of columns
            added_num_cols += 1
            self.assertEqual(start_num_cols + added_num_cols,
                    len(dframe.columns.tolist()))

            # test that the schema is up to date
            dataset = Dataset.find_one(self.dataset[DATASET_ID])
            self.assertTrue(SCHEMA in dataset.keys())
            self.assertTrue(isinstance(dataset[SCHEMA], dict))
            schema = dataset[SCHEMA]

            # test slugified column names
            slugified_key_list.append(name)
            self.assertEqual(sorted(schema.keys()), sorted(slugified_key_list))

            # test column labels
            label_list.append(unslug_name)
            labels = [schema[col][LABEL] for col in schema.keys()]
            self.assertEqual(sorted(labels), sorted(label_list))

            # test result of calculation
            formula = column_labels_to_slugs[formula]

            for idx, row in dframe.iterrows():
                try:
                    result = np.float64(row[name])
                    stored = np.float64(row[formula])
                    # np.nan != np.nan, continue if we have two nan values
                    if np.isnan(result) and np.isnan(stored):
                        continue
                    msg = self._equal_msg(result, stored, formula)
                    self.assertAlmostEqual(result, stored, self.places, msg)
                except ValueError:
                    msg = self._equal_msg(row[name], row[formula], formula)
                    self.assertEqual(row[name], row[formula], msg)
Example #42
 def setUp(self):
     TestBase.setUp(self)
     self.dataset = Dataset.save(self.test_dataset_ids['good_eats.csv'])
     Dataset.build_schema(self.dataset,
             self.test_data['good_eats.csv'].dtypes)