def connect_to_monary_and_email_operator(ds, **kwargs):
    """Print the summed population of every state found in ``zips.data``.

    A ``$group`` aggregation that sums ``pop`` per ``state`` is run through
    Monary. Each returned state value is decoded from UTF-8 and the pairs are
    printed as a list of ``"<state>: <population>"`` strings.

    ``ds`` and ``kwargs`` are accepted but never read by this function.
    """
    client = Monary()
    group_by_state = [{"$group": {"_id": "$state", "totPop": {"$sum": "$pop"}}}]
    field_names = ["_id", "totPop"]
    field_types = ["string:2", "int64"]
    states, population = client.aggregate(
        "zips", "data", group_by_state, field_names, field_types)
    decoded = [raw.decode("utf-8") for raw in states]
    result = ["%s: %d" % pair for pair in zip(decoded, population)]
    print(result)
Пример #2
0
def do_insert():
    """Insert five generated columns, ``x0`` .. ``x4``, through Monary.

    Every column holds ``NUM_BATCHES * BATCH_SIZE`` values produced by
    ``nprand.uniform(0, i + 1, ...)`` wrapped in a masked array whose mask is
    all zeros. The insert into ``monary_test.collection`` runs inside the
    ``"monary insert"`` profile block with write concern
    ``MONARY_W_DEFAULT``.
    """
    m = Monary()
    num_docs = NUM_BATCHES * BATCH_SIZE
    params = []
    for column in range(5):
        values = nprand.uniform(0, column + 1, num_docs)
        mask = np.zeros(num_docs)
        params.append(
            MonaryParam(ma.masked_array(values, mask), "x%d" % column))
    wc = WriteConcern(w=MONARY_W_DEFAULT)
    with profile("monary insert"):
        m.insert("monary_test", "collection", params, write_concern=wc)
Пример #3
0
def connect_to_monary_and_email_operator(ds, **kwargs):
    """Aggregate population per state from ``zips.data`` and print it.

    Runs a ``$group`` pipeline (sum of ``pop`` keyed by ``state``) through
    Monary, decodes each returned state value from UTF-8 and prints a list
    of ``"<state>: <population>"`` strings.

    Neither ``ds`` nor ``kwargs`` is used in the body.
    """
    monary_client = Monary()
    stages = [{"$group": {"_id": "$state", "totPop": {"$sum": "$pop"}}}]
    states, population = monary_client.aggregate(
        "zips",
        "data",
        stages,
        ["_id", "totPop"],
        ["string:2", "int64"],
    )
    names = [encoded.decode("utf-8") for encoded in states]
    result = []
    for state, pop in zip(names, population):
        result.append("%s: %d" % (state, pop))
    print(result)
Пример #4
0
def do_insert():
    """Time a Monary insert of five generated columns (``x0`` .. ``x4``).

    Column ``i`` is ``nprand.uniform(0, i + 1, num_docs)`` wrapped in a
    masked array with an all-zero mask, where ``num_docs`` is
    ``NUM_BATCHES * BATCH_SIZE``. The columns are written to
    ``monary_test.collection`` under the ``"monary insert"`` profile label.
    """

    def build_param(i, size):
        # One MonaryParam per generated column, named after its index.
        column = ma.masked_array(nprand.uniform(0, i + 1, size),
                                 np.zeros(size))
        return MonaryParam(column, "x%d" % i)

    m = Monary()
    num_docs = NUM_BATCHES * BATCH_SIZE
    params = [build_param(i, num_docs) for i in range(5)]
    wc = WriteConcern(w=MONARY_W_DEFAULT)
    with profile("monary insert"):
        m.insert("monary_test", "collection", params, write_concern=wc)
Пример #5
0
def monary_load(start=0, stop=-1, find_args=None, species_to_retrieve=None):
    """Load a design matrix from the ``creeval`` database through Monary.

    The metadata columns (``num_metadata`` + ``cat_metadata``) and the
    selected species columns are all requested as ``float32``. Masked
    metadata entries are replaced by the column mean, the metadata is
    standardised with a ``StandardScaler`` (which is pickled to
    ``<collection>_scaler.pkl``), and the species columns are binarised to
    ``0``/``1`` depending on whether the value is greater than zero.

    :param start: offset passed to the Monary query.
    :param stop: used to compute the query limit as ``stop - start``.
        NOTE(review): with the default ``stop=-1`` the limit is negative;
        confirm how Monary interprets that.
    :param find_args: optional base query dict. It is copied, so the caller's
        dict is never modified; an ``"$or"`` clause requiring at least one
        selected species to be ``> 0`` is added to the copy.
    :param species_to_retrieve: optional list of species names. Names not in
        the module-level ``species`` list are dropped; when empty or omitted,
        all of ``species`` is used.
    :return: ``DenseDesignMatrix(X=<scaled metadata>, y=<binary species>)``.
    """
    # Work on a private copy: the original mutated the (shared) default dict
    # and any dict handed in by the caller.
    find_args = dict(find_args) if find_args else {}

    if not species_to_retrieve:
        species_to_retrieve = species
    else:
        species_to_retrieve = [s for s in species_to_retrieve if s in species]

    # Keyed by species name so repeated names yield a single "$or" clause.
    query = {s: {"$gt": 0} for s in species_to_retrieve}
    find_args["$or"] = [{name: cond} for name, cond in query.items()]

    n_metadata = len(num_metadata) + len(cat_metadata)
    fields = num_metadata + cat_metadata + species_to_retrieve
    with Monary("127.0.0.1") as monary:
        out = monary.query(
            "creeval",
            collection,
            find_args,
            fields,
            ["float32"] * (n_metadata + len(species_to_retrieve)),
            limit=(stop - start),
            offset=start)

    # Impute masked metadata values with the mean of their column.
    for i, col in enumerate(out[0:n_metadata]):
        out[i] = np.ma.filled(col, np.ma.mean(col))

    out = np.ma.row_stack(out).T
    X = out[:, 0:n_metadata]
    y = out[:, n_metadata:]
    y = (y > 0).astype(int)

    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    # Close the file deterministically instead of leaking the handle.
    with open(collection + "_scaler.pkl", "wb") as scaler_file:
        pickle.dump(scaler, scaler_file)
    y = np.asarray(y)

    return DenseDesignMatrix(X=X, y=y)
Пример #6
0
    def _get_validation_mse(self):
        """Fetch the validation mean-squared-error history for each fold.

        For both folds the ``validation_scores`` collection is queried for
        this experiment's ``iteration``,
        ``scores.regression.mean_squared_error`` and ``source_id`` fields.

        :return: dict mapping fold name to a DataFrame with columns ``loss``
            and ``source_id``, indexed by iteration and sorted by that index.
        """
        client = Monary(host=self.mongo_host)

        def get_mse_for_fold(fold):
            selector = {'experiment_id': self.experiment_id, 'fold': fold}
            field_names = [
                'iteration',
                'scores.regression.mean_squared_error',
                'source_id',
            ]
            iterations, loss, source_id = client.query(
                db=self.mongo_db,
                coll='validation_scores',
                query=selector,
                fields=field_names,
                types=['int32', 'float32', 'int8'])
            frame = pd.DataFrame(
                {'loss': loss, 'source_id': source_id}, index=iterations)
            return frame.sort_index()

        fold_names = [
            'unseen_appliances',
            'unseen_activations_of_seen_appliances',
        ]
        return {fold: get_mse_for_fold(fold) for fold in fold_names}
Пример #7
0
    def _get_train_costs(self):
        """Return this experiment's training loss history.

        Queries the ``train_scores`` collection for ``iteration``, ``loss``
        and ``source_id``.

        :return: DataFrame with columns ``loss`` and ``source_id``, indexed
            by iteration and sorted by that index.
        """
        client = Monary(host=self.mongo_host)
        selector = {'experiment_id': self.experiment_id}
        iterations, loss, source_id = client.query(
            db=self.mongo_db,
            coll='train_scores',
            query=selector,
            fields=['iteration', 'loss', 'source_id'],
            types=['int32', 'float32', 'int8'],
        )
        columns = {'loss': loss, 'source_id': source_id}
        return pd.DataFrame(columns, index=iterations).sort_index()
Пример #8
0
    def _get_train_costs(self):
        """Load the training scores of the current experiment.

        ``iteration``, ``loss`` and ``source_id`` are read from the
        ``train_scores`` collection through Monary.

        :return: DataFrame of ``loss`` and ``source_id`` indexed by
            iteration, sorted by index.
        """
        field_names = ['iteration', 'loss', 'source_id']
        field_types = ['int32', 'float32', 'int8']

        monary = Monary(host=self.mongo_host)
        fetched = monary.query(db=self.mongo_db,
                               coll='train_scores',
                               query={'experiment_id': self.experiment_id},
                               fields=field_names,
                               types=field_types)
        iterations, loss, source_id = fetched

        unsorted_scores = pd.DataFrame(
            {'loss': loss, 'source_id': source_id}, index=iterations)
        scores_df = unsorted_scores.sort_index()
        return scores_df
Пример #9
0
def load_from_db(db, collection, field_names, field_types, db_host, query_spec=None):
    """Query one collection through Monary and return the column arrays.

    :param db: database name.
    :param collection: collection name.
    :param field_names: field names to read from each Mongo record.
    :param field_types: Monary type string for each entry of ``field_names``.
    :param db_host: mapping with ``"hostname"``, ``"username"`` and
        ``"password"`` keys; the connection is opened with
        ``database="admin"``.
    :param query_spec: optional Mongo query dict; omitted or ``None`` means
        an empty query.
    :return: tuple ``(field_names, arrays)`` where ``arrays`` is whatever
        ``Monary.query`` returned for the requested fields.
    """
    # A fresh dict per call avoids sharing one mutable default object with
    # code outside this function.
    if query_spec is None:
        query_spec = {}

    with Monary(host=db_host["hostname"], username=db_host["username"], password=db_host["password"],
                database="admin") as monary:
        arrays = monary.query(
            db,  # database name
            collection,  # collection name
            query_spec,  # query spec
            field_names,  # field names (in Mongo record)
            field_types  # Monary field types
        )

    return field_names, arrays
Пример #10
0
def do_monary_query():
    """Time a full Monary query of ``monary_test.collection``.

    Reads fields ``x1`` .. ``x5`` as ``float64`` inside the
    ``"monary query"`` profile block, then prints the mean of each returned
    array as evidence that data was loaded.
    """
    field_names = ["x1", "x2", "x3", "x4", "x5"]
    field_types = ["float64"] * 5
    match_all = {}

    with Monary("127.0.0.1") as m:
        with profile("monary query"):
            arrays = m.query("monary_test", "collection", match_all,
                             field_names, field_types)

    # prove that we did something...
    column_means = numpy.mean(arrays, axis=-1)
    print(column_means)
Пример #11
0
def do_monary_block_query():
    """Time a block-wise Monary query of ``monary_test.collection``.

    Fields ``x1`` .. ``x5`` are read as ``float64`` in blocks of
    ``32 * 1024`` documents inside the ``"monary block query"`` profile
    block. The number of visited items and the per-field running sums are
    accumulated, then the count and the per-field means are printed.
    """
    count = 0
    sums = numpy.zeros((5, ))
    with Monary("127.0.0.1") as m:
        with profile("monary block query"):
            for arrays in m.block_query(
                    "monary_test",  # database name
                    "collection",  # collection name
                    {},  # query spec
                    ["x1", "x2", "x3", "x4", "x5"],  # field names
                    ["float64"] * 5,  # field types
                    block_size=32 * 1024,
            ):
                count += len(arrays[0])
                sums += [numpy.sum(arr) for arr in arrays]

    # Single-argument print calls behave the same on Python 2 and Python 3;
    # the original print statements were a SyntaxError on Python 3.
    print("visited %i items" % count)
    print(sums / count)  # prove that we did something...
Пример #12
0
# Script to save a CSV with comment data.
#
# Lists the keys of the first ``comStats`` document, loads three numeric
# columns through Monary and writes them, tab-separated, to pandasTest.csv.
import pymongo
import urllib3
from pymongo import MongoClient
from hpfunctions import getUrl, stripWhite, stripWhiteList
from lxml import html
import pandas
import numpy
from monary import Monary

client = MongoClient()
db = client['hp']

mon = Monary()

# Single-argument print calls work on both Python 2 and Python 3; the
# original print statements were a SyntaxError on Python 3.
print('available columns')

# Iterating the first document yields its field names.
for i in db.comStats.find()[0]:
    print(i)

columns = ['typos', 'avSyllables', 'nPunct']
# NOTE(review): the ':20' size suffix on an int32 type looks unusual --
# confirm against the Monary type documentation.
numpy_arrays = mon.query('hp',
                         'comStats',
                         {},
                         columns,
                         ['int32', 'int32', 'int32:20'])

# One row per document, one column per requested field.
df = numpy.matrix(numpy_arrays).transpose()
df = pandas.DataFrame(df, columns=columns)
print('starting to write file pandasTest.csv')
df.to_csv('pandasTest.csv', sep='\t')
Пример #13
0
    def _plot_validation_scores_for_source_and_fold(self, ax, source_id, fold,
                                                    show_axes_labels,
                                                    show_scales):
        """Plot every validation metric for one source and fold on ``ax``.

        The metrics named in ``self.validation_metric_names`` are read from
        the ``validation_scores`` collection, indexed by iteration, sorted,
        and downsampled with ``self._downsample``. The first metric is drawn
        on ``ax``; each further metric gets its own ``ax.twinx()`` axis.

        :param ax: axes to draw on.
        :param source_id: ``source_id`` value to filter the query by.
        :param fold: ``fold`` value to filter the query by.
        :param show_axes_labels: label each y-axis (only when
            ``show_scales`` is also true).
        :param show_scales: show right-hand tick scales, offsetting each
            extra axis; otherwise hide tick labels and the right spine.
        :return: list with the line object plotted for each metric.

        Also stores the last iteration of the raw query result in
        ``self._last_iteration_processed['validation']``.
        """
        metric_names = self.validation_metric_names

        fields = ['iteration']
        fields.extend('scores.' + name for name in metric_names)
        selector = {
            'experiment_id': self.experiment_id,
            'source_id': source_id,
            'fold': fold
        }
        monary = Monary(host=self.mongo_host)
        result = monary.query(
            db=self.mongo_db,
            coll='validation_scores',
            query=selector,
            fields=fields,
            types=['int32'] + ['float32'] * len(metric_names)
        )

        # result[0] is the iteration column; metric columns follow in order.
        index = result[0]
        data = {name: result[position]
                for position, name in enumerate(metric_names, start=1)}
        df = self._downsample(pd.DataFrame(data, index=index).sort_index())

        # Create multiple independent axes.  Adapted from Joe Kington's answer:
        # http://stackoverflow.com/a/7734614

        # Colours
        colors = get_colors(len(metric_names))

        # Twin the x-axis to make independent y-axes.
        axes = [ax] + [ax.twinx() for _ in metric_names[1:]]

        SEP = 0.2
        if not show_scales:
            for axis in axes:
                axis.tick_params(labelright=False, labelleft=False)
                axis.yaxis.set_ticks_position('none')
                axis.spines['right'].set_visible(False)
        else:
            for position, axis in enumerate(axes):
                axis.yaxis.tick_right()
                if position == 0:
                    continue
                # To make the border of the right-most axis visible,
                # we need to turn the frame on. This hides the other plots,
                # however, so we need to turn its fill off.
                axis.set_frame_on(True)
                axis.patch.set_visible(False)
                # Move the last y-axes spines over to the right.
                offset = ('axes', 1 + (SEP * position))
                axis.spines['right'].set_position(offset)

        for axis in axes:
            for side in ('top', 'left', 'bottom'):
                axis.spines[side].set_visible(False)
            axis.xaxis.set_ticks_position('none')

        lines = []
        plot_specs = zip(axes, metric_names, colors)
        for position, (axis, metric_name, color) in enumerate(plot_specs):
            axis.tick_params(axis='y', colors=color, direction='out')
            # Shorten the metric name and put one word per line.
            label = (metric_name
                     .replace("regression.", "")
                     .replace("classification_", "")
                     .replace("_", " ")
                     .replace(".", " ")
                     .replace(" ", "\n"))
            (line,) = axis.plot(
                df.index, df[metric_name].values, color=color, label=label)
            if show_axes_labels and show_scales:
                axis.set_ylabel(
                    label, color=color, rotation=0, fontsize=8, va='bottom')
                x_coord = 1.05 if position == 0 else 1.05 + (SEP * position)
                axis.yaxis.set_label_coords(x_coord, 1.1)
            lines.append(line)

        self._last_iteration_processed['validation'] = index[-1]
        return lines
Пример #14
0
from cache import environ
from db import mongo_ohlc, monary_bar
from monary import Monary

# from pymongo import MongoClient
# from datetime import time, timedelta

# Console display settings for pandas, applied in this order at import time.
for _option_name, _option_value in (
        ('display.notebook_repr_html', False),
        ('display.width', 100),
        ('display.max_columns', 6),
        ('display.max_rows', 20),
        ('display.float_format', '{:,g}'.format),
):
    pd.set_option(_option_name, _option_value)

if __name__ == '__main__':
    # 20-30sec for ETF, 5-10sec for FX (BTFX)

    # Hard-wired switch: the Monary bar path is currently disabled.
    use_monary_bar = False

    if not use_monary_bar:
        env = environ.get_env('FX_1Min')
        res, dt = mongo_ohlc(env, last_periods=100, mode='bbo', processes=20,
                             type_list=['bid'])
        # Alternative invocation kept for reference:
        #   res, dt = mongo_ohlc(env, last_periods = 100, mode = 'bar',
        #                        processes = 20, type_list = ['TRADE'],
        #                        type_field = 'event')
    else:
        client = Monary()
        # collection = client.flow['bar']# if mode == 'bbo' else client.flow
        dt = {}
        monary_bar(dt, client, 'SPY', 'TRADE', step_min=1)
        client.close()
Пример #15
0
    def get_client(self,
                   database_name=None,
                   uri=None,
                   monary=False,
                   host=None,
                   autoreconnect=False,
                   **kwargs):
        """Get a Mongo client (pymongo ``MongoClient`` or ``Monary``).

        If you provide a mongodb connection string uri, we will insert user & password into it,
        otherwise one will be built from the configuration settings.
        If database_name=None, will connect to the default database of the uri. database=something
        overrides even the uri's specification of a database.
        host is special magic for split_hosts
        kwargs will be passed to pymongo.mongoclient/Monary

        :param database_name: database to put in the URI (see above).
        :param uri: optional passwordless connection string. If
            ``parse_passwordless_uri`` cannot parse it (returns ``None``),
            it is used unchanged.
        :param monary: if true, return a ``Monary`` client; otherwise a
            pymongo client that has been pinged once.
        :param host: overrides the host parsed from ``uri``.
        :param autoreconnect: wrap the pymongo client in ``MongoProxy``
            (ignored when ``monary`` is true).
        :return: the connected client object.
        """
        # Format of URI we should eventually send to mongo
        full_uri_format = 'mongodb://{user}:{password}@{host}:{port}/{database}'

        if uri is None:
            # We must construct the entire URI from the settings
            uri = full_uri_format.format(database=database_name, **self.config)
        else:
            # A URI was given. We expect it to NOT include user and password:
            result = parse_passwordless_uri(uri)
            if result is not None:
                # Unpack only after the None check: unpacking first raised
                # TypeError and made the fallback below unreachable.
                _host, port, _database_name = result
                if not host:
                    host = _host
                if database_name is None:
                    database_name = _database_name
                uri = full_uri_format.format(database=database_name,
                                             host=host,
                                             port=port,
                                             user=self.config['user'],
                                             password=self.config['password'])
            else:
                # Some other URI was provided. Just try it and hope for the best
                pass

        if monary:
            # Be careful enabling this debug log statement, it's useful but prints the password in the uri
            # self.log.debug("Connecting to Mongo via monary using uri %s" % uri)
            # serverselection option makes the C driver retry if it can't connect;
            # since we often make new monary connections this is useful to protect against brief network hiccups.
            # NOTE(review): this assumes uri has no query string yet; a
            # caller-supplied uri that already contains '?' would end up
            # with two of them.
            client = Monary(
                uri +
                '?serverSelectionTryOnce=false&serverSelectionTimeoutMS=60000',
                **kwargs)
            self.log.debug("Succesfully connected via monary (probably...)")
            return client

        else:
            # Be careful enabling this debug log statement, it's useful but prints the password in the uri
            # self.log.debug("Connecting to Mongo using uri %s" % uri)
            client = pymongo.MongoClient(uri, **kwargs)
            client.admin.command(
                'ping')  # raises pymongo.errors.ConnectionFailure on failure
            self.log.debug("Successfully pinged client")

            if autoreconnect:
                # Wrap the client in a magic object that retries autoreconnect exceptions
                client = MongoProxy(client,
                                    disconnect_on_timeout=False,
                                    wait_time=180)

            return client
from monary import Monary
import numpy as np
import pandas as pd
import time

# Load five ``properties.*`` fields of one togethermap collection into a
# pandas DataFrame through Monary, then print the elapsed time and summary
# statistics.
mon = Monary()

columns = [
    'properties.total_residential_units',
    'properties.total_job_spaces',
    'properties.parcel_id',
    'properties.max_dua',
    'properties.max_far'
]

t1 = time.time()

numpy_arrays = mon.query(
    'togethermap',
    'places',
    {'collectionId': 'ZC7yyAyA8jkDFnRtf'},
    columns,
    ['float32']*len(columns)
)

# One row per document, one column per requested field.
df = np.matrix(numpy_arrays).transpose()
df = pd.DataFrame(df, columns=columns)

# Single-argument print calls work on both Python 2 and Python 3; the
# original print statements were a SyntaxError on Python 3.
print(time.time() - t1)
print(df.describe())
Пример #17
0
    def _plot_validation_scores_for_source_and_fold(self, ax, source_id, fold,
                                                    show_axes_labels,
                                                    show_scales):
        """Draw one line per validation metric for a given source and fold.

        Scores for ``self.validation_metric_names`` are fetched from the
        ``validation_scores`` collection, indexed by iteration, sorted and
        passed through ``self._downsample``. The first metric uses ``ax``
        itself and every additional metric is drawn on a new ``ax.twinx()``.

        :param ax: axes to draw on.
        :param source_id: ``source_id`` value used in the query filter.
        :param fold: ``fold`` value used in the query filter.
        :param show_axes_labels: add y-axis labels (requires
            ``show_scales`` to be true as well).
        :param show_scales: show tick scales on the right, one offset spine
            per extra axis; when false, tick labels and the right spine are
            hidden instead.
        :return: list of the line objects created, one per metric.

        Side effect: ``self._last_iteration_processed['validation']`` is set
        to the last iteration value of the raw query result.
        """
        names = self.validation_metric_names
        n = len(names)

        fields = ['iteration'] + ['scores.' + name for name in names]
        types = ['int32'] + ['float32'] * n
        query = {
            'experiment_id': self.experiment_id,
            'source_id': source_id,
            'fold': fold
        }
        monary = Monary(host=self.mongo_host)
        result = monary.query(db=self.mongo_db,
                              coll='validation_scores',
                              query=query,
                              fields=fields,
                              types=types)

        # Column 0 is the iteration; metric columns follow in request order.
        index = result[0]
        data = {}
        for i, metric_name in enumerate(names):
            data[metric_name] = result[i + 1]
        df = pd.DataFrame(data, index=index)
        df = df.sort_index()
        df = self._downsample(df)

        # Create multiple independent axes.  Adapted from Joe Kington's answer:
        # http://stackoverflow.com/a/7734614

        # Colours
        colors = get_colors(n)

        # Twin the x-axis to make independent y-axes.
        axes = [ax]
        axes.extend(ax.twinx() for _ in names[1:])

        SEP = 0.2
        if show_scales:
            for i, axis in enumerate(axes):
                axis.yaxis.tick_right()
                if i == 0:
                    continue
                # To make the border of the right-most axis visible,
                # we need to turn the frame on. This hides the other plots,
                # however, so we need to turn its fill off.
                axis.set_frame_on(True)
                axis.patch.set_visible(False)
                # Move the last y-axes spines over to the right.
                axis.spines['right'].set_position(('axes', 1 + (SEP * i)))
        else:
            for axis in axes:
                axis.tick_params(labelright=False, labelleft=False)
                axis.yaxis.set_ticks_position('none')
                axis.spines['right'].set_visible(False)

        for axis in axes:
            for spine in ('top', 'left', 'bottom'):
                axis.spines[spine].set_visible(False)
            axis.xaxis.set_ticks_position('none')

        # Applied in order: strip prefixes, then turn separators into
        # line breaks so each word of the label sits on its own line.
        label_rewrites = (
            ("regression.", ""),
            ("classification_", ""),
            ("_", " "),
            (".", " "),
            (" ", "\n"),
        )
        with_labels = show_axes_labels and show_scales

        lines = []
        for i, (axis, metric_name, color) in enumerate(
                zip(axes, names, colors)):
            axis.tick_params(axis='y', colors=color, direction='out')
            label = metric_name
            for old, new in label_rewrites:
                label = label.replace(old, new)
            [line] = axis.plot(df.index,
                               df[metric_name].values,
                               color=color,
                               label=label)
            if with_labels:
                axis.set_ylabel(label,
                                color=color,
                                rotation=0,
                                fontsize=8,
                                va='bottom')
                if i == 0:
                    label_x = 1.05
                else:
                    label_x = 1.05 + (SEP * i)
                axis.yaxis.set_label_coords(label_x, 1.1)
            lines.append(line)

        self._last_iteration_processed['validation'] = index[-1]
        return lines
Пример #18
0
from monary import Monary
# Sum the population of every state in ``zips.data`` and format the totals
# as "<state>: <population>" strings.
m = Monary()
pipeline = [{"$group": {"_id": "$state", "totPop": {"$sum": "$pop"}}}]
# The requested field must match the pipeline's output name exactly:
# MongoDB field names are case-sensitive, so "totpop" would not select the
# "totPop" sums produced by the $group stage.
states, population = m.aggregate("zips", "data", pipeline,
                                 ["_id", "totPop"], ["string:2", "int64"])
strs = [raw.decode("utf-8") for raw in states]
result = ["%s: %d" % (state, pop) for (state, pop) in zip(strs, population)]
Пример #19
0
    def run(self):
        """Stream the collection through Monary in blocks and write each one.

        Field definitions come from ``self.get_collection_source_columns``.
        Each block returned by ``block_query`` has its masked values filled,
        is turned into a DataFrame, passed through
        ``self.process_dataframe`` and written with ``self.output().write``.
        When all blocks are done, ``self.mongo_target.touch()`` is called.

        :raises Exception: if a field type is not a string, bool, int or
            float type.
        """
        count = 0

        host = config.get('mongo', 'host')
        db = config.get('mongo', 'database')

        def _fill_field(field_arr, field_type):
            # Replace masked entries with a type-appropriate blank value.
            # ``str`` and ``np.nan`` are used instead of the ``np.str`` and
            # ``np.NaN`` aliases, which no longer exist in current NumPy;
            # they are the same objects the aliases pointed to.
            if field_type.startswith('string'):
                field_arr = field_arr.astype(str).filled('')
            elif field_type == 'bool':
                field_arr = field_arr.astype(str).filled(None)
            elif field_type.startswith('int'):
                field_arr = field_arr.filled(0)
            elif field_type.startswith('float'):
                field_arr = field_arr.filled(np.nan)
            else:
                raise Exception('Unknown field type %s' % field_type)

            return field_arr

        with Monary(host) as m:

            log.info("Querying Monary")

            # Get field definitions for default collection
            query_fields, df_cols, field_types = zip(
                *self.get_collection_source_columns(self.collection_name))

            catalogue_blocks = m.block_query(db,
                                             self.collection_name,
                                             self.query,
                                             query_fields,
                                             field_types,
                                             block_size=self.block_size)

            log.info("Processing Monary data")

            for catalogue_block in catalogue_blocks:

                # Bit of a hack: fill fields with a blank value (depending on type)
                # So the masked value doesn't get used.  As the masked is shared between
                # each block, if a field is empty it is getting populated by previous values
                catalogue_block = [
                    _fill_field(arr, field_types[i])
                    for i, arr in enumerate(catalogue_block)
                ]

                # Create a pandas data frame with block of records
                # Columns use the name from the output columns - but must be in the same order as query_fields
                # Which is why we're using tuples for the columns
                df = pd.DataFrame(np.matrix(catalogue_block).transpose(),
                                  columns=df_cols)

                # Loop through all the columns and ensure hidden integer fields are cast as int32
                # For example, taxonomy_irn is used to join with taxonomy df
                for i, df_col in enumerate(df_cols):
                    if field_types[i].startswith('int'):
                        df[df_col] = df[df_col].astype(field_types[i])

                df = self.process_dataframe(m, df)

                # Output the dataframe
                self.output().write(df)

                row_count, col_count = df.shape
                count += row_count
                log.info("\t %s records", count)

        # After running, update mongo
        self.mongo_target.touch()
Пример #20
0
from monary import Monary
import numpy

# Read the "n-grams.text" field of every tweet through Monary and print the
# resulting array.
with Monary("127.0.0.1") as monary:
    arrays = monary.query(
        "HealthCare_Twitter_Analysis",  # database name
        "tweets",  # collection name
        {},  # query spec
        ["n-grams.text"],  # field names (in Mongo record)
        # NOTE(review): a field called "text" is requested as float64 --
        # confirm this is the intended Monary type.
        ["float64"]  # Monary field types
    )

# A single-argument print call works on both Python 2 and Python 3; the
# original print statement was a SyntaxError on Python 3.
print(arrays[0])
Пример #21
0
from monary import Monary
import numpy as np
import pandas as pd
import time

# Query five ``properties.*`` fields of one togethermap collection through
# Monary, build a pandas DataFrame from them and report the elapsed time and
# summary statistics.
mon = Monary()

columns = [
    'properties.total_residential_units', 'properties.total_job_spaces',
    'properties.parcel_id', 'properties.max_dua', 'properties.max_far'
]

t1 = time.time()

numpy_arrays = mon.query('togethermap', 'places',
                         {'collectionId': 'ZC7yyAyA8jkDFnRtf'}, columns,
                         ['float32'] * len(columns))

# One row per document, one column per requested field.
df = np.matrix(numpy_arrays).transpose()
df = pd.DataFrame(df, columns=columns)

# Single-argument print calls work on both Python 2 and Python 3; the
# original print statements were a SyntaxError on Python 3.
print(time.time() - t1)
print(df.describe())
Пример #22
0
__author__ = 'jflaisha'

from monary import Monary, MonaryParam
import numpy as np

# Module-level Monary client, created at import time with database="sam".
client = Monary(database="sam")  # defaults to localhost:27017


def extract_arrays(fortran_output):
    """
    Split the output of a SAM run into a list with one entry per HUC.

    The Fortran output is described as a 2-dimensional array whose y-axis
    holds one row per HUC; each row starts with the HUC_ID and continues with
    a time-series of output values. Iterating the input yields those rows.

    :param fortran_output: numpy.ndarray (any iterable is accepted)
    :return: list containing each item produced by iterating ``fortran_output``, in order
    """
    return list(fortran_output)


class SamMonary(object):
    def __init__(self, jid, huc_output_array, day_array, huc_id):
        """
        Class represents each HUC worth of output data from SuperPRZM run.  The class methods take the numpy array
        SuperPRZM output data and convert them to MonaryParams to be inserted into MongoDB using Monary.
        :param jid: string, job ID for SAM/SuperPRZM run
        :param huc_output_array: numpy array, SuperPRZM output data for one HUC
        :param day_array: numpy array, sequence of "Julian Days" of simulation date range
        :param huc_id: string, HUC12 ID (12 digits)