def connect_to_monary_and_email_operator(ds, **kwargs):
    """Print the summed ``pop`` per ``state`` for the ``data`` collection of ``zips``.

    Runs a ``$group`` aggregation through Monary, decodes the returned state
    codes from UTF-8 and prints a list of ``"<state>: <population>"`` strings.

    :param ds: accepted but not used by this function
    :param kwargs: accepted but not used by this function
    """
    client = Monary()
    group_stage = {"$group": {"_id": "$state", "totPop": {"$sum": "$pop"}}}
    raw_states, totals = client.aggregate(
        "zips", "data", [group_stage],
        ["_id", "totPop"], ["string:2", "int64"])

    # The "string:2" column is decoded to text before formatting.
    state_names = [raw.decode("utf-8") for raw in raw_states]

    summary = []
    for name, total in zip(state_names, totals):
        summary.append("%s: %d" % (name, total))
    print(summary)
def do_insert():
    """Insert five columns of random floats with Monary, timing the insert.

    Each column ``x<i>`` holds ``NUM_BATCHES * BATCH_SIZE`` samples drawn
    uniformly from ``[0, i + 1)``. Only the ``insert`` call itself runs inside
    the ``profile("monary insert")`` context.
    """
    client = Monary()
    total_docs = NUM_BATCHES * BATCH_SIZE

    columns = []
    for idx in range(5):
        values = nprand.uniform(0, idx + 1, total_docs)
        # An all-zero mask means no value is masked out.
        unmasked = ma.masked_array(values, np.zeros(total_docs))
        columns.append(MonaryParam(unmasked, "x%d" % idx))

    concern = WriteConcern(w=MONARY_W_DEFAULT)
    with profile("monary insert"):
        client.insert("monary_test", "collection", columns,
                      write_concern=concern)
def do_insert():
    """Time a single Monary bulk insert of five random float columns.

    The columns are named ``x0`` .. ``x4``; column ``x<i>`` is sampled
    uniformly from ``[0, i + 1)`` and has ``NUM_BATCHES * BATCH_SIZE`` entries.
    Data generation happens before the profiled region, so only the insert is
    measured.
    """
    m = Monary()
    doc_total = NUM_BATCHES * BATCH_SIZE

    def random_param(column_no):
        # Build one fully unmasked column (the mask is all zeros).
        samples = nprand.uniform(0, column_no + 1, doc_total)
        return MonaryParam(
            ma.masked_array(samples, np.zeros(doc_total)),
            "x%d" % column_no)

    params = list(map(random_param, range(5)))
    write_concern = WriteConcern(w=MONARY_W_DEFAULT)
    with profile("monary insert"):
        m.insert("monary_test", "collection", params,
                 write_concern=write_concern)
def monary_load(start=0, stop=-1, find_args=None, species_to_retrieve=None):
    """Load metadata and species columns from MongoDB into a design matrix.

    Queries the ``creeval`` database (module-level ``collection``) through
    Monary for the ``num_metadata`` and ``cat_metadata`` columns plus the
    requested species, keeping only documents where at least one requested
    species is ``> 0``. Masked metadata values are replaced by the column
    mean, the features are standardised, and the fitted scaler is pickled to
    ``<collection>_scaler.pkl``. Targets are binarised (``> 0`` becomes 1).

    :param start: offset of the first document to read
    :param stop: end offset; ``stop - start`` is passed to Monary as ``limit``
        NOTE(review): with the defaults the limit is -1; confirm Monary
        treats that as "no limit".
    :param find_args: extra MongoDB query clauses; copied, never mutated
    :param species_to_retrieve: species names to load; names not present in
        the module-level ``species`` are dropped. Empty or ``None`` means all
        of ``species``.
    :return: ``DenseDesignMatrix`` with standardised features ``X`` and
        binary targets ``y``
    """
    # Work on a copy: the original mutated a shared ``{}`` default (and the
    # caller's dict), so "$or" clauses leaked between calls.
    find_args = dict(find_args) if find_args else {}

    if not species_to_retrieve:
        species_to_retrieve = species
    else:
        species_to_retrieve = [s for s in species_to_retrieve if s in species]

    # Going through a dict de-duplicates repeated species names.
    query = {s: {"$gt": 0} for s in species_to_retrieve}
    find_args["$or"] = [{name: clause} for name, clause in query.items()]

    n_meta = len(num_metadata) + len(cat_metadata)
    fields = num_metadata + cat_metadata + species_to_retrieve

    with Monary("127.0.0.1") as monary:
        out = monary.query(
            "creeval", collection, find_args,
            fields, ["float32"] * len(fields),
            limit=(stop - start), offset=start)

    # Replace masked metadata entries with the column mean.
    for i, col in enumerate(out[0:n_meta]):
        out[i] = np.ma.filled(col, np.ma.mean(col))

    out = np.ma.row_stack(out).T
    X = out[:, 0:n_meta]
    y = out[:, n_meta:]
    y = (y > 0).astype(int)

    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    # Context manager so the pickle file is flushed and closed deterministically.
    with open(collection + "_scaler.pkl", "wb") as scaler_file:
        pickle.dump(scaler, scaler_file)

    y = np.asarray(y)
    return DenseDesignMatrix(X=X, y=y)
def _get_validation_mse(self):
    """Fetch the validation mean-squared-error history for each fold.

    For each of the two validation folds, queries the ``validation_scores``
    collection for this experiment and builds a DataFrame with ``loss`` and
    ``source_id`` columns, indexed by iteration and sorted by that index.

    :return: dict mapping fold name to its DataFrame
    """
    client = Monary(host=self.mongo_host)
    wanted_fields = [
        'iteration', 'scores.regression.mean_squared_error', 'source_id']
    wanted_types = ['int32', 'float32', 'int8']

    mse_by_fold = {}
    for fold_name in ('unseen_appliances',
                      'unseen_activations_of_seen_appliances'):
        iteration_col, mse_col, source_col = client.query(
            db=self.mongo_db,
            coll='validation_scores',
            query={'experiment_id': self.experiment_id, 'fold': fold_name},
            fields=wanted_fields,
            types=wanted_types)
        frame = pd.DataFrame(
            {'loss': mse_col, 'source_id': source_col}, index=iteration_col)
        mse_by_fold[fold_name] = frame.sort_index()
    return mse_by_fold
def _get_train_costs(self):
    """Return this experiment's training loss history as a DataFrame.

    Reads ``iteration``, ``loss`` and ``source_id`` from the ``train_scores``
    collection via Monary.

    :return: DataFrame with ``loss`` and ``source_id`` columns, indexed by
        iteration and sorted by that index
    """
    client = Monary(host=self.mongo_host)
    column_names = ['iteration', 'loss', 'source_id']
    column_types = ['int32', 'float32', 'int8']

    iteration_col, loss_col, source_col = client.query(
        db=self.mongo_db,
        coll='train_scores',
        query={'experiment_id': self.experiment_id},
        fields=column_names,
        types=column_types)

    frame = pd.DataFrame(
        {'loss': loss_col, 'source_id': source_col}, index=iteration_col)
    return frame.sort_index()
def _get_train_costs(self):
    """Load the training scores of the current experiment.

    Queries the ``train_scores`` collection for every document belonging to
    ``self.experiment_id``.

    :return: DataFrame of ``loss`` and ``source_id``, indexed by iteration in
        ascending index order
    """
    client = Monary(host=self.mongo_host)
    request = dict(
        db=self.mongo_db,
        coll='train_scores',
        query={'experiment_id': self.experiment_id},
        fields=['iteration', 'loss', 'source_id'],
        types=['int32', 'float32', 'int8'],
    )
    iteration, loss, source = client.query(**request)

    by_iteration = pd.DataFrame(
        {'loss': loss, 'source_id': source}, index=iteration)
    by_iteration = by_iteration.sort_index()
    return by_iteration
def load_from_db(db, collection, field_names, field_types, db_host,
                 query_spec=None):
    """Query a MongoDB collection through Monary and return column arrays.

    The connection authenticates against the ``admin`` database and is closed
    when the query finishes.

    :param db: database name
    :param collection: collection name
    :param field_names: field names (as in the Mongo documents) to fetch
    :param field_types: Monary type string for each entry of ``field_names``
    :param db_host: mapping with ``"hostname"``, ``"username"`` and
        ``"password"`` keys
    :param query_spec: MongoDB query document; ``None`` (default) matches all
    :return: tuple ``(field_names, arrays)``, where ``arrays`` is the result
        of ``Monary.query``
    """
    # ``None`` sentinel instead of a shared mutable ``{}`` default.
    if query_spec is None:
        query_spec = {}

    with Monary(host=db_host["hostname"],
                username=db_host["username"],
                password=db_host["password"],
                database="admin") as monary:
        arrays = monary.query(
            db,           # database name
            collection,   # collection name
            query_spec,   # query spec
            field_names,  # field names (in Mongo record)
            field_types,  # Monary field types
        )
    return field_names, arrays
def do_monary_query():
    """Profile one full Monary query of fields ``x1``..``x5``.

    Reads every document of ``monary_test.collection`` as five ``float64``
    columns inside a ``profile("monary query")`` context, then prints the
    per-column mean to show the data was actually read.
    """
    field_names = ["x%d" % k for k in range(1, 6)]
    field_types = ["float64"] * len(field_names)

    with Monary("127.0.0.1") as client, profile("monary query"):
        columns = client.query(
            "monary_test", "collection", {}, field_names, field_types)

    # prove that we did something...
    print(numpy.mean(columns, axis=-1))
def do_monary_block_query():
    """Profile a blocked Monary query over fields ``x1``..``x5``.

    Iterates ``monary_test.collection`` with ``block_query`` (``block_size``
    of 32 * 1024), accumulating the number of items visited and the
    per-column sums, then prints the item count and the per-column means.
    """
    count = 0
    sums = numpy.zeros((5, ))
    with Monary("127.0.0.1") as m:
        with profile("monary block query"):
            for arrays in m.block_query(
                    "monary_test",                   # database name
                    "collection",                    # collection name
                    {},                              # query spec
                    ["x1", "x2", "x3", "x4", "x5"],  # field names
                    ["float64"] * 5,                 # field types
                    block_size=32 * 1024,
            ):
                count += len(arrays[0])
                sums += [numpy.sum(arr) for arr in arrays]

    # Single-argument print(...) behaves the same on Python 2 and 3; the
    # original print statements were a SyntaxError on Python 3.
    print("visited %i items" % count)
    print(sums / count)  # prove that we did something...
# Script to save a CSV with comment data.
#
# Reads three columns of the ``hp.comStats`` collection through Monary and
# writes them, tab-separated, to ``pandasTest.csv``.
import pymongo
import urllib3
from pymongo import MongoClient
from lxml import html

import numpy
import pandas
from monary import Monary

from hpfunctions import getUrl, stripWhite, stripWhiteList

client = MongoClient()
db = client['hp']
mon = Monary()

# Single-argument print(...) works on both Python 2 and Python 3; the
# original print statements were a SyntaxError on Python 3.
print('available columns')
for i in db.comStats.find()[0]:
    print(i)

columns = ['typos', 'avSyllables', 'nPunct']
# NOTE(review): 'int32:20' carries a size suffix on a non-string type;
# confirm this is intended.
numpy_arrays = mon.query(
    'hp', 'comStats', {}, columns, ['int32', 'int32', 'int32:20'])

# One row per document, one column per requested field.
df = numpy.matrix(numpy_arrays).transpose()
df = pandas.DataFrame(df, columns=columns)

print('starting to write file pandasTest.csv')
df.to_csv('pandasTest.csv', sep='\t')
def _plot_validation_scores_for_source_and_fold(self, ax, source_id, fold,
                                                show_axes_labels, show_scales):
    """Plot every validation metric for one source and fold on ``ax``.

    Loads the scores of this experiment from the ``validation_scores``
    collection, downsamples them with ``self._downsample`` and draws one line
    per metric, each on its own y-axis (``ax`` plus one ``twinx`` axis per
    additional metric). Also records the last loaded iteration in
    ``self._last_iteration_processed['validation']``.

    :param ax: matplotlib axes to draw on
    :param source_id: value matched against the ``source_id`` field
    :param fold: value matched against the ``fold`` field
    :param show_axes_labels: label each y-axis (only when ``show_scales``)
    :param show_scales: show y tick scales on the right-hand side
    :return: list of the plotted line objects, one per metric
    """
    metric_names = self.validation_metric_names
    score_fields = ['scores.' + name for name in metric_names]

    client = Monary(host=self.mongo_host)
    columns = client.query(
        db=self.mongo_db,
        coll='validation_scores',
        query={
            'experiment_id': self.experiment_id,
            'source_id': source_id,
            'fold': fold,
        },
        fields=['iteration'] + score_fields,
        types=['int32'] + ['float32'] * len(metric_names))

    iterations = columns[0]
    per_metric = {}
    for position, name in enumerate(metric_names, start=1):
        per_metric[name] = columns[position]
    frame = pd.DataFrame(per_metric, index=iterations).sort_index()
    frame = self._downsample(frame)

    # Create multiple independent axes. Adapted from Joe Kington's answer:
    # http://stackoverflow.com/a/7734614
    palette = get_colors(len(metric_names))

    # Twin the x-axis so every metric gets an independent y-axis.
    all_axes = [ax] + [ax.twinx() for _ in metric_names[1:]]

    SEP = 0.2
    if not show_scales:
        for axis in all_axes:
            axis.tick_params(labelright=False, labelleft=False)
            axis.yaxis.set_ticks_position('none')
            axis.spines['right'].set_visible(False)
    else:
        for position, axis in enumerate(all_axes):
            axis.yaxis.tick_right()
            if position == 0:
                continue
            # To make the border of the right-most axis visible, the frame
            # has to be on. That hides the other plots, so its fill is
            # switched off.
            axis.set_frame_on(True)
            axis.patch.set_visible(False)
            # Shift each extra y-axis spine further to the right.
            axis.spines['right'].set_position(
                ('axes', 1 + (SEP * position)))

    for axis in all_axes:
        for side in ('top', 'left', 'bottom'):
            axis.spines[side].set_visible(False)
        axis.xaxis.set_ticks_position('none')

    label_replacements = (
        ("regression.", ""),
        ("classification_", ""),
        ("_", " "),
        (".", " "),
        (" ", "\n"),
    )
    with_labels = show_axes_labels and show_scales

    plotted = []
    for position, (axis, name, color) in enumerate(
            zip(all_axes, metric_names, palette)):
        axis.tick_params(axis='y', colors=color, direction='out')

        text = name
        for old, new in label_replacements:
            text = text.replace(old, new)

        (handle,) = axis.plot(
            frame.index, frame[name].values, color=color, label=text)
        if with_labels:
            axis.set_ylabel(
                text, color=color, rotation=0, fontsize=8, va='bottom')
            # For the first axis the offset term is zero, i.e. (1.05, 1.1).
            axis.yaxis.set_label_coords(1.05 + (SEP * position), 1.1)
        plotted.append(handle)

    self._last_iteration_processed['validation'] = iterations[-1]
    return plotted
from cache import environ
from db import mongo_ohlc, monary_bar
from monary import Monary

# from pymongo import MongoClient
# from datetime import time, timedelta

# Console display settings for pandas output.
pd.options.display.notebook_repr_html = False
pd.options.display.width = 100
pd.options.display.max_columns = 6
pd.options.display.max_rows = 20
pd.options.display.float_format = '{:,g}'.format

if __name__ == '__main__':
    # 20-30sec for ETF, 5-10sec for FX (BTFX)
    if False:
        # Disabled path: load 1-minute SPY trade bars through Monary.
        client = Monary()
        # collection = client.flow['bar']# if mode == 'bbo' else client.flow
        dt = {}
        monary_bar(dt, client, 'SPY', 'TRADE', step_min=1)
        client.close()
    else:
        # Active path: OHLC of the last 100 periods from bid quotes.
        env = environ.get_env('FX_1Min')
        res, dt = mongo_ohlc(
            env,
            last_periods=100,
            mode='bbo',
            processes=20,
            type_list=['bid'])
        # Alternative invocation for trade bars:
        # res, dt = mongo_ohlc(env, last_periods = 100, mode = 'bar', processes = 20,
        #                      type_list = ['TRADE'], type_field = 'event')
def get_client(self, database_name=None, uri=None, monary=False, host=None,
               autoreconnect=False, **kwargs):
    """Get a Mongo client (pymongo ``MongoClient`` or ``Monary``).

    If you provide a mongodb connection string ``uri``, the user and password
    from the configuration are inserted into it; otherwise a URI is built
    entirely from the configuration settings.

    :param database_name: database to put in the URI. If ``None`` and a
        passwordless ``uri`` is given, the database named in that uri is
        used. An explicit value overrides even the uri's database.
    :param uri: mongodb connection string, expected NOT to include user and
        password. If it cannot be parsed as such it is used unchanged.
    :param monary: if true, return a ``Monary`` client instead of a pymongo one
    :param host: overrides the host parsed from ``uri`` (special magic for
        split_hosts)
    :param autoreconnect: wrap the pymongo client in a ``MongoProxy`` that
        retries autoreconnect exceptions (ignored when ``monary`` is true)
    :param kwargs: passed on to ``pymongo.MongoClient`` / ``Monary``
    :return: the connected client
    :raises pymongo.errors.ConnectionFailure: if the pymongo ping fails
    """
    # Format of URI we should eventually send to mongo
    full_uri_format = 'mongodb://{user}:{password}@{host}:{port}/{database}'

    if uri is None:
        # We must construct the entire URI from the settings
        uri = full_uri_format.format(database=database_name, **self.config)
    else:
        # A URI was given. We expect it to NOT include user and password:
        result = parse_passwordless_uri(uri)
        if result is not None:
            # Unpack only after the None check. The original unpacked first,
            # so an unparseable URI raised TypeError and never reached the
            # "just try it" fallback.
            _host, port, _database_name = result
            if not host:
                host = _host
            if database_name is None:
                database_name = _database_name
            uri = full_uri_format.format(database=database_name,
                                         host=host,
                                         port=port,
                                         user=self.config['user'],
                                         password=self.config['password'])
        # Otherwise some other URI was provided: try it unchanged and hope
        # for the best.

    if monary:
        # Be careful enabling this debug log statement, it's useful but prints the password in the uri
        # self.log.debug("Connecting to Mongo via monary using uri %s" % uri)
        # The serverSelection options make the C driver retry if it can't
        # connect; since we often make new monary connections this protects
        # against brief network hiccups.
        client = Monary(
            uri + '?serverSelectionTryOnce=false&serverSelectionTimeoutMS=60000',
            **kwargs)
        self.log.debug("Succesfully connected via monary (probably...)")
        return client

    # Be careful enabling this debug log statement, it's useful but prints the password in the uri
    # self.log.debug("Connecting to Mongo using uri %s" % uri)
    client = pymongo.MongoClient(uri, **kwargs)
    # raises pymongo.errors.ConnectionFailure on failure
    client.admin.command('ping')
    self.log.debug("Successfully pinged client")

    if autoreconnect:
        # Wrap the client in a magic object that retries autoreconnect exceptions
        client = MongoProxy(client, disconnect_on_timeout=False,
                            wait_time=180)
    return client
# Time a Monary query of parcel properties from ``togethermap.places`` and
# print summary statistics of the result.
import time

import numpy as np
import pandas as pd
from monary import Monary

mon = Monary()
columns = [
    'properties.total_residential_units',
    'properties.total_job_spaces',
    'properties.parcel_id',
    'properties.max_dua',
    'properties.max_far',
]

t1 = time.time()
numpy_arrays = mon.query(
    'togethermap',
    'places',
    {'collectionId': 'ZC7yyAyA8jkDFnRtf'},
    columns,
    ['float32'] * len(columns),
)
# One row per document, one column per requested field.
df = np.matrix(numpy_arrays).transpose()
df = pd.DataFrame(df, columns=columns)

# Single-argument print(...) works on both Python 2 and Python 3; the
# original print statements were a SyntaxError on Python 3.
print(time.time() - t1)
print(df.describe())
def _plot_validation_scores_for_source_and_fold(self, ax, source_id, fold,
                                                show_axes_labels, show_scales):
    """Draw the validation metrics of one (source, fold) pair onto ``ax``.

    Scores for ``self.experiment_id`` are read from the ``validation_scores``
    collection with Monary, indexed by iteration, sorted, and passed through
    ``self._downsample``. Each metric in ``self.validation_metric_names`` is
    plotted on its own y-axis. The last loaded iteration is stored in
    ``self._last_iteration_processed['validation']``.

    :param ax: matplotlib axes that hosts the first metric
    :param source_id: ``source_id`` to select
    :param fold: ``fold`` to select
    :param show_axes_labels: if true (and ``show_scales`` is true), add a
        y-label per metric
    :param show_scales: if true, show tick scales on the right of each axis
    :return: list with one plotted line per metric
    """
    names = self.validation_metric_names

    fields = ['iteration']
    fields.extend('scores.' + name for name in names)
    types = ['int32']
    types.extend('float32' for _ in names)

    monary = Monary(host=self.mongo_host)
    result = monary.query(
        db=self.mongo_db,
        coll='validation_scores',
        query={
            'experiment_id': self.experiment_id,
            'source_id': source_id,
            'fold': fold
        },
        fields=fields,
        types=types)

    index = result[0]
    # result[k + 1] holds the values for the k-th metric name.
    data = dict(zip(names, result[1:]))
    df = self._downsample(pd.DataFrame(data, index=index).sort_index())

    # Create multiple independent axes. Adapted from Joe Kington's answer:
    # http://stackoverflow.com/a/7734614

    # Colours
    colors = get_colors(len(names))

    # Twin the x-axis to make independent y-axes (one per extra metric).
    axes = [ax]
    axes.extend(ax.twinx() for _ in names[1:])

    SEP = 0.2

    def _style_with_scales():
        for i, axis in enumerate(axes):
            axis.yaxis.tick_right()
            if i != 0:
                # To make the border of the right-most axis visible,
                # we need to turn the frame on. This hides the other plots,
                # however, so we need to turn its fill off.
                axis.set_frame_on(True)
                axis.patch.set_visible(False)
                # Move the extra y-axis spines over to the right.
                axis.spines['right'].set_position(('axes', 1 + (SEP * i)))

    def _style_without_scales():
        for axis in axes:
            axis.tick_params(labelright=False, labelleft=False)
            axis.yaxis.set_ticks_position('none')
            axis.spines['right'].set_visible(False)

    def _pretty_label(metric_name):
        # Strip the metric-family prefixes, then put every word on its own line.
        label = metric_name.replace("regression.", "")
        label = label.replace("classification_", "")
        return (label.replace("_", " ")
                     .replace(".", " ")
                     .replace(" ", "\n"))

    if show_scales:
        _style_with_scales()
    else:
        _style_without_scales()

    for axis in axes:
        for spine in ['top', 'left', 'bottom']:
            axis.spines[spine].set_visible(False)
        axis.xaxis.set_ticks_position('none')

    lines = []
    for i, axis in enumerate(axes[:min(len(names), len(colors))]):
        metric_name = names[i]
        color = colors[i]
        axis.tick_params(axis='y', colors=color, direction='out')
        label = _pretty_label(metric_name)
        line, = axis.plot(df.index,
                          df[metric_name].values,
                          color=color,
                          label=label)
        if show_axes_labels and show_scales:
            axis.set_ylabel(label,
                            color=color,
                            rotation=0,
                            fontsize=8,
                            va='bottom')
            x_offset = 1.05 if i == 0 else 1.05 + (SEP * i)
            axis.yaxis.set_label_coords(x_offset, 1.1)
        lines.append(line)

    self._last_iteration_processed['validation'] = index[-1]
    return lines
# Total population per state from ``zips.data``, formatted as
# "<state>: <population>" strings in ``result``.
from monary import Monary

m = Monary()
pipeline = [{"$group": {"_id": "$state", "totPop": {"$sum": "$pop"}}}]
# The requested field must match the name produced by the $group stage
# exactly. MongoDB field names are case-sensitive, so the original
# "totpop" did not match "totPop".
states, population = m.aggregate(
    "zips", "data",
    pipeline,
    ["_id", "totPop"],
    ["string:2", "int64"])
# Decode the "string:2" state codes to text.
strs = [x.decode("utf-8") for x in states]
result = ["%s: %d" % (state, pop) for (state, pop) in zip(strs, population)]
def run(self):
    """Stream the source collection through Monary and write it out in blocks.

    For every block returned by ``block_query`` the masked arrays are filled
    with a type-dependent blank value, assembled into a DataFrame whose
    columns follow ``get_collection_source_columns``, passed through
    ``self.process_dataframe`` and written to ``self.output()``. When all
    blocks are done, ``self.mongo_target`` is touched.

    :raises Exception: if a column has a Monary type other than
        string*, bool, int* or float*
    """
    count = 0
    host = config.get('mongo', 'host')
    db = config.get('mongo', 'database')

    def _fill_field(field_arr, field_type):
        # Replace masked entries with a blank value suited to the type.
        # ``str`` / ``np.nan`` replace the aliases ``np.str`` / ``np.NaN``,
        # which have been removed from current NumPy releases.
        if field_type.startswith('string'):
            field_arr = field_arr.astype(str).filled('')
        elif field_type == 'bool':
            field_arr = field_arr.astype(str).filled(None)
        elif field_type.startswith('int'):
            field_arr = field_arr.filled(0)
        elif field_type.startswith('float'):
            field_arr = field_arr.filled(np.nan)
        else:
            raise Exception('Unknown field type %s' % field_type)
        return field_arr

    with Monary(host) as m:
        log.info("Querying Monary")

        # Get field definitions for default collection
        query_fields, df_cols, field_types = zip(
            *self.get_collection_source_columns(self.collection_name))

        catalogue_blocks = m.block_query(db,
                                         self.collection_name,
                                         self.query,
                                         query_fields,
                                         field_types,
                                         block_size=self.block_size)

        log.info("Processing Monary data")

        for catalogue_block in catalogue_blocks:
            # Bit of a hack: fill fields with a blank value (depending on
            # type) so the masked value doesn't get used. As the mask is
            # shared between blocks, an empty field would otherwise be
            # populated by values from a previous block.
            catalogue_block = [
                _fill_field(arr, field_type)
                for arr, field_type in zip(catalogue_block, field_types)
            ]

            # Create a pandas data frame with the block of records.
            # Columns use the output column names, which must stay in the
            # same order as query_fields (hence the tuples from zip above).
            df = pd.DataFrame(np.matrix(catalogue_block).transpose(),
                              columns=df_cols)

            # Ensure integer fields are cast back to their integer type.
            # For example, taxonomy_irn is used to join with taxonomy df.
            for df_col, field_type in zip(df_cols, field_types):
                if field_type.startswith('int'):
                    df[df_col] = df[df_col].astype(field_type)

            df = self.process_dataframe(m, df)

            # Output the dataframe
            self.output().write(df)

            count += df.shape[0]
            log.info("\t %s records", count)

    # After running, update mongo
    self.mongo_target.touch()
# Read the ``n-grams.text`` field of every document in
# ``HealthCare_Twitter_Analysis.tweets`` and print the resulting column.
from monary import Monary
import numpy

with Monary("127.0.0.1") as monary:
    # NOTE(review): "n-grams.text" is requested as float64; confirm the
    # field really is numeric.
    arrays = monary.query(
        "HealthCare_Twitter_Analysis",  # database name
        "tweets",                       # collection name
        {},                             # query spec
        ["n-grams.text"],               # field names (in Mongo record)
        ["float64"],                    # Monary field types
    )

# Single-argument print(...) works on both Python 2 and Python 3; the
# original print statement was a SyntaxError on Python 3.
print(arrays[0])
# Query five parcel properties from ``togethermap.places`` with Monary,
# report how long the load took, and print summary statistics.
import time

import numpy as np
import pandas as pd
from monary import Monary

mon = Monary()
columns = [
    'properties.total_residential_units',
    'properties.total_job_spaces',
    'properties.parcel_id',
    'properties.max_dua',
    'properties.max_far',
]

t1 = time.time()
numpy_arrays = mon.query('togethermap', 'places',
                         {'collectionId': 'ZC7yyAyA8jkDFnRtf'}, columns,
                         ['float32'] * len(columns))
# Rows are documents, columns are the requested fields.
df = np.matrix(numpy_arrays).transpose()
df = pd.DataFrame(df, columns=columns)

# print(...) with one argument is valid on Python 2 and 3 alike; the
# original print statements failed to compile on Python 3.
print(time.time() - t1)
print(df.describe())
__author__ = 'jflaisha' from monary import Monary, MonaryParam import numpy as np client = Monary(database="sam") # defaults to localhost:27017 def extract_arrays(fortran_output): """ Generate list containing a tuple for each HUC in a SAM run. The output from the Fortran is a 2-dimensional array where the y-axis represents each HUC in the run. The x-axis is a series of output values, with the first value being the HUC_ID and the remaining a time-series of output values. :param fortran_output: numpy.ndarray :return: list of tuples (tuple[0] = HUC_ID (str), tuple[1] = Output values (numpy.ndarray)) """ # return list(array for array in self.fortran_output) # return [array for array in self.fortran_output] return [array for array in fortran_output] class SamMonary(object): def __init__(self, jid, huc_output_array, day_array, huc_id): """ Class represents each HUC worth of output data from SuperPRZM run. The class methods take the numpy array SuperPRZM output data and convert them to MonaryParams to be inserted into MongoDB using Monary. :param jid: string, job ID for SAM/SuperPRZM run :param huc_output_array: numpy array, SuperPRZM output data for one HUC :param day_array: numpy array, sequence of "Julian Days" of simulation date range :param huc_id: string, HUC12 ID (12 digits)