def deleteUntrackedFromCache(self, configs):
    """
    Drop cache tables whose entity type is no longer being cached.

    Does nothing unless the 'delete_cache_for_untracked_entities' config
    value is truthy. A table counts as a cache table when its name matches
    the 'rethink_entity_table_template' config value with ``type`` replaced
    by a ``*`` wildcard; it is dropped when no entry of ``configs`` names it
    under the 'table' key.
    """
    if not self.config['delete_cache_for_untracked_entities']:
        return

    # Get the list of cached entity types
    template = self.config['rethink_entity_table_template']
    allTables = rethinkdb.table_list().run(self.rethink)
    cacheGlob = template.format(type="*")
    cacheTables = [
        name for name in allTables if fnmatch.fnmatch(name, cacheGlob)
    ]

    trackedTables = [cfg['table'] for cfg in configs]
    staleTables = [name for name in cacheTables if name not in trackedTables]

    LOG.debug("Unusesd cache tables: {0}".format(staleTables))
    LOG.info("Deleting {0} cache tables".format(len(staleTables)))
    for name in staleTables:
        LOG.info("Deleting table: {0}".format(name))
        rethinkdb.table_drop(name).run(self.rethink)
def deleteUntrackedFromCache(self, configs):
    """
    Remove cached data for entity types that are no longer tracked.

    The method is a no-op when the 'delete_cache_for_untracked_entities'
    config value is falsy. Otherwise every existing table whose name matches
    the entity table template (with a ``*`` wildcard as the type) and that is
    not referenced by the 'table' key of any item in ``configs`` is dropped.
    """
    enabled = self.config['delete_cache_for_untracked_entities']
    if not enabled:
        return

    # Get the list of cached entity types
    tableTemplate = self.config['rethink_entity_table_template']
    existingTables = rethinkdb.table_list().run(self.rethink)
    wildcard = tableTemplate.format(type="*")

    cacheTables = []
    for candidate in existingTables:
        if not fnmatch.fnmatch(candidate, wildcard):
            continue
        cacheTables.append(candidate)

    inUse = [entry['table'] for entry in configs]
    orphaned = []
    for candidate in cacheTables:
        if candidate not in inUse:
            orphaned.append(candidate)

    LOG.debug("Unusesd cache tables: {0}".format(orphaned))
    LOG.info("Deleting {0} cache tables".format(len(orphaned)))
    for candidate in orphaned:
        LOG.info("Deleting table: {0}".format(candidate))
        rethinkdb.table_drop(candidate).run(self.rethink)
def initialSetup():
    """
    Prepare the RethinkDB database and tables, then flush redis keys.

    Steps, in order: create the configured rethink database when it is not
    listed yet, drop every rethink table flagged for flushing, create each
    configured table that is not present, and delete the redis keys
    ("<prefix>:*") of every redis entry flagged for flushing.
    """
    print("Setting up database...")
    known_dbs = rethinkdb.db_list().run()
    target_db = con.general.databases["rethink"]["db"]
    if target_db not in known_dbs:
        print("Creating database in rethink")
        rethinkdb.db_create(target_db).run()

    live_tables = list(rethinkdb.table_list().run())
    rethink_flags = c.general.flush["rethink"]
    for name in rethink_flags:
        if not rethink_flags[name]:
            continue
        # The message is printed even when there is no such table to drop.
        print("Flushing rethink " + name + " table...")
        if name in live_tables:
            rethinkdb.table_drop(name).run()
            # Forget the table so it gets re-created below.
            live_tables.remove(name)

    print("Creating new rethink tables...")
    for name in c.general.tables:
        if name in live_tables:
            continue
        print("Creating table {}".format(name))
        rethinkdb.table_create(name).run()

    redis_flags = c.general.flush["redis"]
    for prefix in redis_flags:
        if not redis_flags[prefix]:
            continue
        print("Flushing redis " + prefix + " keys...")
        for stale_key in con.redis.keys(prefix + ":*"):
            con.redis.delete(stale_key)
def setUp(self):
    """
    Build the 'radiowcs_test' fixture: a 'test' table with three secondary
    indexes ('title', 'artist', 'date') and four seed rows.
    """
    self.db_name = 'radiowcs_test'
    assert self.db_name != 'radiowcs'
    self.table_name = 'test'

    self.db = database.Database()
    self.db.database_name = self.db_name
    self.db.table_name = self.table_name
    self.db.connect()

    self.connection = r.connect(
        host='localhost',
        port=28015,
        db=self.db_name,
        auth_key='',
        timeout=30
    )
    conn = self.connection

    try:
        r.db_create(self.db_name).run(conn)
        r.table_create(self.table_name).run(conn)
    except r.RqlRuntimeError:
        # Creation failed: start over with an empty table.
        print('unittest setup: Drop table')
        r.table_drop(self.table_name).run(conn)
        r.table_create(self.table_name).run(conn)

    for indexed_field in ('title', 'artist', 'date'):
        r.db(self.db_name).table(self.table_name).index_create(
            indexed_field).run(conn)

    # 'out of order' insertions
    seed_rows = (
        ('foobar', 'Selena', '1430183323'),
        ('hello world', 'John', '1430082566'),
        ('zombie apoc', 'xxJANExx', '1430385845'),
        ('Black', 'Kettle', '1430284300'),
    )
    for title, artist, date in seed_rows:
        r.db(self.db_name).table(self.table_name).insert(
            {'title': title, 'artist': artist, 'date': date}).run(conn)
def do_fix(db, collection=None):
    """
    Repair inconsistent state left in the database.

    With ``collection`` omitted, removes the metadata rows and tables that
    ``find_spurious_meta_and_tables`` reports and returns
    ``(number_of_metadata_rows, number_of_tables)`` (``(0, 0)`` when there is
    nothing to remove).

    With a ``collection`` name:
      * raises ``BadCollection`` when it has no metadata row or no table;
      * when its metadata has a truthy 'doing_init', deletes the collection
        via ``do_delete`` and returns the string 'doing_init';
      * when its metadata has a truthy 'appending_filenames', strips those
        files from every record and from the metadata, and returns
        ``(appending_filenames, bad_samples, deleted, replaced)``;
      * otherwise returns None.
    """
    if collection is None:
        spurious_meta, spurious_tables = find_spurious_meta_and_tables(
            r.table('__METADATA__').run(db), r.table_list().run(db))
        if not (len(spurious_meta) or len(spurious_tables)):
            return 0, 0
        r.table('__METADATA__').get_all(*spurious_meta).delete().run(db)
        for name in spurious_tables:
            r.table_drop(name).run(db)
        return len(spurious_meta), len(spurious_tables)

    check_collection_name(collection)
    meta = r.table('__METADATA__').get(collection).run(db)
    if meta is None:
        raise BadCollection(
            'collection {} does not exist.'.format(collection))
    doing_init = meta.get('doing_init')
    appending_filenames = meta.get('appending_filenames')
    if collection not in r.table_list().run(db):
        raise BadCollection("this is a spurious collection.")

    if doing_init:
        do_delete(db, collection)
        return 'doing_init'

    if not appending_filenames:
        return None

    # Samples whose metadata value is one of the files being appended.
    bad_samples = [sample for sample, origin in meta['samples'].items()
                   if origin in appending_filenames]

    def strip_record(record):
        # Remove the pending files from each per-file map of the record.
        patch = {}
        for field in ('IDs', 'QUALs', 'FILTERs', 'INFOs'):
            patch[field] = r.literal(
                record[field].without(appending_filenames))
        patch['samples'] = r.literal(record['samples'].without(bad_samples))
        return r.branch(
            record['IDs'].keys().set_difference(appending_filenames) == [],
            None,  # delete record
            record.merge(patch))

    touched = r.table(collection).filter(
        r.row['IDs'].keys().set_intersection(appending_filenames) != [])
    result = touched.replace(strip_record).run(db)

    def strip_meta(row):
        # Drop the pending files/samples and clear the in-progress marker.
        return row.merge({
            'vcfs': r.literal(row['vcfs'].without(appending_filenames)),
            'samples': r.literal(row['samples'].without(bad_samples))
        }).without('appending_filenames')

    r.table('__METADATA__').get(collection).replace(strip_meta).run(db)
    return (appending_filenames, bad_samples,
            result['deleted'], result['replaced'])
def do_delete(db, collection):
    """
    Drop the table of ``collection`` and delete its '__METADATA__' row.

    The name is validated with ``check_collection_name`` first. Returns True
    after deleting, or None when no table with that name exists.
    """
    check_collection_name(collection)
    existing_tables = r.table_list().run(db)
    if collection in existing_tables:
        r.table_drop(collection).run(db)
        r.table('__METADATA__').get(collection).delete().run(db)
        return True
    return None
def init_table(name, **kwargs):
    """
    (Re)create the table ``name``.

    Any existing table with that name is dropped first; a
    ``ReqlOpFailedError`` raised by the drop is ignored. ``kwargs`` are
    forwarded unchanged to ``r.table_create``.
    """
    with connect() as connection:
        try:
            r.table_drop(name).run(connection)
        except r.ReqlOpFailedError:
            # Presumably the table did not exist yet; nothing to remove.
            pass
        create_query = r.table_create(name, **kwargs)
        create_query.run(connection)
def test_connection_with_database(self):
    """Creating a table in the 'test' database must not raise a driver or
    runtime error; the table is removed again afterwards."""
    flask_app = Flask(__name__)
    rethink = RethinkDB(flask_app, db='test')

    with flask_app.test_request_context():
        try:
            # Make sure RethinkDB is turned on!
            create_query = r.table_create('table')
            create_query.run(rethink.conn)
        except (RqlDriverError, RqlRuntimeError) as error:
            self.fail(error)
        else:
            # Do some cleanup
            drop_query = r.table_drop('table')
            drop_query.run(rethink.conn)
def setUp(self):
    """
    Create (or reset) the 'test' table of the 'radiowcs_test' database,
    index it on 'title', 'artist' and 'date', and insert four sample rows.
    """
    self.db_name = 'radiowcs_test'
    assert self.db_name != 'radiowcs'
    self.table_name = 'test'
    self.db = database.Database()
    self.db.database_name = self.db_name
    self.db.table_name = self.table_name
    self.db.connect()
    self.connection = r.connect(host='localhost',
                                port=28015,
                                db=self.db_name,
                                auth_key='',
                                timeout=30)

    def songs():
        # Fresh query handle for the fixture table.
        return r.db(self.db_name).table(self.table_name)

    try:
        r.db_create(self.db_name).run(self.connection)
        r.table_create(self.table_name).run(self.connection)
    except r.RqlRuntimeError:
        print('unittest setup: Drop table')
        r.table_drop(self.table_name).run(self.connection)
        r.table_create(self.table_name).run(self.connection)

    for index_name in ['title', 'artist', 'date']:
        songs().index_create(index_name).run(self.connection)

    # 'out of order' insertions
    sample_rows = [
        {'title': 'foobar', 'artist': 'Selena', 'date': '1430183323'},
        {'title': 'hello world', 'artist': 'John', 'date': '1430082566'},
        {'title': 'zombie apoc', 'artist': 'xxJANExx', 'date': '1430385845'},
        {'title': 'Black', 'artist': 'Kettle', 'date': '1430284300'},
    ]
    for row in sample_rows:
        songs().insert(row).run(self.connection)
def reseed():
    """
    Reset the products table to the test data.

    Creates the database ``db_name`` when it is not listed, drops the
    products table when it exists, then re-creates it and inserts
    ``test_products``.
    """
    if db_name not in r.db_list().run(conn):
        r.db_create(db_name).run(conn)

    if products_table in r.table_list().run(conn):
        r.table_drop(products_table).run(conn)

    r.table_create(products_table).run(conn)
    seed_query = r.table(products_table).insert(test_products)
    seed_query.run(conn)
def flush_rethink_tables():
    """
    Drop and re-create every existing table flagged in
    ``rethink_bags.flush``.

    Only tables that are both flagged (truthy value) and currently present
    in rethink are flushed.
    """
    tables_to_flush = []
    for name, should_flush in rethink_bags.flush.iteritems():
        if should_flush:
            tables_to_flush.append(name)
    logger.debug("Tables that should be flushed: {}".format(tables_to_flush))

    current_tables = rethinkdb.table_list().coerce_to("array").run()
    logger.debug("Current tables in rethink: {}".format(current_tables))

    flushing_tables = list(set(current_tables).intersection(tables_to_flush))
    logger.info("Flushing tables in rethink: {}".format(flushing_tables))

    for name in flushing_tables:
        rethinkdb.table_drop(name).run()
        rethinkdb.table_create(name).run()
        logger.debug("Table {} flushed in rethink".format(name))
def main():
    """
    Download JIRA issues month by month and load them into RethinkDB.

    Issues created in each monthly interval from 2015-03 up to (excluding)
    2016-04 are fetched (at most 1000 per interval), passed through
    ``filter_issues``, and finally bulk-inserted into a freshly re-created
    'issues' table of the 'jira' database.
    """
    jira = JIRA({'server': config['JIRA']},
                basic_auth=(config['USERNAME'], config['PASSWORD']))

    # Consecutive month boundaries; each adjacent pair is one interval.
    boundaries = [
        '2015-03', '2015-04', '2015-05', '2015-06', '2015-07', '2015-08',
        '2015-09', '2015-10', '2015-11', '2015-12', '2016-01', '2016-02',
        '2016-03', '2016-04',
    ]
    months = list(zip(boundaries[:-1], boundaries[1:]))

    total_issues = 0
    bulk_add = []
    for interval in months:
        print("Downloading issues for interval %s/%s" % interval)
        jql = "created >= '%s-01' AND created < '%s-01'" % interval
        response = jira.search_issues(jql, maxResults=1000, json_result=True)
        issues = response['issues']
        filtered_issues = filter_issues(issues)
        # Filtering is expected to keep every issue.
        assert len(filtered_issues) == len(issues)
        total_issues += len(issues)
        bulk_add.extend(filtered_issues)

    print("Successfully downloaded %d issues" % total_issues)
    print("Loading %d issues into RethinkDB" % len(bulk_add))

    r.connect(config['RETHINKDB'], 28015, db='jira').repl()
    r.table_drop('issues').run()
    r.table_create('issues').run()
    r.table('issues').insert(bulk_add).run()
    print("OK! Bye")
def test_connection_with_inexisting_database(self):
    """Creating a table in a database that does not exist must raise a
    driver or runtime error; otherwise the test fails."""
    flask_app = Flask(__name__)
    rethink = RethinkDB(flask_app, db='doesnotexist')

    with flask_app.test_request_context():
        try:
            # Make sure RethinkDB is turned on!
            # Specifying an inexisting database should raise an exception
            r.table_create('table').run(rethink.conn)
        except (RqlDriverError, RqlRuntimeError):
            # Expected outcome: nothing was created, nothing to clean up.
            return

        # Do some cleanup
        r.table_drop('table').run(rethink.conn)
        self.fail("Should have raised a RqlDriverError")
def _reset_data(self, table):
    """
    Re-create ``table`` empty, with a secondary index on 'date'.

    An existing table is dropped first (asserting that the drop reports one
    dropped table). Returns True when the index creation reports
    ``created == 1``, False otherwise.
    """
    session = self.session
    if table in rdb.table_list().run(session):
        drop_result = rdb.table_drop(table).run(session)
        assert drop_result['dropped'] == 1

    rdb.table_create(table).run(session)
    index_result = rdb.table(table).index_create('date').run(session)
    return index_result.get('created', 0) == 1
def CleanupOldState(client, blocklist):
    """
    Remove the tables for state that are no longer necessary

    Every table whose name starts with 'blk' and does not correspond to a
    block in ``blocklist`` (i.e. is not ``'blk' + <block id>``) is dropped.
    A failure to drop one table is logged and does not stop the cleanup.

    :param SawtoothClient client: sawtooth.client.SawtoothClient for
        accessing the ledger (not used by this function)
    :param list blocklist: list of block identifiers
    """
    # Materialise the names as a set: on Python 3 ``map`` returns a one-shot
    # iterator, so repeated ``in`` tests would exhaust it and later tables
    # would wrongly look unused (and be dropped).
    statenames = set('blk' + blockid for blockid in blocklist)

    tablelist = rethinkdb.table_list().run()
    for table in tablelist:
        if not table.startswith('blk') or table in statenames:
            continue
        try:
            logger.info('drop old state table %s', table)
            rethinkdb.table_drop(table).run()
        except Exception:
            # Best effort: keep cleaning up the remaining tables.
            logger.exception('failed to drop state table %s', table)
def drop_tables():
    """
    Drop the table of every registered model that currently exists.

    Raises:
        RuntimeError: when a drop does not report exactly one dropped table.
    """
    from .registry import model_registry

    existing = r.table_list().run()
    for model_cls in model_registry.all().values():
        if model_cls._table not in existing:
            continue
        outcome = r.table_drop(model_cls._table).run()
        if outcome['tables_dropped'] == 1:
            continue
        raise RuntimeError('Could not drop table %s for model %s' % (
            model_cls._table, model_cls.__name__))
def drop_tables():
    """
    Remove the existing tables that back the models in the model registry.

    Raises:
        RuntimeError: when the drop result's 'tables_dropped' is not 1.
    """
    from .registry import model_registry

    created_tables = r.table_list().run()
    registered_models = model_registry.all().values()
    for model in registered_models:
        table_exists = model._table in created_tables
        if table_exists:
            drop_result = r.table_drop(model._table).run()
            dropped_count = drop_result['tables_dropped']
            if dropped_count != 1:
                details = (model._table, model.__name__)
                raise RuntimeError(
                    'Could not drop table %s for model %s' % details)
def LocalMain(config):
    """
    Drop every table of the configured RethinkDB database.

    Connection settings are read from ``config``: 'DatabaseHost' (default
    'localhost'), 'DatabasePort' (default 28015) and the required
    'DatabaseName'. A failure to drop one table is logged and the remaining
    tables are still processed. The connection is always closed, even when
    selecting the database or listing the tables fails.

    :param dict config: configuration mapping
    :raises KeyError: when 'DatabaseName' is missing from ``config``
    """
    # pull database and collection names from the configuration and set up the
    # connections that we need
    dbhost = config.get('DatabaseHost', 'localhost')
    dbport = int(config.get('DatabasePort', 28015))
    dbname = config['DatabaseName']

    rconn = rethinkdb.connect(dbhost, dbport)
    try:
        # Make this the default connection for the argument-less run() calls.
        rconn.repl()
        rconn.use(dbname)

        tablelist = rethinkdb.table_list().run()
        for table in tablelist:
            try:
                logger.info('drop table %s', table)
                rethinkdb.table_drop(table).run()
            except Exception:
                # Best effort: continue with the remaining tables.
                logger.exception('failed to drop table %s', table)
    finally:
        rconn.close()
def delete_unused_tables(conn):
    """
    Drop the tables that are no longer used.

    The drops are issued one by one through ``run_rql`` in a fixed order;
    'sample2sample' is dropped both from the connection's default database
    and from the 'mcpub' database.
    """
    before_mcpub = (
        "userprofiles",
        "usergroups",
        "runs",
        "reviews",
        "review2item",
        "machines",
        "ui",
        "elements",
        "sample2sample",
    )
    after_mcpub = (
        "shares",
        "user2share",
        "experimenttasks",
        "experiment2experimenttask",
        "experimenttask2process",
        "experimentnotes",
        "experiment2experimentnote",
        "dataset2experimentnote",
    )

    for name in before_mcpub:
        run_rql(r.table_drop(name), conn)

    run_rql(r.db('mcpub').table_drop("sample2sample"), conn)

    for name in after_mcpub:
        run_rql(r.table_drop(name), conn)
def table_drop(cls):
    """Drop the table named by ``cls.__table_name__`` on the connection
    returned by ``get_conn()`` and return the query result."""
    drop_query = r.table_drop(cls.__table_name__)
    return drop_query.run(get_conn())
def drop_table(self):
    """Drop the table ``self._table_name`` using ``self._conn``."""
    drop_query = r.table_drop(self._table_name)
    drop_query.run(self._conn)
import rethinkdb as r

# Connect on port 42865 and make it the default connection for run().
connection = r.connect(port=42865)
connection.repl()

# Start from a fresh "foo" table.
r.table_drop("foo").run()
r.table_create("foo").run()

# Create an index "sid" whose value is computed by a JavaScript expression,
# and show what the server answers.
index_result = r.table("foo").index_create("sid", lambda x: r.js("1")).run()
print(index_result)
def delete_table(self, table):
    """Drop ``table`` on a connection obtained from ``self._get_conn()``."""
    with self._get_conn() as connection:
        drop_query = rdb.table_drop(table)
        drop_query.run(connection)
def deleteTable(self, tname):
    """
    Drop the table ``tname`` from the ``DB_NAME`` database and print the
    result returned by the server.

    A dedicated connection is opened for the call and is always closed
    again, whether or not the drop succeeds.
    """
    conn = rdb.connect(db=DB_NAME)
    try:
        res = rdb.table_drop(tname).run(conn)
    finally:
        # Previously the connection was left open on every call.
        conn.close()
    print(res)
import rethinkdb as r

# Default connection for every run() below.
r.connect(port=42865).repl()

# Re-create the "foo" table from scratch.
for rebuild_step in (r.table_drop, r.table_create):
    rebuild_step("foo").run()

# Index "sid" is defined through a JavaScript expression; print the reply.
sid_index = r.table("foo").index_create("sid", lambda x: r.js("1"))
print(sid_index.run())
def table_drop(cls):
    """Drop the table named by ``cls.Meta.table_name`` on the connection
    returned by ``get_conn()`` and return the query result."""
    drop_query = r.table_drop(cls.Meta.table_name)
    return drop_query.run(get_conn())
def drop_table(table, conn):
    """Execute a drop of ``table`` on ``conn`` through the ``run`` helper."""
    drop_query = r.table_drop(table)
    run(drop_query, conn)
def retrieve_records(api_key, sensor_path, table_name, end_date=None,
                     start_date=None, json_chunk_size=5e3, verbosity=1):
    '''Pull records from Acyclica's API and write to RethinkDB.

    api_key [str]: the 41-character alphanumeric key you were given by
        Acyclica. Should be read in from an environment variable, encrypted
        if possible.
    sensor_path [str]: the path to Acyclica_sensors_CBD.csv (should be
        fetched automatically once we package this thing).
    table_name [str]: the name of the RethinkDB table that will be written.
        If a table of the same name already exists, it will be overwritten.
    end_date [str]: a date string of the form 'YYYY-MM-DD' specifying the
        last day of data to pull from Acyclica. Defaults to None, which
        means yesterday (evaluated at call time).
    start_date [str]: a date string of the form 'YYYY-MM-DD' specifying the
        first day of data to fetch from Acyclica. Defaults to None, which
        means only end_date will be fetched. Set this to 'prev_week' to
        fetch the seven days ending on end_date.
    json_chunk_size [int or float of form BASEeEXP]: lists passed to
        jumbo_write_json will be broken into chunks of this size. No need to
        modify unless you encounter memory use issues, in which case you
        should first try reducing the default value of 5,000. Values above
        100,000 are rejected.
    verbosity [int]: determines the number of reports that will be printed.
        0 = no reports
        1 = reports from this function only
        2 = more reports from this function and from subroutine
            jumbo_write_json.

    Raises Exception when json_chunk_size is too large, when a date is not
    of the form 'YYYY-MM-DD', when the range exceeds one week, or when
    end_date precedes start_date.

    Calls df_to_json_etc and jumbo_write_json.
    Must be connected to a RethinkDB instance before using this.
    Pull at minimum 1 day and at maximum 1 week of data in increments of
    1 day.'''

    #start timing
    start_time = time.time()

    #resolve the default here: a default computed in the signature would be
    #frozen at import time and go stale in a long-running process
    if end_date is None:
        end_date = (datetime.datetime.strptime(time.strftime('%Y-%m-%d'),
                                               '%Y-%m-%d')
                    - datetime.timedelta(days=1)).strftime('%Y-%m-%d')

    #check for size limit errors
    if json_chunk_size > 1e5:
        raise(Exception('Maximum json_chunk_size is 100,000. This size is '
                        'rarely a good idea.'))

    #check for end_date format error
    try:
        datetime.datetime.strptime(end_date, '%Y-%m-%d')
    except (TypeError, ValueError):
        raise(Exception('end_date must be of the form "YYYY-MM-DD".'))

    #set appropriate start dates based on input
    if start_date == 'prev_week':
        start_date = (datetime.datetime.strptime(end_date, '%Y-%m-%d')
                      - datetime.timedelta(days=6)).strftime('%Y-%m-%d')
    elif start_date is None:
        start_date = end_date

    #check for start_date format error
    try:
        datetime.datetime.strptime(start_date, '%Y-%m-%d')
    except (TypeError, ValueError):
        raise(Exception('start_date must be of the form "YYYY-MM-DD".'))

    #add 23 h, 59 m, and 59 s to the end date (to grab the whole day)
    end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d') \
        + datetime.timedelta(hours=23, minutes=59, seconds=59)

    #convert datetime objects to unix time
    start_unix = int(time.mktime(
        datetime.datetime.strptime(start_date, '%Y-%m-%d').timetuple()))
    end_unix = int(time.mktime(end_date.timetuple()))

    #make sure the user isn't trying to grab more than a week of data, and that
    #end is after start
    if end_unix - start_unix > 604800:
        raise(Exception('Please specify a range of dates no greater than one week.'))
    if end_unix - start_unix < 0:
        raise(Exception('end_date must be later than start date.'))

    #determine how many days have been selected
    dif = end_unix - start_unix
    ndays = math.ceil(dif / (24 * 3600))

    #get sensor data
    sensors = pd.read_csv(sensor_path)
    sensors.columns = ['IntersectionID','sensor']
    sensor_list = list(sensors['sensor'])

    if verbosity > 0:
        print('Preparing to acquire data for ' + str(ndays) + ' day(s) and '
              + str(len(sensor_list)) + ' sensors.')

    #remove tables and create them anew
    try:
        r.table_drop(table_name).run()
    except Exception:
        #best effort: presumably the table simply did not exist yet
        pass
    else:
        if verbosity > 0:
            print('Table "' + table_name + '" already existed and has '
                  + 'been deleted.')
    r.table_create(table_name).run()
    if verbosity > 0:
        print('Created table "' + table_name + '".')

    #request and process one day at a time (roughly 5-10m records acquired per day)
    columns = ['Timestamp','MAC Hash','Strength','Serial']
    day_start_unix = start_unix
    for day in range(ndays):

        #honor verbosity=0 ("no reports") for the progress message as well
        if verbosity > 0:
            print('Acquiring records for day ' + str(day + 1) + ' of '
                  + str(ndays) + '. May take several minutes.')

        #get endpoints for each iteration
        day_end_unix = day_start_unix + (23 * 3600) + 3599

        #collect the per-sensor frames and concatenate once: DataFrame.append
        #was removed in pandas 2.0 and copied the whole frame on every call.
        #The leading empty frame keeps the result well-defined with no sensors.
        frames = [pd.DataFrame(columns=columns)]
        reads_so_far = 0

        #request and preprocess each sensor separately
        for sensor_number, sensor in enumerate(sensor_list, 1):

            URL = "https://cr.acyclica.com/datastream/device/csv/time/" \
                + api_key + "/" + str(sensor) + "/" \
                + str(day_start_unix) + "/" + str(day_end_unix)

            #get raw web content and read into a dataframe
            items = requests.get(URL).content
            newdf = pd.read_csv(io.StringIO(items.decode('utf-8')),
                                usecols=columns)

            #round timestamp to nearest second
            newdf['Timestamp'] = newdf['Timestamp'].round().astype('int')

            #drop repeated reads within 1s, keeping read with highest strength
            #NOTE(review): the maximum is taken over 'Serial', not 'Strength'
            #- confirm which column was intended
            strmaxes = newdf.groupby(['Timestamp', 'MAC Hash'])['Serial'].transform(max)
            newdf = newdf[newdf['Serial'] == strmaxes]

            frames.append(newdf)
            reads_so_far += len(newdf)

            if verbosity == 2 and sensor_number in [15,30,45]:
                print('Got data for ' + str(sensor_number) + ' of '
                      + str(len(sensor_list))
                      + ' sensors. So far there are ' + str(reads_so_far)
                      + ' reads for day ' + str(day + 1) + '.')

        df = pd.concat(frames, ignore_index=True)
        del frames

        #drop repeated reads again, keeping read with highest strength
        strmaxes = df.groupby(['Timestamp', 'MAC Hash'])['Serial'].transform(max)
        df = df[df['Serial'] == strmaxes]

        pre_filt_len = str(len(df))
        if verbosity > 0:
            print('Found ' + pre_filt_len + ' sensor reads for day '
                  + str(day + 1) + '. Cleaning those now.')

        json_list = df_to_json_etc(df, verbosity, pre_filt_len, sensors)

        if verbosity > 0:
            print('Converted DataFrame to JSON list and grouped by hash. '
                  + 'Passing list of length ' + str(len(json_list))
                  + ' to jumbo_write_json.')

        #set verbosity for jumbo_write_json
        sil = False if verbosity == 2 else True

        #write list to rethink
        jumbo_write_json(data=json_list, table_name=table_name,
                         chunk_size=json_chunk_size, silent=sil)

        #increment day
        day_start_unix = day_start_unix + (24 * 3600)

    if verbosity > 0:
        run_time = round((time.time() - start_time) / 60, 2)
        print('Finished writing all records for ' + str(ndays) + ' day(s) '
              + 'in ' + str(run_time) + ' minutes.')
def drop_db():
    """Drop every table listed on ``db.conn``, then print a confirmation."""
    for table_name in r.table_list().run(db.conn):
        drop_query = r.table_drop(table_name)
        drop_query.run(db.conn)
    print('Tables have been dropped.')