def parse_all_db_files(params):
    """
    Using the config params, determines all the locations where our database
    files live, then parses each one and stores its contents in a list.
    Returns a list of dictionaries; each entry holds the database contents
    from one of the sources.
    """
    db_list = []
    logging.info('Parsing primary database at %s' % params['data_retention_db'])
    db_list.append(utils.load_database(params['data_retention_db'], params))
    for db_file in params['backup_db_list']:
        logging.info('Parsing backup database file at %s' % db_file)
        db_list.append(utils.load_database(db_file, params))
    logging.info('List of database info: %s' % db_list)
    return db_list
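
# Hedged illustration of the inputs parse_all_db_files expects.  The key names
# come from the function body above; the file paths are hypothetical.
example_params = {
    'data_retention_db': '/path/to/primary.db',            # hypothetical path
    'backup_db_list': ['/path/to/backup_a.db',
                       '/path/to/backup_b.db'],            # hypothetical paths
}
# The returned list preserves this ordering: index 0 holds the primary
# database's contents, the remaining entries the backups'.
# all_dbs = parse_all_db_files(example_params)
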
def query_metadata(params):
    params = utils.load_paramDict(params)
    fout = open(params['seqlist'], 'w') if params.get('seqlist', None) is not None else sys.stdout
    data = utils.load_database(**params)
    db_columns = params['db_columns'] + params['metadata_columns'] + params['taxa_columns']
    if params.get('default', None) is not None:
        tmp = {v['MapDB']: v for v in params['default_bowtie']}
        filters = tmp[params['default']]
    else:
        filters = {key: params[key]
                   for key in db_columns + ['name', 'tag', 'min', 'max', 'group']
                   if params.get(key, None) is not None}
    for fld, value in filters.iteritems():
        if fld in db_columns:
            data = data[data[fld].isin(value.split(','))]
        elif fld == 'min':
            data = data[data['size'].astype(int) >= int(value)]
        elif fld == 'max':
            data = data[data['size'].astype(int) <= int(value)]
        elif fld == 'group':
            data = data[data['barcode'].str.contains(value)]
        elif fld == 'tag':
            data = data.reset_index(drop=True)
            barcodes = pd.DataFrame(
                data['barcode'].apply(lambda barcode: [int(b[1:]) for b in barcode.split('.')]).tolist(),
                columns=params['barcode_tag'])
            for f in value.split(';'):
                f = f.strip()
                g1, g2 = f[0], f[-1]
                if f.find('==') > 0:
                    barcodes = barcodes[barcodes[g1] == barcodes[g2]]
                else:
                    barcodes = barcodes[barcodes[g1] != barcodes[g2]]
            data = data.loc[barcodes.index].reset_index(drop=True)
    data.to_csv(fout, index=False, sep='\t')
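
# A self-contained sketch of the 'tag' filter used in query_metadata above.
# The barcode strings and the tag column names ('s', 'r') are assumptions made
# for the example; the expression handling mirrors the function's logic.
import pandas as pd

barcodes = pd.DataFrame(
    [[int(b[1:]) for b in bc.split('.')] for bc in ['s12.r12', 's12.r34', 's5.r5']],
    columns=['s', 'r'])

expr = 's==r'                    # keep rows whose 's' and 'r' tags agree
g1, g2 = expr[0], expr[-1]
if expr.find('==') > 0:
    kept = barcodes[barcodes[g1] == barcodes[g2]]
else:
    kept = barcodes[barcodes[g1] != barcodes[g2]]
print(kept)                      # rows 0 and 2 survive
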
def test_calls_removal_function(self, mock_deletion_func, mock_reminder_func, mock_parser):
    params = self.static_params.copy()
    params['data_retention_db'] = os.path.join(this_dir, 'test_db_file8.db')
    params['backup_db_list'] = [
        os.path.join(this_dir, 'test_db_file8.db'),  # the same file
    ]
    mock_parser.return_value = params

    cloud_retention_scanner.main()

    # read the file contents
    db = utils.load_database(params['data_retention_db'], params)
    mock_deletion_func.assert_called_once_with('abc', db['abc'], params)
def test_calls_reminder_email_function_for_multiple_on_different_days(
        self, mock_reminder_func, mock_parser):
    params = self.static_params.copy()
    params['data_retention_db'] = os.path.join(this_dir, 'test_db_file6.db')
    params['backup_db_list'] = [
        os.path.join(this_dir, 'test_db_file6.db'),  # the same file
    ]
    mock_parser.return_value = params

    cloud_retention_scanner.main()

    # read the file contents
    db = utils.load_database(params['data_retention_db'], params)
    calls = [
        mock.call('abc', db['abc'], params, 7),
        mock.call('def', db['def'], params, 14),
    ]
    mock_reminder_func.assert_has_calls(calls)
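
# A self-contained illustration of the mock.call / assert_has_calls pattern the
# test above relies on; the argument values here are made up.
import mock  # on Python 3, unittest.mock

reminder = mock.Mock()
reminder('abc', {'id': 1}, {}, 7)
reminder('def', {'id': 2}, {}, 14)
# passes only if both calls happened, in this relative order
reminder.assert_has_calls([
    mock.call('abc', {'id': 1}, {}, 7),
    mock.call('def', {'id': 2}, {}, 14),
])
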
def query_read(params):
    params = utils.load_paramDict(params)
    params['bootstrap'] = int(params['bootstrap']) if 'bootstrap' in params else 0
    data = utils.load_database(**params)
    # each gate is a substring test: starting stage 'k' runs every step whose
    # gate string contains 'k', i.e. all steps from stage k onward
    if params.get('stage', '0') in '0':
        bowtie2matrix(**params)
    if params.get('stage', '0') in '01':
        summary_matrix(data, **params)
    if params.get('stage', '0') in '012':
        qvector = ipopt(least_amount=[params['minFreq'], params['minNum']], **params)
    if params.get('stage', '0') in '0123':
        qvector = os.path.join(params['workspace'], 'ipopt.qmatrix.solution')
        assign_reads(data, qvector, **params)
    if params.get('stage', '0') in '01234':
        assign = os.path.join(params['workspace'], 'read_assignment.gz')
        profiling(data, assign, **params)

    import glob
    for fname in glob.glob(os.path.join(params['workspace'], 'r?.fastq')):
        subprocess.Popen(['gzip', '-f', fname]).communicate()
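
# The stage gates above use substring membership: a one-character stage value
# satisfies `stage in '012'` exactly when it is '0', '1' or '2'.  A
# self-contained check of the idiom:
for stage in ('0', '2', '4'):
    ran = [name for name, gate in [('bowtie2matrix', '0'),
                                   ('summary_matrix', '01'),
                                   ('ipopt', '012'),
                                   ('assign_reads', '0123'),
                                   ('profiling', '01234')]
           if stage in gate]
    print((stage, ran))   # stage '2' runs ipopt, assign_reads and profiling
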
    # (excerpt begins mid-statement, inside the tail of profiling(); the
    # enclosing sorted(...) loop header is not shown)
                key=lambda x: (x[1][0], x[0]), reverse=True):
        if w / basic_aln[0] >= minFreq:
            fout.write('{0}\t{1:.4f}\t{2:.4f}\t{3} ({4})\n'.format(
                g, w * 100.0 / basic_aln[0], w * 100.0 / basic_aln[1],
                '|'.join(t), ','.join(r)))
    print 'Profiling results are in {0}'.format(
        os.path.join(params['workspace'], 'profile.txt'))


if __name__ == '__main__':
    params = utils.load_params(sys.argv)
    params['bootstrap'] = int(params['bootstrap']) if 'bootstrap' in params else 0
    data = utils.load_database(**params)
    o_t = time.time()
    if params.get('stage', '0') in '0':
        bowtie2matrix(**params)
    if params.get('stage', '0') in '01':
        summary_matrix(data, **params)
    if params.get('stage', '0') in '012':
        qvector = ipopt(least_amount=[params['minFreq'], params['minNum']], **params)
    if params.get('stage', '0') in '0123':
        qvector = os.path.join(params['workspace'], 'ipopt.qmatrix.solution')
        assign_reads(data, qvector, **params)
    if params.get('stage', '0') in '01234':
        assign = os.path.join(params['workspace'], 'read_assignment.gz')
        profiling(data, assign, **params)
def main(project_mapping):
    """
    project_mapping is a two-level nested dict.  The first level's keys are
    the iLab project IDs, and each one maps to a dict.  Each 'second level'
    dict has 'bucket' and 'client_emails' keys, which give the bucket name
    (gs://<bucket name>) and a list of emails, respectively.
    """
    logging.info('In cloud tracking module')
    logging.info('Project mapping: %s' % project_mapping)

    # get some configuration parameters
    params = utils.parse_config_file('TRACKING')

    # need to clean up some of the parameters:
    try:
        params['retention_period'] = int(params['retention_period'])
        logging.info('Params read from config: %s' % params)
        logging.info('Retention period set to %s days' % params['retention_period'])
    except Exception:
        logging.error('Could not interpret one of the configuration parameters correctly. '
                      'Check that the intended data types match those in the config file.')
        sys.exit(1)

    # set the expiration date
    target_date = datetime.datetime.now() + datetime.timedelta(days=params['retention_period'])

    # read the database file
    this_dir = os.path.dirname(os.path.realpath(__file__))
    params['data_retention_db'] = os.path.join(this_dir, params['data_retention_db'])
    if os.path.isfile(params['data_retention_db']):
        logging.info('About to parse database file')
        project_database = utils.load_database(params['data_retention_db'], params)
        logging.info('Parsed from the database: %s' % project_database)
    else:
        logging.error('Could not find a database file at %s' % params['data_retention_db'])
        raise MissingPrimaryDatabaseException('The primary database file is missing. Fix that.')

    for project_id, info_dict in project_mapping.items():
        logging.info('Checking project with iLab ID: %s' % project_id)
        # perhaps we have an ongoing project; a bucket for this iLab ID probably already exists
        if project_id in project_database:
            logging.info('Project with ID %s was already in our database. Plan to update the deletion date.' % project_id)
            # get the info we have about this project in our database
            db_entry = project_database[project_id]
            # ensure the bucket names match.  If they do, simply update the
            # retention target date and the email contacts.
            if info_dict['bucket'] == db_entry['bucket']:
                logging.info('The delivery buckets matched, as expected')
                logging.info('Changing deletion date from %s to %s' % (
                    db_entry['target_date'].strftime(params['date_format']),
                    target_date.strftime(params['date_format'])))
                db_entry['target_date'] = target_date
                existing_emails = set(db_entry['client_emails'])
                new_emails = set(info_dict['client_emails'])
                total_emails = existing_emails.union(new_emails)
                logging.info('Original emails were: %s' % existing_emails)
                logging.info('New emails were: %s' % new_emails)
                logging.info('The union of those sets of emails is %s' % total_emails)
                db_entry['client_emails'] = list(total_emails)
            else:
                # somehow the same iLab project was placed into a different bucket.
                # This should not happen, so raise an exception: we retain a
                # 1-to-1 mapping between iLab IDs and buckets.  Maybe later we
                # change this behavior based on a particular use-case.
                logging.error('The bucket name did not match that of a prior project with the same iLab ID. This should not happen.')
                logging.error('The bucket found in the database was: %s' % db_entry['bucket'])
                logging.error('The bucket that the demux was just uploaded to was: %s' % info_dict['bucket'])
                # TODO: send a message for someone to fix it.
                raise MultipleBucketsForSameProjectException(
                    'The iLab IDs were the same, but the bucket was somehow different. Someone needs to check this!')
        else:
            logging.info('A new project will be added to the database.')
            logging.info('update info dict. Before %s, then add %s' % (info_dict, target_date))
            info_dict.update({'target_date': target_date})
            project_database[project_id] = info_dict

    logging.info('Project database: %s' % project_database)

    # now write to the database file:
    utils.write_to_db(project_database, params)
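
# A hedged sketch of the project_mapping structure that main()'s docstring
# describes; the IDs, bucket names and addresses are invented for the example.
example_mapping = {
    'ilab-0001': {'bucket': 'gs://delivery-bucket-0001',
                  'client_emails': ['pi@example.org']},
    'ilab-0002': {'bucket': 'gs://delivery-bucket-0002',
                  'client_emails': ['lab@example.org', 'tech@example.org']},
}
# On re-delivery of a known project, main() merges contacts with a set union,
# so duplicate addresses collapse:
merged = sorted(set(['pi@example.org']) | set(['pi@example.org', 'new@example.org']))
# -> ['new@example.org', 'pi@example.org']
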
def reload_config(self):
    self.search_engines = load_database(self.filename)
    self.log.info("%d search engines loaded" % len(self.search_engines))
def reload_config(self):
    self.search_engines = load_database(self.search_engine_file)
def reload_config(self):
    self.messages = load_database(self.filename)
    self.log.info("%d messages loaded" % len(self.messages))
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    gpus = [int(i) for i in args.gpu.split(',')]
    if len(gpus) == 1:
        torch.cuda.set_device(int(args.gpu))
    else:
        torch.cuda.set_device(gpus[0])
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %s' % args.gpu)
    logging.info("args = %s", args)

    criterion = nn.MSELoss()
    criterion = criterion.cuda()
    model = Network(args.init_channels, args.code_size, args.layers, criterion)
    model = model.cuda()
    if len(gpus) > 1:
        model = nn.parallel.DataParallel(model, device_ids=gpus, output_device=gpus[0])
        model = model.module
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    if args.dataset_name == 'CUB_200_2011':
        args.root = os.path.join(args.root, 'CUB_200_2011')
        dataset = CUB_200_2011
    elif args.dataset_name == 'Stanford_Dogs':
        args.root = os.path.join(args.root, 'Stanford_Dogs')
        dataset = Stanford_Dog
    else:
        logging.info('Dataset %s does not exist.' % args.dataset_name)
        sys.exit(1)

    train_data = dataset(args.root, if_train=True)
    num_train = len(train_data)
    indices = list(range(num_train))
    split = int(np.floor(args.train_portion * num_train))
    print('train data length: ', split)
    train_queue = DataLoader(
        train_data, batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        num_workers=8)

    database = RetrievalModel(args=args)
    database.cuda()
    assert os.path.isfile(args.load_db)
    utils.load_database(database, args.load_db)
    database.eval()

    optimizer = get_optimizer(model.parameters(), args.optimizer)

    for epoch in range(args.epochs):
        logging.info('epoch %d', epoch)
        genotype = model.genotype()
        logging.info('genotype = %s', genotype)
        print(F.softmax(model.alphas_normal, dim=-1))
        print(F.softmax(model.alphas_reduce, dim=-1))

        # training
        train(train_queue, model, criterion, optimizer, database)
        utils.save(model, os.path.join(args.save, 'weights.pt'))
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    gpus = [int(i) for i in args.gpu.split(',')]
    if len(gpus) == 1:
        torch.cuda.set_device(int(args.gpu))
    else:
        torch.cuda.set_device(gpus[0])
    # cudnn.benchmark = True
    torch.manual_seed(args.seed)
    # cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %s' % args.gpu)
    logging.info("args = %s", args)

    genotype = eval("genotypes.%s" % args.arch)
    model = Network(args.init_channels, args.code_size, args.layers, args.auxiliary, genotype)
    model = model.cuda()
    if args.continue_train:
        utils.load(model, args.load_query)
    if len(gpus) > 1:
        model = nn.parallel.DataParallel(model, device_ids=gpus, output_device=gpus[0])
        model = model.module
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    if args.dataset_name == 'CUB_200_2011':
        args.root = os.path.join(args.root, 'CUB_200_2011')
        dataset = CUB_200_2011
    elif args.dataset_name == 'Stanford_Dogs':
        args.root = os.path.join(args.root, 'Stanford_Dogs')
        dataset = Stanford_Dog
    else:
        logging.info('Dataset %s does not exist.' % args.dataset_name)
        sys.exit(1)

    train_queue = DataLoader(dataset(args.root, if_train=True), batch_size=args.batch_size,
                             shuffle=True, num_workers=8)

    database = RetrievalModel(args=args)
    database.cuda()
    if len(gpus) > 1:
        database = nn.parallel.DataParallel(database, device_ids=gpus, output_device=gpus[0])
        database = database.module
    assert os.path.isfile(args.load_db)
    utils.load_database(database, args.load_db)
    database.eval()

    criterion = nn.MSELoss()
    criterion = criterion.cuda()
    optimizer = get_optimizer(model.parameters(), args.optimizer)
    if len(gpus) > 1:
        # note: wrapping and immediately taking .module leaves the optimizer unchanged
        optimizer = nn.DataParallel(optimizer, device_ids=gpus).module

    for epoch in range(args.epochs):
        logging.info('epoch %d', epoch)
        model.drop_path_prob = args.drop_path_prob * epoch / args.epochs
        train(train_queue, model, criterion, optimizer, database)
        utils.save(model, os.path.join(args.save, 'weights.pt'))
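
# In the two training scripts above, utils.load_database restores pretrained
# weights into the retrieval model rather than parsing a database file.  A
# minimal sketch of what such a helper might look like (an assumption; the
# real utils module is not shown here):
import torch

def load_database_sketch(model, checkpoint_path):
    state = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(state)   # weights only; no optimizer state restored
    return model
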
def db_MapDB(params):
    params = utils.load_paramDict(params)
    params['dbtype'] = params.get('dbtype', 'minimap2')
    db_columns = [c for c in params['db_columns'] + params['metadata_columns'] + params['taxa_columns']
                  if c not in ('sha256',)]
    assert params.get('seqlist', None) is not None, 'seqlist is required.'
    data = utils.load_database(**params)

    if params['seqlist'] in ('stdin', '-', ''):
        fin = sys.stdin
    else:
        fin = open(params['seqlist'])
    glist = pd.read_csv(fin, delimiter='\t', dtype='str')
    fin.close()

    mapdb = params['MapDB']
    mapdb = os.path.join(params['bowtie_db'], mapdb)
    start_id = 0
    indices = {i: 1 for i in glist['index'].tolist()}
    if len(glob.glob(mapdb + '.*')) > 0:
        assert params.get('mode', '') in ('overwrite', 'append'), \
            'An old database with the same name is present. Use a new name with "MapDB=", ' \
            'or choose between "mode=overwrite" and "mode=append".'
        if params.get('mode', '') == 'overwrite':
            for fname in glob.glob(mapdb + '.*'):
                os.unlink(fname)
        elif params.get('mode', '') == 'append':
            for fname in glob.glob(mapdb + '.*.taxa.gz'):
                i = int(fname.rsplit('.', 3)[1])
                if i >= start_id:
                    start_id = i + 1
                with gzip.open(fname) as fin:
                    for line in fin:
                        indices[line.strip().split()[1]] = 2

    data = data.set_index('index', drop=False)
    data['size'] = data['size'].astype(int)
    data = data.loc[[i for i, t in indices.iteritems() if t == 1]].sort_values(by=['size'], ascending=[False])

    # first-fit packing of genomes (largest first) into buckets of at most 3.8 Gb each
    min_file_num = int(np.ceil(np.sum(data['size']).astype(float) / 3800000000))
    buckets = [[0, []] for n in xrange(min_file_num)]
    id = -1
    for index, size, file_path, url_path in data[['index', 'size', 'file_path', 'url_path']].values:
        size, done = int(size), 0
        for id in range(id + 1, len(buckets)) + range(id + 1):
            b = buckets[id]
            if b[0] + size <= 3800000000:
                b[0] += size
                b[1].append([index, size, file_path, url_path])
                done = 1
                break
        if done == 0:
            buckets.append([size, [[index, size, file_path, url_path]]])

    if params['dbtype'] == 'minimap2':
        pool = Pool(min(params['n_thread'], len(buckets)))
        result = pool.imap_unordered(create_db, [
            [params['minimap2'], mapdb, start_id + id, bucket[1], params['dbtype']]
            for id, bucket in enumerate(buckets)])
        # serial alternative:
        # result = map(create_db, [[params['minimap2'], mapdb, start_id + id, bucket[1],
        #                           params['dbtype']] for id, bucket in enumerate(buckets)])
    else:
        result = map(create_db, [
            [params['malt_build'], mapdb, start_id + id, bucket[1], params['dbtype']]
            for id, bucket in enumerate(buckets)])
    for r in result:
        if r[2] != 0:
            print 'Database {0}.{1} FAILED with code {2}!'.format(*r)
    with open(mapdb + '.info', 'w') as fout:
        for id, bucket in enumerate(buckets):
            for b, _, _, _ in bucket[1]:
                fout.write('{0}\t{1}\n'.format(b, id + start_id))
    print 'Done'


if __name__ == '__main__':
    db_MapDB(dict([[k.strip() for k in arg.split('=', 1)] for arg in sys.argv[1:]]))
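
# A self-contained sketch of the bucket-packing loop in db_MapDB: items are
# taken largest-first and placed first-fit into buckets capped at CAP, with the
# scan resuming just after the last bucket used (round-robin).  The sizes are
# toy numbers; the real code caps buckets at 3,800,000,000 bytes.
CAP = 10
sizes = sorted([7, 5, 4, 3, 1], reverse=True)
buckets = [[0, []]]     # each bucket: [occupied, [item sizes]]
bid = -1
for size in sizes:
    placed = False
    for bid in list(range(bid + 1, len(buckets))) + list(range(bid + 1)):
        if buckets[bid][0] + size <= CAP:
            buckets[bid][0] += size
            buckets[bid][1].append(size)
            placed = True
            break
    if not placed:
        buckets.append([size, [size]])
print(buckets)   # -> [[10, [7, 3]], [10, [5, 4, 1]]]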