def parse_all_db_files(params):
    """
	Using the config params, determines all the locations where our database files live.
	Then, parses them each and stores them in a list.  In the end, it returns a list of 
	dictionaries.  Each entry in that list are the database contents from the different sources.
	"""
    db_list = []
    logging.info('Parsing primary database at %s' %
                 params['data_retention_db'])
    db_list.append(utils.load_database(params['data_retention_db'], params))
    for db_file in params['backup_db_list']:
        logging.info('Parsing backup database file at %s' % db_file)
        db_list.append(utils.load_database(db_file, params))
    logging.info('List of database info: %s' % db_list)
    return db_list
Example #2
def query_metadata(params) :
    params = utils.load_paramDict(params)    
    fout = open(params['seqlist'], 'w') if params.get('seqlist', None) is not None else sys.stdout
    data = utils.load_database(**params)

    db_columns = params['db_columns'] + params['metadata_columns'] + params['taxa_columns']
    if params.get('default', None) is not None :
        tmp = {v['MapDB']:v for v in params['default_bowtie'] }
        filter = tmp[params['default']]
    else :
        filter = { key:params[key] for key in db_columns + ['name', 'tag', 'min', 'max', 'group'] if params.get(key, None) is not None }
    for fld, value in filter.items() :
        if fld in db_columns :
            data = data[ data[fld].isin(value.split(',')) ]
        elif fld == 'min' :
            data = data[ data['size'].astype(int) >= int(value) ]
        elif fld == 'max' :
            data = data[ data['size'].astype(int) <= int(value) ]
        elif fld == 'group' :
            data = data[ data['barcode'].str.contains(value) ]
        elif fld == 'tag' :
            data = data.reset_index(drop=True)
            barcodes = pd.DataFrame(data['barcode'].apply(lambda barcode:[int(b[1:]) for b in barcode.split('.')]).tolist(), columns=params['barcode_tag'])
            
            for f in value.split(';') :
                f = f.strip()
                g1, g2 = f[0], f[-1]
                if f.find('==') > 0 :
                    barcodes = barcodes[barcodes[g1] == barcodes[g2]]
                else :
                    barcodes = barcodes[barcodes[g1] != barcodes[g2]]
            data = data.loc[barcodes.index].reset_index(drop=True)

    data.to_csv(fout, index=False, sep='\t')
Example #3
    def test_calls_removal_function(self, mock_deletion_func,
                                    mock_reminder_func, mock_parser):
        params = self.static_params.copy()
        params['data_retention_db'] = os.path.join(this_dir,
                                                   'test_db_file8.db')
        params['backup_db_list'] = [
            os.path.join(this_dir, 'test_db_file8.db'),
        ]  # the same
        mock_parser.return_value = params
        cloud_retention_scanner.main()
        # read the file contents
        db = utils.load_database(params['data_retention_db'], params)
        mock_deletion_func.assert_called_once_with('abc', db['abc'], params)
Example #4
    def test_calls_reminder_email_function_for_multiple_on_different_days(
            self, mock_reminder_func, mock_parser):
        params = self.static_params.copy()
        params['data_retention_db'] = os.path.join(this_dir,
                                                   'test_db_file6.db')
        params['backup_db_list'] = [
            os.path.join(this_dir, 'test_db_file6.db'),
        ]  # the same
        mock_parser.return_value = params
        cloud_retention_scanner.main()

        # read the file contents
        db = utils.load_database(params['data_retention_db'], params)
        calls = [
            mock.call('abc', db['abc'], params, 7),
            mock.call('def', db['def'], params, 14)
        ]
        mock_reminder_func.assert_has_calls(calls)
Example #5
def query_read(params):
    params = utils.load_paramDict(params)
    params['bootstrap'] = int(
        params['bootstrap']) if 'bootstrap' in params else 0

    data = utils.load_database(**params)

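    # 'stage' (default '0') marks where to resume: each block below runs only if
    # the starting stage is at or before that step, e.g. stage='2' skips
    # bowtie2matrix and summary_matrix and starts from ipopt.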
    if params.get('stage', '0') in '0':
        bowtie2matrix(**params)
    if params.get('stage', '0') in '01':
        summary_matrix(data, **params)
    if params.get('stage', '0') in '012':
        qvector = ipopt(least_amount=[params['minFreq'], params['minNum']],
                        **params)
    if params.get('stage', '0') in '0123':
        qvector = os.path.join(params['workspace'], 'ipopt.qmatrix.solution')
        assign_reads(data, qvector, **params)
    if params.get('stage', '0') in '01234':
        assign = os.path.join(params['workspace'], 'read_assignment.gz')
        profiling(data, assign, **params)

    import glob
    for fname in glob.glob(os.path.join(params['workspace'], 'r?.fastq')):
        subprocess.Popen(['gzip', '-f', fname]).communicate()
Example #6
                                   key=lambda x: (x[1][0], x[0]),
                                   reverse=True):
            if w / basic_aln[0] >= minFreq:
                fout.write('{0}\t{1:.4f}\t{2:.4f}\t{3} ({4})\n'.format(
                    g, w * 100.0 / basic_aln[0], w * 100.0 / basic_aln[1],
                    '|'.join(t), ','.join(r)))
    print('Profiling results are in {0}'.format(
        os.path.join(params['workspace'], 'profile.txt')))


if __name__ == '__main__':
    params = utils.load_params(sys.argv)
    params['bootstrap'] = int(
        params['bootstrap']) if 'bootstrap' in params else 0

    data = utils.load_database(**params)

    o_t = time.time()
    if params.get('stage', '0') in '0':
        bowtie2matrix(**params)
    if params.get('stage', '0') in '01':
        summary_matrix(data, **params)
    if params.get('stage', '0') in '012':
        qvector = ipopt(least_amount=[params['minFreq'], params['minNum']],
                        **params)
    if params.get('stage', '0') in '0123':
        qvector = os.path.join(params['workspace'], 'ipopt.qmatrix.solution')
        assign_reads(data, qvector, **params)
    if params.get('stage', '0') in '01234':
        assign = os.path.join(params['workspace'], 'read_assignment.gz')
        profiling(data, assign, **params)
Example #7
def main(project_mapping):
	"""
	project_mapping is a two-level nested dict.
	The first level's keys are the iLab project IDs and each one maps to a dict
	Each 'second level' dict has a bucket and client_emails key, which give the bucket name gs://<bucket name>
	and a list of emails, respectively
	"""

	logging.info('In cloud tracking module')
	logging.info('Project mapping: %s' % project_mapping)

	# get some configuration parameters
	params = utils.parse_config_file('TRACKING')
	
	# need to cleanup some of the parameters:
	try:
		params['retention_period'] = int(params['retention_period'])
		logging.info('Params read from config: %s' % params)
		logging.info('Retention period set to %s days' % params['retention_period'])
	except (KeyError, TypeError, ValueError):
		logging.error('Could not interpret one of the configuration parameters correctly.  Check that the intended data types match those in the config file.')
		sys.exit(1)		

	# set the expiration date
	target_date = datetime.datetime.now() + datetime.timedelta(days=params['retention_period'])

	# read the database file
	this_dir = os.path.dirname(os.path.realpath(__file__))
	params['data_retention_db'] = os.path.join(this_dir, params['data_retention_db'])
	if os.path.isfile(params['data_retention_db']):
		logging.info('About to parse database file')
		project_database = utils.load_database(params['data_retention_db'], params)
		logging.info('Parsed from the database: %s' % project_database)
	else:
		logging.error('Could not find a database file at %s' % params['data_retention_db'])
		raise MissingPrimaryDatabaseException('The primary database file is missing.  Fix that.')

	for project_id, info_dict in project_mapping.items():

		logging.info('Checking project with iLab ID: %s' % project_id)
		# perhaps we have an ongoing project- then a bucket for this iLab ID probably already exists
		if project_id in project_database:

			logging.info('project with ID %s was already in our database. Plan to update the deletion date' % project_id)
			# get the info we have about this in our database
			db_entry = project_database[project_id]
			
			# ensure the bucket names match.  If they do, simply update the retention target date and the email contacts
			if info_dict['bucket'] == db_entry['bucket']:
				logging.info('The delivery buckets matched, as expected')
				logging.info('Changing deletion date from %s to %s' % (db_entry['target_date'].strftime(params['date_format']), target_date.strftime(params['date_format'])))
				db_entry['target_date'] = target_date
				existing_emails = set(db_entry['client_emails'])
				new_emails = set(info_dict['client_emails'])
				total_emails = existing_emails.union(new_emails)
				logging.info('Original emails were: %s' % existing_emails)
				logging.info('New emails were: %s' % new_emails)
				logging.info('The union of those sets of emails is %s' % total_emails)
				db_entry['client_emails'] = list(total_emails)
			else:
				# somehow the same iLab project was placed into a different bucket.  Shouldn't happen, so raise an exception.  We 
				# retain a 1-to-1 mapping between iLab and bucket IDs.  Maybe later we change this behavior based on a particular use-case
				logging.error('The bucket name did not match that of a prior project with the same iLab ID.  This should not happen.')
				logging.error('The bucket found in the database was: %s' % db_entry['bucket'])
				logging.error('The bucket that was just uploaded the demux to was: %s' % info_dict['bucket'])
				raise MultipleBucketsForSameProjectException('The iLab IDs were the same, but the bucket was somehow different.  Someone needs to check this!')
				#TODO- send a message for someone to fix it.

		else:
			logging.info('A new project will be added to the database.')
			logging.info('Updating info dict.  Current contents: %s, adding target date: %s' % (info_dict, target_date))
			info_dict.update({'target_date': target_date})
			project_database[project_id] = info_dict

	logging.info('Project database: %s' % project_database)
				
	# now write to the database file:
	utils.write_to_db(project_database, params)
Example #8
File: search.py  Project: thomasba/orakel
	def reload_config(self):
		self.search_engines = load_database(self.filename)
		self.log.info("%d search engines loaded" %
				len(self.search_engines))
Example #9
	def reload_config(self):
		self.search_engines = load_database(self.search_engine_file)
Example #10
	def reload_config(self):
		self.messages = load_database(self.filename)
		self.log.info("%d messages loaded" % len(self.messages))
Example #11
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    gpus = [int(i) for i in args.gpu.split(',')]
    if len(gpus) == 1:
        torch.cuda.set_device(int(args.gpu))
    else:
        torch.cuda.set_device(gpus[0])

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %s' % args.gpu)
    logging.info("args = %s", args)

    criterion = nn.MSELoss()
    criterion = criterion.cuda()

    model = Network(args.init_channels, args.code_size, args.layers, criterion)
    model = model.cuda()

    if len(gpus) > 1:
        print("True")
        model = nn.parallel.DataParallel(model,
                                         device_ids=gpus,
                                         output_device=gpus[0])
        model = model.module

    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

    if args.dataset_name == 'CUB_200_2011':
        args.root = os.path.join(args.root, 'CUB_200_2011')
        dataset = CUB_200_2011
    elif args.dataset_name == 'Stanford_Dogs':
        args.root = os.path.join(args.root, 'Stanford_Dogs')
        dataset = Stanford_Dog
    else:
        logging.info('Dataset %s does not exist.' % args.dataset_name)
        sys.exit(1)
    train_data = dataset(args.root, if_train=True)
    num_train = len(train_data)
    indices = list(range(num_train))
    split = int(np.floor(args.train_portion * num_train))
    print('train data length: ', split)
    train_queue = DataLoader(
        train_data,
        batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        num_workers=8)

    database = RetrievalModel(args=args)
    database.cuda()
    assert os.path.isfile(args.load_db)
    utils.load_database(database, args.load_db)
    database.eval()

    optimizer = get_optimizer(model.parameters(), args.optimizer)

    for epoch in range(args.epochs):
        logging.info('epoch %d', epoch)

        genotype = model.genotype()
        logging.info('genotype = %s', genotype)

        print(F.softmax(model.alphas_normal, dim=-1))
        print(F.softmax(model.alphas_reduce, dim=-1))

        # training
        train(train_queue, model, criterion, optimizer, database)

        utils.save(model, os.path.join(args.save, 'weights.pt'))
Example #12
	def reload_config(self):
		self.search_engines = load_database(self.search_engine_file)
Example #13
def main():
  if not torch.cuda.is_available():
    logging.info('no gpu device available')
    sys.exit(1)

  np.random.seed(args.seed)

  gpus = [int(i) for i in args.gpu.split(',')]
  if len(gpus) == 1:
    torch.cuda.set_device(int(args.gpu))
  else:
    torch.cuda.set_device(gpus[0])

  # torch.cuda.set_device(args.gpu)
  # cudnn.benchmark = True
  torch.manual_seed(args.seed)
  # cudnn.enabled=True
  torch.cuda.manual_seed(args.seed)
  logging.info('gpu device = %s' % args.gpu)
  logging.info("args = %s", args)

  genotype = eval("genotypes.%s" % args.arch)
  model = Network(args.init_channels, args.code_size, args.layers, args.auxiliary, genotype)
  model = model.cuda()
  if args.continue_train:
    utils.load(model,args.load_query)
  if len(gpus)>1:
    print("True")
    model = nn.parallel.DataParallel(model, device_ids=gpus, output_device=gpus[0])
    model = model.module

  logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

  if args.dataset_name == 'CUB_200_2011':
    args.root = os.path.join(args.root, 'CUB_200_2011')
    dataset = CUB_200_2011
  elif args.dataset_name == 'Stanford_Dogs':
    args.root = os.path.join(args.root, 'Stanford_Dogs')
    dataset = Stanford_Dog
  else:
    logging.info('Dataset %s does not exist.' % args.dataset_name)
    sys.exit(1)
  train_queue=DataLoader(dataset(args.root, if_train=True), batch_size=args.batch_size,
                                shuffle=True, num_workers=8)

  database = RetrievalModel(args=args)
  database.cuda()
  if len(gpus)>1:
    print("True")
    database = nn.parallel.DataParallel(database, device_ids=gpus, output_device=gpus[0])
    database = database.module

  assert os.path.isfile(args.load_db)
  utils.load_database(database, args.load_db)
  database.eval()

  criterion = nn.MSELoss()
  criterion = criterion.cuda()

  optimizer = get_optimizer(model.parameters(), args.optimizer)

  if len(gpus)>1:
    optimizer = nn.DataParallel(optimizer, device_ids=gpus).module
  
  for epoch in range(args.epochs):
    logging.info('epoch %d', epoch)
    model.drop_path_prob = args.drop_path_prob * epoch / args.epochs
    train(train_queue, model, criterion, optimizer,database)
    utils.save(model, os.path.join(args.save, 'weights.pt'))
Example #14
def db_MapDB(params):
    params = utils.load_paramDict(params)
    params['dbtype'] = params.get('dbtype', 'minimap2')
    db_columns = [
        c for c in params['db_columns'] + params['metadata_columns'] +
        params['taxa_columns'] if c not in ('sha256',)
    ]

    assert params.get('seqlist', None) is not None, 'seqlist is required. '

    data = utils.load_database(**params)

    if params['seqlist'] in ('stdin', '-', ''):
        fin = sys.stdin
    else:
        fin = open(params['seqlist'])
    glist = pd.read_csv(fin, delimiter='\t', dtype='str')
    fin.close()

    mapdb = params['MapDB']
    mapdb = os.path.join(params['bowtie_db'], mapdb)
    start_id = 0

    indices = {i: 1 for i in glist['index'].tolist()}

    if len(glob.glob(mapdb + '.*')) > 0:
        assert params.get('mode', '') in (
            'overwrite', 'append'
        ), 'Old database with same name present. You have to use a new name with "MapDB=", or choose between "mode=overwrite" and "mode=append".'
        if params.get('mode', '') == 'overwrite':
            for fname in glob.glob(mapdb + '.*'):
                os.unlink(fname)
        elif params.get('mode', '') == 'append':
            for fname in glob.glob(mapdb + '.*.taxa.gz'):
                i = int(fname.rsplit('.', 3)[1])
                if i >= start_id:
                    start_id = i + 1
                with gzip.open(fname, 'rt') as fin:
                    for line in fin:
                        indices[line.strip().split()[1]] = 2
    data = data.set_index('index', drop=False)
    data['size'] = data['size'].astype(int)
    data = data.loc[[i for i, t in indices.items()
                     if t == 1]].sort_values(by=['size'], ascending=[False])
    min_file_num = int(np.ceil(
        np.sum(data['size']).astype(float) / 3800000000))

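    # Greedy first-fit packing: genomes (already sorted by size, descending) are
    # placed into the first bucket that stays under ~3.8 GB, scanning from just
    # after the last bucket used; a new bucket is appended if none has room.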
    buckets = [[0, []] for n in range(min_file_num)]
    id = -1
    for index, size, file_path, url_path in data[[
            'index', 'size', 'file_path', 'url_path'
    ]].values:
        size, done = int(size), 0
        for id in list(range(id + 1, len(buckets))) + list(range(id + 1)):
            b = buckets[id]
            if b[0] + size <= 3800000000:
                b[0] += size
                b[1].append([index, size, file_path, url_path])
                done = 1
                break
        if done == 0:
            buckets.append([size, [[index, size, file_path, url_path]]])
    if params['dbtype'] == 'minimap2':
        pool = Pool(min(params['n_thread'], len(buckets)))
        result = pool.imap_unordered(create_db, [[
            params['minimap2'], mapdb, start_id + id, bucket[1],
            params['dbtype']
        ] for id, bucket in enumerate(buckets)])
        #result = map(create_db, [[params['minimap2'], mapdb, start_id + id, bucket[1], params['dbtype']] for id, bucket in enumerate(buckets)])
    else:
        result = map(create_db, [[
            params['malt_build'], mapdb, start_id + id, bucket[1],
            params['dbtype']
        ] for id, bucket in enumerate(buckets)])
    for r in result:
        if r[2] != 0:
            print('Database {0}.{1} FAILED with code {2}!'.format(*r))

    with open(mapdb + '.info', 'w') as fout:
        for id, bucket in enumerate(buckets):
            for b, _, _, _ in bucket[1]:
                fout.write('{0}\t{1}\n'.format(b, id + start_id))
    print('Done')

if __name__ == '__main__':
    db_MapDB(
        dict([[k.strip() for k in arg.split('=', 1)]
              for arg in sys.argv[1:]]))
Example #15
	def reload_config(self):
		self.messages = load_database(self.filename)
		self.log.info("%d messages loaded" % len(self.messages))