def process_cghub(config, type_uri='detail', log=None, removedups=False, limit=-1, verbose=False, print_response=False):
    """
    return type:
        tumor_type2cghub_records: organizes the cghub record classes per tumor type
    """
    log_info(log, 'begin process cghub')
    module = import_module(config['cghub_module'])
    mappings = config['metadata_locations']['cghub']
    cghub_records, _ = module.main(mappings['study'], log=log, removedups=removedups, limit=limit)
    tumor_type2cghub_records = {}
    count = 0
    seen_bad_codes = set()
    for cghub_record in cghub_records:
        if 0 == count % 8192:
            log_info(log, '\tprocess %s cghub records' % (count))
        count += 1
        tumor_type2cghub_records.setdefault(cghub_record.disease_abbr, []).append(
            create_cghub_metadata(mappings, cghub_record, seen_bad_codes, log))
    log_info(log, 'finished process cghub: %s total records' % (count))
    return tumor_type2cghub_records
def __init__(self, mem, subdir, memfile="Memfile"):
    def set_attr(attr, val):
        object.__setattr__(self, attr, val)
    set_attr("mem", mem)
    set_attr("orig_dir", os.path.abspath(os.curdir))
    set_attr("subdir", os.path.join(self.orig_dir, subdir))
    set_attr("memfile", os.path.join(self.subdir, memfile))
    set_attr("mf", util.import_module(self.memfile, self.memfile))
def insert_metadata(config, table_columns, table_rows, log):
    try:
        datastore = import_module(config['database_module'])
        tables = ['metadata_clinical', 'metadata_biospecimen', 'metadata_data', 'metadata_samples']
        for index in range(len(tables)):
            datastore.ISBCGC_database_helper.column_insert(config, list(table_rows[index]), tables[index], table_columns[index], log)
    except Exception as e:
        log.exception('problem saving metadata to the database')
        raise e
def get_song_checker(player_name):
    'Returns the "currentSong" function for the specified player.'
    if player_name not in song_checkers:
        modname = 'nowplaying.%s' % player_name
        if modname not in sys.modules:
            sys.modules[modname] = import_module(modname)
    try:
        return song_checkers[player_name]
    except KeyError:
        return None
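
# A hedged sketch of what a nowplaying.<player> plugin module presumably does at import time so
# that the song_checkers lookup above can succeed; the registry import path and the example
# player name are assumptions for illustration, not taken from the snippet.
# --- nowplaying/exampleplayer.py (hypothetical) ---
from nowplaying.registry import song_checkers  # assumed location of the shared registry dict

def currentSong():
    # return whatever the player reports as the currently playing track
    return 'Artist - Title'

song_checkers['exampleplayer'] = currentSong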
def add_task():
    """
    :param url: str target url
    :param name: task name
    :param url_type: desc info
    :param script: python script for handler url response
    :param start method: script scheduler method
    :param depends: requirements txt
    :param scheduler_type: interval or cron
    :return:
    """
    script = request.files.get('script')
    task = request.form.copy().to_dict()
    if script:
        path = 'crawlers/' + script.filename
        script.save(path)
    else:
        if 'script' in task:
            del task['script']
    depends = task.get('depends', None)
    if depends:
        for depend in depends.split(','):
            c_o = commands.getoutput('pip install %s' % depend)
            LOG.info('c_o:%s', c_o)
        del task['depends']
    trigger_value = task['trigger_value']
    trigger_dict = {}
    for entry in trigger_value.split(','):
        key, value = entry.split(':', 1)
        trigger_dict[key] = int(value) if value.isdigit() else value
    del task['trigger_value']
    task.update(trigger_dict)
    try:
        if scheduler.get_job(task['id']):
            scheduler.remove_job(task['id'])
        # if the crawler file changed, we should reload it
        crawler_module = util.import_module(task['func'])
        reload(crawler_module)
        job = scheduler.add_job(**task)
        return redirect('/index.html')
    except ConflictingIdError:
        return jsonify(dict(error_message='Job %s already exists.' % task.get('id')), status=409)
    except Exception as e:
        LOG.exception(e)
        return jsonify(dict(error_message=str(e)), status=500)
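
# A hedged example of the form fields the add_task handler above consumes; the id, func, and
# depends values are hypothetical, but the trigger_value format ("key:value" pairs, comma
# separated, with digit values coerced to int) follows directly from the parsing loop.
example_form = {
    'id': 'example_crawler',             # job id checked against / removed from the scheduler
    'func': 'crawlers.example_crawler',  # module path that is imported, reloaded, and passed to add_job
    'trigger': 'interval',               # interval or cron, per the docstring
    'trigger_value': 'seconds:30',       # parsed into {'seconds': 30} and merged into the job kwargs
    'depends': 'requests,lxml',          # pip-installed one by one, then removed from the task dict
}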
def test_mirna_isoform(self):
    self.config['input_id_file'] = 'gdc/doc/gdc_manifest_mirnaiso.2016-12-12_test_40.tsv'
    module = import_module(self.config['gcs_wrapper'])
    module.open_connection(self.config, self.log)
    try:
        project = 'TCGA-UCS'
        data_type = 'Isoform Expression Quantification'
        self.run_upload(self.config, project, data_type, self.log)
    finally:
        module.close_connection()
def instantiate_etl_class(config, program_name, data_type, log):
    etl_class = None
    if data_type in config[program_name]['process_files']['datatype2bqscript']:
        log.info('\t\t\tinstantiating etl class %s' % (config[program_name]['process_files']['datatype2bqscript'][data_type]['class']))
        etl_module_name = config[program_name]['process_files']['datatype2bqscript'][data_type]['python_module']
        module = import_module(etl_module_name)
        etl_class_name = config[program_name]['process_files']['datatype2bqscript'][data_type]['class']
        Etl_class = getattr(module, etl_class_name)
        etl_class = Etl_class(config)
    return etl_class
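
# A hedged sketch of the config shape instantiate_etl_class() reads; the program, data type,
# module, and class names below are hypothetical placeholders, and only the key nesting
# (process_files -> datatype2bqscript -> python_module/class) is taken from the code above.
example_config = {
    'TCGA': {
        'process_files': {
            'datatype2bqscript': {
                'Isoform Expression Quantification': {
                    'python_module': 'gdc.etl.mirna_isoform',  # loaded with import_module()
                    'class': 'MirnaIsoform',                   # looked up on that module with getattr()
                },
            },
        },
    },
}
# etl = instantiate_etl_class(example_config, 'TCGA', 'Isoform Expression Quantification', log)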
def store_metadata(config, log, table, key2metadata):
    '''
    calls the store_metadata method in the module specified by the configuration file

    parameters:
        config: the configuration map
        log: logger to log any messages
        table: the mysql table to save the metadata to
        key2metadata: the metadata to save
    '''
    metadata_modules = config['metadata_modules']
    for metadata_module in metadata_modules:
        module = import_module(metadata_module)
        module.store_metadata(config, log, table, key2metadata)
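
# A minimal sketch of what one entry in config['metadata_modules'] is expected to provide so the
# dispatcher above can call into it; the module name and body are hypothetical, and only the
# store_metadata(config, log, table, key2metadata) signature comes from the call site.
# --- example_metadata_module.py (hypothetical) ---
def store_metadata(config, log, table, key2metadata):
    log.info('storing %s records into %s' % (len(key2metadata), table))
    # persist key2metadata to this module's backing store here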
def finalize(config, log):
    if config['process_annotation']:
        call_metadata2annotation(config, log)
    if config['process_case'] and config['process_metadata_attrs']:
        for program_name in config['program_names']:
            if 0 == len(config['program_name_restrict']) or program_name in config['program_name_restrict']:
                postproc_module = import_module(config[program_name]['process_cases']['postproc_case']['postproc_module'])
                postproc_module.process_metadata_attrs(config, log)
    if 'process_images' in config and config['process_images']:
        process_images(config, log)
def initializeDB(config, log):
    module = import_module(config['database_module'])
    helper = module.ISBCGC_database_helper
    helper.initialize(config, log)

    # populate the data_availability tables
    if config['update_schema']:
        isb_labels = set(config['data_type2isb_label'].values())
        for build in config['genomic_builds']:
            params = [[build, isb_label] for isb_label in isb_labels]
            for program_name in config['program_name_restrict']:
                helper.column_insert(config, params, '%s_metadata_data_type_availability' % (program_name), ['genomic_build', 'isb_label'], log)
def setup(self, opts):
    self.uri = opts.get('Uri')                   # url to load
    self.rate = int(opts.get('Rate', 30))        # rate in seconds
    self.timeout = int(opts.get('Timeout', 30))  # timeout for IO operations

    # load known URI schemes
    scheme = urlparse.urlparse(self.uri).scheme
    if scheme == 'http' or scheme == 'https':
        # load a page over http
        self.update = self.update_http
    elif scheme == 'file':
        # load data from a file
        self.update = self.update_file
    elif scheme == 'python':
        # load data by calling a python function
        u = util.import_module(urlparse.urlparse(self.uri).netloc)
        self.update = lambda: self.process(u(opts))
    else:
        raise ValueError("Unknown URI scheme: " + scheme)
def __init__(self, channel_name, encoder_names):
    # the channel this tube will use
    self.channel = dict()
    # the list of encoders this tube will use - order matters
    self.encoders = list()

    if not isinstance(encoder_names, list):
        raise TypeError("Encoders must be specified as a list of string names.")
    if not channel_name or not isinstance(channel_name, str):
        raise TypeError("Channel name must be specified as a string.")

    channel_class = util.import_module('sneakers.channels', channel_name)

    for encoder in encoder_names:
        if not encoder or not isinstance(encoder, str):
            raise TypeError("Encoders must be specified as a list of string names.")
        # load the encoder class the same way as the channel class
        encoder_class = util.import_module('sneakers.encoders', encoder.lower())
        self.encoders.append({'name': encoder, 'class': encoder_class()})

    self.channel = {'name': channel_name, 'class': channel_class()}
def uploadTCGA(configFileName):
    print datetime.now(), 'begin uploadTCGA()'
    global executor
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)
        log_dir = str(date.today()).replace('-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(log_dir, 'top_processing')
        log = logging.getLogger(log_name)
        log.info('begin uploadTCGA()')
        executor = futures.ThreadPoolExecutor(max_workers=config['threads'])
        module = import_module(config['database_module'])
        module.ISBCGC_database_helper.initialize(config, log)

        if config['upload_files'] or config['upload_etl_files']:
            # open the GCS wrapper here so it can be used by all the tumor types/platforms to save files
            gcs_wrapper.open_connection()

        info_status(config, log)
        tumor_type2platform2archive_types2archives, platform2archive2metadata = process_latestarchive(config, log_name)
        prepare_upload(tumor_type2platform2archive_types2archives, log)
        if 'process_cghub' not in config or config['process_cghub']:
            tumor_type2cghub_records = process_cghub(config, log=log, removedups=True, limit=-1)
        else:
            log.warning('\n\t====================\n\tnot processing CGHub records this run!\n\t====================')
            tumor_type2cghub_records = {}
        barcode2metadata = process_metadata_current(config, log_name)
        if 'process_annotations' not in config or config['process_annotations']:
            barcode2annotations = process_annotations(config, log_name)
        else:
            log.warning('\n\t====================\n\tnot processing annotations this run!\n\t====================')
            barcode2annotations = {}
        process_tumortypes(config, log_dir, tumor_type2platform2archive_types2archives, platform2archive2metadata,
                           tumor_type2cghub_records, barcode2metadata, barcode2annotations, log)
    finally:
        if executor:
            executor.shutdown(wait=False)
        if gcs_wrapper:
            gcs_wrapper.close_connection()
        log.info('finish uploadTCGA()')
        print datetime.now(), 'finish uploadTCGA()'
def register_config_items(configitem):
    """Registers config items with Mercurial's registrar.

    The argument is a ``registrar.configitem`` instance.
    """
    # TRACKING hg43
    configitems = import_module('mercurial.configitems')

    configitem(b'bugzilla', b'username', default=configitems.dynamicdefault)
    configitem(b'bugzilla', b'apikey', default=configitems.dynamicdefault)
    configitem(b'bugzilla', b'password', default=configitems.dynamicdefault)
    configitem(b'bugzilla', b'userid', default=configitems.dynamicdefault)
    configitem(b'bugzilla', b'cookie', default=configitems.dynamicdefault)
    configitem(b'bugzilla', b'firefoxprofile', default=configitems.dynamicdefault)
    configitem(b'bugzilla', b'url', default=configitems.dynamicdefault)
    configitem(b'mozilla', b'trustedbmoapikeyservices', default=configitems.dynamicdefault)
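
# A hedged sketch of how a Mercurial extension would typically obtain the ``configitem`` function
# passed to register_config_items(), following the standard registrar pattern; the module-level
# configtable dict is conventional and not shown in the snippet above.
from mercurial import registrar

configtable = {}
configitem = registrar.configitem(configtable)

register_config_items(configitem)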
def uploadGDC():
    print datetime.now(), 'begin uploadGDC()'
    gcs_wrapper = None
    try:
        args = parseargs()
        with open(args.config) as configFile:
            config = json.load(configFile)

        log_dir = str(date.today()).replace('-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(log_dir, 'top_processing')
        log = logging.getLogger(log_name)
        log.info('begin uploadGDC()')

        initializeDB(config, log)

        if config['upload_files'] or config['upload_etl_files']:
            # open the GCS wrapper here so it can be used by all the projects/platforms to save files
            gcs_wrapper = import_module(config['gcs_wrapper'])
            gcs_wrapper.open_connection(config, log)

        for endpt_type in config['endpt_types']:
            log.info('processing %s endpoints' % (endpt_type))
            if config['process_annotation']:
                process_annotations(config, endpt_type, log_dir)
            else:
                log.warning('\n\t====================\n\tnot processing annotations this run!\n\t====================')
            process_programs(config, endpt_type, log_dir, log)

        finalize(config, log)
    except:
        raise
    finally:
        if gcs_wrapper:
            gcs_wrapper.close_connection()
        log.info('finished uploadGDC()')
        print datetime.now(), 'finished uploadGDC()'
def __insert_rows(config, endpt_type, tablename, values, mapfilter, log):
    maps = []
    for value in values:
        maps += flatten_map(value, mapfilter)
    print_list_synopsis(maps, '\t\trows to save for %s' % (tablename), log)

    module = import_module(config['database_module'])
    fieldnames = module.ISBCGC_database_helper.field_names(tablename)
    rows = []
    for nextmap in maps:
        rows += __addrow(endpt_type, fieldnames, nextmap, log)

    if config['update_cloudsql']:
        # def select(cls, config, stmt, log, params = [], verbose = True):
        wherelist = []
        for fieldname in fieldnames:
            wherelist += ['%s = %%s' % (fieldname)]
        stmt = 'select %s from %s where %s' % (fieldnames[0], tablename, ' and '.join(wherelist))
        count = 0
        for index in range(8):
            if len(rows) == index:
                break
            result = module.ISBCGC_database_helper.select(config, stmt, log, rows[index])
            count += 1 if len(result) > 0 else 0
        if count == min(len(rows), 8):
            log.warning('\n\t====================\n\tfirst %d records already saved for %s, skipping\n\t====================' % (count, tablename))
            return
        elif 0 < count:
            raise ValueError('only some of the first %d records were saved for %s' % (count, tablename))
        module.ISBCGC_database_helper.column_insert(config, rows, tablename, fieldnames, log)
    else:
        log.warning('\n\t====================\n\tnot saving to cloudsql to %s this run!\n\t====================' % (tablename))
def process_cases(config, endpt_type, program_name, project_name, log_dir):
    try:
        log_name = create_log(log_dir, project_name + '_cases')
        log = logging.getLogger(log_name)
        log.info('begin process_cases(%s)' % (project_name))

        case2info = get_map_rows(config, endpt_type, 'case', program_name, get_filter(project_name), log)
        save2db(config, endpt_type, '%s_metadata_clinical' % (program_name), case2info,
                config[program_name]['process_cases']['clinical_table_mapping'], log)
        remove_null_samples(case2info, log)
        save2db(config, endpt_type, '%s_metadata_biospecimen' % (program_name), case2info,
                config[program_name]['process_cases']['sample_table_mapping'], log)

        # fill in the rest of the metadata depending on the program
        if 0 < len(case2info.values()):
            postproc_module = import_module(config[program_name]['process_cases']['postproc_case']['postproc_module'])
            postproc_module.postprocess(config, project_name, endpt_type, log)

        log.info('finished process_cases(%s)' % (project_name))
        # log.info('begin process_cases(%s) for omf files' % (project_name))
        # omf2info = get_omf_map_rows(config, project_name, log)
        # save2db(config, 'metadata_gdc_clinical', case2info, config['process_cases']['clinical_table_mapping'], log)
        # log.info('finished process_cases(%s) for omf files' % (project_name))
        return case2info
    except:
        log.exception('problem processing cases(%s):' % (project_name))
        raise
    finally:
        close_log(log)
def store_metadata(config, log, table, key_metadata):
    if not config['process_bio']:
        return
    count = 0
    count_upload = 0
    not_data_fields = set()
    upload_exts = set()
    field2stats = {}
    nospecies = []
    nosdrf = set()
    insert_str_listlist = []
    datastore = import_module(config['database_module'])
    field_names = datastore.ISBCGC_database_helper.field_names(table)
    field_name2column = dict([(column_name, index) for index, column_name in enumerate(field_names)])
    list_fields = config['list_fields']
    log.info('\tstarting store metadata')
    for metadata in key_metadata.itervalues():
        try:
            # skip cellline/control samples:
            if 'SampleBarcode' in metadata and '20' == metadata['SampleBarcode'][13:15]:
                continue
            list_values = [None for _ in range(len(field_name2column))]
            for field, value in metadata.iteritems():
                # make sure every string field is stored as a string rather than unicode
                # unicode is stored as a blob in the datastore
                try:
                    if value in (None, '->'):
                        metadata[field] = None
                    else:
                        if field in list_fields:
                            metadata[field] = [str(v.encode('ascii', 'ignore').strip()) for v in value]
                            try:
                                value = '(' + ','.join(value) + ')'
                            except:
                                log.exception('problem setting list %s: %s' % (field, value))
                        else:
                            metadata[field] = str(value.encode('ascii', 'ignore').strip())
                        # and add to the statistics for this field
                        stats = field2stats.setdefault(field, [0, set()])
                        stats[0] += 1
                        try:
                            stats[1].add(value)
                        except:
                            log.exception('problem adding value %s' % (value))
                        if field in field_name2column:
                            list_values[field_name2column[field]] = value
                        else:
                            not_data_fields.add(field)
                        if 'DatafileUploaded' == field and 'true' == value:
                            count_upload += 1
                            upload_exts.add(metadata['DatafileName'][metadata['DatafileName'].rfind('.') + 1:])
                except Exception as e:
                    log.exception("problem with converting to string and recording stats for %s: %s" % (field, value))
                    raise e
            insert_str_listlist += [list_values]
            archive = metadata['DataArchiveName'] if 'DataArchiveName' in metadata else 'NO_ARCHIVE'
            if 'metadata_data' == table and 'Species' not in metadata:
                nospecies.append(metadata['DatafileName'] + ' ' + metadata['DataLevel'] + ' ' + metadata['DatafileUploaded'] + ' ' + archive)
            if 'metadata_data' == table and 'SDRFFileName' not in metadata and 'project_accession' not in metadata:
                nosdrf.add(metadata['DatafileName'] + ' ' + metadata['DataLevel'] + ' ' + metadata['DatafileUploaded'] + ' ' + archive)
            if 0 == count % 1024:
                log.info('\tinsert statement %s file metadata. latest: %s' % (count, metadata))
            count += 1
        except Exception as e:
            log.exception('problem in store_metadata()')
            raise e

    log.info('\tsetup %s total records, %s to upload with extensions %s' % (count, count_upload, ','.join(upload_exts)))
    # now save to cloudsql
    datastore.ISBCGC_database_helper.insert(config, insert_str_listlist, table, log)

    log.info('\tstatistics:')
    fields = field2stats.keys()
    fields.sort()
    for field in fields:
        stats = field2stats[field]
        try:
            log.info('\t\tstats for %s(%s:%s): %s' % (field, len(stats[1]), stats[0], ','.join(stats[1]) if 21 > len(stats[1]) else stats[1].pop()))
        except:
            log.exception('problem with %s(total: %s distinct: %s)' % (field, stats[0], len(stats[1])))

    if 0 < len(nospecies) or 0 < len(nosdrf):
        if nospecies == nosdrf:
            log.warning('\tfiles with no species match files with no sdrf file(%s):\n\t\t%s' % (len(nospecies), '\n\t\t'.join(list(nospecies)[:150])))
        else:
            log.warning('\tfiles with no species(%s):\n\t\t%s' % (len(nospecies), '\n\t\t'.join(list(nospecies)[:150])))
            log.warning('\tfiles with no sdrf file(%s):\n\t\t%s' % (len(nosdrf), '\n\t\t'.join(list(nosdrf)[:150])))
    log.info('\tfinished store metadata. stored %s total records' % (count))
def store_metadata(config, log, table, key_metadata):
    if not config['process_bio']:
        return
    count = 0
    count_upload = 0
    not_data_fields = set()
    upload_exts = set()
    field2stats = {}
    fields2value = {}       # per-field count of non-null values (referenced below)
    fields2maxlength = {}   # per-field maximum value length (referenced below)
    nospecies = []
    nosdrf = set()
    insert_str_listlist = []
    datastore = import_module(config['database_module'])
    field_names = datastore.ISBCGC_database_helper.field_names(table)
    field_name2column = dict([(column_name, index) for index, column_name in enumerate(field_names)])
    list_fields = config['list_fields']
    log.info('\tstarting store metadata for %s' % (table))
    # test issue 758
    maftype2protocol2count = {}
    for metadata in key_metadata.itervalues():
        try:
            # skip cellline/control samples:
            if 'SampleBarcode' in metadata and '20' == metadata['SampleBarcode'][13:15]:
                continue
            list_values = [None for _ in range(len(field_name2column))]
            for field, value in metadata.iteritems():
                # make sure every string field is stored as a string rather than unicode
                # unicode is stored as a blob in the datastore
                try:
                    if value in (None, '->', '', 'None'):
                        metadata[field] = None
                    else:
                        fields2value[field] = fields2value.setdefault(field, 0) + 1
                        fields2maxlength[field] = max(fields2maxlength.setdefault(field, 0), len(value))
                        if field in list_fields:
                            metadata[field] = [str(v.encode('ascii', 'ignore').strip()) for v in value]
                            try:
                                value = '(' + ','.join(value) + ')'
                            except:
                                log.exception('problem setting list %s: %s' % (field, value))
                        else:
                            metadata[field] = str(value.encode('ascii', 'ignore').strip())
                        # and add to the statistics for this field
                        stats = field2stats.setdefault(field, [0, set()])
                        stats[0] += 1
                        try:
                            stats[1].add(value)
                        except:
                            log.exception('problem adding value %s' % (value))
                        if field in field_name2column:
                            list_values[field_name2column[field]] = value
                        else:
                            not_data_fields.add(field)
                        # test issue 758
                        if 'SecurityProtocol' == field:
                            protocol2count = None
                            if metadata['DatafileName'].endswith('protected.maf'):
                                protocol2count = maftype2protocol2count.setdefault('protected', {})
                            elif metadata['DatafileName'].endswith('somatic.maf'):
                                protocol2count = maftype2protocol2count.setdefault('somatic', {})
                            elif metadata['DatafileName'].endswith('vcf'):
                                protocol2count = maftype2protocol2count.setdefault('vcf', {})
                            if dict == type(protocol2count):
                                newcount = protocol2count.setdefault(metadata['SecurityProtocol'], 0) + 1
                                protocol2count[metadata['SecurityProtocol']] = newcount
                        if 'DatafileUploaded' == field and 'true' == value:
                            log.info('DatafileUploaded should not be true for %s:%s' % (metadata['Platform'], metadata['DatafileName']))
                            count_upload += 1
                            upload_exts.add(metadata['DatafileName'][metadata['DatafileName'].rfind('.') + 1:])
                except Exception as e:
                    log.exception("problem with converting to string and recording stats for %s: %s" % (field, value))
                    raise e
            insert_str_listlist += [list_values]
            archive = metadata['DataArchiveName'] if 'DataArchiveName' in metadata else 'NO_ARCHIVE'
            if 'metadata_data' == table and 'Species' not in metadata:
                nospecies.append(metadata['DatafileName'] + ' ' + metadata['DataLevel'] + ' ' + metadata['DatafileUploaded'] + ' ' + archive)
            if 'metadata_data' == table and 'SDRFFileName' not in metadata and 'project_accession' not in metadata:
                nosdrf.add(metadata['DatafileName'] + ' ' + metadata['DataLevel'] + ' ' + metadata['DatafileUploaded'] + ' ' + archive)
            if 0 == count % 1024:
                log.info('\tinsert statement %s file metadata. latest: %s' % (count, metadata))
            count += 1
        except Exception as e:
            log.exception('problem in store_metadata()')
            raise e

    log.info('\tsetup %s total records, %s to upload with extensions %s' % (count, count_upload, ','.join(upload_exts)))
    # test issue 758
    if 0 < len(maftype2protocol2count):
        log.info('\tmaf security binning:')
        for filetype, protocol2count in maftype2protocol2count.iteritems():
            for protocol, count in protocol2count.iteritems():
                log.info('\t\t%s: %s\t%s' % (filetype, protocol, count))
    # now save to cloudsql
    datastore.ISBCGC_database_helper.insert(config, insert_str_listlist, table, log)

    log.info('\tstatistics:')
    fields = field2stats.keys()
    fields.sort()
    for field in fields:
        stats = field2stats[field]
        try:
            log.info('\t\tstats for %s(%s:%s): %s' % (field, len(stats[1]), stats[0], ','.join(stats[1]) if 21 > len(stats[1]) else stats[1].pop()))
        except:
            log.exception('problem with %s(total: %s distinct: %s)' % (field, stats[0], len(stats[1])))

    if 0 < len(nospecies) or 0 < len(nosdrf):
        if nospecies == nosdrf:
            log.warning('\tfiles with no species match files with no sdrf file(%s):\n\t\t%s' % (len(nospecies), '\n\t\t'.join(list(nospecies)[:150])))
        else:
            log.warning('\tfiles with no species(%s):\n\t\t%s' % (len(nospecies), '\n\t\t'.join(list(nospecies)[:150])))
            log.warning('\tfiles with no sdrf file(%s):\n\t\t%s' % (len(nosdrf), '\n\t\t'.join(list(nosdrf)[:150])))
    log.info('\tfinished store metadata. stored %s total records' % (count))
def uploadTCGA(configFileName):
    '''
    based on the configuration map loaded from the configFileName, loads the DCC data into GCS.
    also obtains metadata based on file paths, SDRF values, and CGHub manifest values

    parameters:
        configFileName: the file name of the configuration map
    '''
    print datetime.now(), 'begin uploadTCGA()'
    global executor
    gcs_wrapper = None
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)
        run_dir = str(date.today()).replace('-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(run_dir, 'top_processing')
        log = logging.getLogger(log_name)
        log.info('begin uploadTCGA()')
        executor = futures.ThreadPoolExecutor(max_workers=config['threads'])

        info_status(config, log)
        setup_database(config, log)

        # open the GCS wrapper here so it can be used by all the tumor types/platforms to save files
        gcs_wrapper = import_module(config['gcs_wrapper'])
        gcs_wrapper.open_connection(config, log)

        tumor_type2platform2archive_types2archives, platform2archive2metadata = process_latestarchive(config, run_dir, log_name)
        prepare_upload(tumor_type2platform2archive_types2archives, log)
        if 'process_cghub' not in config or config['process_cghub']:
            tumor_type2cghub_records = process_cghub(config, run_dir, log=log, removedups=True, limit=-1)
        else:
            log.warning('\n\t====================\n\tnot processing CGHub records this run!\n\t====================')
            tumor_type2cghub_records = {}
        barcode2metadata = process_metadata_current(config, run_dir, log_name)
        if 'process_annotations' not in config or config['process_annotations']:
            barcode2annotations = process_annotations(config, run_dir, log_name)
        else:
            log.warning('\n\t====================\n\tnot processing annotations this run!\n\t====================')
            barcode2annotations = {}
        process_tumortypes(config, run_dir, tumor_type2platform2archive_types2archives, platform2archive2metadata,
                           tumor_type2cghub_records, barcode2metadata, barcode2annotations, log)

        # associate the annotation metadata with the other metadata tables
        associate_metadata2annotation(config, log)

        # print out the stats
        metadata_modules = config['metadata_modules']
        for metadata_module in metadata_modules:
            module = import_module(metadata_module)
            module.print_combined_stats(log)
    finally:
        if executor:
            executor.shutdown(wait=False)
        log.info('finish uploadTCGA()')
        try:
            # upload the logs and TCGA files used for upload to GCS
            upload_run_files(config, run_dir, log)
        except Exception as e:
            log.exception('problem moving the logs and run files to GCS')
        finally:
            if gcs_wrapper:
                gcs_wrapper.close_connection()
        print datetime.now(), 'finish uploadTCGA()'
def store_metadata(config, log, table, key2metadata):
    metadata_modules = config['metadata_modules']
    for metadata_module in metadata_modules:
        module = import_module(metadata_module)
        module.store_metadata(config, log, table, key2metadata)
from functools import wraps
from unittest import TestCase

from mock import patch, MagicMock, call

import util
import gpg_wrapper_common

MODULE_NAME = "gpg_key"
LIBRARY_PATH = "library"

module = util.import_module(MODULE_NAME, LIBRARY_PATH)


class GPGWrapperTestCase(gpg_wrapper_common.GPGKeyTestCase):
    gpg_class = module.GPG


def patch_path(keyring='', exists=True, isdir=False):
    from os.path import isdir as original_isdir
    from os.path import exists as original_exists

    def _isdir(path):
        if path == keyring:
            return isdir
        return original_isdir

    def _exists(path):
        if path == keyring:
            return exists
        return original_exists

    def decorator(method):
        @wraps(method)
def __init__(self, mem, subdir, memfile="Memfile"):
    self.mem = mem
    self.orig_dir = os.path.abspath(os.curdir)
    self.subdir = os.path.join(self.orig_dir, subdir)
    self.memfile = os.path.join(self.subdir, memfile)
    self.mf = util.import_module(self.memfile, self.memfile)
from mock import patch, call, MagicMock
from functools import wraps
from unittest import TestCase
from mock import Mock

from ansible.runner.return_data import ReturnData

import gpg_common
import gpg_remote_common
import gpg_plugin_common
import gpg_wrapper_common
from gpg_remote_common import patch_execute_module, provided_mocks
import util

MODULE_NAME = "gpg"
LIBRARY_PATH = "plugins/action_plugins"

module = util.import_module(MODULE_NAME, LIBRARY_PATH, _as="_gpg")


class ConnectionTestCase(gpg_remote_common.ConnectionTestCase):
    connection_class = module.Connection
    connection_error = module.ConnectionError

    @patch_execute_module(return_value=1)
    def test_run_gpg(self, e_m):
        c = self.get_connection()
        result = c.run_gpg(a=1, b=2)
        args = dict(a=1, b=2)
        e_m.assert_called_once_with('gpg', args, check=False)
        self.assertEqual(result, 1)


def patch_tempfile(name="saymyname"):
    def decorator(method):
def main(configFilename, openETLFilename, contETLFilename, outputFilename, contBucketContentFilename):
    '''
    process the two ETL files and obtain the metadata for the filenames in contETLFilename that
    aren't in openETLFilename and save to the output file

    parameters:
        configFilename: the file name of the configuration map
        openETLFilename: the data metadata file produced by the open access upload run (note: this will
            have controlled access metadata for those controlled access files who appear in the metadata)
        contETLFilename: the data metadata file produced by the controlled access upload run
        outputFilename: the file to write the results to
        contBucketContentFilename: the listing of the controlled access bucket contents
    '''
    with open(configFilename) as configFile:
        config = json.load(configFile)

    run_dir = str(date.today()).replace('-', '_') + '_' + config['log_dir_tag'] + '/'
    log_name = create_log(run_dir, 'top_processing')
    log = logging.getLogger(log_name)
    log.info('%s: start processing controlled vs open upload metadata' % (str(datetime.now())))

    openFilename2aliquot2metadata = {}
    processETLFile(openETLFilename, openFilename2aliquot2metadata, log)
    contFilename2aliquot2metadata = {}
    processETLFile(contETLFilename, contFilename2aliquot2metadata, log)

    openkeys = set(openFilename2aliquot2metadata.keys())
    contkeys = set(contFilename2aliquot2metadata.keys())
    onlycontkeys = contkeys - openkeys

    bucketFiles = set()
    processBucketContents(contBucketContentFilename, bucketFiles, log)
    intersect = bucketFiles & onlycontkeys
    log.info('\n\tthe count of files only in the controlled access ETL: %s\n\tthe intersection count with the bucket: %s\n\tthe non intersect left in the ETL: %s' % \
        (len(onlycontkeys), len(intersect), len(onlycontkeys - bucketFiles)))

    headerCols = [
        'Pipeline', 'DataArchiveURL', 'DatafileUploaded', 'Datatype', 'Study', 'DataArchiveVersion',
        'DataCenterType', 'Project', 'Platform', 'DataLevel', 'ParticpantBarcode', 'SecurityProtocol',
        'SampleBarcode', 'AliquotBarcode', 'IncludeForAnalysis', 'DataCenterName', 'DatafileName',
        'DataCenterCode', 'DataArchiveName', 'Species', 'AliquotUUID'
    ]
    inserts = []
    missing_cols = {}
    missing_filenames = set()
    missing_aliquots = set()
    with open(outputFilename, 'w') as outfile:
        for key in onlycontkeys:
            for metadata in contFilename2aliquot2metadata[key].itervalues():
                outfile.write(metadata)
                metadata_json = json.loads(metadata)
                if len(metadata_json) > len(headerCols):
                    raise ValueError('found unknown column(s): %s' % (','.join(set(metadata_json.keys()) - set(headerCols))))
                newinsert = []
                for header in headerCols:
                    if header in metadata_json:
                        newinsert += [metadata_json[header]]
                    else:
                        newinsert += [None]
                        count = missing_cols.setdefault(header, 0) + 1
                        missing_filenames.add(key)
                        missing_aliquots.add(metadata_json['AliquotBarcode'])
                        missing_cols[header] = count
                inserts += [newinsert]
    log.info('\n\tmissing cols: %s from files %s and aliquots %s\n' % (', '.join('%s: %s' % (column, count) for column, count in missing_cols.iteritems()),
        len(missing_filenames), len(missing_aliquots)))

    db_module = import_module(config['database_module'])
    db_module.ISBCGC_database_helper.column_insert(config, inserts, 'metadata_data', headerCols, log)

    log.info('%s: finished processing controlled vs open upload metadata' % (str(datetime.now())))
def import_memfile(f):
    return util.import_module(f, f)
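
# A minimal sketch of the two-argument util.import_module(name, path) helper that the Memfile and
# test snippets above appear to rely on; the real helper is not shown here, so this is only one
# plausible Python 2 implementation based on imp.load_source, and the _as alias handling mirrors
# the _as="_gpg" call seen earlier.
import imp
import os

def import_module(name, path, _as=None):
    # when path is a directory, look for <name>.py inside it; otherwise treat path as the file itself
    filename = os.path.join(path, name + '.py') if os.path.isdir(path) else path
    # register the loaded module under the requested alias, or a name derived from the file
    modname = _as if _as else os.path.splitext(os.path.basename(name))[0]
    return imp.load_source(modname, filename)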