def process_metadata_current(config, log_name):
    """
    Parse metadata.current.txt into a per-barcode metadata map.

    return type:
       barcode2term2value: for each sample barcode, finds the AliquotUUID and
       CENTER_CODE values and sets it as the DataCenterCode field
    """
    log = logging.getLogger(log_name)
    log.info('start processing metadata.current.txt')
    barcode2term2value = {}
    metadataURL = config['downloads']['metadata_current']
    try:
        metadata = util.getURLData(metadataURL, 'metadata.current.txt', log)
        lines = metadata.split('\n')
    except Exception:
        log.exception('problem fetching metadata.current.txt')
        if 'test' == config['mode']:
            # test mode: fall back to a local copy so the run can proceed offline.
            # 'with' ensures the file handle is closed (the original leaked it).
            with open('metadata.current.txt') as metadata:
                lines = metadata.read().split('\n')
            log.warning('using local copy for testing purposes')
        else:
            # bare raise preserves the original traceback (``raise e`` would not in py2)
            raise
    # map each configured column name to its index in the header row
    column2term = config['metadata_locations']['metadata.current.txt']
    headers = lines[0].split('\t')
    column2index = {}
    for column in column2term:
        column2index[column] = headers.index(column)
    for line in lines[1:]:
        if not line:
            continue
        fields = line.split('\t')
        term2value = {}
        for column, term in column2term.iteritems():
            term2value[term] = fields[column2index[column]]
        # field 1 is assumed to be the sample barcode — TODO confirm against the file spec
        barcode2term2value[fields[1]] = term2value
    log.info('finished processing metadata.current.txt')
    return barcode2term2value
def process_latestarchive(config, log_name):
    """
    Parse the TCGA latestarchive listing into archive and metadata maps.

    return types:
       tumor_type2platform2archive_types2archives: this map organizes the archives
           per tumor_type per platform per archive_type ('mage-tab', 'bio', 'maf',
           or 'data')
       platform2archive2metadata: this map organizes the metadata across all tumor
           types per platform per archive.  the metadata fields are gotten from the
           archive url: DataArchiveURL; DataCenterType; DataCenterName; and Platform
    """
    log = logging.getLogger(log_name)
    log.info('start process latestarchive')
    processAll, process = util.getTumorTypes(config, log)
    metadata_spec = config['metadata_locations']['latestarchive']
    # pre-split each spec value into its '#'-separated parse directives
    for key, value in metadata_spec.iteritems():
        metadata_spec[key] = value.split('#')
    local = False
    platform2pipeline_tag = config['platform2pipeline_tag']
    latestarchivesURL = config['downloads']['latestarchive']
    try:
        archives = util.getURLData(latestarchivesURL, 'latestarchive', log)
        lines = archives.split('\n')
        log.info('\tarchive size is %s with %s lines' % (len(archives), len(lines)))
        # a suspiciously short listing in test mode also triggers the local fallback
        if 20 > len(lines) and 'test' == config['mode']:
            local = True
    except Exception:
        log.exception('problem fetching latestarchive')
        if 'test' == config['mode']:
            local = True
        else:
            # bare raise preserves the original traceback (``raise e`` would not in py2)
            raise
    if local:
        log.warning('using local copy for testing purposes')
        # 'with' ensures the file handle is closed (the original leaked it)
        with open('LATESTARCHIVE.txt') as archive_file:
            archives = archive_file.read()
        lines = archives.split('\n')
    archive_file_path = write_archive(archives)
    desired_platforms = config['platform2datatype'].keys()
    maf_level = config["maflevel"]
    maf_platforms = config["mafplatforms"]
    # loop-invariant config lookups hoisted out of the per-archive loop
    open_access = config['access_tags']['open']
    controlled_access = config['access_tags']['controlled']
    count = 0
    keep = 0
    tumor_type2platform2archive_types2archives = {}
    platform2archive2metadata = {}
    stats = {}
    for archive_info in lines[1:]:
        if not archive_info:
            continue
        if 0 == count % 1024:
            log.info('\t\tprocessed %s archives' % (count))
        count += 1
        archive_fields = archive_info.split('\t')
        archive_name = archive_fields[0]
        archive_url = archive_fields[2]
        # archive names look like '<center>_<tumor>.<platform>....' — split after the first '_'
        fields = archive_name[archive_name.index('_') + 1:].split('.')
        tumor_type = fields[0].lower()
        platform = fields[1]
        add_stats(stats, config, archive_name, archive_url, platform2pipeline_tag)
        if (processAll or tumor_type in process) and platform in desired_platforms:
            keep += 1
            platform2archive_types2archives = tumor_type2platform2archive_types2archives.setdefault(tumor_type, {})
            archive_types2archives = platform2archive_types2archives.setdefault(platform, {})
            archive2metadata = platform2archive2metadata.setdefault(platform, {})
            if 'mage-tab' in archive_name:  # bcgsc.ca genome.wustl.edu
                if 'mage-tab' in archive_types2archives:
                    log.warning(
                        '\t\tWARNING: found two mage-tab archives for %s[%s]: \n\t\t\t%s\n\t\t%s\n\t\t\t\tand\n\t\t\t%s\n\t\t%s' %
                        (platform, tumor_type,
                         archive_types2archives['mage-tab'][0][0],
                         archive_types2archives['mage-tab'][0][2],
                         archive_name, archive_url))
                magetab_archives = archive_types2archives.setdefault('mage-tab', [])
                magetab_archives += [archive_fields]
            elif 'bio' == platform:
                clinical_archives = archive_types2archives.setdefault('bio', [])
                clinical_archives += [archive_fields]
            elif 'Level_1' in archive_name or 'Level_2' in archive_name or 'Level_3' in archive_name:
                data_archives = archive_types2archives.setdefault('data', [])
                data_archives += [archive_fields]
                # maf archives are the subset of data archives at the configured level
                # NOTE(review): assumes config['maflevel'] is itself a 'Level_*' string — confirm
                if maf_level in archive_name and platform in maf_platforms:
                    maf_archives = archive_types2archives.setdefault('maf', [])
                    maf_archives += [archive_fields]
            # build the per-archive metadata from the '#'-separated spec directives:
            #   'name'                -> copy the whole field
            #   'split:<sep>:<i>:<n>' -> take piece i of field.split(sep)
            #   'pattern:<n>:<re>'    -> first regex group of the match
            metadata = archive2metadata.setdefault(archive_name, {})
            metadata['Project'] = 'TCGA'
            for key, values in metadata_spec.iteritems():
                if 'ARCHIVE_URL' == key:
                    field = archive_url
                else:
                    field = archive_name
                for value in values:
                    fields = value.split(':')
                    if 1 == len(fields):
                        metadata[fields[0]] = field
                    elif 'split' == fields[0]:
                        metadata[fields[3]] = field.split(fields[1])[int(fields[2])]
                    elif 'pattern' == fields[0]:
                        match = re.match(fields[2], field)
                        metadata[fields[1]] = match.group(1)
            metadata['Pipeline'] = metadata['DataCenterName'] + '__' + platform2pipeline_tag[metadata['Platform']]
            # 'tcga4yeo' in the URL marks controlled-access archives
            metadata['SecurityProtocol'] = controlled_access if 'tcga4yeo' in metadata['DataArchiveURL'] else open_access
    log.info('\tprocessed %s total archives' % (count))
    if 0 < count:
        write_stats(stats)
    else:
        log.error('no archives found!!!')
    if config['upload_open']:
        upload_latestarchive_file(config, archive_file_path, log)
    log.info('finished process latestarchive: %s total archives, kept %s' % (count, keep))
    return tumor_type2platform2archive_types2archives, platform2archive2metadata
# NOTE(review): this is a second, duplicate definition of process_latestarchive in
# the same module — at import time it silently shadows the earlier one. The two
# copies appear logically identical (one reformatted); one of them should be deleted.
def process_latestarchive(config, log_name):
    """
    Parse the TCGA latestarchive listing into archive and metadata maps.

    return types:
       tumor_type2platform2archive_types2archives: this map organizes the archives
           per tumor_type per platform per archive_type ('mage-tab', 'bio', 'maf',
           or 'data')
       platform2archive2metadata: this map organizes the metadata across all tumor
           types per platform per archive.  the metadata fields are gotten from the
           archive url: DataArchiveURL; DataCenterType; DataCenterName; and Platform
    """
    log = logging.getLogger(log_name)
    log.info('start process latestarchive')
    processAll, process = util.getTumorTypes(config, log)
    metadata_spec = config['metadata_locations']['latestarchive']
    # pre-split each spec value into its '#'-separated parse directives
    for key, value in metadata_spec.iteritems():
        metadata_spec[key] = value.split('#')
    local = False
    platform2pipeline_tag = config['platform2pipeline_tag']
    latestarchivesURL = config['downloads']['latestarchive']
    try:
        archives = util.getURLData(latestarchivesURL, 'latestarchive', log)
        lines = archives.split('\n')
        log.info('\tarchive size is %s with %s lines' % (len(archives), len(lines)))
        # a suspiciously short listing in test mode also triggers the local fallback
        if 20 > len(lines) and 'test' == config['mode']:
            local = True
    except Exception:
        log.exception('problem fetching latestarchive')
        if 'test' == config['mode']:
            local = True
        else:
            # bare raise preserves the original traceback (``raise e`` would not in py2)
            raise
    if local:
        log.warning('using local copy for testing purposes')
        # 'with' ensures the file handle is closed (the original leaked it)
        with open('LATESTARCHIVE.txt') as archive_file:
            archives = archive_file.read()
        lines = archives.split('\n')
    archive_file_path = write_archive(archives)
    desired_platforms = config['platform2datatype'].keys()
    maf_level = config["maflevel"]
    maf_platforms = config["mafplatforms"]
    # loop-invariant config lookups hoisted out of the per-archive loop
    open_access = config['access_tags']['open']
    controlled_access = config['access_tags']['controlled']
    count = 0
    keep = 0
    tumor_type2platform2archive_types2archives = {}
    platform2archive2metadata = {}
    stats = {}
    for archive_info in lines[1:]:
        if not archive_info:
            continue
        if 0 == count % 1024:
            log.info('\t\tprocessed %s archives' % (count))
        count += 1
        archive_fields = archive_info.split('\t')
        archive_name = archive_fields[0]
        archive_url = archive_fields[2]
        # archive names look like '<center>_<tumor>.<platform>....' — split after the first '_'
        fields = archive_name[archive_name.index('_') + 1:].split('.')
        tumor_type = fields[0].lower()
        platform = fields[1]
        add_stats(stats, config, archive_name, archive_url, platform2pipeline_tag)
        if (processAll or tumor_type in process) and platform in desired_platforms:
            keep += 1
            platform2archive_types2archives = tumor_type2platform2archive_types2archives.setdefault(tumor_type, {})
            archive_types2archives = platform2archive_types2archives.setdefault(platform, {})
            archive2metadata = platform2archive2metadata.setdefault(platform, {})
            if 'mage-tab' in archive_name:  # bcgsc.ca genome.wustl.edu
                if 'mage-tab' in archive_types2archives:
                    log.warning(
                        '\t\tWARNING: found two mage-tab archives for %s[%s]: \n\t\t\t%s\n\t\t%s\n\t\t\t\tand\n\t\t\t%s\n\t\t%s' %
                        (platform, tumor_type,
                         archive_types2archives['mage-tab'][0][0],
                         archive_types2archives['mage-tab'][0][2],
                         archive_name, archive_url))
                magetab_archives = archive_types2archives.setdefault('mage-tab', [])
                magetab_archives += [archive_fields]
            elif 'bio' == platform:
                clinical_archives = archive_types2archives.setdefault('bio', [])
                clinical_archives += [archive_fields]
            elif 'Level_1' in archive_name or 'Level_2' in archive_name or 'Level_3' in archive_name:
                data_archives = archive_types2archives.setdefault('data', [])
                data_archives += [archive_fields]
                # maf archives are the subset of data archives at the configured level
                # NOTE(review): assumes config['maflevel'] is itself a 'Level_*' string — confirm
                if maf_level in archive_name and platform in maf_platforms:
                    maf_archives = archive_types2archives.setdefault('maf', [])
                    maf_archives += [archive_fields]
            # build the per-archive metadata from the '#'-separated spec directives:
            #   'name'                -> copy the whole field
            #   'split:<sep>:<i>:<n>' -> take piece i of field.split(sep)
            #   'pattern:<n>:<re>'    -> first regex group of the match
            metadata = archive2metadata.setdefault(archive_name, {})
            metadata['Project'] = 'TCGA'
            for key, values in metadata_spec.iteritems():
                if 'ARCHIVE_URL' == key:
                    field = archive_url
                else:
                    field = archive_name
                for value in values:
                    fields = value.split(':')
                    if 1 == len(fields):
                        metadata[fields[0]] = field
                    elif 'split' == fields[0]:
                        metadata[fields[3]] = field.split(fields[1])[int(fields[2])]
                    elif 'pattern' == fields[0]:
                        match = re.match(fields[2], field)
                        metadata[fields[1]] = match.group(1)
            metadata['Pipeline'] = metadata['DataCenterName'] + '__' + platform2pipeline_tag[metadata['Platform']]
            # 'tcga4yeo' in the URL marks controlled-access archives
            metadata['SecurityProtocol'] = controlled_access if 'tcga4yeo' in metadata['DataArchiveURL'] else open_access
    log.info('\tprocessed %s total archives' % (count))
    if 0 < count:
        write_stats(stats)
    else:
        log.error('no archives found!!!')
    if config['upload_open']:
        upload_latestarchive_file(config, archive_file_path, log)
    log.info('finished process latestarchive: %s total archives, kept %s' % (count, keep))
    return tumor_type2platform2archive_types2archives, platform2archive2metadata