def process_metadata_current(config, log_name):
    """
    Parse the TCGA metadata.current.txt manifest into a nested lookup map.

    Downloads the tab-separated file from config['downloads']['metadata_current'].
    If the download fails and config['mode'] == 'test', falls back to a local
    file named 'metadata.current.txt'.  Which columns are extracted, and the
    term each is stored under, comes from
    config['metadata_locations']['metadata.current.txt'] (column name -> term).

    Args:
        config: configuration dict (see keys referenced above)
        log_name: name of the logger to report progress to

    Returns:
        barcode2term2value: dict mapping each row's sample barcode (column
            index 1 of the data row) to a {term: value} dict

    Raises:
        Exception: re-raises the download error when not in test mode;
            ValueError if a configured column is missing from the header row.
    """
    log = logging.getLogger(log_name)
    log.info('start processing metadata.current.txt')
    barcode2term2value = {}

    metadataURL = config['downloads']['metadata_current']
    try:
        metadata = util.getURLData(metadataURL, 'metadata.current.txt', log)
        lines = metadata.split('\n')
    except Exception:
        log.exception('problem fetching metadata.current.txt')
        if 'test' == config['mode']:
            # use 'with' so the local fallback file is always closed
            with open('metadata.current.txt') as metadata:
                lines = metadata.read().split('\n')
            log.warning('using local copy for testing purposes')
        else:
            # bare raise preserves the original traceback
            raise

    column2term = config['metadata_locations']['metadata.current.txt']
    headers = lines[0].split('\t')
    # headers.index() raises ValueError if a configured column is absent
    column2index = {}
    for column in column2term:
        column2index[column] = headers.index(column)
    for line in lines[1:]:
        if not line:  # skip blank/trailing lines
            continue
        fields = line.split('\t')
        term2value = {}
        # .items() instead of Python 2-only .iteritems(): same iteration
        # behavior, but also works under Python 3
        for column, term in column2term.items():
            term2value[term] = fields[column2index[column]]
        barcode2term2value[fields[1]] = term2value

    log.info('finished processing metadata.current.txt')
    return barcode2term2value
def process_metadata_current(config, log_name):
    """
    Parse the TCGA metadata.current.txt manifest into a nested lookup map.

    NOTE(review): this is a duplicate definition of process_metadata_current
    in the same module; being later in the file, it shadows the earlier one.

    Downloads the tab-separated file from config['downloads']['metadata_current'].
    If the download fails and config['mode'] == 'test', falls back to a local
    file named 'metadata.current.txt'.  Which columns are extracted, and the
    term each is stored under, comes from
    config['metadata_locations']['metadata.current.txt'] (column name -> term).

    Args:
        config: configuration dict (see keys referenced above)
        log_name: name of the logger to report progress to

    Returns:
        barcode2term2value: dict mapping each row's sample barcode (column
            index 1 of the data row) to a {term: value} dict

    Raises:
        Exception: re-raises the download error when not in test mode;
            ValueError if a configured column is missing from the header row.
    """
    log = logging.getLogger(log_name)
    log.info('start processing metadata.current.txt')
    barcode2term2value = {}

    metadataURL = config['downloads']['metadata_current']
    try:
        metadata = util.getURLData(metadataURL, 'metadata.current.txt', log)
        lines = metadata.split('\n')
    except Exception:
        log.exception('problem fetching metadata.current.txt')
        if 'test' == config['mode']:
            # use 'with' so the local fallback file is always closed
            with open('metadata.current.txt') as metadata:
                lines = metadata.read().split('\n')
            log.warning('using local copy for testing purposes')
        else:
            # bare raise preserves the original traceback
            raise

    column2term = config['metadata_locations']['metadata.current.txt']
    headers = lines[0].split('\t')
    # headers.index() raises ValueError if a configured column is absent
    column2index = {}
    for column in column2term:
        column2index[column] = headers.index(column)
    for line in lines[1:]:
        if not line:  # skip blank/trailing lines
            continue
        fields = line.split('\t')
        term2value = {}
        # .items() instead of Python 2-only .iteritems(): same iteration
        # behavior, but also works under Python 3
        for column, term in column2term.items():
            term2value[term] = fields[column2index[column]]
        barcode2term2value[fields[1]] = term2value

    log.info('finished processing metadata.current.txt')
    return barcode2term2value
def process_latestarchive(config, log_name):
    """
    Parse the TCGA latestarchive manifest and organize archives for processing.

    Downloads the tab-separated manifest from config['downloads']['latestarchive']
    (falling back to a local 'LATESTARCHIVE.txt' in test mode when the download
    fails or looks truncated), writes a copy via write_archive(), and classifies
    each row by tumor type, platform and archive type.

    Args:
        config: configuration dict (see keys referenced throughout)
        log_name: name of the logger to report progress to

    Returns:
        tumor_type2platform2archive_types2archives: archives organized per
          tumor_type per platform per archive_type ('mage-tab', 'bio', 'maf',
          or 'data')
        platform2archive2metadata: metadata across all tumor types per platform
          per archive.  Fields are derived from the archive name/url per
          config['metadata_locations']['latestarchive'], plus the fixed
          'Project', derived 'Pipeline' and 'SecurityProtocol' fields.

    Raises:
        Exception: re-raises the download error when not in test mode.
    """
    log = logging.getLogger(log_name)
    log.info('start process latestarchive')
    processAll, process = util.getTumorTypes(config, log)
    # Build a fresh dict instead of mutating the shared config in place: the
    # original in-place mutation broke a second call to this function, since
    # the already-split values (lists) have no .split('#').
    metadata_spec = dict(
        (key, value.split('#'))
        for key, value in config['metadata_locations']['latestarchive'].items())

    local = False
    platform2pipeline_tag = config['platform2pipeline_tag']
    latestarchivesURL = config['downloads']['latestarchive']
    try:
        archives = util.getURLData(latestarchivesURL, 'latestarchive', log)
        lines = archives.split('\n')
        log.info('\tarchive size is %s with %s lines' % (len(archives), len(lines)))
        # a suspiciously short manifest in test mode means a bad fetch
        if 20 > len(lines) and 'test' == config['mode']:
            local = True
    except Exception:
        log.exception('problem fetching latestarchive')
        if 'test' == config['mode']:
            local = True
        else:
            # bare raise preserves the original traceback
            raise

    if local:
        log.warning('using local copy for testing purposes')
        # use 'with' so the local fallback file is always closed
        with open('LATESTARCHIVE.txt') as archive_file:
            archives = archive_file.read()
        lines = archives.split('\n')
    archive_file_path = write_archive(archives)

    desired_platforms = config['platform2datatype'].keys()
    maf_level = config["maflevel"]
    maf_platforms = config["mafplatforms"]
    # loop-invariant config lookups, hoisted out of the per-archive loop
    open_access = config['access_tags']['open']
    controlled_access = config['access_tags']['controlled']
    count = 0
    keep = 0
    tumor_type2platform2archive_types2archives = {}
    platform2archive2metadata = {}
    stats = {}
    for archive_info in lines[1:]:
        if not archive_info:
            continue

        if 0 == count % 1024:
            log.info('\t\tprocessed %s archives' % (count))
        count += 1
        archive_fields = archive_info.split('\t')
        archive_name = archive_fields[0]
        archive_url = archive_fields[2]
        # archive names look like <center>_<TUMOR>.<Platform>.<...>
        name_parts = archive_name[archive_name.index('_') + 1:].split('.')
        tumor_type = name_parts[0].lower()
        platform = name_parts[1]
        add_stats(stats, config, archive_name, archive_url, platform2pipeline_tag)
        if (processAll or tumor_type in process) and platform in desired_platforms:
            keep += 1
            platform2archive_types2archives = tumor_type2platform2archive_types2archives.setdefault(tumor_type, {})
            archive_types2archives = platform2archive_types2archives.setdefault(platform, {})
            archive2metadata = platform2archive2metadata.setdefault(platform, {})
            if 'mage-tab' in archive_name:  # bcgsc.ca genome.wustl.edu
                if 'mage-tab' in archive_types2archives:
                    log.warning('\t\tWARNING: found two mage-tab archives for %s[%s]: \n\t\t\t%s\n\t\t%s\n\t\t\t\tand\n\t\t\t%s\n\t\t%s' %
                        (platform, tumor_type, archive_types2archives['mage-tab'][0][0], archive_types2archives['mage-tab'][0][2], archive_name, archive_url))
                archive_types2archives.setdefault('mage-tab', []).append(archive_fields)
            elif 'bio' == platform:
                archive_types2archives.setdefault('bio', []).append(archive_fields)
            elif 'Level_1' in archive_name or 'Level_2' in archive_name or 'Level_3' in archive_name:
                archive_types2archives.setdefault('data', []).append(archive_fields)
                if maf_level in archive_name and platform in maf_platforms:
                    archive_types2archives.setdefault('maf', []).append(archive_fields)
            metadata = archive2metadata.setdefault(archive_name, {})
            metadata['Project'] = 'TCGA'
            # .items() instead of Python 2-only .iteritems()
            for key, specs in metadata_spec.items():
                # each spec applies to either the archive url or the archive name
                field = archive_url if 'ARCHIVE_URL' == key else archive_name
                for spec in specs:
                    spec_parts = spec.split(':')
                    if 1 == len(spec_parts):
                        # bare term: store the whole field
                        metadata[spec_parts[0]] = field
                    elif 'split' == spec_parts[0]:
                        # 'split:<separator>:<index>:<term>'
                        metadata[spec_parts[3]] = field.split(spec_parts[1])[int(spec_parts[2])]
                    elif 'pattern' == spec_parts[0]:
                        # 'pattern:<term>:<regex>' -- stores the first capture group
                        match = re.match(spec_parts[2], field)
                        metadata[spec_parts[1]] = match.group(1)
            metadata['Pipeline'] = metadata['DataCenterName'] + '__' + platform2pipeline_tag[metadata['Platform']]
            # archives served from the tcga4yeo tree require controlled access
            metadata['SecurityProtocol'] = controlled_access if 'tcga4yeo' in metadata['DataArchiveURL'] else open_access
    log.info('\tprocessed %s total archives' % (count))

    if 0 < count:
        write_stats(stats)
    else:
        log.error('no archives found!!!')
    if config['upload_open']:
        upload_latestarchive_file(config, archive_file_path, log)
    log.info('finished process latestarchive: %s total archives, kept %s' % (count, keep))
    return tumor_type2platform2archive_types2archives, platform2archive2metadata
# Example #4 (web-scrape residue: "示例#4" and a stray vote count "0" from the
# source page; commented out so the file parses)
def process_latestarchive(config, log_name):
    """
    Parse the TCGA latestarchive manifest and organize archives for processing.

    NOTE(review): this is a duplicate definition of process_latestarchive in
    the same module; being later in the file, it shadows the earlier one.

    Downloads the tab-separated manifest from config['downloads']['latestarchive']
    (falling back to a local 'LATESTARCHIVE.txt' in test mode when the download
    fails or looks truncated), writes a copy via write_archive(), and classifies
    each row by tumor type, platform and archive type.

    Args:
        config: configuration dict (see keys referenced throughout)
        log_name: name of the logger to report progress to

    Returns:
        tumor_type2platform2archive_types2archives: archives organized per
          tumor_type per platform per archive_type ('mage-tab', 'bio', 'maf',
          or 'data')
        platform2archive2metadata: metadata across all tumor types per platform
          per archive.  Fields are derived from the archive name/url per
          config['metadata_locations']['latestarchive'], plus the fixed
          'Project', derived 'Pipeline' and 'SecurityProtocol' fields.

    Raises:
        Exception: re-raises the download error when not in test mode.
    """
    log = logging.getLogger(log_name)
    log.info('start process latestarchive')
    processAll, process = util.getTumorTypes(config, log)
    # Build a fresh dict instead of mutating the shared config in place: the
    # original in-place mutation broke a second call to this function, since
    # the already-split values (lists) have no .split('#').
    metadata_spec = dict(
        (key, value.split('#'))
        for key, value in config['metadata_locations']['latestarchive'].items())

    local = False
    platform2pipeline_tag = config['platform2pipeline_tag']
    latestarchivesURL = config['downloads']['latestarchive']
    try:
        archives = util.getURLData(latestarchivesURL, 'latestarchive', log)
        lines = archives.split('\n')
        log.info('\tarchive size is %s with %s lines' %
                 (len(archives), len(lines)))
        # a suspiciously short manifest in test mode means a bad fetch
        if 20 > len(lines) and 'test' == config['mode']:
            local = True
    except Exception:
        log.exception('problem fetching latestarchive')
        if 'test' == config['mode']:
            local = True
        else:
            # bare raise preserves the original traceback
            raise

    if local:
        log.warning('using local copy for testing purposes')
        # use 'with' so the local fallback file is always closed
        with open('LATESTARCHIVE.txt') as archive_file:
            archives = archive_file.read()
        lines = archives.split('\n')
    archive_file_path = write_archive(archives)

    desired_platforms = config['platform2datatype'].keys()
    maf_level = config["maflevel"]
    maf_platforms = config["mafplatforms"]
    # loop-invariant config lookups, hoisted out of the per-archive loop
    open_access = config['access_tags']['open']
    controlled_access = config['access_tags']['controlled']
    count = 0
    keep = 0
    tumor_type2platform2archive_types2archives = {}
    platform2archive2metadata = {}
    stats = {}
    for archive_info in lines[1:]:
        if not archive_info:
            continue

        if 0 == count % 1024:
            log.info('\t\tprocessed %s archives' % (count))
        count += 1
        archive_fields = archive_info.split('\t')
        archive_name = archive_fields[0]
        archive_url = archive_fields[2]
        # archive names look like <center>_<TUMOR>.<Platform>.<...>
        name_parts = archive_name[archive_name.index('_') + 1:].split('.')
        tumor_type = name_parts[0].lower()
        platform = name_parts[1]
        add_stats(stats, config, archive_name, archive_url,
                  platform2pipeline_tag)
        if (processAll
                or tumor_type in process) and platform in desired_platforms:
            keep += 1
            platform2archive_types2archives = tumor_type2platform2archive_types2archives.setdefault(
                tumor_type, {})
            archive_types2archives = platform2archive_types2archives.setdefault(
                platform, {})
            archive2metadata = platform2archive2metadata.setdefault(
                platform, {})
            if 'mage-tab' in archive_name:  # bcgsc.ca genome.wustl.edu
                if 'mage-tab' in archive_types2archives:
                    log.warning(
                        '\t\tWARNING: found two mage-tab archives for %s[%s]: \n\t\t\t%s\n\t\t%s\n\t\t\t\tand\n\t\t\t%s\n\t\t%s'
                        % (platform, tumor_type,
                           archive_types2archives['mage-tab'][0][0],
                           archive_types2archives['mage-tab'][0][2],
                           archive_name, archive_url))
                archive_types2archives.setdefault('mage-tab', []).append(archive_fields)
            elif 'bio' == platform:
                archive_types2archives.setdefault('bio', []).append(archive_fields)
            elif 'Level_1' in archive_name or 'Level_2' in archive_name or 'Level_3' in archive_name:
                archive_types2archives.setdefault('data', []).append(archive_fields)
                if maf_level in archive_name and platform in maf_platforms:
                    archive_types2archives.setdefault('maf', []).append(archive_fields)
            metadata = archive2metadata.setdefault(archive_name, {})
            metadata['Project'] = 'TCGA'
            # .items() instead of Python 2-only .iteritems()
            for key, specs in metadata_spec.items():
                # each spec applies to either the archive url or the archive name
                field = archive_url if 'ARCHIVE_URL' == key else archive_name
                for spec in specs:
                    spec_parts = spec.split(':')
                    if 1 == len(spec_parts):
                        # bare term: store the whole field
                        metadata[spec_parts[0]] = field
                    elif 'split' == spec_parts[0]:
                        # 'split:<separator>:<index>:<term>'
                        metadata[spec_parts[3]] = field.split(
                            spec_parts[1])[int(spec_parts[2])]
                    elif 'pattern' == spec_parts[0]:
                        # 'pattern:<term>:<regex>' -- stores the first capture group
                        match = re.match(spec_parts[2], field)
                        metadata[spec_parts[1]] = match.group(1)
            metadata['Pipeline'] = metadata[
                'DataCenterName'] + '__' + platform2pipeline_tag[
                    metadata['Platform']]
            # archives served from the tcga4yeo tree require controlled access
            metadata['SecurityProtocol'] = (
                controlled_access
                if 'tcga4yeo' in metadata['DataArchiveURL'] else open_access)
    log.info('\tprocessed %s total archives' % (count))

    if 0 < count:
        write_stats(stats)
    else:
        log.error('no archives found!!!')
    if config['upload_open']:
        upload_latestarchive_file(config, archive_file_path, log)
    log.info('finished process latestarchive: %s total archives, kept %s' %
             (count, keep))
    return tumor_type2platform2archive_types2archives, platform2archive2metadata