Example #1
    def add_metadata(self, data_df, info, program_name, project, config):
        """Add metadata info to the dataframe
        """
        metadata_list = flatten_map(
            info, config[program_name]['process_files']['data_table_mapping'])
        metadata = metadata_list[0]
        for next_metadata in metadata_list[1:]:
            metadata.update(next_metadata)

        program = project.split('-')[0]
        start_samplecode = config['sample_code_position'][program]['start']
        end_samplecode = config['sample_code_position'][program]['end']
        sample_type_code = metadata['sample_barcode'][
            start_samplecode:end_samplecode]

        data_df['file_gdc_id'] = metadata['file_gdc_id']
        data_df['aliquot_barcode'] = metadata['aliquot_barcode']
        data_df['sample_gdc_id'] = metadata['sample_gdc_id']
        data_df['sample_barcode'] = metadata['sample_barcode']
        data_df['case_gdc_id'] = metadata['case_gdc_id']
        data_df['case_barcode'] = metadata['case_barcode']
        data_df['program_name'] = metadata['program_name'].upper()
        data_df['project_short_name'] = metadata['project_short_name'].upper()
        data_df['sample_type_letter_code'] = config['sample_code2letter'][
            sample_type_code]
        data_df['data_type'] = metadata['data_type']
        data_df['experimental_strategy'] = metadata['experimental_strategy']

        return data_df
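All of these examples call flatten_map(info, mapping); the usage here implies it walks a nested GDC file-info dict with a path-to-column mapping and returns a list of flat dicts, one per expanded nested entry. A minimal sketch of that contract, with hypothetical field paths and values:

    # hypothetical nested GDC file record, for illustration only
    info = {
        'file_id': 'abc-123',
        'cases': [{'samples': [{'submitter_id': 'TCGA-AB-1234-01A'}]}],
    }
    # hypothetical mapping from nested paths to flat column names
    mapping = {
        'file_id': 'file_gdc_id',
        'cases:samples:submitter_id': 'sample_barcode',
    }
    # flatten_map(info, mapping) would then return something like:
    # [{'file_gdc_id': 'abc-123', 'sample_barcode': 'TCGA-AB-1234-01A'}]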
Example #2
def __insert_rows(config, endpt_type, tablename, values, mapfilter, log):
    maps = []
    for value in values:
        maps += flatten_map(value, mapfilter)
    print_list_synopsis(maps, '\t\trows to save for %s' % (tablename), log)

    module = import_module(config['database_module'])
    fieldnames = module.ISBCGC_database_helper.field_names(tablename)
    rows = []
    for nextmap in maps:
        rows += __addrow(endpt_type, fieldnames, nextmap, log)
    if config['update_cloudsql']:
        # signature for reference:
        #     ISBCGC_database_helper.select(cls, config, stmt, log, params=[], verbose=True)
        wherelist = []
        for fieldname in fieldnames:
            wherelist += ['%s = %%s' % (fieldname)]
        stmt = 'select %s from %s where %s' % (fieldnames[0], tablename,
                                               ' and '.join(wherelist))
        count = 0
        # probe up to the first 8 rows to check whether they were already inserted
        for index in range(min(len(rows), 8)):
            result = module.ISBCGC_database_helper.select(
                config, stmt, log, rows[index])
            count += 1 if len(result) > 0 else 0
        if count == min(len(rows), 8):
            log.warning(
                '\n\t====================\n\tfirst %d records already saved for %s, skipping\n\t===================='
                % (count, tablename))
            return
        elif 0 < count:
            raise ValueError(
                'only some of the first %d records were saved for %s' %
                (count, tablename))
        module.ISBCGC_database_helper.column_insert(config, rows, tablename,
                                                    fieldnames, log)
    else:
        log.warning(
            '\n\t====================\n\tnot saving to cloudsql to %s this run!\n\t===================='
            % (tablename))
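For reference, a sketch of the duplicate-check statement this function builds, using a hypothetical table and field list:

    # with fieldnames = ['file_gdc_id', 'sample_barcode'] and tablename = 'my_table',
    # the generated statement is:
    #   select file_gdc_id from my_table where file_gdc_id = %s and sample_barcode = %s
    fieldnames = ['file_gdc_id', 'sample_barcode']
    wherelist = ['%s = %%s' % (fieldname) for fieldname in fieldnames]
    stmt = 'select %s from %s where %s' % (fieldnames[0], 'my_table',
                                           ' and '.join(wherelist))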
Example #3
def populate_sample_availibility(config, endpt_type, program_name, project_id,
                                 data_type, infos, log):
    log.info('\tbegin populate_sample_availibility() for %s:%s' %
             (project_id, data_type))

    # iterate through the gdc info and put together the counts for the sample barcodes
    sample_barcode2count = {}
    for info in infos:
        mapping = config[program_name]['process_files']['data_table_mapping']
        flattened = flatten_map(info, mapping)
        for flat in flattened:
            # skip controlled-access mutation files and open-access aligned reads
            if (data_type in ('Simple somatic mutation',
                              'Masked Somatic Mutation')
                    and 'controlled' == flat['access']) or \
                    (data_type in ('Aligned reads',)
                     and 'open' == flat['access']):
                continue
            sample_barcode = flat['sample_barcode']
            sample_barcode2count[sample_barcode] = \
                sample_barcode2count.get(sample_barcode, 0) + 1

    # read in the appropriate data availability row to get the foreign key
    isb_label = config['data_type2isb_label'][data_type]
    stmt = 'select metadata_data_type_availability_id from %s_metadata_data_type_availability where genomic_build = %%s and isb_label = %%s' % (
        program_name)
    foreign_key = ISBCGC_database_helper.select(
        config, stmt, log,
        [config['endpt2genomebuild'][endpt_type], isb_label])[0][0]

    params = []
    for sample_barcode, count in sample_barcode2count.items():
        params += [[foreign_key, sample_barcode, count]]

    ISBCGC_database_helper.column_insert(
        config, params,
        '%s_metadata_sample_data_availability' % (program_name),
        ['metadata_data_type_availability_id', 'sample_barcode', 'count'], log)

    log.info('\tfinished populate_sample_availibility() for %s:%s' %
             (project_id, data_type))
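The insert above receives one row per sample barcode; a sketch of the shape, with hypothetical values:

    # hypothetical values, for illustration only
    foreign_key = 42
    sample_barcode2count = {'TCGA-AB-1234-01A': 3, 'TCGA-CD-5678-01A': 1}
    params = [[foreign_key, barcode, count]
              for barcode, count in sample_barcode2count.items()]
    # params == [[42, 'TCGA-AB-1234-01A', 3], [42, 'TCGA-CD-5678-01A', 1]]
    # matching columns:
    # ['metadata_data_type_availability_id', 'sample_barcode', 'count']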
Example #4
    def add_metadata(self, file_df, data_type, info, program_name, project,
                     config):
        """Add metadata info to the dataframe
        """
        metadata_list = flatten_map(
            info, config[program_name]['process_files']['data_table_mapping'])
        metadata = metadata_list[0]
        for next_metadata in metadata_list[1:]:
            metadata.update(next_metadata)

        metadata_columns = config[program_name]['process_files'][
            'datatype2bqscript'][data_type]['add_metadata_columns']
        for metadata_column in metadata_columns:
            if 'sample_type_code' == metadata_column:
                program = project.split('-')[0]
                start_samplecode = config['sample_code_position'][program][
                    'start']
                end_samplecode = config['sample_code_position'][program]['end']
                file_df['sample_type_code'] = metadata['sample_barcode'][
                    start_samplecode:end_samplecode]
            else:
                file_df[metadata_column] = metadata[metadata_column]

        return file_df
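This variant takes the column list from config rather than hard-coding it; a hypothetical config fragment showing the pieces the method reads:

    # hypothetical config fragment, for illustration only
    config = {
        'TCGA': {'process_files': {'datatype2bqscript': {
            'Gene Expression Quantification': {
                'add_metadata_columns': [
                    'file_gdc_id', 'sample_barcode', 'sample_type_code'],
            },
        }}},
        # TCGA barcode positions 13:15 hold the two-digit sample type code
        'sample_code_position': {'TCGA': {'start': 13, 'end': 15}},
    }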
Example #5
    def upload_batch_etl(self, config, outputdir, paths, file2info,
                         program_name, project, data_type, log):
        if not config[program_name]['process_files']['datatype2bqscript'][
                'Isoform Expression Quantification']['only_matrix']:
            super(miRNA_matrix,
                  self).upload_batch_etl(config, outputdir, paths, file2info,
                                         program_name, project, data_type, log)
        else:
            log.info('not calling upload_batch_etl() for %s:%s' %
                     (project, data_type))

        # copy files to a common location across all projects, flattening the directory names into the file names
        input_dir = config['download_base_output_dir'] + '%s/%s/' % (project,
                                                                     data_type)
        common_dir = config['download_base_output_dir'] + config[program_name][
            'process_files']['datatype2bqscript'][
                'Isoform Expression Quantification']['matrix_subdir']

        log.info('\tcopy files for %s:%s for mirna isoform matrix' %
                 (data_type, project))
        contents = listdir(input_dir)
        for content in contents:
            if path.isdir(input_dir + content):
                files = listdir(input_dir + content)
                for file_name in files:
                    full_name = content + '_' + file_name
                    if path.exists(common_dir + full_name):
                        raise ValueError('file already exists: %s' %
                                         (full_name))
                    copy(input_dir + content + '/' + file_name,
                         common_dir + full_name)
        log.info('\tcopied files for %s: %s for mirna isoform matrix' %
                 (data_type, project))

        # the first time this is called, save off the file2info, transformed into aliquot2info, for use in finalize
        mapfile_name = project + "_aliquotinfo.txt"
        mapfile_path = config['download_base_output_dir'] + config[
            program_name]['process_files']['datatype2bqscript'][
                'Isoform Expression Quantification'][
                    'matrix_persist_subdir'] + mapfile_name
        if not path.exists(mapfile_path):
            log.info(
                '\tcreate metadata file for %s:%s for mirna isoform matrix' %
                (data_type, project))
            # create the aliquot centric map
            file_name2info = {}
            for value in file2info.values():
                flattened = flatten_map(
                    value, config[program_name]['process_files']
                    ['data_table_mapping'])[0]
                info = file_name2info.setdefault(
                    '_'.join(
                        [flattened['file_gdc_id'], flattened['file_name']]),
                    {})
                info['aliquot_barcode'] = flattened['aliquot_barcode']
                info['project_short_name'] = flattened['project_short_name']
                # use a local name so the program_name parameter isn't clobbered
                flattened_program = flattened['program_name']
                info['program_name'] = flattened_program
                sample_type_code = flattened['aliquot_barcode'][
                    config['sample_code_position'][flattened_program]['start']:
                    config['sample_code_position'][flattened_program]['end']]
                info['sample_type_code'] = sample_type_code
                info['file_name'] = flattened['file_name']
                info['file_gdc_id'] = flattened['file_gdc_id']
                info['case_gdc_id'] = flattened['case_gdc_id']
                info['sample_gdc_id'] = flattened['sample_gdc_id']
                info['aliquot_gdc_id'] = flattened['aliquot_gdc_id']
            with open(mapfile_path, 'wb') as mapfile:
                dump(file_name2info, mapfile, protocol=HIGHEST_PROTOCOL)
            log.info(
                '\tsaved metadata file for %s:%s for mirna isoform matrix' %
                (data_type, project))
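A finalize step would presumably read this map back; a minimal sketch of the mirror of the dump above, assuming dump and HIGHEST_PROTOCOL come from the pickle module:

    from pickle import load

    # mapfile_path built exactly as in upload_batch_etl above
    with open(mapfile_path, 'rb') as mapfile:
        file_name2info = load(mapfile)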
Example #6
def process_files(config, endpt_type, file2info, outputdir, start, end,
                  program_name, project, data_type, etl_class, log):
    try:
        filepath = outputdir + config['download_output_file_template'] % (
            start, end - 1)
        with tarfile.open(filepath) as tf:
            log.info('\t\textract tar files from %s' % (filepath))
            tf.extractall(outputdir)
            log.info('\t\tdone extract tar files from %s' % (filepath))

        with open(outputdir + 'MANIFEST.txt') as manifest:
            lines = manifest.read().split('\n')
            paths = []
            filenames = set()
            for line in lines[1:]:
                if not line.strip():
                    # read().split('\n') leaves a trailing empty entry
                    continue
                filepath = line.split('\t')[1]
                paths += [filepath]
                filenames.add(filepath.split('/')[1])
        paths.sort(key=lambda p: p.split('/')[1])

        if config['upload_files']:
            for path in paths:
                basefolder = config['buckets']['folders']['base_file_folder']

                metadata = flatten_map(
                    file2info[path], config[program_name]['process_files']
                    ['data_table_mapping'])
                keypath_template = config[program_name]['process_files'][
                    'bucket_path_template']
                key_path_components = []
                for part in config[program_name]['process_files'][
                        'bucket_path']:
                    fields = part.split(':')
                    if 1 == len(fields):
                        if 'endpoint_type' == part:
                            key_path_components += [endpt_type]
                        else:
                            key_path_components += [metadata[0][part]]
                    elif 'alt' == fields[0]:
                        if fields[1] in metadata[0] and metadata[0][fields[1]]:
                            key_path_components += [metadata[0][fields[1]]]
                        else:
                            key_path_components += [metadata[0][fields[2]]]

                key_name = basefolder + (keypath_template %
                                         tuple(key_path_components))
                log.info('\t\tuploading %s' % (key_name))
                upload_file(config, outputdir + path,
                            config['buckets']['open'], key_name, log)
        else:
            log.info('\t\t\tnot uploading files for %s:%s' %
                     (project, data_type))

        etl_uploaded = False
        if config['upload_etl_files'] and data_type in config[program_name][
                'process_files']['datatype2bqscript'] and etl_class is not None:
            etl_uploaded = etl_class.upload_batch_etl(config, outputdir, paths,
                                                      file2info, endpt_type,
                                                      program_name, project,
                                                      data_type, log)
        else:
            log.warning(
                '\t\tnot processing files for ETL for project %s and datatype %s%s'
                % (project, data_type, ' because there is no script specified'
                   if config['upload_etl_files'] else ''))
        return etl_uploaded
    except Exception:
        log.exception(
            'problem processing file %s for project %s and data_type %s' %
            (filepath, project, data_type))
        raise
    finally:
        if 'delete_dir_contents' not in config or config['delete_dir_contents']:
            delete_dir_contents(outputdir)
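The key-name assembly above is config-driven; a sketch of how bucket_path and bucket_path_template could combine, with a hypothetical config fragment and metadata:

    # hypothetical config fragment and metadata, for illustration only
    bucket_path = ['endpoint_type', 'program_name',
                   'alt:project_short_name:project_name', 'file_gdc_id']
    keypath_template = '%s/%s/%s/%s'
    metadata = [{'program_name': 'TCGA', 'project_short_name': 'TCGA-BRCA',
                 'file_gdc_id': 'abc-123'}]
    endpt_type = 'current'

    key_path_components = []
    for part in bucket_path:
        fields = part.split(':')
        if 1 == len(fields):
            key_path_components += (
                [endpt_type] if 'endpoint_type' == part
                else [metadata[0][part]])
        elif 'alt' == fields[0]:
            # prefer the first field when present and truthy, else the fallback
            key = fields[1] if metadata[0].get(fields[1]) else fields[2]
            key_path_components += [metadata[0][key]]
    key_name = 'base_file_folder/' + (keypath_template %
                                      tuple(key_path_components))
    # key_name == 'base_file_folder/current/TCGA/TCGA-BRCA/abc-123'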