Example #1
def process_platform(config, log_dir, log_name, tumor_type, platform, archive2metadata, archive_types2archives, barcode2annotations, ffpe_samples):
    try:
        create_log(log_dir + tumor_type + '/', log_name)
        log = logging.getLogger(log_name)
        if 'mage-tab' not in archive_types2archives:
            orphan_data_archives = (set([archive_info[0] for archive_info in archive_types2archives['data']]) if 'data' in archive_types2archives else set()) - \
                set((archive_info[0] for archive_info in archive_types2archives['maf']) if 'maf' in archive_types2archives else [])
            if 0 < len(orphan_data_archives):
                log.warning('\tno mage-tab archives for %s but there are data archives that are not maf: %s' % (platform, orphan_data_archives))
            else:
                log.warning('\tno mage-tab archives for %s' % (platform))
            if 'maf' in archive_types2archives:
                return process_maf_files(config, archive_types2archives['maf'], {}, {}, archive2metadata, log)
            return {}
        sdrf_metadata = process_sdrf(config, log, archive_types2archives['mage-tab'], archive2metadata, barcode2annotations)
        if 'data' in archive_types2archives:
            upload_archives(config, log, archive_types2archives['data'], sdrf_metadata, archive2metadata, ffpe_samples)
        else:
            log.warning('\tno data archives found for %s' % (tumor_type + ':' + platform))
        
        if 'maf' in archive_types2archives:
            process_maf_files(config, archive_types2archives['maf'], sdrf_metadata, archive_types2archives.get('data', {}), archive2metadata, log)
        return sdrf_metadata
    except Exception as e:
        log.exception('%s generated an exception' % (platform))
        raise e
Example #2
def process_platform(config, log_dir, log_name, tumor_type, platform,
                     archive2metadata, archive_types2archives,
                     barcode2annotations, exclude_samples):
    '''
    process the archives associated with the platform to obtain metadata from the sdrf archives and to upload
    the appropriate files from the exploded downloaded archives

    parameters:
        config: the configuration map
        log_dir: the base directory for the logs
        log_name: the name of the log to use to log any messages
        tumor_type: the TCGA study being processed
        platform: the TCGA platform being processed
        archive2metadata: map of archive name to its metadata
        archive_types2archives: map of archive types ('maf', 'mage-tab', and 'data') for this study to its archives
        barcode2annotations: map of barcodes to TCGA annotations
        exclude_samples: a list of barcodes of ffpe samples

    returns:
        sdrf_metadata: the metadata obtained from parsing the SDRF files
    '''
    try:
        create_log(log_dir + tumor_type + '/', log_name)
        log = logging.getLogger(log_name)
        if 'mage-tab' not in archive_types2archives:
            orphan_data_archives = (set([archive_info[0] for archive_info in archive_types2archives['data']]) if 'data' in archive_types2archives else set()) - \
                set((archive_info[0] for archive_info in archive_types2archives['maf']) if 'maf' in archive_types2archives else [])
            if 0 < len(orphan_data_archives):
                log.warning(
                    '\tno mage-tab archives for %s but there are data archives that are not maf: %s'
                    % (platform, orphan_data_archives))
            else:
                log.warning('\tno mage-tab archives for %s' % (platform))

            maf_metadata = {}
            if 'maf' in archive_types2archives:
                return process_maf_files(config, archive_types2archives['maf'],
                                         maf_metadata, archive2metadata, log)
            return maf_metadata
        sdrf_metadata = process_sdrf(config, log,
                                     archive_types2archives['mage-tab'],
                                     archive2metadata, barcode2annotations)
        if 'data' in archive_types2archives:
            upload_archives(config, log, archive_types2archives['data'],
                            sdrf_metadata, archive2metadata, exclude_samples)
        else:
            log.warning('\tno data archives found for %s' %
                        (tumor_type + ':' + platform))

        if 'maf' in archive_types2archives:
            process_maf_files(config, archive_types2archives['maf'],
                              sdrf_metadata, archive2metadata, log)
        return sdrf_metadata
    except Exception as e:
        log.exception('%s generated an exception' % (platform))
        raise e
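Nearly every example in this collection follows the same two-step pattern: call create_log(log_dir, log_name) to set up a per-task log file, then fetch the logger with logging.getLogger(log_name). The helper below is only a minimal sketch of that idea under assumed behavior (directory creation, a file handler, returning the log name); it is not the actual ISB-CGC utility.

import logging
import os


def create_log(log_dir, log_name):
    # sketch: make sure the directory exists, attach a file handler, hand back the name
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    logger = logging.getLogger(log_name)
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(os.path.join(log_dir, log_name + '.log'))
    handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(handler)
    return log_name


# usage, mirroring the calls in the examples
log = logging.getLogger(create_log('logs/', 'example'))
log.info('logger is ready')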
def process_data_type(config,
                      endpt_type,
                      program_name,
                      project_id,
                      data_type,
                      log_dir,
                      log_name=None):
    try:
        if log_name:
            log_name = create_log(log_dir, log_name)
        else:
            log_name = create_log(
                log_dir, project_id + '_' + data_type.replace(' ', ''))
        log = logging.getLogger(log_name)

        log.info('begin process_data_type %s for %s' % (data_type, project_id))
        file2info = get_map_rows(config, endpt_type, 'file', program_name,
                                 get_filter(config, data_type, project_id),
                                 log)
        file2info = filter_null_samples(config, file2info, project_id,
                                        data_type, log)
        if data_type in config['data_type_gcs']:
            save2db(
                config, endpt_type, '%s_metadata_data_%s' %
                (program_name, config['endpt2genomebuild'][endpt_type]),
                file2info,
                config[program_name]['process_files']['data_table_mapping'],
                log)
            if config['process_paths']:
                set_uploaded_path(config, endpt_type, program_name, project_id,
                                  data_type, log)
        if config['process_data_availability'] and data_type not in (
                'Clinical Supplement', 'Biospecimen Supplement'):
            populate_sample_availibility(config, endpt_type, program_name,
                                         project_id, data_type,
                                         file2info.values(), log)
        upload_files(config, endpt_type, file2info, program_name, project_id,
                     data_type, log)
        log.info('finished process_data_type %s for %s' %
                 (data_type, project_id))

        return file2info
    except:
        log.exception('problem processing data_type %s for %s' %
                      (data_type, project_id))
        raise
    finally:
        close_log(log)
def main(configFileName):
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)

        log_dir = str(date.today()).replace("-", "_") + "_" + config["log_dir_tag"] + "/"
        log_name = create_log(log_dir, "create_datadictionary")
        log = logging.getLogger(log_name)
        log.info("begin update of has_Illumia_DNASeq")
        select_stmt = "select samplebarcode from metadata_data \
            where 0 < instr(Platform, 'DNASeq') and 0 = instr(Platform, 'Roche') and \
                0 = instr(Platform, 'ABSOLiD') and 0 = instr(Platform, 'PacBio') and \
                project <> 'CCLE' \
            group by samplebarcode"
        cursor = isbcgc_cloudsql_model.ISBCGC_database_helper.select(config, select_stmt, log)
        barcodes = ""
        for blist in cursor[:20]:
            barcodes += "\n\t\t" + blist[0]
        log.info("\tcompleted select, %s rows:\n\t\t%s\n\t\t..." % (len(cursor), barcodes))
        # there's a problem that the rows that come back for this are [(barcode,), (barcode,), ...] so update fails with too many arguments
        # so put back together
        fixedcursor = []
        for blist in cursor:
            fixedcursor += [blist[0]]
    except Exception as e:
        log.exception("\tproblem selecting")
        raise e

    try:
        update_stmt = "update metadata_samples set has_Illumina_DNASeq = 1 where samplebarcode = %s"
        cursor = isbcgc_cloudsql_model.ISBCGC_database_helper.update(config, update_stmt, log, fixedcursor)
        log.info("finished update of has_Illumia_DNASeq")
    except Exception as e:
        log.exception("\tproblem updating")
        raise e
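The flattening loop above exists because a DB-API select returns each row as a tuple even when only one column is selected, so the result looks like [(barcode,), (barcode,), ...]; passing that straight to the update helper supplies a tuple where a single value is expected and fails with "too many arguments", as the comment notes. A small illustration with hypothetical barcodes:

rows = [('TCGA-01-0001-01A',), ('TCGA-01-0002-01A',)]  # shape of the select result
params = [row[0] for row in rows]                       # one bare barcode per update, as in fixedcursor
print(params)  # ['TCGA-01-0001-01A', 'TCGA-01-0002-01A']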
def main(configfilename):
    print datetime.now(), 'begin process bucket info'
    with open(configfilename) as configFile:
        config = json.load(configFile)

    log_dir = str(date.today()).replace(
        '-', '_') + '_' + config['log_dir_tag'] + '_update_uploaded' + '/'
    log_name = create_log(log_dir, 'top_processing')
    log = logging.getLogger(log_name)

    log.info('begin process bucket info')
    try:
        gcs_wrapper.open_connection()
        file_exts = config['buckets']['update_exts']
        for bucket in config['buckets']['update_uploaded']:
            bucket_name = bucket['bucket_name']
            with open(bucket['outputfile'], 'w') as outfile:
                upload_files = gcs_wrapper.get_bucket_contents(
                    bucket_name, log)
                for upload_file_pair in upload_files:
                    for file_ext in file_exts:
                        if upload_file_pair[0].endswith(file_ext):
                            outfile.write('\t'.join(upload_file_pair) + '\n')
                            break
    except Exception as e:
        log.exception('problem processing bucket info')
        raise e
    finally:
        gcs_wrapper.close_connection()
    log.info('finish process bucket info')
    print datetime.now(), 'finish process bucket info'
Example #6
def main(configFileName):
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)

        log_dir = str(date.today()).replace(
            '-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(log_dir, 'create_datadictionary')
        log = logging.getLogger(log_name)
        log.info('begin create metadata data dictionary')

        listlist = []
        with open('MetadataDataDictionary_v3.txt', 'r') as datadict:
            #read the header line and discard
            datadict.readline()
            for line in datadict:
                fields = [field.strip() for field in line.split('\t')]
                if 10 != len(fields):
                    raise ValueError('line wrong length: %s' % fields)
                listlist += [fields]
        ISBCGC_metadata_database_helper.initialize(config, log)
        ISBCGC_metadata_database_helper.insert(config, listlist,
                                               'metadata_datadictionary', log)

        log.info('end create metadata data dictionary')
    except Exception as e:
        raise e
    def process_bio(self, program_name, program, log_dir):
        log_dir = log_dir + 'bio' + '/'
        log = getLogger(create_log(log_dir, 'bio'))

        log.info('processing {} for bio'.format(program_name))
        bio_storage2source2barcodes = {}

        cases, samples = self.get_gdc_barcode_info(program_name, log_dir)
        source2barcodes = bio_storage2source2barcodes.setdefault('gdc', {})
        source2barcodes['api'] = (cases, samples)

        sql2bq = program().bio_sql2bq()
        sql2cases, sql2samples = self.get_sql_barcodes(sql2bq)

        bq2cases, bq2samples = self.get_bq_barcodes(sql2bq.values())
        for table, sqlcases in sql2cases.iteritems():
            if sql2bq[table][0]:
                bqcases = bq2cases[sql2bq[table][0]]
            else:
                bqcases = set()

            sqlsamples = sql2samples[table]
            if sql2bq[table][0]:
                bqsamples = bq2samples[sql2bq[table][0]]
            else:
                bqsamples = set()

            source2barcodes = bio_storage2source2barcodes.setdefault('sql', {})
            source2barcodes[table] = (sqlcases, sqlsamples)

            source2barcodes = bio_storage2source2barcodes.setdefault('bq', {})
            source2barcodes[table] = (bqcases, bqsamples)

        log.info('finished {} for bio'.format(program_name))
        return bio_storage2source2barcodes
def process_projects(config, endpt_type, program, log_dir):
    try:
        log_name = create_log(log_dir, program + '_' + 'project')
        log = logging.getLogger(log_name)

        log.info('begin process_projects for %s' % (program))
        project2info = get_map_rows(config, endpt_type, 'project', program,
                                    get_filter(program), log)
        program2info = {program: project2info[project2info.keys()[0]]}
        if config['process_program']:
            save2db(
                config, endpt_type, 'metadata_program', program2info,
                config[program]['process_projects']['program_table_mapping'],
                log)
        else:
            log.warning(
                '\n\t====================\n\tnot saving to db for programs this run!\n\t===================='
            )
        if config['process_project']:
            save2db(
                config, endpt_type, '%s_metadata_project' % program,
                project2info,
                config[program]['process_projects']['project_table_mapping'],
                log)
        else:
            log.warning(
                '\n\t====================\n\tnot saving to db for projects this run!\n\t===================='
            )
        log.info('finished process_projects for %s' % (program))
        return project2info
    except:
        log.exception('problem processing projects for %s' % (program))
        raise
    finally:
        close_log(log)
Example #9
    def setUp(self):
        with open(self.config) as configFile:
            self.config = json.load(configFile)

        log_dir = str(date.today()).replace('-', '_') + self.log_tag
        log_name = create_log(log_dir, 'gdc_test')
        self.log = logging.getLogger(log_name)
def main(configFileName):
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)
        
        log_dir = str(date.today()).replace('-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(log_dir, 'create_datadictionary')
        log = logging.getLogger(log_name)
        log.info('begin create metadata data dictionary')
        
        listlist = []
        with open('MetadataDataDictionary_v3.txt', 'r') as datadict:
            #read the header line and discard
            datadict.readline()
            for line in datadict:
                fields = [field.strip() for field in line.split('\t')]
                if 10 != len(fields):
                    raise ValueError('line wrong length: %s' % fields)
                listlist += [fields]
        ISBCGC_metadata_database_helper.initialize(config, log)
        ISBCGC_metadata_database_helper.insert(config, listlist, 'metadata_datadictionary', log)

        log.info('end create metadata data dictionary')
    except Exception as e:
        raise e
def main(configfilename, baminfo_filename):
    log_dir = str(date.today()).replace('-', '_') + '_gg_update' + '/'
    log_name = create_log(log_dir, 'update_gg_metadata')
    log = logging.getLogger(log_name)
    log.info('begin update gg metadata')

    idcol = 0
    ggdatasetcol = 7
    ggreadgroupset = 6
    nexist = 'NA'
    updates = []
    count = 0
    with open(baminfo_filename) as baminfo:
        baminfo.readline()
        for line in baminfo:
            fields = line.strip().split('\t')
            if fields[ggreadgroupset] == nexist:
                continue
            values = [
                fields[ggdatasetcol], fields[ggreadgroupset], fields[idcol]
            ]
            if 0 == count % 1000:
                log.info('%s processing row %s--%s' %
                         (datetime.now(), count, ':'.join(values)))
            count += 1
            updates += [values]

    stmt = 'update metadata_data set GG_dataset_id = %s, GG_readgroupset_id = %s where analysis_id = %s'
    with open(configfilename) as configFile:
        config = json.load(configFile)
    ISBCGC_database_helper.update(config, stmt, log, updates, False)

    log.info('finished update gg metadata')
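The update above pairs a three-placeholder statement with a list of three-element value lists; each inner list is consumed positionally, so its order has to match the %s order in the SQL. The ISBCGC_database_helper internals are not shown in this example, so the sketch below assumes a standard DB-API driver's executemany, with placeholder connection details and hypothetical values:

import pymysql  # assumption: any DB-API driver with executemany behaves the same way

stmt = 'update metadata_data set GG_dataset_id = %s, GG_readgroupset_id = %s where analysis_id = %s'
updates = [
    ['dataset_a', 'readgroupset_1', 'analysis-0001'],  # hypothetical values
    ['dataset_a', 'readgroupset_2', 'analysis-0002'],
]

conn = pymysql.connect(host='localhost', user='user', password='pw', db='metadata')  # placeholder credentials
try:
    with conn.cursor() as cursor:
        cursor.executemany(stmt, updates)  # one execution per inner list, in placeholder order
    conn.commit()
finally:
    conn.close()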
Example #12
def process_annotations(config, endpt_type, log_dir):
    try:
        log_name = create_log(log_dir, '%s_annotations' % (endpt_type))
        log = logging.getLogger(log_name)

        for program_name in config['program_names_for_annotation']:
            log.info('begin process_annotations for %s' % (program_name))
            annotation2info = get_map_rows(config, endpt_type, 'annotation',
                                           program_name, get_filter(), log)
            add_barcodes(annotation2info)
            save2db(
                config, endpt_type, '%s_metadata_annotation' % program_name,
                annotation2info, config['%s' % (program_name)]
                ['process_annotations']['annotation_table_mapping'], log)

            if 'etl' in config[program_name]['process_annotations']:
                etl(config, log)
            log.info('finished process_annotations %s' % (program_name))

        return annotation2info
    except:
        log.exception('problem processing annotations:')
        raise
    finally:
        close_log(log)
Example #13
def main(config_file_name):
    log = None
    try:
        with open(config_file_name) as configFile:
            config = json.load(configFile)
        
        log_dir = str(date.today()).replace('-', '_') + '_' + 'ccle/'
        log_name = create_log(log_dir, 'update_ccle_gcs_paths')
        log = logging.getLogger(log_name)
        
        log.info('begin updating CCLE paths in production')
        # first thing to do is to read in the file paths from BigQuery
        query = 'SELECT file_gdc_id, file_gcs_url ' \
            'FROM [isb-cgc:GDC_metadata.GDCfileID_to_GCSurl] ' \
            'where 0 < instr(file_gcs_url, \'CCLE\')'

        query_results = query_bq_table(query, True, 'isb-cgc', log)
        _, rows, _ = fetch_paged_results(query_results, 2000, None, None, log)
        
        log.info('\tcreate map of filename to path')
        name2path = {}
        for row in rows:
            fields = row[1].split('/')
            name2path[fields[-1]] = '/'.join(fields[3:])
        log.info('\tfinished map of filename to path')
        
        # get the db rows from production cloudsql
        log.info('\tselect ccle filenames from cloudsql')
        query = 'SELECT datafilename ' \
            'FROM main.metadata_data ' \
            'where 0 < instr(datafilename, \'bam\') and project = \'CCLE\''
        
        rows = helper.select(config, query, log, [])
        log.info('\tselected %s ccle filenames from cloudsql' % (len(rows)))
        
        # now setup and do the update of paths in cloud sql
        log.info('\tstart updating paths in cloudsql')
        params = []
        not_matched = []
        for row in rows:
            if row[0] in name2path:
                params += [[name2path[row[0]], row[0]]]
            else:
                not_matched += [row[0]]
        update = 'update main.metadata_data set datafilenamekey = %s where datafilename = %s'
        helper.update(config, update, log, params)
        log.info('\tcompleted update of paths in cloudsql. updated %d, did not find matches from BQ in cloudsql for %s' % (len(params), ', '.join(not_matched)))

        log.info('finished updating CCLE paths in production')
    except:
        if log:
            log.exception('failed to update ccle GCS filepaths')
    finally:
        if log:
            close_log(log)
def getlog(configFileName):
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)
        log_dir = str(date.today()).replace('-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(log_dir, 'create_ccle_metadata')
        log = logging.getLogger(log_name)
    except Exception as e:
        traceback.print_exc(5)
        raise e
    return config, log
Example #15
def getlog(configFileName):
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)
        log_dir = str(date.today()).replace(
            '-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(log_dir, 'create_ccle_metadata')
        log = logging.getLogger(log_name)
    except Exception as e:
        traceback.print_exc(5)
        raise e
    return config, log
Example #16
def main(config_filename):
    try:
        with open(config_filename) as configFile:
            config = json.load(configFile)

        log_dir = str(date.today()).replace('-', '_') + '_gdc_upload_run/'
        log_name = create_log(log_dir, 'gdc_upload')
        log = logging.getLogger(log_name)
        log.info('begin melting miRNA isoform matrix')
        test_file = './tcga_etl_pipeline/mirna_isoform_matrix/hiseq/expn_matrix_mimat_norm_IlluminaHiSeq_miRNASeq_small.txt'
        log.info('finished melting miRNA isoform matrix')
    except:
        raise
def train_nn(sess, epochs, batch_size, get_batches_fn, train_op, cross_entropy_loss, input_image,
             correct_label, keep_prob, learning_rate):
    """
    Train neural network and print out the loss during training.
    :param sess: TF Session
    :param epochs: Number of epochs
    :param batch_size: Batch size
    :param get_batches_fn: Function to get batches of training data.  Call using get_batches_fn(batch_size)
    :param train_op: TF Operation to train the neural network
    :param cross_entropy_loss: TF Tensor for the amount of loss
    :param input_image: TF Placeholder for input images
    :param correct_label: TF Placeholder for label images
    :param keep_prob: TF Placeholder for dropout keep probability
    :param learning_rate: TF Placeholder for learning rate
    """
    training_log = util.create_log()
    # util.log_n_print(training_log, "BATCH_SIZE : "+BATCH_SIZE+" EPOCHS : "+EPOCHS+" LEARNING_RATE : "+LEARNING_RATE+" KEEP_PROB : "+KEEP_PROB+"\n")
    util.log_n_print(training_log, "BATCH_SIZE : %d \nEPOCHS : %d \nLEARNING_RATE : %f \nKEEP_PROB : %f \n" % (BATCH_SIZE, EPOCHS, LEARNING_RATE, KEEP_PROB))
    # DONE: Implement function
    min_loss = 1e7
    # saver = tf.train.Saver()
    best_model_path = None
    loss_history = []
    loss = 1
    if not util.can_continue():
        util.log_n_print(training_log, "####### Training will be Force stopped after 1 Epoch #######")
    if is_in_training:
        util.log_n_print(training_log, "Training started....")
        for epoch in range(epochs):
            avg_loss = 0.0
            total_images = 0
            util.log_n_print(training_log, "EPOCH : %d" % (epoch + 1))
            for image, label in get_batches_fn(batch_size):
                _, loss = sess.run([train_op, cross_entropy_loss],
                                   feed_dict={input_image: image, correct_label: label,
                                              keep_prob: KEEP_PROB, learning_rate: LEARNING_RATE})
                avg_loss += loss * image.shape[0]
                total_images += image.shape[0]
                if total_images % 70 == 0:
                    util.log_n_print(training_log, "EPOCH : %d  Images processed : %d " % (epoch + 1, total_images))
            avg_loss /= total_images
            loss_history.append(avg_loss)
            if not util.can_continue():
                util.log_n_print(training_log, "Forced stopped after epoch : %d" % epoch)
                break

            util.log_n_print(training_log, "Loss : = {:.3f}".format(avg_loss))
            # saving best model as checkpoint
            # if loss < min_loss:
            #     best_model_path = saver.save(sess, "/checkpoints/best_model.ckpt")
            #     min_loss = loss
        util.log_n_print(training_log, 'loss_history : ' + str(loss_history))
    return best_model_path
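train_nn accumulates each batch's loss weighted by the number of images in the batch and divides by the total image count, giving a per-image epoch average instead of a per-batch one, so a smaller final batch cannot skew the number. The arithmetic in isolation, with hypothetical batch sizes and losses:

batches = [(64, 0.90), (64, 0.70), (32, 0.40)]  # (images in batch, mean loss for that batch)
total_images = sum(n for n, _ in batches)
avg_loss = sum(n * loss for n, loss in batches) / float(total_images)
print(avg_loss)  # 0.72, versus ~0.667 if the three batch losses were averaged directly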
Example #18
def main(project_name, bucket_name, path_file, config_file):
    log_dir = str(date.today()).replace('-', '_') + '_bam_report_/'
    log_name = create_log(log_dir, 'top_processing')
    log = logging.getLogger(log_name)
    log.info('begin bam report')
    
    buckets, path2bam2bai = read_path_file(path_file, log)
    path2contents = get_bucket_content(project_name, buckets, log)
    with open(config_file) as con_file:
        config = json.load(con_file)
    update_database(path2bam2bai, config, log)
        
    count = 0
    no_index = set()
    ext_pairs2count = {}
    for contents in path2contents.itervalues():
        if 0 == count % 1024:
            log.info('\tprocessing contents {}: {}'.format(count, contents))
        count += 1
        file_prefix = None
        for file_name in contents:
            parts = file_name.split('.')
            if 'bam' in parts[-1]:
                bam_ext = parts[-1]
                file_prefix = '.'.join(parts[:-1])
                break
        
        if not file_prefix:
            log.warn('\tdid not find a bam file for {}'.format(contents))
            continue
        contents.remove(file_name)
        if 1 < len(contents):
            log.warn('\tfound more than 2 files for bam file {}: {}'.format(file_name, contents))
            continue
        if 0 == len(contents):
            log.warn('\tdidn\'t find an index file for bam file {}'.format(file_name))
            no_index.add(file_name)
            continue
        
        index_file = contents.pop()
        index_ext = index_file[len(file_prefix) + 1:]
        pair_count = ext_pairs2count.setdefault((bam_ext, index_ext), 0)
        ext_pairs2count[(bam_ext, index_ext)] = pair_count + 1
        
    if 0 < len(no_index):
        log.info('\n{}\n\tno-index file: \n\t\t{}\n\textension pairs:\n\t\t{}'.format(bucket_name, '\n\t\t'.join(no_index), '\n\t\t'.join('{}: {}'.format(pairs, count) for pairs, count in ext_pairs2count.iteritems())))
    else:
        log.info('\textension pairs:\n\t\t{}'.format('\n\t\t'.join('{}: {}'.format(pairs, count) for pairs, count in ext_pairs2count.iteritems())))
    log.info('bam report completed')
    def process_bq(self, program_name, program, bq_results, log_dir):
        log_dir = log_dir + 'bq' + '/'
        log = getLogger(create_log(log_dir, '{}_bq'.format(program_name)))
        # data type: isb_label, bq table, sample_barcode
        isb_label2tables = program().bq_datasets()
        params = []
        for data_type, info in isb_label2tables.iteritems():
            params += [[
                program_name, data_type, info[0], info[1], info[2], info[3],
                bq_results, log_dir
            ]]
        calls = {'fn': self.compare_isb_label_bq, 'labels': {'params': params}}
        launch_threads(self.config, 'labels', calls, self.log)

        log.info('processing {} bq'.format(program_name))
        log.info('finished {} bq'.format(program_name))
    def compare_isb_label_bq(self, program_name, data_type, isb_label,
                             bq_table, sample_barcode, has_file, bq_results,
                             log_dir):
        log_dir = log_dir + isb_label + '/'
        log = getLogger(
            create_log(log_dir, '{}_{}_bq'.format(program_name, data_type)))
        log.info('\tprocessing {}-{} for bq'.format(program_name, isb_label))

        api_project2cases, api_project2samples, api_project2files = self.get_api_data_types_barcodes(
            program_name, [data_type], log)
        if 'somatic' not in data_type.lower():
            bq_project2cases, bq_project2samples, bq_project2files = self.get_bq_data_type_barcodes(
                program_name, bq_table, sample_barcode, has_file, log)
        else:
            if "Simple somatic mutation" == data_type:
                bq_project2cases1, bq_project2samples1, bq_project2files1 = self.get_bq_data_type_barcodes(
                    program_name, bq_table[0], sample_barcode[0], has_file,
                    log)
                bq_project2cases_normal1, bq_project2samples_normal1, bq_project2files_normal1 = self.get_bq_data_type_barcodes(
                    program_name, bq_table[0], sample_barcode[1], has_file,
                    log)

                bq_project2cases2, bq_project2samples2, bq_project2files2 = self.get_bq_data_type_barcodes(
                    program_name, bq_table[1], sample_barcode[0], has_file,
                    log)
                bq_project2cases_normal2, bq_project2samples_normal2, bq_project2files_normal2 = self.get_bq_data_type_barcodes(
                    program_name, bq_table[1], sample_barcode[1], has_file,
                    log)

                bq_project2cases = self.merge_set_lists(
                    bq_project2cases1, bq_project2cases2)
                bq_project2samples = self.merge_set_lists(
                    bq_project2samples1, bq_project2samples2)
                bq_project2files = self.merge_set_lists(
                    bq_project2files1, bq_project2files2)
            else:
                bq_project2cases, bq_project2samples, bq_project2files = self.get_bq_data_type_barcodes(
                    program_name, bq_table, sample_barcode[0], has_file, log)
                bq_project2cases_normal, bq_project2samples_normal, bq_project2files_normal = self.get_bq_data_type_barcodes(
                    program_name, bq_table, sample_barcode[1], has_file, log)
        label_project2cases, label_project2samples = self.get_gcs_isb_label_barcodes(
            program_name, isb_label, log)

        project2barcodes = bq_results.setdefault(isb_label, {})
        api_cases = set(case for cases in api_project2cases.itervalues()
                        for case in cases)
Example #21
def process_program(config, endpt_type, program_name, projects, log_dir):
    try:
        log_dir += program_name + '_%s' % endpt_type + '/'
        log_name = create_log(log_dir, program_name)
        log = logging.getLogger(log_name)
        log.info('begin process_program(%s)' % (program_name))

        future2project = {}
        initialize_etl(config, program_name, log)
        with futures.ThreadPoolExecutor(
                max_workers=config['program_threads']) as executor:
            for project in projects:
                if project in config['skip_projects']:
                    log.info('\tskipping project %s' % (project))
                    continue
                if 0 == len(config['project_name_restrict']
                            ) or project in config['project_name_restrict']:
                    log.info('\tprocessing project %s' % (project))
                    future2project[executor.submit(process_project, config,
                                                   endpt_type, program_name,
                                                   project, log_dir)] = project
                else:
                    log.info('\tnot processing project %s' % (project))

        future_keys = future2project.keys()
        while future_keys:
            future_done, future_keys = futures.wait(
                future_keys, return_when=futures.FIRST_COMPLETED)
            for future in future_done:
                project = future2project.pop(future)
                if future.exception() is not None:
                    log.exception('\t%s generated an exception--%s: %s' %
                                  (project, type(future.exception()).__name__,
                                   future.exception()))
                else:
                    future.result()
                    log.info('\tfinished project %s' % (project))
        finalize_etl(config, program_name, log)

        log.info('finished process_program(%s)' % (program_name))
    except:
        log.exception('problem processing program %s' % (program_name))
        raise
    finally:
        close_log(log)
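process_program (and process_project further down) drains its ThreadPoolExecutor with futures.wait(..., FIRST_COMPLETED), so each submitted job's result or exception is handled as soon as it finishes rather than in submission order. A stripped-down, self-contained version of that loop; the work function is invented for illustration:

from concurrent import futures


def do_work(name):
    # hypothetical stand-in for process_project
    if name == 'bad':
        raise ValueError('simulated failure')
    return name.upper()


with futures.ThreadPoolExecutor(max_workers=2) as executor:
    future2name = {executor.submit(do_work, n): n for n in ['alpha', 'bad', 'beta']}
    future_keys = list(future2name.keys())
    while future_keys:
        done, future_keys = futures.wait(future_keys, return_when=futures.FIRST_COMPLETED)
        for future in done:
            name = future2name.pop(future)
            if future.exception() is not None:
                print('%s generated an exception: %s' % (name, future.exception()))
            else:
                print('%s finished with %s' % (name, future.result()))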
    def process_gcs(self, program_name, program, results, log_dir):
        log_dir = log_dir + 'gcs' + '/'
        log = getLogger(create_log(log_dir, '{}_gcs'.format(program_name)))
        log.info('processing {} for gcs'.format(program_name))

        isb_label2tables = program().gcs_datasets()
        params = []
        for isb_label, data_types in isb_label2tables.iteritems():
            params += [[program_name, isb_label, data_types, results, log_dir]]
        calls = {
            'fn': self.compare_isb_label_gcs,
            'labels': {
                'params': params
            }
        }
        launch_threads(self.config, 'labels', calls, self.log)

        log.info('finished {} for gcs'.format(program_name))
    def compare_isb_label_gcs(self, program_name, isb_label, data_types,
                              results, log_dir):
        log_dir = log_dir + isb_label + '/gcs/'
        log = getLogger(
            create_log(log_dir, '{}_{}_gcs'.format(program_name, isb_label)))
        log.info('\tprocessing {}-{} for gcs'.format(program_name, isb_label))

        api_project2cases, api_project2samples, api_project2files = self.get_api_data_types_barcodes(
            program_name, data_types, log)
        gcs_project2cases, gcs_project2samples, gcs_project2files = self.get_gcs_data_types_barcodes(
            program_name, data_types, log)
        label_project2cases, label_project2samples = self.get_gcs_isb_label_barcodes(
            program_name, isb_label, log)

        project2barcodes = results.setdefault(isb_label, {})

        api_cases = set(case for cases in api_project2cases.itervalues()
                        for case in cases)
    def get_gdc_barcode_info(self, program_name, log_dir):
        log = getLogger(create_log(log_dir, 'barcode_info'))
        log.info('processing {} for barcode information'.format(program_name))

        # get the total count to parallelize barcode fetches
        barcode2info = self.request_gdc_barcode_info(program_name,
                                                     program_name, 1, 1, 1)
        # divide into batches based on total
        info = barcode2info.popitem()[1]
        total = info['total']
        log.info('\tfetching {} cases for {}'.format(total, info))
        batch = total / 20
        log.info('\tlooking at batches of {} repeated 20 times for {}'.format(
            batch, program_name))
        params = []
        cur_start = 1

        for i in range(21):
            params += [[
                program_name + '_%s' % (i), program_name, cur_start,
                min(batch, 200), batch
            ]]
            log.info('\t\tbatch {}: {}'.format(i, params[-1]))
            cur_start += batch

        calls = {
            'fn': self.request_gdc_barcode_info,
            'batches': {
                'params': params
            }
        }
        barcode2info = launch_threads(self.config, 'batches', calls, self.log)
        samples = set()
        for info in barcode2info.itervalues():
            #             if 0 != len(set(cursamples) & set(samples)):
            #                 raise ValueError('saw repeated barcode: {}'.format(set(cursamples) & set(cursamples)))
            samples |= set(info['sample_barcodes'])

        log.info(
            '\tfinished {} for barcode information.  found {} cases and {} samples'
            .format(program_name, len(barcode2info), len(samples)))
        return set(barcode2info.keys()), samples
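get_gdc_barcode_info splits the reported case total into roughly twenty equal windows and hands each start offset to a worker thread; the twenty-first iteration covers whatever remainder the integer division leaves behind. The slicing arithmetic on its own, with a hypothetical total:

total = 1137                # hypothetical case count reported by the endpoint
batch = total // 20         # integer division, as in the example above
params = []
cur_start = 1
for i in range(21):         # one extra window so 20 * batch plus the remainder is still covered
    params.append((cur_start, batch))
    cur_start += batch
print(params[0], params[-1])  # (1, 56) ... (1121, 56), which reaches past 1137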
Example #25
def uploadTCGA(configFileName):
    print datetime.now(), 'begin uploadTCGA()'
    global executor
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)
        
        log_dir = str(date.today()).replace('-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(log_dir, 'top_processing')
        log = logging.getLogger(log_name)
        log.info('begin uploadTCGA()')
        
        executor = futures.ThreadPoolExecutor(max_workers=config['threads'])
        
        module = import_module(config['database_module'])
        module.ISBCGC_database_helper.initialize(config, log)
     
        if config['upload_files'] or config['upload_etl_files']:
            # open the GCS wrapper here so it can be used by all the tumor types/platforms to save files
            gcs_wrapper.open_connection()
        info_status(config, log)
        tumor_type2platform2archive_types2archives, platform2archive2metadata = process_latestarchive(config, log_name)
        prepare_upload(tumor_type2platform2archive_types2archives, log)
        if 'process_cghub' not in config or config['process_cghub']:
            tumor_type2cghub_records = process_cghub(config, log=log, removedups=True, limit=-1)
        else:
            log.warning('\n\t====================\n\tnot processing CGHub records this run!\n\t====================')
            tumor_type2cghub_records = {}
        barcode2metadata = process_metadata_current(config, log_name)
        if 'process_annotations' not in config or config['process_annotations']:
            barcode2annotations = process_annotations(config, log_name)
        else:
            log.warning('\n\t====================\n\tnot processing annotations this run!\n\t====================')
            barcode2annotations = {}
        process_tumortypes(config, log_dir, tumor_type2platform2archive_types2archives, platform2archive2metadata, tumor_type2cghub_records, barcode2metadata, barcode2annotations, log)
    finally:
        if executor:
            executor.shutdown(wait=False)
        if gcs_wrapper:
            gcs_wrapper.close_connection()
    log.info('finish uploadTCGA()')
    print datetime.now(), 'finish uploadTCGA()'
Example #26
def uploadGDC():
    print datetime.now(), 'begin uploadGDC()'

    gcs_wrapper = None
    try:
        args = parseargs()
        with open(args.config) as configFile:
            config = json.load(configFile)

        log_dir = str(date.today()).replace(
            '-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(log_dir, 'top_processing')
        log = logging.getLogger(log_name)

        log.info('begin uploadGDC()')

        initializeDB(config, log)

        if config['upload_files'] or config['upload_etl_files']:
            # open the GCS wrapper here so it can be used by all the projects/platforms to save files
            gcs_wrapper = import_module(config['gcs_wrapper'])
            gcs_wrapper.open_connection(config, log)

        for endpt_type in config['endpt_types']:
            log.info('processing %s endpoints' % (endpt_type))
            if config['process_annotation']:
                process_annotations(config, endpt_type, log_dir)
            else:
                log.warning(
                    '\n\t====================\n\tnot processing annotations this run!\n\t===================='
                )
            process_programs(config, endpt_type, log_dir, log)
        finalize(config, log)
    except:
        raise
    finally:
        if gcs_wrapper:
            gcs_wrapper.close_connection()

    log.info('finished uploadGDC()')
    print datetime.now(), 'finished uploadGDC()'
Example #27
def main(configfilename):
    print datetime.now(), 'begin update DatafileUploaded'
    with open(configfilename) as configFile:
        config = json.load(configFile)

    log_dir = str(date.today()).replace(
        '-', '_') + '_' + config['log_dir_tag'] + '_update_uploaded' + '/'
    log_name = create_log(log_dir, 'top_processing')
    log = logging.getLogger(log_name)
    log.info('begin update DatafileUploaded')
    try:
        for path_file in config['buckets']['update_uploaded']:
            with open(path_file, 'r') as paths:
                updateDatafileUploaded(config, paths, log)

        update_nulls(config, log)
    except Exception as e:
        log.exception('problem updating DatafileUploaded')
        raise e
    log.info('finish update DatafileUploaded')
    print datetime.now(), 'finish update DatafileUploaded'
Example #28
def process_cases(config, endpt_type, program_name, project_name, log_dir):
    try:
        log_name = create_log(log_dir, project_name + '_cases')
        log = logging.getLogger(log_name)

        log.info('begin process_cases(%s)' % (project_name))
        case2info = get_map_rows(config, endpt_type, 'case', program_name,
                                 get_filter(project_name), log)
        save2db(
            config, endpt_type, '%s_metadata_clinical' % (program_name),
            case2info,
            config[program_name]['process_cases']['clinical_table_mapping'],
            log)

        remove_null_samples(case2info, log)
        save2db(config, endpt_type, '%s_metadata_biospecimen' % (program_name),
                case2info,
                config[program_name]['process_cases']['sample_table_mapping'],
                log)

        # fill in the rest of the metadata depending on the program
        if 0 < len(case2info.values()):
            postproc_module = import_module(
                config[program_name]['process_cases']['postproc_case']
                ['postproc_module'])
            postproc_module.postprocess(config, project_name, endpt_type, log)

        log.info('finished process_cases(%s)' % (project_name))

        #         log.info('begin process_cases(%s) for omf files' % (project_name))
        #         omf2info = get_omf_map_rows(config, project_name, log)
        #         save2db(config, 'metadata_gdc_clinical', case2info, config['process_cases']['clinical_table_mapping'], log)
        #         log.info('finished process_cases(%s) for omf files' % (project_name))

        return case2info
    except:
        log.exception('problem processing cases(%s):' % (project_name))
        raise
    finally:
        close_log(log)
Example #29
    def __init__(self, logger=None, log_level=logging.INFO):
        classname = type(self).__name__
        # print("classname: ", classname)
        if logger is None:
            self.logger = create_log(log_name=classname, level=log_level)
        else:
            self.logger = logger

        self.logger.info("\n-----------------")
        self.logger.info("Begin to init")
        self.logger.info("\n-----------------")

        self.sub = Subtitle(self.logger)
        self.sub.set_parse(True)
        self.files = dict()
        self.filetypes = ['srt', 'bak', 'm3u', 'txt']
        self.vediotypes = ['mkv', 'mp4', 'avi']
        self.rmsrt = False

        if self.sub.lexicon_path is None:
            self.sub.set_lexicon_file("lexicon/lexicon.xlsx")
        pass
Example #30
    def __init__(self, logger=None, loglevel=logging.INFO):
        if (logger is None):
            self.logger = create_log(log_name="odsWord", level=loglevel)
        else:
            self.logger = logger

        self.logger.info("\n-----------------")
        self.logger.info("Begin to init")
        self.logger.info("\n-----------------")
        self.files = []

        self.lexicon = set()
        self.stem_lexicon = dict()
        self.noUsed = set(["“", "—", "”"])

        self.punctuation = r".?\[\]!,\":%;()|^=+\/\\_`\*;.:><~"

        self.lexicon_path = None
        self.nameSet = set(names.words('male.txt') + names.words('female.txt'))
        self.checkup = False
        self.debug = False
        self.newWords = None
        pass
Example #31
def process_project(config, endpt_type, program_name, project, log_dir):
    try:
        log_dir += project + '/'
        log_name = create_log(log_dir, project)
        log = logging.getLogger(log_name)

        log.info('begin process_project(%s)' % (project))

        case2info = {}
        if config['process_case']:

            log.info('\tprocess cases for %s' % (project))
            case2info = process_cases(config, endpt_type, program_name,
                                      project, log_dir)
            log.info('\tcompleted process cases for %s' % (project))
        else:
            log.warning(
                '\n\t====================\n\tnot processing cases this run for %s!\n\t===================='
                % (project))

        file2info = {}
        if config['process_data_type']:
            with futures.ThreadPoolExecutor(
                    max_workers=config['project_threads']) as executor:
                log.info('\tprocess data_types for %s' % (project))
                future2data_type = {}
                data_types = request_facets_results(
                    config['files_endpt']['%s endpt' % (endpt_type)],
                    config['facets_query'], 'data_type', log)
                for data_type in data_types:
                    if ((len(config['data_type_restrict']) == 0
                         or data_type in config['data_type_restrict'])
                            and (data_type in config['data_type2isb_label']
                                 or data_type in config['data_type_gcs'])):
                        log.info('\t\tprocess data_type \'%s\' for %s' %
                                 (data_type, project))
                        future2data_type[executor.submit(
                            process_data_type, config, endpt_type,
                            program_name, project, data_type,
                            log_dir)] = data_type
                    else:
                        log.info('\t\tnot processing data_type %s for %s' %
                                 (data_type, project))

                retry_ct = 0
                data_type2retry = {}
                future_keys = future2data_type.keys()
                while future_keys:
                    future_done, _ = futures.wait(
                        future_keys, return_when=futures.FIRST_COMPLETED)
                    try:
                        for future in future_done:
                            data_type = future2data_type.pop(future)
                            if future.exception() is not None:
                                # TODO only retry on connection refused, not other exceptions
                                retry_ct = data_type2retry.setdefault(
                                    data_type, 0)
                                if retry_ct > 3:
                                    raise ValueError(
                                        '%s failed multiple times--%s:%s' %
                                        (data_type, type(
                                            future.exception()).__name__,
                                         future.exception()))
                                data_type2retry[data_type] = retry_ct + 1
                                log.warning(
                                    '\tWARNING: resubmitting %s--%s:%s.  try %s'
                                    % (data_type, type(
                                        future.exception()).__name__,
                                       future.exception(), retry_ct))
                                new_future = executor.submit(
                                    process_data_type, config, endpt_type,
                                    program_name, project, data_type, log_dir,
                                    project + '_' +
                                    data_type.replace(' ', '') + '_%d' %
                                    (retry_ct))
                                future2data_type[new_future] = data_type
                            else:
                                log.info(
                                    '\t\tfinished process data_type \'%s\' for %s'
                                    % (data_type, project))
                                file2info = future.result()
                                future_keys = future2data_type.keys()
                    except:
                        future_keys = future2data_type.keys()
                        log.exception('%s failed for %s' %
                                      (data_type, project))
                log.info('\tcompleted process data_types for %s' % (project))
        else:
            log.warning(
                '\n\t====================\n\tnot processing data types this run for %s!\n\t===================='
                % (project))

        log.info('finished process_project(%s)' % (project))
        return case2info, file2info
    except:
        log.exception('problem processing project %s' % (project))
        raise
Example #32
def create_field_report(configfilename):
    log = None
    try:
        with open(configfilename) as configFile:
            config = js.load(configFile)

        log_dir = str(date.today()).replace('-', '_') + '_log' + '/'
        log_name = create_log(log_dir, 'field_report')
        log = logging.getLogger(log_name)
        log.info('start create_field_report()')

        endpts = config['field_report']['endpoints']
        # fetch the queryable fields
        output_path = config['field_report']['output_path']
        output_file = str(date.today()).replace(
            '-', '_') + '_' + config['field_report']['output_file']
        with open(output_path + output_file, 'w') as output:
            for endpoint in endpts:
                output.write('Field value report:\n')
                log.info('\tstart endpoint \'%s\'' % (endpoint))
                output.write('\tendpoint \'%s\'\n' % (endpoint))
                template2field2values = {}
                mapping_templates = config['field_report'][
                    'endpoint_mapping_templates']
                url_templates = config['field_report']['url_templates']
                for mapping_template, url_template in zip(
                        mapping_templates, url_templates):
                    response = requests.get(mapping_template % (endpoint))
                    response.raise_for_status()
                    fields = response.json()['_mapping'].keys()
                    fields.sort()

                    field2values = template2field2values.setdefault(
                        url_template, {})
                    mod_count = len(fields) / 20
                    count = 0
                    log.info('\t\tfinding field values for base url \'%s\'' %
                             (url_template.split('%')[0]))
                    output.write(
                        '\t\tfinding field values for base url \'%s\'\n' %
                        (url_template.split('%')[0]))
                    log.info(
                        '\t\tgot information on the fields for %s.  found %s fields'
                        % (endpoint, len(fields)))
                    output.write('\t\tfound %s fields\n' % (len(fields)))
                    for field in fields:
                        try:
                            progress = False if len(
                                fields
                            ) < 71 else True if 0 == count else 0 == count % mod_count
                            get_values_for_field(endpoint, field, field2values,
                                                 output, url_template,
                                                 progress, log)
                        except:
                            log.exception('problem for field %s' % (field))
                            raise
                        count += 1

                log.info(
                    'start field value comparison between current and legacy endpoints'
                )
                output.write('\nField value comparison report:\n')
                regfield2values = None
                legfield2values = None
                for url in template2field2values:
                    if 'legacy' in url:
                        legfield2values = template2field2values[url]
                    else:
                        regfield2values = template2field2values[url]

                onlyreg = []
                same_counts = []
                reg_nocounts = []
                leg_nocounts = []
                same_buckets = []
                same_buckets_no = []
                same_buckets_many = []
                reg_nobucketvalues = []
                leg_nobucketvalues = []
                reg_nomanybucketvalues = []
                leg_nomanybucketvalues = []
                for field in regfield2values:
                    regvalues = regfield2values[field]
                    if field not in legfield2values:
                        onlyreg += [field]
                        continue
                    legvalues = legfield2values[field]
                    if 'error' in regvalues:
                        continue
                    if 'count' in regvalues:
                        if 'count' in legvalues:
                            output.write('\tprocessing %s\n' % (field))
                            output.write(
                                '\t\t%s is a count field for both endpoints\n'
                                % (field))
                            if 'no values' == regvalues[
                                    'count'] and 'no values' != legvalues[
                                        'count']:
                                reg_nocounts += [field]
                            elif 'no values' == legvalues[
                                    'count'] and 'no values' != regvalues[
                                        'count']:
                                leg_nocounts += [field]
                            else:
                                same_counts += [field]
                        elif 'buckets' in legvalues:
                            # this (not surprisingly) doesn't appear to happen
                            output.write(
                                '\tprocessing equiv and diffs for %s\n' %
                                (field))
                            output.write(
                                '\t\t%s is a count field for the current endpoint and a buckets field for legacy\n'
                                % (field))
                    if 'buckets' in regvalues:
                        if 'error' in regvalues:
                            continue
                        elif 'count' in legvalues:
                            # this (not surprisingly) doesn't appear to happen
                            output.write('\tprocessing %s\n' % (field))
                            output.write(
                                '\t\t%s is a buckets field for the current endpoint and a count field for legacy\n'
                                % (field))
                        elif 'buckets' in legvalues:
                            if 'no values' == regvalues[
                                    'buckets'] and 'no values' != legvalues[
                                        'buckets']:
                                reg_nobucketvalues += [field]
                            elif 'no values' == legvalues[
                                    'buckets'] and 'no values' != regvalues[
                                        'buckets']:
                                leg_nobucketvalues += [field]
                            elif 'many values' == regvalues[
                                    'buckets'] and 'many values' != legvalues[
                                        'buckets']:
                                leg_nomanybucketvalues += [field]
                            elif 'many values' == legvalues[
                                    'buckets'] and 'many values' != regvalues[
                                        'buckets']:
                                reg_nomanybucketvalues += [field]
                            elif regvalues['buckets'] not in (
                                    'no values', 'many values'
                            ) and legvalues['buckets'] not in ('no values',
                                                               'many values'):
                                equiv = True
                                if 0 < len(regvalues['buckets'] -
                                           legvalues['buckets']):
                                    output.write(
                                        '\tprocessing equiv and diffs for %s\n'
                                        % (field))
                                    output.write(
                                        '\t\t\tthe current endpoint has these additional values:\n\t\t\t\t%s\n'
                                        % ('\n\t\t\t\t'.join(
                                            str(value) for value in
                                            (regvalues['buckets'] -
                                             legvalues['buckets']))))
                                    equiv = False
                                if 0 < len(legvalues['buckets'] -
                                           regvalues['buckets']):
                                    if 0 == len(regvalues['buckets'] -
                                                legvalues['buckets']):
                                        output.write(
                                            '\tprocessing equiv and diffs for %s\n'
                                            % (field))
                                    output.write(
                                        '\t\t\tthe legacy endpoint has these additional values:\n\t\t\t\t%s\n'
                                        % ('\n\t\t\t\t'.join(
                                            str(value) for value in
                                            (legvalues['buckets'] -
                                             regvalues['buckets']))))
                                    equiv = False
                                if equiv:
                                    same_buckets += [field]
                                else:
                                    output.write(
                                        '\t\t\tthe current and legacy endpoint share these values:\n\t\t\t\t%s\n'
                                        % ('\n\t\t\t\t'.join(
                                            str(value) for value in
                                            (legvalues['buckets']
                                             & regvalues['buckets']))))
                            else:
                                if 'no values' == regvalues['buckets']:
                                    same_buckets_no += [field]
                                elif 'many values' == regvalues['buckets']:
                                    same_buckets_many += [field]
                                else:
                                    raise ValueError(
                                        'unexpected case for %s: %s %s' %
                                        (field, regvalues['buckets'],
                                         legvalues['buckets']))

                if (0 < len(same_counts)):
                    output.write(
                        '\tcount fields for both endpoints:\n\t\t%s\n' %
                        ('\n\t\t'.join(sorted(same_counts))))
                if (0 < len(reg_nocounts)):
                    output.write(
                        '\tregular endpoints that have no count values but the legacy does:\n\t\t%s\n'
                        % ('\n\t\t'.join(sorted(reg_nocounts))))
                if (0 < len(leg_nocounts)):
                    output.write(
                        '\tlegacy endpoints that have no values but the current does:\n\t\t%s\n'
                        % ('\n\t\t'.join(sorted(leg_nocounts))))

                output.write('\n\tsummary comparison of field counts:\n')
                if (0 < len(same_buckets)):
                    output.write(
                        '\t\tequivalent bucket fields for both endpoints:\n\t\t%s\n'
                        % ('\n\t\t'.join(sorted(same_buckets))))
                if (0 < len(same_buckets_no)):
                    output.write(
                        '\t\tbucket fields with no values for both endpoints:\n\t\t%s\n'
                        % ('\n\t\t'.join(sorted(same_buckets_no))))
                if (0 < len(same_buckets_many)):
                    output.write(
                        '\t\tbucket fields with many values for both endpoints:\n\t\t%s\n'
                        % ('\n\t\t'.join(sorted(same_buckets_many))))
                if (0 < len(reg_nobucketvalues)):
                    output.write(
                        '\t\tregular endpoints that have no bucket values but the legacy does:\n\t\t%s\n'
                        % ('\n\t\t'.join(sorted(reg_nobucketvalues))))
                if (0 < len(leg_nobucketvalues)):
                    output.write(
                        '\t\tlegacy endpoints that have no bucket values but the current does:\n\t\t%s\n'
                        % ('\n\t\t'.join(sorted(leg_nobucketvalues))))
                if (0 < len(reg_nomanybucketvalues)):
                    output.write(
                        '\t\tregular endpoints that don\'t have many bucket values but the legacy does:\n\t\t%s\n'
                        % ('\n\t\t'.join(sorted(reg_nomanybucketvalues))))
                if (0 < len(leg_nomanybucketvalues)):
                    output.write(
                        '\t\tlegacy endpoints that don\'t have many bucket values but the current does:\n\t\t%s\n'
                        % ('\n\t\t'.join(sorted(leg_nomanybucketvalues))))

                output.write('\t\tfields only in current endpoint:\n\t\t%s\n' %
                             '\n\t\t'.join(sorted(onlyreg)))
                onlyleg = []
                for field in legfield2values:
                    if field not in regfield2values:
                        onlyleg += [field]
                output.write('\t\tfields only in legacy endpoint:\n\t\t%s\n' %
                             '\n\t\t'.join(sorted(onlyleg)))
                log.info(
                    'finished field value comparison between current and legacy endpoints'
                )

        log.info('finished create_field_report()')
    except:
        if log:
            log.exception('problem with creating the field report')
        raise
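# A minimal, hypothetical sketch of the set arithmetic the bucket comparison above
# relies on: given the bucket values reported for one field by the current and the
# legacy endpoint, report the values unique to each side and the values they share.
# The name compare_buckets and the return shape are illustrative only; they are not
# part of the original module.
def compare_buckets(field, current_buckets, legacy_buckets):
    only_current = current_buckets - legacy_buckets
    only_legacy = legacy_buckets - current_buckets
    return {
        'field': field,
        'equivalent': not only_current and not only_legacy,
        'only_current': sorted(str(value) for value in only_current),
        'only_legacy': sorted(str(value) for value in only_legacy),
        'shared': sorted(str(value) for value in (current_buckets & legacy_buckets)),
    }

# for example, compare_buckets('gender', {'male', 'female'}, {'female'}) reports
# 'male' as a value only the current endpoint returns.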
Example #33
0
def uploadTCGA(configFileName):
    '''
    based on the configuration map loaded from the configFileName, loads the DCC data into GCS.  also
    obtains metadata based on file paths, SDRF values, and CGHub manifest values
    
    parameters:
        configFileName: the file name of the configuration map
    '''
    print datetime.now(), 'begin uploadTCGA()'
    global executor
    gcs_wrapper = None
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)

        run_dir = str(date.today()).replace(
            '-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(run_dir, 'top_processing')
        log = logging.getLogger(log_name)
        log.info('begin uploadTCGA()')
        executor = futures.ThreadPoolExecutor(max_workers=config['threads'])
        info_status(config, log)

        setup_database(config, log)

        # open the GCS wrapper here so it can be used by all the tumor types/platforms to save files
        gcs_wrapper = import_module(config['gcs_wrapper'])
        gcs_wrapper.open_connection(config, log)

        tumor_type2platform2archive_types2archives, platform2archive2metadata = process_latestarchive(
            config, run_dir, log_name)
        prepare_upload(tumor_type2platform2archive_types2archives, log)
        if 'process_cghub' not in config or config['process_cghub']:
            tumor_type2cghub_records = process_cghub(config,
                                                     run_dir,
                                                     log=log,
                                                     removedups=True,
                                                     limit=-1)
        else:
            log.warning(
                '\n\t====================\n\tnot processing CGHub records this run!\n\t===================='
            )
            tumor_type2cghub_records = {}
        barcode2metadata = process_metadata_current(config, run_dir, log_name)
        if 'process_annotations' not in config or config['process_annotations']:
            barcode2annotations = process_annotations(config, run_dir,
                                                      log_name)
        else:
            log.warning(
                '\n\t====================\n\tnot processing annotations this run!\n\t===================='
            )
            barcode2annotations = {}
        process_tumortypes(config, run_dir,
                           tumor_type2platform2archive_types2archives,
                           platform2archive2metadata, tumor_type2cghub_records,
                           barcode2metadata, barcode2annotations, log)

        # associate the annotation metadata with the other metadata tables
        associate_metadata2annotation(config, log)

        # print out the stats
        metadata_modules = config['metadata_modules']
        for metadata_module in metadata_modules:
            module = import_module(metadata_module)
            module.print_combined_stats(log)
    finally:
        if executor:
            executor.shutdown(wait=False)
    log.info('finish uploadTCGA()')

    try:
        # upload the logs and TCGA files used for upload to GCS
        upload_run_files(config, run_dir, log)
    except Exception as e:
        log.exception('problem moving the logs and run files to GCS')
    finally:
        if gcs_wrapper:
            gcs_wrapper.close_connection()

    print datetime.now(), 'finish uploadTCGA()'
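# A hedged usage sketch for uploadTCGA().  The file name below is hypothetical, and
# the keys shown are only the ones read in the code above (log_dir_tag, threads,
# gcs_wrapper, process_cghub, process_annotations, metadata_modules, plus
# process_bio and platforms used downstream); a real configuration will likely
# need additional keys used elsewhere in the module.
#
# example_config.json might look like:
# {
#     "log_dir_tag": "tcga_upload",
#     "threads": 5,
#     "gcs_wrapper": "gcs_wrapper_gcloud",
#     "process_cghub": true,
#     "process_annotations": true,
#     "process_bio": true,
#     "platforms": ["all"],
#     "metadata_modules": ["metadata_statistics"]
# }
if __name__ == '__main__':
    uploadTCGA('example_config.json')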
Example #34
0
def process_tumortype(config, log_dir, tumor_type,
                      platform2archive_types2archives,
                      platform2archive2metadata, cghub_records,
                      barcode2metadata, barcode2annotations):
    '''
    process the study/tumor_type for uploading the files from the dcc to GCS and to obtain metadata to save to mysql.  loops through the
    platforms in parallel
    
    parameters:
        config: the configuration map
        log_dir: the base directory for the logs
        tumor_type: the TCGA study being processed
        platform2archive_types2archives: map of platforms to archive types ('maf', 'mage-tab', and 'data') to archives
        platform2archive2metadata: map of platforms to archive name to the archive metadata
        cghub_records: CGHub metadata records
        barcode2metadata: metadata from metadata.current.txt
        barcode2annotations: map of barcodes to TCGA annotations
    
    returns:
        clinical_metadata:  metadata from the clinical bio files
        biospecimen_metadata: metadata from the biospecimen bio files
        flattened_data_map: metadata from the file paths and SDRF files
    '''
    print '\t', datetime.now(), '\tprocessing tumor type %s' % (tumor_type)
    log_name = create_log(log_dir + tumor_type + '/', tumor_type)
    log = logging.getLogger(log_name)
    log.info('\tprocessing tumor type %s' % (tumor_type))

    if config['process_bio']:
        try:
            clinical_metadata, biospecimen_metadata, exclude_samples = parse_bio(
                config, platform2archive_types2archives['bio']['bio'],
                tumor_type, platform2archive2metadata['bio'],
                create_log(log_dir + tumor_type + '/', tumor_type + '_bio'))
        except Exception as e:
            log.exception('problem parsing bio and sample files')
            raise e
    else:
        clinical_metadata = {}
        biospecimen_metadata = {}
        exclude_samples = set()

    all_platforms = True
    platforms = []
    if 'platforms' in config:
        platforms = config['platforms']
        if not (0 == len(platforms) or
                (1 == len(platforms) and 'all' == platforms[0])):
            all_platforms = False
    aliquot2filename2metadata = {}
    future2platform = {}
    for platform, archive_types2archives in platform2archive_types2archives.iteritems(
    ):
        if 'bio' == platform:
            continue
        if all_platforms or platform in (platforms):
            log_name = tumor_type + '_' + platform
            future2platform[executor.submit(
                process_platform, config, log_dir, log_name, tumor_type,
                platform, platform2archive2metadata[platform],
                archive_types2archives, barcode2annotations,
                exclude_samples)] = platform

    platform2retry = {}
    future_keys = future2platform.keys()
    while future_keys:
        future_done, _ = futures.wait(future_keys,
                                      return_when=futures.FIRST_COMPLETED)
        try:
            for future in future_done:
                platform = future2platform.pop(future)
                if future.exception() is not None:
                    # TODO only retry on connection refused, not other exceptions
                    retry_ct = platform2retry.setdefault(platform, 0)
                    if retry_ct > 3:
                        raise ValueError('%s failed multiple times: %s' %
                                         (platform, future.exception()))
                    platform2retry[platform] = retry_ct + 1
                    log.warning('\tWARNING: resubmitting %s: %s.  try %s' %
                                (platform, future.exception(), retry_ct))
                    new_future = executor.submit(
                        process_platform, config, log_dir,
                        tumor_type + '_' + platform + '_' + str(retry_ct + 1),
                        tumor_type, platform,
                        platform2archive2metadata[platform],
                        platform2archive_types2archives[platform],
                        barcode2annotations, exclude_samples)
                    future2platform[new_future] = platform
                else:
                    merge_metadata(aliquot2filename2metadata, future.result(),
                                   platform, log)
                    future_keys = future2platform.keys()
        except:
            future_keys = future2platform.keys()
            log.exception('%s failed' % (platform))

    try:
        merge_metadata_current_metadata(aliquot2filename2metadata,
                                        barcode2metadata, log)
        merge_cghup(config, aliquot2filename2metadata, cghub_records, log)
        # data map has a different structure than the clinical and biospecimen maps, remove the top map of aliquot to file_list metadata and combine all the files
        # for compatibility in calls to the data store and etl
        flattened_data_map = {}
        for aliquot, file_name2field2value in aliquot2filename2metadata.iteritems(
        ):
            for file_name, field2value in file_name2field2value.iteritems():
                flattened_data_map[aliquot + ':' + file_name] = field2value

        # do this per platform to parallelize
        store_metadata(config, log, 'metadata_clinical', clinical_metadata)
        store_metadata(config, log, 'metadata_biospecimen',
                       biospecimen_metadata)
        store_metadata(config, log, 'metadata_data', flattened_data_map)

        samples_metadata = process_metadata_samples(clinical_metadata,
                                                    biospecimen_metadata,
                                                    aliquot2filename2metadata,
                                                    log)
        store_metadata(config, log, 'metadata_samples', samples_metadata)
    except Exception as e:
        log.exception('problem storing metadata for %s' % (tumor_type))
        raise e
    print '\t', datetime.now(), 'finished tumor type %s' % (tumor_type)
    log.info('\tfinished tumor type %s' % (tumor_type))
    return clinical_metadata, biospecimen_metadata, flattened_data_map
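# A self-contained, hypothetical sketch of the submit/wait/retry pattern used above:
# each task is resubmitted on failure up to a fixed number of times before the whole
# run is aborted.  The names run_with_retries and worker are illustrative, and the
# original loop additionally rebuilds the per-retry log name, which this sketch omits.
from concurrent import futures

def run_with_retries(executor, worker, items, max_retries=3):
    # map each in-flight future back to the item it is processing
    future2item = {executor.submit(worker, item): item for item in items}
    item2retry = {}
    results = {}
    while future2item:
        done, _ = futures.wait(future2item, return_when=futures.FIRST_COMPLETED)
        for future in done:
            item = future2item.pop(future)
            if future.exception() is not None:
                retry_ct = item2retry.get(item, 0)
                if retry_ct >= max_retries:
                    raise ValueError('%s failed %s times: %s'
                                     % (item, retry_ct, future.exception()))
                item2retry[item] = retry_ct + 1
                # resubmit the failed item and keep waiting on the rest
                future2item[executor.submit(worker, item)] = item
            else:
                results[item] = future.result()
    return results

# example:
# with futures.ThreadPoolExecutor(max_workers=4) as pool:
#     results = run_with_retries(pool, some_platform_worker, ['platform_a', 'platform_b'])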
Example #35
0
def process_tumortype(config, log_dir, tumor_type, platform2archive_types2archives, platform2archive2metadata, cghub_records, barcode2metadata, barcode2annotations):
    print '\t', datetime.now(), 'processing tumor type %s' % (tumor_type)
    log_name = create_log(log_dir + tumor_type + '/', tumor_type)
    log = logging.getLogger(log_name)
    log.info( '\tprocessing tumor type %s' % (tumor_type))
    
    if config['process_bio']:
        try:
            clinical_metadata, biospecimen_metadata, ffpe_samples = parse_bio(config, platform2archive_types2archives['bio']['bio'], tumor_type, platform2archive2metadata['bio'], create_log(log_dir + tumor_type + '/', tumor_type + '_bio'))
        except Exception as e:
            log.exception('problem parsing bio and sample files')
            raise e
    else:
        clinical_metadata = {}
        biospecimen_metadata = {}
        ffpe_samples = set()
        
    all_platforms = True
    platforms = []
    if 'platforms' in config:
        platforms = config['platforms']
        if not (0 == len(platforms) or (1 == len(platforms) and 'all' == platforms[0])):
            all_platforms = False
    aliquot2filename2metadata = {}
    future2platform = {}
    for platform, archive_types2archives in platform2archive_types2archives.iteritems():
        if 'bio' == platform:
            continue
        if all_platforms or platform in (platforms):
            log_name = tumor_type + '_' + platform
            future2platform[executor.submit(process_platform, config, log_dir, log_name, tumor_type, platform, platform2archive2metadata[platform], archive_types2archives, barcode2annotations, ffpe_samples)] = platform
 
    platform2retry = {}
    future_keys = future2platform.keys()
    while future_keys:
        future_done, _ = futures.wait(future_keys, return_when = futures.FIRST_COMPLETED)
        try:
            for future in future_done:
                platform = future2platform.pop(future)
                if future.exception() is not None:
                    # TODO only retry on connection refused, not other exceptions
                    retry_ct = platform2retry.setdefault(platform, 0)
                    if retry_ct > 3:
                        raise ValueError('%s failed multiple times: %s' % (platform, future.exception()))
                    platform2retry[platform] = retry_ct + 1
                    log.warning('\tWARNING: resubmitting %s: %s.  try %s' % (platform, future.exception(), retry_ct))
                    new_future = executor.submit(process_platform, config, log_dir, tumor_type + '_' + platform + '_' + str(retry_ct + 1), tumor_type, 
                            platform, platform2archive2metadata[platform], platform2archive_types2archives[platform], barcode2annotations, ffpe_samples)
                    future2platform[new_future] = platform
                else:
                    merge_metadata(aliquot2filename2metadata, future.result(), platform, log)
                    future_keys = future2platform.keys()
        except:
            future_keys = future2platform.keys()
            log.exception('%s failed' % (platform))
 
    try:
        merge_metadata_current_metadata(aliquot2filename2metadata, barcode2metadata, log)
        merge_cghup(config, aliquot2filename2metadata, cghub_records, log)
        # data map has a different structure than the clinical and biospecimen maps, remove the top map of aliquot to file_list metadata and combine all the files
        # for compatibility in calls to the data store and etl
        flattened_data_map = {}
        for aliquot, file_name2field2value in aliquot2filename2metadata.iteritems():
            for file_name, field2value in file_name2field2value.iteritems():
                flattened_data_map[aliquot + ':' + file_name] = field2value

        # do this per platform to parallelize
        store_metadata(config, log, 'metadata_clinical', clinical_metadata)
        store_metadata(config, log, 'metadata_biospecimen', biospecimen_metadata)
        store_metadata(config, log, 'metadata_data', flattened_data_map)

        samples_metadata = process_metadata_samples(clinical_metadata, biospecimen_metadata, aliquot2filename2metadata, log)
        store_metadata(config, log, 'metadata_samples', samples_metadata)
    except Exception as e:
        log.exception('problem storing metadata for %s' % (tumor_type))
        raise e
    print '\t', datetime.now(), 'finished tumor type %s' % (tumor_type)
    log.info('\tfinished tumor type %s' % (tumor_type))
    return clinical_metadata, biospecimen_metadata, flattened_data_map
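# A small, hypothetical illustration of the flattening step above: the nested
# aliquot -> file name -> field map is collapsed into a single map keyed by
# 'aliquot:file_name' so it can be stored the same way as the clinical and
# biospecimen maps.  The barcode and field values below are made up.
def flatten_data_map(aliquot2filename2metadata):
    flattened_data_map = {}
    for aliquot, file_name2field2value in aliquot2filename2metadata.items():
        for file_name, field2value in file_name2field2value.items():
            flattened_data_map[aliquot + ':' + file_name] = field2value
    return flattened_data_map

# flatten_data_map({'TCGA-XX-0000-01A': {'some_file.txt': {'platform': 'IlluminaHiSeq'}}})
# returns {'TCGA-XX-0000-01A:some_file.txt': {'platform': 'IlluminaHiSeq'}}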
Example #36
0
    def process_program(self, program_name, program, log_dir):
        try:
            log_dir = log_dir + program_name + '/'
            log = getLogger(create_log(log_dir, program_name))
            log.info('processing {}'.format(program_name))

            output_bio_compare = 'case and sample compare:\n'
            bio_storage2source2barcodes = self.process_bio(
                program_name, program, log_dir)
            cases, samples = bio_storage2source2barcodes['gdc']['api']
            for sql_source, barcodes in bio_storage2source2barcodes[
                    'sql'].iteritems():
                sqlcases, sqlsamples = barcodes
                sources = sorted(bio_storage2source2barcodes['bq'].keys())
                for bq_source in sources:
                    barcodes = bio_storage2source2barcodes['bq'][bq_source]
                    bqcases, bqsamples = barcodes
                    output_bio_compare += self.compare_barcodes(
                        program_name, 'sql-{}:bq-{}'.format(
                            sql_source, bq_source), 'case', cases, sqlcases,
                        'sql', bqcases, 'bq', log) + '\n'
                    output_bio_compare += self.compare_barcodes(
                        program_name, 'sql-{}:bq-{}'.format(
                            sql_source, bq_source), 'sample', samples,
                        sqlsamples, 'sql', bqsamples, 'bq', log) + '\n{}\n'

            output_bio_counts = 'Case and Sample compares for {} clinical and biospecimen\n\nGDC Case API:\ncases\tsamples\n{}\t{}\n\nCloud SQL\n'.format(
                program_name, len(cases), len(samples))
            for source, barcodes in bio_storage2source2barcodes[
                    'sql'].iteritems():
                sqlcases, sqlsamples = barcodes
                output_bio_counts += '{}:\ncases\tsamples\n{}\t{}\n\n'.format(
                    source, len(sqlcases), len(sqlsamples))

            output_bio_counts += 'BigQuery\n'
            sources = sorted(bio_storage2source2barcodes['bq'].keys())
            for source in sources:
                bqcases, bqsamples = bio_storage2source2barcodes['bq'][source]
                output_bio_counts += '{}:\ncases\tsamples\n{}\t{}\n\n'.format(
                    source, len(bqcases), len(bqsamples))

            gcs_results = {}
            self.process_gcs(program_name, program, gcs_results, log_dir)
            output_gcs_compare = 'case, sample and file compare for gcs vs. isb_label:\n'
            output_gcs_counts = ''
            for isb_label in gcs_results:
                for project, barcodes in gcs_results[isb_label].iteritems():
                    output_gcs_compare += self.compare_barcodes(
                        program_name, '{0}:project-{1}:label-{2}'.format(
                            program_name, project,
                            isb_label), 'case', barcodes[0], barcodes[1],
                        'gcs', barcodes[2], 'label', log) + '\n'
                    output_gcs_compare += self.compare_barcodes(
                        program_name, '{0}:project-{1}:label-{2}'.format(
                            program_name, project,
                            isb_label), 'sample', barcodes[3], barcodes[4],
                        'gcs', barcodes[5], 'label', log) + '\n'
                    output_gcs_compare += self.compare_barcodes(
                        program_name, '{0}:project-{1}:label-{2}'.format(
                            program_name, project,
                            isb_label), 'file', barcodes[6], barcodes[7],
                        'gcs', set(), 'label', log) + '\n{}\n'
                    if 'all' == project:
                        output_gcs_counts = '{}Case and Sample compares for {} Google Cloud Storage\n\nTotals:\ncases\napi\tgcs\tisb_label\n{}\t{}\t{}\nsamples\napi\tgcs\tisb_label\n{}\t{}\t{}\nfiles\napi\tgcs\n{}\t{}\n\n' \
                            .format('{}\n'.format('*' * 20), program_name, len(barcodes[0]), len(barcodes[1]), len(barcodes[2]), len(barcodes[3]), len(barcodes[4]), len(barcodes[5]), len(barcodes[6]), len(barcodes[7]))

            bq_results = {}
            self.process_bq(program_name, program, bq_results, log_dir)
            output_bq_compare = 'case, sample and file compare for bq vs. isb_label:\n'
            output_bq_counts = ''
            for isb_label in bq_results:
                for project, barcodes in bq_results[isb_label].iteritems():
                    output_bq_compare += self.compare_barcodes(
                        program_name, '{0}:project-{1}:label-{2}'.format(
                            program_name, project,
                            isb_label), 'case', barcodes[0], barcodes[1], 'bq',
                        barcodes[2], 'label', log) + '\n'
                    output_bq_compare += self.compare_barcodes(
                        program_name, '{0}:project-{1}:label-{2}'.format(
                            program_name, project,
                            isb_label), 'sample', barcodes[3], barcodes[4],
                        'bq', barcodes[5], 'label', log) + '\n'
                    output_bq_compare += self.compare_barcodes(
                        program_name, '{0}:project-{1}:label-{2}'.format(
                            program_name, project,
                            isb_label), 'file', barcodes[6], barcodes[7], 'bq',
                        set(), 'label', log) + '\n'
                    if 'all' == project:
                        output_bq_counts = '{}Case and Sample compares for {} Google BigQuery\n\nTotals:\ncases\napi\tbq\tisb_label\n{}\t{}\t{}\nsamples\napi\tbq\tisb_label\n{}\t{}\t{}\nfiles\napi\tbq\n{}\t{}\n\n' \
                            .format('{}\n'.format('*' * 20), program_name, len(barcodes[0]), len(barcodes[1]), len(barcodes[2]), len(barcodes[3]), len(barcodes[4]), len(barcodes[5]), len(barcodes[6]), len(barcodes[7]))

            with open(
                    'gdc/doc/' + str(date.today()).replace('-', '_') +
                    '_{}_validate_bq_gcs_label.txt'.format(program_name),
                    'w') as out:
                out.writelines([
                    'Validity Report\n\n', output_bio_counts,
                    output_bio_compare, output_gcs_counts, output_gcs_compare,
                    output_bq_counts, output_bq_compare
                ])
                out.write(
                    'Differences:\n\tapi\tgcs\tisb_label\tbq\t\napi\t{}\n')

            log.info('finished {}'.format(program_name))
        except:
            log.exception('problem processing {}'.format(program_name))
            raise
        return {}
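# A hedged sketch of what a compare_barcodes()-style helper could look like; this is
# not the original implementation, only an illustration of the comparison the report
# above performs (the GDC API barcodes against two other storage backends).
def compare_barcode_sets(program_name, tag, barcode_type,
                         api_barcodes, barcodes1, label1, barcodes2, label2):
    lines = ['{}:{} {} comparison'.format(program_name, tag, barcode_type)]
    for label, barcodes in ((label1, set(barcodes1)), (label2, set(barcodes2))):
        missing = set(api_barcodes) - barcodes
        extra = barcodes - set(api_barcodes)
        lines.append('\t{}: {} total, {} missing vs. the api, {} not in the api'
                     .format(label, len(barcodes), len(missing), len(extra)))
    return '\n'.join(lines)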
Example #37
0
    if sub_type == 'save':
        sub.dump_data()
    elif sub_type == 'cloud':
        sub.cloud()

    print()
    end_dtime = datetime.now()
    # print("End time: "+str(end_dtime))
    timedelta = end_dtime - start_dtime
    print("Cost time: " + str(timedelta))

    # getChecksum(sys.argv[1])
    pass


if __name__ == "__main__":
    logger = create_log(log_name="subtitle", level=logging.INFO)
    logger.info("\n\n\n-------------------\n begin \n-------------------\n")
    logger.info(sys.argv)

    if len(sys.argv) < 2:
        # print "need args!!"
        logger.error("need args!!sys.argv:{0}".format(sys.argv))
        usage()
        logger.info("-----------------\n\n\n")
        sys.exit(2)

    main(sys.argv[1:], logger)
    logger.info("\n-----------------\n finish \n-----------------\n\n\n")
    pass
Example #38
0
def main(argv=None, log_ger=None):
    if log_ger is None:
        log_ger = create_log(log_name="subtitle", level=logging.INFO)

    fname = None

    start_dtime = datetime.now()
    # print("Start time: "+str(start_dtime))#.strftime("%Y-%m-%d %H:%M:%S"))
    print()
    # sub=Subtitle(logging.getLogger())
    sub = Subtitle(log_ger)

    try:
        opts, args = getopt.getopt(
            argv,
            "hvf:w:t:d:e:p:s:b:?lm:WDc",
            ["help", "version", "parse",
             "checkup" "file=", "word=",
             "type=", "dir=", "pickle=",
             "limit=", "section=", "bigger="])
        # print opts, args
        log_ger.info("opts:{0};args:{1}".format(opts, args))
    except getopt.GetoptError as msg:
        print("error happened when get options!!! error:{0}".format(msg))
        usage()
        log_ger.error("getopt.GetoptError:{0}, exit!".format(msg))
        sys.exit(2)
    except Exception as msg:
        log_ger.error("error:{0}, exit!".format(msg))
        sys.exit(2)

    _is_lines_show = False
    _is_words_show = False
    sub_type = ""
    words_limit = None
    for opt, arg in opts:
        if opt in ("-?", "-h", "--help"):
            usage()
            sys.exit()
            pass
        elif opt in ("-v", "--version"):
            version()
            sys.exit()
            pass
        elif opt in ("-b", "--bigger"):
            sub.set_times_bigger(int(arg))
            pass
        elif opt in ("-c", "--checkup"):
            sub.checkup = True
            pass
        elif opt in ("-d", "--dir"):
            print("Sorry, -d --dir option still not offer")
            sys.exit()
            pass
        elif opt in ("-e", "--excel"):
            pkl = arg
            sub.set_lexicon_file(pkl)
            pass
        elif opt in ("-s", "--section"):
            if ',' in arg:
                section = arg.split(',')
                if len(section) == 2:
                    # print(section)
                    start, end = section
                    if len(start) != 0:
                        sub.set_start(int(start))
                    if len(end) != 0:
                        sub.set_end(int(end))
                    # print(start, end)
                else:
                    print("something wrong, with option -s --section:", arg)
                    sys.exit()
            else:
                print("something wrong, with option -s --section:", arg)
                sys.exit()

            pass
        elif opt in ('-f', "--file"):
            fname = arg
            sub.add_file(fname)
            pass
        elif opt in ('-p', "--parse"):
            sub.set_parse(True)
            pass

        elif opt == '-D':
            log_ger.setLevel(logging.DEBUG)
            sub.set_logger(log_ger)
            sub.set_debug(True)
            pass
        elif opt in ("-w", "--word"):
            word = arg
            sub.add_word(word)
            # mostly used for testing, so skip writing the output
            sub.set_output(False)
            pass
        elif opt in ("-t", "--type"):
            sub_type = arg
            if sub_type not in ('save', 'scan', 'cloud'):
                usage()
                sys.exit()
                pass
            pass
        elif opt in ("-m", "--limit"):
            words_limit = int(arg)
            # print words_limit
            _is_words_show = True
            pass
        elif opt == '-l':
            # show lines
            _is_lines_show = True
            pass
        elif opt == '-W':
            # show words
            _is_words_show = True
            pass

    """
    if(len(sys.argv)<2):
      print "need args!!"
      log_ger.error("need args!!sys.argv:{0}".format(sys.argv))
      return None
      pass
    """
    # print sys.argv

    # sub.add_punctuation([',','!',';','.',':','>','<'])
    # sub.addLexicon(["hello", "world"])

    if sub.lexicon_path is None:
        sub.set_lexicon_file("lexicon/lexicon.xlsx")
    sub.load_old_data()

    sub.add_files(args)
    # sub.add_strings("hello world, I'm wang. Please call me wang.")

    sub.check_all(encode='utf-8')

    if _is_lines_show:
        sub.lines_show(words_limit)
        pass

    if _is_words_show:
        # print words_limit
        sub.words_show(words_limit)
        pass
    sub.show()

    if sub_type == 'save':
        sub.dump_data()
    elif sub_type == 'cloud':
        sub.cloud()

    print()
    end_dtime = datetime.now()
    # print("End time: "+str(end_dtime))
    timedelta = end_dtime - start_dtime
    print("Cost time: " + str(timedelta))

    # getChecksum(sys.argv[1])
    pass
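# Hypothetical invocations of main() above, showing how the getopt options combine.
# The subtitle file name is made up; '-t save' dumps the collected data, '-W'/'-m'
# show up to N words, '-l' shows the matching lines, '-s' restricts processing to a
# section range, and '-w' tests a single word with output writing disabled.
if __name__ == '__main__':
    log_ger = create_log(log_name="subtitle", level=logging.INFO)
    # count words in one subtitle file, show the top 50, and save the results
    main(["-f", "movie.srt", "-t", "save", "-W", "-m", "50"], log_ger)
    # restrict processing to section 10 through 200 and show the matching lines
    # main(["-f", "movie.srt", "-s", "10,200", "-l"], log_ger)
    # test a single word with debug logging
    # main(["-w", "hello", "-D"], log_ger)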