def process_platform(config, log_dir, log_name, tumor_type, platform, archive2metadata, archive_types2archives, barcode2annotations, ffpe_samples):
    try:
        create_log(log_dir + tumor_type + '/', log_name)
        log = logging.getLogger(log_name)
        if 'mage-tab' not in archive_types2archives:
            # guard against a missing 'data' key so the set difference can't raise a KeyError
            orphan_data_archives = (set([archive_info[0] for archive_info in archive_types2archives['data']]) if 'data' in archive_types2archives else set()) - \
                set((archive_info[0] for archive_info in archive_types2archives['maf']) if 'maf' in archive_types2archives else [])
            if 0 < len(orphan_data_archives):
                log.warning('\tno mage-tab archives for %s but there are data archives that are not maf: %s' % (platform, orphan_data_archives))
            else:
                log.warning('\tno mage-tab archives for %s' % (platform))
            if 'maf' in archive_types2archives:
                return process_maf_files(config, archive_types2archives['maf'], {}, {}, archive2metadata, log)
            return {}
        sdrf_metadata = process_sdrf(config, log, archive_types2archives['mage-tab'], archive2metadata, barcode2annotations)
        if 'data' in archive_types2archives:
            upload_archives(config, log, archive_types2archives['data'], sdrf_metadata, archive2metadata, ffpe_samples)
        else:
            log.warning('\tno data archives found for %s' % (tumor_type + ':' + platform))
        if 'maf' in archive_types2archives:
            process_maf_files(config, archive_types2archives['maf'], sdrf_metadata, archive_types2archives.get('data', {}), archive2metadata, log)
        return sdrf_metadata
    except Exception as e:
        log.exception('%s generated an exception' % (platform))
        raise e
def process_platform(config, log_dir, log_name, tumor_type, platform, archive2metadata, archive_types2archives, barcode2annotations, exclude_samples):
    '''
    process the archives associated with the platform to obtain metadata from the sdrf archives and
    to upload the appropriate files from the exploded downloaded archives

    parameters:
        config: the configuration map
        log_dir: the base directory for the logs
        log_name: the name of the log to use to log any messages
        tumor_type: the TCGA study being processed
        platform: the TCGA platform being processed
        archive2metadata: map of archive name to its metadata
        archive_types2archives: map of archive types ('maf', 'mage-tab', and 'data') for this study to its archives
        barcode2annotations: map of barcodes to TCGA annotations
        exclude_samples: a list of barcodes of ffpe samples

    returns:
        sdrf_metadata: the metadata obtained from parsing the SDRF files
    '''
    try:
        create_log(log_dir + tumor_type + '/', log_name)
        log = logging.getLogger(log_name)
        if 'mage-tab' not in archive_types2archives:
            orphan_data_archives = (set([archive_info[0] for archive_info in archive_types2archives['data']]) if 'data' in archive_types2archives else set()) - \
                set((archive_info[0] for archive_info in archive_types2archives['maf']) if 'maf' in archive_types2archives else [])
            if 0 < len(orphan_data_archives):
                log.warning('\tno mage-tab archives for %s but there are data archives that are not maf: %s' % (platform, orphan_data_archives))
            else:
                log.warning('\tno mage-tab archives for %s' % (platform))
            maf_metadata = {}
            if 'maf' in archive_types2archives:
                return process_maf_files(config, archive_types2archives['maf'], maf_metadata, archive2metadata, log)
            return maf_metadata
        sdrf_metadata = process_sdrf(config, log, archive_types2archives['mage-tab'], archive2metadata, barcode2annotations)
        if 'data' in archive_types2archives:
            upload_archives(config, log, archive_types2archives['data'], sdrf_metadata, archive2metadata, exclude_samples)
        else:
            log.warning('\tno data archives found for %s' % (tumor_type + ':' + platform))
        if 'maf' in archive_types2archives:
            process_maf_files(config, archive_types2archives['maf'], sdrf_metadata, archive2metadata, log)
        return sdrf_metadata
    except Exception as e:
        log.exception('%s generated an exception' % (platform))
        raise e
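# A minimal, hypothetical illustration of the orphan-archive check above: when a
# platform has no mage-tab archives, any data archive without a corresponding maf
# archive is flagged.  The archive names and helper name here are made up.
def _orphan_archive_example():
    archive_types2archives = {
        'data': [('center_STUDY.Platform.Level_3.1.0.0', 'archive metadata')],
        'maf': [('center_STUDY.Mutations.Level_2.1.0.0', 'archive metadata')],
    }
    data_names = set(info[0] for info in archive_types2archives['data']) if 'data' in archive_types2archives else set()
    maf_names = set(info[0] for info in archive_types2archives.get('maf', []))
    # the Level_3 data archive has no maf counterpart, so it is reported as orphaned
    return data_names - maf_names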
def process_data_type(config, endpt_type, program_name, project_id, data_type, log_dir, log_name=None):
    try:
        if log_name:
            log_name = create_log(log_dir, log_name)
        else:
            log_name = create_log(log_dir, project_id + '_' + data_type.replace(' ', ''))
        log = logging.getLogger(log_name)
        log.info('begin process_data_type %s for %s' % (data_type, project_id))
        file2info = get_map_rows(config, endpt_type, 'file', program_name, get_filter(config, data_type, project_id), log)
        file2info = filter_null_samples(config, file2info, project_id, data_type, log)
        if data_type in config['data_type_gcs']:
            save2db(config, endpt_type, '%s_metadata_data_%s' % (program_name, config['endpt2genomebuild'][endpt_type]), file2info, config[program_name]['process_files']['data_table_mapping'], log)
            if config['process_paths']:
                set_uploaded_path(config, endpt_type, program_name, project_id, data_type, log)
            if config['process_data_availability'] and data_type not in ('Clinical Supplement', 'Biospecimen Supplement'):
                populate_sample_availibility(config, endpt_type, program_name, project_id, data_type, file2info.values(), log)
            upload_files(config, endpt_type, file2info, program_name, project_id, data_type, log)
        log.info('finished process_data_type %s for %s' % (data_type, project_id))
        return file2info
    except:
        log.exception('problem processing data_type %s for %s' % (data_type, project_id))
        raise
    finally:
        close_log(log)
def main(configFileName):
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)
        log_dir = str(date.today()).replace("-", "_") + "_" + config["log_dir_tag"] + "/"
        log_name = create_log(log_dir, "create_datadictionary")
        log = logging.getLogger(log_name)
        log.info("begin update of has_Illumina_DNASeq")
        select_stmt = "select samplebarcode from metadata_data \
            where 0 < instr(Platform, 'DNASeq') and 0 = instr(Platform, 'Roche') and \
            0 = instr(Platform, 'ABSOLiD') and 0 = instr(Platform, 'PacBio') and \
            project <> 'CCLE' \
            group by samplebarcode"
        cursor = isbcgc_cloudsql_model.ISBCGC_database_helper.select(config, select_stmt, log)
        barcodes = ""
        for blist in cursor[:20]:
            barcodes += "\n\t\t" + blist[0]
        log.info("\tcompleted select, %s rows:\n\t\t%s\n\t\t..." % (len(cursor), barcodes))
        # the rows come back as [(barcode,), (barcode,), ...], so passing them straight to
        # update() fails with too many arguments; flatten to a simple list of barcodes
        fixedcursor = []
        for blist in cursor:
            fixedcursor += [blist[0]]
    except Exception as e:
        log.exception("\tproblem selecting")
        raise e
    try:
        update_stmt = "update metadata_samples set has_Illumina_DNASeq = 1 where samplebarcode = %s"
        cursor = isbcgc_cloudsql_model.ISBCGC_database_helper.update(config, update_stmt, log, fixedcursor)
        log.info("finished update of has_Illumina_DNASeq")
    except Exception as e:
        log.exception("\tproblem updating")
        raise e
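# Tiny illustration of the cursor flattening done above: DB-API style fetches
# return one-element tuples, while the update helper expects bare values.
# The barcodes are made up; only the shape matters.
def _flatten_cursor_example():
    cursor = [('TCGA-01-0001-01',), ('TCGA-01-0002-01',)]
    return [row[0] for row in cursor]  # -> ['TCGA-01-0001-01', 'TCGA-01-0002-01']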
def main(configfilename):
    print datetime.now(), 'begin process bucket info'
    with open(configfilename) as configFile:
        config = json.load(configFile)
    log_dir = str(date.today()).replace('-', '_') + '_' + config['log_dir_tag'] + '_update_uploaded' + '/'
    log_name = create_log(log_dir, 'top_processing')
    log = logging.getLogger(log_name)
    log.info('begin process bucket info')
    try:
        gcs_wrapper.open_connection()
        file_exts = config['buckets']['update_exts']
        for bucket in config['buckets']['update_uploaded']:
            bucket_name = bucket['bucket_name']
            with open(bucket['outputfile'], 'w') as outfile:
                upload_files = gcs_wrapper.get_bucket_contents(bucket_name, log)
                for upload_file_pair in upload_files:
                    for file_ext in file_exts:
                        if upload_file_pair[0].endswith(file_ext):
                            outfile.write('\t'.join(upload_file_pair) + '\n')
                            break
    except Exception as e:
        log.exception('problem processing bucket info')
        raise e
    finally:
        gcs_wrapper.close_connection()
    log.info('finish process bucket info')
    print datetime.now(), 'finish process bucket info'
def main(configFileName):
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)
        log_dir = str(date.today()).replace('-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(log_dir, 'create_datadictionary')
        log = logging.getLogger(log_name)
        log.info('begin create metadata data dictionary')
        listlist = []
        with open('MetadataDataDictionary_v3.txt', 'r') as datadict:
            # read the header line and discard
            datadict.readline()
            for line in datadict:
                fields = [field.strip() for field in line.split('\t')]
                if 10 != len(fields):
                    raise ValueError('line wrong length: %s' % fields)
                listlist += [fields]
        ISBCGC_metadata_database_helper.initialize(config, log)
        ISBCGC_metadata_database_helper.insert(config, listlist, 'metadata_datadictionary', log)
        log.info('end create metadata data dictionary')
    except Exception as e:
        raise e
def process_bio(self, program_name, program, log_dir):
    log_dir = log_dir + 'bio' + '/'
    log = getLogger(create_log(log_dir, 'bio'))
    log.info('processing {} for bio'.format(program_name))
    bio_storage2source2barcodes = {}
    cases, samples = self.get_gdc_barcode_info(program_name, log_dir)
    source2barcodes = bio_storage2source2barcodes.setdefault('gdc', {})
    source2barcodes['api'] = (cases, samples)
    sql2bq = program().bio_sql2bq()
    sql2cases, sql2samples = self.get_sql_barcodes(sql2bq)
    bq2cases, bq2samples = self.get_bq_barcodes(sql2bq.values())
    for table, sqlcases in sql2cases.iteritems():
        if sql2bq[table][0]:
            bqcases = bq2cases[sql2bq[table][0]]
        else:
            bqcases = set()
        sqlsamples = sql2samples[table]
        if sql2bq[table][0]:
            bqsamples = bq2samples[sql2bq[table][0]]
        else:
            bqsamples = set()
        source2barcodes = bio_storage2source2barcodes.setdefault('sql', {})
        source2barcodes[table] = (sqlcases, sqlsamples)
        source2barcodes = bio_storage2source2barcodes.setdefault('bq', {})
        source2barcodes[table] = (bqcases, bqsamples)
    log.info('finished {} for bio'.format(program_name))
    return bio_storage2source2barcodes
def process_projects(config, endpt_type, program, log_dir):
    try:
        log_name = create_log(log_dir, program + '_' + 'project')
        log = logging.getLogger(log_name)
        log.info('begin process_projects for %s' % (program))
        project2info = get_map_rows(config, endpt_type, 'project', program, get_filter(program), log)
        program2info = {program: project2info[project2info.keys()[0]]}
        if config['process_program']:
            save2db(config, endpt_type, 'metadata_program', program2info, config[program]['process_projects']['program_table_mapping'], log)
        else:
            log.warning('\n\t====================\n\tnot saving to db for programs this run!\n\t====================')
        if config['process_project']:
            save2db(config, endpt_type, '%s_metadata_project' % program, project2info, config[program]['process_projects']['project_table_mapping'], log)
        else:
            log.warning('\n\t====================\n\tnot saving to db for projects this run!\n\t====================')
        log.info('finished process_projects for %s' % (program))
        return project2info
    except:
        log.exception('problem processing projects for %s' % (program))
        raise
    finally:
        close_log(log)
def setUp(self):
    with open(self.config) as configFile:
        self.config = json.load(configFile)
    log_dir = str(date.today()).replace('-', '_') + self.log_tag
    log_name = create_log(log_dir, 'gdc_test')
    self.log = logging.getLogger(log_name)
def main(configFileName):
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)
        log_dir = str(date.today()).replace('-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(log_dir, 'create_datadictionary')
        log = logging.getLogger(log_name)
        log.info('begin create metadata data dictionary')
        listlist = []
        with open('MetadataDataDictionary_v3.txt', 'r') as datadict:
            # read the header line and discard
            datadict.readline()
            for line in datadict:
                fields = [field.strip() for field in line.split('\t')]
                if 10 != len(fields):
                    raise ValueError('line wrong length: %s' % fields)
                listlist += [fields]
        ISBCGC_metadata_database_helper.initialize(config, log)
        ISBCGC_metadata_database_helper.insert(config, listlist, 'metadata_datadictionary', log)
        log.info('end create metadata data dictionary')
    except Exception as e:
        raise e
def main(configfilename, baminfo_filename):
    log_dir = str(date.today()).replace('-', '_') + '_gg_update' + '/'
    log_name = create_log(log_dir, 'update_gg_metadata')
    log = logging.getLogger(log_name)
    log.info('begin update gg metadata')
    idcol = 0
    ggdatasetcol = 7
    ggreadgroupset = 6
    nexist = 'NA'
    updates = []
    count = 0
    with open(baminfo_filename) as baminfo:
        baminfo.readline()
        for line in baminfo:
            fields = line.strip().split('\t')
            if fields[ggreadgroupset] == nexist:
                continue
            values = [fields[ggdatasetcol], fields[ggreadgroupset], fields[idcol]]
            if 0 == count % 1000:
                log.info('%s processing row %s--%s' % (datetime.now(), count, ':'.join(values)))
            count += 1
            updates += [values]
    stmt = 'update metadata_data set GG_dataset_id = %s, GG_readgroupset_id = %s where analysis_id = %s'
    with open(configfilename) as configFile:
        config = json.load(configFile)
    ISBCGC_database_helper.update(config, stmt, log, updates, False)
    log.info('finished update gg metadata')
def process_annotations(config, endpt_type, log_dir):
    try:
        log_name = create_log(log_dir, '%s_annotations' % (endpt_type))
        log = logging.getLogger(log_name)
        for program_name in config['program_names_for_annotation']:
            log.info('begin process_annotations for %s' % (program_name))
            annotation2info = get_map_rows(config, endpt_type, 'annotation', program_name, get_filter(), log)
            add_barcodes(annotation2info)
            save2db(config, endpt_type, '%s_metadata_annotation' % program_name, annotation2info, config['%s' % (program_name)]['process_annotations']['annotation_table_mapping'], log)
            if 'etl' in config[program_name]['process_annotations']:
                etl(config, log)
            log.info('finished process_annotations %s' % (program_name))
        return annotation2info
    except:
        log.exception('problem processing annotations:')
        raise
    finally:
        close_log(log)
def main(config_file_name):
    log = None
    try:
        with open(config_file_name) as configFile:
            config = json.load(configFile)
        log_dir = str(date.today()).replace('-', '_') + '_' + 'ccle/'
        log_name = create_log(log_dir, 'update_ccle_gcs_paths')
        log = logging.getLogger(log_name)
        log.info('begin updating CCLE paths in production')
        # first thing to do is to read in the file paths from BigQuery
        query = 'SELECT file_gdc_id, file_gcs_url ' \
            'FROM [isb-cgc:GDC_metadata.GDCfileID_to_GCSurl] ' \
            'where 0 < instr(file_gcs_url, \'CCLE\')'
        query_results = query_bq_table(query, True, 'isb-cgc', log)
        _, rows, _ = fetch_paged_results(query_results, 2000, None, None, log)
        log.info('\tcreate map of filename to path')
        name2path = {}
        for row in rows:
            fields = row[1].split('/')
            name2path[fields[-1]] = '/'.join(fields[3:])
        log.info('\tfinished map of filename to path')
        # get the db rows from production cloudsql
        log.info('\tselect ccle filenames from cloudsql')
        query = 'SELECT datafilename ' \
            'FROM main.metadata_data ' \
            'where 0 < instr(datafilename, \'bam\') and project = \'CCLE\''
        rows = helper.select(config, query, log, [])
        log.info('\tselected %s ccle filenames from cloudsql' % (len(rows)))
        # now setup and do the update of paths in cloud sql
        log.info('\tstart updating paths in cloudsql')
        params = []
        not_matched = []
        for row in rows:
            if row[0] in name2path:
                params += [[name2path[row[0]], row[0]]]
            else:
                not_matched += [row[0]]
        update = 'update main.metadata_data set datafilenamekey = %s where datafilename = %s'
        helper.update(config, update, log, params)
        log.info('\tcompleted update of paths in cloudsql. updated %d, did not find matches from BQ in cloudsql for %s' % (len(params), ', '.join(not_matched)))
        log.info('finished updating CCLE paths in production')
    except:
        if log:
            log.exception('failed to update ccle GCS filepaths')
    finally:
        if log:
            close_log(log)
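# A small, hypothetical illustration of the filename-to-path map built above:
# a gcs url of the form gs://<bucket>/<path...>/<name> splits so that fields[3:]
# drops the scheme and bucket, leaving the object path keyed by its file name.
# The url below is made up; only the split arithmetic is the point.
def _name2path_example():
    row = (None, 'gs://example-bucket/tcga/CCLE/DNA/C836.CALU-1.2.bam')
    fields = row[1].split('/')
    # fields == ['gs:', '', 'example-bucket', 'tcga', 'CCLE', 'DNA', 'C836.CALU-1.2.bam']
    return {fields[-1]: '/'.join(fields[3:])}  # {'C836.CALU-1.2.bam': 'tcga/CCLE/DNA/C836.CALU-1.2.bam'}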
def getlog(configFileName):
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)
        log_dir = str(date.today()).replace('-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(log_dir, 'create_ccle_metadata')
        log = logging.getLogger(log_name)
    except Exception as e:
        traceback.print_exc(5)
        raise e
    return config, log
def getlog(configFileName):
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)
        log_dir = str(date.today()).replace('-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(log_dir, 'create_ccle_metadata')
        log = logging.getLogger(log_name)
    except Exception as e:
        traceback.print_exc(5)
        raise e
    return config, log
def main(config_filename):
    try:
        with open(config_filename) as configFile:
            config = json.load(configFile)
        log_dir = str(date.today()).replace('-', '_') + '_gdc_upload_run/'
        log_name = create_log(log_dir, 'gdc_upload')
        log = logging.getLogger(log_name)
        log.info('begin melting miRNA isoform matrix')
        test_file = './tcga_etl_pipeline/mirna_isoform_matrix/hiseq/expn_matrix_mimat_norm_IlluminaHiSeq_miRNASeq_small.txt'
        log.info('finished melting miRNA isoform matrix')
    except:
        raise
def train_nn(sess, epochs, batch_size, get_batches_fn, train_op, cross_entropy_loss, input_image,
             correct_label, keep_prob, learning_rate):
    """
    Train neural network and print out the loss during training.
    :param sess: TF Session
    :param epochs: Number of epochs
    :param batch_size: Batch size
    :param get_batches_fn: Function to get batches of training data.  Call using get_batches_fn(batch_size)
    :param train_op: TF Operation to train the neural network
    :param cross_entropy_loss: TF Tensor for the amount of loss
    :param input_image: TF Placeholder for input images
    :param correct_label: TF Placeholder for label images
    :param keep_prob: TF Placeholder for dropout keep probability
    :param learning_rate: TF Placeholder for learning rate
    """
    training_log = util.create_log()
    util.log_n_print(training_log, "BATCH_SIZE : %d \nEPOCHS : %d \nLEARNING_RATE : %f \nKEEP_PROB : %f \n" % (BATCH_SIZE, EPOCHS, LEARNING_RATE, KEEP_PROB))
    # DONE: Implement function
    min_loss = 1e7
    # saver = tf.train.Saver()
    best_model_path = None
    loss_history = []
    loss = 1
    if not util.can_continue():
        util.log_n_print(training_log, "####### Training will be Force stopped after 1 Epoch #######")
    if is_in_training:
        util.log_n_print(training_log, "Training started....")
    for epoch in range(epochs):
        avg_loss = 0.0
        total_images = 0
        util.log_n_print(training_log, "EPOCH : %d" % (epoch + 1))
        for image, label in get_batches_fn(batch_size):
            _, loss = sess.run([train_op, cross_entropy_loss],
                               feed_dict={input_image: image, correct_label: label,
                                          keep_prob: KEEP_PROB, learning_rate: LEARNING_RATE})
            # weight the running loss by batch size so the epoch average is per-image
            avg_loss += loss * image.shape[0]
            total_images += image.shape[0]
            if total_images % 70 == 0:
                util.log_n_print(training_log, "EPOCH : %d Images processed : %d " % (epoch + 1, total_images))
        avg_loss /= total_images
        loss_history.append(avg_loss)
        if not util.can_continue():
            util.log_n_print(training_log, "Forced stopped after epoch : %d" % epoch)
            break
        util.log_n_print(training_log, "Loss : = {:.3f}".format(avg_loss))
        # saving best model as checkpoint
        # if loss < min_loss:
        #     best_model_path = saver.save(sess, "/checkpoints/best_model.ckpt")
        #     min_loss = loss
    util.log_n_print(training_log, 'loss_history : ' + str(loss_history))
    return best_model_path
def main(project_name, bucket_name, path_file, config_file):
    log_dir = str(date.today()).replace('-', '_') + '_bam_report_/'
    log_name = create_log(log_dir, 'top_processing')
    log = logging.getLogger(log_name)
    log.info('begin bam report')
    buckets, path2bam2bai = read_path_file(path_file, log)
    path2contents = get_bucket_content(project_name, buckets, log)
    with open(config_file) as con_file:
        config = json.load(con_file)
    update_database(path2bam2bai, config, log)
    count = 0
    no_index = set()
    ext_pairs2count = {}
    for contents in path2contents.itervalues():
        if 0 == count % 1024:
            log.info('\tprocessing contents {}: {}'.format(count, contents))
        count += 1
        file_prefix = None
        for file_name in contents:
            parts = file_name.split('.')
            if 'bam' in parts[-1]:
                bam_ext = parts[-1]
                file_prefix = '.'.join(parts[:-1])
                break
        if not file_prefix:
            log.warn('\tdid not find a bam file for {}'.format(contents))
            continue
        contents.remove(file_name)
        if 1 < len(contents):
            log.warn('\tfound more than 2 files for bam file {}: {}'.format(file_name, contents))
            continue
        if 0 == len(contents):
            log.warn('\tdidn\'t find an index file for bam file {}'.format(file_name))
            no_index.add(file_name)
            continue
        index_file = contents.pop()
        index_ext = index_file[len(file_prefix) + 1:]
        # keep the extension-pair tally in its own variable rather than clobbering
        # the progress counter above
        pair_count = ext_pairs2count.setdefault((bam_ext, index_ext), 0)
        ext_pairs2count[(bam_ext, index_ext)] = pair_count + 1
    if 0 < len(no_index):
        log.info('\n{}\n\tno-index file: \n\t\t{}\n\textension pairs:\n\t\t{}'.format(
            bucket_name, '\n\t\t'.join(no_index),
            '\n\t\t'.join('{}: {}'.format(pairs, pair_count) for pairs, pair_count in ext_pairs2count.iteritems())))
    else:
        log.info('\textension pairs:\n\t\t{}'.format(
            '\n\t\t'.join('{}: {}'.format(pairs, pair_count) for pairs, pair_count in ext_pairs2count.iteritems())))
    log.info('bam report completed')
def process_bq(self, program_name, program, bq_results, log_dir):
    log_dir = log_dir + 'bq' + '/'
    log = getLogger(create_log(log_dir, '{}_bq'.format(program_name)))
    # data type: isb_label, bq table, sample_barcode
    isb_label2tables = program().bq_datasets()
    params = []
    for data_type, info in isb_label2tables.iteritems():
        params += [[program_name, data_type, info[0], info[1], info[2], info[3], bq_results, log_dir]]
    calls = {'fn': self.compare_isb_label_bq, 'labels': {'params': params}}
    launch_threads(self.config, 'labels', calls, self.log)
    log.info('processing {} bq'.format(program_name))
    log.info('finished {} bq'.format(program_name))
def compare_isb_label_bq(self, program_name, data_type, isb_label, bq_table, sample_barcode, has_file, bq_results, log_dir):
    log_dir = log_dir + isb_label + '/'
    log = getLogger(create_log(log_dir, '{}_{}_bq'.format(program_name, data_type)))
    log.info('\tprocessing {}-{} for bq'.format(program_name, isb_label))
    api_project2cases, api_project2samples, api_project2files = self.get_api_data_types_barcodes(program_name, [data_type], log)
    if 'somatic' not in data_type.lower():
        bq_project2cases, bq_project2samples, bq_project2files = self.get_bq_data_type_barcodes(program_name, bq_table, sample_barcode, has_file, log)
    else:
        if "Simple somatic mutation" == data_type:
            bq_project2cases1, bq_project2samples1, bq_project2files1 = self.get_bq_data_type_barcodes(program_name, bq_table[0], sample_barcode[0], has_file, log)
            bq_project2cases_normal1, bq_project2samples_normal1, bq_project2files_normal1 = self.get_bq_data_type_barcodes(program_name, bq_table[0], sample_barcode[1], has_file, log)
            bq_project2cases2, bq_project2samples2, bq_project2files2 = self.get_bq_data_type_barcodes(program_name, bq_table[1], sample_barcode[0], has_file, log)
            bq_project2cases_normal2, bq_project2samples_normal2, bq_project2files_normal2 = self.get_bq_data_type_barcodes(program_name, bq_table[1], sample_barcode[1], has_file, log)
            bq_project2cases = self.merge_set_lists(bq_project2cases1, bq_project2cases2)
            bq_project2samples = self.merge_set_lists(bq_project2samples1, bq_project2samples2)
            bq_project2files = self.merge_set_lists(bq_project2files1, bq_project2files2)
        else:
            bq_project2cases, bq_project2samples, bq_project2files = self.get_bq_data_type_barcodes(program_name, bq_table, sample_barcode[0], has_file, log)
            bq_project2cases_normal, bq_project2samples_normal, bq_project2files_normal = self.get_bq_data_type_barcodes(program_name, bq_table, sample_barcode[1], has_file, log)
    label_project2cases, label_project2samples = self.get_gcs_isb_label_barcodes(program_name, isb_label, log)
    project2barcodes = bq_results.setdefault(isb_label, {})
    api_cases = set(case for cases in api_project2cases.itervalues() for case in cases)
def process_program(config, endpt_type, program_name, projects, log_dir):
    try:
        log_dir += program_name + '_%s' % endpt_type + '/'
        log_name = create_log(log_dir, program_name)
        log = logging.getLogger(log_name)
        log.info('begin process_program(%s)' % (program_name))
        future2project = {}
        initialize_etl(config, program_name, log)
        with futures.ThreadPoolExecutor(max_workers=config['program_threads']) as executor:
            for project in projects:
                if project in config['skip_projects']:
                    log.info('\tskipping project %s' % (project))
                    continue
                if 0 == len(config['project_name_restrict']) or project in config['project_name_restrict']:
                    log.info('\tprocessing project %s' % (project))
                    future2project[executor.submit(process_project, config, endpt_type, program_name, project, log_dir)] = project
                else:
                    log.info('\tnot processing project %s' % (project))
            future_keys = future2project.keys()
            while future_keys:
                future_done, future_keys = futures.wait(future_keys, return_when=futures.FIRST_COMPLETED)
                for future in future_done:
                    project = future2project.pop(future)
                    if future.exception() is not None:
                        log.exception('\t%s generated an exception--%s: %s' % (project, type(future.exception()).__name__, future.exception()))
                    else:
                        future.result()
                        log.info('\tfinished project %s' % (project))
        finalize_etl(config, program_name, log)
        log.info('finished process_program(%s)' % (program_name))
    except:
        log.exception('problem processing program %s' % (program_name))
        raise
    finally:
        close_log(log)
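# A simplified, self-contained sketch of the submit/wait loop used by
# process_program (and by process_project and process_tumortype below): work is
# fanned out to a ThreadPoolExecutor and collected with futures.wait as each
# item completes.  The function and parameter names here are illustrative, not
# part of this codebase.
def _futures_wait_loop_example(items, worker, max_workers, log):
    from concurrent import futures
    future2item = {}
    with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        for item in items:
            future2item[executor.submit(worker, item)] = item
        future_keys = future2item.keys()
        while future_keys:
            # wake up as soon as any one future completes; the not-done set
            # becomes the key list for the next pass
            future_done, future_keys = futures.wait(future_keys, return_when=futures.FIRST_COMPLETED)
            for future in future_done:
                item = future2item.pop(future)
                if future.exception() is not None:
                    log.error('%s generated an exception: %s' % (item, future.exception()))
                else:
                    log.info('finished %s: %s' % (item, future.result()))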
def process_gcs(self, program_name, program, results, log_dir):
    log_dir = log_dir + 'gcs' + '/'
    log = getLogger(create_log(log_dir, '{}_gcs'.format(program_name)))
    log.info('processing {} for gcs'.format(program_name))
    isb_label2tables = program().gcs_datasets()
    params = []
    for isb_label, data_types in isb_label2tables.iteritems():
        params += [[program_name, isb_label, data_types, results, log_dir]]
    calls = {'fn': self.compare_isb_label_gcs, 'labels': {'params': params}}
    launch_threads(self.config, 'labels', calls, self.log)
    log.info('finished {} for gcs'.format(program_name))
def compare_isb_label_gcs(self, program_name, isb_label, data_types, results, log_dir):
    log_dir = log_dir + isb_label + '/gcs/'
    log = getLogger(create_log(log_dir, '{}_{}_gcs'.format(program_name, isb_label)))
    log.info('\tprocessing {}-{} for gcs'.format(program_name, isb_label))
    api_project2cases, api_project2samples, api_project2files = self.get_api_data_types_barcodes(program_name, data_types, log)
    gcs_project2cases, gcs_project2samples, gcs_project2files = self.get_gcs_data_types_barcodes(program_name, data_types, log)
    label_project2cases, label_project2samples = self.get_gcs_isb_label_barcodes(program_name, isb_label, log)
    project2barcodes = results.setdefault(isb_label, {})
    api_cases = set(case for cases in api_project2cases.itervalues() for case in cases)
def get_gdc_barcode_info(self, program_name, log_dir):
    log = getLogger(create_log(log_dir, 'barcode_info'))
    log.info('processing {} for barcode information'.format(program_name))
    # get the total count to parallelize barcode fetches
    barcode2info = self.request_gdc_barcode_info(program_name, program_name, 1, 1, 1)
    # divide into batches based on the total
    info = barcode2info.popitem()[1]
    total = info['total']
    log.info('\tfetching {} cases for {}'.format(total, info))
    batch = total / 20
    log.info('\tlooking at batches of {} repeated 20 times for {}'.format(batch, program_name))
    params = []
    cur_start = 1
    for i in range(21):
        params += [[program_name + '_%s' % (i), program_name, cur_start, min(batch, 200), batch]]
        log.info('\t\tbatch {}: {}'.format(i, params[-1]))
        cur_start += batch
    calls = {'fn': self.request_gdc_barcode_info, 'batches': {'params': params}}
    barcode2info = launch_threads(self.config, 'batches', calls, self.log)
    samples = set()
    for info in barcode2info.itervalues():
        # if 0 != len(set(info['sample_barcodes']) & samples):
        #     raise ValueError('saw repeated barcode: {}'.format(set(info['sample_barcodes']) & samples))
        samples |= set(info['sample_barcodes'])
    log.info('\tfinished {} for barcode information. found {} cases and {} samples'.format(program_name, len(barcode2info), len(samples)))
    return set(barcode2info.keys()), samples
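# Illustration (with a made-up total) of the batch partitioning above: integer
# division by 20 sizes each batch, and using 21 start offsets guarantees the
# remainder the division drops is still covered by the final batch.
def _batch_partition_example():
    total = 4321
    batch = total / 20  # Python 2 integer division -> 216
    starts = [1 + i * batch for i in range(21)]
    assert starts[-1] + batch >= total  # the last batch reaches past the total
    return starts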
def uploadTCGA(configFileName):
    print datetime.now(), 'begin uploadTCGA()'
    global executor
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)
        log_dir = str(date.today()).replace('-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(log_dir, 'top_processing')
        log = logging.getLogger(log_name)
        log.info('begin uploadTCGA()')
        executor = futures.ThreadPoolExecutor(max_workers=config['threads'])
        module = import_module(config['database_module'])
        module.ISBCGC_database_helper.initialize(config, log)
        if config['upload_files'] or config['upload_etl_files']:
            # open the GCS wrapper here so it can be used by all the tumor types/platforms to save files
            gcs_wrapper.open_connection()
        info_status(config, log)
        tumor_type2platform2archive_types2archives, platform2archive2metadata = process_latestarchive(config, log_name)
        prepare_upload(tumor_type2platform2archive_types2archives, log)
        if 'process_cghub' not in config or config['process_cghub']:
            tumor_type2cghub_records = process_cghub(config, log=log, removedups=True, limit=-1)
        else:
            log.warning('\n\t====================\n\tnot processing CGHub records this run!\n\t====================')
            tumor_type2cghub_records = {}
        barcode2metadata = process_metadata_current(config, log_name)
        if 'process_annotations' not in config or config['process_annotations']:
            barcode2annotations = process_annotations(config, log_name)
        else:
            log.warning('\n\t====================\n\tnot processing annotations this run!\n\t====================')
            barcode2annotations = {}
        process_tumortypes(config, log_dir, tumor_type2platform2archive_types2archives, platform2archive2metadata, tumor_type2cghub_records, barcode2metadata, barcode2annotations, log)
    finally:
        if executor:
            executor.shutdown(wait=False)
        if gcs_wrapper:
            gcs_wrapper.close_connection()
        log.info('finish uploadTCGA()')
    print datetime.now(), 'finish uploadTCGA()'
def uploadGDC():
    print datetime.now(), 'begin uploadGDC()'
    gcs_wrapper = None
    try:
        args = parseargs()
        with open(args.config) as configFile:
            config = json.load(configFile)
        log_dir = str(date.today()).replace('-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(log_dir, 'top_processing')
        log = logging.getLogger(log_name)
        log.info('begin uploadGDC()')
        initializeDB(config, log)
        if config['upload_files'] or config['upload_etl_files']:
            # open the GCS wrapper here so it can be used by all the projects/platforms to save files
            gcs_wrapper = import_module(config['gcs_wrapper'])
            gcs_wrapper.open_connection(config, log)
        for endpt_type in config['endpt_types']:
            log.info('processing %s endpoints' % (endpt_type))
            if config['process_annotation']:
                process_annotations(config, endpt_type, log_dir)
            else:
                log.warning('\n\t====================\n\tnot processing annotations this run!\n\t====================')
            process_programs(config, endpt_type, log_dir, log)
        finalize(config, log)
    except:
        raise
    finally:
        if gcs_wrapper:
            gcs_wrapper.close_connection()
        log.info('finished uploadGDC()')
    print datetime.now(), 'finished uploadGDC()'
def main(configfilename):
    print datetime.now(), 'begin update DatafileUploaded'
    with open(configfilename) as configFile:
        config = json.load(configFile)
    log_dir = str(date.today()).replace('-', '_') + '_' + config['log_dir_tag'] + '_update_uploaded' + '/'
    log_name = create_log(log_dir, 'top_processing')
    log = logging.getLogger(log_name)
    log.info('begin update DatafileUploaded')
    try:
        for path_file in config['buckets']['update_uploaded']:
            with open(path_file, 'r') as paths:
                updateDatafileUploaded(config, paths, log)
        update_nulls(config, log)
    except Exception as e:
        log.exception('problem updating DatafileUploaded')
        raise e
    log.info('finish update DatafileUploaded')
    print datetime.now(), 'finish update DatafileUploaded'
def process_cases(config, endpt_type, program_name, project_name, log_dir):
    try:
        log_name = create_log(log_dir, project_name + '_cases')
        log = logging.getLogger(log_name)
        log.info('begin process_cases(%s)' % (project_name))
        case2info = get_map_rows(config, endpt_type, 'case', program_name, get_filter(project_name), log)
        save2db(config, endpt_type, '%s_metadata_clinical' % (program_name), case2info, config[program_name]['process_cases']['clinical_table_mapping'], log)
        remove_null_samples(case2info, log)
        save2db(config, endpt_type, '%s_metadata_biospecimen' % (program_name), case2info, config[program_name]['process_cases']['sample_table_mapping'], log)
        # fill in the rest of the metadata depending on the program
        if 0 < len(case2info.values()):
            postproc_module = import_module(config[program_name]['process_cases']['postproc_case']['postproc_module'])
            postproc_module.postprocess(config, project_name, endpt_type, log)
        log.info('finished process_cases(%s)' % (project_name))
        # log.info('begin process_cases(%s) for omf files' % (project_name))
        # omf2info = get_omf_map_rows(config, project_name, log)
        # save2db(config, 'metadata_gdc_clinical', case2info, config['process_cases']['clinical_table_mapping'], log)
        # log.info('finished process_cases(%s) for omf files' % (project_name))
        return case2info
    except:
        log.exception('problem processing cases(%s):' % (project_name))
        raise
    finally:
        close_log(log)
def __init__(self, logger=None, log_level=logging.INFO):
    classname = type(self).__name__
    # print("classname: ", classname)
    if logger is None:
        self.logger = create_log(log_name=classname, level=log_level)
    else:
        self.logger = logger
    self.logger.info("\n-----------------")
    self.logger.info("Begin to init")
    self.logger.info("\n-----------------")
    self.sub = Subtitle(self.logger)
    self.sub.set_parse(True)
    self.files = dict()
    self.filetypes = ['srt', 'bak', 'm3u', 'txt']
    self.vediotypes = ['mkv', 'mp4', 'avi']
    self.rmsrt = False
    if self.sub.lexicon_path is None:
        self.sub.set_lexicon_file("lexicon/lexicon.xlsx")
def __init__(self, logger=None, loglevel=logging.INFO):
    if logger is None:
        self.logger = create_log(log_name="odsWord", level=loglevel)
    else:
        self.logger = logger
    self.logger.info("\n-----------------")
    self.logger.info("Begin to init")
    self.logger.info("\n-----------------")
    self.files = []
    self.lexicon = set()
    self.stem_lexicon = dict()
    self.noUsed = set(["“", "—", "”"])
    self.punctuation = r".?\[\]!,\":%;()|^=+\/\\_`\*;.:><~"
    self.lexicon_path = None
    self.nameSet = set(names.words('male.txt') + names.words('female.txt'))
    self.checkup = False
    self.debug = False
    self.newWords = None
def process_project(config, endpt_type, program_name, project, log_dir):
    try:
        log_dir += project + '/'
        log_name = create_log(log_dir, project)
        log = logging.getLogger(log_name)
        log.info('begin process_project(%s)' % (project))
        case2info = {}
        if config['process_case']:
            log.info('\tprocess cases for %s' % (project))
            case2info = process_cases(config, endpt_type, program_name, project, log_dir)
            log.info('\tcompleted process cases for %s' % (project))
        else:
            log.warning('\n\t====================\n\tnot processing cases this run for %s!\n\t====================' % (project))
        file2info = {}
        if config['process_data_type']:
            with futures.ThreadPoolExecutor(max_workers=config['project_threads']) as executor:
                log.info('\tprocess data_types for %s' % (project))
                future2data_type = {}
                data_types = request_facets_results(config['files_endpt']['%s endpt' % (endpt_type)], config['facets_query'], 'data_type', log)
                for data_type in data_types:
                    if ((len(config['data_type_restrict']) == 0 or data_type in config['data_type_restrict'])
                            and (data_type in config['data_type2isb_label'] or data_type in config['data_type_gcs'])):
                        log.info('\t\tprocess data_type \'%s\' for %s' % (data_type, project))
                        future2data_type[executor.submit(process_data_type, config, endpt_type, program_name, project, data_type, log_dir)] = data_type
                    else:
                        log.info('\t\tnot processing data_type %s for %s' % (data_type, project))
                retry_ct = 0
                data_type2retry = {}
                future_keys = future2data_type.keys()
                while future_keys:
                    future_done, _ = futures.wait(future_keys, return_when=futures.FIRST_COMPLETED)
                    try:
                        for future in future_done:
                            data_type = future2data_type.pop(future)
                            if future.exception() is not None:
                                # TODO only retry on connection refused, not other exceptions
                                retry_ct = data_type2retry.setdefault(data_type, 0)
                                if retry_ct > 3:
                                    raise ValueError('%s failed multiple times--%s:%s' % (data_type, type(future.exception()).__name__, future.exception()))
                                data_type2retry[data_type] = retry_ct + 1
                                log.warning('\tWARNING: resubmitting %s--%s:%s. try %s' % (data_type, type(future.exception()).__name__, future.exception(), retry_ct))
                                new_future = executor.submit(process_data_type, config, endpt_type, program_name, project, data_type, log_dir,
                                                             project + '_' + data_type.replace(' ', '') + '_%d' % (retry_ct))
                                future2data_type[new_future] = data_type
                            else:
                                log.info('\t\tfinished process data_type \'%s\' for %s' % (data_type, project))
                                file2info = future.result()
                        future_keys = future2data_type.keys()
                    except:
                        future_keys = future2data_type.keys()
                        log.exception('%s failed for %s' % (data_type, project))
                log.info('\tcompleted process data_types for %s' % (project))
        else:
            log.warning('\n\t====================\n\tnot processing data types this run for %s!\n\t====================' % (project))
        log.info('finished process_project(%s)' % (project))
        return case2info, file2info
    except:
        log.exception('problem processing project %s' % (project))
        raise
def create_field_report(configfilename):
    log = None
    try:
        with open(configfilename) as configFile:
            config = js.load(configFile)
        log_dir = str(date.today()).replace('-', '_') + '_log' + '/'
        log_name = create_log(log_dir, 'field_report')
        log = logging.getLogger(log_name)
        log.info('start create_field_report()')
        endpts = config['field_report']['endpoints']
        # fetch the queryable fields
        output_path = config['field_report']['output_path']
        output_file = str(date.today()).replace('-', '_') + '_' + config['field_report']['output_file']
        with open(output_path + output_file, 'w') as output:
            for endpoint in endpts:
                output.write('Field value report:\n')
                log.info('\tstart endpoint \'%s\'' % (endpoint))
                output.write('\tendpoint \'%s\'\n' % (endpoint))
                template2field2values = {}
                mapping_templates = config['field_report']['endpoint_mapping_templates']
                url_templates = config['field_report']['url_templates']
                for mapping_template, url_template in zip(mapping_templates, url_templates):
                    response = requests.get(mapping_template % (endpoint))
                    response.raise_for_status()
                    fields = response.json()['_mapping'].keys()
                    fields.sort()
                    field2values = template2field2values.setdefault(url_template, {})
                    mod_count = len(fields) / 20
                    count = 0
                    log.info('\t\tfinding field values for base url \'%s\'' % (url_template.split('%')[0]))
                    output.write('\t\tfinding field values for base url \'%s\'\n' % (url_template.split('%')[0]))
                    log.info('\t\tgot information on the fields for %s. found %s fields' % (endpoint, len(fields)))
                    output.write('\t\tfound %s fields\n' % (len(fields)))
                    for field in fields:
                        try:
                            progress = False if len(fields) < 71 else True if 0 == count else 0 == count % mod_count
                            get_values_for_field(endpoint, field, field2values, output, url_template, progress, log)
                        except:
                            log.exception('problem for field %s' % (field))
                            raise
                        count += 1
                log.info('start field value comparison between current and legacy endpoints')
                output.write('\nField value comparison report:\n')
                regfield2values = None
                legfield2values = None
                for url in template2field2values:
                    if 'legacy' in url:
                        legfield2values = template2field2values[url]
                    else:
                        regfield2values = template2field2values[url]
                onlyreg = []
                same_counts = []
                reg_nocounts = []
                leg_nocounts = []
                same_buckets = []
                same_buckets_no = []
                same_buckets_many = []
                reg_nobucketvalues = []
                leg_nobucketvalues = []
                reg_nomanybucketvalues = []
                leg_nomanybucketvalues = []
                for field in regfield2values:
                    regvalues = regfield2values[field]
                    if field not in legfield2values:
                        onlyreg += [field]
                        continue
                    legvalues = legfield2values[field]
                    if 'error' in regvalues:
                        continue
                    if 'count' in regvalues:
                        if 'count' in legvalues:
                            output.write('\tprocessing %s\n' % (field))
                            output.write('\t\t%s is a count field for both endpoints\n' % (field))
                            if 'no values' == regvalues['count'] and 'no values' != legvalues['count']:
                                reg_nocounts += [field]
                            elif 'no values' == legvalues['count'] and 'no values' != regvalues['count']:
                                leg_nocounts += [field]
                            else:
                                same_counts += [field]
                        elif 'buckets' in legvalues:
                            # this (not surprisingly) doesn't appear to happen
                            output.write('\tprocessing equiv and diffs for %s\n' % (field))
                            output.write('\t\t%s is a count field for the current endpoint and a buckets field for legacy\n' % (field))
                    if 'buckets' in regvalues:
                        if 'error' in regvalues:
                            continue
                        elif 'count' in legvalues:
                            # this (not surprisingly) doesn't appear to happen
                            output.write('\tprocessing %s\n' % (field))
                            output.write('\t\t%s is a buckets field for the current endpoint and a count field for legacy\n' % (field))
                        elif 'buckets' in legvalues:
                            if 'no values' == regvalues['buckets'] and 'no values' != legvalues['buckets']:
                                reg_nobucketvalues += [field]
                            elif 'no values' == legvalues['buckets'] and 'no values' != regvalues['buckets']:
                                leg_nobucketvalues += [field]
                            elif 'many values' == regvalues['buckets'] and 'many values' != legvalues['buckets']:
                                leg_nomanybucketvalues += [field]
                            elif 'many values' == legvalues['buckets'] and 'many values' != regvalues['buckets']:
                                reg_nomanybucketvalues += [field]
                            elif regvalues['buckets'] not in ('no values', 'many values') and legvalues['buckets'] not in ('no values', 'many values'):
                                equiv = True
                                if 0 < len(regvalues['buckets'] - legvalues['buckets']):
                                    output.write('\tprocessing equiv and diffs for %s\n' % (field))
                                    output.write('\t\t\tthe current endpoint has these additional values:\n\t\t\t\t%s\n' % ('\n\t\t\t\t'.join(str(value) for value in (regvalues['buckets'] - legvalues['buckets']))))
                                    equiv = False
                                if 0 < len(legvalues['buckets'] - regvalues['buckets']):
                                    if 0 == len(regvalues['buckets'] - legvalues['buckets']):
                                        output.write('\tprocessing equiv and diffs for %s\n' % (field))
                                    output.write('\t\t\tthe legacy endpoint has these additional values:\n\t\t\t\t%s\n' % ('\n\t\t\t\t'.join(str(value) for value in (legvalues['buckets'] - regvalues['buckets']))))
                                    equiv = False
                                if equiv:
                                    same_buckets += [field]
                                else:
                                    output.write('\t\t\tthe current and legacy endpoint share these values:\n\t\t\t\t%s\n' % ('\n\t\t\t\t'.join(str(value) for value in (legvalues['buckets'] & regvalues['buckets']))))
                            else:
                                if 'no values' == regvalues['buckets']:
                                    same_buckets_no += [field]
                                elif 'many values' == regvalues['buckets']:
                                    same_buckets_many += [field]
                                else:
                                    raise ValueError('unexpected case for %s: %s %s' % (field, regvalues['buckets'], legvalues['buckets']))
                if 0 < len(same_counts):
                    output.write('\tcount fields for both endpoints:\n\t\t%s\n' % ('\n\t\t'.join(sorted(same_counts))))
                if 0 < len(reg_nocounts):
                    output.write('\tregular endpoints that have no count values but the legacy does:\n\t\t%s\n' % ('\n\t\t'.join(sorted(reg_nocounts))))
                if 0 < len(leg_nocounts):
                    output.write('\tlegacy endpoints that have no values but the current does:\n\t\t%s\n' % ('\n\t\t'.join(sorted(leg_nocounts))))
                output.write('\n\tsummary comparison of field counts:')
                if 0 < len(same_buckets):
                    output.write('\t\tequivalent bucket fields for both endpoints:\n\t\t%s\n' % ('\n\t\t'.join(sorted(same_buckets))))
                if 0 < len(same_buckets_no):
                    output.write('\t\tbucket fields with no values for both endpoints:\n\t\t%s\n' % ('\n\t\t'.join(sorted(same_buckets_no))))
                if 0 < len(same_buckets_many):
                    output.write('\t\tbucket fields with many values for both endpoints:\n\t\t%s\n' % ('\n\t\t'.join(sorted(same_buckets_many))))
                if 0 < len(reg_nobucketvalues):
                    output.write('\t\tregular endpoints that have no bucket values but the legacy does:\n\t\t%s\n' % ('\n\t\t'.join(sorted(reg_nobucketvalues))))
                if 0 < len(leg_nobucketvalues):
                    output.write('\t\tlegacy endpoints that have no bucket values but the current does:\n\t\t%s\n' % ('\n\t\t'.join(sorted(leg_nobucketvalues))))
                if 0 < len(reg_nomanybucketvalues):
                    output.write('\t\tregular endpoints that don\'t have many bucket values but the legacy does:\n\t\t%s\n' % ('\n\t\t'.join(sorted(reg_nomanybucketvalues))))
                if 0 < len(leg_nomanybucketvalues):
                    output.write('\t\tlegacy endpoints that don\'t have many bucket values but the current does:\n\t\t%s\n' % ('\n\t\t'.join(sorted(leg_nomanybucketvalues))))
                output.write('\t\tfields only in current endpoint:\n\t\t%s\n' % '\n\t\t'.join(sorted(onlyreg)))
                onlyleg = []
                for field in legfield2values:
                    if field not in regfield2values:
                        onlyleg += [field]
                output.write('\t\tfields only in legacy endpoint:\n\t\t%s\n' % '\n\t\t'.join(sorted(onlyleg)))
                log.info('finished field value comparison between current and legacy endpoints')
        log.info('finished create_field_report()')
    except:
        if log:
            log.exception('problem with creating the field report')
        raise
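# A minimal, hedged sketch of the '_mapping' fetch that drives the report above.
# The url is an assumption based on the GDC API convention the config templates
# appear to encode; only the response shape create_field_report relies on (a
# dict under the '_mapping' key) is used.  Python 2 is assumed, where keys()
# returns a sortable list as in the original code.
def _fetch_mapping_fields_example(endpoint='cases'):
    import requests
    response = requests.get('https://api.gdc.cancer.gov/%s/_mapping' % (endpoint))
    response.raise_for_status()
    fields = response.json()['_mapping'].keys()
    fields.sort()  # the report walks the fields in sorted order
    return fields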
def uploadTCGA(configFileName):
    '''
    based on the configuration map loaded from the configFileName, loads the DCC data into GCS.
    also obtains metadata based on file paths, SDRF values, and CGHub manifest values

    parameters:
        configFileName: the file name of the configuration map
    '''
    print datetime.now(), 'begin uploadTCGA()'
    global executor
    gcs_wrapper = None
    try:
        with open(configFileName) as configFile:
            config = json.load(configFile)
        run_dir = str(date.today()).replace('-', '_') + '_' + config['log_dir_tag'] + '/'
        log_name = create_log(run_dir, 'top_processing')
        log = logging.getLogger(log_name)
        log.info('begin uploadTCGA()')
        executor = futures.ThreadPoolExecutor(max_workers=config['threads'])
        info_status(config, log)
        setup_database(config, log)
        # open the GCS wrapper here so it can be used by all the tumor types/platforms to save files
        gcs_wrapper = import_module(config['gcs_wrapper'])
        gcs_wrapper.open_connection(config, log)
        tumor_type2platform2archive_types2archives, platform2archive2metadata = process_latestarchive(config, run_dir, log_name)
        prepare_upload(tumor_type2platform2archive_types2archives, log)
        if 'process_cghub' not in config or config['process_cghub']:
            tumor_type2cghub_records = process_cghub(config, run_dir, log=log, removedups=True, limit=-1)
        else:
            log.warning('\n\t====================\n\tnot processing CGHub records this run!\n\t====================')
            tumor_type2cghub_records = {}
        barcode2metadata = process_metadata_current(config, run_dir, log_name)
        if 'process_annotations' not in config or config['process_annotations']:
            barcode2annotations = process_annotations(config, run_dir, log_name)
        else:
            log.warning('\n\t====================\n\tnot processing annotations this run!\n\t====================')
            barcode2annotations = {}
        process_tumortypes(config, run_dir, tumor_type2platform2archive_types2archives, platform2archive2metadata, tumor_type2cghub_records, barcode2metadata, barcode2annotations, log)
        # associate the annotation metadata with the other metadata tables
        associate_metadata2annotation(config, log)
        # print out the stats
        metadata_modules = config['metadata_modules']
        for metadata_module in metadata_modules:
            module = import_module(metadata_module)
            module.print_combined_stats(log)
    finally:
        if executor:
            executor.shutdown(wait=False)
        log.info('finish uploadTCGA()')
        try:
            # upload the logs and TCGA files used for upload to GCS
            upload_run_files(config, run_dir, log)
        except Exception as e:
            log.exception('problem moving the logs and run files to GCS')
        finally:
            if gcs_wrapper:
                gcs_wrapper.close_connection()
    print datetime.now(), 'finish uploadTCGA()'
def process_tumortype(config, log_dir, tumor_type, platform2archive_types2archives, platform2archive2metadata, cghub_records, barcode2metadata, barcode2annotations):
    '''
    process the study/tumor_type for uploading the files from the dcc to GCS and to obtain metadata
    to save to mysql.  loops through the platforms in parallel

    parameters:
        config: the configuration map
        log_dir: the base directory for the logs
        tumor_type: the TCGA study being processed
        platform2archive_types2archives: map platforms to archive types ('maf', 'mage-tab', and 'data') to archives
        cghub_records: cghub metadata
        platform2archive2metadata: map of platforms to archive name to the archive metadata
        barcode2metadata: metadata from metadata.current.txt
        barcode2annotations: map of barcodes to TCGA annotations

    returns:
        clinical_metadata: metadata from the clinical bio files
        biospecimen_metadata: metadata from the biospecimen bio files
        flattened_data_map: metadata from the file paths and SDRF files
    '''
    print '\t', datetime.now(), '\tprocessing tumor type %s' % (tumor_type)
    log_name = create_log(log_dir + tumor_type + '/', tumor_type)
    log = logging.getLogger(log_name)
    log.info('\tprocessing tumor type %s' % (tumor_type))
    if config['process_bio']:
        try:
            clinical_metadata, biospecimen_metadata, exclude_samples = parse_bio(
                config, platform2archive_types2archives['bio']['bio'], tumor_type,
                platform2archive2metadata['bio'],
                create_log(log_dir + tumor_type + '/', tumor_type + '_bio'))
        except Exception as e:
            log.exception('problem parsing bio and sample files')
            raise e
    else:
        clinical_metadata = {}
        biospecimen_metadata = {}
        exclude_samples = set()
    all_platforms = True
    platforms = []
    if 'platforms' in config:
        platforms = config['platforms']
    if not (0 == len(platforms) or (1 == len(platforms) and 'all' == platforms[0])):
        all_platforms = False
    aliquot2filename2metadata = {}
    future2platform = {}
    for platform, archive_types2archives in platform2archive_types2archives.iteritems():
        if 'bio' == platform:
            continue
        if all_platforms or platform in (platforms):
            log_name = tumor_type + '_' + platform
            future2platform[executor.submit(process_platform, config, log_dir, log_name, tumor_type, platform,
                                            platform2archive2metadata[platform], archive_types2archives,
                                            barcode2annotations, exclude_samples)] = platform
    platform2retry = {}
    future_keys = future2platform.keys()
    while future_keys:
        future_done, _ = futures.wait(future_keys, return_when=futures.FIRST_COMPLETED)
        try:
            for future in future_done:
                platform = future2platform.pop(future)
                if future.exception() is not None:
                    # TODO only retry on connection refused, not other exceptions
                    retry_ct = platform2retry.setdefault(platform, 0)
                    if retry_ct > 3:
                        raise ValueError('%s failed multiple times: %s' % (platform, future.exception()))
                    platform2retry[platform] = retry_ct + 1
                    log.warning('\tWARNING: resubmitting %s: %s. try %s' % (platform, future.exception(), retry_ct))
                    new_future = executor.submit(process_platform, config, log_dir,
                                                 tumor_type + '_' + platform + '_' + str(retry_ct + 1),
                                                 tumor_type, platform, platform2archive2metadata[platform],
                                                 platform2archive_types2archives[platform],
                                                 barcode2annotations, exclude_samples)
                    future2platform[new_future] = platform
                else:
                    merge_metadata(aliquot2filename2metadata, future.result(), platform, log)
            future_keys = future2platform.keys()
        except:
            future_keys = future2platform.keys()
            log.exception('%s failed' % (platform))
    try:
        merge_metadata_current_metadata(aliquot2filename2metadata, barcode2metadata, log)
        merge_cghup(config, aliquot2filename2metadata, cghub_records, log)
        # the data map has a different structure than the clinical and biospecimen maps: remove the top
        # map of aliquot to file_list metadata and combine all the files for compatibility in calls to
        # the data store and etl
        flattened_data_map = {}
        for aliquot, file_name2field2value in aliquot2filename2metadata.iteritems():
            for file_name, field2value in file_name2field2value.iteritems():
                flattened_data_map[aliquot + ':' + file_name] = field2value
        # do this per platform to parallelize
        store_metadata(config, log, 'metadata_clinical', clinical_metadata)
        store_metadata(config, log, 'metadata_biospecimen', biospecimen_metadata)
        store_metadata(config, log, 'metadata_data', flattened_data_map)
        samples_metadata = process_metadata_samples(clinical_metadata, biospecimen_metadata, aliquot2filename2metadata, log)
        store_metadata(config, log, 'metadata_samples', samples_metadata)
    except Exception as e:
        log.exception('problem storing metadata for %s' % (tumor_type))
        raise e
    print '\t', datetime.now(), 'finished tumor type %s' % (tumor_type)
    log.info('\tfinished tumor type %s' % (tumor_type))
    return clinical_metadata, biospecimen_metadata, flattened_data_map
def process_tumortype(config, log_dir, tumor_type, platform2archive_types2archives, platform2archive2metadata, cghub_records, barcode2metadata, barcode2annotations):
    print '\t', datetime.now(), 'processing tumor type %s' % (tumor_type)
    log_name = create_log(log_dir + tumor_type + '/', tumor_type)
    log = logging.getLogger(log_name)
    log.info('\tprocessing tumor type %s' % (tumor_type))
    if config['process_bio']:
        try:
            clinical_metadata, biospecimen_metadata, ffpe_samples = parse_bio(
                config, platform2archive_types2archives['bio']['bio'], tumor_type,
                platform2archive2metadata['bio'],
                create_log(log_dir + tumor_type + '/', tumor_type + '_bio'))
        except Exception as e:
            log.exception('problem parsing bio and sample files')
            raise e
    else:
        clinical_metadata = {}
        biospecimen_metadata = {}
        ffpe_samples = set()
    all_platforms = True
    platforms = []
    if 'platforms' in config:
        platforms = config['platforms']
    if not (0 == len(platforms) or (1 == len(platforms) and 'all' == platforms[0])):
        all_platforms = False
    aliquot2filename2metadata = {}
    future2platform = {}
    for platform, archive_types2archives in platform2archive_types2archives.iteritems():
        if 'bio' == platform:
            continue
        if all_platforms or platform in (platforms):
            log_name = tumor_type + '_' + platform
            future2platform[executor.submit(process_platform, config, log_dir, log_name, tumor_type, platform,
                                            platform2archive2metadata[platform], archive_types2archives,
                                            barcode2annotations, ffpe_samples)] = platform
    platform2retry = {}
    future_keys = future2platform.keys()
    while future_keys:
        future_done, _ = futures.wait(future_keys, return_when=futures.FIRST_COMPLETED)
        try:
            for future in future_done:
                platform = future2platform.pop(future)
                if future.exception() is not None:
                    # TODO only retry on connection refused, not other exceptions
                    retry_ct = platform2retry.setdefault(platform, 0)
                    if retry_ct > 3:
                        raise ValueError('%s failed multiple times: %s' % (platform, future.exception()))
                    platform2retry[platform] = retry_ct + 1
                    log.warning('\tWARNING: resubmitting %s: %s. try %s' % (platform, future.exception(), retry_ct))
                    new_future = executor.submit(process_platform, config, log_dir,
                                                 tumor_type + '_' + platform + '_' + str(retry_ct + 1),
                                                 tumor_type, platform, platform2archive2metadata[platform],
                                                 platform2archive_types2archives[platform],
                                                 barcode2annotations, ffpe_samples)
                    future2platform[new_future] = platform
                else:
                    merge_metadata(aliquot2filename2metadata, future.result(), platform, log)
            future_keys = future2platform.keys()
        except:
            future_keys = future2platform.keys()
            log.exception('%s failed' % (platform))
    try:
        merge_metadata_current_metadata(aliquot2filename2metadata, barcode2metadata, log)
        merge_cghup(config, aliquot2filename2metadata, cghub_records, log)
        # the data map has a different structure than the clinical and biospecimen maps: remove the top
        # map of aliquot to file_list metadata and combine all the files for compatibility in calls to
        # the data store and etl
        flattened_data_map = {}
        for aliquot, file_name2field2value in aliquot2filename2metadata.iteritems():
            for file_name, field2value in file_name2field2value.iteritems():
                flattened_data_map[aliquot + ':' + file_name] = field2value
        # do this per platform to parallelize
        store_metadata(config, log, 'metadata_clinical', clinical_metadata)
        store_metadata(config, log, 'metadata_biospecimen', biospecimen_metadata)
        store_metadata(config, log, 'metadata_data', flattened_data_map)
        samples_metadata = process_metadata_samples(clinical_metadata, biospecimen_metadata, aliquot2filename2metadata, log)
        store_metadata(config, log, 'metadata_samples', samples_metadata)
    except Exception as e:
        log.exception('problem storing metadata for %s' % (tumor_type))
        raise e
    print '\t', datetime.now(), 'finished tumor type %s' % (tumor_type)
    log.info('\tfinished tumor type %s' % (tumor_type))
    return clinical_metadata, biospecimen_metadata, flattened_data_map
def process_program(self, program_name, program, log_dir):
    try:
        log_dir = log_dir + program_name + '/'
        log = getLogger(create_log(log_dir, program_name))
        log.info('processing {}'.format(program_name))

        output_bio_compare = 'case and sample compare:\n'
        bio_storage2source2barcodes = self.process_bio(program_name, program, log_dir)
        cases, samples = bio_storage2source2barcodes['gdc']['api']
        for sql_source, barcodes in bio_storage2source2barcodes['sql'].iteritems():
            sqlcases, sqlsamples = barcodes
            sources = sorted(bio_storage2source2barcodes['bq'].keys())
            for bq_source in sources:
                barcodes = bio_storage2source2barcodes['bq'][bq_source]
                bqcases, bqsamples = barcodes
                output_bio_compare += self.compare_barcodes(
                    program_name, 'sql-{}:bq-{}'.format(sql_source, bq_source),
                    'case', cases, sqlcases, 'sql', bqcases, 'bq', log) + '\n'
                output_bio_compare += self.compare_barcodes(
                    program_name, 'sql-{}:bq-{}'.format(sql_source, bq_source),
                    'sample', samples, sqlsamples, 'sql', bqsamples, 'bq', log) + '\n{}\n'

        output_bio_counts = 'Case and Sample compares for {} clinical and biospecimen\n\nGDC Case API:\ncases\tsamples\n{}\t{}\n\nCloud SQL\n'.format(
            program_name, len(cases), len(samples))
        for source, barcodes in bio_storage2source2barcodes['sql'].iteritems():
            sqlcases, sqlsamples = barcodes
            output_bio_counts += '{}:\ncases\tsamples\n{}\t{}\n\n'.format(
                source, len(sqlcases), len(sqlsamples))
        output_bio_counts += 'BigQuery\n'
        sources = sorted(bio_storage2source2barcodes['bq'].keys())
        for source in sources:
            bqcases, bqsamples = bio_storage2source2barcodes['bq'][source]
            output_bio_counts += '{}:\ncases\tsamples\n{}\t{}\n\n'.format(
                source, len(bqcases), len(bqsamples))

        gcs_results = {}
        self.process_gcs(program_name, program, gcs_results, log_dir)
        output_gcs_compare = 'case, sample and file compare for gcs vs. isb_label:\n'
        output_gcs_counts = ''
        for isb_label in gcs_results:
            for project, barcodes in gcs_results[isb_label].iteritems():
                output_gcs_compare += self.compare_barcodes(
                    program_name, '{0}:project-{1}:label-{2}'.format(program_name, project, isb_label),
                    'case', barcodes[0], barcodes[1], 'gcs', barcodes[2], 'label', log) + '\n'
                output_gcs_compare += self.compare_barcodes(
                    program_name, '{0}:project-{1}:label-{2}'.format(program_name, project, isb_label),
                    'sample', barcodes[3], barcodes[4], 'gcs', barcodes[5], 'label', log) + '\n'
                output_gcs_compare += self.compare_barcodes(
                    program_name, '{0}:project-{1}:label-{2}'.format(program_name, project, isb_label),
                    'file', barcodes[6], barcodes[7], 'gcs', set(), 'label', log) + '\n{}\n'
                if 'all' == project:
                    output_gcs_counts = '{}Case and Sample compares for {} Google Cloud Storage\n\nTotals:\ncases\napi\tgcs\tisb_label\n{}\t{}\t{}\nsamples\napi\tgcs\tisb_label\n{}\t{}\t{}\nfiles\napi\tgcs\n{}\t{}\n\n' \
                        .format('{}\n'.format('*' * 20), program_name,
                                len(barcodes[0]), len(barcodes[1]), len(barcodes[2]),
                                len(barcodes[3]), len(barcodes[4]), len(barcodes[5]),
                                len(barcodes[6]), len(barcodes[7]))

        bq_results = {}
        self.process_bq(program_name, program, bq_results, log_dir)
        output_bq_compare = 'case, sample and file compare for bq vs. isb_label:\n'
        output_bq_counts = ''
        for isb_label in bq_results:
            for project, barcodes in bq_results[isb_label].iteritems():
                output_bq_compare += self.compare_barcodes(
                    program_name, '{0}:project-{1}:label-{2}'.format(program_name, project, isb_label),
                    'case', barcodes[0], barcodes[1], 'bq', barcodes[2], 'label', log) + '\n'
                output_bq_compare += self.compare_barcodes(
                    program_name, '{0}:project-{1}:label-{2}'.format(program_name, project, isb_label),
                    'sample', barcodes[3], barcodes[4], 'bq', barcodes[5], 'label', log) + '\n'
                output_bq_compare += self.compare_barcodes(
                    program_name, '{0}:project-{1}:label-{2}'.format(program_name, project, isb_label),
                    'file', barcodes[6], barcodes[7], 'bq', set(), 'label', log) + '\n'
                if 'all' == project:
                    output_bq_counts = '{}Case and Sample compares for {} Google BigQuery\n\nTotals:\ncases\napi\tbq\tisb_label\n{}\t{}\t{}\nsamples\napi\tbq\tisb_label\n{}\t{}\t{}\nfiles\napi\tbq\n{}\t{}\n\n' \
                        .format('{}\n'.format('*' * 20), program_name,
                                len(barcodes[0]), len(barcodes[1]), len(barcodes[2]),
                                len(barcodes[3]), len(barcodes[4]), len(barcodes[5]),
                                len(barcodes[6]), len(barcodes[7]))

        with open('gdc/doc/' + str(date.today()).replace('-', '_') +
                  '_{}_validate_bq_gcs_label.txt'.format(program_name), 'w') as out:
            out.writelines([
                'Validity Report\n\n',
                output_bio_counts,
                output_bio_compare,
                output_gcs_counts,
                output_gcs_compare,
                output_bq_counts,
                output_bq_compare
            ])
            out.write('Differences:\n\tapi\tgcs\tisb_label\tbq\t\napi\t{}\n')
        log.info('finished {}'.format(program_name))
    except:
        log.exception('problem processing {}'.format(program_name))
        raise
    return {}
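# A short sketch of how the date-stamped report path above resolves (the program
# name 'TCGA' is just an example value):
from datetime import date

report_name = 'gdc/doc/' + str(date.today()).replace('-', '_') + \
    '_{}_validate_bq_gcs_label.txt'.format('TCGA')
# str(date.today()) is ISO 'YYYY-MM-DD', so on 2017-03-01 this yields
# 'gdc/doc/2017_03_01_TCGA_validate_bq_gcs_label.txt'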
if __name__ == "__main__":
    logger = create_log(log_name="subtitle", level=logging.INFO)
    logger.info("\n\n\n-------------------\n begin \n-------------------\n")
    logger.info(sys.argv)
    if len(sys.argv) < 2:
        # print "need args!!"
        logger.error("need args!!sys.argv:{0}".format(sys.argv))
        usage()
        logger.info("-----------------\n\n\n")
        sys.exit(2)
    main(sys.argv[1:], logger)
    logger.info("\n-----------------\n finish \n-----------------\n\n\n")
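# Hypothetical invocations, assuming this script is saved as subtitle.py
# (option behavior per the main() handlers below):
#   python subtitle.py -f movie.srt -t save     # scan a subtitle file and persist the word data
#   python subtitle.py -f movie.srt -l -m 50    # show matching lines, limit the word list to 50
#   python subtitle.py -w hello -W              # check a single word; output writing is disabled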
def main(argv=None, log_ger=None):
    if log_ger is None:
        log_ger = create_log(log_name="subtitle", level=logging.INFO)
    fname = None
    start_dtime = datetime.now()
    # print("Start time: "+str(start_dtime))  # .strftime("%Y-%m-%d %H:%M:%S")
    print()
    # sub = Subtitle(logging.getLogger())
    sub = Subtitle(log_ger)
    try:
        opts, args = getopt.getopt(
            argv, "hvf:w:t:d:e:p:s:b:?lm:WDc",
            ["help", "version", "parse", "checkup", "file=", "word=", "type=",
             "dir=", "excel=", "pickle=", "limit=", "section=", "bigger="])
        # print opts, args
        log_ger.info("opts:{0};args:{1}".format(opts, args))
    except getopt.GetoptError as msg:
        print("error happened when getting options!!! error:{0}".format(msg))
        usage()
        log_ger.error("getopt.GetoptError:{0}, exit!".format(msg))
        sys.exit(2)
    except Exception as msg:
        log_ger.error("error:{0}, exit!".format(msg))
        sys.exit(2)

    _is_lines_show = False
    _is_words_show = False
    sub_type = ""
    words_limit = None
    for opt, arg in opts:
        if opt in ("-?", "-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-v", "--version"):
            version()
            sys.exit()
        elif opt in ("-b", "--bigger"):
            sub.set_times_bigger(int(arg))
        elif opt in ("-c", "--checkup"):
            sub.checkup = True
        elif opt in ("-d", "--dir"):
            print("Sorry, the -d --dir option is not offered yet")
            sys.exit()
        elif opt in ("-e", "--excel"):
            pkl = arg
            sub.set_lexicon_file(pkl)
        elif opt in ("-s", "--section"):
            if ',' in arg:
                section = arg.split(',')
                if len(section) == 2:
                    # print(section)
                    start, end = section
                    if len(start) != 0:
                        sub.set_start(int(start))
                    if len(end) != 0:
                        sub.set_end(int(end))
                    # print(start, end)
                else:
                    print("something wrong with option -s --section:", arg)
                    sys.exit()
            else:
                print("something wrong with option -s --section:", arg)
                sys.exit()
        elif opt in ('-f', "--file"):
            fname = arg
            sub.add_file(fname)
        elif opt in ('-p', "--parse"):
            sub.set_parse(True)
        elif opt == '-D':
            log_ger.setLevel(logging.DEBUG)
            sub.set_logger(log_ger)
            sub.set_debug(True)
        elif opt in ("-w", "--word"):
            word = arg
            sub.add_word(word)
            # mostly used for testing, so skip writing the output
            sub.set_output(False)
        elif opt in ("-t", "--type"):
            sub_type = arg
            if sub_type not in ('save', 'scan', 'cloud'):
                usage()
                sys.exit()
        elif opt in ("-m", "--limit"):
            words_limit = int(arg)
            # print words_limit
            _is_words_show = True
        elif opt == '-l':
            # show lines
            _is_lines_show = True
        elif opt == '-W':
            # show words
            _is_words_show = True

    """
    if(len(sys.argv)<2):
        print "need args!!"
        log_ger.error("need args!!sys.argv:{0}".format(sys.argv))
        return None
    """
    # print sys.argv
    # sub.add_punctuation([',','!',';','.',':','>','<'])
    # sub.addLexicon(["hello", "world"])
    if sub.lexicon_path is None:
        sub.set_lexicon_file("lexicon/lexicon.xlsx")
    sub.load_old_data()
    sub.add_files(args)
    # sub.add_strings("hello world, I'm wang. Please call me wang.")
    sub.check_all(encode='utf-8')
    if _is_lines_show:
        sub.lines_show(words_limit)
    if _is_words_show:
        # print words_limit
        sub.words_show(words_limit)
    sub.show()
    if sub_type == 'save':
        sub.dump_data()
    elif sub_type == 'cloud':
        sub.cloud()
    print()
    end_dtime = datetime.now()
    # print("End time: "+str(end_dtime))
    timedelta = end_dtime - start_dtime
    print("Cost time: " + str(timedelta))
    # getChecksum(sys.argv[1])
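# A minimal sketch of the '-s start,end' parsing above, pulled out as a standalone
# helper (parse_section is hypothetical, not part of the script):
def parse_section(arg):
    start, end = arg.split(',')
    # an empty half leaves that bound open, e.g. ',20' means 'up to line 20'
    return (int(start) if start else None, int(end) if end else None)

# parse_section('10,20') -> (10, 20); parse_section(',20') -> (None, 20)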