def upload_archive(config, sdrf_metadata, archive2metadata, exclude_samples, archive_fields, upload_archives, seen_files, nonupload_files, access, log):
    '''
    uploads the files in the archive that meet the conditions

    parameters:
        config: the configuration map
        sdrf_metadata: metadata map to update
        archive2metadata: archive metadata
        exclude_samples: list of ffpe preserved samples or samples without a project assigned not to upload
        archive_fields: archive name, creation date, and URL
        upload_archives: map of level to center to platform of archives to upload
        seen_files: files that have been seen in a previously processed archive
        nonupload_files: list of file extensions of files not to upload
        access: either open or controlled
        log: logger to log any messages
    '''
    archive_path = None
    # guard clause: only proceed when downloads are enabled and this archive
    # is selected for upload (short-circuit keeps the filter call conditional)
    if not (config['download_archives'] and util.is_upload_archive(archive_fields[0], upload_archives, archive2metadata)):
        log.info('\tskipping %s-access archive %s' % (access, archive_fields[0]))
        return
    log.info('\tuploading %s-access archive %s.' % (access, archive_fields[0]))
    try:
        # the fourth-from-last dotted field of the archive name encodes the level
        data_level = archive_fields[0].split('.')[-4].replace('_', ' ')
        credentials = config['user_info']
        archive_path = util.setup_archive(config, archive_fields, log, credentials['user'], credentials['password'])
        file2metadata = process_files(config, archive_path, sdrf_metadata, seen_files, nonupload_files, exclude_samples, data_level, log)
        if file2metadata:
            upload_files(config, archive_path, file2metadata, log)
        else:
            log.warning('did not find files to load for %s' % (archive_fields[0]))
    finally:
        # always remove the unpacked archive, even if processing failed
        if archive_path:
            shutil.rmtree(archive_path)
    log.info('\tfinished uploading %s-access archive %s' % (access, archive_fields[0]))
def upload_archive(config, log, archive_fields, archive2metadata, sdrf_metadata, access):
    '''
    uploads and gathers metadata on the maf files in the archive

    parameters:
        config: the configuration map
        log: logger to log any messages
        archive_fields: archive name, creation date, and URL
        archive2metadata: archive metadata
        sdrf_metadata: metadata map to update
        access: either open or controlled
    '''
    user_info = config['user_info']
    log.info('\tchecking %s-access maf archive %s.' % (access, archive_fields[0]))
    # fix: initialize before the try so the finally clause cannot hit a
    # NameError when downloads are disabled or setup_archive() raises
    archive_path = None
    try:
        if config['download_archives']:
            archive_path = util.setup_archive(archive_fields, log, user_info['user'], user_info['password'])
            filenames = process_files(archive_path, log)
            if 0 < len(filenames):
                file2metadata = {}
                for file_name in filenames:
                    if file_name.endswith('maf'):
                        file2metadata[file_name] = parse_maf_file(
                            file_name, archive_path, log, archive_fields, archive2metadata, sdrf_metadata)
                upload_archives.upload_files(config, archive_path, file2metadata, log)
            else:
                log.warning('\tdid not find files to load for %s' % (archive_fields[0]))
        else:
            log.info('\tskipping %s-access archive %s' % (access, archive_fields[0]))
    finally:
        # only clean up when an archive was actually unpacked
        if archive_path:
            shutil.rmtree(archive_path)
def process_sdrf(config, log, magetab_archives, archive2metadata, barcode2annotations):
    """
    parse the sdrf and antibody-annotation files out of the magetab archives
    and upload them

    return types:
        barcode2files2term2values: maps aliquot barcode to a map with filenames for
        that barcode as keys to another map of terms based on the
        ['metadata_locations']['sdrf'] section of the config file
    """
    log.info('start processing sdrf')
    sdrf_pat = re.compile("^.*sdrf.txt$")
    anti_pat = re.compile("^.*antibody_annotation.txt$")
    barcode2files2term2values = {}
    archive2barcodes = {}
    for archive_fields in magetab_archives:
        # fix: initialize so the finally clause is safe if setup_archive() raises
        archive_path = None
        try:
            log.info('\tprocessing %s' % (archive_fields[0]))
            archive_path = util.setup_archive(archive_fields, log)
            files = os.listdir(archive_path)
            antibody_files = []
            cur_barcode2files2term2values = {}
            for file_name in files:
                if sdrf_pat.match(file_name):
                    parse_sdrf(config, log, archive_path + file_name, archive2metadata, cur_barcode2files2term2values,
                               archive2barcodes, archive_fields, file_name, barcode2annotations)
                    # fix: the comprehension used to reuse the name 'archive_fields',
                    # which clobbers the outer loop variable under Python 2 scoping
                    util.merge_metadata(barcode2files2term2values, cur_barcode2files2term2values,
                                        archive_fields[0] + ': ' + ','.join(fields[0] for fields in magetab_archives), log)
                    # fix: list(...) wrappers keep the first-value lookup working on
                    # both Python 2 and 3 (dict views are not indexable on 3)
                    upload_sdrf_file(config, archive_path, file_name,
                                     list(list(barcode2files2term2values.values())[0].values())[0], log)
                elif anti_pat.match(file_name):
                    antibody_files += [file_name]
            for file_name in antibody_files:
                upload_sdrf_file(config, archive_path, file_name,
                                 list(list(barcode2files2term2values.values())[0].values())[0], log)
        finally:
            if archive_path:
                shutil.rmtree(archive_path)
    log.info('finished processing sdrf')
    return barcode2files2term2values
def parse_archives(config, log, archives, study, archive2metadata, clinical_metadata, biospecimen_metadata, exclude_samples):
    '''
    downloads and unpacks the archives.  then parses, and if appropriate for the
    archive, uploads the files to GCS

    parameters:
        config: the configuration map
        log: logger to log any messages
        archives: information on the archives to unpack
        study: name of the TCGA study the files belongs to
        archive2metadata: metadata of the archive
        clinical_metadata: the return map for clinical metadata
        biospecimen_metadata: the return map for biospecimen metadata
        exclude_samples: samples to skip during parsing
    '''
    tmp_dir_parent = os.environ.get('ISB_TMP', '/tmp/')
    for archive_fields in archives:
        # only Level_1 archives carry the bio clinical/biospecimen files
        if 'Level_1' not in archive_fields[0]:
            log.info('skipping bio archive %s' % (archive_fields[0]))
            continue
        log.info('processing archive %s' % (archive_fields[0]))
        archive_path = os.path.join(tmp_dir_parent, archive_fields[0] + '/')
        # NOTE(review): this pre-created directory is immediately shadowed by the
        # setup_archive() return below -- presumably setup_archive unpacks into
        # the same location; confirm against util.setup_archive
        if not os.path.isdir(archive_path):
            os.makedirs(archive_path)
        archive_path = util.setup_archive(config, archive_fields, log)
        try:
            files = os.listdir(archive_path)
            parse_files(config, log, files, archive_path, archive_fields, study, archive2metadata, exclude_samples, clinical_metadata, biospecimen_metadata)
        finally:
            # fix: remove the unpacked archive even if parsing raises,
            # instead of leaking the temp directory
            shutil.rmtree(archive_path)
def parse_archives(config, log, archives, study, archive2metadata, clinical_metadata, biospecimen_metadata):
    '''
    downloads and unpacks the Level_1 bio archives, then parses the files,
    filling in the clinical and biospecimen return maps

    parameters:
        config: the configuration map
        log: logger to log any messages
        archives: information on the archives to unpack
        study: name of the TCGA study the files belong to
        archive2metadata: metadata of the archive
        clinical_metadata: the return map for clinical metadata
        biospecimen_metadata: the return map for biospecimen metadata
    '''
    tmp_dir_parent = os.environ.get('ISB_TMP', '/tmp/')
    for archive_fields in archives:
        # only Level_1 archives carry the bio clinical/biospecimen files
        if 'Level_1' not in archive_fields[0]:
            log.info('skipping bio archive %s' % (archive_fields[0]))
            continue
        log.info('processing archive %s' % (archive_fields[0]))
        archive_path = os.path.join(tmp_dir_parent, archive_fields[0] + '/')
        # NOTE(review): this pre-created directory is immediately shadowed by the
        # setup_archive() return below -- presumably setup_archive unpacks into
        # the same location; confirm against util.setup_archive
        if not os.path.isdir(archive_path):
            os.makedirs(archive_path)
        archive_path = util.setup_archive(archive_fields, log)
        try:
            files = os.listdir(archive_path)
            parse_files(config, log, files, archive_path, archive_fields, study, archive2metadata, clinical_metadata, biospecimen_metadata)
        finally:
            # fix: remove the unpacked archive even if parsing raises,
            # instead of leaking the temp directory
            shutil.rmtree(archive_path)
def upload_archive(config, sdrf_metadata, archive2metadata, ffpe_samples, archive_fields, upload_archives, seen_files, nonupload_files, access, log):
    '''
    upload the qualifying files from the given archive to GCS; archives are
    skipped when downloading is disabled or the archive is not selected for
    upload

    parameters:
        config: the configuration map
        sdrf_metadata: metadata map to update
        archive2metadata: archive metadata
        ffpe_samples: ffpe preserved samples not to upload
        archive_fields: archive name, creation date, and URL
        upload_archives: map of level to center to platform of archives to upload
        seen_files: files seen in a previously processed archive
        nonupload_files: file extensions of files not to upload
        access: either open or controlled
        log: logger to log any messages
    '''
    archive_name = archive_fields[0]
    # same short-circuit as the original condition: the upload filter is only
    # consulted when downloads are enabled
    do_upload = config['download_archives'] and util.is_upload_archive(archive_name, upload_archives, archive2metadata)
    archive_path = None
    if do_upload:
        log.info('\tuploading %s-access archive %s.' % (access, archive_name))
        try:
            # data level is encoded in the fourth-from-last dotted name field
            level = archive_name.split('.')[-4].replace('_', ' ')
            creds = config['user_info']
            archive_path = util.setup_archive(archive_fields, log, creds['user'], creds['password'])
            file2metadata = process_files(config, archive_path, sdrf_metadata, seen_files, nonupload_files, ffpe_samples, level, log)
            if len(file2metadata) > 0:
                upload_files(config, archive_path, file2metadata, log)
            else:
                log.warning('did not find files to load for %s' % (archive_name))
        finally:
            # clean up the unpacked archive whether or not processing succeeded
            if archive_path:
                shutil.rmtree(archive_path)
        log.info('\tfinished uploading %s-access archive %s' % (access, archive_name))
    else:
        log.info('\tskipping %s-access archive %s' % (access, archive_name))
def upload_archive(config, log, archive_fields, archive2metadata, sdrf_metadata, access):
    '''
    uploads and gathers metadata on the maf files in the archive

    parameters:
        config: the configuration map
        log: logger to log any messages
        archive_fields: archive name, creation date, and URL
        archive2metadata: archive metadata
        sdrf_metadata: metadata map to update
        access: either open or controlled
    '''
    user_info = config['user_info']
    log.info('\tchecking %s-access maf archive %s.' % (access, archive_fields[0]))
    # fix: initialize before the try block; previously archive_path was unbound
    # in the finally clause when downloads were disabled or setup_archive() raised
    archive_path = None
    try:
        if config['download_archives']:
            archive_path = util.setup_archive(archive_fields, log, user_info['user'], user_info['password'])
            filenames = process_files(archive_path, log)
            if 0 < len(filenames):
                file2metadata = {}
                for file_name in filenames:
                    if file_name.endswith('maf'):
                        file2metadata[file_name] = parse_maf_file(
                            file_name, archive_path, log, archive_fields, archive2metadata, sdrf_metadata)
                upload_archives.upload_files(config, archive_path, file2metadata, log)
            else:
                log.warning('\tdid not find files to load for %s' % (archive_fields[0]))
        else:
            log.info('\tskipping %s-access archive %s' % (access, archive_fields[0]))
    finally:
        # remove the unpacked archive only if one was actually set up
        if archive_path:
            shutil.rmtree(archive_path)
def upload_archive(config, log, archive_fields, archive2metadata, sdrf_metadata, seen_files, access):
    '''
    uploads and gathers metadata on the maf- and vcf-related files in the archive

    parameters:
        config: the configuration map
        log: logger to log any messages
        archive_fields: archive name, creation date, and URL
        archive2metadata: archive metadata
        sdrf_metadata: metadata map to update
        seen_files: files that have been seen in a previously processed archive
        access: either open or controlled
    '''
    user_info = config['user_info']
    log.info('\tchecking %s-access maf archive %s.' % (access, archive_fields[0]))
    archive_path = None
    try:
        # guard clause: nothing to do when downloading is disabled
        if not config['download_archives']:
            log.info('\tskipping %s-access archive %s' % (access, archive_fields[0]))
            return
        archive_path = util.setup_archive(config, archive_fields, log, user_info['user'], user_info['password'])
        filenames = process_files(archive_path, config['maf_upload_files'], seen_files, log)
        if not filenames:
            log.warning('\tdid not find files to load for %s' % (archive_fields[0]))
            return
        file2metadata = {}
        for file_name in filenames:
            # route each file to the parser matching its extension
            if file_name.endswith('maf'):
                file2metadata[file_name] = parse_maf_file(
                    file_name, archive_path, log, archive_fields, archive2metadata, sdrf_metadata)
            elif file_name.endswith(('vcf', 'vcf.gz')):
                file2metadata[file_name] = parse_vcf_file(
                    file_name, archive_path, log, archive_fields, archive2metadata, sdrf_metadata)
        upload_archives.upload_files(config, archive_path, file2metadata, log)
    finally:
        # clean up the unpacked archive if one was set up
        if archive_path:
            shutil.rmtree(archive_path)