def __init__(self, id, data):
    super(BaseBuildJob, self).__init__()
    self.id = id
    self.data = data
    self.package_path = os.path.join(root_path, self.id)
    self.package_cache = os.path.join(self.package_path, '.bacon.d')
    self.stored_fingerprint_path = os.path.join(self.package_cache, "fingerprint")
    self.dependency_jobs = []
    make_dir_if_needed(self.package_cache)
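
# Hedged sketch (not part of the original source): make_dir_if_needed() is used throughout
# this excerpt but defined elsewhere. It is assumed to simply create the directory tree,
# tolerating the case where it already exists, roughly like this:
def make_dir_if_needed(path):
    os.makedirs(path, exist_ok=True)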
def start_build(sra_id, wait_time, buffer_size_gb, container_type, required_ram_gb, available_ram_gb):
    input_dir = download_dir(sra_id)
    output_dir = build_dir(sra_id)
    num_cores = max(4, round(required_ram_gb / 3.75))
    logging.info(f'[{sra_id}] Starting build from {input_dir} to {output_dir}, buffer={round(buffer_size_gb, 2)}GB '
                 f'on {num_cores} cores')
    util.make_dir_if_needed(build_dir(sra_id))
    log_file_name = os.path.join(build_dir(sra_id), 'build.log')
    log_file = util.TeeLogger(log_file_name)
    write_log_header(log_file, 'build', sra_id, required_ram_gb, available_ram_gb)
    build_processes[sra_id] = (subprocess.Popen(
        ['./build.sh', sra_id, input_dir, output_dir, str(buffer_size_gb), str(num_cores), container_type],
        stdout=log_file, stderr=log_file), time.time(), wait_time, required_ram_gb)
    return True
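
# Worked example (illustrative only) of the core-count heuristic above: the 3.75 GB/core
# divisor presumably mirrors machine shapes with ~3.75 GB of RAM per vCPU (e.g. GCP
# n1-standard), and the floor of 4 keeps small builds from being starved of cores.
#   required_ram_gb = 30  ->  max(4, round(30 / 3.75)) == 8 cores
#   required_ram_gb = 6   ->  max(4, round(6 / 3.75))  == 4 cores (the floor applies)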
def start_download(download_resp):
    if 'id' not in download_resp:
        logging.info('No more downloads available. We\'re almost done!')
        return
    sra_id = download_resp['id']
    pending_processes.append(sra_id)
    dump_pending(pending_processes)
    util.make_dir_if_needed(download_dir(sra_id))
    log_file_name = os.path.join(download_dir(sra_id), 'download.log')
    log_file = util.TeeLogger(log_file_name, 'Stage')
    bucket = '0'
    if args.source == 'ncbi':
        if 'bucket' not in download_resp:
            logging.info(f'[{sra_id}] Specified NCBI as download source, but server response has no "bucket" field. '
                         f'Will download via HTTP instead of GCS')
        else:
            bucket = download_resp['bucket']
    download_processes[sra_id] = (
        subprocess.Popen(['./download.sh', args.source, sra_id, download_dir_base(), bucket],
                         stdout=log_file, stderr=log_file), time.time())
    sra_info[sra_id] = (time.time(),)
def __init__(self, id, data):
    super(JavaModuleBuildJob, self).__init__(id, data)
    self.classes_cache_directory = os.path.join(self.package_cache, "classes")
    self.test_classes_cache_directory = os.path.join(self.package_cache, "test-classes")
    self.archive_cache = os.path.join(self.package_cache, "dist")
    self.compile_dependencies = []
    self.test_dependencies = []
    self.parse_dependencies(id, "dependencies", self.compile_dependencies)
    self.parse_dependencies(id, "test-dependencies", self.test_dependencies)
    make_dir_if_needed(self.classes_cache_directory)
    make_dir_if_needed(self.test_classes_cache_directory)
    make_dir_if_needed(self.archive_cache)
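
# Hypothetical build-file snippet (illustrative only; the exact schema is an assumption).
# Based on the parsing here and in parse_java_build_file() below: "dependencies" and
# "test-dependencies" are lists where an entry with three ':'-separated parts is treated
# as an external (Maven-style) coordinate, and anything else is taken to be the path of
# another local package and parsed recursively by parse_build_file().
#
#   dependencies:
#     - org.yaml:snakeyaml:1.33     # external coordinate: group:artifact:version
#     - ../common-utils             # local package, built as a dependency job
#   test-dependencies:
#     - junit:junit:4.13.2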
def check_env():
    """
    Make sure all the necessary software is in place to successfully run the client
    and create the working directories
    """
    util.make_dir_if_needed(download_dir_base())
    util.make_dir_if_needed(build_dir_base())
    util.make_dir_if_needed(clean_dir_base())

    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG, datefmt='%Y-%m-%d %H:%M:%S')
    file_handler = logging.FileHandler(f'{args.output_dir}/client.log')
    file_handler.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s %(message)s')
    file_handler.setFormatter(formatter)
    logging.getLogger().addHandler(file_handler)

    if subprocess.call(['./prereq.sh']) != 0:
        logging.error("Some prerequisites are missing on this machine. Bailing out.")
        exit(1)
def parse_java_build_file(id, data):
    if 'dependencies' in data:
        for dependency in data['dependencies']:
            if len(dependency.split(":")) != 3:
                parse_build_file(dependency)
    return JavaModuleBuildJob(id, data)


root_path = sys.argv[1]
if len(sys.argv) > 2:
    task = sys.argv[2]
else:
    task = "build"

make_dir_if_needed(os.path.expanduser("~/.bacon.d"))
parse_build_file(root_path)

# print "build order:"
# print yaml.dump(build_order)
# print "task: %s" % task

if task == "compile":
    goals = ["compile"]
elif task == "clean":
    goals = ["clean"]
elif task == "test":
    goals = ["compile", "compileTest", "test"]
elif task == "run":
    goals = ["compile", "compileTest", "test", "run"]
def check_status():
    global must_quit
    if must_quit:
        return False

    total_reserved_ram_gb = 0  # how much memory all active processes need
    completed_downloads = set()
    for sra_id, (download_process, start_time) in download_processes.items():
        return_code = download_process.poll()
        is_timed_out = (time.time() - start_time) > 120 * 60
        if return_code is not None or is_timed_out:
            if os.path.exists(os.path.join(download_dir(sra_id), 'code')):
                return_code = int(open(os.path.join(download_dir(sra_id), 'code')).read())
            elif is_timed_out:
                logging.warning(f'[{sra_id}] Download timed out after {time.time() - start_time} seconds.')
                return_code = 254
            else:
                logging.error(f'[{sra_id}] Download process did not provide a return code. Assuming error')
                return_code = 255
            completed_downloads.add(sra_id)
            log_file_name = os.path.join(download_dir(sra_id), 'download.log')
            logging.info(f'[{sra_id}] Download finished with output\n {util.tab(open(log_file_name).readlines())}\n\n\n')
            log_new_file_name = os.path.join(clean_dir(sra_id), 'download.log')
            util.make_dir_if_needed(clean_dir(sra_id))
            os.rename(log_file_name, log_new_file_name)

            download_path = download_dir(sra_id)
            sra_dir = os.path.join(download_path, 'sra')
            download_size_mb = util.dir_size_MB(sra_dir)
            size_file = os.path.join(download_dir(sra_id), 'size')
            if os.path.exists(size_file):
                sra_size_mb = round(int(open(size_file).read()) / 1e6, 2)
                logging.info(f'Downloaded sra files have {sra_size_mb}MB')
            else:
                logging.warning('Could not find size file. Reporting size -1')
                sra_size_mb = -1
            subprocess.run(['rm', '-rf', sra_dir])
            kmc_dir = os.path.join(download_path, 'kmc')
            kmc_size_mb = util.dir_size_MB(kmc_dir)

            success = True
            if return_code == 0:
                logging.info(f'[{sra_id}] Download completed successfully.')
                stats_file = os.path.join(download_path, 'stats')
                try:
                    with open(stats_file) as stats:
                        json_resp = json.loads(stats.read())
                        if '#k-mers_coverage' in json_resp and '#k-mers_below_min_threshold' in json_resp:
                            kmer_count_unique = int(json_resp['#Unique_counted_k-mers'])
                            kmer_coverage = int(json_resp['#k-mers_coverage'])
                            kmer_count_singletons = int(json_resp['#k-mers_below_min_threshold'])
                        else:
                            logging.warning(f'[{sra_id}] Invalid KMC stat files, assuming failure')
                            success = False
                except FileNotFoundError:
                    logging.warning(f'[{sra_id}] Could not find KMC stats file {stats_file}, bailing out.')
                    success = False
            else:
                success = False

            if success:
                params = {'id': sra_id, 'time': int(time.time() - start_time), 'size_mb': sra_size_mb,
                          'download_size_mb': download_size_mb, 'kmc_size_mb': kmc_size_mb,
                          'kmer_count_unique': kmer_count_unique, 'kmer_coverage': kmer_coverage,
                          'kmer_count_singletons': kmer_count_singletons}
                sra_info[sra_id] = (
                    *sra_info[sra_id], sra_size_mb, kmer_count_unique, kmer_coverage, kmer_count_singletons)
                ack('download', params)
                waiting_builds[sra_id] = (time.time())
            else:
                logging.warning(f'[{sra_id}] Download failed. '
                                f'Removing {download_path}')
                subprocess.run(['rm', '-rf', download_path])
                params = {'id': sra_id, 'time': int(time.time() - start_time), 'size_mb': sra_size_mb,
                          'download_size_mb': download_size_mb, 'kmc_size_mb': kmc_size_mb,
                          'exit_code': return_code}
                nack('download', params)
        else:
            total_reserved_ram_gb += 2  # approximate 2GB of RAM for each download process (bc of KMC)
    for d in completed_downloads:
        del download_processes[d]

    completed_builds = set()
    used_cores = 0
    for sra_id, (build_process, start_time, wait_time, reserved_ram_gb) in build_processes.items():
        return_code = build_process.poll()
        if return_code is not None:
            completed_builds.add(sra_id)
            log_file_name = os.path.join(build_dir(sra_id), 'build.log')
            logging.info(f'[{sra_id}] Build finished with output\n {util.tab(open(log_file_name).readlines())}\n\n\n')
            log_new_file_name = os.path.join(clean_dir(sra_id), 'build.log')
            os.rename(log_file_name, log_new_file_name)

            # clean up the download path; if adding retries, do this only on success
            download_path = download_dir(sra_id)
            logging.info(f'[{sra_id}] Cleaning up {download_path}')
            subprocess.run(['rm', '-rf', download_path])

            build_path = build_dir(sra_id)
            build_size_mb = util.dir_size_MB(build_path)
            if return_code == 0:
                logging.info(f'[{sra_id}] Building graph completed successfully.')
                sanity = check_sanity(sra_id)
                params = {'id': sra_id, 'time': int(time.time() - start_time), 'wait_time': int(wait_time),
                          'size_mb': build_size_mb, 'sanity': sanity}
                ack('build', params)
                waiting_cleans[sra_id] = (time.time())
            else:
                logging.warning(f'[{sra_id}] Building graph failed. Removing {build_path}.')
                subprocess.run(['rm', '-rf', build_path])
                params = {'id': sra_id, 'time': int(time.time() - start_time), 'wait_time': int(wait_time),
                          'size_mb': build_size_mb, 'return_code': return_code}
                nack('build', params)
        else:
            total_reserved_ram_gb += reserved_ram_gb
            used_cores += max(4, round(reserved_ram_gb / 3.75))
    for d in completed_builds:
        del build_processes[d]

    completed_cleans = set()
    for sra_id, (clean_process, start_time, wait_time, reserved_ram_gb) in clean_processes.items():
        return_code = clean_process.poll()
        if return_code is not None:
            completed_cleans.add(sra_id)
            log_file_name = os.path.join(clean_dir(sra_id), 'clean.log')
            logging.info(f'[{sra_id}] Clean finished with output\n {util.tab(open(log_file_name).readlines())}\n\n\n')

            # clean up the original graph; if adding retries, do this only on success
            build_path = build_dir(sra_id)
            logging.info(f'[{sra_id}] Cleaning up {build_path}')
            subprocess.run(['rm', '-rf', build_path])

            cleaned_dir = clean_dir(sra_id)
            cleaned_size_mb = util.dir_size_MB(cleaned_dir)
            if return_code == 0:
                logging.info(f'[{sra_id}] Cleaning graph completed successfully.')
                params = {'id': sra_id, 'time': int(time.time() - start_time), 'wait_time': int(wait_time),
                          'size_mb': cleaned_size_mb}
                ack('clean', params)
                start_transfer(sra_id, cleaned_dir, 'clean')
            else:
                params = {'id': sra_id, 'time': int(time.time() - start_time), 'wait_time': int(wait_time),
                          'size_mb': cleaned_size_mb, 'return_code': return_code}
                nack('clean', params)
                logging.warning(f'[{sra_id}] Cleaning graph failed. '
                                f'Removing {cleaned_dir}')
                subprocess.run(['rm', '-rf', cleaned_dir])
        else:
            total_reserved_ram_gb += reserved_ram_gb
            used_cores += max(4, round(reserved_ram_gb / 3.75))
    for d in completed_cleans:
        del clean_processes[d]

    completed_transfers = set()
    for sra_id, (transfer_process, start_time) in transfer_processes.items():
        return_code = transfer_process.poll()
        if return_code is not None:
            completed_transfers.add(sra_id)

            # clean up the cleaned graph; if adding retries, do this only on success
            clean_path = clean_dir(sra_id)
            cleaned_size_mb = util.dir_size_MB(clean_path)
            logging.info(f'[{sra_id}] Cleaning up {clean_path}')
            subprocess.run(['rm', '-rf', clean_path])

            if return_code == 0:
                logging.info(f'[{sra_id}] Transferring graph completed successfully.')
                params = {'id': sra_id, 'time': int(time.time() - start_time),
                          'total_time': int(time.time() - sra_info[sra_id][0]),
                          'size_init_mb': sra_info[sra_id][1], 'size_final_mb': cleaned_size_mb}
                ack('transfer', params)
            else:
                logging.warning(f'[{sra_id}] Transferring cleaned graph failed.')
                params = {'id': sra_id, 'time': int(time.time() - start_time), 'size_mb': cleaned_size_mb}
                nack('transfer', params)

    # for cleaning we allow using all the available RAM
    total_ram_gb = psutil.virtual_memory().total / 1e9
    not_reserved_ram_gb = total_ram_gb - total_reserved_ram_gb
    # TODO: figure out why we have so much free memory when all cores are exhausted
    if used_cores < 2 * CORES and waiting_cleans:
        logging.info(f'Ram reserved {round(total_reserved_ram_gb, 2)}GB, total {round(total_ram_gb, 2)}')
        for sra_id, start_time in waiting_cleans.items():
            # remove the old clean waiting and append the new one after
            build_path = build_dir(sra_id)
            build_size_gb = util.dir_size_MB(build_path) / 1e3
            required_ram_gb = max(build_size_gb * 2, build_size_gb + 1)
            if not_reserved_ram_gb > required_ram_gb:
                logging.info(
                    f'[{sra_id}] Estimated {required_ram_gb}GB needed for cleaning, available {not_reserved_ram_gb} GB')
                kmer_count_unique = sra_info[sra_id][2]
                kmer_coverage = sra_info[sra_id][3]
                kmer_count_singletons = sra_info[sra_id][4]
                fallback = 5 if kmer_coverage > 5 else 2 if kmer_coverage > 2 or kmer_count_unique > 1e6 else 1
                # multiplying singletons by 2 bc we compute canonical graph and KMC doesn't
                start_clean(sra_id, time.time() - start_time, 2 * kmer_count_singletons, fallback, required_ram_gb,
                            not_reserved_ram_gb)
                not_reserved_ram_gb -= required_ram_gb
                del waiting_cleans[sra_id]
                break
            logging.info(f'[{sra_id}] Not enough RAM for cleaning. '
                         f'Have {round(not_reserved_ram_gb, 2)}GB need {round(build_size_gb + 0.5, 2)}GB')

    if used_cores < 2 * CORES and waiting_builds:
        logging.info(f'Ram reserved {round(total_reserved_ram_gb, 2)}GB, total {round(total_ram_gb, 2)}')
        for sra_id, start_time in waiting_builds.items():
            num_kmers = sra_info[sra_id][2]
            # estimate RAM needed for loading graph in memory
            bytes_per_kmer = 2.6  # 0.6 bytes/kmer (for --small representation), 2 byte/kmer-count
            kmer_count = 2.6 * num_kmers  # 2x canonical+non-canonical + ~30% for dummy kmers (typically it's 10%)
            required_ram_gb = round(kmer_count * bytes_per_kmer / 1e9 + 0.5, 2)
            if required_ram_gb > total_ram_gb - 2:
                download_path = download_dir(sra_id)
                logging.warning(
                    f'[{sra_id}] Building graph needs too much RAM: {required_ram_gb}GB. '
                    f'Removing {download_path}.')
                subprocess.run(['rm', '-rf', download_path])
                params = {'id': sra_id, 'time': int(time.time() - start_time), 'required_ram_gb': required_ram_gb}
                nack('build', params)
                del waiting_builds[sra_id]
                break
            elif required_ram_gb < not_reserved_ram_gb and not_reserved_ram_gb > 2:
                logging.info(
                    f'[{sra_id}] Estimated {required_ram_gb}GB needed for building, available {not_reserved_ram_gb} GB')
                # how much memory does it take to load all unique kmers into RAM: 8B for the kmer, 2B for the count
                required_ram_all_mem_gb = num_kmers * (8 + 2) * 3.5 / 1e9  # also account for dummy kmers
                if required_ram_all_mem_gb < 5 and required_ram_all_mem_gb < not_reserved_ram_gb:
                    required_ram_gb = max(required_ram_gb, required_ram_all_mem_gb)
                    start_build(sra_id, time.time() - start_time, math.ceil(required_ram_all_mem_gb), 'vector',
                                required_ram_gb, not_reserved_ram_gb)
                else:
                    buffer_size_gb = max(2, min(round(required_ram_gb * 0.8 - 1), 20))
                    start_build(sra_id, time.time() - start_time, buffer_size_gb, 'vector_disk', required_ram_gb,
                                not_reserved_ram_gb)
                del waiting_builds[sra_id]
                not_reserved_ram_gb -= required_ram_gb  # not that it matters
                break
            else:
                logging.info(
                    f'[{sra_id}] Not enough RAM for building. Have {round(total_ram_gb - total_reserved_ram_gb, 2)}GB '
                    f'need {required_ram_gb}GB')

    for d in completed_transfers:
        del transfer_processes[d]

    return download_processes or build_processes or clean_processes or transfer_processes or not downloads_done
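
# Worked example (illustrative only) of the build RAM estimates in check_status(), for a
# hypothetical sample where KMC reported num_kmers = 1e9 unique counted k-mers:
#   kmer_count              = 2.6 * 1e9                           # canonical + non-canonical + dummy k-mers
#   required_ram_gb         = round(2.6e9 * 2.6 / 1e9 + 0.5, 2)   # == 7.26 GB with the --small representation
#   required_ram_all_mem_gb = 1e9 * (8 + 2) * 3.5 / 1e9           # == 35.0 GB to keep every k-mer + count in RAM
# Since 35 GB is well above the 5 GB threshold, such a sample would be built with the
# disk-backed 'vector_disk' container rather than the in-memory 'vector' one.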
                    default='nandos')
parser.add_argument('--image_dir', default='data/letters/my')
parser.add_argument('--label_path', default='data/letters/my.txt')
parser.add_argument('--model_dir', default='model/')
parser.add_argument('--verbosity', default=500, type=int)
parser.add_argument('--network', default='efficientdet-d0')
parser.add_argument('--device', default='cuda')
parser.add_argument('--checkpoint', default=None)
parser.add_argument('--prefix', default='letters')
args = parser.parse_args()

if __name__ == '__main__':
    device = args.device
    make_dir_if_needed(args.model_dir)
    calc_loss = total_loss()
    if args.dataset == 'nandos':
        train_dataset = NandosDataset(args.image_dir, args.label_path, device=device,
                                      transform=transforms.Compose([
                                          Augmenter(),
                                          MaxSizeResizer(1280),
                                          SquarePad(),
                                          ToTensor(),
                                      ]))
    elif args.dataset == 'letters':
        train_dataset = LetterDataset(args.image_dir,