def left_to_renumber_mmCIF(default_input_path_to_mmCIF=current_directory + "/mmCIF",
                           default_output_path_to_mmCIF=current_directory + "/output_mmCIF"):
    """Return the input mmCIF files that have no same-named file in the output folder.

    Both folders are listed through ``look_what_is_inside``. Names are compared
    as whole strings, and the order of the input listing is preserved.

    :param default_input_path_to_mmCIF: folder holding the downloaded mmCIF files
    :param default_output_path_to_mmCIF: folder holding the renumbered mmCIF files
    :return: list of input file names still absent from the output listing
    """
    found_inputs = look_what_is_inside(
        "mmCIF", default_input_path_to_mmCIF=default_input_path_to_mmCIF)
    found_outputs = look_what_is_inside(
        "output_mmCIF", default_output_path_to_mmCIF=default_output_path_to_mmCIF)

    # An input file is "left" exactly when its name is not among the outputs.
    already_renumbered = set(found_outputs)
    return [name for name in found_inputs if name not in already_renumbered]
def left_to_renumber_PDB(default_input_path_to_PDB=current_directory + "/PDB",
                         default_output_path_to_PDB=current_directory + "/output_PDB"):
    """Return names of input PDB files whose ID is not present in the output folder.

    The ID of an input file is taken from characters 3..6 of its name, the ID
    of an output file from its first four characters. Every ID found only on
    the input side is turned back into a name of the form ``pdb<ID>.ent.gz``.

    :param default_input_path_to_PDB: folder holding the downloaded PDB files
    :param default_output_path_to_PDB: folder holding the renumbered PDB files
    :return: list of ``pdb<ID>.ent.gz`` names (order follows set iteration)
    """
    found_inputs = look_what_is_inside(
        "PDB", default_input_path_to_PDB=default_input_path_to_PDB)
    found_outputs = look_what_is_inside(
        "output_PDB", default_output_path_to_PDB=default_output_path_to_PDB)

    renumbered_ids = {name[:4] for name in found_outputs}
    pending_ids = {name[3:7] for name in found_inputs} - renumbered_ids
    return ["pdb" + pdb_id + ".ent.gz" for pdb_id in pending_ids]
def ProcessPool_run_renum_mmCIF(format_mmCIF, mmCIF_to_renumber, default_input_path_to_mmCIF,
                                default_input_path_to_SIFTS, default_output_path_to_mmCIF, default_mmCIF_num,
                                gzip_mode, exception_AccessionIDs, nproc):
    """Renumber mmCIF files in worker processes, then check the produced files.

    Up to three rounds are run. Each round:

    1. submits ``master_mmCIF_renumber_function`` for every file in
       ``mmCIF_to_renumber`` and collects the non-``None`` results;
    2. submits ``check_assemblies`` for every file listed in the output folder;
    3. lists the output folder again and stops when it holds at least as many
       files as were checked; otherwise the next round is restricted to the
       inputs whose first four characters match no output file.

    :param format_mmCIF: ``"mmCIF_assembly"`` selects the assembly output
        listing; any other value selects the plain mmCIF output listing. The
        value is also shown in the progress-bar captions.
    :param mmCIF_to_renumber: iterable of input file names
    :param default_input_path_to_mmCIF: forwarded to the renumbering function
    :param default_input_path_to_SIFTS: forwarded to the renumbering function
    :param default_output_path_to_mmCIF: output folder (created when missing)
    :param default_mmCIF_num: forwarded to the renumbering function
    :param gzip_mode: forwarded to the renumbering function
    :param exception_AccessionIDs: forwarded to the renumbering function
    :param nproc: ``max_workers`` for both process pools
    :return: the non-``None`` renumbering results of the first round only
    """

    def list_output_files():
        # The same folder is listed under a different key for assemblies.
        if format_mmCIF == "mmCIF_assembly":
            return look_what_is_inside(
                'output_mmCIF_assembly',
                default_output_path_to_mmCIF_assembly=default_output_path_to_mmCIF)
        return look_what_is_inside(
            'output_mmCIF', default_output_path_to_mmCIF=default_output_path_to_mmCIF)

    partial_master_mmCIF_renumber_function = partial(
        master_mmCIF_renumber_function,
        default_input_path_to_mmCIF=default_input_path_to_mmCIF,
        default_input_path_to_SIFTS=default_input_path_to_SIFTS,
        default_output_path_to_mmCIF=default_output_path_to_mmCIF,
        default_mmCIF_num=default_mmCIF_num,
        gzip_mode=gzip_mode,
        exception_AccessionIDs=exception_AccessionIDs)
    partial_reform_assembly = partial(
        check_assemblies,
        default_output_path_to_mmCIF_assembly=default_output_path_to_mmCIF)

    first_res = 0
    for attempt in range(3):
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(default_output_path_to_mmCIF, exist_ok=True)

        # renumber loop
        resulting = []
        # The context manager shuts the pool down even if a job raises;
        # previously each round left its worker pools without an explicit shutdown.
        with ProcessPoolExecutor(max_workers=nproc) as executor:
            jobs = [executor.submit(partial_master_mmCIF_renumber_function, mmCIF_file)
                    for mmCIF_file in mmCIF_to_renumber]
            for job in tqdm.tqdm(as_completed(jobs), total=len(jobs), miniters=1, position=0,
                                 leave=True, desc="Renumbering " + format_mmCIF + " files"):
                result = job.result()
                if result is not None:
                    resulting.append(result)

        # Only the first round's results are reported to the caller.
        if attempt == 0:
            first_res = resulting

        # checker loop
        output_mmCIF = list_output_files()
        check_list = []
        with ProcessPoolExecutor(max_workers=nproc) as executor:
            jobs = [executor.submit(partial_reform_assembly, assembly_file)
                    for assembly_file in output_mmCIF]
            for job in tqdm.tqdm(as_completed(jobs), total=len(jobs), miniters=1, position=0,
                                 leave=True, desc="Checking " + format_mmCIF + " files"):
                check_list.append(job.result())

        # Re-list after checking: fewer files than were checked means some
        # outputs are gone and their inputs need another round.
        output_mmCIF = list_output_files()
        if len(check_list) <= len(output_mmCIF):
            break

        output_mmCIF_4char = {name[:4] for name in output_mmCIF}
        mmCIF_to_renumber = {name for name in mmCIF_to_renumber
                             if name[:4] not in output_mmCIF_4char}

    return first_res
def run_downloads_with_ThreadPool(
        format_to_download="mmCIF", urls_to_target=(),
        default_input_path_to_mmCIF=current_directory + "/mmCIF",
        default_input_path_to_PDB=current_directory + "/PDB",
        default_input_path_to_SIFTS=current_directory + "/SIFTS",
        default_input_path_to_mmCIF_assembly=current_directory + "/mmCIF_assembly",
        default_input_path_to_PDB_assembly=current_directory + "/PDB_assembly"):
    """Download every URL with a thread pool, retrying the missing ones.

    Up to three rounds are run. After each round the matching input folder is
    listed, and a URL counts as downloaded when the first four characters of
    its file name equal the first four characters of any listed file. Only
    the URLs still missing are submitted again in the next round; the loop
    stops early once nothing is missing.

    The folder to list is chosen from the three characters that precede the
    final ``/`` of the last URL (``CIF``, ``pdb``, ``xml``, ``all`` or
    ``try``). For any other value nothing is treated as present.

    :param format_to_download: label used in the progress-bar caption
    :param urls_to_target: iterable of URL strings
    :param default_input_path_to_mmCIF: forwarded to ``download_with_pool`` and
        used for the mmCIF listing
    :param default_input_path_to_PDB: same, for PDB
    :param default_input_path_to_SIFTS: same, for SIFTS
    :param default_input_path_to_mmCIF_assembly: same, for mmCIF assemblies
    :param default_input_path_to_PDB_assembly: same, for PDB assemblies
    :return: None
    """
    partial_download_with_pool = partial(
        download_with_pool,
        default_input_path_to_mmCIF=default_input_path_to_mmCIF,
        default_input_path_to_PDB=default_input_path_to_PDB,
        default_input_path_to_SIFTS=default_input_path_to_SIFTS,
        default_input_path_to_mmCIF_assembly=default_input_path_to_mmCIF_assembly,
        default_input_path_to_PDB_assembly=default_input_path_to_PDB_assembly)

    for _round in range(3):
        # The context manager releases the worker threads after every round.
        with ThreadPoolExecutor() as executor:
            jobs = [executor.submit(partial_download_with_pool, url) for url in urls_to_target]
            for _ in tqdm.tqdm(as_completed(jobs), total=len(jobs), miniters=1, position=0,
                               leave=True, desc="Downloading " + format_to_download + " files"):
                pass

        # Stays 0 when there are no URLs, which selects the empty listing below.
        format_of_db = 0
        for url in urls_to_target:
            format_start_pos = url.rfind("/") + 1 - 4
            format_of_db = url[format_start_pos:format_start_pos + 3]

        if format_of_db == "CIF":
            input_files = look_what_is_inside(
                "mmCIF", default_input_path_to_mmCIF=default_input_path_to_mmCIF)
        elif format_of_db == "pdb":
            input_files = look_what_is_inside(
                'PDB', default_input_path_to_PDB=default_input_path_to_PDB)
        elif format_of_db == "xml":
            input_files = look_what_is_inside(
                'SIFTS', default_input_path_to_SIFTS=default_input_path_to_SIFTS)
        elif format_of_db == "all":
            input_files = look_what_is_inside(
                'PDB_assembly',
                default_input_path_to_PDB_assembly=default_input_path_to_PDB_assembly)
        elif format_of_db == "try":
            input_files = look_what_is_inside(
                'mmCIF_assembly',
                default_input_path_to_mmCIF_assembly=default_input_path_to_mmCIF_assembly)
        else:
            input_files = set()

        present_4char = {name[:4] for name in input_files}

        # Keep only the URLs whose file is still missing. The missing set used
        # to be computed and then discarded, so each retry re-submitted every URL.
        urls_to_target = [url for url in urls_to_target
                          if url[url.rfind("/") + 1:][:4] not in present_4char]
        if not urls_to_target:
            break
urls_to_target_mmCIF_assembly_files = url_formation_for_pool("mmCIF_assembly", parsed_input_text, default_input_path_to_mmCIF_assembly=default_input_path_to_mmCIF_assembly) urls_to_target_PDB_files = url_formation_for_pool("PDB", parsed_input_text, default_input_path_to_PDB=default_input_path_to_PDB) urls_to_target_SIFTS_files = url_formation_for_pool("SIFTS", parsed_input_text, default_input_path_to_SIFTS=default_input_path_to_SIFTS) run_downloads_with_ThreadPool("mmCIF", urls_to_target_mmCIF_files, default_input_path_to_mmCIF=default_input_path_to_mmCIF) run_downloads_with_ThreadPool("mmCIF_assembly", urls_to_target_mmCIF_assembly_files, default_input_path_to_mmCIF_assembly=default_input_path_to_mmCIF_assembly) run_downloads_with_ThreadPool("PDB", urls_to_target_PDB_files, default_input_path_to_PDB=default_input_path_to_PDB) run_downloads_with_ThreadPool("SIFTS", urls_to_target_SIFTS_files, default_input_path_to_SIFTS=default_input_path_to_SIFTS) # renum PDB passed_as_arg_file_4Char_PDB = list() for file_name in parsed_input_text: passed_as_arg_file_4Char_PDB.append(file_name[:4]) input_PDB_files_were_found = look_what_is_inside("PDB", default_input_path_to_PDB=default_input_path_to_PDB) target_files_list_PDB = list() for file_name in input_PDB_files_were_found: if file_name[3:7] in passed_as_arg_file_4Char_PDB: target_files_list_PDB.append(file_name) ProcessPool_run_renum_PDB("PDB", target_files_list_PDB, default_input_path_to_PDB, default_input_path_to_SIFTS, default_output_path_to_PDB, default_PDB_num, gzip_mode, exception_AccessionIDs, nproc) # renum mmCIF_assembly input_mmCIF_files_were_found = look_what_is_inside("mmCIF_assembly", default_input_path_to_mmCIF_assembly=default_input_path_to_mmCIF_assembly) passed_as_arg_file_4Char_mmCIF = list() for file_name in parsed_input_text: passed_as_arg_file_4Char_mmCIF.append(file_name[:4]) target_files_list_mmCIF = list() for file_name in input_mmCIF_files_were_found:
def _remove_tree_if_present(path):
    """Delete the directory tree at *path* when it exists."""
    if os.path.exists(path):
        shutil.rmtree(path)


def _download_missing_catalog_files(kind, catalog_files, input_path):
    """List the *kind* folder, download what the catalog has beyond it, return that remainder."""
    # look_what_is_inside, url_formation_for_pool and run_downloads_with_ThreadPool
    # are all called with a keyword named after the format.
    path_kwarg = {"default_input_path_to_" + kind: input_path}
    files_found = lookfilesinside.look_what_is_inside(kind, **path_kwarg)
    left_to_download = lefttodownload.what_is_left_to_download(files_found, catalog_files)
    urls_to_target = url_formation_for_pool(kind, left_to_download, **path_kwarg)
    run_downloads_with_ThreadPool(kind, urls_to_target, **path_kwarg)
    return left_to_download


def _download_missing_mmCIF_assemblies(all_mmCIF_files, input_path):
    """Download mmCIF assemblies for catalog IDs absent from the folder; return their ``<ID>.cif.gz`` names."""
    files_found = lookfilesinside.look_what_is_inside(
        "mmCIF_assembly", default_input_path_to_mmCIF_assembly=input_path)
    catalog_ids = {name[:4] for name in all_mmCIF_files}
    present_ids = {name[:4] for name in files_found}
    left_to_download = [mmCIF_id + ".cif.gz" for mmCIF_id in catalog_ids - present_ids]
    urls_to_target = url_formation_for_pool(
        "mmCIF_assembly", left_to_download, default_input_path_to_mmCIF_assembly=input_path)
    run_downloads_with_ThreadPool(
        "mmCIF_assembly", urls_to_target, default_input_path_to_mmCIF_assembly=input_path)
    return left_to_download


def _download_missing_PDB_assemblies(input_path):
    """Download PDB assembly URLs whose file is absent from the folder; return them, or None if the listing has no length."""
    all_assembly_urls = download_pdb_assemblies_list_with_lxml()
    files_found = lookfilesinside.look_what_is_inside(
        "PDB_assembly", default_input_path_to_PDB_assembly=input_path)
    try:
        len(all_assembly_urls)
    except TypeError:
        print("Cannot reach https://ftp.wwpdb.org/pub/pdb/data/biounit/PDB/all/ maybe try again later")
        return None
    # Set membership keeps the filter linear in the number of URLs.
    present = set(files_found)
    left_to_download = [assembly for assembly in all_assembly_urls
                        if assembly.rsplit('/', 1)[-1] not in present]
    run_downloads_with_ThreadPool(
        "PDB_assembly", left_to_download, default_input_path_to_PDB_assembly=input_path)
    return left_to_download


def supreme_download_master(format_of_db, job_type=None,
                            default_input_path_to_mmCIF=current_directory + "/mmCIF",
                            default_input_path_to_PDB=current_directory + "/PDB",
                            default_input_path_to_SIFTS=current_directory + "/SIFTS",
                            default_input_path_to_mmCIF_assembly=current_directory + "/mmCIF_assembly",
                            default_input_path_to_PDB_assembly=current_directory + "/PDB_assembly",
                            default_output_path_to_mmCIF=current_directory + "/output_mmCIF",
                            default_output_path_to_PDB=current_directory + "/output_PDB",
                            default_output_path_to_mmCIF_assemblies=current_directory + "/output_mmCIF_assembly",
                            default_output_path_to_PDB_assemblies=current_directory + "/output_PDB_assembly"):
    """Download whatever the latest catalog lists that is not yet on disk.

    ``catalogdownloader.catalog_downloader()`` is always called first. With
    ``job_type == "refresh"`` the SIFTS folder and the input/output folders of
    the selected format are deleted before downloading.

    The custom ``default_input_path_to_PDB_assembly`` (in the ``"all"`` branch)
    and ``default_input_path_to_mmCIF_assembly`` (in the ``"mmCIF_assembly"``
    branch) are passed to every helper call; two calls used to omit them.

    :param format_of_db: ``"mmCIF"``, ``"mmCIF_assembly"``, ``"PDB"``,
        ``"PDB_assembly"`` or ``"all"``
    :param job_type: ``"refresh"`` wipes the relevant folders first
    :return: for a single format, the list of files (URLs for
        ``"PDB_assembly"``) that were left to download; for ``"all"``, the list
        ``[mmCIF, PDB, mmCIF assemblies, PDB assemblies]``. ``None`` when the
        PDB assembly listing could not be obtained or the format is not one of
        the five above.
    """
    catalogdownloader.catalog_downloader()

    if job_type == "refresh":
        _remove_tree_if_present(default_input_path_to_SIFTS)
        dirs_to_wipe = {
            "mmCIF": (default_input_path_to_mmCIF, default_output_path_to_mmCIF),
            "mmCIF_assembly": (default_input_path_to_mmCIF_assembly,
                               default_output_path_to_mmCIF_assemblies),
            "PDB": (default_input_path_to_PDB, default_output_path_to_PDB),
            "PDB_assembly": (default_input_path_to_PDB_assembly,
                             default_output_path_to_PDB_assemblies),
            "all": (default_input_path_to_PDB,
                    default_input_path_to_mmCIF,
                    default_input_path_to_PDB_assembly,
                    default_input_path_to_mmCIF_assembly,
                    default_output_path_to_mmCIF,
                    default_output_path_to_mmCIF_assemblies,
                    default_output_path_to_PDB,
                    default_output_path_to_PDB_assemblies),
        }
        for path in dirs_to_wipe.get(format_of_db, ()):
            _remove_tree_if_present(path)

    # The catalog reader result is indexed as 0 = mmCIF, 1 = PDB, 2 = SIFTS.
    if format_of_db == "mmCIF":
        all_data_from_catreader = latestcatreader.latest_catalog_reader()
        left_to_download_mmCIF = _download_missing_catalog_files(
            "mmCIF", all_data_from_catreader[0], default_input_path_to_mmCIF)
        _download_missing_catalog_files(
            "SIFTS", all_data_from_catreader[2], default_input_path_to_SIFTS)
        return left_to_download_mmCIF

    if format_of_db == "mmCIF_assembly":
        all_data_from_catreader = latestcatreader.latest_catalog_reader()
        lefttodownload_mmCIF_assemblies = _download_missing_mmCIF_assemblies(
            all_data_from_catreader[0], default_input_path_to_mmCIF_assembly)
        _download_missing_catalog_files(
            "SIFTS", all_data_from_catreader[2], default_input_path_to_SIFTS)
        return lefttodownload_mmCIF_assemblies

    if format_of_db == "PDB":
        all_data_from_catreader = latestcatreader.latest_catalog_reader()
        left_to_download_PDB = _download_missing_catalog_files(
            "PDB", all_data_from_catreader[1], default_input_path_to_PDB)
        _download_missing_catalog_files(
            "SIFTS", all_data_from_catreader[2], default_input_path_to_SIFTS)
        return left_to_download_PDB

    if format_of_db == "PDB_assembly":
        all_data_from_catreader = latestcatreader.latest_catalog_reader()
        lefttodownload_PDB_assemblies = _download_missing_PDB_assemblies(
            default_input_path_to_PDB_assembly)
        if lefttodownload_PDB_assemblies is None:
            return None
        _download_missing_catalog_files(
            "SIFTS", all_data_from_catreader[2], default_input_path_to_SIFTS)
        return lefttodownload_PDB_assemblies

    if format_of_db == "all":
        all_data_from_catreader = latestcatreader.latest_catalog_reader()
        left_to_download_mmCIF = _download_missing_catalog_files(
            "mmCIF", all_data_from_catreader[0], default_input_path_to_mmCIF)
        left_to_download_PDB = _download_missing_catalog_files(
            "PDB", all_data_from_catreader[1], default_input_path_to_PDB)
        _download_missing_catalog_files(
            "SIFTS", all_data_from_catreader[2], default_input_path_to_SIFTS)

        # PDB_assembly
        lefttodownload_PDB_assemblies = _download_missing_PDB_assemblies(
            default_input_path_to_PDB_assembly)
        if lefttodownload_PDB_assemblies is None:
            # As before, an unavailable listing ends the run before the
            # mmCIF assemblies are attempted.
            return None

        # mmCIF_assembly
        lefttodownload_mmCIF_assemblies = _download_missing_mmCIF_assemblies(
            all_data_from_catreader[0], default_input_path_to_mmCIF_assembly)

        return [left_to_download_mmCIF, left_to_download_PDB,
                lefttodownload_mmCIF_assemblies, lefttodownload_PDB_assemblies]

    return None