def fetch_xray(xray_fp, force_download=False):
    """Fetch list of pdb entries, process, and write results to a yaml file.

    Downloads the PDB entry-type summary: a list of all PDB entries,
    identification of each as a protein, nucleic acid, or protein-nucleic
    acid complex and whether the structure was determined by diffraction
    or NMR.

    Reference:
        http://www.rcsb.org/pdb/static.do?p=general_information/about_pdb/summaries.html

    Args:
        xray_fp (Unicode): The destination yaml file to be written.
        force_download (bool): If true, download the file even if the path
            already exists locally.

    Returns:
        None
    """
    # Manually unit tested.
    if isfile(xray_fp) and not force_download:
        print(
            "Found local copy of \"{}.\" Using file:\n\t{}".format(
                basename(xray_fp), xray_fp
            )
        )
        return None
    assert os.path.isabs(xray_fp)
    remote_directory = '/pub/pdb/derived_data/'
    remote_file = 'pdb_entry_type.txt'
    domain = 'ftp.wwpdb.org'
    lines = []
    # Use the FTP object as a context manager so the connection is closed
    # even if login or the transfer fails (the original leaked it).
    with FTP(domain) as ftp:
        ftp.login()
        ftp.cwd(remote_directory)
        # retrlines invokes the callback once per text line; the bound
        # method replaces the redundant ``lambda l: lines.append(l)``.
        ftp.retrlines('RETR {}'.format(remote_file), lines.append)
    xray = []
    for line in lines:
        # Columns: pdb id, polymer type, experimental method.
        columns = line.split()
        pdb = columns[0]
        p_type = columns[1]
        x_type = columns[2].strip()
        # Keep only protein (or protein-nucleic complex) structures that
        # were solved by X-ray diffraction.
        if p_type in ('prot', 'prot-nuc') and x_type == 'diffraction':
            xray.append(pdb.upper())
    write_yaml(xray, xray_fp)
    assert isfile(xray_fp)
    return None
def fetch_obsolete(
        obs_file_path,
        url='http://www.rcsb.org/pdb/rest/getObsolete',
        force_download=False
):
    """Fetch list of obsolete entries.

    Fetch list of obsolete entries, process, and write results to a yaml
    file.

    Args:
        obs_file_path (Unicode): The destination yaml file to be written.
        url (Unicode): The url address of the data.
        force_download (bool): If true, download the file even if the path
            already exists locally.

    Returns:
        None
    """
    if isfile(obs_file_path) and not force_download:
        print(
            "Found local copy of \"{}.\" Using file:\n\t{}".format(
                basename(obs_file_path), obs_file_path
            )
        )
    else:
        obs_req = requests.get(url)
        # Fail loudly on HTTP errors instead of handing an error page to
        # the XML parser, which would die with a confusing parse error.
        obs_req.raise_for_status()
        root = ETree.fromstring(obs_req.text)
        obs_req.close()
        # Each child element names one obsolete PDB entry via its
        # 'structureId' attribute; normalize ids to upper case.
        obs = [child.attrib['structureId'].upper() for child in root]
        write_yaml(obs, obs_file_path)
    return None
def uniprot_composite(dirs):
    """Creates final UniProt DataFrame.

    Create final UniProt DataFrame where the UniProt ID provides a unique
    key, then write it out in tsv, json, and yaml form.

    Args:
        dirs (ProjectFolders): A named tuple of directory paths.

    Returns:
        None

    Raises:
        FileNotFoundError: If the initial PDB composite tsv is missing.
    """
    pdb_initial_composite_fp = os.path.join(
        dirs.tsv_data, "pdb_initial_composite_df.tsv"
    )
    # The PDB composite step must already have run. Raise explicitly
    # rather than assert: asserts are stripped under ``python -O``.
    if not os.path.isfile(pdb_initial_composite_fp):
        raise FileNotFoundError(pdb_initial_composite_fp)
    uni_folder_path = dirs.uni_data
    file_names = _create_composite_file_names()
    paths = _create_composite_file_paths(uni_folder_path, file_names)
    uni_composite_tsv = paths["tsv_file"]
    uni_composite_yaml = paths["yaml_file"]
    uni_composite_json = paths["json_file"]
    if _uni_composite_file_exists(uni_folder_path):
        # Message fix: the "(Note: ..." parenthetical was never closed in
        # the original string.
        print(
            "A final uni_composite file already exists. Composite "
            "function complete. (Note: remove existing uni_composite "
            'files in the "{}" directory to have them '
            "regenerated.)".format(uni_folder_path)
        )
        return None
    pdb_df = pd.read_csv(
        pdb_initial_composite_fp,
        sep="\t",
        header=0,
        encoding="utf-8",
        keep_default_na=False,
        na_values=["NULL", "N/A"],
    )
    print("Creating the UniProt composite structure.")
    uni_df = create_uni_struct(pdb_df)
    print("Done creating UniProt composite structure.")
    print("Validating UniProt composite structure.")
    uni_pdb_validation(uni_df, pdb_df)
    print("Validation complete.")
    print("Assigning missing region designations.")
    uni_df = create_intervals(pdb_df, uni_df)
    print("Done assigning missing regions.")
    assert isinstance(uni_df, pd.DataFrame)
    delimiter = create_delimiter("\t")
    # Write the same frame in all three serializations; yaml is derived
    # from the json output so the two always agree.
    uni_df.to_csv(uni_composite_tsv, sep=delimiter, encoding="utf-8")
    uni_df.to_json(uni_composite_json, force_ascii=False)
    json_data = read_json(uni_composite_json)
    write_yaml(json_data, uni_composite_yaml)
    print("Done writing UniProt composite files:")
    print("\t{}".format(uni_composite_tsv))
    print("\t{}".format(uni_composite_yaml))
    print("\t{}".format(uni_composite_json))
    print("This is the final UniProt ID DataFrame.")
    return None