def make_local_template(best_oligo_template):
    '''
    Writes a cleaned copy of the selected template structure to the
    working directory and returns its path.

    The source file depends on g_args.allow_monomers:
     * if set, the entry 'pdb<code>.ent.gz' is read from pdb_archive and
       rejected when pctools.is_nmr() reports it as an NMR structure;
     * otherwise '<code>.pdb.gz' is read from pdb_homo_archive.
    In both cases the subdirectory is given by characters 1-2 of the code
    (the divided layout, keyed by the two middle characters).

    Parameters:
        best_oligo_template: pdb code of the selected template.

    Returns:
        Path of '<code>_CHOIR_CleanTemplate.pdb' inside workdir.

    Raises:
        RuntimeError: when monomers are allowed and the template is an NMR
            structure, so that the caller can move on to another candidate.
    '''
    middle_letters_best = best_oligo_template[1:3]
    if g_args.allow_monomers:
        best_template_file = os.path.join(
            pdb_archive, middle_letters_best,
            'pdb' + best_oligo_template + ".ent.gz")
        _, contents = pctools.parse_pdb_contents(best_template_file)
        if pctools.is_nmr(contents):
            print(clrs['r'] + '\n\n Selected template ' + best_oligo_template
                  + ' is an NMR structure \n Will try a a different candidate.\n\n'
                  + clrs['n'])
            # A bare `raise` was used here; with no exception being handled
            # it surfaces as RuntimeError, so the same type is raised
            # explicitly (and with a message) to keep callers working.
            raise RuntimeError('Selected template ' + best_oligo_template
                               + ' is an NMR structure')
    else:
        best_template_file = os.path.join(pdb_homo_archive,
                                          middle_letters_best,
                                          best_oligo_template + ".pdb.gz")
    clean_template_file = os.path.join(
        workdir, best_oligo_template + "_CHOIR_CleanTemplate.pdb")
    _, structure, _ = pctools.parse_any_structure(best_template_file)
    io.set_structure(structure)
    # The SelectIfCA selector decides which atoms/residues are written;
    # presumably it keeps residues that have a C-alpha atom - confirm in
    # pctools.
    io.save(clean_template_file, pctools.SelectIfCA())
    return clean_template_file
def _record_large_pdb_sequences(structure_file, subfolder,
                                chain_correspondences, verbosity):
    '''
    Records only the sequences of an entry that will not be curated.

    Parses structure_file, splits its states (storing the resulting chain
    correspondences under the parsed pdb code) and passes the extracted
    sequences to record_fasta() with type 'largepdb'.

    Called by: curate_homoDB()
    '''
    pdb_code, structure, _ = pctools.parse_pdb_structure(structure_file)
    structure, chain_correspondences[pdb_code] = pctools.split_states(
        structure)
    _, seqs, chain_ids = pctools.extract_seqs(structure, 0)
    pctools.printv(
        clrs['y'] + "Recording large-pdb sequence" + clrs['n'], verbosity)
    # Write in fasta file
    record_fasta(pdb_code, seqs, chain_ids, subfolder, type='largepdb')


def _store_homo_oligomer(pdb_code, structure, nchains, seqs, chain_ids,
                         subfolder, author, software, log_file, verbosity):
    '''
    Saves an accepted structure in the homo-oligomeric archive, appends
    its summary line to log_file and records its sequences.

    Called by: curate_homoDB()
    '''
    # Write file to database
    target_dir = os.path.join(pdb_homo_archive, subfolder)
    newfile = os.path.join(target_dir, pdb_code + ".pdb")
    os.makedirs(target_dir, exist_ok=True)
    io.set_structure(structure)
    io.save(newfile)
    pctools.gzip_pdb(newfile)
    # Write to log file (the creation time is read from newfile + '.gz',
    # i.e. gzip_pdb is expected to leave the compressed file there)
    with open(log_file, 'a') as f:
        f.write(
            str(pdb_code) + "," + str(nchains) + "," + '/'.join(author)
            + "," + '/'.join(software) + ","
            + str(os.path.getctime(newfile + '.gz')) + '\n')
    # Write in fasta file
    pctools.printv(
        clrs['y'] + "Recording h**o-oligomer sequence." + clrs['n'],
        verbosity)
    # NOTE(review): the type literal is kept verbatim from the original
    # code; confirm it matches the key record_fasta() expects.
    record_fasta(pdb_code, seqs, chain_ids, subfolder, type='h**o')


def curate_homoDB(verbosity):
    '''
    Creates the homo-oligomeric database from a local pdb repository.
    The divided scheme adopted by RCSB, in which the subdirectories are
    the two middle characters in the PDB code, is assumed.

    Files written in <pdb_homo_archive>/stats, prefixed by a timestamp:
     * <timestamp>-choirdb.dat lists every assessed file with the time of
       its assessment (entries of the previous assessment that are not
       being re-processed are carried over).
     * <timestamp>-choirdb.log contains summarized information about each
       entry stored in the database (code, chains, author, software, date).
     * <timestamp>-choirdb.err lists the files whose assessment failed, or
       a success message if none did.
    The chain correspondences produced by pctools.split_states() are kept
    in stats/chain_correspondences.pickle, and sequences are recorded
    through record_fasta().

    Files above 2 Mb and structures with more than 60 chains are not
    curated; only their sequences are recorded (type 'largepdb').

    Parameters:
        verbosity: verbosity level forwarded to pctools.printv() and
            list_new_files().

    Called by: update_databases()
    '''
    # Create stats folder if does not exist
    stats_dir = os.path.join(pdb_homo_archive, 'stats')
    os.makedirs(stats_dir, exist_ok=True)

    # Compare latest assession with new files
    assession_log = read_latest_assession(stats_dir)
    new_files = list_new_files(pdb1_archive, assession_log, verbosity)
    print(clrs['g'] + str(len(new_files)) + clrs['n']
          + ' new structure files were found and will be processed')
    now = str(time.strftime("%d-%m-%Y@%H.%M.%S"))
    dat_file = os.path.join(stats_dir, now + '-choirdb.dat')
    log_file = os.path.join(stats_dir, now + '-choirdb.log')
    err_file = os.path.join(stats_dir, now + '-choirdb.err')

    # Write files not to be updated to new dat file (append mode also
    # creates the file when it does not exist yet)
    files_to_process = set(new_files)
    with open(dat_file, 'a') as f:
        for entry in assession_log:
            if entry not in files_to_process:
                f.write(entry + " " + assession_log[entry] + "\n")

    # Create log file
    if not os.path.isfile(log_file):
        with open(log_file, 'w+') as f:
            f.write('Code, Chains, Author, Software, Date\n')

    # Read Chain correspondences
    chain_correspondences_file = os.path.join(
        stats_dir, 'chain_correspondences.pickle')
    if os.path.isfile(chain_correspondences_file):
        # NOTE(review): pickle must only be loaded from trusted files; this
        # one is expected to have been written by a previous run.
        with open(chain_correspondences_file, 'rb') as p:
            chain_correspondences = pickle.load(p)
    else:
        chain_correspondences = {}

    # Main loop that will populate the ProtCHOIR database
    for pdb in pg(new_files, widgets=widgets):
        directory, filename = os.path.split(pdb)
        subfolder = os.path.basename(directory)

        # Record assessment in dat file
        with open(dat_file, 'a') as f:
            f.write(filename + " " + str(time.time()) + '\n')

        # Start assession
        pctools.printv('\nAssessing ' + pdb + '...', verbosity)

        # Reject files larger than 2 Mb
        file_size = os.stat(pdb).st_size / 1048576
        pctools.printv(
            'File size: ' + clrs['c'] + '{0:.1g}'.format(file_size) + ' Mb'
            + clrs['n'], verbosity)
        if file_size > 2:
            pctools.printv(clrs['r'] + "File size too large!" + clrs['n'],
                           verbosity)
            pctools.printv(
                clrs['y']
                + "Will try to fetch sequences from asymmetric unit."
                + clrs['n'], verbosity)
            try:
                alternative_pdb = os.path.join(
                    pdb_archive, subfolder,
                    'pdb' + filename.split('.')[0] + '.ent.gz')
                _record_large_pdb_sequences(
                    alternative_pdb, subfolder, chain_correspondences,
                    verbosity)
            except Exception:
                # Best effort only; KeyboardInterrupt is deliberately not
                # caught here so the run can still be aborted.
                pctools.printv(
                    clrs['r'] + "Failed to fetch sequence!" + clrs['n'],
                    verbosity)
            continue

        try:
            pdb_code, structure, nchains = pctools.parse_pdb_structure(pdb)
            pctools.printv(
                'Number of chains in structure ' + clrs['y'] + pdb_code
                + clrs['n'] + ': ' + str(nchains), verbosity)

            # Reject structures with more than 60 chains
            if int(nchains) > 60:
                pctools.printv(
                    "Number of chains (" + clrs['y'] + str(nchains)
                    + clrs['n'] + ") larger than 60! " + clrs['r']
                    + "Too many chains!" + clrs['n'], verbosity)
                pctools.printv(
                    clrs['y'] + "Will try to fetch sequences anyway."
                    + clrs['n'], verbosity)
                try:
                    _record_large_pdb_sequences(
                        pdb, subfolder, chain_correspondences, verbosity)
                except Exception:
                    pctools.printv(
                        clrs['r'] + "Failed to fetch sequence!" + clrs['n'],
                        verbosity)
                continue

            structure, chain_correspondences[pdb_code] = pctools.split_states(
                structure)
            nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(
                structure, 0)
            pctools.printv(
                'Number of chains (' + clrs['c'] + str(nchains) + clrs['n']
                + ') and file size (' + clrs['c'] + str(file_size)
                + clrs['n'] + ') OK.' + clrs['g'] + ' Proceeding.'
                + clrs['n'] + '\n', verbosity)

            # Try to get info from the canonic pdb header (homonimous to pdb1)
            canonpdb = "pdb" + pdb_code + ".ent.gz"
            try:
                contents = pctools.parse_pdb_contents(
                    os.path.join(pdb_archive, subfolder, canonpdb))[1]
            except Exception:
                pctools.printv(
                    clrs['r']
                    + '\n\n Mismatch between pdb and biounit entries...'
                    + clrs['n'], verbosity)
                # Without the header there is nothing valid to annotate.
                # Carrying on would either hit a NameError or silently reuse
                # the header of a previously processed entry, so the entry
                # is sent to the error handler below instead.
                raise
            author, software = pctools.get_annotated_states(contents)
            pctools.printv(
                'Author determined biological unit = ' + str(author),
                verbosity)
            pctools.printv(
                'Software determined quaternary structure= ' + str(software),
                verbosity)

            # Start assessing sequences and structures (from 2 up to 60
            # chains)
            if 1 < int(nchains) < 61:
                # Each item of ids is indexed as (identity, chain, chain)
                ids, proteinpair = pctools.get_pairwise_ids(seqs, nchains)
                for pair in ids:
                    color = clrs['g'] if pair[0] >= 90 else clrs['r']
                    pctools.printv(
                        'Identity between chains ' + clrs['y'] + str(pair[1])
                        + clrs['n'] + ' and ' + clrs['y'] + str(pair[2])
                        + clrs['n'] + ' is ' + color + str(pair[0]) + "%"
                        + clrs['n'] + ".", verbosity)

                # Save records for pure homo-oligomers
                if all(pair[0] > 90 for pair in ids) and proteinpair is True:
                    pctools.printv(
                        "All identities over 90%. Likely " + clrs['b']
                        + "h**o-oligomeric" + clrs['n'] + ".", verbosity)
                    pctools.printv(
                        clrs['y'] + "FETCHING" + clrs['n'] + ".\n", verbosity)
                    _store_homo_oligomer(
                        pdb_code, structure, nchains, seqs, chain_ids,
                        subfolder, author, software, log_file, verbosity)

                # Investigate partial homo-oligomers
                elif any(pair[0] > 90 for pair in ids) and proteinpair is True:
                    at_least_one_interface = False
                    for pair in ids:
                        # Check if similar chains share interfaces
                        if pair[0] > 90 and pctools.check_interfaces(
                                structure, pair[1], pair[2]):
                            at_least_one_interface = True
                            pctools.printv(
                                'Contacts found between chains ' + clrs['g']
                                + str(pair[1]) + clrs['n'] + ' and '
                                + clrs['g'] + str(pair[2]) + clrs['n']
                                + ' sharing ' + clrs['g'] + str(pair[0])
                                + clrs['n'] + " % identity.", verbosity)
                            pctools.printv(
                                "At least one putative " + clrs['b']
                                + "h**o-oligomeric " + clrs['n']
                                + "interface found.", verbosity)
                            pctools.printv(
                                clrs['y'] + "FETCHING" + clrs['n'] + ".\n",
                                verbosity)
                            _store_homo_oligomer(
                                pdb_code, structure, nchains, seqs,
                                chain_ids, subfolder, author, software,
                                log_file, verbosity)
                            # One interface is enough to keep the entry
                            break
                    if not at_least_one_interface:
                        pctools.printv(
                            "No h**o-oligomeric interface found. Likely "
                            + clrs['r'] + "hetero-oligomeric" + clrs['n']
                            + ".", verbosity)
                        pctools.printv(
                            clrs['y'] + "Recording hetero-oligomer sequence"
                            + clrs['n'], verbosity)
                        # Write in fasta file
                        record_fasta(pdb_code, seqs, chain_ids, subfolder,
                                     type='hetero')

                elif proteinpair is False:
                    pctools.printv(
                        clrs['r'] + "No proteic chain pairs found"
                        + clrs['n'] + ".", verbosity)
                    # A sequence made only of 'X' is not counted as protein
                    if any(set(seq[1]) != {'X'} for seq in seqs):
                        pctools.printv(
                            clrs['y'] + "Protein sequences found though"
                            + clrs['n'], verbosity)
                        pctools.printv(
                            clrs['y'] + "Recording hetero-oligomer sequence"
                            + clrs['n'], verbosity)
                        # Write in fasta file
                        record_fasta(pdb_code, seqs, chain_ids, subfolder,
                                     type='hetero')
                    else:
                        pctools.printv(
                            clrs['r']
                            + "Not even a single protein chain. Disregarding."
                            + clrs['n'], verbosity)

                else:
                    pctools.printv(
                        "No similar chains found. Likely " + clrs['r']
                        + "hetero-oligomeric" + clrs['n'] + ".", verbosity)
                    pctools.printv(
                        clrs['y'] + "Recording hetero-oligomer sequence"
                        + clrs['n'], verbosity)
                    record_fasta(pdb_code, seqs, chain_ids, subfolder,
                                 type='hetero')

            elif int(nchains) == 1:
                pctools.printv(
                    "Only one chain found. Likely " + clrs['r'] + "monomeric"
                    + clrs['n'] + ".", verbosity)
                pctools.printv(
                    clrs['y'] + "Recording monomer sequence." + clrs['n'],
                    verbosity)
                # NOTE(review): the structure was already split above; this
                # second split is kept as in the original code - confirm it
                # is intended, since it overwrites the stored
                # correspondences for this entry.
                structure, chain_correspondences[
                    pdb_code] = pctools.split_states(structure)
                nchains, seqs, chain_ids = pctools.extract_seqs(structure, 0)
                record_fasta(pdb_code, seqs, chain_ids, subfolder,
                             type='mono')

        except BaseException:
            # Deliberately catches everything: any failure is reported and
            # the file name is recorded so the remaining entries can still
            # be processed. Only KeyboardInterrupt aborts the run.
            errtype, errvalue, errtraceback = sys.exc_info()
            errtypeshort = str(errtype).split('\'')[1]
            pctools.printv(
                clrs['r'] + '*' + str(errtypeshort) + ': ' + str(errvalue)
                + ' l.' + str(errtraceback.tb_lineno) + '*' + clrs['n'],
                verbosity)
            traceback.print_exception(*sys.exc_info())
            if errtypeshort == 'KeyboardInterrupt':
                sys.exit()
            # Append mode creates the error file on first use
            with open(err_file, 'a') as f:
                f.write(filename + '\n')
            continue

    with open(chain_correspondences_file, 'wb') as p:
        pickle.dump(chain_correspondences, p,
                    protocol=pickle.HIGHEST_PROTOCOL)

    # The error file only exists at this point if some entry failed
    if not os.path.isfile(err_file):
        with open(err_file, 'w+') as f:
            f.write('\nNo errors. Assessment terminated succesfully.\n')