def test_dp():
    """
    Run on two small tableaux to test.

    Calls get_tableaux() on two hard-coded structure files (secondary
    structure program 'dssp', domain program 'none', 3_10 and pi helices
    included, use_numeric=True), takes the first entry of element 1 of
    each result, scores the pair with tabmatch_dp() and prints the score
    to stdout.

    The file paths are absolute and site-specific, so this only runs on
    a machine where those files exist.
    """
    ta = get_tableaux('/local/charikar/astivala/pdb/d1ubia_.ent',
                      'dssp', 'none', True, True,
                      use_numeric=True)[1][0]
    tb = get_tableaux('/local/charikar/ASTRAL/pdbstyle-1.73/xd/d1xd3b_.ent',
                      'dssp', 'none', True, True,
                      use_numeric=True)[1][0]
    score = tabmatch_dp(ta, tb, use_numeric=True,
                        disallow_type_mismatch=False)
    # Single-argument print(...) is valid as both the Python 2 statement
    # and the Python 3 function.  The two spaces after '=' reproduce the
    # separator the old "print 'score = ',score" statement inserted.
    print('score =  %s' % (score,))
def build_db(
    input_root,
    secstruct_program="dssp",
    domain_program="none",
    include_310_helices=True,
    include_pi_helices=True,
    min_sse_len=None,
    use_numeric=False,
    use_hk=False,
    build_dist_matrices=False,
):
    """
    Build the tableaux db in memory.

    Walks the directory tree under input_root and calls get_tableaux()
    on every file whose extension is .ent, .pdb or .gz, storing the
    returned list under the identifier get_tableaux() reports.  If an
    identifier is seen more than once, the first entry is kept and each
    later one is reported on stderr and counted as a duplicate key error.
    Progress lines go to stderr; a summary of the counts goes to stdout.

    Parameters:
       input_root - root of PDB or ASTRAL pdbstyle divided hierarchy
       secstruct_program - secondary structure definition program
                       ('stride' or 'dssp' or 'pdb') to use.
       domain_program - domain decomposition method ('ddomain','cath', etc.)
       include_310_helices - if True, include 3_10 helices in the graph
       include_pi_helices - if True, include pi helices in the graph
       min_sse_len - if not None, minimum SSE length
       use_numeric - if True build database of Numeric array (Omega matrix)
                     rather than PTTableauPacked
       use_hk - If True build database with HH and KK codes for strands in
                same sheet.
       build_dist_matrices - If True build database of SSE axis midpoint
                     distance matrices rather than tableaux.

    Return value:
       dict of { pdbid : [PTTableauPacked list] } for use_numeric=False
                  OR
               { pdbid : [Numeric.array list ]} for use_numeric=True
                  OR
               { pdbid : [Numeric.array list ]} for build_dist_matrices=True

               pdbid is e.g. 1QLP when built from PDB or
                     e.g. 1qlp1a when built from ASTRAL pdbstyle
    """
    # Extensions treated as structure files.  Any name ending in .gz is
    # accepted here; what is inside it is left to get_tableaux().
    pdb_extensions = (".ent", ".pdb", ".gz")

    tableau_db = {}  # pdbid -> list of tableaux / matrices (see docstring)
    key_count = 0
    tableaux_count = 0
    keyerror_count = 0
    file_count = 0

    for root, _dirs, files in os.walk(input_root):
        for pdb_filename in files:
            if os.path.splitext(pdb_filename)[1] not in pdb_extensions:
                continue

            sys.stderr.write("processing " + pdb_filename + "\n")
            file_count += 1
            (pdbid, tableaux_list, _sse_string_list) = get_tableaux(
                os.path.join(root, pdb_filename),
                secstruct_program,
                domain_program,
                include_310_helices,
                include_pi_helices,
                None,  # sse_id_list
                min_sse_len,
                use_numeric,
                use_hk,
                build_dist_matrices,
            )
            # Only plain tableaux get packed; numeric arrays and distance
            # matrices are stored as returned.
            if not (use_numeric or build_dist_matrices):
                tableaux_list = [
                    PTTableauPacked(tableau) for tableau in tableaux_list
                ]

            # "in" replaces dict.has_key(), which no longer exists in
            # Python 3; the first entry for an identifier wins.
            if pdbid in tableau_db:
                sys.stderr.write("ERROR: duplicate key " + pdbid + "\n")
                keyerror_count += 1
            else:
                tableau_db[pdbid] = tableaux_list
                key_count += 1
                tableaux_count += len(tableaux_list)

    sys.stdout.write("processed %d files\n" % file_count)
    sys.stdout.write("resulting in %d db entries\n" % key_count)
    if build_dist_matrices:
        sys.stdout.write(" %d SSE distance matrices\n" % tableaux_count)
    else:
        sys.stdout.write(" %d tableaux\n" % tableaux_count)
    sys.stdout.write("with %d duplicate key errors\n" % keyerror_count)
    return tableau_db