import os
import shutil
import sys
import tempfile
from collections import defaultdict

# ffindex and a3m are the hh-suite helper modules these functions rely on;
# read_lines, parse_result, extract_uniprot_id and the other bare helpers are
# assumed to be defined or imported elsewhere in the surrounding scripts.
import ffindex
import a3m


def check_a3m_format(db_basename, force_mode):
    """Check every A3M in the database for format errors; in force mode, purge corrupted entries."""
    entries = ffindex.read_index(db_basename + "_a3m.ffindex")
    data = ffindex.read_data(db_basename + "_a3m.ffdata")

    corrupted_alignments = set()
    for entry in entries:
        lines = ffindex.read_lines(entry, data)
        alignment = a3m.A3M_Container()
        try:
            alignment.read_a3m_from_lines(lines)
        except Exception:
            corrupted_alignments.add(entry.name)
            sys.stderr.write("Warning: A3M " + entry.name + " is corrupted!\n")

    if len(corrupted_alignments) == 0:
        return

    if force_mode:
        tmp_dir = tempfile.mkdtemp()
        try:
            sys.stderr.write("WARNING: removing corrupted a3m's!\n")
            corrupted_index_file = os.path.join(tmp_dir, "corrupted.dat")
            write_set_to_file(corrupted_alignments, corrupted_index_file)

            # Drop the corrupted entries from the a3m, cs219 and hhm databases,
            # then re-sort and optimize each of them.
            for suffix in ["a3m", "cs219", "hhm"]:
                remove_files_from_index(corrupted_index_file,
                                        db_basename + "_" + suffix + ".ffindex")
                sort_database(db_basename + "_" + suffix + ".ffdata",
                              db_basename + "_" + suffix + ".ffindex")
                optimize_database(db_basename + "_" + suffix + ".ffdata",
                                  db_basename + "_" + suffix + ".ffindex")
        finally:
            shutil.rmtree(tmp_dir)
    else:
        sys.stderr.write("You may try to use the option --force to fix the database!\n")
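
# Usage sketch (not part of the original script; the database base name below is a
# placeholder): check_a3m_format expects the ffindex/ffdata pair "<base>_a3m.ff*"
# to exist next to the matching "_cs219" and "_hhm" files it may rewrite in force mode.
#
#     check_a3m_format("path/to/mydb", force_mode=False)  # only report corrupted A3Ms
#     check_a3m_format("path/to/mydb", force_mode=True)   # also purge them from the db
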
def extract_data(data, index, ev, cov_min, sim, max_len):
    """Collect, for each SCOP query, the template boundaries of all hits that pass
    the E-value, coverage, similarity and relative-size filters."""
    domains = defaultdict(dict)
    for num, idx in enumerate(index):
        if num % 100 == 0:
            print('Processing {num}/{total}'.format(num=num, total=len(index)))

        lines = read_lines(idx, data)
        hhr_data = parse_result(lines)

        for ali in hhr_data:
            # query coverage of the alignment
            cov = ali.aligned_cols / float(ali.query_length)
            # relative size excess: how much larger the template is than the query SCOP domain
            temp_len = ali.end[1] - ali.start[1] + 1
            rel_size = (temp_len / float(ali.query_length)) - 1

            if (cov > cov_min) and (ali.evalue < ev) and \
                    (ali.similarity > sim) and (rel_size < max_len):
                uniprot_id = extract_uniprot_id(ali.template_info)
                # Debug output, kept for reference:
                # print('SCOP: {scop_domain} UNIPROT: {up} Qneff: {qneff} Tneff: {tneff} '
                #       'P: {prob} EV: {eval} Sc: {score}, ACols: {acol}, Id: {ident}, '
                #       'Sim: {sim}, SumPr: {sumProb}'.format(
                #           scop_domain=idx.name, up=uniprot_id,
                #           qneff=ali.query_neff, tneff=ali.template_neff,
                #           prob=ali.probability, eval=ali.evalue, score=ali.score,
                #           acol=ali.aligned_cols, ident=ali.identity,
                #           sim=ali.similarity, sumProb=ali.sum_probs))
                # print('QL: {query_len} TL: {temp_len} Cov: {cov} Temp: {start}-{end} '
                #       'rel_size: {rel_size}'.format(
                #           query_len=ali.query_length, temp_len=temp_len, cov=cov,
                #           start=ali.start[1], end=ali.end[1], rel_size=rel_size))
                # print(idx.name, ali.aligned_cols, ali.similarity, ali.sum_probs)
                # print(ali.template_info)
                scop = ali.query_id
                cluster = ali.template_id.split("|")[1]
                boundaries = (ali.start[1], ali.end[1])
                domains[scop][cluster] = boundaries

    return domains
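
# Usage sketch (an assumption, not from the original script): extract_data expects the
# same kind of index/data handles used by read_lines above, here assumed to come from
# an ffindex database of HHR result files. Threshold values are illustrative only:
# ev is an E-value cutoff, cov_min a minimum query coverage, sim a minimum similarity,
# and max_len the maximum allowed relative size excess of the template over the query.
#
#     hhr_data = ffindex.read_data("results_hhr.ffdata")
#     hhr_index = ffindex.read_index("results_hhr.ffindex")
#     domains = extract_data(hhr_data, hhr_index,
#                            ev=1e-3, cov_min=0.8, sim=0.5, max_len=0.5)
#     # domains maps SCOP query ids to {cluster id: (template start, template end)}
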
def get_large_a3ms(a3m_base_path):
    """Return the names of all A3M entries containing more than 50 sequences."""
    entries = ffindex.read_index(a3m_base_path + "_a3m.ffindex")
    # read the ffdata file (not the ffindex) to get the alignment payloads
    data = ffindex.read_data(a3m_base_path + "_a3m.ffdata")

    large_alignments = set()
    for entry in entries:
        lines = ffindex.read_lines(entry, data)
        alignment = a3m.A3M_Container()
        try:
            alignment.read_a3m_from_lines(lines)
            if alignment.get_number_sequences() > 50:
                large_alignments.add(entry.name)
        except Exception:
            sys.stderr.write("Warning: A3M " + entry.name + " is corrupted!\n")

    return large_alignments
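
# Usage sketch (not part of the original script; the path is a placeholder): collect
# the names of alignments with more than 50 sequences, e.g. to process only deep ones.
#
#     deep = get_large_a3ms("path/to/mydb")
#     print(len(deep), "alignments with > 50 sequences")
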