예제 #1
0
def check_a3m_format(db_basename, force_mode):
  """Validate every A3M entry of a database and optionally drop broken ones.

  Each entry listed in ``<db_basename>_a3m.ffindex`` is read from
  ``<db_basename>_a3m.ffdata`` and parsed with ``a3m.A3M_Container``.
  Entries whose parsing raises are reported on stderr.

  Args:
    db_basename: Path prefix shared by the database files.
    force_mode: If true, the corrupted entry names are written to a
      temporary file and removed from the a3m, cs219 and hhm indices; each
      of those databases is then sorted and optimized. If false, only a
      hint about the ``--force`` option is written to stderr.

  Returns:
    None.
  """
  entries = ffindex.read_index(db_basename+"_a3m.ffindex")
  data = ffindex.read_data(db_basename+"_a3m.ffdata")

  corrupted_alignments = set()
  for entry in entries:
    lines = ffindex.read_lines(entry, data)
    alignment = a3m.A3M_Container()
    try:
      alignment.read_a3m_from_lines(lines)
    except Exception:
      # Only ordinary errors count as corruption. A bare ``except`` would
      # also swallow KeyboardInterrupt/SystemExit and, with force_mode,
      # cause a perfectly valid entry to be deleted.
      corrupted_alignments.add(entry.name)
      sys.stderr.write("Warning: A3M "+entry.name+" is corrupted!\n")

  if not corrupted_alignments:
    return

  if not force_mode:
    sys.stderr.write("You may try to use the option --force to fix the database!\n")
    return

  tmp_dir = tempfile.mkdtemp()
  try:
    sys.stderr.write("WARNING: remove corrupted a3m's!\n")

    corrupted_index_file = os.path.join(tmp_dir, "corrupted.dat")
    write_set_to_file(corrupted_alignments, corrupted_index_file)

    for suffix in ("a3m", "cs219", "hhm"):
      index_file = db_basename+"_"+suffix+".ffindex"
      data_file = db_basename+"_"+suffix+".ffdata"
      remove_files_from_index(corrupted_index_file, index_file)
      sort_database(data_file, index_file)
      optimize_database(data_file, index_file)
  finally:
    # The temporary directory is removed even if one of the steps fails.
    shutil.rmtree(tmp_dir)
def extract_data(data, index, ev, cov_min, sim, max_len):
    """Collect the boundaries of accepted hits, grouped by query id.

    Every entry of ``index`` is read from ``data`` with ``read_lines`` and
    parsed with ``parse_result``. A hit is accepted when all of the
    following hold:

    * aligned columns / query length is greater than ``cov_min``
    * its e-value is lower than ``ev``
    * its similarity is greater than ``sim``
    * (template span / query length) - 1 is lower than ``max_len``

    Returns:
        A ``defaultdict(dict)`` mapping query id -> {cluster: (start, end)},
        where cluster is the second ``|``-separated field of the template
        id and (start, end) are ``ali.start[1]`` and ``ali.end[1]``.
    """
    accepted_hits = defaultdict(dict)

    for position, entry in enumerate(index):
        # Progress report on every 100th entry (including the first one).
        if not position % 100:
            print('Processing {num}/{total}'.format(num=position, total=len(index)))

        for hit in parse_result(read_lines(entry, data)):
            coverage = hit.aligned_cols / float(hit.query_length)

            # Relative size of the matched template span compared to the
            # query; used to reject templates much larger than the query.
            template_span = hit.end[1] - hit.start[1] + 1
            size_excess = (template_span / float(hit.query_length)) - 1

            passes = (
                coverage > cov_min
                and hit.evalue < ev
                and hit.similarity > sim
                and size_excess < max_len
            )
            if not passes:
                continue

            # The result is not used below; the call is kept so that any
            # error it raises still surfaces here.
            extract_uniprot_id(hit.template_info)

            query_key = hit.query_id
            cluster_key = hit.template_id.split("|")[1]
            span = (hit.start[1], hit.end[1])

            accepted_hits[query_key][cluster_key] = span

    return accepted_hits
def extract_data(data, index, ev, cov_min, sim, max_len):
    """Gather per-query cluster boundaries from parsed search results.

    For each entry of ``index`` the raw lines are fetched from ``data`` with
    ``read_lines`` and turned into hits by ``parse_result``. Only hits that
    satisfy every threshold are kept:

    * coverage (aligned columns / query length) above ``cov_min``
    * e-value below ``ev``
    * similarity above ``sim``
    * relative size ((template span / query length) - 1) below ``max_len``

    Returns:
        A ``defaultdict(dict)``: query id -> {cluster: (start, end)}. The
        cluster is the second ``|``-separated field of the template id and
        the boundaries are ``ali.start[1]`` / ``ali.end[1]``.
    """
    by_query = defaultdict(dict)

    for counter, record in enumerate(index):
        # Print progress for entries 0, 100, 200, ...
        if counter % 100 == 0:
            print('Processing {num}/{total}'.format(num=counter, total=len(index)))

        record_lines = read_lines(record, data)

        for hit in parse_result(record_lines):
            coverage = hit.aligned_cols / float(hit.query_length)

            # How much larger the matched template span is than the query.
            span_length = hit.end[1] - hit.start[1] + 1
            relative_size = (span_length / float(hit.query_length)) - 1

            if not (coverage > cov_min and hit.evalue < ev):
                continue
            if not (hit.similarity > sim and relative_size < max_len):
                continue

            # Return value is unused; the call itself is retained so that
            # failures inside it are not silently skipped.
            extract_uniprot_id(hit.template_info)

            query_name = hit.query_id
            cluster_name = hit.template_id.split("|")[1]
            limits = (hit.start[1], hit.end[1])

            by_query[query_name][cluster_name] = limits

    return by_query
예제 #4
0
def get_large_a3ms(a3m_base_path, threshold=50):
  """Return the names of A3M entries with more than ``threshold`` sequences.

  Entries are listed in ``<a3m_base_path>_a3m.ffindex`` and their content is
  read from ``<a3m_base_path>_a3m.ffdata``. Entries that cannot be parsed
  are reported on stderr and left out of the result.

  Args:
    a3m_base_path: Path prefix shared by the A3M database files.
    threshold: An alignment is considered large when its number of
      sequences is strictly greater than this value (default 50).

  Returns:
    A set with the names of the large alignments.
  """
  entries = ffindex.read_index(a3m_base_path+"_a3m.ffindex")
  # The payload lives in the .ffdata file; the index only holds the entry
  # names with their offsets and lengths.
  data = ffindex.read_data(a3m_base_path+"_a3m.ffdata")

  large_alignments = set()
  for entry in entries:
    lines = ffindex.read_lines(entry, data)
    alignment = a3m.A3M_Container()
    try:
      alignment.read_a3m_from_lines(lines)

      if alignment.get_number_sequences() > threshold:
        large_alignments.add(entry.name)
    except Exception:
      # Do not use a bare ``except`` here: KeyboardInterrupt and SystemExit
      # must still be able to stop the scan.
      sys.stderr.write("Warning: A3M "+entry.name+" is corrupted!\n")

  return large_alignments
def get_large_a3ms(a3m_base_path, threshold=50):
    """Return the names of A3M entries with more than ``threshold`` sequences.

    Entries are listed in ``<a3m_base_path>_a3m.ffindex`` and their content
    is read from ``<a3m_base_path>_a3m.ffdata``. Entries that cannot be
    parsed are reported on stderr and left out of the result.

    Args:
        a3m_base_path: Path prefix shared by the A3M database files.
        threshold: An alignment is considered large when its number of
            sequences is strictly greater than this value (default 50).

    Returns:
        A set with the names of the large alignments.
    """
    entries = ffindex.read_index(a3m_base_path + "_a3m.ffindex")
    # The payload lives in the .ffdata file; the index only holds the entry
    # names with their offsets and lengths.
    data = ffindex.read_data(a3m_base_path + "_a3m.ffdata")

    large_alignments = set()
    for entry in entries:
        lines = ffindex.read_lines(entry, data)
        alignment = a3m.A3M_Container()
        try:
            alignment.read_a3m_from_lines(lines)

            if alignment.get_number_sequences() > threshold:
                large_alignments.add(entry.name)
        except Exception:
            # Do not use a bare ``except`` here: KeyboardInterrupt and
            # SystemExit must still be able to stop the scan.
            sys.stderr.write("Warning: A3M " + entry.name + " is corrupted!\n")

    return large_alignments
예제 #6
0
def check_a3m_format(db_basename, force_mode):
    """Validate every A3M entry of a database and optionally drop broken ones.

    Each entry listed in ``<db_basename>_a3m.ffindex`` is read from
    ``<db_basename>_a3m.ffdata`` and parsed with ``a3m.A3M_Container``.
    Entries whose parsing raises are reported on stderr.

    Args:
        db_basename: Path prefix shared by the database files.
        force_mode: If true, the corrupted entry names are written to a
            temporary file and removed from the a3m, cs219 and hhm indices;
            each of those databases is then sorted and optimized. If false,
            only a hint about the ``--force`` option is written to stderr.

    Returns:
        None.
    """
    entries = ffindex.read_index(db_basename + "_a3m.ffindex")
    data = ffindex.read_data(db_basename + "_a3m.ffdata")

    corrupted_alignments = set()
    for entry in entries:
        lines = ffindex.read_lines(entry, data)
        alignment = a3m.A3M_Container()
        try:
            alignment.read_a3m_from_lines(lines)
        except Exception:
            # Only ordinary errors count as corruption. A bare ``except``
            # would also swallow KeyboardInterrupt/SystemExit and, with
            # force_mode, cause a perfectly valid entry to be deleted.
            corrupted_alignments.add(entry.name)
            sys.stderr.write("Warning: A3M " + entry.name + " is corrupted!\n")

    if not corrupted_alignments:
        return

    if not force_mode:
        sys.stderr.write(
            "You may try to use the option --force to fix the database!\n")
        return

    tmp_dir = tempfile.mkdtemp()
    try:
        sys.stderr.write("WARNING: remove corrupted a3m's!\n")

        corrupted_index_file = os.path.join(tmp_dir, "corrupted.dat")
        write_set_to_file(corrupted_alignments, corrupted_index_file)

        for suffix in ("a3m", "cs219", "hhm"):
            index_file = db_basename + "_" + suffix + ".ffindex"
            data_file = db_basename + "_" + suffix + ".ffdata"
            remove_files_from_index(corrupted_index_file, index_file)
            sort_database(data_file, index_file)
            optimize_database(data_file, index_file)
    finally:
        # The temporary directory is removed even if one of the steps fails.
        shutil.rmtree(tmp_dir)