def extract_monocluster_ids(self, clusters_dict, white_list_ids=None, out_file=None):
    """
    Collect the names of clusters that have no more than one entry in every species.

    A cluster name is accepted only when, for each species key of
    ``clusters_dict``, the name is present in that species' mapping and the
    stored collection has a length of at most one. If ``white_list_ids`` is
    truthy, names missing from it are rejected too.

    :param clusters_dict: mapping of species -> mapping of cluster name -> sized
        collection of members.
    :param white_list_ids: optional container of permitted cluster names; a falsy
        value (None, empty container) switches the filter off.
    :param out_file: optional destination; when truthy it is handed to the
        ``write`` method of the resulting IdSet.
    :return: IdSet holding the accepted cluster names.
    """
    single_copy_ids = IdSet()

    for name in self.get_cluster_names(clusters_dict):
        # any() stops at the first species that disqualifies the cluster,
        # mirroring an early exit from a per-species loop.
        rejected = any(
            (white_list_ids and name not in white_list_ids)
            or name not in clusters_dict[species]
            or len(clusters_dict[species][name]) > 1
            for species in clusters_dict
        )
        if not rejected:
            single_copy_ids.add(name)

    if out_file:
        single_copy_ids.write(out_file)

    return single_copy_ids
def convert_rm_out_to_gff(input_file, output_file, annotated_repeat_classes_file, annotated_repeat_families_file):
    """
    Convert a whitespace-delimited repeat annotation table into GFF-style records.

    The first three lines of ``input_file`` are discarded as a header. Every
    following non-blank line is split on whitespace and written to
    ``output_file`` as one tab-separated record with source ``RepeatMasker``
    and feature type ``repeat``. The class and class/family labels seen along
    the way are collected and written to the two annotation files.

    Blank or whitespace-only lines are skipped; a line with too few fields
    still raises IndexError.

    :param input_file: path to the table to convert (presumably a RepeatMasker
        ``.out`` file - verify against caller).
    :param output_file: path of the GFF-style file to create or overwrite.
    :param annotated_repeat_classes_file: destination passed to ``write`` of the
        IdSet holding every distinct repeat class.
    :param annotated_repeat_families_file: destination passed to ``write`` of the
        IdSet holding every distinct "class/family" label.
    :return: None
    """
    header_line_count = 3

    repeat_classes_set = IdSet()
    repeat_families_set = IdSet()

    with open(input_file, "r") as in_fd, open(output_file, "w") as out_fd:
        # Discard the header so that only record lines reach the parser.
        for _ in range(header_line_count):
            in_fd.readline()

        for line in in_fd:
            fields = line.split()
            if not fields:
                # A blank line has no columns; indexing it would raise IndexError.
                continue

            # Anything other than "+" in the strand column is reported as "-".
            strand = "+" if fields[8] == "+" else "-"

            # Column 10 is "class/family"; a bare class gets "." as its family.
            repeat_class_family = fields[10].split("/")
            if len(repeat_class_family) == 1:
                repeat_class_family.append(".")

            repeat_classes_set.add(repeat_class_family[0])
            repeat_families_set.add("/".join(repeat_class_family))

            # Attribute keys are emitted verbatim (including "Pers_ins") because
            # downstream consumers may parse them by name.
            parameters = "Class=%s;Family=%s;Matching_repeat=%s;SW_score=%s;Perc_div=%s;Perc_del=%s;Pers_ins=%s" \
                         % (repeat_class_family[0], repeat_class_family[1], fields[9],
                            fields[0], fields[1], fields[2], fields[3])

            out_fd.write("%s\tRepeatMasker\trepeat\t%s\t%s\t.\t%s\t.\t%s\n"
                         % (fields[4], fields[5], fields[6], strand, parameters))

    repeat_classes_set.write(annotated_repeat_classes_file)
    repeat_families_set.write(annotated_repeat_families_file)