def extract_monocluster_ids(self, clusters_dict, white_list_ids=None, out_file=None):
    """
    Extract ids of clusters that contain exactly one sequence in every species.

    A cluster name qualifies only if it is present in ALL species of
    ``clusters_dict`` and each species holds a single sequence for it.

    :param clusters_dict: mapping species -> {cluster_name: sequences}
    :param white_list_ids: optional container of allowed cluster names;
                           names outside it are skipped entirely
    :param out_file: optional path; if given, the resulting IdSet is
                     written there as well
    :return: IdSet of monocluster names
    """
    monocluster_ids = IdSet()
    cluster_names = self.get_cluster_names(clusters_dict)

    for cluster_name in cluster_names:
        # Hoisted out of the species loop: white-list membership does not
        # depend on which species is being examined.
        if white_list_ids and (cluster_name not in white_list_ids):
            continue
        for species in clusters_dict:
            if cluster_name not in clusters_dict[species]:
                break
            if len(clusters_dict[species][cluster_name]) > 1:
                break
        else:
            # No break fired: the cluster occurs exactly once in every species.
            monocluster_ids.add(cluster_name)

    if out_file:
        monocluster_ids.write(out_file)
    return monocluster_ids
def get_cluster_names(clusters_dict, out_file=None, white_list_ids=None):
    """
    Collect the union of cluster names across all species.

    NOTE: when ``out_file`` is given, the FULL union is written to it,
    even if ``white_list_ids`` later restricts the returned set.

    :param clusters_dict: mapping species -> {cluster_name: sequences}
    :param out_file: optional path to write the full union of names to
    :param white_list_ids: optional container; if given, the returned set
                           is intersected with it
    :return: IdSet of cluster names
    """
    all_names = IdSet()
    for per_species_clusters in clusters_dict.values():
        all_names |= IdSet(per_species_clusters.keys())
    if out_file:
        all_names.write(out_file)
    if white_list_ids:
        return all_names & IdSet(white_list_ids)
    return all_names
def intersect_ids_from_files(files_with_ids_from_group_a, files_with_ids_from_group_b, result_file=None, mode="common"):
    """
    Apply a set operation to ids read from two groups of files.

    Each group argument may be a single filename or an iterable of
    filenames; ids within a group are unioned before the operation.

    :param files_with_ids_from_group_a: filename or iterable of filenames (group A)
    :param files_with_ids_from_group_b: filename or iterable of filenames (group B)
    :param result_file: optional output path; defaults to stdout
    :param mode: one of "common" (A & B), "only_a" (A - B), "only_b" (B - A),
                 "not_common" (A ^ B), "combine" (A | B), or "count"
                 (write a tab-separated summary of all cardinalities)
    :raises ValueError: if mode is not one of the supported values
    """
    set_operations = {
        "common": lambda a, b: a & b,
        "only_a": lambda a, b: a - b,
        "only_b": lambda a, b: b - a,
        "not_common": lambda a, b: a ^ b,
        "combine": lambda a, b: a | b,
    }
    # Fail fast: the original code left `expression` unbound for an unknown
    # mode and crashed later with NameError.
    if mode != "count" and mode not in set_operations:
        raise ValueError("Unknown mode: %s" % mode)

    def _union_of_files(file_or_files):
        # Accept a single path or an iterable of paths; union all ids read.
        ids = IdSet()
        filenames = [file_or_files] if isinstance(file_or_files, str) else file_or_files
        for filename in filenames:
            id_set = IdSet()
            id_set.read(filename, comments_prefix="#")
            ids = ids | id_set
        return ids

    a = _union_of_files(files_with_ids_from_group_a)
    b = _union_of_files(files_with_ids_from_group_b)

    result_fd = open(result_file, "w") if result_file else sys.stdout
    try:
        if mode != "count":
            final_set = IdSet(set_operations[mode](a, b))
            final_set.write(result_fd)
        else:
            result_fd.write(
                "Group_A\t%i\nGroup_B\t%i\nCommon\t%i\nOnly_group_A\t%i\nOnly_group_B\t%i\nNot_common\t%i\nAll\t%i\n"
                % (len(a), len(b), len(a & b), len(a - b), len(b - a), len(a ^ b), len(a | b)))
    finally:
        # Close only the handle we opened; never close sys.stdout.
        # (Original leaked the file handle when result_file was given.)
        if result_file:
            result_fd.close()
def convert_rm_out_to_gff(input_file, output_file, annotated_repeat_classes_file, annotated_repeat_families_file):
    """
    Convert a RepeatMasker .out file to GFF and record the observed
    repeat classes and class/family pairs.

    :param input_file: RepeatMasker .out file (first three lines are header)
    :param output_file: path for the generated GFF
    :param annotated_repeat_classes_file: path to write the set of repeat classes
    :param annotated_repeat_families_file: path to write the set of class/family pairs
    """
    repeat_classes_set = IdSet()
    repeat_families_set = IdSet()
    # Single with-statement replaces the nested ones: both handles share
    # the same lifetime.
    with open(input_file, "r") as in_fd, open(output_file, "w") as out_fd:
        # Skip the three header lines of RepeatMasker output.
        for _ in range(3):
            in_fd.readline()
        for line in in_fd:
            tmp = line.strip().split()
            if not tmp:
                # Robustness fix: a blank line previously raised IndexError.
                continue
            # RepeatMasker marks the complement strand with "C".
            strand = "+" if tmp[8] == "+" else "-"
            repeat_class_family = tmp[10].split("/")
            if len(repeat_class_family) == 1:
                # No family given: pad with "." so Class/Family stay aligned.
                repeat_class_family.append(".")
            repeat_classes_set.add(repeat_class_family[0])
            repeat_families_set.add("/".join(repeat_class_family))
            # NOTE(review): "Pers_ins" looks like a typo for "Perc_ins", but it
            # is part of the emitted GFF attributes that downstream parsers may
            # rely on — left unchanged deliberately.
            parameters = "Class=%s;Family=%s;Matching_repeat=%s;SW_score=%s;Perc_div=%s;Perc_del=%s;Pers_ins=%s" \
                         % (repeat_class_family[0], repeat_class_family[1],
                            tmp[9], tmp[0], tmp[1], tmp[2], tmp[3])
            out_fd.write("%s\tRepeatMasker\trepeat\t%s\t%s\t.\t%s\t.\t%s\n"
                         % (tmp[4], tmp[5], tmp[6], strand, parameters))
    repeat_classes_set.write(annotated_repeat_classes_file)
    repeat_families_set.write(annotated_repeat_families_file)