示例#1
0
    def intersect_ids(list_of_group_a, list_of_group_b, mode="common"):
        # possible modes: common, only_a, only_b, not_common,  combine, count
        a = IdSet()
        b = IdSet()

        if mode == "common":
            expression = lambda a, b: a & b
        elif mode == "only_a":
            expression = lambda a, b: a - b
        elif mode == "only_b":
            expression = lambda a, b: b - a
        elif mode == "not_common":
            expression = lambda a, b: a ^ b
        elif mode == "combine":
            expression = lambda a, b: a | b

        for id_list in list_of_group_a:
            a = a | IdSet(id_list)

        for id_list in list_of_group_b:
            b = b | IdSet(id_list)

        if mode != "count":
            return IdSet(expression(a, b))
        else:
            return len(a), len(b), len(a
                                       & b), len(a -
                                                 b), len(b -
                                                         a), len(a
                                                                 ^ b), len(a
                                                                           | b)
示例#2
0
 def extract_transcripts_by_ids(self, input_gff, transcript_id_file,
                                output_gff):
     transcript_ids = IdSet()
     transcript_ids.read(transcript_id_file, header=False)
     GFF.write(
         self.record_with_extracted_transcripts_generator(
             input_gff, transcript_ids), open(output_gff, "w"))
示例#3
0
 def extract_monocluster_ids_from_file(self,
                                       dir_with_cluster_files,
                                       out_file,
                                       file_with_white_list_ids=None):
     # filenames are counted as species names
     white_list_ids = None
     if file_with_white_list_ids:
         white_list_ids = IdSet()
         white_list_ids.read(file_with_white_list_ids)
     clusters_dict = self.read_cluster_files_from_dir(
         dir_with_cluster_files)
     monoclusters = self.extract_monocluster_ids(
         clusters_dict, out_file=out_file, white_list_ids=white_list_ids)
     return monoclusters
示例#4
0
 def get_sequence_names(clusters_dict,
                        write_ids=False,
                        out_prefix=None,
                        white_list_ids=None):
     sequence_names_dict = SynDict()
     for species in clusters_dict:
         sequence_names_dict[species] = IdSet()
     for species in clusters_dict:
         for cluster_id in clusters_dict[species]:
             if white_list_ids:
                 if cluster_id not in white_list_ids:
                     continue
             sequence_names_dict[species] = sequence_names_dict[
                 species] | IdSet(clusters_dict[species][cluster_id])
     if write_ids:
         for species in clusters_dict:
             out_file = "%s_%s.ids" % (
                 out_prefix, species) if out_prefix else "%s.ids" % species
             sequence_names_dict[species].write(out_file)
     return sequence_names_dict
示例#5
0
    def extract_monocluster_ids(self,
                                clusters_dict,
                                white_list_ids=None,
                                out_file=None):
        """
        Extracts clusters with only one sequence in all species.
        """
        monocluster_ids = IdSet()
        cluster_names = self.get_cluster_names(clusters_dict)

        for cluster_name in cluster_names:
            for species in clusters_dict:
                if white_list_ids:
                    if cluster_name not in white_list_ids:
                        break
                if cluster_name not in clusters_dict[species]:
                    break
                if len(clusters_dict[species][cluster_name]) > 1:
                    break
            else:
                monocluster_ids.add(cluster_name)

        if out_file:
            monocluster_ids.write(out_file)

        return monocluster_ids
示例#6
0
    def get_cluster_names(clusters_dict, out_file=None, white_list_ids=None):
        cluster_names = IdSet()
        for species in clusters_dict:
            species_clusters = IdSet(clusters_dict[species].keys())
            cluster_names |= species_clusters
        if out_file:
            cluster_names.write(out_file)

        return cluster_names & IdSet(
            white_list_ids) if white_list_ids else cluster_names
示例#7
0
    def extract_sequences_by_clusters(self,
                                      dir_with_cluster_files,
                                      dir_with_sequence_files,
                                      output_dir,
                                      file_with_white_list_cluster_ids=None,
                                      mode="families",
                                      sequence_file_extension="fasta",
                                      sequence_file_format="fasta",
                                      label_species=False,
                                      separator_for_labeling="@",
                                      species_label_first=True):
        """
        basenames of cluster and sequence files must be same

        mode:
            clusters - extract sequences from clusters in separate files,
            species - extract sequences from species to separate files
        """
        white_list_ids = None
        if file_with_white_list_cluster_ids:
            white_list_ids = IdSet()
            white_list_ids.read(file_with_white_list_cluster_ids)

        clusters_dict = self.read_cluster_files_from_dir(
            dir_with_cluster_files)
        cluster_names = self.get_cluster_names(clusters_dict,
                                               white_list_ids=white_list_ids)

        sequence_super_dict = OrderedDict()
        out_dir = self.check_path(output_dir)

        for species in clusters_dict:
            idx_file = "%s_tmp.idx" % species
            sequence_file = "%s%s.%s" % (self.check_path(
                dir_with_sequence_files), species, sequence_file_extension)
            sequence_super_dict[species] = SeqIO.index_db(
                idx_file, sequence_file, format=sequence_file_format)

        if mode == "species":
            seqeuence_names = self.get_sequence_names(
                clusters_dict,
                write_ids=False,
                out_prefix=None,
                white_list_ids=white_list_ids)
            for species in seqeuence_names:
                out_file = "%s%s.%s" % (out_dir, species,
                                        sequence_file_extension)
                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    sequence_super_dict[species], seqeuence_names[species]),
                            out_file,
                            format=sequence_file_format)
        elif mode == "families":

            def per_family_record_generator(seq_super_dict, clust_dict,
                                            cluster_id):
                if species_label_first:
                    label_sequence = lambda label, name: "%s%s%s" % (
                        label, separator_for_labeling, name)
                else:
                    label_sequence = lambda label, name: "%s%s%s" % (
                        name, separator_for_labeling, label)

                for species in seq_super_dict:
                    #print species, cluster_id
                    for record_id in clust_dict[species][cluster_id]:
                        if label_species:
                            record = deepcopy(
                                seq_super_dict[species][record_id])
                            record.id = label_sequence(species, record_id)
                            yield record
                        else:
                            yield seq_super_dict[species][record_id]

            for cluster_name in cluster_names:
                out_file = "%s%s.%s" % (out_dir, cluster_name,
                                        sequence_file_extension)
                SeqIO.write(per_family_record_generator(
                    sequence_super_dict, clusters_dict, cluster_name),
                            out_file,
                            format=sequence_file_format)

        for species in clusters_dict:
            os.remove("%s_tmp.idx" % species)
示例#8
0
    def intersect_ids_from_files(files_with_ids_from_group_a,
                                 files_with_ids_from_group_b,
                                 result_file=None,
                                 mode="common"):
        a = IdSet()
        b = IdSet()

        if mode == "common":
            expression = lambda a, b: a & b
        elif mode == "only_a":
            expression = lambda a, b: a - b
        elif mode == "only_b":
            expression = lambda a, b: b - a
        elif mode == "not_common":
            expression = lambda a, b: a ^ b
        elif mode == "combine":
            expression = lambda a, b: a | b

        #print(files_with_ids_from_group_a)
        for filename in [files_with_ids_from_group_a] if isinstance(
                files_with_ids_from_group_a,
                str) else files_with_ids_from_group_a:
            id_set = IdSet()
            id_set.read(filename, comments_prefix="#")
            a = a | id_set

        for filename in [files_with_ids_from_group_b] if isinstance(
                files_with_ids_from_group_b,
                str) else files_with_ids_from_group_b:
            id_set = IdSet()
            id_set.read(filename, comments_prefix="#")
            b = b | id_set

        result_fd = open(result_file, "w") if result_file else sys.stdout
        if mode != "count":
            final_set = IdSet(expression(a, b))
            final_set.write(result_fd)
        else:
            result_fd.write(
                "Group_A\t%i\nGroup_B\t%i\nCommon\t%i\nOnly_group_A\t%i\nOnly_group_B\t%i\nNot_common\t%i\nAll\t%i\n"
                % (len(a), len(b), len(a & b), len(a - b), len(b - a),
                   len(a ^ b), len(a | b)))
示例#9
0
    gene_ids_white_list = [output_pfam_supported_genes_ids]
elif args.swissprot_db:
    gene_ids_white_list = [output_swissprot_supported_genes_ids]
else:
    gene_ids_white_list = [all_annotated_genes_ids]

HMMER3.intersect_ids_from_files([all_annotated_genes_ids],
                                gene_ids_black_list,
                                genes_not_masked_ids,
                                mode="only_a")
HMMER3.intersect_ids_from_files(gene_ids_white_list,
                                gene_ids_black_list,
                                final_genes_ids,
                                mode="only_a")
#"""
final_ids = IdSet()
final_ids.read(final_genes_ids)

AnnotationsRoutines.extract_annotation_from_gff(output_gff, final_ids,
                                                ["gene"], final_gff)
print("Extracting CDS...")
AUGUSTUS.extract_CDS_annotations_from_output(final_gff, final_CDS_gff)

print("Drawing histograms...")

for stat_file in output_evidence_stats, output_supported_stats, \
                 output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence, \
                 output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence, \
                 output_swissprot_pfam_or_hints_supported_transcripts_evidence, \
                 output_swissprot_pfam_and_hints_supported_transcripts_evidence:
示例#10
0
    def convert_rm_out_to_gff(input_file, output_file, annotated_repeat_classes_file, annotated_repeat_families_file):
        repeat_classes_set = IdSet()
        repeat_families_set = IdSet()
        with open(input_file, "r") as in_fd:
            for i in range(0, 3):
                in_fd.readline()

            with open(output_file, "w") as out_fd:
                for line in in_fd:
                    tmp = line.strip().split()
                    strand = "+" if tmp[8] == "+" else "-"
                    repeat_class_family = tmp[10].split("/")
                    if len(repeat_class_family) == 1:
                        repeat_class_family.append(".")
                    repeat_classes_set.add(repeat_class_family[0])
                    repeat_families_set.add("/".join(repeat_class_family))
                    parameters = "Class=%s;Family=%s;Matching_repeat=%s;SW_score=%s;Perc_div=%s;Perc_del=%s;Pers_ins=%s" \
                                 % (repeat_class_family[0], repeat_class_family[1],
                                    tmp[9], tmp[0], tmp[1], tmp[2], tmp[3])
                    out_fd.write("%s\tRepeatMasker\trepeat\t%s\t%s\t.\t%s\t.\t%s\n" % (tmp[4], tmp[5], tmp[6], strand, parameters))
        repeat_classes_set.write(annotated_repeat_classes_file)
        repeat_families_set.write(annotated_repeat_families_file)