示例#1
0
    def count_per_scaffold_feature_number(gff_file,
                                          out_file=None,
                                          feature_type_list=None):
        """
        Count features per scaffold in a tab-separated GFF file.

        :param gff_file: path to the GFF file.
        :param out_file: optional path; when given, the resulting dict is
                         written there with its write() method.
        :param feature_type_list: feature types (third column) to count.
                                  None or an empty list counts every feature.
        :return: SynDict mapping scaffold id (first column) to the number of
                 counted features.
        """
        feature_count_dict = SynDict()

        with open(gff_file, "r") as gff_fd:
            for line in gff_fd:
                # Comment lines are skipped; so are blank lines, which have
                # no third column and would otherwise raise IndexError.
                if line[0] == "#" or not line.strip():
                    continue
                line_list = line.split("\t")
                if feature_type_list and line_list[2] not in feature_type_list:
                    continue
                scaffold = line_list[0]
                if scaffold in feature_count_dict:
                    feature_count_dict[scaffold] += 1
                else:
                    feature_count_dict[scaffold] = 1

        if out_file:
            feature_count_dict.write(out_file)

        return feature_count_dict
示例#2
0
    def add_length_to_accordance_file(accordance_file, length_file,
                                      output_prefix):
        """
        Add transcript lengths to a gene-to-transcript correspondence and
        pick the longest transcript of every gene.

        Three files are written:
            <output_prefix>.all.correspondence     - gene, transcript, length
                                                     for every transcript
            <output_prefix>.longest.correspondence - gene, longest transcript,
                                                     its length
            <output_prefix>.longest.ids            - ids of the longest
                                                     transcripts

        :param accordance_file: file read into a SynDict with
                                allow_repeats_of_key=True.
        :param length_file: file read into a SynDict with values converted
                            by int().
        :param output_prefix: prefix of the three output files.
        """
        accordance_dict = SynDict(filename=accordance_file,
                                  allow_repeats_of_key=True)
        length_dict = SynDict(filename=length_file, expression=int)
        longest_list = IdList()

        all_output_file = "%s.all.correspondence" % output_prefix
        longest_output_file = "%s.longest.correspondence" % output_prefix
        longest_id_file = "%s.longest.ids" % output_prefix

        with open(all_output_file, "w") as all_out_fd, \
                open(longest_output_file, "w") as longest_out_fd:
            for gene in accordance_dict:
                current_transcript = None
                current_length = 0
                for transcript in accordance_dict[gene]:
                    transcript_length = length_dict[transcript]
                    # Strict ">" keeps the first transcript among equally
                    # long ones.
                    if transcript_length > current_length:
                        current_transcript = transcript
                        current_length = transcript_length
                    all_out_fd.write(
                        "%s\t%s\t%i\n" %
                        (gene, transcript, transcript_length))

                # NOTE: if no transcript of the gene has a positive length,
                # current_transcript stays None and is reported as such.
                longest_out_fd.write(
                    "%s\t%s\t%i\n" %
                    (gene, current_transcript, current_length))
                longest_list.append(current_transcript)
        longest_list.write(longest_id_file)
    def correct_regions_from_gff(
            self,
            reference,
            variants_vcf,
            gff_file,
            output_prefix=None,
            feature_type_list=["CDS"],
            unification_key="Parent",
            #raw_seq_per_line=False,
            vcf_with_masking=None,
            override_vcf_by_mask=None,
            use_ambiguous_nuccleotides=None):
        """
        Build corrected sequences of regions described in a GFF file.

        Features are grouped with AnnotationsRoutines.get_feature_dict(), the
        reference is corrected with self.correct_reference() for the intervals
        listed in "<output_prefix>.coordinates_only.list", and the resulting
        fragments are concatenated per region and written as FASTA.

        Files produced by this method itself:
            <output_prefix>.fasta                  - concatenated regions
            <output_prefix>.frameshifts.region.ids - regions (and fragment
                                                     indexes) whose length
                                                     changed by a value not
                                                     divisible by 3

        NOTE(review): every file name is formatted from output_prefix, so the
        default output_prefix=None yields names starting with "None." -
        presumably a real prefix is always expected; confirm against callers.

        :param reference: passed to self.correct_reference().
        :param variants_vcf: passed to self.correct_reference().
        :param gff_file: GFF file with the features.
        :param output_prefix: prefix of the output files.
        :param feature_type_list: feature types forwarded to get_feature_dict().
        :param unification_key: attribute key forwarded to get_feature_dict().
        :param vcf_with_masking: passed to self.correct_reference().
        :param override_vcf_by_mask: passed to self.correct_reference().
        :param use_ambiguous_nuccleotides: passed to self.correct_reference().
        """

        feature_dict = AnnotationsRoutines.get_feature_dict(
            gff_file,
            output_prefix=output_prefix,
            feature_type_list=feature_type_list,
            unification_key=unification_key)
        # Interval list expected to be produced by get_feature_dict() above
        # - TODO confirm that it is always written for the given prefix.
        region_file = "%s.coordinates_only.list" % output_prefix

        raw_regions = "%s.raw.seq" % output_prefix
        final_regions = "%s.fasta" % output_prefix

        regions_with_frameshift_file = "%s.frameshifts.region.ids" % output_prefix

        # Presumably writes one corrected fragment per line to raw_regions
        # (raw_seq_per_line=True); the generator below reads it that way.
        self.correct_reference(
            reference,
            raw_regions,
            variants_vcf,
            raw_seq_per_line=True,
            vcf_with_masking=vcf_with_masking,
            override_vcf_by_mask=override_vcf_by_mask,
            use_ambiguous_nuccleotides=use_ambiguous_nuccleotides,
            interval_list=region_file)

        # Filled as a side effect while new_regions_generator() is consumed.
        region_with_frameshift = SynDict()

        def new_regions_generator():
            # One line of raw_regions is read per fragment, in the iteration
            # order of feature_dict.
            with open(raw_regions, "r") as in_fd:
                for region_id in feature_dict:
                    seq = ""
                    for i in range(0, len(feature_dict[region_id])):
                        seq_fragment = in_fd.readline().strip()
                        # Original fragment length is end - start + 1; a
                        # length difference not divisible by 3 is recorded
                        # as a frameshift for this fragment index.
                        if ((int(feature_dict[region_id][i][2]) -
                             int(feature_dict[region_id][i][1]) + 1) -
                                len(seq_fragment)) % 3 != 0:
                            if region_id not in region_with_frameshift:
                                region_with_frameshift[region_id] = [i]
                            else:
                                region_with_frameshift[region_id].append(i)
                        seq += seq_fragment
                    # Strand is taken from the first fragment of the region;
                    # anything other than "+" is reverse-complemented.
                    yield SeqRecord(
                        seq=Seq(seq) if feature_dict[region_id][0][3] == "+"
                        else Seq(seq).reverse_complement(),
                        id=region_id,
                        description="")

        SeqIO.write(new_regions_generator(), final_regions, format="fasta")
        # Must run after SeqIO.write(): the dict is complete only once the
        # generator has been exhausted.
        region_with_frameshift.write(regions_with_frameshift_file,
                                     splited_values=True)
                                     splited_values=True)
示例#4
0
    def get_transcript_to_pep_accordance_from_gtf(gtf_file,
                                                  output_file,
                                                  comment_symbol="#"):
        """
        Extract transcript id -> protein ids correspondence from a GTF file
        and write it to output_file.

        Only lines whose last column carries both a transcript_id and a
        protein_id attribute contribute to the result.

        Tested on GTF files from Ensembl release 70.
        """
        accordance_dict = SynDict()
        with open(gtf_file, "r") as gtf_fd:
            for line in gtf_fd:
                if line[0] == comment_symbol:
                    continue
                attribute_column = line.strip().split("\t")[-1]
                found = {"transcript_id": None, "protein_id": None}
                for entry in attribute_column.split(";"):
                    name_and_value = entry.split()
                    if len(name_and_value) != 2:
                        continue
                    name, quoted_value = name_and_value
                    if name in found:
                        # first and last characters are the quotes
                        found[name] = quoted_value[1:-1]

                transcript_id = found["transcript_id"]
                protein_id = found["protein_id"]
                if transcript_id is None or protein_id is None:
                    continue

                if transcript_id not in accordance_dict:
                    accordance_dict[transcript_id] = {protein_id}
                else:
                    accordance_dict[transcript_id].add(protein_id)
        accordance_dict.write(output_file, splited_values=True)
示例#5
0
    def prepare_data_for_target_alignment(self,
                                          query_fasta,
                                          target_fasta,
                                          correspondence_file,
                                          out_dir,
                                          correspondence_query_column=0,
                                          correspondence_target_column=1):
        """
        Write per-query FASTA pairs for target alignment.

        For every query id of the correspondence file two files are created
        in out_dir: "<query_id>.query.fasta" with the query record and
        "<query_id>.target.fasta" with its corresponding target records.

        :return: tuple (set of query ids present in the correspondence file,
                 set of query ids from query_fasta absent from it).
        """
        query_dict = self.parse_seq_file(query_fasta, "parse")
        target_dict = self.parse_seq_file(target_fasta, "parse")

        self.safe_mkdir(out_dir)

        correspondence_dict = SynDict(filename=correspondence_file,
                                      allow_repeats_of_key=True,
                                      key_index=correspondence_query_column,
                                      value_index=correspondence_target_column)

        for query_id in correspondence_dict:
            query_records = self.record_by_id_generator(query_dict, [query_id])
            SeqIO.write(query_records,
                        "%s/%s.query.fasta" % (out_dir, query_id),
                        format="fasta")

            target_records = self.record_by_id_generator(
                target_dict, correspondence_dict[query_id])
            SeqIO.write(target_records,
                        "%s/%s.target.fasta" % (out_dir, query_id),
                        format="fasta")

        queries_with_targets_set = set(correspondence_dict.keys())
        queries_without_targets_set = set(query_dict.keys()) - queries_with_targets_set

        return queries_with_targets_set, queries_without_targets_set
示例#6
0
    def count_gaps(self):
        """
        Count gap symbols in every sequence of self.alignment.

        :return: tuple (gaps_dict, seq_length_dict) of SynDicts keyed by
                 sequence id: the number of self.gap_symbol occurrences and
                 self.length minus that number.
        """
        gaps_dict = SynDict()
        seq_length_dict = SynDict()

        for row_index in range(self.number_of_sequences):
            record = self.alignment[row_index]
            gap_number = str(record.seq).count(self.gap_symbol)
            gaps_dict[record.id] = gap_number
            seq_length_dict[record.id] = self.length - gap_number

        return gaps_dict, seq_length_dict
示例#7
0
文件: TRF.py 项目: mahajrod/RouToolPa
    def get_monomer_len_file_from_trf_gff(trf_gff, len_file):
        """
        Write the "Period" attribute of every record of a TRF GFF file,
        keyed by the record's "ID" attribute, to len_file.
        """
        len_dict = SynDict()

        with open(trf_gff, "r") as trf_fd:
            for gff_line in trf_fd:
                if gff_line.startswith("#"):
                    continue
                attributes = AnnotationsRoutines.get_description_dict_from_gff_string(gff_line)
                len_dict[attributes["ID"]] = attributes["Period"]

        len_dict.write(len_file)
示例#8
0
 def syn2fam(syn_file,
             fam_file,
             key_column=0,
             value_column=1,
             separator="\t"):
     """
     Read a synonym file (repeated keys allowed, values split) and write it
     to fam_file with splited_values=True.
     """
     reader_options = dict(filename=syn_file,
                           allow_repeats_of_key=True,
                           key_index=key_column,
                           value_index=value_column,
                           separator=separator,
                           split_values=True)
     SynDict(**reader_options).write(fam_file, splited_values=True)
示例#9
0
    def get_feature_dict(self,
                         input_gff,
                         output_prefix=None,
                         feature_type_list=None,
                         unification_key="Parent"):
        """
        Group GFF features by the first value of an attribute.

        :param input_gff: tab-separated GFF file; lines starting with "#"
                          are treated as comments.
        :param output_prefix: if set, four files are written:
                              <prefix>.tab, <prefix>.coordinates_only.tab,
                              <prefix>.list, <prefix>.coordinates_only.list.
        :param feature_type_list: feature types to keep; None means ["CDS"].
        :param unification_key: attribute used for grouping; features
                                without it are skipped.
        :return: SynDict mapping the first value of unification_key to a list
                 of [scaffold, start, end, strand] entries (strings as read
                 from the file).
        """
        if feature_type_list is None:
            feature_type_list = ["CDS"]

        feature_dict = SynDict()
        for line_list in self.file_line_as_list_generator(input_gff,
                                                          comments_prefix="#",
                                                          separator="\t"):
            # Filter by type first: attributes of skipped rows need no parsing.
            if line_list[self.GFF_FEATURETYPE_COLUMN] not in feature_type_list:
                continue

            annotation_dict = self.parse_gff_annotation_string_to_dict(
                line_list[self.GFF_ATTRIBUTE_COLUMN])

            if unification_key not in annotation_dict:
                continue

            group_id = annotation_dict[unification_key][0]
            if group_id not in feature_dict:
                feature_dict[group_id] = []

            feature_dict[group_id].append([
                line_list[self.GFF_SCAFFOLD_COLUMN],
                line_list[self.GFF_START_COLUMN],
                line_list[self.GFF_END_COLUMN],
                line_list[self.GFF_STRAND_COLUMN]
            ])

        if output_prefix:
            # Each representation is written twice: with keys and values only.
            for extension, value_expression in (
                    ("tab", self.feature_list_entry_to_tab_str),
                    ("list", self.feature_list_entry_to_gatk_interval_str)):
                feature_dict.write(
                    "%s.%s" % (output_prefix, extension),
                    value_expression=value_expression,
                    line_per_value=True)
                feature_dict.write(
                    "%s.coordinates_only.%s" % (output_prefix, extension),
                    value_expression=value_expression,
                    line_per_value=True,
                    values_only=True)

        return feature_dict
示例#10
0
    def extract_GO_terms_from_emapper_annotation_file(emapper_annotation_file,
                                                      output_file):
        """
        Read column 5 (comma-separated values) keyed by column 0 from an
        emapper annotation file, write it with a header to output_file and
        return the resulting SynDict.
        """
        go_terms = SynDict(filename=emapper_annotation_file,
                           separator="\t",
                           comments_prefix="#",
                           key_index=0,
                           value_index=5,
                           split_values=True,
                           values_separator=",")
        go_terms.header = "#protein_id\tGO_terms"
        go_terms.write(output_file, header=True, splited_values=True)

        return go_terms
示例#11
0
 def add_len_to_simple_output(top_hits_simple, len_file, out_file):
     """
     Add a length and a covered fraction to every line of a tab-separated
     top-hits file.

     Output columns: column 0, its value from len_file, column 3, column 1,
     column 2, (column 2 - column 1 + 1) / length.
     """
     len_dict = SynDict()
     len_dict.read(len_file)
     with open(top_hits_simple, "r") as in_fd, open(out_file, "w") as out_fd:
         for line in in_fd:
             fields = line.strip().split("\t")
             seq_id = fields[0]
             seq_length = len_dict[seq_id]
             label = fields[3]
             # presumably start and end coordinates of the hit - TODO confirm
             start = fields[1]
             end = fields[2]
             fraction = (float(end) - float(start) + 1) / float(seq_length)
             out_fd.write("%s\t%s\t%s\t%s\t%s\t%f\n" %
                          (seq_id, seq_length, label, start, end, fraction))
示例#12
0
 def replace_region_names_in_gff(input_gff, synonyms_file, output_gff):
     """
     Copy input_gff to output_gff replacing the first column of every
     non-comment line by its synonym, when one is present in synonyms_file.
     Comment lines and lines without a synonym are copied unchanged.
     """
     syn_dict = SynDict()
     syn_dict.read(synonyms_file, comments_prefix="#")
     with open(input_gff, "r") as in_fd, open(output_gff, "w") as out_fd:
         for line in in_fd:
             if line[0] != "#":
                 columns = line.split("\t")
                 if columns[0] in syn_dict:
                     columns[0] = syn_dict[columns[0]]
                     # the last column still carries the line break
                     line = "\t".join(columns)
             out_fd.write(line)
示例#13
0
    def extract_clusters_by_element_ids_from_file(self,
                                                  cluster_file,
                                                  element_file,
                                                  output_file,
                                                  mode="w",
                                                  cluster_column=0,
                                                  element_column=1,
                                                  column_separator="\t",
                                                  element_separator=",",
                                                  id_column=None):
        """
        Read clusters and element ids from files, extract clusters with
        self.extract_clusters_by_element_ids() and write them to output_file.

        mode: "w" - if elements from element_id_list are present in cluster extracts only that elements
              "a" - if elements from element_id_list are present in cluster extracts all elements
        """
        cluster_reader_options = dict(filename=cluster_file,
                                      split_values=True,
                                      comments_prefix="#",
                                      key_index=cluster_column,
                                      value_index=element_column,
                                      separator=column_separator,
                                      values_separator=element_separator)
        cluster_dict = SynDict(**cluster_reader_options)

        element_id_list = IdList(filename=element_file,
                                 comments_prefix="#",
                                 column_number=id_column)

        self.extract_clusters_by_element_ids(
            cluster_dict, element_id_list,
            mode=mode).write(output_file, splited_values=True)
示例#14
0
    def extract_predicted_gene_names_from_emapper_annotation_file(
            emapper_annotation_file, output_file):
        """
        Read column 4 (comma-separated values) keyed by column 0 from an
        emapper annotation file, write it with a header to output_file and
        return the resulting SynDict.
        """
        gene_names = SynDict(filename=emapper_annotation_file,
                             separator="\t",
                             comments_prefix="#",
                             key_index=0,
                             value_index=4,
                             split_values=True,
                             values_separator=",")
        gene_names.header = "#protein_id\tpredicted_gene_name"
        gene_names.write(output_file, header=True, splited_values=True)

        return gene_names
示例#15
0
    def merge_clusters(clusters_dict,
                       label_species="False",
                       separator_for_labeling="_",
                       species_label_first=True):
        """
        Merge per-species cluster dicts into a single SynDict.

        :param clusters_dict: mapping species -> {cluster id -> sequence names}.
        :param label_species: if truthy, every sequence name is combined with
                              its species label.
        :param separator_for_labeling: separator between label and name.
        :param species_label_first: True puts the label before the name.
        :return: SynDict mapping cluster id -> list of (labeled) names.

        NOTE(review): the default of label_species is the string "False",
        which is truthy, so labeling is ON by default. Kept as is because
        callers may rely on it - confirm the intended default.
        """
        def labeled_name(label, name):
            if not label_species:
                return name
            if species_label_first:
                return "%s%s%s" % (label, separator_for_labeling, name)
            return "%s%s%s" % (name, separator_for_labeling, label)

        merged_clusters = SynDict()
        for species in clusters_dict:
            species_clusters = clusters_dict[species]
            for cluster in species_clusters:
                if cluster not in merged_clusters:
                    merged_clusters[cluster] = []
                merged_clusters[cluster].extend(
                    labeled_name(species, sequence_name)
                    for sequence_name in species_clusters[cluster])

        return merged_clusters
示例#16
0
    def extract_single_copy_clusters_from_files(
            self,
            list_of_cluster_files,
            output_file,
            label_elements=False,
            separator="@",
            label_position="first",
            function_to_convert_filename_to_label=None):
        """
        Read cluster files, extract single copy clusters with
        self.extract_single_copy_clusters(), write them to output_file and
        return them.

        Every file is labeled either by
        function_to_convert_filename_to_label(filename) or, when no function
        is given, by the second element returned by self.split_filename().
        """
        dict_of_cluster_dicts = OrderedDict()
        for filename in list_of_cluster_files:
            if function_to_convert_filename_to_label:
                label = function_to_convert_filename_to_label(filename)
            else:
                # basename is used as label
                label = self.split_filename(filename)[1]

            clusters = SynDict()
            dict_of_cluster_dicts[label] = clusters
            clusters.read(filename, split_values=True, comments_prefix="#")

        sc_clusters_dict = self.extract_single_copy_clusters(
            dict_of_cluster_dicts,
            label_elements=label_elements,
            separator=separator,
            label_position=label_position)
        sc_clusters_dict.write(output_file, splited_values=True)

        return sc_clusters_dict
示例#17
0
    def replace_label(cluster_dict,
                      syn_dict=None,
                      old_separator="@",
                      old_label_position="first",
                      new_separator="@",
                      new_label_position="first"):
        """
        Rebuild the labels of all cluster elements.

        Every element is split by old_separator into a label (first or last
        fragment, depending on old_label_position) and an element id (the
        remaining fragments). The label is replaced via syn_dict when one is
        given, and the element is reassembled with new_separator with the
        label placed according to new_label_position.

        :return: SynDict mapping cluster id -> list of relabeled elements.
        """
        def relabel(element):
            fragments = element.split(old_separator)
            if old_label_position == "first":
                label, id_fragments = fragments[0], fragments[1:]
            else:
                label, id_fragments = fragments[-1], fragments[:-1]
            element_id = old_separator.join(id_fragments)
            new_label = syn_dict[label] if syn_dict else label

            if new_label_position == 'first':
                return "%s%s%s" % (new_label, new_separator, element_id)
            return "%s%s%s" % (element_id, new_separator, new_label)

        new_cluster_dict = SynDict()
        for cluster in cluster_dict:
            new_cluster_dict[cluster] = [relabel(element)
                                         for element in cluster_dict[cluster]]

        return new_cluster_dict
示例#18
0
 def add_add_new_column_by_key_column(self,
                                      table_file,
                                      syn_dict_file,
                                      key_column,
                                      output_file,
                                      new_column_name=None,
                                      separator='\t',
                                      absent_value="."):
     """
     Append a column to a table; values are looked up by a key column.

     :param table_file: input table.
     :param syn_dict_file: file with key -> values; values of repeated keys
                           are collected and joined by "|" in the output.
     :param key_column: index of the column of table_file holding the key.
     :param output_file: output table.
     :param new_column_name: if set, the first line of table_file is treated
                             as a header and this name is appended to it;
                             otherwise every line is processed as data.
     :param separator: column separator of the table.
     :param absent_value: value written for keys absent from syn_dict_file.
     """
     column_syn_dict = SynDict(filename=syn_dict_file,
                               allow_repeats_of_key=True,
                               values_separator="@")
     with open(table_file, "r") as in_fd, open(output_file, "w") as out_fd:
         if new_column_name:
             # The header gets the same separator as the data rows.
             out_fd.write(in_fd.readline().strip() +
                          "%s%s\n" % (separator, new_column_name))
         # Rows are processed whether or not a header name was supplied.
         for line in in_fd:
             line_list = line.strip().split(separator)
             key = line_list[key_column]
             if key in column_syn_dict:
                 line_list.append("|".join(column_syn_dict[key]))
             else:
                 line_list.append(absent_value)
             out_fd.write(separator.join(line_list) + "\n")
示例#19
0
    def rename_scaffolds_in_gff(self,
                                input_gff,
                                syn_file,
                                output_prefix,
                                verbose=True):
        """
        Rename scaffolds (first column) of a GFF file using a synonym file.

        Files written:
            <output_prefix>.renamed.gff           - comment lines and records
                                                    with renamed scaffolds
            <output_prefix>.skipped.gff           - records whose scaffold
                                                    has no synonym
            <output_prefix>.skipped_scaffolds.ids - ids of such scaffolds

        :param input_gff: input GFF file.
        :param syn_file: file with old name -> new name synonyms.
        :param output_prefix: prefix of the output files.
        :param verbose: if True, print the number of not renamed scaffolds.
        """
        syn_dict = SynDict(filename=syn_file)
        skipped_id_list = IdSet()

        output_gff = "%s.renamed.gff" % output_prefix
        skipped_gff = "%s.skipped.gff" % output_prefix
        skipped_id_file = "%s.skipped_scaffolds.ids" % output_prefix

        with self.metaopen(input_gff, "r") as in_fd, \
             self.metaopen(output_gff, "w") as out_fd, \
             self.metaopen(skipped_gff, "w") as skipped_fd:

            for line in in_fd:
                if line[0] == "#":
                    out_fd.write(line)
                    # A comment is not a record: without this it would also
                    # be reported as a skipped scaffold.
                    continue
                gff_list = line.split("\t")
                scaffold = gff_list[0]
                if scaffold in syn_dict:
                    gff_list[0] = syn_dict[scaffold]
                    out_fd.write("\t".join(gff_list))
                else:
                    skipped_fd.write(line)
                    skipped_id_list.add(scaffold)

        if verbose:
            print("Not renamed scaffolds: %i" % len(skipped_id_list))

        skipped_id_list.write(skipped_id_file)
示例#20
0
    def combine_count_files(count_file_list,
                            output_file,
                            sample_name_list=None):
        """
        Combine several two-column count files into one table.

        :param count_file_list: count files; lines starting with "__" are
                                treated as comments.
        :param output_file: file the combined TwoLvlDict is written to.
        :param sample_name_list: optional sample names, one per file; file
                                 names are used when it is empty or None.
        :raises ValueError: if the numbers of names and files differ.
        """
        if (sample_name_list is not None) and \
                (len(count_file_list) != len(sample_name_list)):
            raise ValueError(
                "Several files doesn't have corresponding sample name")

        labels = sample_name_list if sample_name_list else count_file_list

        reader_options = dict(header=False,
                              separator="\t",
                              allow_repeats_of_key=False,
                              split_values=False,
                              values_separator=",",
                              key_index=0,
                              value_index=1,
                              close_after_if_file_object=False,
                              expression=None,
                              comments_prefix="__")

        count_table = TwoLvlDict()
        for sample, filename in zip(labels, count_file_list):
            count_table[sample] = SynDict(filename=filename, **reader_options)

        count_table.write(output_file)
示例#21
0
    def split_proteins_per_species(self, dir_with_proteins, output_dir, input_format="fasta", output_format="fasta"):
        """
        Split protein sequences into one file per species.

        The part of every record id before the first "." is used as the
        species (taxa) id; the rest of the id becomes the record id in the
        output file "<output_dir><taxa_id>.pep".

        :param dir_with_proteins: path or list of paths with input sequences.
        :param output_dir: directory for the output files (created if needed).
        :param input_format: format of the input files.
        :param output_format: format of the output files.
        """
        import os  # local import: only needed to clean up the index file

        input_files = self.make_list_of_path_to_files([dir_with_proteins] if isinstance(dir_with_proteins, str) else dir_with_proteins)

        out_dir = self.check_path(output_dir)
        self.safe_mkdir(out_dir)

        index_file = "temp.idx"
        protein_dict = SeqIO.index_db(index_file, input_files, format=input_format)

        try:
            syn_dict = SynDict()

            for protein_id in protein_dict:
                taxa_id = protein_id.split(".")[0]
                if taxa_id not in syn_dict:
                    syn_dict[taxa_id] = []
                syn_dict[taxa_id].append(protein_id)

            def renamed_records_generator(record_dict, taxa_id):
                for record_id in syn_dict[taxa_id]:
                    record = deepcopy(record_dict[record_id])
                    # drop the species prefix from the id
                    record.id = ".".join(record_id.split(".")[1:])
                    yield record

            for taxa_id in syn_dict:
                out_file = "%s%s.pep" % (out_dir, taxa_id)
                SeqIO.write(renamed_records_generator(protein_dict, taxa_id), out_file, format=output_format)
        finally:
            # A stale index would be reused by the next call and breaks it
            # when the input files differ, so it is always removed.
            protein_dict.close()
            if os.path.exists(index_file):
                os.remove(index_file)
示例#22
0
    def convert_emapper_annotation_file_to_fam(emapper_annotation_file,
                                               output_fam,
                                               eggnogdb_prefix=None,
                                               species_name=None,
                                               label_separator="@",
                                               diamond_mode=False,
                                               database=None):
        """
        Convert an emapper annotation file to a fam file
        (family id -> gene ids).

        In the default mode the family id is the part of column 10 before
        the first "|". In diamond mode column 9 is parsed as comma-separated
        "<family>@<database>" entries and the family of the requested
        database is used ("unknown" if the database is absent).

        :param eggnogdb_prefix: optional prefix prepended to family ids.
        :param species_name: optional label prepended to gene ids
                             (joined by label_separator).
        :raises ValueError: in diamond mode when database is not set.
        """
        fam_dict = SynDict()

        if diamond_mode and (database is None):
            raise ValueError(
                "ERROR!!! Database name (veNOG or other) is required in diamond mode!"
            )

        if diamond_mode:

            def extract_fam_from_line(line_list):
                # entries look like "<family>@<database>"; key by database
                db_dict = dict(entry.split("@")[::-1]
                               for entry in line_list[9].split(","))
                return db_dict.get(database, "unknown")
        else:

            def extract_fam_from_line(line_list):
                return line_list[10].split("|")[0]

        with open(emapper_annotation_file, "r") as annotations_fd:
            for line in annotations_fd:
                if line[0] == "#":
                    continue
                columns = line.split("\t")

                fam_id = extract_fam_from_line(columns)
                if eggnogdb_prefix is not None:
                    fam_id = eggnogdb_prefix + fam_id

                if species_name:
                    gene_id = "%s%s%s" % (species_name, label_separator,
                                          columns[0])
                else:
                    gene_id = columns[0]

                if fam_id not in fam_dict:
                    fam_dict[fam_id] = []
                fam_dict[fam_id].append(gene_id)

        fam_dict.write(filename=output_fam, splited_values=True)
示例#23
0
    def create_gvf_files_from_species_gene_fam_and_gene_GO_fam(
            self, species_gene_fam_file, gene_GO_fam_file, output_directory):
        """
        Write "<output_directory>/<species>.gvf" for every species.

        Each line holds a gene of the species followed by its tab-separated
        values from gene_GO_fam_file. Genes absent from that file are
        reported with a warning and skipped.
        """
        species_gene_dict = SynDict(filename=species_gene_fam_file,
                                    split_values=True)
        gene_GO_dict = SynDict(filename=gene_GO_fam_file, split_values=True)

        self.safe_mkdir(output_directory)

        for species in species_gene_dict:
            gvf_path = "%s/%s.gvf" % (output_directory, species)
            with open(gvf_path, "w") as out_fd:
                for gene in species_gene_dict[species]:
                    if gene in gene_GO_dict:
                        out_fd.write("%s\t%s\n" %
                                     (gene, "\t".join(gene_GO_dict[gene])))
                    else:
                        print(
                            "WARNING gene %s for species %s is absent in gene_GO file"
                            % (gene, species))
示例#24
0
    def combine_syn_dicts(list_of_syn_dict):
        """
        Combine several SynDicts into a new one.

        Values of keys present in more than one dict are combined with "+="
        in the order of list_of_syn_dict. The input dicts are not modified.

        :param list_of_syn_dict: iterable of dict-like objects.
        :return: SynDict with the combined content.
        """
        import copy  # local import: used only to detach values from inputs

        combined_dict = SynDict()
        for syn_dict in list_of_syn_dict:
            for key in syn_dict:
                if key in combined_dict:
                    combined_dict[key] += syn_dict[key]
                else:
                    # Shallow copy: otherwise a later in-place "+=" would
                    # extend the list owned by the input dict.
                    combined_dict[key] = copy.copy(syn_dict[key])

        return combined_dict
示例#25
0
    def prepare_annotation_file_from_transcript_and_cds(
            self,
            transcript_file,
            cds_file,
            correspondence_file,
            output_prefix,
            format="fasta",
            correspondence_key_column=0,
            correspondence_value_column=1,
            verbose=False):
        """
        Locate every CDS inside its transcript and write the coordinates.

        Files written:
            <output_prefix>.annotation       - transcript id, "+", 1-based
                                               start of the CDS in the
                                               transcript, CDS length
            <output_prefix>.no_cds.id        - transcripts absent from the
                                               correspondence file
            <output_prefix>.not_found_cds.id - transcripts whose CDS sequence
                                               does not occur in them

        Sequences are compared case-insensitively. With verbose=True every
        skipped transcript is also reported on stdout.
        """
        transcript_dict = self.parse_seq_file(transcript_file,
                                              "parse",
                                              format=format)
        cds_dict = self.parse_seq_file(cds_file, "parse", format=format)

        correspondence_dict = SynDict(filename=correspondence_file,
                                      comments_prefix="#",
                                      key_index=correspondence_key_column,
                                      value_index=correspondence_value_column)

        no_corresponding_cds_transcript_list = IdList()
        cds_not_found_transcript_list = IdList()

        with open("%s.annotation" % output_prefix, "w") as annotation_fd:
            for transcript_id in transcript_dict:
                if transcript_id not in correspondence_dict:
                    no_corresponding_cds_transcript_list.append(transcript_id)
                    if verbose:
                        print(
                            "No cds in correspondence file for transcript %s" %
                            transcript_id)
                    continue

                cds_seq = cds_dict[correspondence_dict[transcript_id]].seq
                transcript_seq = transcript_dict[transcript_id].seq
                cds_start = transcript_seq.upper().find(cds_seq.upper())

                if cds_start == -1:
                    cds_not_found_transcript_list.append(transcript_id)
                    if verbose:
                        print("CDS was not found for transcript %s" %
                              transcript_id)
                    continue

                annotation_fd.write(
                    "%s\t+\t%i\t%i\n" %
                    (transcript_id, cds_start + 1, len(cds_seq)))

        no_corresponding_cds_transcript_list.write(
            "%s.no_cds.id" % output_prefix)
        cds_not_found_transcript_list.write(
            "%s.not_found_cds.id" % output_prefix)
示例#26
0
    def extract_proteins_from_selected_families(
            families_id_file,
            fam_file,
            pep_file,
            output_dir="./",
            pep_format="fasta",
            out_prefix=None,
            create_dir_for_each_family=False):
        """
        Write the proteins of selected families to separate files.

        :param families_id_file: file with family ids; if not set, all
                                 families of fam_file are extracted.
        :param fam_file: family file (comma-separated members).
        :param pep_file: protein sequences, indexed into "tmp.idx" (removed
                         at the end).
        :param output_dir: output directory (created if needed).
        :param pep_format: format of pep_file and of the output files.
        :param out_prefix: if set, used as the file name instead of the
                           family id; this also forces a directory per family.
        :param create_dir_for_each_family: put every family into its own
                                           directory.
        """
        from RouToolPa.Routines import SequenceRoutines

        fam_id_list = IdList()
        fam_dict = SynDict()

        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
        if families_id_file:
            fam_id_list.read(families_id_file)
        fam_dict.read(fam_file, split_values=True, values_separator=",")
        protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

        selected_families = fam_id_list if families_id_file else fam_dict
        for fam_id in selected_families:
            if fam_id not in fam_dict:
                print("%s was not found" % fam_id)
                continue

            file_stem = out_prefix if out_prefix else fam_id
            if create_directory_for_each_family:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.pep" % (fam_dir, file_stem)
            else:
                out_file = "%s/%s.pep" % (out_dir, file_stem)

            family_records = SequenceRoutines.record_by_id_generator(
                protein_dict, fam_dict[fam_id], verbose=True)
            SeqIO.write(family_records, out_file, format=pep_format)

        os.remove("tmp.idx")
示例#27
0
    def extract_clusters_by_size_from_file(self,
                                           cluster_file,
                                           min_cluster_size=None,
                                           max_cluster_size=None,
                                           white_list_ids=None,
                                           out_file=None):
        """
        Read clusters from cluster_file (values split) and pass them, with
        all other arguments, to self.extract_clusters_by_size(); its result
        is returned.
        """
        size_filter_options = dict(min_cluster_size=min_cluster_size,
                                   max_cluster_size=max_cluster_size,
                                   white_list_ids=white_list_ids,
                                   out_file=out_file)
        clusters = SynDict(filename=cluster_file, split_values=True)

        return self.extract_clusters_by_size(clusters, **size_filter_options)
示例#28
0
    def remove_elements_by_ids_from_files(self,
                                          input_file,
                                          output_file,
                                          black_list_file,
                                          mode="full"):
        """
        Read clusters from input_file and ids from black_list_file, filter
        the clusters with self.remove_elements_by_ids() (mode is forwarded
        unchanged) and write the result to output_file.
        """
        clusters = SynDict(filename=input_file, split_values=True)
        ids_to_remove = IdList(filename=black_list_file)

        self.remove_elements_by_ids(
            clusters, ids_to_remove,
            mode=mode).write(output_file, splited_values=True)
示例#29
0
    def count_column_values_from_file(self,
                                      input_file,
                                      column_number,
                                      output_file=None,
                                      separator="\t",
                                      comments_prefix="#",
                                      verbose=False):
        """
        Count occurrences of every value in one column of a table.

        :param input_file: table read with self.file_line_as_list_generator().
        :param column_number: index of the column to count.
        :param output_file: optional file the counts are written to.
        :param separator: column separator.
        :param comments_prefix: prefix of comment lines.
        :param verbose: not used by this method.
        :return: SynDict mapping value -> number of occurrences.
        """
        column_value_dict = SynDict()

        rows = self.file_line_as_list_generator(
            input_file, separator=separator, comments_prefix=comments_prefix)
        for row in rows:
            value = row[column_number]
            if value not in column_value_dict:
                column_value_dict[value] = 0
            column_value_dict[value] += 1

        if output_file:
            column_value_dict.write(output_file)

        return column_value_dict
示例#30
0
    def cluster_sequence_names_by_id_fragment(self,
                                              seq_id_list,
                                              id_element_index,
                                              id_separator="_",
                                              output_prefix=None):
        """
        Cluster sequence ids by one fragment of the id.

        Every id is split by id_separator and the fragment with index
        id_element_index is used as the cluster name. Ids with too few
        fragments are skipped.

        :param seq_id_list: iterable of sequence ids.
        :param id_element_index: index of the fragment used as cluster name.
        :param id_separator: separator of the id fragments.
        :param output_prefix: if set, clusters are written to
                              "<prefix>.seqid.clusters" and skipped ids to
                              "<prefix>.seqid.skipped.ids".
        :return: SynDict mapping cluster name -> list of sequence ids.
        """
        cluster_dict = SynDict()
        skipped_id_list = IdList()

        for seq_id in seq_id_list:
            seq_id_splited = seq_id.split(id_separator)
            fragment_number = len(seq_id_splited)
            # The bounds check also covers negative indexes.
            if -fragment_number <= id_element_index < fragment_number:
                # The cluster name is a fragment of the current id, not an
                # element of the input id list.
                cluster_id = seq_id_splited[id_element_index]
                if cluster_id in cluster_dict:
                    cluster_dict[cluster_id].append(seq_id)
                else:
                    cluster_dict[cluster_id] = [seq_id]
            else:
                skipped_id_list.append(seq_id)

        if output_prefix:
            cluster_dict.write("%s.seqid.clusters" % output_prefix,
                               splited_values=True)
            skipped_id_list.write("%s.seqid.skipped.ids" % output_prefix)

        return cluster_dict