Example #1
    def count_column_values_from_file(self,
                                      input_file,
                                      column_number,
                                      output_file=None,
                                      separator="\t",
                                      comments_prefix="#",
                                      verbose=False):

        column_value_dict = SynDict()

        for line_list in self.file_line_as_list_generator(
                input_file, separator=separator,
                comments_prefix=comments_prefix):

            if line_list[column_number] in column_value_dict:
                column_value_dict[line_list[column_number]] += 1
            else:
                column_value_dict[line_list[column_number]] = 1

        if output_file:
            column_value_dict.write(output_file)

        if verbose:
            print("#Column %i (0-based) contains %i different values" %
                  (column_number, len(column_value_set)))

        return column_value_dict
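
A minimal call sketch for the method above; the `routines` instance name and file names are hypothetical, only the signature shown above is assumed:

    # Hypothetical usage: count occurrences of each value in column 2 (0-based)
    # of a tab-separated file and also write the counts to "column2.counts".
    counts = routines.count_column_values_from_file("table.tsv",
                                                    2,
                                                    output_file="column2.counts",
                                                    verbose=True)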
Example #2
    def convert_emapper_annotation_file_to_fam(emapper_annotation_file,
                                               output_fam,
                                               eggnogdb_prefix=None,
                                               species_name=None,
                                               label_separator="."):
        fam_dict = SynDict()
        with open(emapper_annotation_file, "r") as annotations_fd:
            for line in annotations_fd:
                if line[0] == "#":
                    continue
                line_list = line.split("\t")

                fam_id = line_list[10].split("|")[0]
                if eggnogdb_prefix is not None:
                    fam_id = eggnogdb_prefix + fam_id

                gene_id = "%s%s%s" % (
                    species_name, label_separator,
                    line_list[0]) if species_name else line_list[0]

                if fam_id in fam_dict:
                    fam_dict[fam_id].append(gene_id)
                else:
                    fam_dict[fam_id] = [gene_id]

        fam_dict.write(filename=output_fam, splited_values=True)
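
For orientation, the SynDict pattern used throughout these examples is a dict-like container whose write(..., splited_values=True) joins list values into a single column. A minimal sketch, assuming only the write() call used above and the default tab/comma separators seen in the other examples:

    # Hypothetical illustration of the .fam layout produced above:
    # one family id per line, followed by its comma-separated member ids.
    fam = SynDict()
    fam["NOG12345"] = ["speciesA.gene1", "speciesA.gene2"]
    fam.write(filename="example.fam", splited_values=True)
    # expected line: "NOG12345<TAB>speciesA.gene1,speciesA.gene2"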
Example #3
    def correct_regions_from_gff(
            self,
            reference,
            variants_vcf,
            gff_file,
            output_prefix=None,
            feature_type_list=["CDS"],
            unification_key="Parent",
            #raw_seq_per_line=False,
            vcf_with_masking=None,
            override_vcf_by_mask=None,
            use_ambiguous_nuccleotides=None):

        feature_dict = AnnotationsRoutines.get_feature_dict(
            gff_file,
            output_prefix=output_prefix,
            feature_type_list=feature_type_list,
            unification_key=unification_key)
        region_file = "%s.coordinates_only.list" % output_prefix

        raw_regions = "%s.raw.seq" % output_prefix
        final_regions = "%s.fasta" % output_prefix

        regions_with_frameshift_file = "%s.frameshifts.region.ids" % output_prefix

        self.correct_reference(
            reference,
            raw_regions,
            variants_vcf,
            raw_seq_per_line=True,
            vcf_with_masking=vcf_with_masking,
            override_vcf_by_mask=override_vcf_by_mask,
            use_ambiguous_nuccleotides=use_ambiguous_nuccleotides,
            interval_list=region_file)

        region_with_frameshift = SynDict()

        def new_regions_generator():
            with open(raw_regions, "r") as in_fd:
                for region_id in feature_dict:
                    seq = ""
                    for i in range(0, len(feature_dict[region_id])):
                        seq_fragment = in_fd.readline().strip()
                        if ((int(feature_dict[region_id][i][2]) -
                             int(feature_dict[region_id][i][1]) + 1) -
                                len(seq_fragment)) % 3 != 0:
                            if region_id not in region_with_frameshift:
                                region_with_frameshift[region_id] = [i]
                            else:
                                region_with_frameshift[region_id].append(i)
                        seq += seq_fragment
                    yield SeqRecord(
                        seq=Seq(seq) if feature_dict[region_id][0][3] == "+"
                        else Seq(seq).reverse_complement(),
                        id=region_id,
                        description="")

        SeqIO.write(new_regions_generator(), final_regions, format="fasta")
        region_with_frameshift.write(regions_with_frameshift_file,
                                     splited_values=True)
Example #4
    def get_families_from_top_hits(top_hits_file, fam_file):

        hit_dict = SynDict()
        hit_dict.read(top_hits_file, allow_repeats_of_key=True, key_index=1, value_index=0, comments_prefix="#")
        hit_dict.write(fam_file, splited_values=True)

        return hit_dict
Example #5
 def extract_dom_names_hits_from_domtblout(domtblout_file, output_file):
     hits_dict = SynDict()
     hits_dict.read(domtblout_file, header=False, separator=None, allow_repeats_of_key=True,
                    key_index=3, value_index=0, comments_prefix="#")
     if output_file:
         hits_dict.write(output_file, splited_values=True)
     return hits_dict
Example #6
    def get_transcript_to_pep_accordance_from_gtf(gtf_file, output_file, comment_symbol="#"):
        """
        Tested on GTF files from Ensembl release 70
        """
        accordance_dict = SynDict()
        with open(gtf_file, "r") as gtf_fd:
            for line in gtf_fd:
                if line[0] == comment_symbol:
                    continue
                tmp_list = line.strip().split("\t")
                tmp_list = tmp_list[-1].split(";")
                protein_id = None
                transcript_id = None
                #print tmp_list
                for entry in tmp_list:
                    tmp_entry = entry.split()

                    if len(tmp_entry) != 2:
                        continue
                    if tmp_entry[0] == "transcript_id":
                        #print "tttt"
                        transcript_id = tmp_entry[1][1:-1]  # remove quotes
                    elif tmp_entry[0] == "protein_id":
                        #print "ppppp"
                        protein_id = tmp_entry[1][1:-1]

                if (transcript_id is not None) and (protein_id is not None):
                    if transcript_id in accordance_dict:
                        accordance_dict[transcript_id].add(protein_id)
                    else:
                        accordance_dict[transcript_id] = {protein_id}
        accordance_dict.write(output_file, splited_values=True)
Example #7
    def count_unique_positions_per_sequence_from_file(self,
                                                      alignment_file,
                                                      output_prefix,
                                                      format="fasta",
                                                      gap_symbol="-",
                                                      return_mode="absolute",
                                                      verbose=True):

        alignment = AlignIO.read(alignment_file, format=format)
        number_of_sequences = len(alignment)
        alignment_length = len(alignment[0])
        position_presence_matrix = self.get_position_presence_matrix(
            alignment, gap_symbol=gap_symbol, verbose=verbose)
        unique_position_count_dict = SynDict()
        unique_position_count_percent_dict = SynDict()

        for row in range(0, number_of_sequences):
            sequence_id = alignment[row].id
            unique_positions = 0
            for column in range(0, alignment_length):
                if position_presence_matrix[row, column] in (1, -1):
                    unique_positions += 1

            unique_position_count_dict[sequence_id] = unique_positions
            unique_position_count_percent_dict[sequence_id] = 100 * float(
                unique_positions) / (alignment_length -
                                     str(alignment[row].seq).count(gap_symbol))

        unique_position_count_dict.write("%s.absolute_counts" % output_prefix)
        unique_position_count_percent_dict.write("%s.percent_counts" %
                                                 output_prefix)

        return unique_position_count_dict if return_mode == "absolute" else unique_position_count_percent_dict
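
A hedged usage sketch for the method above; the `alignment_routines` object and file names are hypothetical, only the parameters shown in the signature are assumed:

    # Hypothetical usage: per-sequence counts of unique alignment positions,
    # written to "aln.absolute_counts" and "aln.percent_counts".
    unique_counts = alignment_routines.count_unique_positions_per_sequence_from_file(
        "aln.fasta",
        "aln",
        format="fasta",
        return_mode="absolute")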
Example #8
    def count_per_scaffold_feature_number(gff_file, out_file=None, feature_type_list=[]):
        feature_count_dict = SynDict()

        if feature_type_list:
            def check_feature_type(feature_type):
                return feature_type in feature_type_list
        else:
            def check_feature_type(feature_type):
                return True

        with open(gff_file, "r") as gff_fd:
            for line in gff_fd:
                if line[0] == "#":
                    continue
                line_list = line.split("\t")
                if check_feature_type(line_list[2]):
                    if line_list[0] in feature_count_dict:
                        feature_count_dict[line_list[0]] += 1
                    else:
                        feature_count_dict[line_list[0]] = 1

        if out_file:
            feature_count_dict.write(out_file)

        return feature_count_dict
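
A brief usage sketch (file names are hypothetical; the function is assumed to be callable directly, as its self-less signature suggests):

    # Hypothetical usage: count gene and mRNA features per scaffold in a GFF file
    # and write the per-scaffold totals to "per_scaffold.counts".
    per_scaffold = count_per_scaffold_feature_number("annotation.gff",
                                                     out_file="per_scaffold.counts",
                                                     feature_type_list=["gene", "mRNA"])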
Example #9
File: PAML.py Project: melakbet/MAVR
 def find_leaves_with_positive_selection(self, write=True):
     leaf_values_dict = self.get_leaf_values(write=False)
     positive_selected_leaves_dict = SynDict()
     for leaf_name in leaf_values_dict["W"]:
         if leaf_values_dict["W"][leaf_name] > 1:
             positive_selected_leaves_dict[leaf_name] = leaf_values_dict["W"][leaf_name]
     if write:
         positive_selected_leaves_dict.write("leaves_with_positive_selection.t")
     return positive_selected_leaves_dict
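
A hedged usage sketch; `paml_report` stands for whatever object exposes this method, and the values under the "W" key are presumably the per-leaf omega estimates:

    # Hypothetical usage: collect leaves whose W value exceeds 1 without writing a file.
    positive_leaves = paml_report.find_leaves_with_positive_selection(write=False)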
Example #10
    def get_monomer_len_file_from_trf_gff(trf_gff, len_file):
        len_dict = SynDict()

        with open(trf_gff, "r") as trf_fd:
            for line in trf_fd:
                if line[0] == "#":
                    continue
                description_dict = AnnotationsRoutines.get_description_dict_from_gff_string(
                    line)
                len_dict[description_dict["ID"]] = description_dict["Period"]
        # print len_dict
        len_dict.write(len_file)
Example #11
 def extract_hits_from_tbl_output(blast_hits, output_file):
     hits = SynDict()
     hits.read(blast_hits,
               allow_repeats_of_key=True,
               key_index=0,
               value_index=1,
               separator="\t")
     hits.write(output_file,
                splited_values=True,
                separator="\t",
                values_separator=",")
     return hits
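
A usage sketch under the same assumptions as the snippet (tabular BLAST output with the query id in column 0 and the subject id in column 1; file names are hypothetical):

    # Hypothetical usage: group subject ids per query id and write them as
    # comma-separated lists, one query per line.
    hits_per_query = extract_hits_from_tbl_output("blast_hits.tbl", "hits_per_query.tsv")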
Example #12
    def rename_elements_in_clusters(
            clusters_file,
            syn_file,
            output_clusters_file,
            remove_clusters_with_not_renamed_elements=False,
            elements_with_absent_synonyms_file=None,
            syn_file_key_column_index=0,
            syn_file_value_column_index=1,
            syn_file_column_separator='\t'):
        syn_dict = SynDict()
        syn_dict.read(syn_file,
                      comments_prefix="#",
                      key_index=syn_file_key_column_index,
                      value_index=syn_file_value_column_index,
                      separator=syn_file_column_separator)

        clusters_dict = SynDict()
        clusters_dict.read(clusters_file,
                           split_values=True,
                           values_separator=",",
                           comments_prefix="#")

        output_clusters_dict = SynDict()

        absent_elements_dict = SynDict()

        for cluster in clusters_dict:
            renamed_element_list = []
            all_elements_were_renamed_flag = True
            for element in clusters_dict[cluster]:
                if element in syn_dict:
                    renamed_element_list.append(syn_dict[element])
                else:
                    if cluster not in absent_elements_dict:
                        absent_elements_dict[cluster] = [element]
                    else:
                        absent_elements_dict[cluster].append(element)
                    all_elements_were_renamed_flag = False
                    renamed_element_list.append(element)

            if (not remove_clusters_with_not_renamed_elements) or all_elements_were_renamed_flag:
                output_clusters_dict[cluster] = renamed_element_list

        output_clusters_dict.write(output_clusters_file, splited_values=True)

        if elements_with_absent_synonyms_file:
            absent_elements_dict.write(elements_with_absent_synonyms_file,
                                       splited_values=True)

        return absent_elements_dict
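
A hedged call sketch for the method above (file names are hypothetical; the method is assumed to be callable directly, as its self-less signature suggests):

    # Hypothetical usage: rename cluster members via a two-column synonym table,
    # keeping clusters even if some members have no synonym, and recording those
    # members in "missing.fam".
    missing = rename_elements_in_clusters("clusters.fam",
                                          "old_to_new.syn",
                                          "clusters.renamed.fam",
                                          elements_with_absent_synonyms_file="missing.fam")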
Example #13
    def extract_GO_terms_from_emapper_annotation_file(emapper_annotation_file,
                                                      output_file):
        GO_terms_dict = SynDict(filename=emapper_annotation_file,
                                key_index=0,
                                value_index=5,
                                split_values=True,
                                values_separator=",",
                                comments_prefix="#",
                                separator="\t")
        GO_terms_dict.header = "#protein_id\tGO_terms"
        GO_terms_dict.write(output_file, header=True, splited_values=True)

        return GO_terms_dict
Example #14
    def extract_predicted_gene_names_from_emapper_annotation_file(
            emapper_annotation_file, output_file):
        extract_predicted_gene_names_dict = SynDict(
            filename=emapper_annotation_file,
            key_index=0,
            value_index=4,
            split_values=True,
            values_separator=",",
            comments_prefix="#",
            separator="\t")
        extract_predicted_gene_names_dict.header = "#protein_id\tpredicted_gene_name"
        extract_predicted_gene_names_dict.write(output_file,
                                                header=True,
                                                splited_values=True)

        return extract_predicted_gene_names_dict
Example #15
    def count_miRNA_reads(self, alignment_file, gff_file, output_prefix, annotation_file_type="GTF",
                          min_read_fraction_overlap=1.0, feature_type_to_use=None, attribute_type_to_use=None,
                          sample_name=None, stranded=1):

        no_multimapped_read_counts = "%s.no_multimapped_reads.count" % output_prefix
        with_multimapped_read_counts = "%s.with_multimapped_reads.count" % output_prefix
        all_adjusted_read_counts = "%s.all_adjusted_reads.count" % output_prefix

        self.count(alignment_file, gff_file, no_multimapped_read_counts, annotation_file_type=annotation_file_type,
                   min_read_fraction_overlap=min_read_fraction_overlap, feature_type_to_use=feature_type_to_use,
                   attribute_type_to_use=attribute_type_to_use, stranded=stranded)

        self.count(alignment_file, gff_file, with_multimapped_read_counts, count_multimapped_reads=True,
                   annotation_file_type=annotation_file_type,
                   min_read_fraction_overlap=min_read_fraction_overlap, feature_type_to_use=feature_type_to_use,
                   attribute_type_to_use=attribute_type_to_use, stranded=stranded)

        no_multimapped_read_count_dict = SynDict(filename=no_multimapped_read_counts, comments_prefix="#",
                                                 key_index=0, value_index=6, expression=int, header=True)
        with_multimapped_read_count_dict = SynDict(filename=with_multimapped_read_counts, comments_prefix="#",
                                                   key_index=0, value_index=6, expression=int, header=True)
        similar_feature_number_dict = SynDict(filename=with_multimapped_read_counts, comments_prefix="#", header=True,
                                              key_index=0, value_index=1, expression=lambda s: len(s.split(";")))

        sample_column_name = sample_name if sample_name else similar_feature_number_dict.header.split()[6]

        all_adjusted_read_count_dict = SynDict()
        all_adjusted_read_count_dict.header = ".\t%s" % sample_column_name

        #print no_multimapped_read_count_dict
        #print with_multimapped_read_count_dict
        #print similar_feature_number_dict

        for feature_id in no_multimapped_read_count_dict:
            all_adjusted_read_count_dict[feature_id] = int(ceil(float(no_multimapped_read_count_dict[feature_id]) + \
                                                            (float(with_multimapped_read_count_dict[feature_id]) - float(no_multimapped_read_count_dict[feature_id])) / float(similar_feature_number_dict[feature_id])))

        all_adjusted_read_count_dict.write(all_adjusted_read_counts, header=True)
Example #16
    def cluster_sequence_names_by_id_fragment(self,
                                              seq_id_list,
                                              id_element_index,
                                              id_separator="_",
                                              output_prefix=None):
        cluster_dict = SynDict()
        skipped_id_list = IdList()

        for seq_id in seq_id_list:
            seq_id_splited = seq_id.split(id_separator)
            if id_element_index < len(seq_id_splited):
                if seq_id_splited[id_element_index] in cluster_dict:
                    cluster_dict[seq_id_splited[id_element_index]].append(seq_id)
                else:
                    cluster_dict[seq_id_splited[id_element_index]] = [seq_id]
            else:
                skipped_id_list.append(seq_id)

        if output_prefix:
            cluster_dict.write("%s.seqid.clusters" % output_prefix,
                               splited_values=True)
            skipped_id_list.write("%s.seqid.skipped.ids" % output_prefix)

        return cluster_dict
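
A minimal usage sketch (the `routines` instance and the ids are hypothetical; only the signature above is assumed):

    # Hypothetical usage: cluster ids such as "scaf1_gene3" by their first
    # "_"-separated element; ids too short to have that element are skipped.
    clusters = routines.cluster_sequence_names_by_id_fragment(["scaf1_gene3", "scaf1_gene7", "scaf2_gene1"],
                                                              0,
                                                              id_separator="_",
                                                              output_prefix="by_scaffold")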
Example #17
    def rename_elements_in_clusters(
            clusters_file,
            syn_file,
            output_clusters_file,
            remove_clusters_with_not_renamed_elements=False,
            syn_file_key_column_index=0,
            syn_file_value_column_index=1,
            syn_file_column_separator='\t'):
        syn_dict = SynDict()
        syn_dict.read(syn_file,
                      comments_prefix="#",
                      key_index=syn_file_key_column_index,
                      value_index=syn_file_value_column_index,
                      separator=syn_file_column_separator)

        clusters_dict = SynDict()
        clusters_dict.read(clusters_file,
                           split_values=True,
                           values_separator=",",
                           comments_prefix="#")

        output_clusters_dict = SynDict()

        for cluster in clusters_dict:
            renamed_element_list = []
            for element in clusters_dict[cluster]:
                if element in syn_dict:
                    renamed_element_list.append(syn_dict[element])
                else:
                    renamed_element_list.append(element)
                    if remove_clusters_with_not_renamed_elements:
                        break
            else:
                output_clusters_dict[cluster] = renamed_element_list

        output_clusters_dict.write(output_clusters_file, splited_values=True)
Example #18
    def get_feature_dict(self, input_gff, output_prefix=None, feature_type_list=["CDS"], unification_key="Parent"):

        feature_dict = SynDict()
        for line_list in self.file_line_as_list_generator(input_gff, comments_prefix="#", separator="\t"):
            annotation_dict = self.parse_gff_annotation_string_to_dict(line_list[self.GFF_ATTRIBUTE_COLUMN])

            if line_list[self.GFF_FEATURETYPE_COLUMN] not in feature_type_list:
                continue

            if unification_key not in annotation_dict:
                continue
            #print unification_key
            #print(annotation_dict)

            if annotation_dict[unification_key][0] not in feature_dict:
                feature_dict[annotation_dict[unification_key][0]] = []

            feature_dict[annotation_dict[unification_key][0]].append([line_list[self.GFF_SCAFFOLD_COLUMN],
                                                                     line_list[self.GFF_START_COLUMN],
                                                                     line_list[self.GFF_END_COLUMN],
                                                                     line_list[self.GFF_STRAND_COLUMN]])

        if output_prefix:
            feature_dict.write("%s.tab" % output_prefix,
                               value_expression=self.feature_list_entry_to_tab_str,
                               line_per_value=True)
            feature_dict.write("%s.coordinates_only.tab" % output_prefix,
                               value_expression=self.feature_list_entry_to_tab_str,
                               line_per_value=True,
                               values_only=True)

            feature_dict.write("%s.list" % output_prefix,
                               value_expression=self.feature_list_entry_to_gatk_interval_str,
                               line_per_value=True)
            feature_dict.write("%s.coordinates_only.list" % output_prefix,
                               value_expression=self.feature_list_entry_to_gatk_interval_str,
                               line_per_value=True,
                               values_only=True)

        return feature_dict
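
A hedged usage sketch for get_feature_dict (the `annotation_routines` instance and file names are hypothetical; only the signature above is assumed):

    # Hypothetical usage: collect CDS coordinates grouped by their Parent id and
    # write the tab-separated and interval-list files with the "regions" prefix.
    cds_dict = annotation_routines.get_feature_dict("annotation.gff",
                                                    output_prefix="regions",
                                                    feature_type_list=["CDS"],
                                                    unification_key="Parent")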
Example #19
    def extract_eggnog_fam_by_protein_syn_dict(self, eggnog_fam_dict, protein_syn_dict, output_prefix=None, species_id=None):

        extracted_families = SynDict()
        common_protein_names_to_families_dict = SynDict()
        common_names_to_eggnog_proteins_syn_dict = SynDict()

        not_found_proteins_common_names = IdList()

        transposed_eggnog_fam_dict = eggnog_fam_dict.exchange_key_and_value()

        for common_protein_name in protein_syn_dict:
            not_found = True
            for protein_id in protein_syn_dict[common_protein_name]:
                extended_protein_id = protein_id if species_id is None else species_id + "." + protein_id
                if extended_protein_id in transposed_eggnog_fam_dict:
                    not_found = False
                    if common_protein_name not in common_protein_names_to_families_dict:
                        common_protein_names_to_families_dict[common_protein_name] = [transposed_eggnog_fam_dict[extended_protein_id][0]]
                        common_names_to_eggnog_proteins_syn_dict[common_protein_name] = [extended_protein_id]
                    else:
                        common_protein_names_to_families_dict[common_protein_name].append(transposed_eggnog_fam_dict[extended_protein_id][0])
                        common_names_to_eggnog_proteins_syn_dict[common_protein_name].append(extended_protein_id)
                    if transposed_eggnog_fam_dict[extended_protein_id][0] not in extracted_families:
                        extracted_families[transposed_eggnog_fam_dict[extended_protein_id][0]] = eggnog_fam_dict[transposed_eggnog_fam_dict[extended_protein_id][0]]

            if not_found:
                not_found_proteins_common_names.append(common_protein_name)

        if output_prefix:
            extracted_families.write(filename="%s.extracted_families.fam" % output_prefix, splited_values=True)
            common_protein_names_to_families_dict.write(filename="%s.common_protein_names_to_families.correspondence" % output_prefix, splited_values=True)
            common_names_to_eggnog_proteins_syn_dict.write(filename="%s.common_protein_names_to_eggnog_proteins.correspondence" % output_prefix, splited_values=True)
            not_found_proteins_common_names.write(filename="%s.not_found.common_names" % output_prefix)

            #print common_names_to_eggnog_proteins_syn_dict
            #print common_protein_names_to_families_dict
        return extracted_families, common_protein_names_to_families_dict, \
               common_names_to_eggnog_proteins_syn_dict, not_found_proteins_common_names
Example #20
out_fd = sys.stdout if args.output_prefix == "stdout" else open(
    "%s_reference_random_genes.ids" % args.output_prefix, "w")

reference_families = SynDict()
reference_families.read(args.reference_fam,
                        separator="\t",
                        split_values=True,
                        values_separator=",")

node_family_ids = IdList()
node_family_ids.read(args.input,
                     header=True,
                     column_number=0,
                     column_separator="\t")

reference_random_genes = SynDict()

for family_id in node_family_ids:
    if family_id not in reference_families:
        reference_random_genes[family_id] = "."
    else:
        reference_random_genes[family_id] = choice(
            reference_families[family_id])

reference_random_genes.write("%s_reference_random_genes.t" %
                             args.output_prefix)

for family_id in reference_random_genes:
    if reference_random_genes[family_id] != ".":
        out_fd.write("%s\n" % reference_random_genes[family_id])
Example #21
                    type=int,
                    help="Format of input trees")
parser.add_argument("-o",
                    "--output_file",
                    action="store",
                    dest="output_file",
                    default="stdout",
                    help="Output file with leaves of trees. Default: stdout")

args = parser.parse_args()

out_fd = sys.stdout if args.output_file == "stdout" else open(
    args.output_file, "w")

tree_files_list = os.listdir(args.tree_dir)

names_dict = SynDict()

for tree_file in tree_files_list:
    tree_name = split_filename(tree_file)[1]
    with open("%s%s" % (args.tree_dir, tree_file), "r") as tree_fd:
        tree = Tree(tree_fd.readline().strip(), format=args.tree_format)
    leaves_list = []
    for node in tree.traverse():
        if node.is_leaf():
            leaves_list.append(node.name)
    names_dict[tree_name] = leaves_list

names_dict.write(out_fd, splited_values=True)
if args.output_file != "stdout":
    out_fd.close()
Example #22
    def add_flanks_to_gff_record(self, input_gff, output_prefix, left_flank_len, right_flank_len, fasta_file,
                                 coords_description_entry="core_seq_coords", id_description_entry="ID"):
        sequence_length_dict = self.get_lengths_from_seq_file(fasta_file)
        shorter_flanks_dict = SynDict()

        output_gff = "%s.gff" % output_prefix
        short_flanks_file = "%s.short_flanks.dat" % output_prefix

        with open(input_gff, "r") as in_fd:
            with open(output_gff, "w") as out_fd:
                for line in in_fd:
                    if line[0] == "#":
                        out_fd.write(line)
                        continue
                    line_list = line.strip().split("\t")
                    scaffold = line_list[0]
                    start = int(line_list[3])
                    end = int(line_list[4])

                    record_id = OrderedDict(map(lambda s: s.split("="), line_list[8].split(";")))[id_description_entry]

                    line_list[8] += ";%s=%i,%i" % (coords_description_entry, start, end)

                    if line_list[6] == "-":
                        if start - right_flank_len > 0:
                            line_list[3] = str(start - right_flank_len)
                            right_flank_length = right_flank_len
                        else:
                            right_flank_length = start - 1
                            line_list[3] = "1"

                        if end + left_flank_len <= sequence_length_dict[line_list[0]]:
                            line_list[4] = str(end + left_flank_len)
                            left_flank_length = left_flank_len
                        else:
                            left_flank_length = sequence_length_dict[line_list[0]] - end
                            line_list[4] = str(sequence_length_dict[line_list[0]])
                    else:
                        if start - left_flank_len > 0:
                            line_list[3] = str(start - left_flank_len)
                            left_flank_length = left_flank_len
                        else:
                            left_flank_length = start - 1
                            line_list[3] = "1"

                        if end + right_flank_len <= sequence_length_dict[line_list[0]]:
                            line_list[4] = str(end + right_flank_len)
                            right_flank_length = right_flank_len
                        else:
                            right_flank_length = sequence_length_dict[line_list[0]] - end
                            line_list[4] = str(sequence_length_dict[line_list[0]])

                    if (left_flank_length < left_flank_len) or (right_flank_length < right_flank_len):
                        print("%s: Short flank" % record_id)
                        shorter_flanks_dict[record_id] = "%i,%i" % (left_flank_length, right_flank_length)
                    line_list[8] += ";%s_relative=%i,%i\n" % (coords_description_entry,
                                                              1 + (right_flank_length if line_list[6] == "-" else left_flank_length),
                                                              end - start + 1 + (right_flank_length if line_list[6] == "-" else left_flank_length))
                    """
                    print line
                    print line_list
                    for element in line_list:
                        print element
                        print type(element)
                    """
                    out_fd.write("\t".join(line_list))

        shorter_flanks_dict.write(short_flanks_file)
Example #23
    "--input",
    action="store",
    dest="input",
    required=True,
    type=make_list_of_path_to_files_from_comma_sep_string,
    help="Comma-separated list of fam files or directories with them")
parser.add_argument("-o",
                    "--output_file",
                    action="store",
                    dest="output",
                    default="stdout",
                    help="Output file")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")
family_dict = SynDict()

for filename in args.input:
    fam_dict = SynDict()
    fam_dict.read(filename, split_values=True)
    for family in fam_dict:
        if family not in family_dict:
            family_dict[family] = fam_dict[family]
        else:
            family_dict[family] += fam_dict[family]

family_dict.write(args.output, splited_values=True)

if args.output != "stdout":
    out_fd.close()
Example #24
File: Drawing.py Project: melakbet/MAVR
    def draw_length_histogram(sequence_dict,
                              output_prefix,
                              number_of_bins=None,
                              width_of_bins=None,
                              min_length=1,
                              max_length=None,
                              extensions=("png", "svg"),
                              legend_location='best'):
        length_dict = SynDict()

        for record in sequence_dict:
            length_dict[record] = len(sequence_dict[record].seq)

        length_dict.write("%s.len" % output_prefix)

        lengths = length_dict.values()

        max_len = max(lengths)
        min_len = min(lengths)
        median = np.median(lengths)
        mean = np.mean(lengths)

        if max_length is None:
            maximum_length = max_len
        else:
            maximum_length = max_length

        filtered = []

        if (maximum_length < max_len) and (min_length > 1):
            for entry in lengths:
                if min_length <= entry <= maximum_length:
                    filtered.append(entry)
        elif min_length > 1:
            for entry in lengths:
                if min_length <= entry:
                    filtered.append(entry)
        elif maximum_length < max_len:
            for entry in lengths:
                if entry <= maximum_length:
                    filtered.append(entry)
        else:
            filtered = lengths

        plt.figure(1, figsize=(6, 6))
        plt.subplot(1, 1, 1)

        if number_of_bins:
            bins = number_of_bins
        elif width_of_bins:
            bins = np.arange(min_length - 1,
                             maximum_length,
                             width_of_bins,
                             dtype=np.int32)
            bins[0] += 1
            bins = np.append(bins, [maximum_length])
        else:
            bins = 30
        plt.hist(filtered, bins=bins)
        plt.xlim(xmin=min_length, xmax=maximum_length)
        plt.xlabel("Length")
        plt.ylabel("N")
        plt.title("Distribution of sequence lengths")
        plt.legend(("Min: %i\nMax: %i\nMean: %i\nMedian: %i" %
                    (min_len, max_len, mean, median), ),
                   loc=legend_location)
        for ext in extensions:
            plt.savefig("%s.%s" % (output_prefix, ext))

        os.remove("temp.idx")
Example #25
    def replace_augustus_ids(augustus_gff, output_prefix, species_prefix=None, number_of_digits_in_id=8):

        output_gff = "%s.renamed.gff" % output_prefix
        genes_syn_file = "%s.gene.syn" % output_prefix
        transcripts_syn_file = "%s.transcript.syn" % output_prefix
        cds_syn_file = "%s.cds.syn" % output_prefix
        genes_syn_dict = SynDict()
        transcripts_syn_dict = SynDict()
        cds_syn_dict = SynDict()
        gene_counter = 0
        gene_id_template = "%sG%%0%ii" % (species_prefix, number_of_digits_in_id)
        transcripts_counter = 0
        transcript_id_template = "%sT%%0%ii" % (species_prefix, number_of_digits_in_id)
        cds_counter = 0
        cds_id_template = "%sC%%0%ii" % (species_prefix, number_of_digits_in_id)
        with open(augustus_gff, "r") as in_fd:
            with open(output_gff, "w") as out_fd:
                for line in in_fd:
                    tmp = line.strip()
                    if len(tmp) < 13:
                        out_fd.write(line)
                        continue
                    if tmp[:12] != "# start gene":
                        out_fd.write(line)
                        continue
                    augustus_gene_id = tmp.split(" ")[-1]
                    gene_counter += 1

                    gene_syn_id = gene_id_template % gene_counter
                    genes_syn_dict[augustus_gene_id] = gene_syn_id
                    augustus_transcript_id = ""
                    augustus_transcript_parent = ""
                    out_fd.write("# start gene %s\n" % gene_syn_id)
                    tmp = in_fd.next().strip()
                    while True:
                        while tmp[0] != "#":
                            tmp_list = tmp.split("\t")
                            feature_type = tmp_list[2]
                            edited_str = "\t".join(tmp_list[:-1])
                            info_field_list = tmp_list[-1].split(";")
                            if feature_type == "gene":
                                edited_str += "\tID=%s\n" % gene_syn_id
                            elif feature_type == "transcript":
                                for entry in info_field_list:
                                    if "ID" in entry:
                                        augustus_transcript_id = entry.split("=")[-1]
                                        if augustus_transcript_id not in transcripts_syn_dict:
                                            transcripts_counter += 1
                                            transcripts_syn_dict[augustus_transcript_id] = transcript_id_template % transcripts_counter
                                        transcript_syn_id = transcripts_syn_dict[augustus_transcript_id]
                                    if "Parent" in entry:
                                        augustus_transcript_parent = entry.split("=")[-1]
                                        if augustus_transcript_parent != augustus_gene_id:
                                            raise ValueError("Transcript parent id and gene id are not same!")
                                edited_str += "\tID=%s;Parent=%s\n" % (transcript_syn_id, gene_syn_id)
                            elif feature_type == "CDS":
                                for entry in info_field_list:
                                    if "ID" in entry:
                                        augustus_cds_id = entry.split("=")[-1]
                                        if augustus_cds_id not in cds_syn_dict:
                                            cds_counter += 1
                                            cds_syn_dict[augustus_cds_id] = cds_id_template % cds_counter
                                        cds_syn_id = cds_syn_dict[augustus_cds_id]
                                    if "Parent" in entry:
                                        augustus_cds_parent = entry.split("=")[-1]
                                        if augustus_cds_parent != augustus_transcript_id:
                                            raise ValueError("CDS parent id and transcript id are not same!")
                                edited_str += "\tID=%s;Parent=%s\n" % (cds_syn_id, transcript_syn_id)
                            elif (feature_type == "stop_codon") or (feature_type == "start_codon"):
                                for entry in info_field_list:
                                    if "Parent" in entry:
                                        augustus_feature_parent = entry.split("=")[-1]
                                        if augustus_feature_parent != augustus_transcript_id:
                                            raise ValueError("Feature parent id and transcript id are not same!")
                                edited_str += "\tParent=%s\n" % transcript_syn_id
                            else:
                                edited_str = tmp + "\n"

                            out_fd.write(edited_str)
                            tmp = in_fd.next().strip()
                        while tmp[0] == "#":
                            if "# end gene" in tmp:
                                break
                            out_fd.write(tmp + "\n")
                            tmp = in_fd.next().strip()
                        if "# end gene" in tmp:
                            break
                    out_fd.write("# end gene %s\n" % gene_syn_id)
        genes_syn_dict.write(genes_syn_file)
        transcripts_syn_dict.write(transcripts_syn_file)
        cds_syn_dict.write(cds_syn_file)
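
A hedged call sketch (file names and prefix are hypothetical; based only on the signature and id templates above):

    # Hypothetical usage: renumber AUGUSTUS gene/transcript/CDS ids with a species
    # prefix; with number_of_digits_in_id=8, gene ids become e.g. MSPG00000001.
    replace_augustus_ids("augustus.gff", "augustus_renamed", species_prefix="MSP")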
Example #26
                    help="Output file with collapsed strings")
parser.add_argument("-c", "--column_separator", action="store", dest="column_separator", default="\t",
                    help="Column separator. Default: '\\t'")
parser.add_argument("-v", "--value_separator", action="store", dest="value_separator", default=",",
                    help="Value separator. Default: ','")
parser.add_argument("-k", "--key_column", action="store", dest="key_column", default=0, type=int,
                    help="Column to be used as key(0-based). Default: 0")
parser.add_argument("-a", "--value_column", action="store", dest="value_column", default=1, type=int,
                    help="Column to be used as value(0-based). Default: 1")
parser.add_argument("-m", "--comments_prefix", action="store", dest="comments_prefix", default="#",
                    help="Prefix of strings(comments) to be ignored. Default: #")
parser.add_argument("-r", "--remove_value_repeats", action="store_true", dest="remove_value_repeats",
                    help="Remove repeats of values")
args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

syn_dict = SynDict()
syn_dict.read(args.input, header=False, separator=args.column_separator, allow_repeats_of_key=True,
              split_values=True, values_separator=args.value_separator,
              key_index=args.key_column, value_index=args.value_column,
              comments_prefix=args.comments_prefix)

if args.remove_value_repeats:
    collapsed_dict = syn_dict.remove_value_repeats()
    collapsed_dict.write(out_fd, splited_values=True, values_separator=args.value_separator,
                         close_after_if_file_object=True)
else:
    syn_dict.write(out_fd, splited_values=True, values_separator=args.value_separator,
                   close_after_if_file_object=True)
#out_fd.close()
Example #27
                    action="store",
                    dest="output",
                    default="stdout",
                    help="Output file")
parser.add_argument("-k",
                    "--family_column",
                    action="store",
                    dest="fam_col",
                    default=1,
                    type=int,
                    help="Family column position(0-based). Default: 1")
parser.add_argument("-a",
                    "--genes_column",
                    action="store",
                    dest="gen_col",
                    default=0,
                    type=int,
                    help="Genes column position(0-based). Default: 0")

args = parser.parse_args()

hit_dict = SynDict()

hit_dict.read(args.input,
              header=args.header,
              allow_repeats_of_key=True,
              key_index=args.fam_col,
              value_index=args.gen_col)

hit_dict.write(args.output, splited_values=True)
parser.add_argument("-d", "--id_file", action="store", dest="id_file", required=True,
                    help="File with ids of families to extract")
parser.add_argument("-o", "--output", action="store", dest="output", default="stdout",
                    help="File to write extracted families. Default - stdout")
parser.add_argument("-v", "--verbose", action="store_true", dest="verbose",
                    help="Print not found ids. Default - no")

args = parser.parse_args()

out_file = sys.stdout if args.output == "stdout" else open(args.output, "w")

fam_dict = SynDict()
fam_dict.read(args.input)

id_set = IdSet()
id_set.read(args.id_file)

extracted_dict = SynDict()
for id_entry in id_set:
    if id_entry in fam_dict:
        extracted_dict[id_entry] = fam_dict[id_entry]
    else:
        if args.verbose:
            print("%s was not found" % id_entry)

extracted_dict.write(out_file, close_after_if_file_object=True)




Example #29
File: NCBI.py Project: melakbet/MAVR
    def get_cds_for_proteins(self, protein_id_list, output_prefix, download_chunk_size=100, temp_dir_prefix="temp"):

        from Tools.Abstract import Tool

        transcript_temp_dir = "%s_transcripts" % temp_dir_prefix
        protein_temp_dir = "%s_proteins" % temp_dir_prefix
        number_of_ids = len(protein_id_list)
        print "Total %i ids" % number_of_ids

        for directory in transcript_temp_dir, protein_temp_dir:
            self.save_mkdir(directory)
        pep_file = "%s.pep.genbank" % output_prefix
        transcript_file = "%s.trascript.genbank" % output_prefix

        ranges = np.append(np.arange(0, number_of_ids, download_chunk_size), [number_of_ids])

        print "Downloading proteins..."
        for i in range(0, len(ranges)-1):
            print "Downloading chunk %i" % i
            pep_tmp_file = "%s/%s_%i" % (protein_temp_dir, pep_file, i)
            self.efetch("protein", protein_id_list[ranges[i]:ranges[i+1]], pep_tmp_file, rettype="gb", retmode="text")

        os.system("cat %s/* > %s" % (protein_temp_dir, pep_file))

        peptide_dict = SeqIO.index_db("tmp.idx", pep_file, format="genbank")
        downloaded_protein_ids = IdList(peptide_dict.keys())

        print "%i proteins were downloaded" % len(downloaded_protein_ids)
        not_downloaded_proteins_ids = Tool.intersect_ids([protein_id_list], [downloaded_protein_ids], mode="only_a")
        print "%i proteins were not downloaded" % len(not_downloaded_proteins_ids)
        not_downloaded_proteins_ids.write("%s.not_downloaded.ids" % output_prefix)
        downloaded_protein_ids.write("%s.downloaded.ids" % output_prefix)
        print Tool.intersect_ids([protein_id_list], [downloaded_protein_ids], mode="count")

        pep_without_transcripts = IdList()
        pep_with_several_CDS_features = IdList()
        pep_to_transcript_accordance = SynDict()
        transcript_ids = IdList()

        print "Extracting transcript ids corresponding to proteins..."
        for pep_id in peptide_dict:
            for feature in peptide_dict[pep_id].features:
                if feature.type == "CDS":
                    try:
                        transcript_id = feature.qualifiers["coded_by"][0].split(":")[0]
                        if pep_id not in pep_to_transcript_accordance:
                            pep_to_transcript_accordance[pep_id] = [transcript_id]
                        else:
                            pep_to_transcript_accordance[pep_id].append(transcript_id)
                            print("Genbank record for %s contains several CDS features" % pep_id)
                            pep_with_several_CDS_features.append(pep_id)
                        if transcript_id in transcript_ids:
                            print "Repeated transcript id: %s" % transcript_id
                            continue
                        transcript_ids.append(transcript_id)
                    except:
                        print "Transcript id for %s was not found" % pep_id
                        pep_without_transcripts.append(pep_id)

        pep_with_several_CDS_features.write("%s.pep_with_several_CDS.ids" % output_prefix)
        pep_without_transcripts.write("%s.pep_without_transcripts.ids" % output_prefix)
        transcript_ids.write("%s.transcripts.ids" % output_prefix)

        number_of_transcripts = len(transcript_ids)
        print "%i transcripts were found" % number_of_transcripts

        pep_to_transcript_accordance.write("%s.pep_to_transcript.accordance" % output_prefix, splited_values=True)

        transcript_ranges = np.append(np.arange(0, number_of_transcripts, download_chunk_size), [number_of_transcripts])

        print "Downloading transcripts..."
        for i in range(0, len(transcript_ranges)-1):
            print "Downloading chunk %i" % i
            transcript_tmp_file = "%s/%s_%i" % (transcript_temp_dir, transcript_file, i)
            self.efetch("nuccore", transcript_ids[transcript_ranges[i]:transcript_ranges[i+1]],
                        transcript_tmp_file, rettype="gb", retmode="text")

        os.system("cat %s/* > %s" % (transcript_temp_dir, transcript_file))


        transcript_dict = SeqIO.index_db("tmp_1.idx", transcript_file, format="genbank")

        cds_records_list = []
        for transcript_id in transcript_dict:
            for feature in transcript_dict[transcript_id].features:
                CDS_counter = 1
                if feature.type == "CDS":
                    #print feature

                    feature_seq = feature.extract(transcript_dict[transcript_id].seq)
                    feature_id = transcript_id  # the case of several CDS per transcript is not taken into account
                    if "protein_id" in feature.qualifiers:
                        description = "protein=%s" % feature.qualifiers["protein_id"][0]
                    else:
                        print "Corresponding protein id was not found for %s" % transcript_id
                        description = ""
                    cds_records_list.append(SeqRecord(seq=feature_seq, id=feature_id, description=description))
        SeqIO.write(cds_records_list, "%s.cds" % output_prefix, format="fasta")

        stat_string = "Input protein ids\t %i\n" % number_of_ids
        stat_string += "Downloaded proteins\t%i\n" % number_of_transcripts
        stat_string += "Downloaded transcripts\t%i\n" % len(transcript_dict)

        print stat_string

        with open("%s.stats" % output_prefix, "w") as stat_fd:
            stat_fd.write(stat_string)

        for filename in "tmp.idx", "tmp_1.idx":
            os.remove(filename)
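
A hedged usage sketch (the `ncbi_routines` object, ids, and output prefix are hypothetical; only the signature above is assumed):

    # Hypothetical usage: fetch GenBank records for a list of protein accessions in
    # chunks of 100 and extract the corresponding CDS sequences into "myset.cds".
    protein_ids = ["XP_000000001.1", "XP_000000002.1"]  # hypothetical accessions
    ncbi_routines.get_cds_for_proteins(protein_ids, "myset", download_chunk_size=100)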
Example #30
sl_keys = list(complicated_families_dict.sl_keys())
for sl_key in sl_keys:
    sp_set = set()
    for species in complicated_families_dict:
        if sl_key not in complicated_families_dict[species]:
            continue
        tmp = complicated_families_dict[species][sl_key].split(";")
        for i in range(0, len(tmp)):
            if "_" in tmp[i]:
                tmp[i] = tmp[i][2:]
            tmp[i] = tmp[i].split(",")
            for syn_id in tmp[i]:
                complicated_families_syn_ids.add(syn_id)
                sp_set.add(syn_id)
    complicated_families_syn_dict[sl_key] = sp_set
complicated_families_syn_dict.write("complicated_families_connections.t", splited_values=True)

for entry in complicated_families_dict.all_values():
    tmp = entry.split(";")
    for i in range(0, len(tmp)):
        if "_" in tmp[i]:
            tmp[i] = tmp[i][2:]
        tmp[i] = tmp[i].split(",")
        for syn_id in tmp[i]:
            complicated_families_syn_ids.add(syn_id)
complicated_families_syn_ids.write("complicated_families_check.ids")

nonassembled.write("splited_to_several_families.t", absent_symbol=".")

assemled_to_different_families = species_syn_dict.filter_by_line(filter_different_assembly)
species_syn_dict.write("correctly_assembled_families_in_all_species.t", absent_symbol=".")