Example #1
    def get_transcript_to_pep_accordance_from_gtf(gtf_file, output_file, comment_symbol="#"):
        """
        Tested on GTF files from Ensembl release 70
        """
        accordance_dict = SynDict()
        with open(gtf_file, "r") as gtf_fd:
            for line in gtf_fd:
                if line[0] == comment_symbol:
                    continue
                tmp_list = line.strip().split("\t")
                tmp_list = tmp_list[-1].split(";")
                protein_id = None
                transcript_id = None
                for entry in tmp_list:
                    tmp_entry = entry.split()

                    if len(tmp_entry) != 2:
                        continue
                    if tmp_entry[0] == "transcript_id":
                        #print "tttt"
                        transcript_id = tmp_entry[1][1:-1]  # remove quotes
                    elif tmp_entry[0] == "protein_id":
                        #print "ppppp"
                        protein_id = tmp_entry[1][1:-1]

                if (transcript_id is not None) and (protein_id is not None):
                    if transcript_id in accordance_dict:
                        accordance_dict[transcript_id].add(protein_id)
                    else:
                        accordance_dict[transcript_id] = {protein_id}
        accordance_dict.write(output_file, splited_values=True)
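For context, a minimal dependency-free sketch of the same attribute parsing with a plain dict in place of SynDict; the input file name is hypothetical:

    # Hypothetical standalone version: map transcript_id -> set of protein_id
    # taken from the attribute (last) column of a GTF file.
    transcript2pep = {}
    with open("annotation.gtf", "r") as gtf_fd:  # placeholder file name
        for line in gtf_fd:
            if line.startswith("#"):
                continue
            ids = {}
            for entry in line.strip().split("\t")[-1].split(";"):
                fields = entry.split()
                if len(fields) == 2:
                    ids[fields[0]] = fields[1].strip('"')  # drop surrounding quotes
            if ("transcript_id" in ids) and ("protein_id" in ids):
                transcript2pep.setdefault(ids["transcript_id"], set()).add(ids["protein_id"])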
Example #2
    def correct_regions_from_gff(
            self,
            reference,
            variants_vcf,
            gff_file,
            output_prefix=None,
            feature_type_list=["CDS"],
            unification_key="Parent",
            vcf_with_masking=None,
            override_vcf_by_mask=None,
            use_ambiguous_nuccleotides=None):

        feature_dict = AnnotationsRoutines.get_feature_dict(
            gff_file,
            output_prefix=output_prefix,
            feature_type_list=feature_type_list,
            unification_key=unification_key)
        region_file = "%s.coordinates_only.list" % output_prefix

        raw_regions = "%s.raw.seq" % output_prefix
        final_regions = "%s.fasta" % output_prefix

        regions_with_frameshift_file = "%s.frameshifts.region.ids" % output_prefix

        self.correct_reference(
            reference,
            raw_regions,
            variants_vcf,
            raw_seq_per_line=True,
            vcf_with_masking=vcf_with_masking,
            override_vcf_by_mask=override_vcf_by_mask,
            use_ambiguous_nuccleotides=use_ambiguous_nuccleotides,
            interval_list=region_file)

        region_with_frameshift = SynDict()

        def new_regions_generator():
            with open(raw_regions, "r") as in_fd:
                for region_id in feature_dict:
                    seq = ""
                    for i in range(0, len(feature_dict[region_id])):
                        seq_fragment = in_fd.readline().strip()
                        fragment_len = int(feature_dict[region_id][i][2]) - int(feature_dict[region_id][i][1]) + 1
                        if (fragment_len - len(seq_fragment)) % 3 != 0:
                            if region_id not in region_with_frameshift:
                                region_with_frameshift[region_id] = [i]
                            else:
                                region_with_frameshift[region_id].append(i)
                        seq += seq_fragment
                    yield SeqRecord(
                        seq=Seq(seq) if feature_dict[region_id][0][3] == "+"
                        else Seq(seq).reverse_complement(),
                        id=region_id,
                        description="")

        SeqIO.write(new_regions_generator(), final_regions, format="fasta")
        region_with_frameshift.write(regions_with_frameshift_file,
                                     splited_values=True)
Example #3
    def convert_emapper_annotation_file_to_fam(emapper_annotation_file,
                                               output_fam,
                                               eggnogdb_prefix=None,
                                               species_name=None,
                                               label_separator="."):
        fam_dict = SynDict()
        with open(emapper_annotation_file, "r") as annotations_fd:
            for line in annotations_fd:
                if line[0] == "#":
                    continue
                line_list = line.split("\t")

                fam_id = line_list[10].split("|")[0]
                if eggnogdb_prefix is not None:
                    fam_id = eggnogdb_prefix + fam_id

                gene_id = "%s%s%s" % (
                    species_name, label_separator,
                    line_list[0]) if species_name else line_list[0]

                if fam_id in fam_dict:
                    fam_dict[fam_id].append(gene_id)
                else:
                    fam_dict[fam_id] = [gene_id]

        fam_dict.write(filename=output_fam, splited_values=True)
Example #4
    def count_per_scaffold_feature_number(gff_file, out_file=None, feature_type_list=[]):
        feature_count_dict = SynDict()

        if feature_type_list:
            def check_feature_type(feature_type):
                return feature_type in feature_type_list
        else:
            def check_feature_type(feature_type):
                return True

        with open(gff_file, "r") as gff_fd:
            for line in gff_fd:
                if line[0] == "#":
                    continue
                line_list = line.split("\t")
                if check_feature_type(line_list[2]):
                    if line_list[0] in feature_count_dict:
                        feature_count_dict[line_list[0]] += 1
                    else:
                        feature_count_dict[line_list[0]] = 1

        if out_file:
            feature_count_dict.write(out_file)

        return feature_count_dict
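A hypothetical invocation; the GFF file name is a placeholder, and only mRNA and CDS lines are tallied per scaffold:

    counts = count_per_scaffold_feature_number("annotation.gff",
                                               out_file="feature_counts.tsv",
                                               feature_type_list=["mRNA", "CDS"])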
Example #5
    def prepare_data_for_target_alignment(self,
                                          query_fasta,
                                          target_fasta,
                                          correspondence_file,
                                          out_dir,
                                          correspondence_query_column=0,
                                          correspondence_target_column=1):

        query_dict = self.parse_seq_file(query_fasta, "parse")
        target_dict = self.parse_seq_file(target_fasta, "parse")

        self.safe_mkdir(out_dir)

        correspondence_dict = SynDict(filename=correspondence_file,
                                      allow_repeats_of_key=True,
                                      key_index=correspondence_query_column,
                                      value_index=correspondence_target_column)

        for query_id in correspondence_dict:
            query_outfile = "%s/%s.query.fasta" % (out_dir, query_id)
            target_outfile = "%s/%s.target.fasta" % (out_dir, query_id)

            SeqIO.write(self.record_by_id_generator(query_dict, [query_id]),
                        query_outfile,
                        format="fasta")
            SeqIO.write(self.record_by_id_generator(
                target_dict, correspondence_dict[query_id]),
                        target_outfile,
                        format="fasta")

        queries_with_targets_set = set(correspondence_dict.keys())
        queries_set = set(query_dict.keys())

        return queries_with_targets_set, queries_set - queries_with_targets_set
Example #6
    def get_codon_alignment_from_files(self,
                                       protein_aln_file,
                                       nucleotide_seq_file,
                                       codon_alignment_file,
                                       cds2protein_accordance_file=None,
                                       alignment_format="fasta",
                                       nucleotide_sequence_format="fasta",
                                       cds_index_file=None,
                                       retain_cds_index=False):
        protein_aln_dict = AlignIO.read(protein_aln_file,
                                        format=alignment_format)
        nucleotide_seq_dict = SeqIO.index_db(
            cds_index_file if cds_index_file else "nuc_tmp.idx",
            nucleotide_seq_file,
            format=nucleotide_sequence_format)

        protein2cds_accordance_dict = None
        if cds2protein_accordance_file:
            protein2cds_accordance_dict = SynDict()
            protein2cds_accordance_dict.read(cds2protein_accordance_file,
                                             key_index=1,
                                             value_index=0)

        self.get_codon_alignment(
            protein_aln_dict,
            nucleotide_seq_dict,
            codon_alignment_file,
            protein2cds_accordance_dict=protein2cds_accordance_dict)
        if (not cds_index_file) and (not retain_cds_index):
            os.remove("nuc_tmp.idx")
Example #7
    def add_length_to_accordance_file(accordance_file, length_file, output_prefix):

        accordance_dict = SynDict(filename=accordance_file, allow_repeats_of_key=True)
        length_dict = SynDict(filename=length_file, expression=int)
        longest_list = IdList()

        all_output_file = "%s.all.correspondence" % output_prefix
        longest_output_file = "%s.longest.correspondence" % output_prefix
        longest_id_file = "%s.longest.ids" % output_prefix

        with open(all_output_file, "w") as all_out_fd:
            with open(longest_output_file, "w") as longest_out_fd:
                for gene in accordance_dict:
                    current_transcript = None
                    current_length = 0
                    for transcript in accordance_dict[gene]:
                        if length_dict[transcript] > current_length:
                            current_transcript = transcript
                            current_length = length_dict[transcript]
                        all_out_fd.write("%s\t%s\t%i\n" % (gene, transcript, length_dict[transcript]))

                    longest_out_fd.write("%s\t%s\t%i\n" % (gene, current_transcript, current_length))
                    longest_list.append(current_transcript)
        longest_list.write(longest_id_file)
Example #8
    def count_column_values_from_file(self,
                                      input_file,
                                      column_number,
                                      output_file=None,
                                      separator="\t",
                                      comments_prefix="#",
                                      verbose=False):

        column_value_dict = SynDict()

        for line_list in self.file_line_as_list_generator(
                input_file, separator=separator,
                comments_prefix=comments_prefix):

            if line_list[column_number] in column_value_dict:
                column_value_dict[line_list[column_number]] += 1
            else:
                column_value_dict[line_list[column_number]] = 1

        if output_file:
            column_value_dict.write(output_file)

        if verbose:
            print("#Column %i (0-based) contains %i different values" %
                  (column_number, len(column_value_set)))

        return column_value_dict
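A hypothetical call on an instance of the containing class (`routines` is a placeholder name), tallying the third (0-based index 2) column of a tab-separated table:

    value_counts = routines.count_column_values_from_file("table.tsv",
                                                          2,
                                                          output_file="column_2.counts",
                                                          verbose=True)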
Example #9
    def extract_sequences_from_selected_clusters(
            self,
            clusters_id_file,
            cluster_file,
            seq_file,
            output_dir="./",
            seq_format="fasta",
            out_prefix=None,
            create_dir_for_each_cluster=False,
            skip_cluster_if_no_sequence_for_element=True):
        from Routines import SequenceRoutines, FileRoutines
        cluster_id_list = IdList()
        cluster_dict = SynDict()
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
        if clusters_id_file:
            cluster_id_list.read(clusters_id_file)
        cluster_dict.read(cluster_file,
                          split_values=True,
                          values_separator=",")
        protein_dict = SeqIO.index_db(
            "tmp.idx",
            FileRoutines.make_list_of_path_to_files(seq_file),
            format=seq_format)

        number_of_skipped_clusters = 0
        for fam_id in cluster_id_list if clusters_id_file else cluster_dict:

            if skip_cluster_if_no_sequence_for_element:
                absent_elements = self.check_absence_of_cluster_elements(
                    cluster_dict[fam_id], protein_dict)
                if absent_elements:
                    print "Skipping cluster %s due to absent element(%s)" % (
                        fam_id, ",".join(absent_elements))
                    number_of_skipped_clusters += 1
                    continue

            if fam_id in cluster_dict:
                if create_directory_for_each_cluster:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.fasta" % (fam_dir, out_prefix
                                               if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.fasta" % (out_dir, out_prefix
                                                if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, cluster_dict[fam_id], verbose=True),
                            out_file,
                            format=seq_format)

        os.remove("tmp.idx")
        print "%i of %i clusters were skipped due to absent elements" % (
            number_of_skipped_clusters, len(cluster_dict))

        return number_of_skipped_clusters
Example #10
File: PAML.py Project: melakbet/MAVR
 def find_leaves_with_positive_selection(self, write=True):
     leaf_values_dict = self.get_leaf_values(write=False)
     positive_selected_leaves_dict = SynDict()
     for leaf_name in leaf_values_dict["W"]:
         if leaf_values_dict["W"][leaf_name] > 1:
             positive_selected_leaves_dict[leaf_name] = leaf_values_dict["W"][leaf_name]
     if write:
         positive_selected_leaves_dict.write("leaves_with_positive_selection.t")
     return positive_selected_leaves_dict
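`W` here holds per-leaf omega (dN/dS) estimates, so leaves with omega > 1 are reported as positively selected. A hypothetical call, assuming `paml_report` is an instance of the class this method belongs to:

    positive_leaves = paml_report.find_leaves_with_positive_selection(write=False)
    # keys are leaf names, values are their omega (dN/dS) estimates, all > 1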
Example #11
    def get_monomer_len_file_from_trf_gff(trf_gff, len_file):
        len_dict = SynDict()

        with open(trf_gff, "r") as trf_fd:
            for line in trf_fd:
                if line[0] == "#":
                    continue
                description_dict = AnnotationsRoutines.get_description_dict_from_gff_string(
                    line)
                len_dict[description_dict["ID"]] = description_dict["Period"]
        len_dict.write(len_file)
Example #12
 def add_len_to_simple_output(top_hits_simple, len_file, out_file):
     len_dict = SynDict()
     len_dict.read(len_file)
     with open(top_hits_simple, "r") as in_fd:
         with open(out_file, "w") as out_fd:
             for line in in_fd:
                 tmp_list = line.strip().split("\t")
                 out_fd.write(
                     "%s\t%s\t%s\t%s\t%s\t%f\n" %
                     (tmp_list[0], len_dict[tmp_list[0]], tmp_list[3],
                      tmp_list[1], tmp_list[2],
                      (float(tmp_list[2]) - float(tmp_list[1]) + 1) /
                      float(len_dict[tmp_list[0]])))
Example #13
    def extract_GO_terms_from_emapper_annotation_file(emapper_annotation_file,
                                                      output_file):
        GO_terms_dict = SynDict(filename=emapper_annotation_file,
                                key_index=0,
                                value_index=5,
                                split_values=True,
                                values_separator=",",
                                comments_prefix="#",
                                separator="\t")
        GO_terms_dict.header = "#protein_id\tGO_terms"
        GO_terms_dict.write(output_file, header=True, splited_values=True)

        return GO_terms_dict
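A hypothetical invocation; the key/value indices above assume the eggNOG-mapper v1 annotation layout, where column 5 (0-based) holds the comma-separated GO terms:

    go_terms = extract_GO_terms_from_emapper_annotation_file("proteins.emapper.annotations",
                                                             "protein_GO_terms.tsv")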
Example #14
    def count_unique_positions_per_sequence_from_file(self,
                                                      alignment_file,
                                                      output_prefix,
                                                      format="fasta",
                                                      gap_symbol="-",
                                                      return_mode="absolute",
                                                      verbose=True):

        alignment = AlignIO.read(alignment_file, format=format)
        number_of_sequences = len(alignment)
        alignment_length = len(alignment[0])
        position_presence_matrix = self.get_position_presence_matrix(
            alignment, gap_symbol=gap_symbol, verbose=verbose)
        unique_position_count_dict = SynDict()
        unique_position_count_percent_dict = SynDict()

        for row in range(0, number_of_sequences):
            sequence_id = alignment[row].id
            unique_positions = 0
            for column in range(0, alignment_length):
                if position_presence_matrix[row, column] in (1, -1):
                    unique_positions += 1

            unique_position_count_dict[sequence_id] = unique_positions
            unique_position_count_percent_dict[sequence_id] = 100 * float(
                unique_positions) / (alignment_length -
                                     str(alignment[row].seq).count(gap_symbol))

        unique_position_count_dict.write("%s.absolute_counts" % output_prefix)
        unique_position_count_percent_dict.write("%s.percent_counts" %
                                                 output_prefix)

        return unique_position_count_dict if return_mode == "absolute" else unique_position_count_percent_dict
Example #15
 def replace_region_names_in_gff(input_gff, synonyms_file, output_gff):
     syn_dict = SynDict()
     syn_dict.read(synonyms_file, comments_prefix="#")
     with open(input_gff, "r") as in_fd:
         with open(output_gff, "w") as out_fd:
             for line in in_fd:
                 if line[0] == "#":
                     out_fd.write(line)
                 else:
                     line_list = line.split("\t")
                     if line_list[0] in syn_dict:
                         line_list[0] = syn_dict[line_list[0]]
                         out_fd.write("\t".join(line_list))
                     else:
                         out_fd.write(line)
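SynDict reads the synonyms as a two-column, tab-separated table (old name in the first column, replacement in the second); a hypothetical synonyms file might look like:

    #old_name	new_name
    scaffold_1	chr1
    scaffold_2	chr2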
Example #16
    def merge_clusters(clusters_dict,
                       label_species=False,
                       separator_for_labeling="_",
                       species_label_first=True):

        if species_label_first:
            label_sequence = lambda label, name: "%s%s%s" % (
                label, separator_for_labeling, name)
        else:
            label_sequence = lambda label, name: "%s%s%s" % (
                name, separator_for_labeling, label)
        if label_species:
            expression = label_sequence
        else:
            expression = lambda label, name: name

        merged_clusters = SynDict()
        for species in clusters_dict:
            for cluster in clusters_dict[species]:
                if cluster not in merged_clusters:
                    merged_clusters[cluster] = []
                for sequence_name in clusters_dict[species][cluster]:
                    merged_clusters[cluster].append(
                        expression(species, sequence_name))

        return merged_clusters
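A small worked example (hypothetical species and gene names) showing how labels are attached when merging:

    clusters_by_species = {
        "cat": {"FAM0001": ["geneA", "geneB"]},
        "dog": {"FAM0001": ["geneC"]},
    }
    merged = merge_clusters(clusters_by_species, label_species=True)
    # merged["FAM0001"] == ["cat_geneA", "cat_geneB", "dog_geneC"]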
Example #17
    def extract_single_copy_clusters_from_files(
            self,
            list_of_cluster_files,
            output_file,
            label_elements=False,
            separator="@",
            label_position="first",
            function_to_convert_filename_to_label=None):
        dict_of_cluster_dicts = OrderedDict()
        for filename in list_of_cluster_files:
            if function_to_convert_filename_to_label:
                label = function_to_convert_filename_to_label(filename)
            else:
                label = FileRoutines.split_filename(filename)[1]  # use basename as label

            dict_of_cluster_dicts[label] = SynDict()
            dict_of_cluster_dicts[label].read(filename,
                                              split_values=True,
                                              comments_prefix="#")

        sc_clusters_dict = self.extract_single_copy_clusters(
            dict_of_cluster_dicts,
            label_elements=label_elements,
            separator=separator,
            label_position=label_position)

        sc_clusters_dict.write(output_file, splited_values=True)

        return sc_clusters_dict
Example #18
    def replace_label(cluster_dict,
                      syn_dict=None,
                      old_separator="@",
                      old_label_position="first",
                      new_separator="@",
                      new_label_position="first"):
        new_cluster_dict = SynDict()
        for cluster in cluster_dict:
            new_cluster_dict[cluster] = []
            for element in cluster_dict[cluster]:
                tmp = element.split(old_separator)
                if old_label_position == "first":
                    label = tmp[0]
                    element_id = old_separator.join(tmp[1:])
                else:
                    label = tmp[-1]
                    element_id = old_separator.join(tmp[:-1])

                if new_label_position == 'first':
                    new_cluster_dict[cluster].append(
                        "%s%s%s" % (syn_dict[label] if syn_dict else label,
                                    new_separator, element_id))
                else:
                    new_cluster_dict[cluster].append(
                        "%s%s%s" % (element_id, new_separator,
                                    syn_dict[label] if syn_dict else label))

        return new_cluster_dict
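A worked example with hypothetical labels: move the label from prefix to suffix while renaming it through a synonyms dict:

    clusters = {"FAM0001": ["cat@geneA", "dog@geneB"]}
    renamed = replace_label(clusters,
                            syn_dict={"cat": "felis_catus", "dog": "canis_lupus"},
                            new_label_position="last")
    # renamed["FAM0001"] == ["geneA@felis_catus", "geneB@canis_lupus"]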
Example #19
    def split_proteins_per_species(self, dir_with_proteins, output_dir, input_format="fasta", output_format="fasta"):
        input_files = self.make_list_of_path_to_files([dir_with_proteins] if isinstance(dir_with_proteins, str) else dir_with_proteins)

        out_dir = self.check_path(output_dir)
        self.safe_mkdir(out_dir)

        protein_dict = SeqIO.index_db("temp.idx", input_files, format=input_format)

        syn_dict = SynDict()

        for protein_id in protein_dict:
            taxa_id = protein_id.split(".")[0]
            if taxa_id not in syn_dict:
                syn_dict[taxa_id] = []
            syn_dict[taxa_id].append(protein_id)

        def renamed_records_generator(record_dict, taxa_id):
            for record_id in syn_dict[taxa_id]:
                record = deepcopy(record_dict[record_id])
                record.id = ".".join(record_id.split(".")[1:])
                yield record

        for taxa_id in syn_dict:
            out_file = "%s%s.pep" % (out_dir, taxa_id)
            SeqIO.write(renamed_records_generator(protein_dict, taxa_id), out_file, format=output_format)
Example #20
    def extract_clusters_by_element_ids_from_file(self,
                                                  cluster_file,
                                                  element_file,
                                                  output_file,
                                                  mode="w",
                                                  cluster_column=0,
                                                  element_column=1,
                                                  column_separator="\t",
                                                  element_separator=",",
                                                  id_column=None):
        """"
        mode: "w" - if elements from element_id_list are present in cluster extracts only that elements
              "a" - if elements from element_id_list are present in cluster extracts all elements
        """
        cluster_dict = SynDict(filename=cluster_file,
                               split_values=True,
                               comments_prefix="#",
                               key_index=cluster_column,
                               value_index=element_column,
                               separator=column_separator,
                               values_separator=element_separator)

        element_id_list = IdList(filename=element_file,
                                 comments_prefix="#",
                                 column_number=id_column)
        extracted_clusters = self.extract_clusters_by_element_ids(
            cluster_dict, element_id_list, mode=mode)
        extracted_clusters.write(output_file, splited_values=True)
Example #21
    def calculate_fpkm_for_count_table(count_table_file, transcript_length_file, output_file,
                                       separator="\t"):
        length_dict = SynDict(filename=transcript_length_file, expression=int, comments_prefix="#")

        with open(count_table_file, "r") as in_fd:
            header_list = in_fd.readline().strip().split(separator)

            samples_list = header_list[1:]
            gene_list = IdList()
            count_list = []
            for line in in_fd:
                tmp = line.strip().split(separator)
                gene_list.append(tmp[0])
                count_list.append(list(map(float, tmp[1:])))  # list() so rows can be indexed repeatedly (Python 3)

            per_sample_total_counts = []

            for sample_index in range(0, len(samples_list)):
                total_counts = 0
                for gene_index in range(0, len(count_list)):
                    total_counts += count_list[gene_index][sample_index]
                per_sample_total_counts.append(total_counts)

        with open(output_file, "w") as out_fd:
            out_fd.write(separator.join(header_list) + "\n")
            for gene_index in range(0, len(count_list)):
                normalized_counts_list = []
                for sample_index in range(0, len(samples_list)):
                    gene_count = count_list[gene_index][sample_index] * (10**9) / length_dict[gene_list[gene_index]] / per_sample_total_counts[sample_index]
                    normalized_counts_list.append(gene_count)
                out_fd.write("%s\t%s\n" % (gene_list[gene_index], "\t".join(map(str, normalized_counts_list))))
Example #22
    def rename_scaffolds_in_gff(self, input_gff, syn_file, output_prefix, verbose=True):

        syn_dict = SynDict(filename=syn_file)
        skipped_id_list = IdSet()

        output_gff = "%s.renamed.gff" % output_prefix
        skipped_gff = "%s.skipped.gff" % output_prefix
        skipped_id_file = "%s.skipped_scaffolds.ids" % output_prefix

        with self.metaopen(input_gff, "r") as in_fd, \
             self.metaopen(output_gff, "w") as out_fd, \
             self.metaopen(skipped_gff, "w") as skipped_fd:

            for line in in_fd:
                if line[0] == "#":
                    out_fd.write(line)
                    continue  # comment lines are copied as-is, not renamed
                gff_list = line.split("\t")
                if gff_list[0] in syn_dict:
                    gff_list[0] = syn_dict[gff_list[0]]
                    out_fd.write("\t".join(gff_list))
                else:
                    skipped_fd.write(line)
                    skipped_id_list.add(gff_list[0])

        if verbose:
            print("Not renamed scaffolds: %i" % len(skipped_id_list))

        skipped_id_list.write(skipped_id_file)
Example #23
    def parse_regions(self, input_file, format="gff", comment_prefix="#", separator="\t", bed_format="0-based"):
        region_dict = SynDict()
        # All coordinates are converted to 0-based in python notation

        if format == "gff":
            for line in self.file_line_as_list_generator(input_file, comments_prefix=comment_prefix, separator=separator):
                if line[self.GFF_SCAFFOLD_COLUMN] not in region_dict:
                    region_dict[line[self.GFF_SCAFFOLD_COLUMN]] = [[int(line[self.GFF_START_COLUMN]) - 1,
                                                                    int(line[self.GFF_END_COLUMN])]]
                else:
                    region_dict[line[self.GFF_SCAFFOLD_COLUMN]].append([int(line[self.GFF_START_COLUMN]) - 1,
                                                                        int(line[self.GFF_END_COLUMN])])

        elif format == "bed":
            for line in self.file_line_as_list_generator(input_file, comments_prefix=comment_prefix, separator=separator):
                if line[self.BED_SCAFFOLD_COLUMN] not in region_dict:
                    region_dict[line[self.BED_SCAFFOLD_COLUMN]] = [[(int(line[self.BED_START_COLUMN]) - 1) if bed_format == "1-based" else int(line[self.BED_START_COLUMN]),
                                                                    int(line[self.BED_END_COLUMN])]]
                else:
                    region_dict[line[self.BED_SCAFFOLD_COLUMN]].append([(int(line[self.BED_START_COLUMN]) - 1) if bed_format == "1-based" else int(line[self.BED_START_COLUMN]),
                                                                        int(line[self.BED_END_COLUMN])])

        elif format == "gatk":
            pass

        return region_dict
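A hypothetical call on a routines instance; GFF coordinates are 1-based inclusive, so a feature spanning 100..200 comes back as the 0-based half-open pair [99, 200]:

    regions = routines.parse_regions("regions.gff", format="gff")
    # e.g. regions["chr1"] == [[99, 200]] for a feature with start=100, end=200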
Example #24
    def add_gene_synonyms(self,
                          input_file,
                          output_file,
                          synonym_file,
                          key_column=0,
                          value_column=1,
                          header_name_for_synonym="Common_name",
                          snpeff_tab_column_id_column=8):
        synonym_dict = SynDict(filename=synonym_file,
                               key_index=key_column,
                               value_index=value_column,
                               comments_prefix="#")
        with open(input_file, "r") as in_fd, open(output_file, "w") as out_fd:
            header = in_fd.readline().strip() + "\t%s\n" % header_name_for_synonym
            out_fd.write(header)

            for line in in_fd:
                tmp = line.strip().split("\t")
                gene_name = tmp[snpeff_tab_column_id_column]
                tmp.append(synonym_dict[gene_name] if gene_name in synonym_dict else "")
                out_fd.write("\t".join(tmp) + "\n")
Example #25
 def add_add_new_column_by_key_column(self,
                                      table_file,
                                      syn_dict_file,
                                      key_column,
                                      output_file,
                                      new_column_name=None,
                                      separator='\t',
                                      absent_value="."):
     column_syn_dict = SynDict(filename=syn_dict_file,
                               allow_repeats_of_key=True,
                               values_separator="@")
     with open(table_file, "r") as in_fd, open(output_file, "w") as out_fd:
         if new_column_name:
             header_line = in_fd.readline().strip() + "\t%s\n" % new_column_name
             out_fd.write(header_line)
         for line in in_fd:
             line_list = line.strip().split(separator)
             line_list.append(absent_value if line_list[key_column] not in column_syn_dict
                              else "|".join(column_syn_dict[line_list[key_column]]))
             out_fd.write(separator.join(line_list) + "\n")
Example #26
File: File.py Project: mahajrod/GAVGAV
    def replace_column_value_by_syn(input_file, syn_file, out_file, column=0, comment_prefix=None, separator="\t",
                                    syn_header=False, syn_separator="\t",
                                    syn_key_index=0, syn_value_index=1, syn_comment_prefix=None):
        syn_dict = SynDict(filename=syn_file, header=syn_header, separator=syn_separator, key_index=syn_key_index,
                           value_index=syn_value_index, comments_prefix=syn_comment_prefix)
        if comment_prefix:
            comment_prefix_len = len(comment_prefix)
        line_number = 0
        replaced = 0
        not_replaced = 0
        with open(input_file, "r") as in_fd:
            with open(out_file, "w") as out_fd:
                for line in in_fd:
                    line_number += 1
                    if comment_prefix:
                        if line[0:comment_prefix_len] == comment_prefix:
                            out_fd.write(line)
                            continue
                    line_list = line.strip("\n").split(separator)
                    if len(line_list) < column + 1:
                        sys.stderr.write("WARNING!!! Line %i doesn't have column %i\n" % (line_number, column))
                        out_fd.write(line)
                        continue
                    if line_list[column] in syn_dict:
                        replaced += 1
                        line_list[column] = syn_dict[line_list[column]]
                    else:
                        not_replaced += 1

                    out_fd.write(separator.join(line_list))
                    out_fd.write("\n")

        sys.stderr.write("Replaced: %i\nNot replaced: %i\n" % (replaced, not_replaced))
Example #27
    def extract_predicted_gene_names_from_emapper_annotation_file(
            emapper_annotation_file, output_file):
        extract_predicted_gene_names_dict = SynDict(
            filename=emapper_annotation_file,
            key_index=0,
            value_index=4,
            split_values=True,
            values_separator=",",
            comments_prefix="#",
            separator="\t")
        extract_predicted_gene_names_dict.header = "#protein_id\tpredicted_gene_name"
        extract_predicted_gene_names_dict.write(output_file,
                                                header=True,
                                                splited_values=True)

        return extract_predicted_gene_names_dict
Example #28
File: NCBI.py Project: melakbet/MAVR
    def get_taxonomy(taxa_list, output_file, email, input_type="latin"):
        Entrez.email = email
        out_file = open(output_file, "w")
        out_file.write("#species\tlineage\n")

        species_syn_dict = SynDict()

        if input_type == "latin":
            for taxon in taxa_list:
                print("Handling %s" % taxon)
                summary = Entrez.read(Entrez.esearch(db="taxonomy", term=taxon))
                if summary:
                    id_list = summary["IdList"]
                    species_syn_dict[taxon] = []
                    for taxon_id in id_list:
                        print("Handling %s" % taxon_id)
                        record = Entrez.read(Entrez.efetch(db="taxonomy", id=taxon_id, retmode="xml"))
                        out_file.write("%s\t%s\t%s\n" % (taxon, record[0]["Rank"], record[0]["Lineage"]))
                        species_syn_dict[taxon].append(record[0]["ScientificName"])
        elif input_type == "id":
            for taxon in taxa_list:
                print("Handling %s" % taxon)
                species_syn_dict[taxon] = []
                record = Entrez.read(Entrez.efetch(db="taxonomy", id=taxon, retmode="xml"))
                out_file.write("%s\t%s\t%s\n" % (taxon, record[0]["Rank"], record[0]["Lineage"]))
                species_syn_dict[taxon].append(record[0]["ScientificName"])

        out_file.close()
        return species_syn_dict
Example #29
    def get_species_from_eggnog_tsv(self,
                                    eggnog_tsv,
                                    output_prefix,
                                    email=None):

        cluster_dict = SynDict(filename=eggnog_tsv,
                               key_index=1,
                               value_index=5,
                               split_values=True)

        species_ids = self.extract_labels_from_cluster_elements(
            cluster_dict, separator=".", label_position="first")

        if not email:
            species = species_ids
        else:
            species = NCBIRoutines.get_taxonomy(species_ids,
                                                "%s.species.taxonomy" %
                                                output_prefix,
                                                email,
                                                input_type="id")

        species.write("%s.species" % output_prefix, splited_values=True)

        for species_id in species:
            for i in range(0, len(species[species_id])):
                species[species_id][i] = species[species_id][i].lower().replace(" ", "_")

        species.write("%s.replaced_spaces.species" % output_prefix,
                      splited_values=True)
Example #30
    def label_cluster_elements_from_file(self,
                                         input_file,
                                         label,
                                         output_file,
                                         separator="@",
                                         label_position="first"):
        input_dict = SynDict()
        input_dict.read(input_file, split_values=True, comments_prefix="#")

        output_dict = self.label_cluster_elements(
            input_dict,
            label,
            separator=separator,
            label_position=label_position)
        output_dict.write(output_file, splited_values=True)

        return output_dict
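A hypothetical call on a routines instance, prefixing every cluster element with a species label (e.g. "geneA" becomes "felis_catus@geneA"):

    labeled = routines.label_cluster_elements_from_file("clusters.fam",
                                                        "felis_catus",
                                                        "clusters.labeled.fam")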