Example #1
    def extract_annotation_by_refence_id(list_of_target_gff, id_file,
                                         extracted_gff, filtered_out_gff):
        ids = IdList()
        ids.read(id_file)
        extracted_gff_fd = open(extracted_gff, "w")
        filtered_out_gff_fd = open(filtered_out_gff, "w")
        for filename in list_of_target_gff:
            with open(filename, "r") as in_fd:
                for line in in_fd:
                    tmp = line
                    if tmp == "# --- START OF GFF DUMP ---\n":
                        # skip comment lines until the line carrying the target name appears
                        while tmp[0] == "#":
                            tmp = next(in_fd, "")

                        target_name = tmp.split("\t")[8].split(
                            ";")[1].split()[1]
                        if target_name not in ids:
                            writing_fd = filtered_out_gff_fd

                        else:
                            writing_fd = extracted_gff_fd
                        # print target_name
                        writing_fd.write(tmp)
                        while True:
                            tmp = next(in_fd, "")
                            if tmp == "# --- END OF GFF DUMP ---\n":
                                break
                            writing_fd.write(tmp)
                    if tmp == "":
                        break
        extracted_gff_fd.close()
        filtered_out_gff_fd.close()
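
The "# --- START OF GFF DUMP ---" / "# --- END OF GFF DUMP ---" markers and the "sequence <name>" attribute parsed above (and again in Example #4) match exonerate-style GFF dumps. As a hedged illustration only, with an invented attribute line whose field order may differ from real output, this is how the ninth column yields the target name:

    # Invented example line from one GFF dump block; only the 9th (attribute) column matters here.
    example_line = ("scaffold_1\texonerate:protein2genome\tgene\t1200\t1890\t523\t+\t.\t"
                    "gene_id 0 ; sequence QUERY_PROTEIN_1 ; gene_orientation +\n")
    target_name = example_line.split("\t")[8].split(";")[1].split()[1]
    print(target_name)  # -> QUERY_PROTEIN_1
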
Example #2
    def get_longest_pep_per_gene_from_ensembl_pep_dict(protein_dict,
                                                       output_prefix=None):
        if output_prefix:
            length_file = "%s.protein_length.tsv" % output_prefix
            longest_protein_id_file = "%s.longest_pep.ids" % output_prefix

            len_fd = open(length_file, 'w')
            len_fd.write("#gene_id\tprotein_id\tprotein_length\n")

        data_dict = OrderedDict()
        for protein_id in protein_dict:
            length = len(protein_dict[protein_id].seq)
            description_list = protein_dict[protein_id].description.split()

            for entry in description_list:
                if "gene:" in entry:
                    gene_id = entry.split(":")[1]
            if output_prefix:
                len_fd.write("%s\t%s\t%i\n" % (gene_id, protein_id, length))
            if gene_id not in data_dict:
                data_dict[gene_id] = protein_id
            else:
                if length > len(protein_dict[data_dict[gene_id]].seq):
                    data_dict[gene_id] = protein_id

        longest_pep_ids = IdList(data_dict.values())
        if output_prefix:
            longest_pep_ids.write(longest_protein_id_file)
            len_fd.close()
        return longest_pep_ids
Example #3
    def seq_ids(self):
        id_list = IdList()

        for record in self.records:
            id_list.append(record.id)

        return id_list
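
Every example on this page relies on the same small surface of IdList (imported from RouToolPa.Collections.General): construction from an iterable or a file, read/write helpers, append, and membership tests. As a rough mental model only, not the library's actual implementation (whose read/write methods take extra options such as header and id_in_column_separator used below), it behaves like a thin list subclass:

    # Hedged stand-in for IdList: a list of string ids with simple file helpers.
    class MiniIdList(list):
        def read(self, filename, comments_prefix="#", column_number=None,
                 column_separator="\t"):
            with open(filename) as in_fd:
                for line in in_fd:
                    if comments_prefix and line.startswith(comments_prefix):
                        continue
                    fields = line.strip().split(column_separator)
                    self.append(fields[column_number if column_number is not None else 0])
            return self

        def write(self, filename):
            with open(filename, "w") as out_fd:
                for entry in self:
                    out_fd.write(entry + "\n")
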
Example #4
    def extract_top_hits_from_target_gff(list_of_target_gff,
                                         top_hits_gff,
                                         secondary_hits_gff,
                                         id_white_list_file=None,
                                         max_hits_per_query=None):
        if id_white_list_file:
            white_ids = IdList()
            white_ids.read(id_white_list_file)
        top_hits_gff_fd = open(top_hits_gff, "w")
        secondary_hits_gff_fd = open(secondary_hits_gff, "w")
        targets_list = []
        hit_counter = 0
        gene_counter = 0
        for filename in list_of_target_gff:
            index = 0
            with open(filename, "r") as in_fd:
                for line in in_fd:
                    tmp = line
                    if tmp == "# --- START OF GFF DUMP ---\n":
                        # skip comment lines until the line carrying the target name appears
                        while tmp[0] == "#":
                            tmp = next(in_fd, "")

                        target_name = tmp.split("\t")[8].split(
                            ";")[1].split()[1]
                        if id_white_list_file:
                            if target_name not in white_ids:
                                continue
                        if target_name not in targets_list:
                            writing_fd = top_hits_gff_fd
                            targets_list.append(target_name)
                            gene_counter += 1
                            hit_counter = 0
                        else:
                            writing_fd = secondary_hits_gff_fd
                        # print target_name
                        hit_counter += 1
                        tmp = tmp.replace(
                            "gene_id 0",
                            "gene_id g%i_h%i" % (gene_counter, hit_counter))
                        if (max_hits_per_query is None) or (hit_counter <= max_hits_per_query):
                            writing_fd.write(tmp)

                        while True:
                            tmp = next(in_fd, "")
                            # print("cccc")

                            if tmp == "# --- END OF GFF DUMP ---\n":
                                break
                            if max_hits_per_query:
                                if hit_counter > max_hits_per_query:
                                    #print "aaaaa"
                                    continue
                            writing_fd.write(tmp)
                    if tmp == "":
                        break
        top_hits_gff_fd.close()
        secondary_hits_gff_fd.close()
Example #5
    def create_per_cluster_element_id_files(self, cluster_dict,
                                            output_directory):
        self.safe_mkdir(output_directory)

        for cluster_id in cluster_dict:
            cluster_element_id_list = IdList(cluster_dict[cluster_id])
            cluster_element_id_list.write("%s/%s.ids" %
                                          (output_directory, cluster_id))
Example #6
def handle_input(filename):
    sys.stdout.write("Handling %s\n" % filename)
    not_significant_ids = IdList()
    not_found_ids = IdList()

    prefix = FileRoutines.split_filename(filename)[1]
    index_file = "%s.tmp.idx" % prefix
    hmm_dict = SearchIO.index_db(index_file, filename, args.format)
    if args.output == "stdout":
        out_fd = sys.stdout
    else:
        out_fd = open("%s%s.top_hits" % (args.top_hits_dir, prefix), "w")
        out_fd.write("#query\thit\tevalue\tbitscore\n")

    for query in hmm_dict:
        if hmm_dict[query].hits:
            if hmm_dict[query][0].is_included:
                out_fd.write(
                    "%s\t%s\t%s\t%s\n" %
                    (query, hmm_dict[query][0].id, hmm_dict[query][0].evalue,
                     hmm_dict[query][0].bitscore))
            else:
                not_significant_ids.append(query)
        else:
            not_found_ids.append(query)

    if args.output != "stdout":
        out_fd.close()

    os.remove(index_file)
    return not_significant_ids, not_found_ids
Example #7
    def extract_clusters_by_element_ids_from_file(self,
                                                  cluster_file,
                                                  element_file,
                                                  output_file,
                                                  mode="w",
                                                  cluster_column=0,
                                                  element_column=1,
                                                  column_separator="\t",
                                                  element_separator=",",
                                                  id_column=None):
        """"
        mode: "w" - if elements from element_id_list are present in cluster extracts only that elements
              "a" - if elements from element_id_list are present in cluster extracts all elements
        """
        cluster_dict = SynDict(filename=cluster_file,
                               split_values=True,
                               comments_prefix="#",
                               key_index=cluster_column,
                               value_index=element_column,
                               separator=column_separator,
                               values_separator=element_separator)

        element_id_list = IdList(filename=element_file,
                                 comments_prefix="#",
                                 column_number=id_column)
        extracted_clusters = self.extract_clusters_by_element_ids(
            cluster_dict, element_id_list, mode=mode)
        extracted_clusters.write(output_file, splited_values=True)
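
To make the docstring's two modes concrete, here is a self-contained sketch with invented data; it mirrors the documented behaviour, not the library's actual extract_clusters_by_element_ids implementation:

    # Invented data illustrating mode="w" vs mode="a".
    cluster_dict = {"fam1": ["p1", "p2", "p3"], "fam2": ["p4"]}
    element_id_list = ["p2", "p4"]

    mode_w = {cl: [el for el in members if el in element_id_list]
              for cl, members in cluster_dict.items()
              if any(el in element_id_list for el in members)}
    mode_a = {cl: members for cl, members in cluster_dict.items()
              if any(el in element_id_list for el in members)}
    # mode_w == {"fam1": ["p2"], "fam2": ["p4"]}
    # mode_a == {"fam1": ["p1", "p2", "p3"], "fam2": ["p4"]}
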
Example #8
    def extract_entries_by_GO_from_eggnogmapper_output(eggnogmapper_output, GO_file, output_prefix,
                                                       comments_prefix="#", separator="\t",
                                                       ):

        GO_list = IdList(filename=GO_file, column_number=0)

        #print "GOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO"
        #print GO_list
        print(len(GO_list))

        extracted_entries_file = "%s.annotations" % output_prefix

        extracted_entries = 0

        with open(eggnogmapper_output, "r") as eggnog_fd:
            with open(extracted_entries_file, "w") as out_fd:
                for line in eggnog_fd:
                    if line[0] == comments_prefix:
                        out_fd.write(line)
                        continue
                    line_list = line.strip().split(separator)
                    entry_GO_list = line_list[5].split(",")
                    #print entry_GO_list
                    for GO in entry_GO_list:
                        if GO in GO_list:
                            out_fd.write(line)
                            extracted_entries += 1
                            break

        print("Extracted %i entries" % extracted_entries)
Example #9
    def filter_psl_by_ids_from_file(self,
                                    psl_file,
                                    output_file,
                                    white_query_id_file=None,
                                    black_query_id_file=None,
                                    white_target_id_file=None,
                                    black_target_id_file=None):

        self.filter_psl_by_ids(
            psl_file,
            output_file,
            white_query_id_list=IdList(
                filename=white_query_id_file) if white_query_id_file else (),
            black_query_id_list=IdList(
                filename=black_query_id_file) if black_query_id_file else (),
            white_target_id_list=IdList(
                filename=white_target_id_file) if white_target_id_file else (),
            black_target_id_list=IdList(
                filename=black_target_id_file) if black_target_id_file else ())
Example #10
    def extract_evidence_by_ids(evidence_file,
                                id_file,
                                output_evidence_file,
                                mode="transcript"):
        # possible modes: transcript, gene
        ids = IdList()
        ids.read(id_file, comments_prefix="#")

        column_id = 0 if mode == "gene" else 1

        with open(evidence_file, "r") as ev_fd:
            with open(output_evidence_file, "w") as out_fd:
                for line in ev_fd:
                    if line[0] == "#":
                        out_fd.write(line)
                        continue

                    entry_id = line.split("\t")[column_id]
                    if entry_id in ids:
                        out_fd.write(line)
Example #11
 def extract_emapper_annotations_by_protein_ids(emapper_annotation_file,
                                                protein_id_file,
                                                output_annotations):
     protein_ids = IdList(filename=protein_id_file)
     with open(emapper_annotation_file, "r") as ann_fd:
         with open(output_annotations, "w") as out_fd:
             for line in ann_fd:
                 if line[0] == "#":
                     out_fd.write(line)
                     continue
                 if line.split("\t")[0] in protein_ids:
                     out_fd.write(line)
Example #12
    def extract_proteins_from_selected_families(
            families_id_file,
            fam_file,
            pep_file,
            output_dir="./",
            pep_format="fasta",
            out_prefix=None,
            create_dir_for_each_family=False):
        from RouToolPa.Routines import SequenceRoutines

        fam_id_list = IdList()
        fam_dict = SynDict()
        #print(pep_file)
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
        if families_id_file:
            fam_id_list.read(families_id_file)
        fam_dict.read(fam_file, split_values=True, values_separator=",")
        protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

        for fam_id in fam_id_list if families_id_file else fam_dict:
            if fam_id in fam_dict:
                if create_directory_for_each_family:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.pep" % (fam_dir, out_prefix
                                             if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.pep" % (out_dir, out_prefix
                                              if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, fam_dict[fam_id], verbose=True),
                            out_file,
                            format=pep_format)
            else:
                print("%s was not found" % fam_id)

        os.remove("tmp.idx")
Example #13
    def add_length_to_accordance_file(accordance_file, length_file,
                                      output_prefix):

        accordance_dict = SynDict(filename=accordance_file,
                                  allow_repeats_of_key=True)
        length_dict = SynDict(filename=length_file, expression=int)
        print(length_dict)
        longest_list = IdList()

        all_output_file = "%s.all.correspondence" % output_prefix
        longest_output_file = "%s.longest.correspondence" % output_prefix
        longest_id_file = "%s.longest.ids" % output_prefix

        with open(all_output_file, "w") as all_out_fd:
            with open(longest_output_file, "w") as longest_out_fd:
                for gene in accordance_dict:
                    current_transcript = None
                    current_length = 0
                    for transcript in accordance_dict[gene]:
                        if length_dict[transcript] > current_length:
                            current_transcript = transcript
                            current_length = length_dict[transcript]
                        all_out_fd.write(
                            "%s\t%s\t%i\n" %
                            (gene, transcript, length_dict[transcript]))

                    longest_out_fd.write(
                        "%s\t%s\t%i\n" %
                        (gene, current_transcript, current_length))
                    longest_list.append(current_transcript)
        longest_list.write(longest_id_file)
Example #14
    def cluster_sequence_names_by_id_fragment_from_file(
            self,
            seq_id_file,
            id_element_index,
            id_separator="_",
            output_prefix=None):

        seq_id_list = IdList(filename=seq_id_file)

        self.cluster_sequence_names_by_id_fragment(seq_id_list,
                                                   id_element_index,
                                                   id_separator=id_separator,
                                                   output_prefix=output_prefix)
Example #15
    def remove_elements_by_ids_from_files(self,
                                          input_file,
                                          output_file,
                                          black_list_file,
                                          mode="full"):

        cluster_dict = SynDict(filename=input_file, split_values=True)
        black_list = IdList(filename=black_list_file)

        filtered_dict = self.remove_elements_by_ids(cluster_dict,
                                                    black_list,
                                                    mode=mode)

        filtered_dict.write(output_file, splited_values=True)
Example #16
    def extract_clusters_and_elements_by_labels_from_files(
            self,
            cluster_file,
            label_file,
            output_file,
            separator="@",
            label_position="first"):
        cluster_dict = SynDict(filename=cluster_file, split_values=True)
        label_list = IdList(
            filename=label_file) if isinstance(label_file, str) else label_file

        output_dict = self.extract_clusters_and_elements_by_labels(
            cluster_dict,
            label_list,
            separator=separator,
            label_position=label_position)

        output_dict.write(output_file, splited_values=True)
Example #17
 def convert_gff_to_simple_bed(input_gff,
                               output_bed,
                               feature_type_list=[],
                               scaffold_id_file=None):
     if scaffold_id_file:
         scaffolds_id_list = IdList(filename=scaffold_id_file)
     with open(input_gff, "r") as gff_fd:
         with open(output_bed, "w") as bed_fd:
             for line in gff_fd:
                 if line[0] == "#":
                     continue
                 tmp_list = line.strip().split("\t")
                 if scaffold_id_file:
                     if tmp_list[0] not in scaffolds_id_list:
                         continue
                 if feature_type_list:
                     if tmp_list[2] not in feature_type_list:
                         continue
                 bed_fd.write("%s\t%s\t%s\n" %
                              (tmp_list[0], tmp_list[3], tmp_list[4]))
Example #18
 def extract_ids_from_file(input_file,
                           output_file=None,
                           header=False,
                           column_separator="\t",
                           comments_prefix="#",
                           column_number=None):
     id_list = IdList()
     id_list.read(input_file,
                  column_separator=column_separator,
                  comments_prefix=comments_prefix,
                  column_number=column_number,
                  header=header)
     if output_file:
         id_list.write(output_file, header=header)
     return id_list
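
A hedged usage sketch for the helper above, with invented file names and the assumption that header=True tells read/write to handle a header line; it pulls the second column of a tab-separated table into an id file and also returns the list:

    # Hypothetical call; "hits.tsv" and "hit.ids" are placeholder file names.
    hit_ids = extract_ids_from_file("hits.tsv",
                                    output_file="hit.ids",
                                    header=True,
                                    column_number=1)
    print(len(hit_ids))
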
Example #19
    def combine_ds_dn_w_from_bootstrap_data(self,
                                            input_dir,
                                            output_dir,
                                            use_node_names_if_possible=True):

        dn_dir = "%s/dN/" % output_dir
        ds_dir = "%s/dS/" % output_dir
        w_dir = "%s/W/" % output_dir

        for directory in output_dir, dn_dir, ds_dir, w_dir:
            self.safe_mkdir(directory)

        input_files = map(lambda s: "%s/%s" % (input_dir, s),
                          os.listdir(input_dir))

        data_dict = OrderedDict()
        for filename in input_files:
            with open(filename, "r") as in_fd:
                in_fd.readline()  # read header
                for line in in_fd:
                    node_id, node_name, dn, ds, w = line.strip().split("\t")

                    if use_node_names_if_possible:
                        node = node_id if node_name == "." else node_name
                    else:
                        node = node_id

                    if node not in data_dict:
                        data_dict[node] = OrderedDict()
                        for parameter in "dN", "dS", "W":
                            data_dict[node][parameter] = IdList()
                    data_dict[node]["dN"].append(dn)
                    data_dict[node]["dS"].append(ds)
                    data_dict[node]["W"].append(w)

        for node in data_dict:
            for parameter in "dN", "dS", "W":
                out_file = "%s/%s/%s.tsv" % (output_dir, parameter, node)
                data_dict[node][parameter].write(out_file)
Example #20
    def extract_eggnog_fam_by_protein_syn_dict(self, eggnog_fam_dict, protein_syn_dict, output_prefix=None, species_id=None):

        extracted_families = SynDict()
        common_protein_names_to_families_dict = SynDict()
        common_names_to_eggnog_proteins_syn_dict = SynDict()

        not_found_proteins_common_names = IdList()

        transposed_eggnog_fam_dict = eggnog_fam_dict.exchange_key_and_value()

        for common_protein_name in protein_syn_dict:
            not_found = True
            for protein_id in protein_syn_dict[common_protein_name]:
                extended_protein_id = protein_id if species_id is None else species_id + "." + protein_id
                if extended_protein_id in transposed_eggnog_fam_dict:
                    not_found = False
                    if common_protein_name not in common_protein_names_to_families_dict:
                        common_protein_names_to_families_dict[common_protein_name] = [transposed_eggnog_fam_dict[extended_protein_id][0]]
                        common_names_to_eggnog_proteins_syn_dict[common_protein_name] = [extended_protein_id]
                    else:
                        common_protein_names_to_families_dict[common_protein_name].append(transposed_eggnog_fam_dict[extended_protein_id][0])
                        common_names_to_eggnog_proteins_syn_dict[common_protein_name].append(extended_protein_id)
                    if transposed_eggnog_fam_dict[extended_protein_id][0] not in extracted_families:
                        extracted_families[transposed_eggnog_fam_dict[extended_protein_id][0]] = eggnog_fam_dict[transposed_eggnog_fam_dict[extended_protein_id][0]]

            if not_found:
                not_found_proteins_common_names.append(common_protein_name)

        if output_prefix:
            extracted_families.write(filename="%s.extracted_families.fam" % output_prefix, splited_values=True)
            common_protein_names_to_families_dict.write(filename="%s.common_protein_names_to_families.correspondence" % output_prefix, splited_values=True)
            common_names_to_eggnog_proteins_syn_dict.write(filename="%s.common_protein_names_to_eggnog_proteins.correspondence" % output_prefix, splited_values=True)
            not_found_proteins_common_names.write(filename="%s.not_found.common_names" % output_prefix)

        return extracted_families, common_protein_names_to_families_dict, \
               common_names_to_eggnog_proteins_syn_dict, not_found_proteins_common_names
Example #21
    def cluster_sequence_names_by_id_fragment(self,
                                              seq_id_list,
                                              id_element_index,
                                              id_separator="_",
                                              output_prefix=None):
        cluster_dict = SynDict()
        skipped_id_list = IdList()

        for seq_id in seq_id_list:
            seq_id_splited = seq_id.split(id_separator)
            if id_element_index < len(seq_id_splited):
                if seq_id_splited[id_element_index] in cluster_dict:
                    cluster_dict[seq_id_splited[id_element_index]].append(seq_id)
                else:
                    cluster_dict[seq_id_splited[id_element_index]] = [seq_id]
            else:
                skipped_id_list.append(seq_id)

        if output_prefix:
            cluster_dict.write("%s.seqid.clusters" % output_prefix,
                               splited_values=True)
            skipped_id_list.write("%s.seqid.skipped.ids" % output_prefix)

        return cluster_dict
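
To make the fragment indexing concrete, here is a self-contained sketch with invented ids, using id_separator="_" and id_element_index=1; it mirrors the grouping logic above:

    from collections import defaultdict

    # Invented ids; group by the element at index 1 of each id split on "_".
    ids = ["geneA_scaf1_t1", "geneB_scaf1_t1", "geneC_scaf2_t1", "short"]
    clusters, skipped = defaultdict(list), []
    for seq_id in ids:
        parts = seq_id.split("_")
        if 1 < len(parts):
            clusters[parts[1]].append(seq_id)
        else:
            skipped.append(seq_id)
    # clusters == {"scaf1": ["geneA_scaf1_t1", "geneB_scaf1_t1"], "scaf2": ["geneC_scaf2_t1"]}
    # skipped  == ["short"]
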
Example #22
    "-p",
    "--depth",
    action="store",
    dest="depth",
    type=int,
    default=2,
    help=
    "The maximum depth to perform extraction of cluster using inconsistent method. Default: 2"
)
parser.add_argument(
    "-a",
    "--scaffold_white_list",
    action="store",
    dest="scaffold_white_list",
    default=[],
    type=lambda s: IdList(filename=s) if os.path.exists(s) else s.split(","),
    help="Comma-separated list of the only scaffolds to draw. Default: all")

parser.add_argument(
    "-b",
    "--scaffold_black_list",
    action="store",
    dest="scaffold_black_list",
    default=[],
    type=lambda s: IdList(filename=s) if os.path.exists(s) else s.split(","),
    help=
    "Comma-separated list of scaffolds to skip at drawing. Default: not set")

parser.add_argument(
    "-z",
    "--scaffold_ordered_list",
Example #23
                    default=1,
                    help="Number of simultaneous downloads")
parser.add_argument("-c",
                    "--connections",
                    action="store",
                    dest="connections",
                    type=int,
                    default=8,
                    help="Number of connections for each download")

args = parser.parse_args()

if (not args.ids) and (not args.id_file):
    raise ValueError("Both ids and id file were not set")

loader = IdList()
id_list = loader.read(args.id_file) if args.id_file else args.ids

Axel.threads = args.threads
Axel.parallel_download_from_sra(id_list, args.connections)
"""
options_list = []
for entry_id in id_list:
    ftp_path = NCBIRoutines.get_sra_ftp_path_from_id(entry_id)
    options_list.append("-n %i %s" % (args.connections, ftp_path))

tool = Tool(cmd="axel", max_threads=args.threads)

tool.parallel_execute(options_list)

for filename in os.listdir(os.getcwd()):
Example #24
    def prepare_template_for_popart(alignment_file,
                                    output_file,
                                    haplotype_fam_file=None,
                                    traits_file=None,
                                    whitelist_file=None):
        from RouToolPa.Parsers.Sequence import CollectionSequence
        sequence_collection = CollectionSequence(in_file=alignment_file,
                                                 parsing_mode="parse")
        sequence_collection.get_stats_and_features(count_gaps=False,
                                                   sort=False)
        whitelist = IdSet(filename=whitelist_file)
        alignment_len = sequence_collection.seq_lengths["length"].unique()
        if len(alignment_len) > 1:
            raise ValueError(
                "ERROR!!! Sequences in alignment have different lengths!")
        alignment_len = alignment_len[0]

        haplotype_selected_sequence_dict = SynDict()
        haplotypes_without_sequences_ids = IdList()

        traits_df = pd.read_csv(
            traits_file, sep="\t",
            index_col=0) if traits_file else pd.DataFrame()

        if haplotype_fam_file:
            haplotype_dict = SynDict(filename=haplotype_fam_file,
                                     split_values=True)
            for haplotype_id in haplotype_dict:
                for sequence_id in haplotype_dict[haplotype_id]:
                    if sequence_id in sequence_collection.records:
                        haplotype_selected_sequence_dict[
                            haplotype_id] = sequence_id
                        break
                else:
                    haplotypes_without_sequences_ids.append(haplotype_id)
        else:
            haplotype_dict = dict([(entry, [entry])
                                   for entry in sequence_collection.scaffolds])
            haplotype_selected_sequence_dict = dict([
                (entry, entry) for entry in sequence_collection.scaffolds
            ])

        final_haplotype_set = (set(haplotype_selected_sequence_dict.keys())
                               & whitelist) if whitelist else set(
                                   haplotype_selected_sequence_dict.keys())

        with open(output_file, "w") as out_fd:
            #out_fd.write("#NEXUS\nBEGIN TAXA;\nDIMENSIONS\nNTAX = %i;\nTAXLABELS\n%s\n;\nEND;\n\n" % (len(haplotype_selected_sequence_dict),
            #                                                                                          "\n".join(haplotype_selected_sequence_dict.keys())))
            out_fd.write("#NEXUS\n\n")
            out_fd.write(
                "BEGIN DATA;\n\tDIMENSIONS NTAX=%i NCHAR=%i;\n\tFORMAT DATATYPE=DNA MISSING=? GAP=- MATCHCHAR=. ;\n"
                % (len(final_haplotype_set), alignment_len))
            out_fd.write("\tMATRIX\n")

            for haplotype_id in final_haplotype_set:
                out_fd.write(
                    "\t\t%s %s\n" % (haplotype_id, sequence_collection.records[
                        haplotype_selected_sequence_dict[haplotype_id]]))
            out_fd.write("\t;\nEND;\n\n")

            if not traits_df.empty:
                traits_number = len(traits_df.columns)
                out_fd.write(
                    "BEGIN TRAITS;\n\tDimensions NTRAITS={0};\n\tFormat labels=yes missing=? separator=Comma;\n"
                    .format(traits_number))
                out_fd.write("\tTraitLabels {0};\n".format(" ".join(
                    traits_df.columns)))
                out_fd.write("\tMATRIX\n")
                for haplotype_id in final_haplotype_set:
                    out_fd.write(
                        "\t\t%s %s\n" %
                        (haplotype_id,
                         ",".join(map(str, traits_df.loc[haplotype_id]))
                         if haplotype_id in traits_df.index else
                         ("0," * traits_number)[:-1]))
            else:
                out_fd.write(
                    "BEGIN TRAITS;\n\tDimensions NTRAITS=1;\n\tFormat labels=yes missing=? separator=Comma;\n"
                )
                out_fd.write("\tTraitLabels Area;\n")
                out_fd.write("\tMATRIX\n")
                for haplotype_id in final_haplotype_set:
                    out_fd.write(
                        "\t\t%s %i\n" %
                        (haplotype_id, len(haplotype_dict[haplotype_id])))
            out_fd.write("\t;\nEND;\n\n")
Example #25
__author__ = 'Sergei F. Kliver'
import sys
import argparse
from RouToolPa.Collections.General import IdList

parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--fam_file",
                    action="store",
                    dest="fam_file",
                    required=True,
                    help="File with families")
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    default="stdout",
                    help="File to write ids")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

id_list = IdList()
id_list.read(args.fam_file,
             close_after_if_file_object=True,
             column_number=1,
             id_in_column_separator=",")
id_list.write(out_fd, close_after_if_file_object=True)
Example #26
                    action="store",
                    dest="output_prefix",
                    default="stdout",
                    help="Prefix of output file")
args = parser.parse_args()

out_fd = sys.stdout if args.output_prefix == "stdout" else open(
    "%s_reference_random_genes.ids" % args.output_prefix, "w")

reference_families = SynDict()
reference_families.read(args.reference_fam,
                        separator="\t",
                        split_values=True,
                        values_separator=",")

node_family_ids = IdList()
node_family_ids.read(args.input,
                     header=True,
                     column_number=0,
                     column_separator="\t")

reference_random_genes = SynDict()

for family_id in node_family_ids:
    if family_id not in reference_families:
        reference_random_genes[family_id] = "."
    else:
        reference_random_genes[family_id] = choice(
            reference_families[family_id])

reference_random_genes.write("%s_reference_random_genes.t" %
Example #27
    def handle_sanger_data(self,
                           input_dir,
                           output_prefix,
                           outdir=None,
                           read_subfolders=False,
                           min_mean_qual=0,
                           min_median_qual=0,
                           min_len=50):
        if outdir:
            self.workdir = outdir

        self.init_dirs()

        sanger_filelist = self.make_list_of_path_to_files(
            input_dir,
            expression=self.is_sanger_file,
            recursive=read_subfolders,
            return_absolute_paths=True)
        stat_dict = TwoLvlDict()
        record_dict = OrderedDict()
        trimmed_record_dict = OrderedDict()
        excluded_list = IdList()
        excluded_counter = 0
        low_quality_counter = 0
        too_short_counter = 0

        merged_raw_fastq = "%s/%s.raw.fastq" % (self.workdir, output_prefix)
        merged_raw_fasta = "%s/%s.raw.fasta" % (self.workdir, output_prefix)
        merged_trimmed_fastq = "%s/%s.trimmed.fastq" % (self.workdir,
                                                        output_prefix)
        merged_trimmed_fasta = "%s/%s.trimmed.fasta" % (self.workdir,
                                                        output_prefix)

        for filename in sanger_filelist:
            filename_list = self.split_filename(filename)

            record_raw_fastq = "%s/fastq/raw/%s.raw.fastq" % (self.workdir,
                                                              filename_list[1])
            record_raw_fasta = "%s/fasta/raw/%s.raw.fasta" % (self.workdir,
                                                              filename_list[1])
            record_raw_qual_plot_prefix = "%s/qual_plot/raw/%s.raw.qual" % (
                self.workdir, filename_list[1])

            record_trimmed_fastq = "%s/fastq/trimmed/%s.trimmed.fastq" % (
                self.workdir, filename_list[1])
            record_trimmed_fasta = "%s/fasta/trimmed/%s.trimmed.fasta" % (
                self.workdir, filename_list[1])
            record_trimmed_qual_plot_prefix = "%s/qual_plot/trimmed/%s.trimmed.qual" % (
                self.workdir, filename_list[1])

            record = SeqIO.read(self.metaopen(filename, "rb"), format="abi")
            record_dict[record.id] = record
            SeqIO.write(record, record_raw_fastq, format="fastq")
            SeqIO.write(record, record_raw_fasta, format="fasta")

            trimmed_record = SeqIO.AbiIO._abi_trim(record)

            stat_dict[record.id] = OrderedDict({
                "raw_len":
                len(record),
                "raw_mean_qual":
                np.mean(record.letter_annotations["phred_quality"]),
                "raw_median_qual":
                np.median(record.letter_annotations["phred_quality"]),
                "trimmed_len":
                len(trimmed_record),
                "trimmed_mean_qual":
                np.mean(trimmed_record.letter_annotations["phred_quality"]),
                "trimmed_median_qual":
                np.median(trimmed_record.letter_annotations["phred_quality"]),
                "retained":
                "-",
            })
            MatplotlibRoutines.draw_bar_plot(
                record.letter_annotations["phred_quality"],
                record_raw_qual_plot_prefix,
                extentions=["png"],
                xlabel="Position",
                ylabel="Phred quality",
                title="Per base quality",
                min_value=None,
                max_value=None,
                new_figure=True,
                figsize=(3 * (int(len(record) / 100) + 1), 3),
                close_figure=True)

            if stat_dict[record.id]["trimmed_len"] >= min_len:
                if min_median_qual:
                    if (stat_dict[record.id]["trimmed_median_qual"] >=
                            min_median_qual) and (
                                stat_dict[record.id]["trimmed_mean_qual"] >=
                                min_mean_qual):
                        stat_dict[record.id]["retained"] = "+"
                    else:
                        low_quality_counter += 1
                else:
                    stat_dict[record.id]["retained"] = "+"
            else:
                too_short_counter += 1

            if stat_dict[record.id]["retained"] == "-":
                excluded_list.append(record.id)
                continue

            SeqIO.write(trimmed_record, record_trimmed_fastq, format="fastq")
            SeqIO.write(trimmed_record, record_trimmed_fasta, format="fasta")

            MatplotlibRoutines.draw_bar_plot(
                trimmed_record.letter_annotations["phred_quality"],
                record_trimmed_qual_plot_prefix,
                extentions=["png"],
                xlabel="Position",
                ylabel="Phred quality",
                title="Per base quality",
                min_value=None,
                max_value=None,
                new_figure=True,
                figsize=(3 * (int(len(record) / 100) + 1), 3),
                close_figure=True)

            trimmed_record_dict[record.id] = trimmed_record

        SeqIO.write(self.record_from_dict_generator(record_dict),
                    merged_raw_fastq,
                    format="fastq")
        SeqIO.write(self.record_from_dict_generator(record_dict),
                    merged_raw_fasta,
                    format="fasta")

        SeqIO.write(self.record_from_dict_generator(trimmed_record_dict),
                    merged_trimmed_fastq,
                    format="fastq")
        SeqIO.write(self.record_from_dict_generator(trimmed_record_dict),
                    merged_trimmed_fasta,
                    format="fasta")

        excluded_list.write("%s.excluded.ids" % output_prefix)
        stat_dict.write(out_filename="%s.stats" % output_prefix)

        print("Excluded: %i" % excluded_counter)
        print("\tToo short( < %i ): %i" % (min_len, too_short_counter))
        print("\tLow quality( median < %i or mean < %i ): %i" %
              (min_median_qual, min_mean_qual, low_quality_counter))
Example #28
    def prepare_annotation_file_from_transcript_and_cds(
            self,
            transcript_file,
            cds_file,
            correspondence_file,
            output_prefix,
            format="fasta",
            correspondence_key_column=0,
            correspondence_value_column=1,
            verbose=False):
        transcript_dict = self.parse_seq_file(transcript_file,
                                              "parse",
                                              format=format)

        cds_dict = self.parse_seq_file(cds_file, "parse", format=format)

        correspondence_dict = SynDict(filename=correspondence_file,
                                      comments_prefix="#",
                                      key_index=correspondence_key_column,
                                      value_index=correspondence_value_column)

        no_corresponding_cds_transcript_list = IdList()
        cds_not_found_transcript_list = IdList()

        annotation_file = "%s.annotation" % output_prefix
        no_corresponding_cds_transcript_file = "%s.no_cds.id" % output_prefix
        cds_not_found_transcript_file = "%s.not_found_cds.id" % output_prefix

        with open(annotation_file, "w") as annotation_fd:
            for transcript_id in transcript_dict:
                if transcript_id not in correspondence_dict:
                    no_corresponding_cds_transcript_list.append(transcript_id)
                    if verbose:
                        print(
                            "No cds in correspondence file for transcript %s" %
                            transcript_id)
                    continue
                cds_id = correspondence_dict[transcript_id]
                length = len(cds_dict[cds_id].seq)
                start = transcript_dict[transcript_id].seq.upper().find(
                    cds_dict[cds_id].seq.upper())
                if start == -1:
                    cds_not_found_transcript_list.append(transcript_id)
                    if verbose:
                        print("CDS was not found for transcript %s" %
                              transcript_id)
                    continue
                annotation_string = "%s\t+\t%i\t%i\n" % (transcript_id,
                                                         start + 1, length)

                annotation_fd.write(annotation_string)

        no_corresponding_cds_transcript_list.write(
            no_corresponding_cds_transcript_file)
        cds_not_found_transcript_list.write(cds_not_found_transcript_file)
Example #29


parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input", action="store", dest="input",
                    help="Input sam file. Default: stdin")
parser.add_argument("-o", "--output", action="store", dest="output",
                    help="Output file with reads. Default: stdout")
parser.add_argument("-r", "--read_name_file", action="store", dest="read_name_file", required=True,
                    help="File with full read names or their fragments")
parser.add_argument("-m", "--mode", action="store", dest="mode", default="include",
                    help="Output mode. Allowed: include(default), remove")
parser.add_argument("-c", "--comparison_mode", action="store", dest="comparison_mode",
                    default="exact",
                    help="Read name comparison mode. Allowed: exact(default), partial")

args = parser.parse_args()

input_sam_fd = open(args.input, "r") if args.input else sys.stdin
output_sam_fd = open(args.output, "w") if args.output else sys.stdout

read_name_list = IdList(filename=args.read_name_file)
SamtoolsV1.get_reads_by_name(read_name_list, input_sam_fd, output_sam_fd,
                             mode=args.mode, search_mode=args.comparison_mode)

if args.input:
    input_sam_fd.close()
if args.output:
    output_sam_fd.close()
Example #30
parser.add_argument("-f",
                    "--value_file",
                    action="store",
                    dest="value_file",
                    required=True,
                    help="Value with values to seek for")
parser.add_argument("-o",
                    "--output_gff",
                    action="store",
                    dest="output_gff",
                    required=True,
                    help="Output .gff file")
parser.add_argument(
    "-d",
    "--description_fields",
    action="store",
    dest="field_id_list",
    type=lambda s: s.split(","),
    required=True,
    help="Comma-separated list of fields in gff description to check")

args = parser.parse_args()

value_list = IdList(filename=args.value_file)
AnnotationsRoutines.extract_gff_records_by_description_value(
    args.input_gff,
    args.output_gff,
    args.field_id_list,
    value_list,
    retain_comments=False)