Example #1
    def count_locations(self,
                        annotation_black_list=[],
                        allow_several_counts_of_record=False,
                        out_filename="location_counts.t",
                        write=True,
                        count_dir="location_counts"):
        os.system("mkdir -p %s" % count_dir)
        regions_dict = self._split_regions()
        region_counts_dict = TwoLvlDict({})
        for region in regions_dict:
            count_locations_dict = {"igc": 0, "unknown": 0}

            for record in regions_dict[region]:
                # check key presence before accessing it
                if ("Loc" not in record.description) or (
                        not record.description["Loc"]):
                    count_locations_dict["unknown"] += 1
                    continue
                #print(record.description["Loc"])
                if allow_several_counts_of_record:
                    for location in record.description["Loc"]:
                        if location in annotation_black_list:
                            continue
                        if location not in count_locations_dict:
                            count_locations_dict[location] = 1
                        else:
                            count_locations_dict[location] += 1
                else:
                    full_location = []
                    for location in record.description["Loc"]:
                        if location in annotation_black_list:
                            continue
                        full_location.append(location)
                    if not full_location:
                        continue
                    full_location.sort()
                    full_location = "/".join(full_location)
                    if full_location not in count_locations_dict:
                        count_locations_dict[full_location] = 1
                    else:
                        count_locations_dict[full_location] += 1

            labels = []
            counts = []
            #colors = []
            for location in count_locations_dict:
                if count_locations_dict[location] == 0 or location in annotation_black_list:
                    continue
                labels.append(location)
                counts.append(count_locations_dict[location])
            region_counts_dict[region] = OrderedDict(zip(labels, counts))

        if write:
            region_counts_dict.write("%s/%s" % (count_dir, out_filename))
        return region_counts_dict
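TwoLvlDict comes from MAVR's CustomCollections.GeneralCollections and behaves like a two-level nested mapping whose write() dumps a tab-separated table. A minimal, dependency-free sketch of the region-by-location counting pattern above, assuming plain nested-dict semantics and invented location data:

from collections import OrderedDict

# Hypothetical input: location labels observed per region (invented data).
records = {"chrI": ["CDS", "CDS", "intron"], "chrII": ["CDS", "igc"]}

table = OrderedDict()
for region, locations in records.items():
    counts = OrderedDict()
    for loc in locations:
        counts[loc] = counts.get(loc, 0) + 1
    table[region] = counts

# Lay the two-level mapping out as a tab-separated grid; TwoLvlDict.write()
# produces a similar table (the exact orientation is an assumption here).
rows = sorted(set(key for column in table.values() for key in column))
print("\t".join(["location"] + list(table.keys())))
for row in rows:
    print("\t".join([row] + [str(table[col].get(row, ".")) for col in table]))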
Example #2
    def count_types(self,
                    output_file=None,
                    total_output_file=None,
                    return_mode="chrom"):

        annotated_types = self.get_annotated_types()
        count_dict = TwoLvlDict()
        total_count_dict = OrderedDict()

        for type in annotated_types:
            total_count_dict[type] = OrderedDict()
            total_count_dict[type]["complete"] = 0
            total_count_dict[type]["partial"] = 0

        for chrom in self.records:
            count_dict[chrom] = OrderedDict()
            for type in annotated_types:
                count_dict[chrom][type] = 0

        for chrom in self.records:
            for record in self.records[chrom]:
                count_dict[chrom][record.type] += 1
                if record.partial:
                    total_count_dict[record.type]["partial"] += 1
                else:
                    total_count_dict[record.type]["complete"] += 1

        if output_file:
            count_dict.write(output_file)

        if total_output_file:
            with open(total_output_file, "w") as out_fd:
                out_fd.write(
                    "#rRNA\tComplete%s\tPartial%s\n" %
                    ("(>%.2f of expected length)" %
                     self.partial_threshold if self.partial_threshold else "",
                     "(<%.2f of expected length)" %
                     self.partial_threshold if self.partial_threshold else ""))
                for type in total_count_dict:
                    out_fd.write("%s\t%i\t%i\n" %
                                 (type, total_count_dict[type]["complete"],
                                  total_count_dict[type]["partial"]))

        if return_mode == "chrom":
            return count_dict
        elif return_mode == "total":
            return total_count_dict
        elif return_mode == "both":
            return count_dict, total_count_dict
        else:
            raise ValueError(
                "Unknown return type. Allowed variants: 'chrom', 'total', 'both'"
            )
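The conditional %-formatting in the header write above is easy to misread; a self-contained demo of the line it produces for a hypothetical partial_threshold:

partial_threshold = 0.95    # hypothetical value

header = ("#rRNA\tComplete%s\tPartial%s\n" %
          ("(>%.2f of expected length)" % partial_threshold if partial_threshold else "",
           "(<%.2f of expected length)" % partial_threshold if partial_threshold else ""))
print(header)
# -> #rRNA\tComplete(>0.95 of expected length)\tPartial(<0.95 of expected length)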
Example #3
    def get_leaf_values(self, write=True):
        leaf_values_dict = TwoLvlDict()
        dN_dict = self._get_tree_dist_dict(self.dNtree)
        dS_dict = self._get_tree_dist_dict(self.dStree)
        W_dict = self._get_tree_dist_dict(self.Wtree)

        leaf_values_dict["dN"] = dN_dict
        leaf_values_dict["dS"] = dS_dict
        leaf_values_dict["W"] = W_dict

        if write:
            leaf_values_dict.write("leaf_values.t")
        return leaf_values_dict
Example #4
def results_extraction_listener(queue,
                                output_file_prefix,
                                selected_species_list=None):
    """listens for messages on the queue, writes to file."""

    positive_selection_dict = TwoLvlDict()
    selected_species_positive_selection_dict = TwoLvlDict()
    error_fd = open("errors.err", "w")
    error_fd.write("#sample\terror_code\n")
    while True:
        result = queue.get()
        if result == 'finish':
            positive_selection_dict.write("%s.all" % output_file_prefix,
                                          absent_symbol=".")
            if selected_species_list:
                selected_species_positive_selection_dict.write(
                    "%s.selected_species" % output_file_prefix,
                    absent_symbol=".")
            # print positive_selection_dict.table_form(absent_symbol=".")
            break
        # an integer in the second slot is an error code reported by a worker
        if isinstance(result[1], int):
            error_fd.write("%s\t%i\n" % (result[0], result[1]))
            continue
        if result[1]:
            positive_selection_dict[result[0]] = result[1]
            if selected_species_list:
                for species in selected_species_list:
                    if species in result[1]:
                        sample = result[0]
                        if sample not in selected_species_positive_selection_dict:
                            selected_species_positive_selection_dict[sample] = {}
                        selected_species_positive_selection_dict[sample][species] = result[1][species]
    error_fd.close()
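A self-contained sketch of the queue/sentinel protocol the listener expects, using an in-process Queue instead of a multiprocessing one (message contents are invented):

try:
    from Queue import Queue    # Python 2
except ImportError:
    from queue import Queue    # Python 3

queue = Queue()
for message in [("sample_1", {"speciesA": "0.01"}),   # a result payload
                ("sample_2", 42),                     # an int signals an error code
                "finish"]:                            # sentinel that stops the listener
    queue.put(message)

while True:
    result = queue.get()
    if result == "finish":
        break
    if isinstance(result[1], int):
        print("%s failed with code %i" % result)
        continue
    print("%s -> %s" % (result[0], result[1]))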
Example #5
    def combine_count_files(count_file_list, output_file, sample_name_list=None):

        if sample_name_list is not None:
            if len(count_file_list) != len(sample_name_list):
                raise ValueError("Number of count files doesn't match number of sample names")

        samples = zip(sample_name_list if sample_name_list else count_file_list, count_file_list)

        count_table = TwoLvlDict()

        for sample, filename in samples:
            count_table[sample] = SynDict(filename=filename, header=False, separator="\t", allow_repeats_of_key=False,
                                          split_values=False, values_separator=",", key_index=0, value_index=1,
                                          close_after_if_file_object=False, expression=None, comments_prefix="__")

        count_table.write(output_file)
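A dependency-free sketch of the same combine step, where each two-column count file becomes one column of the merged table (file names are hypothetical):

def read_counts(path):
    counts = {}
    with open(path) as fd:
        for line in fd:
            if line.startswith("__"):    # skip htseq-count summary lines
                continue
            key, value = line.rstrip("\n").split("\t")
            counts[key] = value
    return counts

samples = {"sample_1": read_counts("sample_1.htseq.count"),
           "sample_2": read_counts("sample_2.htseq.count")}
genes = sorted(set(gene for counts in samples.values() for gene in counts))
with open("combined_counts.t", "w") as out:
    out.write("gene\t" + "\t".join(sorted(samples)) + "\n")
    for gene in genes:
        out.write(gene + "\t" + "\t".join(samples[name].get(gene, "0")
                                          for name in sorted(samples)) + "\n")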
Example #6
    def get_general_stats(self):
        stat_dict = TwoLvlDict()

        for report_id in self:
            stat_dict[report_id] = OrderedDict()

            stat_dict[report_id]["input_pairs"] = self[report_id].input_pairs
            stat_dict[report_id]["pairs_without_adapters"] = self[report_id].retained_pairs
            stat_dict[report_id]["pairs_without_adapters_fraction"] = self[report_id].retained_pairs_fraction

        return stat_dict
Example #7
    def count_reads_and_bases(self, fastq_file_list, stat_file=None):

        fastq_list = [fastq_file_list] if isinstance(fastq_file_list,
                                                     str) else fastq_file_list

        counts = TwoLvlDict()

        for fastq_file in fastq_list:
            counts[fastq_file] = OrderedDict()
            counts[fastq_file]["Reads"] = 0
            counts[fastq_file]["Bases"] = 0

        for fastq_file in fastq_list:
            with self.metaopen(fastq_file, "r") as fastq_fd:
                for line in fastq_fd:
                    counts[fastq_file]["Bases"] += len(fastq_fd.next())
                    counts[fastq_file]["Reads"] += 1
                    fastq_fd.next()
                    fastq_fd.next()

                # to take into account "\n" at the end of each line
                counts[fastq_file]["Bases"] -= counts[fastq_file]["Reads"]

        counts.write()

        if stat_file:
            counts.write(stat_file)
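fastq_fd.next() above is the Python 2 file-iterator method; the loop consumes four lines per FASTQ record. A minimal sketch of the same stride that also runs under Python 3:

def count_reads_and_bases(path):
    reads = bases = 0
    with open(path) as fd:
        for header in fd:                  # @header line
            bases += len(next(fd)) - 1     # sequence line, minus the trailing "\n"
            next(fd)                       # "+" separator line
            next(fd)                       # quality line
            reads += 1
    return reads, bases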
Example #8
    def get_general_stats(self):
        stat_dict = TwoLvlDict()

        for report_id in self:
            stat_dict[report_id] = OrderedDict()
            stat_dict[report_id]["machine_number"] = len(self[report_id].machine_id_list)
            stat_dict[report_id]["machine_ids"] = self[report_id].machine_id_list
            stat_dict[report_id]["flowcell_number"] = len(self[report_id].flowcell_id_list)
            stat_dict[report_id]["flowcell_ids"] = self[report_id].flowcell_id_list
            stat_dict[report_id]["lane_number"] = len(self[report_id].lane_table)
            stat_dict[report_id]["full_lane_ids"] = self[report_id].full_lane_id_list
            stat_dict[report_id]["short_lane_ids"] = self[report_id].short_lane_id_list
            stat_dict[report_id]["input_pairs"] = self[report_id].input_pairs
            stat_dict[report_id]["retained_pairs"] = self[report_id].retained_pairs
            stat_dict[report_id]["retained_pairs_fraction"] = self[report_id].retained_pairs_fraction
            stat_dict[report_id]["retained_forward_only"] = self[report_id].retained_forward_only
            stat_dict[report_id]["retained_reverse_only"] = self[report_id].retained_reverse_only
            stat_dict[report_id]["both_discarded"] = self[report_id].both_discarded
            stat_dict[report_id]["min_retained_pairs_in_tiles_fraction"] = self[report_id].minimum_retained_pairs_in_tiles_fraction

        return stat_dict
Example #9
def get_results(samples_list, data_type):
    results = TwoLvlDict()

    for sample in samples_list:
        results[sample] = OrderedDict()
        filename = "%s/all_reads/%s_all_%s_coverage.tab" % (sample, sample, data_type)
        data = read_data(filename)
        if not data:
            print sample
            continue
        #print sample
        for gene in data:
            results[sample][gene] = data[gene]

        for proportions, name in zip([[1, 2], [2, 1], [1, 1]],
                                     ["1:2", "2:1", "1:1"]):
            chi_results = calculate_chi_squared(data, proportions)
            #print name
            results[sample][name + " Chi"] = chi_results[0]
            results[sample][name + " p-value"] = chi_results[1]
            #print chi_results
    return results
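calculate_chi_squared() is project-internal; a sketch of an equivalent goodness-of-fit test with scipy for the same three proportion hypotheses (the observed counts are invented):

import scipy.stats as ss

observed = [30, 70]    # hypothetical counts for the two categories
for proportions, name in zip([[1, 2], [2, 1], [1, 1]], ["1:2", "2:1", "1:1"]):
    total = float(sum(observed))
    expected = [total * p / sum(proportions) for p in proportions]
    chi, p_value = ss.chisquare(observed, f_exp=expected)
    print("%s\tchi=%.3f\tp=%.3g" % (name, chi, p_value))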
Example #10
if args.labels_list is not None:
    if len(args.labels_list) != len(args.input_file_list):
        raise ValueError(
            "Length of labels list is not equal to number of files with assemblies"
        )

assemblies_dict = OrderedDict()
for i in range(len(args.input_file_list)):
    assembly_label = args.labels_list[i] if args.labels_list else "A%i" % (i + 1)
    tmp_index = "%s.tmp.idx" % assembly_label
    assemblies_dict[assembly_label] = SeqIO.index_db(tmp_index,
                                                     args.input_file_list[i],
                                                     format=args.format)

assembly_N50_dict = TwoLvlDict()
assembly_L50 = TwoLvlDict()
assembly_bins = []
assembly_contig_cumulative_length = OrderedDict()
assembly_contig_number_values = OrderedDict()
assembly_general_stats = TwoLvlDict()
assembly_length_array = OrderedDict()
assembly_lengths = TwoLvlDict()
for assembly in assemblies_dict:
    lengths_array, N50_dict, L50_dict, length_dict, total_length, longest_contig, Ns_number, bins, contig_cumulative_length_values, \
        contig_number_values = SequenceRoutines.calculate_assembly_stats(assemblies_dict[assembly],
                                                                         thresholds_list=args.thresholds,
                                                                         seq_len_file="%s.%s.len" % (args.output_prefix, assembly))
    assembly_N50_dict[assembly] = N50_dict
    assembly_L50[assembly] = L50_dict
    assembly_contig_cumulative_length[assembly] = contig_cumulative_length_values
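SequenceRoutines.calculate_assembly_stats() is MAVR-internal; a minimal sketch of just the N50/L50 part of what it reports, computed from a plain list of contig lengths:

def n50_l50(lengths):
    # N50: length of the contig at which the running sum of lengths, taken in
    # descending order, first reaches half of the total assembly length;
    # L50: how many contigs that takes.
    half = sum(lengths) / 2.0
    running = 0
    for l50, length in enumerate(sorted(lengths, reverse=True), start=1):
        running += length
        if running >= half:
            return length, l50

print(n50_l50([100, 80, 60, 40, 20]))    # -> (80, 2), since 100 + 80 >= 150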
Example #11
}
annotation_black_list = [
    "gene", "region", "ARS", "long_terminal_repeat", "noncoding_exon",
    "intron", "repeat_region", "telomere", "gene_cassette",
    "five_prime_UTR_intron"
]
with open(args.annotations) as gff_fd:
    for record in GFF.parse(gff_fd):
        annotations_dict[record.id] = record

bad_region_dict = {}
with open(args.masking) as gff_fd:
    for record in GFF.parse(gff_fd):
        bad_region_dict[record.id] = record

statistics_dict = TwoLvlDict(OrderedDict({}))

print("Handling %s" % sample)
statistics_dict[sample] = OrderedDict({})

os.system("mkdir -p %s" % clustering_dir)

mutations = CollectionVCF(
    in_file=args.vcf_file if args.vcf_file else "%s.vcf" % args.sample_name,
    from_file=True)

mutations.get_location(annotations_dict,
                       use_synonym=True,
                       synonym_dict=annotation_synonym_dict)
mutations.set_location_flag(bad_region_dict, check_location, "BR")
mutations.check_by_ref_and_alt(ref_alt_variants["deaminases"],
Example #12
import argparse

from Routines import FileRoutines
from CustomCollections.GeneralCollections import TwoLvlDict


parser = argparse.ArgumentParser()

parser.add_argument("-f", "--files", action="store", dest="files", required=True,
                    type=FileRoutines.make_list_of_path_to_files_from_string,
                    help="Comma-separated list of files/directories with tables")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="Output file with combined table.")
parser.add_argument("-a", "--absent_symbol", action="store", dest="absent_symbol", default=".",
                    help="Symbol to be treated as absent value")
parser.add_argument("-v", "--split_values", action="store_true", dest="split_values",
                    help="Split values. Default: False")
parser.add_argument("-s", "--value_separator", action="store", dest="value_separator", default=",'",
                    help="Value separator. Default: ','")
parser.add_argument("-g", "--ignore_value_repeats", action="store_true", dest="ignore_value_repeats",
                    help="Ignore repeats of values(i.e values that corresponds to same fl_key and sl_key) "
                         "and don't raise exception. If yes value from first entry is stored. Default: False")

args = parser.parse_args()

combined_table = TwoLvlDict(input_file=args.files, absent_symbol=args.absent_symbol,
                            split_values=args.split_values, value_sep=args.value_separator,
                            ignore_value_repeats=args.ignore_value_repeats)
#print combined_table
combined_table.write(args.output, absent_symbol=args.absent_symbol, close_after_if_file_object=False, sort=False)
Example #13
parser = argparse.ArgumentParser()

parser.add_argument("-s", "--species_list", action="store", dest="species_list", type=lambda s: s.split(","),
                    required=True,
                    help="Comma-separated list of species")
parser.add_argument("-d", "--species_dir", action="store", dest="species_dir", default="./", type=check_path,
                    help="Comma-separated list of species")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file. Default: stdout")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

species_syn_dict = TwoLvlDict()

for species in args.species_list:
    species_syn_dict[species] = read_synonyms_dict("%s%s/all.t" % (args.species_dir, species))

species_syn_dict.write("families_all_species.t", absent_symbol=".")

nonassembled = species_syn_dict.filter_by_line(filter_nonassembled)
species_syn_dict.write("correctly_assembled_families_species.t", absent_symbol=".")


nonassembled.write("not_assembled_families_in_all_species.t", absent_symbol=".")
complicated_families_dict = nonassembled.filter_by_line(filter_splited_to_several_fam)
complicated_families_dict.write("complicated_families.t", absent_symbol=".")

complicated_families_syn_dict = SynDict()
Example #14
from CustomCollections.GeneralCollections import TwoLvlDict
from Parsers.CCF import CollectionCCF


def get_intersection_length(start1, end1, start2, end2):
    if start1 - end2 > 0 or start2 - end1 > 0:
        return 0
    start_shift = start1 - start2
    start_coef_shift = 0 if start_shift < 0 else 1
    end_shift = end1 - end2
    end_coef_shift = 0 if end_shift > 0 else 1

    return (end2 - start2 + 1) - start_coef_shift * start_shift + end_coef_shift * end_shift
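
# Worked examples for the overlap formula above (1-based, inclusive
# coordinates, invented for illustration):
#   get_intersection_length(1, 10, 5, 8)  -> 4  (region 2 lies inside region 1)
#   get_intersection_length(1, 5, 4, 10)  -> 2  (positions 4 and 5 overlap)
#   get_intersection_length(1, 3, 5, 8)   -> 0  (disjoint regions)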

overlap_clusters_percent = TwoLvlDict({})
#size = 8
#power = 0.05
print([float(f) / float(100) for f in range(1, 11)])
for size in range(3, 11):
    overlap_clusters_percent[size] = {}
    for power in [float(f) / float(100) for f in range(1, 11)]:
        PmCDA1_3d_clusters = CollectionCCF(from_file=True, input_file="/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_3d_size_%i+_power_%.2f+_good.ccf" % (size, power, size, power))

        PmCDA1_3d_sub_clusters = CollectionCCF(from_file=True, input_file="/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_sub1_3d_size_%i+_power_%.2f+_good.ccf" % (size, power, size, power))
        PmCDA1_3d_clusters.write_gff("/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_3d_size_%i+_power_%.2f+_good.gff" % (size, power, size, power))
        PmCDA1_3d_sub_clusters.write_gff("/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_sub1_3d_size_%i+_power_%.2f+_good.gff" % (size, power, size, power))
        #cluster_3d_dict = OrderedDict({})

        cluster_3d_dict = TwoLvlDict({})
Example #15
            gene_dict[feature.qualifiers["Name"][0]] = OrderedDict({})
            for sub_feature in feature.sub_features:
                gene_dict[feature.qualifiers["Name"][0]][
                    sub_feature.type] = len(sub_feature)
        if feature.type in ("snoRNA", "ncRNA", "snRNA"):
            gene_dict[feature.qualifiers["Name"][0]] = OrderedDict(
                {"ncRNA": len(feature)})

with open("%s_test.t" % args.prefix, "w") as out_fd:
    for gene in gene_dict:
        for sub_feature in gene_dict[gene]:
            out_fd.write("%s\t%s\t%i\n" %
                         (gene, sub_feature, gene_dict[gene][sub_feature]))

lengths_dict = get_feature_lengths(record_dict)
count_dict = TwoLvlDict({})
for record in lengths_dict:
    count_dict[record] = {}
    for feature_type in lengths_dict[record]:
        count_dict[record][feature_type] = len(
            lengths_dict[record][feature_type])

count_dict.write("%s_counts.t" % args.prefix)
total_lengths = get_total_feature_lengths(lengths_dict,
                                          out_filename="%s_feature_lengths.t" % args.prefix)

white_list = ["five_prime_UTR", "three_prime_UTR", "CDS", "ncRNA"]
collapsed_dict = feature_lengths_collapse_records(lengths_dict,
                                                  synonym_dict={
                                                      "snoRNA": "ncRNA",
Example #16
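    # P(mmin < X <= mmax) for a hypergeometric X, as a difference of survival functions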
    return -ss.hypergeom.sf(mmax, n, n1, n2) + ss.hypergeom.sf(mmin, n, n1, n2)


def get_intersection_length(start1, end1, start2, end2):
    if start1 - end2 > 0 or start2 - end1 > 0:
        return 0
    start_shift = start1 - start2
    start_coef_shift = 0 if start_shift < 0 else 1
    end_shift = end1 - end2
    end_coef_shift = 0 if end_shift > 0 else 1

    return (end2 - start2 +
            1) - start_coef_shift * start_shift + end_coef_shift * end_shift


overlap_clusters_percent = TwoLvlDict({})

totaly_genes = 6074
test_fd = open("probability.t", "w")
test_fd.write(
    "#size\tpower\ttotal\tPmCDA1_3d\tPmCDA1_sub_3d\tintersection\tp-value\n")
print([float(f) / float(100) for f in range(1, 11)])
for size in range(3, 11):
    overlap_clusters_percent[size] = {}
    for power in [float(f) / float(100) for f in range(1, 11)]:
        PmCDA1_3d_clusters = CollectionCCF(
            from_file=True,
            input_file=
            "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_3d_size_%i+_power_%.2f+_good.ccf"
            % (size, power, size, power))
        PmCDA1_3d_sub_clusters = CollectionCCF(
Example #17
                    required=True,
                    help="Comma-separated list of species")
parser.add_argument("-d",
                    "--species_dir",
                    action="store",
                    dest="species_dir",
                    default="./",
                    type=check_path,
                    help="Directory with per species statistics")
parser.add_argument("-o",
                    "--output_file",
                    action="store",
                    dest="output",
                    default="stdout",
                    help="Output file. Default: stdout")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

species_stat_dict = TwoLvlDict()

for species in args.species_list:
    with open("%s%s/stat.t" % (args.species_dir, species), "r") as stat_fd:
        statistics = map(lambda s: s.strip().split("\t"), stat_fd.readlines())
    species_stat_dict[species] = OrderedDict(statistics)

species_stat_dict.write(out_fd)
if args.output != "stdout":
    out_fd.close()
Example #18
                    "--gap_symbol",
                    action="store",
                    dest="gap_symbol",
                    default="-",
                    help="Gap symbol. Default - '-'")

parser.add_argument("-m",
                    "--histogram_output",
                    action="store",
                    dest="histogram_output",
                    required=True,
                    help="File to write histogram")

args = parser.parse_args()

unique_position_dict = TwoLvlDict()

FileRoutines.safe_mkdir(args.output_dir)

for alignment_file in args.input:
    alignment_name_list = FileRoutines.split_filename(alignment_file)
    output_prefix = "%s/%s.unique_positions" % (args.output_dir,
                                                alignment_name_list[1])

    unique_position_dict[alignment_name_list[1]] = \
        MultipleAlignmentRoutines.count_unique_positions_per_sequence_from_file(
            alignment_file,
            output_prefix,
            format=args.format,
            gap_symbol="-",
            return_mode="relative",
Example #19
    }
    annotation_black_list = [
        "gene", "region", "ARS", "long_terminal_repeat", "noncoding_exon",
        "intron", "repeat_region", "telomere", "gene_cassette",
        "five_prime_UTR_intron"
    ]
    with open(gff_file) as gff_fd:
        for record in GFF.parse(gff_fd):
            annotations_dict[record.id] = record

    bad_region_dict = {}
    with open(bad_regions_file) as gff_fd:
        for record in GFF.parse(gff_fd):
            bad_region_dict[record.id] = record

    statistics_dict = TwoLvlDict(OrderedDict({}))
    for sample_set_name in sample_set_names_list:
        print("Handling %s" % sample_set_name)
        statistics_dict[sample_set_name] = OrderedDict({})
        os.chdir(workdir)
        os.system("mkdir -p %s" % sample_set_name)
        os.chdir(sample_set_name)
        os.system("mkdir -p %s" % clustering_dir)

        mutations = CollectionVCF(in_file="../../%s_SNP.vcf" % sample_set_name,
                                  from_file=True)

        mutations.get_location(annotations_dict,
                               use_synonym=True,
                               synonym_dict=annotation_synonym_dict)
        mutations.set_location_flag(bad_region_dict, check_location, "BR")
Example #20
                    dest="output",
                    required=True,
                    help="File to write statistics")
parser.add_argument(
    "-l",
    "--log_file",
    action="store",
    dest="log_file",
    default="trimmomatic.log",
    help="Name of files with trimmomatic log. Default - trimmomatic.log")

args = parser.parse_args()

samples = sorted(
    args.samples.split(",") if args.samples else os.listdir(args.samples_dir))
present_samples = []
for sample in samples:
    if os.path.isdir(args.samples_dir + sample):
        present_samples.append(sample)

reports_dict = TwoLvlDict()

for sample in present_samples:
    print("Handling report from %s" % sample)

    sample_dir = "%s%s/" % (args.samples_dir, sample)
    trimmomatic_log = "%s%s" % (sample_dir, args.log_file)
    reports_dict[sample] = Trimmomatic.parse_log(trimmomatic_log)

reports_dict.write(args.output)
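Trimmomatic.parse_log() is project-internal; a hedged sketch of pulling the pair counts out of the summary line TrimmomaticPE writes to its log (the key names are invented):

import re

def parse_pair_counts(log_line):
    # Summary line looks like: "Input Read Pairs: 1000 Both Surviving: 950
    # (95.00%) Forward Only Surviving: 20 (2.00%) Reverse Only Surviving: 10
    # (1.00%) Dropped: 20 (2.00%)"
    numbers = re.findall(r":\s+(\d+)", log_line)
    keys = ("input_pairs", "both_surviving", "forward_only",
            "reverse_only", "dropped")
    return dict(zip(keys, map(int, numbers)))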
Example #21
                    action="store",
                    dest="species_dir",
                    default="./",
                    type=FileRoutines.check_path,
                    help="Directory with families of species")
"""
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file. Default: stdout")
"""
args = parser.parse_args()

# run after scripts/expansion/compare_cluster.py

# out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

species_syn_dict = TwoLvlDict()

for species in args.species_list:
    species_syn_dict[species] = read_synonyms_dict("%s%s/all.t" %
                                                   (args.species_dir, species))

species_syn_dict.write("families_all_species.t", absent_symbol=".")

not_assembled = species_syn_dict.filter_by_line(is_assembled)
species_syn_dict.write("correctly_assembled_families_species.t",
                       absent_symbol=".")

assembled_ids = IdSet(species_syn_dict.sl_keys())
assembled_ids.write("assembled_families.ids")
not_assembled_ids = IdSet(not_assembled.sl_keys())
not_assembled_ids.write("non_assembled_families.ids")
Example #22
    def star_and_htseq(self,
                       genome_dir,
                       samples_directory,
                       output_directory,
                       gff_for_htseq,
                       count_table_file,
                       genome_fasta=None,
                       samples_to_handle=None,
                       genome_size=None,
                       annotation_gtf=None,
                       feature_from_gtf_to_use_as_exon=None,
                       exon_tag_to_use_as_transcript_id=None,
                       exon_tag_to_use_as_gene_id=None,
                       length_of_sequences_flanking_junction=None,
                       junction_tab_file_list=None,
                       three_prime_trim=None,
                       five_prime_trim=None,
                       adapter_seq_for_three_prime_clip=None,
                       max_mismatch_percent_for_adapter_trimming=None,
                       three_prime_trim_after_adapter_clip=None,
                       output_type="BAM",
                       sort_bam=True,
                       max_memory_for_bam_sorting=None,
                       include_unmapped_reads_in_bam=True,
                       output_unmapped_reads=True,
                       two_pass_mode=False,
                       star_dir=None,
                       threads=1,
                       max_intron_length=None,
                       stranded_rnaseq="yes",
                       min_alignment_quality=10,
                       feature_type_for_htseq="exon",
                       feature_id_attribute_for_htseq="gene_id",
                       htseq_mode="union"):

        STAR.threads = threads
        STAR.path = star_dir

        if genome_fasta:
            STAR.index(genome_dir,
                       genome_fasta,
                       annotation_gtf=None,
                       junction_tab_file=None,
                       sjdboverhang=None,
                       genomeSAindexNbases=None,
                       genomeChrBinNbits=None,
                       genome_size=genome_size)

        sample_list = samples_to_handle if samples_to_handle else self.get_sample_list(
            samples_directory)
        self.prepare_diff_expression_directories(output_directory, sample_list)

        alignment_dir = "%s/alignment/" % output_directory

        count_table = TwoLvlDict()
        for sample in sample_list:
            print("Handling %s" % sample)
            sample_dir = "%s/%s/" % (samples_directory, sample)
            alignment_sample_dir = "%s/%s/" % (alignment_dir, sample)
            filetypes, forward_files, reverse_files = self.make_lists_forward_and_reverse_files(
                sample_dir)

            print "\tAligning reads..."

            STAR.align(
                genome_dir,
                forward_files,
                reverse_read_list=reverse_files,
                annotation_gtf=annotation_gtf,
                feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                exon_tag_to_use_as_transcript_id=exon_tag_to_use_as_transcript_id,
                exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                length_of_sequences_flanking_junction=length_of_sequences_flanking_junction,
                junction_tab_file_list=junction_tab_file_list,
                three_prime_trim=three_prime_trim,
                five_prime_trim=five_prime_trim,
                adapter_seq_for_three_prime_clip=adapter_seq_for_three_prime_clip,
                max_mismatch_percent_for_adapter_trimming=max_mismatch_percent_for_adapter_trimming,
                three_prime_trim_after_adapter_clip=three_prime_trim_after_adapter_clip,
                output_type=output_type,
                sort_bam=sort_bam,
                max_memory_for_bam_sorting=max_memory_for_bam_sorting,
                include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                output_unmapped_reads=output_unmapped_reads,
                output_dir=alignment_sample_dir,
                two_pass_mode=two_pass_mode,
                max_intron_length=max_intron_length)

            alignment_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_dir

            print "\tIndexing alignment file..."
            os.system("samtools index %s" % alignment_file)

            print "\tCounting reads aligned to features..."
            count_file = "%s/%s.htseq.count" % (alignment_sample_dir, sample)

            HTSeq.count(alignment_file,
                        gff_for_htseq,
                        count_file,
                        samtype="bam",
                        order="pos",
                        stranded_rnaseq=stranded_rnaseq,
                        min_alignment_quality=min_alignment_quality,
                        feature_type=feature_type_for_htseq,
                        feature_id_attribute=feature_id_attribute_for_htseq,
                        mode=htseq_mode,
                        suppress_progres_report=False)

            sample_counts = SynDict()
            sample_counts.read(count_file,
                               header=False,
                               separator="\t",
                               allow_repeats_of_key=False,
                               split_values=False,
                               values_separator=",",
                               key_index=0,
                               value_index=1,
                               close_after_if_file_object=False,
                               expression=None,
                               comments_prefix="__")
            count_table[sample] = sample_counts

        count_table.write(count_table_file)
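The method shells out via os.system to index each BAM; a safer equivalent for that single step using only the standard library (the path is hypothetical):

import subprocess

# Same effect as os.system("samtools index %s" % alignment_file), but without
# a shell; raises CalledProcessError if samtools exits with a non-zero code.
alignment_file = "alignment/sample_1/Aligned.sortedByCoord.out.bam"
subprocess.check_call(["samtools", "index", alignment_file])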
Example #23
    def get_taxa_genomes_summary(self, taxa, email, output_directory, output_prefix,
                                 max_ids_per_query=8000, max_download_attempts=500,
                                 min_scaffold_n50=None, min_contig_n50=None, max_scaffold_l50=None,
                                 max_contig_l50=None, max_contig_count=None, max_scaffold_count=None,
                                 max_chromosome_count=None, min_chromosome_count=None, max_unlocalized_scaffolds=None,
                                 max_unplaced_scaffolds=None, max_total_length=None, min_total_length=None,
                                 max_ungapped_length=None, min_ungapped_length=None,
                                 no_ambiguous_species=True):
        Entrez.email = email
        taxa_list = [taxa] if isinstance(taxa, str) else taxa

        all_files_dir = "%s%s/" % (self.check_path(output_directory), "all")
        nonambiguous_species_all_dir = "%snonambiguous_species_all/" % self.check_path(output_directory)
        ambiguous_species_all_dir = "%s%s/" % (self.check_path(output_directory), "ambiguous_species_all")
        chromosome_lvl_dir = "%s%s/" % (self.check_path(output_directory), "chromosome_lvl")
        non_chromosome_lvl_dir = "%s%s/" % (self.check_path(output_directory), "nonchromosome_lvl")

        filtered_by_integrity_dir = "%s%s/" % (self.check_path(output_directory), "passed_integrity_filters")
        filtered_out_by_integrity_dir = "%s%s/" % (self.check_path(output_directory), "not_passed_integrity_filters")

        stat_dir = "%s%s/" % (self.check_path(output_directory), "stat")
        taxa_stat_dir = "%s%s/" % (self.check_path(output_directory), "taxa_stat")
        for subdir in (all_files_dir, chromosome_lvl_dir, non_chromosome_lvl_dir, stat_dir,
                       taxa_stat_dir, nonambiguous_species_all_dir, ambiguous_species_all_dir):
            self.safe_mkdir(subdir)

        filter_by_integrity = min_scaffold_n50 or min_contig_n50 or max_scaffold_l50 or max_contig_l50 \
                              or max_contig_count or max_scaffold_count or max_chromosome_count \
                              or min_chromosome_count or max_unlocalized_scaffolds \
                              or max_unplaced_scaffolds or max_total_length or min_total_length \
                              or max_ungapped_length or min_ungapped_length

        if filter_by_integrity:
            for subdir in (filtered_by_integrity_dir, filtered_out_by_integrity_dir):
                self.safe_mkdir(subdir)

        for taxon in taxa_list:
            search_term = "%s[Orgn]" % taxon

            attempt_counter = 1
            while True:
                try:
                    summary = Entrez.read(Entrez.esearch(db="genome", term=search_term, retmax=10000, retmode="xml"))
                    break
                except URLError:
                    if attempt_counter > max_download_attempts:
                        raise URLError("Network problems. Maximum attempt number is exceeded")
                    print "URLError. Retrying... Attempt %i" % attempt_counter
                    attempt_counter += 1

            print "Were found %s species" % summary["Count"]
            #print summary

            taxon_stat_file = "%s/%s.stat" % (taxa_stat_dir, taxon.replace(" ", "_"))
            taxon_stat_dict = TwoLvlDict()

            for species_id in summary["IdList"]: #[167] :
                print "Handling species id %s " % species_id

                species_stat_file = "%s/%s.stat" % (stat_dir, species_id)
                species_stat_dict = TwoLvlDict()
                species_stat_dict[species_id] = OrderedDict()

                taxon_stat_dict[species_id] = OrderedDict()

                for stat in "all", "chromosome_lvl", "non_chromosome_lvl":
                    species_stat_dict[species_id][stat] = 0
                    taxon_stat_dict[species_id][stat] = 0
                #species_summary = Entrez.read(Entrez.esummary(db="genome", id=species_id, retmax=10000, retmode="xml"))
                #print species_summary

                # get assemblies linked with genome of species

                attempt_counter = 1
                while True:
                    try:
                        assembly_links = Entrez.read(Entrez.elink(dbfrom="genome", id=species_id, retmode="xml",
                                                                  retmax=10000, linkname="genome_assembly"))
                        break
                    except URLError:
                        if attempt_counter > max_download_attempts:
                            raise URLError("Network problems. Maximum attempt number is exceeded")
                        print "URLError. Retrying... Attempt %i" % attempt_counter
                        attempt_counter += 1

                assembly_number = len(assembly_links)
                #print links
                #print links[0]["LinkSetDb"][0]["Link"]
                if assembly_links:
                    if "LinkSetDb" in assembly_links[0]:
                        if assembly_links[0]["LinkSetDb"]:
                            if "Link" in assembly_links[0]["LinkSetDb"][0]:
                                assembly_ids = [id_dict["Id"] for id_dict in assembly_links[0]["LinkSetDb"][0]["Link"]]
                            else:
                                continue
                        else:
                            continue
                    else:
                        continue
                else:
                    continue
                number_of_ids = len(assembly_ids)

                print "\tFound %i assemblies" % number_of_ids

                id_group_edges = np.arange(0, number_of_ids+1, max_ids_per_query)

                if id_group_edges[-1] != number_of_ids:
                    id_group_edges = np.append(id_group_edges, number_of_ids)
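
                # e.g. with number_of_ids = 20000 and max_ids_per_query = 8000,
                # np.arange yields [0, 8000, 16000]; the append adds the tail,
                # giving edges [0, 8000, 16000, 20000], i.e. three query batches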

                number_of_id_groups = len(id_group_edges) - 1

                #print len(assembly_links[0]["LinkSetDb"][0]["Link"])
                #print assembly_ids
                #print len(assembly_ids)
                #assembly_dict = TwoLvlDict()
                #assemblies_with_ambiguous_taxonomies = SynDict()
                #summaries = Entrez.read(Entrez.esummary(db="assembly", id=",".join(assembly_ids), retmode="xml"))

                summary_list = None
                for i in range(0, number_of_id_groups):
                    print "\tDownloading summary about assemblies %i - %i" % (id_group_edges[i]+1, id_group_edges[i+1])
                    #print len(assembly_ids[id_group_edges[i]:id_group_edges[i+1]])
                    summaries = Entrez.read(Entrez.esummary(db="assembly",
                                                            id=",".join(assembly_ids[id_group_edges[i]:id_group_edges[i+1]]),
                                                            retmode="xml"), validate=False)
                    tmp_summary_list = AssemblySummaryList(entrez_summary_biopython=summaries)
                    summary_list = (summary_list + tmp_summary_list) if summary_list else tmp_summary_list

                print "\tDownloaded %i" % len(summary_list)

                if len(summary_list) != number_of_ids:
                    print "\tWARNING:Not all assemblies were downloaded"
                    """
                    print "\tFollowing assemblies were not downloaded(ids):%s" % ",".join(set())
                    """

                if summary_list:
                    species_stat_dict[species_id]["all"] = len(summary_list)
                    taxon_stat_dict[species_id]["all"] = len(summary_list)
                    output_file = "%s%s.genome.summary" % ((output_prefix + ".") if output_prefix else "", species_id)
                                                           #summary_list[0]['SpeciesName'].replace(" ", "_"))

                    all_output_file = "%s/%s" % (all_files_dir, output_file)
                    chromosome_lvl_output_file = "%s/%s" % (chromosome_lvl_dir, output_file)
                    non_chromosome_lvl_output_file = "%s/%s" % (non_chromosome_lvl_dir, output_file)
                    nonambiguous_species_output_file = "%s/%s" % (nonambiguous_species_all_dir, output_file)
                    ambiguous_species_output_file = "%s/%s" % (ambiguous_species_all_dir, output_file)
                    chromosome_lvl_summary_list, non_chromosome_lvl_summary_list = summary_list.filter_non_chrom_level_genomes()
                    filtered_by_integrity_file = "%s/%s" % (filtered_by_integrity_dir, output_file)
                    filtered_out_by_integrity_file = "%s/%s" % (filtered_out_by_integrity_dir, output_file)

                    species_stat_dict[species_id]["chromosome_lvl"] = len(chromosome_lvl_summary_list)
                    taxon_stat_dict[species_id]["chromosome_lvl"] = len(chromosome_lvl_summary_list)
                    species_stat_dict[species_id]["non_chromosome_lvl"] = len(non_chromosome_lvl_summary_list)
                    taxon_stat_dict[species_id]["non_chromosome_lvl"] = len(non_chromosome_lvl_summary_list)

                    print("\tChromosome level assemblies %i" % species_stat_dict[species_id]["chromosome_lvl"])
                    print("\tNon chromosome level assemblies %i" % species_stat_dict[species_id]["non_chromosome_lvl"])

                    if chromosome_lvl_summary_list:
                        chromosome_lvl_summary_list.write(chromosome_lvl_output_file)

                    if non_chromosome_lvl_summary_list:
                        non_chromosome_lvl_summary_list.write(non_chromosome_lvl_output_file)

                    nonambiguous_species_summary_list, ambiguous_species_summary_list = summary_list.filter_ambiguous_species()
                    #print(len(nonambiguous_species_summary_list), len(ambiguous_species_summary_list))
                    species_stat_dict[species_id]["nonambiguous_species"] = len(nonambiguous_species_summary_list)
                    species_stat_dict[species_id]["ambiguous_species"] = len(ambiguous_species_summary_list)
                    print "\tAmbiguous species %i" % species_stat_dict[species_id]["ambiguous_species"]
                    if nonambiguous_species_summary_list:
                        nonambiguous_species_summary_list.write(nonambiguous_species_output_file)
                    if ambiguous_species_summary_list:
                        ambiguous_species_summary_list.write(ambiguous_species_output_file)

                    summary_list.write(all_output_file)

                    if filter_by_integrity:
                        filtered_by_integrity, filtered_out_by_integrity = summary_list.filter_by_integrity(min_scaffold_n50=min_scaffold_n50,
                                                                                                            min_contig_n50=min_contig_n50,
                                                                                                            max_scaffold_l50=max_scaffold_l50,
                                                                                                            max_contig_l50=max_contig_l50,
                                                                                                            max_contig_count=max_contig_count,
                                                                                                            max_scaffold_count=max_scaffold_count,
                                                                                                            max_chromosome_count=max_chromosome_count,
                                                                                                            min_chromosome_count=min_chromosome_count,
                                                                                                            max_unlocalized_scaffolds=max_unlocalized_scaffolds,
                                                                                                            max_unplaced_scaffolds=max_unplaced_scaffolds,
                                                                                                            max_total_length=max_total_length,
                                                                                                            min_total_length=min_total_length,
                                                                                                            max_ungapped_length=max_ungapped_length,
                                                                                                            min_ungapped_length=min_ungapped_length,
                                                                                                            no_ambiguous_species=no_ambiguous_species)
                        species_stat_dict[species_id]["filtered_by_integrity"] = len(filtered_by_integrity)
                        species_stat_dict[species_id]["filtered_out_by_integrity"] = len(filtered_out_by_integrity)
                        if filtered_by_integrity:
                            filtered_by_integrity.write(filtered_by_integrity_file)
                        if filtered_out_by_integrity:
                            filtered_out_by_integrity.write(filtered_out_by_integrity_file)
                        print "\tPassed integrity filters %i" % species_stat_dict[species_id]["filtered_by_integrity"]
                species_stat_dict.write(species_stat_file)

                print "\n\n"

            taxon_stat_dict.write(taxon_stat_file)

            """
Example #24
                    "--header",
                    action="store_true",
                    dest="header",
                    help="Header is present in input file")
parser.add_argument("-o",
                    "--output_file",
                    action="store",
                    dest="output",
                    default="stdout",
                    help="Output file. Default: stdout")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

species_syn_dict = TwoLvlDict()
out_fd.write(
    "#family\tspecies_with_family\tspecies_with_errors\tspecies_with_correct_fam\terror_ratio\n"
)
with open(args.input, "r") as in_fd:
    if args.header:
        in_fd.readline()
    for line in in_fd:
        species_with_errors = 0
        species_with_fam = 0
        tmp = line.strip().split("\t")
        family_name = tmp[0]
        for fam in tmp[1:]:
            if fam != ".":
                species_with_fam += 1
            if "_" in fam:
    #"PmCDA1_sub1_3d",
    #"PmCDA1_6d",
    "HAP_sub1",
    #"PmCDA1_sub1_6d",
    #"A1_3d",
    #"A1_6d",
    #"A3G_3d",
    #"AID_3d",
    #"AID_6d"
]
power_limits = [float(f) / 100 for f in range(1, 11)]
size_limits = list(range(3, 11))

os.chdir(workdir)
for sample_set in sample_set_names_list:
    stat_dict = TwoLvlDict(OrderedDict({}))
    print("Handling %s" % sample_set)
    all_clusters = CollectionCCF(from_file=True,
                                 input_file=workdir + all_files_subdir +
                                 sample_set + all_files_suffix)
    if "HAP" not in sample_set:
        all_clusters.check_strandness()
    for min_size in size_limits:
        stat_dict[min_size] = OrderedDict({})
        os.system("mkdir -p %i %i/all " % (min_size, min_size))
        above_size_clusters, below_size_clusters = all_clusters.filter_by_expression(
            "record.size >= %i" % min_size)
        above_size_clusters.write(
            "%i/all/%s_size_%i+%s" %
            (min_size, sample_set, min_size, all_files_suffix))
        stat_dict[min_size][0.00] = len(above_size_clusters)
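Judging by the call above, filter_by_expression() splits a collection into the records that pass and fail the expression; a sketch of the same split with a plain list and a callable in place of the eval'd string:

def split_by_predicate(records, predicate):
    passed, failed = [], []
    for record in records:
        (passed if predicate(record) else failed).append(record)
    return passed, failed

above, below = split_by_predicate([3, 7, 12], lambda size: size >= 5)
print(above, below)    # -> [7, 12] [3]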
Example #26
parser.add_argument("-c",
                    "--convert_aa_to_single_letter",
                    action="store_true",
                    dest="convert_to_single_letter",
                    help="Convert aminoacids to single letters")

args = parser.parse_args()

args.input = make_list_of_path_to_files(args.input)

gene_alias_dict = SynDict()
if args.gene_alias_file:
    gene_alias_dict.read(args.gene_alias_file, split_values=False)
out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

summary_dict = TwoLvlDict()
for filename in args.input:
    directory, prefix, extension = split_filename(filename)

    if args.write_dir_path and args.write_ext:
        name = filename
    elif args.write_dir_path:
        name = (directory + prefix) if directory else prefix
    elif args.write_ext:
        name = prefix + extension
    else:
        name = prefix
        if args.suffix_to_remove in name:
            name = name.replace(args.suffix_to_remove, "")
    summary_dict[name] = OrderedDict()
    with open(filename, "r") as file_fd:
Example #27
      len(filtered_out_report.records))
if args.ref_species_gene_file:
    reference_genes_dict = {}
    with open(args.ref_species_gene_file, "r") as ref_fd:
        for line in ref_fd:
            gene_family_id, genes = line.strip().split("\t")
            genes = [] if genes == "." else genes.split(",")
            reference_genes_dict[gene_family_id] = [genes[:]]
            if genes:
                reference_genes_dict[gene_family_id].append(choice(genes))
                # print gene_family_id
                #print reference_genes_dict[gene_family_id]

node_header_list = features_list + ["reference_gene"]
delta_index = features_list.index("delta")
statistics_dict = TwoLvlDict({})

for node_id in node_values:
    statistics_dict[node_id] = OrderedDict({
        "lost": 0,
        "new": 0,
        "lost_ref_ann": 0,
        "new_ref_ann": 0
    })

for node_id in node_values:
    fd_list = []
    for directory in node_info_dir, node_ref_dir:
        for mode in "all", "new", "lost":
            fd_list.append(
                open(