Example #1
    def combine_count_files(count_file_list,
                            output_file,
                            sample_name_list=None):

        if sample_name_list is not None:
            if len(count_file_list) != len(sample_name_list):
                raise ValueError(
                    "Number of count files does not match number of sample names")

        samples = zip(
            sample_name_list if sample_name_list else count_file_list,
            count_file_list)

        count_table = TwoLvlDict()

        for sample, filename in samples:
            count_table[sample] = SynDict(filename=filename,
                                          header=False,
                                          separator="\t",
                                          allow_repeats_of_key=False,
                                          split_values=False,
                                          values_separator=",",
                                          key_index=0,
                                          value_index=1,
                                          close_after_if_file_object=False,
                                          expression=None,
                                          comments_prefix="__")

        count_table.write(output_file)
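
For reference, a standard-library-only sketch of the same combining idea (not the package's own SynDict/TwoLvlDict machinery), assuming each count file is a two-column, tab-separated table (id <tab> count) and that rows starting with "__" are summary lines to skip:

import csv
from collections import OrderedDict


def combine_count_files_sketch(count_file_list, output_file, sample_name_list=None):
    sample_names = sample_name_list if sample_name_list else count_file_list
    if len(sample_names) != len(count_file_list):
        raise ValueError("Number of count files does not match number of sample names")

    count_table = OrderedDict()   # {sample: {feature_id: count}}
    feature_ids = OrderedDict()   # preserves feature order across samples
    for sample, filename in zip(sample_names, count_file_list):
        count_table[sample] = OrderedDict()
        with open(filename) as in_fd:
            for row in csv.reader(in_fd, delimiter="\t"):
                if (not row) or row[0].startswith("__"):
                    continue
                count_table[sample][row[0]] = row[1]
                feature_ids[row[0]] = None

    with open(output_file, "w") as out_fd:
        out_fd.write("id\t%s\n" % "\t".join(count_table))
        for feature in feature_ids:
            out_fd.write("%s\t%s\n" % (feature,
                                       "\t".join(count_table[s].get(feature, "0")
                                                 for s in count_table)))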
Example #2
    def count_locations(self,
                        annotation_black_list=[],
                        allow_several_counts_of_record=False,
                        out_filename="location_counts.t",
                        write=True,
                        count_dir="location_counts"):
        os.system("mkdir -p %s" % count_dir)
        regions_dict = self._split_regions()
        region_counts_dict = TwoLvlDict({})
        for region in regions_dict:
            count_locations_dict = {"igc": 0, "unknown": 0}

            for record in regions_dict[region]:
                if ("Loc" not in record.description) or (
                        not record.description["Loc"]):
                    count_locations_dict["unknown"] += 1
                    continue
                #print(record.description["Loc"])
                if allow_several_counts_of_record:
                    for location in record.description["Loc"]:
                        if location in annotation_black_list:
                            continue
                        if location not in count_locations_dict:
                            count_locations_dict[location] = 1
                        else:
                            count_locations_dict[location] += 1
                else:
                    full_location = []
                    for location in record.description["Loc"]:
                        if location in annotation_black_list:
                            continue
                        full_location.append(location)
                    if not full_location:
                        continue
                    full_location.sort()
                    full_location = "/".join(full_location)
                    if full_location not in count_locations_dict:
                        count_locations_dict[full_location] = 1
                    else:
                        count_locations_dict[full_location] += 1

            labels = []
            counts = []
            #colors = []
            for location in count_locations_dict:
                if count_locations_dict[
                        location] == 0 or location in annotation_black_list:
                    continue
                labels.append(location)
                counts.append(count_locations_dict[location])
            region_counts_dict[region] = OrderedDict([
                (label, count) for label, count in zip(labels, counts)
            ])

        if write:
            region_counts_dict.write("%s/%s" % (count_dir, out_filename))
        return region_counts_dict
Example #3
def results_extraction_listener(queue,
                                output_file_prefix,
                                selected_species_list=None):
    """listens for messages on the queue, writes to file."""

    positive_selection_dict = TwoLvlDict()
    selected_species_positive_selection_dict = TwoLvlDict()
    error_fd = open("errors.err", "w")
    error_fd.write("#sample\terror_code\n")
    while 1:
        result = queue.get()
        if isinstance(result[1], int):
            error_fd.write("%s\t%i\n" % (result[0], result[1]))
            continue
        if result == 'finish':
            positive_selection_dict.write("%s.all" % output_file_prefix,
                                          absent_symbol=".")
            if selected_species_list:
                selected_species_positive_selection_dict.write(
                    "%s.selected_species" % output_file_prefix,
                    absent_symbol=".")
            # print positive_selection_dict.table_form(absent_symbol=".")
            break
        if result[1]:
            positive_selection_dict[result[0]] = result[1]
            if selected_species_list:
                for species in selected_species_list:
                    if species in result[1]:
                        if result[
                                0] not in selected_species_positive_selection_dict:
                            selected_species_positive_selection_dict[
                                result[0]] = {}
                        selected_species_positive_selection_dict[
                            result[0]][species] = result[1][species]
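
A self-contained sketch of the same queue/sentinel pattern using only the standard library (the real listener above additionally accumulates TwoLvlDict tables and an error log); names and the dummy payload are illustrative:

import multiprocessing as mp


def toy_listener(queue, output_file):
    """Consumes (sample, result) tuples until the 'finish' sentinel arrives."""
    with open(output_file, "w") as out_fd:
        while True:
            message = queue.get()
            if message == "finish":
                break
            out_fd.write("%s\t%s\n" % message)


if __name__ == "__main__":
    manager = mp.Manager()
    queue = manager.Queue()
    listener_process = mp.Process(target=toy_listener, args=(queue, "results.tab"))
    listener_process.start()
    for sample in ("sample_1", "sample_2"):
        queue.put((sample, "ok"))   # worker processes would normally put results here
    queue.put("finish")             # sentinel, as in the listener above
    listener_process.join()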
Example #4
    def get_general_stats(self):
        stat_dict = TwoLvlDict()

        for report_id in self:
            stat_dict[report_id] = OrderedDict()
            stat_dict[report_id]["machine_number"] = len(
                self[report_id].machine_id_list)
            stat_dict[report_id]["machine_ids"] = self[
                report_id].machine_id_list
            stat_dict[report_id]["flowcell_number"] = len(
                self[report_id].flowcell_id_list)
            stat_dict[report_id]["flowcell_ids"] = self[
                report_id].flowcell_id_list
            stat_dict[report_id]["lane_number"] = len(
                self[report_id].lane_table)
            stat_dict[report_id]["full_lane_ids"] = self[
                report_id].full_lane_id_list
            stat_dict[report_id]["short_lane_ids"] = self[
                report_id].short_lane_id_list
            stat_dict[report_id]["input_pairs"] = self[report_id].input_pairs
            stat_dict[report_id]["retained_pairs"] = self[
                report_id].retained_pairs
            stat_dict[report_id]["retained_pairs_fraction"] = self[
                report_id].retained_pairs_fraction
            stat_dict[report_id]["retained_forward_only"] = self[
                report_id].retained_forward_only
            stat_dict[report_id]["retained_reverse_only"] = self[
                report_id].retained_reverse_only
            stat_dict[report_id]["both_discarded"] = self[
                report_id].both_discarded
            stat_dict[report_id][
                "min_retained_pairs_in_tiles_fraction"] = self[
                    report_id].minimum_retained_pairs_in_tiles_fraction

        return stat_dict
Example #5
    def write_stats(self, output_prefix):
        Ns_dict = TwoLvlDict()
        gaps_dict = TwoLvlDict()
        for record_id in self.records:
            Ns_dict[self.records[record_id].id] = self.records[record_id].N_counts
            gaps_dict[self.records[record_id].id] = self.records[record_id].gap_counts

        Ns_dict.write(out_filename="%s.N_counts" % output_prefix)
        gaps_dict.write(out_filename="%s.gaps_counts" % output_prefix)
Example #6
    def count_types(self, output_file=None, total_output_file=None, return_mode="chrom"):

        annotated_types = self.get_annotated_types()
        count_dict = TwoLvlDict()
        total_count_dict = OrderedDict()

        for type in annotated_types:
            total_count_dict[type] = OrderedDict()
            total_count_dict[type]["complete"] = 0
            total_count_dict[type]["partial"] = 0

        for chrom in self.records:
            count_dict[chrom] = OrderedDict()
            for type in annotated_types:
                count_dict[chrom][type] = 0

        for chrom in self.records:
            for record in self.records[chrom]:
                count_dict[chrom][record.type] += 1
                if record.partial:
                    total_count_dict[record.type]["partial"] += 1
                else:
                    total_count_dict[record.type]["complete"] += 1

        if output_file:
            count_dict.write(output_file)

        if total_output_file:
            with open(total_output_file, "w") as out_fd:
                out_fd.write("#rRNA\tComplete%s\tPartial%s\n" % ("(>%.2f of expected length)" % self.partial_threshold if self.partial_threshold else "",
                                                                 "(<%.2f of expected length)" % self.partial_threshold if self.partial_threshold else ""))
                for type in total_count_dict:
                    out_fd.write("%s\t%i\t%i\n" % (type, total_count_dict[type]["complete"],
                                                   total_count_dict[type]["partial"]))

        if return_mode == "chrom":
            return count_dict
        elif return_mode == "total":
            return total_count_dict
        elif return_mode == "both":
            return count_dict, total_count_dict
        else:
            raise ValueError("Unknown return type. Allowed variants: 'chrom', 'total', 'both'")
Example #7
    def count_reads_and_bases(self, fastq_file_list, stat_file=None):

        fastq_list = [fastq_file_list] if isinstance(fastq_file_list, str) else fastq_file_list

        counts = TwoLvlDict()

        for fastq_file in fastq_list:
            counts[fastq_file] = OrderedDict()
            counts[fastq_file]["Reads"] = 0
            counts[fastq_file]["Bases"] = 0

        for fastq_file in fastq_list:
            with self.metaopen(fastq_file, "r") as fastq_fd:
                for line in fastq_fd:                                        # header line (@read_id)
                    counts[fastq_file]["Bases"] += len(fastq_fd.readline())  # sequence line (newline included)
                    counts[fastq_file]["Reads"] += 1
                    fastq_fd.readline()                                      # '+' separator line
                    fastq_fd.readline()                                      # quality line

                # to take into account "\n" at the end of each line
                counts[fastq_file]["Bases"] = counts[fastq_file]["Bases"] - counts[fastq_file]["Reads"]

        counts.write()

        if stat_file:
            counts.write(stat_file)

        return counts
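
A standard-library-only sketch of the same per-record counting, assuming plain (uncompressed) FASTQ with one record per four lines; the original relies on the package's metaopen() and TwoLvlDict instead:

from collections import OrderedDict


def count_reads_and_bases_sketch(fastq_files):
    counts = OrderedDict()
    for fastq_file in fastq_files:
        reads, bases = 0, 0
        with open(fastq_file) as fastq_fd:
            while True:
                header = fastq_fd.readline()                    # @read_id line
                if not header:
                    break                                       # end of file
                bases += len(fastq_fd.readline().rstrip("\n"))  # sequence line
                fastq_fd.readline()                             # '+' separator line
                fastq_fd.readline()                             # quality line
                reads += 1
        counts[fastq_file] = OrderedDict([("Reads", reads), ("Bases", bases)])
    return counts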
Example #8
    def get_general_stats(self):
        stat_dict = TwoLvlDict()

        for report_id in self:
            stat_dict[report_id] = OrderedDict()

            stat_dict[report_id]["input_pairs"] = self[report_id].input_pairs
            stat_dict[report_id]["pairs_without_adapters"] = self[
                report_id].retained_pairs
            stat_dict[report_id]["pairs_without_adapters_fraction"] = self[
                report_id].retained_pairs_fraction

        return stat_dict
Example #9
def get_results(samples_list, data_type):
    results = TwoLvlDict()

    for sample in samples_list:
        results[sample] = OrderedDict()
        filename = "%s/all_reads/%s_all_%s_coverage.tab" % (sample, sample, data_type)
        data = read_data(filename)
        if not data:
            print(sample)
            continue
        #print sample
        for gene in data:
            results[sample][gene] = data[gene]

        for proportions, name in zip([[1, 2], [2, 1], [1, 1]], ["1:2", "2:1", "1:1"]):
            chi_results = calculate_chi_squared(data, proportions)
            #print name
            results[sample][name + " Chi"] = chi_results[0]
            results[sample][name + " p-value"] = chi_results[1]
            #print chi_results
    return results
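
calculate_chi_squared() and read_data() are not shown above; a hedged sketch of what such a test might look like with scipy.stats.chisquare, assuming (purely for illustration) that each gene maps to a pair of coverage values whose ratio is tested against the given proportions:

from scipy.stats import chisquare


def calculate_chi_squared_sketch(data, proportions):
    # data: {gene: (coverage_1, coverage_2)} -- an assumed input layout
    observed = [sum(pair[0] for pair in data.values()),
                sum(pair[1] for pair in data.values())]
    total = float(sum(observed))
    ratio_sum = float(sum(proportions))
    expected = [total * p / ratio_sum for p in proportions]
    return chisquare(observed, f_exp=expected)  # (statistic, p-value), as unpacked above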
Example #10
    def get_general_stats(self):
        stat_dict = TwoLvlDict()

        for report_id in self:
            stat_dict[report_id] = OrderedDict()

            stat_dict[report_id]["Number of distinct kmers"] = self[report_id]["Number of distinct kmers"]
            stat_dict[report_id]["Number of distinct kmers"] = self[report_id]["Number of distinct kmers"]
            stat_dict[report_id]["Fraction of distinct kmers with errors"] = self[report_id]["Fraction of distinct kmers with errors"]

            stat_dict[report_id]["Total number of kmers"] = self[report_id]["Total number of kmers"]
            stat_dict[report_id]["Total number of kmers with errors"] = self[report_id]["Total number of kmers with errors"]
            stat_dict[report_id]["Fraction of kmers with errors"] = self[report_id]["Fraction of kmers with errors"]

            stat_dict[report_id]["Width of first peak"] = self[report_id]["Width of first peak"]
            stat_dict[report_id]["Mean kmer multiplicity in first peak"] = self[report_id]["Mean kmer multiplicity in first peak"]
            stat_dict[report_id]["Kmer multiplicity at first maximum "] = self[report_id]["Kmer multiplicity at first maximum"]
            stat_dict[report_id]["Standard deviation of kmer multiplicity in first peak"] = self[report_id]["Standard deviation of kmer multiplicity in first peak"]
            stat_dict[report_id]["Variance coefficient of kmer multiplicity in first peak"] = self[report_id]["Variance coefficient of kmer multiplicity in first peak"]
            if "Estimated genome size, bp" in self[report_id]:
                stat_dict[report_id]["Estimated genome size,bp"] = self[report_id]["Estimated genome size, bp"]

        return stat_dict
Example #11
from Parsers.CCF import CollectionCCF


def get_intersection_length(start1, end1, start2, end2):
    if start1 - end2 > 0 or start2 - end1 > 0:
        return 0
    start_shift = start1 - start2
    start_coef_shift = 0 if start_shift < 0 else 1
    end_shift = end1 - end2
    end_coef_shift = 0 if end_shift > 0 else 1

    return (end2 - start2 +
            1) - start_coef_shift * start_shift + end_coef_shift * end_shift
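
A quick sanity check of the overlap helper (coordinates treated as inclusive on both ends; values verified by hand):

print(get_intersection_length(10, 20, 15, 30))  # overlap 15..20 -> 6
print(get_intersection_length(10, 20, 25, 30))  # disjoint intervals -> 0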


overlap_clusters_percent = TwoLvlDict({})
#size = 8
#power = 0.05
print([float(f) / float(100) for f in range(1, 11)])
for size in range(3, 11):
    overlap_clusters_percent[size] = {}
    for power in [float(f) / float(100) for f in range(1, 11)]:
        PmCDA1_3d_clusters = CollectionCCF(
            from_file=True,
            input_file=
            "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_3d_size_%i+_power_%.2f+_good.ccf"
            % (size, power, size, power))

        PmCDA1_3d_sub_clusters = CollectionCCF(
            from_file=True,
            input_file=
Example #12
            "Length of labels list is not equal to number of files with assemblies"
        )

assemblies_dict = OrderedDict()
for i in range(0, len(args.input_file_list)):
    assembly_label = args.labels_list[i] if args.labels_list else "A%i" % (i + 1)
    tmp_index = "%s.tmp.idx" % assembly_label
    assemblies_dict[assembly_label] = SequenceRoutines.parse_seq_file(
        args.input_file_list[i],
        args.parsing_mode,
        format=args.format,
        index_file=tmp_index)
    #SeqIO.index_db(tmp_index, args.input_file_list[i],format=args.format)

assembly_N50_dict = TwoLvlDict()
assembly_L50 = TwoLvlDict()
assembly_bins = []
assembly_contig_cumulative_length = OrderedDict()
assembly_contig_number_values = OrderedDict()
assembly_general_stats = TwoLvlDict()
assembly_length_array = OrderedDict()
assembly_lengths = TwoLvlDict()
for assembly in assemblies_dict:
    lengths_array, N50_dict, L50_dict, length_dict, total_length, longest_contig, Ns_number, bins, contig_cumulative_length_values, \
        contig_number_values = SequenceRoutines.calculate_assembly_stats(assemblies_dict[assembly],
                                                                         thresholds_list=args.thresholds,
                                                                         seq_len_file="%s.%s.len" % (args.output_prefix, assembly))
    assembly_N50_dict[assembly] = N50_dict
    assembly_L50[assembly] = L50_dict
    assembly_contig_cumulative_length[
Example #13
species_list = sorted(args.species_set)
if args.white_list_file and args.black_list_file:
    raise ValueError("Black list and white list cant be set simultaneously")

black_list = IdList()
white_list = IdList()
if args.black_list_file:
    black_list.read(args.black_list_file)
if args.white_list_file:
    white_list.read(args.white_list_file)
out_fd = open(args.cafe_file, "w")
filtered_fd = open("%sfiltered_families.cafe" % args.filtered_family_dir, "w")
out_fd.write("FAMILYDESC\tFAMILY\t%s\n" % ("\t".join(species_list)))
filtered_fd.write("FAMILYDESC\tFAMILY\t%s\n" % ("\t".join(species_list)))
species_filtered_fd_list = OrderedDict()
fam_count_dict = TwoLvlDict()
species_family_dict = TwoLvlDict()
for species in args.species_set:
    species_family_dict[species] = SynDict()
    species_family_dict[species].read(
        "%s%s%s" % (FileRoutines.check_path(args.input), species, args.suffix),
        split_values=True,
        values_separator=",",
        separator="\t")
    #print species_family_dict[species]
    fam_count_dict[species] = species_family_dict[species].count_synonyms()
    #print fam_count_dict[species]
    species_filtered_fd_list[species] = open(
        "%s%s.fam" % (args.filtered_family_dir, species), "w")

for family in fam_count_dict.sl_keys():
Example #14
                           "snoRNA": "ncRNA",
                           "snRNA": "ncRNA"
                           }
annotation_black_list = ["gene", "region", "ARS", "long_terminal_repeat",
                         "noncoding_exon", "intron", "repeat_region", "telomere", "gene_cassette",
                         "five_prime_UTR_intron"]
with open(args.annotations) as gff_fd:
    for record in GFF.parse(gff_fd):
        annotations_dict[record.id] = record

bad_region_dict = {}
with open(args.masking) as gff_fd:
    for record in GFF.parse(gff_fd):
        bad_region_dict[record.id] = record

statistics_dict = TwoLvlDict(OrderedDict({}))

print("Handling %s" % sample)
statistics_dict[sample] = OrderedDict({})

os.system("mkdir -p %s" % clustering_dir)

mutations = CollectionVCF(in_file=args.vcf_file if args.vcf_file else "%s.vcf" % args.sample_name,
                          from_file=True)

mutations.get_location(annotations_dict, use_synonym=True, synonym_dict=annotation_synonym_dict)
mutations.set_location_flag(bad_region_dict, check_location, "BR")
mutations.check_by_ref_and_alt(ref_alt_variants["deaminases"], "DA", description="Deaminase-like variant")

raw_mutations_counts = len(mutations)
print("Totaly %i mutations" % raw_mutations_counts)
Example #15
    def handle_sanger_data(self,
                           input_dir,
                           output_prefix,
                           outdir=None,
                           read_subfolders=False,
                           min_mean_qual=0,
                           min_median_qual=0,
                           min_len=50):
        if outdir:
            self.workdir = outdir

        self.init_dirs()

        sanger_filelist = self.make_list_of_path_to_files(
            input_dir,
            expression=self.is_sanger_file,
            recursive=read_subfolders,
            return_absolute_paths=True)
        stat_dict = TwoLvlDict()
        record_dict = OrderedDict()
        trimmed_record_dict = OrderedDict()
        excluded_list = IdList()
        excluded_counter = 0
        low_quality_counter = 0
        too_short_counter = 0

        merged_raw_fastq = "%s/%s.raw.fastq" % (self.workdir, output_prefix)
        merged_raw_fasta = "%s/%s.raw.fasta" % (self.workdir, output_prefix)
        merged_trimmed_fastq = "%s/%s.trimmed.fastq" % (self.workdir,
                                                        output_prefix)
        merged_trimmed_fasta = "%s/%s.trimmed.fasta" % (self.workdir,
                                                        output_prefix)

        for filename in sanger_filelist:
            filename_list = self.split_filename(filename)

            record_raw_fastq = "%s/fastq/raw/%s.raw.fastq" % (self.workdir,
                                                              filename_list[1])
            record_raw_fasta = "%s/fasta/raw/%s.raw.fasta" % (self.workdir,
                                                              filename_list[1])
            record_raw_qual_plot_prefix = "%s/qual_plot/raw/%s.raw.qual" % (
                self.workdir, filename_list[1])

            record_trimmed_fastq = "%s/fastq/trimmed/%s.trimmed.fastq" % (
                self.workdir, filename_list[1])
            record_trimmed_fasta = "%s/fasta/trimmed/%s.trimmed.fasta" % (
                self.workdir, filename_list[1])
            record_trimmed_qual_plot_prefix = "%s/qual_plot/trimmed/%s.trimmed.qual" % (
                self.workdir, filename_list[1])

            record = SeqIO.read(self.metaopen(filename, "rb"), format="abi")
            record_dict[record.id] = record
            SeqIO.write(record, record_raw_fastq, format="fastq")
            SeqIO.write(record, record_raw_fasta, format="fasta")

            trimmed_record = SeqIO.AbiIO._abi_trim(record)

            stat_dict[record.id] = OrderedDict({
                "raw_len":
                len(record),
                "raw_mean_qual":
                np.mean(record.letter_annotations["phred_quality"]),
                "raw_median_qual":
                np.median(record.letter_annotations["phred_quality"]),
                "trimmed_len":
                len(trimmed_record),
                "trimmed_mean_qual":
                np.mean(trimmed_record.letter_annotations["phred_quality"]),
                "trimmed_median_qual":
                np.median(trimmed_record.letter_annotations["phred_quality"]),
                "retained":
                "-",
            })
            MatplotlibRoutines.draw_bar_plot(
                record.letter_annotations["phred_quality"],
                record_raw_qual_plot_prefix,
                extentions=["png"],
                xlabel="Position",
                ylabel="Phred quality",
                title="Per base quality",
                min_value=None,
                max_value=None,
                new_figure=True,
                figsize=(3 * (int(len(record) / 100) + 1), 3),
                close_figure=True)

            if stat_dict[record.id]["trimmed_len"] >= min_len:
                if min_median_qual:
                    if (stat_dict[record.id]["trimmed_median_qual"] >=
                            min_median_qual) and (
                                stat_dict[record.id]["trimmed_mean_qual"] >=
                                min_mean_qual):
                        stat_dict[record.id]["retained"] = "+"
                    else:
                        low_quality_counter += 1
                else:
                    stat_dict[record.id]["retained"] = "+"
            else:
                too_short_counter += 1

            if stat_dict[record.id]["retained"] == "-":
                excluded_counter += 1
                excluded_list.append(record.id)
                continue

            SeqIO.write(trimmed_record, record_trimmed_fastq, format="fastq")
            SeqIO.write(trimmed_record, record_trimmed_fasta, format="fasta")

            MatplotlibRoutines.draw_bar_plot(
                trimmed_record.letter_annotations["phred_quality"],
                record_trimmed_qual_plot_prefix,
                extentions=["png"],
                xlabel="Position",
                ylabel="Phred quality",
                title="Per base quality",
                min_value=None,
                max_value=None,
                new_figure=True,
                figsize=(3 * (int(len(record) / 100) + 1), 3),
                close_figure=True)

            trimmed_record_dict[record.id] = trimmed_record

        SeqIO.write(self.record_from_dict_generator(record_dict),
                    merged_raw_fastq,
                    format="fastq")
        SeqIO.write(self.record_from_dict_generator(record_dict),
                    merged_raw_fasta,
                    format="fasta")

        SeqIO.write(self.record_from_dict_generator(trimmed_record_dict),
                    merged_trimmed_fastq,
                    format="fastq")
        SeqIO.write(self.record_from_dict_generator(trimmed_record_dict),
                    merged_trimmed_fasta,
                    format="fasta")

        excluded_list.write("%s.excluded.ids" % output_prefix)
        stat_dict.write(out_filename="%s.stats" % output_prefix)

        print("Excluded: %i" % excluded_counter)
        print("\tToo short( < %i ): %i" % (min_len, too_short_counter))
        print("\tLow quality( median < %i or mean < %i ): %i" %
              (min_median_qual, min_mean_qual, low_quality_counter))
Example #16
                    dest="output",
                    required=True,
                    help="File to write statistics")
parser.add_argument(
    "-l",
    "--log_file",
    action="store",
    dest="log_file",
    default="trimmomatic.log",
    help="Name of files with trimmomatic log. Default - trimmomatic.log")

args = parser.parse_args()

samples = sorted(
    args.samples.split(",") if args.samples else os.listdir(args.samples_dir))
present_samples = []
for sample in samples:
    if os.path.isdir(args.samples_dir + sample):
        present_samples.append(sample)

reports_dict = TwoLvlDict()

for sample in present_samples:
    print("Handling report from %s" % sample)

    sample_dir = "%s%s/" % (args.samples_dir, sample)
    trimmomatic_log = "%s/trimmomatic.log" % sample_dir
    reports_dict[sample] = Trimmomatic.parse_log(trimmomatic_log)

reports_dict.write(args.output)
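
Trimmomatic.parse_log() is not shown here; a hedged, standard-library sketch of how the PE summary line that Trimmomatic prints (e.g. "Input Read Pairs: 1000 Both Surviving: 900 (90.00%) Forward Only Surviving: ... Dropped: ...") could be parsed into a flat dict:

import re
from collections import OrderedDict


def parse_trimmomatic_log_sketch(log_file):
    stats = OrderedDict()
    pattern = re.compile(r"Input Read Pairs: (\d+) Both Surviving: (\d+) .*?"
                         r"Forward Only Surviving: (\d+) .*?"
                         r"Reverse Only Surviving: (\d+) .*?Dropped: (\d+)")
    keys = ("input_pairs", "both_surviving", "forward_only_surviving",
            "reverse_only_surviving", "dropped")
    with open(log_file) as log_fd:
        for line in log_fd:
            hit = pattern.search(line)
            if hit:
                for key, value in zip(keys, hit.groups()):
                    stats[key] = int(value)
    return stats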
Example #17
    def filter(self,
               samples_directory,
               output_directory,
               adapter_fragment_file,
               trimmomatic_adapter_file,
               general_stat_file,
               samples_to_handle=None,
               threads=4,
               trimmomatic_dir="",
               coockiecutter_dir="",
               facut_dir="",
               mismatch_number=2,
               pe_reads_score=30,
               se_read_score=10,
               min_adapter_len=1,
               sliding_window_size=None,
               average_quality_threshold=15,
               base_quality="phred33",
               read_name_type="illumina",
               leading_base_quality_threshold=None,
               trailing_base_quality_threshold=None,
               crop_length=None,
               head_crop_length=None,
               min_len=50,
               remove_intermediate_files=False,
               skip_coockiecutter=False,
               retain_single_end_reads=True,
               input_is_se=False):

        Cookiecutter.path = coockiecutter_dir
        Trimmomatic.jar_path = trimmomatic_dir
        Trimmomatic.threads = threads
        FaCut.path = facut_dir

        self.safe_mkdir(output_directory)
        """
        merged_raw_dir = "%s/merged/" % output_directory
        filtered_dir = "%s/filtered/" % output_directory
        coockie_filtered_dir = "%s/coockiecutter/" % filtered_dir
        coockie_trimmomatic_filtered_dir = "%s/coockiecutter_trimmomatic/" % filtered_dir
        coockie_trimmomatic_quality_filtered_dir = "%s/coockiecutter_trimmomatic_quality/" % filtered_dir
        final_filtered_dir = "%s/final/" % filtered_dir
        filtering_stat_dir = "%s/filtered_stat/" % output_directory
        """
        sample_list = samples_to_handle if samples_to_handle else self.get_sample_list(
            samples_directory)
        merged_raw_dir, filtered_dir, coockie_filtered_dir, \
        coockie_trimmomatic_filtered_dir, coockie_trimmomatic_quality_filtered_dir, \
        final_filtered_dir, filtering_stat_dir = self.prepare_filtering_directories(output_directory, sample_list)

        filtering_statistics = TwoLvlDict()

        for sample in sample_list:
            print("Handling sample %s" % sample)
            filtering_statistics[sample] = OrderedDict()
            merged_raw_sample_dir = "%s/%s/" % (merged_raw_dir, sample)
            #merged_forward_reads = "%s/%s_1.fq" % (merged_raw_sample_dir, sample)
            #merged_reverse_reads = "%s/%s_2.fq" % (merged_raw_sample_dir, sample)

            coockie_filtered_sample_dir = "%s/%s/" % (coockie_filtered_dir,
                                                      sample)
            coockie_stats = "%s/%s.coockiecutter.stats" % (
                coockie_filtered_sample_dir, sample)

            coockie_trimmomatic_filtered_sample_dir = "%s/%s/" % (
                coockie_trimmomatic_filtered_dir, sample)

            coockie_trimmomatic_quality_filtered_sample_dir = "%s/%s/" % (
                coockie_trimmomatic_quality_filtered_dir, sample)
            final_filtered_sample_dir = "%s/%s/" % (final_filtered_dir, sample)
            filtering_stat_sample_dir = "%s/%s" % (filtering_stat_dir, sample)

            #"""
            print("\tMerging fastqs if necessary...")
            merged_forward_reads, merged_reverse_reads, merged_se_reads = self.combine_fastq_files(
                samples_directory,
                sample,
                merged_raw_sample_dir,
                use_links_if_merge_not_necessary=True,
                input_is_se=input_is_se)
            if not skip_coockiecutter:
                print("\tFiltering by Cookiecutter")
                #"""
                Cookiecutter.rm_reads(
                    adapter_fragment_file,
                    merged_forward_reads
                    if merged_forward_reads else merged_se_reads,
                    coockie_stats,
                    right_reads=merged_reverse_reads,
                    out_dir=coockie_filtered_sample_dir,
                    use_dust_filter=False,
                    dust_cutoff=None,
                    dust_window_size=None,
                    use_N_filter=False,
                    read_length_cutoff=None,
                    polyGC_length_cutoff=None)

                #"""
                print("\tParsing Cookiecutter report...")
                coockiecutter_report = CoockiecutterReport(
                    coockie_stats, input_is_se=input_is_se)

                filtering_statistics[sample][
                    "raw_pairs"] = coockiecutter_report.input_pairs
                filtering_statistics[sample][
                    "pairs_after_coockiecutter"] = coockiecutter_report.retained_pairs
                filtering_statistics[sample][
                    "pairs_after_coockiecutter,%"] = float(
                        "%.2f" %
                        (float(coockiecutter_report.retained_pairs) /
                         float(coockiecutter_report.input_pairs) * 100))

                os.system("cp %s %s" %
                          (coockie_stats, filtering_stat_sample_dir))

                coockie_filtered_paired_forward_reads = "%s/%s_1.ok.fastq" % (
                    coockie_filtered_sample_dir, sample)
                coockie_filtered_paired_reverse_reads = "%s/%s_2.ok.fastq" % (
                    coockie_filtered_sample_dir, sample)
                coockie_filtered_paired_se_reads = ""

                coockie_filtered_se_reads = "%s/%s.se.ok.fastq" % (
                    coockie_filtered_sample_dir, sample)
            # se reads produced by Coockiecutter are ignored now!!

            #coockie_trimmomatic_filtered_sample_dir = "%s/%s/" % (coockie_trimmomatic_filtered_dir, sample)
            trimmomatic_output_prefix = "%s/%s" % (
                coockie_trimmomatic_filtered_sample_dir, sample)
            trimmomatic_log = "%s.trimmomatic.log" % trimmomatic_output_prefix
            #"""
            if (merged_forward_reads is None) and (merged_reverse_reads is
                                                   None):
                print("Filtering by Trimmomatic...")

                Trimmomatic.filter(
                    merged_se_reads
                    if skip_coockiecutter else coockie_filtered_se_reads,
                    trimmomatic_output_prefix,
                    output_extension="fq",
                    right_reads=None,
                    adapters_file=trimmomatic_adapter_file,
                    mismatch_number=mismatch_number,
                    pe_reads_score=pe_reads_score,
                    se_read_score=se_read_score,
                    min_adapter_len=min_adapter_len,
                    sliding_window_size=sliding_window_size,
                    average_quality_threshold=average_quality_threshold,
                    leading_base_quality_threshold=
                    leading_base_quality_threshold,
                    trailing_base_quality_threshold=
                    trailing_base_quality_threshold,
                    crop_length=crop_length,
                    head_crop_length=head_crop_length,
                    min_length=min_len,
                    logfile=trimmomatic_log,
                    base_quality=base_quality)

            else:
                print("\tFiltering by Trimmomatic...")
                Trimmomatic.filter(
                    merged_forward_reads if skip_coockiecutter else
                    coockie_filtered_paired_forward_reads,
                    trimmomatic_output_prefix,
                    output_extension="fq",
                    right_reads=merged_reverse_reads if skip_coockiecutter else
                    coockie_filtered_paired_reverse_reads,
                    adapters_file=trimmomatic_adapter_file,
                    mismatch_number=mismatch_number,
                    pe_reads_score=pe_reads_score,
                    se_read_score=se_read_score,
                    min_adapter_len=min_adapter_len,
                    sliding_window_size=sliding_window_size,
                    average_quality_threshold=average_quality_threshold,
                    leading_base_quality_threshold=
                    leading_base_quality_threshold,
                    trailing_base_quality_threshold=
                    trailing_base_quality_threshold,
                    crop_length=crop_length,
                    head_crop_length=head_crop_length,
                    min_length=min_len,
                    logfile=trimmomatic_log,
                    base_quality=base_quality)
            #"""
            trimmomatic_report = TrimmomaticReport(trimmomatic_log,
                                                   input_is_se=input_is_se)
            if skip_coockiecutter:
                filtering_statistics[sample][
                    "raw_pairs"] = trimmomatic_report.stats["input"]

            filtering_statistics[sample][
                "pairs_after_trimmomatic"] = trimmomatic_report.stats[
                    "surviving"] if input_is_se else trimmomatic_report.stats[
                        "both_surviving"]
            filtering_statistics[sample][
                "pairs_after_trimmomatic,%"] = trimmomatic_report.stats[
                    "surviving,%"] if input_is_se else trimmomatic_report.stats[
                        "both_surviving,%"]

            if retain_single_end_reads and not input_is_se:
                filtering_statistics[sample][
                    "forward_se_after_trimmomatic"] = trimmomatic_report.stats[
                        "forward_only_surviving"]
                filtering_statistics[sample][
                    "forward_se_after_trimmomatic,%"] = trimmomatic_report.stats[
                        "forward_only_surviving,%"]

                filtering_statistics[sample][
                    "reverse_se_after_trimmomatic"] = trimmomatic_report.stats[
                        "reverse_only_surviving"]
                filtering_statistics[sample][
                    "reverse_se_after_trimmomatic,%"] = trimmomatic_report.stats[
                        "reverse_only_surviving,%"]

            os.system("cp %s %s" %
                      (trimmomatic_log, filtering_stat_sample_dir))

            coockie_trimmomatic_filtered_paired_forward_reads = "%s/%s_1.pe.fq" % (
                coockie_trimmomatic_filtered_sample_dir, sample)
            coockie_trimmomatic_filtered_paired_reverse_reads = "%s/%s_2.pe.fq" % (
                coockie_trimmomatic_filtered_sample_dir, sample)

            coockie_trimmomatic_filtered_unpaired_forward_reads = "%s/%s_1.se.fq" % (
                coockie_trimmomatic_filtered_sample_dir, sample)
            coockie_trimmomatic_filtered_unpaired_reverse_reads = "%s/%s_2.se.fq" % (
                coockie_trimmomatic_filtered_sample_dir, sample)

            coockie_trimmomatic_filtered_se_reads = "%s/%s.se.fq" % (
                coockie_trimmomatic_filtered_sample_dir, sample)

            final_forward_reads = "%s/%s.final_1.fastq" % (
                final_filtered_sample_dir, sample)
            final_reverse_reads = "%s/%s.final_2.fastq" % (
                final_filtered_sample_dir, sample)

            final_forward_se_reads = "%s/%s.final_1.se.fastq" % (
                final_filtered_sample_dir, sample)
            final_reverse_se_reads = "%s/%s.final_2.se.fastq" % (
                final_filtered_sample_dir, sample)

            final_se_reads = "%s/%s.final.se.fastq" % (
                final_filtered_sample_dir, sample)

            if sliding_window_size is None:
                facut_pe_output_prefix = "%s/%s.pe" % (
                    coockie_trimmomatic_quality_filtered_sample_dir, sample)
                facut_forward_se_output_prefix = "%s/%s.forward.se" % (
                    coockie_trimmomatic_quality_filtered_sample_dir, sample)
                facut_reverse_se_output_prefix = "%s/%s.reverse.se" % (
                    coockie_trimmomatic_quality_filtered_sample_dir, sample)
                facut_pe_stat_file = "%s.facut.stat" % facut_pe_output_prefix

                facut_forward_se_stat_file = "%s.facut.stat" % facut_forward_se_output_prefix
                facut_reverse_se_stat_file = "%s.facut.stat" % facut_reverse_se_output_prefix
                #"""
                FaCut.filter_by_mean_quality(
                    average_quality_threshold,
                    facut_pe_output_prefix,
                    coockie_trimmomatic_filtered_paired_forward_reads,
                    reverse_reads=
                    coockie_trimmomatic_filtered_paired_reverse_reads,
                    quality_type=base_quality,
                    stat_file=facut_pe_stat_file,
                    name_type=read_name_type)

                FaCut.filter_by_mean_quality(
                    average_quality_threshold,
                    facut_forward_se_output_prefix,
                    coockie_trimmomatic_filtered_unpaired_forward_reads,
                    quality_type=base_quality,
                    stat_file=facut_forward_se_stat_file,
                    name_type=read_name_type)
                FaCut.filter_by_mean_quality(
                    average_quality_threshold,
                    facut_reverse_se_output_prefix,
                    coockie_trimmomatic_filtered_unpaired_reverse_reads,
                    quality_type=base_quality,
                    stat_file=facut_reverse_se_stat_file,
                    name_type=read_name_type)
                #"""
                #if input_is_se:

                #else:
                facut_report = FaCutReport(facut_pe_stat_file)

                filtering_statistics[sample][
                    "pairs_after_facut"] = facut_report.retained_pairs
                filtering_statistics[sample]["pairs_after_facut,%"] = float(
                    "%.2f" % (float(facut_report.retained_pairs) /
                              float(facut_report.input_pairs) * 100))
                filtering_statistics[sample][
                    "retained_pairs_in_worst_tile,%"] = facut_report.minimum_retained_pairs_in_tiles_fraction * 100

                filtering_statistics[sample][
                    "pairs_survived_after_filtration,%"] = float(
                        "%.2f" %
                        (float(facut_report.retained_pairs) /
                         filtering_statistics[sample]["raw_pairs"] * 100))

                facut_filtered_forward_reads = "%s_1.pe.fq" % facut_pe_output_prefix
                facut_filtered_reverse_reads = "%s_2.pe.fq" % facut_pe_output_prefix

                facut_filtered_forward_se_reads = "%s.se.fq" % facut_forward_se_output_prefix
                facut_filtered_reverse_se_reads = "%s.se.fq" % facut_reverse_se_output_prefix

                os.system("cp %s %s" %
                          (facut_pe_stat_file, filtering_stat_sample_dir))
                if retain_single_end_reads:
                    os.system("cp %s %s" % (facut_forward_se_stat_file,
                                            filtering_stat_sample_dir))
                    os.system("cp %s %s" % (facut_reverse_se_stat_file,
                                            filtering_stat_sample_dir))

                os.system("ln %s %s" %
                          (facut_filtered_forward_reads, final_forward_reads))
                os.system("ln %s %s" %
                          (facut_filtered_reverse_reads, final_reverse_reads))
                if retain_single_end_reads and not input_is_se:
                    os.system("cat %s %s > %s" %
                              (facut_filtered_forward_se_reads,
                               facut_filtered_reverse_se_reads,
                               final_forward_se_reads))

                    #os.system("ln %s %s" % (facut_filtered_forward_se_reads, final_forward_se_reads))
                    #os.system("ln %s %s" % (facut_filtered_reverse_se_reads, final_reverse_se_reads))

                if input_is_se:
                    pass
                    #os.system("ln %s %s" % (coockie_trimmomatic_filtered_se_reads, final_se_reads))

            else:
                os.system("ln %s %s" %
                          (coockie_trimmomatic_filtered_paired_forward_reads,
                           final_forward_reads))
                os.system("ln %s %s" %
                          (coockie_trimmomatic_filtered_paired_reverse_reads,
                           final_reverse_reads))
                if retain_single_end_reads and not input_is_se:
                    os.system(
                        "cat %s %s > %s" %
                        (coockie_trimmomatic_filtered_unpaired_forward_reads,
                         coockie_trimmomatic_filtered_unpaired_reverse_reads,
                         final_forward_se_reads))
                    """
                    os.system("ln %s %s" % (coockie_trimmomatic_filtered_unpaired_forward_reads, final_forward_se_reads))
                    os.system("ln %s %s" % (coockie_trimmomatic_filtered_unpaired_reverse_reads, final_reverse_se_reads))

                    """
                if input_is_se:
                    os.system("ln %s %s" %
                              (coockie_trimmomatic_filtered_se_reads,
                               final_se_reads))
                filtering_statistics[sample][
                    "pairs_survived_after_filtration,%"] = float(
                        "%.2f" %
                        (float(trimmomatic_report.stats[
                            "surviving" if input_is_se else "both_surviving"])
                         / filtering_statistics[sample]["raw_pairs"] * 100))

            print(filtering_statistics.table_form())

            if remove_intermediate_files:
                shutil.rmtree(merged_raw_sample_dir)
                shutil.rmtree(coockie_filtered_sample_dir)
                shutil.rmtree(coockie_trimmomatic_filtered_sample_dir)
                shutil.rmtree(coockie_trimmomatic_quality_filtered_sample_dir)

        if remove_intermediate_files:
            shutil.rmtree(coockie_filtered_dir)
            shutil.rmtree(coockie_trimmomatic_filtered_dir)
            shutil.rmtree(coockie_trimmomatic_quality_filtered_dir)
            shutil.rmtree(merged_raw_dir)

        filtering_statistics.write(general_stat_file, sort=False)
Example #18
                    required=True,
                    help="Comma-separated list of species")
parser.add_argument("-d", "--species_dir", action="store", dest="species_dir", default="./",
                    type=FileRoutines.check_path,
                    help="Directory with families of species")
"""
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file. Default: stdout")
"""
args = parser.parse_args()

# run after scripts/expansion/compare_cluster.py

# out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

species_syn_dict = TwoLvlDict()

for species in args.species_list:
    species_syn_dict[species] = read_synonyms_dict("%s%s/all.t" % (args.species_dir, species))

species_syn_dict.write("families_all_species.t", absent_symbol=".")

not_assembled = species_syn_dict.filter_by_line(is_assembled)
species_syn_dict.write("correctly_assembled_families_species.t", absent_symbol=".")

assembled_ids = IdSet(species_syn_dict.sl_keys())
assembled_ids.write("assembled_families.ids")
not_assembled_ids = IdSet(not_assembled.sl_keys())
not_assembled_ids.write("non_assembled_families.ids")

"""
Example #19
    def draw_variant_window_densities(self, count_df, scaffold_length_dict, window_size, window_step, output_prefix,
                                      masking_dict=None,
                                      gap_fraction_threshold=0.4,
                                      record_style=None, ext_list=("svg", "png"),
                                      label_fontsize=13, left_offset=0.2, figure_width=12,
                                      figure_height_scale_factor=0.5, scaffold_synonym_dict=None,
                                      id_replacement_mode="partial", suptitle=None, density_multiplicator=1000,
                                      scaffold_black_list=[], sort_scaffolds=False, scaffold_ordered_list=None,
                                      scaffold_white_list=[], add_sample_name_to_labels=False,
                                      dist_between_scaffolds_scaling_factor=1,
                                      gap_color="grey",
                                      masked_color="grey", no_snp_color="white",
                                      colormap=None,
                                      colors=("#333a97", "#3d3795","#5d3393", "#813193", "#9d2d7f", "#b82861",
                                                        "#d33845", "#ea2e2e", "#f5ae27"),
                                      thresholds=(0.0, 0.1, 0.5, 0.75, 1.0, 1.25, 1.5, 2.0, 2.5),
                                      colormap_tuple_list=((0.0, "#333a97"), (0.1, "#3d3795"), (0.5, "#5d3393"),
                                                           (0.75, "#813193"), (1.0, "#9d2d7f"), (1.25, "#b82861"),
                                                           (1.5, "#d33845"), (2.0, "#ea2e2e"), (2.5, "#f5ae27"))):
        """ cont_dict = {sample: {scaffold: }}"""

        if dist_between_scaffolds_scaling_factor < 1:
            raise ValueError("Scaling factor for distance between scaffolds have to be >=1.0")

        final_scaffold_list = self.get_filtered_scaffold_list(count_df.index.get_level_values('CHROM').unique().to_list(),
                                                              scaffold_black_list=scaffold_black_list,
                                                              sort_scaffolds=sort_scaffolds,
                                                              scaffold_ordered_list=scaffold_ordered_list,
                                                              scaffold_white_list=scaffold_white_list)
        scaffold_number = len(final_scaffold_list)
        max_scaffold_length = max([scaffold_length_dict[scaf] for scaf in final_scaffold_list])
        #max_scaffold_length = max(scaffold_length_dict.values())
        window_number, sample_number = np.shape(count_df)
        figure = plt.figure(figsize=(figure_width,
                                     int(figure_height_scale_factor * scaffold_number * sample_number)))
        subplot = plt.subplot(1, 1, 1)

        subplot.get_yaxis().set_visible(False)
        #subplot.get_xaxis().set_visible(False)
        #axes.xaxis.set_major_formatter(x_formatter)

        #subplot.spines['bottom'].set_color('none')
        subplot.spines['right'].set_color('none')
        subplot.spines['left'].set_color('none')
        subplot.spines['top'].set_color('none')

        scaffold_height = 10

        dist_between_scaffolds = 5
        start_x = 0
        start_y = - dist_between_scaffolds

        label_line_y_shift = int(scaffold_height/2)
        label_line_y_jump = int(scaffold_height/2)

        #normalize_color_func = LinearSegmentedColormap.from_list("Densities_custom", colormap_tuple_list)
        #plt.register_cmap(cmap=colormap)
        #colormap = cm.get_cmap(name="plasma", lut=None)
        #normalize_colors = colors.BoundaryNorm(boundaries_for_colormap, len(boundaries_for_colormap) - 1) * int(256/(len(boundaries_for_colormap) - 1))
        #normalize_colors = colors.Normalize(vmin=boundaries_for_colormap[0], vmax=boundaries_for_colormap[-1])

        masked_windows_count_dict = TwoLvlDict()
        no_snps_windows_count_dict = TwoLvlDict()

        for sample in count_df:
            masked_windows_count_dict[sample] = OrderedDict()
            no_snps_windows_count_dict[sample] = OrderedDict()

        if colormap:
            cmap = plt.get_cmap(colormap, len(thresholds))

        masked_regions_fd = open("%s.masked_regions" % output_prefix, "w")
        masked_regions_fd.write("#scaffold\twindow\tmasked_position\tmasked_position,fraction\n")
        for scaffold in final_scaffold_list:

            sample_index = 0
            for sample in count_df:
                masked_windows_count_dict[sample][scaffold] = 0
                no_snps_windows_count_dict[sample][scaffold] = 0
                #if scaffold in scaffold_black_list:
                #    continue
                #print gap_coords_list, gap_len_list

                start_y += scaffold_height + dist_between_scaffolds * (dist_between_scaffolds_scaling_factor if sample_index == 0 else 1)
                label_y_start = label_line_y_shift + start_y
                gap_y_jump = label_y_start + label_line_y_jump
                prev_x = 0

                #figure.text(0, start_y, scaffold, rotation=0, fontweight="bold", transform=subplot.transAxes, fontsize=9,
                #             horizontalalignment='center',
                #             verticalalignment='center')

                if scaffold_synonym_dict:
                    if id_replacement_mode == "exact":
                        if scaffold in scaffold_synonym_dict:
                            scaffold_label = scaffold_synonym_dict[scaffold]
                        else:
                            scaffold_label = scaffold
                            print("WARNING!!! Synonym for %s was not found" % scaffold)
                    elif id_replacement_mode == "partial":

                        partial_syn_list = []
                        for partial_syn in scaffold_synonym_dict:
                            if partial_syn in scaffold:
                                partial_syn_list.append(partial_syn)

                        if len(partial_syn_list) > 1:
                            print("WARNING!!! More than one possible replacement for %s was found: %s. No replacement then." % (scaffold, ",".join(partial_syn_list)))
                            scaffold_label = scaffold
                        elif not partial_syn_list:
                            scaffold_label = scaffold
                            print("WARNING!!! Synonym for %s was not found" % scaffold)
                        else:
                            scaffold_label = scaffold_synonym_dict[partial_syn_list[0]]
                    else:
                        raise ValueError("Unknown id replacement mode")

                else:
                    scaffold_label = scaffold

                subplot.annotate(("%s (%s)" % (scaffold, sample)) if add_sample_name_to_labels else scaffold_label,
                                 xy=(0, label_y_start), xycoords='data', fontsize=16,
                                 xytext=(-15, 1.5 * label_line_y_shift), textcoords='offset points',
                                 ha='right', va='top')
                if scaffold in count_df[sample]:
                    for window_index in count_df.loc[scaffold].index:

                        window_start = window_index * window_step
                        window_end = window_start + window_size - 1  # TODO: check end coordinate
                        if masking_dict and (scaffold in masking_dict):
                            unmasked_length = window_size - masking_dict[scaffold][window_index]
                            if unmasked_length > 0:
                                variant_density = float(count_df[sample].loc[scaffold, window_index] * density_multiplicator) / float(unmasked_length)
                            else:
                                # window is fully masked
                                variant_density = None
                        else:
                            variant_density = float(count_df[sample].loc[scaffold, window_index] * density_multiplicator) / float(window_size)

                        if variant_density is None:
                            window_color = masked_color
                        else:
                            if colormap:
                                if variant_density <= thresholds[0]:
                                    window_color = no_snp_color
                                else:
                                    for threshold_index in range(0, len(thresholds) - 1):
                                        if thresholds[threshold_index] < variant_density <= thresholds[threshold_index+1]:
                                            window_color = cmap(threshold_index)
                                            break
                                    else:
                                        window_color = cmap(threshold_index+1)

                            else:
                                if variant_density <= colormap_tuple_list[0][0]:
                                    window_color = no_snp_color
                                else:
                                    for lower_boundary, color in colormap_tuple_list:
                                        if variant_density <= lower_boundary:
                                            break
                                        if variant_density > lower_boundary:
                                            prev_color = color
                                    else:
                                        prev_color = color
                                    window_color = prev_color

                        if masking_dict:
                            if scaffold in masking_dict:
                                if float(masking_dict[scaffold][window_index]) / float(window_size) > gap_fraction_threshold:
                                    window_color = masked_color
                        #print scaffold
                        #print i, variant_density, window_color

                        if window_color == masked_color:
                            masked_windows_count_dict[sample][scaffold] += 1
                            masked_regions_fd.write("%s\t%i\t%i\t%f\n" % (scaffold, window_index, masking_dict[scaffold][window_index], float(masking_dict[scaffold][window_index]) / float(window_size)))
                        elif window_color == no_snp_color:
                            no_snps_windows_count_dict[sample][scaffold] += 1

                        window = Rectangle((window_start, start_y), window_size, scaffold_height, fill=True,
                                           edgecolor=None, facecolor=window_color, linewidth=0.0000000000001)
                        #print prev_x
                        #print gap_coords[0] - prev_x

                        subplot.add_patch(window)

                # draw_chromosome

                fragment = Rectangle((0, start_y), scaffold_length_dict[scaffold], scaffold_height, fill=False,
                                     edgecolor="black", facecolor=None, linewidth=0.5)
                subplot.add_patch(fragment)
                sample_index += 1
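
        # Legend: one colored square per category, drawn to the right of the longest scaffold,
        # starting with the "masked" and "no SNPs" entries and followed by the density bins.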

        legend_y_position = int(start_y/2)
        legend_x_position = int(max_scaffold_length * 1.05)
        legend_element_side = scaffold_height

        square_y_pos = legend_y_position - legend_element_side

        for color, legend_label in zip((masked_color, no_snp_color), ("masked", "no SNPs")):
            square_y_pos += legend_element_side
            fragment = Rectangle((legend_x_position, square_y_pos), max_scaffold_length/64, legend_element_side, fill=True,
                                 edgecolor="black", facecolor=color, linewidth=0.5)

            subplot.add_patch(fragment)
            subplot.annotate(legend_label,
                             xy=(legend_x_position + 2 * max_scaffold_length/64, square_y_pos), xycoords='data', fontsize=13,
                             xytext=(legend_x_position + 2 * max_scaffold_length/64, square_y_pos),)
        if colormap:
            for i in range(0, len(thresholds)):
                square_y_pos += legend_element_side
                #print (colormap_tuple_list[i][1])
                fragment = Rectangle((legend_x_position, square_y_pos), max_scaffold_length/64, legend_element_side, fill=True,
                                     edgecolor="black", facecolor=cmap(i), linewidth=0.5)

                subplot.add_patch(fragment)
                if i == (len(thresholds) - 1):
                    legend_element_label = "> %.2f" % thresholds[i]
                else:
                    legend_element_label = "%.2f - %.2f" % (thresholds[i], thresholds[i + 1])

                subplot.annotate(legend_element_label,
                                 xy=(legend_x_position + 2 * max_scaffold_length/64, square_y_pos), xycoords='data', fontsize=13,
                                 xytext=(legend_x_position + 2 * max_scaffold_length/64, square_y_pos),)
        else:
            for i in range(0, len(colormap_tuple_list)):
                square_y_pos += legend_element_side
                #print (colormap_tuple_list[i][1])
                fragment = Rectangle((legend_x_position, square_y_pos), max_scaffold_length/64, legend_element_side, fill=True,
                                     edgecolor="black", facecolor=colormap_tuple_list[i][1], linewidth=0.5)

                subplot.add_patch(fragment)
                if i == (len(colormap_tuple_list) - 1):
                    legend_element_label = "> %.2f" % colormap_tuple_list[i][0]
                else:
                    legend_element_label = "%.2f - %.2f" % (colormap_tuple_list[i][0], colormap_tuple_list[i + 1][0])

                subplot.annotate(legend_element_label,
                                 xy=(legend_x_position + 2 * max_scaffold_length/64, square_y_pos), xycoords='data', fontsize=13,
                                 xytext=(legend_x_position + 2 * max_scaffold_length/64, square_y_pos),)

        plt.xlim(xmin=0, xmax=int(1.2 * max_scaffold_length))
        plt.ylim(ymin=0, ymax=start_y + 2 * scaffold_height)
        #plt.colorbar(subplot)
        #plt.tight_layout()

        plt.subplots_adjust(left=left_offset, right=0.95)#bottom=0.1, right=0.8, top=0.9)
        if suptitle:
            plt.suptitle(suptitle)
        for extension in ext_list:
            plt.savefig("%s.%s" % (output_prefix, extension))
        plt.close()

        no_snps_windows_count_dict.write("%s.no_snps.windows.count" % output_prefix)
        masked_windows_count_dict.write("%s.masked.windows.count" % output_prefix)
        masked_regions_fd.close()
Example No. 20
0
           P(mmin < X <= mmax) = -ss.hypergeom.sf(mmax, n, n1, n2) + ss.hypergeom.sf(mmin, n, n1, n2)
    """
    return -ss.hypergeom.sf(mmax, n, n1, n2) + ss.hypergeom.sf(mmin, n, n1, n2)


def get_intersection_length(start1, end1, start2, end2):
    if start1 - end2 > 0 or start2 - end1 > 0:
        return 0
    start_shift = start1 - start2
    start_coef_shift = 0 if start_shift < 0 else 1
    end_shift = end1 - end2
    end_coef_shift = 0 if end_shift > 0 else 1

    return (end2 - start2 + 1) - start_coef_shift * start_shift + end_coef_shift * end_shift
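
# Note: for 1-based inclusive coordinates the function above appears to be equivalent to
# max(0, min(end1, end2) - max(start1, start2) + 1),
# e.g. get_intersection_length(1, 10, 5, 20) == 6.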

overlap_clusters_percent = TwoLvlDict({})

totaly_genes = 6074
test_fd = open("probability.t", "w")
test_fd.write("#size\tpower\ttotal\tPmCDA1_3d\tPmCDA1_sub_3d\tintersection\tp-value\n")
print([float(f) / float(100) for f in range(1, 11)])
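# Sweep over minimum cluster size (3-10) and minimum power threshold (0.01-0.10),
# loading the "good" cluster sets of PmCDA1_3d and its subsample for every combination.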
for size in range(3, 11):
    overlap_clusters_percent[size] = {}
    for power in [float(f) / float(100) for f in range(1, 11)]:
        PmCDA1_3d_clusters = CollectionCCF(from_file=True, input_file="/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_3d_size_%i+_power_%.2f+_good.ccf" % (size, power, size, power))
        PmCDA1_3d_sub_clusters = CollectionCCF(from_file=True, input_file="/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_sub1_3d_size_%i+_power_%.2f+_good.ccf" % (size, power, size, power))

        #cluster_3d_dict = OrderedDict({})

        cluster_3d_set = set([])
        cluster_3d_sub_set = set([])
Example No. 21
0
                    dest="species_dir",
                    default="./",
                    type=check_path,
                    help="Directory with per species data. Default: './'")
parser.add_argument("-o",
                    "--output_file",
                    action="store",
                    dest="output",
                    default="stdout",
                    help="Output file. Default: stdout")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

species_syn_dict = TwoLvlDict()
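
# Read the per-species family table ("all.t" in each species directory), merge the tables into
# one (a column per species), and write the combined, correctly assembled and non-assembled
# subsets to separate files.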

for species in args.species_list:
    species_syn_dict[species] = read_synonyms_dict("%s%s/all.t" %
                                                   (args.species_dir, species))

species_syn_dict.write("families_all_species.t", absent_symbol=".")

nonassembled = species_syn_dict.filter_by_line(filter_nonassembled)
species_syn_dict.write("correctly_assembled_families_species.t",
                       absent_symbol=".")

nonassembled.write("not_assembled_families_in_all_species.t",
                   absent_symbol=".")
complicated_families_dict = nonassembled.filter_by_line(
    filter_splited_to_several_fam)
Example No. 22
0
    return True

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="File with families assembly information")
parser.add_argument("-e", "--header", action="store_true", dest="header",
                    help="Header is present in input file")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file. Default: stdout")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

species_syn_dict = TwoLvlDict()
out_fd.write("#family\tspecies_with_family\tspecies_with_errors\tspecies_with_correct_fam\terror_ratio\n")
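# For every family (one input line) count the species that have the family at all (cell != ".")
# and the species whose cell contains "_", which is counted here as an assembly error.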
with open(args.input, "r") as in_fd:
    if args.header:
        in_fd.readline()
    for line in in_fd:
        species_with_errors = 0
        species_with_fam = 0
        tmp = line.strip().split("\t")
        family_name = tmp[0]
        for fam in tmp[1:]:
            if fam != ".":
                species_with_fam += 1
            if "_" in fam:
                species_with_errors += 1
        species_with_correct_fam = species_with_fam - species_with_errors
Example No. 23
0
    def star_and_htseq(self,
                       genome_dir,
                       samples_directory,
                       output_directory,
                       gff_for_htseq,
                       count_table_file_prefix,
                       genome_fasta=None,
                       samples_to_handle=None,
                       genome_size=None,
                       annotation_gtf=None,
                       feature_from_gtf_to_use_as_exon=None,
                       exon_tag_to_use_as_transcript_id=None,
                       exon_tag_to_use_as_gene_id=None,
                       length_of_sequences_flanking_junction=None,
                       junction_tab_file_list=None,
                       three_prime_trim=None,
                       five_prime_trim=None,
                       adapter_seq_for_three_prime_clip=None,
                       max_mismatch_percent_for_adapter_trimming=None,
                       three_prime_trim_after_adapter_clip=None,
                       output_type="BAM",
                       sort_bam=True,
                       max_memory_per_thread_for_bam_sorting="4G",
                       include_unmapped_reads_in_bam=True,
                       output_unmapped_reads=True,
                       two_pass_mode=False,
                       star_dir=None,
                       threads=1,
                       max_intron_length=None,
                       stranded_rnaseq="yes",
                       min_alignment_quality=10,
                       feature_type_for_htseq="exon",
                       feature_id_attribute_for_htseq="gene_id",
                       htseq_mode="union"):

        STAR.threads = threads
        STAR.path = star_dir

        if genome_fasta:
            STAR.index(genome_dir,
                       genome_fasta,
                       annotation_gtf=None,
                       junction_tab_file=None,
                       sjdboverhang=None,
                       genomeSAindexNbases=None,
                       genomeChrBinNbits=None,
                       genome_size=genome_size)

        sample_list = samples_to_handle if samples_to_handle else self.get_sample_list(
            samples_directory)
        self.prepare_diff_expression_directories(output_directory, sample_list)

        alignment_dir = "%s/alignment/" % output_directory

        count_pe_table = TwoLvlDict()
        count_se_table = TwoLvlDict()
        count_all_table = TwoLvlDict()
        count_pe_table_file = "%s/%s.pe.tab" % (output_directory,
                                                count_table_file_prefix)
        count_se_table_file = "%s/%s.se.tab" % (output_directory,
                                                count_table_file_prefix)
        count_all_table_file = "%s/%s.all.tab" % (output_directory,
                                                  count_table_file_prefix)
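
        # Per-sample processing: align paired-end reads with STAR, index the BAM with samtools,
        # count reads per feature with HTSeq, then repeat for single-end reads if any are present.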

        for sample in sample_list:
            print("Handling %s" % sample)
            sample_dir = "%s/%s/" % (samples_directory, sample)
            alignment_sample_dir = "%s/%s/" % (alignment_dir, sample)
            alignment_sample_se_dir = "%s/se/" % alignment_sample_dir
            filetypes, forward_files, reverse_files, se_files = self.make_lists_forward_and_reverse_files(
                sample_dir)

            if se_files:
                self.safe_mkdir(alignment_sample_se_dir)

            print("\tAligning paired reads...")
            count_file = "%s/%s.htseq.count" % (alignment_sample_dir, sample)
            #"""
            STAR.align(
                genome_dir,
                forward_files,
                reverse_read_list=reverse_files,
                annotation_gtf=annotation_gtf,
                feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                exon_tag_to_use_as_transcript_id=
                exon_tag_to_use_as_transcript_id,
                exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                length_of_sequences_flanking_junction=
                length_of_sequences_flanking_junction,
                junction_tab_file_list=junction_tab_file_list,
                three_prime_trim=three_prime_trim,
                five_prime_trim=five_prime_trim,
                adapter_seq_for_three_prime_clip=
                adapter_seq_for_three_prime_clip,
                max_mismatch_percent_for_adapter_trimming=
                max_mismatch_percent_for_adapter_trimming,
                three_prime_trim_after_adapter_clip=
                three_prime_trim_after_adapter_clip,
                output_type=output_type,
                sort_bam=sort_bam,
                max_memory_per_thread_for_bam_sorting=
                max_memory_per_thread_for_bam_sorting,
                include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                output_unmapped_reads=output_unmapped_reads,
                output_dir=alignment_sample_dir,
                two_pass_mode=two_pass_mode,
                max_intron_length=max_intron_length)

            alignment_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_dir

            print("\tIndexing alignment file for paired reads...")
            os.system("samtools index %s" % alignment_file)

            print("\tCounting paired reads aligned to features...")

            HTSeq.count(alignment_file,
                        gff_for_htseq,
                        count_file,
                        samtype="bam",
                        order="pos",
                        stranded_rnaseq=stranded_rnaseq,
                        min_alignment_quality=min_alignment_quality,
                        feature_type=feature_type_for_htseq,
                        feature_id_attribute=feature_id_attribute_for_htseq,
                        mode=htseq_mode,
                        suppress_progres_report=False)
            #"""
            sample_counts = SynDict(filename=count_file,
                                    header=False,
                                    separator="\t",
                                    allow_repeats_of_key=False,
                                    split_values=False,
                                    values_separator=",",
                                    key_index=0,
                                    value_index=1,
                                    close_after_if_file_object=False,
                                    expression=int,
                                    comments_prefix="__")
            count_pe_table[sample] = sample_counts

            if se_files:
                print("\tAligning single reads...")
                count_se_file = "%s/%s.htseq.count" % (alignment_sample_se_dir,
                                                       sample)
                #"""
                STAR.align(
                    genome_dir,
                    se_files,
                    reverse_read_list=None,
                    annotation_gtf=annotation_gtf,
                    feature_from_gtf_to_use_as_exon=
                    feature_from_gtf_to_use_as_exon,
                    exon_tag_to_use_as_transcript_id=
                    exon_tag_to_use_as_transcript_id,
                    exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                    length_of_sequences_flanking_junction=
                    length_of_sequences_flanking_junction,
                    junction_tab_file_list=junction_tab_file_list,
                    three_prime_trim=three_prime_trim,
                    five_prime_trim=five_prime_trim,
                    adapter_seq_for_three_prime_clip=
                    adapter_seq_for_three_prime_clip,
                    max_mismatch_percent_for_adapter_trimming=
                    max_mismatch_percent_for_adapter_trimming,
                    three_prime_trim_after_adapter_clip=
                    three_prime_trim_after_adapter_clip,
                    output_type=output_type,
                    sort_bam=sort_bam,
                    max_memory_per_thread_for_bam_sorting=
                    max_memory_per_thread_for_bam_sorting,
                    include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                    output_unmapped_reads=output_unmapped_reads,
                    output_dir=alignment_sample_se_dir,
                    two_pass_mode=two_pass_mode,
                    max_intron_length=max_intron_length)

                alignment_se_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_se_dir

                print("\tIndexing alignment file for single reads...")
                os.system("samtools index %s" % alignment_se_file)

                print("\tCounting single reads aligned to features...")

                HTSeq.count(
                    alignment_se_file,
                    gff_for_htseq,
                    count_se_file,
                    samtype="bam",
                    order="pos",
                    stranded_rnaseq=stranded_rnaseq,
                    min_alignment_quality=min_alignment_quality,
                    feature_type=feature_type_for_htseq,
                    feature_id_attribute=feature_id_attribute_for_htseq,
                    mode=htseq_mode,
                    suppress_progres_report=False)
                #"""

                sample_se_counts = SynDict(filename=count_se_file,
                                           header=False,
                                           separator="\t",
                                           allow_repeats_of_key=False,
                                           split_values=False,
                                           values_separator=",",
                                           key_index=0,
                                           value_index=1,
                                           close_after_if_file_object=False,
                                           expression=int,
                                           comments_prefix="__")

                count_se_table[sample] = sample_se_counts
            else:
                count_se_table[sample] = SynDict()
            count_all_table[sample] = SynDict()
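            # Combined counts: sum paired-end and single-end counts per gene when both exist,
            # otherwise take whichever table contains the gene; without single-end reads the
            # combined table is simply the paired-end one.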
            if se_files:
                for gene_id in set(sample_counts.keys()) | set(
                        sample_se_counts.keys()):
                    if (gene_id in sample_counts) and (gene_id
                                                       in sample_se_counts):
                        count_all_table[sample][gene_id] = sample_counts[
                            gene_id] + sample_se_counts[gene_id]
                    elif gene_id in sample_counts:
                        count_all_table[sample][gene_id] = sample_counts[
                            gene_id]
                    elif gene_id in sample_se_counts:
                        count_all_table[sample][gene_id] = sample_se_counts[
                            gene_id]
            else:
                count_all_table[sample] = count_pe_table[sample]

        count_pe_table.write(count_pe_table_file)
        count_se_table.write(count_se_table_file)
        count_all_table.write(count_all_table_file)
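
    # Hedged usage sketch (assumes an instance of the pipeline class defining this method,
    # here called "pipeline"; paths and thread count are illustrative only):
    #
    # pipeline.star_and_htseq(genome_dir="star_index/",
    #                         samples_directory="samples/",
    #                         output_directory="diff_expression/",
    #                         gff_for_htseq="annotation.gff",
    #                         count_table_file_prefix="counts",
    #                         genome_fasta="genome.fasta",
    #                         threads=8)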
Example No. 24
0
                    "--gap_symbol",
                    action="store",
                    dest="gap_symbol",
                    default="-",
                    help="Gap symbol. Default - '-'")

parser.add_argument("-m",
                    "--histogram_output",
                    action="store",
                    dest="histogram_output",
                    required=True,
                    help="File to write histogram")

args = parser.parse_args()

unique_position_dict = TwoLvlDict()

FileRoutines.safe_mkdir(args.output_dir)

for alignment_file in args.input:
    alignment_name_list = FileRoutines.split_filename(alignment_file)
    output_prefix = "%s/%s.unique_positions" % (args.output_dir,
                                                alignment_name_list[1])

    unique_position_dict[alignment_name_list[
        1]] = MultipleAlignmentRoutines.count_unique_positions_per_sequence_from_file(
            alignment_file,
            output_prefix,
            format=args.format,
            gap_symbol="-",
            return_mode="relative",
Example No. 25
0
import sys
import argparse

from collections import OrderedDict
from RouToolPa.Collections.General import TwoLvlDict
from RouToolPa.Routines.File import check_path



parser = argparse.ArgumentParser()

parser.add_argument("-s", "--species_list", action="store", dest="species_list", type=lambda s: s.split(","),
                    required=True,
                    help="Comma-separated list of species")
parser.add_argument("-d", "--species_dir", action="store", dest="species_dir", default="./", type=check_path,
                    help="Directory with per species statistics")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file. Default: stdout")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

species_stat_dict = TwoLvlDict()
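
# Each per-species "stat.t" file is expected to hold two tab-separated columns (statistic name
# and value); the loop below merges them into a single table with one column per species.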

for species in args.species_list:
    with open("%s%s/stat.t" % (args.species_dir, species), "r") as stat_fd:
        statistics = map(lambda s: s.strip().split("\t"), stat_fd.readlines())
    species_stat_dict[species] = OrderedDict(statistics)

species_stat_dict.write(out_fd)
if args.output != "stdout":
    out_fd.close()
Example No. 26
0
                    help="Write extensions of VCF files in the output file. Default: False")
parser.add_argument("-r", "--remove_nucleotide_substitutions", action="store_true", dest="rem_nuc_sub",
                    help="Remove nucleotide substitutions from the output (keep only amino acid substitutions)")
parser.add_argument("-c", "--convert_aa_to_single_letter", action="store_true", dest="convert_to_single_letter",
                    help="Convert amino acids to single-letter code")

args = parser.parse_args()

args.input = make_list_of_path_to_files(args.input)

gene_alias_dict = SynDict()
if args.gene_alias_file:
    gene_alias_dict.read(args.gene_alias_file, split_values=False)
out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

summary_dict = TwoLvlDict()
for filename in args.input:
    directory, prefix, extension = split_filename(filename)

    if args.write_dir_path and args.write_ext:
        name = filename
    elif args.write_dir_path:
        name = (directory + prefix) if directory else prefix
    elif args.write_ext:
        name = prefix + extension
    else:
        name = prefix
        if args.suffix_to_remove in name:
            name = name.replace(args.suffix_to_remove, "")
    summary_dict[name] = OrderedDict()
    with open(filename, "r") as file_fd:
Example No. 27
0
      len(filtered_out_report.records))
if args.ref_species_gene_file:
    reference_genes_dict = {}
    with open(args.ref_species_gene_file, "r") as ref_fd:
        for line in ref_fd:
            gene_family_id, genes = line.strip().split("\t")
            genes = [] if genes == "." else genes.split(",")
            reference_genes_dict[gene_family_id] = [genes[:]]
            if genes:
                reference_genes_dict[gene_family_id].append(choice(genes))
                # print gene_family_id
                #print reference_genes_dict[gene_family_id]

node_header_list = features_list + ["reference_gene"]
delta_index = features_list.index("delta")
statistics_dict = TwoLvlDict({})
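
# Per-node counters: families lost or gained at the node; the "_ref_ann" variants apparently
# track the subsets that have a reference gene annotation (see reference_genes_dict above).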

for node_id in node_values:
    statistics_dict[node_id] = OrderedDict({
        "lost": 0,
        "new": 0,
        "lost_ref_ann": 0,
        "new_ref_ann": 0
    })

for node_id in node_values:
    fd_list = []
    for directory in node_info_dir, node_ref_dir:
        for mode in "all", "new", "lost":
            fd_list.append(
                open(
Example No. 28
0
    def compare_multiple_genome_results(self, busco_file_list, output_prefix, label_list=None,
                                        black_scaffold_list=(), white_scaffold_list=()):

        busco_table_dict = OrderedDict()
        gene_id_dict = OrderedDict()
        counts_dict = OrderedDict()

        output_path_list = self.split_filename(output_prefix)

        pairwise_overlaps_dir = "%s/pairwise_overlaps/" % (output_path_list[0] if output_path_list[0] else ".")
        pairwise_overlap_counts_dir = "%s/pairwise_overlap_counts/" % (output_path_list[0] if output_path_list[0] else ".")
        self.safe_mkdir(pairwise_overlaps_dir)
        self.safe_mkdir(pairwise_overlap_counts_dir)

        lllabels_list = label_list if label_list else ["A%i" % i for i in range(1, len(busco_file_list) + 1)]

        for busco_table, label in zip(busco_file_list, lllabels_list):
            busco_table_dict[label] = BUSCOtable(in_file=busco_table, black_list=black_scaffold_list,
                                                 white_list=white_scaffold_list)

            gene_id_dict[label] = OrderedDict()
            counts_dict[label] = OrderedDict()

            gene_id_dict[label], counts_dict[label] = busco_table_dict[label].count_statuses()

        # TODO: draw piecharts


        # TODO: count overlaps
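
        # For every ordered pair of assemblies and every pair of BUSCO statuses, collect the
        # shared BUSCO ids and their counts, writing the id lists per status combination and
        # one count table per assembly pair.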

        pairwise_overlap_dict = OrderedDict()
        count_pairwise_overlap_dict = OrderedDict()
        for label1 in lllabels_list:
            for label2 in lllabels_list:
                if label1 == label2:
                    continue
                overlap_id = "%s_vs_%s" % (label1, label2)
                pairwise_overlap_dict[overlap_id] = TwoLvlDict()
                count_pairwise_overlap_dict[overlap_id] = TwoLvlDict()
                for status1 in self.status_list:
                    pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)] = OrderedDict()
                    count_pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)] = OrderedDict()
                    for status2 in self.status_list:
                        pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)]["%s@%s" % (label2, status2)] = IdSet(gene_id_dict[label1][status1] & gene_id_dict[label2][status2])
                        count_pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)]["%s@%s" % (label2, status2)] = len(pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)]["%s@%s" % (label2, status2)])
                        pairwise_overlap_dict[overlap_id]["%s@%s" % (label1, status1)]["%s@%s" % (label2, status2)].write("%s/%s.%s_vs_%s.ids" % (pairwise_overlaps_dir, output_prefix, "%s@%s" % (label1, status1), "%s@%s" % (label2, status2)))

                count_pairwise_overlap_dict[overlap_id].write("%s/%s.overlap.%s.tsv" % (pairwise_overlap_counts_dir, output_prefix, overlap_id))

        if 2 <= len(busco_file_list) <= 3:
            fig, subplot_list = plt.subplots(2, 2, figsize=(6, 6))
            plt.suptitle("Overlaps for BUSCO categories between assemblies/genomes")
            #print(subplot_list)
            for status, index in zip(self.status_list, range(0, 4)):

                plt.sca(subplot_list[index // 2][index % 2])
                plt.title(status)
                MatplotlibRoutines.venn_diagram_from_sets(gene_id_dict[lllabels_list[0]][status],
                                                          gene_id_dict[lllabels_list[1]][status],
                                                          set3=gene_id_dict[lllabels_list[2]][status] if len(lllabels_list) > 2 else None,
                                                          set_labels=lllabels_list, set_colors=["red", "yellow", "green"],
                                                          output_prefix=None, extensions=("png",), title=None)

            plt.savefig("%s.venn.png" % output_prefix)

            plt.close()
Example No. 29
0
                    dest="split_values",
                    help="Split values. Default: False")
parser.add_argument("-s",
                    "--value_separator",
                    action="store",
                    dest="value_separator",
                    default=",",
                    help="Value separator. Default: ','")
parser.add_argument(
    "-g",
    "--ignore_value_repeats",
    action="store_true",
    dest="ignore_value_repeats",
    help=
    "Ignore repeats of values (i.e. values that correspond to the same fl_key and sl_key) "
    "and don't raise an exception. If set, the value from the first entry is kept. Default: False"
)

args = parser.parse_args()

combined_table = TwoLvlDict(input_file=args.files,
                            absent_symbol=args.absent_symbol,
                            split_values=args.split_values,
                            value_sep=args.value_separator,
                            ignore_value_repeats=args.ignore_value_repeats)
#print combined_table
combined_table.write(args.output,
                     absent_symbol=args.absent_symbol,
                     close_after_if_file_object=False,
                     sort=False)
Example No. 30
0
                               "snoRNA": "ncRNA",
                               "snRNA": "ncRNA"
                               }
    annotation_black_list = ["gene", "region", "ARS", "long_terminal_repeat",
                             "noncoding_exon", "intron", "repeat_region", "telomere", "gene_cassette",
                             "five_prime_UTR_intron"]
    with open(gff_file) as gff_fd:
        for record in GFF.parse(gff_fd):
            annotations_dict[record.id] = record

    bad_region_dict = {}
    with open(bad_regions_file) as gff_fd:
        for record in GFF.parse(gff_fd):
            bad_region_dict[record.id] = record

    statistics_dict = TwoLvlDict(OrderedDict({}))
    for sample_set_name in sample_set_names_list:
        print("Handling %s" % sample_set_name)
        statistics_dict[sample_set_name] = OrderedDict({})
        os.chdir(workdir)
        os.system("mkdir -p %s" % sample_set_name)
        os.chdir(sample_set_name)
        os.system("mkdir -p %s" % clustering_dir)
        #os.system("pwd")
        mutations = CollectionVCF(vcf_file="../SNP_annotated_raw_vcf/%s_SNP.vcf" % sample_set_name,
                                  from_file=True)

        mutations.get_location(annotations_dict, use_synonym=True, synonym_dict=annotation_synonym_dict)
        mutations.check_location(bad_regions)
        mutations.check_by_ref_and_alt(ref_alt_variants["desaminases"], "DA")