示例#1
0
def variants_in_UTR(collection, feature_type="5'_UTR"):
    """Select the variants annotated as located in a UTR.

    Args:
        collection: iterable of VCF records (a CollectionVCF); every record
            must expose info_dict["Ftype"], a list of feature-type strings.
        feature_type: feature type to keep (default "5'_UTR").

    Returns:
        A new CollectionVCF holding only the matching records; metadata and
        header are copied from the input collection.
    """
    # Comprehension replaces the manual append loop: same records, same order.
    UTR_record_list = [variant for variant in collection
                       if feature_type in variant.info_dict["Ftype"]]
    return CollectionVCF(from_file=False,
                         record_list=UTR_record_list,
                         metadata=collection.metadata,
                         header=collection.header)
示例#2
0
文件: CCF.py 项目: melakbet/MAVR
 def extract_vcf(self):
     """Flatten this collection of clusters into a single CollectionVCF.

     Builds an empty CollectionVCF from the stored VCF metadata, header and
     samples, then concatenates the VCF records of every cluster in turn.
     """
     result = CollectionVCF(metadata=self.metadata.vcf_metadata,
                            record_list=[],
                            header=self.metadata.vcf_header,
                            samples=self.metadata.samples,
                            from_file=False)
     # Each element of this collection is a cluster record whose .records
     # attribute carries the underlying VCF records.
     for cluster in self:
         result = result + cluster.records
     return result
示例#3
0
def variants_start_end(collection,
                       left,
                       record_dict,
                       min_five_utr_len=10,
                       skip_nonintergenic_variants=False):
    # Collect variants lying in the "pre-UTR" window: the `left` bases
    # immediately upstream of each sufficiently long five_prime_UTR feature.
    #
    # collection  : iterable of VCF records exposing .chrom, .pos, .info_dict
    # left        : size (bp) of the upstream window to scan
    # record_dict : GFF records keyed by sequence id; each exposes .features
    #               whose entries carry .sub_features
    # min_five_utr_len : a gene is considered only if it has at least one
    #               five_prime_UTR of this length or more
    # skip_nonintergenic_variants : if True, keep only variants whose
    #               Ftype annotation is exactly ["igc"]
    #
    # Returns a new CollectionVCF of the selected records; each selected
    # record is tagged with info_dict["Fstrand"] = ["P"] or ["M"].

    pre_UTR_record_list = []
    for record_id in record_dict:
        for feature in record_dict[record_id].features:
            if feature.type != "gene":
                continue

            # for/else idiom: `break` fires when a long-enough 5' UTR is
            # found; otherwise the else-branch skips this gene entirely.
            for sub_feature in feature.sub_features:
                if sub_feature.type == "five_prime_UTR" and len(
                        sub_feature) >= min_five_utr_len:
                    break
            else:
                continue
            for sub_feature in feature.sub_features:
                strand = sub_feature.strand
                if sub_feature.type == "five_prime_UTR":
                    # 1-based biological start of the UTR; on the minus
                    # strand that is the feature's end coordinate.
                    five_UTR_start = sub_feature.location.start + 1 if strand == +1 else sub_feature.location.end
                    # Window of `left` bases immediately upstream of the UTR,
                    # oriented by strand.
                    pre_UTR_start = five_UTR_start - left if strand == +1 else five_UTR_start + 1
                    pre_UTR_end = five_UTR_start - 1 if strand == +1 else five_UTR_start + left

                    for variant in collection:
                        if record_id != variant.chrom:
                            continue
                        if skip_nonintergenic_variants and variant.info_dict[
                                "Ftype"] != ["igc"]:
                            continue
                        if pre_UTR_start <= variant.pos <= pre_UTR_end:
                            pre_UTR_record_list.append(variant)
                            # Strand tag: "P" = plus strand, "M" = minus.
                            pre_UTR_record_list[-1].info_dict["Fstrand"] = [
                                "P"
                            ] if strand == +1 else ["M"]
                            relative_position = (variant.pos -
                                                 five_UTR_start) * strand
                            # Debug output: a positive strand-oriented offset
                            # means the variant fell at/after the UTR start,
                            # which this window should not allow.
                            if relative_position > 0:
                                print(pre_UTR_start, pre_UTR_end,
                                      five_UTR_start)
                                print(variant)
                                print(sub_feature)

    return CollectionVCF(from_file=False,
                         record_list=pre_UTR_record_list,
                         metadata=collection.metadata,
                         header=collection.header)
示例#4
0
                    help="End histogramm file prefix",
                    default="end_histogramm")

args = parser.parse_args()

# --both, when given, overrides both flank sizes at once.
if args.both is not None:
    args.left = args.both
    args.right = args.both

if (not args.right) and (not args.left):
    raise ValueError("Both left and right regions were not set")

# Index GFF records by sequence id for random access below.
with open(args.gff, "r") as in_fd:
    record_dict = dict([(record.id, record) for record in GFF.parse(in_fd)])

variants = CollectionVCF(from_file=True, vcf_file=args.vcf)
gene_variants_positions = []
all_variant_start_positions = []
all_variant_end_positions = []
print(args.left)
print(args.right)
# Walk every CDS sub-feature of every gene; the loop body continues past
# this excerpt.
for record_id in record_dict:
    for feature in record_dict[record_id].features:
        if feature.type != "gene":
            continue
        #print(feature.sub_features)
        for sub_feature in feature.sub_features:
            if sub_feature.type != "CDS":
                continue
            chrom = record_id
            strand = sub_feature.strand
示例#5
0
    # GFF annotations indexed by record id; feature types are renamed via
    # the synonym dict, and black-listed types are ignored when locating
    # variants.
    annotations_dict = {}
    annotation_synonym_dict = {
        "three_prime_UTR": "3'_UTR",
        "five_prime_UTR": "5'_UTR",
        "snoRNA": "ncRNA",
        "snRNA": "ncRNA"
    }
    annotation_black_list = [
        "region", "ARS", "long_terminal_repeat", "noncoding_exon", "intron",
        "repeat_region"
    ]
    with open(gff_file) as gff_fd:
        for record in GFF.parse(gff_fd):
            annotations_dict[record.id] = record

    # Regions considered unreliable, loaded from a separate GFF file.
    bad_region_dict = {}
    with open(bad_regions_file) as gff_fd:
        for record in GFF.parse(gff_fd):
            bad_region_dict[record.id] = record

    # Annotate each sample set's VCF with feature locations and write the
    # annotated copy next to the input.
    for sample_set_name in sample_set_names_list:
        print("Handling %s" % sample_set_name)
        os.chdir(workdir)
        mutations = CollectionVCF(vcf_file=sample_set_name + ".vcf",
                                  from_file=True)
        mutations.find_location(annotations_dict,
                                use_synonym=True,
                                synonym_dict=annotation_synonym_dict,
                                feature_type_black_list=annotation_black_list)
        mutations.write("%s_annotated.vcf" % sample_set_name)
示例#6
0
import matplotlib.pyplot as plt
from matplotlib import rcParams

if __name__ == "__main__":
    # Per-sample strandness bar charts for pre-UTR variants, drawn as a
    # 2x2 grid of subplots (one per sample set).
    workdir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/all/pre_UTR_strandness/"
    letter_list_part1 = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
    os.chdir(workdir)
    sample_set_names_list = [
        "PmCDA1_3d", "PmCDA1_sub1_3d", "PmCDA1_6d", "PmCDA1_sub1_6d"
    ]
    rcParams.update({'font.size': 7})
    plt.figure(1, dpi=300, figsize=(6, 6))
    index = 1
    for sample, letter in zip(sample_set_names_list, letter_list_part1):
        collection = CollectionVCF(
            from_file=True,
            vcf_file=sample + "_pre_UTR_variants_only_intergenic_l_300.vcf")
        # count_strandness() presumably returns per-strand SNV counts keyed
        # by feature type ("all" is used below) -- TODO confirm in the
        # CollectionVCF implementation.
        sample_data = collection.count_strandness(
            sample + "_pre_UTR_variants_only_intergenic_l_300_strandness")
        plt.subplot(2, 2, index)
        n_groups = 4
        points = np.arange(n_groups)
        bar_width = 0.35

        # Row 0: C->T counts, row 1: G->A counts; drawn as paired bars.
        C_values = sample_data["all"][0]
        G_values = sample_data["all"][1]

        rects1 = plt.bar(points, C_values, bar_width, color='b', label='C->T')

        rects2 = plt.bar(points + bar_width,
                         G_values,
示例#7
0
from matplotlib import rcParams

if __name__ == "__main__":
    # Strandness histograms for the "_good" variant sets of four samples,
    # one subplot per sample in a 2x2 grid.
    workdir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/all/all/"
    letter_list_part1 = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
    os.chdir(workdir)
    sample_set_names_list = ["PmCDA1_3d",
                             "PmCDA1_sub1_3d",
                             "PmCDA1_6d",
                             "PmCDA1_sub1_6d"
                             ]
    rcParams.update({'font.size': 7})
    plt.figure(1, dpi=300, figsize=(6, 6))
    index = 1
    for sample, letter in zip(sample_set_names_list, letter_list_part1):
        collection = CollectionVCF(from_file=True, vcf_file=sample + "_good.vcf")
        sample_data = collection.count_strandness(sample + "_good_strandness")
        plt.subplot(2, 2, index)
        n_groups = 4
        points = np.arange(n_groups)
        bar_width = 0.35

        # Row 0: C->T counts, row 1: G->A counts; drawn as paired bars.
        C_values = sample_data["all"][0]
        G_values = sample_data["all"][1]

        rects1 = plt.bar(points, C_values, bar_width,
                                     color='b',
                                     label='C->T')

        rects2 = plt.bar(points + bar_width, G_values, bar_width,
                                     color='g',
示例#8
0
    os.chdir(workdir)

    # Flank sizes (bp) around gene starts/ends and histogram bin width.
    skip_genes_without_five_utr = False
    left = 300
    right = 300
    bin_width = 5
    # NOTE(review): in Python 3 `((left + right) / bin_width) + 1` is a
    # float; recent numpy rejects a non-integer bin count here. This line
    # appears to assume Python 2 integer division -- confirm before porting.
    bins = np.linspace(-left, right, ((left + right) / bin_width) + 1)
    normed = True
    max_start = 0
    max_end = 0
    for sample_set in sample_set_names_list:
        vcf_file = "%s_good.vcf" % sample_set
        #start_hist_prefix = "%s_start_hist_r_%i_l_%i" % (sample_set, right, left)
        #end_hist_prefix = "%s_end_hist_r_%i_l_%i" % (sample_set, right, left)
        #gene_variants = "%s_gene_variants_r_%i_l_%i.t" % (sample_set, right, left)
        variants = CollectionVCF(from_file=True, vcf_file=vcf_file)
        # Per-sample positions of variants relative to gene starts/ends.
        start_dict[sample_set], end_dict[sample_set], position_dict[sample_set] = \
            variants.variants_start_end(left, right, record_dict, skip_genes_without_five_utr=skip_genes_without_five_utr)
        length_dict[sample_set] = len(variants)
        #print(start_dict[sample_set])
        start_hist_dict[sample_set] = list(
            np.histogram(start_dict[sample_set], bins=bins))
        end_hist_dict[sample_set] = list(
            np.histogram(end_dict[sample_set], bins=bins))
        print(start_hist_dict[sample_set][0])
        # Optionally convert counts to float for normalization (continues
        # past this excerpt).
        if normed:
            start_hist_dict[sample_set][0] = start_hist_dict[sample_set][
                0].astype(np.float32, copy=False)
            end_hist_dict[sample_set][0] = end_hist_dict[sample_set][0].astype(
                np.float32, copy=False)
            start_hist_dict[sample_set][
示例#9
0
    # Regions considered unreliable, loaded from a separate GFF file.
    bad_region_dict = {}
    with open(bad_regions_file) as gff_fd:
        for record in GFF.parse(gff_fd):
            bad_region_dict[record.id] = record

    for sample_set_name in sample_set_names_list:
        print("Handling %s" % sample_set_name)

        # Create a per-sample-set working directory layout.
        os.chdir(workdir)
        os.system("mkdir -p %s" % sample_set_name)
        os.chdir(sample_set_name)
        os.system("mkdir -p %s %s" % (clustering_dir, rainfall_dir))
        #os.system("pwd")
        mutations = CollectionVCF(
            vcf_file="../SNP_annotated_raw_vcf/%s_SNP.vcf" % sample_set_name,
            from_file=True)
        """
        mutations.rainfall_plot("%s_mutations" % (sample_set_name), ref_genome=reference, draw_gaps=True,
                                masked_regions=bad_region_dict)
        """

        # Annotate variant locations, flag those in bad regions, and mark
        # deaminase-signature substitutions with the "DA" flag.
        mutations.get_location(annotations_dict,
                               use_synonym=True,
                               synonym_dict=annotation_synonym_dict)
        mutations.check_location(bad_regions)
        mutations.check_by_ref_and_alt(ref_alt_variants["desaminases"], "DA")

        annotation_black_list = [
            "gene", "region", "ARS", "long_terminal_repeat", "noncoding_exon",
            "intron", "repeat_region", "telomere", "gene_cassette",
示例#10
0
    # GFF annotations indexed by record id.
    gff_file = "/home/mahajrod/genetics/desaminases/data/LAN210_v0.10m/annotations/merged_annotations_Nagalakshmi_tranf_to_LAN210_v0.10m.gff3"
    annotations_dict = {}
    with open(gff_file) as gff_fd:
        for record in GFF.parse(gff_fd):
            annotations_dict[record.id] = record

    for sample in samples_list:
        print("Handling %s" % sample)

        os.chdir(workdir)
        os.chdir(sample)
        # Skip samples without an alignment directory for this reference.
        if alignment_dir not in os.listdir("."):
            continue
        os.chdir(alignment_dir)
        os.system("mkdir -p %s" % clustering_dir)
        mutations = CollectionVCF(vcf_file=sample + suffix, from_file=True)
        # Annotate locations and mark deaminase-signature substitutions.
        mutations.get_location(annotations_dict)
        mutations.check_by_ref_and_alt(ref_alt_variants["desaminases"], "DA")
        """
        for record in mutations:
            print (record)
            print(record.flags)
        """
        #for record in mutations:
        #    print(record.description)
        # Pie chart of variant locations, excluding coarse feature types.
        mutations.location_pie(annotation_black_list=[
            "gene", "region", "ARS", "long_terminal_repeat"
        ],
                               figsize=(40, 40),
                               pie_filename="variant_location_pie.svg",
                               counts_filename="variant_location_counts.t")
示例#11
0
文件: CCF.py 项目: melakbet/MAVR
    def adjust(self,
               border_limit=None,
               min_size_to_adjust=2,
               remove_border_subclusters=False,
               remove_size_limit=1):
        """Adjust cluster borders using the per-record subcluster labels.

        Scans up to `border_limit` positions (default: all) from each end
        for runs carrying the first/last subcluster label and, when
        `remove_border_subclusters` is set, splits short border runs off
        into separate clusters.

        Returns a list of cluster records; [self] when no adjustment is
        possible. NOTE(review): only the remove_border_subclusters=True
        path is visible in this excerpt -- the method appears to continue
        past it.
        """
        # adjusts cluster borders, returns list of new cluster records
        # skip adjustment for clusters with 3 or less mutations
        if (self.size < min_size_to_adjust) or (self.subclusters is None):
            #return -1
            return [self]
        limit = border_limit if border_limit else len(self.subclusters)
        # Index of the last position of the leading run that shares the
        # first subcluster label.
        for i in range(0, limit):
            if self.subclusters[i] == self.subclusters[0]:
                left_subcluster_end = i
            else:
                break
        # exit if cluster doesnt have subclusters
        if left_subcluster_end == len(self.subclusters) - 1:
            #return 1
            return [self]

        # Negative index of the first position of the trailing run that
        # shares the last subcluster label.
        for i in range(-1, -limit - 1, -1):
            if self.subclusters[i] == self.subclusters[-1]:
                right_subcluster_start = i
            else:
                break

        if remove_border_subclusters:
            # Only split off a border run when it is shorter than
            # remove_size_limit; otherwise keep the full extent.
            start = left_subcluster_end + 1 if left_subcluster_end < remove_size_limit else 0
            end = right_subcluster_start if right_subcluster_start >= -remove_size_limit else len(
                self.subclusters)

            new_left_cluster, new_right_cluster = None, None

            if start > 0:
                new_left_cluster = RecordCCF(
                    collection_vcf=CollectionVCF(
                        record_list=self.records.records[:start],
                        from_file=False),
                    subclusters=self.subclusters[:start],
                    from_records=True)

            if end < len(self.subclusters):
                new_right_cluster = RecordCCF(
                    collection_vcf=CollectionVCF(
                        record_list=self.records.records[end:],
                        from_file=False),
                    subclusters=self.subclusters[end:],
                    from_records=True)
            """
            self.__init__(collection_vcf=CollectionVCF(record_list=self.records.records[start:end], from_file=False),
                          subclusters=self.subclusters[start:end], from_records=True)
            """
            # The middle part always survives as its own cluster.
            new_middle_cluster = RecordCCF(
                collection_vcf=CollectionVCF(
                    record_list=self.records.records[start:end],
                    from_file=False),
                subclusters=self.subclusters[start:end],
                from_records=True)
            """
            if new_left_cluster or new_right_cluster:
                print("original")
                print(self)
                print("adjusted")
                print(new_left_cluster)
                print(new_middle_cluster)
                print(new_right_cluster)
            """
            # Ordered result: [left?] + [middle] + [right?].
            cluster_list = [new_left_cluster] if new_left_cluster else []
            cluster_list += [new_middle_cluster]
            cluster_list += [new_right_cluster] if new_right_cluster else []
            return cluster_list
示例#12
0
 # Histogram bin setup for the three regions (pre-UTR, 5' UTR, CDS).
 # NOTE(review): `left / bin_width` and `right / bin_width` are floats in
 # Python 3; numpy expects integer bin counts -- confirm Python 2 semantics
 # before porting.
 pre_UTR_bins = left / bin_width
 CDS_bins = np.linspace(0, right, right / bin_width + 1)
 UTR_bins = 10
 normed = True
 max_start = 0
 max_end = 0
 skip_nonintergenic_variants = True
 for sample_set in sample_set_names_list:
     print("Handling %s" % sample_set)
     vcf_file = "%s_good.vcf" % sample_set
     #start_hist_prefix = "%s_start_hist_r_%i_l_%i" % (sample_set, right, left)
     #end_hist_prefix = "%s_end_hist_r_%i_l_%i" % (sample_set, right, left)
     #gene_variants = "%s_gene_variants_r_%i_l_%i.t" % (sample_set, right, left)
     #variants, minus_variants = CollectionVCF(from_file=True, vcf_file=vcf_file).filter_by_expression("record.ref == 'C'") # C -> T variants
     #variants, minus_variants = CollectionVCF(from_file=True, vcf_file=vcf_file).filter_by_expression("(record.ref == 'C' and record.info_dict['Fstrand'][0] == 'P') or (record.ref == 'G' and record.info_dict['Fstrand'][0] == 'M')") # nontranscribed thread
     variants = CollectionVCF(from_file=True, vcf_file=vcf_file)
     # Split variant positions by region relative to genes. NOTE(review):
     # this call passes (variants, left, right, record_dict, ...) which does
     # not match the variants_start_end signature seen elsewhere in this
     # project -- likely a different version of the helper.
     pre_UTR_positions[sample_set], UTR_positions[sample_set], CDS_positions[sample_set] = \
         variants_start_end(variants, left, right, record_dict, min_five_utr_len=10,
                            skip_nonintergenic_variants=skip_nonintergenic_variants)
     length_dict[sample_set] = len(variants)
     #print(start_dict[sample_set])
     pre_UTR_hist_dict[sample_set] = list(
         np.histogram(pre_UTR_positions[sample_set], bins=pre_UTR_bins))
     UTR_hist_dict[sample_set] = list(
         np.histogram(UTR_positions[sample_set], bins=UTR_bins))
     CDS_hist_dict[sample_set] = list(
         np.histogram(CDS_positions[sample_set], bins=CDS_bins))
     print("UTR")
     print(UTR_positions[sample_set])
     print("blablabla")
     print(pre_UTR_hist_dict[sample_set][0])
示例#13
0
__author__ = 'mahajrod'
import os
from Parsers.VCF import CollectionVCF

if __name__ == "__main__":
    # Root directory holding one subdirectory per sample.
    workdir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/all"

    os.chdir(workdir)
    samples_list = sorted(os.listdir("."))

    suffix = "_GATK_best_merged.vcf"

    alignment_subdir = "alignment_LAN210_v0.9m"
    for sample in samples_list:
        print("Handling %s" % sample)

        os.chdir(workdir)
        os.chdir(sample)
        # Skip samples that were never aligned against this reference.
        if alignment_subdir not in os.listdir("."):
            continue
        os.chdir(alignment_subdir)

        mutations = CollectionVCF(vcf_file=sample + suffix, from_file=True)
        print("Totaly %s mutations" % len(mutations))

        # Run clustering-threshold testing, keeping the clustering output.
        mutations.test_thresholds(save_clustering=True,
                                  testing_dir="testing_theshold_inconsistent")
示例#14
0
# Suffixes of the combined per-sample-set cluster VCFs.
combined_vcf_suffix = "_adjusted_cluster_mutations.vcf"
combined_3_vcf_suffix = "_adjusted_3+_cluster_mutations.vcf"

# Layout of the original per-sample VCFs.
samples_dir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/all/"
samples_subdir = "alignment_LAN210_v0.10m/"
sample_suffix = "_GATK_best_SNP.vcf"

workdir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/"
combined_subdir = "clustering/"

# Split the combined cluster mutations back into per-sample VCF files.
for sample_set in sample_set_names_list:
    os.chdir(workdir)
    os.chdir(sample_set)
    os.system("mkdir -p per_sample_vcf")
    sample_set_mutations = CollectionVCF(from_file=True,
                                         vcf_file=combined_subdir +
                                         sample_set + combined_vcf_suffix)
    #sample_set_3_mutations = CollectionVCF(from_file=True, vcf_file=combined_subdir + sample_set + combined_3_vcf_suffix)
    per_sample_mutations = {}
    samples_list = sample_set_mutations.samples
    print("Handling %s" % sample_set)
    for sample in sample_set_mutations.samples:
        print("Handling %s" % sample)
        # Original single-sample VCF; supplies metadata/header for the
        # per-sample output collection (built below, past this excerpt).
        sample_mutations = CollectionVCF(from_file=True,
                                         vcf_file=samples_dir + sample + "/" +
                                         samples_subdir + sample +
                                         sample_suffix)
        per_sample_mutations[sample] = CollectionVCF(
            metadata=sample_mutations.metadata,
            record_list=None,
            header=sample_mutations.header,
示例#15
0
        "three_prime_UTR": "3'_UTR",
        "five_prime_UTR": "5'_UTR",
        "snoRNA": "ncRNA",
        "snRNA": "ncRNA"
    }
    # GFF annotations indexed by record id.
    with open(gff_file) as gff_fd:
        for record in GFF.parse(gff_fd):
            annotations_dict[record.id] = record

    # Regions considered unreliable, loaded from a separate GFF file.
    bad_region_dict = {}
    with open(bad_regions_file) as gff_fd:
        for record in GFF.parse(gff_fd):
            bad_region_dict[record.id] = record
    region = "chrX"
    for sample_set in sample_set_names_list:
        collection = CollectionVCF(from_file=True,
                                   vcf_file="%s_good.vcf" % sample_set)
        plt.figure(1, figsize=(15, 8))

        # Bottom panel: log2-scaled inter-variant distances over the region.
        ax0 = plt.subplot(212)
        ax0.set_yscale('log', basey=2)
        ax0.set_xlim(1, 745688)
        #ax0.set_ylim(ymin=0)
        region_reference_dict, maximum = rainfall_plot(
            collection,
            region,
            base_colors=[],
            facecolor="#D6D6D6",
            ref_genome=reference,
            draw_gaps=True,
            masked_regions=bad_region_dict)
        # NOTE(review): this loop rebinds `reference` (also used above as
        # the genome) -- body continues past this excerpt.
        for reference in region_reference_dict[region]:
示例#16
0
            continue
        os.chdir("alignment_LAN210_v0.9m")

        mutations = CollectionVCF(vcf_file=sample + suffix,
                                  from_file=True)
        print("Totaly %s mutations" % len(mutations))

        mutations.test_thresholds(extracting_method='distance', threshold=(50, 5000, 100),
                                  testing_dir="testing_threshold")
    """
    sample_set_names_list = [
        "PmCDA1_3d", "HAP", "PmCDA1_sub1_3d", "PmCDA1_6d", "HAP_sub1",
        "PmCDA1_sub1_6d", "A1_3d", "A1_6d", "A3G_3d", "AID_3d", "AID_6d"
    ]
    suffix = "_SNP.vcf"
    workdir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/test_thresholds/"
    sample_dir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/SNP_annotated_raw_vcf/"

    for sample_set in sample_set_names_list:
        print("Handling %s" % sample_set)
        os.chdir(workdir)
        os.system("mkdir -p %s" % sample_set)
        os.chdir(sample_set)

        mutations = CollectionVCF(vcf_file=sample_dir + sample_set + suffix,
                                  from_file=True)
        print("Totaly %s mutations" % len(mutations))

        mutations.test_thresholds(extracting_method='distance',
                                  threshold=(50, 5000, 100),
                                  testing_dir="testing_threshold")
示例#17
0
def homogeneity_plot(sample_set_names_list, plot_file_prefix):
    """Draw a grid of strandness bar charts (samples x feature types).

    For every sample set and feature type, the per-strand C->T and G->A SNV
    counts are plotted side by side, each subplot annotated with a
    chi-square p-value and a phi correlation coefficient. The figure is
    saved as pdf/svg/eps/png under `plot_file_prefix`.
    """
    rcParams.update({'font.size': 6})
    letter_list_part1 = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
    feature_type_list = ["all", "pre_5'_UTR", "5'_UTR", "CDS", "3'_UTR"]

    figure = plt.figure(1,
                        dpi=600,
                        figsize=(5 * 1.47, 2 * len(sample_set_names_list)))
    index = 1
    for sample, letter in zip(sample_set_names_list, letter_list_part1):
        findex = 1
        for feature_type in feature_type_list:
            # Each feature type is stored in a differently named VCF file.
            vcf_file = "%s_good.vcf" % sample if feature_type == "all" \
                else "%s_pre_UTR_variants_only_intergenic_l_300.vcf" % sample \
                if feature_type == "pre_5'_UTR" \
                else "%s_%s_variants.vcf" % (sample, feature_type)
            collection = CollectionVCF(from_file=True, vcf_file=vcf_file)
            sample_data = collection.count_strandness("%s_%s_variants" %
                                                      (sample, feature_type))
            ax = plt.subplot(len(sample_set_names_list),
                             len(feature_type_list), index)
            n_groups = 4
            points = np.arange(n_groups)
            bar_width = 0.35

            # Row 0: C->T counts, row 1: G->A counts; drawn as paired bars.
            C_values = sample_data["all"][0]
            G_values = sample_data["all"][1]

            rects1 = plt.bar(points,
                             C_values,
                             bar_width,
                             color='b',
                             label='C->T')

            rects2 = plt.bar(points + bar_width,
                             G_values,
                             bar_width,
                             color='g',
                             label='G->A')
            # Test strand dependence on the '+'/'-' columns only
            # (the first and last groups are excluded by the 1:-1 slice).
            table = [C_values[1:-1], G_values[1:-1]]
            g, p_value, dof, expctd = chi2_contingency(table)
            phi = phi_coefficient_correlation(table)
            # Only the bottom row of subplots gets an x-axis label.
            if index > len(feature_type_list) * (len(sample_set_names_list) -
                                                 1):
                plt.xlabel('Strand')

            # First column: y-axis label plus the sample name in the margin
            # (sample[:-3] strips the trailing "_3d"/"_6d" suffix).
            if findex == 1:
                plt.ylabel('N of SNV')
                plt.text(-0.50,
                         0.5,
                         sample[:-3],
                         rotation=90,
                         fontweight="bold",
                         transform=ax.transAxes,
                         fontsize=10,
                         horizontalalignment='center',
                         verticalalignment='center')
            #plt.title("%s%i. %s (%i SNV)\np=%.3f, phi=%.3f" % (letter, findex, sample, len(collection), p_value, phi),
            #          fontweight='bold')
            #plt.title("%s%i. %s (%s)\np=%.3f, phi=%.3f" % (letter, findex, sample, feature_type, p_value, phi),
            #          fontweight='bold', fontsize=6)
            # Panel label (e.g. "A1") plus the statistics, switching to
            # scientific notation for small p-values.
            title = "%s%i" % (letter, findex)
            title_text = r"$p=%.2f, \varphi=%.2f$" % (p_value, phi) if p_value >= 0.01 \
                else r"$p=%.1e, \varphi=%.2f$" % (p_value, phi)
            plt.text(
                0.23,
                1.1,
                title_text,
                rotation=0,
                transform=ax.transAxes,
                fontsize=8,
                #horizontalalignment='center',
                verticalalignment='center')
            plt.title(title, fontweight='bold', fontsize=11, loc="left")
            plt.xticks(points + bar_width, ('None', '+', '-', 'Both'))
            if findex == len(feature_type_list):
                plt.legend(prop={'size': 8})
            # Top row: feature-type heading above the subplot.
            if index <= len(feature_type_list):
                plt.text(0.5,
                         1.25,
                         feature_type,
                         rotation=0,
                         fontweight="bold",
                         transform=ax.transAxes,
                         fontsize=10,
                         horizontalalignment='center',
                         verticalalignment='center')
            #plt.suptitle("Strandness histograms", fontweight="bold", fontsize=20)
            findex += 1
            index += 1
    #plt.tight_layout()
    plt.subplots_adjust(hspace=0.5,
                        wspace=0.25,
                        top=0.88,
                        left=0.09,
                        right=0.99)
    for extension in [".pdf", ".svg", ".eps", ".png"]:
        plt.savefig("%s%s" % (plot_file_prefix, extension))

    plt.close()
示例#18
0
文件: CCF.py 项目: melakbet/MAVR
    def read(self, input_file):
        """Parse a CCF file into this collection.

        Reads, in order: the embedded VCF metadata block, the VCF header
        line, the CCF header line, then the cluster records. Cluster
        header lines start with ">"; their member VCF records follow on
        tab-indented lines. Populates self.metadata, self.header and
        self.records.
        """
        # TODO: write read from ccf file
        with open(input_file, "r") as in_fd:

            # Optional "#VCF_METADATA START" ... "#VCF_METADATA END" block.
            stripped_line = in_fd.readline().strip()
            if stripped_line == "#VCF_METADATA START":
                vcf_metadata = MetadataVCF()
                stripped_line = in_fd.readline().strip()
                while (stripped_line != "#VCF_METADATA END"):
                    vcf_metadata.add_metadata(stripped_line)
                    stripped_line = in_fd.readline().strip()
            # VCF header: the line after "#VCF_HEADER", leading '#' dropped.
            stripped_line = in_fd.readline().strip()
            if stripped_line == "#VCF_HEADER":
                header_line = in_fd.readline().strip()
                vcf_header = HeaderVCF(header_line[1:].split("\t"))
                #print("a\na\na\na\na\n")
                #print(vcf_header)
                # Columns from index 9 on are the sample names.
                self.metadata = MetadataCCF(vcf_header[9:],
                                            vcf_metadata=vcf_metadata,
                                            vcf_header=vcf_header)
            stripped_line = in_fd.readline().strip()
            if stripped_line == "#CCF_HEADER":
                header_line = in_fd.readline().strip()
                self.header = HeaderCCF(header_line[1:].split("\t"))
            # flag == 1 means a cluster with at least one VCF record is
            # pending and must be flushed before the next cluster header.
            flag = 0
            self.records = []
            while True:
                data_line = in_fd.readline()

                # Empty/blank line terminates the record section.
                if data_line == "" or data_line == "\n":
                    break
                stripped_line = data_line.strip()
                # Tab-indented line: a member VCF record of the current
                # cluster.
                if data_line[0] == "\t":
                    #stripped_line = stripped_line[1:]
                    #print(collection_vcf)
                    collection_vcf.records.append(
                        collection_vcf.add_record(
                            stripped_line,
                            external_metadata=self.metadata.vcf_metadata))
                    flag = 1
                    #print("aaaa")
                    continue

                # A non-indented line ends the previous cluster: flush it.
                if flag != 0:
                    self.records.append(
                        RecordCCF(id=cluster_id,
                                  chrom=chrom,
                                  size=size,
                                  start=start,
                                  end=end,
                                  description=description,
                                  flags=flags,
                                  collection_vcf=collection_vcf,
                                  bad_vcf_records=bad_records,
                                  from_records=False,
                                  subclusters=subclusters))
                    #collection_vcf = None

                # Cluster header: ">id\tchrom\tstart\tend\tdescr;flags".
                if stripped_line[0] == ">":
                    flag = 0
                    cluster_id, chrom, start, end, description_and_flags = stripped_line[
                        1:].split("\t")
                    start = int(start)
                    end = int(end)
                    # Semicolon-separated entries: "key=value" pairs go to
                    # the description, bare tokens become flags.
                    description_and_flags = description_and_flags.split(";")
                    description = OrderedDict({})
                    flags = set([])
                    subclusters = None
                    for descr_entry in description_and_flags:
                        descr_entry_splited = descr_entry.split("=")
                        if len(descr_entry_splited) == 1:
                            flags.add(descr_entry_splited[0])
                            continue
                        if descr_entry_splited[0] == "Size":
                            size = int(descr_entry_splited[1])
                        elif descr_entry_splited[0] == "Bad_records":
                            bad_records = int(descr_entry_splited[1])
                        elif descr_entry_splited[
                                0] == "Mean" or descr_entry_splited[
                                    0] == "Median" or descr_entry_splited[
                                        0] == "Power" or descr_entry_splited[
                                            0] == "Homogeneity":
                            description[descr_entry_splited[0]] = float(
                                descr_entry_splited[1])
                        elif descr_entry_splited[0] == "Loc":
                            description[descr_entry_splited[
                                0]] = descr_entry_splited[1].split(",")
                        elif descr_entry_splited[0] == "Subclusters":
                            subclusters = [
                                int(x)
                                for x in descr_entry_splited[1].split(",")
                            ]
                        else:
                            # Unknown keys: keep the value list, collapsing
                            # single-element lists to a scalar.
                            description[descr_entry_splited[
                                0]] = descr_entry_splited[1].split(",")
                            if len(description[descr_entry_splited[0]]) == 1:
                                description[
                                    descr_entry_splited[0]] = description[
                                        descr_entry_splited[0]][0]
                    # Fresh empty container for this cluster's VCF records.
                    collection_vcf = CollectionVCF(metadata=None,
                                                   record_list=None,
                                                   header=None,
                                                   vcf_file=None,
                                                   samples=None,
                                                   from_file=False,
                                                   external_metadata=None)
                    continue
            # Flush the final cluster. NOTE(review): if the file contains no
            # cluster headers at all, cluster_id etc. are unbound here and
            # this raises NameError -- confirm whether empty files occur.
            self.records.append(
                RecordCCF(id=cluster_id,
                          chrom=chrom,
                          size=size,
                          start=start,
                          end=end,
                          description=description,
                          flags=flags,
                          collection_vcf=collection_vcf,
                          bad_vcf_records=bad_records,
                          from_records=False,
                          subclusters=subclusters))
Example #19
0
    # Load every GFF record into a dict keyed by record (chromosome) id.
    # NOTE(review): gff_file and the plotting inputs below (sample_set_names_list,
    # letter_list_part1, annotation_black_list) are defined outside this fragment.
    annotations_dict = {}
    # Map verbose GFF feature names to the short labels used in the pie charts
    annotation_synonym_dict = {"three_prime_UTR": "3'_UTR",
                               "five_prime_UTR": "5'_UTR",
                               "snoRNA": "ncRNA",
                               "snRNA": "ncRNA"
                               }
    with open(gff_file) as gff_fd:
        for record in GFF.parse(gff_fd):
            annotations_dict[record.id] = record


    rcParams.update({'font.size': 7})
    plt.figure(1, dpi=300, figsize=(8, 6))
    index = 1
    # One pie chart per sample, laid out on a 3x2 grid and labeled A, B, C, ...
    for sample, letter in zip(sample_set_names_list, letter_list_part1):
        collection = CollectionVCF(from_file=True, vcf_file=sample + "_good.vcf")

        plt.subplot(3, 2, index)
        # Annotate each variant with its genomic location (using the synonyms above)
        collection.get_location(annotations_dict, use_synonym=True, synonym_dict=annotation_synonym_dict)
        location_pie(collection, annotation_colors=[],
                     ref_genome=None, explode=True, annotation_black_list=annotation_black_list,
                     allow_several_counts_of_record=False,
                     counts_filename="location_counts.t",
                     counts_dir="location_counts",
                     legend_font=6,
                     combine_mixed=True
                     )
        # Panel title: "<letter>. <sample name>"
        plt.title("%s. %s" % (letter, sample), fontweight='bold')

        index += 1
    for format_ext in ["svg", "eps", "pdf", "png"]:
Example #20
0
    # Output VCF for homozygous variants of the final merged callset.
    # NOTE(review): sample_name, reference, gatk_dir and the intermediate
    # vcf_* path variables are defined earlier, outside this fragment.
    vcf_best_merged_homo = "%s_GATK_best_merged_homo.vcf" % sample_name

    # Call variants on the preprocessed BAM with GATK UnifiedGenotyper
    UnifiedGenotyper.variant_call("%s_trimmed_sorted_rm_pcr_chrom_with_header.bam" % sample_name,
                                  reference,
                                  stand_emit_conf=40,
                                  stand_call_conf=100,
                                  GATK_dir=gatk_dir,
                                  num_of_threads=5,
                                  output_mode="EMIT_VARIANTS_ONLY",
                                  discovery_mode="BOTH",
                                  output_file=vcf_all)

    # Split the raw callset into indels and SNPs
    SelectVariants.get_indel(gatk_dir, reference, vcf_all, vcf_indel)
    SelectVariants.get_SNP(gatk_dir, reference, vcf_all, vcf_SNP)

    # Flag low-quality calls, then drop the flagged ones to keep only the best
    VariantFiltration.filter_bad_SNP(gatk_dir, reference, vcf_SNP, vcf_filtered_SNP)
    VariantFiltration.filter_bad_indel(gatk_dir, reference, vcf_indel, vcf_filtered_indel)
    SelectVariants.remove_filtered(gatk_dir, reference, vcf_filtered_SNP, vcf_best_SNP)
    SelectVariants.remove_filtered(gatk_dir, reference, vcf_filtered_indel, vcf_best_indel)

    # Merge the filtered SNP and indel callsets back into a single VCF
    CombineVariants.combine_from_same_source(gatk_dir, reference, [vcf_best_SNP, vcf_best_indel], vcf_best_merged)

    # Split the merged callset by zygosity and write each part to its own file
    best_merged = CollectionVCF(vcf_file=vcf_best_merged)
    best_merged_homo, best_merged_hetero = best_merged.split_by_zygoty()
    best_merged_homo.write(vcf_best_merged_homo)
    best_merged_hetero.write(vcf_best_merged_hetero)




Example #21
0
    # Hard-coded paths for the LAN210 yeast assembly and its annotations.
    # NOTE(review): sample_set_names_list, expression_data_dir and
    # prepare_data() are defined earlier, outside this fragment.
    annotations_file = "/home/mahajrod/genetics/desaminases/data/LAN210_v0.10m/annotations/merged_annotations_Nagalakshmi_tranf_to_LAN210_v0.10m.gff3"
    sequence_file = "/home/mahajrod/genetics/desaminases/data/LAN210_v0.10m/LAN210_v0.10m.fasta"
    workdir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf"
    # Reference sequences keyed by record id
    sequence_dict = SeqIO.to_dict(SeqIO.parse(sequence_file, "fasta"))
    #print(sequence_dict)

    # GFF annotations keyed by record id
    with open(annotations_file, "r") as in_fd:
        annotation_dict = dict([(record.id, record)
                                for record in GFF.parse(in_fd)])

    # Reference base -> strand label (presumably P = plus, M = minus — verify
    # against the consumer of this dict)
    mutation_strand_dict = {"C": "P", "G": "M"}

    #values_names = {"Len": 0, "P": 1, "P_dens": 2, "M": 3, "M_dens": 4, "Exp": 5}
    # Column indices for the per-feature value table
    values_names = {"Len": 0, "P": 1, "M": 2, "Exp": 3}
    for sample_set_name in sample_set_names_list:
        print("Handling %s" % sample_set_name)
        os.chdir(workdir)

        os.chdir(sample_set_name)
        # All clustered mutations vs. only those in clusters of size >= 3
        mutations = CollectionVCF(
            vcf_file="./clustering/%s_adjusted_cluster_mutations.vcf" %
            sample_set_name,
            from_file=True)
        mutations_large_clusters = CollectionVCF(
            vcf_file="./clustering/%s_adjusted_3+_cluster_mutations.vcf" %
            sample_set_name,
            from_file=True)
        prepare_data(mutations, "all_adjusted_cluster_mutations",
                     expression_data_dir)
        prepare_data(mutations_large_clusters, "adjusted_3+_cluster_mutations",
                     expression_data_dir)
Example #22
0
    # Working directory holding the per-feature-type variant VCFs
    workdir = "/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/all/pre_UTR_strandness/"
    # Panel labels for the subplots (A, B, C, ...)
    letter_list_part1 = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
    os.chdir(workdir)
    sample_set_names_list = [
        "PmCDA1_3d", "PmCDA1_sub1_3d", "PmCDA1_6d", "PmCDA1_sub1_6d"
    ]
    rcParams.update({'font.size': 7})

    feature_type_list = ["5'_UTR", "CDS", "3'_UTR"]

    # One figure per feature type; within it, one 2x2-grid panel per sample
    for feature_type in feature_type_list:
        plt.figure(1, dpi=300, figsize=(6, 6))
        index = 1
        for sample, letter in zip(sample_set_names_list, letter_list_part1):
            collection = CollectionVCF(from_file=True,
                                       vcf_file="%s_%s_variants.vcf" %
                                       (sample, feature_type))
            # Strandness counts for this sample/feature combination
            sample_data = collection.count_strandness("%s_%s_variants" %
                                                      (sample, feature_type))
            plt.subplot(2, 2, index)
            n_groups = 4
            points = np.arange(n_groups)
            bar_width = 0.35

            # Rows 0 and 1 of the "all" table: counts for C and G reference
            # bases respectively — TODO confirm against count_strandness()
            C_values = sample_data["all"][0]
            G_values = sample_data["all"][1]
            rects1 = plt.bar(points,
                             C_values,
                             bar_width,
                             color='b',