def create_illumina_genotyping_references(illumina_read_dir='../Genotyping/'):
    """Generate the Illumina genotyping reference files for three genes.

    For GP1BA (repeat counts 1-4), CSTB (1-15) and MAOA (1-5) this calls
    ``create_reference_region_with_specific_repeats`` once per repeat count,
    passing the reference VNTR, the repeat count, the output path, the
    constant 3000 and the gene's list of repeat patterns.

    Output paths are ``<illumina_read_dir><gene>/<repeat>.pacfa``.

    NOTE(review): 3000 is presumably the flanking region size; confirm
    against the helper's signature.

    :param illumina_read_dir: prefix of the output paths. It is concatenated
        verbatim, so it needs to end with a path separator.
    """
    from reference_vntr import load_unique_vntrs_data

    vntr_records = load_unique_vntrs_data()
    gene_by_vntr_id = {1220: 'GP1BA', 1221: 'CSTB', 1214: 'MAOA'}
    repeat_counts_by_gene = {
        'GP1BA': range(1, 5),
        'CSTB': range(1, 16),
        'MAOA': range(1, 6),
    }
    patterns_by_gene = {
        'GP1BA': [
            'AGCCCGACCACCCCAGAGCCCACCTCAGAGCCCGCCCCC',
            'AGCCCGACCACCCCGGAGCCCACCTCAGAGCCCGCCCCC',
            'AGCCCGACCACCCCGGAGCCCACCCCAATCCCGACCATCGCCA',
        ],
        'CSTB': [
            'CGCGGGGCGGGG',
            'CGCGGGGCGGGG',
            'CGCGGGGCGGGG',
            'CGGCGGGCGGGG',
        ],
        'MAOA': [
            'ACCGGCACCGGCACCAGTACCCGCACCAGT',
            'ACCGGCACCGGCACCGAGCGCAAGGCGGAG',
            'ACCGGCACCGGCACCAGTACCCGCACCAGT',
        ],
    }

    for vntr_id, gene in gene_by_vntr_id.items():
        gene_dir = illumina_read_dir + gene + '/'
        gene_patterns = patterns_by_gene[gene]
        for repeat_count in repeat_counts_by_gene[gene]:
            target_path = gene_dir + str(repeat_count) + '.pacfa'
            create_reference_region_with_specific_repeats(
                vntr_records[vntr_id], repeat_count, target_path, 3000,
                gene_patterns)
def create_illumina_copy_number_variation_references(
        illumina_read_dir='../Illumina_copy_number/'):
    """Generate the Illumina copy-number reference files for five genes.

    For DRD4 (repeat counts 1-11), GP1BA (1-5), CSTB (1-15), MAOA (1-5) and
    IL1RN (1-9) this calls ``create_reference_region_with_specific_repeats``
    once per repeat count with the reference VNTR, the repeat count, the
    output path and the constant 149.

    Output paths are ``<illumina_read_dir><gene>/<repeat>.fa``.

    NOTE(review): 149 is presumably the flanking region size; confirm
    against the helper's signature.

    :param illumina_read_dir: prefix of the output paths. It is concatenated
        verbatim, so it needs to end with a path separator.
    """
    from reference_vntr import load_unique_vntrs_data

    vntr_records = load_unique_vntrs_data()
    gene_by_vntr_id = {
        119: 'DRD4',
        1220: 'GP1BA',
        1221: 'CSTB',
        1214: 'MAOA',
        1219: 'IL1RN',
    }
    repeat_counts_by_gene = {
        'DRD4': range(1, 12),
        'GP1BA': range(1, 6),
        'CSTB': range(1, 16),
        'MAOA': range(1, 6),
        'IL1RN': range(1, 10),
    }

    for vntr_id, gene in gene_by_vntr_id.items():
        gene_dir = illumina_read_dir + gene + '/'
        for repeat_count in repeat_counts_by_gene[gene]:
            target_path = gene_dir + str(repeat_count) + '.fa'
            create_reference_region_with_specific_repeats(
                vntr_records[vntr_id], repeat_count, target_path, 149)
def check_trio_consistency(father_file, mother_file, child_file):
    """Print a consistency summary for the genotypes of a father/mother/child trio.

    The three files are read with ``get_genotypes``; each result must offer
    ``keys()`` yielding VNTR ids. Every id seen in any of the three results
    is checked with ``is_consistent``. The function prints:

    * the total number of VNTR ids,
    * for each inconsistent VNTR whose pattern length is 83 or 84, the last
      50 characters of its left flanking region,
    * the number of inconsistent VNTRs,
    * the repeat-segment counts of the consistent VNTRs, then of the
      inconsistent ones.

    :param father_file: genotype file of the father.
    :param mother_file: genotype file of the mother.
    :param child_file: genotype file of the child.
    """
    father_genotypes = get_genotypes(father_file)
    mother_genotypes = get_genotypes(mother_file)
    child_genotypes = get_genotypes(child_file)

    from reference_vntr import load_unique_vntrs_data
    reference_vntrs = load_unique_vntrs_data()

    # Build the union without concatenating keys(): on Python 3 keys() returns
    # views, which do not support "+".
    vntr_ids = set(father_genotypes.keys())
    vntr_ids.update(mother_genotypes.keys())
    vntr_ids.update(child_genotypes.keys())
    print('Total vntrs: %s' % len(vntr_ids))

    inconsistents = []
    consistents = []
    for vid in vntr_ids:
        consistent = is_consistent(vid, father_genotypes, mother_genotypes,
                                   child_genotypes)
        vntr = reference_vntrs[vid]
        repeat_segment_count = len(vntr.get_repeat_segments())
        if consistent:
            consistents.append(repeat_segment_count)
            continue

        inconsistents.append(repeat_segment_count)
        if 83 <= len(vntr.pattern) < 85:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(vntr.left_flanking_region[-50:])

    print('Total inconsistencies: %s' % len(inconsistents))
    print(consistents)
    print(inconsistents)
def get_pacbio_comparison_result():
    """Compare read recruitment of several methods on simulated PacBio data.

    For every entry matched by ``../Pacbio_copy_number/*`` (one directory per
    gene, named CSTB, HIC1 or INS) and every ``*30x.fastq.sam`` file in it:

    * the copy number is the second-to-last ``_``-separated field of the path,
    * the file is converted with ``make_bam_and_index`` and re-aligned with
      ``bwasw_alignment`` and ``blasr_alignment``,
    * the true read ids come from ``get_pacbio_true_read_ids``; the BWA-SW
      and BLASR ids that are also true ids are counted,
    * the count of this tool is taken from
      ``get_out_pacbio_filtered_counts``.

    A ``result.txt`` is then written into the gene directory with one line
    per copy number, in ascending order:
    ``copies original our_filtering our_selection bwasw blasr``.

    NOTE(review): a directory whose name is not one of the three genes leaves
    ``vntr_id`` as None and fails on the first file when indexing the
    reference VNTRs.
    """
    reference_vntrs = load_unique_vntrs_data()
    id_to_gene = {1221: 'CSTB', 1216: 'HIC1', 1215: 'INS'}
    gene_to_id = dict((gene, vid) for vid, gene in id_to_gene.items())

    for gene_dir in glob.glob('../Pacbio_copy_number/*'):
        print(gene_dir)
        files = glob.glob(gene_dir + '/*30x.fastq.sam')
        gene_name = gene_dir.split('/')[-1]
        # The VNTR depends only on the directory, so resolve it once here
        # instead of once per file.
        vntr_id = gene_to_id.get(gene_name)

        mapped_reads = {}
        for file_name in files:
            copies = int(file_name.split('_')[-2])
            make_bam_and_index(file_name)
            base_name = file_name[:-4]  # strip the 4-character '.sam' suffix
            original_bam = base_name + '.bam'
            bwasw_alignment(base_name)
            blasr_alignment(base_name)
            # The aligner outputs are addressed by base_name minus its last
            # three characters plus a fixed suffix.
            bwasw_alignment_file = base_name[:-3] + '_bwasw_aln.bam'
            blasr_alignment_file = base_name[:-3] + '_blasr_aln.bam'

            ref_vntr = reference_vntrs[vntr_id]
            ref_length = copies * len(ref_vntr.pattern) + 2000
            true_ids = get_pacbio_true_read_ids(original_bam, ref_vntr,
                                                ref_length)
            blasr_ids = get_id_of_reads_mapped_to_vntr_in_bamfile(
                blasr_alignment_file, ref_vntr)
            bwasw_ids = get_id_of_reads_mapped_to_vntr_in_bamfile(
                bwasw_alignment_file, ref_vntr)
            blasr_tp = [read_id for read_id in blasr_ids
                        if read_id in true_ids]
            bwasw_tp = [read_id for read_id in bwasw_ids
                        if read_id in true_ids]

            vntr_finder = VNTRFinder(ref_vntr)
            our_filtering = get_out_pacbio_filtered_counts(base_name,
                                                           vntr_finder)
            # Selection is reported with the same value as filtering.
            our_selection = our_filtering

            mapped_reads[copies] = [
                len(true_ids), our_filtering, our_selection,
                len(bwasw_tp), len(blasr_tp)
            ]

        with open(gene_dir + '/result.txt', 'w') as out:
            # sorted(dict) iterates the keys on both Python 2 and 3.
            for copies in sorted(mapped_reads):
                original, our_filtering, our_selection, bwasw, blasr = \
                    mapped_reads[copies]
                out.write('%s %s %s %s %s %s\n' %
                          (copies, original, our_filtering, our_selection,
                           bwasw, blasr))
def get_illumina_comparison_result():
    """Compare read recruitment of several methods on simulated Illumina data.

    For every entry matched by ``../Illumina_copy_number/*`` (one directory
    per gene, named DRD4, GP1BA, CSTB, MAOA or IL1RN) and every ``*30x.sam``
    file in it:

    * the copy number is the second-to-last ``_``-separated field of the path,
    * the file is converted with ``make_bam_and_index``; the matching ``.fq``
      file is aligned with ``bowtie_alignment`` and ``bwamem_alignment``,
    * reads are counted in the original BAM, by the selection and filtering
      steps of this tool, and in the BWA and Bowtie alignments.

    A ``result.txt`` is then written into the gene directory with one line
    per copy number, in ascending order:
    ``copies original our_filtering our_selection bwa bowtie``.

    NOTE(review): a directory whose name is not one of the five genes leaves
    ``vntr_id`` as None and fails on the first file when indexing the
    reference VNTRs.
    """
    reference_vntrs = load_unique_vntrs_data()
    id_to_gene = {
        119: 'DRD4',
        1220: 'GP1BA',
        1221: 'CSTB',
        1214: 'MAOA',
        1219: 'IL1RN'
    }
    gene_to_id = dict((gene, vid) for vid, gene in id_to_gene.items())

    for gene_dir in glob.glob('../Illumina_copy_number/*'):
        print(gene_dir)
        files = glob.glob(gene_dir + '/*30x.sam')
        gene_name = gene_dir.split('/')[-1]
        print(len(files))
        # The VNTR depends only on the directory, so resolve it once here
        # instead of once per file.
        vntr_id = gene_to_id.get(gene_name)

        mapped_reads = {}
        for file_name in files:
            copies = file_name.split('_')[-2]
            make_bam_and_index(file_name)
            base_name = file_name[:-4]  # strip the 4-character '.sam' suffix
            fastq_file = base_name + '.fq'
            bowtie_bam = bowtie_alignment(fastq_file)
            bwa_bam = bwamem_alignment(fastq_file)
            original_bam = base_name + '.bam'

            ref_vntr = reference_vntrs[vntr_id]
            vntr_finder = VNTRFinder(ref_vntr)
            original = count_reads(original_bam)
            our_selection = get_our_selected_reads_count(fastq_file,
                                                         vntr_finder)
            our_filtering = get_our_filtered_reads_count(fastq_file,
                                                         vntr_finder)
            bwa = count_reads_mapped_to_vntr_in_bamfile(bwa_bam, ref_vntr)
            bowtie = count_reads_mapped_to_vntr_in_bamfile(bowtie_bam,
                                                           ref_vntr)
            # Integer keys make the report sort numerically, not lexically.
            mapped_reads[int(copies)] = [
                original, our_filtering, our_selection, bwa, bowtie
            ]

        with open(gene_dir + '/result.txt', 'w') as out:
            # sorted(dict) iterates the keys on both Python 2 and 3.
            for copies in sorted(mapped_reads):
                original, our_filtering, our_selection, bwa, bowtie = \
                    mapped_reads[copies]
                out.write('%s %s %s %s %s %s\n' %
                          (copies, original, our_filtering, our_selection,
                           bwa, bowtie))
def create_pacbio_copy_number_variation_references(
        pacbio_read_dir='../pacbio_recruitment/set1/'):
    """Generate the PacBio copy-number reference files for CSTB, HIC1 and INS.

    Repeat counts used per gene:

    * CSTB: even values from the range 1-68,
    * HIC1: every value from 2 to 35,
    * INS: multiples of 5 from the range 10-170.

    For each one, ``create_reference_region_with_specific_repeats`` is called
    with the reference VNTR, the repeat count, the output path and the
    constant 1000. Output paths are ``<pacbio_read_dir><gene>/<repeat>.fa``.

    NOTE(review): 1000 is presumably the flanking region size; confirm
    against the helper's signature.

    :param pacbio_read_dir: prefix of the output paths. It is concatenated
        verbatim, so it needs to end with a path separator.
    """
    from reference_vntr import load_unique_vntrs_data

    vntr_records = load_unique_vntrs_data()
    gene_by_vntr_id = {1221: 'CSTB', 1216: 'HIC1', 1215: 'INS'}
    repeat_counts_by_gene = {
        'CSTB': range(1, 69),
        'HIC1': range(2, 36),
        'INS': range(10, 171),
    }
    # Only repeat counts divisible by the gene's step are generated; genes
    # without an entry use step 1, i.e. every count in their range.
    step_by_gene = {'INS': 5, 'CSTB': 2}

    for vntr_id, gene in gene_by_vntr_id.items():
        step = step_by_gene.get(gene, 1)
        for repeat_count in repeat_counts_by_gene[gene]:
            if repeat_count % step != 0:
                continue
            target_path = pacbio_read_dir + gene + '/' + str(
                repeat_count) + '.fa'
            create_reference_region_with_specific_repeats(
                vntr_records[vntr_id], repeat_count, target_path, 1000)
def create_pacbio_coverage_data_for_3_genes_and_10_cn(
        pacbio_read_dir='../pacbio_coverage_experiment/'):
    """Generate the reference files for the PacBio coverage experiment.

    CSTB, HIC1 and INS are all configured, but every gene other than INS is
    skipped, so only INS files are produced: one per multiple of 5 in the
    range 10-109 (10, 15, ..., 105).

    For each one, ``create_reference_region_with_specific_repeats`` is called
    with the reference VNTR, the repeat count, the output path and the
    constant 3000. Output paths are ``<pacbio_read_dir><gene>/<repeat>.fa``.

    NOTE(review): 3000 is presumably the flanking region size; confirm
    against the helper's signature.

    :param pacbio_read_dir: prefix of the output paths. It is concatenated
        verbatim, so it needs to end with a path separator.
    """
    from reference_vntr import load_unique_vntrs_data

    vntr_records = load_unique_vntrs_data()
    gene_by_vntr_id = {1221: 'CSTB', 1216: 'HIC1', 1215: 'INS'}
    repeat_counts_by_gene = {
        'CSTB': range(2, 42),
        'HIC1': range(2, 22),
        'INS': range(10, 110),
    }
    # Only repeat counts divisible by the gene's step are generated; genes
    # without an entry use step 1.
    step_by_gene = {'INS': 5, 'CSTB': 2}

    for vntr_id, gene in gene_by_vntr_id.items():
        # Everything except INS is skipped; the other genes remain
        # configured above but currently produce no output.
        if gene != 'INS':
            continue
        step = step_by_gene.get(gene, 1)
        for repeat_count in repeat_counts_by_gene[gene]:
            if repeat_count % step != 0:
                continue
            target_path = pacbio_read_dir + gene + '/' + str(
                repeat_count) + '.fa'
            create_reference_region_with_specific_repeats(
                vntr_records[vntr_id], repeat_count, target_path, 3000)
def create_pacbio_ru_length_data_for_all_vntrs(
        pacbio_read_dir='../pacbio_ru_data_for_all_vntrs/'):
    """Generate reference files grouped by repeat-unit (pattern) length.

    Steps:

    1. Read ``vntr_complex.txt`` from the current directory; the first
       whitespace-separated field of each non-blank line is a VNTR id to
       exclude. Id 0 is always excluded as well.
    2. Walk the reference VNTRs by index and group the remaining ids by
       ``len(pattern)``, keeping at most the first 4 ids per length.
    3. For every length with at least 2 ids and every id in that group,
       take ``start = max(3, <number of repeat segments> - 10)`` and, for
       each multiple of 5 in ``[start, start + 20]``, call
       ``create_reference_region_with_specific_repeats`` with the reference
       VNTR, the repeat count, the output path and the constant 1000.

    Output paths are
    ``<pacbio_read_dir><ru_length>/vntr_id_<id>_<repeat>.fa``; missing output
    directories are created.

    NOTE(review): 1000 is presumably the flanking region size; confirm
    against the helper's signature.

    :param pacbio_read_dir: prefix of the output paths. It is concatenated
        verbatim, so it needs to end with a path separator.
    """
    import os
    from reference_vntr import load_unique_vntrs_data

    reference_vntrs = load_unique_vntrs_data()

    # A set keeps the membership test below O(1) per VNTR.
    excluded_ids = {0}
    with open('vntr_complex.txt') as infile:
        for line in infile:
            fields = line.split()
            if fields:  # tolerate blank lines, e.g. a trailing newline
                excluded_ids.add(int(fields[0]))

    # Index-based iteration on purpose: VNTR ids are positions in
    # reference_vntrs, as used everywhere else in this function.
    ids_by_ru_length = {}
    for vntr_id in range(len(reference_vntrs)):
        if vntr_id in excluded_ids:
            continue
        ru_length = len(reference_vntrs[vntr_id].pattern)
        selected = ids_by_ru_length.setdefault(ru_length, [])
        if len(selected) < 4:
            selected.append(vntr_id)

    for ru_length, vntr_ids in ids_by_ru_length.items():
        if len(vntr_ids) < 2:
            continue
        for vntr_id in vntr_ids:
            vntr = reference_vntrs[vntr_id]
            original_repeats = len(vntr.get_repeat_segments())
            start = max(3, original_repeats - 10)
            for repeat in range(start, start + 21):
                if repeat % 5 != 0:
                    continue
                outfile = pacbio_read_dir + str(ru_length) + '/vntr_id_' + \
                    str(vntr_id) + '_' + str(repeat) + '.fa'
                out_dir = os.path.dirname(outfile)
                if not os.path.exists(out_dir):
                    os.makedirs(out_dir)
                create_reference_region_with_specific_repeats(
                    vntr, repeat, outfile, 1000)
def find_info_by_mapping(sim_dir='simulation_data/', dir_index=0):
    """Evaluate BWA and Bowtie read mapping on one simulation directory.

    The directory is the ``dir_index``-th match of ``<sim_dir>/*``; the gene
    is the part of its base name before the first underscore. For every file
    in it whose name ends with ``WGS_30x.fasta`` (``<prefix>`` below is the
    path without its last 6 characters):

    1. The true read ids are loaded from ``<prefix>_true_reads.txt`` if it
       exists; otherwise they are computed from ``<prefix>.sam`` with
       ``get_id_of_reads_mapped_to_vntr_in_samfile`` and written to that file.
    2. ``write_hmm_scores`` is run unless ``<prefix>_t_reads_hmm_score.txt``
       already exists.
    3. BWA is run with parameter 10 and Bowtie with parameter -2 (the first
       Bowtie parameter, -0.6, is skipped). For each run the positive and
       false-negative reads are saved with ``save_reads_stat``; a run is
       skipped when both of its output files already exist. Alignments go to
       ``/tmp`` and ``clean_up_tmp`` is called after each run.

    For MAOA the Bowtie alignment is additionally copied to
    ``/pedigree2/projects/VeNTeR/`` through a shell ``cp``.

    NOTE(review): ``gene_to_length`` has no entry for IL1RN, so computing the
    true reads for an IL1RN simulation raises KeyError.

    :param sim_dir: directory containing one sub-directory per simulation.
    :param dir_index: index into the glob result selecting the simulation;
        also used to name the temporary alignment files.
    """
    reference_vntrs = load_unique_vntrs_data()
    id_to_gene = {119: 'DRD4', 1220: 'GP1BA', 1221: 'CSTB', 1214: 'MAOA', 1219: 'IL1RN'}
    gene_to_length = {'DRD4': 528, 'GP1BA': 39, 'CSTB': 168, 'MAOA': 30}
    clean_up_tmp()

    simulation_dir = glob.glob(sim_dir + '/*')[dir_index]
    gene_name = simulation_dir.split('/')[-1].split('_')[0]
    vntr_id_by_gene = dict((gene, vid) for vid, gene in id_to_gene.items())
    tmp_prefix = '/tmp/_gene%s_' % dir_index

    for fasta_file in glob.glob(simulation_dir + '/*'):
        if not fasta_file.endswith('WGS_30x.fasta'):
            continue

        # None for an unknown gene, exactly like the original lookup loop.
        ref_vntr = reference_vntrs[vntr_id_by_gene.get(gene_name)]
        prefix = fasta_file[:-6]  # path without the '.fasta' suffix
        true_reads_file = prefix + '_true_reads.txt'
        simulated_sam_file = prefix + '.sam'

        if os.path.exists(true_reads_file):
            with open(true_reads_file) as reads_in:
                stripped = [line.strip() for line in reads_in.readlines()]
            true_reads = [read_id for read_id in stripped if read_id != '']
        else:
            region_start = ref_vntr.start_point
            region = [region_start, ref_vntr.start_point + gene_to_length[gene_name]]
            true_reads = get_id_of_reads_mapped_to_vntr_in_samfile(
                simulated_sam_file, ref_vntr, region=region)
            with open(true_reads_file, 'w') as reads_out:
                for read_id in true_reads:
                    reads_out.write('%s\n' % read_id)

        true_reads_hmm_scores = prefix + '_t_reads_hmm_score.txt'
        false_reads_hmm_scores = prefix + '_f_reads_hmm_score.txt'
        if not os.path.exists(true_reads_hmm_scores):
            write_hmm_scores(simulated_sam_file, true_reads_hmm_scores,
                             false_reads_hmm_scores, ref_vntr, true_reads)

        # BWA runs, one per parameter value.
        for index, parameter in enumerate([10]):
            label = abs(parameter)
            positive_file = prefix + '_bwa_%s_positive_supplementary_reads.txt' % label
            false_negative_file = prefix + '_bwa_%s_fn_supplementary_reads.txt' % label
            already_done = (os.path.exists(positive_file) and
                            os.path.exists(false_negative_file))
            if already_done:
                continue
            bwa_alignment_file = tmp_prefix + 'bwa_alignment_%s.sam' % index
            bwa_alignment(fasta_file, bwa_alignment_file, parameter)
            positive_reads, fn_reads = get_positive_and_fn_reads_from_samfile(
                bwa_alignment_file, ref_vntr, true_reads)
            save_reads_stat(positive_file, positive_reads)
            save_reads_stat(false_negative_file, fn_reads)
            clean_up_tmp()

        # Bowtie runs; the parameter at index 0 is deliberately skipped.
        for index, parameter in enumerate([-0.6, -2]):
            if index == 0:
                continue
            label = abs(parameter)
            positive_file = prefix + '_bowtie_%s_positive_supplementary_reads.txt' % label
            false_negative_file = prefix + '_bowtie_%s_fn_supplementary_reads.txt' % label
            already_done = (os.path.exists(positive_file) and
                            os.path.exists(false_negative_file))
            if already_done:
                continue
            bowtie_alignment_file = tmp_prefix + 'bowtie_alignment_%s.sam' % index
            bowtie_alignment(fasta_file, bowtie_alignment_file, parameter)
            positive_reads, fn_reads = get_positive_and_fn_reads_from_samfile(
                bowtie_alignment_file, ref_vntr, true_reads)
            save_reads_stat(positive_file, positive_reads)
            save_reads_stat(false_negative_file, fn_reads)
            if gene_name == 'MAOA':
                os.system('cp %s /pedigree2/projects/VeNTeR/bowtie_alignment_%s.sam' % (bowtie_alignment_file, index))
            clean_up_tmp()
def genotype(args, genotype_parser):
    """Run VNTR genotyping for the parsed command-line arguments.

    The function validates the arguments, stores the error rate and thread
    count in ``settings``, configures logging to ``log_<input name>.log`` in
    the working directory, and dispatches to the matching ``GenomeAnalyzer``
    method:

    * ``args.pacbio``: PacBio alignment file or PacBio reads,
    * ``args.frameshift``: frameshift detection, only if
      ``valid_vntr_for_frameshift`` accepts the target ids,
    * otherwise: alignment file or short reads.

    The input is treated as an alignment file when its name ends with
    ``bam`` or ``sam``. The target VNTR ids come from ``args.vntr_id``
    (comma-separated) or, when it is not given, from a hard-coded list.

    NOTE(review): the code following each ``print_error`` call assumes that
    ``print_error`` does not return; confirm against its definition.

    :param args: parsed arguments; the attributes read are alignment_file,
        fasta, nanopore, pacbio, threads, working_directory, vntr_id, naive
        and frameshift.
    :param genotype_parser: parser used for error reporting.
    """
    if args.alignment_file is None and args.fasta is None:
        print_error(
            genotype_parser,
            'ERROR: No input specified. Please specify alignment file or fasta file'
        )

    # Nanopore and PacBio input use 0.3; any other input uses 0.05.
    if args.nanopore or args.pacbio:
        settings.MAX_ERROR_RATE = 0.3
    else:
        settings.MAX_ERROR_RATE = 0.05

    if args.threads < 1:
        print_error(genotype_parser, 'ERROR: threads cannot be less than 1')
    settings.CORES = args.threads

    input_file = args.alignment_file if args.alignment_file else args.fasta
    input_is_alignment_file = input_file.endswith(('bam', 'sam'))

    if args.working_directory:
        working_directory = args.working_directory + '/'
    else:
        # dirname() is '' for a bare file name such as 'reads.bam'. Without
        # the fallback the working directory would become '/', i.e. the
        # filesystem root, instead of the current directory.
        working_directory = (os.path.dirname(input_file) or '.') + '/'

    log_file = working_directory + 'log_%s.log' % os.path.basename(input_file)
    log_format = '%(asctime)s %(levelname)s:%(message)s'
    logging.basicConfig(format=log_format, filename=log_file,
                        level=logging.DEBUG, filemode='w')

    reference_vntrs = load_unique_vntrs_data()
    # Hard-coded default targets, used when no --vntr_id style argument is
    # given (args.vntr_id is None).
    illumina_targets = [532789, 188871, 301645, 600000]

    # NOTE(review): the list built by this loop is unconditionally replaced
    # by the if/else right below, so the filter currently has no effect on
    # the targets. It is kept because the side effects of the two methods
    # cannot be ruled out from here; decide whether it should feed the
    # default target list or be removed.
    target_vntrs = []
    for i in range(len(reference_vntrs)):
        if not reference_vntrs[i].is_non_overlapping(
        ) or reference_vntrs[i].has_homologous_vntr():
            continue
        target_vntrs.append(i)

    if args.vntr_id is not None:
        target_vntrs = [int(vid) for vid in args.vntr_id.split(',')]
    else:
        target_vntrs = illumina_targets

    genome_analyzer = GenomeAnalyzer(reference_vntrs, target_vntrs,
                                     working_directory)
    if args.pacbio:
        if input_is_alignment_file:
            genome_analyzer.find_repeat_counts_from_pacbio_alignment_file(
                input_file)
        else:
            genome_analyzer.find_repeat_counts_from_pacbio_reads(
                input_file, args.naive)
    elif args.frameshift:
        if valid_vntr_for_frameshift(target_vntrs):
            genome_analyzer.find_frameshift_from_alignment_file(input_file)
        else:
            genotype_parser.error(
                '--frameshift is only available for these IDs: %s' %
                settings.FRAMESHIFT_VNTRS)
    elif input_is_alignment_file:
        genome_analyzer.find_repeat_counts_from_alignment_file(input_file)
    else:
        genome_analyzer.find_repeat_counts_from_short_reads(input_file)