def __init__(self, genome_file, keep_in_memory=True, keep_until_done=False, prefix=''):
    """Open *genome_file* and attach a ``FastaReader`` to the opened handle.

    The two caching flags and *prefix* are stored unchanged on the
    instance, and ``all_chr`` starts out as an empty dict.
    """
    genome_handle = open_input_file(genome_file)
    self.open_genome_file = genome_handle
    self.reader = FastaReader(genome_handle)
    # Plain configuration values, stored as given.
    self.keep_in_memory = keep_in_memory
    self.keep_until_done = keep_until_done
    self.prefix = prefix
    self.all_chr = {}
def parse_stat_file(stat_file):
    """Read four read counts from fixed line positions of *stat_file*.

    The first whitespace-separated field of lines 1, 2, 3 and 7 is parsed
    as an integer (presumably a samtools-flagstat-like report -- confirm
    against the producer of the file).

    Returns a tuple ``(total, duplicates, mapped, properly_paired)``; a
    count whose line is absent from the file stays at 0.

    Raises ValueError if one of the four parsed fields is not an integer.
    The other lines are no longer parsed, since their values were never
    used.
    """
    # 1-based line number -> position in the returned tuple
    wanted_lines = {1: 0, 2: 1, 3: 2, 7: 3}
    counts = [0, 0, 0, 0]
    open_file = utils_logging.open_input_file(stat_file, pipe=False)
    try:
        for line_number, line in enumerate(open_file, 1):
            slot = wanted_lines.get(line_number)
            if slot is not None:
                counts[slot] = int(line.split()[0])
    finally:
        # The original close() was placed after the return statement and
        # therefore never ran.
        open_file.close()
    return tuple(counts)
def bin_coordinates_through_genome(input_file, output_file, genome_file, bin_size):
    """Bin per-chromosome coordinates and write them with a genome-wide offset.

    *input_file* is read as whitespace-separated lines whose first field is
    a chromosome name and whose second field is an integer coordinate.
    For every chromosome (in sorted name order) the sequence is fetched
    through ``GenomeLoader(genome_file).get_chr`` and the coordinates are
    binned with ``bin_value_from_array(coordinates, bin_size, chr_len)``.

    Each bin produces one tab-separated output line:
    chromosome (stripped header returned by the loader), bin start within
    the chromosome, bin start shifted by the cumulative binned length of
    the chromosomes already written, and the bin value.
    """
    coordinates_per_chr = {}
    open_file = utils_logging.open_input_file(input_file)
    try:
        for line in open_file:
            sp_line = line.split()
            if not sp_line:
                # blank line: nothing to record
                continue
            coordinates_per_chr.setdefault(sp_line[0], []).append(int(sp_line[1]))
    finally:
        open_file.close()

    genome_loader = GenomeLoader(genome_file)
    genome_offset = 0
    open_output = utils_logging.open_output_file(output_file)
    try:
        for chr_name in sorted(coordinates_per_chr):
            header, sequence = genome_loader.get_chr(chr_name)
            # Look the coordinates up with the name they were stored under;
            # the loader's header is only used for display and may differ.
            all_bins = bin_value_from_array(coordinates_per_chr[chr_name], bin_size, len(sequence))
            display_name = header.strip()
            for bin_index, value in enumerate(all_bins):
                bin_start = bin_index * bin_size
                open_output.write('%s\t%s\t%s\t%s\n' % (
                    display_name, bin_start, bin_start + genome_offset, value))
            genome_offset += len(all_bins) * bin_size
    finally:
        open_output.close()
def sex_specific_markers_female(vcf_file, mother, father, offsprings_file): sample_to_sex,sex_to_sample, ordered_sample = load_sex_info(offsprings_file) file_handle = utils_logging.open_input_file(vcf_file, pipe=False) reader = vcfIO.VcfReader(file_handle) sex_to_sample.get("M").remove(father) sex_to_sample.get("F").remove(mother) for vcf_record in reader: gt_mother = vcf_record.get_genotype(mother) sp_mother = vcf_record.get_sample_depth(mother) gt_father = vcf_record.get_genotype(father) sp_father = vcf_record.get_sample_depth(father) sample_female=[] sample_male=[] if (sp_mother is None or int(sp_mother)<4) and sp_father is not None and int(sp_father)>10: valid=True for sample in sex_to_sample.get('F'): gt_of_fem = vcf_record.get_genotype(sample) sp_of_fem = vcf_record.get_sample_depth(sample) sample_female.append("%s:%s"%(gt_of_fem,sp_of_fem)) if sp_of_fem is not None and int(sp_of_fem)>2: valid=False break nb_male_offspring=0 for sample in sex_to_sample.get('M'): gt_of_mal = vcf_record.get_genotype(sample) sp_of_mal = vcf_record.get_sample_depth(sample) sample_male.append("%s:%s"%(gt_of_mal,sp_of_mal)) if sp_of_mal is not None and int(sp_of_mal)>5: nb_male_offspring+=1 if nb_male_offspring < len(sex_to_sample.get('M'))-4: valid=False if valid : print vcf_record.get_reference(), vcf_record.get_position(), "%s:%s"%(gt_father,sp_father), ' '.join(sample_male), "\t\t", "%s:%s"%(gt_mother,sp_mother), ' '.j
def get_normalize_coverage(coverage_file, nb_sample_required=0):
    """Load per-sample marker coverage and scale it to counts per million.

    The file is tab-separated. A line starting with ``#`` is the header:
    sample names are taken from every second column starting at the fifth
    (index 4). On any other line column 0 is the marker name, column 3 is
    compared (as an int) with *nb_sample_required*, and the coverage values
    are read from the same columns as the sample names. Lines whose column 3
    is below *nb_sample_required* are ignored.

    Returns ``(all_markers, all_samples, all_samples_to_norm_coverage)``
    where the last item maps each sample to a list of floats
    ``coverage / total_coverage_of_sample * 1000000``, aligned with
    ``all_markers``. A sample whose total coverage is 0 gets 0.0 for every
    marker instead of raising ZeroDivisionError.
    """
    all_samples = []
    all_markers = []
    all_samples_to_coverage = {}
    file_handle = utils_logging.open_input_file(coverage_file, pipe=False)
    try:
        for line in file_handle:
            if not line.strip():
                continue
            sp_line = line.strip().split("\t")
            if line.startswith("#"):
                for sample in sp_line[4::2]:
                    all_samples.append(sample)
                    all_samples_to_coverage[sample] = []
            elif int(sp_line[3]) >= nb_sample_required:
                all_markers.append(sp_line[0])
                # Indexing all_samples keeps the IndexError on lines that
                # carry more value columns than the header declared.
                for sample_index, data in enumerate(sp_line[4::2]):
                    all_samples_to_coverage[all_samples[sample_index]].append(int(data))
    finally:
        file_handle.close()

    all_samples_to_norm_coverage = {}
    for sample in all_samples:
        coverage_info = all_samples_to_coverage.get(sample)
        total = sum(coverage_info)
        if total:
            normalized_coverage = [float(coverage) / total * 1000000 for coverage in coverage_info]
        else:
            # No reads at all for this sample: nothing to scale.
            normalized_coverage = [0.0] * len(coverage_info)
        all_samples_to_norm_coverage[sample] = normalized_coverage
    return all_markers, all_samples, all_samples_to_norm_coverage
def allele_presence_abscence(vcf_file, parent_name1, parent_name2):
    """Print allele presence/absence patterns in the offspring of two parents.

    Every sample of the VCF other than the two parents is treated as a
    child. A record is used when both parents have exactly one valid
    genotype (genotype quality threshold 10, minimum depth 0), the two
    genotypes differ, together they carry exactly two alleles and at least
    one parent is heterozygous.

    For each used record one tab-separated line is printed: reference,
    position, ``10`` (parent 1 heterozygous) or ``01`` (only parent 2
    heterozygous), and one character per child: ``1`` allele present,
    ``0`` allele absent, ``-`` no valid genotype.

    Fixes over the previous version: the input handle is closed, records
    are no longer accumulated in a dict that was never read, and alleles
    are compared as whole strings (``set('10')`` used to split a two-digit
    allele index into the characters ``'1'`` and ``'0'``).
    """
    qual_threshold = 10
    cov_threshold = 0

    def callable_genotypes(record, sample):
        # Valid genotypes of a single sample, as returned by the VCF record.
        return record.get_valid_genotype_per_sample(
            genotype_quality_threshold=qual_threshold,
            minimum_depth=cov_threshold,
            sample_list=[sample])

    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        reader = vcfIO.VcfReader(file_handle)
        children_samples = [sample for sample in reader.get_sample_names()
                            if sample != parent_name1 and sample != parent_name2]
        for vcf_records in reader:
            genotype_p1 = callable_genotypes(vcf_records, parent_name1)
            genotype_p2 = callable_genotypes(vcf_records, parent_name2)
            # Both parents must be callable.
            if len(genotype_p1) != 1 or len(genotype_p2) != 1:
                continue
            g1 = next(iter(genotype_p1))
            g2 = next(iter(genotype_p2))
            alleles_p1 = set(g1.split('/'))
            alleles_p2 = set(g2.split('/'))
            all_alleles = set()
            all_alleles.update(alleles_p1)
            all_alleles.update(alleles_p2)
            if g1 == g2 or len(all_alleles) != 2:
                continue
            if len(alleles_p1) <= 1 and len(alleles_p2) <= 1:
                continue
            # NOTE(review): which allele becomes allele1 depends on set
            # iteration order, exactly as before -- confirm this is intended.
            allele1 = all_alleles.pop()
            allele2 = all_alleles.pop()
            allele1_seg_pattern = []
            allele2_seg_pattern = []
            for sample in children_samples:
                remaining_genotype = callable_genotypes(vcf_records, sample)
                if len(remaining_genotype) > 0:
                    haplotypes = set(next(iter(remaining_genotype)).split('/'))
                    allele1_seg_pattern.append('1' if allele1 in haplotypes else '0')
                    allele2_seg_pattern.append('1' if allele2 in haplotypes else '0')
                else:
                    allele1_seg_pattern.append('-')
                    allele2_seg_pattern.append('-')
            if len(alleles_p1) > 1:
                print("%s\t%s\t10\t%s" % (vcf_records.get_reference(), vcf_records.get_position(),
                                          ''.join(allele1_seg_pattern)))
            elif len(alleles_p2) > 1:
                print("%s\t%s\t01\t%s" % (vcf_records.get_reference(), vcf_records.get_position(),
                                          ''.join(allele2_seg_pattern)))
    finally:
        file_handle.close()
def __init__(self, genome_file, keep_in_memory=True, keep_until_done=False, prefix=''):
    """Set up the loader for *genome_file*.

    The file is opened with ``open_input_file`` and handed to a
    ``FastaReader``; the remaining arguments are kept as attributes and
    ``all_chr`` is initialised to an empty dict.
    """
    opened_genome = open_input_file(genome_file)
    self.open_genome_file = opened_genome
    self.reader = FastaReader(opened_genome)

    self.keep_in_memory = keep_in_memory
    self.keep_until_done = keep_until_done
    self.prefix = prefix
    self.all_chr = {}
def read_exon_capture_file(exon_capture_file, extension=0):
    """Load the segments of a capture file, grouped by chromosome.

    Lines starting with ``#`` or ``track`` and blank lines are skipped.
    On every other line the first three whitespace-separated fields are
    read as chromosome, start and end. *extension* is subtracted from the
    start and added to the end.

    Returns a dict mapping each chromosome name to a list of
    ``(start, end)`` tuples in file order.
    """
    all_segments_per_chr = {}
    open_exon_capture = utils_logging.open_input_file(exon_capture_file, pipe=False)
    try:
        for line in open_exon_capture:
            if line.startswith(('#', 'track')):
                continue
            sp_line = line.strip().split()
            if not sp_line:
                # A blank (e.g. trailing) line used to raise IndexError.
                continue
            chrom = sp_line[0]
            start = int(sp_line[1]) - extension
            end = int(sp_line[2]) + extension
            all_segments_per_chr.setdefault(chrom, []).append((start, end))
    finally:
        # Close the handle even when a line cannot be parsed.
        open_exon_capture.close()
    return all_segments_per_chr
def snps_to_allele(vcf_file, bam_file):
    """Group the VCF records by reference and process one reference at a time.

    Records are collected in a dict keyed by position for as long as the
    reference name stays the same. Whenever the reference changes, and once
    more after the last record, the collected records are passed to
    ``process_alleles(records, sample_names, reference)``.

    *bam_file* is accepted but not used by this function. The input handle
    is now closed when the function finishes or fails.
    """
    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        reader = vcfIO.VcfReader(file_handle)
        sample_names = reader.get_sample_names()
        vcf_record_in_one_contig = {}
        curr_reference = None
        for vcf_records in reader:
            reference = vcf_records.get_reference()
            if curr_reference != reference:
                # New reference: flush what was gathered for the previous one.
                if curr_reference:
                    process_alleles(vcf_record_in_one_contig, sample_names, curr_reference)
                vcf_record_in_one_contig = {}
            vcf_record_in_one_contig[vcf_records.get_position()] = vcf_records
            curr_reference = reference
        if curr_reference:
            process_alleles(vcf_record_in_one_contig, sample_names, curr_reference)
    finally:
        file_handle.close()
def bin_coordinates(input_file, output_file, bin_size):
    """Bin the coordinates of each chromosome and write one line per bin.

    *input_file* is read as whitespace-separated lines whose first field is
    a chromosome name and whose second field is an integer coordinate.
    The coordinates of each chromosome are binned with
    ``bin_value_from_array(coordinates, bin_size)`` and every bin is
    written to *output_file* as ``chromosome<TAB>bin_start<TAB>value``.

    Chromosomes are now written in sorted name order (the order used to
    follow dict iteration order), and both files are closed on error.
    """
    coordinates_per_chr = {}
    open_file = utils_logging.open_input_file(input_file)
    try:
        for line in open_file:
            sp_line = line.split()
            if not sp_line:
                # blank line: nothing to record
                continue
            coordinates_per_chr.setdefault(sp_line[0], []).append(int(sp_line[1]))
    finally:
        open_file.close()

    open_output = utils_logging.open_output_file(output_file)
    try:
        for chr_name in sorted(coordinates_per_chr):
            all_bins = bin_value_from_array(coordinates_per_chr[chr_name], bin_size)
            for bin_index, value in enumerate(all_bins):
                open_output.write('%s\t%s\t%s\n' % (chr_name, bin_index * bin_size, value))
    finally:
        open_output.close()
def read_exon_capture_file(exon_capture_file, extension=0):
    """Read capture segments from *exon_capture_file*, grouped by chromosome.

    Comment lines (``#``), ``track`` lines and blank lines are ignored.
    The first three whitespace-separated fields of the remaining lines are
    the chromosome, the start and the end; the segment is widened by
    *extension* on both sides (start - extension, end + extension).

    Returns a dict ``{chromosome: [(start, end), ...]}`` with segments in
    file order.
    """
    all_segments_per_chr = {}
    open_exon_capture = utils_logging.open_input_file(exon_capture_file, pipe=False)
    try:
        for line in open_exon_capture:
            if line.startswith(('#', 'track')):
                continue
            sp_line = line.strip().split()
            if not sp_line:
                # Previously a blank line raised IndexError on sp_line[0].
                continue
            chrom = sp_line[0]
            segment = (int(sp_line[1]) - extension, int(sp_line[2]) + extension)
            all_segments_per_chr.setdefault(chrom, []).append(segment)
    finally:
        # Release the handle even if a field fails to parse.
        open_exon_capture.close()
    return all_segments_per_chr
def snps_to_allele(vcf_file, bam_file):
    """Feed the records of *vcf_file* to ``process_alleles`` one reference at a time.

    Consecutive records sharing a reference name are gathered in a dict
    keyed by position. The dict is handed to
    ``process_alleles(records, sample_names, reference)`` each time the
    reference name changes and once after the final record.

    *bam_file* is part of the signature but is not used here. The VCF
    handle is closed on exit, including when processing raises.
    """
    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        reader = vcfIO.VcfReader(file_handle)
        sample_names = reader.get_sample_names()
        records_by_position = {}
        curr_reference = None
        for vcf_records in reader:
            reference = vcf_records.get_reference()
            if reference != curr_reference:
                if curr_reference:
                    # Finished a reference: process it before starting over.
                    process_alleles(records_by_position, sample_names, curr_reference)
                records_by_position = {}
            records_by_position[vcf_records.get_position()] = vcf_records
            curr_reference = reference
        if curr_reference:
            process_alleles(records_by_position, sample_names, curr_reference)
    finally:
        file_handle.close()
def parse_stat_file(stat_file):
    """Extract four read counts from fixed lines of *stat_file*.

    The leading whitespace-separated field of line 1 (total), line 2
    (duplicates), line 3 (mapped) and line 7 (properly paired) is converted
    to int. The layout presumably matches a samtools-flagstat-like report;
    confirm against whatever writes the file.

    Returns ``(total, duplicates, mapped, properly_paired)``; values whose
    line does not exist remain 0.

    Raises ValueError when one of those four fields is not an integer.
    Lines whose value was only stored in an unused variable are skipped.
    """
    total = duplicates = mapped = properly_paired = 0
    open_file = utils_logging.open_input_file(stat_file, pipe=False)
    try:
        for line_number, line in enumerate(open_file, 1):
            if line_number not in (1, 2, 3, 7):
                continue
            value = int(line.split()[0])
            if line_number == 1:
                total = value
            elif line_number == 2:
                duplicates = value
            elif line_number == 3:
                mapped = value
            else:
                properly_paired = value
    finally:
        # close() used to follow the return statement and was unreachable.
        open_file.close()
    return (total, duplicates, mapped, properly_paired)
def vcf_to_lepmap(vcf_file, output_file, sex_file, mother_name, father_name, family_name,
                  genotype_quality_threshold=20, max_prop_missing=.5):
    """Write the bi-allelic SNPs of one family from a VCF file in Lepmap format.

    Offspring are the samples of *sex_file* (read with ``read_pop_file``)
    or, without a sex file, the samples of the VCF; the two parents are
    always excluded from that list and added back as founders.

    The output has one tab-separated line per individual: family name,
    individual, father, mother, sex code ("1" when the sex file says 'M',
    "2" otherwise), "0", then one ``"a b"`` code per retained marker
    (allele index + 1, or ``"0 0"`` when the genotype is missing or its
    quality is not above *genotype_quality_threshold*).

    Markers are dropped when they have more than one alternate allele, are
    indels, show a single code across all individuals, or have more than
    ``int(nb_offspring * max_prop_missing)`` missing genotypes.

    Fixes over the previous version: parents are removed with a list filter
    (``list.pop(name)`` raised TypeError), a missing sex file no longer
    raises NameError, the reader's own sample list is not mutated, the VCF
    handle is closed, and lines are written in a deterministic order
    (offspring, father, mother). The parental-genotype filter was already
    commented out; its never-used variables and counter were removed.
    """
    nb_lines = 0
    nb_sequence = 0
    count_more_than_one_allele = 0
    count_indel = 0
    count_too_many_missing = 0
    count_non_polymorphic = 0
    nb_missing_per_sample = Counter()

    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        reader = vcfIO.VcfReader(file_handle)
        candidates = reader.get_sample_names()
        sample2sex = {}
        if sex_file:
            sample2sex, _sex2sample = read_pop_file(sex_file)
            candidates = sample2sex.keys()
        parents = (father_name, mother_name)
        offspring = [sample for sample in candidates if sample not in parents]
        max_missing = int(len(offspring) * max_prop_missing)
        all_samples = offspring + [father_name, mother_name]

        all_lines = {}
        for sample in offspring:
            # Anything not flagged 'M' is coded "2", as before.
            sex = "1" if sample2sex.get(sample) == 'M' else "2"
            all_lines[sample] = [family_name, sample, father_name, mother_name, sex, "0"]
        all_lines[father_name] = [family_name, father_name, "0", "0", "1", "0"]
        all_lines[mother_name] = [family_name, mother_name, "0", "0", "2", "0"]

        for vcf_records in reader:
            nb_lines += 1
            if nb_lines % 10000 == 0:
                # progress marker
                sys.stdout.write('.')
            if len(vcf_records.get_alt_bases()) > 1:
                count_more_than_one_allele += 1
                continue
            if vcf_records.is_indel():
                count_indel += 1
                continue
            nb_missing = 0
            all_chars = []
            for sample in all_samples:
                gt = vcf_records.get_genotype(sample)
                gq = vcf_records.get_genotype_quality(sample)
                if gt and gq > genotype_quality_threshold:
                    value1, value2 = re.split('[/|]', gt)
                    code = '%s %s' % (int(value1) + 1, int(value2) + 1)
                else:
                    nb_missing += 1
                    nb_missing_per_sample[sample] += 1
                    code = '0 0'
                all_chars.append(code)
            if len(set(all_chars)) == 1:
                count_non_polymorphic += 1
                continue
            if nb_missing <= max_missing:
                nb_sequence += 1
                for sample, code in zip(all_samples, all_chars):
                    all_lines[sample].append(code)
            else:
                count_too_many_missing += 1
    finally:
        file_handle.close()

    if count_more_than_one_allele:
        logging.warning("%s snps remove because they had more than 2 alleles" % (count_more_than_one_allele))
    if count_indel:
        logging.warning("%s indels removed" % (count_indel))
    if count_non_polymorphic:
        logging.warning(
            "%s snps removed because no polymorphism was found between populations" % (count_non_polymorphic))
    if count_too_many_missing:
        logging.warning("%s snps removed because >%s missing samples" % (count_too_many_missing, max_missing))
    logging.info("%s snps output in Lepmap format" % (nb_sequence))
    for sample in nb_missing_per_sample:
        logging.info("%s markers missing in %s" % (nb_missing_per_sample.get(sample), sample))

    with open(output_file, 'w') as open_output:
        for sample in all_samples:
            open_output.write('%s\n' % ('\t'.join(all_lines.get(sample))))
def vcf_2_genepop(vcf_file, output_file, pop_file, genotype_quality_threshold=20, max_prop_missing=.5,
                  phased=False):
    """Convert the bi-allelic SNPs of a VCF file to a Genepop-style text file.

    Samples and populations come from *pop_file* (``read_pop_file``) when it
    is given; otherwise every sample of the VCF is placed in a single
    population named ``dummy_pop``. ``vcfIO.PhasedVcfReader`` is used when
    *phased* is true, ``vcfIO.VcfReader`` otherwise.

    A genotype is encoded as two zero-padded two-digit numbers (allele
    index + 1), or ``0000`` when it is missing or its quality is not above
    *genotype_quality_threshold*. Markers are dropped when they have more
    than one alternate allele, are indels, show a single genotype among the
    called samples, or have more than
    ``int(nb_samples * max_prop_missing)`` missing genotypes.

    Output: a title line, one ``reference:position`` name per retained
    marker, then for each population a ``Pop`` line followed by one
    ``sample pop, genotypes`` line per sample.

    Fixes over the previous version: the VCF handle is closed and genotypes
    separated by ``|`` are split as well (they used to raise ValueError).
    """
    nb_sequence = 0
    count_more_than_one_allele = 0
    count_indel = 0
    count_too_many_missing = 0
    count_non_polymorphic = 0
    headers = []

    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        if phased:
            reader = vcfIO.PhasedVcfReader(file_handle)
        else:
            reader = vcfIO.VcfReader(file_handle)
        all_samples_in_file = reader.get_sample_names()
        if pop_file:
            sample2pop, pop2samples = read_pop_file(pop_file)
            all_samples = list(sample2pop.keys())
        else:
            all_samples = all_samples_in_file
            pop2samples = {'dummy_pop': all_samples_in_file}

        all_lines = dict((sample, []) for sample in all_samples)
        nb_sample = len(all_samples)
        max_missing = int(nb_sample * max_prop_missing)

        for vcf_record in reader:
            if len(vcf_record.get_alt_bases()) > 1:
                count_more_than_one_allele += 1
                continue
            if vcf_record.is_indel():
                count_indel += 1
                continue
            nb_missing = 0
            all_alleles = []
            all_genotypes = set()
            for sample in all_samples:
                gt = vcf_record.get_genotype(sample)
                gq = vcf_record.get_genotype_quality(sample)
                if gt and gq > genotype_quality_threshold:
                    # Accept both '/' and '|' as allele separator.
                    allele1, allele2 = gt.replace('|', '/').split('/')
                    alleles = "%02d%02d" % (int(allele1) + 1, int(allele2) + 1)
                    all_genotypes.add(gt)
                else:
                    nb_missing += 1
                    alleles = '0000'
                all_alleles.append(alleles)
            if len(all_genotypes) == 1:
                count_non_polymorphic += 1
                continue
            if nb_missing <= max_missing:
                nb_sequence += 1
                headers.append('%s:%s' % (vcf_record.get_reference(), vcf_record.get_position()))
                for sample, alleles in zip(all_samples, all_alleles):
                    all_lines[sample].append(alleles)
            else:
                count_too_many_missing += 1
    finally:
        file_handle.close()

    title_line = "Generated by %s from %s on %s" % (getpass.getuser(), vcf_file, time.ctime())
    if count_more_than_one_allele:
        logging.warning("%s snps remove because they had more than 2 alleles" % (count_more_than_one_allele))
    if count_indel:
        logging.warning("%s indels removed" % (count_indel))
    if count_non_polymorphic:
        logging.warning(
            "%s snps removed because no polymorphism was found between populations" % (count_non_polymorphic))
    if count_too_many_missing:
        logging.warning("%s snps removed because >%s missing samples" % (count_too_many_missing, max_missing))
    logging.info("%s samples and %s SNPs output" % (nb_sample, nb_sequence))

    with open(output_file, 'w') as open_file:
        open_file.write(title_line + '\n')
        open_file.write('%s\n' % ('\n'.join(headers)))
        for pop in pop2samples:
            open_file.write("Pop\n")
            for sample in pop2samples.get(pop):
                open_file.write("%s %s, %s\n" % (sample, pop, '\t'.join(all_lines.get(sample))))
def vcf_to_simple_genotype(vcf_file, mandatory_list_sample, list_samples, min_nb_high_qual_sample=1,
                           print_all_genotype=False):
    """Print a tab-separated genotype table for the records of a VCF file.

    *list_samples* defaults to every sample of the VCF when empty. A record
    is kept when both ``test_mandatory_samples`` (on
    *mandatory_list_sample*) and ``test_all_samples`` (on *list_samples*,
    with *min_nb_high_qual_sample*) accept it, using a genotype quality
    threshold of 20 and a minimum depth of 6; other records are counted as
    filtered out. Kept records that do not have exactly one alternate
    allele are skipped without being counted.

    Each printed row holds reference, position, reference base, alternate
    base and one column per sample. Allele indexes 0 and 1 are replaced by
    the reference and alternate base. With *print_all_genotype* the column
    is ``genotype:quality:depth`` ('.' and 0 for missing values); otherwise
    it is the valid genotype of the sample, or the placeholder provided by
    ``generate_empty_hash_with_sample``.

    A summary is written to stderr and True is returned.

    Fixes over the previous version: the VCF handle is closed and a record
    with an empty alternate-allele list is skipped instead of raising
    IndexError.
    """
    discarded_parents = 0
    nb_markers = 0
    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        reader = vcfIO.VcfReader(file_handle)
        if not list_samples:
            list_samples = reader.get_sample_names()
        print('#chr\tpos\tal1\tal2\t%s' % ('\t'.join(list_samples)))
        for vcf_records in reader:
            nb_markers += 1
            # Both tests are always evaluated, as in the original.
            keep1 = test_mandatory_samples(vcf_records, genotype_quality_threshold=20, minimum_depth=6,
                                           mandatory_list_sample=mandatory_list_sample)
            keep2 = test_all_samples(vcf_records, genotype_quality_threshold=20, minimum_depth=6,
                                     list_samples=list_samples,
                                     min_nb_high_qual_sample=min_nb_high_qual_sample)
            if not (keep1 and keep2):
                discarded_parents += 1
                continue
            ref_base = vcf_records.get_reference_base()
            alt_bases = vcf_records.get_alt_bases()
            if len(alt_bases) != 1:
                continue
            alt_base = alt_bases[0]
            out = [vcf_records.get_reference(), str(vcf_records.get_position()), ref_base, alt_base]
            if not print_all_genotype:
                genotypes_all = vcf_records.get_valid_genotype_per_sample(genotype_quality_threshold=20,
                                                                          minimum_depth=6)
                samples2genotype = generate_empty_hash_with_sample(list_samples)
                for genotype in genotypes_all:
                    genotype_str = genotype.replace('0', ref_base).replace('1', alt_base)
                    for sample in genotypes_all.get(genotype):
                        samples2genotype[sample] = genotype_str
                for sample in list_samples:
                    out.append(samples2genotype.get(sample))
            else:
                all_genotypes = vcf_records.get_all_genotype(list_samples)
                all_genotype_quals = vcf_records.get_all_genotype_quality(list_samples)
                all_sample_depth = vcf_records.get_all_sample_depth(list_samples)
                for genotype, qual, depth in zip(all_genotypes, all_genotype_quals, all_sample_depth):
                    if genotype:
                        genotype_str = genotype.replace('0', ref_base).replace('1', alt_base)
                    else:
                        genotype_str = '.'
                    # Falsy quality/depth values are reported as 0.
                    out.append('%s:%s:%s' % (genotype_str, qual or 0, depth or 0))
            print('\t'.join(out))
    finally:
        file_handle.close()
    sys.stderr.write("%s markers %s filtered out\n" % (nb_markers, discarded_parents))
    return True
def vcf_2_vcf(vcf_file, output_file, pop_file, genotype_quality_threshold=20, max_prop_missing=.5,
              phased=False):
    """Copy a VCF file, keeping only the polymorphic bi-allelic SNP records.

    The header lines of the reader are written first. A record is dropped
    when it has more than one alternate allele, is an indel, shows a single
    allele among the genotypes whose quality is above
    *genotype_quality_threshold*, or has more than
    ``int(nb_samples * max_prop_missing)`` missing or low-quality
    genotypes. Kept records are written as ``str(record)``.

    ``vcfIO.PhasedVcfReader`` is used when *phased* is true. *pop_file* is
    accepted but not used by this function.

    Fixes over the previous version: both files are closed even on error,
    genotypes separated by ``|`` are split as well, and per-sample codes
    that were computed but never used are gone.
    """
    nb_sequence = 0
    count_more_than_one_allele = 0
    count_indel = 0
    count_too_many_missing = 0
    count_non_polymorphic = 0

    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        if phased:
            reader = vcfIO.PhasedVcfReader(file_handle)
        else:
            reader = vcfIO.VcfReader(file_handle)
        all_samples = reader.get_sample_names()
        nb_sample = len(all_samples)
        max_missing = int(nb_sample * max_prop_missing)

        with open(output_file, 'w') as open_file:
            open_file.write(reader.get_header_lines())
            for vcf_record in reader:
                if len(vcf_record.get_alt_bases()) > 1:
                    count_more_than_one_allele += 1
                    continue
                if vcf_record.is_indel():
                    count_indel += 1
                    continue
                nb_missing = 0
                observed_alleles = set()
                for sample in all_samples:
                    gt = vcf_record.get_genotype(sample)
                    gq = vcf_record.get_genotype_quality(sample)
                    if gt and gq > genotype_quality_threshold:
                        allele1, allele2 = gt.replace('|', '/').split('/')
                        # int() keeps the previous behaviour of rejecting
                        # non-numeric allele indexes with ValueError.
                        observed_alleles.update((int(allele1), int(allele2)))
                    else:
                        nb_missing += 1
                if len(observed_alleles) == 1:
                    count_non_polymorphic += 1
                    continue
                if nb_missing <= max_missing:
                    nb_sequence += 1
                    open_file.write(str(vcf_record) + '\n')
                else:
                    count_too_many_missing += 1
    finally:
        file_handle.close()

    if count_more_than_one_allele:
        logging.warning("%s snps remove because they had more than 2 alleles" % (count_more_than_one_allele))
    if count_indel:
        logging.warning("%s indels removed" % (count_indel))
    if count_non_polymorphic:
        logging.warning(
            "%s snps removed because no polymorphism was found between populations" % (count_non_polymorphic))
    if count_too_many_missing:
        logging.warning("%s snps removed because >%s missing samples" % (count_too_many_missing, max_missing))
    logging.info("%s samples and %s SNPs output" % (nb_sample, nb_sequence))
def vcf_to_phylip_or_nexus(vcf_file, output_file, pop_file=None, genotype_quality_threshold=20,
                           max_prop_missing=.5, phased=False, type_output=OUTPUT_TYPE_PHYLIP):
    """Convert the bi-allelic SNPs of a VCF file to a PHYLIP or NEXUS alignment.

    Samples are those of *pop_file* (``read_pop_file``) when given, else
    those of the VCF. Every retained marker contributes one character per
    sample: the code returned by ``atgc2iupac`` for the sample's distinct
    bases, or ``?`` when the genotype is missing or its quality is not
    above *genotype_quality_threshold*.

    Markers are dropped when they have more than one alternate allele, are
    indels, show a single code among the called samples, or have more than
    ``int(nb_samples * max_prop_missing)`` missing genotypes.

    With ``OUTPUT_TYPE_PHYLIP`` the file holds a ``nb_sample nb_sequence``
    line followed by one ``sample sequence`` line per sample; with
    ``OUTPUT_TYPE_NEXUS`` it holds ``format_nexus(all_lines)``. *phased*
    has no effect: the same reader is used either way.

    Fixes over the previous version: the summary log reports *type_output*
    rather than always the PHYLIP constant, the VCF handle is closed, and
    an unknown *type_output* is logged instead of silently producing an
    empty file.
    """
    nb_lines = 0
    nb_sequence = 0
    count_more_than_one_allele = 0
    count_indel = 0
    count_too_many_missing = 0
    count_non_polymorphic = 0
    nb_missing_per_sample = Counter()

    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        # Phasing is not useful for these formats: one reader for both cases.
        reader = vcfIO.VcfReader(file_handle)
        all_samples_in_file = reader.get_sample_names()
        if pop_file:
            sample2pop, _pop2sample = read_pop_file(pop_file)
            all_samples_in_file = list(sample2pop.keys())
        nb_sample = len(all_samples_in_file)
        max_missing = int(nb_sample * max_prop_missing)
        all_lines = dict((sample, []) for sample in all_samples_in_file)

        for vcf_records in reader:
            nb_lines += 1
            if nb_lines % 10000 == 0:
                # progress marker
                sys.stdout.write('.')
            alt_bases = vcf_records.get_alt_bases()
            # Allele index -> base: reference first, then the alternates.
            array = [vcf_records.get_reference_base()]
            array.extend(alt_bases)
            if len(alt_bases) > 1:
                count_more_than_one_allele += 1
                continue
            if vcf_records.is_indel():
                count_indel += 1
                continue
            set_missing = set()
            all_chars = []
            all_codes = set()
            for sample in all_samples_in_file:
                gt = vcf_records.get_genotype(sample)
                gq = vcf_records.get_genotype_quality(sample)
                if gt and gq > genotype_quality_threshold:
                    value1, value2 = re.split('[/|]', gt)
                    code = atgc2iupac(list(set([array[int(value1)], array[int(value2)]])))
                    all_codes.add(code)
                else:
                    set_missing.add(sample)
                    code = '?'
                all_chars.append(code)
            if len(all_codes) == 1:
                count_non_polymorphic += 1
                continue
            if len(set_missing) <= max_missing:
                nb_sequence += 1
                # Missing counts only cover markers that are output.
                for sample in set_missing:
                    nb_missing_per_sample[sample] += 1
                for sample, code in zip(all_samples_in_file, all_chars):
                    all_lines[sample].append(code)
            else:
                count_too_many_missing += 1
    finally:
        file_handle.close()

    if count_more_than_one_allele:
        logging.warning("%s snps remove because they had more than 2 alleles" % (count_more_than_one_allele))
    if count_indel:
        logging.warning("%s indels removed" % (count_indel))
    if count_non_polymorphic:
        logging.warning(
            "%s snps removed because no polymorphism was found between populations" % (count_non_polymorphic))
    if count_too_many_missing:
        logging.warning("%s snps removed because >%s missing samples" % (count_too_many_missing, max_missing))
    logging.info("%s snps output in %s format" % (nb_sequence, type_output))
    for sample in nb_missing_per_sample:
        logging.info("%s markers missing in %s" % (nb_missing_per_sample.get(sample), sample))

    with open(output_file, 'w') as open_output:
        if type_output == OUTPUT_TYPE_PHYLIP:
            open_output.write('%s %s\n' % (nb_sample, nb_sequence))
            for sample in all_samples_in_file:
                open_output.write("%s %s\n" % (sample, ''.join(all_lines.get(sample))))
        elif type_output == OUTPUT_TYPE_NEXUS:
            open_output.write(format_nexus(all_lines))
        else:
            logging.warning("Unknown output type %s: %s left empty" % (type_output, output_file))
def vcf_to_lepmap(vcf_file, output_file, sex_file, mother_name, father_name, family_name,
                  genotype_quality_threshold=20, max_prop_missing=.5):
    """Export the bi-allelic SNPs of one family from a VCF file in Lepmap format.

    The offspring list is taken from *sex_file* (via ``read_pop_file``)
    when it is given, from the VCF sample names otherwise. Both parents are
    filtered out of that list and re-added as founders (father sex "1",
    mother sex "2", no parents).

    One tab-separated line is written per individual: family name,
    individual, father, mother, sex code ("1" if the sex file gives 'M',
    else "2"), "0", followed by an ``"a b"`` code for every retained marker
    (allele index + 1; ``"0 0"`` for a missing genotype or a genotype
    quality not above *genotype_quality_threshold*).

    A marker is skipped when it has more than one alternate allele, is an
    indel, gives the same code for every individual, or has more than
    ``int(nb_offspring * max_prop_missing)`` missing genotypes.

    Fixes over the previous version: ``list.pop(name)`` (a TypeError) is
    replaced by a filter, running without a sex file no longer raises
    NameError, the sample list owned by the reader is left untouched, the
    VCF handle is closed, and individuals are written in a fixed order
    (offspring, father, mother). Variables that only served the
    commented-out parental-genotype check were removed with it.
    """
    nb_lines = 0
    nb_sequence = 0
    count_more_than_one_allele = 0
    count_indel = 0
    count_too_many_missing = 0
    count_non_polymorphic = 0
    nb_missing_per_sample = Counter()

    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        reader = vcfIO.VcfReader(file_handle)
        candidates = reader.get_sample_names()
        sample2sex = {}
        if sex_file:
            sample2sex, _sex2sample = read_pop_file(sex_file)
            candidates = sample2sex.keys()
        offspring = [sample for sample in candidates
                     if sample != mother_name and sample != father_name]
        max_missing = int(len(offspring) * max_prop_missing)
        ordered_samples = offspring + [father_name, mother_name]

        all_lines = {}
        for sample in offspring:
            # Samples not flagged 'M' keep the original default of "2".
            if sample2sex.get(sample) == 'M':
                sex = "1"
            else:
                sex = "2"
            all_lines[sample] = [family_name, sample, father_name, mother_name, sex, "0"]
        all_lines[father_name] = [family_name, father_name, "0", "0", "1", "0"]
        all_lines[mother_name] = [family_name, mother_name, "0", "0", "2", "0"]

        for vcf_records in reader:
            nb_lines += 1
            if nb_lines % 10000 == 0:
                # progress marker
                sys.stdout.write('.')
            if len(vcf_records.get_alt_bases()) > 1:
                count_more_than_one_allele += 1
                continue
            if vcf_records.is_indel():
                count_indel += 1
                continue
            nb_missing = 0
            all_chars = []
            for sample in ordered_samples:
                gt = vcf_records.get_genotype(sample)
                gq = vcf_records.get_genotype_quality(sample)
                if gt and gq > genotype_quality_threshold:
                    value1, value2 = re.split('[/|]', gt)
                    code = '%s %s' % (int(value1) + 1, int(value2) + 1)
                else:
                    nb_missing += 1
                    nb_missing_per_sample[sample] += 1
                    code = '0 0'
                all_chars.append(code)
            if len(set(all_chars)) == 1:
                count_non_polymorphic += 1
                continue
            if nb_missing <= max_missing:
                nb_sequence += 1
                for sample, code in zip(ordered_samples, all_chars):
                    all_lines[sample].append(code)
            else:
                count_too_many_missing += 1
    finally:
        file_handle.close()

    if count_more_than_one_allele:
        logging.warning("%s snps remove because they had more than 2 alleles" % (count_more_than_one_allele))
    if count_indel:
        logging.warning("%s indels removed" % (count_indel))
    if count_non_polymorphic:
        logging.warning(
            "%s snps removed because no polymorphism was found between populations" % (count_non_polymorphic))
    if count_too_many_missing:
        logging.warning("%s snps removed because >%s missing samples" % (count_too_many_missing, max_missing))
    logging.info("%s snps output in Lepmap format" % (nb_sequence))
    for sample in nb_missing_per_sample:
        logging.info("%s markers missing in %s" % (nb_missing_per_sample.get(sample), sample))

    with open(output_file, 'w') as open_output:
        for sample in ordered_samples:
            open_output.write('%s\n' % ('\t'.join(all_lines.get(sample))))
def vcf_to_structure(vcf_file, output_file, pop_file, genotype_quality_threshold=20, max_prop_missing=.5,
                     phased=False):
    """Convert the bi-allelic SNPs of a VCF file to a Structure-style table.

    Samples and populations come from *pop_file* (``read_pop_file``) when it
    is given; otherwise every VCF sample belongs to a single population
    ``dummy_pop``. ``vcfIO.PhasedVcfReader`` is used when *phased* is true.

    Returns -2 (after logging a critical message) when a sample of the
    population file is absent from the VCF; otherwise returns None.

    Markers are dropped when they have more than one alternate allele, are
    indels, show a single genotype among the called samples, or have more
    than ``int(nb_samples * max_prop_missing)`` missing genotypes.

    Output: a header line with the ``reference:position`` marker names,
    then two lines per sample (first and second allele), each made of the
    sample name, the 1-based population index and one allele per marker
    (``-9`` when the genotype is missing or its quality is not above
    *genotype_quality_threshold*).

    Fixes over the previous version: the VCF handle is closed (including on
    the early return), genotypes separated by ``|`` are split as well, and
    the never-used ``pop2sample`` fallback that wrapped the sample list in
    an extra list is gone.
    """
    nb_sequence = 0
    count_more_than_one_allele = 0
    count_indel = 0
    count_too_many_missing = 0
    count_non_polymorphic = 0
    headers = []

    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        if phased:
            reader = vcfIO.PhasedVcfReader(file_handle)
        else:
            reader = vcfIO.VcfReader(file_handle)
        all_samples_in_file = reader.get_sample_names()
        if pop_file:
            sample2pop, pop2sample = read_pop_file(pop_file)
            pops = list(pop2sample.keys())
            all_samples = list(sample2pop.keys())
        else:
            pop = 'dummy_pop'
            pops = [pop]
            all_samples = all_samples_in_file
            sample2pop = dict((sample, pop) for sample in all_samples)

        sample_errors = set(all_samples).difference(set(all_samples_in_file))
        if len(sample_errors) > 0:
            logging.critical('%s samples (%s) from the population file not found in the vcf file' % (
                len(sample_errors), ', '.join(sample_errors)))
            return -2

        all_lines = {}
        for sample in all_samples:
            all_lines[sample + '1'] = []
            all_lines[sample + '2'] = []
        nb_sample = len(all_samples)
        max_missing = int(nb_sample * max_prop_missing)

        for vcf_records in reader:
            if len(vcf_records.get_alt_bases()) > 1:
                count_more_than_one_allele += 1
                continue
            if vcf_records.is_indel():
                count_indel += 1
                continue
            nb_missing = 0
            all_alleles1 = []
            all_alleles2 = []
            all_genotypes = set()
            for sample in all_samples:
                gt = vcf_records.get_genotype(sample)
                gq = vcf_records.get_genotype_quality(sample)
                if gt and gq > genotype_quality_threshold:
                    # Accept both '/' and '|' as allele separator.
                    allele1, allele2 = gt.replace('|', '/').split('/')
                    all_genotypes.add(gt)
                else:
                    nb_missing += 1
                    allele1, allele2 = ('-9', '-9')
                all_alleles1.append(allele1)
                all_alleles2.append(allele2)
            if len(all_genotypes) == 1:
                count_non_polymorphic += 1
                continue
            if nb_missing <= max_missing:
                nb_sequence += 1
                headers.append('%s:%s' % (vcf_records.get_reference(), vcf_records.get_position()))
                for i, sample in enumerate(all_samples):
                    all_lines[sample + '1'].append(all_alleles1[i])
                    all_lines[sample + '2'].append(all_alleles2[i])
            else:
                count_too_many_missing += 1
    finally:
        file_handle.close()

    if count_more_than_one_allele:
        logging.warning("%s snps remove because they had more than 2 alleles" % (count_more_than_one_allele))
    if count_indel:
        logging.warning("%s indels removed" % (count_indel))
    if count_non_polymorphic:
        logging.warning(
            "%s snps removed because no polymorphism was found between populations" % (count_non_polymorphic))
    if count_too_many_missing:
        logging.warning("%s snps removed because >%s missing samples" % (count_too_many_missing, max_missing))
    logging.info("%s samples and %s SNPs output" % (nb_sample, nb_sequence))

    with open(output_file, 'w') as open_file:
        open_file.write('\t%s\n' % ('\t'.join(headers)))
        for sample in all_samples:
            pop_index = pops.index(sample2pop.get(sample)) + 1
            for suffix in ('1', '2'):
                open_file.write("%s\t%s\t%s\n" % (
                    sample, pop_index, '\t'.join(all_lines.get(sample + suffix))))
def vcf_to_simple_genotype(vcf_file, mandatory_list_sample, list_samples, min_nb_high_qual_sample=1,
                           print_all_genotype=False):
    """Print the genotypes of a VCF file as a simple tab-separated table.

    When *list_samples* is empty, all samples of the VCF are used. Records
    must pass ``test_mandatory_samples`` (for *mandatory_list_sample*) and
    ``test_all_samples`` (for *list_samples*, with
    *min_nb_high_qual_sample*), both called with a genotype quality
    threshold of 20 and a minimum depth of 6; failing records are counted
    as filtered out. Passing records without exactly one alternate allele
    are skipped and not counted.

    A header row is printed first, then one row per record: reference,
    position, reference base, alternate base and one column per sample, in
    which allele indexes 0 and 1 are replaced by the two bases. With
    *print_all_genotype* each column reads ``genotype:quality:depth`` ('.'
    and 0 standing in for missing values); without it, each column is the
    sample's valid genotype or whatever placeholder
    ``generate_empty_hash_with_sample`` supplied.

    The marker and filtered counts are written to stderr; returns True.

    Fixes over the previous version: the VCF handle is closed and an empty
    alternate-allele list no longer raises IndexError.
    """
    nb_markers = 0
    discarded_parents = 0
    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        reader = vcfIO.VcfReader(file_handle)
        if not list_samples:
            list_samples = reader.get_sample_names()
        print('#chr\tpos\tal1\tal2\t%s' % ('\t'.join(list_samples)))
        for vcf_records in reader:
            nb_markers += 1
            # Evaluate both filters unconditionally, as the original did.
            keep1 = test_mandatory_samples(vcf_records, genotype_quality_threshold=20, minimum_depth=6,
                                           mandatory_list_sample=mandatory_list_sample)
            keep2 = test_all_samples(vcf_records, genotype_quality_threshold=20, minimum_depth=6,
                                     list_samples=list_samples,
                                     min_nb_high_qual_sample=min_nb_high_qual_sample)
            if not (keep1 and keep2):
                discarded_parents += 1
                continue
            ref_base = vcf_records.get_reference_base()
            alt_bases = vcf_records.get_alt_bases()
            if len(alt_bases) != 1:
                continue
            alt_base = alt_bases[0]
            out = [vcf_records.get_reference(), str(vcf_records.get_position()), ref_base, alt_base]
            if print_all_genotype:
                all_genotypes = vcf_records.get_all_genotype(list_samples)
                all_genotype_quals = vcf_records.get_all_genotype_quality(list_samples)
                all_sample_depth = vcf_records.get_all_sample_depth(list_samples)
                for genotype, qual, depth in zip(all_genotypes, all_genotype_quals, all_sample_depth):
                    if genotype:
                        genotype_str = genotype.replace('0', ref_base).replace('1', alt_base)
                    else:
                        genotype_str = '.'
                    # Falsy quality/depth values are reported as 0.
                    out.append('%s:%s:%s' % (genotype_str, qual or 0, depth or 0))
            else:
                genotypes_all = vcf_records.get_valid_genotype_per_sample(genotype_quality_threshold=20,
                                                                          minimum_depth=6)
                samples2genotype = generate_empty_hash_with_sample(list_samples)
                for genotype in genotypes_all:
                    genotype_str = genotype.replace('0', ref_base).replace('1', alt_base)
                    for sample in genotypes_all.get(genotype):
                        samples2genotype[sample] = genotype_str
                for sample in list_samples:
                    out.append(samples2genotype.get(sample))
            print('\t'.join(out))
    finally:
        file_handle.close()
    sys.stderr.write("%s markers %s filtered out\n" % (nb_markers, discarded_parents))
    return True