示例#1
0
 def __init__(self, genome_file, keep_in_memory=True, keep_until_done=False,
              prefix=''):
     """Open ``genome_file`` and wrap it in a ``FastaReader``.

     The three keyword options are stored on the instance unchanged and
     ``all_chr`` starts out as an empty dict.
     """
     genome_handle = open_input_file(genome_file)
     self.open_genome_file = genome_handle
     self.reader = FastaReader(genome_handle)
     # Plain configuration values, kept exactly as supplied by the caller.
     self.prefix = prefix
     self.keep_until_done = keep_until_done
     self.keep_in_memory = keep_in_memory
     self.all_chr = dict()
示例#2
0
def parse_stat_file(stat_file):
    """Read the headline read counts from a statistics file.

    The leading integer of lines 1, 2, 3 and 7 is taken as, respectively,
    the total, duplicate, mapped and properly-paired counts (presumably a
    ``samtools flagstat`` style report -- confirm against the producer of
    the file).  Every other line is ignored.

    :param stat_file: path handed to ``utils_logging.open_input_file``.
    :return: tuple ``(total, duplicates, mapped, properly_paired)``; a
        count stays 0 when the file is too short to contain its line.
    :raises ValueError: if one of the four lines of interest does not
        start with an integer.
    """
    # 1-based line number -> position of the value in the returned tuple.
    wanted_lines = {1: 0, 2: 1, 3: 2, 7: 3}
    last_wanted_line = max(wanted_lines)
    counts = [0, 0, 0, 0]

    open_file = utils_logging.open_input_file(stat_file, pipe=False)
    try:
        for line_number, line in enumerate(open_file, 1):
            slot = wanted_lines.get(line_number)
            if slot is not None:
                counts[slot] = int(line.split()[0])
            if line_number >= last_wanted_line:
                # Nothing after this line contributes to the result.
                break
    finally:
        # The original closed the file after ``return``, i.e. never.
        open_file.close()
    return tuple(counts)
示例#3
0
def bin_coordinates_through_genome(input_file, output_file, genome_file, bin_size):
    """Bin per-chromosome coordinates and lay the bins out along the genome.

    ``input_file`` is read as whitespace-separated lines whose first column
    is a chromosome name and whose second column is an integer coordinate.
    Chromosomes are processed in sorted name order.  For each one the
    sequence is fetched from ``genome_file`` through ``GenomeLoader`` and the
    coordinates are binned with ``bin_value_from_array`` using the sequence
    length.

    One tab-separated line is written per bin:
    stripped FASTA header, bin start on the chromosome, bin start shifted by
    the cumulative size of the bins of all previous chromosomes, bin value.

    :param input_file: path of the coordinate file.
    :param output_file: path of the file to write.
    :param genome_file: path of the genome passed to ``GenomeLoader``.
    :param bin_size: width of a bin, in bases.
    """
    open_file = utils_logging.open_input_file(input_file)
    try:
        open_output = utils_logging.open_output_file(output_file)
        try:
            genome_loader = GenomeLoader(genome_file)
            all_coordinates_per_chr = {}
            for line in open_file:
                sp_line = line.split()
                all_coordinates_per_chr.setdefault(sp_line[0], []).append(int(sp_line[1]))

            genome_offset = 0
            for chr_name in sorted(all_coordinates_per_chr):
                header, sequence = genome_loader.get_chr(chr_name)
                # Look the coordinates up under the name they were stored
                # with; the FASTA header is only used as the output label and
                # is not guaranteed to equal the name found in the input.
                all_coordinates = all_coordinates_per_chr[chr_name]
                all_bins = bin_value_from_array(all_coordinates, bin_size, len(sequence))
                label = header.strip()
                for bin_index, value in enumerate(all_bins):
                    bin_start = bin_index * bin_size
                    open_output.write('%s\t%s\t%s\t%s\n' % (label, bin_start, bin_start + genome_offset, value))
                genome_offset += len(all_bins) * bin_size
        finally:
            open_output.close()
    finally:
        # The input handle was never closed in the original.
        open_file.close()
示例#4
0
def sex_specific_markers_female(vcf_file, mother, father, offsprings_file):
    """Print VCF records covered in the father and sons but not in the mother and daughters.

    Sample sexes come from ``load_sex_info(offsprings_file)``; ``father`` and
    ``mother`` are removed from the "M" and "F" sample lists so that only the
    offspring remain in them.  A record is reported when:

    * the mother has no depth or a depth below 4 and the father a depth above 10,
    * no female offspring has a depth above 2, and
    * at most 4 male offspring fail to reach a depth above 5.

    NOTE(review): the final print statement is cut off in this copy of the
    source; the remainder of the line could not be documented.
    """
    sample_to_sex,sex_to_sample, ordered_sample = load_sex_info(offsprings_file)
    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    reader  = vcfIO.VcfReader(file_handle)
    # Keep only the offspring in the per-sex sample lists.
    sex_to_sample.get("M").remove(father)
    sex_to_sample.get("F").remove(mother)

    for vcf_record in reader:
        gt_mother = vcf_record.get_genotype(mother)
        sp_mother = vcf_record.get_sample_depth(mother)
        gt_father = vcf_record.get_genotype(father)
        sp_father = vcf_record.get_sample_depth(father)
        # "genotype:depth" strings of the offspring, collected for the report line.
        sample_female=[]
        sample_male=[]

        # Candidate site: (almost) uncovered in the mother, well covered in the father.
        if (sp_mother is None or int(sp_mother)<4) and sp_father is not None and int(sp_father)>10:
            valid=True
            for sample in sex_to_sample.get('F'):
                gt_of_fem = vcf_record.get_genotype(sample)
                sp_of_fem = vcf_record.get_sample_depth(sample)
                sample_female.append("%s:%s"%(gt_of_fem,sp_of_fem))
                # A single daughter with depth above 2 rejects the site.
                if sp_of_fem is not None and int(sp_of_fem)>2:
                    valid=False
                    break
            nb_male_offspring=0
            for sample in sex_to_sample.get('M'):
                gt_of_mal = vcf_record.get_genotype(sample)
                sp_of_mal = vcf_record.get_sample_depth(sample)
                sample_male.append("%s:%s"%(gt_of_mal,sp_of_mal))
                if sp_of_mal is not None and int(sp_of_mal)>5:
                    nb_male_offspring+=1
            # Tolerate up to 4 sons without sufficient coverage.
            if nb_male_offspring < len(sex_to_sample.get('M'))-4:
                valid=False
            if valid :
                print vcf_record.get_reference(), vcf_record.get_position(), "%s:%s"%(gt_father,sp_father), '  '.join(sample_male), "\t\t", "%s:%s"%(gt_mother,sp_mother), '  '.j
def get_normalize_coverage(coverage_file, nb_sample_required=0):
    """Load per-sample marker coverage and scale it to counts per million.

    The file is tab-separated with the leading columns
    ``contig  coverage  coverage_mrk_dup  nb_sample``.  From column 4 onwards
    every second column belongs to one sample: on the header line (starting
    with ``#``) it holds the sample name, on data lines an integer coverage.
    Data lines whose ``nb_sample`` column is below ``nb_sample_required`` are
    skipped.

    :param coverage_file: path handed to ``utils_logging.open_input_file``.
    :param nb_sample_required: minimum value of the ``nb_sample`` column for
        a marker to be kept.
    :return: ``(all_markers, all_samples, all_samples_to_norm_coverage)``
        where the last item maps each sample to a list, parallel to
        ``all_markers``, of ``coverage / sample_total * 1000000``.  A sample
        whose total coverage is 0 gets 0.0 for every marker.
    """
    all_samples_to_coverage = {}
    all_samples = []
    all_markers = []

    file_handle = utils_logging.open_input_file(coverage_file, pipe=False)
    try:
        for line in file_handle:
            sp_line = line.strip().split("\t")
            if line.startswith("#"):
                for sample in sp_line[4::2]:
                    all_samples.append(sample)
                    all_samples_to_coverage[sample] = []
            elif int(sp_line[3]) >= nb_sample_required:
                all_markers.append(sp_line[0])
                for sample_index, data in enumerate(sp_line[4::2]):
                    all_samples_to_coverage[all_samples[sample_index]].append(int(data))
    finally:
        # The handle was never closed in the original.
        file_handle.close()

    all_samples_to_norm_coverage = {}
    for sample in all_samples:
        coverage_info = all_samples_to_coverage.get(sample)
        total = sum(coverage_info)
        if total:
            normalized_coverage = [float(coverage) / total * 1000000 for coverage in coverage_info]
        else:
            # No reads at all for this sample: avoid dividing by zero.
            normalized_coverage = [0.0] * len(coverage_info)
        all_samples_to_norm_coverage[sample] = normalized_coverage

    return all_markers, all_samples, all_samples_to_norm_coverage
示例#6
0
def allele_presence_abscence(vcf_file,parent_name1,parent_name2):
    """Print allele presence/absence patterns of the offspring at informative sites.

    A site is used when both parents have exactly one valid genotype
    (genotype quality threshold 10, minimum depth 0), the two genotypes
    differ, together they carry exactly two alleles and at least one parent
    is heterozygous.  For such a site one character per non-parent sample is
    built for each allele: ``1`` if the sample carries it, ``0`` if not,
    ``-`` if the sample has no valid genotype.

    One tab-separated line is printed per site: reference, position, ``10``
    (parent 1 heterozygous) or ``01`` (only parent 2 heterozygous), and the
    pattern of the first or second allele respectively.

    NOTE(review): which of the two alleles is "first" depends on set
    ordering, exactly as in the original -- confirm this is intended.

    :param vcf_file: path handed to ``utils_logging.open_input_file``.
    :param parent_name1: sample name of the first parent.
    :param parent_name2: sample name of the second parent.
    """
    qual_threshold = 10
    cov_threshold = 0

    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        reader = vcfIO.VcfReader(file_handle)
        children_samples = [sample for sample in reader.get_sample_names()
                            if sample != parent_name1 and sample != parent_name2]

        # Records are handled one at a time; the original also stored every
        # record in a dict that was never read, growing without bound.
        for vcf_records in reader:
            # First check that the parents are callable.
            genotype_p1 = vcf_records.get_valid_genotype_per_sample(
                genotype_quality_threshold=qual_threshold, minimum_depth=cov_threshold,
                sample_list=[parent_name1])
            genotype_p2 = vcf_records.get_valid_genotype_per_sample(
                genotype_quality_threshold=qual_threshold, minimum_depth=cov_threshold,
                sample_list=[parent_name2])
            if len(genotype_p1) != 1 or len(genotype_p2) != 1:
                continue

            g1 = list(genotype_p1.keys())[0]
            g2 = list(genotype_p2.keys())[0]
            p1_is_het = len(set(g1.split('/'))) > 1
            p2_is_het = len(set(g2.split('/'))) > 1
            all_alleles = set()
            all_alleles.update(set(g1.split('/')))
            all_alleles.update(set(g2.split('/')))
            if g1 == g2 or len(all_alleles) != 2 or not (p1_is_het or p2_is_het):
                continue

            # Keep each allele as a whole string: wrapping it in set() split a
            # multi-character allele (e.g. "10") into its characters.
            allele1 = all_alleles.pop()
            allele2 = all_alleles.pop()
            allele1_seg_pattern = []
            allele2_seg_pattern = []
            for sample in children_samples:
                remaining_genotype = vcf_records.get_valid_genotype_per_sample(
                    genotype_quality_threshold=qual_threshold,
                    minimum_depth=cov_threshold, sample_list=[sample])
                if len(remaining_genotype) > 0:
                    haplotypes = set(list(remaining_genotype.keys())[0].split('/'))
                    allele1_seg_pattern.append('1' if allele1 in haplotypes else '0')
                    allele2_seg_pattern.append('1' if allele2 in haplotypes else '0')
                else:
                    allele1_seg_pattern.append('-')
                    allele2_seg_pattern.append('-')

            curr_reference = vcf_records.get_reference()
            if p1_is_het:
                print("%s\t%s\t10\t%s" % (curr_reference, vcf_records.get_position(), ''.join(allele1_seg_pattern)))
            elif p2_is_het:
                print("%s\t%s\t01\t%s" % (curr_reference, vcf_records.get_position(), ''.join(allele2_seg_pattern)))
    finally:
        # The handle was never closed in the original.
        file_handle.close()
示例#7
0
 def __init__(self, genome_file, keep_in_memory=True,
              keep_until_done=False, prefix=''):
     """Set up a FASTA reader over ``genome_file``.

     The file is opened with ``open_input_file`` and handed to
     ``FastaReader``; the remaining arguments are stored as given and
     ``all_chr`` is initialised to an empty dict.
     """
     self.open_genome_file = genome_handle = open_input_file(genome_file)
     self.reader = FastaReader(genome_handle)
     self.all_chr = {}
     self.keep_in_memory = keep_in_memory
     self.keep_until_done = keep_until_done
     self.prefix = prefix
def read_exon_capture_file(exon_capture_file, extension=0):
    """Load capture intervals grouped by chromosome.

    Lines starting with ``#`` or ``track`` are skipped.  Every other line is
    split on whitespace into chromosome, start and end; the interval is
    widened by ``extension`` on both sides.

    :return: dict mapping chromosome name to a list of ``(start, end)``
        tuples in file order.
    """
    segments_by_chromosome = {}
    capture_handle = utils_logging.open_input_file(exon_capture_file, pipe=False)
    for record in capture_handle:
        if record.startswith(('#', 'track')):
            continue
        fields = record.strip().split()
        name = fields[0]
        padded_interval = (int(fields[1]) - extension, int(fields[2]) + extension)
        segments_by_chromosome.setdefault(name, []).append(padded_interval)
    capture_handle.close()
    return segments_by_chromosome
示例#9
0
def snps_to_allele(vcf_file, bam_file):
    """Feed the VCF records to ``process_alleles`` one reference at a time.

    Consecutive records sharing a reference are gathered in a dict keyed by
    position; when the reference changes (and once more at the end of the
    file) the dict is passed to ``process_alleles`` together with the sample
    names and the reference name.  ``bam_file`` is accepted but not used
    here.
    """
    vcf_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    vcf_reader = vcfIO.VcfReader(vcf_handle)
    samples = vcf_reader.get_sample_names()

    records_by_position = {}
    active_reference = None
    for record in vcf_reader:
        reference = record.get_reference()
        if active_reference != reference:
            # Reference switched: flush what was gathered for the previous one.
            if active_reference:
                process_alleles(records_by_position, samples, active_reference)
            records_by_position = {}
        records_by_position[record.get_position()] = record
        active_reference = reference

    # Flush the last reference of the file.
    if active_reference:
        process_alleles(records_by_position, samples, active_reference)
示例#10
0
def bin_coordinates(input_file, output_file, bin_size):
    """Bin the coordinates of each chromosome and write one line per bin.

    ``input_file`` is read as whitespace-separated lines whose first column
    is a chromosome name and whose second column is an integer coordinate.
    The coordinates of each chromosome are binned with
    ``bin_value_from_array`` and written as tab-separated lines:
    chromosome, bin start (bin index times ``bin_size``), bin value.

    Chromosomes are written in sorted name order so the output is
    reproducible; the original relied on arbitrary dict ordering.

    :param input_file: path of the coordinate file.
    :param output_file: path of the file to write.
    :param bin_size: width of a bin.
    """
    open_file = utils_logging.open_input_file(input_file)
    try:
        open_output = utils_logging.open_output_file(output_file)
        try:
            all_coordinates_per_chr = {}
            for line in open_file:
                sp_line = line.split()
                all_coordinates_per_chr.setdefault(sp_line[0], []).append(int(sp_line[1]))

            for chr_name in sorted(all_coordinates_per_chr):
                all_bins = bin_value_from_array(all_coordinates_per_chr[chr_name], bin_size)
                for bin_index, value in enumerate(all_bins):
                    open_output.write('%s\t%s\t%s\n' % (chr_name, bin_index * bin_size, value))
        finally:
            open_output.close()
    finally:
        # The input handle was never closed in the original.
        open_file.close()
示例#11
0
def read_exon_capture_file(exon_capture_file, extension=0):
    """Parse a capture file into per-chromosome interval lists.

    Comment lines (``#``) and ``track`` lines are ignored.  The first three
    whitespace-separated columns of the other lines are read as chromosome,
    start and end, and ``extension`` is subtracted from the start and added
    to the end.

    :return: dict of chromosome name -> list of ``(start, end)`` tuples.
    """
    intervals = {}
    handle = utils_logging.open_input_file(exon_capture_file, pipe=False)
    for raw_line in handle:
        is_comment = raw_line.startswith('#')
        if is_comment or raw_line.startswith('track'):
            continue
        columns = raw_line.strip().split()
        chrom = columns[0]
        lower = int(columns[1]) - extension
        upper = int(columns[2]) + extension
        if intervals.get(chrom) is None:
            intervals[chrom] = []
        intervals[chrom].append((lower, upper))
    handle.close()
    return intervals
示例#12
0
def snps_to_allele(vcf_file, bam_file):
    """Group VCF records by reference and hand each group to ``process_alleles``.

    Records are collected in a position-keyed dict while the reference stays
    the same.  The dict is processed whenever a new reference starts and
    after the last record.  ``bam_file`` is part of the signature but is not
    used in this function.
    """
    handle = utils_logging.open_input_file(vcf_file, pipe=False)
    reader = vcfIO.VcfReader(handle)
    names = reader.get_sample_names()

    pending = {}
    previous_reference = None
    for vcf_record in reader:
        this_reference = vcf_record.get_reference()
        reference_changed = previous_reference != this_reference
        if reference_changed and previous_reference:
            process_alleles(pending, names, previous_reference)
        if reference_changed:
            # Start a fresh group for the new reference.
            pending = {}
        pending[vcf_record.get_position()] = vcf_record
        previous_reference = this_reference

    if previous_reference:
        process_alleles(pending, names, previous_reference)
示例#13
0
def parse_stat_file(stat_file):
    """Read the headline read counts from a statistics file.

    The leading integer of lines 1, 2, 3 and 7 is taken as, respectively,
    the total, duplicate, mapped and properly-paired counts (presumably a
    ``samtools flagstat`` style report -- confirm against the producer of
    the file).  Every other line is ignored.

    :param stat_file: path handed to ``utils_logging.open_input_file``.
    :return: tuple ``(total, duplicates, mapped, properly_paired)``; a
        count stays 0 when the file is too short to contain its line.
    :raises ValueError: if one of the four lines of interest does not
        start with an integer.
    """
    # 1-based line number -> position of the value in the returned tuple.
    field_of_line = {1: 0, 2: 1, 3: 2, 7: 3}
    final_line = max(field_of_line)
    values = [0, 0, 0, 0]

    open_file = utils_logging.open_input_file(stat_file, pipe=False)
    try:
        for line_number, line in enumerate(open_file, 1):
            position = field_of_line.get(line_number)
            if position is not None:
                values[position] = int(line.split()[0])
            if line_number >= final_line:
                # Later lines carry nothing that is returned.
                break
    finally:
        # The original closed the file after ``return``, i.e. never.
        open_file.close()
    return tuple(values)
示例#14
0
def vcf_to_lepmap(vcf_file, output_file, sex_file, mother_name, father_name, family_name,
                  genotype_quality_threshold=20, max_prop_missing=.5):
    """Convert a VCF file into a tab-separated pedigree/genotype table.

    One row is written per sample: family, sample, father, mother, sex code
    (``1`` when the sex file says ``M``, otherwise ``2``), ``0``, followed by
    one genotype code per retained marker.  A genotype is coded as the two
    allele indices plus one (e.g. ``1 2``), or ``0 0`` when the genotype is
    absent or its quality is not above ``genotype_quality_threshold``.  The
    parents are written as founders (father and mother set to ``0``).

    Markers are dropped when they have more than one alternate allele, are
    indels, show a single code across all samples, or have more than
    ``max_prop_missing`` of the offspring count missing.

    :param vcf_file: path handed to ``utils_logging.open_input_file``.
    :param output_file: path of the table to write.
    :param sex_file: optional file read with ``read_pop_file``; when given
        its samples replace the VCF sample list.  Without it every offspring
        receives sex code ``2``.
    :param mother_name: sample name of the mother.
    :param father_name: sample name of the father.
    :param family_name: value of the first column of every row.
    :param genotype_quality_threshold: a genotype is used only if its
        quality is strictly above this value.
    :param max_prop_missing: tolerated proportion of missing genotypes.
    """
    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        reader = vcfIO.VcfReader(file_handle)
        # Work on a copy so the reader's own sample list is never modified.
        all_samples_in_file = list(reader.get_sample_names())
        # Stays empty without a sex file (the original raised NameError then).
        sample2sex = {}
        if sex_file:
            sample2sex, sex2sample = read_pop_file(sex_file)
            all_samples_in_file = list(sample2sex.keys())
        # The parents get dedicated founder rows below.  The original called
        # list.pop(name), which raises TypeError because pop() takes an index.
        all_samples_in_file = [sample for sample in all_samples_in_file
                               if sample not in (mother_name, father_name)]

        # The missing-data limit is based on the offspring only.
        max_missing = int(len(all_samples_in_file) * max_prop_missing)
        all_lines = {}
        for sample in all_samples_in_file:
            if sample2sex.get(sample) == 'M':
                sex = "1"
            else:
                sex = "2"
            all_lines[sample] = [family_name, sample, father_name, mother_name, sex, "0"]
        all_lines[father_name] = [family_name, father_name, "0", "0", "1", "0"]
        all_lines[mother_name] = [family_name, mother_name, "0", "0", "2", "0"]
        all_samples_in_file.append(father_name)
        all_samples_in_file.append(mother_name)

        nb_lines = 0
        nb_sequence = 0
        count_more_than_one_allele = 0
        count_indel = 0
        count_too_many_missing = 0
        count_non_polymorphic = 0
        nb_missing_per_sample = Counter()
        # NOTE(review): the original carried a commented-out filter on the
        # parental genotype combination; it was inactive and is left out.
        for vcf_records in reader:
            nb_lines += 1
            if nb_lines % 10000 == 0:
                sys.stdout.write('.')
            if len(vcf_records.get_alt_bases()) > 1:
                count_more_than_one_allele += 1
                continue
            if vcf_records.is_indel():
                count_indel += 1
                continue

            nb_missing = 0
            all_chars = []
            all_codes = set()
            for sample in all_samples_in_file:
                gt = vcf_records.get_genotype(sample)
                gq = vcf_records.get_genotype_quality(sample)
                if gt and gq is not None and gq > genotype_quality_threshold:
                    value1, value2 = re.split('[/|]', gt)
                    code = '%s %s' % (int(value1) + 1, int(value2) + 1)
                else:
                    nb_missing += 1
                    nb_missing_per_sample[sample] += 1
                    code = '0 0'
                all_chars.append(code)
                all_codes.add(code)
            if len(all_codes) == 1:
                count_non_polymorphic += 1
                continue
            if nb_missing <= max_missing:
                nb_sequence += 1
                for i, sample in enumerate(all_samples_in_file):
                    all_lines[sample].append(all_chars[i])
            else:
                count_too_many_missing += 1
    finally:
        # The handle was never closed in the original.
        file_handle.close()

    if count_more_than_one_allele:
        logging.warning("%s snps remove because they had more than 2 alleles" % (count_more_than_one_allele))
    if count_indel:
        logging.warning("%s indels removed" % (count_indel))
    if count_non_polymorphic:
        logging.warning(
            "%s snps removed because no polymorphism was found between populations" % (count_non_polymorphic))
    if count_too_many_missing:
        logging.warning("%s snps removed because >%s missing samples" % (count_too_many_missing, max_missing))
    logging.info("%s snps output in Lepmap format" % (nb_sequence))
    for sample in nb_missing_per_sample:
        logging.info("%s markers missing in %s" % (nb_missing_per_sample.get(sample), sample))
    with open(output_file, 'w') as open_output:
        # Offspring first, then father and mother: a stable row order instead
        # of the arbitrary dict order of the original.
        for sample in all_samples_in_file:
            open_output.write('%s\n' % ('\t'.join(all_lines.get(sample))))
示例#15
0
def vcf_2_genepop(vcf_file, output_file, pop_file, genotype_quality_threshold=20, max_prop_missing=.5, phased=False):
    """Convert a VCF file into a Genepop-style text file.

    The output starts with a title line, then one ``reference:position`` line
    per retained marker, then for each population a ``Pop`` line followed by
    one ``sample pop, genotypes`` line per sample.  A genotype is written as
    two zero-padded two-digit allele numbers (allele index plus one), or
    ``0000`` when the genotype is absent or its quality is not above
    ``genotype_quality_threshold``.

    Markers are dropped when they have more than one alternate allele, are
    indels, show a single genotype among the called samples, or have more
    than ``max_prop_missing`` of the samples missing.

    :param vcf_file: path handed to ``utils_logging.open_input_file``.
    :param output_file: path of the file to write.
    :param pop_file: optional file read with ``read_pop_file``; without it
        all samples of the VCF are placed in a single ``dummy_pop``.
    :param genotype_quality_threshold: a genotype is used only if its
        quality is strictly above this value.
    :param max_prop_missing: tolerated proportion of missing genotypes.
    :param phased: read the file with ``vcfIO.PhasedVcfReader`` instead of
        ``vcfIO.VcfReader``.
    """
    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        if phased:
            reader = vcfIO.PhasedVcfReader(file_handle)
        else:
            reader = vcfIO.VcfReader(file_handle)
        all_samples_in_file = reader.get_sample_names()

        if pop_file:
            sample2pop, pop2samples = read_pop_file(pop_file)
            all_samples = list(sample2pop.keys())
        else:
            pop = 'dummy_pop'
            all_samples = all_samples_in_file
            pop2samples = {pop: all_samples_in_file}

        all_lines = {}
        for sample in all_samples:
            all_lines[sample] = []
        headers = []
        nb_sample = len(all_samples)
        max_missing = int(len(all_samples) * max_prop_missing)
        nb_sequence = 0
        count_more_than_one_allele = 0
        count_indel = 0
        count_too_many_missing = 0
        count_non_polymorphic = 0
        for vcf_record in reader:
            if len(vcf_record.get_alt_bases()) > 1:
                count_more_than_one_allele += 1
                continue
            if vcf_record.is_indel():
                count_indel += 1
                continue

            nb_missing = 0
            all_alleles = []
            all_genotypes = set()
            for sample in all_samples:
                gt = vcf_record.get_genotype(sample)
                gq = vcf_record.get_genotype_quality(sample)
                if gt and gq is not None and gq > genotype_quality_threshold:
                    # Accept both the unphased "/" and the phased "|"
                    # separator; splitting on "/" alone failed on "0|1".
                    allele1, allele2 = gt.replace('|', '/').split('/')
                    alleles = "%02d%02d" % (int(allele1) + 1, int(allele2) + 1)
                    all_genotypes.add(gt)
                else:
                    nb_missing += 1
                    alleles = '0000'
                all_alleles.append(alleles)

            if len(all_genotypes) == 1:
                count_non_polymorphic += 1
                continue

            if nb_missing <= max_missing:
                nb_sequence += 1
                headers.append('%s:%s' % (vcf_record.get_reference(), vcf_record.get_position()))
                for i, sample in enumerate(all_samples):
                    all_lines[sample].append(all_alleles[i])
            else:
                count_too_many_missing += 1
    finally:
        # The handle was never closed in the original.
        file_handle.close()

    title_line = "Generated by %s from %s on %s" % (getpass.getuser(), vcf_file, time.ctime())
    if count_more_than_one_allele:
        logging.warning("%s snps remove because they had more than 2 alleles" % (count_more_than_one_allele))
    if count_indel:
        logging.warning("%s indels removed" % (count_indel))
    if count_non_polymorphic:
        logging.warning(
            "%s snps removed because no polymorphism was found between populations" % (count_non_polymorphic))
    if count_too_many_missing:
        logging.warning("%s snps removed because >%s missing samples" % (count_too_many_missing, max_missing))
    logging.info("%s samples and %s SNPs output" % (nb_sample, nb_sequence))
    with open(output_file, 'w') as open_file:
        open_file.write(title_line + '\n')
        open_file.write('%s\n' % ('\n'.join(headers)))
        for pop in pop2samples:
            open_file.write("Pop\n")
            samples = pop2samples.get(pop)
            for sample in samples:
                open_file.write("%s %s, %s\n" % (sample, pop, '\t'.join(all_lines.get(sample))))
示例#16
0
def vcf_to_simple_genotype(vcf_file,
                           mandatory_list_sample,
                           list_samples,
                           min_nb_high_qual_sample=1,
                           print_all_genotype=False):
    """Print a tab-separated genotype table of a VCF file to standard output.

    A header line is printed first, then one line per retained record:
    reference, position, reference base, alternate base and one column per
    sample of ``list_samples`` (all samples of the file when it is empty).

    A record is retained when ``test_mandatory_samples`` and
    ``test_all_samples`` both accept it (genotype quality threshold 20,
    minimum depth 6) and it has a single alternate allele.  Genotypes are
    printed with ``0``/``1`` replaced by the reference/alternate base.  With
    ``print_all_genotype`` each sample column is ``genotype:quality:depth``
    for every sample; otherwise only valid genotypes are filled in.

    A summary of processed and discarded markers is written to standard
    error.  Always returns ``True``.
    """
    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    reader = vcfIO.VcfReader(file_handle)
    if not list_samples:
        list_samples = reader.get_sample_names()
    discarded_parents = 0
    nb_markers = 0
    print '#chr\tpos\tal1\tal2\t%s' % ('\t'.join(list_samples))
    for vcf_records in reader:
        #First check that the parent are callable
        nb_markers += 1

        keep1 = test_mandatory_samples(
            vcf_records,
            genotype_quality_threshold=20,
            minimum_depth=6,
            mandatory_list_sample=mandatory_list_sample)

        keep2 = test_all_samples(
            vcf_records,
            genotype_quality_threshold=20,
            minimum_depth=6,
            list_samples=list_samples,
            min_nb_high_qual_sample=min_nb_high_qual_sample)

        if keep1 and keep2:
            ref_base = vcf_records.get_reference_base()
            alt_bases = vcf_records.get_alt_bases()
            # Multi-allelic records are skipped silently: they are counted in
            # nb_markers but not in discarded_parents.
            if len(alt_bases) > 1:
                continue
            else:
                alt_base = alt_bases[0]
            if not print_all_genotype:
                # Only genotypes passing the quality/depth filter are reported;
                # the other samples keep the placeholder supplied by
                # generate_empty_hash_with_sample.
                genotypes_all = vcf_records.get_valid_genotype_per_sample(
                    genotype_quality_threshold=20, minimum_depth=6)
                samples2genotype = generate_empty_hash_with_sample(
                    list_samples)

                for genotype in genotypes_all:
                    sample_list = genotypes_all.get(genotype)
                    # Turn allele indices into bases, e.g. "0/1" -> "A/G".
                    genotype_str = genotype.replace('0', ref_base).replace(
                        '1', alt_base)
                    #genotype_str = genotype

                    for sample in sample_list:
                        samples2genotype[sample] = genotype_str
                out = []
                out.append(vcf_records.get_reference())
                out.append(str(vcf_records.get_position()))
                out.append(ref_base)
                out.append(alt_base)
                for sample in list_samples:
                    out.append(samples2genotype.get(sample))
                print '\t'.join(out)
            else:
                # Report every sample unfiltered, as genotype:quality:depth.
                all_genotypes = vcf_records.get_all_genotype(list_samples)
                all_genotype_quals = vcf_records.get_all_genotype_quality(
                    list_samples)
                all_sample_depth = vcf_records.get_all_sample_depth(
                    list_samples)

                out = []
                out.append(vcf_records.get_reference())
                out.append(str(vcf_records.get_position()))
                out.append(ref_base)
                out.append(alt_base)
                for i in range(len(all_genotypes)):
                    if all_genotypes[i]:
                        genotype_str = all_genotypes[i].replace(
                            '0', ref_base).replace('1', alt_base)
                    else:
                        genotype_str = '.'
                    # Missing quality/depth values are overwritten with 0 in
                    # the lists returned by the record.
                    if not all_genotype_quals[i]: all_genotype_quals[i] = 0
                    if not all_sample_depth[i]: all_sample_depth[i] = 0
                    out.append('%s:%s:%s' %
                               (genotype_str, all_genotype_quals[i],
                                all_sample_depth[i]))
                print '\t'.join(out)
        else:
            discarded_parents += 1
    sys.stderr.write("%s markers %s filtered out\n" %
                     (nb_markers, discarded_parents))
    return True
示例#17
0
def vcf_2_vcf(vcf_file, output_file, pop_file, genotype_quality_threshold=20, max_prop_missing=.5, phased=False):
    """Copy a VCF file, keeping only polymorphic bi-allelic SNPs with enough calls.

    The header lines of the reader are written first, then every record that
    passes all filters.  A record is dropped when it has more than one
    alternate allele, is an indel, shows a single allele among the called
    samples, or has more than ``max_prop_missing`` of the samples missing.
    A sample counts as missing when it has no genotype or its genotype
    quality is not above ``genotype_quality_threshold``.

    :param vcf_file: path handed to ``utils_logging.open_input_file``.
    :param output_file: path of the VCF file to write.
    :param pop_file: accepted for signature compatibility; not used here.
    :param genotype_quality_threshold: a genotype is used only if its
        quality is strictly above this value.
    :param max_prop_missing: tolerated proportion of missing genotypes.
    :param phased: read the file with ``vcfIO.PhasedVcfReader`` instead of
        ``vcfIO.VcfReader``.
    """
    nb_sequence = 0
    count_more_than_one_allele = 0
    count_indel = 0
    count_too_many_missing = 0
    count_non_polymorphic = 0

    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        if phased:
            reader = vcfIO.PhasedVcfReader(file_handle)
        else:
            reader = vcfIO.VcfReader(file_handle)
        all_samples = reader.get_sample_names()
        nb_sample = len(all_samples)
        max_missing = int(len(all_samples) * max_prop_missing)

        # "with" closes the output even if a record fails to parse.
        with open(output_file, 'w') as open_file:
            open_file.write(reader.get_header_lines())

            for vcf_record in reader:
                if len(vcf_record.get_alt_bases()) > 1:
                    count_more_than_one_allele += 1
                    continue
                if vcf_record.is_indel():
                    count_indel += 1
                    continue

                nb_missing = 0
                observed_alleles = set()
                for sample in all_samples:
                    gt = vcf_record.get_genotype(sample)
                    gq = vcf_record.get_genotype_quality(sample)
                    if gt and gq is not None and gq > genotype_quality_threshold:
                        # Accept both the unphased "/" and the phased "|"
                        # separator; splitting on "/" alone failed on "0|1".
                        allele1, allele2 = gt.replace('|', '/').split('/')
                        observed_alleles.add(allele1)
                        observed_alleles.add(allele2)
                    else:
                        nb_missing += 1

                if len(observed_alleles) == 1:
                    count_non_polymorphic += 1
                    continue

                if nb_missing <= max_missing:
                    nb_sequence += 1
                    open_file.write(str(vcf_record) + '\n')
                else:
                    count_too_many_missing += 1
    finally:
        # The input handle was never closed in the original.
        file_handle.close()

    if count_more_than_one_allele:
        logging.warning("%s snps remove because they had more than 2 alleles" % (count_more_than_one_allele))
    if count_indel:
        logging.warning("%s indels removed" % (count_indel))
    if count_non_polymorphic:
        logging.warning(
            "%s snps removed because no polymorphism was found between populations" % (count_non_polymorphic))
    if count_too_many_missing:
        logging.warning("%s snps removed because >%s missing samples" % (count_too_many_missing, max_missing))
    logging.info("%s samples and %s SNPs output" % (nb_sample, nb_sequence))
示例#18
0
def vcf_to_phylip_or_nexus(vcf_file, output_file, pop_file=None, genotype_quality_threshold=20, max_prop_missing=.5,
                           phased=False, type_output=OUTPUT_TYPE_PHYLIP):
    """Convert a VCF file into a Phylip or Nexus alignment of SNP sites.

    Every retained record contributes one character per sample: the code
    returned by ``atgc2iupac`` for the sample's two alleles, or ``?`` when
    the genotype is absent or its quality is not above
    ``genotype_quality_threshold``.

    Records are dropped when they have more than one alternate allele, are
    indels, show a single code among the called samples, or have more than
    ``max_prop_missing`` of the samples missing.

    :param vcf_file: path handed to ``utils_logging.open_input_file``.
    :param output_file: path of the alignment to write.
    :param pop_file: optional file read with ``read_pop_file``; when given
        its samples replace the VCF sample list.
    :param genotype_quality_threshold: a genotype is used only if its
        quality is strictly above this value.
    :param max_prop_missing: tolerated proportion of missing genotypes.
    :param phased: accepted for signature compatibility; phasing is not
        useful for these formats, so the plain reader is always used.
    :param type_output: ``OUTPUT_TYPE_PHYLIP`` or ``OUTPUT_TYPE_NEXUS``; for
        any other value the output file is created empty.
    """
    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        reader = vcfIO.VcfReader(file_handle)
        all_samples_in_file = reader.get_sample_names()
        if pop_file:
            sample2pop, pop2sample = read_pop_file(pop_file)
            all_samples_in_file = list(sample2pop.keys())

        max_missing = int(len(all_samples_in_file) * max_prop_missing)

        all_lines = {}
        for sample in all_samples_in_file:
            all_lines[sample] = []
        nb_sample = len(all_samples_in_file)
        nb_lines = 0
        nb_sequence = 0
        count_more_than_one_allele = 0
        count_indel = 0
        count_too_many_missing = 0
        count_non_polymorphic = 0
        nb_missing_per_sample = Counter()
        for vcf_records in reader:
            nb_lines += 1
            if nb_lines % 10000 == 0:
                sys.stdout.write('.')
            alt_bases = vcf_records.get_alt_bases()
            if len(alt_bases) > 1:
                count_more_than_one_allele += 1
                continue
            if vcf_records.is_indel():
                count_indel += 1
                continue
            # Allele index -> base: the reference first, then the alternates.
            bases = [vcf_records.get_reference_base()]
            bases.extend(alt_bases)

            nb_missing = 0
            set_missing = set()
            all_chars = []
            all_codes = set()
            for sample in all_samples_in_file:
                gt = vcf_records.get_genotype(sample)
                gq = vcf_records.get_genotype_quality(sample)
                if gt and gq is not None and gq > genotype_quality_threshold:
                    value1, value2 = re.split('[/|]', gt)
                    code = atgc2iupac(list(set([bases[int(value1)], bases[int(value2)]])))
                    all_codes.add(code)
                else:
                    nb_missing += 1
                    set_missing.add(sample)
                    code = '?'
                all_chars.append(code)
            if len(all_codes) == 1:
                count_non_polymorphic += 1
                continue
            if nb_missing <= max_missing:
                nb_sequence += 1
                # Missing calls are only counted for sites that are written.
                for s in set_missing:
                    nb_missing_per_sample[s] += 1
                for i, sample in enumerate(all_samples_in_file):
                    all_lines[sample].append(all_chars[i])
            else:
                count_too_many_missing += 1
    finally:
        # The handle was never closed in the original.
        file_handle.close()

    if count_more_than_one_allele:
        logging.warning("%s snps remove because they had more than 2 alleles" % (count_more_than_one_allele))
    if count_indel:
        logging.warning("%s indels removed" % (count_indel))
    if count_non_polymorphic:
        logging.warning(
            "%s snps removed because no polymorphism was found between populations" % (count_non_polymorphic))
    if count_too_many_missing:
        logging.warning("%s snps removed because >%s missing samples" % (count_too_many_missing, max_missing))
    # Report the format actually requested; the original always logged the
    # Phylip constant, even when writing Nexus.
    logging.info("%s snps output in %s format" % (nb_sequence, type_output))
    for sample in nb_missing_per_sample:
        logging.info("%s markers missing in %s" % (nb_missing_per_sample.get(sample), sample))
    with open(output_file, 'w') as open_output:
        if type_output == OUTPUT_TYPE_PHYLIP:
            # Phylip: a "<samples> <sites>" header, then one line per sample.
            open_output.write('%s %s\n' % (nb_sample, nb_sequence))
            for sample in all_samples_in_file:
                open_output.write("%s %s\n" % (sample, ''.join(all_lines.get(sample))))
        elif type_output == OUTPUT_TYPE_NEXUS:
            open_output.write(format_nexus(all_lines))
示例#19
0
def vcf_to_lepmap(vcf_file,
                  output_file,
                  sex_file,
                  mother_name,
                  father_name,
                  family_name,
                  genotype_quality_threshold=20,
                  max_prop_missing=.5):
    """Write the bi-allelic SNPs of a VCF file as one pedigree line per sample.

    Every output line is tab separated and starts with six columns
    (family_name, sample, father, mother, sex, "0") followed by one genotype
    code per retained marker.  A genotype code is the two allele indexes of
    the call, each incremented by one and separated by a space ("0/1"
    becomes "1 2"); "0 0" marks a missing call.  Offspring are written first,
    then the father, then the mother.

    A marker is discarded when it has more than one alternate allele, when it
    is an indel, when all the called genotypes are identical, or when more
    than ``max_missing`` samples have no usable call.

    :param vcf_file: path of the VCF file, opened through
        ``utils_logging.open_input_file``.
    :param output_file: path of the file to write.
    :param sex_file: optional file read with ``read_pop_file``; when given its
        samples replace the ones listed in the VCF and a value of 'M' gives
        sex "1".  Every other sample (or every sample when no file is given)
        gets sex "2".
    :param mother_name: sample name of the mother (sex "2").
    :param father_name: sample name of the father (sex "1").
    :param family_name: value of the first column of every line.
    :param genotype_quality_threshold: a call is used only when its genotype
        quality is strictly greater than this value.
    :param max_prop_missing: proportion of the offspring used to derive the
        maximum number of missing calls tolerated for a marker.
    """
    split_alleles = re.compile(r'[/|]').split
    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        reader = vcfIO.VcfReader(file_handle)
        # Copy the names so the reader's own list is never modified.
        samples = list(reader.get_sample_names())
        sample2sex = {}
        if sex_file:
            sample2sex, _sex2sample = read_pop_file(sex_file)
            samples = list(sample2sex.keys())
        # The parents are handled separately: drop them by value here and add
        # them back at the end of the sample order.
        parents = (father_name, mother_name)
        offspring = [sample for sample in samples if sample not in parents]

        # The threshold is derived from the number of offspring only, whereas
        # the missing count below also includes the two parents.
        max_missing = int(len(offspring) * max_prop_missing)

        all_lines = {}
        for sample in offspring:
            sex = "1" if sample2sex.get(sample) == 'M' else "2"
            all_lines[sample] = [
                family_name, sample, father_name, mother_name, sex, "0"
            ]
        all_lines[father_name] = [family_name, father_name, "0", "0", "1", "0"]
        all_lines[mother_name] = [family_name, mother_name, "0", "0", "2", "0"]
        ordered_samples = offspring + [father_name, mother_name]

        nb_lines = 0
        nb_sequence = 0
        count_more_than_one_allele = 0
        count_indel = 0
        count_too_many_missing = 0
        count_non_polymorphic = 0
        nb_missing_per_sample = Counter()
        for vcf_records in reader:
            nb_lines += 1
            if nb_lines % 10000 == 0:
                # Progress indicator.
                sys.stdout.write('.')
            if len(vcf_records.get_alt_bases()) > 1:
                count_more_than_one_allele += 1
                continue
            if vcf_records.is_indel():
                count_indel += 1
                continue

            all_chars = []
            called_codes = set()
            missing_samples = []
            for sample in ordered_samples:
                gt = vcf_records.get_genotype(sample)
                gq = vcf_records.get_genotype_quality(sample)
                if gt and gq is not None and gq > genotype_quality_threshold:
                    value1, value2 = split_alleles(gt)
                    code = '%s %s' % (int(value1) + 1, int(value2) + 1)
                    called_codes.add(code)
                else:
                    missing_samples.append(sample)
                    code = '0 0'
                all_chars.append(code)

            # Only real calls count towards polymorphism: a single genotype
            # plus missing data carries no information.
            if len(called_codes) == 1:
                count_non_polymorphic += 1
                continue
            if len(missing_samples) <= max_missing:
                nb_sequence += 1
                # Missing calls are reported for the retained markers only.
                nb_missing_per_sample.update(missing_samples)
                for sample, code in zip(ordered_samples, all_chars):
                    all_lines[sample].append(code)
            else:
                count_too_many_missing += 1
    finally:
        file_handle.close()

    # NOTE: filtering on the parental genotypes is not applied.
    if count_more_than_one_allele:
        logging.warning("%s snps remove because they had more than 2 alleles",
                        count_more_than_one_allele)
    if count_indel:
        logging.warning("%s indels removed", count_indel)
    if count_non_polymorphic:
        logging.warning(
            "%s snps removed because no polymorphism was found between populations",
            count_non_polymorphic)
    if count_too_many_missing:
        logging.warning("%s snps removed because >%s missing samples",
                        count_too_many_missing, max_missing)
    logging.info("%s snps output in Lepmap format", nb_sequence)
    for sample in nb_missing_per_sample:
        logging.info("%s markers missing in %s",
                     nb_missing_per_sample.get(sample), sample)
    with open(output_file, 'w') as open_output:
        for sample in ordered_samples:
            open_output.write('%s\n' % ('\t'.join(all_lines[sample])))
示例#20
0
def vcf_to_structure(vcf_file, output_file, pop_file, genotype_quality_threshold=20, max_prop_missing=.5, phased=False):
    """Write the bi-allelic SNPs of a VCF file as a two-rows-per-sample table.

    The output starts with a header line holding the retained marker names
    ("reference:position", tab separated, preceded by an empty column).  Each
    sample then gets two lines, one per allele, made of the sample name, the
    1-based index of its population and one allele index per marker.  Missing
    calls are written as -9.

    A marker is discarded when it has more than one alternate allele, when it
    is an indel, when all the called genotypes are identical, or when more
    than ``max_missing`` samples have no usable call.

    :param vcf_file: path of the VCF file, opened through
        ``utils_logging.open_input_file``.
    :param output_file: path of the file to write.
    :param pop_file: optional file read with ``read_pop_file`` giving the
        population of each sample; without it every sample of the VCF is put
        in a single population.
    :param genotype_quality_threshold: a call is used only when its genotype
        quality is strictly greater than this value.
    :param max_prop_missing: proportion of the samples used to derive the
        maximum number of missing calls tolerated for a marker.
    :param phased: read the file with ``vcfIO.PhasedVcfReader`` instead of
        ``vcfIO.VcfReader``.
    :return: -2 when samples of the population file are absent from the VCF
        file, None otherwise.
    """
    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        if phased:
            reader = vcfIO.PhasedVcfReader(file_handle)
        else:
            reader = vcfIO.VcfReader(file_handle)
        all_samples_in_file = reader.get_sample_names()

        if pop_file:
            sample2pop, pop2sample = read_pop_file(pop_file)
            # Real lists are needed: pops is searched with index() below.
            pops = list(pop2sample.keys())
            all_samples = list(sample2pop.keys())
        else:
            pop = 'dummy_pop'
            pops = [pop]
            all_samples = list(all_samples_in_file)
            sample2pop = dict((sample, pop) for sample in all_samples)

        sample_errors = set(all_samples).difference(all_samples_in_file)
        if sample_errors:
            logging.critical('%s samples (%s) from the population file not found in the vcf file' % (
                len(sample_errors), ', '.join(sorted(sample_errors))))
            return -2

        headers = []
        # One list of alleles per sample and per haplotype row.
        first_alleles = dict((sample, []) for sample in all_samples)
        second_alleles = dict((sample, []) for sample in all_samples)
        nb_sample = len(all_samples)
        max_missing = int(nb_sample * max_prop_missing)
        nb_sequence = 0
        count_more_than_one_allele = 0
        count_indel = 0
        count_too_many_missing = 0
        count_non_polymorphic = 0
        for vcf_records in reader:
            if len(vcf_records.get_alt_bases()) > 1:
                count_more_than_one_allele += 1
                continue
            if vcf_records.is_indel():
                count_indel += 1
                continue

            nb_missing = 0
            all_alleles1 = []
            all_alleles2 = []
            all_genotypes = set()
            for sample in all_samples:
                gt = vcf_records.get_genotype(sample)
                gq = vcf_records.get_genotype_quality(sample)
                if gt and gq is not None and gq > genotype_quality_threshold:
                    # Accept both the unphased ("/") and the phased ("|")
                    # allele separators.
                    allele1, allele2 = gt.replace('|', '/').split('/')
                    all_genotypes.add(gt)
                else:
                    nb_missing += 1
                    allele1, allele2 = ('-9', '-9')
                all_alleles1.append(allele1)
                all_alleles2.append(allele2)

            if len(all_genotypes) == 1:
                count_non_polymorphic += 1
                continue

            if nb_missing <= max_missing:
                nb_sequence += 1
                headers.append('%s:%s' % (vcf_records.get_reference(), vcf_records.get_position()))
                for i, sample in enumerate(all_samples):
                    first_alleles[sample].append(all_alleles1[i])
                    second_alleles[sample].append(all_alleles2[i])
            else:
                count_too_many_missing += 1
    finally:
        file_handle.close()

    if count_more_than_one_allele:
        logging.warning("%s snps remove because they had more than 2 alleles", count_more_than_one_allele)
    if count_indel:
        logging.warning("%s indels removed", count_indel)
    if count_non_polymorphic:
        logging.warning(
            "%s snps removed because no polymorphism was found between populations", count_non_polymorphic)
    if count_too_many_missing:
        logging.warning("%s snps removed because >%s missing samples", count_too_many_missing, max_missing)
    logging.info("%s samples and %s SNPs output", nb_sample, nb_sequence)
    with open(output_file, 'w') as open_file:
        open_file.write('\t%s\n' % ('\t'.join(headers)))
        for sample in all_samples:
            # Populations are numbered from 1 in the order of pops.
            pop_number = pops.index(sample2pop.get(sample)) + 1
            for alleles in (first_alleles[sample], second_alleles[sample]):
                open_file.write("%s\t%s\t%s\n" % (sample, pop_number, '\t'.join(alleles)))
示例#21
0
def vcf_to_simple_genotype(vcf_file, mandatory_list_sample, list_samples, min_nb_high_qual_sample=1,
                           print_all_genotype=False, genotype_quality_threshold=20, minimum_depth=6):
    """Print the genotypes of the bi-allelic markers of a VCF file as a table.

    A header line (#chr, pos, al1, al2, then one column per sample) is written
    to the standard output, followed by one tab separated line per marker
    that passes both ``test_mandatory_samples`` and ``test_all_samples``.
    In the genotype strings the allele indexes 0 and 1 are replaced by the
    reference and the alternate base.

    Markers that do not have exactly one alternate allele are skipped; they
    are counted as markers but not as filtered out in the final summary,
    which is written to the standard error.

    :param vcf_file: path of the VCF file, opened through
        ``utils_logging.open_input_file``.
    :param mandatory_list_sample: samples passed to ``test_mandatory_samples``.
    :param list_samples: samples to output; when empty, all the samples of
        the VCF file are used.
    :param min_nb_high_qual_sample: passed to ``test_all_samples``.
    :param print_all_genotype: when true each cell is
        "genotype:quality:depth" for every sample, with "." for an absent
        genotype and 0 for an absent quality or depth; otherwise only the
        genotypes returned by ``get_valid_genotype_per_sample`` are filled in.
    :param genotype_quality_threshold: genotype quality threshold handed to
        the filtering helpers.
    :param minimum_depth: depth threshold handed to the filtering helpers.
    :return: True
    """
    file_handle = utils_logging.open_input_file(vcf_file, pipe=False)
    try:
        reader = vcfIO.VcfReader(file_handle)
        if not list_samples:
            list_samples = reader.get_sample_names()
        discarded_parents = 0
        nb_markers = 0
        sys.stdout.write('#chr\tpos\tal1\tal2\t%s\n' % ('\t'.join(list_samples)))
        for vcf_records in reader:
            nb_markers += 1

            # First check that the mandatory samples are callable ...
            keep1 = test_mandatory_samples(vcf_records,
                                           genotype_quality_threshold=genotype_quality_threshold,
                                           minimum_depth=minimum_depth,
                                           mandatory_list_sample=mandatory_list_sample)
            # ... and that enough of the requested samples are of good quality.
            keep2 = test_all_samples(vcf_records,
                                     genotype_quality_threshold=genotype_quality_threshold,
                                     minimum_depth=minimum_depth,
                                     list_samples=list_samples,
                                     min_nb_high_qual_sample=min_nb_high_qual_sample)
            if not (keep1 and keep2):
                discarded_parents += 1
                continue

            ref_base = vcf_records.get_reference_base()
            alt_bases = vcf_records.get_alt_bases()
            # Exactly one alternate allele is required to translate the
            # genotypes below.
            if len(alt_bases) != 1:
                continue
            alt_base = alt_bases[0]

            out = [
                vcf_records.get_reference(),
                str(vcf_records.get_position()),
                ref_base,
                alt_base,
            ]
            if not print_all_genotype:
                genotypes_all = vcf_records.get_valid_genotype_per_sample(
                    genotype_quality_threshold=genotype_quality_threshold,
                    minimum_depth=minimum_depth)
                samples2genotype = generate_empty_hash_with_sample(list_samples)
                for genotype in genotypes_all:
                    genotype_str = genotype.replace('0', ref_base).replace('1', alt_base)
                    for sample in genotypes_all.get(genotype):
                        samples2genotype[sample] = genotype_str
                for sample in list_samples:
                    out.append(samples2genotype.get(sample))
            else:
                all_genotypes = vcf_records.get_all_genotype(list_samples)
                all_genotype_quals = vcf_records.get_all_genotype_quality(list_samples)
                all_sample_depth = vcf_records.get_all_sample_depth(list_samples)
                for genotype, quality, depth in zip(all_genotypes, all_genotype_quals, all_sample_depth):
                    if genotype:
                        genotype_str = genotype.replace('0', ref_base).replace('1', alt_base)
                    else:
                        genotype_str = '.'
                    out.append('%s:%s:%s' % (genotype_str, quality or 0, depth or 0))
            sys.stdout.write('\t'.join(out) + '\n')
    finally:
        file_handle.close()
    sys.stderr.write("%s markers %s filtered out\n" % (nb_markers, discarded_parents))
    return True