def main(options):
    """
    Convert a haplotype file to eigenstrat format (.snp, .ind and .geno files).

    options is a dict providing the input paths "hap", "donor" and
    "recipient" (each opened with gdc.open2) and the output prefix "out".

    The .ind file is written by write_ind_file, which also returns the
    number of individuals. Every line of the hap file is split on whitespace
    into chromosome, position and one allele per haplotype; consecutive pairs
    of haplotypes are treated as one diploid individual. Sites with more than
    two distinct alleles are counted and skipped.

    Raises Exception if a (non-skipped) site does not carry exactly two
    haplotypes per individual.
    """
    hap = gdc.open2(options["hap"])
    don = gdc.open2(options["donor"])
    rec = gdc.open2(options["recipient"])
    snp, ind, geno = [
        open(options["out"] + ext, "w") for ext in (".snp", ".ind", ".geno")
    ]

    try:
        total_individuals = write_ind_file(rec, don, ind)

        removed = {"multiallelic": 0}
        for line in hap:
            bits = line[:-1].split()
            chrom, pos = bits[0:2]
            gts = bits[2:]
            alleles = list(set(gts))
            if len(alleles) > 2:
                removed["multiallelic"] += 1
                # `continue`, not a bare `next`: the latter only names the
                # builtin and would let the multiallelic site be written out.
                continue
            if len(gts) != 2 * total_individuals:
                raise Exception(
                    "N.genotypes!=N.individuals T %s %s" % (chrom, pos))

            # The genotype is the number of copies of alleles[0] - should
            # check this is ok.
            # NOTE(review): alleles comes from a set, so which allele is
            # first is arbitrary, and a site with a single observed allele
            # raises IndexError on alleles[1] below.
            snp.write("\t".join(
                [chrom + ":" + pos, chrom, "0.0", pos, alleles[0], alleles[1]])
                + "\n")
            geno.write("".join(
                str(sum([g == alleles[0] for g in gts[(i * 2):(i * 2 + 2)]]))
                for i in range(total_individuals)) + "\n")
    finally:
        # Close everything even if a malformed line aborts the conversion.
        for handle in (hap, don, rec, snp, ind, geno):
            handle.close()
示例#2
0
def main(options):
    """
    Convert a haplotype file to eigenstrat format (.snp, .ind and .geno files).

    options is a dict providing the input paths "hap", "donor" and
    "recipient" (each opened with gdc.open2) and the output prefix "out".

    The .ind file is written by write_ind_file, which also returns the
    number of individuals. Every line of the hap file is split on whitespace
    into chromosome, position and one allele per haplotype; consecutive pairs
    of haplotypes are treated as one diploid individual. Sites with more than
    two distinct alleles are counted and skipped.

    Raises Exception if a (non-skipped) site does not carry exactly two
    haplotypes per individual.
    """
    hap=gdc.open2(options["hap"])
    don=gdc.open2(options["donor"])
    rec=gdc.open2(options["recipient"])
    snp, ind, geno = [open(options["out"]+ext, "w") for ext in (".snp", ".ind", ".geno")]

    try:
        total_individuals=write_ind_file(rec, don, ind)

        removed={"multiallelic":0}
        for line in hap:
            bits=line[:-1].split()
            chrom,pos=bits[0:2]
            gts=bits[2:]
            alleles=list(set(gts))
            if len(alleles)>2:
                removed["multiallelic"]+=1
                # `continue`, not a bare `next`: the latter only names the
                # builtin and would let the multiallelic site be written out.
                continue
            if len(gts)!=2*total_individuals:
                raise Exception("N.genotypes!=N.individuals T %s %s"%(chrom, pos))

            # The genotype is the number of copies of alleles[0] - should
            # check this is ok.
            # NOTE(review): alleles comes from a set, so which allele is
            # first is arbitrary, and a site with a single observed allele
            # raises IndexError on alleles[1] below.
            snp.write("\t".join([chrom+":"+pos, chrom, "0.0", pos, alleles[0], alleles[1]])+"\n")
            geno.write("".join(
                str(sum([g==alleles[0] for g in gts[(i*2):(i*2+2)]]))
                for i in range(total_individuals))+"\n")
    finally:
        # Close everything even if a malformed line aborts the conversion.
        for handle in (hap, don, rec, snp, ind, geno):
            handle.close()
示例#3
0
文件: ms2psmc.py 项目: shaferab/gdc
def read_ms(ms_file, options):
    """ 
    Read a ms file and return positions and haplotypes

    ms_file is opened with gdc.open2. options is a dict; if options["macs"]
    is true the number of haplotypes and the sequence length are taken from
    fields 1 and 2 of the first line, otherwise the length comes from
    options["length"] and the haplotype count is not checked.

    Returns (length, pos, haps) where pos holds the fractional positions
    scaled by length and truncated to int, and haps is the transposed
    haplotype matrix read from the rest of the file.

    Raises Exception if the segsites/positions header is missing or if the
    matrix dimensions disagree with the header.
    """
    ms=gdc.open2(ms_file)
    try:
        nhap=None
        line=next(ms)       # builtin next() works on Python 2 and 3 iterators
        if options["macs"]:
            nhap, length = [int(x) for x in line.split()[1:3]]
        else:        #For example, srcm with -SC abs
            length=options["length"]

        npos=pos=None
        for line in ms:
            if line.startswith("segsites:"):
                npos=int(line.split()[1])
            elif line.startswith("positions:"):
                pos=np.array([int(length*float(p)) for p in line.split()[1:]])
                if len(pos) != npos:
                    raise Exception("Number of positions does not match segsites")
                break

        if pos is None:
            # Fail with a clear message instead of an unbound-name error later.
            raise Exception("No positions line found")

        # Assume the rest of the file is the haplotypes, one character per site.
        haps=np.genfromtxt(ms, dtype=int, delimiter=1)
        haps=np.transpose(haps)
    finally:
        ms.close()

    if haps.shape[0] != npos:
        raise Exception("Number of positions doesn't match")
    if nhap and haps.shape[1] != nhap:
        raise Exception("Number of haplotypes doesn't match")

    return length, pos, haps
示例#4
0
def load_from_arp(arp):
    """
    Parse an arp file into an internal format, returning site data, which is a 
    list of lists, with the sites on each chromosome, and gt data which is a 
    dict with infomation on the samples and genotypes. 
    I'm really just guessing what the structure of the file is.

    gt data maps each sample name to {"size": number of haplotypes,
    "gt": (N_sites x size) int array}, where every non-zero allele code is
    collapsed to 1.

    Raises Exception if a genotype string has the wrong length, or if the
    chromosome or site totals declared in the header disagree with the
    parsed site data.
    """
    arp_file=gdc.open2(arp)

    N_chr=-1
    N_sites=-1
    site_data=[]
    gt_data={}
    ascertained=False
    try:
        # The tests below are deliberately independent `if`s: several of them
        # advance the file and rebind `line`, and later tests see the new line.
        for line in arp_file:
            if line.startswith("#Number of independent chromosomes"):
                N_chr=int(line.split()[-1])
            if line.startswith("#Total number of polymorphic sites") and not ascertained:
                N_sites=int(line.split()[-1])
            if line.startswith("#ASCERTAINED DATA"):
                # From here on only the ascertained counts/positions are used.
                ascertained=True
                N_sites=-1
            if line.startswith("#Number of polym. sites meeting ascertainment criterion:"):
                N_sites=int(line.split()[-1])
            if "polymorphic positions on chromosome" in line and not ascertained:
                npos=int(line.split()[1])     # parsed but not otherwise used
                chrom=int(line.split()[-1])
                line=next(arp_file)
                site_data.append(None)
                # line[1:] drops the first character of the positions line.
                site_data[chrom-1]=[int(x.replace(",", "")) for x in line[1:].split()]
            if line.startswith("#Ascertained polymorphic positions on chromosome") and ascertained:
                chrom=int(line.split()[-1])
                line=next(arp_file)
                npos=len(line.split())        # parsed but not otherwise used
                site_data.append(None)
                site_data[chrom-1]=[int(x.replace(",", "")) for x in line[1:].split()]
            if "SampleName=" in line:
                sname=line.split("\"")[-2]
                line=next(arp_file)
                ssize=int(line.split("=")[-1])
                gt=np.zeros((N_sites, ssize), dtype='int')
                next(arp_file)                # "SampleData= {" line
                for i in range(ssize):
                    line=next(arp_file)
                    gt_string=line.split()[2]
                    if len(gt_string)!=N_sites:
                        raise Exception("Wrong number of sites")
                    gt[:,i]=[int(int(x)>0) for x in gt_string]   # converting {1,2,3}->1
                gt_data[sname]={"size":ssize, "gt":gt}
    finally:
        arp_file.close()

    if not N_chr==len(site_data):
        raise Exception( "Number of chromosomes does not match site data" )
    if not N_sites==sum([len(x) for x in site_data]):
        raise Exception( "Total number of sites does not match" )

    return site_data, gt_data
示例#5
0
def main(options):
    """
    Convert vcf to eigenstrat format (ind, snp and geno files)

    options is a dict with keys:
      "vcf"      - input path, opened with gdc.open2
      "out"      - output prefix; ".snp", ".ind" and ".geno" are appended
      "ref"      - if true, its value is written as an extra first individual
                   (population REF) with genotype "2" at every site
      "indAsPop" - if true, each individual's name is used as its population
                   label instead of "POP"

    Sites whose ALT field contains a comma, or whose REF or ALT is not a
    single character, are skipped and counted. A summary is printed at the end.
    """
    vcf=gdc.open2(options["vcf"])
    snp, ind, geno = [open(options["out"]+ext, "w") for ext in (".snp", ".ind", ".geno")]
    removed={"multiallelic":0, "indel":0}
    count=0

    try:
        for line in vcf:
            if line[:2]=="##":                # Comment line
                continue
            if line[:6]=="#CHROM":            # Header line
                inds=line.split()[9:]
                if options["ref"]:
                    ind.write(options["ref"]+"\tU\tREF\n")
                for indi in inds:
                    if not options["indAsPop"]:
                        ind.write(indi+"\tU\tPOP\n")
                    else:
                        ind.write(indi+"\tU\t"+indi+"\n")
                continue

            # data
            bits=line.split()
            if "," in bits[4]:
                # Several comma-separated ALT alleles: a multiallelic site.
                removed["multiallelic"]+=1
                continue
            if len(bits[3])!=1 or len(bits[4])!=1:
                # REF or ALT longer than one base: not a simple SNP.
                removed["indel"]+=1
                continue

            if bits[2]==".":
                # No ID in the vcf, so build one from chromosome and position.
                bits[2]=bits[0]+":"+bits[1]
            snp.write("    ".join([bits[2], bits[0], "0.0", bits[1], bits[3], bits[4]])+"\n")
            geno_string=""
            if options["ref"]:
                geno_string="2"
            for gt in bits[9:]:
                geno_string+=decode_gt_string(gt)
            geno.write(geno_string+"\n")
            count+=1
    finally:
        # Actually call close(); a bare `f.close` only references the method.
        for f in (vcf, ind, snp, geno):
            f.close()

    # Single parenthesised argument: prints identically on Python 2 and 3.
    print("Done. Wrote "+str(count) + " sites")
    print("Excluded " + str(sum(removed.values())) + " sites")
    for key in removed:
        print("Excluded " + str(removed[key]) + " " + key)
    return
示例#6
0
def main(options):
    """
    Convert vcf to eigenstrat format (ind, snp and geno files)

    options is a dict with keys:
      "vcf"      - input path, opened with gdc.open2
      "out"      - output prefix; ".snp", ".ind" and ".geno" are appended
      "ref"      - if true, its value is written as an extra first individual
                   (population REF) with genotype "2" at every site
      "indAsPop" - if true, each individual's name is used as its population
                   label instead of "POP"

    Sites whose ALT field contains a comma, or whose REF or ALT is not a
    single character, are skipped and counted. A summary is printed at the end.
    """
    vcf=gdc.open2(options["vcf"])
    snp, ind, geno = [open(options["out"]+ext, "w") for ext in (".snp", ".ind", ".geno")]
    removed={"multiallelic":0, "indel":0}
    count=0

    try:
        for line in vcf:
            if line[:2]=="##":                # Comment line
                continue
            if line[:6]=="#CHROM":            # Header line
                inds=line.split()[9:]
                if options["ref"]:
                    ind.write(options["ref"]+"\tU\tREF\n")
                for indi in inds:
                    if not options["indAsPop"]:
                        ind.write(indi+"\tU\tPOP\n")
                    else:
                        ind.write(indi+"\tU\t"+indi+"\n")
                continue

            # data
            bits=line.split()
            if "," in bits[4]:
                # Several comma-separated ALT alleles: a multiallelic site.
                removed["multiallelic"]+=1
                continue
            if len(bits[3])!=1 or len(bits[4])!=1:
                # REF or ALT longer than one base: not a simple SNP.
                removed["indel"]+=1
                continue

            if bits[2]==".":
                # No ID in the vcf, so build one from chromosome and position.
                bits[2]=bits[0]+":"+bits[1]
            snp.write("    ".join([bits[2], bits[0], "0.0", bits[1], bits[3], bits[4]])+"\n")
            geno_string=""
            if options["ref"]:
                geno_string="2"
            for gt in bits[9:]:
                geno_string+=decode_gt_string(gt)
            geno.write(geno_string+"\n")
            count+=1
    finally:
        # Actually call close(); a bare `f.close` only references the method.
        for f in (vcf, ind, snp, geno):
            f.close()

    # Single parenthesised argument: prints identically on Python 2 and 3.
    print("Done. Wrote "+str(count) + " sites")
    print("Excluded " + str(sum(removed.values())) + " sites")
    for key in removed:
        print("Excluded " + str(removed[key]) + " " + key)
    return
示例#7
0
def read_ms(ms_file, options):
    """ 
    Read a ms file and return positions and haplotypes

    ms_file is opened with gdc.open2. options is a dict; if options["macs"]
    is true the number of haplotypes and the sequence length are taken from
    fields 1 and 2 of the first line, otherwise the length comes from
    options["length"] and the haplotype count is not checked.

    Returns (length, pos, haps) where pos holds the fractional positions
    scaled by length and truncated to int, and haps is the transposed
    haplotype matrix read from the rest of the file.

    Raises Exception if the segsites/positions header is missing or if the
    matrix dimensions disagree with the header.
    """

    ms = gdc.open2(ms_file)
    try:
        nhap = None
        line = next(ms)  # builtin next() works on Python 2 and 3 iterators
        if options["macs"]:
            nhap, length = [int(x) for x in line.split()[1:3]]
        else:  #For example, srcm with -SC abs
            length = options["length"]

        npos = pos = None
        for line in ms:
            if line.startswith("segsites:"):
                npos = int(line.split()[1])
            elif line.startswith("positions:"):
                pos = np.array(
                    [int(length * float(p)) for p in line.split()[1:]])
                if len(pos) != npos:
                    raise Exception(
                        "Number of positions does not match segsites")
                break

        if pos is None:
            # Fail with a clear message instead of an unbound-name error later.
            raise Exception("No positions line found")

        # Assume the rest of the file is the haplotypes, one character per site.
        haps = np.genfromtxt(ms, dtype=int, delimiter=1)
        haps = np.transpose(haps)
    finally:
        ms.close()

    if haps.shape[0] != npos:
        raise Exception("Number of positions doesn't match")
    if nhap and haps.shape[1] != nhap:
        raise Exception("Number of haplotypes doesn't match")

    return length, pos, haps
示例#8
0
def read_macs(macs_file):
    """ 
    Read a macs file and return positions and haplotypes

    macs_file is opened with gdc.open2. The number of haplotypes and the
    sequence length are taken from fields 1 and 2 of the first line.

    Returns (length, pos, haps) where pos holds the fractional positions
    scaled by length and truncated to int, and haps is the transposed
    haplotype matrix read from the rest of the file.

    Raises Exception if the segsites/positions header is missing or if the
    matrix shape is not (segsites, haplotypes).
    """

    macs=gdc.open2(macs_file)
    try:
        line=next(macs)     # builtin next() works on Python 2 and 3 iterators
        nhap, length = [int(x) for x in line.split()[1:3]]

        npos=pos=None
        for line in macs:
            if line.startswith("segsites:"):
                npos=int(line.split()[1])
            elif line.startswith("positions:"):
                pos=np.array([int(length*float(p)) for p in line.split()[1:]])
                if len(pos) != npos:
                    raise Exception("Number of positions does not match segsites")
                break

        if pos is None:
            # Fail with a clear message instead of an unbound-name error later.
            raise Exception("No positions line found")

        # Assume the rest of the file is the haplotypes, one character per site.
        haps=np.genfromtxt(macs, dtype=int, delimiter=1)
        haps=np.transpose(haps)
    finally:
        macs.close()

    if haps.shape != (npos, nhap):
        raise Exception("Genotype matrix shape doesn't match")
    
    return length, pos, haps
def main(argv):
    """
    Write per-site alternate allele counts and frequencies from a vcf to csv.

    argv is the list of command line arguments (without the program name):
      -o/--output   path of the csv file to write
      -v/--vcf      path of the vcf, opened with gdc.open2
      -d/--diploid  '1', 'T', 't', 'True' or 'true' for diploid samples;
                    anything else means not diploid

    All three options are required. The process exits with status 2 if the
    options cannot be parsed or one of them is missing.

    The csv has the columns ID, chr, pos, Alt, N, Fq. Samples whose genotype
    field starts with '.' are ignored, and sites where tally reports a total
    of zero are not written.
    """
    try:
        opts, _args = getopt.getopt(argv, "o:v:d:",
                                    ["output=", "vcf=", "diploid="])
    except getopt.GetoptError:
        sys.exit(2)

    output_filename = vcf_filename = d_flag = None
    for opt, arg in opts:
        if opt in ("-o", "--output"):
            output_filename = arg
        elif opt in ("-v", "--vcf"):
            vcf_filename = arg
        elif opt in ("-d", "--diploid"):  # samples diploid? (True/False)
            d_flag = arg in ('1', 'T', 't', 'True', 'true')

    if output_filename is None or vcf_filename is None or d_flag is None:
        # Exit cleanly rather than hitting an undefined name further down.
        sys.stderr.write("--output, --vcf and --diploid are all required\n")
        sys.exit(2)

    output_head = ['ID', 'chr', 'pos', 'Alt', 'N', 'Fq']

    # One pass over the vcf is enough: skip lines up to and including the
    # first one whose second character is not '#' (the column header line),
    # then treat every remaining line as a data line.
    vcf_file = gdc.open2(vcf_filename)
    try:
        with open(output_filename, 'w') as mycsv:
            datawriter = csv.writer(mycsv)
            datawriter.writerow(output_head)
            in_header = True
            for vcfline in vcf_file:
                line = vcfline.decode()
                if in_header:
                    if line[1:2] != '#':
                        in_header = False
                    continue

                fields = line.split()
                if not fields:  # tolerate blank lines
                    continue
                chrom, pos = fields[0], fields[1]
                chrpos = chrom + '_' + pos

                alt_count = 0
                total = 0
                for sample in fields[9:]:
                    if sample[0] == '.':
                        continue
                    alt_count, total = tally(alt_count, sample, d_flag, total)

                if total == 0:
                    continue
                # float() keeps this a true division on Python 2 as well.
                alt_fq = float(alt_count) / total
                datawriter.writerow(
                    [chrpos, chrom, pos, alt_count, total, alt_fq])
    finally:
        vcf_file.close()
示例#10
0
def load_from_arp(arp):
    """
    Parse an arp file into an internal format, returning site data, which is a 
    list of lists, with the sites on each chromosome, and gt data which is a 
    dict with infomation on the samples and genotypes. 
    I'm really just guessing what the structure of the file is.

    gt data maps each sample name to {"size": number of haplotypes,
    "gt": (N_sites x size) int array}, where every non-zero allele code is
    collapsed to 1.

    Raises Exception if a genotype string has the wrong length, or if the
    chromosome or site totals declared in the header disagree with the
    parsed site data.
    """
    arp_file = gdc.open2(arp)

    N_chr = -1
    N_sites = -1
    site_data = []
    gt_data = {}
    ascertained = False
    try:
        # The tests below are deliberately independent `if`s: several of them
        # advance the file and rebind `line`, and later tests see the new line.
        for line in arp_file:
            if line.startswith("#Number of independent chromosomes"):
                N_chr = int(line.split()[-1])
            if line.startswith(
                    "#Total number of polymorphic sites") and not ascertained:
                N_sites = int(line.split()[-1])
            if line.startswith("#ASCERTAINED DATA"):
                # From here on only the ascertained counts/positions are used.
                ascertained = True
                N_sites = -1
            if line.startswith(
                    "#Number of polym. sites meeting ascertainment criterion:"):
                N_sites = int(line.split()[-1])
            if "polymorphic positions on chromosome" in line and not ascertained:
                npos = int(line.split()[1])  # parsed but not otherwise used
                chrom = int(line.split()[-1])
                line = next(arp_file)
                site_data.append(None)
                # line[1:] drops the first character of the positions line.
                site_data[chrom - 1] = [
                    int(x.replace(",", "")) for x in line[1:].split()
                ]
            if line.startswith(
                    "#Ascertained polymorphic positions on chromosome"
            ) and ascertained:
                chrom = int(line.split()[-1])
                line = next(arp_file)
                npos = len(line.split())  # parsed but not otherwise used
                site_data.append(None)
                site_data[chrom - 1] = [
                    int(x.replace(",", "")) for x in line[1:].split()
                ]
            if "SampleName=" in line:
                sname = line.split("\"")[-2]
                line = next(arp_file)
                ssize = int(line.split("=")[-1])
                gt = np.zeros((N_sites, ssize), dtype='int')
                next(arp_file)  # "SampleData= {" line
                for i in range(ssize):
                    line = next(arp_file)
                    gt_string = line.split()[2]
                    if len(gt_string) != N_sites:
                        raise Exception("Wrong number of sites")
                    gt[:, i] = [int(int(x) > 0)
                                for x in gt_string]  # converting {1,2,3}->1
                gt_data[sname] = {"size": ssize, "gt": gt}
    finally:
        arp_file.close()

    if not N_chr == len(site_data):
        raise Exception("Number of chromosomes does not match site data")
    if not N_sites == sum([len(x) for x in site_data]):
        raise Exception("Total number of sites does not match")

    return site_data, gt_data
示例#11
0
def output_hetfa(options):
    """
    output a single hetfa 

    options is a dict with keys:
      "ref"      - reference fasta path
      "mask"     - optional mask fasta path (falsy for no mask)
      "out"      - output prefix; ".hetfa.fa.gz" is appended. If falsy the
                   sequence is written to stdout instead.
      "chrom"    - chromosome name, used for the fasta header and lookups
      "vcf"      - input vcf path, opened with gdc.open2
      "sample"   - name of the sample column to read
      "refcheck" - if true, verify the vcf REF base against the reference

    For each vcf position the sequence returned by get_ref_seq is written,
    followed by one character for the site itself: REF, ALT or the
    HETFA_MAP code for a single-base site with 0/1 alleles that is not
    masked, and "N" otherwise. Repeated positions are skipped.

    Raises Exception on a reference mismatch when "refcheck" is set.
    """
    ref_fa = Fasta(options["ref"])
    mask = None
    if options["mask"]:
        mask = Fasta(options["mask"])

    if options["out"]:
        out = gzip.open(options["out"] + ".hetfa.fa.gz", "w")
    else:
        out = sys.stdout

    vcf = None
    try:
        out.write(">" + options["chrom"] + "\n")

        vcf = gdc.open2(options["vcf"])
        sample_idx = None
        last_pos = 0
        for line in vcf:
            if line.startswith("##"):
                continue
            elif line.startswith("#"):
                bits = line.split()
                sample_idx = bits.index(options["sample"])
            else:  #data line
                bits = line.split()
                gt = bits[sample_idx]
                pos = int(bits[1])
                if pos == last_pos:
                    continue
                ref = bits[3]
                alt = bits[4]

                masked = check_mask(mask, options, pos)
                # Presumably the sequence from the last position to the base
                # before the current position (note that pos is 1-based).
                ref_seq = get_ref_seq(options, ref_fa, mask, last_pos, pos)

                # len(gt) >= 3 keeps a short genotype field (e.g. a haploid
                # call) from raising IndexError on gt[2]; it gets an "N".
                callable_site = (len(ref) == 1 and len(alt) == 1
                                 and len(gt) >= 3
                                 and gt[0] in ["0", "1"]
                                 and gt[2] in ["0", "1"]
                                 and not masked)

                if callable_site:  #This is a biallelic site
                    if options["refcheck"] and ref_fa[options["chrom"]][
                            pos - 1].seq != ref:
                        raise Exception("Reference mismatch at pos " +
                                        str(pos))

                    genotype = int(gt[0]) + int(gt[2])
                    if genotype == 0:
                        out.write(ref_seq + ref)
                    elif genotype == 1:
                        hetfa_code = HETFA_MAP[tuple(sorted([ref, alt]))]
                        out.write(ref_seq + hetfa_code)
                    elif genotype == 2:
                        out.write(ref_seq + alt)
                    else:
                        raise Exception(
                            "Untrapped bad genotype in haplotype 0 at pos" +
                            str(pos))

                else:  #This is either missing or multiallelic or masked
                    out.write(ref_seq + "N")

                last_pos = pos

        #Fill in the reference at the end and terminate with newline.
        tail_seq = ref_fa[options["chrom"]][last_pos:].seq
        out.write(tail_seq + "\n")
    finally:
        if vcf is not None:
            vcf.close()
        # Close the gzip stream so it is finalised; never close stdout.
        if out is not sys.stdout:
            out.close()
示例#12
0
def output_fastas(options):
    """
    output two .fa files, one for each chromosome. 

    options is a dict with keys:
      "ref"      - reference fasta path
      "mask"     - optional mask fasta path (falsy for no mask)
      "out"      - output prefix; ".0.fa.gz" and ".1.fa.gz" are appended
      "chrom"    - chromosome name, used for the fasta header and lookups
      "vcf"      - input vcf path, opened with gdc.open2
      "sample"   - name of the sample column to read
      "refcheck" - if true, verify the vcf REF base against the reference

    For each vcf position the sequence returned by get_ref_seq is written to
    both files, followed by one character for the site itself: the allele
    carried by that haplotype for an unmasked single-base site whose genotype
    is exactly 0|0, 0|1, 1|0 or 1|1, and "N" otherwise. Repeated positions
    are skipped.

    Raises Exception on a reference mismatch when "refcheck" is set.
    """
    ref_fa = Fasta(options["ref"])

    out0 = gzip.open(options["out"] + ".0.fa.gz", "w")
    out1 = gzip.open(options["out"] + ".1.fa.gz", "w")
    vcf = None
    try:
        out0.write(">" + options["chrom"] + "\n")
        out1.write(">" + options["chrom"] + "\n")

        mask = None
        if options["mask"]:
            mask = Fasta(options["mask"])

        vcf = gdc.open2(options["vcf"])
        sample_idx = None
        last_pos = 0
        for line in vcf:
            if line.startswith("##"):
                continue
            elif line.startswith("#"):
                bits = line.split()
                sample_idx = bits.index(options["sample"])
            else:  #data line
                bits = line.split()
                gt = bits[sample_idx]
                pos = int(bits[1])
                if pos == last_pos:
                    continue
                ref = bits[3]
                alt = bits[4]

                masked = check_mask(mask, options, pos)
                # Presumably the sequence from the last position to the base
                # before the current position (note that pos is 1-based).
                ref_seq = get_ref_seq(options, ref_fa, mask, last_pos, pos)

                if len(ref) == 1 and len(alt) == 1 and gt in [
                        "0|0", "1|0", "0|1", "1|1"
                ] and not masked:  #This is a phased biallelic site
                    if options["refcheck"] and ref_fa[options["chrom"]][
                            pos - 1].seq != ref:
                        raise Exception("Reference mismatch at pos " +
                                        str(pos))

                    if gt[0] == "0":
                        out0.write(ref_seq + ref)
                    elif gt[0] == "1":
                        out0.write(ref_seq + alt)
                    else:
                        raise Exception(
                            "Untrapped bad genotype in haplotype 0 at pos" +
                            str(pos))

                    if gt[2] == "0":
                        out1.write(ref_seq + ref)
                    elif gt[2] == "1":
                        out1.write(ref_seq + alt)
                    else:
                        raise Exception(
                            "Untrapped bad genotype in haplotype 1 at pos" +
                            str(pos))

                else:  #This is either unphased or missing or multiallelic
                    out0.write(ref_seq + "N")
                    out1.write(ref_seq + "N")

                last_pos = pos

        #Fill in the reference at the end and terminate with newline.
        tail_seq = ref_fa[options["chrom"]][last_pos:].seq
        out0.write(tail_seq + "\n")
        out1.write(tail_seq + "\n")
    finally:
        # Close the inputs and outputs even when a site raises.
        if vcf is not None:
            vcf.close()
        out0.close()
        out1.close()
示例#13
0
def main(options):
    """
    Iterate over the vcf and output one fasta file for each chromosome. 

    options is a dict with keys:
      "ref"      - reference fasta path
      "out"      - output prefix; ".0.fa.gz" and ".1.fa.gz" are appended
      "chrom"    - chromosome name, used for the fasta header and lookups
      "vcf"      - input vcf path, opened with gdc.open2
      "sample"   - name of the sample column to read
      "refcheck" - if true, verify the vcf REF base against the reference

    For a single-base site whose genotype is exactly 0|0, 0|1, 1|0 or 1|1,
    the reference sequence since the previous site is written to both files,
    followed by the allele carried by that haplotype. For any other site the
    whole stretch since the previous site is written as "N". Repeated
    positions are skipped.

    Raises Exception on a reference mismatch when "refcheck" is set.
    """

    ref_fa = Fasta(options["ref"])

    out0 = gzip.open(options["out"] + ".0.fa.gz", "w")
    out1 = gzip.open(options["out"] + ".1.fa.gz", "w")
    vcf = None
    try:
        out0.write(">" + options["chrom"] + "\n")
        out1.write(">" + options["chrom"] + "\n")

        vcf = gdc.open2(options["vcf"])
        sample_idx = None
        last_pos = 0
        for line in vcf:
            if line.startswith("##"):
                continue
            elif line.startswith("#"):
                bits = line.split()
                sample_idx = bits.index(options["sample"])
            else:  # data line
                bits = line.split()
                gt = bits[sample_idx]
                pos = int(bits[1])
                if pos == last_pos:
                    continue
                ref = bits[3]
                alt = bits[4]

                if (
                    len(ref) == 1 and len(alt) == 1 and gt in ["0|0", "1|0", "0|1", "1|1"]
                ):  # This is a phased biallelic site
                    # This is the sequence from the last position to the base before the current position (note that pos is 1-based)
                    ref_seq = ref_fa[options["chrom"]][last_pos : (pos - 1)].seq
                    if options["refcheck"] and ref_fa[options["chrom"]][pos - 1].seq != ref:
                        raise Exception("Reference mismatch at pos " + str(pos))

                    if gt[0] == "0":
                        out0.write(ref_seq + ref)
                    elif gt[0] == "1":
                        out0.write(ref_seq + alt)
                    else:
                        raise Exception("Untrapped bad genotype in haplotype 0 at pos" + str(pos))

                    if gt[2] == "0":
                        out1.write(ref_seq + ref)
                    elif gt[2] == "1":
                        out1.write(ref_seq + alt)
                    else:
                        raise Exception("Untrapped bad genotype in haplotype 1 at pos" + str(pos))

                else:  # This is either unphased or missing or multiallelic
                    # NOTE(review): this masks the whole gap since the last
                    # site, not just the site itself - confirm this is intended.
                    out0.write("N" * (pos - last_pos))
                    out1.write("N" * (pos - last_pos))

                last_pos = pos

        # Fill in the reference at the end and terminate with newline.
        tail_seq = ref_fa[options["chrom"]][last_pos:].seq
        out0.write(tail_seq + "\n")
        out1.write(tail_seq + "\n")
    finally:
        # Close the gzip streams so they are finalised, even on error.
        if vcf is not None:
            vcf.close()
        out0.close()
        out1.close()