# Fragment: write the rows read from `datafile` into one output file per
# chromosome, each named '<chr>_' + args.output.
# NOTE(review): `chrPrev`, `siteNumber`, `sampCol` and `datafile` are defined
# outside this excerpt; confirm their initial values there.
fileoutput = open(chrPrev+'_'+args.output, 'w')

  for line in datafile:
    words = line.split()
    # chromosome label = second '_'-separated piece of the first column
    # NOTE(review): the name `chr` shadows the builtin chr()
    chr = str(words[0].split('_')[1])
    pos = words[1]

    # split chromosomes into separate files
    if chr != chrPrev:
      # chromosome changed: switch to a new output file and restart the
      # site counter at 1
      fileoutput.close()
      fileoutput = open(chr+'_'+args.output, 'w')
      chrPrev = chr
      siteNumber = 1

    # select samples
    genotypes = calls.selectSamples(sampCol, words)

    # count Ns
    valueN = calls.countPerPosition(genotypes, 'N')

    # rows whose N count exceeds args.missing are skipped entirely
    if valueN <= args.missing:
      genotypesMerged = ''.join(str(e) for e in genotypes)
      # 'N' is replaced by '?' in the written genotype string
      genotypesMergedP = genotypesMerged.replace('N', '?')
    else:
      continue

    # count the number of called sites: rows for which calls.is_polymorphic()
    # is false only increment siteNumber; all other rows are written out
    # together with the current siteNumber
    if not calls.is_polymorphic(genotypes):
      siteNumber += 1
    else:
      fileoutput.write("%s\t%s\t%s\t%s\n" % (chr, pos, siteNumber, genotypesMergedP))
    # Fragment: for every data row write "<chr>\t<pos>\t<mean depth>", where
    # the mean is taken over the selected sample values that are digit-only
    # strings. `datafile`, `outputFile`, `counter`, `sampleNames` and
    # `header_words` are defined outside this excerpt.
    sampCol = calls.indexSamples(sampleNames, header_words)

    # count number of samples
    nSample = len(sampleNames)
    ############################## perform counting ####################

    print('Calculating ...')
    nonMissDP = []

    for line in datafile:
        words = line.split()
        Chr = words[0]
        pos = int(words[1])

        # select samples
        sample_num = calls.selectSamples(sampCol, words)

        # sum up: keep digit-only values, silently skip "NA", warn on
        # anything else (note that str.isdigit() is False for e.g. "1.5")
        for i in sample_num:
            if i.isdigit():
                nonMissDP.append(float(i))
            elif i == "NA":
                continue
            else:
                warnings.warn("%s is not numeric at the line %s" %
                              (i, counter + 1))

        # make output
        if nonMissDP:
            depth = round(sum(nonMissDP) / float(len(nonMissDP)), 2)
        else:
            # every selected value was "NA" or non-numeric: report a missing
            # mean instead of dividing by zero
            depth = "NA"
        outputFile.write("%s\t%s\t%s\n" % (Chr, pos, depth))
        # reset the accumulator for the next row
        nonMissDP = []
Exemplo n.º 3
0
# Fragment: open the ancestral file and the output file, write the output
# header and read the first ancestral record before looping over the input.
# `args`, `calls`, `sampleNames` and `splitAncestral` are defined outside this
# excerpt.
counter = 0

# read the header
# the first line of the ancestral file is consumed here; `ances_words` is
# overwritten with the second line further below
# NOTE(review): `ances` and `output` are not closed within this excerpt
ances = open(args.ancestral, 'r')
ances_words = ances.readline()

output = open(args.output, 'w')

print('Opening the file...')
with open(args.input) as datafile:
    header_words = datafile.readline().split()

    sampCol = calls.indexSamples(sampleNames, header_words)

    # output header: fixed CHROM/POS/ANC/DER columns followed by the selected
    # sample names
    print('Creating the output file...')
    samples_head = calls.selectSamples(sampCol, header_words)
    samples_headP = '\t'.join(str(e) for e in samples_head)
    output.write('%s\t%s\t%s\t%s\t%s\n' % ('CHROM', 'POS', 'ANC',
                           'DER', samples_headP))

    # read the second line of the ancestral file
    ances_words = ances.readline().split()
    # the chromosome number is the integer after '_' when the first column
    # contains an underscore, otherwise the integer after the text 'chr'
    if '_' in ances_words[0]:
        ances_ch = int(ances_words[0].split('_')[1])
    else:
        ances_ch = int(ances_words[0].split('chr')[1])
    ances_pos = int(ances_words[1])
    ances_gt = splitAncestral(ances_words[2])

    # NOTE(review): the loop body continues beyond this excerpt
    for line in datafile:
        words = line.split()
Exemplo n.º 4
0
    # Fragment: loop over the data rows and flush a window result when the
    # chromosome changes. `ChrPrevious`, `posS`, `Hwindow`, `Twindow`,
    # `windSize`, `meanWindow`, `sampCol` and `outputFile` are defined outside
    # this excerpt.
    posE = ''
    for line in datafile:
        words = line.split()
        Chr = words[0]
        pos = int(words[1])

        # to store the values of a previous line
        # each value is filled in only while it is still falsy
        # NOTE(review): a position of 0 is falsy and would be treated as unset
        if not ChrPrevious:
            ChrPrevious = Chr
        if not posS:
            posS = pos
        if not posE:
            posE = pos

        # select samples
        sample_charaters = calls.selectSamples(sampCol, words)

        # check if one- or two-character code
        # genotypes containing "/" are converted with calls.twoToOne
        if any(["/" in gt for gt in sample_charaters]):
            sample_charaters = calls.twoToOne(sample_charaters)

        # if window size is reached output the results
        # NOTE(review): Chr comes from line.split(), so this is a string
        # comparison -- confirm that the chromosome names sort as intended
        if Chr > ChrPrevious:  # if end of a chromosome
            # any failure in meanWindow/round is reported as "NA"
            try:
                HeterWindow = round(meanWindow(Hwindow, Twindow), 4)
            except Exception:
                HeterWindow = "NA"
            calls.processWindow(ChrPrevious, posS, posE, HeterWindow,
                                outputFile)
            # restart the window bookkeeping
            windPosEnd = windSize
            Hwindow = []
Exemplo n.º 5
0
        vars()[popName + "Index"] = calls.indexSamples(
            vars()[popName + "samples"], header_words)

    # Fragment: for each row with exactly two distinct non-'N' values, write
    # one "count1,count2" field per population, tab-separated, one row per
    # line. `pops`, `fileoutput`, `counter`, `Counter` and the
    # vars()[popName + "Index"] entries are defined outside this excerpt.
    for line in datafile:
        words = line.split()
        # genotype columns start at the third field
        GT = words[2:]
        GTpair = [i for i in list(set(GT))
                  if i != 'N']  # get the set of alleles, skip missing alleles
        # NOTE: GTpair is built from a set, so which allele comes first is not
        # a defined order; it is the same for every population within a row
        popNum = 0

        #if ("N" not in GT) and (len(GTpair) == 2) : # skip missing data or non-biallelic
        if (len(GTpair) == 2):  # skip non-biallelic
            for popName in pops:
                popNum += 1  # to make correct output. See below
                # select genotypes
                sGT = calls.selectSamples(vars()[popName + "Index"], words)

                counts = Counter(sGT)  # count alleles
                bicounts = [counts[GTpair[0]],
                            counts[GTpair[1]]]  # extract counts
                bicountsP = ','.join(str(w) for w in bicounts)

                # make output
                # the last population ends the line, the others end with a tab
                if popNum == len(pops):
                    fileoutput.write("%s\n" % bicountsP)
                else:
                    fileoutput.write("%s\t" % bicountsP)

        # track progress
        counter += 1
        if counter % 1000000 == 0:
Exemplo n.º 6
0
############################# program #############################
# Fragment: copy the first two columns plus the selected sample columns of
# args.input into args.output, header included. `args`, `calls` and
# `sampleNames` are defined outside this excerpt.

counter = 0

print('Opening the file...')

with open(args.input) as datafile:
    header_words = datafile.readline().split()

    # index samples
    sampCol = calls.indexSamples(sampleNames, header_words)

    # make output header
    # NOTE(review): `fileoutput` is not closed within this excerpt
    print('Creating the output file...')
    fileoutput = open(args.output, 'w')
    # indices 0 and 1 are prepended so the header keeps the first two columns
    sampHeader = calls.selectSamples([0, 1] + sampCol, header_words)
    sampHeaderP = '\t'.join(str(el) for el in sampHeader)
    fileoutput.write(sampHeaderP + '\n')

    for line in datafile:
        words = line.split()
        chr_pos = words[0:2]

        # select samples
        genotypes = calls.selectSamples(sampCol, words)

        # make output
        # written row = first two columns, then the selected columns,
        # all tab-separated
        chr_posP = '\t'.join(str(el) for el in chr_pos)
        genotypesP = '\t'.join(str(el) for el in genotypes)
        fileoutput.write('%s\t%s\n' % (chr_posP, genotypesP))
Exemplo n.º 7
0
    fileoutput = open(chrPrev + '_' + args.output, 'w')

    for line in datafile:
        words = line.split()
        chr = str(words[0].split('_')[1])
        pos = words[1]

        # split chromosomes into separate files
        if chr != chrPrev:
            fileoutput.close()
            fileoutput = open(chr + '_' + args.output, 'w')
            chrPrev = chr
            siteNumber = 1

        # select samples
        genotypes = calls.selectSamples(sampCol, words)

        # count Ns
        valueN = calls.countPerPosition(genotypes, 'N')

        if valueN <= args.missing:
            genotypesMerged = ''.join(str(e) for e in genotypes)
            genotypesMergedP = genotypesMerged.replace('N', '?')
        else:
            continue

        # count the number of called sites
        if not calls.is_polymorphic(genotypes):
            siteNumber += 1
        else:
            fileoutput.write("%s\t%s\t%s\t%s\n" %
    return all(x == items[0] for x in items[1:])

# Fragment: write the output header and start the per-row loop that
# normalises the genotype coding. `args`, `calls` and the initial `sNames`
# are defined outside this excerpt.
counter = 0
# NOTE(review): `output` is not closed within this excerpt
output = open(args.output, 'w')
output.write("#CHR\tPOS\tCommon_alleles\tRare_alleles\n")

print('Opening the file...')

with open(args.input) as datafile:
    header_words = datafile.readline().split()

    # index samples
    sIndex = calls.indexSamples(sNames, header_words)

    # create lists for output
    # `sNames` is rebound to the result of selecting the indexed columns from
    # the header
    sNames = calls.selectSamples(sIndex, header_words)
    
    for line in datafile:
        words = line.split()
        chr_pos = words[0:2]
        chr_posP = '\t'.join(str(e) for e in chr_pos)

        # select samples
        sGT = calls.selectSamples(sIndex, words)

        # define two or one character code
        # all values one character long -> convert with calls.OneToTwo;
        # all values containing "/" -> use them unchanged
        # NOTE(review): no branch for other inputs is visible in this excerpt
        if all(len(i) == 1 for i in  sGT):
            alleles = calls.OneToTwo(sGT)
        elif all("/" in i for i in  sGT):
            alleles = sGT
  # Fragment: filter rows by missing data and advance the reference file
  # until it reaches the position of the current row. `counter`, `ref`,
  # `ref_ch`, `ref_pos`, `sampleNames` and `header_words` are defined outside
  # this excerpt.
  # index samples
  sampCol = calls.indexSamples(sampleNames, header_words)
    
  for line in datafile:
    # track progress
    # NOTE(review): `print` is used as a Python 2 statement on the line below
    counter += 1
    if counter % 1000000 == 0:
      print str(counter), "lines processed"

    words = line.split()
    chr_pos = words[0:2]
    # chromosome number = integer in the second '_'-separated piece of the
    # first column
    ch = int(words[0].split('_')[1])
    pos = int(words[1])

    # select samples
    alleles = calls.selectSamples(sampCol, words)

    # count Ns
    valueN = calls.countPerPosition(alleles, 'N')

    # rows whose N count exceeds args.missing are skipped; otherwise the
    # non-'N' values are kept
    if valueN <= args.missing:
      Allalleles = [i for i in alleles if i != 'N']
    else:
      continue

    # find overlap with the ancestor
    # read reference lines while the current row lies beyond the reference
    # position (ref_ch, ref_pos)
    while ch > ref_ch or (ch == ref_ch and pos > ref_pos):
      words2 = ref.readline().split()
      # an empty split result (end of file or blank line) ends the search
      # with ancest = 'N'
      if words2 == []:
        ancest = 'N'
        break
# Fragment: read the header of args.input and build a merged header in which
# every two consecutive selected columns (expected to share the name before
# the first "_") collapse into a single column name. `args`, `calls` and
# `sampleNames` are defined outside this excerpt.
counter = 0

print('Opening the file...\n')

with open(args.input) as datafile:
    header_words = datafile.readline().split()

    ChrPos = header_words[0:2]
    ChrPosP = '\t'.join(str(e) for e in ChrPos)

    # index samples
    sampCol = calls.indexSamples(sampleNames, header_words)

    # create lists for output
    sampColnames = calls.selectSamples(sampCol, header_words)

    # create merged column names
    # an odd number of columns means the last one has no partner; report it
    # with the same exception type as a name mismatch instead of letting
    # sampColnames[i + 1] fail with a bare IndexError
    if len(sampColnames) % 2 != 0:
        raise KeyError(
            "Sample is not paired. Sample name %s has no partner column"
            % sampColnames[-1].split("_")[0])

    idsheader = []
    # walk the columns two at a time: i is the first column of each pair
    for i in range(0, len(sampColnames), 2):
        name1 = sampColnames[i].split("_")[0]
        name2 = sampColnames[i + 1].split("_")[0]
        if name1 == name2:
            idsheader.append(name1)
        else:
            raise KeyError(
                "Sample is not paired. Sample name %s doesn't equal to sample name %s"
                % (name1, name2))
    # built once after the loop, so it is also defined when no columns were
    # selected
    header = '\t'.join(str(e) for e in idsheader)
############################# program #############################
# Fragment: set up per-sample counters and start counting 'N' values per
# row. `args`, `calls` and `sampleNames` are defined outside this excerpt.

counter = 0

print('Opening the file...')

with open(args.input) as datafile:
    header_line = datafile.readline()
    header_words = header_line.split()

    # index samples
    sampCol = calls.indexSamples(sampleNames, header_words)

    # create lists for output
    sampColnames = calls.selectSamples(sampCol, header_words)
    # one zero-initialised counter per selected sample column
    sampNs = [0 for i in sampColnames]

    print('Counting Ns ...')

    Ns = []

    for line in datafile:
        words = line.split()
        chr_pos = words[0:2]

        # select samples
        sample_charaters = calls.selectSamples(sampCol, words)

        # count Ns per position
        contNsOnly = calls.countPerPosition(sample_charaters, 'N')
# Fragment: write the output header and start the per-row loop that collects
# modified sample scores. `args` and `calls` are defined outside this
# excerpt.
sampleNames = calls.checkSampleNames(args.stats, args.input)

############################# program #############################

# NOTE(review): `output` is not closed within this excerpt
output = open(args.output, 'w')

counter = 0

print('Opening the file...')

with open(args.input) as datafile:
    header_line = datafile.readline()
    header_words = header_line.split()

    sampCol = calls.indexSamples(sampleNames, header_words)
    sampColnames = calls.selectSamples(sampCol, header_words)
    sampColnamesP = '\t'.join(str(e) for e in sampColnames)

    # output header: the first two header fields followed by the selected
    # sample names
    output.write("%s\t%s\t%s\n" %
                 (header_words[0], header_words[1], sampColnamesP))

    print('Adding noise ...')

    for line in datafile:
        words = line.split()
        chr_pos = words[0:2]
        sample_scores = calls.selectSamples(sampCol, words)

        # filled per row by the loop that follows this excerpt
        sample_scoresNoise = []

        for s in sample_scores:
# Fragment: read the first SIFT record, write the output header and start
# looping over the tab file. `siftFile`, `fieldsNames`, `annotOptions`,
# `sampleNames`, `args` and `calls` are defined outside this excerpt.
fieldsIndex = calls.indexSamples(fieldsNames, annotOptions)

# first record of the SIFT file: the chromosome number is the integer in the
# second '_'-separated piece of column 0, the position is column 1
sift_words = siftFile.readline().split()
sift_chr = int(sift_words[0].split('_')[1])
sift_pos = int(sift_words[1])

with open(args.tab) as datafile:
  header_words = datafile.readline().split()

  # index samples
  sampCol = calls.indexSamples(sampleNames, header_words)

  # make output header
  # header = first two header fields plus the selected sample names
  # NOTE(review): `output` is not closed within this excerpt
  print('Creating the output file...')
  output = open(args.output, 'w')
  ouput_header = header_words[0:2] + calls.selectSamples(sampCol, header_words)
  ouput_headerP = '\t'.join(str(el) for el in ouput_header)
  output.write('%s\n' % ouput_headerP)

############################### perform counting ####################

  for line in datafile:
    words = line.split()
    # same chromosome/position parsing as for the SIFT record above
    ch = int(words[0].split('_')[1])
    pos = int(words[1])

    # select samples
    tab_charaters = calls.selectSamples(sampCol, words)

    # find overlap in genomic position
    while (ch > sift_chr) or (ch == sift_chr and pos > sift_pos):
# Fragment: for every row and every group, pick the most frequent non-'N'
# genotype among the group's samples. `groups`, `args`, `calls`, `random` and
# the vars()[<group> + "samples"] entries are defined outside this excerpt.
with open(args.callsFile) as callsFile:
    header_words = callsFile.readline().split()

    # index samples according to the header
    # the column indices of each group are stored under the dynamically built
    # name '<group>Index'
    for groupName in groups:
        vars()[groupName + "Index"] = calls.indexSamples(
            vars()[groupName + "samples"], header_words)

    for line in callsFile:
        words = line.split()
        CHR = words[0]
        POS = int(words[1])
        # prepare the genotypes
        for popName in groups:
            sGT = calls.selectSamples(vars()[popName + "Index"],
                                      words)  # select genotypes per group
            sGTnoN = [i for i in sGT if i != 'N']  # remove missing data
            if sGTnoN != []:  # if not all GT are missing
                sGTnoNset = list(set(sGTnoN))  # find set of alleles
                random.shuffle(
                    sGTnoNset)  # shuffle list to deal with a tie case.
                # max() returns the first maximal item, so shuffling the
                # candidates beforehand randomises which tied allele wins
                mostFreqGT = max(
                    sGTnoNset, key=sGTnoN.count
                )  # find the most frequent allele. A tie is solved randomly.
            else:
                # every genotype of this group is 'N' on this row
                mostFreqGT = 'NA'

            if CHR != CHRprev:
                try:
                    fastaSeqP = ''.join(
                        str(w) for w in vars()[popName + "fastaSeq"])
    # Fragment: index the ANC/DER and per-family columns, then reformat the
    # family genotypes of every row that passes calls.is_biallelic().
    # `familySamples`, `sampleIndex`, `header_words`, `datafile` and `calls`
    # are defined outside this excerpt.
    ANCindex = calls.indexSamples(['ANC'], header_words)
    DERindex = calls.indexSamples(['DER'], header_words)

    # per family: the column indices of its samples and an (initially empty)
    # list of lines
    FamilyIndex = {}
    phasedLines = {}
    for family in familySamples:
        FamilyIndex[family] = calls.indexSamples(familySamples[family],
                                                 header_words)
        phasedLines[family] = []

    for line in datafile:
        words = line.split()
        CHR = words[0]
        POS = words[1]
        chr_pos = CHR + '_' + POS
        # only the first selected value is used for ANC and DER
        ANC = calls.selectSamples(ANCindex, words)[0]
        DER = calls.selectSamples(DERindex, words)[0]
        allSeq = calls.selectSamples(sampleIndex, words)

        if calls.is_biallelic(allSeq):
            for family in FamilyIndex:
                famSeq = calls.selectSamples(FamilyIndex[family], words)
                GTsplit = []
                splitfamSeq = []
                # '/' and '|' separators become spaces and '.' becomes '9'
                for GT in famSeq:
                    GT = GT.replace('/', ' ')
                    GT = GT.replace('|', ' ')
                    GT = GT.replace('.', '9')
                    GTsplit.append(GT)
                # all genotypes of the family joined into one space-separated
                # string
                splitSeq = ' '.join(str(e) for e in GTsplit)
                splitfamSeq.append(splitSeq)
# Fragment: finish argument parsing, then write the selected sample columns
# of every row whose chromosome equals that of the previously seen row.
# `parser`, `calls` and `output` are defined outside this excerpt.
parser.add_argument('-s', '--samples', help = 'column names of the samples to process (optional)', type=str, required=False)
args = parser.parse_args()

# check if samples names are given and if all sample names are present in a header
sampleNames = calls.checkSampleNames(args.samples, args.input)

############################# program #############################

# 'NA' serves as the initial "no chromosome seen yet" value
CHRprev = 'NA'

with open(args.input) as datafile:
    header_words = datafile.readline().split()

    # index a sample
    sampCol = calls.indexSamples(sampleNames, header_words)
    header_samples = calls.selectSamples(sampCol, header_words)
    header_samplesP = '\t'.join(str(e) for e in header_samples)

    for line in datafile:
        words = line.split()
        CHR = words[0]
        Pos = words[1]

        samples = calls.selectSamples(sampCol, words)
        samplesP = '\t'.join(str(e) for e in samples)

        # find chromosome border
        # a row on the same chromosome as the previous row is written as is;
        # on a chromosome change the visible code only updates CHRprev
        # NOTE(review): the else branch may continue beyond this excerpt
        if CHRprev == CHR:
            output.write('%s\t%s\t%s\n' % (CHR, Pos, samplesP))
        else:
            CHRprev = CHR