示例#1
0
############################# program #############################

# Counter starts at zero; nothing in the visible part of the script updates it.
counter = 0

# Open the ancestral-state file and consume its first line (the header).
# The value stored here is overwritten below before it is ever used.
ances = open(args.ancestral, 'r')
ances_words = ances.readline()

# Output handle; it is not closed anywhere in the visible part of the script.
output = open(args.output, 'w')

print('Opening the file...')
with open(args.input) as datafile:
    # The first line of the input is the header; split it on whitespace.
    header_words = datafile.readline().split()

    # Locate the requested samples in the header (helper lives in `calls`).
    sampCol = calls.indexSamples(sampleNames, header_words)

    print('Creating the output file...')
    # Output header: four fixed column names followed by the selected sample
    # names, all tab-separated.
    samples_head = calls.selectSamples(sampCol, header_words)
    samples_headP = '\t'.join(str(e) for e in samples_head)
    output.write('%s\t%s\t%s\t%s\t%s\n' % ('CHROM', 'POS', 'ANC',
                           'DER', samples_headP))

    # Read the second line of the ancestral file (its first data line).
    ances_words = ances.readline().split()
    # The first field is either "<prefix>_<number>" or "chr<number>"; keep the
    # part after the separator as an int.
    if '_' in ances_words[0]:
        ances_ch = int(ances_words[0].split('_')[1])
    else:
        ances_ch = int(ances_words[0].split('chr')[1])
    ances_pos = int(ances_words[1])
    # splitAncestral is defined outside the visible part of the file.
    ances_gt = splitAncestral(ances_words[2])
# Resolve the sample names to process from the -s argument and the input file
# (calls.checkSampleNames is defined elsewhere).
sampleNames = calls.checkSampleNames(args.samples, args.input)

############################# program #############################

counter = 0
siteNumber = 1
# Chromosome of the previous line, kept as a string. Starting at '1' means the
# first output file is always named '1_<output>'.
chrPrev = str(1)

print('Opening the file...')

with open(args.input) as datafile:
  # The first line is the header; keep both the raw line and its fields.
  header_line = datafile.readline()
  header_words = header_line.split()

  # index samples
  sampCol = calls.indexSamples(sampleNames, header_words)

  print('Creating the output file...')
  # Output files are named '<chromosome>_<output>'.
  fileoutput = open(chrPrev+'_'+args.output, 'w')

  for line in datafile:
    words = line.split()
    # Chromosome label = text after the first underscore of the first field.
    # A field without an underscore raises IndexError here.
    # NOTE(review): `chr` shadows the built-in chr() for the rest of the module.
    chr = str(words[0].split('_')[1])
    pos = words[1]

    # split chromosomes into separate files: when the chromosome changes,
    # close the current file, open the next one and restart the site counter
    if chr != chrPrev:
      fileoutput.close()
      fileoutput = open(chr+'_'+args.output, 'w')
      chrPrev = chr
      siteNumber = 1
示例#3
0
# Counter starts at zero; nothing in the visible part of the script updates it.
counter = 0

print('Opening the file...')

with open(args.input) as datafile:
    # The first line of the input is the header; split it on whitespace.
    header_words = datafile.readline().split()

    # make output header: the population names (`pops`, defined elsewhere),
    # tab-separated on a single line
    print('Creating the output file...')
    fileoutput = open(args.output, 'w')
    popsP = '\t'.join(str(w) for w in pops)
    fileoutput.write("%s\n" % popsP)

    for popName in pops:
        # index samples: for every population, read the variable named
        # '<pop>samples' and store the result in a variable named '<pop>Index'.
        # NOTE(review): writing through vars() only creates real variables when
        # this code runs at module level - confirm it is never moved into a
        # function.
        vars()[popName + "Index"] = calls.indexSamples(
            vars()[popName + "samples"], header_words)

    for line in datafile:
        words = line.split()
        # Genotype fields start at the third column.
        GT = words[2:]
        GTpair = [i for i in list(set(GT))
                  if i != 'N']  # get the set of alleles, skip missing alleles
        popNum = 0

        #if ("N" not in GT) and (len(GTpair) == 2) : # skip missing data or non-biallelic
        if (len(GTpair) == 2):  # skip non-biallelic
            for popName in pops:
                popNum += 1  # to make correct output. See below
                # select genotypes of the current population from this line
                sGT = calls.selectSamples(vars()[popName + "Index"], words)
############################# program #############################

def all_same(items):
    """Return True when every element of *items* equals its first element.

    *items* must support indexing and slicing. An empty or single-element
    sequence yields True because there is nothing to compare.
    """
    for candidate in items[1:]:
        if not candidate == items[0]:
            return False
    return True

# Counter starts at zero; nothing in the visible part of the script updates it.
counter = 0
# The output starts with a fixed four-column, tab-separated header line.
output = open(args.output, 'w')
output.write("#CHR\tPOS\tCommon_alleles\tRare_alleles\n")

print('Opening the file...')

with open(args.input) as datafile:
    # The first line of the input is the header; split it on whitespace.
    header_words = datafile.readline().split()

    # index samples (sNames holds the requested names at this point; it is
    # defined outside the visible part of the file)
    sIndex = calls.indexSamples(sNames, header_words)

    # create lists for output: sNames is rebound to the header entries found
    # at the indexed columns
    sNames = calls.selectSamples(sIndex, header_words)
    
    for line in datafile:
        words = line.split()
        # The first two fields are echoed as a tab-separated pair.
        chr_pos = words[0:2]
        chr_posP = '\t'.join(str(e) for e in chr_pos)

        # select samples
        sGT = calls.selectSamples(sIndex, words)

        # define two or one character code: when every selected genotype is a
        # single character, convert it with calls.OneToTwo
        if all(len(i) == 1 for i in  sGT):
            alleles = calls.OneToTwo(sGT)
# Optional -s/--samples argument: column names of the samples to process.
parser.add_argument('-s', '--samples', help = 'column names of the samples to process (optional)', type=str, required=False)
args = parser.parse_args()

# check if samples names are given and if all sample names are present in a header
sampleNames = calls.checkSampleNames(args.samples, args.tab)

############################# program #############################

print('Opening the file...')

# Counter starts at zero; nothing in the visible part of the script updates it.
counter = 0

# Annotation file: its first line lists the available field names. The handle
# is not closed anywhere in the visible part of the script.
siftFile = open(args.annotation, 'r')
annotOptions = siftFile.readline().split()
# The requested fields arrive as one comma-separated string; look them up in
# the annotation header with the same helper used for sample columns.
fieldsNames = args.fields.split(',')
fieldsIndex = calls.indexSamples(fieldsNames, annotOptions)

# Read the first annotation record. The chromosome is the integer after the
# first underscore of field 0; the position is field 1.
sift_words = siftFile.readline().split()
sift_chr = int(sift_words[0].split('_')[1])
sift_pos = int(sift_words[1])

with open(args.tab) as datafile:
  # The first line of the table is the header; split it on whitespace.
  header_words = datafile.readline().split()

  # index samples
  sampCol = calls.indexSamples(sampleNames, header_words)

  # make output header: the first two header fields plus the selected samples
  print('Creating the output file...')
  output = open(args.output, 'w')
  # NOTE(review): the name is spelled `ouput_header`; renaming it needs the
  # code that follows this excerpt to be updated as well.
  ouput_header = header_words[0:2] + calls.selectSamples(sampCol, header_words)
# First line written to outputPhy: the two counts NumberSamp and NumberPos
# (both defined outside the visible part of the file).
outputPhy.write(' %s %s\n' % (NumberSamp, NumberPos))

# process one sample per time to reduce RAM usage
# (the whole input file is re-read once for every sample)
for sample in sampleNames:

    # write sample name into file
    outputFasta.write(">%s\n" % sample)
    outputPhy.write("%s  " % sample)

    fastaLim = 0  # counter to split sequence in multi-line fasta

    with open(args.input) as datafile:
        # The first line of the input is the header; split it on whitespace.
        header_words = datafile.readline().split()

        # index a sample
        sampCol = calls.indexSamples([sample], header_words)

        for line in datafile:
            words = line.split()

            # Value(s) of the current sample on this line.
            genotype = calls.selectSamples(sampCol, words)

            # output only single nucleotide genotypes, insertions are replaced with N.
            # NOTE(review): if selectSamples returns a one-element list here,
            # this length test counts the selected samples rather than the
            # characters of the genotype - confirm that len(genotype[0]) was
            # not intended.
            if len(genotype) == 1:
                outputFasta.write(genotype[0])
                outputPhy.write(genotype[0])
            else:
                outputFasta.write('N')
                outputPhy.write('N')

            # to split sequence in multi-line fasta
# Parse the family specification. After stripping surrounding double quotes,
# entries are separated by ';' and each entry has the form
#   name[sample1,sample2,...]
for i in familyNames.strip("\"").split(";"):
    # Family name: everything before the opening bracket.
    famName = i.split("[")[0]
    # Sample list: the text between the brackets. The pattern must not contain
    # an empty alternative (the original ended in a trailing '|'): since
    # Python 3.7 re.split() also splits on empty matches, which made element 1
    # the first character of the family name instead of the sample list.
    # A raw string avoids the invalid "\[" / "\]" escape sequences.
    famSample = re.split(r"\[|\]", i)[1]
    Fsamples.append(famSample.split(","))
    familySamples[famName] = calls.checkSampleNames(famSample, args.input)
# One flat list with the samples of all families.
samples = calls.flattenList(Fsamples)
# Run the same name check for the ANC and DER columns of the input.
calls.checkSampleNames('ANC', args.input)
calls.checkSampleNames('DER', args.input)

############################# program #############################

# Output file named '<output>.snps'; it is not closed anywhere in the visible
# part of the script.
outputSNPs = open(args.output + '.snps', 'w')

with open(args.input) as datafile:
    # The first line of the input is the header; split it on whitespace.
    header_words = datafile.readline().split()
    # Header lookups for all family samples and for the ANC and DER columns.
    sampleIndex = calls.indexSamples(samples, header_words)
    ANCindex = calls.indexSamples(['ANC'], header_words)
    DERindex = calls.indexSamples(['DER'], header_words)

    # Per-family bookkeeping, keyed by family name: the header lookup for the
    # family's samples, and an initially empty list.
    FamilyIndex = {}
    phasedLines = {}
    for family in familySamples:
        FamilyIndex[family] = calls.indexSamples(familySamples[family],
                                                 header_words)
        phasedLines[family] = []

    for line in datafile:
        words = line.split()
        # The first two fields are joined into a '<CHR>_<POS>' string.
        CHR = words[0]
        POS = words[1]
        chr_pos = CHR + '_' + POS