Example #1
import argparse
import glob

# The helper functions used below (getDict, mergeDicts, mergeTheSFS, pi) are
# assumed to be defined elsewhere in this module.


def main():
    parser = argparse.ArgumentParser(
        description="Extract the SFS from a bunch of merged SLiM output files")

    parser.add_argument(
        "-i",
        "--input",
        required=True,
        dest="input",
        type=str,
        help="The name of the input file (or the input directory)")
    parser.add_argument(
        "-o",
        "--output",
        required=True,
        dest="output",
        type=str,
        help="The name of the output file you want to write to")
    parser.add_argument(
        "-d",
        "--dir",
        required=False,
        dest="dir",
        action='store_true',
        help=
        "Use this flag if you want to combine a number of files in the same directory",
        default=False)

    args = parser.parse_args()

    if args.dir:
        files = glob.glob(args.input + '/*sfs')
    else:
        files = [args.input]
    # Total number of sites: each input file summarises 1e6 sites
    sites = float(len(files)) * 1e6

    # Get a list of dictionaries containing the SFSs
    dicts = [getDict(i) for i in files]

    nonsynSFS, synSFS = mergeDicts(dicts)

    # Pad the invariant class so the spectra sum to the expected numbers of
    # nonsynonymous (3/4) and synonymous (1/4) sites
    nonsynSFS[0] += (sites * 0.75) - sum(nonsynSFS[1:])
    synSFS[0] += (sites * 0.25) - sum(synSFS[1:])
    # Divergence: the fraction of sites in the fixed class
    ds = float(synSFS[-1]) / sum(synSFS)
    dn = float(nonsynSFS[-1]) / sum(nonsynSFS)
    print('nonsyn pi:', pi(nonsynSFS))
    print('syn pi:', pi(synSFS))
    print('pi / pi_0:', pi(synSFS) / 0.01)
    print('dN:', dn)
    print('dS:', ds)

    # Contribution of advantageous (m2) mutations to polymorphism, counting
    # only the segregating classes (excluding the invariant and fixed bins)
    m1_poly = sum(mergeTheSFS([d['m1'] for d in dicts])[1:-1])
    m2_poly = sum(mergeTheSFS([d['m2'] for d in dicts])[1:-1])
    print('adv. cont:', float(m2_poly) / (m1_poly + m2_poly))

    print('dN/dS:', dn / ds)
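A hypothetical invocation (the script name extract_sfs.py and the paths are placeholders, not from the source). With --dir set, every *sfs file under the input directory is merged; without it, --input names a single file. Note that the required --output argument is parsed but never used in this example's body:

    python extract_sfs.py --input sims/ --output summary.txt --dir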
Example #2
import argparse
import glob
import subprocess

import SFS  # local module providing pi() and merge_SFS(); assumed available

# getSFSfromSLiM is assumed to be defined elsewhere in this module.


def main():
    parser = argparse.ArgumentParser(
        description="Extract the SFS from a bunch of merged SLiM output files")

    parser.add_argument(
        "-i",
        "--input",
        required=True,
        dest="input",
        type=str,
        help="The name of the directory containing the SLiM output")

    parser.add_argument(
        "-o",
        "--output",
        required=True,
        dest="output",
        type=str,
        help="The name of the output file you want to write to")

    args = parser.parse_args()

    output = []
    for m in range(1, 9):  # SLiM mutation types m1 .. m8
        print('m' + str(m))
        full_sfs = []
        for i in glob.glob(args.input + '/R*'):
            # Pull the lines for mutation type m out of the gzipped SLiM output
            process = subprocess.Popen(['zgrep', 'm' + str(m), i],
                                       stdout=subprocess.PIPE,
                                       universal_newlines=True).communicate()[0]
            fixations, sfs = getSFSfromSLiM(process)
            if fixations is None:
                continue
            # Fold fixations into the top bin and zero out the invariant class
            sfs[-1] += fixations
            sfs[0] = 0
            print(SFS.pi(sfs) / 140000.)  # per-site pi, assuming 140 kb of sequence
            print(':'.join(map(str, sfs)))
            if len(full_sfs) == 0:
                full_sfs = sfs
            else:
                full_sfs = SFS.merge_SFS(full_sfs, sfs)

        output.append(['m' + str(m), full_sfs])

    txt = open(args.output, 'w')
    for i in output:
        print(i)
        txt.write(i[0] + '\n')
        txt.write(' '.join(map(str, i[1])) + '\n')

    txt.close()
Example #3
import argparse
import gzip

import pandas as pd

import SFS_tools  # local module providing merge_SFS(), pi() and tajima(); assumed available


def main():
    parser = argparse.ArgumentParser(
        description=
        "Combine all the sfs files coming out of the sfs_from_slim_update_bootstrap.py script"
    )

    parser.add_argument(
        "-i",
        "--input",
        required=True,
        dest="input",
        type=str,
        help="The name of the file that contains the sfs files")
    parser.add_argument("-o",
                        "--output",
                        required=True,
                        dest="output",
                        type=str,
                        help="The name of the output file")
    args = parser.parse_args()

    sfs_dict = {}

    for i in gzip.open(args.input, 'rt'):  # read the gzipped file as text
        z = i.split('[')
        region = z[1].strip("'").replace("'", '')
        sfs_temp = list(
            map(int,
                z[2].replace(']', '').replace(',', '').strip().split(' ')))
        try:
            sfs_dict[region].append(sfs_temp)
        except KeyError:
            sfs_dict[region] = [sfs_temp]

    data = []

    for i in sfs_dict.keys():
        # Merge all bootstrap SFSs for this region into one
        sfs = sfs_dict[i][0]
        for j in sfs_dict[i][1:]:
            sfs = SFS_tools.merge_SFS(sfs, j)
        # Region keys look like 'u.1000-2000': upstream ('u') or downstream
        # ('d') of the focal element, then the distance interval
        stream = i.split('.')[0]
        dist = list(map(int, i.replace(',', '').split('.')[1].split('-')))
        if stream == 'u':
            mult = -1
        else:
            mult = 1

        # Signed midpoint of the interval, e.g. 'u.1000-2000' -> -1500
        mid = mult * sum(dist) / 2

        data.append([mid, SFS_tools.pi(sfs), SFS_tools.tajima(sfs)])

    pd.DataFrame(data, columns=['dist', 'pi', 'TD']).to_csv(args.output)
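SFS_tools.merge_SFS is not shown. For spectra drawn from the same sample size, merging is typically just elementwise addition of the per-frequency-class counts; a minimal sketch under that assumption (the real module may do more, e.g. downsampling to a common size):

def merge_SFS(sfs_a, sfs_b):
    # Elementwise sum of two SFSs with the same number of frequency classes
    # (i.e. from the same sample size)
    assert len(sfs_a) == len(sfs_b)
    return [a + b for a, b in zip(sfs_a, sfs_b)]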
Example #4
import argparse
import glob

import numpy as np
import pandas as pd
import pylab

import SFS  # local module providing pi(); assumed available

# combinedSel, processSLiM and mergeManySFS are assumed to be defined
# elsewhere in this module.


def main():
    parser = argparse.ArgumentParser(
        description=
        "Give a directory and I'll analyse the patterns of diversity around a simulated exon"
    )

    parser.add_argument(
        "-i",
        "--input",
        required=True,
        dest="input",
        type=str,
        help="The name of the file that contains the SLiM output")
    args = parser.parse_args()
    count = 0
    # Distance bins: 0.1 to 49.9 in steps of 0.1, then 50 to 499 in steps of 1
    r_bins = [float(i) / 10
              for i in range(1, 500, 1)] + [1. * i for i in range(50, 500)]

    # Plot the analytical expectation, scaled by the neutral diversity pi_0 = 0.01
    approx = combinedSel(np.array(r_bins))
    pylab.plot(r_bins, approx * 0.01, 'r')

    for i in glob.glob(args.input + '*.out.gz'):

        if count == 0:
            data = processSLiM(i)
        else:
            temp = processSLiM(i)
            data = pd.concat([data, temp])
        count += 1
        if count == 50:  # only use the first 50 replicates
            break
    top = len(r_bins) - 1
    analysis = []
    for r in range(len(r_bins)):
        lower = r_bins[r]

        # The final bin is open-ended
        if r == top:
            upper = 1e8
        else:
            upper = r_bins[r + 1]

        in_range = list(data[(data.r_dist_true >= lower)
                             & (data.r_dist_true < upper)].sfs)
        if len(in_range) == 0: continue
        analysis.append([lower, SFS.pi(mergeManySFS(in_range))])
    true_r = pd.DataFrame(analysis, columns=['dist', 'pi'])
    pylab.plot(true_r.dist, true_r.pi, 'b')
    pylab.show()
Example #5
import SFS  # local module with the SFS summary statistics; assumed available


def get_summary(sfs_dict, label):
    out = []
    for key in sfs_dict.keys():
        mid = key
        # No segregating sites in this window: emit NaN for each statistic
        if sum(sfs_dict[key][1:-1]) == 0:
            out.append([
                label, mid, 'NaN', 'NaN', 'NaN', 'NaN', 'NaN', 'NaN',
                sum(sfs_dict[key])
            ])
        elif sum(sfs_dict[key][1:-1]) > 0:
            out.append([
                label,
                mid,
                SFS.pi(sfs_dict[key]),
                SFS.xsi(sfs_dict[key]),
                SFS.pi2(sfs_dict[key]),
                SFS.fwh(sfs_dict[key]),
                SFS.theta_W(sfs_dict[key]),
                SFS.tajima(sfs_dict[key]),
                #				SFS.KZl(sfs_dict[key]),
                sum(sfs_dict[key])
            ])
    return out
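The SFS module used by get_summary is not shown. For reference, here are minimal sketches of the two most standard statistics it exposes, nucleotide diversity (pi) and Watterson's theta, computed from an unfolded SFS whose bins run from derived-allele count 0 up to the sample size n. These are the textbook estimators; the actual module may differ in details such as folding or per-site scaling:

def pi(sfs):
    # Mean number of pairwise differences: sum over segregating classes of
    # i * (n - i) * xi_i, divided by the number of pairs, n choose 2
    n = len(sfs) - 1
    pairs = n * (n - 1) / 2.0
    return sum(i * (n - i) * sfs[i] for i in range(1, n)) / pairs


def theta_W(sfs):
    # Watterson's estimator: segregating sites divided by the harmonic
    # number a_n = sum_{i=1}^{n-1} 1/i
    n = len(sfs) - 1
    S = sum(sfs[1:-1])  # exclude the invariant (0) and fixed (n) classes
    a_n = sum(1.0 / i for i in range(1, n))
    return S / a_n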
Example #6
import argparse

import numpy as np
import pandas as pd

import pgm  # local module providing jukes_cantor(); assumed available
import SFS_tools  # local module providing pi() and tajima(); assumed available

# summariseChunk is assumed to be defined elsewhere in this module.


def main():
    parser = argparse.ArgumentParser(
        description="Takes the summary file of recombination distances and ")

    parser.add_argument("-i",
                        "--input",
                        required=True,
                        dest="input",
                        type=str,
                        help="The name of the sorted, bed file of segments")
    parser.add_argument("-o",
                        "--output",
                        required=True,
                        dest="output",
                        type=str,
                        help="The name of the output file")
    parser.add_argument(
        "-l",
        "--label",
        required=False,
        dest="label",
        type=str,
        help=
        "Add a label to this data, e.g. which chromosome do they come from? The default will be 'Autosomes'",
        default='Autosomes')
    parser.add_argument(
        "--ncpg",
        required=False,
        dest="ncpg",
        action='store_true',
        help="Add this flag if you want to analyse the non-CpG sites",
        default=False)
    parser.add_argument("--cne",
                        required=False,
                        dest="cne",
                        action='store_true',
                        help="Add this flag if you are analysing CNEs",
                        default=False)
    parser.add_argument("--Cox",
                        required=False,
                        dest="Cox",
                        action='store_true',
                        help="Add this flag if you want to use the Cox map",
                        default=False)
    parser.add_argument(
        "--GC",
        required=False,
        dest="GC",
        action='store_true',
        help=
        "Add this flag if you want to include GeneConversion according to the Paigen et al estimates",
        default=False)

    args = parser.parse_args()

    data = pd.read_csv(args.input, compression='gzip', header=None, sep='\t')
    # Upstream segments ('u.*') get negative distances, downstream positive
    data['scale'] = [-1 if x.split('.')[0] == 'u' else 1 for x in data[0]]

    # The following gene conversion parameters come from Paigen et al 2008 PLoS Genetics

    nc_gc_ratio = 0.105  # rate of NCO gene conversion relative to CO recombination (0.105 in Paigen et al)
    tract_length = 144  # average gene conversion tract length (bp)

    if args.Cox:
        # Scale the Cox map distances by a constant (4 * 426200, presumably 4Ne)
        data['r_co'] = data[7] * 426200 * 4
        # recPos names the dataframe column holding the recombination distances
        recPos = 'r_co'
        # Expected gene-conversion map distance over a segment of physical
        # length data[3]; the (1 - exp(-L / tract_length)) term accounts for
        # tracts that extend beyond the segment
        data['r_gc'] = 426200 * 4 * (nc_gc_ratio * data[7] /
                                     data[3]) * tract_length * (1 - np.exp(
                                         (-1. * data[3]) / tract_length))
        data['joint'] = data['r_gc'] + data['r_co']
        if args.GC:
            recPos = 'joint'
    else:
        data['r_co'] = data[2]
        recPos = 'r_co'
        data['r_gc'] = (nc_gc_ratio * data[2] /
                        data[3]) * tract_length * (1 - np.exp(
                            (-1. * data[3]) / tract_length))
        data['joint'] = data['r_gc'] + data['r_co']
        if args.GC:
            recPos = 'joint'

    data['dist'] = data[recPos] * data['scale']

    if args.cne:
        roundBy = 2
        # 100 log-spaced bins spanning 0 to ~300 (10**2.477122 ~= 300); for CNEs
        bins = np.logspace(0, 2.477122, 100) - 1
        # bins = range(0, 200, roundBy)  # linear alternative for CNEs
    else:
        roundBy = 30
        # 100 log-spaced bins spanning 0 to ~3000 (10**3.477122 ~= 3000); for exons
        bins = np.logspace(0, 3.477122, 100) - 1
        # bins = range(0, 3000, roundBy)  # linear alternative for exons

    output_lines = []
    output_lines_2 = []

    for i in range(len(bins)):

        # All bins but the last are half-open [bins[i], bins[i + 1]);
        # the final bin is open-ended
        if i < len(bins) - 1:
            chunk = data.loc[(data[recPos] >= bins[i])
                             & (data[recPos] < bins[i + 1])]
        else:
            chunk = data.loc[(data[recPos] >= bins[i])]

        if len(chunk) == 0: continue

        SFS, div = summariseChunk(chunk, ncpg=args.ncpg)

        up_chunk = chunk[chunk['dist'] < 0]
        if len(up_chunk) == 0: continue
        SFS_up, div_up = summariseChunk(up_chunk, ncpg=args.ncpg)

        down_chunk = chunk[chunk['dist'] > 0]
        if len(down_chunk) == 0: continue
        SFS_down, div_down = summariseChunk(down_chunk, ncpg=args.ncpg)

        if sum(SFS) == 0: continue
        else:
            outline = [
                bins[i],
                args.label,
                SFS_tools.pi(SFS),
                pgm.jukes_cantor(float(div[0]) / sum(SFS)),
                pgm.jukes_cantor(float(div[1]) / sum(SFS)),
                SFS_tools.tajima(SFS),
                sum(SFS),
            ]
        if sum(SFS_up) == 0: continue
        else:
            outline_up = [
                -1 * bins[i],
                args.label,
                SFS_tools.pi(SFS_up),
                pgm.jukes_cantor(float(div_up[0]) / sum(SFS_up)),
                pgm.jukes_cantor(float(div_up[1]) / sum(SFS_up)),
                SFS_tools.tajima(SFS_up),
                sum(SFS_up),
            ]
        if sum(SFS_down) == 0: continue
        else:
            outline_down = [
                bins[i],
                args.label,
                SFS_tools.pi(SFS_down),
                pgm.jukes_cantor(float(div_down[0]) / sum(SFS_down)),
                pgm.jukes_cantor(float(div_down[1]) / sum(SFS_down)),
                SFS_tools.tajima(SFS_down),
                sum(SFS_down),
            ]

        output_lines.append(outline)
        output_lines_2.append(outline_up)
        output_lines_2.append(outline_down)

    output1 = pd.DataFrame(output_lines,
                           columns=[
                               'mid', 'label', 'pi', 'fam_div_jc',
                               'rat_div_jc', 'tajima', 'sites'
                           ])
    output1.to_csv(args.output)

    output2 = pd.DataFrame(output_lines_2,
                           columns=[
                               'mid', 'label', 'pi', 'fam_div_jc',
                               'rat_div_jc', 'tajima', 'sites'
                           ])
    output2.to_csv('split_' + args.output)
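pgm.jukes_cantor is applied above to raw divergence proportions. Assuming it implements the standard Jukes-Cantor (1969) correction, which maps an observed proportion of differing sites p to an estimated substitution distance, a minimal sketch (the real pgm module may differ):

import math


def jukes_cantor(p):
    # JC69 distance: d = -(3/4) * ln(1 - (4/3) * p), defined for p < 0.75,
    # where p is the proportion of sites that differ between the sequences
    return -0.75 * math.log(1.0 - (4.0 / 3.0) * p)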
Example #7
import glob
import sys

import pandas as pd

import SFS  # local module with the SFS summary statistics; assumed available

# Column names matching the fields appended below (the names themselves are
# assumed; the original script defined `headings` elsewhere)
headings = [
    'dist', 'sites', 'pi', 'xsi', 'pi2', 'theta_W', 'fwh', 'tajima', 'fixed'
]
out_lines = []

for i in glob.glob('*/*sfs'):
    x = open(i).readlines()
    name = i.split('/')[0]

    # Directory names look like 'u.100-200' (upstream) or 'd.100-200'
    # (downstream); the sign of the distance encodes the side
    if name[0] == 'u':
        mult = -1
    elif name[0] == 'd':
        mult = 1
    interval = name.split('.')[1]
    dist = mult * (int(interval.split('-')[0]) +
                   int(interval.split('-')[1])) / 2
    sfs = list(map(float, x[0].strip().split(',')))
    if sfs == [0.0]:
        continue
    outline = [
        dist,
        sum(sfs),
        SFS.pi(sfs),
        SFS.xsi(sfs),
        SFS.pi2(sfs),
        SFS.theta_W(sfs),
        SFS.fwh(sfs),
        SFS.tajima(sfs),
        sfs[-1] / sum(sfs)  # fraction of sites in the fixed class
    ]
    out_lines.append(outline)

data = pd.DataFrame(out_lines, columns=headings)
data.sort_values('dist').to_csv(sys.argv[1])
Example #8
import argparse
import glob

import pandas as pd

import pgm  # local module providing jukes_cantor(); assumed available
import SFS_tools  # local module providing pi() and tajima(); assumed available

# summariseChunk is assumed to be defined elsewhere in this module.


def main():
    parser = argparse.ArgumentParser(
        description="Takes the summary file of recombination distances and ")

    parser.add_argument("-i",
                        "--input",
                        required=True,
                        dest="input",
                        type=str,
                        help="The name of the sorted, bed file of segments")
    parser.add_argument("-o",
                        "--output",
                        required=True,
                        dest="output",
                        type=str,
                        help="The name of the output file")
    parser.add_argument(
        "-l",
        "--label",
        required=False,
        dest="label",
        type=str,
        help=
        "Add a label to this data, e.g. which chromosome do they come from? The default will be 'Autosomes'",
        default='Autosomes')
    parser.add_argument(
        "--ncpg",
        required=False,
        dest="ncpg",
        action='store_true',
        help="Add this flag if you want to analyse the non-CpG sites",
        default=False)
    parser.add_argument(
        "--dir",
        required=False,
        dest="dir",
        action='store_true',
        help="Add this flag if you are analysing a directory of files",
        default=False)

    args = parser.parse_args()
    if not args.dir:
        data = pd.read_csv(args.input, header=None, sep='\t')
    else:
        data = pd.concat([
            pd.read_csv(i, header=None, sep='\t')
            for i in glob.glob(args.input + '*')
        ])

    ranges = range(0, 100, 1)  ## For CNEs
    #	ranges = range(0, 3000, 30) ## For Exons
    output_lines = []

    for i in range(len(ranges)):
        # All bins but the last are half-open; the final bin is open-ended
        if i < len(ranges) - 1:
            chunk = data.loc[(data[2] >= ranges[i])
                             & (data[2] < ranges[i + 1])]
        else:
            chunk = data.loc[(data[2] >= ranges[i])]
        if len(chunk) == 0: continue
        SFS, div = summariseChunk(chunk, ncpg=args.ncpg)

        if sum(SFS) == 0: continue
        outline = [
            ranges[i],
            args.label,
            SFS_tools.pi(SFS),
            pgm.jukes_cantor(float(div[0]) / sum(SFS)),
            pgm.jukes_cantor(float(div[1]) / sum(SFS)),
            SFS_tools.tajima(SFS),
            sum(SFS),
        ]
        output_lines.append(outline)

    output = pd.DataFrame(output_lines,
                          columns=[
                              'distance', 'label', 'pi', 'fam_div_jc',
                              'rat_div_jc', 'tajima', 'sites'
                          ])
    output.to_csv(args.output)
Example #9
import SFS_tools  # local module providing pi(); assumed available


def summariseChunk(chunk):
    # Mean recombination distance of the chunk, plus pi computed from the
    # column-wise sums of the per-segment SFS counts
    recDist = chunk[1].mean()
    sfs = []
    for i in list(chunk.columns)[2:]:
        sfs.append(chunk[i].sum())
    return [recDist, SFS_tools.pi(sfs)]
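A small usage sketch under the layout the function implies: column 0 holds a segment label, column 1 the recombination distance, and columns 2 onward the SFS bins. The DataFrame contents are invented for illustration:

import pandas as pd

# Two segments, each with a recombination distance and a 4-bin SFS
chunk = pd.DataFrame([['seg1', 10.0, 0, 5, 2, 1],
                      ['seg2', 20.0, 0, 3, 4, 0]])
print(summariseChunk(chunk))  # [15.0, pi of the summed SFS [0, 8, 6, 1]]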