Пример #1
0
def filter(p_bed, region_bed, max_p=None, p_col_name="P.Value"):
    """
    Summarize, per region, the p-value rows that lie inside that region.

    This is a generator. The first item yielded is the header: the columns
    of `region_bed` followed by "t-pos", "t-neg", "t-sum", "n_gt_p05" and
    "n_gt_p1". Every following item is one region: its original columns plus
    those five summary values, all as strings.

    p_bed      : path to the file with per-site p-values.
    region_bed : path to the file with regions.
    max_p      : if truthy, a region is dropped when any of its contained
                 rows has a p-value above this value.
    p_col_name : name of the p-value column in `p_bed`, or its integer
                 index into the header of `p_bed`.

    Regions for which no row of `p_bed` is fully contained are skipped.
    """
    # function-scope import so this block does not depend on the file's
    # import section.
    from numbers import Integral

    # columns from p_bed get a 'p' prefix so they can not collide with the
    # region columns in the joined rows.
    ph = ['p' + h for h in get_header(p_bed)]
    rh = get_header(region_bed)
    # `long` only exists on python 2; Integral matches int (and long) on
    # both python 2 and 3.
    if isinstance(p_col_name, Integral):
        p_col_name = ph[p_col_name][1:]
    p_key = 'p' + p_col_name

    a = dict(p_bed=p_bed, region_bed=region_bed)
    a['p_bed'] = fix_header(a['p_bed'])

    yield rh + ["t-pos", "t-neg", "t-sum", "n_gt_p05", "n_gt_p1"]

    # NOTE(review): the leading '|' presumably tells `reader` to run the
    # command and read its output -- confirm against `reader`.
    cmd = '|bedtools intersect -b %(p_bed)s -a %(region_bed)s -wo' % a
    # groupby only merges consecutive rows, so this relies on the rows of a
    # region arriving together.
    for _, rows in groupby(reader(cmd, header=rh + ph),
                           itemgetter('chrom', 'start', 'end')):
        # keep only the rows that sit inside the region.
        plist = [x for x in rows
                 if (int(x['start']) <= int(x['pstart']) <= int(x['pend']))
                 and (int(x['start']) <= int(x['pend']) <= int(x['end']))]
        if not plist:
            # nothing is fully contained in this region, so there is
            # nothing to summarize (and no plist[0] to copy columns from).
            continue

        # the 't' column is optional in p_bed.
        tscores = [float(row['pt']) for row in plist if 'pt' in row]

        if max_p and any(float(row[p_key]) > max_p for row in plist):
            continue

        pvals = [float(row[p_key]) for row in plist]
        ngt05 = sum(1 for p in pvals if p > 0.05)
        ngt1 = sum(1 for p in pvals if p > 0.1)
        tpos = sum(1 for ts in tscores if ts > 0)
        tneg = sum(1 for ts in tscores if ts < 0)
        tsum = sum(tscores)

        # the region columns are identical in every row of the group.
        region = plist[0]
        yield [region[h] for h in rh] + [str(tpos), str(tneg), str(tsum),
                                         str(ngt05), str(ngt1)]
Пример #2
0
def partsort(afile, group_cols, sort_cols, sort_convertors, header=False):
    """
    Sort the rows of `afile` within each run of rows sharing `group_cols`.

    the converted columns are appended to the end of the row.
    then after the sort, these are removed.
    this removes problems with floating point reprs.

    afile           : file to read; "-" is handled as stdin.
    group_cols      : column indexes; consecutive rows with equal values in
                      these columns form one group.
    sort_cols       : column indexes to sort by, in order of priority.
    sort_convertors : dict of column index -> callable. A column of
                      `sort_cols` that has an entry here is sorted by the
                      converted value instead of the raw token.
    header          : passed through to `reader`.

    This is a generator yielding the rows with their original columns only.
    """
    the_first_line = get_header(afile)
    row_len = len(the_first_line)

    # maintain order of the sort cols, but use the appended columns for the
    # converted ones. `converted_cols` records which source column sits at
    # each appended position so that the values are appended in exactly the
    # order the indexes were handed out. (iterating over the dict instead
    # can give a different order, or include columns that are not sorted
    # on, and then the sort looks at the wrong appended column.)
    actual_sort_cols = []
    converted_cols = []
    for c in sort_cols:
        if c in sort_convertors:
            actual_sort_cols.append(row_len + len(converted_cols))
            converted_cols.append(c)
        else:
            actual_sort_cols.append(c)

    # if it was stdin, then we read one line to get the header length, so
    # that line has to be put back in front of the remaining rows.
    lines = reader(afile, header=header) if afile != "-" \
            else chain([the_first_line], reader(afile, header))

    def gen_converted(rows):
        # add the converted columns onto the end of each row.
        for toks in rows:
            yield toks + [sort_convertors[c](toks[c])
                          for c in converted_cols]

    # groupby the correct columns
    for _, group in groupby(lines,
                            lambda toks: [toks[i] for i in group_cols]):
        # then iterate over the sorted rows.
        for toks in sorted(gen_converted(group),
                           key=itemgetter(*actual_sort_cols)):
            # strip the extra columns.
            yield toks[:row_len]
Пример #3
0
def partsort(afile, group_cols, sort_cols, sort_convertors, header=False):
    """
    Sort the rows of `afile` within each run of rows sharing `group_cols`.

    the converted columns are appended to the end of the row.
    then after the sort, these are removed.
    this removes problems with floating point reprs.

    afile           : file to read; "-" is handled as stdin.
    group_cols      : column indexes; consecutive rows with equal values in
                      these columns form one group.
    sort_cols       : column indexes to sort by, in order of priority.
    sort_convertors : dict of column index -> callable. A column of
                      `sort_cols` that has an entry here is sorted by the
                      converted value instead of the raw token.
    header          : passed through to `reader`.

    This is a generator yielding the rows with their original columns only.
    """
    the_first_line = get_header(afile)
    row_len = len(the_first_line)

    # since we append converted values to the end *and* want to maintain the
    # requested sort order, we create the `actual_sort_cols`.
    # `converted_cols` remembers the source column behind each appended
    # position; the values are appended in that same order below. going
    # through the dict instead could append them in another order (or append
    # columns that are not sorted on) and the sort would then use the wrong
    # appended column.
    actual_sort_cols = []
    converted_cols = []
    for c in sort_cols:
        if c in sort_convertors:
            actual_sort_cols.append(row_len + len(converted_cols))
            converted_cols.append(c)
        else:
            actual_sort_cols.append(c)

    # if it was stdin, then we read one line to get the header length, so
    # that line has to be put back in front of the remaining rows.
    lines = reader(afile, header=header) if afile != "-" \
            else chain([the_first_line], reader(afile, header))

    def gen_converted(rows):
        # add the converted columns onto the end of each row.
        for toks in rows:
            yield toks + [sort_convertors[c](toks[c]) for c in converted_cols]

    # groupby the correct columns
    for _, group in groupby(lines, lambda toks:
                                        [toks[i] for i in group_cols]):
        # then iterate over the sorted rows.
        for toks in sorted(gen_converted(group), key=itemgetter(*actual_sort_cols)):
            # strip the extra columns.
            yield toks[:row_len]
Пример #4
0
def filter(p_bed, region_bed, max_p=None, region_p=None, p_col_name="P.Value",
                    coef_col_name="logFC"):
    """
    Summarize, per region, the p-value rows that lie inside that region.

    This is a generator. Before the first region that is kept, it yields a
    header: the columns of `region_bed` plus whichever summary columns
    could be computed for that first region. After that it yields one row
    per kept region: its original columns plus the summary values.

    Summary columns:
      "t.pos/t.neg", "t.sum"    : only when `p_bed` has a 't' column.
      "avg.diff", "ilogit.diff" : only when `p_bed` has the coefficient
                                  column.

    p_bed         : path to the file with per-site p-values.
    region_bed    : path to the file with regions.
    max_p         : if truthy, a region is dropped when any of its contained
                    rows has a p-value above this value.
    region_p      : if truthy, a region is dropped when its own p-value
                    ('slk_sidak_p' or, failing that, 'z_sidak_p') is above
                    this value.
    p_col_name    : name of the p-value column in `p_bed`, or its integer
                    index into the header of `p_bed`.
    coef_col_name : name of the coefficient column in `p_bed`; when it is
                    missing, 'coefficient' is tried instead.

    Regions for which no row of `p_bed` is fully contained are skipped.

    Raises Exception when `region_p` is given but the regions have neither
    a 'slk_sidak_p' nor a 'z_sidak_p' column.
    """
    # function-scope import so this block does not depend on the file's
    # import section.
    from numbers import Integral

    # columns from p_bed get a 'p' prefix so they can not collide with the
    # region columns in the joined rows.
    ph = ['p' + h for h in get_header(p_bed)]
    rh = get_header(region_bed)
    # `long` only exists on python 2; Integral matches int (and long) on
    # both python 2 and 3.
    if isinstance(p_col_name, Integral):
        p_col_name = ph[p_col_name][1:]
    p_key = 'p' + p_col_name

    a = dict(p_bed=p_bed, region_bed=region_bed)
    a['p_bed'] = fix_header(a['p_bed'])

    header_written = False
    # NOTE(review): the leading '|' presumably tells `reader` to run the
    # command and read its output -- confirm against `reader`.
    cmd = '|bedtools intersect -b %(p_bed)s -a %(region_bed)s -wo' % a
    # groupby only merges consecutive rows, so this relies on the rows of a
    # region arriving together.
    for _, plist in groupby(reader(cmd, header=rh + ph),
                            itemgetter('chrom', 'start', 'end')):
        plist = list(plist)

        if region_p:
            r = plist[0] # first cols are all the same
            if 'slk_sidak_p' in r:
                region_p_key = 'slk_sidak_p'
            elif 'z_sidak_p' in r:
                region_p_key = 'z_sidak_p'
            else:
                raise Exception("region_p was given but the regions have "
                                "neither a 'slk_sidak_p' nor a 'z_sidak_p' "
                                "column")
            if float(r[region_p_key]) > region_p:
                continue

        # keep only the rows that sit inside the region.
        plist = [x for x in plist
                 if (int(x['start']) <= int(x['pstart']) <= int(x['pend']))
                 and (int(x['start']) <= int(x['pend']) <= int(x['end']))]
        if not plist:
            # nothing is fully contained in this region: there is no row to
            # copy the region columns from and nothing to average.
            continue

        # the 't' column is optional in p_bed.
        tscores = [float(row['pt']) for row in plist if 'pt' in row]

        if max_p and any(float(row[p_key]) > max_p for row in plist):
            continue

        first = plist[0]

        # logic to try to find t and coef headers and skip if not found
        extra_header = []
        extra = []
        if tscores:
            tpos = sum(1 for ts in tscores if ts > 0)
            tneg = sum(1 for ts in tscores if ts < 0)
            extra_header += ["t.pos/t.neg", "t.sum"]
            extra += ["%i/%i" % (tpos, tneg), str(sum(tscores))]

        if 'p' + coef_col_name not in first and 'pcoefficient' in first:
            coef_col_name = 'coefficient'
        coef_key = 'p' + coef_col_name
        if coef_key in first:
            coefs = [float(row[coef_key]) for row in plist]
            coef = sum(coefs) / len(coefs)

            # since we probably had the data logit transformed, here we
            # do the inverse and subtract 0.5 since ilogit(0) == 0.5
            icoef = sum(ilogit(c) for c in coefs) / len(coefs) - 0.5
            extra_header += ["avg.diff", "ilogit.diff"]
            extra += ["%.3f" % coef, "%.3f" % icoef]

        if not header_written:
            # the header is built from the first kept region, so it lists
            # exactly the summary columns that region has.
            yield rh + extra_header
            header_written = True
        yield [first[h] for h in rh] + extra