示例#1
0
def analyze(inp_fname, out_fname, plt_fname=None, k=3, thresh_init=1e-2, thresh_conv=1e-4, scaling="identity"):
    """Run k-means on cleaned data and write a CSV report, optionally a plot

    The data is read with `load`, every row of values is passed through the
    selected scaling function, and k-means is run in two phases (random
    initialisation, then refinement) before the report is written.

    :param inp_fname: Input filename of cleaned data (read with `load`)
    :param out_fname: Output filename for the k-means CSV report
    :param plt_fname: (optional) Output filename for the k-means plot; no
        plot is produced when None
    :param k: (optional) Number of k-means
    :param thresh_init: (optional) Relative-error threshold that ends the
        random initialisation phase
    :param thresh_conv: (optional) Relative-error threshold that ends the
        refinement phase
    :param scaling: (optional) One of "identity", "zscore" or "minmax"
    :returns: None
    :raises KeyError: if `scaling` is not one of the names above
    """
    scalers = {
        "identity": identity,
        "zscore": zscore,
        "minmax": minmax,
    }

    data = load(inp_fname)
    # Besides the per-row entries, the loaded dict carries the value column
    # names under the key None (used below); `filt(bool)` presumably drops
    # that falsy key so only the row keys remain.
    row_keys = nest(dict.keys, filt(bool), tuple)(data)
    rpkm = nest(part(getitems, data), pipe(scalers[scaling]))(row_keys)

    # Phase 1: draw k random rows as starting means, repeating until the
    # first value returned by `kstep` changes by no more than `thresh_init`
    # (as measured by `relerr`) between two consecutive draws.
    prev_sse, sse = 0, 1
    while relerr(sse, prev_sse) > thresh_init:
        prev_sse = sse
        sse, labels, means = kstep(random.sample(rpkm, k), rpkm)

    # Phase 2: alternate assignment (`kstep`) and mean update (`mstep`)
    # until the same measure changes by no more than `thresh_conv`.
    prev_sse, sse = 0, 1
    while relerr(sse, prev_sse) > thresh_conv:
        prev_sse = sse
        sse, labels, _ = kstep(means, rpkm)
        means = mstep(labels, rpkm)

    if plt_fname is not None:
        plot_kmeans(plt_fname, means, labels, rpkm, data[None], scaling)

    # For each entry of the transposed means, the position of its largest
    # value, i.e. which mean is highest for that value column.
    top_mean = pipe(lambda col: col.index(max(col)))(transpose(means))
    order = sorted(range(len(top_mean)), key=lambda j: top_mean[j])

    # Column names grouped by the mean in which they peak.
    names_by_mean = collections.defaultdict(list)
    for j in order:
        names_by_mean[top_mean[j]].append(data[None][j])

    describe = nest(sorted, "|".join, repr)
    # Sort key: each (key, label) pair reversed, so rows order by label
    # first and by row key second.
    label_then_key = nest(reversed, tuple)
    header = "ID,NAME,GROUP,ASSIGNMENT,URL\n"

    with open(out_fname, "w") as f:
        f.write(header)
        for (id_, name), group in sorted(zip(row_keys, labels), key=label_then_key):
            assignment = describe(names_by_mean[group])
            url = TEMPLATE_NCBI_GENE.format(id_)
            f.write(f"{id_},{name},{group},{assignment},{url}\n")
示例#2
0
def mstep(labels, values):
    """Generate new k-means given labeled RPKM values

    Values are grouped by label and each group is reduced with
    `nest(transpose, pipe(mean))` (presumably a per-column mean). Groups are
    emitted in ascending label order, so the position of a mean in the
    result matches its label whenever the labels in use are 0, 1, 2, ...
    without gaps. Ordering by first occurrence in `labels` instead could
    leave the returned means out of step with the labels that produced them.

    :param labels: K-mean assignments, one per entry of `values`; must be
        sortable
    :param values: tuples of RPKM values
    :returns: New k-means, one per distinct label, in ascending label order
    """
    groups = collections.defaultdict(list)
    for label, row in zip(labels, values):
        groups[label].append(row)

    # Sort by label rather than relying on dict insertion order, which
    # follows the order in which labels happen to appear in the data.
    ordered = [groups[label] for label in sorted(groups)]
    return pipe(nest(transpose, pipe(mean)))(ordered)
示例#3
0
def sorted_fields(data):
    """Order the fieldnames: VALID_KEYS first, then the rest by mean RPKM

    The fieldnames are taken from the first entry of `data`; those not in
    VALID_KEYS are ranked by the mean of their column, largest first.

    :param data: Pruned data, tuple of dicts
    :returns: tuple of sorted fieldnames
    """
    all_names = nest(get(0), dict.keys, tuple)(data)
    # Names outside VALID_KEYS, in sorted order so positions are stable.
    value_names = nest(set, part(sub, set(VALID_KEYS)), sorted, tuple)(all_names)

    # One sequence of values per row, then regrouped into one per column.
    row_values = pipe(lambda row: pipe(get(row, iskey=False))(value_names))(data)
    columns = nest(zip, tuple)(*row_values)

    positions = nest(len, range)(columns)
    column_mean = get(pipe(mean)(columns), iskey=False)
    by_mean_desc = part(sorted, key=column_mean, reverse=True)
    ranked = nest(by_mean_desc, tuple)(positions)

    name_at = get(value_names, iskey=False)
    return VALID_KEYS + pipe(name_at)(ranked)
示例#4
0
def minmax(itr):
    """Min-max scale a collection of numbers

    Each element is shifted by the minimum and divided by the range
    (max - min), using the same `part`/`nest`/`pipe` helpers as the other
    scaling functions.

    :param itr: sized collection of numbers
    :returns: the scaled values as produced by `pipe`; None when `itr` has
        fewer than two elements; all 0.0 when every element is equal
    """
    if len(itr) <= 1:
        return None
    lo, hi = min(itr), max(itr)
    rng = hi - lo
    if rng == 0:
        # Constant input has no range to divide by; scaling would raise
        # ZeroDivisionError. Every element equals the minimum, so map all
        # of them to 0.0.
        return pipe(lambda _: 0.0)(itr)
    op = nest(part(sub, lo), part(truediv, rng))
    return pipe(op)(itr)
示例#5
0
def convert(dct):
    """Build a copy of a dict with every value passed through `safenum`

    :param dct: dict with strs from cleaned data
    :returns: dict with the same keys and the converted values
    """
    names = dct.keys()
    converted = pipe(safenum)(dct.values())
    return dict(zip(names, converted))
示例#6
0
def clean(inp_fname, out_fname):
    """Prune raw data (output from `retrieve` function) and write it out
    with the fieldnames ordered by `sorted_fields`

    :param inp_fname: Input filename of raw data
    :param out_fname: Output filename for cleaned data
    :returns: whatever `writecsv` returns (documented upstream as a tuple
        of dicts)
    """
    def write_sorted(rows):
        # The field order is derived from the pruned rows themselves.
        return writecsv(rows, out_fname, sorted_fields(rows))

    with open(inp_fname) as f:
        pipeline = nest(csv.DictReader, pipe(prune), write_sorted)
        return pipeline(f)
示例#7
0
def load(fname):
    """Read cleaned data and reorganize it with `todatadict`

    Each CSV row is passed through `convert` before the rows are handed to
    `todatadict`.

    :param fname: Filename of cleaned data
    :returns: dict containing reorganized data
    """
    with open(fname) as f:
        pipeline = nest(csv.DictReader, pipe(convert), part(todatadict))
        return pipeline(f)
示例#8
0
def todatadict(data):
    """Turn a tuple of dicts into one dict keyed by (ID, NAME), whose values
    hold the entries of every field that is not in VALID_KEYS

    The names of those fields are stored under the extra key None.

    :param data: tuple of dicts from loaded cleaned data
    :returns: dict containing reorganized data
    """
    all_names = nest(get(0), dict.keys, tuple)(data)

    def is_value_field(name):
        return not VALID_KEYS.__contains__(name)

    value_names = nest(filt(is_value_field), tuple)(all_names)

    result = {}
    for row in data:
        key = (row["ID"], row["NAME"])
        result[key] = pipe(get(row, iskey=False))(value_names)

    # remember which field each position of the value sequences belongs to
    result[None] = value_names
    return result
示例#9
0
def retrieve(inp_fname, out_fname, include=bool):
    """Given a search export from NCBI Gene, retrieve RNA-Seq data on those
    GeneIDs and write the data to an output file

    The export is read as tab-separated rows; `include` is applied to each
    row through `filt` (presumably keeping the rows for which it is truthy)
    and the remaining GeneIDs are converted to int and sorted.

    :param inp_fname: Input filename of NCBI Gene search export
    :param out_fname: Output filename for writing raw data
    :param include: Function used to filter the input rows
    :returns: whatever `procfull(out_fname)` returns for the URLs
        (documented upstream as a tuple of dicts)
    """
    read_rows = part(csv.DictReader, delimiter="\t")
    gene_id = nest(get("GeneID"), int)

    with open(inp_fname) as f:
        ids = nest(read_rows, filt(include), apply(gene_id), sorted, tuple)(f)

    urls = pipe(TEMPLATE_NCBI_GENE.format)(ids)

    # NOTE(review): this reports one chunk more than ceil(len / CHUNK_SIZE)
    # when the URL count is an exact multiple of CHUNK_SIZE (including 0);
    # confirm against how `procfull` actually splits the work.
    n = len(urls) // CHUNK_SIZE + 1

    print(f"A total of {n} chunks will be retrieved.")
    return procfull(out_fname)(urls)
示例#10
0
def zscore(itr):
    """Standardize a collection of numbers using its mean and `sd`

    Each element is shifted by the mean and divided by the standard
    deviation, using the same `part`/`nest`/`pipe` helpers as the other
    scaling functions.

    :param itr: collection of numbers
    :returns: the standardized values as produced by `pipe`; all 0.0 when
        the standard deviation is zero
    """
    mu = mean(itr)
    s = sd(itr)
    if s == 0:
        # Constant input has zero spread; dividing by it would raise
        # ZeroDivisionError. Every element equals the mean, so map all of
        # them to 0.0. A NaN `s` does not compare equal to 0 and falls
        # through unchanged.
        return pipe(lambda _: 0.0)(itr)
    op = nest(part(sub, mu), part(truediv, s))
    return pipe(op)(itr)
示例#11
0
def var(itr):
    """Variance of a collection of numbers with an (n - 1) denominator

    :param itr: sized collection of numbers
    :returns: the variance, or NaN when `itr` has fewer than two elements
    """
    n = len(itr)
    if n <= 1:
        return float("nan")
    center = mean(itr)
    # Shift each element by the mean, then raise to the power 2.
    squared_dev = nest(part(sub, center), part(pow, 2))
    total = sum(pipe(squared_dev)(itr))
    return total / (n - 1)
示例#12
0
def harmonic_mean(itr):
    """Harmonic mean: `inv` of the mean of the `inv` of each element

    :param itr: collection of numbers
    :returns: the harmonic mean of `itr`
    """
    reciprocals = pipe(inv)(itr)
    return inv(mean(reciprocals))