Exemplo n.º 1
0
def test_count_chunk(gene_counts, disp_adj, sf, dmatrix0, dmatrix1, CFG, idx, log=False):
    """Run a likelihood-ratio test on every row of one chunk of counts.

    For each row, two negative-binomial GLMs are fitted (null design
    ``dmatrix0`` and alternative design ``dmatrix1``), both using the row's
    dispersion as ``alpha`` and ``log(sf)`` as offset.  The deviance
    difference is compared against a chi-square distribution whose degrees
    of freedom are the difference in the number of design columns.

    Parameters:
        gene_counts: 2-D array; row ``i`` is the response vector of test ``i``.
        disp_adj: per-row dispersion; rows with a NaN dispersion are skipped.
        sf: values whose logarithm is used as GLM offset (presumably
            per-sample size factors -- confirm with the caller).
        dmatrix0: design matrix of the null model.
        dmatrix1: design matrix of the alternative model.
        CFG: configuration dict; only ``CFG['max_0_frac']`` is read here.
        idx: 1-D index array; its length is the number of rows processed and
            it is handed back unchanged so a caller can place the results.
        log: when true, progress is reported through ``log_progress``.

    Returns:
        ``(pval, idx)`` where ``pval`` has shape ``(gene_counts.shape[0], 1)``.
        Skipped rows stay NaN; rows rejected by the zero filter get 1.
    """
    pval = sp.zeros((gene_counts.shape[0], 1), dtype='float')
    pval.fill(sp.nan)

    n_rows = idx.shape[0]

    for i in range(n_rows):

        if log:
            log_progress(i, n_rows)

        # no dispersion estimate -> no test, p-value stays NaN
        if sp.isnan(disp_adj[i]):
            continue

        response = gene_counts[i, :].astype('int')
        n_obs = response.shape[0]

        # Only the first half of the columns is screened for zeros.
        # NOTE(review): presumably the first half holds the event counts and
        # the second half the matching gene counts -- confirm with the caller.
        # Floor division keeps the slice bound an integer on Python 2 and 3.
        if sp.sum(response[:n_obs // 2] == 0) >= CFG['max_0_frac'] * n_obs / 2:
            pval[i] = 1
            continue

        modNB0 = sm.GLM(response, dmatrix0, family=sm.families.NegativeBinomial(alpha=disp_adj[i]), offset=sp.log(sf))
        modNB1 = sm.GLM(response, dmatrix1, family=sm.families.NegativeBinomial(alpha=disp_adj[i]), offset=sp.log(sf))
        result0 = modNB0.fit()
        result1 = modNB1.fit()

        # Survival function instead of ``1 - cdf``: the subtraction cancels to
        # exactly 0 once the p-value drops below double precision (~1e-16).
        # Assumes ``chi2`` is scipy.stats.chi2, as the original cdf call implies.
        pval[i] = chi2.sf(result0.deviance - result1.deviance, dmatrix1.shape[1] - dmatrix0.shape[1])

    if log:
        log_progress(n_rows, n_rows)
        print('')

    return (pval, idx)
Exemplo n.º 2
0
def adjust_dispersion_chunk(counts, dmatrix1, disp_raw, disp_fitted, varPrior, sf, CFG, idx, log=False):
    """Estimate adjusted dispersions for one chunk of rows.

    For every row whose raw dispersion is not NaN, the function alternates
    (at most ten times) between fitting a negative-binomial GLM with the
    current dispersion and re-optimising the dispersion on ``(0, 10)`` with
    ``adj_loglikelihood_shrink_scalar_onedisper`` given the fitted means.
    The iteration stops early once the log-dispersion changes by less
    than 1e-4.

    Parameters:
        counts: 2-D array; row ``i`` is the response vector of row ``i``.
        dmatrix1: design matrix used for the GLM fit.
        disp_raw: per-row raw dispersion; NaN rows are skipped.
        disp_fitted: per-row value forwarded to the objective function.
        varPrior: value forwarded unchanged to the objective function.
        sf: values whose logarithm is used as GLM offset (presumably
            per-sample size factors -- confirm with the caller).
        CFG: configuration dict (not read by this function).
        idx: 1-D index array; its length is the number of rows processed and
            it is returned unchanged.
        log: when true, progress is reported through ``log_progress``.

    Returns:
        ``(disp_adj, disp_adj_conv, idx)``.  ``disp_adj`` has shape
        ``(counts.shape[0], 1)`` and is NaN for skipped rows;
        ``disp_adj_conv`` is a boolean array of the same shape that is True
        only where the iteration converged.
    """
    n_rows = idx.shape[0]

    disp_adj = sp.empty((counts.shape[0], 1))
    disp_adj.fill(sp.nan)
    disp_adj_conv = sp.zeros_like(disp_adj, dtype='bool')
    error_cnt = 0
    sign = -1.0

    for i in range(n_rows):

        if log:
            log_progress(i, n_rows)

        # rows without a raw estimate keep NaN / False
        if sp.isnan(disp_raw[i]):
            continue

        ### init dispersion and response
        disp = 0.1
        resp = counts[i, :].astype('int')
        converged = False

        ### run for max 10 iterations
        for _ in range(10):
            modNB = sm.GLM(resp, dmatrix1, family=sm.families.NegativeBinomial(alpha=disp), offset=sp.log(sf))
            result = modNB.fit()

            dispBef = disp
            yhat = result.mu
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                try:
                    res = minimize_scalar(adj_loglikelihood_shrink_scalar_onedisper,
                                          args=(dmatrix1, resp, yhat, disp_fitted[i], varPrior, sign),
                                          method='Bounded',
                                          bounds=(0, 10.0),
                                          tol=1e-5)
                except TypeError:
                    # keep the dispersion of the last successful step and
                    # give up on this row (recorded as not converged)
                    error_cnt += 1
                    break
            disp = res.x

            if abs(sp.log(disp) - sp.log(dispBef)) < 1e-4:
                converged = True
                break

        # reached on convergence, on a TypeError and after ten iterations
        disp_adj[i] = disp
        disp_adj_conv[i] = converged

    # single-argument print(...) behaves identically on Python 2 and 3
    if log:
        log_progress(n_rows, n_rows)
        print('')

    if error_cnt > 0:
        print('Warning: %i events did not fit due to a TypeError' % error_cnt)

    return (disp_adj, disp_adj_conv, idx)
Exemplo n.º 3
0
def adjust_dispersion(counts, dmatrix1, disp_raw, disp_fitted, idx, sf, CFG):
    """Compute adjusted dispersions for all rows, optionally in parallel.

    A prior variance is derived from the raw and fitted dispersions via
    ``calculate_varPrior`` and then ``adjust_dispersion_chunk`` is run either
    on the whole matrix or, when ``CFG['parallel'] > 1``, on chunks of 30
    rows distributed over a process pool.

    Parameters:
        counts: 2-D count array, one row per tested unit.
        dmatrix1: design matrix (rows: samples, columns: coefficients).
        disp_raw: per-row raw dispersions.
        disp_fitted: per-row fitted dispersions.
        idx: forwarded unchanged to ``calculate_varPrior``.
        sf: forwarded to the chunk worker (used there as GLM offset source).
        CFG: configuration dict; reads ``'verbose'``, ``'parallel'``,
            ``'diagnose_plots'`` and, for plotting, ``'plot_dir'``.

    Returns:
        ``(disp_adj, disp_adj_conv)``, each of shape ``(counts.shape[0], 1)``.

    On a KeyboardInterrupt during the parallel run the pool is shut down and
    the process exits with status 1.
    """
    if CFG['verbose']:
        print('Start to estimate adjusted dispersions.')

    ## number of samples - number of coefficients
    resid_dof = dmatrix1.shape[0] - dmatrix1.shape[1]
    # Divide by 2.0: on Python 2 the former ``/ 2`` was an integer division
    # that silently truncated odd degrees of freedom (e.g. 5 / 2 -> 2).
    varLogDispSamp = polygamma(1, resid_dof / 2.0)
    varPrior = calculate_varPrior(disp_raw, disp_fitted, idx, varLogDispSamp)

    n_rows = counts.shape[0]

    if CFG['parallel'] > 1:
        disp_adj = sp.empty((n_rows, 1))
        disp_adj.fill(sp.nan)
        disp_adj_conv = sp.zeros_like(disp_adj, dtype='bool')

        # workers ignore SIGINT so that Ctrl-C is handled by the parent only
        pool = mp.Pool(processes=CFG['parallel'], initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN))
        binsize = 30
        idx_chunks = [sp.arange(x, min(x + binsize, n_rows)) for x in range(0, n_rows, binsize)]

        try:
            result = [pool.apply_async(adjust_dispersion_chunk, args=(counts[cidx, :], dmatrix1, disp_raw[cidx], disp_fitted[cidx], varPrior, sf, CFG, cidx,)) for cidx in idx_chunks]
            res_cnt = 0
            while result:
                tmp = result.pop(0).get()
                # tmp[2] holds the global row indices of this chunk
                for i, j in enumerate(tmp[2]):
                    if CFG['verbose']:
                        log_progress(res_cnt, n_rows)
                        res_cnt += 1
                    disp_adj[j] = tmp[0][i]
                    disp_adj_conv[j] = tmp[1][i]
            if CFG['verbose']:
                log_progress(n_rows, n_rows)
                print('')
        except KeyboardInterrupt:
            sys.stderr.write('Keyboard Interrupt - exiting\n')
            sys.exit(1)
        finally:
            # always tear the pool down, also when a worker raised
            pool.terminate()
            pool.join()
    else:
        (disp_adj, disp_adj_conv, _) = adjust_dispersion_chunk(counts, dmatrix1, disp_raw, disp_fitted, varPrior, sf, CFG, sp.arange(n_rows), log=CFG['verbose'])

    if CFG['diagnose_plots']:
        plot.mean_variance_plot(counts=counts,
                                disp=disp_adj,
                                matrix=dmatrix1,
                                figtitle='Adjusted Dispersion Estimate',
                                filename=os.path.join(CFG['plot_dir'], 'dispersion_adjusted.pdf'),
                                CFG=CFG)

    return (disp_adj, disp_adj_conv)
Exemplo n.º 4
0
def estimate_dispersion_chunk(gene_counts, matrix, sf, CFG, idx, log=False):
    """Estimate raw dispersions for one chunk of rows.

    Rows are skipped (left NaN / not converged) when the sum of
    ``counts / sf`` is below ``CFG['min_count']`` or when more than 60% of
    the counts are zero.  For the remaining rows the function alternates, at
    most ten times, between a negative-binomial GLM fit and a bounded
    optimisation of ``likelihood.adj_loglikelihood_scalar`` on ``(0, 10)``,
    stopping once the log-dispersion changes by less than 1e-4.

    Parameters:
        gene_counts: 2-D array; row ``i`` is the response vector of row ``i``.
        matrix: design matrix used for the GLM fit.
        sf: divisor for the expression filter; its logarithm is the GLM
            offset (presumably per-sample size factors -- confirm).
        CFG: configuration dict; only ``CFG['min_count']`` is read.
        idx: 1-D index array; its length is the number of rows processed and
            it is returned unchanged.
        log: when true, progress is reported through ``log_progress``.

    Returns:
        ``(disp_raw, disp_raw_conv, idx)``; both arrays have shape
        ``(idx.shape[0], 1)``.
    """
    n_rows = idx.shape[0]

    disp_raw = sp.empty((n_rows, 1), dtype='float')
    disp_raw.fill(sp.nan)
    disp_raw_conv = sp.zeros((n_rows, 1), dtype='bool')

    for row in range(n_rows):

        if log:
            log_progress(row, n_rows)

        resp = gene_counts[row, :].astype('int')

        # expression filter: too few scaled counts or too many zeros
        too_low = sum(resp / sf) < CFG['min_count']
        if too_low or sp.mean(resp == 0) > 0.6:
            continue

        current = 0.1
        converged = False

        for _ in range(10):
            family = sm.families.NegativeBinomial(alpha=current)
            fit = sm.GLM(resp, matrix, family=family, offset=sp.log(sf)).fit()

            previous = current
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                opt = minimize_scalar(likelihood.adj_loglikelihood_scalar,
                                      args=(matrix, resp, fit.mu, -1.0),
                                      method='Bounded',
                                      bounds=(0, 10.0),
                                      tol=1e-5)
            current = opt.x

            if abs(sp.log(current) - sp.log(previous)) < 1e-4:
                converged = True
                break

        # the last estimate is stored whether or not the loop converged
        disp_raw[row] = current
        disp_raw_conv[row] = converged

    if log:
        log_progress(n_rows, n_rows)

    return (disp_raw, disp_raw_conv, idx)
Exemplo n.º 5
0
def estimate_dispersion(gene_counts, matrix, sf, CFG):
    """Estimate raw dispersions for all rows, optionally in parallel.

    Runs ``estimate_dispersion_chunk`` either on the whole matrix or, when
    ``CFG['parallel'] > 1``, on chunks of 30 rows distributed over a process
    pool, and merges the per-chunk results by their row indices.

    Parameters:
        gene_counts: 2-D count array, one row per tested unit.
        matrix: design matrix forwarded to the chunk worker.
        sf: forwarded to the chunk worker.
        CFG: configuration dict; reads ``'verbose'``, ``'parallel'``,
            ``'diagnose_plots'`` and, for plotting, ``'plot_dir'``.

    Returns:
        ``(disp_raw, disp_raw_conv)``, each of shape
        ``(gene_counts.shape[0], 1)``.

    On a KeyboardInterrupt during the parallel run the pool is shut down and
    the process exits with status 1.
    """
    if CFG['verbose']:
        print('Estimating raw dispersions')

    n_rows = gene_counts.shape[0]

    if CFG['parallel'] > 1:
        disp_raw = sp.empty((n_rows, 1), dtype='float')
        disp_raw.fill(sp.nan)
        disp_raw_conv = sp.zeros((n_rows, 1), dtype='bool')

        # workers ignore SIGINT so that Ctrl-C is handled by the parent only
        pool = mp.Pool(processes=CFG['parallel'], initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN))
        binsize = 30
        idx_chunks = [sp.arange(x, min(x + binsize, n_rows)) for x in range(0, n_rows, binsize)]

        try:
            result = [pool.apply_async(estimate_dispersion_chunk, args=(gene_counts[cidx, :], matrix, sf, CFG, cidx,)) for cidx in idx_chunks]
            res_cnt = 0
            while result:
                tmp = result.pop(0).get()
                # tmp[2] holds the global row indices of this chunk
                for i, j in enumerate(tmp[2]):
                    if CFG['verbose']:
                        log_progress(res_cnt, n_rows)
                        res_cnt += 1
                    disp_raw[j] = tmp[0][i]
                    disp_raw_conv[j] = tmp[1][i]
            if CFG['verbose']:
                log_progress(n_rows, n_rows)
                print('')
        except KeyboardInterrupt:
            sys.stderr.write('Keyboard Interrupt - exiting\n')
            sys.exit(1)
        finally:
            # always tear the pool down, also when a worker raised
            pool.terminate()
            pool.join()
    else:
        (disp_raw, disp_raw_conv, _) = estimate_dispersion_chunk(gene_counts, matrix, sf, CFG, sp.arange(n_rows), log=CFG['verbose'])

    if CFG['diagnose_plots']:
        plot.mean_variance_plot(counts=gene_counts,
                                disp=disp_raw,
                                matrix=matrix,
                                figtitle='Raw Dispersion Estimate',
                                filename=os.path.join(CFG['plot_dir'], 'dispersion_raw.pdf'),
                                CFG=CFG)

    return (disp_raw, disp_raw_conv)
Exemplo n.º 6
0
def test_count_chunk(gene_counts,
                     disp_adj,
                     sf,
                     dmatrix0,
                     dmatrix1,
                     CFG,
                     idx,
                     log=False):
    """Run a likelihood-ratio test on every row of one chunk of counts.

    For each row, two negative-binomial GLMs are fitted (null design
    ``dmatrix0`` and alternative design ``dmatrix1``), both using the row's
    dispersion as ``alpha`` and ``log(sf)`` as offset.  The deviance
    difference is compared against a chi-square distribution whose degrees
    of freedom are the difference in the number of design columns.

    Parameters:
        gene_counts: 2-D array; row ``i`` is the response vector of test ``i``.
        disp_adj: per-row dispersion; rows with a NaN dispersion are skipped.
        sf: values whose logarithm is used as GLM offset (presumably
            per-sample size factors -- confirm with the caller).
        dmatrix0: design matrix of the null model.
        dmatrix1: design matrix of the alternative model.
        CFG: configuration dict; only ``CFG['max_0_frac']`` is read here.
        idx: 1-D index array; its length is the number of rows processed and
            it is handed back unchanged so a caller can place the results.
        log: when true, progress is reported through ``log_progress``.

    Returns:
        ``(pval, idx)`` where ``pval`` has shape ``(gene_counts.shape[0], 1)``.
        Skipped rows stay NaN; rows rejected by the zero filter get 1.
    """
    pval = sp.zeros((gene_counts.shape[0], 1), dtype='float')
    pval.fill(sp.nan)

    n_rows = idx.shape[0]

    for i in range(n_rows):

        if log:
            log_progress(i, n_rows)

        # no dispersion estimate -> no test, p-value stays NaN
        if sp.isnan(disp_adj[i]):
            continue

        response = gene_counts[i, :].astype('int')
        n_obs = response.shape[0]

        # Only the first half of the columns is screened for zeros.
        # NOTE(review): presumably the first half holds the event counts and
        # the second half the matching gene counts -- confirm with the caller.
        # Floor division keeps the slice bound an integer on Python 2 and 3.
        n_zero = sp.sum(response[:n_obs // 2] == 0)
        if n_zero >= CFG['max_0_frac'] * n_obs / 2:
            pval[i] = 1
            continue

        modNB0 = sm.GLM(response,
                        dmatrix0,
                        family=sm.families.NegativeBinomial(alpha=disp_adj[i]),
                        offset=sp.log(sf))
        modNB1 = sm.GLM(response,
                        dmatrix1,
                        family=sm.families.NegativeBinomial(alpha=disp_adj[i]),
                        offset=sp.log(sf))
        result0 = modNB0.fit()
        result1 = modNB1.fit()

        # Survival function instead of ``1 - cdf``: the subtraction cancels to
        # exactly 0 once the p-value drops below double precision (~1e-16).
        # Assumes ``chi2`` is scipy.stats.chi2, as the original cdf call implies.
        pval[i] = chi2.sf(result0.deviance - result1.deviance,
                          dmatrix1.shape[1] - dmatrix0.shape[1])

    if log:
        log_progress(n_rows, n_rows)
        print('')

    return (pval, idx)
Exemplo n.º 7
0
def test_count(gene_counts, disp_adj, sf, dmatrix0, dmatrix1, CFG):
    """Compute one p-value per row, optionally in parallel.

    Runs ``test_count_chunk`` either on the whole matrix or, when
    ``CFG['parallel'] > 1``, on chunks of 30 rows distributed over a process
    pool, and merges the per-chunk p-values by their row indices.

    Parameters:
        gene_counts: 2-D count array, one row per test.
        disp_adj: per-row dispersions forwarded to the chunk worker.
        sf: forwarded to the chunk worker.
        dmatrix0: design matrix of the null model.
        dmatrix1: design matrix of the alternative model.
        CFG: configuration dict; reads ``'verbose'`` and ``'parallel'`` here.

    Returns:
        ``pval`` array of shape ``(gene_counts.shape[0], 1)``.

    On a KeyboardInterrupt during the parallel run the pool is shut down and
    the process exits with status 1.
    """
    if CFG['verbose']:
        print('Start the statistical test.')

    n_rows = gene_counts.shape[0]

    if CFG['parallel'] > 1:
        pval = sp.zeros((n_rows, 1), dtype='float')
        pval.fill(sp.nan)

        # workers ignore SIGINT so that Ctrl-C is handled by the parent only
        pool = mp.Pool(processes=CFG['parallel'], initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN))
        binsize = 30
        idx_chunks = [sp.arange(x, min(x + binsize, n_rows)) for x in range(0, n_rows, binsize)]

        try:
            result = [pool.apply_async(test_count_chunk, args=(gene_counts[cidx, :], disp_adj[cidx], sf, dmatrix0, dmatrix1, CFG, cidx,)) for cidx in idx_chunks]
            res_cnt = 0
            while result:
                tmp = result.pop(0).get()
                # tmp[1] holds the global row indices of this chunk
                for i, j in enumerate(tmp[1]):
                    if CFG['verbose']:
                        log_progress(res_cnt, n_rows)
                        res_cnt += 1
                    pval[j] = tmp[0][i]
            if CFG['verbose']:
                log_progress(n_rows, n_rows)
                print('')
        except KeyboardInterrupt:
            sys.stderr.write('Keyboard Interrupt - exiting\n')
            sys.exit(1)
        finally:
            # always tear the pool down, also when a worker raised
            pool.terminate()
            pool.join()
    else:
        (pval, _) = test_count_chunk(gene_counts, disp_adj, sf, dmatrix0, dmatrix1, CFG, sp.arange(n_rows), log=CFG['verbose'])

    if CFG['verbose']:
        print('')

    return pval
Exemplo n.º 8
0
def estimate_dispersion_chunk(gene_counts, matrix, sf, CFG, idx, log=False):
    """Estimate raw dispersions for one chunk of rows.

    Rows are skipped (left NaN / not converged) when the sum of
    ``counts / sf`` is below ``CFG['min_count']`` or when more than 60% of
    the counts are zero.  For the remaining rows the function alternates, at
    most ten times, between a negative-binomial GLM fit and a bounded
    optimisation of ``likelihood.adj_loglikelihood_scalar`` on ``(0, 10)``,
    stopping once the log-dispersion changes by less than 1e-4.

    Parameters:
        gene_counts: 2-D array; row ``i`` is the response vector of row ``i``.
        matrix: design matrix used for the GLM fit.
        sf: divisor for the expression filter; its logarithm is the GLM
            offset (presumably per-sample size factors -- confirm).
        CFG: configuration dict; only ``CFG['min_count']`` is read.
        idx: 1-D index array; its length is the number of rows processed and
            it is returned unchanged.
        log: when true, progress is reported through ``log_progress``.

    Returns:
        ``(disp_raw, disp_raw_conv, idx)``; both arrays have shape
        ``(idx.shape[0], 1)``.
    """
    n_rows = idx.shape[0]

    disp_raw = sp.empty((n_rows, 1), dtype='float')
    disp_raw.fill(sp.nan)
    disp_raw_conv = sp.zeros((n_rows, 1), dtype='bool')

    for row in range(n_rows):

        if log:
            log_progress(row, n_rows)

        resp = gene_counts[row, :].astype('int')

        # expression filter: too few scaled counts or too many zeros
        too_low = sum(resp / sf) < CFG['min_count']
        if too_low or sp.mean(resp == 0) > 0.6:
            continue

        current = 0.1
        converged = False

        for _ in range(10):
            family = sm.families.NegativeBinomial(alpha=current)
            fit = sm.GLM(resp, matrix, family=family, offset=sp.log(sf)).fit()

            previous = current
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                opt = minimize_scalar(likelihood.adj_loglikelihood_scalar,
                                      args=(matrix, resp, fit.mu, -1.0),
                                      method='Bounded',
                                      bounds=(0, 10.0),
                                      tol=1e-5)
            current = opt.x

            if abs(sp.log(current) - sp.log(previous)) < 1e-4:
                converged = True
                break

        # the last estimate is stored whether or not the loop converged
        disp_raw[row] = current
        disp_raw_conv[row] = converged

    if log:
        log_progress(n_rows, n_rows)

    return (disp_raw, disp_raw_conv, idx)
Exemplo n.º 9
0
def test_count(gene_counts, disp_adj, sf, dmatrix0, dmatrix1, CFG):
    """Compute one p-value per row, optionally in parallel.

    Runs ``test_count_chunk`` either on the whole matrix or, when
    ``CFG['parallel'] > 1``, on chunks of 30 rows distributed over a process
    pool, and merges the per-chunk p-values by their row indices.

    Parameters:
        gene_counts: 2-D count array, one row per test.
        disp_adj: per-row dispersions forwarded to the chunk worker.
        sf: forwarded to the chunk worker.
        dmatrix0: design matrix of the null model.
        dmatrix1: design matrix of the alternative model.
        CFG: configuration dict; reads ``'verbose'`` and ``'parallel'`` here.

    Returns:
        ``pval`` array of shape ``(gene_counts.shape[0], 1)``.

    On a KeyboardInterrupt during the parallel run the pool is shut down and
    the process exits with status 1.
    """
    if CFG['verbose']:
        print('Start the statistical test.')

    n_rows = gene_counts.shape[0]

    if CFG['parallel'] > 1:
        pval = sp.zeros((n_rows, 1), dtype='float')
        pval.fill(sp.nan)

        # workers ignore SIGINT so that Ctrl-C is handled by the parent only
        pool = mp.Pool(processes=CFG['parallel'],
                       initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN))
        binsize = 30
        idx_chunks = [
            sp.arange(x, min(x + binsize, n_rows))
            for x in range(0, n_rows, binsize)
        ]

        try:
            result = [
                pool.apply_async(test_count_chunk,
                                 args=(
                                     gene_counts[cidx, :],
                                     disp_adj[cidx],
                                     sf,
                                     dmatrix0,
                                     dmatrix1,
                                     CFG,
                                     cidx,
                                 )) for cidx in idx_chunks
            ]
            res_cnt = 0
            while result:
                tmp = result.pop(0).get()
                # tmp[1] holds the global row indices of this chunk
                for i, j in enumerate(tmp[1]):
                    if CFG['verbose']:
                        log_progress(res_cnt, n_rows)
                        res_cnt += 1
                    pval[j] = tmp[0][i]
            if CFG['verbose']:
                log_progress(n_rows, n_rows)
                print('')
        except KeyboardInterrupt:
            sys.stderr.write('Keyboard Interrupt - exiting\n')
            sys.exit(1)
        finally:
            # always tear the pool down, also when a worker raised
            pool.terminate()
            pool.join()
    else:
        (pval, _) = test_count_chunk(gene_counts,
                                     disp_adj,
                                     sf,
                                     dmatrix0,
                                     dmatrix1,
                                     CFG,
                                     sp.arange(n_rows),
                                     log=CFG['verbose'])

    if CFG['verbose']:
        print('')

    return pval
Exemplo n.º 10
0
def adjust_dispersion(counts, dmatrix1, disp_raw, disp_fitted, idx, sf, CFG):
    """Compute adjusted dispersions for all rows, optionally in parallel.

    A prior variance is derived from the raw and fitted dispersions via
    ``calculate_varPrior`` and then ``adjust_dispersion_chunk`` is run either
    on the whole matrix or, when ``CFG['parallel'] > 1``, on chunks of 30
    rows distributed over a process pool.

    Parameters:
        counts: 2-D count array, one row per tested unit.
        dmatrix1: design matrix (rows: samples, columns: coefficients).
        disp_raw: per-row raw dispersions.
        disp_fitted: per-row fitted dispersions.
        idx: forwarded unchanged to ``calculate_varPrior``.
        sf: forwarded to the chunk worker (used there as GLM offset source).
        CFG: configuration dict; reads ``'verbose'``, ``'parallel'``,
            ``'diagnose_plots'`` and, for plotting, ``'plot_dir'``.

    Returns:
        ``(disp_adj, disp_adj_conv)``, each of shape ``(counts.shape[0], 1)``.

    On a KeyboardInterrupt during the parallel run the pool is shut down and
    the process exits with status 1.
    """
    if CFG['verbose']:
        print('Start to estimate adjusted dispersions.')

    ## number of samples - number of coefficients
    resid_dof = dmatrix1.shape[0] - dmatrix1.shape[1]
    # Divide by 2.0: on Python 2 the former ``/ 2`` was an integer division
    # that silently truncated odd degrees of freedom (e.g. 5 / 2 -> 2).
    varLogDispSamp = polygamma(1, resid_dof / 2.0)
    varPrior = calculate_varPrior(disp_raw, disp_fitted, idx, varLogDispSamp)

    n_rows = counts.shape[0]

    if CFG['parallel'] > 1:
        disp_adj = sp.empty((n_rows, 1))
        disp_adj.fill(sp.nan)
        disp_adj_conv = sp.zeros_like(disp_adj, dtype='bool')

        # workers ignore SIGINT so that Ctrl-C is handled by the parent only
        pool = mp.Pool(processes=CFG['parallel'],
                       initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN))
        binsize = 30
        idx_chunks = [
            sp.arange(x, min(x + binsize, n_rows))
            for x in range(0, n_rows, binsize)
        ]

        try:
            result = [
                pool.apply_async(adjust_dispersion_chunk,
                                 args=(
                                     counts[cidx, :],
                                     dmatrix1,
                                     disp_raw[cidx],
                                     disp_fitted[cidx],
                                     varPrior,
                                     sf,
                                     CFG,
                                     cidx,
                                 )) for cidx in idx_chunks
            ]
            res_cnt = 0
            while result:
                tmp = result.pop(0).get()
                # tmp[2] holds the global row indices of this chunk
                for i, j in enumerate(tmp[2]):
                    if CFG['verbose']:
                        log_progress(res_cnt, n_rows)
                        res_cnt += 1
                    disp_adj[j] = tmp[0][i]
                    disp_adj_conv[j] = tmp[1][i]
            if CFG['verbose']:
                log_progress(n_rows, n_rows)
                print('')
        except KeyboardInterrupt:
            sys.stderr.write('Keyboard Interrupt - exiting\n')
            sys.exit(1)
        finally:
            # always tear the pool down, also when a worker raised
            pool.terminate()
            pool.join()
    else:
        (disp_adj, disp_adj_conv,
         _) = adjust_dispersion_chunk(counts,
                                      dmatrix1,
                                      disp_raw,
                                      disp_fitted,
                                      varPrior,
                                      sf,
                                      CFG,
                                      sp.arange(n_rows),
                                      log=CFG['verbose'])

    if CFG['diagnose_plots']:
        plot.mean_variance_plot(counts=counts,
                                disp=disp_adj,
                                matrix=dmatrix1,
                                figtitle='Adjusted Dispersion Estimate',
                                filename=os.path.join(
                                    CFG['plot_dir'],
                                    'dispersion_adjusted.pdf'),
                                CFG=CFG)

    return (disp_adj, disp_adj_conv)
Exemplo n.º 11
0
def adjust_dispersion_chunk(counts,
                            dmatrix1,
                            disp_raw,
                            disp_fitted,
                            varPrior,
                            sf,
                            CFG,
                            idx,
                            log=False):
    """Estimate adjusted dispersions for one chunk of rows.

    For every row whose raw dispersion is not NaN, the function alternates
    (at most ten times) between fitting a negative-binomial GLM with the
    current dispersion and re-optimising the dispersion on ``(0, 10)`` with
    ``adj_loglikelihood_shrink_scalar_onedisper`` given the fitted means.
    The iteration stops early once the log-dispersion changes by less
    than 1e-4.

    Parameters:
        counts: 2-D array; row ``i`` is the response vector of row ``i``.
        dmatrix1: design matrix used for the GLM fit.
        disp_raw: per-row raw dispersion; NaN rows are skipped.
        disp_fitted: per-row value forwarded to the objective function.
        varPrior: value forwarded unchanged to the objective function.
        sf: values whose logarithm is used as GLM offset (presumably
            per-sample size factors -- confirm with the caller).
        CFG: configuration dict (not read by this function).
        idx: 1-D index array; its length is the number of rows processed and
            it is returned unchanged.
        log: when true, progress is reported through ``log_progress``.

    Returns:
        ``(disp_adj, disp_adj_conv, idx)``.  ``disp_adj`` has shape
        ``(counts.shape[0], 1)`` and is NaN for skipped rows;
        ``disp_adj_conv`` is a boolean array of the same shape that is True
        only where the iteration converged.
    """
    n_rows = idx.shape[0]

    disp_adj = sp.empty((counts.shape[0], 1))
    disp_adj.fill(sp.nan)
    disp_adj_conv = sp.zeros_like(disp_adj, dtype='bool')
    error_cnt = 0
    sign = -1.0

    for i in range(n_rows):

        if log:
            log_progress(i, n_rows)

        # rows without a raw estimate keep NaN / False
        if sp.isnan(disp_raw[i]):
            continue

        ### init dispersion and response
        disp = 0.1
        resp = counts[i, :].astype('int')
        converged = False

        ### run for max 10 iterations
        for _ in range(10):
            modNB = sm.GLM(resp,
                           dmatrix1,
                           family=sm.families.NegativeBinomial(alpha=disp),
                           offset=sp.log(sf))
            result = modNB.fit()

            dispBef = disp
            yhat = result.mu
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                try:
                    res = minimize_scalar(
                        adj_loglikelihood_shrink_scalar_onedisper,
                        args=(dmatrix1, resp, yhat, disp_fitted[i],
                              varPrior, sign),
                        method='Bounded',
                        bounds=(0, 10.0),
                        tol=1e-5)
                except TypeError:
                    # keep the dispersion of the last successful step and
                    # give up on this row (recorded as not converged)
                    error_cnt += 1
                    break
            disp = res.x

            if abs(sp.log(disp) - sp.log(dispBef)) < 1e-4:
                converged = True
                break

        # reached on convergence, on a TypeError and after ten iterations
        disp_adj[i] = disp
        disp_adj_conv[i] = converged

    # single-argument print(...) behaves identically on Python 2 and 3
    if log:
        log_progress(n_rows, n_rows)
        print('')

    if error_cnt > 0:
        print('Warning: %i events did not fit due to a TypeError' % error_cnt)

    return (disp_adj, disp_adj_conv, idx)
Exemplo n.º 12
0
def estimate_dispersion(gene_counts, matrix, sf, CFG):
    """Estimate raw dispersions for all rows, optionally in parallel.

    Runs ``estimate_dispersion_chunk`` either on the whole matrix or, when
    ``CFG['parallel'] > 1``, on chunks of 30 rows distributed over a process
    pool, and merges the per-chunk results by their row indices.

    Parameters:
        gene_counts: 2-D count array, one row per tested unit.
        matrix: design matrix forwarded to the chunk worker.
        sf: forwarded to the chunk worker.
        CFG: configuration dict; reads ``'verbose'``, ``'parallel'``,
            ``'diagnose_plots'`` and, for plotting, ``'plot_dir'``.

    Returns:
        ``(disp_raw, disp_raw_conv)``, each of shape
        ``(gene_counts.shape[0], 1)``.

    On a KeyboardInterrupt during the parallel run the pool is shut down and
    the process exits with status 1.
    """
    if CFG['verbose']:
        print('Estimating raw dispersions')

    n_rows = gene_counts.shape[0]

    if CFG['parallel'] > 1:
        disp_raw = sp.empty((n_rows, 1), dtype='float')
        disp_raw.fill(sp.nan)
        disp_raw_conv = sp.zeros((n_rows, 1), dtype='bool')

        # workers ignore SIGINT so that Ctrl-C is handled by the parent only
        pool = mp.Pool(processes=CFG['parallel'],
                       initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN))
        binsize = 30
        idx_chunks = [
            sp.arange(x, min(x + binsize, n_rows))
            for x in range(0, n_rows, binsize)
        ]

        try:
            result = [
                pool.apply_async(estimate_dispersion_chunk,
                                 args=(
                                     gene_counts[cidx, :],
                                     matrix,
                                     sf,
                                     CFG,
                                     cidx,
                                 )) for cidx in idx_chunks
            ]
            res_cnt = 0
            while result:
                tmp = result.pop(0).get()
                # tmp[2] holds the global row indices of this chunk
                for i, j in enumerate(tmp[2]):
                    if CFG['verbose']:
                        log_progress(res_cnt, n_rows)
                        res_cnt += 1
                    disp_raw[j] = tmp[0][i]
                    disp_raw_conv[j] = tmp[1][i]
            if CFG['verbose']:
                log_progress(n_rows, n_rows)
                print('')
        except KeyboardInterrupt:
            sys.stderr.write('Keyboard Interrupt - exiting\n')
            sys.exit(1)
        finally:
            # always tear the pool down, also when a worker raised
            pool.terminate()
            pool.join()
    else:
        (disp_raw, disp_raw_conv,
         _) = estimate_dispersion_chunk(gene_counts,
                                        matrix,
                                        sf,
                                        CFG,
                                        sp.arange(n_rows),
                                        log=CFG['verbose'])

    if CFG['diagnose_plots']:
        plot.mean_variance_plot(counts=gene_counts,
                                disp=disp_raw,
                                matrix=matrix,
                                figtitle='Raw Dispersion Estimate',
                                filename=os.path.join(CFG['plot_dir'],
                                                      'dispersion_raw.pdf'),
                                CFG=CFG)

    return (disp_raw, disp_raw_conv)
Exemplo n.º 13
0
def get_gene_expression(CFG, fn_out=None, strain_subset=None):
    """Quantify per-gene expression from the segment counts in an HDF5 file.

    Gene objects are loaded from ``CFG['fname_genes']`` (MATLAB file or
    pickle, depending on ``CFG['is_matlab']``) and segment counts from
    ``CFG['fname_count_in']``.  For every gene the segment counts are
    weighted by segment length, summed and divided by
    ``CFG['read_length']``.  With ``CFG['non_alt_norm']`` only the segments
    returned by the gene's non-alternative-segment lookup are used.

    Parameters:
        CFG: configuration dict; reads ``'verbose'``, ``'is_matlab'``,
            ``'fname_genes'``, ``'fname_count_in'``, ``'non_alt_norm'`` and
            ``'read_length'``.
        fn_out: optional path; when given, the strains (restricted to the
            selected subset), gene names and counts are written to it as
            HDF5.
        strain_subset: optional collection of strain names; only matching
            columns are quantified.  ``None`` selects all strains.

    Returns:
        ``(gene_counts, strains, gene_names)``.  ``gene_counts`` has one row
        per gene and one column per selected strain.  Note that ``strains``
        is the full array read from the input file, not the subset.
    """
    if CFG['verbose']:
        sys.stdout.write('Quantifying gene expression ...\n')

    ### load gene information
    if CFG['is_matlab']:
        genes = scio.loadmat(CFG['fname_genes'],
                             struct_as_record=False)['genes'][0, :]
        numgenes = len(genes)
    else:
        # NOTE: unpickling executes arbitrary code -- only load trusted files.
        # Binary mode plus a context manager: pickles are binary data and the
        # handle was previously never closed.
        with open(CFG['fname_genes'], 'rb') as genes_fh:
            genes = cPickle.load(genes_fh)[0]
        numgenes = genes.shape[0]

    ### open hdf5 file containing graph count information
    IN = h5py.File(CFG['fname_count_in'], 'r')
    try:
        strains = IN['strains'][:].astype('str')
        if strain_subset is None:
            strain_idx = sp.arange(strains.shape[0])
        else:
            strain_idx = sp.where(sp.in1d(strains, strain_subset))[0]
        gene_counts = sp.zeros((numgenes, strain_idx.shape[0]), dtype='float')
        gene_names = sp.array([x.name for x in genes], dtype='str')

        if CFG['is_matlab']:
            seg_lens = IN['seg_len'][:, 0]
            gene_ids_segs = IN['gene_ids_segs'][0, :].astype('int') - 1
        else:
            seg_lens = IN['seg_len'][:]
            gene_ids_segs = IN['gene_ids_segs'][:].astype('int')

        ### no longer assume that the gene_ids_segs are sorted by gene ID
        ### (stable sort, then keep the first segment position of every gene)
        s_idx = sp.argsort(gene_ids_segs[:, 0], kind='mergesort')
        _, u_idx = sp.unique(gene_ids_segs[s_idx, 0], return_index=True)
        s_idx = s_idx[u_idx]

        ### iterate over genes
        for gidx, iidx in enumerate(s_idx):

            if CFG['verbose']:
                log_progress(gidx, numgenes, 100)

            ### get idx of non alternative segments
            if CFG['is_matlab']:
                non_alt_idx = get_non_alt_seg_ids_matlab(genes[gidx])
                seg_idx = sp.arange(
                    iidx, iidx + genes[gidx].segmentgraph[0, 2].shape[0])
                if len(seg_idx) == 0:
                    continue
            else:
                non_alt_idx = genes[gidx].get_non_alt_seg_ids()
                seg_idx = sp.arange(
                    iidx, iidx + genes[gidx].segmentgraph.seg_edges.shape[0])

            gene_idx = gene_ids_segs[seg_idx]
            if len(gene_idx.shape) > 0:
                gene_idx = gene_idx[0]

            ### sanity checks: count file and gene list must agree on the gene
            if CFG['is_matlab']:
                assert (IN['gene_names'][gene_idx] == genes[gidx].name)
            else:
                assert (IN['gene_names'][:][gene_idx] == genes[gidx].name)
            assert (genes[gidx].name == gene_names[gidx])

            if CFG['non_alt_norm']:
                seg_idx = seg_idx[non_alt_idx]

            ### compute gene expression as the read count over all non alternative segments
            if CFG['is_matlab']:
                gene_counts[gidx, :] = sp.dot(
                    IN['segments'][:, seg_idx][strain_idx],
                    seg_lens[seg_idx]) / CFG['read_length']
            else:
                if seg_idx.shape[0] > 1:
                    gene_counts[gidx, :] = sp.dot(
                        IN['segments'][seg_idx, :][:, strain_idx].T,
                        seg_lens[seg_idx, 0]) / CFG['read_length']
                else:
                    # NOTE(review): the single-segment case is indexed
                    # differently from the multi-segment case; this looks
                    # tied to how h5py shapes a one-element selection --
                    # confirm before touching.
                    gene_counts[gidx, :] = IN['segments'][
                        seg_idx, :][strain_idx] * seg_lens[
                            seg_idx, 0] / CFG['read_length']
    finally:
        # release the input file even if a lookup or assertion fails
        IN.close()

    if CFG['verbose']:
        sys.stdout.write('\n... done.\n')

    ### write results to hdf5
    if fn_out is not None:
        OUT = h5py.File(fn_out, 'w')
        try:
            OUT.create_dataset(name='strains', data=strains[strain_idx])
            OUT.create_dataset(name='genes', data=gene_names)
            OUT.create_dataset(name='raw_count',
                               data=gene_counts,
                               compression="gzip")
        finally:
            OUT.close()

    return (gene_counts, strains, gene_names)
Exemplo n.º 14
0
def get_gene_expression(CFG, fn_out=None, strain_subset=None):
    """Quantify per-gene expression from the segment counts in an HDF5 file.

    Gene objects are loaded from ``CFG['fname_genes']`` (MATLAB file or
    pickle, depending on ``CFG['is_matlab']``) and segment counts from
    ``CFG['fname_count_in']``.  For every gene the segment counts are
    weighted by segment length, summed and divided by
    ``CFG['read_length']``.  With ``CFG['non_alt_norm']`` only the segments
    returned by the gene's non-alternative-segment lookup are used.

    Parameters:
        CFG: configuration dict; reads ``'verbose'``, ``'is_matlab'``,
            ``'fname_genes'``, ``'fname_count_in'``, ``'non_alt_norm'`` and
            ``'read_length'``.
        fn_out: optional path; when given, the strains (restricted to the
            selected subset), gene names and counts are written to it as
            HDF5.
        strain_subset: optional collection of strain names; only matching
            columns are quantified.  ``None`` selects all strains.

    Returns:
        ``(gene_counts, strains, gene_names)``.  ``gene_counts`` has one row
        per gene and one column per selected strain.  Note that ``strains``
        is the full array read from the input file, not the subset.
    """
    if CFG['verbose']:
        sys.stdout.write('Quantifying gene expression ...\n')

    ### load gene information
    if CFG['is_matlab']:
        genes = scio.loadmat(CFG['fname_genes'], struct_as_record=False)['genes'][0, :]
        numgenes = len(genes)
    else:
        # NOTE: unpickling executes arbitrary code -- only load trusted files.
        # Binary mode plus a context manager: pickles are binary data and the
        # handle was previously never closed.
        with open(CFG['fname_genes'], 'rb') as genes_fh:
            genes = cPickle.load(genes_fh)[0]
        numgenes = genes.shape[0]

    ### open hdf5 file containing graph count information
    IN = h5py.File(CFG['fname_count_in'], 'r')
    try:
        strains = IN['strains'][:].astype('str')
        if strain_subset is None:
            strain_idx = sp.arange(strains.shape[0])
        else:
            strain_idx = sp.where(sp.in1d(strains, strain_subset))[0]
        gene_counts = sp.zeros((numgenes, strain_idx.shape[0]), dtype='float')
        gene_names = sp.array([x.name for x in genes], dtype='str')

        if CFG['is_matlab']:
            seg_lens = IN['seg_len'][:, 0]
            gene_ids_segs = IN['gene_ids_segs'][0, :].astype('int') - 1
        else:
            seg_lens = IN['seg_len'][:]
            gene_ids_segs = IN['gene_ids_segs'][:].astype('int')

        ### no longer assume that the gene_ids_segs are sorted by gene ID
        ### (stable sort, then keep the first segment position of every gene)
        s_idx = sp.argsort(gene_ids_segs[:, 0], kind='mergesort')
        _, u_idx = sp.unique(gene_ids_segs[s_idx, 0], return_index=True)
        s_idx = s_idx[u_idx]

        ### iterate over genes
        for gidx, iidx in enumerate(s_idx):

            if CFG['verbose']:
                log_progress(gidx, numgenes, 100)

            ### get idx of non alternative segments
            if CFG['is_matlab']:
                non_alt_idx = get_non_alt_seg_ids_matlab(genes[gidx])
                seg_idx = sp.arange(iidx, iidx + genes[gidx].segmentgraph[0, 2].shape[0])
                if len(seg_idx) == 0:
                    continue
            else:
                non_alt_idx = genes[gidx].get_non_alt_seg_ids()
                seg_idx = sp.arange(iidx, iidx + genes[gidx].segmentgraph.seg_edges.shape[0])

            gene_idx = gene_ids_segs[seg_idx]
            if len(gene_idx.shape) > 0:
                gene_idx = gene_idx[0]

            ### sanity checks: count file and gene list must agree on the gene
            if CFG['is_matlab']:
                assert(IN['gene_names'][gene_idx] == genes[gidx].name)
            else:
                assert(IN['gene_names'][:][gene_idx] == genes[gidx].name)
            assert(genes[gidx].name == gene_names[gidx])

            if CFG['non_alt_norm']:
                seg_idx = seg_idx[non_alt_idx]

            ### compute gene expression as the read count over all non alternative segments
            if CFG['is_matlab']:
                gene_counts[gidx, :] = sp.dot(IN['segments'][:, seg_idx][strain_idx], seg_lens[seg_idx]) / CFG['read_length']
            else:
                if seg_idx.shape[0] > 1:
                    gene_counts[gidx, :] = sp.dot(IN['segments'][seg_idx, :][:, strain_idx].T, seg_lens[seg_idx, 0]) / CFG['read_length']
                else:
                    # NOTE(review): the single-segment case is indexed
                    # differently from the multi-segment case; this looks
                    # tied to how h5py shapes a one-element selection --
                    # confirm before touching.
                    gene_counts[gidx, :] = IN['segments'][seg_idx, :][strain_idx] * seg_lens[seg_idx, 0] / CFG['read_length']
    finally:
        # release the input file even if a lookup or assertion fails
        IN.close()

    if CFG['verbose']:
        sys.stdout.write('\n... done.\n')

    ### write results to hdf5
    if fn_out is not None:
        OUT = h5py.File(fn_out, 'w')
        try:
            OUT.create_dataset(name='strains', data=strains[strain_idx])
            OUT.create_dataset(name='genes', data=gene_names)
            OUT.create_dataset(name='raw_count', data=gene_counts, compression="gzip")
        finally:
            OUT.close()

    return (gene_counts, strains, gene_names)