Example #1
0
def order_pieces_prob(face, pieces, vtxsum, vtxocc):
    """Order candidate pieces for a face by the product of per-vertex value probabilities."""
    import operator
    from functools import reduce
    from utils import hist
    mul = operator.mul
    hists = {v: hist(v - vtxsum[v], 5 - vtxocc[v]) for v in face}
    opieces = []
    for piece, rots in pieces.items():
        for rot in rots:
            rpiece = rot_piece(piece, rot)
            score = reduce(mul, (hists[v][p] for v,p in zip(face, rpiece)), 1.0)
            assert 0 <= score <= 1.0
            opieces.append((score, piece, rpiece))
    opieces.sort(reverse=True)
    return opieces
Example #2
0
def order_pieces_prob(face, pieces, vtxsum, vtxocc):
    """Order candidate pieces for a face by the product of per-vertex value probabilities."""
    import operator
    from functools import reduce
    from utils import hist
    mul = operator.mul
    hists = {v: hist(v - vtxsum[v], 5 - vtxocc[v]) for v in face}
    opieces = []
    for piece, rots in pieces.items():
        for rot in rots:
            rpiece = rot_piece(piece, rot)
            score = reduce(mul, (hists[v][p] for v, p in zip(face, rpiece)),
                           1.0)
            assert 0 <= score <= 1.0
            opieces.append((score, piece, rpiece))
    opieces.sort(reverse=True)
    return opieces
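Both variants above assume a utils.hist helper that, given a vertex's remaining target sum (v - vtxsum[v]) and its number of still-empty slots (5 - vtxocc[v]), returns a mapping from a candidate value p to a probability in [0, 1]; that is what hists[v][p] indexes and what keeps the asserted score inside [0, 1]. A brute-force sketch under that reading follows; the per-slot value range and the interpretation itself are assumptions, not taken from the snippets.

# Hypothetical sketch of utils.hist(remaining, slots): the probability that a value p
# occupies one of `slots` free positions whose values (each drawn from 0..MAX_VALUE)
# must still sum to `remaining`.  Value range and semantics are assumed.
from itertools import product

MAX_VALUE = 9  # assumed per-slot value range

def hist(remaining, slots):
    counts = {p: 0.0 for p in range(MAX_VALUE + 1)}
    if slots <= 0:
        return counts
    total = 0
    for combo in product(range(MAX_VALUE + 1), repeat=slots):
        if sum(combo) == remaining:
            total += 1
            counts[combo[0]] += 1
    return {p: (c / total if total else 0.0) for p, c in counts.items()}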
Example #3
0
def stats():
    """ return some statistical data to be viewed in a table """

    year_list = df.columns[2:-1]
    # list of info to be viewed in a table
    table = []

    table.append([
        "TOTAL NUMBER OF MIGRANTS", "{:,.0f}".format(df["sum"].sum()),
        "Over a Period of 73 Years From 1945 to 2018"
    ])
    table.append([
        "AVERAGE NUMBER OF MIGRANTS", "{:,.0f}".format(df["sum"].sum() / 73),
        "-"
    ])
    table.append([
        "MINIMUM NUMBER OF MIGRANTS FROM A COUNTRY",
        "{:,.0f}".format(df["sum"].min()), "'Chad' from 1945 to 2018"
    ])
    table.append([
        "MAXIMUM NUMBER OF MIGRANTS FROM A COUNTRY",
        "{:,.0f}".format(df["sum"].max()), "'UK & Ireland' from 1945 to 2018"
    ])
    table.append([
        "MINIMUM NUMBER OF MIGRANTS IN A YEAR",
        "{:,.0f}".format(df[year_list].sum(axis=0).min()),
        "Almost Two Years from Oct 1945 to Jun 1947"
    ])
    table.append([
        "MAXIMUM NUMBER OF MIGRANTS IN A YEAR",
        "{:,.0f}".format(df[year_list].sum(axis=0).max()), "from 2012 to 2013"
    ])
    table.append([
        "MAXIMUM NUMBER OF MIGRANTS FROM A SINGLE COUNTRY AND IN A YEAR",
        "{:,.0f}".format(df[year_list].max().max()),
        "UK & Ireland from 1968 to 1969 "
    ])

    return render_template("stats.html", table=table, image=hist(df))
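The view above leans on module-level context that the snippet does not show: a pandas DataFrame df whose first two columns are identifiers, whose remaining columns are yearly counts plus a precomputed "sum" column, Flask's render_template, and a hist(df) helper that produces the image passed to the template. A hypothetical stand-in for that context, with toy data, might look like this; only the names used by stats() are taken from the snippet.

# Hypothetical context for the stats() view (toy data, assumed column layout).
import base64
import io

import pandas as pd
from flask import Flask, render_template

app = Flask(__name__)

df = pd.DataFrame({
    "code": ["GBR", "TCD"],
    "country": ["UK & Ireland", "Chad"],
    "1945-46": [30000, 1],
    "2017-18": [20000, 5],
})
df["sum"] = df[df.columns[2:]].sum(axis=1)

def hist(frame):
    """Assumed helper: bar-chart the yearly totals and return a base64 PNG string."""
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    frame[frame.columns[2:-1]].sum(axis=0).plot.bar(ax=ax)
    buf = io.BytesIO()
    fig.savefig(buf, format="png", bbox_inches="tight")
    plt.close(fig)
    return base64.b64encode(buf.getvalue()).decode("ascii")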
Example #4
0
# Flatten the meshgrid into a 2 x N array of evaluation points
grid = np.vstack([X, Y]).reshape(2, -1)

# Trained model
axScatter.contour(X,
                  Y,
                  M(grid).reshape(X.shape[0], Y.shape[0]),
                  colors='blue',
                  linestyles='--')

# Conditional plots
axHistx1.plot(xx, M1(xx.reshape(1, 120)).flatten(), color='b', linestyle='-')
axHistx2.plot(xx, M2(xx.reshape(1, 120)).flatten(), color='b', linestyle='-')
axHistx3.plot(xx, M3(xx.reshape(1, 120)).flatten(), color='b', linestyle='-')

# Conditional histograms (i.e. empirical conditionals)
axHistx1.hist(ut.hist(data, y_slice[0]),
              bins='auto',
              density=True,
              color='lightgray')
axHistx2.hist(ut.hist(data, y_slice[1]),
              bins='auto',
              density=True,
              color='lightgray')
axHistx3.hist(ut.hist(data, y_slice[2]),
              bins='auto',
              density=True,
              color='lightgray')

plt.savefig('gaussian-mix.png', bbox_inches='tight')
print('# Image saved at ./gaussian-mix.png')
Example #5
0
def doc_word_embed_content_noise(content_path,
                                 noise_path,
                                 whiten_path=None,
                                 content_lines=None,
                                 noise_lines=None,
                                 opt=None):
    """Embed the words of a content document and a noise document, optionally
    whitening all embeddings with the covariance of a separate whitening document."""
    no_add_set = set()
    doc_word_embed_f = doc_word_embed_sen
    content_words_ar, content_word_embeds = doc_word_embed_f(
        content_path, no_add_set, content_lines=content_lines)
    words_set = set(content_words_ar)
    noise_words_ar, noise_word_embeds = doc_word_embed_f(
        noise_path, set(content_words_ar), content_lines=noise_lines)
    content_words_ar.extend(noise_words_ar)
    words_ar = content_words_ar
    word_embeds = torch.cat((content_word_embeds, noise_word_embeds), dim=0)

    whitening = opt.whiten if opt is not None else True  # temporarily normalize by the inlier covariance
    if whitening and whiten_path is not None:
        # Use an article from the inlier topic to whiten the data.
        whiten_ar, whiten_word_embeds = doc_word_embed_f(whiten_path, set())

        whiten_cov = utils.cov(whiten_word_embeds)
        fast_whiten = False  # set True for the faster truncated-SVD whitening below
        if not fast_whiten:
            U, D, V_t = linalg.svd(whiten_cov)

            # Whitening matrix: D^{-1/2} U^T, an inverse square root of the covariance.
            cov_inv = torch.from_numpy(
                np.matmul(linalg.pinv(np.diag(np.sqrt(D))),
                          U.transpose())).to(utils.device)

            word_embeds0 = word_embeds  # keep the un-whitened embeddings around
            word_embeds = torch.mm(cov_inv, word_embeds.t()).t()  # whiten: (cov_inv @ X^T)^T
            if False:  # debugging (disabled): inspect the covariance spectrum after whitening

                after_cov = utils.cov(word_embeds)
                U1, D1, V_t1 = linalg.svd(after_cov)
                pdb.set_trace()

                content_whitened = torch.mm(cov_inv,
                                            content_word_embeds.t()).t()
                after_cov2 = utils.cov(content_whitened)
                _, D1, _ = linalg.svd(after_cov2)
                print('after whitening D {}'.format(D1[:7]))
        else:
            # Faster approximate whitening via a truncated SVD of the covariance.
            sv = decom.TruncatedSVD(30)
            sv.fit(whiten_cov.cpu().numpy())
            top_evals, top_evecs = sv.singular_values_, sv.components_
            top_evals = torch.from_numpy(1 / np.sqrt(top_evals)).to(
                utils.device)
            top_evecs = torch.from_numpy(top_evecs).to(utils.device)

            X = word_embeds
            projected = torch.mm(top_evecs.t() / (top_evecs**2).sum(-1),
                                 torch.mm(top_evecs, X.t())).t()

            # (d x k) * (k x d) * (d x n): project onto the top eigenvectors,
            # rescale those components by 1/sqrt(eigenvalue), and keep the
            # orthogonal complement (X - projected) unchanged.
            word_embeds = torch.mm(torch.mm(top_evecs.t(), top_evals.diag()),
                                   torch.mm(top_evecs,
                                            X.t())).t() + (X - projected)

    noise_idx = torch.LongTensor(
        list(range(len(content_word_embeds),
                   len(word_embeds)))).to(utils.device)
    if False:
        # normalize per direction
        word_embeds_norm = ((word_embeds - word_embeds.mean(0))**2).sum(
            dim=1, keepdim=True).sqrt()
    debug_top_dir = False
    if debug_top_dir:
        w1 = content_word_embeds - word_embeds.mean(0)

        w2 = noise_word_embeds - word_embeds.mean(0)
        mean_diff = ((w1.mean(0) - w2.mean(0))**2).sum().sqrt()
        w1_norm = (w1**2).sum(-1).sqrt().mean()
        w2_norm = (w2**2).sum(-1).sqrt().mean()
        X = word_embeds - word_embeds.mean(0)
        cov = torch.mm(X.t(), X) / word_embeds.size(0)
        U, D, V_t = linalg.svd(cov.cpu().numpy())
        U1 = torch.from_numpy(U[1]).to(utils.device)
        mean1_dir = w1.mean(0)
        mean1_proj = (mean1_dir * U1).sum()
        mean2_dir = w2.mean(0)
        mean2_proj = (mean2_dir * U1).sum()
        diff_proj = ((mean1_dir - mean2_dir) * U1).sum()

        #plot histogram of these projections
        proj1 = (w1 * U1).sum(-1)
        proj2 = (w2 * U1).sum(-1)
        utils.hist(proj1, 'inliers')
        utils.hist(proj2, 'outliers')
        pdb.set_trace()
    #word_embeds=(word_embeds - word_embeds.mean(0))/word_embeds_norm
    return words_ar, word_embeds, noise_idx
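The slow path above is standard covariance whitening: take the SVD of the whitening document's embedding covariance and map every embedding through D^{-1/2} U^T so the transformed vectors have roughly identity covariance. A minimal self-contained sketch of that transform (the helper name and toy data are illustrative, not from the snippet):

# Minimal sketch of the whitening transform used in the slow path above.
import numpy as np

def whiten(X, eps=1e-8):
    Xc = X - X.mean(axis=0)                     # center the embeddings
    cov = Xc.T @ Xc / Xc.shape[0]               # d x d covariance
    U, D, _ = np.linalg.svd(cov)                # cov = U diag(D) U^T for symmetric PSD cov
    W = np.diag(1.0 / np.sqrt(D + eps)) @ U.T   # whitening matrix D^{-1/2} U^T
    return Xc @ W.T

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 5)) @ rng.normal(size=(5, 5))   # correlated toy data
print(np.round(np.cov(whiten(X), rowvar=False), 2))        # approximately the identity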
Example #6
0
# Data 2D histogram
axScatter.hist2d(data[:, 0], data[:, 1], density=True, bins=[100, 50], cmap='binary')

# Analytic pdf
axScatter.contour(X,Y,pdf.reshape(X.shape[0], Y.shape[0]), colors='red', linestyles='--')

# Trained model
axScatter.contour(X, Y, M(grid).reshape(X.shape[0], Y.shape[0]), colors='blue', linestyles='--')


# Conditional plots
# Analytic
axHistx1.plot(xx, cpdf1, color='r', linestyle='--')
axHistx2.plot(xx, cpdf2, color='r', linestyle='--')
axHistx3.plot(xx, cpdf3, color='r', linestyle='--')

# Models
axHistx1.plot(xx, cM1(xx.reshape(1,n)).flatten(), color='b', linestyle='--')
axHistx2.plot(xx, cM2(xx.reshape(1,n)).flatten(), color='b', linestyle='--')
axHistx3.plot(xx, cM3(xx.reshape(1,n)).flatten(), color='b', linestyle='--')

# Conditional histograms (i.e. empirical conditionals)
axHistx1.hist(ut.hist(data, x[0], axes='y'), bins='auto', density=True, color='lightgray')
axHistx2.hist(ut.hist(data, x[1], axes='y'), bins='auto', density=True, color='lightgray')
axHistx3.hist(ut.hist(data, x[2], axes='y'), bins='auto', density=True, color='lightgray')



plt.savefig('student-t.png', bbox_inches='tight')
print('# Image saved at ./student-t.png')
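Examples #4 and #6 pass the return value of ut.hist(data, value[, axes=...]) straight to Axes.hist, so the helper presumably selects the samples lying in a thin slice of the 2-D data around the given coordinate and returns the values along the other coordinate (the empirical conditional). A plausible sketch under that reading follows; the signature, the meaning of the axes flag, and the slice width are guesses, not taken from the snippets.

# Hypothetical reconstruction of the ut.hist slicing helper: return the values of one
# coordinate for points whose other coordinate lies within `width` of `value`.
import numpy as np

def hist(data, value, axes='x', width=0.1):
    data = np.asarray(data)                        # expected shape (n, 2)
    cond_col, keep_col = (1, 0) if axes == 'x' else (0, 1)
    mask = np.abs(data[:, cond_col] - value) < width
    return data[mask, keep_col]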
Example #7
0
    def lnlike(self, p):
        """Return the log-likelihood of the catalog data given log10 parameters p."""
        lnlike = 0.0
        # TODO: generalize the following.
        if self.func is not None:
            lnlike += np.sum(np.log(self.func(self.data[:,0], self.data[:,1], \
                                              10.0**p[0], 10.0**p[1])))
        else:
            try:
                if self.interpType == 'linear1d':
                    # For Linear Interpolation
                    # ------------------------
                    pdf = self.dataComp.rotate2full(np.array([self.interp[jj](self.sampTrans.range2unit(10.0**p))
                                                                for jj in range(len(self.interp))]).flatten())

                elif self.interpType == 'gp1d':
                    # For 1D GP
                    # ----------
                    if not self.interpErrors:
                        pdf = self.dataComp.rotate2full(np.array([self.interp[jj].predict(self.dataComp.pca_weights[jj,:],
                                                                                          self.sampTrans.range2unit(np.atleast_2d(10.0**p)))[0][0]
                                                                  for jj in range(len(self.interp))]))
                    elif self.interpErrors:
                        if not self.interpHyperErrors:
                            pdf = self.dataComp.rotate2full(np.array([self.interp[jj].sample_conditional(self.dataComp.pca_weights[jj,:],
                                                                                                         self.sampTrans.range2unit(np.atleast_2d(10.0**p)))
                                                                      for jj in range(len(self.interp))]).flatten())
                        elif self.interpHyperErrors:
                            # drawing new kernel hyperparameters from posterior
                            [self.interp[jj].set_parameter_vector(random.choice(
                                self.gp_kernel_posterior[jj] / np.log10(np.e)))
                             for jj in range(len(self.interp))]

                            # sampling conditional, as before
                            pdf = self.dataComp.rotate2full(np.array([self.interp[jj].sample_conditional(self.dataComp.pca_weights[jj,:],
                                                                                                         self.sampTrans.range2unit(np.atleast_2d(10.0**p)))
                                                                      for jj in range(len(self.interp))]).flatten())

                elif self.interpType == 'gp2d':
                    # For 2D GP
                    # ----------
                    xrot = np.zeros((self.dataComp.user_dim,self.dataComp.user_dim2))
                    for ii in range(self.dataComp.user_dim):
                        for jj in range(self.dataComp.user_dim2):
                            if not self.interpErrors:
                                xrot[ii,jj] = self.interp[ii][jj].predict(self.dataComp.unitCore[ii,jj,:],
                                                                          self.sampTrans.range2unit([10.0**p]))[0][0]
                            elif self.interpErrors:
                                if not self.interpHyperErrors:
                                    xrot[ii,jj] = self.interp[ii][jj].sample_conditional(self.dataComp.unitCore[ii,jj,:],
                                                                                         self.sampTrans.range2unit([10.0**p]))
                                elif self.interpHyperErrors:
                                    # drawing new kernel hyperparameters from posterior
                                    self.interp[ii][jj].set_parameter_vector(random.choice(
                                        self.gp_kernel_posterior[ii][jj] / np.log10(np.e)))

                                    # sampling conditional, as before
                                    xrot[ii,jj] = self.interp[ii][jj].sample_conditional(self.dataComp.unitCore[ii,jj,:],
                                                                                         self.sampTrans.range2unit([10.0**p]))

                    pdf = self.dataComp.rotate2full(xrot).flatten(order='F')

                # Was the interpolant trained on the distribution itself ('linear')
                # or on its log10?
                if self.interpScale == 'log10':
                    pdf = 10.0**pdf

                # construct normalized PDF
                pdf = utils.hist(self.x, pdf)
                # query PDF at data locations
                if self.catalogType == 'median':
                    pdf_val = pdf.pdf(self.data)
                elif self.catalogType == 'samples':
                    try:
                        pdf_val = np.mean(pdf.pdf(self.data),axis=0)
                    except ValueError:
                        # array indexing: [sample, source, parameter]
                        pdf_val = np.mean([pdf.pdf(self.data[kk])
                                           for kk in range(self.data.shape[0])],
                                          axis=0)

                # incorporate expected rate information
                if self.rate_interp is not None:
                    # rate = self.rate_interp.predict(self.rate_data,
                    #                                 self.sampTrans.range2unit(np.atleast_2d(10.0**p)))[0][0]
                    if not self.interpErrors:
                        rate = self.rate_interp.predict(self.rate_data,
                                                        self.sampTrans.range2unit(np.atleast_2d(10.0**p)))[0][0]
                    elif self.interpErrors:
                        if not self.interpHyperErrors:
                            rate = self.rate_interp.sample_conditional(self.rate_data,
                                                                       self.sampTrans.range2unit(np.atleast_2d(10.0**p)))
                        elif self.interpHyperErrors:
                            # drawing new kernel hyperparameters from posterior
                            self.rate_interp.set_parameter_vector(random.choice(
                                self.rate_gp_kernel_posterior/ np.log10(np.e)))

                            rate = self.rate_interp.sample_conditional(self.rate_data,
                                                                       self.sampTrans.range2unit(np.atleast_2d(10.0**p)))

                    rate = 10.0**(self.rate_mean + self.rate_std * rate)
                    pdf_val *= rate

                lnlike += np.sum(np.log(pdf_val))

                if self.rate_interp is not None:
                    if self.poisson_marg:
                        lnlike -= (1.0 + self.data.shape[0]) * np.log(rate)
                        if rate < 1e-5: lnlike = -np.inf
                    elif not self.poisson_marg:
                        lnlike -= rate

                if np.isnan(lnlike):
                    lnlike = -np.inf
            except np.linalg.LinAlgError:
                lnlike = -np.inf

        return lnlike
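The call utils.hist(self.x, pdf) above turns the interpolated weights into a normalized PDF object whose .pdf() method is then evaluated at the catalog data. A minimal sketch of such a wrapper follows; the class name, the normalization over the grid, and the use of linear interpolation are assumptions, not taken from the snippet.

# Hypothetical sketch of the utils.hist(x, weights) helper assumed by lnlike().
import numpy as np

class GridPDF:
    """Probability density defined by non-negative weights on a 1-D grid."""

    def __init__(self, x, weights):
        x = np.asarray(x, dtype=float)
        w = np.clip(np.asarray(weights, dtype=float), 0.0, None)
        self._x = x
        self._w = w / np.trapz(w, x)               # normalize to unit integral

    def pdf(self, q):
        # Linear interpolation of the density at the query points.
        return np.interp(np.asarray(q, dtype=float), self._x, self._w)

def hist(x, weights):
    return GridPDF(x, weights)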