Exemplo n.º 1
0
    def get_features(self, business, naics, ADD_SYNONYMS=False):
        """
        Score one business against every candidate NAICS entry.

        :param business: business dictionary; its 'description', 'unique_id' and 'name' keys are read
        :param naics: list of NAICS dictionaries, each providing 'description', 'title' and 'code'
        :param ADD_SYNONYMS: when truthy every text is passed through util.add_synonyms_to_text,
            otherwise through util.clean_paragraph
        :return: dictionary mapping each NAICS code to a dictionary of its 8 similarity scores
            (4 from util.cosine_sim, 4 from util.word2vec_sim after util.removeNans)
        """
        raw_desc = business['description']
        extra_type = self.google_types.get(business['unique_id'])
        raw_name = business['name']
        if extra_type:
            # Append the type string stored for this business to its name.
            raw_name += ' ' + extra_type

        # The same pre-processor is applied to the business and to every NAICS text.
        prepare = util.add_synonyms_to_text if ADD_SYNONYMS else util.clean_paragraph
        biz_desc = prepare(raw_desc)
        biz_name = prepare(raw_name)

        def w2v_score(left, right):
            # word2vec similarity, post-processed by util.removeNans.
            return util.removeNans(util.word2vec_sim(left, right, self.model))

        scores_by_code = {}
        for entry in naics:
            raw_entry_desc = entry['description']
            entry_title = prepare(entry['title'])
            entry_desc = prepare(raw_entry_desc)

            # Key prefix: d = description, t = title/name; business side first, NAICS side second.
            scores_by_code[entry['code']] = {
                'd_d_sim': util.cosine_sim(biz_desc, entry_desc),
                't_t_sim': util.cosine_sim(biz_name, entry_title),
                'd_t_sim': util.cosine_sim(biz_desc, entry_title),
                't_d_sim': util.cosine_sim(biz_name, entry_desc),
                't_t_w2vsim': w2v_score(biz_name, entry_title),
                'd_d_w2vsim': w2v_score(biz_desc, entry_desc),
                'd_t_w2vsim': w2v_score(biz_desc, entry_title),
                't_d_w2vsim': w2v_score(biz_name, entry_desc)
            }
        return scores_by_code
def get_all_business_types():
    """
    Look up a type string for every challenge-set business that does not have one yet.

    For each business missing from the stored type dictionary, the places returned by
    ``get_places`` for the business's coordinates are compared by name to the business
    (``cosine_sim``); the types of the best-matching place, minus a fixed list of
    uninformative entries, are joined into a space-separated string. Businesses with no
    match (no place scoring above 0) are stored as ``None``.

    The dictionary is written back through ``loader.dump_business_dict`` after every
    processed business, so an interrupted run keeps its progress.

    :return: None; results are persisted via ``loader.dump_business_dict``
    """
    # Place types that are dropped before building the type string. The list is kept
    # exactly as it was, including the odd fragments ('real', 'of', 'or', '1').
    ignored_types = frozenset([
        'point_of_interest', 'establishment', 'sublocality', 'route',
        'real', 'political', 'of', 'or', 'local', 'locality', 'intersection',
        '1',
    ])

    businesses = loader.get_challengeset()
    idtoloc = loader.get_idtoloc()
    business_types_dict = loader.get_business_types()
    # Single-argument print(...) behaves identically on Python 2 and Python 3.
    print("Done {} of {}".format(len(business_types_dict), len(businesses)))

    for business in businesses:
        unique_id = business['unique_id']
        # Direct dict membership: O(1), no intermediate key list on Python 2.
        if unique_id in business_types_dict:
            continue

        print(business['name'])
        closest_place, best_sim = None, 0
        lat, lon = idtoloc[unique_id]
        for place in get_places(lat, lon):
            sim = cosine_sim(place['name'], business['name'])
            # Strictly greater: a place with similarity 0 is never selected.
            if sim > best_sim:
                closest_place, best_sim = place, sim

        if closest_place:
            kept = [t for t in closest_place['types'] if t not in ignored_types]
            types = " ".join(kept).replace("_", " ")
        else:
            types = None
        print(types)

        business_types_dict[unique_id] = types
        # Persist after each lookup so progress survives an interruption.
        loader.dump_business_dict(business_types_dict)
Exemplo n.º 3
0
     
     # Back-propagate the output error by hand: project err through the readout
     # weights W and scale by the nonlinearity's derivative at z.
     d2 = (W.T@err)*nonlinearity.deriv(z) # gradient of the currents
     if two_layers:
         # NOTE(review): this expression was mangled in the source text (an
         # e-mail-obfuscation artifact replaced it); reconstructed as d2 @ z1.T
         # by analogy with the W1.grad lines below -- confirm.
         W2.grad = -(d2@z1.T)/len(idx_tst)
         b2.grad = -d2.mean(1, keepdim=True)
         
         # NOTE(review): the forward pass elsewhere in this script computes
         # matmul(W2, z1), so back-propagating through W2 would normally use
         # W2.T here -- left unchanged, verify.
         d1 = (W2@d2)*nonlinearity.deriv(z1)
         W1.grad = -(d1@inputs[idx_tst,:])/len(idx_tst)
         # Was `b1.gad`, which set an unused attribute instead of the gradient.
         b1.grad = -d1.mean(1, keepdim=True)
     else:
         W1.grad = -(d2@inputs[idx_tst,:])/len(idx_tst)
         # Was `b1.gad`, which set an unused attribute instead of the gradient.
         b1.grad = -d2.mean(1, keepdim=True)
     
     # Average d2 over the samples of each condition (one row per unique
     # condition), subtract the mean across conditions, and record the
     # cosine similarity of the centred rows with themselves.
     conds = inp_condition[idx_tst]
     cond_grad = np.array([d2[:,conds==i].mean(1).detach().numpy() for i in np.unique(conds)])
     gradz_sim.append(util.cosine_sim(cond_grad-cond_grad.mean(0),cond_grad-cond_grad.mean(0)))
     
     # Same measurement on the error projected through W only, i.e. without
     # the nonlinearity-derivative factor.
     cond_grad = np.array([(W.T@err)[:,conds==i].mean(1).detach().numpy() for i in np.unique(conds)])
     gradlin_sim.append(util.cosine_sim(cond_grad-cond_grad.mean(0),cond_grad-cond_grad.mean(0)))
     # Disabled variant that would record a similarity into gradw_sim:
     # cond_grad = np.array([((d2[:,conds==i]@z[:,conds==i].T)/np.sum(conds==i)).mean(1).detach().numpy() \
     #                       for i in np.unique(conds)])
     # gradw_sim.append(util.cosine_sim(cond_grad,cond_grad))
 
 # do learning
 for j, btch in enumerate(dl):
     optimizer.zero_grad()
     
     inps, outs = btch
     if two_layers:
         z1 = nonlinearity(torch.matmul(W1,inps.T) + b1)
         z = nonlinearity(torch.matmul(W2,z1) + b2)
Exemplo n.º 4
0
    # On the first and the last epoch only, record similarity statistics of the
    # per-condition gradients (appended to gradz_sim and gradlin_sim).
    if epoch in [0, nepoch - 1]:
        # Two residuals on the test indices: target minus sigmoid(pred), and
        # target minus the raw pred (the original author's labels follow).
        errb = (targets[idx_tst, :].T - nn.Sigmoid()(pred))  # bernoulli
        errg = (targets[idx_tst, :].T - pred)  # gaussian

        # ppp weights the two residuals; ppp = 1 keeps only errb, ppp = 0 only errg.
        err = ppp * errb + (1 -
                            ppp) * errg  # convex sum, in case you want that

        # Project the error through W and scale by the nonlinearity's
        # derivative evaluated at z.
        d2 = (W.T @ err) * nonlinearity.deriv(z)  # gradient of the currents

        # Average d2 over the samples belonging to each unique condition label,
        # giving one row per condition.
        conds = abstract_conds[idx_tst]
        cond_grad = np.array([
            d2[:, conds == i].mean(1).detach().numpy()
            for i in np.unique(conds)
        ])
        # Subtract the mean across conditions and compare the centred array
        # with itself.
        # NOTE(review): presumably util.cosine_sim returns pairwise similarities
        # between rows -- confirm against util.
        gradz_sim.append(
            util.cosine_sim(cond_grad - cond_grad.mean(0),
                            cond_grad - cond_grad.mean(0)))

        # Earlier variant (d2 without the derivative factor), kept for reference:
        # cond_grad = np.array([(W.T@err)[:,conds==i].mean(1).detach().numpy() for i in np.unique(conds)])
        # Per condition: d2 restricted to that condition's samples, multiplied
        # by the matching input rows, then transposed. The product is a sum over
        # samples and is not divided by the sample count.
        # NOTE(review): each element is a matrix, so cond_grad presumably has
        # three dimensions here -- confirm util.cosine_sim accepts that.
        cond_grad = np.array([
            (d2[:, conds == i]
             @ inputs[idx_tst, :][conds == i, :]).detach().numpy().T
            for i in np.unique(conds)
        ])
        gradlin_sim.append(
            util.cosine_sim(cond_grad - cond_grad.mean(0),
                            cond_grad - cond_grad.mean(0)))
        # Disabled variant that would record a similarity into gradw_sim:
        # cond_grad = np.array([((d2[:,conds==i]@z[:,conds==i].T)/np.sum(conds==i)).mean(1).detach().numpy() \
        #                       for i in np.unique(conds)])
        # gradw_sim.append(util.cosine_sim(cond_grad,cond_grad))

    # do learning
Exemplo n.º 5
0
    def get_features(self, business, naics, ADD_SYNONYMS=False):
        """
        Compute similarity features between a business and each NAICS entry.

        :param business: business dictionary from challenge set ('description',
            'unique_id' and 'name' are read)
        :param naics: list of naics dictionaries to check against ('description',
            'title' and 'code' are read)
        :param ADD_SYNONYMS: boolean; when true texts are expanded with
            util.add_synonyms_to_text, otherwise cleaned with util.clean_paragraph
        :return: dictionary keyed by NAICS code; each value is a dictionary of
            the 8 similarity combinations to their score
        """
        desc_text = business['description']
        type_suffix = self.google_types.get(business['unique_id'])
        name_text = business['name']
        if type_suffix:
            # Extend the business name with the type string stored for it.
            name_text += ' ' + type_suffix

        # Choose the text transform once and reuse it for every text.
        if ADD_SYNONYMS:
            transform = util.add_synonyms_to_text
        else:
            transform = util.clean_paragraph
        desc_text = transform(desc_text)
        name_text = transform(name_text)

        result = {}
        for candidate in naics:
            cand_desc = candidate['description']
            cand_title = transform(candidate['title'])
            cand_desc = transform(cand_desc)

            # (feature key, business text, NAICS text); in the keys
            # d = description and t = title/name.
            cosine_pairs = (
                ('d_d_sim', desc_text, cand_desc),
                ('t_t_sim', name_text, cand_title),
                ('d_t_sim', desc_text, cand_title),
                ('t_d_sim', name_text, cand_desc),
            )
            w2v_pairs = (
                ('t_t_w2vsim', name_text, cand_title),
                ('d_d_w2vsim', desc_text, cand_desc),
                ('d_t_w2vsim', desc_text, cand_title),
                ('t_d_w2vsim', name_text, cand_desc),
            )

            features = {}
            for key, left, right in cosine_pairs:
                features[key] = util.cosine_sim(left, right)
            for key, left, right in w2v_pairs:
                # word2vec scores are passed through util.removeNans.
                features[key] = util.removeNans(
                    util.word2vec_sim(left, right, self.model))

            result[candidate['code']] = features
        return result