Example #1
    def run(self, data_reader, odir, n_return, penalty_mode):

        log_handle, log_system = logger(odir)
        saliences_pen = None
        for hour, labels, unicodes, saliences, X in data_reader:

            time_int = hour_str2datetime_interval(hour)
            n_points_total = X.shape[0]

            
            ### DEDUP ###
            I = self.unique_indices(unicodes, saliences)
            unicodes = unicodes[I]
            saliences = saliences[I]
            X = X[I, :]
            labels = [labels[idx] for idx in I]
            n_points = X.shape[0]

            print "System time: {} -- {}".format(time_int.start, time_int.stop)
            print "Received {} unique sentences from {} total".format(
                n_points, n_points_total)

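            # Once at least one update has been emitted, penalize salience by
            # similarity to previous updates and rank on the penalized scores.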
            if self.use_temp_ and self.n_updates_ > 0:
                saliences_pen = self.penalize_salience(
                    saliences, X, penalty_mode)
                ranks = saliences_pen
            else:
                ranks = saliences

            sorted_idxs = sorted(range(n_points),
                                 key=lambda x: ranks[x], 
                                 reverse=True)
            update_idxs = sorted_idxs[0:n_return]

            for e in update_idxs:
                if saliences_pen is not None:
                    print saliences[e], saliences_pen[e], unicodes[e].encode(u'utf-8')
                else:
                    print saliences[e], unicodes[e].encode(u'utf-8')
            print

            self.add_updates(
                update_idxs, time_int, labels,
                unicodes, saliences, saliences_pen, X)

            self.write_iterative_summary(
                odir, time_int.stop.strftime(u'%Y-%m-%d-%H'))

            log_system(
                sorted_idxs, time_int, labels, 
                unicodes, saliences, saliences_pen)
             
        self.write_updates(odir)
        log_handle.close()
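
The dedup step calls a unique_indices helper that is not defined in these examples. The following is a minimal standalone sketch, assuming duplicates are grouped by their unicode text and each group keeps its highest-salience copy; the name and signatures match the call sites, but the body and the tie-breaking rule are assumptions:

import numpy as np

def unique_indices(unicodes, saliences, return_counts=False):
    # Group identical sentences; keep the index of the highest-salience
    # copy from each group (assumed tie-breaking rule).
    groups = {}
    for idx, text in enumerate(unicodes):
        groups.setdefault(text, []).append(idx)
    pairs = []
    for idxs in groups.values():
        best = max(idxs, key=lambda i: saliences[i])
        pairs.append((best, len(idxs)))
    pairs.sort()
    I = np.array([p[0] for p in pairs])
    if return_counts:
        return I, np.array([p[1] for p in pairs])
    return I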
Example #2
    def run(self, data_reader, odir, n_return, penalty_mode):

        log_handle, log_system = logger(odir)
        saliences_pen = None
        for hour, labels, unicodes, saliences, X in data_reader:

            time_int = hour_str2datetime_interval(hour)
            n_points_total = X.shape[0]

            ### DEDUP ###
            I = self.unique_indices(unicodes, saliences)
            unicodes = unicodes[I]
            saliences = saliences[I]
            X = X[I, :]
            labels = [labels[idx] for idx in I]
            n_points = X.shape[0]

            print "System time: {} -- {}".format(time_int.start, time_int.stop)
            print "Received {} unique sentences from {} total".format(
                n_points, n_points_total)

            if self.use_temp_ and self.n_updates_ > 0:
                saliences_pen = self.penalize_salience(saliences, X,
                                                       penalty_mode)
                ranks = saliences_pen
            else:
                ranks = saliences

            sorted_idxs = sorted(range(n_points),
                                 key=lambda x: ranks[x],
                                 reverse=True)
            update_idxs = sorted_idxs[0:n_return]

            for e in update_idxs:
                if saliences_pen is not None:
                    print saliences[e], saliences_pen[e], unicodes[e].encode(
                        u'utf-8')
                else:
                    print saliences[e], unicodes[e].encode(u'utf-8')
            print

            self.add_updates(update_idxs, time_int, labels, unicodes,
                             saliences, saliences_pen, X)

            self.write_iterative_summary(
                odir, time_int.stop.strftime(u'%Y-%m-%d-%H'))

            log_system(sorted_idxs, time_int, labels, unicodes, saliences,
                       saliences_pen)

        self.write_updates(odir)
        log_handle.close()
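
penalize_salience is likewise undefined here. One plausible reading, sketched under assumptions: it damps the salience of candidates that resemble updates the system has already emitted. The update_vecs argument stands in for whatever state the real method reads from self, and the 'additive'/'multiplicative' branches below are illustrative guesses, not documented penalty_mode values:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def penalize_salience(saliences, X, penalty_mode, update_vecs, scale=1.0):
    # Similarity of each candidate to its nearest previously emitted update.
    sims = cosine_similarity(X, update_vecs).max(axis=1)
    if penalty_mode == u'additive':
        # Subtract similarity, scaled by the repulsion strength.
        return saliences - scale * sims
    elif penalty_mode == u'multiplicative':
        # Shrink salience toward zero for near-duplicates of past updates.
        return saliences * (1.0 - scale * sims)
    raise ValueError(u'unknown penalty_mode: {}'.format(penalty_mode))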
Example #3
    def run(self,
            data_reader,
            odir,
            penalty_mode,
            scale=1.0,
            repulsion=1.0,
            update_cutoff=1.0,
            update_sim_threshold=.75):

        stdsclr = StandardScaler()
        log_handle, log_system = logger(odir)
        saliences_pen = None

        for hour, labels, unicodes, saliences, X in data_reader:
            time_int = hour_str2datetime_interval(hour)
            n_points_total = X.shape[0]

            #            I = self.simple_filter(unicodes)
            #            unicodes = unicodes[I]
            #            saliences = saliences[I]
            #            X = X[I,:]
            #            labels = [labels[idx] for idx in I]

            ### REMOVE INPUTS THAT MATCH PREVIOUS UPDATES ###
            I = self.non_update_matching_indices(
                X, threshold=update_sim_threshold)
            unicodes = unicodes[I]
            saliences = saliences[I]
            X = X[I, :]
            labels = [labels[idx] for idx in I]

            ### DEDUP AND COUNT DUPLICATES ###
            I, counts = self.unique_indices(unicodes,
                                            saliences,
                                            return_counts=True)
            unicodes = unicodes[I]
            saliences = saliences[I]
            X = X[I, :]
            labels = [labels[idx] for idx in I]
            n_points = X.shape[0]

            print "System time: {} -- {}".format(time_int.start, time_int.stop)
            print "Received {} unique sentences from {} total".format(
                n_points, n_points_total)
            if n_points <= 1:
                continue

            if self.use_temp_ and self.n_updates_ > 0:
                saliences_pen = self.penalize_salience(saliences,
                                                       X,
                                                       penalty_mode,
                                                       scale=repulsion)
                ranks = saliences_pen
            else:
                ranks = saliences

            ### Init Preferences and Similarities ###
            P = self.compute_preferences(ranks, n_points, counts, scale)
            A = self.compute_affinities(X, P, counts)

            af = AffinityPropagation(preference=P,
                                     affinity='precomputed',
                                     max_iter=1000,
                                     damping=.7,
                                     verbose=True).fit(A)

            exemplars = af.cluster_centers_indices_
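            # af.labels_ holds each point's cluster id; indexing the exemplar
            # array with it maps every point to the index of its exemplar.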
            assignments = exemplars[af.labels_]

            if saliences_pen is None:
                ranks = stdsclr.fit_transform(saliences[:, np.newaxis])
            else:
                ranks = stdsclr.fit_transform(saliences_pen[:, np.newaxis])

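            # Emit an exemplar only if it is salient enough after
            # standardization and its cluster has more than one member.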
            update_idxs = [e for e in exemplars
                           if ranks[e] > update_cutoff
                           and np.where(assignments == e)[0].shape[0] > 1]

            # List each exemplar first, then the other members of its cluster.
            sorted_idxs = []
            for e in exemplars:
                sorted_idxs.append(e)
                for m in np.where(assignments == e)[0]:
                    if e != m:
                        sorted_idxs.append(m)

            self.add_updates(update_idxs, time_int, labels, unicodes,
                             saliences, saliences_pen, X)

            self.write_iterative_summary(
                odir, time_int.stop.strftime(u'%Y-%m-%d-%H'))

            log_system(sorted_idxs, time_int, labels, unicodes, saliences,
                       saliences_pen)

            for e in update_idxs:
                if saliences_pen is not None:
                    print saliences[e], saliences_pen[e],
                    print unicodes[e].encode(u'utf-8')
                else:
                    print saliences[e], unicodes[e].encode(u'utf-8')
            print

        self.write_updates(odir)
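
The preference and affinity construction is the heart of this variant, but the helpers are not shown. The sketch below is a guess constrained only by the call signatures above: preferences grow with rank and with the log of the duplicate count, so repeated, salient sentences are favored as exemplars, and the affinity matrix is plain cosine similarity. Note that scikit-learn's AffinityPropagation overwrites the diagonal of a precomputed matrix with the preference values, so the diagonal fill below is merely defensive:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def compute_preferences(ranks, n_points, counts, scale):
    # Hypothetical formula: favor salient sentences with many duplicates.
    assert len(counts) == n_points
    return scale * (np.asarray(ranks, dtype=float) + np.log(counts))

def compute_affinities(X, P, counts):
    # Hypothetical: pairwise cosine similarity with preferences on the
    # diagonal. counts is accepted to match the call site but unused here.
    A = cosine_similarity(X)
    np.fill_diagonal(A, P)
    return A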
Example #4
    def run(self, data_reader, odir, penalty_mode, scale=1.0,
            repulsion=1.0, update_cutoff=1.0, update_sim_threshold=.75):
        
        stdsclr = StandardScaler() 
        log_handle, log_system = logger(odir)
        saliences_pen = None

        for hour, labels, unicodes, saliences, X in data_reader:
            time_int = hour_str2datetime_interval(hour)
            n_points_total = X.shape[0]

#            I = self.simple_filter(unicodes)
#            unicodes = unicodes[I]
#            saliences = saliences[I]
#            X = X[I,:]
#            labels = [labels[idx] for idx in I] 

            ### REMOVE INPUTS THAT MATCH PREVIOUS UPDATES ###
            I = self.non_update_matching_indices(
                X, threshold=update_sim_threshold)
            unicodes = unicodes[I]
            saliences = saliences[I]
            X = X[I, :]
            labels = [labels[idx] for idx in I]

            ### DEDUP AND COUNT DUPLICATES ###
            I, counts = self.unique_indices(
                unicodes, saliences, return_counts=True)
            unicodes = unicodes[I]
            saliences = saliences[I]
            X = X[I, :]
            labels = [labels[idx] for idx in I]
            n_points = X.shape[0]

            print "System time: {} -- {}".format(
                time_int.start, time_int.stop)
            print "Received {} unique sentences from {} total".format(
                n_points, n_points_total)
            if n_points <= 1:
                continue

            if self.use_temp_ and self.n_updates_ > 0:
                saliences_pen = self.penalize_salience(
                    saliences, X, penalty_mode, scale=repulsion)
                ranks = saliences_pen
            else:
                ranks = saliences

            ### Init Preferences and Similarities ###
            P = self.compute_preferences(ranks, n_points, counts, scale)
            A = self.compute_affinities(X, P, counts)

            af = AffinityPropagation(
                preference=P, affinity='precomputed', max_iter=1000,
                damping=.7, verbose=True).fit(A)

            exemplars = af.cluster_centers_indices_
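            # af.labels_ holds each point's cluster id; indexing the exemplar
            # array with it maps every point to the index of its exemplar.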
            assignments = exemplars[af.labels_]

            if saliences_pen is None:
                ranks = stdsclr.fit_transform(saliences[:, np.newaxis])
            else:
                ranks = stdsclr.fit_transform(saliences_pen[:, np.newaxis])

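            # Emit an exemplar only if it is salient enough after
            # standardization and its cluster has more than one member.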
            update_idxs = [e for e in exemplars
                           if ranks[e] > update_cutoff
                           and np.where(assignments == e)[0].shape[0] > 1]

            # List each exemplar first, then the other members of its cluster.
            sorted_idxs = []
            for e in exemplars:
                sorted_idxs.append(e)
                for m in np.where(assignments == e)[0]:
                    if e != m:
                        sorted_idxs.append(m)

            self.add_updates(
                update_idxs, time_int, labels,
                unicodes, saliences, saliences_pen, X)

            self.write_iterative_summary(
                odir, time_int.stop.strftime(u'%Y-%m-%d-%H'))

            log_system(
                sorted_idxs, time_int, labels, 
                unicodes, saliences, saliences_pen)

            for e in update_idxs:
                if saliences_pen is not None:
                    print saliences[e], saliences_pen[e],
                    print unicodes[e].encode(u'utf-8')
                else:
                    print saliences[e], unicodes[e].encode(u'utf-8')
            print

        self.write_updates(odir)
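
Examples #3 and #4 also filter out inputs that match previously emitted updates before clustering. Below is a sketch of that helper, again with update_vecs standing in for state the real method keeps on self (the attribute name is unknown), keeping only candidates whose nearest previous update falls below the similarity threshold:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def non_update_matching_indices(X, update_vecs, threshold=.75):
    # Before any updates exist there is nothing to match against.
    if len(update_vecs) == 0:
        return np.arange(X.shape[0])
    # Keep rows whose closest previous update is below the threshold.
    sims = cosine_similarity(X, update_vecs).max(axis=1)
    return np.where(sims < threshold)[0]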