Example #1
    def make_fa_summaries(self,
                          eval_data,
                          prefix,
                          feature_set,
                          apsal_sal=.4,
                          apsal_sim=.7,
                          **kwargs):

        apsal = APSalienceSummarizer()
        # Stage 1: queue one AP+Salience clustering job per event.
        jobs = []
        for event, corpus in eval_data:
            print event.fs_name()
            apsal_tsv_dir = apsal.get_tsv_dir(prefix, feature_set)
            if not os.path.exists(apsal_tsv_dir):
                os.makedirs(apsal_tsv_dir)
            jobs.append((event, corpus, prefix, feature_set, apsal))

        self.do_work(cluster_worker, jobs, **kwargs)

        print "Generating AP+Salience Summary"
        print "\tSal Threshold: {}".format(apsal_sal)
        print "\tSim Threshold: {}".format(apsal_sim)

        # Stage 2: queue filtering jobs that turn the clusters into summaries
        # at the fixed salience/similarity thresholds.
        jobs = []
        for event, corpus in eval_data:
            jobs.append((APSalienceFilteredSummary(),
                         (event, prefix, feature_set, apsal_sal, apsal_sim)))

        self.do_work(filter_worker, jobs, **kwargs)
Example #2
def filter_worker(job_queue, result_queue, **kwargs):
    # Workers ignore SIGINT so that only the parent process handles interrupts.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    # Brief startup delay before polling the shared job queue.
    import time
    time.sleep(1)
    while not job_queue.empty():
        try:
            summarizer, sum_args = job_queue.get(block=False)

            if isinstance(summarizer, APFilteredSummary):
                event, sim_cutoff = sum_args
                ap = APSummarizer()
                ap_tsv_path = ap.get_tsv_path(event)
                apf_tsv_path = summarizer.get_tsv_path(event, sim_cutoff)
                if os.path.exists(ap_tsv_path) and \
                    not os.path.exists(apf_tsv_path):
                    summarizer.make(event, sim_threshold=sim_cutoff)

            elif isinstance(summarizer, HACFilteredSummary):
                event, prefix, feature_set, dist_cutoff, sim_cutoff = sum_args
                hac = HACSummarizer()
                hac_tsv_path = hac.get_tsv_path(event, dist_cutoff)
                hacf_tsv_path = summarizer.get_tsv_path(
                    event, dist_cutoff, sim_cutoff)
                if os.path.exists(hac_tsv_path) and \
                    not os.path.exists(hacf_tsv_path):

                    try:
                        summarizer.make(event,
                                        prefix,
                                        feature_set,
                                        dist_cutoff=dist_cutoff,
                                        sim_threshold=sim_cutoff)
                    except Exception as e:
                        print e
                        print hac_tsv_path

            elif isinstance(summarizer, APSalienceFilteredSummary) or \
                isinstance(summarizer, APSalTRankSalThreshFilteredSummary):
                event, prefix, feature_set, sal_cutoff, sim_cutoff = sum_args
                aps = APSalienceSummarizer()
                aps_tsv_path = aps.get_tsv_path(event, prefix, feature_set)
                apsf_tsv_path = summarizer.get_tsv_path(
                    event, prefix, feature_set, sal_cutoff, sim_cutoff)
                if os.path.exists(aps_tsv_path) and \
                    not os.path.exists(apsf_tsv_path):
                    summarizer.make(event,
                                    prefix,
                                    feature_set,
                                    min_cluster_size=2,
                                    center_threshold=sal_cutoff,
                                    sim_threshold=sim_cutoff)
            elif isinstance(summarizer, RankedSalienceFilteredSummary):
                event, prefix, feature_set, sal_cutoff, sim_cutoff = sum_args
                rsfs_tsv_path = summarizer.get_tsv_path(
                    event, sal_cutoff, sim_cutoff)
                if not os.path.exists(rsfs_tsv_path):
                    summarizer.make(event, prefix, feature_set, sal_cutoff,
                                    sim_cutoff)
        except Queue.Empty:
            # Raised when another worker empties the queue between the
            # empty() check and get(); the while loop then exits normally.
            pass
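
Both cluster_worker and filter_worker receive a job_queue/result_queue pair from do_work, which is not shown in these examples. A minimal sketch of what such a dispatcher could look like with multiprocessing; the n_procs keyword and the whole body are assumptions for illustration, not the project's actual implementation:

import multiprocessing

def do_work(worker, jobs, n_procs=2, **kwargs):
    # Hypothetical dispatcher: put every job on a shared queue and let
    # n_procs worker processes drain it in parallel.
    manager = multiprocessing.Manager()
    job_queue = manager.Queue()
    result_queue = manager.Queue()
    for job in jobs:
        job_queue.put(job)
    procs = [multiprocessing.Process(target=worker,
                                     args=(job_queue, result_queue),
                                     kwargs=kwargs)
             for _ in range(n_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()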
Example #3
    def tune_fa(self,
                dev_data,
                prefix,
                feature_set,
                sal_min=-2.0,
                sal_max=2.0,
                sal_step=.1,
                sem_sim_min=.2,
                sem_sim_max=.7,
                sem_sim_step=.05,
                **kwargs):

        apsal = APSalienceSummarizer()
        sal_cutoffs = np.arange(sal_min, sal_max + sal_step, sal_step)
        sem_sim_cutoffs = np.arange(sem_sim_min, sem_sim_max + sem_sim_step,
                                    sem_sim_step)

        print "Tuning on dev data."

        ### Run clustering ###
        print "Generating AP+Salience Cluster\n\t(no params)"

        jobs = []
        for event, corpus in dev_data:
            print event.fs_name()
            apsal_tsv_dir = apsal.get_tsv_dir(prefix, feature_set)
            if not os.path.exists(apsal_tsv_dir):
                os.makedirs(apsal_tsv_dir)
            jobs.append((event, corpus, prefix, feature_set, apsal))

        self.do_work(cluster_worker, jobs, **kwargs)

        ### Run filtering ###
        print
        print "Generating AP+Salience Summary"
        print "\tSal Threshold ({}, {}), step={}".format(
            sal_min, sal_max, sal_step)
        print "\tSim Threshold ({}, {}), step={}".format(
            sem_sim_min, sem_sim_max, sem_sim_step)
        print "\t{} jobs/event".format(sal_cutoffs.shape[0] *
                                       sem_sim_cutoffs.shape[0])

        jobs = []
        for event, corpus in dev_data:

            for sem_sim_cutoff in sem_sim_cutoffs:
                for sal_cutoff in sal_cutoffs:
                    jobs.append((APSalienceFilteredSummary(),
                                 (event, prefix, feature_set, sal_cutoff,
                                  sem_sim_cutoff)))
        self.do_work(filter_worker, jobs, **kwargs)
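
The sweep grids above are built with np.arange(min, max + step, step), which makes the sweep effectively inclusive of the upper bound (subject to floating-point rounding of the fractional step). A standalone illustration using the default ranges above:

import numpy as np

# Same construction as above: adding one step to the upper bound makes
# np.arange effectively inclusive of sal_max / sem_sim_max.
sal_cutoffs = np.arange(-2.0, 2.0 + .1, .1)
sem_sim_cutoffs = np.arange(.2, .7 + .05, .05)

print("{} salience cutoffs x {} similarity cutoffs = {} filter jobs per event".format(
    sal_cutoffs.shape[0], sem_sim_cutoffs.shape[0],
    sal_cutoffs.shape[0] * sem_sim_cutoffs.shape[0]))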
Example #4
    def make_summaries(self,
                       eval_data,
                       prefix,
                       feature_set,
                       hac_dist=1.35,
                       hac_sim=.7,
                       ap_sim=.7,
                       apsal_sal=.4,
                       apsal_sim=.7,
                       apsal_tr_sal=.6,
                       apsal_tr_sim=.6,
                       sal_rank_sal=1.8,
                       sal_rank_sim=.4,
                       **kwargs):

        ap = APSummarizer()
        apsal = APSalienceSummarizer()
        hac = HACSummarizer()

        print "Running with optimal params on dev data."

        ### Run clustering ###
        print "Generating AP Cluster\n\t(no params)"
        print "Generating AP+Salience Cluster\n\t(no params)"
        print "Generating HAC Cluster\n\tdist-thresh: {}".format(hac_dist)

        jobs = []
        for event, corpus in eval_data:
            print event.fs_name()
            apsal_tsv_dir = apsal.get_tsv_dir(prefix, feature_set)
            if not os.path.exists(apsal_tsv_dir):
                os.makedirs(apsal_tsv_dir)
            if not os.path.exists(ap.dir_):
                os.makedirs(ap.dir_)
            if not os.path.exists(hac.dir_):
                os.makedirs(hac.dir_)

            jobs.append((event, corpus, prefix, feature_set, hac, hac_dist))
            jobs.append((event, corpus, prefix, feature_set, ap))
            jobs.append((event, corpus, prefix, feature_set, apsal))

        self.do_work(cluster_worker, jobs, **kwargs)

        ### Run filtering ###
        print
        print "Generating AP Summary"
        print "\tSim Threshold: {}".format(ap_sim)
        print "Generating AP+Salience Summary"
        print "\tSal Threshold: {}".format(apsal_sal)
        print "\tSim Threshold: {}".format(apsal_sim)
        print "Generating HAC Summary"
        print "\tDist Threshold: {}".format(hac_dist)
        print "\tSim Threshold: {}".format(hac_sim)
        print "Generating AP+Salience Time Ranked"
        print "\tSal Threshold: {}".format(apsal_tr_sal)
        print "\tSim Threshold: {}".format(apsal_tr_sim)
        print "Generating Salience Ranked Summary"
        print "\tSal Threshold: {}".format(sal_rank_sal)
        print "\tSim Threshold: {}".format(sal_rank_sim)

        jobs = []
        for event, corpus in eval_data:
            jobs.append((HACFilteredSummary(), (event, prefix, feature_set,
                                                hac_dist, hac_sim)))

            jobs.append((APFilteredSummary(), (event, ap_sim)))
            jobs.append((APSalienceFilteredSummary(),
                         (event, prefix, feature_set, apsal_sal, apsal_sim)))
            jobs.append(
                (APSalTRankSalThreshFilteredSummary(),
                 (event, prefix, feature_set, apsal_tr_sal, apsal_tr_sim)))
            jobs.append(
                (RankedSalienceFilteredSummary(),
                 (event, prefix, feature_set, sal_rank_sal, sal_rank_sim)))

        self.do_work(filter_worker, jobs, **kwargs)
Example #5
    def tune(self,
             dev_data,
             prefix,
             feature_set,
             hac_dist_min=.9,
             hac_dist_max=5.05,
             hac_dist_step=.05,
             sal_min=-2.0,
             sal_max=2.0,
             sal_step=.1,
             sem_sim_min=.2,
             sem_sim_max=.7,
             sem_sim_step=.05,
             rank_sim_min=.2,
             rank_sim_max=.4,
             rank_sim_step=.05,
             **kwargs):

        ap = APSummarizer()
        apsal = APSalienceSummarizer()
        hac = HACSummarizer()

        hac_dist_cutoffs = np.arange(hac_dist_min,
                                     hac_dist_max + hac_dist_step,
                                     hac_dist_step)
        sal_cutoffs = np.arange(sal_min, sal_max + sal_step, sal_step)
        sem_sim_cutoffs = np.arange(sem_sim_min, sem_sim_max + sem_sim_step,
                                    sem_sim_step)
        rank_sim_cutoffs = np.arange(rank_sim_min,
                                     rank_sim_max + rank_sim_step,
                                     rank_sim_step)

        print "Tuning on dev data."

        ### Run clustering ###
        print "Generating AP Cluster\n\t(no params)"
        print "Generating AP+Salience Cluster\n\t(no params)"
        print "Generating HAC Cluster"
        print "\tDist Threshold ({}, {}), step={} {} jobs/event".format(
            hac_dist_min, hac_dist_max, hac_dist_step,
            hac_dist_cutoffs.shape[0])

        jobs = []
        for event, corpus in dev_data:
            print event.fs_name()
            apsal_tsv_dir = apsal.get_tsv_dir(prefix, feature_set)
            if not os.path.exists(apsal_tsv_dir):
                os.makedirs(apsal_tsv_dir)
            if not os.path.exists(ap.dir_):
                os.makedirs(ap.dir_)
            if not os.path.exists(hac.dir_):
                os.makedirs(hac.dir_)

            for cutoff in hac_dist_cutoffs:
                jobs.append((event, corpus, prefix, feature_set, hac, cutoff))
            jobs.append((event, corpus, prefix, feature_set, ap))
            jobs.append((event, corpus, prefix, feature_set, apsal))

        self.do_work(cluster_worker, jobs, **kwargs)

        ### Run filtering ###
        print
        print "Generating AP Summary"
        print "\tSim Threshold ({}, {}), step={}".format(
            sem_sim_min, sem_sim_max, sem_sim_step)
        print "\t{} jobs/event".format(sem_sim_cutoffs.shape[0])
        print "Generating AP+Salience Summary"
        print "\tSal Threshold ({}, {}), step={}".format(
            sal_min, sal_max, sal_step)
        print "\tSim Threshold ({}, {}), step={}".format(
            sem_sim_min, sem_sim_max, sem_sim_step)
        print "\t{} jobs/event".format(sal_cutoffs.shape[0] *
                                       sem_sim_cutoffs.shape[0])
        print "Generating HAC Summary"
        print "\tDist Threshold ({}, {}), step={}".format(
            hac_dist_min, hac_dist_max, hac_dist_step)
        print "\tSim Threshold ({}, {}), step={}".format(
            sem_sim_min, sem_sim_max, sem_sim_step)
        print "\t{} jobs/event".format(hac_dist_cutoffs.shape[0] *
                                       sem_sim_cutoffs.shape[0])

        rsfs = RankedSalienceFilteredSummary()
        if not os.path.exists(rsfs.dir_):
            os.makedirs(rsfs.dir_)
        jobs = []
        for event, corpus in dev_data:

            for sem_sim_cutoff in sem_sim_cutoffs:

                for dist_cutoff in hac_dist_cutoffs:
                    jobs.append(
                        (HACFilteredSummary(), (event, prefix, feature_set,
                                                dist_cutoff, sem_sim_cutoff)))

                jobs.append((APFilteredSummary(), (event, sem_sim_cutoff)))
                for sal_cutoff in sal_cutoffs:
                    jobs.append((APSalienceFilteredSummary(),
                                 (event, prefix, feature_set, sal_cutoff,
                                  sem_sim_cutoff)))
                    jobs.append((APSalTRankSalThreshFilteredSummary(),
                                 (event, prefix, feature_set, sal_cutoff,
                                  sem_sim_cutoff)))
            for rank_sim_cutoff in rank_sim_cutoffs:
                for sal_cutoff in sal_cutoffs:
                    jobs.append((RankedSalienceFilteredSummary(),
                                 (event, prefix, feature_set, sal_cutoff,
                                  rank_sim_cutoff)))
        self.do_work(filter_worker, jobs, **kwargs)
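
The nested loops above enumerate cross products of the threshold grids (distance × similarity for HAC, salience × similarity for the AP+Salience variants, salience × rank similarity for the ranked variant). Purely as an illustration of that layout, not project code, the same enumeration can be written with itertools.product:

import itertools
import numpy as np

sal_cutoffs = np.arange(-2.0, 2.0 + .1, .1)
sem_sim_cutoffs = np.arange(.2, .7 + .05, .05)
rank_sim_cutoffs = np.arange(.2, .4 + .05, .05)

# One (sal, sim) pair per AP+Salience filter job for a single event.
apsal_grid = list(itertools.product(sal_cutoffs, sem_sim_cutoffs))
# One (sal, rank_sim) pair per ranked-salience filter job for a single event.
ranked_grid = list(itertools.product(sal_cutoffs, rank_sim_cutoffs))

print("{} AP+Salience jobs and {} ranked-salience jobs per event".format(
    len(apsal_grid), len(ranked_grid)))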
Example #6
    def make(self,
             event,
             prefix,
             feature_set,
             min_cluster_size=2,
             sim_threshold=.2264,
             center_threshold=1.0):
        # Keep cluster exemplars whose salience z-score clears center_threshold
        # and which are not too similar to anything already selected, then write
        # the surviving updates to a TSV.
        tsv_path = self.get_tsv_path(event, prefix, feature_set,
                                     center_threshold, sim_threshold)
        lvecs = SentenceLatentVectorsResource()
        spa = SaliencePredictionAggregator()
        apsal = APSalienceSummarizer()
        cluster_df = apsal.get_dataframe(event, prefix, feature_set)
        #for _, row in cluster_df.iterrows():
        #    print row['hour'], datetime.utcfromtimestamp(row['timestamp'])
        updates = []
        Xcache = None
        timestamps = sorted(list(cluster_df['timestamp'].unique()))
        for timestamp in timestamps:
            # Resource frames are keyed by the hour preceding this timestamp.
            hour = datetime.utcfromtimestamp(timestamp) - timedelta(hours=1)
            lvec_df = lvecs.get_dataframe(event, hour)
            lvec_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)
            sal_df = spa.get_dataframe(event, hour, prefix, feature_set)
            sal_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)

            clusters = cluster_df[cluster_df['timestamp'] == timestamp].copy()
            clusters.sort(['stream id', 'sentence id'], inplace=True)

            # Average the salience model predictions (columns 2+) for each
            # cluster exemplar in this hour.
            salience = []
            for _, row in clusters.iterrows():
                sal_row = sal_df.loc[
                    (sal_df['stream id'] == row['stream id']) &
                    (sal_df['sentence id'] == row['sentence id'])]
                sal_pred = sal_row.as_matrix()[:, 2:].astype(np.float64).mean()
                salience.append(sal_pred)
            clusters['salience'] = salience
            #clusters.sort(['salie'], inplace=True)

            # Per-hour statistics used to z-score each exemplar's salience.
            sal_mean = np.mean(salience)
            sal_std = np.std(salience)

            #print clusters
            for _, row in clusters.iterrows():
                # Skip exemplars from clusters smaller than min_cluster_size.
                if row['cluster size'] < min_cluster_size:
                    continue
                lvec_row = lvec_df.loc[
                    (lvec_df['stream id'] == row['stream id']) &
                    (lvec_df['sentence id'] == row['sentence id'])]
                vec = lvec_row.as_matrix()[:, 2:].astype(np.float64)

                # Skip exemplars whose salience z-score falls below the
                # center threshold.
                sal_norm = (row['salience'] - sal_mean) / sal_std
                if sal_norm < center_threshold:
                    continue

                # Greedy redundancy filter: skip anything too similar to a
                # previously selected vector, otherwise add it to the cache.
                if Xcache is None:
                    Xcache = vec
                else:
                    if np.max(cosine_similarity(Xcache, vec)) >= sim_threshold:
                        continue
                    else:
                        Xcache = np.vstack((Xcache, vec))
                updates.append({
                    'query id': event.query_id[5:],
                    'system id': 'cunlp',
                    'run id': 'apsal-time-ranked-sal_{}-sim_{}'.format(
                        center_threshold, sim_threshold),
                    'stream id': row['stream id'],
                    'sentence id': row['sentence id'],
                    'timestamp': timestamp,
                    'conf': row['salience'],
                    'string': row['string']
                })

        print "Writing", tsv_path, "For ", center_threshold, sim_threshold
        df = pd.DataFrame(updates,
                          columns=[
                              "query id", "system id", "run id", "stream id",
                              "sentence id", "timestamp", "conf", "string"
                          ])
        with open(tsv_path, u'w') as f:
            df.to_csv(f,
                      sep='\t',
                      index=False,
                      index_label=False,
                      header=False)
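
The inner loop of make combines two cuts: a per-hour z-score threshold on predicted salience (center_threshold) and a greedy cosine-similarity threshold against every sentence already selected (sim_threshold, via Xcache). A standalone sketch of that selection logic on dummy vectors; select_updates and everything below are illustrative, not part of the project's API:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def select_updates(vectors, salience, center_threshold=1.0, sim_threshold=.2264):
    # Z-score the salience values, then greedily keep vectors that clear the
    # salience cut and are not too similar to anything already selected.
    salience = np.asarray(salience, dtype=np.float64)
    z = (salience - salience.mean()) / salience.std()
    selected, kept = None, []
    for i, vec in enumerate(np.asarray(vectors, dtype=np.float64)):
        if z[i] < center_threshold:
            continue
        vec = vec.reshape(1, -1)
        if selected is not None and \
                np.max(cosine_similarity(selected, vec)) >= sim_threshold:
            continue
        selected = vec if selected is None else np.vstack((selected, vec))
        kept.append(i)
    return kept

# Five random 10-dimensional sentence vectors with random salience scores.
rng = np.random.RandomState(0)
print(select_updates(rng.randn(5, 10), rng.randn(5), center_threshold=0.0))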