예제 #1
0
파일: run.py 프로젝트: kedz/cuttsum
def get_all_semsim():
    """Map every known event type to a semantic-similarity object.

    ``event2semsim`` is called exactly four times, once per representative
    event type ("accident", "earthquake", "protest", "shooting", in that
    order).  Related event types reuse the *same* returned object rather
    than getting their own.

    Returns:
        dict: event-type string -> object returned by ``event2semsim``.
    """
    representatives = ("accident", "earthquake", "protest", "shooting")
    loaded = {name: event2semsim(name) for name in representatives}

    # (event type, representative whose object it shares)
    shared_by = (
        ("accident", "accident"),
        ("earthquake", "earthquake"),
        ("storm", "earthquake"),
        ("impact event", "earthquake"),
        ("shooting", "shooting"),
        ("hostage", "shooting"),
        ("conflict", "shooting"),
        ("bombing", "shooting"),
        ("protest", "protest"),
        ("riot", "protest"),
    )
    return dict(
        (event_type, loaded[representative])
        for event_type, representative in shared_by)
예제 #2
0
def get_all_semsim():
    """Build the event-type -> semantic-similarity lookup table.

    Four objects are obtained from ``event2semsim`` (for "accident",
    "earthquake", "protest" and "shooting", called in that order); every
    other event type is an alias that points at one of those four objects.

    Returns:
        dict: keyed by event-type string; aliased types share one object.
    """
    accident, natural_disaster, social, terror = [
        event2semsim(seed)
        for seed in ("accident", "earthquake", "protest", "shooting")
    ]

    table = {"accident": accident}
    for event_type in ("earthquake", "storm", "impact event"):
        table[event_type] = natural_disaster
    for event_type in ("shooting", "hostage", "conflict", "bombing"):
        table[event_type] = terror
    for event_type in ("protest", "riot"):
        table[event_type] = social
    return table
예제 #3
0
# Script fragment: for each event with query_num >= 26, read its clustered
# sentences from clusters-2015/<query_id>.tsv and greedily keep the rows whose
# semsim vector is below `thresh` cosine similarity to every row kept so far.
# NOTE(review): this snippet is cut off -- the body of the final
# `for result in results:` loop is not visible, and neither is any use of the
# output handle `o` or of `data`.
all_results = []
data = []
with open("apsal.tsv", "w") as o:
    for event in cuttsum.events.get_events():
        if event.query_num < 26: continue
        # NOTE(review): istream is not referenced again in the visible lines.
        istream = get_input_stream(event, False)

        with open("clusters-2015/{}.tsv".format(event.query_id), "r") as f:
            # SECURITY: `eval` is used as the converter for the "stems" and
            # "nuggets" columns, so the TSV contents are executed as Python.
            # Only safe if these files are trusted.
            df = pd.read_csv(f, sep="\t", converters={"stems": eval, "nuggets": eval})

        #thresh = lm2thr[event2lm_name(event)]
        thresh = .65

        # `cache` stacks the vectors of every row accepted so far for this
        # event; None until the first row is accepted.
        cache = None
        semsim = event2semsim(event)
        results = []
        for ts, batch in df.groupby("timestamp"):
            # One vector per row of the batch, built from the space-joined stems.
            X = semsim.transform(batch["stems"].apply(lambda x: ' '.join(x)).tolist())
            for i, (_, row) in enumerate(batch.iterrows()):
                if cache is None:
                    # The very first row of the event is always accepted.
                    cache = X[i]
                    results.append(row.to_dict())
                    all_results.append(row.to_dict())
                else:
                    K = cosine_similarity(cache, X[i])
                    # Accept only if strictly below the threshold against
                    # every previously accepted row.
                    if (K < thresh).all(): 
                        cache = np.vstack([cache, X[i]])
                        results.append(row.to_dict())
                        all_results.append(row.to_dict())
        for result in results:
예제 #4
0
    def do_job_unit(self, event, corpus, unit, **kwargs):
        """Compute per-sentence features for one event and write them to disk.

        Iterates over the document dataframes produced by
        ``DedupedArticlesResource.dataframe_iter``, annotates each document
        with CoreNLP, derives the BASIC / LM / Q_ / SUM_ / STREAM_ feature
        columns, and appends the rows (tab separated, no per-document header)
        to a gzip file at ``self.get_path(event, corpus, extractor, thresh)``.
        A single header line with all column names is written first.

        Args:
            event: event object; the visible code reads ``event.type`` and
                ``event.query`` and passes it to project helpers.
            corpus: passed through to ``dataframe_iter`` and ``get_path``.
            unit: job unit index; only ``0`` is accepted.
            **kwargs: optional settings read here:
                "service-configs": dict with a "corenlp" entry ("port",
                    default 9999), an entry keyed by ``event2lm_name(event)``
                    and a "gigaword-lm" entry (each needs "port"; "order"
                    defaults to 3).
                "dedupe-sim-threshold": default .8.
                "extractor": default "goose".

        Raises:
            Exception: if ``unit`` is not 0.
            KeyError: if "service-configs" lacks the domain LM or
                "gigaword-lm" entry, or either lacks "port".

        Note: Python 2 code (print statements, ``ur""`` literals, ``unichr``,
        list-returning ``map``).
        """
        if unit != 0:
            raise Exception("Job unit {} out of range".format(unit))

        # --- service / job configuration -----------------------------------
        service_configs = kwargs.get("service-configs", {})
        cnlp_configs = service_configs.get("corenlp", {})
        cnlp_port = int(cnlp_configs.get("port", 9999))

        # Unlike corenlp, the two language-model configs are mandatory
        # (plain indexing, so a missing entry raises KeyError).
        domain_lm_config = service_configs[event2lm_name(event)]
        domain_lm_port = int(domain_lm_config["port"])
        domain_lm_order = int(domain_lm_config.get("order", 3))
        gw_lm_config = service_configs["gigaword-lm"]
        gw_lm_port = int(gw_lm_config["port"])
        gw_lm_order = int(gw_lm_config.get("order", 3))

        thresh = kwargs.get("dedupe-sim-threshold", .8)
        extractor = kwargs.get("extractor", "goose")

        res = DedupedArticlesResource()
        dfiter = res.dataframe_iter(event,
                                    corpus,
                                    extractor,
                                    include_matches=None,
                                    threshold=thresh)

        domain_lm = cuttsum.srilm.Client(domain_lm_port, domain_lm_order, True)
        gw_lm = cuttsum.srilm.Client(gw_lm_port, gw_lm_order, True)
        cnlp_client = cnlp.client.CoreNLPClient(port=cnlp_port)

        def make_query_synsets():
            """Collect WordNet synonyms, hypernyms and hyponyms for the event.

            Looks up the first space-separated word of ``event.type`` and
            returns three sets of lower-cased, utf-8 encoded lemma names with
            underscores replaced by spaces: (synonyms, hypernyms, hyponyms).
            Prints the lookup word and the three lists as a side effect.
            """
            synonyms = []
            hypernyms = []
            hyponyms = []
            print event.type.split(' ')[0]
            for synset in wn.synsets(event.type.split(' ')[0]):
                synonyms.extend([
                    lemma.name().lower().replace(u'_', u' ').encode(u'utf-8')
                    for lemma in synset.lemmas()
                ])

                # NOTE: the comprehensions below reuse the name `synset` for
                # their own loop variable.
                hypernyms.extend([
                    lemma.name().lower().replace(u'_', u' ').encode(u'utf-8')
                    for synset in synset.hypernyms()
                    for lemma in synset.lemmas()
                ])

                hyponyms.extend([
                    lemma.name().lower().replace(u'_', u' ').encode(u'utf-8')
                    for synset in synset.hyponyms()
                    for lemma in synset.lemmas()
                ])
            print hypernyms
            print hyponyms
            print synonyms
            return set(synonyms), set(hypernyms), set(hyponyms)

        def heal_text(sent_text):
            """Clean up one tokenized sentence and return it utf-8 encoded.

            Applies a fixed sequence of regex substitutions; the order
            matters because later patterns see the output of earlier ones.
            """
            # Strip leading upper-case prefixes that end in a dash/underscore
            # run (these look like news datelines/bylines -- presumably).
            sent_text = re.sub(
                ur"[A-Z ]+, [A-Z][a-z ]+\( [A-Z]+ \) [-\u2014_]+ ", r"",
                sent_text)
            sent_text = re.sub(ur"^.*?[A-Z ]+, [A-Z][a-z]+ [-\u2014_]+ ", r"",
                               sent_text)
            sent_text = re.sub(ur"^.*?[A-Z ]+\([^\)]+\) [-\u2014_]+ ", r"",
                               sent_text)
            sent_text = re.sub(ur"^.*?[A-Z]+ +[-\u2014_]+ ", r"", sent_text)

            # Replace parenthesized spans with a space, drop a leading dash
            # run, then re-attach punctuation that is separated by spaces.
            sent_text = re.sub(r"\([^)]+\)", r" ", sent_text)
            sent_text = re.sub(ur"^ *[-\u2014_]+", r"", sent_text)
            sent_text = re.sub(u" ([,.;?!]+)([\"\u201c\u201d'])", r"\1\2",
                               sent_text)
            sent_text = re.sub(r" ([:-]) ", r"\1", sent_text)
            # Rejoin thousands groups split as "12 , 345".
            sent_text = re.sub(r"([^\d]\d{1,3}) , (\d\d\d)([^\d]|$)",
                               r"\1,\2\3", sent_text)
            sent_text = re.sub(r"^(\d{1,3}) , (\d\d\d)([^\d]|$)", r"\1,\2\3",
                               sent_text)
            # Rejoin apostrophe + single letter / ll / ve / re.
            sent_text = re.sub(ur" ('|\u2019) ([a-z]|ll|ve|re)( |$)", r"\1\2 ",
                               sent_text)
            sent_text = re.sub(r" ([',.;?!]+) ", r"\1 ", sent_text)
            sent_text = re.sub(r" ([',.;?!]+)$", r"\1", sent_text)

            # Rejoin split decimals and the abbreviations a.m./p.m., U.S./U.N.
            sent_text = re.sub(r"(\d\.) (\d)", r"\1\2", sent_text)
            sent_text = re.sub(r"(a|p)\. m\.", r"\1.m.", sent_text)
            sent_text = re.sub(r"U\. (S|N)\.", r"U.\1.", sent_text)

            # Remove the space just inside curly double/single quotes.
            sent_text = re.sub(ur"\u201c ([^\s])", ur"\u201c\1", sent_text)
            sent_text = re.sub(ur"([^\s]) \u201d", ur"\1\u201d", sent_text)
            sent_text = re.sub(ur"\u2018 ([^\s])", ur"\u2018\1", sent_text)
            sent_text = re.sub(ur"([^\s]) \u2019", ur"\1\u2019", sent_text)

            # U+00E2 is replaced by an apostrophe (presumably a mojibake
            # repair -- TODO confirm).
            sent_text = re.sub(ur"\u00e2", ur"'", sent_text)
            sent_text = re.sub(r"^Photo:Reuters|^Photo:AP", r"", sent_text)
            sent_text = sent_text.replace("\n", " ")

            return sent_text.encode("utf-8")

        def get_number_feats(sent):
            """Return one feature list per dependency chain of each NUMBER token.

            Each feature is ``[tok.nne] + [lemma of every chain element]``;
            tokens whose ``nne`` is None are skipped.
            """
            feats = []
            for tok in sent:
                if tok.ne == "NUMBER" and tok.nne is not None:
                    for chain in get_dep_chain(tok, sent, 0):
                        feat = [tok.nne] + [elem[1].lem for elem in chain]
                        feats.append(feat)
            return feats

        def get_dep_chain(tok, sent, depth):
            """Recursively collect chains of entries from ``sent.dep2govs``.

            Starting at ``tok``, follows ``sent.dep2govs[tok]``; recursion
            stops once ``depth`` exceeds 2.  For an entry whose ``p[1]`` is
            a noun the chain is extended through that noun; note that if the
            recursive call yields no chains, nothing is recorded for that
            noun.  Any other truthy ``p[1]`` ends a chain.
            """
            chains = []
            if depth > 2:
                return chains
            for p in sent.dep2govs[tok]:
                if p[1].is_noun():
                    for chain in get_dep_chain(p[1], sent, depth + 1):
                        chains.append([p] + chain)
                elif p[1]:
                    chains.append([p])
            return chains

        # Character class matching every code point below 65536 whose
        # Unicode general category starts with 'P' (punctuation).
        import unicodedata as u
        P = ''.join(
            unichr(i) for i in range(65536) if u.category(unichr(i))[0] == 'P')
        P = re.escape(P)
        punc_patt = re.compile("[" + P + "]")

        from collections import defaultdict
        stopwords = english_stopwords()
        # NOTE(review): mention_counts and total_mentions are not referenced
        # again in this method.
        mention_counts = defaultdict(int)
        total_mentions = 0

        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()

        synonyms, hypernyms, hyponyms = make_query_synsets()

        # Make sure the output directory exists before opening the file.
        path = self.get_path(event, corpus, extractor, thresh)
        dirname = os.path.dirname(path)
        if not os.path.exists(dirname): os.makedirs(dirname)

        # --- output schema: column order of the written TSV ----------------
        meta_cols = [
            "update id", "stream id", "sent id", "timestamp", "pretty text",
            "tokens", "lemmas", "stems", "pos", "ne", "tokens stopped",
            "lemmas stopped"
        ]

        basic_cols = [
            "BASIC length", "BASIC char length", "BASIC doc position",
            "BASIC all caps ratio", "BASIC upper ratio", "BASIC lower ratio",
            "BASIC punc ratio", "BASIC person ratio", "BASIC location ratio",
            "BASIC organization ratio", "BASIC date ratio", "BASIC time ratio",
            "BASIC duration ratio", "BASIC number ratio",
            "BASIC ordinal ratio", "BASIC percent ratio", "BASIC money ratio",
            "BASIC set ratio", "BASIC misc ratio"
        ]

        lm_cols = [
            "LM domain lp", "LM domain avg lp", "LM gw lp", "LM gw avg lp"
        ]

        query_cols = [
            "Q_query_sent_cov",
            "Q_sent_query_cov",
            "Q_syn_sent_cov",
            "Q_sent_syn_cov",
            "Q_hyper_sent_cov",
            "Q_sent_hyper_cov",
            "Q_hypo_sent_cov",
            "Q_sent_hypo_cov",
        ]

        sum_cols = [
            "SUM_sbasic_sum",
            "SUM_sbasic_amean",
            "SUM_sbasic_max",
            "SUM_novelty_gmean",
            "SUM_novelty_amean",
            "SUM_novelty_max",
            "SUM_centrality",
            "SUM_pagerank",
            "SUM_sem_novelty_gmean",
            "SUM_sem_novelty_amean",
            "SUM_sem_novelty_max",
            "SUM_sem_centrality",
            "SUM_sem_pagerank",
        ]

        stream_cols = [
            "STREAM_sbasic_sum",
            "STREAM_sbasic_amean",
            "STREAM_sbasic_max",
            "STREAM_per_prob_sum",
            "STREAM_per_prob_max",
            "STREAM_per_prob_amean",
            "STREAM_loc_prob_sum",
            "STREAM_loc_prob_max",
            "STREAM_loc_prob_amean",
            "STREAM_org_prob_sum",
            "STREAM_org_prob_max",
            "STREAM_org_prob_amean",
            "STREAM_nt_prob_sum",
            "STREAM_nt_prob_max",
            "STREAM_nt_prob_amean",
        ]

        semsim = event2semsim(event)

        all_cols = meta_cols + basic_cols + query_cols + lm_cols + sum_cols + stream_cols

        # Running counts; created outside the document loop, so they
        # accumulate over every document processed so far.
        stream_uni_counts = defaultdict(int)
        stream_per_counts = defaultdict(int)
        stream_loc_counts = defaultdict(int)
        stream_org_counts = defaultdict(int)
        stream_nt_counts = defaultdict(int)

        with gzip.open(path, "w") as f:
            f.write("\t".join(all_cols) + "\n")
            for df in dfiter:
                # Single-sentence documents are skipped; at most the first
                # 20 sentences of a document are used.
                if len(df) == 1: continue
                df = df.head(20)

                #df["lm"] = df["sent text"].apply(lambda x: lm.sentence_log_prob(x.encode("utf-8"))[1])
                # Clean the text, then drop sentences that are empty or have
                # 200 or more space-separated pieces.
                df["pretty text"] = df["sent text"].apply(heal_text)
                df = df[df["pretty text"].apply(lambda x: len(x.strip())) > 0]
                df = df[
                    df["pretty text"].apply(lambda x: len(x.split(" "))) < 200]
                df = df.reset_index(drop=True)
                if len(df) == 0:
                    print "skipping"
                    continue
                # One sentence per line is sent to CoreNLP; the column
                # assignments below rely on `doc` yielding exactly one
                # sentence per dataframe row (assumed -- TODO confirm).
                doc_text = "\n".join(df["pretty text"].tolist())

                doc = cnlp_client.annotate(doc_text)
                df["tokens"] = map(lambda sent: [str(tok) for tok in sent],
                                   doc)
                df["lemmas"] = map(
                    lambda sent: [tok.lem.encode("utf-8") for tok in sent],
                    doc)

                df["stems"] = map(
                    lambda sent:
                    [stemmer.stem(unicode(tok).lower()) for tok in sent], doc)
                df["pos"] = map(lambda sent: [tok.pos for tok in sent], doc)

                df["ne"] = map(lambda sent: [tok.ne for tok in sent], doc)

                # "stopped" variants drop stopwords and tokens of 50 or more
                # characters.
                df["tokens stopped"] = map(
                    lambda sent: [str(tok) for tok in sent
                                  if unicode(tok).lower() not in stopwords \
                                      and len(unicode(tok)) < 50],
                    doc)
                df["lemmas stopped"] = map(
                    lambda sent: [tok.lem.lower().encode("utf-8") for tok in sent
                                  if unicode(tok).lower() not in stopwords \
                                      and len(unicode(tok)) < 50],
                    doc)

                df["num tuples"] = [get_number_feats(sent) for sent in doc]
                ### Basic Features ###

                # Doc position is assigned before the empty-sentence filter,
                # so it refers to the pre-filter row order.
                df["BASIC length"] = df["lemmas stopped"].apply(len)
                df["BASIC doc position"] = df.index.values + 1

                df = df[df["BASIC length"] > 0]
                df = df.reset_index(drop=True)

                df["BASIC char length"] = df["pretty text"].apply(
                    lambda x: len(x.replace(" ", "")))

                # Character-level ratios are normalized by the non-space
                # character count (clamped to at least 1).
                df["BASIC upper ratio"] = df["pretty text"].apply(
                    lambda x: len(re.findall("[A-Z]", x))) \
                    / df["BASIC char length"].apply(lambda x: float(max(x, 1)))

                df[ "BASIC lower ratio"] = df["pretty text"].apply(
                    lambda x: len(re.findall("[a-z]", x))) \
                    / df["BASIC char length"].apply(lambda x: float(max(x, 1)))

                df["BASIC punc ratio"] = df["pretty text"].apply(
                    lambda x: len(re.findall(punc_patt, x))) \
                    / df["BASIC char length"].apply(lambda x: float(max(x, 1)))
                # Token-level ratios below are all normalized by
                # "BASIC length" (the stopped-lemma count), even though the
                # named-entity counts run over the unfiltered "ne" list.
                df["BASIC all caps ratio"] = df["tokens stopped"].apply(
                    lambda x: np.sum([1 if re.match("^[A-Z]+$", xi) else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float)

                df["BASIC person ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "PERSON" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float)

                df["BASIC location ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "LOCATION" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float)

                df["BASIC organization ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "ORGANIZATION" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float)

                df["BASIC date ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "DATE" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float)

                df["BASIC time ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "TIME" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float)

                df["BASIC duration ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "DURATION" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float)

                df["BASIC number ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "NUMBER" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float)

                df["BASIC ordinal ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "ORDINAL" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float)

                df["BASIC percent ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "PERCENT" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float)

                df["BASIC money ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "MONEY" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float)

                df["BASIC set ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "SET" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float)

                df["BASIC misc ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "MISC" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float)

                ### Language Model Features ###

                # Each sentence is scored as its lower-cased lemmas (those
                # shorter than 50 bytes) joined by spaces; the client result
                # is unpacked below as (log prob, average log prob).
                dm_probs = df["lemmas"].apply(
                    lambda x: domain_lm.sentence_log_prob(" ".join([
                        xi.decode("utf-8").lower().encode("utf-8") for xi in x
                        if len(xi) < 50
                    ])))
                dm_log_probs = [lp for lp, avg_lp in dm_probs.tolist()]
                dm_avg_log_probs = [avg_lp for lp, avg_lp in dm_probs.tolist()]
                df["LM domain lp"] = dm_log_probs
                df["LM domain avg lp"] = dm_avg_log_probs
                gw_probs = df["lemmas"].apply(
                    lambda x: gw_lm.sentence_log_prob(" ".join([
                        xi.decode("utf-8").lower().encode("utf-8") for xi in x
                        if len(xi) < 50
                    ])))
                gw_log_probs = [lp for lp, avg_lp in gw_probs.tolist()]
                gw_avg_log_probs = [avg_lp for lp, avg_lp in gw_probs.tolist()]
                df["LM gw lp"] = gw_log_probs
                df["LM gw avg lp"] = gw_avg_log_probs

                ### Query Features ###

                # The return value is ignored and no Q_* column is assigned
                # here, so this call presumably adds the Q_* columns to `df`
                # in place -- verify against compute_query_features.
                self.compute_query_features(
                    df, set([q.lower() for q in event.query]), synonyms,
                    hypernyms, hyponyms)

                ### Single Doc Summarization Features ###

                # Per-sentence lemma counts (`counts`) and document-level
                # counts (`doc_counts`, with the token total under the
                # "__TOTAL__" key).
                counts = []
                doc_counts = defaultdict(int)
                for lemmas in df["lemmas stopped"].tolist():
                    counts_i = {}
                    for lem in lemmas:
                        counts_i[lem.lower()] = counts_i.get(lem.lower(),
                                                             0) + 1
                        doc_counts[lem.lower()] += 1
                    doc_counts["__TOTAL__"] += len(lemmas)
                    counts.append(counts_i)
                # Force the total to float so the divisions below are true
                # divisions under Python 2.
                doc_counts["__TOTAL__"] *= 1.
                doc_uni = {
                    key: val / doc_counts["__TOTAL__"]
                    for key, val in doc_counts.items() if key != "__TOTAL__"
                }

                # Sum / mean / max of the document unigram probabilities of
                # each sentence's lemmas.
                sum_probs = []
                amean_probs = []
                max_probs = []
                for lemmas in df["lemmas stopped"].tolist():
                    probs = [doc_uni[lem.lower()] for lem in lemmas]
                    sum_probs.append(np.sum(probs))
                    amean_probs.append(np.mean(probs))
                    max_probs.append(np.max(probs))

                df["SUM_sbasic_sum"] = sum_probs
                df["SUM_sbasic_amean"] = amean_probs
                df["SUM_sbasic_max"] = max_probs

                # TF-IDF sentence vectors built from the per-sentence counts.
                tfidfer = TfidfTransformer()
                vec = DictVectorizer()
                X = vec.fit_transform(counts)
                X = tfidfer.fit_transform(X)

                # Centrality is stored as a rank: 1 for the sentence most
                # similar to the mean vector.
                ctrd = X.mean(axis=0)
                K = cosine_similarity(ctrd, X).ravel()
                I = K.argsort()[::-1]
                R = np.array([[i, r + 1] for r, i in enumerate(I)])
                R = R[R[:, 0].argsort()]
                df["SUM_centrality"] = R[:, 1]

                # Same ranking, using the semsim vectors of the joined stems.
                L = semsim.transform(
                    df["stems"].apply(lambda x: ' '.join(x)).tolist())
                ctrd_l = L.mean(axis=0)
                K_L = cosine_similarity(ctrd_l, L).ravel()
                I_L = K_L.argsort()[::-1]
                R_L = np.array([[i, r + 1] for r, i in enumerate(I_L)])
                R_L = R_L[R_L[:, 0].argsort()]
                df["SUM_sem_centrality"] = R_L[:, 1]

                # Novelty: statistics of (1 - cosine similarity) to the other
                # sentences; the diagonal (self-similarity) is masked out.
                K = cosine_similarity(X)
                M = np.zeros_like(K)
                M[np.diag_indices(K.shape[0])] = 1
                Km = np.ma.masked_array(K, M)
                D = 1 - Km

                novelty_amean = D.mean(axis=1)
                novelty_max = D.max(axis=1)
                novelty_gmean = gmean(D, axis=1)

                df["SUM_novelty_amean"] = novelty_amean
                df["SUM_novelty_max"] = novelty_max
                df["SUM_novelty_gmean"] = novelty_gmean

                K_L = cosine_similarity(L)
                # NOTE(review): the mask is allocated from K rather than K_L;
                # both are pairwise matrices over the same rows, so the shapes
                # should agree -- confirm this was intended.
                M_L = np.zeros_like(K)
                M_L[np.diag_indices(K_L.shape[0])] = 1
                K_Lm = np.ma.masked_array(K_L, M_L)
                D_L = 1 - K_Lm

                sem_novelty_amean = D_L.mean(axis=1)
                sem_novelty_max = D_L.max(axis=1)
                sem_novelty_gmean = gmean(D_L, axis=1)

                df["SUM_sem_novelty_amean"] = sem_novelty_amean
                df["SUM_sem_novelty_max"] = sem_novelty_max
                df["SUM_sem_novelty_gmean"] = sem_novelty_gmean

                # "pagerank" here is 1 - degree / (2 * edge count) on the
                # graph linking sentences with similarity > 0; the "- 1" and
                # "- K.shape[0]" terms remove the self-loops on the diagonal.
                K = (K > 0).astype("int32")
                degrees = K.sum(axis=1) - 1
                edges_x_2 = K.sum() - K.shape[0]
                if edges_x_2 == 0: edges_x_2 = 1
                pr = 1. - degrees / float(edges_x_2)
                df["SUM_pagerank"] = pr

                # Same measure on the semsim graph, with edges where the
                # similarity exceeds .2.
                K_L = (K_L > .2).astype("int32")
                degrees_L = K_L.sum(axis=1) - 1
                edges_x_2_L = K_L.sum() - K_L.shape[0]
                if edges_x_2_L == 0: edges_x_2_L = 1
                pr_L = 1. - degrees_L / float(edges_x_2_L)
                df["SUM_sem_pagerank"] = pr_L

                print df["pretty text"]
                # print df[["SUM_sbasic_sum", "SUM_sbasic_amean", "SUM_sbasic_max"]]
                # print df[
                #     ["SUM_pagerank", "SUM_centrality", "SUM_novelty_gmean",
                #      "SUM_novelty_amean", "SUM_novelty_max"]]

                ### Stream Features ###
                # Fold this document's counts (including "__TOTAL__") into
                # the running stream counts before scoring, so the current
                # document contributes to its own stream probabilities.
                for key, val in doc_counts.items():
                    stream_uni_counts[key] += val
                denom = stream_uni_counts["__TOTAL__"]
                sum_probs = []
                amean_probs = []
                max_probs = []

                for lemmas in df["lemmas stopped"].tolist():
                    probs = [
                        stream_uni_counts[lem.lower()] / denom
                        for lem in lemmas
                    ]
                    sum_probs.append(np.sum(probs))
                    amean_probs.append(np.mean(probs))
                    max_probs.append(np.max(probs))

                df["STREAM_sbasic_sum"] = sum_probs
                df["STREAM_sbasic_amean"] = amean_probs
                df["STREAM_sbasic_max"] = max_probs

                # Update the running PERSON / LOCATION / ORGANIZATION lemma
                # counts; the float increment keeps each "__TOTAL__" a float.
                for lemmas, nes in izip(df["lemmas"].tolist(),
                                        df["ne"].tolist()):
                    for lem, ne in izip(lemmas, nes):
                        if ne == "PERSON":
                            stream_per_counts[lem.lower()] += 1
                            stream_per_counts["__TOTAL__"] += 1.
                        if ne == "LOCATION":
                            stream_loc_counts[lem.lower()] += 1
                            stream_loc_counts["__TOTAL__"] += 1.
                        if ne == "ORGANIZATION":
                            stream_org_counts[lem.lower()] += 1
                            stream_org_counts["__TOTAL__"] += 1.

                # Update the running counts of number-tuple items.
                for tuples in df["num tuples"].tolist():

                    for nt in tuples:
                        for item in nt:
                            stream_nt_counts[item.lower()] += 1
                            stream_nt_counts["__TOTAL__"] += 1.

                pdenom = stream_per_counts["__TOTAL__"]
                ldenom = stream_loc_counts["__TOTAL__"]
                odenom = stream_org_counts["__TOTAL__"]
                ntdenom = stream_nt_counts["__TOTAL__"]
                sum_per_probs = []
                amean_per_probs = []
                max_per_probs = []
                sum_loc_probs = []
                amean_loc_probs = []
                max_loc_probs = []
                sum_org_probs = []
                amean_org_probs = []
                max_org_probs = []
                sum_nt_probs = []
                amean_nt_probs = []
                max_nt_probs = []

                # Per-sentence sum / mean / max of the stream probabilities;
                # a sentence with nothing to score gets 0 for all three.
                for tuples in df["num tuples"].tolist():
                    if ntdenom > 0:
                        nt_probs = [
                            stream_nt_counts[item.lower()] / ntdenom
                            for nt in tuples for item in nt
                        ]
                    else:
                        nt_probs = []

                    if len(nt_probs) > 0:
                        sum_nt_probs.append(np.sum(nt_probs))
                        amean_nt_probs.append(np.mean(nt_probs))
                        max_nt_probs.append(np.max(nt_probs))
                    else:
                        sum_nt_probs.append(0)
                        amean_nt_probs.append(0)
                        max_nt_probs.append(0)

                for lemmas, nes in izip(df["lemmas"].tolist(),
                                        df["ne"].tolist()):

                    if pdenom > 0:
                        per_probs = [
                            stream_per_counts[lem.lower()] / pdenom
                            for lem, ne in izip(lemmas, nes) if ne == "PERSON"
                        ]
                    else:
                        per_probs = []

                    if len(per_probs) > 0:
                        sum_per_probs.append(np.sum(per_probs))
                        amean_per_probs.append(np.mean(per_probs))
                        max_per_probs.append(np.max(per_probs))
                    else:
                        sum_per_probs.append(0)
                        amean_per_probs.append(0)
                        max_per_probs.append(0)

                    if ldenom > 0:
                        loc_probs = [
                            stream_loc_counts[lem.lower()] / ldenom
                            for lem, ne in izip(lemmas, nes)
                            if ne == "LOCATION"
                        ]
                    else:
                        loc_probs = []

                    if len(loc_probs) > 0:
                        sum_loc_probs.append(np.sum(loc_probs))
                        amean_loc_probs.append(np.mean(loc_probs))
                        max_loc_probs.append(np.max(loc_probs))
                    else:
                        sum_loc_probs.append(0)
                        amean_loc_probs.append(0)
                        max_loc_probs.append(0)

                    if odenom > 0:
                        org_probs = [
                            stream_org_counts[lem.lower()] / odenom
                            for lem, ne in izip(lemmas, nes)
                            if ne == "ORGANIZATION"
                        ]
                    else:
                        org_probs = []

                    if len(org_probs) > 0:
                        sum_org_probs.append(np.sum(org_probs))
                        amean_org_probs.append(np.mean(org_probs))
                        max_org_probs.append(np.max(org_probs))
                    else:
                        sum_org_probs.append(0)
                        amean_org_probs.append(0)
                        max_org_probs.append(0)

                df["STREAM_per_prob_sum"] = sum_per_probs
                df["STREAM_per_prob_max"] = max_per_probs
                df["STREAM_per_prob_amean"] = amean_per_probs

                df["STREAM_loc_prob_sum"] = sum_loc_probs
                df["STREAM_loc_prob_max"] = max_loc_probs
                df["STREAM_loc_prob_amean"] = amean_loc_probs

                df["STREAM_org_prob_sum"] = sum_org_probs
                df["STREAM_org_prob_max"] = max_org_probs
                df["STREAM_org_prob_amean"] = amean_org_probs

                df["STREAM_nt_prob_sum"] = sum_nt_probs
                df["STREAM_nt_prob_max"] = max_nt_probs
                df["STREAM_nt_prob_amean"] = amean_nt_probs

                #print df[["STREAM_sbasic_sum", "STREAM_sbasic_amean", "STREAM_sbasic_max"]]
                #print df[["STREAM_per_prob_sum", "STREAM_per_prob_amean", "STREAM_per_prob_max"]]
                #print df[["STREAM_loc_prob_sum", "STREAM_loc_prob_amean", "STREAM_loc_prob_max"]]
                #print df[["STREAM_nt_prob_sum", "STREAM_nt_prob_amean", "STREAM_nt_prob_max"]]

                ### Write dataframe to file ###
                # Only the columns in all_cols are written ("num tuples" is
                # not among them); the header was written once above.
                df[all_cols].to_csv(f, index=False, header=False, sep="\t")
예제 #5
0
파일: ap-test.py 프로젝트: kedz/cuttsum
def _summary_row(sent, query_num, system_timestamp):
    """Return a one-row DataFrame describing a selected update sentence.

    ``sent`` is a row (Series) of the input stream; it is copied so the
    caller's data is not modified when the two extra fields are attached.
    """
    sent = sent.copy()
    sent["query id"] = query_num
    sent["system timestamp"] = system_timestamp
    return sent[
        ["query id", "stream id", "sent id", "system timestamp", "sent text"]
    ].to_frame().T


def main(output_dir, sim_threshold, bucket_size):
    """Run the affinity-propagation summarizer and write a TSV run file.

    For every event whose ``query_num`` is not a development query, the input
    stream is cut into time buckets, the sentences of each bucket are
    clustered with ``AffinityPropagation`` on negative cosine distance, and a
    cluster exemplar is emitted as an update unless it is too similar to an
    exemplar already emitted for that event.

    Args:
        output_dir: directory for the run file; created if missing.
        sim_threshold: an exemplar is emitted only if its maximum cosine
            similarity to all previously emitted exemplars of the same event
            is strictly below this value.
        bucket_size: only hour indices that are multiples of this value (and
            the last hour index) trigger a clustering step.

    Side effects:
        Prints progress and similarity statistics, and writes
        ``<output_dir>/ap.sim<sim_threshold>_bs<bucket_size>.tsv`` (tab
        separated, no header, no index).

    Raises:
        ValueError: from ``pd.concat`` if no update was selected at all.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    dev_qids = set([19, 23, 27, 34, 35] + [7, 24])

    summary_data = []
    K_data = []
    for event in cuttsum.events.get_events():
        if event.query_num in dev_qids:
            continue

        # Single-argument print(...) behaves the same as the Python 2 print
        # statement and also parses under Python 3.
        print(event)

        semsim = event2semsim(event)
        istream = get_input_stream(event, False, extractor="goose",
                                   thresh=.8, delay=None, topk=20)
        prev_time = 0
        # Stacked vectors of every exemplar emitted so far for this event.
        cache = None

        hours = event.list_event_hours()
        max_h = len(hours) - 1

        for h, hour in enumerate(hours):
            if h % bucket_size != 0 and h != max_h:
                continue

            current_time = epoch(hour)
            in_window = (istream["timestamp"] < current_time) & \
                (istream["timestamp"] >= prev_time)
            input_sents = istream[in_window]
            input_sents = input_sents[
                input_sents["lemmas stopped"].apply(len) > 10]

            # prev_time is deliberately left untouched here, so the next
            # bucket's window also covers these sentences.
            if len(input_sents) <= 1:
                continue

            stems = input_sents["stems"].apply(lambda x: ' '.join(x)).tolist()
            X = semsim.transform(stems)
            K = -(1 - cosine_similarity(X))
            # Mask the diagonal so self-similarity does not enter the stats.
            K_ma = np.ma.masked_array(K, np.eye(K.shape[0]))
            Kmin = np.ma.min(K_ma)
            Kmax = np.ma.max(K_ma)
            # np.ma.median may hand back either a scalar or a one-element
            # array depending on the NumPy version (assumption -- the
            # original indexed it with [0]); atleast_1d makes [0] valid
            # for both.
            median = np.atleast_1d(np.ma.median(K_ma))[0]
            print("SYS TIME: {} # SENTS: {} min/median/max pref: {}/{}/{}"
                  .format(hour, K.shape[0], Kmin, median, Kmax))

            # Record the bucket statistics; previously K_data was never
            # filled, so the summary printed at the end was always empty.
            K_data.append({"min": Kmin, "max": Kmax, "median": median})

            ap = AffinityPropagation(affinity="precomputed",
                                     verbose=True, max_iter=1000)
            ap.fit(K)

            centers = ap.cluster_centers_indices_
            # Identity checks: `== None` / `!= None` on an ndarray compares
            # elementwise and cannot be used as an `if` condition.
            if centers is not None:
                rows = input_sents.reset_index(drop=True)
                for c in centers:
                    if cache is None:
                        # The first exemplar of an event is always emitted.
                        cache = X[c]
                    else:
                        Ksum = cosine_similarity(cache, X[c])
                        if not Ksum.max() < sim_threshold:
                            continue
                        cache = np.vstack([cache, X[c]])
                    summary_data.append(
                        _summary_row(rows.iloc[c], event.query_num,
                                     current_time))

            prev_time = current_time

    df = pd.DataFrame(K_data, columns=["min", "max", "median"])
    print(df)
    print(df.mean())
    print(df.std())
    print(df.max())
    df = pd.concat(summary_data)
    df["conf"] = .5
    df["team id"] = "AP"
    df["run id"] = "sim{}_bs{}".format(
        sim_threshold, bucket_size)
    print(df)
    of = os.path.join(output_dir, "ap." + "sim{}_bs{}.tsv".format(
        sim_threshold, bucket_size))
    cols = ["query id", "team id", "run id", "stream id", "sent id",
            "system timestamp", "conf"]
    df[cols].to_csv(of, sep="\t", header=False, index=False)
예제 #6
0
파일: apsal-test.py 프로젝트: kedz/cuttsum
def main(output_dir, sim_threshold, bucket_size, pref_offset): 
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)


    dev_qids = set([19, 23, 27, 34, 35] + [7, 24])

    summary_data = []

    K_data = []
    for event in cuttsum.events.get_events():
        if event.query_num in dev_qids: continue
        

        print event

        semsim = event2semsim(event)
        istream = get_input_stream(event, False, extractor="goose", 
            thresh=.8, delay=None, topk=20)
        prev_time = 0
        cache = None

        clusters = []

        max_h = len(event.list_event_hours()) - 1


        for h, hour in enumerate(event.list_event_hours()):
            if h % bucket_size != 0 and h != max_h:
                continue
            
            current_time = epoch(hour)
            input_sents = istream[
                (istream["timestamp"] < current_time) & \
                (istream["timestamp"] >= prev_time)]
            len_select = input_sents["lemmas stopped"].apply(len) > 10
            input_sents = input_sents[len_select]

            if len(input_sents) <= 1: continue

            stems = input_sents["stems"].apply(lambda x: ' '.join(x)).tolist()
            X = semsim.transform(stems)
            probs = input_sents["probs"]
            p = probs.values
            K = -(1 - cosine_similarity(X))   
            K_ma = np.ma.masked_array(K, np.eye(K.shape[0]))
            Kmin = np.ma.min(K_ma)
            Kmax = np.ma.max(K_ma)
            median = np.ma.median(K_ma)[0]
            pref = np.minimum(p + median, -.05) 
            print "SYS TIME:", hour, "# SENTS:", K.shape[0], 
            print "min/median/max pref: {}/{}/{}".format(
                    pref.min(), np.median(pref), pref.max())

            #K_data.append({"min": Kmin, "max": Kmax, "median": median})
            K_data.append({"min": (pref).min(), "max": (pref).max(), 
                           "median": np.median((pref))})

            #print K
    #        continue
    #
            ap = AffinityPropagation(preference=pref-pref_offset, affinity="precomputed",
                    verbose=True, max_iter=50000)
            ap.fit(K)
    #        ##print input_sents["pretty text"]
    #       
            labels = ap.labels_ 
            if ap.cluster_centers_indices_ != None:
                for c in ap.cluster_centers_indices_:
                    if cache == None:
                        cache = X[c]
                        updates_df = input_sents.reset_index(
                                drop=True).iloc[c]
                        updates_df["query id"] = event.query_num
                        updates_df["system timestamp"] = current_time
                        summary_data.append(
                            updates_df[
                                ["query id", "stream id", "sent id", 
                                 "system timestamp", "sent text"]
                            ].to_frame().T)

                    else:
                        Ksum = cosine_similarity(cache, X[c])
                        
                        #print "MAX SIM", Ksum.max()
                        #print input_sents.reset_index(drop=True).iloc[c]["sent text"]

                        if Ksum.max() < sim_threshold:

                            cache = np.vstack([cache, X[c]])
                            updates_df = input_sents.reset_index(
                                    drop=True).iloc[c]
                            updates_df["query id"] = event.query_num
                            updates_df["system timestamp"] = current_time
                            summary_data.append(
                                updates_df[
                                    ["query id", "stream id", "sent id", 
                                     "system timestamp", "sent text"]
                                ].to_frame().T)

    #
    #        for l, i in enumerate(af.cluster_centers_indices_):
    #            support = np.sum(labels == l)
    #            center = input_sents.iloc[i][["update id", "sent text", "pretty text", "stems", "nuggets"]]
    #            center = center.to_dict()
    #            center["support"] = support
    #            center["timestamp"] = current_time
    #            clusters.append(center)
    #            


            prev_time = current_time
    #    df = pd.DataFrame(clusters, columns=["update id", "timestamp", "support", "sent text", "pretty text", "stems", "nuggets"])
    #
    #    import os
    #    dirname = "clusters"
    #    if not os.path.exists(dirname):
    #        os.makedirs(dirname)
    #
    #    with open(os.path.join(dirname, "{}.tsv".format(event.query_id)), "w") as f:
    #        df.to_csv(f, sep="\t", index=False)
    #
    df = pd.DataFrame(K_data, columns=["min", "max", "median"])
    print df
    print df.mean()
    print df.std()
    print df.max()
    df =  pd.concat(summary_data)
    df["conf"] = .5
    df["team id"] = "APSAL"
    df["run id"] = "sim{}_bs{}_off{}".format(
        sim_threshold, bucket_size, pref_offset)
    print df
    of = os.path.join(output_dir, "apsal" + "sim{}_bs{}_off{}.tsv".format(
                sim_threshold, bucket_size, pref_offset))
    cols = ["query id", "team id", "run id", "stream id", "sent id", 
            "system timestamp", "conf"]
    df[cols].to_csv(of, sep="\t", header=False, index=False) 
예제 #7
0
def main(output_dir, sim_threshold, bucket_size):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    dev_qids = set([19, 23, 27, 34, 35])

    summary_data = []

    K_data = []
    for event in cuttsum.events.get_events():
        if event.query_num not in dev_qids: continue

        print event

        semsim = event2semsim(event)
        istream = get_input_stream(event,
                                   False,
                                   extractor="goose",
                                   thresh=.8,
                                   delay=None,
                                   topk=20)
        prev_time = 0
        cache = None

        clusters = []

        max_h = len(event.list_event_hours()) - 1

        for h, hour in enumerate(event.list_event_hours()):
            if h % bucket_size != 0 and h != max_h:
                continue

            current_time = epoch(hour)
            input_sents = istream[
                (istream["timestamp"] < current_time) & \
                (istream["timestamp"] >= prev_time)]
            len_select = input_sents["lemmas stopped"].apply(len) > 10
            input_sents = input_sents[len_select]

            if len(input_sents) <= 1: continue

            stems = input_sents["stems"].apply(lambda x: ' '.join(x)).tolist()
            X = semsim.transform(stems)
            K = -(1 - cosine_similarity(X))
            K_ma = np.ma.masked_array(K, np.eye(K.shape[0]))
            Kmin = np.ma.min(K_ma)
            Kmax = np.ma.max(K_ma)
            median = np.ma.median(K_ma)[0]
            print "SYS TIME:", hour, "# SENTS:", K.shape[0],
            print "min/median/max pref: {}/{}/{}".format(Kmin, median, Kmax)

            #
            ap = AffinityPropagation(affinity="precomputed",
                                     verbose=True,
                                     max_iter=1000)
            ap.fit(K)
            labels = ap.labels_
            if ap.cluster_centers_indices_ != None:
                for c in ap.cluster_centers_indices_:
                    if cache == None:
                        cache = X[c]
                        updates_df = \
                            input_sents.reset_index(drop=True).iloc[c]
                        updates_df["query id"] = event.query_num
                        updates_df["system timestamp"] = current_time
                        summary_data.append(updates_df[[
                            "query id", "stream id", "sent id",
                            "system timestamp", "sent text"
                        ]].to_frame().T)

                    else:
                        Ksum = cosine_similarity(cache, X[c])
                        if Ksum.max() < sim_threshold:
                            cache = np.vstack([cache, X[c]])
                            updates_df = \
                                input_sents.reset_index(drop=True).iloc[c]
                            updates_df["query id"] = event.query_num
                            updates_df["system timestamp"] = current_time
                            summary_data.append(updates_df[[
                                "query id", "stream id", "sent id",
                                "system timestamp", "sent text"
                            ]].to_frame().T)

            prev_time = current_time

    df = pd.DataFrame(K_data, columns=["min", "max", "median"])
    print df
    print df.mean()
    print df.std()
    print df.max()
    df = pd.concat(summary_data)
    df["conf"] = .5
    df["team id"] = "AP"
    df["run id"] = "sim{}_bs{}".format(sim_threshold, bucket_size)
    print df
    of = os.path.join(
        output_dir,
        "ap." + "sim{}_bs{}.tsv".format(sim_threshold, bucket_size))
    cols = [
        "query id", "team id", "run id", "stream id", "sent id",
        "system timestamp", "conf"
    ]
    df[cols].to_csv(of, sep="\t", header=False, index=False)
예제 #8
0
def main(output_dir, sim_threshold, bucket_size, pref_offset): 
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)


    dev_qids = set([19, 23, 27, 34, 35])

    summary_data = []

    K_data = []
    for event in cuttsum.events.get_events():
        if event.query_num not in dev_qids: continue
        

        print event

        semsim = event2semsim(event)
        istream = get_input_stream(event, False, extractor="goose", 
            thresh=.8, delay=None, topk=20)
        prev_time = 0
        cache = None

        clusters = []

        max_h = len(event.list_event_hours()) - 1


        for h, hour in enumerate(event.list_event_hours()):
            if h % bucket_size != 0 and h != max_h:
                continue
            
            current_time = epoch(hour)
            input_sents = istream[
                (istream["timestamp"] < current_time) & \
                (istream["timestamp"] >= prev_time)]
            len_select = input_sents["lemmas stopped"].apply(len) > 10
            input_sents = input_sents[len_select]

            if len(input_sents) <= 1: continue

            stems = input_sents["stems"].apply(lambda x: ' '.join(x)).tolist()
            X = semsim.transform(stems)
            probs = input_sents["probs"]
            p = probs.values
            K = -(1 - cosine_similarity(X))   
            K_ma = np.ma.masked_array(K, np.eye(K.shape[0]))
            Kmin = np.ma.min(K_ma)
            Kmax = np.ma.max(K_ma)
            median = np.ma.median(K_ma)[0]
            pref = np.minimum(p + median, -.05) 
            print "SYS TIME:", hour, "# SENTS:", K.shape[0], 
            print "min/median/max pref: {}/{}/{}".format(
                    pref.min(), np.median(pref), pref.max())

            #K_data.append({"min": Kmin, "max": Kmax, "median": median})
            K_data.append({"min": (pref).min(), "max": (pref).max(), 
                           "median": np.median((pref))})

            #print K
    #        continue
    #
            ap = AffinityPropagation(
                    preference=pref-pref_offset, affinity="precomputed",
                    verbose=True, max_iter=1000)
            ap.fit(K)
    #        ##print input_sents["pretty text"]
    #       
            labels = ap.labels_ 
            if ap.cluster_centers_indices_ != None:
                for c in ap.cluster_centers_indices_:
                    if cache == None:
                        cache = X[c]
                        updates_df = \
                            input_sents.reset_index(drop=True).iloc[c]
                        updates_df["query id"] = event.query_num
                        updates_df["system timestamp"] = current_time
                        summary_data.append(
                            updates_df[
                                ["query id", "stream id", "sent id", 
                                 "system timestamp", "sent text"]
                            ].to_frame().T
                        )                       

                    else:
                        Ksum = cosine_similarity(cache, X[c])
                        #print "MAX SIM", Ksum.max()
                        #print input_sents.reset_index(drop=True).iloc[c]["sent text"]

                        if Ksum.max() < sim_threshold:

                            cache = np.vstack([cache, X[c]])
                            updates_df = \
                                input_sents.reset_index(drop=True).iloc[c]
                            updates_df["query id"] = event.query_num
                            updates_df["system timestamp"] = current_time
                            summary_data.append(
                                updates_df[
                                    ["query id", "stream id", "sent id", 
                                     "system timestamp", "sent text"]
                                ].to_frame().T
                            )

    #
    #        for l, i in enumerate(af.cluster_centers_indices_):
    #            support = np.sum(labels == l)
    #            center = input_sents.iloc[i][["update id", "sent text", "pretty text", "stems", "nuggets"]]
    #            center = center.to_dict()
    #            center["support"] = support
    #            center["timestamp"] = current_time
    #            clusters.append(center)
    #            


            prev_time = current_time
    #    df = pd.DataFrame(clusters, columns=["update id", "timestamp", "support", "sent text", "pretty text", "stems", "nuggets"])
    #
    #    import os
    #    dirname = "clusters"
    #    if not os.path.exists(dirname):
    #        os.makedirs(dirname)
    #
    #    with open(os.path.join(dirname, "{}.tsv".format(event.query_id)), "w") as f:
    #        df.to_csv(f, sep="\t", index=False)
    #
    df = pd.DataFrame(K_data, columns=["min", "max", "median"])
    print df
    print df.mean()
    print df.std()
    print df.max()
    df =  pd.concat(summary_data)
    df["conf"] = .5
    df["team id"] = "APSAL"
    df["run id"] = "sim{}_bs{}_off{}".format(
        sim_threshold, bucket_size, pref_offset)
    print df
    of = os.path.join(output_dir, "apsal" + "sim{}_bs{}_off{}.tsv".format(
                sim_threshold, bucket_size, pref_offset))
    cols = ["query id", "team id", "run id", "stream id", "sent id", 
            "system timestamp", "conf"]
    df[cols].to_csv(of, sep="\t", header=False, index=False) 
예제 #9
0
    def get_classifier(self, event):
        """Build a function that scores sentences against an event's nuggets.

        A model is looked up with ``self.get_best_model`` for every entry of
        the event's model directory; entries for which it returns ``None``
        are ignored.

        Args:
            event: Event whose nugget models should be used.

        Returns:
            ``None`` when the event's model directory does not exist,
            otherwise a function ``classify_nuggets(df)``.
        """
        path = os.path.join(self.dir_, event.fs_name())
        # Check the directory first so nothing is loaded for events without
        # stored models.
        if not os.path.exists(path):
            return None

        semsim = event2semsim(event)

        classifiers = []
        for nugget_id in os.listdir(path):
            model = self.get_best_model(event, nugget_id)
            if model is not None:
                classifiers.append(
                    (nugget_id, model[0], model[1], model[2], model[3]))

        def classify_nuggets(df):
            """Score every row of ``df`` against every usable nugget model.

            Reads the ``"lemmas stopped"`` and ``"stems"`` columns of ``df``.

            Returns:
                A tuple ``(nuggets, conf, nugget_probs)``: ``nuggets`` is a
                list with one (currently always empty) set per row, ``conf``
                is the per-row maximum positive-class probability over all
                models, and ``nugget_probs`` is a list with one dict per row
                mapping the encoded nugget id to its probability.
            """
            sents = [" ".join(lemmas).lower()
                     for lemmas in df["lemmas stopped"].tolist()]
            all_stems = [' '.join(stems) for stems in df["stems"].tolist()]
            sets = [set(sent.split(" ")) for sent in sents]

            # Hard nugget assignment is disabled (no threshold is applied to
            # the probabilities), so these sets are returned empty.
            nuggets = [set() for sent in sents]
            nugget_probs = [dict() for sent in sents]

            # The sentence vectors do not depend on the nugget, so they are
            # computed once, on first use, rather than once per model.
            stem_vecs = None
            all_probs = []
            for nugget_id, vec, clf, nugget_lems, nugget_stems in classifiers:
                if vec is None or clf is None:
                    continue
                if stem_vecs is None:
                    stem_vecs = semsim.transform(all_stems)

                X = vec.transform(sents).todense()
                # Fraction of the nugget's lemmas covered by each sentence.
                x_cov = [
                    len(nugget_lems.intersection(lems))
                    / float(len(nugget_lems))
                    for lems in sets]
                x_cov = np.array(x_cov)[:, np.newaxis]
                K = cosine_similarity(
                    stem_vecs, semsim.transform(nugget_stems))
                X = np.hstack([X, x_cov, K, x_cov * K])
                P = clf.predict_proba(X)

                nugget_key = nugget_id.encode("utf-8")
                for i, sent_probs in enumerate(nugget_probs):
                    sent_probs[nugget_key] = P[i, 1]
                all_probs.append(P[:, 1, np.newaxis])

            # NOTE(review): np.hstack raises ValueError when no model was
            # usable (all_probs is empty) -- confirm callers expect that.
            Pp = np.hstack(all_probs)
            conf = np.max(Pp, axis=1)

            return nuggets, conf, nugget_probs

        return classify_nuggets
예제 #10
0
    def do_job_unit(self, event, corpus, unit, **kwargs):
        stopwords = english_stopwords()

        ### Preprocessing here. ###
        df = cuttsum.judgements.get_merged_dataframe()
        matches = df[df["query id"] == event.query_id]
        matching_update_ids = set(matches["update id"].tolist())
        #nuggets = matches.groupby("nugget id")
        #thrsh_nuggets = all_nuggets.filter(lambda x: len(x) > 10)

        nugget_ids = list(set(matches["nugget id"].tolist()))

        #nugget_ids = list(set(all_nuggets["nugget id"].tolist()))
        nugget_ids.sort()
        nugget_id = nugget_ids[unit]
        with corenlp.Server(port=9876 + event.query_num * 100 + unit, mem="20G", threads=4, max_message_len=524288,
                annotators=["tokenize", "ssplit", "pos", "lemma"], #, "ner"],
                corenlp_props={
                    "pos.maxlen": 50, "ssplit.eolonly": "true"}) as pipeline:

            if event.query_id.startswith("TS13"):
                updates = cuttsum.judgements.get_2013_updates()
            elif event.query_id.startswith("TS14"):
                updates = cuttsum.judgements.get_2014_sampled_updates()
            elif event.query_id.startswith("TS15"):
                updates = cuttsum.judgements.get_2015_sampled_updates()

            updates = updates[updates["query id"] == event.query_id]  
            non_matching_updates = updates[updates["update id"].apply(
                lambda x: x not in matching_update_ids)]
            matching_updates = matches[matches["nugget id"] == nugget_id]
           # if len(matching_updates) == 0:
            #    return 
                #matching_updates = df[df["nugget id"] == nugget_id]

            nugget_text = matching_updates.iloc[0]["nugget text"]
            n_matching = len(matching_updates)
            n_nonmatching = min(n_matching, len(non_matching_updates))
            n_instances = n_matching + n_nonmatching

            semsim = event2semsim(event)
            from nltk.stem.porter import PorterStemmer
            stemmer = PorterStemmer()

            nugget_doc = pipeline.annotate(nugget_text)
            nugget_lems = []
            nugget_stems = []
            for sent in nugget_doc:
                for tok in sent:
                    if unicode(tok).lower() not in stopwords and len(unicode(tok)) < 50:
                        nugget_lems.append(tok.lem.lower())
                    stem = stemmer.stem(unicode(tok).lower())
                    if len(stem) < 50:
                        nugget_stems.append(stem)
            nugget_stems = [u" ".join(nugget_stems)]

            if n_matching <= 10:
                model_dir = self.get_model_dir(event, nugget_id)
                if not os.path.exists(model_dir):
                    os.makedirs(model_dir)
            
                joblib.dump([None, set(nugget_lems), nugget_stems], self.get_vectorizer_path(event, nugget_id), compress=9)
                joblib.dump([], self.get_model_path(event, nugget_id, "gbc"), compress=9)
                return 


            non_matching_updates = non_matching_updates.iloc[
                np.random.permutation(len(non_matching_updates))] 
            non_matching_updates = non_matching_updates.iloc[
                np.arange(n_nonmatching)] 
            #non_matching_updates["text"] = \
            #    non_matching_updates["text"].apply(lambda x: x.lower())

            y = np.zeros(n_instances, dtype="int32")
            y[:n_matching] = 1
            X_string = matching_updates["update text"].tolist()
            X_string += non_matching_updates.head(n_nonmatching)["text"].tolist()
            assert len(X_string) == n_instances

            p = np.random.permutation(n_instances)
            y = y[p]
            X_string = [X_string[i] for i in p]
            print "pipeline start"
            docs = pipeline.annotate_mp(X_string, n_procs=4)
            print "pipeline done"

            lemmas = []
            all_stems = []
            for doc in docs:
                lems = []
                stems = []
                for sent in doc:
                    for tok in sent:
                        if unicode(tok).lower() not in stopwords and len(unicode(tok)) < 50:
                            lems.append(tok.lem.lower())
                        stem = stemmer.stem(unicode(tok).lower())
                        if len(stem) < 50:
                            stems.append(stem)
                #print lems
                lemmas.append(lems)
                all_stems.append(u" ".join(stems))

                        
    # map(
    #                    lambda doc: [str(tok) 
    #                                 for doc in docs
    #                                 for sent in doc
    #                                 for tok in sent

            K = cosine_similarity(
                semsim.transform(all_stems),
                semsim.transform(nugget_stems))

            X_string = [u" ".join(lem) for lem in lemmas]
            vec = TfidfVectorizer(
                input=u"content", stop_words="english", ngram_range=(1,5))
            vec.fit([u" ".join(nugget_lems)] + X_string)
            X = vec.transform(X_string).todense()
            
            nugget_lems = set(nugget_lems)
            x_cov = [len(nugget_lems.intersection(set(lems))) / float(len(nugget_lems))
                     for lems in lemmas]
            x_cov = np.array(x_cov)[:, np.newaxis]
            X = np.hstack([X, x_cov, K, K * x_cov])
            
            
            gbc = GradientBoostingClassifier(
                n_estimators=500, learning_rate=.1,
                max_depth=8, random_state=0, max_features="log2")
            gbc.fit(X, y)
            print "SCORE", gbc.score(X, y)
            model_dir = self.get_model_dir(event, nugget_id)
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)

            joblib.dump([vec, nugget_lems, nugget_stems], 
                self.get_vectorizer_path(event, nugget_id), compress=9)
            joblib.dump(
                gbc, self.get_model_path(event, nugget_id, "gbc"), compress=9)
예제 #11
0
파일: _features.py 프로젝트: kedz/cuttsum
    def do_job_unit(self, event, corpus, unit, **kwargs):
        if unit != 0:
            raise Exception("Job unit {} out of range".format(unit))

        service_configs = kwargs.get("service-configs", {})
        cnlp_configs = service_configs.get("corenlp", {})
        cnlp_port = int(cnlp_configs.get("port", 9999))

        domain_lm_config = service_configs[event2lm_name(event)]
        domain_lm_port = int(domain_lm_config["port"])      
        domain_lm_order = int(domain_lm_config.get("order", 3))	
        gw_lm_config = service_configs["gigaword-lm"]  
        gw_lm_port = int(gw_lm_config["port"])        
        gw_lm_order = int(gw_lm_config.get("order", 3))	


        thresh = kwargs.get("dedupe-sim-threshold", .8)
        extractor = kwargs.get("extractor", "goose")

        res = DedupedArticlesResource()
        dfiter = res.dataframe_iter(
            event, corpus, extractor, include_matches=None, 
            threshold=thresh)

        domain_lm = cuttsum.srilm.Client(domain_lm_port, domain_lm_order, True)
        gw_lm = cuttsum.srilm.Client(gw_lm_port, gw_lm_order, True)
        cnlp_client = cnlp.client.CoreNLPClient(port=cnlp_port)

        def make_query_synsets():
            synonyms = []
            hypernyms = []
            hyponyms = [] 
            print event.type.split(' ')[0]
            for synset in wn.synsets(event.type.split(' ')[0]):
                synonyms.extend(
                    [lemma.name().lower().replace(u'_', u' ').encode(u'utf-8')
                     for lemma in synset.lemmas()])

                hypernyms.extend(
                    [lemma.name().lower().replace(u'_', u' ').encode(u'utf-8')
                     for synset in synset.hypernyms()
                     for lemma in synset.lemmas()])

                hyponyms.extend(
                    [lemma.name().lower().replace(u'_', u' ').encode(u'utf-8')
                     for synset in synset.hyponyms()
                     for lemma in synset.lemmas()])
            print hypernyms
            print hyponyms
            print synonyms
            return set(synonyms), set(hypernyms), set(hyponyms)

        def heal_text(sent_text):
            sent_text = re.sub(
                ur"[A-Z ]+, [A-Z][a-z ]+\( [A-Z]+ \) [-\u2014_]+ ", 
                r"", sent_text)
            sent_text = re.sub(
                ur"^.*?[A-Z ]+, [A-Z][a-z]+ [-\u2014_]+ ", 
                r"", sent_text)
            sent_text = re.sub(
                ur"^.*?[A-Z ]+\([^\)]+\) [-\u2014_]+ ", 
                r"", sent_text)
            sent_text = re.sub(
                ur"^.*?[A-Z]+ +[-\u2014_]+ ", 
                r"", sent_text)
            
            sent_text = re.sub(r"\([^)]+\)", r" ", sent_text)
            sent_text = re.sub(ur"^ *[-\u2014_]+", r"", sent_text)
            sent_text = re.sub(u" ([,.;?!]+)([\"\u201c\u201d'])", r"\1\2", sent_text)
            sent_text = re.sub(r" ([:-]) ", r"\1", sent_text)
            sent_text = re.sub(r"([^\d]\d{1,3}) , (\d\d\d)([^\d]|$)", r"\1,\2\3", sent_text)
            sent_text = re.sub(r"^(\d{1,3}) , (\d\d\d)([^\d]|$)", r"\1,\2\3", sent_text)
            sent_text = re.sub(ur" ('|\u2019) ([a-z]|ll|ve|re)( |$)", r"\1\2 ", sent_text)
            sent_text = re.sub(r" ([',.;?!]+) ", r"\1 ", sent_text)
            sent_text = re.sub(r" ([',.;?!]+)$", r"\1", sent_text)

            sent_text = re.sub(r"(\d\.) (\d)", r"\1\2", sent_text)
            sent_text = re.sub(r"(a|p)\. m\.", r"\1.m.", sent_text)
            sent_text = re.sub(r"U\. (S|N)\.", r"U.\1.", sent_text)

            sent_text = re.sub(
                ur"\u201c ([^\s])", 
                ur"\u201c\1", sent_text)
            sent_text = re.sub(
                ur"([^\s]) \u201d", 
                ur"\1\u201d", sent_text)
            sent_text = re.sub(
                ur"\u2018 ([^\s])", 
                ur"\u2018\1", sent_text)
            sent_text = re.sub(
                ur"([^\s]) \u2019", 
                ur"\1\u2019", sent_text)

            sent_text = re.sub(
                ur"\u00e2", 
                ur"'", sent_text)
            sent_text = re.sub(
                r"^Photo:Reuters|^Photo:AP", 
                r"", sent_text)
            sent_text = sent_text.replace("\n", " ")

            return sent_text.encode("utf-8")

        def get_number_feats(sent):
            feats = []
            for tok in sent:
                if tok.ne == "NUMBER" and tok.nne is not None:
                    for chain in get_dep_chain(tok, sent, 0):
                        feat = [tok.nne] + [elem[1].lem for elem in chain]
                        feats.append(feat)
            return feats
            

        def get_dep_chain(tok, sent, depth):
            chains = []
            if depth > 2:
                return chains
            for p in sent.dep2govs[tok]:
                if p[1].is_noun():
                    for chain in get_dep_chain(p[1], sent, depth + 1):
                        chains.append([p] + chain)
                elif p[1]:
                    chains.append([p])
            return chains
        
        import unicodedata as u
        P=''.join(unichr(i) for i in range(65536) if u.category(unichr(i))[0]=='P')
        P = re.escape(P)
        punc_patt = re.compile("[" + P + "]")

        from collections import defaultdict
        stopwords = english_stopwords()
        mention_counts = defaultdict(int)
        total_mentions = 0

        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()

        synonyms, hypernyms, hyponyms = make_query_synsets()

        path = self.get_path(
            event, corpus, extractor, thresh)
        dirname = os.path.dirname(path)
        if not os.path.exists(dirname): os.makedirs(dirname)

        meta_cols = ["update id", "stream id", "sent id", "timestamp", 
            "pretty text", "tokens", "lemmas", "stems", "pos", "ne", 
            "tokens stopped", "lemmas stopped"]

        basic_cols = ["BASIC length", "BASIC char length", 
            "BASIC doc position", "BASIC all caps ratio", 
            "BASIC upper ratio", "BASIC lower ratio",
            "BASIC punc ratio", "BASIC person ratio", 
            "BASIC location ratio",
            "BASIC organization ratio", "BASIC date ratio", 
            "BASIC time ratio", "BASIC duration ratio",
            "BASIC number ratio", "BASIC ordinal ratio",
            "BASIC percent ratio", "BASIC money ratio", 
            "BASIC set ratio", "BASIC misc ratio"]
       

        lm_cols = ["LM domain lp", "LM domain avg lp",
                   "LM gw lp", "LM gw avg lp"]

        query_cols = [
            "Q_query_sent_cov",
            "Q_sent_query_cov",
            "Q_syn_sent_cov",
            "Q_sent_syn_cov",
            "Q_hyper_sent_cov",
            "Q_sent_hyper_cov",
            "Q_hypo_sent_cov",
            "Q_sent_hypo_cov",
        ]

        sum_cols = [
            "SUM_sbasic_sum",
            "SUM_sbasic_amean",
            "SUM_sbasic_max",
            "SUM_novelty_gmean",
            "SUM_novelty_amean",
            "SUM_novelty_max",
            "SUM_centrality",
            "SUM_pagerank",
            "SUM_sem_novelty_gmean",
            "SUM_sem_novelty_amean",
            "SUM_sem_novelty_max",
            "SUM_sem_centrality",
            "SUM_sem_pagerank",
        ]
    
        stream_cols = [
            "STREAM_sbasic_sum",
            "STREAM_sbasic_amean",
            "STREAM_sbasic_max",
            "STREAM_per_prob_sum",
            "STREAM_per_prob_max",
            "STREAM_per_prob_amean",
            "STREAM_loc_prob_sum",
            "STREAM_loc_prob_max",
            "STREAM_loc_prob_amean",
            "STREAM_org_prob_sum",
            "STREAM_org_prob_max",
            "STREAM_org_prob_amean",
            "STREAM_nt_prob_sum",
            "STREAM_nt_prob_max",
            "STREAM_nt_prob_amean",
        ]


        semsim = event2semsim(event)
 
        all_cols = meta_cols + basic_cols + query_cols + lm_cols + sum_cols + stream_cols
        
        stream_uni_counts = defaultdict(int)
        stream_per_counts = defaultdict(int)
        stream_loc_counts = defaultdict(int)
        stream_org_counts = defaultdict(int)
        stream_nt_counts = defaultdict(int)

        with gzip.open(path, "w") as f:
            f.write("\t".join(all_cols) + "\n")
            for df in dfiter:
                if len(df) == 1: continue
                df = df.head(20)
                
                #df["lm"] = df["sent text"].apply(lambda x: lm.sentence_log_prob(x.encode("utf-8"))[1])
                df["pretty text"] = df["sent text"].apply(heal_text)
                df = df[df["pretty text"].apply(lambda x: len(x.strip())) > 0]
                df = df[df["pretty text"].apply(lambda x: len(x.split(" "))) < 200]
                df = df.reset_index(drop=True)
                if len(df) == 0:
                    print "skipping"
                    continue
                doc_text = "\n".join(df["pretty text"].tolist())
                
                doc = cnlp_client.annotate(doc_text)
                df["tokens"] = map(lambda sent: [str(tok) for tok in sent],
                                   doc)
                df["lemmas"] = map(lambda sent: [tok.lem.encode("utf-8") 
                                                 for tok in sent],
                                   doc)

                df["stems"] = map(lambda sent: 
                    [stemmer.stem(unicode(tok).lower()) for tok in sent], doc)
                df["pos"] = map(lambda sent: [tok.pos for tok in sent],
                                doc)
                
                df["ne"] = map(lambda sent: [tok.ne for tok in sent],
                               doc)
                
                df["tokens stopped"] = map(
                    lambda sent: [str(tok) for tok in sent
                                  if unicode(tok).lower() not in stopwords \
                                      and len(unicode(tok)) < 50],
                    doc)
                df["lemmas stopped"] = map(
                    lambda sent: [tok.lem.lower().encode("utf-8") for tok in sent
                                  if unicode(tok).lower() not in stopwords \
                                      and len(unicode(tok)) < 50],
                    doc)
                
                df["num tuples"] = [get_number_feats(sent) for sent in doc]
                ### Basic Features ###

                df["BASIC length"] = df["lemmas stopped"].apply(len)
                df["BASIC doc position"] = df.index.values + 1
                
                df = df[df["BASIC length"] > 0]
                df = df.reset_index(drop=True)

                df["BASIC char length"] = df["pretty text"].apply(
                    lambda x: len(x.replace(" ", "")))       
        
                df["BASIC upper ratio"] = df["pretty text"].apply(
                    lambda x: len(re.findall("[A-Z]", x))) \
                    / df["BASIC char length"].apply(lambda x: float(max(x, 1)))

                df[ "BASIC lower ratio"] = df["pretty text"].apply(
                    lambda x: len(re.findall("[a-z]", x))) \
                    / df["BASIC char length"].apply(lambda x: float(max(x, 1)))

                df["BASIC punc ratio"] = df["pretty text"].apply(
                    lambda x: len(re.findall(punc_patt, x))) \
                    / df["BASIC char length"].apply(lambda x: float(max(x, 1)))
                df["BASIC all caps ratio"] = df["tokens stopped"].apply(
                    lambda x: np.sum([1 if re.match("^[A-Z]+$", xi) else 0 
                                      for xi in x])) \
                    / df["BASIC length"].apply(float)
               
                df["BASIC person ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "PERSON" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float) 

                df["BASIC location ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "LOCATION" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float) 
 
                df["BASIC organization ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "ORGANIZATION" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float) 
 
                df["BASIC date ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "DATE" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float) 

                df["BASIC time ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "TIME" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float) 

                df["BASIC duration ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "DURATION" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float) 

                df["BASIC number ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "NUMBER" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float) 

                df["BASIC ordinal ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "ORDINAL" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float) 

                df["BASIC percent ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "PERCENT" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float) 

                df["BASIC money ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "MONEY" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float) 

                df["BASIC set ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "SET" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float) 

                df["BASIC misc ratio"] = df["ne"].apply(
                    lambda x: np.sum([1 if xi == "MISC" else 0
                                      for xi in x])) \
                    / df["BASIC length"].apply(float) 

                ### Language Model Features ###                

                dm_probs = df["lemmas"].apply(
                    lambda x: domain_lm.sentence_log_prob(
                        " ".join([xi.decode("utf-8").lower().encode("utf-8") 
                                  for xi in x if len(xi) < 50])))
                dm_log_probs = [lp for lp, avg_lp in dm_probs.tolist()]
                dm_avg_log_probs = [avg_lp for lp, avg_lp in dm_probs.tolist()]
                df["LM domain lp"] = dm_log_probs
                df["LM domain avg lp"] = dm_avg_log_probs
                gw_probs = df["lemmas"].apply(
                    lambda x: gw_lm.sentence_log_prob(
                        " ".join([xi.decode("utf-8").lower().encode("utf-8")
                                  for xi in x if len(xi) < 50])))
                gw_log_probs = [lp for lp, avg_lp in gw_probs.tolist()]
                gw_avg_log_probs = [avg_lp for lp, avg_lp in gw_probs.tolist()]
                df["LM gw lp"] = gw_log_probs
                df["LM gw avg lp"] = gw_avg_log_probs


                ### Query Features ###

                self.compute_query_features(df, 
                    set([q.lower() for q in event.query]),
                    synonyms, hypernyms, hyponyms)


                ### Single Doc Summarization Features ###
                
                counts = []
                doc_counts = defaultdict(int)
                for lemmas in df["lemmas stopped"].tolist():
                    counts_i = {}
                    for lem in lemmas:
                        counts_i[lem.lower()] = counts_i.get(lem.lower(), 0) + 1
                        doc_counts[lem.lower()] += 1
                    doc_counts["__TOTAL__"] += len(lemmas)
                    counts.append(counts_i)
                doc_counts["__TOTAL__"] *= 1.
                doc_uni = {key: val / doc_counts["__TOTAL__"] 
                           for key, val in doc_counts.items() 

                           if key != "__TOTAL__"}

                sum_probs = []
                amean_probs = []
                max_probs = []
                for lemmas in df["lemmas stopped"].tolist():
                    probs = [doc_uni[lem.lower()] for lem in lemmas]
                    sum_probs.append(np.sum(probs))
                    amean_probs.append(np.mean(probs))
                    max_probs.append(np.max(probs))

                df["SUM_sbasic_sum"] = sum_probs
                df["SUM_sbasic_amean"] = amean_probs
                df["SUM_sbasic_max"] = max_probs

                tfidfer = TfidfTransformer()
                vec = DictVectorizer()
                X = vec.fit_transform(counts)
                X = tfidfer.fit_transform(X)
        
                ctrd = X.mean(axis=0)
                K = cosine_similarity(ctrd, X).ravel()
                I = K.argsort()[::-1]
                R = np.array([[i, r + 1] for r, i in enumerate(I)])
                R = R[R[:,0].argsort()]
                df["SUM_centrality"] = R[:,1]

                L = semsim.transform(df["stems"].apply(lambda x: ' '.join(x)).tolist())
                ctrd_l = L.mean(axis=0)
                K_L = cosine_similarity(ctrd_l, L).ravel()                
                I_L = K_L.argsort()[::-1]
                R_L = np.array([[i, r + 1] for r, i in enumerate(I_L)])
                R_L = R_L[R_L[:, 0].argsort()]
                df["SUM_sem_centrality"] = R_L[:,1]

                K = cosine_similarity(X)
                M = np.zeros_like(K)
                M[np.diag_indices(K.shape[0])] = 1
                Km = np.ma.masked_array(K, M)
                D = 1 - Km
        
                novelty_amean = D.mean(axis=1)
                novelty_max = D.max(axis=1)
                novelty_gmean = gmean(D, axis=1)

                df["SUM_novelty_amean"] = novelty_amean
                df["SUM_novelty_max"] = novelty_max
                df["SUM_novelty_gmean"] = novelty_gmean

                K_L = cosine_similarity(L)
                M_L = np.zeros_like(K)
                M_L[np.diag_indices(K_L.shape[0])] = 1
                K_Lm = np.ma.masked_array(K_L, M_L)
                D_L = 1 - K_Lm

                sem_novelty_amean = D_L.mean(axis=1)
                sem_novelty_max = D_L.max(axis=1)
                sem_novelty_gmean = gmean(D_L, axis=1)

                df["SUM_sem_novelty_amean"] = sem_novelty_amean
                df["SUM_sem_novelty_max"] = sem_novelty_max
                df["SUM_sem_novelty_gmean"] = sem_novelty_gmean




                K = (K > 0).astype("int32")
                degrees = K.sum(axis=1) - 1
                edges_x_2 = K.sum() - K.shape[0]
                if edges_x_2 == 0: edges_x_2 = 1
                pr = 1. - degrees / float(edges_x_2)
                df["SUM_pagerank"] = pr

                K_L = (K_L > .2).astype("int32")
                degrees_L = K_L.sum(axis=1) - 1
                edges_x_2_L = K_L.sum() - K_L.shape[0]
                if edges_x_2_L == 0: edges_x_2_L = 1
                pr_L = 1. - degrees_L / float(edges_x_2_L)
                df["SUM_sem_pagerank"] = pr_L


                print df["pretty text"]
               # print df[["SUM_sbasic_sum", "SUM_sbasic_amean", "SUM_sbasic_max"]]
               # print df[
               #     ["SUM_pagerank", "SUM_centrality", "SUM_novelty_gmean", 
               #      "SUM_novelty_amean", "SUM_novelty_max"]]

                ### Stream Features ###
                for key, val in doc_counts.items():
                    stream_uni_counts[key] += val      
                denom = stream_uni_counts["__TOTAL__"]
                sum_probs = []
                amean_probs = []
                max_probs = []
                
                for lemmas in df["lemmas stopped"].tolist():
                    probs = [stream_uni_counts[lem.lower()] / denom for lem in lemmas]
                    sum_probs.append(np.sum(probs))
                    amean_probs.append(np.mean(probs))
                    max_probs.append(np.max(probs))

                df["STREAM_sbasic_sum"] = sum_probs
                df["STREAM_sbasic_amean"] = amean_probs
                df["STREAM_sbasic_max"] = max_probs



                for lemmas, nes in izip(df["lemmas"].tolist(), df["ne"].tolist()):
                    for lem, ne in izip(lemmas, nes):
                        if ne == "PERSON":
                            stream_per_counts[lem.lower()] += 1
                            stream_per_counts["__TOTAL__"] += 1.                                    
                        if ne == "LOCATION":
                            stream_loc_counts[lem.lower()] += 1
                            stream_loc_counts["__TOTAL__"] += 1.                                    
                        if ne == "ORGANIZATION":
                            stream_org_counts[lem.lower()] += 1
                            stream_org_counts["__TOTAL__"] += 1.                                    


                for tuples in df["num tuples"].tolist():

                    for nt in tuples:
                        for item in nt:
                            stream_nt_counts[item.lower()] += 1
                            stream_nt_counts["__TOTAL__"] += 1.

                pdenom = stream_per_counts["__TOTAL__"]                
                ldenom = stream_loc_counts["__TOTAL__"]                
                odenom = stream_org_counts["__TOTAL__"]                
                ntdenom = stream_nt_counts["__TOTAL__"]                
                sum_per_probs = []
                amean_per_probs = []
                max_per_probs = []
                sum_loc_probs = []
                amean_loc_probs = []
                max_loc_probs = []
                sum_org_probs = []
                amean_org_probs = []
                max_org_probs = []
                sum_nt_probs = []
                amean_nt_probs = []
                max_nt_probs = []

                            
                for tuples in df["num tuples"].tolist():
                    if ntdenom > 0:
                        nt_probs = [stream_nt_counts[item.lower()] / ntdenom
                                    for nt in tuples
                                    for item in nt]    
                    else:
                        nt_probs = []

                    if len(nt_probs) > 0:
                        sum_nt_probs.append(np.sum(nt_probs))
                        amean_nt_probs.append(np.mean(nt_probs))
                        max_nt_probs.append(np.max(nt_probs))
                    else:
                        sum_nt_probs.append(0)
                        amean_nt_probs.append(0)
                        max_nt_probs.append(0)


                for lemmas, nes in izip(df["lemmas"].tolist(), df["ne"].tolist()):

                    

                    if pdenom > 0:
                        per_probs = [stream_per_counts[lem.lower()] / pdenom
                                     for lem, ne in izip(lemmas, nes)
                                     if ne == "PERSON"]
                    else:
                        per_probs = []

                    if len(per_probs) > 0:
                        sum_per_probs.append(np.sum(per_probs))
                        amean_per_probs.append(np.mean(per_probs))
                        max_per_probs.append(np.max(per_probs))
                    else:
                        sum_per_probs.append(0)
                        amean_per_probs.append(0)
                        max_per_probs.append(0)

                    if ldenom > 0:
                        loc_probs = [stream_loc_counts[lem.lower()] / ldenom
                                     for lem, ne in izip(lemmas, nes)
                                     if ne == "LOCATION"]
                    else:
                        loc_probs = []

                    if len(loc_probs) > 0 :
                        sum_loc_probs.append(np.sum(loc_probs))
                        amean_loc_probs.append(np.mean(loc_probs))
                        max_loc_probs.append(np.max(loc_probs))
                    else:
                        sum_loc_probs.append(0)
                        amean_loc_probs.append(0)
                        max_loc_probs.append(0)

                    if odenom > 0:
                        org_probs = [stream_org_counts[lem.lower()] / odenom
                                     for lem, ne in izip(lemmas, nes)
                                     if ne == "ORGANIZATION"]
                    else:
                        org_probs = []

                    if len(org_probs) > 0 :
                        sum_org_probs.append(np.sum(org_probs))
                        amean_org_probs.append(np.mean(org_probs))
                        max_org_probs.append(np.max(org_probs))
                    else:
                        sum_org_probs.append(0)
                        amean_org_probs.append(0)
                        max_org_probs.append(0)



                df["STREAM_per_prob_sum"] = sum_per_probs
                df["STREAM_per_prob_max"] = max_per_probs
                df["STREAM_per_prob_amean"] = amean_per_probs

                df["STREAM_loc_prob_sum"] = sum_loc_probs
                df["STREAM_loc_prob_max"] = max_loc_probs
                df["STREAM_loc_prob_amean"] = amean_loc_probs

                df["STREAM_org_prob_sum"] = sum_org_probs
                df["STREAM_org_prob_max"] = max_org_probs
                df["STREAM_org_prob_amean"] = amean_org_probs

                df["STREAM_nt_prob_sum"] = sum_nt_probs
                df["STREAM_nt_prob_max"] = max_nt_probs
                df["STREAM_nt_prob_amean"] = amean_nt_probs

                #print df[["STREAM_sbasic_sum", "STREAM_sbasic_amean", "STREAM_sbasic_max"]]
                #print df[["STREAM_per_prob_sum", "STREAM_per_prob_amean", "STREAM_per_prob_max"]]  
                #print df[["STREAM_loc_prob_sum", "STREAM_loc_prob_amean", "STREAM_loc_prob_max"]]  
                #print df[["STREAM_nt_prob_sum", "STREAM_nt_prob_amean", "STREAM_nt_prob_max"]]  


                ### Write dataframe to file ###
                df[all_cols].to_csv(f, index=False, header=False, sep="\t")