Example 1
def main(output_path, norm, stop):

    dirname, fname = os.path.split(output_path)
    if dirname != "" and not os.path.exists(dirname):
        os.makedirs(dirname)

    output_path = os.path.join(
        dirname, "{}.norm-{}{}.spl.gz".format(fname, norm,
                                              ".stop" if stop else ""))

    print "Domain: {}".format(fname)
    print "Output Path: {}".format(output_path)
    events = [
        event for event in cuttsum.events.get_events()
        if event.type in dom2type[fname] and event.query_num < 26
    ]

    ne = False
    #if ne is True:
    #     annotators = ["tokenize", "ssplit", "pos", "lemma", "ner"]
    if norm == "lemma":
        annotators = ["tokenize", "ssplit", "pos", "lemma"]
    else:
        annotators = ["tokenize", "ssplit"]



    with cnlp.Server(annotators=annotators, mem="6G",
            port=2001, max_message_len=1000000) as client, \
            gzip.open(output_path, "w") as f:

        query_ids = set([event.query_id for event in events])
        updates = matches_df[matches_df["query id"].apply(
            lambda x: x in query_ids)]
        texts = updates.drop_duplicates(
            subset='update id')["update text"].apply(heal_text).tolist()

        central_per = None
        central_loc = None
        central_org = None

        print "processing update text"
        docs = [client.annotate(text) for text in texts]
        for doc in docs[:10]:
            print doc
        print "tokenizing"
        X_upd_txt = tokenize(docs,
                             norm,
                             stop,
                             ne,
                             central_per=central_per,
                             central_loc=central_loc,
                             central_org=central_org)
        print "writing"
        for line in X_upd_txt:
            f.write(line + "\n")
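This function relies on module-level names the snippet does not define (dom2type, matches_df, heal_text, tokenize); Example 5 below shows part of that preamble, including where matches_df comes from. A hypothetical invocation, with argument values chosen only to match the path read at the end of Example 5:

# Illustrative call; the real CLI wrapper is not shown in the source.
main("wp-lm-preproc/accidents", norm="lemma", stop=True)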
Example 2
def main2():
    events = cuttsum.events.get_events()
    df = cuttsum.judgements.get_merged_dataframe()
    stopwords = english_stopwords()

    with corenlp.Server(
            port=9876,
            mem="20G",
            threads=8,
            max_message_len=524288,
            annotators=["tokenize", "ssplit", "pos", "lemma"],  #, "ner"],
            corenlp_props={
                "pos.maxlen": 50,
                "ssplit.eolonly": "true"
            }) as pipeline:

        for event in events[8:9]:

            matches = df[df["query id"] == event.query_id]

            matching_update_ids = set(matches["update id"].tolist())
            all_nuggets = matches.groupby("nugget id")
            # keep only nuggets with at most 10 matching updates
            thrsh_nuggets = all_nuggets.filter(lambda x: len(x) <= 10)

            nugget_ids = list(set(thrsh_nuggets["nugget id"].tolist()))
            nugget_ids.sort()

            for nugget_id in nugget_ids:

                if event.query_id.startswith("TS13"):
                    updates = cuttsum.judgements.get_2013_updates()
                elif event.query_id.startswith("TS14"):
                    updates = cuttsum.judgements.get_2014_sampled_updates()

                updates = updates[updates["query id"] == event.query_id]
                non_matching_updates = updates[updates["update id"].apply(
                    lambda x: x not in matching_update_ids)]
                matching_updates = matches[matches["nugget id"] == nugget_id]

                nugget_text = matching_updates.iloc[0]["nugget text"]
                print nugget_text
                n_matching = len(matching_updates)
                n_nonmatching = len(non_matching_updates)
                n_instances = n_matching + n_nonmatching

                #matching_updates["update text"] = \
                #    matching_updates["update text"].apply(lambda x: x.lower())

                non_matching_updates = non_matching_updates.iloc[
                    np.random.permutation(len(non_matching_updates))]
                non_matching_updates = non_matching_updates.iloc[np.arange(
                    n_nonmatching)]
                #non_matching_updates["text"] = \
                #    non_matching_updates["text"].apply(lambda x: x.lower())

                y = np.zeros(n_instances, dtype="int32")
                y[:n_matching] = 1
                X_string = matching_updates["update text"].tolist()
                X_string += non_matching_updates.head(
                    n_nonmatching)["text"].tolist()
                assert len(X_string) == n_instances

                p = np.random.permutation(n_instances)
                y = y[p]
                X_string = [X_string[i] for i in p]
                print "pipeline start"
                docs = pipeline.annotate_mp(X_string, n_procs=8)
                nugget_doc = pipeline.annotate(nugget_text)
                print "pipeline done"

                lemmas = []
                for doc in docs:
                    lems = []
                    for sent in doc:
                        for tok in sent:
                            if unicode(tok).lower() not in stopwords and len(
                                    unicode(tok)) < 50:
                                lems.append(tok.lem.lower())
                    #print lems
                    lemmas.append(set(lems))

                nugget_lems = []
                for sent in nugget_doc:
                    for tok in sent:
                        if unicode(tok).lower() not in stopwords and len(
                                unicode(tok)) < 50:
                            nugget_lems.append(tok.lem.lower())

                nugget_lems = set(nugget_lems)
                n_lems = float(len(nugget_lems))
                if n_lems == 1:
                    print
                    continue
                for i in xrange(n_instances):
                    if len(lemmas[i]) > 50: continue
                    cov = len(nugget_lems.intersection(lemmas[i])) / n_lems
                    if cov > .75:
                        if isinstance(nugget_text, str):
                            print nugget_text
                        else:
                            print nugget_text.encode("utf-8")

                        if isinstance(X_string[i], str):
                            print y[i], X_string[i]
                        else:
                            print y[i], X_string[i].encode("utf-8")
                print
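The match criterion above is a lemma-overlap coverage score: the fraction of the nugget's stopword-filtered lemmas that also occur in an update. A minimal, self-contained illustration with toy token sets (the words are made up, not taken from the corpus):

nugget_lems = set(["bridge", "collapse", "injure", "worker"])
update_lems = set(["bridge", "collapse", "rescue", "crew"])
cov = len(nugget_lems.intersection(update_lems)) / float(len(nugget_lems))
print cov  # 0.5; only pairs with cov > .75 are printed in the loop above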
Example 3
def main():
    events = cuttsum.events.get_events()
    df = cuttsum.judgements.get_merged_dataframe()
    stopwords = english_stopwords()

    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()
    import os
    from sklearn.externals import joblib
    wtmf_models = {}
    wtmf_models["accidents"] = joblib.load(
        os.getenv("TREC_DATA") + "/semsim/accidents.norm-stem.lam20.000.pkl")
    wtmf_models["social-unrest"] = joblib.load(
        os.getenv("TREC_DATA") +
        "/semsim/social-unrest.norm-stem.lam1.000.pkl")
    wtmf_models["terrorism"] = joblib.load(
        os.getenv("TREC_DATA") + "/semsim/terrorism.norm-stem.lam10.000.pkl")
    wtmf_models["natural-disasters"] = joblib.load(
        os.getenv("TREC_DATA") +
        "/semsim/natural-disasters.norm-stem.lam20.000.pkl")

    all_acc = []
    all_aug_acc = []

    with corenlp.Server(
            port=9876,
            mem="20G",
            threads=8,
            max_message_len=524288,
            annotators=["tokenize", "ssplit", "pos", "lemma"],  #, "ner"],
            corenlp_props={
                "pos.maxlen": 50,
                "ssplit.eolonly": "true"
            }) as pipeline:

        for event in events:
            if event.query_num == 7: continue
            if event.query_num > 25: continue

            if event.type in dom2type["natural-disasters"]:
                wtmf_vec = wtmf_models["natural-disasters"]

            if event.type in dom2type["accidents"]:
                wtmf_vec = wtmf_models["accidents"]

            if event.type in dom2type["social-unrest"]:
                wtmf_vec = wtmf_models["social-unrest"]

            if event.type in dom2type["terrorism"]:
                wtmf_vec = wtmf_models["terrorism"]

            matches = df[df["query id"] == event.query_id]

            matching_update_ids = set(matches["update id"].tolist())
            all_nuggets = matches.groupby("nugget id")
            # keep only nuggets with more than 10 matching updates
            thrsh_nuggets = all_nuggets.filter(lambda x: len(x) > 10)

            nugget_ids = list(set(thrsh_nuggets["nugget id"].tolist()))
            #nugget_ids.sort()

            for num_nug, nugget_id in enumerate(nugget_ids):
                if event.query_id.startswith("TS13"):
                    updates = cuttsum.judgements.get_2013_updates()
                elif event.query_id.startswith("TS14"):
                    updates = cuttsum.judgements.get_2014_sampled_updates()

                updates = updates[updates["query id"] == event.query_id]
                non_matching_updates = updates[updates["update id"].apply(
                    lambda x: x not in matching_update_ids)]
                matching_updates = matches[matches["nugget id"] == nugget_id]

                nugget_text = matching_updates.iloc[0]["nugget text"]
                n_matching = len(matching_updates)
                n_nonmatching = min(n_matching, len(non_matching_updates))
                n_instances = n_matching + n_nonmatching

                #matching_updates["update text"] = \
                #    matching_updates["update text"].apply(lambda x: x.lower())

                non_matching_updates = non_matching_updates.iloc[
                    np.random.permutation(len(non_matching_updates))]
                non_matching_updates = non_matching_updates.iloc[np.arange(
                    n_nonmatching)]
                #non_matching_updates["text"] = \
                #    non_matching_updates["text"].apply(lambda x: x.lower())

                y = np.zeros(n_instances, dtype="int32")
                y[:n_matching] = 1
                X_string = matching_updates["update text"].tolist()
                X_string += non_matching_updates.head(
                    n_nonmatching)["text"].tolist()
                assert len(X_string) == n_instances

                p = np.random.permutation(n_instances)
                y = y[p]
                X_string = [X_string[i] for i in p]
                print "pipeline start"
                docs = pipeline.annotate_mp(X_string, n_procs=8)
                nugget_doc = pipeline.annotate(nugget_text)
                print "pipeline done"

                lemmas = []
                all_stems = []

                for doc in docs:
                    lems = []
                    stems = []
                    for sent in doc:
                        for tok in sent:
                            if unicode(tok).lower() not in stopwords and len(
                                    unicode(tok)) < 50:
                                lems.append(tok.lem.lower())
                            stems.append(stemmer.stem(unicode(tok).lower()))
                    #print lems
                    lemmas.append(lems)
                    all_stems.append(u" ".join(stems))

                nugget_lems = []
                nugget_stems = []
                for sent in nugget_doc:
                    for tok in sent:
                        if unicode(tok).lower() not in stopwords and len(
                                unicode(tok)) < 50:
                            nugget_lems.append(tok.lem.lower())
                        nugget_stems.append(stemmer.stem(unicode(tok).lower()))
                nugget_stems = [u" ".join(nugget_stems)]
                # map(
                #                    lambda doc: [str(tok)
                #                                 for doc in docs
                #                                 for sent in doc
                #                                 for tok in sent

                X_string = [u" ".join(lem) for lem in lemmas]
                vec = TfidfVectorizer(input=u"content",
                                      stop_words="english",
                                      ngram_range=(1, 5))
                vec.fit([u" ".join(nugget_lems)] + X_string)
                X = vec.transform(X_string).todense()

                nugget_lems = set(nugget_lems)
                x_cov = [
                    len(nugget_lems.intersection(set(lems))) /
                    float(len(nugget_lems)) for lems in lemmas
                ]
                x_cov = np.array(x_cov)[:, np.newaxis]
                X = np.hstack([X, x_cov])
                #print X[:, -1]
                #X_nug = vec.transform([u" ".join(nugget_lems)]).todense()

                from sklearn.cross_validation import StratifiedKFold
                from sklearn.metrics import classification_report
                from sklearn.ensemble import GradientBoostingClassifier

                gbc = GradientBoostingClassifier(n_estimators=500,
                                                 learning_rate=.1,
                                                 max_depth=8,
                                                 random_state=0,
                                                 max_features="log2")

                #scores = cross_validation.cross_val_score(gbc, X, y, cv=10)
                #print scores.mean()

                K = cosine_similarity(wtmf_vec.transform(all_stems),
                                      wtmf_vec.transform(nugget_stems))

                X_aug = np.hstack([X, K, K * x_cov])

                scores = []
                aug_scores = []
                print event.fs_name(), nugget_text
                for train_index, test_index in StratifiedKFold(y, n_folds=10):
                    X_train = X[train_index]
                    y_train = y[train_index]
                    X_test = X[test_index]
                    y_test = y[test_index]
                    gbc.fit(X_train, y_train)
                    score = gbc.score(X_test, y_test)
                    X_aug_train = X_aug[train_index]
                    y_train = y[train_index]
                    X_aug_test = X_aug[test_index]
                    y_test = y[test_index]
                    gbc.fit(X_aug_train, y_train)
                    score_aug = gbc.score(X_aug_test, y_test)

                    print score, score_aug
                    scores.append(score)
                    aug_scores.append(score_aug)
                print "mean", np.mean(scores), np.mean(aug_scores)
                all_aug_acc.append(np.mean(aug_scores))
                all_acc.append(np.mean(scores))

                print classification_report(y_test, gbc.predict(X_aug_test))
                y_pred = gbc.predict(X_aug)
                for i, c in enumerate(y_pred):
                    if c == 0 and y[i] == 1:
                        print nugget_text  #.encode("utf-8")
                        print X_string[i]  #.encode("utf-8")

                print
                print "False positives"
                for i, c in enumerate(y_pred):
                    if c == 1 and y[i] == 0:
                        print nugget_text  #.encode("utf-8")
                        print X_string[i]  #.encode("utf-8")

    #        model_dir = self.get_model_dir(event, nugget_id)
    #        if not os.path.exists(model_dir):
    #            os.makedirs(model_dir)

    #       joblib.dump(vec, self.get_vectorizer_path(event, nugget_id), compress=9)

    ### Classifier shootout here. ###
    #prob_thresh = .5
    print "Macro avg acc", np.mean(all_acc), np.mean(all_aug_acc)
Example 4
def main(input_path, output_path, norm, stop, ne, lam, port):
    dirname, domain = os.path.split(input_path)
    input_path = os.path.join(
        dirname,
        "{}.norm-{}{}{}.lam{:0.3f}.pkl".format(domain, norm,
                                               ".stop" if stop else "",
                                               ".ne" if ne else "", lam))
    print "Domain: {}".format(domain)
    print "Model Path: {}".format(input_path)
    events = [
        event for event in cuttsum.events.get_events()
        if event.type in dom2type[domain] and event.query_num < 26
        and event.query_num != 7
    ]

    if ne is True:
        annotators = ["tokenize", "ssplit", "pos", "lemma", "ner"]
    elif norm == "lemma":
        annotators = ["tokenize", "ssplit", "pos", "lemma"]
    else:
        annotators = ["tokenize", "ssplit"]

    results = []
    vec = joblib.load(input_path)

    modelname = "{}.norm_{}.stop_{}.ne_{}.lam_{}".format(
        domain, norm, stop, ne, lam)

    with cnlp.Server(annotators=annotators,
                     mem="6G",
                     port=port,
                     max_message_len=1000000) as client:

        for event in events:
            print event
            event_nuggets = nuggets.loc[nuggets["query id"] == event.query_id]
            print "processing nugget text"
            nugget_docs = [
                client.annotate(text)
                for text in event_nuggets["text"].tolist()
            ]
            #for doc in nugget_docs:
            #    print doc
            #print

            if ne:
                central_per, central_loc, central_org = find_central_nes(
                    nugget_docs)
            else:
                central_per = None
                central_loc = None
                central_org = None

            X_nug_txt = tokenize(nugget_docs,
                                 norm,
                                 stop,
                                 ne,
                                 central_per=central_per,
                                 central_loc=central_loc,
                                 central_org=central_org)
            nuggets.loc[nuggets["query id"] == event.query_id, "X"] = X_nug_txt
            event_nuggets = nuggets[nuggets["query id"] == event.query_id]
            event_nuggets = event_nuggets[event_nuggets["X"].apply(
                lambda x: len(x.split(" ")) < 50 and len(x.split(" ")) > 0)]
            X_nug_txt = event_nuggets["X"].tolist()
            #for txt in X_nug_txt:
            #    print txt
            #print
            print "transforming nugget text"
            X_nug = vec.transform(X_nug_txt)
            assert X_nug.shape[0] == len(event_nuggets)

            print "getting updates"
            updates.loc[updates["query id"] == event.query_id, "text"] = \
                updates.loc[updates["query id"] == event.query_id, "text"].apply(heal_text)

            event_updates = updates[(updates["query id"] == event.query_id)
                                    & (updates["text"].apply(len) < 1000)]
            print "processing update text"
            docs = [
                client.annotate(text)
                for text in event_updates["text"].tolist()
            ]
            X_upd_txt = tokenize(docs,
                                 norm,
                                 stop,
                                 ne,
                                 central_per=central_per,
                                 central_loc=central_loc,
                                 central_org=central_org)
            print "transforming update text"
            X_upd = vec.transform(X_upd_txt)

            for i, (index, nugget) in enumerate(event_nuggets.iterrows()):

                boolean = (matches_df["query id"] == event.query_id) & (
                    matches_df["nugget id"] == nugget["nugget id"])
                match_ids = set(matches_df.loc[boolean, "update id"].tolist())
                if len(match_ids) == 0: continue

                #print index, nugget["nugget id"], nugget["text"]
                #print X_nug[i]
                if (X_nug[i] == 0).all(): continue

                n_matches = 0
                K = cosine_similarity(X_nug[i], X_upd)
                for j in K.ravel().argsort()[::-1][:100]:

                    #print K[0,j],
                    #print event_updates.iloc[j]["text"]
                    if event_updates.iloc[j]["update id"] in match_ids:
                        n_matches += 1

                #print
                P100 = n_matches / 100.
                optP100 = min(1., len(match_ids) / 100.)
                nP100 = P100 / optP100
                results.append({
                    "model": modelname,
                    "nugget id": nugget["nugget id"],
                    "P@100": P100,
                    "opt P@100": optP100,
                    "normP@100": nP100
                })
            df = pd.DataFrame(results)
            print df
            print df["normP@100"].mean()
            df["model"] = modelname
        return results
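The per-nugget metric is precision among the 100 updates most similar to the nugget, normalized by the best precision achievable given how many gold matches exist. A worked example with made-up counts:

n_matches = 25                   # gold matches found in the top-100 ranked updates
P100 = n_matches / 100.          # 0.25
optP100 = min(1., 40 / 100.)     # with only 40 gold matches, a perfect ranking tops out at 0.40
nP100 = P100 / optP100           # 0.625, the normP@100 value averaged over nuggets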
Example 5
import wtmf
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.externals import joblib
import corenlp as cnlp
import cuttsum.judgements
import gzip

matches_df = cuttsum.judgements.get_merged_dataframe()
nuggets_df = cuttsum.judgements.get_nuggets()

annotators = ["tokenize", "ssplit"]
with cnlp.Server(annotators=annotators, mem="6G",
        max_message_len=1000000) as client:

 

    for event, event_nuggets in nuggets_df.groupby('query id'):
        nugget_docs = []
        for text in event_nuggets["text"].tolist():
            doc = client.annotate(text)
            print doc
            nugget_docs.append(doc)  # append the annotated document, not the list itself
#    for nugget in nuggets_df.iterrows():
#    print nugget

exit()

with gzip.open("wp-lm-preproc/accidents.norm-lemma.stop.spl.gz", "r") as f:
    X = f.readlines()
Example 6
    def do_job_unit(self, event, corpus, unit, **kwargs):
        stopwords = english_stopwords()

        ### Preprocessing here. ###
        df = cuttsum.judgements.get_merged_dataframe()
        matches = df[df["query id"] == event.query_id]
        matching_update_ids = set(matches["update id"].tolist())
        #nuggets = matches.groupby("nugget id")
        #thrsh_nuggets = all_nuggets.filter(lambda x: len(x) > 10)

        nugget_ids = list(set(matches["nugget id"].tolist()))

        #nugget_ids = list(set(all_nuggets["nugget id"].tolist()))
        nugget_ids.sort()
        nugget_id = nugget_ids[unit]
        with corenlp.Server(
                port=9876 + event.query_num * 100 + unit,
                mem="20G",
                threads=4,
                max_message_len=524288,
                annotators=["tokenize", "ssplit", "pos", "lemma"],  #, "ner"],
                corenlp_props={
                    "pos.maxlen": 50,
                    "ssplit.eolonly": "true"
                }) as pipeline:

            if event.query_id.startswith("TS13"):
                updates = cuttsum.judgements.get_2013_updates()
            elif event.query_id.startswith("TS14"):
                updates = cuttsum.judgements.get_2014_sampled_updates()
            elif event.query_id.startswith("TS15"):
                updates = cuttsum.judgements.get_2015_sampled_updates()

            updates = updates[updates["query id"] == event.query_id]  
            non_matching_updates = updates[updates["update id"].apply(
                lambda x: x not in matching_update_ids)]
            matching_updates = matches[matches["nugget id"] == nugget_id]
            # if len(matching_updates) == 0:
            #     return
            #     matching_updates = df[df["nugget id"] == nugget_id]

            nugget_text = matching_updates.iloc[0]["nugget text"]
            n_matching = len(matching_updates)
            n_nonmatching = min(n_matching, len(non_matching_updates))
            n_instances = n_matching + n_nonmatching

            semsim = event2semsim(event)
            from nltk.stem.porter import PorterStemmer
            stemmer = PorterStemmer()

            nugget_doc = pipeline.annotate(nugget_text)
            nugget_lems = []
            nugget_stems = []
            for sent in nugget_doc:
                for tok in sent:
                    if unicode(tok).lower() not in stopwords and len(unicode(tok)) < 50:
                        nugget_lems.append(tok.lem.lower())
                    stem = stemmer.stem(unicode(tok).lower())
                    if len(stem) < 50:
                        nugget_stems.append(stem)
            nugget_stems = [u" ".join(nugget_stems)]

            if n_matching <= 10:
                model_dir = self.get_model_dir(event, nugget_id)
                if not os.path.exists(model_dir):
                    os.makedirs(model_dir)
            
                joblib.dump([None, set(nugget_lems), nugget_stems], self.get_vectorizer_path(event, nugget_id), compress=9)
                joblib.dump([], self.get_model_path(event, nugget_id, "gbc"), compress=9)
                return 


            non_matching_updates = non_matching_updates.iloc[
                np.random.permutation(len(non_matching_updates))] 
            non_matching_updates = non_matching_updates.iloc[
                np.arange(n_nonmatching)] 
            #non_matching_updates["text"] = \
            #    non_matching_updates["text"].apply(lambda x: x.lower())

            y = np.zeros(n_instances, dtype="int32")
            y[:n_matching] = 1
            X_string = matching_updates["update text"].tolist()
            X_string += non_matching_updates.head(n_nonmatching)["text"].tolist()
            assert len(X_string) == n_instances

            p = np.random.permutation(n_instances)
            y = y[p]
            X_string = [X_string[i] for i in p]
            print "pipeline start"
            docs = pipeline.annotate_mp(X_string, n_procs=4)
            print "pipeline done"

            lemmas = []
            all_stems = []
            for doc in docs:
                lems = []
                stems = []
                for sent in doc:
                    for tok in sent:
                        if unicode(tok).lower() not in stopwords and len(unicode(tok)) < 50:
                            lems.append(tok.lem.lower())
                        stem = stemmer.stem(unicode(tok).lower())
                        if len(stem) < 50:
                            stems.append(stem)
                #print lems
                lemmas.append(lems)
                all_stems.append(u" ".join(stems))

                        
    # map(
    #                    lambda doc: [str(tok) 
    #                                 for doc in docs
    #                                 for sent in doc
    #                                 for tok in sent

            K = cosine_similarity(
                semsim.transform(all_stems),
                semsim.transform(nugget_stems))

            X_string = [u" ".join(lem) for lem in lemmas]
            vec = TfidfVectorizer(
                input=u"content", stop_words="english", ngram_range=(1,5))
            vec.fit([u" ".join(nugget_lems)] + X_string)
            X = vec.transform(X_string).todense()
            
            nugget_lems = set(nugget_lems)
            x_cov = [len(nugget_lems.intersection(set(lems))) / float(len(nugget_lems))
                     for lems in lemmas]
            x_cov = np.array(x_cov)[:, np.newaxis]
            X = np.hstack([X, x_cov, K, K * x_cov])
            
            
            gbc = GradientBoostingClassifier(
                n_estimators=500, learning_rate=.1,
                max_depth=8, random_state=0, max_features="log2")
            gbc.fit(X, y)
            print "SCORE", gbc.score(X, y)
            model_dir = self.get_model_dir(event, nugget_id)
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)

            joblib.dump([vec, nugget_lems, nugget_stems], 
                self.get_vectorizer_path(event, nugget_id), compress=9)
            joblib.dump(
                gbc, self.get_model_path(event, nugget_id, "gbc"), compress=9)
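The method persists two artifacts per nugget: the fitted vectorizer together with the nugget's lemma set and stemmed text, and the trained classifier. A hedged sketch of how a consumer might reload them, assuming the same path helpers and the dump order used above (the downstream code is not shown in the source):

# Assumed re-use of the dumped artifacts.
vec, nugget_lems, nugget_stems = joblib.load(self.get_vectorizer_path(event, nugget_id))
gbc = joblib.load(self.get_model_path(event, nugget_id, "gbc"))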
Example 7
def main(input_dir, output_path, norm, stop, ne, port):
    dirname, fname = os.path.split(output_path)
    if dirname != "" and not os.path.exists(dirname):
        os.makedirs(dirname)

    output_path = os.path.join(
        dirname,
        "{}.norm-{}{}{}.spl.gz".format(fname, norm, ".stop" if stop else "",
                                       ".ne" if ne else ""))
    print "Writing spl file to {} ...".format(output_path)

    if not os.path.exists(input_dir):
        raise Exception("{} does not exist!".format(input_dir))
    paths = [os.path.join(input_dir, fname) for fname in os.listdir(input_dir)]

    texts = []
    for path in paths:
        with open(path, "r") as f:
            text = f.read()
        if text.strip() == "": continue
        texts.append(text)
    print "Removed", len(paths) - len(texts), "empty files."

    if ne is True:
        annotators = ["tokenize", "ssplit", "pos", "lemma", "ner"]
    elif norm == "lemma":
        annotators = ["tokenize", "ssplit", "pos", "lemma"]
    else:
        annotators = ["tokenize", "ssplit"]

    if norm == "stem":
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()

    if stop:
        with open("stopwords.txt", "r") as f:
            sw = set([word.strip().decode("utf-8").lower() for word in f])


    with cnlp.Server(annotators=annotators, mem="6G",
            port=port,
            max_message_len=1000000) as client, \
            gzip.open(output_path, "w") as f:

        for i, text in enumerate(texts):
            sys.stdout.write("{:3.1f}%\r".format(i * 100. / len(texts)))
            sys.stdout.flush()
            doc = client.annotate(text)

            if ne:
                per_counts = defaultdict(int)
                org_counts = defaultdict(int)
                loc_counts = defaultdict(int)
                for sent in doc:
                    for tok in sent:
                        if tok.ne == "PERSON":
                            per_counts[unicode(tok.lem).lower()] += 1
                        elif tok.ne == "LOCATION":
                            loc_counts[unicode(tok.lem).lower()] += 1
                        elif tok.ne == "ORGANIZATION":
                            org_counts[unicode(tok.lem).lower()] += 1

                # Pick the most frequently mentioned entity of each type
                # (the key must be the count, x[1], not the constant list [1]).
                if len(per_counts) > 0:
                    central_per = max(per_counts.items(), key=lambda x: x[1])[0]
                else:
                    central_per = None
                if len(org_counts) > 0:
                    central_org = max(org_counts.items(), key=lambda x: x[1])[0]
                else:
                    central_org = None
                if len(loc_counts) > 0:
                    central_loc = max(loc_counts.items(), key=lambda x: x[1])[0]
                else:
                    central_loc = None

            for sent in doc:
                if ne:

                    toks = []
                    for tok in sent:
                        if tok.ne == "PERSON":
                            if unicode(tok.lem).lower() == central_per:
                                print tok, "__CPER__"
                                toks.append(u"__CPER__")
                            else:
                                toks.append(u"__PER__")
                        elif tok.ne == "LOCATION":
                            if unicode(tok.lem).lower() == central_loc:
                                print tok, "__CLOC__"
                                toks.append(u"__CLOC__")
                            else:
                                toks.append(u"__LOC__")

                        elif tok.ne == "ORGANIZATION":
                            if unicode(tok.lem).lower() == central_org:
                                print tok, "__CORG__"
                                toks.append(u"__CORG__")
                            else:
                                toks.append(u"__ORG__")
                        else:
                            if norm == "lemma":
                                form = unicode(tok.lem).lower()
                            elif norm == "stem":
                                form = stemmer.stem(unicode(tok).lower())
                            else:
                                form = unicode(tok).lower()
                            if stop:
                                if form not in sw and len(form) < 50:
                                    toks.append(form)
                            else:
                                if len(form) < 50:
                                    toks.append(form)
                else:
                    if norm == "lemma":
                        toks = [unicode(tok.lem).lower() for tok in sent]
                    elif norm == "stem":
                        toks = [
                            stemmer.stem(unicode(tok).lower()) for tok in sent
                        ]
                    else:
                        toks = [unicode(tok).lower() for tok in sent]
                    if stop:
                        toks = [tok for tok in toks if tok not in sw]
                    toks = [tok for tok in toks if len(tok) < 50]
                if len(toks) == 0: continue
                string = u" ".join(toks).encode("utf-8") + "\n"
                print string
                f.write(string)
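The resulting .spl.gz file holds one whitespace-joined, UTF-8 encoded sentence per line; reading it back (as the tail of Example 5 does) is just:

import gzip
with gzip.open(output_path, "r") as f:   # output_path as constructed above
    sentences = [line.rstrip("\n") for line in f]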