Example #1
    def __init__(self, window_size):
        self.stemmer = CacheStemmer()
        self.window_size = window_size
        self.doc_posting = None
        self.stopword = load_stopwords()

        vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_file, do_lower_case=True)


        def load_pickle(name):
            p = os.path.join(cpath.data_path, "adhoc", name + ".pickle")
            return pickle.load(open(p, "rb"))

        self.doc_len_dict = load_pickle("doc_len")
        self.qdf = load_pickle("robust_qdf_ex")
        self.meta = load_pickle("robust_meta")
        self.head_tokens = load_pickle("robust_title_tokens")
        self.seg_info = load_pickle("robust_seg_info")
        self.not_found = set()

        self.total_doc_n = len(self.doc_len_dict)
        self.avdl = sum(self.doc_len_dict.values()) / len(self.doc_len_dict)
        tprint("Init PassageRanker")
Example #2
def display(tf1, tf2, label_name1="pos", label_name2="neg"):
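    # Rank terms by a frequency-weighted odds score and print the k terms most
    # associated with each label (top of the list for label_name1, bottom of
    # the list for label_name2), skipping stopwords and terms that occur 10
    # times or fewer in either distribution.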
    odd_dict = get_all_term_odd(tf1, tf2, 0.95)

    def contrib(e):
        key, value = e
        return (tf1[key] + tf2[key]) * value

    odd_list = list(odd_dict.items())
    odd_list.sort(key=contrib, reverse=True)
    stopword = load_stopwords()

    def valid(e):
        key, value = e
        return key not in stopword and tf1[key] > 10 and tf2[key] > 10

    acc = 0
    for key, value in odd_list:
        acc += value * (tf1[key] + tf2[key])

    ctf = sum(tf1.values()) + sum(tf2.values())
    print(acc, acc/ctf)

    k = 50

    odd_list = list(filter(valid, odd_list))
    print("Top {} ".format(label_name1))
    for key, value in odd_list[:k]:
        print(key, tf1[key], tf2[key], odd_dict[key])
    print("Top {} ".format(label_name2))
    for idx in range(len(odd_list) - 1, len(odd_list) - 1 - k, -1):
        key, value = odd_list[idx]
        print(key, contrib(odd_list[idx]), tf1[key], tf2[key], odd_dict[key])
Example #3
def sum_random_walk_score(name_class):
    d_ids: List[int] = list(load_train_claim_ids())
    claims = get_claims_from_ids(d_ids)
    claim_d = claims_to_dict(claims)

    prob_score_d = load_from_pickle("pc_{}_word_prob_train".format(name_class))
    stopwords = load_stopwords()
    acc_counter_prob_init = Counter()
    for claim_id, prob_scores in prob_score_d.items():
        for k, v in prob_scores:
            if k not in stopwords:
                acc_counter_prob_init[k] += v

    rw_score = dict(load_from_pickle("bias_random_walk_train_{}".format(name_class)))
    acc_counter = Counter()
    for claim_id, qtf in rw_score.items():
        for k, v in qtf.items():
            acc_counter[k] += v

    acc_counter_prob_init = normalize_counter_to_sum1(acc_counter_prob_init)
    acc_counter = normalize_counter_to_sum1(acc_counter)

    new_counter = Counter()
    for k, v in acc_counter.items():
        if len(k) > 2:
            new_v = v - acc_counter_prob_init[k]
            new_counter[k] = new_v

    return new_counter
Example #4
def merge_batch(parsed_data):
    stopwords = load_stopwords()

    all_annotations = defaultdict(list)
    for entry in parsed_data:
        url_id = entry['url_id']
        all_annotations[url_id].extend(entry['statements'])

    def get_dist(text1, text2):
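        # Overlap score between two texts: the product of the shared-token
        # fraction of each text (higher means more similar).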
        tokens1 = tokenize(text1, stopwords)
        tokens2 = tokenize(text2, stopwords)

        common = set(tokens1).intersection(set(tokens2))
        n_common = len(common)
        return (n_common / len(tokens1)) * (n_common / len(tokens2))

    result = dict()
    dist_thres = 0.5
    for key in all_annotations:
        annot_list = all_annotations[key]
        n = len(annot_list)
        for idx1 in range(n):
            for idx2 in range(idx1 + 1, n):
                annot1 = annot_list[idx1]
                annot2 = annot_list[idx2]
                dist = get_dist(annot1, annot2)

                if dist > dist_thres:
                    print("Diff : ")
                    print(annot1)
                    print(annot2)

        result[key] = annot_list
    return result
Example #5
    def __init__(self):
        vocab_file = os.path.join(data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                    do_lower_case=True)
        self.stemmer = CacheStemmer()
        self.stopword = load_stopwords()
        self.df = self.load_galgo_df_stat()
Example #6
def segment_per_doc_index(task_id):
    token_reader = get_token_reader()
    stemmer = CacheStemmer()
    stopword = load_stopwords()

    p = os.path.join(cpath.data_path, "adhoc", "robust_seg_info.pickle")
    seg_info = pickle.load(open(p, "rb"))

    def get_doc_posting_list(doc_id):
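        # Build a per-segment posting for the document: each non-stopword
        # stemmed term maps to (segment start offset, count within the segment).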
        tokens = token_reader.retrieve(doc_id)
        st_tokens = list([stemmer.stem(t) for t in tokens])
        doc_posting = defaultdict(list)
        for interval in seg_info[doc_id]:
            (loc, loc_ed), (_, _) = interval
            ct = Counter(st_tokens[loc:loc_ed])
            for term, cnt in ct.items():
                if term in stopword:
                    continue
                doc_posting[term].append((loc, cnt))

        return doc_posting

    doc_id_list = get_doc_task(task_id)
    ticker = TimeEstimator(len(doc_id_list))
    doc_posting_d = {}
    for doc_id in doc_id_list:
        doc_posting_d[doc_id] = get_doc_posting_list(doc_id)
        ticker.tick()

    save_path = os.path.join(cpath.data_path, "adhoc",
                             "per_doc_posting_{}.pickle".format(task_id))
    pickle.dump(doc_posting_d, open(save_path, "wb"))
Example #7
def get_odd_list():
    result = load_from_pickle("majority_3gram")
    tf0, tf1, tf2 = result
    odd_dict = get_all_term_odd(tf1, tf2, 0.95)

    def contrib(e):
        key, value = e
        return (tf1[key] + tf2[key]) * value

    odd_list = list(odd_dict.items())
    odd_list.sort(key=contrib, reverse=True)
    stopword = load_stopwords()

    def valid(e):
        key, value = e
        return key not in stopword and tf1[key] > 10 and tf2[key] > 10

    acc = 0
    for key, value in odd_list:
        acc += value * (tf1[key] + tf2[key])

    ctf = sum(tf1.values()) + sum(tf2.values())
    print(acc, acc / ctf)

    return list(filter(valid, odd_list))
Example #8
    def summarize(self):
        topic = data_generator.argmining.ukp_header.all_topics[0]
        data_loader = ukp.DataLoader(topic)
        stopwords = load_stopwords()

        def tokenize(x):
            return tokenizer.tokenize(x, stopwords)

        def sent_score(token_sent, bow_score):
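            # Score a sentence as a decaying sum of per-token bow scores;
            # the weight halves with each subsequent token.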
            score = 0
            factor = 1
            for t in token_sent:
                score += bow_score[t] * factor
                factor *= 0.5
            return score

        def is_argument(entry):
            return entry['annotation'] == "Argument_for" or entry[
                'annotation'] == "Argument_against"

        for topic in data_generator.argmining.ukp_header.all_topics:
            entries = data_loader.all_data[topic]
            raw_sents = list(
                [e['sentence'] for e in entries if e['set'] == 'train'])
            token_sents = list(map(tokenize, raw_sents))
            tprint("Runing TextRank")
            text_rank = TextRank(token_sents)
            tr_score = Counter(text_rank.run(flatten(token_sents)))
            tprint("claim_gen.generate")

            raw_sents.sort(key=lambda x: sent_score(tokenize(x), tr_score),
                           reverse=True)
            for i in range(10):
                print(raw_sents[i])
Example #9
File: ukp.py Project: clover3/Chair
    def __init__(self, target_topic, is_3way, max_sequence, vocab_filename):
        DataLoader.__init__(self, target_topic, is_3way)

        self.max_seq = max_sequence
        voca_path = os.path.join(data_path, vocab_filename)
        self.stopwords = load_stopwords()
        self.lower_case = True
        self.sep_char = "#"
        self.encoder = FullTokenizerWarpper(voca_path)
Example #10
    def __init__(self):
        self.stopword = load_stopwords()
        self.stemmer = CacheStemmer()
        vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                    do_lower_case=True)
        tprint("Loading inv_index for robust")
        self.collection = RobustCollection()
        tprint("Done")
        self.num_candidate = 10
Example #11
    def build3(self, C, NC):
        stopwords = load_stopwords()
        self.supervised = True

        self.NC = NC
        self.C = C
        self.stopword = stopwords

        self.NC_ctf = sum(self.NC.values())
        self.C_ctf = sum(self.C.values())
Example #12
    def __init__(self, data):
        self.p_reset = 0.1
        self.max_repeat = 500
        self.window_size = 10
        self.idf = collections.Counter()
        self.def_idf = 2
        for document in data:
            for word in set(document):
                self.idf[word] += 1
        self.stopword = load_stopwords()
Example #13
def build_inv_index(sents):
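    # Map every term that is longer than one character and not a stopword to
    # the list of sentence indices in which it appears.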
    stopword = load_stopwords()
    stopword.add("should")
    group = {}

    for idx, sent in enumerate(sents):
        for t in sent:
            if len(t) > 1 and t not in stopword:
                assign_list_if_not_exists(group, t)
                group[t].append(idx)
    return group
Example #14
    def build(self, lm_docs_list, bg_tf, bg_ctf):
        self.n_lm = len(lm_docs_list)

        stopwords = load_stopwords()

        def transform(counter):
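            # Stem the counter keys when a stemmer is configured, then drop
            # stopwords and terms of three characters or fewer.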
            if self.stemmer is None:
                new_tf = counter
            else:
                new_tf = Counter()
                for key in counter:
                    source = key
                    target = self.stemmer(key)
                    new_tf[target] += counter[source]

            counter = new_tf
            new_tf = Counter()
            for key in counter:
                if len(key) <= 3 or key in stopwords:
                    pass
                else:
                    new_tf[key] = counter[key]
            return new_tf

        def remove_stopword(counter):
            new_tf = Counter()
            for key in counter:
                if len(key) < 3 or key in stopwords:
                    pass
                else:
                    new_tf[key] = counter[key]
            return new_tf

        self.BG = transform(bg_tf)
        self.BG_ctf = bg_ctf
        self.stopword = stopwords

        for lm_docs in lm_docs_list:
            c_tf = collections.Counter()
            for idx, s in enumerate(lm_docs):
                tokens = self.tokenizer(s)
                for token in tokens:
                    if token in bg_tf:
                        c_tf[token] += 1

            tf_dict = transform(c_tf)
            self.C.append(tf_dict)
            self.C_ctf.append(sum(tf_dict.values()))
Example #15
    def build2(self, x, y):
        stopwords = load_stopwords()
        self.stopword = stopwords
        self.supervised = True

        self.NC = collections.Counter()
        self.C = collections.Counter()

        def update(counter, tokens):
            for token in tokens:
                counter[token] += 1

        for idx, s in enumerate(x):
            tokens = self.tokenize(s)
            if y[idx] == 0:
                update(self.NC, tokens)
            elif y[idx] == 1:
                update(self.C, tokens)

        self.NC_ctf = sum(self.NC.values())
        self.C_ctf = sum(self.C.values())

        vectors = []
        for idx, s in enumerate(x):
            tokens = self.tokenize(s)
            odd = self.log_odd_binary(tokens)
            vectors.append((odd, y[idx]))
        vectors.sort(key=lambda x: x[0], reverse=True)

        total = len(vectors)
        p = np.count_nonzero(y)
        fp = 0
        max_acc = 0
        self.opt_alpha = 0
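        # Sweep the sorted odds values as candidate decision thresholds and
        # keep the one that maximizes training accuracy.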
        for idx, (odd, label) in enumerate(vectors):
            alpha = odd - 1e-8
            if label == 0:
                fp += 1

            tp = (idx + 1) - fp
            fn = p - tp
            tn = total - (idx + 1) - fn
            acc = (tp + tn) / (total)
            if acc > max_acc:
                self.opt_alpha = alpha
                max_acc = acc

        print("Train acc : {}".format(max_acc))
Example #16
def build_krovetz_index():
    stemmer = Stemmer()
    stopwords = load_stopwords()

    stem_dict = dict()

    def stem(token):
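        # Memoized stemming: cache each token's stem to avoid repeated work.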
        if token in stem_dict:
            return stem_dict[token]
        else:
            r = stemmer.stem(token)
            stem_dict[token] = r
            return r

    collection = trec.load_robust(trec.robust_path)
    print("writing...")
    inv_index = dict()
    ticker = TimeEstimator(len(collection))

    for doc_id in collection:
        content = collection[doc_id]
        tokens = nltk.tokenize.wordpunct_tokenize(content)
        terms = dict()
        for idx, t in enumerate(tokens):
            if t in stopwords:
                continue

            t_s = stem(t)

            if t_s not in terms:
                terms[t_s] = list()

            terms[t_s].append(idx)

        for t_s in terms:
            if t_s not in inv_index:
                inv_index[t_s] = list()

            posting = (doc_id, terms[t_s])
            inv_index[t_s].append(posting)

        ticker.tick()

    save_path = os.path.join(cpath.data_path, "adhoc",
                             "robust_inv_index.pickle")
    pickle.dump(inv_index, open(save_path, "wb"))
Example #17
    def __init__(self, data, data_info):
        super(DictAuxDataFeeder, self).__init__(data)
        self.stopword = load_stopwords()
        vocab_file = os.path.join(data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                    do_lower_case=True)

        self.dict = self.encode_dict_as_feature(self.raw_dictionary)

        # data is already truncated and padded
        self.data = data

        self.data_len = len(self.data)
        if data_info is not None:
            self.data_info = data_info
        else:
            self.data_info = self.nli_data_indexing(data)
Example #18
def get_simple_claim_query(claims, drop_stopwords=False) -> List[DocQuery]:
    if drop_stopwords:
        stopword = load_stopwords()

    queries: List[DocQuery] = []
    for c in claims:
        cid = str(c["cId"])
        claim_text = c["text"]
        q_terms: List[str] = clean_tokenize_str_to_tokens(claim_text)
        print(q_terms)
        if drop_stopwords:
            q_terms = list([t for t in q_terms if t not in stopword])
        q_terms = list([t.replace(".", "") for t in q_terms])
        print(q_terms)

        q_entry: DocQuery = format_query_simple(cid, q_terms)
        queries.append(q_entry)
    return queries
Example #19
def build_co_occurrence(list_tokens: List[List[str]], window_size,
                        stemmer: CacheStemmer) -> Counter:
    list_tokens: List[List[str]] = lmap(stemmer.stem_list, list_tokens)

    stopword = load_stopwords()

    def remove_stopwords(tokens: List[str]) -> List[str]:
        return list([t for t in tokens if t not in stopword])

    list_tokens: List[List[str]] = lmap(remove_stopwords, list_tokens)
    counter = Counter()

    def count_co_ocurrence_fn(token_list):
        count_co_ocurrence(window_size, counter, token_list)

    foreach(count_co_ocurrence_fn, list_tokens)

    return counter
Example #20
    def tf_stat(self):
        topic = data_generator.argmining.ukp_header.all_topics[0]
        data_loader = ukp.DataLoader(topic)
        stopwords = load_stopwords()

        def tokenize(x):
            return tokenizer.tokenize(x, stopwords)

        for topic in data_generator.argmining.ukp_header.all_topics:
            print("-----------")
            print(topic)
            entries = data_loader.all_data[topic]
            token_sents = list([
                tokenize(e['sentence']) for e in entries if e['set'] == 'train'
            ])
            tf_dict = Counter(flatten(token_sents))
            for word, tf in tf_dict.most_common(30):
                print(word, tf)
Example #21
def test_generative_model():
    train, val = load_feature_and_split()
    print("Training lm")
    classifier = learn_lm(train)
    stopwords = load_stopwords()

    def filter_fn(data_point: Dict):
        remove_stopword_and_punct(stopwords, data_point['feature'])

    foreach(filter_fn, train)

    def is_correct(data_point: Dict):
        x = data_point['feature']
        y = int(data_point['label'])
        return classifier.predict(x) == int(y)

    correctness = lmap(is_correct, val)

    print("val acc: ", average(correctness))
Example #22
def from_pos_neg(pos_docs, neg_docs):
    stemmer = None
    stopwords = load_stopwords()
    y = list(1 for _ in pos_docs) + list(0 for _ in neg_docs)

    def transform(counter):
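    # Normalize term counts: stem the keys if a stemmer is given, then drop
    # stopwords and terms of three characters or fewer.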
        if stemmer is None:
            new_tf = counter
        else:
            new_tf = Counter()
            for key in counter:
                source = key
                target = stemmer(key)
                new_tf[target] += counter[source]

        counter = new_tf
        new_tf = Counter()
        for key in counter:
            if len(key) <= 3 or key in stopwords:
                pass
            else:
                new_tf[key] = counter[key]
        return new_tf

    def count_word_parallel(documents):
        split = 30
        p = Pool(split)
        args = chunks(documents, split)
        counters = p.map(count_word, args)
        g_counter = Counter()
        for counter in counters:
            for key in counter.keys():
                g_counter[key] += counter[key]
        return g_counter

    c_counter = transform(count_word_parallel(pos_docs))
    nc_counter = transform(count_word_parallel(neg_docs))

    tokenizer = lambda x: tokenize(x, set(), False)
    classifier = LMClassifer(tokenizer, None, fulltext=True)
    classifier.build3(c_counter, nc_counter)
    return classifier
Example #23
def get_yw_may():
    from dispute.guardian import load_local_pickle

    stopwords = load_stopwords()
    tokenizer = lambda x: tokenize(x, stopwords, False)

    class YWMay:
        def __init__(self):
            self.stopwords = stopwords
            self.topic_info = load_local_pickle("topic_score")

        def get_tf10(self, tokens):
            counter = Counter()
            for t in tokens:
                if t not in self.stopwords and len(t) > 2:
                    counter[t] += 1

            return counter.most_common(10)

        def score(self, docs):
            def term_odd(token):
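                # Log-odds of the per-term topic probability; unknown tokens
                # and tokens with near-certain probabilities contribute 0.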
                if token not in self.topic_info:
                    return 0
                else:
                    p = self.topic_info[token]
                    if p > 0.9999 or p < 0.0001:
                        return 0
                    else:
                        return math.log(p) - math.log(1 - p)

            def predict(doc):
                tokens = tokenizer(doc)
                sum_odd = 0

                top10 = left(list(self.get_tf10(tokens)))
                odd_list = lmap(term_odd, tokens)
                result = sum(odd_list)
                return result

            return lmap(predict, docs)

    return YWMay()
Example #24
File: pipeline.py Project: clover3/Chair
    def __init__(self):
        tprint("Pipeline Init")
        self.stemmer = CacheStemmer()
        vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                    do_lower_case=True)
        self.iteration_dir = "/mnt/scratch/youngwookim/data/tlm_iter1"
        if not os.path.exists("/mnt/scratch/youngwookim/"):
            self.iteration_dir = "/mnt/nfs/work3/youngwookim/data/tlm_iter1"
        self.seg_max_seq = 256
        self.model_max_seq = 512
        self.rng = random.Random(0)
        self.masked_lm_prob = 0.15
        self.short_seq_prob = 0.1
        self.inst_per_job = 1000
        self.stopword = load_stopwords()
        self.pr = FeatureExtractor(self.seg_max_seq - 3)
        self.tf_record_maker = None
        self.code_tick = CodeTiming()
        tprint("Pipeline Init Done")
Example #25
    def build4(self, c_tf, bg_tf, bg_ctf):
        stopwords = load_stopwords()
        self.stopword = stopwords

        def transform(counter: Counter):
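            # Stem the counter keys when a stemmer is configured (silently
            # skipping terms the stemmer fails on), then drop stopwords and
            # terms of three characters or fewer.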
            if self.stemmer is None:
                new_tf = counter
            else:
                new_tf = Counter()
                for key in counter:
                    source = key
                    try:
                        target = self.stemmer(key)
                        new_tf[target] += counter[source]
                    except:
                        pass

            counter = new_tf
            new_tf = Counter()
            for key in counter:
                if len(key) <= 3 or key in stopwords:
                    pass
                else:
                    new_tf[key] = counter[key]
            return new_tf

        def remove_stopword(counter):
            new_tf = Counter()
            for key in counter:
                if len(key) < 3 or key in stopwords:
                    pass
                else:
                    new_tf[key] = counter[key]
            return new_tf

        self.BG = transform(bg_tf)
        self.BG_ctf = bg_ctf
        self.C = transform(c_tf)
        self.C_ctf = sum(self.C.values())
Example #26
def count_it(
        data: Dict[str, List[ScoreParagraph]]) -> List[Tuple[str, Counter]]:
    stemmer = CacheStemmer()
    r = []
    stopword = load_stopwords()

    def remove_stopwords(tokens: List[str]) -> List[str]:
        return list([t for t in tokens if t not in stopword])

    ticker = TimeEstimator(len(data))
    for cid, para_list in data.items():
        ticker.tick()
        tokens_list: List[List[str]] = [e.paragraph.tokens for e in para_list]
        list_tokens: List[List[str]] = lmap(stemmer.stem_list, tokens_list)
        list_tokens: List[List[str]] = lmap(remove_stopwords, list_tokens)

        all_cnt = Counter()
        for tokens in list_tokens:
            all_cnt.update(Counter(tokens))

        r.append((cid, all_cnt))
    return r
Example #27
def lm_contribution():
    train, val = load_feature_and_split()
    print("Training lm")
    stopwords = load_stopwords()

    def filter_fn(data_point):
        remove_stopword_and_punct(stopwords, data_point[0][0])

    foreach(filter_fn, train)
    classifier = learn_lm(train)

    acc_contrib = Counter()
    for data_point in train:
        (tf, num), y = data_point

        contrib = classifier.counter_contribution(tf)
        # print("{} {} {}".format(y, classifier.predict(tf), classifier.counter_odd(tf)))
        # print("--------------")
        for t, score in contrib.most_common(100):
            acc_contrib[t] += score

    for t, score in acc_contrib.most_common(100):
        print(t, score, classifier.P_w_C_dict[t], classifier.P_w_NC_dict[t])
Example #28
def save_qdf_ex():
    ii_path = os.path.join(cpath.data_path, "adhoc", "robust_inv_index.pickle")
    inv_index = pickle.load(open(ii_path, "rb"))
    meta_path = os.path.join(cpath.data_path, "adhoc", "robust_meta.pickle")
    meta = pickle.load(open(meta_path, "rb"))
    stopwords = load_stopwords()
    stemmer = CacheStemmer()

    simple_posting = {}
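    # For every term, collect the set of documents that contain it, either in
    # the body (taken from the inverted index) or in the stemmed headline.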

    qdf_d = Counter()
    for term in inv_index:
        simple_posting[term] = set()
        for doc_id, _ in inv_index[term]:
            simple_posting[term].add(doc_id)

    for doc in meta:
        date, headline = meta[doc]
        tokens = nltk.tokenize.wordpunct_tokenize(headline)
        terms = set()
        for idx, t in enumerate(tokens):
            if t in stopwords:
                continue

            t_s = stemmer.stem(t)

            terms.add(t_s)

        for t in terms:
            # skip headline terms that never made it into the inverted index
            if t in simple_posting:
                simple_posting[t].add(doc)

    for term in inv_index:
        qdf = len(simple_posting[term])
        qdf_d[term] = qdf

    save_path = os.path.join(cpath.data_path, "adhoc", "robust_qdf_ex.pickle")
    pickle.dump(qdf_d, open(save_path, "wb"))
Example #29
    def divergence(self):
        # Compare Arg vs Non-Arg
        topic = data_generator.argmining.ukp_header.all_topics[0]
        data_loader = ukp.DataLoader(topic)
        stopwords = load_stopwords()

        def tokenize(x):
            return tokenizer.tokenize(x, stopwords)

        def is_argument(entry):
            return entry['annotation'] == "Argument_for" or entry[
                'annotation'] == "Argument_against"

        for topic in data_generator.argmining.ukp_header.all_topics:
            print("-----------")
            print(topic)
            entries = data_loader.all_data[topic]
            token_sents = list([
                tokenize(e['sentence']) for e in entries if e['set'] == 'train'
            ])
            topic_tf = Counter(flatten(token_sents))

            arg_div = []
            narg_div = []
            for e in entries:
                sent_tf = Counter(tokenize(e['sentence']))
                div = kl.kl_divergence_subset(sent_tf, topic_tf)
                assert not math.isnan(div)

                if e['set'] == 'train' and is_argument(e):
                    arg_div.append(div)
                elif e['set'] == 'train':
                    narg_div.append(div)

            print("Arg KL mean : ", average(arg_div))
            print("Non-Arg KL mean : ", average(narg_div))
Example #30
    def divergence_lr(self):
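        # Leave-one-topic-out evaluation: train a bag-of-words classifier whose
        # last feature column is overwritten with a KL-divergence score, then
        # report 3-label F1 on the held-out topic's validation split.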
        f1_list = []
        for dev_topic in data_generator.argmining.ukp_header.all_topics:
            print(dev_topic)
            data_loader = ukp.DataLoader(dev_topic)
            idx_for = data_loader.labels.index("Argument_for")
            idx_against = data_loader.labels.index("Argument_against")

            train_data = data_loader.get_train_data()
            dev_data = data_loader.get_dev_data()

            train_X, train_y = zip(*train_data)
            dev_X, dev_y = zip(*dev_data)
            feature = CountVectorizer()
            train_X_v = feature.fit_transform(train_X)

            stopwords = load_stopwords()

            def tokenize(x):
                return tokenizer.tokenize(x, stopwords)

            data_idx = 0
            for topic in data_generator.argmining.ukp_header.all_topics:
                if topic == dev_topic:
                    continue
                entries = data_loader.all_data[topic]
                token_sents = list([
                    tokenize(e['sentence']) for e in entries
                    if e['set'] == 'train'
                ])
                topic_tf = Counter(flatten(token_sents))

                for e in entries:
                    if e['set'] == 'train':
                        sent_tf = Counter(tokenize(e['sentence']))
                        div = kl.kl_divergence_subset(sent_tf, topic_tf)
                        assert not math.isnan(div)
                        train_X_v[data_idx, -1] = div
                        data_idx += 1

            assert data_idx == len(train_X)

            classifier = LogisticRegression(random_state=0,
                                            solver='lbfgs',
                                            multi_class='multinomial')
            #classifier = LinearSVC()
            classifier = MLPClassifier()

            classifier.fit(train_X_v, train_y)

            dev_X_v = feature.transform(dev_X)

            token_sents = list([
                tokenize(e['sentence'])
                for e in data_loader.all_data[dev_topic] if e['set'] == 'val'
            ])
            topic_tf = Counter(flatten(token_sents))
            data_idx = 0
            for e in data_loader.all_data[dev_topic]:
                if e['set'] == 'val':
                    sent_tf = Counter(tokenize(e['sentence']))
                    div = kl.kl_divergence_subset(sent_tf, topic_tf)
                    dev_X_v[data_idx, -1] = div
                    data_idx += 1

            assert data_idx == len(dev_X)
            train_pred = classifier.predict(train_X_v)
            dev_pred = classifier.predict(dev_X_v)

            def print_eval(pred_y, gold_y):
                all_result = eval_3label(pred_y, gold_y)
                for_result = all_result[idx_for]
                against_result = all_result[idx_against]
                f1 = sum([result['f1'] for result in all_result]) / 3
                print("F1", f1)
                print("P_arg+", for_result['precision'])
                print("R_arg+", for_result['recall'])
                print("P_arg-", against_result['precision'])
                print("R_arg-", against_result['recall'])
                return f1

            #print("Train")
            #print_eval(train_pred, train_y)

            f1 = print_eval(dev_pred, dev_y)
            f1_list.append(f1)
        print("Average F1", average(f1_list))