def summarize(sentence_list, target, window, verbose, topk, dic_type):
    tokenize = GrpcTokenizer(target, dic_type=dic_type)
    summarizer = KeywordSummarizer(
        tokenize=tokenize,
        window=window,
        verbose=verbose,
    )
    try:
        return summarizer.summarize(sentence_list, topk=topk)
    except ValueError:
        return []
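For reference, KeywordSummarizer.summarize returns a list of (word, rank) pairs, which is why the wrapper above can safely fall back to an empty list when the word graph cannot be built. A minimal usage sketch with a plain whitespace tokenizer (GrpcTokenizer is project-specific and not shown here):

from textrank import KeywordSummarizer  # assumed package providing KeywordSummarizer

toy = KeywordSummarizer(tokenize=lambda s: s.split(),  # whitespace tokenizer stands in for GrpcTokenizer
                        min_count=1,
                        window=2,
                        verbose=False)
# toy.summarize(sentence_list, topk=5)  # -> [(word, rank), ...] sorted by descending rank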
Example #2
def cluster_keyword_summarizer(results, k):
    keyword_extractor = KeywordSummarizer(
        tokenize=mecab_tokenizer,
        min_count=2,
        window=-1,  # cooccurrence within a sentence
        min_cooccurrence=2,
        vocab_to_idx=None,  # you can specify vocabulary to build word graph
        df=0.85,  # PageRank damping factor
        max_iter=30,  # PageRank maximum iteration
        verbose=False)
    keyword_result = []
    for i in range(k):
        _tmp = keyword_extractor.summarize(results[i], topk=20)
        keyword_result.append(_tmp)
    return keyword_result
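For context, the df and max_iter arguments above control a standard damped PageRank iteration over the word cooccurrence graph. A minimal sketch of that update (an illustration of the algorithm, not the library's actual implementation):

import numpy as np

def pagerank_sketch(A, df=0.85, max_iter=30):
    # A: square word-cooccurrence (adjacency) matrix as a float array
    n = A.shape[0]
    col_sum = A.sum(axis=0)
    col_sum[col_sum == 0] = 1.0                 # avoid division by zero for isolated words
    M = A / col_sum                             # column-normalize: each column sums to 1
    rank = np.ones(n) / n
    for _ in range(max_iter):
        rank = (1 - df) / n + df * (M @ rank)   # damped power iteration
    return rank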
Example #3
def summarize_without_ray(sentence_list,
                          target=TARGET_DEFAULT,
                          window=WINDOW_DEFAULT,
                          verbose=VERBOSE_DEFAULT,
                          topk=TOPK_DEFAULT,
                          dic_type=DIC_TYPE_DEFAULT,
                          min_count=MIN_COUNT_DEFAULT):
    tokenize = GrpcTokenizer(target, dic_type=dic_type)
    summarizer = KeywordSummarizer(
        tokenize=tokenize,
        window=window,
        min_count=min_count,
        verbose=verbose,
    )
    try:
        keyword_list = summarizer.summarize(sentence_list, topk=topk)
    except ValueError:
        return []

    # apply the term1 / English / common-word regex filters
    return [(word, rank) for (word, rank) in keyword_list
            if not term1_regex.remove(word)
            and not english_regex.remove(word)
            and not common_regex.remove(word)]
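term1_regex, english_regex, and common_regex are project-specific filter objects exposing a remove() predicate and are not defined in this snippet. A rough stand-in with plain re patterns (the patterns below are illustrative assumptions, not taken from the project):

import re

english_pattern = re.compile(r'^[A-Za-z]+$')    # e.g. drop purely-English tokens
number_pattern = re.compile(r'^\d+$')           # e.g. drop purely-numeric tokens

def keep(word):
    return not english_pattern.match(word) and not number_pattern.match(word)

# filtered = [(w, r) for (w, r) in keyword_list if keep(w)]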
Example #4
File: test_rank2.py  Project: veroroot/TIL

# imports assumed for this example (konlpy's Komoran and lovit's textrank package)
from konlpy.tag import Komoran
from textrank import KeywordSummarizer

komoran = Komoran()


def komoran_tokenize(sent):
    words = komoran.pos(sent, join=True)
    words = [
        w for w in words
        if ('/NN' in w or '/XR' in w or '/VA' in w or '/VV' in w)
    ]
    return words
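# Note: with join=True, Komoran returns tokens as 'surface/TAG' strings, so the filter
# above keeps nouns (NN*), roots (XR), adjectives (VA), and verbs (VV). Roughly
# (illustrative output, not taken from the source):
#   komoran.pos('경찰이 용의자를 검거했다', join=True)
#   -> ['경찰/NNG', '이/JKS', '용의자/NNG', '를/JKO', '검거/NNG', '하/XSV', '았/EP', '다/EF']
#   komoran_tokenize keeps only ['경찰/NNG', '용의자/NNG', '검거/NNG']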


keyword_extractor = KeywordSummarizer(tokenize=komoran_tokenize,
                                      window=-1,
                                      verbose=False)

sents = [
    '오패산터널 총격전 용의자 검거 서울 연합뉴스 경찰 관계자들이 19일 오후 서울 강북구 오패산 터널 인근에서 사제 총기를 발사해 경찰을 살해한 용의자 성모씨를 검거하고 있다 성씨는 검거 당시 서바이벌 게임에서 쓰는 방탄조끼에 헬멧까지 착용한 상태였다',
    '서울 연합뉴스 김은경 기자 사제 총기로 경찰을 살해한 범인 성모 46 씨는 주도면밀했다',
    '경찰에 따르면 성씨는 19일 오후 강북경찰서 인근 부동산 업소 밖에서 부동산업자 이모 67 씨가 나오기를 기다렸다 이씨와는 평소에도 말다툼을 자주 한 것으로 알려졌다',
    '이씨가 나와 걷기 시작하자 성씨는 따라가면서 미리 준비해온 사제 총기를 이씨에게 발사했다 총알이 빗나가면서 이씨는 도망갔다 그 빗나간 총알은 지나가던 행인 71 씨의 배를 스쳤다',
    '성씨는 강북서 인근 치킨집까지 이씨 뒤를 쫓으며 실랑이하다 쓰러뜨린 후 총기와 함께 가져온 망치로 이씨 머리를 때렸다',
    '이 과정에서 오후 6시 20분께 강북구 번동 길 위에서 사람들이 싸우고 있다 총소리가 났다 는 등의 신고가 여러건 들어왔다',
    '5분 후에 성씨의 전자발찌가 훼손됐다는 신고가 보호관찰소 시스템을 통해 들어왔다 성범죄자로 전자발찌를 차고 있던 성씨는 부엌칼로 직접 자신의 발찌를 끊었다',
    '용의자 소지 사제총기 2정 서울 연합뉴스 임헌정 기자 서울 시내에서 폭행 용의자가 현장 조사를 벌이던 경찰관에게 사제총기를 발사해 경찰관이 숨졌다 19일 오후 6시28분 강북구 번동에서 둔기로 맞았다 는 폭행 피해 신고가 접수돼 현장에서 조사하던 강북경찰서 번동파출소 소속 김모 54 경위가 폭행 용의자 성모 45 씨가 쏜 사제총기에 맞고 쓰러진 뒤 병원에 옮겨졌으나 숨졌다 사진은 용의자가 소지한 사제총기',
    '신고를 받고 번동파출소에서 김창호 54 경위 등 경찰들이 오후 6시 29분께 현장으로 출동했다 성씨는 그사이 부동산 앞에 놓아뒀던 가방을 챙겨 오패산 쪽으로 도망간 후였다',
    '김 경위는 오패산 터널 입구 오른쪽의 급경사에서 성씨에게 접근하다가 오후 6시 33분께 풀숲에 숨은 성씨가 허공에 난사한 10여발의 총알 중 일부를 왼쪽 어깨 뒷부분에 맞고 쓰러졌다',
    '김 경위는 구급차가 도착했을 때 이미 의식이 없었고 심폐소생술을 하며 병원으로 옮겨졌으나 총알이 폐를 훼손해 오후 7시 40분께 사망했다',
    '김 경위는 외근용 조끼를 입고 있었으나 총알을 막기에는 역부족이었다',
]
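# The snippet is cut off before the extraction call; with the extractor and sentence
# list above, the typical next step would be something like (presumed continuation,
# not shown in the source):
#   keywords = keyword_extractor.summarize(sents, topk=30)  # ranked ('word/TAG', rank) pairs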
Example #5
def kobert(MODEL_NAME, train_data, test_data, MAX_LEN=32):
    if MODEL_NAME == 'summakobert':
        print("summarizing data that are longer than MAX_LEN " + str(MAX_LEN))
        log.info("summarizing data that are longer than MAX_LEN " +
                 str(MAX_LEN))
        if config.data_name == 'nsmc':
            summarizer = KeywordSummarizer(tokenize=komoran_tokenizer,
                                           min_count=1,
                                           min_cooccurrence=1)
            print("summarizing train data")
            train_data = kor_summa(summarizer, train_data, MAX_LEN)
            print("summarizing test data")
            test_data = kor_summa(summarizer, test_data, MAX_LEN)

        elif config.data_name == 'imdb':
            print("not implemented yet...")

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    num_labels = 2
    num_epochs = 5
    batch_size = 32
    warmup_ratio = 0.1
    max_grad_norm = 1
    log_interval = 200
    learning_rate = 5e-5  # 1e-5 or 2e-5

    X_train = np.array(train_data['document'])
    X_test = np.array(test_data['document'])
    y_train = np.array(train_data['label'])
    y_test = np.array(test_data['label'])

    drop_train = [
        index for index, sentence in enumerate(X_train) if len(sentence) < 1
    ]
    drop_test = [
        index for index, sentence in enumerate(X_test) if len(sentence) < 1
    ]

    #  delete empty samples
    X_train = np.delete(X_train, drop_train, axis=0)
    y_train = np.delete(y_train, drop_train, axis=0)
    X_test = np.delete(X_test, drop_test, axis=0)
    y_test = np.delete(y_test, drop_test, axis=0)

    train_data = pd.DataFrame({'document': X_train, 'label': y_train})
    test_data = pd.DataFrame({'document': X_test, 'label': y_test})

    print("loading kobert model")
    bertmodel, vocab = get_pytorch_kobert_model()
    tokenizer_path = get_tokenizer()
    tokenizer = nlp.data.BERTSPTokenizer(tokenizer_path, vocab, lower=False)

    print("training dataset is splited to train*0.9 + val*0.1")
    train_data, val_data = split_train_val(train_data, rate=0.1)

    #  tokenize...
    log.info("tokenizing...")
    print("tokenizing train data")
    data_train = BERTDataset(train_data, 0, 1, tokenizer, MAX_LEN, True, False)
    print("tokenizing val data")
    data_val = BERTDataset(val_data, 0, 1, tokenizer, MAX_LEN, True, False)
    print("tokenizing test data")
    data_test = BERTDataset(test_data, 0, 1, tokenizer, MAX_LEN, True, False)

    train_dataloader = DataLoader(data_train,
                                  batch_size=batch_size,
                                  num_workers=5)
    val_dataloader = DataLoader(data_val, batch_size=batch_size, num_workers=5)
    test_dataloader = DataLoader(data_test,
                                 batch_size=batch_size,
                                 num_workers=5)

    model = BERTClassifier(bertmodel,
                           dr_rate=0.5).to(device)  # TODO : disable dropout
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01,
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    t_total = len(train_dataloader) * num_epochs
    warmup_step = int(t_total * warmup_ratio)

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_step,
                                                num_training_steps=t_total)

    for e in range(num_epochs):
        train_acc = 0.0
        val_acc = 0.0

        print("epoch {}".format(e))
        log.info("epoch {}".format(e))

        # train
        model.train()
        for batch_id, (token_ids, valid_length, segment_ids,
                       label) in enumerate(train_dataloader):
            optimizer.zero_grad()
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            valid_length = valid_length
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            train_acc += calc_accuracy(out, label)
            if (batch_id + 1) % log_interval == 0:
                print("batch {:>5,} loss {:>5,} train acc {:>5,}".format(
                    batch_id + 1,
                    loss.data.cpu().numpy(), train_acc / (batch_id + 1)))
                log.info("batch {:>5,} loss {:>5,} train acc {:>5,}".format(
                    batch_id + 1,
                    loss.data.cpu().numpy(), train_acc / (batch_id + 1)))
        print("epoch {:>5,} avg train acc {:>5,}".format(
            e + 1, train_acc / (batch_id + 1)))
        log.info("epoch {:>5,} avg train acc {:>5,}".format(
            e + 1, train_acc / (batch_id + 1)))

        # val
        model.eval()
        for batch_id, (token_ids, valid_length, segment_ids,
                       label) in enumerate(val_dataloader):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            valid_length = valid_length
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            val_acc += calc_accuracy(out, label)
        print("epoch {:>5,} val acc {:>5,}".format(e + 1,
                                                   val_acc / (batch_id + 1)))
        log.info("epoch {:>5,} val acc {:>5,}".format(e + 1, val_acc /
                                                      (batch_id + 1)))

    #  test
    test_acc = 0.0
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids,
                   label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length = valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("test acc {:>5,}".format(test_acc / (batch_id + 1)))
    log.info("test acc {:>5,}".format(test_acc / (batch_id + 1)))
Example #6
def gru(MODEL_NAME, train_data, test_data, MAX_LEN=32):
    if MODEL_NAME == 'summagru':
        print("summarizing data that are longer than MAX_LEN " + str(MAX_LEN))
        log.info("summarizing data that are longer than MAX_LEN " +
                 str(MAX_LEN))
        if config.data_name == 'nsmc':
            summarizer = KeywordSummarizer(tokenize=komoran_tokenizer,
                                           min_count=1,
                                           min_cooccurrence=1)
            print("summarizing train data")
            train_data = kor_summa(summarizer, train_data, MAX_LEN)
            print("summarizing test data")
            test_data = kor_summa(summarizer, test_data, MAX_LEN)

        elif config.data_name == 'imdb':
            print("not implemented yet...")

    print("tokenizing...")
    log.info("tokenizing...")
    okt = Okt()
    print("tokenizing train_data")
    X_train = tokenize(train_data, okt)
    print("tokenizing test_data")
    X_test = tokenize(test_data, okt)
    print()
    log.info("")

    # encode
    print("encoding and preprocessing...")
    log.info("encoding and preprocessing...")
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)

    threshold = 3
    total_cnt = len(tokenizer.word_index)
    rare_cnt = 0
    total_freq = 0
    rare_freq = 0

    for key, value in tokenizer.word_counts.items():
        total_freq = total_freq + value

        if (value < threshold):
            rare_cnt = rare_cnt + 1
            rare_freq = rare_freq + value

    # keep only tokens seen at least `threshold` times
    # (+1 because the Keras Tokenizer reserves index 0 for padding)
    vocab_size = total_cnt - rare_cnt + 1

    tokenizer = Tokenizer(vocab_size)
    tokenizer.fit_on_texts(X_train)
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)
    y_train = np.array(train_data['label'])
    y_test = np.array(test_data['label'])

    drop_train = [
        index for index, sentence in enumerate(X_train) if len(sentence) < 1
    ]
    drop_test = [
        index for index, sentence in enumerate(X_test) if len(sentence) < 1
    ]

    # delete empty samples
    X_train = np.delete(X_train, drop_train, axis=0)
    y_train = np.delete(y_train, drop_train, axis=0)
    X_test = np.delete(X_test, drop_test, axis=0)
    y_test = np.delete(y_test, drop_test, axis=0)

    # 32 / 64 / 128
    max_len = MAX_LEN

    # padding
    X_train = pad_sequences(X_train, maxlen=max_len)
    X_test = pad_sequences(X_test, maxlen=max_len)
    print()

    print("reached checkpoint!")
    log.info("reached checkpoint!")

    model = Sequential()
    model.add(Embedding(vocab_size, 100))
    model.add(GRU(128))
    model.add(Dense(1, activation='sigmoid'))
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
    mc = ModelCheckpoint('best_model.h5',
                         monitor='val_acc',
                         mode='max',
                         verbose=1,
                         save_best_only=True)
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['acc'])
    history = model.fit(X_train,
                        y_train,
                        epochs=15,
                        callbacks=[es, mc],
                        batch_size=60,
                        validation_split=0.1)
    loaded_model = load_model('best_model.h5')

    print("acc : %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))
    log.info("acc : %.4f" % (loaded_model.evaluate(X_test, y_test)[1]))
Example #7
    lines = f.read().splitlines()
    lines = list(filter(None, lines))
    data = ' '
    # skip the first two lines, then rejoin the rest line by line
    for line in lines[2:]:
        data = data + line.strip() + "\n"

    # split into phrase-level units (Okt phrases)
    sents = okt.phrases(data)
    summarizer = KeywordSummarizer(tokenize=okt_tokenizer_ox, min_count=0, min_cooccurrence=1)
    keywords1 = summarizer.summarize(sents, topk=30)

    # split by whitespace
    sents = subword_tokenizer(data)
    summarizer = KeywordSummarizer(tokenize=okt_tokenizer_ox, min_count=0, min_cooccurrence=1)
    keywords2 = summarizer.summarize(sents, topk=30)

    keywords_1 = [word for word, rank in keywords1]
    keywords_2 = [word for word, rank in keywords2]

    f = open("key_word_" + str(i) + ".txt", "w", encoding='UTF-8')
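    # The fragment ends here in the source; presumably the two keyword lists are then
    # written to the file just opened, along the lines of this hypothetical continuation:
    # for w1, w2 in zip(keywords_1, keywords_2):
    #     f.write(w1 + '\t' + w2 + '\n')
    # f.close()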
Example #8
def bert(MODEL_NAME, train_data, test_data, MAX_LEN=32):
    if MODEL_NAME == 'summabert':
        print("summarizing data that are longer than MAX_LEN " + str(MAX_LEN))
        log.info("summarizing data that are longer than MAX_LEN " +
                 str(MAX_LEN))
        if config.data_name == 'nsmc':
            summarizer = KeywordSummarizer(tokenize=komoran_tokenizer,
                                           min_count=1,
                                           min_cooccurrence=1)
            print("summarizing train data")
            train_data = kor_summa(summarizer, train_data, MAX_LEN)
            print("summarizing test data")
            test_data = kor_summa(summarizer, test_data, MAX_LEN)

        elif config.data_name == 'imdb':
            print("not implemented yet...")

    num_labels = 2
    num_epochs = 5
    batch_size = 32

    #  count_tokenizer = Tokenizer()
    #  count_tokenizer.fit_on_texts(train_data['document'])
    #
    #  threshold = 3
    #  total_cnt = len(count_tokenizer.word_index)
    #  rare_cnt = 0
    #  total_freq = 0
    #  rare_freq = 0
    #
    X_train = np.array(train_data['document'])
    X_test = np.array(test_data['document'])
    y_train = np.array(train_data['label'])
    y_test = np.array(test_data['label'])
    #
    #  for key, value in count_tokenizer.word_counts.items():
    #      total_freq = total_freq + value
    #
    #      if(value < threshold):
    #          rare_cnt = rare_cnt + 1
    #          rare_freq = rare_freq + value

    drop_train = [
        index for index, sentence in enumerate(X_train) if len(sentence) < 1
    ]
    drop_test = [
        index for index, sentence in enumerate(X_test) if len(sentence) < 1
    ]

    # delete empty samples
    X_train = np.delete(X_train, drop_train, axis=0)
    y_train = np.delete(y_train, drop_train, axis=0)
    X_test = np.delete(X_test, drop_test, axis=0)
    y_test = np.delete(y_test, drop_test, axis=0)

    train_data = pd.DataFrame({'document': X_train, 'label': y_train})
    test_data = pd.DataFrame({'document': X_test, 'label': y_test})

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)

    print("training dataset is splited to train*0.9 + val*0.1")
    train_data, val_data = split_train_val(train_data, rate=0.1)

    log.info("tokenizing...")
    print("tokenizing train data...")
    train_inputs, train_MAX_LEN, train_attn_masks = tokenize(
        tokenizer, train_data['document'], MAX_LEN)
    train_labels = train_data['label'].values

    print("tokenizing val data...")
    val_inputs, val_MAX_LEN, val_attn_masks = tokenize(tokenizer,
                                                       val_data['document'],
                                                       MAX_LEN)
    val_labels = val_data['label'].values

    print("tokenizing test data...")
    test_inputs, test_MAX_LEN, test_attn_masks = tokenize(
        tokenizer, test_data['document'], MAX_LEN)
    test_labels = test_data['label'].values

    # convert to Pytorch data types
    train_inputs = torch.tensor(train_inputs)
    train_attn_masks = torch.tensor(train_attn_masks)
    train_labels = torch.tensor(train_labels)
    val_inputs = torch.tensor(val_inputs)
    val_attn_masks = torch.tensor(val_attn_masks)
    val_labels = torch.tensor(val_labels)
    test_inputs = torch.tensor(test_inputs)
    test_attn_masks = torch.tensor(test_attn_masks)
    test_labels = torch.tensor(test_labels)

    # Create the DataLoader
    train_data = TensorDataset(train_inputs, train_attn_masks, train_labels)
    val_data = TensorDataset(val_inputs, val_attn_masks, val_labels)
    test_data = TensorDataset(test_inputs, test_attn_masks, test_labels)

    # fine tune and test
    fine_tune_and_test(train_data=train_data,
                       val_data=val_data,
                       test_data=test_data,
                       num_labels=num_labels,
                       num_epochs=num_epochs,
                       batch_size=batch_size)
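The tokenize(tokenizer, documents, MAX_LEN) helper used above is not included in this snippet; based on its three return values, a hypothetical sketch with the Hugging Face tokenizer could look like:

def tokenize(tokenizer, documents, max_len):
    # encode each document to fixed-length input ids plus a matching attention mask
    input_ids = [tokenizer.encode(doc,
                                  add_special_tokens=True,
                                  max_length=max_len,
                                  truncation=True,
                                  padding='max_length') for doc in documents]
    attn_masks = [[int(tok != tokenizer.pad_token_id) for tok in ids] for ids in input_ids]
    return input_ids, max_len, attn_masks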
Example #9
        neg_tmp.append(total_len)
        neg_ks.append(neg_tmp)

    with open('data/애매한영화/' + name, 'w', encoding='euc-kr', newline='') as f2:
        wr = csv.writer(f2)
        wr.writerow([
            'movie_id', 'text_rank', 'content', 'emotion', 'rating',
            'total_rank_pred', 'posneg_len', 'total_len'
        ])
        for i in pos_ks:
            wr.writerow(i)
        for i in neg_ks:
            wr.writerow(i)

    keyword_extractor = KeywordSummarizer(tokenize=mecab_noun_tokenizer,
                                          window=2,
                                          verbose=True)
    pos_keywords = keyword_extractor.summarize(pos_samples_sents, topk=100)
    neg_keywords = keyword_extractor.summarize(neg_samples_sents, topk=100)
    pos_wordrank = {}
    neg_wordrank = {}
    for word, rank in pos_keywords:
        pos_wordrank[word[:word.find('/')]] = rank
    for word, rank in neg_keywords:
        neg_wordrank[word[:word.find('/')]] = rank

    if '영화' in pos_wordrank:
        del pos_wordrank['영화']
    if '것' in pos_wordrank: