Example #1
# NOTE: `utils`, `stems`, and `load_model` are project-local helpers whose
# imports fall outside this snippet.
def main_segmentation(doc_num,
                      window_size,
                      model_type,
                      doc_type,
                      segmentation_type,
                      eval=False):
    # === Load doc ===
    print('')
    print('Interview:', doc_num)
    print('Load data')
    path = './data/interview/interview-text_01-26_' + doc_num + '.txt'

    data = utils.load_data(path)
    if doc_type == 'sentence':
        data = utils.to_sentence(data)

    docs = [row[1] for row in data]
    label = [row[0] for row in data]
    print(data[:5])
    print('Done')

    # === Model ===
    print('Model:', model_type)
    print('Segmentation type:', segmentation_type)
    model, segmentation_model = load_model(model_type, segmentation_type)

    # === Result ===
    print('=== Results ===')
    res = segmentation_model.segment([stems(doc) for doc in docs])
    print(segmentation_model.sim_arr)
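
The snippets on this page lean on project-local helpers (`utils.load_data`, `utils.to_sentence`, `stems`, `load_model`) that are not shown. Below is a minimal sketch of the shapes they appear to have, inferred purely from usage (`row[0]` is a speaker label, `row[1]` the text; `stems` returns a token list); every name and parsing rule in it is an assumption, not the project's actual code:

# Hypothetical reconstruction of two unshown helpers, inferred from usage.
import re

def load_data(path):
    # Assumed format: one utterance per non-empty line, 'LABEL: text'.
    rows = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            label, _, text = line.partition(':')
            rows.append([label.strip(), text.strip()])
    return rows  # [[label, text], ...], matching row[0]/row[1] above

def stems(doc, polish=False, sw=None):
    # Stand-in tokenizer; the real project presumably runs a Japanese
    # morphological analyzer (e.g. MeCab or janome) plus stemming.
    tokens = re.findall(r'\w+', doc)
    if polish and sw is not None:
        tokens = [t for t in tokens if t not in sw]
    return tokens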
Example #2
                or args[1] == 'utterance' or args[1] == 'segmentation/ans'):
            print('Argument is invalid')
            exit()
    else:
        print('Arguments are too short')
        exit()

    doc_type = args[1]

    doc_num = 'all'
    path = './data/interview/interview-text_01-26_' + doc_num + '.txt'

    if doc_type == 'sentence':
        data = utils.load_data(path)
        # to sentence
        data = utils.to_sentence(data)
        docs = [row[1] for row in data]

    elif doc_type == 'utterance':
        data = utils.load_data(path)
        docs = [row[1] for row in data]

    elif doc_type == 'segmentation' or doc_type == 'segmentation/ans':
        ans = False
        if doc_type == 'segmentation/ans':
            ans = True
        if doc_num == 'all':
            doc_num = '26'
        data_arr = []
        for num in range(1, int(doc_num) + 1):
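
The hand-rolled sys.argv validation in examples #2 through #4 (check args[1] against a fixed set, exit on bad or missing input) could equally be written with the standard library's argparse. A minimal sketch, not the project's actual CLI:

# Equivalent argument handling via argparse (sketch only).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('doc_type',
                    choices=['sentence', 'utterance',
                             'segmentation', 'segmentation/ans'])
args = parser.parse_args()
doc_type = args.doc_type  # argparse exits with a usage message on bad input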
Example #3
        if not(args[1] == 'tfidf' or args[1] == 'doc2vec' or args[1] == 'word2vec'):
            print('Argument is invalid')
            exit()
        if args[-1] == 'update':
            update = True
    else:
        print('Arguments are too short')
        exit()

    model_type = args[1]

    # docs: the entire interview corpus
    print('Load data')
    # Train the model
    path = './data/interview/interview-text_01-26_all.txt'
    data = utils.to_sentence(utils.load_data(path))
    docs = [row[1] for row in data]

    # max_characters: single sentences of XX or more characters are excluded from summarization
    # docs = utils.polish_docs(docs, max_characters=1000)
    sw = stopwords()
    docs_for_train = [stems(doc, polish=True, sw=sw) for doc in docs]
    print(docs_for_train[:10])
    total_tokens = sum(len(arr) for arr in docs_for_train)  # avoid shadowing built-in sum
    print(total_tokens)
    """
    以下のようなデータを作っています
    edocs_for_train = [
    ['出身は', 'どこ', 'ですか' ...
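
model_type is validated against 'tfidf', 'doc2vec', and 'word2vec', so docs_for_train (a list of token lists) presumably feeds one of those models. A minimal sketch of the word2vec case with gensim, assuming gensim >= 4; the hyperparameters and save path are illustrative, not the project's:

# Sketch: training word2vec on the tokenized sentences (gensim >= 4 API).
from gensim.models import Word2Vec

model = Word2Vec(sentences=docs_for_train,  # list of token lists, as above
                 vector_size=100,           # illustrative hyperparameters
                 window=5,
                 min_count=1)
model.save('./model/word2vec.model')        # hypothetical save path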
Example #4
        if not (args[1] == 'tfidf' or args[1] == 'doc2vec'
                or args[1] == 'word2vec'):
            print('Argument is invalid')
            exit()
        if args[-1] == 'update':
            update = True
    else:
        print('Arguments are too short')
        exit()

    model_type = args[1]

    # docs: the entire interview corpus
    print('Load data')
    # Train the model
    data = utils.to_sentence(scraping.scraping(10))
    docs = [row[1] for row in data]

    # max_characters: single sentences of XX or more characters are excluded from summarization
    # docs = utils.polish_docs(docs, max_characters=1000)
    docs_for_train = [stems(doc) for doc in docs]
    """
    The data being built looks like this:
    docs_for_train = [
    ['出身は', 'どこ', 'ですか' ...
    ['好き', 'な', '食べもの', ...
    ...
    ]
    """
    print(data[:3])
    print(docs[:1])
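
For the 'doc2vec' branch, gensim's Doc2Vec expects each token list wrapped in a TaggedDocument. A minimal sketch, again assuming gensim >= 4 with illustrative settings:

# Sketch: the doc2vec case needs TaggedDocument wrappers (gensim >= 4 API).
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tagged = [TaggedDocument(words=tokens, tags=[i])
          for i, tokens in enumerate(docs_for_train)]
model = Doc2Vec(tagged, vector_size=100, min_count=1)  # illustrative settings
vector = model.infer_vector(docs_for_train[0])         # embed one sentence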
Example #5
import datetime

import matplotlib.pyplot as plt

# NOTE: `utils`, `stems`, `load_model`, and `evaluation` are project-local
# helpers whose imports fall outside this snippet.
def main_segmentation(doc_num,
                      window_size,
                      model_type,
                      doc_type,
                      segmentation_type,
                      eval=False):
    # === Load doc ===
    print('')
    print('Interview:', doc_num)
    print('Load data')
    path = './data/interview/interview-text_01-26_' + doc_num + '.txt'

    data = utils.load_data(path)
    if doc_type == 'sentence':
        data = utils.to_sentence(data)

    docs = [row[1] for row in data]
    label = [row[0] for row in data]
    print(data[:5])
    print('Done')

    # === Model ===
    print('Model:', model_type)
    print('Segmentation type:', segmentation_type)
    model, segmentation_model = load_model(model_type, segmentation_type,
                                           [stems(doc) for doc in docs])

    # === Result ===
    print('Segmentation')
    res = segmentation_model.segment([stems(doc) for doc in docs])
    print('Done')
    # print(res)

    # Plot the similarity curve
    save_path = (f'./result/segmentation/{segmentation_type}/{model_type}/'
                 f'{doc_type}/img/doc_num_{doc_num}_{model_type}'
                 f'_window_size_{segmentation_model.window_size}'
                 f'_{datetime.date.today()}')

    plt.figure()
    plt.ylim([0, 1])
    segmentation_model.sim_arr.plot(title='Cosine similarity')
    plt.savefig(save_path + '.png')
    plt.close('all')

    # Segments
    # save_path = './result/segmentation/' + segmentation_type + '/' + model_type + '/' + doc_type + '/interview_text/' + 'doc_num_' + doc_num + '_' + model_type + '_window_size_' + str(segmentation_model.window_size) + '_' + str(datetime.date.today())
    # For lda
    save_path = './data/segmentation/' + doc_type + '/' + 'interview-text_' + doc_num
    with open(save_path + '.txt', 'w') as f:
        for i in range(len(docs)):
            print(label[i] + ' ' + docs[i].replace('\n', '。'), file=f)
            print('', file=f)
            # A boundary between utterance i and i+1 is stored at index i + 0.5
            if str(i + 0.5) in res.index.values:
                print("___________\n", file=f)

    # === Evaluation ===
    count, f_score = 0, 0
    label_for_eval = []
    if eval:
        print('=== Evaluation ===')
        count, label_for_eval, f_score = evaluation(res, segmentation_model,
                                                    segmentation_type,
                                                    model_type, doc_type,
                                                    doc_num)

    return count, res.index.values, label_for_eval, f_score
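
A note on the boundary convention above: a break between utterance i and i + 1 is recorded at index i + 0.5, and the membership test stringifies the value, which implies res carries a string index. A tiny self-contained illustration (pandas assumed; the index values here are made up):

# Illustration of the half-index boundary convention (hypothetical values).
import pandas as pd

res = pd.Series([0.12, 0.08], index=['2.5', '7.5'])  # breaks after rows 2 and 7
for i in range(10):
    if str(i + 0.5) in res.index.values:
        print(f'segment boundary between utterance {i} and {i + 1}')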