def main_segmentation(doc_num, window_size, model_type, doc_type, segmentation_type, eval=False):
    # === Load doc ===
    print('')
    print('Interview:', doc_num)
    print('Load data')
    path = './data/interview/interview-text_01-26_' + doc_num + '.txt'
    data = utils.load_data(path)
    if doc_type == 'sentence':
        data = utils.to_sentence(data)
    docs = [row[1] for row in data]
    label = [row[0] for row in data]
    print(data[:5])
    print('Done')

    # === Model ===
    print('Model:', model_type)
    print('Segmentation type:', segmentation_type)
    model, segmentation_model = load_model(model_type, segmentation_type)

    # === Result ===
    print('=== Result ===')
    res = segmentation_model.segment([stems(doc) for doc in docs])
    print(segmentation_model.sim_arr)
def lexrank(topic, level, node, original_docs, dir='./result/summary/hlda/'):
    n_words = 10
    with_weights = False

    # Flatten each document's (speaker, remark) pairs into one text block.
    docs = []
    for arr in original_docs:
        tmp_docs = []
        for speaker, remark in arr:
            tmp_docs.append(remark)
        docs.append('\n'.join(tmp_docs))

    tfidf = TfidfModel(no_below=0, no_above=1.0, keep_n=100000)
    docs_for_training = [stems(doc) for doc in docs]
    tfidf.train(docs_for_training)
    sent_vecs = tfidf.to_vector(docs_for_training)

    # Display
    print('=== Summary ===')

    # Summarize
    indexes = summarize(docs, sent_vecs, sort_type='normal', sent_limit=5, threshold=0.1)
    docs_summary = [original_docs[i] for i in indexes]

    # Build the output directory from the node's ancestors in the hLDA tree.
    path = []
    node_parent = node.parent
    while node_parent is not None:
        path.append(node_parent.node_id)
        node_parent = node_parent.parent
    path.reverse()
    for node_id in path:
        dir += '/topic_' + str(node_id)
    if not os.path.exists(dir):
        os.makedirs(dir)

    with open(dir + '/topic_' + str(topic) + '.txt', 'w') as f:
        node_parent = node.parent
        msg = 'topic=%d level=%d (documents=%d): ' % (
            node_parent.node_id, node_parent.level, node_parent.customers)
        msg += node_parent.get_top_words(n_words, with_weights)
        print(msg, file=f)

        msg = '    topic=%d level=%d (documents=%d): ' % (
            node.node_id, node.level, node.customers)
        msg += node.get_top_words(n_words, with_weights)
        print(msg, file=f)

        for node_child in node.children:
            msg = '        topic=%d level=%d (documents=%d): ' % (
                node_child.node_id, node_child.level, node_child.customers)
            msg += node_child.get_top_words(n_words, with_weights)
            print(msg, file=f)

        print('-------------------------------', file=f)
        for i, docs in enumerate(docs_summary):
            print('', file=f)
            print(str(i + 1) + ':', file=f)
            for speaker, remark in docs:
                print(speaker + ' ' + remark, file=f)
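
# For reference, a minimal sketch of what a LexRank-style summarize() could
# look like. The repo's actual summarize() is defined elsewhere, so the scoring
# below (degree centrality over a thresholded cosine-similarity graph, with
# dense vectors assumed) is an illustrative assumption, not the repo's code.
import numpy as np

def summarize_sketch(docs, sent_vecs, sent_limit=5, threshold=0.1):
    vecs = np.asarray(sent_vecs, dtype=float)
    norms = np.linalg.norm(vecs, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    sim = (vecs / norms) @ (vecs / norms).T  # pairwise cosine similarity
    adj = (sim > threshold).astype(float)    # keep edges above the threshold
    np.fill_diagonal(adj, 0.0)
    scores = adj.sum(axis=1)                 # degree centrality per sentence
    top = np.argsort(-scores)[:sent_limit]   # highest-scoring sentences
    return sorted(top.tolist())              # indexes in document order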
def lexrank(original_docs, topic, dictionary, dir='./result/lda/summary/'):
    # Flatten each document's (speaker, remark) pairs into one text block.
    docs = []
    for doc in original_docs:
        tmp_docs = []
        for speaker, remark in doc:
            tmp_docs.append(remark)
        docs.append('\n'.join(tmp_docs))

    # For training
    tfidf = TfidfModel(no_below=0, no_above=1.0, keep_n=100000)
    docs_for_training = [stems(doc) for doc in docs]
    tfidf.train(docs_for_training)
    sent_vecs = tfidf.to_vector(docs_for_training)

    # For the dictionary
    sw = stopwords()
    docs_for_dict = [stems(doc, polish=True, sw=sw) for doc in docs]
    corpus = list(map(dictionary.doc2bow, docs_for_dict))

    # Display
    print('=== Summary ===')

    # Summarize
    indexes = summarize(docs, sent_vecs, sort_type='normal', sent_limit=10, threshold=0.1)
    docs_summary = [original_docs[i] for i in indexes]
    probs_summary = [lda[corpus[i]] for i in indexes]  # lda: trained gensim LdaModel at module level

    if not os.path.exists(dir):
        os.makedirs(dir)
    with open(dir + '/topic_' + str(topic + 1) + '.txt', 'w') as f:
        for i, (docs, prob) in enumerate(zip(docs_summary, probs_summary), start=1):
            print("-" * 80, file=f)
            print(str(i) + ':', file=f)
            print([(t + 1, p) for t, p in prob], file=f)
            for speaker, remark in docs:
                print(speaker + ' ' + remark, file=f)
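
# lda above must be a trained gensim LdaModel visible at module level. A
# minimal, hypothetical training call compatible with the lda[corpus[i]]
# lookups in lexrank() would look like this (num_topics=9 mirrors topic_N used
# elsewhere in this repo, but is an assumption here):
import gensim

def train_lda_sketch(corpus, dictionary, num_topics=9):
    # lda[bow] then returns [(topic_id, probability), ...] for one document,
    # which is exactly the shape probs_summary stores per summary entry.
    return gensim.models.LdaModel(corpus=corpus, id2word=dictionary,
                                  num_topics=num_topics, random_state=0)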
        tmp_docs = []
    else:
        tmp_docs.extend([item[1][1]])
docs.append('\n'.join(tmp_docs))

if doc_num == 'all':
    doc_num = '26'
doc_num = '01_' + doc_num

# Params
no_below = 3
no_above = 0.8
keep_n = 100000
topic_N = 9

sw = stopwords()
docs_for_training = [stems(doc, polish=True, sw=sw) for doc in docs]

print('=== Building the corpus ===')
# tfidf
# tfidf = TfidfModel(no_below=no_below, no_above=no_above, keep_n=keep_n)
# tfidf.train(docs_for_training)
# dictionary = tfidf.dictionary
# corpus = tfidf.corpus
# corpus = tfidf.model[corpus]
dictionary = gensim.corpora.Dictionary(docs_for_training)
dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
corpus = list(map(dictionary.doc2bow, docs_for_training))
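
# Toy illustration of what the Dictionary/doc2bow pipeline above produces
# (hypothetical tokens; the real input is the stemmed interview text).
# filter_extremes(no_below=3, no_above=0.8, keep_n=100000) prunes tokens that
# appear in fewer than 3 documents or in more than 80% of them, keeping at
# most the 100000 most frequent of the rest.
def _dictionary_demo():
    toy_docs = [['rice', 'fish', 'rice'], ['fish', 'soup']]
    d = gensim.corpora.Dictionary(toy_docs)
    # doc2bow maps a token list to sparse (token_id, count) pairs:
    print(d.doc2bow(toy_docs[0]))  # e.g. [(0, 1), (1, 2)] for fish/rice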
print('Done')

# Summarization unit: sentence or utterance
# to sentence
if sum_type == 'sentence':
    data = utils.to_sentence(data)

# For summarization
docs = [row[1] for row in data]
print(docs[:1])

if model_type == 'tfidf':
    # Vectorize sentences with the gensim-based TF-IDF model
    tfidf = TfidfModel(no_below=10, no_above=0.1, keep_n=100000)
    tfidf.load_model()
    sent_vecs = tfidf.to_vector([stems(doc) for doc in docs])
elif model_type == 'doc2vec':
    # === Doc2Vec ===
    doc2vec = Doc2Vec(alpha=0.025, min_count=10, vector_size=300, epochs=50, workers=4)
    model_path = './model/doc2vec/doc2vec_' + str(doc2vec.vector_size) + '.model'
    doc2vec.load_model(model_path)
    sent_vecs = doc2vec.to_vector([stems(doc) for doc in docs])
else:
    print('Invalid model type')
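
# to_vector on the Doc2Vec wrapper above is this repo's own helper; the loop
# below is a sketch of what it presumably does with gensim's API
# (infer_vector re-estimates an embedding for each tokenized document):
from gensim.models.doc2vec import Doc2Vec as GensimDoc2Vec

def doc2vec_to_vector_sketch(model_path, tokenized_docs):
    model = GensimDoc2Vec.load(model_path)
    return [model.infer_vector(tokens) for tokens in tokenized_docs]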
step = 1
sw = stopwords()
# data_set = [stems(doc, polish=True, sw=sw) for doc in docs]
# docs_for_dict = data_set

print('=== Building the corpus ===')
if eval_type == 'perplexity':
    # Test set: hold out a random 25% of the documents.
    print(docs[:3])
    random.shuffle(docs)
    print(docs[:3])
    test_size = int(len(docs) * 0.25)
    docs_test = docs[:test_size]
    # docs_test = docs
    test_set = [stems(doc, polish=True, sw=sw) for doc in docs_test]

    # Dictionary
    # data_for_test_dict = [stems(doc, polish=True, sw=sw) for doc in utils.to_sentence_docs(docs_test)]
    data_for_test_dict = test_set
    test_dict = gensim.corpora.Dictionary(data_for_test_dict)
    test_dict.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    test_corpus = list(map(test_dict.doc2bow, test_set))

    # Train set: the remaining 75%.
    docs_train = docs[test_size:]
    # docs_train = docs
    train_set = [stems(doc, polish=True, sw=sw) for doc in docs_train]

    # Dictionary
    # data_for_train_dict = [stems(doc, polish=True, sw=sw) for doc in utils.to_sentence_docs(docs_train)]
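
# With the train/test split above, held-out perplexity for a gensim LDA model
# is conventionally computed as below. The fragment is cut off before the
# train-side dictionary and corpus are built, so train_corpus and train_dict
# are assumed names mirroring test_corpus/test_dict:
import numpy as np

def perplexity_sketch(train_corpus, train_dict, test_corpus, num_topics):
    lda = gensim.models.LdaModel(corpus=train_corpus, id2word=train_dict,
                                 num_topics=num_topics)
    # log_perplexity returns the per-word likelihood bound (log base 2);
    # perplexity is 2 raised to its negation, lower being better.
    return np.exp2(-lda.log_perplexity(test_corpus))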
    update = True
else:
    print('Arguments are too short')
    exit()
model_type = args[1]

# docs: the entire set of interviews
print('Load data')
# Train the model
data = utils.to_sentence(scraping.scraping(10))
docs = [row[1] for row in data]
# max_characters: sentences longer than XX characters are excluded from summarization
# docs = utils.polish_docs(docs, max_characters=1000)
docs_for_train = [stems(doc) for doc in docs]
"""
The training data looks like this:
docs_for_train = [
    ['出身は', 'どこ', 'ですか', ...],
    ['好き', 'な', '食べもの', ...],
    ...
]
"""
print(data[:3])
print(docs[:1])
print(docs_for_train[:1])
print('Done')

if model_type == 'tfidf':
    # Build the TF-IDF model
def main_segmentation(doc_num, window_size, model_type, doc_type, segmentation_type, eval=False):
    # === Load doc ===
    print('')
    print('Interview:', doc_num)
    print('Load data')
    path = './data/interview/interview-text_01-26_' + doc_num + '.txt'
    data = utils.load_data(path)
    if doc_type == 'sentence':
        data = utils.to_sentence(data)
    docs = [row[1] for row in data]
    label = [row[0] for row in data]
    print(data[:5])
    print('Done')

    # === Model ===
    print('Model:', model_type)
    print('Segmentation type:', segmentation_type)
    model, segmentation_model = load_model(model_type, segmentation_type, [stems(doc) for doc in docs])

    # === Result ===
    print('Segmentation')
    res = segmentation_model.segment([stems(doc) for doc in docs])
    print('Done')
    # print(res)

    # Plot
    save_path = './result/segmentation/' + segmentation_type + '/' + model_type + '/' + doc_type \
        + '/img/' + 'doc_num_' + doc_num + '_' + model_type + '_window_size_' \
        + str(segmentation_model.window_size) + '_' + str(datetime.date.today())
    fig = plt.figure()
    plt.ylim([0, 1])
    segmentation_model.sim_arr.plot(title='Cosine similarity')
    plt.savefig(save_path + '.png')
    plt.close('all')

    # Segments
    # save_path = './result/segmentation/' + segmentation_type + '/' + model_type + '/' + doc_type + '/interview_text/' + 'doc_num_' + doc_num + '_' + model_type + '_window_size_' + str(segmentation_model.window_size) + '_' + str(datetime.date.today())
    # For lda
    save_path = './data/segmentation/' + doc_type + '/' + 'interview-text_' + doc_num
    with open(save_path + '.txt', 'w') as f:
        for i in range(len(docs)):
            print(label[i] + ' ' + docs[i].replace('\n', '。'), file=f)
            print('', file=f)
            # Boundaries are keyed at the half-index between two sentences.
            if str(i + 0.5) in res.index.values:
                print("___________\n", file=f)

    # === Evaluation ===
    count, f_score = 0, 0
    label_for_eval = []
    if eval:
        print('=== Evaluation ===')
        count, label_for_eval, f_score = evaluation(res, segmentation_model, segmentation_type, model_type, doc_type, doc_num)

    return count, res.index.values, label_for_eval, f_score
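
# sim_arr above presumably holds the cosine similarity between the windows of
# sentences on either side of each candidate boundary, keyed at half-indices
# (hence the str(i + 0.5) lookups). A self-contained sketch of that core
# TextTiling computation, assuming dense sentence vectors:
import numpy as np

def boundary_similarities(sent_vecs, window_size=2):
    vecs = np.asarray(sent_vecs, dtype=float)
    sims = {}
    for i in range(window_size, len(vecs) - window_size + 1):
        left = vecs[i - window_size:i].sum(axis=0)   # window before the gap
        right = vecs[i:i + window_size].sum(axis=0)  # window after the gap
        denom = np.linalg.norm(left) * np.linalg.norm(right)
        sims[i - 0.5] = float(left @ right / denom) if denom else 0.0
    return sims  # low similarity at a gap suggests a segment boundary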
print('Load data')
path = './data/test.txt'
# path = './data/interview-text_01-26_all.txt'
data = utils.load_data(path)
data = utils.to_sentence(data)
docs = [row[1] for row in data]
print('Done')

# If we are training the model
if train:
    print('=== Building the TF-IDF model ===')
    print('Train')
    # docs
    # for train
    print(docs[:1])
    tfidf.train([stems(doc) for doc in docs])
    print('Done')

# Summarization unit: sentence or utterance
print(docs[:1])

# Vectorize sentences with the gensim-based TF-IDF model
sent_vecs = tfidf.to_vector([stems(doc) for doc in docs])

# print('=== Segmentation ===')
# TODO
text_tiling = TextTiling(sent_vecs)

# with open('./result/segmentation/tfidf/' + str(datetime.date.today()) + '.txt', 'w') as f:
#     print("no_below: " + str(no_below) + ", no_above: " + str(no_above) + ", keep_n: " + str(keep_n) + ", threshold: " + str(threshold), file=f)
#     for i, docs in enumerate(docs_summary):
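
# The TfidfModel used throughout these scripts is this repo's own wrapper; the
# class below is a minimal sketch of the gensim calls such a wrapper plausibly
# makes for train/to_vector (the method names mirror the wrapper's API, the
# bodies are an assumption):
import gensim

class TfidfSketch:
    def train(self, tokenized_docs):
        self.dictionary = gensim.corpora.Dictionary(tokenized_docs)
        corpus = [self.dictionary.doc2bow(doc) for doc in tokenized_docs]
        self.model = gensim.models.TfidfModel(corpus)

    def to_vector(self, tokenized_docs):
        # Returns one sparse [(token_id, weight), ...] vector per document.
        return [self.model[self.dictionary.doc2bow(doc)] for doc in tokenized_docs]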