def main_segmentation(doc_num, window_size, model_type, doc_type, segmentation_type, eval=False):
    """Run text segmentation over one interview transcript and print the result.

    Loads the interview text for ``doc_num``, optionally splits it into
    sentences, builds the model/segmenter pair, segments the stemmed
    documents, and prints the segmenter's cosine-similarity array.

    Args:
        doc_num: Interview number (string), interpolated into the data path.
        window_size: Accepted for call compatibility; not used in this variant.
        model_type: Identifier forwarded to ``load_model``.
        doc_type: ``'sentence'`` triggers sentence splitting via
            ``utils.to_sentence``; other values use the raw rows as loaded.
        segmentation_type: Identifier forwarded to ``load_model``.
        eval: Not used in this variant.  NOTE(review): shadows the ``eval``
            builtin — consider renaming at the next interface change.
    """
    # NOTE(review): this file was recovered from a single collapsed line; the
    # statement grouping below is the most plausible reconstruction — confirm
    # against version control before relying on the exact indentation.
    # === Load doc ===
    print('')
    print('Interview:', doc_num)
    print('Load data')
    path = './data/interview/interview-text_01-26_' + doc_num + '.txt'
    data = utils.load_data(path)
    if doc_type == 'sentence':
        data = utils.to_sentence(data)
    # Each row appears to be (label, text); split into parallel lists.
    docs = [row[1] for row in data]
    label = [row[0] for row in data]  # NOTE(review): assigned but unused here
    print(data[:5])
    print('Done')
    # === Model ===
    print('Model:', model_type)
    print('Segmentation type:', segmentation_type)
    # NOTE(review): `model` is unpacked but not used in this variant.
    model, segmentation_model = load_model(model_type, segmentation_type)
    # === Result ===
    print('===結果===')
    res = segmentation_model.segment([stems(doc) for doc in docs])
    print(segmentation_model.sim_arr)
or args[1] == 'utterance' or args[1] == 'segmentation/ans'): print('Argument is invalid') exit() else: print('Arguments are too sort') exit() doc_type = args[1] doc_num = 'all' path = './data/interview/interview-text_01-26_' + doc_num + '.txt' if doc_type == 'sentence': data = utils.load_data(path) # to sentence data = utils.to_sentence(data) docs = [row[1] for row in data] if doc_type == 'utterance': data = utils.load_data(path) docs = [row[1] for row in data] elif doc_type == 'segmentation' or doc_type == 'segmentation/ans': ans = False if doc_type == 'segmentation/ans': ans = True if doc_num == 'all': doc_num = '26' data_arr = [] for num in range(int(doc_num)): num += 1
if not(args[1] == 'tfidf' or args[1] == 'doc2vec' or args[1] == 'word2vec'): print('Argument is invalid') exit() if args[-1] == 'update': update = True else: print('Arguments are too sort') exit() model_type = args[1] # docs: インタビュー全体 print('Load data') # モデルを訓練する path = './data/interview/interview-text_01-26_all.txt' data = utils.to_sentence(utils.load_data(path)) docs = [row[1] for row in data] # max_characters: XX文字以上の単文は要約対象外 # docs = utils.polish_docs(docs, max_characters=1000) sw = stopwords() docs_for_train = [stems(doc, polish=True, sw=sw) for doc in docs] print(docs_for_train[:10]) sum = 0 for arr in docs_for_train: sum += len(arr) print(sum) """ 以下のようなデータを作っています edocs_for_train = [ ['出身は', 'どこ', 'ですか' ...
# --- Command-line validation and model training (scraped corpus variant) ----
# NOTE(review): recovered from a single collapsed line; indentation is a
# reconstruction.  `args` and `update` are defined earlier in the file.
if not (args[1] == 'tfidf' or args[1] == 'doc2vec' or args[1] == 'word2vec'):
    print('Argument is invalid')
    exit()
if args[-1] == 'update':
    update = True
else:
    # NOTE(review): "sort" looks like a typo for "short", and exiting whenever
    # the last argument is not 'update' may be unintended — confirm intent.
    print('Arguments are too sort')
    exit()
model_type = args[1]

# docs: the whole interview corpus
print('Load data')
# Train the model — here the corpus comes from scraping.scraping(10)
# (presumably freshly scraped pages — confirm against the scraping module),
# then gets sentence-split.
data = utils.to_sentence(scraping.scraping(10))
docs = [row[1] for row in data]
# max_characters: single sentences over XX characters are excluded from
# summarization (currently disabled).
# docs = utils.polish_docs(docs, max_characters=1000)
docs_for_train = [stems(doc) for doc in docs]
"""
以下のようなデータを作っています
edocs_for_train = [
['出身は', 'どこ', 'ですか' ...
['好き', 'な', '食べもの', ...
...
]
"""
print(data[:3])
print(docs[:1])
def main_segmentation(doc_num, window_size, model_type, doc_type, segmentation_type, eval=False):
    """Segment one interview, save a similarity plot and the segmented text.

    Pipeline: load the transcript for ``doc_num`` → optionally sentence-split
    → build model + segmenter → segment → save a cosine-similarity PNG and a
    boundary-marked text file → optionally evaluate.

    Args:
        doc_num: Interview number (string); used in input and output paths.
        window_size: Accepted for call compatibility; the output file names
            use ``segmentation_model.window_size`` instead.
        model_type: Model identifier forwarded to ``load_model``.
        doc_type: ``'sentence'`` splits utterances into sentences first; also
            selects the output sub-directory.
        segmentation_type: Segmenter identifier forwarded to ``load_model``.
        eval: When True, run ``evaluation`` and include its scores in the
            return value.  NOTE(review): shadows the ``eval`` builtin.

    Returns:
        Tuple ``(count, res.index.values, label_for_eval, f_score)``;
        ``count``/``f_score`` stay 0 and ``label_for_eval`` stays empty
        unless ``eval`` is True.
    """
    # NOTE(review): recovered from a single collapsed line; indentation is the
    # most plausible reconstruction — verify against version control.
    # === Load doc ===
    print('')
    print('Interview:', doc_num)
    print('Load data')
    path = './data/interview/interview-text_01-26_' + doc_num + '.txt'
    data = utils.load_data(path)
    if doc_type == 'sentence':
        data = utils.to_sentence(data)
    # Each row appears to be (label, text); split into parallel lists.
    docs = [row[1] for row in data]
    label = [row[0] for row in data]
    print(data[:5])
    print('Done')
    # === Model ===
    print('Model:', model_type)
    print('Segmentation type:', segmentation_type)
    # Unlike the sibling variant, the stemmed docs are passed to load_model —
    # presumably for fitting/training; confirm in load_model.
    model, segmentation_model = load_model(model_type, segmentation_type, [stems(doc) for doc in docs])
    # === Result ===
    print('Segmentation')
    res = segmentation_model.segment([stems(doc) for doc in docs])
    print('Done')
    # print(res)
    # Image: plot the cosine-similarity series and save it as a PNG.
    save_path = './result/segmentation/' + segmentation_type + '/' + model_type + '/' + doc_type + '/img/' + 'doc_num_' + doc_num + '_' + model_type + '_window_size_' + str(
        segmentation_model.window_size) + '_' + str(datetime.date.today())
    fig = plt.figure()  # NOTE(review): `fig` is unused; the implicit pyplot state is used below
    plt.ylim([0, 1])
    segmentation_model.sim_arr.plot(title='Cosine similarity')
    plt.savefig(save_path + '.png')
    plt.close('all')
    # Segments
    # save_path = './result/segmentation/' + segmentation_type + '/' + model_type + '/' + doc_type + '/interview_text/' + 'doc_num_' + doc_num + '_' + model_type + '_window_size_' + str(segmentation_model.window_size) + '_' + str(datetime.date.today())
    # For lda: write the text with "___________" markers at segment boundaries.
    save_path = './data/segmentation/' + doc_type + '/' + 'interview-text_' + doc_num
    with open(save_path + '.txt', 'w') as f:
        for i in range(len(docs)):
            print(label[i] + ' ' + docs[i].replace('\n', '。'), file=f)
            print('', file=f)
            # Boundary check at the midpoint index (i + 0.5) — appears to
            # match the segmenter's index convention; confirm in its segment().
            if str(i + 0.5) in res.index.values:
                print("___________\n", file=f)
    # === Evaluation ===
    count, f_score = 0, 0
    label_for_eval = []
    if eval:
        print('===評価===')
        count, label_for_eval, f_score = evaluation(res, segmentation_model, segmentation_type, model_type, doc_type, doc_num)
    return count, res.index.values, label_for_eval, f_score