def tmp_replace_word(text, noun_list_dict, num=10):
    '''
    Run noun replacement on the same text ``num`` times and collect the outputs.

    Every attempt calls ``replace_noun_by_similar_word`` with a brand-new
    empty dict as its last argument; only the first element of the returned
    pair is kept.

    Args:
        text: Text handed to ``replace_noun_by_similar_word``.
        noun_list_dict: Passed through unchanged to
            ``replace_noun_by_similar_word``.
        num: Number of replacement attempts (default 10).

    Returns:
        list: The ``num`` replaced results, in generation order.
    '''
    tagger = util.create_tagger()

    def replace_once():
        # The used-word dict is not needed here, so a throwaway one is passed
        # and the second return value is discarded.
        replaced_text, _unused = replace_noun_by_similar_word(
            text, noun_list_dict, tagger, {})
        return replaced_text

    return [replace_once() for _ in range(num)]
def output_word_list(name_char_topn, noun_topn):
    '''
    Build the target word dictionaries and write them out as JSON.

    Loads the word2vec model from ``const.WORD2VEC_MODEL_PATH``, creates a
    tagger, calls ``crate_target_word_dict`` and writes the two dictionaries
    it returns to ``const.NAME_CHARCTER_LIST_FILE`` and
    ``const.SIMILAR_NOUN_LIST_FILE`` respectively.

    Args:
        name_char_topn: Similarity tuning value forwarded to
            ``crate_target_word_dict``.
        noun_topn: Similarity tuning value forwarded to
            ``crate_target_word_dict``.

    Returns:
        None. The results are only written to the JSON files.
    '''
    # Both topn arguments exist purely for tuning the similarity search.
    w2v_model = util.load_model(const.WORD2VEC_MODEL_PATH)
    word_tagger = util.create_tagger()

    dictionaries = crate_target_word_dict(
        w2v_model, word_tagger, name_char_topn, noun_topn)
    name_char_dict, noun_dict = dictionaries

    write_to_json(const.NAME_CHARCTER_LIST_FILE, name_char_dict)
    write_to_json(const.SIMILAR_NOUN_LIST_FILE, noun_dict)
def tmp(target, noun_topn=9):
    '''
    Return the similar-noun entries for the nouns found in ``target``.

    The text is run through the tagger, the parse output is split on
    newlines, ``create_noun_list`` picks the nouns out of those lines and
    ``create_similar_noun_dict`` produces the entries that are returned.

    NOTE(review): another function named ``tmp`` that takes no arguments is
    defined later in this file; at import time that later definition rebinds
    the name, so this one may be unreachable by name -- confirm which is
    intended.

    Args:
        target: Text to parse with the tagger.
        noun_topn: Forwarded to ``create_similar_noun_dict`` (default 9).

    Returns:
        dict: A new dict holding the result of ``create_similar_noun_dict``.
    '''
    w2v_model = util.load_model(const.WORD2VEC_MODEL_PATH)
    word_tagger = util.create_tagger()

    # One tagger output line per token.
    token_lines = word_tagger.parse(target).split('\n')
    nouns = create_noun_list(token_lines)

    return dict(
        create_similar_noun_dict(w2v_model, word_tagger, nouns, noun_topn))
def random_generate_ese_bungo_one():
    '''
    Pick one random quote and generate a single fake counterpart for it.

    An author is chosen at random from ``const.ORIGINAL_NOVEL_FILE``, then
    one of that author's novels, then one of that novel's quotes. The quote
    and the title are run through ``replace_noun_by_similar_word`` sharing a
    single used-word dict (quote first, then title), and a fake author name
    is produced with ``generate_fake_name``. Both the original and the
    generated triple are printed.

    Returns:
        tuple: ``(original, generated)`` where ``original`` has the keys
        ``author``, ``title``, ``quote`` and ``url`` and ``generated`` has
        the keys ``author``, ``title`` and ``quote``.
    '''
    tagger = util.create_tagger()
    source_dict = read_json_to_dict(const.ORIGINAL_NOVEL_FILE)
    noun_list_dict = read_json_to_dict(const.SIMILAR_NOUN_LIST_FILE)

    # Random draws: author -> novel -> quote.
    author_entries = list(source_dict.items())
    author_name, novel_list = random.choice(author_entries)
    novel = random.choice(novel_list)
    title = novel['title']
    quote = random.choice(novel['quotes'])

    # The same dict is threaded through both calls so the quote and the title
    # share their replacement state.
    shared_words = {}
    generated_quote, shared_words = replace_noun_by_similar_word(
        quote, noun_list_dict, tagger, shared_words)
    generated_title, shared_words = replace_noun_by_similar_word(
        title, noun_list_dict, tagger, shared_words)
    generated_name = generate_fake_name(author_name)

    print(author_name, title, quote)
    print('')
    print(generated_name, generated_title, generated_quote)
    print('---')

    original = dict(
        author=author_name,
        title=title,
        quote=quote,
        url=novel['url'],
    )
    generated = dict(
        author=generated_name,
        title=generated_title,
        quote=generated_quote,
    )
    return original, generated
def generate_fake_books_and_quotes_tmp(num=5):
    '''
    Generate ``num`` fake variants for every quote in the temporary source file.

    Reads the noun list from ``const.SIMILAR_NOUN_LIST_FILE_TMP`` and the
    source data from ``const.ORIGINAL_NOVEL_SOURCE_TMP``. For each author the
    name is printed under a separator line, then every quote of every entry
    in ``data['novels']`` is converted ``num`` times.

    Args:
        num: Number of variants generated per quote (default 5).

    Returns:
        list: One dict per generated variant with the keys ``fake_name``,
        ``fake_title`` and ``fake_text``. No author/book/quote id fields are
        included.
    '''
    tagger = util.create_tagger()
    noun_list_dict = read_json_to_dict(const.SIMILAR_NOUN_LIST_FILE_TMP)
    source_dict = read_json_to_dict(const.ORIGINAL_NOVEL_SOURCE_TMP)

    generated_rows = []
    for author_name, author_data in source_dict.items():
        print('-----------')
        print(author_name)
        for novel in author_data['novels']:
            title = novel['title']
            for quote_text in novel['quotes']:
                for _ in range(num):
                    # Within one work (title + quote) the same word should be
                    # converted the same way, so the used-word dict is shared
                    # between the two calls. When a word from the title also
                    # appears in the text (e.g. "Melos was enraged." in
                    # "Run, Melos!"), converting them independently would be
                    # less fun.
                    shared_words = {}
                    fake_quote_text, shared_words = replace_noun_by_similar_word(
                        quote_text, noun_list_dict, tagger, shared_words)
                    fake_title, shared_words = replace_noun_by_similar_word(
                        title, noun_list_dict, tagger, shared_words)
                    fake_name = generate_fake_name(author_name, is_tmp=True)
                    generated_rows.append({
                        'fake_name': fake_name,
                        'fake_title': fake_title,
                        'fake_text': fake_quote_text,
                    })
    return generated_rows
def decorate_with_logging(target_text_list, beautiful_word_list, topn):
    '''
    Decorate every target text in a loop and print a report of the results.

    Each text is passed to ``decorator.decorate`` together with the loaded
    model, tagger and the dict from ``decorator.get_pn_dict1()``. Three
    report sections are then printed:

    1. every non-empty ``result[3]`` (the "not decorated" leftovers),
    2. per beautiful word, the de-duplicated ``result[4]`` history entries
       whose first element equals that word, printed as
       ``"<entry[1]> to <entry[2]>"``,
    3. for every result, ``result[0]`` and ``result[1]`` separated by an
       arrow line.

    Args:
        target_text_list: Iterable of texts to decorate.
        beautiful_word_list: Words forwarded to ``decorator.decorate``; the
            first one is used as the label in the printed messages, so the
            list must not be empty.
        topn: Forwarded to ``decorator.decorate``.

    Returns:
        None. Everything is reported through ``print``.
    '''
    def print_banner(title):
        # Section heading framed by two lines of asterisks.
        print('***************')
        print(title)
        print('***************')

    main_word = beautiful_word_list[0]
    print(f'{main_word}化開始...')
    print('')

    pn_dict = decorator.get_pn_dict1()
    model = util.load_model(const.WORD2VEC_MODEL_PATH)
    tagger = util.create_tagger()

    all_result = [
        decorator.decorate(
            model, tagger, pn_dict, text, beautiful_word_list, topn)
        for text in target_text_list
    ]

    print(f'{main_word}化完了')
    print('')
    print('')

    # --- Section 1: anything that was left undecorated ---
    print_banner('変更結果もれCheck')
    leftovers = [entry[3] for entry in all_result if len(entry[3]) > 0]
    if leftovers:
        for leftover in leftovers:
            print(leftover)
    else:
        print(f'全て{main_word}化!')
    print('')

    # --- Section 2: which words each beautiful word was used for ---
    print_banner(f'使用{main_word}語チェック')
    for beautiful_word in beautiful_word_list:
        print('')
        print('------------------')
        print(f'{beautiful_word}によって{main_word}化された単語')
        print('------------------')

        # History entries are turned into tuples so they can go into a set
        # for de-duplication.
        matching = []
        for entry in all_result:
            for history in entry[4]:
                if history[0] == beautiful_word:
                    matching.append(tuple(history))
        unique_entries = list(set(matching))

        if not unique_entries:
            print('なし!')
            continue
        for item in unique_entries:
            print(f'{item[1]} to {item[2]}')
    print('')

    # --- Section 3: before/after for every text ---
    print_banner('変更結果Check')
    for entry in all_result:
        print('-----------')
        print(entry[0])
        print('↓')
        print(entry[1])
        print('')
def generate_fake_books_and_quotes(num=5, author_id=None):
    '''
    Convert every quote stored in the source data ``num`` times.

    Authors, books and quotes are read through a cursor obtained from
    ``get_connect()``. For each quote, ``num`` fake variants are generated
    by running the quote text and then the book title through
    ``replace_noun_by_similar_word`` and by calling ``generate_fake_name``
    on the author name.

    Args:
        num: Number of variants generated per quote (default 5).
        author_id: When ``None`` all authors are processed via
            ``get_all_authors``; otherwise the result of
            ``get_author(cur, author_id)`` is iterated.

    Returns:
        list: One dict per generated variant with the keys ``author_id``,
        ``fake_name``, ``book_id``, ``fake_title``, ``quote_id`` and
        ``fake_text``.
    '''
    tagger = util.create_tagger()
    noun_list_dict = read_json_to_dict(const.SIMILAR_NOUN_LIST_FILE)

    results = []

    # NOTE(review): whether leaving the ``with`` block closes the connection
    # depends on what get_connect() returns (a sqlite3 connection, for
    # example, only commits/rolls back) -- confirm.
    conn = get_connect()
    with conn:
        cur = conn.cursor()

        # Author
        if author_id is None:
            authors = get_all_authors(cur)
        else:
            authors = get_author(cur, author_id)
        print(authors)

        for author in authors:
            # A separate name keeps the ``author_id`` argument intact instead
            # of overwriting it with the row currently being processed.
            current_author_id = author[0]
            author_name = author[1]

            # Book
            books = get_all_books_by_author(cur, current_author_id)
            for book in books:
                book_id = book[0]
                title = book[1]

                # Quote
                quotes = get_all_quotes_by_book(cur, book_id)
                for quote in quotes:
                    quote_id = quote[0]
                    quote_text = quote[1]

                    for _ in range(num):
                        # Within one work (title + quote) the same word should
                        # be converted the same way, so used_word is shared by
                        # both calls. When a word from the title also appears
                        # in the text (e.g. "Melos was enraged." in
                        # "Run, Melos!"), converting them independently would
                        # be less fun.
                        used_word = {}
                        fake_quote_text, used_word = replace_noun_by_similar_word(
                            quote_text, noun_list_dict, tagger, used_word)
                        fake_title, used_word = replace_noun_by_similar_word(
                            title, noun_list_dict, tagger, used_word)

                        results.append({
                            'author_id': current_author_id,
                            'fake_name': generate_fake_name(author_name),
                            'book_id': book_id,
                            'fake_title': fake_title,
                            'quote_id': quote_id,
                            'fake_text': fake_quote_text,
                        })
    return results
def generate_ese_bungo_all(num=1, is_tmp=True):
    '''
    Convert every quote in the source data up to ``num`` times.

    Reads ``const.ORIGINAL_NOVEL_FILE`` and ``const.SIMILAR_NOUN_LIST_FILE``.
    For each quote the original ``[author, title, quote, url]`` row is
    recorded, then ``num`` conversions are attempted. A conversion whose
    generated quote was already produced for the same original quote is
    skipped. Each kept result and the final counts are printed.

    Args:
        num: Number of conversion attempts per quote (default 1).
        is_tmp: Not referenced anywhere in the body.

    Returns:
        tuple: ``(originals, fakes)``. ``originals`` is a list of
        ``[author_name, title, quote, url]``; ``fakes`` is a list of
        ``[generated_name, generated_title, generated_quote, index]`` where
        ``index`` is the position of the matching row in ``originals``.
    '''
    tagger = util.create_tagger()
    source_dict = read_json_to_dict(const.ORIGINAL_NOVEL_FILE)
    noun_list_dict = read_json_to_dict(const.SIMILAR_NOUN_LIST_FILE)

    originals = []
    fakes = []
    attempts = 0
    author_count = 0  # number of authors
    novel_count = 0   # number of works
    quote_count = 0   # number of quotes

    for author_name, novel_list in source_dict.items():
        author_count += 1
        for novel in novel_list:
            novel_count += 1
            title = novel['title']
            for quote in novel['quotes']:
                quote_count += 1

                # Index the row is about to occupy in ``originals``.
                original_idx = len(originals)
                originals.append([author_name, title, quote, novel['url']])

                seen_quotes = []
                for _ in range(num):
                    attempts += 1

                    # Within one work (title + quote) the same word should be
                    # converted the same way, so the used-word dict is shared.
                    # When a word from the title also appears in the text
                    # (e.g. "Melos was enraged." in "Run, Melos!"), converting
                    # them independently would be less fun.
                    shared_words = {}
                    generated_quote, shared_words = replace_noun_by_similar_word(
                        quote, noun_list_dict, tagger, shared_words)

                    # Skip identical quotes: seeing the same one over and over
                    # is boring (this is for the demo site).
                    if generated_quote in seen_quotes:
                        continue
                    seen_quotes.append(generated_quote)

                    generated_title, shared_words = replace_noun_by_similar_word(
                        title, noun_list_dict, tagger, shared_words)
                    generated_name = generate_fake_name(author_name)

                    print(author_name, title, quote)
                    print('')
                    print(generated_name, generated_title, generated_quote)
                    print('---')

                    fakes.append([
                        generated_name,
                        generated_title,
                        generated_quote,
                        original_idx,
                    ])

    print(f'author :{author_count}, novel :{novel_count}, quote :{quote_count}')
    print('loop_cnt:', attempts)
    print('total:', len(fakes))
    return originals, fakes
def tmp(): tagger = util.create_tagger() source_dict = read_json_to_dict(const.ORIGINAL_NOVEL_FILE_TMP) noun_list_dict = read_json_to_dict(const.SIMILAR_NOUN_LIST_FILE)