import json
import os
import re
import time

from tqdm import tqdm

# `uu` (the project's utility helpers: load_text_file, get_custom_logger, get_base_path)
# is assumed to be imported elsewhere in this module.


def get_info_of_sentences(sentences_fn, sentences_num):
    if sentences_num < 1:
        print("ERROR: sentences_num MUST be at least 1.")
        return
    print("Start reading file...")
    sentences = uu.load_text_file(sentences_fn)[:sentences_num]
    single_word_sentences = 0
    total_words = []
    print("Collecting words from sentences...")
    for s in sentences:
        words = s.strip().split(' ')
        if len(words) == 1:
            single_word_sentences += 1
        total_words.extend(words)
    total_words_num = len(total_words)
    unique_words_num = len(set(total_words))
    logger = uu.get_custom_logger(
        'sentences_info',
        os.path.join(uu.get_base_path(), 'logs/sentences_info.log'))
    logger.info(f'{sentences_num} sentences INFO:')
    logger.info('Total words: %d | Unique words: %d (%.2f%% of total)' %
                (total_words_num, unique_words_num,
                 unique_words_num / total_words_num * 100))
    logger.info('Words per sentence: %.2f' % (total_words_num / sentences_num))
    logger.info('Single-word sentences: %d (%.2f%% of total)' %
                (single_word_sentences,
                 single_word_sentences / sentences_num * 100))
    logger.info("=" * 50)
def make_omitted_sentences(sentences_fn, output_fn, sentences_num, min_count):
    if sentences_num < 1:
        print("ERROR: sentences_num MUST be at least 1.")
        return
    print("Start reading file...")
    sentences = uu.load_text_file(sentences_fn)[:sentences_num]
    print("Counting words in sentences...")
    word_counts = {}
    for s in tqdm(sentences):
        words = s.strip().split(' ')
        for w in words:
            word_counts[w] = word_counts.get(w, 0) + 1
    print("Collecting frequent words...")
    # Use a set so the per-word membership test below is O(1).
    frequent_words = set()
    for w, count in tqdm(word_counts.items()):
        if count >= min_count:
            frequent_words.add(w)
    logger = uu.get_custom_logger(
        'info_omitted', os.path.join(uu.get_base_path(), 'logs/omit.log'))
    logger.info("Omitting ~%d Sentences with min_count %d" % (sentences_num, min_count))
    frequent_len = len(frequent_words)
    total_len = len(word_counts)
    logger.info("Survived Vocabs: %d of Total %d (%.2f%%)" %
                (frequent_len, total_len, frequent_len / total_len * 100))
    print("Writing results...")
    total_words_len = 0
    omitted_words_len = 0
    with open(output_fn, 'w') as writefile:
        for s in tqdm(sentences):
            words = s.strip().split(' ')
            omitted_words = []
            for idx, w in enumerate(words):
                if w not in frequent_words:
                    words[idx] = '()'
                    omitted_words.append(w)
            omitted_words_len += len(omitted_words)
            # Count every word here; omitted words are subtracted once at the end.
            total_words_len += len(words)
            writefile.write("%s [%s]" % (' '.join(words), ', '.join(omitted_words)) + os.linesep)
    frequent_words_len = total_words_len - omitted_words_len
    logger.info("Survived Words: %d of Total %d (%.2f%%)" %
                (frequent_words_len, total_words_len,
                 frequent_words_len / total_words_len * 100))
    logger.info("-" * 50)
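# Illustration of the line format make_omitted_sentences writes (sentence and min_count are
# hypothetical): words below min_count are replaced with '()' in place and listed in brackets.
#   input sentence : "the zebra runs"
#   output line    : "the () runs [zebra]"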
def count_sentences_in_jsons(direc, log_file):
    logger = uu.get_custom_logger('count_sentences', log_file)
    logger.info(f"Count sentences in {direc} directory...")
    total_lines = 0
    for f in os.listdir(direc):
        if f.endswith('.json'):
            with open(os.path.join(direc, f), 'r') as readfile:
                dialogs = json.load(readfile)
                for d in dialogs:
                    for l in d['lines']:
                        if len(l['message']) > 0:
                            total_lines += 1
    logger.info(f"Total sentences in {direc}: {total_lines}")
def convert_script_json_txt(direc, output_fn, log_fn):
    logger = uu.get_custom_logger('convert_json_txt', log_fn)
    start_time = time.time()
    with open(output_fn, 'w') as writefile:
        for f in tqdm(os.listdir(direc)):
            if f.endswith('.json'):
                with open(os.path.join(direc, f), 'r') as readfile:
                    dialogs = json.load(readfile)
                for d in dialogs:
                    for l in d['lines']:
                        content = f"[[{l['speaker']}]] {l['message']}"
                        writefile.write(content + os.linesep)
                    # blank line between dialogs
                    writefile.write(os.linesep)
    end_time = time.time() - start_time
    logger.info("Finished converting script JSON to a txt file: %.2f sec" % end_time)
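# Expected shape of the script JSON files consumed by the two functions above, inferred from the
# keys they access (the speaker names and messages here are hypothetical):
# [
#   {"lines": [{"speaker": "A", "message": "Hello."},
#              {"speaker": "B", "message": "Hi, how are you?"}]},
#   ...
# ]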
def split_sentences_in_txt(input_fn, output_fn, log_fn):
    ELLIPSIS_RE = re.compile(r'\.\.+|…')
    IN_BRACKETS_RE = re.compile(r'\(.*?\)')  # Limitation: cannot handle nested brackets like '(a(b)c)'
    logger = uu.get_custom_logger('toolbox', log_fn)
    sentences = uu.load_text_file(input_fn)
    results = []
    logger.info('Split sentences...')
    for s in tqdm(sentences):
        s = s.strip()
        if len(s) == 0 or not s.startswith('[['):
            results.append('')
        else:
            if ' ' not in s:
                continue
            result = []
            speaker = s.split(' ')[0]
            replaced_s = IN_BRACKETS_RE.sub('', ' '.join(s.split(' ')[1:]))
            replaced_s = ELLIPSIS_RE.sub(' ', replaced_s)
            splited_s = ''
            for w in replaced_s.strip():
                if w == '.':
                    if len(splited_s) > 0:
                        result.append(speaker + ' ' + splited_s)
                        splited_s = ''
                elif w == '!' or w == '?':
                    result.append(speaker + ' ' + splited_s + w)
                    splited_s = ''
                else:
                    splited_s += w
            if len(splited_s) > 0:
                result.append(speaker + ' ' + splited_s)
            results.extend(result)
    logger.info('Save results...')
    with open(output_fn, 'w') as writefile:
        for r in tqdm(results):
            writefile.write(r + os.linesep)
    logger.info(f'Done - {len(sentences)} sentences => {len(results)} sentences')
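# Note on the IN_BRACKETS_RE limitation flagged above: the non-greedy pattern stops at the first ')',
# so nested brackets leave residue. For example (hypothetical input):
#   re.compile(r'\(.*?\)').sub('', '[[A]] hello (a(b)c) world')  ->  '[[A]] hello c) world'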
def adjust_sentence_len(sentences_fn, output_fn, sentence_len=16):
    sentences = uu.load_text_file(sentences_fn)
    new_sentences = []
    for s in tqdm(sentences):
        words = s.split(' ')
        if len(words) > sentence_len:
            words = words[:sentence_len]
        elif len(words) < sentence_len:
            words = ['0'] * (sentence_len - len(words)) + words
        new_sentences.append(' '.join(words))
    with open(output_fn, 'w') as writefile:
        writefile.write(os.linesep.join(new_sentences))


## Main
logger = uu.get_custom_logger('toolbox', log_file)

# create_word_list(input_file, output_file)
# create_skip_grams(words_file, sentences_file, output_file, log_per=log_per)
# get_info_of_sentences(sentences_file, 500000)
# make_omitted_sentences(sentences_file, '../results/reduce3-omitted-500000-5.txt', 500000, 5)
# convert_script_json_txt(directory, output_file, log_file)
# count_sentences_in_jsons(directory, log_file)
# split_sentences_in_txt(input_file, output_file, log_file)
# get_word_frequency_json(input_file, output_file, 50000)
# show_counts_log_scale(output_file)
# select_with_word_txt(output_file, split_output_file, '네', logger)
# remove_less_frequent_words(input_file, words_file, output_file, logger)
# adjust_sentence_len(input_file, output_file)
    # Divide into train & test set
    # (tail of create_vrm_dataset; see the call under "## Main" below)
    positive_train_len = int(len(X_positive_t) * train_ratio)
    negative_train_len = int(len(X_negative_t) * train_ratio)
    X_train_t = np.array(X_positive_t[:positive_train_len] +
                         X_negative_t[:negative_train_len])
    X_test_t = np.array(X_positive_t[positive_train_len:] +
                        X_negative_t[negative_train_len:])
    Y_train_t = np.array([[1]] * positive_train_len + [[0]] * negative_train_len)
    # Use the same lists as X_test_t so the label counts line up.
    Y_test_t = np.array([[1]] * (len(X_positive_t) - positive_train_len) +
                        [[0]] * (len(X_negative_t) - negative_train_len))
    print(f"Data: X_train - {X_train_t.shape}, X_test - {X_test_t.shape}")
    print(f"Data: Y_train - {Y_train_t.shape}, Y_test - {Y_test_t.shape}")
    return X_train_t, X_test_t, Y_train_t, Y_test_t


## Main
logger = uu.get_custom_logger('keras_lstm', LOG_PATH)
X_train_t, X_test_t, Y_train_t, Y_test_t = create_vrm_dataset(INPUT_PATH, EMBEDDING_PATH, logger)

# Create LSTM Model
# (assumes the usual Keras imports: from keras.models import Sequential;
#  from keras.layers import LSTM, Dropout, Dense; from keras import backend as K)
K.clear_session()
model = Sequential()
model.add(LSTM(128, input_shape=(13, 300)))  # sequences of 13 steps, 300-dim embeddings per step
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))    # binary output
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Model Fitting
eval_iter = evaluateIter()