Example #1
def get_info_of_sentences(sentences_fn, sentences_num):
    if sentences_num < 1:
        print("ERROR: sentences_num MUST be more than 1.")
        return

    print("Start to read file...")
    sentences = uu.load_text_file(sentences_fn)[:sentences_num]
    single_word_sentence = 0
    total_words = []

    print("Get words from sentences...")
    for s in sentences:
        words = s.strip().split(' ')
        if len(words) == 1:
            single_word_sentence += 1
        total_words.extend(words)

    total_words_num = len(total_words)
    unique_words_num = len(list(set(total_words)))

    logger = uu.get_custom_logger(
        'sentences_info',
        os.path.join(uu.get_base_path(), 'logs/sentences_info.log'))
    logger.info(f'{sentences_num} sentences INFO:')
    logger.info('Total words: %d | Unique words: %d (%.2f%% of total)' %
                (total_words_num, unique_words_num,
                 unique_words_num / total_words_num * 100))
    logger.info('Words per sentence: %.2f' %
                (total_words_num / sentences_num))
    logger.info(
        "Single-word sentences: %d (%.2f%% of total)" %
        (single_word_sentence, single_word_sentence / sentences_num * 100))
    logger.info("=" * 50)
Example #2
def make_omitted_sentences(sentences_fn, output_fn, sentences_num, min_count):
    if sentences_num < 1:
        print("ERROR: sentences_num MUST be more than 1.")
        return

    print("Start to read file...")
    sentences = uu.load_text_file(sentences_fn)[:sentences_num]

    print("Get word_counts from sentences...")
    word_counts = {}
    for s in tqdm(sentences):
        words = s.strip().split(' ')
        for w in words:
            word_counts[w] = word_counts.get(w, 0) + 1

    print("Get frequent words list...")
    frequent_words = set()  # a set keeps the membership checks below O(1)
    for k in tqdm(word_counts.keys()):
        if word_counts[k] >= min_count:
            frequent_words.add(k)
    logger = uu.get_custom_logger(
        'info_omitted', os.path.join(uu.get_base_path(), 'logs/omit.log'))
    logger.info("Omitting ~%d Sentences with min_count %d" %
                (sentences_num, min_count))
    frequent_len = len(frequent_words)
    total_len = len(word_counts)
    logger.info("Survived Vocabs: %d of Total %d (%.2f%%)" %
                (frequent_len, total_len, frequent_len / total_len * 100))

    print("Write results...")
    total_words_len = 0
    omitted_words_len = 0
    with open(output_fn, 'w') as writefile:
        for s in tqdm(sentences):
            words = s.strip().split(' ')
            omitted_words = []
            for idx, w in enumerate(words):
                if w not in frequent_words:
                    words[idx] = '()'
                    omitted_words.append(w)
            omitted_words_len += len(omitted_words)
            total_words_len += len(words)  # total includes omitted words; they are tracked separately
            writefile.write("%s [%s]" %
                            (' '.join(words), ', '.join(omitted_words)) +
                            os.linesep)
    frequent_words_len = total_words_len - omitted_words_len
    logger.info("Survived Words: %d of Total %d (%.2f%%)" %
                (frequent_words_len, total_words_len,
                 frequent_words_len / total_words_len * 100))
    logger.info("-" * 50)
Example #3
def count_sentences_in_jsons(direc, log_file):
    logger = uu.get_custom_logger('count_sentences', log_file)
    logger.info(f"Count sentences in {direc} directory...")
    total_lines = 0
    for f in os.listdir(direc):
        if (f.endswith('.json')):
            with open(os.path.join(direc, f), 'r') as readfile:
                dialogs = json.load(readfile)
                for d in dialogs:
                    for l in d['lines']:
                        if (len(l['message']) > 0):
                            total_lines += 1
    logger.info(f"Total sentences in {direc}: {total_lines}")
Example #4
def convert_script_json_txt(direc, output_fn, log_fn):
    logger = uu.get_custom_logger('convert_json_txt', log_fn)
    start_time = time.time()
    with open(output_fn, 'w') as writefile:
        for f in tqdm(os.listdir(direc)):
            if (f.endswith('.json')):
                with open(os.path.join(direc, f), 'r') as readfile:
                    dialogs = json.load(readfile)
                    for d in dialogs:
                        for l in d['lines']:
                            content = f"[[{l['speaker']}]] {l['message']}"
                            writefile.write(content + os.linesep)
                    writefile.write(os.linesep)

    elapsed = time.time() - start_time
    logger.info("Finished converting script json to txt file: %.2f sec" % elapsed)
Example #5
def split_sentences_in_txt(input_fn, output_fn, log_fn):
    ELLIPSIS_RE = re.compile(r'\.\.+|…')
    # Limitation: does not match nested brackets like '(a(b)c)'
    IN_BRACKETS_RE = re.compile(r'\(.*?\)')
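    # Illustration of the two regexes on a hypothetical fragment:
    #   IN_BRACKETS_RE.sub('', 'Sure (smiling) thing...')  ->  'Sure  thing...'
    #   ELLIPSIS_RE.sub(' ', 'Sure  thing...')             ->  'Sure  thing '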

    logger = uu.get_custom_logger('toolbox', log_fn)
    sentences = uu.load_text_file(input_fn)
    results = []
    logger.info('Split sentences...')
    for s in tqdm(sentences):
        s = s.strip()
        if not s or not s.startswith('[['):
            results.append('')
        else:
            if ' ' not in s:
                continue
            result = []
            speaker = s.split(' ')[0]
            replaced_s = IN_BRACKETS_RE.sub('', ' '.join(s.split(' ')[1:]))
            replaced_s = ELLIPSIS_RE.sub(' ', replaced_s)
            splited_s = ''
            for ch in replaced_s.strip():  # walk the text character by character
                if ch == '.':
                    if len(splited_s) > 0:
                        result.append(speaker + ' ' + splited_s)
                        splited_s = ''
                elif ch == '!' or ch == '?':
                    result.append(speaker + ' ' + splited_s + ch)
                    splited_s = ''
                else:
                    splited_s += ch
            if len(splited_s) > 0:
                result.append(speaker + ' ' + splited_s)
            results.extend(result)

    logger.info('Save results...')
    with open(output_fn, 'w') as writefile:
        for r in tqdm(results):
            writefile.write(r + os.linesep)
    logger.info(
        f'Done - {len(sentences)} sentences => {len(results)} sentences')
Example #6

def adjust_sentence_len(sentences_fn, output_fn, sentence_len=16):
    sentences = uu.load_text_file(sentences_fn)
    new_sentences = []
    for s in tqdm(sentences):
        words = s.split(' ')
        if len(words) > sentence_len:
            words = words[:sentence_len]
        elif len(words) < sentence_len:
            words = ['0'] * (sentence_len - len(words)) + words
        new_sentences.append(' '.join(words))
    with open(output_fn, 'w') as writefile:
        writefile.write(os.linesep.join(new_sentences))
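A quick illustration of the padding/truncation rule above (inputs are hypothetical):

# With sentence_len=4:
#   'a b c d e f'  ->  'a b c d'    (truncated to the first 4 words)
#   'a b'          ->  '0 0 a b'    (left-padded with '0' tokens)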


## Main
logger = uu.get_custom_logger('toolbox', log_file)

# create_word_list(input_file, output_file)
# create_skip_grams(words_file, sentences_file, output_file, log_per=log_per)
# get_info_of_sentences(sentences_file, 500000)
# make_omitted_sentences(sentences_file, '../results/reduce3-omitted-500000-5.txt', 500000, 5)
# convert_script_json_txt(directory, output_file, log_file)
# count_sentences_in_jsons(directory, log_file)
# split_sentences_in_txt(input_file, output_file, log_file)
# get_word_frequency_json(input_file, output_file, 50000)
# show_counts_log_scale(output_file)
# select_with_word_txt(output_file, split_output_file, '네', logger)
# remove_less_frequent_words(input_file, words_file, output_file, logger)
# adjust_sentence_len(input_file, output_file)
Example #7
  # Divide into train & test set
  positive_train_len = int(len(X_positive_t) * train_ratio)
  negative_train_len = int(len(X_negative_t) * train_ratio)
  X_train_t = np.array(X_positive_t[:positive_train_len] + X_negative_t[:negative_train_len])
  X_test_t = np.array(X_positive_t[positive_train_len:] + X_negative_t[negative_train_len:])
  Y_train_t = np.array([[1]] * positive_train_len + [[0]] * negative_train_len)
  Y_test_t = np.array([[1]] * (len(X_positive_t) - positive_train_len) + [[0]] * (len(X_negative_t) - negative_train_len))

  print(f"Data: X_train - {X_train_t.shape}, X_test - {X_test_t.shape}")
  print(f"Data: Y_train - {Y_train_t.shape}, Y_test - {Y_test_t.shape}")

  return X_train_t, X_test_t, Y_train_t, Y_test_t


## Main
logger = uu.get_custom_logger('keras_lstm', LOG_PATH)

X_train_t, X_test_t, Y_train_t, Y_test_t = create_vrm_dataset(INPUT_PATH, EMBEDDING_PATH, logger)

# Create LSTM Model
K.clear_session()
model = Sequential()
model.add(LSTM(128, input_shape=(13, 300)))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

# Model Fitting
eval_iter = evaluateIter()
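For reference, a minimal sketch of fitting and evaluating a model like this with the standard Keras API. The epoch and batch-size values are illustrative assumptions, and the original's evaluateIter callback is not reproduced here.

# Illustrative only -- epochs and batch_size are assumptions, not the original settings.
history = model.fit(X_train_t, Y_train_t,
                    epochs=10,
                    batch_size=64,
                    validation_data=(X_test_t, Y_test_t))
loss, acc = model.evaluate(X_test_t, Y_test_t)
print(f"Test loss: {loss:.4f}, accuracy: {acc:.4f}")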