def add_noise_to_string(a_string, amount_of_noise):
    """Add some artificial spelling mistakes to the string.

    Up to four edits are attempted, always in this order: replace one
    character, delete one character, insert one character, transpose two
    adjacent characters.  Each edit fires when ``rand()`` is below
    ``amount_of_noise * len(a_string)``, using the length the string has at
    that point, so an empty string is never edited.

    :param a_string: text to corrupt
    :param amount_of_noise: per-character weight of each edit's probability
    :return: the text with zero or more edits applied
    """
    # CHARS[:-1] leaves out the last entry of CHARS
    # (presumably the padding character -- confirm against the CHARS definition).
    if rand() < amount_of_noise * len(a_string):
        # Replace a character with a random character
        random_char_position = random_randint(len(a_string))
        a_string = (a_string[:random_char_position] +
                    random_choice(CHARS[:-1]) +
                    a_string[random_char_position + 1:])
    if rand() < amount_of_noise * len(a_string):
        # Delete a character
        random_char_position = random_randint(len(a_string))
        a_string = (a_string[:random_char_position] +
                    a_string[random_char_position + 1:])
    if len(a_string) < MAX_INPUT_LEN and rand() < amount_of_noise * len(a_string):
        # Add a random character (only while there is room left)
        random_char_position = random_randint(len(a_string))
        a_string = (a_string[:random_char_position] +
                    random_choice(CHARS[:-1]) +
                    a_string[random_char_position:])
    # The length guard comes after the rand() draw so the random sequence is
    # unchanged.  A one-character string has no adjacent pair to swap, and
    # random_randint(0) raises ValueError if random_randint is numpy's randint.
    if rand() < amount_of_noise * len(a_string) and len(a_string) > 1:
        # Transpose 2 characters
        random_char_position = random_randint(len(a_string) - 1)
        a_string = (a_string[:random_char_position] +
                    a_string[random_char_position + 1] +
                    a_string[random_char_position] +
                    a_string[random_char_position + 2:])
    return a_string
def add_noise_to_string(self, a_string):
    """Build between one and three noisy variants of ``a_string``.

    Every variant starts from the untouched input and receives one or two
    random edits.  Each edit is drawn from: replace a character, delete a
    character, insert a character, transpose two adjacent characters.  An
    edit that cannot apply to the current text (for example a deletion on an
    empty string) is skipped.

    :param a_string: the clean text
    :return: list with the noisy variants, one entry per variant
    """
    CHARS = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .")
    # The trailing "." is never injected.
    alphabet = CHARS[:-1]
    noisy_variants = []
    for _ in range(random.randrange(1, 4)):
        candidate = a_string
        edit_count = random.randrange(1, 3)
        for _ in range(edit_count):
            operation = random.randrange(1, 5)
            size = len(candidate)
            if operation == 3:
                # Insert a random character
                if size > 0:
                    spot = random_randint(size)
                    candidate = candidate[:spot] + random_choice(alphabet) + candidate[spot:]
                else:
                    candidate = random_choice(alphabet)
                continue
            if size == 0:
                # Nothing to replace, delete or swap
                continue
            if operation == 1:
                # Overwrite one character with a random one
                spot = random_randint(size)
                candidate = candidate[:spot] + random_choice(alphabet) + candidate[spot + 1:]
            elif operation == 2:
                # Drop one character
                spot = random_randint(size)
                candidate = candidate[:spot] + candidate[spot + 1:]
            elif size > 1:
                # Swap two neighbouring characters
                spot = random_randint(size - 1)
                candidate = (candidate[:spot] + candidate[spot + 1] +
                             candidate[spot] + candidate[spot + 2:])
        noisy_variants.append(candidate)
    return noisy_variants
def add_noise_to_string(a_string, amount_of_noise):
    """Add artificial spelling mistakes to a string.

    Four edits are attempted in a fixed order (replace, delete, insert,
    transpose).  Each one fires when ``rand()`` is below
    ``amount_of_noise * len(a_string)``, evaluated with the string's current
    length, so an empty string is returned unchanged.

    :param a_string: text to corrupt
    :param amount_of_noise: per-character weight of each edit's probability
    :return: the text with zero or more edits applied
    """
    from numpy.random import choice as random_choice, randint as random_randint, rand

    CHARS = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .")
    # The trailing "." is never injected as noise.
    noise_chars = CHARS[:-1]

    if rand() < amount_of_noise * len(a_string):
        # Replace a character with a random character
        random_char_position = random_randint(len(a_string))
        a_string = (a_string[:random_char_position] +
                    random_choice(noise_chars) +
                    a_string[random_char_position + 1:])
    if rand() < amount_of_noise * len(a_string):
        # Delete a character
        random_char_position = random_randint(len(a_string))
        a_string = (a_string[:random_char_position] +
                    a_string[random_char_position + 1:])
    if len(a_string) < MAX_INPUT_LEN and rand() < amount_of_noise * len(a_string):
        # Add a random character (only while there is room left)
        random_char_position = random_randint(len(a_string))
        a_string = (a_string[:random_char_position] +
                    random_choice(noise_chars) +
                    a_string[random_char_position:])
    # The length guard comes after the rand() draw so the random sequence is
    # unchanged.  Without it a one-character string reaches randint(0), which
    # raises ValueError.
    if rand() < amount_of_noise * len(a_string) and len(a_string) > 1:
        # Transpose 2 characters
        random_char_position = random_randint(len(a_string) - 1)
        a_string = (a_string[:random_char_position] +
                    a_string[random_char_position + 1] +
                    a_string[random_char_position] +
                    a_string[random_char_position + 2:])
    return a_string
def iterate_training(model, X_train, y_train, X_val, y_val, ctable):
    """Train repeatedly and print sample predictions after every round.

    Rounds are numbered from 1 up to, but not including,
    NUMBER_OF_ITERATIONS.  Each round fits the model for
    EPOCHS_PER_ITERATION epochs and then decodes ten randomly picked
    validation rows so that mistakes can be inspected by eye.

    :param model: object providing ``fit`` and ``predict_classes``
    :param X_train: training inputs
    :param y_train: training targets
    :param X_val: validation inputs
    :param y_val: validation targets
    :param ctable: object whose ``decode`` turns a row back into text
    """
    for round_number in range(1, NUMBER_OF_ITERATIONS):
        print()
        print('-' * 50)
        print('Iteration', round_number)
        model.fit(
            X_train,
            y_train,
            batch_size=BATCH_SIZE,
            nb_epoch=EPOCHS_PER_ITERATION,
            validation_data=(X_val, y_val),
        )
        # Decode ten random validation rows to visualize errors
        for _ in range(10):
            sample = random_randint(0, len(X_val))
            # Index with a one-element array so the picked row keeps its leading axis
            inputs = X_val[np.array([sample])]
            targets = y_val[np.array([sample])]  # pylint:disable=no-member
            predicted = model.predict_classes(inputs, verbose=0)
            question = ctable.decode(inputs[0])
            expected = ctable.decode(targets[0])
            guessed = ctable.decode(predicted[0], calc_argmax=False)
            # Inverted questions are flipped back before display
            print('Q', question[::-1] if INVERTED else question)
            print('A', expected)
            if expected == guessed:
                verdict = Colors.ok + '☑' + Colors.close
            else:
                verdict = Colors.fail + '☒' + Colors.close
            print(verdict, guessed)
            print('---')
def generate_news_data(corpus):
    """Cut the corpus into answers and derive a noisy question for each.

    Lines are popped off the end of ``corpus`` (the list is emptied) and cut
    at spaces into pieces of at most MAX_INPUT_LEN characters.  Leftovers of
    MIN_INPUT_LEN characters or fewer are dropped, and a piece that was
    already seen is stored only once.  After shuffling, every answer is
    padded with "." to MAX_INPUT_LEN and paired with a noisy, equally padded
    question.  The question is reversed when INVERTED is set.

    :param corpus: list of text lines; consumed by this function
    :return: tuple ``(questions, answers)`` of equally long lists
    """
    print("Generating Data")
    questions = []
    answers = []
    seen_answers = set()
    while corpus:
        remaining = corpus.pop()
        while len(remaining) > MIN_INPUT_LEN:
            if len(remaining) > MAX_INPUT_LEN:
                cut = remaining.rfind(" ", MIN_INPUT_LEN, MAX_INPUT_LEN - 1)
                if cut == -1:
                    # No usable space in the window: search the whole text
                    last_space = remaining.rfind(" ")
                    if last_space == -1:
                        break  # nothing more to take from this line
                    remaining = remaining[last_space + 1:]
                    continue
                chunk = remaining[:cut]
                remaining = remaining[cut + 1:]
            else:
                chunk, remaining = remaining, ""
            if chunk and chunk in seen_answers:
                continue
            seen_answers.add(chunk)
            answers.append(chunk)
            # NOTE(review): the line structure of this function was lost
            # before review; this progress check is assumed to run once per
            # stored answer -- confirm.
            if random_randint(100000) == 8:
                print('.', end="")
    print('suffle', end=" ")
    random_shuffle(answers)
    print("Done")
    for position, clean in enumerate(answers):
        noisy = add_noise_to_string(clean, AMOUNT_OF_NOISE)
        noisy = noisy + '.' * (MAX_INPUT_LEN - len(noisy))
        padded = clean + "." * (MAX_INPUT_LEN - len(clean))
        answers[position] = padded
        assert len(padded) == MAX_INPUT_LEN
        if random_randint(100000) == 8:
            # Occasionally show a sample as a progress indicator
            print(len(seen_answers))
            print("answer: '{}'".format(padded))
            print("question: '{}'".format(noisy))
            print()
        questions.append(noisy[::-1] if INVERTED else noisy)
    return questions, answers
def generate_news_data(corpus):
    """Turn raw corpus lines into (question, answer) training pairs.

    The corpus list is emptied while it is processed.  Every line is cut at
    spaces into pieces no longer than MAX_INPUT_LEN.  Remainders of
    MIN_INPUT_LEN characters or fewer are discarded, as are repeated pieces.
    The collected answers are shuffled and padded with "." up to
    MAX_INPUT_LEN.  Each one gets a noisy question of the same padded
    length, stored reversed when INVERTED is set.

    :param corpus: list of text lines; consumed by this function
    :return: tuple ``(questions, answers)``
    """
    print("Generating Data")
    questions, answers = [], []
    seen_answers = set()
    while corpus:
        text = corpus.pop()
        while len(text) > MIN_INPUT_LEN:
            if len(text) <= MAX_INPUT_LEN:
                # The remainder fits as a whole
                piece, text = text, ""
            else:
                split_at = text.rfind(" ", MIN_INPUT_LEN, MAX_INPUT_LEN - 1)
                if split_at > -1:
                    piece, text = text[:split_at], text[split_at + 1:]
                else:
                    # No limits this time
                    split_at = text.rfind(" ")
                    if split_at == -1:
                        # We are done with this line
                        break
                    text = text[split_at + 1:]
                    continue
            duplicate = piece and piece in seen_answers
            if duplicate:
                continue
            seen_answers.add(piece)
            answers.append(piece)
            # NOTE(review): original nesting was lost when the source was
            # flattened; assumed to run once per stored answer -- confirm.
            if random_randint(100000) == 8:
                # Show some progress
                print('.', end="")
    print('suffle', end=" ")
    random_shuffle(answers)
    print("Done")
    for slot, answer in enumerate(answers):
        question = add_noise_to_string(answer, AMOUNT_OF_NOISE)
        question_padding = '.' * (MAX_INPUT_LEN - len(question))
        question = question + question_padding
        answer_padding = "." * (MAX_INPUT_LEN - len(answer))
        answer = answer + answer_padding
        answers[slot] = answer
        assert len(answer) == MAX_INPUT_LEN
        if random_randint(100000) == 8:
            # Show some progress
            print(len(seen_answers))
            print("answer: '{}'".format(answer))
            print("question: '{}'".format(question))
            print()
        if INVERTED:
            question = question[::-1]
        questions.append(question)
    return questions, answers
def add_noise_to_sentence(sentence, amount_of_noise):
    """
    Add artificial spelling mistakes to the words of a sentence.

    Four edits are attempted in a fixed order: replace a character, delete a
    character, insert a character, transpose two adjacent characters.  Each
    edit fires when ``rand()`` is below ``amount_of_noise * len(sentence)``
    (the number of words) and then applies to one randomly chosen word.
    Words are never emptied: deletion and transposition need at least two
    characters, and replacement and insertion skip empty words.

    :param sentence: list of words; the list is modified in place
    :param amount_of_noise: constant from 0 to 1 which shows the amount of mistakes
    :return: the same list object, with mistakes
    """
    # The whole alphabet is eligible.  The previous CHARS[:-1] dropped the
    # last entry, which here is "z" rather than a filler character, so "z"
    # could never be injected.
    CHARS = list("abcdefghijklmnopqrstuvwxyz")

    if rand() < amount_of_noise * len(sentence):
        # Replace a character with a random character
        random_word_position = random_randint(len(sentence))
        word = sentence[random_word_position]
        if word:
            random_char_position = random_randint(len(word))
            sentence[random_word_position] = (
                word[:random_char_position] +
                random_choice(CHARS) +
                word[random_char_position + 1:])

    if rand() < amount_of_noise * len(sentence):
        # Delete a character (never the only one)
        random_word_position = random_randint(len(sentence))
        word = sentence[random_word_position]
        if len(word) > 1:
            random_char_position = random_randint(len(word))
            sentence[random_word_position] = (
                word[:random_char_position] +
                word[random_char_position + 1:])

    if rand() < amount_of_noise * len(sentence):
        # Add a random character
        random_word_position = random_randint(len(sentence))
        word = sentence[random_word_position]
        if word:
            random_char_position = random_randint(len(word))
            sentence[random_word_position] = (
                word[:random_char_position] +
                random_choice(CHARS) +
                word[random_char_position:])

    if rand() < amount_of_noise * len(sentence):
        # Transpose 2 characters
        random_word_position = random_randint(len(sentence))
        word = sentence[random_word_position]
        if len(word) > 1:
            random_char_position = random_randint(len(word) - 1)
            sentence[random_word_position] = (
                word[:random_char_position] +
                word[random_char_position + 1] +
                word[random_char_position] +
                word[random_char_position + 2:])

    return sentence
def add_noise_to_string(a_string, amount_of_noise):
    """Add some artificial spelling mistakes to the string.

    The edits are tried in a fixed order -- replace, delete, insert,
    transpose -- and each one happens when ``rand()`` is below
    ``amount_of_noise * len(a_string)`` for the string's current length.
    An empty string therefore passes through untouched.

    :param a_string: text to corrupt
    :param amount_of_noise: per-character weight of each edit's probability
    :return: the text with zero or more edits applied
    """
    if rand() < amount_of_noise * len(a_string):
        # Replace a character with a random character.
        # CHARS[:-1] skips the last entry of CHARS
        # (presumably the padding character -- confirm).
        random_char_position = random_randint(len(a_string))
        head = a_string[:random_char_position]
        tail = a_string[random_char_position + 1:]
        a_string = head + random_choice(CHARS[:-1]) + tail
    if rand() < amount_of_noise * len(a_string):
        # Delete a character
        random_char_position = random_randint(len(a_string))
        a_string = a_string[:random_char_position] + a_string[random_char_position + 1:]
    if len(a_string) < MAX_INPUT_LEN and rand() < amount_of_noise * len(a_string):
        # Add a random character; skipped once the string is at MAX_INPUT_LEN
        random_char_position = random_randint(len(a_string))
        head = a_string[:random_char_position]
        tail = a_string[random_char_position:]
        a_string = head + random_choice(CHARS[:-1]) + tail
    if rand() < amount_of_noise * len(a_string):
        # Transpose 2 characters.  This needs at least two characters:
        # random_randint(0) raises ValueError if random_randint is numpy's
        # randint.  The guard sits inside the branch so rand() is drawn
        # exactly as before.
        if len(a_string) > 1:
            random_char_position = random_randint(len(a_string) - 1)
            a_string = (a_string[:random_char_position] +
                        a_string[random_char_position + 1] +
                        a_string[random_char_position] +
                        a_string[random_char_position + 2:])
    return a_string
def print_random_predictions(model, ctable, X_val, y_val):
    """Print ten randomly chosen validation samples with the model's guess.

    For every sample the decoded question, the expected answer, and the
    predicted answer are printed.  The prediction is prefixed with a green
    check mark when it equals the expected answer and a red cross
    otherwise.  A blank line is printed before and after the samples.

    :param model: object providing ``predict_classes``
    :param ctable: object whose ``decode`` turns a row back into text
    :param X_val: validation inputs
    :param y_val: validation targets
    """
    print()
    for _ in range(10):
        sample = random_randint(0, len(X_val))
        # One-element index arrays keep the leading axis on the picked rows
        inputs = X_val[np.array([sample])]
        targets = y_val[np.array([sample])]  # pylint:disable=no-member
        predicted = model.predict_classes(inputs, verbose=0)
        question = ctable.decode(inputs[0])
        expected = ctable.decode(targets[0])
        guessed = ctable.decode(predicted[0], calc_argmax=False)
        # Inverted questions are flipped back before display
        print('Q', question[::-1] if CONFIG.inverted else question)
        print('A', expected)
        if expected == guessed:
            verdict = Colors.green + '☑' + Colors.close
        else:
            verdict = Colors.red + '☒' + Colors.close
        print(verdict, guessed)
        print('---')
    print()
def show_samples(model, dataset, epoch, logs, X_dev_batch, y_dev_batch):
    """Print ten random dev-set samples so that errors can be inspected.

    Each sample shows the decoded question (flipped back when INVERTED is
    set), the expected answer, and the model's guess marked as right or
    wrong.

    :param model: object providing ``predict_classes``
    :param dataset: object whose ``character_table.decode`` turns rows into text
    :param epoch: not used by this function
    :param logs: not used by this function
    :param X_dev_batch: dev inputs
    :param y_dev_batch: dev targets
    """
    table = dataset.character_table
    for _ in range(10):
        sample = random_randint(0, len(X_dev_batch))
        inputs = X_dev_batch[np.array([sample])]
        targets = y_dev_batch[np.array([sample])]
        predicted = model.predict_classes(inputs, verbose=0)
        question = table.decode(inputs[0])
        expected = table.decode(targets[0])
        guessed = table.decode(predicted[0], calc_argmax=False)
        shown_question = question[::-1] if INVERTED else question
        print('Q', shown_question)
        print('A', expected)
        if expected == guessed:
            verdict = Colors.ok + '☑' + Colors.close
        else:
            verdict = Colors.fail + '☒' + Colors.close
        print(verdict, guessed)
        print('---')
def generate_news_data():
    """Load the pre-split news answers and build a question for each.

    The file named by NEWS_FILE_NAME_SPLIT is read as UTF-8 and split on
    newlines.  The resulting answers are shuffled, and ``generate_question``
    supplies the (question, answer) pair for every entry.  Questions are
    stored reversed when ``CONFIG.inverted`` is set.

    :return: tuple ``(questions, answers)`` of equally long lists
    :raises AssertionError: if an answer returned by ``generate_question``
        is not exactly ``CONFIG.max_input_len`` characters long
    """
    print("Generating Data")
    # Read bytes and decode explicitly.  This keeps the original
    # read-then-decode semantics, also works where text-mode read() returns
    # str (which has no decode()), and the with-block closes the handle.
    with open(NEWS_FILE_NAME_SPLIT, "rb") as news_file:
        answers = news_file.read().decode('utf-8').split("\n")
    questions = []
    print('shuffle', end=" ")
    random_shuffle(answers)
    print("Done")
    for answer_index, answer in enumerate(answers):
        question, answer = generate_question(answer)
        answers[answer_index] = answer
        assert len(answer) == CONFIG.max_input_len
        if random_randint(100000) == 8:
            # Show some progress
            print(len(answers))
            print("answer: '{}'".format(answer))
            print("question: '{}'".format(question))
            print()
        question = question[::-1] if CONFIG.inverted else question
        questions.append(question)
    return questions, answers
def iterate_training(model, X_train, y_train, X_val, y_val, ctable):
    """Alternate between fitting the model and printing sample predictions.

    The loop counter starts at 1 and stops before NUMBER_OF_ITERATIONS.
    After each fit, ten random validation rows are decoded and printed with
    the model's guess, marked as correct or wrong.

    :param model: object providing ``fit`` and ``predict_classes``
    :param X_train: training inputs
    :param y_train: training targets
    :param X_val: validation inputs
    :param y_val: validation targets
    :param ctable: object whose ``decode`` turns a row back into text
    """
    fit_options = dict(
        batch_size=BATCH_SIZE,
        nb_epoch=EPOCHS_PER_ITERATION,
        validation_data=(X_val, y_val),
        show_accuracy=True,
    )
    separator = '-' * 50
    for iteration in range(1, NUMBER_OF_ITERATIONS):
        print()
        print(separator)
        print('Iteration', iteration)
        model.fit(X_train, y_train, **fit_options)
        # Pick 10 validation samples at random to visualize errors
        shown = 0
        while shown < 10:
            shown += 1
            pick = random_randint(0, len(X_val))
            row_inputs = X_val[np.array([pick])]
            row_targets = y_val[np.array([pick])]  # pylint:disable=no-member
            row_preds = model.predict_classes(row_inputs, verbose=0)
            asked = ctable.decode(row_inputs[0])
            wanted = ctable.decode(row_targets[0])
            got = ctable.decode(row_preds[0], calc_argmax=False)
            if INVERTED:
                asked = asked[::-1]  # inverted back!
            print('Q', asked)
            print('A', wanted)
            is_right = wanted == got
            marker = (Colors.ok + '☑' + Colors.close) if is_right else (Colors.fail + '☒' + Colors.close)
            print(marker, got)
            print('---')
def show_samples(model, dataset, epoch, logs, X_dev_batch, y_dev_batch):
    """Append ten random dev-set samples to ``data/outFile.txt``.

    Every sample is written as four lines: the question (flipped back when
    INVERTED is set), the expected answer, the model's guess, and a ``---``
    separator.  The guess line starts with ``Colors.ok`` when the guess
    equals the expected answer and with ``Colors.fail`` otherwise.

    :param model: object providing ``predict_classes``
    :param dataset: object whose ``character_table.decode`` turns rows into text
    :param epoch: not used by this function
    :param logs: not used by this function
    :param X_dev_batch: dev inputs
    :param y_dev_batch: dev targets
    """
    decode = dataset.character_table.decode
    # Open the report once for all samples instead of twice per sample.
    # write() does not append a line break the way print() does, so each
    # record line is terminated explicitly; otherwise all samples run
    # together on a single line.
    with open("data/outFile.txt", "a", encoding="utf-8") as out:
        for _ in range(10):
            ind = random_randint(0, len(X_dev_batch))
            row_X, row_y = X_dev_batch[np.array([ind])], y_dev_batch[np.array([ind])]
            preds = model.predict_classes(row_X, verbose=0)
            q = decode(row_X[0])
            correct = decode(row_y[0])
            guess = decode(preds[0], calc_argmax=False)
            if INVERTED:
                out.write('Q ' + q[::-1] + '\n')  # inverted back!
            else:
                out.write('Q ' + q + '\n')
            out.write('A ' + correct + '\n')
            # Only the Colors prefix tells a right guess from a wrong one;
            # the mark itself is '?' in both cases.
            status = Colors.ok if correct == guess else Colors.fail
            out.write(status + '?' + ' ' + guess + '\n')
            out.write('---' + '\n')
def add_noise_to_sentence(sentence, amount_of_noise):
    """
    Add artificial spelling mistakes to a string.

    Four edits are attempted in a fixed order: substitute a character,
    delete a character, insert a character, transpose two adjacent
    characters.  Each edit fires when ``rand()`` is below
    ``amount_of_noise * len(sentence)`` for the current length, so an empty
    string is returned unchanged.

    :param sentence: text to corrupt; it is sliced and concatenated as a
        string, not handled as a list of words
    :param amount_of_noise: constant from 0 to 1 which shows the amount of mistakes
    :return: the text with mistakes
    """
    CHARS = list("abcdefghijklmnopqrstuvwxyz")
    # Substitution candidates per character.  Every listed character maps
    # only to itself, so the substitution step currently leaves the text
    # unchanged; extend an entry to enable real substitutions for it.
    substitutions = {
        char: [char]
        for char in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ."
    }

    if rand() < amount_of_noise * len(sentence):
        # Replace a character with one of its substitution candidates.
        # Characters missing from the table (digits, punctuation, ...) fall
        # back to themselves instead of raising KeyError.
        random_char_position = random_randint(len(sentence))
        current_char = sentence[random_char_position]
        candidates = substitutions.get(current_char, [current_char])
        sentence = (sentence[:random_char_position] +
                    random_choice(candidates) +
                    sentence[random_char_position + 1:])

    if rand() < amount_of_noise * len(sentence):
        # Delete a character
        random_char_position = random_randint(len(sentence))
        sentence = (sentence[:random_char_position] +
                    sentence[random_char_position + 1:])

    if rand() < amount_of_noise * len(sentence) and len(sentence) < 197:
        # Add a random character (skipped once the text has 197 characters).
        # The whole alphabet is eligible: CHARS has no trailing filler
        # entry, so the previous CHARS[:-1] only kept "z" from ever being
        # inserted.
        random_char_position = random_randint(len(sentence))
        sentence = (sentence[:random_char_position] +
                    random_choice(CHARS) +
                    sentence[random_char_position:])

    # The length guard comes after the rand() draw so the random sequence is
    # unchanged.  A one-character text has no adjacent pair to swap, and
    # random_randint(0) raises ValueError if random_randint is numpy's randint.
    if rand() < amount_of_noise * len(sentence) and len(sentence) > 1:
        # Transpose 2 characters
        random_char_position = random_randint(len(sentence) - 1)
        sentence = (sentence[:random_char_position] +
                    sentence[random_char_position + 1] +
                    sentence[random_char_position] +
                    sentence[random_char_position + 2:])

    return sentence