def inference(question, include_blacklisted=True):
    answers = inference_helper(question)
    answers = detokenize(answers)
    answers = replace_in_answers(answers, 'answers')
    answers_rate = score_answers(answers)

    # Prefer the first answer scored 1 (regular answer)
    try:
        index = answers_rate.index(1)
        score = 1
    except ValueError:
        index = None

    # Optionally fall back to the first answer scored 0 (blacklisted)
    if index is None and include_blacklisted:
        try:
            index = answers_rate.index(0)
            score = 0
        except ValueError:
            index = 0
            score = -1

    # Nothing acceptable found - return the first answer with a negative score
    if index is None:
        index = 0
        score = -1

    return {'answers': answers, 'index': index, 'score': score}
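# Illustrative usage sketch, not part of the original file: the question string is
# made up, and the score semantics are inferred from the code above
# (1 = regular answer, 0 = blacklisted answer, -1 = no acceptable answer found).
if __name__ == '__main__':
    result = inference("Hello, how are you?", include_blacklisted=False)
    print(result['answers'][result['index']], result['score'])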
def process_questions(questions, return_score_modifiers=False):

    # Make a list
    if not isinstance(questions, list):
        questions = [questions]

    # Clean and tokenize
    prepared_questions = []
    for question in questions:
        question = question.strip()
        prepared_questions.append(apply_bpe(tokenize(question)) if question else '##emptyquestion##')

    # Run inference
    answers_list = inference_helper(prepared_questions)

    # Process answers
    prepared_answers_list = []
    for index, answers in enumerate(answers_list):
        answers = detokenize(answers)
        answers = replace_in_answers(answers)
        answers = normalize_new_lines(answers)
        answers_score = score_answers(questions[index], answers)
        best_index, best_score = get_best_score(answers_score['score'])

        if prepared_questions[index] == '##emptyquestion##':
            prepared_answers_list.append(None)
        elif return_score_modifiers:
            prepared_answers_list.append({'answers': answers, 'scores': answers_score['score'],
                                          'best_index': best_index, 'best_score': best_score,
                                          'score_modifiers': answers_score['score_modifiers']})
        else:
            prepared_answers_list.append({'answers': answers, 'scores': answers_score['score'],
                                          'best_index': best_index, 'best_score': best_score})

    return prepared_answers_list
def process_questions(questions, include_blacklisted=True):

    # Make a list
    if not isinstance(questions, list):
        questions = [questions]

    # Clean and tokenize
    prepared_questions = []
    for question in questions:
        question = question.strip()
        prepared_questions.append(tokenize(question) if question else '##emptyquestion##')

    # Run inference
    answers_list = inference_helper(prepared_questions)

    # Process answers
    prepared_answers_list = []
    for index, answers in enumerate(answers_list):
        answers = detokenize(answers)
        answers = replace_in_answers(answers, 'answers')
        answers_score = score_answers(answers, 'answers')
        best_index, best_score = get_best_score(answers_score, include_blacklisted)

        if prepared_questions[index] == '##emptyquestion##':
            prepared_answers_list.append(None)
        else:
            prepared_answers_list.append({'answers': answers, 'scores': answers_score,
                                          'best_index': best_index, 'best_score': best_score})

    return prepared_answers_list
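# Illustrative usage sketch, not part of the original file: the questions are made up.
# Each element of the returned list is either None (for an empty question) or a dict
# with 'answers', 'scores', 'best_index' and 'best_score'.
if __name__ == '__main__':
    for reply in process_questions(["Hi there!", ""], include_blacklisted=False):
        if reply is None:
            print("(empty question)")
        else:
            print(reply['answers'][reply['best_index']], reply['best_score'])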
def prepare():
    global vocab

    print("\nPreparing training set from raw set")

    # Ensure that train folder exists
    try:
        os.makedirs(preprocessing['train_folder'])
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Ensure that model/log folder exists
    train_log_dir = os.path.join(hparams['out_dir'], 'train_log')
    try:
        os.makedirs(train_log_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Iterate thru files and prepare them
    for file_name, amounts in files.items():

        vocab = Counter([])

        print("\nFile: {} (iteration = 10k lines)".format(file_name))

        # Output file handler
        out_file = open('{}/{}'.format(preprocessing['train_folder'], file_name), 'w', encoding='utf-8', buffering=131072)

        # Maximum number of lines
        read = 0
        amount = int(min(
            amounts['amount'] * preprocessing['samples'] if preprocessing['samples'] > 0 else 10**20,
            amounts['up_to'] if amounts['up_to'] > 0 else 10**20))

        # Prepare thread variables
        write_thread = None
        vocab_thread1 = None
        vocab_thread2 = None

        # We are going to use multiprocessing for tokenization, as it's CPU-intensive
        with Pool(processes=preprocessing['cpu_count']) as pool:

            # Open input file
            with open('{}/{}'.format(preprocessing['source_folder'], file_name), 'r', encoding='utf-8', buffering=131072) as in_file:

                # Iterate every 10k lines
                for rows in tqdm(read_lines(in_file, 10000, '')):

                    # Process using multiprocessing
                    rows = pool.map_async(tokenize, rows, 100).get()

                    # Join running threads from previous loop
                    if write_thread is not None:
                        write_thread.join()
                        vocab_thread1.join()
                        vocab_thread2.join()

                    # If number of lines is greater than the limit, or EOF - break
                    # We leave before the last save as we have to handle the last batch differently:
                    # zip_longest in read_lines pads the batch with empty lines up to the batch size,
                    # and we need to remove them - but only for the last batch
                    read += len(rows)
                    if read >= amount:
                        rows = rows[:amount - read + len(rows)]
                        break

                    assert len(rows) == 10000

                    # We are going to process vocab in two threads - a bit faster than one, and we need shared memory
                    # Also multiprocessing is slower here
                    vocab_thread1 = Thread(target=append_vocab, args=(rows, 1))
                    vocab_thread1.start()
                    vocab_thread2 = Thread(target=append_vocab, args=(rows, 2))
                    vocab_thread2.start()

                    # And a thread for saving tokenized data to the output file
                    write_thread = Thread(target=write_lines, args=(out_file, rows))
                    write_thread.start()

                    rows = []

        # Last vocab parts and last lines to write
        vocab_thread1 = Thread(target=append_vocab, args=(rows, 1))
        vocab_thread1.start()
        vocab_thread2 = Thread(target=append_vocab, args=(rows, 2))
        vocab_thread2.start()
        write_thread = Thread(target=write_lines, args=(out_file, rows))
        write_thread.start()
        vocab_thread1.join()
        vocab_thread2.join()
        write_thread.join()

        out_file.close()

        # If it's a train file, make vocab
        if file_name == 'train.from' or file_name == 'train.to':

            print("\nFile: {} (saving vocab)".format(file_name.replace('train', 'vocab')))

            # Get most common entities
            vocab = [entity for entity, v in vocab.most_common()]

            # Do replacements
            new_vocab = [replace_in_answers([entity], 'vocab')[0] for entity in vocab]

            # Filter out duplicates and empty entities
            vocab = set()
            vocab = [entity for entity in new_vocab if not (entity in vocab or vocab.add(entity)) and entity]

            # Write entities to a file
            with open('{}/{}'.format(preprocessing['train_folder'], file_name.replace('train', 'vocab')),
                      'w', encoding='utf-8', buffering=131072) as vocab_file:
                vocab_file.write("<unk>\n<s>\n</s>\n" + "\n".join(vocab[:preprocessing['vocab_size']]))

            with open('{}/{}'.format(preprocessing['train_folder'], file_name.replace('train', 'vocab_unused')),
                      'w', encoding='utf-8', buffering=131072) as vocab_file:
                vocab_file.write("\n".join(vocab[preprocessing['vocab_size']:]))

            # Write metadata for embeddings
            with open('{}/{}'.format(train_log_dir, 'decoder.tsv' if file_name == 'train.to' else 'encoder.tsv'),
                      'w', encoding='utf-8', buffering=131072) as metadata_file:
                metadata_file.write("<unk>\n<s>\n</s>\n" + "\n".join(vocab[:preprocessing['vocab_size']]))

            # Write pbtxt file for metadata for embeddings
            with open('{}/{}'.format(train_log_dir, 'projector_config.pbtxt'),
                      'w', encoding='utf-8', buffering=131072) as pbtxt_file:
                pbtxt_file.write(
                    "embeddings {\n tensor_name: 'embeddings/decoder/embedding_decoder'\n "
                    "metadata_path: 'decoder.tsv'\n}\nembeddings {\n "
                    "tensor_name: 'embeddings/encoder/embedding_encoder'\n metadata_path: 'encoder.tsv'\n}")
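# Minimal sketch, assuming this module is also meant to be run as a script: prepare()
# is driven entirely by the module-level `preprocessing`, `files` and `hparams`
# settings, so no arguments are needed.
if __name__ == '__main__':
    prepare()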
def inference_internal(question):
    answers = inference_helper(question)
    answers = detokenize(answers)
    answers = replace_in_answers(answers, 'answers')
    answers_rate = score_answers(answers, 'answers')
    return (answers, answers_rate)
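# Illustrative usage sketch, not part of the original file: the question is made up.
# inference_internal returns the processed answers together with their scores,
# leaving the choice of the best answer to the caller.
if __name__ == '__main__':
    answers, answers_rate = inference_internal("What is your name?")
    print(answers[0], answers_rate[0])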
import sys
sys.path.insert(0, '../')
from core.sentence import replace_in_answers
from colorama import Fore, init

tests = [
    ['', ''],
]

init()

for test in tests:
    replaced_answers = replace_in_answers([test[0]], 'answers')
    print('[{}] {} -> {}{}'.format(
        Fore.GREEN + 'PASS' + Fore.RESET if replaced_answers[0] == test[1] else Fore.RED + 'FAIL' + Fore.RESET,
        test[0],
        test[1],
        '' if replaced_answers[0] == test[1] else ' Result: {}'.format(replaced_answers[0])))
import sys
sys.path.insert(0, '../')
from core.sentence import replace_in_answers
from colorama import Fore, init

tests = [
    ['', ''],
]

init()

for test in tests:
    replaced_answers = replace_in_answers([test[0]], 'vocab')
    print('[{}] {} -> {}{}'.format(
        Fore.GREEN + 'PASS' + Fore.RESET if replaced_answers[0] == test[1] else Fore.RED + 'FAIL' + Fore.RESET,
        test[0],
        test[1],
        '' if replaced_answers[0] == test[1] else ' Result: {}'.format(replaced_answers[0])))
def inference_internal(self, question):
    answers = self.do_inference(tokenize(question))
    answers = detokenize(answers)
    answers = replace_in_answers(answers, 'answers')
    answers_rate = score_answers(answers, 'answers')
    return (answers, answers_rate)