Example #1
def inference(question, include_blacklisted=True):
    answers = inference_helper(question)
    answers = detokenize(answers)
    answers = replace_in_answers(answers, 'answers')
    answers_rate = score_answers(answers)

    # Look for the first answer that scored 1 (a good, non-blacklisted answer)
    try:
        index = answers_rate.index(1)
        score = 1
    except ValueError:
        index = None

    # Optionally fall back to the first blacklisted answer (score 0)
    if index is None and include_blacklisted:
        try:
            index = answers_rate.index(0)
            score = 0
        except ValueError:
            index = 0
            score = -1

    # No answer was picked at all - return the first one with a score of -1
    if index is None:
        index = 0
        score = -1

    return {'answers': answers, 'index': index, 'score': score}
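A minimal usage sketch for the function above; the question string is illustrative, and inference_helper, detokenize, replace_in_answers and score_answers are assumed to be importable from the surrounding project:

# Hypothetical usage of inference(): score 1 = regular answer, 0 = blacklisted, -1 = no acceptable answer
result = inference("hello there")
best_answer = result['answers'][result['index']]
if result['score'] >= 0:
    print(best_answer)
else:
    print("no acceptable answer found")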
Example #2
def process_questions(questions, return_score_modifiers=False):

    # Make a list
    if not isinstance(questions, list):
        questions = [questions]

    # Clean and tokenize
    prepared_questions = []
    for question in questions:
        question = question.strip()
        prepared_questions.append(apply_bpe(tokenize(question)) if question else '##emptyquestion##')

    # Run inference
    answers_list = inference_helper(prepared_questions)

    # Process answers
    prepared_answers_list = []
    for index, answers in enumerate(answers_list):
        answers = detokenize(answers)
        answers = replace_in_answers(answers)
        answers = normalize_new_lines(answers)
        answers_score = score_answers(questions[index], answers)
        best_index, best_score = get_best_score(answers_score['score'])

        if prepared_questions[index] == '##emptyquestion##':
            prepared_answers_list.append(None)
        elif return_score_modifiers:
            prepared_answers_list.append({'answers': answers, 'scores': answers_score['score'], 'best_index': best_index, 'best_score': best_score, 'score_modifiers': answers_score['score_modifiers']})
        else:
            prepared_answers_list.append({'answers': answers, 'scores': answers_score['score'], 'best_index': best_index, 'best_score': best_score})

    return prepared_answers_list
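A short usage sketch for process_questions() above; the questions are illustrative, and tokenize, apply_bpe, inference_helper and the scoring helpers are assumed to be available at module level:

# Hypothetical usage: batch two questions, one of them empty
results = process_questions(["how are you?", ""], return_score_modifiers=True)
for result in results:
    if result is None:
        print("(empty question - skipped)")
    else:
        print(result['answers'][result['best_index']], result['best_score'])
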
Example #3
def process_questions(questions, include_blacklisted=True):

    # Make a list
    if not isinstance(questions, list):
        questions = [questions]

    # Clean and tokenize
    prepared_questions = []
    for question in questions:
        question = question.strip()
        prepared_questions.append(tokenize(question) if question else '##emptyquestion##')

    # Run inference
    answers_list = inference_helper(prepared_questions)

    # Process answers
    prepared_answers_list = []
    for index, answers in enumerate(answers_list):
        answers = detokenize(answers)
        answers = replace_in_answers(answers, 'answers')
        answers_score = score_answers(answers, 'answers')
        best_index, best_score = get_best_score(answers_score, include_blacklisted)

        if prepared_questions[index] == '##emptyquestion##':
            prepared_answers_list.append(None)
        else:
            prepared_answers_list.append({'answers': answers, 'scores': answers_score, 'best_index': best_index, 'best_score': best_score})

    return prepared_answers_list
Example #4
def prepare():
    global vocab

    print("\nPreparing training set from raw set")

    # Ensure that train folder exists
    try:
        os.makedirs(preprocessing['train_folder'])
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Ensure that model/log folder exists
    train_log_dir = os.path.join(hparams['out_dir'], 'train_log')
    try:
        os.makedirs(train_log_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Iterate through the files and prepare them
    for file_name, amounts in files.items():

        vocab = Counter([])

        print("\nFile: {} (iteration = 10k lines)".format(file_name))

        # Output file handler
        out_file = open('{}/{}'.format(preprocessing['train_folder'],
                                       file_name),
                        'w',
                        encoding='utf-8',
                        buffering=131072)

        # Lines read so far, and the maximum number of lines to read
        read = 0
        amount = int(
            min(
                amounts['amount'] * preprocessing['samples']
                if preprocessing['samples'] > 0 else 10**20,
                amounts['up_to'] if amounts['up_to'] > 0 else 10**20))

        # Prepare thread variables
        write_thread = None
        vocab_thread1 = None
        vocab_thread2 = None

        # We are going to use multiprocessing for tokenization, as it's cpu intensive
        with Pool(processes=preprocessing['cpu_count']) as pool:

            # Open input file
            with open('{}/{}'.format(preprocessing['source_folder'],
                                     file_name),
                      'r',
                      encoding='utf-8',
                      buffering=131072) as in_file:

                # Iterate over the file in batches of 10k lines
                for rows in tqdm(read_lines(in_file, 10000, '')):

                    # Process using multiprocessing
                    rows = pool.map_async(tokenize, rows, 100).get()

                    # Join running threads from previous loop
                    if write_thread is not None:
                        write_thread.join()
                        vocab_thread1.join()
                        vocab_thread2.join()

                    # If the number of lines is greater than the limit, or we hit EOF - break
                    # We leave before the last save because the last batch has to be handled differently:
                    # zip_longest in read_lines pads the batch with extra empty lines up to the batch size,
                    # and we need to remove them - but only for the last batch, not for every batch
                    read += len(rows)
                    if read >= amount:
                        rows = rows[:amount - read + len(rows)]
                        break
                    assert len(rows) == 10000

                    # We are going to process the vocab in two threads - a bit faster than one, and we need shared memory
                    # Also, multiprocessing is slower here
                    vocab_thread1 = Thread(target=append_vocab, args=(rows, 1))
                    vocab_thread1.start()
                    vocab_thread2 = Thread(target=append_vocab, args=(rows, 2))
                    vocab_thread2.start()

                    # And a thread for saving the tokenized data to the output file
                    write_thread = Thread(target=write_lines,
                                          args=(out_file, rows))
                    write_thread.start()

                    rows = []

        # Last vocab parts and last lines to write
        vocab_thread1 = Thread(target=append_vocab, args=(rows, 1))
        vocab_thread1.start()
        vocab_thread2 = Thread(target=append_vocab, args=(rows, 2))
        vocab_thread2.start()
        write_thread = Thread(target=write_lines, args=(out_file, rows))
        write_thread.start()
        vocab_thread1.join()
        vocab_thread2.join()
        write_thread.join()
        out_file.close()

        # If it's train file, make vocab
        if file_name == 'train.from' or file_name == 'train.to':
            print("\nFile: {} (saving vocab)".format(
                file_name.replace('train', 'vocab')))

            # Get most common entities
            vocab = [entity for entity, v in vocab.most_common()]

            # Do replacements
            new_vocab = [
                replace_in_answers([entity], 'vocab')[0] for entity in vocab
            ]

            # Filter out duplicates and empty entities while preserving order:
            # inside the comprehension, vocab still refers to the seen set, and
            # vocab.add() returns None, so only first occurrences pass the test
            vocab = set()
            vocab = [
                entity for entity in new_vocab
                if not (entity in vocab or vocab.add(entity)) and entity
            ]

            # Write entities to a file
            with open('{}/{}'.format(preprocessing['train_folder'],
                                     file_name.replace('train', 'vocab')),
                      'w',
                      encoding='utf-8',
                      buffering=131072) as vocab_file:
                vocab_file.write(
                    "<unk>\n<s>\n</s>\n" +
                    "\n".join(vocab[:preprocessing['vocab_size']]))
            with open('{}/{}'.format(
                    preprocessing['train_folder'],
                    file_name.replace('train', 'vocab_unused')),
                      'w',
                      encoding='utf-8',
                      buffering=131072) as vocab_file:
                vocab_file.write("\n".join(
                    vocab[preprocessing['vocab_size']:]))

            # Write metadata for embeddings
            with open('{}/{}'.format(train_log_dir,
                                     'decoder.tsv' if file_name == 'train.to'
                                     else 'encoder.tsv'),
                      'w',
                      encoding='utf-8',
                      buffering=131072) as metadata_file:
                metadata_file.write(
                    "<unk>\n<s>\n</s>\n" +
                    "\n".join(vocab[:preprocessing['vocab_size']]))

    # Write pbtxt file for metadata for embeddings
    with open('{}/{}'.format(train_log_dir, 'projector_config.pbtxt'),
              'w',
              encoding='utf-8',
              buffering=131072) as pbtxt_file:
        pbtxt_file.write(
            '''embeddings {\n    tensor_name: 'embeddings/decoder/embedding_decoder'\n    '''
            + '''metadata_path: 'decoder.tsv'\n}\nembeddings {\n    ''' +
            '''tensor_name: 'embeddings/encoder/embedding_encoder'\n    metadata_path: 'encoder.tsv'\n}'''
        )
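prepare() above relies on a few helpers that are not shown in this section. Below is a minimal sketch of what read_lines and write_lines could look like, assuming read_lines batches the input file with itertools.zip_longest and pads the last batch with the given fill value, exactly as the comments inside prepare() describe; this is an illustrative reconstruction, not the project's actual implementation:

from itertools import zip_longest

def read_lines(file_handler, batch_size, fillvalue=''):
    # Group the file's lines into fixed-size batches; zip_longest pads the final,
    # shorter batch with fillvalue, which is why prepare() trims the last batch
    # before breaking out of its loop (sketch only - assumed behavior)
    args = [iter(file_handler)] * batch_size
    return zip_longest(*args, fillvalue=fillvalue)

def write_lines(out_file, rows):
    # Write one tokenized row per line, skipping the padding rows (sketch only)
    out_file.write('\n'.join(row for row in rows if row) + '\n')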
Example #5
def inference_internal(question):
    answers = inference_helper(question)
    answers = detokenize(answers)
    answers = replace_in_answers(answers, 'answers')
    answers_rate = score_answers(answers, 'answers')
    return (answers, answers_rate)
Example #6
import sys
sys.path.insert(0, '../')
from core.sentence import replace_in_answers
from colorama import Fore, init


tests = [
    ['', ''],
]

init()

for test in tests:
    replaced_answers = replace_in_answers([test[0]], 'answers')
    print('[{}]  {}  ->  {}{}'.format(
        Fore.GREEN + 'PASS' + Fore.RESET if replaced_answers[0] == test[1]
        else Fore.RED + 'FAIL' + Fore.RESET,
        test[0], test[1],
        '' if replaced_answers[0] == test[1]
        else '  Result: {}'.format(replaced_answers[0])))
Example #7
import sys
sys.path.insert(0, '../')
from core.sentence import replace_in_answers
from colorama import Fore, init

tests = [
    ['', ''],
]

init()

for test in tests:
    replaced_answers = replace_in_answers([test[0]], 'vocab')
    print('[{}]  {}  ->  {}{}'.format(
        Fore.GREEN + 'PASS' +
        Fore.RESET if replaced_answers[0] == test[1] else Fore.RED + 'FAIL' +
        Fore.RESET, test[0], test[1], '' if replaced_answers[0] == test[1] else
        '  Result: {}'.format(replaced_answers[0])))

# Class-based variant of inference_internal (method excerpt; do_inference takes the place of inference_helper)
def inference_internal(self, question):
    answers = self.do_inference(tokenize(question))
    answers = detokenize(answers)
    answers = replace_in_answers(answers, 'answers')
    answers_rate = score_answers(answers, 'answers')
    return (answers, answers_rate)