def format(s, split_phrases=False, add_sol_eol=False, add_eol_only=False, only_one_phrase=False):
    # Clean the raw string, then optionally split it into phrases and wrap each
    # one in the start-of-line / end-of-line tokens from hparams.
    z = tokenize_weak.format(s)

    if split_phrases:
        x = []

        z = z.replace(',', ' ')
        z = z.replace('?', ' ? . ')
        z = z.replace('!', ' . ')
        zz = z.split('.')
        #zz = filter(None, re.split("[,.\-!?:]+", z))
        for i in zz:
            xx = i.split(' ')
            y = []
            for j in xx:
                j = j.strip()
                if len(j) > 0: y.append(j)
            i = ' '.join(y)
            i = i.strip()
            if len(i) > 1 and not i.isspace():
                if not add_eol_only:
                    x.append( hparams['sol'] + ' ' + i + ' ' + hparams['eol'] + ' . ' )
                else:
                    if i.split(' ')[-1] != hparams['eol']:
                        x.append( i + ' ' + hparams['eol'] + ' . ')

        if only_one_phrase and len(x) > 1:  ## return only the last phrase
            return x[-1]
        x = ' '.join(x)
        return x
    if add_sol_eol:
        if not add_eol_only: z = hparams['sol'] + ' ' + z
        z = z + ' ' + hparams['eol']
    return z
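
A minimal stand-alone sketch of the phrase-splitting branch above, with hparams and tokenize_weak replaced by hypothetical stand-ins so it runs on its own:

# Hypothetical stand-ins; the real hparams values and the tokenize_weak cleanup
# come from the surrounding project.
hparams = {'sol': 'sol', 'eol': 'eol'}

def split_phrases(s):
    # Turn sentence punctuation into phrase boundaries, then wrap each phrase
    # in start-of-line / end-of-line tokens.
    s = s.replace(',', ' ').replace('?', ' ? . ').replace('!', ' . ')
    phrases = []
    for chunk in s.split('.'):
        words = [w for w in chunk.split(' ') if w.strip()]
        if words:
            phrases.append(hparams['sol'] + ' ' + ' '.join(words) + ' ' + hparams['eol'] + ' . ')
    return ' '.join(phrases)

print(split_phrases('hello there, how are you? fine!'))
# prints something like: sol hello there how are you ? eol .  sol fine eol .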
def make_vocab():
    wordlist = []
    for filename in train_file:
        with open(filename, 'r') as x:
            xx = x.read()
            for line in xx.split('\n'):
                line = tokenize_weak.format(line)
                y = line.lower().split()
                for word in y:
                    wordlist.append(word)
            pass
    print('values read')

    #wordset = set(wordlist)
    c = Counter(wordlist)
    l = len(wordlist)
    print(l, 'length of raw vocab data')
    if l > vocab_length: l = vocab_length
    cc = c.most_common()

    #cc = wordlist
    print(len(cc), 'length of result list')
    #v = []
    num = 0
    ss = sorted(cc, key=itemgetter(1))
    #print(ss[0:10])
    ss.reverse()
    #print(ss[0:10])
    for z in ss:  # words in descending frequency order
        if z[0].lower() not in v and num < vocab_length:
            v.append(z[0].lower())
            num += 1
    #vv = list(set(v))
    v.sort()
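
A small stand-alone sketch of the sort-by-count step: sorting the (word, count) pairs with itemgetter(1) in reverse puts the most frequent words first, which is also the order Counter.most_common() already returns:

from collections import Counter
from operator import itemgetter

counts = Counter('the cat sat on the mat the end'.split()).most_common()
by_freq = sorted(counts, key=itemgetter(1), reverse=True)
print(by_freq[:3])  # [('the', 3), ('cat', 1), ('sat', 1)]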
Example #3
def format(s):
    z = tokenize_weak.format(s)
    if z is None or z.strip() == '':
        z = ' what ? '

    z = hparams['sol'] + ' ' + z
    z = z + ' ' + hparams['eol']
    return z
Example #4
    def task_interactive(self):

        print('-------------------')
        while True:
            line = input("> ")
            line = tokenize_weak.format(line)
            print(line)
            out, _ = self.evaluate(None, None, line)
            print(out)
Example #5
    def task_interactive(self):
        self.model, _, _ = self.embedding_model(self.model,
                                                self.model_encoder,
                                                self.model_inference,
                                                global_check=True)
        print('-------------------')
        while True:
            line = input("> ")
            line = tokenize_weak.format(line)
            print(line)
            self.predict_words(line, stop_at_eol=True)
Example #6
def tokenize(sentence):
    # Clean the sentence, then collapse all whitespace into single spaces.
    sentence = tokenize_weak.format(sentence)
    xx = []
    for x in sentence.split():
        x = x.strip()
        if len(x) > 0:
            xx.append(x)
    sentence = ' '.join(xx)
    return sentence
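
The loop above is effectively whitespace collapsing: str.split() with no arguments already discards empty tokens and runs of spaces, tabs, and newlines, so a stand-alone equivalent (a sketch, without the tokenize_weak cleanup) is just a join:

def collapse_whitespace(sentence):
    # split() with no arguments drops '\n', '\r' and empty strings on its own
    return ' '.join(sentence.split())

print(collapse_whitespace('hello \r\n   world \n'))  # hello world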
Example #7
    def task_interactive(self):

        print('-------------------')
        while True:
            line = input("> ")
            line = tokenize_weak.format(line)
            ll = []
            for l in line.split():
                if l in self.vocab_lang.word2index:
                    ll.append(l)
            if len(ll) > 0:
                line = ' '.join(ll)
                print(line)
                out, _ = self.evaluate(None, None, line)
                print(out)
Example #8
def process_questions(questions, include_blacklisted=True, do_tokenize=True):

    # Make a list
    if not isinstance(questions, list):
        questions = [questions]

    # Clean and tokenize
    prepared_questions = []
    for question in questions:
        question = question.strip()
        if do_tokenize:
            prepared_questions.append(
                tokenize_weak.format(question) if question else '##emptyquestion##')
        else:
            prepared_questions.append(
                question if question else '##emptyquestion##')

    # Run inference
    answers_list = inference_helper(prepared_questions)

    # Process answers
    prepared_answers_list = []
    for index, answers in enumerate(answers_list):
        answers = detokenize(answers)
        answers = replace_in_answers(answers, 'answers')
        answers_score = score_answers(answers, 'answers')
        best_index, best_score = get_best_score(answers_score,
                                                include_blacklisted)

        if prepared_questions[index] == '##emptyquestion##':
            prepared_answers_list.append(None)
        else:
            prepared_answers_list.append({
                'answers': answers,
                'scores': answers_score,
                'best_index': best_index,
                'best_score': best_score
            })

    return prepared_answers_list
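
A hedged usage sketch for process_questions; it assumes the surrounding module (with inference_helper, detokenize, replace_in_answers, score_answers and get_best_score) is already loaded, so it is not runnable on its own:

# Hypothetical call; questions are cleaned, and empty ones come back as None.
results = process_questions(['hello there', ''], include_blacklisted=False)
for r in results:
    if r is None:
        print('empty question')
    else:
        print(r['answers'][r['best_index']], r['best_score'])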
Example #9
def make_vocab():
    wordlist = []
    with open(train_file, 'r') as x:
        xx = x.read()
        for line in xx.split('\n'):
            line = tokenize_weak.format(line)
            y = line.lower().split()
            for word in y:
                wordlist.append(word)
        pass
    print('values read')
    c = Counter(wordlist)  # count word frequencies (counting a set would make every count 1)
    l = len(c)  # number of distinct words
    print(l, 'length of raw vocab data')
    if l > vocab_length: l = vocab_length
    cc = c.most_common(l)
    print(len(cc), 'length of result list')
    #v = []
    for z in sorted(cc):
        if z[0].lower() not in v: v.append(z[0].lower())
    #vv = list(set(v))
    v.sort()
Example #10
def make_vocab(train_file,
               order=False,
               read_glove=False,
               contractions=False,
               no_limit=False):
    global v, v_end
    wordlist = []

    vocab_length = hparams['num_vocab_total']

    if contractions:
        whitelist.extend(directions)
        wordlist.extend(whitelist)
        #wordlist.extend(directions)
        print('add whitelist.')

    for filename in train_file:
        if os.path.isfile(filename) and filename.endswith('.csv'):
            print('csv file:', filename)
            with open(filename, 'rb') as x:
                text = x.readlines()

                for xx in text:  #[:csv_cutoff]:
                    line = xx.strip().decode('utf-8', errors='ignore')

                    y = line.split(',')[1:-1]  # magic numbers -- which columns to use.
                    y[0] = y[0].lower()
                    #print(y)
                    for word in y:
                        wordlist.append(word)
                pass

    for filename in train_file:
        if os.path.isfile(filename) and not filename.endswith('.csv'):
            print('found:', filename)
            with open(filename, 'r') as x:
                xx = x.read()
                for line in xx.split('\n'):
                    line = tokenize_weak.format(line)
                    y = line.lower().split()
                    for word in y:
                        wordlist.append(word)
                pass
    print('values read from text file.', ' '.join(train_file))

    if read_glove:
        with open(FROM, 'r') as x:
            xx = x.read()
            for line in xx.split('\n'):
                l = line.split(' ')
                #print( len(l))
                if len(l) > 2:
                    wordlist.append(l[0].strip())
        pass

        print('values read from glove file.')

    #wordset = set(wordlist)
    c = Counter(wordlist)
    l = len(wordlist)
    print(l, 'length of raw vocab data')
    if l > vocab_length and not no_limit:
        l = vocab_length
    if no_limit:
        vocab_length = l
        hparams['num_vocab_total'] = vocab_length
    cc = c.most_common()[:l]

    print(len(cc), 'length of result list')
    #v = []
    num = 0
    if order:
        ss = sorted(cc, key=itemgetter(1))
        #print(ss[0:10])
        ss.reverse()
    else:
        ss = cc

    #print(ss[0:10])
    #vocab_length -= m
    print(vocab_length, 'vocab length')

    for z in ss:  # sorted(cc, key= lambda word: word[1]):
        if (z[0].lower() not in v
                and num < vocab_length) or (z[0].lower() in whitelist
                                            and z[0].lower() not in v_end):
            v.append(z[0].lower())
            num += 1

    if len(v_end) > 0:
        v_temp = []
        for z in v_end:
            if z not in v:
                v_temp.append(z)
        v_end = v_temp
        v_temp_num = len(v_end)
        v = v[:-v_temp_num]

    if order: v.sort()

    if len(v_end) > 0:
        v.extend(v_end)

    vv = [hparams['unk'], hparams['sol'], hparams['eol'], hparams['eow']]
    for z in v:
        if len(vv) < vocab_length and z not in vv: vv.append(z)
    if len(v_end) > 0:
        vv.extend(v_end)

    v = vv

    print('len', len(v))
    return v
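
A stripped-down, stand-alone sketch of the core idea the make_vocab variants share: count tokens with Counter, take the most frequent ones up to a cap, and put the special tokens first (the token names here are placeholders, not the real hparams values):

from collections import Counter

def build_vocab(lines, max_words=8, specials=('unk', 'sol', 'eol')):
    counts = Counter(w.lower() for line in lines for w in line.split())
    vocab = list(specials)
    for word, _freq in counts.most_common():
        if len(vocab) >= max_words:
            break
        if word not in vocab:
            vocab.append(word)
    return vocab

print(build_vocab(['hello world', 'hello again world', 'again hello']))
# ['unk', 'sol', 'eol', 'hello', 'world', 'again']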
Example #11
def format(s):
    z = tokenize_weak.format(s)
    if z.strip() == '':
        z = ' what ? '
    return z
Example #12
            if arg_stagger:
                print('stagger output.')

            for line in z:

                ## set autoencode here.
                auto_flag = False
                if args['autoencode'] is not None and random.uniform(0, 1) < arg_autoencode:
                    auto_flag = True

                save = ''
                if num >= arg_start and (arg_length == 0 or num < arg_start + arg_length):
                    line = line.split('\t')

                    line[0] = format(line[0])
                    line[1] = format(line[1])

                    line[0], line[1] = move_order(line[0], line[1])

                    if arg_eol and len(line[0]) > 1:
                        line[0] += ' ' + hparams['eol']

                    if arg_eol and len(line[1]) > 1:
                        line[1] += ' ' + hparams['eol']

                    if not arg_stagger and arg_classifier != "MRPC" and arg_classifier != "MNLI" and not arg_gpt2:

                        src.write(line[0].lower())
                        save = line[0][:]
                        if not line[0].endswith('\n'):
Example #13
            print(answers['answers'][answers['best_index']])

        sys.exit()

    # Interactive mode
    print("\n\nStarting interactive mode (first response will take a while):")
    colorama.init()

    hparams['num_translations_per_input'] = 10
    hparams['override_loaded_hparams'] = True
    print(hparams)

    # QAs
    while True:
        question = input(colorama.Fore.WHITE + "\n> ")
        question = tokenize_weak.format(question)
        answers = process_questions(question)
        print(answers)
        answers = answers[0]
        #answers = process_questions(question)[0]
        chosen = ''
        if answers is None:
            print(colorama.Fore.RED + "! Question can't be empty")
        else:
            for i, _ in enumerate(answers['scores']):
                if chosen == '':
                    if (answers['scores'][i] == 1 and
                            question.strip().lower() != answers['answers'][i].strip().lower()):
                        chosen = answers['answers'][i]
                print("{}- {}{}".format(
                    colorama.Fore.GREEN