def parse_dialogs_per_response(sentences, lines, candid_dic, char=1):
    '''
    Parse dialogs provided in the babi tasks format.
    Each non-blank line is 'utterance \t response \t salt';
    a blank line marks the end of a dialog.
    '''
    data = []
    context = []
    u = None
    r = None
    for line in lines:
        line = line.strip()
        if line:
            if '\t' in line:
                try:
                    u, r, salt = line.split('\t')
                except ValueError:
                    print('error and exit', line)
                    exit(-1)
                if config.multi_label:
                    a = [candid_dic[single_r] for single_r in r.split(",")]
                else:
                    if r not in candid_dic:
                        print('warning: candidate is not listed..', r)
                        continue
                    a = candid_dic[r]
                u = tokenize(u, char=char)
                if config.fix_vocab:
                    r = translator.en2cn(r)
                r = tokenize(r, char=char)
                placeholder = salt == 'placeholder'
                if config.fix_vocab:
                    salt = translator.en2cn(salt)
                salt = tokenize(salt, char=char)
                sentences.add(','.join(u))
                sentences.add(','.join(r))
                sentences.add(','.join(salt))
                # temporal encoding, and utterance/response encoding
                data.append((context[:], u[:], a))
                context.append(u)
                context.append(r)
                if not placeholder:
                    context.append(salt)
        else:
            # a blank line separates dialogs: clear the context
            context = []
    sentences.add(config.EMPTY)
    return data
def parse_dialogs_per_response(lines, candid_dic, dmn=False):
    '''
    Parse dialogs provided in the babi tasks format.
    '''
    data = []
    context = []
    u = None
    r = None
    for line in lines:
        line = line.strip()
        if line:
            if '\t' in line:
                try:
                    u, r, salt = line.split('\t')
                except ValueError:
                    print('malformed line, exiting:', line)
                    exit(-1)
                if config.MULTILABEL >= 1:
                    a = [candid_dic[single_r] for single_r in r.split(",")]
                else:
                    if r not in candid_dic:
                        continue
                    a = candid_dic[r]
                u = tokenize(u)
                if config.FIX_VOCAB:
                    r = translator.en2cn(r)
                r = tokenize(r)
                placeholder = salt == 'placeholder'
                if config.FIX_VOCAB:
                    salt = translator.en2cn(salt)
                salt = tokenize(salt)
                # temporal encoding, and utterance/response encoding
                data.append((context[:], u[:], a))
                if dmn:
                    context.append(u)
                    context.append(r + salt)
                else:
                    # $u / $r tag each sentence with its speaker
                    u.append('$u')
                    r.append('$r')
                    salt.append('$r')
                    context.append(u)
                    context.append(r)
                    if not placeholder:
                        context.append(salt)
        else:
            # a blank line separates dialogs: clear the context
            context = []
    return data
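# A minimal, hypothetical run of the parser above. From the split('\t')
# logic, each non-blank line is expected to be utterance \t response \t salt,
# and a blank line ends a dialog. The sample line and candidate map below are
# made up for illustration, and this sketch assumes config.MULTILABEL == 0
# and config.FIX_VOCAB is falsy.
def _parse_demo():
    sample_lines = [
        'which brand do you like\tapi_call_slot_brand\tplaceholder',
        '',
    ]
    sample_candid_dic = {'api_call_slot_brand': 0}
    data = parse_dialogs_per_response(sample_lines, sample_candid_dic)
    # each element is (context_before_this_turn, tokenized_utterance, answer)
    print(data)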
def w2v_local_similarity(self, query1, query2):
    # return the best word2vec similarity between query1 and any
    # candidate in query2, together with that candidate's tokens
    tokens1 = tokenize(query1, 3)
    tokens2 = [tokenize(t, 3) for t in query2]
    max_sim = -10
    _g = query2[0]
    for words2 in tokens2:
        sim = computeSentenceSim(tokens1, words2)
        if sim > max_sim:
            max_sim = sim
            _g = words2
    return float(max_sim), _g
def reply(self, msg):
    line = msg.strip().lower()
    if line == 'clear':
        self.context = []
        reply_msg = 'memory cleared!'
    else:
        inputs = []
        questions = []
        q = tokenize(line, char=2)
        q_vector = [self.w2idx[w] for w in q]
        inp_vector = [[self.w2idx[w] for w in s] for s in self.context]
        inputs.append(inp_vector)
        questions.append(np.vstack(q_vector).astype(np.float32))
        input_lens, sen_lens, max_sen_len = dmn_data_utils.get_sentence_lens(
            inputs)
        q_lens = dmn_data_utils.get_lens(questions)
        # pad to the sizes the trained model expects
        max_input_len = self.model.max_input_len
        max_sen_len = self.model.max_sen_len
        max_q_len = self.model.max_q_len
        inputs = dmn_data_utils.pad_inputs(inputs, input_lens, max_input_len,
                                           "split_sentences", sen_lens,
                                           max_sen_len)
        inputs = np.asarray(inputs)
        questions = dmn_data_utils.pad_inputs(questions, q_lens, max_q_len)
        questions = np.asarray(questions)
        preds = self.model.predict(self.session, inputs, input_lens,
                                   max_sen_len, questions, q_lens)
        preds = preds[0].tolist()
        print(preds)
        r = self.idx2candid[preds[0]]
        reply_msg = r
        # store the (translated, tokenized) response back into memory
        r = translator.en2cn(r)
        r = tokenize(r, char=2)
        self.context.append(r)
    return reply_msg
def similarity(self, query1, query2):
    def cos(embed1, embed2):
        # cosine similarity between two embedding vectors
        num = np.dot(embed1, embed2.T)
        denom = np.linalg.norm(embed1) * np.linalg.norm(embed2)
        return num / denom

    tokens1 = tokenize(query1, 3)
    tokens2 = tokenize(query2, 3)
    embed1 = self.embed(tokens1)
    embed2 = self.embed(tokens2)
    return cos(embed1, embed2)
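# A quick standalone check of the cosine formula used in similarity() above,
# on hand-made vectors so no tokenizer or embedding model is needed: a
# vector against itself scores 1.0, orthogonal vectors 0.0. Assumes numpy
# is already imported as np, as elsewhere in this module.
def _cosine_demo():
    a = np.array([1.0, 0.0])
    b = np.array([0.0, 1.0])
    same = np.dot(a, a.T) / (np.linalg.norm(a) * np.linalg.norm(a))
    orth = np.dot(a, b.T) / (np.linalg.norm(a) * np.linalg.norm(b))
    print(same, orth)  # 1.0 0.0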
def build_vocab_beforehand(vocab_base, vocab_path):
    with open(vocab_base, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    vocab = reduce(lambda x, y: x | y,
                   (set(tokenize(line, char=0)) for line in lines))
    # domain-specific tokens that must be in the vocabulary even if they
    # never occur in vocab_base
    extra_list = [
        'api', 'call', 'slot', 'deny', 'rhetorical', 'general', 'brand',
        'price', 'ac', 'power', 'fr', 'cool_type', 'phone', 'sys', 'feature',
        'color', 'memsize', 'size', 'distance', 'resolution', 'panel',
        'dyson', 'root', 'virtual', 'mode', 'energy_lvl', 'connect', 'net',
        'rmem', 'mmem', 'people', 'vol', 'width', 'height', 'control',
        'olec', 'led', 'vr', 'oled', 'tcl', 'lcd', 'oppo', 'vivo', 'moto',
        '1.5', '2.5', 'plugin'
    ]
    # keep vocab a set while extending it so duplicates cannot slip in
    vocab |= set(extra_list)
    for i in range(100):
        vocab.add('placeholder' + str(i + 1))
    vocab = sorted(vocab)
    print(vocab)
    # index 0 is reserved for the nil/padding word
    w2idx = dict((c, i + 1) for i, c in enumerate(vocab))
    with open(vocab_path, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False)
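# Sketch of how a consumer might reload the vocab file written above and
# rebuild the identical word-to-index map. load_vocab is a hypothetical
# helper, not an existing function in this repo; it assumes json is
# imported, as elsewhere in this module.
def load_vocab(vocab_path):
    with open(vocab_path, 'r', encoding='utf-8') as f:
        vocab = json.load(f)
    # mirror build_vocab_beforehand: index 0 stays reserved for the nil word
    return dict((c, i + 1) for i, c in enumerate(vocab))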
def append_memory(self, m):
    if not m:
        return
    if config.FIX_VOCAB:
        m = translator.en2cn(m)
        print('translated..', m)
    m = tokenize(m)
    m.append('$r')
    self.context.append(m)
def vectorize_candidates(candidates, word_idx, sentence_size):
    shape = (len(candidates), sentence_size)
    C = []
    for i, candidate in enumerate(candidates):
        # truncate to sentence_size so every row matches the target shape
        tokens = tokenize(candidate)[:sentence_size]
        lc = max(0, sentence_size - len(tokens))
        # unknown words map to 0 (the nil word), zeros pad the tail
        C.append([word_idx[w] if w in word_idx else 0 for w in tokens]
                 + [0] * lc)
    return tf.constant(C, shape=shape)
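# A standalone check of the padding scheme used in vectorize_candidates
# (no TensorFlow required): known tokens map to their indices, unknown
# tokens to 0, and zeros fill the tail up to sentence_size. The toy
# vocabulary here is made up.
def _pad_demo():
    word_idx = {'api': 1, 'call': 2}
    tokens = ['api', 'call', 'unknown']
    sentence_size = 5
    row = [word_idx[w] if w in word_idx else 0 for w in tokens] + \
        [0] * max(0, sentence_size - len(tokens))
    assert row == [1, 2, 0, 0, 0]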
def append_memory(self, m):
    if not m:
        return
    m = translator.en2cn(m)
    m = tokenize(m, self.char)
    print('appending memory..', m)
    # pad to max_sen_len, then embed every token
    q_vector = m + [self.config.PAD
                    for _ in range(self.max_sen_len - len(m))]
    q_vector = [getVector(word) for word in q_vector]
    self.context.append(q_vector)
    # keep only the most recent memories
    self.context = self.context[-self.config.max_memory_size:]
def test():
    with open(os.path.join(grandfatherdir, 'data/memn2n/train/tree/train.txt'),
              'r', encoding='utf-8') as f:
        candidates = f.readlines()
    translator = Translator(
        os.path.join(grandfatherdir, "model/graph/translator_graph.pkl"))
    for line in candidates:
        line = line.strip('\n')
        line = translator.en2cn(line)
        line = query_util.tokenize(line, char=0)
        print(line)
def build_vocab(data, candidates, memory_size=config.MAX_MEMORY_SIZE):
    if config.FIX_VOCAB:
        with open(grandfatherdir + '/data/char_table/vocab.txt', 'r') as f:
            vocab = json.load(f)
    else:
        vocab = reduce(lambda x, y: x | y,
                       (set(list(chain.from_iterable(s)) + q)
                        for s, q, a in data))
        vocab2 = reduce(lambda x, y: x | y,
                        (set(tokenize(candidate)) for candidate in candidates))
        vocab |= vocab2
    vocab = sorted(vocab)
    print(vocab)
    # index 0 is reserved for the nil word
    w2idx = dict((c, i + 1) for i, c in enumerate(vocab))
    print(w2idx)
    max_story_size = max(map(len, (s for s, _, _ in data)))
    mean_story_size = int(np.mean([len(s) for s, _, _ in data]))
    sentence_size = max(map(len, chain.from_iterable(s for s, _, _ in data)))
    tokenized_candidates = [tokenize(candidate) for candidate in candidates]
    candidate_sentence_size = max(map(len, tokenized_candidates))
    query_size = max(map(len, (q for _, q, _ in data)))
    memory_size = min(memory_size, max_story_size)
    vocab_size = len(w2idx) + 1  # +1 for nil word
    sentence_size = max(query_size, sentence_size)  # for the position
    return {
        'w2idx': w2idx,
        'idx2w': vocab,
        'sentence_size': sentence_size,
        'candidate_sentence_size': candidate_sentence_size,
        'memory_size': memory_size,
        'vocab_size': vocab_size,
        'n_cand': len(candidates)
    }  # metadata
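# Illustrative use of the metadata dict returned by build_vocab; the
# variable names below are examples only.
# meta = build_vocab(data, candidates)
# meta['sentence_size']          # longest sentence or query seen
# meta['memory_size']            # requested size, capped at longest story
# meta['vocab_size']             # len(w2idx) + 1, for the nil word at 0
# meta['w2idx'], meta['idx2w']   # word <-> index lookups (0 reserved)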
def reply(self, msg):
    line = msg.strip().lower()
    if line == 'clear':
        self.context = []
        reply_msg = 'memory cleared!'
        values = [0]
    else:
        inputs = []
        questions = []
        q = tokenize(line, self.char)
        q_vector = [self.w2idx.get(w, 0) for w in q]
        print('q_vector:', q_vector)
        inp_vector = [[self.w2idx.get(w, 0) for w in s]
                      for s in self.context]
        # keep only the most recent memories
        inp_vector = inp_vector[-self.config.max_memory_size:]
        inputs.append(inp_vector)
        questions.append(np.vstack(q_vector).astype(np.float32))
        input_lens, sen_lens, max_sen_len = dmn_data_utils.get_sentence_lens(
            inputs)
        q_lens = dmn_data_utils.get_lens(questions)
        # pad to the sizes the trained model expects
        max_input_len = self.model.max_input_len
        max_sen_len = self.model.max_sen_len
        max_q_len = self.model.max_q_len
        inputs = dmn_data_utils.pad_inputs(inputs, input_lens, max_input_len,
                                           "split_sentences", sen_lens,
                                           max_sen_len)
        inputs = np.asarray(inputs)
        questions = dmn_data_utils.pad_inputs(questions, q_lens, max_q_len)
        questions = np.asarray(questions)
        pred, top_prob = self.model.predict(self.session, inputs, input_lens,
                                            max_sen_len, questions, q_lens)
        print(pred)
        indices = top_prob.indices.tolist()[0]
        values = top_prob.values.tolist()[0]
        reply_msg = self.idx2candid[pred[0]]
        # store the (translated, tokenized) response back into memory
        r = translator.en2cn(reply_msg)
        r = tokenize(r, self.char)
        self.context.append(r)
    # both the multi_label and single-label paths return the same pair
    return reply_msg, values
def append_memory(self, m):
    if not m:
        return
    m = translator.en2cn(m)
    m = tokenize(m, self.char)
    self.context.append(m)
def m_similarity(self, query1, m_query2):
    # serialize token lists for mlt_ff_embedding: tokens within a
    # sentence are joined by ',', candidate sentences by '@@'
    tokens1 = ','.join(tokenize(query1, 3))
    tokens2 = '@@'.join([','.join(tokenize(t, 3)) for t in m_query2])
    score, _g = mlt_ff_embedding(tokens1, tokens2)
    return score, _g
def reply(self, msg):
    line = msg.strip().lower()
    if line == 'clear':
        # reset memory to a single all-EMPTY sentence
        self.context = [[
            getVector(self.config.EMPTY) for _ in range(self.max_sen_len)
        ]]
        reply_msg = ['memory cleared!']
        top_prob = [0]
    else:
        inputs = []
        questions = []
        # truncate first so the reported length never exceeds max_sen_len
        q = tokenize(line, self.char)
        q = q[:self.max_sen_len]
        q_len = len(q)
        self.context_raw.append(q)
        # pad the question to max_sen_len, then embed every token
        q_vector = q + [self.config.PAD
                        for _ in range(self.max_sen_len - len(q))]
        q_vector = [getVector(word) for word in q_vector]
        inp_vector = self.context
        pad_vector = [
            getVector(self.config.PAD) for _ in range(self.max_sen_len)
        ]
        # pad the memory with all-PAD sentences up to max_input_len
        inp_vector = inp_vector + [
            pad_vector for _ in range(self.max_input_len - len(inp_vector))
        ]
        inputs.append(inp_vector)
        questions.append(q_vector)
        pred, top_prob = self.model.predict(self.session, inputs,
                                            [len(self.context)],
                                            self.max_sen_len, questions,
                                            [q_len])
        reply_msg = [self.idx2candid[ind] for ind in pred]
        r = reply_msg[0]
        r = translator.en2cn(r)
        r = tokenize(r, self.char)
        r = r[:self.max_sen_len]
        self.context_raw.append(r)
        r_vector = r + [self.config.PAD
                        for _ in range(self.max_sen_len - len(r))]
        r_vector = [getVector(word) for word in r_vector]
        # remember both the question and the response
        self.context.append(q_vector)
        self.context.append(r_vector)
        if len(self.context) > self.config.max_memory_size:
            self.context = self.context[-self.config.max_memory_size:]
    return reply_msg[0], top_prob[0]
def reply(self, msg):
    line = msg.strip().lower()
    prob = 1
    if line == 'clear':
        self.clear_memory()
        reply_msg = 'memory cleared!'
    elif config.MULTILABEL >= 1:
        u = tokenize(line)
        print('context:', self.context)
        data = [(self.context, u, -1)]
        print('data:', data)
        s, q, a = data_utils.vectorize_data(data, self.w2idx,
                                            self.model._sentence_size, 1,
                                            self.n_cand, self.memory_size)
        preds, top_probs = self.model.predict(s, q)
        preds = preds.indices[0].tolist()
        top_probs = top_probs.values[0]
        print(top_probs)
        r = [self.idx2candid[pred] for pred in preds]
        reply_msg = ','.join(r)
        r = reply_msg
        if config.FIX_VOCAB:
            r = translator.en2cn(r)
        r = tokenize(r)
        u.append('$u')  # tag speaker: user
        r.append('$r')  # tag speaker: system
        self.context.append(u)
        self.context.append(r)
        print('context:', self.context)
        self.nid += 1
    else:
        u = data_utils.tokenize(line)
        data = [(self.context, u, -1)]
        s, q, a = data_utils.vectorize_data(data, self.w2idx,
                                            self.model._sentence_size, 1,
                                            self.n_cand, self.memory_size)
        preds, top_probs = self.model.predict(s, q)
        try:
            prob = top_probs.values[0][0]
        except (AttributeError, IndexError):
            pass
        r = self.idx2candid[preds[0]]
        reply_msg = r
        if config.FIX_VOCAB:
            r = translator.en2cn(r)
            print('translated..', r)
        r = data_utils.tokenize(r)
        u.append('$u')  # tag speaker: user
        r.append('$r')  # tag speaker: system
        self.context.append(u)
        self.context.append(r)
        self.nid += 1
    return reply_msg, prob
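# Hypothetical driver loop for any of the reply() methods above; 'bot'
# stands in for whichever chat-session class is instantiated elsewhere.
# while True:
#     msg = input('>> ')
#     answer, prob = bot.reply(msg)
#     print(answer, prob)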