def predict_image(path, question, model=model, idx2word=idx2word, input_embedding=input_embedding):
    input_seq_len = 21
    embedding_size = 50
    X1 = get_feature(path)
    # Tokenize, then truncate/pad the question to a fixed length.
    question = preprocess_text(question)[:input_seq_len]
    padding = ['<pad>'] * (input_seq_len - len(question))
    question = question + padding
    # Look up an embedding for every token; unknown tokens fall back to <unk>.
    X2 = np.zeros((input_seq_len, embedding_size))
    for i in range(input_seq_len):
        if question[i] not in input_embedding:
            question[i] = '<unk>'
        X2[i] = input_embedding[question[i]]
    X2 = torch.from_numpy(X2).float()
    # Add a batch dimension and move everything to the GPU if one is available.
    X1 = X1.unsqueeze(0)
    X2 = X2.unsqueeze(0)
    if torch.cuda.is_available():
        X1 = X1.cuda()
        X2 = X2.cuda()
        model = model.cuda()
    output = model(X1, X2)
    index = output.max(-1)[1].item()  # index of the highest-scoring answer
    return idx2word[index]
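
# Minimal usage sketch (hypothetical image path and question; assumes the
# globals model, idx2word, and input_embedding used above are already loaded):
# answer = predict_image('data/images/example.jpg', 'what color is the ball')
# print('predicted answer:', answer)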

def run_step(self, text):
    cur_sent = preprocess_text(text.strip(), lower=FLAGS.uncased)
    tokens, ids = self.sp.encode_ids(cur_sent)
    # Pad the sentence to FLAGS.seq_len with <sep> tokens and a trailing <cls>,
    # following the XLNet input layout.
    sent_len, diff_len = len(ids) - 1, FLAGS.seq_len - len(ids)
    input_ids = ids + [SEP_ID] * (diff_len - 1) + [CLS_ID]
    input_tokens = tokens + ["<sep>"] * (diff_len - 1) + ["<cls>"]
    input_mask = [1] + [0] * sent_len + [1] * diff_len
    segment_ids = [0] * (sent_len + 1) + [2] * diff_len
    input_ids = input_ids[:FLAGS.seq_len]
    input_tokens = input_tokens[:FLAGS.seq_len]
    input_mask = input_mask[:FLAGS.seq_len]
    segment_ids = segment_ids[:FLAGS.seq_len]
    il = {'text': text,
          'seg_text': " ".join(str(x) for x in tokens),
          'input_ids': " ".join(str(x) for x in input_ids),
          'input_mask': " ".join(str(x) for x in input_mask),
          'segment_ids': " ".join(str(x) for x in segment_ids)}
    logging.info(json.dumps(il, ensure_ascii=False))
    feed_dict = {self.input_ids: [input_ids],
                 self.segment_ids: [segment_ids],
                 self.input_mask: [input_mask]}
    fetch = self.sess.run([self.output, self.attn_prob, self.attention_out], feed_dict)
    out_encode, atten_prob = fetch[0], fetch[1]
    # Three term-weight signals: attention probabilities, IDF, and a language model.
    weight_attn = normalization(self.weight_attenprob(atten_prob, tokens))
    weight_idf = normalization(self.sp.cal_weight_idf(tokens[1:]))
    weight_lm = normalization(self.lm.cal_weight_lm(tokens[1:]))
    weight_rule = self.merge_weight([(weight_attn, 0.5), (weight_idf, 0.5), (weight_lm, 0.5)])
    self.weight_attn, self.weight_idf, self.weight_lm = weight_attn, weight_idf, weight_lm
    # Combine the signals with the learned ranker, then merge with the rule-based score.
    sen2terms = list(tokens[1:])
    weight_rank = normalization(self.rank_weight(sen2terms, weight_attn, weight_idf, weight_lm))
    weight = self.merge_weight([(weight_rank, 0.7), (weight_rule, 0.0)])  # previously a 0.6/0.4 split
    wl = {'weight_rank': ' '.join(str(k) + ':' + str(v) for k, v in weight_rank),
          'weight_rule': ' '.join(str(k) + ':' + str(v) for k, v in weight_rule),
          'weight': ' '.join(str(k) + ':' + str(v) for k, v in weight)}
    logging.info(json.dumps(wl, ensure_ascii=False))
    return weight
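
# The helpers normalization() and merge_weight() are defined elsewhere in the
# project. The sketches below (hypothetical *_sketch names) illustrate plausible
# implementations, assuming weights are lists of (term, score) pairs:
def normalization_sketch(weights):
    # Min-max normalize the scores into [0, 1].
    scores = [s for _, s in weights]
    lo, hi = min(scores), max(scores)
    span = (hi - lo) or 1.0
    return [(term, (s - lo) / span) for term, s in weights]

def merge_weight_sketch(weighted_lists):
    # Sum each term's score across several (weights, coefficient) pairs
    # and return the terms sorted by merged score, highest first.
    merged = {}
    for weights, coef in weighted_lists:
        for term, score in weights:
            merged[term] = merged.get(term, 0.0) + coef * score
    return sorted(merged.items(), key=lambda kv: kv[1], reverse=True)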

def __getitem__(self, idx):
    feature, path, question, answer = self.image_features[idx]
    # Training samples store a list of augmented features; pick one at random.
    # Validation samples store a single feature, used as-is.
    if isinstance(feature, list):
        X1 = random.choice(feature)
    else:
        X1 = feature
    X1 = torch.Tensor(X1)
    question = preprocess_text(question)
    answer = change(preprocess_text(answer)[0])
    # Pad the question to a fixed length.
    padding = ['<pad>'] * (self.config['input_seq_len'] - len(question))
    question = question + padding
    assert len(question) == self.config['input_seq_len'], \
        "Length of question is %d" % len(question)
    # Embed every token; unknown tokens fall back to <unk>.
    X2 = np.zeros((self.config['input_seq_len'], self.config['embedding_size']))
    for i in range(self.config['input_seq_len']):
        if question[i] not in self.input_embedding:
            question[i] = '<unk>'
        X2[i] = self.input_embedding[question[i]]
    X2 = torch.from_numpy(X2).float()
    # Answers outside the vocabulary map to <unk>.
    if answer not in self.word2idx or answer not in self.word_list:
        answer = '<unk>'
    Y = self.word2idx[answer]
    return X1, X2, Y
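
# Usage sketch (hypothetical names): the dataset drops into a standard
# PyTorch DataLoader, which batches the (X1, X2, Y) triples it returns.
# from torch.utils.data import DataLoader
# loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# for X1, X2, Y in loader:
#     ...  # X1: image features, X2: embedded question, Y: answer index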

def cal_ndcg_train_data(topk=1):
    ndcg_sum = 0.0
    qw = query_weight()
    text = [e.strip().split("\t")
            for e in open("get_jdcv_data/label.data", encoding="utf8").readlines()
            if e.strip()]
    for line in tqdm(text, total=len(text)):
        # Each field is "term:label"; higher labels mean higher relevance.
        seg_line = [(preprocess_text(e.split(":")[0]), e.split(":")[1]) for e in line]
        # Sort numerically (the labels are digit strings in the file).
        sorted_seg_line = sorted(seg_line, key=lambda d: int(d[1]), reverse=True)
        # Assign graded relevance by rank: the best term gets the largest grade.
        rel = {k: len(sorted_seg_line) - i - 1
               for i, (k, v) in enumerate(sorted_seg_line)}
        query = " ".join([e[0] for e in seg_line])
        dcg, idcg, ndcg = get_one_query_ndcg(qw, query, rel, topk)
        ndcg_sum += ndcg
    ndcg_avg = ndcg_sum / len(text)
    print("ndcg_avg@%d: %.3f" % (topk, ndcg_avg))
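
# For reference, a minimal self-contained sketch of the NDCG@k that
# get_one_query_ndcg is expected to compute (standard definition; the
# project's helper may differ, e.g. in its gain function):
import math

def ndcg_at_k(ranked_terms, rel, k):
    # DCG@k = sum_i rel(term_i) / log2(i + 2); IDCG@k uses the ideal ordering.
    dcg = sum(rel.get(t, 0) / math.log2(i + 2)
              for i, t in enumerate(ranked_terms[:k]))
    ideal = sorted(rel.values(), reverse=True)[:k]
    idcg = sum(r / math.log2(i + 2) for i, r in enumerate(ideal))
    return dcg, idcg, (dcg / idcg if idcg > 0 else 0.0)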

print('Loaded GloVe')

### Loading Image Features
with open('dumps/train_features_vgg16.pkl', 'rb') as f:
    original_features = pickle.load(f)
print('len(original_features) : ', len(original_features))
with open('dumps/val_features_vgg16.pkl', 'rb') as f:
    original_val_features = pickle.load(f)
print('len(original_val_features) : ', len(original_val_features))

# Keep only samples whose preprocessed answer has exactly two tokens.
train_features, val_features = [], []
for i in range(len(original_features)):
    feature, path, question, answer = original_features[i]
    answer = preprocess_text(answer)
    if len(answer) == 2:
        train_features.append(original_features[i])
for i in range(len(original_val_features)):
    feature, path, question, answer = original_val_features[i]
    answer = preprocess_text(answer)
    if len(answer) == 2:
        val_features.append(original_val_features[i])

print('len(train_features) : ', len(train_features))
print('len(val_features) : ', len(val_features))
print('\n----------------------------------------\n')

question, answer = [], ['<unk>']