Example #1
def main():
    data_set = DataSet.load(FLAGS.data_set_path)
    embedder = Embedder(data_set.input_vocabulary, FLAGS.checkpoint_path)

    texts = [
        text.replace('\n', '')
        for text in get_text_file(FLAGS.inputs_file_path)
    ]

    logging.info('getting embeddings')
    embeddings = [embedder.get_embedding(text) for text in tqdm(texts)]
    embedder.close()

    logging.info('applying dimensionality reduction')
    embeddings_reduced = do_pca(embeddings, 2, FLAGS.dim_reduction_method)

    logging.info('plotting')
    plt.scatter(embeddings_reduced[:, 0], embeddings_reduced[:, 1])
    for text, x_coord, y_coord in zip(texts, embeddings_reduced[:, 0],
                                      embeddings_reduced[:, 1]):
        plt.annotate(text,
                     xy=(x_coord, y_coord),
                     xytext=(0, 0),
                     textcoords='offset points')

    plt.show()
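
The do_pca helper is not shown in this example. A minimal sketch of what it could look like with scikit-learn, assuming the third argument names the reduction method (only 'pca' is handled here; the real helper may support more):

import numpy as np
from sklearn.decomposition import PCA

def do_pca(embeddings, n_components, method='pca'):
    # Stack the per-text embedding vectors into a (num_texts, dim) matrix.
    matrix = np.asarray(embeddings)
    if method != 'pca':
        raise ValueError('unsupported reduction method: {}'.format(method))
    # fit_transform returns an array of shape (num_texts, n_components).
    return PCA(n_components=n_components).fit_transform(matrix)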
Example #2
    def __init__(self, path_KG, path_QA, split_ratio=0.8, using_cache=True):
        self.KG = KnowledgeGraph(path_KG)
        self.embedder = Embedder()
        self.training = True  # indicates whether we are in the training phase
        self._iter_i = 0
        self._split_ratio = split_ratio

        # try to load from cache
        if using_cache and Utility.Binary.exists('dataset'):
            self.questions = Utility.Binary.load('dataset')
            print('{} questions loaded'.format(len(self.questions)))
            return

        # read the original questions
        questions = pd.read_csv(
            path_QA,
            sep='\t',
            header=None,
            names=['question_sentence', 'answer_set', 'answer_path'])
        questions['answer'] = questions['answer_set'].apply(
            lambda x: x.split('(')[0])
        questions['q_split'] = questions['question_sentence'].apply(
            lambda x: x.lower().split(' '))
        questions['e_s'] = questions['answer_path'].apply(
            lambda x: x.split('#')[0])
        # build q_str by parsing question_sentence: tokenize it and replace the head entity e_s with <e>
        questions['q_str'] = [
            self.parse_question(row['question_sentence'].split('?')[0],
                                row['e_s'])
            for idx, row in questions.iterrows()
        ]

        # encode the questions
        # NOTE: for this small dataset we trade space for time so that questions are not re-embedded on every run; a large dataset would need its own preprocessing
        questions['q'] = questions['q_str'].apply(
            lambda q: self.embed_question(q))

        question_list = questions[['q_str', 'q', 'e_s',
                                   'answer']].values.tolist()
        question_list = [tuple(x) for x in question_list]
        self.questions = question_list
        print('{} questions loaded'.format(len(question_list)))

        if using_cache:
            Utility.Binary.save('dataset', question_list)
Example #3
    def __init__(self, vocab_size, d_model, N, heads, device, weight_matrix):
        super().__init__()
        self.N = N
        self.embed = Embedder(weight_matrix).to(device)
        self.linear = nn.Linear(weight_matrix.shape[1], d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = self.get_clones(EncoderLayer(d_model, heads, 0.3), N)
        self.norm = Norm(d_model)
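
get_clones is not defined in this snippet. A common way to implement it, written here as a free function and only a sketch (not necessarily this repository's version), is to deep-copy the layer N times into an nn.ModuleList:

import copy
import torch.nn as nn

def get_clones(module, N):
    # N independent copies of the same layer, each registered as a submodule.
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])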
Example #4
    def __init__(self, vocab_file, embed_dim, filter_num, filter_sizes,
                 drop_rate, sen_len):
        self.embedding_ = Embedder(vocab_file, embed_dim)
        self.textcnn_ = TextCnn(filter_num, filter_sizes, drop_rate, embed_dim,
                                sen_len)
        self.textmatch_ = TextMatch(filter_num, filter_sizes, drop_rate,
                                    embed_dim, sen_len)
        self.sen_len = sen_len
        self.__call__()
Example #5
class Dataset:
    '''
    Dataset:

        each record is [question string, question embedding, head entity, answer]
    '''
    def __init__(self, path_KG, path_QA, split_ratio=0.8, using_cache=True):
        self.KG = KnowledgeGraph(path_KG)
        self.embedder = Embedder()
        self.training = True  # indicates whether we are in the training phase
        self._iter_i = 0
        self._split_ratio = split_ratio

        # try to load from cache
        if using_cache and Utility.Binary.exists('dataset'):
            self.questions = Utility.Binary.load('dataset')
            print('{} questions loaded'.format(len(self.questions)))
            return

        # read the original questions
        questions = pd.read_csv(
            path_QA,
            sep='\t',
            header=None,
            names=['question_sentence', 'answer_set', 'answer_path'])
        questions['answer'] = questions['answer_set'].apply(
            lambda x: x.split('(')[0])
        questions['q_split'] = questions['question_sentence'].apply(
            lambda x: x.lower().split(' '))
        questions['e_s'] = questions['answer_path'].apply(
            lambda x: x.split('#')[0])
        # build q_str by parsing question_sentence: tokenize it and replace the head entity e_s with <e>
        questions['q_str'] = [
            self.parse_question(row['question_sentence'].split('?')[0],
                                row['e_s'])
            for idx, row in questions.iterrows()
        ]

        # encode the questions
        # NOTE: for this small dataset we trade space for time so that questions are not re-embedded on every run; a large dataset would need its own preprocessing
        questions['q'] = questions['q_str'].apply(
            lambda q: self.embed_question(q))

        question_list = questions[['q_str', 'q', 'e_s',
                                   'answer']].values.tolist()
        question_list = [tuple(x) for x in question_list]
        self.questions = question_list
        print('{} questions loaded'.format(len(question_list)))

        if using_cache:
            Utility.Binary.save('dataset', question_list)

    def embed_question(self, question):
        n, idx = len(question), 0
        q_emb = torch.zeros((n, ExpSet.word_embedding_dimension))
        for word in question:
            if word == '<e>':
                continue
            w_emb = self.embedder.get_word_embedding(word)
            if w_emb is not None:
                q_emb[idx] = w_emb
                idx = idx + 1
        return q_emb[:idx]

    def embed_relation(self, relation):
        return self.embedder.get_relation_embedding(relation)

    def parse_question(self, question: str, e_s: str):
        '''
        Split the question into a list of word-level tokens, find the head entity (e_s)
        (the longest word in the question that appears among the KG entities) and replace it with <e>.

        :param question: question string
        :param e_s: head entity
        :return: list of question tokens (strings)
        '''
        modified_question_list = []
        for item in question.split(' '):
            if item == e_s:
                modified_question_list.append('<e>')
            else:
                # split('_') always yields at least one piece, so this also
                # handles plain words without underscores
                for x in item.split('_'):
                    if x != '':
                        modified_question_list.append(x)
        return modified_question_list

    def __iter__(self):
        return self

    def __next__(self):
        try:
            d = self[self._iter_i]
            self._iter_i = self._iter_i + 1
            return d
        except IndexError:
            self._iter_i = 0
            raise StopIteration()

    def __getitem__(self, item):
        if item >= self.size:
            raise IndexError(
                'index out of bound, size={}, item={}, training={}'.format(
                    self.size, item, self.training))
        if self.training:
            return self.questions[item]
        return self.questions[self.training_size + item]

    def __len__(self):
        return self.size

    @property
    def size(self):
        if self.training:
            return self.training_size
        return self.testing_size

    @property
    def data_size(self):
        return len(self.questions)

    @property
    def testing_size(self):
        return self.data_size - self.training_size

    @property
    def training_size(self):
        return int(self._split_ratio * self.data_size)

    def train(self, _train=True):
        self.training = _train
        self._iter_i = 0
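
A minimal usage sketch for this class; the file paths and the loop body are hypothetical, only the Dataset API shown above is assumed:

# Hypothetical paths; the real files depend on the project layout.
dataset = Dataset('data/kg.txt', 'data/qa.txt', split_ratio=0.8)

# Iterate over the training split (the default mode).
dataset.train(True)
for q_str, q, e_s, answer in dataset:
    pass  # feed (q, e_s) into the model and compare its prediction with answer

# Switch to the held-out split for evaluation.
dataset.train(False)
print('test questions:', len(dataset))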
Example #6
                                        max_length_en)
    target_tensor = tensor_from_sentence("de", output_lang, pair[1],
                                         max_length_de)
    return input_tensor, target_tensor


language, total_data = data_generator(batch_size, 20, device)
train_data, test_data, y_train, y_test = train_test_split(
    total_data, np.zeros(len(total_data)), test_size=0.1, random_state=42)

d_model = 128
heads = 8
N = 6
src_vocab = language.n_words
trg_vocab = language.n_words
en_weight_matrix = Embedder.initial_weights_matrix(
    "word_vector/glove.6B.300d.txt", language, 300)

model = Transformer(src_vocab, trg_vocab, d_model, N, heads, device,
                    en_weight_matrix, en_weight_matrix)
try:
    model.load_state_dict(
        torch.load("model/transformer.pt", map_location=device))
    model.eval()
except (FileNotFoundError, RuntimeError):
    print("no weights exist")
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
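
The checkpoint this block tries to load would normally be written after training with the standard PyTorch call (not shown in the example):

torch.save(model.state_dict(), "model/transformer.pt")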
Example #7
                      max_length_de):
    input_tensor = tensor_from_sentence("en", input_lang, pair[0],
                                        max_length_en)
    target_tensor = tensor_from_sentence("de", output_lang, pair[1],
                                         max_length_de)
    return input_tensor, target_tensor


input_lang, output_lang, _ = prepare_data(lang1, lang2, 40)

d_model = 128
heads = 8
N = 6
src_vocab = input_lang.n_words
trg_vocab = output_lang.n_words
en_weight_matrix = Embedder.initial_weights_matrix(
    "word_vector/glove.6B.300d.txt", input_lang, 300)
de_weight_matrix = Embedder.initial_weights_matrix(
    "word_vector/vn_word2vec_300d.txt", output_lang, 300)

model = Transformer(src_vocab, trg_vocab, d_model, N, heads, device,
                    en_weight_matrix, de_weight_matrix)
model.load_state_dict(torch.load("model/transformer.pt", map_location=device))


def translate(model, sentence, lang_input, lang_output, max_len=80):
    model.eval()

    src = tensor_from_sentence("en", lang_input, sentence, len(sentence))
Example #8


n_way = 3
n_support = 5
n_query = 5
max_length = 75


support, query, label = batch_maker(1, n_way, n_support, n_query, max_length, train['encoded'], train['label'])


from embedding import Embedder


model = Embedder(r.word_vec_tot, 10)

print('Support: ', support['pos'].shape)

output = model(support['pos'])
query1 = model(query['pos'])

print('Label', label)

print('Support Shape Embedding', output.shape)

output = output.view(3, 5, 750)    # (n_way, n_support, flattened embedding)


query1 = query1.view(15, 1, 750)   # (n_way * n_query, 1, flattened embedding)
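
What usually follows such reshaping in a few-shot setup is forming per-class prototypes and classifying the queries by distance; a sketch under the assumption that prototypes are simply the mean of the support embeddings (this step is not shown in the example):

import torch

# Average the 5 support embeddings of each of the 3 classes into one prototype.
prototypes = output.mean(dim=1)                            # (3, 750)

# Euclidean distance from every query embedding to every prototype.
distances = torch.cdist(query1.view(15, 750), prototypes)  # (15, 3)
predictions = distances.argmin(dim=1)                      # nearest prototype per query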