示例#1
0
    def _process_input(self, data_raw):
        """Turn raw QA records into parallel per-example lists.

        Args:
            data_raw: iterable of records indexable by "C" (context text),
                "Q" (question text) and "A" (answer word).

        Returns:
            Tuple ``(inputs, questions, answers, fact_counts, input_masks)``.
            Each element is a list with one entry per record:
            the ``utils.process_word(..., to_return="word2vec")`` results for
            the context tokens, the same for the question tokens, the
            ``to_return="index"`` result for the answer, the length of the
            mask, and the mask itself.

        Raises:
            Exception: if ``self.input_mask_mode`` is neither 'word' nor
                'sentence'.
        """
        def tokenize(text):
            # Lower-case, split on single spaces, and drop the empty pieces
            # that runs of consecutive spaces leave behind.
            return [piece for piece in text.lower().split(' ') if piece]

        def lookup(token, to_return):
            # One place for the utils.process_word call shared by context,
            # question and answer processing.
            return utils.process_word(word=token,
                                      word2vec=self.word2vec,
                                      vocab=self.vocab,
                                      ivocab=self.ivocab,
                                      word_vector_size=self.word_vector_size,
                                      to_return=to_return)

        inputs, questions, answers = [], [], []
        fact_counts, input_masks = [], []

        for record in data_raw:
            context_tokens = tokenize(record["C"])
            question_tokens = tokenize(record["Q"])

            context_vectors = [lookup(tok, "word2vec") for tok in context_tokens]
            question_vectors = [lookup(tok, "word2vec") for tok in question_tokens]

            mode = self.input_mask_mode
            if mode == 'word':
                # Every token position counts as a fact.
                mask = range(len(context_tokens))
            elif mode == 'sentence':
                # Only positions holding a standalone '.' token count.
                mask = [pos for pos, tok in enumerate(context_tokens) if tok == '.']
            else:
                raise Exception("unknown input_mask_mode")

            inputs.append(context_vectors)
            questions.append(question_vectors)
            # NOTE: here we assume the answer is one word!
            answers.append(lookup(record["A"], "index"))
            fact_counts.append(len(mask))
            input_masks.append(mask)

        return inputs, questions, answers, fact_counts, input_masks
示例#2
0
    def _process_input(self, data_raw):
        """Encode raw QA records at character level and build their masks.

        Args:
            data_raw: iterable of records indexable by "C" (context text),
                "Q" (question text) and "A" (answer word).

        Returns:
            Tuple ``(inputs, questions, answers, input_masks)`` of lists with
            one entry per record: the ``utils.get_one_hot_doc`` encoding of
            the context, the same for the question, the
            ``utils.process_word(..., to_return="index")`` result for the
            answer, and an int32 numpy array of mask positions.

        Raises:
            Exception: if ``self.input_mask_mode`` is neither 'word' nor
                'sentence'.
        """
        inputs, questions, answers, input_masks = [], [], [], []

        for record in data_raw:
            context_doc = utils.get_one_hot_doc(
                record["C"], self.char_vocab, max_length=self.max_doc_length)
            question_doc = utils.get_one_hot_doc(
                record["Q"], self.char_vocab, max_length=self.max_doc_length)

            # NOTE: here we assume the answer is one word!
            answer_index = utils.process_word(
                word=record["A"],
                word2vec=self.word2vec,
                vocab=self.vocab,
                ivocab=self.ivocab,
                word_vector_size=self.word_vector_size,
                to_return="index")
            answers.append(answer_index)

            mode = self.input_mask_mode
            if mode == 'word':
                # One mask entry per element of the encoded context.
                positions = [pos for pos, _ in enumerate(context_doc)]
            elif mode == 'sentence':
                # NOTE(review): elements of the encoded context are compared
                # with '.', but what utils.get_one_hot_doc yields is not
                # visible here -- confirm this can ever match.
                positions = [pos for pos, item in enumerate(context_doc)
                             if item == '.']
            else:
                raise Exception("invalid input_mask_mode")
            input_masks.append(np.array(positions, dtype=np.int32))

            inputs.append(context_doc)
            questions.append(question_doc)

        return inputs, questions, answers, input_masks
示例#3
0
    def _process_input(self, data_raw):
        """Build character-level inputs, questions, answers and masks.

        Args:
            data_raw: iterable of records indexable by "C" (context text),
                "Q" (question text) and "A" (answer word).

        Returns:
            Tuple ``(inputs, questions, answers, input_masks)``. Each element
            is a list with one entry per record: the encoded context and
            question as returned by ``utils.get_one_hot_doc``, the answer as
            returned by ``utils.process_word(..., to_return="index")``, and
            the mask positions as an int32 numpy array.

        Raises:
            Exception: if ``self.input_mask_mode`` is neither "word" nor
                "sentence".
        """
        inputs = []
        questions = []
        answers = []
        input_masks = []

        for sample in data_raw:
            doc = utils.get_one_hot_doc(sample["C"], self.char_vocab, max_length=self.max_doc_length)
            query = utils.get_one_hot_doc(sample["Q"], self.char_vocab, max_length=self.max_doc_length)

            # NOTE: here we assume the answer is one word!
            answers.append(
                utils.process_word(
                    word=sample["A"],
                    word2vec=self.word2vec,
                    vocab=self.vocab,
                    ivocab=self.ivocab,
                    word_vector_size=self.word_vector_size,
                    to_return="index",
                )
            )

            mask_mode = self.input_mask_mode
            if mask_mode == "word":
                # Every position of the encoded context is a mask entry.
                kept = [i for i, _ in enumerate(doc)]
            elif mask_mode == "sentence":
                # NOTE(review): this compares elements of the encoded context
                # with "."; the element type produced by
                # utils.get_one_hot_doc is not visible here -- verify.
                kept = [i for i, element in enumerate(doc) if element == "."]
            else:
                raise Exception("invalid input_mask_mode")
            input_masks.append(np.array(kept, dtype=np.int32))

            inputs.append(doc)
            questions.append(query)

        return inputs, questions, answers, input_masks
示例#4
0
    def _process_input(self, data_raw):
        """Tokenize raw QA records, vectorize them and compute input masks.

        Records that contain no sentence delimiter are skipped entirely when
        ``self.input_mask_mode == 'sentence'``, so the four returned lists
        always have the same length and stay index-aligned.

        Args:
            data_raw: iterable of records indexable by "C" (context text),
                "Q" (question text) and "A" (answer word).

        Returns:
            Tuple ``(inputs, questions, answers, input_masks)`` of lists with
            one entry per retained record: the stacked ``floatX`` array built
            from ``utils.process_sent`` for the context, the same for the
            question, the ``utils.process_word(..., to_return="bool")``
            result for the answer, and an int32 numpy array of mask positions.

        Raises:
            Exception: if ``self.input_mask_mode`` is neither 'word' nor
                'sentence'.
        """
        def tokenize(text):
            # Lower-case, split on single spaces, drop empty pieces.
            return [piece for piece in text.lower().split(' ') if piece]

        def vectorize(tokens):
            # Shared utils.process_sent call for context and question.
            return utils.process_sent(
                tokens,
                word2vec=self.word2vec,
                vocab=self.vocab,
                ivocab=self.ivocab,
                word_vector_size=self.word_vector_size,
                to_return=self.to_return,
                silent=True,
                encoder_decoder=self.encoder_decoder,
                vocab_dict=self.vocab_dict)

        inputs = []
        answers = []
        input_masks = []
        questions = []
        # Sentence punctuation delimiters; they only match when they appear
        # as standalone, space-separated tokens.
        punkt = ('.', '?', '!')

        for x in data_raw:
            inp = tokenize(x["C"])
            q = tokenize(x["Q"])

            if self.input_mask_mode == 'word':
                # One mask entry per context token.
                mask = np.arange(len(inp), dtype=np.int32)
            elif self.input_mask_mode == 'sentence':
                mask = np.array(
                    [index for index, w in enumerate(inp) if w in punkt],
                    dtype=np.int32)
                if len(mask) < 1:
                    # No delimiter token was found, which would cause an
                    # error later. Skip the record BEFORE anything is
                    # appended so that no orphan mask is left behind.
                    print("Passing over data: ", x["C"])
                    continue
            else:
                raise Exception("invalid input_mask_mode")

            # Process the documents
            inp_vector = vectorize(inp)
            q_vector = vectorize(q)

            inputs.append(np.vstack(inp_vector).astype(floatX))
            questions.append(np.vstack(q_vector).astype(floatX))
            # NOTE: here we assume the answer is one word!
            answers.append(
                utils.process_word(word=x["A"],
                                   word2vec=self.word2vec,
                                   vocab=self.vocab,
                                   ivocab=self.ivocab,
                                   word_vector_size=self.word_vector_size,
                                   to_return="bool"))
            # Appended last so the mask list stays aligned with the others.
            input_masks.append(mask)

        return inputs, questions, answers, input_masks
示例#5
0
文件: dmn_basic.py 项目: Lab41/pythia
    def _process_input(self, data_raw):
        """Process raw QA records into vectors, answers and input masks.

        In 'sentence' mode a record whose context has no sentence delimiter
        token is dropped from ALL outputs, so the returned lists are always
        the same length and index-aligned.

        Args:
            data_raw: iterable of records indexable by "C" (context text),
                "Q" (question text) and "A" (answer word).

        Returns:
            Tuple ``(inputs, questions, answers, input_masks)``; each is a
            list with one entry per retained record. ``inputs`` and
            ``questions`` hold ``np.vstack`` of the ``utils.process_sent``
            output cast to ``floatX``; ``answers`` holds the
            ``utils.process_word(..., to_return="bool")`` result;
            ``input_masks`` holds int32 numpy arrays of token positions.

        Raises:
            Exception: if ``self.input_mask_mode`` is neither 'word' nor
                'sentence'.
        """
        inputs = []
        answers = []
        input_masks = []
        questions = []
        # Sentence punctuation delimiters (loop-invariant, so built once).
        # They are matched against whole space-separated tokens.
        punkt = ('.', '?', '!')

        for x in data_raw:
            inp = [w for w in x["C"].lower().split(' ') if w]
            q = [w for w in x["Q"].lower().split(' ') if w]

            if self.input_mask_mode == 'word':
                # Get the input_masks for the data: every token position.
                mask = np.arange(len(inp), dtype=np.int32)
            elif self.input_mask_mode == 'sentence':
                mask = np.array([index for index, w in enumerate(inp) if w in punkt], dtype=np.int32)
                if len(mask) < 1:
                    # Pass over the data if no delimiter token was found, as
                    # this will cause an error later. Nothing has been
                    # appended yet, so the outputs stay aligned.
                    print("Passing over data: ", x["C"])
                    continue
            else:
                raise Exception("invalid input_mask_mode")

            # Process the documents
            inp_vector = utils.process_sent(inp, word2vec=self.word2vec, vocab=self.vocab, ivocab=self.ivocab,
                                            word_vector_size=self.word_vector_size, to_return=self.to_return, silent=True,
                                            encoder_decoder=self.encoder_decoder, vocab_dict=self.vocab_dict)

            q_vector = utils.process_sent(q, word2vec=self.word2vec, vocab=self.vocab, ivocab=self.ivocab,
                                          word_vector_size=self.word_vector_size, to_return=self.to_return, silent=True,
                                          encoder_decoder=self.encoder_decoder, vocab_dict=self.vocab_dict)

            inputs.append(np.vstack(inp_vector).astype(floatX))
            questions.append(np.vstack(q_vector).astype(floatX))
            # NOTE: here we assume the answer is one word!
            answers.append(utils.process_word(word=x["A"],
                                              word2vec=self.word2vec,
                                              vocab=self.vocab,
                                              ivocab=self.ivocab,
                                              word_vector_size=self.word_vector_size,
                                              to_return="bool"))
            # The mask is stored only for records that were fully processed.
            input_masks.append(mask)

        return inputs, questions, answers, input_masks