Example #1
    def read_split(self, name, questions):
        split_path = self.file_path('{}.txt'.format(name))
        datapoints = []
        split_questions = []
        for line in self.read_tsv(split_path, is_gzip=False):
            query_question = questions[line[0]]
            ground_truth = [questions[gt_id] for gt_id in line[1].split()]
            pool = [questions[pa_id] for pa_id in line[2].split()]
            np.random.shuffle(pool)
            datapoints.append(QAPool(query_question, pool, ground_truth))

            split_questions += [query_question] + ground_truth + pool

        # we filter out all pools that do not contain any ground truth answer (except train!)
        if name != 'train_random':
            qa_pools_len_before = len(datapoints)
            datapoints = [
                p for p in datapoints
                if any(gt in p.pooled_answers for gt in p.ground_truth)
            ]
            qa_pools_len_after = len(datapoints)
            self.logger.info(
                "Split {} reduced to {} item from {} due to missing ground truth in pool"
                .format(name, qa_pools_len_after, qa_pools_len_before))

        return Data('askubuntu / {}'.format(name), datapoints,
                    unique_items(split_questions))
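
Every example on this page funnels its question and answer lists through a unique_items helper that is not shown here. A minimal sketch of what such a helper could look like, assuming it simply de-duplicates while preserving order (the project's real implementation may instead compare items by an id attribute):

def unique_items(items):
    # Guessed stand-in for the helper used above: order-preserving de-duplication.
    # The real implementation may differ (e.g. it could hash an item id instead).
    seen = set()
    result = []
    for item in items:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result

print(unique_items(['a', 'b', 'a', 'c']))  # ['a', 'b', 'c']
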
Example #2
    def shrink(self, split, max_len):
        """Reduces the size of a split of the archive

        :param split: name of the split to shrink ('train' or 'valid')
        :param max_len: maximum number of QA pools to keep in that split
        :return: None
        """
        split = {'train': self.train, 'valid': self.valid}[split]
        split.shrink(max_len)

        self.questions = self.train.questions + self.valid.questions + [q for t in self.test for q in t.questions]
        self.answers = unique_items(self.train.answers + self.valid.answers + [a for t in self.test for a in t.answers])
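Example #3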
    def vocab(self):
        """
        :rtype: set
        """
        if self._vocab is None:
            self._vocab = []
            for question in self.questions:
                self._vocab += question.vocab
            for answer in self.answers:
                self._vocab += answer.vocab
            self._vocab = unique_items(self._vocab)

        return self._vocab
Example #4
    def shrink(self, max_size, offset):
        start = (offset * max_size) % len(self.qa)
        end = ((offset + 1) * max_size) % len(self.qa)

        new_qa = []
        if start > end:
            new_qa += self.qa[start:]
            start = 0
        new_qa += self.qa[start:end]
        self.qa = new_qa

        self.answers = unique_items([a for sl in self.qa for a in
                                     (sl.pooled_answers if sl.pooled_answers is not None else sl.ground_truth)])
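
The start/end arithmetic in Example #4 picks a window of max_size pools beginning at offset * max_size and wraps around the end of self.qa when needed. A standalone sketch with made-up numbers to illustrate the wrap-around:

qa = list(range(10))       # stand-in for the list of QAPool objects
max_size, offset = 4, 2    # made-up values

start = (offset * max_size) % len(qa)        # 8
end = ((offset + 1) * max_size) % len(qa)    # 12 % 10 == 2

window = []
if start > end:            # the window wraps past the end of the list
    window += qa[start:]   # [8, 9]
    start = 0
window += qa[start:end]    # [0, 1]
print(window)              # [8, 9, 0, 1]
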
Example #5
    def vocab(self):
        """
        :rtype: set
        """
        if self._vocab is None:
            self._vocab = []
            for question in self.questions:
                self._vocab += question.vocab
            for answer in self.answers:
                self._vocab += answer.vocab
            for answer in self.additional_answers:
                self._vocab += answer.vocab
            for questions in self.generated_questions.values():
                for question in questions:
                    self._vocab += question.vocab

            self._vocab = unique_items(self._vocab)

        return self._vocab
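Example #6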
    def read_split(self, name, questions):
        file_path = self.file_path('{}.tsv.gz'.format(name))

        datapoints = []
        split_questions = []
        for line in self.read_tsv(file_path, is_gzip=True):
            question_id = line[0]
            query = questions[question_id + '_title']
            truth = questions[question_id + '_body']
            pool = []
            if len(line) > 1:
                pool = [
                    questions[neg_id + '_body'] for neg_id in line[1].split()
                ]
                np.random.shuffle(pool)

            split_questions += [query, truth] + pool
            datapoints.append(QAPool(query, pool, [truth]))

        return Data(
            'SE({}) / {}'.format(os.path.basename(self.archive_path), name),
            datapoints, unique_items(split_questions))
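
Judging from the parsing above, each row of the gzipped TSV carries the question id in the first column and, optionally, a space-separated list of negative ids in the second, while questions is keyed by '<id>_title' and '<id>_body'. A hypothetical miniature of those inputs (ids and text invented; the real values are question objects, not plain strings):

questions = {
    '123_title': 'How do I mount a USB drive?',       # the query
    '123_body':  'I plugged in a USB drive and ...',  # the single ground-truth item
    '456_body':  'Some unrelated question body',      # pooled negative
    '789_body':  'Another unrelated question body',   # pooled negative
}
tsv_row = ['123', '456 789']  # column 0: question id; column 1 (optional): negative ids
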
Example #7
    def setup(self):
        readers = self._get_train_readers()
        self.logger.info('Loading {} train datasets'.format(len(readers)))
        archives = [reader.read() for reader in readers]
        self.archive = archives[0]

        if self.max_train_samples:
            for archive in archives:
                archive.shrink('train', self.max_train_samples)
            self.logger.info(
                'Reduced the training samples of all data archives to a maximum of {}'
                .format(self.max_train_samples))

        if self.config.get('balance_data') is True:
            max_len_train = min([len(a.train.qa) for a in archives])
            max_len_dev = min([len(a.valid.qa) for a in archives])
            for archive in archives:
                archive.shrink('train', max_len_train)
                archive.shrink('valid', max_len_dev)
                # archive.train.qa = archive.train.qa[:max_len_train]
                # archive.valid.qa = archive.valid.qa[:max_len_dev]

            self.logger.info(
                'Balanced all data archives to maximum length for train={}, dev={}'
                .format(max_len_train, max_len_dev))

        for other_archive in archives[1:]:
            self.archive = self.archive.combine(other_archive)
        self.logger.debug(
            'Train dataset questions: train={}, dev={}, test={}'.format(
                len(self.archive.train.qa), len(self.archive.valid.qa),
                [len(t.qa) for t in self.archive.test]))

        qas = self.archive.train.qa + self.archive.valid.qa
        for t in self.archive.test:
            qas += t.qa
        self.logger.debug('Mean answer count={}'.format(
            np.mean([len(p.ground_truth) for p in qas])))
        self.logger.debug('Mean poolsize={}'.format(
            np.mean([
                len(p.pooled_answers) for p in qas
                if p.pooled_answers is not None
            ])))

        self.transfer_archives = [
            r.read() for r in self._get_transfer_readers()
        ]
        if self.transfer_archives:
            self.logger.debug(
                'Transfer datasets with test questions: {}'.format(', '.join([
                    '{}={}'.format(a.name, [len(t.qa) for t in a.test])
                    for a in self.transfer_archives
                ])))

        if 'embeddings_path' in self.config:
            # load the initial embeddings
            self.logger.info('Fetching the dataset vocab')
            vocab = unique_items(
                self.archive.vocab +
                [b for a in self.transfer_archives for b in a.vocab])
            self.logger.info('Loading embeddings (vocab size={})'.format(
                len(vocab)))

            embeddings_paths = self.config['embeddings_path']
            if isinstance(embeddings_paths, str):
                embeddings_paths = [embeddings_paths]

            embeddings_dicts = [
                read_embeddings(p, vocab, self.logger)
                for p in embeddings_paths
            ]
            embeddings_dicts_sizes = [
                len(next(itervalues(ed))) for ed in embeddings_dicts
            ]
            embedding_size = sum(embeddings_dicts_sizes)

            zero_padding = np.zeros((embedding_size, ))
            oov = np.random.uniform(-1.0, 1.0, [
                embedding_size,
            ])
            # oov = np.zeros([embedding_size, ])
            embeddings = [zero_padding, oov]

            n_oov = 0
            for token in self.archive.vocab:
                embedding_dict_items = [
                    ed.get(token, None) for ed in embeddings_dicts
                ]
                is_oov = all(v is None for v in embedding_dict_items)
                embedding_dict_items = [
                    x if x is not None else np.random.uniform(
                        -1.0, 1.0, [
                            embeddings_dicts_sizes[i],
                        ]) for (i, x) in enumerate(embedding_dict_items)
                ]

                if not is_oov:
                    self.vocab_to_index[token] = len(embeddings)
                    embedding = np.hstack(embedding_dict_items)
                    if len(embedding) == embedding_size:
                        embeddings.append(embedding)
                    else:
                        embeddings.append(
                            np.random.uniform(-1.0, 1.0, [
                                embedding_size,
                            ]))
                else:
                    n_oov += 1
                    if self.map_oov:
                        self.vocab_to_index[token] = 1  # oov
                    else:
                        # for each oov, we create a new random vector
                        self.vocab_to_index[token] = len(embeddings)
                        embeddings.append(
                            np.random.uniform(-1.0, 1.0, [
                                embedding_size,
                            ]))

            self.embeddings = np.array(embeddings)
            self.logger.info('OOV tokens: {}'.format(n_oov))

        else:
            embedding_size = self.config_global['embedding_size']
            self.vocab_to_index = dict([
                (t, i) for (i, t) in enumerate(self.archive.vocab, start=1)
            ])
            self.embeddings = np.append(
                np.zeros((1, embedding_size)),  # zero-padding
                np.random.uniform(-1.0, 1.0,
                                  [len(self.archive.vocab), embedding_size]),
                axis=0)
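
Example #7 reserves row 0 of self.embeddings for zero padding and row 1 for a shared OOV vector (used when map_oov is set); every other token is reached through vocab_to_index. A hedged sketch of how such a matrix is typically consumed downstream (the lookup helper below is an illustration, not code from this project):

import numpy as np

def lookup(tokens, vocab_to_index, embeddings, oov_index=1):
    # Illustrative helper: map a token sequence to its embedding rows,
    # falling back to the shared OOV row for unknown tokens.
    indices = [vocab_to_index.get(t, oov_index) for t in tokens]
    return embeddings[indices]  # shape: (len(tokens), embedding_size)

# tiny made-up example: row 0 = padding, rows 1-3 = random vectors
vocab_to_index = {'usb': 2, 'drive': 3}
embeddings = np.vstack([np.zeros(4), np.random.uniform(-1.0, 1.0, (3, 4))])
print(lookup(['usb', 'unknownword'], vocab_to_index, embeddings).shape)  # (2, 4)

Example #8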
    def vocab(self):
        vocab = []
        for sentence in self.sentences:
            vocab += sentence.vocab
        return unique_items(vocab)
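Example #9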
    def vocab(self):
        vocab = []
        for token in self.tokens:
            vocab.append(token.text)
        return unique_items(vocab)
Example #10
    def shrink(self, max_size):
        self.qa = self.qa[:max_size]
        self.answers = unique_items([a for sl in self.qa for a in
                                     (sl.pooled_answers if sl.pooled_answers is not None else sl.ground_truth)])
Example #11
    def vocab(self):
        return unique_items([t.text for t in self.tokens])
Example #12
    def read_split(self, name, questions):
        file_path = self.file_path('{}.tsv.gz'.format(name))
        question_keys = [
            k for k in questions.keys()
            if k.endswith('body') or k.endswith('answer')
        ]

        datapoints = []
        split_questions = []
        for line in self.read_tsv(file_path, is_gzip=True):
            question_id = line[0]
            query = questions[question_id + '_title']
            truth = [questions[question_id + '_body']]
            if name == 'train':
                # we also include answers and bodies of duplicates, if they are in the dataset
                answer = questions.get(question_id + '_answer')
                if answer:
                    truth.append(answer)
                for dup_id in query.metadata['duplicates']:
                    other_gt = [
                        questions.get(dup_id + '_body'),
                        questions.get(dup_id + '_answer')
                    ]
                    for ti in other_gt:
                        if ti:
                            truth.append(ti)

            pool = []
            if len(line) > 1:
                pool = [
                    questions[neg_id + '_body'] for neg_id in line[1].split()
                ]
                #np.random.shuffle(pool)
            else:
                # create a pool with random answers plus gt
                pool = (truth + [
                    questions[neg_id]
                    for neg_id in random.sample(question_keys, 20)
                ])[:20]

            split_questions += [query] + truth + pool
            datapoints.append(QAPool(query, pool, truth))

        # We filter out all pools that do not contain any ground truth answer (i.e., the question body).
        # This can happen, e.g., if the dataset is not in English but the retrieval model was English-only.
        if name != 'train':
            qa_pools_len_before = len(datapoints)
            datapoints = [
                p for p in datapoints
                if any(gt in p.pooled_answers for gt in p.ground_truth)
            ]
            qa_pools_len_after = len(datapoints)
            self.logger.info(
                "Split {} reduced to {} item from {} due to missing ground truth in pool"
                .format(name, qa_pools_len_after, qa_pools_len_before))

        return Data(
            'SE({}) / {}'.format(os.path.basename(self.archive_path), name),
            datapoints, unique_items(split_questions))
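
For the train split, Example #12 widens the ground truth beyond the question body to the accepted answer and to the bodies and answers of known duplicates. A hypothetical miniature of that expansion with invented ids and metadata (the real duplicates list would come from query.metadata['duplicates']):

questions = {
    '11_title':  'query title',
    '11_body':   'query body',          # always ground truth
    '11_answer': 'accepted answer',     # added on the train split if present
    '22_body':   'duplicate body',      # bodies/answers of duplicates are added too
    '22_answer': 'duplicate answer',
}
duplicates = ['22']

truth = [questions['11_body']]
answer = questions.get('11_answer')
if answer:
    truth.append(answer)
for dup_id in duplicates:
    for item in (questions.get(dup_id + '_body'), questions.get(dup_id + '_answer')):
        if item:
            truth.append(item)
print(truth)  # ['query body', 'accepted answer', 'duplicate body', 'duplicate answer']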