def read_split(self, name, questions):
    """Reads the given split file and builds QA pools from the question dict."""
    split_path = self.file_path('{}.txt'.format(name))

    datapoints = []
    split_questions = []
    for line in self.read_tsv(split_path, is_gzip=False):
        query_question = questions[line[0]]
        ground_truth = [questions[gt_id] for gt_id in line[1].split()]
        pool = [questions[pa_id] for pa_id in line[2].split()]
        np.random.shuffle(pool)
        datapoints.append(QAPool(query_question, pool, ground_truth))
        split_questions += [query_question] + ground_truth + pool

    # we filter out all pools that do not contain any ground truth answer (except train!)
    if name != 'train_random':
        qa_pools_len_before = len(datapoints)
        datapoints = [
            p for p in datapoints
            if len([1 for gt in p.ground_truth if gt in p.pooled_answers]) > 0
        ]
        qa_pools_len_after = len(datapoints)
        self.logger.info(
            'Split {} reduced to {} items (from {}) due to missing ground truth in pool'.format(
                name, qa_pools_len_after, qa_pools_len_before))

    return Data('askubuntu / {}'.format(name), datapoints, unique_items(split_questions))
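# read_split (and most of the code below) relies on a unique_items helper that is not
# part of this excerpt. The following is only a minimal sketch of an order-preserving
# de-duplication that would be compatible with how the helper is used here (hashable
# items in, list out); the actual implementation in the repository may differ.
def unique_items(items):
    """Return the items with duplicates removed, keeping first-seen order."""
    seen = set()
    result = []
    for item in items:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result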
def shrink(self, split, max_len):
    """Reduces the size of a split of the archive

    :param split: name of the split to shrink ('train' or 'valid')
    :param max_len: maximum number of QA pools to keep in that split
    :return:
    """
    split = {'train': self.train, 'valid': self.valid}[split]
    split.shrink(max_len)
    self.questions = (self.train.questions + self.valid.questions
                      + [q for t in self.test for q in t.questions])
    self.answers = unique_items(self.train.answers + self.valid.answers
                                + [a for t in self.test for a in t.answers])
def vocab(self):
    """
    :rtype: set
    """
    if self._vocab is None:
        self._vocab = []
        for question in self.questions:
            self._vocab += question.vocab
        for answer in self.answers:
            self._vocab += answer.vocab
        self._vocab = unique_items(self._vocab)
    return self._vocab
def shrink(self, max_size, offset):
    # select a window of max_size QA pools starting at offset * max_size,
    # wrapping around the end of the list if necessary
    start = (offset * max_size) % len(self.qa)
    end = ((offset + 1) * max_size) % len(self.qa)

    new_qa = []
    if start > end:
        new_qa += self.qa[start:]
        start = 0
    new_qa += self.qa[start:end]
    self.qa = new_qa

    self.answers = unique_items([
        a for sl in self.qa
        for a in (sl.pooled_answers if sl.pooled_answers is not None else sl.ground_truth)
    ])
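# Illustration of the wrap-around window selected by shrink(max_size, offset): with ten
# QA pools, max_size=4 and offset=2 give start=8 and end=2, so pools 8, 9, 0 and 1 are
# kept. Below is a standalone sketch of the same index arithmetic (a hypothetical helper,
# not part of the class); note that start == end, e.g. when max_size is a multiple of the
# list length, yields an empty window.
def _windowed_slice(items, max_size, offset):
    """Select a window of up to max_size items, wrapping around the end of the list."""
    start = (offset * max_size) % len(items)
    end = ((offset + 1) * max_size) % len(items)
    window = []
    if start > end:
        window += items[start:]
        start = 0
    window += items[start:end]
    return window

# _windowed_slice(list(range(10)), max_size=4, offset=2) -> [8, 9, 0, 1]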
def vocab(self):
    """
    :rtype: set
    """
    if self._vocab is None:
        self._vocab = []
        for question in self.questions:
            self._vocab += question.vocab
        for answer in self.answers:
            self._vocab += answer.vocab
        for answer in self.additional_answers:
            self._vocab += answer.vocab
        for questions in self.generated_questions.values():
            for question in questions:
                self._vocab += question.vocab
        self._vocab = unique_items(self._vocab)
    return self._vocab
def read_split(self, name, questions):
    file_path = self.file_path('{}.tsv.gz'.format(name))

    datapoints = []
    split_questions = []
    for line in self.read_tsv(file_path, is_gzip=True):
        question_id = line[0]
        query = questions[question_id + '_title']
        truth = questions[question_id + '_body']
        pool = []
        if len(line) > 1:
            pool = [questions[neg_id + '_body'] for neg_id in line[1].split()]
            np.random.shuffle(pool)
        split_questions += [query, truth] + pool
        datapoints.append(QAPool(query, pool, [truth]))

    return Data(
        'SE({}) / {}'.format(os.path.basename(self.archive_path), name),
        datapoints, unique_items(split_questions))
def setup(self):
    readers = self._get_train_readers()
    self.logger.info('Loading {} train datasets'.format(len(readers)))
    archives = [reader.read() for reader in readers]
    self.archive = archives[0]

    if self.max_train_samples:
        for archive in archives:
            archive.shrink('train', self.max_train_samples)
        print('')
        self.logger.info(
            'Reduced the training samples of all data archives to a maximum of {}'.format(
                self.max_train_samples))

    if self.config.get('balance_data') is True:
        max_len_train = min([len(a.train.qa) for a in archives])
        max_len_dev = min([len(a.valid.qa) for a in archives])
        for archive in archives:
            archive.shrink('train', max_len_train)
            archive.shrink('valid', max_len_dev)
            # archive.train.qa = archive.train.qa[:max_len_train]
            # archive.valid.qa = archive.valid.qa[:max_len_dev]
        self.logger.info(
            'Balanced all data archives to maximum length for train={}, dev={}'.format(
                max_len_train, max_len_dev))

    for other_archive in archives[1:]:
        self.archive = self.archive.combine(other_archive)

    self.logger.debug('Train dataset questions: train={}, dev={}, test={}'.format(
        len(self.archive.train.qa), len(self.archive.valid.qa),
        [len(t.qa) for t in self.archive.test]))

    qas = self.archive.train.qa + self.archive.valid.qa
    for t in self.archive.test:
        qas += t.qa
    self.logger.debug('Mean answer count={}'.format(
        np.mean([len(p.ground_truth) for p in qas])))
    self.logger.debug('Mean poolsize={}'.format(
        np.mean([len(p.pooled_answers) for p in qas if p.pooled_answers is not None])))

    self.transfer_archives = [r.read() for r in self._get_transfer_readers()]
    if self.transfer_archives:
        self.logger.debug('Transfer datasets with test questions: {}'.format(', '.join([
            '{}={}'.format(a.name, [len(t.qa) for t in a.test])
            for a in self.transfer_archives
        ])))

    if 'embeddings_path' in self.config:
        # load the initial embeddings
        self.logger.info('Fetching the dataset vocab')
        vocab = unique_items(self.archive.vocab +
                             [b for a in self.transfer_archives for b in a.vocab])
        self.logger.info('Loading embeddings (vocab size={})'.format(len(vocab)))

        embeddings_paths = self.config['embeddings_path']
        if isinstance(embeddings_paths, str):
            embeddings_paths = [embeddings_paths]
        embeddings_dicts = [read_embeddings(p, vocab, self.logger) for p in embeddings_paths]
        embeddings_dicts_sizes = [len(next(itervalues(ed))) for ed in embeddings_dicts]
        embedding_size = sum(embeddings_dicts_sizes)

        zero_padding = np.zeros((embedding_size,))
        oov = np.random.uniform(-1.0, 1.0, [embedding_size, ])
        # oov = np.zeros([embedding_size, ])
        embeddings = [zero_padding, oov]
        n_oov = 0
        for token in self.archive.vocab:
            embedding_dict_items = [ed.get(token, None) for ed in embeddings_dicts]
            is_oov = all(v is None for v in embedding_dict_items)
            # replace missing sub-embeddings with random vectors of the matching size
            embedding_dict_items = [
                x if x is not None else np.random.uniform(-1.0, 1.0, [embeddings_dicts_sizes[i], ])
                for (i, x) in enumerate(embedding_dict_items)
            ]
            if not is_oov:
                self.vocab_to_index[token] = len(embeddings)
                embedding = np.hstack(embedding_dict_items)
                if len(embedding) == embedding_size:
                    embeddings.append(embedding)
                else:
                    embeddings.append(np.random.uniform(-1.0, 1.0, [embedding_size, ]))
            else:
                n_oov += 1
                if self.map_oov:
                    self.vocab_to_index[token] = 1  # oov
                else:
                    # for each oov, we create a new random vector
                    self.vocab_to_index[token] = len(embeddings)
                    embeddings.append(np.random.uniform(-1.0, 1.0, [embedding_size, ]))

        self.embeddings = np.array(embeddings)
        self.logger.info('OOV tokens: {}'.format(n_oov))
    else:
        embedding_size = self.config_global['embedding_size']
        self.vocab_to_index = dict([
            (t, i) for (i, t) in enumerate(self.archive.vocab, start=1)
        ])
        self.embeddings = np.append(
            np.zeros((1, embedding_size)),  # zero-padding
            np.random.uniform(-1.0, 1.0, [len(self.archive.vocab), embedding_size]),
            axis=0)
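# In the branch above that loads pre-trained embeddings, row 0 of self.embeddings is
# reserved for zero-padding and row 1 for the shared OOV vector (used when map_oov is
# set), while self.vocab_to_index maps each token to its row. The helper below is a
# hypothetical sketch (not part of this codebase) of how such a mapping is typically
# consumed to encode a token sequence into fixed-length index lists for batching:
def encode_tokens(token_texts, vocab_to_index, max_len, oov_index=1, padding_index=0):
    """Map token texts to embedding-row indices, truncating and padding to max_len."""
    indices = [vocab_to_index.get(t, oov_index) for t in token_texts[:max_len]]
    indices += [padding_index] * (max_len - len(indices))
    return indices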
def vocab(self):
    vocab = []
    for sentence in self.sentences:
        vocab += sentence.vocab
    return unique_items(vocab)
def vocab(self):
    vocab = []
    for token in self.tokens:
        vocab.append(token.text)
    return unique_items(vocab)
def shrink(self, max_size):
    self.qa = self.qa[:max_size]
    self.answers = unique_items([
        a for sl in self.qa
        for a in (sl.pooled_answers if sl.pooled_answers is not None else sl.ground_truth)
    ])
def vocab(self):
    return unique_items([t.text for t in self.tokens])
def read_split(self, name, questions):
    """Reads the given split file and builds QA pools from the question dict."""
    file_path = self.file_path('{}.tsv.gz'.format(name))
    question_keys = [
        k for k in questions.keys()
        if k.endswith('body') or k.endswith('answer')
    ]

    datapoints = []
    split_questions = []
    for line in self.read_tsv(file_path, is_gzip=True):
        question_id = line[0]
        query = questions[question_id + '_title']
        truth = [questions[question_id + '_body']]

        if name == 'train':
            # we also include answers and bodies of duplicates, if they are in the dataset
            answer = questions.get(question_id + '_answer')
            if answer:
                truth.append(answer)
            for dup_id in query.metadata['duplicates']:
                other_gt = [
                    questions.get(dup_id + '_body'),
                    questions.get(dup_id + '_answer')
                ]
                for ti in other_gt:
                    if ti:
                        truth.append(ti)

        pool = []
        if len(line) > 1:
            pool = [questions[neg_id + '_body'] for neg_id in line[1].split()]
            # np.random.shuffle(pool)
        else:
            # create a pool of random answers that also contains the ground truth
            pool = (truth + [
                questions[neg_id] for neg_id in random.sample(question_keys, 20)
            ])[:20]

        split_questions += [query] + truth + pool
        datapoints.append(QAPool(query, pool, truth))

    # We filter out all pools that do not contain any ground truth answer (i.e., body).
    # This can happen, e.g., if the dataset is not in English but the retrieval model was English-only.
    if name != 'train':
        qa_pools_len_before = len(datapoints)
        datapoints = [
            p for p in datapoints
            if len([1 for gt in p.ground_truth if gt in p.pooled_answers]) > 0
        ]
        qa_pools_len_after = len(datapoints)
        self.logger.info(
            'Split {} reduced to {} items (from {}) due to missing ground truth in pool'.format(
                name, qa_pools_len_after, qa_pools_len_before))

    return Data(
        'SE({}) / {}'.format(os.path.basename(self.archive_path), name),
        datapoints, unique_items(split_questions))
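# For reference, each TSV row consumed by read_split above carries the question id in
# line[0] and, optionally, space-separated ids of retrieved negative candidates in
# line[1]; rows without a second column fall back to a randomly sampled pool that is
# guaranteed to contain the ground truth. Below is a hypothetical standalone sketch of
# that fallback (names are assumptions, not part of this codebase):
import random

def build_fallback_pool(ground_truth, candidate_items, pool_size=20):
    """Random pool of pool_size items in which the ground truth entries come first."""
    sampled = random.sample(candidate_items, pool_size)
    return (ground_truth + sampled)[:pool_size]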