def __init__(self, path, load_ext_feats=False):
    """
    Create a Castor dataset involving pairs of texts.

    Reads four line-aligned files from the ``path`` directory: ``a.toks`` and
    ``b.toks`` (the two sentences, split on single spaces), ``id.txt`` and
    ``sim.txt`` (the label). Rows are combined with ``zip``, so the shortest
    input determines how many examples are produced.

    :param path: directory that contains the dataset files
    :param load_ext_feats: when true, the external features are read from
        ``overlap_feats.txt`` with ``np.loadtxt``; otherwise they are computed
        by ``get_pairwise_overlap_features``
    """
    def strip_tail(text):
        # Removes every trailing '.' and newline character, not just one.
        return text.rstrip('.\n')

    fields = [
        ('id', self.ID_FIELD),
        ('sentence_1', self.TEXT_FIELD),
        ('sentence_2', self.TEXT_FIELD),
        ('ext_feats', self.EXT_FEATS_FIELD),
        ('label', self.LABEL_FIELD),
        ('aid', self.AID_FIELD),
        ('sentence_1_raw', self.RAW_TEXT_FIELD),
        ('sentence_2_raw', self.RAW_TEXT_FIELD),
    ]

    toks_a_path = os.path.join(path, 'a.toks')
    toks_b_path = os.path.join(path, 'b.toks')
    with open(toks_a_path, 'r') as file_a, open(toks_b_path, 'r') as file_b:
        sent_list_1 = [strip_tail(row).split(' ') for row in file_a]
        sent_list_2 = [strip_tail(row).split(' ') for row in file_b]

    # The frequency table is kept on the instance as well as being used below.
    word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
    self.word_to_doc_cnt = word_to_doc_cnt

    if load_ext_feats:
        overlap_feats = np.loadtxt(os.path.join(path, 'overlap_feats.txt'))
    else:
        overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt)

    examples = []
    ids_path = os.path.join(path, 'id.txt')
    labels_path = os.path.join(path, 'sim.txt')
    with open(ids_path, 'r') as id_file, open(labels_path, 'r') as label_file:
        rows = zip(id_file, sent_list_1, sent_list_2, overlap_feats, label_file)
        # 'aid' is the 1-based position of the pair within the files.
        for aid, (raw_id, toks_a, toks_b, feats, raw_label) in enumerate(rows, start=1):
            values = [
                strip_tail(raw_id),
                toks_a,
                toks_b,
                feats,
                strip_tail(raw_label),
                aid,
                ' '.join(toks_a),
                ' '.join(toks_b),
            ]
            examples.append(Example.fromlist(values, fields))

    super(CastorPairDataset, self).__init__(examples, fields)
def __init__(self, path):
    """
    Create a MSRP dataset instance.

    Reads the line-aligned files ``a.toks``, ``b.toks``, ``id.txt`` and
    ``sim.txt`` from the ``path`` directory and attaches six hand-built
    features to every sentence pair, in this order:

    1. 1 when the number sets are equal or both empty, else 0
    2. 1 when the number sets are equal, else 0
    3. 1 when one number set contains the other, else 0
    4. token-count difference (second sentence minus first)
    5. shared distinct tokens divided by the first sentence's token count
    6. shared distinct tokens divided by the second sentence's token count

    :param path: directory that contains the dataset files
    """
    def strip_tail(text):
        # Removes every trailing '.' and newline character, not just one.
        return text.rstrip('.\n')

    def numbers_in(tokens):
        # Only the FIRST match of NUMBER_PATTERN in the space-joined sentence
        # is inspected; its non-None capture groups form the number set.
        found = self.NUMBER_PATTERN.search(' '.join(tokens))
        if not found:
            return set()
        return {group for group in found.groups() if group is not None}

    fields = [
        ('id', self.ID_FIELD),
        ('sentence_1', self.TEXT_FIELD),
        ('sentence_2', self.TEXT_FIELD),
        ('ext_feats', self.EXT_FEATS_FIELD),
        ('label', self.LABEL_FIELD),
        ('sentence_1_raw', self.RAW_TEXT_FIELD),
        ('sentence_2_raw', self.RAW_TEXT_FIELD),
    ]

    toks_a_path = os.path.join(path, 'a.toks')
    toks_b_path = os.path.join(path, 'b.toks')
    with open(toks_a_path, 'r') as file_a, open(toks_b_path, 'r') as file_b:
        sent_list_1 = [strip_tail(row).split(' ') for row in file_a]
        sent_list_2 = [strip_tail(row).split(' ') for row in file_b]

    # The frequency table is only stored on the instance; it is not used here.
    self.word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)

    examples = []
    ids_path = os.path.join(path, 'id.txt')
    labels_path = os.path.join(path, 'sim.txt')
    with open(ids_path, 'r') as id_file, open(labels_path, 'r') as label_file:
        for raw_id, toks_a, toks_b, raw_label in zip(id_file, sent_list_1, sent_list_2, label_file):
            # Number features
            nums_a = numbers_in(toks_a)
            nums_b = numbers_in(toks_b)
            exact = int(nums_a == nums_b)
            superset = int(nums_a.issuperset(nums_b) or nums_b.issuperset(nums_a))
            both_empty = len(nums_a) == 0 and len(nums_b) == 0

            # Overlap counts distinct tokens shared by the two sentences.
            shared = len(set(toks_a) & set(toks_b))

            ext_feats = [
                1 if (exact or both_empty) else 0,
                exact,
                superset,
                len(toks_b) - len(toks_a),  # Length difference
                shared / len(toks_a),
                shared / len(toks_b),
            ]

            values = [
                strip_tail(raw_id),
                toks_a,
                toks_b,
                ext_feats,
                strip_tail(raw_label),
                ' '.join(toks_a),
                ' '.join(toks_b),
            ]
            examples.append(Example.fromlist(values, fields))

    super(MSRP, self).__init__(examples, fields)
def __init__(self, path):
    """
    Create a Semeval dataset involving pairs of questions.

    ``path`` is a file with one JSON object per line. Each object must
    provide the keys ``qid``, ``question``, ``qaquestion`` and ``qarel``.
    ``question`` and ``qaquestion`` are joined with single spaces to build the
    raw-text fields, so they are expected to be sequences of strings.

    The pairwise word/document frequency table is stored on the instance as
    ``word_to_doc_cnt`` and is also used to compute the overlap features.

    :param path: path of the JSON-lines file to load
    :raises KeyError: if a record lacks one of the required keys
    """
    fields = [
        ('id', self.ID_FIELD),
        ('sentence_1', self.TEXT_FIELD),
        ('sentence_2', self.TEXT_FIELD),
        ('ext_feats', self.EXT_FEATS_FIELD),
        ('label', self.LABEL_FIELD),
        ('sentence_1_raw', self.RAW_TEXT_FIELD),
        ('sentence_2_raw', self.RAW_TEXT_FIELD),
    ]

    ids, labels, sent_list_1, sent_list_2 = [], [], [], []
    # Collect every column in a single pass: the file is opened and each
    # record JSON-decoded once instead of twice.
    with open(path) as f:
        for line in f:
            content = json.loads(line)
            sent_list_1.append(content['question'])
            sent_list_2.append(content['qaquestion'])
            ids.append(content['qid'])
            labels.append(content['qarel'])

    # The overlap features need the frequency table built over ALL pairs, so
    # they can only be computed after the whole file has been read.
    word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
    overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt)
    self.word_to_doc_cnt = word_to_doc_cnt

    examples = [
        Example.fromlist(
            [pair_id, l1, l2, ext_feats, label, ' '.join(l1), ' '.join(l2)],
            fields,
        )
        for pair_id, l1, l2, ext_feats, label
        in zip(ids, sent_list_1, sent_list_2, overlap_feats, labels)
    ]

    super(SemevalDataset, self).__init__(examples, fields)
def __init__(self, path, **kwargs):
    """
    Create a Semeval dataset instance.

    ``path`` is a file with one JSON object per line. Each object must
    provide the keys ``qid``, ``qaid``, ``qarel``, ``question`` and
    ``qaquestion``. The two question values are joined with single spaces to
    build the raw-text fields. Every example gets an empty list as its
    ``ext_feats`` value.

    :param path: path of the JSON-lines file to load
    :param kwargs: forwarded unchanged to the parent constructor
    """
    fields = [
        ('qid', self.QID_FIELD),
        ('qaid', self.QID_FIELD),
        ('label', self.LABEL_FIELD),
        ('sentence_1', self.TEXT_FIELD),
        ('sentence_2', self.TEXT_FIELD),
        ('sentence_1_raw', self.RAW_TEXT_FIELD),
        ('sentence_2_raw', self.RAW_TEXT_FIELD),
        ('ext_feats', self.EXT_FEATS_FIELD),
    ]

    examples = []
    with open(path) as infile:
        for raw_line in infile:
            record = json.loads(raw_line)
            question = record['question']
            related_question = record['qaquestion']

            # NOTE(review): both helper results are discarded -- the stored
            # ext_feats is always an empty list. The calls are kept only so
            # that any exception or side effect they have is unchanged. They
            # also receive a single record's values here rather than lists
            # covering the whole file; confirm whether they can be removed.
            doc_freq = get_pairwise_word_to_doc_freq(question, related_question)
            get_pairwise_overlap_features(question, related_question, doc_freq)

            row = [
                record['qid'],
                record['qaid'],
                record['qarel'],
                question,
                related_question,
                ' '.join(question),
                ' '.join(related_question),
                [],  # ext_feats: a fresh empty list per example
            ]
            examples.append(Example.fromlist(row, fields))

    super(Semeval, self).__init__(examples, fields, **kwargs)