def __init__(self, path, load_ext_feats=False):
        """
        Build a sentence-pair dataset from the files stored under ``path``.

        Files read from ``path``:

        * ``a.toks`` / ``b.toks`` -- one sentence per line; trailing ``.`` and
          newline characters are stripped, then the line is split on single
          spaces.
        * ``id.txt`` / ``sim.txt`` -- one pair id / label per line, stripped
          the same way.
        * ``overlap_feats.txt`` -- only read when ``load_ext_feats`` is true
          (via ``np.loadtxt``).

        The pairwise document-frequency table is also stored on the instance
        as ``self.word_to_doc_cnt``.

        :param path: directory containing the files listed above
        :param load_ext_feats: if true, load the overlap features from
            ``overlap_feats.txt``; otherwise compute them with
            ``get_pairwise_overlap_features``
        """
        fields = [
            ('id', self.ID_FIELD),
            ('sentence_1', self.TEXT_FIELD),
            ('sentence_2', self.TEXT_FIELD),
            ('ext_feats', self.EXT_FEATS_FIELD),
            ('label', self.LABEL_FIELD),
            ('aid', self.AID_FIELD),
            ('sentence_1_raw', self.RAW_TEXT_FIELD),
            ('sentence_2_raw', self.RAW_TEXT_FIELD),
        ]

        toks_a_path = os.path.join(path, 'a.toks')
        toks_b_path = os.path.join(path, 'b.toks')
        with open(toks_a_path, 'r') as toks_a, open(toks_b_path, 'r') as toks_b:
            sent_list_1 = [line.rstrip('.\n').split(' ') for line in toks_a]
            sent_list_2 = [line.rstrip('.\n').split(' ') for line in toks_b]

        doc_freq = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
        self.word_to_doc_cnt = doc_freq

        if load_ext_feats:
            overlap_feats = np.loadtxt(os.path.join(path, 'overlap_feats.txt'))
        else:
            overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2, doc_freq)

        examples = []
        ids_path = os.path.join(path, 'id.txt')
        labels_path = os.path.join(path, 'sim.txt')
        with open(ids_path, 'r') as id_file, open(labels_path, 'r') as label_file:
            # zip stops at the shortest input, so surplus lines in any one
            # source are silently ignored.
            rows = zip(id_file, sent_list_1, sent_list_2, overlap_feats, label_file)
            # 'aid' is the 1-based position of the pair within the files.
            for aid, (raw_id, toks_1, toks_2, feats, raw_label) in enumerate(rows, start=1):
                values = [
                    raw_id.rstrip('.\n'),
                    toks_1,
                    toks_2,
                    feats,
                    raw_label.rstrip('.\n'),
                    aid,
                    ' '.join(toks_1),
                    ' '.join(toks_2),
                ]
                examples.append(Example.fromlist(values, fields))

        super(CastorPairDataset, self).__init__(examples, fields)
예제 #2
0
    def __init__(self, path):
        """
        Build an MSRP dataset from the files stored under ``path``.

        Reads ``a.toks`` and ``b.toks`` (one sentence per line; trailing ``.``
        and newline characters stripped, then split on single spaces) together
        with ``id.txt`` and ``sim.txt`` (pair ids and labels, stripped the same
        way). The pairwise document-frequency table is stored on the instance
        as ``self.word_to_doc_cnt``.

        Each example carries six hand-crafted features, in this order:

        1. 1 if the number sets of both sentences are equal or both empty
        2. 1 if the number sets of both sentences are equal
        3. 1 if one number set is a superset of the other
        4. token-count difference (sentence 2 minus sentence 1)
        5. shared unique tokens divided by the length of sentence 1
        6. shared unique tokens divided by the length of sentence 2

        :param path: directory containing the files listed above
        """
        fields = [
            ('id', self.ID_FIELD),
            ('sentence_1', self.TEXT_FIELD),
            ('sentence_2', self.TEXT_FIELD),
            ('ext_feats', self.EXT_FEATS_FIELD),
            ('label', self.LABEL_FIELD),
            ('sentence_1_raw', self.RAW_TEXT_FIELD),
            ('sentence_2_raw', self.RAW_TEXT_FIELD),
        ]

        def numbers_in(raw_sentence):
            # Only the first match of NUMBER_PATTERN is inspected; the result
            # is the set of its capture groups that actually participated.
            found = self.NUMBER_PATTERN.search(raw_sentence)
            if not found:
                return set()
            return {grp for grp in found.groups() if grp is not None}

        toks_a_path = os.path.join(path, 'a.toks')
        toks_b_path = os.path.join(path, 'b.toks')
        with open(toks_a_path, 'r') as toks_a, open(toks_b_path, 'r') as toks_b:
            sent_list_1 = [line.rstrip('.\n').split(' ') for line in toks_a]
            sent_list_2 = [line.rstrip('.\n').split(' ') for line in toks_b]

        doc_freq = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
        self.word_to_doc_cnt = doc_freq

        examples = []
        ids_path = os.path.join(path, 'id.txt')
        labels_path = os.path.join(path, 'sim.txt')
        with open(ids_path, 'r') as id_file, open(labels_path, 'r') as label_file:
            for raw_id, toks_1, toks_2, raw_label in zip(id_file, sent_list_1, sent_list_2, label_file):
                raw_1 = ' '.join(toks_1)
                raw_2 = ' '.join(toks_2)

                # Number features
                nums_1 = numbers_in(raw_1)
                nums_2 = numbers_in(raw_2)
                exact = int(nums_1 == nums_2)
                superset = int(nums_1 >= nums_2 or nums_2 >= nums_1)

                # Token overlap between the two sentences
                shared = len(set(toks_1) & set(toks_2))

                ext_feats = [
                    1 if exact or not (nums_1 or nums_2) else 0,
                    exact,
                    superset,
                    len(toks_2) - len(toks_1),  # length difference
                    shared / len(toks_1),
                    shared / len(toks_2),
                ]

                values = [
                    raw_id.rstrip('.\n'),
                    toks_1,
                    toks_2,
                    ext_feats,
                    raw_label.rstrip('.\n'),
                    raw_1,
                    raw_2,
                ]
                examples.append(Example.fromlist(values, fields))

        super(MSRP, self).__init__(examples, fields)
예제 #3
0
    def __init__(self, path):
        """
        Create a sentence-pair dataset from a JSON-lines file.

        Every line of ``path`` is decoded with ``json.loads`` and must provide
        the keys ``'qid'`` (pair id), ``'question'`` (sentence 1),
        ``'qaquestion'`` (sentence 2) and ``'qarel'`` (label). The sentences
        are joined with single spaces to produce the raw-text fields, so they
        are expected to be sequences of string tokens.

        The pairwise document-frequency table is also stored on the instance
        as ``self.word_to_doc_cnt``.

        :param path: path of the JSON-lines file to load
        :raises KeyError: if a line lacks one of the required keys
        """
        fields = [('id', self.ID_FIELD), ('sentence_1', self.TEXT_FIELD),
                  ('sentence_2', self.TEXT_FIELD),
                  ('ext_feats', self.EXT_FEATS_FIELD),
                  ('label', self.LABEL_FIELD),
                  ('sentence_1_raw', self.RAW_TEXT_FIELD),
                  ('sentence_2_raw', self.RAW_TEXT_FIELD)]

        # Collect everything in a single pass: the file is opened and each
        # line JSON-decoded exactly once.
        ids, labels, sent_list_1, sent_list_2 = [], [], [], []
        with open(path) as f:
            for line in f:
                content = json.loads(line)
                sent_list_1.append(content['question'])
                sent_list_2.append(content['qaquestion'])
                ids.append(content['qid'])
                labels.append(content['qarel'])

        word_to_doc_cnt = get_pairwise_word_to_doc_freq(
            sent_list_1, sent_list_2)
        overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2,
                                                      word_to_doc_cnt)
        self.word_to_doc_cnt = word_to_doc_cnt

        examples = [
            Example.fromlist([
                pair_id, l1, l2, ext_feats, label, ' '.join(l1), ' '.join(l2)
            ], fields)
            for pair_id, l1, l2, ext_feats, label in zip(
                ids, sent_list_1, sent_list_2, overlap_feats, labels)
        ]

        super(SemevalDataset, self).__init__(examples, fields)
예제 #4
0
    def __init__(self, path, **kwargs):
        """
        Create a Semeval dataset instance from a JSON-lines file.

        Every line of ``path`` is decoded with ``json.loads`` and must provide
        the keys ``'qid'``, ``'qaid'``, ``'qarel'`` (label), ``'question'``
        (sentence 1) and ``'qaquestion'`` (sentence 2). The sentences are
        joined with single spaces to produce the raw-text fields, so they are
        expected to be sequences of string tokens.

        :param path: path of the JSON-lines file to load
        :param kwargs: forwarded unchanged to the parent dataset constructor
        :raises KeyError: if a line lacks one of the required keys
        """

        fields = [('qid', self.QID_FIELD), ('qaid', self.QID_FIELD),
                  ('label', self.LABEL_FIELD), ('sentence_1', self.TEXT_FIELD),
                  ('sentence_2', self.TEXT_FIELD),
                  ('sentence_1_raw', self.RAW_TEXT_FIELD),
                  ('sentence_2_raw', self.RAW_TEXT_FIELD),
                  ('ext_feats', self.EXT_FEATS_FIELD)]

        examples = []

        with open(path) as infile:
            for line in infile:
                content = json.loads(line)

                question = content['question']
                qa_question = content['qaquestion']

                # 'ext_feats' is deliberately an empty list for every example:
                # no overlap features are attached to this dataset, so none
                # are computed here.
                values = [
                    content['qid'], content['qaid'], content['qarel'],
                    question, qa_question,
                    ' '.join(question),
                    ' '.join(qa_question), []
                ]

                examples.append(Example.fromlist(values, fields))

        super(Semeval, self).__init__(examples, fields, **kwargs)