Example #1
    def test_output(self):
        """
        Testing a simple scenario where a token-matching function, a token
        cleaner, and a simple whitespace splitter are used.
        """
        field_name = "dummy"
        special_token = "<ANIMAL>"
        lower_case = True
        tok_mat_func = lambda x: token_matching_func(x, special_token)
        token_cleaning_func = lambda x: re.sub(r'[?!,.]', '', x)
        tokenization_func = lambda x: x.split()

        input_seqs = ["Hello, this is my dog!",
                      "A dummy sentence for tokenization.",
                      "What a lovely puppy!"]
        input_data_chunk = DataChunk(**{field_name: input_seqs})
        expect_seqs = [["hello", "this", "is", "my", special_token],
                       ["a", "dummy", "sentence", "for",
                        "tokenization"],
                       ["what", "a", "lovely", special_token]]
        expected_data_chunk = DataChunk(**{field_name: expect_seqs})

        tokenizer = TokenProcessor(field_name,
                                   tokenization_func=tokenization_func,
                                   token_cleaning_func=token_cleaning_func,
                                   token_matching_func=tok_mat_func,
                                   lower_case=lower_case)
        actual_data_chunk = tokenizer(input_data_chunk)
        self.assertTrue(expected_data_chunk == actual_data_chunk)
Example #2
    def test_vocabulary_mapper_mixed_field_values(self):
        """Testing whether the mapper can map multi-dim mixed field values."""
        target_field_name = "dummy"
        symbols_attr = "id"

        data_chunk = DataChunk(**{target_field_name: np.array([
            [["one"], np.array(["two", "one"])],
            [["three"], np.array(["four", "five", "six"])]
        ], dtype="object")})
        expected_output_chunk = DataChunk(**{target_field_name: np.array([
            [[1], np.array([2, 1])],
            [[3], np.array([4, 5, 6])]
        ], dtype="object")})

        # creating and populating a vocab
        vocab = Vocabulary()
        vocab.add_symbol("zero")
        vocab.add_symbol("one")
        vocab.add_symbol("two")
        vocab.add_symbol("three")
        vocab.add_symbol("four")
        vocab.add_symbol("five")
        vocab.add_symbol("six")

        mapper = VocabMapper({target_field_name: vocab},
                             symbols_attr=symbols_attr)
        actual_output_chunk = mapper(data_chunk)

        self.assertTrue(actual_output_chunk == expected_output_chunk)
Example #3
    def test_vocabulary_mapper_multidim_lists(self):
        """Testing whether the mapper can map multi-dim lists."""
        target_field_name = "dummy"
        symbols_attr = "id"

        data_chunk = DataChunk(
            **{
                target_field_name:
                np.array(
                    [[["one"], ["two"]], [["three"], ["four", "five", "six"]]],
                    dtype="object")
            })
        exp_val = np.empty(2, dtype="object")
        exp_val[0] = np.array([[1], [2]])
        exp_val[1] = np.array([[3], [4, 5, 6]])
        expected_output_chunk = DataChunk(**{target_field_name: exp_val})

        # creating and populating a vocab
        vocab = Vocabulary()
        vocab.add_symbol("zero")
        vocab.add_symbol("one")
        vocab.add_symbol("two")
        vocab.add_symbol("three")
        vocab.add_symbol("four")
        vocab.add_symbol("five")
        vocab.add_symbol("six")

        mapper = VocabMapper({target_field_name: vocab},
                             symbols_attr=symbols_attr)
        actual_output_chunk = mapper(copy.deepcopy(data_chunk))

        self.assertTrue(actual_output_chunk == expected_output_chunk)
Example #4
    def compile_chunks(self):
        """Compiles data-chunks filled with group sequences."""
        if self.max_units:
            while len(self):
                if len(self) > self.max_units:
                    # assumes numpy's random module; the stdlib random.choice
                    # has no 'size'/'replace' kwargs
                    sel_indxs = random.choice(range(len(self)), replace=False,
                                              size=self.max_units)
                else:
                    sel_indxs = range(len(self))

                # create an output data-chunk based on the selected units
                dc = DataChunk()
                for k, val in self._coll.items():
                    dc[k] = [val[indx] for indx in sel_indxs]
                    if isinstance(val, np.ndarray):
                        dc[k] = np.array(dc[k], dtype=val.dtype)
                yield dc

                # removing the selected indxs from the collector
                for indx in sorted(sel_indxs, reverse=True):
                    for fn in self._coll:
                        if isinstance(self._coll[fn], np.ndarray):
                            self._coll[fn] = np.delete(self._coll[fn], indx)
                        else:
                            del self._coll[fn][indx]

                # stop the cycle as one sample is already produced
                if not self.sample_all_revs:
                    break
        else:
            dc = DataChunk()
            for k, val in self._coll.items():
                dc[k] = val
            yield dc
Example #5
    def test_appending_data_units_to_invalid_dc(self):

        act_dc = DataChunk(test=np.array([], dtype='int64'),
                           dummy=np.array([1], dtype='int64'))

        with self.assertRaises(DataChunkError):
            act_dc.append({"test": 1, "dummy": 2})
Example #6
    def test_setting_data_units(self):
        act_dc = DataChunk(test=np.array([1, 2, 3, 4, 5], dtype='int64'),
                           dummy=np.array([6, 7, 8, 9, 10], dtype='int64'))
        act_dc[0] = {"test": 0, "dummy": 0}
        act_dc[3] = {"test": 20, "dummy": 30}

        exp_dc = DataChunk(test=np.array([0, 2, 3, 20, 5]),
                           dummy=np.array([0, 7, 8, 30, 10]))

        self.assertTrue(act_dc == exp_dc)
Example #7
    def test_appending_data_units_to_valid_dc(self):

        act_dc = DataChunk(test=np.array([], dtype='int64'),
                           dummy=np.array([], dtype='int64'))

        act_dc.append({"test": 1, "dummy": 2})
        act_dc.append({"test": 3, "dummy": 4})

        exp_dc = DataChunk(test=np.array([1, 3]), dummy=np.array([2, 4]))

        self.assertTrue(act_dc == exp_dc)
Example #8
    def test_output(self):
        fn = "dummy"
        new_fn = "dummy_len"
        data = [[1, 2, 3], [12], ["a", "b", "d", "e"]]

        # object dtype is needed for the ragged, mixed-type sequences
        actual_dc = DataChunk(**{fn: np.array(deepcopy(data), dtype="object")})
        expected_dc = DataChunk(**{fn: np.array(deepcopy(data), dtype="object"),
                                   new_fn: np.array([3, 1, 4])})
        slc = SeqLenComputer(fname=fn, new_len_fname=new_fn)
        actual_dc = slc(actual_dc)

        self.assertTrue(actual_dc == expected_dc)
Example #9
 def _data_chunk_from_dicts_list(list_of_dicts):
     """Creates a data-chunk from list of data-units (dicts)."""
     data_chunk = DataChunk()
     flag = False
     for du in list_of_dicts:
         if not flag:
             for k in du.keys():
                 data_chunk[k] = []
             flag = True
         for k, v in du.items():
             data_chunk[k].append(v)
     for k, v in data_chunk.items():
         data_chunk[k] = np.array(v)
     return data_chunk
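
A minimal usage sketch for the helper above; DataChunk is assumed to behave like a dict of NumPy arrays, as throughout these examples (the exact import path is project-specific and omitted here):

import numpy as np

units = [{"id": 1, "text": "a"},
         {"id": 2, "text": "b"}]
dc = _data_chunk_from_dicts_list(units)
# every key now maps to a NumPy array with one entry per data-unit:
# dc["id"] -> array([1, 2]); dc["text"] -> array(['a', 'b'], dtype='<U1')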
Example #10
    def test_modification_of_data_units(self):
        """Selecting specific data-units and altering their values."""
        act_dc = DataChunk(test=np.array([1, 2, 3, 4]),
                           dummy=np.array([11., 12., 13., 14.]))
        act_du1 = act_dc[0]
        act_du1['test'] += 100

        act_du2 = act_dc[3]
        act_du2['dummy'] += 5

        exp_dc = DataChunk(test=np.array([101, 2, 3, 4]),
                           dummy=np.array([11., 12., 13., 19.]))

        self.assertTrue(act_dc == exp_dc)
Example #11
def concat_data_chunks(*args):
    """Concatenates data-chunks together based on their keys."""
    data_chunk = DataChunk()

    for k in args[0]:
        data_chunk[k] = []

    for arg in args:
        for k, v in arg.items():
            data_chunk[k].append(v)

    for k, v in data_chunk.items():
        data_chunk[k] = np.concatenate(v)

    return data_chunk
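
A hedged sketch of the expected behavior; since np.concatenate is applied per key, the field values are assumed to be NumPy arrays, and the key set is taken from the first argument:

import numpy as np

dc1 = DataChunk(x=np.array([1, 2]))
dc2 = DataChunk(x=np.array([3]))
merged = concat_data_chunks(dc1, dc2)
# merged["x"] -> array([1, 2, 3]); all chunks must share the same keys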
Example #12
    def test_data_units_inval_access(self):
        """When data-chunk is incorrect, it should throw an error."""

        dc = DataChunk(test=[1, 22, 3, 4, 5], dummy=[99, 2, 3])

        with self.assertRaises(DataChunkError):
            du = dc[0]

        dc["test"] = np.array(dc["test"])
        dc['dummy'] = np.array(dc['dummy'])

        with self.assertRaises(DataChunkError):
            du = dc[0]

        dc['dummy'] = np.append(dc['dummy'], 0)
        dc['dummy'] = np.append(dc['dummy'], 1)

        du = dc[0]

        self.assertTrue(du['test'] == 1)
        self.assertTrue(du['dummy'] == 99)

        du = dc[1]

        self.assertTrue(du['test'] == 22)
        self.assertTrue(du['dummy'] == 2)
Example #13
 def _get_dummy_dc():
     dc = DataChunk()
     dc["country"] = np.array(["UK", "UK", "UK", "DK", "DK"])
     dc["shop_id"] = np.array(['1', '1', '1', '2', '3'])
     dc["product_id"] = np.array([11, 12, 13, 101, 101])
     dc["sales"] = np.array([0, 1, 2, 5, 6])
     return dc
Example #14
File: dc.py Project: yaoxy2010/FewSum
def concat_chunks(*dcs):
    """Combines data-chunks horizontally and returns them as one chunk."""
    new_dc = DataChunk()
    key_to_type = {}
    for dc in dcs:
        for k, v in dc.items():
            if k not in new_dc:
                new_dc[k] = []
            if isinstance(v, np.ndarray):
                if k in key_to_type and key_to_type[k] != np.ndarray:
                    raise TypeError("All values must either 'arrays' or "
                                    "'lists'.")
                key_to_type[k] = np.ndarray
                new_dc[k].append(v)
            elif isinstance(v, list):
                if k in key_to_type and key_to_type[k] != list:
                    raise TypeError("All values must either 'arrays' or "
                                    "'lists'.")
                key_to_type[k] = list
                new_dc[k] += v
            else:
                raise TypeError("Can't concat values other than 'lists' or "
                                "'arrays'.")

    for k in new_dc:
        if key_to_type[k] == np.ndarray:
            new_dc[k] = np.concatenate(tuple(new_dc[k]))
    return new_dc
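
A small sketch of the intended behavior, assuming each field is stored consistently as either lists or arrays across all chunks:

import numpy as np

dc_a = DataChunk(ids=np.array([1, 2]), tags=["x"])
dc_b = DataChunk(ids=np.array([3]), tags=["y", "z"])
combined = concat_chunks(dc_a, dc_b)
# combined["ids"]  -> array([1, 2, 3])   (array fields are np.concatenate-d)
# combined["tags"] -> ["x", "y", "z"]    (list fields are extended)
# mixing lists and arrays under the same key raises TypeError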
Example #15
    def _transform(self, data_chunk):
        fields_to_copy = [YelpEvalF.BUSINESS_ID]
        new_dc = DataChunk(**{fn: [] for fn in fields_to_copy})

        summ_cats = ["no_cat" for _ in range(len(data_chunk))]

        # wrapping each summary to a list as there is only one summary per
        # business
        new_dc[ModelF.SUMMS] = [[summ] for summ in data_chunk[YelpEvalF.SUMM]]

        new_dc[ModelF.SUMM_CAT] = summ_cats
        new_dc[ModelF.SUMM_GROUP_ID] = data_chunk[YelpEvalF.BUSINESS_ID]

        # splitting data-units by the reviews field, i.e. each unit will
        # have one review associated with it

        new_dc[ModelF.REV] = []
        for du in data_chunk.iter():
            for rev_fn in YelpEvalF.REVS:
                new_dc[ModelF.REV].append(du[rev_fn])
                # copying the rest
                for c_fn in fields_to_copy:
                    new_dc[c_fn].append(du[c_fn])

        # adding dummy category field
        cat_fvals = ["no_cat" for _ in range(len(new_dc))]
        new_dc[ModelF.CAT] = cat_fvals

        return new_dc
Example #16
 def _transform(self, data_chunk):
     new_dc = DataChunk()
     for k, v in data_chunk.items():
         if k in self.old_to_new_fnames:
             k = self.old_to_new_fnames[k]
         new_dc[k] = v
     return new_dc
Example #17
    def _transform(self, data_chunk):
        fields_to_copy = [AmazonEvalF.PROD_ID, AmazonEvalF.CAT]
        new_dc = DataChunk(**{fn: [] for fn in fields_to_copy})

        new_dc[ModelF.SUMMS] = [[summ1, summ2, summ3]
                                for summ1, summ2, summ3 in
                                zip(data_chunk[AmazonEvalF.SUMM1],
                                    data_chunk[AmazonEvalF.SUMM2],
                                    data_chunk[AmazonEvalF.SUMM3])]

        new_dc[ModelF.SUMM_CAT] = data_chunk[AmazonEvalF.CAT]
        new_dc[ModelF.SUMM_GROUP_ID] = data_chunk[AmazonEvalF.PROD_ID]

        # splitting data-units by the reviews field, i.e. each unit will
        # have one review associated with it

        new_dc[ModelF.REV] = []
        for du in data_chunk.iter():
            for rev_fn in AmazonEvalF.REVS:
                new_dc[ModelF.REV].append(du[rev_fn])
                # copying the rest
                for c_fn in fields_to_copy:
                    new_dc[c_fn].append(du[c_fn])

        return new_dc
Example #18
    def test_3D_padding(self):
        """Light version test to check if the padder works for 3D data."""
        field_name = "dummy"
        mask_field_name = 'dummy_mask'
        pad_symbol = -99
        mask_fn_suffix = "mask"
        padding_mode = "both"
        axis = 2

        data_chunk = DataChunk(
            **{
                field_name:
                # object dtype is needed for ragged (variable-length) data
                np.array([[[0, 1, 2], [3, 4, 5], [], [6]], [[1], [1, 2], []]],
                         dtype="object")
            })
        padder = Padder(field_name,
                        pad_symbol=pad_symbol,
                        axis=axis,
                        new_mask_fname=mask_field_name,
                        padding_mode=padding_mode)
        padded_data_chunk = padder(copy.deepcopy(data_chunk))

        original_fv = data_chunk[field_name]
        padded_fv = padded_data_chunk[field_name]
        mask = padded_data_chunk[mask_field_name]

        for ofv, pfv, m in zip(original_fv, padded_fv, mask):
            self._test_padded_values(original_field_values=ofv,
                                     padded_field_values=pfv,
                                     mask=m,
                                     pad_symbol=pad_symbol)
Example #19
 def test_sorting_by_ints_descending(self):
     expected_dc = DataChunk(**{
         self.ints_fn: np.array([123, 10, 0]),
         self.strings_fn: np.array(["d", "a", "c"]),
         self.floats_fn: np.array([15., -1, -10.])
     })
     actual_dc = self._run_sorter(fn=self.ints_fn, order='descending')
     self.assertTrue(expected_dc == actual_dc)
Example #20
def generate_data_chunk(data_attrs_number, data_size):
    """Generated a data-chunk with random 1D values of data_size."""
    data = {
        str(i): np.random.rand(data_size)
        for i in range(data_attrs_number)
    }
    data_chunk = DataChunk(**data)
    return data_chunk
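
A usage sketch; the shapes follow directly from the function body:

dc = generate_data_chunk(data_attrs_number=3, data_size=5)
# dc holds the string keys "0", "1", "2", each mapping to a float array
# of length 5 drawn from np.random.rand
assert len(dc["0"]) == 5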
Example #21
    def test_valid_data_units_deletion(self):
        dc = DataChunk(one=np.array([1, 2, 3, 4]),
                       two=np.array([0, 10, 11, 24]))

        del dc[2]

        self.assertTrue((np.array([1, 2, 4]) == dc['one']).all())
        self.assertTrue((np.array([0, 10, 24]) == dc['two']).all())
Example #22
 def test_sorting_by_string_ascending(self):
     expected_dc = DataChunk(**{
         self.ints_fn: np.array([10, 0, 123]),
         self.strings_fn: np.array(["a", "c", "d"]),
         self.floats_fn: np.array([-1., -10, 15.])
     })
     actual_dc = self._run_sorter(fn=self.strings_fn, order='ascending')
     self.assertTrue(expected_dc == actual_dc)
Example #23
    def test_absolute_corruption(self):  # 'test_' prefix needed for discovery
        data_dc = DataChunk(
            **{DUMMY_FNAME: np.array([range(10) for _ in range(5)])})

        exp_dc = DataChunk()
        exp_dc[DUMMY_FNAME] = deepcopy(data_dc[DUMMY_FNAME])
        exp_dc[NEW_DUMMY_FNAME] = np.zeros(len(data_dc), dtype='object')
        for indx in range(len(exp_dc)):
            exp_dc[indx, NEW_DUMMY_FNAME] = list()

        word_dropper = WordDropper(fname=DUMMY_FNAME,
                                   new_fname=NEW_DUMMY_FNAME,
                                   dropout_prob=1.)

        act_dc = word_dropper(data_dc)

        self.assertTrue(act_dc == exp_dc)
Example #24
    def _data_chunk_from_dicts_tree(dicts, tree_grouping_fnames):
        """Creates a data-chunk from a tree of data-units (dicts)."""
        def yield_paths_and_leaves(tree, path=None):
            def is_leaf(dct):
                for v in dct.values():
                    if not isinstance(v, list):
                        return False
                return True

            if is_leaf(tree):
                yield path, tree
            else:
                for k in tree.keys():
                    curr_path = [p for p in path] if path else []
                    curr_path.append(k)
                    for r in yield_paths_and_leaves(tree[k], curr_path):
                        yield r

        if not tree_grouping_fnames:
            raise ValueError("Please provide 'tree_grouping_fnames' to parse "
                             "input json files.")
        data_chunk = DataChunk()
        for fn in tree_grouping_fnames:
            data_chunk[fn] = []

        for path, leaf in yield_paths_and_leaves(dicts):
            leaf_size = _get_leaf_size(leaf)
            if len(path) != len(tree_grouping_fnames):
                raise ValueError("Please provide all grouping fields.")

            # storing path values
            for p_val, fn in zip(path, tree_grouping_fnames):
                data_chunk[fn] += [p_val] * leaf_size

            # storing leaf values
            for k, vals in leaf.items():
                assert (isinstance(vals, list))
                if k not in data_chunk:
                    data_chunk[k] = []
                data_chunk[k] += vals

        for k, v in data_chunk.items():
            data_chunk[k] = np.array(v)

        return data_chunk
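
To make the traversal concrete, a hedged example of the expected input shape (it mirrors the dummy chunk in Example #13; _get_leaf_size is assumed to return the length of the leaf's value lists, and the method is called here as a plain function):

tree = {
    "UK": {"1": {"product_id": [11, 12, 13], "sales": [0, 1, 2]}},
    "DK": {"2": {"product_id": [101], "sales": [5]}},
}
dc = _data_chunk_from_dicts_tree(tree, tree_grouping_fnames=["country",
                                                             "shop_id"])
# path components are repeated once per leaf entry:
# dc["country"]    -> array(['UK', 'UK', 'UK', 'DK'])
# dc["shop_id"]    -> array(['1', '1', '1', '2'])
# dc["product_id"] -> array([11, 12, 13, 101])
# dc["sales"]      -> array([0, 1, 2, 5])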
Example #25
 def test_scenario2(self):
     window_size = 3
     step_size = 3
     only_full_windows = False
     # object dtype is needed for these ragged sequences
     input_seqs = np.array([list(range(7)), list(range(2))], dtype="object")
     input_chunk = DataChunk(**{self.field_name: input_seqs})
     expect_seqs = np.array([[[0, 1, 2], [3, 4, 5], [6]], [[0, 1]]],
                            dtype="object")
     expected_output_chunk = DataChunk(**{
         self.field_name: input_seqs,
         self.new_field_name: expect_seqs
     })
     self._test_window_setup(input_chunk,
                             expected_output_chunk,
                             field_name=self.field_name,
                             suffix=self.suffix,
                             window_size=window_size,
                             step_size=step_size,
                             only_full_windows=only_full_windows)
Example #26
    def test_field_values_access(self):

        arrays_size = 40
        names = ["one", "two", "three", "four"]

        for _ in range(10):
            data = {name: np.random.rand(arrays_size, 1) for name in names}
            data_chunk = DataChunk(**deepcopy(data))
            for name in names:
                self.assertTrue((data_chunk[name] == data[name]).all())
Example #27
    def test_condition_satisfaction(self):
        ks = [51.5, 2, 3, 4, 5]
        for k in ks:
            word_shuffler = WordShuffler(fname=TEXT_FNAME,
                                         end_symbol='DUMMY',
                                         k=k)
            dc = DataChunk(
                **{
                    TEXT_FNAME:
                    np.array(
                        [list(range(100)), list(range(30))], dtype='object')
                })
            corr_dc = word_shuffler(deepcopy(dc))

            for corr_du, du in zip(corr_dc.iter(), dc.iter()):
                text = du[TEXT_FNAME]
                corr_text = corr_du[TEXT_FNAME]
                self.assertTrue(condition_sat(corr_text, k=k))
                self.assertTrue(len(text) == len(corr_text))
                self.assertTrue(text != corr_text)
Example #28
def create_list_of_data_chunks(data_chunk, chunk_size):
    """Creates a list of data-chunks out of the passed data-chunk."""
    collector = []
    start_indx = 0
    while start_indx < len(data_chunk):
        slice_range = range(start_indx,
                            min(start_indx + chunk_size, len(data_chunk)))
        dc = DataChunk(**{k: v[slice_range] for k, v in data_chunk.items()})
        collector.append(dc)
        start_indx += chunk_size
    return collector
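
A short sketch of the splitting behavior, assuming field values support range-based indexing the way NumPy arrays do:

import numpy as np

big = DataChunk(x=np.arange(5))
parts = create_list_of_data_chunks(big, chunk_size=2)
# parts -> three chunks whose "x" fields are [0, 1], [2, 3] and [4]
assert len(parts) == 3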
Example #29
    def test_specific_fvalues_access(self):
        arrays_size = 40
        names = ["one", "two", "three", "four"]

        for _ in range(10):
            data = {name: np.random.rand(arrays_size) for name in names}
            data_chunk = DataChunk(**deepcopy(data))

            for r_name in np.random.choice(names, size=10, replace=True):
                for r_indx in np.random.randint(0, arrays_size, size=100):
                    res = (data_chunk[r_indx, r_name] == data[r_name][r_indx])
                    self.assertTrue(res)
Example #30
    def test_invalid_data_units_deletion(self):
        """Deletion by data-unit index from data-chunks should not work."""
        dc = DataChunk(one=[1, 2, 3, 4], two=[10, 20, 30, 40, 50, 60])

        self.assertFalse(dc.valid)

        with self.assertRaises(ValueError):
            del dc[2]