示例#1
0
文件: test_shuffle.py 项目: towr/bndl
    def _test_dependency_failure(self, dset_size, pcount, key_count,
                                 kill_after):
        """Run a map / shuffle / map / aggregate job while workers get killed.

        One ``WorkerKiller`` accumulator is created per (worker, count) pair
        obtained by zipping ``self.ctx.workers`` with ``kill_after``. Both
        mappers call ``update('countdown', 1)`` on every killer for each
        element they see. The collected result is then checked for the
        expected number of keys, the expected key set and the expected total.

        ``bndl.execute.attempts`` is set to 2 while the job runs and is put
        back to 1 afterwards, whether or not the job succeeded.

        Args:
            dset_size: Passed to ``self.ctx.range`` as the range size.
            pcount: Passed to ``self.ctx.range`` as ``pcount``.
            key_count: Modulus used to key the elements; also the number of
                distinct keys expected in the result.
            kill_after: Iterable of countdown start values, paired
                positionally with ``self.ctx.workers``.
        """
        class WorkerKiller(object):
            """Countdown that SIGKILLs a worker's process when it hits zero."""

            def __init__(self, worker, count):
                self.worker = worker
                # Stored one higher than requested; with unit decrements the
                # kill therefore fires on decrement number ``count + 1``.
                self.count = count + 1
                self.lock = threading.Lock()

            def countdown(self, i):
                """Subtract ``i``; kill the worker if the counter is exactly 0."""
                with self.lock:
                    self.count -= i
                    if self.count != 0:
                        return
                    logger.info('Killing %r', self.worker.name)
                    # Ask the worker for its own pid through its 'tasks'
                    # service, then kill that process from here.
                    tasks = self.worker.service('tasks')
                    pid = tasks.execute(lambda: os.getpid()).result()
                    os.kill(pid, signal.SIGKILL)

            def __str__(self):
                return 'kill %s after %s elements' % (self.worker.name,
                                                      self.count)

        def identity_mapper(killers, key_count, i):
            # Tick every killer once, then pass the element through unchanged.
            # NOTE(review): presumably the accumulator forwards this update to
            # WorkerKiller.countdown(1) -- confirm against the accumulator API.
            for each_killer in killers:
                each_killer.update('countdown', 1)
            return i

        def keyby_mapper(killers, key_count, i):
            # Same ticking as identity_mapper, but emit a (key, value) pair.
            identity_mapper(killers, key_count, i)
            key = i % key_count
            return (key, i)

        try:
            self.ctx.conf['bndl.execute.attempts'] = 2

            killers = []
            for worker, count in zip(self.ctx.workers, kill_after):
                killers.append(
                    self.ctx.accumulator(WorkerKiller(worker, count)))

            source = self.ctx.range(dset_size, pcount=pcount)
            before_shuffle = source.map(identity_mapper, killers, key_count)
            after_shuffle = before_shuffle.shuffle()
            keyed = after_shuffle.map(keyby_mapper, killers, key_count)
            dset = keyed.aggregate_by_key(sum)

            result = dset.collect()
            self.assertEqual(len(result), key_count)
            keys_seen = sorted(pluck(0, result))
            self.assertEqual(keys_seen, list(range(key_count)))
            grand_total = sum(pluck(1, result))
            self.assertEqual(grand_total, sum(range(dset_size)))

            # NOTE(review): purpose of this pause is not visible here;
            # presumably it lets killed workers settle -- confirm.
            time.sleep(1)
        finally:
            self.ctx.conf['bndl.execute.attempts'] = 1
示例#2
0
def test_pluck():
    """Check ``pluck`` on nested lists and on dicts.

    Covers a single index, a list of indices, a default for missing
    entries, and the errors expected when no default is supplied.
    """
    # Integer indices into lists of lists.
    pairs = [[0, 1], [2, 3], [4, 5]]
    triples = [[0, 1, 2], [3, 4, 5]]
    ragged = [[0], [0, 1]]
    assert list(pluck(0, pairs)) == [0, 2, 4]
    assert list(pluck([0, 1], triples)) == [(0, 1), (3, 4)]
    assert list(pluck(1, ragged, None)) == [None, 1]

    # String keys into dicts; only the second record has a "price".
    data = [{"id": 1, "name": "cheese"}, {"id": 2, "name": "pies", "price": 1}]
    ids = list(pluck("id", data))
    prices = list(pluck("price", data, None))
    id_name = list(pluck(["id", "name"], data))
    names_only = list(pluck(["name"], data))
    price_other = list(pluck(["price", "other"], data, None))
    assert ids == [1, 2]
    assert prices == [None, 1]
    assert id_name == [(1, "cheese"), (2, "pies")]
    assert names_only == [("cheese",), ("pies",)]
    assert price_other == [(None, None), (1, None)]

    # Without a default, a missing index or key is expected to raise.
    def index_out_of_range():
        return list(pluck(1, [[0]]))

    def key_missing():
        return list(pluck("name", [{"id": 1}]))

    assert raises(IndexError, index_out_of_range)
    assert raises(KeyError, key_missing)
示例#3
0
def test_pluck():
    """Table-driven check of ``pluck`` on lists and dicts.

    Each case pairs the positional arguments handed to ``pluck`` with the
    list its result is expected to materialise to. The two error cases at
    the end are checked separately through ``raises``.
    """
    data = [{'id': 1, 'name': 'cheese'}, {'id': 2, 'name': 'pies', 'price': 1}]

    # (arguments for pluck, expected list(...) of the result)
    cases = [
        ((0, [[0, 1], [2, 3], [4, 5]]), [0, 2, 4]),
        (([0, 1], [[0, 1, 2], [3, 4, 5]]), [(0, 1), (3, 4)]),
        ((1, [[0], [0, 1]], None), [None, 1]),
        (('id', data), [1, 2]),
        (('price', data, None), [None, 1]),
        ((['id', 'name'], data), [(1, 'cheese'), (2, 'pies')]),
        ((['name'], data), [('cheese',), ('pies',)]),
        ((['price', 'other'], data, None), [(None, None), (1, None)]),
    ]
    for args, expected in cases:
        assert list(pluck(*args)) == expected

    # No default supplied: a missing index / key is expected to raise.
    assert raises(IndexError, lambda: list(pluck(1, [[0]])))
    assert raises(KeyError, lambda: list(pluck('name', [{'id': 1}])))
示例#4
0
def test_pluck():
    """Verify ``pluck`` results for list and dict inputs, and its errors."""

    def plucked(*args):
        # pluck's result is wrapped in list() so it can be compared directly.
        return list(pluck(*args))

    # Positional indices.
    assert plucked(0, [[0, 1], [2, 3], [4, 5]]) == [0, 2, 4]
    assert plucked([0, 1], [[0, 1, 2], [3, 4, 5]]) == [(0, 1), (3, 4)]
    assert plucked(1, [[0], [0, 1]], None) == [None, 1]

    # Dictionary keys; 'price' is present on the second record only.
    data = [{'id': 1, 'name': 'cheese'}, {'id': 2, 'name': 'pies', 'price': 1}]
    assert plucked('id', data) == [1, 2]
    assert plucked('price', data, None) == [None, 1]
    assert plucked(['id', 'name'], data) == [(1, 'cheese'), (2, 'pies')]
    assert plucked(['name'], data) == [('cheese',), ('pies',)]
    expected_with_default = [(None, None), (1, None)]
    assert plucked(['price', 'other'], data, None) == expected_with_default

    # Missing entries without a default are expected to raise.
    assert raises(IndexError, lambda: plucked(1, [[0]]))
    assert raises(KeyError, lambda: plucked('name', [{'id': 1}]))
示例#5
0
    def test_average(self):
        """Compare per-key means from ``combine_by_key`` with a local reference.

        The values 0..99 are keyed by ``value // 20``. The reference means
        are computed locally with ``groupby``; the same (key, value) pairs
        are then reduced to (sum, count) per key through the context's
        collection and turned into means, and both mappings must be equal.
        """
        values = range(100)
        keys = [value // 20 for value in values]

        # Local reference: average each run of equal keys.
        expected = {}
        for key, group in groupby(zip(keys, values), itemgetter(0)):
            members = [value for _, value in group]
            expected[key] = sum(members) / len(members)

        def start(value):
            # First argument to combine_by_key: value -> (sum, count).
            return (value, 1)

        def add_value(acc, value):
            # Second argument: fold one more value into a (sum, count) pair.
            return (acc[0] + value, acc[1] + 1)

        def add_pairs(left, right):
            # Third argument: add two (sum, count) pairs element-wise.
            return (left[0] + right[0], left[1] + right[1])

        pairs = self.ctx.collection(zip(keys, values))
        sum_count = pairs.combine_by_key(start, add_value, add_pairs)

        def to_mean(key, value):
            return (key, value[0] / value[1])

        avg_by_key = sum_count.starmap(to_mean)

        self.assertDictEqual(avg_by_key.collect_as_map(), expected)
def test_pluck():
    """Check ``pluck`` on lists and dicts, including the ``no_default2`` case.

    Besides plain index / key lookups and a default of ``0`` for missing
    entries, the last two assertions expect ``no_default2`` (defined
    elsewhere) to behave like supplying no default at all.
    """
    # Integer indices into lists of lists.
    pairs = [[0, 1], [2, 3], [4, 5]]
    triples = [[0, 1, 2], [3, 4, 5]]
    ragged = [[0], [0, 1]]
    assert list(pluck(0, pairs)) == [0, 2, 4]
    assert list(pluck([0, 1], triples)) == [(0, 1), (3, 4)]
    assert list(pluck(1, ragged, None)) == [None, 1]

    # String keys into dicts; missing entries fall back to the default 0.
    data = [{"id": 1, "name": "cheese"}, {"id": 2, "name": "pies", "price": 1}]
    ids = list(pluck("id", data))
    prices = list(pluck("price", data, 0))
    id_name = list(pluck(["id", "name"], data))
    names_only = list(pluck(["name"], data))
    price_other = list(pluck(["price", "other"], data, 0))
    assert ids == [1, 2]
    assert prices == [0, 1]
    assert id_name == [(1, "cheese"), (2, "pies")]
    assert names_only == [("cheese", ), ("pies", )]
    assert price_other == [(0, 0), (1, 0)]

    # No default: a missing index or key is expected to raise.
    def index_out_of_range():
        return list(pluck(1, [[0]]))

    def key_missing():
        return list(pluck("name", [{"id": 1}]))

    assert raises(IndexError, index_out_of_range)
    assert raises(KeyError, key_missing)

    # Passing no_default2 explicitly: same results as with no default.
    assert list(pluck(0, pairs, no_default2)) == [0, 2, 4]

    def index_out_of_range_explicit():
        return list(pluck(1, [[0]], no_default2))

    assert raises(IndexError, index_out_of_range_explicit)
    # NOTE(review): this is the body of a loop/function whose header is not
    # visible here; `tree` and `dfs` come from that enclosing scope.
    root = tree.getroot()

    # `reviews` is only referenced by the commented-out print below.
    reviews = root.findall("Review")
    sentences = root.findall("**/sentence")
    # print("# Reviews   : ", len(reviews))
    print("# Sentences : ", len(sentences))

    # Collect the per-opinion attributes as parallel lists.
    opinions = root.findall("**/**/Opinion")
    categories = [opinion.attrib["category"] for opinion in opinions]
    targets = [opinion.attrib["target"] for opinion in opinions]
    # Each category string is split on '#'; element 0 is used as the entity
    # and element 1 as the aspect below.
    # NOTE(review): assumes every category contains a '#' -- confirm.
    entities_and_aspects = [cat.split('#') for cat in categories]
    polarities = [opinion.attrib["polarity"] for opinion in opinions]
    print("# Opinions  : ", len(opinions))
    # One row per opinion, ordered by entity.
    df = pd.DataFrame({
        "category": categories,
        "entity": list(pluck(0, entities_and_aspects)),
        'aspect': list(pluck(1, entities_and_aspects)),
        'target': targets,
        "polarity": polarities
    }).sort_values("entity")
    # Accumulate this frame for the concatenation done after the loop.
    dfs.append(df)

# Stack the collected per-document frames into a single table.
df_all = pd.concat(dfs)
# Falsy polarity values (e.g. an empty string) are relabelled 'unknown'.
df_all.polarity = df_all.polarity.apply(lambda label: label or 'unknown')
# Persist the combined table next to the other SemEval 2016 datasets.
df_all.to_csv(SEMEVAL_DATASETS_2016 / 'all-entities-and-aspects.csv')

g = sns.countplot(y="category",
                  hue="polarity",
                  data=df_all,
                  palette={
                      "neutral": "yellow",