Example #1
import os
import pickle
import random

# Estimator, Transformer and prolog_grammar are project-specific modules
# assumed to be importable from the surrounding code base.


class Actor:
    def __init__(self, args):
        self.estimator = Estimator(emb_dim=args.emb_dim,
                                   n_hidden=args.n_hidden,
                                   bidirectional=args.bi,
                                   n_layer=args.n_layer,
                                   dropout=args.dropout,
                                   lr=args.lr,
                                   decay=args.decay,
                                   lr_p=args.lr_p,
                                   clip=args.clip,
                                   batch_size=args.batch,
                                   epoch_num=args.epoch_num,
                                   cuda=args.cuda,
                                   path=args.path)

        self.transformer = Transformer(prolog_grammar.GRAMMAR_DICTIONARY,
                                       prolog_grammar.ROOT_RULE)

        self.performances = []
        self.actions = []
        self.path = args.path

    def search(self):
        # Evaluate the initial grammar, then apply 25 random transformation
        # steps, re-evaluating after each one.
        self.perform('initial')
        #exit(0)
        for i in range(25):
            print(i)
            try:
                self.step()
                self.perform(i)
            except BaseException as e:
                # On any failure, report what was done so far and persist the
                # grammar before exiting.
                print(e)
                print(self.actions)
                print(self.performances)
                with open('gra.pkl', 'wb') as f:
                    pickle.dump(self.transformer.get_grammar_dict(), f)
                exit(-1)
        print(self.performances)
        #exit(0)

    def step(self):
        # Pick a random, non-empty action category and apply one of its
        # actions to the grammar.
        action_space = self.transformer.get_act_space()
        method = []
        i = -1
        while len(method) == 0:
            i = random.randint(0, 3)
            method = action_space[i]
        action = random.choice(method)
        print(i, action)
        if i == 0:
            self.transformer.creat_nt(action)
        elif i == 1:
            self.transformer.merge_nt(action)
        elif i == 2:
            self.transformer.combine_nt(*action)
        else:
            assert i == 3
            self.transformer.delete_prod(action)
        self.actions.append((i, action))

    def perform(self, name):
        # Snapshot the current transformer to disk and evaluate the grammar
        # it produces.
        grammar_dict, root_rule = self.transformer.get_grammar_dict()
        with open(os.path.join(self.path, f'grammar-{name}'), 'wb') as f:
            pickle.dump(self.transformer, f)
        perform = self.estimator.estimate(grammar_dict,
                                          root_rule,
                                          toy=False,
                                          name=repr(name))
        self.performances.append(perform)
        print(perform)
        return perform

    def exp(self, name):
        for _ in range(100):
            self.step()
        self.perform(name)

    def one(self):
        #with open(path, 'rb') as f:
        #    self.transformer = pickle.load(f)
        for i in range(50):
            self.step()
        grammar_dict, root_rule = self.transformer.get_grammar_dict()
        for i in range(10000):
            perform = self.estimator.estimate(grammar_dict,
                                              root_rule,
                                              toy=False,
                                              name='tmp')
            print(perform)
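A minimal sketch of how this Actor might be driven; the argparse flags and default values below are illustrative assumptions that mirror the fields the constructor reads, not values from the original source:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--emb_dim', type=int, default=128)   # illustrative defaults
parser.add_argument('--n_hidden', type=int, default=256)
parser.add_argument('--bi', action='store_true')
parser.add_argument('--n_layer', type=int, default=1)
parser.add_argument('--dropout', type=float, default=0.1)
parser.add_argument('--lr', type=float, default=1e-3)
parser.add_argument('--decay', type=float, default=0.0)
parser.add_argument('--lr_p', type=float, default=1e-3)
parser.add_argument('--clip', type=float, default=5.0)
parser.add_argument('--batch', type=int, default=32)
parser.add_argument('--epoch_num', type=int, default=10)
parser.add_argument('--cuda', action='store_true')
parser.add_argument('--path', default='runs/actor')
args = parser.parse_args()

actor = Actor(args)
actor.search()  # 25 random grammar-transformation steps, evaluated after each one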
Example #2
import json
import logging
import time

# Assumed module-level logger; _calculate_sample_size_2, _countElements,
# _expand, Estimator and frequents are defined in the surrounding project.
log = logging.getLogger(__name__)


def alg(sc,
        data_set_rdd,
        data_set_size,
        threshold,
        epsilon,
        randomized=True,
        alpha=0.1):
    """Approximate frequent-itemset mining over data_set_rdd.

    Frequent singletons are estimated from three independent samples (kept if
    frequent in at least two of them), then itemsets are grown level by level
    and collected in a frequents.Frequents lattice.
    """
    data_set_rdd.cache()
    partitions_num = data_set_rdd.getNumPartitions()
    sample_size = _calculate_sample_size_2(threshold, data_set_size, epsilon,
                                           alpha)
    collected_sample = data_set_rdd.sample(False,
                                           float(sample_size) /
                                           data_set_size).collect()
    collected_sample2 = data_set_rdd.sample(False,
                                            float(sample_size) /
                                            data_set_size).collect()
    collected_sample3 = data_set_rdd.sample(False,
                                            float(sample_size) /
                                            data_set_size).collect()
    # collected_sample4 = data_set_rdd.sample(False, float(sample_size) / data_set_size).collect()
    # collected_sample5 = data_set_rdd.sample(False, float(sample_size) / data_set_size).collect()
    log.info('Using sample of size %d', sample_size)
    print('Using sample of size %d' % sample_size)
    print('ratio - %f' % (float(sample_size) / data_set_size))
    scaled_threshold = float(
        threshold) * sample_size / data_set_size if randomized else threshold
    frequencies1 = _countElements(
        collected_sample,
        float(threshold) * sample_size / data_set_size)
    common_elements1 = set(frequencies1.keys())
    frequencies2 = _countElements(
        collected_sample2,
        float(threshold) * sample_size / data_set_size)
    common_elements2 = set(frequencies2.keys())
    del collected_sample2
    frequencies3 = _countElements(
        collected_sample3,
        float(threshold) * sample_size / data_set_size)
    common_elements3 = set(frequencies3.keys())
    del collected_sample3
    # frequencies4 = _countElements(collected_sample4, float(threshold) * sample_size / data_set_size)
    # common_elements4 = set(frequencies4.keys())
    # del collected_sample4
    # frequencies5 = _countElements(collected_sample5, float(threshold) * sample_size / data_set_size)
    # common_elements5 = set(frequencies5.keys())
    # del collected_sample5
    common_candidates = common_elements1.union(common_elements2).union(
        common_elements3)  #.union(common_elements4).union(common_elements5)
    common_elements_set = set()
    # Keep only candidates that are frequent in at least two of the three
    # samples (majority vote).
    for candidate in common_candidates:
        i = 0
        if candidate in common_elements1:
            i += 1
        if candidate in common_elements2:
            i += 1
        if candidate in common_elements3:
            i += 1
        # if candidate in common_elements4:
        #     i += 1
        # if candidate in common_elements5:
        #     i += 1
        if i >= 2:
            common_elements_set.add(candidate)
    # frequencies = _get_averages(frequencies1, frequencies2, frequencies3)
    # common_elements = [k for k in frequencies.keys() if frequencies[k] >= float(threshold) * sample_size / data_set_size]
    common_elements = list(common_elements_set)
    data_estimator = Estimator(sc.parallelize(collected_sample)) if randomized \
        else Estimator(data_set_rdd)

    # log.info('Estimating singletons frequencies')
    # start = time.time()

    # log.info('There are %d common elements', len(common_elements))
    # log.info('Common elements are - %s', common_elements)
    # end = time.time()
    # log.info('Singletons frequencies computation completed in %d seconds', end - start)
    # singletons = [(set([item]), frequencies[item] * data_set_size / sample_size) for item in common_elements]
    singletons = data_estimator.getSingletons()
    # common_elements = data_estimator.estimate(singletons)
    cis_tree = frequents.Frequents()
    # common_cached = data_estimator.estimate_commons(singletons.collect(), scaled_threshold)
    candidates = [set([i]) for i in common_elements]
    iteration = 1
    scaling_factor = data_set_size / sample_size if randomized else 1.0

    while candidates:
        log.info('Iteration %d starts. candidates set size is %d', iteration,
                 len(candidates))
        log.info('Starting Estimating and filtering. There are %d candidates',
                 len(candidates))
        start = time.time()
        next_level = data_estimator.estimate(candidates).filter(
            lambda pair: pair[1][1] >= scaled_threshold).map(
                lambda x: (x[1][0],
                           int(min(x[1][1] * scaling_factor, data_set_size))))
        next_level.cache()
        cis_next_level = next_level.collect()
        cis_next_level = [x for x in cis_next_level
                          if x[0].issubset(common_elements_set)]
        end = time.time()
        log.info(
            'Estimation and filter done in %d seconds. Filtering candidates',
            end - start)
        if not cis_next_level:
            log.info('No candidates remained. Quitting iteration %d', iteration)
            break
        log.info(
            'Adding new computed level to the resulting lattice, of size %d',
            len(cis_next_level))
        log.info('New level is - %s', cis_next_level)
        start = time.time()
        cis_tree.add_level(cis_next_level)
        end = time.time()
        log.info('Next level addition to lattice completed in %d seconds',
                 end - start)
        start = time.time()
        candidates = _expand(next_level, common_elements, partitions_num)
        end = time.time()
        log.info(
            'Fast expansion took %d seconds and created %d candidates, Iteration %d completed',
            end - start, len(candidates), iteration)
        log.info('New candidates are %s', candidates)

        iteration += 1

    if not randomized:
        cis_tree.result = [(itemset.items, itemset.frequency)
                           for itemset in cis_tree.get_all()]
        cis_tree.result = {
            str(sorted(list(i[0]))): i[1]
            for i in cis_tree.result
        }
        return cis_tree
    # return cis_tree

    estimator = Estimator(data_set_rdd)
    final_itemsets = [itemset.items for itemset in cis_tree.get_all()]
    cis_tree.result = estimator.compute(final_itemsets).collect()
    cis_tree.result = {
        str(sorted(list(json.loads(i[0])))): i[1]
        for i in cis_tree.result
    }
    return cis_tree
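A usage sketch for alg, assuming each RDD record is a set of items; the input path, threshold and epsilon values below are illustrative assumptions:

from pyspark import SparkContext

sc = SparkContext(appName='sampled-frequent-itemsets')
# Hypothetical input: one whitespace-separated transaction per line.
transactions = sc.textFile('transactions.txt').map(
    lambda line: frozenset(line.split()))
total = transactions.count()

result_tree = alg(sc, transactions, total,
                  threshold=1000, epsilon=0.05, randomized=True, alpha=0.1)
# result maps each sorted item list (as a string) to its estimated frequency
print(result_tree.result)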