示例#1
0
def test_df(sample):
    """Check ``degrees_of_freedom`` for each method on a tree built from ``sample``.

    A context tree is initialised from ``sample`` (presumably a pytest
    fixture -- confirm against the test configuration) and the value
    returned for the 'g4l', 'perl' and 'ct06' methods is compared with
    fixed expectations.
    """
    from g4l.context_tree import ContextTree
    from g4l.util.degrees_of_freedom import degrees_of_freedom

    ctx_tree = ContextTree.init_from_sample(sample)

    # 'g4l' returns a sized collection: nodes have different dfs.
    per_node_dfs = degrees_of_freedom('g4l', ctx_tree)
    assert len(per_node_dfs) == 53

    # The remaining methods are compared against a single scalar.
    for method, expected in (('perl', -1), ('ct06', -0.5)):
        assert degrees_of_freedom(method, ctx_tree) == expected
def load_cache(estimator, X):
    """
    Restore previously estimated context trees from the cache.

    The cache folder and pickle file are resolved through
    ``cache_file(estimator, X)``. The result folder is unique for each
    set of SMC parameters (epsilon and penalty interval) and sample.

    Parameters
    ----------
    estimator : g4l.estimators.SMC
        Estimator to populate. Its ``max_depth``, ``penalty_interval``,
        ``epsilon``, ``cache_dir``, ``thresholds`` and ``context_trees``
        attributes are overwritten with the cached values.
    X : g4l.data.Sample
        A sample

    Returns
    -------
    bool
        ``False`` when the cache file does not exist; ``True`` once the
        estimator has been populated from the cache.

    """
    cache_folder, cachefile = cache_file(estimator, X)
    try:
        with open(cachefile, 'rb') as fh:
            cached = pickle.load(fh)
        print("Loaded from cache")
    except FileNotFoundError:
        return False

    # Copy the plain settings back onto the estimator, one by one.
    for attr in ('max_depth', 'penalty_interval', 'epsilon',
                 'cache_dir', 'thresholds'):
        setattr(estimator, attr, cached[attr])

    # The pickle stores only the number of trees; each tree is read from
    # its own '<index>.tree' file inside the cache folder.
    estimator.context_trees = []
    for idx in range(cached['context_trees']):
        tree_path = '%s/%s.tree' % (cache_folder, idx)
        estimator.context_trees.append(ContextTree.load_from_file(tree_path))
    return True
def calculate_likelihoods(champion_trees_folder,
                          resamples_file, resample_size,
                          subsamples_separator=None, num_cores=0):
    """
    Run ``calc_likelihood_process`` for every (tree, resample) index pair.

    Parameters
    ----------
    champion_trees_folder
        Folder handed to ``list_files`` to enumerate the tree files.
    resamples_file
        File handed to ``resamples``; the length of its result gives the
        number of resamples.
    resample_size
        Forwarded unchanged to each worker call.
    subsamples_separator : optional
        Forwarded unchanged to each worker call.
    num_cores : int or None
        When greater than 1, the work is distributed over a ``Pool`` of
        that many processes; otherwise (including ``None``) it runs
        serially in the current process.

    Returns
    -------
    The worker results reshaped with ``np.reshape`` to
    ``(num_trees, num_resamples)``.

    """
    tree_files = list_files(champion_trees_folder)
    # Trees and resamples are loaded here only to be counted; each worker
    # call receives the folder/file arguments plus a pair of indices.
    n_trees = len([ContextTree.load_from_file(path) for path in tree_files])
    n_resamples = len(resamples(resamples_file))

    tasks = [(champion_trees_folder, resamples_file, resample_size,
              tree_idx, resample_idx, subsamples_separator)
             for tree_idx, resample_idx
             in product(range(n_trees), range(n_resamples))]

    if num_cores is not None and num_cores > 1:
        with Pool(num_cores) as pool:
            outcomes = pool.imap(calc_likelihood_process, tasks)
            # Consume the iterator while the pool is still alive.
            likelihoods = list(tqdm.tqdm(outcomes, total=len(tasks)))
    else:
        outcomes = map(calc_likelihood_process, tasks)
        likelihoods = list(tqdm.tqdm(outcomes, total=len(tasks)))

    return np.reshape(likelihoods, (n_trees, n_resamples))
 def build(self):
     """Create a ``ContextTree`` from the contexts collected on this object.

     If ``self.contexts`` holds no entry for the empty context (''), one
     is appended first. Its frequency list has one value per symbol of
     ``self.A``: the summed values of the length-one context equal to
     that symbol, or 0 when no such context exists.

     Returns
     -------
     ContextTree
         Built from the contexts dataframe, the transition
         probabilities and the maximum length found in the dataframe's
         ``node`` column.
     """
     from g4l.context_tree import ContextTree

     has_empty_context = any(ctx == '' for ctx, _ in self.contexts)
     if not has_empty_context:
         # Total per length-one context, keyed by that context.
         totals = {ctx: sum(ps) for ctx, ps in self.contexts
                   if len(ctx) == 1}
         root_freqs = [totals.get(symbol, 0) for symbol in self.A]
         self.contexts.append(('', root_freqs))

     contexts_df = self._build_contexts_dataframe()
     transition_probs = self._build_transition_probs()
     deepest = contexts_df.node.str.len().max()
     return ContextTree(deepest, contexts_df, transition_probs)
def create_samples(args):
    """
    Generate a sample from a stored context tree and write it to a file.

    Parameters
    ----------
    args
        Object exposing ``tree.name`` (path of the tree file to load),
        ``size`` (forwarded to ``generate_sample``) and
        ``output_file.name`` (path of the file to write). Presumably an
        ``argparse.Namespace`` -- confirm against the caller.

    """
    tree = ContextTree.load_from_file(args.tree.name)
    sample_text = tree.generate_sample(args.size) + '\n'
    out_path = os.path.abspath(args.output_file.name)
    # Use a context manager so the handle is flushed and closed right
    # away instead of whenever the file object is garbage collected.
    with open(out_path, 'w') as out:
        out.write(sample_text)
    logging.info("Sample '%s' generated", out_path)
 def load_trees(self):
     """Load every '*.tree' file of the 'champion_trees' subfolder.

     The files found under ``self.folder``/champion_trees are visited in
     sorted path order and the loaded trees are stored, in that order,
     in ``self.champion_trees`` (which is reset first).
     """
     trees_dir = os.path.join(self.folder, 'champion_trees')
     pattern = trees_dir + "/*.tree"
     self.champion_trees = []
     for tree_path in sorted(glob.glob(pattern)):
         self.champion_trees.append(ContextTree.load_from_file(tree_path))
 def optimal_tree(self):
     """Load and return the tree stored as 'optimal.tree' in ``self.folder``."""
     optimal_path = os.path.join(self.folder, 'optimal.tree')
     tree = ContextTree.load_from_file(optimal_path)
     return tree
def get_tree(trees_folder, idx):
    """Load the tree at position ``idx`` of ``list_files(trees_folder)``."""
    tree_files = list_files(trees_folder)
    selected = tree_files[idx]
    return ContextTree.load_from_file(selected)