def download(url, usecache=True, cached=None, cachedir='cache~/', cachedonly=False, **opts):
    """Download (or cache) ``url`` to file.

    On success: return file name of stored contents. Upon failure: return
    None.

    Will retry ``tries`` times with ``pause`` seconds between each attempt
    to download. Download will timeout after ``timeout`` seconds.  (Those
    options are forwarded to ``robust_download`` through ``opts``.)

    If ``cachedonly`` is enabled, this function will not download anything.
    It will simply return the cached filename if it exists.
    """
    if not cached:
        if cachedir:
            mkdir(cachedir)
            # Derive a filesystem-safe cache name from the URL.
            cached = os.path.join(cachedir, secure_filename(url))
        else:
            # NOTE: `assert` is stripped under -O; kept as-is so callers
            # catching AssertionError are unaffected.
            assert not usecache, 'must specify cachedir'

    if cachedonly:
        # Fix: honor the documented contract -- with `cachedonly` we never
        # download, regardless of `usecache`.  (Previously an existing
        # cached file with usecache=False fell through to a download.)
        return cached if os.path.exists(cached) else None

    if usecache and os.path.exists(cached):
        return cached

    # use wget for ftp files
    if url.startswith('ftp'):
        return wget(url, cached)
    if url.startswith('http'):
        return robust_download(url, cached, **opts)
def __init__(self, corpus, Y, train, dev, initial_contexts, outer_iterations, inner_iterations, group_budget, regularizer, allowed_contexts, dump, no_failure_arcs=0): self.no_failure_arcs = no_failure_arcs # if true, runs model with last-char subst closure. # Create initial pattern set. VoCRF.__init__(self, Y, initial_contexts) self.dump = None if dump is not None: self.dump = Path(dump) mkdir(self.dump) self.corpus = corpus self.dev_best = -np.inf # the set of allowed contexts must be prefix closed to make sense. self.allowed_contexts = None if allowed_contexts is not None: self.allowed_contexts = set(prefix_closure(allowed_contexts)) self.train = train self.dev = dev # max number of higher-order features = # budget [green nodes - the max number of 'active' contexts at any time] # x extensions = |Y| [yellow nodes - a little room to grow] # x number of labels [because that's how we encode features] XXX: I think this is an overestimate we want |states| x |labels| self.H = max(group_budget * len(Y), len(self.C)) * self.A self.D = MAGIC * self.A self.group_budget = group_budget self.regularizer = regularizer / len(self.train) L = 2 if regularizer > 0 else -1 self.sparse = LazyRegularizedAdagrad(self.D, L=L, C=self.regularizer) self.dense = OnlineProx(self.group_structure(), self.H, L=L, C=self.regularizer) self.inner_iterations = inner_iterations self.outer_iterations = outer_iterations self.log = []