def safely_call(self, func):
    # TODO We need to handle "connection aborted" ConnectionResetError too.
    """A convenience function which calls the 0-ary function supplied,
    while handling Wikipedia errors."""
    try:
        return func()
    except wikipedia.PageError as e:
        echo("Found red link", e)
        return None
    except wikipedia.DisambiguationError as e:
        echo("Ambiguous article found", e)
        return None
def expr_run(args):
    """Read the command expressions, run each search, and print the result
    as XML; on a tokenize error, report it and print an empty <data /> element."""
    try:
        exprs = command.read(args.expr())
        parts = {}
        for expr in exprs:
            search.CommandSearch(expr, parts).run()
        print(ET.tostring(xmlify.xmlify(parts)).decode())
    except TokenizeError as e:
        logger.echo(str(e))
        print("<data />")
def __init__(self, argv, arg_set):
    """
    Parses the argument list using getopt. The specific allowable
    arguments should be passed as the second argument, as an instance
    of ArgSet.
    """
    try:
        arglist = arg_set.string
        self._args = dict(getopt(argv, arglist)[0])
    except GetoptError as e:
        self._args = {}
        echo("Error in arguments:", e)
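# Illustrative sketch (not part of the module): how the dict(getopt(...)[0])
# idiom above turns a flag spec into a dictionary of parsed arguments. The
# "-d"/"-b" flags and values here are hypothetical, standing in for whatever
# spec arg_set.string actually provides.
from getopt import getopt

opts, rest = getopt(["-d", "3", "-b", "Physics"], "d:b:")
print(dict(opts))  # {'-d': '3', '-b': 'Physics'}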
def _crawl_once(page, depth_):
    self.wait()
    echo("Trying", escape(page.title), "at", depth_, flush=True)
    if match_function(page):
        echo("Taking", escape(page.title))
        return page
    elif depth_ >= self.depth:
        return None
    else:
        state = LinkState(page, self.depth - depth_, self.depth)
        link = self.selector.select_link(state)
        if link is None:
            return None
        new_page = wikipedia.page(link)
        return _crawl_once(new_page, depth_ + 1)
def select_link(self, state):
    page = state.page()
    if not self.pages:
        # We are on the first page so store this one too
        self.pages.append(page.title)
    # Candidate links: drop pages already visited and pages with no recorded
    # score, then pair each remaining title with its score as a float.
    pages = filter(lambda x: x not in self.pages, page.links)
    pages = filter(lambda x: self.db.get_score(x).denom > 0, pages)
    pages = map(lambda x: (x, float(self.db.get_score(x))), pages)
    pages = list(pages)
    if not pages or self.should_explore(state):
        echo("Exploring", level=2)
        result = self.explore.select_link(state)
    else:
        echo("Using prior knowledge", level=2)
        result = self._weighted_random(pages)
    if result:
        self.pages.append(result)
    return result
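# Illustrative sketch (an assumption, not the crawler's actual _weighted_random):
# one way to pick a title with probability proportional to its score from the
# (title, score) pairs built above.
import random

def weighted_random_choice(pages):
    titles = [title for title, _ in pages]
    weights = [score for _, score in pages]
    if not titles or sum(weights) <= 0:
        return None
    return random.choices(titles, weights=weights, k=1)[0]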
def _crawl_once(page, depth_):
    self.wait()
    echo(" Trying:", escape(page.title), "(" + str(depth_) + ")", flush=True)
    if match_function(page):
        echo(" Accepted:", escape(page.title))
        return page
    elif depth_ >= self.depth:
        return None
    else:
        state = LinkState(page, self.depth - depth_, self.depth)
        link = self.selector.select_link(state)
        if link is None:
            return None
        new_page = wikipedia.page(link)
        return _crawl_once(new_page, depth_ + 1)
def _safely_call(n):
    try:
        return func()
    except wikipedia.PageError as e:
        echo("Found red link:", werror.wrap(e))
        return None
    except wikipedia.DisambiguationError as e:
        echo("Ambiguous article found:", werror.wrap(e))
        return None
    except ConnectionResetError as e:
        if n < self.max_aborts:
            echo("Connection reset", e, "...", "retrying")
            return _safely_call(n + 1)
        else:
            echo("Connection reset", e, "...", "aborting")
            return None
    except requests.exceptions.ConnectionError as e:
        echo("Aborting because of connection error", e)
        return None
def crawl_times(self, base, match_function):
    """
    Performs self.crawl_once until a single match is found or
    self.max_tries attempts have been made.
    """
    if type(base) is str:
        echo("Basis:", escape(base))
        base = self.safely_call(lambda: wikipedia.page(base))
        if not base:
            return None
    else:
        echo("Basis:", escape(base.title))
    for i in range(0, self.max_tries):
        res = self.crawl_once(base, match_function)
        if res:
            return res
        else:
            echo(" Rejected")
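# Hypothetical usage sketch: the crawler object and the predicate below are
# assumptions, shown only to illustrate how crawl_times takes a starting page
# (a title string or an already-fetched page) plus a match function and
# returns the first accepted page, or None after max_tries rejections.
result = crawler.crawl_times(
    "Mathematics",
    lambda page: "topology" in page.summary.lower(),
)
if result:
    print("Found:", result.title)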