def query(self, q, provider=None, uri=None):
    filterctx = self.filters.build_filter_context(q)

    # Build scraper contexts
    if provider or uri:
        scrapectxs = [self.scraper.build_context(provider=provider,
                                                 uri=uri)]
    else:
        scrapectxs = self.scraper.build_contexts_for_query(q)

    results = self.scraper.process(*scrapectxs)
    results = analyze.analyze(*results, mp=False)

    if not results:
        msg = "No results found for %r"
        msg = msg % q
        print(msg)
        return

    msg = "Found %s sources"
    msg = msg % (len(results),)
    print(msg)

    # Filter results
    results = self.filters.apply(filterctx, results)
    msg = "Got %s matching sources for %r"
    msg = msg % (len(results), q)
    print(msg)

    groups = self.filters.sort(results)
    return groups
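# A minimal usage sketch, not the project's documented API: it assumes
# this method lives on an app-like object wired with `filters` and
# `scraper` engines, and that `q` is a query.Query as in do_query2
# below. The provider name and query parameter are hypothetical.
#
#   q = query.Query(name_glob='*ubuntu*')
#   groups = app.query(q, provider='someprovider')
#   # `groups` is whatever self.filters.sort() returns, or None when
#   # no sources were found.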
def test_source_with_invalid_type_hint(self):
    src = build_source('foo')  # build_source doesn't run parsing
    src.hints = {'type': 'other'}
    asrc = analyze(src, mp=False)[0]
    self.assertIsInstance(asrc.entity, Movie)
def run_analyze(self, app, args):
    raw = json.loads(args.input.read())
    # Accept a single source object or a list of them
    if isinstance(raw, dict):
        raw = [raw]

    raw = [schema.Source(**x) for x in raw]
    proc = analyze.analyze(*raw, mp=False)

    output = json.dumps([x.dict() for x in proc], indent=2,
                        default=_json_encode_hook)
    args.output.write(output)
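# _json_encode_hook is used as the json.dumps default= callback above
# but is not shown in this section. A minimal sketch of such a hook,
# assuming the serialized objects may carry datetime fields; the real
# implementation may handle more types:

import datetime


def _json_encode_hook(obj):
    # json.dumps calls this only for objects it cannot serialize
    # natively; emit datetimes as ISO-8601 strings and re-raise
    # TypeError for anything else, as the protocol expects.
    if isinstance(obj, (datetime.date, datetime.datetime)):
        return obj.isoformat()
    raise TypeError("Object of type %s is not JSON serializable"
                    % type(obj).__name__)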
def do_query2(self, app, args):
    def _parse_queryparams(pairs):
        for pair in pairs:
            key, value = pair.split('=', 1)
            if not key or not value:
                raise ValueError(pair)

            yield (key, value)

    if not args.queryparams and not args.querystring:
        errmsg = "filter or querystring are required"
        print(errmsg, file=sys.stderr)
        raise extensions.CommandUsageError()

    q = {}
    if args.querystring:
        q = query.Query.fromstring(args.querystring)

    if args.queryparams:
        params = dict(_parse_queryparams(args.queryparams))
        q = query.Query(**params)

    # Set up filters before scraping anything
    query_engine = query.Engine()
    try:
        filters = query_engine.build_filter(q)
    except query.MissingFiltersError as e:
        errmsg = "Unknown filters: %s"
        errmsg = errmsg % ', '.join(e.args[0])
        print(errmsg, file=sys.stderr)
        raise extensions.CommandUsageError()

    # Build scrape contexts and process them
    scrape_engine = scraper.Engine()
    ctxs = scrape_engine.build_contexts_for_query(q)
    sources = scrape_engine.process(*ctxs)
    sources = analyze.analyze(*sources)

    # Pass sources through the filters
    results = query_engine.apply(filters, sources)
    results = query_engine.sort(results)

    # Output
    results = [[entity.dict(), [src.dict() for src in sources]]
               for (entity, sources) in results]
    output = json.dumps(results, indent=2, default=_json_encode_hook)
    args.output.write(output)
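# Illustration of the key=value parsing done by _parse_queryparams
# above; the sample pairs are hypothetical. Values stay strings, any
# coercion being left to query.Query:
#
#   dict(_parse_queryparams(['type=movie', 'year=2020']))
#   -> {'type': 'movie', 'year': '2020'}
#
# A pair with an empty key or value ('type=' or '=movie') raises
# ValueError with the offending pair as its argument.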