def export(stream=sys.stdout):
    """Export file format:

    json_repr [SEPARATOR json_repr]
    """
    json_list = [s.to_dict() for s in spider_utils.find_spiders()
                 if s.type == "vk"]
    for s_repr in json_list:
        json.dump(s_repr, stream, separators=(',', ': '), indent=2)
        stream.write(settings.SPIDER_SEPARATOR)
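
# The export stream above is just pretty-printed JSON documents joined by
# settings.SPIDER_SEPARATOR. For illustration, a minimal sketch of the
# inverse operation, reusing this module's json import; load_exported is
# a hypothetical helper (not part of this project) and assumes the
# separator string never occurs inside a JSON body:
def load_exported(stream, separator=settings.SPIDER_SEPARATOR):
    chunks = stream.read().split(separator)
    return [json.loads(chunk) for chunk in chunks if chunk.strip()]
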
def crawl_all(token=None):
    if not token:
        LOG.warning("No token passed, "
                    "acquiring one using login data from settings")
        token = utils.get_access_token()
    LOG.info("Access token: %s", token)
    runner = crawler.CrawlerRunner(project.get_project_settings())
    dispatcher.connect(on_close, signal=signals.spider_closed)
    for spider_cls in spider_utils.find_spiders():
        # FIXME encapsulation violation:
        # inject the access token into each VK spider class
        spider_cls.access_token = token
        RUNNING_CRAWLERS.append(spider_cls)
        runner.crawl(spider_cls)
    d = runner.join()
    d.addBoth(lambda _: send_mail())
    internet.reactor.run()
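
# on_close is connected to spider_closed above but defined elsewhere in
# this module. A plausible shape, sketched here as an assumption rather
# than the module's actual handler: since RUNNING_CRAWLERS tracks the
# spider classes passed to runner.crawl(), the handler can drop the
# finished spider and stop the reactor once the last one is gone.
def on_close_sketch(spider):
    RUNNING_CRAWLERS.remove(spider.__class__)
    if not RUNNING_CRAWLERS:
        internet.reactor.stop()
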
def query_results():
    query = request.args.get('q', '')
    token = request.args.get('access_token', '')
    sources = request.args.getlist('source')
    if sources:
        filter_sources = 'source:(%s)' % ' '.join(sources)
        query = (filter_sources if not query.strip()
                 else query + ' AND ' + filter_sources)
    if not query.strip():
        # nothing selected: output all results; an empty q parameter
        # would be a Solr syntax error, so match everything explicitly
        query = '*:*'
    # FIXME some query preprocessing may be needed
    solr = pysolr.Solr(settings.SOLR_URL, timeout=settings.SOLR_TIMEOUT)
    items = solr.search(query, sort="date desc", rows=settings.QUERY_ROWS)
    items_out = list(items.docs)
    for item in items_out:
        # convert dates from Solr's internal format
        # to the user-facing format from settings
        dt = datetime.datetime.strptime(item['date'],
                                        settings.SOLR_DATE_FORMAT)
        item['date'] = dt.strftime(settings.DATE_FORMAT)
    spiders = [s.name for s in spider_utils.find_spiders()]
    return render_template('show_items.html', items=items_out, query=query,
                           spiders=spiders, access_token=token)
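
# query_results reads request.args and calls render_template, so it is
# served as a Flask view; the actual routing lives elsewhere in the app.
# A hypothetical registration (the '/results' path and the app object
# are assumptions, not this project's layout):
#
#     app.add_url_rule('/results', 'query_results', query_results)
#
# A request such as GET /results?q=python&source=vk would then query
# Solr for:  python AND source:(vk)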