def _filter_by_query(self, spider):
    """Return the recently indexed items of ``spider`` matching the QUERY.

    Call this after the freshly fetched items have been uploaded to Solr
    but before the spider's last crawl time is updated.

    The search window starts one second after ``spider.last_ts`` (so the
    last already-seen result is hidden) and never reaches further back
    than ``settings.POSTS_TTL`` days.  When ``spider.last_ts`` is not set,
    it is initialised to that TTL cutoff as a side effect.

    :param spider: object exposing ``name`` and ``last_ts``.
    :returns: the object returned by ``self.solr.search``; the ``date`` of
        every item in it is re-formatted from ``settings.SOLR_DATE_FORMAT``
        to ``settings.DATE_FORMAT``.
    """
    # Lower bound of the window, derived from settings.POSTS_TTL.
    last_to_show = (datetime.datetime.now() -
                    datetime.timedelta(days=settings.POSTS_TTL))
    if not spider.last_ts:
        spider.last_ts = last_to_show

    # increment date by 1 second to hide last seen result
    # FIXME how can we do it with a solr query?
    inc_date = max(spider.last_ts + datetime.timedelta(seconds=1),
                   last_to_show)

    query = ((u"%(query)s AND date:([%(date)s TO NOW]) "
              "AND source: %(source)s") %
             {'query': settings.QUERY,
              'date': utils.convert_date_to_solr_date(inc_date),
              'source': spider.name})
    items = self.solr.search(query, sort="date desc",
                             rows=settings.QUERY_ROWS)

    # convert dates to human-readable non-solr format
    # FIXME move to utils
    for item in items:
        dt = datetime.datetime.strptime(item['date'],
                                        settings.SOLR_DATE_FORMAT)
        item['date'] = dt.strftime(settings.DATE_FORMAT)
    return items
def cleanup():
    """Delete Solr documents dated at or before the post TTL cutoff.

    The cutoff is the current time minus ``settings.POSTS_TTL`` days.
    Returns a ``text/xml`` response wrapping whatever the Solr delete
    call returned.
    """
    client = pysolr.Solr(settings.SOLR_URL, timeout=settings.SOLR_TIMEOUT)

    # Everything with a date up to (and including) this moment has expired.
    ttl = datetime.timedelta(days=settings.POSTS_TTL)
    cutoff = utils.convert_date_to_solr_date(datetime.datetime.now() - ttl)

    expired_query = "date:[* TO %s]" % cutoff
    payload = client.delete(q=expired_query)
    return Response(payload, mimetype="text/xml")