import datetime

from scrapy.exceptions import DropItem
from solrcloudpy import SolrConnection

# SOLR_SERVERS, SOLR_COLLECTION_DEFAULT and SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER
# are project settings; adjust this import to wherever they are defined.
from myproject.settings import (SOLR_SERVERS, SOLR_COLLECTION_DEFAULT,
                                SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER)


class DuplicatesPipeline(object):
    """Drop items whose report_link is already indexed in Solr or was
    already seen during this crawl (in-memory cache)."""

    def __init__(self):
        servers = SOLR_SERVERS
        self.cnn = SolrConnection(servers)[SOLR_COLLECTION_DEFAULT]
        self.cache_list = []

    def process_item(self, item, spider):
        # Quote the link so URL punctuation (":" , "/") does not break
        # the Solr query syntax.
        query = {"q": 'report_link:"%s"' % item["report_link"]}
        if (self.cnn.search(query).result.response.numFound != 0
                or self.cache_duplicate(item["report_link"])):
            raise DropItem("Duplicate item found: %s" % item["report_link"])
        if "report_revision_time_standard" in item:
            # Convert from UTC+8 to UTC and render as a Solr date string.
            delta = datetime.timedelta(hours=8)
            utc_time = item["report_revision_time_standard"] - delta
            item["report_revision_time_standard"] = (
                utc_time.strftime("%Y-%m-%dT%H:%M:%S") + "Z")
        return item

    def cache_duplicate(self, report_link):
        """Return True if report_link was seen before; otherwise cache it."""
        if report_link in self.cache_list:
            return True
        if len(self.cache_list) > SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER:
            # Cap memory use by resetting the cache once it grows too
            # large, then cache the current link as well.
            self.cache_list = []
        self.cache_list.append(report_link)
        return False
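# To activate the pipeline, register it in the Scrapy project settings.
# A minimal sketch, assuming the class lives in myproject/pipelines.py;
# the module path and the constant values below are placeholders, not
# taken from the original project.

# settings.py
ITEM_PIPELINES = {
    "myproject.pipelines.DuplicatesPipeline": 300,  # 0-1000, lower runs first
}

SOLR_SERVERS = ["localhost:8983"]
SOLR_COLLECTION_DEFAULT = "collection1"
SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER = 10000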
import logging

from solrcloudpy import SolrConnection, SearchOptions

logger = logging.getLogger(__name__)


class SolrBackend(object):
    """Mirror a database table into a Solr core, keeping the index in
    sync via pyDAL-style insert/update/delete callbacks."""

    def __init__(self, table, core="collection1"):
        self.table = table
        self.core = core
        self.url = 'localhost:8983'
        try:
            self.interface = SolrConnection(self.url)[self.core]
        except Exception as e:
            logger.warning("Cannot connect to Solr: %s" % e)
            raise RuntimeError("Cannot connect to Solr: %s" % e)

    def get_ids(self, queryset):
        return [r.id for r in queryset.select(self.table._id)]

    def indexes(self, *fieldnames):
        """Declare which table fields are mirrored into Solr."""
        self.fieldnames = fieldnames

    def after_insert(self, fields, id):
        document = [{'id': id}]
        for name in self.fieldnames:
            if name in fields:
                document[0][name] = str(fields[name])
        self.interface.add(document)
        self.interface.commit()
        return True

    def after_update(self, queryset, fields):
        """Caveat: this is only correct if ALL indexed fields are
        updated at once, since the old document is deleted first."""
        ids = self.get_ids(queryset)
        documents = []
        for id in ids:
            self.interface.delete({'q': 'id:%i' % id})
            document = {'id': id}
            for name in self.fieldnames:
                if name in fields:
                    document[name] = str(fields[name])
            documents.append(document)
        self.interface.add(documents)
        self.interface.commit()
        return True

    def index_table(self, query, db):
        """Full (re)index of every row matched by query."""
        for row in db(query).select():
            self.interface.delete({'q': 'id:%i' % row.id})
        self.interface.commit()
        documents = []
        for row in db(query).select():
            document = {'id': row.id}
            for name in self.fieldnames:
                document[name] = str(row[name])
            documents.append(document)
        self.interface.add(documents)
        self.interface.commit()
        return True

    def update(self, query, fields, db, **core_fields):
        """Re-index the selected rows, keeping only the columns whose
        names contain one of the core_fields keys."""
        rows = db(query).select(*fields)
        documents = []
        for row in rows:
            document = {}
            for key in row.keys():
                for core_field in core_fields:
                    if core_field in key:
                        document[key] = str(row[key])
                if key == 'id':
                    # Drop the stale document, and keep the id so the
                    # replacement can be matched back to the row.
                    self.interface.delete({'q': 'id:%i' % row[key]})
                    document[key] = str(row[key])
            documents.append(document)
        self.interface.add(documents)
        self.interface.commit()
        return True

    def before_delete(self, queryset):
        # Capture the ids now; the rows are gone by after_delete time.
        self.ids = self.get_ids(queryset)
        return False

    def after_delete(self, queryset):
        # Use the ids captured in before_delete: the queryset no longer
        # matches anything once the rows have been deleted.
        for id in self.ids:
            self.interface.delete({'q': 'id:%i' % id})
        self.interface.commit()
        return True

    def meta_search(self, limit, offset, mode, compact, sort, **fieldkeys):
        """Build a "field:value" query joined by the boolean operator in
        mode (e.g. "AND") and run it as a solrcloudpy search."""
        query = ''
        items = len(fieldkeys)
        count = 0
        for fieldkey in fieldkeys:
            query += " %s:%s " % (fieldkey, fieldkeys[fieldkey])
            count += 1
            if items > 1 and count < items:
                query += mode
        se = SearchOptions()
        se.commonparams.q(query).rows(limit).sort(sort).start(offset)
        response = self.interface.search(se)
        if compact:
            return [r['id'] for r in response.result['response'].docs]
        return response.result['response']
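# The callback signatures above (after_insert(fields, id),
# after_update(queryset, fields), before_delete/after_delete(queryset))
# match pyDAL/web2py table hooks, so the backend is presumably wired up
# as sketched below. The table and field names are hypothetical
# placeholders, not taken from the original project.
from pydal import DAL, Field

db = DAL("sqlite://storage.db")
db.define_table("report", Field("title"), Field("body", "text"))

backend = SolrBackend(db.report, core="collection1")
backend.indexes("title", "body")  # fields mirrored into Solr

# Keep Solr in sync with writes to the table.
db.report._after_insert.append(backend.after_insert)
db.report._after_update.append(backend.after_update)
db.report._before_delete.append(backend.before_delete)
db.report._after_delete.append(backend.after_delete)

# One-off full (re)index of the whole table.
backend.index_table(db.report.id > 0, db)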