Example #1
 def delete(self, idMetaInfo, metaType):
     '''
     @see: ISearchProvider.delete()
     '''
     si = SolrInterface('http://%s%s' % (self.solr_server_url, metaType))
     si.delete(str(idMetaInfo))
     si.commit()
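The delete() providers above build the core URL by appending metaType directly to self.solr_server_url (so the configured value presumably already ends with a slash) and always follow the delete with an explicit commit. A minimal sketch, assuming a hypothetical core at http://localhost:8983/solr/metadata, of the add / delete / commit round trip they rely on:

from sunburnt import SolrInterface

# hypothetical core URL; the providers build it as solr_server_url + metaType
si = SolrInterface('http://localhost:8983/solr/metadata')

si.add({'id': '42', 'MetaInfoId': 42})  # index a document under unique id '42'
si.commit()

si.delete('42')   # delete it again by its unique id, passed as a string
si.commit()       # make the deletion visible to searches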
Example #3
    def update(self, metaInfo, metaData):
        '''
        @see: ISearchProvider.update()
        '''

        si = SolrInterface('http://%s%s' % (self.solr_server_url, metaData.Type))

        document = dict()

        document["MetaInfoId"] = metaInfo.Id
        document["MetaDataId"] = metaData.Id
        document["languageId"] = metaInfo.Language

        # custom processing on some fields
        field = 'CreationDate'
        if hasattr(metaInfo, field) and getattr(metaInfo, field):
            document['CreationData_Year'] = getattr(metaInfo, field).year

        for field in si.schema.fields:
            if hasattr(metaInfo, field) and getattr(metaInfo, field):
                document[field] = getattr(metaInfo, field)
            elif hasattr(metaData, field) and getattr(metaData, field):
                document[field] = getattr(metaData, field)

        si.add(document)
        si.commit()
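Example #3 flattens metaInfo/metaData into a plain dict, keeping only attributes that also appear in the Solr schema, then adds and commits the document in one call. A minimal sketch, with a hypothetical core URL and field values, of querying back what update() writes:

from sunburnt import SolrInterface

si = SolrInterface('http://localhost:8983/solr/image')  # hypothetical core URL

# fetch the first 20 documents for a language, limited to the two id fields
response = si.query(languageId=1) \
             .field_limit(['MetaInfoId', 'MetaDataId']) \
             .paginate(start=0, rows=20) \
             .execute()

for doc in response:
    print doc['MetaInfoId'], doc['MetaDataId']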
Example #4
 def delete(self, id):
     '''
     @see: IArticleSearchProvider.delete()
     '''
     si = SolrInterface('http://%s%s' % (self.solr_server_url, 'article'))
     si.delete(str(id))
     si.commit()
Example #7
 def run(self, solr_id):
     """ Run the synchronization, delete the record on Solr
     :param solr_id: identifier of the record to delete
     """
     si = SolrInterface(self.backend_record.location.encode('utf-8')) #TODO auth
     si.delete(solr_id)
     si.commit()
     return _('Record %s deleted on Solr') % solr_id
Example #8
from sunburnt import SolrInterface
import sys

si = SolrInterface("http://nick.dlib.vt.edu:8080/solr")

eventQuery = sys.argv[1]

# first query only to find how many documents match the event
response = si.query(event=eventQuery).execute()
tot = response.result.numFound
# second query fetches the content field of every matching document
response = si.query(event=eventQuery).field_limit(["content"]).paginate(0, tot).execute()
docs = {}
print response.result.numFound
# write each document's content to a numbered text file
i = 1
for res in response:
    f = open(str(i) + ".txt", "w")
    f.write(res['content'].encode("utf-8"))
    f.close()
    i += 1
si.commit()
Example #9
    def run(self, dataset_id, *args, **kwargs):
        """
        Execute import.
        """
        from redd.models import Dataset

        log = logging.getLogger('redd.tasks.DatasetImportTask')
        log.info('Beginning import, dataset_id: %i' % dataset_id)

        dataset = Dataset.objects.get(id=dataset_id)

        task_status = dataset.current_task
        task_status.status = 'STARTED' 
        task_status.start = datetime.now()
        task_status.message = 'Preparing to import'
        task_status.save()

        line_count = self._count_lines(dataset.data_upload.get_path())

        if self.is_aborted():
            task_status.status = 'ABORTED'
            task_status.end = datetime.now()
            task_status.message = 'Aborted during preparation'
            task_status.save()

            log.warning('Import aborted, dataset_id: %i' % dataset_id)

            return

        solr = SolrInterface(settings.SOLR_ENDPOINT)
            
        reader = CSVKitReader(open(dataset.data_upload.get_path(), 'r'))
        reader.next()

        add_buffer = []

        for i, row in enumerate(reader, start=1):
            data = {
                'id': uuid4(),
                'dataset_id': dataset.id,
                'row': i,
                'full_text': '\n'.join(row),
                'data': json.dumps(row)
            }

            add_buffer.append(data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(add_buffer)
                add_buffer = []

                task_status.message = '%.0f%% complete (estimated)' % floor(float(i) / float(line_count) * 100)
                task_status.save()

                if self.is_aborted():
                    task_status.status = 'ABORTED'
                    task_status.end = datetime.now()
                    task_status.message = 'Aborted after importing %.0f%% (estimated)' % floor(float(i) / float(line_count) * 100)
                    task_status.save()

                    log.warning('Import aborted, dataset_id: %i' % dataset_id)

                    return

        if add_buffer:
            solr.add(add_buffer)
            add_buffer = []
        
        solr.commit()

        dataset.row_count = i
        dataset.save()

        log.info('Finished import, dataset_id: %i' % dataset_id)
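Example #9 streams CSV rows into Solr in batches: rows accumulate in add_buffer, the buffer is pushed every SOLR_ADD_BUFFER_SIZE rows, the remainder is flushed after the loop, and a single commit happens at the end. A self-contained sketch of that buffering pattern, with a hypothetical endpoint, batch size and stand-in rows:

from sunburnt import SolrInterface

SOLR_ADD_BUFFER_SIZE = 500  # assumed batch size; Example #9 reads it from settings
solr = SolrInterface('http://localhost:8983/solr/data')  # hypothetical endpoint

rows = [['alpha', 'beta'], ['gamma', 'delta']]  # stand-in for the CSV reader

add_buffer = []
for i, row in enumerate(rows, start=1):
    add_buffer.append({'id': str(i), 'full_text': '\n'.join(row)})
    if i % SOLR_ADD_BUFFER_SIZE == 0:
        solr.add(add_buffer)   # one request per full batch
        add_buffer = []

if add_buffer:                 # flush whatever is left over
    solr.add(add_buffer)

solr.commit()                  # single commit once everything is added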
Example #10
def dataset_import_data(dataset_id):
    """
    Import a dataset into Solr.
    """
    from redd.models import Dataset

    log = logging.getLogger('redd.tasks.dataset_import_data')
    log.info('Beginning import, dataset_id: %i' % dataset_id)

    dataset = Dataset.objects.get(id=dataset_id)

    solr = SolrInterface(settings.SOLR_ENDPOINT)
    #solr_fields = []

    #for h, t in dataset.schema:
    #    if t == 'NoneType':
    #        solr_fields.append(None)
    #    else:
    #        solr_fields.append('%s_%s' % (h, t.__name__))
        
    reader = CSVKitReader(open(dataset.data_upload.get_path(), 'r'))
    reader.next()

    add_buffer = []
    normal_type_exceptions = []

    for i, row in enumerate(reader, start=1):
        data = {}

        typing="""for t, header, field, value in izip(normal_types, headers, solr_fields, row):
         try:
                value = normalize_column_type([value], normal_type=t)[1][0]
            except InvalidValueForTypeException:
                # Convert exception to row-specific error
                normal_type_exceptions.append(InferredNormalFalsifiedException(i, header, value, t))
                continue

            # No reason to send null fields to Solr (also sunburnt doesn't like them) 
            if value == None:
                continue

            if t in [unicode, bool, int, float]:
                if value == None:
                    continue

                data[field] = value
            elif t == datetime:
                data[field] = value.isoformat()
            elif t == date:
                pass
            elif t == time:
                pass
            else:
                # Note: NoneType should never fall through to here
                raise TypeError('Unexpected normal type: %s' % t.__name__)"""

        # If we've had a normal type exception, don't bother doing the rest of this
        if not normal_type_exceptions:
            data = {
                'id': uuid4(),
                'dataset_id': dataset.id,
                'row': i,
                'full_text': '\n'.join(row),
                'csv_data': json.dumps(row)
            }

            add_buffer.append(data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(add_buffer)
                add_buffer = []

    if add_buffer:
        solr.add(add_buffer)
        add_buffer = []
    
    if not normal_type_exceptions:
        solr.commit()
    else:
        # Rollback pending changes
        solr.delete(queries=solr.query(dataset_id=dataset.id))
        
        for e in normal_type_exceptions:
            print e 

    log.info('Finished import, dataset_id: %i' % dataset_id)
Example #11
def _add_address_to_history(geodata):
    server = endpoints.solr + '/dealschrome/geodata'
    solr = SolrInterface(server)
    solr.add(geodata)
    solr.commit()
Example #12
class SolrPipeline(object):

    search_engine = endpoints.solr + '/dealschrome/search-engine'
    archive = endpoints.solr + '/dealschrome/archive'

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.si_eng = SolrInterface(self.search_engine)
        self.si_eng.init_schema()
        self.si_arc = SolrInterface(self.archive)
        self.si_arc.init_schema()
        self.old_deals = {}

    def spider_opened(self, spider):
        source = spider.allowed_domains[0]
        old_temp = self.get_old_deals(source)
        self.old_deals[spider] = {i['id']:i for i in old_temp}
        spider.old_deals = dict(self.old_deals[spider])

    def spider_closed(self, spider):
        source = spider.allowed_domains[0]
        old_deals = self.old_deals.pop(spider)
        
        if spider.crawled_items.items():        
            for k,v in spider.crawled_items.items():
                if old_deals.has_key(v['url']):
                    field_created = old_deals[v['url']]['created']
                    del old_deals[v['url']]
                else:
                    field_created = int(time())
                    
                data = {
                    'id' : v['url'],
                    'title' : v['title'],
                    'dealsource' : source,
                    'price' : str(v['price']),
                    'worth' : str(v['worth']),
                    'discount' : str(v['discount']),
                    'bought' : str(v['bought']),
                    'imgsrc' : v['imgsrc'],
                    'category' : v['category'],
                    'created' : field_created,
                    'expiry' : str(v['expiry']),
                    'merchant' : v['merchant'],
                    'address' : v['address'],
                    'description': v['description'],
                }
                if v['location']:
                    # only add location when location exists
                    data['location'] = v['location']
                
                # it's a BUG: this code corrects a multi-valued category
                if len(data['category']) > 1 and not isinstance(data['category'], types.StringTypes):
                    data['category'] = data['category'][0]
                            
                self.si_eng.add(data)
                self.si_arc.add(data)
            
            self.si_eng.commit()
            self.si_arc.commit()
            
            pending_delete = [doc for doc in old_deals.itervalues()]
            if pending_delete:
                self.si_eng.delete(pending_delete)
            
            self.si_eng.commit()
            self.si_arc.commit()
    
    def get_old_deals(self, source):
        old_deals = self.si_eng\
            .query(dealsource_raw=source)\
            .field_limit(['id','created','category_raw'],score=False)\
            .paginate(rows=900)\
            .execute()
        return old_deals        
        
Example #13
class SolrBackend(Component):
    implements(ISearchBackend)

    UNIQUE_ID = "unique_id"

    HIGHLIGHTABLE_FIELDS = {
        "unique_id" : True,
        "id" : True,
        "type" : True,
        "product" : True,
        "milestone" : True,
        "author" : True,
        "component" : True,
        "status" : True,
        "resolution" : True,
        "keywords" : True,
        "summary" : True,
        "content" : True,
        "changes" : True,
        "owner" : True,
        "repository" : True,
        "revision" : True,
        "message" : True,
        "name" : True
        }

    server_url = Option(
            BHSEARCH_CONFIG_SECTION,
            'solr_server_url',
            doc="""Url of the server running Solr instance.""",
            doc_domain='bhsearch')

    def __init__(self):
        self.solr_interface = SolrInterface(str(self.server_url))

    def add_doc(self, doc, operation_context=None):
        self._reformat_doc(doc)
        doc[self.UNIQUE_ID] = self._create_unique_id(doc.get("product", ''),
                                                doc["type"], doc["id"])
        self.solr_interface.add(doc)
        self.solr_interface.commit()

    def delete_doc(self, product, doc_type, doc_id, operation_context=None):
        unique_id = self._create_unique_id(product, doc_type, doc_id)
        self.solr_interface.delete(unique_id)

    def optimize(self):
        self.solr_interface.optimize()

    def query(
            self, query, query_string, sort = None, fields = None,
            filter = None, facets = None, pagenum = 1, pagelen = 20,
            highlight = False, highlight_fields = None, context = None):

        if not query_string:
            query_string = "*.*"

        final_query_chain = self._create_query_chain(query, query_string)
        solr_query = self.solr_interface.query(final_query_chain)
        faceted_solr_query = solr_query.facet_by(facets)
        highlighted_solr_query = faceted_solr_query.highlight(
                                    self.HIGHLIGHTABLE_FIELDS)

        start = 0 if pagenum == 1 else pagelen * (pagenum - 1)
        paginated_solr_query = highlighted_solr_query.paginate(
                            start=start, rows=pagelen)
        results = paginated_solr_query.execute()

        mlt, hexdigests = self.query_more_like_this(paginated_solr_query,
                                                    fields="type", mindf=1,
                                                    mintf=1)

        query_result = self._create_query_result(highlighted_solr_query,
                                                 results, fields, pagenum,
                                                 pagelen)
        return query_result, mlt, hexdigests

    def query_more_like_this(self, query_chain, **kwargs):
        mlt_results = query_chain.mlt(**kwargs).execute().more_like_these
        mlt_dict = {}
        hexdigests = {}

        for doc, results in mlt_results.iteritems():
            hexdigest = hashlib.md5(doc).hexdigest()
            hexdigests[doc] = hexdigest

            for mlt_doc in results.docs:
                if doc not in mlt_dict:
                    mlt_dict[doc] = [self._process_doc(mlt_doc)]
                else:
                    mlt_dict[doc].append(self._process_doc(mlt_doc))

        return mlt_dict, hexdigests

    def _process_doc(self, doc):
        ui_doc = dict(doc)

        if doc.get('product'):
            env = ProductEnvironment(self.env, doc['product'])
            product_href = ProductEnvironment.resolve_href(env, self.env)
            ui_doc["href"] = product_href(doc['type'], doc['id'])
        else:
            ui_doc["href"] = self.env.href(doc['type'], doc['id'])

        ui_doc['title'] = str(doc['type'] + ": " + doc['_stored_name']).title()

        return ui_doc

    def _create_query_result(
                        self, query, results, fields, pagenum, pagelen):
        total_num, total_page_count, page_num, offset = \
                    self._prepare_query_result_attributes(query, results,
                                                          pagenum, pagelen)

        query_results = QueryResult()
        query_results.hits = total_num
        query_results.total_page_count = total_page_count
        query_results.page_number = page_num
        query_results.offset = offset

        docs = []
        highlighting = []

        for retrieved_record in results:
            result_doc = self._process_record(fields, retrieved_record)
            docs.append(result_doc)

            result_highlights = dict(retrieved_record['solr_highlights'])
            highlighting.append(result_highlights)

        query_results.docs = docs
        query_results.highlighting = highlighting

        return query_results

    def _create_query_chain(self, query, query_string):
        matches = re.findall(re.compile(r'([\w\*]+)'), query_string)
        tokens = set([match for match in matches])

        final_query_chain = None
        for token in tokens:
            token_query_chain = self._search_fields_for_token(token)
            if final_query_chain is None:
                final_query_chain = token_query_chain
            else:
                final_query_chain |= token_query_chain

        return final_query_chain

    def _process_record(self, fields, retrieved_record):
        result_doc = dict()
        if fields:
            for field in fields:
                if field in retrieved_record:
                    result_doc[field] = retrieved_record[field]
        else:
            for key, value in retrieved_record.items():
                result_doc[key] = value

        for key, value in result_doc.iteritems():
            result_doc[key] = self._from_whoosh_format(value)

        return result_doc

    def _from_whoosh_format(self, value):
        if isinstance(value, datetime):
            value = utc.localize(value)
        return value

    def _prepare_query_result_attributes(
                                    self, query, results, pagenum, pagelen):
        results_total_num = query.execute().result.numFound
        total_page_count = int(ceil(float(results_total_num) / pagelen))
        pagenum = min(total_page_count, pagenum)

        offset = (pagenum-1) * pagelen
        if (offset+pagelen) > results_total_num:
            pagelen = results_total_num - offset

        return results_total_num, total_page_count, pagenum, offset

    def is_index_outdated(self):
        return False

    def recreate_index(self):
        return True

    @contextmanager
    def start_operation(self):
        yield

    def _search_fields_for_token(self, token):
        q_chain = None
        field_boosts = DefaultQueryParser(self.env).field_boosts

        for field, boost in field_boosts.iteritems():
            if field != 'query_suggestion_basket' and field != 'relations':
                field_token_dict = {field: token}
                if q_chain is None:
                    q_chain = self.solr_interface.Q(**field_token_dict)**boost
                else:
                    q_chain |= self.solr_interface.Q(**field_token_dict)**boost

        return q_chain

    def _reformat_doc(self, doc):
        for key, value in doc.items():
            if key is None:
                del doc[None]
            elif value is None:
                del doc[key]
            elif isinstance(value, basestring) and value == "":
                del doc[key]
            else:
                doc[key] = self._to_whoosh_format(value)

    def _to_whoosh_format(self, value):
        if isinstance(value, basestring):
            value = unicode(value)
        elif isinstance(value, datetime):
            value = self._convert_date_to_tz_naive_utc(value)
        return value

    def _convert_date_to_tz_naive_utc(self, value):
        if value.tzinfo:
            utc_time = value.astimezone(utc)
            value = utc_time.replace(tzinfo=None)
        return value

    def _create_unique_id(self, product, doc_type, doc_id):
        if product:
            return u"%s:%s:%s" % (product, doc_type, doc_id)
        else:
            return u"%s:%s" % (doc_type, doc_id)
Example #14
class ScoreSpider(CrawlSpider):
    name = "score"
    allowed_domains = ["matchendirect.fr"]
    start_urls = ["http://www.matchendirect.fr/hier/"]
    rules = [
        Rule(
            SgmlLinkExtractor(allow=(r"/live-score/[a-z0-9\-]+\.html$", r"/foot-score/[a-z0-9\-]+\.html$")),
            "parse_score",
        )
    ]

    # init solr instance
    def __init__(self, *args, **kwargs):
        super(ScoreSpider, self).__init__(*args, **kwargs)
        self.si = SolrInterface("http://*****:*****@class="tableau"][1]')
            rows = table.xpath("tr")
            for row in rows:
                # if match has started & is finished
                scoring = row.xpath('td[@class="lm4"]/a[not(span)]/text()').extract()
                isPlaying = row.xpath('td[@class="lm2_1"]').extract()
                if scoring and not isPlaying:
                    score = ScoreItem()
                    score["id"] = "http://www.matchendirect.fr" + row.xpath('td[@class="lm4"]/a/@href').extract().pop()
                    score["host"] = row.xpath('td[@class="lm3"]/a/text()').extract().pop()
                    score["visitor"] = row.xpath('td[@class="lm5"]/a/text()').extract().pop()

                    scoringArr = scoring.pop().split(" - ")
                    score["scorehost"] = int(scoringArr[0])
                    score["scorevisitor"] = int(scoringArr[1])
                    if score["scorehost"] > score["scorevisitor"]:
                        score["winner"] = score["host"]
                    elif score["scorehost"] < score["scorevisitor"]:
                        score["winner"] = score["visitor"]

                    leagueArr = league.xpath("a[1]/text()").extract().pop().split(" : ")
                    score["country"] = leagueArr[0]
                    score["league"] = leagueArr[1]

                    docs.append(dict(score))

        # index crawled games
        self.si.add(docs)
        self.si.commit()

    # called on followed urls
    # get game details (goal scorer & time)
    def parse_score(self, response):
        sel = Selector(response)
        # if match has started & is finished
        scorehost = sel.xpath('//div[@id="match_score"]/div[@class="col2"]/text()').extract().pop().strip()
        scorevisitor = sel.xpath('//div[@id="match_score"]/div[@class="col3"]/text()').extract().pop().strip()
        isPlaying = sel.xpath('//div[@id="match_entete_2"]/img').extract()

        if scorehost and scorevisitor and not isPlaying:
            score = ScoreItem()

            # get already indexed data
            solr_doc = self.si.query(id=response.url).execute()
            if list(solr_doc):
                doc = solr_doc[0]
            else:
                doc = {}
                score["id"] = response.url

            # get goals
            table = sel.xpath('//table[@class="tableau match_evenement"]')
            rows = table.xpath("tr")
            score["goalscorershost"], score["goalscorersvisitor"], score["goaltimeshost"], score["goaltimesvisitor"] = (
                [],
                [],
                [],
                [],
            )
            score["penaltytimeshost"], score["penaltytimesvisitor"], score["ogtimeshost"], score["ogtimesvisitor"] = (
                [],
                [],
                [],
                [],
            )
            for row in rows:
                tdgoalhost = row.xpath('td[@class="c1" and span[@class="ico_evenement1"]]')
                tdpenaltyhost = row.xpath('td[@class="c1" and span[@class="ico_evenement2"]]')
                tdowngoalhost = row.xpath('td[@class="c1" and span[@class="ico_evenement7"]]')
                tdgoalvisitor = row.xpath('td[@class="c3" and span[@class="ico_evenement1"]]')
                tdpenaltyvisitor = row.xpath('td[@class="c3" and span[@class="ico_evenement2"]]')
                tdowngoalvisitor = row.xpath('td[@class="c3" and span[@class="ico_evenement7"]]')
                tdgoalhost = tdgoalhost or tdpenaltyhost or tdowngoalhost
                tdgoalvisitor = tdgoalvisitor or tdpenaltyvisitor or tdowngoalvisitor
                if tdgoalhost:
                    time = tdgoalhost.xpath('following-sibling::td[@class="c2"][1]/text()').extract().pop().rstrip("'")
                    if tdpenaltyhost:
                        score["penaltytimeshost"].append(time)
                    elif tdowngoalhost:
                        score["ogtimeshost"].append(time)
                    score["goaltimeshost"].append(time)
                    score["goalscorershost"].append(tdgoalhost.xpath("a/text()").extract().pop())
                elif tdgoalvisitor:
                    time = (
                        tdgoalvisitor.xpath('preceding-sibling::td[@class="c2"][1]/text()').extract().pop().rstrip("'")
                    )
                    if tdpenaltyvisitor:
                        score["penaltytimesvisitor"].append(time)
                    elif tdowngoalvisitor:
                        score["ogtimesvisitor"].append(time)
                    score["goaltimesvisitor"].append(time)
                    score["goalscorersvisitor"].append(tdgoalvisitor.xpath("a/text()").extract().pop())

            # get time, referee & stadium
            matchinfos = sel.xpath('//table[@id="match_entete_1"]/tr/td[@class="info"]/text()').extract()
            matchinfos.pop()
            matchinfos = [x.lstrip("\n\t\r") for x in matchinfos]
            if u"Arbitre : - " in matchinfos:
                matchinfos.remove(u"Arbitre : - ")
            date = format_date(matchinfos[0])
            time = matchinfos[1].split(" ")[-1].replace("h", ":") + ":00"
            score["date"] = "%sT%sZ" % (date, time)
            if len(matchinfos) >= 3:
                score["stadium"] = matchinfos[2]
                if len(matchinfos) == 4:
                    score["referee"] = matchinfos[3].split(" : ")[1]

            # index all data
            doc = dict(doc.items() + dict(score).items())
            self.si.add(doc)
            self.si.commit()