Example #1
    def update(self, metaInfo, metaData):
        '''
        @see: ISearchProvider.update()
        '''

        si = SolrInterface('http://%s%s' % (self.solr_server_url, metaData.Type))

        document = dict()

        document["MetaInfoId"] = metaInfo.Id
        document["MetaDataId"] = metaData.Id
        document["languageId"] = metaInfo.Language

        # custom processing on some fields
        field = 'CreationDate'
        if hasattr(metaInfo, field) and getattr(metaInfo, field):
            document['CreationData_Year'] = getattr(metaInfo, field).year

        for field in si.schema.fields:
            if hasattr(metaInfo, field) and getattr(metaInfo, field):
                document[field] = getattr(metaInfo, field)
            elif hasattr(metaData, field) and getattr(metaData, field):
                document[field] = getattr(metaData, field)

        si.add(document)
        si.commit()
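The hasattr/getattr cascade above (metaInfo first, then metaData) is the heart of the mapping. A minimal helper sketch of the same idea; the helper name is illustrative, not from the source:

def first_attr(field, *objects):
    # return the first truthy value of `field` across the given objects,
    # mirroring the metaInfo-then-metaData precedence used in update()
    for obj in objects:
        value = getattr(obj, field, None)
        if value:
            return value
    return None

# usage sketch: document[field] = first_attr(field, metaInfo, metaData)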
Example #2
    def processQuery(self, session, scheme, qa=None, qi=None, qd=None):
        '''
        Creates the solr query based on received REST queries
        '''

        si = SolrInterface('http://%sother' % self.solr_server_url)
        types = list(self.queryIndexer.typesByMetaData.values())

        solrQuery = None
        orClauses = []

        if qa is not None:
            assert isinstance(qa, QMetaDataInfo), 'Invalid query %s' % qa
            solrQuery = buildSolrQuery(si, solrQuery, qa, orClauses)
            if QMetaDataInfo.type in qa: types = qa.type.values

        if qi is not None:
            solrQuery = buildSolrQuery(si, solrQuery, qi, orClauses)

        if qd is not None:
            solrQuery = buildSolrQuery(si, solrQuery, qd, orClauses)

        if orClauses:
            extend = None
            for clause in orClauses:
                if extend: extend = extend | clause
                else: extend = clause

            if solrQuery is None: solrQuery = si.query(extend)
            else: solrQuery = solrQuery.query(extend)

        if solrQuery is None: solrQuery = si.query()
        solrQuery = buildShards(solrQuery, self.solr_server_url, types)

        return solrQuery
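The or-clause accumulation above is a hand-rolled fold; a more compact sketch of the same logic, assuming orClauses holds sunburnt Q objects (reduce is a builtin in the Python 2 used throughout these examples):

import operator

if orClauses:
    # fold [q1, q2, q3] into q1 | q2 | q3
    extend = reduce(operator.or_, orClauses)
    solrQuery = si.query(extend) if solrQuery is None else solrQuery.query(extend)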
Example #3
 def run(self, solr_id):
     """ Run the synchronization, delete the record on SolR
     :param solr_id: identifier of the record to delete
     """
     si = SolrInterface(self.backend_record.location.encode('utf-8')) #TODO auth
     si.delete(solr_id)
     return _('Record %s deleted on Solr') % solr_id
Example #4
 def delete(self, id):
     '''
     @see: IArticleSearchProvider.delete()
     '''
     si = SolrInterface('http://%s%s' % (self.solr_server_url, 'article'))
     si.delete(str(id))
     si.commit()
Example #5
def get_test_solr():
    settings.SOLR_ENDPOINT = 'http://localhost:8983/solr/data_test'

    solr = SolrInterface(settings.SOLR_ENDPOINT) 
    solr.delete(queries='*:*', commit=True)

    return solr
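A hedged usage sketch for the helper above, e.g. in a unittest setUp (the test class is illustrative, not from the source):

import unittest

class TestDatasetIndexing(unittest.TestCase):
    def setUp(self):
        # each test method starts against an emptied test core
        self.solr = get_test_solr()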
Example #6
    def _do_search(self, sort_by=None):
        si = SolrInterface(self.solr_endpoint)

        searchquery = si.Q(*[si.Q(s) for s in self.searchable])
        query = si.query(searchquery).field_limit(score=True)

        realm_query = self._build_realm_filter(si)
        if realm_query:
            query = query.filter(realm_query)

        author_query = self._build_author_filter(si)
        if author_query:
            query = query.filter(author_query)

        trac_query = self._build_trac_filter(si)
        if trac_query:
            query = query.filter(trac_query)

        for field in sort_by or []:
            query = query.sort_by(field)

        query = query.paginate(start=self.page_start, rows=self.page_size)\
                            .highlight('oneline',
                                    **{'simple.pre':'<span class="highlight">',
                                       'snippets': 3,
                                       'fragsize': 600,
                                       'simple.post':'</span>'})

        # boosting - super hacky, but sunburnt does not support bq
        options = query.options()
        options['bq'] = ['realm:ticket^999','status:new^100', 'status:assigned^100',
                         'status:reopened^999', 'status:reviewing^100',
                         'status:accepted^100','(*:* -xxx)^999']
        result = query.interface.search(**options)
        return query.transform_result(result, dict)
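The raw-options escape hatch used above generalizes to any Solr parameter sunburnt does not model. A minimal standalone sketch, assuming a sunburnt SolrInterface si (query term and boost values illustrative):

query = si.query('budget').paginate(start=0, rows=10)
options = query.options()             # plain dict of Solr request params
options['bq'] = ['status:new^100']    # inject the unsupported boost query
raw_result = query.interface.search(**options)
results = query.transform_result(raw_result, dict)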
Example #7
 def delete(self, idMetaInfo, metaType):
     '''
     @see: ISearchProvider.delete()
     '''
     si = SolrInterface('http://%s%s' % (self.solr_server_url, metaType))
     si.delete(str(idMetaInfo))
     si.commit()
Example #8
 def __init__(self):
     dispatcher.connect(self.spider_opened, signals.spider_opened)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     self.si_eng = SolrInterface(self.search_engine)
     self.si_eng.init_schema()
     self.si_arc = SolrInterface(self.archive)
     self.si_arc.init_schema()
     self.old_deals = {}
Example #9
def _retrieve_address_from_history(original_source):
    server = endpoints.solr + '/dealschrome/geodata'
    solr = SolrInterface(server)
    res = solr.query(id=original_source).execute()
    if len(res):
        ll = str(res[0]['latlng'][0])+','+str(res[0]['latlng'][1])
        determined_source = res[0]['determined_source']
    else:
        ll = None
        determined_source = None
    return (ll,determined_source)
Example #10
def dataset_purge_data(dataset_id):
    """
    Purge a dataset from Solr.
    """
    log = logging.getLogger('redd.tasks.dataset_purge_data')
    log.info('Beginning purge, dataset_id: %i' % dataset_id)

    solr = SolrInterface(settings.SOLR_ENDPOINT)
    solr.delete(queries='dataset_id: %i' % dataset_id, commit=True)

    log.info('Finished purge, dataset_id: %i' % dataset_id)
Example #11
def make_connection():
    solr_url = config.get('adhocracy.solr.url', 'http://localhost:8983/solr/')
    solr_url = solr_url.strip()
    if not solr_url.endswith('/'):
        solr_url = solr_url + '/'
    http_connection = Http()
    return SolrInterface(solr_url, http_connection=http_connection)
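A hedged usage sketch for the helper above (the text field name is illustrative, not from the source):

si = make_connection()
response = si.query(text='budget').paginate(rows=10).execute()
for doc in response:
    print doc['id']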
Example #12
def get_sunburnt_connection():
    from pylons import config
    solr_url = config.get('adhocracy.solr.url', 'http://localhost:8983/solr/')
    solr_url = solr_url.strip()
    if not solr_url.endswith('/'):
        solr_url = solr_url + '/'
    http_connection = Http()

    return SolrInterface(solr_url, http_connection=http_connection, mode='r')
Example #13
def check_mlt_query(i, o, E):
    if E is None:
        query_params, method, body = o
    content, content_charset, url = i
    d = {}
    conn = SolrInterface("http://test.example.com/", http_connection=MLTMockConnection(d))
    if E is None:
        conn.mlt_query(content=content, content_charset=content_charset, url=url).execute()
        assert_equal(d['params'], query_params)
        assert_equal(d['method'], method)
        assert_equal(d['body'], body)
    else:
        try:
            conn.mlt_query(content=content, content_charset=content_charset, url=url).execute()
        except E:
            pass
        else:
            assert False
Example #14
File: index.py  Project: alkadis/vcv
def make_connection():
    solr_url = config.get_string('adhocracy.solr.url',
                                 'http://localhost:8983/solr/')
    solr_url = solr_url.strip()
    if not solr_url.endswith('/'):
        solr_url = solr_url + '/'
    kwargs = {}
    if config.get_bool('adhocracy.force_no_http_proxy'):
        kwargs['proxy_info'] = None
    http_connection = Http(**kwargs)
    return SolrInterface(solr_url, http_connection=http_connection)
Example #15
 def delete(self, id):
     '''
     @see: IArticleSearchProvider.delete()
     '''
     si = SolrInterface('http://%s%s' % (self.solr_server_url, 'article'))
     si.delete(str(id))
     si.commit()
Example #16
    def update(self, metaInfo, metaData):
        '''
        @see: ISearchProvider.update()
        '''

        si = SolrInterface('http://%s%s' %
                           (self.solr_server_url, metaData.Type))

        document = dict()

        document["MetaInfoId"] = metaInfo.Id
        document["MetaDataId"] = metaData.Id
        document["languageId"] = metaInfo.Language

        # custom processing on some fields
        field = 'CreationDate'
        if hasattr(metaInfo, field) and getattr(metaInfo, field):
            document['CreationData_Year'] = getattr(metaInfo, field).year

        for field in si.schema.fields:
            if hasattr(metaInfo, field) and getattr(metaInfo, field):
                document[field] = getattr(metaInfo, field)
            elif hasattr(metaData, field) and getattr(metaData, field):
                document[field] = getattr(metaData, field)

        si.add(document)
        si.commit()
Example #17
 def delete(self, idMetaInfo, metaType):
     '''
     @see: ISearchProvider.delete()
     '''
     si = SolrInterface('http://%s%s' % (self.solr_server_url, metaType))
     si.delete(str(idMetaInfo))
     si.commit()
Example #18
 def run(self, solr_id):
     """ Run the synchronization, delete the record on Solr
     :param solr_id: identifier of the record to delete
     """
     si = SolrInterface(self.backend_record.location.encode('utf-8')) #TODO auth
     si.delete(solr_id)
     si.commit()
     return _('Record %s deleted on Solr') % solr_id
Example #19
    def processQuery(self, session, scheme, qa=None, qi=None, qd=None):
        '''
        Creates the solr query based on received REST queries
        '''

        si = SolrInterface('http://%sother' % self.solr_server_url)
        types = list(self.queryIndexer.typesByMetaData.values())

        solrQuery = None
        orClauses = []

        if qa is not None:
            assert isinstance(qa, QMetaDataInfo), 'Invalid query %s' % qa
            solrQuery = buildSolrQuery(si, solrQuery, qa, orClauses)
            if QMetaDataInfo.type in qa: types = qa.type.values

        if qi is not None:
            solrQuery = buildSolrQuery(si, solrQuery, qi, orClauses)

        if qd is not None:
            solrQuery = buildSolrQuery(si, solrQuery, qd, orClauses)

        if orClauses:
            extend = None
            for clause in orClauses:
                if extend: extend = extend | clause
                else: extend = clause

            if solrQuery is None: solrQuery = si.query(extend)
            else: solrQuery = solrQuery.query(extend)

        if solrQuery is None: solrQuery = si.query()
        solrQuery = buildShards(solrQuery, self.solr_server_url, types)

        return solrQuery
Example #20
 def add(self, url, doc):
     si_item = self.__solr_pool.get(url)
     if not si_item:
         si_item = [SolrInterface(url), 0, []]
         self.__solr_pool[url] = si_item
     si_item[2].append(doc)
     si_item[1] += 1
     if si_item[1] % NUMBER_OF_DOCS_PER_ADD == 0:
         # NOTE: Solr itself will also auto-commit after some time
         si_item[0].add(si_item[2])
         si_item[2] = []
     if si_item[1] > NUMBER_OF_DOCS_PER_COMMIT:
         si_item[0].commit()
         si_item[1] = 0
     return _('Record exported with ID %s on Solr.') % doc['id']
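The pool above flushes only when the per-URL counters cross the thresholds, so documents below NUMBER_OF_DOCS_PER_ADD can sit buffered in memory. A minimal companion method (assumed, not from the source) that drains the pool, e.g. from the spider_closed handler:

 def flush_all(self):
     # push any buffered docs and commit every interface in the pool
     for url, si_item in self.__solr_pool.items():
         si, count, pending = si_item
         if pending:
             si.add(pending)
         si.commit()
         self.__solr_pool[url] = [si, 0, []]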
Example #21
def main():
	solr_url = "http://politicalframing.com:8983/solr/collection1"
	h = httplib2.Http(cache="/var/tmp/solr_cache")
	si = SolrInterface(url = solr_url, http_connection = h)

	# chamber = 'Senate'
	# print commit_solr()

	numFound = si.query(chamber='senate').paginate(rows=0, start=0).execute().result.numFound
	print "-----------------------"
	print "Number of Speeches about Topic X in senate " + str(numFound)
	for i in range(0, int(math.ceil(numFound / 10000.0))):
		current_speeches = si.query(chamber='senate').field_limit(["id"]).paginate(rows=10000, start=10000*i).execute().result.docs
		json_documents = []
		for j, speech in enumerate(current_speeches):
			partial_document = get_speaker_metadata(id=speech['id'], chamber='Senate')

			if partial_document:
				print speech['id'] + " queued to be ingested"
				json_documents.append(partial_document)

		if len(json_documents) > 1:
			json_doc_list_string, body = update_solr2(json_documents)
			print len(json_documents)
			print body
			print commit_solr()

	numFound = si.query(chamber='house').paginate(rows=0, start=0).execute().result.numFound
	print "-----------------------"
	print "Number of Speeches about Topic X in house " + str(numFound)
	for i in range(0, int(math.ceil(numFound / 10000.0))):
		current_speeches = si.query(chamber='house').field_limit(["id"]).paginate(rows=10000, start=10000*i).execute().result.docs
		json_documents = []
		for j, speech in enumerate(current_speeches):
			partial_document = get_speaker_metadata(id=speech['id'], chamber='House')

			if partial_document:
				print speech['id'] + " queued to be ingested"
				json_documents.append(partial_document)

		if len(json_documents) > 1:
			json_doc_list_string, body = update_solr2(json_documents)
			print len(json_documents)
			print body
			print commit_solr()
Example #22
def dataset_import_data(dataset_id):
    """
    Import a dataset into Solr.
    """
    from redd.models import Dataset

    log = logging.getLogger('redd.tasks.dataset_import_data')
    log.info('Beginning import, dataset_id: %i' % dataset_id)

    dataset = Dataset.objects.get(id=dataset_id)

    solr = SolrInterface(settings.SOLR_ENDPOINT)
    #solr_fields = []

    #for h, t in dataset.schema:
    #    if t == 'NoneType':
    #        solr_fields.append(None)
    #    else:
    #        solr_fields.append('%s_%s' % (h, t.__name__))
        
    reader = CSVKitReader(open(dataset.data_upload.get_path(), 'r'))
    reader.next()

    add_buffer = []
    normal_type_exceptions = []

    for i, row in enumerate(reader, start=1):
        data = {}

        typing="""for t, header, field, value in izip(normal_types, headers, solr_fields, row):
         try:
                value = normalize_column_type([value], normal_type=t)[1][0]
            except InvalidValueForTypeException:
                # Convert exception to row-specific error
                normal_type_exceptions.append(InferredNormalFalsifiedException(i, header, value, t))
                continue

            # No reason to send null fields to Solr (also sunburnt doesn't like them) 
            if value == None:
                continue

            if t in [unicode, bool, int, float]:
                if value == None:
                    continue

                data[field] = value
            elif t == datetime:
                data[field] = value.isoformat()
            elif t == date:
                pass
            elif t == time:
                pass
            else:
                # Note: if NoneType should never fall through to here 
                raise TypeError('Unexpected normal type: %s' % t.__name__)"""

        # If we've had a normal type exception, don't bother doing the rest of this
        if not normal_type_exceptions:
            data = {
                'id': uuid4(),
                'dataset_id': dataset.id,
                'row': i,
                'full_text': '\n'.join(row),
                'csv_data': json.dumps(row)
            }

            add_buffer.append(data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(add_buffer)
                add_buffer = []

    if add_buffer:
        solr.add(add_buffer)
        add_buffer = []
    
    if not normal_type_exceptions:
        solr.commit()
    else:
        # Rollback pending changes
        solr.delete(queries=solr.query(dataset_id=dataset.id))
        
        for e in normal_type_exceptions:
            print e 

    log.info('Finished import, dataset_id: %i' % dataset_id)
Example #23
 def __init__(self):
     self.solr_interface = SolrInterface(str(self.server_url))
Example #24
def main():
	solr_url = "http://politicalframing.com:8983/solr"
	h = httplib2.Http(cache="/var/tmp/solr_cache")
	si = SolrInterface(url = solr_url, http_connection = h)

	totalNumFound = si.query(**{"*":"*"}).exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").sort_by("speaker_raw").paginate(rows=0, start=0).execute().result.numFound
	print "Number of Speeches in Solr without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore " + str(totalNumFound)

	senateNumFound = si.query(chamber='Senate').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").sort_by("speaker_raw").paginate(rows=0, start=0).execute().result.numFound
	print "Number of Speeches in Senate without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore  " + str(senateNumFound)

	houseNumFound = si.query(chamber='House').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").sort_by("speaker_raw").paginate(rows=0, start=0).execute().result.numFound
	print "Number of Speeches in House without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore   " + str(houseNumFound)

	extensionsNumFound = si.query(chamber='Extensions').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").sort_by("speaker_raw").paginate(rows=0, start=0).execute().result.numFound
	print "Number of Speeches in Extensions without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore   " + str(extensionsNumFound)

	print "Sum: " + str(senateNumFound + houseNumFound + extensionsNumFound)

	print "-----------------------"
	print "-----------------------"


	numFound = si.query(chamber='Senate').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").sort_by("speaker_raw").paginate(rows=0, start=0).execute().result.numFound
	print "-----------------------"
	print "Number of Speeches in Senate without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore   " + str(numFound)
	for i in range(0, int(math.ceil(numFound / 100000.0))):
		current_speeches = si.query(chamber='Senate').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").field_limit(["id", "speaker_raw", "congress", "date"]).sort_by("speaker_raw").paginate(rows=100000, start=100000*i).execute().result.docs
		json_documents = []
		for j, speech in enumerate(current_speeches):
			partial_document = get_speaker_metadata(id=speech['id'], date=speech['date'], congress=speech['congress'], speaker=speech['speaker_raw'], chamber='Senate')

			print speech['id']
			if partial_document:
				print speech['speaker_raw'] + " queued to be ingested"
				json_documents.append(partial_document)

		if len(json_documents) > 1:
			json_doc_list_string, body = update_solr2(json_documents)
			print len(json_documents)
			print body
			print commit_solr()

	numFound = si.query(chamber='House').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").sort_by("speaker_raw").paginate(rows=0, start=0).execute().result.numFound
	print "-----------------------"
	print "Number of Speeches in House without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore   " + str(numFound)
	for i in range(0, int(math.ceil(numFound / 100000.0))):
		current_speeches = si.query(chamber='House').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").field_limit(["id", "speaker_raw", "congress", "date"]).sort_by("speaker_raw").paginate(rows=100000, start=100000*i).execute().result.docs
		json_documents = []
		for j, speech in enumerate(current_speeches):
			partial_document = get_speaker_metadata(id=speech['id'], date=speech['date'], congress=speech['congress'], speaker=speech['speaker_raw'], chamber='House')

			print speech['id']
			if partial_document:
				print speech['speaker_raw'] + " queued to be ingested"
				json_documents.append(partial_document)

		if len(json_documents) > 1:
			json_doc_list_string, body = update_solr2(json_documents)
			print len(json_documents)
			print body
			print commit_solr()

	numFound = si.query(chamber='Extensions').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").sort_by("speaker_raw").paginate(rows=0, start=0).execute().result.numFound
	print "-----------------------"
	print "Number of Speeches in Extensions without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore   " + str(numFound)
	for i in range(0, int(math.ceil(numFound / 100000.0))):
		current_speeches = si.query(chamber='Extensions').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").field_limit(["id", "speaker_raw", "congress", "date"]).sort_by("speaker_raw").paginate(rows=100000, start=100000*i).execute().result.docs
		json_documents = []
		for j, speech in enumerate(current_speeches):
			partial_document = get_speaker_metadata(id=speech['id'], date=speech['date'], congress=speech['congress'], speaker=speech['speaker_raw'], chamber='Extensions')

			print speech['id']
			if partial_document:
				print speech['speaker_raw'] + " queued to be ingested"
				json_documents.append(partial_document)

		if len(json_documents) > 1:
			json_doc_list_string, body = update_solr2(json_documents)
			print len(json_documents)
			print body
			print commit_solr()
Example #25
def search_tiles(request, collection_id=None, version=None, variant=None):
    results = None
    solr_metric = None
    subset_query = None
    query = None
    ed_class = None
    ed_level = None

    if request.GET:
        #if 'level' in request.GET:
        ed_level = request.GET.get('level')
        #if 'class' in request.GET:
        ed_class = request.GET.get('class')
        if 'q' in request.GET:
            query = remove_quotation_marks(request.GET.get('q'))
        if query:
            subset = ''
            if collection_id:
                subset = str(collection_id) + '/' + str(version) + '/' + str(variant).split('-')[0]
            page = request.GET.get('p')
            start_time = time.clock()

            setting = solr_switcher()

            if subset and not (ed_level and ed_class):
                subset_query = "%s*" % subset

            query_metric = QueryMetric(request)
            register_user_query(query_metric)
            request_metric = RequestMetric(request, query_hash=query_metric.get_md5())
            register_client_metric(request_metric)
            solr_metric = SolrMetric(request_hash=request_metric.get_md5(), query_hash=query_metric.get_md5())

            try:
                solr_interface = SolrInterface(setting.SOLR_MAIN_URL)
                solr_interface.conn.request_handler_name('search')
                results = solr_interface.query(solr_interface.Q('"' + query + '"'))
                if subset_query:
                    results = results.filter(collectionid=subset_query)
                if ed_level:
                    results = results.filter(collection_school_type_code=ed_level)
                if ed_class:
                    results = results.filter(collection_ep_class=ed_class)

                results = results.filter(published=True)
            except SolrError as e:
                solr_metric.solr_error = e

        if results is not None:
            pages = Paginator(results, PAGINATION_ROWS)

            try:
                results = pages.page(page)
            except PageNotAnInteger:
                results = pages.page(1)
            except EmptyPage:
                results = pages.page(pages.num_pages)
            results.num_pages = pages.num_pages
            results.total_count = pages._count
            results.processing_time = time.clock() - start_time
            if solr_metric:
                if results.number < pages.num_pages:
                    solr_metric.next_page = results.number + 1
                solr_metric.num_pages = pages.num_pages
                if results.number > 1:
                    solr_metric.prev_page = results.number - 1
                solr_metric.page = results.number
                solr_metric.total_count = pages._count
                solr_metric.processing_time = results.processing_time
            # get post and pre pages
            max_count = pages.num_pages if pages.num_pages < results.number + PAGES_OFFSET else results.number + PAGES_OFFSET
            results.post_pages = range(results.number + 1, max_count + 1)
            results.pre_pages = [x for x in range(results.number - PAGES_OFFSET, results.number) if x > 0]

            if solr_metric:
                solr_metric.request_time = time.clock() - start_time
                register_solr_metric(solr_metric)
    else:
        results = None

    return render(request, 'search_tiles.html', {'results': results,
                                                 'solr_metric': solr_metric,
                                                 'query': query,
                                                 'collection_id': collection_id,
                                                 'variant': variant,
                                                 'version': version,
                                                 'level': ed_level,
                                                 'class': ed_class,
                                                 'chosen_education_level': ed_level,
                                                 'chosen_level': ed_class})
Example #26
    def run(self, dataset_id, *args, **kwargs):
        """
        Execute import.
        """
        from redd.models import Dataset

        log = logging.getLogger('redd.tasks.DatasetImportTask')
        log.info('Beginning import, dataset_id: %i' % dataset_id)

        dataset = Dataset.objects.get(id=dataset_id)

        task_status = dataset.current_task
        task_status.status = 'STARTED' 
        task_status.start = datetime.now()
        task_status.message = 'Preparing to import'
        task_status.save()

        line_count = self._count_lines(dataset.data_upload.get_path())

        if self.is_aborted():
            task_status.status = 'ABORTED'
            task_status.end = datetime.now()
            task_status.message = 'Aborted during preparation'
            task_status.save()

            log.warning('Import aborted, dataset_id: %i' % dataset_id)

            return

        solr = SolrInterface(settings.SOLR_ENDPOINT)
            
        reader = CSVKitReader(open(dataset.data_upload.get_path(), 'r'))
        reader.next()

        add_buffer = []

        for i, row in enumerate(reader, start=1):
            data = {
                'id': uuid4(),
                'dataset_id': dataset.id,
                'row': i,
                'full_text': '\n'.join(row),
                'data': json.dumps(row)
            }

            add_buffer.append(data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(add_buffer)
                add_buffer = []

                task_status.message = '%.0f%% complete (estimated)' % floor(float(i) / float(line_count) * 100)
                task_status.save()

                if self.is_aborted():
                    task_status.status = 'ABORTED'
                    task_status.end = datetime.now()
                    task_status.message = 'Aborted after importing %.0f%% (estimated)' % floor(float(i) / float(line_count) * 100)
                    task_status.save()

                    log.warning('Import aborted, dataset_id: %i' % dataset_id)

                    return

        if add_buffer:
            solr.add(add_buffer)
            add_buffer = []
        
        solr.commit()

        dataset.row_count = i
        dataset.save()

        log.info('Finished import, dataset_id: %i' % dataset_id)
Example #27
def get_solr(url=None):
    """ return a `SolrInterface` instance using the `solr_url` setting """
    if url is None:
        url = get_settings()['kotti_solr.solr_url']
    return SolrInterface(url)
Example #28
def main():
    solr_url = "http://politicalframing.com:8983/solr"
    h = httplib2.Http(cache="/var/tmp/solr_cache")
    si = SolrInterface(url=solr_url, http_connection=h)

    totalNumFound = si.query(**{
        "*": "*"
    }).exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(
        speaker_raw="the presiding officer").exclude(
            speaker_raw="the vice president").exclude(
                speaker_raw="the speaker pro tempore").exclude(
                    speaker_raw="the acting president pro tempore").sort_by(
                        "speaker_raw").paginate(
                            rows=0, start=0).execute().result.numFound
    print "Number of Speeches in Solr without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore " + str(
        totalNumFound)

    senateNumFound = si.query(chamber='Senate').exclude(
        speaker_party="*").exclude(speaker_raw="recorder").exclude(
            speaker_raw="the presiding officer").exclude(
                speaker_raw="the vice president").exclude(
                    speaker_raw="the speaker pro tempore").exclude(
                        speaker_raw="the acting president pro tempore"
                    ).sort_by("speaker_raw").paginate(
                        rows=0, start=0).execute().result.numFound
    print "Number of Speeches in Senate without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore  " + str(
        senateNumFound)

    houseNumFound = si.query(chamber='House').exclude(
        speaker_party="*").exclude(speaker_raw="recorder").exclude(
            speaker_raw="the presiding officer").exclude(
                speaker_raw="the vice president").exclude(
                    speaker_raw="the speaker pro tempore").exclude(
                        speaker_raw="the acting president pro tempore"
                    ).sort_by("speaker_raw").paginate(
                        rows=0, start=0).execute().result.numFound
    print "Number of Speeches in House without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore   " + str(
        houseNumFound)

    extensionsNumFound = si.query(chamber='Extensions').exclude(
        speaker_party="*").exclude(speaker_raw="recorder").exclude(
            speaker_raw="the presiding officer").exclude(
                speaker_raw="the vice president").exclude(
                    speaker_raw="the speaker pro tempore").exclude(
                        speaker_raw="the acting president pro tempore"
                    ).sort_by("speaker_raw").paginate(
                        rows=0, start=0).execute().result.numFound
    print "Number of Speeches in Extensions without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore   " + str(
        extensionsNumFound)

    print "Sum: " + str(senateNumFound + houseNumFound + extensionsNumFound)

    print "-----------------------"
    print "-----------------------"

    numFound = si.query(chamber='Senate').exclude(speaker_party="*").exclude(
        speaker_raw="recorder").exclude(
            speaker_raw="the presiding officer").exclude(
                speaker_raw="the vice president").exclude(
                    speaker_raw="the speaker pro tempore").exclude(
                        speaker_raw="the acting president pro tempore"
                    ).sort_by("speaker_raw").paginate(
                        rows=0, start=0).execute().result.numFound
    print "-----------------------"
    print "Number of Speeches in Senate without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore   " + str(
        numFound)
    for i in range(0, int(math.ceil(numFound / 100000.0))):
        current_speeches = si.query(chamber='Senate').exclude(
            speaker_party="*").exclude(speaker_raw="recorder").exclude(
                speaker_raw="the presiding officer").exclude(
                    speaker_raw="the vice president").exclude(
                        speaker_raw="the speaker pro tempore").exclude(
                            speaker_raw="the acting president pro tempore"
                        ).field_limit(
                            ["id", "speaker_raw", "congress",
                             "date"]).sort_by("speaker_raw").paginate(
                                 rows=100000,
                                 start=100000 * i).execute().result.docs
        json_documents = []
        for j, speech in enumerate(current_speeches):
            partial_document = get_speaker_metadata(
                id=speech['id'],
                date=speech['date'],
                congress=speech['congress'],
                speaker=speech['speaker_raw'],
                chamber='Senate')

            print speech['id']
            if partial_document:
                print speech['speaker_raw'] + " queued to be ingested"
                json_documents.append(partial_document)

        if len(json_documents) > 1:
            json_doc_list_string, body = update_solr2(json_documents)
            print len(json_documents)
            print body
            print commit_solr()

    numFound = si.query(chamber='House').exclude(speaker_party="*").exclude(
        speaker_raw="recorder").exclude(
            speaker_raw="the presiding officer").exclude(
                speaker_raw="the vice president").exclude(
                    speaker_raw="the speaker pro tempore").exclude(
                        speaker_raw="the acting president pro tempore"
                    ).sort_by("speaker_raw").paginate(
                        rows=0, start=0).execute().result.numFound
    print "-----------------------"
    print "Number of Speeches in House without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore   " + str(
        numFound)
    for i in range(0, int(math.ceil(numFound / 100000.0))):
        current_speeches = si.query(chamber='House').exclude(
            speaker_party="*").exclude(speaker_raw="recorder").exclude(
                speaker_raw="the presiding officer").exclude(
                    speaker_raw="the vice president").exclude(
                        speaker_raw="the speaker pro tempore").exclude(
                            speaker_raw="the acting president pro tempore"
                        ).field_limit(
                            ["id", "speaker_raw", "congress",
                             "date"]).sort_by("speaker_raw").paginate(
                                 rows=100000,
                                 start=100000 * i).execute().result.docs
        json_documents = []
        for j, speech in enumerate(current_speeches):
            partial_document = get_speaker_metadata(
                id=speech['id'],
                date=speech['date'],
                congress=speech['congress'],
                speaker=speech['speaker_raw'],
                chamber='House')

            print speech['id']
            if partial_document:
                print speech['speaker_raw'] + " queued to be ingested"
                json_documents.append(partial_document)

        if len(json_documents) > 1:
            json_doc_list_string, body = update_solr2(json_documents)
            print len(json_documents)
            print body
            print commit_solr()

    numFound = si.query(chamber='Extensions').exclude(
        speaker_party="*").exclude(speaker_raw="recorder").exclude(
            speaker_raw="the presiding officer").exclude(
                speaker_raw="the vice president").exclude(
                    speaker_raw="the speaker pro tempore").exclude(
                        speaker_raw="the acting president pro tempore"
                    ).sort_by("speaker_raw").paginate(
                        rows=0, start=0).execute().result.numFound
    print "-----------------------"
    print "Number of Speeches in Extensions without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore   " + str(
        numFound)
    for i in range(0, int(math.ceil(numFound / 100000.0))):
        current_speeches = si.query(chamber='Extensions').exclude(
            speaker_party="*").exclude(speaker_raw="recorder").exclude(
                speaker_raw="the presiding officer").exclude(
                    speaker_raw="the vice president").exclude(
                        speaker_raw="the speaker pro tempore").exclude(
                            speaker_raw="the acting president pro tempore"
                        ).field_limit(
                            ["id", "speaker_raw", "congress",
                             "date"]).sort_by("speaker_raw").paginate(
                                 rows=100000,
                                 start=100000 * i).execute().result.docs
        json_documents = []
        for j, speech in enumerate(current_speeches):
            partial_document = get_speaker_metadata(
                id=speech['id'],
                date=speech['date'],
                congress=speech['congress'],
                speaker=speech['speaker_raw'],
                chamber='Extensions')

            print speech['id']
            if partial_document:
                print speech['speaker_raw'] + " queued to be ingested"
                json_documents.append(partial_document)

        if len(json_documents) > 1:
            json_doc_list_string, body = update_solr2(json_documents)
            print len(json_documents)
            print body
            print commit_solr()
Example #29
import os

from sunburnt import SolrInterface

si = SolrInterface("http://jingluo.dlib.vt.edu:8080/solr")

# This is where you put the event name 

# eventQuery = "Typhoon Haiyan"
eventQuery = "Connecticut School Shooting"

# This is where you put the downloaded files
root = 'CollectionSmall'

response = si.query(event="Connecticut").query(event="School").query(event="shooting").execute()

#response = si.query(event=eventQuery).execute()

tot = response.result.numFound
print tot
#response = si.query(event=eventQuery).field_limit(["content"]).paginate(0,tot).execute()

response = si.query(event=eventQuery).paginate(0,tot).execute()

docs = {}

print response.result.numFound

i = 1

directory = root + "/"
Example #30
import os

from sunburnt import SolrInterface

si = SolrInterface("http://jingluo.dlib.vt.edu:8080/solr")

# This is where you put the event name 

# eventQuery = "Typhoon Haiyan"
eventQuery = "Connecticut School Shooting"

# This is where you put the downloaded files
root = r'D:\Test\EventCollections\SmallCollections'

#response = si.query(event="Connecticut").query(event="School").query(event="shooting").execute()
response = si.query(text="west africa").execute()

#response = si.query(event=eventQuery).execute()

tot = response.result.numFound
print tot
#response = si.query(event=eventQuery).field_limit(["content"]).paginate(0,tot).execute()

response = si.query(event=eventQuery).paginate(0,tot).execute()

docs = {}

print response.result.numFound

i = 1
Example #31
class SolrBackend(Component):
    implements(ISearchBackend)

    UNIQUE_ID = "unique_id"

    HIGHLIGHTABLE_FIELDS = {
        "unique_id" : True,
        "id" : True,
        "type" : True,
        "product" : True,
        "milestone" : True,
        "author" : True,
        "component" : True,
        "status" : True,
        "resolution" : True,
        "keywords" : True,
        "summary" : True,
        "content" : True,
        "changes" : True,
        "owner" : True,
        "repository" : True,
        "revision" : True,
        "message" : True,
        "name" : True
        }

    server_url = Option(
            BHSEARCH_CONFIG_SECTION,
            'solr_server_url',
            doc="""Url of the server running Solr instance.""",
            doc_domain='bhsearch')

    def __init__(self):
        self.solr_interface = SolrInterface(str(self.server_url))

    def add_doc(self, doc, operation_context=None):
        self._reformat_doc(doc)
        doc[self.UNIQUE_ID] = self._create_unique_id(doc.get("product", ''),
                                                doc["type"], doc["id"])
        self.solr_interface.add(doc)
        self.solr_interface.commit()

    def delete_doc(self, product, doc_type, doc_id, operation_context=None):
        unique_id = self._create_unique_id(product, doc_type, doc_id)
        self.solr_interface.delete(unique_id)

    def optimize(self):
        self.solr_interface.optimize()

    def query(
            self, query, query_string, sort = None, fields = None,
            filter = None, facets = None, pagenum = 1, pagelen = 20,
            highlight = False, highlight_fields = None, context = None):

        if not query_string:
            query_string = "*.*"

        final_query_chain = self._create_query_chain(query, query_string)
        solr_query = self.solr_interface.query(final_query_chain)
        faceted_solr_query = solr_query.facet_by(facets)
        highlighted_solr_query = faceted_solr_query.highlight(
                                    self.HIGHLIGHTABLE_FIELDS)

        start = pagelen * (pagenum - 1)
        paginated_solr_query = highlighted_solr_query.paginate(
                            start=start, rows=pagelen)
        results = paginated_solr_query.execute()

        mlt, hexdigests = self.query_more_like_this(paginated_solr_query,
                                                    fields="type", mindf=1,
                                                    mintf=1)

        query_result = self._create_query_result(highlighted_solr_query,
                                                 results, fields, pagenum,
                                                 pagelen)
        return query_result, mlt, hexdigests

    def query_more_like_this(self, query_chain, **kwargs):
        mlt_results = query_chain.mlt(**kwargs).execute().more_like_these
        mlt_dict = {}
        hexdigests = {}

        for doc, results in mlt_results.iteritems():
            hexdigest = hashlib.md5(doc).hexdigest()
            hexdigests[doc] = hexdigest

            for mlt_doc in results.docs:
                if doc not in mlt_dict:
                    mlt_dict[doc] = [self._process_doc(mlt_doc)]
                else:
                    mlt_dict[doc].append(self._process_doc(mlt_doc))

        return mlt_dict, hexdigests

    def _process_doc(self, doc):
        ui_doc = dict(doc)

        if doc.get('product'):
            env = ProductEnvironment(self.env, doc['product'])
            product_href = ProductEnvironment.resolve_href(env, self.env)
            ui_doc["href"] = product_href(doc['type'], doc['id'])
        else:
            ui_doc["href"] = self.env.href(doc['type'], doc['id'])

        ui_doc['title'] = str(doc['type'] + ": " + doc['_stored_name']).title()

        return ui_doc

    def _create_query_result(
                        self, query, results, fields, pagenum, pagelen):
        total_num, total_page_count, page_num, offset = \
                    self._prepare_query_result_attributes(query, results,
                                                          pagenum, pagelen)

        query_results = QueryResult()
        query_results.hits = total_num
        query_results.total_page_count = total_page_count
        query_results.page_number = page_num
        query_results.offset = offset

        docs = []
        highlighting = []

        for retrieved_record in results:
            result_doc = self._process_record(fields, retrieved_record)
            docs.append(result_doc)

            result_highlights = dict(retrieved_record['solr_highlights'])
            highlighting.append(result_highlights)

        # assign once, after collecting all docs and highlights
        query_results.docs = docs
        query_results.highlighting = highlighting

        return query_results

    def _create_query_chain(self, query, query_string):
        matches = re.findall(r'[\w\*]+', query_string)
        tokens = set(matches)

        final_query_chain = None
        for token in tokens:
            token_query_chain = self._search_fields_for_token(token)
            if final_query_chain is None:
                final_query_chain = token_query_chain
            else:
                final_query_chain |= token_query_chain

        return final_query_chain

    def _process_record(self, fields, retrieved_record):
        result_doc = dict()
        if fields:
            for field in fields:
                if field in retrieved_record:
                    result_doc[field] = retrieved_record[field]
        else:
            for key, value in retrieved_record.items():
                result_doc[key] = value

        for key, value in result_doc.iteritems():
            result_doc[key] = self._from_whoosh_format(value)

        return result_doc

    def _from_whoosh_format(self, value):
        if isinstance(value, datetime):
            value = utc.localize(value)
        return value

    def _prepare_query_result_attributes(
                                    self, query, results, pagenum, pagelen):
        results_total_num = query.execute().result.numFound
        total_page_count = int(ceil(float(results_total_num) / pagelen))
        pagenum = min(total_page_count, pagenum)

        offset = (pagenum-1) * pagelen
        if (offset+pagelen) > results_total_num:
            pagelen = results_total_num - offset

        return results_total_num, total_page_count, pagenum, offset

    def is_index_outdated(self):
        return False

    def recreate_index(self):
        return True

    @contextmanager
    def start_operation(self):
        yield

    def _search_fields_for_token(self, token):
        q_chain = None
        field_boosts = DefaultQueryParser(self.env).field_boosts

        for field, boost in field_boosts.iteritems():
            if field != 'query_suggestion_basket' and field != 'relations':
                field_token_dict = {field: token}
                if q_chain is None:
                    q_chain = self.solr_interface.Q(**field_token_dict)**boost
                else:
                    q_chain |= self.solr_interface.Q(**field_token_dict)**boost

        return q_chain

    def _reformat_doc(self, doc):
        for key, value in doc.items():
            if key is None:
                del doc[None]
            elif value is None:
                del doc[key]
            elif isinstance(value, basestring) and value == "":
                del doc[key]
            else:
                doc[key] = self._to_whoosh_format(value)

    def _to_whoosh_format(self, value):
        if isinstance(value, basestring):
            value = unicode(value)
        elif isinstance(value, datetime):
            value = self._convert_date_to_tz_naive_utc(value)
        return value

    def _convert_date_to_tz_naive_utc(self, value):
        if value.tzinfo:
            utc_time = value.astimezone(utc)
            value = utc_time.replace(tzinfo=None)
        return value

    def _create_unique_id(self, product, doc_type, doc_id):
        if product:
            return u"%s:%s:%s" % (product, doc_type, doc_id)
        else:
            return u"%s:%s" % (doc_type, doc_id)
Example #32
# getCollectionsFromSolr20140919 revised to cover several cases and to report progress
#  VT CS4984, Computational Linguistics, by Xuan Zhang, Tarek Kanan, Edward Fox
import os

from sunburnt import SolrInterface

si = SolrInterface("http://jingluo.dlib.vt.edu:8080/solr")

# This is where you put the event name

eventQuery = "Texas_Fertilizer_Plant_Explosion"

#these are the query lists for Team A
eventQueryList = ["Texas_Fertilizer_Plant_Explosion", "Rain_at_Islip"]
# Commented out lines support the special handling when there are spaces in the event name.
# eventQuery = "Connecticut School Shooting"

# This is where you put the downloaded files
#root = 'D:\Test\EventCollections\SmallCollections'
# Or, for a Mac, use something like
#someone needs to change this part
root = '../Unit3/output'

# Create and execute a Solr query
words = eventQuery.split()
query = si.query(event=words[0])
for w in words[1:]:
    query = query.query(event=w)
response = query.execute()
# Or, for the case of spaces in the name:
#  response = si.query(event="Connecticut").query(event="School").query(event="shooting").execute()
Example #33
from sunburnt import SolrInterface
import sys

si = SolrInterface("http://nick.dlib.vt.edu:8080/solr")

eventQuery = sys.argv[1]

response = si.query(event=eventQuery).execute()
tot = response.result.numFound
response = si.query(event=eventQuery).field_limit(["content"]).paginate(0,tot).execute()
docs = {}
print response.result.numFound
i = 1
for res in response:
    f = open(str(i) + ".txt","w")
    f.write(res['content'].encode("utf-8"))
    f.close()
    i+=1
si.commit()
Example #34
def _add_address_to_history(geodata):
    server = endpoints.solr + '/dealschrome/geodata'
    solr = SolrInterface(server)
    solr.add(geodata)
    solr.commit()
Example #35
# getCollectionsFromSolr20140919 revised to cover several cases and to report progress
#  VT CS4984, Computational Linguistics, by Xuan Zhang, Tarek Kanan, Edward Fox
import os

from sunburnt import SolrInterface

si = SolrInterface("http://jingluo.dlib.vt.edu:8080/solr")

# This is where you put the event name

eventQuery = "Brazil_NightClub_Fire"
# Commented out lines support the special handling when there are spaces in the event name.
# eventQuery = "Connecticut School Shooting"

# This is where you put the downloaded files
#root = 'D:\Test\EventCollections\SmallCollections'
# Or, for a Mac, use something like
root = '/Users/mzamani/Documents/CS4984/Unit2/Brazil_NightClub_Fire'

# Create and execute a Solr query
words = eventQuery.split()
query = si.query(event=words[0])
for w in words[1:]:
    query = query.query(event=w)
response = query.execute()
# Or, for the case of spaces in the name:
#  response = si.query(event="Connecticut").query(event="School").query(event="shooting").execute()
tot = response.result.numFound

#print response.result.numFound
print tot, "documents found in collection [", eventQuery, "]\n"
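The word-by-word .query() chaining above ANDs the terms together; the same conjunction can also be built as a single Q expression (a sketch against the same si):

import operator

q = reduce(operator.and_, (si.Q(event=w) for w in eventQuery.split()))
response = si.query(q).execute()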
Example #36
import os

from sunburnt import SolrInterface

si = SolrInterface("http://jingluo.dlib.vt.edu:8080/solr")

# This is where you put the event name

# eventQuery = "Typhoon Haiyan"
eventQuery = "Connecticut School Shooting"

# This is where you put the downloaded files
root = r'D:\Test\EventCollections\SmallCollections'

#response = si.query(event="Connecticut").query(event="School").query(event="shooting").execute()
response = si.query(text="west africa").execute()

#response = si.query(event=eventQuery).execute()

tot = response.result.numFound
print tot
#response = si.query(event=eventQuery).field_limit(["content"]).paginate(0,tot).execute()

response = si.query(event=eventQuery).paginate(0, tot).execute()

docs = {}

print response.result.numFound

i = 1
Example #37
"""
This repo has a YAML file we might be able to use
https://github.com/unitedstates/congress-legislators

It seems the original parser has been improved (or maybe just migrated)
https://github.com/unitedstates/congressional-record/blob/master/congressionalrecord/fdsys/cr_parser.py

"""

import httplib2
from sunburnt import SolrInterface
from dateutil import parser
from datetime import datetime

solr_url = "http://politicalframing.com:8983/solr"  # "http://localhost:8983/solr/"
h = httplib2.Http(cache="/var/tmp/solr_cache")
si = SolrInterface(url=solr_url, http_connection=h)


def get_speeches(rows, start, dabool, **kwargs):
    query = {}
    neg_query = {}

    if kwargs.get('speech_id'): query['id'] = kwargs['speech_id']
    if kwargs.get('phrase'): query['speaking'] = kwargs['phrase']
    if kwargs.get('congress'): query['congress'] = kwargs['congress']

    kwargs['start_date'] = parser.parse(
        kwargs['start_date']) if kwargs.get('start_date') else datetime(
            1994, 1, 1)
    kwargs['end_date'] = parser.parse(
        kwargs['end_date']) if kwargs.get('end_date') else datetime.now()
Example #38
class ScoreSpider(CrawlSpider):
    name = "score"
    allowed_domains = ["matchendirect.fr"]
    start_urls = ["http://www.matchendirect.fr/hier/"]
    rules = [
        Rule(
            SgmlLinkExtractor(allow=(r"/live-score/[a-z0-9\-]+\.html$", r"/foot-score/[a-z0-9\-]+\.html$")),
            "parse_score",
        )
    ]

    # init solr instance
    def __init__(self, *args, **kwargs):
        super(ScoreSpider, self).__init__(*args, **kwargs)
        self.si = SolrInterface("http://*****:*****@class="tableau"][1]')
            rows = table.xpath("tr")
            for row in rows:
                # if match has started & is finished
                scoring = row.xpath('td[@class="lm4"]/a[not(span)]/text()').extract()
                isPlaying = row.xpath('td[@class="lm2_1"]').extract()
                if scoring and not isPlaying:
                    score = ScoreItem()
                    score["id"] = "http://www.matchendirect.fr" + row.xpath('td[@class="lm4"]/a/@href').extract().pop()
                    score["host"] = row.xpath('td[@class="lm3"]/a/text()').extract().pop()
                    score["visitor"] = row.xpath('td[@class="lm5"]/a/text()').extract().pop()

                    scoringArr = scoring.pop().split(" - ")
                    score["scorehost"] = int(scoringArr[0])
                    score["scorevisitor"] = int(scoringArr[1])
                    if score["scorehost"] > score["scorevisitor"]:
                        score["winner"] = score["host"]
                    elif score["scorehost"] < score["scorevisitor"]:
                        score["winner"] = score["visitor"]

                    leagueArr = league.xpath("a[1]/text()").extract().pop().split(" : ")
                    score["country"] = leagueArr[0]
                    score["league"] = leagueArr[1]

                    docs.append(dict(score))

        # index crawled games
        self.si.add(docs)
        self.si.commit()

    # called on followed urls
    # get game details (goal scorer & time)
    def parse_score(self, response):
        sel = Selector(response)
        # if match has started & is finished
        scorehost = sel.xpath('//div[@id="match_score"]/div[@class="col2"]/text()').extract().pop().strip()
        scorevisitor = sel.xpath('//div[@id="match_score"]/div[@class="col3"]/text()').extract().pop().strip()
        isPlaying = sel.xpath('//div[@id="match_entete_2"]/img').extract()

        if scorehost and scorevisitor and not isPlaying:
            score = ScoreItem()

            # get already indexed data
            solr_doc = self.si.query(id=response.url).execute()
            if list(solr_doc):
                doc = solr_doc[0]
            else:
                doc = {}
                score["id"] = response.url

            # get goals
            table = sel.xpath('//table[@class="tableau match_evenement"]')
            rows = table.xpath("tr")
            score["goalscorershost"], score["goalscorersvisitor"], score["goaltimeshost"], score["goaltimesvisitor"] = (
                [],
                [],
                [],
                [],
            )
            score["penaltytimeshost"], score["penaltytimesvisitor"], score["ogtimeshost"], score["ogtimesvisitor"] = (
                [],
                [],
                [],
                [],
            )
            for row in rows:
                tdgoalhost = row.xpath('td[@class="c1" and span[@class="ico_evenement1"]]')
                tdpenaltyhost = row.xpath('td[@class="c1" and span[@class="ico_evenement2"]]')
                tdowngoalhost = row.xpath('td[@class="c1" and span[@class="ico_evenement7"]]')
                tdgoalvisitor = row.xpath('td[@class="c3" and span[@class="ico_evenement1"]]')
                tdpenaltyvisitor = row.xpath('td[@class="c3" and span[@class="ico_evenement2"]]')
                tdowngoalvisitor = row.xpath('td[@class="c3" and span[@class="ico_evenement7"]]')
                tdgoalhost = tdgoalhost or tdpenaltyhost or tdowngoalhost
                tdgoalvisitor = tdgoalvisitor or tdpenaltyvisitor or tdowngoalvisitor
                if tdgoalhost:
                    time = tdgoalhost.xpath('following-sibling::td[@class="c2"][1]/text()').extract().pop().rstrip("'")
                    if tdpenaltyhost:
                        score["penaltytimeshost"].append(time)
                    elif tdowngoalhost:
                        score["ogtimeshost"].append(time)
                    score["goaltimeshost"].append(time)
                    score["goalscorershost"].append(tdgoalhost.xpath("a/text()").extract().pop())
                elif tdgoalvisitor:
                    time = (
                        tdgoalvisitor.xpath('preceding-sibling::td[@class="c2"][1]/text()').extract().pop().rstrip("'")
                    )
                    if tdpenaltyvisitor:
                        score["penaltytimesvisitor"].append(time)
                    elif tdowngoalvisitor:
                        score["ogtimesvisitor"].append(time)
                    score["goaltimesvisitor"].append(time)
                    score["goalscorersvisitor"].append(tdgoalvisitor.xpath("a/text()").extract().pop())

            # get time, referee & stadium
            matchinfos = sel.xpath('//table[@id="match_entete_1"]/tr/td[@class="info"]/text()').extract()
            matchinfos.pop()
            matchinfos = [x.lstrip("\n\t\r") for x in matchinfos]
            if u"Arbitre : - " in matchinfos:
                matchinfos.remove(u"Arbitre : - ")
            date = format_date(matchinfos[0])
            time = matchinfos[1].split(" ")[-1].replace("h", ":") + ":00"
            score["date"] = "%sT%sZ" % (date, time)
            if len(matchinfos) >= 3:
                score["stadium"] = matchinfos[2]
                if len(matchinfos) == 4:
                    score["referee"] = matchinfos[3].split(" : ")[1]

            # index all data
            doc = dict(doc.items() + dict(score).items())
            self.si.add(doc)
            self.si.commit()
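
# --- hedged sketch of the project-local format_date helper, which is not
# shown in the original. Given the "%sT%sZ" assembly above, it must map a
# French date line such as u"Samedi 12 avril 2014" to "2014-04-12". ---
FRENCH_MONTHS = {
    'janvier': 1, 'fevrier': 2, 'mars': 3, 'avril': 4, 'mai': 5, 'juin': 6,
    'juillet': 7, 'aout': 8, 'septembre': 9, 'octobre': 10,
    'novembre': 11, 'decembre': 12,
}

def format_date(text):
    # normalize accents, then read "<weekday> <day> <month> <year>"
    parts = text.lower().replace(u'\xe9', 'e').replace(u'\xfb', 'u').split()
    return '%04d-%02d-%02d' % (int(parts[-1]), FRENCH_MONTHS[parts[-2]], int(parts[-3]))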
Exemplo n.º 39
0
# imports this example needs (`endpoints` is a project-local module that
# holds the Solr base URL; it is not shown in the snippet)
import types
from time import time

from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from sunburnt import SolrInterface


class SolrPipeline(object):

    search_engine = endpoints.solr + '/dealschrome/search-engine'
    archive = endpoints.solr + '/dealschrome/archive'

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.si_eng = SolrInterface(self.search_engine)
        self.si_eng.init_schema()
        self.si_arc = SolrInterface(self.archive)
        self.si_arc.init_schema()
        self.old_deals = {}

    def spider_opened(self, spider):
        source = spider.allowed_domains[0]
        old_temp = self.get_old_deals(source)
        self.old_deals[spider] = {i['id']:i for i in old_temp}
        spider.old_deals = dict(self.old_deals[spider])

    def spider_closed(self, spider):
        source = spider.allowed_domains[0]
        old_deals = self.old_deals.pop(spider)
        
        if spider.crawled_items.items():
            for k, v in spider.crawled_items.items():
                # deals already indexed keep their original creation time
                if v['url'] in old_deals:
                    field_created = old_deals[v['url']]['created']
                    del old_deals[v['url']]
                else:
                    field_created = int(time())
                    
                data = {
                    'id' : v['url'],
                    'title' : v['title'],
                    'dealsource' : source,
                    'price' : str(v['price']),
                    'worth' : str(v['worth']),
                    'discount' : str(v['discount']),
                    'bought' : str(v['bought']),
                    'imgsrc' : v['imgsrc'],
                    'category' : v['category'],
                    'created' : field_created,
                    'expiry' : str(v['expiry']),
                    'merchant' : v['merchant'],
                    'address' : v['address'],
                    'description': v['description'],
                }
                if v['location']:
                    # only add location when location exists
                    data['location'] = v['location']
                
                # workaround for an upstream bug: category sometimes comes
                # back as a multi-valued list; keep only the first value
                if len(data['category']) > 1 and not isinstance(data['category'], types.StringTypes):
                    data['category'] = data['category'][0]
                            
                self.si_eng.add(data)
                self.si_arc.add(data)
            
            self.si_eng.commit()
            self.si_arc.commit()
            
            # deals that disappeared from the site are purged from the
            # search engine (the archive keeps everything)
            pending_delete = list(old_deals.itervalues())
            if pending_delete:
                self.si_eng.delete(pending_delete)
            
            self.si_eng.commit()
            self.si_arc.commit()
    
    def get_old_deals(self, source):
        old_deals = self.si_eng\
            .query(dealsource_raw=source)\
            .field_limit(['id','created','category_raw'],score=False)\
            .paginate(rows=900)\
            .execute()
        return old_deals
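
sunburnt's delete also accepts bare unique keys, so the purge in spider_closed could equivalently pass ids instead of whole documents; a hedged variant:

    # delete stale deals by unique key instead of by full document
    self.si_eng.delete([doc['id'] for doc in old_deals.itervalues()])
    self.si_eng.commit()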
        
Exemplo n.º 40
0
 def __init__(self, *args, **kwargs):
     super(ScoreSpider, self).__init__(*args, **kwargs)
     self.si = SolrInterface("http://localhost:8080/solr")