Example #1
import datetime

import solr  # assumed: the solrpy package, which provides solr.core.utc_to_string
from dateutil import tz
from mysolr import Solr


class LogIndexer(object):
    '''Indexes log records into a Solr instance.'''

    def __init__(self, solrAddr):
        '''Connects to the Solr instance at solrAddr.'''
        self.solr = Solr(solrAddr)

    def index(self, data):
        # Convert datetime fields to the string representation Solr expects.
        for key, value in data.items():
            if isinstance(value, datetime.datetime):
                try:
                    value = solr.core.utc_to_string(value)
                except Exception:
                    # Conversion fails for naive datetimes: attach the
                    # Europe/Paris timezone and retry.
                    paris = tz.gettz('Europe/Paris')
                    value = value.replace(tzinfo=paris)
                    value = solr.core.utc_to_string(value)
                data[key] = value

        try:
            self.solr.update([data])
        except Exception as e:
            print "Error in index request:", e
        self.solr.commit()
        print "data indexed"
Example #2
from mysolr import Solr


class SolrUtils:
    """Thin wrapper around mysolr.Solr that defers commits to the caller."""

    def __init__(self, url):
        self.url = url
        self.conn = Solr(url)

    def addJSONDoc(self, doc):
        # Send the document as JSON without committing yet.
        self.conn.update(doc, 'json', commit=False)

    def commit(self):
        self.conn.commit()
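A sketch of the intended call pattern (core URL hypothetical): queue several documents, then make them all visible with a single commit.

utils = SolrUtils('http://localhost:8983/solr/mycore')  # hypothetical core
utils.addJSONDoc([{'id': 1, 'title': 'first'}])   # buffered, not yet searchable
utils.addJSONDoc([{'id': 2, 'title': 'second'}])
utils.commit()                                    # both become visible here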
Example #4
 def post_to_solr( self, solr_dict ):
     """ Posts solr_dict to solr. """
     SOLR_ROOT_URL = os.environ.get('BELL_I_SOLR_ROOT')
     solr = Solr( SOLR_ROOT_URL )
     response = solr.update( [solr_dict], 'xml', commit=True )  # 'xml' param converts default json to xml for post; required for our old version of solr
     response_status = response.status
     self.logger.info( 'in tasks.indexer.post_to_solr() [for custom-solr]; accession_number, %s; response_status, %s' % (solr_dict['accession_number_original'], response_status) )
     if response_status != 200:
         raise Exception( 'custom-solr post problem logged' )
     return response_status
Example #5
 def post_to_solr( self, solr_dict ):
     """ Posts solr_dict to solr.
         Called by update_custom_index_entry() """
     solr = Solr( self.CUSTOM_INDEX_SOLR_URL_ROOT )
     response = solr.update( [solr_dict], 'xml', commit=True )  # 'xml' param converts default json to xml for post; required for our old version of solr
     response_status = response.status
     self.logger.info( 'in tasks.indexer.CustomIndexUpdater.post_to_solr() [for custom-solr]; accession_number, %s; response_status, %s' % (solr_dict['accession_number_original'], response_status) )
     if response_status != 200:
         raise Exception( 'custom-solr post problem logged' )
     return response_status
Example #6
    # run() method of a luigi-style task: self.input() is the upstream target.
    # Needs `import json`, `import pandas as pd` and `from mysolr import Solr`.
    def run(self):
        df = pd.read_csv(self.input().open('r'), sep='\t')
        df['id'] = df['url']  # Solr requires a unique 'id' field

        solr = Solr('SOLR_HOST')  # placeholder for the real Solr core URL

        # Index 10 docs at a time
        start = 0
        increment = 10
        while len(df[start:start + increment]) > 0:
            sliced = df[start:start + increment]
            docs = []
            for index, row in sliced.iterrows():
                doc = json.loads(row.to_json())
                docs.append(doc)

            solr.update(docs, 'json')  # mysolr commits by default after each update
            if start % 1000 == 0:
                # Progress indicator
                print start
            start += increment
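If committing after every batch proves costly, the same loop can buffer updates and commit once at the end (a sketch, same placeholder host):

# Sketch: identical chunking, but defer the commit to the very end.
start = 0
increment = 10
while len(df[start:start + increment]) > 0:
    sliced = df[start:start + increment]
    docs = [json.loads(row.to_json()) for _, row in sliced.iterrows()]
    solr.update(docs, 'json', commit=False)  # buffered, no per-batch commit
    start += increment
solr.commit()  # one commit makes all documents visible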
Example #7
import requests
from mysolr import Solr


def atomicUpdate(chunkFile, solrURL):
    # Reuse a single HTTP session for every request to Solr.
    session = requests.Session()
    solr = Solr(solrURL, make_request=session, version=4)

    bufferDocs = []

    with open(chunkFile, 'r') as inF:
        for docID in inF:
            docID = docID.strip()

            # Atomic update: only the dataSource_s_md field is replaced.
            delta_update = {"id": docID,
                            "dataSource_s_md": {"set": "ice"}}  # Caution: change this value

            bufferDocs.append(delta_update)

    response = solr.update(bufferDocs, commit=True)

    status = response.raw_content['responseHeader']['status']
    if status != 0:
        print "Solr commit failed, error status code:", status
    else:
        print "Solr commit was a success"
Example #8
import requests
from mysolr import Solr

# Fallback row limit used when the caller does not pass an explicit rows value.
_MAXROWS = 10000


class eBsolr:
    cursor = None

    def __init__(self, urls, config, version=4):
        self.cursor = Solr(urls, version=version)

    def update(self, documents, input_type='json', commit=False):
        self.cursor.update(documents, input_type, commit)

    def deleteById(self, tid, commit=False):
        return self.cursor.delete_by_key(tid, commit=commit)

    def deleteByQuery(self, query, commit=False):
        return self.cursor.delete_by_query(query=query, commit=commit)

    def deleteAll(self, commit=False):
        return self.cursor.delete_by_query("*:*", commit=commit)

    def getResponse(self, search, fields=None, start=0, rows=None, sort=None, fq=None):
        query = {'q': search}
        if fields:
            if isinstance(fields, basestring):
                query['fl'] = fields
            else:
                query['fl'] = ",".join(fields)
        if sort:
            query['sort'] = sort

        if fq:
            query['fq'] = fq

        # Default to 10000 rows
        limit = rows
        if rows is None:
            limit = _MAXROWS
        query['start'] = start
        query['rows'] = limit

        response = self.cursor.search(**query)
        if int(response.status) >= 400:
            raise Exception('Error Solr {}: {}'.format(response.status, response.extract_errmessage()))
        if rows is None and response.total_results > limit:
            # More results than the fallback limit: refetch with the full count.
            query['rows'] = response.total_results
            response = self.cursor.search(**query)

        return response

    def get_language_query(self, language):
        q_temp = None
        if language is not None and language != "":
            langArray = language.split(';')
            if len(langArray) > 0:
                lang = langArray[0]
                q_temp = "language:%s" % lang
                for lang in langArray[1:]:
                    q_temp = "%s OR language:%s" % (q_temp, lang)
        return q_temp

    def getDocs(self, search, fields=None, start=0, rows=None, sort=None, fq=None):
        """search: query sintaks ex: "field:keys,field2:keys2"
           fields: field yg di ambil (list) ex: ['field', 'field2']
           start: start row
           rows: max / limit row
           sort: order rows ex: field asc, field2 desc"""
        # Get documents
        response = self.getResponse(search, fields, start, rows, sort, fq)

        return {"docs": response.documents, "count": response.total_results}

    def getFacetList(self, facets, facetField):
        ff = {}
        if not isinstance(facetField, list):
            facetField = facetField.split(",")
        for facet in facetField:
            if facet:
                ff[facet] = facets['facet_fields'][facet]

        return ff

    def getFacetPivotGeneral(self, query, facetField, pivotField, limit=None, fq=None):
        try:
            # Query the select handler directly for the facet pivot counts.
            url = '{}select'.format(self.cursor.base_url)
            params = {'q': query,
                      'rows': 0,
                      'wt': 'json',
                      'indent': 'true',
                      'facet': 'true',
                      'facet.pivot': '{},{}'.format(facetField, pivotField)}

            if limit:
                params['facet.limit'] = limit
            if fq:
                params['fq'] = fq
            http_response = requests.get(url, params=params)

            return http_response.json()['facet_counts']['facet_pivot']['{0},{1}'.format(facetField, pivotField)]
        except Exception as e:
            print("Error parsing facet pivot...")
            print(e)
        return None
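A hypothetical usage sketch for the wrapper above (URL, config and field names invented):

# Hypothetical core URL and schema fields.
eb = eBsolr('http://localhost:8983/solr/items', config=None)
result = eb.getDocs('title:solr', fields=['id', 'title'], rows=20, sort='id asc')
print(result['count'])
eb.update([{'id': 'x1', 'title': 'a new doc'}], commit=True)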
Example #9
from mysolr import Solr

# Connect to the Solr core at localhost:8983
solr = Solr("http://localhost:8983/solr/barcore")

# All solr params are supported!
query = {'q' : '*:*', 'facet' : 'true', 'facet.field' : 'zip'}
response = solr.search(**query)

# do stuff with documents
for document in response.documents:
    # modify the 'rating' field
    document['rating'] = 2.0

# update index with modified documents
solr.update(response.documents, commit=True)
Example #10
import unittest

from mysolr import Solr


class QueryResultTestCase(unittest.TestCase):
    def setUp(self):
        self.solr = Solr('http://localhost:8983/solr')

    def test_search(self):
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, 4)
        self.assertEqual(len(response.documents), 4)

    def test_search_cursor(self):
        cursor = self.solr.search_cursor(q='*:*')
        i = 0
        for response in cursor.fetch(1):
            self.assertEqual(response.status, 200)
            i += 1
        self.assertEqual(i, 4)

        cursor = self.solr.search_cursor(q='*:*')
        i = 0
        for response in cursor.fetch(4):
            self.assertEqual(response.status, 200)
            i += 1
        self.assertEqual(i, 1)

    def test_commit(self):
        response = self.solr.commit()
        self.assertEqual(response.status, 200)

    def test_optimize(self):
        response = self.solr.optimize()
        self.assertEqual(response.status, 200)

    def test_ping(self):
        response = self.solr.ping()
        self.assertEqual(response.status, 200)

    def test_is_up(self):
        response = self.solr.is_up()
        self.assertEqual(response, True)

    def test_update_delete(self):
        # Get total results
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        total_results = response.total_results
        # Post one document using json
        documents = [{'id': 1}]
        response = self.solr.update(documents, input_type='json')
        self.assertEqual(response.status, 200)
        # Post another document using xml
        documents = [{'id': 2}]
        response = self.solr.update(documents, input_type='xml')
        self.assertEqual(response.status, 200)
        # Compare total results
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, total_results + 2)

        # Now delete the two documents posted above
        query = 'id:1'
        key = 2
        response = self.solr.delete_by_query(query)
        self.assertEqual(response.status, 200)
        response = self.solr.delete_by_key(key)
        self.assertEqual(response.status, 200)
        response = self.solr.search(q='*:*')
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, total_results)

    def tearDown(self):
        pass

    def test_query(self):
        pass
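The tests above assume a core already seeded with four documents. A typical runner stanza:

if __name__ == '__main__':
    unittest.main()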
Example #11
import xml.parsers.expat

from mysolr import Solr

from Resource.ResourceHelper import ResourceHelper
from Resource.Resource import Resource
from Util.PathTool import PathTool
from Digester.FeedDictFactory import FeedDictFactory

solrBase = "http://localhost:8983/solr/"

solr = Solr(solrBase)

_pt = PathTool.PathTool()
_rh = ResourceHelper()
feeds = _rh.getAllFeedPaths()
for feed in feeds:   
    try:
        feedDictFactory = FeedDictFactory()
        feedDict = feedDictFactory.getFeedDict(feed)
        if feedDict:
            feedDict['id'] = Resource(feed, 'feed').get_id()
            print(feedDict['id'])
            print("Indexing", feedDict)
            
            solr.update([feedDict], 'json', commit=True)
            print('Indexed.')
    except (xml.parsers.expat.ExpatError, ValueError):
        print(("Failed:", feed))

print("done")
Example #13
from mysolr import Solr

solr = Solr()

# Queries to be executed concurrently
queries = [
    {
        'q' : '*:*'
    },
    {
        'q' : 'foo:bar'
    }
]

# using 10 threads
responses = solr.async_search(queries, size=10)
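async_search issues the queries concurrently (an optional mysolr feature; see the installation note below); a sketch of consuming the responses above:

# Sketch: each entry in responses corresponds to one query above.
for response in responses:
    if response.status == 200:
        print(len(response.documents), 'documents for this query')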

# See the installation section for further information about how to install this feature.

# Indexing documents
from mysolr import Solr

solr = Solr()

# Create documents
documents = [
    {'id' : 1,
     'field1' : 'foo'
    },
    {'id' : 2,
     'field2' : 'bar'
    }
]
# Indexing using json is faster!
solr.update(documents, 'json', commit=False)

# Manual commit
solr.commit()

Example #14
import os
import unittest

from mysolr import Solr


class QueryResultTestCase(unittest.TestCase):
    def setUp(self):
        self.solr = Solr(os.getenv("SOLR_URL"))

    def test_search(self):
        response = self.solr.search(q="*:*")
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, 4)
        self.assertEqual(len(response.documents), 4)

    def test_search_cursor(self):
        cursor = self.solr.search_cursor(q="*:*")
        i = 0
        for response in cursor.fetch(1):
            self.assertEqual(response.status, 200)
            i += 1
        self.assertEqual(i, 4)

        cursor = self.solr.search_cursor(q="*:*")
        i = 0
        for response in cursor.fetch(4):
            self.assertEqual(response.status, 200)
            i += 1
        self.assertEqual(i, 1)

    def test_commit(self):
        response = self.solr.commit()
        self.assertEqual(response.status, 200)

    def test_optimize(self):
        response = self.solr.optimize()
        self.assertEqual(response.status, 200)

    def test_ping(self):
        response = self.solr.ping()
        self.assertEqual(response.status, 200)

    def test_is_up(self):
        response = self.solr.is_up()
        self.assertEqual(response, True)

    def test_update_delete(self):
        # Get total results
        response = self.solr.search(q="*:*")
        self.assertEqual(response.status, 200)
        total_results = response.total_results
        # Post one document using json
        documents = [{"id": 1}]
        response = self.solr.update(documents, input_type="json")
        self.assertEqual(response.status, 200)
        # Post another document using xml
        documents = [{"id": 2}]
        response = self.solr.update(documents, input_type="xml")
        self.assertEqual(response.status, 200)
        # Compare total results
        response = self.solr.search(q="*:*")
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, total_results + 2)

        # Now delete the two documents posted above
        query = "id:1"
        key = 2
        response = self.solr.delete_by_query(query)
        self.assertEqual(response.status, 200)
        response = self.solr.delete_by_key(key)
        self.assertEqual(response.status, 200)
        response = self.solr.search(q="*:*")
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, total_results)

    def tearDown(self):
        pass

    def test_query(self):
        pass
Example #15
class UpdateItems(object):
    """Create a new SharedItem or update it if it already exists.
    This will find all the entries, then create / update them. Then
    do a batch index to Solr.
    """

    def __init__(self, context, request):
        self.context = context
        self.request = request
        self.create_count = 0
        self.update_count = 0
        self.messages = []
        self.to_index = []
        solr_uri = request.registry.settings.get('push.solr_uri', None)
        if solr_uri is None:
            raise AttributeError(u'A push.solr_uri is required')
        # XXX: We are importing solr here to be able to mock it in the tests
        from mysolr import Solr
        self.solr = Solr(solr_uri)
        self.shared = context.shared

    def __call__(self):
        #  If the request isn't an RSS feed, bail out
        if self.request.content_type not in ALLOWED_CONTENT:
            body_msg = (
                "The content-type of the request must be one of the "
                "following: %s"
            ) % ", ".join(ALLOWED_CONTENT)
            return HTTPBadRequest(body=body_msg)
        # Create / update
        self._process_items()
        # Index in Solr
        self._update_index()
        # Return a 200 with details on what happened in the body
        self.messages.append("%s items created." % self.create_count)
        self.messages.append("%s items updated." % self.update_count)
        return HTTPOk(body=" ".join(self.messages))

    def _process_items(self):
        """Get a list of new items to create and existing items that
        need to be updated.
        """
        shared_content = feedparser.parse(self.request.body)
        for item in shared_content.entries:
            uid = item['id']
            # Get the uid, minus the urn:syndication bit
            item['uid'] = uid = normalize_uid(uid)
            logger.info('Processing item %s' % uid)
            item['link'] = item.link
            item['feed_link'] = shared_content.feed.link
            if uid in self.shared:
                self._update_item(item)
            else:
                self._create_item(item)

    def _create_item(self, entry):
        """Create new items in the feed
        """
        new_item = SharedItem()
        uid = entry['uid']
        logger.info('Creating item %s' % uid)
        new_item.update_from_entry(entry)
        # XXX: Should name and parent be necessary here? Shouldn't
        #      the `add` method do that for us?
        new_item.__name__ = uid
        new_item.__parent__ = self.shared
        self.shared.add(uid, new_item)
        self.to_index.append(self.shared[uid])
        self.create_count += 1

    def _update_item(self, entry):
        """Update existing items in the db using their UID
        """
        uid = entry['uid']
        logger.info('Updating item %s' % uid)
        obj = self.shared[uid]
        # XXX: these aren't coming from the object. Why is that? Is
        #      the `add` method on the folder not setting them?
        obj.__name__ = uid
        obj.__parent__ = self.shared
        selected_or_shared = (
            'selected' in entry['feed_link'] or
            'shared' in entry['feed_link']
        )
        if selected_or_shared and hasattr(obj, 'deletion_type'):
            remove_deleted_status(uid, self.shared, self.solr)
        obj.update_from_entry(entry)
        self.to_index.append(obj)
        self.update_count += 1

    def _update_index(self):
        """Clean up the item dictionaries to contain only items that
        are valid and send them over to Solr for indexing.

        NOTE: Solr may error out on index if it receives a field it is
              not aware of. We should change this code to look up the
              Solr schema, and remove attributes that it doesn't know,
              like __name__ and __parent__ below.
        """
        logger.debug('Updating index for %s objects' % len(self.to_index))
        cleaned = []
        ignored_attrs = [
            '__name__',
            '__parent__',
            'deletion_type',
        ]
        for item in self.to_index:
            item_dict = copy.deepcopy(item.__dict__)
            if 'Modified' in item_dict:
                if hasattr(item_dict['Modified'], 'isoformat'):
                    mod_date = item_dict['Modified'].isoformat()
                else:
                    mod_date = item_dict['Modified']
                # Make sure the date is acceptable to Solr, strip off
                # the +00:00 and replace it with a Z
                item_dict['Modified'] = "%sZ" % mod_date[:-6]
            item_dict['uid'] = item_dict['__name__']
            # XXX: Need to look up the schema, then modify the dict
            #      based on that.
            for attr in ignored_attrs:
                item_dict.pop(attr, '')
            cleaned.append(item_dict)
        # XXX: Need to handle Solr errors here
        response = self.solr.update(cleaned)
        return response
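The Modified cleanup above relies on isoformat() producing a '+00:00' suffix; a toy illustration with an invented value:

# Invented value showing the slice-and-suffix step above.
mod_date = '2015-06-01T12:30:00+00:00'   # isoformat() output
solr_date = "%sZ" % mod_date[:-6]        # -> '2015-06-01T12:30:00Z'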
Example #16
class UpdateItems(object):
    """Create a new SharedItem or update it if it already exists.
    This will find all the entries, then create / update them. Then
    do a batch index to Solr.
    """

    def __init__(self, context, request):
        self.context = context
        self.request = request
        self.create_count = 0
        self.update_count = 0
        self.messages = []
        self.to_index = []
        solr_uri = request.registry.settings.get('push.solr_uri', None)
        if solr_uri is None:
            raise AttributeError(u'A push.solr_uri is required')
        # XXX: We are importing solr here to be able to mock it in the tests
        from mysolr import Solr
        self.solr = Solr(solr_uri)
        self.shared = context.shared

    def __call__(self):
        #  If the request isn't an RSS feed, bail out
        if self.request.content_type not in ALLOWED_CONTENT:
            body_msg = (
                "The content-type of the request must be one of the "
                "following: %s"
            ) % ", ".join(ALLOWED_CONTENT)
            return HTTPBadRequest(body=body_msg)
        # Create / update
        self._process_items()
        # Index in Solr
        self._update_index()
        # Return a 200 with details on what happened in the body
        self.messages.append("%s items created." % self.create_count)
        self.messages.append("%s items updated." % self.update_count)
        return HTTPOk(body=" ".join(self.messages))

    def _process_items(self):
        """Get a list of new items to create and existing items that
        need to be updated.
        """
        shared_content = feedparser.parse(self.request.body)
        for item in shared_content.entries:
            uid = item['id']
            # Get the uid, minus the urn:syndication bit
            item['uid'] = uid = normalize_uid(uid)
            logger.info('Processing item %s' % uid)
            item['link'] = item.link
            item['feed_link'] = shared_content.feed.link
            if uid in self.shared:
                self._update_item(item)
            else:
                self._create_item(item)

    def _create_item(self, entry):
        """Create new items in the feed
        """
        new_item = SharedItem()
        uid = entry['uid']
        logger.info('Creating item %s' % uid)
        new_item.update_from_entry(entry)
        # XXX: Should name and parent be necessary here? Shouldn't
        #      the `add` method do that for us?
        new_item.__name__ = uid
        new_item.__parent__ = self.shared
        self.shared.add(uid, new_item)
        self.to_index.append(self.shared[uid])
        self.create_count += 1

    def _update_item(self, entry):
        """Update existing items in the db using their UID
        """
        uid = entry['uid']
        logger.info('Updating item %s' % uid)
        obj = self.shared[uid]
        # XXX: these aren't coming from the object. Why is that? Is
        #      the `add` method on the folder not setting them?
        obj.__name__ = uid
        obj.__parent__ = self.shared
        selected_or_shared = (
            'selected' in entry['feed_link'] or
            'shared' in entry['feed_link']
        )
        if selected_or_shared and hasattr(obj, 'deletion_type'):
            remove_deleted_status(uid, self.shared, self.solr)
        obj.update_from_entry(entry)
        self.to_index.append(obj)
        self.update_count += 1

    def _update_index(self):
        """Clean up the item dictionaries to contain only items that
        are valid and send them over to Solr for indexing.

        NOTE: Solr may error out on index if it receives a field it is
              not aware of. We should change this code to look up the
              Solr schema, and remove attributes that it doesn't know,
              like __name__ and __parent__ below.
        """
        logger.debug('Updating index for %s objects' % len(self.to_index))
        cleaned = []
        ignored_attrs = [
            '__name__',
            '__parent__',
            'deletion_type',
        ]
        for item in self.to_index:
            item_dict = copy.deepcopy(item.__dict__)
            if 'Modified' in item_dict:
                if hasattr(item_dict['Modified'], 'isoformat'):
                    mod_date = item_dict['Modified'].isoformat()
                else:
                    mod_date = item_dict['Modified']
                # Make sure the date is acceptable to Solr, strip off
                # the +00:00 and replace it with a Z
                item_dict['Modified'] = "%sZ" % mod_date[:-6]
            if 'content' in item_dict:
                items = [item['value'] for item in item_dict['content']]
                if items:
                    # XXX: use first content item, discard the rest
                    item_dict['content'] = items[0]
            item_dict['uid'] = item_dict['__name__']
            # XXX: Need to look up the schema, then modify the dict
            #      based on that.
            for attr in ignored_attrs:
                item_dict.pop(attr, '')
            cleaned.append(item_dict)
        # XXX: Need to handle Solr errors here
        response = self.solr.update(cleaned)
        return response
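This variant also flattens multi-part content down to its first entry; a toy illustration with invented data:

# Invented data mirroring the flattening step above.
item_dict = {'content': [{'value': 'first'}, {'value': 'second'}]}
items = [item['value'] for item in item_dict['content']]
item_dict['content'] = items[0]   # keeps 'first', discards the rest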
Example #17
        '_id': True,
        'year': True,
        'court': True,
        'court_level': True,
        'url': True,
        'name': True,
        'content': True,
        'tags': True,
        'subjects': True
}):
    if count % 100 == 0:
        print count

    # Solr will not accept Mongo ObjectIds directly, so stringify them
    doc['_id'] = str(doc['_id'])
    # keep a subject tag only if its weight is at least 0.05
    if 'subjects' in doc:
        sub_tmp = [k for k, v in doc['subjects'].items() if v >= 0.05]
        doc['subjects'] = sub_tmp
    count += 1
    documents.append(doc)

# JSON indexing is supposed to be faster; at least with mysolr, sending one
# big list is much faster for 18300 docs: 3 minutes vs 1 min 53 sec.
print "updating..."
solr.update(documents, 'json', commit=False)
print "committing..."
solr.commit()
print "done..."
Example #18
from bson.objectid import ObjectId
from pymongo import MongoClient
from mysolr import Solr


# HOST, PORT and SERVER are assumed module-level configuration constants.
def index():
    client = MongoClient(host=HOST, port=PORT)
    db = client['crawl']
    coll = db['web']

    server = Solr(SERVER)
    max_indexed_id = get_max_indexed_id(server)
    if not max_indexed_id:
        max_indexed_id = ObjectId('000000000000')
    else:
        max_indexed_id = ObjectId(max_indexed_id)

    sites = get_host_name()

    step = 100
    count = 0

    jdocs = []
    for row in coll.find({'_id': {'$gt': max_indexed_id}}).sort([('_id',1)]):
        jdoc = {}
        jdoc['id'] = str(row['_id'])
        if len(jdocs) == 0:
            start = row['_id']
        jdoc['url'] = row['curi:url']

        jdoc['site'] = sites[get_url_domain(row['curi:url'])]

        jdoc['ip'] = row['curi:ip']
        if 'curi:processed_at' in row:
            jdoc['processed_at'] = row['curi:processed_at']
        if 'content_type' in row:
            jdoc['content_type'] = row['content_type']
        if 'content_length' in row:
            jdoc['content_length'] = row['content_length']
        if 'class_key' in row:
            jdoc['class_key'] = row['class_key']
        if 'host' in row:
            jdoc['host'] = row['host']
        if 'curi:request' in row:
            jdoc['request'] = row['curi:request']
        if 'content:headers' in row:
            jdoc['headers'] = row['content:headers']
        if 'text' in row:
            jdoc['text'] = row['text']
        if 'title' in row:
            jdoc['title'] = row['title']
        if 'parse:keywords' in row:
            jdoc['keywords'] = row['parse:keywords']
        if 'parse:content-encoding' in row:
            jdoc['content_encoding'] = row['parse:content-encoding']
        if 'content:raw_data' in row:
            jdoc['raw_data'] = row['content:raw_data']

        jdocs.append(jdoc)
        count = count + 1

        if len(jdocs) >= step:
            end = row['_id']
            response = server.update(jdocs)
            server.commit()
            print response
            jdocs = []
            print('commit %d documents. %s to %s' % (count, start, end))

    if len(jdocs) > 0:
        server.update(jdocs)
        server.commit()
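The loop above flushes the final partial batch by hand; the same pattern as a small generic helper (a sketch, names invented):

def index_in_batches(server, docs, batch_size=100):
    # Send docs to Solr in fixed-size batches, committing after each one.
    batch = []
    for doc in docs:
        batch.append(doc)
        if len(batch) >= batch_size:
            server.update(batch)
            server.commit()
            batch = []
    if batch:
        # Flush whatever remains in the final partial batch.
        server.update(batch)
        server.commit()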
Example #20
		pass
	else:
		continue
	
	print page['name'].encode('utf-8')
	#print page['subname']
	print page['_id']

	item['id'] = page['_id']
	item['name'] = page['name']
	if 'subname' in page:
		item['subname'] = page['subname']
	#item['content'] = page['content']
	items.append(item)

solr.update(items, 'json', commit=False)
solr.commit()

'''
items = []

for index in xrange(11,16):
	game = games[index]
	item = {}
	item['bname'] = game['name']
	item['bsummary'] = game['des']
	item['id'] = game['url']
	tmp = ''
	for x in game['property']:
		tmp = tmp + x['title']+':'
		tmp = tmp + x['content']+'\n'