Пример #1
0
class LogIndexer(object):
    '''
    Push individual records into a Solr index.

    Wraps a ``Solr`` client built from the address given at construction.
    '''

    def __init__(self, solrAddr):
        '''
        Create the Solr client used by ``index``.

        :param solrAddr: address passed unchanged to ``Solr``.
        '''
        self.solr = Solr(solrAddr)

    def index(self, data):
        '''
        Send one record to Solr and commit.

        Every ``datetime.datetime`` value in ``data`` is replaced, in place,
        by the string returned from ``solr.core.utc_to_string``. When the
        direct conversion fails, the value is tagged with the Europe/Paris
        time zone and converted again.

        A failure of the update request is reported on stdout and does not
        propagate; the commit is still issued, but the success message is
        only printed when the update went through.

        :param data: dict describing the document; mutated in place.
        '''
        for key, value in data.items():
            if not isinstance(value, datetime.datetime):
                continue
            try:
                converted = solr.core.utc_to_string(value)
            except Exception:
                # NOTE(review): presumably hit for naive datetimes, which are
                # then treated as Europe/Paris local time -- confirm.
                paris = tz.gettz('Europe/Paris')
                converted = solr.core.utc_to_string(value.replace(tzinfo=paris))
            # Store the string in both cases; previously a value that
            # converted on the first attempt was left as a raw datetime.
            data[key] = converted

        update_ok = True
        try:
            self.solr.update([data])
        except Exception as exc:
            update_ok = False
            print("Erreur Index request: %s" % exc)
        self.solr.commit()
        if update_ok:
            print("data indexed")
Пример #2
0
class SolrUtils:
    """Small convenience wrapper around a ``Solr`` connection."""

    def __init__(self, url):
        """Remember *url* and open a ``Solr`` connection to it."""
        self.url = url
        connection = Solr(url)
        self.conn = connection

    def addJSONDoc(self, doc):
        """Send *doc* as a JSON update without committing."""
        connection = self.conn
        connection.update(doc, 'json', commit=False)

    def commit(self):
        """Commit on the underlying connection."""
        connection = self.conn
        connection.commit()
Пример #3
0
class QueryResultTestCase(unittest.TestCase):
    """Tests for the ``Solr`` client run against a live server.

    The assertions expect a match-all query to report exactly four documents.
    """

    def setUp(self):
        # One client per test, pointed at the local Solr instance.
        self.solr = Solr('http://localhost:8983/solr')

    def tearDown(self):
        pass

    def _assert_ok(self, response):
        """Assert that *response* has status 200 and hand it back."""
        self.assertEqual(response.status, 200)
        return response

    def test_search(self):
        """A match-all search reports and returns four documents."""
        result = self._assert_ok(self.solr.search(q='*:*'))
        self.assertEqual(result.total_results, 4)
        self.assertEqual(len(result.documents), 4)

    def test_search_cursor(self):
        """Fetching by 1 yields four responses; fetching by 4 yields one."""
        for page_size, expected_pages in ((1, 4), (4, 1)):
            cursor = self.solr.search_cursor(q='*:*')
            pages = 0
            for pages, response in enumerate(cursor.fetch(page_size), 1):
                self.assertEqual(response.status, 200)
            self.assertEqual(pages, expected_pages)

    def test_commit(self):
        self._assert_ok(self.solr.commit())

    def test_optimize(self):
        self._assert_ok(self.solr.optimize())

    def test_ping(self):
        self._assert_ok(self.solr.ping())

    def test_is_up(self):
        self.assertEqual(self.solr.is_up(), True)

    def test_update_delete(self):
        """Add two documents (json, then xml), then delete both again."""
        # Result count before anything is posted.
        baseline = self._assert_ok(self.solr.search(q='*:*')).total_results

        # Post one document per input type.
        for doc_id, input_type in ((1, 'json'), (2, 'xml')):
            self._assert_ok(
                self.solr.update([{'id': doc_id}], input_type=input_type))

        # Both documents must now be counted.
        after_insert = self._assert_ok(self.solr.search(q='*:*'))
        self.assertEqual(after_insert.total_results, baseline + 2)

        # Remove the first by query and the second by key.
        self._assert_ok(self.solr.delete_by_query('id:1'))
        self._assert_ok(self.solr.delete_by_key(2))

        after_delete = self._assert_ok(self.solr.search(q='*:*'))
        self.assertEqual(after_delete.total_results, baseline)

    def test_query(self):
        pass
Пример #4
0
class QueryResultTestCase(unittest.TestCase):
    """Exercise the ``Solr`` client against the server named by ``SOLR_URL``.

    The assertions expect a match-all query to report exactly four documents.
    """

    def setUp(self):
        # The server address comes from the environment.
        solr_url = os.getenv('SOLR_URL')
        self.solr = Solr(solr_url)

    def tearDown(self):
        pass

    def test_search(self):
        """Match-all search: status 200 and four documents."""
        found = self.solr.search(q='*:*')
        self.assertEqual(found.status, 200)
        self.assertEqual(found.total_results, 4)
        self.assertEqual(len(found.documents), 4)

    def test_search_cursor(self):
        """Cursor paging: four pages of one, then one page of four."""
        # Page size 1 -> four responses.
        fetched = 0
        one_by_one = self.solr.search_cursor(q='*:*')
        for fetched, page in enumerate(one_by_one.fetch(1), 1):
            self.assertEqual(page.status, 200)
        self.assertEqual(fetched, 4)

        # Page size 4 -> a single response.
        fetched = 0
        all_at_once = self.solr.search_cursor(q='*:*')
        for fetched, page in enumerate(all_at_once.fetch(4), 1):
            self.assertEqual(page.status, 200)
        self.assertEqual(fetched, 1)

    def test_commit(self):
        self.assertEqual(self.solr.commit().status, 200)

    def test_optimize(self):
        self.assertEqual(self.solr.optimize().status, 200)

    def test_ping(self):
        self.assertEqual(self.solr.ping().status, 200)

    def test_is_up(self):
        self.assertEqual(self.solr.is_up(), True)

    def test_update_delete(self):
        """Post two documents, check the count, delete them, check again."""
        match_all = '*:*'

        # Count before posting anything.
        before = self.solr.search(q=match_all)
        self.assertEqual(before.status, 200)
        initial_total = before.total_results

        # First document goes in as json, the second as xml.
        posted = self.solr.update([{'id': 1}], input_type='json')
        self.assertEqual(posted.status, 200)
        posted = self.solr.update([{'id': 2}], input_type='xml')
        self.assertEqual(posted.status, 200)

        # The total must have grown by two.
        grown = self.solr.search(q=match_all)
        self.assertEqual(grown.status, 200)
        self.assertEqual(grown.total_results, initial_total + 2)

        # Delete document 1 by query and document 2 by key.
        removed = self.solr.delete_by_query('id:1')
        self.assertEqual(removed.status, 200)
        removed = self.solr.delete_by_key(2)
        self.assertEqual(removed.status, 200)

        # Back to the initial total.
        restored = self.solr.search(q=match_all)
        self.assertEqual(restored.status, 200)
        self.assertEqual(restored.total_results, initial_total)

    def test_query(self):
        pass
Пример #5
0
    },
    {
        'q' : 'foo:bar'
    }
]

# Run all queries through async_search; size=10 asks for 10 threads
# (per the original note).
responses = solr.async_search(queries, size=10)

# See the installation section for further information about how to install this feature.
# Indexing documents
from mysolr import Solr

# Client built with the default constructor arguments.
solr = Solr()

# Two sample documents, each with its own extra field.
documents = [
    dict(id=1, field1='foo'),
    dict(id=2, field2='bar'),
]

# Send the batch as JSON (noted as the faster input type) and hold the commit.
solr.update(documents, 'json', commit=False)

# Commit explicitly once the update has been sent.
solr.commit()

Пример #6
0
def del_all_index(server):
    """Delete everything matching ``*:*`` on the Solr at *server*, then commit."""
    client = Solr(server)
    client.delete_by_query('*:*')
    client.commit()
Пример #7
0
def index():
    """Copy new rows from the Mongo ``crawl.web`` collection into Solr.

    Rows whose ``_id`` is greater than the id returned by
    ``get_max_indexed_id`` are read in ascending ``_id`` order, turned into
    Solr documents and sent in batches of 100, with a commit after each
    batch. A final partial batch is sent and committed without a log line.
    """
    # (row key, document field) pairs copied only when the row has the key.
    optional_fields = (
        ('curi:processed_at', 'processed_at'),
        ('content_type', 'content_type'),
        ('content_length', 'content_length'),
        ('class_key', 'class_key'),
        ('host', 'host'),
        ('curi:request', 'request'),
        ('content:headers', 'headers'),
        ('text', 'text'),
        ('title', 'title'),
        ('parse:keywords', 'keywords'),
        ('parse:content-encoding', 'content_encoding'),
        ('content:raw_data', 'raw_data'),
    )
    batch_size = 100

    collection = MongoClient(host=HOST, port=PORT)['crawl']['web']
    server = Solr(SERVER)

    # Resume after the last indexed id, or from a placeholder id if none.
    last_id = get_max_indexed_id(server)
    last_id = ObjectId(last_id) if last_id else ObjectId('000000000000')

    sites = get_host_name()

    total = 0
    batch = []
    newer_rows = collection.find({'_id': {'$gt': last_id}}).sort([('_id', 1)])
    for row in newer_rows:
        doc = {'id': str(row['_id'])}
        if not batch:
            # First row of a new batch; remembered for the log line.
            start = row['_id']

        # Mandatory fields.
        url = row['curi:url']
        doc['url'] = url
        doc['site'] = sites[get_url_domain(url)]
        doc['ip'] = row['curi:ip']

        # Optional fields.
        for row_key, field in optional_fields:
            if row_key in row:
                doc[field] = row[row_key]

        batch.append(doc)
        total += 1

        if len(batch) >= batch_size:
            end = row['_id']
            result = server.update(batch)
            server.commit()
            print(result)
            batch = []
            print('commit %d documents. %s to %s' % (total, start, end))

    # Flush whatever is left over from the last incomplete batch.
    if batch:
        server.update(batch)
        server.commit()
Пример #8
0
        '_id': True,
        'year': True,
        'court': True,
        'court_level': True,
        'url': True,
        'name': True,
        'content': True,
        'tags': True,
        'subjects': True
}):
    if count % 100 == 0:
        print count

    # don't know how else to get solr to take IDs...
    doc['_id'] = str(doc['_id'])
    # keep only the subject tags (as a list of strings) whose weight is at least 0.05
    if 'subjects' in doc:
        sub_tmp = [k for k, v in doc['subjects'].items() if v >= 0.05]
        doc['subjects'] = sub_tmp
    count += 1
    documents.append(doc)

# JSON indexing is supposed to be faster. At least with mysolr, sending the
# documents as one big list is much faster for 18300 docs
# (3 minutes vs 1 min 53 sec).
print("updating...")
solr.update(documents, 'json', commit=False)
print("committing...")
solr.commit()
print("done...")
Пример #9
0
class QueryResultTestCase(unittest.TestCase):
    """Live-server checks for the ``Solr`` client (address from ``SOLR_URL``).

    The assertions expect a match-all query to report exactly four documents.
    """

    MATCH_ALL = "*:*"

    def setUp(self):
        self.solr = Solr(os.getenv("SOLR_URL"))

    def tearDown(self):
        pass

    def _total_results(self):
        """Run a match-all search, assert status 200, return its total."""
        response = self.solr.search(q=self.MATCH_ALL)
        self.assertEqual(response.status, 200)
        return response.total_results

    def _count_pages(self, page_size):
        """Fetch a match-all cursor by *page_size*; return the response count."""
        count = 0
        cursor = self.solr.search_cursor(q=self.MATCH_ALL)
        for count, response in enumerate(cursor.fetch(page_size), 1):
            self.assertEqual(response.status, 200)
        return count

    def test_search(self):
        """Match-all search returns status 200 and four documents."""
        response = self.solr.search(q=self.MATCH_ALL)
        self.assertEqual(response.status, 200)
        self.assertEqual(response.total_results, 4)
        self.assertEqual(len(response.documents), 4)

    def test_search_cursor(self):
        """Four responses when fetching by 1, a single one when fetching by 4."""
        self.assertEqual(self._count_pages(1), 4)
        self.assertEqual(self._count_pages(4), 1)

    def test_commit(self):
        outcome = self.solr.commit()
        self.assertEqual(outcome.status, 200)

    def test_optimize(self):
        outcome = self.solr.optimize()
        self.assertEqual(outcome.status, 200)

    def test_ping(self):
        outcome = self.solr.ping()
        self.assertEqual(outcome.status, 200)

    def test_is_up(self):
        alive = self.solr.is_up()
        self.assertEqual(alive, True)

    def test_update_delete(self):
        """Insert two documents, verify the count, remove them, verify again."""
        starting_total = self._total_results()

        # One document posted as json, another as xml.
        uploads = (
            ([{"id": 1}], "json"),
            ([{"id": 2}], "xml"),
        )
        for docs, input_type in uploads:
            outcome = self.solr.update(docs, input_type=input_type)
            self.assertEqual(outcome.status, 200)

        # Two more results than before.
        self.assertEqual(self._total_results(), starting_total + 2)

        # Remove document 1 through a query and document 2 through its key.
        outcome = self.solr.delete_by_query("id:1")
        self.assertEqual(outcome.status, 200)
        outcome = self.solr.delete_by_key(2)
        self.assertEqual(outcome.status, 200)

        # The index is back to its starting size.
        self.assertEqual(self._total_results(), starting_total)

    def test_query(self):
        pass