class SingleMongodbPipeline(object):
    """Scrapy item pipeline that saves GoodsItem data to a single MongoDB server."""

    # Defaults; overwritten from the Single* module settings in open_spider().
    MONGODB_SERVER = "localhost"
    MONGODB_PORT = 27017
    MONGODB_DB = "books_fs"

    def __init__(self):
        """
        The only async framework that PyMongo fully supports is Gevent.
        Currently there is no great way to use PyMongo in conjunction with
        Tornado or Twisted. PyMongo provides built-in connection pooling, so
        some of the benefits of those frameworks can be achieved just by
        writing multi-threaded code that shares a MongoClient.
        """
        self.client = None
        self.db = None

    def open_spider(self, spider):
        """Called when the spider opens: connect to MongoDB with the configured settings."""
        self.MONGODB_SERVER = SingleMONGODB_SERVER
        self.MONGODB_PORT = SingleMONGODB_PORT
        self.MONGODB_DB = SingleMONGODB_DB
        try:
            self.client = MongoClient(self.MONGODB_SERVER, self.MONGODB_PORT)
            self.db = self.client[self.MONGODB_DB]
        except Exception as e:
            # NOTE(review): scrapy's log.ERROR is normally a level constant,
            # not a callable -- confirm this is the intended logging call.
            # Single-argument print() behaves identically on Python 2 and 3.
            print(log.ERROR("ERROR(SingleMongodbPipeline): %s" % (str(e),)))
            traceback.print_exc()

    def process_item(self, item, spider):
        """Persist GoodsItem instances; pass any other item type through untouched."""
        if not isinstance(item, GoodsItem):
            return item
        # insert() is the legacy pymongo 2.x API; insert_one() replaces it in 3.x.
        self.db['goodsitems'].insert(dict(item))
        return item

    def close_spider(self, spider):
        """Called when the spider closes: release the MongoDB connection."""
        if self.client:
            self.client.close()
def main():
    # CLI entry point: continuously pull "air" documents from MongoDB and
    # index them into Elasticsearch, marking each as published via pub_ts.
    parser = argparse.ArgumentParser()
    parser.add_argument("--last_id", type=int, default=1)
    parser.add_argument("--index", type=str, default='sunrise3')
    parser.add_argument("--force", action="store_true")
    parser.add_argument("--cached", action="store_true")
    parser.add_argument("--stathat", action="store_true")
    args = parser.parse_args()
    print args
    db = MongoClient()['test']
    es = Elasticsearch()
    stats = StatHat('hq08Ng2ujA8o3VPe')
    lastfm_url = "http://ws.audioscrobbler.com/2.0/?api_key=048cd62229f507d6d577393a6d7ac972&format=json"
    factory = Factory(db, lastfm_url)
    factory.cached = args.cached
    last_id = args.last_id
    # Poll forever: each pass indexes up to 100 documents newer than last_id.
    while True:
        # Only finished recordings (end_ts > 0); --force reindexes documents
        # that were already published, otherwise only pub_ts == 0 ones.
        where = {'_id': {'$gt': last_id}, 'end_ts': {'$gt': 0}}
        #where = {'_id': {'$gt': last_id}}
        if not args.force:
            where['pub_ts'] = 0
        print where
        oid = last_id
        for air in db.air.find(where).sort('ts').limit(100):
            oid = air['_id']
            audio = factory.build_audio_from_air(air)
            es.index(index=args.index, doc_type='audio', id=oid, body=audio)
            if not args.force:
                # Mark the source document as published so it is skipped next pass.
                db.air.update({'_id': oid}, {'$set': {
                    'pub_ts': int(time.time())
                }})
            print '---' * 10, oid
            pp(audio)
            if args.stathat:
                stats.count('index.audio', 1)
                if audio.get('is_track'):
                    stats.count('index.track', 1)
        # oid unchanged means the batch was empty: either loop again
        # immediately (--force) or sleep before polling for new documents.
        if oid == last_id:
            if args.force:
                continue
            else:
                print 'wait for new tracks...'
                time.sleep(10)
        else:
            last_id = oid
def __init__(self):
    """Connect to MongoDB for this pipeline.

    The only async framework that PyMongo fully supports is Gevent.
    Currently there is no great way to use PyMongo in conjunction with
    Tornado or Twisted. PyMongo provides built-in connection pooling, so
    some of the benefits of those frameworks can be achieved just by
    writing multi-threaded code that shares a MongoClient.
    """
    self.style = color.color_style()
    try:
        client = MongoClient(self.MONGODB_SERVER, self.MONGODB_PORT)
        self.db = client[self.MONGODB_DB]
    except Exception as e:
        # Single-argument print() behaves identically on Python 2 and 3;
        # the original Py2-only print statement is a SyntaxError on Py3.
        print(self.style.ERROR("ERROR(ShardMongodbPipeline): %s" % (str(e),)))
        traceback.print_exc()
def open_spider(self, spider):
    """Called when the spider opens: connect to MongoDB using the Single* settings."""
    self.MONGODB_SERVER = SingleMONGODB_SERVER
    self.MONGODB_PORT = SingleMONGODB_PORT
    self.MONGODB_DB = SingleMONGODB_DB
    try:
        self.client = MongoClient(self.MONGODB_SERVER, self.MONGODB_PORT)
        self.db = self.client[self.MONGODB_DB]
    except Exception as e:
        # NOTE(review): log.ERROR is usually a level constant, not a callable
        # -- confirm the intended logging API. Single-argument print() is
        # identical on Python 2 and 3; the print statement form is Py2-only.
        print(log.ERROR("ERROR(SingleMongodbPipeline): %s" % (str(e),)))
        traceback.print_exc()
def __init__(self, host='localhost', port=27017, db='taskdb', user='',
             password=None, coll='task', pool=100):
    """Open a pooled MongoDB connection and bind the task collection.

    The only async framework that PyMongo fully supports is Gevent.
    Currently there is no great way to use PyMongo in conjunction with
    Tornado or Twisted. PyMongo provides built-in connection pooling, so
    some of the benefits of those frameworks can be achieved just by
    writing multi-threaded code that shares a MongoClient.
    """
    try:
        connection = MongoClient(host, port, max_pool_size=pool)
        # Authenticate only when a user name was supplied.
        if user:
            connection.the_database.authenticate(user, password, source=db)
        database = connection[db]
        self.db = database
        self.coll = database[coll]
    except Exception as exc:
        # Best-effort: report the failure but let construction complete.
        print('connect to mongodb error.', exc)
'book_detail':\ {\ (('book_name',ASCENDING),('author',ASCENDING)):{'name':'book_name_author','unique':True}, 'book_name':{'name':'book_name'}, 'author':{'name':'author'}, 'alias_name':{'name':'alias_name'}, }\ } def drop_database(name_or_database): if name_or_database and client: client.drop_database(name_or_database) def create_index(): """ create index for books_fs.book_detail """ for k, v in INDEX.items(): for k, kwargs in v.items(): client[DATABASE_NAME][K].ensure_index( list(key) if type(key) == types.TupleType else key, **kwargs) if __name__ == '__main__': client = MongoClient(DATABASE_HOST, DATABASE_PORT) drop_database(DATABASE_NAME) create_index()
#!/usr/bin/env python from pymongo.connection import MongoClient import csv import json """ Script para re-importar los posts a la base de datos """ reader = csv.DictReader(open('../data/redit.csv'), fieldnames=('image_id', 'unixtime', 'rawtime', 'title', 'total_votes', 'reddit_id', 'number_of_upvotes', 'subreddit', 'number_of_downvotes', 'localtime', 'score', 'number_of_comments', 'username')) conn = MongoClient() db = conn.reddit print "Cleaning DB collections %s.%s" % ("reddit", "posts") db.posts.remove() for it in reader: try: it['total_votes'] = int(it['total_votes']) it['number_of_upvotes'] = int(it['number_of_upvotes']) it['number_of_downvotes'] = int(it['number_of_downvotes']) it['score'] = int(it['score']) it['number_of_comments'] = int(it['number_of_comments']) db.posts.insert(it) except Exception as e: print e, "while inserting", it print "Inserted %d records" % db.posts.count() assert db.posts.count(
def setUp(self):
    """Connect to the local MongoDB 'reddit' database and expose a query helper."""
    connection = MongoClient()
    reddit_db = connection.reddit
    data_collection = reddit_db.posts  # kept for parity with the original fixture

    def run_query(res_name, mask):
        # Resolve the named collection on the database and run a find() with the mask.
        return getattr(reddit_db, res_name).find(mask)

    self.ej_result = run_query
def __init__(self):
    """Bind this instance to the MongoDB database named in the settings."""
    mongo = MongoClient(settings.MONGO_URI)
    self._db = mongo[settings.MONGO_DB]
def setUp(self):
    """
    Setup new MongoDB database, subclasses of simplemongo
    documents and store some test data.
    """
    self.mongo = MongoClient()
    self.database = self.mongo['_simplemongo_testsuite']

    # Document subclasses are defined per-test-run so they bind to the
    # freshly created test database above.
    class TestDocument(simplemongo.Document):
        __database__ = self.database
        index_by = [
            {
                'fields': [
                    ('slug', pymongo.ASCENDING),
                ],
                'options': {
                    'unique': True,
                    'sparse': True,
                },
            }
        ]

    # Exercises explicit collection naming and slug configuration.
    class NamedCollectionDocument(simplemongo.Document):
        __database__ = self.database
        __collection__ = 'namedcol'
        slug_field = 'slugfield'
        slug_source_fields = 'title'

    # Exercises typed structure definitions and attribute-style access.
    class StructuredDocument(simplemongo.Document):
        __database__ = self.database
        __attribute_access__ = True
        structure = {
            '_id': unicode,
            'text_field': unicode,
            'int_field': int,
            'float_field': float,
            'bool_field': bool,
            'test_doc': TestDocument,
        }

    self.testdoc = TestDocument
    self.namedcoldoc = NamedCollectionDocument
    self.structureddoc = StructuredDocument

    # Seed fixture rows: slugs, manual string/int ids and a real ObjectId.
    testcol = self.database['TestDocument']
    testcol.insert({
        'slug': 'test-document-instance',
        'title': 'Test Document Instance',
    })
    testcol.insert({
        'slug': 'test-document-instance-2',
        'title': 'Test Document Instance 2',
    })
    testcol.insert({
        '_id': 'ManualId',
    })
    testcol.insert({
        '_id': 500,
    })
    self.testobjectid = ObjectId()
    testcol.insert({
        '_id': self.testobjectid,
    })
    namedcol = self.database['namedcol']
    namedcol.insert({
        'slugfield': 'named-slug-field',
    })
    structuredcol = self.database['StructuredDocument']
    structuredcol.insert({
        '_id': '1234',
        'text_field': u'A string',
        'int_field': 124,
        'float_field': 124.01,
        'bool_field': True,
    })
class SimpleMongoTestCase(unittest.TestCase):
    # Integration test suite for simplemongo.Document against a live local
    # MongoDB instance; a throwaway database is created per test and dropped
    # in tearDown.

    def setUp(self):
        """
        Setup new MongoDB database, subclasses of simplemongo
        documents and store some test data.
        """
        self.mongo = MongoClient()
        self.database = self.mongo['_simplemongo_testsuite']

        # Document subclasses are defined per-test-run so they bind to the
        # freshly created test database above.
        class TestDocument(simplemongo.Document):
            __database__ = self.database
            index_by = [
                {
                    'fields': [
                        ('slug', pymongo.ASCENDING),
                    ],
                    'options': {
                        'unique': True,
                        'sparse': True,
                    },
                }
            ]

        # Exercises explicit collection naming and slug configuration.
        class NamedCollectionDocument(simplemongo.Document):
            __database__ = self.database
            __collection__ = 'namedcol'
            slug_field = 'slugfield'
            slug_source_fields = 'title'

        # Exercises typed structure definitions and attribute-style access.
        class StructuredDocument(simplemongo.Document):
            __database__ = self.database
            __attribute_access__ = True
            structure = {
                '_id': unicode,
                'text_field': unicode,
                'int_field': int,
                'float_field': float,
                'bool_field': bool,
                'test_doc': TestDocument,
            }

        self.testdoc = TestDocument
        self.namedcoldoc = NamedCollectionDocument
        self.structureddoc = StructuredDocument

        # Seed fixture rows: slugs, manual string/int ids and a real ObjectId.
        testcol = self.database['TestDocument']
        testcol.insert({
            'slug': 'test-document-instance',
            'title': 'Test Document Instance',
        })
        testcol.insert({
            'slug': 'test-document-instance-2',
            'title': 'Test Document Instance 2',
        })
        testcol.insert({
            '_id': 'ManualId',
        })
        testcol.insert({
            '_id': 500,
        })
        self.testobjectid = ObjectId()
        testcol.insert({
            '_id': self.testobjectid,
        })
        namedcol = self.database['namedcol']
        namedcol.insert({
            'slugfield': 'named-slug-field',
        })
        structuredcol = self.database['StructuredDocument']
        structuredcol.insert({
            '_id': '1234',
            'text_field': u'A string',
            'int_field': 124,
            'float_field': 124.01,
            'bool_field': True,
        })

    def tearDown(self):
        """
        Drop the test database
        """
        self.mongo.drop_database(self.database)

    def test_collectionproxy(self):
        """
        can we access the pymongo collection via our metaclass setup?
        do accesses to methods not defined in a Document get passed up
        to the pymongo collection?
        """
        self.assertTrue(
            isinstance(self.testdoc.collection, pymongo.collection.Collection))
        self.assertEqual(
            self.testdoc.insert, self.database['TestDocument'].insert)

    def test_collectionnaming(self):
        """
        does automatic and manual collection naming work?
        """
        self.assertEqual('TestDocument', self.testdoc.collection.name)
        self.assertEqual('namedcol', self.namedcoldoc.collection.name)

    def test_find(self):
        """
        does a find return instances of our TestDocument?
        """
        for doc in self.testdoc.find():
            self.assertTrue(isinstance(doc, self.testdoc))

    def test_find_one(self):
        """
        can we find one of our test objects, is it a TestDocument?
        """
        doc = self.testdoc.find_one({'slug': 'test-document-instance'})
        self.assertTrue(isinstance(doc, self.testdoc))

    def test_create_indexes(self):
        """
        TODO: can we create indexes, idempotently?
        """
        pass

    def test_by_id(self):
        """
        can we find our test objects by int, string and objectid _id?
        """
        # ObjectId lookups must work from both the ObjectId and its string form.
        for _id in (
                'ManualId', 500, self.testobjectid, str(self.testobjectid)):
            doc = self.testdoc.by_id(_id)
            self.assertTrue(isinstance(doc, self.testdoc))
        # type coercion on defined structure test
        doc = self.structureddoc.by_id(1234)
        self.assertTrue(isinstance(doc, self.structureddoc))

    def test_by_slug(self):
        """
        can we retrieve a document by slug in any slugfield?
        """
        doc = self.testdoc.by_slug('test-document-instance')
        self.assertTrue(isinstance(doc, self.testdoc))
        doc = self.namedcoldoc.by_slug('named-slug-field')
        self.assertTrue(isinstance(doc, self.namedcoldoc))

    def test_collection_update(self):
        """
        can we update objects on the collection level?
        """
        self.testdoc.update(
            {'_id': self.testobjectid}, {'$set': {'updated': True}})
        doc = self.testdoc.by_id(self.testobjectid)
        self.assertTrue(doc.get('updated'))

    def test_slugify_function(self):
        """
        can we slugify correctly?
        """
        self.assertEqual(
            'simple-test-case', simplemongo.slugify('Simple Test Case'))
        # Non-ASCII characters are stripped rather than transliterated.
        self.assertEqual(
            'gtiru-tala-hgar', simplemongo.slugify(u'Gætirðu talað hægar'))
        self.assertEqual(
            'some-simplepunctuation',
            simplemongo.slugify(u'some simple:punctuation££!@$'))

    def test_no_database_set(self):
        """
        does subclassing a Document raise NoDatabaseError when __database__
        is not a MongoClient Database instance?
        is NoDatabaseError raised by class methods of a Document without
        a database?
        is NoDatabaseError raised by instantiating a new instance of a
        Document without a set database?
        """
        with self.assertRaises(simplemongo.NoDatabaseSet):
            class BrokenDb(simplemongo.Document):
                __database__ = 123
        with self.assertRaises(simplemongo.NoDatabaseSet):
            simplemongo.Document.find()
        with self.assertRaises(simplemongo.NoDatabaseSet):
            simplemongo.Document.find_one()
        with self.assertRaises(simplemongo.NoDatabaseSet):
            simplemongo.Document.create_indexes()
        with self.assertRaises(simplemongo.NoDatabaseSet):
            simplemongo.Document()

    def test_getattr_exception(self):
        """
        does a Document class raise AttributeError when an attrib isn't
        found and there's no MongoClient collection to reach into?
        """
        with self.assertRaises(AttributeError):
            simplemongo.Document.non_existant_attribute

    def test_index_creation(self):
        """
        does create_index work?
        """
        self.testdoc.create_indexes()
        self.assertIn('slug_1', self.testdoc.index_information())

    def test_save(self):
        """
        can we save a new record?
        """
        new_doc = self.testdoc()
        new_doc.save()
        self.assertTrue(isinstance(new_doc.get('_id'), ObjectId))

    def test_update_and_reload(self):
        """
        can we update? will it raise an exception for an unsaved doc?
        """
        new_doc = self.testdoc()
        with self.assertRaises(simplemongo.DocumentNotSaved):
            new_doc.update()
        with self.assertRaises(simplemongo.DocumentNotSaved):
            new_doc.reload()
        new_doc['state'] = 'initial'
        new_doc.save()
        fetch_doc = self.testdoc.by_id(new_doc['_id'])
        self.assertEqual(new_doc['state'], fetch_doc['state'])
        # reload_after=False leaves the in-memory copy stale until reload().
        new_doc.update({'$set': {'state': 'inplace'}}, reload_after=False)
        fetch_doc = self.testdoc.by_id(new_doc['_id'])
        self.assertNotEqual(fetch_doc['state'], new_doc['state'])
        new_doc.reload()
        self.assertEqual(fetch_doc['state'], new_doc['state'])
        new_doc.update({'$set': {'state': 'reloaded'}})
        self.assertEqual(new_doc['state'], 'reloaded')
        # Bare update() persists local modifications.
        new_doc['state'] = 'revised'
        new_doc.update()
        fetch_doc.reload()
        self.assertEqual(new_doc['state'], fetch_doc['state'])

    def test_dict_update(self):
        """
        does the original dict update method still work?
        """
        new_doc = self.testdoc()
        new_doc.dict_update({'dictupdated': True})
        new_doc.save()
        fetch_doc = self.testdoc.by_id(new_doc['_id'])
        self.assertTrue(fetch_doc.get('dictupdated'))

    def test_delete(self):
        """
        can we delete a doc? will it except if unsaved?
        """
        new_doc = self.testdoc()
        with self.assertRaises(simplemongo.DocumentNotSaved):
            new_doc.delete()
        new_doc.save()
        self.assertEqual(new_doc, self.testdoc.by_id(new_doc['_id']))
        new_doc.delete()
        self.assertIsNone(self.testdoc.by_id(new_doc['_id']))

    def test_id_property(self):
        """
        does id work as a property?
        """
        new_doc = self.testdoc()
        self.assertIsNone(new_doc.id)
        new_doc.save()
        self.assertTrue(isinstance(new_doc.id, ObjectId))

    def test_attribute_access(self):
        """
        check attribute access works as expected
        """
        # Attribute access is only enabled for structured documents.
        test_doc = self.testdoc()
        test_doc['test_field'] = True
        with self.assertRaises(AttributeError):
            test_doc.test_field
        struct_doc = self.structureddoc()
        with self.assertRaises(AttributeError):
            struct_doc.test_field
        struct_doc['test_field'] = True
        struct_doc['bool_field'] = True
        self.assertEqual(struct_doc['test_field'], struct_doc.test_field)
        self.assertEqual(struct_doc['bool_field'], struct_doc.bool_field)
        # A field typed as a Document dereferences the stored _id on access.
        test_doc.save()
        struct_doc['test_doc'] = test_doc['_id']
        struct_doc.save()
        self.assertEqual(struct_doc.test_doc, test_doc)
        test_doc_2 = self.testdoc()
        test_doc_2['another_test'] = True
        test_doc_2.save()
        struct_doc_2 = self.structureddoc()
        struct_doc_2['test_doc'] = test_doc_2['_id']
        struct_doc_2.save()
        self.assertEqual(struct_doc_2.test_doc, test_doc_2)

    def test_slugify_method(self):
        """
        can we slugify documents?
        """
        no_slug_field_doc = self.testdoc()
        slug_field_doc = self.namedcoldoc()
        self.assertEqual(
            no_slug_field_doc.slugify(from_text='Test Doc'), 'test-doc')
        # Repeated slugs get a numeric suffix to stay unique.
        self.assertEqual(
            no_slug_field_doc.slugify(from_text='Test Doc'), 'test-doc-2')
        no_slug_field_doc['title'] = 'Test Doc'
        self.assertEqual(
            no_slug_field_doc.slugify('title'), 'test-doc-3')
        # Without a configured slug field or source, slugify() must fail.
        with self.assertRaises(ValueError):
            no_slug_field_doc.slugify()
        slug_field_doc['title'] = 'My Test'
        self.assertEqual(
            slug_field_doc.slugify(), 'my-test')
        # Multiple source fields are joined before slugification.
        no_slug_field_doc['author'] = 'Me'
        self.assertEqual(
            no_slug_field_doc.slugify(['title', 'author']), 'test-doc-me')
def get_mongo():
    """Return the process-wide MongoClient, creating it lazily on first use.

    Uses an explicit ``is None`` check instead of ``if not _mongo``: PyMongo
    forbids truth-value testing on several of its objects (it raises
    NotImplementedError), and identity comparison is the correct singleton
    check anyway.  Assumes the module initializes ``_mongo = None`` --
    TODO confirm against the module top (not visible in this chunk).
    """
    global _mongo
    if _mongo is None:
        mongo_host = get_settings("mongo_host")
        _mongo = MongoClient(mongo_host)
    return _mongo