def run(businesses_file, filtered_urls_file, collection):
    db = MongoClient(Settings.CONNECTION_STRING)[Settings.DATABASE_NAME]
    client = importio.importio(
        user_id=Settings.IMPORTIO_USER_ID,
        api_key=Settings.IMPORTIO_API_KEY,
        host=Settings.IMPORTIO_HOST)
    client.connect()

    crawler_active = Crawler(db, collection, Settings.IMPORTIO_CRAWLER_ACTIVE, client)
    crawler_filtered = Crawler(db, collection, Settings.IMPORTIO_CRAWLER_FILTERED, client)
    crawler_friends = Crawler(db, collection, Settings.IMPORTIO_CRAWLER_FRIENDS, client)

    businesses = Crawler.generate_urls(businesses_file, filtered_urls_file)

    done = 0
    total = len(businesses)
    print 'Done {0}/{1}'.format(done, total)

    for business in businesses:
        if not Crawler.exists(business.name, db, collection):
            for url in business.activeUrls:
                crawler_active.crawl(url)
            crawler_active.save(business, is_not_recommended=False)

            for url in business.filteredUrls:
                crawler_filtered.crawl(url)
            crawler_filtered.save(business, is_not_recommended=True)

            crawler_friends.update_user_friends(business)

            # client.disconnect()
            # client.connect()
            # print 'Client reconnected'

        done += 1
        print 'Done {0}/{1}'.format(done, total)

    return True
def handle(self, *args, **options):
    selected_shop = Shop.objects.get(pk=options['shop_id'])
    print unicode(selected_shop)
    products = selected_shop.products.all()

    client = importio.importio(user_id=settings.IMPORTIO['guid'],
                               api_key=settings.IMPORTIO['key'])
    client.connect()

    # One latch count per product; the callback counts down as each query finishes.
    lock = latch.latch(len(products))

    print '%d/%d' % (0, len(products)),
    stdout.flush()

    def callback(query, message):
        if message['type'] == 'MESSAGE':
            if 'pageUrl' in message['data']:
                _url = message['data']['pageUrl']
                _product = Product.objects.get(url=_url)
                result = message['data']['results'][0]
                price = str_to_number(result['price'])
                price2 = str_to_number(result['price2'])
                # Only record a new Price entry when either price has changed.
                if _product.price != price or _product.price2 != price2:
                    _new_price = Price(product=_product, price=price, price2=price2)
                    _product.price = price
                    _product.price2 = price2
                    _product.save()
                    _new_price.save()
            else:
                logger.error(message)
        if query.finished():
            lock.countdown()
            print '\r%d/%d' % (len(products) - lock.count, len(products)),
            stdout.flush()

    for product in products:
        client.query(create_query(selected_shop.crawler_id, product.url), callback)

    lock.await()
    client.disconnect()
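# Hedged sketch (not part of the original source): handle() above relies on two project-local
# helpers, create_query() and str_to_number(), whose real implementations are not shown. The
# placeholder versions below illustrate the assumed behaviour: building an import.io query dict
# for a connector GUID and coercing a scraped price string to a float.
import re

def create_query(crawler_id, url):
    # An import.io query is a dict naming the connector GUID(s) and the inputs to pass.
    return {
        "connectorGuids": [crawler_id],
        "input": {"webpage/url": url},
    }

def str_to_number(value):
    # Keep digits and the decimal point only; a real implementation may need
    # locale-aware handling of thousands separators and currency symbols.
    cleaned = re.sub(r'[^0-9.]', '', unicode(value))
    return float(cleaned) if cleaned else None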
def sync_queries(queries):
    io = importio.importio(user_id=os.getenv('IMPORTIO_USER_ID'),
                           api_key=os.getenv('IMPORTIO_API_KEY'))
    io.connect()

    queryLatch = latch.latch(len(queries))
    dataRows = []

    # In order to receive the data from the queries we issue, we need to define a callback method.
    # This method will receive each message that comes back from the queries, and we can take that
    # data and store it for use in our app.
    def callback(query, message):
        log.debug("QueryLatch: %s" % queryLatch)

        # Disconnect messages happen if we disconnect the client library while a query is in progress
        if message["type"] == "DISCONNECT":
            log.error("Query in progress when library disconnected")
            log.error(json.dumps(message["data"], indent=4))

        # Check the message we receive actually has some data in it
        if message["type"] == "MESSAGE":
            if "errorType" in message["data"]:
                # In this case, we received a message, but it was an error from the external service
                log.error("Got an error!")
                log.error(json.dumps(message["data"], indent=4))
            else:
                # We got a message and it was not an error, so we can process the data
                log.debug("Got data!")
                log.debug(json.dumps(message["data"], indent=4))
                dataRows.extend(message["data"]["results"])
                log.debug(dataRows)

        # When the query is finished, count down the latch so the program can continue when everything is done
        if query.finished():
            queryLatch.countdown()

    for q in queries:
        io.query(q, callback)

    # Block until every query has reported back, then return the collected rows.
    queryLatch.await()
    return dataRows
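# Hedged usage sketch (assumed, not from the original source): sync_queries() expects a list of
# import.io query dicts, each naming a connector GUID and its inputs, and returns the combined
# result rows. The connector GUID and URLs below are placeholders.
queries = [
    {
        "connectorGuids": ["00000000-0000-0000-0000-000000000000"],  # placeholder connector GUID
        "input": {"webpage/url": url},
    }
    for url in ("http://example.com/page/1", "http://example.com/page/2")
]
rows = sync_queries(queries)
log.debug("Fetched %d rows", len(rows))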
def getNEXT(searchTerm, place, records):
    # To use an API key for authentication, use the following code:
    client = importio.importio(user_id="132bbe63-5552-41a2-ab3c-440ca93b8fa9",
                               api_key="Ge28+Cy7Kxs8Z9gatZgj5BZv9MF8JwCpRxB97O1fwUgbv7kYXdgUQuE00fW4tTOi6HwEfPVlR2zAvfLdsI3QMQ==",
                               host="https://query.import.io")

    # Once we have started the client and authenticated, we need to connect it to the server:
    client.connect()

    # Because import.io queries are asynchronous, for this simple script we use a "latch"
    # to stop the script from exiting before all of our queries are returned.
    # For more information on the latch class, see the latch.py file included in this client library.
    queryLatch = latch.latch(1)

    # Collect all the results that come back from the server so we can use the data later on.
    dataRows2 = []

    g = geocoder.google(place)

    # `lp` and `nextlink` are module-level globals assumed to exist: `lp` tracks the pagination
    # offset and `nextlink` holds the "next page" URL extracted from the previous response.
    global nextlink

    # In order to receive the data from the queries we issue, we need to define a callback method.
    # This method will receive each message that comes back from the queries, and we can take that
    # data and store it for use in our app.
    def callback(query, message):
        # Disconnect messages happen if we disconnect the client library while a query is in progress
        if message["type"] == "DISCONNECT":
            print "Query in progress when library disconnected"
            print json.dumps(message["data"], indent=4)

        # Check the message we receive actually has some data in it
        if message["type"] == "MESSAGE":
            if "errorType" in message["data"]:
                # In this case, we received a message, but it was an error from the external service
                print "link error"
            else:
                # We got a message and it was not an error, so we can process the data
                dataRows2.extend(message["data"]["results"])

        # When the query is finished, count down the latch so the program can continue when everything is done
        if query.finished():
            queryLatch.countdown()

    # Issue queries to your data sources and with your inputs.
    # You can modify the inputs and connectorGuids so as to query your own sources.
    # Query the Magic API tile: the first page is built from the geocoded place; subsequent
    # pages (lp >= 10) reuse the "next" link returned by the previous page.
    if lp == 0:
        client.query({
            "connectorGuids": ["1f59482a-3c8e-479d-985e-daafe92e71a3"],
            "input": {
                "webpage/url": "https://maps.google.fr/maps?sll=" + str(g.lat) + "," + str(g.lng) +
                               "&q=" + searchTerm + "&ie=UTF8&hl=fr&sspn=0.000000,0.000000&dg=brw&sa=N" +
                               "&start=" + str(records) + "&output=classic&dg=brw"
            }
        }, callback)
    if lp >= 10:
        print nextlink + str(lp)
        client.query({
            "connectorGuids": ["1f59482a-3c8e-479d-985e-daafe92e71a3"],
            "input": {
                "webpage/url": str(nextlink) + "&output=classic"
            }
        }, callback)

    # Now we have issued all of the queries, we can "await" on the latch so that we know when it is all done
    queryLatch.await()

    # It is best practice to disconnect when you are finished sending queries and getting data - it allows us
    # to clean up resources on the client and the server
    client.disconnect()

    # Pull the "next page" link out of the returned rows.
    jdata = json.dumps(dataRows2, indent=4, ensure_ascii=False, encoding="utf8")
    decoded = json.loads(jdata)
    for vc in decoded:
        nextlink = vc['my_column']
    return nextlink
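# Hedged usage sketch (assumed, not from the original source): getNEXT() is driven by the
# module-level globals `lp` (pagination offset) and `nextlink` (URL of the next results page).
# A calling loop might advance the offset by 10 per Google Maps results page and stop after a
# fixed number of pages; the search term, place, and page limit below are placeholders.
lp = 0
nextlink = ""
for page in range(5):  # placeholder page limit
    nextlink = getNEXT("restaurant", "Paris, France", lp)
    lp += 10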
def __init__(self):
    self.dataRows = []
    self.client = importio.importio(user_id="b12f3a23-b267-45a4-99f2-b8a0d2e9b491",
                                    api_key="YqcifZoCcEPdmXwAL8g855gQ2ZSmtGZwiwaBpj71TMNKsvAXhpvhLiz9mpy5DlC7KIZX62sC+TnaSxhNLfJNXg==",
                                    host="https://query.import.io")
    self.geolocator = GoogleV3Custom()
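# Hedged sketch (assumed, not from the original source): the constructor above only wires up the
# import.io client and a geocoder; a companion method would typically connect, issue a query, and
# collect results into self.dataRows using the same latch/callback pattern seen elsewhere in this
# file. The method name and connector GUID below are placeholders.
def fetch(self, url):
    self.client.connect()
    done = latch.latch(1)

    def callback(query, message):
        # Accumulate successful result rows; ignore error messages for this sketch.
        if message["type"] == "MESSAGE" and "errorType" not in message["data"]:
            self.dataRows.extend(message["data"]["results"])
        if query.finished():
            done.countdown()

    self.client.query({
        "connectorGuids": ["00000000-0000-0000-0000-000000000000"],  # placeholder connector GUID
        "input": {"webpage/url": url},
    }, callback)

    done.await()
    self.client.disconnect()
    return self.dataRows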