def __init__(self, database, collection, connector, client):
    self.database = database
    self.collection = collection
    self.connector = connector
    self.client = client
    self.dataRows = []
    self.queryLatch = latch.latch(0)
def crawl(self, query_url):
    self.dataRows = []
    self.queryLatch = latch.latch(1)
    self.client.query(
        {"connectorGuids": self.connector, "input": {"webpage/url": query_url}},
        self.callback)
    self.queryLatch.await()
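# The __init__/crawl fragments above reference a self.callback method and rely on
# self.dataRows being filled while the latch is held, but that callback is not shown.
# A minimal sketch of what it presumably looks like, following the same pattern as the
# standalone examples below (the method name and error handling here are assumptions,
# not the original code):
def callback(self, query, message):
    # Collect result rows from successful messages
    if message["type"] == "MESSAGE" and "errorType" not in message["data"]:
        self.dataRows.extend(message["data"]["results"])
    # Release the latch once the query has finished so crawl() can return
    if query.finished():
        self.queryLatch.countdown()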
def handle(self, *args, **options):
    # Re-crawl every product of the selected shop via import.io and record price changes
    selected_shop = Shop.objects.get(pk=options['shop_id'])
    print unicode(selected_shop)
    products = selected_shop.products.all()

    client = importio.importio(user_id=settings.IMPORTIO['guid'], api_key=settings.IMPORTIO['key'])
    client.connect()

    lock = latch.latch(len(products))
    print '%d/%d' % (0, len(products)),
    stdout.flush()

    def callback(query, message):
        if message['type'] == 'MESSAGE':
            if 'pageUrl' in message['data']:
                _url = message['data']['pageUrl']
                _product = Product.objects.get(url=_url)
                result = message['data']['results'][0]
                price = str_to_number(result['price'])
                price2 = str_to_number(result['price2'])
                # Only store a new Price row when either price actually changed
                if _product.price != price or _product.price2 != price2:
                    _new_price = Price(product=_product, price=price, price2=price2)
                    _product.price = price
                    _product.price2 = price2
                    _product.save()
                    _new_price.save()
            else:
                logger.error(message)
        if query.finished():
            lock.countdown()
            print '\r%d/%d' % (len(products) - lock.count, len(products)),
            stdout.flush()

    for product in products:
        client.query(create_query(selected_shop.crawler_id, product.url), callback)
    lock.await()
    client.disconnect()
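# The management command above calls a create_query() helper that is not shown here.
# Given the connector/input dictionary shape used throughout these snippets, it
# presumably just builds that query dictionary; a hedged sketch (the original helper
# may differ in detail):
def create_query(crawler_id, url):
    # crawler_id is the connector GUID configured for the shop; url is the product page
    return {
        "connectorGuids": [crawler_id],
        "input": {"webpage/url": url},
    }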
def sync_queries(queries):
    io = importio.importio(user_id=os.getenv('IMPORTIO_USER_ID'), api_key=os.getenv('IMPORTIO_API_KEY'))
    io.connect()

    queryLatch = latch.latch(len(queries))
    dataRows = []

    # In order to receive the data from the queries we issue, we need to define a callback method
    # This method will receive each message that comes back from the queries, and we can take that
    # data and store it for use in our app
    def callback(query, message):
        log.debug("QueryLatch: %s" % queryLatch)

        # Disconnect messages happen if we disconnect the client library while a query is in progress
        if message["type"] == "DISCONNECT":
            log.error("Query in progress when library disconnected")
            log.error(json.dumps(message["data"], indent=4))

        # Check the message we receive actually has some data in it
        if message["type"] == "MESSAGE":
            if "errorType" in message["data"]:
                # In this case, we received a message, but it was an error from the external service
                log.error("Got an error!")
                log.error(json.dumps(message["data"], indent=4))
            else:
                # We got a message and it was not an error, so we can process the data
                log.debug("Got data!")
                log.debug(json.dumps(message["data"], indent=4))
                dataRows.extend(message["data"]["results"])
                log.debug(dataRows)

        # When the query is finished, countdown the latch so the program can continue when everything is done
        if query.finished():
            queryLatch.countdown()

    for q in queries:
        io.query(q, callback)

    queryLatch.await()
    return dataRows
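# Example use of sync_queries(): each query is the same connector/input dictionary the
# import.io client expects elsewhere in these snippets. The GUID and URLs below are
# placeholders for illustration, not real connectors:
queries = [
    {"connectorGuids": ["00000000-0000-0000-0000-000000000000"],
     "input": {"webpage/url": url}}
    for url in ("http://example.com/page1", "http://example.com/page2")
]
rows = sync_queries(queries)
log.info("Fetched %d rows", len(rows))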
def getNEXT(searchTerm, place, records):
    global nextlink

    # To use an API key for authentication, use the following code:
    client = importio.importio(user_id="132bbe63-5552-41a2-ab3c-440ca93b8fa9",
                               api_key="Ge28+Cy7Kxs8Z9gatZgj5BZv9MF8JwCpRxB97O1fwUgbv7kYXdgUQuE00fW4tTOi6HwEfPVlR2zAvfLdsI3QMQ==",
                               host="https://query.import.io")

    # Once we have started the client and authenticated, we need to connect it to the server:
    client.connect()

    # Because import.io queries are asynchronous, for this simple script we will use a "latch"
    # to stop the script from exiting before all of our queries are returned
    # For more information on the latch class, see the latch.py file included in this client library
    queryLatch = latch.latch(1)

    # Define a variable that we can put all our results into when they come back from
    # the server, so we can use the data later on in the script
    dataRows2 = []

    g = geocoder.google(place)

    # In order to receive the data from the queries we issue, we need to define a callback method
    # This method will receive each message that comes back from the queries, and we can take that
    # data and store it for use in our app
    def callback(query, message):
        # Disconnect messages happen if we disconnect the client library while a query is in progress
        if message["type"] == "DISCONNECT":
            print "Query in progress when library disconnected"
            print json.dumps(message["data"], indent=4)

        # Check the message we receive actually has some data in it
        if message["type"] == "MESSAGE":
            if "errorType" in message["data"]:
                # In this case, we received a message, but it was an error from the external service
                print "link error"
            else:
                # We got a message and it was not an error, so we can process the data
                dataRows2.extend(message["data"]["results"])

        # When the query is finished, countdown the latch so the program can continue when everything is done
        if query.finished():
            queryLatch.countdown()

    # Issue queries to your data sources and with your inputs
    # You can modify the inputs and connectorGuids so as to query your own sources
    # Query for the Magic API tile: the first page is built from the geocoded place,
    # subsequent pages reuse the "next" link extracted from the previous call
    if lp == 0:
        client.query({
            "connectorGuids": ["1f59482a-3c8e-479d-985e-daafe92e71a3"],
            "input": {
                "webpage/url": "https://maps.google.fr/maps?sll=" + str(g.lat) + "," + str(g.lng) +
                               "&q=" + searchTerm + "&ie=UTF8&hl=fr&sspn=0.000000,0.000000&dg=brw&sa=N" +
                               "&start=" + str(records) + "&output=classic&dg=brw"
            }
        }, callback)
    if lp >= 10:
        print nextlink + str(lp)
        client.query({
            "connectorGuids": ["1f59482a-3c8e-479d-985e-daafe92e71a3"],
            "input": {
                "webpage/url": str(nextlink) + "&output=classic"
            }
        }, callback)

    # Now we have issued all of the queries, we can "await" on the latch so that we know when it is all done
    queryLatch.await()

    # It is best practice to disconnect when you are finished sending queries and getting data - it allows us to
    # clean up resources on the client and the server
    client.disconnect()

    # Extract the "next page" link from the data we got back
    jdata = json.dumps(dataRows2, indent=4, ensure_ascii=False, encoding="utf8")
    decoded = json.loads(jdata)
    for vc in decoded:
        nextlink = vc['my_column']
    return nextlink
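# Several of these snippets point at the latch.py file shipped with the import.io client
# library. The real file may differ in detail, but the behaviour relied on above (a count,
# countdown(), and a blocking await()) is that of a standard countdown latch; a minimal
# Python 2 sketch of such a latch (note that "await" became a reserved word in Python 3.7,
# so this method name only works on the Python 2 era client):
import threading

class latch(object):
    def __init__(self, count=1):
        self.count = count
        self.condition = threading.Condition()

    def countdown(self):
        # Decrement the counter and wake up any waiters once it reaches zero
        with self.condition:
            self.count -= 1
            if self.count <= 0:
                self.condition.notify_all()

    def await(self):
        # Block the calling thread until countdown() has been called 'count' times
        with self.condition:
            while self.count > 0:
                self.condition.wait()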
def sync(self, company):
    self.client.connect()
    print "Property List = " + company.url

    # The first query fetches the listing page; only one query is in flight, so the
    # latch is initialised with a count of 1 (the original passed no count at all)
    self.queryLatch = latch.latch(1)
    self.client.query({
        "connectorGuids": ["903de60e-6edc-49a3-aa6e-671cdb0d8ac5"],
        "input": {"webpage/url": company.url},
    }, self._callback)
    self.queryLatch.await()

    # One detail query per listing link found on the listing page
    links = [x['details_link'] for x in self.dataRows]
    self.dataRows = []
    self.queryLatch = latch.latch(len(links))
    for link in links:
        self.client.query({
            "connectorGuids": ["897bdd91-24c0-409d-9e29-dffee6f1d64c"],
            "input": {"webpage/url": link},
        }, self._callback)
    print "Waiting...\n"
    self.queryLatch.await()
    print "Finished!\n"
    self.client.disconnect()

    properties = self.dataRows
    print "JSON = " + str(properties)

    active_property_ids = []
    existing_property_ids = map(lambda x: x.pk, company.address_set.all())
    for property in properties:
        published_address = property['address']
        if company.address_set.filter(published_address=published_address).exists():
            # Update the existing property record
            existing_property = company.address_set.get(published_address=published_address)
            if 'bed' in property:
                existing_property.bedrooms = int(property['bed'])
            if 'bath' in property:
                existing_property.baths = float(property['bath'])
            if 'rent' in property:
                existing_property.price = int(property['rent'])
            if 'available_on' in property:
                #existing_property.date_available = datetime.strptime(property['available_on'], "%m/%d/%y").date()
                existing_property.date_available = datetime.now()
            if 'description' in property:
                existing_property.description = property['description']
            if 'sqft' in property:
                existing_property.sqft = int(re.sub(',', '', property['sqft']))
            existing_property.active = True
            existing_property.save()
            active_property_ids.append(existing_property.pk)
        else:
            # Create a new property
            address, (latitude, longitude) = self.geolocator.geocode(published_address)
            new_property = Address(
                num=int(address['num']),
                unit=address['unit'],
                city=address['city'],
                state=address['state'],
                country=address['country'],
                postal_code=int(address['postal_code']),
                formatted=address['formatted'],
                published_address=published_address,
                active=True,
                company=company,
            )
            if 'bed' in property:
                new_property.bedrooms = int(property['bed'])
            if 'bath' in property:
                new_property.baths = float(property['bath'])
            if 'rent' in property:
                new_property.price = int(property['rent'])
            if 'available_on' in property:
                #new_property.date_available = datetime.strptime(property['available_on'], "%m/%d/%y").date()
                new_property.date_available = datetime.now()
            if 'description' in property:
                new_property.description = property['description']
            if 'sqft' in property:
                new_property.sqft = int(re.sub(',', '', property['sqft']))
            new_property.save()
            active_property_ids.append(new_property.pk)

    # Mark any property that was not seen in this sync as inactive
    for id in existing_property_ids:
        if id not in active_property_ids:
            inactive_property = Address.objects.get(pk=id)
            inactive_property.active = False
            inactive_property.save()
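# Design note on the inactive-property sweep at the end of sync(): the per-row loop issues
# one fetch and one save per stale address. With Django's queryset API the same effect can
# be achieved in a single UPDATE; a hedged alternative (assuming nothing depends on
# per-instance save() signals firing for each address):
company.address_set.exclude(pk__in=active_property_ids).update(active=False)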