示例#1
0
 def __init__(self, database, collection, connector, client):
     """Keep the given collaborators and start with an empty result buffer.

     All four arguments are stored unchanged on the instance under their
     own names; nothing is validated or connected here.
     """
     self.database, self.collection = database, collection
     self.connector, self.client = connector, client
     # No rows collected yet, and a latch created with a count of zero.
     self.dataRows = []
     self.queryLatch = latch.latch(0)
示例#2
0
 def __init__(self, database, collection, connector, client):
     """Record the constructor arguments and initialise per-query state.

     ``database``, ``collection``, ``connector`` and ``client`` are saved
     as attributes of the same name without any checking.
     """
     self.database, self.collection = database, collection
     self.connector, self.client = connector, client
     # Per-query state: an empty row list and a latch built with count 0.
     self.dataRows = []
     self.queryLatch = latch.latch(0)
示例#3
0
    def crawl(self, query_url):
        """Query ``self.connector`` for ``query_url`` and block until the latch opens.

        ``self.dataRows`` is emptied and ``self.queryLatch`` is replaced by a
        fresh one-count latch before the query is handed to ``self.client``
        with ``self.callback`` as the message handler. Nothing is returned.

        NOTE(review): presumably ``self.callback`` fills ``self.dataRows``
        and counts the latch down -- it is not visible here, confirm.
        """
        self.dataRows = []
        self.queryLatch = latch.latch(1)
        request = {
            "connectorGuids": self.connector,
            "input": {"webpage/url": query_url},
        }
        self.client.query(request, self.callback)

        # ``await`` is a reserved word from Python 3.7 on, so the attribute
        # form ``latch.await()`` no longer parses there. Looking the method
        # up by name calls exactly the same function on every version.
        getattr(self.queryLatch, "await")()
示例#4
0
    def crawl(self, query_url):
        """Send one query for ``query_url`` and wait for it to complete.

        Clears ``self.dataRows``, installs a new latch with a count of 1 on
        ``self.queryLatch``, passes the query and ``self.callback`` to
        ``self.client.query`` and then blocks on the latch. Returns nothing.

        NOTE(review): the latch is presumably released by ``self.callback``,
        which is defined outside this block -- confirm.
        """
        self.dataRows = []
        self.queryLatch = latch.latch(1)
        payload = {
            "connectorGuids": self.connector,
            "input": {"webpage/url": query_url},
        }
        self.client.query(payload, self.callback)

        # ``await`` became a keyword in Python 3.7, which makes a direct
        # ``.await()`` call a syntax error; getattr reaches the same method
        # without tripping the parser and is unchanged on Python 2.
        getattr(self.queryLatch, "await")()
示例#5
0
    def handle(self, *args, **options):
        """Re-scrape every product of one shop and record any price change.

        ``options['shop_id']`` selects the shop. One query per product is
        sent through the import.io client and the method blocks until every
        query reports that it has finished; the client is then disconnected,
        also when an error interrupts the run.

        For a result message that carries a ``pageUrl`` and at least one
        result row, the product with that URL is loaded and, if either
        scraped price differs from the stored one, the product is updated
        and a new ``Price`` row is saved. Messages without ``pageUrl`` or
        without result rows are logged as errors.

        A ``done/total`` progress counter is written to stdout.
        """
        selected_shop = Shop.objects.get(pk=options['shop_id'])
        print(unicode(selected_shop))
        products = selected_shop.products.all()
        total = len(products)
        client = importio.importio(user_id=settings.IMPORTIO['guid'], api_key=settings.IMPORTIO['key'])
        client.connect()

        lock = latch.latch(total)

        stdout.write('%d/%d' % (0, total))
        stdout.flush()

        def callback(query, message):
            try:
                if message['type'] == 'MESSAGE':
                    data = message['data']
                    if 'pageUrl' not in data or not data.get('results'):
                        # Nothing usable came back for this page.
                        logger.error(message)
                    else:
                        _product = Product.objects.get(url=data['pageUrl'])
                        result = data['results'][0]
                        price = str_to_number(result['price'])
                        price2 = str_to_number(result['price2'])
                        if _product.price != price or _product.price2 != price2:
                            _new_price = Price(product=_product, price=price,
                                               price2=price2)
                            _product.price = price
                            _product.price2 = price2
                            _product.save()
                            _new_price.save()
            finally:
                # Always account for a finished query, even if processing the
                # message raised; otherwise the wait below never returns.
                if query.finished():
                    lock.countdown()
                    # '\r' moves back to column 0 so the counter overwrites itself.
                    stdout.write('\r%d/%d' % (total - lock.count, total))
                    stdout.flush()

        try:
            for product in products:
                client.query(create_query(selected_shop.crawler_id, product.url), callback)

            # ``await`` is a keyword from Python 3.7 on; getattr calls the
            # same latch method without a syntax error.
            getattr(lock, 'await')()
        finally:
            client.disconnect()
示例#6
0
def sync_queries(queries):
    """Run all ``queries`` through import.io and return the collected rows.

    A client is created from the ``IMPORTIO_USER_ID`` / ``IMPORTIO_API_KEY``
    environment variables and connected, every query is issued, and the call
    blocks until each query has reported that it is finished. The client is
    disconnected before returning, also if issuing or waiting fails.

    Returns a list holding the ``results`` entries of every non-error data
    message, in arrival order. Error and disconnect messages are only logged.
    """
    io = importio.importio(user_id=os.getenv('IMPORTIO_USER_ID'),
                           api_key=os.getenv('IMPORTIO_API_KEY'))
    io.connect()
    queryLatch = latch.latch(len(queries))
    dataRows = []

    # Queries are asynchronous: each message for any of them is delivered to
    # this callback, which stores the rows and tracks completion.
    def callback(query, message):
        log.debug("QueryLatch: %s" % queryLatch)
        try:
            # Sent when the client is disconnected while a query is running.
            if message["type"] == "DISCONNECT":
                log.error("Query in progress when library disconnected")
                log.error(json.dumps(message["data"], indent=4))

            if message["type"] == "MESSAGE":
                if "errorType" in message["data"]:
                    # A message arrived, but it is an error from the external service.
                    log.error("Got an error!")
                    log.error(json.dumps(message["data"], indent=4))
                else:
                    log.debug("Got data!")
                    log.debug(json.dumps(message["data"], indent=4))
                    dataRows.extend(message["data"]["results"])
                    log.debug(dataRows)
        finally:
            # Count a finished query down even when handling its message
            # raised, so the wait below cannot block forever.
            if query.finished():
                queryLatch.countdown()

    try:
        for q in queries:
            io.query(q, callback)
        # ``await`` is a keyword from Python 3.7 on; getattr calls the same
        # latch method without a syntax error.
        getattr(queryLatch, "await")()
    finally:
        # The client is local to this call, so nobody else can close it.
        io.disconnect()
    return dataRows
示例#7
0
def getNEXT(searchTerm, place, records):
    """Fetch one Google Maps result page via import.io and return the follow-up link.

    Which page is requested depends on the module-level ``lp``:

    * ``lp == 0``  -- a search URL is built from ``searchTerm``, the geocoded
      coordinates of ``place`` and the ``records`` start offset;
    * ``lp >= 10`` -- the module-level ``nextlink`` left by an earlier call
      is requested;
    * any other value -- no query is sent and the call does not wait.

    The ``my_column`` value of every returned row is assigned to the
    module-level ``nextlink`` in turn, so the last row wins. That global is
    the return value; when no rows came back it is returned unchanged (and
    must therefore already exist).
    """
    # Declared once, ahead of every use: the assignment at the end has to
    # rebind the module-level name, and repeating ``global`` after the name
    # was already used is a SyntaxWarning on Python 2 and an error on 3.
    global nextlink

    # NOTE(review): the account id and API key are hard-coded; they should be
    # moved to configuration or the environment and the key rotated.
    client = importio.importio(user_id="132bbe63-5552-41a2-ab3c-440ca93b8fa9", api_key="Ge28+Cy7Kxs8Z9gatZgj5BZv9MF8JwCpRxB97O1fwUgbv7kYXdgUQuE00fW4tTOi6HwEfPVlR2zAvfLdsI3QMQ==", host="https://query.import.io")

    # Once the client is authenticated it has to be connected to the server.
    client.connect()

    try:
        # Queries are asynchronous; the latch lets us block until the single
        # query issued below has finished.
        queryLatch = latch.latch(1)

        # Rows delivered to the callback are collected here.
        dataRows2 = []
        g = geocoder.google(place)

        def callback(query, message):
            try:
                # Sent when the client is disconnected while a query is running.
                if message["type"] == "DISCONNECT":
                    print("Query in progress when library disconnected")
                    print(json.dumps(message["data"], indent=4))

                if message["type"] == "MESSAGE":
                    if "errorType" in message["data"]:
                        # A message arrived, but it is an error from the external service.
                        print("erreur link")
                    else:
                        dataRows2.extend(message["data"]["results"])
            finally:
                # Release the latch for a finished query even if handling the
                # message raised, so the wait below cannot block forever.
                if query.finished():
                    queryLatch.countdown()

        if lp == 0:
            page_url = ("https://maps.google.fr/maps?sll=" + str(g.lat) + "," + str(g.lng)
                        + "&q=" + searchTerm
                        + "&ie=UTF8&hl=fr&sspn=0.000000,0.000000&dg=brw&sa=N&start=" + str(records)
                        + "&output=classic&dg=brw")
        elif lp >= 10:
            print(nextlink + str(lp))
            page_url = str(nextlink) + "&output=classic"
        else:
            page_url = None

        if page_url is not None:
            client.query({
                "connectorGuids": [
                    "1f59482a-3c8e-479d-985e-daafe92e71a3"
                ],
                "input": {
                    "webpage/url": page_url
                }
            }, callback)

            # Only wait when a query was actually sent: with nothing in
            # flight the latch would never be released. ``await`` is a
            # keyword from Python 3.7 on, hence the getattr call.
            getattr(queryLatch, "await")()
    finally:
        # Disconnecting lets client and server clean up their resources.
        client.disconnect()

    # Round-trip through JSON as before. ``encoding="utf8"`` was dropped
    # because it is already the Python 2 default and is rejected by
    # Python 3; ``indent`` had no effect on the parsed result.
    decoded = json.loads(json.dumps(dataRows2, ensure_ascii=False))

    for row in decoded:
        nextlink = row['my_column']
    return nextlink
示例#8
0
    def sync(self, company):
        """Scrape ``company``'s listings and reconcile its addresses with them.

        Two rounds of import.io queries are run: the first against
        ``company.url`` to collect each row's ``details_link``, the second
        against every such link. The client is connected for the duration
        and disconnected afterwards, also when a query round fails.

        Each resulting row is matched to the company's addresses by its
        ``address`` value (``published_address``). A match is updated and
        marked active; otherwise the address is geocoded and a new
        ``Address`` is created. Finally, addresses the company had before
        the run that no row matched are marked inactive. If no rows came
        back at all, nothing is deactivated.

        NOTE(review): ``self._callback`` is presumably what appends rows to
        ``self.dataRows`` and counts the latch down -- it is not visible
        here, confirm.
        """
        self.client.connect()
        try:
            print("Property List = " + company.url)
            # Start from an empty buffer so rows kept from an earlier run
            # cannot leak into the link list built below.
            self.dataRows = []
            self.queryLatch = latch.latch()

            self.client.query({
                "connectorGuids": [
                    "903de60e-6edc-49a3-aa6e-671cdb0d8ac5"
                ],
                "input": {
                    "webpage/url": company.url,
                }
            }, self._callback)

            # ``await`` is a keyword from Python 3.7 on; getattr calls the
            # same latch method without a syntax error.
            getattr(self.queryLatch, "await")()

            links = [row['details_link'] for row in self.dataRows]

            self.dataRows = []
            self.queryLatch = latch.latch(len(links))
            for link in links:
                self.client.query({
                    "connectorGuids": [
                        "897bdd91-24c0-409d-9e29-dffee6f1d64c"
                    ],
                    "input": {
                        "webpage/url": link,
                    }
                }, self._callback)

            print("Waiting...\n")
            getattr(self.queryLatch, "await")()
            print("Finished!\n")
        finally:
            self.client.disconnect()

        properties = self.dataRows

        print("JSON = " + str(properties))
        # Snapshot taken before any new address is created; a real list so
        # it can be walked safely on Python 3 as well.
        existing_property_ids = [addr.pk for addr in company.address_set.all()]
        active_property_ids = set()
        for listing in properties:
            published_address = listing['address']
            matches = company.address_set.filter(published_address=published_address)
            if matches.exists():
                record = matches.get()
            else:
                # Create a new property
                address, (_latitude, _longitude) = self.geolocator.geocode(published_address)
                record = Address(
                    num=int(address['num']),
                    unit=address['unit'],
                    city=address['city'],
                    state=address['state'],
                    country=address['country'],
                    postal_code=int(address['postal_code']),
                    formatted=address['formatted'],
                    published_address=published_address,
                    active=True,
                    company=company,
                )

            self._copy_listing_fields(record, listing)
            record.active = True
            record.save()
            active_property_ids.add(record.pk)

        # See what was removed. This runs once, after every row has been
        # processed; inside the loop it rewrote rows on each iteration and
        # briefly deactivated addresses whose row had not been reached yet.
        # The guard keeps the previous outcome for an empty scrape.
        if properties:
            for pk in existing_property_ids:
                if pk not in active_property_ids:
                    inactive_property = Address.objects.get(pk=pk)
                    inactive_property.active = False
                    inactive_property.save()

    @staticmethod
    def _copy_listing_fields(record, listing):
        """Copy the optional scraped fields present in ``listing`` onto ``record``."""
        if 'bed' in listing:
            record.bedrooms = int(listing['bed'])

        if 'bath' in listing:
            record.baths = float(listing['bath'])

        if 'rent' in listing:
            record.price = int(listing['rent'])

        if 'available_on' in listing:
            # NOTE: the scraped date is not parsed; the current time is
            # stored whenever the field is present.
            record.date_available = datetime.now()

        if 'description' in listing:
            record.description = listing['description']

        if 'sqft' in listing:
            # Drop thousands separators before converting.
            record.sqft = int(re.sub(',', '', listing['sqft']))