Example #1
    def run(businesses_file, filtered_urls_file, collection):

        db = MongoClient(Settings.CONNECTION_STRING)[Settings.DATABASE_NAME]
        client = importio.importio(
            user_id=Settings.IMPORTIO_USER_ID,
            api_key=Settings.IMPORTIO_API_KEY,
            host=Settings.IMPORTIO_HOST)
        client.connect()

        crawler_active = Crawler(db, collection, Settings.IMPORTIO_CRAWLER_ACTIVE, client)
        crawler_filtered = Crawler(db, collection, Settings.IMPORTIO_CRAWLER_FILTERED, client)
        crawler_friends = Crawler(db, collection, Settings.IMPORTIO_CRAWLER_FRIENDS, client)

        businesses = Crawler.generate_urls(businesses_file, filtered_urls_file)
        done = 0
        total = len(businesses)
        print 'Done {0}/{1}'.format(done, total)

        for business in businesses:
            if not Crawler.exists(business.name, db, collection):
                for url in business.activeUrls:
                    crawler_active.crawl(url)
                    crawler_active.save(business, is_not_recommended=False)
                for url in business.filteredUrls:
                    crawler_filtered.crawl(url)
                    crawler_filtered.save(business, is_not_recommended=True)
                crawler_friends.update_user_friends(business)
            # client.disconnect()
            # client.connect()
            # print 'Client reconnected'

            done += 1
            print 'Done {0}/{1}'.format(done, total)

        return True
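For context, a call to this entry point might look like the sketch below; the file paths, the collection name and the enclosing class name (YelpCrawler) are illustrative placeholders, not part of the original code.

# Hypothetical invocation of run(); the paths, the collection name and the
# class name YelpCrawler are placeholders for whatever the real project uses.
if __name__ == '__main__':
    YelpCrawler.run('businesses.json', 'filtered_urls.json', 'businesses')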
Example #2
    def handle(self, *args, **options):
        selected_shop = Shop.objects.get(pk=options['shop_id'])
        print unicode(selected_shop)
        products = selected_shop.products.all()
        client = importio.importio(user_id=settings.IMPORTIO['guid'], api_key=settings.IMPORTIO['key'])
        client.connect()

        lock = latch.latch(len(products))

        print '%d/%d' % (0, len(products)),
        stdout.flush()

        def callback(query, message):
            if message['type'] == 'MESSAGE':
                if 'pageUrl' in message['data']:
                    _url = message['data']['pageUrl']
                    _product = Product.objects.get(url=_url)
                    result = message['data']['results'][0]
                    price = str_to_number(result['price'])
                    price2 = str_to_number(result['price2'])
                    if _product.price != price or _product.price2 != price2:
                        _new_price = Price(product=_product, price=price,
                                           price2=price2)
                        _product.price = price
                        _product.price2 = price2
                        _product.save()
                        _new_price.save()
                else:
                    logger.error(message)
            if query.finished():
                lock.countdown()
                print '\r%d/%d' % (len(products) - lock.count, len(products)),
                stdout.flush()

        for product in products:
            client.query(create_query(selected_shop.crawler_id, product.url), callback)

        lock.await()
        client.disconnect()
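The command above relies on two helpers, create_query and str_to_number, that are not shown in the snippet. A minimal sketch of what they might look like, assuming the query shape used in the other examples here (a connectorGuids list plus a webpage/url input) and prices stored as plain strings with currency symbols:

import re

def create_query(crawler_id, url):
    # Sketch: build an import.io query dict for one crawler GUID and one page URL,
    # matching the query shape used elsewhere in these examples.
    return {
        "connectorGuids": [crawler_id],
        "input": {"webpage/url": url},
    }

def str_to_number(value):
    # Sketch: strip everything except digits, the minus sign and the decimal
    # point (e.g. "$1,299.00" -> 1299.0), returning None for empty strings.
    cleaned = re.sub(r'[^\d.\-]', '', value or '')
    return float(cleaned) if cleaned else None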
Example #3
def sync_queries(queries):
    io = importio.importio(user_id=os.getenv('IMPORTIO_USER_ID'),
                           api_key=os.getenv('IMPORTIO_API_KEY'))
    io.connect()
    queryLatch = latch.latch(len(queries))
    dataRows = []

    # In order to receive the data from the queries we issue, we need to define a callback method
    # This method will receive each message that comes back from the queries, and we can take that
    # data and store it for use in our app
    def callback(query, message):
        log.debug("QueryLatch: %s" % queryLatch)

        # Disconnect messages happen if we disconnect the client library while a query is in progress
        if message["type"] == "DISCONNECT":
            log.error("Query in progress when library disconnected")
            log.error(json.dumps(message["data"], indent=4))

        # Check the message we receive actually has some data in it
        if message["type"] == "MESSAGE":
            if "errorType" in message["data"]:
                # In this case, we received a message, but it was an error from the external service
                log.error("Got an error!")
                log.error(json.dumps(message["data"], indent=4))
            else:
                # We got a message and it was not an error, so we can process the data
                log.debug("Got data!")
                log.debug(json.dumps(message["data"], indent=4))
                dataRows.extend(message["data"]["results"])
                log.debug(dataRows)

        # When the query is finished, countdown the latch so the program can continue when everything is done
        if query.finished(): queryLatch.countdown()

    for q in queries:
        io.query(q, callback)
    queryLatch.await()
    return dataRows
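As a usage sketch, sync_queries can be driven with the same query shape the other examples use; the connector GUID and target URL below are placeholders, and IMPORTIO_USER_ID / IMPORTIO_API_KEY must be set in the environment.

# Usage sketch only: the connector GUID and target URL are placeholders.
queries = [{
    "connectorGuids": ["00000000-0000-0000-0000-000000000000"],
    "input": {"webpage/url": "http://example.com/page/1"},
}]
rows = sync_queries(queries)
print 'Received %d rows' % len(rows)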
Example #4
def getNEXT(searchTerm, place, records):
    # To use an API key for authentication, use the following code:
    client = importio.importio(
        user_id="132bbe63-5552-41a2-ab3c-440ca93b8fa9",
        api_key="Ge28+Cy7Kxs8Z9gatZgj5BZv9MF8JwCpRxB97O1fwUgbv7kYXdgUQuE00fW4tTOi6HwEfPVlR2zAvfLdsI3QMQ==",
        host="https://query.import.io")
    
    # Once we have started the client and authenticated, we need to connect it to the server:
    client.connect()
    
    # Because import.io queries are asynchronous, for this simple script we will use a "latch"
    # to stop the script from exiting before all of our queries are returned
    # For more information on the latch class, see the latch.py file included in this client library
    queryLatch = latch.latch(1)
    
    # Define here a global variable that we can put all our results in to when they come back from
    # the server, so we can use the data later on in the script
    dataRows2 = []
    g = geocoder.google(place)
    # In order to receive the data from the queries we issue, we need to define a callback method
    # This method will receive each message that comes back from the queries, and we can take that
    # data and store it for use in our app
    def callback(query, message):
      # dataRows2 is captured from the enclosing scope; extend() below mutates it in place
      
      # Disconnect messages happen if we disconnect the client library while a query is in progress
      if message["type"] == "DISCONNECT":
        print "Query in progress when library disconnected"
        print json.dumps(message["data"], indent = 4)
    
      # Check the message we receive actually has some data in it
      if message["type"] == "MESSAGE":
        if "errorType" in message["data"]:
          # In this case, we received a message, but it was an error from the external service
          print "error returned for this link"
        else:
          # We got a message and it was not an error, so we can process the data

          dataRows2.extend(message["data"]["results"])
      
      # When the query is finished, countdown the latch so the program can continue when everything is done
      if query.finished(): queryLatch.countdown()
    
    # Issue queries to your data sources and with your inputs
    # You can modify the inputs and connectorGuids so as to query your own sources
    # Query for tile Magic Api
    # 'lp' and 'nextlink' are expected to be module-level globals maintained by the calling code
    if lp == 0:
        client.query({
          "connectorGuids": [
            "1f59482a-3c8e-479d-985e-daafe92e71a3"
          ],
          "input": {
            "webpage/url": "https://maps.google.fr/maps?sll="+str(g.lat)+","+str(g.lng)+"&q="+searchTerm+"&ie=UTF8&hl=fr&sspn=0.000000,0.000000&dg=brw&sa=N&start="+str(records)+"&output=classic&dg=brw"
                            
          }
        }, callback)
       
    # Once paging has started (lp >= 10), follow the next-page URL captured from
    # the previous batch of results instead of building a fresh search URL
    if lp >= 10:
        global nextlink
        print nextlink + str(lp)
        client.query({
          "connectorGuids": [
            "1f59482a-3c8e-479d-985e-daafe92e71a3"
          ],
          "input": {
            "webpage/url": str(nextlink) + "&output=classic"
          }
        }, callback)
        
  #  print "Queries dispatched, now waiting for results"
    
    # Now we have issued all of the queries, we can "await" on the latch so that we know when it is all done
    queryLatch.await()
    
  #  print "Latch has completed, all results returned"
    
    # It is best practice to disconnect when you are finished sending queries and getting data - it allows us to
    # clean up resources on the client and the server
    client.disconnect()
    
    # Now we can print out the data we got
  #  print "All data received:"
    jdata = json.dumps(dataRows2, indent=4, ensure_ascii=False, encoding="utf8")
 
    decoded = json.loads(jdata)
  
    for vc in decoded:
        nextlink = vc['my_column']
    return nextlink
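getNEXT reads the module-level globals lp and nextlink, so the caller has to maintain them between pages. A hypothetical paging driver consistent with that contract (the search term and place are placeholders):

# Hypothetical driver: lp tracks the result offset requested so far and
# nextlink carries the next-page URL returned by the previous call.
lp = 0
nextlink = ""

for page in range(3):
    nextlink = getNEXT("restaurants", "Paris, France", lp)
    lp += 10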
Example #5
    def __init__(self):
        self.dataRows = []
        self.client = importio.importio(
            user_id="b12f3a23-b267-45a4-99f2-b8a0d2e9b491",
            api_key="YqcifZoCcEPdmXwAL8g855gQ2ZSmtGZwiwaBpj71TMNKsvAXhpvhLiz9mpy5DlC7KIZX62sC+TnaSxhNLfJNXg==",
            host="https://query.import.io")
        self.geolocator = GoogleV3Custom()
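The constructor above only initialises state; a method that actually issues a query would presumably follow the same connect / latch / callback / disconnect pattern as the earlier examples. A minimal sketch (the connector GUID is supplied by the caller, and the latch module is assumed to be imported as in the other examples):

    def fetch(self, connector_guid, url):
        # Sketch only: run a single query and collect its results into self.dataRows.
        self.client.connect()
        queryLatch = latch.latch(1)

        def callback(query, message):
            if message["type"] == "MESSAGE" and "errorType" not in message["data"]:
                self.dataRows.extend(message["data"]["results"])
            if query.finished():
                queryLatch.countdown()

        self.client.query({
            "connectorGuids": [connector_guid],
            "input": {"webpage/url": url},
        }, callback)
        queryLatch.await()
        self.client.disconnect()
        return self.dataRows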