Example #1
def push_log(self, spider_name, stats):
    self.connect_kafka()
    # Convert datetime objects to Unix timestamps so the stats are JSON-serializable.
    stats['start_time'] = time.mktime(stats['start_time'].timetuple())
    stats['finish_time'] = time.mktime(stats['finish_time'].timetuple())
    stats['spider'] = spider_name
    try:
        self.kafka.send_messages(KAFKA_LOG_STATS_TOPIC,
                                 *[json.dumps(stats)])
    except FailedPayloadsError as e:
        logging.error(e)
        logging.info(stats)
    del stats['spider']
    # MongoDB field names cannot contain '.', so drop those keys;
    # copy the keys first to avoid mutating the dict while iterating.
    for key in list(stats.keys()):
        if '.' in key:
            del stats[key]
    stat_spider_old = self.collection_stats.find_one(
        {"spider": spider_name})
    if stat_spider_old:
        stat_spider_new = stat_spider_old
    else:
        stat_spider_new = {'spider': spider_name, 'stats': []}
    # Keep only the 7 most recent stat entries per spider.
    if len(stat_spider_new['stats']) >= 7:
        stat_spider_new['stats'].pop(0)
    stat_spider_new['stats'].append(stats)
    stat_spider_new['last_history'] = stats
    self.collection_stats.update({"spider": spider_name},
                                 {"$set": stat_spider_new},
                                 upsert=True,
                                 multi=False)
Example #2
def getDoc(db, docid):
    try:
        doc = db.get(docid)
        return doc.value
    except NotFoundError:
        logging.error("Not found %s" % docid)
        return None
Example #3
def getCrawlerServers():
    global spiders_running
    spiders_running = []
    config_servers = getCrawlerServerConfig()
    crawler_servers = []
    for server in config_servers:
        crawler = {}
        crawler['name_server'] = server['name_server']
        crawler['max_thread'] = server['max_thread']
        crawler['free_thread'] = server['free_thread']
        crawler['status'] = server['status']
        try:
            resp = requests.get('http://' + server['name_server'] +
                                '.localhost:' + SERVER_PORT +
                                '/crawler/thread_count')
            data = resp.json()
            spiders_running.extend(data['running_spiders'])
            free_thread = server['max_thread'] - data['total_crawler_thread']
            if free_thread <= 0:
                crawler['free_thread'] = 0
            else:
                crawler['free_thread'] = free_thread
        except (Timeout, ConnectionError):
            logging.error("Connection error at %s", server['name_server'])
            crawler['status'] = False
        except Exception:
            logging.error("Error at %s", server['name_server'])
            crawler['status'] = False
        crawler_servers.append(crawler)
    return crawler_servers
Example #4
def createDb(name,
             user="",
             passwd="",
             ram=100,
             replica=0,
             server="http://localhost:8091/pools/default/buckets"):
    """ Create a new bucket by using system curl command
    """
    # curl -X POST -u username:password -d name=newbucket -d ramQuotaMB=100 -d authType=none
    # -d replicaNumber=1 -d proxyPort=11216 http://localhost:8091/pools/default/buckets
    command = "curl -X POST -u %s:%s -d name=%s -d ramQuotaMB=%s -d authType=sasl " \
                    "-d replicaNumber=%s %s" \
                    % (user, passwd, name, ram, replica, server)
    # Python 2 only: the commands module was removed in Python 3 (use subprocess there).
    import commands
    _, output = commands.getstatusoutput(command)
    lines = output.split("\n")
    if len(lines) < 4:
        logging.info("Create new bucket: %s" % name)
        return True
    response = json.loads(lines[3])
    if 'errors' in response:
        logging.error(response)
        return False
    else:
        logging.info("Create new bucket: %s" % name)
        return True
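A minimal usage sketch for createDb; the bucket name, credentials and sizing below are hypothetical and assume a local Couchbase node on the default admin port:

# Hypothetical call: create a 200 MB bucket named 'crawl_cache' with one replica.
ok = createDb('crawl_cache', user='Administrator', passwd='password',
              ram=200, replica=1)
if not ok:
    logging.error('Could not create bucket crawl_cache')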
Example #5
def xtractFromUrl(url, xpath):
    try:
        response = requests.get(url)
        jsonContent = json.loads(response.text)
        return xtract(jsonContent, xpath)
    except Exception:
        logging.error('Cannot load json from %s', url)
        return None
Example #6
def checkContentContain(url, expectContent):
    logging.info("checkContentContain %s  with %s" % (expectContent, url))
    try:
        response = requests.get(url)
        content = response.text.lower()
        return content.find(expectContent.lower()) >= 0
    except:
        logging.error('Cannot load content from %s', url)
        return False
Example #7
def xtract(jsonContent, xpath):
    # Walk a dotted path (e.g. 'data.items.title') through nested dicts;
    # whenever a list is met, descend into its first element.
    value = jsonContent
    tokens = xpath.split('.')
    for path in tokens:
        if isinstance(value, list):
            value = value[0]
        if path in value:
            value = value[path]
        else:
            logging.error('Cannot find %s', xpath)
            return None
    return value
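A minimal usage sketch for xtract with a hypothetical payload; the dotted path walks nested dicts and takes the first element of any list it meets:

payload = {'data': {'items': [{'title': 'hello'}]}}  # hypothetical input
xtract(payload, 'data.items.title')   # -> 'hello'
xtract(payload, 'data.missing')       # logs "Cannot find ..." and returns None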
Example #8
def getDb(server, dbname, new=False):
    """ Return a db given server, db.
    If new is True, delete the old db and create a new one.
    """
    if isinstance(server, str):
        logging.warning("getDb() with server string is deprecated, please "
                        "pass a Server object instead")
        server = couchdb.Server(server)
    if new:
        try:
            server.delete(dbname)
        except Exception:
            logging.error('Database %s not found!' % dbname)
        db = server.create(dbname)
    else:
        db = server[dbname]
    return db
Example #9
def createOrUpdateBatch(db, doc_batch):
    """ createOrUpdate in batch.
    
    Input is a list of couchdb.Document objects.
    """
    assert type(doc_batch) == list, "Bad input %s" % type(doc_batch)
    # break down doc_batch if doc_batch too large
    try:
        responses = db.update(doc_batch)
    except Exception:
        logging.warning(
            "Error with doc batch of size %s. Try to break it down" %
            len(doc_batch))
        responses = []
        for doc in doc_batch:
            responses.extend(db.update([doc]))
    failed_docs = []
    failed_keys = []
    for (success, docid, rev_or_exc), doc in zip(responses, doc_batch):
        if not success:
            assert type(rev_or_exc) == ResourceConflict
            logging.warning("  ---  try updating %s" % ` docid `)
            failed_keys.append(docid)
            failed_docs.append(doc)
    existing_docs = getDocsByIds(db, failed_keys)
    for existing_doc, failed_doc in zip(existing_docs, failed_docs):
        if existing_doc["_id"] != failed_doc["_id"]:
            logging.warning("mismatch docid %s != %s" %
                            (existing_doc["_id"], failed_doc["_id"]))
            continue
        # Copy _rev so that we can update a new version.
        failed_doc["_rev"] = existing_doc["_rev"]

    responses = db.update(failed_docs)
    num_failed = 0
    for (success, docid, exc) in responses:
        if not success:
            logging.error('Cannot update %s %s' % (repr(docid), repr(exc)))
            num_failed += 1
    if num_failed:
        logging.error("%d out of %d updates failed" %
                      (num_failed, len(responses)))
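A minimal usage sketch for createOrUpdateBatch, assuming an existing 'articles' database on a hypothetical local CouchDB server and using getDb from the example above; the document ids and fields are made up:

import couchdb

server = couchdb.Server('http://localhost:5984/')  # hypothetical CouchDB instance
db = getDb(server, 'articles')
batch = [couchdb.Document(_id='doc-%d' % i, seen=True) for i in range(3)]
createOrUpdateBatch(db, batch)   # conflicting docs are retried with the latest _rev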
Example #10
def startServiceMaster():
    logging.info("Start master-service!")
    list_crawler_servers = getCrawlerServers()
    list_spiders = buildListCrawlSpiders()
    count_start = 0
    if list_spiders is not None and len(list_spiders) > 0:
        for spider in list_spiders:
            crawler_server, list_crawler_servers = chooseCrawlerServer(
                list_crawler_servers)
            if crawler_server is not None:
                crawler_server = startSpider(spider['name'], crawler_server)
                list_crawler_servers = updateCrawlerServers(
                    crawler_server, list_crawler_servers)
                count_start += 1
            else:
                logging.info("=======> All crawler servers were full!")
                break
        logging.info("Started %d spiders." % (count_start))
    else:
        logging.error("No result from MongoDb!")
Example #11
def get_ids_pager(db,
                  design="doc",
                  view_name="_all_docs",
                  startkey=None,
                  startkey_docid=None,
                  endkey=None,
                  endkey_docid=None,
                  bulk=10000,
                  include_docs=False):
    """ Iterate over docs of db by bulk
    """
    options = {'limit': bulk}
    if startkey:
        options['startkey'] = startkey
        if startkey_docid:
            options['startkey_docid'] = startkey_docid
    if endkey:
        options['endkey'] = endkey
        if endkey_docid:
            options['endkey_docid'] = endkey_docid
    options['include_docs'] = include_docs
    options['full_set'] = True
    done = False
    try:
        while not done:
            rows = db.query(design, view_name, **options)
            cnt = 0
            for row in rows:
                cnt += 1
                options['startkey'] = row.key
                options['startkey_docid'] = row.docid
                options['skip'] = 1
                yield row.docid
            if cnt < bulk:
                done = True
    except HTTPError:
        logging.error(
            "The _all_docs design does not exist. Please use design_doc to create it."
        )
        # Re-raise the original exception rather than a new, empty HTTPError.
        raise
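A minimal usage sketch for get_ids_pager, assuming db is the same Couchbase-style handle used elsewhere in these examples; process() is a hypothetical per-document callback:

for docid in get_ids_pager(db, bulk=1000):
    process(docid)   # hypothetical: do something with each document id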
Example #12
def get_status_code(url):
    try:
        # Return the int status code without following redirects.
        # Find more at httpstatusrappers.com :)
        r = requests.get(url, allow_redirects=False, timeout=20)
        return r.status_code
    except requests.ConnectionError:
        logging.error("failed to connect")
        return None
    except InvalidURL:
        logging.error("Url %s has an invalid label" % url)
        return None
    except Exception as e:
        logging.error(e)
        return None
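A minimal usage sketch for get_status_code; the URL is hypothetical:

code = get_status_code('http://example.com/some/page')
if code is not None and code != 200:
    logging.warning('Unexpected status %s', code)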
Example #13
def setExpiredItemsBaseOnStatus(self, url, status):
    # Treat 404, 500 and redirect responses (301/302) as expired items.
    isExpired = False
    if status in (404, 500, 302, 301):
        logging.error(
            "==============================> Item expired because of %s: %s",
            status, url)
        isExpired = True
    return isExpired
Example #14
def startSpider(spider_name, crawler_server):
    if spider_name is not None:
        if spider_name in spiders_running:
            logging.error("Spider name \"%s\" running", spider_name)
            spider = collection.find_one({'doc.spider': spider_name})
            logging.info("Info spider: %s", spider['crawler_status'])
            return crawler_server
        try:
            requests.get('http://' + crawler_server['name_server'] +
                         '.localhost:' + SERVER_PORT +
                         '/crawler/startcrawl?spider=' + spider_name)
            crawler_server = set_free_thread(crawler_server)
            logging.info("Start spider \"%s\" successful at \"%s\"!",
                         spider_name, crawler_server['name_server'])
        except (Timeout, ConnectionError):
            crawler_server['status'] = False
            logging.error("Start spider \"%s\" failed at \"%s\"!", spider_name,
                          crawler_server['name_server'])
    else:
        logging.error("Spider is None!")
    return crawler_server