def push_log(self, spider_name, stats):
    self.connect_kafka()
    stats['start_time'] = time.mktime(stats['start_time'].timetuple())
    stats['finish_time'] = time.mktime(stats['finish_time'].timetuple())
    stats['spider'] = spider_name
    try:
        self.kafka.send_messages(KAFKA_LOG_STATS_TOPIC, *[json.dumps(stats)])
    except FailedPayloadsError as e:
        logging.error(e)
        logging.info(stats)
    del stats['spider']
    # Iterate over a copy of the keys: MongoDB rejects dotted field names,
    # and deleting from the dict while iterating its live view would fail.
    for key in list(stats.keys()):
        if '.' in key:
            del stats[key]
    stat_spider_old = self.collection_stats.find_one({"spider": spider_name})
    if stat_spider_old:
        stat_spider_new = stat_spider_old
    else:
        stat_spider_new = {'spider': spider_name, 'stats': []}
    # Keep a rolling history of the last 7 runs.
    if len(stat_spider_new['stats']) < 7:
        stat_spider_new['stats'].append(stats)
    else:
        stat_spider_new['stats'].pop(0)
        stat_spider_new['stats'].append(stats)
    stat_spider_new['last_history'] = stats
    self.collection_stats.update({"spider": spider_name},
                                 {"$set": stat_spider_new},
                                 upsert=True, multi=False)

def getDoc(db, docid):
    try:
        doc = db.get(docid)
        return doc.value
    except NotFoundError:
        logging.error("Not found %s" % docid)
        return None

def getCrawlerServers():
    global spiders_running
    spiders_running = []
    config_servers = getCrawlerServerConfig()
    crawler_servers = []
    for server in config_servers:
        crawler = {}
        crawler['name_server'] = server['name_server']
        crawler['max_thread'] = server['max_thread']
        crawler['free_thread'] = server['free_thread']
        crawler['status'] = server['status']
        try:
            resp = requests.get('http://' + server['name_server'] +
                                '.localhost:' + SERVER_PORT +
                                '/crawler/thread_count')
            data = resp.json()
            spiders_running.extend(data['running_spiders'])
            # Remaining capacity, clamped at zero.
            free_thread = server['max_thread'] - data['total_crawler_thread']
            crawler['free_thread'] = max(free_thread, 0)
        except (Timeout, ConnectionError):
            logging.error("Connection error at %s", server['name_server'])
            crawler['status'] = False
        except Exception:
            logging.error("Error at %s", server['name_server'])
            crawler['status'] = False
        crawler_servers.append(crawler)
    return crawler_servers

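# A minimal sketch of the payload getCrawlerServers() expects from the
# /crawler/thread_count endpoint; the values here are hypothetical, inferred
# from the keys read above rather than from any server documentation.
def _demo_thread_count_payload():
    data = {"total_crawler_thread": 3,
            "running_spiders": ["spider_a", "spider_b", "spider_c"]}
    max_thread = 5
    # Same capacity computation as in getCrawlerServers().
    free_thread = max(max_thread - data["total_crawler_thread"], 0)
    assert free_thread == 2
    return data
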
def createDb(name, user="", passwd="", ram=100, replica=0,
             server="http://localhost:8091/pools/default/buckets"):
    """ Create a new bucket by using the system curl command """
    # curl -X POST -u username:password -d name=newbucket -d ramQuotaMB=100
    # -d authType=none -d replicaNumber=1 -d proxyPort=11216
    # http://localhost:8091/pools/default/buckets
    command = "curl -X POST -u %s:%s -d name=%s -d ramQuotaMB=%s -d authType=sasl " \
              "-d replicaNumber=%s %s" \
              % (user, passwd, name, ram, replica, server)
    import commands  # Python 2 only; subprocess replaces it on Python 3
    _, output = commands.getstatusoutput(command)
    lines = output.split("\n")
    # Fewer than 4 lines of output means there is no JSON body to parse;
    # treat that as success.
    if len(lines) < 4:
        logging.info("Create new bucket: %s" % name)
        return True
    response = json.loads(lines[3])
    if 'errors' in response:
        logging.error(response)
        return False
    else:
        logging.info("Create new bucket: %s" % name)
        return True

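# A hedged usage sketch for createDb(); the bucket name, credentials and
# sizing below are placeholders, and the call requires a reachable Couchbase
# REST endpoint plus the system curl binary.
def _demo_createDb():
    ok = createDb("crawl_cache", user="Administrator", passwd="secret",
                  ram=256, replica=1)
    if not ok:
        logging.error("bucket creation failed")
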
def xtractFromUrl(url, xpath):
    try:
        response = requests.get(url)
        jsonContent = json.loads(response.text)
        return xtract(jsonContent, xpath)
    except Exception:
        logging.error('Cannot load json from %s', url)
        return None

def checkContentContain(url, expectContent):
    logging.info("checkContentContain %s with %s", expectContent, url)
    try:
        response = requests.get(url)
        content = response.text.lower()
        # Case-insensitive substring check.
        return expectContent.lower() in content
    except Exception:
        logging.error('Cannot load content from %s', url)
        return False

def xtract(jsonContent, xpath):
    """ Walk a dotted path (e.g. 'a.b.c') through parsed JSON. """
    value = jsonContent
    tokens = xpath.split('.')
    for path in tokens:
        # Lists are unwrapped by taking their first element.
        if type(value) is list:
            value = value[0]
        if path in value:
            value = value[path]
        else:
            logging.error('Cannot find %s', xpath)
            return None
    return value

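# A minimal sketch of how xtract() resolves a dotted path; the payload is a
# hypothetical example chosen to show the list-unwrapping behaviour.
def _demo_xtract():
    payload = {"results": [{"item": {"price": 42}}]}
    # Lists are unwrapped via their first element, so 'results.item.price'
    # resolves through results[0].
    assert xtract(payload, "results.item.price") == 42
    # A missing segment logs an error and yields None.
    assert xtract(payload, "results.item.missing") is None
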
def getDb(server, dbname, new=False):
    """ Return a db given server, db. If new is True then delete the old db
    and create a new one. """
    if isinstance(server, str):
        logging.warning("getDb() with server string is deprecated, please "
                        "pass a Server object instead")
        server = couchdb.Server(server)
    if new:
        try:
            server.delete(dbname)
        except Exception:
            logging.error('Database %s not found!' % dbname)
        db = server.create(dbname)
    else:
        db = server[dbname]
    return db

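# Usage sketch for getDb(); the server URL and database names are
# illustrative placeholders for a local CouchDB instance.
def _demo_getDb():
    server = couchdb.Server("http://localhost:5984/")
    results = getDb(server, "crawl_results")            # open an existing db
    scratch = getDb(server, "crawl_scratch", new=True)  # drop and recreate
    return results, scratch
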
def createOrUpdateBatch(db, doc_batch):
    """ createOrUpdate in batch. Input is a list of couchdb.Document
    objects. """
    assert type(doc_batch) == list, "Bad input %s" % type(doc_batch)
    # Break down doc_batch if it is too large for a single bulk update.
    try:
        responses = db.update(doc_batch)
    except Exception:
        logging.warning("Error with doc batch of size %s. Try to break it down"
                        % len(doc_batch))
        responses = []
        for doc in doc_batch:
            responses.extend(db.update([doc]))
    # Collect the documents that failed with a revision conflict.
    failed_docs = []
    failed_keys = []
    for (success, docid, rev_or_exc), doc in zip(responses, doc_batch):
        if not success:
            assert isinstance(rev_or_exc, ResourceConflict)
            logging.warning(" --- try updating %s" % repr(docid))
            failed_keys.append(docid)
            failed_docs.append(doc)
    existing_docs = getDocsByIds(db, failed_keys)
    for existing_doc, failed_doc in zip(existing_docs, failed_docs):
        if existing_doc["_id"] != failed_doc["_id"]:
            logging.warning("mismatch docid %s != %s"
                            % (existing_doc["_id"], failed_doc["_id"]))
            continue
        # Copy _rev so that we can update a new version.
        failed_doc["_rev"] = existing_doc["_rev"]
    responses = db.update(failed_docs)
    num_failed = 0
    for (success, docid, exc) in responses:
        if not success:
            logging.error('Can not update %s %s' % (repr(docid), repr(exc)))
            num_failed += 1
    if num_failed:
        logging.error("%d out of %d updates failed"
                      % (num_failed, len(responses)))

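# A hedged sketch of feeding createOrUpdateBatch(); the ids and payloads are
# hypothetical, and db is assumed to be a couchdb.Database.
def _demo_createOrUpdateBatch(db):
    batch = [couchdb.Document(_id="item:%d" % i, value=i) for i in range(100)]
    # Revision conflicts inside the batch are retried with the latest _rev.
    createOrUpdateBatch(db, batch)
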
def startServiceMaster():
    logging.info("Start master-service!")
    list_crawler_servers = getCrawlerServers()
    list_spiders = buildListCrawlSpiders()
    count_start = 0
    if list_spiders is not None and len(list_spiders) > 0:
        for spider in list_spiders:
            crawler_server, list_crawler_servers = chooseCrawlerServer(
                list_crawler_servers)
            if crawler_server is not None:
                crawler_server = startSpider(spider['name'], crawler_server)
                list_crawler_servers = updateCrawlerServers(
                    crawler_server, list_crawler_servers)
                count_start += 1
            else:
                logging.info("=======> All crawler servers were full!")
                break
        logging.info("Started %d spiders." % count_start)
    else:
        logging.error("No result from MongoDb!")

def get_ids_pager(db, design="doc", view_name="_all_docs", startkey=None,
                  startkey_docid=None, endkey=None, endkey_docid=None,
                  bulk=10000, include_docs=False):
    """ Iterate over docids of db, paging through the view by bulk. """
    options = {'limit': bulk}
    if startkey:
        options['startkey'] = startkey
    if startkey_docid:
        options['startkey_docid'] = startkey_docid
    if endkey:
        options['endkey'] = endkey
    if endkey_docid:
        options['endkey_docid'] = endkey_docid
    options['include_docs'] = include_docs
    options['full_set'] = True
    done = False
    try:
        while not done:
            rows = db.query(design, view_name, **options)
            cnt = 0
            for row in rows:
                cnt += 1
                # Resume the next page right after the current row.
                options['startkey'] = row.key
                options['startkey_docid'] = row.docid
                options['skip'] = 1
                yield row.docid
            # A short page means we have exhausted the view.
            if cnt < bulk:
                done = True
    except HTTPError:
        logging.error("The '%s' design doc does not exist. Please use "
                      "design_doc() to create it.", design)
        raise

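# Sketch of paging through document ids with get_ids_pager(); db is assumed
# to expose query(design, view_name, **options) as used above, and the
# default design/view names are kept.
def _demo_get_ids_pager(db):
    for docid in get_ids_pager(db, bulk=500):
        logging.info("visiting %s", docid)
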
def get_status_code(url):
    """ Return the integer HTTP status code for url, or None on error. """
    try:
        r = requests.get(url, allow_redirects=False, timeout=20)
        return r.status_code
    except requests.ConnectionError:
        logging.error("failed to connect")
        return None
    except InvalidURL:
        logging.error("Url %s has an invalid label" % url)
        return None
    except Exception as e:
        logging.error(e)
        return None

def setExpiredItemsBaseOnStatus(self, url, status):
    # Treat these HTTP statuses as a sign that the item no longer exists.
    if status in (404, 500, 302, 301):
        logging.error(
            "==============================> Item expired because of %s: %s",
            status, url)
        return True
    return False

def startSpider(spider_name, crawler_server):
    if spider_name is not None:
        if spider_name in spiders_running:
            logging.error("Spider name \"%s\" running", spider_name)
            spider = collection.find_one({'doc.spider': spider_name})
            logging.info("Info spider: %s", spider['crawler_status'])
            return crawler_server
        try:
            requests.get('http://' + crawler_server['name_server'] +
                         '.localhost:' + SERVER_PORT +
                         '/crawler/startcrawl?spider=' + spider_name)
            crawler_server = set_free_thread(crawler_server)
            logging.info("Start spider \"%s\" successful at \"%s\"!",
                         spider_name, crawler_server['name_server'])
        except (Timeout, ConnectionError):
            crawler_server['status'] = False
            logging.error("Start spider \"%s\" failed at \"%s\"!",
                          spider_name, crawler_server['name_server'])
    else:
        logging.error("Spider is None!")
    return crawler_server