def __init__(self, stats):
    """Keep the crawl stats object and open the spider/stats collections."""
    self.stats = stats
    env, db = config.MONGO_ENV, config.MONGO_CRAWLER_DB
    self.collection_spider = mongo.connectCol(env, db, "spiders")
    self.collection_stats = mongo.connectCol(env, db, "stats")
    # Both handles are created lazily elsewhere; start unset.
    self.client = None
    self.kafka = None
def get_spider_history(spider_name):
    """Return the stored stats document for *spider_name*, or None if absent."""
    stats_col = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB, 'stats')
    history = stats_col.find_one({'spider': spider_name})
    return history if history else None
def get_blacklist_category():
    """Return every 'feature' value from the category blacklist collection."""
    blacklist = mongo.connectCol('proc', 'category', 'category_blacklist')
    return [entry['feature'] for entry in blacklist.find()]
def threadCount():
    """Snapshot the running scrapy crawler threads plus host CPU/memory usage.

    Returns a dict with 'percent_cpu_usage', 'percent_mem_usage',
    'total_crawler_thread' and 'running_spiders' (list of
    {'name', 'last_start'} entries).
    """
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB, 'spiders')
    # List all processes and keep the scrapy ones; shell=True is required for
    # the pipe. NOTE(review): input is a fixed string, so no injection risk here.
    commands = 'ps -ef| grep scrapy'
    msg = subprocess.Popen(commands, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = msg.communicate()
    list_spiders = []
    output = {}
    p = psutil.Process(os.getpid())
    # System-wide CPU percent sampled over 1 second; memory percent is for
    # THIS process only, not the crawlers being counted.
    cpu_usage = psutil.cpu_percent(interval=1.0)
    memory_usage = p.memory_percent()
    # NOTE(review): out.split('\n') assumes Python 2 where pipe output is str;
    # under Python 3 this is bytes and would raise TypeError — confirm runtime.
    list_res = out.split('\n')
    for item in list_res:
        # A scrapy crawl command line looks like "... scrapy crawl <name> ...".
        if item.find("crawl") > 0:
            # +6 skips "crawl " (5 letters + 1 space) to land on the spider name.
            start = item.find("crawl") + 6
            res = item[start:]
            spider_name = res.split(' ')[0]
            # getLastStartSpider is defined elsewhere in this module; spiders
            # with no recorded start time are skipped.
            last_start = getLastStartSpider(collection, spider_name)
            if not last_start:
                continue
            spider_info = {'name': spider_name, 'last_start': last_start}
            list_spiders.append(spider_info)
    output['percent_cpu_usage'] = cpu_usage
    output['percent_mem_usage'] = round(memory_usage)
    output['total_crawler_thread'] = len(list_spiders)
    output['running_spiders'] = list_spiders
    return output
def getAllSpidersRunning():
    """Return the names of all spiders whose crawler_status.status is 1."""
    coll = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB, "spiders")
    running = coll.find({"crawler_status.status": 1})
    return [entry['doc']['spider'] for entry in running]
def getSpider(spider_name):
    """Fetch the spider definition ('doc' sub-document) for *spider_name*.

    Returns None when no matching spider exists. Previously a miss crashed
    with TypeError because find_one's None result was subscripted.
    """
    # Parenthesized prints behave identically under Python 2 and 3.
    print("Mongo server %s" % config.get("mongo_server"))
    # NOTE(review): assert is stripped under -O; kept to preserve behavior.
    assert config.get("mongo_server")
    print("Db %s" % config.get("mongo_crawler_db"))
    print(" spider name : %s" % spider_name)
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB, flags.get("spider_col"))
    data = collection.find_one({"doc.spider": spider_name})
    # Guard the miss case: find_one returns None when nothing matches.
    if data is None:
        return None
    return data['doc']
def get_number_spider_created(days):
    """Count spiders created within the last *days* days.

    Returns {'number_hit': <count>, 'hits': [<spider names>]}.
    """
    cutoff = time.time() - days * 24 * 60 * 60
    # Bug fix: the collection is 'spiders' (insertSpider writes the 'created'
    # field there); the old name 'spider' queried a nonexistent collection.
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB, 'spiders')
    cursor = collection.find({'created': {'$gte': cutoff}})
    spiders = []
    for doc in cursor:
        # insertSpider nests the name at doc['doc']['spider']; fall back to a
        # flat 'spider' key for any legacy documents.
        if 'doc' in doc:
            spiders.append(doc['doc']['spider'])
        else:
            spiders.append(doc.get('spider'))
    return {'number_hit': len(spiders), 'hits': spiders}
def insertSpider(document):
    """Insert or update the spider definition *document* in 'spiders'.

    Performs a two-phase write: first try a plain update; only when no
    existing record matched, add first-insert defaults and upsert.
    Returns the raw pymongo update result dict.

    NOTE(review): this mutates the caller's *document* in place by
    assigning '_id' — confirm callers expect that side effect.
    """
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB, "spiders")
    data = {'doc': document, 'last_modified': time.time()}
    # Stable id derived from the spider name. NOTE(review): md5 of a str
    # only works on Python 2; Python 3 would need .encode() — confirm runtime.
    data['doc']['_id'] = hashlib.md5(data['doc']['spider']).hexdigest()
    # Phase 1: update an existing record only (upsert=False). update() is the
    # legacy pre-pymongo-3 API; its result exposes 'updatedExisting'.
    res = collection.update({'doc.spider': document['spider']}, {"$set": data}, upsert=False, multi=False)
    if not res['updatedExisting']:
        # Phase 2 (first insert only): seed crawler status defaults. The
        # dotted keys set nested crawler_status fields via $set. The stop
        # time is backdated 14 days, presumably to make the new spider
        # immediately eligible for scheduling — verify against the scheduler.
        data['crawler_status.last_stop_time'] = time.time() - 86400 * 14
        data['crawler_status.status'] = 0
        data['created'] = time.time()
        res = collection.update({'doc.spider': document['spider']}, {"$set": data}, upsert=True, multi=False)
    return res
def connectDatabase(db_name, env="staging", db="hello"):
    """Open collection *db_name*; *env* and *db* default to the legacy
    hard-coded values so existing callers are unaffected.

    NOTE(review): the default database name "hello" looks like a leftover
    placeholder — confirm whether config.MONGO_CRAWLER_DB was intended.
    """
    return mongo.connectCol(env, db, db_name)
#!/usr/bin/env python # encoding: utf-8 ''' Created on Feb 24, 2016 @author: Quyet ''' from common import mongo import requests coll = mongo.connectCol('staging', 'crawler', 'spiders') def getSpiderGood(): spiders = coll.find({"crawler_status.last_stop_time": {"$exists":1}}) print spiders.count() spider_names = [] for spider in spiders: spider_names.append(spider['doc']['spider']) return spider_names def generateSpiderGood(): spider_names = getSpiderGood() count = 1 for spider in spider_names: print "Generate: ", count, spider requests.get("http://localhost:6081/generate?spider=" + spider) count += 1 print "Done!" if __name__=="__main__":
#!/usr/bin/env python # encoding: utf-8 import math from common import mongo import time from operator import itemgetter import requests from requests.exceptions import ConnectionError, Timeout from time import sleep from common.logger import logging import config from pymongo.errors import NetworkTimeout, ServerSelectionTimeoutError # NOW = time.time() collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB, "spiders") CRAWLER_SERVERS_CONFIG = [ # {'name_server':'crawl1', 'max_thread':25, 'free_thread':25, 'status':True}, # {'name_server':'crawl2', 'max_thread':25, 'free_thread':25, 'status':True}, { 'name_server': 'crawler', 'max_thread': 40, 'free_thread': 40, 'status': True }, ] SERVER_PORT = '6081' def getAllValidSpiders(): one_day_ago = time.time() - 24 * 60 * 60
def get_xpath(spider_name):
    """Return the full spider document for *spider_name* (or None)."""
    spiders = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB, 'spiders')
    return spiders.find_one({'doc.spider': spider_name})
def delete_spider_mongo(spider_name):
    """Delete the document matching *spider_name* from the 'spiders' collection."""
    spiders = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB, "spiders")
    spiders.delete_one({'doc.spider': spider_name})