Example #1
    def __init__(self, stats):
        self.stats = stats
        # collection handles for spider metadata and crawl statistics
        self.collection_spider = mongo.connectCol(config.MONGO_ENV,
                                                  config.MONGO_CRAWLER_DB,
                                                  "spiders")
        self.collection_stats = mongo.connectCol(config.MONGO_ENV,
                                                 config.MONGO_CRAWLER_DB,
                                                 "stats")
        self.client = None
        self.kafka = None
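Every example here obtains a collection handle through mongo.connectCol(env, db, collection), which is not shown in the source. A minimal sketch of what such a helper could look like, assuming pymongo and a hypothetical environment-to-URI mapping (MONGO_HOSTS is an invented name):

from pymongo import MongoClient

MONGO_HOSTS = {'staging': 'mongodb://localhost:27017'}  # hypothetical mapping


def connectCol(env, db_name, col_name):
    # resolve the server for this environment, then return the collection
    client = MongoClient(MONGO_HOSTS[env])
    return client[db_name][col_name]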
Example #2
def get_spider_history(spider_name):
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  'stats')
    # find_one already returns None when nothing matches
    return collection.find_one({'spider': spider_name})
Example #3
def get_blacklist_category():
    coll = mongo.connectCol('proc', 'category', 'category_blacklist')
    # collect the 'feature' value of every blacklisted category
    return [cat['feature'] for cat in coll.find()]
Example #4
def threadCount():
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  'spiders')
    commands = 'ps -ef | grep scrapy'
    msg = subprocess.Popen(commands,
                           shell=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
    out, err = msg.communicate()
    list_spiders = []
    output = {}
    p = psutil.Process(os.getpid())
    cpu_usage = psutil.cpu_percent(interval=1.0)
    memory_usage = p.memory_percent()

    list_res = out.split('\n')
    for item in list_res:
        idx = item.find("crawl")
        if idx > 0:
            # the spider name follows "crawl " (6 characters) on the command line
            start = idx + 6
            res = item[start:]
            spider_name = res.split(' ')[0]
            last_start = getLastStartSpider(collection, spider_name)
            if not last_start:
                continue
            spider_info = {'name': spider_name, 'last_start': last_start}
            list_spiders.append(spider_info)
    output['percent_cpu_usage'] = cpu_usage
    output['percent_mem_usage'] = round(memory_usage)
    output['total_crawler_thread'] = len(list_spiders)
    output['running_spiders'] = list_spiders
    return output
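For reference, threadCount() returns a dict of this shape (the values below are illustrative, not real measurements):

# illustrative output only
{
    'percent_cpu_usage': 12.5,
    'percent_mem_usage': 3,
    'total_crawler_thread': 1,
    'running_spiders': [{'name': 'example_spider', 'last_start': 1456300000.0}]
}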
Example #5
def getAllSpidersRunning():
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  "spiders")
    # crawler_status.status == 1 marks a running spider
    spiders = collection.find({"crawler_status.status": 1})
    return [spider['doc']['spider'] for spider in spiders]
Example #6
def getSpider(spider_name):
    print "Mongo server %s" % config.get("mongo_server")
    assert config.get("mongo_server")
    print "Db %s" % config.get("mongo_crawler_db")
    print "  spider name : %s" % spider_name

    # "spiders" is assumed here: the original read flags.get("spider_col"),
    # but flags is undefined in this snippet
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  "spiders")
    data = collection.find_one({"doc.spider": spider_name})
    # guard against an unknown spider name
    return data['doc'] if data else None
Example #7
def get_number_spider_created(days):
    now = time.time()
    # match spiders created within the last `days` days
    query = {'created': {'$gte': now - days * 24 * 60 * 60}}
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  'spider')
    # a pymongo cursor is always truthy, so iterate it directly
    spiders = [spider['spider'] for spider in collection.find(query)]
    return {'number_hit': len(spiders), 'hits': spiders}
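A quick usage sketch (the argument and the return values are illustrative):

get_number_spider_created(7)
# -> {'number_hit': 2, 'hits': ['spider_a', 'spider_b']}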
Example #8
def insertSpider(document):
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  "spiders")
    data = {'doc': document, 'last_modified': time.time()}

    # derive a stable _id from the spider name
    data['doc']['_id'] = hashlib.md5(data['doc']['spider']).hexdigest()
    # first try to update an existing record in place
    res = collection.update({'doc.spider': document['spider']}, {"$set": data},
                            upsert=False,
                            multi=False)
    if not res['updatedExisting']:
        # new spider: seed its status fields, then upsert
        data['crawler_status.last_stop_time'] = time.time() - 86400 * 14
        data['crawler_status.status'] = 0
        data['created'] = time.time()
        res = collection.update({'doc.spider': document['spider']},
                                {"$set": data},
                                upsert=True,
                                multi=False)
    return res
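A usage sketch for insertSpider; only the 'spider' key is required by the code above, and the name used here is hypothetical:

document = {'spider': 'example_spider'}  # minimal assumed document
res = insertSpider(document)
# res['updatedExisting'] reports whether an existing record was modified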
Example #9
def connectDatabase(db_name):
    # note: despite its name, db_name is passed as the collection name here
    return mongo.connectCol("staging", "hello", db_name)
Example #10
#!/usr/bin/env python
# encoding: utf-8
'''
Created on Feb 24, 2016

@author: Quyet
'''
from common import mongo
import requests
coll = mongo.connectCol('staging', 'crawler', 'spiders')


def getSpiderGood():
    spiders = coll.find({"crawler_status.last_stop_time": {"$exists": 1}})
    print spiders.count()
    spider_names = []
    for spider in spiders:
        spider_names.append(spider['doc']['spider'])
    return spider_names


def generateSpiderGood():
    spider_names = getSpiderGood()
    count = 1
    for spider in spider_names:
        print "Generate: ", count, spider
        requests.get("http://localhost:6081/generate?spider=" + spider)
        count += 1
    print "Done!"

if __name__ == "__main__":
    # assumed entry point; the source file is truncated at this line
    generateSpiderGood()
Example #11
#!/usr/bin/env python
# encoding: utf-8
import math
from common import mongo
import time
from operator import itemgetter
import requests
from requests.exceptions import ConnectionError, Timeout
from time import sleep
from common.logger import logging
import config
from pymongo.errors import NetworkTimeout, ServerSelectionTimeoutError

# NOW = time.time()
collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                              "spiders")
CRAWLER_SERVERS_CONFIG = [
    #     {'name_server':'crawl1', 'max_thread':25, 'free_thread':25, 'status':True},
    #     {'name_server':'crawl2', 'max_thread':25, 'free_thread':25, 'status':True},
    {
        'name_server': 'crawler',
        'max_thread': 40,
        'free_thread': 40,
        'status': True
    },
]
SERVER_PORT = '6081'


def getAllValidSpiders():
    one_day_ago = time.time() - 24 * 60 * 60
    # the source is truncated past this point; a minimal assumed completion:
    # spiders whose last stop is at least a day old are considered valid
    spiders = collection.find(
        {'crawler_status.last_stop_time': {'$lte': one_day_ago}})
    return [spider['doc']['spider'] for spider in spiders]
Example #12
def get_xpath(spider_name):
    coll = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                            'spiders')
    spider = coll.find_one({'doc.spider': spider_name})
    return spider
Example #13
def delete_spider_mongo(spider_name):
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  "spiders")
    collection.delete_one({'doc.spider': spider_name})