Пример #1
0
 def __init__(self, stats):
     """Store the crawl stats object and open the two Mongo collections.

     :param stats: stats object kept on the instance for later use
     """
     env, db = config.MONGO_ENV, config.MONGO_CRAWLER_DB
     self.stats = stats
     # Collections used by this component (opened in the same order as before).
     self.collection_spider = mongo.connectCol(env, db, "spiders")
     self.collection_stats = mongo.connectCol(env, db, "stats")
     # Both are initialised lazily elsewhere.
     self.client = None
     self.kafka = None
Пример #2
0
def get_spider_history(spider_name):
    """Return the stored stats document for *spider_name*, or None.

    :param spider_name: value matched against the ``spider`` field
    """
    stats_col = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                 'stats')
    history = stats_col.find_one({'spider': spider_name})
    # find_one yields None when nothing matches; normalise falsy to None.
    return history if history else None
Пример #3
0
def get_blacklist_category():
    """Return the blacklisted category features.

    Reads every document in the ``proc``/``category``/``category_blacklist``
    collection and collects its ``feature`` field.

    :returns: list of feature values, in cursor order
    """
    coll = mongo.connectCol('proc', 'category', 'category_blacklist')
    # One-pass comprehension instead of the manual append loop (PERF401).
    return [cat['feature'] for cat in coll.find()]
Пример #4
0
def threadCount():
    """Summarise CPU/memory usage and the scrapy crawl processes running now.

    Scans ``ps -ef`` output for ``scrapy crawl <spider>`` command lines,
    looks up each spider's last start time in Mongo, and returns a dict with
    ``percent_cpu_usage``, ``percent_mem_usage``, ``total_crawler_thread``
    and ``running_spiders``.
    """
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  'spiders')
    # NOTE(review): shell=True with a fixed command string is OK here (no
    # untrusted input), but an argument list with shell=False is more robust.
    commands = 'ps -ef| grep scrapy'
    msg = subprocess.Popen(commands,
                           shell=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
    out, err = msg.communicate()
    list_spiders = []
    output = {}
    p = psutil.Process(os.getpid())
    cpu_usage = psutil.cpu_percent(interval=1.0)
    memory_usage = p.memory_percent()

    for item in out.split('\n'):
        # Hoisted: the original called item.find("crawl") twice per line.
        pos = item.find("crawl")
        if pos > 0:
            # len("crawl ") == 6 -> the spider name starts right after it
            # and ends at the next space.
            spider_name = item[pos + 6:].split(' ')[0]
            last_start = getLastStartSpider(collection, spider_name)
            if not last_start:
                continue
            list_spiders.append({'name': spider_name,
                                 'last_start': last_start})
    output['percent_cpu_usage'] = cpu_usage
    output['percent_mem_usage'] = round(memory_usage)
    output['total_crawler_thread'] = len(list_spiders)
    output['running_spiders'] = list_spiders
    return output
Пример #5
0
def getAllSpidersRunning():
    """Return the names of all spiders with ``crawler_status.status == 1``.

    :returns: list of spider names (``doc.spider``), in cursor order
    """
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  "spiders")
    spiders = collection.find({"crawler_status.status": 1})
    # One-pass comprehension instead of the manual append loop (PERF401).
    return [spider['doc']['spider'] for spider in spiders]
Пример #6
0
def getSpider(spider_name):
    print "Mongo server %s" % config.get("mongo_server")
    assert config.get("mongo_server")
    print "Db %s" % config.get("mongo_crawler_db")
    print "  spider name : %s" % spider_name

    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  flags.get("spider_col"))
    data = collection.find_one({"doc.spider": spider_name})
    return data['doc']
Пример #7
0
def get_number_spider_created(days):
    """Count the spiders created within the last *days* days.

    :param days: look-back window, in days
    :returns: dict with ``number_hit`` (count) and ``hits`` (spider names)
    """
    now = time.time()
    query = {'created': {'$gte': now - days * 24 * 60 * 60}}
    # NOTE(review): every other query in this file targets the "spiders"
    # collection; confirm 'spider' (singular) is really intended here.
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  'spider')
    # The original ``if results:`` guard was a no-op — a pymongo cursor is
    # always truthy, and iterating an empty cursor is already safe.
    spiders = [spider['spider'] for spider in collection.find(query)]
    return {'number_hit': len(spiders), 'hits': spiders}
Пример #8
0
def insertSpider(document):
    """Update the stored spider definition, creating it on first insert.

    NOTE(review): this mutates the caller's *document* (assigns ``_id`` to
    it) and uses the deprecated pymongo ``update`` API — both left as-is
    because the upsert sequence below depends on them.

    :param document: spider definition dict; must contain a 'spider' key
    :returns: the pymongo write result of the last update call
    """
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  "spiders")
    data = {'doc': document, 'last_modified': time.time()}

    # Deterministic id derived from the spider name (md5 hex digest).
    data['doc']['_id'] = hashlib.md5(data['doc']['spider']).hexdigest()
    # First try a plain update of an existing record (no upsert) ...
    res = collection.update({'doc.spider': document['spider']}, {"$set": data},
                            upsert=False,
                            multi=False)
    if not res['updatedExisting']:
        # ... and only when nothing matched, seed the crawler_status fields
        # (last stop pushed 14 days back, status 0) and upsert a new record.
        data['crawler_status.last_stop_time'] = time.time() - 86400 * 14
        data['crawler_status.status'] = 0
        data['created'] = time.time()
        res = collection.update({'doc.spider': document['spider']},
                                {"$set": data},
                                upsert=True,
                                multi=False)
    return res
Пример #9
0
def connectDatabase(db_name):
    """Open collection *db_name* in the staging 'hello' database."""
    connection = mongo.connectCol("staging", "hello", db_name)
    return connection
Пример #10
0
#!/usr/bin/env python
# encoding: utf-8
'''
Created on Feb 24, 2016

@author: Quyet
'''
from common import mongo
import requests
coll = mongo.connectCol('staging', 'crawler', 'spiders')


def getSpiderGood():
    spiders = coll.find({"crawler_status.last_stop_time": {"$exists":1}})
    print spiders.count()
    spider_names = []
    for spider in spiders:
        spider_names.append(spider['doc']['spider'])
    return spider_names


def generateSpiderGood():
    spider_names = getSpiderGood()
    count = 1
    for spider in spider_names:
        print "Generate: ", count, spider
        requests.get("http://localhost:6081/generate?spider=" + spider)
        count += 1
    print "Done!"

if __name__=="__main__":
Пример #11
0
#!/usr/bin/env python
# encoding: utf-8
import math
from common import mongo
import time
from operator import itemgetter
import requests
from requests.exceptions import ConnectionError, Timeout
from time import sleep
from common.logger import logging
import config
from pymongo.errors import NetworkTimeout, ServerSelectionTimeoutError

# NOW = time.time()
# Module-level handle to the spiders collection, shared by the functions below.
collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                              "spiders")
# Static capacity table for the crawl servers; free_thread starts at
# max_thread and is presumably decremented as spiders are scheduled —
# TODO confirm against the scheduling code.
CRAWLER_SERVERS_CONFIG = [
    #     {'name_server':'crawl1', 'max_thread':25, 'free_thread':25, 'status':True},
    #     {'name_server':'crawl2', 'max_thread':25, 'free_thread':25, 'status':True},
    {
        'name_server': 'crawler',
        'max_thread': 40,
        'free_thread': 40,
        'status': True
    },
]
# Port the crawler HTTP service listens on (kept as a string for URL building).
SERVER_PORT = '6081'


def getAllValidSpiders():
    one_day_ago = time.time() - 24 * 60 * 60
Пример #12
0
def get_xpath(spider_name):
    """Fetch the stored spider document matched on ``doc.spider``.

    :param spider_name: spider to look up
    :returns: the full document, or None when the spider is unknown
    """
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  'spiders')
    return collection.find_one({'doc.spider': spider_name})
Пример #13
0
def delete_spider_mongo(spider_name):
    """Delete the spider whose ``doc.spider`` equals *spider_name*."""
    spiders_col = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                   "spiders")
    spiders_col.delete_one({'doc.spider': spider_name})