Example #1
    def __init__(self, stats):
        self.stats = stats
        # collection handles for spider metadata and crawl statistics
        self.collection_spider = mongo.connectCol(config.MONGO_ENV,
                                                  config.MONGO_CRAWLER_DB,
                                                  "spiders")
        self.collection_stats = mongo.connectCol(config.MONGO_ENV,
                                                 config.MONGO_CRAWLER_DB,
                                                 "stats")
        self.client = None
        self.kafka = None
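Every example here obtains a collection handle through mongo.connectCol(env, db, collection), which is not shown in the source. A minimal sketch of what such a helper could look like, assuming pymongo and a hypothetical environment-to-URI mapping (MONGO_HOSTS is an invented name):

from pymongo import MongoClient

MONGO_HOSTS = {'staging': 'mongodb://localhost:27017'}  # hypothetical mapping


def connectCol(env, db_name, col_name):
    # resolve the server for this environment, then return the collection
    client = MongoClient(MONGO_HOSTS[env])
    return client[db_name][col_name]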
Example #2
def get_spider_history(spider_name):
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  'stats')
    # find_one already returns None when nothing matches
    return collection.find_one({'spider': spider_name})
Example #3
def get_blacklist_category():
    coll = mongo.connectCol('proc', 'category', 'category_blacklist')
    # collect the 'feature' value of every blacklisted category
    return [cat['feature'] for cat in coll.find()]
Example #4
def threadCount():
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  'spiders')
    commands = 'ps -ef | grep scrapy'
    msg = subprocess.Popen(commands,
                           shell=True,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
    out, err = msg.communicate()
    list_spiders = []
    output = {}
    p = psutil.Process(os.getpid())
    cpu_usage = psutil.cpu_percent(interval=1.0)
    memory_usage = p.memory_percent()

    list_res = out.split('\n')
    for item in list_res:
        idx = item.find("crawl")
        if idx > 0:
            # the spider name follows "crawl " (6 characters) on the command line
            start = idx + 6
            res = item[start:]
            spider_name = res.split(' ')[0]
            last_start = getLastStartSpider(collection, spider_name)
            if not last_start:
                continue
            spider_info = {'name': spider_name, 'last_start': last_start}
            list_spiders.append(spider_info)
    output['percent_cpu_usage'] = cpu_usage
    output['percent_mem_usage'] = round(memory_usage)
    output['total_crawler_thread'] = len(list_spiders)
    output['running_spiders'] = list_spiders
    return output
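For reference, threadCount() returns a dict of this shape (the values below are illustrative, not real measurements):

# illustrative output only
{
    'percent_cpu_usage': 12.5,
    'percent_mem_usage': 3,
    'total_crawler_thread': 1,
    'running_spiders': [{'name': 'example_spider', 'last_start': 1456300000.0}]
}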
Example #5
def getAllSpidersRunning():
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  "spiders")
    # crawler_status.status == 1 marks a running spider
    spiders = collection.find({"crawler_status.status": 1})
    return [spider['doc']['spider'] for spider in spiders]
Example #6
def getSpider(spider_name):
    print "Mongo server %s" % config.get("mongo_server")
    assert config.get("mongo_server")
    print "Db %s" % config.get("mongo_crawler_db")
    print "  spider name : %s" % spider_name

    # "spiders" is assumed here: the original read flags.get("spider_col"),
    # but flags is undefined in this snippet
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  "spiders")
    data = collection.find_one({"doc.spider": spider_name})
    # guard against an unknown spider name
    return data['doc'] if data else None
Example #7
def get_number_spider_created(days):
    now = time.time()
    # match spiders created within the last `days` days
    query = {'created': {'$gte': now - days * 24 * 60 * 60}}
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  'spider')
    # a pymongo cursor is always truthy, so iterate it directly
    spiders = [spider['spider'] for spider in collection.find(query)]
    return {'number_hit': len(spiders), 'hits': spiders}
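A quick usage sketch (the argument and the return values are illustrative):

get_number_spider_created(7)
# -> {'number_hit': 2, 'hits': ['spider_a', 'spider_b']}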
Example #8
def insertSpider(document):
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  "spiders")
    data = {'doc': document, 'last_modified': time.time()}

    # derive a stable _id from the spider name
    data['doc']['_id'] = hashlib.md5(data['doc']['spider']).hexdigest()
    # first try to update an existing record in place
    res = collection.update({'doc.spider': document['spider']}, {"$set": data},
                            upsert=False,
                            multi=False)
    if not res['updatedExisting']:
        # new spider: seed its status fields, then upsert
        data['crawler_status.last_stop_time'] = time.time() - 86400 * 14
        data['crawler_status.status'] = 0
        data['created'] = time.time()
        res = collection.update({'doc.spider': document['spider']},
                                {"$set": data},
                                upsert=True,
                                multi=False)
    return res
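A usage sketch for insertSpider; only the 'spider' key is required by the code above, and the name used here is hypothetical:

document = {'spider': 'example_spider'}  # minimal assumed document
res = insertSpider(document)
# res['updatedExisting'] reports whether an existing record was modified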
Example #9
def connectDatabase(db_name):
    # note: despite its name, db_name is passed as the collection name here
    return mongo.connectCol("staging", "hello", db_name)
Example #10
#!/usr/bin/env python
# encoding: utf-8
'''
Created on Feb 24, 2016

@author: Quyet
'''
from common import mongo
import requests
coll = mongo.connectCol('staging', 'crawler', 'spiders')


def getSpiderGood():
    spiders = coll.find({"crawler_status.last_stop_time": {"$exists": 1}})
    print spiders.count()
    spider_names = []
    for spider in spiders:
        spider_names.append(spider['doc']['spider'])
    return spider_names


def generateSpiderGood():
    spider_names = getSpiderGood()
    count = 1
    for spider in spider_names:
        print "Generate: ", count, spider
        requests.get("http://localhost:6081/generate?spider=" + spider)
        count += 1
    print "Done!"

if __name__ == "__main__":
    # assumed entry point; the source file is truncated at this line
    generateSpiderGood()
Example #11
#!/usr/bin/env python
# encoding: utf-8
import math
from common import mongo
import time
from operator import itemgetter
import requests
from requests.exceptions import ConnectionError, Timeout
from time import sleep
from common.logger import logging
import config
from pymongo.errors import NetworkTimeout, ServerSelectionTimeoutError

# NOW = time.time()
collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                              "spiders")
CRAWLER_SERVERS_CONFIG = [
    #     {'name_server':'crawl1', 'max_thread':25, 'free_thread':25, 'status':True},
    #     {'name_server':'crawl2', 'max_thread':25, 'free_thread':25, 'status':True},
    {
        'name_server': 'crawler',
        'max_thread': 40,
        'free_thread': 40,
        'status': True
    },
]
SERVER_PORT = '6081'


def getAllValidSpiders():
    one_day_ago = time.time() - 24 * 60 * 60
    # the source is truncated past this point; a minimal assumed completion:
    # spiders whose last stop is at least a day old are considered valid
    spiders = collection.find(
        {'crawler_status.last_stop_time': {'$lte': one_day_ago}})
    return [spider['doc']['spider'] for spider in spiders]
Example #12
def get_xpath(spider_name):
    coll = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                            'spiders')
    spider = coll.find_one({'doc.spider': spider_name})
    return spider
Example #13
def delete_spider_mongo(spider_name):
    collection = mongo.connectCol(config.MONGO_ENV, config.MONGO_CRAWLER_DB,
                                  "spiders")
    collection.delete_one({'doc.spider': spider_name})