Example #1
    def GET(self):
        try:
            sql = SqlHelper()

            inputs = web.input()
            name = inputs.get('name')

            proxy = Proxy()
            proxy.set_value(
                ip=inputs.get('ip'),
                port=inputs.get('port'),
                country=inputs.get('country', None),
                anonymity=inputs.get('anonymity', None),
                https=inputs.get('https', 'no'),
                speed=inputs.get('speed', -1),
                source=inputs.get('source', name),
            )

            utils.sql_insert_proxy(sql, name, proxy)

            command = "SELECT ip FROM {0} WHERE ip={1} AND port={2}".format(
                name, inputs.get('ip'), inputs.get('port'))
            res = sql.query_one(command)
            return res is None
        except:
            pass

        return False
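A note on this handler: it interpolates user input straight into SQL, which is injection-prone. SqlHelper offers a parameterized path elsewhere in these examples (insert_data with %s placeholders in Example #27, a raw cursor in Example #18); a minimal sketch of the same verification with bound values, assuming sql.cursor is a DB-API cursor:

# Hedged sketch, not the project's code: bind the values instead of
# formatting them in. The table name cannot be a placeholder, so `name`
# still has to be validated separately (e.g. against a whitelist).
command = "SELECT ip FROM {0} WHERE ip=%s AND port=%s".format(name)
sql.cursor.execute(command, (inputs.get('ip'), inputs.get('port')))
exists = sql.cursor.fetchone() is not None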
Example #2
class BaseSpider(Spider):
    name = 'basespider'

    def __init__(self, *a, **kw):
        super(BaseSpider, self).__init__(*a, **kw)

        self.urls = []
        self.headers = {}
        self.timeout = 10

        self.sql = SqlHelper()

        self.dir_log = 'log/proxy/%s' % self.name

        self.is_record_web_page = False

    def init(self):
        self.meta = {
            'download_timeout': self.timeout,
        }

        utils.make_dir(self.dir_log)

        command = utils.get_create_table_command(config.free_ipproxy_table)
        self.sql.execute(command)

    def start_requests(self):
        for i, url in enumerate(self.urls):
            yield Request(
                url=url,
                headers=self.headers,
                meta=self.meta,
                dont_filter=True,
                callback=self.parse_page,
                errback=self.error_parse,
            )

    def parse_page(self, response):
        # subclasses override this to extract proxies from the response
        self.write(response.body)

    def error_parse(self, failure):
        request = failure.request

    def add_proxy(self, proxy):
        utils.sql_insert_proxy(self.sql, config.free_ipproxy_table, proxy)

    def write(self, data):
        if self.is_record_web_page:
            filename = '%s/%s.html' % (
                self.dir_log,
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'))
            with open(filename, 'w') as f:
                f.write(data)

    def close(spider, reason):
        # Scrapy invokes close() on the spider instance, so `spider` is self here
        spider.sql.commit()
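A minimal sketch of a concrete spider built on this base class; the class name, URL, and headers are illustrative assumptions, not project code:

class ExampleProxySpider(BaseSpider):
    name = 'examplespider'

    def __init__(self, *a, **kw):
        super(ExampleProxySpider, self).__init__(*a, **kw)
        self.urls = ['http://www.example.com/proxylist']  # placeholder URL
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.init()  # creates the log dir and the proxy table

    def parse_page(self, response):
        self.write(response.body)
        # extract proxy rows from the response, then store each one, e.g.:
        #   proxy = Proxy()
        #   proxy.set_value(ip=..., port=..., country=..., anonymity=...,
        #                   https=..., speed=-1, source=self.name)
        #   self.add_proxy(proxy)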
Example #3
    def __init__(self, red, key, user):
        self.key = key
        self.red = red

        data = json.loads(user)
        self.product_id = data.get('product_id')
        self.url = data.get('url')
        self.email = data.get('email')
        self.guid = data.get('guid')
        self.spider_name = 'tb_comment'
        self.spargs = data

        self.sql = SqlHelper()
        self.spargs['red'] = self.red
        self.spargs['sql'] = self.sql

        if not os.path.exists('log'):
            os.makedirs('log')

        configure_logging(install_root_handler=False)
        logging.basicConfig(
            filename='log/%s.log' % self.product_id,
            format='%(levelname)s %(asctime)s: %(message)s',
            level=logging.DEBUG,
        )
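With that format string, lines in log/<product_id>.log come out roughly like this (illustrative message):

DEBUG 2017-04-17 16:34:31,516: spider opened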
Example #4
    def _import_data_to_table(self, file, table_name, name_list, type_list):
        result = 0

        # read file
        with open(file, newline='') as csvfile:

            csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')

            firstLine = True
            for row in csvreader:
                # first row denotes table head
                if firstLine:
                    firstLine = False

                elif len(row) == len(name_list):
                    values = []
                    for i in range(len(row)):
                        v = SqlHelper.type_to_dbstatement(row[i], type_list[i])
                        values.append(v)

                    insert_query = SqlHelper.sql_insert_query(
                        table_name, values)
                    success = SqlHelper.execute_statement(
                        self.connection, insert_query)
                    result += int(success)  # count successful inserts

                else:
                    print('ignoring unexpected row: {0}'.format(row))

        return result
Example #5
    @staticmethod
    def _extract_name_and_types(file):
        name_list = []
        type_list = []

        # read file
        with open(file, newline='') as csvfile:

            csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')

            firstLine = True
            for row in csvreader:
                # first row denotes table head
                if firstLine:
                    firstLine = False

                    for i in range(len(row)):
                        name = SqlHelper.sanitize_colname(row[i])
                        name_list.append(name)
                        type_list.append(DbType.NULL)

                elif len(row) == len(name_list):
                    for i in range(len(row)):
                        type_class = SqlHelper.classify_dbtype(row[i])
                        type_list[i] = DbType.order_max(
                            type_list[i], type_class)

        return name_list, type_list
Example #6
    def __init__(self, *a, **kw):
        super(RecipeDetail, self).__init__(*a, **kw)

        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)
Example #7
    def __init__(self, *a, **kw):
        super(GameUrls, self).__init__(*a, **kw)

        self.dir_game = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()

        utils.make_dir(self.dir_game)
Example #8
    def __init__(self, *a, **kw):
        super(GameInfo, self).__init__(*a, **kw)

        self.dir_game = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()

        utils.make_dir(self.dir_game)

        self.error_count = 0
Example #9
    def __init__(self, name=None, **kwargs):
        super(Validator, self).__init__(name, **kwargs)
        self.sql = SqlHelper()

        self.dir_log = 'log/validator/%s' % self.name
        self.timeout = 10

        self.urls = []
        self.headers = None
        self.success_mark = ''
Example #10
    def __init__(self, *a, **kw):
        super(BaseSpider, self).__init__(*a, **kw)

        self.urls = []
        self.headers = {}
        self.timeout = 10

        self.sql = SqlHelper()

        self.dir_log = 'log/proxy/%s' % self.name
Example #11
def randitem(spargs):
    guid = spargs.get('guid', 0)
    utils.push_redis(guid, 0, '正在随机产生商品链接', save_to_mysql=False)

    url = 'https://diviner.taobao.com/diviner?p=610009&callback=jsonpCallbackMoreGood&lid=1&uuid=122270672' \
          '.1492415671516609876050.1492415672.1492415672.1492415672.1&pin=&lim=100&ec=utf-8&_=1492415813682'
    headers = {
        'Host': 'diviner.taobao.com',
        'Referer': 'https://www.taobao.com/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0',
    }
    cookies = {
        '__jda': '122270672.1492415671516609876050.1492415672.1492415672.1492415672.1',
        '__jdb': '122270672.1.1492415671516609876050|1.1492415672',
        '__jdc': '122270672',
        '__jdv': '122270672|direct|-|none|-|1492415671524',
        '__jdu': '1492415671516609876050',
    }

    r = requests.get(url=url, headers=headers, cookies=cookies, timeout=20)
    pattern = re.compile('"sku":(\d+),', re.S)
    ids = re.findall(pattern, r.text)
    id = random.choice(ids)

    url = 'https://item.taobao.com/%s.html' % str(id)
    utils.push_redis(guid,
                     0,
                     '生成商品链接:<a href="%s" target="_blank">%s' % (url, url),
                     save_to_mysql=False)

    sql = SqlHelper()
    command = "SELECT id FROM {table} WHERE id={product_id}". \
        format(table = config.tb_item_table, product_id = id)
    result = sql.query_one(command)

    # if the item is not in the database yet, crawl it
    if result is None:
        cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
              '-a product_id={product_id} -a url={url};'. \
            format(url=str(url), name='tb', dir=settings.BASE_DIR, guid=guid,
                   product_id=id)
        subprocess.Popen(cmd, shell=True)
    else:
        # if it is already in the database, read the stored analysis rows directly
        command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id". \
            format(config.analysis_item_table, id)
        result = sql.query(command)
        for res in result:
            utils.push_redis(guid, res[1], res[2], res[3], save_to_mysql=False)
Example #12
    def __init__(self):
        super(Crawler, self).__init__()
        self.album_prefix = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20={0}&page={1}'
        self.image_prefix = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id={0}&album_id={1}&page={2}'
        self.image_pattern = re.compile('''img.*290x10000.jpg''', re.U)
        self.image_name_pattern = re.compile('''"picId":"(.*?)"''', re.U)
        self.model_pattern = re.compile(
            '''<a class="lady-name" href="(.*?)".*>(.*?)</a>''', re.U)
        self.album_pattern = re.compile('''.*album_id=(.*?)&.*''', re.U)
        self.links = []
        self.ids = []
        self.names = []
        self.sql = SqlHelper()
Example #13
    def GET(self):
        try:
            sql = SqlHelper()
            inputs = web.input()
            name = inputs.get('name')
            ip = inputs.get('ip')
            command = "DELETE FROM {0} WHERE ip=\'{1}\'".format(name, ip)
            sql.execute(command)

            command = "SELECT ip FROM {0} WHERE ip=\'{1}\'".format(name, ip)
            res = sql.query_one(command)
            return res is None
        except:
            pass
        return False
Example #14
    def _create_table(self, file, indices):
        table_name = SqlHelper.get_tablename(file)

        result_create = 0
        result_data = 0
        result_key = 0
        # initialize so the check below is safe when the file yields no columns
        create_success = False

        # create table in database
        name_list, type_list = SqlImporter._extract_name_and_types(file)

        if len(name_list) > 0:
            create_query = SqlHelper.sql_create_table(table_name, name_list,
                                                      type_list)
            create_success = SqlHelper.execute_statement(
                self.connection, create_query)
            self.connection.commit()

        if create_success:
            result_create = 1

            # import data into table
            result_data = self._import_data_to_table(file, table_name,
                                                     name_list, type_list)
            self.connection.commit()

            # attempt to set primary key in first integer-type column
            primary_key = -1
            for i in range(len(type_list)):
                t = type_list[i]
                if t == DbType.INTEGER:
                    success = self._set_primarykey(table_name, i, name_list,
                                                   type_list)
                    self.connection.commit()
                    if success:
                        result_key = 1
                        primary_key = i
                    break

            # set indices on _id named columns if specified
            if indices:
                for i in range(len(type_list)):
                    t = type_list[i]
                    if i != primary_key and t == DbType.INTEGER and (
                            name_list[i] == 'id' or '_id_' in name_list[i]
                            or name_list[i].endswith('_id')):
                        self._set_index(table_name, i, name_list, type_list)

        return result_create, result_data, result_key
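A hypothetical driver tying Examples #4, #5, #14, and #19 together; the SqlImporter constructor and the file names are assumptions, not project code:

importer = SqlImporter()  # assumed to set up self.connection internally
for csv_file in ['foods.csv', 'nutrients.csv']:  # illustrative file names
    status = importer._create_table(csv_file, indices=True)
    SqlImporter._print_import_result(status, csv_file)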
Example #15
def _search_foods_by_nutrient(connection, nutrient, food_name):
    script = []
    rand_id = SqlHelper._get_rand_tableno()
    for l in sql_script_food_descr_nutrient:
        script.append(l.format(rand_id, nutrient, food_name, 'ASC'))
    result_index = 1

    return connection.queries(script, result_index)
Example #16
    def __init__(self, name=None, **kwargs):
        super(JDSpider, self).__init__(name, **kwargs)
        self.product_id = kwargs.get('product_id', -1)
        self.log('product_id:%s' % self.product_id)
        self.item_table = 'item_%s' % self.product_id
        self.product_page = '%s_page' % self.product_id

        self.log_dir = 'log/%s' % self.product_id
        self.is_record_page = False
        if self.is_record_page:
            utils.make_dir(self.log_dir)

        self.sql = SqlHelper()
        self.red = redis.StrictRedis(host=config.redis_host,
                                     port=config.redis_part,
                                     db=config.redis_db,
                                     password=config.redis_pass)
Example #17
def _search_nutrients_by_fdcid(connection, fdc_id):
    script = []
    rand_id = SqlHelper._get_rand_tableno()
    for l in sql_script_nutrition_list_for_fdcid:
        script.append(l.format(rand_id, fdc_id))
    result_index = 1

    return _get_nutrient_list(connection.queries(script, result_index))
Example #18
class SendSms(object):
    def __init__(self):
        self.sql = SqlHelper()

        self.weather_table_name = config.weather_table
        self.user_table_name = config.user_table

    def send_sms(self):
        command = ("SELECT * FROM {};".format(self.user_table_name))
        self.sql.execute(command)
        users = self.sql.cursor.fetchall()
        if users is not None:
            for user in users:
                utils.log('send_sms get user info user:%s' % str(user))
                # check the user-defined time; only send the SMS when it matches
                user_time = user[5]
                time_info = user_time.split(':')

                u_hour = time_info[0]
                u_minute = time_info[1]

                # current system time
                s_hour = datetime.datetime.now().hour
                s_minute = datetime.datetime.now().minute

                if int(u_hour) == s_hour and int(u_minute) == s_minute:
                    utils.log('send sms to user:%s' % str(user))

                    command = (
                        "select * from {0} where city_name='{1}' order by id desc limit 1;"
                        .format(self.weather_table_name, user[3]))
                    self.sql.execute(command)
                    weather = self.sql.cursor.fetchone()
                    if weather is not None:
                        temp_code = 'SMS_41855112'
                        phone = user[2]
                        info = {
                            'name': user[1],
                            'city': user[3],
                            'weather': weather[15],
                            'temp': '%s ~ %s' % (weather[9], weather[8]),
                            'aqilevel': utils.get_aqi_level_info(weather[12]),
                        }

                        sms = AliyunSms()
                        sms.send_sms(temp_code, info, phone)
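send_sms only fires when the current hour and minute equal a user's configured time, so it presumably runs from a per-minute scheduler; a sketch of such a loop (the driver itself is an assumption):

import time

sender = SendSms()
while True:
    sender.send_sms()
    time.sleep(60)  # check roughly once a minute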
Example #19
    @staticmethod
    def _print_import_result(status, file):
        print('table: ' + SqlHelper.get_tablename(file))
        if status[0] > 0:
            print('- created')
        if status[1] > 0:
            print('- filled')
        if status[2] > 0:
            print('- primary key detected')
        print('---')
Example #20
    def _set_primarykey(self,
                        table_name,
                        primary_key,
                        col_list=[],
                        type_list=[]):
        # primary_key is a column index (see Example #14); the ALTER statement
        # needs the column's name, not its position
        sql_stm = 'ALTER TABLE {0} ADD PRIMARY KEY ({1});'.format(
            table_name, col_list[primary_key])

        return SqlHelper.execute_statement(self.connection, sql_stm)
Example #21
    def GET(self):
        try:
            sql = SqlHelper()

            inputs = web.input()
            name = inputs.get('name')
            command = "SELECT * FROM {0}".format(name)
            result = sql.query(command)
            data = [{
                'ip': item[1],
                'port': item[2],
                'speed': item[6]
            } for item in result]
            data = json.dumps(data, indent=4)
            return data
        except Exception:
            pass

        # keep the return type consistent with the success path (a JSON string)
        return json.dumps([])
Example #22
    def GET(self):
        try:
            sql = SqlHelper()
            inputs = web.input()
            name = inputs.get('name')
            anonymity = inputs.get('anonymity', None)
            https = inputs.get('https', None)
            order = inputs.get('order', 'speed')
            sort = inputs.get('sort', 'asc')
            count = inputs.get('count', 100)

            command = ''
            if anonymity is None and https is None:
                command = "SELECT * FROM {name} ORDER BY {order} {sort} LIMIT {count}". \
                    format(name=name, order=order, sort=sort, count=count)
            elif anonymity is not None and https is None:
                command = "SELECT * FROM {name} WHERE anonymity='{anonymity}' ORDER BY {order} {sort} " \
                          "LIMIT {count}". \
                    format(name=name, anonymity=anonymity, order=order, sort=sort, count=count)
            elif anonymity is None and https is not None:
                command = "SELECT * FROM {name} WHERE https='{https}' ORDER BY {order} {sort} LIMIT {count}". \
                    format(name=name, https=https, order=order, sort=sort, count=count)
            else:
                command = "SELECT * FROM {name} WHERE anonymity='{anonymity}' AND https='{https}' ORDER BY " \
                          "{order} {sort} LIMIT {count}". \
                    format(name=name, anonymity=anonymity, https=https, order=order, sort=sort, count=count)
            result = sql.query(command)
            data = [{
                'id': item[0],
                'ip': item[1],
                'port': item[2],
                'anonymity': item[4],
                'https': item[5],
                'speed': item[6],
                'save_time': str(item[8])
            } for item in result]

            data = json.dumps(data, indent=4)
            return data
        except Exception as e:
            utils.log('select exception msg:%s' % e)
Example #23
    def GET(self):
        try:
            sql = SqlHelper()

            inputs = web.input()
            name = inputs.get('name')
            anonymity = inputs.get('anonymity', None)
            https = inputs.get('https', None)
            sort = inputs.get('sort', 'speed')
            count = inputs.get('count', 100)

            command = ''
            if anonymity is None and https is None:
                command = "SELECT * FROM {0} ORDER BY {1} LIMIT {2}".format(
                    name, sort, count)
            elif anonymity is not None and https is None:
                command = "SELECT * FROM {0} WHERE anonymity='{1}' ORDER BY {2} LIMIT {3}". \
                    format(name, anonymity, sort, count)
            elif anonymity is None and https is not None:
                command = "SELECT * FROM {0} WHERE https='{1}' ORDER BY {2} LIMIT {3}". \
                    format(name, https, sort, count)
            else:
                command = "SELECT * FROM {0} WHERE anonymity='{1}' AND https='{2}' ORDER BY {3} LIMIT {4}". \
                    format(name, anonymity, https, sort, count)

            result = sql.query(command)
            data = [{
                'ip': item[1],
                'port': item[2],
                'speed': item[6]
            } for item in result]
            data = json.dumps(data, indent=4)
            return data
        except Exception:
            pass

        return json.dumps([])
Example #24
    def handle(self, *args, **options):
        reload(sys)
        sys.setdefaultencoding('utf-8')
        os.chdir(sys.path[0])

        spargs = utils.arglist_to_dict(options['spargs'])

        if not os.path.exists('log'):
            os.makedirs('log')

        configure_logging(install_root_handler=False)
        logging.basicConfig(filename='log/%s.log' % spargs.get('user_id'),
                            format='%(levelname)s %(asctime)s: %(message)s',
                            level=logging.ERROR)

        guid = spargs.get('guid', '0')
        user_id = spargs.get('user_id', '0')

        logging.warning('user_id:%s' % user_id)
        if guid == '0' or user_id == '0':
            utils.log('分析数据传入参数不对,接收到的参数为: spargs:%s' % spargs)
            utils.push_redis(guid=guid,
                             user_id=user_id,
                             info='分析数据传入参数不对,接收到的参数为:%s' % spargs)
            utils.push_redis(guid=guid, user_id=user_id, info='finish')
            return

        utils.log('开始分析:%s' % spargs)
        sql = SqlHelper()
        red = redis.StrictRedis(host=config.redis_host,
                                port=config.redis_part,
                                db=config.redis_db,
                                password=config.redis_pass)
        spargs['sql'] = sql
        spargs['red'] = red

        # run the spider
        logging.warning(spargs)
        runspider(spargs)

        # start the analysis
        logging.warning(spargs)
        analysis = RealTimeAnalysis(**spargs)
        analysis.run()
Example #25
    def __init__(self, *a, **kwargs):
        super(AssetStoreSpider, self).__init__(*a, **kwargs)

        # directory where downloaded plugins are stored
        self.dir_plugins = 'Plugins/'
        self.dir_all = self.dir_plugins + 'all'

        utils.make_dir(self.dir_plugins)
        utils.make_dir(self.dir_all)

        # list of all plugins
        self.plugin_list = []

        self.sql = SqlHelper()
        self.table_name = config.assetstore_table_name

        self.priority_adjust = 2

        # Unity version
        self.unity_version = ''

        # request headers
        self.headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Connection': 'keep-alive',
            'Host': 'www.assetstore.unity3d.com',
            'Referer': 'https://www.assetstore.unity3d.com/en/',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0',
            'X-Kharma-Version': self.unity_version,
            'X-Requested-With': 'UnityAssetStore',
            'X-Unity-Session': '26c4202eb475d02864b40827dfff11a14657aa41',
        }

        self.init()
Example #26
if __name__ == '__main__':
    if not os.path.exists('log'):
        os.makedirs('log')

    if not os.path.exists('temp'):
        os.makedirs('temp')

    reload(sys)
    sys.setdefaultencoding('utf-8')

    logging.basicConfig(filename='log/job.log',
                        format='%(levelname)s %(asctime)s: %(message)s',
                        level=logging.DEBUG)

    sql = SqlHelper()
    red = redis.StrictRedis(host='localhost', port=6379, db=10)

    init()

    wx = MyWXBot()
    t1 = threading.Thread(target=wx.run_wx)
    t2 = threading.Thread(target=wx.user_query_job)
    t3 = threading.Thread(target=wx.crawl_boss_job)
    t4 = threading.Thread(target=wx.crawl_lagou_job)
    t5 = threading.Thread(target=wx.crawl_liepin_job)
    t1.start()
    t2.start()
    t3.start()
    t4.start()
    t5.start()
Example #27
class GameUrls(Spider):
    name = 'game_urls'

    start_urls = [
        'http://store.steampowered.com/search/?sort_by=Released_DESC&page=%s' %
        n for n in range(1, 1058)
    ]

    def __init__(self, *a, **kw):
        super(GameUrls, self).__init__(*a, **kw)

        self.dir_game = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()

        utils.make_dir(self.dir_game)

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`type` CHAR(10) NOT NULL,"
                   "`name` TEXT NOT NULL,"
                   "`url` TEXT NOT NULL,"
                   "`is_crawled` CHAR(5) DEFAULT 'no',"
                   "`page` INT(5) NOT NULL ,"
                   "PRIMARY KEY(id)"
                   ") ENGINE=InnoDB".format(config.steam_game_urls_table))
        self.sql.create_table(command)

    def start_requests(self):
        for i, url in enumerate(self.start_urls):
            yield Request(
                url=url,
                headers={
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Connection': 'keep-alive',
                    'Host': 'store.steampowered.com',
                    'Upgrade-Insecure-Requests': '1',
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 '
                                  'Firefox/51.0',
                },
                meta={
                    'url': url,
                    'page': i + 1,
                },
                dont_filter=True,
                callback=self.parse_all,
                errback=self.error_parse,
            )

    def parse_all(self, response):
        # file_name = '%s/%s.html' % (self.dir_game, response.meta.get('page'))
        # self.save_page(file_name, response.body)

        self.log('parse_all url:%s' % response.url)

        game_list = response.xpath(
            '//div[@id="search_result_container"]/div[2]/a').extract()
        count = 0
        for game in game_list:
            sel = Selector(text=game)
            url = sel.xpath('//@href').extract_first()

            id, type = self.get_id(url)
            # id = sel.xpath('//@data-ds-appid').extract_first()
            name = sel.xpath(
                '//div[@class="col search_name ellipsis"]/span/text()'
            ).extract_first()

            msg = (None, type, name, url, 'no', response.meta.get('page'))
            command = ("INSERT IGNORE INTO {} "
                       "(id, type, name, url, is_crawled, page)"
                       "VALUES(%s, %s, %s, %s, %s, %s)".format(
                           config.steam_game_urls_table))

            self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def get_id(self, url):
        type = ''
        if '/sub/' in url:
            pattern = re.compile(r'/sub/(\d+)/')
            type = 'sub'
        elif '/app/' in url:
            pattern = re.compile(r'/app/(\d+)/', re.S)
            type = 'app'
        elif '/bundle/' in url:
            pattern = re.compile(r'/bundle/(\d+)/', re.S)
            type = 'bundle'
        else:
            pattern = re.compile(r'/(\d+)/', re.S)
            type = 'other'
            utils.log('get_id other url:%s' % url)

        id = re.search(pattern, url)
        if id:
            id = id.group(1)
            return id, type

        utils.log('get_id error url:%s' % url)
        return 0, 'error'

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
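For reference, get_id() maps typical store URLs as follows (illustrative calls, not project data):

# self.get_id('http://store.steampowered.com/app/570/')    -> ('570', 'app')
# self.get_id('http://store.steampowered.com/sub/12345/')  -> ('12345', 'sub')
# self.get_id('http://store.steampowered.com/bundle/232/') -> ('232', 'bundle')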
Example #28
class Validator(Spider):
    name = 'base'
    concurrent_requests = 16
    retry_enabled = False

    def __init__(self, name=None, **kwargs):
        super(Validator, self).__init__(name, **kwargs)
        self.sql = SqlHelper()

        self.dir_log = 'log/validator/%s' % self.name
        self.timeout = 10

        self.urls = []
        self.headers = None
        self.success_mark = ''
        self.is_record_web_page = False

    def init(self):
        utils.make_dir(self.dir_log)

        command = utils.get_create_table_command(self.name)
        self.sql.create_table(command)

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {
            'CONCURRENT_REQUESTS': cls.concurrent_requests,
            'RETRY_ENABLED': cls.retry_enabled,
        }, priority='spider')

    def start_requests(self):
        count = utils.get_table_length(self.sql, self.name)
        count_free = utils.get_table_length(self.sql, config.httpbin_table)

        ids = utils.get_table_ids(self.sql, self.name)
        ids_free = utils.get_table_ids(self.sql, config.httpbin_table)

        for i in range(0, count + count_free):
            table = self.name if (i < count) else config.httpbin_table
            id = ids[i] if i < count else ids_free[i - len(ids)]

            proxy = utils.get_proxy_info(self.sql, table, id)
            if proxy is None:
                continue

            for url in self.urls:
                cur_time = time.time()
                yield Request(
                    url=url,
                    headers=self.headers,
                    meta={
                        'cur_time': cur_time,
                        'download_timeout': self.timeout,
                        'proxy_info': proxy,
                        'table': table,
                        'id': proxy.get('id'),
                        'proxy': 'http://%s:%s' % (proxy.get('ip'), proxy.get('port')),
                        'vali_count': proxy.get('vali_count', 0),
                    },
                    dont_filter=True,
                    callback=self.success_parse,
                    errback=self.error_parse,
                )

    def success_parse(self, response):
        utils.log('success_parse speed:%s meta:%s' %
                  (time.time() - response.meta.get('cur_time'), response.meta))

        proxy = response.meta.get('proxy_info')
        table = response.meta.get('table')
        id = response.meta.get('id')
        ip = proxy.get('ip')

        self.save_page(ip, response.body)

        if self.success_mark == '' or self.success_mark in response.body:
            speed = time.time() - response.meta.get('cur_time')
            if table == self.name:
                if speed > self.timeout:
                    command = utils.get_delete_data_command(table, id)
                    self.sql.execute(command)
                else:
                    vali_count = response.meta.get('vali_count', 0) + 1
                    command = utils.get_update_data_command(
                        table, id, speed, vali_count)
                    self.sql.execute(command)
            else:
                if speed < self.timeout:
                    command = utils.get_insert_data_command(self.name)
                    msg = (None, proxy.get('ip'), proxy.get('port'),
                           proxy.get('country'), proxy.get('anonymity'),
                           proxy.get('https'), speed, proxy.get('source'),
                           None, 1)

                    self.sql.insert_data(command, msg, commit=True)
        else:
            # no success mark found: the response is bogus, so delete this ip
            # from the current table
            if table == self.name:
                command = utils.get_delete_data_command(table, id)
                self.sql.execute(command)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse value:%s url:%s meta:%s' %
                  (failure.value, request.url, request.meta))

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')
        id = failure.request.meta.get('id')

        if table == self.name:
            command = utils.get_delete_data_command(table, id)
            self.sql.execute(command)
        else:
            # TODO: on validation failure, handle specific error types individually
            pass

            #
            # request = failure.request.meta
            # utils.log('request meta:%s' % str(request))
            #
            # # log all errback failures,
            # # in case you want to do something special for some errors,
            # # you may need the failure's type
            # self.logger.error(repr(failure))
            #
            # #if isinstance(failure.value, HttpError):
            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            #
            # #elif isinstance(failure.value, DNSLookupError):
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            #
            # #elif isinstance(failure.value, TimeoutError):
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)

    def save_page(self, ip, data):
        filename = '{time} {ip}'.format(
            time=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'),
            ip=ip)
        utils.log('filename:%s' % filename)

        if self.is_record_web_page:
            with open('%s/%s.html' % (self.dir_log, filename), 'w') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
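A hypothetical concrete validator on top of this base class (compare Example #9); the target URL and the empty success mark are illustrative assumptions:

class HttpbinValidator(Validator):
    name = 'httpbin'

    def __init__(self, name=None, **kwargs):
        super(HttpbinValidator, self).__init__(name, **kwargs)
        # httpbin echoes the request back, which is convenient for proxy checks
        self.urls = ['http://httpbin.org/get']
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.success_mark = ''  # empty mark: any readable response counts
        self.init()  # assumed responsibility of the subclass: dir + table setup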
Example #29
class RecipeDetail(CrawlSpider):
    name = "recipe_detail"

    base_url = 'https://www.xiachufang.com'
    
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.xiachufang.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
    }

    

    def __init__(self, *a , **kw):
        super(RecipeDetail, self).__init__(*a, **kw)

        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)


    def init(self):
        command = (
            "CREATE TABLE IF NOT EXISTS {} ("
            "`id` INT(8) NOT NULL AUTO_INCREMENT,"
            "`name` CHAR(20) NOT NULL COMMENT 'recipe name',"
            "`recipe_id` INT(12) NOT NULL COMMENT 'recipe ID',"
            "`source_name` CHAR(20) NOT NULL COMMENT 'source name',"
            "`source_id` INT(8) NOT NULL COMMENT 'source ID',"
            "`create_time` DATETIME NOT NULL,"
            "PRIMARY KEY(id)"
            ") ENGINE=InnoDB".format(config.item_detail_table)
        )

        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * from {}".format(config.item_list_table)
        data = self.sql.query(command)

        for i, recipe in enumerate(data):
            if 8999 < recipe[0] < 10000:
                url = self.base_url + recipe[2]
                utils.log(url)
                yield Request(
                    url=url,
                    headers=self.header,
                    callback=self.parse_all,
                    errback=self.error_parse,
                    meta={'re_id': recipe[3], 're_name': recipe[1]},
                    dont_filter=True,
                )


    def parse_all(self, response):
        utils.log(response.url)
        if response.status == 429:
            raise CloseSpider('Too many requests, IP banned')
        if response.status == 200:
            file_name = '%s/recipe.html' % (self.dir_name)
            self.save_page(file_name, response.body)
            sources = response.xpath("//div[@class='ings']//tr").extract()

            for source in sources:
                sel = Selector(text=source)

                source_name = sel.xpath("//a/text()").extract_first()
                url = sel.xpath("//a/@href").extract_first()
                if source_name is not None and url is not None:
                    source_id = url.split('/')[-2]
                    r_name = response.meta["re_name"]
                    r_id = response.meta["re_id"]
                    dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    msg = (None, r_name, r_id, source_name, source_id, dt)
                    command = ("INSERT IGNORE INTO {} "
                                "(id, name, recipe_id, source_name, source_id, create_time)"
                                "VALUES(%s,%s,%s,%s,%s,%s)".format(config.item_detail_table)
                    )
                    self.sql.insert_data(command, msg)


    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))


    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
Example #30
# -*- coding: utf-8 -*-

import sys

import time
from selenium import webdriver
from scrapy import Selector
from sqlhelper import SqlHelper

sql = SqlHelper()

# command = (
#     "CREATE TABLE IF NOT EXISTS {} ("
#     "`id` CHAR(10) NOT NULL UNIQUE,"
#     "`name` CHAR(10) NOT NULL,"
#     "PRIMARY KEY(name)"
#     ") ENGINE=InnoDB".format('liepin_city_id'))
# sql.create_table(command)
#
# reload(sys)
# sys.setdefaultencoding('utf-8')
#
# url = 'https://www.liepin.com/zhaopin/?sfrom=click-pc_homepage-centre_searchbox-search_new&key=python'
#
# driver = webdriver.PhantomJS()
# driver.get(url = url)
# driver.save_screenshot('liepin.png')
# with open('liepin.html', 'w')  as f:
#     f.write(driver.page_source)
#     f.close()
#