def worker(final_url, total_num, dl):
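    # Fetch a Yahoo Finance analysis page, pull the quarterly/yearly estimate
    # cells, and persist the record to MySQL plus a local text file.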
    try:
        current_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(time.time()))
        current_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        mutex.acquire()
        try:
            url = final_url.pop(0)
        except IndexError:
            url = None
        finally:
            mutex.release()
        if url is None:
            return
        ontime_num = len(final_url)
        print u'[%s],parse,processing: %s in %s' % (
            str(current_time), str(total_num - ontime_num), str(total_num))
        counter = 0
        while True:
            # dl = downloader()
            html = dl.get_page(url, [], mutex, ['login'], min_len=40)
            if html:
                item = {}
                res = lxml.html.document_fromstring(html)
                item['stk'] = get_data(re.findall(re.compile('\?p=(.*)'), url),
                                       0)
                item['currentQtr'] = get_data(
                    res.xpath('//td[@data-reactid="128"]/span/text()'), 0)
                item['nextQtr'] = get_data(
                    res.xpath('//td[@data-reactid="130"]/span/text()'), 0)
                item['currentYear'] = get_data(
                    res.xpath('//td[@data-reactid="132"]/span/text()'), 0)
                item['nextYear'] = get_data(
                    res.xpath('//td[@data-reactid="134"]/span/text()'), 0)
                item['current_date'] = current_date
                #print item['stk'],item['currentQtr'],item['current_date']
                mutex.acquire()
                try:
                    db = MySql('36.110.128.75', 3306, 'root', 'Bigdata1234',
                               'crawler_db')
                    db.insert_single(
                        '''INSERT INTO finance_yahoo(stk,currentQtr,nextQtr,currentYear,nextYear,currentDate)VALUES ('%s','%s','%s','%s','%s','%s')'''
                        % (item['stk'], item['currentQtr'], item['nextQtr'],
                           item['currentYear'], item['nextYear'],
                           item['current_date']))
                    db.close()
                except Exception, e:
                    print 'save to db error...', e
                finally:
                    write2file('yahoo_finance.txt', item)
                mutex.release()
                break
            else:
                counter += 1
                if counter > 3:
                    print u'worker failure more than 3, break...'
                    break
    except Exception, e:
        print 'worker error: ', e
def worker_shop(final_url, total_num, dl):
    # Parse shop URLs ("more products" pages): extract the shop header block,
    # build the per-shop listing URL, and record it in MySQL and shop_url.txt.
    global shop_cat_set
    try:
        current_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        mutex.acquire()
        try:
            url = final_url.pop(0)
        except IndexError:
            url = None
        finally:
            mutex.release()
        if url is None:
            return
        ontime_num = len(final_url)
        print u'[%s],parse,processing: %s in %s' % (str(current_time), str(total_num - ontime_num), str(total_num))
        counter = 0
        while True:
            # dl = downloader()
            res_0 = dl.get_page(url, [], mutex)
            if res_0:
                res = lxml.html.document_fromstring(res_0)
            else:
                res = res_0
            # dl.delete()
            if res:
                shop_blocks = res.xpath('//div[@class="shopHeader-info"]')
                base_url_more = 'https://list.tmall.com/search_shopitem.htm?user_id=%s&q=&sort=s&cat=%s&from=_1_&is=p'
                if len(shop_blocks) > 0:
                    for shop_block in shop_blocks:
                        shop_url = get_data(shop_block.xpath('./a/@href'), 0)
                        shop_brand = get_data(shop_block.xpath('./p/span/text()'), 0).replace("'", '')
                        user_id = get_data(re.findall(re.compile('\?user_id=(\d+)&'), shop_url), 0)
                        #base_url_more = 'https://list.tmall.com/search_shopitem.htm?%s&q=&sort=s&%s&%s'
                        cat_id = get_data(re.findall(re.compile('cat=(\d+)'), url), 0)
                        #para2 = get_data(re.findall(re.compile('(start_price=\d+&end_price=\d+)'), url), 0)
                        #url_more = base_url_more % (shop_id, para1, para2)
                        if (user_id, cat_id) not in shop_cat_set:
                            url_more = base_url_more % (user_id, cat_id)
                            shop_cat_set.add((user_id, cat_id))
                            mutex.acquire()
                            try:
                                db = MySql('ip', 3306, 'root', 'pwd', 'db')
                                db.insert_single('''INSERT INTO tmall_shop_url(cat_id,shop_url,brands) VALUES ('%s','%s','%s')''' % (cat_id, url_more, shop_brand))
                                db.close()
                            except Exception, e:
                                print 'save to db error: ', str(e)
                            finally:
                                with codecs.open(os.path.join(current_dir, 'shop_url.txt'), 'a', 'utf-8') as f:
                                    f.write(url_more+'&/#'+ shop_brand + '\n')
                            mutex.release()
                break
            else:
                counter += 1
                if counter > 3:
                    print u'worker_shop failure more than 3, break...'
                    break
    except Exception, e:
        print 'worker_shop error: ', e
def crawler_tmall_shop_ultra(final_url, total_num, dl, mutex):
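    # Pop one category task, split it into price ranges, crawl the listing
    # pages, then mark the task as finished (status = 2) in task_list and log it.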
    try:
        current_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        mutex.acquire()
        try:
            task_item = final_url.pop(0)
        except IndexError:
            task_item = None
        finally:
            mutex.release()
        if task_item is not None:
            global split_cat_ids
            split_cat_ids = []
            ontime_num = len(final_url)
            print u'[%s],parse,processing: %s in %s' % (str(current_time), str(total_num - ontime_num), str(total_num))
            pre_url = (task_item, 0, 100000)
            split_price(pre_url)
            run_get_max_page()
            run_parse_html(1) # thread num
            # Update the task record in the database and in the log file
            # successfully received msg update to db
            db = MySql('ip', 3306, 'root', 'pwd', 'db')
            db.update("""update task_list set status = 2 where task_item = '%s'"""%task_item)
            db.close()
            func_name = get_func_name()
            try:
                with codecs.open(os.path.join(files_dir, 'task_url_old.txt'), 'a', 'utf-8') as f:
                    f.write(func_name + '&/#' + task_item + '\n')
                # Clear cached final_url files
                for each_file in os.listdir(current_dir):
                    if 'final_url' in each_file:
                        os.remove(os.path.join(current_dir, each_file))
            except:
                pass

    except Exception, e:
        print 'crawler_tmall_shop_ultra error: ', e
def check_tcp_status(ip, port, message):
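    # Reconcile local task files with the completed log, request new tasks from
    # the dispatch server over TCP when the pool is empty, persist them to
    # task_url.txt, and hand the pool to Worker.run().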
    try:
        wk = Worker()
        # Aggregate the task_pool from the local task files
        task_pool = []
        task_pool_old = []
        # func_name = 'crawler_tmall'
        for each_file in os.listdir(settings.files_dir):
            if each_file == 'task_url.txt':
                try:
                    with codecs.open(
                            os.path.join(settings.files_dir, each_file), 'r',
                            'utf-8') as f:
                        task_pool = [(x.split('\n')[0].split('&/#')[0],
                                      x.split('\n')[0].split('&/#')[1])
                                     for x in f]
                except:
                    pass
            if each_file == 'task_url_old.txt':
                try:
                    with codecs.open(
                            os.path.join(settings.files_dir, each_file), 'r',
                            'utf-8') as f:
                        task_pool_old = [(x.split('\n')[0].split('&/#')[0],
                                          x.split('\n')[0].split('&/#')[1])
                                         for x in f]
                except:
                    pass
        # Keep tasks that appear in only one of the two lists (drops tasks already logged as completed)
        task_pool = list(set(task_pool) ^ set(task_pool_old))
        if len(task_pool) == 0:
            #wk = Worker()
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            server_address = (ip, port)
            sock.connect(server_address)
            print 'Connecting to %s:%s.' % server_address
            #while True:
            #message = raw_input('-->')
            if message:
                print 'Sending "%s".' % message
                sock.sendall(message)
            data = sock.recv(102400)
            data = json.loads(data)

            print 'Closing socket.'
            sock.sendall('bye')
            sock.close()

            if data['task'] != 'no more':
                # successfully received msg update to db
                db = MySql('ip', 3306, 'root', 'pwd', 'db')
                for each_task in data['task']['task_list']:
                    task_url = each_task
                    db.update(
                        """update task_list set status = 1 where task_item = '%s'"""
                        % task_url)
                db.close()
                pool_temp = zip(data['task']['func_list'],
                                data['task']['task_list'])
                task_pool.extend(pool_temp)
        print 'task_num', len(task_pool)
        # Clear the task_url record files
        for each_file in os.listdir(settings.files_dir):
            if 'task_url' in each_file:
                os.remove(os.path.join(settings.files_dir, each_file))
        if len(task_pool) > 0:
            with codecs.open(os.path.join(settings.files_dir, 'task_url.txt'),
                             'a', 'utf-8') as f:
                for each_task in task_pool:
                    f.write(each_task[0] + '&/#' + each_task[1] + '\n')
            wk.run(task_pool)
            # feedback = 'got it'
            #print 'Sending "%s".'% 'feedback'
            #sock.sendall(feedback)
            time.sleep(randint(1, 3))
        else:
            print 'no more task...'
            time.sleep(10 * 60)
        # Clear the task_url record files
        for each_file in os.listdir(settings.files_dir):
            if 'task_url' in each_file:
                os.remove(os.path.join(settings.files_dir, each_file))
    except Exception, e:
        print 'check_tcp_status err', e
Example #5
except Exception, e:
    print 'guba error: ', e

# xueqiu spider, runs once per day
try:
    if curr_hour in ['08']:
        dt_xueqiu = detector_xueqiu()
        dt_xueqiu.run()
except Exception, e:
    print 'xueqiu error: ', e

# tieba, once per day: run the detector at 08:00 to query the previous batch of data
try:
    if curr_hour in ['08']:
        dt_tieba = detector_tieba()
        dt_tieba.run()
except Exception, e:
    print 'tieba error: ', e

time.sleep(5)

# **********************************************************
# Check whether any value exceeds its alert threshold, pick the person in charge, and send an email
db = MySql('ip', 3307, 'user', 'pwd', 'db')
q_res = db.select('SELECT * FROM detector WHERE status = 0')
db.update('UPDATE detector SET STATUS = 1 WHERE STATUS = 0')
db.close()


sendmsg(q_res)
def worker_parse_detail(url, brand, dl, mutex):
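    # Fetch a Tmall product detail page, extract price/sales/title/shop fields,
    # and insert the record into tmall_product (with a text-file fallback).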
    try:
        #print chardet.detect(brand)
        crawl_time_stamp = str(time.time())
        res = dl.get_page(url, [], mutex)
        # dl.delete()
        if res:
            item = {}
            res = lxml.html.document_fromstring(res)
            try:
                item['cat_id'] = get_data(
                    re.findall(re.compile('cat_id=(\d+)&'), url), 0)
            except:
                item['cat_id'] = 'none'
            try:
                item['cat_name'] = catDic[item['cat_id']]
            except:
                item['cat_name'] = 'none'
            try:
                item['product_id'] = get_data(
                    re.findall(re.compile('[?&]id=(\d+)&'), url), 0)
            except:
                item['product_id'] = 'none'
            try:
                item['sku_id'] = get_data(
                    re.findall(re.compile('skuId=(\d+)&'), url), 0)
            except:
                item['sku_id'] = 'none'
            try:
                item['price'] = get_data(
                    res.xpath('//span[@class="tm-price"]/text()'), -1)
            except:
                item['price'] = 'none'
            try:
                item['sales'] = get_data(
                    res.xpath(
                        '//li[@class="tm-ind-item tm-ind-sellCount "]/div[@class="tm-indcon"]/span[@class="tm-count"]/text()'
                    ), 0)
            except:
                item['sales'] = 'none'
            #item['brand'] = get_data(res.xpath('//div[@id="J_BrandAttr"]/div[@class="name"]/a/text()'),0).strip()
            #try:
            #    item['brand'] = brand.decode('utf-8')
            #except:
            try:
                item['brand'] = brand
            except:
                item['brand'] = 'none'
            #print brand
            try:
                item['title'] = get_data(
                    res.xpath('//h1[@data-spm="1000983"]/a/text()'),
                    0).strip()
            except:
                item['title'] = 'none'
            if item['title'] == 'None':
                item['title'] = get_data(
                    res.xpath('//h1[@data-spm="1000983"]/text()'), 0).strip()
            #try:
            #    with codecs.open(os.path.join(current_dir, 'code_temp.txt'), 'wb', 'utf-8') as f:
            #        f.write(item['title'])
            #except:
            #    with open(os.path.join(current_dir, 'code_temp.txt'), 'wb',) as f:
            #        f.write(item['title'])
            #with codecs.open(os.path.join(current_dir, 'code_temp.txt'), 'r', 'utf-8') as f:
            #    item['title'] = [x for x in f][0].replace('\'','')
            item['title'] = item['title'].replace('\'', '')

            item['flag'] = flag
            item['crawl_time_stamp'] = crawl_time_stamp
            try:
                item['shop_name'] = get_data(
                    res.xpath('//a[@class="slogo-shopname"]/strong/text()'), 0)
            except:
                item['shop_name'] = 'none'

            item['price_range'] = 'None'
            item['worker_num'] = port

            mutex.acquire()
            try:
                db = MySql('ip', 3306, 'root', 'pwd', 'db')
                db.insert_single(
                    '''INSERT INTO tmall_product(sku_id,product_id,cat_name,title,price,crawl_time,brand,sales,shop_name,flag,price_range,cat_id,worker_num)VALUES ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')'''
                    % (item['sku_id'], item['product_id'], item['cat_name'],
                       item['title'], item['price'], item['crawl_time_stamp'],
                       item['brand'], item['sales'], item['shop_name'],
                       item['flag'], item['price_range'], item['cat_id'],
                       item['worker_num']))
                db.close()
            except Exception, e:
                print 'save to db error: ', str(e)
            finally:
                write2file(
                    os.path.join(
                        current_dir,
                        port + '_' + flag + '_' + 'tmall_product_info.txt'),
                    item)
            mutex.release()
    except Exception, e:
        print 'worker_parse_detail error: ', e
def crawler_tmall(final_url, total_num, dl, mutex):
    # Parse product URLs from a category listing page, crawl each product
    # detail page via worker_parse_detail, then mark the shop URL as done.
    shop_brand = ''
    is_q = True
    try:
        current_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(time.time()))
        mutex.acquire()
        try:
            pre_url = final_url.pop(0)
        finally:
            mutex.release()
        try:
            url = pre_url.split('&/#')[0]
            shop_brand = pre_url.split('&/#')[1]
        except:
            url = pre_url
            shop_brand = 'None'
        ontime_num = len(final_url)
        print u'[%s],parse,processing: %s in %s' % (
            str(current_time), str(total_num - ontime_num), str(total_num))
        global time_start, time_scope
        time_delta = time.time() - time_start
        if time_delta < time_scope and (total_num - ontime_num) != 1:
            print u'time_delta left %d s' % (time_scope - time_delta)
            time.sleep(time_scope - time_delta)
        counter = 0
        while True:
            # dl = downloader()
            base_url = 'https://www.tmall.com/'
            url_pre = 'https://list.tmall.com/search_product.htm?' + re.findall(
                '(cat.*)', url)[0] + '&style=g'
            url_bef = 'https://list.tmall.com/search_product.htm?' + re.findall(
                '(cat.*)', url)[0] + '&style=w'
            time.sleep(randint(5, 20))
            res_0 = dl.get_page(url.replace('&q=', ''), [], mutex)
            # dl.delete()
            time_start = time.time()
            if res_0:
                #print len(res_0)
                res = lxml.html.document_fromstring(res_0)
                total_num_rule = '//p[@class="crumbTitle"]/span/text()'
                max_page_rule = u'共(\d+)页'  # matches the "N pages in total" counter on the listing page
                product_num = get_data(res.xpath(total_num_rule), 0)
                max_page = get_data(
                    re.findall(re.compile(max_page_rule), res_0), 0)
                #print product_num, max_page
                if product_num != 'None' and max_page != 'None':
                    for each_page in range(1, int(max_page) + 1):
                        blocks_rule = '//div[@class="productImg-wrap"]/a/@href'
                        final_url_product = url + '&totalPage=%d&jumpto=%d' % (
                            int(max_page), each_page)
                        # print 'final_url', final_url
                        res_1 = lxml.html.document_fromstring(
                            dl.get_page(final_url_product, [], mutex))
                        product_urls = res_1.xpath(
                            '//div[@class="productImg-wrap"]/a/@href')
                        if len(product_urls) > 0:
                            for each_product_url in product_urls:
                                if 'areaId' not in each_product_url:
                                    each_product_url = each_product_url + '&areaId=120100'
                                mutex.acquire()
                                with codecs.open(
                                        os.path.join(current_dir,
                                                     'product_url.txt'), 'a',
                                        'utf-8') as f:
                                    f.write('https:' + each_product_url + '\n')
                                mutex.release()
                                # parse detail
                                # time.sleep(randint(5,15)*0.1)
                                worker_parse_detail(
                                    'https:' + each_product_url, shop_brand,
                                    dl, mutex)
                        else:
                            print u'no product list on this page', final_url_product
                    mutex.acquire()
                    try:
                        with codecs.open(
                                os.path.join(files_dir, 'shop_url_old.txt'),
                                'a', 'utf-8') as f:
                            f.write(url + '&/#' + shop_brand + '\n')
                    except:
                        pass
                    mutex.release()
                else:
                    print u'no list item worker_product'

                break
            else:
                counter += 1
                if counter > 3:
                    print u'worker_product failure more than 3, break...'
                    break
        # successfully received msg update to db
        db = MySql('ip', 3306, 'root', 'pwd', 'db')
        shop_url = url
        db.update(
            """update tmall_shop_url set status = 2 where shop_url = '%s'""" %
            shop_url)
        db.close()
    except Exception, e:
        print 'worker_product error: ', e