def worker(final_url, total_num, dl):
    try:
        current_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        current_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        mutex.acquire()
        try:
            url = final_url.pop(0)
        except IndexError:
            url = None
        mutex.release()
        if url is None:  # task list already drained by another thread
            return
        ontime_num = len(final_url)
        print u'[%s],parse,processing: %s in %s' % (
            str(current_time), str(total_num - ontime_num), str(total_num))
        counter = 0
        while True:
            html = dl.get_page(url, [], mutex, ['login'], min_len=40)
            if html:
                item = {}
                res = lxml.html.document_fromstring(html)
                # the stock symbol is carried in the ?p= query parameter
                item['stk'] = get_data(re.findall(re.compile(r'\?p=(.*)'), url), 0)
                item['currentQtr'] = get_data(
                    res.xpath('//td[@data-reactid="128"]/span/text()'), 0)
                item['nextQtr'] = get_data(
                    res.xpath('//td[@data-reactid="130"]/span/text()'), 0)
                item['currentYear'] = get_data(
                    res.xpath('//td[@data-reactid="132"]/span/text()'), 0)
                item['nextYear'] = get_data(
                    res.xpath('//td[@data-reactid="134"]/span/text()'), 0)
                item['current_date'] = current_date
                mutex.acquire()
                try:
                    db = MySql('36.110.128.75', 3306, 'root', 'Bigdata1234', 'crawler_db')
                    db.insert_single(
                        '''INSERT INTO finance_yahoo(stk,currentQtr,nextQtr,currentYear,nextYear,currentDate) VALUES ('%s','%s','%s','%s','%s','%s')'''
                        % (item['stk'], item['currentQtr'], item['nextQtr'],
                           item['currentYear'], item['nextYear'], item['current_date']))
                    db.close()
                except Exception, e:
                    print 'save to db error...', e
                finally:
                    write2file('yahoo_finance.txt', item)
                    mutex.release()
                break
            else:
                counter += 1
                if counter > 3:  # give up on this url after three failed fetches
                    print 'worker failure more than 3, break...'
                    break
    except Exception, e:
        print 'worker error: ', e
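# get_data is used throughout but not defined in this section. A minimal
# sketch of the safe-indexing behaviour implied by the call sites (the
# string 'None' on a miss matches the `== 'None'` checks further down) --
# an assumption, not necessarily the project's actual implementation:
def get_data(data, index):
    """Return data[index], or the string 'None' if the index is missing."""
    try:
        return data[index]
    except IndexError:
        return 'None'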
def worker_shop(final_url, total_num, dl):  # parse shop urls and build "more products" listing urls
    global shop_cat_set
    try:
        current_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        mutex.acquire()
        try:
            url = final_url.pop(0)
        except IndexError:
            url = None
        mutex.release()
        if url is None:  # task list already drained by another thread
            return
        ontime_num = len(final_url)
        print u'[%s],parse,processing: %s in %s' % (
            str(current_time), str(total_num - ontime_num), str(total_num))
        counter = 0
        while True:
            res_0 = dl.get_page(url, [], mutex)
            res = lxml.html.document_fromstring(res_0) if res_0 else None
            if res is not None:
                shop_blocks = res.xpath('//div[@class="shopHeader-info"]')
                base_url_more = ('https://list.tmall.com/search_shopitem.htm'
                                 '?user_id=%s&q=&sort=s&cat=%s&from=_1_&is=p')
                for shop_block in shop_blocks:
                    shop_url = get_data(shop_block.xpath('./a/@href'), 0)
                    shop_brand = get_data(shop_block.xpath('./p/span/text()'), 0).replace("'", '')
                    user_id = get_data(re.findall(re.compile(r'\?user_id=(\d+)&'), shop_url), 0)
                    cat_id = get_data(re.findall(re.compile(r'cat=(\d+)'), url), 0)
                    if (user_id, cat_id) not in shop_cat_set:
                        url_more = base_url_more % (user_id, cat_id)
                        mutex.acquire()
                        # the dedup set is shared across threads, so mutate it under the lock
                        shop_cat_set.add((user_id, cat_id))
                        try:
                            db = MySql('ip', 3306, 'root', 'pwd', 'db')
                            db.insert_single(
                                '''INSERT INTO tmall_shop_url(cat_id,shop_url,brands) VALUES ('%s','%s','%s')'''
                                % (cat_id, url_more, shop_brand))
                            db.close()
                        except Exception, e:
                            print 'save to db error: ', str(e)
                        finally:
                            with codecs.open(os.path.join(current_dir, 'shop_url.txt'), 'a', 'utf-8') as f:
                                f.write(url_more + '&/#' + shop_brand + '\n')
                            mutex.release()
                break
            else:
                counter += 1
                if counter > 3:  # give up on this url after three failed fetches
                    print 'worker_shop failure more than 3, break...'
                    break
    except Exception, e:
        print 'worker_shop error: ', e
def crawler_tmall_shop_ultra(final_url, total_num, dl, mutex):
    global split_cat_ids
    try:
        current_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        mutex.acquire()
        try:
            task_item = final_url.pop(0)
        except IndexError:
            task_item = None
        mutex.release()
        if task_item is not None:
            split_cat_ids = []
            ontime_num = len(final_url)
            print u'[%s],parse,processing: %s in %s' % (
                str(current_time), str(total_num - ontime_num), str(total_num))
            # seed the whole price range for this category, split it, then
            # fetch the page counts and parse the listings
            pre_url = (task_item, 0, 100000)
            split_price(pre_url)
            run_get_max_page()
            run_parse_html(1)  # thread num
            # task finished: update the status in the db and on disk
            db = MySql('ip', 3306, 'root', 'pwd', 'db')
            db.update("""update task_list set status = 2 where task_item = '%s'""" % task_item)
            db.close()
            func_name = get_func_name()
            try:
                with codecs.open(os.path.join(files_dir, 'task_url_old.txt'), 'a', 'utf-8') as f:
                    f.write(func_name + '&/#' + task_item + '\n')
                # clear the final_url snapshot files
                for each_file in os.listdir(current_dir):
                    if 'final_url' in each_file:
                        os.remove(os.path.join(current_dir, each_file))
            except:
                pass
    except Exception, e:
        print 'crawler_tmall_shop_ultra error: ', e
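# split_price, run_get_max_page and run_parse_html live elsewhere in the
# project. Judging by the (cat_id, 0, 100000) seed above, split_price
# partitions a price range until each slice fits under the listing's page
# cap. A hypothetical sketch of that idea only -- count_pages and page_cap
# are assumptions, not the project's actual code:
def split_price_sketch(task, count_pages, page_cap=100):
    cat_id, low, high = task
    if count_pages(cat_id, low, high) <= page_cap or high - low <= 1:
        split_cat_ids.append((cat_id, low, high))  # slice is small enough to crawl
    else:
        mid = (low + high) // 2  # halve the price range and recurse on both sides
        split_price_sketch((cat_id, low, mid), count_pages, page_cap)
        split_price_sketch((cat_id, mid, high), count_pages, page_cap)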
def check_tcp_status(ip, port, message):
    try:
        wk = Worker()
        # aggregate the task pool: pending tasks minus the ones already done
        task_pool = []
        task_pool_old = []
        for each_file in os.listdir(settings.files_dir):
            if each_file == 'task_url.txt':
                try:
                    with codecs.open(os.path.join(settings.files_dir, each_file), 'r', 'utf-8') as f:
                        task_pool = [(x.split('\n')[0].split('&/#')[0],
                                      x.split('\n')[0].split('&/#')[1]) for x in f]
                except:
                    pass
            if each_file == 'task_url_old.txt':
                try:
                    with codecs.open(os.path.join(settings.files_dir, each_file), 'r', 'utf-8') as f:
                        task_pool_old = [(x.split('\n')[0].split('&/#')[0],
                                          x.split('\n')[0].split('&/#')[1]) for x in f]
                except:
                    pass
        task_pool = list(set(task_pool) ^ set(task_pool_old))
        if len(task_pool) == 0:
            # nothing left locally: ask the task server for a new batch
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            server_address = (ip, port)
            sock.connect(server_address)
            print 'Connecting to %s:%s.' % server_address
            if message:
                print 'Sending "%s".' % message
                sock.sendall(message)
                data = sock.recv(102400)
                data = json.loads(data)
                print 'Closing socket.'
                sock.sendall('bye')
                sock.close()
                if data['task'] != 'no more':
                    # successfully received tasks: mark them as dispatched in the db
                    db = MySql('ip', 3306, 'root', 'pwd', 'db')
                    for each_task in data['task']['task_list']:
                        task_url = each_task
                        db.update("""update task_list set status = 1 where task_item = '%s'""" % task_url)
                    db.close()
                    pool_temp = zip(data['task']['func_list'], data['task']['task_list'])
                    task_pool.extend(pool_temp)
        print 'task_num', len(task_pool)
        # clear the task_url records before rewriting them
        for each_file in os.listdir(settings.files_dir):
            if 'task_url' in each_file:
                os.remove(os.path.join(settings.files_dir, each_file))
        if len(task_pool) > 0:
            with codecs.open(os.path.join(settings.files_dir, 'task_url.txt'), 'a', 'utf-8') as f:
                for each_task in task_pool:
                    f.write(each_task[0] + '&/#' + each_task[1] + '\n')
            wk.run(task_pool)
            time.sleep(randint(1, 3))
        else:
            print 'no more task...'
            time.sleep(10 * 60)
            # clear any task_url records left over
            for each_file in os.listdir(settings.files_dir):
                if 'task_url' in each_file:
                    os.remove(os.path.join(settings.files_dir, each_file))
    except Exception, e:
        print 'check_tcp_status err', e
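# The dispatcher side of this protocol is not shown in this section. A
# minimal sketch of a server that would satisfy the client above -- the
# reply shape ({'task': 'no more'} or {'task': {'func_list': [...],
# 'task_list': [...]}}) is inferred from the json.loads() handling in
# check_tcp_status; the host/port wiring and the fetch_pending_tasks
# helper are hypothetical assumptions:
import json
import socket

def serve_tasks(host, port, fetch_pending_tasks):
    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    srv.bind((host, port))
    srv.listen(5)
    while True:
        conn, _ = srv.accept()
        conn.recv(1024)  # the client's request message
        tasks = fetch_pending_tasks()  # assumed to return [(func_name, task_item), ...]
        if tasks:
            reply = {'task': {'func_list': [t[0] for t in tasks],
                              'task_list': [t[1] for t in tasks]}}
        else:
            reply = {'task': 'no more'}
        conn.sendall(json.dumps(reply))
        conn.recv(1024)  # wait for the client's 'bye' before closing
        conn.close()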
    except Exception, e:
        print 'guba error: ', e
    # xueqiu spider, runs once a day
    try:
        if curr_hour in ['08']:
            dt_xueqiu = detector_xueqiu()
            dt_xueqiu.run()
    except Exception, e:
        print 'xueqiu error: ', e
    # tieba, runs once a day: at 08:00 the detector checks the previous batch of data
    try:
        if curr_hour in ['08']:
            dt_tieba = detector_tieba()
            dt_tieba.run()
    except Exception, e:
        print 'tieba error: ', e
    time.sleep(5)
    # **********************************************************
    # check whether any value exceeds its alert threshold, pick the
    # person in charge, and send the notification mail
    db = MySql('ip', 3307, 'user', 'pwd', 'db')
    q_res = db.select('SELECT * FROM detector WHERE status = 0')
    db.update('UPDATE detector SET STATUS = 1 WHERE STATUS = 0')
    db.close()
    sendmsg(q_res)
def worker_parse_detail(url, brand, dl, mutex):
    try:
        crawl_time_stamp = str(time.time())
        res = dl.get_page(url, [], mutex)
        if res:
            item = {}
            res = lxml.html.document_fromstring(res)
            # ids are carried in the product url's query string
            try:
                item['cat_id'] = get_data(re.findall(re.compile(r'cat_id=(\d+)&'), url), 0)
            except:
                item['cat_id'] = 'none'
            try:
                item['cat_name'] = catDic[item['cat_id']]
            except:
                item['cat_name'] = 'none'
            try:
                item['product_id'] = get_data(re.findall(re.compile(r'[\?|&]id=(\d+)&'), url), 0)
            except:
                item['product_id'] = 'none'
            try:
                item['sku_id'] = get_data(re.findall(re.compile(r'skuId=(\d+)&'), url), 0)
            except:
                item['sku_id'] = 'none'
            try:
                item['price'] = get_data(res.xpath('//span[@class="tm-price"]/text()'), -1)
            except:
                item['price'] = 'none'
            try:
                item['sales'] = get_data(
                    res.xpath('//li[@class="tm-ind-item tm-ind-sellCount "]'
                              '/div[@class="tm-indcon"]/span[@class="tm-count"]/text()'), 0)
            except:
                item['sales'] = 'none'
            try:
                item['brand'] = brand
            except:
                item['brand'] = 'none'
            try:
                item['title'] = get_data(res.xpath('//h1[@data-spm="1000983"]/a/text()'), 0).strip()
            except:
                item['title'] = 'none'
            if item['title'] == 'None':
                # fall back to the h1's own text when there is no nested link
                item['title'] = get_data(res.xpath('//h1[@data-spm="1000983"]/text()'), 0).strip()
            # single quotes would break the interpolated INSERT below
            item['title'] = item['title'].replace('\'', '')
            item['flag'] = flag
            item['crawl_time_stamp'] = crawl_time_stamp
            try:
                item['shop_name'] = get_data(res.xpath('//a[@class="slogo-shopname"]/strong/text()'), 0)
            except:
                item['shop_name'] = 'none'
            item['price_range'] = 'None'
            item['worker_num'] = port
            mutex.acquire()
            try:
                db = MySql('ip', 3306, 'root', 'pwd', 'db')
                db.insert_single(
                    '''INSERT INTO tmall_product(sku_id,product_id,cat_name,title,price,crawl_time,brand,sales,shop_name,flag,price_range,cat_id,worker_num) VALUES ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')'''
                    % (item['sku_id'], item['product_id'], item['cat_name'],
                       item['title'], item['price'], item['crawl_time_stamp'],
                       item['brand'], item['sales'], item['shop_name'],
                       item['flag'], item['price_range'], item['cat_id'],
                       item['worker_num']))
                db.close()
            except Exception, e:
                print 'save to db error: ', str(e)
            finally:
                write2file(
                    os.path.join(current_dir,
                                 port + '_' + flag + '_' + 'tmall_product_info.txt'),
                    item)
                mutex.release()
    except Exception, e:
        print 'worker_parse_detail error: ', e
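# The interpolated INSERT statements above are why values need the
# .replace("'", '') workarounds, and they are open to SQL injection. A
# sketch of a parameterized alternative using MySQLdb directly -- an
# assumption, since the MySql wrapper's internals are not shown in this
# section, and the column subset here is illustrative:
import MySQLdb

def insert_product(item):
    conn = MySQLdb.connect(host='ip', port=3306, user='root',
                           passwd='pwd', db='db', charset='utf8')
    try:
        cur = conn.cursor()
        # the driver escapes every value, so quotes in titles and brands are safe
        cur.execute(
            'INSERT INTO tmall_product (sku_id, product_id, title, brand) '
            'VALUES (%s, %s, %s, %s)',
            (item['sku_id'], item['product_id'], item['title'], item['brand']))
        conn.commit()
    finally:
        conn.close()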
def crawler_tmall(final_url, total_num, dl, mutex):  # parse product urls
    shop_brand = ''
    try:
        current_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        mutex.acquire()
        pre_url = final_url.pop(0)
        mutex.release()
        try:
            url = pre_url.split('&/#')[0]
            shop_brand = pre_url.split('&/#')[1]
        except:
            url = pre_url
            shop_brand = 'None'
        ontime_num = len(final_url)
        print u'[%s],parse,processing: %s in %s' % (
            str(current_time), str(total_num - ontime_num), str(total_num))
        # throttle: wait until at least time_scope seconds have passed
        # since the previous fetch
        global time_start, time_scope
        time_delta = time.time() - time_start
        if time_delta < time_scope and (total_num - ontime_num) != 1:
            print u'time_delta left %d s' % (time_scope - time_delta)
            time.sleep(time_scope - time_delta)
        counter = 0
        while True:
            time.sleep(randint(5, 20))
            res_0 = dl.get_page(url.replace('&q=', ''), [], mutex)
            time_start = time.time()
            if res_0:
                res = lxml.html.document_fromstring(res_0)
                total_num_rule = '//p[@class="crumbTitle"]/span/text()'
                max_page_rule = u'共(\d+)页'
                product_num = get_data(res.xpath(total_num_rule), 0)
                max_page = get_data(re.findall(re.compile(max_page_rule), res_0), 0)
                if product_num != 'None' and max_page != 'None':
                    blocks_rule = '//div[@class="productImg-wrap"]/a/@href'
                    for each_page in range(1, int(max_page) + 1):
                        final_url_product = url + '&totalPage=%d&jumpto=%d' % (
                            int(max_page), each_page)
                        res_1 = lxml.html.document_fromstring(
                            dl.get_page(final_url_product, [], mutex))
                        product_urls = res_1.xpath(blocks_rule)
                        if len(product_urls) > 0:
                            for each_product_url in product_urls:
                                if 'areaId' not in each_product_url:
                                    each_product_url = each_product_url + '&areaId=120100'
                                mutex.acquire()
                                with codecs.open(os.path.join(current_dir, 'product_url.txt'),
                                                 'a', 'utf-8') as f:
                                    f.write('https:' + each_product_url + '\n')
                                mutex.release()
                                # parse the product detail page
                                worker_parse_detail('https:' + each_product_url,
                                                    shop_brand, dl, mutex)
                        else:
                            print u'no product list on this page', final_url_product
                    mutex.acquire()
                    try:
                        with codecs.open(os.path.join(files_dir, 'shop_url_old.txt'),
                                         'a', 'utf-8') as f:
                            f.write(url + '&/#' + shop_brand + '\n')
                    except:
                        pass
                    mutex.release()
                else:
                    print u'no list item worker_product'
                break
            else:
                counter += 1
                if counter > 3:
                    print u'worker_product failure more than 3, break...'
                    break
        # mark this shop url as done in the db
        db = MySql('ip', 3306, 'root', 'pwd', 'db')
        shop_url = url
        db.update("""update tmall_shop_url set status = 2 where shop_url = '%s'""" % shop_url)
        db.close()
    except Exception, e:
        print 'worker_product error: ', e
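# The global time_start/time_scope pair above is a crude shared rate limit.
# The same idea as a small self-contained helper -- a sketch only; the
# RateLimiter name and min_interval parameter are hypothetical, not from
# this codebase:
import threading
import time

class RateLimiter(object):
    def __init__(self, min_interval):
        self.min_interval = min_interval  # seconds required between fetches
        self.last = 0.0
        self.lock = threading.Lock()

    def wait(self):
        # block until min_interval has elapsed since the previous call,
        # serializing callers so threads cannot race on self.last
        with self.lock:
            delta = time.time() - self.last
            if delta < self.min_interval:
                time.sleep(self.min_interval - delta)
            self.last = time.time()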