def __init__(self):
    self.conn = connect_mysql.w_shark_erp()
    self.cur = self.conn.cursor()
    self.had_list = []
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    }
    with open('collect.pk', 'rb') as f:
        self.collect = pickle.load(f)
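# Companion sketch for the pickle.load above: a collect.pk file would be
# produced by the matching pickle.dump call. The shape of the collect object
# is whatever the crawler saved; nothing below is from the original code.
import pickle

def save_collect(collect, path='collect.pk'):
    with open(path, 'wb') as f:
        pickle.dump(collect, f)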
def get_text(name, sc):
    conn1 = connect_mysql.w_shark_erp()
    cur1 = conn1.cursor()
    conn2 = connect_mysql.w_shark_erp()
    cur2 = conn2.cursor()
    while True:
        try:
            info = sc.pop()
        except IndexError:
            # the shared schedule is drained, so shut this worker down
            cur1.close()
            conn1.close()
            cur2.close()
            conn2.close()
            break
        result_list, unexcept_result_list = crawl_text(info[1])
        # if len(result_list) < 3 and len(unexcept_result_list) < 3:
        #     print(info[1], 'no copy text yet')
        #     sql = f"""update cm_commodity set HAVE_TEXT=3 WHERE URL_ID='{info[0]}';"""
        #     cur1.execute(sql)
        #     conn1.commit()
        #     continue
        # double quotes in the scraped text would break the f-string SQL,
        # so they are replaced with commas before insertion
        sql = f"""insert into crawler_commodity_module_description_copy (CONTENT,URL_ID) VALUES ("{str(result_list).replace('"', ",")}","{info[0]}");"""
        conn2.ping(True)
        while True:
            try:
                cur2.execute(sql)
                break
            except pymysql.err.OperationalError:
                # note: the retry path reconnects through connect_mysql.local_bs()
                cur2.close()
                conn2.close()
                conn2 = connect_mysql.local_bs()
                cur2 = conn2.cursor()
        conn2.commit()
        print(name, datetime.datetime.now(), info[1])
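# Hedged alternative to the f-string INSERT above, which has to mangle quotes
# with replace('"', ","). pymysql's parameter binding escapes CONTENT itself;
# serializing result_list with json.dumps is an assumption here, not the
# original behavior (the original stores str(result_list)).
import json

def insert_description(cur2, conn2, result_list, url_id):
    sql = ("insert into crawler_commodity_module_description_copy "
           "(CONTENT, URL_ID) values (%s, %s);")
    # bound values are escaped by the driver, so quotes in CONTENT are safe
    cur2.execute(sql, (json.dumps(result_list, ensure_ascii=False), url_id))
    conn2.commit()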
def __init__(self):
    self.conn_T = connect_mysql.test()
    self.cur_T = self.conn_T.cursor()
    self.conn_W = connect_mysql.w_shark_erp()
    self.cur_W = self.conn_W.cursor()
    # IDs of the articles already stored in the database
    self.have_list = []
    self.users_num = {}
    self.headers = {
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; xiaomi mix Build/LMY47I; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.100 Safari/537.36 AliApp(TB/9.1.0) TTID/600000@taobao_android_9.1.0 WindVane/8.5.0 900X1600 UT4Aplus/0.2.16",
        "Cookie": "_m_h5_tk=36b5227cd1a1e340e4d56bcc93555f2f_1587526955005; _m_h5_tk_enc=7385708053b9b4519913b71659d347aa;"
    }
def __init__(self):
    self.conn_T = connect_mysql.test()
    self.cur_T = self.conn_T.cursor()
    self.conn_W = connect_mysql.w_shark_erp()
    self.cur_W = self.conn_W.cursor()
    # Taobao headline channel names mapped to their column IDs (the Chinese
    # keys are request parameters and are kept verbatim)
    self.CLASS = {
        "新品": "1375", "首页": "1203", "新鲜": "1518", "评测": "1363",
        "园艺": "1379", "影视": "1516", "游戏": "1370", "二次 ": "1359",
        "垂钓": "1362", "数码": "1387", "优惠": "3626", "如何": "1378",
        "居家": "1377", "视频": "1340", "型男": "1361", "汽车": "1341",
        "摄影": "1360", "手机": "1513", "美妆": "1372", "萌宠": "1342",
        "旅行": "1514", "精选": "1204", "美搭": "1373", "运动": "1369",
        "没事": "1358", "母婴": "1364",
    }
    self.headers = {
        "Referer": "https://market.m.taobao.com/app/mtb/headline/pages/portal?spm=a215s.7406091.home_m_h_v5_toutiao_corner_1.3&utparam=%7B%22ranger_buckets_native%22%3A%22tsp2584_22605%22%7D&scm=1007.home_headline.headline.d&wh_weex=true&wx_navbar_hidden=true&_wx_statusbar_hidden=hidden_light_text&feedListFeeds=true&columnId=1206&pushFeedIds=209933620800,200253499132",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; xiaomi mix Build/LMY47I; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.100 Safari/537.36 AliApp(TB/9.1.0) TTID/600000@taobao_android_9.1.0 WindVane/8.5.0 900X1600 UT4Aplus/0.2.16",
        "Cookie": "_m_h5_tk=d2fd278808f43520fbcbdc710af0923c_1589783019427;_m_h5_tk_enc=53dc2d73b37a50c68dbf4bf9acc83c02"
    }
    self.have_list = []
    self.context = ''
def daemon(name, sc):
    conn_t = connect_mysql.w_shark_erp()
    cur_t = conn_t.cursor()
    while True:
        try:
            info = sc.pop()
        except IndexError as e:
            print(e)
            return
        url = f"https://acs.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?data=%7B%22itemNumId%22%3A%22{info[0]}%22%7D"
        imgs, popularity, grade = get_imgs(url)
        if not imgs:
            print(info[0], 'does not qualify')
            # mark the item as not needed before skipping it
            sql = f"""update cm_commodity set NEED=2 where URL_ID={info[0]} limit 1;"""
            conn_t.ping(True)
            cur_t.execute(sql)
            conn_t.commit()
            continue
        elif imgs and not popularity and grade:
            continue
        else:
            print('qualifies')
            # sql = f"""update cm_commodity set IMG_URL='{imgs}',POPULARITY={popularity},GRADE='{grade}', NEED=1 where URL_ID={info[0]} limit 1;"""
            sql = f"""update cm_commodity set IMG_URL='{imgs}',POPULARITY={popularity},GRADE='{grade}' where URL_ID={info[0]} limit 1;"""
            conn_t.ping(True)
            cur_t.execute(sql)
            conn_t.commit()
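# The ping/execute/commit dance above recurs in every worker; here is a
# hedged sketch of a bounded-retry helper. The retry count and backoff are
# assumptions, not part of the original code.
import time
import pymysql

def safe_execute(conn, cur, sql, args=None, retries=3):
    for attempt in range(retries):
        try:
            conn.ping(True)           # reconnect=True revives a dropped link
            cur.execute(sql, args)
            conn.commit()
            return
        except pymysql.err.OperationalError:
            time.sleep(2 ** attempt)  # exponential backoff before retrying
    raise RuntimeError(f'query still failing after {retries} attempts')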
def comment(sc):
    # while True:
    #     try:
    conn = connect_mysql.w_shark_erp()
    cur = conn.cursor()
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36'}
    isNew7 = False
    while True:
        try:
            info = sc.pop()
        except IndexError:
            cur.close()
            conn.close()
            return
        # page through the search results, 25 items at a time
        for i in range(0, 2500, 25):
            url = f'https://kxuan.taobao.com/searchSp.htm?data-key=s&data-value=25&ajax=true&_ksTS=1575682938492_769&callback=jsonp770&ruletype=2&bcoffset=2&navigator=all&nested=we&is_spu=0&1=1&ntoffset=0&s={i}&kxuan_swyt_item=37662&cat={info[0]}&searchtype=item&uniq=pid&id=4525&enginetype=0&bcoffset=2&ntoffset=0'
            while True:
                try:
                    page_text = requests.get(url=url, headers=header, verify=False).text
                    break
                except Exception as e:
                    print(e)
                    continue
            # unwrap the JSONP response: jsonp770({...})
            string = page_text.split("(", 1)[1][:-1]
            result = json.loads(string)
            goods = result['mods']['itemlist']['data']['auctions']
            goods_info = []
            for good in goods:
                if int(good['nid']) in have_list:
                    continue
                icon = []
                for each in good['icon']:
                    icon.append({"icon_key": each['icon_key'], "innerText": each['innerText'], "position": each['position']})
                    if each['innerText'] == '营销':  # '营销' = marketing badge
                        isNew7 = True
                have_list.append(int(good['nid']))
                if not isNew7:
                    isNew7 = False
                    continue
                try:
                    sameStyleCount = good['sameStyleCount']
                except KeyError:
                    sameStyleCount = 0
                goods_info.append((info[2], good['nid'], good['raw_title'], good['detail_url'],
                                   good['view_sales'].strip('人付款'),  # drop the "people paid" suffix
                                   json.dumps(icon, ensure_ascii=False), good['nick'], good['shopLink'],
                                   good['q_score'], good['pic_url'], good['view_price'],
                                   json.dumps(good["shopcard"]), sameStyleCount))
            while True:
                try:
                    sql = "insert into cm_commodity_new (CLASSIFY_ID, URL_ID,TITLE,URL,SALES,CREATE_DATE,ICON,NICK,SHOPLINK,Q_SCORE,PIC_URL,PRICE,SHOPCARD,SAMESTYLECOUNT) values (%s,%s,%s,%s,%s,NOW(),%s,%s,%s,%s,%s,%s,%s,%s);"
                    num = cur.executemany(sql, goods_info)
                    conn.commit()
                    print(info[1], i - 25, '-', i, datetime.datetime.now(), 'updated', num, 'rows')
                    break
                except pymysql.err.OperationalError:
                    print('Connection attempt failed: the peer did not answer or the host stopped responding.')
                    conn.ping(True)
            if len(goods) < 25:
                break
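# The split("(", 1)[1][:-1] above strips the JSONP wrapper (jsonp770({...})).
# An illustrative helper that does the same but also tolerates a trailing
# semicolon; the function name is not from the original code.
import json

def parse_jsonp(page_text):
    payload = page_text.split('(', 1)[1].rsplit(')', 1)[0]
    return json.loads(payload)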
    # except:
    #     continue


if __name__ == '__main__':
    while True:
        try:
            conn = connect_mysql.w_shark_erp()
            cur = conn.cursor()
            sql = "select DISTINCT(URL_ID) from cm_commodity_new;"
            cur.execute(sql)
            have_list = []
            for each in cur.fetchall():
                have_list.append(each[0])
            urllib3.disable_warnings()
            Schedule = schedule.schedule('select distinct(ID),cat,MAIN_ID from class_id order by ID desc;', connect_mysql.w_shark_erp())
            thread_list = []
            for i in range(6):
                thread_list.append(threading.Thread(target=comment, args=(Schedule,)))
""" 添加好物类别 """ import CralwerSet.connect_mysql as connect_mysql import traceback conn_T = connect_mysql.test() cur_T = conn_T.cursor() conn_W = connect_mysql.w_shark_erp() cur_W = conn_W.cursor() sql = "select ID, TITLE from yhh_hw where ID>14000;" cur_T.execute(sql) try: for item in cur_T.fetchall(): sql = f"""SELECT t6.cat,t5.num FROM (select t4.MAIN_ID MAIN_ID,count(t4.MAIN_ID) num FROM (SELECT t2.CLASSIFY_ID CLASSIFY_ID FROM (select URL_ID, CONTENT from crawler_commodity_module_description where match(CONTENT) against('{item[1].replace("'","’")}') limit 100) t1, cm_commodity t2 where t1.URL_ID=t2.URL_ID ) t3, class_id t4 where t3.CLASSIFY_ID = t4.ID GROUP BY t4.MAIN_ID) t5, class_id t6 WHERE t6.ID=t5.MAIN_ID ORDER BY t5.num desc LIMIT 1;""" cur_W.execute(sql) result = cur_W.fetchone() if not result: type = '类型不明' else: type = result[0][:-1] sql = f"""update yhh_hw set `TYPE`='{type}' where ID={item[0]} limit 1;""" cur_T.execute(sql) conn_T.commit() print(item[0], item[1], type) except: traceback.print_exc() cur_T.close() conn_T.close() cur_W.close()
def daemon():
    while True:
        try:
            urllib3.disable_warnings()
            global HEADER
            HEADER = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36',
                'Content-Type': 'application/x-www-form-urlencoded'
            }
            login()
            sql = """select distinct(URL_ID),TITLE,ID,CREATE_DATE from cm_commodity where CREATE_DATE > "2019-11-14" and URL_ID NOT IN(SELECT URL_ID FROM crawler_commodity_module_description_copy) ORDER BY CREATE_DATE DESC limit 2000;"""
            Schedule = schedule.schedule(sql, connect_mysql.w_shark_erp())
            print('starting threads')
            t1 = mythread("1", Schedule, )
            # t2 = mythread("2", Schedule, ) ... t10 = mythread("0", Schedule, ):
            # nine more identical workers, currently disabled
            t1.start()
            print('thread 1 started')
            time.sleep(1)
            # t2.start() ... t10.start(), each followed by time.sleep(1), and
            # t10.join() ... t2.join() are disabled along with them
            t1.join()
        except:
            pass
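# The disabled t2..t10 ladder above is ten copies of the same pattern; a
# hedged sketch of an equivalent fan-out with concurrent.futures. The worker
# count is an assumption, and get_text(name, sc) stands in for whatever
# callable mythread wraps.
from concurrent.futures import ThreadPoolExecutor

def run_workers(Schedule, worker_count=10):
    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        for i in range(worker_count):
            pool.submit(get_text, str(i + 1), Schedule)
        # leaving the with-block waits for every worker, replacing the joins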
            for thread in thread_list:
                thread.start()
        except:
            pass
        time.sleep(600)


if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36',
    }
    sql = "select URL_ID from cm_commodity where CREATE_DATE >= date_sub(now(),interval 2 day) AND IMG_URL is null order by CREATE_DATE DESC limit 10;"
    while True:
        try:
            Schedule = schedule.schedule(sql, connect_mysql.w_shark_erp())
            thread_list = []
            for i in range(1):
                thread_list.append(mythread(str(i + 1), Schedule, ))
            for thread in thread_list:
                thread.start()
                time.sleep(1)
            while True:
                if not len(Schedule.classes):
                    print("starting a new round of data updates")
                    break
                else:
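# Hedged reconstruction of what schedule.schedule appears to provide, inferred
# purely from its call sites above (a query and a connection go in; workers
# call .pop() and the main loop polls .classes). The real CralwerSet
# implementation may differ.
class ScheduleSketch:
    def __init__(self, sql, conn):
        cur = conn.cursor()
        cur.execute(sql)
        self.classes = list(cur.fetchall())  # shared, shrinking work list
        cur.close()

    def pop(self):
        # list.pop() is atomic under the GIL, which is why workers can rely
        # on IndexError as their shutdown signal
        return self.classes.pop()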