Пример #1
0
 def __init__(self):
     """Set up the database connection, request headers and the pickled collection.

     Opens one ERP connection/cursor pair, prepares browser-like headers for
     form-encoded requests, and restores state from ``collect.pk``.
     """
     # Database handle + cursor for the crawler's ERP schema.
     self.conn = connect_mysql.w_shark_erp()
     self.cur = self.conn.cursor()
     # Items already handled in this run.
     self.had_list = []
     # Headers for form-encoded POSTs, impersonating Chrome 69.
     self.headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
         "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
     }
     # NOTE(review): pickle.load is only safe if collect.pk is produced by this
     # project itself — confirm it is never sourced externally.
     with open('collect.pk', 'rb') as f:
         self.collect = pickle.load(f)
Пример #2
0
def get_text(name, sc):
    """Worker: drain (URL_ID, item_id) pairs from *sc*, crawl each item's
    description text and insert it into crawler_commodity_module_description_copy.

    name -- worker label, used only for progress logging.
    sc   -- shared list-like schedule; .pop() raises IndexError when drained.
    """
    conn1 = connect_mysql.w_shark_erp()
    cur1 = conn1.cursor()
    conn2 = connect_mysql.w_shark_erp()
    cur2 = conn2.cursor()
    while True:
        try:
            info = sc.pop()
        except IndexError:
            # Schedule drained: release both connections and stop this worker.
            cur1.close()
            conn1.close()
            cur2.close()
            conn2.close()
            break
        result_list, unexcept_result_list = crawl_text(info[1])

        # Parameterized insert: crawled content is untrusted, so it must not be
        # spliced into the SQL string. The '"' -> ',' substitution is kept so
        # stored rows stay byte-compatible with the previous string-built query.
        sql = ("insert into crawler_commodity_module_description_copy "
               "(CONTENT, URL_ID) VALUES (%s, %s);")
        params = (str(result_list).replace('"', ","), info[0])

        conn2.ping(True)
        while True:
            try:
                cur2.execute(sql, params)
                break
            except pymysql.err.OperationalError:
                # Connection dropped mid-insert: rebuild it against the SAME
                # database. (Bug fix: this previously reconnected via
                # connect_mysql.local_bs(), a different server than the
                # w_shark_erp() connection it was replacing.)
                cur2.close()
                conn2.close()
                conn2 = connect_mysql.w_shark_erp()
                cur2 = conn2.cursor()
        conn2.commit()

        print(name, datetime.datetime.now(), info[1])
    def __init__(self):
        """Open the two database connections and prepare mobile-Taobao headers."""
        # Test database.
        self.conn_T = connect_mysql.test()
        self.cur_T = self.conn_T.cursor()
        # Production ERP database.
        self.conn_W = connect_mysql.w_shark_erp()
        self.cur_W = self.conn_W.cursor()
        # IDs of articles already present in the database.
        self.have_list = []
        self.users_num = {}
        # Headers impersonating the Taobao Android app; the cookie carries the
        # _m_h5_tk token pair required by the m.taobao endpoints.
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; xiaomi mix Build/LMY47I; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.100 Safari/537.36 AliApp(TB/9.1.0) TTID/600000@taobao_android_9.1.0 WindVane/8.5.0 900X1600 UT4Aplus/0.2.16",
            "Cookie": "_m_h5_tk=36b5227cd1a1e340e4d56bcc93555f2f_1587526955005; _m_h5_tk_enc=7385708053b9b4519913b71659d347aa;"
        }
Пример #4
0
    def __init__(self):
        """Open both database connections and build the column-id map and headers."""
        self.conn_T = connect_mysql.test()
        self.cur_T = self.conn_T.cursor()
        self.conn_W = connect_mysql.w_shark_erp()
        self.cur_W = self.conn_W.cursor()

        # Taobao headline column name -> columnId used by the portal URL.
        # NOTE(review): the "二次 " key carries a trailing space — confirm intended.
        self.CLASS = {
            "新品": "1375", "首页": "1203", "新鲜": "1518", "评测": "1363",
            "园艺": "1379", "影视": "1516", "游戏": "1370", "二次 ": "1359",
            "垂钓": "1362", "数码": "1387", "优惠": "3626", "如何": "1378",
            "居家": "1377", "视频": "1340", "型男": "1361", "汽车": "1341",
            "摄影": "1360", "手机": "1513", "美妆": "1372", "萌宠": "1342",
            "旅行": "1514", "精选": "1204", "美搭": "1373", "运动": "1369",
            "没事": "1358", "母婴": "1364",
        }
        # Headers impersonating the Taobao Android app; the Referer matches the
        # headline portal page and the cookie carries the _m_h5_tk token pair.
        self.headers = {
            "Referer": "https://market.m.taobao.com/app/mtb/headline/pages/portal?spm=a215s.7406091.home_m_h_v5_toutiao_corner_1.3&utparam=%7B%22ranger_buckets_native%22%3A%22tsp2584_22605%22%7D&scm=1007.home_headline.headline.d&wh_weex=true&wx_navbar_hidden=true&_wx_statusbar_hidden=hidden_light_text&feedListFeeds=true&columnId=1206&pushFeedIds=209933620800,200253499132",
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; xiaomi mix Build/LMY47I; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.100 Safari/537.36 AliApp(TB/9.1.0) TTID/600000@taobao_android_9.1.0 WindVane/8.5.0 900X1600 UT4Aplus/0.2.16",
            "Cookie": "_m_h5_tk=d2fd278808f43520fbcbdc710af0923c_1589783019427;_m_h5_tk_enc=53dc2d73b37a50c68dbf4bf9acc83c02"
        }
        # IDs already stored, and the current article text buffer.
        self.have_list = []
        self.context = ''
Пример #5
0
def daemon(name, sc):
    """Worker: drain item ids from *sc*, fetch each item's detail-page images,
    popularity and grade, and persist them into cm_commodity.

    name -- worker label (not used in logging here).
    sc   -- shared schedule; .pop() raises IndexError when drained.
    """
    conn_t = connect_mysql.w_shark_erp()
    cur_t = conn_t.cursor()
    while True:
        try:
            info = sc.pop()
        except IndexError as e:
            print(e)
            return
        url = f"https://acs.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?data=%7B%22itemNumId%22%3A%22{info[0]}%22%7D"
        imgs, popularity, grade = get_imgs(url)
        if not imgs:
            # Bug fix: was print(print(info[0]), ...), which printed "None"
            # (the inner print's return value) instead of the id.
            print(info[0], '不符合条件')
            # Bug fix: this UPDATE was assigned and then skipped by `continue`,
            # so ineligible items were never flagged with NEED=2. It now falls
            # through to the execute below.
            sql = f"""update cm_commodity set NEED=2 where URL_ID={info[0]} limit 1;"""
        elif imgs and not popularity and grade:
            # Has images but no popularity while grade is present — nothing to
            # store for this item. NOTE(review): confirm this condition is the
            # intended filter; `not popularity and grade` reads oddly.
            continue
        else:
            print('符合条件')
            sql = f"""update cm_commodity set IMG_URL='{imgs}',POPULARITY={popularity},GRADE='{grade}' where URL_ID={info[0]} limit 1;"""
        conn_t.ping(True)
        cur_t.execute(sql)
        conn_t.commit()
Пример #6
0
def comment(sc):
    """Worker: drain (cat, name, classify_id) rows from *sc*, page through the
    Taobao search endpoint for each category and bulk-insert new items carrying
    the "营销" (marketing) icon into cm_commodity_new.

    Relies on the module-level ``have_list`` of URL_IDs already stored.
    """
    conn = connect_mysql.w_shark_erp()
    cur = conn.cursor()
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36'}
    while True:
        try:
            info = sc.pop()
        except IndexError:
            # Schedule drained: release the connection and stop this worker.
            cur.close()
            conn.close()
            return

        for i in range(0, 2500, 25):
            url = f'https://kxuan.taobao.com/searchSp.htm?data-key=s&data-value=25&ajax=true&_ksTS=1575682938492_769&callback=jsonp770&ruletype=2&bcoffset=2&navigator=all&nested=we&is_spu=0&1=1&ntoffset=0&s={i}&kxuan_swyt_item=37662&cat={info[0]}&searchtype=item&uniq=pid&id=4525&enginetype=0&bcoffset=2&ntoffset=0'
            while True:
                try:
                    page_text = requests.get(url=url, headers=header, verify=False).text
                    break
                except Exception as e:
                    # Best-effort retry on any network failure.
                    print(e)
                    continue
            # Response is JSONP: strip the "jsonp770(" prefix and ")" suffix.
            string = page_text.split("(", 1)[1][:-1]
            result = json.loads(string)
            goods = result['mods']['itemlist']['data']['auctions']
            goods_info = []

            for good in goods:
                if int(good['nid']) in have_list:
                    continue
                icon = []
                # Bug fix: the marketing flag is now reset for EVERY good.
                # Previously it was only cleared in the skip branch, so once one
                # "营销" item was seen, every later good in the page (and all
                # later pages) was accepted regardless of its icons.
                is_marketing = False
                for each in good['icon']:
                    icon.append({"icon_key": each['icon_key'], "innerText": each['innerText'], "position": each['position']})
                    if each['innerText'] == '营销':
                        is_marketing = True
                have_list.append(int(good['nid']))
                if not is_marketing:
                    continue

                # Missing key simply means "no same-style listings".
                sameStyleCount = good.get('sameStyleCount', 0)
                goods_info.append((info[2], good['nid'], good['raw_title'], good['detail_url'],
                                   good['view_sales'].strip('人付款'), json.dumps(icon, ensure_ascii=False), good['nick'],
                                   good['shopLink'], good['q_score'], good['pic_url'],
                                   good['view_price'], json.dumps(good["shopcard"]), sameStyleCount))

            while True:
                try:
                    sql = "insert into cm_commodity_new (CLASSIFY_ID, URL_ID,TITLE,URL,SALES,CREATE_DATE,ICON,NICK,SHOPLINK,Q_SCORE,PIC_URL,PRICE,SHOPCARD,SAMESTYLECOUNT) values (%s,%s,%s,%s,%s,NOW(),%s,%s,%s,%s,%s,%s,%s,%s);"
                    num = cur.executemany(sql, goods_info)
                    conn.commit()
                    print(info[1], i - 25, '-', i, datetime.datetime.now(), '更新了', num, '条')
                    break
                except pymysql.err.OperationalError:
                    # Server dropped the connection; ping(True) reconnects, then retry.
                    print('由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。')
                    conn.ping(True)

            # A short page means we reached the end of the result set.
            if len(goods) < 25:
                break
Пример #7
0
                    print(info[1], i - 25, '-', i, datetime.datetime.now(), '更新了', num, '条')
                    break
                except pymysql.err.OperationalError:
                    print('由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。')
                    conn.ping(True)

            if len(goods) < 25:
                break
        # except:
        #     continue


if __name__ == '__main__':
    while True:
        try:
            conn = connect_mysql.w_shark_erp()
            cur = conn.cursor()
            sql = "select DISTINCT(URL_ID) from cm_commodity_new;"
            cur.execute(sql)

            have_list = []
            for each in cur.fetchall():
                have_list.append(each[0])
            urllib3.disable_warnings()

            Schedule = schedule.schedule('select distinct(ID),cat,MAIN_ID from class_id order by ID desc;',
                                         connect_mysql.w_shark_erp())

            thread_list = []
            for i in range(6):
                thread_list.append(threading.Thread(target=comment, args=(Schedule,)))
Пример #8
0
"""
添加好物类别
"""
import CralwerSet.connect_mysql as connect_mysql
import traceback

conn_T = connect_mysql.test()
cur_T = conn_T.cursor()
conn_W = connect_mysql.w_shark_erp()
cur_W = conn_W.cursor()
sql = "select ID, TITLE from yhh_hw where ID>14000;"
cur_T.execute(sql)

try:
    for item in cur_T.fetchall():
        sql = f"""SELECT t6.cat,t5.num FROM (select t4.MAIN_ID MAIN_ID,count(t4.MAIN_ID) num FROM (SELECT  t2.CLASSIFY_ID CLASSIFY_ID FROM (select URL_ID, CONTENT from crawler_commodity_module_description where match(CONTENT) against('{item[1].replace("'","’")}') limit 100) t1, cm_commodity t2 where t1.URL_ID=t2.URL_ID ) t3, class_id t4 where t3.CLASSIFY_ID = t4.ID GROUP BY t4.MAIN_ID) t5, class_id t6 WHERE t6.ID=t5.MAIN_ID ORDER BY t5.num desc LIMIT 1;"""
        cur_W.execute(sql)
        result = cur_W.fetchone()
        if not result:
            type = '类型不明'
        else:
            type = result[0][:-1]
        sql = f"""update yhh_hw set `TYPE`='{type}' where ID={item[0]} limit 1;"""
        cur_T.execute(sql)
        conn_T.commit()
        print(item[0], item[1], type)
except:
    traceback.print_exc()
cur_T.close()
conn_T.close()
cur_W.close()
Пример #9
0
def daemon():
    """Supervisor loop: (re)build the work schedule and run one crawler thread.

    Runs forever; each cycle logs in, selects up to 2000 recent commodities that
    still lack a crawled description copy, and hands them to a mythread worker.
    Any failure is logged and the whole cycle is retried.
    """
    while True:
        try:
            urllib3.disable_warnings()
            global HEADER
            HEADER = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36',
                'Content-Type': 'application/x-www-form-urlencoded'
            }
            login()
            # Recent commodities with no row yet in the description copy table.
            sql = """select distinct(URL_ID),TITLE,ID,CREATE_DATE from cm_commodity where CREATE_DATE > "2019-11-14 " and URL_ID NOT IN(SELECT URL_ID FROM crawler_commodity_module_description_copy) ORDER BY CREATE_DATE DESC limit 2000;"""
            Schedule = schedule.schedule(sql, connect_mysql.w_shark_erp())
            print('开始线程')
            t1 = mythread(
                "1",
                Schedule,
            )
            t1.start()
            print('线程1启动')
            time.sleep(1)
            t1.join()
        except Exception as e:
            # Bug fix: was a bare `except: pass`, which silently swallowed every
            # error (even KeyboardInterrupt) and made failures undiagnosable.
            # Removed the large commented-out t2..t10 thread scaffolding; add
            # workers back by looping over mythread instances if ever needed.
            print(e)
Пример #10
0
            for thread in thread_list:
                thread.start()
        except:
            pass
        time.sleep(600)


if __name__ == '__main__':
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3724.8 Safari/537.36',
    }
    sql = "select URL_ID from cm_commodity where CREATE_DATE >= date_sub(now(),interval 2 day) AND IMG_URL is null order by CREATE_DATE DESC limit 10;"
    while True:
        try:
            Schedule = schedule.schedule(sql, connect_mysql.w_shark_erp())
            thread_list = []
            for i in range(1):
                thread_list.append(mythread(
                    str(i + 1),
                    Schedule,
                ))

            for thread in thread_list:
                thread.start()
                time.sleep(1)
            while True:
                if not len(Schedule.classes):
                    print("新一轮数据更新")
                    break
                else: