示例#1
0
def extend_information_jiexi(product_id, content_watch, content_buy,
                             content_linija, update_time):
    """Parse the three recommendation payloads of a product and store them.

    Each raw payload is cleaned with ``return_to_json_text``, decoded as JSON
    and its ``'result'`` entry is handed to ``help`` together with a kind
    code: 1 for ``content_watch``, 2 for ``content_buy`` and 3 for
    ``content_linija``.  ``content_linija`` is optional: when its cleaned text
    is empty, ``None`` is passed to ``help`` instead.

    Args:
        product_id: Identifier of the product the payloads belong to.
        content_watch: Raw response text for the first recommendation feed.
        content_buy: Raw response text for the second recommendation feed.
        content_linija: Raw response text for the third feed (may be empty).
        update_time: Timestamp value forwarded unchanged to ``help``.

    Returns:
        ``"ok"`` when all three ``help`` calls returned ``'ok'``; ``False``
        when anything failed (the error is printed, never propagated).
    """
    # Bound before the ``try`` so the ``finally`` clause cannot hit an unbound
    # name when creating the connection object itself fails.
    x = None
    try:
        x = Mymysql()
        x._GetConnect()

        # ``json.loads`` ignores ``encoding`` on Python 3 and rejects the
        # keyword from 3.9 on, so it is not passed any more.
        content_watch = return_to_json_text(content_watch)
        content_watch = json.loads(content_watch)['result']

        content_buy = return_to_json_text(content_buy)
        content_buy = json.loads(content_buy)['result']

        content_linija = return_to_json_text(content_linija)
        if content_linija:
            content_linija = json.loads(content_linija)['result']
        else:
            content_linija = None

        re_1 = help(product_id, content_watch, x, 1, update_time)
        re_2 = help(product_id, content_buy, x, 2, update_time)
        re_3 = help(product_id, content_linija, x, 3, update_time)
        if re_1 == re_2 == re_3 == 'ok':
            return "ok"
        raise Exception("help error")
    except Exception as ex:
        print(product_id, "extend_information_jiexi", "error:", ex)
        return False
    finally:
        if x is not None:
            x.EndSql()
示例#2
0
 def __init__(self):
     """Initialise the thread, its ``Mymysql`` handle and request headers.

     Creates ``self.y`` and calls ``_GetConnectY()`` on it, stores the HTTP
     headers used for image requests in ``self.header``, then calls
     ``self.set_ip()`` and prints the thread name together with ``self.ip``.
     """
     threading.Thread.__init__(self)
     self.y = Mymysql()
     self.y._GetConnectY()
     user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
     self.header = {
         'Host': 'img.alicdn.com',
         'Upgrade-Insecure-Requests': '1',
         'User-Agent': user_agent,
     }
     self.set_ip()
     print(self.name, "初始化完毕:", self.ip)
示例#3
0
class Producer(threading.Thread):
    """Thread that copies rows of not-yet-saved images into ``img_list``.

    On construction it creates a ``Mymysql`` handle, connects, and runs a
    query for up to 20000 ``image`` rows whose ``isSaved_Picture`` is ``'0'``.
    ``run`` then pushes every fetched row (``pic_md5, img_src, id``) into the
    module-level ``img_list`` via ``put``.
    """

    def __init__(self):
        threading.Thread.__init__(self)
        self.x = Mymysql()
        self.x._GetConnect()
        sql = "SELECT pic_md5,img_src,id FROM `image` where isSaved_Picture='0' limit  20000"  # where id >
        self.cur = self.x.ExecQueryGetcur(sql)

    def run(self):
        """Drain the cursor into ``img_list`` and print a marker when done."""
        while True:
            item = self.cur.fetchone()
            if item is None:
                break
            img_list.put(item)
        print("put down~")

    def __del__(self):
        """Close the DB handle, if construction got far enough to create it."""
        # ``__del__`` also runs for instances whose ``__init__`` raised before
        # ``self.x`` was assigned; guard so that case does not raise again.
        connection = getattr(self, "x", None)
        if connection is not None:
            connection.EndSql()
def shop_service_jiexi(shop_id, content, update_time):
    """Extract shop service promises from HTML and update ``seller_info``.

    Looks at the text of every ``li.service-item h3.name`` node and searches
    it for "<N>天无理由退货" (no-reason refund days) and "<N>小时发货"
    (delivery hours after payment).  The two values are written to the
    ``seller_info`` row matching ``shop_id`` and ``update_time``.

    Args:
        shop_id: Shop identifier used in the ``where`` clause.
        content: HTML text of the shop service page.
        update_time: Timestamp value used in the ``where`` clause.

    Returns:
        ``"ok"`` when parsing succeeded (a failing UPDATE is only printed),
        ``False`` when connecting or parsing raised.
    """
    # Bound up front so cleanup is safe even if ``Mymysql()`` itself raises.
    x = None
    try:
        x = Mymysql()
        x._GetConnect()
        tree = etree.HTML(content)
        items = tree.xpath(".//li[@class='service-item']//h3[@class='name']/text()")
        refund_day_for_no_reason = delivery_hour_after_payment = -1
        for item in items:
            # NOTE(review): each value is only searched while it still equals
            # -1. If ``first_or_zero`` returns 0 for "no match" (as its name
            # suggests), only the first item is ever inspected - confirm
            # against the helper before relying on this.
            if refund_day_for_no_reason == -1:
                refund_day_for_no_reason = first_or_zero(re.findall(r"(\d+)天无理由退货", item))
            if delivery_hour_after_payment == -1:
                delivery_hour_after_payment = first_or_zero(re.findall(r"(\d+)小时发货", item))
        sql = """
              update seller_info set refund_day_for_no_reason = '%s',
                                     delivery_hour_after_payment = '%s'
                                     where shop_id = '%s' and update_time = '%s'
              """ % (refund_day_for_no_reason, delivery_hour_after_payment, shop_id, update_time)
        try:
            x.ExecNonQuery(sql)
        except Exception as ex:
            # Best effort: a failed UPDATE is reported but does not fail the call.
            print(ex)
            print(sql)
            print("=============================")
        return "ok"

    except Exception as ex:
        print(shop_id, "shop_service_jiexi", "error:", ex)
        return False
    finally:
        if x is not None:
            x.EndSql()
示例#5
0
def descContent_jiexi(product_id, content, update_time, item_dict,
                      thread_name):
    """Parse a product description payload: store its images, keep its text.

    The payload is expected to look like ``var desc='<html>';``; the wrapper
    is stripped and the rest parsed as HTML.  Every ``<img>`` whose ``src``
    ends with ``jpg`` is inserted into the ``image`` table (with ``50x50``
    replaced by ``400x400`` in the URL).  All text nodes are concatenated with
    whitespace, backslashes and single quotes removed, and stored in
    ``item_dict['note']``.

    Args:
        product_id: Identifier of the product being parsed.
        content: Raw description response text.
        update_time: Timestamp value stored with each image row.
        item_dict: Dict that receives the ``'note'`` entry (mutated in place).
        thread_name: Unused by this function.

    Returns:
        ``"ok"`` on success, ``False`` on failure.  On failure the current
        content is appended to ``jiexi/<product_id>_descContent.html`` when
        that file can be written.
    """
    # Bound up front so ``finally`` is safe even if ``Mymysql()`` raises.
    x = None
    try:
        x = Mymysql()
        x._GetConnect()
        content = content.replace("var desc='", "")
        content = content.replace("';", "")
        tree = etree.HTML(content)
        imgs = tree.xpath(".//img")
        count = 1
        for img in imgs:
            src = img.get("src")
            if not src or not src.endswith("jpg"):
                continue
            src = src.replace("50x50", "400x400")

            from_who = 1
            position = 2
            sequence = count
            pic_md5 = get_pic_md5(src, product_id)
            sql = """
                              insert into image(product_id,img_src,position,sequence,update_time,from_who,pic_md5)
                                          values('%s','%s','%s','%s','%s','%s','%s')
                              """ % (product_id, src, position, sequence,
                                     update_time, from_who, pic_md5)
            try:
                x.ExecNonQuery(sql)
                # The sequence number only advances for rows actually stored.
                count += 1

            except Exception as ex:
                print(ex)
                print(sql)
                print("=============================")
                continue
        text = tree.xpath(".//text()")
        text = [item.strip() for item in text]
        text = " ".join(text)
        text = re.sub(r"\s+", "", text)
        text = text.replace("\\", "")
        item_dict['note'] = text.replace("'", "")
        return "ok"

    except Exception as ex:
        print(product_id, "descContent_jiexi", "error:", ex)
        # ``%s`` formatting keeps the same file name for string ids and also
        # works for numeric ids, where ``product_id + "_"`` would raise.
        file_save = os.path.join("jiexi",
                                 "%s_descContent.html" % (product_id,))
        try:
            with codecs.open(file_save, "a+", encoding="utf-8") as f:
                f.write(content)
        except Exception as dump_ex:
            # The dump is a debugging aid; failing to write it must not hide
            # the parse failure reported above.
            print(product_id, "descContent_jiexi", "dump error:", dump_ex)
        return False

    finally:
        if x is not None:
            x.EndSql()
示例#6
0
def rate_jiexi(product_id, content, update_time, item_dict):
    """Parse the rating summary of a product and record its counters.

    ``content`` is a JSONP response; the text between the first ``(`` and the
    last ``)`` is decoded as JSON.  Every entry of ``data['impress']`` is
    inserted into ``product_impress`` and the comment counters are copied
    into ``item_dict``.

    Args:
        product_id: Identifier of the product being parsed.
        content: Raw JSONP response text.
        update_time: Timestamp value stored with each impress row.
        item_dict: Dict that receives the ``*_comment_num`` entries (mutated
            in place).

    Returns:
        ``("ok", comment_with_picture_num)`` on success, ``(False, -1)`` on
        failure (the error is printed, never propagated).
    """
    # Bound up front so ``finally`` is safe even if ``Mymysql()`` raises.
    x = None
    try:
        x = Mymysql()
        x._GetConnect()
        begin_index = content.index("(")
        end_index = content.rindex(")")
        content = content[begin_index + 1:end_index]
        # ``json.loads`` ignores ``encoding`` on Python 3 and rejects the
        # keyword from 3.9 on, so it is not passed any more.
        all_info = json.loads(content)
        data = all_info['data']
        for impress in data['impress']:
            impress_type = impress['title']
            impress_count = impress['count']
            impress_id = uuid.uuid1()
            sql = """
                     insert  into product_impress(impress_id,product_id,
                                                 impress_type,impress_count,update_time) values
                                                 ('%s','%s','%s','%s','%s')
                     """ % (impress_id, product_id, impress_type,
                            impress_count, update_time)
            try:
                x.ExecNonQuery(sql)
            except Exception as ex:
                # Best effort: report the failing row and keep going.
                print(ex)
                print(sql)
                print("=============================")
                continue
        # bar info
        counts = data['count']
        comment_with_picture_num = counts['pic']

        item_dict['comment_with_picture_num'] = comment_with_picture_num
        item_dict['append_comment_num'] = counts['additional']
        item_dict['moderate_comment_num'] = counts['normal']
        item_dict['negative_comment_num'] = counts['bad']
        item_dict['refund_comment_num'] = all_info['sellerRefundCount']
        item_dict['positive_comment_num'] = counts['good']

        return "ok", comment_with_picture_num
    except Exception as ex:
        print(product_id, "rate_jiexi", "error:", ex)
        return False, -1

    finally:
        if x is not None:
            x.EndSql()
示例#7
0
class Customer(threading.Thread):
    """Worker thread that downloads queued images through a proxy.

    Each worker owns a ``Mymysql`` handle (``self.y``), a fixed set of request
    headers and a proxy address obtained from ``get_ip()``.  ``run`` takes
    ``(pic_md5, url, id)`` rows from the module-level ``img_list``, downloads
    each image, writes it to ``D:\\image\\<pic_md5>.jpg`` and flags the row
    as saved.
    """

    def __init__(self):
        threading.Thread.__init__(self)
        self.y = Mymysql()
        self.y._GetConnectY()
        self.header = {
            'Host':
            'img.alicdn.com',
            'Upgrade-Insecure-Requests':
            '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
        }
        self.set_ip()
        print(self.name, "初始化完毕:", self.ip)

    def set_ip(self):
        """Fetch a new proxy address and bump the global ``ip_count``."""
        global ip_count
        self.ip = get_ip()
        self.proxies = {'http': self.ip, 'https': self.ip}
        ip_count += 1

    def get_image(self, url, id):
        """Download ``url`` and return its body, or ``False`` after 3 errors.

        The first non-200 answer is retried after a 3 second pause with the
        same proxy.  Any later non-200 answer, and any exception raised by
        the request, counts as an error: the proxy is replaced and the error
        is printed.  Only the body of a 200 response is ever returned.

        Args:
            url: Absolute image URL.
            id: Row id of the image, used only in the error message.

        Returns:
            The response body on success, ``False`` once the error counter
            reaches 3.
        """
        error_time = 0
        while error_time < 3:
            try:
                response = requests.get(url,
                                        headers=self.header,
                                        timeout=3,
                                        proxies=self.proxies)
                if response.status_code == 200:
                    return response.content
                if error_time == 0:
                    time.sleep(3)
                    error_time = 1
                    continue
                # Previously a non-200 answer on the last attempt fell through
                # and its (error page) body was returned as image data.
                raise Exception("status_code error:" +
                                str(response.status_code))
            except Exception as ex:
                self.set_ip()
                with lock:
                    print("%s encouter error %s: id=%s" % (self.name, str(ex), id))
                error_time += 1
        return False

    def _mark_saved(self, row_id, pic_md5, message):
        """Flag image ``row_id`` as saved and print ``message`` under the lock.

        ``message`` is a ``%``-template taking thread name, md5, id and time.
        """
        sql = "update image set isSaved_Picture='1' where id='%s'" % (
            row_id,)
        self.y.ExecNonQuery(sql)
        now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        with lock:
            print(message % (self.name, pic_md5, str(row_id), now))

    def run(self):
        """Process queue items forever; failures are appended to a log file."""
        while True:
            # Reset per item so the error handler below never touches an
            # unbound name when reading the queue itself fails.
            url = None
            try:
                pic_md5, url, row_id = img_list.get()
                save_path = r"D:\image\%s.jpg" % (pic_md5)
                url = url.strip()
                if not url.startswith("http"):
                    url = "https:" + url

                try:
                    sql = "insert into img_unique(img_unique) values ('%s')" % (
                        pic_md5)
                    self.y.ExecNonQuery(sql)
                except Exception as ex:
                    # '1062' in the message is presumably MySQL's
                    # duplicate-key error: the picture was already claimed, so
                    # the row is just flagged as saved. TODO confirm.
                    if '1062' in str(ex):
                        self._mark_saved(row_id, pic_md5,
                                         "1062 %s:%s:%s done at %s")
                    else:
                        print(ex, url)
                    continue

                content = self.get_image(url, row_id)
                if not content:
                    raise Exception("no content")

                with open(save_path, "wb") as f:
                    f.write(content)

                self._mark_saved(row_id, pic_md5, "%s:%s:%s done at %s")
            except Exception as ex:
                log_path = os.path.join("log", self.name + ".log")
                with codecs.open(log_path, "a+", encoding="utf-8") as f:
                    f.write("%s:%s\n" % (url, str(ex)))
                    f.write("======================\n")

    def __del__(self):
        """Close the DB handle, if construction got far enough to create it."""
        connection = getattr(self, "y", None)
        if connection is not None:
            connection.EndSql()
示例#8
0
 def __init__(self):
     """Initialise the thread, connect, and open the pending-image cursor.

     Creates ``self.x`` and calls ``_GetConnect()`` on it, then stores in
     ``self.cur`` whatever ``ExecQueryGetcur`` returns for a query selecting
     up to 20000 ``image`` rows whose ``isSaved_Picture`` is ``'0'``.
     """
     threading.Thread.__init__(self)
     self.x = Mymysql()
     self.x._GetConnect()
     pending_query = "SELECT pic_md5,img_src,id FROM `image` where isSaved_Picture='0' limit  20000"  # where id >
     self.cur = self.x.ExecQueryGetcur(pending_query)
示例#9
0
def pic_jiexi(product_id, pic_content_original,update_time,thread_name):
    """Parse one page of picture reviews and store reviews and images.

    ``pic_content_original`` is a JSONP response; the text between the first
    ``(`` and the last ``)`` is decoded as JSON.  For every entry of
    ``comments`` the review photos and the photos of the first appended
    review are inserted into ``image``, then the review itself is inserted
    into ``product_comment``.  Single quotes, backslashes and ``?`` are
    removed from the review texts before they are put into the SQL.

    Args:
        product_id: Identifier of the product the reviews belong to.
        pic_content_original: Raw JSONP response text.
        update_time: Timestamp value stored with every inserted row.
        thread_name: Unused by this function.

    Returns:
        ``(max_page, "ok")`` on success, where ``max_page`` is the payload's
        ``maxPage`` value; ``(-1, False)`` on failure (the error is printed).
    """
    # Bound up front so ``finally`` is safe even if ``Mymysql()`` raises.
    x = None
    try:
        x = Mymysql()
        x._GetConnect()
        pic_content = pic_content_original.strip()
        begin_index = pic_content.index("(")
        end_index = pic_content.rindex(")")
        pic_content = pic_content[begin_index+1:end_index]
        # ``json.loads`` ignores ``encoding`` on Python 3 and rejects the
        # keyword from 3.9 on, so it is not passed any more.
        pic_content = json.loads(pic_content)
        max_page = pic_content['maxPage']
        comments = pic_content['comments']
        for item in comments:
            customer_name = item['user']['nick']
            rate = item['user']['displayRatePic'].split('.')[0]
            review_id = item['rateId']
            review_type = '3'
            content = item['content']
            if item['date']:
                review_info = datetime.datetime.strptime(item['date'].strip(),u'%Y年%m月%d日 %H:%M')
                review_date = review_info.date()
                review_time = review_info.time()
            else:
                # Placeholder values used when the review carries no date.
                review_date = '0000-00-00'
                review_time = '00:00:00'

            count_num = item['useful']
            refund_time = ""
            Brief_information = item['auction']['sku']

            # Insert the pictures attached to the review itself.
            count = 1
            photos = item['photos']
            for photo in photos:
                src = photo['url']
                from_who = '2'
                position = '4'
                sequence = count
                pic_md5 = get_pic_md5(src, product_id)
                sql = """
                      insert into image(pic_md5,product_id,img_src,position,sequence,update_time,from_who,review_id)
                              values('%s','%s','%s','%s','%s','%s','%s','%s')
                      """ % (pic_md5,product_id,src,position,sequence,update_time,from_who,review_id)
                try:
                    x.ExecNonQuery(sql)
                    count +=1
                except Exception as ex:
                    print (ex)
                    print (sql)
                    print ("=============================")
                    continue
            back_comment = ""
            back_comment_day = ""
            # Parse the appended (follow-up) review; only the first entry of
            # ``appendList`` is used - the loop breaks after one pass.
            append_list = item['appendList']
            for item_append in append_list:
                # NOTE(review): sequence starts at 0 here but at 1 for the
                # main review photos above - confirm which is intended.
                count = 0
                for item_append_photos in item_append['photos']:
                    src = item_append_photos['thumbnail']
                    from_who = '2'
                    position = '4'
                    sequence = count
                    pic_md5 = get_pic_md5(src, product_id)
                    sql = """
                                          insert into image(pic_md5,product_id,img_src,position,sequence,update_time,from_who,review_id)
                                                  values('%s','%s','%s','%s','%s','%s','%s','%s')
                                          """ % (
                        pic_md5, product_id, src, position, sequence, update_time, from_who, review_id)
                    try:
                        x.ExecNonQuery(sql)
                        count += 1
                    except Exception as ex:
                        print(ex)
                        print(sql)
                        print("=============================")
                        continue
                back_comment = item_append['content']
                back_comment_day = item_append['dayAfterConfirm']
                break
            # Strip characters that would break the hand-built SQL literal.
            content = content.replace("'","")
            content = content.replace("\\","")
            content = content.replace("?","")
            back_comment = back_comment.replace("'","")
            back_comment =  back_comment.replace("\\","")
            back_comment = back_comment.replace("?", "")
            sql = """
                             insert into product_comment(product_id,review_id,customer_name,rate,
                                    review_type,content,review_date,review_time,Brief_information,back_comment,
                                    back_comment_day,count_num,refund_time,update_time)
                             values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')
                             """ % (product_id, review_id, customer_name, rate, review_type,
                                    content, review_date, review_time,
                                    Brief_information, back_comment, back_comment_day, count_num, refund_time,
                                    update_time)
            try:
                x.ExecNonQuery(sql)
            except Exception as ex:
                # Best effort: report the failing review and keep going.
                print(ex)
                print(sql)
                print("=============================")
                continue

        return max_page, "ok"

    except Exception as ex:
        print(product_id, "pic_jiexi", "error:", ex)
        return -1, False
    finally:
        if x is not None:
            x.EndSql()
示例#10
0
def index_jiexi(shop_ID, html, shop_url, update_time):
    """Parse a shop home page and store the products listed on it.

    Every element whose ``href`` starts with ``//item.taobao.com`` and
    contains ``id=<digits>`` is inserted into ``shop_homepage`` together with
    its cleaned link text (at most 100 characters) and a running sequence
    number.

    Args:
        shop_ID: Caller-side shop identifier, used for messages and for the
            name of the debug dump.
        html: HTML text of the shop home page.
        shop_url: URL of the page, used only in a diagnostic message.
        update_time: Timestamp value stored with each row.

    Returns:
        ``True`` when the page carries a ``guang-logo`` link or an
        ``error-notice`` block (nothing is stored); otherwise the text of the
        ``id-time J_id_time`` span, or ``'0000-00-00'`` when it is absent.
        ``False`` on failure, in which case ``html`` is written to
        ``re_jiexi/<shop_ID>_index`` when that file can be written.
    """
    # Bound up front so cleanup is safe even if ``Mymysql()`` raises; the
    # single ``finally`` below also closes the handle on the early returns.
    x = None
    try:
        x = Mymysql()
        x._GetConnect()
        tree = etree.HTML(html)
        # Page shows the "guang" logo: nothing to parse.
        guang = tree.xpath(".//a[@class='guang-logo']//text()")
        if len(guang) > 0:
            print(shop_ID, guang[0])
            return True

        # Page shows an error notice (no items).
        error = tree.xpath(".//*[@id='error-notice']/div[2]")
        if len(error) > 0:
            print(shop_ID, "no_item")
            return True

        products = tree.xpath(".//*[starts-with(@href,'//item.taobao.com')]")
        count = 1
        # Shop id as embedded in the page; ``[1:-1]`` drops the first and
        # last character of the captured value.
        shop_id = re.findall(r'"shopI[dD]":(.*?),', html)[0].strip()[1:-1]
        for item in products:
            url = item.get("href")
            item_id = re.findall(r"id=(\d+)", url)
            if len(item_id) == 0:
                continue
            # Product id
            item_id = item_id[0]
            # Product name: link text without whitespace and single quotes.
            name = item.xpath("string(.)")
            name = re.sub("[\r\n\t ]*", "", name)
            # NOTE(review): "\xc2\xa0" looks like the UTF-8 bytes of a
            # non-breaking space carried over from Python 2; on Python 3 a
            # real NBSP is "\xa0" and is not removed here - confirm.
            name = name.replace("\xc2\xa0", "")
            name = name.replace("'", "")
            name = name.strip()[:100]

            # Relative position of the product on the page.
            sequence = count

            sql = "insert  ignore into shop_homepage(shop_id,product_id,product_name,sequence,update_time) values ('%s','%s','%s',%d,'%s')" % (
                shop_id, item_id, name, sequence, update_time)
            try:
                x.ExecNonQuery(sql)
                count += 1
            except Exception as ex:
                # Best effort: report the failing row and keep going.
                print(ex)
                print(sql)
                print("=============================")
                continue

        if count < 10:
            print(shop_ID, ":", shop_id, ":", u"主页产品", ":", count, shop_url)

        alipay_Authentication = tree.xpath(
            ".//span[@class='id-time J_id_time']/text()")
        if not alipay_Authentication:
            alipay_Authentication = '0000-00-00'
        else:
            alipay_Authentication = alipay_Authentication[0]
        return alipay_Authentication
    except Exception as ex:
        print(ex)
        save_path = os.path.join("re_jiexi", "%s_index" % (shop_ID,))
        try:
            with codecs.open(save_path, "w+", encoding="utf-8") as f:
                f.write(html)
        except Exception as dump_ex:
            # The dump is a debugging aid; failing to write it must not hide
            # the parse failure reported above.
            print(shop_ID, "index_jiexi", "dump error:", dump_ex)
        return False
    finally:
        if x is not None:
            x.EndSql()
示例#11
0
from connectMysql import Mymysql

# Dump every row of ``shop_homepage`` to stdout.
x = Mymysql()
x._GetConnect()
try:
    sql = "select * from shop_homepage"
    target_ids = x.ExecQuery(sql)
    print(target_ids)
finally:
    # Release the connection even when the query fails.
    x.EndSql()
示例#12
0
    def main_method(self, product_id, total_sales_volume, shop_id):
        """Crawl one product end to end and insert it into the database.

        Fetches the item page, the detail, description, review-count,
        recommendation, rating and favourites endpoints through
        ``self.get_page``, hands each payload to its parser and finally
        inserts one ``product_information`` row.  Progress and failures are
        written to ``self.f_log``.

        Args:
            product_id: Item id used to build every URL.
            total_sales_volume: Value stored unchanged in the product row.
            shop_id: Shop identifier stored unchanged in the product row.

        Returns:
            ``"ok"`` when the product was stored, when ``get_page`` reports
            status ``'no'`` for the item page, or when the page contains
            "下架"; ``False`` when any step failed (the reason is logged).
        """
        try:
            update_time = datetime.datetime.now().strftime("%Y-%m-%d")
            main_url = "https://item.taobao.com/item.htm?id=%s" % (product_id)

            # Fetch the item page.
            mainPageConent, status_code = self.get_page(main_url, host="item.taobao.com")
            if status_code == 'no':
                self.f_log.write("%s can not crawl\n" % (main_url))
                return "ok"
            self.header['Referer'] = main_url
            if not mainPageConent:
                self.f_log.write(" get mainPageConent fail status_code:%s\n" % status_code)
                raise Exception("mainPageConent error")
            self.f_log.write("get mainPageConent succeed\n")
            if '下架' in mainPageConent:
                return 'ok'

            # Extract categroy_id, sellerId and descUrl from the page source.
            categroy_id = re.findall(r'data-catid="(\d+)"', mainPageConent)
            if not categroy_id:
                raise Exception("categroy_id out of  index")
            categroy_id = categroy_id[0]
            sellerId = re.findall(r"sellerId\s*:\s*'(\d+)',", mainPageConent)
            if not sellerId:
                raise Exception("sellerId out of  index")
            sellerId = sellerId[0]
            descUrl = re.findall("location.protocol===(.*),", mainPageConent)
            if not descUrl:
                raise Exception("descUrl out of  index")
            descUrl = descUrl[0]

            # Product record; the parsers called below receive this dict and
            # are expected to fill in the remaining keys read by the INSERT.
            product_information_Item_dict = {
                'shop_id': shop_id,
                'product_id': product_id,
                'update_time': update_time,
                'total_sales_volume': total_sales_volume,
            }
            self.f_log.write("categroy_id,sellerId,descUrl succeed\n")

            # Parse the item page.
            main_page_jiexi_re = main_page_jiexi(product_id, mainPageConent, update_time, product_information_Item_dict,self.name)
            if main_page_jiexi_re != 'ok':
                self.f_log.write("main_page_jiexi_fail\n")
                raise Exception("main_page_jiexi_re error")
            self.f_log.write("main_page_jiexi_succeed\n")

            # Fetch the detail payload.
            detail_url = "https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId=%s&sellerId=%s&modules=dynStock,qrcode,viewer,price,contract,duty,xmpPromotion,delivery,upp,activity,fqg,zjys,amountRestriction,couponActivity,soldQuantity,tradeContract&callback=onSibRequestSuccess" % (
                product_id, sellerId)
            detail_content, status_code = self.get_page(detail_url, host="detailskip.taobao.com")
            if not detail_content:
                self.f_log.write("get detail_content fail %s\n" % str(status_code))
                self.f_log.write(detail_url + "\n")
                raise Exception("detail_content error")
            self.f_log.write("get detail_content succeed\n")

            # Parse the detail payload.
            detail_content_jiexi_re = detail_content_jiexi(product_id, detail_content, update_time,
                                                           product_information_Item_dict)
            if detail_content_jiexi_re != 'ok':
                self.f_log.write("detail_content_jiexi_re fail\n")
                raise Exception("detail_content_jiexi_re error")
            self.f_log.write("detail_content_jiexi_re succeed\n")

            # Fetch the description: the URL is the third ':'-separated piece
            # of the captured expression, without its surrounding characters.
            descUrl = "https:" + descUrl.split(":")[2].strip()[1:-1]
            descContent, status_code = self.get_page(descUrl, host="desc.alicdn.com")
            if not descContent:
                self.f_log.write("get descContent fail,status_code:%s\n" % status_code)
                raise Exception("descContent Error")
            self.f_log.write("get descContent succeed\n")

            # Parse the description.
            descContent_jiexi_re = descContent_jiexi(product_id, descContent, update_time,
                                                     product_information_Item_dict,self.name)
            if descContent_jiexi_re != 'ok':
                self.f_log.write("get descContent_jiexi fail\n")
                raise Exception("descContent_jiexi_re error")
            self.f_log.write("get descContent_jiexi succeed\n")

            # Cumulative review count: first run of digits in the response.
            detailCounturl = "https://rate.taobao.com/detailCount.do?itemId=%s" % (product_id)
            detailCountContent, status_code = self.get_page(detailCounturl, host="rate.taobao.com")
            if not detailCountContent:
                self.f_log.write("get detailCountContent fail,status_code:%s\n" % status_code)
                raise Exception("detailCount_Content Error")
            self.f_log.write("get detailCountContent succeed\n")
            detailCount = re.findall(r"\d+", detailCountContent)
            if not detailCount:
                self.f_log.write("detailCount index out of range\n")
                raise Exception("detailCount index out of range")
            self.f_log.write("detailCount succeed\n")
            product_information_Item_dict['cumulative_review'] = detailCount[0]

            # "Also viewed" recommendations.
            recommend_one_url = "https://tui.taobao.com/recommend?&callback=detail_recommend_viewed&appid=9&count=12&sellerid=%s&itemid=%s&categoryid=%s" % (
                sellerId, product_id, categroy_id)
            recommend_oneContent, status_code = self.get_page(recommend_one_url, host="tui.taobao.com")
            if not recommend_oneContent:
                self.f_log.write("get recommend_oneContent fail,status_code:%s\n" % status_code)
                raise Exception("recommend_oneContent Error")

            # "Also bought" recommendations.
            recommend_two_url = "https://tui.taobao.com/recommend?callback=detail_recommend_bought&appid=11&" + "count=12&sellerid=%s&itemid=%s&categoryid=%s" % (
                sellerId, product_id, categroy_id)
            recommend_twoContent, status_code = self.get_page(recommend_two_url, host="tui.taobao.com")
            if not recommend_twoContent:
                self.f_log.write("get recommend_twoContent fail, status_code:%s\n" % status_code)
                raise Exception("recommend_twoContent Error")

            # Third recommendation feed (appid=3066).
            recommend_third_url = "https://tui.taobao.com/recommend?itemid=%s&sellerid=%s&callback=jsonp1524&appid=3066" % (
                product_id, sellerId)
            recommend_thirdContent, status_code = self.get_page(recommend_third_url, host="tui.taobao.com")
            if not recommend_thirdContent:
                self.f_log.write("get recommend_thirdContent fail,status_code:%s\n" % status_code)
                raise Exception("recommend_thirdContent Error")

            # Parse the three recommendation feeds.
            extend_information_jiexi_re = extend_information_jiexi(product_id, recommend_oneContent,
                                                                   recommend_twoContent, recommend_thirdContent,
                                                                   update_time)
            if not extend_information_jiexi_re:
                self.f_log.write("extend_information_jiexi_re fail\n")
                raise Exception("extend_information_jiexi error")
            self.f_log.write("extend_information_jiexi succeed\n")

            # Buyer impressions and comment counters.
            rate_url = "https://rate.taobao.com/detailCommon.htm?auctionNumId=%s&userNumId=%s&callback=json_tbc_rate_summary" % (
                product_id, sellerId)
            rate_content, status_code = self.get_page(rate_url, host='rate.taobao.com')
            if not rate_content:
                self.f_log.write("get rate_content fail,status_code:%s\n" % (status_code))
                raise Exception("rate_content error")
            self.f_log.write("get rate_content succeed\n")
            # Parse the rating summary.
            rate_jiexi_re, comment_with_picture_num = rate_jiexi(product_id, rate_content, update_time,
                                                                 product_information_Item_dict)
            if not rate_jiexi_re:
                self.f_log.write("get rate_jiexi_re error\n")
                raise Exception("rate_jiexi_re error")
            self.f_log.write("rate_jiexi succeed\n")

            # Favourites counter.
            collectcount_URL = "https://count.taobao.com/counter3?callback=jsonp87&keys=ICCP_1_%s" % (product_id)
            collectcount_Content, status_code = self.get_page(collectcount_URL, host='count.taobao.com')
            if not collectcount_Content:
                self.f_log.write("get collectcount_Content fail,status_code:%s\n" % (status_code))
                raise Exception("collectcount_Content error")
            self.f_log.write("get collectcount_Content succeed\n")
            # The count is the first run of digits after the first ':'.
            product_information_Item_dict['collection_number'] = re.findall(
                r"\d+", collectcount_Content.split(":")[1])[0]

            # Picture reviews, only when the summary reported any.
            if str(comment_with_picture_num) != '0':
                get_picture_comment_re = self.get_picture_comment(product_id, sellerId, update_time)
                if get_picture_comment_re == 'ok':
                    self.f_log.write("get_picture_comment succeed\n")
                else:
                    raise Exception("get_picture_comment_re ERROR")
            else:
                self.f_log.write("not exist picture\n")

            # Insert the product row.
            sql = """
                          insert  into product_information(
            			                        shop_id,
                                                total_sales_volume,
            	                                product_name,
            	                                product_profile,
                                                cumulative_review,
                                                transaction_volume,
                                                price,
                                                taobao_price,
                                                place_of_delivery,
                                                express_fee,
                                                amount_of_inventory,
                                                promise,
                                                payment_method,
                                                collection_number ,
                                                note,
                                                update_time,
            				                    product_id,comment_with_picture_num,
            				                    append_comment_num,moderate_comment_num,negative_comment_num,
            				                    refund_comment_num,positive_comment_num
            	                                  ) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')
                   """ % (
                shop_id, total_sales_volume, product_information_Item_dict['product_name'],
                product_information_Item_dict['product_profile'], product_information_Item_dict['cumulative_review'],
                product_information_Item_dict['transaction_volume'],
                product_information_Item_dict['price'],
                product_information_Item_dict['taobao_price'],
                product_information_Item_dict['place_of_delivery'], product_information_Item_dict['express_fee'],
                product_information_Item_dict['amount_of_inventory'],
                product_information_Item_dict['promise'], product_information_Item_dict['payment_method'],
                product_information_Item_dict['collection_number'], product_information_Item_dict['note'], update_time,
                product_id,
                product_information_Item_dict['comment_with_picture_num'],
                product_information_Item_dict['append_comment_num'],
                product_information_Item_dict['moderate_comment_num'],
                product_information_Item_dict['negative_comment_num'],
                product_information_Item_dict['refund_comment_num'],
                product_information_Item_dict['positive_comment_num']
            )
            x = Mymysql()
            x._GetConnect()
            try:
                x.ExecNonQuery(sql)
            except Exception as insert_ex:
                # ``except Exception`` (not a bare ``except``) so that
                # KeyboardInterrupt/SystemExit are not converted into an
                # ordinary failure; the driver error is printed with the SQL.
                print(insert_ex)
                print(sql)
                raise Exception("product information insert error")
            finally:
                x.EndSql()

            return "ok"

        except Exception as ex:
            self.f_log.write("main method error:%s:%s\n" % (str(ex), self.name))
            return False
示例#13
0
def _rating_compare(score_block):
    """Return the signed "compared with industry" ratio of one score block.

    Reads the first ``<strong>`` inside ``score_block``.  Text containing
    ``--`` yields ``0``.  Otherwise the text minus its last character is
    parsed as a float and divided by 100, and the result is negated unless
    the element's ``class`` attribute contains ``over``.
    """
    strong = score_block.xpath(".//strong")[0]
    strong_class = strong.get("class")
    text = strong.xpath(".//text()")[0]
    if "--" in text:
        return 0
    ratio = float(text[:-1]) * 1.0 / 100
    if "over" not in strong_class:
        ratio *= -1
    return ratio


def _period_counts(tree, li_index, expected):
    """Return the first ``expected`` numbers of one ``J_show_list`` row.

    Every text node under ``li[li_index]`` that contains at least one digit
    run contributes its first digit run (as a string).  Raises ``IndexError``
    when fewer than ``expected`` numbers are present.
    """
    texts = tree.xpath(
        ".//div[@id='J_show_list']//li[%d]//text()" % li_index)
    matches = (re.findall(r"\d+", text) for text in texts)
    numbers = [found[0] for found in matches if found]
    # Index explicitly (instead of slicing) so a short row still fails loudly.
    return [numbers[i] for i in range(expected)]


def _service_row(row, scale=None, strip_commas=False):
    """Parse one row of the 30-day service table.

    Returns ``(own_value, industry_value)``.  The first ``td`` text is
    dropped; the last character of each remaining number cell is dropped
    before the float conversion.  The industry value is negated when the
    middle cell equals "小于" (less than).  ``scale`` optionally multiplies
    both values; ``strip_commas`` removes "," from the shop's own value only.
    """
    cells = row.xpath(".//td/text()")[1:]
    own_text = cells[0][:-1]
    if strip_commas:
        own_text = own_text.replace(",", "")
    own = float(own_text)
    industry = float(cells[2][:-1])
    if scale is not None:
        own *= scale
        industry *= scale
    if cells[1] == "小于":
        industry *= -1
    return own, industry


def _penalty_breakdown(tree):
    """Return the six penalty counters from the month-info detail block.

    The integers found in the block's text (newlines and spaces removed) at
    positions 1..6 are returned in order: total penalties, fake goods, false
    transactions, broken promises, bad descriptions, malicious harassment.
    """
    detail = tree.xpath(
        ".//div[@class='J_TBR_MonthInfo_Detail detail']/div[4]")[0]
    content = detail.xpath("string(.)").replace('\n', '').replace(' ', '')
    numbers = [int(item) for item in re.findall(r"\d+", content)]
    return [numbers[i] for i in range(1, 7)]


def _star_breakdown(box):
    """Return the seven star-rating fields of one ``box-wrap`` block.

    Order: average score, number of raters, then the share of 5/4/3/2/1-star
    ratings.  Each XPath result is passed through ``exist_or_0``.
    """
    paths = (
        ".//div[@class='total']/em[@class='h']/text()",
        ".//div[@class='total']/span/text()",
        ".//div[@class='count count5']/em/text()",
        ".//div[@class='count count4']/em/text()",
        ".//div[@class='count count3']/em/text()",
        ".//div[@class='count count2']/em/text()",
        ".//div[@class='count count1']/em/text()",
    )
    return [exist_or_0(box.xpath(path)) for path in paths]


def rating_jiexi(shop_ID, html, update_time, alipay_Authentication_main_page):
    """Parse a shop rating page and insert one row into ``seller_info``.

    Args:
        shop_ID: Caller-side shop identifier.  Only used to name the dump
            file when parsing fails before the page's own shop id is known.
        html: Rating page markup (text).
        update_time: Value stored in the ``update_time`` column.
        alipay_Authentication_main_page: Fallback for the Alipay
            authentication field when the page has no ``id-time`` span.

    Returns:
        ``"ok"`` when the insert succeeded, ``False`` otherwise.  When
        parsing fails, ``html`` is written to ``re_jiexi/<shop id>_rating``.

    The database connection is released on every path.
    """
    x = None
    # Fallback so the failure handler can always build a dump file name.
    shop_id = shop_ID
    try:
        x = Mymysql()
        x._GetConnect()
        tree = etree.HTML(html)

        # 1. Shop id, taken from the inline JSON ("shopID":"...").
        shop_id = re.findall(r'"shopID":(.*?),', html)[0].strip()[1:-1]
        values = [shop_id]

        # 2. Alipay authentication time (falls back to the main-page value).
        authentication = tree.xpath(".//span[@class='id-time']/text()")
        values.append(authentication[0] if authentication
                      else alipay_Authentication_main_page)

        # 3. Main products.
        values.append(tree.xpath(".//*[@id='chart-name']/text()")[0].strip())

        # 4. Location: the part after ":" in the second info item.
        location_text = "".join(
            tree.xpath(
                ".//div[@class='info-block info-block-first']//ul/li[2]/text()"
            )[0])
        values.append(location_text.split(":")[1].strip())

        # 5. Seller credit / 6. buyer credit.
        values.append(
            tree.xpath(".//ul[@class='sep']/li[1]/text()")[0]
            .split(":")[1].strip())
        values.append(
            tree.xpath(".//ul[@class='sep']/li[2]/text()")[0]
            .split(":")[1].strip())

        # 7. Deposit balance: drop the leading character, the fractional
        #    part and thousands separators; 0 when absent.
        seller_bond = tree.xpath(".//div[@class='charge']/span/text()")
        values.append(
            seller_bond[0][1:].split(".")[0].replace(",", "")
            if seller_bond else 0)

        # 8-13. Three scores (commodity, seller attitude, logistics) and
        #       their comparison with the industry average; -1 when the
        #       page does not show exactly three score blocks.
        rating = tree.xpath(".//div[@class='item-scrib']")
        if len(rating) == 3:
            scores = [block.xpath(".//em[@class='count']/text()")[0]
                      for block in rating]
            compares = [_rating_compare(block) for block in rating]
        else:
            scores = [-1, -1, -1]
            compares = [-1, -1, -1]
        values.extend(scores)
        values.extend(compares)

        # 14-17. Comment counts for the last week, month, half year (nine
        #        numbers each: positive/moderate/negative overall, for the
        #        core category and for non-core categories) and for the time
        #        before the last half year (three numbers).
        for li_index, expected in ((1, 9), (2, 9), (3, 9), (4, 3)):
            values.extend(_period_counts(tree, li_index, expected))

        # 18-31. 30-day service figures; everything stays -1 when the
        #        service table is missing.
        table = tree.xpath(".//table[@class='tb-rate-table']/tbody/tr")
        service = [-1] * 8
        penalties = [-1] * 6
        if len(table) != 0:
            service = []
            # After-sales speed and its industry value.
            service.extend(_service_row(table[0]))
            # After-sales rate (percent -> fraction).
            service.extend(
                _service_row(table[1], scale=0.01, strip_commas=True))
            # Dispute rate (percent -> fraction).
            service.extend(_service_row(table[2], scale=0.01))
            # Number of penalties.
            service.extend(_service_row(table[3]))
            penalties = _penalty_breakdown(tree)
        # Column order: total_penalty, the eight service figures, then the
        # five per-reason penalty counters.
        values.append(penalties[0])
        values.extend(service)
        values.extend(penalties[1:])

        # 32-38 (x3). Star-rating breakdown for commodity, seller attitude
        #             and logistics; '-1' strings when not exactly three
        #             boxes are present.
        boxes = tree.xpath(".//div[@class='box-wrap']")
        if len(boxes) == 3:
            for box in boxes:
                values.extend(_star_breakdown(box))
        else:
            values.extend(['-1'] * 21)

        values.append(update_time)

        # NOTE(review): the statement is built by string interpolation from
        # scraped page content.  If Mymysql.ExecNonQuery supports bound
        # parameters, switch to them to avoid SQL injection/quoting issues.
        sql = """
insert  ignore into  seller_info \
(  \
shop_id,alipay_Authentication,main_products,Location,\
seller_credit,buyer_credit,seller_bond,commodity_score,\
seller_attitude_score,logistics_score,commodity_score_compare,seller_attitude_score_compare,\
logistics_score_compare,positive_comment_week,moderate_comment_week,negative_comment_week,\
core_positive_comment_week,core_moderate_comment_week,core_negative_comment_week,\
non_core_positive_comment_week,non_core_moderate_comment_week,non_core_negative_comment_week,\
positive_comment_month,moderate_comment_month,negative_comment_month,\
core_positive_comment_month,core_moderate_comment_month,core_negative_comment_month,\
non_core_positive_comment_month,non_core_moderate_comment_month,non_core_negative_comment_month,\
positive_comment_half_year,moderate_comment_half_year,negative_comment_half_year,\
core_positive_comment_half_year,core_moderate_comment_half_year,core_negative_comment_half_year,\
non_core_positive_comment_half_year,non_core_moderate_comment_half_year,non_core_negative_comment_half_year,\
positive_comment_before_half_year,moderate_comment_before_half_year,negative_comment_before_half_year,\
total_penalty,after_sales_speed_nearly_30,after_sales_speed_nearly_30_compare,\
after_sale_rate_nearly_30,after_sale_rate_nearly_30_compare,\
dispute_rate_nearly_30,dispute_rate_nearly_30_compare,\
penalty_number_nearly_30,penalty_number_nearly_30_compare,\
penalty_number_fake_good,penalty_number_false_transaction,penalty_number_breach_promise,penalty_number_bad_desc,penalty_number_malicious_harassment,\
average_score_for_commodity,count_of_judger_for_commodity,\
five_score_rate_for_commodity,four_score_rate_for_commodity,three_score_rate_for_commodity,two_score_rate_for_commodity,one_score_rate_for_commodity,\
average_score_for_seller,count_of_judger_for_seller,\
five_score_rate_for_seller,four_score_rate_for_seller,three_score_rate_for_seller,two_score_rate_for_seller,one_score_rate_for_seller,\
average_score_for_logistics,count_of_judger_for_logistics,\
five_score_rate_for_logistics,four_score_rate_for_logistics,three_score_rate_for_logistics,two_score_rate_for_logistics,one_score_rate_for_logistics,\
update_time \
)\
values\
(\
'%s','%s','%s','%s',\
'%s','%s','%s','%s',\
'%s','%s','%s','%s',\
'%s','%s','%s','%s',\
'%s','%s',%s,'%s',\
'%s','%s','%s',\
'%s','%s','%s',\
'%s','%s','%s',\
'%s','%s','%s',\
'%s','%s','%s',\
'%s','%s','%s',\
'%s','%s','%s',\
'%s','%s','%s',\
'%s','%s',\
'%s','%s',\
'%s','%s',\
'%s','%s',\
'%s','%s','%s','%s','%s',\
'%s','%s',\
'%s','%s','%s','%s','%s',\
'%s','%s',\
'%s','%s','%s','%s','%s',\
'%s','%s',\
'%s','%s','%s','%s','%s',\
'%s'\
)
""" % tuple(values)
        try:
            x.ExecNonQuery(sql)
        except Exception as ex:
            print(ex)
            print(sql)
            return False
        return "ok"
    except Exception as ex:
        print(ex)
        # Keep the page that could not be parsed so it can be re-processed.
        save_path = os.path.join("re_jiexi", "%s_rating" % (shop_id))
        with codecs.open(save_path, "w+", encoding="utf-8") as f:
            f.write(html)
        return False
    finally:
        # Release the connection on every path, including a failed insert.
        if x is not None:
            x.EndSql()
示例#14
0
               self.f_log.write(shop_product[0][0]+":"+self.name+":done\n")
           self.shop_time += 1
           if self.shop_time >30:
              self.f_log.write(self.name + ":重启浏览器\n")
              self.driver.quit()
              self.start_chrome()
              time.sleep(5)
              self.f_log.write(self.name + ":重启浏览器成功\n")
         except Exception as ex:
                print ("fucntion run",ex,self.name)


# Module-level crawler state: a lock and a queue (presumably shared by the
# crawler threads -- TODO confirm against the thread classes).
lock = threading.Lock()
shop_product_queue = queue.Queue()
# Fetch the list of product IDs to crawl.
x = Mymysql()
x._GetConnect()


# Target shops: every line of the file is stripped and split on "|".
f = codecs.open("../has_craw_shop.txt","r+",encoding="utf-8")
shop_infos = f.readlines()
f.close()
shop_infos = [item.strip().split("|") for item in shop_infos]


# Already-crawled record: create an empty file first when it does not exist,
# so that opening it in "r+" mode below cannot fail on a missing file.
if not os.path.exists("has_craw.txt"):
    f = open("has_craw.txt","w+")
    f.close()
f = codecs.open("has_craw.txt","r+",encoding="utf-8")
示例#15
0
def _run_product_insert(db, sql):
    """Execute one ``product_list`` insert; log and carry on if it fails."""
    try:
        db.ExecNonQuery(sql)
    except Exception as ex:
        print(ex)
        print(sql)
        print("=" * 20)


def good_jiexi(shop_ID, shop_id, content, page, update_time, thread_name):
    """Parse one shop product-list page and store its products.

    Two page layouts are handled: ``<dl>`` elements whose class contains
    ``item`` and ``<div class="item">`` elements.  For every entry with a
    detail link containing ``id=<digits>``, one row is inserted into
    ``product_list``.  A failing insert is printed and skipped.

    Args:
        shop_ID: Identifier used to name the dump file on failure.
        shop_id: Value stored in the ``shop_id`` column.
        content: Page markup (text).
        page: Page number, used to name the dump file on failure.
        update_time: Value stored in the ``update_time`` column.
        thread_name: Unused; kept for interface compatibility.

    Returns:
        ``"ok"`` on success.  On an unexpected error the page is written to
        ``re_jiexi/<shop_ID>_<page>.txt`` and ``False`` is returned.
    """
    # Bound before the try so the finally clause is safe even when creating
    # the connection itself fails.
    x = None
    try:
        x = Mymysql()
        x._GetConnect()
        tree = etree.HTML(content)

        # Layout 1: <dl class="... item ..."> entries.
        for item in tree.xpath(".//dl[contains(@class,'item')]"):
            detail_a = item.xpath(".//dd[@class='detail']/a")
            if not detail_a:
                continue
            url = detail_a[0].get("href")
            product_ids = re.findall(r"id=(\d+)", url)
            if not product_ids:
                continue

            # The raw text of the sale-num node is stored; -1 when missing.
            sale_num = item.xpath(".//*[@class='sale-num']/text()")
            sale_num = sale_num[0] if sale_num else -1

            sql = (
                "insert  ignore into product_list(shop_id,product_id,total_sales_volume,update_time)"
                "                values('%s','%s','%s','%s')"
            ) % (shop_id, product_ids[0], sale_num, update_time)
            _run_product_insert(x, sql)

        # Layout 2: <div class="item"> entries.
        for item in tree.xpath(".//div[@class='item']"):
            # Here only the first digit run of the sales text is stored;
            # -1 when the node or the digits are missing.
            sale_num = item.xpath(
                ".//*[@class='sale-num']//text()|.//*[@class='sales-amount']//text()"
            )
            if sale_num:
                digits = re.findall(r"\d+", sale_num[0])
                sale_num = digits[0] if digits else -1
            else:
                sale_num = -1

            detail_a = item.xpath(".//div[@class='desc']/a")
            if not detail_a:
                continue
            url = detail_a[0].get("href")
            product_ids = re.findall(r"id=(\d+)", url)
            if not product_ids:
                continue

            sql = (
                "insert  ignore into  product_list(shop_id,product_id,total_sales_volume,update_time)"
                "                        values('%s','%s','%s','%s')"
            ) % (shop_id, product_ids[0], sale_num, update_time)
            _run_product_insert(x, sql)

        return "ok"
    except Exception as ex:
        print(ex)
        # Keep the page that could not be parsed so it can be re-processed.
        save_path = os.path.join("re_jiexi", "%s_%s.txt" % (shop_ID, page))
        with codecs.open(save_path, "w+", encoding="utf-8") as f:
            f.write(content)
        return False
    finally:
        if x is not None:
            x.EndSql()
示例#16
0
def main_page_jiexi(product_id, content, update_time, item_dict, thread_name):
    """Parse a product main page and store its details.

    Inserts the product attributes, thumbnail images, sizes, colour images
    and colour names into their tables, and writes ``product_name`` and
    ``product_profile`` into ``item_dict``.

    Args:
        product_id: Product identifier stored with every row; also used to
            name the dump file on failure.
        content: Page markup (text).
        update_time: Value stored in every ``update_time`` column.
        item_dict: Dict updated in place with ``product_name`` and
            ``product_profile``.
        thread_name: Unused; kept for interface compatibility.

    Returns:
        ``"ok"`` on success.  ``False`` when an attribute or size insert
        fails, or when an unexpected error occurs (the page is then appended
        to ``jiexi/<product_id>_1.html``).  Failed image/colour inserts are
        printed and skipped.

    The database connection is released on every path.
    """
    x = None
    try:
        x = Mymysql()
        x._GetConnect()

        # Attributes: each list item is "name:value".
        tree = etree.HTML(content)
        attributes = tree.xpath(".//ul[@class='attributes-list']/li/text()")
        for text in attributes:
            parts = text.split(":")
            if len(parts) < 2:
                # Text node without a separator: nothing to store.
                continue
            detail_id = uuid.uuid1()
            index_name = parts[0].strip()
            index_value = parts[1].strip().replace("'", "")
            sql = """
                          insert into product_detail_information(detail_id,product_id,index_name,index_value,update_time)
                          values('%s','%s','%s','%s','%s')
                      """ % (detail_id, product_id, index_name, index_value,
                             update_time)
            try:
                x.ExecNonQuery(sql)
            except Exception as ex:
                print("attributes jiexi error", ex)
                print(sql)
                print("=============================")
                return False

        # Thumbnail images (top-left gallery).
        imgs_left = tree.xpath(".//ul[@id='J_UlThumb']/li//img")
        count = 1
        for item in imgs_left:
            src = item.get("src")
            if not src:
                src = item.get("data-src")
            src = "http:" + src
            # Request the 400x400 rendition instead of the 50x50 thumbnail.
            src = src.replace("50x50", "400x400")
            if not src.endswith("jpg"):
                continue
            from_who = position = 1
            sequence = count
            pic_md5 = get_pic_md5(src, product_id)
            sql = """
                              insert into image(product_id,img_src,position,sequence,update_time,from_who,pic_md5)
                                          values('%s','%s','%s','%s','%s','%s','%s')
                              """ % (product_id, src, position, sequence,
                                     update_time, from_who, pic_md5)
            try:
                x.ExecNonQuery(sql)
                # The sequence number only advances for stored images.
                count += 1
            except Exception as ex:
                print("left_img jiexi error", ex)
                print(sql)
                print("=============================")
                continue

        # Sizes.
        code = tree.xpath(
            ".//ul[@class='J_TSaleProp tb-clearfix']//span/text()")
        for item in code:
            Size_ID = uuid.uuid1()
            # Strip both quotes and backslashes; chaining matters, since a
            # second replace on the raw text would undo the first one.
            Size = item.replace("'", "").replace("\\", "")
            sql = """
                             insert into  size(product_id,size_id, size,update_time) values('%s','%s','%s','%s')
                  """ % (product_id, Size_ID, Size, update_time)
            try:
                x.ExecNonQuery(sql)
            except Exception as ex:
                print('size jiexi error', ex)
                print(sql)
                print("=============================")
                return False

        # Colours: an optional swatch image plus the colour name.
        color_links = tree.xpath(
            ".//ul[@class='J_TSaleProp tb-img tb-clearfix']//a")
        count = 1
        for item in color_links:
            src = item.get("style")
            img_id = ""
            if src:  # the swatch has a background image
                src = "http:" + re.findall(r"\((.*?)\)", src)[0]
                src = src.replace("30x30", "400x400")
                if not src.endswith("jpg"):
                    continue
                from_who = 1
                position = 3
                sequence = count
                pic_md5 = get_pic_md5(src, product_id)
                sql = """
                          insert into image(pic_md5,product_id,img_src,position,sequence,update_time,from_who)
                                      values('%s','%s','%s','%s','%s','%s','%s')
                          """ % (pic_md5, product_id, src, position, sequence,
                                 update_time, from_who)
                try:
                    x.ExecNonQuery(sql)
                    count += 1
                except Exception as ex:
                    print("color imagejiexi error", ex)
                    print(sql)
                    print("=============================")
                    continue

            color_id = uuid.uuid1()
            color_name = item.xpath(".//span/text()")[0]
            sql = """
                          insert into color(product_id,img_id,color_id,color,update_time)
                                      values('%s','%s','%s','%s','%s')
                          """ % (product_id, img_id, color_id, color_name,
                                 update_time)
            try:
                x.ExecNonQuery(sql)
            except Exception as ex:
                print("colortext jiexi error", ex)
                print(sql)
                print("=============")
                continue

        # Product name (quotes removed so it can be embedded in SQL later).
        product_name = tree.xpath(".//h3[@class='tb-main-title']")
        if not product_name:
            product_name = ""
        else:
            product_name = product_name[0].get("data-title")
        product_name = product_name.replace("'", "")
        item_dict['product_name'] = product_name

        # Short product description.
        product_profile = tree.xpath(".//*[@id='J_Title']/p/text()")
        product_profile = product_profile[0] if product_profile else ""
        product_profile = product_profile.replace("'", "")
        item_dict['product_profile'] = product_profile

        return "ok"

    except Exception as ex:
        print(product_id, "main_page_jiexi", "error:", ex)
        # Keep the page that could not be parsed so it can be re-processed.
        file_save = os.path.join("jiexi", product_id + "_" + "1" + ".html")
        f = codecs.open(file_save, "a+", encoding="utf-8")
        f.write(content)
        f.close()
        return False
    finally:
        # Release the connection, as the sibling parsers in this file do.
        if x is not None:
            x.EndSql()
示例#17
0
# Matches an integer or a decimal number such as "7", "12" or "3.45".
# The previous pattern ("\d+.?\d+") needed at least two digits, so single-digit
# values were silently treated as missing.
_REFUND_NUMBER = r"\d+(?:\.\d+)?"


def _refund_first_number(text):
    """Return the first number in ``text`` as a string, or None if absent."""
    found = re.findall(_REFUND_NUMBER, text)
    return found[0] if found else None


def _refund_raw_number(text):
    """Return the first number in ``text`` as a string, or -1 when missing."""
    number = _refund_first_number(text)
    return -1 if number is None else number


def _refund_percent(text):
    """Return the first number in ``text`` scaled by 0.01, or -1 when missing."""
    number = _refund_first_number(text)
    # Keep the -1 sentinel intact instead of scaling it to -0.01.
    return -1 if number is None else float(number) * 0.01


def _refund_labeled_number(label, text):
    """Return the number that follows ``label`` and a space, or -1."""
    found = re.findall(label + u" (" + _REFUND_NUMBER + u")", text)
    return found[0] if found else -1


def _refund_compare(text, negative):
    """Parse a "compared with the industry value" cell.

    Returns 0 when the cell contains the "on par" marker, -1 when it holds no
    number, and otherwise the number scaled by 0.01, negated when ``negative``
    is true.
    """
    if u"持平" in text:
        return 0
    number = _refund_first_number(text)
    if number is None:
        return -1
    # Scale both directions the same way; previously only the negative branch
    # was converted, so positive differences were stored as raw strings.
    ratio = float(number) * 0.01
    return -ratio if negative else ratio


def refund_jiexi(shop_id, content, update_time):
    """Parse a shop's refund/after-sales summary page and store the figures.

    The values are read from the ``J_KgRate_RefundSummary_TR`` table rows and
    the two ``data-kg-rate-gl-hover`` hover boxes of ``content``, then written
    to the ``seller_info`` row matching ``shop_id`` and ``update_time``.

    Args:
        shop_id: Shop identifier used in the ``where`` clause and in the name
            of the debug dump file.
        content: HTML text of the page to parse.
        update_time: Value matched against ``seller_info.update_time``.

    Returns:
        "ok" on success. On any exception the error is printed, ``content`` is
        dumped to ``jiexi/<shop_id>_refund_jiexi.txt`` and False is returned.
    """
    # Bind the handle before the try block so ``finally`` never sees an
    # unbound name when the constructor itself fails.
    x = None
    try:
        x = Mymysql()
        x._GetConnect()
        tree = etree.HTML(content)
        trs = tree.xpath(".//tr[@class='J_KgRate_RefundSummary_TR']")

        # Row 0: after-sales speed over the last 30 days, and its comparison
        # with the industry value (negative unless the cell says "faster").
        tds = trs[0].xpath(".//td/text()")
        after_sales_speed_nearly_30 = _refund_raw_number(tds[1])
        after_sales_speed_nearly_30_compare = _refund_compare(
            tds[2], negative=u"快" not in tds[2])

        # Hover box with the refund-only speed (first item) and the
        # return-and-refund speed (last item).
        speed_items = tree.xpath(
            ".//div[@data-kg-rate-gl-hover='refundfeedback.3.6']/ul/li/text()")
        if speed_items:
            refund_speed_nearly_30 = _refund_labeled_number(
                u"仅退款速度", speed_items[0])
            full_refund_speed_nearly_30 = _refund_labeled_number(
                u"退货退款速度", speed_items[-1])
        else:
            refund_speed_nearly_30 = -1
            full_refund_speed_nearly_30 = -1

        # Row 1: dispute rate over the last 30 days (negative when "lower").
        tds = trs[1].xpath(".//td/text()")
        dispute_rate_nearly_30 = _refund_percent(tds[1])
        dispute_rate_nearly_30_compare = _refund_compare(
            tds[2], negative=u"低" in tds[2])

        # Row 2: after-sales rate over the last 30 days.
        tds = trs[2].xpath(".//td/text()")
        after_sale_rate_nearly_30 = _refund_percent(tds[1])
        after_sale_rate_nearly_30_compare = _refund_compare(
            tds[2], negative=u"低" in tds[2])

        # Hover box with the after-sales counters; blank text nodes are
        # dropped so the remaining entries can be addressed by position.
        hover_texts = tree.xpath(
            ".//div[@data-kg-rate-gl-hover='refundfeedback.3.8']//text()")
        hover_texts = [text for text in hover_texts if text.strip()]
        # The count is the second number in the first entry.
        after_sales_count_nearly_30 = re.findall(r"\d+", hover_texts[0])[1]
        bad_goods_count_nearly_30 = re.findall(r"\d+", hover_texts[1])[0]
        buyer_dislike_count_nearly_30 = re.findall(r"\d+", hover_texts[2])[0]
        bad_seller_attitude_nearly_30 = re.findall(r"\d+", hover_texts[3])[0]

        # Row 3: after-sales attitude score over the last 180 days.
        tds = trs[3].xpath(".//td/text()")
        aftersale_attitude_score_nearly_180 = _refund_raw_number(tds[1])
        aftersale_attitude_score_nearly_180_compare = _refund_compare(
            tds[2], negative=u"低" in tds[2])

        # Row 4: after-sales speed score over the last 180 days.
        tds = trs[4].xpath(".//td/text()")
        after_sale_rate_nearly_180 = _refund_raw_number(tds[1])
        after_sale_rate_nearly_180_compare = _refund_compare(
            tds[2], negative=u"低" in tds[2])

        # NOTE(review): the statement is still assembled by string formatting
        # because the parameter-binding API of Mymysql.ExecNonQuery is not
        # visible here; shop_id and update_time must come from trusted callers.
        sql = """
              update seller_info set after_sales_speed_nearly_30='%s',
                                     after_sales_speed_nearly_30_compare='%s',
                                     refund_speed_nearly_30='%s',
                                     full_refund_speed_nearly_30='%s',
                                     dispute_rate_nearly_30='%s',
                                     dispute_rate_nearly_30_compare='%s',
                                     after_sale_rate_nearly_30='%s',
                                     after_sale_rate_nearly_30_compare='%s',
                                     after_sales_count_nearly_30='%s',
                                     bad_goods_count_nearly_30='%s',
                                     buyer_dislike_count_nearly_30='%s',
                                     bad_seller_attitude_nearly_30='%s',
                                     aftersale_attitude_score_nearly_180='%s',
                                     aftersale_attitude_score_nearly_180_compare='%s',
                                     after_sale_rate_nearly_180='%s',
                                     after_sale_rate_nearly_180_compare='%s'
              where shop_id = '%s' and update_time='%s'
              """ % (
            after_sales_speed_nearly_30, after_sales_speed_nearly_30_compare,
            refund_speed_nearly_30, full_refund_speed_nearly_30,
            dispute_rate_nearly_30, dispute_rate_nearly_30_compare,
            after_sale_rate_nearly_30, after_sale_rate_nearly_30_compare,
            after_sales_count_nearly_30, bad_goods_count_nearly_30,
            buyer_dislike_count_nearly_30, bad_seller_attitude_nearly_30,
            aftersale_attitude_score_nearly_180,
            aftersale_attitude_score_nearly_180_compare,
            after_sale_rate_nearly_180, after_sale_rate_nearly_180_compare,
            shop_id, update_time)
        x.ExecNonQuery(sql)
        # Report success the same way the sibling parsers in this file do.
        return "ok"
    except Exception as ex:
        print(shop_id, "refund_jiexi", "error:", ex)
        # Keep the offending page so the failure can be inspected later.
        log_path = os.path.join("jiexi", "%s_refund_jiexi.txt" % shop_id)
        with codecs.open(log_path, "w+", encoding="utf-8") as f:
            f.write(content)
        return False
    finally:
        if x is not None:
            x.EndSql()