Example No. 1
def prod_search_job():
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
          "prod_search_job 开始----")
    job = sched.get_job(job_id="prod_search_job")
    next = int(job.next_run_time.strftime('%Y%m%d%H%M%S'))
    clasName = tbHttp.TBProdSearchCrawer.__name__
    count = BaseHttpGet.getHttpGetPoolCount(clasName)
    # If the queue is nearly empty, a new batch should be pushed in
    # (the actual batch of queries is queued after the processing loop below)
    if count < 10:
        pass
    # Process with a pool of 40 worker threads
    tpool = MyThreadPool.MyThreadPool(40)
    for i in range(10000):
        now = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if next - now < 3:
            print(
                time.strftime("%d %H:%M:%S", time.localtime(time.time())), i,
                "prod_search_job 结束--------------------------------------------------------"
            )
            return
        tpool.callInThread(do_http, clasName)
    pass
    # If we finished early, queue a new batch of search queries
    qlist = tbDao.random_prod_name()
    city = chinaCity.getFristCity()
    for q in qlist:
        if (tbpool.ProdQuerykeyExist(q)):
            continue
        prod = tbHttp.TBProdSearchCrawer()
        prod.pageno = 1
        prod.q = q
        prod.city = city
        BaseHttpGet.pushHttpGet(prod)
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
          "prod_search_job 提前结束----")
Example No. 2
def query_pianyuan_font(mv=None):
    global myheaders,BASE_URL
    r = BaseHttpGet.getSessionPool().get(mv.pub_font_url, headers=myheaders, timeout=10)
    html = r.content.decode("gbk", 'replace')
    soup = BeautifulSoup(html, "lxml")
    links = soup.find_all("a")

    font_name=None
    down_url=None
    for link in links:
        href = link.get("href")
        if href is None or href.find("mod=attachment")==-1 :
            continue
        font_name = link.string
        if font_name is None or font_name.find("rar")==-1:
            continue
        down_url=href
    if down_url is None:
        print("字幕下载地址获取失败", down_url)
        return False
    pass
    # Download the subtitle archive
    dr = BaseHttpGet.getSessionPool().get(BASE_URL+down_url, headers=myheaders, stream=True)
    with open("D:/temp/" + font_name, "wb") as f:
        for chunk in dr.iter_content(chunk_size=512):
            if chunk:
                f.write(chunk)
    mv.save()

    return True
Example No. 3
def update_shop_create_time_job():
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
          "do_update_shop_create_time 开始----")
    job = sched.get_job(job_id="update_shop_create_time_job")
    next = int(job.next_run_time.strftime('%Y%m%d%H%M%S'))
    clasName = tbHttp.TBShopCreateTimeCrawer.__name__
    count = BaseHttpGet.getHttpGetPoolCount(clasName)

    # If the queue is empty, push a batch of shops into it
    if count == 0:
        shops = models.TTbShop.objects.filter(shop_createtime=None)[0:5000]
        for shop in shops:
            http = tbHttp.TBShopCreateTimeCrawer()
            http.shopid = shop.shopid
            http.isProxy = True
            BaseHttpGet.pushHttpGet(http)
        pass
    # Process with a pool of worker threads
    tpool = MyThreadPool.MyThreadPool(5)
    for i in range(10000):
        now = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if next - now < 10:
            print(
                time.strftime("%d %H:%M:%S", time.localtime(time.time())), i,
                "update_shop_create_time_job 结束--------------------------------------------------------"
            )
            return
        tpool.callInThread(do_http, clasName)
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
          "do_update_shop_create_time 提前结束----")
    pass
Example No. 4
def update_prod_item_job():
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
          "update_prod_item_job 开始----")
    job = sched.get_job(job_id="update_prod_item_job")
    next = int(job.next_run_time.strftime('%Y%m%d%H%M%S'))
    clasName = tbHttp.TBProdItemCrawer.__name__
    count = BaseHttpGet.getHttpGetPoolCount(clasName)
    # Process with a pool of worker threads
    tpool = MyThreadPool.MyThreadPool(10)
    for i in range(2000):
        now = int(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        if next - now < 7:
            print(
                time.strftime("%d %H:%M:%S", time.localtime(time.time())), i,
                "update_prod_item_job 结束--------------------------------------------------------"
            )
            return
        tpool.callInThread(do_http, clasName)
    # If the queue is empty, push a batch of products into it
    if count == 0:
        prods = models.TTbShopProd.objects.filter(shopid=None)[0:5000]
        for p in prods:
            http = tbHttp.TBProdItemCrawer()
            http.product_id = p.product_id
            http.uid = p.uid
            BaseHttpGet.pushHttpGet(http)
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
          "update_prod_item_job 提前结束----")
    pass
Example No. 5
def do_http(clasName=None):
    # Pop one pending request of the given crawler class and run it;
    # if it fails, push it back onto the queue so it can be retried later.
    http = BaseHttpGet.popHttpGet(clasName)
    if http is None:
        return
    if not http.run():
        BaseHttpGet.pushHttpGet(http)
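For context, a minimal sketch of how do_http is typically driven in these jobs: push a batch of crawler objects into the BaseHttpGet queue, then drain it with a MyThreadPool. The helper name drain_queue, the pool size, the round limit, and the MyThreadPool import path are illustrative assumptions; only the BaseHttpGet and MyThreadPool calls themselves come from the surrounding examples.

from mysite.libs import BaseHttpGet
from mysite.libs import MyThreadPool  # assumed module path

def drain_queue(clasName, workers=5, max_rounds=10000):
    # Hand do_http tasks to a small thread pool until the queue for
    # this crawler class is empty, then wait for the workers to finish.
    tpool = MyThreadPool.MyThreadPool(workers)
    for _ in range(max_rounds):
        if BaseHttpGet.getHttpGetPoolCount(clasName) == 0:
            break
        tpool.callInThread(do_http, clasName)
    tpool.wait()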
Example No. 6
def query_xp_torrent(mv=None):
    try:
        # If the download URL already points at a torrent, extract the torrent hash
        # e.g. http://www1.downsx.net/torrent/D27DF676675F54E82A2294FE71AAE720F45B6634
        if (mv.pub_down_url.find("torrent") != -1):
            mv.pub_down_url = "magnet:?xt=urn:btih:" + mv.pub_down_url[
                mv.pub_down_url.find("torrent") + 8:]
            print(mv.pub_down_url)
            return True
        # If it is a download page, fetch the torrent file
        if (mv.pub_down_url.find("updowm/file.php/") > 0):

            r = BaseHttpGet.getSessionPool().get(mv.pub_down_url,
                                                 headers=headers,
                                                 timeout=10)
            html = r.content.decode("utf-8", 'replace')
            soup = BeautifulSoup(html, "lxml")

            down_id = soup.find("input", id="id").get("value")
            down_name = soup.find("input", id="name").get("value")
            down_type = soup.find("input", id="type").get("value")
            d = {'id': down_id, 'name': down_name, 'type': down_type}

            # Download the resource
            headers2 = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
                'Referer': mv.pub_down_url
            }
            down_url = mv.pub_down_url[0:mv.pub_down_url.find("file.php"
                                                              )] + "down.php"
            print("开始下载种子", down_url)
            down_r = BaseHttpGet.getSessionPool().post(down_url,
                                                       data=d,
                                                       headers=headers2,
                                                       timeout=15)

            if (len(down_r.content) < 10):
                print("下载种子失败")
                return False

            with open('d:/moviedata/torrent/1024xp_' + mv.pub_id + ".torrent",
                      'wb') as fileObject:
                fileObject.write(down_r.content)
            # Convert the torrent into a magnet link
            mv.pub_down_url = BTBencode.BTByteToCode(down_r.content)
            print("种子已转换为磁性链接", mv.pub_down_url)
            return True

        pass
    except Exception as e:  # remote file does not exist or the download failed
        print("下载种子失败", e)
        return False

    return False
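BTBencode.BTByteToCode above is project-specific code that is not shown here. A rough, hedged equivalent of the torrent-to-magnet conversion, assuming the third-party bencodepy package, could look like this:

import hashlib
import bencodepy  # assumed third-party dependency

def torrent_bytes_to_magnet(data):
    # The BitTorrent info-hash is the SHA-1 of the bencoded "info" dictionary.
    decoded = bencodepy.decode(data)
    info_hash = hashlib.sha1(bencodepy.encode(decoded[b'info'])).hexdigest()
    return "magnet:?xt=urn:btih:" + info_hash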
Example No. 7
    def nextQuery(self):
        ncity = chinaCity.getNextCity(self.city)
        if ncity is not None:
            self.city = ncity
            self.id = None  # id must be reset to None, otherwise it will not be pushed onto the run queue
            self.pageno = 1
            BaseHttpGet.pushHttpGet(self)

        # When finished, put the query key into the cache; the same key will not be re-queried within 3 days
        tbpool.ShopQuerykeyExist(self.q)
Example No. 8
    def nextQuery(self):
        n_c = chinaCity.getNextCity(self.city)
        if n_c is not None:
            self.city = n_c
            self.id = None  # id must be reset to None, otherwise it will not be pushed onto the run queue
            self.pageno = 1
            BaseHttpGet.pushHttpGet(self)
        # When finished, put the query keyword into the cache
        tbpool.ProdQuerykeyExist(self.q)
        return
Example No. 9
def xp1024_search_job():
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),"xp1024_search_job 开始----")
    for i in range(1, 10):
        http=xp1024Http.xp1024_list_crawer()
        http.url=xp_base_url+"/pw/thread.php?fid=5&page=" + str(i)
        http.pub_type="亚洲无码"
        BaseHttpGet.pushHttpGet(http)
    for i in range(1, 10):
        http=xp1024Http.xp1024_list_crawer()
        http.url=xp_base_url+"/pw/thread.php?fid=22&page=" + str(i)
        http.pub_type="日本骑兵"
        BaseHttpGet.pushHttpGet(http)
    for i in range(1, 10):
        http=xp1024Http.xp1024_list_crawer()
        http.url=xp_base_url+"/pw/thread.php?fid=7&page=" + str(i)
        http.pub_type="歐美新片"
        BaseHttpGet.pushHttpGet(http)

    # Process the queued list pages with a small thread pool
    tpool = MyThreadPool.MyThreadPool(2)
    for i in range(10000):
        if BaseHttpGet.getHttpGetPoolCount(xp1024Http.xp1024_list_crawer.__name__)==0:
            break
        tpool.callInThread(do_http, xp1024Http.xp1024_list_crawer.__name__)
    pass

    for i in range(10000):
        if BaseHttpGet.getHttpGetPoolCount(xp1024Http.xp1024_info_crawer.__name__) == 0:
            break
        tpool.callInThread(do_http,  xp1024Http.xp1024_info_crawer.__name__)
    pass
    tpool.wait()
Example No. 10
def query_pianyuan_info(mv=None):
    global myheaders
    r = BaseHttpGet.getSessionPool().get(mv.pub_info_url, headers=myheaders, timeout=10)
    html = r.content.decode("gbk", 'replace')
    soup = BeautifulSoup(html, "lxml")
    hdiv = soup.find("div",class_="showhide")
    sidx = hdiv.text.find("magnet:")
    if sidx==-1:
        sidx = hdiv.text.find("http://gdl.lixian.vip.xunlei.com")
    if sidx==-1:
        print("下载地址无效",hdiv.text)
        return False

    mv.pub_down_url = hdiv.text[sidx:]
    if(len(mv.pub_down_url)>1000):
        print("下载地址无效2" ,len(mv.pub_down_url),mv.pub_down_url)
        return False
    td = hdiv.parent.parent
    links = td.find_all("a")

    for link in links:
        href = link.get("href")
        if href is not None and href.find("thread-")>0:
            mv.pub_font_url=href
            break
    if mv.pub_font_url is None:
        # No subtitle link found; just save the movie record
        mv.save()
        return False
    else:
        query_pianyuan_font(mv)
    pass
    return True
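A hedged aside: when the page layout is less predictable than the showhide div used above, magnet URIs can also be pulled straight out of the page text with a regular expression. This is only an illustrative alternative, not the project's approach:

import re

# Match hex (40-char) or base32 (32-char) BitTorrent info-hashes.
MAGNET_RE = re.compile(r"magnet:\?xt=urn:btih:[0-9A-Za-z]{32,40}")

def find_magnet_links(text):
    return MAGNET_RE.findall(text)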
Example No. 11
def init_shop_search():
    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
          "init_shop_search 开始----")
    cityl = chinaCity.listAllCity()
    cat = tbcategory.getFristQueryKey()
    count = 0
    for city in cityl:
        tshop = tbHttp.TBShopSearchCrawer()
        tshop.pageno = 1
        tshop.q = cat
        tshop.city = city
        tshop.id = "shop_search," + cat + city
        BaseHttpGet.pushHttpGet(tshop)
        count = count + 1

    print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
          "init_shop_search 结束----", count)
Example No. 12
    def parse(self, response):
        try:
            print(time.strftime("%d %H:%M:%S", time.localtime(time.time())),
                  "抓取网站(", self.url, ")开始-----")
            soup = BeautifulSoup(response.content.decode("utf-8", 'replace'),
                                 "lxml")
            trs = soup.find_all("tr", class_='tr3 t_one', align="center")
            for tr in trs:
                if (str(tr).find("置顶") > 0):
                    continue
                namelink = tr.find('h3').find('a')
                if namelink is None:
                    continue
                pub_id = namelink.get("id")
                mv = models.XP1024Movie.objects.filter(pub_id=pub_id).first()
                # Check whether the movie already exists; if it does, skip further processing
                if mv is None:
                    mv = models.XP1024Movie()
                    mv.pub_src = "1024xp"
                else:
                    continue
                mv.pub_type = self.pub_type
                mv.pub_day = tr.find('a', class_='f10').string.strip()
                mv.pub_name = namelink.string.strip()
                mv.pub_info_url = "/pw/" + namelink.get("href")
                mv.pub_id = pub_id
                catidx = mv.pub_name.find("] ")
                if catidx > 0:
                    mv.pub_name = mv.pub_name[catidx + 2:]

                # Extract the detail page
                info = xp1024_info_crawer()
                info.mv = mv
                # Set the id so duplicates are not queued twice
                info.id = pub_id
                #print("pub_id",pub_id)
                BaseHttpGet.pushHttpGet(info)

        except Exception as e:
            print("xp1024_list_crawer数据解析出错:", e)
            return False
        return True
Example No. 13
def getRemoteFileSize(url):
    """ 通过content-length头获取远程文件大小
        url - 目标文件URL
        """
    if url is None or len(url) < 5:
        return 0
    try:
        r = BaseHttpGet.getSessionPool().get(url, headers=headers, timeout=10)
        return len(r.content)
    except Exception:  # remote file does not exist or the request failed
        return 0
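If the server can be trusted to report Content-Length, a lighter-weight variant can issue a HEAD request instead of downloading the body. This is a sketch using plain requests, not the project's BaseHttpGet session pool:

import requests

def get_remote_file_size_via_head(url, headers=None, timeout=10):
    # Ask only for the response headers and read Content-Length from them.
    if url is None or len(url) < 5:
        return 0
    try:
        r = requests.head(url, headers=headers, timeout=timeout, allow_redirects=True)
        return int(r.headers.get("Content-Length", 0))
    except Exception:
        return 0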
Example No. 14
    def parse(self, response):
        try:
            rettext = response.text
            # A successful response always contains the string below
            if rettext.find("g_page_config =") == -1:
                if CRA_COUNT % 50 == 0:
                    print("数据抓取错误:", rettext, CRA_COUNT)
                return False

            g_pagestr = stringExt.StringExt(rettext).extractLine(
                "g_page_config", "pageName").ExtStr("g_page_config = ").str()
            if g_pagestr is None:
                return False
            g_pagestr = g_pagestr[:len(g_pagestr) - 1]
            # If there is no "shopItems", crawling for this query is finished
            if g_pagestr.find("shopItems") == -1:
                self.nextQuery()
                return True
            page = json.loads(g_pagestr)
            items = page["mods"]["shoplist"]["data"]["shopItems"]
            itemcount = 0
            sesscount = 0
            for item in items:
                itemcount = itemcount + 1
                shopurl = item["shopUrl"]
                shopid = self.paseInt(shopurl[shopurl.find("shop") +
                                              4:shopurl.find(".taobao")])
                # Skip if the shop id already exists in the cache
                if tbpool.ShopIdExist(shopid):
                    continue
                # Check the database: skip shops that already exist, otherwise create a new record so old data is never overwritten
                shop = models.TTbShop.objects.filter(shopid=shopid).first()
                if shop is None:
                    sesscount = sesscount + 1
                    shop = models.TTbShop()
                    shop.shopid = shopid
                else:
                    continue
                shop.mainpage = shopurl
                shop.uid = self.paseInt(item["uid"])
                shop.nick = item["nick"]
                shop.user_rate_url = item['userRateUrl']
                shop.title = item['title']
                # shop.shop_score = self.paseInt(item['totalsold'])
                shop.prod_count = self.paseInt(item['procnt'])
                shop.shop_area = item['provcity']
                if item["isTmall"] is True:
                    shop.shop_type = "TM"
                else:
                    shop.shop_type = "TB"
                shop.save()
            pass
            # If the whole page contained nothing new, skip ahead 10 pages
            if sesscount == 0:
                self.pageno = self.pageno + 10
            # Log one line roughly every 50 runs
            if CRA_COUNT % 50 == 0:
                print("数据抓取结束", self.city, self.q, self.pageno, sesscount)
            # After this page, queue the next page; past 100 pages, move on to the next query key
            if self.pageno < 100:
                self.pageno = self.pageno + 1
                self.id = None  # id must be reset to None, otherwise it will not be pushed onto the run queue
                BaseHttpGet.pushHttpGet(self)
            else:
                self.nextQuery()
        except Exception as e:
            print("TBShopSearchCrawer数据解析出错:", e)
            return False
        return True
Example No. 15
    def parse(self, response):
        try:
            rettext = response.text
            # A successful response always contains the string below
            if rettext.find("g_page_config =") == -1:
                if CRA_COUNT % 50 == 0:
                    print("数据抓取错误:", rettext, CRA_COUNT)
                return False
            st = stringExt.StringExt(rettext)
            g_pagestr = st.extractLine(
                "g_page_config", "pageName").ExtStr("g_page_config = ").str()

            if g_pagestr is None:
                return False
            g_pagestr = g_pagestr[:len(g_pagestr) - 1]
            # If there is no "auctions", crawling for this query is finished
            if g_pagestr.find("auctions") == -1:
                self.nextQuery()
                return True
            page = json.loads(g_pagestr)
            items = page["mods"]["itemlist"]["data"]["auctions"]
            itemcount = 0
            sesscount = 0
            for item in items:
                itemcount = itemcount + 1
                product_id = item["nid"]
                product_id = self.paseInt(product_id)
                if product_id is None:
                    continue
                view_sales = 0
                if "view_sales" in item:
                    view_sales = stringExt.StringExt(
                        item["view_sales"]).ExtStr("", "人").int()
                if view_sales == 0:
                    continue
                # Skip if the product id already exists in the cache
                if tbpool.prodIdExist(product_id):
                    continue
                # Check the database: skip products that already exist, otherwise create a new record so old data is never overwritten
                prod = models.TTbShopProd.objects.filter(
                    product_id=product_id).first()
                if prod is None:
                    prod = models.TTbShopProd()
                    prod.product_id = product_id
                else:
                    continue
                prod.prod_loc = item["item_loc"]
                prod.name = item["raw_title"]
                prod.uid = item["user_id"]
                prod.view_sales = view_sales
                prod.create_time = time.strftime("%Y%m%d",
                                                 time.localtime(time.time()))
                prod.update_time = time.strftime("%Y%m%d",
                                                 time.localtime(time.time()))
                prod.shop_price = self.paseInt(item["view_price"] * 100)
                prod.save()
                sesscount = sesscount + 1

            pass
            # If the whole page contained nothing new, jump ahead 100 pages
            if sesscount == 0:
                self.pageno = self.pageno + 100
            # Log one line roughly every 50 runs
            if CRA_COUNT % 50 == 0:
                print("数据抓取结束", self.city, self.q, self.pageno, sesscount)
            # After this page, queue the next page; past 100 pages, move on to the next query key
            if self.pageno < 100:
                self.pageno = self.pageno + 1
                self.id = None  # id must be reset to None, otherwise it will not be pushed onto the run queue
                BaseHttpGet.pushHttpGet(self)
            else:
                self.nextQuery()
        except Exception as e:
            print("TBProdSearchCrawer数据解析出错:", e)
            return False
        return True
Example No. 16
# -*- coding: utf-8 -*-

from mysite.libs import BaseHttpGet
import json

if __name__ == '__main__':
    test = BaseHttpGet.TestHttpGet()
    test.url = "https://www.baidu.com"
    test.name = "中文来了"
    test.run()
    print(test.url)
    jtxt = json.dumps(test, default=lambda o: o.__dict__, ensure_ascii=False)
    print(jtxt)
    test2 = json.loads(jtxt)
    test3 = BaseHttpGet.TestHttpGet()
    test3.__dict__ = test2
    print(test3.name)
    pass
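The same serialize-and-rehydrate pattern works for any plain Python object; here is a self-contained sketch without the project's BaseHttpGet module (the Item class is purely illustrative):

import json

class Item:
    def __init__(self):
        self.url = None
        self.name = None

item = Item()
item.url = "https://www.baidu.com"
item.name = "中文来了"

# Serialize the instance's attribute dict, then rebuild a new instance from it.
jtxt = json.dumps(item, default=lambda o: o.__dict__, ensure_ascii=False)
restored = Item()
restored.__dict__ = json.loads(jtxt)
print(restored.name)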