Exemplo n.º 1
0
def put(message=None):
    '''Enqueue a job task onto the content work queue.

    Returns a status dict: {"status": "1"} on success, {"status": "-1"}
    when no message was supplied.'''
    if not message:
        return {"status": "-1", "msg": u"error,参数错误"}
    rd.sadd(config.content_work_task, json.dumps(message))
    return {"status": "1", "msg": u"success"}
Exemplo n.º 2
0
def process():
    """Download poster images listed in the "imageTask" redis set.

    Pops tasks until the set is empty, then returns True.  Each task is a
    quoted CSV line whose 4th field is the poster path.  Failed downloads
    (request error or HTTP 404) are appended to E:/404.txt and re-queued
    onto "imageTaskFailed".
    """
    while True:
        p = rd.spop("imageTask")
        if p is None:  # BUG FIX: spop returns None on empty set; the
            return True  # original crashed on p.replace(...)
        IM = p.replace('"', "").replace('\n', "")
        data = IM.split(',')
        url = 'http://183.59.160.50:8082/EPG/jsp/images/universal/film/poster/' + data[3]
        path = u"E:/posters_5/" + "/".join(data[3].split('/')[:-1]) + "/"
        try:
            os.makedirs(path)
        except OSError:
            pass  # directory already exists
        local_filename = path + url.split('/')[-1]
        r = requests_get(url)
        # BUG FIX: test the falsy sentinel BEFORE touching .status_code --
        # requests_get returns False on failure, which has no status_code.
        if r is False or r.status_code == 404:
            with open("E:/404.txt", "a") as myfile:
                myfile.write(p)
            rd.sadd("imageTaskFailed", p)
            print("failed", p)
            continue
        print("r.status_code:", r.status_code)
        # context manager guarantees the file handle is closed even if
        # iter_content raises mid-download
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=512 * 1024):
                if chunk:
                    f.write(chunk)
        print("done", local_filename)
Exemplo n.º 3
0
def go_detail_list_task():
    """Consume detail-page URLs from the yk_video_detail_task redis set,
    parse each page, queue discovered stars, and persist video data to
    mongo.  Returns True once the task set is empty."""
    retry = 5  # NOTE(review): assigned but never used in this loop
    i = 0
    while True:
        q = rd.spop(config.yk_video_detail_task)
        if q is None:
            print(u"yk_video_detail_task sleeping 20 sec....")
            # time.sleep(task_wait)
            return True
        detail_url = json.loads(q)
        #if rd.sismember(config.yk_video_detail_failed,q)==True or rd.sismember(config.yk_video_detail_done,detail_url['url'])==True:
        # skip URLs already marked as done
        if rd.sismember(config.yk_video_detail_done,
                        detail_url['url']) == True:
            print("pass", detail_url['url'])
            continue
        # r = go_detail_list_page(detail_url)
        r = requests_get(detail_url['url'], headers=youku_home_headers)
        d = parse_detail_list_page(r, detail_url['url'])
        data = d['data']
        if data is False or data == None:
            # parse failed: park on the failed set for later retry
            rd.sadd(config.yk_video_detail_failed, q)
            continue
        for x in d['stars']:
            rd.sadd(config.yk_star_task, x)  # star crawl queue; redis set dedupes
        print('detail_url done:', detail_url['url'], data)
        done = rd.sadd(config.yk_video_detail_done,
                       detail_url['url'])  # finished
        #if done == 1:
        youku_videos.insert(data, check_keys=False)  # save tv data
        # refresh the crawl session every max_step iterations
        # time.sleep(2)
        i += 1
        if i % max_step == 0:
            update_session()
Exemplo n.º 4
0
def process():
    """Download star avatar images from the "stars" redis set and record
    each saved file path back onto the matching mongo star document.
    Returns True when the set is empty."""
    path = u"E:/avatar/"
    while True:
        p = rd.spop("stars")
        if not p:
        	return True
        task = json.loads(p)
        # fetch from whichever of avatar/img_url is present (exactly one);
        # tasks with both or neither are skipped
        if task.get("avatar") and not task.get("img_url"):
        	im = requests_get(task['avatar'])
        elif not task.get("avatar") and task.get("img_url"):
        	im = requests_get(task['img_url'])
        else:
        	print("----",p)
        	continue
        #print("r.status_code:",r.status_code)
        #if r.status_code == 404 or r == False:
        if not im:
            rd.sadd("avatar_failed",p)
            print("failed", p)
            continue
        #im = Image.open(r.raw)
        file_name = "/".join([task.get("_id"),"%s.jpg"%(task.get("_id"))])
        try:
        	os.makedirs(re.search('(.*/)',path+file_name).group(1))
        except Exception as e:
        	#print(str(e))
        	pass
        # NOTE(review): im is used as a PIL image here, so requests_get
        # presumably returns one for image URLs -- confirm against helper
        im.convert('RGB').save(path+file_name)
        result = mongo_conn.stars.update_one({"_id":ObjectId(task['_id'])},{"$set":{"file_path":file_name}})
        print("done-----%s-----%s"%(result.modified_count,path+file_name))
        # NOTE(review): this return exits after the first successful task,
        # so only one avatar is processed per call -- confirm intended
        return
Exemplo n.º 5
0
def readtask():
    """Queue every poster document that has no file_path yet onto the
    "posters" redis set (documents are json-encoded with a string _id)."""
    query = {"file_path": {"$exists": False}}
    cursor = mongo_conn.posters.find(query, no_cursor_timeout=True)
    for doc in cursor:
        doc['_id'] = str(doc['_id'])
        print(doc['_id'])
        rd.sadd("posters", json.dumps(doc))
Exemplo n.º 6
0
def merge_doubanvideo():
    """Seed the merge queue with every douban tv id, then start 20
    worker threads running task_merge_doubanvideo."""
    for doc in mongo_douban_tvs.find(no_cursor_timeout=True):
        print("put task", doc.get("_id"))
        rd.sadd("task_merge_doubanvideo", str(doc.get("_id")))

    workers = [threading.Thread(target=task_merge_doubanvideo, )
               for _ in range(20)]
    for w in workers:
        # w.setDaemon(True)
        w.start()
Exemplo n.º 7
0
def merge_youku_videos():
    """Seed the merge queue with every youku video id, then start 30
    worker threads running task_merge_youku_videos."""
    for doc in mongo_youku_videos.find():
        print("put task", doc.get("_id"))
        rd.sadd("task_merge_youku_videos", str(doc.get("_id")))

    workers = [threading.Thread(target=task_merge_youku_videos, )
               for _ in range(30)]
    for w in workers:
        # w.setDaemon(True)
        w.start()
Exemplo n.º 8
0
def merge_letvstar():
    """Seed the merge queue with every letv star id, then start 20
    worker threads running task_merge_letvstar."""
    for doc in mongo_letv_stars.find():
        print("put task", doc.get("_id"))
        rd.sadd("task_merge_letvstar", str(doc.get("_id")))

    workers = [threading.Thread(target=task_merge_letvstar, )
               for _ in range(20)]
    for w in workers:
        # w.setDaemon(True)
        w.start()
Exemplo n.º 9
0
def get_category():
    """Fetch the youku category list page and seed the category task sets.

    Retries up to 5 times on request/proxy errors, refreshing the session
    each time.  Returns True once categories were parsed and queued.
    """
    start = 1
    retry = 5
    print('get_category')
    while retry > 0:
        try:
            r = requests_get(url=category_url,
                             headers=youku_home_headers,
                             timeout=timeout,
                             session=session)
            page = etree.HTML(r)
            lis = page.xpath(
                u'//label[contains(text(),"分类:")]/following-sibling::ul/li')
            o = urlparse(category_url)
            host = o.scheme + '://' + o.netloc
            categories = []
            # index 0 is skipped: the first <li> is not a category link
            for x in xrange(1, len(lis)):
                categories.append({
                    "name": lis[x].find('a').text,
                    'url': host + lis[x].find('a').get('href')
                })
                # categories[lis[x].find('a').text] = host + lis[x].find('a').get('href')
            print("categories:", json.dumps(categories))
            # categories = {lis[x].find('a').text : host + lis[x].find('a').get('href') for x in xrange(1,len(lis)) if lis[x].find('a')!=None}  #
            if len(categories) == 0:
                # empty parse likely means a bad proxy: rotate and retry
                update_session(proxy)
                continue
            for x in categories:
                # queue only categories neither done nor failed yet
                if rd.sismember(config.yk_category_task_done,
                                x['url']) == False and rd.sismember(
                                    config.yk_category_task_failed,
                                    x['url']) == False:
                    task_sadd = rd.sadd(config.yk_category_task,
                                        json.dumps(x))  # seed task
                re_sadd = rd.sadd(config.yk_category_url, json.dumps(x))  # seed url
                if re_sadd != 0:  # first time seen: persist to mongo
                    youku_category.insert(x,
                                          check_keys=False)  # save categories
            return True
        except requests.exceptions.ProxyError as e:
            print("ttt", str(e))
            update_session(proxy)
            # retry = 5
        # except requests.exceptions.InvalidProxyURL as e:
        except requests.exceptions.RequestException as e:
            print("xxx", str(e))
            update_session(proxy)
            # retry = 5
        retry -= 1
    # NOTE(review): start is a local that only ever reaches 2 here, so
    # this periodic session refresh (original comment said "every 50
    # steps", modulus is 20) can never fire -- confirm intent
    start += 1
    if start % 20 == 0:
        update_session()
Exemplo n.º 10
0
 def failed_job(self):
     """Re-run tasks parked on the content failed queue; tasks that fail
     again are pushed back.  Returns True when the queue is empty."""
     print("go failed_job")
     while True:
         '''监听task'''
         p = rd.spop(config.content_work_task_failed)
         if p == None:
             return True
         task = json.loads(p)
         # skip malformed tasks missing contentName
         if task.get("contentName") == None:
             continue
         r = self.process(task)
         if not r:
             # still failing: push back for another attempt
             rd.sadd(config.content_work_task_failed, p)
Exemplo n.º 11
0
def producer():
    """Walk GDCmscontent rows in id windows of 500 and enqueue each row's
    attribute dict (pickled) onto the gd task redis set.  Returns True.

    Selects rows with series_flag in (100, 110) and data_flag unset.
    """
    db_session = scoped_session(DBSession)
    print(db_session)
    count = db_session.query(GDCmscontent).count()
    print(count)
    size = 500
    # NOTE(review): count/size relies on Python 2 integer division
    for x in xrange(1,count/size+2):
        contents = db_session.query(GDCmscontent).filter((GDCmscontent.series_flag.in_((100,110))) & (GDCmscontent.data_flag==None)&(GDCmscontent.id>=size*(x-1))&(GDCmscontent.id<=size*x)).all()
        for item in contents:
            rd.sadd(config.gd_task,pickle.dumps(item.__dict__))
            print(item.id)
    db_session.close()
    return True
Exemplo n.º 12
0
 def job(self):
     '''Background job: consume the gd backup task queue forever,
     sleeping 6s whenever it is empty.'''
     while True:
         '''监听task'''
         p = rd.spop(config.gd_task_bkbk.encode('latin1'))
         if p==None:
             print("sleep 6s...")
             time.sleep(6)
             continue
         # SECURITY NOTE: pickle.loads on queue data is unsafe if the
         # redis set can be written by untrusted parties
         task = pickle.loads(p)
         if task.get("name") == None:
             continue
         r = self.process(task)
         if not r:
             # failed: push back for another attempt
             rd.sadd(config.gd_task_bkbk,p)
             pass
Exemplo n.º 13
0
def task_video():
    """Consume douban tv ids from the task set, fetch and parse each tv
    page, store the result in mongo, and queue a photos task for it.

    Returns True when the task set is empty.
    """
    retry = 5
    i = 0
    while True:
        id = rd.spop(config.douban_tv_task)
        # id = rd.spop(config.douban_tv_failed)
        if id is None:
            print(u"task_page sleeping....20sec")
            return True
        if rd.sismember(config.doubantv_ajax_task_done, id) == True:
            print(u"already done%s" % id)
            continue
        url = tv_url.format(id=id)
        r = requests_get(url=url, headers=douban_home_headers)
        if r == False or r == None:
            rd.sadd(config.douban_tv_failed, id)
            continue
        try:
            cb = check_block(r)
        except Exception as e:
            print("check_block:", str(e))
        # this phrase in the page body means douban has blocked our IP
        if u'检测到有异常请求从你的 IP 发出' in r:
            print("------spider ben block... break......")
            delay(block_wait)
            continue
        data = parse_video(r)
        # piwik() presumably reports an analytics hit for this page view
        # -- confirm against the helper's definition
        piw = piwik(page_title=page_title(r),
                    session_time=session_time,
                    origin_url=url,
                    urlref='')
        print("piw", piw)
        if data.get("title") == None:
            # no title parsed: treat as a soft block and back off
            rd.sadd(config.douban_tv_failed, id)
            time.sleep(task_wait)
            # update_session()
            print("------spider ben block...")
            continue
        data['doubanid'] = id
        print(json.dumps(data))
        mongo_r = mongo_douban_tvs.insert(data, check_keys=False)  #
        photostask = json.dumps({"id": id, "mongoTVID": str(mongo_r)})
        # queue a photos task unless it is already done or failed
        if rd.sismember(config.douban_star_done,
                        photostask) == False and rd.sismember(
                            config.douban_photos_failed, photostask) == False:
            rd.sadd(config.douban_photos_task, photostask)
        print(photostask)
        # return True
        rd.sadd(config.douban_tv_done, id)
        # tv_after(id=id, url=url)
        print("done.. sleep %s seconds." % task_wait)
        delay()
        i += 1
        # rotate the douban bid cookie every max_step requests
        if i % max_step == 0:
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
Exemplo n.º 14
0
 def job(self):
     '''Background job: consume the content work queue forever; when the
     queue is empty, drain the failed queue and then wait 6s.'''
     print("go job")
     while True:
         '''监听task'''
         p = rd.spop(config.content_work_task)
         if p == None:
             # main queue empty: retry failures, then sleep
             self.failed_job()
             print("sleep 6s...")
             time.sleep(6)
             continue
         task = json.loads(p)
         # skip malformed tasks missing contentName
         if task.get("contentName") is None:
             continue
         r = self.process(task)
         if not r:
             # failed: park the task on the failed queue for retry
             rd.sadd(config.content_work_task_failed, p)
Exemplo n.º 15
0
 def failed_job(self):
     '''Retry tasks from the gd failed queue once each; successes move to
     the backup set, repeat failures go back on the failed queue.
     Returns True when the failed queue is empty.'''
     while True:
         '''监听task'''
         # p = rd.spop(config.gd_task_bkbk.encode('latin1'))
         p = rd.spop(config.gd_task_failed)
         if p == None:
             return True
         # SECURITY NOTE: pickle.loads on queue data is unsafe if the
         # redis set can be written by untrusted parties
         task = pickle.loads(p)
         if task.get("name") == None:
             continue
         r = self.process(task)
         if not r:
             # still failing: push back onto the failed queue
             rd.sadd(config.gd_task_failed, p)
             pass
         else:
             # success: record into the backup set
             rd.sadd(config.gd_task_bkbk, p)
             pass
Exemplo n.º 16
0
def get_detailurl_task():
    """
    Consume yk_get_detailurl_task entries and resolve each V_show play
    page to its detail_list page url, which is then queued for detail
    parsing.  Returns True when the task set is empty.
    """
    retry = 5
    i = 0
    while True:
        q = rd.spop(config.yk_get_detailurl_task)
        if q is None:
            print(u"yk_get_detailurl_task sleeping 20 sec")
            # time.sleep(task_wait)
            return True
        to_detail_url = json.loads(q)
        headers = youku_home_headers
        headers['Referer'] = to_detail_url['Referer']
        # if rd.sismember(config.yk_get_detailurl_done,q)==True or rd.sismember(config.yk_get_detailurl_field,q)==True:
        if rd.sismember(config.yk_get_detailurl_done,q)==True:
            print("pass")
            continue
        r = requests_get(to_detail_url['url'], headers=headers)
        # headers = youku_home_headers
        # headers['Referer'] = to_detail_url['url']
        # try:
        #     session.get('http://cmstool.youku.com/cms/player/userinfo/user_info?specialTest=test&client=pc&callback=tuijsonp1',headers=headers)
        # except Exception as e:
        #     pass
        print("to_detail_url",to_detail_url['url'])
        detail_url = parse_tv_show(r, to_detail_url['url'])
        print("detail_url:",detail_url)
        if detail_url == False or detail_url==None:
            # parse failed; NOTE: "field" in the set name reads like a
            # typo for "failed"
            rd.sadd(config.yk_get_detailurl_field, q)
            continue
        # if rd.sismember(config.yk_video_detail_done,json.dumps({"url": detail_url, 'Referer': to_detail_url['url']}))==False:
        if rd.sismember(config.yk_video_detail_done,detail_url)==False:
            red = rd.sadd(config.yk_video_detail_task, json.dumps({"url": detail_url, 'Referer': to_detail_url['url']}))
            if red==1:
                print("yes")
        rd.sadd(config.yk_get_detailurl_done,q)
        # rd.sadd(config.yk_video_detail_task_, json.dumps({"url": detail_url, 'Referer': to_detail_url['url']}))
        # time.sleep(2)
        i += 1
        # refresh the crawl session every max_step iterations
        if i % max_step == 0:
            update_session()
Exemplo n.º 17
0
def process():
    """Download vertical poster images from the image_v redis set, save
    them under E:/posters/, and register each file in mongo posters.
    Loops forever, sleeping 6s whenever the set is empty."""
    path = u"E:/posters/"
    while True:
        p = rd.spop(config.image_v)
        if not p:
            print("done! sleep 6s")
            time.sleep(6)
            continue
        task = json.loads(p)
        # im = requests_get(u'http://meeting.itvfocus.com/'+task['image_v'])
        im = requests_get(
            u'http://183.59.160.50:8082/EPG/jsp/images/universal/film/poster/'
            + task['image_v'])
        if not im:
            rd.sadd("image_v_failed", p)
            print("failed", p)
            continue
        #im = Image.open(r.raw)
        # NOTE(review): im is used as a PIL image (.width/.convert), so
        # requests_get presumably returns one for image URLs -- confirm
        # skip images too narrow to be usable posters
        if im.width < 180:
            continue
        file_name = "/".join([
            task.get("content_id"),
            "%s_%sx%s.jpg" % (task.get("content_id"), im.width, im.height)
        ])
        try:
            os.makedirs(re.search('(.*/)', path + file_name).group(1))
        except Exception as e:
            #print(str(e))
            pass
        im.convert('RGB').save(path + file_name)
        # dedupe: skip if this file is already registered for the content
        ise = mongo_conn.posters.find({
            "file_path": file_name,
            "content_id": task['content_id']
        })
        if ise.count() != 0:
            continue
        task['file_path'] = file_name
        task['url'] = task['image_v']
        # drop any stale mongo _id so insert creates a fresh document
        if task.get("_id"):
            del task['_id']
        _id = mongo_conn.posters.insert(task, check_keys=False)
        print(task['content_id'], _id, file_name)
Exemplo n.º 18
0
def task_types_fetch():
    """Expand each type url into per-page listing urls and queue them.

    For every type url popped from yk_types_task, fetch the first page,
    read the total page count, and queue one listing-page url per page
    (skipping pages already done or failed).  Returns True when the type
    task set is empty.
    """
    retry = 5
    i = 0
    while True:
        type_url = rd.spop(config.yk_types_task)
        if type_url is None:
            print(u"yk_types_task sleeping 20sec....")
            return True
        if rd.sismember(config.yk_types_failed,
                        type_url) == True or rd.sismember(
                            config.yk_types_done, type_url) == True:
            continue
        r = requests_get(url=type_url,
                         headers=youku_home_headers,
                         session=session)
        if r is False or r == None:
            print(u'filed task:%s' % type_url)
            rd.sadd(config.yk_types_failed, type_url)
            continue
        pages = parse_category_show(r, type_url)
        print("task_types_fetch data:", pages)
        # build one url per listing page by rewriting the .html suffix
        for page in xrange(1, int(pages['pages'])):
            page_url = re.sub('(\.html.*)',
                              '_s_1_d_1_p_{page}.html'.format(page=page),
                              type_url)
            print("task_types_fetch for :", page_url)
            if rd.sismember(config.yk_page_failed,
                            page_url) == False and rd.sismember(
                                config.yk_page_done, page_url) == False:
                rd.sadd(config.yk_page_task, page_url)
        rd.sadd(config.yk_types_done, type_url)
        # refresh the crawl session every max_step iterations
        i += 1
        if i % max_step == 0:
            update_session()
Exemplo n.º 19
0
def task_star():
    """Consume douban star ids from the failed set, fetch and parse each
    star page, and store the result in mongo douban stars.

    Exits the loop (and returns) when the set is empty.  Soft blocks and
    parse failures re-queue the id and back off.
    """
    retry = 5
    i = 0
    while True:
        # task = rd.spop(config.douban_star_task)
        task = rd.spop(config.douban_star_failed)
        if task is None:
            print(u"task_page sleeping....20sec")
            break  # FIX: removed the unreachable `continue` that followed
        # if rd.sismember(config.douban_star_failed, task) == True or rd.sismember(config.douban_star_done, task) == True:
        if rd.sismember(config.douban_star_done, task) == True:
            print(u"already done%s" % task)
            continue
        url = star_url.format(id=task)
        print(url)
        r = requests_get(url=url)
        # this phrase in the page body means douban has blocked our IP
        if u'检测到有异常请求从你的 IP 发出' in r:
            print("------spider ben block... break......")
            delay(block_wait)
            continue
        data = parse_star(r)
        if data == False or data == None or data.get("name") == None:
            # parse failed: re-queue, rotate session, and back off
            rd.sadd(config.douban_star_failed, task)
            update_session()
            time.sleep(20)
            print("------spider ben sleep 20 sec...")
            continue
        data['doubanid'] = task
        print(json.dumps(data))
        result = mongo_douban_stars.insert(data, check_keys=False)
        rd.sadd(config.douban_star_done, task)
        delay()
        print("done.%s. sleep 3 seconds." % result)
        i += 1
        # rotate the douban bid cookie every max_step requests
        if i % max_step == 0:
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
Exemplo n.º 20
0
def producer_image_v():
    """get image_v from gd cms_content

    Walk GDCmscontent rows (image_v set) in id windows of 500; for every
    row whose code matches a mongo content document, queue
    {code, image_v, content_id} onto the image_v redis set.
    Returns True when finished.
    """
    db_session = scoped_session(DBSession)
    print(db_session)
    # count = db_session.query(GDCmscontent).count()
    maxid = db_session.query(func.max(GDCmscontent.id)).all()
    print(maxid[0][0])
    size = 500
    # BUG FIX: the original literal was missing commas between several
    # entries, so adjacent strings were silently concatenated
    # (u'连续剧' u'青春' u'综艺' u'纪录片' fused into one category).
    # NOTE(review): cat is currently unused below -- confirm intent.
    cat = [
        u"电影", u"少儿", u"动漫", u'动画', u'动画片', u'剧集', u'连续剧',
        u'青春', u'综艺', u'纪录片', u'儿童', u'高清', u'电视剧'
    ]
    for x in xrange(1, maxid[0][0] / size + 2):
        contents = db_session.query(GDCmscontent).filter(
            (GDCmscontent.image_v != None) & (GDCmscontent.id >= size *
                                              (x - 1))
            & (GDCmscontent.id <= size * x)).all()
        print(size * x)
        for item in contents:
            data = {}
            data['code'] = item.__dict__["code"]
            data['image_v'] = item.__dict__["image_v"]
            c = mongo_conn.contents.find({
                'relationship': {
                    '$elemMatch': {
                        'mediaId': item.__dict__["code"],
                        "platform": "gd"
                    }
                }
            })
            if c.count() == 0:
                continue
            # FIX: renamed loop variable from `x` to avoid shadowing the
            # outer window index
            for doc in c:
                data['content_id'] = str(doc['_id'])
                print(data)
                rd.sadd(config.image_v, json.dumps(data))
    db_session.close()
    return True
Exemplo n.º 21
0
 def job(self):
     '''Background job: consume the gd task queue forever; when empty,
     drain the failed queue and then wait 60s.'''
     while True:
         '''监听task'''
         # p = rd.spop(config.gd_task_bk.encode('latin1'))
         # p = rd.spop(config.gd_task_bkbk.encode('latin1'))
         p = rd.spop(config.gd_task)
         if p == None:
             # main queue empty: retry failures, then sleep
             self.failed_job()
             print("sleep 60s...")
             time.sleep(60)
             continue
         # SECURITY NOTE: pickle.loads on queue data is unsafe if the
         # redis set can be written by untrusted parties
         task = pickle.loads(p)
         if task.get("name") == None:
             continue
         r = self.process(task)
         print("process", r)
         if not r:
             # failed: park on the failed queue for retry
             rd.sadd(config.gd_task_failed, p)
             pass
         else:
             # success: keep a copy in the backup set
             rd.sadd(config.gd_task_bkbk, p)
             pass
Exemplo n.º 22
0
def task_star():
    """
    Consume letv star tasks ({id: name} json) from le_star_task, search
    the letv "so" endpoint for the star name, parse the result, and store
    it in mongo letv stars.  Loops forever, sleeping while the set is
    empty.
    """
    retry = 5
    i = 0
    while True:
        task = rd.spop(config.le_star_task)
        # task = u'{"7088": "石田卓也"}'
        if task is None:
            print(u"task_page sleeping....20sec")
            time.sleep(task_wait)
            continue
        print(task)
        is_done = rd.sismember(config.le_star_done, task)
        if is_done == True:
            print("already done.")
            continue
        task_json = json.loads(task)
        # NOTE(review): keys()[0] only works on Python 2 (py3 returns a
        # view); grabs the single name value out of the {id: name} dict
        url = so_url.format(wd=task_json[task_json.keys()[0]])
        r = requests_get(url=url, headers=leso_headers)
        if r is False or r == None:  # request failed
            print(u'filed task:%s' % url)
            rd.sadd(config.le_star_failed, task)
            continue
        data = parse_sostar(r, task_json)
        if data == False or data == None:
            rd.sadd(config.le_star_failed, task)
            continue
        mongo_id = mongo_letv_stars.insert(data, check_keys=False)  #
        if mongo_id:
            rd.sadd(config.le_star_done, task)
        else:
            print(mongo_id)
            rd.sadd(config.le_star_failed, task)
        print('done.')
        # refresh the crawl session every max_step iterations
        i += 1
        if i % max_step == 0:
            update_session()
Exemplo n.º 23
0
def task_category():
    """
    Parse the type filters under each category page and queue a fetch
    task for every type url, deduplicating against the done/failed sets.
    Returns True when the category task set is empty.
    """
    retry = 5
    i = 0
    while True:
        category = rd.spop(config.yk_category_task)
        if category is None:
            print(u"task_category sleeping....20sec")
            # time.sleep(task_wait)
            return True
        category = json.loads(category)
        print(category)
        r = requests_get(url=category['url'], headers=youku_home_headers,session=session)
        if r is False or r == None:  # failed to fetch the category page
            print(u'filed task:%s' % category['url'])
            rd.sadd(config.yk_category_task_failed, category['url'])
            continue
        data = parse_category_show(r, category['url'])
        print("category and types:", json.dumps(data))
        if len(data['types']) == 0:  # category has no sub-types
            re_sadd = rd.sadd(config.yk_types_task,category['url'])  # types url
        else:
            for ty in data['types']:
                # queue only types neither done nor failed yet
                if rd.sismember(config.yk_types_done,data['types'][ty]) == False and rd.sismember(config.yk_types_failed,data['types'][ty]) == False:
                    rd.sadd(config.yk_types_task,data['types'][ty])  # types fetch task
                re_sadd = rd.sadd(config.yk_types_done,data['types'][ty])  # dedupe via redis set
                if re_sadd == 0:  # already saved: skip db insert
                    continue
                youku_video_types.insert(
                    {"name": ty, "url": data['types'][ty], "category": category['name']}, check_keys=False)  # save tv types
        rd.sadd(config.yk_category_task_done, category['url'])
        # refresh the crawl session every max_step iterations
        i += 1
        if i % max_step == 0:
            update_session()
Exemplo n.º 24
0
def task_page_fetch():
    """
    Parse all tv urls out of one listing page per iteration, routing
    direct-play (V_show) links and detail-list links into their own task
    sets; page urls are deduplicated against the done/failed sets.
    Returns True when the page task set is empty.
    """
    retry = 5
    i = 0
    while True:
        page_url = rd.spop(config.yk_page_task)
        # page_url = rd.spop(config.yk_page_failed) #retry
        if page_url is None:
            print(u"task_page_fetch sleeping 20sec....")
            # time.sleep(task_wait)
            return True
        print("page_url", page_url)
        if rd.sismember(config.yk_page_failed,
                        page_url) == True or rd.sismember(
                            config.yk_page_done, page_url) == True:
            continue
        r = requests_get(url=page_url,
                         headers=youku_home_headers,
                         session=session)
        if r is False or r == None:  # failed to fetch the listing page
            print(u'filed task:%s' % page_url)
            rd.sadd(config.yk_page_failed, page_url)
            continue
        print("done task_page_fetch:", page_url)
        data = parse_page_fetch(r, page_url)
        for x in data['yk_get_detailurl_task']:
            rd.sadd(config.yk_get_detailurl_task,
                    json.dumps(x))  # V_show links pointing straight at a play page
        for x in data['yk_video_detail_task']:
            r_add = rd.sadd(config.yk_video_detail_task,
                            json.dumps(x))  # detail_list_task
        rd.sadd(config.yk_page_done, page_url)
        # refresh the crawl session every max_step iterations
        i += 1
        if i % max_step == 0:
            update_session()
Exemplo n.º 25
0
def task_photos():
    """
    Consume douban photo tasks ({id, mongoTVID} json) from the failed
    set, scrape the photo list for the tv, and overwrite the tv
    document's poster array in mongo.  Returns True when the set is
    empty.
    """
    retry = 5
    i = 0
    photos_url = u'https://movie.douban.com/subject/{id}/photos?type=R'
    while True:
        # thread lock: must be taken right here if threading is enabled
        #with threading.Lock():
        # task = rd.spop(config.douban_photos_task)
        task = rd.spop(config.douban_photos_failed)
        if task is None:
            print(u"task_page sleeping....20sec")
            return True
        # if rd.sismember(config.douban_photos_failed, task) == True or rd.sismember(config.douban_photos_done, task) == True:
        if rd.sismember(config.douban_photos_done, task) == True:
            print(u"already done%s" % task)
            continue
        T = json.loads(task)
        # T = {}
        # task = ""
        # T['id'] = "25827963"
        url = photos_url.format(id=T['id'])
        print(url)
        # data = []
        data = get_photos(url=url, id=T['id'])
        # for x in get_photos(url=url, id=T['id']):
        #     #if x == False or len(x) == 0 or x == None:
        #     if x == False or x == None:
        #         # rd.sadd(config.douban_photos_failed, task)
        #         rd.sadd(config.douban_photos_task, task)
        #         print("------spider ben sleep 20 sec...")
        #         update_session()
        #         break
        #     print(json.dumps(x))
        #     print(len(x))
        #     data += x
        print("++++++++++++++++%s+++++++++++++%s++++++++++++" %
              (task, len(data)))
        if len(data) == 0:
            #rd.sadd(config.douban_photos_failed, task)
            #rd.sadd(config.douban_photos_task, task)
            continue
        print(json.dumps(data))
        # return
        '''这是后面的骚操作.....'''
        # replace the poster array wholesale: unset it, then set the
        # freshly scraped list
        mongo_douban_tvs.update({'_id': ObjectId(T['mongoTVID'])},
                                {'$unset': {
                                    'poster': 1
                                }},
                                multi=True)
        result = mongo_douban_tvs.update_one({'_id': ObjectId(T['mongoTVID'])},
                                             {'$set': {
                                                 'poster': data
                                             }})
        if result.modified_count == 0:
            rd.sadd(config.douban_photos_failed, task)
            #rd.sadd(config.douban_photos_task, task)
        rd.sadd(config.douban_photos_done, task)
        delay()
        print("done.%s. sleep 3 seconds." % result.modified_count)
        i += 1
        # rotate the douban bid cookie every max_step requests
        if i % max_step == 0:
            bid = random_str(10)
            session.cookies.set('bid', bid, domain='.douban.com', path='/')
Exemplo n.º 26
0
def _parse_celebrity_links(anchors):
    """Turn a credit row's celebrity <a> tags into (joined_names, entry_list).

    Returns the comma-joined display names and a list of
    {"name", "doubanid"} dicts.  Side effect: every celebrity id that is
    neither in the done set nor in the failed set is pushed onto the
    star task queue for later crawling.
    """
    names = ''
    entries = []
    for a in anchors:
        names = names + a.text + ","
        match = re.search(u'/celebrity/(\d*)/', a.get("href"))
        if match:
            doubanid = match.group(1)
            if rd.sismember(config.douban_star_done,
                            doubanid) == False and rd.sismember(
                                config.douban_star_failed, doubanid) == False:
                rd.sadd(config.douban_star_task, doubanid)
        else:
            # No /celebrity/<id>/ pattern in the href — keep the raw link.
            doubanid = a.get("href")
        entries.append({"name": a.text, "doubanid": doubanid})
    return names.strip(','), entries


def parse_video(r):
    """Parse a Douban movie/tv detail page into a flat dict.

    *r* is the raw html text of the detail page.  Fields that cannot be
    found are simply absent from the returned dict; 'tags' is always
    present (possibly as an empty string).
    """
    data = {}
    page = etree.HTML(r)
    year = re.search(u'<span class="year">\((\d{4})\)</span>', r)
    if year:
        data['year'] = year.group(1)
    title = re.search(u'<span property="v\:itemreviewed">(.*)</span>', r)
    if title:
        data['title'] = title.group(1)

    # The three credit rows ("编剧" screenwriters / "导演" directors /
    # "主演" cast) share one structure: a label <span> whose next sibling
    # element holds the celebrity <a> links.
    bianju = page.xpath(u'//span[contains(text(),"编剧")]')  # screenwriters
    if len(bianju) > 0:
        bianju_a = bianju[0].getnext()
        if bianju_a:
            data['screenwriters'], data['screenwriter_list'] = \
                _parse_celebrity_links(bianju_a.findall('a'))

    directors_el = page.xpath(u'//span[contains(text(),"导演")]')  # directors
    if len(directors_el) > 0:
        directors_a = directors_el[0].getnext()
        if directors_a:
            data['directors'], data['directors_list'] = \
                _parse_celebrity_links(directors_a.findall('a'))

    starring_el = page.xpath(u'//span[contains(text(),"主演")]')  # cast
    if len(starring_el) > 0:
        starring_a = starring_el[0].getnext()
        if starring_a:
            data['starring'], data['starring_list'] = \
                _parse_celebrity_links(starring_a.findall('a'))

    type_el = page.xpath(u'//span[@property="v:genre"]')  # genres
    if len(type_el) > 0:
        mvtype = ""
        for x in type_el:
            mvtype = mvtype + x.text + ","
        mvtype = mvtype.strip(',')
        data['type'] = mvtype

    # The plain-text rows are "label span" + following text node.
    producer_country_el = page.xpath(u'//span[contains(text(),"制片国家/地区:")]')
    if len(producer_country_el) > 0:
        data['producer_country'] = page.xpath(
            u'//span[contains(text(),"制片国家/地区:")]/following::text()[1]')[0]

    language_el = page.xpath(u'//span[contains(text(),"语言:")]')
    if len(language_el) > 0:
        data['language'] = page.xpath(
            u'//span[contains(text(),"语言:")]/following::text()[1]')[0]

    all_episode = page.xpath(u'//span[contains(text(),"集数:")]')
    if len(all_episode) > 0:
        data['all_episode'] = page.xpath(
            u'//span[contains(text(),"集数:")]/following::text()[1]')[0]

    episode_time = page.xpath(u'//span[contains(text(),"单集片长:")]')
    if len(episode_time) > 0:
        data['episode_time'] = page.xpath(
            u'//span[contains(text(),"单集片长:")]')[0].text

    season = page.xpath(
        u'//select[@id="season"]/option[@selected="selected"]')  # season number
    if len(season) > 0:
        data['season'] = season[0].text

    release_date_el = page.xpath(
        u'//span[@property="v:initialReleaseDate"]')  # premiere date(s)
    if len(release_date_el) > 0:
        release_date = ""
        for x in release_date_el:
            release_date = release_date + x.text + "|"
        release_date = release_date.strip('|')
        data['release_date'] = release_date
    duration_el = page.xpath(u'//span[@property="v:runtime"]')
    if len(duration_el) > 0:
        data['duration'] = duration_el[0].text  # runtime

    alias_al = page.xpath(u'//span[contains(text(),"又名:")]')
    if len(alias_al) > 0:
        data["alias"] = page.xpath(
            u'//span[contains(text(),"又名:")]/following::text()[1]')[0]

    IMDb_el = page.xpath(u'//span[contains(text(),"IMDb链接")]')
    if len(IMDb_el) > 0:
        data["IMDb"] = IMDb_el[0].getnext().get("href")

    rating = re.search(u'property="v\:average">(\d*\.\d*)</strong>', r)
    if rating:
        data['rating'] = rating.group(1)

    rating_sum = page.xpath(u'//span[@property="v:votes"]')
    if len(rating_sum) > 0:
        data['rating_sum'] = rating_sum[0].text

    # Prefer the expanded summary when the page has a collapsed/full pair.
    summary_all = page.xpath(u'//span[@class="all hidden"]')
    summary = page.xpath(u'//span[@property="v:summary"]')
    if len(summary_all) > 0:
        data['summary'] = ''.join(
            page.xpath(u'//span[@class="all hidden"]/text()'))
    elif len(summary) > 0:
        data['summary'] = ''.join(
            page.xpath(u'//span[@property="v:summary"]/text()'))

    img_url = page.xpath(u'//img[@title="点击看更多海报"]')
    nbgnbg = page.xpath(u'//a[@title="点击看大图" and @class="nbgnbg"]')
    if len(img_url) > 0:
        data["img_url"] = page.xpath(u'//img[@title="点击看更多海报"]')[0].get("src")
    elif len(nbgnbg) > 0:
        data["img_url"] = nbgnbg[0].get("href")

    if len(data) == 0:
        # Nothing was parsed at all — dump the raw page for debugging.
        # (This check used to run *after* data['tags'] was unconditionally
        # inserted, so it could never fire.)
        print(r)
    tags = page.xpath(u'//div[@class="tags-body"]/a')
    data['tags'] = ''
    for x in tags:
        data['tags'] += "".join([x.text, ','])
    data['tags'] = data['tags'].strip(',')
    return data
Exemplo n.º 27
0
def task_api():
    """Consume Douban ajax list tasks from Redis and enqueue the tv ids found.

    Pops urls from the ajax task set forever; for each url it pages through
    the json api (20 items per page, via the `start=` query parameter) until
    an empty `data` list is returned, adding every unseen tv id to the tv
    task set.  Rotates the anti-crawl `bid` cookie every `max_step` pages.
    """
    i = 0  # pages fetched so far; drives the periodic cookie rotation
    while True:
        url = rd.spop(config.doubantv_ajax_task)
        origin_url = url  # keep the un-rewritten url for the done/failed sets
        if url is None:
            print(u"task_page sleeping....20sec")
            time.sleep(task_wait)
            continue
        if rd.sismember(config.doubantv_ajax_task_done, url) == True:
            print(u"already done%s" % url)
            continue
        start = 0  # page index; the api paginates with start=<n*20>
        while True:
            url = re.sub(u'start=(\d*)', 'start=%s' % str(start * 20), url)
            print(url)
            r = requests_get(url, headers=douban_referer_tag_headers)
            if r is False or r == None:  # request failed
                # Mark the task as failed and move on to the next popped
                # task.  (The previous `continue` here retried the same
                # page forever with no backoff.)
                print(u'filed task:%s' % url)
                rd.sadd(config.doubantv_ajax_task_failed, url)
                break
            try:
                r_data = json.loads(r)
            except Exception as e:
                # A non-json body usually means we were blocked: record the
                # failure, rotate the session, back off, then retry this page.
                rd.sadd(config.doubantv_ajax_task_failed, url)
                print(r)
                print(str(e))
                update_session()
                time.sleep(task_wait)
                print("-----spider  ben   sleep 10 sec....")
                continue
            if len(r_data['data']) == 0:
                # Paged past the last result — this task is complete.
                rd.sadd(config.doubantv_ajax_task_done, origin_url)
                print("done%s" % origin_url)
                break
            for x in r_data['data']:
                if rd.sismember(config.douban_tv_done,
                                x['id']) == False and rd.sismember(
                                    config.douban_tv_failed, x['id']) == False:
                    add_task = rd.sadd(config.douban_tv_task, x['id'])
                    if add_task == 1:
                        print(
                            "---------------join task.----%s--------------------"
                            % x['id'])
                    else:
                        print(
                            '***********task repeat-******%s********************'
                            % x['id'])
                    rd.sadd(config.douban_tvids, x['id'])
            rd.sadd(config.doubantv_ajax_task_done, origin_url)
            print("sleep 2 seconds")
            delay()
            i += 1
            start += 1
            if i % max_step == 0:
                # Refresh the `bid` cookie and warm it up with one request.
                bid = random_str(10)
                session.cookies.set('bid', bid, domain='.douban.com', path='/')
                try:
                    session.get(url=ad_url.format(bid=bid),
                                headers=douban_referer_tag_headers,
                                timeout=timeout)
                except Exception as e:
                    pass
Exemplo n.º 28
0
def _queue_ajax_task(url):
    """Add *url* to the ajax task set unless it already succeeded or failed."""
    if rd.sismember(config.doubantv_ajax_task_failed,
                    url) == False and rd.sismember(
                        config.doubantv_ajax_task_done, url) == False:
        rd.sadd(config.doubantv_ajax_task, url)


def spider_seed(tag_url=tag_url):
    """Scrape the Douban tag page and seed the ajax-task queue.

    Downloads the tag page, extracts the tag categories from the bundled
    app.js, stores them once in Redis/Mongo, then enqueues one ajax list
    url for every (feature, genre, country) and (feature+form, genre,
    country) and (form, genre, country) combination.  Retries up to 5
    times on proxy/request errors, rotating the session each time.
    """
    start = 1
    retry = 5
    while retry > 0:
        try:
            r = requests_get(url=tag_url,
                             headers=douban_home_headers,
                             timeout=timeout)
            appjs_url = re.search(
                u'<script type="text/javascript" src="((.*)app\.js)"></script>',
                r).group(1)
            print(appjs_url)
            appjs = requests_get(url=appjs_url, headers=douban_appjs_headers)
            # The tag list only exists as a javascript object literal inside
            # app.js; cut it out, sanitize, and decode it with demjson.
            jsdata = re.search(
                u'mixins\:\[f\.mixin\],data\:function\(\)\{return(.*)\},ready\:function\(\)\{window',
                appjs).group(1)
            print(jsdata)
            jsdata = re.sub(u'!', '', jsdata)
            jsdata = re.sub(
                u'browserHeight:document.documentElement.clientHeight', '',
                jsdata)
            jsdata = demjson.decode(jsdata)
            save_tags = rd.sadd(config.doubantv_tags,
                                json.dumps(jsdata['tag_categories']))
            if save_tags == 1:  # first time this tag set is seen — persist it
                mongo_douban_tags.insert(
                    {"tag_categories": jsdata["tag_categories"]},
                    check_keys=False)
            ajax_list_url = u'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags={tags}&start=0&genres={genres}&countries={countries}'
            print(len(jsdata["tag_categories"][0]))
            print(len(jsdata["tag_categories"][1]))
            print(len(jsdata["tag_categories"][2]))
            print(len(jsdata["tag_categories"][3]))
            # Slot 0 of every category is the "all ..." pseudo entry —
            # blank it so the formatted urls become unfiltered listings.
            jsdata["tag_categories"][1][0] = ""
            jsdata["tag_categories"][2][0] = ""
            jsdata["tag_categories"][3][0] = ""
            jsdata["tag_categories"][0][0] = ""
            forms = jsdata["tag_categories"][0]      # "全部形式" tag1
            genres = jsdata["tag_categories"][1]     # "全部类型"
            countries = jsdata["tag_categories"][2]  # "全部地区"
            features = jsdata["tag_categories"][3]   # "全部特色" tag2
            for genre in genres:
                for country in countries:
                    for feature in features:
                        url = ajax_list_url.format(tags=feature,
                                                   genres=genre,
                                                   countries=country)
                        _queue_ajax_task(url)
                        print(url)
                        for form in forms:
                            # feature+form combination; trim dangling commas
                            # left by the blanked "all" entries.
                            combined = feature + ',' + form
                            combined = re.sub(u',$', "", combined)
                            combined = re.sub(u'^,', "", combined)
                            _queue_ajax_task(
                                ajax_list_url.format(tags=combined,
                                                     genres=genre,
                                                     countries=country))
                            url = ajax_list_url.format(tags=form,
                                                       genres=genre,
                                                       countries=country)
                            _queue_ajax_task(url)
                            print(url)

            return True
        except requests.exceptions.ProxyError as e:
            print("ttt", str(e))
            update_session(proxy)
        except requests.exceptions.RequestException as e:
            print("xxx", str(e))
            update_session(proxy)
        retry -= 1
    start += 1
    # NOTE(review): this runs at most once, after the retry loop has been
    # exhausted — it likely was meant to live inside the loop.  Kept as-is.
    if start % max_step == 0:
        update_session()
Exemplo n.º 29
0
def readfile():
    """Seed the "imageTask" Redis set with every line of *file_name*.

    Lines are stored verbatim, trailing newline included, and echoed to
    stdout as they are queued.
    """
    handle = open(file_name)
    try:
        entry = handle.readline()
        while entry:
            rd.sadd("imageTask", entry)
            print(entry)
            entry = handle.readline()
    finally:
        handle.close()
Exemplo n.º 30
0
def spider_seed(category_url=category_url):
    """Crawl the category page and seed the page-task queue in Redis.

    Walks category -> listing urls -> per-page urls, adding every new url
    to the page task/url sets and skipping urls already marked done or
    failed.  Retries up to 5 times on proxy/request errors, rotating the
    session between attempts.
    """
    start = 1
    list_url = u'http://list.youku.com'
    retry = 5
    while retry > 0:
        try:
            r = requests_get(url=category_url,
                             headers=leshi_headers,
                             timeout=timeout)
            page = etree.HTML(r)
            category_el = page.xpath(
                u'//div[@class="list_box"]/div[@class="column"]/ul[@class="list_cnt"]/li'
            )
            categories = []
            for x in category_el:
                if x.find("a") != None:
                    categories.append({
                        "url":
                        list_url + x.find("a").get("href"),
                        "title":
                        x.find("a").text.replace(" ", "").replace("\n", "")
                    })
                else:
                    # An <li> without a link is the category currently open;
                    # fall back to the page we were given.
                    categories.append({
                        "url":
                        category_url,
                        "title":
                        x.text.replace(" ", "").replace("\n", "")
                    })
            print(json.dumps(categories))
            for x in categories:
                rd.sadd(config.le_page_task, x['url'])
                rd.sadd(config.le_page_urls, x['url'])
                urls = parse_all_url(x["url"])  # listing urls under this category
                # BUGFIX: these two failure branches called `re.sadd` — the
                # regex module, which has no sadd — raising AttributeError;
                # the Redis client `rd` was intended.
                if urls == False:
                    rd.sadd(config.le_getpage_task,
                            x["url"])  # fetching the category's urls failed
                    continue
                for xx in urls:  # visit each url and collect its page urls
                    rd.sadd(config.le_page_task, xx['url'])
                    rd.sadd(config.le_page_urls, xx['url'])
                    print(xx['url'])
                    print(xx['title'])
                    # NOTE(review): passing the previous response as `r=`
                    # looks suspicious — confirm requests_get's signature.
                    r = requests_get(r=r, url=xx["url"])
                    rr_urls = parse_all_url(r)
                    if rr_urls == False:
                        rd.sadd(config.le_getpage_task,
                                x["url"])  # fetching this url's sub-urls failed
                        continue
                    for xxx in rr_urls:
                        if rd.sismember(config.le_page_failed,
                                        xxx['url']) == True:
                            continue
                        if rd.sismember(config.le_page_done,
                                        xxx['url']) == True:
                            continue
                        rd.sadd(config.le_page_task, xxx['url'])
                        rd.sadd(config.le_page_urls, xxx['url'])
            return True
        except requests.exceptions.ProxyError as e:
            print("ttt", str(e))
            update_session(proxy)
        except requests.exceptions.RequestException as e:
            print("xxx", str(e))
            update_session(proxy)
        retry -= 1
    start += 1
    # NOTE(review): runs at most once, after the retry loop — the comment
    # ("every 50 steps") suggests it was meant to be inside the loop.
    if start % 20 == 0:
        update_session()