# All examples assume the surrounding spider framework supplies log, tools,
# base_parser, Constance and the module constants SITE_ID / NAME.
def add_root_url(search_keyword1=None, search_keyword2=None, search_keyword3=None):
    # Use None defaults instead of shared mutable list defaults, then normalize.
    search_keyword1 = search_keyword1 or []
    search_keyword2 = search_keyword2 or []
    search_keyword3 = search_keyword3 or []

    log.debug(
        '''
        Add root URLs
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' %
        (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {
        'search_keyword1': search_keyword1,
        'search_keyword2': search_keyword2,
        'search_keyword3': search_keyword3
    }

    search_keyword = search_keyword1 + search_keyword2

    for i in search_keyword:
        if not i:
            continue
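        # Baidu pages results via pn as a result offset: 0, 10, ..., 750,
        # i.e. the first 76 result pages. ' 视频' ("video") is appended to
        # the query below to bias results toward videos.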
        for num in range(0, 760, 10):
            link = "https://www.baidu.com/s?wd=%s%s&pn=%d" % (i, ' 视频', num)
            link = tools.quote(link, safe='#/:?=&%')
            if not base_parser.add_url('VA_urls', SITE_ID, link,
                                       remark=remark):
                base_parser.update_url('VA_urls', link, Constance.TODO)
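
A minimal sketch of how this entry point might be driven; the keyword values
below are made up for illustration, the real framework presumably supplies
them from its task configuration:

    # queues Baidu video-search pages for every keyword in both lists
    # (the lists are concatenated, not cross-combined)
    add_root_url(search_keyword1=['keyword_a', 'keyword_b'],
                 search_keyword2=['keyword_c'])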
Example #2
def add_root_url(parser_params):
    log.debug('''
        Add root URLs
        parser_params : %s
        ''' % str(parser_params))

    for program in parser_params:
        # Each entry is expected to be
        # [program_id, chan_name, program_name, program_type, image_url, is_have_official_blog],
        # e.g. [91, '山东卫视', '调查', '新闻', <image_url>, <flag>].
        (program_id, chan_name, program_name, program_type, image_url,
         is_have_official_blog) = program
        if is_have_official_blog == 2:
            search_keyword = tools.quote(chan_name + ' ' + program_name,
                                         safe='/:?=&%')
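            # '100103type%3D3%26q%3D' is a pre-encoded query fragment; it
            # decodes to 'containerid=100103type=3&q=' (Weibo's user-search
            # container, matching type=user in the URL).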
            url = 'http://m.weibo.cn/api/container/getIndex?type=user&containerid=100103type%3D3%26q%3D' + search_keyword
            base_parser.add_url('mms_urls',
                                SITE_ID,
                                url,
                                remark={
                                    'program_id': program_id,
                                    'chan_name': chan_name,
                                    'program_name': program_name
                                })
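
Why the fragment is kept pre-encoded can be checked with the standard library
(a quick illustration, not part of the source):

    from urllib.parse import unquote

    print(unquote('100103type%3D3%26q%3D'))  # -> 100103type=3&q=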
Example #3
def add_root_url(parser_params=None):
    # None default avoids a shared mutable default dict.
    parser_params = parser_params or {}
    log.debug('''
        Add root URLs
        parser_params : %s
        ''' % str(parser_params))

    search_keyword1 = parser_params.get('search_keyword1', [])
    search_keyword2 = parser_params.get('search_keyword2', [])
    search_keyword3 = parser_params.get('search_keyword3', [])

    remark = parser_params

    # Cross-combine the two keyword lists; if one is empty, fall back to
    # the other.
    if search_keyword1 and search_keyword2:
        search_keywords = [(str_key1 + str_key2).strip()
                           for str_key1 in search_keyword1
                           for str_key2 in search_keyword2]
    elif search_keyword1:
        search_keywords = search_keyword1
    else:
        search_keywords = search_keyword2

    for j in search_keywords:
        if not j.strip():
            continue

        for i in range(1, 109):
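            # 108 search pages per keyword; the pre-encoded tail decodes to
            # 'containerid=100103type=1&q=<keyword>' (type=1 appears to be the
            # general search container, vs the type=3 user search above).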
            url = 'https://m.weibo.cn/container/getIndex?type=all&queryVal=%s&luicode=10000011' % j + \
                  '&lfid=106003type%3D1&' + 'title=%s&containerid=100103type' % j + '%3D1%26q%3D' + '%s&' % j + \
                  'page=%d' % i
            url = tools.quote(url, safe='/:?=&%')
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
Example #4
def add_root_url(parser_params=None):
    parser_params = parser_params or {}
    log.debug('''
        Add root URLs
        parser_params : %s
        ''' % str(parser_params))

    search_keyword1 = parser_params.get('search_keyword1', [])
    search_keyword2 = parser_params.get('search_keyword2', [])
    search_keyword3 = parser_params.get('search_keyword3', [])

    remark = parser_params

    if search_keyword1 and search_keyword2:
        search_keywords = [(str_key1 + str_key2).strip()
                           for str_key1 in search_keyword1
                           for str_key2 in search_keyword2]
    elif search_keyword1:
        search_keywords = search_keyword1
    else:
        search_keywords = search_keyword2

    for i in search_keywords:
        if not i.strip():
            continue
        for num in range(0, 760, 10):
            link = "https://www.baidu.com/s?wd=%s%s&pn=%d" % (i, ' 视频', num)
            link = tools.quote(link, safe='#/:?=&%')
            if not base_parser.add_url('VA_urls', SITE_ID, link,
                                       remark=remark):
                base_parser.update_url('VA_urls', link, Constance.TODO)
Example #5
def add_root_url(keywords):
    log.debug('''
        Add root URLs
        keywords : %s
        ''' % str(keywords))
    for keyword in keywords:
        next_keyword = False
        quote_keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            url = 'http://www.soku.com/search_video/q_%s_orderby_2_limitdate_0?spm=a2h0k.8191407.0.0&site=14&' \
                  '_lg=10&page=%s' % (quote_keyword, page_index)
            log.debug('''
                Processing: %s
                url : %s''' % (keyword, url))
            html, res = tools.get_html_by_requests(url)
            video_list_title = tools.get_tag(html, 'div', {'class': 'v-thumb'})
            video_list_url = tools.get_tag(html, 'div', {'class': 'v-meta'})
            video_list_time = tools.get_tag(html, 'div',
                                            {'class': 'v-meta-data'})
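            # The result page apparently renders two 'v-meta-data' blocks per
            # hit; the second (odd-indexed) one carries the release time, hence
            # the info_index * 2 + 1 lookup below.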

            if not video_list_title:
                break

            for info_index, video_info in enumerate(video_list_title):
                image_url = tools.get_info(str(video_info),
                                           'src="(.+?)"',
                                           fetch_one=True)
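                # src is protocol-relative ('//host/...'), so a scheme is
                # prepended; the same pattern recurs in the examples below.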
                image_url = 'http:' + image_url
                print(image_url)
                title = tools.get_info(str(video_info),
                                       'alt="(.+?)"',
                                       fetch_one=True)
                print(title)
                url = tools.get_info(str(video_list_url[info_index]),
                                     'href="(.+?)"',
                                     fetch_one=True)
                url = 'http:' + url
                print(url)
                release_time = tools.get_info(str(
                    video_list_time[info_index * 2 + 1]),
                                              'class="r">(.+?)<',
                                              fetch_one=True)
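                # get_release_time presumably normalizes relative phrases such
                # as '3天前' ("3 days ago") into an absolute timestamp.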
                release_time = get_release_time(release_time)
                print(release_time)

                is_continue = base_parser.save_video_info(
                    image_url=image_url,
                    url=url,
                    title=title,
                    release_time=release_time,
                    site_name=NAME)

                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
Example #6
def add_root_url(keywords):
    log.debug('''
        Add root URLs
        keywords : %s
        ''' % str(keywords))
    for keyword in keywords:
        print(keyword)
        next_keyword = False
        keyword = tools.quote(keyword)
        for page_index in range(1, 20):
            url = 'http://so.iqiyi.com/so/q_%s_ctg__t_0_page_%s_p_1_qc_0_rd__site__m_4_bitrate_' % (
                keyword, page_index)

            print(url)
            html, res = tools.get_html_by_requests(url)
            video_list_title = tools.get_tag(html, 'a',
                                             {'class': 'figure-180101'})
            video_list_time = tools.get_tag(html, 'div',
                                            {'class': 'result_info'})
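            # One 'figure-180101' anchor (thumbnail, title, link) and one
            # 'result_info' block (metadata) are expected per hit, so the two
            # lists are indexed in lockstep.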
            if not video_list_time:
                print('No video list; stop paging')
                break

            for info_index, video_info in enumerate(video_list_time):
                try:
                    image_url = tools.get_info(str(
                        video_list_title[info_index]),
                                               'src="(.+?)"',
                                               fetch_one=True)
                    title = tools.get_info(str(video_list_title[info_index]),
                                           'title="(.+?)"',
                                           fetch_one=True)
                    url = tools.get_info(str(video_list_title[info_index]),
                                         'href="(.+?)"',
                                         fetch_one=True)
                    release_time = tools.get_tag(
                        video_info,
                        'em', {
                            'class': 'result_info_desc'
                        },
                        find_all=False).get_text()
                    is_continue = base_parser.save_video_info(
                        image_url=image_url,
                        url=url,
                        title=title,
                        release_time=release_time,
                        site_name=NAME)
                    if not is_continue:
                        next_keyword = True
                        break

                except Exception as e:
                    log.error(e)

            if next_keyword:
                break
Example #7
def add_root_url(keywords):
    log.debug('''
        Add root URLs
        keywords : %s
        ''' % str(keywords))
    for keyword in keywords:
        log.debug('Add root URL for keyword: ' + keyword)
        keyword = tools.quote(keyword)
        link = 'http://news.baidu.com/ns?word=%s&pn=0&cl=2&ct=0&tn=news&rn=50&ie=utf-8&bt=0&et=0' % keyword
        base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, link, remark={'offset': 0})
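
Only the first page (pn=0) is queued; the offset stored in remark is
presumably what the downstream page parser advances by rn (50 results per
page) to queue follow-up pages, along these lines (hypothetical sketch, not
from the source):

    # inside the page parser: queue the next 50-result page
    next_offset = remark['offset'] + 50
    next_link = link.replace('pn=0', 'pn=%d' % next_offset)
    base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, next_link,
                        remark={'offset': next_offset})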
Example #8
def add_root_url(keywords):
    log.debug('''
        Add root URLs
        keywords : %s
        ''' % str(keywords))
    for keyword in keywords:
        next_keyword = False
        keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            url = 'https://v.qq.com/x/search/?q=%s&filter=sort=1&&cur=%s' % (
                keyword, page_index)
            print(url)
            html, res = tools.get_html_by_requests(url)
            video_list_title = tools.get_tag(html, 'div',
                                             {'class': 'result_item'})
            if not video_list_title:
                break
            for info_index, video_info in enumerate(video_list_title):
                try:
                    image_url = tools.get_tag(video_info,
                                              'img',
                                              find_all=False)['src']
                    image_url = 'http:' + image_url
                    title = tools.get_tag(video_info, 'h2',
                                          find_all=False).get_text()
                    print(title)
                    url = tools.get_tag(video_info, 'h2',
                                        find_all=False).a['href']
                    release_time = tools.get_tag(video_info,
                                                 'span', {
                                                     'class': 'content'
                                                 },
                                                 find_all=False).get_text()
                    print(release_time)
                    release_time = get_release_time(release_time)
                    print(release_time)
                except Exception as e:
                    log.error(e)
                    continue

                is_continue = base_parser.save_video_info(
                    image_url=image_url,
                    url=url,
                    title=title,
                    release_time=release_time,
                    site_name=NAME)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
Example #9
def add_root_url(parser_params):
    log.debug('''
        Add root URLs
        parser_params : %s
        ''' % str(parser_params))

    result_list = parser_params['result_list']
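    # Each row is expected to carry (comma-separated keywords, monitor type).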
    for result in result_list:
        monitor_type = result[1]
        keywords = str(result[0]).split(',')
        for search_keyword in keywords:
            if not search_keyword:
                continue
            search_keyword = tools.quote(search_keyword, safe='/:?=&%')
            url = 'http://m.weibo.cn/api/container/getIndex?type=user&containerid=100103type%3D3%26q%3D' + search_keyword
            base_parser.add_url('WWA_weibo_user_urls',
                                SITE_ID,
                                url,
                                remark=monitor_type)
Example #10
def add_root_url(keywords):
    log.debug('''
        Add root URLs
        keywords : %s
        ''' % str(keywords))
    for keyword in keywords:
        next_keyword = False
        # Quote once per keyword, outside the page loop, to avoid
        # double-encoding it from the second page onward.
        quote_keyword = tools.quote(keyword)
        for page_index in range(1, 10):
            url = 'https://so.tv.sohu.com/mts?wd=%s&c=0&v=0&length=0&limit=0&site=0&o=3&p=%s&st=&suged=&filter=0' % \
                  (quote_keyword, page_index)
            log.debug('Processing url = %s' % url)
            html, res = tools.get_html_by_requests(url)
            video_list_time = tools.get_tag(html, 'a', {'class': 'tcount'})
            video_list_title = tools.get_tag(html, 'div', {'class': 'pic170'})
            if not video_list_title:
                break
            for info_index, video_info in enumerate(video_list_title):
                image_url = tools.get_tag(video_info, 'img',
                                          find_all=False)['src']
                image_url = 'http:' + image_url
                title = video_info.a['title']
                url = video_info.a['href']
                url = 'http:' + url
                release_time = video_list_time[info_index].get_text()
                print(release_time)
                release_time = get_release_time(release_time)
                print(release_time)

                is_continue = base_parser.save_video_info(
                    image_url=image_url,
                    url=url,
                    title=title,
                    release_time=release_time,
                    site_name=NAME)
                if not is_continue:
                    next_keyword = True
                    break

            if next_keyword:
                break
Example #11
def add_root_url(search_keyword1=None, search_keyword2=None, search_keyword3=None):
    search_keyword1 = search_keyword1 or []
    search_keyword2 = search_keyword2 or []
    search_keyword3 = search_keyword3 or []

    log.debug('''
        Add root URLs
        search_keyword1 = %s
        search_keyword2 = %s
        search_keyword3 = %s
        ''' % (str(search_keyword1), str(search_keyword2), str(search_keyword3)))

    remark = {'search_keyword1': search_keyword1,
              'search_keyword2': search_keyword2,
              'search_keyword3': search_keyword3}

    search_keyword = search_keyword1 + search_keyword2

    for j in search_keyword:
        if not j:
            continue
        for i in range(1, 109):
            url = 'https://m.weibo.cn/container/getIndex?type=all&queryVal=%s&luicode=10000011' % j + \
                  '&lfid=106003type%3D1&' + 'title=%s&containerid=100103type' % j + '%3D1%26q%3D' + '%s&' % j + \
                  'page=%d' % i
            url = tools.quote(url, safe='/:?=&%')
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)