Exemplo n.º 1
0
def analysis_relation2_fromurl(url):
    """Parse a star page and collect its relation->name pairs.

    :param url: star page url to fetch
    :return: list of single-entry dicts {relation_label: person_name},
             or None when the page fails to load
    """
    res = []
    data = lurl.load(url)
    if data is None:
        print('timeout:' + url)
        return None
    soup = bs4.BeautifulSoup(data, 'html.parser')

    # Relation entries live inside the star-relation slider widget.
    tags = soup.find_all('ul', attrs={'class': 'slider maqueeCanvas'})
    for tag in tags:
        soup2 = bs4.BeautifulSoup(str(tag), 'html.parser')
        name_divs = soup2.find_all('div', attrs={'class': 'name'})
        if len(name_divs) == 0:
            continue
        for div in name_divs:
            # Entries without an <em> carry no person name; skip them.
            if not div.em:
                continue
            full = div.text       # label + name, e.g. relation text then name
            name = div.em.text    # just the person name (inside <em>)
            # The relation label is the text preceding the <em> name.
            relation = full[0:len(full) - len(name)]
            res.append({relation: name})
    return res
Exemplo n.º 2
0
def analysis_movieurl(url):
    """Collect actor names from a movie page.

    :param url: movie page url
    :return: set of actor names, or None when the page fails to load or
             contains no actor markup
    """
    relation = set()
    data = lurl.load(url)
    if data is None:
        # BUG FIX: lurl.load returns None on timeout (every sibling parser
        # guards against it); BeautifulSoup(None, ...) would crash here.
        print('timeout:' + url)
        return None

    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    # Newer pages use <a class="actor-name "> (note the trailing space);
    # older ones use <p class="actorName">.
    tags1 = soup1.find_all('a', attrs={'class': 'actor-name '})
    if len(tags1) == 0:
        tags2 = soup1.find_all('p', attrs={'class': 'actorName'})
        if len(tags2) == 0:
            return None
        for tag2 in tags2:
            relation.add(tag2.text.strip())
    else:
        for tag1 in tags1:
            relation.add(tag1.text.strip())

    return relation
Exemplo n.º 3
0
def analysis_showurl(url, show_path, file_name):
    """Parse a show url and return the set of host (主持人) names.

    :param url: show page url to parse
    :param show_path: directory for the html dump (currently unused; the
                      write call is commented out upstream)
    :param file_name: file name recorded in the failure log
    :return: set of host names, or None on load/parse failure
    """
    data = lurl.load(url)
    if data is None:
        print('timeout:' + url)
        with open('log', 'a', encoding='utf-8') as f:
            f.write('show_url load 失败:' + str(file_name) + '>----->' + url +
                    '\n')
        return None
    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    tag1 = soup1.find('dl', attrs={'class': 'basicInfo-block basicInfo-left'})
    if not tag1:
        return None
    num = -1
    soup2 = bs4.BeautifulSoup(str(tag1), 'html.parser')
    tags2 = soup2.find_all('dt', attrs={'class': True})
    tags3 = soup2.find_all('dd', attrs={'class': True})
    # Walk the <dt> labels looking for '主持人'; num ends up as the index of
    # the matching <dd>, or -1 when the label is absent.
    for tag2 in tags2:
        if tag2.text.strip('\n') != '主持人' and num == len(tags2) - 2:
            num = -1
        else:
            if tag2.text.strip('\n') == '主持人':
                num = num + 1
                break
            else:
                num = num + 1
    if num != -1:
        zhuchiren = tags3[num]
    else:
        return None
    # Split on the fullwidth comma and the enumeration comma, then strip
    # citation markers like [1], stray digits/newlines/nbsp and a trailing
    # parenthesised note.  (Raw strings fix the invalid-escape warnings and
    # `names` no longer shadows the builtin `list`.)
    names = re.split(r'[,、]', zhuchiren.text.strip('\n'))
    for i in range(len(names)):
        names[i] = re.sub(r'[\[\d\]\n\xa0]|[\((][^\))]+[\))]$', '', names[i])
    return set(names)
Exemplo n.º 4
0
def analysis_showurl(url):
    """Parse a show url and return its host-relation set.

    :param url: show page url
    :return: set of host (主持人) names, or None on load/parse failure
    """
    data = lurl.load(url)
    if data is None:
        print('timeout:' + url)
        with open('log', 'a') as f:
            f.write('show_url load 失败:' + url + '\n')
        return None
    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    tag1 = soup1.find('dl', attrs={'class': 'basicInfo-block basicInfo-left'})
    if not tag1:
        return None
    num = -1
    soup2 = bs4.BeautifulSoup(str(tag1), 'html.parser')
    tags2 = soup2.find_all('dt', attrs={'class': True})
    tags3 = soup2.find_all('dd', attrs={'class': True})
    # Walk the <dt> labels looking for '主持人'; num ends up as the index of
    # the matching <dd>, or -1 when the label is absent.
    for tag2 in tags2:
        if tag2.text.strip('\n') != '主持人' and num == len(tags2) - 2:
            num = -1
        else:
            if tag2.text.strip('\n') == '主持人':
                num = num + 1
                break
            else:
                num = num + 1
    if num != -1:
        zhuchiren = tags3[num]
    else:
        return None
    # Split on the fullwidth/enumeration commas, then strip citation markers
    # like [1], stray digits/newlines/nbsp and a trailing parenthesised note.
    names = re.split(r'[,、]', zhuchiren.text.strip('\n'))
    for i in range(len(names)):
        names[i] = re.sub(r'[\[\d\]\n\xa0]|[\((][^\))]+[\))]$', '', names[i])
    return set(names)
Exemplo n.º 5
0
def analysis_movieurl(url, movie_path, file_name):
    """Parse a movie url, write its html dump, and return the lead-actor set.

    :param url: movie page url
    :param movie_path: directory where the html dump is written
    :param file_name: name for the html dump and the failure log entry
    :return: set of lead-actor (主    演) names, or None on failure
    """
    data = lurl.load(url)
    if data is None:
        print('timeout:' + url)
        with open('log', 'a', encoding='utf-8') as f:
            f.write('movie_url load 失败:' + str(file_name) + '>----->' + url +
                    '\n')
        return None
    lurl.write_html(file_name, data, movie_path)
    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    tag1 = soup1.find('div', attrs={'class': 'basic-info cmn-clearfix'})
    if not tag1:
        return None
    num = -1
    soup2 = bs4.BeautifulSoup(str(tag1), 'html.parser')
    tags2 = soup2.find_all('dt', attrs={'class': 'basicInfo-item name'})
    tags3 = soup2.find_all('dd', attrs={'class': 'basicInfo-item value'})
    # Walk the <dt> labels looking for '主    演'; num ends up as the index
    # of the matching <dd>, or -1 when the label is absent.
    for tag2 in tags2:
        if tag2.text != '主    演' and num == len(tags2) - 2:
            num = -1
        else:
            if tag2.text == '主    演':
                num = num + 1
                break
            else:
                num = num + 1
    if num != -1:
        zhuyan = tags3[num]
    else:
        return None
    # Split on the fullwidth/enumeration commas, then strip citation markers
    # like [1], stray digits/newlines/nbsp and a trailing parenthesised note.
    names = re.split(r'[,、]', zhuyan.text.strip('\n'))
    for i in range(len(names)):
        names[i] = re.sub(r'[\[\d\]\n\xa0]|[\((][^\))]+[\))]$', '', names[i])
    return set(names)
Exemplo n.º 6
0
def get_movieurl_fromurl(star_url):
    """Extract movie page urls from a star page.

    :param star_url: star page url
    :return: list of absolute movie urls, or None on load failure
    """
    url = []
    data = lurl.load(star_url)
    if data is None:
        print('timeout:' + star_url)
        return None
    soup = bs4.BeautifulSoup(data, 'html.parser')
    # The star's works are listed inside the works block; each slider
    # <ul> holds <a href> links to individual movie pages.
    tags = soup.find_all('div', attrs={'class': 'star-info-block works'})
    for tag in tags:
        soup2 = bs4.BeautifulSoup(str(tag), 'html.parser')
        sliders = soup2.find_all('ul', attrs={'class': 'slider maqueeCanvas'})
        if len(sliders) == 0:
            continue
        for slider in sliders:
            soup3 = bs4.BeautifulSoup(str(slider), 'html.parser')
            links = soup3.find_all('a', attrs={'href': True})
            for link in links:
                item = link['href']
                if item[0] != '/':
                    # Already an absolute url.
                    movie_url = item
                else:
                    # Relative href: prefix with the site base url `tmp`
                    # (module-level constant defined elsewhere in the file).
                    movie_url = tmp + item
                url.append(movie_url)
    return url
Exemplo n.º 7
0
def analysis_movieurl_list(url):
    """Parse a movie url and return an ordered, de-duplicated actor list.

    Unlike the set-returning variants, callers need a stable order here, so
    duplicates are removed while keeping first-appearance order.

    :param url: movie page url
    :return: ordered list of lead-actor names, or None on failure
    """
    data = lurl.load(url)
    if data is None:
        print('timeout:' + url)
        with open('log', 'a') as f:
            f.write('movie_url load 失败:' + url + '\n')
        return None
    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    tag1 = soup1.find('div', attrs={'class': 'basic-info cmn-clearfix'})
    if not tag1:
        return None
    num = -1
    soup2 = bs4.BeautifulSoup(str(tag1), 'html.parser')
    tags2 = soup2.find_all('dt', attrs={'class': 'basicInfo-item name'})
    tags3 = soup2.find_all('dd', attrs={'class': 'basicInfo-item value'})
    # Walk the <dt> labels looking for '主    演'; num ends up as the index
    # of the matching <dd>, or -1 when the label is absent.
    for tag2 in tags2:
        if tag2.text != '主    演' and num == len(tags2) - 2:
            num = -1
        else:
            if tag2.text == '主    演':
                num = num + 1
                break
            else:
                num = num + 1
    if num != -1:
        zhuyan = tags3[num]
    else:
        return None
    tmp_list = re.split(r'[,、]', zhuyan.text.strip('\n'))
    for i in range(len(tmp_list)):
        # Strip citation markers like [1], stray digits/newlines/nbsp and a
        # trailing parenthesised note.
        tmp_list[i] = re.sub(r'[\[\d\]\n\xa0]|[\((][^\))]+[\))]$', '',
                             tmp_list[i])
    # De-duplicate keeping first-appearance order.  dict.fromkeys is O(n);
    # the original list(set(...)) + sort(key=tmp_list.index) produced the
    # same order in O(n^2).
    return list(dict.fromkeys(tmp_list))
Exemplo n.º 8
0
def analysis_movieurl(url):
    """Parse a movie url and return its movie-relation set.

    :param url: movie page url to parse
    :return: set of lead-actor (主    演) names, or None on failure
    """
    data = lurl.load(url)
    if data is None:
        print('timeout:' + url)
        with open('log', 'a') as f:
            f.write('movie_url load 失败:' + url + '\n')
        return None
    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    tag1 = soup1.find('div', attrs={'class': 'basic-info cmn-clearfix'})
    if not tag1:
        return None
    num = -1
    soup2 = bs4.BeautifulSoup(str(tag1), 'html.parser')
    tags2 = soup2.find_all('dt', attrs={'class': 'basicInfo-item name'})
    tags3 = soup2.find_all('dd', attrs={'class': 'basicInfo-item value'})
    # Walk the <dt> labels looking for '主    演'; num ends up as the index
    # of the matching <dd>, or -1 when the label is absent.
    for tag2 in tags2:
        if tag2.text != '主    演' and num == len(tags2) - 2:
            num = -1
        else:
            if tag2.text == '主    演':
                num = num + 1
                break
            else:
                num = num + 1
    if num != -1:
        zhuyan = tags3[num]
    else:
        return None
    # Split on the fullwidth/enumeration commas, then strip citation markers
    # like [1], stray digits/newlines/nbsp and a trailing parenthesised note.
    names = re.split(r'[,、]', zhuyan.text.strip('\n'))
    for i in range(len(names)):
        names[i] = re.sub(r'[\[\d\]\n\xa0]|[\((][^\))]+[\))]$', '', names[i])
    return set(names)
Exemplo n.º 9
0
def get_showurl_fromurl(star_url):
    """Extract show page urls from a star page.

    :param star_url: star page url
    :return: list of absolute show urls, or None on load failure or when
             the page has no show table / links
    """
    res = []
    data = lurl.load(star_url)
    if data is None:
        print('timeout:' + star_url)
        return None
    # BUG FIX: the page content is in `data`; the original parsed an
    # undefined name `f`, raising NameError at runtime.
    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    tags1 = soup1.find_all('table', attrs={'class': 'cell-module'})
    if len(tags1) == 0:
        return None
    soup2 = bs4.BeautifulSoup(str(tags1[0]), 'html.parser')
    tags2 = soup2.find_all('a', attrs={'href': True})
    if len(tags2) == 0:
        return None
    for tag2 in tags2:
        url = tag2['href']
        if url[0] == '/':
            # Relative href: prefix with the site base url `tmp`
            # (module-level constant defined elsewhere in the file).
            show_url = tmp + url
        elif url[1] == 'h':
            # NOTE(review): this tests the SECOND character for 'h'; for an
            # absolute 'http...' url that character is 't', so this branch
            # may never match — possibly meant url[0] == 'h'.  Kept as-is;
            # confirm against real hrefs before changing.
            show_url = url
        else:
            continue
        res.append(show_url)
    return res
Exemplo n.º 10
0
def analysis_showurl2(url):
    """Parse a show page (basic-info layout) and return the host-name set.

    Variant of analysis_showurl that reads the 'basic-info cmn-clearfix'
    container instead of the old 'basicInfo-block' layout.

    :param url: show page url
    :return: set of host (主持人) names, or None on failure
    """
    data = lurl.load(url)
    if data is None:
        print('timeout:' + url)
        return None
    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    tag1 = soup1.find('div', attrs={'class': 'basic-info cmn-clearfix'})
    if not tag1:
        return None
    num = -1
    soup2 = bs4.BeautifulSoup(str(tag1), 'html.parser')
    tags2 = soup2.find_all('dt', attrs={'class': True})
    tags3 = soup2.find_all('dd', attrs={'class': True})
    # Walk the <dt> labels looking for '主持人'; num ends up as the index of
    # the matching <dd>, or -1 when the label is absent.
    for tag2 in tags2:
        if tag2.text.strip('\n') != '主持人' and num == len(tags2) - 2:
            num = -1
        else:
            if tag2.text.strip('\n') == '主持人':
                num = num + 1
                break
            else:
                num = num + 1
    if num != -1:
        zhuchiren = tags3[num]
    else:
        return None
    # Split on the enumeration comma only (no citation-marker cleanup in
    # this variant); `names` avoids shadowing the builtin `list`.
    names = zhuchiren.text.strip('\n').split('、')
    return set(names)
Exemplo n.º 11
0
def analysis_movieurl2(url):
    """Parse a movie page (basic-info layout) and return the lead-actor set.

    :param url: movie page url
    :return: set of lead-actor (主    演) names, or None on failure
    """
    data = lurl.load(url)
    if data is None:
        print('timeout:' + url)
        return None
    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    tag1 = soup1.find('div', attrs={'class': 'basic-info cmn-clearfix'})
    if not tag1:
        return None
    num = -1
    soup2 = bs4.BeautifulSoup(str(tag1), 'html.parser')
    tags2 = soup2.find_all('dt', attrs={'class': 'basicInfo-item name'})
    tags3 = soup2.find_all('dd', attrs={'class': 'basicInfo-item value'})
    # Walk the <dt> labels looking for '主    演'; num ends up as the index
    # of the matching <dd>, or -1 when the label is absent.
    for tag2 in tags2:
        if tag2.text != '主    演' and num == len(tags2) - 2:
            num = -1
        else:
            if tag2.text == '主    演':
                num = num + 1
                break
            else:
                num = num + 1
    if num != -1:
        zhuyan = tags3[num]
    else:
        return None
    # Split on the fullwidth comma only (no regex cleanup in this variant);
    # `names` avoids shadowing the builtin `list`.
    names = zhuyan.text.strip('\n').split(',')
    return set(names)
def recomend(star_url, star_path, movie_path, show_path, path_res):
    """Build one recommendation record per star and append it to path_res.

    For each star url the page is loaded and three things are extracted:
    the star's relation list, the movie urls and the show urls.  Movie and
    show urls are resolved to co-star/host name lists via helpers.  Each
    star yields one JSON line of the form
    {star_name: [{'relation': [...]}, {'movie': [...]}, {'show': [...]}]}.

    :param star_url: dict mapping star name -> star page url
    :param star_path: directory for star html dumps (write call commented out)
    :param movie_path: directory for movie html dumps (unused here)
    :param show_path: directory for show html dumps, passed to get_showset
    :param path_res: result file; one JSON line is appended per star
    """
    with open(path_res, 'a', encoding='UTF-8') as f:
        for k, v in star_url.items():
            star_name = k  # star name
            print('=======' + star_name + '=======')

            full = {}  # one output line: {star_name: full_relation}
            full_relation = []  # the whole record for this star
            movie_dic = {}  # movie recommendation list
            show_dic = {}  # show recommendation list

            data = lurl.load(v)
            if data is None:
                with open('log', 'a', encoding='utf-8') as f1:
                    f1.write('明星url_load失败:')
                    f1.write(k + ':' + v + '\n')
                continue
            # Parse the page: relations, movie urls, show urls.
            relation_list = analysis.get_relations(data)
            movie_url = analysis.get_movieurl(data)
            show_url = analysis.get_showurl(data)

            # Store relations as {'relation': [name, ...]} — flatten the
            # list of single-entry {relation_label: name} dicts.
            if relation_list:
                tmp_list = []
                print('relation')
                for entry in relation_list:
                    for key in entry.keys():
                        tmp_list.append(entry[key])
                full_relation.append({'relation': tmp_list})
                print('relation_over')

            # Resolve movie urls to co-star names.
            if movie_url:
                print('movie')
                movie_relation_list = get_movie_relation_list(movie_url)
                # Drop the star's own name from the recommendations.
                if movie_relation_list and star_name in movie_relation_list:
                    movie_relation_list.remove(star_name)
                movie_dic['movie'] = movie_relation_list
                if movie_dic['movie']:
                    full_relation.append(movie_dic)
                    print('movie_over')

            # Resolve show urls to host names.
            if show_url:
                print('show')
                show_set = get_showset(show_url, show_path, star_name)
                # Drop the star's own name (added 2017-12-28).
                if star_name in show_set:
                    show_set.remove(star_name)
                show_dic['show'] = list(show_set)
                if show_dic['show']:
                    full_relation.append(show_dic)
                    print('show_over')

            if len(full_relation) != 0:
                full[star_name] = full_relation
                data = js.dumps(full, ensure_ascii=False)
                f.write(data + '\n')
            else:
                # Keep track of stars that produced no recommendations.
                with open('Null_recommend_list', 'a', encoding='UTF-8') as f3:
                    f3.write(k + ':' + v + '\n')
Exemplo n.º 13
0
def recomend(star_url, path):
    """Build one recommendation record per star and append it to path.

    Sequential variant: movie and show urls are resolved inline by calling
    analysis.analysis_movieurl / analysis.analysis_showurl per url.  Each
    star yields one JSON line of the form
    {star_name: [{'relation': [...]}, {'movie': [...]}, {'show': [...]}]}.

    :param star_url: dict mapping star name -> star page url
    :param path: result file; one JSON line is appended per star
    """
    with open(path, 'a') as f:
        for k, v in star_url.items():
            star_name = k  # star name
            print('=======' + star_name + '=======')

            full = {}  # one output line: {star_name: full_relation}
            full_relation = []  # the whole record for this star
            movie_dic = {}  # movie recommendation list
            show_dic = {}  # show recommendation list

            data = lurl.load(v)
            if data is None:
                with open('log', 'a') as f1:
                    f1.write('明星url_load失败:')
                    f1.write(k + ':' + v + '\n')
                continue
            # Parse the page: relations, movie urls, show urls.
            relation_list = analysis.get_relations(data)
            movie_url = analysis.get_movieurl(data)
            show_url = analysis.get_showurl(data)

            # Store relations as {'relation': [name, ...]} — flatten the
            # list of single-entry {relation_label: name} dicts.  Truthiness
            # check also guards the None case the original len() call missed.
            if relation_list:
                tmp_list = []
                print('relation')
                for entry in relation_list:
                    for key in entry.keys():
                        tmp_list.append(entry[key])
                full_relation.append({'relation': tmp_list})
                print('relation_over')

            # Resolve each movie url to its co-star set and union them.
            if movie_url is not None:
                print('movie')
                movie_set = set()
                for url in movie_url:
                    tmpset = analysis.analysis_movieurl(url)
                    if tmpset:
                        movie_set = movie_set | tmpset
                # Drop the star's own name from the recommendations.
                if star_name in movie_set:
                    movie_set.remove(star_name)
                movie_dic['movie'] = list(movie_set)
                if movie_dic['movie']:
                    full_relation.append(movie_dic)
                    print('movie_over')

            # Resolve each show url to its host set and union them.
            if show_url is not None:
                print('show')
                show_set = set()
                for url in show_url:
                    tmpset2 = analysis.analysis_showurl(url)
                    if tmpset2:
                        show_set = show_set | tmpset2
                # Drop the star's own name (added 2017-12-28).
                if star_name in show_set:
                    show_set.remove(star_name)
                show_dic['show'] = list(show_set)
                if show_dic['show']:
                    full_relation.append(show_dic)
                    print('show_over')

            if len(full_relation) != 0:
                full[star_name] = full_relation
                data = js.dumps(full, ensure_ascii=False)
                f.write(data + '\n')
            else:
                # Keep track of stars that produced no recommendations.
                with open('None_recommend_list', 'a') as f3:
                    f3.write(k + ':' + v + '\n')