import requests

# base_url, rate_count_pattern and the helper module t are module-level
# globals from the original project (see the stand-in sketch after this
# function).
def parse_url(offset):
    resp = requests.get(base_url, params={'page': offset})
    print("Parsing: " + resp.url)
    result = []
    if resp.status_code == 200:
        soup = t.get_bs(resp.content)
        tables = soup.select('table[width="100%"]')
        for table in tables:
            a = table.find('a')
            detail_url = a['href']  # song detail page
            img_url = a.img['src']  # cover image URL
            music_name = a.img['alt']  # song name
            p = table.find('p')
            data_split = p.get_text().split("/")
            singer = data_split[0].strip()  # artist
            public_date = data_split[1].strip()  # release date
            # the remaining fields are the category tags
            category = "/".join(data.strip() for data in data_split[2:])
            div = table.find('div', class_="star clearfix")
            score = div.select('span.rating_nums')[0].text  # rating
            rate_count = rate_count_pattern.search(
                div.select('span.pl')[0].get_text()).group(0)  # number of ratings
            result.append([
                img_url, music_name, singer, public_date, category, score,
                rate_count, detail_url
            ])
    return result
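The snippet leans on a few names defined elsewhere in the project. A minimal, runnable stand-in (the URL and regex here are assumptions, not the project's real values):

import re
import types

from bs4 import BeautifulSoup

# Hypothetical stand-ins for the project's globals; base_url in particular is
# only a guess at the kind of list page being scraped.
base_url = 'https://example.com/chart'   # assumed list-page URL
rate_count_pattern = re.compile(r'\d+')  # pulls the digits out of the ratings text
t = types.SimpleNamespace(
    get_bs=lambda markup: BeautifulSoup(markup, 'html.parser'))

for row in parse_url(0):
    print(row)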
Example #2
def extract_text(url):
    report = ""
    resp = requests.get(url)
    if resp.status_code == 200:
        soup = t.get_bs(resp.content)
        ps = soup.select('div#main_content p')
        for p in ps[:-1]:  # drop the last <p>, presumably page boilerplate
            report += p.text
    return report
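A usage sketch; the URL is a placeholder and t.get_bs is the BeautifulSoup shim sketched above:

article_url = 'https://example.com/news/12345'  # placeholder article URL
print(extract_text(article_url))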
Example #4
def catch_pic_diagrams(url):
    resp = requests.get(url)
    if resp.status_code == 200:
        soup = t.get_bs(resp.content)
        # use the article title to label the saved records
        title = soup.select("h1.article-title a")[0].text
        imgs = soup.select('article.article-content img')
        for img in imgs[:-1]:  # skip the last image, presumably a footer banner
            t.write_str_data(title + "~" + str(img['src']), file_save_path)
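t.write_str_data and file_save_path come from the project's helper module; a plausible stand-in that appends one record per line (the name and behavior are assumptions):

file_save_path = 'pic_urls.txt'  # assumed output file

def _write_str_data(line, path):
    # append one "title~img_url" record per line
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line + '\n')

t.write_str_data = _write_str_data  # attach to the SimpleNamespace shim above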
Example #5
def get_page_count():
    try:
        resp = requests.get(list_url, headers=headers, timeout=5)
        if resp is not None:
            soup = tools.get_bs(resp.text)
            # the last li.page-item is typically the "Next" control, so the
            # second-to-last one holds the highest page number
            page_count = int(soup.select('li.page-item')[-2].text)
            print("Parsed article page count: " + str(page_count))
            return page_count
    except Exception as e:
        print(str(e))
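Why the [-2] index: in a Bootstrap-style pagination bar the final li.page-item is usually the "Next" control, so the second-to-last item carries the highest page number. A self-contained demo against made-up markup:

from bs4 import BeautifulSoup

html = '''<ul>
  <li class="page-item">1</li>
  <li class="page-item">2</li>
  <li class="page-item">17</li>
  <li class="page-item">Next</li>
</ul>'''
print(int(BeautifulSoup(html, 'html.parser').select('li.page-item')[-2].text))  # -> 17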
Example #8
def get_article_url(url):
    try:
        resp = requests.get(url, headers=headers, timeout=5)
        if resp is not None:
            print("Parsing: " + resp.request.url)
            soup = tools.get_bs(resp.text)
            hrefs = soup.select('span.link_title a')
            for a in hrefs:
                tools.write_str_data(a['href'], articles_file)
    except Exception as e:
        print(str(e))
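headers, articles_file, list_url and the tools module are project globals; rough stand-ins follow, and the page-URL scheme in the loop is only a guess for illustration:

headers = {'User-Agent': 'Mozilla/5.0'}  # assumed request headers
articles_file = 'article_urls.txt'       # assumed output file

page_count = get_page_count()
if page_count:
    for page in range(1, page_count + 1):
        get_article_url(list_url + '?page=' + str(page))  # hypothetical paging parameter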
Example #10
def catch_pic_diagrams_url(url):
    url_list = []
    print("Fetching photo set: " + url)
    resp = requests.get(url)
    if resp.status_code == 404:
        return None
    soup = t.get_bs(resp.content)
    # each excerpt's thumbnail link points at a photo-set detail page
    for a in soup.select("article.excerpt a.thumbnail"):
        url_list.append(a['href'])
    return url_list
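A hypothetical driver combining the two helpers: collect the detail-page URLs for one list page, then record every image URL found on each detail page. The list-page URL is a placeholder:

list_page = 'https://example.com/page/1'  # placeholder list page
detail_urls = catch_pic_diagrams_url(list_page)
if detail_urls:
    for detail_url in detail_urls:
        catch_pic_diagrams(detail_url)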