Example #1
def article(url):
    wbdata = download.get(url).text
    soup = BeautifulSoup(wbdata, 'lxml')
    if soup.find('div', id="Content") is None:  # does the article body exist?
        print('this article link is blank, continue to the next one; delete this link from the database')
    else:
        if soup.find('div', id="div_currpage") is None:  # the pager tag tells us whether there is more than one page
            print('this article has only one page')
            para_get(url)
        else:
            print('this article has more than one page')
            page = 1
            page_total = page_count(url)
            while page <= page_total:
                if page == 1:
                    para_get(url)
                    print('this is page %d' % page)
                    page += 1
                else:
                    print('this is page %d' % page)
                    # splice '_<page>' in front of the 4-character extension, e.g. '.htm'
                    list_url = list(url)
                    list_url.insert(-4, '_' + str(page))
                    list_url = ''.join(list_url)
                    para_get(list_url)
                    page += 1
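
The multi-page branch above builds each follow-up URL by splicing '_<page>' in front of the four-character extension. A minimal sketch of that splice in isolation, using a hypothetical URL purely for illustration:

def paged_url(url, page):
    """Insert '_<page>' before the 4-character extension, e.g. '.htm'."""
    return url[:-4] + '_' + str(page) + url[-4:]

# Hypothetical URL for illustration only.
print(paged_url('http://example.com/article/123.htm', 2))
# -> http://example.com/article/123_2.htm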
Example #2
def para_get(url):
    wbdata = download.get(url).text
    soup = BeautifulSoup(wbdata, 'lxml')
    paragraphs = soup.find('div', id="Content").find_all('p')
    #corpus_path = r"F:/repository/Corpus.txt"
    # chdir changes the working directory; you cannot chdir into a file
    if not os.path.exists("F:/repository/Corpus.txt"):
        # create an empty corpus file first, at the full path
        with open("F:/repository/Corpus.txt", 'w', encoding='utf-8'):
            pass
    os.chdir("F:/repository/")
    for p_tag in paragraphs:
        duanluo = p_tag.get_text()
        duanluo = re.sub(r'\[.*?\]', ' ', duanluo)  # drop [...] and its contents; the brackets must be escaped with \
        duanluo = re.sub(r'[,?!."]', ' ', duanluo)  # drop punctuation
        duanluo = re.sub(r'\(.*?\)', ' ', duanluo)  # drop (...) and its contents
        duanluo = re.sub(r'\n', ' ', duanluo)  # drop newlines
        duanluo = duanluo.lower()
        duanluo = ' '.join(duanluo.split())  # collapse runs of whitespace
        with open("F:/repository/Corpus.txt", mode='a+', encoding='utf-8') as file:
            file.write(duanluo + '\n')
        print(duanluo)
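
para_get applies the same four substitutions to every paragraph, so the patterns can be compiled once up front. A sketch of that refactoring; clean_paragraph and _CLEANERS are names introduced here, not part of the original code:

import re

_CLEANERS = [
    re.compile(r'\[.*?\]'),   # [...] and contents
    re.compile(r'\(.*?\)'),   # (...) and contents
    re.compile(r'[,?!."]'),   # punctuation
    re.compile(r'\n'),        # newlines
]

def clean_paragraph(text):
    for pattern in _CLEANERS:
        text = pattern.sub(' ', text)
    return ' '.join(text.lower().split())

print(clean_paragraph('Hello, World! [note] (aside)\nsecond line.'))
# -> hello world second line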
Example #3
def get_pic(self, page):
    html = self.get_page(page)
    soup = BeautifulSoup(html, 'lxml')
    pic_tag = soup.find('div', class_="main-image")
    title = pic_tag.img['alt']
    pic_url = pic_tag.img['src']
    pic_name = pic_url[-17:]
    pattern = re.compile(r'/', re.S)
    name = re.sub(pattern, r'-', pic_name)
    img = download.get(pic_url)
    # a file name must not look like a/b/c.jpg, or Python treats it as a
    # path to a file under that directory
    with open(name, 'ab') as f:
        f.write(img.content)
Example #4
def get_all_index(self):
    start_html = download.get(self.url)
    soup = BeautifulSoup(start_html.text, 'lxml')
    year_tags = soup.find('div', class_="main").find_all('div', class_="year")
    li_tags = soup.find('div', class_="main").find_all('li')  # every <li> tag, as an iterable
    for li_tag in li_tags:  # process each <li>
        a_tags = li_tag.find(class_="url").find_all('a')  # all <a> tags inside this <li>
        for a_tag in a_tags:
            self.get_per_folder(a_tag['href'])
            print(self.count_num)
            self.count_num += 1
Example #5
def page_count(url):
    wbdata = download.get(url).text
    soup = BeautifulSoup(wbdata, 'lxml')
    if soup.find('div', id="div_currpage") is None:
        total_page = 1
        print('this article has %d page' % total_page)
        return total_page
    else:
        # the second-to-last <a> in the pager carries the total page count
        page_tag = soup.select("div#div_currpage a")
        total_page = int(page_tag[-2].string)
        print('this article has %d pages' % total_page)
        return total_page
Example #6
def get_per_folder(self, start_page):
    html = download.get(start_page)
    soup = BeautifulSoup(html.text, 'lxml')
    all_index = int(
        soup.find('div', class_="pagenavi").find_all('a')[-2].get_text())
    original_folder_name = soup.find('div', class_="main-image").img['alt']
    # Windows file names cannot contain any of the reserved characters
    # \ / : * ? " < > | ; only '?' is stripped here
    folder_name = re.sub(r'\?', ' ', original_folder_name)
    os.makedirs(os.path.join(r'F:\projects\DailyMm', folder_name))
    print(folder_name)
    for index in range(1, all_index + 1):
        page = start_page + '/' + str(index)
        html = download.get(page)
        soup = BeautifulSoup(html.text, 'lxml')
        img_tag = soup.find('div', class_="main-image").img
        img_href = img_tag['src']
        original_pic_name = img_href[-17:]
        pic_name = re.sub(r'/', '-', original_pic_name)
        os.chdir(os.path.join(r'F:\projects\DailyMm', folder_name))
        img = download.get(img_href)
        with open(pic_name, 'ab') as f:
            f.write(img.content)
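
The comment above lists every reserved Windows character, yet the code only strips '?'. A sketch of a fuller sanitizer covering the whole list; the helper name is introduced here for illustration:

import re

def sanitize_windows_name(name):
    """Replace every Windows-reserved file name character with a space."""
    return re.sub(r'[\\/:*?"<>|]', ' ', name)

print(sanitize_windows_name('what? a:b|c'))
# -> 'what  a b c'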
Example #7
def get_all_link(page):
    wbdata = download.get(page).text
    soup = BeautifulSoup(wbdata, 'lxml')
    atagset = soup.find_all('a')
    for atag in atagset:
        # atag['href'] would raise KeyError when the attribute is missing;
        # .get() returns None instead
        link = atag.get('href')
        if link is None:
            continue
        link = link_filter(link)  # filter the address and return the cleaned value
        print(link)
        if not link:  # the filter rejected this link
            continue
        if link in visited:  # already crawled
            print('this link existed')
            continue  # move on to the next link
        link_type(link)
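
link_filter and link_type are defined elsewhere in the crawler. As an illustration only, a minimal sketch of what a link_filter of this shape might do; this is entirely an assumption, not the original implementation, and BASE_URL is a hypothetical site root:

from urllib.parse import urljoin, urldefrag

BASE_URL = 'http://example.com/'  # hypothetical site root

def link_filter(link):
    """Normalize a raw href; return '' for links we do not want to crawl."""
    if link.startswith(('javascript:', 'mailto:', '#')):
        return ''
    link, _fragment = urldefrag(link)   # drop #anchors
    link = urljoin(BASE_URL, link)      # resolve relative paths
    return link if link.startswith(BASE_URL) else ''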
Example #8
def get_page(self, page):
    response = download.get(page)
    return response.text
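
All eight snippets assume module-level imports of BeautifulSoup (from bs4 import BeautifulSoup), os, and re, and they share a download module that is not shown. A minimal sketch of what such a module might contain, assuming it is a thin retry wrapper around requests; the headers and retry policy here are guesses, not the original code:

# download.py - hypothetical stand-in for the download module used above
import time

import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # assumed; many sites block the default UA

def get(url, retries=3):
    """GET a URL, retrying a few times before giving up."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(2 ** attempt)  # simple exponential backoff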