Example #1
import spiderFunction
import requests
import re
from lib import FMysql
from bs4 import BeautifulSoup
from download import Download

download = Download()

mysql = FMysql.FMysql()

for page_num in range(1, 11):
    # Crawl the first 10 pages of the school's news list (xwdt).
    start_html = 'http://sse.tongji.edu.cn/Data/List/xwdt?page=' + str(page_num)
    Soup = download.get(start_html)
    all_a = Soup.find('div', class_='right-nr').find('ul').find_all('a')

    for a in all_a:
        # The trailing path segment of the href serves as the article id.
        article_href = a['href'][a['href'].rfind('/') + 1:]
        if not mysql.isUrlExist("software_engineering", article_href):
            # New article: fetch the detail page and extract its fields.
            page_url = "http://sse.tongji.edu.cn/" + a['href']
            page_soup = download.get(page_url)
            article = page_soup.find('div', class_='right-nr')
            article_title = article.find('div', class_='view-title').find('h1').get_text()
            article_time = article.find('div', class_='view-info').find('span').get_text()
            article_time = re.search(r"(\d{4}-\d{1,2}-\d{1,2})", article_time).group()
            article_content = article.find('div', class_='view-cnt').get_text()
            dict_article = {'title': article_title, 'content': article_content,
                            'time': article_time, 'id': article_href}
            # The snippet ends here; storing dict_article (e.g. an insert via mysql)
            # is omitted in the source.
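
The lib.FMysql module itself is not shown in these examples; only its duplicate checks isUrlExist(table, id) and isIdExist(table, id) are called. Below is a minimal sketch of what such a helper might look like with pymysql. The connection parameters, table layout, and the assumption that ids live in an `id` column are all placeholders, not taken from the source.

import pymysql

class FMysql:
    """Minimal sketch of a duplicate-check helper (assumed, not the original lib.FMysql)."""

    def __init__(self, host='localhost', user='root', password='', database='spider'):
        # Placeholder connection settings; adjust to your environment.
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    database=database, charset='utf8mb4')

    def isUrlExist(self, table, article_id):
        # True if a row with this id is already stored in `table`.
        # (The table name comes from trusted code, not user input.)
        with self.conn.cursor() as cursor:
            cursor.execute('SELECT 1 FROM {} WHERE id = %s LIMIT 1'.format(table),
                           (article_id,))
            return cursor.fetchone() is not None

    # In these examples isIdExist is called the same way as isUrlExist.
    isIdExist = isUrlExist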
Example #2
import spiderFunction
import requests
import re
from lib import FMysql
from bs4 import BeautifulSoup
from download import Download

start_html = 'https://news.tongji.edu.cn/classid-5.html'
base_html = 'https://news.tongji.edu.cn'
download = Download()
mysql = FMysql.FMysql()

# Read the highest page number from the last link in the pager,
# e.g. '/classid-5-42.html' -> '42'.
Soup = download.get(start_html)
max_page = Soup.find('div', class_='pager').find_all('a')[-1]['href']
start = str(max_page).rfind('-')
end = str(max_page).rfind('.')
max_page = str(max_page)[start + 1:end]

for page_num in range(1, int(max_page) + 1):  # + 1 so the last page is included
    page_html = base_html + '/classid-5-' + str(page_num) + '.html'
    Soup = download.get(page_html)
    # The third news_list block on the page holds the article links.
    all_a = Soup.find_all('div', class_='news_list')[2].find_all(
        'a', attrs={'title': True})
    for a in all_a:
        article_url = base_html + '/' + a['href']
        # The article id is the URL segment after 'id-' and before '-t'.
        left = article_url.rfind('-t')
        right = article_url.rfind('id')
        article_id = article_url[right + 3:left]
        if not mysql.isIdExist('tongji_university_news', article_id):
            Soup = download.get(article_url)
            # The snippet ends here; parsing and storing the article page
            # is omitted in the source.
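
The max_page extraction above relies on the pager hrefs ending in '-<page>.html', the same pattern the loop uses to build page URLs. A hypothetical href such as '/classid-5-42.html' illustrates the slicing:

# Hypothetical pager href, assumed to follow the 'classid-5-<page>.html' pattern.
max_page = '/classid-5-42.html'
start = max_page.rfind('-')      # index of the last '-'
end = max_page.rfind('.')        # index of '.html'
print(max_page[start + 1:end])   # -> '42'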
Example #3
def get_html_content(url, data, headers, method):
    # Dispatch on the HTTP method: POST sends the given data and headers,
    # anything else falls back to a plain GET.
    if str(method).lower() == 'post':
        return Download.post(url, data, headers)
    return Download.get(url)
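
None of the examples include the download.Download module itself. Based on how it is used here (download.get(url) returns a BeautifulSoup tree in Examples #1 and #2, and Example #3 calls Download.get(url) and Download.post(url, data, headers) at class level), a minimal sketch could look like the following; the headers, timeout, and error handling are assumptions, not the original implementation.

import requests
from bs4 import BeautifulSoup

class Download:
    """Minimal sketch of the Download helper assumed by the examples above."""

    @staticmethod
    def get(url):
        # Fetch the page and return it parsed as a BeautifulSoup tree,
        # matching how Examples #1 and #2 call .find() on the result.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    @staticmethod
    def post(url, data, headers):
        # Same idea for POST requests, as used in Example #3.
        response = requests.post(url, data=data, headers=headers, timeout=10)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')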