title = doc(".title").text() cont = doc(".field-item") items = cont.children() text = "" for i in items: i = pq(i) if i.is_("table"): table = i.outerHtml() table = re.sub('<table\s.*?>', '<table>', table) table = re.sub('<p\s.*?>', '<p>', table) table = re.sub('<tr\s.*?>', '<tr>', table) table = re.sub('<td\s.*?>', '<td>', table) table = pq(table).addClass("table") text = text + str(table) else: text = text + i.text() + '\n\n' i_describe = text[:70] obj = New.objects.filter(title=title) if not obj: new = New(title=i_title, public=i_department, source=lec_urls, text=text, type='notices', pub_date=i_time, describe=i_describe, img_url="") new.save() times += 1 print("已更新" + str(times) + "条讲座通知。")
# Scrape the academic-affairs office (jwc) announcement list and store every
# announcement whose title is not yet present in the New table.
# NOTE: `times` (the running count of rows saved) is expected to be
# initialised by the surrounding code before this point.
from urllib.parse import urljoin

url = "http://jwc.shmtu.edu.cn/"
# requests applies no timeout by default; without one a stalled server would
# hang the whole update run.
r = requests.get(urljoin(url, "jiaowugonggao"), timeout=10)
soup = BeautifulSoup(r.content, 'lxml')
div_tag = soup.find('div', class_='table-responsive')
# If the list container is missing, treat it as "no links" rather than
# failing with an AttributeError on None.
all_a = div_tag.find_all('a') if div_tag is not None else []
for k in all_a:
    href = k.get('href')
    if href is None:
        continue
    title = k.string
    # urljoin handles root-relative and absolute hrefs alike, and avoids the
    # doubled "/" that plain string concatenation with `url` produced.
    source = urljoin(url, str(href))
    r = requests.get(source, timeout=10)
    soup = BeautifulSoup(r.content, 'lxml')
    div = soup.find('div', class_='region region-content')
    date_div = soup.find('div', class_="view-content")
    if div is None or date_div is None:
        # One malformed detail page must not abort the remaining notices.
        print("skipped (unexpected page layout): " + source)
        continue
    text = div.text
    img_src = ''
    # The date is the first 10 characters after the second ':' of the
    # view-content text.
    pub_date = date_div.text.split(':', 2)[2][:10]
    pub_date = pub_date.rstrip()  # strip whitespace on the right of the date
    pub_date = datetime.strptime(pub_date, '%Y/%m/%d')
    try:
        # Prefer the 40 characters following the first ':' as the summary.
        describe = text.split(':', 1)[1][:40]
    except IndexError:
        # No ':' in the text: fall back to its first 50 characters.
        describe = text[:50]
    describe = describe.rstrip()
    # exists() asks the database for a yes/no answer instead of loading the
    # matching rows just to test truthiness.
    if not New.objects.filter(title=title).exists():
        new = New(title=title, public='教务处', source=source, text=text,
                  type='notices', pub_date=pub_date, describe=describe,
                  img_url=img_src)
        new.save()
        times += 1
print("已更新" + str(times) + "条教务通知。")
# Walk the campus-news list items and store every story whose title is not
# yet present in the New table.
# NOTE: `all_li`, `url` and `times` are expected to be set up by the
# surrounding code before this loop.
for li in all_li:
    a = li.find('a')  # anchor carrying the headline and the link
    s = li.find('span', class_='date-display-single')  # publication date span
    if a is None or s is None:
        # Entry without a link or a date span: nothing usable to store.
        continue
    title = a.string
    source = url + a['href']  # address of the story page
    # The first 10 characters of the span's content attribute are parsed as
    # a YYYY-MM-DD date.
    pub_date = datetime.strptime(s['content'][:10], "%Y-%m-%d")
    # requests applies no timeout by default; without one a stalled server
    # would hang the whole update run.
    r = requests.get(source, timeout=10)
    soup = BeautifulSoup(r.content, 'lxml')
    div_tag = soup.find('div', class_='region region-content')
    if div_tag is None:
        # One malformed story page must not abort the remaining entries.
        print("skipped (unexpected page layout): " + source)
        continue
    text = div_tag.text  # plain text of the content region
    describe = text[:70]  # summary is the first 70 characters of the text
    # Use the first image of the story, if any. An <img> without a src
    # attribute yields '' instead of raising KeyError.
    img = div_tag.find('img')
    img_src = img.get('src', '') if img is not None else ''
    # exists() asks the database for a yes/no answer instead of loading the
    # matching rows just to test truthiness.
    if not New.objects.filter(title=title).exists():
        new = New(title=title, describe=describe, text=text, type='new',
                  public='SMU', pub_date=pub_date, source=source,
                  img_url=img_src)
        new.save()
        times += 1
print('更新数' + str(times) + '校园动态')  # report how many stories were saved