Пример #1
0
    title = doc(".title").text()
    cont = doc(".field-item")
    items = cont.children()
    text = ""
    for i in items:
        i = pq(i)
        if i.is_("table"):
            table = i.outerHtml()
            table = re.sub('<table\s.*?>', '<table>', table)
            table = re.sub('<p\s.*?>', '<p>', table)
            table = re.sub('<tr\s.*?>', '<tr>', table)
            table = re.sub('<td\s.*?>', '<td>', table)
            table = pq(table).addClass("table")
            text = text + str(table)
        else:
            text = text + i.text() + '\n\n'
    i_describe = text[:70]
    obj = New.objects.filter(title=title)
    if not obj:
        new = New(title=i_title,
                  public=i_department,
                  source=lec_urls,
                  text=text,
                  type='notices',
                  pub_date=i_time,
                  describe=i_describe,
                  img_url="")
        new.save()
        times += 1
print("已更新" + str(times) + "条讲座通知。")
Пример #2
0
from urllib.parse import urljoin

# Scrape the academic-affairs announcement list and store every notice whose
# title is not in the database yet.
# NOTE(review): `times` is incremented below but not initialised in this
# excerpt -- presumably earlier code sets it; confirm.
url = "http://jwc.shmtu.edu.cn/"
# urljoin avoids the "//" that plain concatenation produced, because `url`
# already ends with a slash. The timeout keeps a stalled server from
# blocking the run forever (requests waits indefinitely by default).
r = requests.get(urljoin(url, "/jiaowugonggao"), timeout=10)
soup = BeautifulSoup(r.content, 'lxml')
div_tag = soup.find('div', class_='table-responsive')
all_a = div_tag.find_all('a')
for k in all_a:
    href = k.get('href')
    if href is None:
        continue
    title = k.string
    # Check for duplicates before downloading the detail page, so notices
    # that are already stored cost no extra request.
    if New.objects.filter(title=title):
        continue
    source = urljoin(url, href)
    detail = requests.get(source, timeout=10)
    page = BeautifulSoup(detail.content, 'lxml')
    text = page.find('div', class_='region region-content').text
    img_src = ''
    # The date is taken as the first 10 characters after the second ':' of
    # the "view-content" text; trailing whitespace is stripped before
    # parsing (presumably for dates shorter than 10 characters).
    pub_date = page.find('div', class_="view-content").text.split(':', 2)[2][:10]
    pub_date = datetime.strptime(pub_date.rstrip(), '%Y/%m/%d')
    # Summary: up to 40 characters after the first ':' when there is one,
    # otherwise the first 50 characters of the text.
    _, colon, after_colon = text.partition(':')
    describe = (after_colon[:40] if colon else text[:50]).rstrip()
    new = New(title=title, public='教务处', source=source, text=text, type='notices', pub_date=pub_date,
              describe=describe, img_url=img_src)
    new.save()
    times += 1

print("已更新" + str(times) + "条教务通知。")
Пример #3
0
# Walk the news list items and store every entry whose title is not in the
# database yet. `all_li`, `url` and `times` come from outside this excerpt.
for li in all_li:
    a = li.find('a')  # the link carries both the title and the target path
    title = a.string
    # Check for duplicates before downloading the detail page, so entries
    # that are already stored cost no extra request.
    if New.objects.filter(title=title):
        continue
    source = url + a['href']  # address of the full news page
    # The publication date is the first 10 characters of the span's
    # `content` attribute.
    s = li.find('span', class_='date-display-single')
    pub_date = datetime.strptime(s['content'][:10], "%Y-%m-%d")
    # The timeout keeps a stalled server from blocking the run forever
    # (requests waits indefinitely by default).
    r = requests.get(source, timeout=10)
    soup = BeautifulSoup(r.content, 'lxml')
    div_tag = soup.find('div', class_='region region-content')
    text = div_tag.text
    describe = text[:70]  # first 70 characters are used as the summary
    # Use the first image's address when there is one, otherwise an empty
    # string. Going through .get() also covers an <img> without `src`,
    # which used to raise an uncaught KeyError.
    img = div_tag.find('img')
    img_src = img.get('src', '') if img is not None else ''
    new = New(title=title,
              describe=describe,
              text=text,
              type='new',
              public='SMU',
              pub_date=pub_date,
              source=source,
              img_url=img_src)
    new.save()
    times += 1
print('更新数' + str(times) + '校园动态')  # report how many entries were added