# Exemplo n.º 1 (Example no. 1) — scraped sample marker; original score: 0
def crawlingArticle(id, user):
    """Fetch one cafe article by *id*, print its fields, and persist it.

    Downloads the article page (authenticated via ``login.cookies``),
    extracts date, title, content and category from the HTML, and inserts
    a ``db.Data`` row unless one with the same id already exists.

    Args:
        id: article identifier string; ids of length <= 5 are skipped.
            (Note: shadows the ``id`` builtin — kept for interface
            compatibility with existing callers.)
        user: author/user name stored alongside the article.

    Errors are caught broadly and printed so a single bad article does not
    abort a crawl loop.
    """
    try:
        # Guard clause: too-short ids are not real article ids.
        if len(id) <= 5:
            return
        r = requests.get(getObjectUrl(id), cookies=login.cookies)
        soup = bs(r.text, 'html.parser')
        # Hoist each selector lookup once; the original re-ran every
        # soup.find(...) two or three times for printing and storing.
        date = soup.find("td", class_="m-tcol-c date").text
        title = soup.find("span", class_="b m-tcol-c").text
        # 5th <p> inside the article body div holds the text content.
        content = soup.find("div", class_="tbody m-tcol-c").find_all("p")[4].text
        category = soup.find_all("td", class_="m-tcol-c")[1].text
        print(content)
        if not db.is_exist(db.Data, id):
            db.session.add(db.Data(id=id,
                                   user=user,
                                   datetime=date,
                                   title=title,
                                   content=content,
                                   morpheme=str(tag.get_tags(content)),
                                   category=category))
            db.session.commit()
        print(date)      # date
        print(title)     # title
        print(content)   # content
        print(category)  # category
    except Exception as e:
        # ``e.message`` was removed in Python 3; printing the exception
        # itself works on both 2 and 3.
        print(e)
# Exemplo n.º 2 (Example no. 2) — scraped sample marker; original score: 0
    # NOTE(review): fragment of a larger function — the enclosing ``def`` and
    # the names ``list_file``, requests, getObjectUrl, login, bs, db and tag
    # are defined outside this view.
    with open(list_file, 'r') as f:
        # SECURITY: pickle.load on an untrusted file can execute arbitrary
        # code — confirm ``list_file`` is produced by this project only.
        arr = pickle.load(f)
        # Presumably a list of article-id strings — verify against the writer.
        for id in arr:
            try:
                # Too-short ids are skipped (apparently not real article ids).
                if len(id)>5:
                    print id
                    r = requests.get(getObjectUrl(id), cookies=login.cookies)
                    #print r.text
                    soup = bs(r.text, 'html.parser')
                    # 5th <p> inside the article body div holds the text content.
                    content =  soup.find("div", class_="tbody m-tcol-c").find_all("p")[4].text
                    # Insert only when no row with this id exists yet.
                    if not db.is_exist(db.Data, id):
                        db.session.add(db.Data(id=id,
                                               datetime=soup.find("td", class_="m-tcol-c date").text,
                                               title=soup.find("span", class_="b m-tcol-c").text,
                                               content=content,
                                               morpheme=str(tag.get_tags(content)),
                                               category=soup.find_all("td", class_="m-tcol-c")[1].text))
                        db.session.commit()
                    # Each selector is re-evaluated here after being used above.
                    print soup.find("td", class_="m-tcol-c date").text      #date
                    print soup.find("span", class_="b m-tcol-c").text       #title
                    print soup.find("div", class_="tbody m-tcol-c").find_all("p")[4].text  #content
                    print soup.find_all("td", class_="m-tcol-c")[1].text    #category

            except Exception as e:
                # Broad catch: any per-id failure is printed and the loop
                # continues with the next id.
                print e.message


#        with open('sample.html','w') as f:
#        f.write(unicode(r.text))

#    rst = []