def crawlingArticle(id, user):
    """Fetch one cafe article by *id*, parse its fields, and persist it.

    Parameters:
        id   -- article identifier string; ids with length <= 5 are skipped
                (presumably not real article ids -- TODO confirm)
        user -- user name stored alongside the article row

    Side effects: HTTP GET using the module-level ``login.cookies`` session,
    a DB insert + commit when the id is not already stored, and console
    output of the parsed fields. Errors are printed, never raised.
    """
    try:
        # Guard clause instead of wrapping the whole body in an `if`.
        if len(id) <= 5:
            return
        r = requests.get(getObjectUrl(id), cookies=login.cookies)
        soup = bs(r.text, 'html.parser')
        # Parse each field exactly once; the original re-ran every
        # soup.find(...) a second time just to print it.
        content = soup.find("div", class_="tbody m-tcol-c").find_all("p")[4].text
        date = soup.find("td", class_="m-tcol-c date").text
        title = soup.find("span", class_="b m-tcol-c").text
        category = soup.find_all("td", class_="m-tcol-c")[1].text
        print(content)
        if not db.is_exist(db.Data, id):
            db.session.add(db.Data(id=id, user=user, datetime=date,
                                   title=title, content=content,
                                   morpheme=str(tag.get_tags(content)),
                                   category=category))
            db.session.commit()
        print(date)      # date
        print(title)     # title
        print(content)   # content
        print(category)  # category
    except Exception as e:
        # BUG FIX: `e.message` is deprecated (PEP 352) and missing on many
        # exception classes, so the handler itself could raise
        # AttributeError. Printing the exception is safe everywhere.
        print(e)
# Load the previously pickled list of article ids and crawl each one,
# inserting any article not yet stored in the DB.
# BUG FIX: pickle data must be read in binary mode ('rb', not 'r');
# text mode corrupts pickle streams on Windows and breaks under Python 3.
with open(list_file, 'rb') as f:
    arr = pickle.load(f)

for id in arr:
    try:
        # Ids with length <= 5 are skipped (presumably not real article
        # ids -- TODO confirm against the id format).
        if len(id) <= 5:
            continue
        print(id)
        r = requests.get(getObjectUrl(id), cookies=login.cookies)
        soup = bs(r.text, 'html.parser')
        # Parse each field exactly once; the original re-ran every
        # soup.find(...) a second time just to print it.
        content = soup.find("div", class_="tbody m-tcol-c").find_all("p")[4].text
        date = soup.find("td", class_="m-tcol-c date").text
        title = soup.find("span", class_="b m-tcol-c").text
        category = soup.find_all("td", class_="m-tcol-c")[1].text
        if not db.is_exist(db.Data, id):
            db.session.add(db.Data(id=id, datetime=date, title=title,
                                   content=content,
                                   morpheme=str(tag.get_tags(content)),
                                   category=category))
            db.session.commit()
        print(date)      # date
        print(title)     # title
        print(content)   # content
        print(category)  # category
    except Exception as e:
        # BUG FIX: `e.message` is deprecated (PEP 352) and missing on many
        # exception classes; printing the exception itself is safe.
        print(e)