from entity_api import entity_extract
from untitled1 import Article, db, Keyword

# One-off experiments kept for reference:
# article = Article.query.filter_by(id=92).first()
# print article.category
# article.category = 'Musics'
# db.session.commit()

# data = Article.query.all()
# for article in data:
#     print article.category

# entity_extract(1, "The East India Company in the 1700s conjures pictures of British colonisation. What originally started as trade and business eventually led to 200 years of British Raj for our country.\nAt its peak, the company accounted for about half the global trade specialising in commodities like cotton, indigo, tea and opium, and offering employment to a third of the British workforce.\nAfter the 1857 mutiny in India, all its powers were transferred to the British Crown, and eventually, by 1874, the company was dissolved.\nForwarding into over a century later, in 2005, Mumbai-born entrepreneur, Sanjiv Mehta, with a \u201csense of redemption\u201d, bought the company from its 30 odd owners, and turned it into a luxury food brand.\nThe company now specialises in selling gourmet coffees, chocolates, rare teas, and other luxury-food items through its e-commerce website.\nThe first store was launched in the Mayfair neighborhood in London. Today, the company has escalated commercially and now runs stores across the UK, the Middle East, Europe and Asia, in addition to a successful e-commerce website.\nOn inaugurating the company, Mehta received congratulatory e-mails from thousands of Indians.\n\u201cIt is a dream come true to build a business like this and to acquire a brand like this to own the company,\u201d he said.\u201d\n\nParticipate in this discussion")

# key = Keyword.query.filter_by(id=2).first()
# print key.key_name

# Backfill: re-run entity extraction for stored articles with ids 560-1289.
for i in range(560, 1290):
    article = Article.query.filter_by(id=i).first()
    if article is None:
        continue
    entity_extract(article.id, article.full_story, 1)
    print "done"
print "all done"
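# --- Sketch: batched backfill (not part of the original script) -------------
# The loop above issues one SELECT per id. Assuming the standard SQLAlchemy
# query API and the Article/entity_extract imports used throughout these
# scripts, a single ranged query does the same work; the function name
# backfill_entities is hypothetical.
def backfill_entities(start_id=560, end_id=1289):
    # Column.between() is inclusive on both ends.
    for article in Article.query.filter(Article.id.between(start_id, end_id)):
        entity_extract(article.id, article.full_story, 1)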
            # Plain-text story body from the YourStory article page.
            simple_text = bsObj.find("div", attrs={"class": "ys_post_content text"}).get_text()
            category = "YourStory"

            # Debug prints kept for reference:
            # print title
            # print image
            # print description
            # print full_story
            # print simple_text
            # print category
            # print date

            # Store only articles whose title is not already in the table.
            if not db.session.query(Article).filter(Article.title == title).count():
                article_a = Article(title=title, full_story=simple_text,
                                    image=image, category=category,
                                    description=description, pubdate=date,
                                    html=full_story)
                db.session.add(article_a)
                db.session.commit()
                print article_a.id
                entity_extract(article_a.id, simple_text, 1)
        except psycopg2.ProgrammingError:
            # Roll back so the session stays usable after a failed statement.
            db.session.rollback()
        except Exception as e:
            print e
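# --- Sketch: guarded text extraction (not part of the original scraper) -----
# The .find(...).get_text() chain above raises AttributeError whenever the
# story div is missing from a page. A guarded helper, offered as a sketch;
# the name extract_story_text is hypothetical.
def extract_story_text(soup, cls="ys_post_content text"):
    # Return "" instead of raising when the story div is absent.
    node = soup.find("div", attrs={"class": cls})
    return node.get_text() if node is not None else ""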
def entity(Id, data, news):
    # Thin wrapper around the entity extraction API.
    entity_extract(Id, data, news)
            # Debug prints kept for reference:
            # print image
            # print description
            # print full_story
            # print simple_text
            # print category
            # print date

            if not db.session.query(Article).filter(Article.title == title).count():
                article_a = Article(title=title, full_story=simple_text,
                                    image=image, category=category,
                                    description=description, pubdate=date,
                                    html=full_story)
                db.session.add(article_a)
                db.session.commit()
                print article_a.id
                entity_extract(article_a.id, simple_text, 1)
        except psycopg2.ProgrammingError:
            # Roll back so the session stays usable after a failed statement.
            db.session.rollback()
        except Exception as e:
            print e
            # Debug prints kept for reference:
            # print title
            # print image
            # print description
            # print link
            # print date
            # print full_story
            # print category
            # print "\n\n"

            if not db.session.query(Article).filter(Article.title == title).count():
                article_a = Article(title=title, full_story=full_story,
                                    image=image, category=category,
                                    description=description, pubdate=date)
                db.session.add(article_a)
                db.session.commit()
                print article_a.id
                entity_extract(article_a.id, full_story, 1)
        except psycopg2.IntegrityError:
            # A duplicate row slipped past the title check; undo and move on.
            print "Caught"
            db.session.rollback()
        except Exception as e:
            print e
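# --- Sketch: shared persistence helper (not in the original code) -----------
# The insert-if-new block above repeats in every scraper here. A minimal
# sketch that follows these scripts' own exception convention and assumes the
# Article/db/entity_extract imports used throughout; the name save_article is
# hypothetical.
def save_article(**fields):
    # Skip titles that are already stored.
    if db.session.query(Article).filter(Article.title == fields['title']).count():
        return None
    article = Article(**fields)
    db.session.add(article)
    try:
        db.session.commit()
    except psycopg2.IntegrityError:
        # A duplicate slipped past the title check; leave the session clean.
        db.session.rollback()
        return None
    entity_extract(article.id, article.full_story, 1)
    return article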
def upload():
    # Times of India RSS feeds mapped to their category labels.
    toi_rss = {
        'http://timesofindia.indiatimes.com/rssfeedstopstories.cms': 'Top stories',
        'http://timesofindia.indiatimes.com/rssfeeds/1221656.cms': 'Most Recent',
        'http://timesofindia.feedsportal.com/c/33039/f/533916/index.rss': 'India',
        'http://timesofindia.feedsportal.com/c/33039/f/533917/index.rss': 'World',
        'http://timesofindia.feedsportal.com/c/33039/f/533919/index.rss': 'Business',
        'http://timesofindia.feedsportal.com/c/33039/f/533920/index.rss': 'Cricket',
        'http://timesofindia.feedsportal.com/c/33039/f/533921/index.rss': 'Sports',
        'http://dynamic.feedsportal.com/c/33039/f/533968/index.rss': 'Health',
        'http://timesofindia.feedsportal.com/c/33039/f/533922/index.rss': 'Science',
        'http://timesofindia.feedsportal.com/c/33039/f/533925/index.rss': 'Environment',
        'http://timesofindia.feedsportal.com/c/33039/f/533923/index.rss': 'Technology',
        'http://timesofindia.feedsportal.com/c/33039/f/533924/index.rss': 'Education',
        'http://timesofindia.feedsportal.com/c/33039/f/533928/index.rss': 'Entertainment',
        'http://timesofindia.indiatimes.com/rssfeeds/2886704.cms': 'Lifestyle',
    }
    for key, value in toi_rss.iteritems():
        # print key
        d = feedparser.parse(key)
        category = value
        for post in d.entries:
            try:
                title = post.title
                dated = post.published
                # Skip galleries, live blogs, videos, and listicles.
                if "photo" in post.link:
                    continue
                if "live" in post.link:
                    continue
                if "videos" in post.link:
                    continue
                if "listshow" in post.link:
                    continue
                html = urlopen(post.link)
                bsObj = BeautifulSoup(html, "html.parser")
                images = bsObj.find("link", attrs={"rel": "image_src"})
                if images is not None:
                    images = images['href']
                story_list = bsObj.find("div", attrs={"class": "content"})
                if story_list is None:
                    # Some TOI pages mark the body with class "Normal" instead.
                    story_list = bsObj.find("div", attrs={"class": "Normal"})
                description = bsObj.find("meta", {'name': 'description'})['content']
                # Debug prints kept for reference:
                # print('title :' + title + "\n")
                # print(post.link)
                # print('category :' + category + "\n")
                # print('description :' + description + "\n")
                # print('full story :' + story_list.get_text() + "\n")
                # print("" + images)
                # print('pubdate:' + dated)

                # Save the fields below in the db.
                save_title = title
                # save_link = post.link
                save_category = category
                save_description = description
                save_full_story = story_list.get_text()
                save_image = images
                save_date = dated
                try:
                    if not db.session.query(Article).filter(Article.title == save_title).count():
                        article_a = Article(title=save_title,
                                            full_story=save_full_story,
                                            image=save_image,
                                            category=save_category,
                                            description=save_description,
                                            pubdate=save_date)
                        db.session.add(article_a)
                        db.session.commit()
                        print article_a.id
                        entity_extract(article_a.id, save_full_story, 1)
                except Exception:  # psycopg2.IntegrityError
                    print "Caught"
                    db.session.rollback()
            except Exception as e:
                print e
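# --- Sketch: consolidated link filter (not in the original code) ------------
# The four substring checks in upload() can collapse into one test; the
# helper name is_skippable_link is hypothetical.
def is_skippable_link(link, segments=("photo", "live", "videos", "listshow")):
    # True for gallery, live-blog, video, and listicle URLs.
    return any(seg in link for seg in segments)

# In the loop: if is_skippable_link(post.link): continue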
            # Debug prints kept for reference:
            # print image
            # print description
            # print pubdate
            # print full_story

            category = "Firstpost"
            if not db.session.query(Article).filter(Article.title == title).count():
                article_a = Article(title=title, full_story=full_story,
                                    image=image, category=category,
                                    description=description, pubdate=pubdate)
                db.session.add(article_a)
                db.session.commit()
                print article_a.id
                entity_extract(article_a.id, full_story, 1)
        except psycopg2.IntegrityError:
            print "Caught"
            db.session.rollback()
        except Exception as e:
            print e
            category = "TechCrunch"

            # Debug prints kept for reference:
            # print title
            # print image
            # print date
            # print description
            # print html
            # print cleantext
            # print category

            if not db.session.query(Article).filter(Article.title == title).count():
                article_a = Article(title=title, full_story=cleantext,
                                    image=image, category=category,
                                    description=description, pubdate=date,
                                    html=html)
                db.session.add(article_a)
                db.session.commit()
                print article_a.id
                entity_extract(article_a.id, cleantext, 1)
        except psycopg2.IntegrityError:
            print "Caught"
            db.session.rollback()
        except Exception as e:
            print e
            # Debug prints kept for reference:
            # print title
            # print image
            # print date
            # print description
            # print html
            # print cleantext
            # print category

            if not db.session.query(Article).filter(Article.title == title).count():
                article_a = Article(title=title, full_story=cleantext,
                                    image=image, category=category,
                                    description=description, pubdate=date,
                                    html=html)
                db.session.add(article_a)
                db.session.commit()
                print article_a.id
                entity_extract(article_a.id, cleantext, 1)
        except psycopg2.IntegrityError:
            print "Caught"
            db.session.rollback()
        except Exception as e:
            print e
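# --- Sketch: cheaper existence check (not in the original code) -------------
# Every scraper gates inserts on `.count()`, which tallies all matching rows.
# Assuming the standard SQLAlchemy API, first() stops at the first hit; the
# helper name title_exists is hypothetical.
def title_exists(title):
    # Select only the id column and stop at the first matching row.
    return db.session.query(Article.id).filter(Article.title == title).first() is not None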