def __init__(self, conn, HTML_page_Obj=None, article_file_path=None):
    """Build an article from a saved file or from a fetched HTML page.

    Exactly one of ``HTML_page_Obj`` / ``article_file_path`` is expected.
    If both or neither are given, nothing is initialised (unchanged
    behaviour).

    Args:
        conn: database connection handed to ``simpleMySQL.verified_select``.
        HTML_page_Obj: object exposing ``.url`` and ``.html_prettify()``.
        article_file_path: path forwarded to ``make_from_file``.

    When built from a page, the ``website_regex`` rows for the page's
    website are tried newest-first. Column 1 of each row is Python source
    that is executed with ``article_soup`` in scope and must define
    ``article_headline``, ``article_alt_headline_list`` and
    ``article_text``. The first snippet that does so populates the
    instance. If none succeeds the attributes keep their empty defaults
    and a message is printed.
    """
    if HTML_page_Obj is None and article_file_path is not None:
        # NOTE(review): called as a bare name, not ``self.make_from_file``;
        # confirm a module-level function of that name exists.
        make_from_file(article_file_path)

    elif HTML_page_Obj is not None and article_file_path is None:
        self.article_url = HTML_page_Obj.url
        website_base_url = extraction_text_manip.extract_website(self.article_url)
        print("\n Making article found on %s" % website_base_url)

        # Defaults, kept if no extraction code works.
        self.article_headline = ""
        self.article_alt_headline_list = []
        self.article_text = ""

        article_soup = HTML_page_Obj.html_prettify()

        # SECURITY(review): the query is built by string interpolation from
        # a value derived from the article URL. Switch to a parameterised
        # query if verified_select supports one.
        table = simpleMySQL.verified_select(
            conn=conn,
            select_query="select * from website_regex where base_url='%s' order by date_of_addition desc;" % website_base_url,
        )

        if table is not None:
            for row in table:
                try_code = row[1]
                # Run each snippet in its own copy of the local names so
                # that (a) its outputs can be read back explicitly, which
                # works on Python 2 and 3 alike, and (b) a half-finished
                # snippet cannot leak names into the next attempt.
                scope = dict(locals())
                try:
                    # SECURITY(review): executes code stored in the
                    # database; only safe if that table is trusted.
                    exec(try_code, globals(), scope)
                    # Reading the outputs inside the try means a snippet
                    # that forgets one, or sets a non-string, counts as a
                    # failed attempt instead of aborting construction.
                    headline = scope["article_headline"].strip()
                    alt_headlines = scope["article_alt_headline_list"]
                    text = scope["article_text"].strip()
                except Exception:
                    print("Something went wrong while executing that code. Trying next code...")
                    continue

                self.article_headline = headline
                self.article_alt_headline_list = alt_headlines
                self.article_text = text
                return

        print("None of the extraction codes for %s have worked on this article. Please re-check them." % website_base_url)
def __init__(self, conn, HTML_page_Obj=None, article_file_path=None):
    """Build an article from a saved file or from a fetched HTML page.

    Exactly one of ``HTML_page_Obj`` / ``article_file_path`` is expected.
    If both or neither are given, nothing is initialised (unchanged
    behaviour).

    Args:
        conn: database connection handed to
            ``sqliteDefaults.verified_select_sqlite``.
        HTML_page_Obj: object exposing ``.url`` and ``.html_prettify()``.
        article_file_path: path forwarded to ``make_from_file``.

    When built from a page, the ``website_regex`` rows for the page's
    website are tried newest-first. Column 1 of each row is Python source
    that is executed with ``article_soup`` in scope and must define
    ``article_headline``, ``article_alt_headline_list`` and
    ``article_text``. It may also define ``article_date`` and
    ``article_time``. The first snippet that succeeds populates the
    instance. If none succeeds the attributes keep their defaults and a
    message is printed.
    """
    if HTML_page_Obj is None and article_file_path is not None:
        # NOTE(review): called as a bare name, not ``self.make_from_file``;
        # confirm a module-level function of that name exists.
        make_from_file(article_file_path)

    elif HTML_page_Obj is not None and article_file_path is None:
        self.article_url = HTML_page_Obj.url
        website_base_url = extraction_text_manip.extract_website(self.article_url)
        print("\n Making article found on %s" % website_base_url)

        # Defaults, kept if no extraction code works.
        self.article_headline = ""
        self.article_alt_headline_list = []
        self.article_text = ""
        self.article_date = datetime.date(1970, 1, 1)  # start of UNIX time
        self.article_time = datetime.time(0, 0)

        article_soup = HTML_page_Obj.html_prettify()

        # SECURITY(review): the query is built by string interpolation from
        # a value derived from the article URL. Switch to a parameterised
        # query if verified_select_sqlite supports one.
        table = sqliteDefaults.verified_select_sqlite(
            conn=conn,
            select_query="select * from website_regex where base_url='%s' order by date_of_addition desc;" % website_base_url,
        )

        if table is not None:
            for row in table:
                try_code = row[1]
                # Run each snippet in its own copy of the local names so
                # that (a) its outputs can be read back explicitly, which
                # works on Python 2 and 3 alike, and (b) a half-finished
                # snippet cannot leak names into the next attempt.
                scope = dict(locals())
                try:
                    # SECURITY(review): executes code stored in the
                    # database; only safe if that table is trusted.
                    exec(try_code, globals(), scope)
                    # Reading the outputs inside the try means a snippet
                    # that forgets one, or sets a non-string, counts as a
                    # failed attempt instead of aborting construction.
                    headline = scope["article_headline"].strip()
                    alt_headlines = scope["article_alt_headline_list"]
                    text = scope["article_text"].strip()
                except Exception:
                    print("Something went wrong while executing that code. Trying next code...")
                    continue

                self.article_headline = headline
                self.article_alt_headline_list = alt_headlines
                self.article_text = text
                # Date and time are optional: a snippet that does not set
                # them keeps the epoch defaults assigned above rather than
                # raising NameError.
                self.article_date = scope.get("article_date", self.article_date)
                self.article_time = scope.get("article_time", self.article_time)
                return

        # Adjacent literals are concatenated at compile time. The previous
        # backslash continuation inside the string leaked stray characters
        # and whitespace into the printed message.
        print("None of the extraction codes for %s have worked on this article. "
              "Please re-check them." % website_base_url)
import re
from bs4 import BeautifulSoup
import extraction_text_manip

'''The purpose of this python file is to help you build the code needed to extract articles from websites'''

# Sample article URL that the extraction code below is developed against.
url = "http://www.livemint.com/Companies/fghWAFAu1k7JYKnUU31g4I/Nestle-asks-Bombay-HC-for-time-to-reply-to-Maharashtra-FDA-a.html"
# NOTE(review): `website` is not used in the visible part of this script.
website = extraction_text_manip.extract_website(url)
# NOTE(review): get_html presumably fetches the page markup for `url` -- confirm.
html = extraction_text_manip.get_html(url)

# We must set the following: these are the outputs the website-specific
# code is expected to fill in.
article_headline = ""
article_alt_headline_list = []
article_text = ""

article_soup = BeautifulSoup(html)
# Dump the prettified markup to disk (non-ASCII characters are dropped by
# the 'ignore' error handler) so the page structure can be inspected by hand.
with open("G:/article.html", 'w') as art_file:
    art_file.write(article_soup.prettify().encode('ascii', 'ignore'))

# start of website-specific code
# input: article_soup
website_base_url = "livemint.com"

# Headline: look up the <h1 class="sty_head_38"> element, then concatenate
# the encoded string form of each item obtained by iterating over it.
headline_list = article_soup.find("h1", {"class": "sty_head_38"})
article_headline = ""
for i in headline_list:
    article_headline += extraction_text_manip.properly_encode(str(i))
# Final clean-up pass over the assembled headline text.
article_headline = extraction_text_manip.properly_format(article_headline)

# Alternative headlines: look up the <div class="sty_sml_summary_18"> element.
article_alt_headline_list = []
alt_headline_list = article_soup.find("div", {"class": "sty_sml_summary_18"})
import re
from bs4 import BeautifulSoup
import extraction_text_manip

'''The purpose of this python file is to help you build the code needed to extract articles from websites'''

# Sample article URL that the extraction code below is developed against.
url="http://www.livemint.com/Companies/fghWAFAu1k7JYKnUU31g4I/Nestle-asks-Bombay-HC-for-time-to-reply-to-Maharashtra-FDA-a.html"
# NOTE(review): `website` is not used in the visible part of this script.
website=extraction_text_manip.extract_website(url)
# NOTE(review): get_html presumably fetches the page markup for `url` -- confirm.
html=extraction_text_manip.get_html(url)

# We must set the following: these are the outputs the website-specific
# code is expected to fill in.
article_headline=""
article_alt_headline_list=[]
article_text=""

article_soup=BeautifulSoup(html)
# Dump the prettified markup to disk (non-ASCII characters are dropped by
# the 'ignore' error handler) so the page structure can be inspected by hand.
with open("G:/article.html", 'w') as art_file:
    art_file.write(article_soup.prettify().encode('ascii','ignore'))

# start of website-specific code
# input: article_soup
website_base_url="livemint.com"

# Headline: look up the <h1 class="sty_head_38"> element; the headline text
# starts out empty here.
headline_list=article_soup.find("h1", {"class":"sty_head_38"})
article_headline=""