import re
import time

# DBConnection is a project-level helper (defined elsewhere in the repo) that returns
# a live database connection; a sketch of such a helper follows the second Classifier
# variant below.


# Classifier, first version: plain keyword matching on article titles.
class Classifier:
    def __init__(self):
        self.bag_of_words = ['modi', 'pm', 'visit', 'narendra', 'prime minister']
        self.db_con = DBConnection().create_connection()
        self.cursor = self.db_con.cursor()

    def classify_title(self):
        sql = "SELECT id, title FROM news_url WHERE id > %s ORDER BY id ASC"
        sql_update = "UPDATE news_url SET is_valid = True WHERE id = %s"
        news_url_id = self.get_last_news_url_id()
        counter = 0
        while True:
            self.cursor.execute(sql, (news_url_id, ))
            rs_tuple = self.cursor.fetchone()
            if not rs_tuple:
                break
            (news_url_id, title) = rs_tuple
            print(news_url_id)
            # Mark the article as valid if any keyword appears in the title
            for key_word in self.bag_of_words:
                if re.search(key_word, title.strip(), re.I):
                    self.cursor.execute(sql_update, (news_url_id, ))
                    self.db_con.commit()
            counter += 1
            if counter % 1000 == 0:
                print('Sleeping for 5 seconds')
                time.sleep(5)
            self.update_last_news_url_id(news_url_id)

    def update_last_news_url_id(self, news_url_id):
        sql = "UPDATE scraper_info SET news_url_id = %s WHERE id = 1"
        self.cursor.execute(sql, (news_url_id, ))
        self.db_con.commit()

    def get_last_news_url_id(self):
        sql = "SELECT news_url_id FROM scraper_info"
        self.cursor.execute(sql)
        rs_tuple = self.cursor.fetchone()
        (news_url_id, ) = rs_tuple
        return news_url_id
import time

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.stem import PorterStemmer
from nltk.tree import Tree


# Classifier, revised version: stemmed keyword matching, with an optional (currently
# commented-out) place-name check via NLTK's named-entity chunker.
class Classifier:
    def __init__(self):
        self.bag_of_words = ['modi', 'pm', 'visit', 'narendra', 'prime minister']
        self.bag_of_words_set = set(self.bag_of_words)
        self.db_con = DBConnection().create_connection()
        self.cursor = self.db_con.cursor()

    def classify_title(self):
        sql = "SELECT id, title FROM news_url WHERE id > %s ORDER BY id ASC"
        sql_update = "UPDATE news_url SET is_valid = True WHERE id = %s"
        news_url_id = self.get_last_news_url_id()
        counter = 0
        stemmer = PorterStemmer()
        while True:
            self.cursor.execute(sql, (news_url_id, ))
            rs_tuple = self.cursor.fetchone()
            if not rs_tuple:
                break
            (news_url_id, title) = rs_tuple
            print(news_url_id)
            # Stem every word in the title and intersect with the keyword set
            title = title.lower()
            title_set = set(stemmer.stem(item) for item in title.split())
            result_set = title_set & self.bag_of_words_set
            print("result_set", result_set)
            # Earlier variant: mark the row valid whenever any keyword matched
            # if result_set:
            #     self.cursor.execute(sql_update, (news_url_id, ))
            #     self.db_con.commit()
            # Alternative variant: additionally require a place name in the title, either
            # from a COUNTRY lookup set or from NLTK's GPE tagger (see get_place below)
            # for item in title_set:
            #     if item.lower() in COUNTRY:
            #         if result_set:
            #             self.cursor.execute(sql_update, (news_url_id, ))
            #             self.db_con.commit()
            #     else:
            #         if self.get_place(item) and result_set:
            #             self.cursor.execute(sql_update, (news_url_id, ))
            #             self.db_con.commit()
            counter += 1
            if counter % 1000 == 0:
                print('Sleeping for 5 seconds')
                time.sleep(5)
            # self.update_last_news_url_id(news_url_id)

    def update_last_news_url_id(self, news_url_id):
        sql = "UPDATE scraper_info SET news_url_id = %s WHERE id = 1"
        self.cursor.execute(sql, (news_url_id, ))
        self.db_con.commit()

    def get_last_news_url_id(self):
        sql = "SELECT news_url_id FROM scraper_info"
        self.cursor.execute(sql)
        rs_tuple = self.cursor.fetchone()
        (news_url_id, ) = rs_tuple
        return news_url_id

    def get_place(self, text):
        # Return True if NLTK's NE chunker tags any part of the text as a GPE (place)
        chunked = ne_chunk(pos_tag(word_tokenize(text)))
        for i in chunked:
            if type(i) == Tree and i.label() == "GPE":
                return True
        return False
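# Both Classifier variants above (and the Xtractor below) rely on
# DBConnection().create_connection(), which is not shown in this section. The sketch
# below is an assumption of what that helper might look like, using PostgreSQL via
# psycopg2 (the %s placeholders and ON CONFLICT clauses suggest Postgres); the
# connection parameters are placeholders, not the project's real settings.
import psycopg2


class DBConnection:
    """Minimal sketch of the connection helper assumed by Classifier and Xtractor."""

    def create_connection(self):
        # Placeholder credentials; replace with the project's actual configuration.
        return psycopg2.connect(
            host="localhost",
            dbname="toi_scraper",
            user="postgres",
            password="postgres",
        )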
import calendar
import sys
import time

import requests
from bs4 import BeautifulSoup

# DBConnection and NoMoreUrlFoundException are project helpers assumed to be defined
# elsewhere in the repo.


class Xtractor:
    def __init__(self):
        self.year = 2015
        self.base_url = "http://timesofindia.indiatimes.com/%s"
        self.url_to_scrape = None
        self.db_con = DBConnection().create_connection()
        self.cursor = self.db_con.cursor()
        self.request_headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
        }

    def save_last_month_date_url(self, month_date_url_id):
        sql_scraper_info = "UPDATE scraper_info SET last_month_date_url_id = %s WHERE id = 1"
        self.cursor.execute(sql_scraper_info, (month_date_url_id, ))
        self.db_con.commit()

    def get_url_to_scrape(self):
        sql_last_month_date_url = "SELECT last_month_date_url_id FROM scraper_info WHERE id = 1"
        sql_get_month_date_url = "SELECT month_date_url, id FROM month_date_url %s"
        self.cursor = self.db_con.cursor()
        self.cursor.execute(sql_last_month_date_url)
        rs_tuple = self.cursor.fetchone()
        if rs_tuple:
            (month_date_url_id, ) = rs_tuple
            condition = 'WHERE id > %s ORDER BY id ASC' % month_date_url_id
            self.cursor.execute(sql_get_month_date_url % condition)
            rs_tuple = self.cursor.fetchone()
            if not rs_tuple:
                raise NoMoreUrlFoundException(
                    "No URL left to scrape... you are done scraping all the urls present in DB")
            (month_date_url, month_date_url_id) = rs_tuple
        else:
            self.cursor.execute(sql_get_month_date_url % 'ORDER BY id ASC')
            rs_tuple = self.cursor.fetchone()
            (month_date_url, month_date_url_id) = rs_tuple
        return (month_date_url_id, month_date_url)

    def start(self):
        while True:
            try:
                time.sleep(5)
                (month_date_url_id, month_date_url) = self.get_url_to_scrape()
                print(month_date_url)
                url_to_open = self.base_url % month_date_url
                print(url_to_open)
                html = requests.get(url_to_open, headers=self.request_headers).text
                self.parse_titles(html, month_date_url_id)
            except NoMoreUrlFoundException:
                print("------------------DONE------------------------")
                sys.exit(0)
        return

    def save_tile_and_href(self, month_date_url_id, title, href):
        sql = ("INSERT INTO news_url (month_date_url_id, title, url) "
               "VALUES (%s, %s, %s) ON CONFLICT (title) DO NOTHING")
        self.cursor.execute(sql, (month_date_url_id, title, href))
        self.db_con.commit()
        return

    def parse_titles(self, html, month_date_url_id):
        soup = BeautifulSoup(html, 'lxml')
        div = soup.find(
            'div',
            {'style': 'font-family:arial ;font-size:12;font-weight:bold; color: #006699'})
        table = div.find('table')
        atag_list = table.find_all('a')
        with open("/home/aish/Desktop/toi/toi.html", 'w') as f:
            f.write(str(table))
        for a in atag_list:
            title = a.text.strip()
            href = a['href']
            self.save_tile_and_href(month_date_url_id, title, href)
        self.save_last_month_date_url(month_date_url_id)
        return

    def initiate(self):
        # Populate month_date_url with one archive URL per calendar day of self.year
        sql_insert_month_date_url = ("INSERT INTO month_date_url "
                                     "(url_month, url_date, year, month_date_url) "
                                     "VALUES (%s, %s, %s, %s)")
        toi_starttime = 42005
        month_counter = 1
        date_counter = 1
        while True:
            # Number of days in the current month of self.year
            num_of_days_in_month = calendar.monthrange(self.year, month_counter)[1]
            while True:
                toi_url_month_date = ("/%s/1/1/archivelist/year-%s,month-%s,starttime-%s.cms"
                                      % (self.year, self.year, month_counter, toi_starttime))
                self.cursor.execute(sql_insert_month_date_url,
                                    (month_counter, date_counter, self.year, toi_url_month_date))
                self.db_con.commit()
                toi_starttime += 1
                date_counter += 1
                print(toi_url_month_date)
                if date_counter > num_of_days_in_month:
                    # Reset the date counter to 1 before moving to the next month
                    date_counter = 1
                    break
            month_counter += 1
            # Break when December is reached
            if month_counter > 12:
                break
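# The scraping pipeline above would typically be driven in three steps: seed the
# month_date_url table once, crawl every archive page, then classify the collected
# titles. The driver below is a hypothetical sketch (no such __main__ wiring appears
# in the original code) and assumes Xtractor and Classifier are importable in scope.
if __name__ == '__main__':
    xtractor = Xtractor()
    xtractor.initiate()            # one-time: insert an archive URL for every day of 2015
    xtractor.start()               # crawl each archive page and store titles/URLs in news_url
    Classifier().classify_title()  # flag titles that match the PM-visit keywords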
import re

import requests
from bs4 import BeautifulSoup

# Note: cursor, dbcon, regex (the country-name pattern) and get_continuous_chunks are
# assumed to be defined earlier in the original script.

'''
# Stage 4: keyword-specific search over the article body
if re.search("arrives|attend|embarks|reaches", content, re.DOTALL):
    filter_count = filter_count + 1
    cursor.execute(sql, (filter_count, id))
    print(">>>>>>>>>>>>>>>>>", id)
'''
# dbcon.commit()

sql_news_title = ("SELECT n.title, n.id, n.url FROM news_url n "
                  "INNER JOIN classified_article c ON c.news_url_id = n.id "
                  "WHERE c.counter > 3")
sql_insert = ("INSERT INTO visit_info (place, visit_date) VALUES (%s, %s) "
              "ON CONFLICT (place, visit_date) DO NOTHING")

cursor.execute(sql_news_title)
rs_tuple_list = cursor.fetchall()
for rs_tuple in rs_tuple_list:
    (title, id, url) = rs_tuple
    # Extract named entities from the title and keep those matching the country pattern
    lst_ner = get_continuous_chunks(title)
    country_list = []
    for item in lst_ner:
        if re.search(regex, item.strip().lower()):
            country_list.append(item)
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'lxml')
    if country_list:
        for country in country_list:
            # Pull the publication date out of the article's "time_cptn" span
            date = soup.find('span', {'class': "time_cptn"}).text
            date_reg = re.search(r'(.*\|)(.*2015)(.*)', date)
            date_str = date_reg.group(2)
            cursor.execute(sql_insert, (country, date_str))
            dbcon.commit()
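# get_continuous_chunks() is called above but not defined in this section. The function
# below is a common NLTK-based implementation (an assumption, not necessarily the
# author's version): it groups consecutive named-entity tokens from ne_chunk into
# multi-word chunks such as "United States".
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree


def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for subtree in chunked:
        if type(subtree) == Tree:
            # Collect the words of a named-entity subtree
            current_chunk.append(" ".join(token for token, pos in subtree.leaves()))
        elif current_chunk:
            # A non-entity token ends the current run of entity subtrees
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
    # Flush an entity that ends at the last token
    if current_chunk:
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)
    return continuous_chunk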