def display_vist_details():
    """Render index.html with a place -> visit_date mapping from visit_info.

    Note: the name 'vist' (sic) is kept because the route registration and
    the template keyword depend on it.
    """
    db_con = DBConnection().create_connection()
    cursor = db_con.cursor()
    sql = "select place, visit_date from visit_info"
    cursor.execute(sql)
    # Rows are (place, visit_date) pairs; dict() keeps the last date seen
    # for a duplicated place, exactly like the original accumulation loop.
    value_dict = dict(cursor.fetchall())
    return render_template('index.html', vist_details=value_dict)
def hello_world():
    """Return the place -> visit_date mapping from visit_info as JSON."""
    db_con = DBConnection().create_connection()
    cursor = db_con.cursor()
    sql = "select place, visit_date from visit_info"
    cursor.execute(sql)
    # Rows are (place, visit_date) pairs; dict() keeps the last date seen
    # for a duplicated place, matching the original loop's overwrite order.
    value_dict = dict(cursor.fetchall())
    return jsonify(value_dict)
class Classifier:
    """Marks news_url rows as valid when the title mentions a PM-visit keyword.

    Walks news_url rows in id order starting after the checkpoint stored in
    scraper_info, and sets is_valid = True on any title matching one of the
    configured keywords (case-insensitive substring match).
    """

    def __init__(self):
        # Keywords are matched as plain substrings, case-insensitively.
        # NOTE(review): short tokens like 'pm' also match inside longer
        # words (e.g. 'equipment') — same behavior as the original.
        self.bag_of_words = [
            'modi', 'pm', 'visit', 'narendra', 'prime minister'
        ]
        self.db_con = DBConnection().create_connection()
        self.cursor = self.db_con.cursor()

    def classify_title(self):
        """Scan titles after the saved checkpoint and flag matching rows.

        Fix over the original: the per-keyword loop executed the same
        UPDATE + commit once for EVERY matching keyword; a single compiled
        alternation pattern issues at most one UPDATE per row with an
        identical final database state.
        """
        sql = "SELECT id, title FROM news_url WHERE id > %s ORDER BY id ASC "
        sql_update = "UPDATE news_url SET is_valid = True WHERE id = %s"
        # Compile one case-insensitive alternation of all keywords, hoisted
        # out of the row loop.
        keyword_pattern = re.compile("|".join(self.bag_of_words), re.I)
        news_url_id = self.get_last_news_url_id()
        counter = 0
        while True:
            self.cursor.execute(sql, (news_url_id, ))
            rs_tuple = self.cursor.fetchone()
            if not rs_tuple:
                break
            (news_url_id, title) = rs_tuple
            print(news_url_id)
            if keyword_pattern.search(title.strip()):
                self.cursor.execute(sql_update, (news_url_id, ))
                self.db_con.commit()
            counter += 1
            if counter % 1000 == 0:
                # Throttle so the scan doesn't hammer the database.
                print('Sleeping for 5 seconds')
                time.sleep(5)
        # Persist how far we got so the next run resumes from here.
        self.update_last_news_url_id(news_url_id)

    def update_last_news_url_id(self, news_url_id):
        """Save the id of the last classified row as the resume checkpoint."""
        sql = "UPDATE scraper_info SET news_url_id = %s WHERE id = 1"
        self.cursor.execute(sql, (news_url_id, ))
        self.db_con.commit()

    def get_last_news_url_id(self):
        """Return the resume checkpoint stored in scraper_info."""
        sql = "SELECT news_url_id FROM scraper_info"
        self.cursor.execute(sql)
        rs_tuple = self.cursor.fetchone()
        (news_url_id, ) = rs_tuple
        return news_url_id
class Xtractor:
    """Scrapes Times of India archive-list pages and stores article titles/URLs.

    initiate() seeds month_date_url with one archive URL per day of self.year;
    start() then walks those URLs, extracting every article title/href into
    news_url, checkpointing progress in scraper_info.
    """

    def __init__(self):
        self.year = 2015
        self.base_url = "http://timesofindia.indiatimes.com/%s"
        self.url_to_scrape = None
        self.db_con = DBConnection().create_connection()
        self.cursor = self.db_con.cursor()
        # Browser-like UA so the site serves the normal HTML listing.
        self.request_headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
        }

    def save_last_month_date_url(self, month_date_url_id):
        """Persist the id of the last archive page fully scraped."""
        sql_scraper_info = "UPDATE scraper_info SET last_month_date_url_id = %s WHERE id=1"
        self.cursor.execute(sql_scraper_info, (month_date_url_id, ))
        self.db_con.commit()

    def get_url_to_scrape(self):
        """Return (id, url) of the next archive page to scrape.

        Raises:
            NoMoreUrlFoundException: when every month_date_url row past the
                checkpoint has already been scraped.

        Fixes over the original: fetchone()'s result was unpacked BEFORE the
        None check (TypeError on an empty scraper_info), and the WHERE
        condition was %-interpolated into the SQL string instead of being
        passed as a bound parameter.
        """
        sql_last_month_date_url = "SELECT last_month_date_url_id FROM scraper_info WHERE id=1"
        self.cursor = self.db_con.cursor()
        self.cursor.execute(sql_last_month_date_url)
        rs_tuple = self.cursor.fetchone()
        if rs_tuple:
            (month_date_url_id, ) = rs_tuple
            # Parameterized query — no string interpolation into SQL.
            self.cursor.execute(
                "SELECT month_date_url ,id FROM month_date_url WHERE id > %s ORDER BY id asc",
                (month_date_url_id, ))
            rs_tuple = self.cursor.fetchone()
            if not rs_tuple:
                raise NoMoreUrlFoundException(
                    "No URL left to scrape...you are done scraping all the urls present in DB"
                )
            (month_date_url, month_date_url_id) = rs_tuple
        else:
            # No checkpoint row yet: start from the very first archive URL.
            self.cursor.execute(
                "SELECT month_date_url ,id FROM month_date_url ORDER BY id asc")
            rs_tuple = self.cursor.fetchone()
            (month_date_url, month_date_url_id) = rs_tuple
        return (month_date_url_id, month_date_url)

    def start(self):
        """Main scrape loop: fetch each pending archive page and parse it."""
        while True:
            try:
                # Politeness delay between page fetches.
                time.sleep(5)
                (month_date_url_id, month_date_url) = self.get_url_to_scrape()
                print(month_date_url)
                url_to_open = self.base_url % month_date_url
                print(url_to_open)
                html = requests.get(url_to_open, headers=self.request_headers).text
                self.parse_titles(html, month_date_url_id)
            except NoMoreUrlFoundException:
                print("------------------DONE------------------------")
                sys.exit(0)
        return

    def save_tile_and_href(self, month_date_url_id, title, href):
        """Insert one article (title, url); duplicate titles are ignored."""
        sql = "INSERT INTO news_url (month_date_url_id, title, url) VALUES (%s, %s, %s) ON CONFLICT (title) DO NOTHING"
        self.cursor.execute(sql, (month_date_url_id, title, href))
        self.db_con.commit()
        return

    def parse_titles(self, html, month_date_url_id):
        """Extract every article link from an archive page and store it."""
        soup = BeautifulSoup(html, 'lxml')
        # The article list lives in a styled div; the style string is the
        # only stable locator on this legacy page layout.
        div = soup.find(
            'div', {
                'style':
                'font-family:arial ;font-size:12;font-weight:bold; color: #006699'
            })
        table = div.find('table')
        atag_list = table.find_all('a')
        # NOTE(review): debug dump of the raw table — remove for production.
        with open("/home/aish/Desktop/toi/toi.html", 'w') as f:
            f.write(str(table))
        for a in atag_list:
            title = a.text.strip()
            href = a['href']
            self.save_tile_and_href(month_date_url_id, title, href)
        # Only checkpoint after the whole page was stored.
        self.save_last_month_date_url(month_date_url_id)
        return

    def initiate(self):
        """Seed month_date_url with one archive URL per day of self.year.

        TOI encodes the date via the incrementing 'starttime' serial number,
        so only starttime varies per day while the /1/1/ path segment is
        constant.
        """
        sql_insert_month_date_url = "INSERT INTO month_date_url (url_month, url_date, year, month_date_url) VALUES (%s, %s, %s, %s)"
        # TOI's serial day number for 1 Jan of self.year (2015).
        toi_starttime = 42005
        month_counter = 1
        date_counter = 1
        # Add all url of all month date to database
        while True:
            # Get number of days in a month depending on the year
            num_of_days_in_month = calendar.monthrange(self.year, month_counter)[1]
            while True:
                toi_url_month_date = "/%s/1/1/archivelist/year-%s,month-%s,starttime-%s.cms" % (
                    self.year, self.year, month_counter, toi_starttime)
                self.cursor.execute(sql_insert_month_date_url,
                                    (month_counter, date_counter, self.year,
                                     toi_url_month_date))
                self.db_con.commit()
                toi_starttime += 1
                date_counter += 1
                print(toi_url_month_date)
                if date_counter > num_of_days_in_month:
                    # Reset the date counter to 1 before break for each month
                    date_counter = 1
                    break
            month_counter += 1
            # Break when December is reached
            if month_counter > 12:
                break
class Classifier:
    """Experimental stemming-based title classifier.

    Walks news_url rows in id order and intersects the stemmed title tokens
    with the keyword set. Persisting the result (is_valid update and the
    resume checkpoint) is intentionally disabled in this version — it only
    prints the intersection for inspection.
    """

    def __init__(self):
        self.bag_of_words = [
            'modi', 'pm', 'visit', 'narendra', 'prime minister'
        ]
        # Set form for O(1) membership in the per-title intersection.
        # NOTE(review): 'prime minister' contains a space so it can never
        # match a single token, and the keywords themselves are not stemmed
        # while the title tokens are — confirm whether that is intended.
        self.bag_of_words_set = set(self.bag_of_words)
        self.db_con = DBConnection().create_connection()
        self.cursor = self.db_con.cursor()

    def classify_title(self):
        """Print the keyword intersection for every title past the checkpoint.

        Fix over the original: PorterStemmer() was constructed inside the
        per-row loop; it is stateless across calls here, so one instance is
        hoisted out of the loop.
        """
        sql = "SELECT id, title FROM news_url WHERE id > %s ORDER BY id ASC "
        # Kept for when persistence is re-enabled (currently unused).
        sql_update = "UPDATE news_url SET is_valid = True WHERE id = %s"
        news_url_id = self.get_last_news_url_id()
        counter = 0
        stemmer = PorterStemmer()  # hoisted: one instance for the whole scan
        while True:
            self.cursor.execute(sql, (news_url_id, ))
            rs_tuple = self.cursor.fetchone()
            if not rs_tuple:
                break
            (news_url_id, title) = rs_tuple
            print(news_url_id)
            title = title.lower()
            title_set = {stemmer.stem(item) for item in title.split()}
            result_set = (title_set & self.bag_of_words_set)
            print("result_set", result_set)
            # NOTE(review): the original carried commented-out variants that
            # ran sql_update (optionally gated on a COUNTRY/GPE lookup via
            # get_place) when result_set was non-empty; persistence stays
            # disabled here to preserve behavior.
            counter += 1
            if counter % 1000 == 0:
                # Throttle so the scan doesn't hammer the database.
                print('Sleeping for 5 seconds')
                time.sleep(5)
        # Checkpointing was disabled in the original as well:
        # self.update_last_news_url_id(news_url_id)

    def update_last_news_url_id(self, news_url_id):
        """Save the id of the last classified row as the resume checkpoint."""
        sql = "UPDATE scraper_info SET news_url_id = %s WHERE id = 1"
        self.cursor.execute(sql, (news_url_id, ))
        self.db_con.commit()

    def get_last_news_url_id(self):
        """Return the resume checkpoint stored in scraper_info."""
        sql = "SELECT news_url_id FROM scraper_info"
        self.cursor.execute(sql)
        rs_tuple = self.cursor.fetchone()
        (news_url_id, ) = rs_tuple
        return news_url_id

    def get_place(self, text):
        """Return True if NLTK NER tags any token span in text as a GPE."""
        chunked = ne_chunk(pos_tag(word_tokenize(text)))
        for i in chunked:
            # isinstance instead of exact-type comparison.
            if isinstance(i, Tree):
                if i.label() == "GPE":
                    for token, pos in i.leaves():
                        return True
        return False
] regex_list = [''] country_capital_list = [] with open('/home/aish/Desktop/capital.txt', 'r') as f: lines = f.readlines() for capital_name in lines: country_capital_list.append(capital_name) country_capital_list.extend(COUNTRY) regex = "|".join(str(item.lower()) for item in country_capital_list) sql = "UPDATE classified_article SET counter = %s WHERE id = %s" dbcon = DBConnection().create_connection() cursor = dbcon.cursor() cursor.execute( "SELECT id,article,counter FROM classified_article WHERE id > 0 AND counter >3 ORDER BY id asc" ) result_set = cursor.fetchall() frequency_dict = {} for result in result_set: (id, content, filter_count) = result #print (id) content = content.lower() content = re.sub("<.*?\>", " ", content) content = re.sub('[,.]', '', content) ''' #Stage 1 if re.search('visit|visiting|travel|trip', content):