def parse_all(fnames, reparse=False):
    """
    Normalise each entry into ts, tag, url, title, root_domain, domain, url_path.
    :param fnames: list of local HTML files to parse (None means nothing new)
    :param reparse: whether to re-parse all cached HTML files from scratch
    :return:
    """
    sqldb = SQLite('data/secwiki.db')
    # If reparsing everything, rebuild the file list from the cached HTML
    # pages and clear the existing table first.
    if reparse:
        fnames = []
        gen_file = glob.iglob(r'data/html/secwiki_*.html')
        sql = 'delete from `secwiki`'
        for gfile in gen_file:
            fnames.append(gfile)
        sqldb.execute(sql)
    if fnames is None:
        print('No new secwiki')
        return
    sql = 'insert into `secwiki` (`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`url_path`) values(?,?,?,?,?,?,?);'
    for fname in fnames:
        # Skip this page if its parsed output file already exists locally.
        m = re.search(r'secwiki_(\d+)\.html', fname)
        rname = m.group(1)
        rname = path('data/txt', 'secwiki_' + rname + '.txt')
        if not os.path.exists(path("data/txt")):
            os.mkdir(path("data/txt"))
        if os.path.exists(rname) and os.path.getsize(rname) > 0:
            continue
        # Output file that all parsed records for this page are written to.
        rf = codecs.open(rname, mode='wb')
        # Read and parse the local source HTML file.
        with codecs.open(fname, 'rb') as f:
            all_content = {}
            #print(fname)
            for content in parse_single(f):
                if content:
                    # Write each parsed record to the output file.
                    k = content[0] + content[2]
                    all_content[k] = content
                    line = "\t".join(content)
                    rf.write(line.encode() + b'\r\n')
            # Bulk-insert the de-duplicated records into sqlite3.
            if all_content:
                sqldb.executemany(sql, all_content.values())
        rf.close()
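
# Usage sketch (hypothetical, not part of the original module): parse_single()
# is assumed to yield 7-tuples in the same order as the INSERT columns above,
# e.g.
#
#     ('2023-01-02', 'tools', 'https://example.com/post', 'Example title',
#      'example.com', 'example.com', '/post')
#
# so a driver could either parse newly downloaded pages or rebuild the whole
# `secwiki` table from the cached HTML, for example:
#
#     parse_all(['data/html/secwiki_456.html'])   # hypothetical file name
#     parse_all(None, reparse=True)               # wipe the table and re-parse everything
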
class ToiScraper():
    TABLE_NAME = 'articles'
    TABLE_SCHEMA = [(u'ds', u'text(10)'), (u'title', u'text'), (u'url', u'text')]
    # Manually observed minimum date on TOI
    INIT_DATE = (2020, 1, 1)
    MIN_ENTRIES = 600
    MAX_SLEEP = 3600

    def __init__(self):
        # ==== Required vars ===== #
        self.stdin_path = '/dev/null'
        self.stdout_path = '/dev/null'
        self.stderr_path = '/dev/null'
        # self.pidfile_path = '/var/run/toidaemon/toidaemon.pid'
        self.pidfile_path = PID_FILE_PATH
        self.pidfile_timeout = 5
        # ========================= #

        self.db_name = DB_PATH
        self.db = SQLite(self.db_name)
        self.table = self.db.get(ToiScraper.TABLE_NAME)
        print("Initializing...")
        if not self.table:
            print("No table found with name {0}. Creating it.".format(
                ToiScraper.TABLE_NAME))
            self.table = self.db.create(ToiScraper.TABLE_NAME,
                                        ToiScraper.TABLE_SCHEMA)
        else:
            if not self.table.get_info() == ToiScraper.TABLE_SCHEMA:
                error_str = "Table {0} exists but with incorrect schema".format(
                    ToiScraper.TABLE_NAME)
                print(error_str)
                raise Exception(error_str)
        self.iter_date = self._get_init_date_full()

    # Get the last date in the database with at least 600 entries in it
    # (enough to tell that it's full)
    def _get_init_date_full(self):
        print(
            "Retrieving last retrieved date from database with at least {0} in it"
            .format(ToiScraper.MIN_ENTRIES))
        first_date = self.db.execute("""
            SELECT a.ds, a.count
            FROM (
                SELECT ds, count(1) AS count
                FROM {0}
                GROUP BY ds
                ORDER BY DATE(ds) DESC
            ) a
            WHERE a.count > {1}
            LIMIT 1;
        """.format(ToiScraper.TABLE_NAME, ToiScraper.MIN_ENTRIES), get=True)
        if len(first_date) == 0:
            print(
                "No last date with given minimum entries found in DB, starting from beginning."
            )
            return ToiScraper.INIT_DATE
        print("Last date with entries {0} found. {1} entries total.".format(
            first_date[0][0], first_date[0][1]))
        return self.get_next_day(*tuple(map(int, first_date[0][0].split('-'))))

    # Get the last date in the database with entries in it
    def _get_init_date(self):
        print("Retrieving last retrieved date from database")
        first_date = self.db.execute(
            'SELECT ds FROM {0} ORDER BY DATE(ds) DESC LIMIT 1'.format(
                ToiScraper.TABLE_NAME), get=True)
        if len(first_date) == 0:
            print("No last date found in DB, starting from beginning.")
            return ToiScraper.INIT_DATE
        print("Last date {0} found.".format(first_date[0]['ds']))
        return self.get_next_day(
            *tuple(map(int, first_date[0]['ds'].split('-'))))

    def get_last_valid_date(self):
        return datetime.utcnow() + timedelta(hours=5, minutes=30)

    # Check if the date is strictly before today in IST
    def is_valid_date(self, year, month, day):
        try:
            datetime(year, month, day)
        except ValueError:
            return False
        cur_time = datetime(year, month, day)
        india_time = self.get_last_valid_date()
        return cur_time + timedelta(
            days=1) < india_time and cur_time >= datetime(
                *ToiScraper.INIT_DATE)

    def compute_url_for_day(self, year, month, day):
        if not self.is_valid_date(year, month, day):
            return None
        # Day count used in TOI URL (1st October, 2015 == 42278)
        day_count = (date(year, month, day) - date(1900, 1, 1)).days + 2
        return "http://timesofindia.indiatimes.com/{year}/{month}/{day}/archivelist/year-{year},month-{month},starttime-{daycount}.cms".format(
            year=year, month=month, day=day, daycount=day_count)

    def get_next_day(self, year, month, day):
        next_day = datetime(year, month, day) + timedelta(days=1)
        return (next_day.year, next_day.month, next_day.day)

    def _retrieve_url_contents(self, url, datetuple):
        print("Request sent to url {0}".format(url))
        req = requests.get(url)
        print("Response retrieved, parsing")
        soup = BeautifulSoup(req.text, 'lxml')
        # Signature of the element we're interested in. We rely on the TOI
        # webpage not to change.
        divs = soup.find_all(
            'div',
            style=
            'font-family:arial ;font-size:12;font-weight:bold; color: #006699')
        if not len(divs) == 1:
            error_str = "Found {0} divs matching signature. Aborting.".format(
                len(divs))
            print(error_str)
            raise Exception(error_str)
        articles = divs[0].find_all('a')
        print("Found {0} hyperlinks in the archive.".format(len(articles)))
        articles = [a for a in articles if len(a.text) > 0]
        res = []
        titles = set({})
        for art in articles:
            corr_url = self.validate_url(art['href'])
            if corr_url:
                if art.text in titles:
                    continue
                titles.add(art.text)
                res.append([
                    datetime(*datetuple).strftime('%Y-%m-%d'),
                    art.text,
                    corr_url,
                ])
        print("Finished parsing, {0} rows remain".format(len(res)))
        return res

    # TOI specific article URL validation and correction
    def validate_url(self, url):
        URL_CORRECT = 'http://timesofindia.indiatimes.com/'
        URL_STANDARD = 'http://'
        URL_INSIDE = '.indiatimes.com/'
        if not url.startswith(URL_STANDARD) or URL_INSIDE not in url:
            if not url.endswith('.cms') or 'http' in url or ' ' in url:
                return None
            else:
                return URL_CORRECT + url
        return url

    def dedup_insert(self, data, ds):
        date_str = '-'.join(map(str, ds))
        print("Asking to insert {0} articles in {1}".format(
            len(data), date_str))
        rows = self.table.where({'ds': date_str})
        print("Already {0} rows exist in {1}".format(len(rows), date_str))
        titles = set({})
        res = []
        for a in rows:
            if a['title'] not in titles:
                titles.add(a['title'])
                res.append((a['ds'], a['title'], a['url']))
        for r in data:
            if r[1] not in titles:
                titles.add(r[1])
                res.append(r)
        print("{0} rows left after deduplicating".format(len(res)))
        if len(rows) > 0:
            print("Deleting {0} rows from {1}".format(len(rows), date_str))
            self.table.del_where({'ds': date_str})
        if len(res) > 0:
            print("Inserting {0} rows from {1}".format(len(res), date_str))
            self.table.insert(res)

    def get_articles_for_day(self, year, month, day):
        print("Getting articles for the day")
        url = self.compute_url_for_day(year, month, day)
        if not url:
            return 0
        data = self._retrieve_url_contents(url, (year, month, day))
        self.dedup_insert(data, (year, month, day))
        return len(data)

    def run(self):
        while True:
            while not self.is_valid_date(*self.iter_date):
                next_date = datetime(*self.iter_date) + timedelta(days=1)
                sec_to_next_date = (next_date -
                                    self.get_last_valid_date()).seconds
                print("Reached the end, {0} seconds until {1}".format(
                    sec_to_next_date,
                    datetime(*self.iter_date).strftime('%Y-%m-%d')))
                if sec_to_next_date <= ToiScraper.MAX_SLEEP:
                    time.sleep(sec_to_next_date)
                else:
                    print(
                        'Seconds till next day {0} greater than {1}, so only sleeping for {1}'
                        .format(sec_to_next_date, ToiScraper.MAX_SLEEP))
                    time.sleep(ToiScraper.MAX_SLEEP)
                print('Woken up, getting init date again')
                self.iter_date = self._get_init_date_full()
                print('New date set to {0}'.format(self.iter_date))
            print("Retrieving articles for date {0}".format(self.iter_date))
            num_rows = self.get_articles_for_day(*self.iter_date)
            print("Retrieved {0} rows from TOI".format(num_rows))
            if num_rows == 0:
                print("Sleeping for 10 seconds, no rows retrieved")
                time.sleep(10)
            else:
                self.iter_date = self.get_next_day(*self.iter_date)
                print("Iterated to next day - {0}".format(
                    datetime(*self.iter_date)))
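
# Entry-point sketch (an assumption, not shown here): the stdin/stdout/stderr
# and pidfile attributes set in __init__ match what python-daemon's runner
# expects from an "app" object with a run() method, so the scraper could
# plausibly be launched along these lines:
#
#     from daemon import runner
#
#     if __name__ == '__main__':
#         scraper = ToiScraper()
#         daemon_runner = runner.DaemonRunner(scraper)  # reads scraper.pidfile_path etc.
#         daemon_runner.do_action()                     # start/stop/restart taken from argv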