def parse_all(fnames, reparse=False):
    """
    格式化为ts、tag、url、title、root_domain、domain、url_path
    :param reparse:是否重新全部解析
    :return:
    """
    sqldb = SQLite('data/secwiki.db')

    # Re-parse everything: rebuild the file list from the local HTML cache
    # and clear out the existing table first
    if reparse:
        fnames = list(glob.iglob(r'data/html/secwiki_*.html'))
        sqldb.execute('delete from `secwiki`')

    if not fnames:
        print('No new secwiki')
        return

    sql = 'insert into `secwiki` (`ts`,`tag`,`url`,`title`,`root_domain`,`domain`,`url_path`) values(?,?,?,?,?,?,?);'

    for fname in fnames:
        # Skip files whose parsed output already exists locally
        m = re.search(r'secwiki_(\d+)\.html', fname)
        rname = path('data/txt', 'secwiki_' + m.group(1) + '.txt')
        if not os.path.exists(path("data/txt")):
            os.mkdir(path("data/txt"))
        if os.path.exists(rname) and os.path.getsize(rname) > 0:
            continue

        # Target file that all parsed rows are written to
        rf = codecs.open(rname, mode='wb')

        # Read the local source file and parse it
        with codecs.open(fname, 'rb') as f:
            all_content = {}
            for content in parse_single(f):
                if content:
                    # Key the row on ts + url so duplicates collapse, then
                    # write the parsed row to the target file
                    k = content[0] + content[2]
                    all_content[k] = content
                    line = "\t".join(content)
                    rf.write(line.encode() + b'\r\n')

            # Bulk insert into sqlite3
            if all_content:
                sqldb.executemany(sql, all_content.values())

        rf.close()
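
# Example usage (a minimal sketch; it assumes the helpers referenced above,
# namely `path`, `parse_single`, and the SQLite wrapper, are defined elsewhere
# in this module, and the filename below is hypothetical):
#   parse_all(['data/html/secwiki_001.html'])   # parse freshly downloaded pages
#   parse_all(None, reparse=True)               # re-parse everything and rebuild the table
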
class ToiScraper:
    TABLE_NAME = 'articles'
    TABLE_SCHEMA = [(u'ds', u'text(10)'), (u'title', u'text'),
                    (u'url', u'text')]
    # Manually observed minimum date on TOI
    INIT_DATE = (2020, 1, 1)
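    # A day counts as fully crawled once it has more than MIN_ENTRIES articles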
    MIN_ENTRIES = 600
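    # Upper bound, in seconds, on any single sleep while waiting for a day to complete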
    MAX_SLEEP = 3600

    def __init__(self):
        # ====  Required vars ===== #
        self.stdin_path = '/dev/null'
        self.stdout_path = '/dev/null'
        self.stderr_path = '/dev/null'
        # self.pidfile_path =  '/var/run/toidaemon/toidaemon.pid'
        self.pidfile_path = PID_FILE_PATH
        self.pidfile_timeout = 5
        # ========================= #

        self.db_name = DB_PATH
        self.db = SQLite(self.db_name)
        self.table = self.db.get(ToiScraper.TABLE_NAME)
        print("Initializing...")
        if not self.table:
            print("No table found with name {0}. Creating it.".format(
                ToiScraper.TABLE_NAME))
            self.table = self.db.create(ToiScraper.TABLE_NAME,
                                        ToiScraper.TABLE_SCHEMA)
        else:
            if self.table.get_info() != ToiScraper.TABLE_SCHEMA:
                error_str = "Table {0} exists but with incorrect schema".format(
                    ToiScraper.TABLE_NAME)
                print(error_str)
                raise Exception(error_str)
        self.iter_date = self._get_init_date_full()

    # Get the last date in the database with more than MIN_ENTRIES entries in it
    # (enough to tell that the day was fully crawled)
    def _get_init_date_full(self):
        print(
            "Retrieving last retrieved date from database with more than {0} entries in it"
            .format(ToiScraper.MIN_ENTRIES))
        first_date = self.db.execute("""
        SELECT
          a.ds,
          a.count
        FROM (
          SELECT
            ds,
            count(1) AS count
          FROM {0}
          GROUP BY ds
          ORDER BY DATE(ds) DESC
        ) a
        WHERE a.count > {1}
        LIMIT 1;
      """.format(ToiScraper.TABLE_NAME, ToiScraper.MIN_ENTRIES),
                                     get=True)
        if len(first_date) == 0:
            print(
                "No last date with given minimum entries found in DB, starting from beginning."
            )
            return ToiScraper.INIT_DATE
        print("Last date with entries {0} found. {1} entries total.".format(
            first_date[0][0], first_date[0][1]))
        return self.get_next_day(*tuple(map(int, first_date[0][0].split('-'))))

    # Get the last date in the database with entries in it
    def _get_init_date(self):
        print("Retrieving last retrieved date from database")
        first_date = self.db.execute(
            'SELECT ds FROM {0} ORDER BY DATE(ds) DESC LIMIT 1'.format(
                ToiScraper.TABLE_NAME),
            get=True)
        if len(first_date) == 0:
            print("No last date found in DB, starting from beginning.")
            return ToiScraper.INIT_DATE
        print("Last date {0} found.".format(first_date[0]['ds']))
        return self.get_next_day(
            *tuple(map(int, first_date[0]['ds'].split('-'))))

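    # Current wall-clock time in IST (UTC+05:30); articles for a date are only
    # scraped once that date has fully elapsed in IST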
    def get_last_valid_date(self):
        return datetime.utcnow() + timedelta(hours=5, minutes=30)

    # Return True only when the whole day has already elapsed in IST and the
    # date is not before INIT_DATE
    def is_valid_date(self, year, month, day):
        try:
            cur_time = datetime(year, month, day)
        except ValueError:
            return False
        india_time = self.get_last_valid_date()
        return (cur_time + timedelta(days=1) < india_time
                and cur_time >= datetime(*ToiScraper.INIT_DATE))

    def compute_url_for_day(self, year, month, day):
        if not self.is_valid_date(year, month, day):
            return None
        # Day count used in TOI URL (1st October, 2015 == 42278)
        day_count = (date(year, month, day) - date(1900, 1, 1)).days + 2
        return "http://timesofindia.indiatimes.com/{year}/{month}/{day}/archivelist/year-{year},month-{month},starttime-{daycount}.cms".format(
            year=year, month=month, day=day, daycount=day_count)

    def get_next_day(self, year, month, day):
        next_day = datetime(year, month, day) + timedelta(days=1)
        return (next_day.year, next_day.month, next_day.day)

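    # Fetch one archive-list page and turn it into [ds, title, url] rows,
    # keeping only valid article links and dropping duplicate titles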
    def _retrieve_url_contents(self, url, datetuple):
        print("Request sent to url {0}".format(url))
        req = requests.get(url)
        print("Response retrieved, parsing")
        soup = BeautifulSoup(req.text, 'lxml')
        # Signature of the element we're interested in; we rely on the TOI
        # page layout not changing
        divs = soup.find_all(
            'div',
            style=
            'font-family:arial ;font-size:12;font-weight:bold; color: #006699')
        if len(divs) != 1:
            error_str = "Found {0} divs matching signature. Aborting.".format(
                len(divs))
            print(error_str)
            raise Exception(error_str)
        articles = divs[0].find_all('a')
        print("Found {0} hyperlinks in the archive.".format(len(articles)))
        articles = [a for a in articles if len(a.text) > 0]
        res = []
        titles = set()
        for art in articles:
            corr_url = self.validate_url(art['href'])
            if corr_url:
                if art.text in titles:
                    continue
                titles.add(art.text)
                res.append([
                    datetime(*datetuple).strftime('%Y-%m-%d'),
                    art.text,
                    corr_url,
                ])
        print("Finished parsing, {0} rows remain".format(len(res)))
        return res

    # TOI specific article URL validation and correction
    def validate_url(self, url):
        URL_CORRECT = 'http://timesofindia.indiatimes.com/'
        URL_STANDARD = 'http://'
        URL_INSIDE = '.indiatimes.com/'
        if not url.startswith(URL_STANDARD) or URL_INSIDE not in url:
            if not url.endswith('.cms') or 'http' in url or ' ' in url:
                return None
            else:
                return URL_CORRECT + url
        return url

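    # Replace the rows already stored for this date with the union of existing
    # and newly scraped articles, deduplicated by title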
    def dedup_insert(self, data, ds):
        date_str = '-'.join(map(str, ds))
        print("Asking to insert {0} articles in {1}".format(
            len(data), date_str))
        rows = self.table.where({'ds': date_str})
        print("Already {0} rows exist in {1}".format(len(rows), date_str))
        titles = set()
        res = []
        for a in rows:
            if a['title'] not in titles:
                titles.add(a['title'])
                res.append((a['ds'], a['title'], a['url']))
        for r in data:
            if r[1] not in titles:
                titles.add(r[1])
                res.append(r)
        print("{0} rows left after deduplicating".format(len(res)))
        if len(rows) > 0:
            print("Deleting {0} rows from {1}".format(len(rows), date_str))
            self.table.del_where({'ds': date_str})
        if len(res) > 0:
            print("Inserting {0} rows from {1}".format(len(res), date_str))
            self.table.insert(res)

    def get_articles_for_day(self, year, month, day):
        print("Getting articles for the day")
        url = self.compute_url_for_day(year, month, day)
        if not url:
            return 0
        data = self._retrieve_url_contents(url, (year, month, day))
        self.dedup_insert(data, (year, month, day))
        return len(data)

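    # Main loop: wait until the current iteration date has fully elapsed in IST,
    # scrape its archive page, then advance one day at a time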
    def run(self):
        while True:
            while not self.is_valid_date(*self.iter_date):
                next_date = datetime(*self.iter_date) + timedelta(days=1)
                # Use total_seconds() rather than .seconds: the gap can exceed a day
                sec_to_next_date = int(
                    (next_date - self.get_last_valid_date()).total_seconds())
                print("Reached the end, {0} seconds until {1} is complete".format(
                    sec_to_next_date,
                    datetime(*self.iter_date).strftime('%Y-%m-%d')))
                if sec_to_next_date <= ToiScraper.MAX_SLEEP:
                    time.sleep(sec_to_next_date)
                else:
                    print(
                        'Seconds till next day {0} greater than {1}, so only sleeping for {1}'
                        .format(sec_to_next_date, ToiScraper.MAX_SLEEP))
                    time.sleep(ToiScraper.MAX_SLEEP)
                print('Woken up, getting init date again')
                self.iter_date = self._get_init_date_full()
                print('New date set to {0}'.format(self.iter_date))
            print("Retrieving articles for date {0}".format(self.iter_date))
            num_rows = self.get_articles_for_day(*self.iter_date)
            print("Retrieved {0} rows from TOI".format(num_rows))
            if num_rows == 0:
                print("Sleeping for 10 seconds, no rows retrieved")
                time.sleep(10)
            else:
                self.iter_date = self.get_next_day(*self.iter_date)
                print("Iterated to next day - {0}".format(
                    datetime(*self.iter_date)))
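
# A minimal, hypothetical way to drive the scraper outside whatever daemon
# wrapper normally runs it (assumes DB_PATH and PID_FILE_PATH are defined
# earlier in this module):
# if __name__ == '__main__':
#     scraper = ToiScraper()
#     scraper.run()  # blocks forever, crawling one archive day at a time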