def analyse(self, base_url, html):
    """Queue every acceptable link found in *html* as a not-yet-fetched record.

    Links are obtained from ``html_analyse.analyse_url(html)``. A link is
    accepted only when it consists solely of the characters
    ``0-9 a-z A-Z . & % /``, optionally preceded by ``http://`` or
    ``https://``. Anything else (query strings, fragments, other schemes,
    hyphens, ...) is skipped.

    Absolute ``http(s)`` links are stored unchanged; every other accepted
    link is resolved against *base_url* with ``urljoin``.

    Args:
        base_url: URL that relative links are resolved against.
            NOTE(review): presumably the address of the page *html* came
            from -- confirm against the caller.
        html: Markup passed to ``html_analyse.analyse_url``.

    Side effects:
        One ``record.Record`` per accepted link is inserted into
        ``self.table`` through ``_logic.insert_record``.
    """
    # Local import keeps the method self-contained; the fallback covers
    # Python 2, where urljoin lives in the ``urlparse`` module.
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    # Compiled once per call instead of once per link.
    absolute_link = re.compile(r'^https?://[0-9a-zA-Z\.&%/]+$')
    relative_link = re.compile(r'^[0-9a-zA-Z\.&%/]+$')

    for url in html_analyse.analyse_url(html):
        if absolute_link.search(url) is not None:
            record_url = url
        elif relative_link.search(url) is not None:
            # urljoin handles "/x", "x", "./x" and "../x" uniformly. Plain
            # string concatenation dropped the path separator and miscounted
            # "./" segments (it also matched the tail of "../").
            record_url = urljoin(base_url, url)
        else:
            # Link contains characters outside the accepted set.
            continue

        record_obj = record.Record(
            define.UNDEFINE, record_url, record.STATUS_NOTYET, '', ''
        )
        _logic.insert_record(self.table, record_obj)
def initialize():
    """Prepare storage directories, record tables and the seed crawl queue.

    Steps, in order:

    1. Create ``define.STORAGE_PATH``, ``define.HTML_PATH`` and
       ``define.CACHE_PATH`` if they do not exist yet.
    2. Create the numbered record tables ``<record.DB_TABLE>_<n>``.
    3. Initialise the bloom-filter bit array, then insert every URL from
       ``urls.init_url_list`` that ``bloom_fliter.url_exist`` does not
       report as present, spreading them round-robin over
       ``THREAD_COUNT`` tables.
    4. Save the bloom-filter bit array.
    """
    for path in (define.STORAGE_PATH, define.HTML_PATH, define.CACHE_PATH):
        if not os.path.exists(path):
            os.mkdir(path)

    # Seed URLs below are distributed over THREAD_COUNT tables, so at least
    # that many tables must exist. The floor of 5 preserves the set of
    # tables this function has always created.
    table_count = max(5, THREAD_COUNT)
    for i in range(table_count):
        _logic.create_record_table('%s_%s' % (record.DB_TABLE, i))

    bloom_fliter.init_bitarray()

    index = 0
    for url in urls.init_url_list:
        if not bloom_fliter.url_exist(url):
            record_obj = record.Record(
                define.UNDEFINE, url, record.STATUS_NOTYET, '', ''
            )
            table = '%s_%s' % (record.DB_TABLE, index % THREAD_COUNT)
            _logic.insert_record(table, record_obj)
            # Advance only after an insert so the tables fill evenly.
            index += 1

    bloom_fliter.save_bitarray()