Пример #1
0
 def analyse(self, base_url, html):
     """Extract links from *html* and store each acceptable one as a new record.

     Links are obtained from ``html_analyse.analyse_url(html)``. A link is
     accepted when it is either an absolute ``http``/``https`` URL or a
     relative reference, in both cases consisting only of ASCII letters,
     digits and the characters ``. & % /``. Everything else is ignored.

     Absolute links are recorded unchanged. Relative references (including
     root-relative ``/path``, ``./path`` and ``../path`` forms) are resolved
     against *base_url* with the standard ``urljoin``. The previous
     hand-written resolution concatenated strings and counted ``./``
     anywhere in the link, which produced malformed URLs.

     Args:
         base_url: URL the relative links are resolved against.
         html: HTML text passed to ``html_analyse.analyse_url``.
     """
     # Imported locally so the block works on both Python 3 and Python 2
     # without relying on the file's top-level imports.
     try:
         from urllib.parse import urljoin
     except ImportError:  # Python 2
         from urlparse import urljoin

     # Same accepted character set as before. The former "/..." and
     # "(./)*..." patterns are merged: '.' and '/' are already in the class,
     # so one pattern accepts exactly the same relative links.
     absolute_pattern = re.compile(r'^https?://[0-9a-zA-Z.&%/]+$')
     relative_pattern = re.compile(r'^[0-9a-zA-Z.&%/]+$')

     for url in html_analyse.analyse_url(html):
         if absolute_pattern.search(url) is not None:
             record_url = url
         elif relative_pattern.search(url) is not None:
             # urljoin handles "/x", "./x", "../x" and bare "x" correctly.
             record_url = urljoin(base_url, url)
         else:
             # Link contains characters outside the accepted set; skip it.
             continue

         record_obj = record.Record(define.UNDEFINE, record_url, record.STATUS_NOTYET, '', '')
         _logic.insert_record(self.table, record_obj)
Пример #2
0
def initialize():
    """Create the working directories and record tables, then store the seed URLs.

    Steps, in order:
      1. Create ``define.STORAGE_PATH``, ``define.HTML_PATH`` and
         ``define.CACHE_PATH`` if they do not exist yet.
      2. Create five record tables named ``<record.DB_TABLE>_<n>`` for
         ``n`` in 0..4.
      3. Initialise the bloom filter bit array, insert every URL from
         ``urls.init_url_list`` for which ``bloom_fliter.url_exist`` is
         false, and save the bit array.

    Seed URLs are spread over tables by their position in the list modulo
    ``THREAD_COUNT``; skipped URLs still consume a position.
    """
    # NOTE(review): five tables are created but inserts are sharded by
    # THREAD_COUNT; presumably THREAD_COUNT == 5 -- confirm.
    for directory in (define.STORAGE_PATH, define.HTML_PATH, define.CACHE_PATH):
        if not os.path.exists(directory):
            os.mkdir(directory)

    for table_no in range(5):
        _logic.create_record_table('%s_%s' % (record.DB_TABLE, table_no))

    bloom_fliter.init_bitarray()
    for position, seed_url in enumerate(urls.init_url_list):
        if bloom_fliter.url_exist(seed_url):
            continue
        seed_record = record.Record(define.UNDEFINE, seed_url, record.STATUS_NOTYET, '', '')
        target_table = '%s_%s' % (record.DB_TABLE, position % THREAD_COUNT)
        _logic.insert_record(target_table, seed_record)
    bloom_fliter.save_bitarray()