Exemplo n.º 1
0
 def insert_novel_to_db(self, novel_name, novel_url, novel_author,
                        novel_desc, latest_chapter_name, latest_chapter_url,
                        img):
     '''
     Insert a scraped novel into the ``novel`` table.

     :param novel_name: name of the scraped novel
     :param novel_url: url the novel was scraped from; normalized to the
                       novel's canonical url before being stored
     :param novel_author: author name
     :param novel_desc: short description of the novel
     :param latest_chapter_name: name of the newest chapter
     :param latest_chapter_url: url of the newest chapter
     :param img: cover image value, stored as-is
     :return: None
     '''
     url_obj = URLFilter(novel_url)
     host_id = self.get_website_id(novel_url)
     check_novel_url = url_obj.get_novel_url()
     dt = get_current_time()
     # Function-local import, presumably to avoid a circular import with
     # CONSTANTS at module load time -- TODO confirm.
     from CONSTANTS import DATABASE_OBJECT
     # NOTE(review): no duplicate check is performed before the insert
     # (a check_novel_exist call was left commented out here), so the
     # same novel url can be inserted more than once -- confirm intended.
     DATABASE_OBJECT.insert('novel',
                            novel_name=novel_name,
                            novel_url=check_novel_url,
                            img=img,
                            author=novel_author,
                            novel_desc=novel_desc,
                            website_id=host_id,
                            latest_chapter_name=latest_chapter_name,
                            latest_chapter_url=latest_chapter_url,
                            novel_type=None,
                            create_time=dt)
Exemplo n.º 2
0
 def check_chapter_exist(self, chapter_url):
     '''
     Return True when the chapter stored under *chapter_url* already
     exists in the ``chapter`` table, False otherwise.
     '''
     parsed = URLFilter(chapter_url)
     lookup_url = parsed.get_novel_url() + parsed.get_url_end()
     # NOTE(review): string-interpolated SQL is injection-prone if the
     # url can be attacker-controlled; consider a parameterized query.
     rows = DATABASE_OBJECT.query(
         '''select * from chapter where chapter_url = '%s';''' %
         lookup_url)
     return len(rows) != 0
Exemplo n.º 3
0
 def get_chapter_id(self, url):
     '''
     Look up the primary key of the chapter stored under *url*.

     :param url: chapter url; normalized before the lookup
     :return: chapter_id of the first matching row, or None when the
              chapter is not in the database
     '''
     parsed = URLFilter(url)
     lookup_url = parsed.get_novel_url() + parsed.get_url_end()
     rows = DATABASE_OBJECT.query(
         '''select * from chapter where chapter_url = '%s';''' %
         lookup_url)
     if len(rows) == 0:
         return
     # Return the id from the first (and expectedly only) matching row.
     for row in rows:
         return row.chapter_id
Exemplo n.º 4
0
 def check_novel_exist(self, novel_url):
     '''
     Return True when a novel with the canonical form of *novel_url*
     is already present in the ``novel`` table, False otherwise.
     '''
     canonical_url = URLFilter(novel_url).get_novel_url()
     # Local import, matching the style used elsewhere in this class.
     from CONSTANTS import DATABASE_OBJECT
     rows = DATABASE_OBJECT.query(
         '''select * from novel where novel_url = '%s';''' %
         canonical_url)
     return len(rows) != 0
Exemplo n.º 5
0
 def get_novel_id(self, url):
     '''
     Look up the primary key of the novel stored under *url*.

     :param url: any url belonging to the novel; normalized to the
                 novel's canonical url before the lookup
     :return: novel_id of the first matching row, or None when the
              novel is not in the database
     '''
     url_obj = URLFilter(url)
     check_novel_url = url_obj.get_novel_url()
     # Import only the name actually used instead of ``import *`` --
     # the wildcard hides this method's real dependency and pollutes
     # the local namespace; kept function-local to match the other
     # methods of this class.
     from CONSTANTS import DATABASE_OBJECT
     results = DATABASE_OBJECT.query(
         '''select * from novel where novel_url = '%s';''' %
         check_novel_url)
     if len(results) == 0:
         return
     for result in results:
         return result.novel_id
Exemplo n.º 6
0
 def check_website_exist(self, url):
     '''
     Check whether the host of *url* is already registered in the
     ``website`` table.

     :param url: any url on the website to look up
     :return: True when the host is present, False otherwise
     '''
     host_url = URLFilter(url).get_host_url()
     rows = DATABASE_OBJECT.query(
         '''select * from website where website_url = '%s';''' %
         host_url)
     return len(rows) != 0
Exemplo n.º 7
0
 def insert_website_to_db(self, url):
     '''
     Register the host of *url* in the ``website`` table unless it is
     already present.

     :param url: any url on the website to register
     :return: None
     '''
     parsed = URLFilter(url)
     host_url = parsed.get_host_url()
     host_name = parsed.get_host_name()
     now = get_current_time()
     if not self.check_website_exist(url):
         DATABASE_OBJECT.insert('website',
                                website_name=host_name,
                                website_url=host_url,
                                website_type='source',
                                create_time=now)
Exemplo n.º 8
0
 def get_website_id(self, url):
     '''
     Look up the primary key of the website hosting *url*.

     :param url: any url on the website to look up
     :return: website_id of the first matching row, or None when the
              host is not registered in the ``website`` table
     '''
     url_obj = URLFilter(url)
     check_host_url = url_obj.get_host_url()
     # Import only the name actually used instead of ``import *`` --
     # the wildcard hides this method's real dependency and pollutes
     # the local namespace; kept function-local to match the other
     # methods of this class.
     from CONSTANTS import DATABASE_OBJECT
     results = DATABASE_OBJECT.query(
         '''select * from website where website_url = '%s';''' %
         check_host_url)
     if len(results) == 0:
         return
     for result in results:
         return result.website_id
    def process_log(self, verbose=False):
        """Parse the access log at ``self.path`` into labelled events.

        Lines whose url is filtered out by ``URLFilter.is_filtered_url``
        are dropped up front; remaining lines that receive no label from
        ``URLMapper`` are counted and skipped.  Each kept line becomes a
        ``BEAREvent``, and the events are then split into traces using a
        60-minute time window (following the paper).

        :param verbose: when True, also collect and print the distinct
            urls, user classes, request types, ips and labels seen.
        :return: ``self.traces`` as produced by
            ``self._break_events_to_traces()``.
        """
        urls = set()
        user_classes = set()
        req_types = set()
        ips = set()
        labels = set()

        with open(self.path) as log_file:

            # Filter lines with urls that should not be included.
            all_lines = log_file.readlines()
            kept_lines = [line for line in all_lines
                          if not URLFilter.is_filtered_url(line)]
            print('only kept:', len(kept_lines), 'out of', len(all_lines),
                  'due to url filtering')

            self.events = []
            mapper = URLMapper()
            filter_events = 0

            for line in kept_lines:
                # Assumes a combined-log-like layout:
                # field 1 = ip, fields 4-5 = [timestamp], field 6 =
                # "METHOD, field 7 = url -- TODO confirm against the log.
                parts = line.strip().split()
                ip = parts[1]
                time = parts[4].strip('[') + " " + parts[5].strip(']')
                time = datetime.strptime(time, "%d/%b/%Y:%H:%M:%S %z")
                req_type = parts[6].strip("\"")
                url = parts[7]
                # Map urls to labels, used as the event name.
                label = mapper.get_url_label(url, req_type, line)

                if not label:  # only keep events with labels
                    filter_events += 1
                    continue
                user_class = UserClassMapper.extract_user_class(line)
                if verbose:
                    urls.add(url)
                    user_classes.add(user_class)
                    req_types.add(req_type)
                    ips.add(ip)
                    labels.add(label)
                self.events.append(
                    BEAREvent(ip, time, req_type, url, label, user_class, line))
            print('extracted', len(self.events), 'events, filtered',
                  filter_events, 'due to a missing labels')

            # Break events according to time window of 60 minutes
            # (according to the paper).
            self._break_events_to_traces()
            if verbose:
                self._print_set(urls, "urls")
                self._print_set(user_classes, "user_classes")
                self._print_set(req_types, "req_types")
                self._print_set(ips, "ips")
                self._print_set(labels, "labels")
        return self.traces
Exemplo n.º 10
0
 def insert_chapter_to_db(self, novel_id, chapter_name, chapter_url,
                          content, previous_chapter_url, next_chapter_url):
     '''
     Insert one chapter row into the ``chapter`` table.

     Chapters missing either neighbour link are skipped entirely --
     only fully linked chapters are persisted.

     :param novel_id: primary key of the novel the chapter belongs to
     :param chapter_name: chapter title
     :param chapter_url: url the chapter was scraped from
     :param content: chapter body text
     :param previous_chapter_url: url of the preceding chapter, or None
     :param next_chapter_url: url of the following chapter, or None
     :return: None
     '''
     # Removed a leftover debug ``print dt`` and dead commented-out
     # normalization code (which left an unused URLFilter instance).
     dt = get_current_time()
     if not (previous_chapter_url is None or next_chapter_url is None):
         DATABASE_OBJECT.insert('chapter',
                                chapter_name=chapter_name,
                                chapter_url=chapter_url,
                                content=content,
                                previous_chapter_url=previous_chapter_url,
                                next_chapter_url=next_chapter_url,
                                novel_id=novel_id,
                                create_time=dt)
Exemplo n.º 11
0
#!/usr/bin/env python
#coding=utf8

import URLFilter

if __name__ == '__main__':
    urlbf = URLFilter.URLBloomFilter()
    urlbf.initdb()
    urlbf.initfilter()
    urlbf.initsql(m_sql='insert into test (url) values(%s)')
    if urlbf.add('www.sina.com.cn'):
        print 'add success'
    else:
        print 'add failed'
    if urlbf.add('www.sina.com.cn'):
        print 'add success'
    else:
        print 'add failed'

    urlbf.close()