示例#1
0
 def next(self):
     """Pick the unfinished `queue` row with the lowest id.

     When exactly one row comes back it is stored in ``self.query`` (fetched
     with ``how=1``) and returned; in every other case ``False`` is returned
     and ``self.query`` is left untouched.
     """
     sql = 'SELECT * from `queue` WHERE `finished`=0 ORDER by `id` ASC LIMIT 1'
     db.query(sql)
     result = db.store_result()
     if result.num_rows() != 1:
         return False
     self.query = result.fetch_row(how=1)[0]
     return self.query
示例#2
0
 def save(self):
     """Rewrite the `page_errors` rows belonging to this visit.

     Existing rows for ``self.visit_id`` are deleted and one row is inserted
     per entry of ``self.errors``.

     Returns True once the queries were issued, or False (without touching
     the database) when ``visit_id`` or ``errors`` is empty/falsy.
     """
     if not (self.visit_id and self.errors):
         return False

     visit = db.escape_string(str(self.visit_id))
     db.query('DELETE from `page_errors` WHERE `visit_id`=' + visit)
     for error_id in self.errors:
         db.query(''.join([
             'INSERT into `page_errors` (`visit_id`,`error_id`) VALUES ("',
             visit,
             '","',
             db.escape_string(str(error_id)),
             '")',
         ]))
     return True
示例#3
0
 def get(self, id='', page_id='', status=''):
     """Populate this visit either from the database or from the arguments.

     * With ``id``: look the row up in `visits`; the attributes are only
       set when exactly one row matches.
     * Without ``id`` but with both ``page_id`` and ``status``: copy those
       two values onto the instance, no query is made.

     Nothing is returned.
     """
     if not id:
         if page_id and status:
             self.page_id = page_id
             self.status = status
         return

     db.query('SELECT `id`,`page_id`,`status` FROM `visits` WHERE `id`=' + db.escape_string(str(id)))
     result = db.store_result()
     if result.num_rows() == 1:
         record = result.fetch_row()[0]
         self.id, self.page_id, self.status = record
示例#4
0
 def save(self):
     """Write this page to the `pages` table.

     An existing row is updated when ``self.id`` is set; otherwise a new row
     is inserted and ``self.id`` receives ``db.insert_id()``.

     Returns True after the query was issued, False (no query) when
     ``host_id`` or ``path`` is empty/falsy.
     """
     if not (self.host_id and self.path):
         return False

     host_id = db.escape_string(str(self.host_id))
     path = db.escape_string(self.path)
     if self.id:
         row_id = db.escape_string(str(self.id))
         db.query(''.join([
             'UPDATE `pages` SET `host_id`="', host_id,
             '", `path`="', path,
             '" WHERE `id`=', row_id,
         ]))
     else:
         db.query(''.join([
             'INSERT into `pages` (`host_id`,`path`) VALUES ("',
             host_id, '","', path, '")',
         ]))
         self.id = db.insert_id()
     return True
示例#5
0
 def save(self):
     """Write this host to the `hosts` table.

     Updates the row identified by ``self.id`` when it is set; otherwise
     inserts a new row and stores ``db.insert_id()`` in ``self.id``.

     Returns True after the query was issued, False (no query) when
     ``self.host`` is empty/falsy.
     """
     if not self.host:
         return False

     host = db.escape_string(self.host)
     if self.id:
         row_id = db.escape_string(str(self.id))
         db.query(''.join(['UPDATE `hosts` SET `host`="', host, '" WHERE `id`=', row_id]))
     else:
         db.query(''.join(['INSERT into `hosts` (`host`) VALUES ("', host, '")']))
         self.id = db.insert_id()
     return True
示例#6
0
 def save(self):
     """Write this visit to the `visits` table, stamping the current time.

     Updates the row identified by ``self.id`` when it is set; otherwise
     inserts a new row and stores ``db.insert_id()`` in ``self.id``. In both
     cases `time` is set by the database via ``UNIX_TIMESTAMP()``.

     Returns True after the query was issued, False (no query) when
     ``page_id`` or ``status`` is empty/falsy.
     """
     if not (self.page_id and self.status):
         return False

     page_id = db.escape_string(str(self.page_id))
     status = db.escape_string(str(self.status))
     if self.id:
         row_id = db.escape_string(str(self.id))
         db.query(''.join([
             'UPDATE `visits` SET `page_id`="', page_id,
             '", `time`=UNIX_TIMESTAMP(), `status`="', status,
             '" WHERE `id`=', row_id,
         ]))
     else:
         db.query(''.join([
             'INSERT into `visits` (`page_id`,`time`,`status`) VALUES ("',
             page_id, '",UNIX_TIMESTAMP(),"', status, '")',
         ]))
         self.id = db.insert_id()
     return True
示例#7
0
 def get(self, visit_id='', errors=''):
     """Load or assign the error ids recorded for a visit.

     * ``visit_id`` only: remember the visit id, read every row of
       `page_errors` for it and extend ``self.errors`` with each row's
       `error_id` (third selected column).
     * ``visit_id`` and ``errors``: store both on the instance, no query.
     * no ``visit_id``: do nothing.

     Nothing is returned.

     The previous loop called ``fetch_row()`` twice per iteration, which
     dropped every other row, and indexed ``[0]`` on the empty tuple that
     marks the end of the result, raising IndexError instead of stopping.
     """
     if not visit_id:
         return

     self.visit_id = visit_id
     if errors:
         self.errors = errors
         return

     db.query('SELECT `id`,`visit_id`,`error_id` FROM `page_errors` WHERE `visit_id`='+db.escape_string(str(visit_id)))
     res = db.store_result()
     found = []
     while True:
         # fetch_row() hands back a tuple of rows; it is empty once the
         # result set is exhausted.
         rows = res.fetch_row()
         if not rows:
             break
         found.append(rows[0][2])
     if found:
         # Rebind instead of extending in place so a list shared with other
         # instances (e.g. a class-level default) is never mutated.
         self.errors = self.errors + found
示例#8
0
 def get(self, id='', host=''):
     """Populate this host from the `hosts` table, by id or by name.

     * With ``id``: look the row up by id. When exactly one row matches,
       ``self.id`` and ``self.host`` come from it; otherwise ``self.host``
       falls back to the ``host`` argument if one was given.
     * With only ``host``: strip a leading ``www.`` and look the row up by
       name. When exactly one row matches, both attributes come from it;
       otherwise only ``self.host`` is set (to the stripped name).
     * With neither: do nothing.

     Nothing is returned.

     Fixes: the by-id branch unpacked the tuple-of-rows returned by
     ``fetch_row()`` instead of its first row, and the ``www.`` pattern used
     an unescaped dot, so it also stripped prefixes such as ``wwwx``.
     """
     if id:
         db.query('SELECT `id`,`host` FROM `hosts` WHERE `id`='+db.escape_string(str(id)))
         res = db.store_result()
         if res.num_rows() == 1:
             # fetch_row() returns a tuple of rows; take the single row.
             self.id, self.host = res.fetch_row()[0]
         elif host:
             self.host = host
     elif host:
         host = re.sub(r'^www\.', '', host)
         db.query('SELECT `id`,`host` FROM `hosts` WHERE `host`="'+db.escape_string(host)+'"')
         res = db.store_result()
         if res.num_rows() == 1:
             self.id, self.host = res.fetch_row()[0]
         else:
             self.host = host
示例#9
0
 def get(self, id='', host_id='', path=''):
     """Populate this page from the `pages` table, by id or by host/path.

     * With ``id``: look the row up by id. When exactly one row matches the
       attributes come from it; otherwise ``host_id``/``path`` are copied
       from the arguments if both were given.
     * Without ``id`` but with ``host_id`` and ``path``: look the row up by
       that pair. When exactly one row matches the attributes come from it;
       otherwise the two arguments are copied onto the instance.
     * Anything else: do nothing.

     Nothing is returned.
     """
     have_pair = host_id and path

     if id:
         sql = 'SELECT `id`,`host_id`,`path` FROM `pages` WHERE `id`=' + db.escape_string(str(id))
     elif have_pair:
         sql = ''.join([
             'SELECT `id`,`host_id`,`path` FROM `pages` WHERE `host_id`="',
             db.escape_string(str(host_id)),
             '" AND `path`="',
             db.escape_string(path),
             '"',
         ])
     else:
         return

     db.query(sql)
     result = db.store_result()
     if result.num_rows() == 1:
         record = result.fetch_row()[0]
         self.id, self.host_id, self.path = record
     elif have_pair:
         # No unique row: fall back to the caller-supplied values.
         self.host_id = host_id
         self.path = path
示例#10
0
 def save(self):
     """Write this cache entry to the `cache` table.

     Updates the row identified by ``self.id`` when it is set; otherwise
     inserts a new row and stores ``db.insert_id()`` in ``self.id``.

     Returns True after the query was issued, False (no query) when
     ``visit_id`` or ``page_id`` is empty/falsy.
     """
     if not (self.visit_id and self.page_id):
         return False

     esc = db.escape_string
     visit_id = esc(str(self.visit_id))
     page_id = esc(str(self.page_id))
     encoding = esc(self.encoding)
     doctype = esc(self.doctype)
     validity = esc(str(self.validity))
     headers = esc(str(self.headers))
     content = esc(self.content)

     if self.id:
         db.query(''.join([
             'UPDATE `cache` SET `visit_id`="', visit_id,
             '", `page_id`="', page_id,
             '", `encoding`="', encoding,
             '", `doctype`="', doctype,
             '", `validity`="', validity,
             '", `headers`="', headers,
             '", `content`="', content,
             '" WHERE `id`=', esc(str(self.id)),
         ]))
     else:
         # Value order follows the column list: content comes before headers.
         values = [visit_id, page_id, encoding, doctype, validity, content, headers]
         db.query(
             'INSERT into `cache` (`visit_id`,`page_id`,`encoding`,`doctype`,`validity`,`content`,`headers`) VALUES ("'
             + '","'.join(values)
             + '")'
         )
         self.id = db.insert_id()
     return True
示例#11
0
 def get(self, id="", visit_id="", page_id="", browser=""):
     """Populate this cache entry from the database or from a browser object.

     * With ``id``: look the row up in `cache`; the attributes are only set
       when exactly one row matches (`headers` is not part of the SELECT).
     * Without ``id`` but with ``visit_id``, ``page_id`` and ``browser``:
       copy the ids, ``browser.headers`` and ``browser.page``; when
       ``browser.validate`` is truthy also copy the validator's charset and
       doctype, and its validity (as ``int``) if that is truthy.
     * Anything else: do nothing.

     Nothing is returned.
     """
     if id:
         db.query(
             "SELECT `id`,`visit_id`,`page_id`,`encoding`,`doctype`,`validity`,`content` FROM `cache` WHERE `id`="
             + db.escape_string(str(id))
         )
         result = db.store_result()
         if result.num_rows() == 1:
             record = result.fetch_row()[0]
             (self.id, self.visit_id, self.page_id, self.encoding,
              self.doctype, self.validity, self.content) = record
         return

     if not (visit_id and page_id and browser):
         return

     self.visit_id = visit_id
     self.page_id = page_id
     self.headers = browser.headers
     if browser.validate:
         self.encoding = browser.validator.charset
         self.doctype = browser.validator.doctype
         if browser.validator.validity:
             self.validity = int(browser.validator.validity)
     self.content = browser.page
示例#12
0
 def parse(self, browser):
     """Harvest `href` targets from a fetched page into the database.

     Every non-empty ``href=`` value found in ``browser.page`` is unquoted
     and then handled at most once per distinct target:

     * a value containing ``mailto:`` has its address (if any) inserted
       into `emails` together with ``browser.get_url()``;
     * any other value is made absolute against ``browser.host`` unless it
       already starts with ``http://``, and is inserted into `queue` when
       no row for that URL was created within the last 1209600 seconds
       (14 days).

     Nothing is returned.

     Fixes over the previous version: the unquoting regex really runs with
     ``re.S`` now (the flag used to be passed as ``re.sub``'s ``count``),
     and duplicates are tracked in a set instead of blanking/removing
     entries of the list while iterating it, which skipped links and cost
     quadratic ``index``/``count`` scans.
     """
     # NOTE(review): only `http://` counts as already absolute, so `https://`
     # links are still prefixed with the host - left as in the original.
     unquote = re.compile(r'^[\'"](.*)[\'"]$', re.S)
     seen = set()
     for raw in re.findall(r'href=([\'"][^\'"]+[\'"]|[^\n\s>]*)', browser.page, re.S):
         if raw.strip() == '':
             continue
         link = unquote.sub(r'\1', raw).strip()
         mail = re.search(r'mailto:(?P<email>[a-zA-Z0-9@_\-\.]*)', link, re.S)
         if not mail and not link.startswith('http://'):
             if len(link) > 0 and link[0] != '/':
                 link = '/'+link
             link = 'http://'+browser.host+link

         # Handle each distinct target only once per page.
         if link in seen:
             continue
         seen.add(link)

         if mail:
             print('mailto founded')  # debugging
             if mail.group('email'):
                 db.query('INSERT into `emails` (`time`,`url`,`email`) VALUES (UNIX_TIMESTAMP(),"'+db.escape_string(browser.get_url())+'","'+db.escape_string(mail.group('email'))+'")')
                 print('email inserted')  # debugging
         else:
             db.query('SELECT `id` from `queue` WHERE `url`="'+db.escape_string(link)+'" AND `created`>UNIX_TIMESTAMP()-1209600')
             if not db.store_result().num_rows():
                 db.query('INSERT into `queue` (`created`,`url`) VALUES (UNIX_TIMESTAMP(),"'+db.escape_string(link)+'")')
示例#13
0
 def delete(self):
     """Remove this page's row from `pages`.

     Returns True after the DELETE was issued, False (no query) when
     ``self.id`` is empty/falsy.
     """
     if not self.id:
         return False
     row_id = db.escape_string(str(self.id))
     db.query('DELETE from `pages` WHERE `id`=' + row_id)
     return True
示例#14
0
 def delete(self):
     """Remove this entry's row from `cache`.

     Returns True after the DELETE was issued, False (no query) when
     ``self.id`` is empty/falsy.
     """
     if not self.id:
         return False
     row_id = db.escape_string(str(self.id))
     db.query("DELETE from `cache` WHERE `id`=" + row_id)
     return True
示例#15
0
 def aborted(self):
     """Set `finished` to -1 on the `queue` row named by ``self.query['id']``."""
     row_id = db.escape_string(str(self.query['id']))
     db.query('UPDATE `queue` SET `finished`=-1 WHERE `id`=' + row_id)
示例#16
0
 def finished(self):
     """Stamp `finished` with ``UNIX_TIMESTAMP()`` on the `queue` row named by ``self.query['id']``."""
     row_id = db.escape_string(str(self.query['id']))
     db.query('UPDATE `queue` SET `finished`=UNIX_TIMESTAMP() WHERE `id`=' + row_id)