def main():
    """Re-run ``output`` and ``save`` over selected rows of ``fatwallet``.

    Opens a connection to the ``resys`` database on ``localhost`` and reads
    every row of the ``fatwallet`` table. A row is selected when its ``url``
    either contains ``?start=0`` or is exactly
    ``http://www.fatwallet.com/forums/hot-deals/<digits>/``. Each selected
    row is passed to ``output(row)`` and then to ``save(row, db)``.
    """
    host = 'localhost'
    database = 'resys'
    user = '******'
    password = ''
    db = Connection(host, database, user, password)

    # Raw string with escaped dots so that '.' only matches a literal dot;
    # compiled once because the same pattern is applied to every row.
    hot_deals_url = re.compile(
        r'^http://www\.fatwallet\.com/forums/hot-deals/[0-9]+/$')

    for row in db.query('select * from fatwallet'):
        url = row.url
        # NOTE(review): both calls are taken to belong to the filtered
        # branch -- confirm against the original layout.
        if '?start=0' in url or hot_deals_url.match(url) is not None:
            output(row)
            save(row, db)
class FatwalletSpider(Spider):
    """Spider that also writes every saved page into the ``fatwallet`` table."""

    def __init__(self, beginurl, reg, priorcontent, pages, downloadFolder,
                 host, database, user=None, password=None):
        """Initialise the base spider and open the database connection.

        The first five arguments are forwarded unchanged to the parent
        constructor; ``host``, ``database``, ``user`` and ``password`` are
        passed to ``Connection`` and the result is kept as ``self.db``.
        """
        super(FatwalletSpider, self).__init__(
            beginurl, reg, priorcontent, pages, downloadFolder)
        self.db = Connection(host, database, user, password)

    def saveHTML(self, url, html, path):
        """Store ``html`` in the database, then let the parent save it.

        The row holds the original markup, a copy restricted to characters
        found in ``string.printable``, and the value of ``self.md5(html)``.

        NOTE(review): ``path`` is accepted but neither used here nor
        forwarded to the parent ``saveHTML`` -- confirm this is intended.
        """
        digest = self.md5(html)
        printable_only = ''.join(ch for ch in html if ch in string.printable)
        self.insertHTML2DB('fatwallet', url, html, printable_only, digest)
        super(FatwalletSpider, self).saveHTML(url, html)

    def insertHTML2DB(self, TABLE, URL, RAW, TEXT, MD5):
        """Insert one page into ``TABLE`` through ``self.db.insert``.

        The values are sent as the keyword columns ``url``, ``raw_html``,
        ``text_html`` and ``md5``.
        """
        self.db.insert(
            TABLE,
            url=URL,
            raw_html=RAW,
            text_html=TEXT,
            md5=MD5,
        )
def __init__(self, beginurl, reg, priorcontent, pages, downloadFolder,
             host, database, user=None, password=None):
    """Initialise the base spider and open the database connection.

    ``beginurl``, ``reg``, ``priorcontent``, ``pages`` and
    ``downloadFolder`` go to the parent constructor in that order; the
    remaining arguments are handed to ``Connection`` and the result is
    stored on ``self.db``.
    """
    # Arguments owned by the base class, kept in the order it expects.
    spider_args = (beginurl, reg, priorcontent, pages, downloadFolder)
    super(FatwalletSpider, self).__init__(*spider_args)

    self.db = Connection(host, database, user, password)