import re

# Noti_db and crawl_url are assumed to be defined elsewhere in this project.

class Crawler:
    '''Constructs a list of (URL, pattern) pairs from the expressions on WATCH_TAB.'''

    def __init__(self):
        self.db = Noti_db('watch.db')
        self.crawled = []
        url_pat_cur = self.db.get_url_pattern()
        url_pattern_pair = url_pat_cur.fetchone()
        while url_pattern_pair:
            if url_pattern_pair[0][-1] != '*':
                self.crawled.append(url_pattern_pair)
            else:
                base_url = url_pattern_pair[0][:-1]  # strip the trailing asterisk
                # Convert the wildcard to a standard regex; re.escape keeps
                # dots and other metacharacters in the URL literal.
                url_pattern = re.compile(re.escape(base_url) + '.*', re.I)
                # The base URL itself should be included too.
                self.crawled.append((base_url, url_pattern_pair[1]))
                self.crawled.extend(
                    (x, url_pattern_pair[1])
                    for x in crawl_url(base_url, url_pattern))
            url_pattern_pair = url_pat_cur.fetchone()
        # Remove duplicates while preserving order.
        temp = self.crawled
        self.crawled = []
        for x in temp:
            if x not in self.crawled:
                self.crawled.append(x)

    def get_crawled(self):
        return self.crawled
def submit():
    global urlEntryWidget
    global patternEntryWidget
    global disappEntryWidget
    db = Noti_db('watch.db')
    if urlEntryWidget.get().strip() == "":
        return
    if patternEntryWidget.get().strip() != "":
        db.add_url_pattern(urlEntryWidget.get().strip(),
                           patternEntryWidget.get().strip())
    if disappEntryWidget.get().strip() != "":
        db.add_url_disapp(urlEntryWidget.get().strip(),
                          disappEntryWidget.get().strip())
    # Clear the inputs. The URL field is left untouched to allow
    # entering multiple words for the same URL.
    #urlEntryWidget.delete(0, Tkinter.END)
    patternEntryWidget.delete(0, Tkinter.END)
    disappEntryWidget.delete(0, Tkinter.END)
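# A minimal usage sketch under stated assumptions: Noti_db's 'watch.db'
# already holds rows added through submit(), and crawl_url() is available
# in this module. It builds the crawl list and prints each (url, pattern)
# pair. The demo_crawl name is hypothetical, not part of the original code.
def demo_crawl():
    crawler = Crawler()
    for url, pattern in crawler.get_crawled():
        print('%s -> %s' % (url, pattern))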