Example #1
import re  # needed for the wildcard-to-regex conversion below

class Crawler:
    '''Builds a list of (url, pattern) pairs from the expressions on WATCH_TAB.

    Noti_db and crawl_url are assumed to be provided elsewhere in the project.
    '''
    def __init__(self):
        self.db = Noti_db('watch.db')
        self.crawled = []
        url_pat_cur = self.db.get_url_pattern()
        url_pattern_pair = url_pat_cur.fetchone()
        while url_pattern_pair:
            if url_pattern_pair[0][-1] != '*':
                # no wildcard: watch this exact URL
                self.crawled.append(url_pattern_pair)
            else:
                base_url = url_pattern_pair[0][:-1]  # strip the trailing asterisk
                url_pattern = re.compile(base_url + '.*', re.I)  # convert the wildcard to a regex
                self.crawled.append((base_url, url_pattern_pair[1]))  # the base URL itself should be watched too
                # every URL discovered under base_url inherits the same pattern
                self.crawled.extend((x, url_pattern_pair[1]) for x in crawl_url(base_url, url_pattern))
            url_pattern_pair = url_pat_cur.fetchone()
        
        # remove duplicates while preserving the original order
        temp = self.crawled
        self.crawled = []
        for x in temp:
            if x not in self.crawled:
                self.crawled.append(x)

    def get_crawled(self):
        return self.crawled
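
The de-duplication loop at the end of __init__ is quadratic, because `x not in self.crawled` rescans the growing list on every iteration. If the (url, pattern) pairs are hashable, a set of seen pairs keeps the pass linear while preserving order. This is a generic sketch, not part of the original project:

def dedupe_preserving_order(pairs):
    """Return the pairs with duplicates removed, keeping first occurrences."""
    seen = set()
    unique = []
    for pair in pairs:
        if pair not in seen:   # set membership is O(1) on average
            seen.add(pair)
            unique.append(pair)
    return unique

print(dedupe_preserving_order([('a', 'x'), ('b', 'y'), ('a', 'x')]))
# [('a', 'x'), ('b', 'y')]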
Example #2
import Tkinter  # Tkinter.END is used below when clearing the entry widgets

def submit():
    # Noti_db and the three entry widgets are defined elsewhere in the project
    global urlEntryWidget
    global patternEntryWidget
    global disappEntryWidget

    db = Noti_db('watch.db')

    url = urlEntryWidget.get().strip()
    if url == "":
        return  # a URL is required
    if patternEntryWidget.get().strip() != "":
        db.add_url_pattern(url, patternEntryWidget.get().strip())
    if disappEntryWidget.get().strip() != "":
        db.add_url_disapp(url, disappEntryWidget.get().strip())

    # clear the inputs so another entry can be typed
    #urlEntryWidget.delete(0, Tkinter.END)  # left commented out so more entries can be added for the same URL
    patternEntryWidget.delete(0, Tkinter.END)
    disappEntryWidget.delete(0, Tkinter.END)
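
submit() expects three global Tkinter entry widgets and a button that triggers it. The project's actual GUI code is not shown here, so the following is only a minimal sketch; the window title, button label, and layout are assumptions:

import Tkinter

root = Tkinter.Tk()
root.title('Watch list')

# the three globals that submit() reads
urlEntryWidget = Tkinter.Entry(root, width=40)
patternEntryWidget = Tkinter.Entry(root, width=40)
disappEntryWidget = Tkinter.Entry(root, width=40)

for entry in (urlEntryWidget, patternEntryWidget, disappEntryWidget):
    entry.pack()

# the button only calls submit(); all validation happens inside it
Tkinter.Button(root, text='Add', command=submit).pack()

root.mainloop()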