import datetime  # needed by addignore below; skip if already imported at the top of the module

def getlinks(urlid, url, siteurl, baseurl):
    """Scrape the page at url and return its links, minus any on the ignore list."""
    iurls = ignoreurls()
    ignorelinks = iurls.getallbyscrapeurlid(urlid)
    _pagelinks = _get_page_links(url)
    pagelinks = []
    for pagelink in _pagelinks:
        # Normalize each anchor to an absolute, lowercased URL and pair it
        # with its stripped link text.
        link = decodelink(pagelink, siteurl, baseurl)
        pagelinks.append((link.lower(), pagelink.get_text().strip()))
    # Diff the lists, dropping any link that appears on the ignore list.
    links = [x for x in pagelinks if x not in ignorelinks]
    return links
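# The membership test above rescans ignorelinks once per page link (O(n*m)).
# A minimal sketch of the same diff with O(1) lookups, building a set first.
# This assumes the rows from getallbyscrapeurlid are hashable and comparable
# to the (url, text) tuples built above; their actual shape isn't shown here.
def _diff_links(pagelinks, ignorelinks):
    ignored = set(ignorelinks)  # one pass to build the lookup set
    return [x for x in pagelinks if x not in ignored]  # O(1) per membership test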
def get_ignore_list():
    """Return the URL string from every row on the ignore list."""
    iurls = ignoreurls()
    urls = iurls.getall()
    retval = []
    for url in urls:
        ignoreurlid, ignoreurl, ignoredt, scrapeurlid = url
        retval.append(ignoreurl)
    return retval  # was `return urls`, which returned the raw rows instead of the collected URLs
def addignore(link, scrapeurlid):
    """Add link to the ignore list, stamped with today's date in ISO format."""
    dt = datetime.datetime.now().date().isoformat()
    iurls = ignoreurls()
    iurls.add(link, dt, scrapeurlid)
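# A hypothetical usage sketch tying the three helpers together. The id and
# URLs below are made up for illustration; the real call sites live elsewhere
# in the scraper.
if __name__ == "__main__":
    links = getlinks(1, "http://example.com/page", "example.com", "http://example.com")
    for link, text in links:
        print("{0} -> {1}".format(link, text))
    addignore("http://example.com/login", 1)  # stop following the login page
    print(get_ignore_list())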