示例#1
0
def getlinks(urlid,url,siteurl,baseurl):
    """Return the links found at ``url`` that are not on the ignore list.

    The ignore list is fetched via ``ignoreurls().getallbyscrapeurlid(urlid)``.
    Every element returned by ``_get_page_links(url)`` is turned into a
    ``(link, text)`` tuple, where ``link`` is the lower-cased result of
    ``decodelink(element, siteurl, baseurl)`` and ``text`` is the element's
    ``get_text()`` with surrounding whitespace stripped.

    Tuples that compare equal to an entry of the ignore list are dropped;
    the remaining tuples are returned in page order.

    NOTE(review): the filter compares whole ``(link, text)`` tuples against
    the ignore-list entries -- confirm that ``getallbyscrapeurlid`` returns
    rows of that same shape.
    """
    ignored = ignoreurls().getallbyscrapeurlid(urlid)

    candidates = [
        (decodelink(anchor, siteurl, baseurl).lower(), anchor.get_text().strip())
        for anchor in _get_page_links(url)
    ]

    # Keep only the candidates that do not appear in the ignore list.
    return [entry for entry in candidates if entry not in ignored]
示例#2
0
def get_ignore_list():
    """Return the ignore URL of every row stored in ``ignoreurls``.

    Each row returned by ``ignoreurls().getall()`` is unpacked as
    ``(ignoreurlid, ignoreurl, ignoredt, scrapeurlid)``; only the
    ``ignoreurl`` field is kept.

    Returns:
        A list of the ``ignoreurl`` values, in the order ``getall()``
        yields them.

    Raises:
        ValueError: if a row does not unpack into exactly four fields.
    """
    # The previous version built this list but then returned the raw rows
    # (``urls``) instead, so the extracted URLs were never used.
    return [
        ignoreurl
        for _ignoreurlid, ignoreurl, _ignoredt, _scrapeurlid in ignoreurls().getall()
    ]
示例#3
0
def addignore(link,scrapeurlid):
    """Record ``link`` in the ignore list for ``scrapeurlid``.

    The entry is stamped with today's local date as an ISO-8601 string
    (``YYYY-MM-DD``) and passed to ``ignoreurls().add``.
    """
    today = datetime.date.today().isoformat()
    ignoreurls().add(link, today, scrapeurlid)