Пример #1
0
def getRandomURLlycos():
    """Collect candidate URLs from a Lycos search on random words.

    Builds a query from three random words (each wrapped in double quotes),
    picks a random result offset and extracts the real target of every
    result link that goes through the click.hotbot.com redirector.

    Based on webcollage by jwz.

    Returns a list of unquoted target URLs, one per matching link, in the
    order the links were handed back by pickFromSearchEngine.
    """
    # Lycos plays exactly the same redirection game as Hotbot, but without
    # Hotbot's strange behaviour.  Redirects with "id=0" are internal
    # advertising links; ids of 1 and up are search results, which is why
    # the pattern insists on a leading non-zero digit.
    redirect_re = re.compile(r'^http://click.hotbot.com/director.asp\?id=[1-9]\d*&target=([^&]+)')
    base_url = "http://lycospro.lycos.com/srchpro/?lpv=1&t=any&query="

    terms = randomWord.randomWords(3)
    offset = random.randrange(9) * 10 + 1  # one of 1, 11, ..., 81

    # Quote every term and escape embedded blanks before joining them.
    quoted_terms = []
    for term in terms:
        quoted_terms.append('"%s"' % (term.replace(' ', '%20')))
    search_url = base_url + '%20'.join(quoted_terms) + '&start=%d' % (offset)

    found = []
    for link in pickFromSearchEngine(search_url):
        hit = redirect_re.match(link)
        if hit is not None:
            target = urlUnquote(hit.group(1))
            found.append(target)
            log.debug("candidate %s" % (target))

    return found
Пример #2
0
def getRandomURLhotbot():
    """Collect candidate URLs from a Hotbot search on random words.

    Based on jwz's code.

    Returns a list with the unquoted ``target`` parameter of every link
    that goes through Hotbot's ``/director.asp`` redirector.
    """
    # TODO:
    # Hotbot seems to have some anti-robot code.  At least this script
    # always gets 'no results' back, although the very same query URL
    # leads to hundreds of hits in a browser.
    query_prefix = "http://hotbot.lycos.com/?SM=SC&DV=0&LG=any&FVI=1&DC=100&DE=0&SQ=1&TR=13&AM1=MC&MT="

    phrases = ['"%s"' % (word.replace(' ', '%20'))
               for word in randomWord.randomWords(3)]
    search_url = query_prefix + '%20'.join(phrases)

    links = pickFromSearchEngine(search_url)

    # Hotbot plays redirection games too: the real destination is carried
    # in the "target" parameter of a site-relative redirect link.
    director_re = re.compile(r'^/director.asp\?target=([^&]+)')
    results = []
    for href in links:
        found = director_re.match(href)
        if not found:
            # not a search result link
            continue
        destination = urlUnquote(found.group(1))
        results.append(destination)
        log.debug("candidate %s" % (destination))

    return results
Пример #3
0
def getRandomURLaltavista():
    """Pick images by feeding random words into Alta Vista Text Search.

    Ported over from jwz's webcollage.

    Three random words are searched as quoted phrases and a random result
    page between 1 and 10 is requested.  Links that match Alta Vista's
    click-tracking redirect are decoded into their real destination.

    Returns a list of unquoted target URLs, one per matching link.
    """

    altavista_url_1 = "http://www.altavista.com/cgi-bin/query?pg=q&text=yes&kl=XX&stype=stext&q="
    altavista_url_2 = "http://www.altavista.com/sites/search/web?pg=q&kl=XX&search=Search&q="
    # Only the second endpoint is used; the first is kept as an alternative.
    altavista_url = altavista_url_2

    words = randomWord.randomWords(3)
    page = random.randrange(10) + 1
    search_url = altavista_url + '%20'.join(['"%s"' % (x.replace(' ', '%20')) for x in words])

    if page > 1:
        # Both parameters need a "name=value" form; without the "=" after
        # "stq" the number was glued onto the parameter name instead of
        # being sent as its value.  (stq is presumably the result offset.)
        search_url += "&pgno=%d&stq=%d" % (page, (page-1) * 10)

    subpages = pickFromSearchEngine(search_url)
    candidates = []
    # jwz: Alta Vista is playing really nasty redirection games these
    # days: they filter your clicks through their site, but use
    # onMouseOver to make it look like they're not!  Well, it makes it
    # easier for us to identify search results...
    urlre = re.compile(r'^/r\?ck_sm=[a-zA-Z0-9]+.*\&uid=[a-zA-Z0-9]+\&r=(.*)')
    for x in subpages:
        m = urlre.match(x)
        if m is None:
            # not a redirected search result
            continue
        url = urlUnquote(m.group(1))
        candidates.append(url)
        log.debug("candidate %s" % (url))

    return candidates
Пример #4
0
def getRandomImagealtavista():
    """Pick images by feeding random words into Alta Vista Image Search.

    From webcollage by jwz.

    Fifteen random words are searched as quoted phrases and a random
    result page between 1 and 10 is requested.  Each redirected result
    link is decoded twice: first into the frame URL, then into the page
    URL and the image URL embedded in it.

    Returns a list of unique ``(image, url)`` tuples, where ``image`` is
    the unquoted image address and ``url`` the unquoted address of the
    page it was found on.
    """
    altavista_images_url = ''.join(["http://www.altavista.com/cgi-bin/query",
                                     "?ipht=1",       # photos
                                     "&igrph=1",      # graphics
                                     "&iclr=1",       # color
                                     "&ibw=1",        # b&w
                                     "&micat=1",      # no partner sites
                                     "&imgset=1",     # no partner sites
                                     "&stype=simage", # do image search
                                     "&mmW=1",        # unknown, but required
                                     "&q="])

    # TODO: hack around altavista adult filter
    words = randomWord.randomWords(15)
    page = random.randrange(10) + 1
    search_url = altavista_images_url + '%20'.join(['"%s"' % (x.replace(' ', '%20')) for x in words])

    if page > 1:
        search_url += "&pgno=%d&stq=%d" % (page, (page-1) * 10)

    subpages = pickFromSearchEngine(search_url)
    # we use a dictionary here to get rid of dupes
    candidates = {}
    # altavista is encoding their URLs now.
    urlre = re.compile(r'^/r\?ck_sm=[a-zA-Z0-9]+\&ref=[a-zA-Z0-9]+.*\&r=(.*)')
    # pulls the page URL (group 1) and the image URL (group 2) out of the
    # decoded frame URL
    framere = re.compile(r'.+&url=([^&]+)&.+&src=(http[^&]+)&stq=\d+')
    for x in subpages:
        m = urlre.match(x)
        if m is None:
            continue
        url = urlUnquote(m.group(1))
        #  skip non-HTTP or relative URLs
        if not url.startswith('http://'):
            log.debug("no http:// url with candidate %s" % (url))
            continue
        m = framere.match(url)
        if m is None:
            log.debug("no imagematch with candidate %s (%s)" % (url, x))
            continue
        url = urlUnquote(m.group(1))
        image = urlUnquote(m.group(2))
        # skip altavista builtins
        if url.find('altavista.com') != -1 \
           or url.find('doubleclick.net') != -1 \
           or url.find('clicktomarket.com') != -1 \
           or url.find('viewimages.com') != -1 \
           or url.find('gettyimages.com') != -1:
            log.debug("despammed candidate %s" % (url))
            continue

        candidates[(image, url)] = 1
        log.debug("candidate %s - %s" % (image, url))

    # list() yields a real list of the keys on every Python version,
    # whereas keys() is only a view on Python 3.
    return list(candidates)
Пример #5
0
def getRandomURLaltavistanews():
    """Collect candidate URLs from a news.altavista.com search on random words.

    Returns a list holding the unquoted ``r=`` target of every link that
    matches Alta Vista's ``/r?ck_sm=...`` redirect form.
    """
    news_search_url = "http://news.altavista.com/search?nc=&q="

    # TODO: to find actual news we need more common words
    terms = randomWord.randomWords(2)
    quoted_terms = []
    for term in terms:
        quoted_terms.append('"%s"' % (term.replace(' ', '%20')))
    search_url = news_search_url + '%20'.join(quoted_terms)

    links = pickFromSearchEngine(search_url)

    # Result links are routed through a site-relative redirect; the real
    # destination is whatever follows "&r=".
    redirect_re = re.compile(r'^/r\?ck_sm=[a-zA-Z0-9]+.*\&r=(.*)')
    targets = []
    for link in links:
        hit = redirect_re.match(link)
        if hit is not None:
            target = urlUnquote(hit.group(1))
            targets.append(target)
            log.debug("candidate %s" % (target))

    return targets
Пример #6
0
def getRandomURLyahoonews():
    """Collect candidate URLs from a news.yahoo.com photo search on random words.

    Based on webcollage by jwz.

    Returns a list with the unquoted form of every link that starts with
    ``http://dailynews.yahoo.com/``.
    """
    photo_search_url = "http://search.news.yahoo.com/search/news_photos?&z=&n=100&o=o&2=&3=&p="

    # TODO: to find actual news we need more common words
    phrases = ['"%s"' % (word.replace(' ', '%20'))
               for word in randomWord.randomWords(10)]
    search_url = photo_search_url + '%20'.join(phrases)

    links = pickFromSearchEngine(search_url)

    news_site_re = re.compile(r'^http://dailynews.yahoo.com/')
    accepted = []
    for link in links:
        # only accept URLs on Yahoo's news site
        if not news_site_re.match(link):
            continue
        # unlike the redirecting engines, the whole link is the result
        page = urlUnquote(link)
        accepted.append(page)
        log.debug("candidate %s" % (page))

    return accepted