def getRandomURLlycos():
    """Pick images by feeding random words into Lycos.

    Based on webcollage by jwz.

    Three random words are quoted and joined into a Lycos query with a
    random ``start`` offset (1, 11, ..., 81).  The query URL is handed to
    ``pickFromSearchEngine`` (defined elsewhere); of the links it returns,
    only click-through redirects with a non-zero ``id`` are kept, and the
    unquoted value of their ``target`` parameter is collected.

    Returns:
        list: the unquoted target URLs, in the order the links were seen.
    """
    lycos_search_url = "http://lycospro.lycos.com/srchpro/?lpv=1&t=any&query="
    words = randomWord.randomWords(3)
    start = random.randrange(9) * 10 + 1
    quoted_words = ['"%s"' % (x.replace(' ', '%20')) for x in words]
    search_url = lycos_search_url + '%20'.join(quoted_words) + '&start=%d' % (start)

    subpages = pickFromSearchEngine(search_url)

    # Lycos plays exactly the same redirection game as hotbot.
    # Note that "id=0" is used for internal advertising links,
    # and 1+ are used for search results.
    # Lycos doesn't give the strange behaviour hotbot does.
    # The dots are escaped so that only the literal redirector host and
    # script name match, not any character in those positions.
    urlre = re.compile(
        r'^http://click\.hotbot\.com/director\.asp\?id=[1-9]\d*&target=([^&]+)')

    candidates = []
    for x in subpages:
        m = urlre.match(x)
        if m is None:
            continue
        url = urlUnquote(m.group(1))
        candidates.append(url)
        log.debug("candidate %s" % (url))
    return candidates
def getRandomURLhotbot():
    """Pick images by feeding random words into Hotbot.

    Based on jwz's code.

    Three random words are quoted and joined into a Hotbot query URL, which
    is passed to ``pickFromSearchEngine`` (defined elsewhere).  Links that
    start with ``/director.asp?target=`` are treated as search results and
    their unquoted ``target`` value is collected.

    Returns:
        list: the unquoted target URLs, in the order the links were seen.
    """
    # TODO:
    # Hotbot seems to have some anti-robot code.
    # At least it always returns 'no results' to this script, although
    # the same query URL leads to hundreds of hits in a browser.
    base_url = "http://hotbot.lycos.com/?SM=SC&DV=0&LG=any&FVI=1&DC=100&DE=0&SQ=1&TR=13&AM1=MC&MT="

    quoted_terms = []
    for word in randomWord.randomWords(3):
        quoted_terms.append('"%s"' % (word.replace(' ', '%20')))
    query_url = base_url + '%20'.join(quoted_terms)

    links = pickFromSearchEngine(query_url)

    # Hotbot plays redirection games too
    redirect_re = re.compile(r'^/director.asp\?target=([^&]+)')

    found = []
    for link in links:
        hit = redirect_re.match(link)
        if hit is not None:
            target = urlUnquote(hit.group(1))
            found.append(target)
            log.debug("candidate %s" % (target))
    return found
def getRandomURLaltavista():
    """Pick images by feeding random words into Alta Vista Text Search.

    Ported over from jwz's webcollage.

    Three random words are quoted and joined into an AltaVista query.  A
    random result page between 1 and 10 is chosen; for pages after the
    first, ``pgno`` and ``stq`` (``(page - 1) * 10``) parameters are
    appended.  The query URL is passed to ``pickFromSearchEngine`` (defined
    elsewhere), and links matching AltaVista's ``/r?ck_sm=...&uid=...&r=``
    redirect form are reduced to the unquoted value of ``r``.

    Returns:
        list: the unquoted target URLs, in the order the links were seen.
    """
    # Alternative endpoint, currently unused:
    # "http://www.altavista.com/cgi-bin/query?pg=q&text=yes&kl=XX&stype=stext&q="
    altavista_url = "http://www.altavista.com/sites/search/web?pg=q&kl=XX&search=Search&q="

    words = randomWord.randomWords(3)
    page = random.randrange(10) + 1
    quoted_words = ['"%s"' % (x.replace(' ', '%20')) for x in words]
    search_url = altavista_url + '%20'.join(quoted_words)
    if page > 1:
        # "stq" needs its "=" just like "pgno"; without it the parameter
        # is sent as e.g. "&stq10" and carries no value.
        search_url += "&pgno=%d&stq=%d" % (page, (page - 1) * 10)

    subpages = pickFromSearchEngine(search_url)

    # jwz: AltaVista is playing really nasty redirection games these
    # days: they filter your clicks through their site, but use
    # onMouseOver to make it look like they're not!  Well, it makes it
    # easier for us to identify search results...
    urlre = re.compile(r'^/r\?ck_sm=[a-zA-Z0-9]+.*\&uid=[a-zA-Z0-9]+\&r=(.*)')

    candidates = []
    for x in subpages:
        m = urlre.match(x)
        if m is None:
            continue
        url = urlUnquote(m.group(1))
        candidates.append(url)
        log.debug("candidate %s" % (url))
    return candidates
def getRandomImagealtavista():
    """Pick images by feeding random words into Alta Vista Image Search.

    From webcollage by jwz.

    Fifteen random words are quoted and joined into an AltaVista image
    query; a random result page between 1 and 10 is chosen, and for pages
    after the first ``pgno`` and ``stq`` parameters are appended.  Links
    returned by ``pickFromSearchEngine`` (defined elsewhere) are unwrapped
    in two steps: first the ``r`` value of the AltaVista redirect, then the
    ``url`` (referring page) and ``src`` (image) values inside it.  Entries
    whose page URL mentions a known AltaVista/partner host are dropped.

    Returns:
        list: unique ``(image, url)`` tuples.
    """
    altavista_images_url = ''.join(["http://www.altavista.com/cgi-bin/query",
                                    "?ipht=1",         # photos
                                    "&igrph=1",        # graphics
                                    "&iclr=1",         # color
                                    "&ibw=1",          # b&w
                                    "&micat=1",        # no partner sites
                                    "&imgset=1",       # no partner sites
                                    "&stype=simage",   # do image search
                                    "&mmW=1",          # unknown, but required
                                    "&q="])
    # Hosts that mark a result as an AltaVista builtin or partner link.
    spam_hosts = ('altavista.com',
                  'doubleclick.net',
                  'clicktomarket.com',
                  'viewimages.com',
                  'gettyimages.com')

    # TODO: hack around altavista adult filter
    words = randomWord.randomWords(15)
    page = random.randrange(10) + 1
    quoted_words = ['"%s"' % (x.replace(' ', '%20')) for x in words]
    search_url = altavista_images_url + '%20'.join(quoted_words)
    if page > 1:
        search_url += "&pgno=%d&stq=%d" % (page, (page - 1) * 10)

    subpages = pickFromSearchEngine(search_url)

    # we use a dictionary here to get rid of dupes
    candidates = {}

    # altavista is encoding their URLs now.
    urlre = re.compile(r'^/r\?ck_sm=[a-zA-Z0-9]+\&ref=[a-zA-Z0-9]+.*\&r=(.*)')
    framere = re.compile(r'.+&url=([^&]+)&.+&src=(http[^&]+)&stq=\d+')

    for x in subpages:
        m = urlre.match(x)
        if m is None:
            continue
        url = urlUnquote(m.group(1))

        # skip non-HTTP or relative URLs
        if not url.startswith('http://'):
            log.debug("no http:// url with candidate %s" % (url))
            continue

        m = framere.match(url)
        if m is None:
            log.debug("no imagematch with candidate %s (%s)" % (url, x))
            continue
        url = urlUnquote(m.group(1))
        image = urlUnquote(m.group(2))

        # skip altavista builtins
        if any(host in url for host in spam_hosts):
            log.debug("despammed candidate %s" % (url))
            continue

        candidates[(image, url)] = 1
        log.debug("candidate %s - %s" % (image, url))

    # Return a real list (not a dict view) like the sibling functions do.
    return list(candidates)
def getRandomURLaltavistanews():
    """Pick images by feeding random words into news.altavista.com.

    Two random words are quoted and joined into a news search URL, which is
    passed to ``pickFromSearchEngine`` (defined elsewhere).  Links matching
    the ``/r?ck_sm=...&r=`` redirect form are reduced to the unquoted value
    of ``r``.

    Returns:
        list: the unquoted target URLs, in the order the links were seen.
    """
    news_base = "http://news.altavista.com/search?nc=&q="

    # TODO: to find actual news we need more common words
    picked = randomWord.randomWords(2)
    terms = []
    for word in picked:
        terms.append('"%s"' % (word.replace(' ', '%20')))
    query_url = news_base + '%20'.join(terms)

    links = pickFromSearchEngine(query_url)
    redirect_re = re.compile(r'^/r\?ck_sm=[a-zA-Z0-9]+.*\&r=(.*)')

    results = []
    for link in links:
        matched = redirect_re.match(link)
        if matched is not None:
            target = urlUnquote(matched.group(1))
            results.append(target)
            log.debug("candidate %s" % (target))
    return results
def getRandomURLyahoonews():
    """Pick images by feeding random words into news.yahoo.com.

    Based on webcollage by jwz.

    Ten random words are quoted and joined into a Yahoo news-photo search
    URL, which is passed to ``pickFromSearchEngine`` (defined elsewhere).
    Only links that start with ``http://dailynews.yahoo.com/`` are kept;
    each kept link is unquoted as a whole.

    Returns:
        list: the unquoted URLs, in the order the links were seen.
    """
    yahoo_news_url = "http://search.news.yahoo.com/search/news_photos?&z=&n=100&o=o&2=&3=&p="
    # Literal prefix test: a regex with unescaped dots would also accept
    # look-alike hosts such as "dailynewsXyahoo.com".
    yahoo_news_prefix = 'http://dailynews.yahoo.com/'

    # TODO: to find actual news we need more common words
    words = randomWord.randomWords(10)
    quoted_words = ['"%s"' % (x.replace(' ', '%20')) for x in words]
    search_url = yahoo_news_url + '%20'.join(quoted_words)

    subpages = pickFromSearchEngine(search_url)

    candidates = []
    for x in subpages:
        # only accept URLs on Yahoo's news site
        if not x.startswith(yahoo_news_prefix):
            continue
        url = urlUnquote(x)
        candidates.append(url)
        log.debug("candidate %s" % (url))
    return candidates