def mejeej(url):
    req = requests.get(url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
    soupfilter = SoupStrainer("div", {"class": "photo-wrap"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    try:
        res_url = soup.a['href']
        if debug_app:
            logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app:
            logr('error')
    return res_url

def stilettocouture(url):
    req = requests.get(url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
    soupfilter = SoupStrainer("div", {"class": "PhotoWrapper"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    try:
        res_url = soup.a.img['src']
        if debug_app:
            logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app:
            logr('error')
    return res_url

def jjperfectlegs(url):
    req = requests.get(url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
    soupfilter = SoupStrainer("section", {"class": "post"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    try:
        res_url = soup.img['src']
        if debug_app:
            logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app:
            logr('error')
    soup.decompose()
    return res_url

def closetheels(url):
    req = requests.get(url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
    soupfilter = SoupStrainer("div", {"id": "container"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    try:
        res_url = soup.img['src']
        if debug_app:
            logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app:
            logr('error')
    soup.decompose()
    return res_url

def addicttosex(url):
    req = requests.get(url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
    soupfilter = SoupStrainer("div", {"class": "photo-wrapper-inner"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    try:
        res_url = soup.img['src']
        if debug_app:
            logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app:
            logr('error')
    soup.decompose()
    return res_url

def therubik(url):
    req = requests.get(url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
    soupfilter = SoupStrainer("div", {"class": "post-content"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    try:
        res_url = soup.img['src']
        if debug_app:
            logr('img link ' + res_url)
    except:
        if debug_app:
            logr('error')
        res_url = 'error'
    soup.decompose()
    return res_url

def sexyonheels(url):
    req = requests.get(url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
    soup = BeautifulSoup(req.text, "lxml")
    tags = soup.find_all('div')
    soup.decompose()
    pattern = re.compile('src="?\'?([^"\'>]*)')
    try:
        res_url = re.findall(pattern, str(tags[5]))[0]
        if debug_app:
            logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app:
            logr('error')
    return res_url

def nicelegsandperspectives(url):
    req = requests.get(url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
    soupfilter = SoupStrainer("ul", {"id": "posts"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    tags = soup.find_all("section", class_="top")
    soup.decompose()
    try:
        res_url = tags[0].img['src']
        if debug_app:
            logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app:
            logr('error')
    return res_url

def haawheels(url):
    req = requests.get(url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
    soupfilter = SoupStrainer("div", {"class": "autopagerize_page_element"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    tags = soup.find_all("img")
    soup.decompose()
    try:
        res_url = tags[0]['src']
        if debug_app:
            logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app:
            logr('error')
    return res_url

def heelhunter(url):
    req = requests.get(url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
    soupfilter = SoupStrainer("article")
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    tags = soup.find_all("a")
    soup.decompose()
    try:
        res_url = tags[0].img['src']
        if debug_app:
            logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app:
            logr('error')
    return res_url

def naughtylegs(url):
    req = requests.get(url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
    soupfilter = SoupStrainer("div", {"id": "posts"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    try:
        # bypass images that are avatars, e.g.
        # http://apollo:3080/crawler/crawler.2/peeptoeheels.tumblr.com/avatar_f570d9426951_16.png
        res_url = soup.img['src']
        if debug_app:
            logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app:
            logr('error')
    soup.decompose()
    return res_url

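# Note: the per-site handlers above all share the same fetch -> SoupStrainer -> pick-one-
# attribute shape and differ only in the tag/attrs they strain on and the attribute path
# they read. A minimal sketch of that common shape is below; generic_extract and its
# (tag, attrs, picker) parameters are illustrative names only and are not used elsewhere
# in this script.
def generic_extract(url, tag, attrs, picker):
    req = requests.get(url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
    soup = BeautifulSoup(req.text, "lxml", parse_only=SoupStrainer(tag, attrs))
    try:
        res_url = picker(soup)  # e.g. lambda s: s.img['src'] or lambda s: s.a['href']
        if debug_app:
            logr('img link ' + res_url)
    except Exception:
        res_url = 'error'
        if debug_app:
            logr('error')
    return res_url
# e.g. closetheels(url) above is roughly equivalent to
# generic_extract(url, "div", {"id": "container"}, lambda s: s.img['src'])
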
def get_url_archives(url_master):
    global site_errors
    url_array = []
    parsed = urlparse(url_master)
    url_parsed = parsed.scheme + '://' + parsed.netloc
    soupfilter = SoupStrainer('nav', {'class': 'months'})
    req = requests.get(url_master, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
    soup = BeautifulSoup(req.text, 'lxml', parse_only=soupfilter)
    url_array = [url_parsed + url_path['href'] for url_path in soup.find_all('a')]
    if len(url_array) == 0:
        site_errors.append(url_master)
        logr('Site %s not working properly' % url_master)
    return url_array

def get_url_archives_old(url_master):
    global site_errors
    br = mechanize.Browser()
    br.set_handle_robots(False)
    url_array = []
    try:
        br.open(url_master)
        months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August',
                  'September', 'October', 'November', 'December',
                  'Janvier', 'Fevrier', 'mars', 'Avril', 'Mai', 'Juin', 'Juillet', 'Aout',
                  'Septembre', 'Octobre', 'Novembre', 'Decembre',
                  'Ocak', 'Subat', 'Mart', 'Nisan', 'Mayis', 'Haziran', 'Temmuz', 'Agustos',
                  'Eylul', 'Ekim', 'Kasim', 'Aralik']
        for item in br.links():
            if item.text in months:
                parsed = urlparse(item.base_url)
                url_parsed = parsed.scheme + '://' + parsed.netloc
                url = url_parsed + item.url
                url_array.append(url)
    except:
        site_errors.append(url_master)
        logr('Site %s not working properly' % url_master)
    return url_array

def heelsland(url):
    req = requests.get(url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
    # typical tumblr post - easy to grab pictures
    soupfilter = SoupStrainer("div", {"class": "post load"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    posts = [post['href'] for post in soup.find_all('a') if re.search('/image/', post['href'])]
    if len(posts) > 0:
        for post_url in posts:
            req = requests.get(post_url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
            soup = BeautifulSoup(req.text, "lxml")
            img_tags = soup.find_all("div", {"id": "content-wrapper"})
            img_array = [img_url.img['data-src'] for img_url in img_tags]
            if debug_app:
                for item in img_array:
                    logr('img link ' + item)
            return img_array
    soup = BeautifulSoup(req.text, "lxml")
    # check if it's a photoset post
    if len(soup.find_all("div", {"class": "html_photoset"})):
        # this returns a javascript snippet which passes the imgs as parameters.
        # this part parses the js code and extracts the urls.
        img_tags = soup.find_all('script')
        jscript_tag = str(img_tags[7]).split('\n')[-1].split('\t')[-1].strip('</script>')
        json_data = json.loads(jscript_tag)
        if debug_app:
            for item in json_data:
                logr('img link ' + item)
        return json_data['image']['@list']
        # img_array = [img_url for img_url in json_data['image']['@list'] if re.match('^http://.+', img_url)]
        # return(img_array)
    # if other algorithms failed it defaults to download the first instance image
    img_tag = soup.find("div", {"class": "photo"})
    if debug_app:
        logr('img link ' + img_tag.img['src'])
    return img_tag.img['src']

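# The photoset branch in heelsland() above depends on the script tag sitting at a fixed
# index (img_tags[7]) and on its tab/newline layout. If the post page exposes the same
# data in a <script type="application/ld+json"> block (an assumption about the page, not
# something this script checks), a less position-dependent lookup could look like this
# sketch; extract_photoset_images is an illustrative helper and is not called anywhere.
def extract_photoset_images(soup):
    ld_tag = soup.find('script', type='application/ld+json')
    if ld_tag is None or not ld_tag.string:
        return []
    try:
        json_data = json.loads(ld_tag.string)
    except ValueError:
        return []
    # same '@list' field that heelsland() reads out of the parsed snippet
    image_field = json_data.get('image', {})
    if isinstance(image_field, dict):
        return image_field.get('@list', [])
    return []
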
def evil(url):
    req = requests.get(url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
    soup = BeautifulSoup(req.text, "lxml")
    tags = soup.find_all('meta')
    soup.decompose()
    if len(tags) == 63:
        res_url = tags[39]['content']
        if re.match(img_pattern, res_url):
            if debug_app:
                logr('img link ' + res_url)
        else:
            res_url = 'error'
            if debug_app:
                logr('error')
    else:
        res_url = 'error'
        if debug_app:
            logr('error')
    return res_url

def heelsfromhell(url):
    req = requests.get(url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
    soupfilter = SoupStrainer("div", {"id": "content"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    tags = soup.find_all('a')
    nxt_soup = BeautifulSoup(str(tags[0]), "lxml")
    try:
        res_url = nxt_soup.img['src']
        if debug_app:
            logr('img link ' + res_url)
    except:
        tags = soup.find_all('p')
        nxt_soup = BeautifulSoup(str(tags[0]), "lxml")
        try:
            res_url = nxt_soup.img['src']
            if debug_app:
                logr('img link ' + res_url)
        except:
            if debug_app:
                logr('error')
            res_url = 'error'
    soup.decompose()
    return res_url

parser = argparse.ArgumentParser(prog='tumblrbot.py', description='Tumblr Image Downloader')
parser.add_argument('-x', '--proxy', nargs='?', const='dynamic', default='none')
parser.add_argument('-d', '--debug_app', action="store_true", help="prints debug messages")
parser.add_argument('-u', '--url', nargs='?', const='dynamic', default='none')
parser.add_argument('-f', '--file', nargs='?', const='dynamic', default='/root/xscripts/tumblrbot/urls')
parser.add_argument("-r", "--random", action="store_true", help="randomizes playlist")
args = parser.parse_args()

if os.path.isfile(args.file) or re.match(r'^http://.+\.', args.url):
    bot(args.file, args.proxy, args.random, args.url, args.debug_app)
else:
    logr('not a valid download_list file or url')

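# Example invocations (example.tumblr.com and the proxy address are placeholders):
#   python tumblrbot.py -f /root/xscripts/tumblrbot/urls        # crawl every blog in the list file
#   python tumblrbot.py -u http://example.tumblr.com -d         # crawl a single blog with debug logging
#   python tumblrbot.py -x http://127.0.0.1:8080                # route requests through an HTTP proxy
#   -r is declared by the parser as "randomizes playlist"
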
def bot(url_list, opt_proxy, random_mode, target_url, debug):
    script_start = timer()
    global complete_path
    global tumblrblog
    # global image_link
    global dbconn_tumblr
    global dbconn_smp
    global debug_app
    global my_proxies
    global my_headers
    global my_timeout
    global site_errors
    global image_counter
    global tot_images
    global page_errors
    global tot_errors
    global imgskiped
    global parse_only
    global img_pattern  # evil() validates extracted urls against this pattern

    debug_app = debug
    if opt_proxy == 'none':
        my_proxies = {}
    else:
        my_proxies = {"http": opt_proxy}

    start_path = '/mnt/vol1/crawler/crawler.4/'
    credentials_path = os.path.join(work_folder, 'api_settings', 'config')
    user_agent_string = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:43.0) Gecko/20100101 Firefox/43.0'
    my_headers = {
        'User-Agent': user_agent_string,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': "http://www.tumblr.com"
    }
    my_timeout = 10
    image_counter = 0
    tot_images = 0
    tot_errors = 0
    imgskiped = 0
    max_pages_per_blog = 99
    max_images_per_page = 999999
    sleep_time = 5
    site_errors = []

    # Database Connection Setup
    db_server = 'prometheus'
    (db_user, db_pass) = load_config(credentials_path)['databases'][db_server]
    dbconn_tumblr = MySQLdb.connect(host=db_server, user=db_user, passwd=db_pass, db='tumblr')
    dbconn_smp = MySQLdb.connect(host=db_server, user=db_user, passwd=db_pass, db='samplesdb')
    dbconn_tumblr.autocommit(True)
    dbconn_smp.autocommit(True)
    ###########################

    img_pattern = re.compile(r'.+\.(jpg|png|gif)$')
    today = datetime.datetime.now()

    if target_url != 'none':
        sites_list = [target_url]
    else:
        sites_list = open(url_list).read()
        sites_list = [line for line in sites_list.split('\n') if re.match('^http://.+', line)]

    for tumblrblog in sites_list:
        page_counter = 1
        image_counter = 1
        complete_path = start_path + str(urlparse(tumblrblog).hostname)
        ensure_dir(complete_path)
        lista_paginas = get_url_archives(tumblrblog)
        if len(lista_paginas) == 0:
            continue
        for pagina in lista_paginas:
            # url archive pages by month
            parsed_obj = urlparse(pagina)
            url_path = parsed_obj.path
            url_path_split = url_path.split('/')
            url_year = url_path_split[2]
            url_month = url_path_split[3]
            if (str(today.month) == url_month) and (str(today.year) == url_year):
                pass
            else:
                rows_pages = check_db_page(pagina)
                if len(rows_pages) > 0:
                    logr('page match')
                    continue
            logr('Looking at Page %s ...' % pagina)
            page_errors = 0
            logr('Getting list of urls')
            list_image_links = find_image_urls(pagina)
            for this_img_link in list_image_links:
                if isinstance(this_img_link, list):
                    for this_instance in this_img_link:
                        download_images(this_instance, tumblrblog)
                    page_counter += len(this_img_link)
                else:
                    download_images(this_img_link, tumblrblog)
                    page_counter += 1
                if image_counter > max_images_per_page:
                    break
            if page_errors == 0:
                rec_page(pagina)
            time.sleep(sleep_time)
            if page_counter > max_pages_per_blog:
                page_counter -= 1
                logr('Reached max number of %s blog pages for %s' % (page_counter, tumblrblog))
                break

    dbconn_tumblr.close()
    dbconn_smp.close()
    elapsed_time = timer() - script_start
    # logr("I've done my job!")
    bot_summary('tumblrbot', tot_images, tot_errors, imgskiped, elapsed_time, url_list)

def download_images(url, tumblrblog):
    global image_counter
    global tot_images
    global page_errors
    global tot_errors
    global imgskiped

    filename = os.path.basename(url)
    sys.stdout.write('Checking ' + url)
    rows_image_urls = check_db_imageurl(url)
    if len(rows_image_urls) > 0:
        logr('url match')
        return
    try:
        raw_content = requests.get(url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
        # raw_content = urllib.urlopen(url).read()
    except:
        sys.stdout.write(' error while downloading \n')
        # page_errors += 1
        tot_errors += 1
        return
    # sys.stdout.write('ok')
    try:
        raw_conversion = StringIO(raw_content.content)
        resource_image = Image.open(raw_conversion)
        imghash = hashmem(resource_image)
        rows_images = check_imgdup(imghash)
    except:
        sys.stdout.write(' hash error, image skipped\n')
        page_errors += 1
        return
    if len(rows_images) > 0:
        imgskiped += 1
        logr('image hash match')
        # sys.stdout.write('image hash match\n')
        return

    save_as = os.path.join(complete_path, filename)
    try:
        image_handle = open(save_as, "wb")
        image_handle.write(raw_content.content)
        image_handle.close()
        sys.stdout.write(' saved')
    except:
        sys.stdout.write('\nerror while saving ' + filename + '\n')
        page_errors += 1
        tot_errors += 1
        return

    dbfd_uri = save_as.replace('/mnt/vol1/', '')
    dbfd_pstar = dbfd_uri.split('/')[2]
    img_height, img_width = get_img_size(save_as)
    is_portrait = is_img_portrait(img_height, img_width)
    sample_timestamp = str(os.path.getmtime(save_as)).split('.')[0]
    rec_img_indb(dbconn_smp, dbfd_uri, dbfd_pstar, img_height, img_width, is_portrait, sample_timestamp)
    rec_link(url, urlparse(tumblrblog).netloc, imghash)
    sys.stdout.write(' inserted into db\n')
    image_counter += 1
    tot_images += 1
    return

def find_image_urls(page_url):
    array_urls = []
    htmlsource = requests.get(page_url, headers=my_headers, timeout=my_timeout, proxies=my_proxies)
    soupfilter = SoupStrainer("div", {"class": "l-content"})
    soup = BeautifulSoup(htmlsource.text, "lxml", parse_only=soupfilter)
    tags = soup.find_all("a")
    all_links_for_page = [link['href'] for link in tags]
    for every_link in all_links_for_page:
        if debug_app:
            logr('post link ' + every_link)
        url_parsed = urlparse(every_link)
        if url_parsed.netloc == 'sweet57334.tumblr.com':
            img_link = get_sweet(every_link)
        elif url_parsed.netloc == 'highheelsandshizzle.tumblr.com':
            img_link = hizzle(every_link)
        elif url_parsed.netloc == 'heelsfromhell.tumblr.com':
            img_link = heelsfromhell(every_link)
        elif url_parsed.netloc == 'therubik.tumblr.com':
            img_link = therubik(every_link)
        elif url_parsed.netloc == 'shoelvr67.tumblr.com':
            img_link = shoelvr67(every_link)
        elif url_parsed.netloc == 'hot-on-heels.com':
            img_link = hotonheels(every_link)
        elif url_parsed.netloc == 'legsandheels.tumblr.com':
            img_link = lgshls(every_link)
        elif url_parsed.netloc in ('jjperfectlegs.tumblr.com', 'classysexypixs.tumblr.com'):
            img_link = jjperfectlegs(every_link)
        elif url_parsed.netloc in ('naughtylegs.tumblr.com', 'peeptoeheels.tumblr.com'):
            img_link = naughtylegs(every_link)
        elif url_parsed.netloc == 'closetheels.tumblr.com':
            img_link = closetheels(every_link)
        elif url_parsed.netloc == 'high-heels-forever.tumblr.com':
            img_link = highheelsforever(every_link)
        elif url_parsed.netloc in ('tuneman86.tumblr.com', 'sluttybimbogirl.tumblr.com', 'artandsexy.tumblr.com'):
            img_link = tuneman86(every_link)
        elif url_parsed.netloc == 'bestcelebritylegs.tumblr.com':
            img_link = bestcelebritylegs(every_link)
        elif url_parsed.netloc == 'e-v-i-l-f-u-c-k-e-r.tumblr.com':
            img_link = evil(every_link)
        elif url_parsed.netloc == 'sexy-on-heels.tumblr.com':
            img_link = sexyonheels(every_link)
        elif url_parsed.netloc == 'addicttosex.tumblr.com':
            img_link = addicttosex(every_link)
        elif url_parsed.netloc == 'icelegsandperspectives.tumblr.com':
            img_link = nicelegsandperspectives(every_link)
        elif url_parsed.netloc == 'haawheels.tumblr.com':
            img_link = haawheels(every_link)
        elif url_parsed.netloc == 'heelhunter.tumblr.com':
            img_link = heelhunter(every_link)
        elif url_parsed.netloc == 'mejeej.tumblr.com':
            img_link = mejeej(every_link)
        elif url_parsed.netloc == 'stilettocouture.tumblr.com':
            img_link = stilettocouture(every_link)
        elif url_parsed.netloc in ('goodmission.tumblr.com', 'tejano78.tumblr.com'):
            img_link = goodmission(every_link)
        elif url_parsed.netloc == 'www.heels-land.com':
            img_link = heelsland(every_link)
        else:
            logr(every_link)
            img_link = 'error'
        # first check if the crawler returned an array of link urls
        if isinstance(img_link, list):
            print 'array : %s' % str(img_link)
            if img_link[0] != 'error':
                array_urls.append(img_link)
        else:
            if not re.match('^avatar_.+', img_link.split('/')[-1]) and img_link != 'error':
                array_urls.append(img_link)
    return array_urls

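# The netloc dispatch in find_image_urls() could also be expressed as a lookup table,
# which keeps the blog -> handler mapping in one place. A minimal sketch, limited to
# handlers defined in this file; HANDLERS and dispatch_image_link are illustrative names
# and nothing above uses them.
HANDLERS = {
    'mejeej.tumblr.com': mejeej,
    'stilettocouture.tumblr.com': stilettocouture,
    'closetheels.tumblr.com': closetheels,
    'therubik.tumblr.com': therubik,
    'heelsfromhell.tumblr.com': heelsfromhell,
    'www.heels-land.com': heelsland,
}

def dispatch_image_link(post_link):
    handler = HANDLERS.get(urlparse(post_link).netloc)
    return handler(post_link) if handler else 'error'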