def get_video_links_from_model_page(av_str):
    """Download a model's listing page and fetch every video page linked on it.

    Saves the listing HTML to model/<av_str>, then for each <article class="item">
    in the common-videos list downloads the linked video page to
    video_url/<data-video-id>.

    :param av_str: model slug used in both the source URL and the cache paths
    """
    from_path = "http://upornia.com/models/" + av_str + "/"
    to_path = "model/" + av_str
    dl.download_url(from_path, to_path)
    f = codecs.open(str(to_path), "r", "utf-8")
    if f is None:
        return
    # try/finally guarantees the handle is closed even if parsing raises;
    # the original leaked it on an exception path.
    try:
        content = f.read()
        soup = BeautifulSoup(content)
        video_list_section = soup.find('div', {"id": "list_videos2_common_videos_list"})
        if video_list_section is None:
            return
        items = video_list_section.find_all('article', {"class": "item"})
        if items is None:
            return
        for item in items:
            url = item.find('a')
            if url is None:
                continue
            link = url['href']
            video_id = item['data-video-id']
            dl_link_path = "video_url/" + video_id
            dl.download_url(link, dl_link_path)
    finally:
        f.close()
def dl_videos(link):
    """Scrape a listing page and download every episode of every product found.

    For each 'boxim' entry the product id is taken from the 4th space-separated
    token of the link title. Episode 1 is fetched from the product link itself,
    episodes 2..ep_num from <link>?ep=<n>; files already on disk are skipped.

    :param link: URL of the listing page to scrape
    """
    content = climber.get_content(link)
    if content is None:
        return
    soup = BeautifulSoup(content)
    boxes = soup.findAll('div', {'class': 'boxim'})
    if boxes is None:
        return
    links = []
    product_ids = []
    for sec in boxes:
        a_link_sec = sec.find('a')
        if a_link_sec is None:
            continue
        a_link = a_link_sec['href']
        a_title = a_link_sec['title']
        # assumes the title's 4th whitespace token is the product id — TODO confirm
        product_id = a_title.split(' ')[3]
        print("[{}] {}".format(product_id, a_link))
        links.append(a_link)
        product_ids.append(product_id)
    for idx, each_link in enumerate(links):
        content1 = climber.get_content(each_link)
        if content1 is None:
            continue
        soup1 = BeautifulSoup(content1)
        ep_num = get_ep_num(soup1)
        vid_link = get_final_video_link(each_link)
        if not vid_link:  # covers both '' and None, as the original did
            continue
        _dl_episode(vid_link, "{}_1.mp4".format(product_ids[idx]))
        for i in range(2, ep_num + 1):
            new_link = "{}?ep={}".format(each_link, i)
            vid_link1 = get_final_video_link(new_link)
            if not vid_link1:
                continue
            _dl_episode(vid_link1, "{}_{}.mp4".format(product_ids[idx], i))


def _dl_episode(vid_link, to_path):
    """Download one episode to *to_path* unless the file already exists."""
    print("Download {} ...".format(to_path))
    if os.path.exists(to_path):
        print("{} exists, skip".format(to_path))
    else:
        dl.download_url(vid_link, to_path)
def getPhoto(soup):
    """Download the cover photo referenced by the page's 'photo' section.

    Returns None when there is no usable image (missing sections, no <a>,
    or the noimage.gif placeholder); otherwise downloads the image.

    NOTE(review): the output filename relies on the global av_ID — confirm
    the caller sets it before invoking this function.

    :param soup: BeautifulSoup document for a detail page
    """
    # Guard clauses replace the original's nested-if pyramid.
    photo_section = soup.find('div', {"class": "photo"})
    if photo_section is None:
        return None
    tn_section = photo_section.find('p', {"class": "tn"})
    if tn_section is None:
        return None
    # May have no photo
    if tn_section.find('a') is None:
        return None
    url = tn_section.a.img['src'].split('?width')[0]  # strip resize query-string
    if 'noimage.gif' in url:
        return None  # site placeholder, nothing worth saving
    download_path = "../video_img/" + str(av_ID) + ".jpg"
    dl.download_url(url, download_path)
def scan_jpg(path):
    """Recursively walk *path* and download missing cover jpgs from javbus.

    For every non-jpg file lacking a sibling <name>.jpg, derive the product id
    from the basename (pattern ABC-123), fetch the product page, and save its
    'bigImage' cover to /Volumes/wd2/new_cover/<product_id>.jpg.

    This is a generator only because of the recursive ``yield from``; it must
    be iterated for any work to happen.

    :param path: root directory to scan
    """
    if not os.path.exists(path):
        print('Path not exists: {}'.format(path))
        return
    # Compile once per call instead of once per file (original recompiled in the loop).
    product_id_re = re.compile(r'^([A-Z0-9]+-[A-Z]*\d+)\D*')
    for entry in scandir.scandir(path):
        if entry.is_dir(follow_symlinks=False):
            yield from scan_jpg(entry.path)  # see below for Python 2.x
            continue
        if ".jpg" in entry.path:
            continue
        file_without_ext = os.path.splitext(entry.path)[0]
        jpg_file = "{}.jpg".format(file_without_ext)
        if os.path.exists(jpg_file):
            continue
        bn = ntpath.basename(file_without_ext)
        match = product_id_re.match(bn)
        if match is None:
            print("{} cannot pass the reg exp, skip".format(bn))
            continue
        product_id = match.groups()[-1]  # last (only) capture group is the id
        hyper_link_path = 'https://www.javbus.com/{}'.format(product_id)
        content = ""
        try:
            content = u''.join(urllib.request.urlopen(hyper_link_path).read().decode('utf-8'))
        except Exception as exc:
            # Best-effort fetch, but log it — the original silently swallowed
            # every error, which hid network/HTTP failures entirely.
            print("fetch failed for {}: {}".format(hyper_link_path, exc))
        soup = BeautifulSoup(content)
        link = soup.find('a', {'class': 'bigImage'})
        if link is None:
            print('Cannot find jpg link from "{}", abort...'.format(product_id))
            # NOTE(review): 'bol-md-9' looks like a typo for Bootstrap's
            # 'col-md-9' — left as-is since it is debug output only; confirm.
            debug = soup.find('div', {'class': 'bol-md-9'})
            if debug is not None:
                print(debug)
            continue
        link_path = link['href']
        to_path = '/Volumes/wd2/new_cover/{}.jpg'.format(product_id)
        if os.path.exists(to_path):
            continue
        print("Download '{}' -> '{}".format(link_path, to_path))
        dl.download_url(link_path, to_path)
def download_output_wrapper(URL_num, URL, destpath, videos, i, rate_limit):
    """
    Wrapper around download to provide print output for info on what's
    downloaded and what's done.

    :param URL_num: ordinal label (string) used only in the progress messages
    :param URL: URL handed to download_url
    :returns: whatever download_url returns, unchanged
    """
    # Converted the Python 2 print statements to the print() function:
    # the statement form is a SyntaxError on Python 3.
    print("Downloading URL #" + URL_num + ": " + URL + "\n")
    sys.stdout.flush()  # flush so progress is visible even when piped
    data = download_url(URL, destpath, videos, i, rate_limit)
    print("Finished for URL #" + URL_num + ": " + URL + "\n")
    sys.stdout.flush()
    return data
# NOTE(review): fragment of a larger per-file loop — 'result', 'soup', 'file',
# 'dl', and 're' are bound by enclosing code not visible here.
if result == None:
    continue
# download image: locate frame > photo > img, skip 'noimage' placeholders
frame = soup.find('div', {"class": "frame"})
if frame != None:
    photo = frame.find('div', {"class": "photo"})
    if photo != None:
        imgTag = photo.find('img')
        if imgTag != None:
            imgsrc = imgTag['src']
            noImage = re.search('noimage', imgsrc)
            if noImage == None:
                print(imgsrc)
                # save under ../image/ keyed by the current file name
                img_path = "../image/" + str(file)
                dl.download_url(imgsrc, img_path)
            else:
                print('no image...')
# download itemBox
#prefix = "http://xcity.jp"
# items = soup.find_all('div', {"class": "x-itemBox-package"})
# for item in items:
#     url = item.find('a')
#     if url == None:
#         continue;
#     url = prefix + url['href']
#     print(url)
#     download_path = "../video/" + url.split('?id=')[1]
#     dl.download_url(url, download_path)
# update result to DB
"--filename", dest="filename", required=False, type=str, help="sets custom name for output file (name without extension)" ) parser.add_argument( "-o", "--output", dest="output", required=False, type=str, help="specify output directory (defaults to /home/Music)" ) if __name__ == "__main__": parser = ArgumentParser( description="Script for downloading music from youtube videos.\n" ) add_arguments(parser) arguments = parser.parse_args() if url := arguments.url: interval = arguments.interval fname = arguments.filename output = arguments.output download_url(url, interval, fname, output)
from bs4 import BeautifulSoup
import download as dl
import os.path

# Fetch every xcity release-detail page (ids 1..62490) that is not already
# cached on disk under video/<id>.
LAST_ID = 62490
for file_id in range(1, LAST_ID + 1):
    target = "video/" + str(file_id)
    if os.path.isfile(target):
        continue
    url = "http://xcity.jp/release/detail/?id=%d" % file_id
    print("download: " + url)
    dl.download_url(url, target)
def get_page(page_num):
    """Fetch javbus actress listing page *page_num* into actress_page/."""
    source_url = "http://www.javbus.com/ja/actresses/%d" % page_num
    destination = "actress_page/%d" % page_num
    dl.download_url(source_url, destination)
# encoding=utf8
# author: LiMengming
# date: 2017-10-19
# domain type
# source = https://ransomwaretracker.abuse.ch/downloads/RW_URLBL.txt
# source_id = 23
import download

# Feed metadata handed straight to the downloader.
# NOTE(review): 'BalckList' looks like a typo for 'BlackList', but it is a
# runtime value downstream code may key on — left unchanged.
stamp = 'BalckList'
source = '23'
url = 'https://ransomwaretracker.abuse.ch/downloads/RW_URLBL.txt'

download.download_url(source, stamp, url)
# encoding=utf8
# source: https://openphish.com/feed.txt
# source ID: 20
# date: 2017-9-22
import download

# OpenPhish feed: pass (source id, stamp label, feed url) to the downloader.
stamp = 'Phish'
source = '20'
url = 'https://openphish.com/feed.txt'

download.download_url(source, stamp, url)
def generate_contours_for_url(url, bucket, bucket_prefix):
    """Download a DEM zip, generate metric and imperial contour tiles, and
    (when *bucket* is set) upload the mbtiles to S3.

    Returns None early when both S3 outputs already exist.

    :param url: source URL of the zipped DEM (.img inside)
    :param bucket: S3 bucket name, or None to skip all S3 work
    :param bucket_prefix: key prefix for uploaded mbtiles
    """
    # Check if s3 files already exist
    s3_path_metric = get_s3_path(url, bucket_prefix, metric=True)
    s3_path_imperial = get_s3_path(url, bucket_prefix, metric=False)
    if s3_key_exists(bucket, key=s3_path_metric) and s3_key_exists(
            bucket, key=s3_path_imperial):
        print('s3 path exists; skipping')
        return None

    with TemporaryDirectory() as tmpdir:
        # Download url to local path
        print(url)
        local_path = download_url(url, tmpdir)

        # Unzip the first .img DEM found in the archive
        with open(local_path, 'rb') as f:
            with ZipFile(f) as zf:
                img_file = [x for x in zf.namelist() if x.endswith('.img')][0]
                unzipped_path = zf.extract(img_file, path=tmpdir)

        # The metric and imperial paths were near-duplicate stanzas; one
        # parameterized helper now handles both.
        _make_contours(unzipped_path, bucket, bucket_prefix, metric=True)
        _make_contours(unzipped_path, bucket, bucket_prefix, metric=False)


def _make_contours(unzipped_path, bucket, bucket_prefix, metric):
    """Run the contour shell script for one unit system; when *bucket* is set,
    build mbtiles, upload them to S3, and delete the local artifacts."""
    if metric:
        label, script = 'metric', 'make_contours_10m.sh'
        out_dir, s3_dir = 'data/contour_10m', '10m'
    else:
        label, script = 'imperial', 'make_contours_40ft.sh'
        out_dir, s3_dir = 'data/contour_40ft', '40ft'

    # Script output directory is hardcoded inside the shell script itself.
    print('generating {} contours'.format(label))
    run(['bash', script, unzipped_path], check=True)

    if bucket is None:
        return
    gj_path = Path(out_dir) / (Path(unzipped_path).stem + '.geojson')
    assert gj_path.exists(), 'file does not exist'

    print('generating {} mbtiles'.format(label))
    mbtiles_path = run_tippecanoe(gj_path, metric=metric)

    # Write mbtiles to S3
    s3.Bucket(bucket).upload_file(
        str(mbtiles_path), f'{bucket_prefix}/{s3_dir}/{mbtiles_path.name}')

    # Delete geojson and mbtiles (missing_ok needs Python 3.8+)
    Path(gj_path).unlink(missing_ok=True)
    Path(mbtiles_path).unlink(missing_ok=True)
if f == None: conitnue print("processing file: " + str(file)) content = f.read() soup = BeautifulSoup(content); waterfall = soup.find('div', {"id": "waterfall"}) if waterfall == None: f.close() continue items = waterfall.find_all('a', {"class": "avatar-box"}) if items == None: f.close() continue for item in items: img_tag = item.find('img') if img_tag == None: f.close() continue name = img_tag['title'] img_url = img_tag['src'] print(name) url = item['href'] dl.download_url(url, "../actress/" + name) if "nowprinting.gif" in img_url: print('No image, skip') else: dl.download_url(img_url, "../av_icon/" + name + ".jpg") os.rename(file, "../proceed_page/" + str(file)) f.close()
# Recreate the Products table from scratch (drops any previous run's data).
cur.executescript('''
DROP TABLE IF EXISTS Products;

CREATE TABLE Products (
    id       INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
    filename TEXT UNIQUE,
    path     TEXT,
    md5sum   TEXT UNIQUE
);
''')

current_path = Path(__file__).parent.absolute()

# NOTE(review): fragment — 'cur', 's', 'href', 'download_url', and 'md5' are
# defined outside the visible chunk, and the final cur.execute(...) call is
# cut off mid-arguments below.
for i in range(len(href)):
    print("Dowloading file")
    # Download each linked zip next to this script; download_url returns the
    # saved filename (presumably relative) — TODO confirm.
    zip_file = download_url(s, href[i]['href'], str(current_path) + '/')
    path = str(current_path) + zip_file
    print(path)
    print("Complete downloading file: ", zip_file)
    prod_link = href[i]['href']
    # Strip the trailing 6 chars of the product link to build the checksum URL
    # (OData-style Checksum/Value/$value endpoint).
    prod_id = prod_link[:-6]
    prod_checksum = prod_id + "Checksum/Value/$value"
    print("Getting file checksum")
    checksum = s.get(prod_checksum)
    dowloaded_file_checksum = md5(path)
    # Only record the product when the remote checksum matches the local md5.
    if checksum.text == dowloaded_file_checksum:
        print("Checksums match")
        cur.execute(
            '''INSERT OR IGNORE INTO Products (filename, path, md5sum)
                VALUES ( ? , ?, ? )''',
def get_music(req):
    """Service handler: fetch the requested track into MUSIC_STORAGE.

    Returns a GetMusicResponse with success=True and the stored filename on
    success, or success=False plus the error message if the download raised.
    """
    endpoint = BASE_URL + '/get'
    storage_dir = os.path.expanduser(MUSIC_STORAGE)
    try:
        stored_name = download_url(endpoint, storage_dir, get_params=req.filename)
    except Exception as err:  # service boundary: report failure, never crash
        return GetMusicResponse(success=False, message=str(err))
    return GetMusicResponse(success=True, filename=stored_name)