def get_srtm_tile(srtm_tile, out_dir):
    """
    Downloads and extracts an srtm tile from the internet.

    Args:
        srtm_tile: string following the pattern 'srtm_%02d_%02d',
            identifying the desired srtm tile
        out_dir: directory where to store and extract the srtm tiles
    """
    # create the output directory if needed, then check if the tile is already there
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    if os.path.exists(os.path.join(out_dir, '%s.tif' % srtm_tile)):
        return

    # download the zip file
    srtm_tile_url = '%s/%s.zip' % (cfg['srtm_url'], srtm_tile)
    zip_path = os.path.join(out_dir, '%s.zip' % srtm_tile)
    common.download(zip_path, srtm_tile_url)

    # extract the tif file
    if zipfile.is_zipfile(zip_path):
        z = zipfile.ZipFile(zip_path, 'r')
        z.extract('%s.tif' % srtm_tile, out_dir)
    else:
        print "%s not available" % srtm_tile

    # remove the zip file
    os.remove(zip_path)
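# A minimal usage sketch for get_srtm_tile(), assuming cfg['srtm_url'] is already
# configured and that common.download(dest, url) saves `url` to `dest`, as the
# function above implies. The tile indices and output directory are illustrative.
tile = 'srtm_%02d_%02d' % (37, 4)         # hypothetical tile column/row
get_srtm_tile(tile, '/tmp/srtm_tiles')    # downloads and unpacks srtm_37_04.tif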
def down(dl_list, minmax):
    for image in dl_list:
        print(image)
        filename = image.split("/")[-1]  # basename of the URL
        common.download(image, filename, minmax)
    print("Download complete!")
def main():
    sys.stdout.write(u'正在努力请求节目单...')  # "requesting the programme list..."
    sys.stdout.flush()
    data = common.open_url(list_url)
    if not data:
        return
    menu_list = json.loads(data)['list']
    sys.stdout.write('\r')
    list_format = u'[{title}] by {author} | {player} {min:02}:{sec:02}'
    print u'{0:*^60}'.format(u'悦读FM.倾听文字的声音')
    print u'总共%d期.最新10期:' % len(menu_list)  # "%d episodes in total, latest 10:"
    for i in range(0, 10):
        print i, list_format.format(**menu_list[i])
    print u"\n输入序号下载,以','分开.'q'退出"  # "enter episode numbers separated by ',', 'q' to quit"
    while 1:
        usr_input = raw_input('Select(0-%d):' % (len(menu_list) - 1))
        if usr_input == 'q':
            print 'bye!'
            break
        try:
            li = map(int, usr_input.split(','))
        except ValueError:
            print 'Input Error!'
            continue  # skip the download loop on malformed input
        for i in li:
            if 0 <= i < len(menu_list):
                common.download(menu_list[i]['mp3'], _TARGET,
                                menu_list[i]['title'], 'mp3',
                                Referer='http://yuedu.fm/')
                article2Html(i, menu_list[i]['title'])
def download(self, url, filepath):
    # get the video name
    name = self.getName(url)
    html = common.getHtml(url)
    m3u8 = self.getM3u8(html)
    common.download(urllib.unquote(m3u8), filepath, name + '.m3u8')
    url = self.URL_PIRFIX + self.getSinavideoUrl(filepath + name + '.m3u8')
    common.download(url, filepath, name + '.mp4')
def get_dict():
    """
    Get the word, verb and label dictionaries of the Wikipedia corpus.
    """
    word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
    verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
    label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
    return word_dict, verb_dict, label_dict
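# A minimal sketch of consuming get_dict(), assuming load_dict() returns plain
# dicts mapping tokens to integer ids (an assumption based on how the dictionaries
# are used by the reader creators in this module).
word_dict, verb_dict, label_dict = get_dict()
print('%d words, %d verbs, %d labels' % (len(word_dict), len(verb_dict), len(label_dict)))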
def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
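# A small illustration of the <loc> regex used in crawl_sitemap(), run on an inline
# sitemap fragment so it does not depend on the download() helper. The URLs below
# are made up, modelled on the example.webscraping.com site used elsewhere here.
import re

sample_sitemap = (
    '<urlset>'
    '<url><loc>http://example.webscraping.com/view/Afghanistan-1</loc></url>'
    '<url><loc>http://example.webscraping.com/view/Albania-2</loc></url>'
    '</urlset>'
)
print(re.findall('<loc>(.*?)</loc>', sample_sitemap))
# ['http://example.webscraping.com/view/Afghanistan-1', 'http://example.webscraping.com/view/Albania-2']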
def download_pic(url):
    print url
    html = common.open_url(url)
    find_re = re.compile(r'<li id.+?<img src="(.+?)"', re.DOTALL)
    img_url = find_re.findall(html)
    print 'Start downloading %d pics' % len(img_url)
    for url in img_url:
        if url:
            filename, ext = os.path.splitext(os.path.split(url)[-1])
            if not ext:
                ext = '.jpg'
            common.download(url, TARGET, filename, ext[1:], Referer=url)
def download_show(li):
    for num in li:
        if num > 296:
            url = xml_url_1 % num
        else:
            url = xml_url_2 % num
        xml_data = common.open_url(url)
        if xml_data:
            songlist = extract(xml_data)
            target_dir = TARGET % num
            for title, location in songlist:
                ext = location.split('.')[-1]
                common.download(location, target_dir, title, ext,
                                Referer=referer % num)
def main():
    common.setup()
    usage = 'USAGE: {0} hdfs_src gs_dst'.format(common.script_name())
    if len(sys.argv) != 3:
        print usage
        sys.exit(1)
    src = sys.argv[1]
    if sys.argv[2].startswith('gs://') or not sys.argv[2].startswith('/'):
        print usage
        print ('gs_dst should be of the form /path/to/object. gs://{0} will be '
               'prefixed for you.').format(cfg.gs_bucket)
        sys.exit(1)
    dst = 'gs://{0}{1}'.format(cfg.gs_bucket, sys.argv[2])
    common.download(src, dst)
def get_dateil():
    content = parse_content(url)
    links = [article['link'] for article in content]
    for link in links:
        article_tree = lxml.html.fromstring(download(link))
        article_content = article_tree.cssselect('div#article_content > p')[0]
        # note: this prints the lxml element object; use .text_content() for the text
        print article_content
def main():
    D = download(is_cookie=True)
    if not login(D, userid, passwd, dynamic_passwd):
        # login failed, return
        return
    hr_url = 'https://hr.guosen.com.cn/sso/SsoHrssServlet'
    html = D.get(hr_url)
    login_data = {}
    login_data['IASID'] = common_re(r'"IASID"[^>]+value="([^"]+)"', html)
    login_data['TimeStamp'] = common_re(r'"TimeStamp"[^>]+value="([^"]+)"', html)
    login_data['Authenticator'] = common_re(r'"Authenticator"[^>]+value="([^"]+)"', html)
    login_data['ReturnURL'] = 'https://hr.guosen.com.cn/sso/SsoHrssServlet'
    html = D.post('https://sso.guosen.com.cn/login.aspx', login_data)
    login_data = {}
    login_data['IASID'] = common_re(r'"IASID"[^>]+value="([^"]+)"', html)
    login_data['Result'] = '0'
    login_data['TimeStamp'] = common_re(r'"TimeStamp"[^>]+value="([^"]+)"', html)
    login_data['UserAccount'] = userid
    login_data['ErrorDescription'] = ''
    login_data['Authenticator'] = common_re(r'"Authenticator"[^>]+value="([^"]+)"', html)
    login_data['IASUserAccount'] = userid
    html = D.post(hr_url, login_data)
    end_url = 'https://hr.guosen.com.cn/hrss/ta/Clockin.jsp?_funcode=E0020902'
    D.get(end_url)
    post_url = 'https://hr.guosen.com.cn/hrss/dorado/smartweb2.RPC.d?__rpc=true'
    search_data = {}
    search_data['__type'] = 'loadData'
    search_data['__viewInstanceId'] = 'nc.bs.hrss.ta.Clockin~nc.bs.hrss.ta.ClockinViewModel'
    search_data['__xml'] = get_post_xml(month, year)
    html = D.post(post_url, search_data)
    if 'result succeed="true"' in html:
        print 'Hello world!'
    else:
        print html
def main():
    if len(sys.argv) < 3 or (sys.argv[1] != '-t' and len(sys.argv) > 3):
        help_info()
        return
    if sys.argv[1] == '-a':
        url = _albumUrl % sys.argv[2]
    elif sys.argv[1] == '-c':
        url = _collectUrl % sys.argv[2]
    elif sys.argv[1] == '-t':
        url = _trackUrl % ','.join(sys.argv[2:])
    else:
        help_info()
        return
    content = common.open_url(url)
    if not content:
        return
    res = extract(content)
    for title, uri, lrc in res:
        common.download(uri, TARGET, title, 'mp3')
        if lrc:
            # the source snippet is truncated here; downloading the lyrics file
            # in the same way appears to be the intent (assumption)
            common.download(lrc, TARGET, title, 'lrc')
def test100():
    """
    CIFAR-100 test set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 99].

    :return: Test reader creator.
    :rtype: callable
    """
    return reader_creator(
        download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test')
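# A minimal sketch of consuming the reader creator returned by test100(), assuming,
# as the docstring says, that calling the returned reader yields (image, label)
# samples. The two-sample peek below is purely illustrative.
reader = test100()                  # reader creator
for i, (image, label) in enumerate(reader()):
    if i >= 2:                      # just peek at the first couple of samples
        break
    print('sample %d: %d pixel values, label %d' % (i, len(image), label))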
def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
    '''
    Create flowers validation set reader.

    It returns a reader, each sample in the reader is image pixels in [0, 1]
    and label in [1, 102] translated from the original color image by steps:
    1. resize to 256*256
    2. random crop to 224*224
    3. flatten

    :param mapper: a function to map sample.
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
    :return: validation data reader
    :rtype: callable
    '''
    return reader_creator(
        download(DATA_URL, 'flowers', DATA_MD5),
        download(LABEL_URL, 'flowers', LABEL_MD5),
        download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper,
        buffered_size, use_xmap)
def parse_content(url):
    html = download(url)
    tree = lxml.html.fromstring(html)
    td = tree.cssselect('ul.detail_list > li')
    for lis in td:
        item = {}
        item['title'] = lis.cssselect('h4 > a')[0].text_content()
        item['time'] = lis.cssselect('div.detail_b > span')[0].text_content()
        item['views'] = lis.cssselect('div.detail_b > em')[0].text_content()
        item['abstract'] = lis.cssselect('p.detail_p')[0].text_content()
        item['link'] = lis.cssselect('h4 > a')[0].attrib['href']
        yield item
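# A minimal sketch of consuming the parse_content() generator above, assuming the
# download() helper returns the listing page HTML as a string. The URL is a
# hypothetical placeholder, not one used by the original code.
for item in parse_content('http://example.com/news/list'):
    print('%s -> %s' % (item['title'], item['link']))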
def train10():
    """
    CIFAR-10 training set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 9].

    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch')
def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]  # the queue of URLs to download
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                # add this link to the crawl queue
                crawl_queue.append(link)
def iteration():
    max_errors = 5  # maximum number of consecutive download errors allowed
    num_errors = 0  # current number of consecutive download errors
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-{}'.format(page)
        html = download(url)
        if html is None:
            # received an error trying to download this webpage
            num_errors += 1
            if num_errors == max_errors:
                # reached the maximum number of errors in a row, so assume we have
                # reached the last country ID and can stop downloading
                break
        else:
            # success - can scrape the result
            # ...
            num_errors = 0
def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]
    seen = set(crawl_queue)  # keep track of which URLs have been seen before
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            # check if link matches expected regex
            if re.match(link_regex, link):
                # form absolute link
                link = urlparse.urljoin(seed_url, link)
                # check if have already seen this link
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
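# A minimal sketch of calling the de-duplicating link_crawler() above, assuming
# download() and get_links() behave as in the surrounding snippets. The seed URL
# and regex follow the example.webscraping.com site used elsewhere in this file.
link_crawler('http://example.webscraping.com', '/(index|view)')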
def test():
    """
    Conll05 test set creator.

    Because the training dataset is not free, the test dataset is used for
    training. It returns a reader creator, each sample in the reader is nine
    features, including sentence sequence, predicate, predicate context,
    predicate context flag and tagged sequence.

    :return: Training reader creator
    :rtype: callable
    """
    word_dict, verb_dict, label_dict = get_dict()
    reader = corpus_reader(
        download(DATA_URL, 'conll05st', DATA_MD5),
        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
    return reader_creator(reader, word_dict, verb_dict, label_dict)
def __initialize_meta_info__():
    fn = download(URL, "movielens", MD5)
    global MOVIE_INFO
    if MOVIE_INFO is None:
        pattern = re.compile(r'^(.*)\((\d+)\)$')
        with zipfile.ZipFile(file=fn) as package:
            for info in package.infolist():
                assert isinstance(info, zipfile.ZipInfo)
            MOVIE_INFO = dict()
            title_word_set = set()
            categories_set = set()
            with package.open('ml-1m/movies.dat') as movie_file:
                for i, line in enumerate(movie_file):
                    movie_id, title, categories = line.strip().split('::')
                    categories = categories.split('|')
                    for c in categories:
                        categories_set.add(c)
                    title = pattern.match(title).group(1)
                    MOVIE_INFO[int(movie_id)] = MovieInfo(
                        index=movie_id, categories=categories, title=title)
                    for w in title.split():
                        title_word_set.add(w.lower())

            global MOVIE_TITLE_DICT
            MOVIE_TITLE_DICT = dict()
            for i, w in enumerate(title_word_set):
                MOVIE_TITLE_DICT[w] = i

            global CATEGORIES_DICT
            CATEGORIES_DICT = dict()
            for i, c in enumerate(categories_set):
                CATEGORIES_DICT[c] = i

            global USER_INFO
            USER_INFO = dict()
            with package.open('ml-1m/users.dat') as user_file:
                for line in user_file:
                    uid, gender, age, job, _ = line.strip().split("::")
                    USER_INFO[int(uid)] = UserInfo(
                        index=uid, gender=gender, age=age, job_id=job)
    return fn
def get_image(self, image_object, try_web=True):
    image_path = self.get_image_path(image_object.get_small_basename())
    temp_path = self.get_temp_path(image_object.get_small_basename())
    if os.path.exists(image_path) and os.path.isfile(image_path):
        try:
            pixbuf = gtk.gdk.pixbuf_new_from_file(image_path)
        except gobject.GError:
            try:
                os.unlink(image_path)
            except:
                pass
        else:
            del pixbuf
            return image_path
    if try_web:
        small_image_url = image_object.small_url
        if small_image_url:
            ret = common.download(small_image_url, temp_path)
            if ret and self.cleanup_small(temp_path, image_path):
                return image_path
    return None
cwd = os.getcwd()
start_time = time.time()

if args.begin_phase <= 1:
    print('\n======= Phase I, downloading data =======')
    for name, url_tuple in baseline_data.items():
        if verbose:
            print('Downloading ' + str(name))
            sys.stdout.flush()
        path = os.path.join('datasets/', name)
        # if url_tuple[RES_LOCATION].startswith('http') or \
        #    url_tuple[RES_LOCATION].startswith('ftp'):
        loc = url_tuple[RES_LOCATION]
        if any([loc.startswith(x) for x in ['file', 'ftp', 'http']]):
            download(url_tuple[RES_LOCATION], path)
            print(loc)
    print('Phase 1 ran in %.3f minutes' % ((time.time() - start_time) / 60))
    if args.end_phase == 1:
        print('\nTerminating process after phase 1 as specified by user.')
        print('Total runtime: %.3f minutes' % ((time.time() - start_time) / 60))
        sys.exit()
else:
    print('\nSkipping phase 1.')
sys.stdout.flush()

if args.begin_phase <= 2:
    print('\n======= Phase II, parsing data =======')
    # For now, download and store the data in the parsed.py module. This module
if not os.path.exists(resource_dir):
    os.mkdir(resource_dir)

# change to resource directory
os.chdir(resource_dir)

# make dataset directory
if not os.path.exists(path_constants.dataset_dir):
    os.mkdir(path_constants.dataset_dir)

# create empty dictionary to hold all ns values and equivalence
gp_dict = {}

# parse reference dataset (entrez gene)
for path, url in gp_reference.file_to_url.items():
    download(url, path)
parser = gp_reference.parser_class(gp_reference.file_to_url)
print("Running " + str(parser))
(gene_dict, history_dict) = parser.parse()
gp_dict.update(gene_dict)

# parse dependent datasets
for d in gp_datasets:
    for path, url in d.file_to_url.items():
        download(url, path)
    parser = d.parser_class(gene_dict, history_dict, d.file_to_url)
    print("Running " + str(parser))
    gp_dict.update(parser.parse())

print("Completed gene protein resource generation.")
print("Number of namespace entries: %d" % (len(gp_dict)))
def fetch():
    download(DATA_URL, 'flowers', DATA_MD5)
    download(LABEL_URL, 'flowers', LABEL_MD5)
    download(SETID_URL, 'flowers', SETID_MD5)
def fetch():
    download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
    download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
change_log['MESHCL'] = {}
change_log['MESHD'] = {}
change_log['SCHEM'] = {}
change_log['SDIS'] = {}
change_log['DO'] = {}
change_log['DOID'] = {}

# download the data needed for resolving lost values
print('\nDownloading data needed for resolving changed/lost terms...')
if not os.path.exists('changelog_datasets/'):
    os.mkdir('changelog_datasets/')
for name, data_tuple in changelog_data.items():
    if verbose:
        print('Downloading ' + str(data_tuple[RES_LOCATION]))
    path = os.path.join('changelog_datasets/', name)
    if 'ftp' in data_tuple[RES_LOCATION] or 'http' in data_tuple[RES_LOCATION]:
        download(data_tuple[RES_LOCATION], path)

print('Resolving changed/lost terms...')
sp_accession_ids = []
for label, data_tuple in changelog_data.items():
    url = label
    parser = data_tuple[PARSER_TYPE]('changelog_datasets/' + url)
    if str(parser) == 'EntrezGeneHistory_Parser':
        log = change_log.get('EGID')
        if verbose:
            print('\nGathering Entrez update info...')
        for row in parser.parse():
            discontinued_id = row.get('Discontinued_GeneID')
            gid = row.get('GeneID')
            replacement_id = gid if gid != '-' else 'withdrawn'
def get_embedding():
    """
    Get the trained word vectors based on the Wikipedia corpus.
    """
    return download(EMB_URL, 'conll05st', EMB_MD5)
def fetch():
    return download(URL, "MQ2007", MD5)
def fetch():
    download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
    download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
    download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
    download(EMB_URL, 'conll05st', EMB_MD5)
    download(DATA_URL, 'conll05st', DATA_MD5)