Example #1
File: srtm.py  Project: jguinet/s2p
def get_srtm_tile(srtm_tile, out_dir):
    """
    Downloads and extract an srtm tile from the internet.

    Args:
        srtm_tile: string following the pattern 'srtm_%02d_%02d', identifying
            the desired strm tile
        out_dir: directory where to store and extract the srtm tiles
    """
    # check if the tile is already there
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    if os.path.exists(os.path.join(out_dir, '%s.tif' % srtm_tile)):
        return

    # download the zip file
    srtm_tile_url = '%s/%s.zip' % (cfg['srtm_url'], srtm_tile)
    zip_path = os.path.join(out_dir, '%s.zip' % srtm_tile)
    common.download(zip_path, srtm_tile_url)

    # extract the tif file
    if zipfile.is_zipfile(zip_path):
        z = zipfile.ZipFile(zip_path, 'r')
        z.extract('%s.tif' % srtm_tile, out_dir)
    else:
        print "%s not available" % srtm_tile

    # remove the zip file
    os.remove(zip_path)
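
The 'srtm_%02d_%02d' pattern suggests the CGIAR 5x5-degree SRTM tiling; below is a minimal sketch of how a caller might derive the tile name from a longitude/latitude under that assumption (the helper name is hypothetical, not part of s2p):

import math

def srtm_tile_name(lon, lat):
    # Hypothetical helper: map lon/lat to a 'srtm_%02d_%02d' tile id, assuming
    # the CGIAR 5x5-degree grid (columns start at -180 deg, rows at +60 deg).
    col = int(math.floor((lon + 180) / 5)) + 1
    row = int(math.floor((60 - lat) / 5)) + 1
    return 'srtm_%02d_%02d' % (col, row)

# e.g. get_srtm_tile(srtm_tile_name(2.35, 48.85), '/tmp/srtm_tiles')  # -> 'srtm_37_03'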
Example #2
File: fusk.py  Project: frimerke/lot
def down(dl_list, minmax):
    for image in dl_list:
        print(image)
        filename = image.split("/")
        filename = filename[-1]
        common.download(image, filename, minmax)
    print("Download complete!")
Example #3
File: yuedu.py  Project: spyth/xiami
def main():
    sys.stdout.write(u'正在努力请求节目单...')
    sys.stdout.flush()
    data = common.open_url(list_url)
    if not data:
        return
    menu_list = json.loads(data)['list']
    sys.stdout.write('\r')

    list_format = u'[{title}] by {author}  |  {player} {min:02}:{sec:02}'
    print u'{0:*^60}'.format(u'悦读FM.倾听文字的声音')
    print u'总共%d期.最新10期:'%len(menu_list)

    for i in range(0,10):
        print i,list_format.format(**menu_list[i])
    print u"\n输入序号下载,以','分开.'q'退出"

    while 1:
        usr_input = raw_input('Select(0-%d):'%(len(menu_list)-1))
        if usr_input == 'q':
            print 'bye!'
            break
        try:
            li = map(int, usr_input.split(','))
        except ValueError:
            print 'Input Error!'
            continue  # without this, `li` may be undefined or stale on the next pass
        for i in li:
            if 0 <= i < len(menu_list):
                common.download(menu_list[i]['mp3'], _TARGET,\
                    menu_list[i]['title'], 'mp3', Referer='http://yuedu.fm/')
                article2Html(i, menu_list[i]['title'])
Example #4
	def download(self,url,filepath):
		# get the name
		name = self.getName(url)
		html = common.getHtml(url)
		m3u8 = self.getM3u8(html)
		common.download(urllib.unquote(m3u8),filepath,name + '.m3u8')
		url = self.URL_PIRFIX + self.getSinavideoUrl(filepath+name+'.m3u8')
		common.download(url,filepath,name+'.mp4')
Example #5
def get_dict():
    """
    Get the word, verb and label dictionary of Wikipedia corpus.
    """
    word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
    verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
    label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
    return word_dict, verb_dict, label_dict
Example #6
def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
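
A hedged usage note: crawl_sitemap assumes download returns the page body as a string (so re.findall can scan it) and that re has been imported in the defining module; the sitemap URL below is purely illustrative.

import re  # crawl_sitemap relies on re.findall

crawl_sitemap('http://example.webscraping.com/sitemap.xml')  # illustrative URL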
Example #7
File: jandan.py  Project: spyth/xiami
def download_pic(url):
    print url
    html = common.open_url(url)
    find_re = re.compile(r'<li id.+?<img src="(.+?)"', re.DOTALL)
    img_url = find_re.findall(html)
    print 'Start download %d pics'%len(img_url) 
    for url in img_url:
        if url:
            filename,ext = os.path.splitext(os.path.split(url)[-1])
            if not ext:
                ext = '.jpg'
            common.download(url, TARGET, filename, ext[1:], Referer=url)
Example #8
File: luoo.py  Project: spyth/xiami
def download_show(li):
    for num in li:
        if num > 296: 
            url = xml_url_1%num
        else:
            url = xml_url_2%num 
        xml_data = common.open_url(url)
        if xml_data:
            songlist = extract(xml_data)
            target_dir = TARGET%num
            for title, location in songlist:
                ext = location.split('.')[-1]
                common.download(location, target_dir, title, ext, Referer=referer%num)
Example #9
def main():
  common.setup()
  usage = 'USAGE: {0} hdfs_src gs_dst'.format(common.script_name())
  if len(sys.argv) != 3:
    print usage
    sys.exit(1)

  src = sys.argv[1]
  if sys.argv[2].startswith('gs://') or not sys.argv[2].startswith('/'):
    print usage
    print ('gs_dst should be of the form /path/to/object. gs://{0} will be '
           'prefixed for you.').format(cfg.gs_bucket)
    sys.exit(1)
  dst = 'gs://{0}{1}'.format(cfg.gs_bucket, sys.argv[2])

  common.download(src, dst)
Example #10
def get_dateil():
    content = parse_content(url)
    links = [article['link'] for article in content]
    for link in links:
        article_tree = lxml.html.fromstring(download(link))
        article_content = article_tree.cssselect('div#article_content > p')[0]
        print article_content
Example #11
def main():
    D = download(is_cookie=True)
    if not login(D, userid, passwd, dynamic_passwd):
        # login fail, return
        return
    hr_url = 'https://hr.guosen.com.cn/sso/SsoHrssServlet'
    html = D.get(hr_url)
    login_data = {}
    login_data['IASID'] = common_re(r'"IASID"[^>]+value="([^"]+)"', html)
    login_data['TimeStamp'] = common_re(r'"TimeStamp"[^>]+value="([^"]+)"', html)
    login_data['Authenticator'] = common_re(r'"Authenticator"[^>]+value="([^"]+)"', html)
    login_data['ReturnURL'] = 'https://hr.guosen.com.cn/sso/SsoHrssServlet'
    html = D.post('https://sso.guosen.com.cn/login.aspx', login_data)            
    login_data = {}
    login_data['IASID'] = common_re(r'"IASID"[^>]+value="([^"]+)"', html)
    login_data['Result'] = '0' 
    login_data['TimeStamp'] = common_re(r'"TimeStamp"[^>]+value="([^"]+)"', html)
    login_data['UserAccount'] = userid
    login_data['ErrorDescription'] = ''
    login_data['Authenticator'] = common_re(r'"Authenticator"[^>]+value="([^"]+)"', html)
    login_data['IASUserAccount'] = userid
    html = D.post(hr_url, login_data)         

    end_url = 'https://hr.guosen.com.cn/hrss/ta/Clockin.jsp?_funcode=E0020902'
    D.get(end_url)
    post_url = 'https://hr.guosen.com.cn/hrss/dorado/smartweb2.RPC.d?__rpc=true'
    search_data = {}
    search_data['__type'] = 'loadData'
    search_data['__viewInstanceId'] = 'nc.bs.hrss.ta.Clockin~nc.bs.hrss.ta.ClockinViewModel'
    search_data['__xml'] = get_post_xml(month, year) 
    html = D.post(post_url, search_data)
    if 'result succeed="true"' in html:
        print 'Hello world!'
    else:
        print html
Example #12
File: xiami.py  Project: spyth/xiami
def main():
    if len(sys.argv) < 3 or (sys.argv[1] != '-t' and len(sys.argv) > 3):
        help_info()
        return
    if sys.argv[1] == '-a':
        url = _albumUrl % sys.argv[2]
    elif sys.argv[1] == '-c':
        url = _collectUrl % sys.argv[2]
    elif sys.argv[1] == '-t':
        url = _trackUrl % ','.join(sys.argv[2:])
    else :
        help_info()
        return
    content = common.open_url(url)
    if not content:
        return
    res = extract(content)
    for title,uri,lrc in res:
        common.download(uri,TARGET,title,'mp3')
        if lrc:
            pass  # the remainder of this snippet is truncated in the source listing
Example #13
def test100():
    """
    CIFAR-100 test set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 99].

    :return: Test reader creator.
    :rtype: callable
    """
    return reader_creator(download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test')
Example #14
def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
    '''
    Create flowers validation set reader.
    It returns a reader, each sample in the reader is
    image pixels in [0, 1] and label in [1, 102]
    translated from original color image by steps:
    1. resize to 256*256
    2. random crop to 224*224
    3. flatten
    :param mapper:  a function to map sample.
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
    :return: validation data reader
    :rtype: callable
    '''
    return reader_creator(
        download(DATA_URL, 'flowers', DATA_MD5),
        download(LABEL_URL, 'flowers', LABEL_MD5),
        download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper,
        buffered_size, use_xmap)
Example #15
def parse_content(url):
    html = download(url)
    tree = lxml.html.fromstring(html)
    td = tree.cssselect('ul.detail_list > li')
    for lis in td:
        item = {}
        item['title'] = lis.cssselect('h4 > a')[0].text_content()
        item['time'] = lis.cssselect('div.detail_b > span')[0].text_content()
        item['views'] = lis.cssselect('div.detail_b > em')[0].text_content()
        item['abstract'] = lis.cssselect('p.detail_p')[0].text_content()
        item['link'] = lis.cssselect('h4 > a')[0].attrib['href']
        yield item
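
parse_content is a generator, so callers iterate over it (Example #10 above consumes it exactly this way); a minimal usage sketch with an illustrative URL:

# Illustrative URL; the listing does not show where `url` normally comes from.
for item in parse_content('http://example.com/news/list'):
    print item['title'], item['link']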
Example #16
def train10():
    """
    CIFAR-10 training set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 9].

    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch')
Example #17
def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url] # the queue of URLs to download
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                # add this link to the crawl queue
                crawl_queue.append(link)
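
This crawler (and the variant in Example #19 below) calls a get_links helper that is not defined anywhere in this listing; here is one plausible sketch, assuming the crawler only needs the href values of anchor tags (the regex is an assumption, not the original implementation):

import re

def get_links(html):
    # Assumed helper: return every href value found in the page's <a> tags.
    if html is None:
        return []
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)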
Example #18
def iteration():
    max_errors = 5 # maximum number of consecutive download errors allowed
    num_errors = 0 # current number of consecutive download errors
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-{}'.format(page)
        html = download(url)
        if html is None:
            # received an error trying to download this webpage
            num_errors += 1
            if num_errors == max_errors:
                # reached the maximum number of errors in a row,
                # so assume we have reached the last country ID and can stop downloading
                break
        else:
            # success - can scrape the result
            # ...
            num_errors = 0
Example #19
def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]
    seen = set(crawl_queue) # keep track of which URLs have been seen before
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            # check if link matches expected regex
            if re.match(link_regex, link):
                # form absolute link
                link = urlparse.urljoin(seed_url, link)
                # check if we have already seen this link
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
Example #20
def test():
    """
    Conll05 test set creator.

    Because the training dataset is not free, the test dataset is used for
    training. It returns a reader creator, each sample in the reader is nine
    features, including sentence sequence, predicate, predicate context,
    predicate context flag and tagged sequence.

    :return: Training reader creator
    :rtype: callable
    """
    word_dict, verb_dict, label_dict = get_dict()
    reader = corpus_reader(
        download(DATA_URL, 'conll05st', DATA_MD5),
        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
    return reader_creator(reader, word_dict, verb_dict, label_dict)
Example #21
def __initialize_meta_info__():
    fn = download(URL, "movielens", MD5)
    global MOVIE_INFO
    if MOVIE_INFO is None:
        pattern = re.compile(r'^(.*)\((\d+)\)$')
        with zipfile.ZipFile(file=fn) as package:
            for info in package.infolist():
                assert isinstance(info, zipfile.ZipInfo)
                MOVIE_INFO = dict()
                title_word_set = set()
                categories_set = set()
                with package.open('ml-1m/movies.dat') as movie_file:
                    for i, line in enumerate(movie_file):
                        movie_id, title, categories = line.strip().split('::')
                        categories = categories.split('|')
                        for c in categories:
                            categories_set.add(c)
                        title = pattern.match(title).group(1)
                        MOVIE_INFO[int(movie_id)] = MovieInfo(
                            index=movie_id, categories=categories, title=title)
                        for w in title.split():
                            title_word_set.add(w.lower())

                global MOVIE_TITLE_DICT
                MOVIE_TITLE_DICT = dict()
                for i, w in enumerate(title_word_set):
                    MOVIE_TITLE_DICT[w] = i

                global CATEGORIES_DICT
                CATEGORIES_DICT = dict()
                for i, c in enumerate(categories_set):
                    CATEGORIES_DICT[c] = i

                global USER_INFO
                USER_INFO = dict()
                with package.open('ml-1m/users.dat') as user_file:
                    for line in user_file:
                        uid, gender, age, job, _ = line.strip().split("::")
                        USER_INFO[int(uid)] = UserInfo(
                            index=uid, gender=gender, age=age, job_id=job)
    return fn
Example #22
def get_image(self, image_object, try_web=True):
    image_path = self.get_image_path(image_object.get_small_basename())
    temp_path = self.get_temp_path(image_object.get_small_basename())

    if os.path.exists(image_path) and os.path.isfile(image_path):
        try:
            pixbuf = gtk.gdk.pixbuf_new_from_file(image_path)
        except gobject.GError:
            try:
                os.unlink(image_path)
            except: pass
        else:
            del pixbuf
            return image_path

    if try_web:
        small_image_url = image_object.small_url
        if small_image_url:
            ret = common.download(small_image_url, temp_path)
            if ret and self.cleanup_small(temp_path, image_path):
                return image_path

    return None
Example #23
cwd = os.getcwd()

start_time = time.time()

if args.begin_phase <= 1:
	print('\n======= Phase I, downloading data =======')
	for name, url_tuple in baseline_data.items():
		if verbose:
			print('Downloading ' +str(name))
			sys.stdout.flush()
		path = os.path.join('datasets/', name)
	#	if url_tuple[RES_LOCATION].startswith('http') or \
	#			url_tuple[RES_LOCATION].startswith('ftp'):
		loc = url_tuple[RES_LOCATION]
		if any([loc.startswith(x) for x in ['file', 'ftp', 'http']]):
			download(url_tuple[RES_LOCATION], path)
			print(loc)
	print('Phase 1 ran in %.3f minutes' % ((time.time() - start_time) / 60))
	
	if args.end_phase == 1:
		print('\nTerminating process after phase 1 as specified by user.')
		print('Total runtime: %.3f minutes' % ((time.time() - start_time) / 60))
		sys.exit()
else:
	print('\nSkipping phase 1.')

sys.stdout.flush()

if args.begin_phase <= 2:
	print('\n======= Phase II, parsing data =======')
	# For now, download and store the data in the parsed.py module. This module
Example #24
if not os.path.exists(resource_dir):
    os.mkdir(resource_dir)

# change to resource directory
os.chdir(resource_dir)

# make dataset directory
if not os.path.exists(path_constants.dataset_dir):
    os.mkdir(path_constants.dataset_dir)

# create empty dictionary to hold all ns values and equivalence
gp_dict = {}

# parse reference dataset (entrez gene)
for path, url in gp_reference.file_to_url.items():
    download(url, path)
parser = gp_reference.parser_class(gp_reference.file_to_url)
print("Running " + str(parser))
(gene_dict, history_dict) = parser.parse()
gp_dict.update(gene_dict)

# parse dependent datasets
for d in gp_datasets:
    for path, url in d.file_to_url.items():
        download(url, path)
    parser = d.parser_class(gene_dict, history_dict, d.file_to_url)
    print("Running " + str(parser))
    gp_dict.update(parser.parse())

print("Completed gene protein resource generation.")
print("Number of namespace entries: %d" %(len(gp_dict)))
Example #25
def fetch():
    download(DATA_URL, 'flowers', DATA_MD5)
    download(LABEL_URL, 'flowers', LABEL_MD5)
    download(SETID_URL, 'flowers', SETID_MD5)
Example #26
def fetch():
    download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
    download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
Example #27
change_log['MESHCL'] = {}
change_log['MESHD'] = {}
change_log['SCHEM'] = {}
change_log['SDIS'] = {}
change_log['DO'] = {}
change_log['DOID'] = {}
# download the data needed for resolving lost values
print('\nDownloading data needed for resolving changed/lost terms...')
if not os.path.exists('changelog_datasets/'):
	os.mkdir('changelog_datasets/')
for name, data_tuple in changelog_data.items():
	if verbose:
		print('Downloading ' +str(data_tuple[RES_LOCATION]))
	path = os.path.join('changelog_datasets/', name)
	if 'ftp' in data_tuple[RES_LOCATION] or 'http' in data_tuple[RES_LOCATION]:
		download(data_tuple[RES_LOCATION], path)

print('Resolving changed/lost terms...')
sp_accession_ids = []
for label, data_tuple in changelog_data.items():
	url = label
	parser = data_tuple[PARSER_TYPE]('changelog_datasets/'+url)

	if str(parser) == 'EntrezGeneHistory_Parser':
		log = change_log.get('EGID')
		if verbose:
			print('\nGathering Entrez update info...')
		for row in parser.parse():
			discontinued_id = row.get('Discontinued_GeneID')
			gid = row.get('GeneID')
			replacement_id = gid if gid != '-' else 'withdrawn'
Example #28
def get_embedding():
    """
    Get the trained word vector based on Wikipedia corpus.
    """
    return download(EMB_URL, 'conll05st', EMB_MD5)
Example #29
def fetch():
    return download(URL, "MQ2007", MD5)
Example #30
def fetch():
    download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
    download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
    download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
    download(EMB_URL, 'conll05st', EMB_MD5)
    download(DATA_URL, 'conll05st', DATA_MD5)