Example #1
    def download_clicked(self, _=None):
        limit_input = self.limit_entry.get()
        date_input = self.date_entry.get()
        type_input = self.type_dropdown.get()
        content_input = self.content_dropdown.get()
        limit_type = self.limit_text_entry.get()
        folder = self.download_folder_entry.get()
        try:
            if folder:
                folder = str(folder)
                if folder != clean_filename(folder):
                    raise ValueError(texts.get('RANK_INVALID_FOLDER_ERROR'))
            else:
                folder = None

            rank_params = self.check_input(limit=limit_input,
                                           date=date_input,
                                           rank_type=type_input,
                                           content=content_input,
                                           limit_type=limit_type)
            rank_params['folder'] = folder
            download(target=self.pikax_handler.rank, kwargs=rank_params)

        except ValueError as e:
            import sys
            sys.stdout.write(
                texts.get('RANK_ERROR_MESSAGE').format(error_message=str(e)))
Example #2
def download():
    if downloadtuble.index(downloadcb.get()) == 1:
        section = common.downloadmode1()
    elif downloadtuble.index(downloadcb.get()) == 2:
        section = common.downloadmode2(downloadmode2en.get())
    elif downloadtuble.index(downloadcb.get()) == 3:
        section = common.downloadmode3(downloadmode3en1.get(),
                                       downloadmode3en2.get())
    else:
        return
    if section == -1:
        tkinter.messagebox.showerror('错误', '输入的不是数字')
        return
    elif section == 999:
        tkinter.messagebox.showerror('错误', '数字范围有误')
        return
    if not os.path.isdir(downloadpathstr.get()):
        tkinter.messagebox.showerror('错误', '路径有误')
        return
    downloadlogstr.set('开始下载')
    for i in range(section[0], section[1] + 1):
        downloadlogstr.set('正在下载第' + str(i) + '张CG图片')
        if i == 386 or i == 387:
            downloadlogstr.set('正在下载被和谐的图片,下载速度较慢')
        window.update()
        common.download(i, downloadpathstr.get())
        time.sleep(0.1)
    downloadlogstr.set('下载完成')
    tkinter.messagebox.showinfo('提示', '下载完成')
Example #3
def get_srtm_tile(srtm_tile, out_dir):
    """
    Downloads and extracts an srtm tile from the internet.

    Args:
        srtm_tile: string following the pattern 'srtm_%02d_%02d', identifying
            the desired srtm tile
        out_dir: directory where to store and extract the srtm tiles
    """
    # check if the tile is already there
    mkdir_p(out_dir)
    if os.path.exists(os.path.join(out_dir, '%s.tif' % srtm_tile)):
        return

    # download the zip file
    srtm_tile_url = '%s/%s.zip' % (cfg['srtm_url'], srtm_tile)
    zip_path = os.path.join(out_dir, '%s.zip' % srtm_tile)
    common.download(zip_path, srtm_tile_url)

    # extract the tif file
    if zipfile.is_zipfile(zip_path):
        z = zipfile.ZipFile(zip_path, 'r')
        z.extract('%s.tif' % srtm_tile, out_dir)
    else:
        print("%s not available" % srtm_tile)

    # remove the zip file
    os.remove(zip_path)
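
A minimal usage sketch for the helper above. The tile name follows the 'srtm_%02d_%02d' pattern from the docstring; the cache directory is a hypothetical path, and the cfg['srtm_url'] setting the function relies on is assumed to be configured elsewhere in the project.

# hypothetical call: download and unpack one SRTM tile into a local cache
get_srtm_tile('srtm_37_04', '/tmp/srtm_cache')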
Example #4
def crawl_sitemap():
    """Download the sitemap, extract links by using a regex, download all links.
    """
    sitemap = download(URL)
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    for link in links:
        html = download(link)
Example #5
def main():
    sys.stdout.write(u'正在努力请求节目单...')
    sys.stdout.flush()
    data = common.open_url(list_url)
    if not data:
        return
    menu_list = json.loads(data)['list']
    sys.stdout.write('\r')

    list_format = u'[{title}] by {author}  |  {player} {min:02}:{sec:02}'
    print u'{0:*^60}'.format(u'悦读FM.倾听文字的声音')
    print u'总共%d期.最新10期:'%len(menu_list)

    for i in range(0,10):
        print i,list_format.format(**menu_list[i])
    print u"\n输入序号下载,以','分开.'q'退出"

    while 1:
        usr_input = raw_input('Select(0-%d):'%(len(menu_list)-1))
        if usr_input == 'q':
            print 'bye!'
            break
        try:
            li = map(int, usr_input.split(','))
        except ValueError:
            print 'Input Error!'
            continue
        for i in li:
            if 0 <= i < len(menu_list):
                common.download(menu_list[i]['mp3'], _TARGET,\
                    menu_list[i]['title'], 'mp3', Referer='http://yuedu.fm/')
                article2Html(i, menu_list[i]['title'])
Example #6
def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False):
    '''
    Create flowers test set reader.
    It returns a reader, each sample in the reader is
    image pixels in [0, 1] and label in [1, 102]
    translated from original color image by steps:
    1. resize to 256*256
    2. random crop to 224*224
    3. flatten
    :param mapper:  a function to map sample.
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
    :param cycle: whether to cycle through the dataset
    :type cycle: bool
    :return: test data reader
    :rtype: callable
    '''
    return reader_creator(download(DATA_URL, 'flowers', DATA_MD5),
                          download(LABEL_URL, 'flowers', LABEL_MD5),
                          download(SETID_URL, 'flowers', SETID_MD5),
                          TEST_FLAG,
                          mapper,
                          buffered_size,
                          use_xmap,
                          cycle=cycle)
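
The PaddlePaddle reader creators on this page all follow the same convention: the returned object is a callable that yields one sample per iteration. A minimal consumption sketch, assuming the (image, label) sample layout described in the docstring above:

# hypothetical usage: iterate the flowers test set once
test_reader = test(buffered_size=512)
for image, label in test_reader():
    pass  # feed the flattened pixels and the [1, 102] label to a model here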
Example #7
    def download_clicked(self, _=None):
        try:
            params = self._get_params()
            download(target=self.pikax_handler.download_by_artist_id,
                     kwargs=params)
        except ValueError as e:
            sys.stdout.write(str(e))
Example #8
def down(dl_list, minmax):
    for image in dl_list:
        filename = image.split("/")
        filename = filename[-1].split("?")
        stdout.write("\rdownloading {}".format(filename))
        stdout.flush()
        common.download(image, filename[0], minmax)
Example #9
def main():
    parser = argparse.ArgumentParser(description='download bilibili danmaku')
    parser.add_argument(
        '--id', metavar='ID',
        help='use a BV number, av number or bangumi media id to specify target video')
    parser.add_argument(
        '-t', '--type', metavar='TYPE',
        help='TYPE=BV/av/md'
    )
    parser.add_argument(
        '-o', '--output', metavar='OUTPUT_PATH', default='.',
        help='files will be saved in OUTPUT_PATH, or a new subdirectory in OUTPUT_PATH, depending on --mkdir argument. OUTPUT_PATH is default to working path'
    )
    parser.add_argument(
        '--mkdir', action='store_true', default=False,
        help='make new subdirectory in OUTPUT_PATH'
    )
    parser.add_argument(
        '--use-name', action='store_true', default=False,
        help='use video title as the name of subdirectory, video parts and episodes'
    )
    parser.add_argument(
        '--save-info', action='store_true', default=False,
        help='create a json file containing video information'
    )

    args = parser.parse_args()
    if not check_arguments(args):
        sys.exit()
    
    print('Collecting information')
    info = asyncio.run(getinfo.get(args.type, args.id))
    print('Title:', info.title)
    print('Video number:', info.n)
    if args.use_name:
        subdir = common.escape_filename(info.title)
    else:
        subdir = args.type + args.id
    if args.mkdir:
        target_path = os.path.join(args.output, subdir)
        if not os.path.isdir(target_path):
            os.mkdir(target_path)
    else:
        target_path = args.output

    if args.use_name:
        xml_list = [os.path.join(target_path, common.escape_filename(f'{i+1}.{title}.xml')) for i, title in enumerate(info.title_list)]
    else:
        xml_list = [os.path.join(target_path, str(_ + 1) + '.xml') for _ in range(info.n)]
    url_list = [common.comments_url(cid) for cid in info.cid_list]
    print('Downloading')
    common.download(url_list, xml_list)
    if args.save_info:
        if args.use_name:
            info_json_filename = common.escape_filename(info.title + '.json')
        else:
            info_json_filename = args.type + args.id + '.json'
        with open(os.path.join(target_path, info_json_filename), 'w', encoding='utf-8') as info_f:
            json.dump(info.save_info, info_f, ensure_ascii=False, indent=4)
    print('Complete.')
Example #10
def get_srtm_tile(srtm_tile, out_dir):
    """
    Downloads and extracts an srtm tile from the internet.

    Args:
        srtm_tile: string following the pattern 'srtm_%02d_%02d', identifying
            the desired srtm tile
        out_dir: directory where to store and extract the srtm tiles
    """
    # check if the tile is already there
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    if os.path.exists(os.path.join(out_dir, '%s.tif' % srtm_tile)):
        return

    # download the zip file
    srtm_tile_url = '%s/%s.zip' % (cfg['srtm_url'], srtm_tile)
    zip_path = os.path.join(out_dir, '%s.zip' % srtm_tile)
    common.download(zip_path, srtm_tile_url)

    # extract the tif file
    if zipfile.is_zipfile(zip_path):
        z = zipfile.ZipFile(zip_path, 'r')
        z.extract('%s.tif' % srtm_tile, out_dir)
    else:
        print "%s not available" % srtm_tile

    # remove the zip file
    os.remove(zip_path)
Example #11
    def search_and_download_clicked(self, _=None):
        try:
            keyword = str(self.keyword_entry.get())
            if not keyword:
                raise ValueError(texts.get('SEARCH_EMPTY_KEYWORD_ERROR'))

            folder_input = str(self.download_folder_entry.get())
            if folder_input != clean_filename(folder_input):
                raise ValueError(texts.get('SEARCH_INVALID_FOLDER_ERROR'))
            folder = folder_input or None

            try:
                limit_input = int(
                    self.limit_entry.get()) if self.limit_entry.get() else None
            except ValueError:
                raise ValueError(texts.get('SEARCH_LIMIT_ERROR'))
            match_input = str(self.match_dropdown.get())
            sort_input = str(self.sort_dropdown.get())
            popularity_input = str(self.popularity_dropdown.get())
            limit_type_input = str(self.limit_text_entry.get())
            params = self.check_inputs(limit_input=limit_input,
                                       match_input=match_input,
                                       sort_input=sort_input,
                                       popularity_input=popularity_input,
                                       limit_type_input=limit_type_input)
        except (TypeError, ValueError) as e:
            sys.stdout.write(
                texts.get('SEARCH_ERROR_MESSAGE').format(error_message=str(e)))
            return

        params['keyword'] = keyword
        params['folder'] = folder
        download(target=self.pikax_handler.search, kwargs=params)
Example #12
def down(dl_list, minmax):
    for image in dl_list:
        print(image)
        filename = image.split("/")
        filename = filename[-1]
        common.download(image, filename, minmax)
    print("Download complete!")
Example #13
def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
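
A hedged usage sketch for the sitemap crawler above; the sitemap URL is illustrative, and download() is the shared helper these snippets assume is already available.

# hypothetical call: fetch every page listed in the sitemap
crawl_sitemap('http://example.webscraping.com/sitemap.xml')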
Example #14
	def download(self,url,filepath):
		# get the name
		name = self.getName(url)
		html = common.getHtml(url)
		m3u8 = self.getM3u8(html)
		common.download(urllib.unquote(m3u8),filepath,name + '.m3u8')
		url = self.URL_PIRFIX + self.getSinavideoUrl(filepath+name+'.m3u8')
		common.download(url,filepath,name+'.mp4')
Example #15
def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
Example #16
def get_dict():
    """
    Get the word, verb and label dictionary of Wikipedia corpus.
    """
    word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
    verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
    label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
    return word_dict, verb_dict, label_dict
Example #17
def get_dict():
    """
    Get the word, verb and label dictionary of Wikipedia corpus.
    """
    word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
    verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
    label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
    return word_dict, verb_dict, label_dict
Example #18
    def download_clicked(self):
        self.canvas.itemconfigure(self.output_id, text='')
        user_input = self.id_or_url_input.get(0.0, tk.END)
        search_ids = re.findall(r'(?<!\d)\d{8}(?!\d)', user_input, re.S)

        if search_ids:
            params = {'illust_ids': search_ids}
            download(target=self.pikax_handler.download_by_illust_ids, kwargs=params)
        else:
            sys.stdout.write(texts.get('ILLUSTRATION_NO_ID_FOUND'))
Example #19
def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall("<loc>(.*?)</loc>", sitemap)
    # download each link
    for link in links:
        html = download(link)
        # scrape html here
        # ...
        print(html)
Example #20
def download_pic(url):
    print url
    html = common.open_url(url)
    find_re = re.compile(r'<li id.+?<img src="(.+?)"', re.DOTALL)
    img_url = find_re.findall(html)
    print 'Start download %d pics'%len(img_url) 
    for url in img_url:
        if url:
            filename,ext = os.path.splitext(os.path.split(url)[-1])
            if not ext:
                ext = '.jpg'
            common.download(url, TARGET, filename, ext[1:], Referer=url)
Example #21
def download_show(li):
    for num in li:
        if num > 296: 
            url = xml_url_1%num
        else:
            url = xml_url_2%num 
        xml_data = common.open_url(url)
        if xml_data:
            songlist = extract(xml_data)
            target_dir = TARGET%num
            for title, location in songlist:
                ext = location.split('.')[-1]
                common.download(location, target_dir, title, ext, Referer=referer%num)
Example #22
def main():
    D = download(is_cookie=True)
    if not login(D, userid, passwd, dynamic_passwd):
        # login fail, return
        return
    hr_url = 'https://hr.guosen.com.cn/sso/SsoHrssServlet'
    html = D.get(hr_url)
    login_data = {}
    login_data['IASID'] = common_re(r'"IASID"[^>]+value="([^"]+)"', html)
    login_data['TimeStamp'] = common_re(r'"TimeStamp"[^>]+value="([^"]+)"', html)
    login_data['Authenticator'] = common_re(r'"Authenticator"[^>]+value="([^"]+)"', html)
    login_data['ReturnURL'] = 'https://hr.guosen.com.cn/sso/SsoHrssServlet'
    html = D.post('https://sso.guosen.com.cn/login.aspx', login_data)            
    login_data = {}
    login_data['IASID'] = common_re(r'"IASID"[^>]+value="([^"]+)"', html)
    login_data['Result'] = '0' 
    login_data['TimeStamp'] = common_re(r'"TimeStamp"[^>]+value="([^"]+)"', html)
    login_data['UserAccount'] = userid
    login_data['ErrorDescription'] = ''
    login_data['Authenticator'] = common_re(r'"Authenticator"[^>]+value="([^"]+)"', html)
    login_data['IASUserAccount'] = userid
    html = D.post(hr_url, login_data)         

    end_url = 'https://hr.guosen.com.cn/hrss/ta/Clockin.jsp?_funcode=E0020902'
    D.get(end_url)
    post_url = 'https://hr.guosen.com.cn/hrss/dorado/smartweb2.RPC.d?__rpc=true'
    search_data = {}
    search_data['__type'] = 'loadData'
    search_data['__viewInstanceId'] = 'nc.bs.hrss.ta.Clockin~nc.bs.hrss.ta.ClockinViewModel'
    search_data['__xml'] = get_post_xml(month, year) 
    html = D.post(post_url, search_data)
    if 'result succeed="true"' in html:
        print 'Hello world!'
    else:
        print html
Example #23
import urllib.error
import urllib.parse
import urllib.request


def download(url, headers, proxy, num_retries, data=None):
    print('Downloading:%s' % url)
    request = urllib.request.Request(url, data, headers)
    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))

    try:
        response = opener.open(request)
        html = response.read()
        code = response.code

    except urllib.error.URLError as e:
        print('Download error:%s' % e.reason)
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                return download(url, headers, proxy, num_retries - 1, data)
            else:
                code = None

    return html
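
A hedged usage sketch for the retrying downloader above; the header dict and retry count are illustrative values, and no proxy is used.

# hypothetical call: fetch a page, retrying up to twice on 5XX responses
html = download('http://example.webscraping.com', {'User-agent': 'wswp'}, None, 2)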
Example #24
def test():
    word_dict, verb_dict, label_dict = get_dict()
    reader = corpus_reader(
        download(DATA_URL, 'conll05st', DATA_MD5),
        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
    return reader_creator(reader, word_dict, verb_dict, label_dict)
Example #25
def get_dateil():
    content = parse_content(url)
    links = [article['link'] for article in content]
    for link in links:
        article_tree = lxml.html.fromstring(download(link))
        article_content = article_tree.cssselect('div#article_content > p')[0]
        print article_content
Example #26
def extract_article_content(url):
    html = download(url)
    try:
        if html is not None:
            soup = BeautifulSoup(html, 'lxml')
            content_main = soup.select_one('#content-main')
            selectors_to_remove = [
                '.article-function-social-media', '.article-icon.spiegelplus',
                '.article-function-box', 'script', 'style',
                '#js-article-column > p', '#js-article-top-wide-asset',
                '.asset-box', '.article-copyright',
                '.article-function-box-wide', '.top-anchor', '.module-box',
                '.spiegel-asset-box', '#spRecommendations', '#js-video-slider',
                '.column-both-bottom', '#footer'
            ]
            for selector in selectors_to_remove:
                for node in content_main.select(selector):
                    node.decompose()
            for comment in soup.findAll(
                    text=lambda text: isinstance(text, Comment)):
                comment.extract()
            content = re.sub(
                '(\r\n|\n|\t|\s+)', ' ',
                reduce(lambda agg, cur: agg + ' ' + cur,
                       content_main.findAll(text=True)))
            return content
    except Exception:
        print('extraction of {} failed ({})'.format(url, format_exc()))
Example #27
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxy=None, num_retries=1):
	crawl_queue = deque([seed_url])
	seen = {seed_url: 0}
	num_urls = 0
	rp = get_robots(seed_url)
	throttle = Throttle(delay)
	headers = headers or {}
	if user_agent:
		headers['User-agent'] = user_agent

	while crawl_queue:
		url = crawl_queue.pop()
		if rp.can_fetch(user_agent, url):
			throttle.wait(url)
			html = download(url, headers, proxy=proxy, num_retries=num_retries).decode('utf-8')
			links = []

			depth = seen[url]
			if depth != max_depth:
				if link_regex:
					links.extend(link for link in get_links(html) if re.match(link_regex, link))

				for link in links:
					link = normalize(seed_url, link)
					if link not in seen:
						seen[link] = depth + 1
						if same_domain(seed_url, link):
							crawl_queue.append(link)

			num_urls += 1
			if num_urls == max_urls:
				break
		else:
			print('Blocked by robots.txt:', url)
Example #28
def scrape():
    url = 'http://example.webscraping.com/places/default/view/United-Kindom-239'
    html = download(url)
    tree = lxml.html.fromstring(html)
    td = tree.cssselect('tr#places_area__row > td.w2p_fw')[0]
    area = td.text_content()
    return area
Example #29
def main():
  common.setup()
  usage = 'USAGE: {0} hdfs_src gs_dst'.format(common.script_name())
  if len(sys.argv) != 3:
    print usage
    sys.exit(1)

  src = sys.argv[1]
  if sys.argv[2].startswith('gs://') or not sys.argv[2].startswith('/'):
    print usage
    print ('gs_dst should be of the form /path/to/object. gs://{0} will be '
           'prefixed for you.').format(cfg.gs_bucket)
    sys.exit(1)
  dst = 'gs://{0}{1}'.format(cfg.gs_bucket, sys.argv[2])

  common.download(src, dst)
Example #30
def main():
    common.setup()
    usage = 'USAGE: {0} hdfs_src gs_dst'.format(common.script_name())
    if len(sys.argv) != 3:
        print usage
        sys.exit(1)

    src = sys.argv[1]
    if sys.argv[2].startswith('gs://') or not sys.argv[2].startswith('/'):
        print usage
        print(
            'gs_dst should be of the form /path/to/object. gs://{0} will be '
            'prefixed for you.').format(cfg.gs_bucket)
        sys.exit(1)
    dst = 'gs://{0}{1}'.format(cfg.gs_bucket, sys.argv[2])

    common.download(src, dst)
Example #31
def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url).decode('utf-8')
        for link in get_links(html):
            if re.match(link_regex, link):
                crawl_queue.append(link)
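
A hedged usage sketch for the minimal crawler above; the seed URL and link pattern are illustrative. Note that this version keeps no record of visited URLs, so it can revisit the same pages indefinitely (Example #46 adds the missing seen set).

# hypothetical call: follow every link whose path matches /index or /view
link_crawler('http://example.webscraping.com', '/(index|view)')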
Example #32
def downloadQtPackge():
    qt_version_dotless = qt_version.replace('.', '')
    base_url = 'https://download.qt.io/online/qtsdkrepository/{}/{}/qt5_{}' \
        .format(os_url, target_platform ,qt_version_dotless)
    updates_file = 'Updates-{}-{}.xml'.format(qt_version, os_name)
    c.download(base_url + '/Updates.xml', updates_file)

    updates = ET.parse(updates_file)
    updates_root = updates.getroot()
    all_modules = {}
    for i in updates_root.iter('PackageUpdate'):
        name = i.find('Name').text
        if 'debug' in name or not kit_arch in name:
            continue

        archives = i.find('DownloadableArchives')
        if archives.text is None:
            continue

        archives_parts = archives.text.split(',')
        version = i.find('Version').text
        for archive in archives_parts:
            archive = archive.strip()
            parts = archive.split('-')
            module_name = parts[0]
            all_modules[module_name] = {
                'package': name,
                'file': version + archive
            }

    if len(sys.argv) > 1:  # handle subcommand
        if sys.argv[1] == 'list':
            c.print('Available modules:')
            for k in iter(sorted(all_modules.keys())):
                c.print(k, '---', all_modules[k]['file'])
        exit(0)

    for module in qt_modules_list:
        if module not in all_modules:
            c.print('>> Required module {} not available'.format(module))
            continue
        file_name = all_modules[module]['file']
        package = all_modules[module]['package']
        c.download(base_url + '/' + package + '/' + file_name, file_name)
        c.extract(file_name, '.')
Example #33
    def downOrExplain(self, aurl):
        fileName = aurl.replace('http://', '').replace('/', '_').replace('?', '_') + ".html"
        # print fileName
        # fileName="m1.html"
        if not os.path.exists(fileName):
            print "down file"
            html = download(aurl)
            self.saveHtml(fileName, html)
        self.html = self.readFile(fileName)
Example #34
def test():
    global UCI_TEST_DATA
    load_data(download(URL, 'uci_housing', MD5))

    def reader():
        for d in UCI_TEST_DATA:
            yield d[:-1], d[-1:]

    return reader
Example #35
def iteration():
    for page in itertools.count(1):
        # url = 'http://example.webscraping.com/view/-%d' % page
        url = 'http://example.webscraping.com/view/-{}'.format(page)
        html = download(url)
        if html is None:
            break
        else:
            pass
Example #36
def download_mingw_tool():
    qt_version_dotless = qt_version.replace('.', '')
    base_url = 'https://download.qt.io/online/qtsdkrepository/windows_x86/desktop/tools_mingw/'
    updates_file = 'Updates-{}-{}-{}.xml'.format(qt_version, os_name, 'qttool')
    c.download(base_url + '/Updates.xml', updates_file)

    updates = ET.parse(updates_file)
    updates_root = updates.getroot()
    all_modules = {}
    for i in updates_root.iter('PackageUpdate'):
        name = i.find('Name').text

        if 'debug' in name or not kit_arch in name:
            continue

        archives = i.find('DownloadableArchives')
        if archives.text is None:
            continue
        c.print(' archives: {}'.format(archives))
        archives_parts = archives.text.split(',')
        version = i.find('Version').text
        c.print(' version: {}'.format(version))
        for archive in archives_parts:
            archive = archive.strip()
            parts = archive.split('-')
            module_name = parts[0]
            all_modules[module_name] = {
                'package': name,
                'file': version + archive
            }
    if len(sys.argv) > 1:  # handle subcommand
        if sys.argv[1] == 'list':
            c.print('Available modules:')
            for k in iter(sorted(all_modules.keys())):
                c.print(k, '---', all_modules[k]['file'])
        exit(0)

    file_name = all_modules[module_name]['file']
    package = all_modules[module_name]['package']
    c.print('download url: {}'.format(base_url + '/' + package + '/' +
                                      file_name))
    c.download(base_url + '/' + package + '/' + file_name, file_name)
    c.extract(file_name, '.')
Example #37
def link_crawler(seed_url, link_regex):
    """Crawl from the giver seed URL following links matched by link_regex"""
    crawl_queue = [seed_url]  # the queue of URL's to download
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                # add this link to the crawl queue
                crawl_queue.append(link)
Example #38
def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
    '''
    Create flowers training set reader.
    It returns a reader, each sample in the reader is
    image pixels in [0, 1] and label in [1, 102]
    translated from original color image by steps:
    1. resize to 256*256
    2. random crop to 224*224
    3. flatten
    :param mapper:  a function to map sample.
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
    :return: train data reader
    :rtype: callable
    '''
    return reader_creator(download(DATA_URL, 'flowers', DATA_MD5),
                          download(LABEL_URL, 'flowers', LABEL_MD5),
                          download(SETID_URL, 'flowers', SETID_MD5),
                          TRAIN_FLAG, mapper, buffered_size, use_xmap)
Example #39
def test100():
    """
    CIFAR-100 test set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 99].

    :return: Test reader creator.
    :rtype: callable
    """
    return reader_creator(download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test')
Example #40
def main():
    if len(sys.argv) < 3 or (sys.argv[1] != '-t' and len(sys.argv) > 3):
        help_info()
        return
    if sys.argv[1] == '-a':
        url = _albumUrl % sys.argv[2]
    elif sys.argv[1] == '-c':
        url = _collectUrl % sys.argv[2]
    elif sys.argv[1] == '-t':
        url = _trackUrl % ','.join(sys.argv[2:])
    else :
        help_info()
        return
    content = common.open_url(url)
    if not content:
        return
    res = extract(content)
    for title,uri,lrc in res:
        common.download(uri,TARGET,title,'mp3')
        if lrc:
Example #41
def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url] # the queue of URL's to download
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                # add this link to the crawl queue
                crawl_queue.append(link)
Example #42
def parse_content(url):
    html = download(url)
    tree = lxml.html.fromstring(html)
    td = tree.cssselect('ul.detail_list > li')
    for lis in td:
        item = {}
        item['title'] = lis.cssselect('h4 > a')[0].text_content()
        item['time'] = lis.cssselect('div.detail_b > span')[0].text_content()
        item['views'] = lis.cssselect('div.detail_b > em')[0].text_content()
        item['abstract'] = lis.cssselect('p.detail_p')[0].text_content()
        item['link'] = lis.cssselect('h4 > a')[0].attrib['href']
        yield item
Example #43
def train10():
    """
    CIFAR-10 training set creator.

    It returns a reader creator, each sample in the reader is image pixels in
    [0, 1] and label in [0, 9].

    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(
        download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch')
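
A minimal consumption sketch, assuming the same reader creator convention as the other dataset snippets on this page.

# hypothetical usage: iterate the CIFAR-10 training samples once
for image, label in train10()():
    pass  # pixels are scaled to [0, 1], the label is in [0, 9]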
Example #44
def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
    '''
    Create flowers validation set reader.
    It returns a reader, each sample in the reader is
    image pixels in [0, 1] and label in [1, 102]
    translated from original color image by steps:
    1. resize to 256*256
    2. random crop to 224*224
    3. flatten
    :param mapper:  a function to map sample.
    :type mapper: callable
    :param buffered_size: the size of buffer used to process images
    :type buffered_size: int
    :return: test data reader
    :rtype: callable
    '''
    return reader_creator(
        download(DATA_URL, 'flowers', DATA_MD5),
        download(LABEL_URL, 'flowers', LABEL_MD5),
        download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper,
        buffered_size, use_xmap)
Example #45
def iteration():
    max_errors = 5 # maximum number of consecutive download errors allowed
    num_errors = 0 # current number of consecutive download errors
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-{}'.format(page)
        html = download(url)
        if html is None:
            # received an error trying to download this webpage
            num_errors += 1
            if num_errors == max_errors:
                # reached maximum amount of errors in a row so exit
                break
            # so assume have reached the last country ID and can stop downloading
        else:
            # success - can scrape the result
            # ...
            num_errors = 0
Example #46
def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]
    seen = set(crawl_queue) # keep track which URL's have seen before
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            # check if link matches expected regex
            if re.match(link_regex, link):
                # form absolute link
                link = urlparse.urljoin(seed_url, link)
                # check if have already seen this link
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
Example #47
def test():
    """
    Conll05 test set creator.

    Because the training dataset is not free, the test dataset is used for
    training. It returns a reader creator, each sample in the reader is nine
    features, including sentence sequence, predicate, predicate context,
    predicate context flag and tagged sequence.

    :return: Training reader creator
    :rtype: callable
    """
    word_dict, verb_dict, label_dict = get_dict()
    reader = corpus_reader(
        download(DATA_URL, 'conll05st', DATA_MD5),
        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
    return reader_creator(reader, word_dict, verb_dict, label_dict)
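
A minimal consumption sketch for the reader creator above, assuming each yielded sample carries the nine features listed in the docstring.

# hypothetical usage: pull a single sample from the CoNLL-05 test reader
sample = next(test()())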
Example #48
def __initialize_meta_info__():
    fn = download(URL, "movielens", MD5)
    global MOVIE_INFO
    if MOVIE_INFO is None:
        pattern = re.compile(r'^(.*)\((\d+)\)$')
        with zipfile.ZipFile(file=fn) as package:
            for info in package.infolist():
                assert isinstance(info, zipfile.ZipInfo)
                MOVIE_INFO = dict()
                title_word_set = set()
                categories_set = set()
                with package.open('ml-1m/movies.dat') as movie_file:
                    for i, line in enumerate(movie_file):
                        movie_id, title, categories = line.strip().split('::')
                        categories = categories.split('|')
                        for c in categories:
                            categories_set.add(c)
                        title = pattern.match(title).group(1)
                        MOVIE_INFO[int(movie_id)] = MovieInfo(
                            index=movie_id, categories=categories, title=title)
                        for w in title.split():
                            title_word_set.add(w.lower())

                global MOVIE_TITLE_DICT
                MOVIE_TITLE_DICT = dict()
                for i, w in enumerate(title_word_set):
                    MOVIE_TITLE_DICT[w] = i

                global CATEGORIES_DICT
                CATEGORIES_DICT = dict()
                for i, c in enumerate(categories_set):
                    CATEGORIES_DICT[c] = i

                global USER_INFO
                USER_INFO = dict()
                with package.open('ml-1m/users.dat') as user_file:
                    for line in user_file:
                        uid, gender, age, job, _ = line.strip().split("::")
                        USER_INFO[int(uid)] = UserInfo(
                            index=uid, gender=gender, age=age, job_id=job)
    return fn
Example #49
    def get_image(self, image_object, try_web=True):
        image_path = self.get_image_path(image_object.get_small_basename())
        temp_path = self.get_temp_path(image_object.get_small_basename())

        if os.path.exists(image_path) and os.path.isfile(image_path):
            try:
                pixbuf = gtk.gdk.pixbuf_new_from_file(image_path)
            except gobject.GError:
                try:
                    os.unlink(image_path)
                except OSError:
                    pass
            else:
                del pixbuf
                return image_path

        if try_web:
            small_image_url = image_object.small_url
            if small_image_url:
                ret = common.download(small_image_url, temp_path)
                if ret and self.cleanup_small(temp_path, image_path):
                    return image_path

        return None
Example #50
cwd = os.getcwd()

start_time = time.time()

if args.begin_phase <= 1:
	print('\n======= Phase I, downloading data =======')
	for name, url_tuple in baseline_data.items():
		if verbose:
			print('Downloading ' +str(name))
			sys.stdout.flush()
		path = os.path.join('datasets/', name)
	#	if url_tuple[RES_LOCATION].startswith('http') or \
	#			url_tuple[RES_LOCATION].startswith('ftp'):
		loc = url_tuple[RES_LOCATION]
		if any([loc.startswith(x) for x in ['file', 'ftp', 'http']]):
			download(url_tuple[RES_LOCATION], path)
			print(loc)
	print('Phase 1 ran in %.3f minutes' % ((time.time() - start_time) / 60))
	
	if args.end_phase == 1:
		print('\nTerminating process after phase 1 as specified by user.')
		print('Total runtime: %.3f minutes' % ((time.time() - start_time) / 60))
		sys.exit()
else:
	print('\nSkipping phase 1.')

sys.stdout.flush()

if args.begin_phase <= 2:
	print('\n======= Phase II, parsing data =======')
	# For now, download and store the data in the parsed.py module. This module
Example #51
if not os.path.exists(resource_dir):
    os.mkdir(resource_dir)

# change to resource directory
os.chdir(resource_dir)

# make dataset directory
if not os.path.exists(path_constants.dataset_dir):
    os.mkdir(path_constants.dataset_dir)

# create empty dictionary to hold all ns values and equivalence
gp_dict = {}

# parse reference dataset (entrez gene)
for path, url in gp_reference.file_to_url.items():
    download(url, path)
parser = gp_reference.parser_class(gp_reference.file_to_url)
print("Running " + str(parser))
(gene_dict, history_dict) = parser.parse()
gp_dict.update(gene_dict)

# parse dependent datasets
for d in gp_datasets:
    for path, url in d.file_to_url.items():
        download(url, path)
    parser = d.parser_class(gene_dict, history_dict, d.file_to_url)
    print("Running " + str(parser))
    gp_dict.update(parser.parse())

print("Completed gene protein resource generation.")
print("Number of namespace entries: %d" %(len(gp_dict)))
Example #52
def fetch():
    download(DATA_URL, 'flowers', DATA_MD5)
    download(LABEL_URL, 'flowers', LABEL_MD5)
    download(SETID_URL, 'flowers', SETID_MD5)
Example #53
def fetch():
    download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
    download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
    download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
    download(EMB_URL, 'conll05st', EMB_MD5)
    download(DATA_URL, 'conll05st', DATA_MD5)
Example #54
def fetch():
    return download(URL, "MQ2007", MD5)
Example #55
def get_embedding():
    """
    Get the trained word vector based on Wikipedia corpus.
    """
    return download(EMB_URL, 'conll05st', EMB_MD5)
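
A hedged usage note: like the fetch() helpers elsewhere on this page, this function appears to return the local path of the downloaded file rather than the vectors themselves, so the caller is expected to load it separately.

# hypothetical usage: the return value is a filesystem path
emb_path = get_embedding()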
Example #56
change_log['MESHCL'] = {}
change_log['MESHD'] = {}
change_log['SCHEM'] = {}
change_log['SDIS'] = {}
change_log['DO'] = {}
change_log['DOID'] = {}
# download the data needed for resolving lost values
print('\nDownloading data needed for resolving changed/lost terms...')
if not os.path.exists('changelog_datasets/'):
	os.mkdir('changelog_datasets/')
for name, data_tuple in changelog_data.items():
	if verbose:
		print('Downloading ' +str(data_tuple[RES_LOCATION]))
	path = os.path.join('changelog_datasets/', name)
	if 'ftp' in data_tuple[RES_LOCATION] or 'http' in data_tuple[RES_LOCATION]:
		download(data_tuple[RES_LOCATION], path)

print('Resolving changed/lost terms...')
sp_accession_ids = []
for label, data_tuple in changelog_data.items():
	url = label
	parser = data_tuple[PARSER_TYPE]('changelog_datasets/'+url)

	if str(parser) == 'EntrezGeneHistory_Parser':
		log = change_log.get('EGID')
		if verbose:
			print('\nGathering Entrez update info...')
		for row in parser.parse():
			discontinued_id	 = row.get('Discontinued_GeneID')
			gid = row.get('GeneID')
			replacement_id = gid if gid != '-' else 'withdrawn'
Example #57
def fetch():
    download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
    download(CIFAR100_URL, 'cifar', CIFAR100_MD5)