def get_country_urls(locs_url,
                     countries_xpath='//div[@id="content"]/div/div/a[@href]'):
    '''Fetch the locations index page and return (country_urls, country_names).'''
    info_container, records_container = taxonomy_init()
    info_container['url'] = locs_url
    # retry forever, backing off 10 s per failed attempt (capped at 300 s)
    sleep_cnt = 0
    while True:
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            time.sleep(10 * sleep_cnt)
        else:
            break
    root = get_etree_root(info_container)
    eles = root.xpath(countries_xpath)
    country_urls = []
    country_names = []
    for ele in eles:
        country_urls.append(nudipix_base_url + ele.attrib['href'])
        country_names.append(ele.text)
    return (country_urls, country_names)
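# Editor's sketch (not part of the original module): the fetch-with-retry loop
# used above is repeated verbatim in most of the functions below. A helper like
# this one could replace those loops; it only uses the nvsoli calls already
# present in this file and is not wired into any caller here.
def fetch_with_retry(info_container, records_container, max_backoff_steps=30):
    '''Retry nvsoli.walkon() / nvsoli.auto_redireced() until they succeed,
    sleeping 10 s times the attempt count between failures, with the backoff
    capped at max_backoff_steps * 10 seconds.'''
    sleep_cnt = 0
    while True:
        sleep_cnt = min(sleep_cnt + 1, max_backoff_steps)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            time.sleep(10 * sleep_cnt)
        else:
            return info_container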
def get_EXIF(EXIF_url):
    '''Fetch a photo's EXIF page and return the EXIF table as a dict.'''
    info_container, records_container = taxonomy_init()
    info_container['url'] = EXIF_url
    # retry forever, backing off 10 s per failed attempt (capped at 300 s)
    sleep_cnt = 0
    while True:
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            time.sleep(10 * sleep_cnt)
        else:
            break
    root = get_etree_root(info_container)
    rows = root.xpath('//table[@class="exif"]/tr')
    EXIF = {}
    for row in rows:
        cells = row.xpath('td')
        key = cells[0].text.rstrip(':')
        EXIF[key] = cells[1].text
    return EXIF
def fishbase_init(base_url='http://www.fishbase.us/'):
    info_container = nvsoli.new_info_container()
    info_container['base_url'] = base_url
    info_container['url'] = base_url
    info_container['method'] = 'GET'
    req_head_str = (
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,*/*;q=0.8\r\n'
        'User-Agent: Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 '
        'Safari/537.36\r\n'
        'Accept-Encoding: gzip,deflate,sdch\r\n'
        'Accept-Language: en;q=1.0, zh-CN;q=0.8')
    info_container['req_head'] = nvhead.build_headers_dict_from_str(
        req_head_str, '\r\n')
    info_container['req_head']['Connection'] = 'close'
    # init records_container
    records_container = nvsoli.new_records_container()
    info_container = nvsoli.walkon(info_container,
                                   records_container=records_container)
    info_container = nvsoli.auto_redireced(info_container, records_container)
    return (info_container, records_container)
def search_via_country(c_code, info_container, records_container):
    req_body = gen_CI_post_body(Country=c_code)
    ciurl = info_container['base_url'] + 'country/CountrySearchList.php'
    info_container['req_head']['Referer'] = info_container['base_url']
    info_container['req_head']['Upgrade-Insecure-Requests'] = 1
    info_container['req_head']['Content-Type'] = 'application/x-www-form-urlencoded'
    info_container['url'] = ciurl
    info_container['method'] = 'POST'
    info_container['req_body'] = req_body
    info_container = nvsoli.walkon(info_container,
                                   records_container=records_container)
    info_container['method'] = 'GET'
    info_container = nvsoli.auto_redireced(info_container, records_container)
    html_text = info_container['resp_body_bytes'].decode('utf-8')
    root = etree.HTML(html_text)
    return root
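# Usage sketch (editor's addition, not in the original module): how the two
# FishBase helpers above fit together. The country code is a hypothetical
# placeholder; real codes come from FishBase's own country list, and the full
# scraping pipeline on top of this lives in get_country_infos() below.
def _example_country_search(c_code=100):
    '''Initialise the containers, run one country search, and return the
    parsed lxml root of the result page.'''
    info_container, records_container = fishbase_init()
    root = search_via_country(c_code, info_container, records_container)
    return root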
def get_nav_urls(loc_url, nav_xpath='//p[@class="nav"]/a[@href]'):
    '''Return the URL of every page in a location's pagination bar, including loc_url itself.'''
    info_container, records_container = taxonomy_init()
    info_container['url'] = loc_url
    # retry forever, backing off 10 s per failed attempt (capped at 300 s)
    sleep_cnt = 0
    while True:
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            time.sleep(10 * sleep_cnt)
        else:
            break
    root = get_etree_root(info_container)
    eles = root.xpath(nav_xpath)
    if len(eles) == 0:
        nav_urls = []
    else:
        # the second-to-last nav link carries the highest page number
        max_page = int(eles[-2].text)
        nav_urls = [loc_url]
        tem = os.path.dirname(eles[-2].attrib['href'].rstrip('/'))
        for i in range(2, max_page + 1):
            nav_urls.append(nudipix_base_url + tem + '/' + str(i))
    return nav_urls
def get_locsp_urls(nav_url,
                   locsp_xpath='//div[@class="thumbnail"]/div/a[@href]'):
    '''Return the location/species page URLs linked from one nav page.'''
    info_container, records_container = taxonomy_init()
    info_container['url'] = nav_url
    # retry forever, backing off 10 s per failed attempt (capped at 300 s)
    sleep_cnt = 0
    while True:
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            time.sleep(10 * sleep_cnt)
        else:
            break
    root = get_etree_root(info_container)
    eles = root.xpath(locsp_xpath)
    locsp_urls = []
    for ele in eles:
        url = nudipix_base_url + ele.attrib['href']
        if 'location' in url:
            locsp_urls.append(url)
    return locsp_urls
def get_location_urls(
        country_url,
        locations_xpath='//ul[@class="country_dive_site_list"]/li/a[@href]'):
    '''Return (location_urls, location_names) for one country page.'''
    info_container, records_container = taxonomy_init()
    info_container['url'] = country_url
    # retry forever, backing off 10 s per failed attempt (capped at 300 s)
    sleep_cnt = 0
    while True:
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            time.sleep(10 * sleep_cnt)
        else:
            break
    root = get_etree_root(info_container)
    eles = root.xpath(locations_xpath)
    location_urls = []
    location_names = []
    for ele in eles:
        url = nudipix_base_url + ele.attrib['href']
        if 'location' in url:
            location_urls.append(url)
            location_names.append(ele.text)
    return (location_urls, location_names)
def get_country_infos(c_code, country_island_dict, info_container,
                      records_container, **kwargs):
    '''Scrape every fish listed for one country code: write per-species dicts,
    aggregated fishes/pics files, image/thumbnail lookup tables, and download
    the pictures themselves.'''
    if 'display' in kwargs:
        display = int(kwargs['display'])
    else:
        display = 0
    if 'new_database' in kwargs:
        newdb = kwargs['new_database']
    else:
        newdb = 0
    # run the country search, then fetch the query-result listing page
    root = search_via_country(c_code, info_container, records_container)
    tables = get_all_tables(root, info_container['base_url'])
    qurl = get_query_url(info_container, kwargs)
    info_container['url'] = qurl
    info_container['method'] = 'GET'
    info_container['req_body'] = None
    info_container = nvsoli.walkon(info_container,
                                   records_container=records_container)
    html_text = info_container['resp_body_bytes'].decode('utf-8')
    root = etree.HTML(html_text)
    # column headers of the result table become the keys of each fish record
    eles = root.xpath('//thead/tr/th')
    fish = {}
    for ele in eles:
        fish[ele.text] = None
    url_dict = nvurl.url_to_dict(info_container['url'])
    qd = nvurl.urldecode(url_dict['query'])
    all_country_eles = root.xpath("//tr[@class='t_value1']")
    country = country_island_dict[str(c_code)]
    # output directories: INFOS for dicts, PICS for images, THUMBNAILS for thumbs
    fn = ('../INFOS/COUNTRYANDISLAND/' + country + '/' + qd['cpresence'] +
          '/' + qd['vhabitat'] + '/')
    if not os.path.exists(fn):
        os.makedirs(fn)
    picfn = ('../PICS/COUNTRYANDISLAND/' + country + '/' + qd['cpresence'] +
             '/' + qd['vhabitat'] + '/')
    if not os.path.exists(picfn):
        os.makedirs(picfn)
    thumbfn = ('../THUMBNAILS/COUNTRYANDISLAND/' + country + '/' +
               qd['cpresence'] + '/' + qd['vhabitat'] + '/')
    if not os.path.exists(thumbfn):
        os.makedirs(thumbfn)
    # load previously collected fishes unless a new database was requested
    if bool(newdb):
        fishes = {}
    else:
        fishes_dir = fn + "fishes.dict"
        print(fishes_dir)
        if os.path.exists(fishes_dir):
            with open(fishes_dir, 'r+') as fd:
                fishes = json.loads(fd.read())
        else:
            fishes = {}
    print('--------------------')
    from xdict.jprint import paint_str
    print(paint_str("===============fishes loaded======================",
                    single_color='yellow'))
    print(fishes.keys())
    print('----------------')
    # build (or reuse) one record per fish row
    for i in range(0, len(all_country_eles)):
        fish_ele = all_country_eles[i]
        nfish = get_fish_info(fishes, fish_ele, fish, info_container,
                              records_container)
        print(paint_str("===============nfish loaded======================",
                        single_color='green'))
        if nfish:
            print("====handle new nfish====")
            nfish['eles-seq'] = i
            nfish['images-dir'] = picfn
            nfish['info-dir'] = fn
            fishes[nfish['Species']['name']] = nfish
            nfdir = fn + nfish['Species']['name'] + '/'
            if not os.path.exists(nfdir):
                os.makedirs(nfdir)
            nffn = nfdir + 'fish.dict'
            infofn = nfdir + 'fish.info'
            nvft.write_to_file(fn=nffn, content=json.dumps(nfish), op='w+')
            info = get_printed_str(nfish, with_color=0, display=display)
            nvft.write_to_file(fn=infofn, content=info, op='w+')
        else:
            print("===skip existing fish===")
    print(paint_str("===============all nfish loaded======================",
                    single_color='yellow'))
    print(fishes.keys())
    print(len(fishes))
    # write the aggregated fishes.dict / fishes.lines files once
    dfn = fn + 'fishes.dict'
    if not os.path.exists(dfn):
        nvft.write_to_file(fn=dfn, content=json.dumps(fishes), op='w+')
    ldfn = fn + 'fishes.lines'
    if not os.path.exists(ldfn):
        nvft.write_to_file(fn=ldfn, content='', op='w+')
        for key in fishes:
            nfish = fishes[key]
            nvft.write_to_file(fn=ldfn,
                               content=get_printed_str(nfish,
                                                       with_color=0,
                                                       display=display),
                               op='a+')
            nvft.write_to_file(fn=ldfn, content='\n', op='a+')
    print("-----get all_photos ready----")
    # gather all photos, either from a cached pics.array or from the fish records
    apafn = fn + 'pics.array'
    if os.path.exists(apafn):
        with open(apafn, 'r+') as fd:
            all_photos = json.loads(fd.read())
    else:
        all_photos = []
        for name in fishes:
            for photo in fishes[name]['All-Photos']:
                all_photos.append(photo)
    print("all_photos collected")
    print(len(all_photos))
    # create one sub-directory per photo type for images and thumbnails
    types = []
    for each in all_photos:
        photo_type = each['type']
        if (photo_type not in types) and (photo_type is not None):
            types.append(photo_type)
    for photo_type in types:
        typefn = picfn + photo_type
        if not os.path.exists(typefn):
            os.makedirs(typefn)
        typefn = thumbfn + photo_type
        if not os.path.exists(typefn):
            os.makedirs(typefn)
    # record the local image/thumbnail paths on each photo entry
    for each in all_photos:
        if each['type'] is None:
            each['img-dir'] = None
            each['thumb-dir'] = None
        else:
            each['img-dir'] = picfn + each['type'] + '/' + each['img-name']
            each['thumb-dir'] = thumbfn + each['type'] + '/' + each['img-name']
    apafn = fn + 'pics.array'
    if not os.path.exists(apafn):
        nvft.write_to_file(fn=apafn, content=json.dumps(all_photos), op='w+')
    lapafn = fn + 'pics.lines'
    if not os.path.exists(lapafn):
        nvft.write_to_file(fn=lapafn, content='', op='w+')
        for each in all_photos:
            nvft.write_to_file(fn=lapafn,
                               content=get_printed_str(each,
                                                       with_color=0,
                                                       display=display),
                               op='a+')
            nvft.write_to_file(fn=lapafn, content='\n', op='a+')
    print("pics.lines and pics.array ready")
    # image-name <-> image-path lookup tables
    imagename_dir_dict = {}
    dir_imagename_dict = {}
    for each in all_photos:
        if each['type'] is not None:
            imagename = each['img-name']
            path = each['img-dir']
        else:
            imagename = None
            path = None
        imagename_dir_dict[imagename] = path
        dir_imagename_dict[path] = imagename
    iddfn = fn + 'image_dir.dict'
    didfn = fn + 'dir_image.dict'
    if not os.path.exists(iddfn):
        nvft.write_to_file(fn=iddfn,
                           content=json.dumps(imagename_dir_dict),
                           op='w+')
    liddfn = fn + 'image_dir.lines'
    if not os.path.exists(liddfn):
        nvft.write_to_file(fn=liddfn, content='', op='w+')
        for each in imagename_dir_dict:
            nvft.write_to_file(fn=liddfn,
                               content=get_printed_str(each,
                                                       with_color=0,
                                                       display=display),
                               op='a+')
            nvft.write_to_file(fn=liddfn, content='\n', op='a+')
    if not os.path.exists(didfn):
        nvft.write_to_file(fn=didfn,
                           content=json.dumps(dir_imagename_dict),
                           op='w+')
    ldidfn = fn + 'dir_image.lines'
    if not os.path.exists(ldidfn):
        nvft.write_to_file(fn=ldidfn,
                           content=get_printed_str(dir_imagename_dict,
                                                   with_color=0,
                                                   display=display),
                           op='w+')
    print("==dir_image.dict and dir_image.lines written==")
    # thumbnail-name <-> thumbnail-path lookup tables
    thumb_dir_dict = {}
    dir_thumb_dict = {}
    for each in all_photos:
        if each['type'] is not None:
            imagename = each['img-name']
            path = each['thumb-dir']
        else:
            imagename = None
            path = None
        thumb_dir_dict[imagename] = path
        dir_thumb_dict[path] = imagename
    iddfn = fn + 'thumb_dir.dict'
    didfn = fn + 'dir_thumb.dict'
    if not os.path.exists(iddfn):
        nvft.write_to_file(fn=iddfn,
                           content=json.dumps(thumb_dir_dict),
                           op='w+')
    liddfn = fn + 'thumb_dir.lines'
    if not os.path.exists(liddfn):
        nvft.write_to_file(fn=liddfn, content='', op='w+')
        for each in thumb_dir_dict:
            nvft.write_to_file(fn=liddfn,
                               content=get_printed_str(each,
                                                       with_color=0,
                                                       display=display),
                               op='a+')
            nvft.write_to_file(fn=liddfn, content='\n', op='a+')
    if not os.path.exists(didfn):
        nvft.write_to_file(fn=didfn,
                           content=json.dumps(dir_thumb_dict),
                           op='w+')
    ldidfn = fn + 'dir_thumb.lines'
    if not os.path.exists(ldidfn):
        nvft.write_to_file(fn=ldidfn,
                           content=get_printed_str(dir_thumb_dict,
                                                   with_color=0,
                                                   display=display),
                           op='w+')
    print("===dir_thumb.lines and thumb_dir.dict written===")
    print("begin downloading images")
    # download each image and thumbnail that is not already on disk
    for each in all_photos:
        if each['type'] is not None:
            imagename = each['img-name']
            img_dir = each['img-dir']
            img_url = each['img-url']
            thumb_dir = each['thumb-dir']
            thumb_url = each['thumbnail-url']
            if os.path.exists(img_dir):
                print(paint_str("pass_by_pic", single_color="red"))
            else:
                info_container['url'] = img_url
                info_container = nvsoli.walkon(
                    info_container, records_container=records_container)
                info_container = nvsoli.auto_redireced(info_container,
                                                       records_container)
                nvft.write_to_file(fn=img_dir,
                                   content=info_container['resp_body_bytes'],
                                   op='wb+')
                print("downloaded one pic")
            if os.path.exists(thumb_dir):
                print(paint_str("pass_by_thumb", single_color="red"))
            else:
                info_container['url'] = thumb_url
                info_container = nvsoli.walkon(
                    info_container, records_container=records_container)
                info_container = nvsoli.auto_redireced(info_container,
                                                       records_container)
                nvft.write_to_file(fn=thumb_dir,
                                   content=info_container['resp_body_bytes'],
                                   op='wb+')
                print("downloaded one thumb")
        else:
            print("---external pics not downloaded in this version, skip---")
    return (info_container, records_container)
def get_img_info(img_url,
                 thumbnail_url,
                 country_abbrev,
                 location,
                 base_url=nudipix_base_url):
    '''Fetch one photo page and return a dict of its metadata: camera/EXIF,
    dates, taxonomy (KPCOFGS), real image URL, and local file names.'''
    info_container, records_container = taxonomy_init()
    info_container['url'] = img_url
    # retry forever, backing off 10 s per failed attempt (capped at 300 s)
    sleep_cnt = 0
    while True:
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            time.sleep(10 * sleep_cnt)
        else:
            break
    img_root = get_etree_root(info_container)
    tbodys = img_root.xpath('//table')
    sp = img_root.xpath('//div/div/h2/a')[0].attrib['href'].rstrip('/')
    sp_name = os.path.basename(sp)
    # the first table holds the photo attributes as key/value rows
    info_raw = tbodys[0].getchildren()
    info = {}
    for row in info_raw:
        key = row.xpath('td')[0].text.rstrip(':')
        if key == 'Camera':
            info[key] = row.xpath('td')[1].text
            EXIF_url = nudipix_base_url + row.xpath('td/span/a')[0].attrib['href']
            info['EXIF'] = get_EXIF(EXIF_url)
        elif key in ('Taken on', 'Viewed', 'Posted', 'Updated'):
            info[key] = row.xpath('td')[1].text
        else:
            info[key] = row.xpath('td/a')[0].text
    info['kpcofgs'] = get_KPCOFGS(tbodys, rsltin='dict')
    img_real_url = nudipix_base_url + img_root.xpath('//div/img')[0].attrib['src']
    try:
        img_verifier = img_root.xpath('//div/img')[1].attrib['title']
    except Exception:
        img_verifier = ''
    # local files are named after the species plus a sha1 of the real image URL
    sha1 = hashlib.sha1(img_real_url.encode('utf-8')).hexdigest()
    img_suffix = os.path.basename(img_real_url).split('.')[-1]
    img_name = sp_name + '_' + sha1 + '.' + img_suffix
    thumbnail_suffix = os.path.basename(thumbnail_url).split('.')[-1]
    thumbnail_name = sp_name + '_' + sha1 + '.thumbnail.' + thumbnail_suffix
    info_name = sp_name + '_' + sha1 + '.dict'
    info['img_url'] = img_real_url
    info['verifier'] = img_verifier
    info['img_name'] = images_dir + img_name
    info['index'] = sha1
    info['thumbnail_url'] = thumbnail_url
    info['thumbnail_name'] = thumbs_dir + thumbnail_name
    info['info_name'] = infos_dir + info_name
    info['country'] = country_abbrev
    info['location'] = location
    # some image file names are not numeric; fall back to -1 for the sequence
    try:
        info['seq'] = int(os.path.basename(img_real_url).split('.')[0])
    except Exception:
        info['seq'] = -1
    return info
def get_img_urls(locsp_url,
                 img_xpath='//div[@class="thumbnail"]/div/a[@href]'):
    '''Return (img_urls, thumbnail_urls) for a location/species page,
    following its pagination bar.'''
    info_container, records_container = taxonomy_init()
    info_container['url'] = locsp_url
    # retry forever, backing off 10 s per failed attempt (capped at 300 s)
    sleep_cnt = 0
    while True:
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            time.sleep(10 * sleep_cnt)
        else:
            break
    root = get_etree_root(info_container)
    eles = root.xpath(img_xpath)
    img_urls = []
    thumbnail_urls = []
    for ele in eles:
        url = nudipix_base_url + ele.attrib['href']
        if ('photo' in url) and ('photographer' not in url):
            img_urls.append(url)
            thumbnail_urls.append(nudipix_base_url +
                                  ele.xpath('img')[0].attrib['src'])
    # follow the pagination bar, if any
    nav_xpath = '//p[@class="nav"]/a[@href]'
    eles = root.xpath(nav_xpath)
    if len(eles) != 0:
        max_page = int(eles[-2].text)
        tem = os.path.dirname(eles[-2].attrib['href'].rstrip('/'))
        for i in range(2, max_page + 1):
            nav_url = nudipix_base_url + tem + '/' + str(i)
            info_container, records_container = taxonomy_init()
            info_container['url'] = nav_url
            sleep_cnt = 0
            while True:
                sleep_cnt = min(sleep_cnt + 1, 30)
                try:
                    info_container = nvsoli.walkon(
                        info_container, records_container=records_container)
                    info_container = nvsoli.auto_redireced(
                        info_container, records_container)
                except Exception:
                    time.sleep(10 * sleep_cnt)
                else:
                    break
            root = get_etree_root(info_container)
            eles = root.xpath(img_xpath)
            for ele in eles:
                url = nudipix_base_url + ele.attrib['href']
                if ('photo' in url) and ('photographer' not in url):
                    img_urls.append(url)
                    thumbnail_urls.append(nudipix_base_url +
                                          ele.xpath('img')[0].attrib['src'])
    return (img_urls, thumbnail_urls)
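# Pipeline sketch (editor's addition, not in the original module): how the
# nudipixel helpers above chain together. 'locations_url' is a hypothetical
# placeholder for the site's location index page, and the country name is
# passed as country_abbrev purely for illustration.
def _example_crawl_one_country(locations_url):
    '''Walk the first listed country: locations -> nav pages -> location/species
    pages -> photo pages, yielding the info dict built by get_img_info() for
    each photo.'''
    country_urls, country_names = get_country_urls(locations_url)
    country_url, country_name = country_urls[0], country_names[0]
    location_urls, location_names = get_location_urls(country_url)
    for loc_url, loc_name in zip(location_urls, location_names):
        for nav_url in get_nav_urls(loc_url):
            for locsp_url in get_locsp_urls(nav_url):
                img_urls, thumbnail_urls = get_img_urls(locsp_url)
                for img_url, thumb_url in zip(img_urls, thumbnail_urls):
                    yield get_img_info(img_url, thumb_url, country_name,
                                       loc_name)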
                       op='w+')
    info_container, records_container = taxonomy_init()
    info_container['url'] = info['img_url']
    sleep_cnt = 0
    while True:
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            time.sleep(10 * sleep_cnt)
        else:
            break
    nvft.write_to_file(fn=info['img_name'],