Пример #1
0
def login(base=BASE,passwd=PASSWD):
    """Log in to *base* with *passwd* and return the final cookie string."""
    # Warm-up pass: one anonymous request to establish server-side state.
    container, records = nvsoli.keepalive_init(base)
    container['url'] = base
    container = nvsoli.walkon(container, records_container=records)
    nvsoli.shutdown(container)
    # Authenticated pass: resend with the authorization cookie attached.
    container, records = nvsoli.keepalive_init(base)
    cookie = get_authorization_ckstr(passwd)
    container['url'] = base
    container['req_head']['Referer'] = base
    container['req_head']['Cookie'] = cookie
    container = nvsoli.walkon(container, records_container=records)
    # Fold the server-issued sub-tag into the cookie string.
    cookie = drone.append(cookie, get_subtag(container))
    nvsoli.shutdown(container)
    return cookie
Пример #2
0
def get_EXIF(EXIF_url):
    """Fetch the EXIF table at *EXIF_url* and return it as a dict.

    Retries the request forever on failure, backing off by
    ``10 * attempt`` seconds, capped at 300 seconds per wait.

    :param EXIF_url: absolute URL of the EXIF page to scrape
    :return: dict mapping EXIF field name (trailing ':' stripped) to value
    """
    info_container, records_container = taxonomy_init()
    info_container['url'] = EXIF_url
    sleep_cnt = 0
    while True:
        # Cap the back-off multiplier so a single wait never exceeds 300 s.
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            # Catch Exception (not a bare except) so KeyboardInterrupt
            # and SystemExit still abort the retry loop.
            time.sleep(10 * sleep_cnt)
        else:
            break
    root = get_etree_root(info_container)
    rows = root.xpath('//table[@class="exif"]/tr')
    # Each row is <td>Key:</td><td>Value</td>.
    return {row.xpath('td')[0].text.rstrip(':'): row.xpath('td')[1].text
            for row in rows}
Пример #3
0
def get_country_urls(locs_url,
                     countries_xpath='//div[@id="content"]/div/div/a[@href]'):
    """Scrape *locs_url* and return ``(country_urls, country_names)``.

    Retries the request forever on failure with a back-off of
    ``10 * attempt`` seconds, capped at 300 seconds per wait.

    :param locs_url: URL of the locations index page
    :param countries_xpath: XPath selecting the per-country anchor elements
    :return: tuple of two parallel lists (absolute URLs, link texts)
    """
    info_container, records_container = taxonomy_init()
    info_container['url'] = locs_url
    sleep_cnt = 0
    while True:
        # Cap the back-off multiplier at 30 (max 300 s per wait).
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            # Exception (not bare except) keeps Ctrl-C responsive.
            time.sleep(10 * sleep_cnt)
        else:
            break
    root = get_etree_root(info_container)
    eles = root.xpath(countries_xpath)
    country_urls = [nudipix_base_url + ele.attrib['href'] for ele in eles]
    country_names = [ele.text for ele in eles]
    return (country_urls, country_names)
Пример #4
0
def get_page(ckstr,url,base=BASE):
    """GET *url* with cookie *ckstr* (Referer = *base*); return the container."""
    container, records = nvsoli.keepalive_init(base)
    container['url'] = url
    container['req_head']['Cookie'] = ckstr
    container['req_head']['Referer'] = base
    container = nvsoli.walkon(container, records_container=records)
    nvsoli.shutdown(container)
    return container
Пример #5
0
def req(url,ckstr,base=BASE,**kwargs):
    """Request *url* with cookie *ckstr*; optional ``referer`` keyword."""
    container, records = nvsoli.keepalive_init(base)
    container['url'] = url
    container['req_head']['Cookie'] = ckstr
    # Fall back to the base URL when no explicit referer is supplied.
    container['req_head']['Referer'] = kwargs.get('referer', base)
    container = nvsoli.walkon(container, records_container=records)
    nvsoli.shutdown(container)
    return container
Пример #6
0
def fishbase_init(base_url='http://www.fishbase.us/'):
    """Open a fishbase session; return ``(info_container, records_container)``."""
    info_container = nvsoli.new_info_container()
    info_container.update({
        'base_url': base_url,
        'url': base_url,
        'method': 'GET',
    })
    # Browser-like request headers, parsed from a CRLF-separated blob.
    req_head_str = '''Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\r\nUser-Agent: Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36\r\nAccept-Encoding: gzip,deflate,sdch\r\nAccept-Language: en;q=1.0, zh-CN;q=0.8'''
    headers = nvhead.build_headers_dict_from_str(req_head_str, '\r\n')
    headers['Connection'] = 'close'
    info_container['req_head'] = headers
    records_container = nvsoli.new_records_container()
    # Issue the initial request (following redirects) to prime the session.
    info_container = nvsoli.walkon(info_container,
                                   records_container=records_container)
    info_container = nvsoli.auto_redireced(info_container, records_container)
    return (info_container, records_container)
Пример #7
0
def search_via_country(c_code, info_container, records_container):
    """POST a country search for *c_code*; return the parsed HTML root."""
    post_url = info_container['base_url'] + 'country/CountrySearchList.php'
    headers = info_container['req_head']
    headers['Referer'] = info_container['base_url']
    headers['Upgrade-Insecure-Requests'] = 1
    headers['Content-Type'] = 'application/x-www-form-urlencoded'
    info_container['url'] = post_url
    info_container['method'] = 'POST'
    info_container['req_body'] = gen_CI_post_body(Country=c_code)
    info_container = nvsoli.walkon(info_container,
                                   records_container=records_container)
    # Redirects after the POST are followed with GET.
    info_container['method'] = 'GET'
    info_container = nvsoli.auto_redireced(info_container, records_container)
    html_text = info_container['resp_body_bytes'].decode('utf-8')
    return etree.HTML(html_text)
Пример #8
0
def get_nav_urls(loc_url, nav_xpath='//p[@class="nav"]/a[@href]'):
    """Return the pagination URLs for the listing page *loc_url*.

    The first entry is *loc_url* itself; the rest are built from the
    "last page" navigation link.  An empty list means no pagination was
    found.  Retries forever on network errors with a back-off of
    ``10 * attempt`` seconds, capped at 300 seconds per wait.

    :param loc_url: URL of the first listing page
    :param nav_xpath: XPath selecting the pagination anchors
    :return: list of page URLs (possibly empty)
    """
    info_container, records_container = taxonomy_init()
    info_container['url'] = loc_url
    sleep_cnt = 0
    while True:
        # Cap the back-off multiplier at 30 (max 300 s per wait).
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            # Exception (not bare except) keeps Ctrl-C responsive.
            time.sleep(10 * sleep_cnt)
        else:
            break
    root = get_etree_root(info_container)
    eles = root.xpath(nav_xpath)
    if not eles:
        return []
    # eles[-2] is the "last page" link; its text is the max page number.
    max_page = int(eles[-2].text)
    stem = os.path.dirname(eles[-2].attrib['href'].rstrip('/'))
    nav_urls = [loc_url]
    for page in range(2, max_page + 1):
        nav_urls.append(nudipix_base_url + stem + '/' + str(page))
    return nav_urls
Пример #9
0
def get_locsp_urls(nav_url,
                   locsp_xpah='//div[@class="thumbnail"]/div/a[@href]'):
    """Return location/species URLs found on the listing page *nav_url*.

    Only hrefs whose absolute URL contains ``'location'`` are kept.
    Retries forever on network errors with a back-off of
    ``10 * attempt`` seconds, capped at 300 seconds per wait.

    :param nav_url: URL of the listing page to scrape
    :param locsp_xpah: XPath selecting candidate anchors
        (parameter name kept as-is for backward compatibility)
    :return: list of absolute location URLs
    """
    info_container, records_container = taxonomy_init()
    info_container['url'] = nav_url
    sleep_cnt = 0
    while True:
        # Cap the back-off multiplier at 30 (max 300 s per wait).
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            # Exception (not bare except) keeps Ctrl-C responsive.
            time.sleep(10 * sleep_cnt)
        else:
            break
    root = get_etree_root(info_container)
    locsp_urls = []
    for ele in root.xpath(locsp_xpah):
        url = nudipix_base_url + ele.attrib['href']
        if 'location' in url:
            locsp_urls.append(url)
    return locsp_urls
Пример #10
0
def get_location_urls(
        country_url,
        locations_xpath='//ul[@class="country_dive_site_list"]/li/a[@href]'):
    """Scrape *country_url* and return ``(location_urls, location_names)``.

    Only links whose absolute URL contains ``'location'`` are kept, so
    the two returned lists stay parallel.  Retries forever on network
    errors with a back-off of ``10 * attempt`` seconds, capped at 300
    seconds per wait.

    :param country_url: URL of a country page listing dive sites
    :param locations_xpath: XPath selecting the location anchors
    :return: tuple of two parallel lists (absolute URLs, link texts)
    """
    info_container, records_container = taxonomy_init()
    info_container['url'] = country_url
    sleep_cnt = 0
    while True:
        # Cap the back-off multiplier at 30 (max 300 s per wait).
        sleep_cnt = min(sleep_cnt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            # Exception (not bare except) keeps Ctrl-C responsive.
            time.sleep(10 * sleep_cnt)
        else:
            break
    root = get_etree_root(info_container)
    location_urls = []
    location_names = []
    for ele in root.xpath(locations_xpath):
        url = nudipix_base_url + ele.attrib['href']
        if 'location' in url:
            location_urls.append(url)
            location_names.append(ele.text)
    return ((location_urls, location_names))
Пример #11
0
def get_country_infos(c_code, country_island_dict, info_container,
                      records_container, **kwargs):
    """Scrape every fish record for country *c_code* and mirror it to disk.

    Builds ../INFOS, ../PICS and ../THUMBNAILS directory trees keyed by
    country / cpresence / vhabitat, writes per-fish and aggregate dict /
    lines files, then downloads every image and thumbnail not already on
    disk.  Existing output files are never overwritten.

    :param c_code: numeric country code used by the search form
    :param country_island_dict: maps str(c_code) -> country/island name
    :param info_container: session container (mutated and returned)
    :param records_container: request-history container
    :keyword display: int verbosity flag passed to get_printed_str (default 0)
    :keyword new_database: truthy -> ignore any cached fishes.dict (default 0)
    :return: ``(info_container, records_container)``
    """
    # -- parse optional kwargs -------------------------------------------
    if ('display' in kwargs):
        display = int(kwargs['display'])
    else:
        display = 0
    if ('new_database' in kwargs):
        newdb = kwargs['new_database']
    else:
        newdb = 0
    ####
    #os.system('date')
    ####
    # -- run the country search and fetch the result listing -------------
    root = search_via_country(c_code, info_container, records_container)
    tables = get_all_tables(root, info_container['base_url'])
    qurl = get_query_url(info_container, kwargs)
    info_container['url'] = qurl
    info_container['method'] = 'GET'
    info_container['req_body'] = None
    info_container = nvsoli.walkon(info_container,
                                   records_container=records_container)
    html_text = info_container['resp_body_bytes'].decode('utf-8')
    root = etree.HTML(html_text)
    eles = root.xpath('//thead/tr/th')
    ####
    ####
    # Template dict: one key per result-table column header.
    fish = {}
    for i in range(0, eles.__len__()):
        fish[eles[i].text] = None
    url_dict = nvurl.url_to_dict(info_container['url'])
    qd = nvurl.urldecode(url_dict['query'])
    all_country_eles = root.xpath("//tr[@class='t_value1']")
    country = country_island_dict[str(c_code)]
    # -- create the three output directory trees (info/pics/thumbs) ------
    fn = '../INFOS/' + 'COUNTRYANDISLAND/' + country + '/' + qd[
        'cpresence'] + '/' + qd['vhabitat'] + '/'
    if (os.path.exists(fn)):
        pass
    else:
        os.makedirs(fn)
    picfn = '../PICS/' + 'COUNTRYANDISLAND/' + country + '/' + qd[
        'cpresence'] + '/' + qd['vhabitat'] + '/'
    if (os.path.exists(picfn)):
        pass
    else:
        os.makedirs(picfn)
    thumbfn = '../THUMBNAILS/' + 'COUNTRYANDISLAND/' + country + '/' + qd[
        'cpresence'] + '/' + qd['vhabitat'] + '/'
    if (os.path.exists(thumbfn)):
        pass
    else:
        os.makedirs(thumbfn)
    ####

    ####
    ####
    # -- load (or reset) the cached fishes dictionary ---------------------
    if (bool(newdb)):
        fishes = {}
    else:
        fishes_dir = fn + "fishes.dict"
        print(fishes_dir)
        if (os.path.exists(fishes_dir)):
            fd = open(fishes_dir, 'r+')
            fishes_text = fd.read()
            fishes = json.loads(fishes_text)
            fd.close()
        else:
            fishes = {}
    ####
    print('--------------------')
    from xdict.jprint import paint_str
    print(
        paint_str(
            "===============fishes loads completed======================",
            single_color='yellow'))
    print(fishes.keys())
    print('----------------')
    #os.system('date')
    ####
    # -- scrape each result row; get_fish_info returns None for cached fish
    for i in range(0, all_country_eles.__len__()):
        fish_ele = all_country_eles[i]
        nfish = get_fish_info(fishes, fish_ele, fish, info_container,
                              records_container)
        ######
        print(
            paint_str(
                "===============nfish load completed======================",
                single_color='green'))
        ######
        if (nfish):
            ####
            print("====handle new nfish========")
            ####
            # Record provenance and write the per-fish dict / info files.
            nfish['eles-seq'] = i
            nfish['images-dir'] = picfn
            nfish['info-dir'] = fn
            fishes[nfish['Species']['name']] = nfish
            nfdir = fn + nfish['Species']['name'] + '/'
            if (os.path.exists(nfdir)):
                pass
            else:
                os.makedirs(nfdir)
            nffn = nfdir + 'fish.dict'
            infofn = nfdir + 'fish.info'
            nvft.write_to_file(fn=nffn, content=json.dumps(nfish), op='w+')
            info = get_printed_str(nfish, with_color=0, display=display)
            nvft.write_to_file(fn=infofn, content=info, op='w+')

        else:
            ####
            print("===bypass existed fish====")
            ####
            pass
    #---------------------------------------#
    ####
    print(
        paint_str(
            "===============all  nfish es load completed======================",
            single_color='yellow'))
    print(fishes.keys())
    print(fishes.keys().__len__())
    ####
    # -- persist the aggregate fishes.dict / fishes.lines (if absent) -----
    dfn = fn + 'fishes.dict'
    if (os.path.exists(dfn)):
        pass
    else:
        nvft.write_to_file(fn=dfn, content=json.dumps(fishes), op='w+')
    ldfn = fn + 'fishes.lines'
    if (os.path.exists(ldfn)):
        pass
    else:
        nvft.write_to_file(fn=ldfn, content='', op='w+')
        for key in fishes:
            nfish = fishes[key]
            nvft.write_to_file(fn=ldfn,
                               content=get_printed_str(nfish,
                                                       with_color=0,
                                                       display=display),
                               op='a+')
            nvft.write_to_file(fn=ldfn, content='\n', op='a+')
    #---------------------------------------#
    ####
    print("-----get all_photos ready----")
    ####
    # -- collect every photo record (from cache file, or from the fishes) --
    apafn = fn + 'pics.array'
    if (os.path.exists(apafn)):
        fd = open(apafn, 'r+')
        apa_text = fd.read()
        all_photos = json.loads(apa_text)
        fd.close()
    else:
        all_photos = []
        for name in fishes:
            #all_photos = all_photos + copy.deepcopy(fishes[name]['All-Photos'])
            #all_photos = all_photos + copy.deepcopy(fishes[name]['All-Photos'])
            for photo in fishes[name]['All-Photos']:
                all_photos.append(photo)
    ####
    print("all_photos gotted")
    print(all_photos.__len__())
    ####
    # -- build one sub-directory per photo type under pics and thumbs -----
    types = []
    for each in all_photos:
        type = each['type']
        if (type in types):
            pass
        else:
            if (type == None):
                pass
            else:
                types.append(type)
    for type in types:
        typefn = picfn + type
        if (os.path.exists(typefn)):
            pass
        else:
            os.makedirs(typefn)
        typefn = thumbfn + type
        if (os.path.exists(typefn)):
            pass
        else:
            os.makedirs(typefn)
    # -- assign on-disk paths to each photo record -------------------------
    for each in all_photos:
        if (each['type'] == None):
            each['img-dir'] = None
            each['thumb-dir'] = None
        else:
            img_dir = picfn + each['type'] + '/' + each['img-name']
            each['img-dir'] = img_dir
            thumb_dir = thumbfn + each['type'] + '/' + each['img-name']
            each['thumb-dir'] = thumb_dir
    # -- persist pics.array / pics.lines (if absent) -----------------------
    apafn = fn + 'pics.array'
    if (os.path.exists(apafn)):
        pass
    else:
        nvft.write_to_file(fn=apafn, content=json.dumps(all_photos), op='w+')
    lapafn = fn + 'pics.lines'
    if (os.path.exists(lapafn)):
        pass
    else:
        nvft.write_to_file(fn=lapafn, content='', op='w+')
        for each in all_photos:
            nvft.write_to_file(fn=lapafn,
                               content=get_printed_str(each,
                                                       with_color=0,
                                                       display=display),
                               op='a+')
            nvft.write_to_file(fn=lapafn, content='\n', op='a+')
    ############################
    print("pics.lines and pics.array ready")
    ############################
    # -- image-name <-> image-path lookup tables ---------------------------
    imagename_dir_dict = {}
    dir_imagename_dict = {}
    for each in all_photos:
        if (each['type'] != None):
            imagename = each['img-name']
            dir = each['img-dir']
        else:
            imagename = None
            dir = None
        imagename_dir_dict[imagename] = dir
        dir_imagename_dict[dir] = imagename
    iddfn = fn + 'image_dir.dict'
    didfn = fn + 'dir_image.dict'
    if (os.path.exists(iddfn)):
        pass
    else:
        nvft.write_to_file(fn=iddfn,
                           content=json.dumps(imagename_dir_dict),
                           op='w+')
    liddfn = fn + 'image_dir.lines'
    if (os.path.exists(liddfn)):
        pass
    else:
        nvft.write_to_file(fn=liddfn, content='', op='w+')
        for each in imagename_dir_dict:
            nvft.write_to_file(fn=liddfn,
                               content=get_printed_str(each,
                                                       with_color=0,
                                                       display=display),
                               op='a+')
            nvft.write_to_file(fn=liddfn, content='\n', op='a+')
    if (os.path.exists(didfn)):
        pass
    else:
        nvft.write_to_file(fn=didfn,
                           content=json.dumps(dir_imagename_dict),
                           op='w+')
    ldidfn = fn + 'dir_image.lines'
    if (os.path.exists(ldidfn)):
        pass
    else:
        nvft.write_to_file(fn=ldidfn,
                           content=get_printed_str(dir_imagename_dict,
                                                   with_color=0,
                                                   display=display),
                           op='w+')
    ###############
    print("==dir_image.dict and dir_image.lines gotted==")
    ##############
    # -- image-name <-> thumbnail-path lookup tables -----------------------
    thumb_dir_dict = {}
    dir_thumb_dict = {}
    for each in all_photos:
        if (each['type'] != None):
            imagename = each['img-name']
            dir = each['thumb-dir']
        else:
            imagename = None
            dir = None
        thumb_dir_dict[imagename] = dir
        dir_thumb_dict[dir] = imagename
    iddfn = fn + 'thumb_dir.dict'
    didfn = fn + 'dir_thumb.dict'
    if (os.path.exists(iddfn)):
        pass
    else:
        nvft.write_to_file(fn=iddfn,
                           content=json.dumps(thumb_dir_dict),
                           op='w+')
    liddfn = fn + 'thumb_dir.lines'
    if (os.path.exists(liddfn)):
        pass
    else:
        nvft.write_to_file(fn=liddfn, content='', op='w+')
        for each in thumb_dir_dict:
            nvft.write_to_file(fn=liddfn,
                               content=get_printed_str(each,
                                                       with_color=0,
                                                       display=display),
                               op='a+')
            nvft.write_to_file(fn=liddfn, content='\n', op='a+')
    if (os.path.exists(didfn)):
        pass
    else:
        nvft.write_to_file(fn=didfn,
                           content=json.dumps(dir_thumb_dict),
                           op='w+')
    ldidfn = fn + 'dir_thumb.lines'
    if (os.path.exists(ldidfn)):
        pass
    else:
        nvft.write_to_file(fn=ldidfn,
                           content=get_printed_str(dir_thumb_dict,
                                                   with_color=0,
                                                   display=display),
                           op='w+')
    ###############
    print("===dir_thumb.lines and thumb_dir.dict gotted===")
    ###############
    print("begin download images")
    ###############
    # -- download every image and thumbnail not already on disk ------------
    for each in all_photos:
        if (each['type'] != None):
            imagename = each['img-name']
            img_dir = each['img-dir']
            img_url = each['img-url']
            thumb_dir = each['thumb-dir']
            thumb_url = each['thumbnail-url']

            if (os.path.exists(img_dir)):
                ####
                print(paint_str("pass_by_pic", single_color="red"))
                ####
                pass
            else:
                info_container['url'] = img_url
                info_container = nvsoli.walkon(
                    info_container, records_container=records_container)
                info_container = nvsoli.auto_redireced(info_container,
                                                       records_container)
                nvft.write_to_file(fn=img_dir,
                                   content=info_container['resp_body_bytes'],
                                   op='wb+')
                ####
                print("downloaded one pic")
                ####
            if (os.path.exists(thumb_dir)):
                ####
                print(paint_str("pass_by_thumb", single_color="red"))
                ####
                pass
            else:
                info_container['url'] = thumb_url
                info_container = nvsoli.walkon(
                    info_container, records_container=records_container)
                info_container = nvsoli.auto_redireced(info_container,
                                                       records_container)
                nvft.write_to_file(fn=thumb_dir,
                                   content=info_container['resp_body_bytes'],
                                   op='wb+')
                ####
                print("downloaded one thumb")
                ####
        else:
            # type == None means the photo is hosted externally.
            print("---external pics not downloaded in this version,pass--")
            pass
    return ((info_container, records_container))
Пример #12
0
def get_fish_info(fishes, ele_fish, fish, info_container, records_container):
    """Parse one result row *ele_fish* into a fish record dict.

    Returns None when the species is already present in *fishes*
    (acts as a skip-cache).  Otherwise fills a deep copy of the
    *fish* template with taxonomy fields scraped from the row, then
    fetches the species photo page and collects every photo entry
    into ``new_fish['All-Photos']``.

    :param fishes: dict of already-scraped fish, keyed by species name
    :param ele_fish: lxml element for one <tr> result row
    :param fish: template dict whose keys mirror the table columns
    :param info_container: session container (mutated for the photo fetch)
    :param records_container: request-history container
    :return: the new fish dict, or None if already cached
    """
    new_fish = copy.deepcopy(fish)
    eles = ele_fish.getchildren()
    # Column 0: family; column 1: italicized species link.
    new_fish['Family'] = eles[0].text
    new_fish['Species'] = {}
    new_fish['Species']['name'] = eles[1].xpath("i/a")[0].text
    # Filesystem-safe species name used as the image filename prefix.
    nfs_name = new_fish['Species']['name'].replace(' ', '_')
    ####
    print(new_fish['Species']['name'])
    ####
    if (new_fish['Species']['name'] in fishes):
        ####
        print('----return None----')
        ####
        return (None)
    else:
        ####
        print('------continue new fish------')
        ####
        pass
    # Species id and absolute URL come from the link's query string.
    new_fish['Species']['id'] = int(
        nvurl.urldecode(eles[1].xpath("i/a")[0].get('href'))['id'])
    new_fish['Species']['url'] = info_container[
        'base_url'] + 'country/' + eles[1].xpath("i/a")[0].get('href')
    new_fish['Author'] = eles[2].text
    new_fish['Info'] = eles[3].text
    # Strip embedded CR/LF/TAB runs from the free-text info column.
    regex = re.compile('[\r\n\t]+')
    new_fish['Info'] = regex.sub('', new_fish['Info'])
    # Remaining columns carry \xa0 (nbsp) padding that must be stripped.
    new_fish['Occurrence'] = eles[4].text.strip('\xa0').strip(' ').strip(
        '\xa0')
    names = eles[5].text.split(',')
    new_fish['Common names'] = []
    for i in range(0, names.__len__()):
        new_fish['Common names'].append(
            names[i].strip('').strip('\xa0').strip(''))
    new_fish['Abundance'] = eles[6].text.strip('\xa0').strip(' ').strip('\xa0')
    new_fish['Max length'] = eles[7].text.strip('\xa0').strip(' ').strip(
        '\xa0')
    new_fish['Maturity'] = eles[8].text.strip('\xa0').strip(' ').strip('\xa0')
    new_fish['Remark'] = eles[9].text.strip('\xa0').strip(' ').strip('\xa0')
    # Column 10 (when present) links to the species' photo listing page.
    new_fish['Photo'] = {}
    if (eles.__len__() >= 10):
        temp = eles[10].xpath('a')
        if (temp.__len__() > 0):
            new_fish['Photo'][
                'url'] = info_container['base_url'] + temp[0].get('href')
        else:
            new_fish['Photo']['url'] = None
        #####
        new_fish['Presenting-Photo'] = {}
        try:
            new_fish['Presenting-Photo']['ID'] = int(
                nvurl.urldecode(eles[10].xpath('a')[0].get('href'))
                ['/photos/ThumbnailsSummary.php?ID'])
            new_fish['Presenting-Photo']['thumbnail-url'] = info_container[
                'base_url'].rstrip('/') + eles[10].xpath('a/img')[0].get('src')
        except:
            # Missing/odd markup: no presenting photo for this species.
            new_fish['Presenting-Photo']['ID'] = None
            new_fish['Presenting-Photo']['thumbnail-url'] = None
        else:
            pass
    else:
        new_fish['Photo']['url'] = None
        new_fish['Presenting-Photo'] = {}
        new_fish['Presenting-Photo']['ID'] = None
        new_fish['Presenting-Photo']['thumbnail-url'] = None
    # -- fetch the photo page (if any) and collect every photo entry ------
    new_fish['All-Photos'] = []
    info_container['url'] = new_fish['Photo']['url']
    if (info_container['url'] == None):
        pass
    else:
        info_container = nvsoli.walkon(info_container,
                                       records_container=records_container)
        html_text = info_container['resp_body_bytes'].decode('utf-8', 'ignore')
        root = etree.HTML(html_text)
        #eles = root.xpath("//tr[@class='t_value1']")
        #eles = root.xpath("//tr[@align]")
        eles = root.xpath("//td[(@align) and (@width)]")
        for i in range(0, eles.__len__()):
            ele = eles[i]
            #photo_eles = ele.xpath('td')
            photo_eles = [ele]
            for j in range(0, photo_eles.__len__()):
                photo = {}
                photo_ele = photo_eles[j]
                tooltip = photo_ele.xpath("a[@class='tooltip']")
                if (tooltip.__len__() > 0):
                    rel_sum_url = tooltip[0].get('href')
                    if (rel_sum_url.strip(' \t\r\n') == '#'):
                        # href '#' marks a directly-uploaded photo.
                        photo['summary-url'] = '#'
                    else:
                        if ('http' in rel_sum_url):
                            photo['summary-url'] = rel_sum_url.strip(
                                '.').strip('/').strip('.')
                        else:
                            photo['summary-url'] = info_container[
                                'base_url'] + 'photos/' + rel_sum_url.strip(
                                    '.').strip('/').strip('.')
                else:
                    rel_sum_url = None
                    photo['summary-url'] = None
                #regex = re.compile('what=(.*)')
                #m = regex.search(photo['summary-url'])
                if (photo['summary-url'] == None):
                    # No tooltip anchor at all: empty placeholder record.
                    photo['type'] = None
                    photo['external-url'] = None
                    photo['colaborator-url'] = None
                    photo['thumbnail-url'] = None
                    photo['img-url'] = None
                    photo['photographer'] = None
                elif (photo['summary-url'] == '#'):
                    # Direct upload: image/thumbnail live inside the tooltip.
                    photo['external-url'] = None
                    photo['colaborator-url'] = None
                    photo['thumbnail-url'] = info_container[
                        'base_url'] + 'photos/' + tooltip[0].xpath(
                            'img')[0].get('src')
                    photo['img-url'] = tooltip[0].xpath('span/img')[0].get(
                        'src')
                    photo['type'] = 'uploads'
                    text = itertext(tooltip[0].xpath('span')[0])
                    regex = re.compile('<.*>')
                    photo['photographer'] = regex.sub('',
                                                      text).strip('\r\n\t ')
                    # Derive the image filename and extension from the URL.
                    regex = re.compile('.*/(.*)')
                    m = regex.search(photo['img-url'])
                    img_name = nfs_name + '__' + m.group(1)
                    regex = re.compile('.*\.(.*)')
                    try:
                        img_type = regex.search(m.group(1)).group(1)
                    except:
                        # No extension: treat the photo as unusable.
                        img_type = None
                        img_name = None
                        photo['img-url'] = None
                        photo['thumbnail-url'] = None
                    else:
                        pass
                    photo['img-type'] = img_type
                    photo['img-name'] = img_name
                else:
                    if ('http' in rel_sum_url):
                        # Absolute link: photo hosted on an external site.
                        photo['external-url'] = photo['summary-url']
                        photo['type'] = None
                    else:
                        photo['external-url'] = None
                        if ('/Diseases/' in photo['summary-url']):
                            photo['type'] = 'Diseases'
                        else:
                            photo['type'] = nvurl.urldecode(
                                photo['summary-url'])['what']
                        if ('http' in photo_ele.xpath("a[not(@class)]")[0].get(
                                'href')):
                            photo['colaborator-url'] = photo_ele.xpath(
                                "a[not(@class)]")[0].get('href')
                        else:
                            photo['colaborator-url'] = info_container[
                                'base_url'].strip('/') + photo_ele.xpath(
                                    "a[not(@class)]")[0].get('href')
                        if ('http' in photo_ele.xpath('a/img')[0].get('src')):
                            photo['thumbnail-url'] = photo_ele.xpath(
                                'a/img')[0].get('src').strip('.')
                        else:
                            photo['thumbnail-url'] = info_container[
                                'base_url'].strip('/') + photo_ele.xpath(
                                    'a/img')[0].get('src').strip('.')
                        if ('http' in photo_ele.xpath('a/span/img')[0].get(
                                'src')):
                            photo['img-url'] = photo_ele.xpath(
                                'a/span/img')[0].get('src').strip('.')
                        else:
                            photo['img-url'] = info_container[
                                'base_url'].strip('/') + photo_ele.xpath(
                                    'a/span/img')[0].get('src').strip('.')
                        # Derive the image filename and extension from the URL.
                        regex = re.compile('.*/(.*)')
                        m = regex.search(photo['img-url'])
                        img_name = nfs_name + '__' + m.group(1)
                        regex = re.compile('.*\.(.*)')
                        try:
                            img_type = regex.search(m.group(1)).group(1)
                        except:
                            # No extension: treat the photo as unusable.
                            img_type = None
                            img_name = None
                            photo['img-url'] = None
                            photo['thumbnail-url'] = None
                        else:
                            pass
                        photo['img-type'] = img_type
                        photo['img-name'] = img_name
                        photo['photographer'] = itertext(
                            photo_ele.xpath('a/span')[0])
                        regex = re.compile('[\r\n\t]+')
                        photo['photographer'] = regex.sub(
                            '', photo['photographer'])
                new_fish['All-Photos'].append(photo)
    return (new_fish)
Пример #13
0
def get_img_info(img_url,
                 thumbnail_url,
                 country_abbrev,
                 location,
                 base_url=nudipix_base_url):
    """Scrape one photo detail page and return a metadata dict for the image.

    Parameters
    ----------
    img_url : str
        URL of the photo's detail page on the site.
    thumbnail_url : str
        URL of the photo's thumbnail image.
    country_abbrev : str
        Country abbreviation stored under ``info['country']``.
    location : str
        Location name stored under ``info['location']``.
    base_url : str
        Site base URL; kept for interface compatibility (the module-level
        ``nudipix_base_url`` is what is actually used below).

    Returns
    -------
    dict
        Info-table fields, ``'EXIF'`` (when a Camera row is present),
        taxonomy under ``'kpcofgs'``, real image URL, derived local file
        names, sha1 ``'index'``, country/location, and ``'seq'`` (numeric
        file-name prefix, or -1 when non-numeric).
    """
    info_container, records_container = taxonomy_init()
    info_container['url'] = img_url
    # Fetch with unbounded retries; back off 10s * attempt, capped at 300s.
    attempt = 0
    while True:
        attempt = min(attempt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            # Transient network/server failure: sleep and try again.
            time.sleep(10 * attempt)
        else:
            break
    img_root = get_etree_root(info_container)
    tbodys = img_root.xpath('//table')
    # Species name comes from the page heading link (last path component).
    sp = img_root.xpath('//div/div/h2/a')[0].attrib['href'].rstrip('/')
    sp_name = os.path.basename(sp)
    # First table on the page is the key/value info table.
    info = {}
    plain_text_keys = ('Taken on', 'Viewed', 'Posted', 'Updated')
    for row in tbodys[0].getchildren():
        key = row.xpath('td')[0].text.rstrip(':')
        if key == 'Camera':
            info[key] = row.xpath('td')[1].text
            # The camera row links to a dedicated EXIF page; scrape it too.
            EXIF_url = nudipix_base_url + row.xpath(
                'td/span/a')[0].attrib['href']
            info['EXIF'] = get_EXIF(EXIF_url)
        elif key in plain_text_keys:
            info[key] = row.xpath('td')[1].text
        else:
            # Remaining rows hold their value inside an anchor element.
            info[key] = row.xpath('td/a')[0].text
    info['kpcofgs'] = get_KPCOFGS(tbodys, rsltin='dict')
    # The actual image file served by the site.
    img_real_url = nudipix_base_url + img_root.xpath(
        '//div/img')[0].attrib['src']
    # A second <img>, when present, carries the verifier name in its title.
    try:
        img_verifier = img_root.xpath('//div/img')[1].attrib['title']
    except (IndexError, KeyError):
        img_verifier = ''
    # sha1 of the real URL makes local file names unique per image.
    sha1 = hashlib.sha1(img_real_url.encode('utf-8')).hexdigest()
    img_suffix = os.path.basename(img_real_url).split('.')[-1]
    img_name = sp_name + '_' + sha1 + '.' + img_suffix
    thumbnail_suffix = os.path.basename(thumbnail_url).split('.')[-1]
    thumbnail_name = sp_name + '_' + sha1 + '.thumbnail.' + thumbnail_suffix
    info_name = sp_name + '_' + sha1 + '.dict'
    info['img_url'] = img_real_url
    info['verifier'] = img_verifier
    info['img_name'] = images_dir + img_name
    info['index'] = sha1
    info['thumbnail_url'] = thumbnail_url
    info['thumbnail_name'] = thumbs_dir + thumbnail_name
    info['info_name'] = infos_dir + info_name
    info['country'] = country_abbrev
    info['location'] = location
    # Image file names on the site are usually a numeric sequence number;
    # -1 marks the non-numeric ones.
    try:
        info['seq'] = int(os.path.basename(img_real_url).split('.')[0])
    except ValueError:
        info['seq'] = -1
    return (info)
Пример #14
0
def _fetch_root_with_retry(url):
    """Fetch *url* via nvsoli with unbounded retries; return its etree root.

    Backs off 10s * attempt between failures, with the multiplier capped
    at 30 (i.e. at most 300s between attempts).
    """
    info_container, records_container = taxonomy_init()
    info_container['url'] = url
    attempt = 0
    while True:
        attempt = min(attempt + 1, 30)
        try:
            info_container = nvsoli.walkon(info_container,
                                           records_container=records_container)
            info_container = nvsoli.auto_redireced(info_container,
                                                   records_container)
        except Exception:
            # Transient network/server failure: sleep and try again.
            time.sleep(10 * attempt)
        else:
            break
    return get_etree_root(info_container)


def _collect_photo_urls(root, img_xpath):
    """Extract (img_urls, thumbnail_urls) for photo anchors under *img_xpath*.

    Keeps only hrefs containing 'photo' but not 'photographer'; the
    thumbnail is the <img> child of each kept anchor.
    """
    img_urls = []
    thumbnail_urls = []
    for anchor in root.xpath(img_xpath):
        url = nudipix_base_url + anchor.attrib['href']
        if 'photo' in url and 'photographer' not in url:
            img_urls.append(url)
            thumb = anchor.xpath('img')[0]
            thumbnail_urls.append(nudipix_base_url + thumb.attrib['src'])
    return img_urls, thumbnail_urls


def get_img_urls(locsp_url,
                 img_xpath='//div[@class="thumbnail"]/div/a[@href]'):
    """Collect all photo-page URLs and thumbnail URLs for a results page.

    Follows the pager links ('//p[@class="nav"]/a') so every additional
    result page is scraped as well.

    Parameters
    ----------
    locsp_url : str
        URL of the first results page.
    img_xpath : str
        XPath selecting the anchor elements that wrap each thumbnail.

    Returns
    -------
    tuple
        (img_urls, thumbnail_urls) — two index-aligned lists of str.
    """
    root = _fetch_root_with_retry(locsp_url)
    img_urls, thumbnail_urls = _collect_photo_urls(root, img_xpath)
    nav_eles = root.xpath('//p[@class="nav"]/a[@href]')
    if nav_eles:
        # The second-to-last pager link holds the highest page number.
        max_page = int(nav_eles[-2].text)
        base_path = os.path.dirname(nav_eles[-2].attrib['href'].rstrip('/'))
        for page in range(2, max_page + 1):
            page_url = nudipix_base_url + base_path + '/' + str(page)
            page_root = _fetch_root_with_retry(page_url)
            more_imgs, more_thumbs = _collect_photo_urls(page_root, img_xpath)
            img_urls.extend(more_imgs)
            thumbnail_urls.extend(more_thumbs)
    return ((img_urls, thumbnail_urls))
Пример #15
0
 # NOTE(review): fragment whose enclosing scope starts above this view;
 # `info` (image metadata dict) comes from that scope.
 # Persist the image's metadata dict as JSON.
 nvft.write_to_file(fn=info['info_name'],
                    content=json.dumps(info),
                    op='w+')
 # Fetch the image bytes themselves.
 info_container, records_container = taxonomy_init()
 info_container['url'] = info['img_url']
 ####
 # Retry forever on failure; back off 10s * count, count capped at 30
 # (so at most 300s between attempts).
 sleep_cnt = 0
 while (1):
     sleep_cnt = sleep_cnt + 1
     if (sleep_cnt > 30):
         sleep_cnt = 30
     else:
         pass
     try:
         info_container = nvsoli.walkon(
             info_container,
             records_container=records_container)
         info_container = nvsoli.auto_redireced(
             info_container, records_container)
     except:
         time.sleep(10 * sleep_cnt)
     else:
         break
 ####
 #sys.stdout.flush()
 #print(info['img_name'])
 #print(info['seq'])
 #print(info['index'])
 #print(info['img_url'])
 #print(info_container['resp_body_bytes'][:50])
 #sys.stdout.flush()
Пример #16
0
def get_species(root):
    """Scrape species links from *root* and mirror their photos locally.

    NOTE(review): this function may continue past the visible portion of
    the file; documentation below covers only what is shown here.
    """
    # Species anchors appear under several different inline-markup nestings.
    eles_sps = root.xpath(
        '//tr/td/span/a  | //tr/td/em/strong/a | //tr/td/a | //tr/td/strong/a | //tr/td/em/a | //tr/td/a | //tr/td/strong/em/a'
    )
    # Keep only anchors whose href contains '#', 'strombidae', or 'images'.
    new_eles_sps = []
    for i in range(0, eles_sps.__len__()):
        if (('#' in eles_sps[i].attrib['href']) |
            ('strombidae' in eles_sps[i].attrib['href']) |
            ('images' in eles_sps[i].attrib['href'])):
            new_eles_sps.append(eles_sps[i])
        else:
            pass
    # Drop the final match — presumably a trailing non-species link; verify.
    del new_eles_sps[-1]
    #####################
    # Common-name cells: the <td> following each species anchor's <td>.
    # NOTE(review): ele_cnames is not used in the visible code below.
    ele_cnames = []
    for i in range(0, new_eles_sps.__len__()):
        td_parent = new_eles_sps[i].getparent()
        while (td_parent.tag != 'td'):
            td_parent = td_parent.getparent()
        td_next = td_parent.getnext()
        ele_cnames.append(td_next)
    #####################
    # Absolute URLs for each species page.
    urls = []
    for i in range(0, new_eles_sps.__len__()):
        urls.append(ryan_base_url + new_eles_sps[i].attrib['href'])
    #####################
    # Directory-style names derived from the hrefs.
    # NOTE(review): dir_names is not used in the visible code below.
    dir_names = []
    for i in range(0, new_eles_sps.__len__()):
        dir_names.append(new_eles_sps[i].attrib['href'].replace('.htm',
                                                                '').replace(
                                                                    '#', ' '))
    ####################
    # Deduplicate page URLs by stripping '#fragment' anchors.
    new_urls_set = set({})
    for i in range(0, urls.__len__()):
        url = urls[i]
        url = url.split('#')[0]
        new_urls_set.add(url)
    ####################
    # Fetch each species page and collect its image URLs.
    # NOTE(review): `info_container`/`records_container` are used here but
    # never initialized in this function — they must be module globals;
    # likely a latent bug worth confirming.
    image_urls = []
    for url in new_urls_set:
        info_container['url'] = url
        info_container = nvsoli.walkon(info_container,
                                       records_container=records_container)
        root = get_etree_root(info_container)
        eles = root.xpath('//tr/td/div/img')
        for j in range(0, eles.__len__()):
            image_urls.append(
                (ryan_base_url + eles[j].attrib['src']).replace(' ', '%20'))
    #####################
    # Download every image; build a bidirectional name <-> URL index.
    mirror_indexes = {}
    image_names = []
    info_names = []
    infos = []
    for i in range(0, image_urls.__len__()):
        suffix = image_urls[i].split('.')[-1]
        # NOTE(review): assumes the basename has at least two '%20'-separated
        # segments (genus + species); an IndexError would escape otherwise.
        arr = os.path.basename(image_urls[i]).split('%20')
        name = arr[0] + ' ' + arr[1].rstrip(',').rstrip('.').rstrip(' ') + '_'
        # sha1 of the source URL keeps local names unique.
        name = name + hashlib.sha1(image_urls[i].encode('utf-8')).hexdigest()
        name = name + '.' + suffix
        image_names.append(name)
        info_names.append(name + '.' + 'info')
        info = {}
        info['origin'] = image_urls[i]
        info['path'] = ''
        info['details'] = {}
        infos.append(info)
        mirror_indexes[name] = image_urls[i]
        mirror_indexes[image_urls[i]] = name
        info_container['url'] = image_urls[i]
        info_container = nvsoli.walkon(info_container,
                                       records_container=records_container)
        nvft.write_to_file(fn=photosdir + '/' + image_names[i],
                           op='wb',
                           content=info_container['resp_body_bytes'])
    # Persist the combined index alongside the photos.
    nvft.write_to_file(fn=photosdir + '/' + 'indexes.dict',
                       op='w',
                       content=json.dumps(mirror_indexes))