def get_vaname(query, verbose=False, debug=False):
    """Guess the most prominent name on the search result page for *query*.

    The page returned by ``get_google_content`` is lower-cased, stripped of
    markup and split on ``"||"`` into text fragments.  Every distinct
    fragment gets the weight ``occurrences * len(fragment)``, multiplied by
    10 when ``cjk.contain_cjk`` reports CJK text in it.  The answer is
    derived from the five heaviest fragments.

    Args:
        query: search terms handed to ``get_google_content``.
        verbose: when true, print every ``(weight, fragment)`` pair in
            ascending weight order.
        debug: forwarded to ``get_google_content``; when true the cleaned
            page text is printed as well.

    Returns:
        The longest of the top-five fragments that contain the heaviest
        fragment, each with its trailing run of word characters, spaces and
        dots removed.  An empty string when the page yields no fragments.
    """
    html = get_google_content(query, debug=debug)
    html = html.lower()
    html = htmltool.decode_entity(html)
    html = htmltool.remove_tags(html, repl="||")
    html = htmltool.clean_tags(html, repl="||")
    if debug:
        printu(html)
    html = separate(html, repl="||")
    html = replace(html, "")
    html = cjk.half2full(html)

    # Blank fragments carry no information; drop them before counting.
    fragments = [part.strip() for part in html.split("||") if part.strip()]
    occurrences = Counter(fragments)

    weighted = [
        (text, count * len(text) * (10 if cjk.contain_cjk(text) else 1))
        for text, count in occurrences.items()
    ]
    weighted.sort(key=lambda pair: pair[1], reverse=True)  # heaviest first

    if verbose:
        # Dump the (fragment, weight) list, lightest first.
        for text, weight in sorted(weighted, key=lambda pair: pair[1]):
            printu("%-6d:%s" % (weight, text))

    if not weighted:
        # Nothing usable on the page: avoid an IndexError on weighted[0].
        return ""

    heaviest = weighted[0][0]
    # Among the five heaviest fragments, keep those that contain the heaviest
    # one and strip their trailing run of word characters, spaces and dots.
    candidates = [
        re.sub(r"[\w \.]+$", "", text)
        for text, _weight in weighted[:5]
        if heaviest in text
    ]
    return max(candidates, key=len)
def get_vapic(keyword, path=os.path.abspath(os.path.curdir.decode()), num=3,
              height=700, width=500, verbose=False, debug=False):
    """Download up to *num* sufficiently large ``.jpg`` images for *keyword*.

    The page returned by ``get_google_content_pic_search`` is scanned for
    ``imgurl=....jpg`` targets.  Each target is fetched with
    ``get_web_content_with_cache``, checked against the size thresholds and,
    if it passes, written into *path* under the URL's base name.  A line is
    printed for every saved image.  Failures on one URL never abort the
    run; they are only reported when *verbose* is true.

    Args:
        keyword: search terms handed to ``get_google_content_pic_search``.
        path: directory that receives the images.  NOTE: the default is
            computed once, when the function is defined, not on each call.
        num: maximum number of images to save; values <= 0 save nothing.
        height: minimum accepted value of ``image.size[0]`` (default 700).
        width: minimum accepted value of ``image.size[1]`` (default 500).
        verbose: when true, print each URL tried and any per-URL error.
        debug: forwarded to ``get_google_content_pic_search``; when true the
            cleaned page text is printed as well.

    Returns:
        None.
    """
    html = get_google_content_pic_search(keyword, debug=debug)
    html = htmltool.decode_entity(html)
    html = htmltool.remove_tags(html, repl="||")
    if debug:
        printu(html)

    # The dot is escaped so that only a literal ".jpg" suffix matches.
    imgurls = re.findall(r"imgurl=([^&]*?\.jpg)", html, flags=re.I)

    remaining = num
    for url in imgurls:
        if remaining <= 0:
            break
        if verbose:
            printu("try url: %s" % url)
        try:
            # Fetch the image bytes; an empty response is skipped.
            content = get_web_content_with_cache(url)
            if not content:
                continue

            # Skip images below the requested minimum size.
            # NOTE(review): if get_image_from_buff returns a PIL image, size
            # is (width, height), so these two names look swapped.  The
            # pairing is kept so the default thresholds (size[0] >= 700,
            # size[1] >= 500) behave exactly as before - confirm intent.
            image = get_image_from_buff(content)
            if image.size[0] < height or image.size[1] < width:
                continue

            # Save under the URL's base name inside the target directory.
            filepath = os.path.join(path, os.path.basename(url))
            with open(filepath, "wb") as f:
                f.write(content)
            remaining -= 1
            printu("(%4d,%4d)[%-32s] <= [%s]" % (
                image.size[0], image.size[1], os.path.relpath(filepath), url))
        except Exception as e:
            # Best effort: one bad URL must not stop the remaining downloads.
            if verbose:
                printu("Error:%s: %s: %s" % (type(e), str(e), url))