def sm1234_search(key, pn):
    """Search sm.sm1234.net for *key* (page *pn*) and yield result dicts.

    Each yielded dict carries 'title', 'url' and 'text' keys.
    """
    params = {'q': key, 'p': pn}
    resp = requests.get(
        "http://sm.sm1234.net/",
        params=params,
        headers={'User-Agent': random_user_agent()},
    )
    page = BeautifulSoup(resp.text, 'lxml')
    for hit in page.find_all('div', attrs={"class": "g"}):
        # Result links are site-relative, so prepend the site root.
        yield {
            'title': hit.h2.get_text(),
            'url': "http://sm.sm1234.net" + hit.h2.a['href'],
            'text': hit.find("div", attrs={"class": "std"}).get_text(),
        }
def gg_search(ip, key, pn):
    """Search Google for *key* (result offset *pn*) and yield result dicts.

    *ip* is sent as X-Forwarded-For to vary the apparent client address.
    Yields dicts with 'title' and 'url'; 'text' is included only when a
    snippet span is present in the markup.
    """
    resp = requests.get(
        "https://www.google.com/search",
        params={'q': key, 'start': pn},
        headers={'User-Agent': random_user_agent(), 'X-Forwarded-For': ip},
    )
    page = BeautifulSoup(resp.text, 'lxml')
    for hit in page.find_all('div', attrs={"class": "g"}):
        link = hit.find('a')
        entry = {'title': link.text.strip(), 'url': link["href"]}
        snippet = hit.find("span", attrs={"class": "st"})
        if snippet:
            entry['text'] = snippet.text
        yield entry
def bd_search(key, pn):
    """Search Baidu for *key* starting at offset *pn*; yield result dicts.

    Baidu numbers its result containers sequentially via the ``id``
    attribute, so only containers whose id matches the running counter
    are treated as organic results.  Each yielded dict has 'title',
    'url' and 'text' keys.
    """
    print("bd_search start...")
    kv = {'wd': key, 'pn': pn}
    headers = {'User-Agent': random_user_agent()}
    r = requests.get("http://www.baidu.com/s", params=kv, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    now = int(pn)
    for item in soup.find_all('div', attrs={"class": "c-container"}):
        now += 1
        if item.has_attr('id') and item['id'] == str(now):
            result = {}
            # Concatenate the abstract / c-row divs to build the snippet.
            ss = ''
            for div in item.find_all('div'):
                if div.has_attr('class') and (
                        div['class'][0].find('abstract') != -1
                        or div['class'][0] == 'c-row'):
                    ss += div.get_text()
            result['text'] = ss
            # BUG FIX: also require item.h3 itself — the original evaluated
            # item.h3.a, which raises AttributeError when the container has
            # no <h3> at all.  Such containers now fall into the log branch.
            if item.h3 and item.h3.a:
                # The href is Baidu's redirect URL; following it to resolve
                # the final target would cost one extra request per hit.
                result['url'] = item.h3.a['href']
                result['title'] = item.h3.get_text()
                yield result
            else:
                print("item.h3.a is None ***")
                print(item.h3)
def ddk_search(key, pn):
    """Search DuckDuckGo's HTML endpoint for *key*; yield result dicts.

    *pn* is a page number, scaled by 3 to form the result offset.  Each
    yielded dict has 'title', 'url' and 'text' keys ('text' is the empty
    string when no snippet anchor exists).
    """
    offset = int(pn) * 3
    query = {'q': key, 's': offset, 'dc': offset}
    if offset > 10:
        # Deeper pages need the d.js API parameters.
        query.update({'v': 'l', 'o': 'json', 'api': '/d.js'})
    resp = requests.post(
        "https://www.duckduckgo.com/html",
        params=query,
        headers={'User-Agent': random_user_agent()},
    )
    page = BeautifulSoup(resp.text, 'lxml')
    # NOTE(review): this class combination matches the server-rendered
    # markup; local renders may use "result results_links ..." instead.
    for hit in page.find_all(
            'div', attrs={"class": "links_main links_deep result__body"}):
        href = hit.h2.a['href']
        snippet = hit.find("a", attrs={"class": "result__snippet"})
        yield {
            'url': href,
            'title': hit.h2.get_text() + " " + href,
            'text': snippet.get_text() if snippet else "",
        }
def html_to_md(url, param):
    """Fetch *url* via PhantomJS and convert its HTML to Markdown.

    param selects the sub-tree to convert:
      - falsy: convert the whole page;
      - "tag": convert the first <tag> element;
      - 'tag attr="value"': convert the first <tag> whose attribute
        *attr* equals *value*.
    Returns the Markdown text.
    """
    # Render through PhantomJS so JavaScript-generated content is included.
    # NOTE(review): PhantomJS support is deprecated in recent Selenium
    # releases — confirm the pinned selenium version still ships it.
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (random_user_agent())
    driver = webdriver.PhantomJS(desired_capabilities=dcap)
    driver.get(url)
    driver.implicitly_wait(5)
    html = driver.page_source
    driver.quit()
    if param:
        li = param.split(" ")
        soup = BeautifulSoup(html, 'lxml')
        if len(li) > 1:
            name = li[0]
            lili = li[1].split("=")
            if len(lili) > 1:
                attr_name = lili[0]
                attr_value = lili[1].strip("\"")
                # BUG FIX: attrs must be a dict mapping attribute name to
                # value; the original passed the set {attrs1, attrs2}, so
                # the attribute filter never matched as intended.
                select_html = soup.find(name, attrs={attr_name: attr_value})
                md = html2text.html2text(str(select_html))
                return md
        else:
            name = param
            select_html = soup.find(name)
            md = html2text.html2text(str(select_html))
            return md
    # Fallback (no param, or a malformed attr spec): convert everything.
    md = html2text.html2text(html)
    return md
def youdao_fanyi(type, q, dst):
    """Translate *q* through the Youdao web-translate endpoint.

    *type* and *dst* are accepted for interface compatibility but unused:
    the request always sends from=AUTO / to=AUTO, letting Youdao pick the
    language pair.  Returns the translated text, one line per source
    paragraph (trailing newline included).
    """
    print("youdao_fanyi")
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&sessionFrom=https://www.baidu.com/link'
    # Fixed salt/sign pair taken from a captured web-client request.
    payload = {
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': '1500092479607',
        'sign': 'c98235a85b213d482b8e65f6b1065e26',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_CL1CKBUTTON',
        'typoResult': 'true',
        'i': q,
    }
    resp = requests.get(url, params=payload,
                        headers={'User-Agent': random_user_agent()})
    body = resp.json()
    # translateResult is a list of paragraphs, each a list of segments
    # with a 'tgt' (target text) field.
    return ''.join(
        ''.join(segment['tgt'] for segment in paragraph) + "\n"
        for paragraph in body['translateResult']
    )
def gg_fanyi(type, q, dst):
    """Translate *q* via the translate.google.cn single endpoint.

    type: ignored on input — the source language is re-detected from q.
    q:    text to translate.
    dst:  target selector: "cn" -> zh-cn, "en" -> en, "fra" -> fr;
          anything else falls back to zh-cn for English sources and to
          en for Chinese/unknown sources (matching the original tables).
    Returns the concatenated translated text.
    """
    from urllib.parse import quote

    print("gg_fanyi")
    session = requests.Session()
    # The endpoint requires a tk token derived from q (replaces the old
    # embedded-JS TL()/RL() implementation kept in the repo history).
    token_acquirer = TokenAcquirer(session=session, host="translate.google.cn")
    tk = token_acquirer.do(q)
    headers = {'User-Agent': random_user_agent()}
    # A short prefix is enough for language detection.
    type = detect(q[:30])  # "en" | "zh_cn" | ...
    print("detect(q) = " + type)
    # Source language: pass through the two Chinese variants, default to en.
    # (This collapses the original 4x4 if/elif URL matrix, whose sixteen
    # branches differed only in the sl/tl query parameters.)
    sl = type if type in ('zh-cn', 'zh-tw') else 'en'
    fallback = 'zh-cn' if sl == 'en' else 'en'
    tl = {'cn': 'zh-cn', 'en': 'en', 'fra': 'fr'}.get(dst, fallback)
    # BUG FIX: q is spliced into a hand-built query string, so it must be
    # percent-encoded — raw '&' or '=' in the text corrupted the request.
    url = ("http://translate.google.cn/translate_a/single?client=t"
           "&sl=%s&tl=%s&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca"
           "&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&clearbtn=1&otf=1&pc=1"
           "&srcrom=0&ssel=0&tsel=0&kc=2&tk=%s&q=%s" % (sl, tl, tk, quote(q)))
    r = session.get(url, headers=headers)
    data = r.json()
    # data[0] is a list of segments whose first element is the translated
    # text (may be None/empty for metadata rows) — join the real pieces.
    result = ''
    for dt in data[0]:
        if dt[0]:
            result += dt[0]
    return result