예제 #1
0
def sm1234_search(key, pn):
    """Yield search results scraped from sm.sm1234.net.

    Sends ``key`` as the ``q`` query parameter and ``pn`` as the ``p``
    parameter, parses the response with BeautifulSoup (lxml parser) and
    lazily yields one dict per usable ``<div class="g">`` block.

    Args:
        key: Search query string.
        pn: Value passed through unchanged as the ``p`` request parameter
            (presumably the page number -- confirm against the caller).

    Yields:
        dict: ``{'title', 'url', 'text'}``. ``title`` is the text of the
        block's ``<h2>``, ``url`` is the ``href`` of the link inside that
        heading prefixed with the site origin, and ``text`` is the text of
        the ``<div class="std">`` child, or ``""`` when that child is absent.
    """
    origin = "http://sm.sm1234.net"
    headers = {'User-Agent': random_user_agent()}
    response = requests.get(origin + "/",
                            params={'q': key, 'p': pn},
                            headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')

    for item in soup.find_all('div', attrs={"class": "g"}):
        heading = item.h2
        link = heading.a if heading is not None else None
        if link is None or not link.has_attr('href'):
            # Without a linked <h2> there is no title/url to report; skip the
            # block instead of raising and losing every remaining result.
            continue

        snippet = item.find("div", attrs={"class": "std"})
        yield {
            'title': heading.get_text(),
            'url': origin + link['href'],
            'text': snippet.get_text() if snippet is not None else "",
        }
예제 #2
0
def gg_search(ip, key, pn):
    """Yield search results scraped from a Google result page.

    Args:
        ip: Value sent in the ``X-Forwarded-For`` request header.
        key: Search query, sent as the ``q`` parameter.
        pn: Value sent as the ``start`` parameter (presumably the result
            offset -- confirm against the caller).

    Yields:
        dict: ``{'title', 'url', 'text'}`` for every ``<div class="g">`` block
        that contains a link with an ``href``. ``title`` is the stripped text
        of the first ``<a>`` in the block and ``url`` its ``href``. ``text``
        is the text of the ``<span class="st">`` snippet, or ``""`` when the
        block has none, so the key is always present (as in ``ddk_search``).
    """
    headers = {'User-Agent': random_user_agent(), 'X-Forwarded-For': ip}
    response = requests.get("https://www.google.com/search",
                            params={'q': key, 'start': pn},
                            headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')

    for item in soup.find_all('div', attrs={"class": "g"}):
        link = item.find('a')
        if link is None or not link.has_attr('href'):
            # A block without a usable link cannot produce a result; skip it
            # rather than raising and aborting the whole generator.
            continue

        snippet = item.find("span", attrs={"class": "st"})
        yield {
            'title': link.text.strip(),
            'url': link["href"],
            'text': snippet.text if snippet is not None else "",
        }
예제 #3
0
def bd_search(key, pn):
    """Yield search results scraped from a Baidu result page.

    Args:
        key: Search query, sent as the ``wd`` parameter.
        pn: Sent as the ``pn`` parameter; must be convertible with ``int()``
            because it also seeds the expected result-id counter.

    Yields:
        dict: ``{'text', 'url', 'title'}`` for each accepted result. ``text``
        is the concatenated text of the result's inner ``<div>`` elements
        whose first CSS class contains ``abstract`` or equals ``c-row``;
        ``url`` is the ``href`` of the ``<h3>`` link exactly as found in the
        page (not resolved further); ``title`` is the ``<h3>`` text.

    Side effects:
        Prints a start banner, and prints a diagnostic for every accepted
        container that has no ``<h3>`` link (that container is skipped).
    """
    print("bd_search start...")
    headers = {'User-Agent': random_user_agent()}
    response = requests.get("http://www.baidu.com/s",
                            params={'wd': key, 'pn': pn},
                            headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')

    expected_id = int(pn)
    for item in soup.find_all('div', attrs={"class": "c-container"}):
        # The counter advances for every container, accepted or not; only a
        # container whose id attribute equals the counter is processed.
        expected_id += 1
        if not (item.has_attr('id') and item['id'] == str(expected_id)):
            continue

        snippet_parts = []
        for div in item.find_all('div'):
            classes = div.get('class')
            if not classes:
                # No class attribute, or an empty one: nothing to match on.
                continue
            first_class = classes[0]
            if 'abstract' in first_class or first_class == 'c-row':
                snippet_parts.append(div.get_text())

        heading = item.h3
        link = heading.a if heading is not None else None
        if link is None:
            # Covers both a missing <h3> and an <h3> without a link.
            print("item.h3.a is None ***")
            print(heading)
            continue

        yield {
            'text': "".join(snippet_parts),
            'url': link['href'],
            'title': heading.get_text(),
        }
예제 #4
0
def ddk_search(key, pn):
    """Yield search results scraped from the DuckDuckGo HTML endpoint.

    Args:
        key: Search query, sent as the ``q`` parameter.
        pn: Page indicator; ``int(pn) * 3`` is sent as both the ``s`` and
            ``dc`` parameters. When that offset exceeds 10 the request also
            carries ``v=l``, ``o=json`` and ``api=/d.js``.

    Yields:
        dict: ``{'url', 'title', 'text'}`` for every result block that has a
        linked ``<h2>``. ``title`` is the heading text followed by a space and
        the URL; ``text`` is the ``result__snippet`` link text, or ``""`` when
        the block has no snippet.
    """
    offset = int(pn) * 3
    params = {'q': key, 's': offset, 'dc': offset}
    if offset > 10:
        params.update({'v': 'l', 'o': 'json', 'api': '/d.js'})

    headers = {'User-Agent': random_user_agent()}
    # The parameters are sent in the query string of a POST request.
    response = requests.post("https://www.duckduckgo.com/html",
                             params=params,
                             headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')

    # The class string is matched as written; per the original author's note
    # this exact form is what the page served to the server contains.
    result_blocks = soup.find_all(
        'div', attrs={"class": "links_main links_deep result__body"})
    for item in result_blocks:
        heading = item.h2
        link = heading.a if heading is not None else None
        if link is None or not link.has_attr('href'):
            # No linked heading means no url/title; skip instead of raising
            # and discarding the remaining results.
            continue

        url = link['href']
        snippet = item.find("a", attrs={"class": "result__snippet"})
        yield {
            'url': url,
            'title': heading.get_text() + " " + url,
            'text': snippet.get_text() if snippet is not None else "",
        }
예제 #5
0
def html_to_md(url, param):
    """Render ``url`` in PhantomJS and convert the page, or one element, to Markdown.

    Args:
        url: Address of the page to load.
        param: Optional element selector. Either a bare tag name
            (``"article"``) or a tag name, one space, and a single
            ``attr=value`` pair (``'div id="main"'``); double quotes around
            the value are stripped. Tokens after the second are ignored.
            A falsy value selects the whole page.

    Returns:
        str: Markdown produced by ``html2text``. The whole page is converted
        when ``param`` is falsy, when its second token has no ``=``, or when
        no element matches the selector.
    """
    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    capabilities["phantomjs.page.settings.userAgent"] = random_user_agent()
    driver = webdriver.PhantomJS(desired_capabilities=capabilities)
    try:
        driver.get(url)
        driver.implicitly_wait(5)
        html = driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails,
        # so the PhantomJS process is not left running.
        driver.quit()

    if param:
        soup = BeautifulSoup(html, 'lxml')
        tokens = param.split(" ")
        selected = None
        if len(tokens) == 1:
            selected = soup.find(param)
        else:
            # Split on the first '=' only so values containing '=' survive.
            attr_name, separator, attr_value = tokens[1].partition("=")
            if separator:
                # attrs must be a dict mapping attribute name to value; a set
                # literal here would not filter on the named attribute.
                selected = soup.find(
                    tokens[0], attrs={attr_name: attr_value.strip("\"")})

        if selected is not None:
            return html2text.html2text(str(selected))
        # No match: fall through to the whole page rather than converting
        # the literal string "None".

    return html2text.html2text(html)
예제 #6
0
def youdao_fanyi(type, q, dst):
    """Translate ``q`` through the Youdao web translation endpoint.

    Source and target languages are both sent as ``AUTO``; the ``type`` and
    ``dst`` arguments are accepted but not used by this implementation.

    Args:
        type: Unused.
        q: Text to translate, sent as the ``i`` parameter.
        dst: Unused.

    Returns:
        str: For each entry of the response's ``translateResult`` list, the
        ``tgt`` values of its items concatenated and followed by a newline.
    """
    print("youdao_fanyi")
    headers = {'User-Agent': random_user_agent()}
    endpoint = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&sessionFrom=https://www.baidu.com/link'
    # Fixed request fields; only 'i' (the text to translate) varies per call.
    query = {
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': '1500092479607',
        'sign': 'c98235a85b213d482b8e65f6b1065e26',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_CL1CKBUTTON',
        'typoResult': 'true',
        'i': q,
    }

    response = requests.get(endpoint, params=query, headers=headers)
    payload = response.json()

    lines = []
    for group in payload['translateResult']:
        lines.append("".join(piece['tgt'] for piece in group) + "\n")
    return "".join(lines)
예제 #7
0
# Endpoint and fixed query fields for gg_fanyi, in the order they are sent.
_GG_FANYI_URL = "http://translate.google.cn/translate_a/single"
_GG_FANYI_DT = ("at", "bd", "ex", "ld", "md", "qca", "rw", "rm", "ss", "t")
_GG_FANYI_TRAILER = (
    ("ie", "UTF-8"), ("oe", "UTF-8"), ("clearbtn", "1"), ("otf", "1"),
    ("pc", "1"), ("srcrom", "0"), ("ssel", "0"), ("tsel", "0"), ("kc", "2"),
)
# Caller-facing destination names mapped to the codes sent as ``tl``.
_GG_FANYI_TARGETS = {"cn": "zh-cn", "en": "en", "fra": "fr"}
_GG_FANYI_CHINESE = ("zh-cn", "zh-tw")


def _gg_fanyi_languages(detected, dst):
    """Return the ``(sl, tl)`` language codes for a detected source and ``dst``."""
    # Anything other than English / Chinese is submitted as English.
    if detected == "en" or detected in _GG_FANYI_CHINESE:
        source = detected
    else:
        source = "en"
    # An unrecognised dst translates Chinese to English, everything else to
    # Simplified Chinese.
    fallback = "en" if source in _GG_FANYI_CHINESE else "zh-cn"
    return source, _GG_FANYI_TARGETS.get(dst, fallback)


def gg_fanyi(type, q, dst):
    """Translate ``q`` through the translate.google.cn web endpoint.

    Args:
        type: Ignored; the source language is always re-detected from the
            first 30 characters of ``q``.
        q: Text to translate.
        dst: Destination selector: ``"cn"`` (Simplified Chinese), ``"en"``
            (English) or ``"fra"`` (French). Any other value selects English
            when the detected source is ``zh-cn``/``zh-tw`` and Simplified
            Chinese otherwise.

    Returns:
        str: The non-empty first elements of the rows in the first element of
        the JSON response, concatenated.

    Side effects:
        Prints a banner and the detected language.
    """
    print("gg_fanyi")

    # One session serves both the token request and the translation request;
    # the context manager closes it when the call finishes.
    with requests.Session() as session:
        token_acquirer = TokenAcquirer(session=session,
                                       host="translate.google.cn")
        tk = token_acquirer.do(q)
        headers = {'User-Agent': random_user_agent()}

        detected = detect(q[:30])  # e.g. "en", "zh-cn"
        print("detect(q) = " + detected)
        source, target = _gg_fanyi_languages(detected, dst)

        # A list of pairs keeps the repeated "dt" keys and lets requests
        # percent-encode q, so '&', '#' or '+' in the text cannot corrupt
        # the query string.
        params = [("client", "t"), ("sl", source), ("tl", target),
                  ("hl", "zh-CN")]
        params.extend(("dt", value) for value in _GG_FANYI_DT)
        params.extend(_GG_FANYI_TRAILER)
        params.extend([("tk", tk), ("q", q)])

        response = session.get(_GG_FANYI_URL, params=params, headers=headers)
        data = response.json()

    # Tolerate a null first element instead of failing with TypeError.
    rows = data[0] or []
    return "".join(row[0] for row in rows if row[0])