Exemplo n.º 1
0
class TestMagicGoogle(unittest.TestCase):
    """Tests for the ``MagicGoogle`` search client.

    NOTE(review): the client is built with a proxy at 127.0.0.1:1087, so
    these tests presumably need that proxy and network access to pass.
    """

    def setUp(self):
        """Build a ``MagicGoogle`` client configured with the local proxy."""
        proxies = [{'http': '127.0.0.1:1087', 'https': '127.0.0.1:1087'}]
        self.mg = MagicGoogle(proxies)

    def tearDown(self):
        """Drop the reference to the client created in ``setUp``."""
        self.mg = None

    def test_search_url(self):
        """The first URL returned for the query 'python' is python.org."""
        # Random value in [2, 15] handed to search_url as its ``pause`` argument.
        pause = random.randint(2, 15)
        result = list(self.mg.search_url(query='python', num=1, pause=pause))
        # Report an empty result as a readable assertion failure instead of
        # letting ``result[0]`` raise IndexError (which shows up as an error).
        self.assertTrue(result, 'test search_url fail: no results returned')
        self.assertEqual(result[0], 'https://www.python.org/',
                         'test search_url fail')
Exemplo n.º 2
0
import os
import sys
import time
import random
# import pprint
import codecs

sys.path.append(os.path.dirname(os.path.dirname(__file__)))
# https://github.com/howie6879/magic_google
# pip install magic_google
from magic_google import MagicGoogle

mg = MagicGoogle()

# Page index: used both in the output file name and to compute the search
# offset (start = x * 100).
x = 0
# The context manager closes the file even if the search or a write raises,
# which the manual open()/close() pair did not guarantee.
with codecs.open("result-" + str(x) + ".txt", "w", "utf-8") as file:
    for url in mg.search_url(query='Github', num=100, start=x * 100):
        file.write(url)
        print(url)
        # Separator line written after every URL.
        file.write("\n----------------\n")
Exemplo n.º 3
0
#
# time.sleep(random.randint(1, 5))

# Full search: each result carries 'title', 'url' and 'text'
# (see the sample output below).
hits = mg.search(query='python', num=1, language='en')
for hit in hits:
    pprint.pprint(hit)

# Wait a random 1-5 seconds before the next query.
delay = random.randint(1, 5)
time.sleep(delay)

# Sample output:
#   {'text': 'The official home of the Python Programming Language.',
#    'title': 'Welcome to Python .org',
#    'url': 'https://www.python.org/'}

# URL-only search using the default arguments of search_url.
links = mg.search_url(query='python')
for link in links:
    pprint.pprint(link)

# Wait a random 1-5 seconds again after the second query.
delay = random.randint(1, 5)
time.sleep(delay)

# Sample output:
#   'https://www.python.org/'
#   'https://www.python.org/downloads/'
#   'https://www.python.org/about/gettingstarted/'
#   'https://docs.python.org/2/tutorial/'
#   'https://docs.python.org/'
#   'https://en.wikipedia.org/wiki/Python_(programming_language)'
#   'https://www.codecademy.com/courses/introduction-to-python-6WeG3/0?curriculum_id=4f89dab3d788890003000096'
#   'https://www.codecademy.com/learn/python'
#   'https://developers.google.com/edu/python/'
#   'https://learnpythonthehardway.org/book/'
Exemplo n.º 4
0
#
# time.sleep(random.randint(1, 5))

# Full search for the caller-supplied key: each result carries 'title',
# 'url' and 'text' (see the sample output below).
hits = mg.search(query=str(search_key), num=1, language='en')
for hit in hits:
    pprint.pprint(hit)

# Wait a random 1-5 seconds before the next query.
delay = random.randint(1, 5)
time.sleep(delay)

# Sample output (appears to come from a 'python' query):
#   {'text': 'The official home of the Python Programming Language.',
#    'title': 'Welcome to Python .org',
#    'url': 'https://www.python.org/'}

# URL-only search using the default arguments of search_url.
links = mg.search_url(query=str(search_key))
for link in links:
    pprint.pprint(link)

# Wait a random 1-5 seconds again after the second query.
delay = random.randint(1, 5)
time.sleep(delay)

# Sample output (appears to come from a 'python' query):
#   'https://www.python.org/'
#   'https://www.python.org/downloads/'
#   'https://www.python.org/about/gettingstarted/'
#   'https://docs.python.org/2/tutorial/'
#   'https://docs.python.org/'
#   'https://en.wikipedia.org/wiki/Python_(programming_language)'
#   'https://www.codecademy.com/courses/introduction-to-python-6WeG3/0?curriculum_id=4f89dab3d788890003000096'
#   'https://www.codecademy.com/learn/python'
#   'https://developers.google.com/edu/python/'
#   'https://learnpythonthehardway.org/book/'
Exemplo n.º 5
0
def search(keyword, num=num, ty='google'):
    """Search for ``keyword`` and append the text of result pages to corpus files.

    For every URL returned by the chosen backend, the page text is fetched
    with ``url_text`` and processed with ``text_pre``. Pages whose
    ``sentences_num`` is greater than 5 have their ``text`` appended to a
    timestamped file under ``PATH``; a new file name is started on every
    fifth accepted page. After all URLs are handled the function sleeps for
    a random 30-100 seconds.

    Args:
        keyword: Query string; it is concatenated into a log message, so it
            must be a ``str``.
        num: Number of results requested; only passed on when ``ty`` is
            ``'google'``.
        ty: Backend selector. ``'google'`` uses ``MagicGoogle(PROXIES)``,
            ``'gcs'`` uses ``search_google.api.results(BUILDARGS, CSEARGS)``,
            and any other value uses ``MagicBaidu()``.

    Returns:
        None.
    """
    logging.info('搜索关键词:' + keyword)

    if ty == 'google':
        mg = MagicGoogle(PROXIES)
        urls = mg.search_url(query=keyword, num=num, start=0, pause=5)
    elif ty == 'gcs':
        # NOTE(review): this branch uses neither ``keyword`` nor ``num``;
        # presumably the query is carried by CSEARGS -- confirm with caller.
        results = search_google.api.results(BUILDARGS, CSEARGS)
        urls = results.get_values('items', 'link')
    else:
        mg = MagicBaidu()
        urls = mg.search_url(query=keyword, start=0, pause=5)

    # NOTE(review): ``cx`` is not used anywhere below; the constructor call is
    # kept only in case it has side effects -- confirm and remove if not.
    cx = tkit.CxExtractor()

    # Count of accepted pages; drives the rotation of the output file name.
    n = 0
    file_name = PATH + 'corpu' + str(time.time()) + ".txt"

    for url in urls:
        logging.info(url)

        try:
            items = url_text(url=url)
        except Exception as err:
            # Best-effort crawl: skip pages that fail to load, but no longer
            # swallow KeyboardInterrupt/SystemExit as the bare ``except`` did.
            logging.warning('Skipping %s: %s', url, err)
            continue

        items = text_pre(items)
        logging.info('句子数目为: ' + str(items['sentences_num']))

        # Only pages with more than 5 sentences are written to the corpus.
        if items['sentences_num'] > 5:
            n = n + 1
            if n % 5 == 0:
                # Start a fresh timestamped file on every fifth accepted page.
                file_name = PATH + 'corpu' + str(time.time()) + ".txt"

            logging.info('写入文件:  ' + file_name)
            # Context manager closes the file even if the write raises.
            with open(file_name, 'a') as corpus_file:
                corpus_file.write(str(items['text']) + '\n\n')

    # Random 30-100 second sleep after the search has been processed.
    t = random.randint(30, 100)

    logging.info('搜索结束休息中 ' + str(t) + 's')
    logging.info("Start : %s" % time.ctime())
    time.sleep(t)
    logging.info("End : %s" % time.ctime())
    return