def run(self):
    start_mark = False
    crawler = Crawler()
    while self.flag:
        try:
            url = 'http://hw.venndata.cn/proxy?num={num}'.format(num=self.num)
            response, _ = crawler.crawl(url=url)
            html = response.text
            if html:
                data = json.loads(html)['data']
                proxy_list = data.split('|')
                if len(proxy_list) > 500:
                    old_len = len(PROXY_LIST)
                    PROXY_LIST.extend(proxy_list)
                    PROXY_LIST[0:old_len] = []
                if not start_mark:
                    log.critical("Proxy fetcher started successfully! Got %s proxies" % len(proxy_list))
                    start_mark = True
        except Exception as e:
            log.error('Fetching proxies failed: ' + str(e))
        time.sleep(self.interval)
    log.info('Proxy fetcher stopped')
    return
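# Hedged, self-contained sketch of the start/stop pattern the run() above
# assumes: a daemon thread loops while `flag` is set and sleeps `interval`
# seconds per pass. The class and attribute names below are illustrative,
# not from the original source.
import threading
import time


class FetcherSketch(threading.Thread):
    def __init__(self, interval=1.0):
        super().__init__(daemon=True)
        self.flag = True          # cleared to ask the loop to stop
        self.interval = interval  # seconds between passes

    def run(self):
        while self.flag:
            # a real implementation would fetch and store proxies here
            time.sleep(self.interval)


if __name__ == '__main__':
    fetcher = FetcherSketch()
    fetcher.start()
    time.sleep(3)
    fetcher.flag = False   # the loop exits after its current sleep
    fetcher.join()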
def fill_empty_restaurant(self, item):
    # If there is a dict, add its restaurants
    if item:
        print("Attempting to add empty restaurants to DB")
        p = ThreadPool(20)
        try:
            # for key, value in self.empty_restaurant.iteritems():
            for key, value in item.iteritems():
                p.apply_async(self.upload_restaurant_item, [value])
        except AttributeError as ae:
            # remove key if the restaurant already exists
            print(ae.args[0])
            item.pop(ae.args[0], None)
            print(item)
        except Exception as e:
            # Sometimes this will throw, but it will execute anyway
            print(e)
        p.close()
        p.join()
        # self.upload_restaurant_list(self.empty_restaurant)
        print("Attempting to add food through YelpApi")
        # p.apply_async(self.upload_food_list(Crawler(self.uninserted_restaurant).query(10)))
        # Pass the callable and its argument separately so the upload runs in
        # the worker thread instead of being executed while the Thread is built.
        t = threading.Thread(
            name="Crawler",
            target=self.upload_food_list,
            args=(Crawler(self.uninserted_restaurant).query(10),))
        t.start()
        t.join()
async def valid_url():
    url = 'http://yoyowallet.com'
    urls = [
        'http://yoyowallet.com',
        'http://yoyowallet.com/',
        'http://yoyowallet.com/about.html',
        'http://yoyowallet.com/assets.html',
        'http://yoyowallet.com/banks/index.html',
        'http://yoyowallet.com/basket-data.html',
        'http://yoyowallet.com/careers.html',
        'http://yoyowallet.com/case-studies/caffe-nero-case-study.html',
        'http://yoyowallet.com/caterers/index.html',
        'http://yoyowallet.com/cookies.html',
        'http://yoyowallet.com/epos.html',
        'http://yoyowallet.com/get-in-touch.html',
        'http://yoyowallet.com/retailers/index.html']
    all_urls = {
        'http://yoyowallet.com/about.html',
        'http://yoyowallet.com/case-studies/caffe-nero-case-study.html',
        'http://yoyowallet.com/careers.html',
        'http://yoyowallet.com/banks/index.html',
        'http://yoyowallet.com',
        'http://yoyowallet.com/assets.html',
        'http://yoyowallet.com/caterers/index.html',
        'http://yoyowallet.com/cookies.html',
        'http://yoyowallet.com/',
        'http://yoyowallet.com/basket-data.html',
        'http://yoyowallet.com/epos.html',
        'http://yoyowallet.com/retailers/index.html',
        'http://yoyowallet.com/get-in-touch.html'}
    crawler = Crawler(url)
    _url, data, _urls, _all_urls = await crawler.extract(url)
    self.assertEqual(_url, url)
    self.assertListEqual(_urls, urls)
    self.assertSetEqual(_all_urls, all_urls)
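# Hedged sketch (not from the original tests): nested coroutines such as
# valid_url() above are usually defined inside a synchronous unittest method
# and driven with an event loop, roughly as below. The test-class and method
# names here are illustrative only.
import asyncio
import unittest


class CrawlerExtractTestSketch(unittest.TestCase):
    def test_valid_url(self):
        async def valid_url():
            # ... the assertions shown in the coroutine above ...
            self.assertTrue(True)

        loop = asyncio.new_event_loop()
        try:
            loop.run_until_complete(valid_url())
        finally:
            loop.close()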
def run(self):
    print("Start")
    self.int()
    list_csv = self.get_parameter()
    wether_url_list = []
    for city in list_csv:
        cityid = self.city_search(city)
        wether_url = self.wether_data(cityid)
        wether_url_list.append(wether_url)
    for li in wether_url_list:
        self.url_queue.put(li)
    # print("Item taken from the queue", self.url_queue.get())
    crawlers = [
        Crawler(self.url_queue, self.page_parse, self.save_data,
                self.generate_page)
        for _ in range(0, self.coro_num)
    ]
    loop = asyncio.get_event_loop()
    to_do = [
        crawlers[coro_id].asyn_crawl(coro_id)
        for coro_id in range(0, self.coro_num)
    ]
    wait_coro = asyncio.wait(to_do)
    loop.run_until_complete(wait_coro)
    loop.run_until_complete(asyncio.sleep(5.25))
    loop.close()
def crawl(base: str):
    clr = Crawler(base)
    pages = clr.crawl(depth=0)
    pages = {
        'urls': pages,
    }
    # jsonify serializes the dict itself; wrapping it in json.dumps first
    # would double-encode the response body.
    return jsonify(pages)
def main(argv):
    '''Demonstrate it working by printing out results'''
    a = Crawler()
    a.init_browser()
    html = a.getContent(argv[0])
    d = PageParse(html, argv[0])
    data = d.socialmedia()
    a.exit()
    print(data)
def main(spider_class, settings_module):
    crawler = Crawler(spider_class, settings_module)
    try:
        joinall([spawn(crawler.process_url) for i in xrange(5)])
    except KeyboardInterrupt:
        crawler.clear(False)
    except:
        logger.exception("Unable to complete")
    else:
        crawler.clear(True)
        logger.info("Crawling completed")
def media_changed(self, media):
    """
    Media mountpoint changed or added.
    """
    for id, client, monitors in self.clients:
        client.rpc('device.changed', media.id, media.prop)
    if not media.crawler:
        if not media.get('block.device'):
            log.info('start crawler for /')
            media.crawler = Crawler(self._db, use_inotify=True)
    self._db.signals['changed'].emit([media._beacon_id])
def get(self, cycle, *a):
    """
    Fetch proxies on a fixed schedule.
    """
    getter = Crawler()
    while True:
        info('[Scheduler] starting proxy crawl')
        getter.run()
        if config["crawl"]["checkmax"]:
            if len(DB().getall()) >= config["crawl"]["maxvalue"]:
                info("[Scheduler] proxy pool has reached its limit, pausing the crawl")
                # Busy-wait until the pool drops below the limit again.
                while len(DB().getall()) >= config["crawl"]["maxvalue"]:
                    pass
        time.sleep(int(cycle))
def run(self):
    print("Start")
    self.int()
    data = self.parameter_json(self.payload)
    all_type_list = list()
    for li in data:
        id = li['id']
        name = li['name']
        print("Level-1 category", id, name)
        self.payload['id'] = id
        data_2 = self.parameter_json(self.payload)
        for li_2 in data_2:
            id_2 = li_2['id']
            name_2 = name + '&' + li_2['name']
            print("Level-2 category", id_2, name_2)
            self.payload['id'] = id_2
            data_3 = self.parameter_json(self.payload)
            if len(data_3) == 0:
                all_type_list.append([id_2, name_2])
            else:
                for li_3 in data_3:
                    id_3 = li_3['id']
                    name_3 = name_2 + '&' + li_3['name']
                    print("Level-3 category", id_3, name_3)
                    all_type_list.append([id_3, name_3])
    print("Total number of monthly data entries", len(all_type_list))
    for li in all_type_list:
        self.url_queue.put(li)
    # print("Item taken from the queue", self.url_queue.get())
    crawlers = [
        Crawler(self.url_queue, self.page_parse, self.save_data,
                self.generate_page)
        for _ in range(0, self.coro_num)
    ]
    loop = asyncio.get_event_loop()
    to_do = [
        crawlers[coro_id].asyn_crawl(coro_id)
        for coro_id in range(0, self.coro_num)
    ]
    wait_coro = asyncio.wait(to_do)
    loop.run_until_complete(wait_coro)
    loop.run_until_complete(asyncio.sleep(3.25))
    loop.close()
def test_fetch_urls(self):
    html = """
    <!DOCTYPE html>
    <html>
    <title>Test</title>
    <body>
    <a href='link1'/>
    <a href='link2'/>
    </body>
    </html>
    """
    crawler = Crawler('http://test.com')
    fetch_urls, all_urls = crawler.fetch_urls(html)
    self.assertListEqual(
        fetch_urls, ['http://test.com/link1', 'http://test.com/link2'])
    self.assertSetEqual(all_urls,
                        {'http://test.com/link2', 'http://test.com/link1'})
def run(self):
    print("Start")
    self.int()
    all_type_list = self.area_code()
    print("Number of cities", len(all_type_list))
    """{'130400': '河北省/邯郸市', '152200': '内蒙古自治区/兴安盟', ...}"""
    for li in all_type_list:
        self.url_queue.put([li, all_type_list[li]])
    # print("Item taken from the queue", self.url_queue.get())
    crawlers = [Crawler(self.url_queue, self.page_parse, self.save_data,
                        self.generate_page)
                for _ in range(0, self.coro_num)]
    loop = asyncio.get_event_loop()
    to_do = [crawlers[coro_id].asyn_crawl(coro_id)
             for coro_id in range(0, self.coro_num)]
    wait_coro = asyncio.wait(to_do)
    loop.run_until_complete(wait_coro)
    loop.run_until_complete(asyncio.sleep(5.25))
    loop.close()
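# Hedged, generic sketch of the queue-plus-worker pattern the run() methods
# above rely on: N crawler coroutines drain a shared queue until it is empty.
# This is an illustration only, not the project's actual asyn_crawl code.
import asyncio
import queue


async def worker_sketch(coro_id, url_queue):
    """Drain the shared queue; a real worker would fetch and parse each item."""
    while True:
        try:
            item = url_queue.get_nowait()
        except queue.Empty:
            break
        print(f"worker {coro_id} handling {item}")
        await asyncio.sleep(0)  # yield control so the other workers can run


async def run_sketch(items, coro_num=3):
    url_queue = queue.Queue()
    for item in items:
        url_queue.put(item)
    await asyncio.gather(*(worker_sketch(i, url_queue)
                           for i in range(coro_num)))


# Example:
# asyncio.run(run_sketch([["130400", "Hebei/Handan"], ["152200", "Hinggan"]]))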
def test_parse_html_content(self):
    html = """
    <!DOCTYPE html>
    <html>
    <title>Test</title>
    <link rel="stylesheet" href="/assets/v2/css/app.css">
    <link rel="stylesheet" href="/assets/v2/css/app2.css">
    <link rel="apple-touch-icon" sizes="152x152" href="/assets/v2/apple-icon-152x152.png">
    <link rel="icon" type="image/png" sizes="192x192" href="/assets/v2/android-icon-192x192.png">
    <link href="/assets/v2/favicon.ico" rel="shortcut icon">
    <link href="/assets/v2/apple-touch-icon.png" rel="apple-touch-icon">
    <body>
    <a href='test1'/>
    <a href='test2'/>
    <div class="col-lg-6">
    <img src="/assets/v2/images/screen-yoyo-apps-2x.png" class="hero-img"/>
    </div>
    <script charset="utf-8" type="text/javascript" src="//js.hsforms.net/forms/v2.js"></script>
    </body>
    </html>
    """
    data = {
        'css_links': {
            'http://test.com/assets/v2/css/app2.css',
            'http://test.com/assets/v2/css/app.css'
        },
        'js_links': {'http://js.hsforms.net/forms/v2.js'},
        'img_links': {'http://test.com/assets/v2/images/screen-yoyo-apps-2x.png'},
        'icon_links': {
            'http://test.com/assets/v2/apple-touch-icon.png',
            'http://test.com/assets/v2/android-icon-192x192.png',
            'http://test.com/assets/v2/favicon.ico',
            'http://test.com/assets/v2/apple-icon-152x152.png'
        }
    }
    crawler = Crawler('http://test.com')
    result = crawler.parse_html_content(html)
    self.assertSetEqual(result['css_links'], data['css_links'])
    self.assertSetEqual(result['js_links'], data['js_links'])
    self.assertSetEqual(result['img_links'], data['img_links'])
    self.assertSetEqual(result['icon_links'], data['icon_links'])
def call_API(self):
    # return self.client.search('SF', self.data)
    # return SearchResponse(
    #     self.client._make_request(SEARCH_PATH, self.data))
    response = SearchResponse(
        self.client._make_request(SEARCH_PATH, self.data))
    list_to_be_returned = []
    # for bus in response.businesses:
    #     list_to_be_returned += Crawler.limit("http://www.yelp.com/biz_photos/" + bus.id + "?tab=food&start=0", self.food_per_business)
    dict_of_urls = {}
    for bus in response.businesses:
        # pprint(bus.categories[0].name)
        # pprint(vars(bus.location.coordinate))
        url = "http://www.yelp.com/biz_photos/" + bus.id + "?tab=food&start=0"
        # list_of_urls.append({url: [bus.location, bus.name]})
        category_list = []
        for category in bus.categories:
            category_list.append(category.name)
        dict_of_urls[url] = dict(
            address=bus.location.address,
            name=bus.name,
            city=bus.location.city,
            state=bus.location.state_code,
            postal_code=bus.location.postal_code,
            display_address=bus.location.display_address,
            latitude=bus.location.coordinate.latitude,
            longitude=bus.location.coordinate.longitude,
            category=category_list)
    # print dict_of_urls
    # pprint(list_of_urls)
    # Crawler.limit(list_of_urls, 1)
    return Crawler(dict_of_urls).limit(self.food_per_business)
def call_API(self):
    response = SearchResponse(
        self.client._make_request(SEARCH_PATH, self.data))
    dict_of_urls = {}
    for bus in response.businesses:
        # url = "http://www.yelp.com/biz_photos/" + bus.id + "?tab=food&start=0"
        category_list = []
        if bus.categories:
            for category in bus.categories:
                category_list.append(category.name)
        dict_of_urls[bus.id] = dict(
            address=bus.location.address,
            city=bus.location.city,
            state=bus.location.state_code,
            postal_code=bus.location.postal_code,
            display_address=bus.location.display_address,
            restaurant_name=bus.name,
            restaurantId=bus.id,
            latitude=bus.location.coordinate.latitude,
            longitude=bus.location.coordinate.longitude,
            category=category_list)
    print(vars(response))
    if response.total == 0:
        raise RuntimeError("Yelp returns no businesses")
    if 'query_method' in self.data and self.data['query_method'] == 1:
        print("DB")
        food_list = DB(dict_of_urls).query(self.food_per_business)
    else:
        print("Yelp")
        food_list = Crawler(dict_of_urls).query(self.food_per_business)
    random.shuffle(food_list)
    return food_list
# -*- coding: utf-8 -*-
"""main module"""
__author__ = 'starstar'

import traceback

from crawl import Crawler
from house import House
from connect2db import DbConnector
from config import HEADERS
import preprocess
from connect2db import action_type

if __name__ == "__main__":
    """main"""
    mysql_connector = DbConnector()
    shcrawler = Crawler(HEADERS, 'sh')
    rg_urls = shcrawler.composeurl(1, 20)
    for region, urls in rg_urls.iteritems():
        for url in urls:
            res = shcrawler.parse(shcrawler.crawl(url))
            for i in res:
                i.region = region
                preprocess.main(i)
                try:
                    action, update_fields = mysql_connector.search(i)
                    if action == action_type.insert:
                        mysql_connector.insert(i)
                    elif action == action_type.update:
                        mysql_connector.update(i, update_fields)
                    elif action == action_type.none:
                        print "already inserted and up-to-date"
async def invalid_url():
    url = 'http://yoyowalletxxxx.com'
    crawler = Crawler('')
    result = await crawler.get_body(url)
    self.assertEqual(result, '')
class Server(object):
    """
    Server for the virtual filesystem to handle write access to the db
    and scanning / monitoring of queries.
    """

    def __init__(self, dbdir, scheduler=None):
        log.info('start beacon')
        try:
            self.ipc = kaa.rpc.Server('beacon')
        except IOError, e:
            kaa.beacon.thumbnail.thumbnail.disconnect()
            log.error('beacon: %s' % e)
            time.sleep(0.1)
            sys.exit(0)
        self.ipc.signals['client-connected'].connect(self.client_connect)
        self.ipc.register(self)
        self._dbdir = dbdir
        self._db = Database(dbdir)
        self._next_client = 0
        self._db.register_inverted_index('keywords', min=2, max=30)
        self._db.register_object_type_attrs(
            "dir",
            image_from_parser=(bool, ATTR_SIMPLE),
            last_crawl=(int, ATTR_SIMPLE),
            title=(unicode, ATTR_SIMPLE),
            series=(unicode, ATTR_SIMPLE),
            season=(int, ATTR_SIMPLE),
            artist=(unicode, ATTR_SIMPLE),
            album=(unicode, ATTR_SIMPLE),
            length=(float, ATTR_SIMPLE))
        # files
        self.register_file_type_attrs(
            "video",
            title=(unicode, ATTR_SEARCHABLE | ATTR_IGNORE_CASE | ATTR_INVERTED_INDEX, 'keywords'),
            poster=(str, kaa.beacon.ATTR_SIMPLE),
            width=(int, ATTR_SIMPLE),
            height=(int, ATTR_SIMPLE),
            length=(float, ATTR_SIMPLE),
            scheme=(str, ATTR_SIMPLE),
            description=(unicode, ATTR_SIMPLE),
            series=(unicode, ATTR_SEARCHABLE),
            season=(int, ATTR_SEARCHABLE),
            episode=(int, ATTR_SEARCHABLE),
            hash=(str, ATTR_SIMPLE),
            stereo=(str, ATTR_SIMPLE),
            timestamp=(int, ATTR_SEARCHABLE))
        self.register_file_type_attrs(
            "audio",
            title=(unicode, ATTR_SEARCHABLE | ATTR_IGNORE_CASE | ATTR_INVERTED_INDEX, 'keywords'),
            artist=(unicode, ATTR_SEARCHABLE | ATTR_INDEXED | ATTR_IGNORE_CASE | ATTR_INVERTED_INDEX, 'keywords'),
            album=(unicode, ATTR_SEARCHABLE | ATTR_IGNORE_CASE | ATTR_INVERTED_INDEX, 'keywords'),
            genre=(unicode, ATTR_SEARCHABLE | ATTR_INDEXED | ATTR_IGNORE_CASE),
            samplerate=(int, ATTR_SIMPLE),
            length=(float, ATTR_SIMPLE),
            bitrate=(int, ATTR_SIMPLE),
            trackno=(int, ATTR_SIMPLE),
            userdate=(unicode, ATTR_SIMPLE),
            description=(unicode, ATTR_SIMPLE),
            hash=(str, ATTR_SIMPLE),
            timestamp=(int, ATTR_SEARCHABLE))
        self.register_file_type_attrs(
            "image",
            width=(int, ATTR_SEARCHABLE),
            height=(int, ATTR_SEARCHABLE),
            comment=(unicode, ATTR_SEARCHABLE | ATTR_IGNORE_CASE | ATTR_INVERTED_INDEX, 'keywords'),
            rotation=(int, ATTR_SIMPLE),
            author=(unicode, ATTR_SIMPLE),
            hash=(str, ATTR_SIMPLE),
            timestamp=(int, ATTR_SEARCHABLE))
        # tracks for rom discs or iso files
        self.register_track_type_attrs(
            "dvd",
            length=(float, ATTR_SIMPLE),
            audio=(list, ATTR_SIMPLE),
            chapters=(int, ATTR_SIMPLE),
            subtitles=(list, ATTR_SIMPLE))
        self.register_track_type_attrs(
            "vcd",
            audio=(list, ATTR_SIMPLE))
        self.register_track_type_attrs(
            "cdda",
            title=(unicode, ATTR_SEARCHABLE | ATTR_INVERTED_INDEX, 'keywords'),
            artist=(unicode, ATTR_SEARCHABLE | ATTR_INDEXED | ATTR_INVERTED_INDEX, 'keywords'))
        # list of current clients
        self.clients = []
        # Config file is synced in Thumbnailer. See its constructor for
        # rationale.
        config.load(os.path.join(dbdir, "config"))
        config.watch()
        if scheduler:
            config.scheduler.policy = scheduler
        else:
            config.autosave = True
        # commit and wait for the results (there are no results, this code
        # is only used to force waiting until the db is set up)
        self._db.commit()
        # give database to controller / hardware monitor
        rootfs = {
            'beacon.id': 'root-' + get_machine_uuid(),
            'block.device': '',
            'volume.mount_point': '/'
        }
        self.item_controller = Controller(self, self._db, rootfs)
        self._db.commit()
        # load plugins
        plugins.load(self, self._db)
        for dir in config.monitors:
            self.monitor_directory(os.path.expandvars(os.path.expanduser(dir)))
        # scanner
        self.scanner = Crawler(self._db, monitor=False)
def __init__(self):
    self.redis = RedisClient()
    self.crawler = Crawler()
single_row = RowParser(rows[0])
single_row.extract_fields()
print(single_row.extracted_content)

print("-------- Extracting data from all results --------------")
results = []
for i in rows:
    single_row = RowParser(i)
    single_row.extract_fields()
    results.append(single_row.extracted_content)
print(results)
print(f"length of results: {len(results)}")

print("testing the new class")
parser_ = Parser(a.page_content)
parser_.extract_fields()
print(parser_.results)
print(parser_._log)

print("testing the master object")
crawler = Crawler(search_params)
crawler.get_all()
print(crawler.results)
print(crawler.log)
def main():
    crawler = Crawler("http://start.bg/")
    database = Database()
    crawler.start()
        f.write(self.output_string)
        f.close()
    else:
        print self.output_string

def print_tree(self, tree, level):
    self.output('<li><a href="' + tree.url + '">' + tree.url + '</a></li>', level)
    if tree.statics:
        self.output('<b>Static resources:</b>', level)
        self.output('<ul>', level)
        for s in tree.statics:
            self.output(
                '<li>' + s[0] + ': <a href="' + s[1] + '">' + s[1] + '</a></li>',
                level)
        self.output('</ul>', level)
    if tree.children:
        self.output('<b>Children:</b>', level)
        self.output('<ul>', level)
        for c in tree.children:
            self.print_tree(c, level + 1)
        self.output('</ul>', level)


starttime = time.time()
crawler = Crawler(args.domain)
c = crawler.crawl_domain()
endtime = time.time()

p = Parser(c, args.file)
p.render_html(endtime - starttime)
def __init__(self):
    self.db = RedisClient()
    self.crawl = Crawler()
from dynamodb import DB
import json
from crawl import Crawler
import threading

# Test parameters
with open('params.json', 'r') as file:
    json_sns = file.read()

# DB().fill_empty_restaurant(json.loads(json_sns))
# Pass the callable and its argument separately so the upload runs in the
# worker thread instead of being executed while the Thread is constructed.
t = threading.Thread(name="Crawler",
                     target=DB().upload_food_list,
                     args=(Crawler(json.loads(json_sns)).query(10),))
t.start()
t.join()
print("NEXT")
def test_parse_empty_html_content(self):
    html = ""
    data = {}
    crawler = Crawler('http://test.com')
    result = crawler.parse_html_content(html)
    self.assertDictEqual(result, data)
db = MySqlOperator(server='127.0.0.1', user_name='root', password='',
                   dbname='taobao_sf')
rows = db.execute(
    'SELECT distinct(itemId) FROM taobao_sf.sf_list_itemid').fetchall()
for row in rows:
    item_id = row[0]
    print(item_id)
    url = 'https://h5api.m.taobao.com/h5/mtop.taobao.govauctionmtopcommonservice.getfrontcategory/1.0/?jsv=2.4.5&appKey=12574478&t=1570096614606&api=mtop.taobao.govauctionmtopcommonservice.getfrontcategory'
    headers = {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Mobile Safari/537.36'
    }
    crawler = Crawler()
    res, session = crawler.crawl(url=url, headers=headers)
    cookies = res.cookies.get_dict()
    m_h5_tk = cookies['_m_h5_tk']
    app_key = '12574478'
    data = '{"itemId":"%s"}' % item_id
    sign, t = get_sign(m_h5_tk, app_key, data)
    params = {
        'jsv': '2.4.2',
        'appKey': app_key,
        't': t,
        'sign': sign,
        'api': 'mtop.taobao.GovauctionMTopDetailService.queryHttpsItemDetail',
        'v': '2.0',
        'ecode': '0',
        'type': 'jsonp',
async def valid_url():
    url = 'http://yoyowallet.com'
    crawler = Crawler('')
    result = await crawler.get_body(url)
    self.assertTrue(result)
def __init__(self):
    self.crawler = Crawler()
    self.redis = RedisClient()
    self.proxy_list = []
parser.add_argument(
    '-u', '--url', required=True, type=str,
    help='For example => http://yoyowallet.com/')
parser.add_argument(
    '-o', '--out', required=True, type=str,
    help='You have to enter a valid file address')
parser.add_argument(
    '-t', '--type', required=True, type=FileType.from_string,
    choices=list(FileType),
    help='You have to choose one of them => csv or xml')
args = parser.parse_args()

crawler = Crawler(str(args.url))
task = asyncio.Task(crawler.crawl())
loop = asyncio.get_event_loop()
print(f'\n{30*"*"} crawler is working {30*"*"}\n\n')
loop.run_until_complete(task)
loop.close()
result = task.result()
print(f'\n\n{30*"*"} crawling was done {30*"*"}\n\n')
Export().print(str(args.type), result, str(args.out))
print(f'{30*"*"} output saved to {args.out} {30*"*"}\n')
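# Hedged usage note: the script's filename is not given in the source, so
# "main.py" below is only an assumption. With the arguments defined above,
# an invocation could look like
#
#     python main.py -u http://yoyowallet.com/ -o result.csv -t csv
#
# which crawls the site and then exports the collected links to result.csv.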
def __init__(self):
    self.crawler = Crawler()
    self.mysql_obj = dbMysql.DbMysql(env.DB_HOST, env.DB_PORT,
                                     env.DB_USERNAME, env.DB_PASSWORD,
                                     env.DB_DATABASE)
    self.db_obj = dbClass.DbWrapper(self.mysql_obj)