Example #1
    def run(self):
        # Periodically refresh the shared PROXY_LIST until self.flag is cleared.
        start_mark = False
        crawler = Crawler()
        while self.flag:
            try:
                url = 'http://hw.venndata.cn/proxy?num={num}'.format(
                    num=self.num)

                response, _ = crawler.crawl(url=url)
                html = response.text
                if html:
                    data = json.loads(html)['data']
                    proxy_list = data.split('|')
                    if len(proxy_list) > 500:
                        # Swap the contents of PROXY_LIST in place so other
                        # threads holding a reference see the fresh proxies.
                        old_len = len(PROXY_LIST)
                        PROXY_LIST.extend(proxy_list)
                        PROXY_LIST[0:old_len] = []
                    if not start_mark:
                        log.critical("Proxy pool started successfully! Fetched %s proxies" % len(proxy_list))
                        start_mark = True
            except Exception as e:
                log.error('Failed to fetch proxies: ' + str(e))
            time.sleep(self.interval)

        log.info('Proxy pool stopped')
        return
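The self.flag, self.num, and self.interval attributes indicate that this run() belongs to a background thread. A minimal sketch of such a wrapper class follows; the class name, defaults, and stop() helper are assumptions for illustration, not taken from the original project.

import threading

class ProxyRefresher(threading.Thread):
    """Hypothetical wrapper for the run() method shown above."""

    def __init__(self, num=1000, interval=60):
        super().__init__(daemon=True)
        self.flag = True          # cleared by stop() to end the loop in run()
        self.num = num            # number of proxies requested per fetch
        self.interval = interval  # seconds to sleep between fetches

    # run(self) as defined in the example above goes here.

    def stop(self):
        self.flag = False

# refresher = ProxyRefresher()
# refresher.start()   # keeps PROXY_LIST topped up in the background
# ...
# refresher.stop()    # run() logs 'Proxy pool stopped' and returns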
Example #2
def crawl(base: str):
    clr = Crawler(base)
    pages = clr.crawl(depth=0)
    # Return the dict directly through jsonify; serializing it with
    # json.dumps() first would double-encode the payload.
    return jsonify({'urls': pages})
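The call to jsonify suggests this view is served by a Flask app. A minimal sketch of wiring it up is shown below; the route path and the base query parameter are illustrative assumptions, not the original project's routing.

from flask import Flask, request, jsonify

app = Flask(__name__)

# Expose the crawl() view from the example above,
# e.g. GET /crawl?base=https://example.com/
@app.route('/crawl')
def crawl_endpoint():
    return crawl(request.args.get('base', ''))

# app.run(debug=True)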
Example #3
    parser.add_argument(
        '-u',
        '--url',
        required=True,
        type=str,
        help='For example => http://yoyowallet.com/')
    parser.add_argument(
        '-o',
        '--out',
        required=True,
        type=str,
        help='You have to enter a valid output file path')
    parser.add_argument(
        '-t',
        '--type',
        required=True,
        type=FileType.from_string,
        choices=list(FileType),
        help='You have to choose one of these => csv or xml')
    args = parser.parse_args()
    crawler = Crawler(str(args.url))
    # Schedule the crawl coroutine on the event loop and block until it finishes.
    loop = asyncio.get_event_loop()
    task = loop.create_task(crawler.crawl())
    print(f'\n{30*"*"} crawler is working {30*"*"}\n\n')
    loop.run_until_complete(task)
    loop.close()
    result = task.result()
    print(f'\n\n{30*"*"} crawling finished {30*"*"}\n\n')
    Export().print(str(args.type), result, str(args.out))
    print(f'{30*"*"} output saved to {args.out} {30*"*"}\n')
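On Python 3.7 and later the explicit task and event-loop handling above can be collapsed into asyncio.run. A shorter equivalent sketch, assuming Crawler.crawl() is a coroutine as the original code implies:

import asyncio

# Runs the coroutine to completion on a fresh event loop and closes it,
# mirroring the create_task / run_until_complete / close sequence above.
result = asyncio.run(crawler.crawl())
Export().print(str(args.type), result, str(args.out))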
Example #4
from crawl import Crawler
from house import House
from connect2db import DbConnector
from config import HEADERS
import preprocess
import traceback
from connect2db import action_type

if __name__ == "__main__":
    # Python 2 script: crawl Shanghai ('sh') listings region by region and
    # insert or update each record in MySQL.
    mysql_connector = DbConnector()
    shcrawler = Crawler(HEADERS, 'sh')
    rg_urls = shcrawler.composeurl(1, 20)
    for region, urls in rg_urls.iteritems():
        for url in urls:
            res = shcrawler.parse(shcrawler.crawl(url))
            for i in res:
                i.region = region
                preprocess.main(i)
                try:
                    action, update_fields = mysql_connector.search(i)
                    if action == action_type.insert:
                        mysql_connector.insert(i)
                    elif action == action_type.update:
                        mysql_connector.update(i, update_fields)
                    elif action == action_type.none:
                        print "already inserted and up-to-date"
                except Exception as e:
                    print e.message + i.title_url
                    traceback.print_exc()
                finally:
Example #5
db = MySqlOperator(server='127.0.0.1',
                   user_name='root',
                   password='',
                   dbname='taobao_sf')
rows = db.execute(
    'SELECT distinct(itemId) FROM taobao_sf.sf_list_itemid').fetchall()
for row in rows:
    item_id = row[0]
    print(item_id)

    url = 'https://h5api.m.taobao.com/h5/mtop.taobao.govauctionmtopcommonservice.getfrontcategory/1.0/?jsv=2.4.5&appKey=12574478&t=1570096614606&api=mtop.taobao.govauctionmtopcommonservice.getfrontcategory'
    headers = {
        'user-agent':
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Mobile Safari/537.36'
    }
    crawler = Crawler()
    res, session = crawler.crawl(url=url, headers=headers)
    cookies = res.cookies.get_dict()
    # The _m_h5_tk cookie set by this first request is the token that
    # get_sign() uses to sign the follow-up mtop detail request.
    m_h5_tk = cookies['_m_h5_tk']
    app_key = '12574478'
    data = '{"itemId":"%s"}' % item_id
    sign, t = get_sign(m_h5_tk, app_key, data)
    params = {
        'jsv': '2.4.2',
        'appKey': app_key,
        't': t,
        'sign': sign,
        'api': 'mtop.taobao.GovauctionMTopDetailService.queryHttpsItemDetail',
        'v': '2.0',
        'ecode': '0',
        'type': 'jsonp',
        'dataType': 'jsonp',
Example #6
 cate_name = cate_info['name']
 print(cate_name)
 page = 1
 while True:
     url = 'https://h5api.m.taobao.com/h5/mtop.taobao.govauctionmtopcommonservice.getfrontcategory/1.0/?jsv=2.4.5&appKey=12574478&t=1570096614606&api=mtop.taobao.govauctionmtopcommonservice.getfrontcategory'
     headers = {
         'user-agent':
         'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Mobile Safari/537.36'
     }
     crawler = Crawler()
     # Pick a random proxy from the shared PROXY_LIST pool (see Example #1).
     proxy_ip = random.choice(PROXY_LIST)
     proxies = {
         "https": 'https://' + proxy_ip,
         "http": 'http://' + proxy_ip
     }
     res, session = crawler.crawl(url=url, headers=headers, proxies=proxies)
     cookies = res.cookies.get_dict()
     m_h5_tk = cookies['_m_h5_tk']
     app_key = '12574478'
     data = '{"city":"%s","pageNo":%s,"pageSize":99,"orderId":"0","categoryId":"%s"}' % (
         city, str(page), cate_id)
     sign, t = get_sign(m_h5_tk, app_key, data)
     params = {
         'jsv': '2.4.5',
         'appKey': app_key,
         't': t,
         'sign': sign,
         'api': 'mtop.taobao.govauction.sfsearchlist',
         'v': '1.0',
         'H5Request': 'true',
         'type': 'jsonp',
Example #7
from crawl import Crawler
import json
from common import get_regex
from tools.mysql_operator import MySqlOperator

url = 'https://sf.taobao.com/item_list.htm?city=&province=%D5%E3%BD%AD'
crawler = Crawler()
res, session = crawler.crawl(url=url, encoding='gbk')
# Pull the JSON blob embedded in the page's <script id="sf-item-list-data"> tag.
raw_data = get_regex(
    r'<script id="sf-item-list-data" type="text/json">([\S\s]*?)</script>',
    res.text, 1)
jdata = json.loads(raw_data)

data_list = list()
for item in jdata['data']:
    item_info = {'id': item['id'], 'title': item['title']}
    data_list.append(item_info)

db = MySqlOperator(server='127.0.0.1',
                   user_name='root',
                   password='',
                   dbname='taobao_sf')
db.bulk_insert('test_tb', data_list)
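The get_regex helper imported from the project's common module is not shown here; judging from the call above it returns a capture group from the first match, roughly like this hypothetical stand-in:

import re

def get_regex(pattern, text, group=0):
    # Hypothetical equivalent of common.get_regex: return the requested
    # capture group of the first match, or None when nothing matches.
    match = re.search(pattern, text)
    return match.group(group) if match else None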