Example #1
import random
import yaml
from lib.log import LogHandler
from lib.mongo import Mongo
from lib.rabbitmq import Rabbit
import json

m = Mongo('192.168.0.235')
connect = m.connect

setting = yaml.safe_load(open('config.yaml'))
db_name = setting['CEIC']['mongo']['db']

State_indicators_name = setting['CEIC']['mongo']['State_indicators']
State_indicators_details_name = setting['CEIC']['mongo'][
    'State_indicators_details']
log = LogHandler('ceic_detail')


def create_date(
    indexFrequency,
    start_year,
    start_month,
    end_year,
):
    """

    :return: ['from=2016-1&to=2017-1', 'from=2016-1&to=2017-1', 'from=2016-1&to=2017-1', 'from=2016-1&to=2017-1',]
    """
    """
    根据开始时间分割年月日
    """
Example #2
import yaml
import requests
from lib.log import LogHandler
from lib.mongo import Mongo
from lxml import etree
from sql_mysql import inquire, TypeAuction
from auction import Auction
import re
import datetime

setting = yaml.safe_load(open('config.yaml'))
client = Mongo(host=setting['mongo']['host'], port=setting['mongo']['port']).connect
coll = client[setting['mongo']['db']][setting['mongo']['collection']]

source = 'jiapai'
log = LogHandler(__name__)


class Jiapai:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.117 Safari/537.36'
        }
        self.list_info = []
        self.type_list = inquire(TypeAuction, source)

    def start_crawler(self):
        for type_ in self.type_list:
            html_type = type_.html_type
            auction_type = type_.auction_type
            url = 'http://www.jiapai.net.cn/index.php/Judicial/index/px/' + type_.code
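            # --- Hypothetical continuation (not part of the original excerpt):
            # fetch the listing page for this auction type; the XPath is an
            # assumption about the page layout.
            res = requests.get(url, headers=self.headers)
            html = etree.HTML(res.text)
            detail_urls = html.xpath('//a/@href')
            # hand each detail URL to the detail parser (omitted)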
Example #3
CO_INDEX : 101
author: 程纪文
"""
from crawler_base import Crawler
from comm_info import Comm, Building, House
from get_page_num import AllListUrl
import re, requests
from lxml import etree
import random
import time
from lib.log import LogHandler

co_index = '101'
city = '保定'

log = LogHandler('baoding_101')

class Baoding(Crawler):
    def __init__(self):
        self.start_url = 'http://www.bdfdc.net/loadAllProjects.jspx'
        self.headers = {
            'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        }
    def start_crawler(self):
        b = AllListUrl(first_page_url=self.start_url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='utf-8',
                       page_count_rule=r'共(\d+)页',
                       )
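        # --- Hypothetical continuation (not part of the original excerpt): an
        # equivalent sketch that skips the internal AllListUrl helper and reads
        # the page count from the first page with the same regex.
        first = requests.get(self.start_url, headers=self.headers)
        first.encoding = 'utf-8'
        page_count = re.search(r'共(\d+)页', first.text).group(1)
        for page in range(1, int(page_count) + 1):
            pass  # fetch and parse each listing page here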
Example #4
city :  武汉
CO_INDEX : 78
author: 程纪文
"""
from backup.crawler_base import Crawler
from backup.comm_info import Comm, Building, House
import re
from lxml import etree
from urllib import parse
import time
from lib.log import LogHandler
from backup.proxy_connection import Proxy_contact

city = '武汉'
co_index = '78'
log = LogHandler('wuhan_78')


class Wuhan(Crawler):
    def __init__(self):
        self.start_url = 'http://scxx.fgj.wuhan.gov.cn/xmqk.asp'
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        }

    def start_crawler(self):
        proxy = Proxy_contact(app_name='wuhan',
                              method='get',
                              url=self.start_url,
                              headers=self.headers)
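        # --- Hypothetical continuation (not part of the original excerpt):
        # Proxy_contact is an internal helper; judging from the commented-out
        # usage in a later example it is driven via .contact(). Its return
        # type (assumed to be a requests-style response) is also an assumption.
        response = proxy.contact()
        html = etree.HTML(response.text)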
"""
url = http://www.hyfc365.com/RealEstate/RealtyProject/Search.aspx
city :  衡阳
CO_INDEX : 181
author: 程纪文
"""
from backup.crawler_base import Crawler
from backup.comm_info import Building, House
import re, requests
from lxml import etree
from lib.log import LogHandler

co_index = '181'
city_name = '衡阳'
log = LogHandler('衡阳')


class Hengyang(Crawler):
    def __init__(self):
        self.start_url = 'http://www.hyfc365.com/RealEstate/RealtyProject/Search.aspx'
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        }

    def start_crawler(self):
        viewstate = "/wEPDwUKLTM2MzMxMTM1Nw8WBB4PSGlkZUNvbnRleHRNZW51CymEAXprU3VwZXJNYXAuV2ViLlVJLnprU3VwZXJNYXBQYWdlU3R5bGUsIHprU3VwZXJNYXAuQ29tbW9uTGlicmFyeSwgVmVyc2lvbj0xLjEuNTAwLjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49NzJkNzZkMzJkOGNiYTIyZgIeD0hpZGVTZWxlY3RTdGFydAsrBAIWAgIBD2QWCgIDD2QWAmYPDxYEHghDc3NDbGFzcwUQY3NzQm94VGl0bGVUaHJlZR4EXyFTQgICZBYCAgEPDxYGHgRUZXh0BRLlvIDlj5HkvIHkuJrmn6Xor6IeC05hdmlnYXRlVXJsBSQvUmVhbEVzdGF0ZS9SZWFsdHlEZWFsZXIvU2VhcmNoLmFzcHgeBlRhcmdldGUWAh4MVGV4dENoYW5naW5nBQRUcnVlZAIFD2QWAmYPDxYEHwIFFGNzc0JveFRpdGxlVGhyZWVPdmVyHwMCAmQWAgIBDw8WBh8EBRTmpbznm5go6aG555uuKeafpeivoh8FZR8GZRYCHwcFBFRydWVkAgcPZBYCZg8PFgQfAgUQY3NzQm94VGl0bGVUaHJlZR8DAgJkFgICAQ8PFgYfBAUUKOe9keS4iinmiL/mupDmn6Xor6IfBQUqL1JlYWxFc3RhdGUvUmVhbHR5U2VhcmNoL1NlYXJjaF9Ib3VzZS5hc3B4HwZlFgIfBwUEVHJ1ZWQCCQ9kFgJmDw8WBB8CBRBjc3NCb3hUaXRsZVRocmVlHwMCAmQWAgIBDw8WBh8EBRLlkIjlkIzlpIfmoYjmn6Xor6IfBQUsL1JlYWxFc3RhdGUvUmVhbHR5U2VhcmNoL1NlYXJjaF9SZWNvcmRzLmFzcHgfBmUWAh8HBQRUcnVlZAITDzwrAAsAZBgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WAQUNQ3VzdG9tUGFnaW5nMbpNuvQVuP+DYqCe1+wbVab+715lNR+eC+hDFTSfvE0y"
        valid = "/wEWAwKHpppsAqi0zakHArrY8x1xs+nwBroCH5+KiDI9tW1jyttusdquHQRtH5UPs6GOzg=="
        data = {
            "CustomPaging1_CurrentPageIndex": -1,
            "__VIEWSTATE": viewstate,
Example #6
import requests
from lxml import etree
from lib.proxy_iterator import Proxies
from pymongo import MongoClient
import re
import aiohttp
import asyncio
from lib.log import LogHandler
import time
import pika
import json
import threading

log = LogHandler('xian')
p = Proxies()
p = p.get_one(proxies_number=7)
# p = {'http': 'http://*****:*****@zproxy.lum-superproxy.io:22225'}

m = MongoClient(host='114.80.150.196',
                port=27777,
                username='******',
                password='******')
crawler_collection = m['hilder_gv']['xian']


class XiAn:
    def __init__(self):
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
        }
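    # --- Hypothetical addition (not part of the original excerpt): the
    # aiohttp / asyncio imports above suggest pages are fetched
    # asynchronously; proxy and error handling are omitted in this sketch.
    async def fetch(self, url):
        async with aiohttp.ClientSession(headers=self.headers) as session:
            async with session.get(url) as resp:
                return await resp.text()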
Example #7
import requests
from lxml import etree
from lib.proxy_iterator import Proxies
from pymongo import MongoClient
import re
import aiohttp
import asyncio
from lib.log import LogHandler
import time
import pika
import json
log = LogHandler('loupan')
p = Proxies()
p = p.get_one(proxies_number=7)

m = MongoClient(host='114.80.150.196', port=27777, username='******', password='******')
crawler_collection = m['fangjia']['district_complete']


class LouPanConsumer:

    def __init__(self):
        self.headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
        self.connection = pika.BlockingConnection(pika.ConnectionParameters(host='114.80.150.196', port=5673, heartbeat=0))
        self.channel = self.connection.channel()
        self.channel.queue_declare(queue='loupan')

    def final_parse(self, data):
        url = data['url']
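    # --- Hypothetical addition (not part of the original excerpt): one way
    # the 'loupan' queue could be wired to final_parse, using the pika 1.x
    # basic_consume callback signature.
    def start_consume(self):
        def callback(ch, method, properties, body):
            self.final_parse(json.loads(body))
            ch.basic_ack(delivery_tag=method.delivery_tag)

        self.channel.basic_consume(queue='loupan', on_message_callback=callback)
        self.channel.start_consuming()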
Example #8
"""
    Consume the xiaozijia_num queue, fetch each item, and write it into the community collection.
    Takes roughly one hour.
"""

from lib.log import LogHandler
from lib.mongo import Mongo
from lib.rabbitmq import Rabbit
import requests
import yaml
import json

log = LogHandler('小资家_comm')

setting = yaml.safe_load(open('config.yaml'))

# mongo
m = Mongo(setting['xiaozijia']['mongo']['host'],
          setting['xiaozijia']['mongo']['port'],
          user_name=setting['xiaozijia']['mongo']['user_name'],
          password=setting['xiaozijia']['mongo']['password'])
coll_comm = m.connect[setting['xiaozijia']['mongo']['db']][
    setting['xiaozijia']['mongo']['comm_coll']]

# rabbit
r = Rabbit(setting['xiaozijia']['rabbit']['host'],
           setting['xiaozijia']['rabbit']['port'])
channel = r.get_channel()
queue = setting['xiaozijia']['rabbit']['queue']['xiaozijia_num']
build_queue = setting['xiaozijia']['rabbit']['queue']['xiaozijia_build']
channel.queue_declare(queue=queue)
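# --- Hypothetical continuation (not part of the original excerpt): assuming
# the Rabbit helper returns a pika-style channel, each message would be
# fetched and the resulting community written into coll_comm. The 'url' field
# name and the parsing are placeholders.
def handle_message(ch, method, properties, body):
    msg = json.loads(body)
    res = requests.get(msg['url'], timeout=30)
    coll_comm.insert_one({'source': 'xiaozijia', 'raw': res.text})
    ch.basic_ack(delivery_tag=method.delivery_tag)

channel.basic_consume(queue=queue, on_message_callback=handle_message)
channel.start_consuming()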
Example #9
from deal_price_info import Comm
import requests
import re
from lxml import etree
import random
from lib.log import LogHandler
import time, datetime
import json

log = LogHandler('centaline')
source = '中原地产'


class Centaline:
    def __init__(self):

        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
        }
        self.start_url = 'http://www.centaline.com.cn/'

    def start_crawler(self):
        res = requests.get(self.start_url, headers=self.headers)
        res.encoding = 'gbk'
        second_city_list = re.findall(r'http://\w+\.centanet\.com/ershoufang/',
                                      res.text, re.S | re.M)
        for city in second_city_list:
            city_comm = city.replace('ershoufang', 'xiaoqu')
            city_res = requests.get(city_comm, headers=self.headers)
            city_res.encoding = 'gbk'
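            # --- Hypothetical continuation (not part of the original excerpt):
            # pull the community (xiaoqu) links out of the city page; the regex
            # is an assumption about the page structure.
            comm_urls = re.findall(r'href="(http://\w+\.centanet\.com/xiaoqu/[^"]+)"',
                                   city_res.text, re.S | re.M)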
Example #10
import requests
import re
from deal_price_info import Comm
import time, datetime
from lib.log import LogHandler

log = LogHandler('链家在线')
url = 'https://sh.lianjia.com/'


class Lianjiazaixian():
    def __init__(self):
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        }

    def start_crawler(self):
        response = requests.get(url, headers=self.headers)
        html = response.text
        city_list_html = re.search('city-tab".*?</div></div></div>', html,
                                   re.S | re.M).group()
        city_a_html_list = re.findall('<a.*?</a>', city_list_html, re.S | re.M)
        city_dict = {}
        for i in city_a_html_list:
            city = re.search('<a.*?>(.*?)<', i, re.S | re.M).group(1)
            city_url = re.search('href="(.*?)"', i, re.S | re.M).group(1)
            if 'you' not in city_url and 'fang' not in city_url:
                city_dict[city] = city_url
        self.get_city_info(city_dict)
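    # --- Hypothetical sketch (not part of the original excerpt): get_city_info
    # is called above but cut off; it presumably walks each city's xiaoqu
    # listing. The URL pattern below is an assumption.
    def get_city_info(self, city_dict):
        for city_url in city_dict.values():
            res = requests.get(city_url.rstrip('/') + '/xiaoqu/', headers=self.headers)
            # per-city parsing of res.text omitted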
Example #11
import requests
import re
from deal_price_info import Comm
import time, datetime
from lib.log import LogHandler

url = 'http://sh.koofang.com/xiaoqu/pg1'

log = LogHandler('上海酷房网')


class Kufangwang():
    def __init__(self):
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
        }

    def start_crawler(self):
        self.get_comm_info(url)
        self.get_all_comm_url(url)

    def get_comm_info(self, page_url):
        response = requests.get(page_url, headers=self.headers)
        html = response.text
        comm_info_html_list = re.findall('<div class="avail_conr">.*?</li>',
                                         html, re.S | re.M)
        for i in comm_info_html_list:
            comm = Comm('上海酷房网')
            comm.city = '上海'
            comm.district_name = re.search('class="avail_cont".*?>(.*?)<', i,
                                           re.S | re.M).group(1)
Example #12
# from deal_price_info import Comm
from BaseClass import Base
import requests
import re
from lxml import etree
import time
import datetime
from lib.log import LogHandler
from lib.proxy_iterator import Proxies
p = Proxies()
source = '房途网'
log = LogHandler('房途网')


class Fangtu(object):
    def __init__(self, proxies):
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
        }
        self.start_url = 'http://hangzhou.fangtoo.com/building/'
        self.proxies = proxies

    def start_crawler(self):
        url = 'http://hangzhou.fangtoo.com/building/cp1/'
        res = requests.get(url=url, headers=self.headers, proxies=self.proxies)
        num = re.search(r'pagecount:(\d+),', res.text, re.S | re.M).group(1)

        for i in range(1, int(num) + 1):
            url = self.start_url + "cp" + str(i) + "/"
            try:
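                # --- Hypothetical continuation (not part of the original
                # excerpt): fetch each listing page through the proxy; the
                # except clause simply logs the error and moves on.
                res = requests.get(url, headers=self.headers, proxies=self.proxies)
                html = etree.HTML(res.text)
            except Exception as e:
                log.error(e)
                continue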
Example #13
#     ret, info = bucket_manager.fetch(url,bucket,filename)
#     if info.status_code == 200:
#
#         file_url = bucket_domain + "/" + filename
#         print(file_url)
#         return file_url
#     else:
#         print("{}抓取失败".format(url))
"""
    Image crawling
"""

from retry import retry
from lib.log import LogHandler
from lib.proxy_iterator import Proxies

proxy = Proxies()

bucket = 'fangjia-img'
log = LogHandler("qiniu")


@retry(delay=2)
def qiniufetch(url, file_name):
    headers = {"user_agent":
                   "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
               }
    if 'http' in url:
        """
            Use the proxy pool
        """
        # image_download = Proxy_contact(app_name='qiniufetch', method='get', url=url, headers=headers)
        # con = image_download.contact()
        # while True:
        #     try:
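# --- Hypothetical sketch (not part of the original excerpt): how the fetch
# into Qiniu storage might look, mirroring the commented-out bucket_manager
# code at the top of this example. The keys and bucket domain are placeholders.
from qiniu import Auth, BucketManager

bucket_domain = 'http://example-bucket.qiniudn.com'
bucket_manager = BucketManager(Auth('<access_key>', '<secret_key>'))


def fetch_to_qiniu(url, filename):
    ret, info = bucket_manager.fetch(url, bucket, filename)
    if info.status_code == 200:
        return bucket_domain + "/" + filename
    print("{} fetch failed".format(url))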
Example #14
city :  韶关
CO_INDEX : 194
author: 程纪文
"""

from crawler_base import Crawler
from comm_info import Comm, Building, House
from get_page_num import AllListUrl
from producer import ProducerListUrl
import re, requests
from lxml import etree
from lib.log import LogHandler

co_index = '194'
city_name = '韶关'
log = LogHandler('韶关')


class Shaoguan(Crawler):
    def __init__(self):
        self.start_url = 'http://61.143.241.154/user_kfs.aspx'
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
            'Referer': 'http://61.143.241.154/user_itemlist.aspx'
        }
        self.proxies = [
            {
                "http": "http://192.168.0.96:3234"
            },
            {