Example #1
def crawl(middleman_type):
    origin_url = "http://www.anjuke.com/sy-city.html"
    city_xpath = ur"//div[@class='city_list']/a/@href"
    # Fetch the list of city URLs
    page_obj = get(origin_url, use_proxy=False)
    if not page_obj:
        logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url))
        return
    city_url_list = get_xpath_content(origin_url, page_obj.text, city_xpath)
    if not city_url_list:
        logging.warning('%s: No city url!' % (middleman_type))
        return None
    # city_url_list = ["http://beijing.anjuke.com/tycoon/"]

    for city_url in city_url_list:
        logging.warning("%s: City page url, url: %s" % (middleman_type, city_url))
        city_url = city_url.rstrip("/")
        # Broker list URL
        page_url = city_url + "/tycoon/"
        while page_url:
            logging.warning("%s: Get list page url, url: %s" % (middleman_type, page_url))
            page_obj = get(page_url, use_proxy=False)
            if not page_obj:
                logging.warning('%s: Cannot get page. url: %s' % (middleman_type, page_url))
                page_url = None
                continue
            page_res_list, next_page_url = parse_page(city_url, page_obj)
            if next_page_url:
                page_url = next_page_url[0]
            else:
                page_url = None
            res = record_res(page_res_list, middleman_type)
            if not res:
                logging.error("%s: Cannot record res, url: %s" % (middleman_type, page_url))
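The while page_url loop above is the pagination pattern that recurs throughout these examples: parse_page is assumed to return the parsed records plus a list of next-page URLs, and the loop ends when no next page is found. A minimal generic sketch of that loop under the same assumptions (get, parse_page, record_res and logging as above; the helper name crawl_pages is mine, not from the source):

def crawl_pages(start_url, base_url, middleman_type):
    # Follow next-page links until parse_page stops returning one.
    page_url = start_url
    while page_url:
        page_obj = get(page_url, use_proxy=False)
        if not page_obj:
            break
        page_res_list, next_page_url = parse_page(base_url, page_obj)
        if not record_res(page_res_list, middleman_type):
            logging.error("%s: Cannot record res, url: %s" % (middleman_type, page_url))
        page_url = next_page_url[0] if next_page_url else None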
Example #2
def init_site(site_info):
    city_code = json.loads(open('init_module/baidu_city_code.txt').read())
    init_urls = []
    for city, query_word in parse_query_file():
        req_url = ("http://waimai.baidu.com/waimai?qt=poisug&wd=%s&"
                   "cb=suggestion_1442286608299&cid=%s&b=&type=0&"
                   "newmap=1&ie=utf-8&callback=jsonp11"
                   % (query_word, city_code[city]))
        resp = getpage.get(req_url, use_proxy=0)
        if not resp or not resp.text:
            continue
        start_pos = resp.text.find('{')
        end_pos = resp.text.rfind('}')
        if start_pos < 0 or end_pos < 0:
            # No JSON object inside the JSONP wrapper; bail out.
            # (The original added 1 to rfind's result before testing it,
            # which masked the -1 "not found" sentinel.)
            return init_urls
        text = resp.text[start_pos: end_pos + 1]
        resp_obj = json.loads(text)
        for data in resp_obj.get('s', []):
            data_obj = data.split('$')
            if not data_obj: continue
            try:
                address = urllib.quote(data_obj[3].encode('utf-8'))
                lat = data_obj[5].split(',')[0]
                lng = data_obj[5].split(',')[1]
                init_urls.append("http://waimai.baidu.com/mobile/waimai?qt=shoplist&address=%s&"
                                 "lat=%s&lng=%s&page=1&count=20&display=json"
                                 % (address, lat, lng))
            except Exception, e:
                print e
                continue
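Example #2 unwraps a JSONP response (a payload wrapped in a callback such as jsonp11({...})) by slicing between the first '{' and the last '}'. A standalone sketch of just that unwrapping step (the helper name strip_jsonp is mine):

import json

def strip_jsonp(body):
    # Cut the JSON object out of a callback wrapper such as 'jsonp11({...})'.
    start_pos = body.find('{')
    end_pos = body.rfind('}')
    if start_pos < 0 or end_pos < 0:
        return None
    return json.loads(body[start_pos:end_pos + 1])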
Example #3
def get_pagenum(site_info):
    try:
        page = getpage.get(site_info['url'], use_proxy=0)
        page_doc = lxml.html.document_fromstring(page.text)
        pagenum = page_doc.xpath("//input[@name='pageInfo.pageTotal']/@value")[0]
    except Exception, e:
        return None
    return pagenum
Example #4
def crawl(middleman_type):
    city_url_list = [
        "http://bj.maitian.cn/bkesf", "http://fz.maitian.cn/bkesf",
        "http://xm.maitian.cn/bkesf"
    ]
    # city_url_list = ["http://beijing.anjuke.com/tycoon/"]

    for city_url in city_url_list:
        logging.warning("%s: City page url, url: %s" %
                        (middleman_type, city_url))
        page_url = city_url
        while page_url:
            logging.warning("%s: Get list page url, url: %s" %
                            (middleman_type, page_url))
            page_obj = get(page_url, use_proxy=False)
            if not page_obj:
                logging.warning('%s: Cannot get page. url: %s' %
                                (middleman_type, page_url))
                page_url = None
                continue
            page_res_list, next_page_url = parse_page(city_url, page_obj)
            if next_page_url:
                page_url = next_page_url[0]
            else:
                page_url = None
            res = record_res(page_res_list, middleman_type)
            if not res:
                logging.error("%s: Cannot record res, url: %s" %
                              (middleman_type, page_url))
Example #5
def get_provID():
    province_url = "http://www.zto.cn/Scripts/proselect/Places.js"
    try:
        pageobj = getpage.get(url=province_url, use_proxy=False)
        # The .js body is a JS assignment ("... = <json>;"): drop the
        # trailing ';' and decode the right-hand side as JSON.
        page = json.loads(pageobj.text[:-1].split('=')[1])
    except Exception,e:
        logging.error("download province info failed, msg: %s" %e)
        return False
Example #6
def get_provID():
    province_url = "http://www.zjs.com.cn/WS_Business/GetPCAData.ashx?province=0&city=0&county=0&companyname=0"
    try:
        pageobj = getpage.get(url=province_url, use_proxy=False)
        page = json.loads(pageobj.text)
    except Exception, e:
        logging.error("download province info failed, msg: %s" % e)
        return False
Example #7
    def test_get_batch(self):
        print 'Batch testing get() w/ proxy...'
        for url in self.urls:
            print '===================================='
            print 'url:\t', url
            p = getpage.get(url)
            self.printPage(p)
            self.assertTrue(type(p) is Page or p is None)
Example #8
def get_provID():
    province_url = "http://www.yto.net.cn/cn/service/map.htm"
    try:
        pageobj = getpage.get(url=province_url, use_proxy=False)
        page = pageobj.text
    except Exception, e:
        logging.error("download province info failed, msg: %s" % e)
        return False
Example #9
def get_provID():
    province_url = "http://www.ztky.com/data/getProvinceData.aspx"
    try:
        pageobj = getpage.get(url=province_url, use_proxy=False)
        page = json.loads(pageobj.text)
    except Exception,e:
        logging.error("download province info failed, msg: %s" %e)
        return False
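Examples #5, #6 and #9 repeat the same fetch-and-decode skeleton (Example #8 is the same minus the JSON step) and differ only in the URL and in how the body is unwrapped before json.loads. A hedged consolidation of that pattern (the helper name fetch_json and the unwrap hook are mine; getpage.get, json and logging are assumed as in the snippets above):

def fetch_json(url, unwrap=None):
    try:
        pageobj = getpage.get(url=url, use_proxy=False)
        text = pageobj.text
        if unwrap:
            # e.g. strip a 'var Places = <json>;' JS assignment, as in Example #5
            text = unwrap(text)
        return json.loads(text)
    except Exception, e:
        logging.error("download province info failed, msg: %s" % e)
        return False

For instance, fetch_json(province_url, unwrap=lambda t: t[:-1].split('=')[1]) reproduces Example #5's handling of the Places.js body.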
Example #10
def get_provid():
    prov_url = "http://www.oppo.com/index.php?q=service/oppostore/p/%E5%8C%97%E4%BA%AC/c//g/1"
    try:
        page_obj = getpage.get(url=prov_url, use_proxy=False, render_js=True)
        page_doc = lxml.html.document_fromstring(page_obj.text)
        prov_ids = []
        for val in page_doc.xpath("//ul[@id='province']//a/text()"):
            prov_ids.append(val)
    except Exception, e:
        logging.error("get prov id failed, msg: %s" % e)
        return []
    return prov_ids
Example #11
def get_serviceinfo():
    prov_url = "http://support-cn.samsung.com/support/ServiceLocations.asp"
    try:
        page_obj = getpage.get(url=prov_url, use_proxy=False)
        page_doc = lxml.html.document_fromstring(page_obj.text)
        prov_ids = page_doc.xpath("//ul[@id='ulpro']//a/text()")
        categorys = page_doc.xpath(
            "//ul[@class='product-categories']//label/@value")
    except Exception, e:
        logging.error("get prov id failed, msg: %s" % e)
        return ([], [])
Example #12
def get_phone(phone_id):
    if not phone_id:
        return []
    url = 'http://www.yellowpages.com.eg/en/get-phones/%s/' % phone_id
    page = getpage.get(url, use_proxy=0)
    page_obj = lxml.html.document_fromstring(page.text)
    phone_doc = page_obj.xpath("//span[@class='detail']/text()")
    phones = []
    for each in phone_doc:
        phones += [v.strip() for v in each.split(',')]
    return phones
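Example #12 is self-contained end-to-end: given a phone_id it fetches the detail page, reads every span[@class='detail'] text node, and flattens comma-separated values into one stripped list, so a node containing '123, 456' contributes two entries.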
Example #13
def get_provid():
    url = "http://www.byf.com/b2b/list.aspx?p=017000"
    try:
        page_obj = getpage.get(url=url, use_proxy=False, render_js=True)
        page_doc = lxml.html.document_fromstring(page_obj.text)
        prov_ids = []
        for val in page_doc.xpath("//select[@id='ddlProvince']/option/@value"):
            if val:
                prov_ids.append(val)
    except Exception, e:
        logging.error("get prov id failed, msg: %s" % e)
        return []
    return prov_ids
Example #14
def init_site(site_info):
    init_urls = []
    try:
        page = getpage.get(
            'http://service.sony.com.cn/Maintenance_Station/2518.htm',
            use_proxy=0)
        page_doc = lxml.html.document_fromstring(page.text)
        for v in page_doc.xpath(
                "//select[@class='default']/option[position()>1]/@value"):
            init_urls.append('http://service.sony.com.cn' + v)
    except Exception, e:
        return []
    return init_urls
Example #15
def get_cityid():
    city_url = "http://www.wo116114.com/"
    try:
        page_obj = getpage.get(url=city_url, use_proxy=False, render_js=True)
        page_doc = lxml.html.document_fromstring(page_obj.text)
        city_ids = []
        for val in page_doc.xpath(
                "//div[@class='pro_nr02']//ul/li/a/@onclick"):
            city_id = val[val.find('\'') + 1:val.find(',') - 1]
            city_ids.append(city_id)
    except Exception, e:
        logging.error("get city id failed, msg: %s" % e)
        return []
    return city_ids
Example #16
def get_cityid(prov_id):
    city_url = "http://support-cn.samsung.com/support/ServiceLocations-ajax.asp?v=%s&act=3" % (
        urllib2.quote(prov_id.encode('utf-8')))
    try:
        page = getpage.get(url=city_url, use_proxy=False).text
        city_ids = []
        for val in page.split(","):
            start_pos = val.find('"')
            end_pos = val.find('"', start_pos + 1)
            city_ids.append(val[start_pos + 1:end_pos])
    except Exception, e:
        logging.error("get city id failed, msg: %s" % e)
        return []
    return city_ids
Example #17
def crawl(middleman_type):
    origin_url = "http://house.focus.cn/"
    city_xpath = ur"//div[@id='cityArea']/div[@class='bot']//div[@class='cityAreaBoxCen']//a/@href"
    # Fetch the list of city URLs
    page_obj = get(origin_url, use_proxy=False)
    if not page_obj:
        logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url))
        return
    city_url_list = get_xpath_content(origin_url, page_obj.text, city_xpath)
    if not city_url_list:
        logging.warning('%s: No city url!' % (middleman_type))
        return None
    # city_url_list = ["http://beijing.anjuke.com/tycoon/"]

    for city_url in city_url_list:
        logging.warning("%s: City page url, url: %s" % (middleman_type, city_url))
        # Broker list URL: reuse the city subdomain on the esf.focus.cn host
        url_list = city_url.split('.')
        start_page_url = url_list[0] + ".esf.focus.cn/agent"
        page_url = start_page_url

        while page_url:
            logging.warning("%s: Get list page url, url: %s" % (middleman_type, page_url))
            page_obj = get(page_url, use_proxy=False)
            if not page_obj:
                logging.warning('%s: Cannot get page. url: %s' % (middleman_type, page_url))
                page_url = None
                continue
            page_res_list, next_page_url = parse_page(start_page_url, page_obj)
            # print 'next', next_page_url
            if next_page_url:
                page_url = next_page_url[0]
            else:
                page_url = None
            res = record_res(page_res_list, middleman_type)
            if not res:
                logging.error("%s: Cannot record res, url: %s" % (middleman_type, page_url))
Example #18
    def test_quick(self):
        url = 'http://www.cnbeta.com/articles/372623.htm'
        print 'Quick test'
        print '===================================='
        print 'PX: N, FP: N, JS: N'
        p = getpage.get(url, use_proxy=False, fpfirst=False, render_js=False)
        self.printPage(p)
        print '===================================='
        print 'PX: N, FP: Y, JS: N'
        p = getpage.get(url, use_proxy=False, fpfirst=True, render_js=False)
        self.printPage(p)
        print '===================================='
        print 'PX: N, FP: Y, JS: Y'
        p = getpage.get(url, use_proxy=False, fpfirst=True, render_js=True)
        self.printPage(p)

        print '===================================='
        print 'PX: Y, FP: N, JS: N'
        p = getpage.get(url, use_proxy=True, fpfirst=False, render_js=False)
        self.printPage(p)
        print '===================================='
        print 'PX: Y, FP: Y, JS: Y'
        p = getpage.get(url, use_proxy=True, fpfirst=True, render_js=True)
        self.printPage(p)
Example #19
    def test_get(self):
        print '===================================='
        print '===================================='
        print '===================================='
        print 'Testing get()...'
        print '===================================='
        url = 'http://www.cnbeta.com/articles/372623.htm'
        print '===================================='
        p = getpage.get(url,
                        retry=1,
                        use_proxy=False,
                        fpfirst=False,
                        render_js=False)
        self.printPage(p)
        print '===================================='
        p = getpage.get(url,
                        retry=1,
                        use_proxy=False,
                        fpfirst=True,
                        render_js=False)
        self.printPage(p)
        print '===================================='
        p = getpage.get(url,
                        retry=1,
                        use_proxy=False,
                        fpfirst=True,
                        render_js=True)
        self.printPage(p)
        print '===================================='
        p = getpage.get(url,
                        retry=2,
                        use_proxy=True,
                        fpfirst=False,
                        render_js=False)
        self.printPage(p)
        print '===================================='
        p = getpage.get(url,
                        retry=2,
                        use_proxy=True,
                        fpfirst=True,
                        render_js=False)
        self.printPage(p)
        print '===================================='
        p = getpage.get(url,
                        retry=2,
                        use_proxy=True,
                        fpfirst=True,
                        render_js=True)
        self.printPage(p)
        print '===================================='
Example #20
def crawl(middleman_type):

    origin_url = "http://bj.5i5j.com/"
    city_xpath = "//div[@class='new_city_more']//a/@href"
    # Fetch the list of city URLs
    time.sleep(2)
    origin_page_obj = get(origin_url, use_proxy=False)
    if not origin_page_obj:
        logging.warning('%s: Cannot get page. url: %s' %
                        (middleman_type, origin_url))
        return
    city_url_list = get_xpath_content(origin_url, origin_page_obj.text,
                                      city_xpath)
    if not city_url_list:
        logging.warning('%s: No city url.' % (middleman_type))
        return None

    # city_url_list = ["http://bj.5i5j.com/"]

    for city_url in city_url_list:

        logging.warning("%s: City page url, url: %s" %
                        (middleman_type, city_url))
        city_url = city_url.rstrip("/")
        city_broker_url = city_url + "/broker"

        logging.warning("%s: Get city page url, url: %s" %
                        (middleman_type, city_broker_url))
        time.sleep(2)
        city_broker_page_obj = get(city_broker_url, use_proxy=False)
        if not city_broker_page_obj:
            logging.warning('%s: Cannot get page. url: %s' %
                            (middleman_type, city_broker_url))
            continue

        if "tj.5i5j" in city_url:
            area_xpath = ur"//ul[@class='search-quyu']/li[1]/a[position()>1]/@href"
            detail_xpath = ur"//li[@class='addressli']/div[@class='shquan quanm']/span/a/@href"
        else:
            area_xpath = ur"//li[@class='quyu_gao']//a[position()>1]/@href"
            detail_xpath = ur"//div[@class='keywords01']/a/@href"

        area_url_list = get_xpath_content(city_url, city_broker_page_obj.text,
                                          area_xpath)
        if not area_url_list:
            logging.warning('%s: No area broker url, info: %s' %
                            (middleman_type, city_broker_url))
            continue

        # Fetch the URL list for each specific area
        # area_url_list = ["http://bj.5i5j.com/broker/haidian/"]
        for area_url in area_url_list:
            logging.warning("%s: Get area page url, url: %s" %
                            (middleman_type, area_url))
            time.sleep(2)
            area_page_obj = get(area_url, use_proxy=False)
            if not area_page_obj:
                logging.warning('%s: Cannot get page. url: %s' %
                                (middleman_type, area_url))
                continue
            detail_address_broker_list = get_xpath_content(
                city_url, area_page_obj.text, detail_xpath)
            if not detail_address_broker_list:
                logging.warning('%s: No detail address broker url, info: %s' %
                                (middleman_type, area_url))
                continue

            # Record the results
            for detail_address_url in detail_address_broker_list:
                #print 'detail_url', detail_address_url
                while detail_address_url:
                    logging.warning("%s: Get list page url, url: %s" %
                                    (middleman_type, detail_address_url))
                    time.sleep(2)
                    detail_page_obj = get(detail_address_url, use_proxy=False)
                    if not detail_page_obj:
                        logging.warning('%s: Cannot get page. url: %s' %
                                        (middleman_type, detail_address_url))
                        detail_address_url = None
                        continue
                    page_res_list, next_page_url = parse_page(
                        city_url, detail_page_obj)
                    if next_page_url:
                        detail_address_url = next_page_url[0]
                    else:
                        detail_address_url = None
                    #print 'next', detail_address_url
                    res = record_res(page_res_list, middleman_type)
                    if not res:
                        logging.error("%s: Cannot record res, url: %s" %
                                      (middleman_type, detail_address_url))
Example #21
def crawl(middleman_type):
    origin_url = "http://shijiazhuang.tuitui99.com/"
    city_xpath = ur"//div[@class='city_more']//a/@href"
    # Fetch the list of city URLs
    page_obj = get(origin_url, use_proxy=False)
    if not page_obj:
        logging.warning('%s: Cannot get page. url: %s' %
                        (middleman_type, origin_url))
        return
    city_url_list = get_xpath_content(origin_url, page_obj.text, city_xpath)
    if not city_url_list:
        logging.warning('%s: No city url!' % (middleman_type))
        return None
    # city_url_list = ["http://beijing.anjuke.com/tycoon/"]

    for city_url in city_url_list:
        logging.warning("%s: City page url, url: %s" %
                        (middleman_type, city_url))
        city_url = city_url.rstrip("/")
        city_broker_url = city_url + "/broker"

        logging.warning("%s: Get city page url, url: %s" %
                        (middleman_type, city_broker_url))
        time.sleep(2)
        city_broker_page_obj = get(city_broker_url, use_proxy=False)
        if not city_broker_page_obj:
            logging.warning('%s: Cannot get page. url: %s' %
                            (middleman_type, city_broker_url))
            continue
        area_xpath = ur"//dl[@class='clearfix']/dd/a[position()>1]/@href"
        detail_xpath = ur"//dd[@class='sub_area']/a[position()>1]/@href"

        area_url_list = get_xpath_content(city_url, city_broker_page_obj.text,
                                          area_xpath)
        if not area_url_list:
            logging.warning('%s: No area broker url, info: %s' %
                            (middleman_type, city_broker_url))
            continue

        # Fetch the URL list for each specific area
        # area_url_list = ["http://bj.5i5j.com/broker/haidian/"]
        for area_url in area_url_list:
            logging.warning("%s: Get area page url, url: %s" %
                            (middleman_type, area_url))
            time.sleep(2)
            area_page_obj = get(area_url, use_proxy=False)
            if not area_page_obj:
                logging.warning('%s: Cannot get page. url: %s' %
                                (middleman_type, area_url))
                continue
            detail_address_broker_list = get_xpath_content(
                city_url, area_page_obj.text, detail_xpath)
            if not detail_address_broker_list:
                logging.warning('%s: No detail address broker url, info: %s' %
                                (middleman_type, area_url))
                continue

            # Record the results
            for detail_address_url in detail_address_broker_list:
                # print 'detail_url', detail_address_url
                first_detail_address_url = detail_address_url
                while detail_address_url:
                    logging.warning("%s: Get list page url, url: %s" %
                                    (middleman_type, detail_address_url))
                    time.sleep(2)
                    detail_page_obj = get(detail_address_url, use_proxy=False)
                    if not detail_page_obj:
                        logging.warning('%s: Cannot get page. url: %s' %
                                        (middleman_type, detail_address_url))
                        detail_address_url = None
                        continue
                    page_res_list, next_page_url = parse_page(
                        city_url, detail_page_obj, first_detail_address_url)
                    if next_page_url:
                        detail_address_url = next_page_url[0]
                    else:
                        detail_address_url = None
                    # print 'next', detail_address_url
                    res = record_res(page_res_list, middleman_type)
                    if not res:
                        logging.error("%s: Cannot record res, url: %s" %
                                      (middleman_type, detail_address_url))
Example #22
if len(sys.argv) < 3:
    print "Usage: python %s url task_type" % sys.argv[0]
    exit()

url = sys.argv[1]
task_type = sys.argv[2]

task_obj = task_mongo.Task()

task_conf = task_obj.get_task_conf(task_type)['task_conf']
if not task_conf:
    exit()

print url
page_obj = getpage.get(url, use_proxy=False)
if not page_obj:
    print "get page failed"
    exit()

tasks, items = conf_parser.parse(page_obj.text, url, task_conf, page_obj.encoding)

print "task num: %d" % len(tasks)
for task_type, url in tasks:
    print task_type, url
print "------------------------------------------------------------------------"

print "item num: %d" % len(items)
for item in items:
    item_str = json.dumps(item, ensure_ascii=False).encode("utf-8")
    print item_str
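A hedged invocation of the debugging script above (the script filename and the task type here are placeholders, not from the source):

    python parse_debug.py http://www.cnbeta.com/articles/372623.htm list_page

It prints the fetched URL, each follow-up task produced by the task conf, and finally every extracted item as UTF-8-encoded JSON.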
Example #23
def crawl(middleman_type):
    origin_url = "http://fang.com/SoufunFamily.htm"
    city_xpath = "//div[@class='letterSelt']/div[@id='c01']//a/@href"
    # Fetch the list of city URLs
    time.sleep(2)
    origin_page_obj = get(origin_url, use_proxy=False)
    if not origin_page_obj:
        logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url))
        return
    city_url_list = get_xpath_content(origin_url, origin_page_obj.text, city_xpath)
    if not city_url_list:
        logging.warning('%s: No city url.' % (middleman_type))
        return None

    #city_url_list = ["http://bj.fang.com/"]
    area_xpath = ur"//div[@class='qxName']/a[position()>1]/@href"
    detail_xpath = ur"//p[@id='shangQuancontain']/a[position()>1]/@href"

    for city_url in city_url_list:
        # print 'city',city_url
        logging.warning("%s: City page url, url: %s" % (middleman_type, city_url))
        if city_url == "http://bj.fang.com/":
            city_broker_url = "http://esf.fang.com"
        else:
            re_pattern = ur"^http://(\w+)\.fang\.com/$"
            m = re.search(re_pattern, city_url)
            if m:
                city_abbr = m.group(1)
                city_broker_url = "http://esf." + city_abbr + ".fang.com"

            else:
                continue
        city_broker_url_first = city_broker_url + '/agenthome/'
        logging.warning("%s: Get city page url, url: %s" % (middleman_type, city_broker_url_first))
        time.sleep(2)
        city_broker_page_obj = get(city_broker_url_first, use_proxy=False)
        if not city_broker_page_obj:
            logging.warning('%s: Cannot get page. url: %s' % (middleman_type, city_broker_url_first))
            continue
        area_url_list = get_xpath_content(city_broker_url, city_broker_page_obj.text, area_xpath)
        if not area_url_list:
            logging.warning('%s: No area broker url, info: %s' % (middleman_type, city_broker_url_first))
            continue

        # Fetch the URL list for each specific area
        # area_url_list = ["http://esf.fang.com/agenthome-a03/-i31-j310/"]
        for area_url in area_url_list:
            # print 'area_url', area_url
            logging.warning("%s: Get area page url, url: %s" % (middleman_type, area_url))
            time.sleep(2)
            area_page_obj = get(area_url, use_proxy=False)
            if not area_page_obj:
                logging.warning('%s: Cannot get page. url: %s' % (middleman_type, area_url))
                continue
            detail_address_broker_list = get_xpath_content(city_broker_url, area_page_obj.text, detail_xpath)
            if not detail_address_broker_list:
                logging.warning('%s: No detail address broker url, info: %s' % (middleman_type, area_url))
                continue

            # Record the results
            # detail_address_broker_list = ['http://esf.fang.com/agenthome-a03-b012384/-i31-j310/']
            for detail_address_url in detail_address_broker_list:
                # print 'detail_url', detail_address_url
                while detail_address_url:
                    logging.warning("%s: Get list page url, url: %s" % (middleman_type, detail_address_url))
                    time.sleep(2)
                    detail_page_obj = get(detail_address_url, use_proxy=False)
                    if not detail_page_obj:
                        logging.warning('%s: Cannot get page. url: %s' % (middleman_type, detail_address_url))
                        detail_address_url = None
                        continue
                    page_res_list, next_page_url = parse_page(city_broker_url, detail_page_obj)
                    if next_page_url:
                        detail_address_url = next_page_url[0]
                    else:
                        detail_address_url = None
                    # print 'next', detail_address_url
                    res = record_res(page_res_list, middleman_type)
                    if not res:
                        logging.error("%s: Cannot record res, url: %s" % (middleman_type, detail_address_url))
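Example #23 derives each city's broker host from its homepage URL: Beijing maps to esf.fang.com, and any other city has its subdomain pulled out by regex and re-inserted as esf.<city>.fang.com. A minimal standalone sketch of that rewrite (the function name broker_host is mine):

import re

def broker_host(city_url):
    # 'http://sh.fang.com/' -> 'http://esf.sh.fang.com'; Beijing is special-cased.
    if city_url == "http://bj.fang.com/":
        return "http://esf.fang.com"
    m = re.search(r"^http://(\w+)\.fang\.com/$", city_url)
    if not m:
        return None
    return "http://esf.%s.fang.com" % m.group(1)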