예제 #1
0
def get_rent_perregion(city, district):
    """Scrape the rental listings of one district and upsert them into Rentinfo.

    Fetches ``http://<city>.lianjia.com/zufang/<district>/``, asks
    ``misc.get_total_pages`` how many result pages exist and walks pages
    ``1..total_pages``.  Every ``div`` below ``div.content__list`` is tried
    as a listing: its detail page is resolved through ``get_detail_info`` and
    the result is tagged with the region and zone read from the list entry.
    The rows of one page are written inside a single transaction.

    Args:
        city: Sub-domain prefix interpolated into the base URL.
        district: District slug appended to the ``zufang/`` path.

    Returns:
        None.  Returns early, without scraping, when ``check_block(soup)`` is
        truthy for the first page.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns None.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"zufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    print(total_pages)
    if total_pages is None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    # Page numbers are 1-based: the un-suffixed URL fetched above is page 1,
    # so follow-up requests start at pg2 and run up to pg<total_pages>.
    for page in range(1, total_pages + 1):
        if page > 1:
            url_page = baseUrl + u"zufang/%s/pg%d/" % (district, page)
            print(url_page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        log_progress("GetRentByRegionlist", district, page, total_pages)
        data_source = []

        for ultag in soup.findAll("div", {"class": "content__list"}):
            # Every nested div is attempted; the ones that are not listing
            # entries fail one of the lookups below and are skipped.
            for item in ultag.find_all('div'):
                try:
                    housetitle = item.find(
                        "p", {"class": "content__list--item--title"})
                    regionZone = item.find(
                        "p", {"class": "content__list--item--des"})
                    region = regionZone.a.get_text().strip()
                    # The zone is the second node after the region link.
                    zone = regionZone.a.next_sibling.next_sibling.get_text(
                    ).strip()

                    info_dict = get_detail_info(city, housetitle.a.get("href"))
                    info_dict.update({u'region': region})
                    info_dict.update({u'zone': zone})
                except Exception:
                    # Best effort: report the failure and move on, but let
                    # KeyboardInterrupt / SystemExit propagate.
                    print('traceback.format_exc():\n%s' %
                          traceback.format_exc())
                    continue
                data_source.append(info_dict)

        # One transaction per page; skipped entirely when nothing was parsed.
        with model.database.atomic():
            if data_source:
                model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
예제 #2
0
def get_community_perregion(regionname=u'xicheng'):
    """Scrape the community (xiaoqu) list of one region into Community.

    Fetches ``BASE_URL + "xiaoqu/<regionname>/"``, walks every result page
    and upserts one ``model.Community`` row per ``li.clear`` entry that can
    be parsed completely.

    Args:
        regionname: Region slug appended to the ``xiaoqu/`` path.

    Returns:
        None.  Returns early, without scraping, when ``check_block(soup)`` is
        truthy for the first page.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns None.
    """
    url = BASE_URL + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages is None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # ``page`` is 0-based, the pgN suffix is 1-based.
            url_page = BASE_URL + u"xiaoqu/" + regionname + "/pg%d/" % (
                page + 1, )
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        for name in soup.findAll("li", {"class": "clear"}):
            info_dict = {}
            try:
                communitytitle = name.find("div", {"class": "title"})
                info_dict[u'title'] = communitytitle.get_text().strip('\n')
                info_dict[u'link'] = communitytitle.a.get('href')

                district = name.find("a", {"class": "district"})
                info_dict[u'district'] = district.get_text()

                bizcircle = name.find("a", {"class": "bizcircle"})
                info_dict[u'bizcircle'] = bizcircle.get_text()

                tagList = name.find("div", {"class": "tagList"})
                info_dict[u'tagList'] = tagList.get_text().strip('\n')

                onsale = name.find("a", {"class": "totalSellCount"})
                info_dict[u'onsale'] = onsale.span.get_text().strip('\n')

                info_dict[u'id'] = name.get('data-housecode')
            except Exception:
                # Entries missing any expected element are skipped; unlike a
                # bare ``except`` this still lets Ctrl-C stop the scraper.
                continue

            model.Community.insert(**info_dict).upsert().execute()

            # Pause after every stored row (not once per page).
            time.sleep(1)
예제 #3
0
def get_house_perregion(city, district):
    """Scrape the second-hand house listings of one district.

    Fetches ``http://<city>.lianjia.com/ershoufang/<district>/``, walks pages
    ``1..total_pages`` and, for each page, upserts the parsed listings into
    ``model.Houseinfo`` and their ``houseID``/``totalPrice`` pairs into
    ``model.Hisprice`` inside one transaction.

    Args:
        city: Sub-domain prefix interpolated into the base URL.
        district: District slug appended to the ``ershoufang/`` path.

    Returns:
        None.  Returns early when ``check_block(soup)`` is truthy for the
        first page.  No exception derived from ``Exception`` escapes: any
        failure (including the RuntimeError raised when
        ``misc.get_total_pages`` returns None) is logged and ends the run.
    """
    # Columns filled, in order, from the '|'-separated houseInfo text; a
    # missing trailing segment is stored as "".
    houseinfo_keys = (u'housetype', u'square', u'direction', u'decoration',
                      u'floor', u'years', u'towertype')
    try:
        baseUrl = u"http://%s.lianjia.com/" % (city)
        url = baseUrl + u"ershoufang/%s/" % district
        source_code = misc.get_source_code(url)
        soup = BeautifulSoup(source_code, 'lxml')
        if check_block(soup):
            return
        total_pages = misc.get_total_pages(url)
        if total_pages is None:
            row = model.Houseinfo.select().count()
            raise RuntimeError("Finish at %s because total_pages is None" % row)

        # Page numbers are 1-based: the un-suffixed URL fetched above is
        # page 1, so follow-up requests start at pg2 and end at
        # pg<total_pages>.
        for page in range(1, total_pages + 1):
            if page > 1:
                url_page = baseUrl + u"ershoufang/%s/pg%d/" % (district, page)
                source_code = misc.get_source_code(url_page)
                soup = BeautifulSoup(source_code, 'lxml')
            logging.info("Progress: %s %s: current page %s total pages %s", "GetHouseByRegionlist", district, page, total_pages)
            data_source = []
            hisprice_data_source = []
            for ultag in soup.findAll("ul", {"class": "sellListContent"}):
                for name in ultag.find_all('li'):
                    info_dict = {}
                    try:
                        housetitle = name.find("div", {"class": "title"})
                        info_dict[u'title'] = housetitle.a.get_text().strip()
                        info_dict[u'link'] = housetitle.a.get('href')
                        info_dict[u'houseID'] = housetitle.a.get(
                            'data-housecode')

                        houseinfo = name.find("div", {"class": "houseInfo"})
                        info = houseinfo.get_text().split('|')
                        for idx, key in enumerate(houseinfo_keys):
                            info_dict[key] = info[idx] if idx < len(info) else ""

                        housefloor = name.find("div", {"class": "positionInfo"})
                        communityInfo = housefloor.get_text().split('-')
                        info_dict[u'community'] = communityInfo[0]
                        if len(communityInfo) > 1:
                            info_dict[u'business'] = communityInfo[1]
                        else:
                            info_dict[u'business'] = ""

                        followInfo = name.find("div", {"class": "followInfo"})
                        info_dict[u'followInfo'] = followInfo.get_text().strip()

                        taxfree = name.find("span", {"class": "taxfree"})
                        if taxfree is None:
                            info_dict[u"taxtype"] = ""
                        else:
                            info_dict[u"taxtype"] = taxfree.get_text().strip()

                        totalPrice = name.find("div", {"class": "totalPrice"})
                        info_dict[u'totalPrice'] = totalPrice.span.get_text()

                        unitPrice = name.find("div", {"class": "unitPrice"})
                        info_dict[u'unitPrice'] = unitPrice.get("data-price")
                    except Exception:
                        # Skip entries that do not have the expected markup,
                        # keeping the traceback in the log.
                        logging.exception(
                            "get_house_perregion: failed to parse a listing")
                        continue

                    data_source.append(info_dict)
                    hisprice_data_source.append(
                        {"houseID": info_dict["houseID"], "totalPrice": info_dict["totalPrice"]})

            # One transaction per page for both tables.
            with model.database.atomic():
                if data_source:
                    logging.info("get_house_perregion: insert %d region info to database", len(data_source))
                    model.Houseinfo.insert_many(data_source).upsert().execute()
                if hisprice_data_source:
                    logging.info("get_house_perregion: insert %d region info to database", len(hisprice_data_source))
                    model.Hisprice.insert_many(
                        hisprice_data_source).upsert().execute()
    except Exception:
        # Deliberately best effort: callers never see a failure from here.
        logging.exception("get_house_perregion: aborted for %s/%s",
                          city, district)
예제 #4
0
def get_community_perregion(city, regionname=u'xicheng'):
    """Scrape the community (xiaoqu) list of one region into Community.

    Fetches ``http://<city>.lianjia.com/xiaoqu/<regionname>/``, walks pages
    ``1..total_pages`` and upserts, one transaction per page, every
    ``li.clear`` entry that can be parsed completely.  Each row is enriched
    with whatever ``get_communityinfo_by_url(link)`` returns and with the
    ``city`` argument.

    Args:
        city: Sub-domain prefix interpolated into the base URL; also stored
            in the ``city`` column.
        regionname: Region slug appended to the ``xiaoqu/`` path.

    Returns:
        None.  Returns early when ``check_block(soup)`` is truthy for the
        first page.  No exception derived from ``Exception`` escapes: any
        failure (including the RuntimeError raised when
        ``misc.get_total_pages`` returns None) is logged and ends the run.
    """
    try:
        baseUrl = u"http://%s.lianjia.com/" % (city)
        url = baseUrl + u"xiaoqu/" + regionname + "/"
        source_code = misc.get_source_code(url)
        soup = BeautifulSoup(source_code, 'lxml')

        if check_block(soup):
            return
        total_pages = misc.get_total_pages(url)

        if total_pages is None:
            row = model.Community.select().count()
            raise RuntimeError("Finish at %s because total_pages is None" % row)

        # Page numbers are 1-based: the un-suffixed URL fetched above is
        # page 1, so follow-up requests start at pg2 and end at
        # pg<total_pages>.
        for page in range(1, total_pages + 1):
            if page > 1:
                url_page = baseUrl + u"xiaoqu/" + regionname + "/pg%d/" % page
                source_code = misc.get_source_code(url_page)
                soup = BeautifulSoup(source_code, 'lxml')

            nameList = soup.findAll("li", {"class": "clear"})
            logging.info("Progress: %s %s: current page %s total pages %s", "GetCommunityByRegionlist", regionname, page, total_pages)
            data_source = []
            for name in nameList:
                info_dict = {}
                try:
                    communitytitle = name.find("div", {"class": "title"})
                    title = communitytitle.get_text().strip('\n')
                    link = communitytitle.a.get('href')
                    info_dict[u'title'] = title
                    info_dict[u'link'] = link

                    district = name.find("a", {"class": "district"})
                    info_dict[u'district'] = district.get_text()

                    bizcircle = name.find("a", {"class": "bizcircle"})
                    info_dict[u'bizcircle'] = bizcircle.get_text()

                    tagList = name.find("div", {"class": "tagList"})
                    info_dict[u'tagList'] = tagList.get_text().strip('\n')

                    onsale = name.find("a", {"class": "totalSellCount"})
                    info_dict[u'onsale'] = onsale.span.get_text().strip('\n')

                    # The rental link is located through its title attribute,
                    # "<community title>租房"; the count precedes u'套'.
                    onrent = name.find("a", {"title": title + u"租房"})
                    info_dict[u'onrent'] = onrent.get_text().strip(
                        '\n').split(u'套')[0]

                    info_dict[u'id'] = name.get('data-housecode')

                    price = name.find("div", {"class": "totalPrice"})
                    info_dict[u'price'] = price.span.get_text().strip('\n')

                    # Merge the detail-page fields into the row.
                    info_dict.update(get_communityinfo_by_url(link))

                    info_dict[u'city'] = city
                except Exception:
                    # Skip entries that do not have the expected markup,
                    # keeping the traceback in the log.
                    logging.exception(
                        "get_community_perregion: failed to parse an entry")
                    continue
                data_source.append(info_dict)

            with model.database.atomic():
                if data_source:
                    logging.info("get_community_perregion: insert %d community info to database", len(data_source))
                    model.Community.insert_many(data_source).upsert().execute()
    except Exception:
        # Deliberately best effort: callers never see a failure from here.
        logging.exception("get_community_perregion: aborted for %s/%s",
                          city, regionname)
예제 #5
0
def get_rent_percommunity(city, communityname):
    """Scrape the rental listings that match one community name.

    Fetches ``http://<city>.lianjia.com/zufang/rs<quoted name>/``, walks
    pages ``1..total_pages`` and upserts, one transaction per page, every
    ``div.content__list--item`` that can be parsed into ``model.Rentinfo``.

    Args:
        city: Sub-domain prefix interpolated into the base URL.
        communityname: Community name (text); it is UTF-8 encoded and
            URL-quoted before being placed in the search URL.

    Returns:
        None.  Returns early when ``check_block(soup)`` is truthy for the
        first page.  No exception derived from ``Exception`` escapes: any
        failure (including the RuntimeError raised when
        ``misc.get_total_pages`` returns None) is logged and ends the run.
    """
    try:
        baseUrl = u"http://%s.lianjia.com/" % (city)
        quoted_name = urllib2.quote(communityname.encode('utf8'))
        url = baseUrl + u"zufang/rs" + quoted_name + "/"
        source_code = misc.get_source_code(url)
        soup = BeautifulSoup(source_code, 'lxml')

        if check_block(soup):
            logging.info('soup error')
            return
        total_pages = misc.get_total_pages(url)

        if total_pages is None:
            row = model.Rentinfo.select().count()
            raise RuntimeError("Finish at %s because total_pages is None" % row)

        # Page numbers are 1-based: the un-suffixed URL fetched above is
        # page 1, so follow-up requests start at pg2 and end at
        # pg<total_pages>.
        for page in range(1, total_pages + 1):
            if page > 1:
                # NOTE(review): follow-up pages use "rent/" while the first
                # page uses "zufang/" - confirm this path is still valid.
                url_page = baseUrl + u"rent/pg%drs%s/" % (page, quoted_name)
                source_code = misc.get_source_code(url_page)
                soup = BeautifulSoup(source_code, 'lxml')
            data_source = []
            for ultag in soup.findAll("div", {"class": "content__list"}):
                for name in ultag.find_all("div", {"class": "content__list--item"}):
                    info_dict = {}
                    try:
                        housetitle = name.find("div", {"class": "content__list--item--main"})
                        info_dict.update({u'title': housetitle.find('p', {"class": "content__list--item--title"}).find('a', {'class': "twoline"}).get_text().strip()})

                        subway = name.find('i', {"class": "content__item__tag--is_subway_house"})
                        info_dict[u'subway'] = (
                            "" if subway is None else subway.get_text().strip())

                        decoration = name.find('i', {"class": 'content__item__tag--decoration'})
                        info_dict[u'decoration'] = (
                            "" if decoration is None
                            else decoration.get_text().strip())

                        # The id is the file name of the detail link without
                        # its extension.
                        href = housetitle.a.get('href')
                        info_dict[u'houseID'] = href.split("/")[-1].split(".")[0]

                        # The description paragraph carries the region link
                        # first and the zone link second; either may be
                        # absent, in which case "" is stored.
                        des = name.find('p', {"class": "content__list--item--des"})
                        anchors = des.find_all('a') if des is not None else []
                        info_dict[u'region'] = (
                            anchors[0].get_text().strip() if anchors else "")
                        info_dict[u'zone'] = (
                            anchors[1].get_text().strip()
                            if len(anchors) > 1 else "")

                        price = name.find("span", {"class": "content__list--item-price"})
                        info_dict[u'price'] = (
                            "" if price is None else price.get_text().strip())

                        heating = name.find("i", {"class": "content__item__tag--central_heating"})
                        info_dict[u'heating'] = (
                            "" if heating is None else heating.get_text().strip())

                        if des is not None:
                            other = des.get_text().replace('\n', '').replace(' ', '').strip()
                        else:
                            other = "-/-/-/-/-"
                        info_dict[u'other'] = other

                        # Not extracted on this page layout; stored empty.
                        info_dict[u'meters'] = ""
                        info_dict[u'pricepre'] = ''

                        # NOTE(review): host and "/zufang" prefix are
                        # hard-coded regardless of ``city`` - confirm.
                        info_dict[u'link'] = 'https://bj.lianjia.com/zufang' + href
                    except Exception:
                        # A partially filled row must not reach insert_many,
                        # so the entry is dropped after logging.
                        logging.exception(
                            "get_rent_percommunity: failed to parse a listing")
                        continue
                    data_source.append(info_dict)

            with model.database.atomic():
                if data_source:
                    logging.info("get_rent_percommunity: insert %d rent info to database", len(data_source))
                    model.Rentinfo.insert_many(data_source).upsert().execute()
    except Exception:
        # Deliberately best effort: callers never see a failure from here.
        logging.exception("get_rent_percommunity: aborted for %s", city)
예제 #6
0
def get_rent_perregion(district):
    """Scrape the rental listings of one district and upsert them into Rentinfo.

    Fetches ``BASE_URL + "zufang/<district>/"``, walks pages
    ``1..total_pages`` and upserts, one transaction per page, every ``li``
    below ``ul.house-lst`` that can be parsed completely.

    Args:
        district: District slug appended to the ``zufang/`` path.

    Returns:
        None.  Returns early, without scraping, when ``check_block(soup)`` is
        truthy for the first page.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns None.
    """
    url = BASE_URL + u"zufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    # Page numbers are 1-based: the un-suffixed URL fetched above is page 1,
    # so follow-up requests start at pg2 and end at pg<total_pages>.
    for page in range(1, total_pages + 1):
        if page > 1:
            url_page = BASE_URL + u"zufang/%s/pg%d/" % (district, page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        log_progress("GetRentByRegionlist", district, page, total_pages)
        data_source = []
        for ultag in soup.findAll("ul", {"class": "house-lst"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "info-panel"})
                    info_dict[u'title'] = housetitle.h2.a.get_text().strip()
                    info_dict[u'link'] = housetitle.a.get("href")
                    info_dict[u'houseID'] = name.get("data_analysis-housecode")

                    region = name.find("span", {"class": "region"})
                    info_dict[u'region'] = region.get_text().strip()

                    zone = name.find("span", {"class": "zone"})
                    info_dict[u'zone'] = zone.get_text().strip()

                    meters = name.find("span", {"class": "meters"})
                    info_dict[u'meters'] = meters.get_text().strip()

                    other = name.find("div", {"class": "con"})
                    info_dict[u'other'] = other.get_text().strip()

                    # The three tags below are optional; "" when absent.
                    subway = name.find("span", {"class": "fang-subway-ex"})
                    if subway is None:
                        info_dict[u'subway'] = ""
                    else:
                        info_dict[u'subway'] = subway.span.get_text().strip()

                    decoration = name.find("span", {"class": "decoration-ex"})
                    if decoration is None:
                        info_dict[u'decoration'] = ""
                    else:
                        info_dict[u'decoration'] = (
                            decoration.span.get_text().strip())

                    heating = name.find("span", {"class": "heating-ex"})
                    # Test the heating tag itself (not the decoration tag).
                    if heating is None:
                        info_dict[u'heating'] = ""
                    else:
                        info_dict[u'heating'] = heating.span.get_text().strip()

                    price = name.find("div", {"class": "price"})
                    info_dict[u'price'] = int(price.span.get_text().strip())

                    pricepre = name.find("div", {"class": "price-pre"})
                    info_dict[u'pricepre'] = pricepre.get_text().strip()
                except Exception:
                    # Entries missing a mandatory element are skipped; unlike
                    # a bare ``except`` this still lets Ctrl-C stop the run.
                    continue
                data_source.append(info_dict)

        # One transaction per page; skipped entirely when nothing was parsed.
        with model.database.atomic():
            if data_source:
                model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
예제 #7
0
def get_community_perregion(city, regionname=u'xicheng'):
    """Scrape the community (xiaoqu) list of one region into Community.

    Fetches ``http://<city>.lianjia.com/xiaoqu/<regionname>/``, walks pages
    ``1..total_pages`` and stores every ``li`` with class
    ``"clear xiaoquListItem"`` that can be parsed completely.  Each row is
    enriched with whatever ``get_communityinfo_by_url(link)`` returns, the
    ``city`` argument and the current timestamp, then inserted on its own
    with an ``on_conflict`` clause keyed on ``Community.id``.

    Args:
        city: Sub-domain prefix interpolated into the base URL; also stored
            in the ``city`` column.
        regionname: Region slug appended to the ``xiaoqu/`` path.

    Returns:
        None.  Returns early, without scraping, when ``check_block(soup)`` is
        truthy for the first page.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns None.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages is None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    # Columns handed to ``preserve`` when a row with the same id exists.
    preserved = [
        model.Community.title, model.Community.link, model.Community.district,
        model.Community.bizcircle, model.Community.tagList,
        model.Community.onsale, model.Community.onrent, model.Community.year,
        model.Community.housetype, model.Community.cost,
        model.Community.service, model.Community.company,
        model.Community.building_num, model.Community.house_num,
        model.Community.price, model.Community.city,
        model.Community.validdate,
    ]

    for page in range(1, total_pages + 1):
        if page > 1:
            url_page = baseUrl + u"xiaoqu/" + regionname + "/pg%d/" % page
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        nameList = soup.findAll("li", {"class": "clear xiaoquListItem"})
        log_progress("GetCommunityByRegionlist",
                     regionname, page, total_pages)

        for name in nameList:
            info_dict = {}
            try:
                communitytitle = name.find("div", {"class": "title"})
                title = communitytitle.get_text().strip('\n')
                link = communitytitle.a.get('href')
                info_dict[u'title'] = title
                info_dict[u'link'] = link

                district = name.find("a", {"class": "district"})
                info_dict[u'district'] = district.get_text()

                bizcircle = name.find("a", {"class": "bizcircle"})
                info_dict[u'bizcircle'] = bizcircle.get_text()

                tagList = name.find("div", {"class": "tagList"})
                info_dict[u'tagList'] = tagList.get_text().strip('\n')

                onsale = name.find("a", {"class": "totalSellCount"})
                info_dict[u'onsale'] = onsale.span.get_text().strip('\n')

                # The rental link is located through its title attribute,
                # "<community title>租房"; the count precedes u'套'.
                onrent = name.find("a", {"title": title + u"租房"})
                info_dict[u'onrent'] = onrent.get_text().strip(
                    '\n').split(u'套')[0]

                info_dict[u'id'] = name.get('data-housecode')

                price = name.find("div", {"class": "totalPrice"})
                info_dict[u'price'] = price.span.get_text().strip('\n')

                # Merge the detail-page fields into the row.
                info_dict.update(get_communityinfo_by_url(link))

                info_dict[u'city'] = city
                info_dict[u'validdate'] = datetime.datetime.now()

                model.Community.insert(info_dict).on_conflict(
                    conflict_target=[model.Community.id],
                    preserve=preserved,
                    update={}).execute()
            except Exception as e:
                # Best effort: report the failing row and keep going.
                print("except~~!!")
                logging.error(e)
                print(info_dict)
                continue
        time.sleep(1)
예제 #8
0
def get_house_perregion(district):
    """Scrape the second-hand house listings of one district.

    Fetches ``BASE_URL + "ershoufang/<district>/"`` to check for blocking and
    to learn the page count, then requests ``pg1..pg<total_pages>`` and, for
    each page, upserts the parsed ``li.clear`` entries into
    ``model.Houseinfo`` and their ``houseID``/``totalPrice`` pairs into
    ``model.Hisprice`` inside one transaction.

    Args:
        district: District slug appended to the ``ershoufang/`` path.

    Returns:
        None.  Returns early, without scraping, when ``check_block(soup)`` is
        truthy for the first page.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns None.
    """
    url = BASE_URL + u"ershoufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'html5lib')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        # ``page`` is 0-based, the pgN suffix is 1-based.
        url_page = BASE_URL + u"ershoufang/%s/pg%d/" % (district, page + 1)
        source_code = misc.get_source_code(url_page)
        soup = BeautifulSoup(source_code, 'html5lib')

        log_progress("GetHouseByRegionlist", district, page + 1, total_pages)
        data_source = []
        hisprice_data_source = []
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            for name in ultag.find_all('li', {"class": "clear"}):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict[u'title'] = housetitle.get_text().strip()
                    info_dict[u'link'] = housetitle.a.get('href')
                    info_dict[u'houseID'] = housetitle.a.get('data-housecode')

                    # houseInfo text is '/'-separated: community, house type,
                    # area, direction, decoration.
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('/')
                    info_communityid = houseinfo.a.get('href').split('xiaoqu/')
                    communityid = info_communityid[1].strip().rstrip('/')
                    # All splitting/stripping below works on text, not on
                    # UTF-8 bytes, so multi-byte characters are handled as
                    # whole characters.
                    square_info = info[2].split(u'平米')

                    info_dict[u'community'] = info[0]
                    info_dict[u'communityid'] = communityid
                    info_dict[u'housetype'] = info[1]
                    info_dict[u'square'] = square_info[0]
                    info_dict[u'direction'] = info[3]
                    info_dict[u'decoration'] = info[4]

                    # positionInfo text is "<floor>(共<N>层)/<year>年建<type>".
                    housefloor = name.find("div", {"class": "positionInfo"})
                    info_housefloor = housefloor.get_text().split('/')

                    info_years = info_housefloor[1].strip().split(u'年建')
                    info_floor = info_housefloor[0].split('(')
                    info_buildheight = info_floor[1].rstrip(
                        u'层)').lstrip(u'共')

                    info_dict[u'years'] = info_years[0].strip()
                    info_dict[u'buildingtype'] = info_years[1].strip()
                    info_dict[u'floor'] = info_floor[0].strip()
                    info_dict[u'buildheight'] = info_buildheight.strip()

                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict[u'followInfo'] = followInfo.get_text().strip()

                    # Tax tag: prefer span.taxfree, fall back to span.five,
                    # otherwise store "".
                    taxtag = name.find("span", {"class": "taxfree"})
                    if taxtag is None:
                        taxtag = name.find("span", {"class": "five"})
                    if taxtag is None:
                        info_dict[u"taxtype"] = ""
                    else:
                        info_dict[u"taxtype"] = taxtag.get_text().strip()

                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict[u'totalPrice'] = totalPrice.span.get_text()

                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict[u'unitPrice'] = unitPrice.get("data-price")
                except Exception:
                    # Entries missing a mandatory element are skipped; unlike
                    # a bare ``except`` this still lets Ctrl-C stop the run.
                    continue

                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID": info_dict["houseID"],
                    "totalPrice": info_dict["totalPrice"],
                })

        # One transaction per page; both lists are filled together, so each
        # insert is skipped when nothing was parsed.
        with model.database.atomic():
            if data_source:
                model.Houseinfo.insert_many(data_source).upsert().execute()
            if hisprice_data_source:
                model.Hisprice.insert_many(
                    hisprice_data_source).upsert().execute()
        time.sleep(1)
예제 #9
0
def get_house_perregion(city, district):
    """Scrape the second-hand listings of one district and store them.

    Walks every result page of ``ershoufang/<district>/`` on the city's
    lianjia.com subdomain, parses each ``li`` inside ``ul.sellListContent``
    and writes one ``model.Houseinfo`` row plus one ``model.Hisprice`` row
    per listing. Rows are written one at a time with
    ``insert(...).on_conflict(...)``; a listing that fails to parse or to
    store is logged, printed and skipped.

    Args:
        city: Subdomain prefix used to build the base URL. ``'bj'`` is
            special-cased: its house info text is split on ``/`` instead
            of ``|``.
        district: District slug placed in the listing URL.

    Returns:
        None. Returns early when ``check_block(soup)`` is truthy.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns ``None``.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"ershoufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # `page` is a 0-based index while the `pgN` URL segment is
            # 1-based; the un-suffixed URL fetched above already is page 1.
            # Using `page` directly would re-fetch page 1 and never reach
            # the last page.
            url_page = baseUrl + u"ershoufang/%s/pg%d/" % (district, page + 1)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        log_progress("GetHouseByRegionlist", district, page + 1, total_pages)
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict[u'title'] = housetitle.a.get_text().strip()
                    info_dict[u'link'] = housetitle.a.get('href')
                    info_dict[u'houseID'] = housetitle.a.get('data-housecode')

                    houseinfo = name.find("div", {"class": "houseInfo"})
                    separator = '/' if city == 'bj' else '|'
                    info = houseinfo.get_text().split(separator)
                    info_dict[u'community'] = info[0]
                    info_dict[u'housetype'] = info[1]
                    info_dict[u'square'] = info[2]
                    info_dict[u'direction'] = info[3]
                    info_dict[u'decoration'] = info[4]

                    # Both fields receive the whole positionInfo text.
                    housefloor = name.find("div", {"class": "positionInfo"})
                    position_text = housefloor.get_text().strip()
                    info_dict[u'years'] = position_text
                    info_dict[u'floor'] = position_text

                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict[u'followInfo'] = followInfo.get_text().strip()

                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree is None:
                        info_dict[u"taxtype"] = ""
                    else:
                        info_dict[u"taxtype"] = taxfree.get_text().strip()

                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict[u'totalPrice'] = totalPrice.span.get_text()

                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict[u'unitPrice'] = unitPrice.get("data-price")
                    info_dict[u'validdate'] = datetime.datetime.now()

                    # Insert keyed on houseID; on conflict the listed
                    # columns are passed as `preserve`.
                    model.Houseinfo.insert(info_dict).on_conflict(
                        conflict_target=[model.Houseinfo.houseID],
                        preserve=[
                            model.Houseinfo.title, model.Houseinfo.link,
                            model.Houseinfo.community, model.Houseinfo.years,
                            model.Houseinfo.housetype, model.Houseinfo.square,
                            model.Houseinfo.direction, model.Houseinfo.floor,
                            model.Houseinfo.taxtype,
                            model.Houseinfo.totalPrice,
                            model.Houseinfo.unitPrice,
                            model.Houseinfo.followInfo,
                            model.Houseinfo.decoration,
                            model.Houseinfo.validdate,
                        ],
                        update={}).execute()

                    hisprice = {
                        "houseID": info_dict["houseID"],
                        "totalPrice": info_dict["totalPrice"],
                    }
                    model.Hisprice.insert(hisprice).on_conflict(
                        conflict_target=[model.Hisprice.houseID,
                                         model.Hisprice.totalPrice],
                        preserve=[model.Hisprice.houseID,
                                  model.Hisprice.totalPrice,
                                  model.Hisprice.date],
                        update={}).execute()
                except Exception as e:
                    # Best effort: report the listing that failed, move on.
                    # Single-argument print(...) behaves the same on
                    # Python 2 and Python 3.
                    print("except~~!!")
                    logging.error(e)
                    print(info_dict)
                    continue

        # Throttle between result pages.
        time.sleep(1)
예제 #10
0
def get_lists_perword(keyword):
    """Scrape every result page for one search keyword and upsert the items.

    For each ``div.result-sherlock-cell`` on a result page this collects the
    item id, title, link, a ``' | '``-joined summary, salary, post date and
    a label derived from ``keyword``, then upserts it through ``quickinfo``.

    Args:
        keyword: Search term substituted into ``BASE_URL``; ``+`` characters
            are turned into spaces for the stored ``label``.

    Returns:
        None.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns ``None``.
    """
    # NOTE(review): `pg_no` is not defined in this function; presumably a
    # module-level default page number -- confirm.
    url = BASE_URL % (keyword, pg_no)
    total_pages = misc.get_total_pages(url)
    print('The total pages number is: ', total_pages)
    logging.info('The total pages number is: ' + str(total_pages))
    time.sleep(3)

    if total_pages is None:
        row = model.quickinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    label = keyword.replace('+', ' ')

    for page in range(total_pages):
        url_page = BASE_URL % (keyword, page)
        source_code = misc.get_source_code(url_page)
        soup = BeautifulSoup(source_code, 'lxml')

        itemList = soup.findAll("div", {"class": "result-sherlock-cell"})
        print('The items are: ', len(itemList))
        logging.info('The items are: ' + str(len(itemList)))

        for item in itemList:
            info_dict = {}
            try:
                info_dict['id'] = item.get('id')

                link_title = item.find("h3", {"class": "job-title"})
                info_dict['title'] = link_title.get_text()
                info_dict['link'] = link_title.a.get('href')

                descriptions = item.findAll("li", {"itemprop": "description"})
                if descriptions:
                    info_dict['summary'] = ' | '.join(
                        desc.get_text() for desc in descriptions)
                else:
                    info_dict['summary'] = 'no description'

                salary_tag = item.find('p', {'class': 'job-quickinfo-salary'})
                if salary_tag is not None:
                    info_dict['salary'] = salary_tag.get_text()
                else:
                    info_dict['salary'] = 'not specified'

                quickinfo_tag = item.find("div", {"class": "job-quickinfo"})
                info_dict['postdate'] = quickinfo_tag.meta.get('content')

                info_dict['label'] = label
            except Exception:
                # Skip items whose markup does not match. Only Exception is
                # caught so Ctrl-C / SystemExit still stop the scraper.
                logging.info('The item of searching results is not found')
                continue
            # NOTE(review): bare `quickinfo` here versus `model.quickinfo`
            # above -- confirm both names are in scope in this module.
            quickinfo.insert(**info_dict).upsert().execute()
        # Throttle between result pages.
        time.sleep(5)
예제 #11
0
def get_community_perregion(city, regionname=u'xicheng'):
    """Scrape the community (xiaoqu) list of one region and upsert it.

    Parses every ``li.xiaoquListItem`` on each result page, enriches it
    with ``get_communityinfo_by_url(link)`` and batch-upserts the page's
    rows into ``model.Community`` inside one transaction.

    Args:
        city: Subdomain prefix used to build the base URL; also stored in
            each row under ``city``.
        regionname: Region slug placed in the listing URL.

    Returns:
        None. Returns early when ``check_block(soup)`` is truthy.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns ``None``.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages is None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # `page` is a 0-based index while the `pgN` URL segment is
            # 1-based; the un-suffixed URL fetched above already is page 1.
            url_page = baseUrl + u"xiaoqu/" + regionname + \
                "/pg%d/" % (page + 1)
            logging.info("fetching from %s", url_page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        nameList = soup.findAll("li", {"class": "xiaoquListItem"})
        log_progress("GetCommunityByRegionlist", regionname, page + 1,
                     total_pages)
        data_source = []
        for name in nameList:  # one community per list item
            info_dict = {}
            try:
                communitytitle = name.find("div", {"class": "title"})
                title = communitytitle.get_text().strip('\n')
                link = communitytitle.a.get('href')
                info_dict[u'title'] = title
                info_dict[u'link'] = link

                district = name.find("a", {"class": "district"})
                info_dict[u'district'] = district.get_text()

                bizcircle = name.find("a", {"class": "bizcircle"})
                info_dict[u'bizcircle'] = bizcircle.get_text()

                tagList = name.find("div", {"class": "tagList"})
                info_dict[u'tagList'] = tagList.get_text().strip('\n')

                onsale = name.find("a", {"class": "totalSellCount"})
                info_dict[u'onsale'] = onsale.span.get_text().strip('\n')

                # The rental link is located by its title attribute; only
                # the text before the counter word is kept.
                onrent = name.find("a", {"title": title + u"租房"})
                info_dict[u'onrent'] = \
                    onrent.get_text().strip('\n').split(u'套')[0]

                info_dict[u'id'] = name.get('data-housecode')

                price = name.find("div", {"class": "totalPrice"})
                info_dict[u'price'] = price.span.get_text().strip('\n')

                # Merge detail-page fields; they override list-page values
                # on key clashes.
                info_dict.update(get_communityinfo_by_url(link))

                info_dict[u'city'] = city
            except Exception:
                # Best effort: show the traceback and skip this community.
                traceback.print_exc()
                continue
            data_source.append(info_dict)

        with model.database.atomic():
            if data_source:
                model.Community.insert_many(data_source).upsert().execute()
        # Throttle between result pages.
        time.sleep(1)
예제 #12
0
def get_sell_percommunity(city, communityname, threads=30):
    """Scrape the sold-listing (chengjiao) pages of one community.

    Each result page's ``li`` items under ``ul.listContent`` are handed to
    ``threads`` worker processes running ``get_sell_worker``; the rows the
    workers append to a shared list are batch-upserted into
    ``model.Sellinfo``.

    Args:
        city: Subdomain prefix used to build the base URL.
        communityname: Community name; UTF-8 encoded and URL-quoted into
            the search URL.
        threads: Number of worker processes started per result page.

    Returns:
        None. Returns early when ``check_block(soup)`` is truthy, and stops
        paging as soon as a page yields no rows.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns ``None``.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    quoted_name = urllib2.quote(communityname.encode('utf8'))
    url = baseUrl + u"chengjiao/rs" + quoted_name + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages is None:
        row = model.Sellinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # `page` is a 0-based index while the `pgN` URL segment is
            # 1-based; the un-suffixed URL fetched above already is page 1.
            url_page = baseUrl + \
                u"chengjiao/pg%drs%s/" % (page + 1, quoted_name)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        log_progress("GetSellByCommunitylist",
                     communityname, page + 1, total_pages)

        nameList = [
            name
            for ultag in soup.findAll("ul", {"class": "listContent"})
            for name in ultag.find_all('li')
        ]

        # Workers take item indexes from the queue and append their result
        # rows to the manager-backed list.
        info_ls_mult = Manager().list()
        nameid_q = Queue()
        for idx in range(len(nameList)):
            nameid_q.put(idx)

        processes = []
        try:
            for _ in range(threads):
                proc = Process(target=get_sell_worker, args=(nameid_q, nameList, info_ls_mult, communityname, city,))
                processes.append(proc)
                proc.start()

            for proc in processes:
                proc.join()

        except KeyboardInterrupt:
            print("Emergency terminate")
            print("killing %d processes" % (len(processes)))
            for proc in processes:
                proc.terminate()

        data_source = list(info_ls_mult)
        if not data_source:
            print(colored("sth is wrong with %s, give up on this one" % communityname, "red"))
            break
        print("Finished with %d at %s" % (len(data_source), communityname))

        # data_source is known to be non-empty here.
        with model.database.atomic():
            model.Sellinfo.insert_many(data_source).upsert().execute()
            logging.info("Writing to database")
        # Throttle between result pages.
        time.sleep(1)
예제 #13
0
def get_community_perregion(city, regionname=u'xicheng', threads=30):
    """Scrape the community (xiaoqu) list of one region with worker processes.

    Each result page's ``li.clear`` items are handed to ``threads`` worker
    processes running ``community_info_worker``; the rows the workers append
    to a shared list are batch-upserted into ``model.Community``. On every
    fourth page (0-based index divisible by 4) the number of stored
    communities of ``city`` is printed.

    Args:
        city: Subdomain prefix used to build the base URL.
        regionname: Region slug placed in the listing URL.
        threads: Number of worker processes started per result page.

    Returns:
        None. Returns early when ``check_block(soup)`` is truthy, and stops
        paging as soon as a page yields no rows.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns ``None``.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages is None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        try:
            if page > 0:
                # `page` is a 0-based index while the `pgN` URL segment is
                # 1-based; the un-suffixed URL fetched above already is
                # page 1.
                url_page = baseUrl + u"xiaoqu/" + regionname + \
                    "/pg%d/" % (page + 1)
                source_code = misc.get_source_code(url_page)
                soup = BeautifulSoup(source_code, 'lxml')

            nameList = soup.findAll("li", {"class": "clear"})
            log_progress("GetCommunityByRegionlist",
                         regionname, page + 1, total_pages)

            # Workers take item indexes from the queue and append their
            # result rows to the manager-backed list.
            info_ls_mult = Manager().list()
            nameid_q = Queue()
            for idx in range(len(nameList)):
                nameid_q.put(idx)

            processes = []
            try:
                for _ in range(threads):
                    proc = Process(target=community_info_worker, args=(nameid_q, nameList, info_ls_mult, regionname, city,))
                    processes.append(proc)
                    proc.start()

                for proc in processes:
                    proc.join()

            except KeyboardInterrupt:
                print("Emergency terminate")
                print("killing %d processes" % (len(processes)))
                for proc in processes:
                    proc.terminate()

            data_source = list(info_ls_mult)
            if not data_source:
                print(colored("sth is wrong with %s, give up on this one" % regionname, "red"))
                break
            print("Finished with %d at %s" % (len(data_source), regionname))

            with model.database.atomic():
                print("submitting to dataset")
                # data_source is known to be non-empty here.
                model.Community.insert_many(data_source).upsert().execute()
                if page % 4 == 0:
                    scraped = sum(
                        1 for community in model.Community.select()
                        if community.city == city)
                    print(" %d Community scraped: %d" % (page, scraped))
            time.sleep(2)
        except Exception:
            # Best effort per page. Only Exception is caught so that
            # Ctrl-C outside the worker phase and SystemExit still stop
            # the scraper instead of moving on to the next page.
            print(colored("Failed at %d - %s" % (page, regionname), "red"))
            continue
예제 #14
0
def get_rent_percommunity(communityname):
    """Scrape the rental listings of one community and upsert them.

    Parses every ``div.content__list--item`` inside ``div.content__list``
    on each result page and batch-upserts the page's rows into
    ``model.Rentinfo`` inside one transaction. Items whose markup does not
    match are skipped.

    Args:
        communityname: Community name; UTF-8 encoded and URL-quoted into
            the search URL.

    Returns:
        None. Returns early when ``check_block(soup)`` is truthy.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns ``None``.
    """
    quoted_name = urllib.request.quote(communityname.encode('utf8'))
    url = BASE_URL + u"zufang/rs" + quoted_name + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages is None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # `page` is a 0-based index while the `pgN` URL segment is
            # 1-based; the un-suffixed URL fetched above already is page 1.
            # NOTE(review): follow-up pages use the "rent/" path while the
            # first page uses "zufang/" -- confirm this is intended.
            url_page = BASE_URL + u"rent/pg%drs%s/" % (page + 1, quoted_name)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        log_progress("GetRentByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        for ultag in soup.findAll("div", {"class": "content__list"}):
            for name in ultag.find_all("div",
                                       {"class": "content__list--item"}):
                log_progress("GetRentByCommunitylist", communityname, page + 1,
                             total_pages)
                info_dict = {}
                try:
                    housetitleOb = name.find(
                        "p", {"class": "content__list--item--title"})
                    # `housetitle` is the <a> element itself, so its href
                    # is read directly from it.
                    housetitle = housetitleOb.find("a")
                    href = housetitle.get('href')
                    info_dict[u'title'] = housetitle.get_text().strip()
                    info_dict[u'link'] = href
                    # House id = last path segment without its extension.
                    info_dict[u'houseID'] = href.split("/")[-1].split(".")[0]

                    tempInfo = name.find("p",
                                         {"class": "content__list--item--des"})
                    info_dict[u'region'] = ''.join(
                        tempData.get_text().strip()
                        for tempData in tempInfo.find_all("a"))

                    # Both fields receive the whole description text.
                    description = tempInfo.get_text().strip()
                    info_dict[u'zone'] = description
                    info_dict[u'meters'] = description

                    other = name.find("span", {"class": "hide"})
                    info_dict[u'other'] = other.get_text().strip()

                    subway = name.find(
                        "i", {"class": "content__item__tag--is_subway_house"})
                    if subway is None:
                        info_dict[u'subway'] = ''
                    else:
                        info_dict[u'subway'] = subway.span.get_text().strip()

                    decoration = name.find(
                        "i", {"class": "content__item__tag--decoration"})
                    if decoration is None:
                        info_dict[u'decoration'] = ''
                    else:
                        info_dict[u'decoration'] = \
                            decoration.span.get_text().strip()

                    heating = name.find(
                        "i", {"class": "content__item__tag--two_bathroom"})
                    # Test the tag that is dereferenced in the else branch.
                    if heating is None:
                        info_dict[u'heating'] = ''
                    else:
                        info_dict[u'heating'] = heating.span.get_text().strip()

                    priceT = name.find("span",
                                       {"class": "content__list--item-price"})
                    price = priceT.find("em")
                    # NOTE(review): assumes the price <em> wraps a <span>;
                    # confirm against the live markup.
                    info_dict[u'price'] = int(price.span.get_text().strip())

                    pricepre = name.find(
                        "p", {"class": "content__list--item--time oneline"})
                    info_dict[u'pricepre'] = pricepre.get_text().strip()

                except Exception:
                    # Skip items that do not match the expected markup.
                    # Only Exception is caught so Ctrl-C / SystemExit still
                    # stop the scraper.
                    continue
                data_source.append(info_dict)

        with model.database.atomic():
            # Nothing to write when every item on the page was skipped.
            if data_source:
                model.Rentinfo.insert_many(data_source).upsert().execute()
        # Throttle between result pages.
        time.sleep(1)
예제 #15
0
def get_house_percommunity(communityname):
    """Scrape the second-hand listings of one community and upsert them.

    Parses the ``li`` items of the first ``ul.sellListContent`` on each
    result page and batch-upserts the page's rows into ``model.Houseinfo``
    and the matching ``(houseID, totalPrice)`` pairs into
    ``model.Hisprice`` inside one transaction.

    Args:
        communityname: Community name; UTF-8 encoded and URL-quoted into
            the search URL.

    Returns:
        None. Returns early when ``check_block(soup)`` is truthy or when
        ``misc.get_total_pages`` returns ``None``.
    """
    quoted_name = urllib.request.quote(communityname.encode('utf8'))
    url = BASE_URL + u"ershoufang/rs" + quoted_name + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        return

    for page in range(total_pages):
        if page > 0:
            # `page` is a 0-based index while the `pgN` URL segment is
            # 1-based; the un-suffixed URL fetched above already is page 1.
            url_page = BASE_URL + u"ershoufang/pg%drs%s/" % (
                page + 1, quoted_name)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        log_progress("GetHouseByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        hisprice_data_source = []
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict[u'title'] = housetitle.a.get_text().strip()
                    info_dict[u'link'] = housetitle.a.get('href')
                    # Two attribute names are tried for the house id; items
                    # carrying neither are skipped.
                    houseID = housetitle.a.get('data-lj_action_housedel_id')
                    if houseID is None:
                        houseID = housetitle.a.get('data-housecode')
                    if houseID is None:
                        continue
                    info_dict[u'houseID'] = houseID

                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('|')

                    info_dict[u'housetype'] = info[0]
                    info_dict[u'square'] = info[1]
                    info_dict[u'direction'] = info[2]
                    info_dict[u'decoration'] = info[3]
                    info_dict[u'floor'] = info[4]
                    info_dict[u'years'] = info[5]

                    housefloor = name.find("div", {"class": "positionInfo"})
                    housefloorInfo = housefloor.get_text().split('   -  ')
                    info_dict[u'community'] = housefloorInfo[0]

                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict[u'followInfo'] = followInfo.get_text().strip()

                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree is None:
                        info_dict[u"taxtype"] = ""
                    else:
                        info_dict[u"taxtype"] = taxfree.get_text().strip()

                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict[u'totalPrice'] = totalPrice.span.get_text()

                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict[u'unitPrice'] = unitPrice.get("data-price")
                except Exception:
                    # Skip items that do not match the expected markup.
                    # Only Exception is caught so Ctrl-C / SystemExit still
                    # stop the scraper.
                    continue

                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID": info_dict["houseID"],
                    "totalPrice": info_dict["totalPrice"],
                })
            # Only the first matching <ul> of a page is processed.
            break

        if not data_source:
            # Nothing parsed on this page; go straight to the next one
            # (without the throttle sleep below).
            continue
        with model.database.atomic():
            logging.info("data_source : " + json.dumps(data_source))
            model.Houseinfo.insert_many(data_source).upsert().execute()
            model.Hisprice.insert_many(hisprice_data_source).upsert().execute()
        # Throttle between result pages.
        time.sleep(1)
예제 #16
0
def get_house_percommunity(city, communityname):
    """Scrape the second-hand listings of one community and store them.

    Parses every ``li.clear`` on each result page and writes one
    ``model.Houseinfo`` row plus one ``model.Hisprice`` row per listing
    with ``insert(...).on_conflict(...)``. A listing that fails to parse or
    to store is logged, printed and skipped.

    Args:
        city: Subdomain prefix used to build the base URL. ``'bj'`` is
            special-cased: its address text is split on ``/`` instead of
            ``|``.
        communityname: Community name; UTF-8 encoded and URL-quoted into
            the search URL, and stored as each row's ``community``.

    Returns:
        None. Returns early when ``check_block(soup)`` is truthy.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns ``None``.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    quoted_name = urllib2.quote(communityname.encode('utf8'))
    url = baseUrl + u"ershoufang/rs" + quoted_name + "/"

    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages is None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    # Page 1 was already fetched above through the un-suffixed URL, so a
    # paged URL is only needed from page 2 onwards.
    for page in range(1, total_pages + 1):
        if page > 1:
            url_page = baseUrl + \
                u"ershoufang/pg%drs%s/" % (page, quoted_name)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
            # Single-argument print(...) behaves the same on Python 2 and
            # Python 3.
            print(url_page)
        nameList = soup.findAll("li", {"class": "clear"})
        log_progress("GetHouseByCommunitylist",
                     communityname, page, total_pages)
        for name in nameList:  # one listing per list item
            info_dict = {}
            try:
                housetitle = name.find("div", {"class": "title"})
                info_dict[u'title'] = housetitle.a.get_text().strip()
                info_dict[u'link'] = housetitle.a.get('href')

                houseaddr = name.find("div", {"class": "address"})
                separator = '/' if city == 'bj' else '|'
                info = houseaddr.div.get_text().split(separator)
                info_dict[u'community'] = communityname
                info_dict[u'housetype'] = info[1].strip()
                info_dict[u'square'] = info[2].strip()
                info_dict[u'direction'] = info[3].strip()
                info_dict[u'decoration'] = info[4].strip()

                # Text before the first '-' is split on spaces: the first
                # token is stored as floor, the last one as years.
                housefloor = name.find("div", {"class": "flood"})
                floor_all = housefloor.div.get_text().split(
                    '-')[0].strip().split(' ')
                info_dict[u'floor'] = floor_all[0].strip()
                info_dict[u'years'] = floor_all[-1].strip()

                followInfo = name.find("div", {"class": "followInfo"})
                info_dict[u'followInfo'] = followInfo.get_text()

                tax = name.find("div", {"class": "tag"})
                info_dict[u'taxtype'] = tax.get_text().strip()

                totalPrice = name.find("div", {"class": "totalPrice"})
                info_dict[u'totalPrice'] = totalPrice.span.get_text()

                unitPrice = name.find("div", {"class": "unitPrice"})
                info_dict[u'unitPrice'] = unitPrice.get('data-price')
                info_dict[u'houseID'] = unitPrice.get('data-hid')
                info_dict[u'validdate'] = datetime.datetime.now()

                # Insert keyed on houseID; on conflict the listed columns
                # are passed as `preserve`.
                model.Houseinfo.insert(info_dict).on_conflict(
                    conflict_target=[model.Houseinfo.houseID],
                    preserve=[
                        model.Houseinfo.title, model.Houseinfo.link,
                        model.Houseinfo.community, model.Houseinfo.years,
                        model.Houseinfo.housetype, model.Houseinfo.square,
                        model.Houseinfo.direction, model.Houseinfo.floor,
                        model.Houseinfo.taxtype, model.Houseinfo.totalPrice,
                        model.Houseinfo.unitPrice, model.Houseinfo.followInfo,
                        model.Houseinfo.decoration, model.Houseinfo.validdate,
                    ],
                    update={}).execute()

                hisprice = {
                    "houseID": info_dict["houseID"],
                    "totalPrice": info_dict["totalPrice"],
                }
                model.Hisprice.insert(hisprice).on_conflict(
                    conflict_target=[model.Hisprice.houseID,
                                     model.Hisprice.totalPrice],
                    preserve=[model.Hisprice.houseID,
                              model.Hisprice.totalPrice,
                              model.Hisprice.date],
                    update={}).execute()
            except Exception as e:
                # Best effort: report the listing that failed and move on.
                logging.error(e)
                print(info_dict)
                continue

        # Throttle between result pages.
        time.sleep(1)
예제 #17
0
def get_house_percommunity(city, communityname):
    """Scrape the second-hand listings of one community and upsert them.

    Parses every ``li.clear`` on each result page and batch-upserts the
    page's rows into ``model.Houseinfo`` and the matching
    ``(houseID, totalPrice)`` pairs into ``model.Hisprice`` inside one
    transaction.

    The whole body runs inside a catch-all handler: any exception,
    including the ``RuntimeError`` raised when ``misc.get_total_pages``
    returns ``None``, is reported through ``traceback.print_exc()`` and not
    propagated to the caller.

    Args:
        city: Subdomain prefix used to build the base URL.
        communityname: Community name; UTF-8 encoded and URL-quoted into
            the search URL, and stored as each row's ``community``.

    Returns:
        None. Returns early when ``check_block(soup)`` is truthy.
    """
    try:
        baseUrl = u"http://%s.lianjia.com/" % (city)
        quoted_name = urllib2.quote(communityname.encode('utf8'))
        url = baseUrl + u"ershoufang/rs" + quoted_name + "/"
        source_code = misc.get_source_code(url)
        soup = BeautifulSoup(source_code, 'lxml')

        if check_block(soup):
            return
        total_pages = misc.get_total_pages(url)

        if total_pages is None:
            row = model.Houseinfo.select().count()
            raise RuntimeError("Finish at %s because total_pages is None" % row)

        for page in range(total_pages):
            if page > 0:
                # `page` is a 0-based index while the `pgN` URL segment is
                # 1-based; the un-suffixed URL fetched above already is
                # page 1.
                url_page = baseUrl + \
                    u"ershoufang/pg%drs%s/" % (page + 1, quoted_name)
                source_code = misc.get_source_code(url_page)
                soup = BeautifulSoup(source_code, 'lxml')

            nameList = soup.findAll("li", {"class": "clear"})
            logging.info("Progress: %s %s: current page %s total pages %s", "GetHouseByCommunitylist", communityname, page + 1, total_pages)
            data_source = []
            hisprice_data_source = []
            for name in nameList:  # one listing per list item
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict[u'title'] = housetitle.a.get_text().strip()
                    info_dict[u'link'] = housetitle.a.get('href')

                    houseaddr = name.find("div", {"class": "address"})
                    info = houseaddr.div.get_text().split('|')
                    has_years = len(info) > 5
                    has_tower = len(info) > 6

                    info_dict[u'community'] = communityname
                    info_dict[u'housetype'] = info[0].strip()
                    info_dict[u'square'] = info[1].strip()
                    if has_tower:
                        info_dict[u'direction'] = \
                            info[2].strip() + "-" + info[6].strip()
                    else:
                        info_dict[u'direction'] = info[2].strip()

                    info_dict[u'decoration'] = info[3].strip()
                    info_dict[u'years'] = info[5].strip() if has_years else ''
                    info_dict[u'towertype'] = info[6] if has_tower else None

                    housefloor = name.find("div", {"class": "positionInfo"})
                    communityInfo = housefloor.get_text().split('-')
                    if len(communityInfo) > 1:
                        info_dict[u'business'] = communityInfo[1]
                    else:
                        info_dict[u'business'] = None

                    housefloor = name.find("div", {"class": "flood"})
                    floor_all = housefloor.div.get_text().split(
                        '-')[0].strip().split(' ')
                    if has_years:
                        info_dict[u'floor'] = \
                            floor_all[0].strip() + '-' + info[4].strip()
                    else:
                        info_dict[u'floor'] = floor_all[0].strip() + '-'

                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict[u'followInfo'] = followInfo.get_text()

                    tax = name.find("div", {"class": "tag"})
                    info_dict[u'taxtype'] = tax.get_text().strip()

                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict[u'totalPrice'] = totalPrice.span.get_text()

                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict[u'unitPrice'] = unitPrice.get('data-price')
                    info_dict[u'houseID'] = unitPrice.get('data-hid')
                except Exception:
                    # print_exc() writes the traceback itself and returns
                    # None, so it must not be passed on to print().
                    traceback.print_exc()
                    logging.info('parse error: %s', name)
                    continue
                data_source.append(info_dict)
                hisprice_data_source.append(
                    {"houseID": info_dict["houseID"], "totalPrice": info_dict["totalPrice"]})

            with model.database.atomic():
                if data_source:
                    logging.info("get_house_percommunity: insert %d house info to database", len(data_source))
                    model.Houseinfo.insert_many(data_source).upsert().execute()
                if hisprice_data_source:
                    logging.info("get_house_percommunity: insert %d hisprice data source info to database", len(hisprice_data_source))
                    model.Hisprice.insert_many(
                        hisprice_data_source).upsert().execute()
    except Exception:
        # Top-level boundary: report the failure, do not propagate it.
        traceback.print_exc()
예제 #18
0
def get_house_perregion(city, district):
    """Scrape the second-hand house listings of one district and store them.

    Fetches ``http://<city>.lianjia.com/ershoufang/<district>/`` and its
    follow-up ``pgN`` pages, turns every ``<li>`` under
    ``ul.sellListContent`` into a row dict, upserts the rows into
    ``model.Houseinfo`` and inserts the (houseID, totalPrice) pairs into
    ``model.Hisprice``.

    Args:
        city: Sub-domain prefix interpolated into the base URL.
        district: District slug interpolated into the listing path.

    Returns:
        None. Returns early, storing nothing, when ``check_block`` is truthy
        for the first page.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns None.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"ershoufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    # House IDs already collected during this call, used to drop duplicates
    # that show up again on later pages.
    house_ids = set()
    for page in range(total_pages):
        if page > 0:
            # Iteration 0 already consumed the un-suffixed URL (first page),
            # so iteration ``page`` has to request pg<page + 1>. Requesting
            # pg<page> would fetch the first page twice and never reach the
            # last one.
            url_page = baseUrl + u"ershoufang/%s/pg%d/" % (district, page + 1)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        log_progress("GetHouseByRegionlist", district, page + 1, total_pages)
        data_source = []
        hisprice_data_source = []
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict[u'title'] = housetitle.a.get_text().strip()
                    info_dict[u'link'] = housetitle.a.get('href')
                    houseID = housetitle.a.get('data-housecode')
                    # De-duplicate on houseID.
                    if houseID in house_ids:
                        continue
                    info_dict[u'houseID'] = houseID

                    # The houseInfo text is a '|'-separated record; fewer
                    # than six fields raises IndexError and skips the item.
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('|')
                    info_dict[u'housetype'] = info[0]
                    info_dict[u'square'] = info[1]
                    info_dict[u'direction'] = info[2]
                    info_dict[u'decoration'] = info[3]
                    info_dict[u'floor'] = info[4]
                    info_dict[u'years'] = info[5]

                    housefloor = name.find("div", {"class": "positionInfo"})
                    communityInfo = housefloor.get_text().split('-')
                    info_dict[u'community'] = communityInfo[0]

                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict[u'followInfo'] = followInfo.get_text().strip()

                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree is None:
                        info_dict[u"taxtype"] = ""
                    else:
                        info_dict[u"taxtype"] = taxfree.get_text().strip()

                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict[u'totalPrice'] = totalPrice.span.get_text()

                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict[u'unitPrice'] = unitPrice.get("data-price")
                    info_dict["validdate"] = datetime.datetime.now()
                except Exception:
                    # Best effort: an item that does not have the expected
                    # markup is skipped. Only Exception is caught so that
                    # Ctrl-C / SystemExit still stop the crawl.
                    continue

                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID": info_dict["houseID"],
                    "totalPrice": info_dict["totalPrice"],
                })
                house_ids.add(info_dict["houseID"])

        with model.database.atomic():
            # The try stays inside the atomic block on purpose: a failing
            # statement is reported and whatever already succeeded is kept.
            try:
                for data in data_source:
                    model.Houseinfo.insert(data).on_conflict(
                        conflict_target=(model.Houseinfo.houseID, ),
                        update=data,
                    ).execute()
                # Nothing parsed on this page -> nothing to insert.
                if hisprice_data_source:
                    model.Hisprice.insert_many(hisprice_data_source).execute()
            except Exception as e:
                print("error: %s" % e)
        log_progress("GetHouseByRegionlist inserted", district, page + 1,
                     total_pages)
        time.sleep(0.5)
예제 #19
0
def get_rent_percommunity(communityname):
    """Scrape the rental listings of one community and store each of them.

    Searches ``BASE_URL`` for ``communityname``, walks every result page,
    parses each ``<li>`` under ``ul.house-lst`` and upserts it into
    ``model.Rentinfo`` one row at a time.

    Args:
        communityname: Community name; UTF-8 encoded and URL-quoted before
            it is placed in the search URL.

    Returns:
        None. Returns early when ``check_block`` is truthy for the first
        page.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns None.
    """
    quoted_name = urllib.parse.quote(communityname.encode('utf8'))
    url = BASE_URL + u"zufang/rs" + quoted_name + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages is None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # Iteration 0 used the un-suffixed URL, so iteration ``page``
            # requests pg<page + 1>.
            url_page = BASE_URL + u"rent/pg%drs%s/" % (page + 1, quoted_name)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        for ultag in soup.findAll("ul", {"class": "house-lst"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "info-panel"})
                    info_dict[u'title'] = housetitle.get_text().strip()
                    info_dict[u'link'] = housetitle.a.get('href')
                    # The ID is the last path segment of the link without
                    # its file extension.
                    houseID = int(
                        housetitle.a.get('href').split("/")[-1].split(".")[0])
                    info_dict[u'houseID'] = houseID

                    region = name.find("span", {"class": "region"})
                    info_dict[u'region'] = region.get_text().strip()

                    zone = name.find("span", {"class": "zone"})
                    info_dict[u'zone'] = zone.get_text().strip()

                    meters = name.find("span", {"class": "meters"})
                    info_dict[u'meters'] = meters.get_text().strip()

                    other = name.find("div", {"class": "con"})
                    info_dict[u'other'] = other.get_text().strip()

                    # A listing without a subway / decoration tag is stored
                    # with an empty value instead of being dropped, matching
                    # the other get_rent_percommunity variants in this file.
                    subway = name.find("span", {"class": "fang-subway-ex"})
                    if subway is None:
                        info_dict[u'subway'] = ""
                    else:
                        info_dict[u'subway'] = subway.span.get_text().strip()

                    decoration = name.find("span", {"class": "decoration-ex"})
                    if decoration is None:
                        info_dict[u'decoration'] = ""
                    else:
                        info_dict[u'decoration'] = \
                            decoration.span.get_text().strip()

                    heating = name.find("span", {"class": "heating-ex"})
                    info_dict[u'heating'] = heating.span.get_text().strip()

                    price = name.find("div", {"class": "price"})
                    info_dict[u'price'] = int(price.span.get_text().strip())

                    pricepre = name.find("div", {"class": "price-pre"})
                    info_dict[u'pricepre'] = pricepre.get_text().strip()
                except Exception:
                    # Best effort: skip items that lack the expected markup,
                    # without swallowing KeyboardInterrupt / SystemExit.
                    continue
                # Store the parsed listing.
                model.Rentinfo.insert(**info_dict).upsert().execute()

                # NOTE(review): this pause runs after every stored row, not
                # once per page; kept as in the original.
                time.sleep(1)
예제 #20
0
def get_community_perregion(regionname):
    """Scrape the communities listed for one region and store them.

    Walks every page of ``BASE_URL + "xiaoqu/<regionname>/"``, parses each
    ``li.clear`` element, enriches it with the result of
    ``get_communityinfo_by_url`` for the community's link and upserts the
    page's rows into ``model.Community``.

    Args:
        regionname: Region slug appended to the ``xiaoqu/`` path.

    Returns:
        None. Returns early when ``check_block`` is truthy for the first
        page.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns None.
    """
    url = BASE_URL + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages is None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # Iteration 0 used the un-suffixed URL (first page), so
            # iteration ``page`` requests pg<page + 1>; pg<page> would
            # repeat the first page and never reach the last one.
            url_page = BASE_URL + u"xiaoqu/" + regionname + \
                "/pg%d/" % (page + 1)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        nameList = soup.findAll("li", {"class": "clear"})
        log_progress("GetCommunityByRegionlist", regionname, page + 1,
                     total_pages)
        data_source = []
        for name in nameList:  # one community per iteration
            info_dict = {}
            try:
                communitytitle = name.find("div", {"class": "title"})
                title = communitytitle.get_text().strip('\n')
                link = communitytitle.a.get('href')
                info_dict[u'title'] = title
                info_dict[u'link'] = link

                district = name.find("a", {"class": "district"})
                info_dict[u'district'] = district.get_text()

                bizcircle = name.find("a", {"class": "bizcircle"})
                info_dict[u'bizcircle'] = bizcircle.get_text()

                tagList = name.find("div", {"class": "tagList"})
                info_dict[u'tagList'] = tagList.get_text().strip('\n')

                onsale = name.find("a", {"class": "totalSellCount"})
                info_dict[u'onsale'] = onsale.span.get_text().strip('\n')

                # The rental link is located through its title attribute;
                # only the text before the first u'套' is kept.
                onrent = name.find("a", {"title": title + u"租房"})
                info_dict[u'onrent'] = \
                    onrent.get_text().strip('\n').split(u'套')[0]

                # The attribute is read as 'data-housecode', the name the
                # other crawlers in this file use; the previous
                # 'data_analysis-housecode' looks like a search-and-replace
                # slip.
                info_dict[u'id'] = name.get('data-housecode')

                price = name.find("div", {"class": "totalPrice"})
                info_dict[u'price'] = price.span.get_text().strip('\n')

                # dict.update works on Python 2 and 3 alike; the former
                # iteritems() loop raised AttributeError on Python 3, which
                # the blanket except turned into "skip every community".
                communityinfo = get_communityinfo_by_url(link)
                info_dict.update(communityinfo)
            except Exception:
                # Best effort: skip items that cannot be parsed, without
                # swallowing KeyboardInterrupt / SystemExit.
                continue
            data_source.append(info_dict)

        with model.database.atomic():
            # Skip the statement when nothing on the page could be parsed.
            if data_source:
                model.Community.insert_many(data_source).upsert().execute()
        time.sleep(1)
예제 #21
0
def get_rent_percommunity(city, communityname):
    """Scrape the rental listings of one community and store them per page.

    Searches ``http://<city>.lianjia.com/`` for ``communityname``, walks
    every result page, parses each ``<li>`` under ``ul.house-lst`` and
    upserts the page's rows into ``model.Rentinfo`` in one batch.

    Args:
        city: Sub-domain prefix interpolated into the base URL.
        communityname: Community name; UTF-8 encoded and URL-quoted before
            it is placed in the search URL.

    Returns:
        None. Returns early when ``check_block`` is truthy for the first
        page.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns None.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    quoted_name = urllib.parse.quote(communityname.encode('utf8'))
    url = baseUrl + u"zufang/rs" + quoted_name + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages is None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # Iteration 0 used the un-suffixed URL (first page), so
            # iteration ``page`` requests pg<page + 1>; pg<page> would
            # repeat the first page and never reach the last one.
            url_page = baseUrl + u"rent/pg%drs%s/" % (page + 1, quoted_name)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        log_progress("GetRentByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        for ultag in soup.findAll("ul", {"class": "house-lst"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "info-panel"})
                    info_dict[u'title'] = housetitle.get_text().strip()
                    info_dict[u'link'] = housetitle.a.get('href')
                    # The ID is the last path segment of the link without
                    # its file extension.
                    houseID = housetitle.a.get('href').split("/")[-1].split(
                        ".")[0]
                    info_dict[u'houseID'] = houseID

                    region = name.find("span", {"class": "region"})
                    info_dict[u'region'] = region.get_text().strip()

                    zone = name.find("span", {"class": "zone"})
                    info_dict[u'zone'] = zone.get_text().strip()

                    meters = name.find("span", {"class": "meters"})
                    info_dict[u'meters'] = meters.get_text().strip()

                    other = name.find("div", {"class": "con"})
                    info_dict[u'other'] = other.get_text().strip()

                    # Subway and decoration tags are optional: store "".
                    subway = name.find("span", {"class": "fang-subway-ex"})
                    if subway is None:
                        info_dict[u'subway'] = ""
                    else:
                        info_dict[u'subway'] = subway.span.get_text().strip()

                    decoration = name.find("span", {"class": "decoration-ex"})
                    if decoration is None:
                        info_dict[u'decoration'] = ""
                    else:
                        info_dict[u'decoration'] = \
                            decoration.span.get_text().strip()

                    heating = name.find("span", {"class": "heating-ex"})
                    info_dict[u'heating'] = heating.span.get_text().strip()

                    price = name.find("div", {"class": "price"})
                    info_dict[u'price'] = int(price.span.get_text().strip())

                    pricepre = name.find("div", {"class": "price-pre"})
                    info_dict[u'pricepre'] = pricepre.get_text().strip()
                except Exception:
                    # Best effort: skip items that lack the expected markup,
                    # without swallowing KeyboardInterrupt / SystemExit.
                    continue
                data_source.append(info_dict)

        with model.database.atomic():
            if data_source:
                model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
예제 #22
0
def get_rent_percommunity(city, communityname):
    """Scrape the rental listings of one community, storing each as parsed.

    Searches ``http://<city>.lianjia.com/`` for ``communityname``, walks
    every result page, parses each ``<li>`` under ``ul.house-lst`` and
    inserts it into ``model.Rentinfo`` with an ON CONFLICT clause.

    Args:
        city: Sub-domain prefix interpolated into the base URL.
        communityname: Community name; UTF-8 encoded and URL-quoted before
            it is placed in the search URL.

    Returns:
        None. Returns early when ``check_block`` is truthy for the first
        page.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns None.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    quoted_name = urllib2.quote(communityname.encode('utf8'))
    url = baseUrl + u"zufang/rs" + quoted_name + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages is None:
        row = model.Rentinfo.select().count()
        # Single-argument print calls behave the same on Python 2 and 3.
        print(url)
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # Iteration 0 used the un-suffixed URL (first page), so
            # iteration ``page`` requests pg<page + 1>; pg<page> would
            # repeat the first page and never reach the last one.
            url_page = baseUrl + u"rent/pg%drs%s/" % (page + 1, quoted_name)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        log_progress("GetRentByCommunitylist",
                     communityname, page + 1, total_pages)
        for ultag in soup.findAll("ul", {"class": "house-lst"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "info-panel"})
                    info_dict[u'title'] = housetitle.get_text().strip()
                    info_dict[u'link'] = housetitle.a.get('href')
                    # The ID is the last path segment of the link without
                    # its file extension.
                    houseID = housetitle.a.get(
                        'href').split("/")[-1].split(".")[0]
                    info_dict[u'houseID'] = houseID

                    region = name.find("span", {"class": "region"})
                    info_dict[u'region'] = region.get_text().strip()

                    zone = name.find("span", {"class": "zone"})
                    info_dict[u'zone'] = zone.get_text().strip()

                    meters = name.find("span", {"class": "meters"})
                    info_dict[u'meters'] = meters.get_text().strip()

                    other = name.find("div", {"class": "con"})
                    info_dict[u'other'] = other.get_text().strip()

                    # Subway and decoration tags are optional: store "".
                    subway = name.find("span", {"class": "fang-subway-ex"})
                    if subway is None:
                        info_dict[u'subway'] = ""
                    else:
                        info_dict[u'subway'] = subway.span.get_text().strip()

                    decoration = name.find("span", {"class": "decoration-ex"})
                    if decoration is None:
                        info_dict[u'decoration'] = ""
                    else:
                        info_dict[u'decoration'] = \
                            decoration.span.get_text().strip()

                    heating = name.find("span", {"class": "heating-ex"})
                    info_dict[u'heating'] = heating.span.get_text().strip()

                    price = name.find("div", {"class": "price"})
                    info_dict[u'price'] = int(price.span.get_text().strip())

                    pricepre = name.find("div", {"class": "price-pre"})
                    info_dict[u'pricepre'] = pricepre.get_text().strip()
                    info_dict[u'updatedate'] = datetime.datetime.now()

                    # NOTE(review): the conflict target names
                    # model.Community.id although the insert goes into
                    # Rentinfo -- presumably a copy/paste slip; confirm
                    # against the model definitions before changing it.
                    model.Rentinfo.insert(info_dict).on_conflict(
                        conflict_target=[model.Community.id],
                        preserve=[
                            model.Rentinfo.title,
                            model.Rentinfo.link,
                            model.Rentinfo.region,
                            model.Rentinfo.zone,
                            model.Rentinfo.meters,
                            model.Rentinfo.other,
                            model.Rentinfo.subway,
                            model.Rentinfo.decoration,
                            model.Rentinfo.heating,
                            model.Rentinfo.price,
                            model.Rentinfo.pricepre,
                            model.Rentinfo.updatedate,
                        ],
                        update={}).execute()
                except Exception as e:
                    # Parsing and storing share one handler: report the
                    # failure plus whatever was parsed so far, then move on.
                    print("except~~!!")
                    logging.error(e)
                    print(info_dict)
                    continue

        time.sleep(1)
예제 #23
0
def get_sell_percommunity(communityname):
    """Scrape the completed-sale records of one community and store them.

    Searches ``BASE_URL + "chengjiao/"`` for ``communityname``, walks every
    result page, parses each ``<li>`` under ``ul.listContent`` and upserts
    the page's rows into ``model.Sellinfo`` in one batch.

    Args:
        communityname: Community name; UTF-8 encoded and URL-quoted before
            it is placed in the search URL.

    Returns:
        None. Returns early when ``check_block`` is truthy for the first
        page.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns None.
    """
    quoted_name = urllib2.quote(communityname.encode('utf8'))
    url = BASE_URL + u"chengjiao/rs" + quoted_name + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages is None:
        row = model.Sellinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # Iteration 0 used the un-suffixed URL (first page), so
            # iteration ``page`` requests pg<page + 1>; pg<page> would
            # repeat the first page and never reach the last one.
            url_page = BASE_URL + u"chengjiao/pg%drs%s/" % (
                page + 1, quoted_name)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        log_progress("GetSellByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        for ultag in soup.findAll("ul", {"class": "listContent"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    title_text = housetitle.get_text().strip()
                    info_dict[u'title'] = title_text
                    info_dict[u'link'] = housetitle.a.get('href')
                    # The ID is the last path segment of the link without
                    # its file extension.
                    houseID = housetitle.a.get('href').split("/")[-1].split(
                        ".")[0]
                    info_dict[u'houseID'] = houseID.strip()

                    # The title is split on single spaces into community,
                    # house type and area; fewer than three parts raises
                    # IndexError and skips the item.
                    house = title_text.split(' ')
                    info_dict[u'community'] = house[0].strip()
                    info_dict[u'housetype'] = house[1].strip()
                    info_dict[u'square'] = house[2].strip()

                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('|')
                    info_dict[u'direction'] = info[0].strip()
                    info_dict[u'status'] = info[1].strip()

                    housefloor = name.find("div", {"class": "positionInfo"})
                    floor_all = housefloor.get_text().strip().split(' ')
                    info_dict[u'floor'] = floor_all[0].strip()
                    info_dict[u'years'] = floor_all[-1].strip()

                    followInfo = name.find("div", {"class": "source"})
                    info_dict[u'source'] = followInfo.get_text().strip()

                    # Prices sit in a nested <span> when there is one,
                    # otherwise the div's own text is used.
                    totalPrice = name.find("div", {"class": "totalPrice"})
                    if totalPrice.span is None:
                        info_dict[u'totalPrice'] = \
                            totalPrice.get_text().strip()
                    else:
                        info_dict[u'totalPrice'] = \
                            totalPrice.span.get_text().strip()

                    unitPrice = name.find("div", {"class": "unitPrice"})
                    if unitPrice.span is None:
                        info_dict[u'unitPrice'] = unitPrice.get_text().strip()
                    else:
                        info_dict[u'unitPrice'] = \
                            unitPrice.span.get_text().strip()

                    dealDate = name.find("div", {"class": "dealDate"})
                    info_dict[u'dealdate'] = \
                        dealDate.get_text().strip().replace('.', '-')
                except Exception:
                    # Best effort: skip items that lack the expected markup,
                    # without swallowing KeyboardInterrupt / SystemExit.
                    continue
                data_source.append(info_dict)

        with model.database.atomic():
            # Skip the statement when nothing on the page could be parsed.
            if data_source:
                model.Sellinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
예제 #24
0
def get_house_percommunity(city, communityname):
    """Scrape the second-hand houses on sale in one community and store them.

    Searches ``http://<city>.lianjia.com/ershoufang/`` (with the ``sf1``
    filter) for ``communityname``, walks every result page, parses each
    ``li.clear`` element and writes the page's rows to ``model.Houseinfo``
    and ``model.Hisprice`` with ``replace_many``.

    Args:
        city: Sub-domain prefix interpolated into the base URL.
        communityname: Community name; UTF-8 encoded and URL-quoted for the
            URL, and stored unchanged as each row's ``community``.

    Returns:
        None. Returns early when ``check_block`` is truthy for the first
        page.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns None.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    normal_housing = "sf1"
    quoted_name = urllib.parse.quote(communityname.encode('utf8'))
    url = baseUrl + u"ershoufang/%srs%s/" % (normal_housing, quoted_name)
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages is None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # Iteration 0 used the un-suffixed URL (first page), so
            # iteration ``page`` requests pg<page + 1>; pg<page> would
            # repeat the first page and never reach the last one.
            url_page = baseUrl + u"ershoufang/pg%d%srs%s/" % (
                page + 1, normal_housing, quoted_name)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        nameList = soup.findAll("li", {"class": "clear"})
        log_progress("GetHouseByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        hisprice_data_source = []
        for name in nameList:  # one house per iteration
            info_dict = {}
            try:
                housetitle = name.find("div", {"class": "title"})
                info_dict[u'title'] = housetitle.a.get_text().strip()
                info_dict[u'link'] = housetitle.a.get('href')

                # The address text is a '|'-separated record; fewer than six
                # fields raises IndexError and skips the item.
                houseaddr = name.find("div", {"class": "address"})
                info = houseaddr.div.get_text().split('|')
                info_dict[u'community'] = communityname
                info_dict[u'housetype'] = info[0].strip()
                # Drop the last two characters of the area field.
                info_dict[u'square'] = info[1].strip()[:-2]
                info_dict[u'direction'] = info[2].strip()
                info_dict[u'decoration'] = info[3].strip()
                info_dict[u'floor'] = info[4].strip()
                # Keep only what precedes the first "年"; a field without it
                # raises ValueError and skips the item.
                years_field = info[5].strip()
                info_dict[u'years'] = years_field[:years_field.index("年")]

                followInfo = name.find("div", {"class": "followInfo"})
                info_dict[u'followInfo'] = followInfo.get_text()

                tag = name.find("div", {"class": "tag"})
                info_dict[u'tagtype'] = tag.get_text().strip()

                totalPrice = name.find("div", {"class": "totalPrice"})
                info_dict[u'totalPrice'] = totalPrice.span.get_text()

                unitPrice = name.find("div", {"class": "unitPrice"})
                info_dict[u'unitPrice'] = unitPrice.get('data-price')
                info_dict[u'houseID'] = unitPrice.get('data-hid')
            except Exception:
                # Best effort: skip items that lack the expected markup,
                # without swallowing KeyboardInterrupt / SystemExit.
                continue
            data_source.append(info_dict)
            hisprice_data_source.append({
                "houseID": info_dict["houseID"],
                "totalPrice": info_dict["totalPrice"]
            })

        with model.database.atomic():
            if data_source:
                model.Houseinfo.replace_many(data_source).execute()
            if hisprice_data_source:
                model.Hisprice.replace_many(hisprice_data_source).execute()
        time.sleep(1)
예제 #25
0
def get_house_perregion(district):
    """Scrape the second-hand house listings of one district and store them.

    Walks every page of ``BASE_URL + "ershoufang/<district>/"``, parses each
    ``<li>`` under ``ul.sellListContent`` and upserts the page's rows into
    ``model.Houseinfo`` and ``model.Hisprice``.

    Args:
        district: District slug interpolated into the listing path.

    Returns:
        None. Returns early when ``check_block`` is truthy for the first
        page.

    Raises:
        RuntimeError: If ``misc.get_total_pages`` returns None.
    """
    url = BASE_URL + u"ershoufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # Iteration 0 used the un-suffixed URL (first page), so
            # iteration ``page`` requests pg<page + 1>; pg<page> would
            # repeat the first page and never reach the last one.
            url_page = BASE_URL + u"ershoufang/%s/pg%d/" % (
                district, page + 1)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        log_progress("GetHouseByRegionlist", district, page + 1, total_pages)
        data_source = []
        hisprice_data_source = []
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict[u'title'] = housetitle.a.get_text().strip()
                    info_dict[u'link'] = housetitle.a.get('href')
                    # Read 'data-housecode' (and 'data-price' below), the
                    # attribute names the other crawlers in this file use;
                    # the previous 'data_analysis-...' names look like a
                    # search-and-replace slip.
                    houseID = housetitle.a.get('data-housecode')
                    info_dict[u'houseID'] = houseID

                    # The field separator depends on the configured city.
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    if CITY == 'bj':
                        info = houseinfo.get_text().split('/')
                    else:
                        info = houseinfo.get_text().split('|')
                    info_dict[u'community'] = info[0]
                    info_dict[u'housetype'] = info[1]
                    info_dict[u'square'] = info[2]
                    info_dict[u'direction'] = info[3]
                    info_dict[u'decoration'] = info[4]

                    # Both columns receive the full positionInfo text.
                    housefloor = name.find("div", {"class": "positionInfo"})
                    position_text = housefloor.get_text().strip()
                    info_dict[u'years'] = position_text
                    info_dict[u'floor'] = position_text

                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict[u'followInfo'] = followInfo.get_text().strip()

                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree is None:
                        info_dict[u"taxtype"] = ""
                    else:
                        info_dict[u"taxtype"] = taxfree.get_text().strip()

                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict[u'totalPrice'] = totalPrice.span.get_text()

                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict[u'unitPrice'] = unitPrice.get("data-price")
                except Exception:
                    # Best effort: skip items that lack the expected markup,
                    # without swallowing KeyboardInterrupt / SystemExit.
                    continue

                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID": info_dict["houseID"],
                    "totalPrice": info_dict["totalPrice"],
                })

        with model.database.atomic():
            # Both lists are filled together, so one check covers both
            # statements; skip them when the page yielded nothing.
            if data_source:
                model.Houseinfo.insert_many(data_source).upsert().execute()
                model.Hisprice.insert_many(
                    hisprice_data_source).upsert().execute()
        time.sleep(1)
예제 #26
0
def get_sell_percommunity(city, communityname):
    """Scrape the "chengjiao" result pages of one community into Sellinfo.

    The search URL is built from ``city`` (used as the Lianjia sub-domain)
    and ``communityname``.  Every ``<li>`` found under a ``ul.listContent``
    element is parsed into a dict, and the dicts of one page are written
    with a single ``insert_many(...).upsert()`` inside a transaction.

    Args:
        city: Sub-domain prefix for ``http://<city>.lianjia.com/``.
        communityname: Unicode community name; it is UTF-8 encoded and
            URL-quoted before being put into the URL.

    Returns:
        None.  Returns early when ``check_block(soup)`` is truthy for the
        first page.

    Any exception raised while scraping (including the ``RuntimeError``
    raised when ``misc.get_total_pages`` returns ``None``) is logged and
    swallowed, so this function never propagates an error to its caller.
    """
    try:
        baseUrl = u"http://%s.lianjia.com/" % (city)
        # The quoted name is identical for every page, so compute it once.
        quoted_name = urllib2.quote(communityname.encode('utf8'))
        url = baseUrl + u"chengjiao/rs" + quoted_name + "/"
        source_code = misc.get_source_code(url)
        soup = BeautifulSoup(source_code, 'lxml')

        if check_block(soup):
            return
        total_pages = misc.get_total_pages(url)

        if total_pages is None:
            row = model.Sellinfo.select().count()
            raise RuntimeError("Finish at %s because total_pages is None" % row)

        # Pages are numbered from 1.  Page 1 is the document fetched above;
        # pages 2..total_pages are requested as "pgN".  Iterating from 0 and
        # requesting "pg<page>" would re-fetch page 1 and never reach the
        # last page.
        for page in range(1, total_pages + 1):
            if page > 1:
                url_page = baseUrl + \
                    u"chengjiao/pg%drs%s/" % (page, quoted_name)
                source_code = misc.get_source_code(url_page)
                soup = BeautifulSoup(source_code, 'lxml')

            logging.info("Progress: %s %s: current page %s total pages %s", "GetSellByCommunitylist", communityname, page, total_pages)
            data_source = []
            for ultag in soup.findAll("ul", {"class": "listContent"}):
                for name in ultag.find_all('li'):
                    info_dict = {}
                    try:
                        # Listing price and deal cycle; this node is optional,
                        # so fall back to an empty string when it is missing.
                        try:
                            dealinfo = name.find('div', {'class': 'dealCycleeInfo'}).find('span', {'class': 'dealCycleTxt'})
                            info_dict[u'dealinfo'] = dealinfo.get_text().strip()
                        except Exception:
                            info_dict[u'dealinfo'] = ''

                        housetitle = name.find("div", {"class": "title"})
                        title_text = housetitle.get_text().strip()
                        link = housetitle.a.get('href')
                        info_dict[u'title'] = title_text
                        info_dict[u'link'] = link
                        # The house id is the file name of the detail link
                        # without its extension.
                        houseID = link.split("/")[-1].split(".")[0]
                        info_dict[u'houseID'] = houseID.strip()

                        # The title is split on spaces; the 2nd and 3rd tokens
                        # are stored as house type and square.
                        house = title_text.split(' ')
                        info_dict[u'community'] = communityname
                        info_dict[u'housetype'] = \
                            house[1].strip() if len(house) > 1 else ''
                        info_dict[u'square'] = \
                            house[2].strip() if len(house) > 2 else ''

                        houseinfo = name.find("div", {"class": "houseInfo"})
                        info = houseinfo.get_text().split('|')
                        info_dict[u'direction'] = info[0].strip()
                        info_dict[u'status'] = \
                            info[1].strip() if len(info) > 1 else ''

                        housefloor = name.find("div", {"class": "positionInfo"})
                        floor_all = housefloor.get_text().strip().split(' ')
                        info_dict[u'floor'] = floor_all[0].strip()
                        info_dict[u'years'] = floor_all[-1].strip()

                        followInfo = name.find("div", {"class": "source"})
                        info_dict[u'source'] = followInfo.get_text().strip()

                        # Prices: prefer the inner <span> when present.
                        totalPrice = name.find("div", {"class": "totalPrice"})
                        if totalPrice.span is None:
                            info_dict[u'totalPrice'] = \
                                totalPrice.get_text().strip()
                        else:
                            info_dict[u'totalPrice'] = \
                                totalPrice.span.get_text().strip()

                        unitPrice = name.find("div", {"class": "unitPrice"})
                        if unitPrice.span is None:
                            info_dict[u'unitPrice'] = \
                                unitPrice.get_text().strip()
                        else:
                            info_dict[u'unitPrice'] = \
                                unitPrice.span.get_text().strip()

                        dealDate = name.find("div", {"class": "dealDate"})
                        info_dict[u'dealdate'] = \
                            dealDate.get_text().strip().replace('.', '-')
                    except Exception:
                        # A half-parsed dict must not reach insert_many, so
                        # log the failure and skip this listing.
                        logging.exception("get_sell_percommunity: failed to parse a listing, skipped")
                        continue
                    data_source.append(info_dict)

            if data_source:
                with model.database.atomic():
                    logging.info("get_sell_percommunity: insert %d sell info data source info to database", len(data_source))
                    model.Sellinfo.insert_many(data_source).upsert().execute()
    except Exception:
        # Best effort: report the failure with its traceback and return.
        logging.exception("get_sell_percommunity: scraping failed")
예제 #27
0
def get_house_percommunity(communityname):
    """Scrape the "ershoufang" result pages of one community.

    The search URL is built from the module-level ``BASE_URL`` and
    ``communityname``.  Every ``li.clear`` element of a page is parsed into
    a dict; the dicts of one page are bulk-upserted into ``Houseinfo`` and
    the matching (houseID, totalPrice) pairs into ``Hisprice`` inside one
    transaction.  The function sleeps one second after each page.

    Args:
        communityname: Unicode community name; it is UTF-8 encoded and
            URL-quoted before being put into the URL.

    Returns:
        None.  Returns early when ``check_block(soup)`` is truthy for the
        first page.

    Raises:
        RuntimeError: when ``misc.get_total_pages`` returns ``None``.
    """
    # The quoted name is identical for every page, so compute it once.
    quoted_name = urllib2.quote(communityname.encode('utf8'))
    url = BASE_URL + u"ershoufang/rs" + quoted_name + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages is None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    # Pages are numbered from 1.  Page 1 is the document fetched above;
    # pages 2..total_pages are requested as "pgN".  Iterating from 0 and
    # requesting "pg<page>" would re-fetch page 1 and never reach the last
    # page.
    for page in range(1, total_pages + 1):
        if page > 1:
            url_page = BASE_URL + u"ershoufang/pg%drs%s/" % (
                page, quoted_name)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        nameList = soup.findAll("li", {"class": "clear"})
        log_progress("GetHouseByCommunitylist", communityname, page,
                     total_pages)
        data_source = []
        hisprice_data_source = []
        for name in nameList:  # per house loop
            info_dict = {}
            try:
                housetitle = name.find("div", {"class": "title"})
                info_dict[u'title'] = housetitle.a.get_text().strip()
                info_dict[u'link'] = housetitle.a.get('href')

                # The address line uses '/' as separator for city 'bj' and
                # '|' for every other city.
                houseaddr = name.find("div", {"class": "address"})
                separator = '/' if CITY == 'bj' else '|'
                info = houseaddr.div.get_text().split(separator)
                info_dict[u'community'] = info[0].strip()
                info_dict[u'housetype'] = info[1].strip()
                info_dict[u'square'] = info[2].strip()
                info_dict[u'direction'] = info[3].strip()
                info_dict[u'decoration'] = info[4].strip()

                # Only the text before the first '-' is used; its first and
                # last space-separated tokens become floor and years.
                housefloor = name.find("div", {"class": "flood"})
                floor_all = housefloor.div.get_text().split(
                    '-')[0].strip().split(' ')
                info_dict[u'floor'] = floor_all[0].strip()
                info_dict[u'years'] = floor_all[-1].strip()

                followInfo = name.find("div", {"class": "followInfo"})
                info_dict[u'followInfo'] = followInfo.get_text()

                tax = name.find("div", {"class": "tag"})
                info_dict[u'taxtype'] = tax.get_text().strip()

                totalPrice = name.find("div", {"class": "totalPrice"})
                info_dict[u'totalPrice'] = totalPrice.span.get_text()

                # NOTE(review): the attribute names below are kept verbatim;
                # confirm they match the page markup (HTML data attributes
                # normally start with "data-").
                unitPrice = name.find("div", {"class": "unitPrice"})
                info_dict[u'unitPrice'] = unitPrice.get('data_analysis-price')
                info_dict[u'houseID'] = unitPrice.get('data_analysis-hid')
            except Exception:
                # Best effort: an entry that cannot be parsed completely is
                # skipped.  Only Exception is caught so that Ctrl-C and
                # SystemExit still stop the scraper.
                continue
            # houseinfo insert into mysql
            data_source.append(info_dict)
            hisprice_data_source.append({
                "houseID": info_dict["houseID"],
                "totalPrice": info_dict["totalPrice"]
            })

        # Both lists always have the same length; skip the database round
        # trip when the page produced no rows.
        if data_source:
            with model.database.atomic():
                model.Houseinfo.insert_many(data_source).upsert().execute()
                model.Hisprice.insert_many(
                    hisprice_data_source).upsert().execute()
        time.sleep(1)
예제 #28
0
def get_sell_percommunity(city, communityname):
    """Scrape the "chengjiao" result pages of one community into Sellinfo.

    The search URL is built from ``city`` (used as the Lianjia sub-domain)
    and ``communityname``.  Every ``<li>`` found under a ``ul.listContent``
    element is parsed into a dict and written immediately with
    ``insert(...).on_conflict(...)`` keyed on ``houseID``.  The function
    sleeps one second after each page.

    Args:
        city: Sub-domain prefix for ``http://<city>.lianjia.com/``.
        communityname: Unicode community name; it is UTF-8 encoded and
            URL-quoted before being put into the URL.

    Returns:
        None.  Returns early when ``check_block(soup)`` is truthy for the
        first page.

    Raises:
        RuntimeError: when ``misc.get_total_pages`` returns ``None``.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    # The quoted name is identical for every page, so compute it once.
    quoted_name = urllib2.quote(communityname.encode('utf8'))
    url = baseUrl + u"chengjiao/rs" + quoted_name + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages is None:
        row = model.Sellinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    # Page 1 was already loaded by the initial request above, so a new URL
    # is only needed from the second page onwards.
    for page in range(1, total_pages + 1):
        if page > 1:
            url_page = baseUrl + \
                u"chengjiao/pg%drs%s/" % (page, quoted_name)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        log_progress("GetSellByCommunitylist",
                     communityname, page, total_pages)
        for ultag in soup.findAll("ul", {"class": "listContent"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    title_text = housetitle.get_text().strip()
                    link = housetitle.a.get('href')
                    info_dict[u'title'] = title_text
                    info_dict[u'link'] = link
                    # The house id is the file name of the detail link
                    # without its extension.
                    houseID = link.split("/")[-1].split(".")[0]
                    info_dict[u'houseID'] = houseID.strip()

                    # The title is split on spaces; the 2nd and 3rd tokens
                    # are stored as house type and square.
                    house = title_text.split(' ')
                    info_dict[u'community'] = communityname
                    info_dict[u'housetype'] = \
                        house[1].strip() if 1 < len(house) else ''
                    info_dict[u'square'] = \
                        house[2].strip() if 2 < len(house) else ''

                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('|')
                    info_dict[u'direction'] = info[0].strip()
                    info_dict[u'status'] = \
                        info[1].strip() if 1 < len(info) else ''

                    housefloor = name.find("div", {"class": "positionInfo"})
                    floor_all = housefloor.get_text().strip().split(' ')
                    info_dict[u'floor'] = floor_all[0].strip()
                    info_dict[u'years'] = floor_all[-1].strip()

                    followInfo = name.find("div", {"class": "source"})
                    info_dict[u'source'] = followInfo.get_text().strip()

                    # Prices: prefer the inner <span> when present.
                    totalPrice = name.find("div", {"class": "totalPrice"})
                    if totalPrice.span is None:
                        info_dict[u'totalPrice'] = \
                            totalPrice.get_text().strip()
                    else:
                        info_dict[u'totalPrice'] = \
                            totalPrice.span.get_text().strip()

                    unitPrice = name.find("div", {"class": "unitPrice"})
                    if unitPrice.span is None:
                        info_dict[u'unitPrice'] = \
                            unitPrice.get_text().strip()
                    else:
                        info_dict[u'unitPrice'] = \
                            unitPrice.span.get_text().strip()

                    dealDate = name.find("div", {"class": "dealDate"})
                    info_dict[u'dealdate'] = \
                        dealDate.get_text().strip().replace('.', '-')
                    info_dict[u'updatedate'] = datetime.datetime.now()

                    # Insert the row; on a houseID conflict the listed
                    # columns are passed as ``preserve`` to on_conflict.
                    model.Sellinfo.insert(info_dict).on_conflict(
                        conflict_target=[model.Sellinfo.houseID],
                        preserve=[
                            model.Sellinfo.title,
                            model.Sellinfo.link,
                            model.Sellinfo.community,
                            model.Sellinfo.years,
                            model.Sellinfo.housetype,
                            model.Sellinfo.square,
                            model.Sellinfo.direction,
                            model.Sellinfo.floor,
                            model.Sellinfo.status,
                            model.Sellinfo.source,
                            model.Sellinfo.totalPrice,
                            model.Sellinfo.unitPrice,
                            model.Sellinfo.dealdate,
                            model.Sellinfo.updatedate,
                        ],
                        update={}).execute()
                except Exception as e:
                    # Best effort: log the failure with its traceback, dump
                    # what was parsed so far and move on to the next entry.
                    logging.exception(e)
                    print(info_dict)
                    continue
        time.sleep(1)