Пример #1
0
def get_list(url, region, page=1):
    """Store the new ads found on a listing and queue a check for each.

    Starting at ``page``, every link returned by
    ``adsparser.ads_list(url, page).parse()`` that has no ``Ad`` entity yet
    is parsed with ``adsparser.parse``, turned into an ``Ad`` keyed by the
    link, saved with ``db.put`` and added to the ``quick`` task queue
    (``/ad/check``). The following page is processed when no already-stored
    ad was met on the current page, or when more than 36 new ads were
    collected from it.

    Args:
        url: listing URL; also stored on every ad as ``parent_url``.
        region: passed through to ``adsparser.parse``.
        page: listing page to start from (defaults to 1).
    """
    # Pages are walked in a loop rather than by self-recursion so a long
    # listing cannot exhaust the interpreter stack.
    while True:
        links = adsparser.ads_list(url, page).parse()
        parse_next_page = True
        ads = []

        for link in links:
            if Ad.get_by_key_name(link) is not None:
                print("ad already found")
                parse_next_page = False
                continue

            try:
                parser = adsparser.parse(link, region)
            except Exception:
                # Best effort: one unparsable ad must not abort the page.
                # Only Exception is caught so interpreter/runtime signals
                # that derive from BaseException still propagate.
                print("failed to parse ad, skipping: %r" % (link,))
                continue

            now = datetime.datetime.now()
            if parser.date:
                # Keep the parsed calendar date, use the current time of day.
                created_at = datetime.datetime.combine(parser.date.date(),
                                                       now.time())
            else:
                created_at = now

            ad = Ad(key_name=link,
                    title=parser.title,
                    source=parser.get_name(),
                    md5=parser.md5,
                    contact=parser.contact,
                    phone=parser.phone,
                    price=parser.price,
                    parent_url=url,
                    created_at=created_at,
                    region=parser.region)

            if parser.address_id:
                ad.address_id = parser.address_id[0]

            if parser.agent:
                ad.rating = 0

            print(ad.created_at)

            ads.append(ad)

            # Pause between ads that were actually fetched and parsed.
            time.sleep(1)

        print("saving ads: %d" % len(ads))
        db.put(ads)

        for ad in ads:
            taskqueue.add(queue_name='quick', url='/ad/check',
                          params={'key': ad.key().name()})

        if not (parse_next_page or len(ads) > 36):
            break

        page += 1
        print("parsing page %d" % page)
Пример #2
0
    def post(self):
        """Queue a parsing task per new ad on one listing page.

        Request parameters read: ``url``, ``page`` (falls back to 1 when it
        is missing or not an integer), ``force_next_page`` and ``region``.

        Every link from ``adsparser.ads_list(url, page).parse()`` that has
        no ``Ad`` entity yet is added to the queue named by
        ``adsparser.parser_name(url)`` with target ``/ad``. A follow-up
        ``/ads`` task for ``page + 1`` is added while ``page < MAX_PAGES``,
        unless an already-stored ad was met and ``force_next_page`` is
        empty.
        """
        url = self.request.get('url')
        # NOTE(review): used only for its truthiness and forwarded as
        # received; presumably any non-empty value enables it -- confirm
        # how callers set it.
        force_next_page = self.request.get('force_next_page')
        region = self.request.get('region')

        logging.info("Gettings ads list: %s", url)

        try:
            page = int(self.request.get('page'))
        except (TypeError, ValueError):
            # Missing or non-numeric page parameter: start from page one.
            page = 1

        links = adsparser.ads_list(unicode(url).encode('utf-8'), page).parse()

        parse_next_page = True

        for link in links:
            if Ad.get_by_key_name(link) is None:
                taskqueue.add(queue_name=adsparser.parser_name(url),
                              url="/ad",
                              params={'url': link,
                                      'parent_url': url,
                                      'region': region})
            elif not force_next_page:
                # A stored ad was met: do not follow up with the next page
                # unless the caller forces it.
                parse_next_page = False

        if parse_next_page and page < MAX_PAGES:
            taskqueue.add(url="/ads",
                          params={'url': url,
                                  'page': page + 1,
                                  'force_next_page': force_next_page,
                                  'region': region})
Пример #3
0
    arr.append(" ".join(phones))

print " ".join(arr)

exit()
"""

#print avtoru_parser.AvtoruAdParser("http://cars.auto.ru/cars/used/sale/11458490-f214.html").parse()
#exit();

#print adsparser.parse("http://nedvizhimost.slando.ru/moskva/srochno-snimu-2-h-komnatnuyu-kvartiru_P_38854056.html", 'msk')
#exit()

#urls = adsparser.ads_list("http://nedvizhimost.slando.ru/moskva/1377_1.html").parse()
#urls = adsparser.ads_list("http://www.novoebenevo.ru/sdam/area_3").parse()
# Parse one listing page, print the parse result of every entry it yields
# (with 'msk' as the second argument), then stop the script.
urls = adsparser.ads_list("http://nedvizhimost.slando.ru/moskva/1376_T2_1.html").parse()
for url in urls:
    print(adsparser.parse(url, 'msk'))

exit()
"""
#ArendaOpen parser
phones = []
for url in urls:
    page = BeautifulSoup(urllib2.urlopen(url))
    bs = page.findAll('b')
    for b in bs:
        try:
            phone = adsparser.rPHONE.search(b.string)
            if phone:
Пример #4
0
print " ".join(arr)

exit()
"""



#print avtoru_parser.AvtoruAdParser("http://cars.auto.ru/cars/used/sale/11458490-f214.html").parse()
#exit();

#print adsparser.parse("http://nedvizhimost.slando.ru/moskva/srochno-snimu-2-h-komnatnuyu-kvartiru_P_38854056.html", 'msk')
#exit()

#urls = adsparser.ads_list("http://nedvizhimost.slando.ru/moskva/1377_1.html").parse()
#urls = adsparser.ads_list("http://www.novoebenevo.ru/sdam/area_3").parse()
# Parse one listing page, print the parse result of every entry it yields
# (with 'msk' as the second argument), then stop the script.
urls = adsparser.ads_list(
    "http://nedvizhimost.slando.ru/moskva/1376_T2_1.html"
).parse()
for url in urls:
    print(adsparser.parse(url, 'msk'))

exit()

"""
#ArendaOpen parser
phones = []
for url in urls:
    page = BeautifulSoup(urllib2.urlopen(url))
    bs = page.findAll('b')
    for b in bs:
        try:
            phone = adsparser.rPHONE.search(b.string)