def __VenueParser(self, url):
        """Scrape one venue page and pass each footer block to ``_cloneVenues``.

        The page is fetched through ``Util.getRequestsXML``; the venue name is
        read from the ``<h2>`` under ``<main>``, and the description and image
        links from the ``div.clearfix`` block when it exists.  Every
        ``div.footer.row`` block found is forwarded to ``self._cloneVenues``
        together with its 1-based position and a flag telling whether the page
        holds more than one such block.

        Args:
            url: Address of the venue page.  It is appended to ``self.urlList``
                before the page is fetched, so a URL whose fetch or parse
                fails is still reported as a duplicate on a later call.

        Returns:
            None.  Any exception is caught and reported on stdout as
            ``[ERROR]`` followed by the exception itself.
        """
        try:
            # Keep the original ``== False`` comparison semantics: anything
            # other than a False-equal result counts as "already scraped".
            if self.checkDuplicate(self.urlList, url) != False:
                print('[DUPLICATE]: ' + url)
                return

            ven = Venue()
            print('[SCRAPING]:' + url)

            ven.country = self._language
            self.urlList.append(url)
            xmlDoc = Util.getRequestsXML(url, '//div[@class="content"]/main')
            xmlDoc = xmlDoc.find('./main')
            ven.name = xmlDoc.find('./h2').text

            des = xmlDoc.find('./div[@class="clearfix"]')
            if des is not None:
                imgs = []
                for im in des.xpath('.//img'):
                    src = im.get('src')
                    # An <img> without a usable src used to raise TypeError
                    # on the concatenation and abort the whole venue.
                    if src:
                        imgs.append(self.__url__ + src)
                    # Drop the image's wrapper (its grandparent) so it does
                    # not end up in the description text.  remove() raises
                    # ValueError unless the node is still a direct child of
                    # ``des``; that happened when two images shared a wrapper
                    # or when the image was nested at a different depth.
                    wrapper = im.getparent().getparent()
                    if wrapper is not None and wrapper.getparent() is des:
                        des.remove(wrapper)
                if imgs:
                    ven.img_link = imgs
                ven.description = ''.join(des.itertext())

            map_and_phone_number = xmlDoc.xpath(
                './div/div[@class="footer row"]')
            isMulti = len(map_and_phone_number) > 1
            for countVenues_, clone_ in enumerate(map_and_phone_number, 1):
                self._cloneVenues(ven, clone_, countVenues_, url, isMulti)
        except Exception as ex:
            print('[ERROR]: ' + url)
            print(ex)
Пример #2
0
    def __VenueParser(self, element, cate, scrappages):
        # Scrape one listing entry: read the address from the listing element,
        # then fetch the detail page for the name, phone number and map
        # coordinates, and write the venue to file.
        #   element    -- listing node; its './div/a' child carries the link
        #   cate       -- stored as ven.category
        #   scrappages -- not used in the visible part of this method
        # NOTE(review): the method is cut off after the final `else:` in this
        # view, so the duplicate branch and the `except` clause are not shown.
        subA = element.find('./div/a')
        link = subA.get('href')
        try:

            #if link =='http://es.qdq.com/f:1-GV9-6082/':
            #    print
            # Substring match: the link counts as already seen when it occurs
            # inside any link recorded in self.listLink.
            existing = [x for x in self.listLink if link in x]
            # Printed before the duplicate check, so duplicates are logged too.
            print 'Scraping : ' + link
            if len(existing) <= 0:

                self.listLink.append(link)

                ven = Venue()
                #ven.name = subA.find('./div/h2').text
                ven.scrape_page = link
                #ven.subcategory = cate
                ven.category = cate
                ven.country = self._language
                ven.hqdb_featured_ad_type = "none"
                # Address parts are <span> children told apart by `itemprop`.
                address = subA.xpath('./div/p/span')
                for span in address:
                    itemprop = span.get('itemprop')
                    if itemprop == 'street-address':
                        ven.street = span.text
                    if itemprop == 'postal-code':
                        ven.zipcode = span.text
                    if itemprop == 'locality':

                        # before the first "," and before the "/"

                        ven.city = span.text  #.split(',')[0]
                        if ven.city == '' or ven.city == None:
                            continue

                        find_slash = ven.city.find('/')
                        find_comma = ven.city.find(',')
                        # When both separators occur: keep the text before the
                        # first "/", and if that part still holds a ",", keep
                        # what follows its first comma.
                        if find_slash != -1 and find_comma != -1:
                            ven.city = ven.city.split('/')[0]
                            if ven.city.find(',') != -1:
                                ven.city = ven.city.split(',')[1]
                        # Final trim: text before the first "," and first "/".
                        ven.city = ven.city.split(',')[0]
                        ven.city = ven.city.split('/')[0]
                # NOTE(review): these checks read ven.street / ven.city /
                # ven.zipcode even when no matching span was found, so they
                # rely on Venue's default attribute values -- confirm.
                if ven.street != None:
                    ven.street = self.validateStreet(ven.street)
                if ven.city != None:
                    # The pattern matches five digits whose first two are not
                    # "00"; the first such run found is removed from the city.
                    re_City = re.search(
                        '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                        ven.city,
                        flags=0)
                    if re_City != None:
                        ven.city = ven.city.replace(re_City.group(0), '')
                if ven.zipcode != None:
                    ven.zipcode = ven.zipcode.strip()
                    if len(ven.zipcode) >= 5:
                        # Keep the zipcode only when the whole string is
                        # exactly the five-digit match; otherwise clear it.
                        re_zipcode = re.search(
                            '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                            ven.zipcode,
                            flags=0)
                        if re_zipcode != None:
                            if re_zipcode.group(0) != ven.zipcode:
                                ven.zipcode = None
                        else:
                            ven.zipcode = None
                    else:
                        # Shorter than five characters: prefix one "0" and
                        # apply the same whole-string check.
                        ven.zipcode = '0' + ven.zipcode
                        rezipcode = re.search(
                            '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                            ven.zipcode,
                            flags=0)
                        if rezipcode == None:
                            ven.zipcode = None
                        else:
                            if ven.zipcode != rezipcode.group(0):
                                ven.zipcode = None

                # Numeric range check: values above 52080 or below 1000 are
                # cleared. A failed int() conversion (for example on None)
                # also ends with the zipcode cleared.
                try:
                    if int(ven.zipcode) > 52080 or int(ven.zipcode) < 1000:
                        ven.zipcode = None
                except Exception, ex:
                    ven.zipcode = None

                # Fetch the detail page; name, phone and map come from it.
                detail = Util.getRequestsXML(link, '//div[@id="contenido"]')
                ven.name = detail.find('.//h1').text  #.replace('Niño','Niño')
                #ven.name = Validator.RevalidName(ven.name)
                ven.name = self.replaceName(ven.name)
                phone = detail.find('.//span[@class="telefonoCliente"]')
                if phone != None:
                    phone = phone.text
                    # Numbers starting with 6 or 7 are stored as the mobile
                    # number, everything else as the office number.
                    if phone.startswith('6') or phone.startswith('7'):
                        ven.mobile_number = '' + phone
                        ven.mobile_number = self.validatePhone__(
                            ven.mobile_number)
                    else:
                        ven.office_number = '' + phone
                        ven.office_number = self.validatePhone__(
                            ven.office_number)
                # Coordinates are derived from the map image's `src` value by
                # self.getLatlng.
                maps = detail.find('.//div[@id="mymap"]/img')
                if maps != None:
                    maps = maps.get('src')
                    (ven.latitude, ven.longitude) = self.getLatlng(maps)
                #ven.is_get_by_address =True
                ven.writeToFile(self.folder, self.addIndex(), ven.name, False)
            else:
Пример #3
0
    def __VenueParser_2(self, element, cate, scrape_pages):
        subB = element.find('./div/a')
        link = subB.get('href')

        try:
            existing = [x for x in self.listLink if link in x]
            if len(existing) <= 0:
                print 'Scraping Feature : ' + link

                self.listLink.append(link)
                ven = Venue()
                ven.country = self._language
                ven.hqdb_featured_ad_type = 'featured'
                ven.category = cate
                #ven.subcategory = cate
                ven.scrape_page = scrape_pages
                subDiv = element.find('./div[@class="resultado nada"]')
                div = subDiv.find('./a/div')
                ven.name = div.find('./h2').text  #.replace('Niño','Niño')
                '''if ven.name =='Niño de la Virgen':
                    print'''
                #ven.name = Validator.RevalidName(ven.name)
                ven.name = self.replaceName(ven.name)
                address = div.xpath('./p[@itemprop="address"]/span')
                if address != None:
                    for span in address:
                        itemprop = span.get('itemprop')
                        if itemprop == 'street-address':
                            ven.street = span.text
                        if itemprop == 'postal-code':
                            ven.zipcode = span.text
                        if itemprop == 'locality':
                            ven.city = span.text  #.split(',')[0]
                            if ven.city == '' or ven.city == None:
                                continue
                            find_slash = ven.city.find('/')
                            find_comma = ven.city.find(',')
                            if find_slash != -1 and find_comma != -1:
                                ven.city = ven.city.split('/')[0]
                                if ven.city.find(',') != -1:
                                    ven.city = ven.city.split(',')[1]
                            ven.city = ven.city.split(',')[0]
                            ven.city = ven.city.split('/')[0]
                    if ven.street != None:
                        ven.street = self.validateStreet(ven.street)
                    if ven.city != None:
                        re_City = re.search(
                            '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                            ven.city,
                            flags=0)
                        if re_City != None:
                            ven.city = ven.city.replace(re_City.group(0), '')
                    if ven.zipcode != None:
                        ven.zipcode = ven.zipcode.strip()
                        if len(ven.zipcode) >= 5:
                            re_zipcode = re.search(
                                '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                                ven.zipcode,
                                flags=0)
                            if re_zipcode != None:
                                if re_zipcode.group(0) != ven.zipcode:
                                    ven.zipcode = None
                            else:
                                ven.zipcode = None
                        else:
                            ven.zipcode = '0' + ven.zipcode
                            rezipcode = re.search(
                                '(?:(?:[1-9]{2}|[0-9][1-9]|[1-9][0-9])[0-9]{3})',
                                ven.zipcode,
                                flags=0)
                            if rezipcode == None:
                                ven.zipcode = None
                            else:
                                if ven.zipcode != rezipcode.group(0):
                                    ven.zipcode = None

                try:
                    if int(ven.zipcode) > 52080 or int(ven.zipcode) < 1000:
                        ven.zipcode = None
                except Exception, ex:
                    ven.zipcode = None

                description = div.find('./p[@class="descripcion"]').text
                if description != None:
                    ven.description = description
                imgs = subDiv.xpath('./a/figure/img')
                if len(imgs) > 0:
                    imgs_ = []
                    for im in imgs:
                        imgs_.append(im.get('src'))
                    ven.img_link = imgs_
                footer = subDiv.xpath('./div[@class="iconos"]/ul/li')
                for fo in footer:
                    text__ = fo.find('./a').text
                    if text__ == 'Mandar mail':
                        ven.business_website = fo.find('./a').get('href')
                    if text__ == 'Ver teléfono':
                        phone = fo.find('./span[@class="telefono"]').text
                        if phone.startswith('+346') or phone.startswith(
                                '+347') or phone.startswith(
                                    '7') or phone.startswith('6'):
                            ven.mobile_number = self.validatePhone__(phone)
                        else:
                            ven.office_number = self.validatePhone__(phone)
                #ven.is_get_by_address =True
                ven.writeToFile(self.folder, self.addIndex(), ven.name, False)
            else: