Exemplo n.º 1
0
def _coordinate_prefix_matches(given, geocoded):
    """Return True when ``given`` agrees with ``geocoded`` up to two decimals.

    Both arguments are coordinate strings.  The comparison covers the leading
    characters of ``geocoded`` through the second digit after the decimal
    point, truncated to the length of ``given`` when that is shorter.
    """
    dot = geocoded.find('.')
    # With no decimal point, compare the whole geocoded value instead of the
    # accidental two-character prefix that ``find() == -1`` used to produce.
    limit = len(geocoded) if dot < 0 else min(len(geocoded), dot + 3)
    limit = min(limit, len(given))
    return given[:limit] == geocoded[:limit]


def ValidateGeoCode(fulladdress, country, lat, lng, scrape_page=None):
    """Check scraped coordinates against the geocoder's answer for an address.

    Args:
        fulladdress: Address text handed to ``Util.getGEOCode``.
        country: Country code; ``'de'`` and ``'at'`` trigger transliteration
            of German umlauts / sharp s before the lookup.
        lat: Scraped latitude as a string.
        lng: Scraped longitude as a string.
        scrape_page: Optional page identifier used as the prefix of error
            log lines.

    Returns:
        ``False`` only when the geocoder answered with status ``OK`` and the
        scraped latitude or longitude disagrees with it (to two decimals).
        ``True`` in every other case, including missing arguments and a
        failed or absent geocoder response.
    """
    if fulladdress is None or country is None or lat is None or lng is None:
        return True

    if country in ('de', 'at'):
        german_chars = (('ä', 'ae'), ('ö', 'oe'), ('ü', 'ue'), ('Ä', 'Ae'),
                        ('Ö', 'Oe'), ('Ü', 'Ue'), ('ß', 'ss'))
        for special, plain in german_chars:
            fulladdress = fulladdress.replace(special, plain)
    fulladdress = Util.removedoubleSpace(fulladdress)

    jsonLocation = Util.getGEOCode(fulladdress, country)
    status = None
    if jsonLocation is not None:
        status = jsonLocation.get('status')

    if status is None or status.upper() != 'OK':
        # The response (or its status) may be missing entirely; report that
        # instead of raising AttributeError on None.
        status_text = status.upper() if status is not None else 'NO RESPONSE'
        Util.log.coordinate_logger.warning(
            fulladdress + ',' + country + ': cannot get GEO code ' +
            status_text)
        return True

    location = jsonLocation.get('results')[0].get('geometry').get('location')
    latLo = str(location.get('lat'))
    lngLo = str(location.get('lng'))
    # scrape_page defaults to None, which cannot be concatenated to a string.
    page = scrape_page if scrape_page is not None else 'unknown page'

    if not _coordinate_prefix_matches(lat, latLo):
        Util.log.coordinate_logger.error(page + ': invalid latitude (' +
                                         lat + ',' + lng + ')')
        return False
    if not _coordinate_prefix_matches(lng, lngLo):
        Util.log.coordinate_logger.error(page + ': invalid longitude (' +
                                         lat + ',' + lng + ')')
        return False
    return True
 def __getListVenues(self, link, city):
     """Visit every result page of a listing and parse each venue link.

     ``link`` is the listing address; the page count is read from the text
     of the ``row centered-text`` paragraph (the part after ``'of'``).
     Every anchor matched on a page is handed to ``self.__VenueParser``
     together with ``city``.  Any exception is printed and ends the walk.
     """
     container_xpath = '//body/div[@class="container"]/div[@class="container"]'
     itemsXpath = [
         '//div[@class="row top-buffer"]/a',
         '//div[@class="row top-buffer border-row"]/a'
     ]
     xmlDoc = Util.getRequestsXML(link, container_xpath)
     if xmlDoc is None:
         return
     pages_ = xmlDoc.find('.//div[@class="row centered-text"]/p')
     if pages_ is None:
         return
     try:
         totalPages = int(pages_.text.split('of')[1])
         for currentPages in range(1, totalPages + 1):
             url = link + '&page=' + str(currentPages)
             xmlContent = Util.getRequestsXML(url, container_xpath)
             if xmlContent is None:
                 # A failed fetch used to raise AttributeError below and
                 # abandon every remaining page; skip just this page.
                 print('No content, page skipped: ' + url)
                 continue
             for path in itemsXpath:
                 for item in xmlContent.xpath(path):
                     self.__VenueParser(self.__url__ + item.get('href'),
                                        city)
         Util.log.running_logger.info(city + ' Done')
     except Exception as ex:
         # Best-effort crawl: report the problem and stop this listing.
         print(ex)
    def __getListVenues(self, city):
        """Parse every link found for ``city`` and write each venue to file.

        ``city`` is passed to ``Util.getRequestsXML`` as the address to load
        and is reused as the label of the "Done" log line.
        """
        tables = Util.getRequestsXML(city,
                                     '//td[@class="welcome-padding"]/table')
        if tables is None:
            Util.log.running_logger.warning(city + ' Done')
            return
        if len(tables) < 2:
            # Nothing to scan; note that no "Done" line is logged here.
            return

        # Only the item at index 1 of the result is scanned for links.
        for cell in tables[1].xpath('./tr/td'):
            for anchor in cell.xpath('./a'):
                venue = self.__VenueParser(anchor.get('href'), anchor.text)
                if venue is None:
                    continue
                slot = self.addIndex()
                print('Writing to Index: ' + str(slot))
                venue.writeToFile(self.folder, slot, venue.name, False)
        Util.log.running_logger.warning(city + ' Done')
    def doWork(self):
        """Load the phone-code list, then start the venue listing crawl."""
        phone_codes = Util.getPhoneCodeList()
        self.phoneCodeList = phone_codes
        self.__getListVenues()
 def doWork(self):
     """Entry point: fetch the phone-code list and crawl the venue list."""
     codes = Util.getPhoneCodeList()
     self.phoneCodeList = codes
     self.__getListVenues()
    def getLatlng(self, address, countr):
        """Geocode ``address`` and return its coordinates as strings.

        Args:
            address: Address text; a blank value short-circuits the lookup.
            countr: Country value forwarded to ``Util.getGEOCode``.

        Returns:
            A ``(lat, lng)`` tuple of strings taken from the first result
            that carries a ``geometry`` entry, or ``(None, None)`` when the
            address is blank, the lookup fails, the status is not ``'OK'``
            or no result has geometry.  A tuple is always returned so
            callers can unpack the result unconditionally.
        """
        if address.strip() == '':
            return (None, None)
        try:
            jsonLatlng = Util.getGEOCode(address, countr)
            if jsonLatlng is None or jsonLatlng.get('status') != 'OK':
                return (None, None)
            for entry in jsonLatlng.get('results'):
                geometry = entry.get('geometry')
                if geometry is not None:
                    location = geometry.get('location')
                    return (str(location.get('lat')),
                            str(location.get('lng')))
        except Exception:
            # Deliberate best-effort: any lookup/parsing problem means
            # "no coordinates" rather than a crash of the scraper.
            return (None, None)
        # Status was OK but no result had geometry; previously this path
        # fell through and returned a bare None.
        return (None, None)
Exemplo n.º 7
0
 def getRequest(self, pages, value, xpath_):
     """Build the search address for a region/page, announce it and fetch it.

     Returns whatever ``Util.getRequestsXML`` yields for the built address
     and ``xpath_``.
     """
     base = 'http://www.architecturalindex.com/consumers/search/search.asp?strNearRegion='
     url = base + str(value) + '&intPage=' + str(pages)
     banner = '*' * 25
     print(banner)
     print('Request: ' + url)
     print(banner)
     return Util.getRequestsXML(url, xpath_)
Exemplo n.º 8
0
 def getListRegion(self):
     """Crawl the venue list of every region linked from the start page."""
     xmlList = Util.getRequestsXML(self.__url__, '//div[@class="col-md-9 page-main"]/div/ul')
     if xmlList is None:
         return
     for region in xmlList.xpath('./ul/li/a'):
         # Links with empty text are skipped (original note: Channel Islands).
         if region.text == '':
             continue
         region_url = self.__url__ + region.get('href')
         self.__getListVenues(region_url)
Exemplo n.º 9
0
    def doWork(self):
        """Prepare the phone-code list and run the venue crawl."""
        code_list = Util.getPhoneCodeList()
        self.phoneCodeList = code_list
        self.__getListVenues()
 def getCategory(self):
     """Return the directory categories as ``'<name>\\t<url>'`` strings.

     The category index page is fetched through ``Util.getRequestsXML``;
     each anchor found contributes its text and ``self.__url__`` + href.
     An empty list is returned when the page could not be fetched.
     """
     listCate = []
     xmlDoc = Util.getRequestsXML('https://www.blauarbeit.de/branchenbuch/index.html', '//div[@class="box_w box_r"]/ul/li/a')
     if xmlDoc is None:
         # Previously the loop ran outside this guard and hit an unbound
         # local when the fetch failed.
         return listCate
     for anchor in xmlDoc.xpath('./a'):
         label = anchor.text
         href = anchor.get('href')
         if label is None or href is None:
             # Cannot build a "name<TAB>url" entry without both parts.
             continue
         listCate.append(label + '\t' + self.__url__ + href)
     return listCate
 def __getListVenues(self, urlRegion):
     """Parse every business entry listed on the page at ``urlRegion``."""
     listing = Util.getRequestsXML(
         urlRegion,
         '//div[@class="row listing listing-horizontal-xs listing-horizontal-sm listing-horizontal-md business"]'
     )
     if listing is None:
         return
     for entry in listing.xpath('./div'):
         self.__VenueParser(entry)
 def getListcategory(self):
     """Start one ``__getListVenues`` thread per directory category link.

     The started threads are neither stored nor joined here.
     """
     xmlDoc = Util.getRequestsXML(self.__url__ + '/the-directory',
                                  '//div[@id="wpbdp-categories"]/ul/li')
     if xmlDoc is None:
         return
     for cate in xmlDoc.xpath('./li/a'):
         worker = threading.Thread(target=self.__getListVenues,
                                   args=(cate, ))
         worker.start()
Exemplo n.º 13
0
 def __getListVenues_2(self):
     """Parse each ``links`` list item of the venue page.

     Every item is passed to ``self.__VenueParser`` with the geo data from
     ``self.getListGeoCode`` and the page address suffixed by ``#<n>``,
     where ``n`` is the 1-based position of the item.
     """
     page = Util.getRequestsXML(self._url_lstVenues, '/html')
     geo_data = self.getListGeoCode(page)
     entries = page.xpath('//div[@class="links"]/ul/li')
     for position, entry in enumerate(entries, 1):
         self.__VenueParser(entry, geo_data,
                            self._url_lstVenues + '#' + str(position))
 def __getListVenues(self):
     """Dispatch each table row of the venue page to the matching parser.

     Rows containing an anchor with an ``onclick`` attribute go to
     ``__VenueParser``; all other rows go to ``__VenueParser_2``.
     """
     table = Util.getRequestsXML(self._url_lstVenues,
                                 self._xpath_lstVenues)
     for row in table.xpath('./tr'):
         has_onclick = len(row.xpath('./td//a/@onclick')) > 0
         if has_onclick:
             self.__VenueParser(row)
             continue
         self.__VenueParser_2(row)
Exemplo n.º 15
0
 def doWork(self):
     """Fetch the phone-code list, then crawl the venue listing."""
     phone_codes = Util.getPhoneCodeList()
     self.phoneCodeList = phone_codes
     self.__getListVenues()
Exemplo n.º 16
0
def ReValidPhone(phone, type='phone'):
    """Normalise a phone value, returning ``None`` when nothing usable remains.

    The value is passed through ``ReValidString``; a non-None result is then
    cleaned with ``Util.removeSpecialChar(phone, type)`` and stripped of
    spaces.  ``None`` and empty input (or an empty cleaned result) yield
    ``None``.
    """
    if phone is None or phone == "":
        return None
    cleaned = ReValidString(phone)
    if cleaned is not None:
        without_special = Util.removeSpecialChar(cleaned, type)
        cleaned = without_special.replace(' ', '')
    return None if cleaned == '' else cleaned
Exemplo n.º 17
0
    def doWork(self):
        """Print a street-validation sample, then crawl the venue listing.

        Any exception raised by the crawl is printed instead of propagated.
        """
        sample_street = '15863 A Baña'
        print(self.validateStreet(sample_street))

        self.phoneCodeList = Util.getPhoneCodeList()
        try:
            self.__getListVenues()
        except Exception as ex:
            print(ex)
 def __list_city(self):
     """Collect city links from the venue page into ``self.__city__``.

     A link is appended only when it is not contained in (``in``) any entry
     already stored.
     """
     xmlDoc = Util.getRequestsXML(self._url_lstVenues,
                                  '//td[@class="welcome-padding"]')
     for anchor in xmlDoc.xpath('//table/tr/td/a'):
         link = anchor.get('href')
         already_known = any(link in known for known in self.__city__)
         if not already_known:
             self.__city__.append(link)
 def doWork(self):
     """Crawl cities and venues, then write all parsed venues to one CSV.

     The output path is built from ``self.folder``, ``self._chain_`` and the
     revalidated ``self.__name__``.  A venue whose parsing raises is
     reported (printed and logged) and skipped.
     """
     self.outFile = self.folder + '/' + self._chain_ + '_' + Validation.RevalidName(self.__name__) + '_Venues.csv'
     self.phoneCodeList = Util.getPhoneCodeList()
     self.__getListCities()

     if len(self.listCities) <= 0:
         return
     # De-duplicate the city list before crawling venues.
     self.listCities = list(set(self.listCities))
     self.__getListVenues()
     if len(self._lstVenues) <= 0:
         return

     rows = []
     for candidate in self._lstVenues:
         try:
             parsed = self.__VenueParser(candidate)
             if parsed is not None:
                 rows.append(parsed.toOrderDict(False))
         except Exception as ex:
             print("URL: " + candidate.scrape_page + ": " + ex.message)
             Util.log.error("URL: " + candidate.scrape_page + ": " + ex.message)

     Util.writelist2File(rows, self.outFile)
Exemplo n.º 20
0
def ReValidPrice(price, pattern=None):
    """Clean a price string; return ``None`` when it is missing or invalid.

    Commas become dots, line breaks are removed and surrounding whitespace
    is stripped.  When ``pattern`` is given the cleaned text must match it
    (``re.match``).  The surviving text is passed through
    ``Util.removedoubleSpace`` and stripped again.
    """
    if price is None:
        return None
    cleaned = price.strip().replace(",", ".").strip()
    for line_break in ('\r\n', '\r', '\n'):
        cleaned = cleaned.replace(line_break, '')
    cleaned = cleaned.strip()
    if pattern is not None and re.match(pattern, cleaned) is None:
        return None
    if cleaned == '':
        return None
    return Util.removedoubleSpace(cleaned).strip()
    def doWork(self):
        """Crawl and filter venues, then write each one to its own file.

        ``self.index`` is incremented before every write and used as the
        file index.
        """
        self.phoneCodeList = Util.getPhoneCodeList()
        self.__getListVenues()
        self.filterVenues()

        # Iterate over a snapshot of the keys taken before any write.
        for key in list(self.venues):
            venue = self.venues.get(key)
            self.index += 1
            venue.writeToFile(self.folder, self.index, venue.name, False)
Exemplo n.º 22
0
 def doWork(self):
     """Crawl by postcode and print the final value of ``self.index``.

     The region-based crawl (``getListRegion``) is not called here.
     """
     self.phoneCodeList = Util.getPhoneCodeList()
     self.getByPostCode()
     final_index = str(self.index)
     print(final_index)
Exemplo n.º 23
0
 def __getListVenues(self, postcode):
     """Parse the venue list returned for ``postcode``.

     Each ``./ul/li`` item of the first node matched by
     ``self._xpath_lstVenues`` is passed to ``self.__VenueParser`` with the
     geo data and the search address suffixed by ``#<n>`` (1-based).
     """
     xmlDoc = Util.getRequestsXML(self._url_lstVenues + postcode, '/html')
     listData = self.getListGeoCode(xmlDoc)
     containers = xmlDoc.xpath(self._xpath_lstVenues)
     if len(containers) <= 0:
         return
     for position, ele in enumerate(containers[0].xpath('./ul/li'), 1):
         self.__VenueParser(
             ele, listData,
             self._url_lstVenues + postcode + '#' + str(position))
Exemplo n.º 24
0
 def __getListVenues(self, countr):
     """Search firms of ``countr`` by every digit and letter key.

     For each key (0-9, then a-z) result pages are requested until the
     results table comes back empty.  Every firm link found is parsed by
     ``self.__VenueParser`` on its own thread; a new thread is started only
     while ``self.checkAlive()`` reports fewer than 6.
     """
     table_xpath = '//table[@class="table-responsive firm-search-results expandable-rows"]'
     # list(...) keeps the concatenation valid where range() is not a list.
     search_keys = list(range(0, 10)) + [
         chr(x) for x in range(ord('a'), ord('z') + 1)
     ]
     for post in search_keys:
         page = 1
         countRequest = 1
         url = self.url_(page, post, countRequest, countr)
         print('Find with: ' + str(post))
         isTable = Util.getRequestsXML(url, table_xpath)
         if isTable is None:
             continue
         while len(isTable) > 0:
             page = countRequest * 4 - 3
             url = self.url_(page, post, countRequest, countr)
             print('find with url: ' + url)
             isTable = Util.getRequestsXML(url, table_xpath)
             if isTable is None:
                 # A failed fetch mid-pagination used to crash on
                 # ``None.xpath``; stop paging this key instead.
                 break
             hides = isTable.xpath('//tr/td/h5/a')
             print('found: ' + str(len(hides)))
             while len(hides) > 0:
                 if self.checkAlive() < 6:
                     link = hides.pop().get('href')
                     thread2 = threading.Thread(target=self.__VenueParser,
                                                args=(self.__url__ + link,
                                                      countr))
                     thread2.start()
                     self.threadRunning.append(thread2)
                 else:
                     # Thread cap reached; wait before checking again.
                     time.sleep(1)
             countRequest += 1
         print(countr + ' done')
 def getRegions(self):
     """Start one ``__getListVenues`` thread per region menu link.

     At most 5 threads are started while ``self.countThread()`` reports 6
     or more; the loop then waits briefly before checking again.  Started
     threads are appended to ``self.runningThread``.
     """
     # Local import: ``time`` is needed only for the back-off below and may
     # not be imported at module level.
     import time

     xmlDoc = Util.getRequestsXML(self._url_lstVenues,
                                  '//ul[@id="menu-menu-adresses"]/li/a')
     if xmlDoc is None:
         return
     listhref = xmlDoc.xpath('./a')
     while len(listhref) > 0:
         if self.countThread() >= 6:
             # Previously this spun without pause, burning a full CPU core
             # while waiting for a worker slot.
             time.sleep(0.1)
             continue
         href = listhref.pop()
         thread_ = threading.Thread(target=self.__getListVenues,
                                    args=(href.get('href'), ))
         thread_.start()
         self.runningThread.append(thread_)
Exemplo n.º 26
0
    def doWork(self):
        """Crawl the venue list of every postcode from ``self.postcode()``.

        The first exception raised is printed and ends the loop.
        """
        self.phoneCodeList = Util.getPhoneCodeList()
        postcodes = self.postcode()
        rule = '-' * 10
        try:
            for post in postcodes:
                print(rule + post + rule)
                self.__getListVenues(post)
        except Exception as ex:
            print(ex)
Exemplo n.º 27
0
    def getListVenues(self, xmlElement, cate):
        """Page through a category listing (100 rows per page) and parse it.

        Args:
            xmlElement: Element whose ``href`` is the category base address.
            cate: Category value forwarded to the venue parsers.

        Items of class ``estirar gratuito`` are parsed on worker threads
        (started only while ``self.checkAlive()`` is below 10); items of
        class ``estirar `` are parsed synchronously in the calling thread.
        Paging stops at the first page with no ``estirar gratuito`` items,
        or after several consecutive failing pages.
        """
        # Without this cap a permanently failing request (site down, missing
        # href, ...) made the ``while True`` loop spin forever.
        max_consecutive_failures = 3
        consecutive_failures = 0
        pages = -100
        while True:
            pages += 100
            numbers = 0
            try:
                link___ = xmlElement.get('href') + '/pag-' + str(
                    pages) + '/rows-100/s:/'

                listVenues__ = Util.getRequestsXML(
                    link___, '//div[@id="listadoResultados"]')
                listVenues = listVenues__.xpath(
                    '//li[@class="estirar gratuito"]')
                listVenues_2 = listVenues__.xpath('//li[@class="estirar "]')
                self.countingVenues += len(listVenues)
                self.countingVenues += len(listVenues_2)
                if len(listVenues) <= 0:
                    break
                while len(listVenues) > 0:
                    if self.checkAlive() < 10:
                        numbers += 1
                        scrappages = link___ + '#' + str(numbers)
                        thread1 = threading.Thread(target=self.__VenueParser,
                                                   args=(listVenues[-1], cate,
                                                         scrappages))
                        self.threadRunning.append(thread1)
                        thread1.start()
                        listVenues.pop()
                    else:
                        # Thread cap reached; wait before checking again.
                        time.sleep(0.1)
                while len(listVenues_2) > 0:
                    if self.checkAlive() < 10:
                        numbers += 1
                        scrape_pages = link___ + '#00' + str(numbers)
                        self.__VenueParser_2(listVenues_2[-1], cate,
                                             scrape_pages)
                        listVenues_2.pop()
                    else:
                        time.sleep(0.1)
                # The page was processed completely; reset the failure run.
                consecutive_failures = 0
            except Exception as ex:
                print(ex)
                consecutive_failures += 1
                if consecutive_failures >= max_consecutive_failures:
                    break
                continue
Exemplo n.º 28
0
    def doWork(self):
        """Crawl every country in ``self.listCountry`` not already recorded.

        A country is skipped when it is contained in (``in``) any entry of
        ``self.countryRunning``.  Each crawl runs on its own thread, which
        is joined immediately, so countries are processed one at a time.
        """
        self.phoneCodeList = Util.getPhoneCodeList()
        for country in self.listCountry:
            if any(country in running for running in self.countryRunning):
                continue
            self.countryRunning.append(country)
            worker = threading.Thread(target=self.__getListVenues,
                                      args=(country, ))
            worker.start()
            worker.join()
 def __getListVenues(self):
     """Walk area -> city -> listing pages and collect venue links.

     Every listing anchor's ``href`` is appended to ``self.list_url``.
     Pages that cannot be fetched, or that come back empty, are skipped.
     """
     print("Getting list of Venues")
     xmlArea = Util.getRequestsXML(self.__url__, self._xpath_lstVenues)
     # The original guard ``x==None | len(x)<=0`` parsed as
     # ``x == (None | len(x)) <= 0`` because ``|`` binds tighter than the
     # comparisons, so it always raised TypeError.  ``or`` is what was meant.
     if xmlArea is None or len(xmlArea) <= 0:
         return
     for area in xmlArea.xpath('./a'):
         linkArea = area.get('href')
         xmlCity = Util.getRequestsXML(linkArea, self._xpath_lstVenues)
         if xmlCity is None or len(xmlCity) <= 0:
             continue
         for city in xmlCity.xpath('./a'):
             linkCity = city.get('href')
             xmlContent = Util.getRequestsXML(linkCity, '//div[@class="content_column"]')
             if xmlContent is None or len(xmlContent) <= 0:
                 continue
             items = xmlContent.xpath('./div[@class="listings"]/div[@class="list"]/div[@class="listing_box"]/a')
             for item in items:
                 self.list_url.append(item.get('href'))
    def doWork(self):
        """Load phone codes, crawl every region and print ``self.index``."""
        self.phoneCodeList = Util.getPhoneCodeList()
        self.getListRegion()
        final_index = str(self.index)
        print(final_index)