class DataExtractor:
    """Scrape car listings from dubai.dubizzle.com into the database.

    ``extract_data`` handles a listing URL for the first time;
    ``update_data`` re-visits a listing that is already stored.
    """

    DOMAIN = 'dubai.dubizzle.com'
    PROJECT_ID = 13
    # Directory used for the temporary phone-number image files.
    PATH = 'phones/'

    def __init__(self):
        print(self.DOMAIN)
        self.logger = Logger(name='dubizzle_data_log')
        self.err_logger = Logger(name='err_dubizzle_data_log')
        self.request_manager = RequestManager()
        self.source_code_manager = SourceCodeManager()
        self.generator = Generator()
        self.db = DatabaseManager()

    def _gone_reason(self, parsed_code, response):
        """Return 'EXPIRED' or '404' when the listing is gone, else None."""
        if parsed_code.find('div', {'id': 'expired-ad-message'}) is not None:
            return 'EXPIRED'
        if response['status_code'] == 404:
            return '404'
        return None

    def _labelled_text(self, parsed_code, label, levels=1):
        """Return the text next to the ``<img alt=label>`` icon.

        ``levels`` is how many parents to climb from the icon before reading
        the text; the label itself is removed from the result.
        """
        node = parsed_code.find('img', attrs={'alt': label})
        for _ in range(levels):
            node = node.parent
        return node.text.replace(label, '').strip()

    def extract_data(self, url_data):
        """Fetch one listing page, parse it and insert it into the database.

        ``url_data`` is a mapping with the keys ``id``, ``url`` and
        ``listing_id``.  Expired / 404 pages mark the URL inactive.  Any
        parsing or conversion failure is logged and the URL is marked
        processed so it is not retried.  Listings whose trim is ``'Other'``
        are marked processed without being stored.
        """
        print(url_data)
        url_id = url_data['id']
        url = url_data['url']
        listing_id = url_data['listing_id']

        response = self.request_manager.take_get_request(url)
        parsed_code = self.source_code_manager.parse_code(
            response['source_code'])

        reason = self._gone_reason(parsed_code, response)
        if reason is not None:
            self.db.set_url_inactive(url_id)
            self.err_logger.error(reason + " " + str(url_data))
            return

        try:
            # The breadcrumb lookup lives inside the try block: a page
            # without it must be logged, not crash the whole run.
            bread = parsed_code.find('span', {'id': 'browse_in_breadcrumb'})
            items = bread.findAll('div')

            year = self._labelled_text(parsed_code, 'Year')
            kilometres = self._labelled_text(parsed_code, 'Kilometers')
            kilometres = kilometres.replace(',', '').replace('.', '')
            color = self._labelled_text(parsed_code, 'Color')
            specs = self._labelled_text(parsed_code, 'Specs')
            trim = self._labelled_text(parsed_code, 'Trim', levels=2)
            if trim == 'Other':
                self.db.set_url_processed(url_id)
                return

            price = parsed_code.find('span', {'id': 'actualprice'}).text
            price = price.replace(',', '').replace('.', '')
            model = items[-1].find('a').text.strip()
            marka = items[-2].find('a').text.strip()
            phone = self.extract_phone(parsed_code, id=url_id)

            # The int() conversions are guarded as well, so a non-numeric
            # field is handled like any other parsing failure.
            data = {
                'year': int(year),
                'price': int(price),
                'kilometres': int(kilometres),
                'color': color,
                'specs': specs,
                'trim': trim,
                'model': model,
                'make': marka,
                'phone': phone,
            }
        except Exception as exc:
            self.err_logger.error(str(exc) + str(url_data))
            self.db.set_url_processed(url_id)
            return

        self.db.insert_data(data=data, listing_id=listing_id, url=url,
                            source=self.DOMAIN)
        self.db.set_url_processed(url_id)

    def update_data(self, url_data):
        """Re-visit a stored listing and refresh its price / sold status.

        ``url_data`` is a mapping with the keys ``id``, ``url``,
        ``listing_id`` and ``timestamp``.  A gone listing (expired banner or
        HTTP 404) is flagged as sold and its URL deactivated; otherwise the
        price is updated, falling back to 0 when it cannot be read.
        """
        timestamp = generate_timestamp()
        url_id = url_data['id']
        listing_id = url_data['listing_id']
        print(listing_id)
        url = url_data['url']
        first_timestamp = url_data['timestamp']
        # NOTE(review): this is "first seen" minus the generated timestamp.
        # If generate_timestamp() returns the current time the result is
        # negative -- confirm the intended sign before relying on it.
        time_dif = first_timestamp - datetime.strptime(timestamp,
                                                       "%Y.%m.%d:%H:%M:%S")
        time_dif = time_dif.days

        response = self.request_manager.take_get_request(url)
        parsed_code = self.source_code_manager.parse_code(
            response['source_code'])

        reason = self._gone_reason(parsed_code, response)
        if reason is not None:
            if reason == '404':
                print('404 %s' % (listing_id,))
            self.db.set_sold_status(listing_id=listing_id,
                                    days_for_selling=time_dif)
            self.db.set_url_inactive(url_id)
            print("updated")
            return

        try:
            price = parsed_code.find('span', {'id': 'actualprice'}).text
            price = int(price.replace(',', '').replace('.', ''))
        except Exception:
            # Missing or non-numeric price: keep the original fallback.
            price = 0

        self.db.update_listing(listing_id=listing_id, price=price,
                               days_on_market=time_dif)
        self.db.set_updated(listing_id=listing_id)
        print("updated")

    def extract_phone(self, code, id):
        """Return the phone number shown as an inline image on the page.

        The ``phone-num-img`` image carries a base64 ``data:image/...`` URI.
        It is written to ``PATH/<id>.<ext>``, read back with ``textract`` and
        the file is always removed afterwards.  ``'+971'`` is prepended when
        the extracted text does not already contain it.

        Raises whatever the lookup, the file I/O or ``textract`` raise.
        """
        img = code.find('img', {'class': 'phone-num-img'})['src']
        ext = img.partition('data:image/')[2].split(';')[0]
        image_path = self.PATH + str(id) + '.' + ext

        try:
            with open(image_path, 'wb') as f:
                f.write(ba.a2b_base64(img.partition('base64,')[2]))
            text = textract.process(image_path).replace(' ', '')
        finally:
            # Clean up even when writing or text extraction fails.
            if os.path.exists(image_path):
                os.remove(image_path)

        if '+971' not in text:
            text = '+971' + text
        return text.strip()
class DataExtractor:
    """Scrape car listings from dubicars.com into the database.

    ``extract_data`` handles a listing URL for the first time;
    ``update_data`` re-visits a listing that is already stored.  The
    ``__find_*`` helpers each read one field from the parsed page and return
    a neutral fallback instead of raising.
    """

    DOMAIN = 'dubicars.com'
    PROJECT_ID = 13
    PATH = 'phones/'

    def __init__(self):
        print(self.DOMAIN)
        self.logger = Logger(name='dubicars_data_log')
        self.err_logger = Logger(name='err_dubicars_data_log')
        self.request_manager = RequestManager()
        self.source_code_manager = SourceCodeManager()
        self.generator = Generator()
        self.db = DatabaseManager()
        # Rows with at least the keys 'make' and 'trim', used by __find_trim.
        self.trim_list = self.db.get_trim_list()

    def extract_data(self, url_data):
        """Fetch one listing page, parse it and insert it into the database.

        ``url_data`` is a mapping with the keys ``id``, ``url`` and
        ``listing_id``.  Sold / 404 pages mark the URL inactive.  Listings
        whose trim is ``'Other'`` and listings whose numeric fields cannot be
        converted are marked processed and inactive without being stored.
        """
        print(url_data)
        url_id = url_data['id']
        url = url_data['url']
        listing_id = url_data['listing_id']

        response = self.request_manager.take_get_request(url)
        parsed_code = self.source_code_manager.parse_code(
            response['source_code'])

        if parsed_code.find('img', {'class': 'sold'}) is not None:
            self.db.set_url_inactive(url_id)
            self.err_logger.error("EXPIRED " + str(url_data))
            return
        if response['status_code'] == 404:
            self.db.set_url_inactive(url_id)
            self.err_logger.error("404 " + str(url_data))
            return

        try:
            marka = self.__find_make(parsed_code)
            year = self.__find_year(parsed_code)
            kilometres = self.__find_km(parsed_code)
            color = self.__find_color(parsed_code)
            specs = self.__find_specs(parsed_code)
            price = self.__find_price(parsed_code)
            model = self.__find_model(parsed_code, make=marka)
            trim = self.__find_trim(parsed_code, marka=marka, model=model)
            if trim == 'Other':
                self.db.set_url_processed(url_id)
                self.db.set_url_inactive(url_id)
                return
            phone = self.__find_phone(parsed_code)
        except Exception as exc:
            self.err_logger.error(str(exc) + str(url_data))
            self.db.set_url_processed(url_id)
            return

        try:
            data = {
                'year': int(year),
                'price': int(price),
                'kilometres': int(kilometres),
                'color': color,
                'specs': specs,
                'trim': trim,
                'model': model,
                'make': marka,
                'phone': phone,
            }
            print(data)
        except Exception as exc:
            # url_data is a dict: it must be stringified before being
            # concatenated, otherwise this handler itself raises TypeError.
            self.err_logger.error(str(exc) + str(url_data))
            self.db.set_url_processed(url_id)
            self.db.set_url_inactive(url_id)
            return

        self.db.insert_data(data=data, listing_id=listing_id, url=url,
                            source=self.DOMAIN)
        self.db.set_url_processed(url_id)

    def update_data(self, url_data):
        """Re-visit a stored listing and refresh its price / sold status.

        ``url_data`` is a mapping with the keys ``id``, ``url``,
        ``listing_id`` and ``timestamp``.  A gone listing (sold badge or HTTP
        404) is flagged as sold and its URL deactivated; otherwise the price
        is updated, falling back to 0 when it cannot be read.
        """
        timestamp = generate_timestamp()
        url_id = url_data['id']
        listing_id = url_data['listing_id']
        print(listing_id)
        url = url_data['url']
        first_timestamp = url_data['timestamp']
        # NOTE(review): this is "first seen" minus the generated timestamp.
        # If generate_timestamp() returns the current time the result is
        # negative -- confirm the intended sign before relying on it.
        time_dif = first_timestamp - datetime.strptime(timestamp,
                                                       "%Y.%m.%d:%H:%M:%S")
        time_dif = time_dif.days

        response = self.request_manager.take_get_request(url)
        parsed_code = self.source_code_manager.parse_code(
            response['source_code'])

        sold = parsed_code.find('img', {'class': 'sold'}) is not None
        missing = not sold and response['status_code'] == 404
        if sold or missing:
            if missing:
                print('404 %s' % (listing_id,))
            self.db.set_sold_status(listing_id=listing_id,
                                    days_for_selling=time_dif)
            self.db.set_url_inactive(url_id)
            return

        try:
            price = int(self.__find_price(parsed_code))
        except Exception:
            price = 0

        self.db.update_listing(listing_id=listing_id, price=price,
                               days_on_market=time_dif)
        self.db.set_updated(listing_id=listing_id)

    def __find_make(self, code):
        """Return the text next to the 'Make:' label, or '' on failure."""
        try:
            return self.__find_tag_by_text(code, text='Make:')
        except Exception:
            return ''

    def __find_year(self, code):
        """Return the first integer token next to 'Year:', or '' if none."""
        try:
            tokens = self.__find_tag_by_text(code, text='Year:').split()
        except Exception:
            return ''
        for token in tokens:
            try:
                return int(token)
            except ValueError:
                continue
        # No numeric token at all: same fallback as a missing label.
        return ''

    def __find_km(self, code):
        """Return the mileage next to 'Kilometers:' as int, or 0 on failure."""
        try:
            km = self.__find_tag_by_text(code, text='Kilometers:')
            return int(km.replace(",", "").replace(".", "").replace(" ", ""))
        except Exception:
            return 0

    def __find_color(self, code):
        """Return the stripped text next to 'Color:', or '' on failure."""
        try:
            return self.__find_tag_by_text(code, text='Color:').strip()
        except Exception:
            return ''

    def __find_specs(self, code):
        """Return the stripped text next to 'Specs:', or '' on failure."""
        try:
            return self.__find_tag_by_text(code, text='Specs:').strip()
        except Exception:
            return ''

    def __generateEditedTrims(self, marka, trim):
        """Match hyphenated catalogue trims written with spaces on the page.

        For every catalogue trim of ``marka`` that is longer than three
        characters and contains '-', the hyphens are replaced by spaces and
        the result (as is, or title-cased) is searched in ``trim``.  Returns
        the first matching catalogue trim, or '' when nothing matches.
        """
        for example in self.trim_list:
            try:
                candidate = example['trim']
                if len(candidate) <= 3:
                    continue
            except Exception:
                # Rows without a usable trim value are ignored.
                continue
            if '-' not in candidate or example['make'] != marka:
                continue
            spaced = candidate.replace('-', ' ')
            if spaced in trim or spaced.title() in trim:
                print(candidate)
                return candidate
        return ''

    def __find_trim(self, code, marka, model):
        """Derive the trim from the 'Model:' field of the page.

        The model name is removed from the field; what is left is compared
        with the catalogue trims of ``marka`` and the longest match wins
        (trims of one or two characters must match as a whole word).

        Returns the full field when nothing is left after removing the
        model, the matched catalogue trim, the leftover text itself when it
        has at most two words, ``None`` when the leftover is longer and
        unmatched, and '' on any error.
        """
        try:
            raw_model = self.__find_tag_by_text(code, text='Model:').strip()
            trim = raw_model.replace(model, '').strip()
            if not trim.split():
                print('%s there is no Trim!!!!' % (raw_model,))
                return raw_model.strip()

            best = ''
            padded = ' ' + trim + ' '
            for example in self.trim_list:
                if example['make'] != marka:
                    continue
                candidate = example['trim']
                # Skip rows without a trim (the sibling helper guards the
                # same case) instead of aborting the whole search.
                if not candidate or candidate not in trim:
                    continue
                if len(candidate) <= 2 and ' ' + candidate + ' ' not in padded:
                    continue
                if len(candidate) > len(best):
                    print(candidate)
                    best = candidate

            edited = self.__generateEditedTrims(marka=marka, trim=trim)
            if len(edited) > len(best):
                return edited
            if best:
                return best
            if len(trim.split()) <= 2:
                return trim
            # Long, unrecognised leftover: no trim is reported.
            return None
        except Exception:
            return ''

    def __find_model(self, code, make):
        """Return the last breadcrumb without its leading make words."""
        try:
            breadcrumbs = code.findAll('span', {'typeof': 'v:Breadcrumb'})
            name = breadcrumbs[-1].text
            words_in_make = len(make.split())
            return ' '.join(name.split()[words_in_make:]).strip()
        except Exception as exc:
            print(exc)
            return ''

    def __find_phone(self, code):
        """Return the 'data-reveal' phone value, or '' on failure.

        Quotes, spaces and square brackets are stripped from the value.
        """
        try:
            phone = code.find('p', {
                'id': 'contact-buttons'
            }).find('a')['data-reveal']
            for junk in ('"', " ", "[", "]"):
                phone = phone.replace(junk, "")
            return phone.strip()
        except Exception as exc:
            print(exc)
            return ''

    def __find_price(self, code):
        """Return the listed price as int, or 0 when it cannot be read.

        Tries the ``money`` element first and ``money reduced`` second.
        """
        for css_class in ('money', 'money reduced'):
            try:
                price = code.find('strong', {'class': css_class}).text
                for junk in ('AED', " ", ",", ".", "-"):
                    price = price.replace(junk, "")
                return int(price)
            except Exception:
                continue
        return 0

    def __find_tag_by_text(self, code, text):
        """Return the text of the element following the label ``text``.

        Raises AttributeError when the label or its sibling is missing.
        """
        tag_with_text = code.find(text=text)
        needed_tag = tag_with_text.parent.find_next_sibling()
        return needed_tag.text
class DataExtractor:
    """Synchronise carswitch.com listings with the database.

    All cars are fetched in a single request to the site's Algolia search
    index; ``extract_data`` inserts the ones not stored yet and
    ``update_data`` refreshes or retires the stored ones.
    """

    DOMAIN = 'carswitch.com'
    PROJECT_ID = 15

    # Names returned by indicate_specs() for the numeric 'GCCspecs' field.
    SPECS_BY_INDEX = {
        0: 'American',
        1: 'GCC',
        2: 'European',
        3: 'Japanese',
        4: 'Canadian',
    }

    def __init__(self):
        self.request_manager = RequestManager()
        self.source_code_manager = SourceCodeManager()
        self.generator = Generator()
        self.db = DatabaseManager()

    def update_data(self, db_list_listings):
        """Refresh stored listings against the cars currently online.

        Stored listing ids that are no longer returned by the index are
        flagged as sold (with ``days_for_selling=0``); the others get their
        price and days-on-market updated.  Ids are compared as strings.
        Does nothing when the index request returns no cars.
        """
        list_cars = self.take_js_request()
        if not list_cars:
            print('Error Carswitch, len of list cars is 0')
            return

        # Index the cars once instead of rescanning the list per listing;
        # setdefault keeps the first car for a duplicated id.
        cars_by_id = {}
        for car in list_cars:
            cars_by_id.setdefault(str(car['inspectionID']), car)

        expired_ids = []
        active_cars = []
        for listing in db_list_listings:
            car = cars_by_id.get(str(listing))
            if car is None:
                expired_ids.append(listing)
            else:
                active_cars.append({
                    'listing_id': car['inspectionID'],
                    'price': car['salePrice']
                })

        for expired_id in expired_ids:
            self.db.set_sold_status(listing_id=expired_id,
                                    days_for_selling=0)
            print('%s Not Active' % (expired_id,))

        for active_car in active_cars:
            timestamp = self.get_info_about_car(
                active_car['listing_id']).timestamp
            days_on_market = self.calculate_days_on_market(
                first_timestamp=timestamp)
            self.db.update_listing(listing_id=active_car['listing_id'],
                                   price=active_car['price'],
                                   days_on_market=days_on_market)
            print('%s Active' % (active_car,))

    def take_js_request(self):
        """Query the search index and return the list of car records.

        Returns ``[]`` (after printing the error) when the request or the
        decoding of the response fails.
        """
        # NOTE(review): endpoint, application id and API key are hard-coded;
        # consider moving them to configuration.
        payload = ('{"requests": [{"indexName": "All_Carswitch_Cars","params": "query=&'
                   'numericFilters=%2CinspectionStatus!%3D9%2Cpromoted!%3D1%2C(new!%3D1)&facetFilters=&page=&hitsPerPage=1200"}]}')
        url = 'http://ih3kc909gb-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20AngularJS%203.15.1&x-algolia-application-id=IH3KC909GB&x-algolia-api-key=0a4fcd3b57535f88c86172d5646d6787'
        try:
            response = urllib2.urlopen(url, data=payload)
            try:
                body = response.read()
            finally:
                # Release the connection even if reading fails.
                response.close()
            return json.loads(body)['results'][0]['hits']
        except Exception as e:
            print('Error: ' + str(e))
            return []

    def indicate_specs(self, specs_index):
        """Translate a numeric specs index (0-4) into its name.

        Returns ``None`` for any other value.
        """
        try:
            return self.SPECS_BY_INDEX.get(specs_index)
        except TypeError:
            # Unhashable input (e.g. a list) is simply "unknown".
            return None

    def extract_data(self, db_list_listings):
        """Insert every online car that is not stored yet.

        ``db_list_listings`` holds the listing ids already in the database.
        They are compared as strings, the same way ``update_data`` does, so
        integer and string ids are both recognised.  Does nothing when the
        index request returns no cars.
        """
        list_cars = self.take_js_request()
        if not list_cars:
            print('Error Carswitch, len of list cars is 0')
            return
        print(len(list_cars))

        known_ids = set(str(listing) for listing in db_list_listings)
        for car in list_cars:
            listing_id = car['inspectionID']
            car_id = car['carID']
            if str(listing_id) in known_ids:
                print('%s Exist!' % (listing_id,))
                continue

            data = {
                'make': car['make'],
                'model': car['model'],
                'trim': car['displayTrim'],
                'year': car['year'],
                'kilometres': car['mileage'],
                'color': car['_highlightResult']['colorPaint']['value'],
                'specs': self.indicate_specs(car['GCCspecs']),
                'price': car['salePrice'],
                'phone': '',
            }
            url = 'http://carswitch.com/uae/used-car/{0}/{1}/{2}/{3}-{4}'\
                .format(data['make'], data['model'], data['year'],
                        listing_id, car_id)
            print(data)
            self.db.insert_data(data=data, listing_id=listing_id, url=url,
                                source=self.DOMAIN)

    def get_info_about_car(self, listing_id):
        """Return the stored record for ``listing_id``."""
        return self.db.get_car_data(listing_id=listing_id)

    def calculate_days_on_market(self, first_timestamp):
        """Return the whole-day difference to the generated timestamp."""
        timestamp = generate_timestamp()
        # NOTE(review): this is "first seen" minus the generated timestamp.
        # If generate_timestamp() returns the current time the result is
        # negative -- confirm the intended sign before relying on it.
        time_dif = first_timestamp - datetime.strptime(timestamp,
                                                       "%Y.%m.%d:%H:%M:%S")
        return time_dif.days

    def main(self):
        """Insert new listings, then refresh the previously stored ones."""
        list_listings = self.db.get_all_cars_listings(self.DOMAIN)
        self.extract_data(list_listings)
        self.update_data(list_listings)