class DataExtractor: DOMAIN = 'dubicars.com' PROJECT_ID = 13 PATH = 'phones/' def __init__(self): print self.DOMAIN self.logger = Logger(name='dubicars_data_log') self.err_logger = Logger(name='err_dubicars_data_log') self.request_manager = RequestManager() self.source_code_manager = SourceCodeManager() self.generator = Generator() self.db = DatabaseManager() self.trim_list = self.db.get_trim_list() def extract_data(self, url_data): print url_data url_id = url_data['id'] url = url_data['url'] listing_id = url_data['listing_id'] data = {} response = self.request_manager.take_get_request(url) source_code = response['source_code'] parsed_code = self.source_code_manager.parse_code(source_code) expired = parsed_code.find('img', {'class': 'sold'}) if expired is not None: self.db.set_url_inactive(url_id) self.err_logger.error("EXPIRED " + str(url_data)) return elif response['status_code'] == 404: self.db.set_url_inactive(url_id) self.err_logger.error("404 " + str(url_data)) return try: marka = self.__find_make(parsed_code) year = self.__find_year(parsed_code) kilometres = self.__find_km(parsed_code) color = self.__find_color(parsed_code) specs = self.__find_specs(parsed_code) price = self.__find_price(parsed_code) model = self.__find_model(parsed_code, make=marka) trim = self.__find_trim(parsed_code, marka=marka, model=model) if trim == 'Other': self.db.set_url_processed(url_id) self.db.set_url_inactive(url_id) return phone = self.__find_phone(parsed_code) except Exception as exc: self.err_logger.error(str(exc) + str(url_data)) self.db.set_url_processed(url_id) return try: data['year'] = int(year) data['price'] = int(price) data['kilometres'] = int(kilometres) data['color'] = color data['specs'] = specs data['trim'] = trim data['model'] = model data['make'] = marka data['phone'] = phone print data except Exception as exc: self.err_logger.error(str(exc) + url_data) self.db.set_url_processed(url_id) self.db.set_url_inactive(url_id) return self.db.insert_data(data=data, listing_id=listing_id, url=url, source=self.DOMAIN) self.db.set_url_processed(url_id) def update_data(self, url_data): timestamp = generate_timestamp() url_id = url_data['id'] listing_id = url_data['listing_id'] print listing_id url = url_data['url'] first_timestamp = url_data['timestamp'] time_dif = first_timestamp - datetime.strptime(timestamp, "%Y.%m.%d:%H:%M:%S") time_dif = time_dif.days response = self.request_manager.take_get_request(url) source_code = response['source_code'] parsed_code = self.source_code_manager.parse_code(source_code) expired = parsed_code.find('img', {'class': 'sold'}) if expired is not None: self.db.set_sold_status(listing_id=listing_id, days_for_selling=time_dif) #self.db.remove_listing(listing_id) self.db.set_url_inactive(url_id) return elif response['status_code'] == 404: print 404, listing_id self.db.set_sold_status(listing_id=listing_id, days_for_selling=time_dif) #self.db.remove_listing(listing_id) self.db.set_url_inactive(url_id) return try: price = self.__find_price(parsed_code) except: price = 0 # days = self.__calc_days_on_market(listing_id) self.db.update_listing(listing_id=listing_id, price=int(price), days_on_market=time_dif) self.db.set_updated(listing_id=listing_id) def __find_make(self, code): try: make = self.__find_tag_by_text(code, text='Make:') return make except: return '' def __find_year(self, code): try: year = self.__find_tag_by_text(code, text='Year:') year_list = year.split() for year in year_list: try: year = int(year) return year except: continue except: return '' def __find_km(self, code): try: km = self.__find_tag_by_text(code, text='Kilometers:') km = km.replace(",", "").replace(".", "").replace(" ", "") return int(km) except: return 0 def __find_color(self, code): try: color = self.__find_tag_by_text(code, text='Color:') return color.strip() except: return '' def __find_specs(self, code): try: specs = self.__find_tag_by_text(code, text='Specs:') return specs.strip() except: return '' # ============= TRIM =============== # ===== def __generateEditedTrims(self, marka, trim): for example_trim in self.trim_list: try: if len(example_trim['trim']) <= 3: continue except: continue if '-' in example_trim['trim']: if example_trim['make'] == marka: edited_example_trim = example_trim['trim'].replace( '-', ' ') if edited_example_trim in trim: print example_trim['trim'] return example_trim['trim'] edited_example_trim = example_trim['trim'].replace( '-', ' ').title() if edited_example_trim in trim: print example_trim['trim'] return example_trim['trim'] return '' def __find_trim(self, code, marka, model): try: to_return_trim = '' not_edited_trim = self.__find_tag_by_text(code, text='Model:').strip() trim = not_edited_trim.replace(model, '').strip() if len(trim.split()) == 0: print not_edited_trim, 'there is no Trim!!!!' return not_edited_trim.strip() for example_trim in self.trim_list: if example_trim['make'] == marka: if example_trim['trim'] in trim: if len(example_trim['trim']) <= 2: if ' ' + example_trim[ 'trim'] + ' ' in ' ' + trim + ' ': if len(example_trim['trim']) > len( to_return_trim): print example_trim['trim'] to_return_trim = example_trim['trim'] continue if len(example_trim['trim']) > len(to_return_trim): print example_trim['trim'] to_return_trim = example_trim['trim'] edited_trim = self.__generateEditedTrims(marka=marka, trim=trim) if len(edited_trim) > len(to_return_trim): return edited_trim elif to_return_trim == '': if len(trim.split()) <= 2 and len(trim.split()) > 0: return trim else: return to_return_trim except: return '' # ===== # ============= TRIM =============== def __find_model(self, code, make): try: breadcrumbs = code.findAll('span', {'typeof': 'v:Breadcrumb'}) name = breadcrumbs[-1].text len_make = len(make.split()) trim = name.split()[len_make:] trim = ' '.join(trim) return trim.strip() except Exception as exc: print exc return '' def __find_phone(self, code): try: phone = code.find('p', { 'id': 'contact-buttons' }).find('a')['data-reveal'] phone = phone.replace('"', "").replace(" ", "").replace("[", "").replace("]", "") return phone.strip() except Exception as exc: print exc return '' def __find_price(self, code): try: price = code.find('strong', {'class': 'money'}).text price = price.replace('AED', "").replace(" ", "").\ replace(",", "").\ replace(".", "").\ replace("-", "") return int(price) except: try: price = code.find('strong', {'class': 'money reduced'}).text price = price.replace('AED', "").replace(" ", ""). \ replace(",", ""). \ replace(".", ""). \ replace("-", "") return int(price) except: return 0 def __find_tag_by_text(self, code, text): tag_with_text = code.find(text=text) needed_tag = tag_with_text.parent.find_next_sibling() return needed_tag.text
class DataExtractor: DOMAIN = 'dubai.dubizzle.com' PROJECT_ID = 13 PATH = 'phones/' def __init__(self): print self.DOMAIN self.logger = Logger(name='dubizzle_data_log') self.err_logger = Logger(name='err_dubizzle_data_log') self.request_manager = RequestManager() self.source_code_manager = SourceCodeManager() self.generator = Generator() self.db = DatabaseManager() def extract_data(self, url_data): print url_data url_id = url_data['id'] url = url_data['url'] listing_id = url_data['listing_id'] data = {} response = self.request_manager.take_get_request(url) source_code = response['source_code'] parsed_code = self.source_code_manager.parse_code(source_code) expired = parsed_code.find('div', {'id': 'expired-ad-message'}) if expired is not None: #self.db.remove_listing(listing_id) self.db.set_url_inactive(url_id) self.err_logger.error("EXPIRED " + str(url_data)) return elif response['status_code'] == 404: #self.db.remove_listing(listing_id) self.err_logger.error("404 " + str(url_data)) self.db.set_url_inactive(url_id) return bread = parsed_code.find('span', {'id': 'browse_in_breadcrumb'}) items = bread.findAll('div') try: year = parsed_code.find('img', attrs={ 'alt': 'Year' }).parent.text.replace('Year', '').strip() kilometres = parsed_code.find('img', attrs={ 'alt': 'Kilometers' }).parent.text.replace('Kilometers', '').strip().replace(',', '').replace('.', '') color = parsed_code.find('img', attrs={ 'alt': 'Color' }).parent.text.replace('Color', '').strip() specs = parsed_code.find('img', attrs={ 'alt': 'Specs' }).parent.text.replace('Specs', '').strip() trim = parsed_code.find('img', attrs={ 'alt': 'Trim' }).parent.parent.text.replace('Trim', '').strip() if trim == 'Other': self.db.set_url_processed(url_id) return price = parsed_code.find('span', { 'id': 'actualprice' }).text.replace(',', '').replace('.', '') model = items[-1].find('a').text.strip() marka = items[-2].find('a').text.strip() phone = self.extract_phone(parsed_code, id=url_id) except Exception as exc: self.err_logger.error(str(exc) + str(url_data)) self.db.set_url_processed(url_id) return data['year'] = int(year) data['price'] = int(price) data['kilometres'] = int(kilometres) data['color'] = color data['specs'] = specs data['trim'] = trim data['model'] = model data['make'] = marka data['phone'] = phone self.db.insert_data(data=data, listing_id=listing_id, url=url, source=self.DOMAIN) self.db.set_url_processed(url_id) def update_data(self, url_data): timestamp = generate_timestamp() url_id = url_data['id'] listing_id = url_data['listing_id'] print listing_id url = url_data['url'] first_timestamp = url_data['timestamp'] time_dif = first_timestamp - datetime.strptime(timestamp, "%Y.%m.%d:%H:%M:%S") time_dif = time_dif.days response = self.request_manager.take_get_request(url) source_code = response['source_code'] parsed_code = self.source_code_manager.parse_code(source_code) expired = parsed_code.find('div', {'id': 'expired-ad-message'}) if expired is not None: self.db.set_sold_status(listing_id=listing_id, days_for_selling=time_dif) #self.db.remove_listing(listing_id) self.db.set_url_inactive(url_id) print "updated" return elif response['status_code'] == 404: print 404, listing_id self.db.set_sold_status(listing_id=listing_id, days_for_selling=time_dif) #self.db.remove_listing(listing_id) self.db.set_url_inactive(url_id) print "updated" return try: price = parsed_code.find('span', { 'id': 'actualprice' }).text.replace(',', '').replace('.', '') except: price = 0 # days = self.__calc_days_on_market(listing_id) self.db.update_listing(listing_id=listing_id, price=int(price), days_on_market=time_dif) self.db.set_updated(listing_id=listing_id) print "updated" # def __calc_days_on_market(self, listing_id): # days_on_market = self.db.get_car_data(listing_id).days_on_market # if days_on_market is None: # return 0 # days_on_market += 1 # return days_on_market def extract_phone(self, code, id): img = code.find('img', {'class': 'phone-num-img'})['src'] ext = img.partition('data:image/')[2].split(';')[0] with open(self.PATH + str(id) + '.' + ext, 'wb') as f: f.write(ba.a2b_base64(img.partition('base64,')[2])) text = textract.process(self.PATH + str(id) + '.' + ext).replace( ' ', '') if '+971' in text: pass else: text = '+971' + text os.remove(self.PATH + str(id) + '.' + ext) return text.strip()