class DataExtractor:
    """Extract and refresh car listings scraped from dubai.dubizzle.com.

    Pages are fetched through ``RequestManager``, parsed through
    ``SourceCodeManager`` and the results are written through
    ``DatabaseManager``.
    """

    DOMAIN = 'dubai.dubizzle.com'
    PROJECT_ID = 13

    # Path prefix for the temporary phone-number image written before OCR.
    PATH = 'phones/'

    # Layout of the string returned by generate_timestamp().
    _TIMESTAMP_FORMAT = "%Y.%m.%d:%H:%M:%S"

    def __init__(self):
        print(self.DOMAIN)
        self.logger = Logger(name='dubizzle_data_log')
        self.err_logger = Logger(name='err_dubizzle_data_log')
        self.request_manager = RequestManager()
        self.source_code_manager = SourceCodeManager()
        self.generator = Generator()
        self.db = DatabaseManager()

    def _fetch_page(self, url):
        """Download *url* and return ``(response, parsed_code)``."""
        response = self.request_manager.take_get_request(url)
        parsed_code = self.source_code_manager.parse_code(
            response['source_code'])
        return response, parsed_code

    def _detail_text(self, parsed_code, label):
        """Return the text next to the icon whose ``alt`` is *label*.

        Raises ``AttributeError`` when the icon is not on the page.
        """
        icon = parsed_code.find('img', attrs={'alt': label})
        return icon.parent.text.replace(label, '').strip()

    def _days_since(self, first_timestamp):
        """Return the whole days elapsed from *first_timestamp* until now.

        NOTE(review): assumes generate_timestamp() yields the current time
        and that *first_timestamp* is a ``datetime`` -- confirm.  The
        operands used to be subtracted the other way round, which produced
        negative day counts.
        """
        now = datetime.strptime(generate_timestamp(), self._TIMESTAMP_FORMAT)
        return (now - first_timestamp).days

    def extract_data(self, url_data):
        """Scrape one listing page and insert its data into the database.

        :param url_data: mapping with the keys ``'id'``, ``'url'`` and
            ``'listing_id'``.

        Expired and 404 pages deactivate the URL.  Any failure while reading
        or converting the fields is logged to the error logger and the URL
        is marked as processed, so a single malformed page cannot abort the
        caller.  Listings whose trim is ``'Other'`` are skipped.
        """
        print(url_data)
        url_id = url_data['id']
        url = url_data['url']
        listing_id = url_data['listing_id']

        response, parsed_code = self._fetch_page(url)

        # The listing row itself is kept; only the URL is deactivated.
        if parsed_code.find('div', {'id': 'expired-ad-message'}) is not None:
            self.db.set_url_inactive(url_id)
            self.err_logger.error("EXPIRED " + str(url_data))
            return
        if response['status_code'] == 404:
            self.err_logger.error("404 " + str(url_data))
            self.db.set_url_inactive(url_id)
            return

        try:
            # The breadcrumb lookup lives inside the try block: a page
            # without it is an extraction failure, not a crash.
            bread = parsed_code.find('span', {'id': 'browse_in_breadcrumb'})
            items = bread.findAll('div')

            year = self._detail_text(parsed_code, 'Year')
            kilometres = self._detail_text(parsed_code, 'Kilometers')
            kilometres = kilometres.replace(',', '').replace('.', '')
            color = self._detail_text(parsed_code, 'Color')
            specs = self._detail_text(parsed_code, 'Specs')

            # The trim text is read from the grandparent of the icon.
            trim_icon = parsed_code.find('img', attrs={'alt': 'Trim'})
            trim = trim_icon.parent.parent.text.replace('Trim', '').strip()
            if trim == 'Other':
                self.db.set_url_processed(url_id)
                return

            price = parsed_code.find('span', {'id': 'actualprice'}).text
            price = price.replace(',', '').replace('.', '')
            model = items[-1].find('a').text.strip()
            marka = items[-2].find('a').text.strip()
            phone = self.extract_phone(parsed_code, id=url_id)

            # Numeric conversion is part of extraction: non-numeric text is
            # logged like any other malformed field.
            data = {
                'year': int(year),
                'price': int(price),
                'kilometres': int(kilometres),
                'color': color,
                'specs': specs,
                'trim': trim,
                'model': model,
                'make': marka,
                'phone': phone,
            }
        except Exception as exc:
            self.err_logger.error(str(exc) + str(url_data))
            self.db.set_url_processed(url_id)
            return

        self.db.insert_data(data=data,
                            listing_id=listing_id,
                            url=url,
                            source=self.DOMAIN)
        self.db.set_url_processed(url_id)

    def update_data(self, url_data):
        """Refresh price and days-on-market of an already stored listing.

        :param url_data: mapping with the keys ``'id'``, ``'url'``,
            ``'listing_id'`` and ``'timestamp'``.

        An expired or 404 page marks the listing as sold and deactivates
        its URL; otherwise the current price is written back.
        """
        url_id = url_data['id']
        listing_id = url_data['listing_id']
        print(listing_id)
        url = url_data['url']
        days_listed = self._days_since(url_data['timestamp'])

        response, parsed_code = self._fetch_page(url)
        expired = parsed_code.find('div', {'id': 'expired-ad-message'})
        if expired is not None:
            self.db.set_sold_status(listing_id=listing_id,
                                    days_for_selling=days_listed)
            self.db.set_url_inactive(url_id)
            print("updated")
            return

        if response['status_code'] == 404:
            print('%s %s' % (404, listing_id))
            self.db.set_sold_status(listing_id=listing_id,
                                    days_for_selling=days_listed)
            self.db.set_url_inactive(url_id)
            print("updated")
            return

        try:
            price = parsed_code.find('span', {'id': 'actualprice'}).text
            price = int(price.replace(',', '').replace('.', ''))
        except Exception:
            # NOTE(review): a price that cannot be read is stored as 0,
            # overwriting the previous value -- confirm this is intended.
            price = 0

        self.db.update_listing(listing_id=listing_id,
                               price=price,
                               days_on_market=days_listed)
        self.db.set_updated(listing_id=listing_id)
        print("updated")

    def extract_phone(self, code, id):
        """Read the phone number from the page's inline phone image.

        The base64 ``data:`` URI of the ``phone-num-img`` image is decoded
        into a temporary file under ``PATH``, passed to
        ``textract.process`` and the file is removed again.

        :param code: parsed page.
        :param id: value used as the temporary file name.
        :returns: the recognised text without spaces, prefixed with
            ``+971`` when that prefix is not already present.
        :raises TypeError: when the page has no phone image.
        """
        img = code.find('img', {'class': 'phone-num-img'})['src']

        ext = img.partition('data:image/')[2].split(';')[0]
        image_path = self.PATH + str(id) + '.' + ext
        # Decode before opening so bad base64 leaves no empty file behind.
        # NOTE(review): ``ba`` is presumably ``binascii`` -- confirm.
        payload = ba.a2b_base64(img.partition('base64,')[2])

        with open(image_path, 'wb') as f:
            f.write(payload)

        try:
            text = textract.process(image_path).replace(' ', '')
        finally:
            # Remove the temporary image even when the OCR step fails.
            os.remove(image_path)

        if '+971' not in text:
            text = '+971' + text

        return text.strip()
# Example #2
0
class DataExtractor:
    """Extract and refresh car listings scraped from dubicars.com.

    Pages are fetched through ``RequestManager``, parsed through
    ``SourceCodeManager`` and the results are written through
    ``DatabaseManager``.  Trim names are matched against the list returned
    by ``DatabaseManager.get_trim_list()``.
    """

    DOMAIN = 'dubicars.com'
    PROJECT_ID = 13

    PATH = 'phones/'

    # Layout of the string returned by generate_timestamp().
    _TIMESTAMP_FORMAT = "%Y.%m.%d:%H:%M:%S"

    def __init__(self):
        print(self.DOMAIN)
        self.logger = Logger(name='dubicars_data_log')
        self.err_logger = Logger(name='err_dubicars_data_log')
        self.request_manager = RequestManager()
        self.source_code_manager = SourceCodeManager()
        self.generator = Generator()
        self.db = DatabaseManager()
        # Entries are read as mappings with 'make' and 'trim' keys.
        self.trim_list = self.db.get_trim_list()

    def _days_since(self, first_timestamp):
        """Return the whole days elapsed from *first_timestamp* until now.

        NOTE(review): assumes generate_timestamp() yields the current time
        and that *first_timestamp* is a ``datetime`` -- confirm.  The
        operands used to be subtracted the other way round, which produced
        negative day counts.
        """
        now = datetime.strptime(generate_timestamp(), self._TIMESTAMP_FORMAT)
        return (now - first_timestamp).days

    def extract_data(self, url_data):
        """Scrape one listing page and insert its data into the database.

        :param url_data: mapping with the keys ``'id'``, ``'url'`` and
            ``'listing_id'``.

        Sold and 404 pages deactivate the URL.  Field-lookup failures are
        logged and the URL is marked processed; conversion failures are
        logged and the URL is additionally deactivated.  Listings whose
        trim is ``'Other'`` are skipped.
        """
        print(url_data)
        url_id = url_data['id']
        url = url_data['url']
        listing_id = url_data['listing_id']

        response = self.request_manager.take_get_request(url)
        parsed_code = self.source_code_manager.parse_code(
            response['source_code'])

        if parsed_code.find('img', {'class': 'sold'}) is not None:
            self.db.set_url_inactive(url_id)
            self.err_logger.error("EXPIRED " + str(url_data))
            return
        if response['status_code'] == 404:
            self.db.set_url_inactive(url_id)
            self.err_logger.error("404 " + str(url_data))
            return

        try:
            marka = self.__find_make(parsed_code)
            year = self.__find_year(parsed_code)
            kilometres = self.__find_km(parsed_code)
            color = self.__find_color(parsed_code)
            specs = self.__find_specs(parsed_code)
            price = self.__find_price(parsed_code)
            model = self.__find_model(parsed_code, make=marka)
            trim = self.__find_trim(parsed_code, marka=marka, model=model)
            if trim == 'Other':
                self.db.set_url_processed(url_id)
                self.db.set_url_inactive(url_id)
                return
            phone = self.__find_phone(parsed_code)
        except Exception as exc:
            self.err_logger.error(str(exc) + str(url_data))
            self.db.set_url_processed(url_id)
            return

        try:
            data = {
                'year': int(year),
                'price': int(price),
                'kilometres': int(kilometres),
                'color': color,
                'specs': specs,
                'trim': trim,
                'model': model,
                'make': marka,
                'phone': phone,
            }
            print(data)
        except Exception as exc:
            # url_data is a mapping, so it must be stringified before it is
            # appended to the message; otherwise this handler itself raises.
            self.err_logger.error(str(exc) + str(url_data))
            self.db.set_url_processed(url_id)
            self.db.set_url_inactive(url_id)
            return

        self.db.insert_data(data=data,
                            listing_id=listing_id,
                            url=url,
                            source=self.DOMAIN)
        self.db.set_url_processed(url_id)

    def update_data(self, url_data):
        """Refresh price and days-on-market of an already stored listing.

        :param url_data: mapping with the keys ``'id'``, ``'url'``,
            ``'listing_id'`` and ``'timestamp'``.

        A sold or 404 page marks the listing as sold and deactivates its
        URL; otherwise the current price is written back.
        """
        url_id = url_data['id']
        listing_id = url_data['listing_id']
        print(listing_id)
        url = url_data['url']
        days_listed = self._days_since(url_data['timestamp'])

        response = self.request_manager.take_get_request(url)
        parsed_code = self.source_code_manager.parse_code(
            response['source_code'])

        if parsed_code.find('img', {'class': 'sold'}) is not None:
            self.db.set_sold_status(listing_id=listing_id,
                                    days_for_selling=days_listed)
            self.db.set_url_inactive(url_id)
            return

        if response['status_code'] == 404:
            print('%s %s' % (404, listing_id))
            self.db.set_sold_status(listing_id=listing_id,
                                    days_for_selling=days_listed)
            self.db.set_url_inactive(url_id)
            return

        # __find_price already falls back to 0 when no price can be read.
        price = self.__find_price(parsed_code)

        self.db.update_listing(listing_id=listing_id,
                               price=int(price),
                               days_on_market=days_listed)
        self.db.set_updated(listing_id=listing_id)

    def __find_make(self, code):
        """Return the text next to the ``Make:`` label, or ``''``."""
        try:
            return self.__find_tag_by_text(code, text='Make:')
        except Exception:
            return ''

    def __find_year(self, code):
        """Return the first integer token next to ``Year:``, or ``''``."""
        try:
            tokens = self.__find_tag_by_text(code, text='Year:').split()
        except Exception:
            return ''
        for token in tokens:
            try:
                return int(token)
            except Exception:
                continue
        # No numeric token: same "not found" value as a missing label.
        return ''

    def __find_km(self, code):
        """Return the mileage next to ``Kilometers:`` as int, or ``0``."""
        try:
            km = self.__find_tag_by_text(code, text='Kilometers:')
            for separator in (",", ".", " "):
                km = km.replace(separator, "")
            return int(km)
        except Exception:
            return 0

    def __find_color(self, code):
        """Return the stripped text next to ``Color:``, or ``''``."""
        try:
            return self.__find_tag_by_text(code, text='Color:').strip()
        except Exception:
            return ''

    def __find_specs(self, code):
        """Return the stripped text next to ``Specs:``, or ``''``."""
        try:
            return self.__find_tag_by_text(code, text='Specs:').strip()
        except Exception:
            return ''

    # ============= TRIM ===============
    # =====
    def __generateEditedTrims(self, marka, trim):
        """Match hyphenated known trims against *trim* with spaces.

        Only known trims longer than three characters that contain ``-``
        and belong to *marka* are considered.  The hyphen is replaced by a
        space and the result is searched in *trim*, first as is and then
        title-cased.

        :returns: the first matching known trim in its original spelling,
            or ``''``.
        """
        for known in self.trim_list:
            try:
                if len(known['trim']) <= 3:
                    continue
            except Exception:
                # Entries without a usable trim are ignored.
                continue

            if '-' not in known['trim'] or known['make'] != marka:
                continue

            spaced = known['trim'].replace('-', ' ')
            if spaced in trim or spaced.title() in trim:
                print(known['trim'])
                return known['trim']
        return ''

    def __find_trim(self, code, marka, model):
        """Derive the trim name from the text next to ``Model:``.

        *model* is removed from that text and the remainder is compared
        with the known trims of *marka*; the longest known trim contained
        in it wins.  Known trims of one or two characters only count when
        they appear as a whole space-delimited word.

        :returns: the full ``Model:`` text when nothing remains after
            removing *model*; otherwise the best known trim; otherwise the
            remainder itself when it has at most two words; otherwise
            ``''``.  Any lookup failure also yields ``''``.
        """
        try:
            full_text = self.__find_tag_by_text(code, text='Model:').strip()
            trim = full_text.replace(model, '').strip()

            if len(trim.split()) == 0:
                print('%s %s' % (full_text, 'there is no Trim!!!!'))
                return full_text.strip()

            padded = ' ' + trim + ' '
            best = ''
            for known in self.trim_list:
                candidate = known['trim']
                # One empty/None trim must not abort the whole search.
                if not candidate:
                    continue
                if known['make'] != marka or candidate not in trim:
                    continue
                if len(candidate) <= 2 and \
                        ' ' + candidate + ' ' not in padded:
                    continue
                if len(candidate) > len(best):
                    print(candidate)
                    best = candidate

            edited = self.__generateEditedTrims(marka=marka, trim=trim)
            if len(edited) > len(best):
                return edited
            if best:
                return best
            if len(trim.split()) <= 2:
                return trim
            # Long unmatched remainder: report "no trim" explicitly
            # instead of falling through with an implicit None.
            return ''
        except Exception:
            return ''

    # =====
    # ============= TRIM ===============

    def __find_model(self, code, make):
        """Return the last breadcrumb's text without the leading make.

        As many leading words as *make* contains are dropped.  Returns
        ``''`` (after printing the error) on any failure.
        """
        try:
            breadcrumbs = code.findAll('span', {'typeof': 'v:Breadcrumb'})
            words = breadcrumbs[-1].text.split()
            return ' '.join(words[len(make.split()):]).strip()
        except Exception as exc:
            print(exc)
            return ''

    def __find_phone(self, code):
        """Return the ``data-reveal`` value of the contact link.

        Quotes, spaces and square brackets are removed.  Returns ``''``
        (after printing the error) on any failure.
        """
        try:
            phone = code.find('p', {
                'id': 'contact-buttons'
            }).find('a')['data-reveal']
            for junk in ('"', " ", "[", "]"):
                phone = phone.replace(junk, "")
            return phone.strip()
        except Exception as exc:
            print(exc)
            return ''

    def __find_price(self, code):
        """Return the listing price as int, or ``0`` when unreadable.

        The ``money`` element is tried first, then ``money reduced``.
        """
        for css_class in ('money', 'money reduced'):
            try:
                price = code.find('strong', {'class': css_class}).text
                for junk in ('AED', " ", ",", ".", "-"):
                    price = price.replace(junk, "")
                return int(price)
            except Exception:
                continue
        return 0

    def __find_tag_by_text(self, code, text):
        """Return the text of the element following the label *text*.

        The label's text node is located, and the next sibling of its
        parent element supplies the value.

        :raises AttributeError: when the label or the sibling is missing.
        """
        label = code.find(text=text)
        return label.parent.find_next_sibling().text
class DataExtractor:
    """Import and refresh car listings from the carswitch.com search feed.

    The whole catalogue is read with a single search request
    (``take_js_request``) and compared with the listing ids already stored
    in the database.
    """

    DOMAIN = 'carswitch.com'
    PROJECT_ID = 15

    # Labels for the numeric ``GCCspecs`` field of a feed record.
    _SPECS_LABELS = {
        0: 'American',
        1: 'GCC',
        2: 'European',
        3: 'Japanese',
        4: 'Canadian',
    }

    def __init__(self):
        self.request_manager = RequestManager()
        self.source_code_manager = SourceCodeManager()
        self.generator = Generator()
        self.db = DatabaseManager()

    def update_data(self, db_list_listings):
        """Mark vanished listings as sold and refresh the remaining ones.

        :param db_list_listings: listing ids already stored for this
            domain; they are compared with the feed's ``inspectionID``
            values by their string form.

        Stored ids missing from the feed get ``set_sold_status``; the
        others get their price and days-on-market updated.
        """
        list_cars = self.take_js_request()
        if len(list_cars) == 0:
            print('Error Carswitch, len of list cars is 0')
            return

        # Index the feed once instead of rescanning it per stored listing.
        # setdefault keeps the first record of an id, like the former
        # first-match-wins scan.
        feed_by_id = {}
        for car in list_cars:
            feed_by_id.setdefault(str(car['inspectionID']), car)

        list_expired_listing_id = []
        list_active_cars = []
        for listing in db_list_listings:
            car = feed_by_id.get(str(listing))
            if car is None:
                list_expired_listing_id.append(listing)
            else:
                list_active_cars.append({
                    'listing_id': car['inspectionID'],
                    'price': car['salePrice']
                })

        for expired_id in list_expired_listing_id:
            # NOTE(review): days_for_selling is always stored as 0 here --
            # confirm this is intended.
            self.db.set_sold_status(listing_id=expired_id,
                                    days_for_selling=0)
            print('%s %s' % (expired_id, 'Not Active'))

        for active_car in list_active_cars:
            timestamp = self.get_info_about_car(
                active_car['listing_id']).timestamp
            days_on_market = self.calculate_days_on_market(
                first_timestamp=timestamp)
            self.db.update_listing(listing_id=active_car['listing_id'],
                                   price=active_car['price'],
                                   days_on_market=days_on_market)
            print('%s %s' % (active_car, 'Active'))

    def take_js_request(self):
        """Query the search index and return the list of car records.

        :returns: the ``hits`` list of the first result set, or ``[]``
            when the request or the decoding fails (the error is printed).
        """
        data = '{"requests": [{"indexName": "All_Carswitch_Cars","params": "query=&' \
               'numericFilters=%2CinspectionStatus!%3D9%2Cpromoted!%3D1%2C(new!%3D1)&facetFilters=&page=&hitsPerPage=1200"}]}'

        # NOTE(review): the application id and API key are embedded in this
        # URL; consider moving them to configuration.
        url = 'http://ih3kc909gb-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20AngularJS%203.15.1&x-algolia-application-id=IH3KC909GB&x-algolia-api-key=0a4fcd3b57535f88c86172d5646d6787'

        try:
            response = urllib2.urlopen(url, data=data)
            try:
                payload = json.loads(response.read())
            finally:
                # Release the connection even when decoding fails.
                response.close()
            return payload['results'][0]['hits']
        except Exception as e:
            print('Error: ' + str(e))
            return []

    def indicate_specs(self, specs_index):
        """Translate a numeric specs code into its label.

        :returns: ``'American'``, ``'GCC'``, ``'European'``, ``'Japanese'``
            or ``'Canadian'`` for the codes 0-4, ``None`` for anything else.
        """
        try:
            return self._SPECS_LABELS.get(specs_index)
        except TypeError:
            # Unhashable values cannot be a known code.
            return None

    def extract_data(self, db_list_listings):
        """Insert every feed record that is not stored yet.

        :param db_list_listings: listing ids already stored for this
            domain.  They are compared by their string form, the same way
            ``update_data`` compares them, so ids stored as numbers are
            recognised as well.
        """
        list_cars = self.take_js_request()

        if len(list_cars) == 0:
            print('Error Carswitch, len of list cars is 0')
            return

        print(len(list_cars))

        known_ids = set(str(stored) for stored in db_list_listings)

        for car in list_cars:
            # The feed's inspectionID is what the database stores as the
            # listing id; carID is only used in the URL.
            listing_id = car['inspectionID']
            car_id = car['carID']

            if str(listing_id) in known_ids:
                print('%s %s' % (listing_id, 'Exist!'))
                continue

            data = {
                'make': car['make'],
                'model': car['model'],
                'trim': car['displayTrim'],
                'year': car['year'],
                'kilometres': car['mileage'],
                'color': car['_highlightResult']['colorPaint']['value'],
                'specs': self.indicate_specs(car['GCCspecs']),
                'price': car['salePrice'],
                'phone': '',
            }

            url = 'http://carswitch.com/uae/used-car/{0}/{1}/{2}/{3}-{4}'\
                .format(data['make'], data['model'], data['year'],
                        listing_id, car_id)

            print(data)

            self.db.insert_data(data=data,
                                listing_id=listing_id,
                                url=url,
                                source=self.DOMAIN)

    def get_info_about_car(self, listing_id):
        """Return the stored car object for *listing_id*."""
        return self.db.get_car_data(listing_id=listing_id)

    def calculate_days_on_market(self, first_timestamp):
        """Return the whole days elapsed from *first_timestamp* until now.

        NOTE(review): assumes generate_timestamp() yields the current time
        and that *first_timestamp* is a ``datetime`` -- confirm.  The
        operands used to be subtracted the other way round, which produced
        negative day counts.
        """
        now = datetime.strptime(generate_timestamp(), "%Y.%m.%d:%H:%M:%S")
        return (now - first_timestamp).days

    def main(self):
        """Import new listings, then refresh the ones stored before."""
        list_listings = self.db.get_all_cars_listings(self.DOMAIN)
        self.extract_data(list_listings)
        self.update_data(list_listings)