def get_finalcheck_offset():
    """Return the stored ``finalcheck_offset`` setting as an int.

    The value is read through ``DatabaseUtil.load_settings`` for the
    current site/spider pair.

    NOTE(review): ``self`` is not a parameter of this function; it has to
    be provided by an enclosing scope -- confirm against the caller.
    """
    db = DatabaseUtil(self.site, self.spider)
    stored = db.load_settings(fields="finalcheck_offset")
    raw_offset = stored['finalcheck_offset']
    return int(raw_offset)
def get_finalcheck_urls(self, old_days, block_size):
    """Fetch the next block of url_ids to re-check and advance the offset.

    The stored ``finalcheck_offset`` is read, passed together with
    ``old_days`` and ``block_size`` to ``DatabaseUtil.get_checking_urls``,
    and then rewritten: moved forward by the number of ids returned, or
    reset to 0 when nothing came back.

    Returns whatever ``get_checking_urls`` returned (a sized collection of
    url_ids).
    """
    # Resume from the offset persisted in the settings.
    stored = DatabaseUtil(self.site, self.spider).load_settings(
        fields="finalcheck_offset")
    start_offset = int(stored['finalcheck_offset'])

    url_ids = DatabaseUtil(self.site, self.spider).get_checking_urls(
        old_days, block_size, start_offset, 'H', 'D')

    # Step past the ids just handed out; an empty result wraps back to 0.
    found = len(url_ids)
    next_offset = start_offset + found if found > 0 else 0
    DatabaseUtil(self.site, self.spider).write_settings(
        field="finalcheck_offset", value=next_offset)

    return url_ids
def load_settings(self):
    """Load the active, block_size, main_startid and recon_startid settings.

    Returns a dict with those four keys; ``active`` is passed through as
    stored, the other three are converted to int. Any defaulting for
    missing values is left to ``DatabaseUtil.load_settings`` (not visible
    here).
    """
    db = DatabaseUtil(self.site, self.spider)
    raw = db.load_settings(
        fields="active, block_size, main_startid, recon_startid")

    loaded = {'active': raw['active']}
    for key in ('block_size', 'main_startid', 'recon_startid'):
        loaded[key] = int(raw[key])
    return loaded
def write_active(self, setting):
    """Persist the activation status for this spider.

    Spiders whose name contains ``'check'`` write to ``recheck_active``;
    all others write to ``active``.
    """
    is_check_spider = 'check' in self.spider
    active_field = 'recheck_active' if is_check_spider else 'active'
    DatabaseUtil(self.site, self.spider).write_settings(
        field=active_field, value=setting)
def load_settings(self):
    """Load the settings used by a finalcheck run.

    Reads recheck_active, finalcheck_olddays and block_size, then asks
    ``get_finalcheck_urls`` for the url_ids to process.

    Returns a dict with keys ``'active'`` (the stored recheck_active
    value) and ``'url_ids'``. Any defaulting for missing values is left
    to ``DatabaseUtil.load_settings`` (not visible here).
    """
    raw = DatabaseUtil(self.site, self.spider).load_settings(
        fields="recheck_active, finalcheck_olddays, block_size")

    active = raw['recheck_active']
    url_ids = self.get_finalcheck_urls(
        raw['finalcheck_olddays'], raw['block_size'])
    return {'active': active, 'url_ids': url_ids}
def write_startid(self, setting):
    """Store ``setting`` in the ``main_startid`` settings field."""
    db = DatabaseUtil(self.site, self.spider)
    db.write_settings(field="main_startid", value=setting)
def write_cycles(self, setting):
    """Store ``setting`` in the ``cycles`` settings field."""
    db = DatabaseUtil(self.site, self.spider)
    db.write_settings(field="cycles", value=setting)
def initialize_settings(self):
    """Create the initial settings for this site/spider via DatabaseUtil."""
    db = DatabaseUtil(self.site, self.spider)
    db.initialize_settings()
def write_finalcheck_offset(setting):
    """Store ``setting`` in the ``finalcheck_offset`` settings field.

    NOTE(review): ``self`` is not a parameter of this function; it has to
    be provided by an enclosing scope -- confirm against the caller.
    """
    db = DatabaseUtil(self.site, self.spider)
    db.write_settings(field="finalcheck_offset", value=setting)
def get_ids_for_vin(site, block_size):
    """Return the url_ids to fetch VINs for, as given by DatabaseUtil.

    ``site`` selects the DatabaseUtil instance and ``block_size`` is
    forwarded unchanged to its ``get_ids_for_vin`` method.
    """
    db = DatabaseUtil(site)
    url_ids = db.get_ids_for_vin(block_size)
    return url_ids
def extract_YMMT(data):
    """Parse year, make, model and trim out of a vehicle description.

    Args:
        data: the description string to parse.

    Returns:
        A dict with the keys ``'year'``, ``'make'``, ``'model'`` and
        ``'trim'`` (all strings), or ``-1`` when none of the known makes
        occurs in ``data``. ``'year'`` is the first run of digits in the
        description, or ``''`` when there is none. ``'model'`` and
        ``'trim'`` are ``''`` when no model of the make could be matched.
    """
    # Hard-coded list of makes; order matters because the first hit wins.
    standard_makes = (
        'Acura', 'Alfa Romeo', 'AMC', 'Aston Martin', 'Audi', 'Avanti',
        'Bentley', 'BMW', 'Buick', 'Cadillac', 'Chevrolet', 'Chrysler',
        'Daewoo', 'Daihatsu', 'Datsun', 'DeLorean', 'Dodge', 'Eagle',
        'Ferrari', 'Fiat', 'Fisker', 'Ford', 'Freightliner', 'Geo', 'GMC',
        'Honda', 'Hummer', 'Hyundai', 'Infiniti', 'Isuzu', 'Jaguar', 'Jeep',
        'Kia', 'Lamborghini', 'Lancia', 'Land Rover', 'Lexus', 'Lincoln',
        'Lotus', 'Maserati', 'Maybach', 'Mazda', 'McLaren', 'Mercedes-Benz',
        'Mercury', 'Merkur', 'Mini', 'Mitsubishi', 'Nissan', 'Oldsmobile',
        'Peugeot', 'Plymouth', 'Pontiac', 'Porsche', 'Renault', 'Rolls-Royce',
        'Saab', 'Saturn', 'Scion', 'Smart', 'SRT', 'Sterling', 'Subaru',
        'Suzuki', 'Tesla', 'Toyota', 'Triumph', 'Volkswagen', 'Volvo', 'Yugo',
        'Ram',
    )

    # The year is taken to be the first run of digits. A description
    # without any digits no longer raises; the year is simply left empty.
    year_match = re.search(r'(\d+)', data)
    year = year_match.group(1) if year_match else ''

    # Look for a make, first as listed and then fully upper-cased.
    make = None
    for candidate in standard_makes:
        if candidate in data:
            make = candidate
            break
        if candidate.upper() in data:
            make = candidate.upper()
            break
    if not make:
        # No known make in the description: nothing more can be extracted.
        return -1

    # Drop the first occurrence of year and make so only model/trim remain.
    if year:
        data = data.replace(year, '', 1)
    data = data.replace(make, '', 1).strip()

    model = ""
    trim = ""
    # Candidate model strings come from the n-grams of what is left.
    ngrams = generate_ngrams(data)
    # All models of this make, normalised once for case-insensitive lookup.
    all_models = DatabaseUtil().get_all_models(make)
    known_models = set(name.lower().strip() for name in all_models)

    for gram in ngrams:
        if gram.lower().strip() not in known_models:
            continue
        model = gram
        # The trim is whatever follows the model on the same line. The
        # model is escaped so characters such as '.', '+' or '(' are
        # matched literally instead of being read as regex syntax.
        trim_match = re.search(re.escape(model) + r'(.+)', data)
        if trim_match:
            trim = trim_match.group(1).strip()
        break

    return {
        'year': year,
        'make': make.strip(),
        'model': model.strip(),
        'trim': trim.strip(),
    }
def generate_ids(site):
    """Generate the array of ids the recon spider should probe on this run.

    Reads recon_startid, block_size, cycles, cycles_limit and overs from
    the settings of ``site`` and derives a strided range of ids from them.
    When the ``overs`` counter changes it is written back to the settings.

    Args:
        site: site identifier handed to ``DatabaseUtil``.

    Returns:
        An ``array('i')`` of ids, each shifted by the current ``overs``.
    """
    from array import array

    settings = DatabaseUtil(site).load_settings(
        fields="recon_startid, block_size, cycles, cycles_limit, overs")

    old_startid = int(settings['recon_startid']) + 1
    cycles = int(settings['cycles'])
    cycles_limit = int(settings['cycles_limit'])
    block_size = int(settings['block_size'])
    stored_overs = int(settings['overs'])

    # Reaching the cycle limit bumps the offset added to every id.
    overs = stored_overs
    if cycles >= cycles_limit:
        overs += 1

    # Floor division is spelled out with // so the digit arithmetic does
    # not depend on how the plain / operator behaves for ints.
    hundreds = cycles // 100
    if hundreds > 0:
        cycles = hundreds

    # Lowest decimal digit of the (possibly reduced) cycle counter.
    cycles1 = cycles % 10
    # Next decimal digit of the same counter.
    cycles2 = (cycles // 10) % 10

    # Stride between generated ids.
    skip = 17
    # cycles1 shifts where this run starts.
    new_startid = int(old_startid + cycles1)
    # cycles2 decides how many blocks ahead the recon window begins.
    recon_startid = int(new_startid + cycles2 * block_size)
    end_id = int(recon_startid + block_size)
    # Cycle count from which the start of the range is re-checked as well.
    backcheck = 5

    # An array of ints is used because it takes less memory than a list.
    recon_list = array('i', xrange(recon_startid, end_id, skip))

    if cycles >= backcheck:
        # NOTE(review): with a span of skip // 2 and a step of skip this
        # range only ever yields new_startid itself. The original comment
        # spoke of re-checking the first group at doubled intensity once
        # cycles exceed 50 -- confirm the intended bounds, step and
        # threshold.
        backcheck_list = array(
            'i', xrange(new_startid, new_startid + skip // 2, skip))
        recon_list.extend(backcheck_list)

    recon_list = array('i', (url_id + overs for url_id in recon_list))

    # Persist overs only when it changed; it wraps to 0 once it exceeds 10.
    # The ids generated above still carry the un-wrapped value.
    if overs != stored_overs:
        if overs > 10:
            overs = 0
        DatabaseUtil(site).write_settings(field="overs", value=overs)

    return recon_list