def process_item(self, item, spider):
    """Pass ``item`` through unless one of its text fields is rejected.

    ``item.job_title`` is checked first and ``item.job_desc`` second; the
    description is only read when the title did not match. ``spider`` is
    accepted but not used by this method.

    Returns:
        The same ``item`` object, unmodified, when neither field matches.

    Raises:
        DropItem: when ``RejectionPattern.should_be_rejected`` reports a
            match for the title or for the description.
    """
    # (attribute to inspect, message used when that attribute is rejected)
    checks = (
        ('job_title', 'Job matches rejection pattern. Removing...'),
        ('job_desc', 'Job Description matches rejection pattern. Removing...'),
    )
    for field_name, reason in checks:
        # getattr keeps the attribute read lazy, so job_desc is never
        # touched once the title has already caused a drop.
        if RejectionPattern.should_be_rejected(getattr(item, field_name)):
            raise DropItem(reason)
    return item
def test_should_be_rejected(self):
    """Check ``RejectionPattern.should_be_rejected`` against saved patterns.

    Two patterns are saved one after the other; after each save a few
    sample inputs are checked. The first pattern is a run of digits 1-9.
    The second uses a negative lookbehind, so the keyword is rejected
    except when it directly follows the negating character.
    """
    # Each entry: (pattern to save, ((input text, expect rejection?), ...))
    scenarios = (
        ("[1-9]+", (
            ("9887", True),
            ("abcd", False),
        )),
        (u"(?<!非)中介", (
            (u"中介", True),
            (u"是中介", True),
            (u"非中介", False),
        )),
    )
    for pattern, samples in scenarios:
        RejectionPattern(pattern).save()
        for text, expect_rejected in samples:
            outcome = RejectionPattern.should_be_rejected(text)
            if expect_rejected:
                self.assertTrue(outcome, "input_text should be rejected")
            else:
                self.assertFalse(outcome, "input_text should not be rejected")
def process_item(self, item, spider):
    """Return ``item`` unchanged, or drop it when a rejection pattern hits.

    The title is tested before the description, and the description is
    only evaluated when the title passed. ``spider`` is not used here.

    Returns:
        ``item`` itself when neither ``job_title`` nor ``job_desc`` is
        rejected by ``RejectionPattern.should_be_rejected``.

    Raises:
        DropItem: with a title-specific or description-specific message,
            depending on which field matched first.
    """
    # check by title first, then by description
    if RejectionPattern.should_be_rejected(item.job_title):
        drop_reason = 'Job matches rejection pattern. Removing...'
    elif RejectionPattern.should_be_rejected(item.job_desc):
        drop_reason = 'Job Description matches rejection pattern. Removing...'
    else:
        return item
    raise DropItem(drop_reason)
def test_should_be_rejected(self):
    """Exercise ``RejectionPattern.should_be_rejected`` with two patterns.

    First a digit pattern is saved and checked against a numeric and an
    alphabetic input. Then a pattern with a negative lookbehind is saved:
    the keyword must be rejected on its own or after an unrelated
    character, but not when it directly follows the negating character.
    """
    rejected_msg = 'input_text should be rejected'
    accepted_msg = 'input_text should not be rejected'

    # Pattern 1: one or more digits in the range 1-9.
    digit_rule = RejectionPattern('[1-9]+')
    digit_rule.save()
    numeric_hit = RejectionPattern.should_be_rejected('9887')
    self.assertTrue(numeric_hit, rejected_msg)
    alpha_hit = RejectionPattern.should_be_rejected('abcd')
    self.assertFalse(alpha_hit, accepted_msg)

    # Pattern 2: keyword not immediately preceded by the negating character.
    keyword_rule = RejectionPattern(u'(?<!非)中介')
    keyword_rule.save()
    bare_hit = RejectionPattern.should_be_rejected(u'中介')
    self.assertTrue(bare_hit, rejected_msg)
    prefixed_hit = RejectionPattern.should_be_rejected(u'是中介')
    self.assertTrue(prefixed_hit, rejected_msg)
    negated_hit = RejectionPattern.should_be_rejected(u'非中介')
    self.assertFalse(negated_hit, accepted_msg)
def should_load_details(self, job_item):
    """Decide whether the details of ``job_item`` should be loaded.

    The checks run in the order below and the first one that hits ends
    the evaluation; each hit is logged at INFO level, tagged with
    ``self.name``:

    1. ``JobItem.is_exists(job_item)``
    2. ``JobItem.is_older_required(job_item)``
    3. ``BlockedContact.is_contact_blocked(job_item.contact)``
    4. ``RejectionPattern.should_be_rejected(job_item.job_title)``

    Returns:
        bool: ``False`` as soon as any check hits, ``True`` when none do.
    """
    # Values are passed to the logger as arguments instead of being
    # %-formatted up front: the message is then only built when an INFO
    # record is actually emitted. Assumes `logger` is a standard
    # `logging.Logger` -- TODO confirm; with one, a formatting problem is
    # reported by the logging machinery rather than raised from here.
    if JobItem.is_exists(job_item):
        logger.info(
            '[%s] skipping loading details as job already exists. job_title: %s',
            self.name, job_item.job_title)
        return False

    if JobItem.is_older_required(job_item):
        # %s stringifies the configured threshold, so no explicit str().
        logger.info(
            '[%s] skipping loading details as job is older than %s days. job_title: %s',
            self.name, config.HOUSEKEEPING_RECORD_ORDLER_THAN,
            job_item.job_title)
        return False

    if BlockedContact.is_contact_blocked(job_item.contact):
        logger.info(
            '[%s] skipping loading details as job contact is blocked. contact: %s',
            self.name, job_item.contact)
        return False

    if RejectionPattern.should_be_rejected(job_item.job_title):
        logger.info(
            '[%s] skipping loading details as job matches rejection pattern. job_title: %s',
            self.name, job_item.job_title)
        return False

    return True
def should_load_details(self, job_item):
    """Tell the caller whether loading details for ``job_item`` is worthwhile.

    Four checks are applied in sequence; the first that hits logs an INFO
    line (prefixed with ``self.name``) and short-circuits the rest:

    * the job already exists (``JobItem.is_exists``),
    * the job is older than required (``JobItem.is_older_required``),
    * the job's contact is blocked (``BlockedContact.is_contact_blocked``),
    * the job title matches a rejection pattern
      (``RejectionPattern.should_be_rejected``).

    Returns:
        bool: ``True`` only if none of the checks hit, otherwise ``False``.
    """
    # The format string and its values go to logger.info separately so
    # that interpolation is deferred to the logger and skipped entirely
    # when INFO is disabled. Assumes `logger` is a standard
    # `logging.Logger` -- TODO confirm.
    if JobItem.is_exists(job_item):
        logger.info(
            '[%s] skipping loading details as job already exists. job_title: %s',
            self.name, job_item.job_title)
        return False

    if JobItem.is_older_required(job_item):
        # The threshold is rendered by %s itself; wrapping it in str()
        # beforehand would only force eager work.
        logger.info(
            '[%s] skipping loading details as job is older than %s days. job_title: %s',
            self.name, config.HOUSEKEEPING_RECORD_ORDLER_THAN,
            job_item.job_title)
        return False

    if BlockedContact.is_contact_blocked(job_item.contact):
        logger.info(
            '[%s] skipping loading details as job contact is blocked. contact: %s',
            self.name, job_item.contact)
        return False

    if RejectionPattern.should_be_rejected(job_item.job_title):
        logger.info(
            '[%s] skipping loading details as job matches rejection pattern. job_title: %s',
            self.name, job_item.job_title)
        return False

    return True