def should_load_details(self, job_item):
    """Tell whether the detail page of ``job_item`` should be requested.

    The checks run in this order and stop at the first one that hits:
    ``JobItem.is_exists``, ``JobItem.is_older_required``,
    ``BlockedContact.is_contact_blocked`` (on the item's contact) and
    ``RejectionPattern.should_be_rejected`` (on the item's title).

    Returns ``False`` after logging the reason at info level when a check
    hits, ``True`` when none does.
    """
    skip_message = None

    if JobItem.is_exists(job_item):
        skip_message = '[%s] skipping loading details as job already exists. job_title: %s' % (
            self.name, job_item.job_title)
    elif JobItem.is_older_required(job_item):
        skip_message = '[%s] skipping loading details as job is older than %s days. job_title: %s' % (
            self.name, str(config.HOUSEKEEPING_RECORD_ORDLER_THAN), job_item.job_title)
    elif BlockedContact.is_contact_blocked(job_item.contact):
        skip_message = '[%s] skipping loading details as job contact is blocked. contact: %s' % (
            self.name, job_item.contact)
    elif RejectionPattern.should_be_rejected(job_item.job_title):
        skip_message = '[%s] skipping loading details as job matches rejection pattern. job_title: %s' % (
            self.name, job_item.job_title)

    if skip_message is None:
        return True

    logger.info(skip_message)
    return False
def test_find_all(self): self.job_item.save() another_job_item = JobItem() another_job_item.job_title = "Another Test Job" another_job_item.save() records = JobItem.findall() print "Job Items", records self.assertEqual(2, len(records))
def test_find_with_pagination(self): for i in range(0, 20): job_item = JobItem() job_item.job_title = "job_item_%d" % i job_item.save() records = JobItem.find_with_pagination(page_request={"page_no": 2, "size": 10}) print "Job Items", records self.assertEqual(10, len(records))
def setUp(self):
    """Call ``clean_db`` and build the ``JobItem`` fixture used by the tests."""
    self.clean_db()

    fixture = JobItem()
    # crawled_date and publish_date are not set here; their assignments
    # were already disabled (commented out).
    field_values = (
        ('job_title', "Test Job"),
        ('employer_name', "Test Job Employer"),
        ('job_country', "Singapore"),
        ('job_desc', "This is a test job"),
        ('contact', "88888888"),
    )
    for field_name, field_value in field_values:
        setattr(fixture, field_name, field_value)
    self.job_item = fixture

    # NOTE(review): ``source`` is stored on the test case itself, not on the
    # job item -- confirm this is intended.
    self.source = 'unit_test'
def process_item(self, item, spider):
    """Drop items flagged by ``JobItem.is_older_required``; pass the rest on.

    Args:
        item: the job item being processed.
        spider: not used by this method.

    Returns:
        ``item`` unchanged when it is not dropped.

    Raises:
        DropItem: when ``JobItem.is_older_required(item)`` is true.
    """
    if JobItem.is_older_required(item):
        # The message used to read "order than"; corrected to "older than".
        raise DropItem(
            'Job is published older than %s days. Removing...'
            % str(config.HOUSEKEEPING_RECORD_ORDLER_THAN))
    return item
def process_item(self, item, spider):
    """Drop items flagged by ``JobItem.is_older_required``; pass the rest on.

    Args:
        item: the job item being processed.
        spider: not used by this method.

    Returns:
        ``item`` unchanged when it is not dropped.

    Raises:
        DropItem: when ``JobItem.is_older_required(item)`` is true.
    """
    if not JobItem.is_older_required(item):
        return item

    # The message used to read "order than"; corrected to "older than".
    retention_days = str(config.HOUSEKEEPING_RECORD_ORDLER_THAN)
    raise DropItem(
        'Job is published older than %s days. Removing...' % retention_days)
def should_load_details(self, job_item):
    """Tell whether the detail page of ``job_item`` should be requested.

    Checks, in order: ``JobItem.is_exists``, ``JobItem.is_older_required``,
    ``BlockedContact.is_contact_blocked`` and
    ``RejectionPattern.should_be_rejected``. The first check that hits logs
    its reason at info level and makes the method return ``False``; when no
    check hits the method returns ``True``.
    """
    def reject(template, *values):
        # Log the formatted skip reason and hand back the negative answer.
        logger.info(template % values)
        return False

    spider_name = self.name

    if JobItem.is_exists(job_item):
        return reject(
            '[%s] skipping loading details as job already exists. job_title: %s',
            spider_name, job_item.job_title)

    if JobItem.is_older_required(job_item):
        return reject(
            '[%s] skipping loading details as job is older than %s days. job_title: %s',
            spider_name, str(config.HOUSEKEEPING_RECORD_ORDLER_THAN), job_item.job_title)

    if BlockedContact.is_contact_blocked(job_item.contact):
        return reject(
            '[%s] skipping loading details as job contact is blocked. contact: %s',
            spider_name, job_item.contact)

    if RejectionPattern.should_be_rejected(job_item.job_title):
        return reject(
            '[%s] skipping loading details as job matches rejection pattern. job_title: %s',
            spider_name, job_item.job_title)

    return True
def run_emailer(cls):
    """Email the extracted job records to every subscribed user.

    Opens a STARTTLS session with the SMTP server configured in ``config``,
    builds one multipart message (a text body plus an ``xlsx`` attachment
    produced by ``JobItem.extract_records_as_bytes``) and sends it to all
    users whose ``subscription_status`` is ``'subscribed'``.

    The run is best-effort: any exception raised after the connection object
    is created is logged and swallowed. When there are no subscribers the
    method logs that fact and returns without building or sending anything.
    """
    from email.mime.base import MIMEBase
    from email.mime.multipart import MIMEMultipart
    from email.mime.text import MIMEText
    from email import Encoders
    import smtplib

    logger.info('start sending email to subscribers...')
    smtp = smtplib.SMTP(host=config.SMTP_HOST, port=config.SMTP_PORT)
    try:
        # NOTE(review): a non-zero debug level makes smtplib echo the whole
        # SMTP dialogue; consider lowering it outside of debugging.
        smtp.set_debuglevel(4)
        smtp.ehlo()
        smtp.starttls()
        smtp.ehlo()
        smtp.login(user=config.SMTP_USER, password=config.SMTP_PASSWORD)
        logger.info('established secure connection to smtp server...')

        toaddrs = [
            user.email for user in User.findall()
            if user.subscription_status == 'subscribed'
        ]
        if not toaddrs:
            # sendmail() raises SMTPRecipientsRefused for an empty recipient
            # list, so stop before generating the attachment.
            logger.info('no subscribed users found, nothing to send...')
            return

        fromaddr = config.FROM_ADDR
        current_date_string = datetime.datetime.now().strftime('%Y-%m-%d')
        message_subject = "%s:%s" % (config.APP_NAME, current_date_string)
        message_text = "Thank you for subscribing %s. Please find the newly posted jobs as of %s" % (
            config.APP_NAME, current_date_string)

        msg = MIMEMultipart()
        msg['From'] = fromaddr
        msg['To'] = ''
        # NOTE(review): every subscriber address is placed in the Cc header
        # and is therefore visible to all recipients -- confirm this is wanted.
        msg['Cc'] = ','.join(toaddrs)
        msg['Subject'] = message_subject
        msg.attach(MIMEText(message_text))

        file_format = 'xlsx'
        part = MIMEBase('application', "octet-stream")
        part.set_payload(JobItem.extract_records_as_bytes(file_format))
        logger.info('attached extracted files to the mail...waiting to be sent..')
        Encoders.encode_base64(part)
        part.add_header(
            'Content-Disposition',
            'attachment; filename="extracted_jobs_%s.%s"' % (current_date_string, file_format))
        msg.attach(part)

        smtp.sendmail(fromaddr, toaddrs, msg.as_string())
        logger.info('done sending email to subscribers...')
    except Exception as e:
        logger.error(e)
    finally:
        try:
            smtp.quit()
        except smtplib.SMTPServerDisconnected:
            # The connection is already gone (e.g. the handshake failed);
            # release the socket instead of letting QUIT raise out of here.
            smtp.close()
def setUp(self):
    """Call ``clean_db`` and build the ``JobItem`` fixture used by the tests."""
    self.clean_db()

    item = self.job_item = JobItem()
    item.job_title = "Test Job"
    item.employer_name = "Test Job Employer"
    # crawled_date and publish_date are not set here; their assignments
    # were already disabled (commented out).
    item.job_country = "Singapore"
    item.job_desc = "This is a test job"
    item.contact = "88888888"

    # NOTE(review): ``source`` is stored on the test case itself, not on the
    # job item -- confirm this is intended.
    self.source = "unit_test"
def run_emailer(cls):
    """Email the extracted job records to every subscribed user.

    Opens a STARTTLS session with the SMTP server configured in ``config``,
    builds one multipart message (a text body plus an ``xlsx`` attachment
    produced by ``JobItem.extract_records_as_bytes``) and sends it to all
    users whose ``subscription_status`` is ``'subscribed'``.

    The run is best-effort: any exception raised after the connection object
    is created is logged and swallowed. When there are no subscribers the
    method logs that fact and returns without building or sending anything.
    """
    from email.mime.base import MIMEBase
    from email.mime.multipart import MIMEMultipart
    from email.mime.text import MIMEText
    from email import Encoders
    import smtplib

    logger.info('start sending email to subscribers...')
    smtp = smtplib.SMTP(host=config.SMTP_HOST, port=config.SMTP_PORT)
    try:
        # NOTE(review): a non-zero debug level makes smtplib echo the whole
        # SMTP dialogue; consider lowering it outside of debugging.
        smtp.set_debuglevel(4)
        smtp.ehlo()
        smtp.starttls()
        smtp.ehlo()
        smtp.login(user=config.SMTP_USER, password=config.SMTP_PASSWORD)
        logger.info('established secure connection to smtp server...')

        toaddrs = [user.email for user in User.findall()
                   if user.subscription_status == 'subscribed']
        if not toaddrs:
            # sendmail() raises SMTPRecipientsRefused for an empty recipient
            # list, so stop before generating the attachment.
            logger.info('no subscribed users found, nothing to send...')
            return

        fromaddr = config.FROM_ADDR
        current_date_string = datetime.datetime.now().strftime('%Y-%m-%d')
        message_subject = "%s:%s" % (config.APP_NAME, current_date_string)
        message_text = "Thank you for subscribing %s. Please find the newly posted jobs as of %s" % (
            config.APP_NAME, current_date_string)

        msg = MIMEMultipart()
        msg['From'] = fromaddr
        msg['To'] = ''
        # NOTE(review): every subscriber address is placed in the Cc header
        # and is therefore visible to all recipients -- confirm this is wanted.
        msg['Cc'] = ','.join(toaddrs)
        msg['Subject'] = message_subject
        msg.attach(MIMEText(message_text))

        file_format = 'xlsx'
        part = MIMEBase('application', "octet-stream")
        part.set_payload(JobItem.extract_records_as_bytes(file_format))
        logger.info('attached extracted files to the mail...waiting to be sent..')
        Encoders.encode_base64(part)
        part.add_header('Content-Disposition',
                        'attachment; filename="extracted_jobs_%s.%s"' % (current_date_string, file_format))
        msg.attach(part)

        smtp.sendmail(fromaddr, toaddrs, msg.as_string())
        logger.info('done sending email to subscribers...')
    except Exception as e:
        logger.error(e)
    finally:
        try:
            smtp.quit()
        except smtplib.SMTPServerDisconnected:
            # The connection is already gone (e.g. the handshake failed);
            # release the socket instead of letting QUIT raise out of here.
            smtp.close()
def test_find_all(self): self.job_item.save() another_job_item = JobItem() another_job_item.job_title = 'Another Test Job' another_job_item.save() records = JobItem.findall() print 'Job Items', records self.assertEqual(2, len(records))
def parse_item_requests_callback(self, response, item_xpath_selector=''):
    """Build detail-page requests for the nodes selected from ``response``.

    Each node matched by ``item_xpath_selector`` is turned into a fresh
    ``JobItem`` via ``populate_job_crawler_item``. Items accepted by
    ``should_load_details`` yield a ``Request`` for their
    ``job_details_link`` with ``retrieve_job_details`` as callback and the
    item passed along in ``meta['item']``.

    Returns the list of requests, in document order of the matched nodes.
    """
    detail_requests = []
    for node in response.xpath(item_xpath_selector):
        candidate = JobItem()
        self.populate_job_crawler_item(node, candidate)

        if not self.should_load_details(candidate):
            continue

        detail_request = Request(
            url=candidate.job_details_link,
            callback=self.retrieve_job_details,
            meta={'item': candidate},
            dont_filter=True)
        detail_requests.append(detail_request)
    return detail_requests
def test_find_with_pagination(self): for i in range(0, 20): job_item = JobItem() job_item.job_title='job_item_%d' % i job_item.save() records = JobItem.find_with_pagination(page_request={'page_no': 2, 'size': 10}) print 'Job Items', records self.assertEqual(10, len(records))
def parse_item(self, response):
    """Build detail-page requests from the ``<tr>`` rows of ``response``.

    Every ``<td>`` of a row is fed, together with its index, to
    ``populate_job_crawler_item``. Once the cell with index 4 has been
    processed, the item is offered to ``should_load_details``; accepted items
    yield a ``Request`` for their ``job_details_link``.

    Returns the list of collected requests.
    """
    # NOTE(review): the index-4 check is assumed to run inside the per-cell
    # loop -- confirm against the original layout.
    trigger_column = 4
    detail_requests = []
    for row in response.xpath('//tr'):
        candidate = JobItem()
        for column, cell in enumerate(row.xpath('./td')):
            self.populate_job_crawler_item(column, cell, candidate)
            if column == trigger_column and self.should_load_details(candidate):
                detail_request = Request(
                    url=candidate.job_details_link,
                    callback=self.retrieve_job_details,
                    meta={'item': candidate},
                    dont_filter=True)
                detail_requests.append(detail_request)
    return detail_requests
def test_remove_blocked_records(self):
    # Every saved job item gets its (random) contact blocked, so
    # JobItem.remove_blocked_records() is expected to leave the table empty.
    for i in range(0, 20):
        job_item = JobItem()
        job_item.job_title = u'人员_%d' % i
        job_item.contact = str(random.randint(90000000, 99999999))
        job_item.save()

        # mark the contact as blocked
        BlockedContact(job_item.contact, u'人员').save()

    # run the remove action
    JobItem.remove_blocked_records()

    conn = self.connect_db()
    try:
        c = conn.cursor()
        c.execute('SELECT COUNT(*) FROM ' + JobItem.table_name)
        remaining = c.fetchone()[0]
    finally:
        conn.close()

    # The assertion used to sit under a bare ``except: pass`` which swallowed
    # AssertionError, so the test could never fail. It now runs unguarded.
    self.assertEqual(remaining, 0, 'Count of job items should be 0')
def run_housekeeper(cls):
    """Run the three clean-up passes over stored job items, logging each step.

    In order: ``JobItem.remove_old_records`` (with the retention period taken
    from ``config.HOUSEKEEPING_RECORD_ORDLER_THAN``),
    ``JobItem.remove_blocked_records`` and
    ``JobItem.remove_records_matches_rejection_pattern``.
    """
    logger.info('start running housekeeper..')

    # Pass 1: records beyond the configured retention period.
    retention_days = config.HOUSEKEEPING_RECORD_ORDLER_THAN
    logger.info('start removing records older than %s days..' % retention_days)
    JobItem.remove_old_records(retention_days=retention_days)
    logger.info('done removing records older than %s days..' % retention_days)

    # Pass 2: records posted by blocked contacts.
    logger.info('start removing records posted by blocked contacts..')
    JobItem.remove_blocked_records()
    logger.info('done removing records posted by blocked contacts..')

    # Pass 3: records matching a rejection pattern.
    logger.info('start removing records should have been rejected..')
    JobItem.remove_records_matches_rejection_pattern()
    logger.info('done removing records should have been rejected..')

    logger.info('done running housekeeper..')
def test_remove_records_matches_rejection_pattern(self):
    # All 20 titles match the rejection pattern saved below, so
    # JobItem.remove_records_matches_rejection_pattern() is expected to
    # leave the table empty.
    for i in range(0, 20):
        job_item = JobItem()
        job_item.job_title = u'人员_%d' % i
        job_item.save()

    # mark the title as blocked (the pattern is the same for every item, so
    # it is stored once; the backslash is escaped explicitly, same value)
    RejectionPattern(u'人员_\\d+', 'For Testing').save()

    # run the remove action
    JobItem.remove_records_matches_rejection_pattern()

    conn = self.connect_db()
    try:
        c = conn.cursor()
        c.execute('SELECT COUNT(*) FROM ' + JobItem.table_name)
        remaining = c.fetchone()[0]
    finally:
        conn.close()

    # The assertion used to sit under a bare ``except: pass`` which swallowed
    # AssertionError, so the test could never fail. It now runs unguarded.
    self.assertEqual(remaining, 0, 'Count of job items should be 0')
def run_housekeeper(cls):
    """Run the three clean-up passes over stored job items, logging each step.

    In order: ``JobItem.remove_old_records`` (with the retention period taken
    from ``config.HOUSEKEEPING_RECORD_ORDLER_THAN``),
    ``JobItem.remove_blocked_records`` and
    ``JobItem.remove_records_matches_rejection_pattern``.
    """
    log = logger.info
    log('start running housekeeper..')

    # Outdated records.
    max_age_days = config.HOUSEKEEPING_RECORD_ORDLER_THAN
    log('start removing records older than %s days..' % max_age_days)
    JobItem.remove_old_records(retention_days=max_age_days)
    log('done removing records older than %s days..' % max_age_days)

    # Records from blocked contacts.
    log('start removing records posted by blocked contacts..')
    JobItem.remove_blocked_records()
    log('done removing records posted by blocked contacts..')

    # Records matching a rejection pattern.
    log('start removing records should have been rejected..')
    JobItem.remove_records_matches_rejection_pattern()
    log('done removing records should have been rejected..')

    log('done running housekeeper..')
def test_remove_blocked_records(self):
    # Every saved job item gets its (random) contact blocked, so
    # JobItem.remove_blocked_records() is expected to leave the table empty.
    for i in range(0, 20):
        job_item = JobItem()
        job_item.job_title = u"人员_%d" % i
        job_item.contact = str(random.randint(90000000, 99999999))
        job_item.save()

        # mark the contact as blocked
        BlockedContact(job_item.contact, u"人员").save()

    # run the remove action
    JobItem.remove_blocked_records()

    conn = self.connect_db()
    try:
        c = conn.cursor()
        c.execute("SELECT COUNT(*) FROM " + JobItem.table_name)
        remaining = c.fetchone()[0]
    finally:
        conn.close()

    # The assertion used to sit under a bare ``except: pass`` which swallowed
    # AssertionError, so the test could never fail. It now runs unguarded.
    self.assertEqual(remaining, 0, "Count of job items should be 0")
def test_remove_records_matches_rejection_pattern(self):
    # All 20 titles match the rejection pattern saved below, so
    # JobItem.remove_records_matches_rejection_pattern() is expected to
    # leave the table empty.
    for i in range(0, 20):
        job_item = JobItem()
        job_item.job_title = u"人员_%d" % i
        job_item.save()

    # mark the title as blocked (the pattern is the same for every item, so
    # it is stored once; the backslash is escaped explicitly, same value)
    RejectionPattern(u"人员_\\d+", "For Testing").save()

    # run the remove action
    JobItem.remove_records_matches_rejection_pattern()

    conn = self.connect_db()
    try:
        c = conn.cursor()
        c.execute("SELECT COUNT(*) FROM " + JobItem.table_name)
        remaining = c.fetchone()[0]
    finally:
        conn.close()

    # The assertion used to sit under a bare ``except: pass`` which swallowed
    # AssertionError, so the test could never fail. It now runs unguarded.
    self.assertEqual(remaining, 0, "Count of job items should be 0")
def test_find(self): self.job_item.save() print JobItem.find(self.job_item)
def process_item(self, item, spider):
    """Pass ``item`` through unless ``JobItem.is_exists`` reports it.

    ``spider`` is not used. Raises ``DropItem`` for items that already
    exist; otherwise returns ``item`` unchanged.
    """
    if not JobItem.is_exists(item):
        return item
    raise DropItem('Duplicated Job title. Removing...')
def test_is_exists(self):
    # After saving, JobItem.is_exists must report the fixture as existing.
    fixture = self.job_item
    fixture.save()

    exists = JobItem.is_exists(fixture)
    failure_message = '%s should exist' % fixture.job_title
    self.assertTrue(exists, failure_message)
class JobItemTest(BaseTestCase): def setUp(self): self.clean_db() self.job_item = JobItem() self.job_item.job_title = "Test Job" self.job_item.employer_name = "Test Job Employer" # self.job_item.crawled_date = datetime.datetime.now() # self.job_item.publish_date = datetime.datetime.strptime('2014-10-31', '%Y-%m-%d') self.job_item.job_country = "Singapore" self.job_item.job_desc = "This is a test job" self.job_item.contact = "88888888" self.source = "unit_test" def tearDown(self): pass def test_save(self): self.job_item.save() conn = self.connect_db() try: c = conn.cursor() c.execute("SELECT COUNT(*) FROM " + JobItem.table_name) self.assertEqual(c.fetchone()[0], 1, "Count of job items should be 1") except: pass finally: conn.close() def test_find_all(self): self.job_item.save() another_job_item = JobItem() another_job_item.job_title = "Another Test Job" another_job_item.save() records = JobItem.findall() print "Job Items", records self.assertEqual(2, len(records)) def test_find(self): self.job_item.save() print JobItem.find(self.job_item) def test_find_with_pagination(self): for i in range(0, 20): job_item = JobItem() job_item.job_title = "job_item_%d" % i job_item.save() records = JobItem.find_with_pagination(page_request={"page_no": 2, "size": 10}) print "Job Items", records self.assertEqual(10, len(records)) def test_iter_listOfTuple(self): list_of_tuples = [("key", "value"), ("key1", "value1")] print [key + " " + value for (key, value) in list_of_tuples] def test_is_exists(self): self.job_item.save() self.assertTrue(JobItem.is_exists(self.job_item), "%s should exist" % self.job_item.job_title) def test_remove_blocked_records(self): for i in range(0, 20): job_item = JobItem() job_item.job_title = u"人员_%d" % i job_item.contact = str(random.randint(90000000, 99999999)) job_item.save() # mark the contact as blocked BlockedContact(job_item.contact, u"人员").save() # run the remove action JobItem.remove_blocked_records() conn = self.connect_db() try: c = conn.cursor() 
c.execute("SELECT COUNT(*) FROM " + JobItem.table_name) self.assertEqual(c.fetchone()[0], 0, "Count of job items should be 0") except: pass finally: conn.close() def test_remove_records_matches_rejection_pattern(self): for i in range(0, 20): job_item = JobItem() job_item.job_title = u"人员_%d" % i job_item.save() # mark the title as blocked RejectionPattern(u"人员_\d+", "For Testing").save() # run the remove action JobItem.remove_records_matches_rejection_pattern() conn = self.connect_db() try: c = conn.cursor() c.execute("SELECT COUNT(*) FROM " + JobItem.table_name) self.assertEqual(c.fetchone()[0], 0, "Count of job items should be 0") except: pass finally: conn.close()
def test_is_exists(self):
    # After saving, JobItem.is_exists must report the fixture as existing.
    saved_item = self.job_item
    saved_item.save()

    is_present = JobItem.is_exists(saved_item)
    message = "%s should exist" % saved_item.job_title
    self.assertTrue(is_present, message)
class JobItemTest(BaseTestCase): def setUp(self): self.clean_db() self.job_item = JobItem() self.job_item.job_title="Test Job" self.job_item.employer_name="Test Job Employer" # self.job_item.crawled_date = datetime.datetime.now() # self.job_item.publish_date = datetime.datetime.strptime('2014-10-31', '%Y-%m-%d') self.job_item.job_country = "Singapore" self.job_item.job_desc = "This is a test job" self.job_item.contact = "88888888" self.source = 'unit_test' def tearDown(self): pass def test_save(self): self.job_item.save() conn = self.connect_db() try: c = conn.cursor() c.execute('SELECT COUNT(*) FROM ' + JobItem.table_name) self.assertEqual(c.fetchone()[0], 1, 'Count of job items should be 1') except: pass finally: conn.close() def test_find_all(self): self.job_item.save() another_job_item = JobItem() another_job_item.job_title = 'Another Test Job' another_job_item.save() records = JobItem.findall() print 'Job Items', records self.assertEqual(2, len(records)) def test_find(self): self.job_item.save() print JobItem.find(self.job_item) def test_find_with_pagination(self): for i in range(0, 20): job_item = JobItem() job_item.job_title='job_item_%d' % i job_item.save() records = JobItem.find_with_pagination(page_request={'page_no': 2, 'size': 10}) print 'Job Items', records self.assertEqual(10, len(records)) def test_iter_listOfTuple(self): list_of_tuples = [('key', 'value'), ('key1', 'value1')] print [key + ' ' + value for (key, value) in list_of_tuples] def test_is_exists(self): self.job_item.save() self.assertTrue(JobItem.is_exists(self.job_item), '%s should exist' % self.job_item.job_title) def test_remove_blocked_records(self): for i in range(0, 20): job_item = JobItem() job_item.job_title=u'人员_%d' % i job_item.contact = str(random.randint(90000000, 99999999)) job_item.save() # mark the contact as blocked BlockedContact(job_item.contact, u'人员').save() # run the remove action JobItem.remove_blocked_records() conn = self.connect_db() try: c = conn.cursor() 
c.execute('SELECT COUNT(*) FROM ' + JobItem.table_name) self.assertEqual(c.fetchone()[0], 0, 'Count of job items should be 0') except: pass finally: conn.close() def test_remove_records_matches_rejection_pattern(self): for i in range(0, 20): job_item = JobItem() job_item.job_title=u'人员_%d' % i job_item.save() # mark the title as blocked RejectionPattern(u'人员_\d+', 'For Testing').save() # run the remove action JobItem.remove_records_matches_rejection_pattern() conn = self.connect_db() try: c = conn.cursor() c.execute('SELECT COUNT(*) FROM ' + JobItem.table_name) self.assertEqual(c.fetchone()[0], 0, 'Count of job items should be 0') except: pass finally: conn.close()