Exemplo n.º 1
0
 def remove_blocked_records(cls):
     """Delete every persisted record whose contact is blocked, then log the count."""
     blocked = [record for record in cls.findall()
                if BlockedContact.is_contact_blocked(record.contact)]
     for record in blocked:
         record.remove()
     logger.info('cleared %d job items with blocked contacts' % len(blocked))
Exemplo n.º 2
0
 def remove_records_matches_rejection_pattern(cls):
     """Delete every record whose title or description matches a rejection pattern."""
     count = 0
     for record in cls.findall():
         rejected = (RejectionPattern.should_be_rejected(record.job_title)
                     or RejectionPattern.should_be_rejected(record.job_desc))
         if rejected:
             record.remove()
             count += 1
     logger.info('cleared %d job items matching the rejection pattern' % count)
Exemplo n.º 3
0
def logs_socket(ws):
    """Stream recent application log lines to a websocket client.

    Blocks until the client sends an opening message, then every 5 seconds
    pushes any LOG_FILE lines timestamped within the last 5 seconds.
    Loops forever; terminates only when the socket layer raises.
    """
    ws.receive()  # wait for the client's opening message before streaming
    while True:
        logger.info('server received from client: ' +
                    datetime.datetime.now().isoformat())
        # awk filters LOG_FILE down to lines newer than (now - 5 seconds)
        cmd = 'awk -v Time="`date -d\'now-5 seconds\' \'+[%Y-%m-%d %H:%M:%S\'`" \'{if($0 > Time) print $0}\' ' + Config.LOG_FILE
        output = os.popen(cmd).readlines()

        # plain if-statement: the original used a conditional *expression*
        # purely for its side effect, which is an anti-idiom
        if output:
            ws.send('\n'.join(output))
        time.sleep(5)
Exemplo n.º 4
0
    def run_crawler(cls):
        """Run every configured spider sequentially and log the elapsed time.

        Spiders run one at a time; a multiprocessing Pool variant was tried
        and abandoned here, so the work is intentionally serial.
        """
        start_time = time.time()
        logger.info('start running crawler..')

        spider_names = ['sgxin', 'shichengbbs', 'singxin', 'sggongzuo']

        for spider_name in spider_names:
            # call the runner instance directly rather than spelling out __call__
            CrawlerRunner()(spider_name)

        logger.info('done running crawler.. Time elapsed: %.3fs' % (time.time() - start_time))
Exemplo n.º 5
0
    def run_crawler(cls):
        """Run every configured spider sequentially and log the elapsed time.

        Spiders run one at a time; a multiprocessing Pool variant was tried
        and abandoned here, so the work is intentionally serial.
        """
        start_time = time.time()
        logger.info('start running crawler..')

        spider_names = ['sgxin', 'shichengbbs', 'singxin', 'sggongzuo']

        for spider_name in spider_names:
            # call the runner instance directly rather than spelling out __call__
            CrawlerRunner()(spider_name)

        logger.info('done running crawler.. Time elapsed: %.3fs' %
                    (time.time() - start_time))
Exemplo n.º 6
0
def logs_socket(ws):
    """Stream recent application log lines to a websocket client.

    Blocks until the client sends an opening message, then every 5 seconds
    pushes any LOG_FILE lines timestamped within the last 5 seconds.
    Loops forever; terminates only when the socket layer raises.
    """
    ws.receive()  # wait for the client's opening message before streaming
    while True:
        logger.info("server received from client: " + datetime.datetime.now().isoformat())
        # awk filters LOG_FILE down to lines newer than (now - 5 seconds)
        cmd = (
            "awk -v Time=\"`date -d'now-5 seconds' '+[%Y-%m-%d %H:%M:%S'`\" '{if($0 > Time) print $0}' "
            + Config.LOG_FILE
        )
        output = os.popen(cmd).readlines()

        # plain if-statement: the original used a conditional *expression*
        # purely for its side effect, which is an anti-idiom
        if output:
            ws.send("\n".join(output))
        time.sleep(5)
Exemplo n.º 7
0
    def should_load_details(self, job_item):
        """Return False (logging the reason) when job_item should be skipped, else True."""
        skip_rules = (
            (lambda: JobItem.is_exists(job_item),
             '[%s] skipping loading details as job already exists. job_title: %s'
             % (self.name, job_item.job_title)),
            (lambda: JobItem.is_older_required(job_item),
             '[%s] skipping loading details as job is older than %s days. job_title: %s'
             % (self.name, str(config.HOUSEKEEPING_RECORD_ORDLER_THAN),
                job_item.job_title)),
            (lambda: BlockedContact.is_contact_blocked(job_item.contact),
             '[%s] skipping loading details as job contact is blocked. contact: %s'
             % (self.name, job_item.contact)),
            (lambda: RejectionPattern.should_be_rejected(job_item.job_title),
             '[%s] skipping loading details as job matches rejection pattern. job_title: %s'
             % (self.name, job_item.job_title)),
        )
        # rules are wrapped in lambdas so they evaluate lazily and in order,
        # preserving the original short-circuit behaviour
        for rule, skip_message in skip_rules:
            if rule():
                logger.info(skip_message)
                return False
        return True
Exemplo n.º 8
0
 def remove(self):
     """Delete this record's row, matching on every key property.

     Raises DatabaseError (after rolling back) when the delete fails;
     the connection is always closed.
     """
     conn = self.connect_db()
     try:
         where_clause = ' AND '.join(['%s=?' % property for property in self.key_properties])
         key_values = tuple([getattr(self, property) for property in self.key_properties])
         cursor = conn.cursor()
         cursor.execute('DELETE FROM ' + self.table_name + ' WHERE ' + where_clause, key_values)
         conn.commit()
         logger.info('Removed: %s' % self)
     except Exception as e:
         logger.error(e)
         logger.info('Unable to remove: %s' % self)
         conn.rollback()
         raise DatabaseError(str(e))
     finally:
         conn.close()
Exemplo n.º 9
0
def import_records_from_file(item_desc):
    """Replace every record of the class mapped to item_desc with rows from an uploaded CSV.

    Reads the 'file_to_upload' part of the request (header row ignored, two
    columns per line), wipes the existing records first, then redirects to
    'redirect_url' (or the index page).
    """
    cls = desc_to_cls_mapping.get(item_desc)
    upload = request.files["file_to_upload"]  # renamed: 'file' shadowed the builtin
    redirect_url = request.form.get("redirect_url", url_for("index"))
    # wipe the existing records before importing the replacement set
    for record in cls.findall():
        record.remove()
    logger.info("Done removing all existing %s" % item_desc)

    count = 0
    upload.readline()  # skip the CSV header row
    for line in upload.readlines():
        columns = line.rstrip("\r\n").rstrip("\n").decode("utf-8").split(",")
        cls(columns[0], columns[1]).save()
        count += 1
    logger.info("Done importing %d %s from %s" % (count, item_desc, upload.filename))
    return redirect(redirect_url)
Exemplo n.º 10
0
    def run_emailer(cls):
        """Email the day's extracted jobs (xlsx attachment) to all subscribed users.

        NOTE: Python 2 only -- relies on the `print` statement and the legacy
        `email.Encoders` module. Any failure is logged and swallowed; the SMTP
        session is always closed.
        """
        from email.mime.base import MIMEBase
        from email.mime.multipart import MIMEMultipart
        from email.mime.text import MIMEText
        from email import Encoders
        import smtplib

        logger.info('start sending email to subscribers...')
        smtp = smtplib.SMTP(host=config.SMTP_HOST, port=config.SMTP_PORT)

        try:
            # verbose wire-level debugging; TLS is negotiated before login
            smtp.set_debuglevel(4)
            smtp.ehlo()
            smtp.starttls()
            smtp.ehlo()
            smtp.login(user=config.SMTP_USER, password=config.SMTP_PASSWORD)

            logger.info('established secure connection to smtp server...')

            # recipients: every user whose subscription_status is 'subscribed'
            toaddrs = [
                user.email for user in User.findall()
                if user.subscription_status == 'subscribed'
            ]
            print toaddrs
            fromaddr = config.FROM_ADDR

            current_date_string = datetime.datetime.now().strftime('%Y-%m-%d')
            message_subject = "%s:%s" % (config.APP_NAME, current_date_string)
            message_text = "Thank you for subscribing %s. Please find the newly posted jobs as of %s" % (
                config.APP_NAME, current_date_string)

            # recipients go in Cc; the To header is left blank deliberately
            msg = MIMEMultipart()
            msg['From'] = fromaddr
            msg['To'] = ''
            msg['Cc'] = ','.join(toaddrs)
            msg['Subject'] = message_subject
            msg.attach(MIMEText(message_text))

            # attach the extracted jobs as a base64-encoded xlsx payload
            part = MIMEBase('application', "octet-stream")
            file_format = 'xlsx'
            part.set_payload(JobItem.extract_records_as_bytes(file_format))
            logger.info(
                'attached extracted files to the mail...waiting to be sent..')
            Encoders.encode_base64(part)
            part.add_header(
                'Content-Disposition',
                'attachment; filename="extracted_jobs_%s.%s"' %
                (current_date_string, file_format))
            msg.attach(part)

            smtp.sendmail(fromaddr, toaddrs, msg.as_string())
            logger.info('done sending email to subscribers...')
        except Exception as e:
            # best-effort delivery: failures are logged, never re-raised
            logger.error(e)
        finally:
            smtp.quit()
Exemplo n.º 11
0
def import_records_from_file(item_desc):
    """Replace every record of the class mapped to item_desc with rows from an uploaded CSV.

    Reads the 'file_to_upload' part of the request (header row ignored, two
    columns per line), wipes the existing records first, then redirects to
    'redirect_url' (or the index page).
    """
    cls = desc_to_cls_mapping.get(item_desc)
    upload = request.files['file_to_upload']  # renamed: 'file' shadowed the builtin
    redirect_url = request.form.get('redirect_url', url_for('index'))
    # wipe the existing records before importing the replacement set
    for record in cls.findall():
        record.remove()
    logger.info('Done removing all existing %s' % item_desc)

    count = 0
    upload.readline()  # skip the CSV header row
    for line in upload.readlines():
        columns = line.rstrip('\r\n').rstrip('\n').decode('utf-8').split(',')
        cls(columns[0], columns[1]).save()
        count += 1
    logger.info('Done importing %d %s from %s' %
                (count, item_desc, upload.filename))
    return redirect(redirect_url)
Exemplo n.º 12
0
 def update(self):
     """Overwrite this record's row, matching on every key property.

     Raises DatabaseError (after rolling back) when the update fails;
     the connection is always closed.
     """
     conn = self.connect_db()
     try:
         set_clause = ', '.join(['%s=?' % property for property in self.property_names])
         where_clause = ' AND '.join(['%s=?' % property for property in self.key_properties])
         new_values = [getattr(self, property) for property in self.property_names]
         key_values = [getattr(self, property) for property in self.key_properties]
         cursor = conn.cursor()
         cursor.execute(' UPDATE ' + self.table_name + ' SET ' + set_clause + ' WHERE ' + where_clause,
                        tuple(new_values + key_values))
         conn.commit()
         logger.info('Updated: %s' % self)
     except Exception as e:
         logger.error(e)
         logger.info('Unable to update: %s' % self)
         conn.rollback()
         raise DatabaseError(str(e))
     finally:
         conn.close()
Exemplo n.º 13
0
    def migrate_db(cls):
        """
        place holder for putting the migrate db scripts -- need to be updated before every release
        :return:
        """

        cls.create_db()
        conn = cls.datasource.get_connection()
        try:
            logger.info('start migrating database')
            # seed the default admin account; User.save() handles its own persistence
            User('meng', 'meng123', '*****@*****.**', 'admin').save()
            logger.info('done migrating database')
        except Exception as e:
            # best-effort migration: failures are logged but never re-raised
            logger.error('Unable to run migrate_db')
            logger.error(e)

        finally:
            # NOTE(review): conn is never used for the migration work itself --
            # presumably acquired only so it can be closed; confirm it is needed
            conn.close()
Exemplo n.º 14
0
    def migrate_db(cls):
        """
        place holder for putting the migrate db scripts -- need to be updated before every release
        :return:
        """

        cls.create_db()
        conn = cls.datasource.get_connection()
        try:
            logger.info('start migrating database')
            # seed the default admin account; User.save() handles its own persistence
            User('meng', 'meng123', '*****@*****.**', 'admin').save()
            logger.info('done migrating database')
        except Exception as e:
            # best-effort migration: failures are logged but never re-raised
            logger.error('Unable to run migrate_db')
            logger.error(e)

        finally:
            # NOTE(review): conn is never used for the migration work itself --
            # presumably acquired only so it can be closed; confirm it is needed
            conn.close()
Exemplo n.º 15
0
    def run_emailer(cls):
        """Email the day's extracted jobs (xlsx attachment) to all subscribed users.

        NOTE: Python 2 only -- relies on the `print` statement and the legacy
        `email.Encoders` module. Any failure is logged and swallowed; the SMTP
        session is always closed.
        """
        from email.mime.base import MIMEBase
        from email.mime.multipart import MIMEMultipart
        from email.mime.text import MIMEText
        from email import Encoders
        import smtplib

        logger.info('start sending email to subscribers...')
        smtp = smtplib.SMTP(host=config.SMTP_HOST, port=config.SMTP_PORT)

        try:
            # verbose wire-level debugging; TLS is negotiated before login
            smtp.set_debuglevel(4)
            smtp.ehlo()
            smtp.starttls()
            smtp.ehlo()
            smtp.login(user=config.SMTP_USER, password=config.SMTP_PASSWORD)

            logger.info('established secure connection to smtp server...')

            # recipients: every user whose subscription_status is 'subscribed'
            toaddrs = [user.email for user in User.findall() if user.subscription_status == 'subscribed']
            print toaddrs
            fromaddr = config.FROM_ADDR

            current_date_string = datetime.datetime.now().strftime('%Y-%m-%d')
            message_subject = "%s:%s" % (config.APP_NAME, current_date_string)
            message_text = "Thank you for subscribing %s. Please find the newly posted jobs as of %s" % (
                config.APP_NAME, current_date_string)

            # recipients go in Cc; the To header is left blank deliberately
            msg = MIMEMultipart()
            msg['From'] = fromaddr
            msg['To'] = ''
            msg['Cc'] = ','.join(toaddrs)
            msg['Subject'] = message_subject
            msg.attach(MIMEText(message_text))

            # attach the extracted jobs as a base64-encoded xlsx payload
            part = MIMEBase('application', "octet-stream")
            file_format = 'xlsx'
            part.set_payload(JobItem.extract_records_as_bytes(file_format))
            logger.info('attached extracted files to the mail...waiting to be sent..')
            Encoders.encode_base64(part)
            part.add_header('Content-Disposition',
                            'attachment; filename="extracted_jobs_%s.%s"' % (current_date_string, file_format))
            msg.attach(part)

            smtp.sendmail(fromaddr, toaddrs, msg.as_string())
            logger.info('done sending email to subscribers...')
        except Exception as e:
            # best-effort delivery: failures are logged, never re-raised
            logger.error(e)
        finally:
            smtp.quit()
Exemplo n.º 16
0
    def run_heartbeater(cls):
        """Ping the app's heartbeat URL and log the response status."""
        import requests

        logger.info('started heartbeating..')
        # a timeout keeps the scheduler from hanging forever on a dead endpoint
        # (requests.get has no default timeout)
        resp = requests.get(config.APP_HEARTBEAT_URL,
                            headers={'User-Agent': 'Zjobs Heartbeater'},
                            timeout=30)
        logger.info('heartbeater received status_code %s', resp.status_code)
        logger.info('done heartbeating')
Exemplo n.º 17
0
    def run_heartbeater(cls):
        """Ping the app's heartbeat URL and log the response status."""
        import requests

        logger.info('started heartbeating..')
        # a timeout keeps the scheduler from hanging forever on a dead endpoint
        # (requests.get has no default timeout)
        resp = requests.get(config.APP_HEARTBEAT_URL,
                            headers={'User-Agent': 'Zjobs Heartbeater'},
                            timeout=30)
        logger.info('heartbeater received status_code %s', resp.status_code)
        logger.info('done heartbeating')
Exemplo n.º 18
0
 def save(self):
     """Insert this record, or delegate to update() when it already exists.

     Insert failures are logged and swallowed (best effort); nothing is raised.
     """
     if not self:
         return
     if self.find(self) is not None:
         # an existing row is updated in place rather than re-inserted
         self.update()
         return
     conn = self.connect_db()
     try:
         columns = ', '.join(self.property_names)
         placeholders = ', '.join(['?'] * len(self.property_names))
         values = tuple([getattr(self, property_name) for property_name in self.property_names])
         cursor = conn.cursor()
         cursor.execute('INSERT INTO ' + self.table_name + '(' + columns + ') ' +
                        'VALUES (' + placeholders + ')', values)
         conn.commit()
         logger.info('Inserted item: %s' % self)
     except Exception as e:
         conn.rollback()
         logger.error('Unable to insert the item: %s' % self)
         logger.error(e)
     finally:
         conn.close()
Exemplo n.º 19
0
    def should_load_details(self, job_item):
        """Return False (logging the reason) when job_item should be skipped, else True."""
        skip_rules = (
            (lambda: JobItem.is_exists(job_item),
             '[%s] skipping loading details as job already exists. job_title: %s' % (self.name, job_item.job_title)),
            (lambda: JobItem.is_older_required(job_item),
             '[%s] skipping loading details as job is older than %s days. job_title: %s' % (self.name, str(config.HOUSEKEEPING_RECORD_ORDLER_THAN), job_item.job_title)),
            (lambda: BlockedContact.is_contact_blocked(job_item.contact),
             '[%s] skipping loading details as job contact is blocked. contact: %s' % (self.name, job_item.contact)),
            (lambda: RejectionPattern.should_be_rejected(job_item.job_title),
             '[%s] skipping loading details as job matches rejection pattern. job_title: %s' % (self.name, job_item.job_title)),
        )
        # rules are wrapped in lambdas so they evaluate lazily and in order,
        # preserving the original short-circuit behaviour
        for rule, skip_message in skip_rules:
            if rule():
                logger.info(skip_message)
                return False
        return True
Exemplo n.º 20
0
 def run_web(cls):
     """Launch the web frontend under gunicorn from the app home directory."""
     logger.info('starting web..')
     os.system('cd %s && gunicorn -c app/gunicorn.conf.py web.jobboard:app --debug' % app_home_dir)
Exemplo n.º 21
0
 def _crawl(cls, spider_name=None):
     """Kick off a scrapy crawl for spider_name; no-op when no name is given."""
     if not spider_name:
         return None
     os.popen('cd %s && scrapy crawl %s' % (app_home_dir, spider_name))
     logger.info('Done running spider %s' % spider_name)
     return None
Exemplo n.º 22
0
 def run_web(cls):
     """Launch the web frontend under gunicorn from the app home directory."""
     logger.info('starting web..')
     command = 'cd ' + app_home_dir + ' && gunicorn -c app/gunicorn.conf.py web.jobboard:app --debug'
     os.system(command)
Exemplo n.º 23
0
    def run_housekeeper(cls):
        """Purge old, blocked and rejected job records, logging each phase."""
        retention_days = config.HOUSEKEEPING_RECORD_ORDLER_THAN
        logger.info('start running housekeeper..')
        logger.info('start removing records older than %s days..' % retention_days)
        JobItem.remove_old_records(retention_days=retention_days)
        logger.info('done removing records older than %s days..' % retention_days)

        logger.info('start removing records posted by blocked contacts..')
        JobItem.remove_blocked_records()
        logger.info('done removing records posted by blocked contacts..')

        logger.info('start removing records should have been rejected..')
        JobItem.remove_records_matches_rejection_pattern()
        logger.info('done removing records should have been rejected..')

        logger.info('done running housekeeper..')
Exemplo n.º 24
0
    def create_db(cls):
        """(Re)create all application tables and their unique indexes.

        WARNING: drops every existing table first, so all stored data is lost.
        Rolls back and logs on any failure; always closes the connection.
        """
        conn = cls.datasource.get_connection()
        try:
            c = conn.cursor()

            # -- CRAWLED_JOBS: one row per crawled posting, unique by job_title
            c.execute('DROP TABLE IF EXISTS CRAWLED_JOBS')
            c.execute('DROP INDEX IF EXISTS job_title_idx')

            c.execute('''
                CREATE TABLE IF NOT EXISTS CRAWLED_JOBS(
                    source            text,
                    crawled_date      date,
                    publish_date      date,
                    job_title         text,
                    job_desc          text,
                    job_details_link  text,
                    job_location      text,
                    job_country       text,
                    salary            text,
                    employer_name     text,
                    contact           text
                )
                ''')

            c.execute('''
                CREATE UNIQUE INDEX job_title_idx ON CRAWLED_JOBS(job_title)
            ''')

            logger.info("created table and indexes for CRAWLED_JOBS")

            # -- JOB_REJECTION_RULES: patterns for filtering out unwanted jobs
            c.execute('DROP TABLE IF EXISTS JOB_REJECTION_RULES')
            c.execute('DROP INDEX IF EXISTS reject_pattern_idx')

            c.execute('''
                CREATE TABLE IF NOT EXISTS JOB_REJECTION_RULES(
                    reject_pattern    text,
                    reject_reason     text
                )
                ''')

            c.execute('''
                CREATE UNIQUE INDEX reject_pattern_idx ON JOB_REJECTION_RULES(reject_pattern)
            ''')

            logger.info("created table and indexes for JOB_REJECTION_RULES")

            # -- BLOCKED_CONTACTS: contacts whose postings are filtered out
            c.execute('DROP TABLE IF EXISTS BLOCKED_CONTACTS')
            c.execute('DROP INDEX IF EXISTS blocked_contacts_idx')

            c.execute('''
                CREATE TABLE IF NOT EXISTS BLOCKED_CONTACTS(
                    contact      text,
                    block_reason text
                )
                ''')

            c.execute('''
                CREATE UNIQUE INDEX blocked_contacts_idx ON BLOCKED_CONTACTS(contact)
            ''')

            logger.info("created table and indexes for BLOCKED_CONTACTS")

            # -- USERS: application accounts, unique by username
            c.execute('DROP TABLE IF EXISTS USERS')
            c.execute('DROP INDEX IF EXISTS users_idx')

            c.execute('''
                CREATE TABLE IF NOT EXISTS USERS(
                    username            text,
                    password            text,
                    email               text,
                    subscription_status text,
                    role           text,
                    last_login_date     date,
                    register_date       date
                )
                ''')

            c.execute('''
                CREATE UNIQUE INDEX users_idx ON USERS(username)
            ''')

            logger.info("created table and indexes for USERS")

            # -- DOCS: uploaded files stored as binary blobs, unique by filename
            c.execute('DROP TABLE IF EXISTS DOCS')
            c.execute('DROP INDEX IF EXISTS docs_idx')

            c.execute('''
                CREATE TABLE IF NOT EXISTS DOCS(
                    filename              text,
                    content_type          text,
                    content               bytea,
                    uploaded_by           text,
                    uploaded_date         date
                )
                ''')

            c.execute('''
                CREATE UNIQUE INDEX docs_idx ON DOCS(filename)
            ''')

            logger.info("created table and indexes for DOCS")

            # single commit covers all DDL above
            conn.commit()
            logger.info('done create database')
        except Exception as e:
            logger.error('Unable to run create_db')
            logger.error(e)
            conn.rollback()

        finally:
            conn.close()
Exemplo n.º 25
0
    def create_db(cls):
        """(Re)create all application tables and their unique indexes.

        WARNING: drops every existing table first, so all stored data is lost.
        Rolls back and logs on any failure; always closes the connection.
        """
        conn = cls.datasource.get_connection()
        try:
            c = conn.cursor()

            # -- CRAWLED_JOBS: one row per crawled posting, unique by job_title
            c.execute('DROP TABLE IF EXISTS CRAWLED_JOBS')
            c.execute('DROP INDEX IF EXISTS job_title_idx')

            c.execute('''
                CREATE TABLE IF NOT EXISTS CRAWLED_JOBS(
                    source            text,
                    crawled_date      date,
                    publish_date      date,
                    job_title         text,
                    job_desc          text,
                    job_details_link  text,
                    job_location      text,
                    job_country       text,
                    salary            text,
                    employer_name     text,
                    contact           text
                )
                ''')

            c.execute('''
                CREATE UNIQUE INDEX job_title_idx ON CRAWLED_JOBS(job_title)
            ''')

            logger.info("created table and indexes for CRAWLED_JOBS")

            # -- JOB_REJECTION_RULES: patterns for filtering out unwanted jobs
            c.execute('DROP TABLE IF EXISTS JOB_REJECTION_RULES')
            c.execute('DROP INDEX IF EXISTS reject_pattern_idx')

            c.execute('''
                CREATE TABLE IF NOT EXISTS JOB_REJECTION_RULES(
                    reject_pattern    text,
                    reject_reason     text
                )
                ''')

            c.execute('''
                CREATE UNIQUE INDEX reject_pattern_idx ON JOB_REJECTION_RULES(reject_pattern)
            ''')

            logger.info("created table and indexes for JOB_REJECTION_RULES")

            # -- BLOCKED_CONTACTS: contacts whose postings are filtered out
            c.execute('DROP TABLE IF EXISTS BLOCKED_CONTACTS')
            c.execute('DROP INDEX IF EXISTS blocked_contacts_idx')

            c.execute('''
                CREATE TABLE IF NOT EXISTS BLOCKED_CONTACTS(
                    contact      text,
                    block_reason text
                )
                ''')

            c.execute('''
                CREATE UNIQUE INDEX blocked_contacts_idx ON BLOCKED_CONTACTS(contact)
            ''')

            logger.info("created table and indexes for BLOCKED_CONTACTS")

            # -- USERS: application accounts, unique by username
            c.execute('DROP TABLE IF EXISTS USERS')
            c.execute('DROP INDEX IF EXISTS users_idx')

            c.execute('''
                CREATE TABLE IF NOT EXISTS USERS(
                    username            text,
                    password            text,
                    email               text,
                    subscription_status text,
                    role           text,
                    last_login_date     date,
                    register_date       date
                )
                ''')

            c.execute('''
                CREATE UNIQUE INDEX users_idx ON USERS(username)
            ''')

            logger.info("created table and indexes for USERS")

            # -- DOCS: uploaded files stored as binary blobs, unique by filename
            c.execute('DROP TABLE IF EXISTS DOCS')
            c.execute('DROP INDEX IF EXISTS docs_idx')

            c.execute('''
                CREATE TABLE IF NOT EXISTS DOCS(
                    filename              text,
                    content_type          text,
                    content               bytea,
                    uploaded_by           text,
                    uploaded_date         date
                )
                ''')

            c.execute('''
                CREATE UNIQUE INDEX docs_idx ON DOCS(filename)
            ''')

            logger.info("created table and indexes for DOCS")

            # single commit covers all DDL above
            conn.commit()
            logger.info('done create database')
        except Exception as e:
            logger.error('Unable to run create_db')
            logger.error(e)
            conn.rollback()

        finally:
            conn.close()
Exemplo n.º 26
0
 def _crawl(cls, spider_name=None):
     """Kick off a scrapy crawl for the named spider; silently skip when no name is given."""
     if spider_name:
         command = 'cd %s && scrapy crawl %s' % (app_home_dir, spider_name)
         os.popen(command)
         logger.info('Done running spider %s' % spider_name)
     return None
Exemplo n.º 27
0
    def run_housekeeper(cls):
        """Purge old, blocked and rejected job records, logging each phase."""
        older_than = config.HOUSEKEEPING_RECORD_ORDLER_THAN
        logger.info('start running housekeeper..')

        # each phase is a (description, action) pair; actions run in order
        phases = (
            ('removing records older than %s days..' % older_than,
             lambda: JobItem.remove_old_records(retention_days=older_than)),
            ('removing records posted by blocked contacts..',
             JobItem.remove_blocked_records),
            ('removing records should have been rejected..',
             JobItem.remove_records_matches_rejection_pattern),
        )
        for description, action in phases:
            logger.info('start ' + description)
            action()
            logger.info('done ' + description)

        logger.info('done running housekeeper..')