Example #1
    def init_validator():
        """ Check if the validating thread is alive and, if not, start it. """
        if not ValidatorThreadHandler.__validator_thread.is_alive():
            logger.info("Creating validator thread")
            ValidatorThreadHandler.__validator_thread = threading.Thread(
                target=ValidatorThreadHandler.__process_method)
            ValidatorThreadHandler.__validator_thread.start()
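
A minimal, self-contained sketch of the same restart-if-dead pattern (the class, queue, and method names below are illustrative, not taken from the original module):

import queue
import threading

class WorkerHandler:
    """ Keep a single background worker alive, restarting it on demand. """
    _queue = queue.Queue()
    _worker = threading.Thread(target=lambda: None)  # placeholder; never started

    @staticmethod
    def _process():
        while True:
            item = WorkerHandler._queue.get()
            # ... handle the item here ...
            WorkerHandler._queue.task_done()

    @staticmethod
    def ensure_worker():
        """ Start the worker thread if it is not already running. """
        if not WorkerHandler._worker.is_alive():
            WorkerHandler._worker = threading.Thread(
                target=WorkerHandler._process, daemon=True)
            WorkerHandler._worker.start()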
Example #2
def _reprocess_image(queue: Queue) -> None:
    while not queue.empty():
        img_data = queue.get()
        img_data["filePath"] = f"{os.path.splitext(img_data['filePath'])[0]}.tif"
        tif_filename = os.path.basename(img_data["filePath"])
        local_file = f"TEMP_{os.path.basename(img_data['id'])}"
        logger.info(f"Processing {img_data['id']}")
        if _download_source_file(img_data, local_file):
            image = _preprocess_image(img_data, local_file)
            if image:
                image.tiffsave(tif_filename, tile=True, pyramid=True,
                               compression=config.COMPRESSION_TYPE,
                               tile_width=config.PYTIF_TILE_WIDTH,
                               tile_height=config.PYTIF_TILE_HEIGHT,
                               xres=config.DPI_VALUE, yres=config.DPI_VALUE)
                new_tiff = Image.tiffload(tif_filename)
                _upload_files(img_data, local_file, tif_filename)
                gql.update_item(img_data['id'], new_tiff.height, new_tiff.width)
                os.remove(tif_filename)
            os.remove(local_file)
            logger.info(f'Completed {local_file}')
        else:
            gql.remove_missing_item(img_data['id'])
            Statistic.download_err(img_data)
        Statistic.attempted()
        queue.task_done()
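
The tiffsave/tiffload calls above are the pyvips API. A stand-alone sketch of writing a tiled pyramidal TIFF the same way; file names and parameter values here are placeholders:

import pyvips

image = pyvips.Image.new_from_file('source.jpg')
image.tiffsave('pyramid.tif', tile=True, pyramid=True,
               compression='jpeg',          # or 'lzw', 'deflate', ...
               tile_width=256, tile_height=256,
               xres=300 / 25.4, yres=300 / 25.4)  # pyvips resolution is pixels/mm

# Reload to read back the dimensions, as the worker above does.
reloaded = pyvips.Image.new_from_file('pyramid.tif')
print(reloaded.width, reloaded.height)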
Example #3
    def __rule_1_after(msg):
        """ after the reply from CRCL is back check if the incident is spam """

        try:
            report_id = msg['body']['reportID']
            report_type = shared.processed_mgs[report_id]['inc']['incidentType']
            precipitation = msg['body']['precipitation']
        except (KeyError, TypeError, ValueError, IndexError) as e:
            logger.info(
                "Cannot load reportID / report type / precipitation from processed messages"
            )
            logger.debug(str(type(e)) + str(e))
            return

        logger.info("Validating 803 with type: " + str(report_type) +
                    " and precipitation: " + str(round(precipitation, 2)) +
                    " ID: " + str(report_id))

        # Precipitation-type reports (Precipitation, Heavy Precipitation,
        # Blizzard) with essentially no measured precipitation (< 0.1) count as spam.
        spam = precipitation < 0.1 and report_type in (
            'Precipitation', 'HeavyPrecipitation', 'Heavy Precipitation',
            'Blizzard')
        return spam
Example #4
def tournament(url: str, name: str) -> int:
    s = fetch_tools.fetch(url, character_encoding='utf-8', retry=True)

    # Tournament details
    soup = BeautifulSoup(s, 'html.parser')
    cell = soup.find('div', {'id': 'EventReport'}).find_all('td')[1]

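    # Note: the scraped value below overwrites the 'name' argument passed in.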
    name = cell.find('a').string.strip()
    day_s = cell.find('br').next.strip()
    if '-0001' in day_s:
        # Tournament has been incorrectly configured.
        return 0

    dt, competition_series = get_dt_and_series(name, day_s)
    top_n = find_top_n(soup)
    if top_n == competition.Top.NONE: # Tournament is in progress.
        logger.info('Skipping an in-progress tournament.')
        return 0
    db().begin('tournament')
    competition_id = competition.get_or_insert_competition(dt, dt, name, competition_series, url, top_n)
    ranks = rankings(soup)
    medals = medal_winners(s)
    final = finishes(medals, ranks)
    n = add_decks(dt, competition_id, final, s)
    db().commit('tournament')
    return n
Example #5
def message_to_queue(message):
    try:
        message = json.loads(message)
    except json.decoder.JSONDecodeError as e:
        logger.warning("message from bus is not a valid json: " + str(e))
        logger.debug(message)
        return
    message_queue.MessageQueue.put_message(message)
    logger.info("Message arrived from bus and inserted in queue")
    logger.debug(json.dumps(message))
    ValidatorThreadHandler.init_validator()
Example #6
    def validate_803(msg):
        """ find the incident in the local storage and continue validation according to the info from TOP803"""

        logger.info("Message TOP803 is processed.")
        logger.debug("TOP803 message: " + str(msg))

        if msg['body']['reportID'] not in shared.processed_mgs:
            logger.warning(
                "Message TOP803 does not correspond to a stored report. ID: " +
                str(msg['body']['reportID']))
            return

        Validator.__incident_spam(msg['body']['reportID'],
                                  Validator.__rule_1_after(msg))
Example #7
def process_image_changes(data: list):
    jobs = Queue()
    for img_data in data:
        jobs.put(img_data)
    logger.info(f"{jobs.qsize()} IMAGES TO PROCESS")

    start_time = time.time()
    for _ in range(config.MAX_THREADS):
        threading.Thread(target=_reprocess_image, args=(jobs,)).start()

    jobs.join()
    end_time = time.time()
    elapsed_time = end_time - start_time
    Statistic.summary()
    logger.info(f"ELAPSED TIME = {elapsed_time} seconds")
Example #8
    def __incident_spam(reportID, spam):
        """ when the message is detected to be spam send update TOP801 to KBS

        Even if not detected to be spam inform KBS via TOP801. So that it is known that the validation step works.
        """
        logger.info("Message IS " + ('' if spam else 'NOT ') + "SPAM! " +
                    " Passed validation successfully. ID: " + str(reportID))
        msg = Validator.generate_TOP801(reportID, spam)

        if msg is None:
            logger.warning("TOP801 was not generated correctly")
            return

        Validator.bus_prod.send(topic=msg['header']['topicName'],
                                message=json.dumps(msg))
Example #9
    def send(self, topic, message):

        logger.info("Sending: " + str(topic))
        logger.debug("Sending: " + str(topic) + ": " + str(message))

        # Produce and flush message to bus
        try:
            self.producer.produce(topic, message.encode('utf-8'), 'key', -1,
                                  self.on_delivery)
            self.producer.flush()
        except Exception as err:
            logger.error('Sending data failed: ' + str(err))
            return False

        return True
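
The produce()/flush() calls above match the confluent-kafka Producer API (an assumption; the original import is not shown). A minimal sketch with a placeholder broker and topic:

from confluent_kafka import Producer

producer = Producer({'bootstrap.servers': 'localhost:9092'})  # placeholder broker

def on_delivery(err, msg):
    """ Called once per message with the delivery result. """
    if err is not None:
        print('Delivery failed: ' + str(err))

producer.produce('demo-topic', 'payload'.encode('utf-8'),
                 key='key', on_delivery=on_delivery)
producer.flush()  # block until all queued messages are delivered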
Example #10
def image(c: str = '') -> wrappers.Response:
    names = c.split('|')
    try:
        requested_cards = oracle.load_cards(names)
        path = image_fetcher.download_image(requested_cards)
        if path is None:
            raise InternalServerError(f'Failed to get image for {c}')
        return send_file(
            os.path.abspath(path)
        )  # Send abspath to work around monolith root versus web root.
    except TooFewItemsException as e:
        logger.info(f'Did not find an image for {c}: {e}')
        if len(names) == 1:
            return redirect(
                f'https://api.scryfall.com/cards/named?exact={c}&format=image',
                code=303)
        return make_response('', 400)
Example #11
def main(start=0, end=None):
    import random
    from datetime import datetime
    from bus_communication import bus_producer

    with open("VAL_TOP030.json", 'r') as f:
        top030 = json.load(f)

    bp = bus_producer.BusProducer()

    max_delay = 1  # delay in the range [0, max_delay] from uniform distribution

    if end is None:
        end = len(top030)

    count = 0
    for m in top030:
        if count >= end:
            break
        count += 1
        if count < start:
            continue

        logger.info("sending message 30 to bus : " + str(count))

        try:
            m['header']['sentUTC'] = datetime.utcnow().isoformat().split(
                ".")[0] + 'Z'
        except (KeyError, TypeError):
            pass

        try:
            if 'incidents' in m['body']:
                for inc in m['body']['incidents']:
                    inc['timestamp'] = datetime.utcnow().isoformat().split(
                        ".")[0] + 'Z'
        except (KeyError, TypeError):
            pass

        bp.send(topic=m['header']['topicName'], message=json.dumps(m))

        time.sleep(random.random() * max_delay)
Example #12
    def validate(message):
        """ read the message and determine if it is TOP 030 or TOP 803 and invoke the corresponding method
        """
        print("Thread id in validator: " + str(threading.get_ident()))
        try:
            inc_topic = message['header']['topicName']
        except (KeyError, TypeError, ValueError, IndexError) as e:
            logger.warning("could not read topicName from message. Do nothing")
            logger.debug(e)
            logger.debug(message)
            return

        logger.info("Message is now processed. TOPIC: " + str(inc_topic))

        if inc_topic == 'TOP030_REPORT_REQUESTED':
            Validator.validate_TOP030(message)
        elif inc_topic == 'TOP803_WEATHER_REPORT':
            Validator.validate_803(message)
        else:
            logger.warning(
                "Message read in validator is neither TOP030 nor TOP803")
Example #13
def comment(job_id):
    data = json.loads(flask.request.data)

    comment_text = data['text']
    comment_date = dateutil.parser.parse(data['date'])
    salary = data['salary'] or 0
    rating = (float(data['rating']) / 5) or 0

    if job_id is not None and comment_text:
        job = Job.objects(id=job_id).first()

        if not job:
            return render_template('404.html')

        logger.info(COMPONENT, 'Adding comment for job: {}'.format(job_id))

        new_comment = Comment(comment=comment_text, date=comment_date, salary=salary, crawled=False,
                              rating=AggregateRating(rating=rating, count=1))

        job.update(push__comments=new_comment)

    return json.dumps({'success': True}), 200, {'ContentType': 'application/json'}
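
The push__comments update above is MongoEngine's atomic list append. A self-contained sketch of the same operation; the schema and connection below are illustrative only:

from mongoengine import Document, EmbeddedDocument, connect, fields

connect('demo')  # placeholder database

class Comment(EmbeddedDocument):
    comment = fields.StringField()
    salary = fields.FloatField(default=0)

class Job(Document):
    title = fields.StringField()
    comments = fields.EmbeddedDocumentListField(Comment)

job = Job(title='example').save()
# push__<field> appends to the list atomically, without reloading the document.
job.update(push__comments=Comment(comment='great team', salary=22.5))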
Example #14
    def listen(self, performed_action, topics=None):
        # Topics should be a list of topic names e.g. ['topic1', 'topic2']
        if topics is None:
            topics = self.default_topics

        self.listening = True

        # Subscribe to topics
        try:
            self.consumer.subscribe(topics)
        except Exception as e:
            logger.error("Error @ BusConsumer.listen()")
            logger.debug(str(type(e)) + str(e))
            return False
        logger.info("listener subscribed successfully to topics:" +
                    str(topics))

        # Initiate a loop for continuous listening
        while self.listening:
            msg = self.consumer.poll(0)

            # If a message is received and it is not an error message
            if msg is not None and msg.error() is None:

                # Add incoming message to requests database
                try:
                    message_text = msg.value().decode('utf-8')
                except (UnicodeDecodeError, AttributeError):
                    message_text = msg.value()

                performed_action(message_text)

            # TODO: check if it works ok with the sleep .5
            time.sleep(0.5)

        # Unsubscribe and close consumer
        self.consumer.unsubscribe()
        self.consumer.close()
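
The subscribe()/poll() loop above likewise matches the confluent-kafka Consumer API (again an assumption). A minimal sketch; note that polling with a timeout avoids the fixed time.sleep(0.5):

from confluent_kafka import Consumer

consumer = Consumer({
    'bootstrap.servers': 'localhost:9092',  # placeholder broker
    'group.id': 'demo-group',
    'auto.offset.reset': 'earliest',
})
consumer.subscribe(['demo-topic'])

try:
    while True:
        msg = consumer.poll(1.0)  # block up to 1 s instead of busy-waiting
        if msg is None or msg.error() is not None:
            continue
        print(msg.value().decode('utf-8'))
finally:
    consumer.close()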
Example #15
def import_comment(**kwargs):
    """Import comment from RateMyCoopJob.

    Keyword arguments:
    employer_name -- Employer name
    job_title -- Title of job
    comments: -- Array of comments
        comment -- Comment
        comment_date -- Date comment was submitted. Note: in non-standard form such as: 5 years ago, 3 weeks ago etc
        salary -- Job salary (hourly)
        rating -- Job rating out of 5 (1 - 5 stars on ratemycoopjob)
    """

    employer_name = kwargs['employer_name'].lower()

    job_title = kwargs['job_title'].lower()

    # If employer alias exists (ex. Research in motion -> Blackberry), use instead
    if employer_name in employer_alias.aliases:
        employer_name = employer_alias.aliases[employer_name].lower()

    # If employer does not exist
    if not Employer.objects.search_text(
            "\"{}\"".format(employer_name)).count() > 0:
        logger.info(
            COMPONENT,
            'Employer: {} does not exist, ignoring..'.format(employer_name))
        return

    logger.info(
        COMPONENT, 'Importing comments for job: {} from employer: {}'.format(
            job_title, employer_name))

    employer = Employer.objects.search_text(
        "\"{}\"".format(employer_name)).no_dereference().first()

    # Iterate through all comments
    for index, comment_obj in enumerate(kwargs['comments']):

        comment = comment_obj['comment']

        comment_date = _get_comment_date(comment_obj['comment_date'])

        salary = float(comment_obj['salary'])

        rating = float(comment_obj['rating']) / 5

        # If job does not exist add to employer
        if not employer.job_exists(job_title):
            if employer.comment_exists(comment=comment,
                                       date=comment_date,
                                       salary=salary,
                                       rating=rating):
                logger.info(
                    COMPONENT,
                    'Comment: {} already exists for employer: {}, ignoring'.
                    format(index, employer_name))

            else:
                logger.info(
                    COMPONENT, 'Adding comment: {} to employer: {}'.format(
                        index, employer_name))

                new_comment = Comment(comment=comment,
                                      date=comment_date,
                                      salary=salary,
                                      crawled=True,
                                      rating=AggregateRating(rating=rating,
                                                             count=1))

                employer.update(push__comments=new_comment)

        # Job already exists
        else:
            job = Job.objects(id__in=[job.id for job in employer.jobs],
                              title=job_title).first()

            if job.comment_exists(comment=comment,
                                  date=comment_date,
                                  salary=salary,
                                  rating=rating):
                logger.info(
                    COMPONENT,
                    'Comment: {} already exists for job: {} for employer: {}, ignoring'
                    .format(index, job_title, employer_name))

            else:
                logger.info(
                    COMPONENT, 'Adding comment: {} for job: {} from {}'.format(
                        index, job_title, employer_name))

                new_comment = Comment(comment=comment,
                                      date=comment_date,
                                      salary=salary,
                                      crawled=True,
                                      rating=AggregateRating(rating=rating,
                                                             count=1))

                job.update(push__comments=new_comment)
Example #16
def import_job(**kwargs):
    """Import job.

    Keyword arguments:
    employer_name -- Employer name
    job_title -- Title of job
    summary -- Job summary
    year -- Year the job was advertised
    term -- Term job was advertised [Fall, Winter, Spring]
    location -- Location job was advertised
    openings -- Number of job openings
    remaining -- Number of job openings remaining
    applicants -- Number of applicants job has (Optional)
    levels -- Levels job is intended for [Junior, Intermediate, Senior]
    programs -- Programs the job is specified for
    url -- URL of job
    date -- Date job was crawled (useful for knowing exactly # of applicants at what time)
    index -- Boolean to indicate whether to index or not (default True)
    """

    employer_name = kwargs['employer_name'].lower()

    job_title = kwargs['job_title'].lower()

    term = kwargs['term']

    levels = []

    for level in kwargs['levels']:
        uw_level = Term.get_level(level)
        if uw_level:
            levels.append(uw_level)
        else:
            logger.error(COMPONENT, 'Error processing level: {}'.format(level))

    programs = []

    for program in kwargs['programs']:
        uw_program = Program.get_program(program)
        if uw_program:
            programs.append(uw_program)
        else:
            logger.error(COMPONENT,
                         'Error processing program: {}'.format(program))

    location = kwargs['location'].lower()

    openings = int(kwargs['openings'])

    remaining = int(kwargs['remaining']) if 'remaining' in kwargs else openings

    summary = kwargs['summary']

    filtered_summary = engine.filter_summary(summary)

    summary_keywords = engine.get_keywords(filtered_summary, programs)

    date = kwargs['date']

    year = date.year

    url = kwargs['url']

    applicants = 0

    try:
        if kwargs['applicants']:
            applicants = int(kwargs['applicants'])
    except Exception:
        pass

    # Default indexing to True, as the docstring states; the original
    # `if index in kwargs` test was a bug (it checked the value False
    # against the dict keys, so it never fired).
    index = kwargs.get('index', True)

    logger.info(COMPONENT,
                'Importing job: {} from {}'.format(job_title, employer_name))

    # If employer does not exist, create it
    if not Employer.employer_exists(employer_name):
        logger.info(
            COMPONENT,
            'Employer: {} does not exist, creating..'.format(employer_name))

        employer = Employer(name=employer_name)

        logger.info(COMPONENT, 'Creating job: {}'.format(job_title))

        location = Location(name=location)

        applicant = Applicant(applicants=applicants, date=date)

        keywords = [
            Keyword(keyword=k['keyword'], types=k['types'])
            for k in summary_keywords
        ]

        # New job so number of remaining positions is same as openings
        job = Job(title=job_title,
                  summary=filtered_summary,
                  year=year,
                  term=term,
                  location=[location],
                  openings=openings,
                  remaining=remaining,
                  applicants=[applicant],
                  levels=levels,
                  programs=programs,
                  url=url,
                  keywords=keywords)

        job.save()
        job.reload()

        employer.jobs.append(job)
        employer.save()
        employer.reload()

        if index:
            elastic.index_employer_waterlooworks(employer)
            elastic.index_job_waterlooworks(employer, job)

    # Employer already exists
    else:
        employer = Employer.objects(
            name=employer_name).no_dereference().first()

        logger.info(COMPONENT,
                    'Employer: {} already exists'.format(employer_name))

        # If job does not exist, create it
        if not employer.job_exists(job_title):
            logger.info(COMPONENT, 'Creating job: {}'.format(job_title))

            location = Location(name=location)

            applicant = Applicant(applicants=applicants, date=date)

            keywords = [
                Keyword(keyword=k['keyword'], types=k['types'])
                for k in summary_keywords
            ]

            # New job so number of remaining positions is same as openings
            job = Job(title=job_title,
                      summary=engine.filter_summary(summary),
                      year=year,
                      term=term,
                      location=[location],
                      openings=openings,
                      remaining=remaining,
                      applicants=[applicant],
                      levels=levels,
                      programs=programs,
                      url=url,
                      keywords=keywords)

            job.save()
            job.reload()

            employer.update(push__jobs=job)

            if index:
                elastic.update_employer_waterlooworks(employer)
                elastic.index_job_waterlooworks(employer, job)

        # Job already exists
        else:
            logger.info(COMPONENT, 'Job: {} already exists'.format(job_title))

            job = Job.objects(id__in=[job.id for job in employer.jobs],
                              title=job_title).first()

            if year < job.year:
                raise DataIntegrityError(
                    'Job: {} by {} cannot be advertised before {}'.format(
                        job_title, employer_name, job.year))

            filtered_summary_compare = re.sub(
                r'\W+', '',
                filtered_summary.lower().strip()).strip()
            job_summary_compare = re.sub(r'\W+', '',
                                         job.summary.lower().strip()).strip()

            # Job summary is not the same. In this case the employer most likely changed the job
            if filtered_summary_compare != job_summary_compare:

                if openings >= 1:
                    logger.info(
                        COMPONENT,
                        'Job: {}: different summary detected, deprecating and creating new job..'
                        .format(job_title))

                    job.update(set__deprecated=True)

                    location = Location(name=location)

                    applicant = Applicant(applicants=applicants, date=date)

                    keywords = [
                        Keyword(keyword=k['keyword'], types=k['types'])
                        for k in summary_keywords
                    ]

                    # Assume new job so number of remaining positions is same as openings
                    new_job = Job(title=job_title,
                                  summary=filtered_summary,
                                  year=year,
                                  term=term,
                                  location=[location],
                                  openings=openings,
                                  remaining=remaining,
                                  applicants=[applicant],
                                  levels=levels,
                                  programs=programs,
                                  url=url,
                                  keywords=keywords)

                    new_job.save()
                    new_job.reload()

                    employer.update(push__jobs=new_job)

                    if index:
                        elastic.delete_employer_waterlooworks(employer)
                        elastic.delete_job_waterlooworks(employer, job)
                        elastic.index_employer_waterlooworks(employer)
                        elastic.index_job_waterlooworks(employer, new_job)
                else:
                    logger.info(
                        COMPONENT,
                        'Job: {}: different summary detected but invalid openings: {}, ignoring..'
                        .format(job_title, openings))

            # Job is the same (same title and description)
            else:
                # If job is being advertised in new term
                if year != job.year or term != job.term:
                    logger.info(
                        COMPONENT,
                        'Job: {}: being advertised in new term, updating..'.
                        format(job_title))

                    # Add hire ratio for previous term
                    hire_ratio = float(job.openings -
                                       job.remaining) / job.openings

                    job.hire_rate.add_rating(hire_ratio)

                    location = Location(name=location)

                    applicant = Applicant(applicants=applicants, date=date)

                    hire_rate = AggregateRating(rating=job.hire_rate.rating,
                                                count=job.hire_rate.count)

                    job.update(set__year=year,
                               set__term=term,
                               add_to_set__location=location,
                               set__openings=openings,
                               set__remaining=remaining,
                               push__applicants=applicant,
                               set__hire_rate=hire_rate,
                               set__levels=levels,
                               set__programs=programs,
                               set__url=url,
                               set__last_indexed=datetime.now())

                    if index:
                        elastic.update_job_waterlooworks(employer, job)

                # Job is being updated. We need to update location, openings, levels, remaining, hire_rate, applicants
                else:
                    logger.info(
                        COMPONENT,
                        'Job: {}: updating for current term'.format(job_title))

                    remaining = job.remaining

                    # Job posting has decreased, some positions filled up
                    if openings < remaining:
                        remaining = openings

                    location = Location(name=location)

                    applicant = Applicant(applicants=applicants, date=date)

                    job.update(add_to_set__location=location,
                               set__remaining=remaining,
                               set__levels=list(set(levels + job.levels)),
                               push__applicants=applicant,
                               set__programs=list(set(programs +
                                                      job.programs)),
                               set__url=url,
                               set__last_indexed=datetime.now())

                    if index:
                        elastic.update_job_waterlooworks(employer, job)
Example #17
    def validate_TOP030(message):
        """
        δες αν εχει ξαναέρθει msg με το incident id που ηρθε τώρα (είτε τοπικά είτε σε ένα csv).
            Αν έχει ξαναέρθει, τότε μην κάνεις τίποτα.
            Αν δεν έχει ξαναέρθει, τότε δες αν το msg έχει attachment και incident type.
                Αν δεν έχει, τότε μην κάνεις τίποτα
                Αν έχει (incident type)
                    Βαλε το στην λίστα των incidents που έχουν επεξεργαστεί (έχουν περάσει ή περνανε τώρα validation)
                    Δες αν το incident type == Heatwave (?)
                        Αν οχι, τότε βάλε spam == false
                        Αν ναι, τότε ρώτα το CRCL για να σου πει τις καιρικές συνθήκες στην περιοχή του incident
        """

        report_id = None
        report_type = None
        inc_long = None
        inc_lat = None
        report_time = None

        logger.debug("Processed TOP030 message: " + str(message))

        header = message['header']

        try:
            inc_long = float(message['body']['position']['long'])
            inc_lat = float(message['body']['position']['lat'])
        except (KeyError, TypeError, ValueError, IndexError) as e:
            logger.info(
                "Incoming message does not have location, validation will stop."
            )
            logger.debug(str(type(e)) + str(e))
            logger.debug(message)
            return

        try:
            incidents = message['body']['incidents']
        except (KeyError, TypeError) as e:
            logger.info("No reports in TOP030, validation will stop.")
            logger.debug(str(type(e)) + str(e))
            logger.debug(str(message))
            return

        if len(incidents) == 0:
            logger.info("No incidents in TOP030.")

        for inc in incidents:
            try:
                report_id = inc['reportId']
                report_type = inc['incidentType']
                report_time = inc['timestamp']
            except (KeyError, TypeError, ValueError, IndexError) as e:
                logger.warning(
                    "Incident does not have report ID / incident Type / timestamp"
                )
                logger.debug(str(type(e)) + str(e))
                return

            if report_id in shared.processed_mgs:
                logger.debug("Report already processed. ReportId: " +
                             str(report_id))
                continue

            shared.processed_mgs[report_id] = {'inc': inc, 'header': header}

            # TODO: check if spam field is already there, and if is spam=True/False stop validation (not None)

            logger.info("Report is checked to determine if it is spam. ID:" +
                        str(report_id))
            if Validator.__rule_1_pre(report_type) is True:
                logger.info("Asking CRCL for report with ID:" + str(report_id))
                t_802 = Validator.generate_TOP802(message, report_id, inc_long,
                                                  inc_lat, report_time)
                Validator.bus_prod.send(topic=t_802['header']['topicName'],
                                        message=json.dumps(t_802))
            else:
                Validator.__incident_spam(report_id, False)
Example #18
def update_job(**kwargs):
    """Update job.

    Keyword arguments:
    id -- Job ID
    summary -- Job summary
    location -- Location job was advertised
    programs -- Programs the job is specified for
    levels -- Levels job is intended for [Junior, Intermediate, Senior]
    openings -- Number of job openings
    index -- Boolean to indicate whether to index or not (default True)
    """

    summary = kwargs['summary']

    location = kwargs['location'].lower()

    levels = kwargs['levels']

    programs = []

    for program in kwargs['programs']:
        uw_program = Program.get_program(program)
        if uw_program:
            programs.append(uw_program)
        else:
            logger.error(COMPONENT,
                         'Error processing program: {}'.format(program))

    openings = 0

    try:
        if kwargs['openings']:
            openings = int(kwargs['openings']) or 0
    except Exception:
        pass

    # Default indexing to True, as the docstring states; the original
    # `if index in kwargs` test was a bug (it checked the value False
    # against the dict keys, so it never fired).
    index = kwargs.get('index', True)

    job = Job.objects(id=kwargs['id']).first()

    remaining = job.openings

    # Job posting has decreased, some positions filled up
    if openings < job.openings:
        remaining = openings

    filtered_summary = engine.filter_summary(summary)

    summary_keywords = engine.get_keywords(filtered_summary, programs)

    filtered_summary_compare = re.sub(
        r'\W+', '',
        filtered_summary.lower().strip()).strip()
    job_summary_compare = re.sub(r'\W+', '',
                                 job.summary.lower().strip()).strip()

    employer = Employer.objects(jobs=kwargs['id']).first()

    # Job summary is not the same. In this case the employer most likely changed the job
    if filtered_summary_compare != job_summary_compare:

        if openings >= 1:
            logger.info(
                COMPONENT,
                'Job: {}: different summary detected, deprecating and creating new job..'
                .format(kwargs['id']))

            job.update(set__deprecated=True)

            location = Location(name=location)

            keywords = [
                Keyword(keyword=k['keyword'], types=k['types'])
                for k in summary_keywords
            ]

            # Assume new job so number of remaining positions is same as openings
            new_job = Job(title=job.title,
                          summary=filtered_summary,
                          year=job.year,
                          term=job.term,
                          location=[location],
                          openings=openings,
                          remaining=openings,
                          levels=levels,
                          programs=programs,
                          url=job.url,
                          keywords=keywords)

            new_job.save()

            employer.update(push__jobs=new_job)

            if index:
                elastic.delete_employer_waterlooworks(employer)
                elastic.delete_job_waterlooworks(employer, job)
                elastic.index_employer_waterlooworks(employer)
                elastic.index_job_waterlooworks(employer, new_job)
        else:
            logger.info(
                COMPONENT,
                'Job: {}: different summary detected but invalid openings: {}, ignoring..'
                .format(job.title, openings))
    else:
        logger.info(COMPONENT,
                    'Job: {}: updating for current term'.format(kwargs['id']))

        location = Location(name=location)

        job.update(add_to_set__location=location,
                   set__remaining=remaining,
                   set__levels=list(set(levels + job.levels)),
                   set__programs=list(set(programs + job.programs)),
                   set__last_indexed=datetime.now())

        if index:
            elastic.update_job_waterlooworks(employer, job)
Example #19
def index_waterlooworks():
    logger.info(COMPONENT, 'Indexing waterlooworks data')

    elastic_instance.indices.delete(index='waterlooworks', ignore=[404])

    elastic_instance.indices.create('waterlooworks', body={
        "mappings": {
            "employers": {
                "properties": {
                    "employer_name": {"type": "string"},
                    "employer_jobs": {"type": "string"}
                }
            },
            "jobs": {
                "_parent": {
                    "type": "employers"
                },
                "properties": {
                    "job_title": {"type": "string"},
                    "job_year": {"type": "integer"},
                    "job_term": {"type": "string"},
                    "job_summary": {"type": "string"},
                    "job_locations": {"type": "string"},
                    "job_programs": {"type": "string"},
                    "job_levels": {"type": "string"}
                }
            }
        }
    })

    logger.info(COMPONENT, 'Indexing waterlooworks employers and jobs')

    employers = []
    jobs = []

    for employer in Employer.objects.only('name', 'jobs'):
        logger.info(COMPONENT, 'Indexing employer: {}'.format(employer.name))

        employer_document = {
            "_index": "waterlooworks",
            "_type": "employers",
            "_id": employer.name,
            "_source": {
                "employer_name": employer.name,
                "employer_jobs": [str(job.id) for job in employer.jobs]
            }
        }

        employers.append(employer_document)

        for job in employer.jobs:
            if not job.deprecated:
                logger.info(COMPONENT, 'Indexing job: {} for employer: {}'.format(job.title, employer.name))

                job_document = {
                    "_index": "waterlooworks",
                    "_type": "jobs",
                    "_parent": employer.name,
                    "_id": str(job.id),
                    "_source": {
                        "employer_name": employer.name,
                        "job_title": job.title,
                        "job_year": job.year,
                        "job_term": job.term,
                        "job_summary": job.summary,
                        "job_keywords": [k.keyword for k in job.keywords],
                        "job_locations": [location.name for location in job.location],
                        "job_programs": job.programs,
                        "job_levels": job.levels
                    }
                }

                jobs.append(job_document)

            if len(jobs) == 1000:
                helpers.bulk(elastic_instance, jobs)
                jobs = []

        if len(employers) == 1000:
            helpers.bulk(elastic_instance, employers)
            employers = []

    if len(employers) > 0:
        helpers.bulk(elastic_instance, employers)

    if len(jobs) > 0:
        helpers.bulk(elastic_instance, jobs)
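
The indexer above flushes to Elasticsearch in batches of 1000 via helpers.bulk; a compact sketch of that batching idea follows (client, index, and documents are placeholders). Note that the `_parent` and multi-type mapping used above date from Elasticsearch 5.x or earlier:

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch('http://localhost:9200')  # placeholder connection

def index_in_batches(docs, batch_size=1000):
    """ Accumulate bulk actions and flush whenever a full batch is ready. """
    batch = []
    for doc in docs:
        batch.append(doc)
        if len(batch) >= batch_size:
            helpers.bulk(es, batch)
            batch = []
    if batch:  # flush the remainder
        helpers.bulk(es, batch)

index_in_batches({'_index': 'demo', '_id': str(i), '_source': {'n': i}}
                 for i in range(2500))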
Example #20
def import_comment(**kwargs):
    """Import comment from RateMyCoopJob.

    Keyword arguments:
    employer_name -- Employer name
    job_title -- Title of job
    comments: -- Array of comments
        comment -- Comment
        comment_date -- Date comment was submitted. Note: in non-standard form such as: 5 years ago, 3 weeks ago etc
        salary -- Job salary (hourly)
        rating -- Job rating out of 5 (1 - 5 stars on ratemycoopjob)
    """

    employer_name = kwargs['employer_name'].lower()

    job_title = kwargs['job_title'].lower()

    # If employer alias exists (ex. Research in motion -> Blackberry), use instead
    if employer_name in employer_alias.aliases:
        employer_name = employer_alias.aliases[employer_name].lower()

    # If employer does not exist
    if not Employer.objects.search_text("\"{}\"".format(employer_name)).count() > 0:
        logger.info(COMPONENT, 'Employer: {} does not exist, ignoring..'.format(employer_name))
        return

    logger.info(COMPONENT, 'Importing comments for job: {} from employer: {}'.format(job_title, employer_name))

    employer = Employer.objects.search_text("\"{}\"".format(employer_name)).no_dereference().first()

    # Iterate through all comments
    for index, comment_obj in enumerate(kwargs['comments']):

        comment = comment_obj['comment']

        comment_date = _get_comment_date(comment_obj['comment_date'])

        salary = float(comment_obj['salary'])

        rating = float(comment_obj['rating']) / 5

        # If job does not exist add to employer
        if not employer.job_exists(job_title):
            if employer.comment_exists(comment=comment, date=comment_date, salary=salary, rating=rating):
                logger.info(COMPONENT, 'Comment: {} already exists for employer: {}, ignoring'
                            .format(index, employer_name))

            else:
                logger.info(COMPONENT, 'Adding comment: {} to employer: {}'.format(index, employer_name))

                new_comment = Comment(comment=comment, date=comment_date, salary=salary, crawled=True,
                                      rating=AggregateRating(rating=rating, count=1))

                employer.update(push__comments=new_comment)

        # Job already exists
        else:
            job = Job.objects(id__in=[job.id for job in employer.jobs], title=job_title).first()

            if job.comment_exists(comment=comment, date=comment_date, salary=salary, rating=rating):
                logger.info(COMPONENT, 'Comment: {} already exists for job: {} for employer: {}, ignoring'
                            .format(index, job_title, employer_name))

            else:
                logger.info(COMPONENT, 'Adding comment: {} for job: {} from {}'.format(index, job_title, employer_name))

                new_comment = Comment(comment=comment, date=comment_date, salary=salary, crawled=True,
                                      rating=AggregateRating(rating=rating, count=1))

                job.update(push__comments=new_comment)
Example #21
def import_job(**kwargs):
    """Import job.

    Keyword arguments:
    employer_name -- Employer name
    job_title -- Title of job
    summary -- Job summary
    year -- Year the job was advertised
    term -- Term job was advertised [Fall, Winter, Spring]
    location -- Location job was advertised
    openings -- Number of job openings
    remaining -- Number of job openings remaining
    applicants -- Number of applicants job has (Optional)
    levels -- Levels job is intended for [Junior, Intermediate, Senior]
    programs -- Programs the job is specified for
    url -- URL of job
    date -- Date job was crawled (useful for knowing exactly # of applicants at what time)
    index -- Boolean to indicate whether to index or not (default True)
    """

    employer_name = kwargs['employer_name'].lower()

    job_title = kwargs['job_title'].lower()

    term = kwargs['term']

    levels = []

    for level in kwargs['levels']:
        uw_level = Term.get_level(level)
        if uw_level:
            levels.append(uw_level)
        else:
            logger.error(COMPONENT, 'Error processing level: {}'.format(level))

    programs = []

    for program in kwargs['programs']:
        uw_program = Program.get_program(program)
        if uw_program:
            programs.append(uw_program)
        else:
            logger.error(COMPONENT, 'Error processing program: {}'.format(program))

    location = kwargs['location'].lower()

    openings = int(kwargs['openings'])

    remaining = int(kwargs['remaining']) if 'remaining' in kwargs else openings

    summary = kwargs['summary']

    filtered_summary = engine.filter_summary(summary)

    summary_keywords = engine.get_keywords(filtered_summary, programs)

    date = kwargs['date']

    year = date.year

    url = kwargs['url']

    applicants = 0

    try:
        if kwargs['applicants']:
            applicants = int(kwargs['applicants'])
    except Exception:
        pass

    # Default indexing to True, as the docstring states; the original
    # `if index in kwargs` test was a bug (it checked the value False
    # against the dict keys, so it never fired).
    index = kwargs.get('index', True)

    logger.info(COMPONENT, 'Importing job: {} from {}'.format(job_title, employer_name))

    # If employer does not exist, create it
    if not Employer.employer_exists(employer_name):
        logger.info(COMPONENT, 'Employer: {} does not exist, creating..'.format(employer_name))

        employer = Employer(name=employer_name)

        logger.info(COMPONENT, 'Creating job: {}'.format(job_title))

        location = Location(name=location)

        applicant = Applicant(applicants=applicants, date=date)

        keywords = [Keyword(keyword=k['keyword'], types=k['types']) for k in summary_keywords]

        # New job so number of remaining positions is same as openings
        job = Job(title=job_title, summary=filtered_summary, year=year,
                  term=term, location=[location], openings=openings, remaining=remaining,
                  applicants=[applicant], levels=levels, programs=programs, url=url,
                  keywords=keywords)

        job.save()
        job.reload()

        employer.jobs.append(job)
        employer.save()
        employer.reload()

        if index:
            elastic.index_employer_waterlooworks(employer)
            elastic.index_job_waterlooworks(employer, job)

    # Employer already exists
    else:
        employer = Employer.objects(name=employer_name).no_dereference().first()

        logger.info(COMPONENT, 'Employer: {} already exists'.format(employer_name))

        # If job does not exist, create it
        if not employer.job_exists(job_title):
            logger.info(COMPONENT, 'Creating job: {}'.format(job_title))

            location = Location(name=location)

            applicant = Applicant(applicants=applicants, date=date)

            keywords = [Keyword(keyword=k['keyword'], types=k['types']) for k in summary_keywords]

            # New job so number of remaining positions is same as openings
            job = Job(title=job_title, summary=engine.filter_summary(summary), year=year,
                      term=term, location=[location], openings=openings, remaining=remaining,
                      applicants=[applicant], levels=levels, programs=programs, url=url,
                      keywords=keywords)

            job.save()
            job.reload()

            employer.update(push__jobs=job)

            if index:
                elastic.update_employer_waterlooworks(employer)
                elastic.index_job_waterlooworks(employer, job)

        # Job already exists
        else:
            logger.info(COMPONENT, 'Job: {} already exists'.format(job_title))

            job = Job.objects(id__in=[job.id for job in employer.jobs], title=job_title).first()

            if year < job.year:
                raise DataIntegrityError('Job: {} by {} cannot be advertised before {}'
                                         .format(job_title, employer_name, job.year))

            filtered_summary_compare = re.sub(r'\W+', '', filtered_summary.lower().strip()).strip()
            job_summary_compare = re.sub(r'\W+', '', job.summary.lower().strip()).strip()

            # Job summary is not the same. In this case the employer most likely changed the job
            if filtered_summary_compare != job_summary_compare:

                if openings >= 1:
                    logger.info(COMPONENT, 'Job: {}: different summary detected, deprecating and creating new job..'
                                .format(job_title))

                    job.update(set__deprecated=True)

                    location = Location(name=location)

                    applicant = Applicant(applicants=applicants, date=date)

                    keywords = [Keyword(keyword=k['keyword'], types=k['types']) for k in summary_keywords]

                    # Assume new job so number of remaining positions is same as openings
                    new_job = Job(title=job_title, summary=filtered_summary, year=year, term=term,
                                  location=[location], openings=openings, remaining=remaining, applicants=[applicant],
                                  levels=levels, programs=programs, url=url, keywords=keywords)

                    new_job.save()
                    new_job.reload()

                    employer.update(push__jobs=new_job)

                    if index:
                        elastic.delete_employer_waterlooworks(employer)
                        elastic.delete_job_waterlooworks(employer, job)
                        elastic.index_employer_waterlooworks(employer)
                        elastic.index_job_waterlooworks(employer, new_job)
                else:
                    logger.info(COMPONENT, 'Job: {}: different summary detected but invalid openings: {}, ignoring..'
                                .format(job_title, openings))

            # Job is the same (same title and description)
            else:
                # If job is being advertised in new term
                if year != job.year or term != job.term:
                    logger.info(COMPONENT, 'Job: {}: being advertised in new term, updating..'.format(job_title))

                    # Add hire ratio for previous term
                    hire_ratio = float(job.openings - job.remaining) / job.openings
                    
                    job.hire_rate.add_rating(hire_ratio)

                    location = Location(name=location)

                    applicant = Applicant(applicants=applicants, date=date)

                    hire_rate = AggregateRating(rating=job.hire_rate.rating, count=job.hire_rate.count)
                    
                    job.update(set__year=year, set__term=term, add_to_set__location=location, set__openings=openings,
                               set__remaining=remaining, push__applicants=applicant, set__hire_rate=hire_rate,
                               set__levels=levels, set__programs=programs, set__url=url, set__last_indexed=datetime.now())

                    if index:
                        elastic.update_job_waterlooworks(employer, job)

                # Job is being updated. We need to update location, openings, levels, remaining, hire_rate, applicants
                else:
                    logger.info(COMPONENT, 'Job: {}: updating for current term'.format(job_title))

                    remaining = job.remaining

                    # Job posting has decreased, some positions filled up
                    if openings < remaining:
                        remaining = openings

                    location = Location(name=location)

                    applicant = Applicant(applicants=applicants, date=date)

                    job.update(add_to_set__location=location, set__remaining=remaining,
                               set__levels=list(set(levels + job.levels)), push__applicants=applicant,
                               set__programs=list(set(programs + job.programs)), set__url=url,
                               set__last_indexed=datetime.now())

                    if index:
                        elastic.update_job_waterlooworks(employer, job)
Example #22
def update_job(**kwargs):
    """Update job.

    Keyword arguments:
    id -- Job ID
    summary -- Job summary
    location -- Location job was advertised
    programs -- Programs the job is specified for
    levels -- Levels job is intended for [Junior, Intermediate, Senior]
    openings -- Number of job openings
    index -- Boolean to indicate whether to index or not (default True)
    """

    summary = kwargs['summary']

    location = kwargs['location'].lower()

    levels = kwargs['levels']

    programs = []

    for program in kwargs['programs']:
        uw_program = Program.get_program(program)
        if uw_program:
            programs.append(uw_program)
        else:
            logger.error(COMPONENT, 'Error processing program: {}'.format(program))

    openings = 0

    try:
        if kwargs['openings']:
            openings = int(kwargs['openings']) or 0
    except Exception:
        pass

    # Default indexing to True, as the docstring states; the original
    # `if index in kwargs` test was a bug (it checked the value False
    # against the dict keys, so it never fired).
    index = kwargs.get('index', True)

    job = Job.objects(id=kwargs['id']).first()

    remaining = job.openings

    # Job posting has decreased, some positions filled up
    if openings < job.openings:
        remaining = openings

    filtered_summary = engine.filter_summary(summary)

    summary_keywords = engine.get_keywords(filtered_summary, programs)

    filtered_summary_compare = re.sub(r'\W+', '', filtered_summary.lower().strip()).strip()
    job_summary_compare = re.sub(r'\W+', '', job.summary.lower().strip()).strip()

    employer = Employer.objects(jobs=kwargs['id']).first()

    # Job summary is not the same. In this case the employer most likely changed the job
    if filtered_summary_compare != job_summary_compare:

        if openings >= 1:
            logger.info(COMPONENT, 'Job: {}: different summary detected, deprecating and creating new job..'
                        .format(kwargs['id']))

            job.update(set__deprecated=True)

            location = Location(name=location)

            keywords = [Keyword(keyword=k['keyword'], types=k['types']) for k in summary_keywords]

            # Assume new job so number of remaining positions is same as openings
            new_job = Job(title=job.title, summary=filtered_summary, year=job.year, term=job.term,
                          location=[location], openings=openings, remaining=openings,
                          levels=levels, programs=programs, url=job.url, keywords=keywords)

            new_job.save()

            employer.update(push__jobs=new_job)

            if index:
                elastic.delete_employer_waterlooworks(employer)
                elastic.delete_job_waterlooworks(employer, job)
                elastic.index_employer_waterlooworks(employer)
                elastic.index_job_waterlooworks(employer, new_job)
        else:
            logger.info(COMPONENT, 'Job: {}: different summary detected but invalid openings: {}, ignoring..'
                        .format(job.title, openings))
    else:
        logger.info(COMPONENT, 'Job: {}: updating for current term'.format(kwargs['id']))

        location = Location(name=location)

        job.update(add_to_set__location=location, set__remaining=remaining,
                   set__levels=list(set(levels + job.levels)),
                   set__programs=list(set(programs + job.programs)), set__last_indexed=datetime.now())

        if index:
            elastic.update_job_waterlooworks(employer, job)