Example #1
0
def get_status(sku):
    """Return the processing status for *sku*, plus basic product details
    when they are available.

    Returns a dict that always carries a "status" key; when the product
    details collection has a matching document, the dict also carries
    "product_name", "product_url" and "image_url".
    """
    response = {"status": __in_queue__}
    try:
        # First status document for this sku; raises IndexError when the
        # sku is not in the status collection yet.
        status = list(db_status.find({"sku": sku}))[0].get("msg")
        db_details = DB.init_db(config.get("details_db")).product_details
        product = list(db_details.find({"sku": sku}))
        product_url, product_name, image_url = "", "", ""
        if product:
            product_name = product[0].get("product_name")
            product_url = product[0].get("url")
            image_url = product[0].get("img")
        logger.info("Status for {}: {}".format(sku, status))
        return {
            "status": status,
            "product_name": product_name,
            "product_url": product_url,
            "image_url": image_url,
        }
    except IndexError:
        # this happens due to a race condition because the sku hasn't been
        # added to the database yet or because it simply doesn't exist. The
        # second case is only true if the URL has been typed in manually or
        # bookmarked but the sku is missing from the URL.
        logger.warning(
            "Product status not yet available for sku {}".format(sku))
        _set_status(__in_queue__, sku)
        # Bug fix: this previously answered {"status": None}; report the
        # queued status that was just written to the DB instead.
        response = {"status": __in_queue__}
    except Exception as e:
        logger.exception(e)
        response = {"status": __error__}
    return response
Example #2
0
def _format_msg(fields, kw, maxlen=_MAXLEN):
    """Render the CEF message for *fields* and append the custom
    extensions found in *kw* (smallest first), stopping before the
    message would exceed *maxlen*."""
    msg = _CEF_FORMAT % fields

    # Candidate extensions, ordered by rendered value size then key size.
    candidates = sorted(
        (len(str(v)), len(k), k, v)
        for k, v in kw.items()
        if k not in _EXTENSIONS
    )

    current_len = len(msg)

    for value_size, key_size, raw_key, raw_value in candidates:
        growth = value_size + key_size + 2
        rendered_value = _convert_ext(raw_value)
        safe_key = _check_key(raw_key)

        if maxlen and current_len + growth > maxlen:
            # msg is too big.
            warn = 'CEF Message too big. %s %s' % (msg, str(kw.items()))
            logger.warning(warn)
            break

        msg += ' %s=%s' % (safe_key, rendered_value)
        current_len += growth

    return msg
Example #3
0
def _format_msg(fields, kw, maxlen=_MAXLEN):
    """Build the base CEF message and append the custom extensions in
    ascending size order, never letting the result grow past *maxlen*."""
    msg = _CEF_FORMAT % fields

    # Collect the non-standard keys together with their rendered sizes.
    extensions = []
    for key, value in kw.items():
        if key in _EXTENSIONS:
            continue
        extensions.append((len(str(value)), len(key), key, value))
    extensions.sort()

    total = len(msg)

    for vlen, klen, key, value in extensions:
        growth = vlen + klen + 2
        value = _convert_ext(value)
        key = _check_key(key)

        if maxlen and total + growth > maxlen:
            # msg is too big.
            warn = 'CEF Message too big. %s %s' % (msg, str(kw.items()))
            logger.warning(warn)
            break

        msg += ' %s=%s' % (key, value)
        total += growth

    return msg
Example #4
0
def main():
    """Entry point for the klambda CLI.

    Parses command-line arguments, then either signs a user up against
    Cognito, or authenticates and dispatches the requested module command,
    logging an event for every executed command.
    """
    logger.config_logs(ROOT_DIR)
    # instance for opening klambda.yml
    processor = file_processor.FileProcessor(os.getcwd() + '/klambda.yml')
    cli_tool = cli.CLI(processor)  # instance of CLI Tool
    client = cognito_client.CognitoClient()
    args = cli_tool.parser.parse_args()  # arguments written by the user
    if args.signup:
        # Sign-up requires exactly four positional values; anything else
        # is silently ignored (pre-existing behavior).
        if len(args.signup) == 4:
            user = klambda_user.KlambdaUser(args.signup[0], args.signup[1],
                                            args.signup[2], args.signup[3])
            client.sign_up(klambda_config.KlambdaConfig.COGNITO_APP_CLIENT,
                           user)
        sys.exit()  # consistency fix: was builtins exit()
    elif args.command:
        credentials = file_processor.FileProcessor(os.getcwd() + '/user.yml')
        # authenticate user before executing any module command
        client.initiate_auth(klambda_config.KlambdaConfig.COGNITO_APP_CLIENT,
                             credentials.data['USERNAME'],
                             str(credentials.data['PASSWORD']))
        module = cli_tool.modules[args.command]  # module chosen by the user
        arg_values = vars(args)  # hoisted: was re-built on every iteration
        for com in module.commands:
            value = arg_values[com]
            if value is not None:  # command has parameters: execute it
                module.execute(com, value)
                event_body = event.Event(
                    processor.data['project']['name'],
                    processor.data['project']['author'],
                    datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    credentials.data['USERNAME'],
                    com,
                    module.name)
                event_logger.save_event(event_body)
    else:
        logger.warning("No module nor command typed, please try again...")
        sys.exit()
Example #5
0
def _check_key(key):
    """Return *key* untouched when it is well-formed; otherwise warn and
    return a copy with every illegal character replaced by '?'."""
    if _KEY.match(key) is None:
        msg = 'The "%s" key contains illegal characters' % key
        logger.warning(msg)
        # replacing illegal characters with a '?'
        return _KEY.sub('?', key)
    return key
def _check_key(key):
    """Validate a CEF extension key, masking illegal characters with '?'."""
    if _KEY.match(key) is not None:
        # already well-formed: nothing to do
        return key
    logger.warning('The "%s" key contains illegal characters' % key)
    # replacing illegal characters with a '?'
    return _KEY.sub('?', key)
Example #7
0
    def test_graberrors(self):
        """capture_logs must grab only records at or above its level."""
        # simplest case: the services logger at error level
        with capture_logs() as captured:
            logger.error("Yeah")
        self.assertEqual(captured.read(), "Yeah\n")

        # services logger again, filtering at warning level:
        # the debug record must be dropped
        with capture_logs(level=logging.WARNING) as captured:
            logger.debug("Yeah")
            logger.warning("Yeah2")
        self.assertEqual(captured.read(), "Yeah2\n")

        # the root logger, also filtered at warning level
        root_logger = logging.getLogger()
        with capture_logs(logger="root", level=logging.WARNING) as captured:
            root_logger.debug("Yeah")
            root_logger.warning("Yeah2")
        self.assertEqual(captured.read(), "Yeah2\n")
Example #8
0
def _format_msg(fields, kw, maxlen=_MAXLEN, as_unicode=False):
    """Render the CEF message, optionally forcing every piece to unicode,
    then append the custom extensions (smallest first) up to *maxlen*."""
    if as_unicode:
        # coerce each field value to unicode before formatting
        for field_key, field_value in fields.items():
            fields[field_key] = _force_unicode(field_value)

    msg = _CEF_FORMAT % fields

    # custom extensions, ordered by rendered size then key size
    extensions = sorted(
        (_len(value), len(key), key, value)
        for key, value in kw.items()
        if key not in _EXTENSIONS
    )

    # NOTE: length bookkeeping is taken before the unicode coercion,
    # exactly as in the original ordering.
    msg_len = len(msg)

    if as_unicode:
        msg = _force_unicode(msg)

    for value_len, key_len, key, value in extensions:
        added_len = value_len + key_len + 2
        value = _convert_ext(value)
        key = _check_key(key)

        if maxlen and msg_len + added_len > maxlen:
            # msg is too big.
            warn = 'CEF Message too big. %s %s' % (msg, str(kw.items()))
            logger.warning(warn)
            break

        fragment = ' %s=%s' % (key, value)
        if as_unicode:
            fragment = _force_unicode(fragment)

        msg += fragment
        msg_len += added_len

    return msg
def log_cef(name,
            severity,
            environ,
            config,
            username='******',
            signature=None,
            **kw):
    """Creates a CEF record, and emit it in syslog or another file.

    Args:
        - name: name to log
        - severity: integer from 0 to 10
        - environ: the WSGI environ object
        - config: configuration dict
        - signature: CEF signature code - defaults to name value
        - username: user name - defaults to 'none'
        - extra keywords: extra keys used in the CEF extension
    """
    # XXX might want to remove the request dependency here
    # so this module is standalone
    from services.util import filter_params
    # Escape the pipe-delimited prefix values before they are formatted.
    name = _convert_prefix(name)
    if signature is None:
        signature = name
    else:
        signature = _convert_prefix(signature)
    severity = _convert_prefix(severity)
    # Keep only the 'cef'-namespaced entries of the configuration.
    config = filter_params('cef', config)

    source = get_source_ip(environ)

    # Base CEF prefix/extension fields; kw entries may override them below.
    fields = {
        'severity': severity,
        'source': source,
        'method': _convert_ext(environ['REQUEST_METHOD']),
        'url': _convert_ext(environ['PATH_INFO']),
        'dest': _convert_ext(environ.get('HTTP_HOST', u'none')),
        'user_agent': _convert_ext(environ.get('HTTP_USER_AGENT', u'none')),
        'signature': signature,
        'name': name,
        'version': config['version'],
        'vendor': config['vendor'],
        'device_version': config['device_version'],
        'product': config['product'],
        'host': _HOST,
        'suser': username,
        'date': strftime("%b %d %H:%M:%S")
    }

    # make sure we don't have a | anymore in regular fields
    # (iterate over a snapshot since kw is mutated in the loop)
    for key, value in list(kw.items()):
        new_key = _check_key(key)
        if new_key == key:
            continue
        kw[new_key] = value
        del kw[key]

    # overriding with provided datas
    fields.update(kw)

    # resulting message
    msg = _CEF_FORMAT % fields

    # adding custom extensions
    # sorting by size (value length, then key length)
    extensions = [(len(str(value)), len(key), key, value)
                  for key, value in kw.items() if key not in _EXTENSIONS]
    extensions.sort()

    msg_len = len(msg)

    for value_len, key_len, key, value in extensions:
        # +2 accounts for the leading space and the '=' separator.
        added_len = value_len + key_len + 2
        value = _convert_ext(value)
        key = _check_key(key)

        if msg_len + added_len > _MAXLEN:
            # msg is too big: warn and drop the remaining extensions.
            warn = 'CEF Message too big. %s %s' % (msg, str(kw.items()))
            logger.warning(warn)
            break

        msg += ' %s=%s' % (key, value)
        msg_len += added_len

    # Emit either through syslog or by appending to the configured file,
    # serialized by a lock to keep concurrent writers from interleaving.
    if config['file'] == 'syslog':
        if not SYSLOG:
            raise ValueError('syslog not supported on this platform')
        _syslog(msg, config)
    else:
        with _log_lock:
            with open(config['file'], 'a') as f:
                f.write('%s\n' % msg)
Example #10
0
def _workflow(decoded, url):
    """
    Run the whole data scraping, processing, and analysis in new threads.

    Each thread, beginning with this one, makes its calls in a try-except
    block. Why? Because the parent thread that launched this thread dies
    immediately, so when an exception is raised in the child thread there
    is no one to receive it. The client relies on the job status: if the
    thread died silently, the status would stop updating and the client
    would stall forever behind a progress animation. Any failure must
    therefore update the status right away.

    Operations down the line (scraping and others) launch their own child
    threads; those operations also need to update the status before
    exiting.
    """
    logger.info("Running a new thread for scraping and data processing")
    source = decoded[0]
    sku = decoded[1]
    # NOTE(review): the *url* parameter is shadowed here; callers appear
    # to pass everything through *decoded* -- confirm the param is needed.
    url = decoded[2]
    _set_status(__in_queue__, sku)
    try:
        # Moved inside the try block so a failure while fetching the
        # cached details also flips the status to error, per the
        # docstring's contract.
        parsed = _db_product_details(sku)
        # Has the detail page been parsed?
        if not parsed:
            logger.info(
                "Detail page not available for {}. Proceeding to download...".
                format(sku))
            parsed = _get_product_details(source, url, sku)
            if not parsed:
                logger.error("Error while parsing product detail page for " +
                             sku)
                logger.error("Aborting process")
                _set_status(__error__, sku)
                return
        else:
            logger.info(
                "Detail page for {} already parsed. Skipping download...".
                format(sku))

        _set_status("Gathering data", sku)
        prod_name = parsed.get("product_name")
        review_count = parsed.get("review_count")
        page_count = parsed.get("page_count")

        # Do we have enough data to train on?
        if review_count <= config.get("misc").get("min_review_count"):
            logger.warning("Not enough reviews for " + sku)
            logger.error("Aborting process")
            _set_status("Not Enough Data", sku)
            return
        # If it's not in the queue, add it
        if not _is_in_queue(sku):
            logger.info(sku +
                        " not in queue. Checking if it's been scraped before")
            if not _reviews_scraped(sku):
                logger.info(sku +
                            " has not been scraped. Adding to the queue...")
                sc_helper.add_to_queue(source, sku, page_count)
        # If it's in the queue, scrape it
        if _is_in_queue(sku):
            logger.info(sku + " is in the queue. Launching the scraper")
            sc_helper.scrape(sku, prod_name, source)

        # If it hasn't been trained, train it
        if not _is_trained(sku):
            _nlp_reset(sku)
            logger.info("Starting NLP preprocessing")
            _set_status("Analyzing language", sku)
            preprocess.NLPreprocessor(sku).tokenize()
            logger.info("Finished NLP preprocessing")
            # Typo fix in the log message ("trianing" -> "training").
            logger.info("Starting model training")
            _set_status("Building knowledge base", sku)
            # Trained for its side effects; the returned model object was
            # previously bound to an unused local.
            training.Document2Vector(sku).train()
            logger.info("Finished model training")
            _update_details_db(sku)
            _set_status("Ready", sku)
    except Exception as e:
        logger.exception(e)
        _set_status(__error__, sku)
Example #11
0
def log_cef(name, severity, environ, config, username='******',
            signature=None, **kw):
    """Creates a CEF record, and emit it in syslog or another file.

    Args:
        - name: name to log
        - severity: integer from 0 to 10
        - environ: the WSGI environ object
        - config: configuration dict
        - signature: CEF signature code - defaults to name value
        - username: user name - defaults to 'none'
        - extra keywords: extra keys used in the CEF extension
    """
    # XXX might want to remove the request dependency here
    # so this module is standalone
    from services.util import filter_params
    # Escape the pipe-delimited prefix values before formatting.
    name = _convert_prefix(name)
    if signature is None:
        signature = name
    else:
        signature = _convert_prefix(signature)
    severity = _convert_prefix(severity)
    # Keep only the 'cef'-namespaced entries of the configuration.
    config = filter_params('cef', config)

    source = get_source_ip(environ)

    # Base CEF prefix/extension fields; kw entries may override them below.
    fields = {'severity': severity,
              'source': source,
              'method': _convert_ext(environ['REQUEST_METHOD']),
              'url': _convert_ext(environ['PATH_INFO']),
              'dest': _convert_ext(environ.get('HTTP_HOST', u'none')),
              'user_agent': _convert_ext(environ.get('HTTP_USER_AGENT',
                                                     u'none')),
              'signature': signature,
              'name': name,
              'version': config['version'],
              'vendor': config['vendor'],
              'device_version': config['device_version'],
              'product': config['product'],
              'host': _HOST,
              'suser': username,
              'date': strftime("%b %d %H:%M:%S")}

    # make sure we don't have a | anymore in regular fields
    # (iterate over a snapshot since kw is mutated in the loop)
    for key, value in list(kw.items()):
        new_key = _check_key(key)
        if new_key == key:
            continue
        kw[new_key] = value
        del kw[key]

    # overriding with provided datas
    fields.update(kw)

    # resulting message
    msg = _CEF_FORMAT % fields

    # adding custom extensions
    # sorting by size (value length, then key length)
    extensions = [(len(str(value)), len(key), key, value)
                    for key, value in kw.items()
                  if key not in _EXTENSIONS]
    extensions.sort()

    msg_len = len(msg)

    for value_len, key_len, key, value in extensions:
        # +2 accounts for the leading space and the '=' separator.
        added_len = value_len + key_len + 2
        value = _convert_ext(value)
        key = _check_key(key)

        if msg_len + added_len > _MAXLEN:
            # msg is too big: warn and drop the remaining extensions.
            warn = 'CEF Message too big. %s %s' % (msg, str(kw.items()))
            logger.warning(warn)
            break

        msg += ' %s=%s' % (key, value)
        msg_len += added_len

    # Emit either through syslog or by appending to the configured file,
    # serialized by a lock so concurrent writers do not interleave.
    if config['file'] == 'syslog':
        if not SYSLOG:
            raise ValueError('syslog not supported on this platform')
        _syslog(msg, config)
    else:
        with _log_lock:
            with open(config['file'], 'a') as f:
                f.write('%s\n' % msg)