Example #1
    def bulk_queue_push(self,
                        data,
                        case_id,
                        source=None,
                        machine=None,
                        data_type=None,
                        data_path=None,
                        chunk_size=500,
                        kjson=False):
        case_id = case_id.lower()
        bulk_queue = []

        for d in data:
            di = {"_index": case_id, "_source": {}, '_id': str(uuid.uuid4())}

            di['_source']['Data'] = d['Data'] if kjson else d
            source = d['data_source'] if kjson else source
            data_type = d['data_type'] if kjson else data_type
            data_path = d['data_path'] if kjson else data_path

            if source is not None:
                di['_source']['data_source'] = source
            if machine is not None:
                di['_source']['machine'] = machine
            if data_type is not None:
                di['_source']['data_type'] = data_type
            if data_path is not None:
                di['_source']['data_path'] = data_path

            bulk_queue.append(di)

        logger.logger(level=logger.DEBUG,
                      type="elasticsearch",
                      message="Index [" + case_id + "]: Pushing [" +
                      str(len(bulk_queue)) + "] records")

        push_es = self.bulk_to_elasticsearch(bulk_queue, case_id, chunk_size)
        if push_es[0]:
            logger.logger(level=logger.INFO,
                          type="elasticsearch",
                          message="Index [" + case_id + "]: Pushed [" +
                          str(len(bulk_queue) - len(push_es[2])) +
                          "] records successfully")
            return [
                True, "Pushed [" + str(len(bulk_queue)) + "] records",
                push_es[2], push_es[3]
            ]
        else:
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Index [" + case_id +
                          "]: Failed pusheing [" + str(len(bulk_queue)) +
                          "] records",
                          reason=push_es[1])
            return [
                False,
                'Failed to bulk data to Elasticsearch: ' + str(push_es[1]),
                bulk_queue, push_es[3]
            ]
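A minimal usage sketch for the method above (hypothetical names: db stands for an instance of the surrounding Elasticsearch wrapper class, and the case name and record contents are placeholders):

# Hypothetical usage of bulk_queue_push; "db", the case name and the record are placeholders.
records = [
    {
        "Data": {"event": "logon", "user": "admin"},
        "data_source": "winevt",
        "data_type": "security",
        "data_path": "/cases/host1/Security.evtx"
    }
]

# kjson=True makes the method read data_source/data_type/data_path from each record
status, message, failed, succeeded = db.bulk_queue_push(records,
                                                        "CASE-001",
                                                        machine="host1",
                                                        kjson=True)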
Example #2
from app.brain import twitter
from app.brain.classifier import Classifier
from app.brain.markov import Markov
from app.models import Muse, Tweet, Doc
from app.config import config

from tweepy.error import TweepError
from mongoengine.errors import NotUniqueError, OperationError
from pymongo.errors import DuplicateKeyError

import random

# Logging
from app.logger import logger
logger = logger(__name__)

# Load the classifier and markov.
# Loaded here so we can keep it in memory.
# accessible via app.brain.CLS or app.brain.MKV
CLS = Classifier()
MKV = Markov(ramble=config().ramble,
             ngram_size=config().ngram_size,
             spasm=config().spasm)


def ponder():
    """
    Fetch tweets from the Muses
    and memorize them;
    i.e. train the classifier or Markov model on them.
    """
Example #3
    def bulk_to_elasticsearch_fix_errors(self, indx, errors):
        logger.logger(level=logger.WARNING,
                      type="elasticsearch",
                      message="Index [" + indx + "]: Failed pushing [" +
                      str(len(errors)) +
                      "] records [BulkIndexError], retry to fix the issue")

        # check the returned error for each document and try to solve it
        fixed_data = []
        nonfixed_data = []
        limit_fields_increased = False
        for _id, doc in errors.iteritems():

            record_msg_info = "Indx[" + indx + "]"
            if 'machine' in doc['index']['data'].keys():
                record_msg_info += ", machine [" + doc['index']['data'][
                    'machine'] + "]"
            if 'data_type' in doc['index']['data'].keys():
                record_msg_info += ", data_type[" + doc['index']['data'][
                    'data_type'] + "]"
            if '_id' in doc['index'].keys():
                record_msg_info += ", rec_id[" + doc['index']['_id'] + "]"

            try:

                doc_reason = doc['index']['error']['reason']
                logger.logger(level=logger.WARNING,
                              type="elasticsearch",
                              message=record_msg_info + ": record failed",
                              reason=doc_reason)

                # === if the error is the total-fields limit, increase the limit by 1000 and try again
                if "Limit of total fields" in doc_reason and not limit_fields_increased:
                    new_limit = int(self.get_total_fields_limit(indx))
                    new_limit = new_limit + 1000
                    inc = self.es_db.indices.put_settings(
                        index=indx,
                        body='{"index.mapping.total_fields.limit": ' +
                        str(new_limit) + '}')

                    if inc["acknowledged"]:
                        logger.logger(
                            level=logger.INFO,
                            type="elasticsearch",
                            message=record_msg_info +
                            " : The total_fields.limit has been increased to "
                            + str(new_limit))
                        limit_fields_increased = True
                    else:
                        logger.logger(
                            level=logger.ERROR,
                            type="elasticsearch",
                            message=record_msg_info +
                            " : failed to increase total_fields.limit")

                # === if already fixed the limit of total fields issue, then add it to the list
                if "Limit of total fields" in doc_reason and limit_fields_increased:
                    fixed_data.append({
                        "_index": doc['index']['_index'],
                        "_type": doc['index']['_type'],
                        "_id": doc['index']['_id'],
                        "_source": doc['index']['data']
                    })
                    continue

                # if the error is that a text field exceeded the maximum term length (32766 bytes by default)
                match = re.match(
                    'Document contains at least one immense term in field="(.+)" \(whose UTF8 encoding is longer than the max length ([0-9]+)\), all of which were skipped.* original message: bytes can be at most ([0-9]+) in length; got ([0-9]+)',
                    doc_reason)
                if match is not None:
                    field = match.groups()[0]
                    current_max = int(match.groups()[1])
                    data_length = int(match.groups()[3])

                    logger.logger(level=logger.ERROR,
                                  type="elasticsearch",
                                  message=record_msg_info +
                                  " : field data more than the specified",
                                  reason="field " + field +
                                  ", defined max length [" + str(current_max) +
                                  "], field data [" + str(data_length) + "]")

                # ==== check if reason that an object received but the field data type is not correct
                match = re.match(
                    "object mapping for \[(.*)\] tried to parse field \[(.*)\] as (.*), but found a concrete value",
                    doc_reason)
                if match is not None:
                    match = match.groups()
                    failed_field = match[0]

                    # if datatype is object but found concrete value
                    if match[2] == 'object':
                        d = json_get_val_by_path(doc['index']['data'],
                                                 failed_field)

                        if d[0]:
                            # if type of field is object but found "None" as string
                            if d[1] == 'None':

                                if json_update_val_by_path(
                                        doc['index']['data'], failed_field,
                                        None)[0]:

                                    fixed_data.append({
                                        "_index":
                                        doc['index']['_index'],
                                        "_type":
                                        doc['index']['_type'],
                                        "_id":
                                        doc['index']['_id'],
                                        "_source":
                                        doc['index']['data']
                                    })
                                    continue

                            # if type of field is object but found string
                            if isinstance(d[1], str):
                                if json_update_val_by_path(
                                        doc['index']['data'], failed_field,
                                    {'value': d[1]})[0]:

                                    fixed_data.append({
                                        "_index":
                                        doc['index']['_index'],
                                        "_type":
                                        doc['index']['_type'],
                                        "_id":
                                        doc['index']['_id'],
                                        "_source":
                                        doc['index']['data']
                                    })
                                    continue

                # ==== failed to parse field as date
                match = re.match(
                    "failed to parse field \[(.*)\] of type \[(.*)\] in document with id .*",
                    doc_reason)
                if match is not None:
                    match = match.groups()
                    failed_field = match[0]
                    failed_field_type = match[1]

                    # if the field mapped as date
                    if failed_field_type == 'date':
                        if json_update_val_by_path(doc['index']['data'],
                                                   failed_field,
                                                   '1700-01-01T00:00:00')[0]:
                            fixed_data.append({
                                "_index": doc['index']['_index'],
                                "_type": doc['index']['_type'],
                                "_id": doc['index']['_id'],
                                "_source": doc['index']['data']
                            })
                            continue

                    # if the field mapped as text
                    if failed_field_type == 'text':
                        d = json_get_val_by_path(doc['index']['data'],
                                                 failed_field)
                        if d[0]:
                            d = d[1]
                            try:
                                if isinstance(d, list):
                                    # join the list items into a newline-separated string
                                    res_str = '\n'.join(str(x) for x in d)
                                    if json_update_val_by_path(
                                            doc['index']['data'], failed_field,
                                            res_str)[0]:
                                        fixed_data.append({
                                            "_index":
                                            doc['index']['_index'],
                                            "_type":
                                            doc['index']['_type'],
                                            "_id":
                                            doc['index']['_id'],
                                            "_source":
                                            doc['index']['data']
                                        })
                                        continue
                                elif isinstance(d, dict):
                                    res_str = "\n".join([
                                        str(k) + "=" + str(d[k])
                                        for k in d.keys()
                                    ])
                                    if json_update_val_by_path(
                                            doc['index']['data'], failed_field,
                                            res_str)[0]:
                                        fixed_data.append({
                                            "_index":
                                            doc['index']['_index'],
                                            "_type":
                                            doc['index']['_type'],
                                            "_id":
                                            doc['index']['_id'],
                                            "_source":
                                            doc['index']['data']
                                        })
                                        continue

                            except Exception as e:
                                pass

                logger.logger(level=logger.ERROR,
                              type="elasticsearch",
                              message=record_msg_info +
                              " : No fix found for failed record [" +
                              doc['index']['_id'] + "] data",
                              reason=doc['index']['data'])
                nonfixed_data.append({
                    "_index": doc['index']['_index'],
                    "_type": doc['index']['_type'],
                    "_id": doc['index']['_id'],
                    "_source": doc['index']['data']
                })
            except Exception as e:
                logger.logger(level=logger.ERROR,
                              type="elasticsearch",
                              message=record_msg_info +
                              " : unsuspected error in fixing record issue",
                              reason=str(e))
                nonfixed_data.append({
                    "_index": doc['index']['_index'],
                    "_type": doc['index']['_type'],
                    "_id": doc['index']['_id'],
                    "_source": doc['index']['data']
                })

        return fixed_data, nonfixed_data
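The fix logic above relies on two helpers, json_get_val_by_path and json_update_val_by_path, that are not shown in these examples. Below is a minimal sketch of the behavior the calls above assume (dotted-path access into the nested _source dict, returning a [success, value] pair); the project's actual implementation may differ:

# Sketch of the assumed helper behavior, not the project's actual implementation.
def json_get_val_by_path(data, path):
    cur = data
    for key in path.split('.'):
        if not isinstance(cur, dict) or key not in cur:
            return [False, None]
        cur = cur[key]
    return [True, cur]

def json_update_val_by_path(data, path, value):
    keys = path.split('.')
    cur = data
    for key in keys[:-1]:
        if not isinstance(cur, dict) or key not in cur:
            return [False, None]
        cur = cur[key]
    if not isinstance(cur, dict):
        return [False, None]
    cur[keys[-1]] = value
    return [True, data]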
Example #4
    def bulk_to_elasticsearch(self, bulk_queue, indx, chunk_size):

        try:
            errors = {}     # failed records (original data and error info), keyed by record ID
            failed = []     # IDs of the failed records
            successed = []  # IDs of the successfully pushed records

            logger.logger(level=logger.DEBUG,
                          type="elasticsearch",
                          message="Index [" + indx +
                          "]: bulk push to ES, default chunk[" +
                          str(chunk_size) + "]: ",
                          reason="number of records: " + str(len(bulk_queue)))
            # use helpers to push the data to elasticsearch
            for ok, item in helpers.parallel_bulk(self.es_db,
                                                  bulk_queue,
                                                  chunk_size=chunk_size,
                                                  raise_on_error=False,
                                                  raise_on_exception=False):
                if not ok:
                    errors[item['index']['_id']] = item
                    logger.logger(level=logger.WARNING,
                                  type="elasticsearch",
                                  message="Index [" + indx +
                                  "]: Failed pushing record: ",
                                  reason=str(item))
                    failed.append(item['index']['_id'])
                else:
                    successed.append(item['index']['_id'])

            if len(failed):
                logger.logger(level=logger.WARNING,
                              type="elasticsearch",
                              message="Index [" + indx +
                              "]: Failed pushing [" + str(len(failed)) +
                              "] records, try to fix the issue")
                # get origin data from ID
                for data in bulk_queue:
                    try:
                        errors[data['_id']]['index']['data'] = data['_source']
                        logger.logger(level=logger.DEBUG,
                                      type="elasticsearch",
                                      message="Index [" + indx +
                                      "]: get data for failed record [" +
                                      data['_id'] + "]",
                                      reason=str(errors[data['_id']]))
                    except KeyError:
                        # record not in the errors list, skip it
                        continue
                    logger.logger(level=logger.WARNING,
                                  type="elasticsearch",
                                  message="Index [" + indx +
                                  "]: Failed pushing record: ",
                                  reason=str(data['_id']))

                fixed_errors, nonfixed_errors = self.bulk_to_elasticsearch_fix_errors(
                    indx, errors)
                failed = nonfixed_errors
                if len(fixed_errors):
                    logger.logger(
                        level=logger.DEBUG,
                        type="elasticsearch",
                        message="Index [" + indx + "]: fixed issue of [" +
                        str(len(fixed_errors)) + "] records, retry to push it")
                    repush_failed_errors = self.bulk_to_elasticsearch(
                        fixed_errors, indx, chunk_size)
                    if repush_failed_errors[0]:
                        successed += repush_failed_errors[3]
                        failed += repush_failed_errors[2]

            return [
                True, "Pushed [" + str(len(successed)) + "] records to [" +
                indx + "] index", failed, successed
            ]

        # if connection timeout to elasticsearch occurred
        except elasticsearch.exceptions.ConnectionTimeout as e:
            logger.logger(level=logger.WARNING,
                          type="elasticsearch",
                          message="Index [" + indx +
                          "]: Failed to push the records, retry again",
                          reason="Connection to Elasticsearch timeout")
            return self.bulk_to_elasticsearch(bulk_queue, indx, chunk_size)

        except Exception as e:
            logger.logger(
                level=logger.ERROR,
                type="elasticsearch",
                message="Failed pushing the records, unexpected error",
                reason=str(e))

            return [
                False, "Failed pushing [" + str(len(bulk_queue)) +
                "] records to [" + indx + "] index", bulk_queue, []
            ]
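For reference, the error handling above keys into the per-item result yielded by elasticsearch.helpers.parallel_bulk. A small standalone sketch of that pattern (the index name, client address, and documents are placeholders):

# Standalone parallel_bulk sketch; "my-index" and the client address are placeholders.
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(["http://localhost:9200"])
actions = [{"_index": "my-index", "_id": str(i), "_source": {"value": i}}
           for i in range(1000)]

for ok, item in helpers.parallel_bulk(es, actions,
                                      chunk_size=500,
                                      raise_on_error=False,
                                      raise_on_exception=False):
    if not ok:
        # item is shaped like {"index": {"_index": ..., "_id": ..., "status": ..., "error": {...}}}
        print("failed record: " + str(item["index"]["_id"]))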
Example #5
    def query(self, indexname, body, count=3):
        count -= 1

        indexname = indexname.lower()
        body["track_total_hits"] = True
        logger.logger(level=logger.DEBUG,
                      type="elasticsearch",
                      message="Query to index [" + indexname + "]",
                      reason=json.dumps(body))
        filter_path = [
            'hits.hits._source.Data', 'hits.total.value',
            'aggregations.*.buckets'
        ]
        try:
            #search_res = self.es_db.search(index=indexname,body=body , filter_path=filter_path)
            search_res = self.es_db.search(index=indexname, body=body)
            return [True, search_res]
        except elasticsearch.RequestError as e:
            reason = e.info['error']['reason']
            logger.logger(level=logger.WARNING,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [RequestError]",
                          reason=reason)
            # if the problem in shards
            if reason == "all shards failed":
                for shard in e.info['error']['failed_shards']:
                    if 'caused_by' in shard['reason'].keys():
                        shard_reason = shard['reason']['caused_by']['reason']
                    else:
                        shard_reason = shard['reason']['reason']

                    # if the sort field is a text field (not sortable), retry using its ".keyword" sub-field
                    if shard_reason.startswith(
                            "Text fields are not optimised for operations that require per-document field data like aggregations and sorting, so these operations are disabled by default"
                    ):
                        if "sort" in body.keys():
                            field = body['sort'].keys()[0]
                            order = body['sort'][field]['order']
                            body['sort'] = {
                                field + ".keyword": {
                                    'order': order
                                }
                            }

                            logger.logger(
                                level=logger.INFO,
                                type="elasticsearch",
                                message="Query [" + indexname +
                                "], the sort is not a sortable field, try using sub-field .keyword"
                            )
                            return self.query(indexname, body, count)

                    # if the reason is the result has too many fields
                    match = re.match(
                        'field expansion (for \[.*\] )?matches too many fields, limit: ([0-9]+), got: ([0-9]+)',
                        shard_reason)
                    if match is not None:
                        # the query expanded to more fields than the current limit allows, so raise the limit
                        max_field_num = int(match.groups()[1]) + 100

                        inc = self.es_db.indices.put_settings(
                            index=indexname,
                            body='{ "index" : { "query": { "default_field" : '
                            + str(max_field_num) + '} } }')
                        if inc["acknowledged"]:
                            logger.logger(
                                level=logger.INFO,
                                type="elasticsearch",
                                message="Query [" + indexname +
                                "] max query fields number increased " +
                                str(max_field_num))
                            if count != 0:
                                return self.query(indexname, body, count)
                            else:
                                return [
                                    False,
                                    "exceeded the number of tries to fix the issue, field expansion matches too many fields"
                                ]
                        else:
                            logger.logger(
                                level=logger.ERROR,
                                type="elasticsearch",
                                message="Query [" + indexname +
                                "] Failed increasing the result window")
                            continue

                    # if the result window is too large, increase the window
                    match = re.match(
                        'Result window is too large, from \+ size must be less than or equal to: \[([0-9]+)\] but was \[([0-9]+)\].*',
                        shard_reason)
                    if match is not None:
                        max_result_window = int(match.groups()[1]) + 1000
                        inc = self.es_db.indices.put_settings(
                            index=indexname,
                            body='{ "index" : { "max_result_window" : ' +
                            str(max_result_window) + ' } }')
                        if inc["acknowledged"]:
                            logger.logger(
                                level=logger.INFO,
                                type="elasticsearch",
                                message="Query [" + indexname +
                                "] result window increased to " +
                                str(self.get_max_result_window(indexname)))
                            if count != 0:
                                return self.query(indexname, body, count)
                            else:
                                return [
                                    False,
                                    "exceeded the number of tries to fix the issue, Result window is too large"
                                ]
                        else:
                            logger.logger(
                                level=logger.ERROR,
                                type="elasticsearch",
                                message="Query [" + indexname +
                                "] Failed increasing the result window")
                            continue

                    else:
                        logger.logger(level=logger.ERROR,
                                      type="elasticsearch",
                                      message="Query [" + indexname +
                                      "] failed [RequestError]",
                                      reason=shard_reason)
            else:
                logger.logger(level=logger.ERROR,
                              type="elasticsearch",
                              message="Query [" + indexname +
                              "] failed [RequestError]",
                              reason=json.dumps(e.info))
            res = [False, reason]
        except elasticsearch.ConnectionError as e:
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [ConnectionError]",
                          reason=e.info)
            res = [False, 'Failed to connect to elasticsearch']
        except elasticsearch.TransportError as e:
            reason = str(e)
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [TransportError]",
                          reason=reason)
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [TransportError]",
                          reason=json.dumps(e.info))
            res = [False, reason]
        except elasticsearch.ElasticsearchException as e:
            reason = str(e)
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [ElasticsearchException]",
                          reason=reason)
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [ElasticsearchException]",
                          reason=json.dumps(e.info))
            res = [False, reason]
        except Exception as e:
            print str(e)
            res = [False, str(e)]
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [Exception]",
                          reason=str(e))

        return res
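A hedged usage sketch for query (db stands for an instance of this class; the case name and field names are placeholders). Sorting on a text-mapped field, as below, is exactly the case the ".keyword" retry above is meant to handle:

# Hypothetical query body; the index name and the "Data.timestamp" field are placeholders.
body = {
    "query": {"query_string": {"query": "user:admin"}},
    "sort": {"Data.timestamp": {"order": "asc"}},
    "from": 0,
    "size": 100
}

ok, result = db.query("CASE-001", body)
if ok:
    total_hits = result["hits"]["total"]["value"]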
Example #6
from flask import Blueprint, render_template, redirect, request, url_for, jsonify, flash
from flask.views import MethodView
from flask.ext.mongoengine.wtf import model_form
from app import app, brain, db
from app.models import Muse, Tweet, Config, Doc
from app.auth import requires_auth
from app.forms import TweetingForm

# Logging
from app.logger import logger
logger = logger(__name__)

# Landing page
@app.route('/')
@app.route('/index')
def index():
    return render_template('index.html', speech=brain.MKV.generate())

@app.route('/generate')
def generate():
    return render_template('generate.html', speech=brain.MKV.generate())

@app.route('/generate_', methods=['GET', 'POST'])
@requires_auth
def generate_():
    form = TweetingForm()
    if form.validate_on_submit():
        flash('Tweet twoot')
        brain.twitter.tweet(form.tweet.data)
        return redirect('/generate_')
    return render_template('generate_.html', form=form, speech=brain.MKV.generate())
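For context, the generate_ view only assumes the form exposes a tweet field and validate_on_submit(). A hypothetical TweetingForm along those lines (not the actual app.forms code, which is not shown here):

# Hypothetical sketch of the form the generate_ view expects; app.forms may define it differently.
from flask_wtf import FlaskForm
from wtforms import SubmitField, TextAreaField
from wtforms.validators import DataRequired, Length

class TweetingForm(FlaskForm):
    tweet = TextAreaField('Tweet', validators=[DataRequired(), Length(max=140)])
    submit = SubmitField('Tweet')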