Example #1
File: base.py  Project: princez1214/flask
    def _save_analysis(self, temp_results, n_processed_items):
        end_time = utc_now()
        processing_time = (end_time - self.start_time).total_seconds()
        self.analysis._results = json.dumps(temp_results)
        self.analysis.status_message = "Successfully finished the analysis of %s items in %s seconds." % (
            n_processed_items, processing_time)
        self.analysis.progress = PROGRESS_DONE
        self.analysis.save()
        manager.produce_state_update({'progress': 'done'})
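
The elapsed-time bookkeeping above is plain datetime arithmetic; a minimal standalone sketch, assuming utc_now() behaves like datetime.utcnow():

from datetime import datetime, timedelta

start_time = datetime(2016, 1, 1, 12, 0, 0)
end_time = start_time + timedelta(minutes=2, seconds=30)
print((end_time - start_time).total_seconds())  # 150.0
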
Example #2
def some_hanging_func(channel, user_id):
    for i in xrange(10):
        print 'some_func: doing work', i
        time.sleep(0.1)

    manager.produce_state_update({})

    print 'func is hanging now'
    time.sleep(100)
    return 1
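
To exercise a demo function like this outside the real worker, a stand-in for manager is enough. The stub below is purely hypothetical; only the produce_state_update(dict) call shape is taken from these examples:

class _FakeStateManager(object):
    """Hypothetical stand-in that just logs the state dicts it receives."""

    def produce_state_update(self, state):
        print('state update: %r' % (state,))


manager = _FakeStateManager()
manager.produce_state_update({'progress': 'done'})  # state update: {'progress': 'done'}
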
Example #3
def process_journeys_analysis(analysis_tpl):
    start_time = utc_now()
    journey_type_instance = JourneyType.objects.get(
        analysis_tpl.filters['journey_type'][0])  # we pass a single journey_type id as an array
    j_analysis = JourneysAnalysis(
        analysis_tpl, journey_type_instance)  # JourneysAnalysis handles data parsing, filtering etc.

    try:
        journey_filters = j_analysis.parsed_journeys_filters()
        # These are only used for matching to classes in case of conversion analysis, so pop them from the filters
        if 'funnel_id' in journey_filters:
            journey_filters.pop('funnel_id')
        if 'stage_id' in journey_filters:
            journey_filters.pop('stage_id')

        match = journey_filters
        initial_pipeline = [{"$match": match}]
        pipe = initial_pipeline.append
        pipe({'$sort': {CustomerJourney.F.first_event_date: 1}})

        if 'from_dt' in journey_filters:
            journey_filters.pop('from_dt')
            journey_filters.pop('to_dt')

        timeline_filter = deepcopy(journey_filters)
        timeline_filter.update({
            'level': analysis_tpl.get_timerange_level(),
            'computed_metric': analysis_tpl.analyzed_metric,
            'plot_type': 'timeline'
        })

        params = dict(filters=journey_filters,
                      initial_pipeline=initial_pipeline,
                      start_time=start_time,
                      timeline_filter=timeline_filter)
        analysis_process = AnalysisProcess(j_analysis, params)

        if analysis_tpl.analysis_type == analysis_tpl.CLASSIFICATION_TYPE:
            analysis_process.classification()
        elif analysis_tpl.analysis_type == analysis_tpl.REGRESSION_TYPE:
            analysis_process.regression()

    except AnalysisTerminationException, ex:
        LOGGER.error(ex)
        j_analysis.analysis.status_message = str(ex)
        j_analysis.analysis.progress = PROGRESS_ERROR
        j_analysis.analysis.save()
        manager.produce_state_update({'error': str(ex)})
        return
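
For reference, initial_pipeline here is just a list of MongoDB aggregation stages; with some assumed journey filters (illustrative values only, and a literal field name in place of CustomerJourney.F.first_event_date) it comes out as:

journey_filters = {'journey_type': 'jt-123', 'status': 'finished'}  # assumed example values

initial_pipeline = [{'$match': journey_filters}]
pipe = initial_pipeline.append
pipe({'$sort': {'first_event_date': 1}})

print(initial_pipeline)
# e.g. [{'$match': {'journey_type': 'jt-123', 'status': 'finished'}},
#       {'$sort': {'first_event_date': 1}}]
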
Example #4
def process_predictive_analysis(analysis_tpl):
    start_time = utc_now()
    p_analysis_handler = PredictorsAnalysis(analysis_tpl)

    try:
        filters = p_analysis_handler.parsed_predictor_filters()  # add from_dt, to_dt fields
        query = p_analysis_handler.predictors_view.prepare_query(filters)
        initial_pipeline = [{'$match': query}]

        if 'from_dt' in filters:
            filters.pop('from_dt')
            filters.pop('to_dt')

        timeline_filter = deepcopy(filters)
        timeline_filter.update({
            'level': analysis_tpl.get_timerange_level(),
            'plot_by': 'reward',
            'plot_type': 'time'
        })
        params = dict(filters=filters,
                      initial_pipeline=initial_pipeline,
                      start_time=start_time,
                      timeline_filter=timeline_filter)
        analysis_process = AnalysisProcess(p_analysis_handler, params)

        if analysis_tpl.analysis_type == analysis_tpl.REGRESSION_TYPE:
            analysis_process.regression()  # use_mongo=False
        elif analysis_tpl.analysis_type == analysis_tpl.CLASSIFICATION_TYPE:
            analysis_process.classification()

    except AnalysisTerminationException, ex:
        LOGGER.error(ex)
        p_analysis_handler.analysis.status_message = str(ex)
        p_analysis_handler.analysis.progress = PROGRESS_ERROR
        p_analysis_handler.analysis.save()
        manager.produce_state_update({'error': str(ex)})
        return
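
The deepcopy-then-update step is what keeps the base filters untouched while the timeline query gets its extra keys; a minimal sketch of the same pattern with assumed values:

from copy import deepcopy

filters = {'account_id': 42}  # assumed example filters
timeline_filter = deepcopy(filters)
timeline_filter.update({'level': 'day', 'plot_by': 'reward', 'plot_type': 'time'})

print(filters)  # {'account_id': 42} -- the original filters are unchanged
print(sorted(timeline_filter))  # ['account_id', 'level', 'plot_by', 'plot_type']
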
Example #5
def retrain_function(predictor, models):
    from copy import deepcopy

    for model in models:
        backup_model_data = deepcopy(model.data)
        predictor.select_model(model)
        predictor.features_space_size(model)
        try:
            manager.produce_state_update({
                'model': model.display_name,
                'progress': model.task_data.progress
            })
            predictor.train_models(model=model)
            manager.produce_state_update({
                'model': model.display_name,
                'progress': model.task_data.progress
            })

            predictor.save_model(model)
        except Exception, ex:
            # TODO: This needs some user notification somehow!
            LOGGER.exception(ex)
            model.data = backup_model_data
            model.save()
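
The loop above snapshots model.data before training and restores it when training raises. A self-contained sketch of that backup-and-rollback pattern (all names below are hypothetical):

from copy import deepcopy


class DummyModel(object):
    def __init__(self):
        self.data = {'weights': [0.1, 0.2]}

    def save(self):
        pass  # persistence stubbed out


def train(model):
    model.data['weights'] = None  # partial, broken update...
    raise RuntimeError('training failed')


model = DummyModel()
backup = deepcopy(model.data)
try:
    train(model)
except Exception:
    model.data = backup  # roll back to the pre-training state
    model.save()
print(model.data)  # {'weights': [0.1, 0.2]}
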
Example #6
File: base.py  Project: princez1214/flask
class AnalysisProcess(object):
    # Common class for the current PRR and JA analyses.
    # Each attribute handler class should have a 'train_class' field, which is a reference to its Collection.
    def __init__(self, special_attr_handler, params):
        self.analysis = special_attr_handler.analysis
        self.initial_pipeline = params['initial_pipeline']
        self.filters = params['filters']
        self.timeline_filter = params['timeline_filter']
        self.special_attr_handler = special_attr_handler
        self.start_time = params['start_time']

    def get_class_idx(self, item):
        stored_metric_values = self.analysis.metric_values  # Load so no extra validation / mongo calls done

        if self.analysis.analyzed_metric == "conversion":
            return self.special_attr_handler.get_conversion_class(item)

        if self.analysis.analyzed_metric == "stage-paths":
            return self.special_attr_handler.get_stage_path_class(item)

        if self.analysis.analyzed_metric == 'nps_categories':
            return self.special_attr_handler.get_nps_category(item)

        if self.analysis.analyzed_metric == 'paths-comparison':
            return self.special_attr_handler.get_path_class(item)

        if hasattr(item, 'reward'):
            metric_value = item.reward  # TODO: old field name, should be removed as well
        else:
            # Journey Analysis stuff, this needs to be refactored
            # metric_value = getattr(item, self.analysis.analyzed_metric)
            if self.analysis.analyzed_metric in item.journey_attributes:
                metric_value = item.journey_attributes[
                    self.analysis.analyzed_metric]
            else:
                metric_value = item.journey_attributes[
                    self.analysis.analyzed_metric.lower()]

        if metric_value is None:
            return -1
        if self.analysis.metric_type == self.analysis.BOOLEAN_METRIC or self.analysis.metric_type == self.analysis.LABEL_METRIC:
            if str(metric_value).lower() in stored_metric_values:
                return stored_metric_values.index(str(metric_value).lower())
            else:
                return self.analysis.IDX_UNKNOWN
        if self.analysis.metric_type == self.analysis.NUMERIC_METRIC:
            for idx, boundary in enumerate(stored_metric_values):
                if int(metric_value) <= int(boundary):
                    return idx
            return idx + 1
        return self.analysis.IDX_UNKNOWN

    def get_scatter_bar_plot(self, feature_dict):
        # If we have known Feature Value categories, then we can group_by them per each key
        scatter_results = dict(key='Bubble', values=[])
        bar_results = dict(key='Bar', values=[])
        for _feature_value_key, _feature_value_rewards in feature_dict.iteritems():
            reward_counts = sorted(list(_feature_value_rewards.iteritems()),
                                   key=lambda x: x[0])
            sum_counts = 0
            sum_values = 0
            for feat_val, feat_val_count in reward_counts:
                sum_counts += feat_val_count
                sum_values += feat_val_count * feat_val
            bar_results['values'].append({
                'label': str(_feature_value_key),
                'count': len(_feature_value_rewards),
                'avg_metric': float(sum_values) / sum_counts
            })
        return [scatter_results], [bar_results]
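
    # Assumed shape of feature_dict, inferred from the loops above:
    #     {feature_value: {metric_value: count, ...}, ...}
    # e.g. {'mobile': {0: 12, 1: 30}, 'web': {0: 5, 1: 7}}
    # avg_metric is then the count-weighted mean of the metric values for that feature value.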

    # def get_pie_plot(self, feature_dict):
    #     pie_plot_results = []
    #     for _feature_value_key, _feature_value_rewards in feature_dict.iteritems():
    #         _item = {'label': _feature_value_key,
    #                  'value': _feature_value_rewards}
    #         pie_plot_results.append(_item)
    #     return pie_plot_results

    def get_box_plot(self, feature_dict):
        """Computation of metric values for boxplot chart"""
        box_plot_results = []
        descriptive_statistics = []

        for _feature_value_key, _feature_value_rewards in feature_dict.iteritems():
            # if len(set(_feature_value_rewards)) == 1:
            # ignore feature value with 1 reward, because it's useless to find mean, Q1, Q3 etc with 1 value
            # continue
            reward_counts = sorted(list(_feature_value_rewards.iteritems()),
                                   key=lambda x: x[0])
            total_counts = sum([r[1] for r in reward_counts])
            current_count = 0
            found_25_quartile = False
            found_median = False
            found_75_quartile = False
            max_count = 0
            most_common = None
            running_sum = 0
            for feat_val, feat_val_count in reward_counts:
                current_count += feat_val_count
                if current_count > 0.25 * total_counts and not found_25_quartile:
                    found_25_quartile = True
                    q1 = feat_val
                if current_count > 0.5 * total_counts and not found_median:
                    found_median = True
                    q2 = feat_val
                if current_count > 0.75 * total_counts and not found_75_quartile:
                    found_75_quartile = True
                    q3 = feat_val
                if feat_val_count > max_count:
                    max_count = feat_val_count
                    most_common = feat_val
                running_sum += feat_val * feat_val_count

            iqr = q3 - q1
            _lowest = q1 - 1.5 * iqr
            _highest = q3 + 1.5 * iqr

            if self.analysis.metric_type == self.analysis.NUMERIC_METRIC:
                _lowest = max(_lowest, self.analysis.metric_values_range[0])
                _highest = min(_highest, self.analysis.metric_values_range[1])

            _descriptive_analysis = dict(Q1=q1,
                                         Q2=q2,
                                         Q3=q3,
                                         mean=float(running_sum) / total_counts,
                                         mode=most_common,
                                         whisker_low=_lowest,
                                         whisker_high=_highest)
            _descriptive_analysis.update(outliers=[
                x[0] for x in reward_counts if x[0] > _highest or x[0] < _lowest
            ])

            box_plot_results.append({
                'label': _feature_value_key,
                'values': _descriptive_analysis
            })
            descriptive_statistics.append(_descriptive_analysis)

        return sorted(box_plot_results), descriptive_statistics

    def _fetch_in_batches(self, pipeline, batch_size=BATCH_SIZE):
        agg_cursor = self.special_attr_handler.train_class.objects.coll.aggregate(
            pipeline, allowDiskUse=True, cursor={'batchSize': batch_size})

        counter = 0
        ids = []
        for entry in agg_cursor:
            ids.append(entry['_id'])
            counter += 1
            if counter % batch_size == 0:
                for item in self.special_attr_handler.train_class.objects(
                        id__in=ids):
                    yield item
                ids = []
        if ids:
            for item in self.special_attr_handler.train_class.objects(
                    id__in=ids):
                yield item

    def classification(self):
        n_processed_items = 0
        temp_results = dict()

        timeslot_counts = self.analysis.initialize_timeslot_counts()
        count = 0

        for item in self._fetch_in_batches(self.initial_pipeline,
                                           batch_size=BATCH_SIZE):
            if count % BATCH_SIZE == 0:
                try:
                    self.analysis.reload()
                except self.special_attr_handler.train_class.DoesNotExist:
                    LOGGER.warning(
                        "Analysis with id=%s was removed while running." %
                        self.analysis.id)

            count += 1
            if self.analysis.is_stopped():
                return

            try:
                class_idx = self.get_class_idx(item)
            except AnalysisTerminationException, ex:
                LOGGER.error(ex)
                self.analysis.status_message = str(ex)
                self.analysis.progress = PROGRESS_ERROR
                self.analysis.save()
                manager.produce_state_update({'error': str(ex)})
                return

            if class_idx == self.analysis.IDX_SKIP:
                continue

            timeslot_idx = self.analysis.get_timeslot_index(item)
            if timeslot_counts[class_idx][timeslot_idx] is not None:
                timeslot_counts[class_idx][timeslot_idx] += 1

            for feature in self.special_attr_handler.FEATURES:
                feature = self.special_attr_handler.train_class.translate_static_key_name(
                    feature)
                if feature not in temp_results:
                    temp_results[feature] = {
                        self.analysis.KEY_WEIGHT: 0,
                        self.analysis.KEY_VALUES: [],
                        self.analysis.KEY_CROSSTAB: {}
                    }
                attribute_handler = ClassificationAttrHandler(feature)
                feature_value = self.special_attr_handler.get_value(
                    item, feature)
                known_feature_values = temp_results[feature][
                    self.analysis.KEY_VALUES]
                # Brand new value never processed, add to list of all existing values for this feature
                attribute_handler.ensure_all_values(feature_value,
                                                    known_feature_values)

                # If we have a list field, process each individually, all of them might have made
                # this specific item instance fall into this class
                crosstab_results = temp_results[feature][
                    self.analysis.KEY_CROSSTAB]
                if isinstance(feature_value, list):
                    for one_feature in feature_value:
                        attribute_handler.increment_counts(
                            crosstab_results, one_feature, class_idx)
                else:
                    attribute_handler.increment_counts(crosstab_results,
                                                       feature_value,
                                                       class_idx)

            n_processed_items += 1
            self.analysis.progress = n_processed_items
            self.analysis.save()
            if n_processed_items % 100 == 0:
                manager.produce_state_update({'progress': n_processed_items})

        if n_processed_items == 0:
            self.analysis.status_message = "Could not find any results for specified filters. Canceled analysis."
            self.analysis.progress = PROGRESS_ERROR
            self.analysis.save()
            manager.produce_state_update(
                {'error': self.analysis.status_message})
            return

        manager.produce_state_update({'progress': n_processed_items})

        for one_feature_values in temp_results.values():
            sum_per_attribute = 0
            min_weight = 1. / self.analysis.get_num_classes()
            # Normalize all results
            weights = []
            for feature_class_counts in one_feature_values[
                    self.analysis.KEY_CROSSTAB].values():
                sum_per_value = 0
                max_per_value = 0

                for key, individual_count in feature_class_counts.iteritems():
                    normalized_value = float(individual_count) / n_processed_items
                    if normalized_value > max_per_value:
                        max_per_value = normalized_value
                    sum_per_value += normalized_value
                    feature_class_counts[key] = "%.3f" % (normalized_value *
                                                          100)

                if max_per_value:
                    weight = max_per_value / sum_per_value
                    weight = (weight - min_weight) / (1 - min_weight)
                    weights.append(weight)
                sum_per_attribute += sum_per_value
            if weights:
                one_feature_values['discriminative_weight'] = sum(
                    weights) / len(weights)
                LOGGER.info(
                    "Individual weights are %s and final weight is %s" %
                    (weights, sum(weights) / len(weights)))
            else:
                one_feature_values['discriminative_weight'] = 0

            # Append 0's as needed
            all_classes = range(self.analysis.get_num_classes()) + [-1]
            for feature_class_counts in one_feature_values[
                    self.analysis.KEY_CROSSTAB].values():
                for class_key in all_classes:
                    if class_key not in feature_class_counts:
                        feature_class_counts[class_key] = '0'

        ordered_timeslot_counts = []
        for key, value in timeslot_counts.iteritems():
            timerange_entry = dict(class_key=key, timerange=[])
            ordered_timeslot_counts.append(timerange_entry)
            for timeslot in sorted(timeslot_counts[key].keys()):
                timerange_entry['timerange'].append(
                    [timeslot, timeslot_counts[key][timeslot]])

        if 'timerange' in self.analysis.filters:
            self.analysis.filters.pop('timerange')

        self.analysis._timerange_results = json.dumps(ordered_timeslot_counts)
        self._save_analysis(temp_results, n_processed_items)
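
The quartiles in get_box_plot are read off a cumulative count over the sorted value -> count pairs rather than computed from an expanded list of rewards; a standalone sketch of that approach on assumed data:

reward_counts = sorted({1: 3, 2: 4, 3: 2, 10: 1}.items())  # metric value -> count, assumed data
total_counts = sum(count for _, count in reward_counts)

q1 = q2 = q3 = None
current_count = 0
for feat_val, feat_val_count in reward_counts:
    current_count += feat_val_count
    if q1 is None and current_count > 0.25 * total_counts:
        q1 = feat_val
    if q2 is None and current_count > 0.5 * total_counts:
        q2 = feat_val
    if q3 is None and current_count > 0.75 * total_counts:
        q3 = feat_val

print((q1, q2, q3))  # (1, 2, 3)
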
Example #7
File: base.py  Project: princez1214/flask
    def regression(self):
        from copy import deepcopy
        min_metric, max_metric = self.analysis.metric_values_range
        min_metric = int(min_metric)
        max_metric = int(max_metric)

        max_helper_values = 100  # Maximum number of distinct metric value buckets to keep counts of
        helper_metric_counts = dict()

        one_metric_helper = dict()
        if max_metric - min_metric + 1 < max_helper_values:
            metric_step = 1
            for metric_val in range(min_metric, max_metric + 1):
                one_metric_helper[metric_val] = 0
        else:
            metric_step = (max_metric - min_metric) / max_helper_values
            for idx in xrange(max_helper_values):
                one_metric_helper[int(idx * metric_step)] = 0

        def get_reward_bucket(reward):
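            # Snap a raw metric value to the lower bound of its bucket so that
            # at most ~max_helper_values distinct buckets accumulate counts.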
            n_steps = (reward - min_metric) / metric_step
            return min_metric + n_steps * metric_step

        batch_size = 10000
        have_more_data = True
        current_batch = 0
        _results = {}
        ranking_helper = dict()
        result_count = 0

        while have_more_data:
            pipeline = deepcopy(self.initial_pipeline)
            pipeline.append({'$skip': current_batch * batch_size})
            pipeline.append({'$limit': batch_size})
            pipeline = self.special_attr_handler.build_regression_pipe(pipeline)
            # if use_mongo:
            aggregation_results = self.special_attr_handler.train_class.objects.coll.aggregate(
                pipeline, allowDiskUse=True)['result']
            if not aggregation_results:
                have_more_data = False
                break
            else:
                aggregation_results = aggregation_results[0]
                current_batch += 1
                result_count += aggregation_results['count']

            for feat_key, feat_values in aggregation_results.iteritems():
                if feat_key not in helper_metric_counts:
                    helper_metric_counts[feat_key] = dict()
                feature_helper = helper_metric_counts[feat_key]
                if isinstance(feat_values, list) and feat_values:
                    for value in feat_values:
                        if not value or 'value' not in value:
                            continue
                        if type(value['value']) in (dict, list):
                            value['value'] = str(value['value'])
                        if value['value'] not in feature_helper:
                            # TODO: bucket value['value'] for features with huge cardinalities
                            feature_helper[value['value']] = deepcopy(one_metric_helper)
                        reward = get_reward_bucket(value['reward'])
                        if reward not in feature_helper[value['value']]:
                            feature_helper[value['value']][reward] = 1
                        else:
                            feature_helper[value['value']][reward] += 1

        try:
            timeline_results = self.special_attr_handler.get_timeline_results(
                self.initial_pipeline, self.timeline_filter)
        except AnalysisTerminationException, ex:
            LOGGER.error(ex)
            self.analysis.status_message = str(ex)
            self.analysis.progress = PROGRESS_ERROR
            self.analysis.save()
            manager.produce_state_update({'error': str(ex)})
            return
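
A quick standalone check of the bucketing above, with assumed bounds (// is used so the sketch divides the same way under Python 2 and 3):

min_metric, max_metric = 0, 1000  # assumed metric range
max_helper_values = 100
metric_step = (max_metric - min_metric) // max_helper_values  # 10


def get_reward_bucket(reward):
    n_steps = (reward - min_metric) // metric_step
    return min_metric + n_steps * metric_step


print([get_reward_bucket(r) for r in (7, 10, 999)])  # [0, 10, 990]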