Example No. 1
def calculate_merge_actions(source_entries, dest_entries, revisions):
    actions = iter_merge_actions_without_moves(
        source_entries, dest_entries, revisions)
    action_by_type = groupby(actions, MergeAction.get_type)
    touches, copies, moves, deletes, undeletes, updates, uphists, conflicts = \
             (action_by_type.get(type, []) for type in MergeActionType)
    moves = []
    unmoved_copies = []

    # If a copy also has a matching delete, mark it as a "move".
    deletes_by_hash = groupby(
        deletes, lambda delete: delete.older.hash if delete.older else None)
    for action in copies:
        deletes_of_hash = deletes_by_hash.get(action.newer.hash, [])
        if action.newer.hash and deletes_of_hash:
            # Pop so we only match a given delete once.  The delete
            # stays in the actions so that it still reaches the
            # history and merge data, but it is not put in the
            # revisions.
            delete = deletes_of_hash.pop()
            moves.append(action.alter(
                type = MergeActionType.move, details = delete.older))
        else:
            unmoved_copies.append(action)
    copies = unmoved_copies

    return (touches, copies, moves, deletes, undeletes,
            updates, uphists, conflicts)
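
The `groupby` used above is not `itertools.groupby`: its result is indexed with `.get(type, [])`, so it is assumed to return a dict mapping each key to the list of matching items. A minimal sketch of a helper with that assumed behavior (not the project's actual implementation):

from collections import defaultdict

def groupby(items, keyfunc):
    # Assumed contract: return {key: [items with that key]}, collecting
    # every match regardless of input order (unlike itertools.groupby,
    # which only groups consecutive runs of equal keys).
    groups = defaultdict(list)
    for item in items:
        groups[keyfunc(item)].append(item)
    return dict(groups)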
Example No. 2
def gen_week_skillsets(xml):
    # Returns the ISO calendar week number (1-53) of a score's DateTime
    def week_from_score(score) -> int:
        dt = parsedate(score.findtext("DateTime"))
        return dt.isocalendar()[1]

    chronological_scores = sorted(iter_scores(xml),
                                  key=lambda s: s.findtext("DateTime"))

    week_start_datetimes: List[datetime] = []
    diffsets: List[List[float]] = []

    for week, scores_in_week in util.groupby(chronological_scores,
                                             week_from_score):
        diffset = [0, 0, 0, 0, 0, 0, 0]
        for score in scores_in_week:
            skillset_ssrs = score.find("SkillsetSSRs")
            if skillset_ssrs is None: continue
            diffs = [float(diff.text) for diff in skillset_ssrs[1:]]
            main_diff = diffs.index(max(diffs))
            diffset[main_diff] += 1

        total = sum(diffset)
        if total == 0: continue
        diffset = [diff / total * 100 for diff in diffset]

        year = scores_in_week[0].findtext("DateTime")[:4]
        week_start_datetime = datetime.strptime(f"{year} {week} {0}",
                                                "%Y %W %w")

        diffsets.append(diffset)
        week_start_datetimes.append(week_start_datetime)

    return (week_start_datetimes, diffsets)
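
Here `util.groupby` is assumed to yield `(key, group)` pairs whose groups are lists: the code indexes `scores_in_week[0]` and iterates the group again, which the one-shot group iterators from `itertools.groupby` would not support. A sketch of a helper with that assumed behavior (the project's real implementation may differ):

def groupby(iterable, keyfunc):
    # Assumed contract: yield (key, list_of_items) pairs in the order
    # keys are first seen, so pre-sorted scores come out week by week.
    groups = {}
    for item in iterable:
        groups.setdefault(keyfunc(item), []).append(item)
    yield from groups.items()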
Example No. 3
    def aggregate_timedeltas(self, col_ids, aggr_func=None):
        """
        col_ids is the list of column indices to group by.  The aggregation
        function can be specified; it defaults to ASADataSet.sum_timedeltas
        and always acts over the time columns.  Please note that index
        numbers follow this order:
        id, resource_type, resource_id, operation, username, time_started, time_ended
        """
        if aggr_func is None: aggr_func = ASADataSet.sum_timedeltas

        def set_keys(*indices):
            """Returns a function that returns a tuple of key values"""
            def get_keys(seq, indices=indices):
                keys = []
                for i in indices:
                    keys.append(seq[i])
                return tuple(keys)
            return get_keys

        keyfunc = set_keys(*col_ids)
        aggregated = []
        for k,v in groupby(self.data, key=keyfunc):
            aggregated.append(tuple(list(k) + [aggr_func(v)]))
        return ASADataSet(
            ['resource_type', 'operation', 'username', 'duration'],
            aggregated)
Example No. 4
    def aggregate_timedeltas(self, col_ids, aggr_func=None):
        """
        col_ids is the list of column indices to group by.  The aggregation
        function can be specified; it defaults to ASADataSet.sum_timedeltas
        and always acts over the time columns.  Please note that index
        numbers follow this order:
        id, resource_type, resource_id, operation, username, time_started, time_ended
        """
        if aggr_func is None: aggr_func = ASADataSet.sum_timedeltas

        def set_keys(*indices):
            """Returns a function that returns a tuple of key values"""
            def get_keys(seq, indices=indices):
                keys = []
                for i in indices:
                    keys.append(seq[i])
                return tuple(keys)

            return get_keys

        keyfunc = set_keys(*col_ids)
        aggregated = []
        for k, v in groupby(self.data, key=keyfunc):
            aggregated.append(tuple(list(k) + [aggr_func(v)]))
        return ASADataSet(
            ['resource_type', 'operation', 'username', 'duration'],
            aggregated)
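
The `groupby(self.data, key=keyfunc)` call in these two examples matches the signature of `itertools.groupby`, which only groups consecutive rows with equal keys, so `self.data` is assumed to already be ordered by the selected columns. A small standalone illustration of that requirement (the rows and columns are hypothetical stand-ins, not ASADataSet's real layout):

from itertools import groupby

rows = [
    ("vm", "start", "alice", 5),
    ("vm", "start", "alice", 7),
    ("vm", "stop", "bob", 2),
]

# Group by the first three columns, then sum the fourth.  Equal keys
# that are not adjacent would otherwise land in separate groups, so
# sort by the same key function before grouping.
keyfunc = lambda row: row[:3]
for key, group in groupby(sorted(rows, key=keyfunc), key=keyfunc):
    print(key, sum(r[3] for r in group))
# ('vm', 'start', 'alice') 12
# ('vm', 'stop', 'bob') 2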
Example No. 5
def earlyorder(*goals):
    """ Reorder goals to avoid EarlyGoalErrors

    All goals are evaluated.  Those that raise EarlyGoalError are placed at
    the end, wrapped in a lallearly goal.

    See also:
        EarlyGoalError
    """
    groups = groupby(earlysafe, goals)
    good = groups.get(True, [])
    bad = groups.get(False, [])

    if not good:
        raise EarlyGoalError()
    elif not bad:
        return tuple(good)
    else:
        return tuple(good) + ((lallearly, ) + tuple(bad), )
Example No. 6
def earlyorder(*goals):
    """ Reorder goals to avoid EarlyGoalErrors

    All goals are evaluated.  Those that raise EarlyGoalError are placed at
    the end, wrapped in a lallearly goal.

    See also:
        EarlyGoalError
    """
    groups = groupby(earlysafe, goals)
    good = groups.get(True, [])
    bad  = groups.get(False, [])

    if not good:
        raise EarlyGoalError()
    elif not bad:
        return tuple(good)
    else:
        return tuple(good) + ((lallearly,) + tuple(bad),)
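
In these two examples the key function comes first and the result answers `.get(True, [])` / `.get(False, [])`, which matches the dict-returning `groupby(key, seq)` from `toolz` (or an equivalent project helper). Grouping by a boolean predicate buckets items under `True` and `False`; a minimal illustration of that pattern:

from toolz import groupby

# Group items by a boolean predicate, as earlyorder does with earlysafe.
groups = groupby(lambda n: n % 2 == 0, [1, 2, 3, 4, 5])
good = groups.get(True, [])   # [2, 4]
bad = groups.get(False, [])   # [1, 3, 5]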
Example No. 7
    def _transpose_activity_times(self, activity_times):
        def get_duration_for_user_and_activity(user, activity_type):
            for row in activity_times:
                if row[0] == user and row[1] == activity_type:
                    return row[2]
            return timedelta(0)

        blanked_and_ordered_activity_times = []
        import numpy as np
        for user in set(np.array(activity_times)[:, 0]):  # unique users
            for activity_type in Measurements.to_list():
                blanked_and_ordered_activity_times.append([
                    user, activity_type,
                    get_duration_for_user_and_activity(user, activity_type)
                ])

        transposed_activity_times = [
            [user] + np.array(list(row)).transpose().tolist()[2:][0]
            for user, row in groupby(blanked_and_ordered_activity_times,
                                     lambda x: x[0])
        ]
        return transposed_activity_times
Example No. 8
    def _transpose_activity_times(self, activity_times):
        def get_duration_for_user_and_activity(user, activity_type):
            for row in activity_times:
                if row[0] == user and row[1] == activity_type:
                    return row[2]
            return timedelta(0)

        blanked_and_ordered_activity_times = []
        import numpy as np
        for user in set(np.array(activity_times)[:, 0]):  # unique users
            for activity_type in Measurements.to_list():
                blanked_and_ordered_activity_times.append([
                    user, activity_type,
                    get_duration_for_user_and_activity(user, activity_type)
                ])

        transposed_activity_times = [
            [user] + np.array(list(row)).transpose().tolist()[2:][0]
            for user, row in groupby(blanked_and_ordered_activity_times,
                                     lambda x: x[0])
        ]
        return transposed_activity_times
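
The numpy round-trip in the final comprehension only extracts the duration column: each row has the shape [user, activity_type, duration], so transposing and taking `[2:][0]` yields the durations in `Measurements` order. A self-contained sketch of the same grouping and extraction in plain Python (the data is hypothetical; the real rows hold `timedelta` durations):

from itertools import groupby

blanked_and_ordered = [
    ["alice", "coding", 30], ["alice", "review", 5],
    ["bob", "coding", 12], ["bob", "review", 0],
]

# The rows are appended user by user, so itertools.groupby sees each
# user's rows as one consecutive run.
transposed = [
    [user] + [row[2] for row in rows]
    for user, rows in groupby(blanked_and_ordered, key=lambda r: r[0])
]
# [['alice', 30, 5], ['bob', 12, 0]]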
Example No. 9
def group_history_by_peerid(entries):
    return groupby(entries, operator.itemgetter(1), into=History)
Example No. 10
def group_history_by_gpath(entries):
    # TODO: Make faster (give entry a cached gpath)?
    return groupby(entries, HistoryEntry.get_gpath, into=History)
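
Both of these helpers pass an `into=` argument, so this `groupby` variant is assumed to wrap each group in the given container (here a `History` of entries). The exact contract is not visible from these call sites; one plausible reading, sketched under that assumption:

def groupby(items, keyfunc, into=list):
    # Assumed contract: group items by keyfunc and wrap each group in
    # the `into` container, returning {key: into(items)}.  The real
    # helper may differ, e.g. by wrapping the whole result instead.
    groups = {}
    for item in items:
        groups.setdefault(keyfunc(item), []).append(item)
    return {key: into(values) for key, values in groups.items()}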
Example No. 11
def format_top_features(args):
    # Get list of features, w other data
    tables = util.get_tables_to_join()
    all_results = []
    for tab in tables:
        print "processing table ", tab
        try:
            columns = [col for col, tp in util.get_table_schema_w_types(tab)]
            fname = "%s_w_nonnull_counts.csv" % tab
            for l in open(fname,'r').readlines():
                parts = l.strip().split(util.DELIM)
                date, pfcode, bootstrap_sample = parts[0], parts[1], parts[2]
                cors_and_nonnull_counts = parts[3:]
                cors = [str_to_corr(x) for x in cors_and_nonnull_counts[0::2]]
                nonnull_fail_counts = [
                    (int(x) if x.isdigit() else 0)
                    for x in cors_and_nonnull_counts[1::2]]
                new_results = [
                        {"date": date, "product": util.PRODUCT,
                        "metric": util.FEATURE_WEIGHT_METRIC,
                        "bootstrap_sample": bootstrap_sample,
                        "pfcode": pfcode,
                        "table": tab,
                        "feature": feature,
                        "weight": weight,
                        "n_nonnull_fails": n_nonnull_fails,
                        }
                        for feature, weight, n_nonnull_fails
                        in zip(columns, cors, nonnull_fail_counts)
                    ]
                new_valid_results = [
                    r for r in new_results
                    if r["weight"] != util.NULL_WEIGHT
                    and r["weight"] < util.MAX_WEIGHT
                    and r["n_nonnull_fails"] >= util.MIN_N_NONNULL_DATAPOINTS
                ]
                del new_results
                all_results.extend(new_valid_results)
        except Exception:
            print "failed"
            continue
    
    valid_results = all_results
    
    print "len valid", len(valid_results)
    if not util.INCLUDE_PFCODE_OTHER:
        valid_results = [r for r in valid_results if r["pfcode"] != util.OTHER_PFCODE]
    
    # Aggregate across each bootstrap sample to get aggregate weight
    by_pfcode_date_table_feature = util.groupby(
        valid_results, ["pfcode", "date", "table", "feature"])
    agg_results = {}
    for k, recs in by_pfcode_date_table_feature.items():
        agg_weight = aggregate_cors([r["weight"] for r in recs])
        first_rec = recs[0].copy()
        first_rec["weight"] = agg_weight
        agg_results[k] = first_rec
    
    # Group features by date/pfcode and take top 20
    by_pfcode_date = util.groupby(agg_results.values(), ["pfcode", "date"])
    output_results = []
    top_errors = set(util.get_top_errors(args))
    for key, lst in by_pfcode_date.items():
        pf, date = key
        if pf not in top_errors: continue
        lst.sort(key=lambda r: str_to_corr(r["weight"]), reverse=True)
        ln = min(len(lst), args.num_top_features)
        output_results.extend(lst[:ln])
    split_output_lines = [
        [str(record[c]) for c in util.TOP_FEATURES_SCHEMA]
        for record in output_results]
    output_lines = [util.DELIM.join(l) for l in split_output_lines]
    open(util.TOP_FEATURES_FNAME, "w").write("\n".join(output_lines))
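
In this example `util.groupby` is called with a list of field names rather than a key function, and the result is used as a dict mapping key tuples to lists of record dicts. A minimal sketch of a helper with that assumed behavior (the name `groupby_fields` is hypothetical; the real `util.groupby` may differ):

from operator import itemgetter

def groupby_fields(records, fields):
    # Assumed contract: map the tuple of the given field values to the
    # list of matching record dicts.  fields is assumed to hold two or
    # more names, so itemgetter returns a tuple key.
    getter = itemgetter(*fields)
    groups = {}
    for record in records:
        groups.setdefault(getter(record), []).append(record)
    return groups

# Hypothetical usage mirroring the calls above:
# by_pfcode_date = groupby_fields(valid_results, ["pfcode", "date"])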