def session_aggr(sf, cols, key="session_code"):
    mean_operations = {("%s_mean" % col): agg.MEAN(col) for col in cols}
    std_operations = {("%s_std" % col): agg.STD(col) for col in cols}
    min_operations = {("%s_min" % col): agg.MIN(col) for col in cols}
    max_operations = {("%s_max" % col): agg.MAX(col) for col in cols}
    all_operations = {}
    all_operations.update(mean_operations)
    all_operations.update(std_operations)
    all_operations.update(min_operations)
    all_operations.update(max_operations)
    return sf.groupby(key_column_names=[key], operations=all_operations)
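# Usage sketch (hypothetical data; assumes `import turicreate as tc` and
# `import turicreate.aggregate as agg`, matching the aliases used in these snippets):
#
#     session_data = tc.SFrame({"session_code": [1, 1, 2],
#                               "tempo": [0.3, 0.5, 0.9]})
#     session_stats = session_aggr(session_data, ["tempo"])
#     # -> one row per session_code with tempo_mean, tempo_std, tempo_min, tempo_max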
    def get_venue_authors_timeseries(self):
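        # Build one row per author with the author's first ("mindate") and last
        # ("maxdate") publication years, convert them to datetimes, and return a
        # TimeSeries indexed by mindate (assumes `from datetime import datetime`
        # and that `tc` exposes a TimeSeries type, as in GraphLab Create).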

        p = self._all_papers_sf["Paper ID", "Paper publish year"]
        a = self.authors_affilations_sframe["Paper ID", "Author ID"]
        sf = p.join(a, on="Paper ID")["Author ID", "Paper publish year"]
        sf = sf.groupby(
            "Author ID", {
                "mindate": agg.MIN("Paper publish year"),
                "maxdate": agg.MAX("Paper publish year")
            })
        sf.rename({"Author ID": "v_id"})
        sf["mindate"] = sf["mindate"].apply(
            lambda y: datetime(year=y, month=1, day=1))
        sf["maxdate"] = sf["maxdate"].apply(
            lambda y: datetime(year=y, month=1, day=1))

        if sf.num_rows() == 0:
            return None

        return tc.TimeSeries(sf, index="mindate")
Example #3
    def predict(self, dataset, output_type='class', output_frequency='per_row'):
        """
        Return predictions for ``dataset``, using the trained activity classifier.
        Predictions can be generated as class labels, or as a probability
        vector with probabilities for each class.

        The activity classifier generates a single prediction for each
        ``prediction_window`` rows in ``dataset``, per ``session_id``. Thus the
        number of predictions is smaller than the length of ``dataset``. By
        default each prediction is replicated by ``prediction_window`` to return
        a prediction for each row of ``dataset``. Use ``output_frequency`` to
        get the unreplicated predictions.

        Parameters
        ----------
        dataset : SFrame
            Dataset of new observations. Must include columns with the same
            names as the features used for model training, but does not require
            a target column. Additional columns are ignored.

        output_type : {'class', 'probability_vector'}, optional
            Form of each prediction, which is one of:

            - 'probability_vector': Prediction probability associated with each
              class as a vector. The probability of the first class (sorted
              alphanumerically by name of the class in the training set) is in
              position 0 of the vector, the second in position 1 and so on.
            - 'class': Class prediction. This returns the class with maximum
              probability.

        output_frequency : {'per_row', 'per_window'}, optional
            The frequency of the predictions, which is one of:

            - 'per_window': Return a single prediction for each
              ``prediction_window`` rows in ``dataset`` per ``session_id``.
            - 'per_row': Convenience option to make sure the number of
              predictions match the number of rows in the dataset. Each
              prediction from the model is repeated ``prediction_window``
              times during that window.

        Returns
        -------
        out : SArray | SFrame
            If ``output_frequency`` is 'per_row' return an SArray with predictions
            for each row in ``dataset``.
            If ``output_frequency`` is 'per_window' return an SFrame with
            predictions for ``prediction_window`` rows in ``dataset``.

        See Also
        ----------
        create, evaluate, classify

        Examples
        --------

        .. sourcecode:: python

            # One prediction per row
            >>> probability_predictions = model.predict(
            ...     data, output_type='probability_vector', output_frequency='per_row')[:4]
            >>> probability_predictions

            dtype: array
            Rows: 4
            [array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
             array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
             array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086]),
             array('d', [0.01857384294271469, 0.0348394550383091, 0.026018327102065086])]

            # One prediction per window
            >>> class_predictions = model.predict(
            ...     data, output_type='class', output_frequency='per_window')
            >>> class_predictions

            +---------------+------------+-----+
            | prediction_id | session_id |class|
            +---------------+------------+-----+
            |       0       |     3      |  5  |
            |       1       |     3      |  5  |
            |       2       |     3      |  5  |
            |       3       |     3      |  5  |
            |       4       |     3      |  5  |
            |       5       |     3      |  5  |
            |       6       |     3      |  5  |
            |       7       |     3      |  4  |
            |       8       |     3      |  4  |
            |       9       |     3      |  4  |
            |      ...      |    ...     | ... |
            +---------------+------------+-----+
        """
        _tkutl._raise_error_if_not_sframe(dataset, 'dataset')
        _tkutl._check_categorical_option_type(
            'output_frequency', output_frequency, ['per_window', 'per_row'])
        _tkutl._check_categorical_option_type(
            'output_type', output_type, ['probability_vector', 'class'])
        from ._sframe_sequence_iterator import SFrameSequenceIter as _SFrameSequenceIter
        from ._sframe_sequence_iterator import prep_data as _prep_data
        from ._sframe_sequence_iterator import _ceil_dev
        prediction_window = self.prediction_window
        chunked_dataset, _ = _prep_data(dataset, self.features, self.session_id, prediction_window,
                                        self._predictions_in_chunk, verbose=False)
        data_iter = _SFrameSequenceIter(chunked_dataset, len(self.features),
                                        prediction_window, self._predictions_in_chunk,
                                        self._recalibrated_batch_size, use_pad=True)

        chunked_data = data_iter.dataset
        preds = self._pred_model.predict(data_iter).asnumpy()
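        # preds holds one probability vector per prediction window within each
        # (possibly padded) chunk; the padding is stripped below.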

        if output_frequency == 'per_row':
            # Replicate each prediction times prediction_window
            preds = preds.repeat(prediction_window, axis=1)

            # Remove predictions for padded rows
            unpadded_len = chunked_data['chunk_len'].to_numpy()
            preds = [p[:unpadded_len[i]] for i, p in enumerate(preds)]

            # Reshape from (num_of_chunks, chunk_size, num_of_classes)
            # to (ceil(length / prediction_window), num_of_classes)
            # chunk_size is DIFFERENT between chunks - since padding was removed.
            out = _np.concatenate(preds)
            out = out.reshape((-1, len(self._target_id_map)))
            out = _SArray(out)

            if output_type == 'class':
                id_target_map = self._id_target_map
                out = out.apply(lambda c: id_target_map[_np.argmax(c)])

        elif output_frequency == 'per_window':
            # Calculate the number of expected predictions and
            # remove predictions for padded data
            unpadded_len = chunked_data['chunk_len'].apply(
                lambda l: _ceil_dev(l, prediction_window)).to_numpy()
            preds = [p[:unpadded_len[i]] for i, p in enumerate(preds)]

            out = _SFrame({
                self.session_id: chunked_data['session_id'],
                'preds': _SArray(preds, dtype=list)
            }).stack('preds', new_column_name='probability_vector')

            # Calculate the prediction index per session
            out = out.add_row_number(column_name='prediction_id')
            start_sess_idx = out.groupby(
                self.session_id, {'start_idx': _agg.MIN('prediction_id')})
            start_sess_idx = start_sess_idx.unstack(
                [self.session_id, 'start_idx'], new_column_name='idx')['idx'][0]

            if output_type == 'class':
                id_target_map = self._id_target_map
                out['probability_vector'] = out['probability_vector'].apply(
                    lambda c: id_target_map[_np.argmax(c)])
                out = out.rename({'probability_vector': 'class'})

        return out
Example #4
    'Paper ID', {'Ref Count': agg.COUNT()})  # There are 30058322 in the list
r_sf.save('/data/sframes/PapersRefCount.sframe')
r_sf = r_sf[r_sf['Ref Count'] >= 5]  # left with 22,083,058

p_sf = gl.load_sframe("./Papers.sframe/")  # 126,903,970 rows
p_sf = r_sf.join(p_sf)  # 22,082,741
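# join() defaults to an inner join on the columns shared by both frames (here
# presumably just 'Paper ID'), so only papers with at least 5 references are kept.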
p_sf.save('./PapersMin5Ref.sframe')

p_sf = gl.load_sframe('./PapersMin5Ref.sframe')
a_sf = gl.load_sframe('./PaperAuthorAffiliations.sframe/')  # 337000127
sf = p_sf[['Paper ID']].join(a_sf)  # 86,561,861 rows
sf = sf.join(p_sf, on="Paper ID")
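# Per-author summary: paper count, first/last publication year, mean reference
# count of the author's papers, and lists of paper/venue/affiliation IDs.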
author_sf = sf.groupby(  # assumed variable name for the result
    "Author ID", {
        'Papers Count': agg.COUNT_DISTINCT('Paper ID'),
        'start_year': agg.MIN('Paper publish year'),
        'last_year': agg.MAX('Paper publish year'),
        'mean_ref_count': agg.AVG('Ref Count'),
        'papers_list': agg.CONCAT('Paper ID'),
        'journals_list': agg.CONCAT('Journal ID mapped to venue name'),
        'conference_list': agg.CONCAT('Conference ID mapped to venue name'),
        'affilation_list': agg.CONCAT('Affiliation ID')
    })

sf = gl.SFrame()
r = re.compile(r"\d{4}")
for i in l:
    try:
        y = r.findall(i)[0]
        x = gl.SFrame.read_csv("%s/%s" % (p, i))
        x['Year'] = y
        sf = sf.append(x)  # (assumed) accumulate the yearly CSVs into one SFrame
    except Exception:
        # (assumed) skip file names without a four-digit year or unreadable files
        continue

def session_aggr(sf, cols, key="session_code"):
    mean_operations = {("%s_mean" % col): agg.MEAN(col) for col in cols}
    std_operations = {("%s_std" % col): agg.STD(col) for col in cols}
    min_operations = {("%s_min" % col): agg.MIN(col) for col in cols}
    max_operations = {("%s_max" % col): agg.MAX(col) for col in cols}
    all_operations = {}
    all_operations.update(mean_operations)
    all_operations.update(std_operations)
    all_operations.update(min_operations)
    all_operations.update(max_operations)
    return sf.groupby(key_column_names=[key], operations=all_operations)


session_stats = session_aggr(session_data, agg_cols_total)

print("## vi.) Session heterosity")

ops = {
    "num_uniq_tracks": agg.COUNT_DISTINCT("track_code"),
    "session_length": agg.MIN("session_length")
}

uniq_info = session_data.groupby("session_code", operations=ops)
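# Heterogeneity: share of distinct tracks among the plays in a session;
# repetition: number of plays that repeat an already-played track.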
uniq_info["track_heterogenity"] = uniq_info["num_uniq_tracks"] / uniq_info[
    "session_length"]
uniq_info["track_repetition"] = uniq_info["session_length"] - uniq_info[
    "num_uniq_tracks"]
uniq_info = uniq_info.remove_column("session_length")
session_stats = session_stats.join(uniq_info, on="session_code")
del uniq_info
session_stats.save("%s/sess_stats" % folder, format='binary')

print("## vii.) Separate first and second half of the playlist")

session_data["position_over_length"] = session_data[
        if col in m_col:
            cols_for_total_aggr.append(m_col)
            break

print("cols for total aggregations:", cols_for_total_aggr)

agg_total_operations = {}
for col in cols_for_total_aggr:
    parts = col.split("_")
    feat = "_".join(parts[:-1])
    if parts[-1] == "mean" and feat != "dist_from_sess":
        agg_total_operations[col] = agg.MEAN(feat)
    elif parts[-1] == "std":
        agg_total_operations[col] = agg.STD(feat)
    elif parts[-1] == "min":
        agg_total_operations[col] = agg.MIN(feat)
    elif parts[-1] == "max":
        agg_total_operations[col] = agg.MAX(feat)
    else:
        print(col)
        continue
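# For illustration (hypothetical column names): an entry such as "tempo_mean" is
# split into the feature "tempo" and the suffix "mean", so the loop above builds
# the equivalent of:
#
#     agg_total_operations = {
#         "tempo_mean": agg.MEAN("tempo"),
#         "tempo_std": agg.STD("tempo"),
#         "tempo_min": agg.MIN("tempo"),
#         "tempo_max": agg.MAX("tempo"),
#     }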

session_stats = session_data.groupby(key_column_names=["session_code"], operations=agg_total_operations)

print("## vi.) Session heterosity")

ops = {
    "num_uniq_tracks": agg.COUNT_DISTINCT("track_code"),
    "session_length": agg.MIN("session_length")
}
uniq_info = session_data.groupby("session_code", operations=ops)