Exemplo n.º 1
0
    def __init__(
        self,
        start,
        stop,
        hours=(20, 4),
        *,
        subscriber_identifier="msisdn",
        direction: Union[str, Direction] = Direction.BOTH,
        subscriber_subset=None,
        tables="all",
    ):
        """
        Record the query parameters and build the events union to read from.

        Parameters
        ----------
        start, stop
            Bounds of the period; both are passed through ``standardise_date``.
        hours : default (20, 4)
            Kept on the instance as ``hours``. It is not forwarded to the
            events union, which is always built with ``hours="all"``.
        subscriber_identifier : default 'msisdn'
            Name of the subscriber column; it is the first column fetched.
        direction : str or Direction, default Direction.BOTH
            Coerced to a ``Direction``; its ``required_columns`` are added to
            the fetched columns.
        subscriber_subset : default None
            Forwarded unchanged to ``EventsTablesUnion``.
        tables : default 'all'
            Forwarded to ``EventsTablesUnion``.
        """
        self.hours = hours
        self.tables = tables
        self.subscriber_identifier = subscriber_identifier
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.direction = Direction(direction)

        event_columns = [self.subscriber_identifier, "datetime"]
        event_columns.extend(self.direction.required_columns)

        # The union deliberately covers every hour of the day.
        union_options = dict(
            tables=self.tables,
            columns=event_columns,
            hours="all",
            subscriber_identifier=subscriber_identifier,
            subscriber_subset=subscriber_subset,
        )
        self.unioned_query = EventsTablesUnion(
            self.start, self.stop, **union_options
        )
        super().__init__()
    def __init__(
        self,
        start,
        stop,
        statistic="avg",
        *,
        hours="all",
        tables="all",
        direction: Union[str, Direction] = Direction.BOTH,
        subscriber_subset=None,
        exclude_self_calls=True,
    ):
        """
        Store the query parameters and build the two events unions and the
        distance matrix this query reads from.

        Parameters
        ----------
        start, stop
            Bounds of the period; both are passed through ``standardise_date``.
        statistic : str, default 'avg'
            Lower-cased and then checked against ``valid_stats``.
        hours : default 'all'
            Forwarded to both events unions.
        tables : default 'all'
            Forwarded to both events unions.
        direction : str or Direction, default Direction.BOTH
            Coerced to a ``Direction`` and stored; it is not forwarded to the
            unions built here.
        subscriber_subset : default None
            Forwarded to both events unions.
        exclude_self_calls : bool, default True
            Stored on the instance as ``exclude_self_calls``.

        Raises
        ------
        ValueError
            If the lower-cased ``statistic`` is not in ``valid_stats``.
        """
        self.tables = tables
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.hours = hours
        self.direction = Direction(direction)
        self.exclude_self_calls = exclude_self_calls

        self.statistic = statistic.lower()
        if self.statistic not in valid_stats:
            raise ValueError(
                "{} is not a valid statistic. Use one of {}".format(
                    self.statistic, valid_stats))

        column_list = [
            "msisdn", "msisdn_counterpart", "id", "location_id", "outgoing"
        ]

        # EventsTablesUnion will only subset on the subscriber identifier,
        # which means that we need to query for a unioned table twice: once
        # keyed on "msisdn" and once on "msisdn_counterpart". That has a
        # considerable negative impact on execution time.
        self.unioned_from_query = EventsTablesUnion(
            self.start,
            self.stop,
            columns=column_list,
            tables=self.tables,
            subscriber_identifier="msisdn",
            hours=hours,
            subscriber_subset=subscriber_subset,
        )

        self.unioned_to_query = EventsTablesUnion(
            self.start,
            self.stop,
            columns=column_list,
            tables=self.tables,
            subscriber_identifier="msisdn_counterpart",
            hours=hours,
            subscriber_subset=subscriber_subset,
        )

        self.distance_matrix = DistanceMatrix()

        super().__init__()
Exemplo n.º 3
0
    def __init__(
        self,
        start,
        stop,
        *,
        subscriber_identifier="msisdn",
        direction: Union[str, Direction] = Direction.OUT,
        statistic="sum",
        spatial_unit: Optional[AnySpatialUnit] = None,
        hours="all",
        subscriber_subset=None,
    ):
        """
        Store the query parameters and build the location-joined calls union.

        Parameters
        ----------
        start, stop
            Bounds of the period; both are passed through ``standardise_date``.
        subscriber_identifier : default 'msisdn'
            Name of the subscriber column; it is the first column fetched.
        direction : str or Direction, default Direction.OUT
            Coerced to a ``Direction``; its ``required_columns`` are added to
            the fetched columns.
        statistic : str, default 'sum'
            Lower-cased and then checked against ``valid_stats``.
        spatial_unit : default None
            When None, ``make_spatial_unit("admin", level=3)`` is used.
        hours : default 'all'
            Forwarded to ``EventsTablesUnion``.
        subscriber_subset : default None
            Forwarded to ``EventsTablesUnion``.

        Raises
        ------
        ValueError
            If the lower-cased ``statistic`` is not in ``valid_stats``.
        """
        self.subscriber_identifier = subscriber_identifier
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.direction = Direction(direction)
        self.spatial_unit = (
            make_spatial_unit("admin", level=3)
            if spatial_unit is None
            else spatial_unit
        )

        self.statistic = statistic.lower()
        if self.statistic not in valid_stats:
            raise ValueError(
                f"{self.statistic} is not a valid statistic. "
                f"Use one of {valid_stats}"
            )

        call_columns = [
            self.subscriber_identifier,
            "msisdn_counterpart",
            "duration",
            "location_id",
            "datetime",
        ]
        call_columns.extend(self.direction.required_columns)

        # Only the calls table is read here.
        calls_union = EventsTablesUnion(
            self.start,
            self.stop,
            tables="events.calls",
            columns=call_columns,
            hours=hours,
            subscriber_subset=subscriber_subset,
            subscriber_identifier=self.subscriber_identifier,
        )
        self.unioned_query = location_joined_query(
            calls_union, spatial_unit=self.spatial_unit, time_col="datetime"
        )
        super().__init__()
Exemplo n.º 4
0
    def __init__(
        self,
        start,
        stop,
        statistic="avg",
        *,
        spatial_unit: AnySpatialUnit = make_spatial_unit("cell"),
        hours="all",
        tables="all",
        subscriber_identifier="msisdn",
        direction: Union[str, Direction] = Direction.BOTH,
        subscriber_subset=None,
    ):
        """
        Store the query parameters and build the location-joined events union.

        Parameters
        ----------
        start, stop
            Bounds of the period; both are passed through ``standardise_date``.
        statistic : str, default 'avg'
            Lower-cased and then checked against ``valid_stats``, so the
            check is case-insensitive.
        spatial_unit : default ``make_spatial_unit("cell")``
            Spatial unit handed to ``location_joined_query``.
        hours : default 'all'
            Forwarded to ``EventsTablesUnion``.
        tables : default 'all'
            Forwarded to ``EventsTablesUnion``.
        subscriber_identifier : default 'msisdn'
            Name of the subscriber column; it is the first column fetched.
        direction : str or Direction, default Direction.BOTH
            Coerced to a ``Direction``; its ``required_columns`` are added to
            the fetched columns.
        subscriber_subset : default None
            Forwarded to ``EventsTablesUnion``.

        Raises
        ------
        ValueError
            If the lower-cased ``statistic`` is not in ``valid_stats``.
        """
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.spatial_unit = spatial_unit
        self.hours = hours
        self.tables = tables
        self.subscriber_identifier = subscriber_identifier
        self.direction = Direction(direction)
        # Normalise case before validating so that e.g. "AVG" is accepted.
        self.statistic = statistic.lower()

        if self.statistic not in valid_stats:
            raise ValueError(
                "{} is not a valid statistic. Use one of {}".format(
                    self.statistic, valid_stats))

        column_list = [
            self.subscriber_identifier,
            "location_id",
            "datetime",
            *self.direction.required_columns,
        ]

        self.unioned_query = location_joined_query(
            EventsTablesUnion(
                self.start,
                self.stop,
                tables=self.tables,
                columns=column_list,
                hours=hours,
                subscriber_identifier=subscriber_identifier,
                subscriber_subset=subscriber_subset,
            ),
            spatial_unit=self.spatial_unit,
            time_col="datetime",
        )

        super().__init__()
Exemplo n.º 5
0
    def __init__(
        self,
        start,
        stop,
        numerator,
        *,
        numerator_direction: Union[str, Direction] = Direction.BOTH,
        subscriber_identifier="msisdn",
        direction: Union[str, Direction] = Direction.BOTH,
        hours="all",
        subscriber_subset=None,
        tables="all",
    ):
        """
        Store the query parameters and build the numerator and denominator
        event counts.

        Parameters
        ----------
        start, stop
            Bounds of the period; stored as given and forwarded to both
            ``EventCount`` queries.
        numerator
            Table or list of tables for the numerator count. Anything that is
            not a list is wrapped in a one-element list.
        numerator_direction : str or Direction, default Direction.BOTH
            Coerced to a ``Direction``; used for the numerator count.
        subscriber_identifier : default 'msisdn'
            Forwarded to both counts.
        direction : str or Direction, default Direction.BOTH
            Coerced to a ``Direction``; used for the denominator count.
        hours : default 'all'
            Forwarded to both counts.
        subscriber_subset : default None
            Forwarded to both counts.
        tables : default 'all'
            Tables for the denominator count.
        """
        self.start = start
        self.stop = stop
        self.subscriber_identifier = subscriber_identifier
        self.hours = hours
        self.tables = tables
        self.direction = Direction(direction)
        self.numerator_direction = Direction(numerator_direction)

        if isinstance(numerator, list):
            self.numerator = numerator
        else:
            self.numerator = [numerator]

        # Everything except direction and tables is shared by both counts.
        shared_options = dict(
            subscriber_identifier=self.subscriber_identifier,
            hours=self.hours,
            subscriber_subset=subscriber_subset,
        )

        self.numerator_query = EventCount(
            self.start,
            self.stop,
            direction=self.numerator_direction,
            tables=self.numerator,
            **shared_options,
        )
        self.denominator_query = EventCount(
            self.start,
            self.stop,
            direction=self.direction,
            tables=self.tables,
            **shared_options,
        )

        super().__init__()
Exemplo n.º 6
0
    def __init__(
        self,
        start: str,
        stop: str,
        statistic: str = "avg",
        *,
        hours: Union[str, Tuple[int, int]] = "all",
        tables: Union[str, List[str]] = "all",
        subscriber_identifier: str = "msisdn",
        subscriber_subset: Optional[Query] = None,
        direction: Union[str, Direction] = Direction.OUT,
    ):
        """
        Store the query parameters and build the events union to read from.

        Parameters
        ----------
        start, stop : str
            Bounds of the period; both are passed through ``standardise_date``.
        statistic : str, default 'avg'
            Lower-cased and then checked against ``valid_stats``.
        hours : str or tuple of int, default 'all'
            Forwarded to ``EventsTablesUnion``.
        tables : str or list of str, default 'all'
            Forwarded to ``EventsTablesUnion``.
        subscriber_identifier : str, default 'msisdn'
            Name of the subscriber column; it is the first column fetched.
        subscriber_subset : Query, default None
            Forwarded to ``EventsTablesUnion``.
        direction : str or Direction, default Direction.OUT
            Coerced to a ``Direction``; its ``required_columns`` are added to
            the fetched columns.

        Raises
        ------
        ValueError
            If the lower-cased ``statistic`` is not in ``valid_stats``.
        """
        self.hours = hours
        self.tables = tables
        self.subscriber_identifier = subscriber_identifier
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.direction = Direction(direction)

        event_columns = [self.subscriber_identifier, "datetime"]
        event_columns.extend(self.direction.required_columns)

        self.statistic = statistic.lower()
        if self.statistic not in valid_stats:
            raise ValueError(
                f"{self.statistic} is not a valid statistic. "
                f"Use one of {valid_stats}"
            )

        union_options = dict(
            tables=self.tables,
            columns=event_columns,
            hours=self.hours,
            subscriber_identifier=self.subscriber_identifier,
            subscriber_subset=subscriber_subset,
        )
        self.unioned_query = EventsTablesUnion(
            self.start, self.stop, **union_options
        )
        super().__init__()
Exemplo n.º 7
0
    def __init__(
        self,
        start,
        stop,
        contact_reciprocal,
        *,
        direction: Union[str, Direction] = Direction.OUT,
        subscriber_identifier="msisdn",
        hours: Optional[Tuple[int, int]] = None,
        subscriber_subset=None,
        tables="all",
        exclude_self_calls=True,
    ):
        """
        Store the query parameters, build the events union and keep a
        reference to the supplied reciprocal-contact query.

        Parameters
        ----------
        start, stop
            Bounds of the period; stored as given and forwarded to
            ``EventsTablesUnion``.
        contact_reciprocal
            Stored on the instance as ``contact_reciprocal_query``.
        direction : str or Direction, default Direction.OUT
            Coerced to a ``Direction``; its ``required_columns`` are added to
            the fetched columns.
        subscriber_identifier : default 'msisdn'
            Name of the subscriber column; it is the first column fetched.
        hours : tuple of int, default None
            Forwarded to ``EventsTablesUnion``.
        subscriber_subset : default None
            Forwarded to ``EventsTablesUnion``.
        tables : default 'all'
            Forwarded to ``EventsTablesUnion``.
        exclude_self_calls : bool, default True
            Stored on the instance as ``exclude_self_calls``.
        """
        self.start, self.stop = start, stop
        self.subscriber_identifier = subscriber_identifier
        self.hours = hours
        self.tables = tables
        self.exclude_self_calls = exclude_self_calls
        self.contact_reciprocal_query = contact_reciprocal
        self.direction = Direction(direction)

        # "msisdn" is requested explicitly in addition to the identifier.
        event_columns = [
            self.subscriber_identifier,
            "msisdn",
            "msisdn_counterpart",
        ]
        event_columns.extend(self.direction.required_columns)

        union_options = dict(
            tables=self.tables,
            columns=event_columns,
            hours=hours,
            subscriber_identifier=subscriber_identifier,
            subscriber_subset=subscriber_subset,
        )
        self.unioned_query = EventsTablesUnion(
            self.start, self.stop, **union_options
        )

        super().__init__()
Exemplo n.º 8
0
    def __init__(
        self,
        start: str,
        stop: str,
        *,
        table: Union[None, List[str]] = None,
        spatial_unit: AnySpatialUnit = make_spatial_unit("cell"),
        interval: str = "hour",
        direction: Union[str, Direction] = Direction.BOTH,
        hours: Optional[Tuple[int, int]] = None,
        subscriber_subset=None,
        subscriber_identifier="msisdn",
    ):
        """
        Store the query parameters, derive the time-bucket columns for the
        requested interval and build the location-joined events union.

        Parameters
        ----------
        start, stop : str
            Bounds of the period; both are passed through ``standardise_date``.
        table : list of str or None, default None
            Forwarded to ``EventsTablesUnion`` as ``tables``.
        spatial_unit : default ``make_spatial_unit("cell")``
            Spatial unit handed to ``location_joined_query``.
        interval : str, default 'hour'
            Must be one of ``self.allowed_intervals``. A date column is always
            produced; 'hour' and 'min' add an hour column, and 'min' also adds
            a minute column.
        direction : str or Direction, default Direction.BOTH
            Coerced to a ``Direction``; its ``required_columns`` are added to
            the fetched columns.
        hours : tuple of int, default None
            Forwarded to ``EventsTablesUnion``.
        subscriber_subset : default None
            Forwarded to ``EventsTablesUnion``.
        subscriber_identifier : default 'msisdn'
            Name of the subscriber column; forwarded to ``EventsTablesUnion``.

        Raises
        ------
        ValueError
            If ``interval`` is not in ``self.allowed_intervals``.
        """
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.table = table
        self.spatial_unit = spatial_unit
        self.interval = interval
        self.direction = Direction(direction)

        if self.interval not in self.allowed_intervals:
            # The message used to start with a stray, unbalanced quote.
            raise ValueError("Interval must be one of: {} got: {}".format(
                self.allowed_intervals, self.interval))

        self.time_cols = ["(datetime::date)::text AS date"]
        if self.interval in ("hour", "min"):
            self.time_cols.append("extract(hour FROM datetime) AS hour")
        if self.interval == "min":
            self.time_cols.append("extract(minute FROM datetime) AS min")

        # The direction column is only fetched when a direction filter needs
        # it, both for efficiency and because other data types passed through
        # here might not carry that information.
        events_tables_union_cols = [
            "location_id",
            "datetime",
            subscriber_identifier,
            *self.direction.required_columns,
        ]

        self.unioned = location_joined_query(
            EventsTablesUnion(
                self.start,
                self.stop,
                tables=self.table,
                columns=events_tables_union_cols,
                hours=hours,
                subscriber_subset=subscriber_subset,
                subscriber_identifier=subscriber_identifier,
            ),
            spatial_unit=self.spatial_unit,
            time_col="datetime",
        )
        super().__init__()
Exemplo n.º 9
0
    def __init__(
        self,
        start,
        stop,
        proportion=0.8,
        *,
        direction: Union[str, Direction] = Direction.BOTH,
        tables="all",
        subscriber_identifier="msisdn",
        hours="all",
        exclude_self_calls=False,
        subscriber_subset=None,
    ):
        """
        Validate ``proportion``, store the query parameters and build the
        contact-balance and subscriber-degree sub-queries.

        Parameters
        ----------
        start, stop
            Bounds of the period; stored as given and forwarded to both
            sub-queries.
        proportion : default 0.8
            Must lie strictly between 0 and 1.
        direction : str or Direction, default Direction.BOTH
            Coerced to a ``Direction`` and forwarded to both sub-queries.
        tables : default 'all'
            Forwarded to both sub-queries.
        subscriber_identifier : default 'msisdn'
            Forwarded to both sub-queries.
        hours : default 'all'
            Forwarded to both sub-queries.
        exclude_self_calls : bool, default False
            Forwarded to both sub-queries.
        subscriber_subset : default None
            Forwarded to both sub-queries.

        Raises
        ------
        ValueError
            If ``proportion`` is not strictly between 0 and 1.
        """
        self.start, self.stop = start, stop
        self.hours = hours
        self.tables = tables
        self.subscriber_identifier = subscriber_identifier
        self.exclude_self_calls = exclude_self_calls
        self.direction = Direction(direction)

        if not (1 > proportion > 0):
            raise ValueError(f"{proportion} is not a valid proportion.")
        self.proportion = proportion

        # Both sub-queries are configured identically.
        shared_options = dict(
            hours=self.hours,
            tables=self.tables,
            subscriber_identifier=self.subscriber_identifier,
            direction=self.direction,
            exclude_self_calls=self.exclude_self_calls,
            subscriber_subset=subscriber_subset,
        )
        self.contact_balance = ContactBalance(
            self.start, self.stop, **shared_options
        )
        self.subscriber_degree = SubscriberDegree(
            self.start, self.stop, **shared_options
        )

        self._cols = ["subscriber", "pareto"]

        super().__init__()
Exemplo n.º 10
0
    def __init__(
        self,
        start,
        stop,
        *,
        hours: Optional[Tuple[int, int]] = None,
        tables="all",
        subscriber_identifier="msisdn",
        direction: Union[str, Direction] = Direction.BOTH,
        exclude_self_calls=True,
        subscriber_subset=None,
    ):
        """
        Store the query parameters and build the events union to read from.

        Parameters
        ----------
        start, stop
            Bounds of the period; both are passed through ``standardise_date``.
        hours : tuple of int, default None
            Forwarded to ``EventsTablesUnion``.
        tables : default 'all'
            Forwarded to ``EventsTablesUnion``.
        subscriber_identifier : default 'msisdn'
            Name of the subscriber column; it is the first column fetched.
        direction : str or Direction, default Direction.BOTH
            Coerced to a ``Direction``; its ``required_columns`` are added to
            the fetched columns.
        exclude_self_calls : bool, default True
            Stored on the instance as ``exclude_self_calls``.
        subscriber_subset : default None
            Forwarded to ``EventsTablesUnion``.
        """
        self.tables = tables
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.hours = hours
        self.direction = Direction(direction)
        self.subscriber_identifier = subscriber_identifier
        self.exclude_self_calls = exclude_self_calls

        column_list = [
            self.subscriber_identifier,
            "msisdn_counterpart",
            *self.direction.required_columns,
        ]

        self.unioned_query = EventsTablesUnion(
            self.start,
            self.stop,
            columns=column_list,
            tables=self.tables,
            subscriber_identifier=self.subscriber_identifier,
            hours=hours,
            subscriber_subset=subscriber_subset,
        )
        self._cols = [
            "subscriber", "msisdn_counterpart", "events", "proportion"
        ]
        super().__init__()
Exemplo n.º 11
0
    def __init__(
        self,
        start,
        stop,
        *,
        subscriber_identifier="msisdn",
        direction: Union[str, Direction] = Direction.OUT,
        statistic="sum",
        hours="all",
        subscriber_subset=None,
    ):
        """
        Store the query parameters and build the calls union to read from.

        Parameters
        ----------
        start, stop
            Bounds of the period; both are passed through ``standardise_date``.
        subscriber_identifier : default 'msisdn'
            Name of the subscriber column; it is the first column fetched.
        direction : str or Direction, default Direction.OUT
            Coerced to a ``Direction``; its ``required_columns`` are added to
            the fetched columns.
        statistic : str, default 'sum'
            Lower-cased and then checked against ``valid_stats``.
        hours : default 'all'
            Forwarded to ``EventsTablesUnion``.
        subscriber_subset : default None
            Forwarded to ``EventsTablesUnion``.

        Raises
        ------
        ValueError
            If the lower-cased ``statistic`` is not in ``valid_stats``.
        """
        self.subscriber_identifier = subscriber_identifier
        self.hours = hours
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.direction = Direction(direction)

        self.statistic = statistic.lower()
        if self.statistic not in valid_stats:
            raise ValueError(
                f"{self.statistic} is not a valid statistic. "
                f"Use one of {valid_stats}"
            )

        call_columns = [self.subscriber_identifier, "duration"]
        call_columns.extend(self.direction.required_columns)

        # Only the calls table is read here.
        union_options = dict(
            tables="events.calls",
            columns=call_columns,
            hours=hours,
            subscriber_subset=subscriber_subset,
            subscriber_identifier=subscriber_identifier,
        )
        self.unioned_query = EventsTablesUnion(
            self.start, self.stop, **union_options
        )
        super().__init__()
Exemplo n.º 12
0
    def __init__(
        self,
        start,
        stop,
        *,
        hours="all",
        tables="all",
        subscriber_identifier="msisdn",
        direction: Union[str, Direction] = Direction.BOTH,
        exclude_self_calls=True,
        subscriber_subset=None,
    ):
        """
        Store the query parameters and build the events union to read from.

        Parameters
        ----------
        start, stop
            Bounds of the period; stored as given and forwarded to
            ``EventsTablesUnion``.
        hours : default 'all'
            Forwarded to ``EventsTablesUnion``.
        tables : default 'all'
            Forwarded to ``EventsTablesUnion``.
        subscriber_identifier : default 'msisdn'
            Name of the subscriber column; it is the first column fetched.
        direction : str or Direction, default Direction.BOTH
            Coerced to a ``Direction``; its ``required_columns`` are added to
            the fetched columns.
        exclude_self_calls : bool, default True
            Stored on the instance as ``exclude_self_calls``.
        subscriber_subset : default None
            Forwarded to ``EventsTablesUnion``.
        """
        self.start, self.stop = start, stop
        self.hours = hours
        self.tables = tables
        self.subscriber_identifier = subscriber_identifier
        self.exclude_self_calls = exclude_self_calls
        self.direction = Direction(direction)

        event_columns = [self.subscriber_identifier, "msisdn_counterpart"]
        event_columns.extend(self.direction.required_columns)

        union_options = dict(
            hours=self.hours,
            tables=self.tables,
            columns=event_columns,
            subscriber_identifier=self.subscriber_identifier,
            subscriber_subset=subscriber_subset,
        )
        self.unioned_query = EventsTablesUnion(
            self.start, self.stop, **union_options
        )
        self._cols = ["subscriber", "degree"]
        super().__init__()
Exemplo n.º 13
0
class PerLocationSubscriberCallDurations(SubscriberFeature):
    """
    This class returns the total amount of time a subscriber spent calling
    within the period, optionally limited to only calls they made, or received,
    faceted by their location at the time.

    Parameters
    ----------
    start, stop : str
         iso-format start and stop datetimes
    hours : 2-tuple of floats, default 'all'
        Restrict the analysis to only a certain set
        of hours within each day.
    subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn'
        Either msisdn, or imei, the column that identifies the subscriber.
    subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None
        If provided, string or list of string which are msisdn or imeis to limit
        results to; or, a query or table which has a column with a name matching
        subscriber_identifier (typically, msisdn), to limit results to.
    direction : {'in', 'out', 'both'} or Direction, default Direction.OUT
        Whether to consider calls made, received, or both. Defaults to 'out'.
    spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default admin3
        Spatial unit to which subscriber locations will be mapped. See the
        docstring of make_spatial_unit for more information.
    statistic : {'count', 'sum', 'avg', 'max', 'min', 'median', 'mode', 'stddev', 'variance'}, default 'sum'
        Defaults to sum, aggregation statistic over the durations.


    Examples
    --------

    >>> s = PerLocationSubscriberCallDurations("2016-01-01", "2016-01-07", direction="in")
    >>> s.get_dataframe()

                subscriber            name          value
    0     038OVABN11Ak4W5P         Baglung          1979.0
    1     038OVABN11Ak4W5P           Banke          2204.0
    2     038OVABN11Ak4W5P           Dolpa          9169.0
    ...

    """

    def __init__(
        self,
        start,
        stop,
        *,
        subscriber_identifier="msisdn",
        direction: Union[str, Direction] = Direction.OUT,
        statistic="sum",
        spatial_unit: Optional[AnySpatialUnit] = None,
        hours="all",
        subscriber_subset=None,
    ):
        """
        Validate the statistic, store the parameters and build the
        location-joined calls union this query reads from.

        Raises
        ------
        ValueError
            If the lower-cased ``statistic`` is not in ``valid_stats``.
        """
        self.subscriber_identifier = subscriber_identifier
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.direction = Direction(direction)
        # Fall back to admin level 3 when no spatial unit is supplied.
        self.spatial_unit = (
            make_spatial_unit("admin", level=3)
            if spatial_unit is None
            else spatial_unit
        )

        self.statistic = statistic.lower()
        if self.statistic not in valid_stats:
            raise ValueError(
                f"{self.statistic} is not a valid statistic. "
                f"Use one of {valid_stats}"
            )

        call_columns = [
            self.subscriber_identifier,
            "msisdn_counterpart",
            "duration",
            "location_id",
            "datetime",
        ]
        call_columns.extend(self.direction.required_columns)

        # Only the calls table is read here.
        calls_union = EventsTablesUnion(
            self.start,
            self.stop,
            tables="events.calls",
            columns=call_columns,
            hours=hours,
            subscriber_subset=subscriber_subset,
            subscriber_identifier=self.subscriber_identifier,
        )
        self.unioned_query = location_joined_query(
            calls_union, spatial_unit=self.spatial_unit, time_col="datetime"
        )
        super().__init__()

    @property
    def column_names(self) -> List[str]:
        """
        Output column names: "subscriber", then the spatial unit's location
        id columns, then "value".
        """
        location_columns = self.spatial_unit.location_id_columns
        return ["subscriber"] + location_columns + ["value"]

    def _make_query(self):
        loc_cols = ", ".join(self.spatial_unit.location_id_columns)
        where_clause = make_where(self.direction.get_filter_clause())

        return f"""
Exemplo n.º 14
0
class PerLocationEventStats(SubscriberFeature):
    """
    This class returns the statistics of event count per location per
    subscriber within the period, optionally limited to only incoming or
    outgoing events. For instance, it calculates the average number of events
    per cell per subscriber.

    Parameters
    ----------
    start, stop : str
         iso-format start and stop datetimes
    statistic : {'count', 'sum', 'avg', 'max', 'min', 'median', 'mode', 'stddev', 'variance'}, default 'avg'
        Defaults to avg, aggregation statistic over the per-location event counts.
    hours : 2-tuple of floats, default 'all'
        Restrict the analysis to only a certain set
        of hours within each day.
    subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn'
        Either msisdn, or imei, the column that identifies the subscriber.
    subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None
        If provided, string or list of string which are msisdn or imeis to limit
        results to; or, a query or table which has a column with a name matching
        subscriber_identifier (typically, msisdn), to limit results to.
    direction : {'in', 'out', 'both'} or Direction, default Direction.BOTH
        Whether to consider calls made, received, or both. Defaults to 'both'.
    tables : str or list of strings, default 'all'
        Can be a string of a single table (with the schema)
        or a list of these. The keyword all is to select all
        subscriber tables
    spatial_unit : flowmachine.core.spatial_unit.*SpatialUnit, default cell
        Spatial unit to which subscriber locations will be mapped. See the
        docstring of make_spatial_unit for more information.

    Examples
    --------

    >>> s = PerLocationEventStats("2016-01-01", "2016-01-07")
    >>> s.get_dataframe()

          subscriber      value
    OemQ7q2DLZMWnwzB   1.388889
    By4j6PKdB4NGMpxr   1.421053
    L4V537alj321eWz6   1.130435
    4pQo67v0PWyLdYKO   1.400000
    8br1gO32xWXxjY0R   1.100000
                 ...        ...

    """
    def __init__(
        self,
        start,
        stop,
        statistic="avg",
        *,
        spatial_unit: AnySpatialUnit = make_spatial_unit("cell"),
        hours="all",
        tables="all",
        subscriber_identifier="msisdn",
        direction: Union[str, Direction] = Direction.BOTH,
        subscriber_subset=None,
    ):
        """
        Validate the statistic, store the parameters and build the
        location-joined events union this query reads from.

        The statistic is lower-cased before validation, so the check is
        case-insensitive.

        Raises
        ------
        ValueError
            If the lower-cased ``statistic`` is not in ``valid_stats``.
        """
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.spatial_unit = spatial_unit
        self.hours = hours
        self.tables = tables
        self.subscriber_identifier = subscriber_identifier
        self.direction = Direction(direction)
        # Normalise case before validating so that e.g. "AVG" is accepted.
        self.statistic = statistic.lower()

        if self.statistic not in valid_stats:
            raise ValueError(
                "{} is not a valid statistic. Use one of {}".format(
                    self.statistic, valid_stats))

        column_list = [
            self.subscriber_identifier,
            "location_id",
            "datetime",
            *self.direction.required_columns,
        ]

        self.unioned_query = location_joined_query(
            EventsTablesUnion(
                self.start,
                self.stop,
                tables=self.tables,
                columns=column_list,
                hours=hours,
                subscriber_identifier=subscriber_identifier,
                subscriber_subset=subscriber_subset,
            ),
            spatial_unit=self.spatial_unit,
            time_col="datetime",
        )

        super().__init__()

    @property
    def column_names(self):
        """Output column names: "subscriber" and "value"."""
        names = ["subscriber", "value"]
        return names

    def _make_query(self):
        loc_cols = ", ".join(self.spatial_unit.location_id_columns)

        where_clause = make_where(self.direction.get_filter_clause())

        return f"""
Exemplo n.º 15
0
class SubscriberCallDurations(SubscriberFeature):
    """
    This class returns the total amount of time a subscriber spent calling
    within the period, optionally limited to only calls they made, or received.

    Parameters
    ----------
    start, stop : str
         iso-format start and stop datetimes
    hours : 2-tuple of floats, default 'all'
        Restrict the analysis to only a certain set
        of hours within each day.
    subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn'
        Either msisdn, or imei, the column that identifies the subscriber.
    subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None
        If provided, string or list of string which are msisdn or imeis to limit
        results to; or, a query or table which has a column with a name matching
        subscriber_identifier (typically, msisdn), to limit results to.
    direction : {'in', 'out', 'both'} or Direction, default Direction.OUT
        Whether to consider calls made, received, or both. Defaults to 'out'.
    statistic :  {'count', 'sum', 'avg', 'max', 'min', 'median', 'mode', 'stddev', 'variance'}, default 'sum'
        Defaults to sum, aggregation statistic over the durations.


    Examples
    --------

    >>> s = SubscriberCallDurations("2016-01-01", "2016-01-07", direction="in")
    >>> s.get_dataframe()

                   msisdn           value
    0    jWlyLwbGdvKV35Mm          4038.0
    1    EreGoBpxJOBNl392         12210.0
    2    nvKNoAmxMvBW4kJr         10847.0
    3    VkzMxYjv7mYn53oK         48374.0
    4    BKMy1nYEZpnoEA7G          8697.0
    ...

    """

    def __init__(
        self,
        start,
        stop,
        *,
        subscriber_identifier="msisdn",
        direction: Union[str, Direction] = Direction.OUT,
        statistic="sum",
        hours="all",
        subscriber_subset=None,
    ):
        """
        Validate the statistic, store the parameters and build the calls
        union this query reads from.

        Raises
        ------
        ValueError
            If the lower-cased ``statistic`` is not in ``valid_stats``.
        """
        self.subscriber_identifier = subscriber_identifier
        self.hours = hours
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.direction = Direction(direction)

        self.statistic = statistic.lower()
        if self.statistic not in valid_stats:
            raise ValueError(
                f"{self.statistic} is not a valid statistic. "
                f"Use one of {valid_stats}"
            )

        call_columns = [self.subscriber_identifier, "duration"]
        call_columns.extend(self.direction.required_columns)

        # Only the calls table is read here.
        union_options = dict(
            tables="events.calls",
            columns=call_columns,
            hours=hours,
            subscriber_subset=subscriber_subset,
            subscriber_identifier=subscriber_identifier,
        )
        self.unioned_query = EventsTablesUnion(
            self.start, self.stop, **union_options
        )
        super().__init__()

    @property
    def column_names(self) -> List[str]:
        """Output column names: "subscriber" and "value"."""
        names = ["subscriber", "value"]
        return names

    def _make_query(self):
        where_clause = make_where(self.direction.get_filter_clause())

        return f"""
Exemplo n.º 16
0
class NocturnalEvents(SubscriberFeature):
    """
    Represents the percentage of events that a subscriber makes/receives which
    began at night. The definition of night is configurable.

    Parameters
    ----------
    start, stop : str
         iso-format start and stop datetimes
    hours : tuple of ints, default (20, 4)
        Hours that count as being nocturnal. e.g. (20,4)
        will be the times after 8pm and before 4 am. When the first hour is
        smaller than the second, e.g. (1, 5), the window does not wrap past
        midnight and covers hours from the first up to, but not including,
        the second.
    subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn'
        Either msisdn, or imei, the column that identifies the subscriber.
    subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None
        If provided, string or list of string which are msisdn or imeis to limit
        results to; or, a query or table which has a column with a name matching
        subscriber_identifier (typically, msisdn), to limit results to.
    direction : {'in', 'out', 'both'} or Direction, default Direction.BOTH
        Whether to consider calls made, received, or both. Defaults to 'both'.
    tables : str or list of strings, default 'all'
        Can be a string of a single table (with the schema)
        or a list of these. The keyword all is to select all
        subscriber tables

    Examples
    --------

    >>> s = NocturnalEvents("2016-01-01", "2016-01-02")
    >>> s.get_dataframe()

          subscriber                 value
    2ZdMowMXoyMByY07              0.000000
    MobnrVMDK24wPRzB             40.000000
    0Ze1l70j0LNgyY4w             16.666667
    Nnlqka1oevEMvVrm             33.333333
    4dqenN2oQZExwEK2             83.333333
                 ...                   ...
    """

    def __init__(
        self,
        start,
        stop,
        hours=(20, 4),
        *,
        subscriber_identifier="msisdn",
        direction: Union[str, Direction] = Direction.BOTH,
        subscriber_subset=None,
        tables="all",
    ):
        """
        Store the query parameters and build the events union to read from.

        The union is always built with ``hours="all"``; ``hours`` is only used
        later, when classifying each event as nocturnal or not.
        """
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.subscriber_identifier = subscriber_identifier
        self.direction = Direction(direction)
        self.hours = hours
        self.tables = tables

        column_list = [
            self.subscriber_identifier,
            "datetime",
            *self.direction.required_columns,
        ]

        self.unioned_query = EventsTablesUnion(
            self.start,
            self.stop,
            tables=self.tables,
            columns=column_list,
            hours="all",
            subscriber_identifier=subscriber_identifier,
            subscriber_subset=subscriber_subset,
        )
        super().__init__()

    @property
    def column_names(self):
        """Output column names: "subscriber" and "value"."""
        return ["subscriber", "value"]

    def _make_query(self):
        """
        Build the SQL giving, per subscriber, the percentage of events that
        started within the nocturnal window.
        """
        where_clause = make_where(self.direction.get_filter_clause())

        start_hour, stop_hour = self.hours[0], self.hours[1]
        # A window such as (20, 4) wraps past midnight, so an event matches if
        # it satisfies either bound. A window such as (1, 5) does not wrap, so
        # both bounds must hold; joining them with OR there would classify
        # every event as nocturnal.
        bound_joiner = "AND" if start_hour < stop_hour else "OR"

        sql = f"""
        SELECT
            subscriber,
            AVG(nocturnal)*100 AS value
        FROM (
            SELECT
                subscriber,
                CASE
                    WHEN extract(hour FROM datetime) >= {start_hour}
                      {bound_joiner} extract(hour FROM datetime) < {stop_hour}
                    THEN 1
                    ELSE 0
                END AS nocturnal
            FROM ({self.unioned_query.get_query()}) U
            {where_clause}
        ) U
        GROUP BY subscriber
        """

        return sql
Exemplo n.º 17
0
    def __init__(
        self,
        start,
        stop,
        phase="hour",
        *,
        subscriber_identifier="msisdn",
        direction: Union[str, Direction] = Direction.BOTH,
        hours: Optional[Tuple[int, int]] = None,
        subscriber_subset=None,
        tables="all",
    ):
        """
        Validate ``phase``, store the query parameters and build the events
        union to read from.

        Parameters
        ----------
        start, stop
            Bounds of the period; both are passed through ``standardise_date``.
        phase : str, default 'hour'
            Must be one of the date/time field names listed in this method.
        subscriber_identifier : default 'msisdn'
            Name of the subscriber column; it is the first column fetched.
        direction : str or Direction, default Direction.BOTH
            Coerced to a ``Direction``; its ``required_columns`` are added to
            the fetched columns.
        hours : tuple of int, default None
            Forwarded to ``EventsTablesUnion``.
        subscriber_subset : default None
            Forwarded to ``EventsTablesUnion``.
        tables : default 'all'
            Forwarded to ``EventsTablesUnion``.

        Raises
        ------
        ValueError
            If ``phase`` is not one of the allowed field names.
        """
        self.tables = tables
        self.subscriber_identifier = subscriber_identifier
        self.hours = hours
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.direction = Direction(direction)

        event_columns = [self.subscriber_identifier, "datetime"]
        event_columns.extend(self.direction.required_columns)

        # extracted from the POSTGRES manual
        valid_phases = (
            "century", "day", "decade", "dow", "doy", "epoch",
            "hour", "isodow", "isoyear", "microseconds", "millennium",
            "milliseconds", "minute", "month", "quarter", "second",
            "week", "year",
        )
        if phase not in valid_phases:
            raise ValueError(
                f"{phase} is not a valid phase. Choose one of {valid_phases}"
            )
        self.phase = phase

        union_options = dict(
            tables=self.tables,
            columns=event_columns,
            hours=hours,
            subscriber_identifier=subscriber_identifier,
            subscriber_subset=subscriber_subset,
        )
        self.unioned_query = EventsTablesUnion(
            self.start, self.stop, **union_options
        )
        super().__init__()
Exemplo n.º 18
0
class DistanceCounterparts(SubscriberFeature):
    """
    This class returns metrics related with the distance between event
    initiator and her/his counterparts.

    It assumes that the ID column uniquely identifies the event initiator and
    their counterparts' event. Choose only tables for which this assumption is
    true. In some cases, asynchronous communication like SMS might not be
    tagged with an ID that allows one to recover the counterpart event.

    Distances are measured in km.

    Parameters
    ----------
    start, stop : str
         iso-format start and stop datetimes
    statistic :  {'count', 'sum', 'avg', 'max', 'min', 'median', 'mode', 'stddev', 'variance'}, default 'avg'
        Aggregation statistic over the distances. Lower-cased before being
        validated.
    hours : 2-tuple of floats, default 'all'
        Restrict the analysis to only a certain set
        of hours within each day.
    tables: str, default 'all'.
        The table must have a `msisdn_counterpart` column.
    direction : {'in', 'out', 'both'} or Direction, default Direction.BOTH
        Direction filter applied to the subscriber-side events.
    subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None
        If provided, string or list of string which are msisdn or imeis to limit
        results to; or, a query or table which has a column with a name matching
        subscriber_identifier (typically, msisdn), to limit results to.
    exclude_self_calls : bool, default True
        Set to false to *include* calls a subscriber made to themself

    Raises
    ------
    ValueError
        If `statistic` is not one of the valid statistics.

    Examples
    --------

    >>> s = DistanceCounterparts("2016-01-01", "2016-01-07", statistic="avg")
    >>> s.get_dataframe()

              subscriber           value
        038OVABN11Ak4W5P      272.167815
        09NrjaNNvDanD8pk      241.290233
        0ayZGYEQrqYlKw6g      218.161568
        0DB8zw67E9mZAPK2      228.235324
        0Gl95NRLjW2aw8pW      189.008980
                     ...             ...

    """

    def __init__(
        self,
        start,
        stop,
        statistic="avg",
        *,
        hours="all",
        tables="all",
        direction: Union[str, Direction] = Direction.BOTH,
        subscriber_subset=None,
        exclude_self_calls=True,
    ):
        self.tables = tables
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.hours = hours
        self.direction = Direction(direction)
        self.exclude_self_calls = exclude_self_calls

        self.statistic = statistic.lower()
        if self.statistic not in valid_stats:
            raise ValueError(
                f"{self.statistic} is not a valid statistic. "
                f"Use one of {valid_stats}"
            )

        # `outgoing` is always requested because the join below pairs the two
        # sides of an event by opposite `outgoing` values.
        column_list = [
            "msisdn", "msisdn_counterpart", "id", "location_id", "outgoing"
        ]

        # EventsTablesUnion will only subset on the subscriber identifier,
        # which means that we need to query for a unioned table twice. That has
        # a considerable negative impact on execution time.
        self.unioned_from_query = EventsTablesUnion(
            self.start,
            self.stop,
            columns=column_list,
            tables=self.tables,
            subscriber_identifier="msisdn",
            hours=hours,
            subscriber_subset=subscriber_subset,
        )

        self.unioned_to_query = EventsTablesUnion(
            self.start,
            self.stop,
            columns=column_list,
            tables=self.tables,
            subscriber_identifier="msisdn_counterpart",
            hours=hours,
            subscriber_subset=subscriber_subset,
        )

        self.distance_matrix = DistanceMatrix()

        super().__init__()

    @property
    def column_names(self) -> List[str]:
        """Names of the columns produced by the query."""
        return ["subscriber", "value"]

    def _make_query(self):
        """
        Build the SQL that joins each subscriber event (A) with the matching
        counterpart event (B), looks up the distance between their locations
        and aggregates it per subscriber with the chosen statistic.
        """
        filters = [self.direction.get_filter_clause("A")]
        if self.exclude_self_calls:
            filters.append("A.subscriber != A.msisdn_counterpart")
        # NOTE(review): despite the name, this is whatever `make_where`
        # renders (presumably a `WHERE ...` clause); it is placed right after
        # the ON condition of the inner join.
        on_filters = make_where(filters)

        sql = f"""
        SELECT
            U.subscriber AS subscriber,
            {self.statistic}(D.value) AS value
        FROM
            (
                SELECT A.subscriber, A.location_id AS location_id_from, B.location_id AS location_id_to FROM
                ({self.unioned_from_query.get_query()}) AS A
                JOIN ({self.unioned_to_query.get_query()}) AS B
                ON A.id = B.id AND A.outgoing != B.outgoing {on_filters}
            ) U
        JOIN
            ({self.distance_matrix.get_query()}) D
        USING (location_id_from, location_id_to)
        GROUP BY U.subscriber
        """

        return sql
Exemplo n.º 19
0
class IntereventInterval(SubscriberFeature):
    """
    Compute, per subscriber, a statistic (for instance the average or the
    standard deviation) over the time elapsed between consecutive events.
    Results are returned as time intervals.

    Parameters
    ----------
    start, stop : str
         iso-format start and stop datetimes
    statistic :  {'count', 'sum', 'avg', 'max', 'min', 'median', 'mode', 'stddev', 'variance'}, default 'avg'
        Aggregation statistic over the durations between events.
    hours : 2-tuple of floats, default 'all'
        Restrict the analysis to only a certain set
        of hours within each day.
    tables : str or list of strings, default 'all'
        Can be a string of a single table (with the schema)
        or a list of these. The keyword all is to select all
        subscriber tables
    subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn'
        Either msisdn, or imei, the column that identifies the subscriber.
    subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None
        If provided, string or list of string which are msisdn or imeis to limit
        results to; or, a query or table which has a column with a name matching
        subscriber_identifier (typically, msisdn), to limit results to.
    direction : {'in', 'out', 'both'} or Direction, default Direction.OUT
        Whether to consider calls made, received, or both. Defaults to 'out'.

    Examples
    --------

    >>> s = IntereventInterval("2016-01-01", "2016-01-07")
    >>> s.get_dataframe()
               subscriber           value
    0    038OVABN11Ak4W5P 04:57:22.428571
    1    09NrjaNNvDanD8pk 03:52:38.454545
    2    0ayZGYEQrqYlKw6g 04:02:05.666667
    3    0DB8zw67E9mZAPK2 06:32:30.714285
    4    0Gl95NRLjW2aw8pW 05:44:20.625000
    ..                ...             ...
    495  ZQG8glazmxYa1K62 04:12:27.705882
    496  Zv4W9eak2QN1M5A7 03:41:10.323529
    497  zvaOknzKbEVD2eME 04:21:27.218750
    498  Zy3DkbY7MDd6Er7l 04:33:00.870968
    499  ZYPxqVGLzlQy6l7n 04:01:28.212121

    [500 rows x 2 columns]

    """

    def __init__(
        self,
        start: str,
        stop: str,
        statistic: str = "avg",
        *,
        hours: Union[str, Tuple[int, int]] = "all",
        tables: Union[str, List[str]] = "all",
        subscriber_identifier: str = "msisdn",
        subscriber_subset: Optional[Query] = None,
        direction: Union[str, Direction] = Direction.OUT,
    ):
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.hours = hours
        self.tables = tables
        self.subscriber_identifier = subscriber_identifier
        self.direction = Direction(direction)

        # Columns to pull from the events tables.
        events_columns = [self.subscriber_identifier, "datetime"]
        events_columns.extend(self.direction.required_columns)

        self.statistic = statistic.lower()
        if self.statistic not in valid_stats:
            raise ValueError(
                f"{self.statistic} is not a valid statistic. "
                f"Use one of {valid_stats}"
            )

        self.unioned_query = EventsTablesUnion(
            self.start,
            self.stop,
            columns=events_columns,
            tables=self.tables,
            hours=self.hours,
            subscriber_subset=subscriber_subset,
            subscriber_identifier=self.subscriber_identifier,
        )
        super().__init__()

    @property
    def column_names(self):
        """Names of the columns produced by the query."""
        return ["subscriber", "value"]

    def _make_query(self):
        """
        Build the SQL: the gap to the previous event of the same subscriber
        is computed with LAG, then aggregated per subscriber.
        """
        where_clause = make_where(self.direction.get_filter_clause())

        # Postgres does not support median, stddev and variance on intervals,
        # so those are computed over epoch seconds and converted back.
        statistic_clause = (
            f"MAKE_INTERVAL(secs => {self.statistic}(EXTRACT(EPOCH FROM delta)))"
            if self.statistic in ("median", "stddev", "variance")
            else f"{self.statistic}(delta)"
        )

        return f"""
        SELECT
            subscriber,
            {statistic_clause} AS value
        FROM (
            SELECT subscriber, datetime - LAG(datetime, 1, NULL) OVER (PARTITION BY subscriber ORDER BY datetime) AS delta
            FROM ({self.unioned_query.get_query()}) AS U
            {where_clause}
        ) AS U
        GROUP BY subscriber
        """
Exemplo n.º 20
0
class ProportionEventReciprocal(SubscriberFeature):
    """
    This class calculates the proportion of events with a reciprocal contact
    per subscriber.  It is possible to fine-tune the period for which a
    reciprocal contact must have happened.

    A reciprocal contact is a contact who has initiated contact with the
    subscriber and who also has been the counterpart of a contact initiated
    by the subscriber.

    Parameters
    ----------
    start, stop : str
         iso-format start and stop datetimes
    contact_reciprocal: flowmachine.features.ContactReciprocal
        An instance of `ContactReciprocal` listing which contacts are reciprocal
        and which are not.
    direction : {'in', 'out', 'both'} or Direction, default Direction.OUT
        Whether to consider calls made, received, or both. Defaults to 'out'.
    subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn'
        Either msisdn, or imei, the column that identifies the subscriber.
    hours : 2-tuple of ints, default None
        Restrict the analysis to only a certain set
        of hours within each day.
    subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None
        If provided, string or list of string which are msisdn or imeis to limit
        results to; or, a query or table which has a column with a name matching
        subscriber_identifier (typically, msisdn), to limit results to.
    tables : str or list of strings, default 'all'
        Can be a string of a single table (with the schema)
        or a list of these. The keyword all is to select all
        subscriber tables
    exclude_self_calls : bool, default True
        Set to false to *include* calls a subscriber made to themself.
        The check compares the subscriber's msisdn with the counterpart's
        msisdn, whichever subscriber identifier is in use.

    Example
    -------

    >>> s = ProportionEventReciprocal('2016-01-01', '2016-01-08',
    ...     ContactReciprocal('2016-01-01', '2016-01-08'))
    >>> s.get_dataframe()

          subscriber       value
    9vXy462Ej8V1kpWl         0.0
    Q4mwVxpBOo7X2lb9         0.0
    5jLW0EWeoyg6NQo3         0.0
    QEoRM9vlkV18N4ZY         0.0
    a76Ajyb9dmEYNd8L         0.0
                 ...         ...
    """

    def __init__(
        self,
        start,
        stop,
        contact_reciprocal,
        *,
        direction: Union[str, Direction] = Direction.OUT,
        subscriber_identifier="msisdn",
        hours: Optional[Tuple[int, int]] = None,
        subscriber_subset=None,
        tables="all",
        exclude_self_calls=True,
    ):

        self.start = start
        self.stop = stop
        self.subscriber_identifier = subscriber_identifier
        self.hours = hours
        self.exclude_self_calls = exclude_self_calls
        self.direction = Direction(direction)
        self.tables = tables

        # `msisdn` is always requested, in addition to the subscriber
        # identifier, because the reciprocal-contact join is done on msisdn.
        column_list = [
            self.subscriber_identifier,
            "msisdn",
            "msisdn_counterpart",
            *self.direction.required_columns,
        ]

        self.unioned_query = EventsTablesUnion(
            self.start,
            self.stop,
            tables=self.tables,
            columns=column_list,
            hours=hours,
            subscriber_identifier=subscriber_identifier,
            subscriber_subset=subscriber_subset,
        )

        self.contact_reciprocal_query = contact_reciprocal

        super().__init__()

    @property
    def column_names(self):
        """Names of the columns produced by the query."""
        return ["subscriber", "value"]

    def _make_query(self):
        """
        Build the SQL: events are left-joined to the reciprocal-contact list
        and the share of reciprocal events is averaged per subscriber.
        """
        # Column of the unioned events that holds the subscriber's msisdn:
        # `subscriber` when msisdn is the identifier, `msisdn` otherwise.
        # Both the self-call filter and the join must use it; comparing e.g.
        # an imei against `msisdn_counterpart` would never detect a self call.
        msisdn_column = (
            "subscriber" if self.subscriber_identifier == "msisdn" else "msisdn"
        )

        filters = [self.direction.get_filter_clause()]

        if self.exclude_self_calls:
            filters.append(f"{msisdn_column} != msisdn_counterpart")
        where_clause = make_where(filters)

        on_clause = f"""
        ON U.{msisdn_column} = R.subscriber
        AND  U.msisdn_counterpart = R.msisdn_counterpart
        """

        sql = f"""
        SELECT subscriber, AVG(reciprocal::int) AS value
        FROM (
            SELECT U.subscriber, COALESCE(reciprocal, FALSE) AS reciprocal
            FROM (
                SELECT *
                FROM ({self.unioned_query.get_query()}) U
                {where_clause}
            ) U
            LEFT JOIN (
                SELECT subscriber, msisdn_counterpart, reciprocal
                FROM ({self.contact_reciprocal_query.get_query()}) R
            ) R
            {on_clause}
        ) R
        GROUP BY subscriber
        """

        return sql
Exemplo n.º 21
0
class SubscriberDegree(SubscriberFeature):
    """
    Count, for each subscriber, the distinct contacts they interact with.

    Parameters
    ----------
    start, stop : str
         iso-format start and stop datetimes
    hours : 2-tuple of floats, default 'all'
        Restrict the analysis to only a certain set
        of hours within each day.
    tables : str, default 'all'
    subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn'
        Either msisdn, or imei, the column that identifies the subscriber.
    direction : {'in', 'out', 'both'} or Direction, default Direction.BOTH
        Whether to consider calls made, received, or both. Defaults to 'both'.
    exclude_self_calls : bool, default True
        When true, rows where `subscriber` equals `msisdn_counterpart` are
        filtered out.
    subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None
        If provided, string or list of string which are msisdn or imeis to limit
        results to; or, a query or table which has a column with a name matching
        subscriber_identifier (typically, msisdn), to limit results to.

    Notes
    -----

    `subscriber_identifier` refers only to the subject of the analysis
    so for example subscriber_identifier='imei' will find all the unique
    msisdns that each imei calls. There is currently no way to specify
    the unique number of imei that each subscriber calls for instance.

    Examples
    --------

    >>> SubscriberDegree('2016-01-01', '2016-01-01')
               subscriber  value
    0    038OVABN11Ak4W5P      2
    1    09NrjaNNvDanD8pk      2
    2    0ayZGYEQrqYlKw6g      2
    3    0DB8zw67E9mZAPK2      2
    4    0Gl95NRLjW2aw8pW      2
    5    0gmvwzMAYbz5We1E      2
    ...

    """

    def __init__(
        self,
        start,
        stop,
        *,
        hours="all",
        tables="all",
        subscriber_identifier="msisdn",
        direction: Union[str, Direction] = Direction.BOTH,
        exclude_self_calls=True,
        subscriber_subset=None,
    ):
        self.start = start
        self.stop = stop
        self.hours = hours
        self.direction = Direction(direction)
        self.subscriber_identifier = subscriber_identifier
        self.exclude_self_calls = exclude_self_calls
        self.tables = tables

        # Columns to pull from the events tables.
        events_columns = [self.subscriber_identifier, "msisdn_counterpart"]
        events_columns.extend(self.direction.required_columns)

        self.unioned_query = EventsTablesUnion(
            self.start,
            self.stop,
            columns=events_columns,
            tables=self.tables,
            hours=self.hours,
            subscriber_subset=subscriber_subset,
            subscriber_identifier=self.subscriber_identifier,
        )
        # NOTE(review): differs from `column_names` ("degree" vs "value");
        # kept as-is so the instance state is unchanged.
        self._cols = ["subscriber", "degree"]
        super().__init__()

    @property
    def column_names(self) -> List[str]:
        """Names of the columns produced by the query."""
        return ["subscriber", "value"]

    def _make_query(self):
        """
        Build the SQL: distinct (subscriber, counterpart) pairs are selected
        and then counted per subscriber.
        """
        conditions = [self.direction.get_filter_clause()]
        if self.exclude_self_calls:
            conditions += ["subscriber != msisdn_counterpart"]
        where_clause = make_where(conditions)

        return f"""
        SELECT
           subscriber,
           COUNT(*) AS value
        FROM (
            SELECT DISTINCT subscriber, msisdn_counterpart
            FROM ({self.unioned_query.get_query()}) AS U
            {where_clause}
        ) AS U
        GROUP BY subscriber
        """
Exemplo n.º 22
0
class ContactBalance(GraphMixin, SubscriberFeature):
    """
    This class calculates the total number of events
    that a subscriber interacts with a counterpart,
    and the proportion of events that a given contact
    participates out of the subscriber's total event count.
    This can be used to calculate a subscriber's contact
    network graph and the respective weighted edges
    for each contact.

    Parameters
    ----------

    start, stop : str
         iso-format start and stop datetimes
    hours : 2-tuple of ints, default None
        Restrict the analysis to only a certain set
        of hours within each day.
    tables : str, default 'all'
    subscriber_identifier : {'msisdn', 'imei'}, default 'msisdn'
        Either msisdn, or imei, the column that identifies the subscriber.
    direction : {'in', 'out', 'both'} or Direction, default Direction.BOTH
        Event direction to include in computation. This
        can be outgoing ('out'), incoming ('in'), or both ('both').
    exclude_self_calls : bool, default True
        Set to false to *include* calls a subscriber made to themself.
        Only applied when `subscriber_identifier` is 'msisdn'.
    subscriber_subset : str, list, flowmachine.core.Query, flowmachine.core.Table, default None
        If provided, string or list of string which are msisdn or imeis to limit
        results to; or, a query or table which has a column with a name matching
        subscriber_identifier (typically, msisdn), to limit results to.


    Examples
    --------

    >>> ContactBalance('2016-01-01', '2016-01-07')

               subscriber       msisdn_counterpart  events     proportion
    0    038OVABN11Ak4W5P         09NrjaNNvDanD8pk     110           0.54
    1    09NrjaNNvDanD8pk         0ayZGYEQrqYlKw6g      94           0.44
    2    0ayZGYEQrqYlKw6g         0DB8zw67E9mZAPK2      70           0.23
    3    0DB8zw67E9mZAPK2         0DB8zw67E9mZAXFF      20           0.12
    ...
    """

    def __init__(
        self,
        start,
        stop,
        *,
        hours: Optional[Tuple[int, int]] = None,
        tables="all",
        subscriber_identifier="msisdn",
        direction: Union[str, Direction] = Direction.BOTH,
        exclude_self_calls=True,
        subscriber_subset=None,
    ):
        self.tables = tables
        self.start = standardise_date(start)
        self.stop = standardise_date(stop)
        self.hours = hours
        self.direction = Direction(direction)
        self.subscriber_identifier = subscriber_identifier
        self.exclude_self_calls = exclude_self_calls

        column_list = [
            self.subscriber_identifier,
            "msisdn_counterpart",
            *self.direction.required_columns,
        ]

        self.unioned_query = EventsTablesUnion(
            self.start,
            self.stop,
            columns=column_list,
            tables=self.tables,
            subscriber_identifier=self.subscriber_identifier,
            hours=hours,
            subscriber_subset=subscriber_subset,
        )
        self._cols = [
            "subscriber", "msisdn_counterpart", "events", "proportion"
        ]
        super().__init__()

    @property
    def column_names(self) -> List[str]:
        """Names of the columns produced by the query."""
        return ["subscriber", "msisdn_counterpart", "events", "proportion"]

    def _make_query(self):
        """
        Build the SQL: events are counted per (subscriber, counterpart) pair
        and divided by the subscriber's total event count.
        """
        filters = [self.direction.get_filter_clause()]
        # The self-call check compares `subscriber` with an msisdn, so it is
        # only applied when the subscriber identifier is itself an msisdn.
        if self.subscriber_identifier == "msisdn" and self.exclude_self_calls:
            filters.append("subscriber != msisdn_counterpart")
        where_clause = make_where(filters)

        sql = f"""
        WITH unioned AS (
            SELECT
                *
            FROM ({self.unioned_query.get_query()}) as U
            {where_clause}
        ),
        total_events AS (
            SELECT
                subscriber,
                count(*) AS events
            FROM unioned
            GROUP BY subscriber
        )
        SELECT
            U.subscriber,
            U.msisdn_counterpart,
            count(*) as events,
            (count(*)::float / T.events::float) as proportion
        FROM
        (SELECT U.subscriber,
            U.msisdn_counterpart
          FROM unioned as U) AS U
        JOIN total_events AS T
            ON U.subscriber = T.subscriber
        GROUP BY U.subscriber,
                 U.msisdn_counterpart,
                 T.events
        ORDER BY proportion DESC
        """

        return sql

    def counterparts_subset(self, include_subscribers=False):
        """
        Returns the subset of counterparts. In some cases, we are interested in
        obtaining information about the subset of subscribers contacts.

        This method also allows one to get the subset of counterparts together
        with subscribers by turning the `include_subscribers` flag to `True`.

        Parameters
        ----------
        include_subscribers: bool, default False
            Whether to include the list of subscribers in the subset as well.
        """

        return _ContactBalanceSubset(contact_balance=self,
                                     include_subscribers=include_subscribers)