Example #1
class DataDefinition(models.Model):
    """ Defines a common datadefinition to share among common tripdata schema instances """
    short_name = models.CharField(max_length=32)
    definition = JSONField(default=dict, validators=[validators.JsonValidator()])

    def __str__(self):
        return self.short_name
Example #2
class ClusterConfig(models.Model):
    """
    Represents the cluster algorithm and algorithm inputs.
    """
    ALGORITHMS = (
        ('AffinityPropagation', 'Affinity Propagation'),
        ('DBSCAN', 'DBSCAN'),
        ('Agglomerative', 'Agglomerative'),
        ('Birch', 'Birch'),
        ('KMeans', 'k-Means'),
        ('MiniBatchKMeans', 'Mini Batch k-Means'),
        ('MeanShift', 'Mean Shift'),
        ('Spectral', 'Spectral'),
        ('Ward', 'Ward'),
    )

    algorithm = models.CharField(max_length=20, choices=ALGORITHMS)
    arguments = JSONField(
        default=dict,
        validators=[validators.JsonValidator()],
        blank=True,
        null=True,
        help_text='Additional arguments to pass to the specific cluster model')

    def __str__(self):
        return '{} {}'.format(
            self.get_algorithm_display(),
            self.arguments,
        )
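
The arguments field is passed through as keyword arguments to the underlying
cluster implementation (see ClusterModel.model in Example #9). A minimal usage
sketch, assuming eps and min_samples are forwarded verbatim to scikit-learn's
DBSCAN (the values are hypothetical):

# Hypothetical configuration; full_clean() runs JsonValidator on arguments.
config = ClusterConfig(algorithm='DBSCAN',
                       arguments={'eps': 0.5, 'min_samples': 5})
config.full_clean()
config.save()
print(config)  # -> "DBSCAN {'eps': 0.5, 'min_samples': 5}"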
Example #3
class Attribute(models.Model, AttributeQueryMixin):
    objects = AttributeQueryObjectManager()
    name = models.CharField(max_length=64)
    attribute = JSONField(null=True,
                          blank=True,
                          validators=[validators.JsonValidator()])

    def __str__(self):
        return self.name
Example #4
class Organization(models.Model):
    name = models.CharField(max_length=64)
    timezone = models.CharField(max_length=64,
                                null=True,
                                # a list, not a generator: Django iterates
                                # choices more than once, and a generator is
                                # exhausted after the first pass
                                choices=[(k, timezones[k])
                                         for k in sorted(timezones.keys())])
    metadata = JSONField(default=dict,
                         validators=[validators.JsonValidator()],
                         blank=True)

    def __str__(self):
        return self.name
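
The timezones mapping used to build the choices is defined outside this
example. One plausible construction, assuming pytz is available (an
assumption; the actual source of timezones is not shown):

import pytz

# Hypothetical: map each zone name to itself for storage and display.
timezones = {name: name for name in pytz.common_timezones}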
Example #5
class TopicModelConfig(models.Model):
    """
    Stores the configuration for a particular model that can be applied
    to multiple sets of data to allow for general comparison of the model
    to multiple data sets.
    """
    IMPLEMENTATIONS = (
        ('MalletLda', 'LDA MALLET'),
        ('GensimLda', 'LDA Gensim'),
        # ('GensimTfIdf', 'TF/IDF'),
        ('GensimLsi', 'LSI'),
    )
    algorithm = models.CharField(max_length=16, choices=IMPLEMENTATIONS)
    num_topics = models.IntegerField(default=300)
    arguments = JSONField(
        default=dict,
        blank=True,
        null=True,
        validators=[validators.JsonValidator()],
        help_text='Additional arguments to pass to the topic model.')

    def __str__(self):
        return '%s(%s)' % (self.get_algorithm_display(), self.num_topics)
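
As with ClusterConfig above, a single configuration can be reused across data
sets. A short sketch (hypothetical values; whether Gensim's passes parameter
is forwarded depends on the topic-model wrapper, which is not shown here):

config = TopicModelConfig(algorithm='GensimLda', num_topics=100,
                          arguments={'passes': 10})
config.full_clean()
config.save()
print(config)  # -> "LDA Gensim(100)"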
Example #6
class Entity(models.Model):
    """
    Represents a physical entity that can generate trajectories (aka Trips).
    It allows association of multiple trips to a single resource for better
    data connectivity and relationships.
    """
    objects = EntityManager()
    organization = models.ForeignKey(Organization, null=True)
    common_id = models.CharField(
        max_length=16,
        db_index=True,
        help_text='Commonly used identifier (e.g. Tail Number, License plate)')
    physical_id = models.CharField(
        max_length=64,
        db_index=True,
        help_text='Unique id following the physical resource (e.g. VIN)')
    metadata = JSONField(default=dict,
                         validators=[validators.JsonValidator()],
                         blank=True)

    class Meta:
        # Tail numbers appear to be more unique than common IDs, so only
        # physical_id participates in the uniqueness constraint.
        unique_together = ('physical_id',)  # previously included 'common_id'
        verbose_name_plural = 'entities'

    def __str__(self):
        return self.common_id

    def natural_key(self):
        return self.physical_id

    def combine_trip_data(self, prepare=_prep_dataframe, trip_filter=None):
        trips = self.trip_set.all()
        if trip_filter:
            trips = trips.filter(trip_filter)
        qs = TripData.objects.filter(trip__in=trips)
        return TripManager.combine_trip_data(qs, prepare)
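
combine_trip_data concatenates the data of every trip belonging to the
entity, optionally restricted by a Django Q filter on Trip fields. A sketch
under those assumptions (the lookup values are hypothetical):

from django.db.models import Q

entity = Entity.objects.get(physical_id='VIN123')  # hypothetical id
# Combine only trips that started in 2016.
combined = entity.combine_trip_data(
    trip_filter=Q(start_datetime__year=2016))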
Example #7
class TripData(models.Model):
    """ Isolates the large binary data from the rest of the record. """
    trip = models.ForeignKey(Trip, related_name='tripdata_set')
    _dataframe = models.BinaryField()
    metadata = JSONField(default=dict, validators=[validators.JsonValidator()])
    definition = models.ForeignKey(DataDefinition)

    class Meta:
        verbose_name_plural = 'Trip data'

    def __str__(self):
        return '%s|%s' % (str(self.trip), self.id)

    @classmethod
    def from_db(cls, db, field_names, values):
        instance = super().from_db(db, field_names, values)
        instance._db_to_dataframe()
        return instance

    def _dataframe_to_db(self):
        """
        This is basically the same code from pandas 0.18 pandas.io.pickle.to_pickle
        but keeping the bytes in memory.  Since that method does not allow passing the
        pickle in memory (only via file on file system) the logic is duplicated.
        """
        self._dataframe = pickle.dumps(self.dataframe,
                                       protocol=pickle.HIGHEST_PROTOCOL)

    def _db_to_dataframe(self):
        """
        This is basically pd.read_pickle but allowing in-memory objects and
        forcing python 3.  The idea is that pandas.read_pickle maintains some semblance
        of backward compatibility making this more robust.  The code this is derived from
        came from pandas 0.18 pandas.io.pickle.
        """
        fh = BytesIO(self._dataframe)
        encoding = 'latin1'
        try:
            self.dataframe = pickle.load(fh)
        except Exception:
            try:
                # regular/patched pickle; rewind first, since the failed load
                # above may have advanced the stream position
                fh.seek(0)
                self.dataframe = pc.load(fh, encoding=encoding, compat=False)
            except Exception:
                # compat pickle
                fh.seek(0)
                self.dataframe = pc.load(fh, encoding=encoding, compat=True)

    def save(self, *args, **kwargs):
        self._dataframe_to_db()
        super().save(*args, **kwargs)

    def dataframe_filter(self, params=None, times=(None, None)):
        """
        params is an iterable of parameters to retrieve
        times is a tuple of start_time, duration to retrieve
        """
        if times[0] is None and (params is None or len(params) == 0):
            return self.dataframe

        if times is not None and times[0] is not None:
            times = (times[0], times[0] + times[1])
        else:
            times = (None, None)

        params = params if params is not None else []
        if len(params) > 0:
            avail_params_set = set(self.paramlist())
            # .loc needs a list-like indexer; a set is not supported
            params = list(avail_params_set.intersection(params))
            if len(params) == 0:
                return None  # This dataframe does not contain desired params
            return self.dataframe.loc[times[0]:times[1], params]
        else:
            return self.dataframe.loc[times[0]:times[1], :]

    def paramlist(self):
        return self.dataframe.columns

    @property
    def organization(self):
        return self.trip.entity.organization
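
dataframe_filter slices the unpickled DataFrame by column and by a
(start_time, duration) window, adding the duration to the start to form the
end of the .loc slice. A minimal sketch, assuming the frame is indexed by
timestamp and contains a gid column:

import datetime

td = TripData.objects.first()
start = td.dataframe.index[0]
# First five minutes of the trip, gid column only.
window = td.dataframe_filter(
    params=['gid'],
    times=(start, datetime.timedelta(minutes=5)))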
Example #8
class Trip(geomodels.Model):
    """
    Represents a partition of the movement of an Entity.  It represents the
    base atomic element of most analysis.  Although some analysis will dissect
    further into the TripData or specific points, the results are often
    associated back to the Trip.  When different granularity is required, the
    partitioning is usually modified and a new set of trips created.
    """
    objects = TripManager()
    # Not all taxi data associates with a single physical taxi
    id = geomodels.BigIntegerField(primary_key=True)
    entity = geomodels.ForeignKey(Entity, blank=True, null=True)
    start_datetime = geomodels.DateTimeField(db_index=True)
    duration = geomodels.DurationField()
    geometry = geomodels.LineStringField(dim=3, null=True)
    metadata = JSONField(
        default=dict,
        validators=[validators.JsonValidator()],
        blank=True,
        # db_index=True - Defined by migration so it uses a GIST instead of BTREE index
    )
    archive_uri = geomodels.CharField(max_length=1024)

    class Meta:
        # ordering = ['-start_datetime']  # Newer trips listed first
        # Trip IDs are generated as a function of entity, start_datetime, and
        # another ID, which enforces this unique_together in the PK.
        # unique_together = ('entity', 'start_datetime')
        pass

    def __str__(self):
        return '%s|%s' % (self.label, self.start_datetime.isoformat())

    @property
    def label(self):
        return self.entity.common_id if self.entity else ('Trip(%s)' % self.id)

    def natural_key(self):
        return self.entity, self.start_datetime

    @property
    def organization(self):
        return self.entity.organization

    def dataframe_filter(self,
                         params=None,
                         times=(None, None),
                         interpolate=None):
        return list(self.__iter__(
            params=params, times=times, interpolate=interpolate))

    def __iter__(self, params=None, times=(None, None), interpolate=None):
        # TODO: Best ways to carve up a data frame.
        #       Is it faster to return a series when only one parameter requested?
        #       How will the code calling this most often iterate the results?
        for tripdata in self.tripdata_set.all():
            data = tripdata.dataframe_filter(params=params, times=times)
            if data is None:
                continue
            if interpolate:
                data = data.interpolate(method=interpolate)

            yield data
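
Plain iteration over a Trip yields each TripData frame unfiltered; to apply
filters, __iter__ must be called explicitly, since a for-loop cannot supply
keyword arguments. A sketch, assuming a datetime index so that pandas'
'time' interpolation method applies:

trip = Trip.objects.first()

# Default iteration: every TripData frame, unfiltered.
for df in trip:
    print(df.shape)

# Filtered iteration with time-based interpolation of gaps.
for df in trip.__iter__(params=['gid'], interpolate='time'):
    print(df.columns)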
Example #9
class ClusterModel(models.Model):
    """
    Defines an execution of the clustering data
    """
    config = models.ForeignKey(ClusterConfig)
    data = models.ForeignKey(TripQuery)
    topic_model = models.ForeignKey(TopicModel, null=True, blank=True)
    arguments = JSONField(blank=True,
                          null=True,
                          validators=[validators.JsonValidator()],
                          help_text='Additional arguments for clustering')
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)

    def __init__(self, *args, **kwargs):
        self._model = None
        super().__init__(*args, **kwargs)

    def __str__(self):
        name = '%s with data (%s)' % (
            self.config,
            self.data,
        )
        if self.topic_model is not None:
            name = '%s on topic model (%s)' % (
                name,
                self.topic_model,
            )
        return name

    def geo_data(self, data):
        cluster_data = numpy.empty(shape=(len(data.q()), 4))
        for i, trip in enumerate(data.q()):
            geom_start = trip.start_point
            geom_end = trip.end_point
            cluster_data[i][0] = geom_start.coords[0]
            cluster_data[i][1] = geom_start.coords[1]
            cluster_data[i][2] = geom_end.coords[0]
            cluster_data[i][3] = geom_end.coords[1]

        return cluster_data

    def topic_data(self, data):
        cluster_data = numpy.zeros(shape=(len(data.q()),
                                          self.topic_model.model.num_topics))

        # NOTE: 'True or' short-circuits this check, so the street-topic
        # branch always runs and the else branch below is unreachable.
        if True or self.arguments.get('cluster_street_topics', None):
            # Find the most probable topic for each street in the data by
            # creating a corpus of single term documents with the term being
            # the street id and then inferring that with the model
            streets_corpus = []
            for trip in data.q():
                gid_df = trip.dataframe_filter(params=['gid'])[0]
                for gid in gid_df['gid']:
                    gid = str(gid)
                    if gid not in streets_corpus:
                        streets_corpus.append(gid)
            streets_corpus = [[street] for street in streets_corpus]
            inferred_streets = self.topic_model.model[streets_corpus]
            street_topics = {}
            for i, topic in enumerate(inferred_streets):
                topics = sorted(topic, key=lambda t: -t[1])
                street_topics[int(streets_corpus[i][0])] = topics[0]

            adj = 1
            # For each document, create an entry in the cluster data by
            # iterating through each street in the trajectory and incrementing
            # the topic that is most probable for that street.
            for i, trip in enumerate(data.q()):
                streets_df = trip.dataframe_filter(params=['gid'])[0]
                for gid in streets_df['gid']:
                    street_topic = street_topics[gid][0]
                    cluster_data[i][street_topic] += adj

                for s in range(len(cluster_data[i])):
                    cluster_data[i][s] /= (len(streets_df) * adj)
        else:
            # Infer the data associated with the cluster.
            inferred_corpus = self.topic_model.model[trip_queryset_to_corpus(
                data.q(), data.id)]

            # Create cluster data base of inferred corpus
            for i, topics in enumerate(inferred_corpus):
                topics = sorted(topics, key=lambda t: t[1], reverse=True)

                for t in topics:
                    cluster_data[i][t[0]] = t[1]

        return cluster_data

    @property
    def model(self):

        # If the cluster model hasn't been accessed on this model yet, get it.
        if self._model is None:
            # First, attempt to get the model from the cache if available.
            if CLUSTER_SETTINGS['CACHE'] is not None:
                key = 'cluster:model%s' % (self.id)
                cache = caches[CLUSTER_SETTINGS['CACHE']]
                # cache.get's second argument is a default value, not a
                # timeout; passing the cache time here would mask a miss
                self._model = cache.get(key)

            # If the model was not in the cache then calculate it.
            if self._model is None:
                logger.info('Calculating cluster %s' % self)
                if self.topic_model is not None:
                    cluster_data = self.topic_data(self.data)
                else:
                    cluster_data = self.geo_data(self.data)

                self._model = processing.ClusterModel(
                    impl=self.config.algorithm, **self.config.arguments)
                self._model.fit(cluster_data)

                if CLUSTER_SETTINGS['CACHE'] is not None:
                    # key is calculated above and cache retrieved above
                    cache.set(key, self._model,
                              CLUSTER_SETTINGS['QUERY_CACHE_TIME'])
            else:
                logger.info('Cluster pulled from cache')

        return self._model
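
The model property fits lazily and caches the result, so repeated access is
cheap. A hedged usage sketch; labels_ follows the scikit-learn convention,
and whether processing.ClusterModel exposes it is an assumption (that
wrapper is not shown here):

cm = ClusterModel.objects.select_related('config', 'data').first()
fitted = cm.model  # fits on first access; later accesses hit the cache
# labels_ is assumed from the scikit-learn convention.
print(getattr(fitted, 'labels_', None))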