示例#1
0
def clustering(lats, longs, timestamps, ID, timestmp, multiPDF=False):
    """
    Clusters the GPS coordinates using DBSCAN, smooths the clustered track and plots the result.

    The coordinates are projected onto cartesian coordinates, clustered with dbscan (eps=0.5),
    and every position that is not labelled as noise (-1) is handed to smooth(). The smoothed
    track is passed to calcDistanceWalked() and the clusters are plotted and written with writeToPdf().

    :param lats:                    The latitudes
    :param longs:                   The longitudes
    :param timestamps:              The timestamps of the GPS coordinates
    :param ID:                      The ID, passed on to writeToPdf
    :param timestmp:                The timestamp, shown in the plot title
    :param multiPDF:                Not used by this function; kept for backward compatibility
    :return:                        (True, distance) when at least one position belongs to a cluster,
                                    (False, 0) otherwise. distance is the value returned by
                                    calcDistanceWalked (labelled as meters in the plot title)
    """
    folder = "out/"
    plotDir = folder + "plots/Walking Test Analysis"

    R = 6371  # Radius of the earth in km

    # Convert to cartesian coordinates.
    # NOTE(review): cos/sin are applied to the coordinates as given. If lats/longs arrive in
    # degrees they are treated as radians here, which changes the real-world meaning of eps.
    # Confirm the unit with the caller before changing either the conversion or eps.
    cartesian = [(R * cos(lat) * cos(lon), R * cos(lat) * sin(lon), R * sin(lat))
                 for lat, lon in zip(lats, longs)]
    combined = np.array(cartesian, dtype=float).reshape(-1, 3)

    # dbscan returns one label per input point, but core_samples only holds the indices of the
    # core points. Zipping the two together would misalign (and truncate) labels and indices as
    # soon as a single point is not a core point, so every point index is paired with its own label.
    (_, labels) = dbscan(combined, eps=0.5)
    grouped = [(label, index) for index, label in enumerate(labels)]

    # Positions that belong to a cluster (noise is labelled -1)
    clusteredPositions = [(lats[index], longs[index], timestamps[index])
                          for (label, index) in grouped if label != -1]

    if not clusteredPositions:
        # No position belongs to a cluster, therefore we cannot perform any smoothing or distance calculation
        return False, 0

    # y: the latitudes, x: the longitudes, t: the timestamps
    y, x, t = zip(*clusteredPositions)
    x2, y2, newx2, newy2 = smooth(y, x, t)

    plt.plot(y2, x2, label="Linear Interpolation")
    plt.plot(newy2, newx2, label="Savgol Filter", color="r")
    distance = calcDistanceWalked(newy2, newx2)

    # group the point indices based on their label (groupby needs the input sorted on the same key)
    grouped.sort(key=itemgetter(0))
    clusters = {}
    clusterLabels = []
    for key, group in groupby(grouped, key=itemgetter(0)):
        clusterLabels.append(key)
        clusters[key] = [el[1] for el in group]

    noise = False
    colors = plt.get_cmap("Spectral")(np.linspace(0, 1, len(clusters)))
    size = 10
    alpha = 0.5
    lineWidth = 0.15
    for label in clusterLabels:
        latitudes = [lats[i] for i in clusters[label]]
        longitudes = [longs[i] for i in clusters[label]]
        if label == -1:
            # outliers are identified with a label of -1
            plt.plot(latitudes, longitudes, "o", markerfacecolor=almost_black, markeredgecolor=almost_black,
                     markersize=size, alpha=alpha, linewidth=lineWidth, label="Outlier")
            noise = True
        else:
            plt.plot(latitudes, longitudes, "o", markerfacecolor=colors[label], markeredgecolor=almost_black,
                     markersize=size, alpha=alpha, linewidth=lineWidth, label="Cluster %i" % (label + 1))

    # The noise "cluster" is not counted as a cluster in the title
    numberOfClusters = (len(clusters) - 1) if noise else len(clusters)
    plt.title("Timestamp: %s\n Number of clusters: %i\n Calculated distance: %i meters" % (
        timestmp, numberOfClusters, round(distance)))
    plt.xlabel("Latitude")
    plt.ylabel("Longitude")
    fancyPlot()
    writeToPdf(ID, plotDir)
    return True, distance
示例#2
0
def sleepAnalysis():
    """
    Analysis of the sleep duration of our participants per day.

    For every participant in basisPeak + fitBit the sleep measurements (mtype 7) between start and
    end are read from the database and combined with the day ratings returned by questionAnalysis:
      * every measurement is written to "<csvDir>/<ID>.csv" (columns: Date, Rating, Duration),
      * the summed duration per rated date is plotted and written with writeToPdf,
      * Kendall's tau between the duration of an entry and the rating of the next entry is printed,
      * an ordinal logistic regression (mord) is fitted and its coefficient is printed.
    """
    csvDir = "out/csv/Sleep Analysis"
    plotDir = "out/plots/Sleep Analysis"
    # rating -> (color, legend label); ratings that are not listed here are not plotted
    ratingStyles = {
        "Goed": ("g", "Good day"),
        "Gemiddeld": ("orange", "Average day"),
        "Slecht": ("r", "Bad day"),
    }
    try:
        os.makedirs(csvDir)
    except OSError:
        # Only ignore the error when the path already exists
        if not os.path.isdir(csvDir):
            raise

    for (ID, pid, device) in basisPeak + fitBit:
        query = {"pid": pid, "mtype": 7, "date": {"$gte": start, "$lt": end}}
        if device == "basispeak":
            measurements = list(db.measurements.find(query).sort([("date", 1)]))
        elif device == "fitbit":
            measurements = list(db.diaFitBit.find(query).sort([("date", 1)]))
        else:
            # e.g. "microsoftband": no measurements are read for other devices
            measurements = []

        # Rebuilt for every participant so that dates of a previous participant cannot leak in
        measurementDictionary = OrderedDict()
        keys = []
        if device == "basispeak":
            for key, group in groupby(measurements, lambda x: x["end"].strftime("%y-%m-%d")):
                # group the measurements based on date
                keys.append(key)
                measurementDictionary[key] = list(group)
        elif device == "fitbit":
            for measurement in measurements:
                measurement['end'] = measurement['date']
                # presumably converts minutes to hours (the plot axis is labelled in hours) - verify
                measurement['value'] /= 60.0
                date = measurement['date'].strftime("%y-%m-%d")
                keys.append(date)
                measurementDictionary[date] = [measurement]

        ratings = questionAnalysis(ID)
        sleepDurationPerDate = OrderedDict()
        with open(os.path.join(csvDir, ID + ".csv"), "w") as csvFile:
            writer = csv.writer(csvFile, delimiter=",")
            writer.writerow(("Date", "Rating", "Duration"))

            for date in np.union1d([el[1] for el in ratings], keys):
                filteredRatings = [el for el in ratings if el[1] == date]

                if date not in keys:
                    # This date had no measurement; use the rating of this very date (if any)
                    rating = filteredRatings[0][0] if filteredRatings else "-"
                    writer.writerow((date, rating, "-"))
                    continue

                for measurement in measurementDictionary[date]:
                    duration = round(measurement['value'], 2)
                    endStamp = measurement['end'].strftime("%y-%m-%d %H:%M:%S")
                    if filteredRatings and duration != 0:
                        # We found a rating for this date
                        rating = filteredRatings[0][0]
                        if date in sleepDurationPerDate:
                            # Add this measurement to the total of the date
                            (_, sleepDuration, dateObject) = sleepDurationPerDate[date]
                            sleepDurationPerDate[date] = (rating, sleepDuration + duration, dateObject)
                        else:
                            sleepDurationPerDate[date] = (rating, duration, measurement['date'])
                        writer.writerow((endStamp, rating, duration))
                    else:
                        # No rating found or a duration of zero, write "-" as rating
                        writer.writerow((endStamp, "-", duration))

        # The csv file is complete (and closed) at this point; continue with the plot
        alpha = 0.5
        lineWidth = 0.15
        s = 100
        X = []  # contains the measurements
        Y = []  # contains the ratings

        for date in sleepDurationPerDate:
            (rating, totalSleepDuration, dateObject) = sleepDurationPerDate[date]
            X.append(totalSleepDuration)
            Y.append(rating)
            style = ratingStyles.get(rating)
            if style is not None:
                (color, legendLabel) = style
                plt.scatter(dateObject, totalSleepDuration, s=s, color=color, alpha=alpha, linewidth=lineWidth,
                            edgecolor=almost_black, label=legendLabel)

        plt.gca().xaxis.set_major_formatter(DateFormatter('%y-%m-%d'))
        plt.gca().xaxis.set_major_locator(WeekdayLocator())
        plt.setp(plt.gca().xaxis.get_majorticklabels(), rotation=40)
        plt.xlabel("Date")
        plt.ylabel("Sleep duration (hours)")
        fancyPlot(dateLimit=True)
        writeToPdf(ID, plotDir)

        # Convert the ratings before ranking them: Kendall's tau on the raw strings would rank the
        # ratings alphabetically instead of by their numerical value.
        Y = [toNumerical(rating) for rating in Y]

        # Drop the last measurement and the first rating so that a measurement is paired with the
        # rating of the next entry
        (correlation, pvalue) = kendalltau(X[0:len(X) - 1], Y[1:])
        if -1 <= correlation <= 1:
            # the comparison is False for NaN, so nothing is printed without a valid correlation
            print("Sleep Analysis | ID: %s | pid : %s | correlation: %f, pvalue: %f" % (
                ID, pid, correlation, pvalue))

        X = np.array(X).reshape(len(X), 1)
        Y = np.array(Y)
        model = mord.OrdinalLogistic()
        try:
            result = model.fit(X, Y)
            # Coefficient of the first feature (only one feature here)
            print("Coefficient |  %s" % (result.coef_,))
        except Exception as ex:
            # Best effort: a participant whose data cannot be fitted must not stop the analysis
            print(ex)
def heartRateAnalysis():
    """
    Analysis of the heart rates of our participants per day.

    For every participant in basisPeak + fitBit the heart rate measurements (mtype 1) between
    start and end are read from the database and grouped per date. Per participant:
      * a boxplot of the measurements per date is written with writeToPdf,
      * median, average and standard deviation per rated date are written to "<csvDir>/<ID>.csv",
      * the "stable" heart rate (average of the lowest 5% of the measurements of a date) is
        plotted per rated date,
      * Kendall's tau between the stable heart rate of an entry and the rating of the next entry
        is printed and an ordinal logistic regression (mord) is fitted.
    """
    # heart rate has mtype 1
    folder = "out/"
    csvDir = folder + "csv/Heart Rate"
    plotDir = folder + "plots/Heart Rate plots"
    boxPlotDir = folder + "plots/Heart Rate boxplots"
    # rating -> (color, legend label); ratings that are not listed here are not plotted
    ratingStyles = {
        "Goed": ("g", "Good day"),
        "Gemiddeld": ("orange", "Average day"),
        "Slecht": ("r", "Bad day"),
    }

    try:
        os.makedirs(csvDir)
    except OSError:
        # Only ignore the error when the path already exists
        if not os.path.isdir(csvDir):
            raise

    for (ID, pid, device) in basisPeak + fitBit:
        query = {"pid": pid, "mtype": 1, "date": {"$gte": start, "$lt": end}}
        if device == "basispeak":
            measurements = list(db.measurements.find(query).sort([("date", 1)]))
        elif device == "fitbit":
            measurements = list(db.diaFitBitPatients.find(query).sort([("date", 1)]))
        else:
            # e.g. "microsoftband": the measurements for this device were not present in the
            # database while writing this code
            measurements = []

        # Rebuilt for every participant so that dates of a previous participant cannot leak in
        measurementDictionary = OrderedDict()
        keys = []
        for key, group in groupby(measurements, lambda x: x["date"].strftime("%y-%m-%d")):
            # group the measurements based on date; only integer values are kept
            # (an integer can never be NaN, so no separate NaN check is needed)
            keys.append(key)
            measurementDictionary[key] = [el for el in group if isinstance(el["value"], int)]

        # Create a list which contains a list with all the measurements of the same date
        measurementsPerDate = [[measurement["value"] for measurement in measurementDictionary[key]]
                               for key in keys]

        # The boxplot
        fig = plt.figure()
        plt.xlabel("Date")
        plt.ylabel("Heart rate")
        plt.title("ID: %s" % ID)
        bp = plt.gca().boxplot(measurementsPerDate, patch_artist=True)
        fig.autofmt_xdate()
        plt.xticks(np.arange(1, len(keys) + 1), keys, rotation=45)
        fancyBoxPlot(bp)
        writeToPdf(ID, boxPlotDir)

        ratings = questionAnalysis(ID)

        alpha = 0.5
        lineWidth = 0.15
        s = 100
        X = []  # contains the measurements
        Y = []  # contains the ratings
        intersectedDates = np.intersect1d([el[1] for el in ratings], keys)

        with open(os.path.join(csvDir, ID + ".csv"), "w") as csvFile:
            writer = csv.writer(csvFile, delimiter=",")
            writer.writerow(("Date", "Rating", "Median", "Average", "Standard Deviation"))

            for (rating, date) in ratings:
                if date not in intersectedDates:
                    continue
                dateMeasurements = measurementDictionary[date]
                if not dateMeasurements:
                    # every value of this date was filtered out, there is nothing to summarise
                    continue

                values = sorted(measurement["value"] for measurement in dateMeasurements)
                writer.writerow((
                    date,
                    rating,
                    np.median(values),
                    round(np.average(values)),
                    round(np.std(values), 2)
                ))

                # Average of the lowest 5% of the values. With fewer than 20 values the 5% slice
                # would be empty (and its average NaN), so at least one value is always used.
                lowestCount = max(1, int(len(values) * 0.05))
                stableHeartRate = np.average(values[0:lowestCount])

                Y.append(rating)
                X.append(stableHeartRate)

                dateObject = dateMeasurements[0]['date']
                # Normal plotting
                style = ratingStyles.get(rating)
                if style is not None:
                    (color, legendLabel) = style
                    plt.scatter(dateObject, stableHeartRate, s=s, color=color, alpha=alpha,
                                linewidth=lineWidth, edgecolor=almost_black, label=legendLabel)

        # The csv file is complete (and closed) at this point; finish the plot
        plt.gca().xaxis.set_major_formatter(DateFormatter('%y-%m-%d'))
        plt.gca().xaxis.set_major_locator(WeekdayLocator())
        plt.setp(plt.gca().xaxis.get_majorticklabels(), rotation=40)
        plt.xlabel("Date")
        plt.ylabel("Heart rate")
        fancyPlot(dateLimit=True)
        writeToPdf(ID, plotDir)

        # Convert the ratings before ranking them: Kendall's tau on the raw strings would rank the
        # ratings alphabetically instead of by their numerical value.
        Y = [toNumerical(rating) for rating in Y]

        # Drop the last measurement and the first rating so that a measurement is paired with the
        # rating of the next entry
        (correlation, pvalue) = kendalltau(X[0:len(X) - 1], Y[1:])
        if -1 <= correlation <= 1:
            # the comparison is False for NaN, so nothing is printed without a valid correlation
            print("Heart Rate Analysis | ID: %s | pid : %s | correlation: %f, pvalue: %f" % (
                ID, pid, correlation, pvalue))

        # Use Ordinal Regression, see Mord
        # https://en.wikipedia.org/wiki/Ordinal_regression
        X = np.array(X).reshape(len(X), 1)
        Y = np.array(Y)
        model = mord.OrdinalLogistic()
        # fit() calls threshold_fit(), which makes use of the optimize.minimize() method.
        # To prevent the "Desired error not necessarily achieved due to precision loss" message,
        # add the following parameter to this function: optimize.minimize(..., ..., method='Nelder-Mead')
        try:
            model.fit(X, Y)
        except ValueError as ex:
            # e.g. no rated date with measurements for this participant; keep going with the others
            print(ex)
示例#4
0
def energyAnalysis():
    """
    Analysis of the energy levels of our participants per day.

    For every ID in IDs the observations of the energy question between start and end are read,
    grouped per date and averaged. The average per rated date (ratings come from questionAnalysis)
    is plotted and written with writeToPdf. Afterwards Kendall's tau between the energy level of
    an entry and the rating of the next entry is printed and a logistic regression is fitted.
    """
    energyQuestionId = "HanWRjvZe8PiLvfD4"
    folder = "out/"
    plotDir = folder + "plots/Energy Analysis plots"
    # rating -> (color, legend label)
    ratingStyles = {
        "Goed": ("g", "Good day"),
        "Gemiddeld": ("orange", "Average day"),
        "Slecht": ("r", "Bad day"),
    }
    try:
        os.makedirs(plotDir)
    except OSError:
        # Only ignore the error when the path already exists
        if not os.path.isdir(plotDir):
            raise

    for ID in IDs:
        query = {"userId": ID, "questionId": energyQuestionId, "timestamp": {"$gte": start, "$lt": end}}
        experiments = list(db_mijnKwik.observations.find(query).sort([("timestamp", 1)]))

        X = []  # The measurements
        Y = []  # The ratings
        keys = []

        measurementDictionary = OrderedDict()
        ratings = questionAnalysis(ID)

        for key, group in groupby(experiments, lambda x: x["timestamp"].strftime("%y-%m-%d")):
            # group the measurements based on date
            keys.append(key)
            measurementDictionary[key] = list(group)

        plt.xlabel("Date")
        plt.ylabel("Energy Level")
        alpha = 0.5
        lineWidth = 0.15
        s = 100

        intersectedDate = np.intersect1d([el[1] for el in ratings], keys)
        for (rating, date) in ratings:
            if date not in intersectedDate:
                continue

            measurements = measurementDictionary[date]
            dateObject = measurements[0]['timestamp']
            energyRating = np.average([measurement['value'] for measurement in measurements])
            X.append(energyRating)
            Y.append(rating)

            style = ratingStyles.get(rating)
            if style is None:
                print("Something went wrong. The value of rating is  %s" % (rating,))
            else:
                (color, legendLabel) = style
                plt.scatter(dateObject, energyRating, s=s, color=color, alpha=alpha, linewidth=lineWidth,
                            edgecolor=almost_black, label=legendLabel)

        plt.gca().xaxis.set_major_formatter(DateFormatter('%y-%m-%d'))
        plt.gca().xaxis.set_major_locator(WeekdayLocator())
        plt.setp(plt.gca().xaxis.get_majorticklabels(), rotation=40)
        fancyPlot(dateLimit=True)
        writeToPdf(ID, plotDir)

        # To predict the next day, we need to drop certain elements in the list.
        # Lets look at an example.
        # X: 4,2,4,5      (The measurements)
        # Y: G,A,B,A      (The ratings)
        #
        # By dropping the last element of the "X" array
        # and dropping the first element of the "Y" array we get the following lists:
        # X: 4,2,4
        # Y: A,B,A
        #
        # This is exactly what we need!
        # PS: we need to make sure that all the dates are consecutive
        #
        # NOTE(review): Y holds the rating strings here, so they are ranked alphabetically by
        # kendalltau. Confirm whether an ordinal mapping of the ratings should be applied first.
        (correlation, pvalue) = kendalltau(X[0:len(X) - 1], Y[1:])
        if not math.isnan(correlation) and not math.isnan(pvalue):
            print("Energy Analysis | ID: %s | correlation: %f, pvalue: %f" % (ID, correlation, pvalue))

        X = np.array(X)
        # Because energy rating is nominal data, we can use normal logistic regression
        logic = LogisticRegression()
        try:
            logic.fit(X.reshape(len(X), 1), Y)
        except ValueError as ex:
            # e.g. no rated date with observations, or only a single rating class for this
            # participant; keep going with the remaining participants
            print(ex)
        else:
            print(logic.coef_)  # Coefficient of the first feature (only one feature here)