args = cmd_parser.parse_args() print "Slurping data from %s, writing to %s" % (args.datadir, args.outfile) files = sorted(os.listdir(args.datadir)) print "Iterating over datasets. This might take a while." ensemble_all = np.empty([datatool.secs_per_day()]) counter_all = np.empty([datatool.secs_per_day()]) ensemble_weekend = np.empty([datatool.secs_per_day()]) counter_weekend = np.empty([datatool.secs_per_day()]) ensemble_weekday = np.empty([datatool.secs_per_day()]) counter_weekday = np.empty([datatool.secs_per_day()]) with pb.ProgressBar(maxval=len(files)) as progress: for idx, file in enumerate(files): freqdata = datatool.load_data_as_dataframe(os.path.join(args.datadir, file)) weekend = False if freqdata.iloc[0].weekday in set([5, 6]): weekend = True for i, row in freqdata.iterrows(): ensemble_all[row.s_since_midnight] += row.freq counter_all[row.s_since_midnight] += 1 if weekend: ensemble_weekend[row.s_since_midnight] += row.freq counter_weekend[row.s_since_midnight] += 1 else: ensemble_weekday[row.s_since_midnight] += row.freq counter_weekday[row.s_since_midnight] += 1 progress.update(idx) ensemble_all = ensemble_all / counter_all
# vim:fileencoding=utf-8 import matplotlib.pyplot as plt import numpy as np import freqanalysis.ecdf as ecdf import freqanalysis.datatools as datatool from scipy.stats import ks_2samp datasetfile = "datasets/20140904-export.txt" print "loading ", datasetfile df = datatool.load_data_as_dataframe(datasetfile) print "Calculating ECDF of all values" all_series, yvals = ecdf.get_ecdf(df['freq']) print "Plotting graph" ecdf.plot_ecdf_curve(all_series, yvals, color="b", label="Alle Werte") df['minute'] = df.time.apply(lambda x: x.minute) hour_df = df[(df.minute >= 58) | (df.minute <= 5)] hour_series, yvals = ecdf.get_ecdf(hour_df['freq']) ecdf.plot_ecdf_curve(hour_series, yvals, color="r", label="Stundenwechsel") not_hour_df = df[(df.minute < 58 ) & (df.minute > 5)] not_hour_series, yvals = ecdf.get_ecdf(not_hour_df['freq']) ecdf.plot_ecdf_curve(not_hour_series, yvals, color="y", linestyle="-", label="unter der Stunde") print "Null hypothesis: the two samples are drawn from the same continuous distribution." D, p_value = ks_2samp(all_series, hour_series)
# vim:fileencoding=utf-8 import matplotlib.pyplot as plt import numpy as np from scipy.stats import norm from scipy.stats import ks_2samp from scipy.stats.mstats import normaltest import freqanalysis.datatools as datatool import freqanalysis.normdist as nd datasetfile = "datasets/20140904-export.txt" print "loading ", datasetfile df = datatool.load_data_as_dataframe(datasetfile) df['minute'] = df.time.apply(lambda x: x.minute) hour_df = df[(df.minute >= 58) | (df.minute <= 5)] not_hour_df = df[(df.minute < 58) & (df.minute > 5)] f, (ax1, ax2, ax3) = plt.subplots(3, sharex=True, sharey=True) # pandas/matplotlib incompatibility: http://stackoverflow.com/a/22764377 nd.plot_fit(df['freq'].values, ax1, "Alle\ Werte") nd.plot_fit(hour_df['freq'].values, ax2, "Nur\ Stundenwechsel") nd.plot_fit(not_hour_df['freq'].values, ax3, "Kein\ Stundenwechsel") f.savefig("images/normdistrib.png", bbox_inches='tight') print print "Executing KS-Test: is the data normally distributed?" mu, std = norm.fit(df['freq']) refdist = np.random.normal(mu, std, 100000) D, p_value = ks_2samp(df['freq'].values, refdist) if p_value < 0.01: print "Rejecting null hypothesis - the two distributions differ significantly. p = %.4f" % p_value
import freqanalysis.datatools as datatool import numpy as np import sys as sys import argparse import os cmd_parser = argparse.ArgumentParser() cmd_parser.add_argument("datafile", help="the csv containing the frequency measurements") cmd_parser.add_argument("outfile", help="HDF+ file to create") args = cmd_parser.parse_args() print "Slurping the CSV-file %s, writing to %s" % (args.datafile, args.outfile) print "Loading datasets. This might take a while." alldata = datatool.load_data_as_dataframe(args.datafile) print "Selecting all friday data for comparison." # select the friday 8:00 to 11:00 UTC datasets from the alldata frame fridays = alldata[alldata.weekday == 4] fridaydata = fridays[(fridays.hour > 7) & (fridays.hour < 11)] print "Selecting eclipse data" eclipsedata = alldata[(alldata.unix >= 1426838400) & (alldata.unix < 1426849200)] with pd.get_store(args.outfile) as store: store['eclipsedata'] = eclipsedata store['fridaydata'] = fridaydata