Пример #1
0
 def test_zs(self):
     """Compare ``stats.zs(self.testcase)`` with precomputed z-scores.

     The function is not in R, so the reference values were obtained from
     (testcase[i] - mean(testcase, axis=0)) / sqrt(var(testcase) * 3/4).
     """
     expected_scores = [
         -1.3416407864999,
         -0.44721359549996,
         0.44721359549996,
         1.3416407864999,
     ]
     # Agreement is required to 12 decimal places.
     assert_array_almost_equal(
         expected_scores, stats.zs(self.testcase), decimal=12)
Пример #2
0
 def standardize(self, data):
     """Return a copy of ``data`` with every column replaced by its z-scores.

     ``data`` is deep-copied and each column of the copy is overwritten with
     the z-scores of the matching input column; ``data`` itself is not
     modified.  The standardized copy is printed to stdout and then returned.

     NOTE(review): the ``newdata[:, i]`` assignment presumably requires a 2-D
     numpy array; with an integer array the z-scores would be truncated on
     assignment -- confirm that callers pass floating-point data.
     """
     import scipy.stats as st

     # Current SciPy releases no longer provide ``zs``; ``zscore`` yields the
     # same population (ddof=0) z-scores for a 1-D column.  Fall back to
     # ``zs`` so that old installations keep working.
     zscore = getattr(st, "zscore", None) or st.zs

     newdata = copy.deepcopy(data)
     for i, col in enumerate(zip(*data)):
         newdata[:, i] = zscore(col)
     # Single-argument call form behaves the same on Python 2 and Python 3.
     print(newdata)
     return newdata
Пример #3
0
    def standardize(self, data):
        """Return a z-scored copy of ``data``, standardizing column by column.

        A deep copy of ``data`` is made and every column of that copy is
        overwritten with the z-scores of the corresponding input column.
        The input is left untouched; the result is printed to stdout and
        returned.

        NOTE(review): ``newdata[:, i]`` presumably needs a 2-D numpy array,
        and an integer array would truncate the z-scores on assignment --
        confirm that callers pass floating-point data.
        """
        import scipy.stats as st

        # ``zs`` is missing from current SciPy; ``zscore`` computes the same
        # population (ddof=0) z-scores for 1-D input.  Keep ``zs`` as a
        # fallback for old installations.
        zscore = getattr(st, "zscore", None) or st.zs

        newdata = copy.deepcopy(data)
        for i, col in enumerate(zip(*data)):
            newdata[:, i] = zscore(col)
        # Single-argument call form works on both Python 2 and Python 3.
        print(newdata)
        return newdata
Пример #4
0
 def test_zs(self):
     """Verify ``stats.zs`` on ``self.testcase`` against reference values.

     Not available in R, so the expected numbers come from
     (testcase[i] - mean(testcase, axis=0)) / sqrt(var(testcase) * 3/4).
     """
     reference = [-1.3416407864999, -0.44721359549996]
     reference += [0.44721359549996, 1.3416407864999]
     computed = stats.zs(self.testcase)
     # Values must agree to 12 decimal places.
     assert_array_almost_equal(reference, computed, decimal=12)
Пример #5
0
def residual_nllf(v):
    """Compute the z-scores of ``v`` with ``stats.zs``.

    NOTE(review): this definition appears truncated in the visible source:
    the comment below stops mid-sentence, the z-scores are only bound to the
    local ``z`` and nothing is returned.  Recover the rest of the body
    before relying on it.
    """
    # Normalize the scores so that we are assuming
    z = stats.zs(v)
# Script fragment: z-scores the trailing rows of two columns of `base`, fits
# an OLS regression of one on the other and inspects the residuals.
# NOTE(review): `base`, `atvA`, `atvB`, `zs`, `sm`, `ts` and `statistics` are
# not defined in the visible source -- presumably a pandas DataFrame, two
# column names, a z-score helper and statsmodels/stdlib imports; confirm.
period = 350  # number of trailing rows to use (negated below for slicing)
_df, _adf = .1, '10%'  # _df is compared with adf[1] below; _adf is unused in the visible code
desvio = 2  # multiplier applied to the residual standard deviation below

# Locate the positions of the columns named atvA and atvB (both default to 0).
yi, xi = 0, 0
# NOTE(review): the range stops at shape[1] - 1, so the last column is never
# examined -- confirm that this is intended.
for i in range(base.shape[1] - 1):
    yn = base.iloc[:, i].name
    xn = base.iloc[:, i].name  # same column name as yn
    if yn == atvA:
        yi = i
    if xn == atvB:
        xi = i

# Negate so that `period:` slices the last 350 rows.
period = period * -1
y = base.iloc[period:, yi].values
y = zs(y)
x = base.iloc[period:, xi].values
x = zs(x)
ynn = base.iloc[:, yi].name
xnn = base.iloc[:, xi].name

# Regress y on x with an added constant term.
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()
# NOTE(review): presumably statsmodels' adfuller with maxlag=1 -- confirm.
adf = ts.adfuller(model.resid, 1)
std = statistics.stdev(model.resid)

# Signal when the last residual lies more than `desvio` standard deviations
# from zero in either direction.
if model.resid[-1] > desvio * std or model.resid[-1] < desvio * std * -1:
    print('Pode Operar!')
print(ynn, xnn)

# NOTE(review): the body of the following `if` is missing from the visible
# source; adf[1] is presumably the test's p-value -- confirm.
if adf[1] < _df:
def main():
    """Rebuild the ``gaps_by_frequency`` table from the "_top50" series files.

    The table is dropped and recreated.  Then, for each of the ``top`` input
    files, the per-row frequencies are turned into z-scores; every row whose
    z-score and frequency exceed the thresholds below (and which is not one
    of the first three rows) is reported on stdout and inserted into the
    table.  After each file the function waits for the user to press Enter.
    """
    top = 50  # max: 8921 with filters

    zscore_threshold = 3.0  # statistically significant gap if z-score >= 3 std
    min_freq = 15.  # don't consider it a gap if it is shorter than 2 weeks

    # NOTE(review): connection settings (including the password) are
    # hard-coded; they should come from configuration or the environment.
    database = "calorie_king_social_networking_2010"
    server = "tarraco.chem-eng.northwestern.edu"
    user = "******"
    passwd = "n1ckuDB!"

    db = Connection(server, database, user, passwd)

    db.execute("DROP TABLE IF EXISTS gaps_by_frequency")  # remove the old table

    db.execute(
        """                      
       CREATE TABLE gaps_by_frequency
       (
         file_index     INT,
         ck_id          CHAR (20),
         start_date     INT,
         end_date       INT,
         start_day      INT,
         end_day        INT,
         days_gap       INT,        
         zscore_gap     FLOAT
       )
     """
    )

    # Parameterized statement: the driver quotes the values.  The previous
    # string-built SQL never closed the VALUES parenthesis and left the text
    # values unquoted.
    insert_gap_sql = (
        "INSERT INTO gaps_by_frequency (file_index, ck_id, start_date, "
        "end_date, start_day, end_day, days_gap, zscore_gap) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")

    # Current SciPy releases no longer provide stats.zs; stats.zscore gives
    # the same population (ddof=0) z-scores for a flat list.  Fall back to
    # stats.zs so that old installations keep working.
    zscore = getattr(stats, "zscore", None) or stats.zs

    for index_file in range(1, top + 1):
        # input file:
        file_name = ("temporal_series/most_weigh_ins/weigh_in_time_serie_days"
                     + str(index_file) + "_top50")

        list_dates = []
        list_days = []
        list_frequencies = []
        ck_id = None

        with open(file_name + ".dat", 'r') as series_file:
            # skip the first line: it has no associated frequency
            next(series_file, None)
            for line in series_file:
                fields = line.split(" ")
                print(line)

                if len(fields) > 10:
                    ck_id = fields[10].strip()

                # A row without a frequency column counts as frequency 0.
                # This is what the old ``except IndexError`` branch meant to
                # do, but it could never run because ck_id was read first.
                if len(fields) > 9:
                    list_frequencies.append(float(fields[9]))  # frequency
                else:
                    list_frequencies.append(0.0)
                list_days.append(float(fields[4]))  # relative day
                list_dates.append(fields[7])  # dates

        print(list_dates)
        print("\n\n")

        list_zscores = zscore(list_frequencies)

        # Loop-invariant summary line, built once per file.
        freq_array = np.asanyarray(list_frequencies)
        summary = "\non file %s mean freq: %s std: %s" % (
            index_file, freq_array.mean(axis=0),
            freq_array.std(axis=0, ddof=0))

        for i, z in enumerate(list_zscores):
            # i > 2: ignore gaps that happen at the very first measurements
            if z >= zscore_threshold and list_frequencies[i] > min_freq and i > 2:
                print("on file %s between days: %s - %s there is a gap. "
                      "freq: %s zscore: %s" % (
                          index_file, list_days[i - 1], list_days[i],
                          list_frequencies[i], z))

                time_gap = list_days[i] - list_days[i - 1]

                db.execute(
                    insert_gap_sql, index_file, ck_id, list_dates[i - 1],
                    list_dates[i], list_days[i - 1], list_days[i], time_gap,
                    float(z))

            # NOTE(review): this summary is printed once per frequency entry;
            # it was probably meant to be printed once per file.
            print(summary)

        # Pause until the user presses Enter (Python 2 builtin).
        raw_input()
Пример #8
0
#!/usr/bin/env python

from numpy import array
from scipy.stats import zs

import csv

# Read tab-separated (id, value) rows, z-score the values over the whole
# column and write (id, z-score) rows to a second tab-separated file.
# Files are opened in binary mode, which is what the Python 2 csv module
# expects; the context managers make sure both handles are closed (and the
# output flushed) even if a row fails to parse.
input_path = "K562_H3K27me_mono-tri.txt"
output_path = "K562_H3K27me_mono-tri.zscored.tester.txt"

ids = []
vals = []
with open(input_path, "rb") as infile:
    # Every row must have exactly two fields; anything else raises ValueError.
    for thisid, val in csv.reader(infile, delimiter="\t"):
        ids.append(thisid)
        vals.append(float(val))

newvals = zs(array(vals))

with open(output_path, "wb") as outfile:
    writer = csv.writer(outfile, delimiter="\t")
    for thisid, thisval in zip(ids, newvals):
        writer.writerow((thisid, thisval))
Пример #9
0
def residual_nllf(v):
    """Compute the z-scores of ``v`` with ``stats.zs``.

    NOTE(review): the visible definition looks cut off: the comment below
    ends mid-sentence, ``z`` is never used and nothing is returned.  The
    remainder of the body needs to be recovered before this can be used.
    """
    # Normalize the scores so that we are assuming
    z = stats.zs(v)
def main():
    """Rebuild the ``gaps_by_frequency`` table from the "_filters" series files.

    The table is dropped and recreated.  Then, for each of the ``top`` input
    files, the per-row frequencies are turned into z-scores; every row whose
    z-score reaches ``zscore_threshold`` and whose frequency exceeds
    ``min_freq`` is inserted into the table together with the file's average
    frequency.  A mean/std summary is printed for each file.
    """
    # max: 8924 for the files with filters (>=10 days, >=10 weigh-ins,
    # >= 1/30 weigh-ins per day); max: 50 for the top50 longest time series
    # (no filter)
    top = 8924

    zscore_threshold = 1.  # a gap is recorded if its z-score is >= this value

    min_freq = 10.  # to consider something a gap

    # NOTE(review): connection settings (including the password) are
    # hard-coded; they should come from configuration or the environment.
    database = "calorie_king_social_networking_2010"
    server = "tarraco.chem-eng.northwestern.edu"
    user = "******"
    passwd = "n1ckuDB!"

    db = Connection(server, database, user, passwd)

    db.execute("DROP TABLE IF EXISTS gaps_by_frequency")  # remove the old table

    # create a new table in an existing DB
    db.execute(
        """                      
       CREATE TABLE gaps_by_frequency
       (
         file_index     INT,
         ck_id          CHAR (36),           
         index_start_day      INT,
         index_end_day        INT,
         start_day      INT,
         end_day        INT,
         days_gap       INT,        
         zscore_gap     FLOAT,
         average_freq     FLOAT
       
        
       )
     """
    )

    insert_gap_sql = """
                        INSERT INTO gaps_by_frequency (file_index, ck_id,  start_day, end_day, index_start_day, index_end_day, days_gap, zscore_gap, average_freq)
                        VALUES (%s, %s, %s, %s,%s, %s, %s, %s, %s)
                        """

    # Current SciPy releases no longer provide stats.zs; stats.zscore gives
    # the same population (ddof=0) z-scores for a flat list.  Fall back to
    # stats.zs so that old installations keep working.
    zscore = getattr(stats, "zscore", None) or stats.zs

    for index_file in range(1, top + 1):
        print("\n\n%s" % index_file)

        # input file:
        # NOTE: in this file the day (relative to the first one) is column 4,
        # not column 0.
        file_name = ("temporal_series/most_weigh_ins/weigh_in_time_serie_days"
                     + str(index_file) + "_filters")

        list_days = []
        list_frequencies = []
        ck_id = None

        with open(file_name + ".dat", 'r') as series_file:
            # skip the first line: it has no associated frequency
            next(series_file, None)
            for line in series_file:
                fields = line.split(" ")

                if len(fields) > 8:
                    ck_id = fields[8].strip("\n")

                # A row without a frequency column counts as frequency 0.
                # This is what the old ``except IndexError`` branch meant to
                # do, but it could never run because ck_id was read first.
                if len(fields) > 7:
                    list_frequencies.append(float(fields[7]))  # frequency
                else:
                    list_frequencies.append(0.0)
                # day relative to the sign-up date
                list_days.append(float(fields[4]))

        average_freq = np.mean(list_frequencies)

        list_zscores = zscore(list_frequencies)

        # NOTE (original author): list_zscores[0] (and list_frequencies[0])
        # corresponds to the difference between the first and the second
        # entry of list_days, so there is an offset of one between the
        # indices of the two lists.
        for i, z in enumerate(list_zscores):
            # Written as a negated conjunction so that NaN z-scores are
            # skipped, exactly as with the plain ">=" test.
            if not (z >= zscore_threshold and list_frequencies[i] > min_freq):
                continue

            if i >= 1:
                start_day = list_days[i - 1]
                print("    between days: %s - %s there is a gap. freq: %s "
                      "zscore: %s average freq:  %s %s on file %s" % (
                          start_day, list_days[i], list_frequencies[i], z,
                          average_freq, ck_id, index_file))
            else:
                # Very first point: list_days[-1] would silently wrap around
                # to the last element, so the gap is measured from day 0.
                start_day = 0

            time_gap = list_days[i] - start_day

            # note: the index of the point for the days is i + 1, because i
            # refers to the (0-based) series of frequencies
            db.execute(
                insert_gap_sql, str(index_file), str(ck_id), str(start_day),
                str(list_days[i]), i, i + 1, str(time_gap), str(z),
                str(average_freq))

        freq_array = np.asanyarray(list_frequencies)
        print("on file %s mean freq: %s std: %s" % (
            index_file, freq_array.mean(axis=0),
            freq_array.std(axis=0, ddof=0)))