import csv
from collections import Counter

# Imports assumed for these helper functions (the snippet shows no import block):
# NeighborGraph and readData are the project modules used below.
import NeighborGraph
from readData import read_data


def checkDataOrientation(experiments):
  # This function reads through every produced training file and prints the number of positive and
  # negative cells. It's old, from back when I was still getting orientation wrong; its purpose is to
  # check that indexing agrees between my data and the original data.
  # If the specified data hasn't been built, read_data will raise an error.

  for experiment in experiments:
    print experiment
    S_train, S_test = read_data(*experiment)
    cells,adj = S_train
    maxImageNum = max([int(x['id'][0]) for x in cells])  # highest image counter present (sanity check only; unused below)
    #check balance
    pos = len([x for x in cells if x['class'] != 'null'])
    neg = len([x for x in cells if x['class'] == 'null'])
    print 'pos:',pos
    print 'neg:',neg
    #pick an arbitrary image
    arbImageNum = cells[0]['id'][0]
    arbImageCells = [x for x in cells if x['id'][0] == arbImageNum]
    dataFile = arbImageCells[0]['id'][3][0] # grab the original data file this image's cells come from
                                            # id structure is (imageCounter,row,col,(dataFileName,timeStamp))
    print dataFile
    with open(dataFile) as f2:                          # read the parameter data for this image
      c = csv.reader(f2,dialect = 'excel-tab')
      Juancells = [x for x in c]
    for cell in arbImageCells:
      myx = cell['id'][1]   # these indices should be zero-based
      myy = cell['id'][2]
      juanx = myx+1         # Juan's indexing is 1-based
      juany = myy+1
      # Look up the matching row in Juan's file; this raises an IndexError if no such cell exists,
      # which is exactly the orientation/indexing mismatch this check is meant to catch.
      juanCell = [x for x in Juancells if x[0] == str(juanx) and x[1] == str(juany)][0]
    print
def checkDatasetNeighbors(experiments):
  # This function checks the number of temporal neighbors each cell has and prints the distribution.

  # If the specified data hasn't been built, read_data will raise an error.
  for experiment in experiments:
    print experiment
    S_train, S_test = read_data(*experiment)
    cells,adj = S_train
    allNeighbors = [len(adj[cell['id']]) for cell in cells] # this is all neighbors
    c = Counter(allNeighbors)
    print c
    print
def checkDatasetSizes(experiments):
  # This function checks the specified dataset structures and prints the number of training cells and
  # the number of entries in the adjacency structure. These two numbers should be equal.

  # If the specified data hasn't been built, read_data will raise an error.
  for experiment in experiments:
    print experiment
    S_train,S_test = read_data(*experiment)
    cells, adj = S_train
    print len(cells)
    print len(adj)
    print
def stuff2():
  print 'looking at the value distributions of the different parameters'
  S_train,S_test = read_data('AR',neighborhood='rook',dataset='1DAY',waves=['0094'],balanceOption = 'Mirror')
  G = NeighborGraph.NeighborGraph(S_train)

  for p in G.F: # for each parameter
    l = [pix[p] for pix in G.cells] # grab the pixels
    print p
    print 'Max:',max(l)
    print 'Min:',min(l)
    print 'Avg:',float(sum(l))/len(l)
    print 'Most Common:',Counter(l).most_common(10)
    print
def checkDatasetSplits(experiments):
  # This function checks the specified dataset structures and prints the number of _distinct values_
  # for each parameter; this shows how large the decision space for split selection is.

  # If the specified data hasn't been built, read_data will raise an error.

  for experiment in experiments:
    print experiment
    summ = 0
    S_train,S_test = read_data(*experiment)
    cells, adj = S_train
    print 'numcells: ', len(cells)
    for p in ['P1','P2','P3','P4','P5','P6','P7','P8','P9','P10']:
      s = set()                # a set keeps only the distinct values of this parameter
      for cell in cells:
        s.add(cell[p])
      print p,': ',len(s)
      summ+=len(s)
    print 'total:',summ
    print
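
# Hypothetical driver (not part of the original file): each experiment is the positional argument
# tuple that read_data expects, i.e. (event, neighborhood, dataset, waves, balanceOption), mirroring
# calls such as read_data('AR', neighborhood='rook', dataset='1DAY', waves=['0094'], balanceOption='Mirror')
# used elsewhere in these examples.
if __name__ == '__main__':
  experiments = [('AR', 'rook', '1DAY', ['0094'], 'Mirror')]
  checkDatasetSizes(experiments)      # cell count should equal the adjacency entry count
  checkDatasetNeighbors(experiments)  # distribution of neighbor counts per cell
  checkDatasetSplits(experiments)     # number of distinct values per parameter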
Example #6
from readData import read_data

e = 'AR'
n = 'rook'
d = '1DAY'
w = ['0171']
b = 'Mirror'

# This script takes a single experiment's data set and produces an ARFF file for use with WEKA.
# Extending it to handle multiple experiments at a time (producing multiple ARFF files) would be a
# good idea; a rough sketch of such an extension follows make_arff below.

S_train, S_test = read_data(e, neighborhood=n, dataset=d, waves=w, balanceOption=b)

def make_arff():
    filename = '_'.join([e, d, '-'.join(w), b, 'train']) + '.arff'
    relation_name = 'derp'
    feature_names = ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10']
    classes = [e, 'null']
    cells, adj = S_train
    cells2, adj2 = S_test
    with open(filename, 'w') as f:
        f.write('@relation ' + str(relation_name) + "\n\n")

        for n in feature_names:
            f.write("@attribute " + str(n) + " numeric\n")

        s = ','.join(classes)
        f.write("@attribute label {" + s + "}\n\n")

        f.write('@data\n')
        # Write one data row per training cell: the ten parameter values followed by the class label.
        # (This assumes the cell dict structure used throughout this file: numeric 'P1'..'P10' keys
        # plus a 'class' key whose value is either the event type or 'null'.)
        for cell in cells:
            f.write(','.join([str(cell[p]) for p in feature_names] + [str(cell['class'])]) + '\n')
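

# A rough sketch of the multi-experiment extension suggested above (an assumption, not part of the
# original script): loop over experiment tuples and write one ARFF file per experiment, reusing
# read_data and the same cell structure ('P1'..'P10' plus 'class') assumed by make_arff.
def make_arff_for_experiments(experiments):
    feature_names = ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10']
    for event, nbhd, ds, wvs, bal in experiments:
        train, test = read_data(event, neighborhood=nbhd, dataset=ds, waves=wvs, balanceOption=bal)
        cells, adj = train
        filename = '_'.join([event, ds, '-'.join(wvs), bal, 'train']) + '.arff'
        with open(filename, 'w') as f:
            f.write('@relation ' + filename[:-5] + '\n\n')
            for p in feature_names:
                f.write('@attribute ' + p + ' numeric\n')
            f.write('@attribute label {' + ','.join([event, 'null']) + '}\n\n')
            f.write('@data\n')
            for cell in cells:
                f.write(','.join([str(cell[p]) for p in feature_names] + [str(cell['class'])]) + '\n')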
def testing():
#  events = ['FL','SG','AR','CH']
  events = ['SG']
#  events = ['FL','SG','FI','CH','AR','SS']
#  neighborhoods = ['rook','queen','rooktemp']
  neighborhoods = ['rook','rooktemp','rooktemplong']
  datasets = ['3DAYDEMO']

  thetas = [0.7] # theta is the classification parameter

#  grid = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
#  alphas = [x for x in itertools.product(grid,grid) if sum(x) <= 1]# alpha is the parameter that weights the entropy and neighbor coherence heuristics
  alphas = [(0.3,0.3),(0.6,0.3),(0.3,0.6)]

  NeighborFunctions = ["spatial","temporal"]


  splitRes = 10000000 # splitRes affects how many splits are considered among each parameter at each tree node
  c_0vals = [100]  # c_0val controls the minimum leaf size: c_0 is set to roughly 1/c_0val of the total number of cells in the dataset
  waves = [['0193'],['0171'],['0094']]
#  balances = ['Mirror','Duplication','Random']
  balances = ['Mirror']
  X = [x for x in itertools.product(events,neighborhoods,datasets,waves,balances,c_0vals)]
  Y = [x for x in itertools.product(alphas,thetas)]
  results = []
  for e,n,d,w,b,cv in X:
   S_train,S_test = read_data(e,neighborhood = n, dataset = d, waves = w,balanceOption = b)
   cells, adj = S_train
   cells2, adj2 = S_test
   c_0 = max(len(cells)//cv,1)
   for alpha,theta in Y:
    t1 = time.time()
    if sum(alpha) > 1:
     raise Exception('invalid alpha')
    treefile = "temp.tree"
    TP,FP,TN,FN = SDT.sdt_learn(S_train, S_test, alpha, c_0, theta, splitRes, NeighborFunctions)
    ACC = 100.0*(TP+TN)/(TP+TN+FP+FN)  # float arithmetic so Python 2 integer division doesn't truncate the metrics
    if TP+FP != 0:
      PREC = 100.0*TP/(TP+FP)
    else:
      PREC = 0
    if TP+FN != 0:
      REC = 100.0*TP/(TP+FN)
    else:
      REC = 0
    F1 = 0 if PREC+REC == 0 else (2*PREC*REC)/(PREC+REC)
    summ = 0
    for p in ['P1','P2','P3','P4','P5','P6','P7','P8','P9','P10']: # for each parameter
      s = set()                                                    # build a set of all of the values of that parameter
      for cell in cells:                                           # the set data structure will eliminate duplicates
        s.add(cell[p])                                             # so its length is the number of distinct values for that 
      summ+=len(s)                                                 # parameter
    splits = summ                                                  # add all these together to get the number of potential 
                                                                   # splits in the dataset
    t2 = time.time()
    runTime = t2-t1
    
    strr = "".join([
                  'event:',str(e),'\n',
                  'neighborhood:',str(n),'\n',
                  'numcells: ', str(len(cells)),'\n',
                  'wave:',str(w),'\n',
                  'dataset:',str(d),'\n',
                  'alpha:',str(alpha),'\n',
                  'c_0:',str(c_0),'\n',
                  'theta:',str(theta),'\n',
                  'balance:',str(b),'\n',
                  'splits:',str(splits),'\n',
                  'Runtime:',str(runTime),'\n',
                  'F1:',str(F1),'\n',
                  ' ','\n',
                  ' ','\n',
                  ' ','\n'])
    logging.debug(strr)
    parameters = (n,w,alpha)
    results.append((parameters,F1,runTime))
    
  with open('TestTimeNeighborhoodSize.results','wb') as f:
    cPickle.dump(results,f)
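
# The pickled results can be read back later with, for example:
#   with open('TestTimeNeighborhoodSize.results','rb') as f:
#     results = cPickle.load(f)
# where each entry is ((neighborhood, wave, alpha), F1, runTime) as appended above.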
def testing():
    timeout = 0  # if timeout is zero, no timeout will be triggered; otherwise the code will cut off the runtime of the experiment at this many seconds
    skipTOs = False  # if this is true, the script will skip runs that previously timed out; otherwise they will be re-run (and potentially time out again)
    pooling = False  # if this is true, the runs will be parallelized with Python's Pool tool; it's faster,
    # especially for large run sets, but debugging errors is easier when it's off

    # --------------------
    # setting parameters
    # --------------------
    #  events = ['FL','SG','AR','CH']
    events = ['AR']
    #  events = ['SG']
    #  events = ['AR','SG']
    #  events = ['FL','SG','FI','CH','AR','SS']
    #  neighborhoods = ['rook','queen','rooktemp']
    neighborhoods = ['rook']
    #  datasets = ['1DAY','3DAYDEMO']
    datasets = ['1DAY']
    #  waves = [['0193'],['0171'],['0094']]
    waves = [['0193']]
    #  balances = ['Mirror','Duplication','Random']
    balances = ['Mirror']

    c_0vals = [100]  # c_0val is the _proportion_ of total values set as the minimum leaf size
    # a value of 100 means that minimum leaf size is roughly 1/100 of the total number of cells in the dataset
    # the actual training parameter needs to be calculated from this value and the dataset

    alphas = [(0.6,)]
    NeighborFunctions = ["none"]
    #  grid = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
    #  alphas = [x for x in itertools.product(grid,grid) if sum(x) <= 1]# alpha is the parameter that weights the entropy and neighbor coherence heuristics
    #  NeighborFunctions = ["spatial","temporal"]

    #  alphas = [(x) for x in grid]# alpha is the parameter that weights the entropy and neighbor coherence heuristics
    #  NeigborFunctions = ["none"]

    thetas = [0.7]  # theta is the classification parameter

    splitRes = 50
    # splitRes specifies how many splits are considered among each parameter at each tree node these
    # splits are selected uniformly according to value distribution (e.g. with 100 splits you get 0th percentile value, 
    # 1st percentile value, 2nd percentile value, etc.)
    # a very high value (e.g. 10,000,000) will cause the training to evaluate every potential split.
    # once every potential split is being evaluated, increasing splitRes won't affect the result or runtime of the algorithm.
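    # Illustration (an assumption about how the splits are picked, based on the description above):
    # with splitRes = 4 and sorted parameter values [1, 2, 3, 5, 8, 13, 21, 34], the candidate
    # thresholds would be roughly the 0th, 25th, 50th and 75th percentile values (1, 3, 8, 21)
    # rather than all eight distinct values.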


    experiment_data = [x for x in itertools.product(events, neighborhoods, datasets, waves, balances)]
    experiment_data_and_c0_vals = [x for x in itertools.product(experiment_data, c_0vals)]

    trainingParamSets = [x for x in itertools.product(alphas, thetas)]
    #  trainingParamSets.append(((0.0,0.0),1.1))# add baseline experiment
    #  trainingParamSets.append(((0.0,),1.1))# add baseline experiment
    # this value of alpha/theta will result in a pure entropy-based classification tree

    # -----------------------
    # end parameter setup
    # -----------------------

    for experiment, cv in experiment_data_and_c0_vals:
        S_train, S_test = read_data(*experiment)
        cells = S_train[0]
        cells2 = S_test[0]
        c_0 = max(len(cells) // cv, 1)

        # if not os.path.exists(treefile) or treefile == 'temp.tree':
        #     g = NeighborGraph(s_train)
        #     tree = sdt_train(g, alpha, c_0, split_res, neighbor_functions)
        #     with open(treefile, 'w') as f:
        #         tree.save(f)
        # else:
        #     print 'tree exists'
        #     tree = TreeNode('dummy')
        #     with open(treefile) as f:
        #         tree.load(f)

        e, n, d, w, b = experiment
        fileBase = "results/" + "_".join([str(e), str(n), str(d), "-".join(w), 'c_0=' + str(c_0), 'balance=' + str(b)])
        constants = [(experiment, cv, S_train, S_test, c_0, splitRes, NeighborFunctions, fileBase, skipTOs, timeout)]
        runs = [x for x in itertools.product(constants, trainingParamSets)]
        incompleteRuns = []
        for run in runs:
            constants, trainingParams = run
            alpha, theta = trainingParams
            strAlpha = str(alpha).replace('(', '[').replace(')', ']').replace(', ', '-')
            treefile = "_".join([fileBase, 'alpha=' + strAlpha]) + ".tree"
            resultsfile = "_".join([fileBase, 'alpha=' + strAlpha, 'theta=' + str(theta)]) + ".results"
            if os.path.exists(resultsfile) or (skipTOs and os.path.exists(resultsfile + 'TO')):
                print 'already ran this', resultsfile
            else:
                incompleteRuns.append(run)

        if pooling:
            p = Pool(4)  # TODO: fix magic number
            p.map(parallelized_component, incompleteRuns)
        else:
            for run in incompleteRuns:
                parallelized_component(run)
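

# parallelized_component is called above but not defined in this snippet. A minimal sketch of the
# shape it might take, assuming it mirrors the metric computation in the earlier testing() function
# (an illustration only, not the original implementation; the timeout/skipTOs handling is omitted):
def parallelized_component_sketch(run):
    constants, trainingParams = run
    experiment, cv, S_train, S_test, c_0, splitRes, NeighborFunctions, fileBase, skipTOs, timeout = constants
    alpha, theta = trainingParams
    TP, FP, TN, FN = SDT.sdt_learn(S_train, S_test, alpha, c_0, theta, splitRes, NeighborFunctions)
    strAlpha = str(alpha).replace('(', '[').replace(')', ']').replace(', ', '-')
    resultsfile = "_".join([fileBase, 'alpha=' + strAlpha, 'theta=' + str(theta)]) + ".results"
    with open(resultsfile, 'w') as f:
        f.write(str((TP, FP, TN, FN)))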
Example #9
import socket
import pynmea2
import time
import datetime
import readData

UDP_IP = "127.0.0.1"
UDP_PORT = 10111

sock = socket.socket(socket.AF_INET, # Internet
                     socket.SOCK_DGRAM) # UDP
sock.bind((UDP_IP, UDP_PORT))

print("Loading data")
aisdata = readData.read_data("oceansofdata/ais-exploratorium-edu/feed.ais.txt")
print("Data Loaded")

start_time = time.time()

# our data sources are old recordings whose timestamps never change
sim_start_time = 1417005700
sim_end_time = sim_start_time + 60

sim_real_diff = start_time - sim_start_time
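
# With this offset, the current position in the simulated feed can be recovered at any moment as
# time.time() - sim_real_diff; in other words the replay behaves as if the old recording started
# right now. (This is a reading of the intent here, since the replay loop itself is not shown.)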

#used to store a list of collisions and near misses 
collisions = list()
near_misses = list()
position_log = dict()

position_log[0] = list()
Example #10
# Define parameters of the decomposition
max_iter = 4
opt_tol = 1  # %
ns = 5  # Number of scenarios solved per Forward/Backward Pass per process
# NOTE: ns should be between 1 and len(n_stage[n_stages])/NumProcesses
z_alpha_2 = 1.96  # 95% confidence level

# Parallel parameters
NumProcesses = 1

# ######################################################################################################################

# create scenarios and input data
nodes, n_stage, parent_node, children_node, prob, sc_nodes = create_scenario_tree(stages, scenarios, single_prob)
readData.read_data(filepath, curPath, stages, n_stage, t_per_stage)
sc_headers = list(sc_nodes.keys())

# operating scenarios
operating_scenarios = list(range(0, len(readData.L_by_scenario)))
prob_op = 1 / len(readData.L_by_scenario)
# print(operating_scenarios)

# separate nodes by processes
scenarios_by_processid = {}
for i in range(NumProcesses):
    start = int(len(sc_nodes) * i / float(NumProcesses))
    stop = int(len(sc_nodes) * (i + 1) / float(NumProcesses))
    scenarios_by_processid[i] = sc_headers[start:stop]
# print(scenarios_by_processid)
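# For example (illustrative numbers only): with 5 scenario headers and NumProcesses = 2, process 0
# gets sc_headers[0:2] and process 1 gets sc_headers[2:5], so every scenario is assigned to exactly
# one process.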
Example #11
single_prob = {'L': 1 / 3, 'R': 1 / 3, 'H': 1 / 3}

# Define parameters of the decomposition
max_iter = 10
opt_tol = 2  # %
ns = 15  # Number of scenarios solved per Forward/Backward Pass
# NOTE: ns should be between 1 and len(n_stage[time_periods])
z_alpha_2 = 1.96  # 95% confidence level

# ######################################################################################################################

# create scenarios and input data
nodes, n_stage, parent_node, children_node, prob, sc_nodes = create_scenario_tree(
    stages, scenarios, single_prob)
sc_headers = list(sc_nodes.keys())
readData.read_data(filepath, stages, n_stage)

# create blocks
m = b.create_model(time_periods, max_iter, n_stage, nodes, prob)
print('finished generating the blocks, started counting solution time')
start_time = time.time()

# Decomposition Parameters
m.ngo_rn_par = Param(m.rn_r, m.n_stage, default=0, initialize=0, mutable=True)
m.ngo_th_par = Param(m.th_r, m.n_stage, default=0, initialize=0, mutable=True)
m.ngo_rn_par_k = Param(m.rn_r,
                       m.n_stage,
                       m.iter,
                       default=0,
                       initialize=0,
                       mutable=True)
Example #12
 def __init__(self):
     self.read_data = read_data()
Example #13
def main():
    documents = readData.read_data()  # this variable holds the edited sentences in a list

 alphaNonSpatial = '[0.0-0.0]'
 c_0 = cref[e]
 eventClass = e
 dataset = d
 print eventClass,dataset
 headers,matches = SOLARGenImageList.image_event_matches(dataset=d,waves = w)
 print 'matches calculated'
 treefileSpatial = "results/"+"_".join([str(e),str(n),str(d),"-".join(w),'c_0='+str(c_0),'balance='+str(b),'alpha='+str(alphaSpatial)])+".tree"
 treefileNonSpatial = "results/"+"_".join([str(e),str(n),str(d),"-".join(w),'c_0='+str(c_0),'balance='+str(b),'alpha='+str(alphaNonSpatial)])+".tree"
 treeSpatial = TreeNode('dummy')
 treeNonSpatial = TreeNode('dummy')
 with open(treefileSpatial) as f:
   treeSpatial.load(f)
 with open(treefileNonSpatial) as f:
   treeNonSpatial.load(f)
 S_train,S_test, = read_data(e,n,d,w,b) # read the data set
 cells_train, adj = S_train
 cells_test, adj_test = S_test
 counter = 0
 for x in sorted(matches.keys()): # for each image of the data set
   paramsFilename = x[0]
   imageFilename = paramsFilename[:-4]+'_th.png'
   ISpatial = m.imread(imageFilename) # read the image
   INonSpatial = ISpatial.copy()
   outputname = os.path.join(outputFolder,e,os.path.basename(imageFilename)[:-4]+'_'+e+'_'+d+'.png')
   cells_testWeWant = [x for x in cells_test if x['id'][3][0] == paramsFilename] # get the test cells tied to this image
   for x in range(2):
     if x == 0:
       I = ISpatial
       tree = treeSpatial
      else:
        I = INonSpatial
        tree = treeNonSpatial


# Imports assumed for this training snippet (not shown in the original fragment); Net, batchSize,
# and epochs are defined elsewhere in the original module.
from mxnet import autograd, gluon, nd
from tqdm import tqdm

import readData


def dataIter(batch_size,trainX, trainY):
    dataset = gluon.data.ArrayDataset(trainX, trainY)
    train_data_iter = gluon.data.DataLoader(dataset, batch_size, shuffle=True)
    return train_data_iter


def train( trainX, trainY):
    train_data_iter=dataIter(batchSize,trainX,trainY)
    lenTrainY=len(trainY)
    net = Net()
    lr=0.0001
    sigmoidBCEloss= gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)
    for e in range(epochs):
        total_loss = 0
        for x,y in tqdm(train_data_iter):
            with autograd.record():
                y_hat=net.net(x)
                loss = sigmoidBCEloss(y_hat,y)
            loss.backward()
            net.SGD(lr)
            total_loss += nd.sum(loss).asscalar()
        print("Epoch %d, average loss:%f" % (e, total_loss / lenTrainY))
    return net



if __name__ == '__main__':
    trainX, trainY, testX, testY = readData.read_data()
    net=train(trainX,trainY)
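    # testX and testY are loaded but not used in this fragment; an evaluation pass (for example,
    # thresholding the sigmoid outputs net.net(testX) at 0.5 and comparing against testY) would
    # presumably follow in the full script.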
#events = ['AR','CH','FL','FI','SG','SS'] 
events = ['AR'] 
#datasets = ['1WEEK']
datasets = ['3DAYDEMO']
for e,d in itertools.product(events,datasets):
  if not os.path.exists(os.path.join(outputFolder,e)):
    os.mkdir(os.path.join(outputFolder,e))
  # similar to above: make the event subfolder if it doesn't exist
  n = 'rook'
  w = ['0171']
  b = 'Mirror'
  eventClass = e
  dataset = d
  print eventClass,dataset
  headers,matches = SOLARGenImageList.image_event_matches(dataset=dataset,waves = ['0171'])
  S_train,S_test, = read_data(e,n,d,w,b)
  cells, adj = S_train
  cells2, adj2 = S_test
  print 'matches calculated'
  counter = 0
  total = len(matches.keys())
  for x in sorted(matches.keys()): # for each image
    paramsFilename = x[0]
    imageFilename = paramsFilename[:-4]+'_th.png'
    I = m.imread(imageFilename)
    outputname = os.path.join(outputFolder,e,os.path.basename(imageFilename)[:-4]+'_'+e+'.png')
    cellsWeWant = [x for x in cells if x['id'][3][0] == paramsFilename]
    for cell in cellsWeWant:
      cellr = cell['id'][1]
      cellc = cell['id'][2]
      I = drawSquare(I,cellr,cellc,colordict[e] if cell['class'] != 'null' else black )