configuration = getattr(configurations_0, args.proto)()
# added by Zhaopeng Tu, 2016-05-12
if args.state:
    configuration.update(eval(open(args.state).read()))
logger.info("\nModel options:\n{}".format(pprint.pformat(configuration)))

rng = numpy.random.RandomState(1234)

enc_dec = EncoderDecoder(rng, **configuration)
enc_dec.build_sampler()

# added by Zhaopeng Tu, 2016-05-27
# options to use other trained models
if args.model:
    enc_dec.load(path=args.model)
else:
    enc_dec.load(path=configuration['saveto_best'])

test_align = Align(enc_dec=enc_dec, **configuration)

test_src = get_stream(args.source, configuration['vocab_src'], **configuration)
test_trg = get_stream(args.target, configuration['vocab_trg'], **configuration)

aligner = Aligner(align_model=test_align,
                  test_src=test_src,
                  test_trg=test_trg,
                  **configuration)
aligner.apply(args.alignment, True)
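# --- Hypothetical usage sketch (not part of the original script) ---
# The block above assumes an `args` namespace and a module-level `logger`,
# and would normally sit below code like the following. Option names here
# simply mirror the attributes used above (proto, state, model, source,
# target, alignment); the default proto name is an assumption, and the
# `configurations_0` module must already be importable.
import argparse
import logging
import pprint

import numpy


def parse_args():
    parser = argparse.ArgumentParser(
        description="Force-align a parallel corpus with a trained encoder-decoder model.")
    parser.add_argument('--proto', default='get_config',
                        help='name of a configuration function in configurations_0 (assumed default)')
    parser.add_argument('--state', help='optional file with configuration overrides')
    parser.add_argument('--model', help='optional model path; falls back to saveto_best')
    parser.add_argument('source', help='source-side text file')
    parser.add_argument('target', help='target-side text file')
    parser.add_argument('alignment', help='output file for the alignments')
    return parser.parse_args()


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
args = parse_args()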
def get_external_metadata(inputfile):

    # Define an empty python dictionary.
    md = {}

    # Check whether this file exists.
    if not os.path.exists(inputfile):
        return md

    # Get the other metadata field parameters.
    md['file_name'] = os.path.basename(inputfile)
    md['file_size'] = str(os.path.getsize(inputfile))
    md['crc'] = fileEnstoreChecksum(inputfile)

    # Quit here if file type is not ".root".
    if not inputfile.endswith('.root'):
        return md

    # Root checks.
    file = ROOT.TFile.Open(larbatch_posix.root_stream(inputfile))
    if file and file.IsOpen() and not file.IsZombie():

        # Root file opened successfully.
        # Get number of events.
        obj = file.Get('Events')
        if obj and obj.InheritsFrom('TTree'):

            # This has a TTree named Events.
            nev = obj.GetEntriesFast()
            md['events'] = str(nev)

        # Get runs and subruns from the SubRuns tree.
        subrun_tree = file.Get('SubRuns')
        if subrun_tree and subrun_tree.InheritsFrom('TTree'):
            md['subruns'] = []
            nsubruns = subrun_tree.GetEntriesFast()
            tfr = ROOT.TTreeFormula('subruns', 'SubRunAuxiliary.id_.run_.run_', subrun_tree)
            tfs = ROOT.TTreeFormula('subruns', 'SubRunAuxiliary.id_.subRun_', subrun_tree)
            for entry in range(nsubruns):
                subrun_tree.GetEntry(entry)
                run = tfr.EvalInstance64()
                subrun = tfs.EvalInstance64()
                run_subrun = (run, subrun)
                if run_subrun not in md['subruns']:
                    md['subruns'].append(run_subrun)

        # Get stream name.
        try:
            stream_name = stream.get_stream(inputfile)
            md['data_stream'] = stream_name
        except:
            pass

    return md
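# --- Hypothetical usage sketch (not part of the original module) ---
# A quick command-line check of the metadata extractor above. This assumes the
# surrounding module already imports os, ROOT, larbatch_posix, stream, and
# fileEnstoreChecksum as used in get_external_metadata; the (run, subrun)
# tuples serialize as JSON arrays.
if __name__ == '__main__':
    import json
    import sys

    md = get_external_metadata(sys.argv[1])
    print(json.dumps(md, indent=2))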
def build_and_run(save_to, modelconfig, experimentconfig):
    """ part of this is adapted from lasagne tutorial"""
    n, num_filters, image_size, num_blockstack = modelconfig['depth'], modelconfig['num_filters'], modelconfig['image_size'], modelconfig['num_blockstack']
    print("Amount of bottlenecks: %d" % n)

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('image_features')
    #target_value = T.ivector('targets')
    target_var = T.lmatrix('targets')
    target_vec = T.extra_ops.to_one_hot(target_var[:, 0], 2)
    #target_var = T.matrix('targets')

    # Create residual net model
    print("Building model...")
    network = build_cnn(input_var, image_size, n, num_blockstack, num_filters)
    get_info(network)
    prediction = lasagne.utils.as_theano_expression(lasagne.layers.get_output(network))
    test_prediction = lasagne.utils.as_theano_expression(lasagne.layers.get_output(network, deterministic=True))

    # Loss function -> the objective to minimize
    print("Instantiation of loss function...")

    #loss = CategoricalCrossEntropy().apply(target_var.flatten(), prediction)
    #test_loss = CategoricalCrossEntropy().apply(target_var.flatten(), test_prediction)
    #loss = lasagne.objectives.categorical_crossentropy(prediction, target_var.flatten()).mean()
    #test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var.flatten()).mean()
    loss = lasagne.objectives.squared_error(prediction, target_vec).mean()
    test_loss = lasagne.objectives.squared_error(test_prediction, target_vec).mean()
    #loss = tensor.nnet.binary_crossentropy(prediction, target_var).mean()
    #test_loss = tensor.nnet.binary_crossentropy(test_prediction, target_var).mean()
    test_loss.name = "loss"
    #loss.name = 'x-ent_error'
    #loss.name = 'sqr_error'

    layers = lasagne.layers.get_all_layers(network)

    #l1 and l2 regularization
    #pondlayers = {x: 0.000025 for i, x in enumerate(layers)}
    #l1_penality = lasagne.regularization.regularize_layer_params_weighted(pondlayers, lasagne.regularization.l2)
    #l2_penality = lasagne.regularization.regularize_layer_params(layers[len(layers)/4:], lasagne.regularization.l1) * 25e-6
    #reg_penalty = l1_penality + l2_penality
    #reg_penalty.name = 'reg_penalty'
    #loss = loss + reg_penalty
    loss.name = 'reg_loss'

    error_rate = MisclassificationRate().apply(target_var.flatten(), test_prediction).copy(
        name='error_rate')

    # Load the dataset
    print("Loading data...")
    istest = 'test' in experimentconfig.keys()
    if istest:
        print("Using test stream")
    train_stream, valid_stream, test_stream = get_stream(experimentconfig['batch_size'], image_size, test=istest)

    # Defining step rule and algorithm
    if 'step_rule' in experimentconfig.keys() and not experimentconfig['step_rule'] is None:
        step_rule = experimentconfig['step_rule'](learning_rate=experimentconfig['learning_rate'])
    else:
        step_rule = Scale(learning_rate=experimentconfig['learning_rate'])

    params = map(lasagne.utils.as_theano_expression,
                 lasagne.layers.get_all_params(network, trainable=True))

    print("Initializing algorithm")
    algorithm = GradientDescent(
        cost=loss,
        gradients={var: T.grad(loss, var) for var in params},  #parameters=cg.parameters, #params
        step_rule=step_rule)
    #algorithm.add_updates(extra_updates)

    grad_norm = aggregation.mean(algorithm.total_gradient_norm)
    grad_norm.name = "grad_norm"

    print("Initializing extensions...")
    plot = Plot(save_to, channels=[['train_loss', 'valid_loss'],
                                   ['train_grad_norm'],
                                   #['train_grad_norm','train_reg_penalty'],
                                   ['train_error_rate', 'valid_error_rate']],
                server_url='http://hades.calculquebec.ca:5042')

    checkpoint = Checkpoint('models/best_' + save_to + '.tar')
    # checkpoint.add_condition(['after_n_batches=25'],
    checkpoint.add_condition(['after_epoch'],
                             predicate=OnLogRecord('valid_error_rate_best_so_far'))

    #Defining extensions
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=experimentconfig['num_epochs'],
                              after_n_batches=experimentconfig['num_batches']),
                  TrainingDataMonitoring([test_loss, error_rate, grad_norm],  # reg_penalty],
                                         prefix="train", after_epoch=True),  #after_n_epochs=1
                  DataStreamMonitoring([test_loss, error_rate], valid_stream, prefix="valid",
                                       after_epoch=True),  #after_n_epochs=1
                  plot,
                  #Checkpoint(save_to,after_n_epochs=5),
                  #ProgressBar(),
                  #Plot(save_to, channels=[['train_loss','valid_loss'], ['train_error_rate','valid_error_rate']], server_url='http://hades.calculquebec.ca:5042'), #'grad_norm'
                  #     after_batch=True),
                  Printing(after_epoch=True),
                  TrackTheBest('valid_error_rate', min),  #Keep best
                  checkpoint,  #Save best
                  FinishIfNoImprovementAfter('valid_error_rate_best_so_far', epochs=5)]  # Early-stopping

    #model = Model(loss)
    #print("Model", model)

    main_loop = MainLoop(
        algorithm,
        train_stream,
        #model=model,
        extensions=extensions)
    print("Starting main loop...")
    main_loop.run()
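# --- Hypothetical invocation sketch (values are illustrative, not from the original) ---
# build_and_run() above only reads the dictionary keys shown here; build_cnn,
# get_stream, and the Blocks extensions must already be importable from the
# surrounding project. The exact types expected for image_size are an assumption.
if __name__ == '__main__':
    modelconfig = {
        'depth': 3,              # number of bottlenecks per block stack
        'num_filters': 16,
        'image_size': (128, 128),
        'num_blockstack': 3,
    }
    experimentconfig = {
        'batch_size': 32,
        'num_epochs': 100,
        'num_batches': 20000,    # FinishAfter stops at whichever limit is reached first
        'learning_rate': 0.01,
        'step_rule': None,       # None falls back to Scale(learning_rate=...)
        # 'test': True,          # presence of this key switches to the small test stream
    }
    build_and_run('resnet_experiment', modelconfig, experimentconfig)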
import sqlite3
from sqlite3 import Error
import stream as st
from time import strftime

stream = st.create_stream()
tweets, dt, future = st.get_stream(stream)

conn = sqlite3.connect("assign2.db")
db = conn.cursor()
db.execute('drop table if exists HASHTAGS;')
db.execute(
    'create table HASHTAGS (tag_ID integer primary key autoincrement,hashtag text);'
)
conn.commit()

for tweet in tweets:
    if 'entities' in tweet:
        hashtags = tweet['entities']['hashtags']
        for hashtag in hashtags:
            # print "%s" % (hashtag['text'])
            db.execute('INSERT INTO HASHTAGS (hashtag) VALUES (?);',
                       [hashtag['text']])
conn.commit()

cur = db.execute(
    'select hashtag, count(*) as count from HASHTAGS group by hashtag order by count desc limit 10;'
)
top10 = cur.fetchall()

fp = open('assign2-report.txt', 'w')
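# --- Hypothetical continuation (not part of the original snippet) ---
# The snippet above opens the report file but stops before writing it. A minimal
# report of the ten most frequent hashtags could look like this, mirroring the
# (timestamp, future) header used by the companion script further below.
fp.write('{}\t{}\n'.format(dt, future))
for hashtag_text, count in top10:
    # encode so non-ASCII hashtags are written safely
    fp.write('{}\t{}\n'.format(hashtag_text.encode('utf-8'), count))
fp.close()
conn.close()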
def build_and_run(label, config):
    ############## CREATE THE NETWORK ###############
    #Define the parameters
    num_epochs, num_batches, num_channels, image_shape, filter_size, num_filter, pooling_sizes, mlp_hiddens, output_size, batch_size, activation, mlp_activation = config['num_epochs'], config['num_batches'], config['num_channels'], config['image_shape'], config['filter_size'], config['num_filter'], config['pooling_sizes'], config['mlp_hiddens'], config['output_size'], config['batch_size'], config['activation'], config['mlp_activation']
    # print(num_epochs, num_channels, image_shape, filter_size, num_filter, pooling_sizes, mlp_hiddens, output_size, batch_size, activation, mlp_activation)
    lambda_l1 = 0.000025
    lambda_l2 = 0.000025

    print("Building model")
    #Create the symbolic variables
    x = T.tensor4('image_features')
    y = T.lmatrix('targets')

    #Get the parameters
    conv_parameters = zip(filter_size, num_filter)

    #Create the convolutional layers
    conv_layers = list(
        interleave([(Convolutional(filter_size=filter_size,
                                   num_filters=num_filter,
                                   name='conv_{}'.format(i))
                     for i, (filter_size, num_filter) in enumerate(conv_parameters)),
                    (activation),
                    (MaxPooling(size, name='pool_{}'.format(i))
                     for i, size in enumerate(pooling_sizes))]))
    #               (AveragePooling(size, name='pool_{}'.format(i)) for i, size in enumerate(pooling_sizes))]))

    #Create the sequence
    conv_sequence = ConvolutionalSequence(conv_layers,
                                          num_channels,
                                          image_size=image_shape,
                                          weights_init=Uniform(width=0.2),
                                          biases_init=Constant(0.))
    #Initialize the convnet
    conv_sequence.initialize()

    #Add the MLP
    top_mlp_dims = [np.prod(conv_sequence.get_dim('output'))] + mlp_hiddens + [output_size]
    out = Flattener().apply(conv_sequence.apply(x))
    mlp = MLP(mlp_activation,
              top_mlp_dims,
              weights_init=Uniform(0, 0.2),
              biases_init=Constant(0.))
    #Initialize the MLP
    mlp.initialize()

    #Get the output
    predict = mlp.apply(out)

    cost = CategoricalCrossEntropy().apply(y.flatten(), predict).copy(name='cost')
    error = MisclassificationRate().apply(y.flatten(), predict)
    #Little trick to plot the error rate in two different plots (we can't use the same data twice in the plot for an unknown reason)
    error_rate = error.copy(name='error_rate')
    error_rate2 = error.copy(name='error_rate2')

    ########### REGULARIZATION ##################
    cg = ComputationGraph([cost])
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    biases = VariableFilter(roles=[BIAS])(cg.variables)
    # l2_penalty_weights = T.sum([i*lambda_l2/len(weights) * (W ** 2).sum() for i,W in enumerate(weights)]) # Gradually increase penalty for layer
    l2_penalty = T.sum([lambda_l2 * (W ** 2).sum() for i, W in enumerate(weights + biases)])  # Gradually increase penalty for layer
    # l2_penalty_bias = T.sum([lambda_l2*(B **2).sum() for B in biases])
    # l2_penalty = l2_penalty_weights + l2_penalty_bias
    l2_penalty.name = 'l2_penalty'
    l1_penalty = T.sum([lambda_l1 * T.abs_(z).sum() for z in weights + biases])
    # l1_penalty_weights = T.sum([i*lambda_l1/len(weights) * T.abs_(W).sum() for i,W in enumerate(weights)]) # Gradually increase penalty for layer
    # l1_penalty_biases = T.sum([lambda_l1 * T.abs_(B).sum() for B in biases])
    # l1_penalty = l1_penalty_biases + l1_penalty_weights
    l1_penalty.name = 'l1_penalty'
    costreg = cost + l2_penalty + l1_penalty
    costreg.name = 'costreg'

    ########### DEFINE THE ALGORITHM #############
    # algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Momentum())
    algorithm = GradientDescent(cost=costreg, parameters=cg.parameters, step_rule=Adam())

    ########### GET THE DATA #####################
    istest = 'test' in config.keys()
    train_stream, valid_stream, test_stream = get_stream(batch_size, image_shape, test=istest)

    ########### INITIALIZING EXTENSIONS ##########
    checkpoint = Checkpoint('models/best_' + label + '.tar')
    checkpoint.add_condition(['after_epoch'],
                             predicate=OnLogRecord('valid_error_rate_best_so_far'))
    #Adding a live plot with the bokeh server
    plot = Plot(label,
                channels=[['train_error_rate', 'valid_error_rate'],
                          ['valid_cost', 'valid_error_rate2'],
                          # ['train_costreg','train_grad_norm']],
                          ['train_costreg', 'train_total_gradient_norm', 'train_l2_penalty', 'train_l1_penalty']],
                server_url="http://hades.calculquebec.ca:5042")

    grad_norm = aggregation.mean(algorithm.total_gradient_norm)
    grad_norm.name = 'grad_norm'

    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches),
                  DataStreamMonitoring([cost, error_rate, error_rate2], valid_stream, prefix="valid"),
                  TrainingDataMonitoring([costreg, error_rate, error_rate2, grad_norm, l2_penalty, l1_penalty],
                                         prefix="train",
                                         after_epoch=True),
                  plot,
                  ProgressBar(),
                  Printing(),
                  TrackTheBest('valid_error_rate', min),  #Keep best
                  checkpoint,  #Save best
                  FinishIfNoImprovementAfter('valid_error_rate_best_so_far', epochs=4)]  # Early-stopping

    model = Model(cost)
    main_loop = MainLoop(algorithm, data_stream=train_stream, model=model, extensions=extensions)
    main_loop.run()
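# --- Hypothetical configuration sketch (values illustrative, not from the original) ---
# build_and_run(label, config) above expects one flat dictionary. The activation
# entries are assumed to be Blocks bricks: `activation` is interleaved with the
# convolution/pooling layers, and `mlp_activation` must have one brick per MLP
# layer (hidden layers plus the output).
from blocks.bricks import Rectifier, Softmax

example_config = {
    'num_epochs': 50,
    'num_batches': 20000,
    'num_channels': 3,
    'image_shape': (32, 32),
    'filter_size': [(5, 5), (5, 5)],
    'num_filter': [20, 50],
    'pooling_sizes': [(2, 2), (2, 2)],
    'mlp_hiddens': [500],
    'output_size': 2,
    'batch_size': 64,
    'activation': [Rectifier() for _ in range(2)],  # one per convolution layer
    'mlp_activation': [Rectifier(), Softmax()],     # hidden layer + output
    # 'test': True,                                 # presence of this key selects the test stream
}
build_and_run('cnn_baseline', example_config)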
def build_and_run(experimentconfig, modelconfig, save_to=None):  #modelconfig,
    """ part of this is adapted from lasagne tutorial"""

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('image_features')
    target_var = T.lmatrix('targets')
    target_vec = T.extra_ops.to_one_hot(target_var[:, 0], 2)

    # Create vgg model
    print("Building model...")
    image_size = modelconfig['image_size']
    network = vgg16.build_small_model()
    prediction = lasagne.utils.as_theano_expression(lasagne.layers.get_output(network["prob"], input_var))
    # test_prediction = lasagne.layers.get_output(network["prob"],input_var,deterministic=True)

    # Loss function -> the objective to minimize
    print("Instantiation of loss function...")

    # loss = lasagne.objectives.categorical_crossentropy(prediction, target_var.flatten())
    loss = lasagne.objectives.squared_error(prediction, target_vec)
    # test_loss = lasagne.objectives.squared_error(test_prediction,target_vec)
    loss = loss.mean()

    # layers = network.values()
    #l1 and l2 regularization
    # pondlayers = {x: 0.01 for x in layers}
    # l1_penality = lasagne.regularization.regularize_layer_params_weighted(pondlayers, lasagne.regularization.l2)
    # l2_penality = lasagne.regularization.regularize_layer_params(layers[len(layers)/4:], lasagne.regularization.l1) * 1e-4
    # reg_penalty = l1_penality + l2_penality
    # reg_penalty.name = 'reg_penalty'
    # loss = loss + reg_penalty
    loss.name = 'loss'

    error_rate = MisclassificationRate().apply(target_var.flatten(), prediction).copy(
        name='error_rate')

    # Load the dataset
    print("Loading data...")
    if 'test' in experimentconfig.keys() and experimentconfig['test'] is True:
        train_stream, valid_stream, test_stream = get_stream(experimentconfig['batch_size'], image_size, test=True)
    else:
        train_stream, valid_stream, test_stream = get_stream(experimentconfig['batch_size'], image_size, test=False)

    # Defining step rule and algorithm
    if 'step_rule' in experimentconfig.keys() and not experimentconfig['step_rule'] is None:
        step_rule = experimentconfig['step_rule'](learning_rate=experimentconfig['learning_rate'])
    else:
        step_rule = Scale(learning_rate=experimentconfig['learning_rate'])

    params = map(lasagne.utils.as_theano_expression,
                 lasagne.layers.get_all_params(network['prob'], trainable=True))

    algorithm = GradientDescent(
        cost=loss,
        gradients={var: T.grad(loss, var) for var in params},
        step_rule=step_rule)

    grad_norm = aggregation.mean(algorithm.total_gradient_norm)
    grad_norm.name = 'grad_norm'

    print("Initializing extensions...")
    plot = Plot(save_to, channels=[['train_loss', 'valid_loss', 'train_grad_norm'],
                                   ['train_error_rate', 'valid_error_rate']],
                server_url='http://hades.calculquebec.ca:5042')

    checkpoint = Checkpoint('models/best_' + save_to + '.tar')
    # checkpoint.add_condition(['after_n_batches=25'],
    checkpoint.add_condition(['after_epoch'],
                             predicate=OnLogRecord('valid_error_rate_best_so_far'))

    #Defining extensions
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=experimentconfig['num_epochs'],
                              after_n_batches=experimentconfig['num_batches']),
                  TrainingDataMonitoring([loss, error_rate, grad_norm],  # reg_penalty is commented out above, so it is not monitored
                                         prefix="train", after_epoch=True),  #after_n_epochs=1
                  DataStreamMonitoring([loss, error_rate], valid_stream, prefix="valid",
                                       after_epoch=True),  #after_n_epochs=1
                  #Checkpoint(save_to,after_n_epochs=5),
                  #ProgressBar(),
                  plot,
                  #     after_batch=True),
                  Printing(after_epoch=True),
                  TrackTheBest('valid_error_rate', min),  #Keep best
                  checkpoint,  #Save best
                  FinishIfNoImprovementAfter('valid_error_rate_best_so_far', epochs=5)]  # Early-stopping

    # model = Model(ComputationGraph(network))

    main_loop = MainLoop(
        algorithm,
        train_stream,
        # model=model,
        extensions=extensions)
    print("Starting main loop...")
    main_loop.run()
import sqlite3
from sqlite3 import Error
import stream as twitter_stream
from time import strftime

stream = twitter_stream.create_stream()
tweet_buff, date_time, future = twitter_stream.get_stream(stream)

connection = sqlite3.connect("tweets.db")
tweet_db = connection.cursor()
tweet_db.execute('drop table if exists POPULAR_HASHTAG;')
tweet_db.execute('create table POPULAR_HASHTAG (tag_ID integer primary key autoincrement,hashtag text);')
connection.commit()

fp = open('report.txt', 'w')

for tweet in tweet_buff:
    if 'entities' in tweet:
        hashtags = tweet['entities']['hashtags']
        for hashtag in hashtags:
            tweet_db.execute('INSERT INTO POPULAR_HASHTAG (hashtag) VALUES (?);',
                             [hashtag['text']])
connection.commit()

result = tweet_db.execute('select hashtag, count(*) as count from POPULAR_HASHTAG group by hashtag order by count desc limit 10;')
popHashtags = result.fetchall()

print('{}\t{}'.format(date_time, future))
fp.write('{}\t{}\n'.format(date_time, future))
for popHashtag in popHashtags:
    # encode so hashtags that are not in English are printed and written safely
    print(popHashtag[0].encode('utf-8') + '\t{}'.format(popHashtag[1]))
    fp.write(popHashtag[0].encode('utf-8') + '\t{}\n'.format(popHashtag[1]))