def predict(self, test_input, input_type, test_case_count=25):
    normalize = Normalize()
    if input_type == 'RANDOM_INPUT':
        input_count = 0
        for question in test_input:
            input_count += 1
            question_ = normalize.normalize(question)
            logging.debug('Test Case No.{}: {}'.format(input_count, str(question)))
            logging.debug('-' * (len(question) + 16))
            logging.debug('Predicted Tags: {}'.format(self.tag_predictor(question_)))
            logging.debug('')
    else:
        test_idx = np.random.randint(len(test_input), size=test_case_count)
        logging.debug("Predicted Vs Ground Truth for {} sample".format(test_case_count))
        logging.debug('-' * 50)
        logging.debug('')
        input_count = 0
        for idx in test_idx:
            input_count += 1
            test_case = idx
            question = str(X_test[test_case])
            logging.debug('Test Case No.{}: {}'.format(input_count, question))
            logging.debug('-' * 100)
            logging.debug("Question ID: {}".format(test_case))
            logging.debug('Predicted: ' + str(
                self.tag_predictor(normalize.normalize_(X_test[test_case]))))
            logging.debug('Ground Truth: ' + str(
                self._tag_encoder.inverse_transform(np.array([y_test[test_case]]))))
            logging.debug('\n')
def input(self):
    with open('F:\\data\\ml\\2\\page_blocks_test_feature.txt', 'r') as fin:
        lines = fin.readlines()
    row = 0
    for line in lines:
        values = line.strip('\n').split(' ')
        self.matx[row][0:10] = values
        row += 1
    Normalize.normalize(self.matx)

    with open('F:\\data\\ml\\2\\page_blocks_test_label.txt', 'r') as fin:
        lines = fin.readlines()
    row = 0
    for line in lines:
        label = line.strip('\n')
        self.label[row] = label[0]
        row += 1
def __init__(self, in_size, n_out=None, non_lin='HT', method='cos',
             aft_nonlin=None, affinity_dict=None, type_layer='regular'):
    super(Graph_Layer_Wrapper, self).__init__()
    n_out = in_size if n_out is None else n_out
    if type_layer == 'regular':
        self.graph_layer = Graph_Layer(in_size, n_out=n_out, method=method,
                                       affinity_dict=affinity_dict)
    elif type_layer == 'cooc':
        self.graph_layer = Graph_Layer_Cooc(in_size, n_out=n_out)

    self.aft = None
    if aft_nonlin is not None:
        self.aft = []
        to_pend = aft_nonlin.split('_')
        for tp in to_pend:
            if tp.lower() == 'ht':
                self.aft.append(nn.Hardtanh())
            elif tp.lower() == 'rl':
                self.aft.append(nn.ReLU())
            elif tp.lower() == 'l2':
                self.aft.append(Normalize())
            elif tp.lower() == 'ln':
                self.aft.append(nn.LayerNorm(n_out))
            elif tp.lower() == 'bn':
                self.aft.append(nn.BatchNorm1d(n_out, affine=False,
                                               track_running_stats=False))
            elif tp.lower() == 'sig':
                self.aft.append(nn.Sigmoid())
            else:
                error_message = 'aft_nonlin %s not recognized' % tp
                raise ValueError(error_message)
        self.aft = nn.Sequential(*self.aft)

    # self.do = nn.Dropout(0.5)
    if non_lin is None:
        self.non_linearity = None
    elif non_lin == 'HT':
        self.non_linearity = nn.Hardtanh()
    elif non_lin.lower() == 'rl':
        self.non_linearity = nn.ReLU()
    else:
        error_message = 'non_lin %s not recognized' % non_lin
        raise ValueError(error_message)
def construct_mn(self, n_layers, n_neurons, alpha=0.1):
    mn_inp = Input(shape=[self.noise_size])
    mn = Normalize()(mn_inp)
    mn = Dense(n_neurons, kernel_initializer='he_normal')(mn)
    for _ in range(1, n_layers):
        mn = Dense(n_neurons, kernel_initializer='he_normal')(mn)
        mn = LeakyReLU(alpha)(mn)
    mn = Model(inputs=mn_inp, outputs=mn)
    return mn
def __init__(self, n_classes, deno, pretrained, in_out=None, graph_size=None,
             method='cos'):
    super(Graph_Multi_Video, self).__init__()
    self.num_classes = n_classes
    self.deno = deno
    self.graph_size = graph_size

    if in_out is None:
        in_out = [2048, 64, 2048, 64]
    num_layers = len(in_out) - 3
    non_lin = 'HT'
    print('NUM LAYERS', num_layers, in_out)

    self.linear_layer = nn.Linear(in_out[0], in_out[1], bias=False)
    # for param in self.linear_layer.parameters():
    #     param.requires_grad = False

    if pretrained == 'ucf':
        model_file = '../experiments/just_mill_flexible_deno_8_n_classes_20_layer_sizes_2048_64_ucf/all_classes_False_just_primary_False_limit_500_cw_True_MultiCrossEntropy_100_step_100_0.1_0.001_0.001/model_99.pt'
    elif pretrained == 'activitynet':
        model_file = '../experiments/just_mill_flexible_deno_8_n_classes_100_layer_sizes_2048_64_activitynet/all_classes_False_just_primary_False_limit_500_cw_True_MultiCrossEntropy_50_step_50_0.1_0.001_0.001/model_49.pt'
    elif pretrained == 'random':
        model_file = '../experiments/just_mill_flexible_deno_8_n_classes_20_layer_sizes_2048_64_ucf/all_classes_False_just_primary_False_limit_500_cw_True_MultiCrossEntropy_100_step_100_0.1_0_0.001/model_99.pt'
    else:
        error_message = 'pretrained option %s not valid' % pretrained
        raise ValueError(error_message)

    model_temp = torch.load(model_file)
    self.linear_layer.weight.data = model_temp.linear.weight.data

    self.graph_layers = nn.ModuleList()
    for num_layer in range(num_layers):
        self.graph_layers.append(
            Graph_Layer_Wrapper(in_out[num_layer + 2],
                                n_out=in_out[num_layer + 3],
                                non_lin=non_lin, method=method))

    last_layer = []
    last_layer.append(nn.Hardtanh())
    last_layer.append(Normalize())
    last_layer.append(nn.Dropout(0.5))
    last_layer.append(nn.Linear(in_out[-1], n_classes))
    self.last_layer = nn.Sequential(*last_layer)
def process(self, input_paths, output_paths):
    # Init steps
    hs = HashtagSplit()
    nr = Normalize()
    ct = Contract()

    # execute pipeline
    for input_path, output_path in zip(input_paths, output_paths):
        # data paths
        path_0 = input_path
        path_1 = output_path[:-4] + '_1' + output_path[-4:]
        path_2 = output_path[:-4] + '_2' + output_path[-4:]
        path_3 = output_path

        # set paths
        hs.set_paths(path_0, path_1)
        nr.set_paths(path_1, path_2)
        ct.set_paths(path_2, path_3)

        # run
        print("starting with " + os.path.basename(input_path))
        hs.run()
        print(os.path.basename(input_path) + ": hashtag done.")
        nr.run()
        print(os.path.basename(input_path) + ": normalize done.")
        ct.run()
        print(os.path.basename(input_path) + ": contract done.")
def __init__(self, n_classes, deno, in_out=None):
    super(Graph_Sim_Mill, self).__init__()
    self.num_classes = n_classes
    self.deno = deno

    # num_layers = 2
    # in_out = [2048, 512, 1024]
    if in_out is None:
        in_out = [2048, 2048]
        # in_out = [2048, 512, 2048]
    num_layers = len(in_out) - 1
    print('NUM LAYERS', num_layers, in_out)

    self.linear_layer = nn.Linear(2048, 2048, bias=False)
    # for param in self.linear_layer.parameters():
    #     param.requires_grad = False

    # model_file = '../experiments/just_mill_ht_unit_norm_no_bias_ucf/all_classes_False_just_primary_False_deno_8_limit_500_cw_True_MultiCrossEntropy_100_step_100_0.1_0.001/model_99.pt'
    model_file = '../experiments/just_mill_ht_unit_norm_no_bias_fix_ucf/all_classes_False_just_primary_False_deno_8_limit_500_cw_True_MultiCrossEntropy_100_step_100_0.1_0.001_0.001_0.001__retry/model_99.pt'
    non_lin = 'HT'
    # model_file = '../experiments/just_mill_relu_unit_norm_no_bias_ucf/all_classes_False_just_primary_False_deno_8_limit_500_cw_True_MultiCrossEntropy_100_step_100_0.1_0.0001_128/model_99.pt'
    # non_lin = 'rl'

    model_temp = torch.load(model_file)
    # print(model_temp.linear.weight.data.size())
    # print(self.linear_layer.weight.data.size())
    self.linear_layer.weight.data = model_temp.linear.weight.data
    self.linear_layer.weight.requires_grad = False

    self.graph_layers = nn.ModuleList()
    for num_layer in range(num_layers):
        self.graph_layers.append(
            Graph_Layer_Wrapper(in_out[num_layer], in_out[num_layer + 1], non_lin))
    # self.non_lin = nn.Hardtanh()

    last_layer = []
    last_layer.append(nn.Hardtanh())
    last_layer.append(Normalize())
    last_layer.append(nn.Dropout(0.5))
    last_layer.append(nn.Linear(in_out[-1], n_classes))
    self.last_layer = nn.Sequential(*last_layer)
def __init__(self, n_classes, deno):
    super(Just_Mill, self).__init__()
    self.num_classes = n_classes
    self.deno = deno
    self.linear = nn.Linear(2048, 64, bias=False)
    self.features = []
    self.features.append(nn.Hardtanh())
    self.features.append(Normalize())
    self.features.append(nn.Dropout(0.5))
    self.features.append(nn.Linear(64, n_classes))
    self.features = nn.Sequential(*self.features)
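# These PyTorch snippets use Normalize() as a plain layer between Hardtanh and
# Dropout, but its definition is not included here. A minimal sketch of an
# L2-normalization module consistent with that usage (an assumption, not the
# repository's actual implementation) could look like this:
import torch.nn as nn
import torch.nn.functional as F


class Normalize(nn.Module):
    """Hypothetical L2-normalization layer: rescales each row to unit norm."""

    def __init__(self, dim=1, eps=1e-12):
        super(Normalize, self).__init__()
        self.dim = dim
        self.eps = eps

    def forward(self, x):
        # F.normalize divides by the L2 norm along `dim`, guarding against zero norms.
        return F.normalize(x, p=2, dim=self.dim, eps=self.eps)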
def test_normalize(self):
    test = np.arange(1000)

    # normalize
    scaler = Normalize(test)
    normalized = scaler.normalize_data(test)
    min_val = min(normalized)
    max_val = max(normalized)

    # ensure values scaled to range (0, 1)
    self.assertGreaterEqual(min_val, 0.0)
    self.assertLessEqual(max_val, 1.0)

    # denormalize
    denormalized = scaler.denormalize_data(normalized)

    # ensure denormalized values are the same as the original
    for x, y in zip(test, denormalized):
        try:
            self.assertEqual(x, y)
        except AssertionError:
            self.assertAlmostEqual(x, y, 12)
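# The test above assumes a Normalize scaler exposing normalize_data() and
# denormalize_data(). A minimal min-max sketch that would satisfy it (a
# hypothetical stand-in, not the project's implementation) might be:
import numpy as np


class MinMaxNormalize:
    """Hypothetical min-max scaler matching the normalize_data/denormalize_data API."""

    def __init__(self, data):
        data = np.asarray(data, dtype=float)
        self.min = data.min()
        self.max = data.max()

    def normalize_data(self, data):
        # map values into [0, 1] using the range seen at construction time
        return (np.asarray(data, dtype=float) - self.min) / (self.max - self.min)

    def denormalize_data(self, data):
        # invert the min-max mapping
        return np.asarray(data, dtype=float) * (self.max - self.min) + self.min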
def test_normalize_equilateral(self):
    # find the Iris data set
    irisFile = os.path.dirname(os.path.realpath(__file__))
    irisFile = os.path.abspath(irisFile + "../../../datasets/iris.csv")

    norm = Normalize()
    result = norm.load_csv(irisFile)
    classes = norm.build_class_map(result, 4)
    norm.norm_col_equilateral(result, 4, classes, 0, 1)
    self.assertEqual(len(result[0]), 6)
    self.assertAlmostEqual(result[0][4], 0.06698, 3)
def __init__(self, n_classes, deno, in_out=None, aft_nonlin='RL', feat_ret=False):
    super(Graph_Multi_Video, self).__init__()
    self.num_classes = n_classes
    self.feat_ret = feat_ret
    self.deno = deno

    if in_out is None:
        in_out = [2048, 512]

    self.linear_layer = [nn.Linear(in_out[0], in_out[1], bias=True)]
    if aft_nonlin is not None:
        to_pend = aft_nonlin.split('_')
        for tp in to_pend:
            if tp.lower() == 'ht':
                self.linear_layer.append(nn.Hardtanh())
            elif tp.lower() == 'rl':
                self.linear_layer.append(nn.ReLU())
            elif tp.lower() == 'l2':
                self.linear_layer.append(Normalize())
            elif tp.lower() == 'ln':
                self.linear_layer.append(nn.LayerNorm(in_out[1]))
            elif tp.lower() == 'bn':
                self.linear_layer.append(nn.BatchNorm1d(in_out[1], affine=False,
                                                        track_running_stats=False))
            elif tp.lower() == 'sig':
                self.linear_layer.append(nn.Sigmoid())
            else:
                error_message = 'aft_nonlin %s not recognized' % tp
                raise ValueError(error_message)
    self.linear_layer = nn.Sequential(*self.linear_layer)

    last_graph = []
    last_graph.append(nn.Dropout(0.5))
    last_graph.append(nn.Linear(in_out[-1], n_classes, bias=True))
    self.last_graph = nn.Sequential(*last_graph)
def __init__(self, n_classes, deno, layer_sizes):
    super(Just_Mill, self).__init__()
    self.num_classes = n_classes
    self.deno = deno

    self.linear = []
    self.linear.append(nn.Linear(layer_sizes[0], layer_sizes[1], bias=False))
    self.linear.append(nn.Hardtanh())
    self.linear.append(Normalize())
    self.linear = nn.Sequential(*self.linear)
    # self.features.append(nn.ReLU())

    self.features = []
    self.features.append(nn.Dropout(0.5))
    self.features.append(nn.Linear(layer_sizes[1], n_classes))
    self.features = nn.Sequential(*self.features)
def __init__(self, n_classes, deno, in_out=None):
    super(Graph_Sim_Mill, self).__init__()
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(999)
    self.num_classes = n_classes
    self.deno = deno

    # num_layers = 2
    # in_out = [2048, 512, 1024]
    if in_out is None:
        in_out = [2048, 2048]
        # in_out = [2048, 512, 2048]
    num_layers = len(in_out) - 1
    print('NUM LAYERS', num_layers, in_out)

    self.linear_layer = nn.Linear(2048, in_out[-1], bias=False)
    non_lin = 'HT'
    # self.linear_layer.weight.data = model_temp.linear.weight.data
    self.linear_layer.weight.requires_grad = False

    self.graph_layers = nn.ModuleList()
    for num_layer in range(num_layers):
        self.graph_layers.append(
            Graph_Layer_Wrapper(in_out[num_layer], in_out[num_layer + 1], non_lin))
    # self.non_lin = nn.Hardtanh()

    last_layer = []
    last_layer.append(nn.Hardtanh())
    last_layer.append(Normalize())
    last_layer.append(nn.Dropout(0.5))
    last_layer.append(nn.Linear(in_out[-1], n_classes))
    self.last_layer = nn.Sequential(*last_layer)
def mk_input_layers_for_G(self, step):
    n_sty_inp = self.get_n_inp_sty(step)
    self.mixing_matrices = ini_mixing_matrix(n_sty_inp, step + 1)
    mn_inps = [Input([self.latent_size]) for _ in range(n_sty_inp)]
    lct_fake_inp = Input([1])
    dens = [Dense(self.latent_size, **kernel_cond)
            for _ in range(self.n_layers_of_mn)]
    nors = [Normalize()(mn_inps[i]) for i in range(n_sty_inp)]
    d = [nors[i] for i in range(n_sty_inp)]
    for i in range(n_sty_inp):
        for j in range(self.n_layers_of_mn):
            d[i] = dens[j](d[i])
    lct = LearnedConstTensor(self.img_shape[0][:2] + (self.latent_size,))(lct_fake_inp)
    sty_out = [MixStyle(i, n_sty_inp, step + 1) for i in range(step + 1)]
    for i in range(step + 1):
        sty_out[i] = sty_out[i](d)
    return Model(inputs=[lct_fake_inp] + mn_inps,
                 outputs=[lct] + sty_out,
                 name='input_layers_{}_for_G'.format(str(step)))
def rnn_predict(stock, start, end):
    # get stock data
    try:
        df = get_stock_data(stock, start, end, json=False)
    except:
        # error info
        e = sys.exc_info()
        print(e)
        print("rnn predict fail")
        return e

    # normalize
    scaler = Normalize(df, max=True)
    normalized = scaler.normalize_data(df)

    # get training and testing inputs and outputs
    train_inputs, train_targets, test_inputs, test_targets = train_test_split(normalized)
    train_inputs = np.array(train_inputs)
    train_targets = np.array(train_targets)
    test_inputs = np.array(test_inputs)
    test_targets = np.array(test_targets)

    # returns 3d array in format [inputs, timesteps, features]
    train_inputs = to_3d(train_inputs)
    test_inputs = to_3d(test_inputs)

    NN = RNN_V2()
    train_outputs = NN.train(train_inputs, train_targets, epochs=100)
    test_outputs = NN.test(test_inputs)

    # de-normalize
    train_outputs = scaler.denormalize_data(train_outputs)
    train_targets = scaler.denormalize_data(train_targets)
    test_outputs = scaler.denormalize_data(test_outputs)
    test_targets = scaler.denormalize_data(test_targets).T

    # accuracy
    accuracy = 100 - mape(test_targets, test_outputs)

    return df[4:], pd.DataFrame(train_outputs), pd.DataFrame(test_outputs), str(round(accuracy, 2))
def calc_linear_regression(coeff, x):
    # (a tiny usage sketch with made-up coefficients follows after this snippet)
    result = 0
    for i in range(1, len(coeff)):
        result += x[i - 1] * coeff[i]
    result += coeff[0]
    return result


# find the abalone data set
abaloneFile = os.path.dirname(os.path.realpath(__file__))
abaloneFile = os.path.abspath(abaloneFile + "../../datasets/abalone.csv")

# Normalize abalone file.
norm = Normalize()
abalone_work = norm.load_csv(abaloneFile)

# Make all columns beyond col #1 numeric.
for i in range(1, 9):
    norm.make_col_numeric(abalone_work, i)

# Discover all of the classes for column #1, the gender.
classes = norm.build_class_map(abalone_work, 0)

# Normalize gender with one-of-n encoding.
norm.norm_col_one_of_n(abalone_work, 0, classes, 0, 1)

# Separate into input and ideal.
training = np.array(abalone_work)
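# Usage sketch for calc_linear_regression with made-up coefficients
# (coeff[0] is the intercept, coeff[1:] the per-feature weights):
coeff = [2.0, 0.5, 1.5]  # hypothetical model: y = 2 + 0.5*x1 + 1.5*x2
x = [4.0, 2.0]
print(calc_linear_regression(coeff, x))  # prints 7.0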
def predict(self, test_input, custom_input, test_case_count):
    normalize = Normalize()
    if custom_input:
        input_count = 0
        # prediction_df = pd.DataFrame(columns=["Que No", "Questions", "Predicted_Tags"])
        prediction_list = []
        for question in test_input:
            input_count += 1
            question_ = normalize.normalize(question)
            logging.debug('-' * (len(question) + 16))
            logging.debug('Test Case No.{}: {}'.format(input_count, str(question)))
            predicted_tag = self.tag_predictor(question_)
            logging.debug('Predicted Tags: {}'.format(predicted_tag))
            prediction_list.append({
                'que_no': input_count,
                'questions': str(question),
                'predicted_tags': predicted_tag
            })
            logging.debug('')
        return prediction_list
    else:
        test_idx = np.random.randint(len(test_input), size=test_case_count)
        logging.debug("Predicted Vs Ground Truth for {} sample(s)".format(test_case_count))
        logging.debug('-' * 50)
        logging.debug('')

        input_count = 0
        input_predicted_list = []
        prediction_score = 0
        predicted_tag_list = []
        prediction_list = []
        # pd.DataFrame(columns=["Que No", "Questions", "Ground_Truth", "Predicted_Tags"])
        for idx in test_idx:
            input_count += 1
            test_case = idx
            question = str(test_input[test_case])
            logging.debug('')
            logging.debug('-' * 100)
            logging.debug('Test Case No.{}:'.format(input_count))
            logging.debug("Question ID: {}".format(test_case))
            logging.debug('Question: {}'.format(question))

            predicted_tag = self.tag_predictor(normalize.normalize_(question))
            predicted_tag_list.append(predicted_tag)
            ground_truth = self._tag_encoder.inverse_transform(
                np.array([self._y_test[test_case]]))

            score = 0
            ground_truth_ = [*ground_truth[0]]
            for tag_group in predicted_tag:
                tags = [*tag_group]
                for tag in tags:
                    if tag in ground_truth_:
                        if len(tag) > 0:
                            score = 1
                            prediction_score += 1
                            break
                    else:
                        for gt_tag in ground_truth_:
                            if (gt_tag.startswith(tag) or tag.startswith(gt_tag)) and len(gt_tag) > 0:
                                score = 1
                                prediction_score += 1
                                break

            prediction_current = {
                'que_no': input_count,
                'questions': question,
                'ground_truth': str(ground_truth),
                'predicted_tags': str(predicted_tag)
            }
            prediction_list.append(prediction_current)
            # append row to the result list
            input_predicted_list.append([input_count, ground_truth, predicted_tag, score])

            # log the ground truth & prediction
            logging.debug('Predicted: ' + str(predicted_tag))
            logging.debug('Ground Truth: ' + str(ground_truth))
            logging.debug('\n')

        accuracy = prediction_score / input_count
        self._accuracy = accuracy
        return prediction_list
# Find the AIFH core files
aifh_dir = os.path.dirname(os.path.abspath(__file__))
aifh_dir = os.path.abspath(aifh_dir + os.sep + ".." + os.sep + "lib" + os.sep + "aifh")
sys.path.append(aifh_dir)

from normalize import Normalize

# find the Iris data set
irisFile = os.path.dirname(os.path.realpath(__file__))
irisFile = os.path.abspath(irisFile + "../../datasets/iris.csv")

# Read the Iris data set.
print('Reading CSV file: ' + irisFile)
norm = Normalize()
result = norm.load_csv(irisFile)

# Setup the first four fields to "range normalize" between -1 and 1.
for i in range(0, 4):
    norm.make_col_numeric(result, i)
    norm.norm_col_range(result, i, -1, 1)

# Discover all of the classes for column #4, the iris species.
classes = norm.build_class_map(result, 4)

# Normalize iris species with equilateral encoding
norm.norm_col_equilateral(result, 4, classes, -1, 1)

# Display the resulting data
norm.display_data(result)
aifh_dir = os.path.dirname(os.path.abspath(__file__))
aifh_dir = os.path.abspath(aifh_dir + os.sep + ".." + os.sep + "lib" + os.sep + "aifh")
sys.path.append(aifh_dir)

from normalize import Normalize

k = 3

# find the Iris data set
irisFile = os.path.dirname(os.path.realpath(__file__))
irisFile = os.path.abspath(irisFile + "../../datasets/iris.csv")

# Read the Iris data set.
print('Reading CSV file: ' + irisFile)
norm = Normalize()
iris_data = norm.load_csv(irisFile)

# Prepare the iris data set.
classes = norm.col_extract(iris_data, 4)
norm.col_delete(iris_data, 4)
for i in range(0, 4):
    norm.make_col_numeric(iris_data, i)

# Cluster the Iris data set.
res, idx = kmeans2(np.array(iris_data), k)

for cluster_num in range(0, k):
    print("Cluster #" + str(cluster_num + 1))
    for i in range(0, len(idx)):
        if idx[i] == cluster_num:
def build_G(self, step, input_layers=None, output_layers=None,
            merged_old_output_layers=None):
    n_sty_inp = self.get_n_inp_sty(step)
    self.mixing_matrices = ini_mixing_matrix(n_sty_inp, step + 1)

    G = input_layers
    if G is None:
        G = self.mk_input_layers_for_G(step)
    elif len(G.output) < step + 2:
        G.name = 'input_layers_{}_for_G'.format(step - 1)
        print('rebuild input layers... from {} to {}.'.format(step - 1, step))
        self.load_weights_by_name(G)
        n_sty_inp = self.get_n_inp_sty(step)
        lct_inp = Input([1])
        lct = None
        sty_inps = [Input([self.latent_size]) for _ in range(n_sty_inp)]
        nors = [Normalize()(inp) for inp in sty_inps]
        dens = []
        d = nors
        for layer in G.layers:
            if isinstance(layer, Dense):
                dens.append(layer)
            if isinstance(layer, LearnedConstTensor):
                lct = layer(lct_inp)
        for i in range(n_sty_inp):
            for j in range(self.n_layers_of_mn):
                d[i] = dens[j](d[i])
        sty_mix = [MixStyle(i, n_sty_inp, step + 1)(d) for i in range(step + 1)]
        G = Model(inputs=[lct_inp] + sty_inps,
                  outputs=[lct] + sty_mix,
                  name='input_layers_{}_for_G'.format(str(step)))

    inps = G.input
    G = G(inps)
    styles = G[1:]
    G = G[0]

    if self.generators[0] is None:
        self.generators[0] = self.mk_G_block(0, default_depth_G[0])
    for i in range(step):
        if self.generators[i] is None:
            self.generators[i] = self.mk_G_block(i, default_depth_G[i], self.self_attns[i])
        G = self.generators[i]([G, styles[i]])
    old_G = G
    if self.generators[step] is None:
        self.generators[step] = self.mk_G_block(step, default_depth_G[step], self.self_attns[step])
    G = self.generators[step]([old_G, styles[step]])

    if output_layers is None:
        output_layers = self.mk_output_layers_for_G(step)
    G = output_layers(G)

    if merged_old_output_layers is not None:
        G = self.mk_merge_layers_for_G(step, merged_old_output_layers)([old_G, G])

    self.G = Model(inputs=inps, outputs=G)
    self.mix_reg()
def __init__(self, n_classes, deno, in_out=None, feat_dim=None, graph_size=None,
             method='cos', sparsify=[0.8], non_lin='HT', aft_nonlin=None,
             sigmoid=False, layer_bef=None, graph_sum=False, background=False,
             just_graph=False):
    super(Graph_Multi_Video, self).__init__()
    self.num_classes = n_classes
    self.background = background
    if self.background:
        assert sigmoid
        n_classes += 1

    self.deno = deno
    self.graph_size = graph_size
    self.sparsify = sparsify
    self.graph_sum = graph_sum
    self.just_graph = just_graph

    if in_out is None:
        in_out = [2048, 64]
    if feat_dim is None:
        feat_dim = [2048, 64]
    num_layers = len(sparsify)
    print('NUM LAYERS', num_layers, in_out)

    self.bn = None  # nn.BatchNorm1d(2048, affine = False)
    self.linear_layer = nn.Linear(feat_dim[0], feat_dim[1], bias=True)

    if layer_bef is None:
        self.layer_bef = None
    else:
        self.layer_bef = []
        self.layer_bef.append(nn.Linear(layer_bef[0], layer_bef[1], bias=True))
        self.layer_bef.append(nn.ReLU())
        # self.layer_bef.append(Normalize())
        self.layer_bef = nn.Sequential(*self.layer_bef)

    self.graph_layers = nn.ModuleList()
    self.last_graphs = nn.ModuleList()
    for num_layer in range(num_layers):
        if self.sparsify[num_layer] == 'lin':
            lin_curr = []
            if non_lin == 'HT':
                lin_curr.append(nn.Hardtanh())
            elif non_lin.lower() == 'rl':
                lin_curr.append(nn.ReLU())
            elif non_lin is not None:
                error_message = 'non_lin %s not recognized' % non_lin
                raise ValueError(error_message)
            lin_curr.append(nn.Linear(in_out[0], in_out[1]))
            to_pend = aft_nonlin.split('_')
            for tp in to_pend:
                if tp.lower() == 'ht':
                    lin_curr.append(nn.Hardtanh())
                elif tp.lower() == 'rl':
                    lin_curr.append(nn.ReLU())
                elif tp.lower() == 'l2':
                    lin_curr.append(Normalize())
                elif tp.lower() == 'ln':
                    lin_curr.append(nn.LayerNorm(in_out[1]))
                elif tp.lower() == 'bn':
                    lin_curr.append(nn.BatchNorm1d(in_out[1], affine=False,
                                                   track_running_stats=False))
                else:
                    error_message = 'aft_nonlin %s not recognized' % tp
                    raise ValueError(error_message)
            lin_curr = nn.Sequential(*lin_curr)
            self.graph_layers.append(lin_curr)
        else:
            self.graph_layers.append(
                Graph_Layer_Wrapper(in_out[0], n_out=in_out[1], non_lin=non_lin,
                                    method=method, aft_nonlin=aft_nonlin))

        if self.just_graph:
            if sigmoid:
                aft_nonlin_curr = 'sig'
            else:
                aft_nonlin_curr = None
            last_graph = Graph_Layer_Wrapper(in_out[-1], n_classes, non_lin=non_lin,
                                             method=method,
                                             aft_nonlin=aft_nonlin_curr)
        else:
            last_graph = []
            last_graph.append(nn.Dropout(0.5))
            last_graph.append(nn.Linear(in_out[-1], n_classes))
            if sigmoid:
                last_graph.append(nn.Sigmoid())
            last_graph = nn.Sequential(*last_graph)
        self.last_graphs.append(last_graph)

    self.num_branches = num_layers
    print('self.num_branches', self.num_branches)
    if args[1] == 0:
        return 1
    return args[0] / args[1]


add_wrapper = FunctionWrapper(add, 2, "+")
sub_wrapper = FunctionWrapper(sub, 2, "-")
mul_wrapper = FunctionWrapper(mul, 2, "*")
div_wrapper = FunctionWrapper(div, 2, "/")

# find the simple-poly data set
polyFile = os.path.dirname(os.path.realpath(__file__))
polyFile = os.path.abspath(polyFile + "../../datasets/simple-poly.csv")

# Read the simple-poly data set.
print('Reading CSV file: ' + polyFile)
norm = Normalize()
poly_work = norm.load_csv(polyFile)
norm.make_col_numeric(poly_work, 0)
norm.make_col_numeric(poly_work, 1)

# Prepare training data. Separate into input and ideal.
training = np.array(poly_work)
training_input = training[:, 0:1]
training_ideal = training[:, 1:2]


# Calculate the error with MSE.
def score_function(genome):
    # Loop over the training set and calculate the output for each.
    actual_output = []
    for input_data in training_input:
        genome.set_variable_value(["x"], input_data)
def normalize(self, types):
    print("Data Normalize with ", types)
    normalization = Normalize(self.data)
    normalization.normalizing(types, self.__type)
def __init__(self, n_classes, deno, in_out=None, feat_dim=None, graph_size=None,
             method='cos', num_switch=1, focus=0, sparsify=False, non_lin='HT',
             normalize=[True, True]):
    super(Graph_Multi_Video, self).__init__()
    self.num_classes = n_classes
    self.deno = deno
    self.graph_size = graph_size
    self.sparsify = sparsify

    if in_out is None:
        in_out = [2048, 64]
    if feat_dim is None:
        feat_dim = [2048, 64]
    num_layers = len(in_out) - 1
    # non_lin = 'HT'
    print('NUM LAYERS', num_layers, in_out)

    # if pretrained == 'ucf':
    #     model_file = '../experiments/just_mill_flexible_deno_8_n_classes_20_layer_sizes_2048_64_ucf/all_classes_False_just_primary_False_limit_500_cw_True_MultiCrossEntropy_100_step_100_0.1_0.001_0.001/model_99.pt'
    # elif pretrained == 'activitynet':
    #     model_file = '../experiments/just_mill_flexible_deno_8_n_classes_100_layer_sizes_2048_64_activitynet/all_classes_False_just_primary_False_limit_500_cw_True_MultiCrossEntropy_50_step_50_0.1_0.001_0.001/model_49.pt'
    # elif pretrained == 'random':
    #     model_file = '../experiments/just_mill_flexible_deno_8_n_classes_20_layer_sizes_2048_64_ucf/all_classes_False_just_primary_False_limit_500_cw_True_MultiCrossEntropy_100_step_100_0.1_0_0.001/model_99.pt'
    # elif pretrained == 'default':
    #     model_file = None
    # else:
    #     error_message = 'Similarity method %s not valid' % method
    #     raise ValueError(error_message)
    # if model_file is not None:
    #     model_temp = torch.load(model_file)
    #     self.linear_layer.weight.data = model_temp.linear.weight.data
    # else:
    #     print('NO MODEL FILE AAAAAAAA')

    self.linear_layers = nn.ModuleList()
    for idx_layer_num, layer_num in enumerate(range(num_layers)):
        if non_lin == 'HT':
            non_lin_curr = nn.Hardtanh()
        elif non_lin == 'RL':
            non_lin_curr = nn.ReLU()
        else:
            error_message = 'Non lin %s not valid' % non_lin
            raise ValueError(error_message)

        last_linear = []
        idx_curr = idx_layer_num * 2
        last_linear.append(nn.Linear(feat_dim[idx_curr], feat_dim[idx_curr + 1], bias=False))
        last_linear.append(non_lin_curr)
        if normalize[0]:
            last_linear.append(Normalize())
        last_linear.append(nn.Dropout(0.5))
        last_linear.append(nn.Linear(feat_dim[idx_curr + 1], n_classes))
        last_linear = nn.Sequential(*last_linear)
        self.linear_layers.append(last_linear)

    self.graph_layers = nn.ModuleList()
    for num_layer in range(num_layers):
        self.graph_layers.append(
            Graph_Layer_Wrapper(in_out[num_layer], n_out=in_out[num_layer + 1],
                                non_lin=non_lin, method=method))

    # last_linear = []
    # if non_lin == 'HT':
    #     last_linear.append(nn.Hardtanh())
    # elif non_lin == 'RL':
    #     last_linear.append(nn.ReLU())
    # else:
    #     error_message = str('Non lin %s not valid', non_lin)
    #     raise ValueError(error_message)
    # last_linear.append(nn.Dropout(0.5))
    # last_linear.append(nn.Linear(in_out[1], n_classes))
    # last_linear = nn.Sequential(*last_linear)
    # self.last_linear = last_linear

    last_graph = []
    if non_lin == 'HT':
        last_graph.append(nn.Hardtanh())
    elif non_lin == 'RL':
        last_graph.append(nn.ReLU())
    else:
        error_message = 'Non lin %s not valid' % non_lin
        raise ValueError(error_message)
    if normalize[1]:
        last_graph.append(Normalize())
    last_graph.append(nn.Dropout(0.5))
    last_graph.append(nn.Linear(in_out[-1], n_classes))
    self.last_graph = nn.Sequential(*last_graph)

    self.num_branches = num_layers + 1
    if type(num_switch) == type(1):
        num_switch = [num_switch] * self.num_branches
    self.num_switch = num_switch
    self.epoch_counters = [0] * self.num_branches
    self.focus = focus
    self.epoch_last = 0

    print('self.num_branches', self.num_branches)
    print('self.num_switch', self.num_switch)
    print('self.epoch_counters', self.epoch_counters)
    print('self.focus', self.focus)
    print('self.epoch_last', self.epoch_last)
# Find the AIFH core files
aifh_dir = os.path.dirname(os.path.abspath(__file__))
aifh_dir = os.path.abspath(aifh_dir + os.sep + ".." + os.sep + "lib" + os.sep + "aifh")
sys.path.append(aifh_dir)

from normalize import Normalize

# find the Wisconsin breast cancer data set
dataFile = os.path.dirname(os.path.realpath(__file__))
dataFile = os.path.abspath(dataFile + "../../datasets/breast-cancer-wisconsin.csv")

# Normalize the Wisconsin file.
norm = Normalize()
data_file_work = norm.load_csv(dataFile)
norm.delete_unknowns(data_file_work)
norm.col_delete(data_file_work, 0)
norm.col_replace(data_file_work, 9, 4, 1, 0)

for i in range(0, 9):
    norm.make_col_numeric(data_file_work, i)

df = pd.DataFrame(data_file_work)
df.columns = [
    "clump_thickness", "size_uniformity", "shape_uniformity", "marginal_adhesion",
    "epithelial_size", "bare_nucleoli", "bland_chromatin", "normal_nucleoli",
    "mitoses", "class"
]
        return 1
    return args[0] / args[1]


add_wrapper = FunctionWrapper(add, 2, "+")
sub_wrapper = FunctionWrapper(sub, 2, "-")
mul_wrapper = FunctionWrapper(mul, 2, "*")
div_wrapper = FunctionWrapper(div, 2, "/")

# find the simple-poly data set
polyFile = os.path.dirname(os.path.realpath(__file__))
polyFile = os.path.abspath(polyFile + "../../datasets/simple-poly.csv")

# Read the simple-poly data set.
print('Reading CSV file: ' + polyFile)
norm = Normalize()
poly_work = norm.load_csv(polyFile)
norm.make_col_numeric(poly_work, 0)
norm.make_col_numeric(poly_work, 1)

# Prepare training data. Separate into input and ideal.
training = np.array(poly_work)
training_input = training[:, 0:1]
training_ideal = training[:, 1:2]


# Calculate the error with MSE.
def score_function(genome):
    # Loop over the training set and calculate the output for each.
    actual_output = []
    for input_data in training_input:
aifh_dir = os.path.abspath(aifh_dir + os.sep + ".." + os.sep + "lib" + os.sep + "aifh")
sys.path.append(aifh_dir)

from normalize import Normalize
from rbf_network import RbfNetwork
from error import ErrorCalculation
from equilateral import Equilateral

# find the Iris data set
irisFile = os.path.dirname(os.path.realpath(__file__))
irisFile = os.path.abspath(irisFile + "../../datasets/iris.csv")

# Read the Iris data set.
print('Reading CSV file: ' + irisFile)
norm = Normalize()
iris_work = norm.load_csv(irisFile)

# Extract the original iris species so we can display during the final validation.
ideal_species = [row[4] for row in iris_work]

# Setup the first four fields to "range normalize" between 0 and 1.
for i in range(0, 4):
    norm.make_col_numeric(iris_work, i)
    norm.norm_col_range(iris_work, i, 0, 1)

# Discover all of the classes for column #4, the iris species.
classes = norm.build_class_map(iris_work, 4)
inv_classes = {v: k for k, v in classes.items()}

# Normalize iris species using equilateral
# Find the AIFH core files
aifh_dir = os.path.dirname(os.path.abspath(__file__))
aifh_dir = os.path.abspath(aifh_dir + os.sep + ".." + os.sep + "lib" + os.sep + "aifh")
sys.path.append(aifh_dir)

from normalize import Normalize

# find the Wisconsin breast cancer data set
dataFile = os.path.dirname(os.path.realpath(__file__))
dataFile = os.path.abspath(dataFile + "../../datasets/breast-cancer-wisconsin.csv")

# Normalize the Wisconsin file.
norm = Normalize()
data_file_work = norm.load_csv(dataFile)
norm.delete_unknowns(data_file_work)
norm.col_delete(data_file_work, 0)
norm.col_replace(data_file_work, 9, 4, 1, 0)

for i in range(0, 9):
    norm.make_col_numeric(data_file_work, i)

df = pd.DataFrame(data_file_work)
df.columns = [
    "clump_thickness", "size_uniformity", "shape_uniformity", "marginal_adhesion",
    "epithelial_size", "bare_nucleoli", "bland_chromatin", "normal_nucleoli",
    "mitoses", "class"
]

train_cols = df.columns[0:9]

# Perform the logistic regression.
def __init__(self, n_classes, deno, in_out=None, feat_dim=None, graph_size=None,
             method='cos', sparsify=False, non_lin='HT', normalize=[True, True]):
    super(Graph_Multi_Video, self).__init__()
    self.num_classes = n_classes
    self.deno = deno
    self.graph_size = graph_size
    self.sparsify = sparsify

    if in_out is None:
        in_out = [2048, 64]
    if feat_dim is None:
        feat_dim = [2048, 64]
    num_layers = len(in_out) - 1
    print('NUM LAYERS', num_layers, in_out)

    self.linear_layers = nn.ModuleList()
    self.linear_layers_after = nn.ModuleList()
    for idx_layer_num, layer_num in enumerate(range(num_layers)):
        if non_lin == 'HT':
            non_lin_curr = nn.Hardtanh()
        elif non_lin == 'RL':
            non_lin_curr = nn.ReLU()
        else:
            error_message = 'Non lin %s not valid' % non_lin
            raise ValueError(error_message)

        idx_curr = idx_layer_num * 2
        self.linear_layers.append(
            nn.Linear(feat_dim[idx_curr], feat_dim[idx_curr + 1], bias=False))

        last_linear = []
        last_linear.append(non_lin_curr)
        if normalize[0]:
            last_linear.append(Normalize())
        last_linear.append(nn.Dropout(0.5))
        last_linear.append(nn.Linear(feat_dim[idx_curr + 1], n_classes))
        last_linear = nn.Sequential(*last_linear)
        self.linear_layers_after.append(last_linear)

    self.graph_layers = nn.ModuleList()
    for num_layer in range(num_layers):
        self.graph_layers.append(
            Graph_Layer_Wrapper(in_out[num_layer], n_out=in_out[num_layer + 1],
                                non_lin=non_lin, method=method))

    self.num_branches = num_layers + 1
    print('self.num_branches', self.num_branches)
def __init__(self, n_classes, deno, in_out=None, feat_dim=None, in_out_feat=None,
             graph_size=None, method='cos', sparsify=0.5, non_lin='RL',
             aft_nonlin='RL', aft_nonlin_feat='RL', sigmoid=False, layer_bef=None,
             graph_sum=False, background=False, just_graph=False):
    super(Graph_Multi_Video, self).__init__()
    self.num_classes = n_classes
    self.background = background
    if self.background:
        assert sigmoid
        n_classes += 1

    self.deno = deno
    self.graph_size = graph_size
    self.sparsify = sparsify
    self.graph_sum = graph_sum
    self.just_graph = just_graph

    if in_out_feat is None:
        in_out_feat = [2048, 1024]
    if in_out is None:
        in_out = [1024, 512]
    if feat_dim is None:
        feat_dim = [1024, 256]
    assert feat_dim[0] == in_out_feat[1] == in_out[0]

    self.num_branches = 2
    print('self.num_branches', self.num_branches)

    self.bn = None  # nn.BatchNorm1d(2048, affine = False)

    self.feature = []
    self.feature.append(nn.Linear(in_out_feat[0], in_out_feat[1], bias=True))
    to_pend = aft_nonlin_feat.split('_')
    for tp in to_pend:
        if tp.lower() == 'ht':
            self.feature.append(nn.Hardtanh())
        elif tp.lower() == 'rl':
            self.feature.append(nn.ReLU())
        elif tp.lower() == 'l2':
            self.feature.append(Normalize())
        elif tp.lower() == 'ln':
            self.feature.append(nn.LayerNorm(in_out_feat[1]))
        elif tp.lower() == 'bn':
            self.feature.append(nn.BatchNorm1d(in_out_feat[1], affine=False,
                                               track_running_stats=False))
        elif tp.lower() == 'sig':
            self.feature.append(nn.Sigmoid())
        else:
            error_message = 'aft_nonlin %s not recognized' % tp
            raise ValueError(error_message)
    self.feature = nn.Sequential(*self.feature)
    # self.feature_classifier = nn.Linear(in_out[-1], n_classes)

    self.linear_layer = nn.Linear(feat_dim[0], feat_dim[1], bias=True)
    self.graph_layer = Graph_Layer_Wrapper(in_out[0], n_out=in_out[1],
                                           non_lin=non_lin, method=method,
                                           aft_nonlin=aft_nonlin)

    last_graph = []
    last_graph.append(nn.Dropout(0.5))
    last_graph.append(nn.Linear(in_out[-1], n_classes))
    if sigmoid:
        last_graph.append(nn.Sigmoid())
    self.last_graph = nn.Sequential(*last_graph)

    last_feat = []
    last_feat.append(nn.Dropout(0.5))
    last_feat.append(nn.Linear(in_out_feat[-1], n_classes))
    if sigmoid:
        last_feat.append(nn.Sigmoid())
    # last_feat.append(nn.Softmax(dim=0))
    self.last_feat = nn.Sequential(*last_feat)
# Find the AIFH core files
aifh_dir = os.path.dirname(os.path.abspath(__file__))
aifh_dir = os.path.abspath(aifh_dir + os.sep + ".." + os.sep + "lib" + os.sep + "aifh")
sys.path.append(aifh_dir)

from normalize import Normalize

k = 3

# find the Iris data set
irisFile = os.path.dirname(os.path.realpath(__file__))
irisFile = os.path.abspath(irisFile + "../../datasets/iris.csv")

# Read the Iris data set.
print('Reading CSV file: ' + irisFile)
norm = Normalize()
iris_data = norm.load_csv(irisFile)

# Prepare the iris data set.
classes = norm.col_extract(iris_data, 4)
norm.col_delete(iris_data, 4)
for i in range(0, 4):
    norm.make_col_numeric(iris_data, i)

# Cluster the Iris data set.
res, idx = kmeans2(np.array(iris_data), k)

for cluster_num in range(0, k):
    print("Cluster #" + str(cluster_num + 1))
    for i in range(0, len(idx)):
        if idx[i] == cluster_num:
class Spider(object):
    header = Headers()
    headers = header.headers()  # initialize with a random request header
    normalize = Normalize()     # used to normalize URLs
    items_fans = {}             # dict for storing the follower list
    items_self = {}             # dict for storing profile information
    redis = Redis()
    mongo = Mongo()
    s_time = 0                  # start time of a run
    e_time = 0                  # end time of a run
    flag = 0                    # flag for switching request headers
    default_time = 20

    def start_url(self):
        # seed links
        start_urls = [
            'https://weibo.com/p/1004061537790411?is_hot=1',
        ]
        for start_url in start_urls:
            yield start_url

    def downloader(self, url_item, referer, retries_num=4):
        """Return the page source."""
        print("Starting download")
        self.e_time = time.time()  # current time
        time_dif = self.e_time - self.s_time
        if self.flag == 1:
            time_dif = 400
            self.flag = 0
        if time_dif > 300:
            self.headers = self.header.headers()  # get a fresh random request header
            self.s_time = self.e_time
        time.sleep(random.random() * 5 + random.random() * 5)  # + random.randint(1, 5)
        if referer:  # add a Referer header if the page checks it
            self.headers['Referer'] = referer
        url = url_item[0]
        print("Fetching:", url)
        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            # print(self.headers)
            print("Status code:", response.status_code)
            # print(response.text)
            if response.status_code == 200:
                if len(response.text) > 50000:
                    return response.text
                else:
                    return None
            else:
                self.flag = 0  # switch request headers
                if retries_num > 0:
                    print("Download attempt", 4 - retries_num)
                    return self.downloader(url_item, referer, retries_num - 1)
                else:
                    self.redis.push(url_item)  # re-queue the URL if the download failed
                    return None
        except requests.exceptions.ConnectionError as e:
            print("downloader error for", url)
            print("Error message:", str(e))
        else:
            response = requests.get(url, headers=self.headers)
            return response.text

    def parse_follow_page(self, html, referer):
        """
        Extract the page_id from a profile page (used to build the link to the
        people this user follows), plus the followee and follower counts.
        """
        print("Parse function 1")
        p1 = r'<title>(.*?[\u4e00-\u9fa5]{0,})的微博_微博</title>'  # whose Weibo page this is
        p3 = r"\$CONFIG\['page_id'\]='(\d.*?)';"  # page_id
        p4 = r"(\d{6})"  # pid, extracted from the page_id
        p5 = r'<strong\sclass=\\"W_f12\\">(\d*?)<\\/strong><span\sclass=\\"S_txt2\\">关注<\\/span>'  # followee count
        p6 = r'<strong\sclass=\\"W_f12\\">(\d*?)<\\/strong><span\sclass=\\"S_txt2\\">粉丝<\\/span>'  # follower count

        self.items_self = {}
        self.items_self['collection'] = re.search(p1, html).group(1)  # whose home page; used to name the collection
        self.items_self['page_id'] = re.search(p3, html).group(1)
        self.items_self['pid'] = re.search(p4, self.items_self['page_id']).group(1)
        try:
            self.items_self['idol'] = int(re.search(p5, html).group(1))
        except:
            self.items_self['idol'] = '__'  # followee count hidden, so the idol list cannot be crawled
            print("Followee count is not accessible")
        try:
            self.items_self['fans'] = int(re.search(p6, html).group(1))
        except:
            self.items_self['fans'] = 0
            print("Follower count is not accessible")
        if self.items_self['fans'] > 50000:
            # damping factor
            self.items_self['damp'] = 1
        else:
            self.items_self['damp'] = 0.5
        print(self.items_self)
        # self.mongo.save(self.items_self)  # persist
        yield self.items_self  # yield the result so it can be stored

        if isinstance(self.items_self['idol'], int):
            for url in self.normalize.nor_follow(self.items_self['page_id']):  # followee pages
                url_item = [url, self.parse_detail, referer]
                yield url_item  # only the followee-page links need to be returned; everything else is stored directly
        else:
            yield None

    def parse_detail(self, html, referer):
        """Extract each followee's follow page and home-page link."""
        print("Parse function 2")
        self.items_fans = {}
        p1 = r'<title>(.*?[\u4e00-\u9fa5]{0,})的微博_微博</title>'
        p2 = r'<a\starget=\\"_blank\\"\stitle=\\"(.*?[\u4e00-\u9fa5]{0,})\\"\shref=\\"(.*?)\\"\s>'  # matches the followee list
        try:
            results = re.findall(p2, html)
            for result in results:
                if result:
                    collection = re.search(p1, html).group(1)  # control table
                    idol_name = result[0]  # followee's name
                    link = self.normalize.nor_home(result[1].replace('\\', ''))  # followee's home-page link
                    if re.search(r'\?', link):  # only store links containing a '?'
                        self.items_fans = {
                            'collection': collection,
                            'idol_name': idol_name,
                            'link': link,
                        }
                        print(self.items_fans)
                        # self.mongo.save(self.items_fans)  # save to the database
                        yield self.items_fans  # yield the result so it can be stored
                        url_item = [self.items_fans['link'], self.parse_follow_page, referer]
                        yield url_item
                    else:
                        print("Link does not match the expected format:", link)
                        yield None
        except:
            print("Follower list is not accessible")

    def scheduler(self):
        # initialization
        # self.redis.delete()  # controls whether crawling resumes after the spider is closed
        if self.redis.llen() == 0:
            for url in self.start_url():
                callback = self.parse_follow_page
                referer = "https://weibo.com"
                url_item = [url, callback, referer]
                self.redis.push(url_item)
        while True:
            print("Starting run")
            if self.redis.llen():
                url_item = self.redis.pop()
                url = url_item[0]
                callback = url_item[1]
                referer = url_item[2]
                html = self.downloader(url_item, referer=referer)
                if html is not None:
                    print("Length of html:", len(html))
                    for items in callback(html, url):
                        if isinstance(items, list):
                            print("Result is a list")
                            self.redis.push(items)
                        if isinstance(items, dict):
                            print("Result is a dict")
                            self.mongo.save(items)
                        if items is None:
                            pass  # skip users whose follower list is not visible
                else:
                    print("Value of html:", html)
            else:
                break

    def run(self):
        self.scheduler()
def __init__(self, n_classes, deno, in_out=None, feat_dim=None, graph_size=None,
             method='cos', sparsify=False, non_lin='HT', normalize=[True, True],
             attention=False, gk=8, aft_nonlin=None):
    super(Graph_Multi_Video, self).__init__()
    self.num_classes = n_classes
    self.deno = deno
    self.graph_size = graph_size
    self.sparsify = sparsify
    self.gk = gk

    if in_out is None:
        in_out = [2048, 64]
    # if feat_dim is None:
    #     feat_dim = [2048, 64]
    num_layers = len(in_out) - 1
    print('NUM LAYERS', num_layers, in_out)

    self.bn = None  # nn.BatchNorm1d(2048, affine = False)

    # self.linear_layers = nn.ModuleList()
    # self.linear_layers_after = nn.ModuleList()
    # for idx_layer_num, layer_num in enumerate(range(num_layers)):
    #     if non_lin == 'HT':
    #         non_lin_curr = nn.Hardtanh()
    #     elif non_lin == 'RL':
    #         non_lin_curr = nn.ReLU()
    #     else:
    #         error_message = 'Non lin %s not valid' % non_lin
    #         raise ValueError(error_message)
    #     idx_curr = idx_layer_num * 2
    #     self.linear_layers.append(nn.Linear(feat_dim[idx_curr], feat_dim[idx_curr + 1], bias=True))
    #     last_linear = []
    #     last_linear.append(non_lin_curr)
    #     if normalize[0]:
    #         last_linear.append(Normalize())
    #     last_linear.append(nn.Dropout(0.5))
    #     last_linear.append(nn.Linear(feat_dim[idx_curr + 1], n_classes))
    #     last_linear = nn.Sequential(*last_linear)
    #     self.linear_layers_after.append(last_linear)

    self.graph_layers = nn.ModuleList()
    for num_layer in range(num_layers):
        self.graph_layers.append(
            Graph_Layer_Wrapper(in_out[num_layer], n_out=in_out[num_layer + 1],
                                non_lin=non_lin, method=method,
                                aft_nonlin=aft_nonlin))

    last_graph = []
    if aft_nonlin is None:
        if non_lin == 'HT':
            last_graph.append(nn.Hardtanh())
        elif non_lin == 'RL':
            last_graph.append(nn.ReLU())
        else:
            error_message = 'Non lin %s not valid' % non_lin
            raise ValueError(error_message)
        if normalize[1]:
            last_graph.append(Normalize())
    last_graph.append(nn.Dropout(0.5))
    last_graph.append(nn.Linear(in_out[-1], n_classes))
    self.last_graph = nn.Sequential(*last_graph)

    self.num_branches = 1  # num_layers + 1
    self.attention = attention
    print('self.num_branches', self.num_branches)
aifh_dir = os.path.abspath(aifh_dir + os.sep + ".." + os.sep + "lib" + os.sep + "aifh")
sys.path.append(aifh_dir)

from normalize import Normalize
from rbf_network import RbfNetwork
from error import ErrorCalculation
from train import TrainAnneal
import numpy as np

# find the Iris data set
irisFile = os.path.dirname(os.path.realpath(__file__))
irisFile = os.path.abspath(irisFile + "../../datasets/iris.csv")

# Read the Iris data set.
print('Reading CSV file: ' + irisFile)
norm = Normalize()
iris_work = norm.load_csv(irisFile)

# Extract the original iris species so we can display during the final validation.
ideal_species = [row[4] for row in iris_work]

# Setup the first four fields to "range normalize" between 0 and 1.
for i in range(0, 4):
    norm.make_col_numeric(iris_work, i)
    norm.norm_col_range(iris_work, i, 0, 1)

# Discover all of the classes for column #4, the iris species.
classes = norm.build_class_map(iris_work, 4)
inv_classes = {v: k for k, v in classes.items()}

# Normalize iris species using one-of-n.
def test_normalize_one_of_n(self):
    # find the Iris data set
    irisFile = os.path.dirname(os.path.realpath(__file__))
    irisFile = os.path.abspath(irisFile + "../../../datasets/iris.csv")

    norm = Normalize()
    result = norm.load_csv(irisFile)

    self.assertEqual(len(norm.column_map), 5)
    self.assertEqual(len(norm.header), 5)
    self.assertEqual(norm.header[0], "sepal_length")
    self.assertEqual(norm.header[1], "sepal_width")
    self.assertEqual(norm.header[2], "petal_length")
    self.assertEqual(norm.header[3], "petal_width")
    self.assertEqual(norm.header[4], "class")

    self.assertTrue("sepal_length" in norm.column_map)
    self.assertTrue("sepal_width" in norm.column_map)
    self.assertTrue("petal_length" in norm.column_map)
    self.assertTrue("petal_width" in norm.column_map)
    self.assertTrue("class" in norm.column_map)

    self.assertEqual(norm.resolve_column("sepal_length"), 0)
    self.assertEqual(norm.resolve_column("sepal_width"), 1)
    self.assertEqual(norm.resolve_column("petal_length"), 2)
    self.assertEqual(norm.resolve_column("petal_width"), 3)
    self.assertEqual(norm.resolve_column("class"), 4)

    self.assertRaises(AIFHError, norm.resolve_column, 6)
    self.assertRaises(AIFHError, norm.resolve_column, "unknown")

    for i in range(0, 4):
        norm.make_col_numeric(result, i)
        norm.norm_col_range(result, i, -1, 1)

    self.assertAlmostEqual(result[0][0], -0.555, 2)
    self.assertAlmostEqual(result[0][1], 0.249, 2)
    self.assertAlmostEqual(result[0][2], -0.864, 2)
    self.assertAlmostEqual(result[0][3], -0.916, 2)

    classes = norm.build_class_map(result, 4)
    norm.norm_col_one_of_n(result, 4, classes, -1, 1)
    self.assertEqual(len(classes), 3)
aifh_dir = os.path.abspath(aifh_dir + os.sep + ".." + os.sep + "lib" + os.sep + "aifh")
sys.path.append(aifh_dir)

from normalize import Normalize
from rbf_network import RbfNetwork
from error import ErrorCalculation
from train import TrainAnneal

# find the Iris data set
irisFile = os.path.dirname(os.path.realpath(__file__))
irisFile = os.path.abspath(irisFile + "../../datasets/iris.csv")

# Read the Iris data set.
print('Reading CSV file: ' + irisFile)
norm = Normalize()
iris_work = norm.load_csv(irisFile)

# Extract the original iris species so we can display during the final validation.
ideal_species = [row[4] for row in iris_work]

# Setup the first four fields to "range normalize" between 0 and 1.
for i in range(0, 4):
    norm.make_col_numeric(iris_work, i)
    norm.norm_col_range(iris_work, i, 0, 1)

# Discover all of the classes for column #4, the iris species.
classes = norm.build_class_map(iris_work, 4)
inv_classes = {v: k for k, v in classes.items()}

# Normalize iris species using one-of-n.
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
from pyspark.mllib.linalg import Vectors
from normalize import Normalize
from stopWords import StopWords
import math
import codecs

# utils = Utils(15000)
normalize = Normalize()
sc = SparkContext()
docs = sc.textFile("hdfs://localhost:8020/user/manh/crawler_1")
dicStopWords = {}
stopWords = StopWords('../input/stopwords.txt')
print(stopWords)
num = docs.count()
print("num = %s" % (num))


# Write "idf hash" pairs to a file; used to build the vector space.
def writeIdfHash(lst):
    # idf_hash = codecs.open("../output/idf_hash.txt", "wb", "utf8")
    idf_hash = open('../output/idf_hash.txt', 'w')
    i = 0
    for x in lst: