def euclidean_distance(vector_1, vector_2):
    """Return the Euclidean (L2) distance between two vectors.

    The vectors are first aligned/cleaned by ``prepare_data``; the distance
    is sqrt of the sum of squared component differences.
    """
    vector_1, vector_2 = prepare_data(vector_1, vector_2)
    squared_diffs = (math.pow(a - b, 2) for a, b in zip(vector_1, vector_2))
    return math.sqrt(sum(squared_diffs))
def exec_ppjoin():
    """Handle a ppjoin search request.

    Reads the query from ``request.args['q']``. The query may start with
    ``-sim <0.x> -dist <n>`` options followed by free text; free text
    triggers a text search, otherwise a plain ppjoin is run.

    Returns:
        The search result produced by ``stTextSearch`` or ``ppj_c``.
    """
    def parse_query(query):
        """Split a query into (theta, epsilon, text).

        Returns (None, None, query) when the option prefix is absent.
        """
        # Raw string: '\.' and '\d' are regex escapes, not string escapes.
        pattern = re.compile(r'-sim (0\.\d+) -dist (\d+)(.*)')
        m = pattern.match(query)
        if m:
            sim = float(m.group(1))
            dist = int(m.group(2))
            text = m.group(3)
            # BUG FIX: the caller unpacks (theta, epsilon, text) where theta
            # is the similarity threshold (-sim) and epsilon the distance
            # bound (-dist); the original returned them swapped
            # (dist, sim, text).
            return sim, dist, text
        else:
            return None, None, query

    query = request.args.get('q')  # query from search string comes here
    df = prepare_data('data/miami1000.pkl')
    inverted_file = get_inverted_file(df)
    if query:
        theta, epsilon, text = parse_query(query)
        # 'is None' (not falsiness): a parsed theta of 0.0 is a legitimate
        # explicit value and must not be silently replaced by the default.
        if theta is None:
            theta = 0.1
            epsilon = 100
        if text:
            res = stTextSearch(df, text, theta)
        else:
            res = ppj_c(df, theta, epsilon)
    else:
        # No query supplied at all: run ppjoin with the defaults.
        theta = 0.5
        epsilon = 100
        res = ppj_c(df, theta, epsilon)
    return res
def exec_ppjoin():
    """Handle a ppjoin search request.

    NOTE(review): this is a duplicate definition of ``exec_ppjoin``; at
    import time this later definition shadows the earlier one — confirm
    whether one copy can be removed.

    Reads the query from ``request.args['q']``. The query may start with
    ``-sim <0.x> -dist <n>`` options followed by free text; free text
    triggers a text search, otherwise a plain ppjoin is run.

    Returns:
        The search result produced by ``stTextSearch`` or ``ppj_c``.
    """
    def parse_query(query):
        """Split a query into (theta, epsilon, text).

        Returns (None, None, query) when the option prefix is absent.
        """
        # Raw string: '\.' and '\d' are regex escapes, not string escapes.
        pattern = re.compile(r'-sim (0\.\d+) -dist (\d+)(.*)')
        m = pattern.match(query)
        if m:
            sim = float(m.group(1))
            dist = int(m.group(2))
            text = m.group(3)
            # BUG FIX: the caller unpacks (theta, epsilon, text) where theta
            # is the similarity threshold (-sim) and epsilon the distance
            # bound (-dist); the original returned them swapped
            # (dist, sim, text).
            return sim, dist, text
        else:
            return None, None, query

    query = request.args.get('q')  # query from search string comes here
    df = prepare_data('data/miami1000.pkl')
    inverted_file = get_inverted_file(df)
    if query:
        theta, epsilon, text = parse_query(query)
        # 'is None' (not falsiness): a parsed theta of 0.0 is a legitimate
        # explicit value and must not be silently replaced by the default.
        if theta is None:
            theta = 0.1
            epsilon = 100
        if text:
            res = stTextSearch(df, text, theta)
        else:
            res = ppj_c(df, theta, epsilon)
    else:
        # No query supplied at all: run ppjoin with the defaults.
        theta = 0.5
        epsilon = 100
        res = ppj_c(df, theta, epsilon)
    return res
def get_test_data(path, normalize=True, means=None, stds=None, train_data=None):
    """
    Convenience function for extracting and optionally normalizing test data.

    Args:
        path: str, path to the data file
        normalize: boolean, whether to normalize numeric columns
        means: dict, containing means of columns to be normalized
        stds: dict, containing stds of columns to be normalized
        train_data: DataFrame, if not None, reference DataFrame for adding
            missing columns to test_data

    Returns:
        test_data: DataFrame
    """
    if normalize:
        # Normalization statistics must be supplied from the training set.
        assert means is not None
        assert stds is not None
    frame = prepare_data(path)
    if train_data is not None:
        frame = add_missing_cols(train_data, frame)
        # frame = frame[train_data.columns]
    if normalize:
        frame = normalize_test_data(frame, means, stds)
    return frame
def minkowski_distance(vector_1, vector_2, n_root):
    """Return the Minkowski distance of order ``n_root`` between two vectors.

    Args:
        vector_1, vector_2: numeric sequences (aligned by ``prepare_data``).
        n_root: positive integer order; a numeric string is converted.

    Returns:
        The Minkowski distance (p-norm of the component differences).

    Raises:
        ValueError: if a string ``n_root`` does not parse as an integer.
        ArithmeticError: if ``n_root`` is neither a positive integer nor a
            string (e.g. zero or negative).
    """
    vector_1, vector_2 = prepare_data(vector_1, vector_2)
    if isinstance(n_root, int) and n_root >= 1:
        total = sum(
            math.pow(abs(v1 - v2), n_root)
            for v1, v2 in zip(vector_1, vector_2))
        # BUG FIX: use a float exponent — under Python 2 integer-division
        # semantics the original '1 / n_root' evaluates to 0 for n_root > 1,
        # making the result total**0 == 1.0. Identical under Python 3.
        return total ** (1.0 / n_root)
    elif isinstance(n_root, str):
        try:
            return minkowski_distance(vector_1, vector_2, int(n_root))
        except ValueError:
            raise ValueError("nth root should be integer and greater than 0")
    else:
        raise ArithmeticError("nth root can not be Zero")
def cosine_similarity(vector_1, vector_2, distance=False):
    """Return the cosine similarity between two vectors.

    With ``distance=True``, returns the cosine distance (1 - similarity)
    instead.
    """
    vector_1, vector_2 = prepare_data(vector_1, vector_2)
    # Pair the components once; both the dot product and the norms are
    # computed over the same zipped pairs (as in the original formulation).
    pairs = list(zip(vector_1, vector_2))
    numerator = sum(v1 * v2 for v1, v2 in pairs)
    norm_1 = math.sqrt(sum(v1 ** 2 for v1, _ in pairs))
    norm_2 = math.sqrt(sum(v2 ** 2 for _, v2 in pairs))
    similarity = numerator / (norm_1 * norm_2)
    return 1 - similarity if distance else similarity
def run(num_classes,learning_rate,width,depth,mini_batch_size):
    """Train, validate and evaluate Net1, then predict on the unlabeled data.

    Args:
        num_classes: number of output classes passed to ``NN.Net1``.
        learning_rate: SGD learning rate.
        width: hidden-layer width passed to ``NN.Net1``.
        depth: network depth passed to ``NN.Net1``.
        mini_batch_size: batch size used during training.

    Side effects: loads "Models/Best_Model.pkl", displays results via
    ``m.show_results``, appends a row to a CSV via ``fw.create_data_csv``
    and stores predictions via ``fw.store_prediction``.
    """
    # Metric accumulators — one entry appended per repetition of the loop.
    precision = accuracy = recall = f_score = np.array([])
    X_train,X_test,y_train,y_test,unknown_data = dp.load_data()
    # prepare_data also returns the tensor dtype the model should use.
    X_train,X_test,y_train,y_test,unknown_data,dtype = dp.prepare_data(X_train,X_test,y_train,y_test,unknown_data)
    for _ in range(1):  # single repetition; raise the count to average over runs
        model = NN.Net1(num_classes,depth=depth,width=width).type(dtype)
        opt = optim.SGD(params=model.parameters(),lr=learning_rate,momentum=rp.m,nesterov=True)
        train_losses,test_losses = model.train_validate(X_train,y_train,X_test,y_test,opt,mini_batch_size,dtype)
        # Reload the best checkpoint saved during training before scoring.
        model = torch.load("Models/Best_Model.pkl")
        y_pred,_ = model.test(X_test)
        # Calculate metrics
        y_true = y_test.data.cpu().numpy()
        y_pred = y_pred.data.cpu().numpy()
        a,p,r,f = m.compute_metrics(y_true,y_pred)
        accuracy = np.append(accuracy,a)
        precision = np.append(precision,p)
        recall = np.append(recall,r)
        f_score = np.append(f_score,f)
    # Average each metric across repetitions (a no-op mean for one run).
    accuracy = np.mean(accuracy)
    precision = np.mean(precision)
    recall = np.mean(recall)
    f_score = np.mean(f_score)
    m.show_results(accuracy,precision,recall,f_score,num_classes,train_losses,test_losses)
    #g.generate_graph(model,X_train)
    # NOTE(review): 'len(test_losses)-10' presumably encodes the best epoch
    # relative to an early-stopping patience of 10 — confirm against fw.
    fw.create_data_csv(learning_rate,depth,width,mini_batch_size,rp.m,len(test_losses)-10,accuracy)
    # Store unknown_data prediction
    y_pred,_ = model.test(unknown_data)
    fw.store_prediction(y_pred.data.cpu().numpy())
def main(): hist = crawl() # split data train, test = train_test_split(hist, test_size=0.2) pd.plotting.register_matplotlib_converters() target_col = 'close' line_plot(train[target_col], test[target_col], 'training', 'test', title='') # initial data in neurons in LSTM layer np.random.seed(42) window_len = 5 test_size = 0.2 zero_base = True lstm_neurons = 100 epochs = 20 batch_size = 32 loss = 'mse' dropout = 0.2 optimizer = 'adam' # train model train, test, X_train, X_test, y_train, y_test = prepare_data( hist, target_col, window_len=window_len, zero_base=zero_base, test_size=test_size) model = build_lstm_model( X_train, output_size=1, neurons=lstm_neurons, dropout=dropout, loss=loss, optimizer=optimizer) history = model.fit( X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1, shuffle=True) # Mean Absolute Error targets = test[target_col][window_len:] preds = model.predict(X_test).squeeze() mean_absolute_error(preds, y_test) # plot and predict prices preds = test[target_col].values[:-window_len] * (preds + 1) preds = pd.Series(index=targets.index, data=preds) line_plot(targets, preds, 'actual', 'prediction', lw=3)
def get_train_data(path, normalize=True, num_cols=NUMERIC_COLUMNS):
    """
    Convenience function for extracting and optionally normalizing data.

    Args:
        path: str, path to the data file
        normalize: boolean, whether to normalize numeric columns
        num_cols: list-like, if normalize is True, list of columns to
            normalize

    Returns:
        train_data: DataFrame
        means: dict, containing means of normalized columns (None if
            normalize is False)
        stds: dict, containing stds of normalized columns (None if
            normalize is False)
    """
    data = prepare_data(path)
    # Without normalization there are no statistics to report.
    if not normalize:
        return data, None, None
    data, means, stds = normalize_multiple_columns(data, num_cols)
    return data, means, stds
obj = df.loc[id_] json_ = { "id": id_, "long": str(obj.lat), "lat": str(obj.lng), "text": obj.raw_text } cell.append(json_) result.append(cell) return json.dumps(result) # ----------------------------------------------------------------------------------------------------------------------- if __name__ == "__main__": df = prepare_data('data/miami1000.pkl') inverted_file = get_inverted_file(df) theta = 0.8 start_time = time.time() pairs = ppjoin(df, inverted_file, theta) print "Time elapsed:", time.time() - start_time print pairs[0] print 'Total: ', len(pairs) for pair in pairs: id1 = pair[0]["id"] id2 = pair[1]["id"] print jaccard_similarity(df.loc[id1].text, df.loc[id2].text) group_dict = group_objects(df, theta) start_time = time.time()
for id_ in pair: obj = df.loc[id_] json_ = { "id": id_, "long": str(obj.lat), "lat": str(obj.lng), "text": obj.raw_text } cell.append(json_) result.append(cell) return json.dumps(result) # ----------------------------------------------------------------------------------------------------------------------- if __name__ == "__main__": df = prepare_data('data/miami1000.pkl') inverted_file = get_inverted_file(df) theta = 0.8 start_time = time.time() pairs = ppjoin(df, inverted_file, theta) print "Time elapsed:", time.time() - start_time print pairs[0] print 'Total: ', len(pairs) for pair in pairs: id1 = pair[0]["id"] id2 = pair[1]["id"] print jaccard_similarity(df.loc[id1].text, df.loc[id2].text) group_dict = group_objects(df, theta) start_time = time.time()
def manhattan_distance(vector_1, vector_2):
    """Return the Manhattan (L1) distance between two vectors.

    The vectors are first aligned/cleaned by ``prepare_data``; the distance
    is the sum of absolute component differences.
    """
    vector_1, vector_2 = prepare_data(vector_1, vector_2)
    total = 0
    for a, b in zip(vector_1, vector_2):
        total += abs(a - b)
    return total