def merge_synonym():
    synsets = load_csv('./data/synsets/ANEW_synsets.csv')
    syn_clusters = []
    for i, synset_i in enumerate(synsets):
        for synset_j in synsets[i + 1:]:
            if synset_i[0] in synset_j or synset_j[0] in synset_i:
                syn_cluster = (synset_i[0], synset_j[0])
                syn_clusters.append(syn_cluster)
    # zz = set()
    # for ll in syn_clusters:
    #     zz = zz.union(set(ll))
    # print(len(zz))
    # exit()
    outs = []
    for a, b in syn_clusters:
        # if neither a nor b has appeared in any cluster yet
        if all(len(set([a, b]) & set(l)) == 0 for l in outs):
            # start a new cluster
            out = [a, b]
            outs.append(out)
        # otherwise
        else:
            # merge the pair into the first cluster it overlaps
            for i, k in enumerate(outs):
                if set([a, b]) & set(k) != set():
                    outs[i] = list(set(outs[i] + [a, b]))
                    break
    # leng = 0
    # for i, j in enumerate(outs):
    #     leng += len(j)
    #     print('| cluster_%s | %s |' % (str(i), str(j)))
    # print(leng)
    return outs
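# A minimal sketch (not part of the original module) of how the pairwise merging
# in merge_synonym() behaves: pairs that share a word collapse into one cluster,
# disjoint pairs stay separate. The pair list below is hypothetical and only
# illustrates the merging step, not the real ANEW_synsets.csv contents.
def _merge_pairs_demo():
    syn_clusters = [('happy', 'glad'), ('glad', 'joyful'), ('sad', 'gloomy')]
    outs = []
    for a, b in syn_clusters:
        if all(len(set([a, b]) & set(l)) == 0 for l in outs):
            outs.append([a, b])
        else:
            for i, k in enumerate(outs):
                if set([a, b]) & set(k):
                    outs[i] = list(set(outs[i] + [a, b]))
                    break
    # expected: one cluster containing happy/glad/joyful and one containing sad/gloomy
    return outs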
def replacer(word=None):
    syn_map = dict()
    synsets = load_csv('./data/synsets/ANEW_synsets.csv')
    for synset in synsets:
        if len(synset) > 1:
            for w in synset[1:]:
                syn_map[w] = synset[0]
    # if word in syn_map.keys():
    #     return syn_map[word]
    return syn_map
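# Hedged usage sketch: assuming each row of ANEW_synsets.csv is a head word
# followed by its synonyms (e.g. ['joy', 'delight', 'gusto']), the dict returned
# by replacer() maps every synonym back to its head word. The example row and
# the lookup word below are hypothetical.
# syn_map = replacer()
# syn_map.get('delight', 'delight')  # -> 'joy' if 'delight' appears in some synset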
def dialog_load_network():
    dir_name = 'network_data'
    if not path.isdir(dir_name):
        dir_name = getcwd()
    f_name = QFileDialog.getOpenFileName(None, 'Load Electric Network',
                                         directory=dir_name,
                                         filter="Network files *.csv")
    try:
        assert os.path.exists(f_name)
    except AssertionError:
        sys.exit(' *** No file selected *** ')
    return load_csv(str(f_name))
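# Note (assumption about the Qt binding in use): under PyQt5,
# QFileDialog.getOpenFileName returns a (filename, selected_filter) tuple rather
# than a plain string, so the call above would then need unpacking, e.g.:
# f_name, _ = QFileDialog.getOpenFileName(None, 'Load Electric Network',
#                                         directory=dir_name,
#                                         filter="Network files *.csv")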
def sonar_run():
    seed(2)
    filePath = '../data/sonar.csv'
    dataset = load_data.load_csv(filePath, True)
    # convert string attributes to floats
    for i in range(0, len(dataset[0]) - 1):
        load_data.str_column_to_float(dataset, i)
    # convert class column to integers
    load_data.str_column_to_int(dataset, len(dataset[0]) - 1)
    # evaluate algorithm
    n_folds = 5
    max_depth = 10
    min_size = 1
    sample_size = 1.0
    n_features = int(sqrt(len(dataset[0]) - 1))
    for n_trees in [1, 5, 10]:
        scores = evaluate_split.evaluate_algorithm(dataset, randomforest.random_forest,
                                                   n_folds, max_depth, min_size,
                                                   sample_size, n_trees, n_features)
        print('Trees: %d' % n_trees)
        print('Scores: %s' % scores)
        print('Mean Accuracy: %.3f%%' % (sum(scores) / float(len(scores))))
Created on Thu May 3 11:51:18 2018
SVR regression on the UCI forest fires dataset
@author: shifuddin
"""
from load_data import load_csv
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from math import sqrt

'''
Load feature values as X and target as Y
here we read the forest fires dataset
'''
uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv'
X, y = load_csv(uri, ',', 4, 12, 12, 13, True)

'''
Split into training and test set
'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

'''
Feature scaling
'''
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
sc_y = StandardScaler()
Created on Sun Apr 15 22:01:23 2018
knn with Concrete Slump dataset from uci
@author: shifuddin
"""
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn.metrics import mean_squared_error
from math import sqrt
from load_data import load_csv

'''
Load feature values as X and target as Y
here we read the concrete slump dataset
'''
uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/slump/slump_test.data'
X, y = load_csv(uri, ',', 1, 8, 8, 11, True)

'''
Split into training and test set
'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

knn_regressor = neighbors.KNeighborsRegressor(algorithm='auto', n_neighbors=30, weights='uniform')
knn_regressor.fit(X_train, y_train)
y_pred = knn_regressor.predict(X_test)
Created on Thu May 3 11:51:18 2018
random forest regression on the UCI appliances energy dataset
@author: shifuddin
"""
from load_data import load_csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

'''
Load feature values as X and target as Y
here we read the appliances energy dataset
'''
uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv'
X, y = load_csv(uri, ',', 1, 27, 27, 28, True)

'''
Split into training and test set
'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

'''
Fit RandomForestRegressor with the energy data
'''
regressor = RandomForestRegressor(n_estimators=10, random_state=0)
regressor.fit(X_train, y_train)

'''
Predicting result
'''
from tensorflow.keras.callbacks import (TensorBoard, ModelCheckpoint, EarlyStopping)
from load_data import load_csv, train_valid_test_datasets, show_batch
from features import (PackNumericFeatures, categorical2onehot, categorical2embedding, normalization)
from utils import get_unique
from train_model import get_dense_two_layer_net

# load data and create Dataset obj
train_fileName = '../inputs/train.csv'
test_fileName = '../inputs/test.csv'
batch_size = 128  # 32
train_data, test_data = load_csv(train_fileName, test_fileName)
train_data.pop("id")
test_data_id = test_data.pop("id")
train_dataset, valid_dataset, test_dataset = train_valid_test_datasets(
    train_data, test_data, valid_size=0.2, batch_size=batch_size, test_shuffle=False)
train_size = int(train_data.shape[0] * 0.8)
valid_size = int(train_data.shape[0] * 0.2)
print(train_data.shape, test_data.shape)
print(train_dataset.element_spec)

numeric_features = ['month', 'day']
train_dataset = train_dataset.map(PackNumericFeatures(numeric_features))
num_classes = 30
seq_len = 4500 if is_dna_data else 1500
model_name = 'blstm_openset'
data_dir = '/mnt/data/computervision/train80_val10_test10'
if is_dna_data:
    model_name = 'blstm_dna_conv3_4500'
    data_dir = '/mnt/data/computervision/dna_train80_val10_test10'
model_file = '../models/' + model_name + '.h5'
model = load_model(model_file)
av_model = Model(inputs=model.input, outputs=model.get_layer("AV").output)
av_model.summary()

train_data = load_csv(data_dir + '/train.csv')
batch_size = 10000
avs = []
actual = []
lower = 0
# run the training set through the activation-vector layer in batches
while lower < len(train_data):
    print(lower)
    upper = min(lower + batch_size, len(train_data))
    x, y = get_onehot(train_data[lower:upper], None, is_dna_data=is_dna_data, seq_len=seq_len)
    pred = av_model.predict(x, batch_size=500)
    avs.append(pred)
    actual.append(y)
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 21 23:19:47 2018
@author: shifuddin
"""
from load_data import load_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

'''
Load X, y from uri
'''
uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
X, y = load_csv(uri, ',', 1, 5, 9, 10)

'''
Split into training and test set
'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

'''
Perform logistic regression
'''
""" Created on Thu May 3 11:51:18 2018 decission tree with banknote @author: shifuddin """ from load_data import load_csv from sklearn.model_selection import train_test_split from sklearn.naive_bayes import GaussianNB from sklearn.metrics import confusion_matrix ''' Load X, y from uri ''' uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECTF.test' X, y = load_csv(uri,',', 1, 45, 0, 1, True) ''' Split into training and test set ''' X_train, X_test, y_train, y_test =train_test_split(X, y,test_size=0.2, random_state=1) ''' Feature scaling ''' from sklearn.preprocessing import StandardScaler sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) '''
import numpy as np
import normalizer as norm
from load_data import load_csv

base_path = '/home/daniel/Documentos/Projetos/TCC/Normalizador/tests'  # Replace with a path relative to the project.
raw_data = load_csv(f'{base_path}/raw_data.csv')
normalized = norm.normalizer(raw_data)
np.savetxt(f'{base_path}/normalized.csv', normalized, fmt='%.8f')
print(normalized)
save_stats = True
num_classes = 100
mask = True
mask_len = 113
model_template = dna_mask_blstm
num_letters = 4 if is_dna_data else 26
model = model_template(num_classes, num_letters, sequence_length, embed_size=256,
                       mask_length=mask_len if mask else None)
model.load_weights(model_file)
model.summary()

test_data = load_csv(data_dir + '/test.csv', divide=2 if is_dna_data else 1)
print(len(test_data))
# count how many sequences are longer than the model's input length
crop_count = 0.0
for seq, y in test_data:
    if len(seq) > sequence_length:
        crop_count += 1
print("percent cropped: ", crop_count / len(test_data))
test_x, test_y, test_m = get_onehot(test_data, None, is_dna_data=is_dna_data,
                                    seq_len=sequence_length, num_classes=num_classes,
                                    rand_start=random_crop, mask_len=mask_len if mask else None)
if print_acc:
    print("test accuracy: ", model.evaluate([test_x, test_m] if mask else test_x, test_y, batch_size=100))
if save_stats:
    pred = model.predict([test_x, test_m] if mask else test_x, batch_size=100).argmax(axis=-1)
    log = Logger(model_name, num_classes, sequence_length)
import numpy as np
import paraconsistent
from load_data import load_csv

base_path = 'C:/Users/guermandi/Desktop/TCC/AnaliseParaconsistente/tests'
pathological = load_csv(f'{base_path}/patologicos-normalizados.csv')
healthy = load_csv(f'{base_path}/saudaveis-normalizados.csv')
pathological = np.delete(pathological, (0, 1, 2, 4, 5, 6), 1)
healthy = np.delete(healthy, (0, 1, 2, 4, 5, 6), 1)

classes = np.array([np.array(pathological), np.array(healthy)])
alpha = paraconsistent.alpha(classes)
beta = paraconsistent.beta(classes)
assurance = paraconsistent.assurance(alpha, beta)
contradiction = paraconsistent.contradiction(alpha, beta)
truth = paraconsistent.truth(assurance, contradiction)

# Data classes
classes = np.array([np.array(pathological), np.array(healthy)])
# Alpha and Beta
alpha = paraconsistent.alpha(classes)
beta = paraconsistent.beta(classes)
# Point G1
assurance = paraconsistent.assurance(alpha, beta)
    return ''.join(l)

model = load_model(model_file)
model.summary()
results = []
for percent in range(2, 22, 2):
    # mode 0: substitute, mode 1: 3-aligned cut, mode 2: unaligned cut
    row = [percent]
    for mode in range(3):
        test_data = load_csv(data_dir + '/test.csv', divide=2)
        print(len(test_data))
        for i in range(len(test_data)):
            (x, y) = test_data[i]
            if mode == 0:
                test_data[i] = (substitute(x, percent), y)
            else:
                test_data[i] = (delete_segment(x, percent, mode == 1), y)
            # if i % 100000 == 99999:
            #     print(i + 1)
        test_x, test_y, test_m = get_onehot(test_data, None, is_dna_data=True,
                                            seq_len=sequence_length, num_classes=num_classes,
                                            mask_len=mask_len)
        acc = model.evaluate([test_x, test_m], test_y, batch_size=100, verbose=1)[1]
        print(percent, mode, acc)
from ml_logging import Logger

num_classes = 30
num_amino_acids = 26
model = Sequential()
model.add(Masking(mask_value=0, input_shape=(1500, num_amino_acids)))
model.add(LSTM(50, activation='tanh'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer=Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

data_dir = '/mnt/data/computervision/train80_val10_test10'
train_data = load_csv(data_dir + '/train.csv')
print(len(train_data))
val_data = load_csv(data_dir + '/validation.csv')
val_x, val_y = get_onehot(val_data, None)
print(len(val_data))
logger = Logger('lstm50')
save_path = '../models/lstm50.h5'
num_episodes = 20000
for i in range(num_episodes):
    x, y = get_onehot(train_data, 1000)
    print(i)
    print(model.train_on_batch(x, y))
    if (i % 1000 == 0) or i == num_episodes - 1:
           [5.332441248, 2.088626775, 1],
           [6.922596716, 1.77106367, 1],
           [8.675418651, -0.242068655, 1],
           [7.673756466, 3.508563011, 1]]
n_inputs = len(dataset[0]) - 1
n_outputs = len(set([row[-1] for row in dataset]))
network = initialize_network(n_inputs, 2, n_outputs)
train_network(network, dataset, 0.5, 20, n_outputs)
for layer in network:
    print(layer)
for row in dataset:
    prediction = predict(network, row)
    print('Expected=%d, Got=%d' % (row[-1], prediction))

filename = 'seeds_dataset.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0]) - 1):
    print(dataset[i])
    str_column_to_float(dataset, i)
str_column_to_int(dataset, len(dataset[0]) - 1)
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)
n_folds = 5
l_rate = 0.3
n_epoch = 50
n_hidden = 5
SVR regression on the UCI fertility diagnosis dataset
@author: shifuddin
"""
from load_data import load_csv
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from math import sqrt
import pandas as pd

'''
Load feature values as X and target as Y
here we read the fertility diagnosis dataset
'''
uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00244/fertility_Diagnosis.txt'
X, y = load_csv(uri, ',', 0, 9, 9, 10, True)
y = pd.get_dummies(y.ravel(), drop_first=True)

'''
Split into training and test set
'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

'''
Feature scaling
'''
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
    data = data.asfreq(step, method='bfill')
    data = data.reset_index()
    return data


def train_test_split(data, train_ratio, method=0):
    train = []
    test = []
    if method == 0:
        n_rows = int(train_ratio * len(data))
        train.append(data.iloc[:n_rows, :])
        test.append(data.iloc[n_rows:, :])
    return train, test


if __name__ == "__main__":
    names_dict = {"Date": "Local time"}
    data = load_csv(csv_name="EURCAD_Ticks_05.12.2017-05.12.2017.csv", names_dict=names_dict)
    print(data["Date"].head())
    sys.exit(0)
    print(data.shape)
    data = select_data(dataframe=data, start="2017/05/13", stop="2017/05/20")
    print(data.shape)
    print(data["Date"].head())
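# Hedged usage sketch of the chronological split above (method=0): with
# train_ratio=0.8 the first 80% of rows go to the training frame and the rest to
# the test frame; the DataFrame below is hypothetical.
# import pandas as pd
# df = pd.DataFrame({"Date": pd.date_range("2017-05-13", periods=10), "price": range(10)})
# train, test = train_test_split(df, 0.8)
# len(train[0]), len(test[0])  # -> (8, 2)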
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 12 13:23:12 2018
@author: shifuddin
"""
from sklearn.neural_network import MLPClassifier
from load_data import load_csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

'''
Load X, y from uri
'''
uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
X, y = load_csv(uri, ',', 1, 5, 9, 10, True)

'''
Split into training and test set
'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

'''
Feature scaling
'''
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

'''
input_file = '/mnt/data/computervision/dna_100class_train80_val10_test10/test.csv'
display_classes = 10
n = 100
is_dna_data = True
seq_len = 4500
mask_len = 113
model_file = '../models/' + model_name + '.h5'
model = load_model(model_file)
embed_model = Model(inputs=model.input, outputs=model.get_layer("lstm_2").output)
embed_model.summary()

counts = np.zeros(display_classes, dtype=np.int8)
data = load_csv(input_file, divide=1)
chosen_data = []
for (x, y) in data:
    if y < display_classes and counts[y] < n:
        counts[y] = counts[y] + 1
        chosen_data.append((x, y))
x, y, m = get_onehot(chosen_data, None, is_dna_data=is_dna_data, seq_len=seq_len, mask_len=mask_len)
embed = embed_model.predict([x, m], batch_size=100, verbose=1)
tsne = TSNE(n_components=2, random_state=0)
""" Created on Thu May 3 11:51:18 2018 decission tree with banknote @author: shifuddin """ from load_data import load_csv from sklearn.model_selection import train_test_split from sklearn.naive_bayes import GaussianNB from sklearn.metrics import confusion_matrix ''' Load X, y from uri ''' uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt' X, y = load_csv(uri,',', 0,4, 4,5, True) ''' Split into training and test set ''' X_train, X_test, y_train, y_test =train_test_split(X, y,test_size=0.2, random_state=1) ''' Feature scaling ''' from sklearn.preprocessing import StandardScaler sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) '''
Created on Thu May 3 11:51:18 2018
decision tree regression on the UCI white wine quality dataset
@author: shifuddin
"""
from load_data import load_csv
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

'''
Load feature values as X and target as Y
here we read the white wine quality dataset
'''
uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
X, y = load_csv(uri, ';', 0, 11, 11, 12, True)

'''
Split into training and test set
'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

'''
Fit DecisionTreeRegressor with the wine quality data
'''
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)

'''
Predicting result
'''
def replacer():
    outs = merge_synonym()
    syn_map = load_csv('./data/synsets/ANEW_synsets.csv')
    replace_map = build_syn_map(syn_map, outs)
    print(replace_map['gusto'])
    return replace_map
seq_len = 4500
# data_file = '/mnt/data/computervision/dna_train80_val10_test10/test.csv'
data_file = '../results/dna_unknown_100class_pairs.csv'  # keep this 1000class so every model uses the same data
mask = False
mask_len = 113
model_file = '../models/' + model_name + '.h5'
model = load_model(model_file)
embed_model = Model(inputs=model.input, outputs=model.get_layer("lstm_2").output)
embed_model.summary()

# collect one pair of distinct sequences per class
single_dict = dict()
pair_dict = dict()
data = load_csv(data_file)
for (x, y) in data:
    if y in pair_dict:
        continue
    if y in single_dict:
        assert x != single_dict[y]
        pair_dict[y] = [single_dict[y], x]
    else:
        single_dict[y] = x
    if len(pair_dict) == num_classes:
        break
chosen_data = []
for i in range(2):
    for y in pair_dict:
        x = pair_dict[y][i]
num_classes = 30
model_name = 'blstm_dna_conv3_4500'
data_file = '/mnt/data/computervision/dna_train80_val10_test10/test.csv'
# data_file = '/mnt/data/computervision/dna_train80_val10_test10/unknowns.csv'
data_divide = 4
dist_min = 0
dist_max = 20
model_file = '../models/' + model_name + '.h5'
model = load_model(model_file)
av_model = Model(inputs=model.input, outputs=model.get_layer("AV").output)
av_model.summary()

data = load_csv(data_file, divide=data_divide)
print(len(data))
x, y = get_onehot(data, None, is_dna_data=is_dna_data, seq_len=4500 if is_dna_data else 1500)
avs = av_model.predict(x, batch_size=500)
print('done getting avs')
del data, x, y

# load the stored mean activation vectors
means = []
with open('../results/' + model_name + '_mean_activations.csv', 'r') as infile:
    r = csv.reader(infile)
    for row in r:
        means.append(np.array(row, dtype=np.float32))
Created on Thu May 3 11:51:18 2018
decision tree regression on the UCI CASP dataset
@author: shifuddin
"""
from load_data import load_csv
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

'''
Load feature values as X and target as Y
here we read the CASP protein structure dataset
'''
uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00265/CASP.csv'
X, y = load_csv(uri, ',', 1, 10, 0, 1, True)

'''
Split into training and test set
'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

'''
Fit DecisionTreeRegressor with the CASP data
'''
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)

'''
Predicting result
# %%
from load_data import load_csv
from analyze_data import show_beta, show_gamma
from sir_model import long_time_later
from graph_plot import compare_graph

# %% load data
directory = "C:\\Users\\HasunSong\\PycharmProjects\\virus\\covid19_korea.csv"
data = load_csv(directory=directory)
header = data[0]  # columns: date, in treatment, cumulative confirmed, cumulative released, cumulative deaths, confirmed, released, deaths

# %% guess beta and gamma
show_beta(data)
show_gamma(data)
# as of 03.26.

# %% run the model
BETA = 3e-10
GAMMA = 0.05
TOT_POP = 50000000
DAYS = 3000
rec = long_time_later([TOT_POP, 10000, 0], DAYS, beta=BETA, gamma=GAMMA)

# %% compare with the actual data
compare_graph(data, rec, BETA, GAMMA)

# %% sweep over several parameter values
beta_list = [3e-10, 5e-10, 1e-9, 2.5e-9]
gamma_list = [0.01, 0.02, 0.03, 0.05]
for bt in beta_list:
    for gm in gamma_list:
        rec = long_time_later([TOT_POP, 10000, 0], DAYS, beta=bt, gamma=gm)
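# Minimal sketch (an assumption, not the project's sir_model module) of the kind
# of discrete SIR update that long_time_later presumably iterates day by day:
# beta scales the S*I infection term and gamma the recovery rate, which is what
# the beta/gamma sweep above is probing.
def sir_step(s, i, r, beta, gamma):
    new_infections = beta * s * i
    new_recoveries = gamma * i
    return s - new_infections, i + new_infections - new_recoveries, r + new_recoveries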
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 22 21:26:57 2018
K-means clustering on the UCI SPECT dataset
@author: shifuddin
"""
from load_data import load_csv
from sklearn.cluster import KMeans
from sklearn.metrics import homogeneity_score
import pandas as pd

'''
Load X, y from uri
'''
uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECT.test'
X, y = load_csv(uri, ',', 1, 24, 0, 1, True)

'''
Fitting K-Means to the dataset
'''
kmeans = KMeans(n_clusters=10, init='k-means++', random_state=42, max_iter=1000)
y_kmeans = kmeans.fit_predict(X)
cluster_centers = kmeans.cluster_centers_
labels = kmeans.labels_
homo_score = homogeneity_score(y.ravel(), y_kmeans)
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 21 23:06:32 2018
uci banknote authentication dataset
@author: shifuddin
"""
from load_data import load_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

'''
Load X, y from uri
'''
uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt'
X, y = load_csv(uri, ',', 0, 4, 4, 5)

'''
Split into training and test set
'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

'''
Perform logistic regression
'''