def run():
    if tools.config()["preprocessing"]["use-cache"] is False:
        subprocess.call(f"rm -rf {pathForSets}* {pathForTestsets}* ", shell=True)

    dataLoaders = [
        ["emotions", lambda: tools.loadData(source_data + "emotions")],
        [
            "germeval", lambda: tools.loadGermeval2017(
                source_data + "germeval2017/set_v1.4.tsv")
        ],
        [
            "sb10k", lambda: tools.loadData(
                source_data + "SB10k/not-preprocessed/corpus_label_text.tsv",
                "\t")
        ],
        [
            "PotTS", lambda: tools.loadData(
                source_data + "PotTS/not-preprocessed/corpus_label_text.tsv",
                "\t")
        ],
        [
            "filmstarts", lambda: tools.loadFilmstarts(
                source_data + "filmstarts/filmstarts.tsv")
        ],
        [
            "scare", lambda: tools.loadScareSet(
                source_data + "scare_v1.0.0_data/reviews/")
        ],
        [
            "holidaycheck", lambda: tools.loadHolidaycheck(
                source_data + "holidaycheck/holidaycheck.clean.filtered.tsv")
        ],
        [
            "leipzig-mixed-typical-2011",
            lambda: tools.loadData(source_data + "leipzig/deu-mixed-labeled")
        ],
        [
            "leipzig-newscrawl-2017", lambda: tools.loadData(
                source_data + "leipzig/deu-newscrawl-2017-labeled")
        ],
        [
            "leipzig-deu-wikipedia-2016", lambda: tools.loadData(
                source_data + "leipzig/deu-wikipedia-2016-labeled")
        ]
    ]

    dataSets = []
    table = []
    dataSetsToLoad = tools.config()["datasets"]
    for dataSet in dataSetsToLoad:
        if dataSet["train"] is True or dataSet["test"] is True:
            # if this fails, the loader named in the config is not defined in the code
            loader = next(
                filter(lambda x: x[0] == dataSet["name"], dataLoaders))

            # split every set into its 3 classes
            meta_info = cleanAndSplit(*loader)

            if dataSet["train"] is True:
                dataSets.append(loader)
            table.append(list(dataSet.values()) + meta_info)

    headers = [
        "set name", "training", "test", "from cache", "positive", "neutral",
        "negative", "total"
    ]
    print(tabulate(table, headers, tablefmt="pipe", floatfmt=".4f"))

    trainSets = [
        dataset["name"] for dataset in dataSetsToLoad
        if dataset["train"] is True
    ]

    # combine the single datasets into one set per class
    neutralSamples = createSetForClass("neutral", trainSets)
    positiveSamples = createSetForClass("positive", trainSets)
    negativeSamples = createSetForClass("negative", trainSets)

    print("\nclass distribution in data set:")
    print("neutral \t{}\npositive\t{}\nnegative\t{}".format(
        neutralSamples, positiveSamples, negativeSamples))

    # balance classes
    if tools.config()["preprocessing"]["balance"] == "down":
        print("\nbalance classes with downsampling")
        samplesPerClass = min(neutralSamples, positiveSamples, negativeSamples)
        print("random samples per class: {}".format(samplesPerClass))
        print("total samples: {}".format(samplesPerClass * 3))

        # train / test split per class
        split(samplesPerClass, "neutral")
        split(samplesPerClass, "positive")
        split(samplesPerClass, "negative")
    else:
        split(neutralSamples, "neutral")
        split(positiveSamples, "positive")
        split(negativeSamples, "negative")
        print(f"random samples per class neutral: {neutralSamples}")
        print(f"random samples per class positive: {positiveSamples}")
        print(f"random samples per class negative: {negativeSamples}")
        print(
            f"total samples: {neutralSamples + positiveSamples + negativeSamples}"
        )

    # combine the per-class files into train / valid / test sets
    trainFile = path + "model.train"
    validFile = path + "model.valid"
    testFile = path + "model.test"

    executeToFile(f"cat {pathForSets}all.train.* | cut -f2,3",
                  trainFile,
                  shellMode=True)
    executeToFile(f"cat {pathForSets}all.valid.* | cut -f2,3",
                  validFile,
                  shellMode=True)
    executeToFile(f"cat {pathForSets}all.test.*", testFile, shellMode=True)

    totalTrain = tools.lineCount(trainFile)
    totalValid = tools.lineCount(validFile)
    totalTest = tools.lineCount(testFile)
    totalLines = float(totalTrain + totalValid + totalTest)

    print("\nsamples in:\ntrain\t{}\nvalid\t{}\ntest\t{}\nsum\t{}".format(
        totalTrain, totalValid, totalTest, totalLines))
    print("\npercentage in:\ntrain\t{}\nvalid\t{}\ntest\t{}".format(
        totalTrain / totalLines, totalValid / totalLines,
        totalTest / totalLines))

    # datasets that are only used for testing get their own extra test file
    test_sets = [
        dataset["name"] for dataset in dataSetsToLoad
        if dataset["train"] is False and dataset["test"] is True
    ]
    print(f"datasets just for testing: {test_sets}")

    if os.path.exists(testFile + ".extra"):
        os.remove(testFile + ".extra")

    for test_set in test_sets:
        executeToFile(f"cat {pathForSets}{test_set}.* ",
                      testFile + ".extra",
                      mode="a",
                      shellMode=True)

    executeToFile(f"cat {testFile} {testFile}.extra",
                  path + "model.test.full",
                  shellMode=True)

    # raw text of all classes, used to train word vectors
    executeToFile(
        f"cat {pathForSets}all.negative {pathForSets}all.neutral {pathForSets}all.positive | cut -f3 ",
        path + "wordvecc.train",
        shellMode=True)
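# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original source): the structure that run()
# appears to expect from tools.config(). The keys "use-cache", "balance",
# "datasets", "name", "train" and "test" are read directly in the code above;
# the concrete file format of the config and any further keys are assumptions
# made only for illustration.
EXAMPLE_CONFIG = {
    "preprocessing": {
        "use-cache": False,  # False wipes pathForSets / pathForTestsets first
        "balance": "down",   # "down" => downsample every class to the minimum
    },
    "datasets": [
        # each "name" must match a loader name in dataLoaders
        {"name": "germeval", "train": True, "test": True},
        {"name": "sb10k", "train": False, "test": True},
    ],
}
# ---------------------------------------------------------------------------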
# coding:utf-8
"""
Fourth submission for the Kaggle Titanic competition.
Uses a support vector machine (SVM).
"""
import numpy as np
import pandas as pd
import tools
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    # load the data
    train_data, test_data = tools.loadData()
    print(train_data.head())
    print(test_data.head())

    # print basic info about the data
    print(train_data.info())
    print(test_data.info())

    # explore the data
    # tools.exploreData(train_data)

    # data cleaning and feature extraction
    train_data, test_data = tools.cleanData(train_data, test_data)

    # add a Family feature, binned from the sum of SibSp and Parch
    train_data["Family"] = train_data["SibSp"] + train_data["Parch"]
    train_data.loc[(train_data.Family == 0), "Family"] = 0
    train_data.loc[((train_data.Family > 0) & (train_data.Family < 4)),
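# ---------------------------------------------------------------------------
# Hedged sketch (not in the original file): how the SVM announced in the
# docstring could be trained once the feature engineering above is complete.
# The feature list and the output of tools.cleanData are assumptions; only
# svm.SVC, train_test_split and pd.get_dummies follow from the imports above.
def train_svm(train_data, features):
    # one-hot encode categorical columns and separate the target
    X = pd.get_dummies(train_data[features])
    y = train_data["Survived"]
    # hold out 20% of the training data for validation
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=42)
    model = svm.SVC(kernel="rbf", C=1.0)
    model.fit(X_train, y_train)
    print("validation accuracy:", model.score(X_valid, y_valid))
    return model
# ---------------------------------------------------------------------------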
def loadKs(self, npz_filename):
    x = loadData(npz_filename)
    self.tuneX, self.tuneY, self.k0, self.k1, self.b1 = [
        x[fld] for fld in ('tuneX', 'tuneY', 'k0', 'k1', 'b1')
    ]
def loadCubes(self, npz_cubefile):
    self.cubes = dict(loadData(npz_cubefile))
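# ---------------------------------------------------------------------------
# Hedged sketch (not in the original source): loadKs and loadCubes only read
# named fields from whatever loadData returns. Assuming loadData is a thin
# wrapper around numpy.load on an .npz archive, compatible files could be
# written like this; the helper names and array contents are placeholders.
import numpy as np

def save_ks(npz_filename, tuneX, tuneY, k0, k1, b1):
    # field names must match those unpacked in loadKs
    np.savez(npz_filename, tuneX=tuneX, tuneY=tuneY, k0=k0, k1=k1, b1=b1)

def save_cubes(npz_cubefile, cubes):
    # one entry per cube; loadCubes rebuilds the mapping via dict(loadData(...))
    np.savez(npz_cubefile, **cubes)
# ---------------------------------------------------------------------------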
# coding:utf-8
# First submission for the Kaggle Titanic competition.
# Follows the tutorial at https://www.kaggle.com/alexisbcook/titanic-tutorial
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import tools

if __name__ == "__main__":
    # load the data
    train_data, test_data = tools.loadData()
    print(train_data.head())
    print(test_data.head())

    # compute survival rates for female and male passengers
    women = train_data.loc[train_data.Sex == 'female']["Survived"]
    rate_women = sum(women) / len(women)
    print("% of women who survived:", rate_women)

    men = train_data.loc[train_data.Sex == 'male']["Survived"]
    rate_men = sum(men) / len(men)
    print("% of men who survived:", rate_men)

    # predict with a random forest
    y = train_data["Survived"]
    features = ["Pclass", "Sex", "SibSp", "Parch"]
    X = pd.get_dummies(train_data[features])
    X_test = pd.get_dummies(test_data[features])

    model = RandomForestClassifier(n_estimators=100, max_depth=5,
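# ---------------------------------------------------------------------------
# Hedged sketch (not in the original file): the script above breaks off inside
# the RandomForestClassifier call. The tutorial it references finishes by
# fitting the model and writing a submission CSV; a sketch of that step,
# assuming the X, X_test, y and test_data variables defined above, is kept in
# comments so it does not attach to the truncated statement:
#
#     model = RandomForestClassifier(n_estimators=100, max_depth=5,
#                                    random_state=1)
#     model.fit(X, y)
#     predictions = model.predict(X_test)
#     output = pd.DataFrame({"PassengerId": test_data.PassengerId,
#                            "Survived": predictions})
#     output.to_csv("my_submission.csv", index=False)
# ---------------------------------------------------------------------------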
Z = np.array(sim_data_3D["Z"])
Z_electrode = np.array(sim_data_3D["Z_electrode"])
simulation_params = sim_data_3D["simulation_params"]
cell_params_ex = sim_data_3D["cell_params_ex"]
cell_params_in = sim_data_3D["cell_params_in"]

# Load simulation results of the 3D network
[summed_dipole, summed_EEG_top, tvec] = tools.loadResults(
    exp_id_3D, filename, simulation_params["population_sizes"], Z,
    simulation_params["tstop"], simulation_params["dt"],
    simulation_params["individual_EEG"])

# Load simulation results of the LIF network
if compute_local_proxy:
    AMPA_LIF = tools.loadData(
        exp_id_LIF, filename, '.AMPA' + "_sub_" + str(subnetwork))
    GABA_LIF = tools.loadData(
        exp_id_LIF, filename, '.GABA' + "_sub_" + str(subnetwork))
    LFP_LIF = tools.loadData(
        exp_id_LIF, filename, '.LFP' + "_sub_" + str(subnetwork))
else:
    AMPA_LIF = tools.loadData(exp_id_LIF, filename, '.AMPA')
    GABA_LIF = tools.loadData(exp_id_LIF, filename, '.GABA')
    LFP_LIF = tools.loadData(exp_id_LIF, filename, '.LFP')

times_LIF = tools.loadData(exp_id_LIF, filename, '.times')
dt_LIF = tools.loadData(exp_id_LIF, filename, '.dt')

# Startup time to analyze results at 100 ms
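# ---------------------------------------------------------------------------
# Hedged sketch (not in the original source): the comment above mentions a
# 100 ms startup time. One way to drop that transient before analysing the
# signals, assuming tvec and times_LIF are time axes in ms and that time is
# the last axis of the corresponding signal arrays:
startup_ms = 100.0

mask_3D = np.asarray(tvec) >= startup_ms
tvec_analysis = np.asarray(tvec)[mask_3D]
EEG_analysis = np.asarray(summed_EEG_top)[..., mask_3D]

mask_LIF = np.asarray(times_LIF) >= startup_ms
times_LIF_analysis = np.asarray(times_LIF)[mask_LIF]
LFP_LIF_analysis = np.asarray(LFP_LIF)[..., mask_LIF]
# ---------------------------------------------------------------------------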