Example #1
def run():
    if tools.config()["preprocessing"]["use-cache"] is False:
        subprocess.call(f"rm -rf {pathForSets}* {pathForTestsets}* ",
                        shell=True)

    dataLoaders = [
        ["emotions", lambda: tools.loadData(source_data + "emotions")],
        [
            "germeval", lambda: tools.loadGermeval2017(
                source_data + "germeval2017/set_v1.4.tsv")
        ],
        [
            "sb10k", lambda: tools.loadData(
                source_data + "SB10k/not-preprocessed/corpus_label_text.tsv",
                "\t")
        ],
        [
            "PotTS", lambda: tools.loadData(
                source_data + "PotTS/not-preprocessed/corpus_label_text.tsv",
                "\t")
        ],
        [
            "filmstarts", lambda: tools.loadFilmstarts(
                source_data + "filmstarts/filmstarts.tsv")
        ],
        [
            "scare", lambda: tools.loadScareSet(source_data +
                                                "scare_v1.0.0_data/reviews/")
        ],
        [
            "holidaycheck", lambda: tools.loadHolidaycheck(
                source_data + "holidaycheck/holidaycheck.clean.filtered.tsv")
        ],
        [
            "leipzig-mixed-typical-2011",
            lambda: tools.loadData(source_data + "leipzig/deu-mixed-labeled")
        ],
        [
            "leipzig-newscrawl-2017", lambda: tools.loadData(
                source_data + "leipzig/deu-newscrawl-2017-labeled")
        ],
        [
            "leipzig-deu-wikipedia-2016", lambda: tools.loadData(
                source_data + "leipzig/deu-wikipedia-2016-labeled")
        ]
    ]
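    # each entry above pairs a set name with a zero-argument loader (a lambda),
    # so a corpus is only read if it is actually selected in the config below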
    dataSets = []
    table = []
    dataSetsToLoad = tools.config()["datasets"]

    for dataSet in dataSetsToLoad:
        if dataSet["train"] is True or dataSet["test"] is True:
            # if this fails, the loader you defined in the config is not defined in the code
            loader = next(
                filter(lambda x: x[0] == dataSet["name"], dataLoaders))

            # split every set in its 3 classes
            meta_info = cleanAndSplit(*loader)

            if dataSet["train"] is True:
                dataSets.append(loader)

            table.append(list(dataSet.values()) + meta_info)

    headers = [
        "set name", "training", "test", "from cache", "positive", "neutral",
        "negative", "total"
    ]
    print(tabulate(table, headers, tablefmt="pipe", floatfmt=".4f"))

    trainSets = [
        dataset["name"] for dataset in dataSetsToLoad
        if dataset["train"] is True
    ]

    # combine single datasets into one set per class
    neutralSamples = createSetForClass("neutral", trainSets)
    positiveSamples = createSetForClass("positive", trainSets)
    negativeSamples = createSetForClass("negative", trainSets)

    print("\nclass distribution in data set:")
    print("neutral \t{}\npositive\t{}\nnegative\t{}".format(
        neutralSamples, positiveSamples, negativeSamples))

    # balance classes
    if tools.config()['preprocessing']['balance'] == 'down':
        print("\nbalance classes with downsampling")
        samplesPerClass = min(neutralSamples, positiveSamples, negativeSamples)
        print("random samples per class: {}".format(samplesPerClass))
        print("total samples: {}".format(samplesPerClass * 3))
        # train / test split per class
        split(samplesPerClass, "neutral")
        split(samplesPerClass, "positive")
        split(samplesPerClass, "negative")
    else:
        split(neutralSamples, "neutral")
        split(positiveSamples, "positive")
        split(negativeSamples, "negative")
        print(f"random samples per class neutral: {neutralSamples}")
        print(f"random samples per class positive: {positiveSamples}")
        print(f"random samples per class negative: {negativeSamples}")
        print(
            f"total samples: {neutralSamples + positiveSamples + negativeSamples}"
        )

    # combine classes to set
    trainFile = path + "model.train"
    validFile = path + "model.valid"
    testFile = path + "model.test"

    executeToFile(f"cat {pathForSets}all.train.* | cut -f2,3",
                  trainFile,
                  shellMode=True)
    executeToFile(f"cat {pathForSets}all.valid.* | cut -f2,3",
                  validFile,
                  shellMode=True)
    executeToFile(f"cat {pathForSets}all.test.*", testFile, shellMode=True)

    totalTrain = tools.lineCount(trainFile)
    totalValid = tools.lineCount(validFile)
    totalTest = tools.lineCount(testFile)
    totalLines = float(totalTrain + totalValid + totalTest)

    print("\nsamples in:\ntrain\t{}\nvalid\t{}\ntest\t{}\nsum\t{}".format(
        totalTrain, totalValid, totalTest, totalLines))

    print("\npercentage in:\ntrain\t{}\nvalid\t{}\ntest\t{}".format(
        totalTrain / totalLines, totalValid / totalLines,
        totalTest / totalLines))

    test_sets = [
        dataset["name"] for dataset in dataSetsToLoad
        if dataset["train"] is False and dataset["test"] is True
    ]
    print(f"datasets just for testing {test_sets}")
    if os.path.exists(testFile + ".extra"):
        os.remove(testFile + ".extra")

    for test_set in test_sets:
        executeToFile(f"cat {pathForSets}{test_set}.* ",
                      testFile + ".extra",
                      mode="a",
                      shellMode=True)

    executeToFile(f"cat {testFile} {testFile}.extra",
                  path + "model.test.full",
                  shellMode=True)

    executeToFile(
        f"cat {pathForSets}all.negative {pathForSets}all.neutral {pathForSets}all.positive | cut -f3 ",
        path + "wordvecc.train",
        shellMode=True)
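
The helpers executeToFile, cleanAndSplit, createSetForClass and split are not part of this excerpt. A minimal sketch of what executeToFile might look like, assuming it simply runs a command and redirects its stdout into the target file ("w" overwrites, "a" appends); this is an assumption, not the project's actual implementation:

import subprocess

def executeToFile(command, targetFile, mode="w", shellMode=False):
    # run the command and stream its stdout into targetFile (assumed behaviour)
    with open(targetFile, mode) as out:
        subprocess.run(command, shell=shellMode, stdout=out, check=True)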
Example #2
# coding:utf-8
"""
Fourth submission for the Kaggle Titanic competition,
using a support vector machine (SVM).
"""

import numpy as np
import pandas as pd
import tools
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    # load the data
    train_data, test_data = tools.loadData()
    print(train_data.head())
    print(test_data.head())

    # print dataset info
    print(train_data.info())
    print(test_data.info())
    # explore the data
    # tools.exploreData(train_data)
    # data cleaning and feature extraction
    train_data, test_data = tools.cleanData(train_data, test_data)
    # add a Family column, categorized by the sum of SibSp and Parch
    train_data["Family"] = train_data["SibSp"] + train_data["Parch"]
    train_data.loc[(train_data.Family == 0), "Family"] = 0
    train_data.loc[((train_data.Family > 0) & (train_data.Family < 4)),
                   "Family"] = 1  # assumed completion; the excerpt is truncated here
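
The excerpt above is cut off before the model itself; the docstring and the svm / train_test_split imports suggest an SVM fit on the cleaned features. A hedged sketch of how that step might look, where the feature list and hyperparameters are assumptions and tools.cleanData is assumed to have already encoded categorical columns numerically:

# hypothetical continuation (assumed features and hyperparameters)
X = train_data[["Pclass", "Sex", "Age", "Fare", "Family"]]
y = train_data["Survived"]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

clf = svm.SVC(kernel="rbf", C=1.0)
clf.fit(X_train, y_train)
print("validation accuracy:", clf.score(X_valid, y_valid))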
Example #3
def loadKs(self, npz_filename):
    # unpack the stored arrays from the npz archive into attributes
    x = loadData(npz_filename)
    self.tuneX, self.tuneY, self.k0, self.k1, self.b1 = [
        x[fld] for fld in ('tuneX', 'tuneY', 'k0', 'k1', 'b1')
    ]
Example #4
def loadCubes(self, npz_cubefile):
    # load every array in the npz archive into a name -> array dict
    self.cubes = dict(loadData(npz_cubefile))
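
In Examples #3 and #4, loadData evidently returns an npz-style mapping: it is indexed by field name in one and converted to a dict in the other. A minimal sketch of such a loader, assuming it is just a thin wrapper around numpy.load:

import numpy as np

def loadData(npz_filename):
    # np.load on a .npz archive returns a dict-like NpzFile, which supports both
    # x["field"] access and dict(...) conversion (assumed wrapper, for illustration)
    return np.load(npz_filename)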
Example #5
# coding:utf-8
# First submission for the Kaggle Titanic competition
# Following the tutorial at https://www.kaggle.com/alexisbcook/titanic-tutorial

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import tools

if __name__ == "__main__":
    # load the data
    train_data, test_data = tools.loadData()
    print(train_data.head())
    print(test_data.head())

    # compute the survival rate of female and male passengers
    women = train_data.loc[train_data.Sex == 'female']["Survived"]
    rate_women = sum(women) / len(women)
    print("% of women who survived:", rate_women)
    men = train_data.loc[train_data.Sex == 'male']["Survived"]
    rate_men = sum(men) / len(men)
    print("% of men who survived:", rate_men)

    # predict with a random forest
    y = train_data["Survived"]
    features = ["Pclass", "Sex", "SibSp", "Parch"]
    X = pd.get_dummies(train_data[features])
    X_test = pd.get_dummies(test_data[features])

    model = RandomForestClassifier(n_estimators=100,
                                   max_depth=5,
                                   random_state=1)  # assumed completion per the linked tutorial; excerpt truncated here
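
The excerpt ends inside the classifier constructor. The tutorial linked above continues by fitting the model and writing a submission file, roughly as follows (a sketch following that tutorial, not part of the original excerpt):

model.fit(X, y)
predictions = model.predict(X_test)

# write the predictions in the submission format expected by Kaggle
output = pd.DataFrame({"PassengerId": test_data.PassengerId, "Survived": predictions})
output.to_csv("my_submission.csv", index=False)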
Example #6
                Z = np.array(sim_data_3D["Z"])
                Z_electrode = np.array(sim_data_3D["Z_electrode"])
                simulation_params = sim_data_3D["simulation_params"]
                cell_params_ex = sim_data_3D["cell_params_ex"]
                cell_params_in = sim_data_3D["cell_params_in"]

                # Load simulation results of the 3D network
                [summed_dipole, summed_EEG_top, tvec] = tools.loadResults(
                    exp_id_3D, filename, simulation_params["population_sizes"],
                    Z, simulation_params["tstop"], simulation_params["dt"],
                    simulation_params["individual_EEG"])

                # Load simulation results of the LIF network
                if compute_local_proxy:
                    AMPA_LIF = tools.loadData(
                        exp_id_LIF, filename,
                        '.AMPA' + "_sub_" + str(subnetwork))
                    GABA_LIF = tools.loadData(
                        exp_id_LIF, filename,
                        '.GABA' + "_sub_" + str(subnetwork))
                    LFP_LIF = tools.loadData(
                        exp_id_LIF, filename,
                        '.LFP' + "_sub_" + str(subnetwork))
                else:
                    AMPA_LIF = tools.loadData(exp_id_LIF, filename, '.AMPA')
                    GABA_LIF = tools.loadData(exp_id_LIF, filename, '.GABA')
                    LFP_LIF = tools.loadData(exp_id_LIF, filename, '.LFP')
                times_LIF = tools.loadData(exp_id_LIF, filename, '.times')
                dt_LIF = tools.loadData(exp_id_LIF, filename, '.dt')

                # Startup time: analyze results from 100 ms onward
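
The excerpt stops at the startup comment. A speculative sketch of how such a cutoff might be applied, assuming dt_LIF is the LIF time step in ms, times_LIF the matching time vector, and time the last axis of the signal arrays:

# hypothetical continuation: drop everything before the 100 ms startup transient
startup_idx = int(100.0 / dt_LIF)
times_LIF = times_LIF[startup_idx:]
AMPA_LIF = AMPA_LIF[..., startup_idx:]
GABA_LIF = GABA_LIF[..., startup_idx:]
LFP_LIF = LFP_LIF[..., startup_idx:]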