Пример #1
0
def train(power=6.46):
    """Build per-class word-frequency tables for 10 cross-validation folds.

    The pickled ``(x, y)`` pair is read from ``_Data/dataset.dat``, shuffled
    with ``np.random.permutation`` and cut into ten contiguous folds. For
    every fold the words of the remaining samples are pooled per label (each
    label is used as an index into a list of 9 groups) and turned into
    relative frequencies.

    Args:
        power: Exponent used for the ``"null"`` entry of every table, which
            is stored as ``<class word count> * 2 ** power``.

    Returns:
        A ``(test_sets, prob_lists)`` tuple. ``test_sets[i]`` is the
        ``(x_test, y_test)`` pair held out in fold ``i``. ``prob_lists[i]``
        is a list of 9 dicts mapping each word to its relative frequency,
        plus the keys ``"null"`` and ``"prior"``. ``"prior"`` is the class
        word count divided by the word count of the whole dataset.
    """
    dataset_file = os.path.join("_Data", "dataset.dat")
    gen_dataset(dataset_file)
    with open(dataset_file, "rb") as handle:
        x, y = pickle.load(handle)

    # Shuffle samples and labels with the same permutation.
    order = np.random.permutation(len(x))
    x = [x[idx] for idx in order]
    y = [y[idx] for idx in order]

    n_samples = len(x)
    fold_size = math.ceil(n_samples * 0.1)
    # Word count over the full dataset (held-out folds included).
    total_words = sum(len(sentence) for sentence in x)

    test_sets = []
    prob_lists = []
    for fold in range(10):
        start = fold * fold_size
        stop = start + fold_size
        # The last fold runs to the end of the data.
        test_stop = n_samples if fold == 9 else stop

        train_x = x[:start] + x[stop:]
        train_y = y[:start] + y[stop:]
        test_sets.append((x[start:test_stop], y[start:test_stop]))

        # Pool the words of every training sample under its label.
        grouped = [[] for _ in range(9)]
        for sentence, label in zip(train_x, train_y):
            grouped[label].extend(sentence)

        fold_tables = []
        for words in grouped:
            counts = Counter(words)
            n_words = sum(counts.values())
            table = {word: freq / n_words for word, freq in counts.items()}
            table["null"] = n_words * 2 ** power
            table["prior"] = n_words / total_words
            fold_tables.append(table)
        prob_lists.append(fold_tables)

    return test_sets, prob_lists
Пример #2
0
def main(clf):
    """Evaluate ``clf`` with 10-fold cross-validation on the pickled dataset.

    The dataset at ``_Data/dataset.dat`` is unpickled into ``(x, y)``; every
    entry of ``x`` is joined with spaces into a single string, the samples
    are shuffled, and ten contiguous folds are held out in turn. For each
    fold a ``CountVectorizer`` and a ``TfidfTransformer`` are fitted on the
    training part only and then applied to both the training and test parts.

    Args:
        clf: Classifier exposing ``fit(x, y)``, ``predict(x)`` and
            ``acc(y_true, y_pred)``.

    Returns:
        A ``(acc_lst, y_results)`` tuple: the per-fold values returned by
        ``clf.acc`` and the per-fold ``[y_test, y_pred]`` pairs.
    """
    dat_path = os.path.join("_Data", "dataset.dat")
    gen_dataset(dat_path)
    # NOTE(review): pickle.load can execute arbitrary code; presumably safe
    # here because the file is produced locally by gen_dataset -- confirm.
    with open(dat_path, "rb") as _file:
        x, y = pickle.load(_file)
    x = [" ".join(sentence) for sentence in x]
    # Shuffle samples and labels with the same permutation.
    _indices = np.random.permutation(len(x))
    x = list(np.array(x)[_indices])
    y = list(np.array(y)[_indices])
    data_len = len(x)
    batch_size = math.ceil(data_len * 0.1)
    _acc_lst, y_results = [], []
    bar = ProgressBar(max_value=10, name=str(clf))
    for i in range(10):
        # The last fold runs to the end of the data.
        _next = (i + 1) * batch_size if i != 9 else data_len
        x_train = x[:i * batch_size] + x[(i + 1) * batch_size:]
        y_train = y[:i * batch_size] + y[(i + 1) * batch_size:]
        x_test, y_test = x[i * batch_size:_next], y[i * batch_size:_next]
        count_vec = CountVectorizer()
        counts_train = count_vec.fit_transform(x_train)
        counts_test = count_vec.transform(x_test)
        tfidf_transformer = TfidfTransformer()
        x_train = tfidf_transformer.fit_transform(counts_train)
        # The held-out fold must receive the same TF-IDF weighting that was
        # fitted on the training fold; otherwise the classifier is trained
        # on TF-IDF features but predicts on raw term counts.
        x_test = tfidf_transformer.transform(counts_test)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        _acc_lst.append(clf.acc(y_test, y_pred))
        y_results.append([y_test, y_pred])
        # Drop the fold's matrices before the next fold allocates new ones.
        del x_train, y_train, x_test, y_test, y_pred, counts_train, counts_test
        bar.update()
    return _acc_lst, y_results
Пример #3
0
def train(power=6.46):
    """Compute word-frequency tables per class for each of 10 folds.

    Loads the pickled ``(x, y)`` pair from ``_Data/dataset.dat``, shuffles
    it, and for every one of ten contiguous folds pools the words of the
    non-held-out samples by label (labels index a list of 9 buckets).

    Args:
        power: Exponent for the ``"null"`` entry, stored as
            ``<bucket word count> * 2 ** power``.

    Returns:
        ``(test_sets, prob_lists)``. ``test_sets`` holds one
        ``(x_test, y_test)`` tuple per fold; ``prob_lists`` holds, per fold,
        a list of 9 dicts with word -> relative frequency plus the keys
        ``"null"`` and ``"prior"`` (bucket word count divided by the word
        count of the entire dataset).
    """
    path = os.path.join("_Data", "dataset.dat")
    gen_dataset(path)
    with open(path, "rb") as fh:
        x, y = pickle.load(fh)

    shuffled = np.random.permutation(len(x))
    x = [x[k] for k in shuffled]
    y = [y[k] for k in shuffled]

    n = len(x)
    step = math.ceil(n * 0.1)
    # Denominator of every prior: words in all samples, test folds included.
    n_tokens = sum(map(len, x))

    def build_table(words):
        # Relative frequencies of one bucket plus its "null"/"prior" entries.
        freq = Counter(words)
        size = sum(freq.values())
        table = {}
        for token, count in freq.items():
            table[token] = count / size
        table["null"] = size * 2 ** power
        table["prior"] = size / n_tokens
        return table

    test_sets, prob_lists = [], []
    for fold in range(10):
        lo, hi = fold * step, (fold + 1) * step
        # The final fold absorbs whatever remains of the data.
        held_out_end = hi if fold != 9 else n

        buckets = [[] for _ in range(9)]
        for sentence, label in zip(x[:lo] + x[hi:], y[:lo] + y[hi:]):
            buckets[label].extend(sentence)

        test_sets.append((x[lo:held_out_end], y[lo:held_out_end]))
        prob_lists.append([build_table(bucket) for bucket in buckets])
    return test_sets, prob_lists
Пример #4
0
def main(clf):
    """Run 10-fold cross-validation of ``clf`` on the pickled text dataset.

    ``_Data/dataset.dat`` is unpickled into ``(x, y)``. Each entry of ``x``
    is space-joined into one string, samples and labels are shuffled
    together, and ten contiguous folds are held out one after another. Per
    fold, a ``CountVectorizer`` and a ``TfidfTransformer`` are fitted on the
    training samples and applied to both training and held-out samples.

    Args:
        clf: Classifier providing ``fit(x, y)``, ``predict(x)`` and
            ``acc(y_true, y_pred)``.

    Returns:
        ``(acc_lst, y_results)``: one ``clf.acc`` value per fold and one
        ``[y_test, y_pred]`` pair per fold.
    """
    dat_path = os.path.join("_Data", "dataset.dat")
    gen_dataset(dat_path)
    # NOTE(review): pickle.load can execute arbitrary code; presumably safe
    # here because the file is produced locally by gen_dataset -- confirm.
    with open(dat_path, "rb") as _file:
        x, y = pickle.load(_file)
    x = [" ".join(sentence) for sentence in x]
    # One permutation is applied to both lists so pairs stay aligned.
    _indices = np.random.permutation(len(x))
    x = list(np.array(x)[_indices])
    y = list(np.array(y)[_indices])
    data_len = len(x)
    batch_size = math.ceil(data_len * 0.1)
    acc_lst, y_results = [], []
    bar = ProgressBar(max_value=10, name=str(clf))
    for i in range(10):
        # The last fold runs to the end of the data.
        _next = (i + 1) * batch_size if i != 9 else data_len
        x_train = x[:i * batch_size] + x[(i + 1) * batch_size:]
        y_train = y[:i * batch_size] + y[(i + 1) * batch_size:]
        x_test, y_test = x[i * batch_size:_next], y[i * batch_size:_next]
        count_vec = CountVectorizer()
        counts_train = count_vec.fit_transform(x_train)
        counts_test = count_vec.transform(x_test)
        tfidf_transformer = TfidfTransformer()
        x_train = tfidf_transformer.fit_transform(counts_train)
        # Apply the TF-IDF weighting learned on the training fold to the
        # held-out fold as well, so that fit and predict see features on the
        # same scale instead of TF-IDF vs. raw counts.
        x_test = tfidf_transformer.transform(counts_test)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        acc_lst.append(clf.acc(y_test, y_pred))
        y_results.append([y_test, y_pred])
        # Drop the fold's matrices before the next fold allocates new ones.
        del x_train, y_train, x_test, y_test, y_pred, counts_train, counts_test
        bar.update()
    return acc_lst, y_results