示例#1
0
def data_summary():
    """Print and save length statistics for the train/dev/test splits.

    Loads the three splits from ``data/tsv/cpu`` through ``DataProcessor``,
    measures every text by its number of whitespace-separated tokens, and
    then:

    * prints a summary (split sizes, total token count, max/min/median
      length) and writes the same text to ``output/summary.txt``;
    * saves the per-text token counts as an array to ``output/length.npy``.

    Raises:
        ValueError: If the three splits together contain no texts.
    """
    data_dir = "data/tsv/cpu"
    data_processor = DataProcessor(data_dir)

    # Load and measure everything before touching the output files, so that
    # a failure while reading the data cannot leave a truncated summary.
    train_texts = data_processor.get_examples("train")["text"]
    dev_texts = data_processor.get_examples("dev")["text"]
    test_texts = data_processor.get_examples("test")["text"]
    texts = train_texts + dev_texts + test_texts

    lengths = [len(text.split()) for text in texts]
    if not lengths:
        # np.max/np.min cannot reduce an empty sequence; fail with a message
        # that names the actual problem instead.
        raise ValueError(
            "no texts found under %r; cannot compute length statistics"
            % data_dir)
    length_array = np.array(lengths)

    num_train = len(train_texts)
    num_dev = len(dev_texts)
    num_test = len(test_texts)
    num_total = num_train + num_dev + num_test

    output = "total: %s\ntrain set: %s\ndev set:%s\ntest set:%s\n" \
             "number of tokens:%s\nmax len:%s\nmin len:%s\nmedian len:%s\n" % (
                 num_total, num_train, num_dev, num_test, sum(lengths),
                 np.max(length_array), np.min(length_array),
                 np.median(length_array))
    print(output)

    # Make sure the target directory exists before writing either file.
    os.makedirs("output", exist_ok=True)
    with open("output/summary.txt", "w") as fw:
        fw.write(output)
    np.save("output/length.npy", length_array)
示例#2
0
    def __init__(self, output_dir, data_dir, item):
        """Load saved predictions and test-set labels for ``item``.

        Both inputs are loaded on a best-effort basis: if either one cannot
        be loaded, the corresponding attribute falls back to an empty list
        (and the test examples to an empty text/label dict) instead of
        raising.

        Args:
            output_dir: Base directory; predictions are read from
                ``<output_dir>/<item>/predictions.npy``.
            data_dir: Base directory; ``<data_dir>/<item>`` is passed to
                ``DataProcessor`` to load the "test" split.
            item: Sub-directory name appended to both base directories.
        """
        data_dir = os.path.join(data_dir, item)
        output_dir = os.path.join(output_dir, item)

        # Catch ``Exception`` rather than using a bare ``except`` so the
        # best-effort fallback is kept while KeyboardInterrupt and
        # SystemExit still propagate.
        try:
            predictions = np.load(os.path.join(output_dir, "predictions.npy"))
            y_pred = np.array(predictions).astype(np.float32)
        except Exception:
            y_pred = []

        try:
            data_processor = DataProcessor(data_dir)
            test_examples = data_processor.get_examples("test")
            labels = test_examples["label"]
            y_true = np.array(labels).astype(np.int64)
        except Exception:
            y_true = []
            test_examples = {"text": [], "label": []}

        self.y_true = y_true
        self.y_preds = y_pred
        # NOTE(review): "test_eamples" looks like a typo for "test_examples";
        # the name is kept because code outside this method may read it.
        self.test_eamples = test_examples
        # Number of distinct label values; 0 when the labels failed to load.
        # NOTE(review): set() needs hashable elements, so this presumably
        # expects a 1-D label array - confirm against DataProcessor.
        self.num_classes = len(set(y_true))