示例#1
0
def parse_vis():
    """Preprocess the raw ``vis`` training CSV and save the resulting splits.

    Pipeline, in order:

    1. read ``vis/train.csv`` (prefixed with ``Constant.RAW_DIR``) with no
       header row;
    2. drop the columns listed in ``Config("vis").IGN_COL``;
    3. split the frame with ``_split_train_validation_test``;
    4. pass the three frames to ``_split_ind_val_label``;
    5. hand the result to ``_save_splits`` for the ``vis`` dataset.

    The function takes no arguments and returns nothing; its effects are the
    console progress messages and whatever ``_save_splits`` writes.
    """
    input_file = "vis/train.csv"
    # Plain string concatenation: RAW_DIR presumably already ends with a path
    # separator -- TODO confirm against the Constant definition.
    input_path = Constant.RAW_DIR + input_file

    # The message previously named the "avazu" dataset, a leftover from the
    # sibling parser; this function only ever processes "vis".
    print("Preprocessing vis dataset ...")

    # load dataset
    print("\tLoading dataset ...")
    df = pd.read_csv(input_path, header=None)

    # fix missing value -- no missing value

    # drop columns
    df = df.drop(Config("vis").IGN_COL, axis=1)

    # normalizing features -- no numerical cols

    # split train, valid, and test
    print("\tSplitting Train, Valid, and Test Dataset ...")
    df_train, df_val, df_test = _split_train_validation_test(df)

    # split ind, val, and test
    print("\tSplitting Index, Value, and Labels ...")
    full_splits = _split_ind_val_label(dataset="vis",
                                       df_train=df_train,
                                       df_test=df_test,
                                       df_val=df_val)

    # persist everything returned by the previous step
    # (original note: 3X3 dataframes saved to the `parsed` folder)
    print("\tSaving all splited matrices ...")
    _save_splits(full_splits, dataset="vis")
示例#2
0
    def __init__(self, dataset, batch_size):
        """
        Load the train/test/validation splits of a dataset and set up the
        batching state.

        :param dataset: name of dataset; stored and passed to ``Config``
        :param batch_size: number of samples per batch; used to derive
            ``train_iter_count`` by floor division of the train size
        """

        # ==== params =====
        self.dataset = dataset
        self.cfg = Config(dataset=dataset)

        # ===== sizes =====
        self.batch_size = batch_size
        self.train_size, self.test_size, self.valid_size = 0, 0, 0
        self.val_size = 0

        # ===== inner variables =====
        self.batch_index = 0
        self.has_next = False

        # ===== datasets =====
        self.train_ind, self.train_label = self.load_data("train")
        self.test_ind, self.test_label = self.load_data("test")
        self.val_ind, self.val_label = self.load_data("val")

        self.train_size = self.train_label.shape[0]
        self.test_size = self.test_label.shape[0]
        self.val_size = self.val_label.shape[0]
        # `valid_size` used to be initialised to 0 and never updated, because
        # the real value was stored under `val_size`. Keep both names in sync
        # so readers of either attribute see the actual validation size.
        self.valid_size = self.val_size
        self.feature_size, self.field_size = self.load_statistics()

        # ===== iter count =====
        # floor division: a trailing partial batch is not counted
        self.train_iter_count = self.train_size // self.batch_size
示例#3
0
def _split_ind_val_label(dataset, df_train, df_test, df_val):
    """Parse the three data frames with one shared ``FeatureDictionary``.

    A single ``FeatureDictionary`` is built from all three frames and the
    ``Config`` of ``dataset``; each frame is then passed through its
    ``parse`` method (train first, then test, then validation).

    :param dataset: dataset name used to build the ``Config``
    :param df_train: training frame
    :param df_test: test frame
    :param df_val: validation frame
    :return: ``(train_parsed, val_parsed, test_parsed, feature_dictionary)``
        -- note that validation comes before test in the returned tuple
    """
    cfg = Config(dataset=dataset)
    feat_dict = FeatureDictionary(
        df_train=df_train,
        df_test=df_test,
        df_val=df_val,
        cfg=cfg,
    )

    # parse in the same order as before: train, test, validation
    parsed_train, parsed_test, parsed_val = [
        feat_dict.parse(df=frame) for frame in (df_train, df_test, df_val)
    ]

    return parsed_train, parsed_val, parsed_test, feat_dict
    def __init__(self, config: const.Config, device: torch.device) -> None:
        """Create the seq2seq model, its optimizer and loss, and load the training data.

        :param config: configuration object; ``batch_size``,
            ``max_sentence_length`` and ``epoch_number`` are read from it and
            its ``name`` attribute is overwritten with ``'train'``
        :param device: torch device forwarded to ``Seq2seq``
        """
        self.config, self.device = config, device

        # The model is constructed before config.name is changed below.
        self.seq2seq = Seq2seq(config, device=device, load_emb=True)

        self.optimizer = torch.optim.Adam(self.seq2seq.parameters())
        self.loss = nn.NLLLoss()

        # NOTE: this mutates the caller's config object.
        config.name = 'train'

        raw_data = prepare.load_data('train')
        processed_data = prepare.process(raw_data)
        self.data = data_prepare.Data(processed_data, config.batch_size, config)

        self.max_sentence_length = config.max_sentence_length
        # stored as the configured epoch count plus one
        self.epoch_number = config.epoch_number + 1
示例#5
0
from prepare_feature import Feature
import model
import numpy as np
import torch
from const import Config

# Module-level objects created once, when this module is imported:
# a default-constructed Config and a Feature built from it.
config = Config()
feature = Feature(config)


class Parser(object):
    def __init__(self, sentence):
        self.sentence = sentence
        self.stack = ["<ROOT>"]
        self.buffer = list(sentence)
        self.dep = []

    def parse_step(self, transition):
        if transition == "S" and len(self.buffer) > 0:
            #将b1从buffer中移除,添加到stack中
            word = self.buffer.pop(0)
            self.stack.append(word)
        if transition == "L":
            # arc对应的依赖关系(label)为l,添加一个dependency arc为s1->s2,,然后将s2从stack中移除
            head = self.stack[-1]
            dependent = self.stack.pop(-2)
            self.dep.append((head, dependent))
        if transition == 'R':
            # arc对应的依赖关系为l,添加一个dependency arc为s2->s1,然后将s1从stack中移除。
            head = self.stack[-2]
            dependent = self.stack.pop()