def parse_vis():
    """Preprocess the `vis` dataset: load, clean, split, and persist.

    Reads ``vis/train.csv`` from the raw-data directory, drops the columns
    listed in ``Config("vis").IGN_COL``, splits into train/valid/test,
    converts each split into (index, value, label) matrices, and saves all
    resulting matrices to the ``parsed`` folder.
    """
    input_file = "vis/train.csv"
    input_dir = Constant.RAW_DIR + input_file

    # BUG FIX: the original message said "avazu", but this routine
    # preprocesses the "vis" dataset.
    print("Preprocessing vis dataset ...")

    # load dataset
    print("\tLoading dataset ...")
    df = pd.read_csv(input_dir, header=None)

    # fix missing value -- no missing value

    # drop columns
    df = df.drop(Config("vis").IGN_COL, axis=1)

    # normalizing features -- no numerical cols

    # split train, valid, and test
    print("\tSplitting Train, Valid, and Test Dataset ...")
    df_train, df_val, df_test = _split_train_validation_test(df)

    # split ind, val, and test
    print("\tSplitting Index, Value, and Labels ...")
    full_splits = _split_ind_val_label(
        dataset="vis", df_train=df_train, df_test=df_test, df_val=df_val)

    # save 3X3 dataframes to `parsed` folder
    print("\tSaving all splited matrices ...")
    _save_splits(full_splits, dataset="vis")
def __init__(self, dataset, batch_size):
    """Build a batched data loader for *dataset*.

    :param dataset: name of the dataset (used to look up its Config)
    :param batch_size: number of samples per training batch
    """
    # ==== params =====
    self.dataset = dataset
    self.cfg = Config(dataset=dataset)

    # ===== sizes =====
    self.batch_size = batch_size
    # Placeholders; overwritten below once the label arrays are loaded.
    # BUG FIX: this used to assign `self.valid_size` while every other
    # reference uses `self.val_size`, leaving a stale attribute stuck at 0.
    self.train_size, self.test_size, self.val_size = 0, 0, 0

    # ===== inner variables =====
    self.batch_index = 0
    self.has_next = False

    # ===== datasets =====
    self.train_ind, self.train_label = self.load_data("train")
    self.test_ind, self.test_label = self.load_data("test")
    self.val_ind, self.val_label = self.load_data("val")
    self.train_size = self.train_label.shape[0]
    self.test_size = self.test_label.shape[0]
    self.val_size = self.val_label.shape[0]
    self.feature_size, self.field_size = self.load_statistics()

    # ===== iter count =====
    self.train_iter_count = self.train_size // self.batch_size
def _split_ind_val_label(dataset, df_train, df_test, df_val):
    """Convert each data split into (index, value, label) form.

    Builds a FeatureDictionary over all three splits, runs each split
    through it, and returns the parsed splits plus the dictionary itself.
    """
    cfg = Config(dataset=dataset)
    feat_dict = FeatureDictionary(
        df_train=df_train, df_test=df_test, df_val=df_val, cfg=cfg)

    # parse every split with the shared feature dictionary
    parsed = {
        name: feat_dict.parse(df=frame)
        for name, frame in (
            ("train", df_train),
            ("test", df_test),
            ("val", df_val),
        )
    }

    return parsed["train"], parsed["val"], parsed["test"], feat_dict
def __init__(self, config: const.Config, device: torch.device) -> None:
    """Set up the model, objective, optimizer, and training data pipeline."""
    self.config = config
    self.device = device

    # model + training objective
    self.seq2seq = Seq2seq(config, device=device, load_emb=True)
    self.loss = nn.NLLLoss()
    self.optimizer = torch.optim.Adam(self.seq2seq.parameters())

    # load and preprocess the training split
    config.name = 'train'
    raw = prepare.load_data('train')
    processed = prepare.process(raw)
    self.data = data_prepare.Data(processed, config.batch_size, config)

    self.max_sentence_length = config.max_sentence_length
    # NOTE(review): +1 presumably so a 1-based epoch loop runs
    # config.epoch_number times — confirm against the training loop.
    self.epoch_number = config.epoch_number + 1
from prepare_feature import Feature
import model
import numpy as np
import torch
from const import Config

config = Config()
feature = Feature(config)


class Parser(object):
    """Transition-based (arc-standard style) dependency parser state.

    Tracks a stack, a buffer of remaining words, and the list of
    dependency arcs produced so far.
    """

    def __init__(self, sentence):
        self.sentence = sentence
        self.stack = ["<ROOT>"]
        self.buffer = list(sentence)
        self.dep = []  # list of (head, dependent) arcs

    def parse_step(self, transition):
        """Apply one transition: "S" (shift), "L" (left-arc), "R" (right-arc)."""
        if transition == "S" and len(self.buffer) > 0:
            # SHIFT: remove b1 from the buffer and push it onto the stack.
            word = self.buffer.pop(0)
            self.stack.append(word)
        elif transition == "L":
            # LEFT-ARC: add a dependency arc s1 -> s2,
            # then remove s2 from the stack.
            head = self.stack[-1]
            dependent = self.stack.pop(-2)
            self.dep.append((head, dependent))
        elif transition == 'R':
            # RIGHT-ARC: add a dependency arc s2 -> s1,
            # then remove s1 from the stack.
            head = self.stack[-2]
            dependent = self.stack.pop()
            # BUG FIX: the original popped the dependent but never recorded
            # the arc; record it exactly as the "L" branch does.
            self.dep.append((head, dependent))