def __init__(self, topic, max_sequence, vocab_filename):
    self.train_data = None
    self.dev_data = None
    self.test_data = None
    self.topic = topic
    voca_path = os.path.join(data_path, vocab_filename)
    assert os.path.exists(voca_path)
    self.encoder = SubwordTextEncoder(voca_path)
    self.max_sequence = max_sequence
class AuxPairLoader:
    def __init__(self, seq_length, shared_setting, grouped_data):
        voca_path = os.path.join(data_path, shared_setting.vocab_filename)
        self.voca_size = shared_setting.vocab_size
        self.encoder = SubwordTextEncoder(voca_path)
        self.seq_length = seq_length
        self.grouped_data = grouped_data
        self.sampler = KeySampler(self.grouped_data)

    def encode(self, sent):
        # Right-pad to seq_length; sequences longer than seq_length are
        # left as-is here and truncated later by slice_n_pad.
        tokens = self.encoder.encode(sent)
        pad_len = self.seq_length - len(tokens)
        return tokens + pad_len * [PAD_ID]

    def case_encoder(self, pair):
        # pair: (sent1, sent2), two raw sentence strings
        sent1, sent2 = pair
        sent1_enc = slice_n_pad(self.encode(sent1), self.seq_length, PAD_ID)
        sent2_enc = slice_n_pad(self.encode(sent2), self.seq_length, PAD_ID)
        return [(sent1_enc, sent2_enc)]

    def get_insts(self, data_size):
        sent_pairs = pos_sampling(self.grouped_data, self.sampler, data_size)
        return [self.case_encoder(p) for p in sent_pairs]
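# slice_n_pad is a project helper that is not shown in these snippets. A
# minimal sketch of the behavior its call sites imply (truncate to at most
# `length` ids, then right-pad with `pad_id`); the repo's implementation
# may differ in details.
def slice_n_pad(ids, length, pad_id):
    ids = ids[:length]                            # drop tokens past `length`
    return ids + (length - len(ids)) * [pad_id]   # right-pad short sequences

# Example: slice_n_pad([5, 6, 7], 5, 0) == [5, 6, 7, 0, 0]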
def __init__(self, max_sequence, vocab_filename, voca_size, is_span):
    self.train_data = None
    self.dev_data = None
    self.test_data = None
    voca_path = os.path.join(data_path, vocab_filename)
    self.lower_case = True
    self.sep_char = "#"
    self.encoder = FullTokenizerWarpper(voca_path)
    self.voca_size = voca_size
    self.dev_explain = None
    self.encoder_unit = EncoderUnit(max_sequence, voca_path)
    self.max_seq = max_sequence
    self.question = [
        "What is title of the controversy?",
        "What is the controversy about?",
    ]
    # Use the title question unless span prediction is requested.
    self.q_id = 1 if is_span else 0
    self.is_span = is_span
    # Offset of the document text within the packed sequence:
    # question length plus two special tokens.
    self.text_offset = len(self.encoder.encode(self.question[self.q_id])) + 2
    data = load_annotation()
    self.all_data = self.generate_data(data)
    self.train_data, self.dev_data = self.held_out(self.all_data)
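# The "+ 2" in text_offset above presumably accounts for two special
# tokens around the question, giving a packed layout like
#
#   [CLS] q_1 ... q_m [SEP] d_1 d_2 ...
#   |---- text_offset = m + 2 ----|
#
# i.e. text_offset is the index where the document text starts. This is
# an inference from the call site, not confirmed by the snippets here.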
def __init__(self, max_sequence, vocab_filename, voca_size, using_alt_tokenizer=False):
    self.train_data = None
    self.dev_data = None
    self.test_data = None
    self.dev_file = os.path.join(corpus_dir, "dev.txt")
    self.test_file = os.path.join(corpus_dir, "test.txt")
    self.max_seq = max_sequence
    self.voca_size = voca_size
    voca_path = os.path.join(data_path, vocab_filename)
    assert os.path.exists(voca_path)
    print(voca_path)
    if not using_alt_tokenizer:
        self.encoder = SubwordTextEncoder(voca_path)
        self.sep_char = "_"
        self.lower_case = False
    else:
        self.lower_case = True
        self.sep_char = "#"
        self.encoder = FullTokenizerWarpper(voca_path)
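# FullTokenizerWarpper is a project class not shown here. A plausible
# minimal sketch, assuming it wraps BERT's WordPiece FullTokenizer (the
# lower_case / "#" settings above match BERT conventions) and exposes the
# same encode() -> list[int] interface as SubwordTextEncoder. This is an
# inference from the call sites, not the repo's actual code.
import tokenization  # assumption: BERT's tokenization.py is vendored


class FullTokenizerWarpper:
    def __init__(self, vocab_path):
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=True)

    def encode(self, text):
        tokens = self.tokenizer.tokenize(text)
        return self.tokenizer.convert_tokens_to_ids(tokens)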
def avg_token_length():
    s = "atheism"
    cont_list = tweet_reader.load_as_text_chunk(s)
    voca_path = os.path.join(data_path, Tweets2Stance.vocab_filename)
    encoder = SubwordTextEncoder(voca_path)
    # Histogram of subword lengths over (at most) the first 1000 tweets.
    n = 0
    histogram = Counter()
    for sent in cont_list:
        tokens = encoder.encode(sent)
        histogram[len(tokens)] += 1
        n += 1
        if n > 1000:
            break
    # Despite the name, this reports the length distribution rather than
    # the mean: print the cumulative count of tweets with <= i tokens.
    accum = 0
    for i in range(100):
        accum += histogram[i]
        print("{} : {}".format(i, accum))
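# Reading the output of avg_token_length: line i holds the cumulative
# number of sampled tweets with at most i subword tokens, i.e. an
# unnormalized CDF of tweet lengths. For instance, with
# histogram == Counter({8: 400, 12: 500, 30: 100}) the printed count
# reaches 900 at i = 12 and 1000 at i = 30, so seq_length = 30 would
# cover every sampled tweet.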
class DataLoader():
    def __init__(self, seq_length, shared_setting):
        voca_path = os.path.join(data_path, shared_setting.vocab_filename)
        self.voca_size = shared_setting.vocab_size
        self.encoder = SubwordTextEncoder(voca_path)
        self.seq_length = seq_length
        self.mask_rate = 0.15

    def token_generator(self, reader):
        # Concatenate encoded lines and emit fixed windows of seq_length
        # tokens; the while-loop drains long lines completely.
        buf = []
        for line in reader:
            buf.extend(self.encoder.encode(line))
            while len(buf) > self.seq_length:
                yield buf[:self.seq_length]
                buf = buf[self.seq_length:]

    def case_generator(self, reader):
        sents = self.token_generator(reader)
        random.seed(0)  # deterministic corruption across runs
        n_delete = int(self.seq_length * self.mask_rate)
        for sent in sents:
            delete_indice = random.sample(range(self.seq_length), n_delete)
            x = list(sent)
            for idx in delete_indice:
                # BERT-style corruption of the selected positions:
                # 80% -> mask token, 10% -> random token, 10% -> unchanged.
                action = random.randrange(0, 10)
                if action < 8:
                    x[idx] = C_MASK_ID
                elif action == 8:
                    x[idx] = random.randrange(0, self.voca_size)
                # else: keep the original token
            y = list(sent)
            yield x, y

    # Child classes feed their own text to case_generator and return a
    # generator of (x, y) tuples.
    def get_train_generator(self):
        raise NotImplementedError()

    def get_test_generator(self):
        raise NotImplementedError()
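# A self-contained demonstration of the corruption scheme used in
# case_generator above; mask_id and voca_size are made-up values here.
import random

def corrupt(sent, mask_rate=0.15, mask_id=3, voca_size=100):
    x = list(sent)
    n_delete = int(len(sent) * mask_rate)
    for idx in random.sample(range(len(sent)), n_delete):
        action = random.randrange(0, 10)
        if action < 8:
            x[idx] = mask_id                       # 8/10: mask token
        elif action == 8:
            x[idx] = random.randrange(voca_size)   # 1/10: random token
        # else 1/10: keep the original token
    return x, list(sent)                           # (corrupted input, target)

# corrupt(list(range(20))) alters ~3 of 20 positions; the target y is
# always the uncorrupted sequence.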
class DataLoader:
    def __init__(self, topic, max_sequence, vocab_filename):
        self.train_data = None
        self.dev_data = None
        self.test_data = None
        self.topic = topic
        voca_path = os.path.join(data_path, vocab_filename)
        assert os.path.exists(voca_path)
        if "ST" in vocab_filename:
            self.encoder = TokenTextEncoder(voca_path, replace_oov=SPEC_4)
        else:
            self.encoder = SubwordTextEncoder(voca_path)
        self.max_sequence = max_sequence

    def example_generator(self, corpus_path, topic):
        select_target = dict_topic2full_desc[topic]
        label_list = stance_label
        f = open(corpus_path, "r", encoding="utf-8", errors="ignore")
        reader = csv.reader(f, delimiter=',')
        for idx, row in enumerate(reader):
            if idx == 0:
                continue  # skip header
            # Works for both splits even though dev has some extra human labels.
            sent = row[0]
            target = row[1]
            label = label_list.index(row[2])
            # Keep the row if no target filter is set, or if the row's
            # target matches the selected topic description.
            if select_target is None or target in select_target:
                yield {"inputs": sent, "label": label}

    def load_train_data(self):
        path = os.path.join(corpus_dir, "train.csv")
        plain_data = list(self.example_generator(path, self.topic))
        random.shuffle(plain_data)
        # 90/10 split of train.csv into train and dev.
        train_size = int(0.9 * len(plain_data))
        self.train_data_raw = plain_data[:train_size]
        self.dev_data_raw = plain_data[train_size:]
        self.train_data = self.encode(self.train_data_raw)
        self.dev_data = self.encode(self.dev_data_raw)

    def load_test_data(self):
        path = os.path.join(corpus_dir, "test.csv")
        self.test_data_raw = list(self.example_generator(path, self.topic))
        self.test_data = self.encode(self.test_data_raw)

    @classmethod
    def dict2tuple(cls, data):
        X = []
        Y = []
        for entry in data:
            X.append(entry["inputs"])
            Y.append(entry["label"])
        return X, Y

    def get_train_data(self):
        if self.train_data is None:
            self.load_train_data()
        return self.dict2tuple(self.train_data)

    def get_dev_data(self):
        if self.dev_data is None:
            self.load_train_data()
        return self.dict2tuple(self.dev_data)

    def get_test_data(self):
        if self.test_data is None:
            self.load_test_data()
        return self.dict2tuple(self.test_data)

    def encode(self, plain_data):
        # Prepend [CLS], truncate to max_sequence, then right-pad.
        # Returns a list so the encoded data can be iterated repeatedly.
        out = []
        for entry in plain_data:
            key = "inputs"
            coded_text = [CLS_ID] + self.encoder.encode(
                entry[key])[:self.max_sequence - 1]
            pad = (self.max_sequence - len(coded_text)) * [text_encoder.PAD_ID]
            entry[key] = coded_text + pad
            out.append(entry)
        return out
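# Usage sketch for the stance DataLoader above (the topic string and
# vocab filename are hypothetical; valid topics are whatever keys
# dict_topic2full_desc defines):
#
#   loader = DataLoader(topic="atheism", max_sequence=100,
#                       vocab_filename="shared_voca.subword")
#   x_train, y_train = loader.get_train_data()  # padded id lists, int labels
#   x_dev, y_dev = loader.get_dev_data()        # the 10% held-out slice
#   x_test, y_test = loader.get_test_data()     # from test.csv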
class PairDataLoader():
    def __init__(self, seq_length, shared_setting, grouped_data):
        voca_path = os.path.join(data_path, shared_setting.vocab_filename)
        self.voca_size = shared_setting.vocab_size
        self.encoder = SubwordTextEncoder(voca_path)
        self.seq_length = seq_length
        self.mask_rate = 0.15
        self.grouped_data = grouped_data
        self.train_group = None
        self.test_group = None
        self.test_sampler = None
        self.train_sampler = None

    @classmethod
    def load_from_pickle(cls, id):
        pickle_name = "PairDataLoader_{}".format(id)
        path = os.path.join(cache_path, pickle_name)
        return pickle.load(open(path, "rb"))

    def save_to_pickle(self, id):
        pickle_name = "PairDataLoader_{}".format(id)
        path = os.path.join(cache_path, pickle_name)
        pickle.dump(self, open(path, "wb"))

    def encode(self, sent):
        tokens = self.encoder.encode(sent)
        pad_len = self.seq_length - len(tokens)
        return tokens + pad_len * [PAD_ID]

    def delete(self, sent):
        # BERT-style corruption: for mask_rate of the positions,
        # 80% -> mask token, 10% -> random token, 10% -> unchanged.
        n_delete = int(self.seq_length * self.mask_rate)
        delete_indice = random.sample(range(self.seq_length), n_delete)
        x = list(sent)
        y = list(sent)
        for idx in delete_indice:
            action = random.randrange(0, 10)
            if action < 8:
                x[idx] = C_MASK_ID
            elif action == 8:
                x[idx] = random.randrange(0, self.voca_size)
            # else: keep the original token
        return x, y

    def case_encoder(self, plain_insts):
        # plain_insts: (sent1, sent2, label) triples; label indicates
        # whether the two sentences come from the same group.
        for sent1, sent2, label in plain_insts:
            sent1_enc = slice_n_pad(self.encode(sent1), self.seq_length, PAD_ID)
            sent2_enc = slice_n_pad(self.encode(sent2), self.seq_length, PAD_ID)
            sent1_del, y_1 = self.delete(sent1_enc)
            sent2_del, y_2 = self.delete(sent2_enc)
            x = sent1_del + [SEP_ID] + sent2_del
            y_seq = y_1 + [0] + y_2
            y_cls = label
            yield x, y_seq, y_cls

    @staticmethod
    def split_dict(d, held_out_size):
        keys = list(d.keys())
        indice = random.sample(range(0, len(keys)), held_out_size)
        held_out_keys = set(keys[i] for i in indice)
        train_d = {}
        test_d = {}
        for key, items in d.items():
            if key in held_out_keys:
                test_d[key] = items
            else:
                train_d[key] = items
        return train_d, test_d

    def index_data(self):
        if self.test_group is None:
            self.split_train_test()

    def split_train_test(self):
        print("split_train_test 1")
        held_out_group = 4000
        self.train_group, self.test_group = self.split_dict(
            self.grouped_data, held_out_group)
        print("split_train_test 2")
        self.test_sampler = KeySampler(self.test_group)
        print("split_train_test 3")
        self.train_sampler = KeySampler(self.train_group)
        print("split_train_test 4")

    # Child classes feed their own text to case_encoder and return a
    # generator of (x, y_seq, y_cls) tuples.
    def get_train_batch(self, data_size):
        if self.train_group is None:
            self.split_train_test()
        return self.case_encoder(
            pos_neg_pair_sampling(self.train_group, self.train_sampler,
                                  data_size))

    def get_test_generator(self, data_size):
        if self.test_group is None:
            self.split_train_test()
        return self.case_encoder(
            pos_neg_pair_sampling(self.test_group, self.test_sampler,
                                  data_size))
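# KeySampler and pos_neg_pair_sampling are project helpers not shown in
# these snippets. A minimal sketch of what their call sites imply: sample
# a group key (weighted here by group size, which is an assumption) and
# emit (sent1, sent2, label) triples where label 1 means both sentences
# come from the same group. The repo's negative-sampling strategy may
# differ.
import random

class KeySampler:
    def __init__(self, grouped_data):
        self.keys = list(grouped_data.keys())
        self.weights = [len(grouped_data[k]) for k in self.keys]

    def sample2(self):
        return random.choices(self.keys, weights=self.weights, k=1)[0]

def pos_neg_pair_sampling(grouped_data, sampler, data_size):
    for _ in range(data_size):
        if random.random() < 0.5:
            # positive: two sentences from one group (assumes >= 2 items)
            s1, s2 = random.sample(grouped_data[sampler.sample2()], 2)
            yield s1, s2, 1
        else:
            # negative: one sentence from each of two sampled groups
            # (the two keys can coincide by chance in this sketch)
            k1, k2 = sampler.sample2(), sampler.sample2()
            yield (random.choice(grouped_data[k1]),
                   random.choice(grouped_data[k2]), 0)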
class AuthorAsDoc:
    def __init__(self, seq_length, shared_setting, grouped_data):
        voca_path = os.path.join(data_path, shared_setting.vocab_filename)
        self.voca_size = shared_setting.vocab_size
        self.encoder = SubwordTextEncoder(voca_path)
        self.seq_length = seq_length
        self.grouped_data = grouped_data
        self.train_group = None
        self.test_group = None
        self.test_sampler = None
        self.train_sampler = None
        self.mask_rate = 0.15

    @staticmethod
    def split_dict(d, held_out_size):
        keys = list(d.keys())
        indice = random.sample(range(0, len(keys)), held_out_size)
        held_out_keys = set(keys[i] for i in indice)
        train_d = {}
        test_d = {}
        for key, items in d.items():
            if key in held_out_keys:
                test_d[key] = items
            else:
                train_d[key] = items
        return train_d, test_d

    def index_data(self):
        if self.test_group is None:
            self.split_train_test()

    def split_train_test(self):
        print("split_train_test 1")
        held_out_group = 4000
        self.train_group, self.test_group = self.split_dict(
            self.grouped_data, held_out_group)
        print("split_train_test 2")
        self.test_sampler = KeySampler(self.test_group)
        print("split_train_test 3")
        self.train_sampler = KeySampler(self.train_group)
        print("split_train_test 4")

    @classmethod
    def load_from_pickle(cls, id):
        pickle_name = "AuthorAsDoc_{}".format(id)
        path = os.path.join(cache_path, pickle_name)
        return pickle.load(open(path, "rb"))

    def save_to_pickle(self, id):
        pickle_name = "AuthorAsDoc_{}".format(id)
        path = os.path.join(cache_path, pickle_name)
        pickle.dump(self, open(path, "wb"))

    def encode(self, sent):
        tokens = self.encoder.encode(sent)
        return tokens + [SEP_ID]

    def delete_alter(self, sent):
        # BERT-style corruption: for mask_rate of the positions,
        # 80% -> mask token, 10% -> random token, 10% -> unchanged.
        n_delete = int(self.seq_length * self.mask_rate)
        delete_indice = random.sample(range(self.seq_length), n_delete)
        x = list(sent)
        for idx in delete_indice:
            action = random.randrange(0, 10)
            if action < 8:
                x[idx] = C_MASK_ID
            elif action == 8:
                x[idx] = random.randrange(0, self.voca_size)
            # else: keep the original token
        y = list(sent)
        return x, y

    def get_instances(self, grouped_dict, key_sampler, data_size):
        # Pack as many of one author's sentences as fit into seq_length,
        # starting from a random sentence and wrapping around at most once.
        data = []
        for i in range(data_size):
            key = key_sampler.sample2()
            items = grouped_dict[key]
            seq = []
            j_init = random.randint(0, len(items) - 1)
            j = j_init
            while len(seq) < self.seq_length:
                sent = self.encode(items[j])
                if len(seq) + len(sent) > self.seq_length:
                    break
                seq += sent
                j = increment_circular(j, len(items))
                if j == j_init:
                    break  # wrapped all the way around
            seq = slice_n_pad(seq, self.seq_length, PAD_ID)
            data.append(self.delete_alter(seq))
        return data

    def get_train_instances(self, data_size):
        return self.get_instances(self.train_group, self.train_sampler, data_size)

    def get_test_instances(self, data_size):
        return self.get_instances(self.test_group, self.test_sampler, data_size)
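# increment_circular is the last helper these snippets assume; its call
# site in get_instances implies a simple modular increment. A one-line
# sketch (the repo's version may differ):
def increment_circular(j, length):
    return (j + 1) % length  # length - 1 wraps back to 0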