def __init__(self, task_config, lm):
    self.config = task_config
    self.tokenizer = get_tokenizer(lm=lm)
    self.len_cache = {}  # memoizes tokenized lengths so each is computed once
    # build the tfidf index
    self.build_index()
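# Usage sketch (illustrative only). The enclosing class name `TfidfIndex`
# and the shape of `task_config` are assumptions, not defined in this file;
# `get_tokenizer` and `build_index` are the helpers referenced above.
#
#   index = TfidfIndex(task_config=config, lm='distilbert')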
def __init__(self, source, vocab, taskname, max_len=512, lm='distilbert',
             size=None, augment_op=None, balance=False):
    self.tokenizer = get_tokenizer(lm=lm)
    # tokens and tags
    sents, tags_li = [], []  # list of lists
    self.max_len = max_len

    if isinstance(source, str):
        # source is a path: read sentences and labels from the file
        sents, tags_li = self.read_classification_file(source)
        if size is not None:
            sents = sents[:size]
            tags_li = tags_li[:size]
    else:
        # source is a list of raw sentences: use the first label as a placeholder
        for sent in source:
            sents.append(sent)
            tags_li.append(vocab[0])

    # assign class variables
    self.sents, self.tags_li = sents, tags_li
    self.vocab = vocab

    # index for tags/labels
    self.tag2idx = {tag: idx for idx, tag in enumerate(self.vocab)}
    self.idx2tag = {idx: tag for idx, tag in enumerate(self.vocab)}
    self.taskname = taskname

    # augmentation op
    self.augment_op = augment_op
    if augment_op == 't5':
        # t5: load pre-generated examples instead of an on-the-fly augmenter
        self.load_t5_examples(source)
    elif augment_op is not None:
        self.augmenter = Augmenter()
    else:
        self.augmenter = None

    self.balance = balance
    if balance:
        # partition example ids by label (used for balanced sampling)
        self.pos_sents = []
        self.neg_sents = []
        self.neg_cnt = []
        for sid, (sent, lbl) in enumerate(zip(sents, tags_li)):
            if int(lbl) == 0:
                self.neg_sents.append(sid)
                self.neg_cnt.append(0)
            else:
                self.pos_sents.append(sid)
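# Usage sketch (illustrative only). The class name `TextCLSDataset`, the
# file path, and the op name 'del' are assumptions, not confirmed by this
# file; the keyword arguments match the signature above.
#
#   train_set = TextCLSDataset('data/train.txt',
#                              vocab=['0', '1'],   # label strings
#                              taskname='cls',
#                              lm='distilbert',
#                              augment_op='del',   # any non-t5 op builds an Augmenter()
#                              balance=True)       # builds the pos/neg id lists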
def __init__(self, train_fn, idf_fn, w2v, task, bert_path, lm='bert'):
    if 'tagging' in task or 'qa' in task:
        self.tokens, self.labels = read_tagging_file(train_fn)
    else:
        self.sents = read_asc_file(train_fn)
        self.tokens = [sent['token'] for sent in self.sents]

    # load pre-computed inverse document frequencies
    with open(idf_fn) as fin:
        idf_dict = json.load(fin)

    self.w2v = w2v
    self.task = task
    self.index = {'token': dict(), 'span': dict()}
    self.all_spans = list()
    self.span_freqs = list()
    self.avg_senti = dict()

    if self.task == 'classification':
        # sentiment sensitive
        self.calc_senti_score()

    self.tokenizer = get_tokenizer(lm=lm)
    self.init_token_index(idf_dict)
    self.init_span_index(bert_path=bert_path)
    self.index_token_replacement()
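# Usage sketch (illustrative only). The class name `CorpusIndex`, the file
# paths, and the word2vec loading step are assumptions; the remaining
# arguments mirror the signature above.
#
#   from gensim.models import KeyedVectors
#   w2v = KeyedVectors.load_word2vec_format('w2v.bin', binary=True)
#   index = CorpusIndex('train.txt', 'idf.json', w2v,
#                       task='classification',
#                       bert_path='bert-base-uncased')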
def __init__(self, source, vocab, taskname, max_len=512, lm='distilbert',
             size=None, augment_op=None):
    self.tokenizer = get_tokenizer(lm=lm)
    # tokens and tags
    sents, tags_li = [], []  # list of lists
    self.max_len = max_len

    if isinstance(source, str):
        # source is a path: read sentences and labels from the file
        sents, tags_li = self.read_classification_file(source)
        if size is not None:
            sents = sents[:size]
            tags_li = tags_li[:size]
    else:
        # source is a list of raw sentences: use the first label as a placeholder
        for sent in source:
            sents.append(sent)
            tags_li.append(vocab[0])

    # assign class variables
    self.sents, self.tags_li = sents, tags_li
    self.vocab = vocab

    # index for tags/labels
    self.tag2idx = {tag: idx for idx, tag in enumerate(self.vocab)}
    self.idx2tag = {idx: tag for idx, tag in enumerate(self.vocab)}
    self.taskname = taskname

    # augmentation op (set unconditionally so the attribute always exists)
    self.augment_op = augment_op
    if augment_op is not None:
        self.augmenter = Augmenter()
    else:
        self.augmenter = None
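# Usage sketch (illustrative only). This is the simpler variant without
# `balance` or t5 support; the class name `TextDataset` is an assumption.
#
#   eval_set = TextDataset(['a great movie', 'a dull movie'],
#                          vocab=['0', '1'],   # list input gets vocab[0] as placeholder
#                          taskname='cls',
#                          lm='distilbert')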