Example #1
    def __init__(self, task_config, lm):
        self.config = task_config
        self.tokenizer = get_tokenizer(lm=lm)
        self.len_cache = {}  # cache for computed sequence lengths

        # build the TF-IDF index
        self.build_index()
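
A self-contained sketch of the kind of TF-IDF index that a build_index() like the one above might construct. The toy corpus, the IDF formula, and the use of a length cache keyed by document id are all assumptions, not the repository's actual implementation:

import math
from collections import Counter

corpus = [['asus', 'laptop', '14', 'inch'],
          ['dell', 'laptop', '15', 'inch'],
          ['apple', 'phone', 'case']]

df = Counter()                      # document frequency of each token (assumption)
for tokens in corpus:
    df.update(set(tokens))

N = len(corpus)
idf = {tok: math.log(N / cnt) for tok, cnt in df.items()}
len_cache = {i: len(tokens) for i, tokens in enumerate(corpus)}  # analogue of self.len_cache

print(idf['laptop'])   # common token -> low IDF
print(len_cache[0])    # 4 tokens in the first document
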
Example #2
    def __init__(self,
                 source,
                 vocab,
                 taskname,
                 max_len=512,
                 lm='distilbert',
                 size=None,
                 augment_op=None,
                 balance=False):
        self.tokenizer = get_tokenizer(lm=lm)

        # tokens and tags
        sents, tags_li = [], [] # list of lists
        self.max_len = max_len

        if isinstance(source, str):
            sents, tags_li = self.read_classification_file(source)
            if size is not None:
                sents = sents[:size]
                tags_li = tags_li[:size]
        else:
            # source is already a list of sentences; use the first
            # label in the vocabulary as a placeholder tag
            for sent in source:
                sents.append(sent)
                tags_li.append(vocab[0])

        # assign class variables
        self.sents, self.tags_li = sents, tags_li
        self.vocab = vocab

        # index for tags/labels
        self.tag2idx = {tag: idx for idx, tag in enumerate(self.vocab)}
        self.idx2tag = {idx: tag for idx, tag in enumerate(self.vocab)}
        self.taskname = taskname

        # augmentation op
        self.augment_op = augment_op
        if augment_op == 't5':
            self.load_t5_examples(source)
        elif augment_op is not None:
            self.augmenter = Augmenter()
        else:
            self.augmenter = None

        self.balance = balance
        if balance:
            # partition example ids by label
            self.pos_sents = []
            self.neg_sents = []
            self.neg_cnt = []

            for sid, (sent, lbl) in enumerate(zip(sents, tags_li)):
                if int(lbl) == 0:
                    self.neg_sents.append(sid)
                    self.neg_cnt.append(0)
                else:
                    self.pos_sents.append(sid)
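
The pos_sents / neg_sents id lists above suggest sampling the two classes evenly. A minimal stand-alone sketch of that idea, assuming each batch is drawn half from each list (the balanced_batch helper is hypothetical, not part of the example):

import random

def balanced_batch(pos_ids, neg_ids, batch_size=8):
    """Draw half of each batch from positive ids and half from negative ids."""
    half = batch_size // 2
    return random.sample(pos_ids, half) + random.sample(neg_ids, half)

pos, neg = [0, 2, 5, 7], [1, 3, 4, 6, 8, 9]
print(balanced_batch(pos, neg, batch_size=4))  # e.g. [5, 0, 8, 3]
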
Example #3
    def __init__(self, train_fn, idf_fn, w2v, task, bert_path, lm='bert'):
        if 'tagging' in task or 'qa' in task:
            self.tokens, self.labels = read_tagging_file(train_fn)
        else:
            self.sents = read_asc_file(train_fn)
            self.tokens = [x['token'] for x in self.sents]

        with open(idf_fn) as f:
            idf_dict = json.load(f)
        self.w2v = w2v
        self.task = task
        self.index = {'token': dict(), 'span': dict()}  # replacement indices
        self.all_spans = list()
        self.span_freqs = list()
        self.avg_senti = dict()  # average sentiment scores (classification only)
        if self.task == 'classification':
            # sentiment sensitive
            self.calc_senti_score()
        self.tokenizer = get_tokenizer(lm=lm)
        self.init_token_index(idf_dict)
        self.init_span_index(bert_path=bert_path)
        self.index_token_replacement()
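
index_token_replacement() is not shown here, but given the IDF dictionary loaded above, one plausible scheme is to group tokens into IDF buckets so that a token can be swapped for another of similar rarity. A minimal sketch under that assumption (the bucketing rule and the sample idf_dict are made up):

idf_dict = {'good': 0.4, 'great': 0.5, 'battery': 2.1, 'screen': 2.3}

buckets = {}
for tok, idf in idf_dict.items():
    buckets.setdefault(round(idf), []).append(tok)  # bucket by rounded IDF (assumption)

def replacement_candidates(token):
    """Tokens in the same IDF bucket as `token`, excluding the token itself."""
    return [t for t in buckets.get(round(idf_dict[token]), []) if t != token]

print(replacement_candidates('battery'))   # -> ['screen']
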
Example #4
    def __init__(self,
                 source,
                 vocab,
                 taskname,
                 max_len=512,
                 lm='distilbert',
                 size=None,
                 augment_op=None):
        self.tokenizer = get_tokenizer(lm=lm)

        # tokens and tags
        sents, tags_li = [], [] # list of lists
        self.max_len = max_len

        if isinstance(source, str):
            sents, tags_li = self.read_classification_file(source)
            if size is not None:
                sents = sents[:size]
                tags_li = tags_li[:size]
        else:
            for sent in source:
                sents.append(sent)
                tags_li.append(vocab[0])

        # assign class variables
        self.sents, self.tags_li = sents, tags_li
        self.vocab = vocab

        # index for tags/labels
        self.tag2idx = {tag: idx for idx, tag in enumerate(self.vocab)}
        self.idx2tag = {idx: tag for idx, tag in enumerate(self.vocab)}
        self.taskname = taskname

        # augmentation op; record it unconditionally so the attribute
        # always exists, unlike the original which only set it in one branch
        self.augment_op = augment_op
        if augment_op is not None:
            self.augmenter = Augmenter()
        else:
            self.augmenter = None
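
The dataset constructors in Examples #2 and #4 build the same tag2idx / idx2tag pair. A stand-alone illustration of that round-trip mapping, with a made-up label vocabulary:

vocab = ['negative', 'positive']   # hypothetical label vocabulary
tag2idx = {tag: idx for idx, tag in enumerate(vocab)}
idx2tag = {idx: tag for idx, tag in enumerate(vocab)}

tags = ['positive', 'negative', 'positive']
ids = [tag2idx[t] for t in tags]          # string labels -> integer ids
print(ids, [idx2tag[i] for i in ids])     # round-trip back to labels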