def __init__(self, data_path, tokenizer, transforms, vocab, args):
    """Load a tab-separated dataset file and prepare per-row records.

    The file at ``data_path`` is read with pandas and must contain the
    columns ``gold_label``, ``sentence1``, ``sentence2`` and ``image``.
    Rows with a missing ``sentence2`` are dropped, ``gold_label`` and
    ``image`` are renamed to ``label`` and ``img``, and every ``img`` value
    is prefixed with ``args.img_path``.

    Args:
        data_path: Path to the tab-separated data file.
        tokenizer: Stored unchanged on ``self.tokenizer``.
        transforms: Stored unchanged on ``self.transforms``.
        vocab: Stored unchanged on ``self.vocab``.
        args: Configuration object; this method reads ``img_path``,
            ``labels``, ``model``, ``drop_img_percent``, ``max_seq_len``
            and (for the "mmbt" model) ``num_image_embeds``.
    """
    columns = ['gold_label', 'sentence1', 'sentence2', 'image']
    df = pd.read_csv(data_path, sep='\t')[columns]
    print(f'{data_path}, number of rows: {len(df)}')

    # Compute the null mask once and reuse it for the count and the filter.
    missing_sentence2 = df['sentence2'].isnull()
    n_missing = missing_sentence2.sum()
    if n_missing > 0:
        print(
            f" drop number of lines because of missing sentence2: {n_missing}"
        )
        df = df.loc[~missing_sentence2]

    df = df.rename({'gold_label': 'label', 'image': 'img'}, axis=1)
    df['img'] = args.img_path + '/' + df['img']

    self.data = df.to_dict('records')
    self.data_dir = str(os.path.dirname(data_path))
    self.tokenizer = tokenizer
    self.args = args
    self.vocab = vocab
    self.n_classes = len(args.labels)
    self.text_start_token = (
        ["[CLS]"] if args.model != "mmbt" else ["[SEP]"]
    )

    # NOTE(review): numpy_seed is defined elsewhere; presumably it fixes the
    # NumPy RNG seed inside the block so the same rows lose their image on
    # every run -- confirm against its definition.
    with numpy_seed(0):
        for row in self.data:
            if np.random.random() < args.drop_img_percent:
                row["img"] = None

    self.max_seq_len = args.max_seq_len
    if args.model == "mmbt":
        # Presumably reserves sequence positions for the image embeddings.
        self.max_seq_len -= args.num_image_embeds

    self.transforms = transforms
def __init__(self, data_path, tokenizer, transforms, vocab, args):
    """Load a JSON-lines dataset file and prepare per-row records.

    Each line of the file at ``data_path`` is parsed with ``json.loads``
    and the resulting objects are kept, in file order, in ``self.data``.

    Args:
        data_path: Path to the file holding one JSON document per line.
        tokenizer: Stored unchanged on ``self.tokenizer``.
        transforms: Stored unchanged on ``self.transforms``.
        vocab: Stored unchanged on ``self.vocab``.
        args: Configuration object; this method reads ``labels``,
            ``model``, ``drop_img_percent``, ``max_seq_len`` and (for the
            "mmbt" model) ``num_image_embeds``.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(data_path) as data_file:
        self.data = [json.loads(line) for line in data_file]

    self.data_dir = os.path.dirname(data_path)
    self.tokenizer = tokenizer
    self.args = args
    self.vocab = vocab
    self.n_classes = len(args.labels)
    self.text_start_token = (
        ["[CLS]"] if args.model != "mmbt" else ["[SEP]"]
    )

    # NOTE(review): numpy_seed is defined elsewhere; presumably it fixes the
    # NumPy RNG seed inside the block so the same rows lose their image on
    # every run -- confirm against its definition.
    with numpy_seed(0):
        for row in self.data:
            if np.random.random() < args.drop_img_percent:
                row["img"] = None

    self.max_seq_len = args.max_seq_len
    if args.model == "mmbt":
        # Presumably reserves sequence positions for the image embeddings.
        self.max_seq_len -= args.num_image_embeds

    self.transforms = transforms