def getDataSet(opt):
    """Return the processed dataset for the configuration in *opt*.

    Delegates to the project-local ``dataloader`` module; ``process()`` is
    expected to yield the prepared data (e.g. train/test/dev splits) —
    TODO(review): confirm against ``dataloader``'s implementation.
    """
    import dataloader
    dataset = dataloader.getDataset(opt)
    return dataset.process()
def getDataset(opt):
    """Fetch the dataset described by *opt* via the ``dataloader`` package.

    Returns whatever ``getFormatedData()`` yields — per the original
    comments, the processed file names (text and label) produced by the
    dataset's ``process()`` function.
    """
    # Use the dataloader part of the project.
    import dataloader
    ds = dataloader.getDataset(opt)
    return ds.getFormatedData()
opt.alphabet=alphabet # alphabet.dump(opt.dataset+".alphabet") for data in datas: if "bert" not in opt.model.lower(): data["text"]= data["text"].apply(lambda text: [alphabet.get(word,alphabet.unknow_token) for word in text[:opt.max_seq_len]] + [alphabet.padding_token] *int(opt.max_seq_len-len(text)) ) else : data["text"]= data["text"].apply(process_with_bert,tokenizer=tokenizer,max_seq_len = opt.max_seq_len) data["label"]=data["label"].apply(lambda text: label_alphabet.get(text)) return map(lambda x:BucketIterator(x,opt),datas)#map(BucketIterator,datas) # def loadDataWithoutEmbedding(opt): datas=[] for filename in getDataSet(opt): df = pd.read_csv(filename,header = None,sep="\t",names=["text","label"]).fillna('0') df["text"]= df["text"].str.lower() datas.append((df["text"],df["label"])) return datas if __name__ =="__main__": import opts opt = opts.parse_opt() opt.max_seq_len=-1 import dataloader dataset= dataloader.getDataset(opt) datas=loadData(opt)
def getDataSet(opt):
    """Look up the dataset selected by *opt* and return its formatted data.

    NOTE(review): this redefines ``getDataSet`` from earlier in the file
    (which returned ``dataset.process()``); in Python the later definition
    wins — confirm which variant callers actually expect.
    """
    import dataloader
    return dataloader.getDataset(opt).getFormatedData()