def Test25():
    """Smoke-test the training DataLoader: load parallel corpora, then print
    every field of every batch.

    Relies on project module ``DL`` and on the files ``src.sents``,
    ``src.vocab``, ``tgt.sents`` and ``tgt.vocab`` being present in the
    working directory.  Side effect: prints to stdout; returns None.
    """
    MaxLength = 30
    BatchSize = 2
    # NOTE: the original also set EmbeddingSize/HeadNum here, but this test
    # never builds a model, so those unused locals were removed.
    SrcIndSentences, SrcLength, SrcDict = DL.LoadData("src.sents", "src.vocab", MaxLength)
    TgtIndSentences, TgtLength, TgtDict = DL.LoadData("tgt.sents", "tgt.vocab", MaxLength)
    TrainDataset = DL.TrainCorpusDataset(SrcIndSentences, SrcLength,
                                         TgtIndSentences, TgtLength)
    BatchDatas = DL.TrainDataLoaderCreator(TrainDataset, BatchSize)
    for Batch in BatchDatas:
        # Distinct per-batch names so the corpus-level SrcLength/TgtLength
        # above are not shadowed inside the loop (latent bug in the original).
        BatchSrcSent = Batch["SrcSent"]
        print(BatchSrcSent)
        BatchSrcLength = Batch["SrcLength"]
        print(BatchSrcLength)
        BatchTgtSent = Batch["TgtSent"]
        print(BatchTgtSent)
        BatchTgtLength = Batch["TgtLength"]
        print(BatchTgtLength)
def Test21():
    """Iterate the training DataLoader 100 times and print each batch's
    source-tensor size.

    Relies on project module ``DL`` (and ``t`` — presumably torch — inside
    the collate helper) plus the corpus/vocab files on disk.  Prints to
    stdout; returns None.
    """
    MaxLength = 30

    def CollateFunction(Batch):
        # NOTE(review): this collate helper is defined but never passed to
        # DL.TrainDataLoaderCreator below — confirm whether it should be
        # wired in or deleted.  It regroups a batch of
        # ((src_sent, src_len), (tgt_sent, tgt_len)) pairs into one dict of
        # lists, tensorizing the sentence fields.
        OutputBatch = {
            "SrcSent": [],
            "SrcLength": [],
            "TgtSent": [],
            "TgtLength": []
        }
        for Elem in Batch:
            OutputBatch["SrcSent"].append(Elem[0][0])
            OutputBatch["SrcLength"].append(Elem[0][1])
            OutputBatch["TgtSent"].append(Elem[1][0])
            OutputBatch["TgtLength"].append(Elem[1][1])
        OutputBatch["SrcSent"] = t.LongTensor(OutputBatch["SrcSent"])
        OutputBatch["TgtSent"] = t.LongTensor(OutputBatch["TgtSent"])
        return OutputBatch

    SrcIndSentences, SrcLength, SrcDict = DL.LoadData("src.sents", "src.vocab", MaxLength)
    TgtIndSentences, TgtLength, TgtDict = DL.LoadData("tgt.sents", "tgt.vocab", MaxLength)
    TrainDataset = DL.TrainCorpusDataset(SrcIndSentences, SrcLength,
                                         TgtIndSentences, TgtLength)
    z = DL.TrainDataLoaderCreator(TrainDataset, 4)
    # Idiomatic fixed-count loop replaces the original
    # `while True` + manual Count/break (same 100 passes over the loader).
    for _ in range(100):
        for x in z:
            print("Batch")
            print(x["SrcSent"].size())
def Test26():
    """End-to-end smoke test: push training batches through
    ``T.TransformerNMTModel`` and print the output shapes.

    Relies on project modules ``DL`` and ``T`` plus the corpus/vocab files on
    disk.  Prints to stdout; returns None.
    """
    MaxLength = 30
    BatchSize = 2
    EmbeddingSize = 4
    HeadNum = 2
    EnLayer = 2
    DeLayer = 2
    SrcIndSentences, SrcLength, SrcDict = DL.LoadData("src.sents", "src.vocab", MaxLength)
    TgtIndSentences, TgtLength, TgtDict = DL.LoadData("tgt.sents", "tgt.vocab", MaxLength)
    TrainDataset = DL.TrainCorpusDataset(SrcIndSentences, SrcLength,
                                         TgtIndSentences, TgtLength)
    BatchDatas = DL.TrainDataLoaderCreator(TrainDataset, BatchSize)
    SrcVocabularySize = SrcDict.VocabularySize()
    TgtVocabularySize = TgtDict.VocabularySize()
    Trans = T.TransformerNMTModel(HeadNum, EmbeddingSize, SrcVocabularySize,
                                  TgtVocabularySize, MaxLength, EnLayer, DeLayer)
    for BatchInd, Batch in enumerate(BatchDatas):
        print("BeginBatch")  # fixed typo: was "BegingBatch"
        SrcSent = Batch["SrcSent"]
        print(SrcSent.size())
        SrcLength = Batch["SrcLength"]
        TgtSent = Batch["TgtSent"]
        print(TgtSent.size())
        TgtLength = Batch["TgtLength"]
        # Turn per-sentence lengths into boolean padding masks of width MaxLength.
        SrcMask = T.BatchLengthToBoolTensorMask(SrcLength, MaxLength)
        TgtMask = T.BatchLengthToBoolTensorMask(TgtLength, MaxLength)
        Output = Trans(SrcSent, TgtSent, SrcMask, TgtMask)
        print("Step")
        print(BatchInd + 1)
        print(Output.size())
        print(Output[0][2])
def TestBuildTransformer():
    """Load the JSON config, build a TransformerNMTModel, and prepare the
    test-set DataLoader.

    Returns a 5-tuple ``(Trans, BatchDatas, SrcDict, TgtDict, MaxLength)``:
    the model, the batched test data, the source and target vocabularies,
    and the configured maximum sentence length.
    """
    with open("Model/Config.json") as Fd:
        Cfg = json.load(Fd)

    # Pull every hyper-parameter up front so a missing key fails immediately.
    MaxLength = Cfg["MaxLength"]
    BatchSize = Cfg["BatchSize"]
    EmbeddingSize = Cfg["EmbeddingSize"]
    HeadNum = Cfg["HeadNum"]
    EnLayer = Cfg["EnLayer"]
    DeLayer = Cfg["DeLayer"]

    # The source side needs sentences plus lengths; the target side only
    # needs its vocabulary.
    SrcIndSentences, SrcLength, SrcDict = DLoad.LoadData(
        "Data/test.sents", "Data/src.vocab", MaxLength)
    TgtDict = DLoad.LoadVocabulary("Data/tgt.vocab")

    BatchDatas = DLoad.TestDataLoaderCreator(
        DLoad.TestCorpusDataset(SrcIndSentences, SrcLength), BatchSize)

    print("Building Model")
    Trans = TransformerNMTModel(
        HeadNum, EmbeddingSize,
        SrcDict.VocabularySize(), TgtDict.VocabularySize(),
        MaxLength, EnLayer, DeLayer)
    print("Model building finished")

    return Trans, BatchDatas, SrcDict, TgtDict, MaxLength
def Test17():
    """Load the source corpus via ``DL.LoadData`` and print every indexed
    sentence, then every sentence length.  Prints to stdout; returns None.
    """
    IndSentences, Lengths, _Vocab = DL.LoadData("src.sents", "src.vocab", 30)
    for Sentence in IndSentences:
        print(Sentence)
    for SentLen in Lengths:
        print(SentLen)