def getTrainRawData(srcFolder, fileName, keyset):
    """Read an ALTA 2015 data file and collect its Babelfy/BabelNet lookups.

    The input path is built by plain string concatenation of ``srcFolder``
    and ``fileName``; no path separator is inserted between them.

    Args:
        srcFolder: Leading part of the path to the data file.
        fileName: Trailing part of the path, appended directly to
            ``srcFolder``.
        keyset: Passed through unchanged as the second argument of
            ``queryBabelNetSimpleLemma``.

    Returns:
        A 2-tuple ``(docFilteredIndexString, docIndexLangTrans)``: the second
        value returned by ``queryBabelfySynsetID`` followed by the result of
        ``queryBabelNetSimpleLemma``.
    """
    dataPath = srcFolder + fileName
    indexStrings, stringIndices = readALTA2015Data(dataPath)

    # NOTE: per the original author's warning, the two query calls below
    # count against the website query limits, so avoid needless re-runs.
    synsetIds, filteredStrings = queryBabelfySynsetID(indexStrings, stringIndices)
    langTrans = queryBabelNetSimpleLemma(synsetIds, keyset)

    return filteredStrings, langTrans
rawBnSets, indices, document) docIndexBabelSynsetID[d] = filteredBnSets docFilteredIndexString[d] = filteredStrings return docIndexBabelSynsetID, docFilteredIndexString ## if __name__ == "__main__": from DocUtils.concaTextData import readALTA2015Data filepath = "/Users/spacegoing/百度云同步盘/macANU/" \ "2cdSemester 2015/Document Analysis/sharedTask" \ "/Code/pycharmVersion/Data/Test/Test.txt" docIndexString, docStringIndices = readALTA2015Data(filepath) rawDocBnSets = getDocBnSets(docStringIndices) # rawBnSets = rawDocBnSets[24] # indices = docStringIndices[24][1] # indexStrings = docStringIndices[24][0] # filteredBnSets, filteredStrings = filterRawBnSets(rawBnSets, indices, indexStrings) # print(filteredBnSets[:5]) # print(filteredBnSets[-5:]) docIndexBabelSynsetID, docFilteredIndexString = \ getFilteredDocBnSets(docStringIndices, rawDocBnSets) import pickle babelfyData = [
docIndexBabelSynsetID[d] = filteredBnSets docFilteredIndexString[d] = filteredStrings return docIndexBabelSynsetID, docFilteredIndexString ## if __name__ == "__main__": from DocUtils.concaTextData import readALTA2015Data filepath = ( "/Users/spacegoing/百度云同步盘/macANU/" "2cdSemester 2015/Document Analysis/sharedTask" "/Code/pycharmVersion/Data/Test/Test.txt" ) docIndexString, docStringIndices = readALTA2015Data(filepath) rawDocBnSets = getDocBnSets(docStringIndices) # rawBnSets = rawDocBnSets[24] # indices = docStringIndices[24][1] # indexStrings = docStringIndices[24][0] # filteredBnSets, filteredStrings = filterRawBnSets(rawBnSets, indices, indexStrings) # print(filteredBnSets[:5]) # print(filteredBnSets[-5:]) docIndexBabelSynsetID, docFilteredIndexString = getFilteredDocBnSets(docStringIndices, rawDocBnSets) import pickle babelfyData = [docIndexString, docStringIndices, docIndexBabelSynsetID, docFilteredIndexString] outpath = (