def getTrainRawData(srcFolder, fileName, keyset):
    """Assemble the raw training data for a single input file.

    The input path is formed by plain string concatenation of
    ``srcFolder`` and ``fileName`` (no separator is inserted, so
    ``srcFolder`` has to carry its own trailing separator).

    Pipeline:
      1. ``readALTA2015Data`` reads the file and yields two values.
      2. ``queryBabelfySynsetID`` takes both and yields the per-document
         synset IDs plus the filtered index strings.
      3. ``queryBabelNetSimpleLemma`` takes the synset IDs and ``keyset``.

    :param srcFolder: prefix of the input path.
    :param fileName: name appended directly to ``srcFolder``.
    :param keyset: forwarded unchanged to ``queryBabelNetSimpleLemma``.
    :return: tuple ``(filtered index strings, result of
        queryBabelNetSimpleLemma)``.
    """
    dataPath = srcFolder + fileName
    indexStrings, stringIndices = readALTA2015Data(dataPath)

    # NOTE: as warned by the original author, the remote lookups below
    # count against the web service's query limits.
    synsetIds, filteredStrings = queryBabelfySynsetID(indexStrings, stringIndices)

    return filteredStrings, queryBabelNetSimpleLemma(synsetIds, keyset)
示例#2
0
            rawBnSets, indices, document)

        docIndexBabelSynsetID[d] = filteredBnSets
        docFilteredIndexString[d] = filteredStrings

    return docIndexBabelSynsetID, docFilteredIndexString


##
if __name__ == "__main__":
    from DocUtils.concaTextData import readALTA2015Data

    filepath = "/Users/spacegoing/百度云同步盘/macANU/" \
               "2cdSemester 2015/Document Analysis/sharedTask" \
               "/Code/pycharmVersion/Data/Test/Test.txt"
    docIndexString, docStringIndices = readALTA2015Data(filepath)
    rawDocBnSets = getDocBnSets(docStringIndices)

    # rawBnSets = rawDocBnSets[24]
    # indices = docStringIndices[24][1]
    # indexStrings = docStringIndices[24][0]
    # filteredBnSets, filteredStrings = filterRawBnSets(rawBnSets, indices, indexStrings)
    # print(filteredBnSets[:5])
    # print(filteredBnSets[-5:])

    docIndexBabelSynsetID, docFilteredIndexString = \
        getFilteredDocBnSets(docStringIndices, rawDocBnSets)

    import pickle

    babelfyData = [
示例#3
0
        docIndexBabelSynsetID[d] = filteredBnSets
        docFilteredIndexString[d] = filteredStrings

    return docIndexBabelSynsetID, docFilteredIndexString


##
if __name__ == "__main__":
    from DocUtils.concaTextData import readALTA2015Data

    filepath = (
        "/Users/spacegoing/百度云同步盘/macANU/"
        "2cdSemester 2015/Document Analysis/sharedTask"
        "/Code/pycharmVersion/Data/Test/Test.txt"
    )
    docIndexString, docStringIndices = readALTA2015Data(filepath)
    rawDocBnSets = getDocBnSets(docStringIndices)

    # rawBnSets = rawDocBnSets[24]
    # indices = docStringIndices[24][1]
    # indexStrings = docStringIndices[24][0]
    # filteredBnSets, filteredStrings = filterRawBnSets(rawBnSets, indices, indexStrings)
    # print(filteredBnSets[:5])
    # print(filteredBnSets[-5:])

    docIndexBabelSynsetID, docFilteredIndexString = getFilteredDocBnSets(docStringIndices, rawDocBnSets)

    import pickle

    babelfyData = [docIndexString, docStringIndices, docIndexBabelSynsetID, docFilteredIndexString]
    outpath = (