예제 #1
0
def testing_whole_family(outputPath=None, wsChildrenDic=None, word2ballDic=None, outputBallFile=None):
    """
    Verify the trained balls against the tree and, on success, write the merged ball file.

    The balls are loaded from ``outputPath`` with ``load_balls``. Every child of
    ``'*root*'`` is then passed to ``check_P_for_child_parent_in_one_family``, and
    ``'*root*'`` itself is passed to ``check_DC_for_sibilings_in_one_family``.
    Both failure lists are printed. ``merge_balls_into_file`` is called only when
    both lists are empty; otherwise a failure message is printed.

    :param outputPath: location of the ball files; forwarded to ``load_balls``
        (as ``ipath``), to the P check (as ``ballPath``) and to
        ``merge_balls_into_file`` (as ``ipath``)
    :param wsChildrenDic: dictionary forwarded to ``get_children`` and to both
        checks; a fresh empty dict is used when omitted
    :param word2ballDic: dictionary forwarded to ``load_balls`` and replaced by
        the dictionary it returns; a fresh empty dict is used when omitted
    :param outputBallFile: forwarded to ``merge_balls_into_file`` as ``outfile``
    :return: None
    """
    # A ``dict()`` default is built once at definition time and shared by every
    # call, so balls loaded in one call would leak into the next. Build fresh
    # dictionaries per call instead.
    wsChildrenDic = {} if wsChildrenDic is None else wsChildrenDic
    word2ballDic = {} if word2ballDic is None else word2ballDic

    print("checking whether the tree structure is perfectly encoded in nball embeddings...\n")
    failed_P, failed_DC = [], []
    # load_balls returns three values; only the ball dictionary is needed here.
    _, _, word2ballDic = load_balls(ipath=outputPath, word2ballDic=word2ballDic)

    # P check: run once per first-level family.
    for froot in get_children('*root*', wsChildrenDic=wsChildrenDic):
        failed_P += check_P_for_child_parent_in_one_family(froot,
                                                           wsChildrenDic=wsChildrenDic,
                                                           word2ballDic=word2ballDic,
                                                           ballPath=outputPath)

    # DC check: run once, starting from the top of the tree.
    failed_DC += check_DC_for_sibilings_in_one_family(root='*root*', wsChildrenDic=wsChildrenDic,
                                                      word2ballDic=word2ballDic)
    print("failed families with P", failed_P)
    print("failed families with DC", failed_DC)
    if not failed_P and not failed_DC:
        print("the tree structure is perfectly encoded in nball embeddings.\n")
        print("generating nball embedding file...\n")
        merge_balls_into_file(ipath=outputPath, outfile=outputBallFile)
    else:
        print("the tree structure is NOT perfectly encoded in nball embeddings.\n")
        print("try again, or contact the author")
예제 #2
0
def train_word2ball(root="", outputPath='', logFile='', wsChildrenDic=None,
                    word2ballDic=None, word2vecDic=None, outputPathBack=None,
                    wscatCodeDic=None, outputBallFile=None):
    """
    Run the whole pipeline: train all families, post-process the balls, then verify them.

    Steps, in order:

    1. ``training_all_families`` is called for ``root``.
    2. If ``outputPathBack`` is truthy, ``outputPath`` is copied to it with
       ``copy_tree``. This happens before ``fix_dim`` and
       ``make_DC_for_first_level_children`` run.
    3. ``load_balls`` reads the balls from ``outputPath``; the sizes it returns
       are passed to ``fix_dim``, which reads from and writes to ``outputPath``.
    4. ``make_DC_for_first_level_children`` is called with ``'entity.n.01'`` as
       ``firstChild``.
    5. ``testing_whole_family`` is called; ``word2ballDic`` is not forwarded to it.

    :param root: root node whose families are trained
    :param outputPath: location of the ball files, used by every step
    :param logFile: log file forwarded to ``training_all_families`` and
        ``make_DC_for_first_level_children``
    :param wsChildrenDic: dictionary forwarded to the training, DC and testing
        steps; a fresh empty dict is used when omitted
    :param word2ballDic: ball dictionary forwarded to the training and DC steps;
        a fresh empty dict is used when omitted
    :param word2vecDic: dictionary forwarded to ``training_all_families``;
        a fresh empty dict is used when omitted
    :param outputPathBack: optional backup location for ``outputPath``; no copy
        is made when it is ``None`` or empty
    :param wscatCodeDic: dictionary forwarded to ``training_all_families``;
        a fresh empty dict is used when omitted
    :param outputBallFile: forwarded to ``testing_whole_family``
    :return: None
    """
    # ``dict()`` defaults are created once and shared between calls; these
    # dictionaries are passed to training code that updates them, so every call
    # gets its own containers.
    wsChildrenDic = {} if wsChildrenDic is None else wsChildrenDic
    word2ballDic = {} if word2ballDic is None else word2ballDic
    word2vecDic = {} if word2vecDic is None else word2vecDic
    wscatCodeDic = {} if wscatCodeDic is None else wscatCodeDic

    training_all_families(root=root, wsChildrenDic=wsChildrenDic, word2vecDic=word2vecDic,
                          wscatCodeDic=wscatCodeDic, word2ballDic=word2ballDic,
                          outputPath=outputPath, logFile=logFile)
    if outputPathBack:
        # Back up the trained balls before the post-processing steps below.
        copy_tree(outputPath, outputPathBack)
    maxsize, mindim, word2ballDic = load_balls(ipath=outputPath, word2ballDic=word2ballDic)
    fix_dim(maxsize, mindim, bPath=outputPath, outputPath=outputPath)
    make_DC_for_first_level_children(root=root, firstChild='entity.n.01', wsChildrenDic=wsChildrenDic,
                                     word2ballDic=word2ballDic, outputPath=outputPath,
                                     maxsize=maxsize, mindim=mindim, logFile=logFile)

    testing_whole_family(outputPath=outputPath, wsChildrenDic=wsChildrenDic, outputBallFile=outputBallFile)
예제 #3
0
def training_all_families(root="*root*", wsChildrenDic=None, word2vecDic=None, wscatCodeDic=None,
                          word2ballDic=None,
                          outputPath=None, logFile=None, checking=False):
    """
    Train one family per child of ``root`` and optionally check the result.

    The children of ``root`` are sorted in ascending order of the dot product
    between their vector and the vector of ``'entity.n.01'``. Children are
    taken from the end of that list, so the most similar one is trained first.
    After each family, the remaining children are re-sorted by similarity to
    the child just trained.

    For every family an ``addDim`` list of length ``DIM`` (read from module
    scope) is built: the binary digits of ``N`` are truncated or zero-padded to
    ``DIM`` entries, mapped to +1/-1 and multiplied by 512. ``N`` is computed
    once from the number of children, before the loop.

    ``logFile`` is truncated at the start. One line per family is appended to
    it, holding the child name, the ``addDim`` values and a UTC timestamp.

    :param root: node whose children are trained
    :param wsChildrenDic: dictionary forwarded to ``get_children``,
        ``training_one_family`` and the checks; fresh empty dict when omitted
    :param word2vecDic: dictionary forwarded to ``get_word2vector`` and
        ``training_one_family``; fresh empty dict when omitted
    :param wscatCodeDic: dictionary forwarded to ``training_one_family``;
        fresh empty dict when omitted
    :param word2ballDic: ball dictionary; replaced after every family by the
        return value of ``training_one_family``; fresh empty dict when omitted
    :param outputPath: forwarded to ``training_one_family`` and, when
        ``checking`` is set, to ``load_balls`` and the P check
    :param logFile: path of the log file
    :param checking: when true, reload the balls after training, run the P and
        DC checks for every child of ``root`` and print the failures
    :return: the ball dictionary after all families were trained (and reloaded,
        if ``checking`` is set)
    """
    # ``dict()`` defaults are created once and shared across calls; word2ballDic
    # in particular is updated and returned, so use per-call containers.
    wsChildrenDic = {} if wsChildrenDic is None else wsChildrenDic
    word2vecDic = {} if word2vecDic is None else word2vecDic
    wscatCodeDic = {} if wscatCodeDic is None else wscatCodeDic
    word2ballDic = {} if word2ballDic is None else word2ballDic

    children = get_children(root, wsChildrenDic=wsChildrenDic)
    child0 = 'entity.n.01'
    children = sorted(children, key=lambda ele: np.dot(get_word2vector(child0, word2vecDic=word2vecDic),
                                                       get_word2vector(ele, word2vecDic=word2vecDic)))
    print(children)
    # log(0) is -inf and int(-inf) raises OverflowError. With no children the
    # loop below never runs, so N is irrelevant in that case.
    N = int(np.ceil(np.log(len(children)))) if children else 0
    # Truncate (or create) the log file; the ``with`` block closes the handle
    # right away instead of leaking it.
    with open(logFile, 'w'):
        pass
    k = 512
    while children:
        # The list is sorted ascending, so pop() yields the most similar child.
        child = children.pop()
        # Binary digits of N as characters, cut to at most DIM entries ...
        addDim0 = list(bin(N))[2:][:DIM]
        # ... and padded with zeros up to exactly DIM entries.
        if len(addDim0) < DIM:
            addDim0 += [0] * (DIM - len(addDim0))
        # Map digit 1 -> +k and digit 0 -> -k.
        addDim = [(int(ele) * 2 - 1) * k for ele in addDim0]
        print("***", child)
        with open(logFile, 'a+') as wlog:
            wlog.write(" ".join([str(ele) for ele in [child]
                                 + addDim
                                 + [time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())]]))
            wlog.write("\n")
        word2ballDic = training_one_family(root=child, addDim=addDim, wsChildrenDic=wsChildrenDic,
                                           word2vecDic=word2vecDic, wscatCodeDic=wscatCodeDic,
                                           word2ballDic=word2ballDic,
                                           outputPath=outputPath, logFile=logFile)
        # Re-order the remaining children by similarity to the family just trained.
        children = sorted(children, key=lambda ele: np.dot(get_word2vector(child, word2vecDic=word2vecDic),
                                                           get_word2vector(ele, word2vecDic=word2vecDic)))
    print("finished training of all families\n")

    if checking:
        print("checking each family\n")
        maxsize, mindim, word2ballDic = load_balls(ipath=outputPath, word2ballDic=word2ballDic)

        failed_P, failed_DC = [], []

        # Pass wsChildrenDic here as well, matching the lookup at the top of
        # this function; without it the lookup does not see the caller's tree.
        for child in get_children(root, wsChildrenDic=wsChildrenDic):
            failed_P += check_P_for_child_parent_in_one_family(child, word2ballDic=word2ballDic,
                                                               wsChildrenDic=wsChildrenDic, ballPath=outputPath)
            failed_DC += check_DC_for_sibilings_in_one_family(root=child, word2ballDic=word2ballDic,
                                                              wsChildrenDic=wsChildrenDic)
        print("failed families with P", failed_P)
        print("failed families with DC", failed_DC)
    return word2ballDic