예제 #1
0
def compare(gdb,columnX,columnY,GeValueX,Vlist,Plist,VPmaplist):
    '''
    Merge the partition of GeValueX with the partitions of the GeValues of
    columnY that co-occur with it in the data.

    For every row whose columnX value is GeValueX.value or one of the values
    returned by getDescendantNodes for its node, each GeValue of columnY whose
    node height is not greater than the height of that row's columnY value is
    put into a freshly created partition together with GeValueX.

    Side effects: rebinds the module globals V, P and VPmap to the arguments
    and mutates Plist and VPmaplist in place. Reads the module global geColumn.

    :param gdb: table read as gdb.ix[row][column]; len(gdb) is used as the row count
    :param columnX: key of the left-hand column in geColumn
    :param columnY: key of the right-hand column in geColumn and in Vlist
    :param GeValueX: object with .node and .value attributes; must be a key of VPmaplist
    :param Vlist: mapping column -> iterable of GeValue objects
    :param Plist: list of partitions
    :param VPmaplist: mapping GeValue -> partition
    :return: nothing; the visible body has no return statement
    '''
    # NOTE(review): rsl is never used in the visible body.
    rsl=[]
    df=gdb
    # Publish the arguments as module globals (presumably for helpers defined
    # elsewhere -- confirm against the rest of the file).
    global V,P,VPmap
    V=Vlist
    P=Plist
    VPmap=VPmaplist
    root=geColumn[columnX]
    Xnode=GeValueX.node
    xspar = VPmap[GeValueX]   # partition currently holding GeValueX
    delist = [GeValueX.value]
    delist = getDescendantNodes(root, Xnode, delist)   # value list
    # print(delist)
    # NOTE(review): DataFrame.ix was removed in pandas 1.0 -- confirm the
    # pandas version this is expected to run on.
    for row in range(0,len(df)):
        xdata=df.ix[row][columnX]
        # print(xdata+'++++++')
        if xdata in delist:
            ydata=df.ix[row][columnY]
            rooty=geColumn[columnY]
            heighty = findNodes(rooty, ydata).getHeight()
            # print(ydata)
            # print(heighty)
            for GeValueY in V[columnY]:   # find the equal one in right side
                # print(GeValueY.value)
                # if GeValueY.value==ydata:
                # print(findNodes(rooty,GeValueY.value).getHeight())
                # Only the heights are compared here; the equality test above
                # is commented out.
                if findNodes(rooty,GeValueY.value).getHeight()<=heighty:
                    yspar=VPmap[GeValueY]
                    # print(yspar)
                    # New, initially empty partition that replaces xspar and yspar.
                    p=Partition([])
                    # print(VPmap[GeValueX],VPmap[GeValueY])
                    if xspar in P:
                        # Re-point every GeValue mapped to xspar at p, then retire xspar.
                        GVS = getGeValueAcordPar(VPmap, xspar)
                        for gs in GVS:
                            VPmap[gs]=p
                        P.remove(xspar)
                    if yspar in P:
                        # Same for yspar; if yspar was xspar it has already been
                        # removed above and this branch is skipped.
                        GVS = getGeValueAcordPar(VPmap, yspar)
                        # print(GVS)
                        for gs in GVS:
                            VPmap[gs] = p
                        P.remove(yspar)

                    # Refresh xspar so the next iteration sees the current mapping.
                    xspar=VPmap[GeValueX]
                    # print(VPmap[GeValueX], VPmap[GeValueY])
                    P.append(p)
예제 #2
0
 def __init__(self, value, row, col, root):
     """Store the cell's value and position and resolve its ontology node.

     :param value: the cell value; looked up with findNodes(root, value)
     :param row: row of the cell
     :param col: column of the cell
     :param root: root passed to findNodes for the lookup
     """
     # Tuple assignment binds the attributes left to right.
     self.value, self.row, self.col, self.root = value, row, col, root
     self.node = findNodes(root, value)
     # Height as reported by the resolved node.
     self.height = self.node.getHeight()
예제 #3
0
 def __init__(self, value, row, col, root):
     """Store the cell's value and position, resolve its node and start a
     partition that contains only this object.

     :param value: the cell value; looked up with findNodes(root, value)
     :param row: row of the cell
     :param col: column of the cell
     :param root: root passed to findNodes for the lookup
     """
     self.value = value
     self.row = row
     self.col = col
     self.root = root
     self.node = findNodes(root, value)
     # Read the height from this instance's node. The bare name `node` is not
     # a local of this method, so it raised NameError or silently picked up an
     # unrelated module-level variable.
     self.height = self.node.getHeight()
     # Every new object begins in its own single-member partition.
     self.partition = Partition([self])
예제 #4
0
def sp_g(value, root):
    '''
    Specificity Penalty for Ontology attribute

    A leaf value costs 0. Any other value costs the natural logarithm of the
    number of entries returned by getDescendantNodes for its node.

    :param value: the value of one tuple's one attribute
    :param root: attribute's tree's root
    :return: 0 for a leaf, otherwise math.log of the descendant count
    '''
    node = findNodes(root, value)
    # isLeaf() is compared against the string 'true', as elsewhere in this file.
    if node.isLeaf() == 'true':
        # The original only returned from the non-leaf branch, so leaves
        # yielded None instead of 0.
        return 0

    # Start from an empty accumulator, as the original did.
    descendants = getDescendantNodes(root, node, [])
    return math.log(len(descendants))
예제 #5
0
def cp_g(value, root):
    '''
    Certainty Penalty for Ontology attribute

    A leaf value costs 0. Any other value costs the number of entries returned
    by getDescendantNodes for its node divided by getDomainsize(root).

    :param value: the value of one tuple's one attribute
    :param root: attribute's tree's root
    :return: 0 for a leaf, otherwise descendant count / domain size
    '''
    node = findNodes(root, value)
    # isLeaf() is compared against the string 'true', as elsewhere in this file.
    if node.isLeaf() == 'true':
        # The original only returned from the non-leaf branch, so leaves
        # yielded None instead of 0.
        return 0

    # Start from an empty accumulator, as the original did.
    descendants = getDescendantNodes(root, node, [])
    size = len(descendants)
    domain = getDomainsize(root)
    return size / domain
예제 #6
0
def doPartition(gex, gey, V, df, dic):
    '''
    Merge the partitions of left-hand GeValues with the partitions of the
    right-hand GeValues found on the same row's upper chain.

    :param gex: iterable of 'column*name' strings for the left side, e.g. '1*age'
    :param gey: iterable of 'column*name' strings for the right side, e.g. '3*location'
    :param V: mapping column -> rank -> collection of GeValue objects
    :param df: table read as df.ix[row][column]
    :param dic: mapping name -> tree root
    :return: V, whose GeValue objects have had their .partition rebound
    '''
    left_roots = {}
    for spec in gex:  # e.g. '1*age'
        parts = spec.split('*')
        left_roots[int(parts[0])] = dic[parts[1]]

    right_roots = {}
    for spec in gey:  # e.g. '3*location'
        parts = spec.split('*')
        right_roots[int(parts[0])] = dic[parts[1]]

    for x_col in left_roots:
        # Only the rank returned by maxV is used; ranks are walked up to 5.
        _, rank = maxV(V[x_col])
        while rank <= 5:
            for gx in V[x_col][rank]:
                row = gx.row
                for y_col in right_roots:
                    y_value = df.ix[row][y_col]
                    y_root = right_roots[y_col]
                    y_node = findNodes(y_root, y_value)
                    upper = getUpper(y_root, y_node, [])
                    for level in V[y_col]:
                        if len(V[y_col][level]) == 0:
                            # nothing stored at this level
                            continue
                        for gy in V[y_col][level]:
                            if gy.value not in upper:
                                continue
                            merged = merge(gx.partition, gy.partition)
                            gx.partition = gy.partition = merged
                            # Re-point every member at the merged partition.
                            for member in merged.list:
                                member.partition = merged
            rank += 1
    return V
예제 #7
0
def datadic_fd(MasterData, X, Y):
    '''
    Build a lookup from the concatenated left-hand values of each record to
    the list of its right-hand values.

    :param MasterData: indexable collection of records; read as MasterData[i]
    :param X: iterable of column indexes (anything int() accepts) for the left side
    :param Y: iterable of right-side entries, each 'index' or 'index*tree'
    :return: dict mapping the concatenated X values to the right-hand list of
        the last record seen with that key; nothing is stored when Y is empty
    '''
    datadic = {}
    for idx in range(len(MasterData)):
        record = MasterData[idx]

        # Left part of the FD: the X values joined with no separator.
        key = ''.join(str(record[int(x)]) for x in X)

        # Right part of the FD; a list because an ontology entry contributes
        # a whole family rather than a single value.
        rhs = []
        for spec in Y:
            parts = spec.split('*')
            if len(parts) <= 1:
                rhs.append(record[int(parts[0])])
            else:
                # 'index*tree' entry. The path is fixed, whatever parts[1] is.
                root = reJson('../data/lTest/city.json')
                # NOTE(review): ErrorDetect_perFD passes the tree root as the
                # first argument of getFamily; here it is the text after '*'
                # -- confirm which one is intended.
                rhs.append(getFamily(parts[1],
                                     findNodes(root, record[int(parts[0])])))
            # Stored on every pass through Y, as in the original.
            datadic[key] = rhs

    return datadic
예제 #8
0
def ErrorDetect_perFD(data, X, Y):
    '''
    Find the violations of one functional dependency (X -> Y).

    Idea of the computation: the X part of each tuple is concatenated into a
    str key. The first tuple seen for a key records, for every Y entry, the
    list of acceptable values; every later tuple with the same key is checked
    against those lists.

    :param data: list of tuples (rows indexable by column position)
    :param X: str, comma-separated column indexes of the left-hand side
    :param Y: str, comma-separated right-hand entries, each 'index' or
        'index*tree' for an ontology column
    :return: list of violations; a [key, allowed-values] entry is appended
        once per violated key, and the offending tuple is appended once per
        Y entry it violates
    '''
    from src.OntologyNode import getFamily
    from src.OntologyNode import findNodes
    from src.ReadJson import reJson

    print('*'*20+' FD ['+X+'--->'+Y+' ]'+'*'*20)
    x_cols, y_specs = X.split(','), Y.split(',')
    dic = {}  # key -> {position of Y entry -> list of acceptable values}
    violations = []

    for a in data:
        strX = ''.join(str(a[int(x)]) for x in x_cols)

        if strX not in dic:
            # First tuple with this key: it defines the acceptable values.
            y_dic = {}
            for pos, spec in enumerate(y_specs):
                parts = spec.split('*')
                if len(parts) > 1:
                    # Ontology column. The path is fixed, whatever parts[1] is.
                    root = reJson('../data/lTest/city.json')
                    y_dic[pos] = getFamily(root,
                                           findNodes(root, a[int(parts[0])]))
                else:
                    # One fresh list per column. The original appended to a
                    # single list shared by every Y entry, so columns accepted
                    # each other's values and could mutate a family list.
                    y_dic[pos] = [a[int(parts[0])]]
            dic[strX] = y_dic
            continue

        # Key already known: compare each Y column with its recorded values.
        for pos, spec in enumerate(y_specs):
            col = int(spec.split('*')[0])
            if a[col] not in dic[strX][pos]:
                ori = [strX, dic[strX]]
                if ori not in violations:
                    violations.append(ori)

                violations.append(a)

    return violations
예제 #9
0
                    P.append(p)
# Module-level state for the partitioning run. df, age, location, department,
# findNodes, GeValue and Partition are defined outside this excerpt.
P=[]      # list of partitions

VPmap={}   # GeValue -----> Partition

columns=5  # NOTE(review): not used anywhere in the visible code
rows=len(df)
geColumn={1:age,3:location,4:department}    # column ---> root
V={1:[],3:[],4:[]}  #save every column's GeValue

# Initializing
# For every ontology column and every row, wrap each value whose node is not a
# leaf (isLeaf() compared against the string 'false') in a GeValue, record it
# under its column in V, and give it its own single-member partition.
# NOTE(review): DataFrame.ix was removed in pandas 1.0 -- confirm the pandas
# version this is expected to run on.
for y in geColumn:
    root=geColumn[y]
    for x in range(0,rows):
        value=df.ix[x][y]
        node=findNodes(root,value)
        if node.isLeaf()=='false':
            v=GeValue(value,x,y,root)
            V[y].append(v)
            p=Partition([v])
            P.append(p)
            VPmap[v]=p
# Traverse
# Debug output: the initial partitions followed by a separator line.
print(P)
print('--')


for f in fd:    # 0,1*age;3*location
    x=f
    y=fd[f]
    x=x.split(',')