def compare(gdb,columnX,columnY,GeValueX,Vlist,Plist,VPmaplist):
    '''
    Re-partition the generalized values that co-occur with GeValueX.

    For every row of ``gdb`` whose ``columnX`` value is in the value list
    built from ``GeValueX`` (its own value plus whatever
    ``getDescendantNodes`` adds), the row's ``columnY`` value is looked up in
    the ``columnY`` tree.  Every GeValue of ``columnY`` whose node height is
    not greater than that row value's height then gets, together with
    ``GeValueX``, re-mapped in ``VPmap`` to a freshly created partition.

    Side effects: rebinds the module-level globals ``V``, ``P`` and ``VPmap``
    to the passed-in objects and mutates ``P`` and ``VPmap`` in place.
    No ``return`` statement is visible, so the call evaluates to None.

    NOTE(review): the nesting below was reconstructed from a source whose
    whitespace had been collapsed -- confirm against the original file.

    :param gdb: table to scan; indexed as ``df.ix[row][column]`` (presumably
        a pandas DataFrame -- confirm)
    :param columnX: key of the left-hand column in ``geColumn``
    :param columnY: key of the right-hand column in ``geColumn`` and ``V``
    :param GeValueX: generalized value providing ``.node`` and ``.value``
    :param Vlist: mapping column -> iterable of GeValue objects
    :param Plist: list of partitions
    :param VPmaplist: mapping GeValue -> partition
    :return: None
    '''
    rsl=[]  # never read in the visible code
    df=gdb
    global V,P,VPmap
    V=Vlist
    P=Plist
    VPmap=VPmaplist
    root=geColumn[columnX]
    Xnode=GeValueX.node
    xspar = VPmap[GeValueX]  # partition currently holding GeValueX
    delist = [GeValueX.value]
    delist = getDescendantNodes(root, Xnode, delist)  # value list
    for row in range(0,len(df)):
        # NOTE(review): DataFrame.ix was removed in pandas 1.0; if df is a
        # pandas DataFrame this only runs on older pandas -- confirm.
        xdata=df.ix[row][columnX]
        if xdata in delist:
            ydata=df.ix[row][columnY]
            rooty=geColumn[columnY]
            heighty = findNodes(rooty, ydata).getHeight()
            for GeValueY in V[columnY]:
                # Select the right-hand GeValues whose node height does not
                # exceed the height of this row's value.  (An earlier variant,
                # left commented out, matched only GeValueY.value == ydata.)
                if findNodes(rooty,GeValueY.value).getHeight()<=heighty:
                    yspar=VPmap[GeValueY]
                    p=Partition([])  # new partition that replaces xspar / yspar
                    if xspar in P:
                        # Re-point every GeValue mapped to xspar at p, then
                        # drop xspar from the partition list.
                        GVS = getGeValueAcordPar(VPmap, xspar)
                        for gs in GVS:
                            VPmap[gs]=p
                        P.remove(xspar)
                    if yspar in P:
                        # Same for yspar; skipped when yspar was xspar, since
                        # it has already been removed from P above.
                        GVS = getGeValueAcordPar(VPmap, yspar)
                        for gs in GVS:
                            VPmap[gs] = p
                        P.remove(yspar)
                    # Refresh the cached partition of GeValueX for the next
                    # iteration.
                    xspar=VPmap[GeValueX]
                    P.append(p)
def __init__(self, value, row, col, root):
    '''
    Record one cell value together with its position in the ontology tree.

    :param value: the cell value
    :param row: row the value was taken from
    :param col: column the value was taken from
    :param root: root of the column's tree, searched with ``findNodes``
    '''
    self.value, self.row, self.col, self.root = value, row, col, root
    # Resolve the tree node once and derive the height from that same node.
    located = findNodes(root, value)
    self.node = located
    self.height = located.getHeight()
def __init__(self, value, row, col, root):
    '''
    Record one cell value, its tree node and its own singleton partition.

    :param value: the cell value
    :param row: row the value was taken from
    :param col: column the value was taken from
    :param root: root of the column's tree, searched with ``findNodes``
    '''
    self.value = value
    self.row = row
    self.col = col
    self.root = root
    self.node = findNodes(root, value)
    # The height must come from the node just stored on the instance; the
    # bare name ``node`` is not defined in this method, so it raised
    # NameError (or silently picked up an unrelated module-level ``node``).
    self.height = self.node.getHeight()
    # Every value starts out in a partition that contains only itself.
    self.partition = Partition([self])
def sp_g(value, root):
    '''
    Specificity penalty of an ontology attribute value.

    :param value: the value of one attribute of one tuple
    :param root: root of that attribute's tree
    :return: 0 when the value's node reports itself as a leaf, otherwise the
        natural logarithm of the number of entries returned by
        ``getDescendantNodes`` for that node
    '''
    target = findNodes(root, value)
    # isLeaf() answers with the string 'true', not a bool.
    if target.isLeaf() == 'true':
        return 0
    below = getDescendantNodes(root, target, [])
    return math.log(len(below))
def cp_g(value, root):
    '''
    Certainty penalty of an ontology attribute value.

    :param value: the value of one attribute of one tuple
    :param root: root of that attribute's tree
    :return: 0 when the value's node reports itself as a leaf, otherwise the
        number of entries returned by ``getDescendantNodes`` for that node
        divided by ``getDomainsize(root)``
    '''
    target = findNodes(root, value)
    # isLeaf() answers with the string 'true', not a bool.
    if target.isLeaf() == 'true':
        return 0
    below = getDescendantNodes(root, target, [])
    covered = len(below)
    return covered / getDomainsize(root)
def doPartition(gex, gey, V, df, dic):
    '''
    Merge the partitions of left-hand GeValues with those of the right-hand
    GeValues that lie on the upper path of the same row's value.

    ``gex`` / ``gey`` entries have the form ``'<column>*<tree name>'``; the
    tree name is resolved to a tree root through ``dic``.  For every
    left-hand column the ranks from the one returned by ``maxV`` up to 5 are
    visited; each GeValue ``mx`` at that rank is merged (via ``merge``) with
    every right-hand GeValue whose value is contained in what ``getUpper``
    returns for the right-hand cell of ``mx``'s row.

    NOTE(review): the nesting below (in particular the position of the
    ``rank`` increment) was reconstructed from a source whose whitespace had
    been collapsed -- confirm against the original file.

    :param gex: iterable of 'column*tree' strings for the left-hand side
    :param gey: iterable of 'column*tree' strings for the right-hand side
    :param V: nested mapping, indexed as ``V[column][rank]`` -> GeValues
    :param df: table indexed as ``df.ix[row][column]`` (presumably a pandas
        DataFrame -- confirm)
    :param dic: mapping tree name -> tree root
    :return: the same ``V`` object that was passed in
    '''
    xlist = {}  # left-hand column index -> tree root
    ylist = {}  # right-hand column index -> tree root
    for x in gex:
        x = x.split('*')
        xlist[int(x[0])] = dic[x[1]]  # e.g. ['1*age']
    for y in gey:
        y = y.split('*')
        ylist[int(y[0])] = dic[y[1]]  # e.g. ['3*location']
    for xx in xlist:  # e.g. 1:age
        # Only the rank is used from here on; ``max`` is overwritten at the
        # top of the loop below (and shadows the builtin inside this function).
        max, rank = maxV(V[xx])
        while rank <= 5:
            max = V[xx][rank]
            for mx in max:  # each item is a GeValue object
                row = mx.row
                for yy in ylist:  # e.g. 3:location
                    # NOTE(review): DataFrame.ix was removed in pandas 1.0 --
                    # confirm the pandas version this runs against.
                    val_y = df.ix[row][yy]
                    root = ylist[yy]
                    node = findNodes(root, val_y)
                    upper = []
                    upper = getUpper(root, node, upper)
                    for vy in V[yy]:  # vy=1,2,3,4,5
                        if len(V[yy] [vy]) > 0:  # current level is not an empty list
                            for v in V[yy][vy]:  # GeValue v
                                if v.value in upper:
                                    p = merge(mx.partition, v.partition)
                                    mx.partition = p
                                    v.partition = p
                                    # Point every member of the merged
                                    # partition at the merged partition.
                                    for gv in p.list:
                                        gv.partition = p
            rank = rank + 1
    return V
def datadic_fd(MasterData, X, Y):
    '''
    Build the lookup dictionary of one FD (X -> Y) from the master data.

    The X columns of each record are concatenated (via ``str``) into the key;
    the value is a list with one entry per Y attribute.  A plain attribute
    ``'col'`` contributes the record's own value; an ontology attribute
    ``'col*tree'`` contributes whatever ``getFamily`` returns for the node of
    the record's value.  Records with the same key overwrite each other, so
    the last one wins.

    :param MasterData: iterable of records indexable by integer column
    :param X: iterable of column indexes (as strings or ints) of the left part
    :param Y: iterable of strings, each ``'col'`` or ``'col*tree'``
    :return: dict mapping the concatenated X key to the list of Y entries
    '''
    datadic = {}
    # Tree roots, loaded on first use and kept per tree name, so the JSON
    # file is not re-read for every record and every attribute.
    roots = {}
    for record in MasterData:
        key = ''.join(str(record[int(x)]) for x in X)  # left part of the fd
        rhs = []  # right part of the fd; a list because ontology entries are lists
        for y in Y:
            spec = y.split('*')
            column = int(spec[0])
            if len(spec) > 1:  # pattern of y*tree
                tree_name = spec[1]
                if tree_name not in roots:
                    # The path is currently fixed; it is meant to depend on
                    # the tree name eventually.
                    roots[tree_name] = reJson('../data/lTest/city.json')
                node = findNodes(roots[tree_name], record[column])
                # NOTE(review): the tree *name* is passed as first argument
                # here, while ErrorDetect_perFD passes the tree root --
                # confirm which one getFamily expects.
                rhs.append(getFamily(tree_name, node))
            else:
                rhs.append(record[column])
        datadic[key] = rhs
    return datadic
def ErrorDetect_perFD(data, X, Y):
    '''
    Find the tuples of ``data`` that violate one FD (X -> Y).

    The X columns of every tuple are concatenated (via ``str``) into a key.
    The first tuple seen for a key fixes, per Y attribute, the list of
    accepted values: only the tuple's own value for a plain attribute
    ``col``, or whatever ``getFamily`` returns for the value's tree node for
    an ontology attribute ``col*tree``.  Each later tuple with the same key is
    checked attribute by attribute against those lists.

    :param data: sequence of tuples indexable by integer column
    :param X: str, comma-separated column indexes of the left-hand side
    :param Y: str, comma-separated right-hand attributes (``col`` or
        ``col*tree``)
    :return: list of violations.  For every violated attribute of a tuple the
        entry ``[key, accepted]`` (``accepted`` maps attribute position ->
        accepted values) is added unless already present, followed by the
        tuple itself; a tuple violating several attributes therefore appears
        once per violated attribute.
    '''
    print('*'*20+' FD ['+X+'--->'+Y+' ]'+'*'*20)
    lhs_columns = X.split(',')
    rhs_specs = [y.split('*') for y in Y.split(',')]

    ontology_root = None  # loaded on first ontology attribute, then reused
    accepted_by_key = {}
    violations = []
    for row in data:
        key = ''.join(str(row[int(x)]) for x in lhs_columns)
        if key not in accepted_by_key:
            accepted = {}
            for pos, spec in enumerate(rhs_specs):
                column = int(spec[0])
                if len(spec) > 1:
                    if ontology_root is None:
                        # Project imports are only needed for ontology
                        # attributes, so they are deferred to this point.
                        from src.OntologyNode import findNodes, getFamily
                        from src.ReadJson import reJson
                        ontology_root = reJson('../data/lTest/city.json')
                    accepted[pos] = getFamily(
                        ontology_root, findNodes(ontology_root, row[column]))
                else:
                    # A fresh list per attribute: sharing one list across
                    # attributes made every attribute accept the values of
                    # all the others and hid violations.
                    accepted[pos] = [row[column]]
            accepted_by_key[key] = accepted
        else:
            accepted = accepted_by_key[key]
            for pos, spec in enumerate(rhs_specs):
                if row[int(spec[0])] not in accepted[pos]:
                    witness = [key, accepted]
                    if witness not in violations:
                        violations.append(witness)
                    violations.append(row)
    return violations
P.append(p) P=[] # list of partitions VPmap={} # GeValue -----> Partition columns=5 rows=len(df) geColumn={1:age,3:location,4:department} # column ---> root V={1:[],3:[],4:[]} #save every column's GeValue # Initializing for y in geColumn: root=geColumn[y] for x in range(0,rows): value=df.ix[x][y] node=findNodes(root,value) if node.isLeaf()=='false': v=GeValue(value,x,y,root) V[y].append(v) p=Partition([v]) P.append(p) VPmap[v]=p # Traverse print(P) print('--') for f in fd: # 0,1*age;3*location x=f y=fd[f] x=x.split(',')