Пример #1
0
    def lmsScore(self):
        """Collect, per column, the properties whose range and domain match known classes.

        For every name in ``self.columnNames`` the rows returned by
        ``sparqlQuerypy.findPropertyClassesThird`` are filtered and the
        survivors are stored in ``self.relationships[column]['lms']``, keyed by
        the row's ``'s'`` value:

        * a row is kept only when its range (``sparqlQuerypy.findRange``) or,
          if it has no range, the types of its objects
          (``sparqlQuerypy.findTypeOfObject``) overlap the classes that
          ``Neo4jDrive.findCCNodes`` returns for the column;
        * for each domain of a kept row (``sparqlQuerypy.findDomain``, falling
          back to ``sparqlQuerypy.findTypeOfSubject``) that is already in
          ``self.hypothesis`` or is one of ``Neo4jDrive.findAllCCNodes()``, the
          entry's ``'d'`` slot is set to ``{'name': domain}``; domains taken
          from the CC nodes are also added to ``self.hypothesis``.

        ``self.relationships`` and ``self.hypothesis`` are mutated in place;
        nothing is returned.
        """
        relationships = self.relationships
        hypothesis = self.hypothesis
        ccClasses = set(Neo4jDrive.findAllCCNodes())

        # Iterate the column names themselves: they are used as dictionary keys
        # and passed to the query helpers, so (index, name) tuples are wrong.
        for column in self.columnNames:
            rlist = sparqlQuerypy.findPropertyClassesThird(column)
            lms = {}
            # setdefault keeps this method usable before any other scorer has
            # created the per-column dictionary.
            relationships.setdefault(column, {})['lms'] = lms
            ccClassesOfColumn = set(Neo4jDrive.findCCNodes(column))

            for r in rlist:
                # presumably the property URI; it is the key of the 'lms' entry
                prop = r['s']['value']

                rangeList = sparqlQuerypy.findRange(prop)
                if rangeList:
                    targetClasses = set(rangeList)
                else:
                    # No declared range: fall back to the types of the objects.
                    # NOTE(review): assumes findTypeOfObject returns bindings
                    # with a 't' variable, like findTypeOfSubject below - confirm.
                    targetClasses = set(
                        k['t']['value']
                        for k in sparqlQuerypy.findTypeOfObject(r['t']['value'])
                    )
                if not (targetClasses & ccClassesOfColumn):
                    # Discard rows whose range (or object types) is not among
                    # the column's CC classes.
                    continue

                # NOTE(review): findRange is queried with the 's' value while
                # findDomain is queried with the 't' value - kept as in the
                # original, but worth confirming.
                domainList = sparqlQuerypy.findDomain(r['t']['value'])
                if not domainList:
                    # No declared domain: use the types of the subjects instead.
                    domainList = [
                        k['t']['value']
                        for k in sparqlQuerypy.findTypeOfSubject(prop)
                    ]

                for domain in domainList:
                    entry = lms.setdefault(prop, {})
                    if domain in hypothesis:
                        # NOTE(review): this tests the domain itself as a key
                        # although only 'd' is ever stored, so it is effectively
                        # always true; kept as in the original.
                        if domain not in entry:
                            entry['d'] = {'name': domain}
                    elif domain in ccClasses:
                        hypothesis.add(domain)
                        entry['d'] = {'name': domain}
Пример #2
0
 def run(self):
     """Link properties found for ``self.a`` to the domain classes that use them.

     Every row of ``sparqlQuerypy.findPropertyClassesFirst(self.a)`` that has
     no ``'r'`` binding is looked up with
     ``sparqlQuerypy.findPropertyClassesSecond``, restricted to the classes
     ``Neo4jDrive.findCCNodes(self.a)`` returns.  For each ``'d'`` value of
     that lookup that is in the module-level ``hypothesisSet``, and then for
     each one that is in ``self.allCC``, the property is registered through
     ``self.addProperty`` and a ``'d'`` relationship from the property to the
     class is inserted with ``Neo4jDrive.insertNodeAndRelationship``.

     A class present in both sets is therefore processed twice, as in the
     original implementation.  Nothing is returned.
     """
     rlist = sparqlQuerypy.findPropertyClassesFirst(self.a)

     for r in rlist:
         if u'r' in r:
             continue
         prop = r['p']['value']

         ccClasses = Neo4jDrive.findCCNodes(self.a)
         # join() yields "()" for an empty class list; the previous
         # slice-and-append construction produced an unbalanced ")".
         buildString = "(" + ",".join('<' + i + '>' for i in ccClasses) + ")"
         propertyUsage = sparqlQuerypy.findPropertyClassesSecond(prop, buildString)
         domains = set(k['d']['value'] for k in propertyUsage)

         # The two passes stay sequential so each intersection is computed at
         # the same point in time as before.
         for item in domains & hypothesisSet:
             self.addProperty(prop)
             Neo4jDrive.insertNodeAndRelationship(prop, 'd', item)
         for item in domains & set(self.allCC):
             self.addProperty(prop)
             Neo4jDrive.insertNodeAndRelationship(prop, 'd', item)
Пример #3
0
 def run(self):
     """Store the properties that connect ``self.label1`` and ``self.label2``.

     The rows of ``sparqlQuerypy.findProperty2(self.label1, self.label2)`` are
     handled in one of two ways:

     * a row with a ``'d'`` binding registers its property via
       ``self.addProperty`` and inserts a ``"domain"`` relationship from the
       property to that ``'d'`` value;
     * otherwise the property is looked up once per call with
       ``sparqlQuerypy.findPropertyClassesSecond``, restricted to the CC
       nodes of ``self.columnNames[self.perm_column]``.  Lookups returning
       15000 rows or more are skipped.  For every ``'r'`` value that is one
       of those CC nodes, the domain relationship is inserted, the node is
       marked ``hyp='yes'`` / ``type='cc'`` and ``self.incrementDms`` is
       called with the relationship.

     Progress is printed and written to the module-level ``log``.  Nothing is
     returned.
     """
     maxUsageRows = 15000

     rlist = sparqlQuerypy.findProperty2(self.label1, self.label2)
     print('------------------')
     log.write('----------------\n')
     log.write(str(datetime.datetime.now()) + '\n')
     log.write(self.label1 + self.label2)
     # Single-argument print produces the same output on Python 2 and 3.
     print('%s %s' % (self.label1, self.label2))

     seen = set()  # properties already looked up during this call
     for r in rlist:
         prop = r['p']['value']

         if u'd' in r:
             self.addProperty(prop)
             rel_data = Neo4jDrive.insertNodeAndRelationship(prop, "domain", r['d']['value'])[0]
             rel_data['name'] = 'domain'
             rel_data.push()
             continue

         if prop in seen:
             continue

         ccClasses = Neo4jDrive.findCCNodes(self.columnNames[self.perm_column])
         # join() yields "()" for an empty class list; the previous
         # slice-and-append construction produced an unbalanced ")".
         buildString = "(" + ",".join('<' + i + '>' for i in ccClasses) + ")"
         propertyUsage = sparqlQuerypy.findPropertyClassesSecond(prop, buildString)
         seen.add(prop)

         print('%s %s' % (len(propertyUsage), prop))
         if len(propertyUsage) >= maxUsageRows:
             continue

         for item in set(k['r']['value'] for k in propertyUsage) & set(ccClasses):
             self.addProperty(prop)
             rel_data = Neo4jDrive.insertNodeAndRelationship(prop, "domain", item)[0]
             rel_data['name'] = "domain"
             rel_data.push()
             node = Neo4jDrive.findNodeByName(item)
             node.properties['hyp'] = 'yes'
             node.properties['type'] = 'cc'
             node.push()
             # TODO: open question from the original author - what exactly is
             # scored on the property/domain link, and is the score per table
             # (columns in this table over total) or globally unique?
             self.incrementDms(rel_data)
Пример #4
0
    def dmsScore(self):
        """Score the properties linking every ordered pair of columns and upload them.

        Phase 1 - scoring.  For each ordered pair of distinct columns
        ``(column1, column2)`` and each row of ``self.data``, the two cell
        values are passed to ``sparqlQuerypy.findProperty2``.  Every returned
        property gets an entry in ``self.relationships[column1][column2]``.
        A property is credited when its row carries a ``'d'`` (domain)
        binding, or when ``sparqlQuerypy.findPropertyClassesSecond`` reports
        an ``'r'`` value among ``Neo4jDrive.findCCNodes(column2)``; the second
        lookup runs at most once per property per data row.  Crediting sets
        the entry's ``name``, ``count`` and ``dms`` (``count / self.totalSize``),
        records ``relationships[column2][prop]['name'] = 'cp'`` and records
        the domain under ``relationships[prop]``.

        Phase 2 - upload.  Every recorded property is written to Neo4j as a
        relationship ``column1 -[prop]-> column2`` carrying ``type``, ``name``
        and ``dms`` (0 when the property was never credited), plus a ``'cp'``
        relationship from ``column2`` to the property and one ``'domain'``
        relationship per recorded domain.

        ``self.relationships`` is mutated in place; nothing is returned.
        """
        data = self.data
        columnNames = self.columnNames
        totalSize = self.totalSize
        relationships = self.relationships
        bitmap = {}  # column1 -> column2 -> (cell1, cell2) -> property -> 0/1

        for i, column1 in enumerate(columnNames):
            relationships.setdefault(column1, {})
            bitmap.setdefault(column1, {})
            for j, column2 in enumerate(columnNames):
                relationships.setdefault(column2, {})
                if i == j:
                    continue
                for element in data:
                    # Single-argument prints give the same output on Python 2 and 3.
                    print('--------------------')
                    print('%s --> %s' % (element[i], element[j]))
                    item = (element[i], element[j])
                    rlist = sparqlQuerypy.findProperty2(element[i].strip(), element[j].strip())
                    cache = set()  # properties already looked up for this data row
                    for r in rlist:
                        prop = r['p']['value']
                        pairRels = relationships[column1].setdefault(column2, {})
                        itemBits = bitmap[column1].setdefault(column2, {}).setdefault(item, {})
                        # NOTE(review): the flag is reset for every result row,
                        # so it only de-duplicates the domains of one row, not
                        # repeated rows for the same property. Kept as in the
                        # original.
                        itemBits[prop] = 0
                        entry = pairRels.setdefault(prop, {})

                        if u'd' in r:
                            print('u d is in r.keys()')
                            domains = [r['d']['value']]
                            fromLookup = False
                        elif prop in cache:
                            continue
                        else:
                            ccClasses = Neo4jDrive.findCCNodes(column2)
                            # join() yields "()" for an empty class list; the
                            # previous slice-and-append construction produced
                            # an unbalanced ")".
                            buildString = "(" + ",".join('<' + ii + '>' for ii in ccClasses) + ")"
                            propertyUsage = sparqlQuerypy.findPropertyClassesSecond(prop, buildString)
                            cache.add(prop)
                            domains = set(k['r']['value'] for k in propertyUsage) & set(ccClasses)
                            fromLookup = True

                        for domain in domains:
                            entry['name'] = 'property'
                            # NOTE(review): count starts at 1.0 and is bumped
                            # right away, so a property seen once ends at 2.0.
                            # Kept as in the original - confirm intent.
                            if 'count' not in entry:
                                entry['count'] = 1.0
                            if fromLookup:
                                print("item and r['p']['value'], is %s %s" % (item, prop))
                            if itemBits[prop] == 0:
                                entry['count'] += 1
                                itemBits[prop] = 1
                            print(entry['count'])
                            entry['dms'] = entry['count'] / totalSize
                            relationships[column2].setdefault(prop, {})['name'] = 'cp'
                            # The original tested the cell pair `item` here,
                            # which is never a key; the domain is what is stored.
                            relationships.setdefault(prop, {}).setdefault(domain, {'name': 'domain'})
                            # TODO: add to hypothesis

                # The per-pair flags are no longer needed once the pair is done.
                bitmap[column1][column2] = None

        # ----------------- Uploading to Neo4j ----------------- #
        for column1 in columnNames:
            for column2 in columnNames:
                if column1 == column2 or column2 not in relationships[column1]:
                    continue
                for rel, info in relationships[column1][column2].items():
                    rel_data = Neo4jDrive.insertNodeAndRelationship(column1, rel, column2)[0]
                    rel_data.properties['type'] = 'property'
                    rel_data.properties['name'] = rel
                    rel_data.properties['dms'] = info.get('dms', 0)
                    rel_data.push()

                    rel_data = Neo4jDrive.insertNodeAndRelationship(column2, 'cp', rel)[0]
                    rel_data.properties['type'] = 'cp'
                    rel_data.push()

                    # A property that was never credited has no domain entry;
                    # indexing relationships[rel] directly raised KeyError.
                    for domain in relationships.get(rel, {}):
                        rel_data = Neo4jDrive.insertNodeAndRelationship(rel, 'domain', domain)[0]
                        rel_data.properties['type'] = 'domain'
                        rel_data.push()