Exemplo n.º 1
0
def main():
    """Register the CSV tables in Neo4j and pass their interleaved rows to ``run``.

    Every file in ``tables`` gets a node whose ``type`` property is
    ``'table'``. The rows returned by ``CSVRead.readCSV`` (minus the first
    entry) are shuffled, then consumed round-robin in batches of ``sample``
    rows per table. Each entry of ``data`` is ``[row, tableIndex]`` where
    ``tableIndex`` is the position of the originating file in ``tables``.

    Uses the free names ``sample``, ``run``, ``Neo4jDrive``, ``CSVRead`` and
    ``random``, which must be provided by the enclosing module.
    """
    tables=["StatesandCapitals.csv","RiversandSourceState.csv"]
    csvitems=[]
    size=[]

    for nameOfFile in tables:
        Neo4jDrive.insertNode(nameOfFile)
        node=Neo4jDrive.findNodeByName(nameOfFile)
        node.properties['type']='table'
        node.push()
        rows=CSVRead.readCSV(nameOfFile,firstRow=False, choice=[0,1])[1:]
        size.append(len(rows))
        random.shuffle(rows)
        csvitems.append(rows)

    data=[]
    k=0
    # Track the tables that still have rows by their ORIGINAL index. The
    # previous implementation removed exhausted tables from the list it was
    # enumerating, which skipped the following table's batch and shifted the
    # table index stored alongside every later row.
    active=list(range(len(csvitems)))
    while active:
        stillActive=[]
        for l in active:
            item=csvitems[l]
            data.extend([it,l] for it in item[k:k+sample])
            if k+sample<len(item):
                stillActive.append(l)
        active=stillActive
        k+=sample
    run(data,tables,size)
    def run(self):
        """Record 'cc' links between this column and the results found for its item.

        Reads ``support``, ``totalNumberOfValues``, ``column``, ``columnNames``
        and ``item`` from ``self``. For each result ``r`` of
        ``sparqlQuerypy.findBottomUp(item)`` it calls
        ``Neo4jDrive.insertNodeAndRelationship(columnNames[column], "cc", r[2])``,
        bumps the ``incoming`` counter on the node named ``r[2]`` and stores
        ``rel_class`` and ``support`` on the first returned relationship.
        """
        counts = self.support
        total = self.totalNumberOfValues

        colIndex = self.column
        names = self.columnNames
        value = self.item

        for match in sparqlQuerypy.findBottomUp(value):
            created = Neo4jDrive.insertNodeAndRelationship(
                names[colIndex], "cc", match[2])

            target = Neo4jDrive.findNodeByName(match[2])
            seen = target.properties['incoming']
            if seen == None:
                # First link reaching this node.
                target.properties['incoming'] = 1
            else:
                target.properties['incoming'] = seen + 1
            target.properties['type'] = 'type'
            target.push()

            link = created[0]
            link.properties['rel_class'] = 'cc'
            # "* 1.0" forces float division under Python 2.
            link.properties['support'] = counts[value] / (total * 1.0)
            link.push()
def main():
    """Insert the table and its columns, then run one ``itemThread`` per value.

    ``nameOfFile`` and ``itemThread`` are free names expected from the
    enclosing module. Threads are started and joined one at a time, so the
    items are processed sequentially.
    """
    Neo4jDrive.insertNode(nameOfFile)
    columnNames = CSVRead.readCSV(nameOfFile,
                                  firstRow=True,
                                  choice=[0, 1, 2, 3, 4])
    for name in columnNames:
        Neo4jDrive.insertNodeAndRelationship(nameOfFile, "Column", name)

    # Count the "Column" relationships currently attached to the table node.
    columnCount = 0
    for _ in Neo4jDrive.findRelationshipsOfNode(nameOfFile, "Column"):
        columnCount += 1

    for column in range(columnCount):
        support = CSVRead.getSupport(nameOfFile, column)
        totalNumberOfValues = CSVRead.numberOfItems(support)

        for item in support.keys():
            worker = itemThread(item, columnNames, column, support,
                                totalNumberOfValues)
            worker.start()
            worker.join()
Exemplo n.º 4
0
    def lmsScore(self):
        """Populate ``relationships[column]['lms']`` for every column.

        For each column, the results of
        ``sparqlQuerypy.findPropertyClassesThird(column)`` are filtered: a
        result is dropped when neither its range (``findRange``) nor, if it
        has no range, the type of its object (``findTypeOfObject``) overlaps
        the cc classes of the column. Surviving results get a ``'d'`` entry
        naming a domain that is either already in ``self.hypothesis`` or is a
        known cc class (in which case it is added to the hypothesis).

        Fixes over the previous revision: balanced the brackets in the
        ``findTypeOfObject`` and ``findTypeOfSubject`` expressions, used one
        spelling of ``ccClassesOfColumn``, iterated over the column names
        rather than ``enumerate`` tuples, and compared the size of the range
        intersection (a set never equals ``0``).
        """
        relationships=self.relationships
        # NOTE(review): the original referenced an unbound ``columnNames``;
        # sibling methods read it from ``self`` - confirm this is the right source.
        columnNames=self.columnNames
        ccClasses=set(Neo4jDrive.findAllCCNodes())
        hypothesis=self.hypothesis
        for column in columnNames:
            rlist=sparqlQuerypy.findPropertyClassesThird(column)
            relationships[column]['lms']={}
            lms=relationships[column]['lms']
            ccClassesOfColumn=set(Neo4jDrive.findCCNodes(column))
            for r in rlist:
                prop=r['s']['value']
                rangeList=sparqlQuerypy.findRange(prop)
                if len(rangeList)==0: #does not have range
                    # NOTE(review): assumes findTypeOfObject returns a single
                    # hashable value; if it returns a list, drop the inner [].
                    objTypeList=set([sparqlQuerypy.findTypeOfObject(r['t']['value'])])
                    if len(objTypeList & ccClassesOfColumn)==0:
                        continue #discard property if range(types of objects) don't exist in ccClasses.
                elif len(set(rangeList) & ccClassesOfColumn)==0:
                    continue #discard property if range(got through Sparql) doesn't exist in ccClasses.

                domainList=sparqlQuerypy.findDomain(r['t']['value'])
                if len(domainList)==0: #does not have a Domain
                    domainList=[k['t']['value'] for k in sparqlQuerypy.findTypeOfSubject(prop)]

                for domain in domainList:
                    if prop not in lms:
                        lms[prop]={}

                    if domain in hypothesis:
                        if domain not in lms[prop]:
                            lms[prop]['d']={'name':domain}
                    elif domain in ccClasses:
                        hypothesis.add(domain)
                        lms[prop]['d']={'name':domain}
Exemplo n.º 5
0
 def run(self):
     """Attach properties whose usage classes match known hypotheses or cc classes.

     For every result ``r`` of ``sparqlQuerypy.findPropertyClassesFirst(self.a)``
     that has no ``'r'`` key, the cc classes of ``self.a`` are formatted as a
     parenthesised, comma-separated list of ``<class>`` terms and passed to
     ``findPropertyClassesSecond``. Each returned ``'d'`` value that is in the
     module-level ``hypothesisSet`` or in ``self.allCC`` triggers
     ``self.addProperty`` and a ``'d'`` relationship from the property.
     """
     rlist=sparqlQuerypy.findPropertyClassesFirst(self.a)

     for r in rlist:
         if u'r' in r.keys():
             continue
         ccClasses=Neo4jDrive.findCCNodes(self.a)
         # join() keeps the parentheses balanced when there are no classes;
         # the previous slice-based build produced ")" in that case.
         buildString="("+",".join('<'+i+'>' for i in ccClasses)+")"
         propertyUsage=sparqlQuerypy.findPropertyClassesSecond(r['p']['value'],buildString)
         domains=set([k['d']['value'] for k in propertyUsage])

         # Domains that are already part of the hypothesis.
         for item in (domains & hypothesisSet):
             self.addProperty(r['p']['value'])
             Neo4jDrive.insertNodeAndRelationship(r['p']['value'], 'd', item)
         # Domains that are cc classes somewhere in the graph.
         for item in (domains & set(self.allCC)):
             self.addProperty(r['p']['value'])
             Neo4jDrive.insertNodeAndRelationship(r['p']['value'], 'd', item)
Exemplo n.º 6
0
def run(data,tables,size):
    """Push column nodes, then run the cc, dms and top-down threads in turn.

    data   -- sequence of entries where ``entry[0]`` is a row and ``entry[1]``
              is the index of the row's file in ``tables``.
    tables -- CSV file names.
    size   -- sequence indexed like ``tables``; ``size[i]`` is handed to the
              worker threads of table ``i``.

    Every worker thread is started and joined immediately, so the work is
    sequential.
    """
    columnNames=[]
    support=[]
    for nameOfFile in tables:
        names=[c.strip() for c in CSVRead.readCSV(nameOfFile,firstRow=True, choice=[0,1])]
        columnNames.append(names)
        fileSupport=[]
        support.append(fileSupport)
        for j,name in enumerate(names):
            link=Neo4jDrive.insertNodeAndRelationship(nameOfFile,"Column",name)[0]
            columnNode=Neo4jDrive.findNodeByName(name)
            columnNode.properties['type']='Column'
            columnNode.push()
            link.properties['type']="Column"
            link.push()
            fileSupport.append(CSVRead.getSupport(nameOfFile,j))

    # Result is not used below; the call is kept as the original made it
    # (against the last file of the loop above).
    CSVRead.getSize(nameOfFile,0)

    hyplock=Lock()
    stypelock=Lock()

    # Pass 1: one ccThread per cell.
    for itemPiece in data:
        fileIndex=itemPiece[1]
        row=itemPiece[0]
        names=columnNames[fileIndex]
        for column in range(len(names)):
            worker=ccThread(row[column],names,column,support[fileIndex],size[fileIndex])
            worker.start()
            worker.join()

    # Pass 2: one dmsThread per ordered pair of distinct columns in a row.
    for itemPiece in data:
        fileIndex=itemPiece[1]
        row=itemPiece[0]
        names=columnNames[fileIndex]
        width=len(names)
        for column in range(width):
            for perm_column in range(width):
                if perm_column==column:
                    continue
                worker=dmsThread(row[column],row[perm_column],size[fileIndex],names,column,perm_column)
                worker.start()
                worker.join()

    # Pass 3: one topDownThread per column name.
    allCC=set(Neo4jDrive.findAllCCNodes())
    for s,names in enumerate(columnNames):
        for column in names:
            worker=topDownThread(column,hyplock,stypelock,allCC,size[s])
            worker.start()
            worker.join()
Exemplo n.º 7
0
    def ccScores(self):
        data=self.data
        columnNames=self.columnNames
        totalSize=self.totalSize
        relationships=self.relationships
        size=len(data)
        bitmap={}
        for i,column in enumerate(columnNames):
            relationships[column]={}
            bitmap[column]={} #this is a dictionary which is a set of flags per data value remembering if the increment already happened.
            for element in data:
                item=element[i]
                rlist=sparqlQuerypy.findBottomUp(item.strip())
                print 'number of nodes for', item.strip(), " is ", len(rlist)
                bitmap[column][item]={}
                for r in rlist:
                    if r[0] not in bitmap[column][item].keys():
                        bitmap[column][item][r[0]]=0
                    if r[0] not in relationships[column].keys():
                        relationships[column][r[0]]={}
                    relationships[column][r[0]]['name']='cc'
                    if 'incoming' not in relationships[column][r[0]].keys():
                        relationships[column][r[0]]['incoming']=1
                        relationships[column][r[0]]['cc']=1.0/totalSize
                    else:
                        relationships[column][r[0]]['incoming']+=1 
                        relationships[column][r[0]]['cc']=relationships[column][r[0]]['incoming']*1.0/totalSize
                        bitmap[column][item][r[0]]=1
        classSet=set() # A set to save all the possible cc classes for ease of retrieval later and to streamline it.


        for column in columnNames: #Loop to push the relations and nodes to Neo4j
            for classes in relationships[column].keys():
                classSet.add(classes)
                rel_data=Neo4jDrive.insertNodeAndRelationship(column,'cc',classes)[0]
                rel_data.properties['rel_class']='cc'
                rel_data.properties['fk']=relationships[column][classes]['cc'] 
                rel_data.push()
         
        for classes in classSet: #Loop to update the CCS score for each class after the previous loop is over. CCS=sum(fk)/no(fk) for the node.
            print classes
            cummulative=0 # The accumulator
            linkNumbers=0 # The denominator
            for link in Neo4jDrive.findIncomingCCLinks(classes): #loop to find incoming cc edges.
                cummulative+=link[0].properties['fk']
                linkNumbers+=1
            node=Neo4jDrive.findNodeByName(classes)
            node.properties['ccs']=cummulative*1.0/linkNumbers
            node.properties['type']='cc'
            node.push()
Exemplo n.º 8
0
    def run(self):
        """Link this column to each class found for ``self.item`` and refresh ccs scores.

        For every result ``r`` of ``sparqlQuerypy.findBottomUp(item.strip())``
        a 'cc' relationship from ``columnNames[column]`` to ``r[2]`` is
        requested, its ``incoming``/``ccs`` properties are initialised or
        updated, and the node named ``r[2]`` gets ``ccs`` set to the mean of
        the ``ccs`` values on its incoming cc links. Progress is printed and
        written to the module-level ``log``.
        """
        # NOTE(review): ``support`` is read but never used in this method.
        support=self.support
        # "* 1.0" makes the divisions below float divisions under Python 2.
        totalNumberOfValues=self.totalNumberOfValues*1.0

        column=self.column
        columnNames=self.columnNames
        item=self.item
        rlist=sparqlQuerypy.findBottomUp(item.strip())

        print 'number of nodes for', item.strip(), " is ", len(rlist)
        log.write('number of nodes for'+str( item.strip())+ " is "+ str(len(rlist))+'\n')
        # Set to 1 after the first increment below and never reset, so at most
        # one existing relationship is incremented per call.
        flag=0
        for r in rlist:
            rel_data=Neo4jDrive.insertNodeAndRelationship(columnNames[column],"cc",r[2])       
            rel_data=rel_data[0]
            node=Neo4jDrive.findNodeByName(r[2])
            # Debug output for one specific class.
            if r[2]=='http://dbpedia.org/ontology/PopulatedPlace':
                print columnNames[column], 'Happening'
            
                print 'potato',rel_data
            if rel_data.properties['incoming']==None: #find out why this is not happenings
                rel_data.properties['incoming']=1
                rel_data.properties['ccs']=1/totalNumberOfValues
                rel_data.push()
                #print 'tomato',rel_data
            else:
                if flag==0:
                    rel_data.properties['incoming']+=1
                    rel_data.push()
                    # NOTE(review): this reads the NODE's 'incoming', not the
                    # relationship counter incremented just above - confirm intent.
                    rel_data.properties['ccs']=node.properties['incoming']/totalNumberOfValues
                    flag=1
            node.properties['type']='cc'
            # Recompute the node score as the mean 'ccs' of its incoming cc links.
            node.properties['ccs']=0
            numberOfLinks=0
            for link in Neo4jDrive.findIncomingCCLinks(r[2]):
                node.properties['ccs']+=link[0].properties['ccs']
                numberOfLinks+=1
            if numberOfLinks>0: node.properties['ccs']/=numberOfLinks
            node.push()
            
            
            
            rel_data.properties['rel_class'] = 'cc'
            #rel_data.properties['ccs']=node.proper/(totalNumberOfValues*1.0)
            rel_data.push()
Exemplo n.º 9
0
    def run(self):
        support=self.support
        totalNumberOfValues=self.totalNumberOfValues

        column=self.column
        columnNames=self.columnNames
        item=self.item
        node=Neo4jDrive.findNodeByName(item)
        if  node== None:
            Neo4jDrive.insertNodeAndRelationship(columnNames[column],'dataItems',item)
            node=Neo4jDrive.findNodeByName(item)
            node.properties['fvalue']=support[item]
            node.push()
            rlist=sparqlQuerypy.findBottomUp(item)
            for r in rlist:
                try:
                    rel_data=Neo4jDrive.insertNodeAndRelationship(item,"cc",r[0])
                    rel_data1=Neo4jDrive.insertNodeAndRelationship(r[0],"dd",r[2])          
                    node=node=Neo4jDrive.findNodeByName(r[2])
                    if node.properties['incoming']==None:
                        node.properties['incoming']=1
                    else:
                        node.properties['incoming']+=1
                    node.properties['type']='type'
                    node.push()
                except :
                    print columnNames[column],'cc',r[0]

                rel_data=rel_data[0]
                rel_data.properties['rel_class'] = 'cc'
                rel_data.properties['support']=support[item]/(totalNumberOfValues*1.0)
                rel_data.push()
Exemplo n.º 10
0
def main():
    """Create the table and column entries, then process each distinct value.

    Uses the free names ``nameOfFile`` and ``itemThread`` from the enclosing
    module. Each ``itemThread`` is joined right after it is started, so values
    are handled one after another.
    """
    Neo4jDrive.insertNode(nameOfFile)
    columnNames=CSVRead.readCSV(nameOfFile,firstRow=True, choice=[0,1,2,3,4])
    for name in columnNames:
        Neo4jDrive.insertNodeAndRelationship(nameOfFile,"Column",name)

    # Number of "Column" relationships hanging off the table node.
    numberOfColumns=len(list(Neo4jDrive.findRelationshipsOfNode(nameOfFile,"Column")))

    for column in range(numberOfColumns):
        support=CSVRead.getSupport(nameOfFile,column)
        totalNumberOfValues=CSVRead.numberOfItems(support)

        for item in support.keys():
            thread=itemThread(item,columnNames,column,support,totalNumberOfValues)
            thread.start()
            thread.join()
Exemplo n.º 11
0
 def run(self):
     rlist=sparqlQuerypy.findProperty2(self.label1,self.label2)
     print '------------------'
     log.write('----------------\n')
     log.write(str(datetime.datetime.now())+'\n')
     log.write(self.label1+self.label2)
     print self.label1,self.label2#,rlist
     
     cache=[]
     propertyUsage=[1]
     for r in rlist:
         if u'd' in r.keys():
             self.addProperty(r['p']['value'])
             rel_data=Neo4jDrive.insertNodeAndRelationship(r['p']['value'],"domain",r['d']['value'])[0]
             rel_data['name']='domain'
             rel_data.push()
         else:
             ccClasses=Neo4jDrive.findCCNodes(self.columnNames[self.perm_column])
             buildString="("
             for i in ccClasses:
                 buildString+='<'+i+'>,'
             buildString=buildString[:-1]
             buildString+=")"
             if r['p']['value'] not in cache:
                 propertyUsage=sparqlQuerypy.findPropertyClassesSecond(r['p']['value'],buildString)
                 cache+=[r['p']['value']]
             
                 print len(propertyUsage),r['p']['value']
                 if len(propertyUsage)<15000:
                     for item in (set([k['r']['value'] for k in propertyUsage]) & set(ccClasses)):
                          self.addProperty(r['p']['value'])
                          rel_data=Neo4jDrive.insertNodeAndRelationship(r['p']['value'],"domain",item)[0]
                          rel_data['name']="domain"
                          rel_data.push()
                          node=Neo4jDrive.findNodeByName(item)
                          node.properties['hyp']='yes'
                          node.properties['type']='cc'
                          node.push()
                          self.incrementDms(rel_data) #for each table we have to put a score on the link between the what and what? The property and its domain? But then how is the score calculated? Is it number of columns in the table by total in that table or is it completely unique?
Exemplo n.º 12
0
 def addProperty(self,p):
     """Register property ``p`` for the current column pair and update its scores.

     Adds ``p`` to the module-level ``hypothesisSet``, bumps the
     ``dcsincoming`` counter on the node named ``p`` (``dcs`` is that counter
     divided by ``self.size``), and bumps the ``count`` on the relationship
     named ``p`` between the two columns (``dms`` is that count divided by
     ``self.size``).

     The first-use test now checks ``count``, the key that is actually
     written. The previous test looked at ``propCount``, which nothing in this
     method sets, so the count was reset to 1 on every call.
     """
     Neo4jDrive.insertNodeAndRelationship(self.columnNames[self.column],"property",p)
     hypothesisSet.add(p)

     node=Neo4jDrive.findNodeByName(p)
     incoming=node.properties['dcsincoming']
     incoming=1 if incoming is None else incoming+1
     node.properties['dcsincoming']=incoming
     # "* 1.0" forces float division under Python 2.
     node.properties['dcs']=incoming/(self.size*1.0)
     node.properties['type']='property'
     node.push()

     rel=Neo4jDrive.insertRelationship(self.columnNames[self.column], p, self.columnNames[self.perm_column])[0]
     count=rel.properties['count']
     if count is None:
         # First time this relationship is seen: label it.
         rel.properties['type']='property_rel'
         rel.properties['name']=p
         count=1
     else:
         count+=1
     rel.properties['count']=count
     rel.properties['dms']=count/(self.size*1.0)
     rel.push()
Exemplo n.º 13
0
def main():
    """Push tables and columns to Neo4j, then process rows in ``sample``-sized slices.

    For each iteration, every table gets a ``runThread`` over rows
    ``[sample*(n-1) : sample*n]`` of its shuffled data; threads are started
    and joined one at a time. The loop stops once the slice end exceeds 5.
    ``sample`` and ``runThread`` are free names from the enclosing module.
    """
    tables=["StatesandCapitals.csv","RiversandSourceState.csv"]
    columnNames=[]
    colNam={}
    csvitems={}
    size={}
    for nameOfFile in tables:
        # Table node.
        Neo4jDrive.insertNode(nameOfFile)
        tableNode=Neo4jDrive.findNodeByName(nameOfFile)
        tableNode.properties['type']='table'
        tableNode.push()

        # Column nodes and their "Column" relationships.
        header=[c.strip() for c in CSVRead.readCSV(nameOfFile,firstRow=True, choice=[0,1])]
        columnNames.append(header)
        colNam[nameOfFile]=[c.strip() for c in header]
        for name in header:
            link=Neo4jDrive.insertNodeAndRelationship(nameOfFile,"Column",name)[0]
            columnNode=Neo4jDrive.findNodeByName(name)
            columnNode.properties['type']='Column'
            columnNode.push()
            link.properties['type']="Column"
            link.push()

        # Rows of the table, shuffled in place; sizes are kept as one-element lists.
        rows=CSVRead.readCSV(nameOfFile,firstRow=False,choice=[0,1])[1:]
        csvitems[nameOfFile]=rows
        size[nameOfFile]=[len(rows)]
        random.shuffle(rows)

    relationships={}
    iterations=1
    while True:
        start=sample*(iterations-1)
        end=sample*iterations
        for table in tables:
            rt=runThread(table, csvitems[table][start:end], colNam[table],end,relationships)
            rt.start()
            rt.join()
        iterations+=1
        # Stand-in convergence test: stop once the slice end passes 5.
        if end>5:
            break
Exemplo n.º 14
0
    def run(self):
        """Create a 'cc' link from this column to ``r[2]`` for each bottom-up result.

        Uses ``self.support``, ``self.totalNumberOfValues``, ``self.column``,
        ``self.columnNames`` and ``self.item``. Each node named ``r[2]`` has
        its ``incoming`` counter initialised to 1 or incremented and its
        ``type`` set to ``"type"``; each relationship stores ``rel_class`` and
        ``support = support[item] / totalNumberOfValues``.
        """
        support = self.support
        totalNumberOfValues = self.totalNumberOfValues

        column = self.column
        columnNames = self.columnNames
        item = self.item

        for result in sparqlQuerypy.findBottomUp(item):
            relationships = Neo4jDrive.insertNodeAndRelationship(columnNames[column], "cc", result[2])

            classNode = Neo4jDrive.findNodeByName(result[2])
            props = classNode.properties
            if props["incoming"] == None:
                props["incoming"] = 1
            else:
                props["incoming"] += 1
            props["type"] = "type"
            classNode.push()

            relationship = relationships[0]
            relationship.properties["rel_class"] = "cc"
            # Multiplying by 1.0 keeps this a float division under Python 2.
            relationship.properties["support"] = support[item] / (totalNumberOfValues * 1.0)
            relationship.push()
Exemplo n.º 15
0
 def addProperty(self,p):
     print self.a, p
     rel_data=Neo4jDrive.insertNodeAndRelationship(self.a,"cp",p)[0]
     rel_data.properties['type']='cp'
     self.hyplock.acquire()
     hypothesisSet.add(p)
     self.hyplock.release()
     node=Neo4jDrive.findNodeByName(p)
     if rel_data.properties['incoming']==None:
         rel_data.properties['incoming']=1
         rel_data.properties['dms']=1/(self.size*1.0)
         pr=p
         for j in range(len(pr)-1,0,-1):
             if pr[j]=='/':
                 pr=pr[j+1:]
                 break
         rel_data.properties['lms']=self.levenshtein(self.a,pr)
     else:
         rel_data.properties['incoming']+=1
         rel_data.properties['dms']=node.properties['incoming']/(self.size*1.0)
     rel_data.push()
     node.properties['type']='property'
     node.properties['hyp']='yes'
     node.push()
Exemplo n.º 16
0
import Neo4jDrive
import CSVWrite
from py2neo import Graph
import csv
import math
# Connection used for the Cypher query below.
graph = Graph("http://*****:*****@localhost:7474/db/data/")

# NOTE(review): ``i`` is not used in the visible part of this script.
i = 5
# Suffix of the output file name ('../csv/eggs<number>.csv').
number = 6
with open('../csv/eggs%s.csv' % number, 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='{')
    # Header row of the report.
    writer.writerow(
        ['Domain Class', 'CCS Score', 'DCS Score', 'Table', 'Overall Score'])
    theTable = []
    domains = {}
    numberOfColumns = Neo4jDrive.findTotalNumberOfColumns()[0][0]
    # One record per node flagged hyp='yes'. n.DCS is returned by the query
    # but record[2] is not read in the visible code.
    for record in graph.cypher.execute(
            "MATCH (n) where n.hyp='yes' return n.name, n.ccs, n.DCS"):
        domain = record[0]
        ccs = record[1]
        # dcs = columns reported for this domain / total number of columns.
        dcs = (Neo4jDrive.findNumberOfColumns(domain)[0][0] *
               1.0) / numberOfColumns
        r = []
        table = Neo4jDrive.tableMembership(domain)
        if ccs != None and dcs != None and ccs != 0 and dcs != 0:
            # Euclidean norm of (ccs, dcs).
            csvs = math.sqrt((ccs * ccs) + (dcs * dcs))
            # -p*ln(p) - q*ln(q) with p = ccs/(ccs+dcs) and q = dcs/(ccs+dcs).
            entropy = -(ccs) / (ccs + dcs) * math.log(
                ccs /
                (ccs + dcs)) - (dcs) / (ccs + dcs) * math.log(dcs /
                                                              (ccs + dcs))
            # NOTE(review): no else branch is visible here, so ``overall`` is
            # left unset (or stale) when the condition is false - confirm
            # against the full script.
            overall = csvs * entropy * table
Exemplo n.º 17
0
def main():
    Neo4jDrive.insertNode(nameOfFile)
    columnNames=CSVRead.readCSV(nameOfFile,firstRow=True, choice=[0,1,2,3,4])
    for name in columnNames:
        Neo4jDrive.insertNodeAndRelationship(nameOfFile,"Column",name)
    
    #support=CSVRead.getSupport(nameOfFile,0)
    #totalNumberOfValues=CSVRead.numberOfItems(support)
    for column in range(sum([1 for _ in Neo4jDrive.findRelationshipsOfNode(nameOfFile,"Column")])):
        support=CSVRead.getSupport(nameOfFile,column)
        totalNumberOfValues=CSVRead.numberOfItems(support)
        
        #print i.end_node
        #cNode=Neo4jDrive.findNodeByName(columnNames[column])
         
        for item in support.keys():
            node=Neo4jDrive.findNodeByName(item)
            if  node== None:
                Neo4jDrive.insertNodeAndRelationship(columnNames[column],'dataItems',item)
                node=Neo4jDrive.findNodeByName(item)
                node.properties['fvalue']=support[item]
                node.push()
                rlist=sparqlQuerypy.findBottomUp(item)
                for r in rlist:
                    try:
                        rel_data=Neo4jDrive.insertNodeAndRelationship(item,"cc",r[0])
                        rel_data1=Neo4jDrive.insertNodeAndRelationship(r[0],"dd",r[2])          
                        node=node=Neo4jDrive.findNodeByName(r[2])
                        if node.properties['incoming']==None:
                            node.properties['incoming']=1
                        else:
                            node.properties['incoming']+=1
                        node.properties['type']='type'
                        node.push()
                    except :
                        
                        print columnNames[column],'cc',r[0]

                    rel_data=rel_data[0]
                    rel_data.properties['rel_class'] = 'cc'
                    rel_data.properties['support']=support[item]/(totalNumberOfValues*1.0)
                    rel_data.push()
Exemplo n.º 18
0
    def dmsScore(self):
        """Score properties linking each ordered pair of columns and push them to Neo4j.

        Phase 1 fills ``self.relationships``: for every ordered pair of
        distinct columns and every row of ``self.data``, the results of
        ``sparqlQuerypy.findProperty2`` are recorded under
        ``relationships[column1][column2][property]`` with a ``count`` and
        ``dms = count / totalSize``; the property is also recorded under
        ``relationships[column2]`` (name 'cp') and its domains under
        ``relationships[property]`` (name 'domain').

        Phase 2 writes those entries to Neo4j as 'property', 'cp' and
        'domain' relationships.
        """
        data=self.data
        columnNames=self.columnNames
        totalSize=self.totalSize
        relationships=self.relationships
        # NOTE(review): ``size`` is not used below.
        size=len(data)
        cache=[]
        # bitmap[column1][column2][(value1, value2)][property] -> 0/1 flag.
        bitmap={}
        
        for i,column1 in enumerate(columnNames):
            if column1 not in relationships.keys():
                relationships[column1]={}
            if column1 not in bitmap.keys():
                bitmap[column1]={}
            for j,column2 in enumerate(columnNames):
                if column2 not in relationships.keys():
                    relationships[column2]={}
                if i==j: continue
                for element in data:
                    print '--------------------'
                    print element[i],'-->',element[j]
                    item=(element[i],element[j])
                    rlist=sparqlQuerypy.findProperty2(element[i].strip(),element[j].strip())
                    # Properties already sent to findPropertyClassesSecond for this row.
                    cache=[]
                    for r in rlist:
                        
                        if column2 not in relationships[column1].keys():
                            relationships[column1][column2]={}
                        if column2 not in bitmap[column1].keys():
                            bitmap[column1][column2]={}
                        if item not in bitmap[column1][column2]:
                            bitmap[column1][column2][item]={}
                        # NOTE(review): the flag is reset to 0 for every result,
                        # so the "== 0" checks below pass each time and a fresh
                        # 'count' of 1.0 is immediately raised to 2.0 - confirm intent.
                        bitmap[column1][column2][item][r['p']['value']]=0
                        if r['p']['value'] not in relationships[column1][column2].keys():
                            relationships[column1][column2][r['p']['value']]={}
                        if u'd' in r.keys():
                            # The result carries its own domain under 'd'.
                            print 'u d is in r.keys()'
                            relationships[column1][column2][r['p']['value']]['name']='property'
                            if 'count' not in relationships[column1][column2][r['p']['value']].keys():
                                relationships[column1][column2][r['p']['value']]['count']=1.0
                            if bitmap[column1][column2][item][r['p']['value']]==0:
                                relationships[column1][column2][r['p']['value']]['count']+=1
                                bitmap[column1][column2][item][r['p']['value']]=1
                            print relationships[column1][column2][r['p']['value']]['count']
                            relationships[column1][column2][r['p']['value']]['dms']=relationships[column1][column2][r['p']['value']]['count']/totalSize
                            if r['p']['value'] not in relationships[column2].keys():
                                relationships[column2][r['p']['value']]={}
                            relationships[column2][r['p']['value']]['name']='cp'
                            if r['p']['value'] not in relationships.keys():
                                relationships[r['p']['value']]={}
                            if r['d']['value'] not in relationships[r['p']['value']].keys():
                                relationships[r['p']['value']][r['d']['value']]={'name':'domain'}   
                            #-----------------TODO: add to hypothesis-------------#      

                        else:
                            # No 'd' in the result: look the domains up through
                            # the cc classes of column2.
                            ccClasses=Neo4jDrive.findCCNodes(column2)
                            
                            # "(<a>,<b>,...)"; the slice drops the trailing comma.
                            # NOTE(review): with no cc classes this yields ")".
                            buildString="("
                            for ii in ccClasses:
                                buildString+='<'+ii+'>,'
                            buildString=buildString[:-1]
                            buildString+=")"

                            if r['p']['value'] not in cache:
                                propertyUsage=sparqlQuerypy.findPropertyClassesSecond(r['p']['value'],buildString)
                                cache+=[r['p']['value']]
                                #bitmap[column1][column2][item][r['p']['value']]=0
                                for domain in (set([k['r']['value'] for k in propertyUsage]) & set(ccClasses)):

                                   relationships[column1][column2][r['p']['value']]['name']='property'
                                   if 'count' not in relationships[column1][column2][r['p']['value']].keys():
                                       relationships[column1][column2][r['p']['value']]['count']=1.0
                                   print "item and r['p']['value'], is", item,r['p']['value']
                                   if bitmap[column1][column2][item][r['p']['value']]==0:
                                       relationships[column1][column2][r['p']['value']]['count']+=1
                                       bitmap[column1][column2][item][r['p']['value']]=1
                                   print relationships[column1][column2][r['p']['value']]['count']
                                   relationships[column1][column2][r['p']['value']]['dms']=relationships[column1][column2][r['p']['value']]['count']/totalSize*1.0
                                   if r['p']['value'] not in relationships[column2].keys():
                                       relationships[column2][r['p']['value']]={}
                                   relationships[column2][r['p']['value']]['name']='cp'
                                   if r['p']['value'] not in relationships.keys():
                                       relationships[r['p']['value']]={}
                                   # NOTE(review): the test uses ``item`` but the
                                   # key written is ``domain`` - confirm intent.
                                   if item not in relationships[r['p']['value']].keys():
                                       relationships[r['p']['value']][domain]={'name':'domain'}
                # Drop the flags for this column pair once all rows are processed.
                # NOTE(review): a repeated column name would later hit
                # "item not in None" above - presumably names are unique.
                bitmap[column1][column2]=None
                         #-------------------------add to Hypothesis----------------------#

                     #-----------------Uploading to Neo4j----------------------------#
        for i,column1 in enumerate(columnNames):
            for j,column2 in enumerate(columnNames):
                if column1==column2: continue
                if column2 not in relationships[column1].keys(): continue 
                for rel in relationships[column1][column2].keys():
                    # column1 -[rel]-> column2, carrying the dms score (0 if none was computed).
                    rel_data=Neo4jDrive.insertNodeAndRelationship(column1,rel,column2)[0]
                    
                    rel_data.properties['type']='property'
                    rel_data.properties['name']=rel
                    if 'dms' in relationships[column1][column2][rel].keys():
                        rel_data.properties['dms']=relationships[column1][column2][rel]['dms']
                    else:
                        rel_data.properties['dms']=0
                    rel_data.push()
                    # column2 -[cp]-> property
                    rel_data=Neo4jDrive.insertNodeAndRelationship(column2,'cp',rel)[0]
                    rel_data.properties['type']='cp'
                    rel_data.push()

                    # property -[domain]-> each recorded domain
                    for domain in relationships[rel].keys():
                       rel_data=Neo4jDrive.insertNodeAndRelationship(rel,'domain',domain)[0]
                       rel_data.properties['type']='domain'
                       rel_data.push()
Exemplo n.º 19
0
import Neo4jDrive
import CSVWrite
from py2neo import Graph
import csv
import math
# Connection used for the Cypher query below.
graph = Graph("http://*****:*****@localhost:7474/db/data/")

# NOTE(review): ``i`` is not used in the visible part of this script.
i=5
# Suffix of the output file name ('../csv/eggs<number>.csv').
number=6
with open('../csv/eggs%s.csv'%number,'wb') as csvfile:
    writer=csv.writer(csvfile, delimiter=',',quotechar='{')
    # Header row of the report.
    writer.writerow(['Domain Class','CCS Score','DCS Score', 'Table','Overall Score'])
    theTable=[]
    # domain name -> overall score ('-' when it cannot be computed).
    domains={}
    numberOfColumns=Neo4jDrive.findTotalNumberOfColumns()[0][0]
    # One record per node flagged hyp='yes'. n.DCS is returned by the query
    # but record[2] is not read in the visible code.
    for record in graph.cypher.execute("MATCH (n) where n.hyp='yes' return n.name, n.ccs, n.DCS"):
        domain=record[0]        
        ccs=record[1]
        # dcs = columns reported for this domain / total number of columns.
        dcs=(Neo4jDrive.findNumberOfColumns(domain)[0][0]*1.0)/numberOfColumns
        # Output row for this domain, filled in below.
        r=[]
        table=Neo4jDrive.tableMembership(domain)
        if ccs!=None and dcs!=None and ccs!=0 and dcs!=0:  
            # Euclidean norm of (ccs, dcs).
            csvs=math.sqrt((ccs*ccs)+(dcs*dcs))
            # -p*ln(p) - q*ln(q) with p = ccs/(ccs+dcs) and q = dcs/(ccs+dcs).
            entropy=-(ccs)/(ccs+dcs)*math.log(ccs/(ccs+dcs))-(dcs)/(ccs+dcs)*math.log(dcs/(ccs+dcs))
            overall=csvs*entropy*table
        else:
            # A zero or missing score would make the logarithm undefined.
            overall='-'
        domains[domain]=overall
        r.append(domain)    
        r.append(ccs)
        r.append(dcs)
Exemplo n.º 20
0
def main():
    Neo4jDrive.insertNode(nameOfFile)
    columnNames = CSVRead.readCSV(nameOfFile, firstRow=True, choice=[0, 1, 2, 3, 4])
    for name in columnNames:
        Neo4jDrive.insertNodeAndRelationship(nameOfFile, "Column", name)

    # support=CSVRead.getSupport(nameOfFile,0)
    # totalNumberOfValues=CSVRead.numberOfItems(support)
    for column in range(sum([1 for _ in Neo4jDrive.findRelationshipsOfNode(nameOfFile, "Column")])):
        support = CSVRead.getSupport(nameOfFile, column)
        totalNumberOfValues = CSVRead.numberOfItems(support)

        # print i.end_node
        # cNode=Neo4jDrive.findNodeByName(columnNames[column])

        for item in support.keys():
            node = Neo4jDrive.findNodeByName(item)
            if node == None:
                Neo4jDrive.insertNodeAndRelationship(columnNames[column], "dataItems", item)
                node = Neo4jDrive.findNodeByName(item)
                node.properties["fvalue"] = support[item]
                node.push()
                rlist = sparqlQuerypy.findBottomUp(item)
                for r in rlist:
                    try:
                        rel_data = Neo4jDrive.insertNodeAndRelationship(item, "cc", r[0])
                        rel_data1 = Neo4jDrive.insertNodeAndRelationship(r[0], "dd", r[2])
                        node = node = Neo4jDrive.findNodeByName(r[2])
                        if node.properties["incoming"] == None:
                            node.properties["incoming"] = 1
                        else:
                            node.properties["incoming"] += 1
                        node.properties["type"] = "type"
                        node.push()
                    except:

                        print columnNames[column], "cc", r[0]

                    rel_data = rel_data[0]
                    rel_data.properties["rel_class"] = "cc"
                    rel_data.properties["support"] = support[item] / (totalNumberOfValues * 1.0)
                    rel_data.push()