Example #1
    def run(self):
        support = self.support
        totalNumberOfValues = self.totalNumberOfValues

        column = self.column
        columnNames = self.columnNames
        item = self.item
        node = Neo4jDrive.findNodeByName(item)
        if node is None:
            Neo4jDrive.insertNodeAndRelationship(columnNames[column], 'dataItems', item)
            node = Neo4jDrive.findNodeByName(item)
            node.properties['fvalue'] = support[item]
            node.push()
            rlist = sparqlQuerypy.findBottomUp(item)
            for r in rlist:
                try:
                    rel_data = Neo4jDrive.insertNodeAndRelationship(item, "cc", r[0])
                    Neo4jDrive.insertNodeAndRelationship(r[0], "dd", r[2])
                    node = Neo4jDrive.findNodeByName(r[2])
                    if node.properties['incoming'] is None:
                        node.properties['incoming'] = 1
                    else:
                        node.properties['incoming'] += 1
                    node.properties['type'] = 'type'
                    node.push()
                except:
                    # The insert or lookup failed; log the attempted link and skip this result.
                    print columnNames[column], 'cc', r[0]
                    continue

                rel_data = rel_data[0]
                rel_data.properties['rel_class'] = 'cc'
                rel_data.properties['support'] = support[item] / (totalNumberOfValues * 1.0)
                rel_data.push()
    def run(self):
        support = self.support
        totalNumberOfValues = self.totalNumberOfValues

        column = self.column
        columnNames = self.columnNames
        item = self.item
        rlist = sparqlQuerypy.findBottomUp(item)
        for r in rlist:

            rel_data = Neo4jDrive.insertNodeAndRelationship(
                columnNames[column], "cc", r[2])
            node = Neo4jDrive.findNodeByName(r[2])
            if node.properties['incoming'] is None:
                node.properties['incoming'] = 1
            else:
                node.properties['incoming'] += 1
            node.properties['type'] = 'type'
            node.push()

            rel_data = rel_data[0]
            rel_data.properties['rel_class'] = 'cc'
            rel_data.properties['support'] = support[item] / (
                totalNumberOfValues * 1.0)
            rel_data.push()
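
Both run() bodies above pull their inputs from instance attributes; the constructor itself is not part of this listing. A minimal sketch of how such a threading.Thread subclass could be initialised, with the argument-to-attribute mapping inferred from the ccThread(...) call in Example #6 (an assumption, not the original code):

import threading

class ccThread(threading.Thread):
    # Hypothetical constructor: only run() appears in this listing, so this
    # mapping of constructor arguments onto the attributes that run() reads is
    # an assumption based on the ccThread(...) call in Example #6.
    def __init__(self, item, columnNames, column, support, totalNumberOfValues):
        threading.Thread.__init__(self)
        self.item = item
        self.columnNames = columnNames
        self.column = column
        self.support = support
        self.totalNumberOfValues = totalNumberOfValues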
Example #3
def main():
    csvitems = []
    data = []
    tables = ["StatesandCapitals.csv", "RiversandSourceState.csv"]
    size = []

    # Push each table as a node, then read and shuffle its rows.
    for nameOfFile in tables:
        Neo4jDrive.insertNode(nameOfFile)
        node = Neo4jDrive.findNodeByName(nameOfFile)
        node.properties['type'] = 'table'
        node.push()
        csvitems += [CSVRead.readCSV(nameOfFile, firstRow=False, choice=[0, 1])[1:]]
        size += [len(csvitems[-1])]
        random.shuffle(csvitems[-1])

    # Interleave the tables into `data` in windows of `sample` rows, tagging each
    # row with the index of the table it came from. `sample` is assumed to be a
    # module-level batch size.
    i = k = 0
    while len(csvitems) > 0:
        for l, item in enumerate(csvitems):
            end = k + sample
            s = sample
            if k + sample > len(item):
                s = sample - (end - len(item))
                end = len(item)
            data[i:i + s] = [[it, l] for it in item[k:end]]
            i += s
            if k + sample > len(item):
                csvitems.remove(item)  # drop exhausted tables (mutates the list being iterated)
        k += sample
    run(data, tables, size)
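
The while loop above interleaves the shuffled tables into data in windows of `sample` rows, tagging each row with the index of the table it came from. The same round-robin batching can also be written with plain slicing; a minimal, self-contained sketch of that idea (an alternative formulation, not the code used elsewhere in this listing):

def interleave_batches(item_lists, sample):
    """Return [item, table_index] pairs, taking `sample` items from each list per round."""
    data = []
    offset = 0
    while any(offset < len(items) for items in item_lists):
        for table_index, items in enumerate(item_lists):
            for it in items[offset:offset + sample]:
                data.append([it, table_index])
        offset += sample
    return data

print interleave_batches([['a1', 'a2', 'a3'], ['b1', 'b2']], 2)
# [['a1', 0], ['a2', 0], ['b1', 1], ['b2', 1], ['a3', 0]]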
Example #4
def main():
    # `nameOfFile` is expected to be defined at module level.
    Neo4jDrive.insertNode(nameOfFile)
    columnNames = CSVRead.readCSV(nameOfFile, firstRow=True, choice=[0, 1, 2, 3, 4])
    for name in columnNames:
        Neo4jDrive.insertNodeAndRelationship(nameOfFile, "Column", name)

    #support=CSVRead.getSupport(nameOfFile,0)
    #totalNumberOfValues=CSVRead.numberOfItems(support)
    for column in range(sum(1 for _ in Neo4jDrive.findRelationshipsOfNode(nameOfFile, "Column"))):
        support = CSVRead.getSupport(nameOfFile, column)
        totalNumberOfValues = CSVRead.numberOfItems(support)

        #print i.end_node
        #cNode=Neo4jDrive.findNodeByName(columnNames[column])

        for item in support.keys():
            node = Neo4jDrive.findNodeByName(item)
            if node is None:
                Neo4jDrive.insertNodeAndRelationship(columnNames[column], 'dataItems', item)
                node = Neo4jDrive.findNodeByName(item)
                node.properties['fvalue'] = support[item]
                node.push()
                rlist = sparqlQuerypy.findBottomUp(item)
                for r in rlist:
                    try:
                        rel_data = Neo4jDrive.insertNodeAndRelationship(item, "cc", r[0])
                        Neo4jDrive.insertNodeAndRelationship(r[0], "dd", r[2])
                        node = Neo4jDrive.findNodeByName(r[2])
                        if node.properties['incoming'] is None:
                            node.properties['incoming'] = 1
                        else:
                            node.properties['incoming'] += 1
                        node.properties['type'] = 'type'
                        node.push()
                    except:
                        # The insert or lookup failed; log the attempted link and skip this result.
                        print columnNames[column], 'cc', r[0]
                        continue

                    rel_data = rel_data[0]
                    rel_data.properties['rel_class'] = 'cc'
                    rel_data.properties['support'] = support[item] / (totalNumberOfValues * 1.0)
                    rel_data.push()
Example #5
def main():
    Neo4jDrive.insertNode(nameOfFile)
    columnNames = CSVRead.readCSV(nameOfFile, firstRow=True, choice=[0, 1, 2, 3, 4])
    for name in columnNames:
        Neo4jDrive.insertNodeAndRelationship(nameOfFile, "Column", name)

    # support=CSVRead.getSupport(nameOfFile,0)
    # totalNumberOfValues=CSVRead.numberOfItems(support)
    for column in range(sum([1 for _ in Neo4jDrive.findRelationshipsOfNode(nameOfFile, "Column")])):
        support = CSVRead.getSupport(nameOfFile, column)
        totalNumberOfValues = CSVRead.numberOfItems(support)

        # print i.end_node
        # cNode=Neo4jDrive.findNodeByName(columnNames[column])

        for item in support.keys():
            node = Neo4jDrive.findNodeByName(item)
            if node is None:
                Neo4jDrive.insertNodeAndRelationship(columnNames[column], "dataItems", item)
                node = Neo4jDrive.findNodeByName(item)
                node.properties["fvalue"] = support[item]
                node.push()
                rlist = sparqlQuerypy.findBottomUp(item)
                for r in rlist:
                    try:
                        rel_data = Neo4jDrive.insertNodeAndRelationship(item, "cc", r[0])
                        Neo4jDrive.insertNodeAndRelationship(r[0], "dd", r[2])
                        node = Neo4jDrive.findNodeByName(r[2])
                        if node.properties["incoming"] is None:
                            node.properties["incoming"] = 1
                        else:
                            node.properties["incoming"] += 1
                        node.properties["type"] = "type"
                        node.push()
                    except:
                        # The insert or lookup failed; log the attempted link and skip this result.
                        print columnNames[column], "cc", r[0]
                        continue

                    rel_data = rel_data[0]
                    rel_data.properties["rel_class"] = "cc"
                    rel_data.properties["support"] = support[item] / (totalNumberOfValues * 1.0)
                    rel_data.push()
Example #6
def run(data, tables, size):
    support = [[]]
    columnNames = []
    for i, nameOfFile in enumerate(tables):
        columnNames += [CSVRead.readCSV(nameOfFile, firstRow=True, choice=[0, 1])]
        columnNames[i] = [c.strip() for c in columnNames[i]]
        for j, name in enumerate(columnNames[i]):
            z = Neo4jDrive.insertNodeAndRelationship(nameOfFile, "Column", name)[0]
            node = Neo4jDrive.findNodeByName(name)
            node.properties['type'] = 'Column'
            node.push()
            z.properties['type'] = "Column"
            z.push()
            support[i] += [CSVRead.getSupport(nameOfFile, j)]
        support += [[]]
    support = support[:-1]  # drop the trailing empty list

    totalNumberOfValues = CSVRead.getSize(nameOfFile, 0)  # uses the last table from the loop above

    hyplock = Lock()
    stypelock = Lock()

    # cc pass: one ccThread per (item, column) pair.
    for itemPiece in data:
        indexOfFile = itemPiece[1]
        item = itemPiece[0]
        for column in range(len(columnNames[indexOfFile])):
            #support=CSVRead.getSupport(nameOfFile,column)
            #totalNumberOfValues=CSVRead.numberOfItems(support)
            k = ccThread(item[column], columnNames[indexOfFile], column,
                         support[indexOfFile], size[indexOfFile])
            k.start()
            k.join()

    # dms pass: one dmsThread per ordered pair of distinct columns.
    for itemPiece in data:
        indexOfFile = itemPiece[1]
        item = itemPiece[0]
        for column in range(len(columnNames[indexOfFile])):
            #support=CSVRead.getSupport(nameOfFile,column)
            #totalNumberOfValues=CSVRead.numberOfItems(support)
            for perm_column in range(len(columnNames[indexOfFile])):
                if perm_column != column:
                    k = dmsThread(item[column], item[perm_column], size[indexOfFile],
                                  columnNames[indexOfFile], column, perm_column)
                    k.start()
                    k.join()

    # top-down pass over every column, sharing the locks and the set of cc nodes.
    allCC = set(Neo4jDrive.findAllCCNodes())
    for s, c in enumerate(columnNames):
        for column in c:
            k = topDownThread(column, hyplock, stypelock, allCC, size[s])
            k.start()
            k.join()
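
Every ccThread, dmsThread and topDownThread above is joined immediately after start(), so the passes run one worker at a time. If the threads are meant to overlap, the usual pattern is to start a whole batch before joining any of it; a small generic sketch of that pattern (the demo worker is made up, but the same helper could be fed ccThread or dmsThread instances):

import threading

def run_concurrently(workers):
    # Start every thread before joining any of them, so the work actually overlaps.
    for w in workers:
        w.start()
    for w in workers:
        w.join()

def _demo(i):
    print 'worker', i, 'done'

run_concurrently([threading.Thread(target=_demo, args=(i,)) for i in range(3)])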
Example #7
def main():
    columnNames = []
    colNam = {}
    csvitems = {}
    size = {}
    tables = ["StatesandCapitals.csv", "RiversandSourceState.csv"]
    for i, nameOfFile in enumerate(tables):  # pushes each table as a node into the graph along with the columns
        Neo4jDrive.insertNode(nameOfFile)
        node = Neo4jDrive.findNodeByName(nameOfFile)
        node.properties['type'] = 'table'
        node.push()  # end of push
        columnNames += [CSVRead.readCSV(nameOfFile, firstRow=True, choice=[0, 1])]
        columnNames[i] = [c.strip() for c in columnNames[i]]
        colNam[nameOfFile] = [c.strip() for c in columnNames[i]]
        for j, name in enumerate(columnNames[i]):
            z = Neo4jDrive.insertNodeAndRelationship(nameOfFile, "Column", name)[0]
            node = Neo4jDrive.findNodeByName(name)
            node.properties['type'] = 'Column'
            node.push()
            z.properties['type'] = "Column"
            z.push()  # end of the column pushing

        csvitems[nameOfFile] = CSVRead.readCSV(nameOfFile, firstRow=False, choice=[0, 1])[1:]  # stores each data set in a dictionary of lists
        size[nameOfFile] = [len(csvitems[nameOfFile])]  # stores the sizes of the lists in a dictionary called size
        random.shuffle(csvitems[nameOfFile])  # shuffles for randomness

    relationships = {}
    iterations = 1
    convergence = False  # the test flag for whether convergence has been reached
    while not convergence:
        # Feed each table to a runThread in windows of `sample` rows;
        # `sample` is assumed to be a module-level batch size.
        for table in tables:
            start = sample * (iterations - 1)
            end = sample * iterations
            rt = runThread(table, csvitems[table][start:end], colNam[table], end, relationships)
            rt.start()
            rt.join()
        iterations += 1
        if end > 5:
            convergence = True  # fixed cutoff used in place of a real convergence test
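
As with ccThread, only runThread's ccScores() side shows up in this listing (Example #8); its constructor has to be inferred from the call above. A minimal sketch of that mapping, which is an assumption rather than the original class:

import threading

class runThread(threading.Thread):
    # Hypothetical constructor: maps the call in this main() onto the
    # attributes that ccScores() in Example #8 reads.
    def __init__(self, table, data, columnNames, totalSize, relationships):
        threading.Thread.__init__(self)
        self.table = table                  # CSV file name
        self.data = data                    # the current window of rows
        self.columnNames = columnNames      # stripped column headers for this table
        self.totalSize = totalSize          # running row count used as the cc denominator
        self.relationships = relationships  # shared dict the thread fills in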
Example #8
    def ccScores(self):
        data = self.data
        columnNames = self.columnNames
        totalSize = self.totalSize
        relationships = self.relationships
        size = len(data)
        bitmap = {}
        for i, column in enumerate(columnNames):
            relationships[column] = {}
            bitmap[column] = {}  # per data value, a set of flags remembering whether the increment already happened
            for element in data:
                item = element[i]
                rlist = sparqlQuerypy.findBottomUp(item.strip())
                print 'number of nodes for', item.strip(), " is ", len(rlist)
                bitmap[column][item] = {}
                for r in rlist:
                    if r[0] not in bitmap[column][item]:
                        bitmap[column][item][r[0]] = 0
                    if r[0] not in relationships[column]:
                        relationships[column][r[0]] = {}
                    relationships[column][r[0]]['name'] = 'cc'
                    if 'incoming' not in relationships[column][r[0]]:
                        relationships[column][r[0]]['incoming'] = 1
                        relationships[column][r[0]]['cc'] = 1.0 / totalSize
                    else:
                        relationships[column][r[0]]['incoming'] += 1
                        relationships[column][r[0]]['cc'] = relationships[column][r[0]]['incoming'] * 1.0 / totalSize
                        bitmap[column][item][r[0]] = 1

        classSet = set()  # collects every cc class seen, for ease of retrieval later

        for column in columnNames:  # push the relations and nodes to Neo4j
            for classes in relationships[column].keys():
                classSet.add(classes)
                rel_data = Neo4jDrive.insertNodeAndRelationship(column, 'cc', classes)[0]
                rel_data.properties['rel_class'] = 'cc'
                rel_data.properties['fk'] = relationships[column][classes]['cc']
                rel_data.push()

        for classes in classSet:  # update the CCS score per class: CCS = sum(fk) / number of incoming fk links
            print classes
            cumulative = 0  # the accumulator
            linkNumbers = 0  # the denominator
            for link in Neo4jDrive.findIncomingCCLinks(classes):  # loop over incoming cc edges
                cumulative += link[0].properties['fk']
                linkNumbers += 1
            node = Neo4jDrive.findNodeByName(classes)
            node.properties['ccs'] = cumulative * 1.0 / linkNumbers
            node.properties['type'] = 'cc'
            node.push()
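
The last loop in ccScores() averages the fk weights on each class's incoming cc edges, i.e. CCS = sum(fk) / number of incoming cc links. A stand-alone sketch of that aggregation on plain dictionaries, with made-up fk values purely for illustration:

# Hypothetical fk weights carried by the incoming 'cc' edges of two class nodes.
incoming_fk = {
    'http://dbpedia.org/ontology/Place': [0.4, 0.2, 0.6],
    'http://dbpedia.org/ontology/River': [0.1, 0.3],
}

ccs = dict((cls, sum(weights) / float(len(weights)))
           for cls, weights in incoming_fk.items())
print ccs['http://dbpedia.org/ontology/Place']  # 0.4
print ccs['http://dbpedia.org/ontology/River']  # 0.2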
Example #9
    def run(self):
        support = self.support
        totalNumberOfValues = self.totalNumberOfValues * 1.0

        column = self.column
        columnNames = self.columnNames
        item = self.item
        rlist = sparqlQuerypy.findBottomUp(item.strip())

        print 'number of nodes for', item.strip(), " is ", len(rlist)
        log.write('number of nodes for' + str(item.strip()) + " is " + str(len(rlist)) + '\n')
        flag = 0
        for r in rlist:
            rel_data = Neo4jDrive.insertNodeAndRelationship(columnNames[column], "cc", r[2])
            rel_data = rel_data[0]
            node = Neo4jDrive.findNodeByName(r[2])
            if r[2] == 'http://dbpedia.org/ontology/PopulatedPlace':
                # debugging output for this particular class
                print columnNames[column], 'Happening'
                print 'potato', rel_data
            if rel_data.properties['incoming'] is None:  # find out why this is not happening
                rel_data.properties['incoming'] = 1
                rel_data.properties['ccs'] = 1 / totalNumberOfValues
                rel_data.push()
                #print 'tomato',rel_data
            else:
                if flag == 0:
                    rel_data.properties['incoming'] += 1
                    rel_data.push()
                    rel_data.properties['ccs'] = node.properties['incoming'] / totalNumberOfValues
                    flag = 1
            node.properties['type'] = 'cc'
            node.properties['ccs'] = 0
            numberOfLinks = 0
            for link in Neo4jDrive.findIncomingCCLinks(r[2]):
                node.properties['ccs'] += link[0].properties['ccs']
                numberOfLinks += 1
            if numberOfLinks > 0:
                node.properties['ccs'] /= numberOfLinks
            node.push()

            rel_data.properties['rel_class'] = 'cc'
            #rel_data.properties['ccs']=node.proper/(totalNumberOfValues*1.0)
            rel_data.push()
Example #10
    def addProperty(self, p):
        rel_data = Neo4jDrive.insertNodeAndRelationship(self.columnNames[self.column], "property", p)
        hypothesisSet.add(p)
        node = Neo4jDrive.findNodeByName(p)
        if node.properties['dcsincoming'] is None:
            node.properties['dcsincoming'] = 1
            node.properties['dcs'] = 1 / (self.size * 1.0)
        else:
            node.properties['dcsincoming'] += 1
            node.properties['dcs'] = node.properties['dcsincoming'] / (self.size * 1.0)
        node.properties['type'] = 'property'
        node.push()
        rel = Neo4jDrive.insertRelationship(self.columnNames[self.column], p, self.columnNames[self.perm_column])[0]
        if rel.properties['count'] is None:
            rel.properties['type'] = 'property_rel'
            rel.properties['name'] = p
            rel.properties['count'] = 1
            rel.properties['dms'] = rel.properties['count'] / (self.size * 1.0)
        else:
            rel.properties['count'] += 1
            rel.properties['dms'] = rel.properties['count'] / (self.size * 1.0)
        rel.push()
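
addProperty() keeps a per-node counter dcsincoming and rescales dcs = dcsincoming / size on every hit. A stand-alone sketch of that incremental update on a plain dict, assuming a column size of 4:

size = 4
node = {'dcsincoming': None, 'dcs': None}

def record_hit(node):
    # Same update rule as above: the first hit initialises the counter, later hits increment it.
    if node['dcsincoming'] is None:
        node['dcsincoming'] = 1
    else:
        node['dcsincoming'] += 1
    node['dcs'] = node['dcsincoming'] / (size * 1.0)

for _ in range(3):
    record_hit(node)
print node['dcs']  # 3 hits out of 4 values -> 0.75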
Example #11
    def run(self):
        rlist = sparqlQuerypy.findProperty2(self.label1, self.label2)
        print '------------------'
        log.write('----------------\n')
        log.write(str(datetime.datetime.now()) + '\n')
        log.write(self.label1 + self.label2)
        print self.label1, self.label2  #,rlist

        cache = []
        propertyUsage = [1]
        for r in rlist:
            if u'd' in r.keys():
                self.addProperty(r['p']['value'])
                rel_data = Neo4jDrive.insertNodeAndRelationship(r['p']['value'], "domain", r['d']['value'])[0]
                rel_data['name'] = 'domain'
                rel_data.push()
            else:
                ccClasses = Neo4jDrive.findCCNodes(self.columnNames[self.perm_column])
                buildString = "("
                for i in ccClasses:
                    buildString += '<' + i + '>,'
                buildString = buildString[:-1]
                buildString += ")"
                if r['p']['value'] not in cache:
                    propertyUsage = sparqlQuerypy.findPropertyClassesSecond(r['p']['value'], buildString)
                    cache += [r['p']['value']]

                    print len(propertyUsage), r['p']['value']
                    if len(propertyUsage) < 15000:
                        for item in (set([k['r']['value'] for k in propertyUsage]) & set(ccClasses)):
                            self.addProperty(r['p']['value'])
                            rel_data = Neo4jDrive.insertNodeAndRelationship(r['p']['value'], "domain", item)[0]
                            rel_data['name'] = "domain"
                            rel_data.push()
                            node = Neo4jDrive.findNodeByName(item)
                            node.properties['hyp'] = 'yes'
                            node.properties['type'] = 'cc'
                            node.push()
                            self.incrementDms(rel_data)  # for each table we have to put a score on the link between the what and what? The property and its domain? But then how is the score calculated? Is it number of columns in the table by total in that table or is it completely unique?
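
The buildString loop assembles the cc classes into a parenthesised, comma-separated list of angle-bracketed URIs for the SPARQL query. The same string can be produced with a single join; a minimal equivalent sketch (the URIs are just example values):

ccClasses = ['http://dbpedia.org/ontology/River',
             'http://dbpedia.org/ontology/PopulatedPlace']
buildString = "(" + ",".join('<' + c + '>' for c in ccClasses) + ")"
print buildString
# (<http://dbpedia.org/ontology/River>,<http://dbpedia.org/ontology/PopulatedPlace>)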
Example #12
    def run(self):
        support = self.support
        totalNumberOfValues = self.totalNumberOfValues

        column = self.column
        columnNames = self.columnNames
        item = self.item
        rlist = sparqlQuerypy.findBottomUp(item)
        for r in rlist:

            rel_data = Neo4jDrive.insertNodeAndRelationship(columnNames[column], "cc", r[2])
            node = Neo4jDrive.findNodeByName(r[2])
            if node.properties["incoming"] is None:
                node.properties["incoming"] = 1
            else:
                node.properties["incoming"] += 1
            node.properties["type"] = "type"
            node.push()

            rel_data = rel_data[0]
            rel_data.properties["rel_class"] = "cc"
            rel_data.properties["support"] = support[item] / (totalNumberOfValues * 1.0)
            rel_data.push()
Example #13
    def addProperty(self, p):
        print self.a, p
        rel_data = Neo4jDrive.insertNodeAndRelationship(self.a, "cp", p)[0]
        rel_data.properties['type'] = 'cp'
        self.hyplock.acquire()
        hypothesisSet.add(p)
        self.hyplock.release()
        node = Neo4jDrive.findNodeByName(p)
        if rel_data.properties['incoming'] is None:
            rel_data.properties['incoming'] = 1
            rel_data.properties['dms'] = 1 / (self.size * 1.0)
            # keep only the last path segment of the property URI for the string comparison
            pr = p
            for j in range(len(pr) - 1, 0, -1):
                if pr[j] == '/':
                    pr = pr[j + 1:]
                    break
            rel_data.properties['lms'] = self.levenshtein(self.a, pr)
        else:
            rel_data.properties['incoming'] += 1
            rel_data.properties['dms'] = rel_data.properties['incoming'] / (self.size * 1.0)
        rel_data.push()
        node.properties['type'] = 'property'
        node.properties['hyp'] = 'yes'
        node.push()
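
The lms score above comes from self.levenshtein(), which is not shown anywhere in this listing. A standard dynamic-programming edit distance that it presumably resembles (an assumed stand-in, not the original method):

def levenshtein(a, b):
    # Classic two-row dynamic-programming edit distance.
    if len(a) < len(b):
        a, b = b, a
    previous = range(len(b) + 1)
    for i, ca in enumerate(a, 1):
        current = [i]
        for j, cb in enumerate(b, 1):
            current.append(min(previous[j] + 1,                 # deletion
                               current[j - 1] + 1,              # insertion
                               previous[j - 1] + (ca != cb)))   # substitution
        previous = current
    return previous[-1]

print levenshtein('capital', 'Capital')  # 1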