class SubjectColumnIdentificationBenchTestCase(unittest.TestCase):
    def setUp(self):
        sampler = T2DSampler()
        self.testTable = sampler.getTestTable()
        self.dlIdentifier = DistantSupervisionIdentifier()
        self.simpleIdentifier = SimpleIdentifier()
        #self.testTables20 = sampler.get20Tables()
        #self.testTables = sampler.getTablesSubjectIdentification()

    def testDistantLearningIdentifierNotCorrect(self):
        idList = ["68779923_2_1000046510804975562.csv"]
                  #"9353071_0_969221250383056227.csv",
                  #"94145647_0_4411495338698364870.csv",
                  #"9348099_0_390574653830621671.csv",
                  #"30103516_1_7626625507688323656.csv",
                  #"28788428_0_7847978656182431680.csv",
                  #"39650055_5_7135804139753401681.csv"]

        for _id in idList:
            table = T2DTable(_id)
            support = 1
            connectivity = 4
            threshold = 0
            subjectColumn = self.dlIdentifier.identifySubjectColumn(table, support=support, connectivity=connectivity, threshold=threshold)
            print "subjectColumn %s" % (table.subjectColumn,)
            print "identified %s" % (subjectColumn,)
            print table.table[0:5]
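

# A minimal usage sketch, not from the original project: it assumes the classes
# used above (T2DSampler, DistantSupervisionIdentifier) are importable and behave
# as in the test, and reuses the support/connectivity/threshold values shown there.
def checkSampleTable():
    sampler = T2DSampler()
    table = sampler.getTestTable()
    identifier = DistantSupervisionIdentifier()
    identified = identifier.identifySubjectColumn(table, support=1, connectivity=4, threshold=0)
    # isSubjectColumn is the same helper the benchmark class below relies on
    return table.isSubjectColumn(identified)
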
class SimpleCachePropertyMapper(object):
    """
        Performs quite poorly:
        The current precision is 47.6%, while the achievable maximum is 71%.
    """

    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.dlIdentifier = DistantSupervisionIdentifier()

    def mapProperties(self, table):
        tableData = table.getData()
        tableHeader = table.getHeader()
        tableId = table.id
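        # Note: cacheFolder is assumed to be a module-level setting (not shown in
        # this snippet) pointing at the directory holding the *.relations.cache files.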
        cacheFile = os.path.join(cacheFolder, tableId + ".relations.cache")
        subjectColumn = self.dlIdentifier.identifySubjectColumn(table)

        self.logger.debug("Identifying properties for a table %s"%(tableId))

        if os.path.exists(cacheFile):
            with open(cacheFile, 'rb') as cacheHandle:
                relations = pickle.load(cacheHandle)
        else:
            raise RelationsDataStructureNotFound("Could not find relations structure for %s" % (str(tableId),))

        self.executionTimeFull = 0
        self.startTime = time.time()
        # initialise the per-column lists of candidate properties
        nonSubjectColumns = range(0, len(relations[0]))
        nonSubjectColumns.remove(subjectColumn)
        properties = collections.defaultdict(list)
        for nonSubjectColumn in nonSubjectColumns:
            properties[nonSubjectColumn] = []

        #Aggregate all properties
        for row in relations:
            for nonSubjectColumn in nonSubjectColumns:
                # candidate properties for the atomic table (subjectColumn, h_i), i = nonSubjectColumn
                try:
                    properties[nonSubjectColumn].append(row[subjectColumn][nonSubjectColumn])
                except Exception:
                    # some rows carry no relation for this column pair; skip them
                    pass

        #Flatten the properties
        topProperties = []
        for nonSubjectColumn in nonSubjectColumns:
            properties[nonSubjectColumn] = [item for sublist in properties[nonSubjectColumn] for item in sublist]
            #and get the maximum
            try:
                topProperty = Counter(properties[nonSubjectColumn]).most_common(1)[0][0]
                topProperties.append((topProperty,nonSubjectColumn))
            except IndexError:
                self.logger.debug("No property identified for column %s"%(nonSubjectColumn))

        self.endTime = time.time()
        self.executionTimeFull = self.endTime - self.startTime

        #check if seed properties contain properties we are trying to find
        self.seedListContains = 0
        for _property in table.properties:
            if _property['uri'] in properties[_property['columnIndex']]:
                self.seedListContains += 1

        return topProperties
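

# A small self-contained sketch of the majority vote performed in mapProperties
# above: per-row candidate lists for one non-subject column are flattened and the
# most frequent property URI wins. The URIs are invented purely for illustration.
from collections import Counter

candidateLists = [
    ["http://dbpedia.org/ontology/populationTotal"],
    ["http://dbpedia.org/ontology/populationTotal", "http://dbpedia.org/ontology/areaTotal"],
    ["http://dbpedia.org/ontology/populationTotal"],
]
flattened = [item for sublist in candidateLists for item in sublist]
topProperty = Counter(flattened).most_common(1)[0][0]
# topProperty is "http://dbpedia.org/ontology/populationTotal" (3 votes vs. 1)
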
class SubjectColumnIdentificationBenchTestCase(unittest.TestCase):
    def setUp(self):
        sampler = T2DSampler()
        self.testTable = sampler.getTestTable()
        self.dlIdentifier = DistantSupervisionIdentifier()
        self.simpleIdentifier = SimpleIdentifier()
        #self.testTables20 = sampler.get20Tables()
        self.testTables = sampler.getTablesSubjectIdentification()

    def determineResultsFilename(self, filename):
        import os
        while os.path.exists(os.path.join("results",filename)):
            filename = filename.split(".")
            index = str(int(filename.pop()) + 1)
            filename.append(index)
            filename = ".".join(filename)

        filename = os.path.join("results",filename)
        return filename
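
    # Illustrative behaviour (example name only): if results/rows_20.support_0.connectivity_0.hot.csv.1
    # already exists, the numeric suffix after the last dot is bumped (.1 -> .2 -> .3, ...)
    # until an unused name is found; the returned path always lives under "results".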

    def resultsIterativePrinter(self, row, filename):
        import csv
        with open(filename, 'a') as csvfile:
            writer = csv.writer(csvfile, delimiter=',',
                                quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow(row)


    def distantLearningIdentifier(self, rowsToAnalyze, rowsFromCache, tables, support, connectivity):
        resultsFilename = "rows_%s.support_%s.connectivity_%s.hot.csv.1" % (str(rowsFromCache), str(support), str(connectivity),)
        resultsFilename = self.determineResultsFilename(resultsFilename)
        header = ["tableId","rowsToAnalyze","tableSize","subjectColumnIdx","identifiedCorrectly","executionTimeFull","executionTimePure","queryTime","disambiguationTime"]
        self.resultsIterativePrinter(header,resultsFilename)

        if rowsFromCache is not None:
            rowsToAnalyze = rowsFromCache

        for table in tables:
            colNumber = self.dlIdentifier.identifySubjectColumn(table,rowsFromCache=rowsFromCache,support=support,connectivity=connectivity)
            identifiedCorrectly = table.isSubjectColumn(colNumber)
            tableSize = len(table.getData())
            result = [table.id, rowsToAnalyze, tableSize, colNumber, identifiedCorrectly, self.dlIdentifier.executionTimeFull, self.dlIdentifier.executionTimePure, self.dlIdentifier.queryTime, self.dlIdentifier.agdistisTime]
            self.resultsIterativePrinter(result,resultsFilename)

    # def testDistantLearningIdentifierOne(self):
    #     self.distantLearningIdentifier(20, [self.testTable], "testOneTable.20rows.csv.1")

    # def testDistantLearningIdentifierTwenty(self):
    #     self.distantLearningIdentifier(20, self.testTables20, "test20tables.20rows.csv.1")
    #
    def testDistantLearningIdentifierAll(self):
        """
            Results with 1 row only:

            Tables analyzed: 900
            Subject Column Identified Correctly: 762
            Precision: 0.846666666667

            Tables analyzed: 1687
            Subject Column Identified Correctly: 1461
            Precision: 0.866034380557
        """
        #for rowsFromCache in range(1, 20):
        for connectivity in range(0, 100, 10):
            for support in range(0, 100, 10):
                self.distantLearningIdentifier(20, 20, self.testTables, support, connectivity)
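

# A rough post-processing sketch, not part of the test case above: it reads one of
# the result CSVs written by distantLearningIdentifier and computes precision
# figures of the kind quoted in the docstring. The column name follows the header
# written there; the path below is just an example of the naming pattern.
import csv

def precisionFromResults(path):
    with open(path) as csvfile:
        rows = list(csv.DictReader(csvfile))
    correct = sum(1 for row in rows if row["identifiedCorrectly"] == "True")
    return len(rows), correct, float(correct) / len(rows)

# Example call (hypothetical file):
# tablesAnalyzed, identifiedCorrectly, precision = precisionFromResults(
#     "results/rows_20.support_0.connectivity_0.hot.csv.1")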