class SubjectColumnIdentificationBenchTestCase(unittest.TestCase):
    """Debugging bench that prints subject-column results for selected tables."""

    def setUp(self):
        """Build the sample table and both identifiers used by the tests."""
        tableSampler = T2DSampler()
        self.testTable = tableSampler.getTestTable()
        self.dlIdentifier = DistantSupervisionIdentifier()
        self.simpleIdentifier = SimpleIdentifier()
        # Larger table sets are left disabled in this variant:
        #self.testTables20 = tableSampler.get20Tables()
        #self.testTables = tableSampler.getTablesSubjectIdentification()

    def testDistantLearningIdentifierNotCorrect(self):
        """
        Print the stored and the identified subject column of each listed
        table, followed by the first five entries of ``table.table``.

        The test only prints; it asserts nothing.
        NOTE(review): judging by the test name these are presumably tables
        the identifier gets wrong -- confirm.
        """
        tableIds = ["68779923_2_1000046510804975562.csv"]
        # Further ids, currently disabled:
        #"9353071_0_969221250383056227.csv",
        #"94145647_0_4411495338698364870.csv",
        #"9348099_0_390574653830621671.csv",
        #"30103516_1_7626625507688323656.csv",
        #"28788428_0_7847978656182431680.csv",
        #"39650055_5_7135804139753401681.csv"]

        # The same identifier settings are used for every table.
        settings = {"support": 1, "connectivity": 4, "threshold": 0}

        for tableId in tableIds:
            table = T2DTable(tableId)
            identified = self.dlIdentifier.identifySubjectColumn(table, **settings)
            print("subjectColumn %s" % (table.subjectColumn,))
            print("identified %s" % (identified,))
            print(table.table[0:5])
def setUp(self):
    """
    Prepare the fixtures: one sample table, the distant-supervision and
    simple identifiers, and the table set for subject identification.
    """
    tableSampler = T2DSampler()
    self.testTable = tableSampler.getTestTable()
    self.dlIdentifier = DistantSupervisionIdentifier()
    self.simpleIdentifier = SimpleIdentifier()
    # The 20-table sample stays disabled:
    #self.testTables20 = tableSampler.get20Tables()
    self.testTables = tableSampler.getTablesSubjectIdentification()
def __init__(self):
    """Attach a logger named after this module and a subject-column identifier."""
    loggerFactory = Logger()
    self.logger = loggerFactory.getLogger(__name__)
    self.dlIdentifier = DistantSupervisionIdentifier()
class SimpleCachePropertyMapper(object):
    """
    Pick, for every non-subject column of a table, the property that occurs
    most often in the table's cached relations.

    Performs quite poorly:
    The current precision is 47,6% and what can be achieved is 71% (maximum)
    """

    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.dlIdentifier = DistantSupervisionIdentifier()

    def mapProperties(self, table):
        """
        Return the most common cached property for each non-subject column.

        The relations are read from ``<cacheFolder>/<table.id>.relations.cache``
        (a pickle file). Each row of the cached structure is indexed as
        ``row[subjectColumn][column]`` and is expected to yield an iterable of
        properties; cells that cannot be indexed that way are skipped.

        Side effects: sets ``self.startTime``, ``self.endTime`` and
        ``self.executionTimeFull`` (seconds spent aggregating), and
        ``self.seedListContains`` (how many entries of ``table.properties``
        have their ``'uri'`` among the candidates collected for their
        ``'columnIndex'``).

        :param table: object exposing ``id`` and ``properties`` and accepted
            by ``DistantSupervisionIdentifier.identifySubjectColumn``.
        :returns: list of ``(property, columnIndex)`` tuples; columns without
            any candidate property are omitted.
        :raises RelationsDataStructureNotFound: if the cache file is missing.
        :raises IndexError: if the cached relations are an empty sequence.
        :raises ValueError: if the identified subject column is not one of
            the column indices of the cached relations.
        """
        tableId = table.id
        cacheFile = os.path.join(cacheFolder, tableId + ".relations.cache")
        subjectColumn = self.dlIdentifier.identifySubjectColumn(table)

        self.logger.debug("Identifying properties for a table %s"%(tableId))

        if not os.path.exists(cacheFile):
            raise RelationsDataStructureNotFound("Could not found Rels structure for %s"%(str(tableId),))

        # NOTE: pickle executes arbitrary code while loading; only cache files
        # produced by this project must ever be placed in cacheFolder.
        with open(cacheFile, 'rb') as cacheHandle:
            relations = pickle.load(cacheHandle)

        self.executionTimeFull = 0
        self.startTime = time.time()

        # list(...) keeps .remove() working where range() is not a list.
        nonSubjectColumns = list(range(len(relations[0])))
        nonSubjectColumns.remove(subjectColumn)

        # Aggregate, per column, the property lists of every row.
        properties = collections.defaultdict(list)
        for row in relations:
            for nonSubjectColumn in nonSubjectColumns:
                # These are the properties of the atomic table (subject, h_i),
                # i = nonSubjectColumn.
                try:
                    properties[nonSubjectColumn].append(row[subjectColumn][nonSubjectColumn])
                except (IndexError, KeyError, TypeError):
                    # No relations cached for this cell: best effort, skip it.
                    pass

        topProperties = []
        for nonSubjectColumn in nonSubjectColumns:
            # Flatten the per-row lists into one list of candidates ...
            candidates = [item
                          for sublist in properties[nonSubjectColumn]
                          for item in sublist]
            properties[nonSubjectColumn] = candidates
            # ... and keep the most frequent one, if there is any.
            ranked = Counter(candidates).most_common(1)
            if ranked:
                topProperties.append((ranked[0][0], nonSubjectColumn))
            else:
                self.logger.debug("No property identified for column %s"%(nonSubjectColumn))

        self.endTime = time.time()
        self.executionTimeFull = self.endTime - self.startTime

        # Check if the seed properties contain the properties we are trying to find.
        self.seedListContains = 0
        for _property in table.properties:
            if _property['uri'] in properties[_property['columnIndex']]:
                self.seedListContains += 1

        return topProperties
def setUp(self):
    """Prepare the fixtures: one sample table and both identifiers."""
    tableSampler = T2DSampler()
    self.testTable = tableSampler.getTestTable()
    self.dlIdentifier = DistantSupervisionIdentifier()
    self.simpleIdentifier = SimpleIdentifier()
class SubjectColumnIdentificationBenchTestCase(unittest.TestCase):
    """
    Benchmark for the distant-supervision subject-column identifier.

    Every run appends one CSV row per table to a fresh file inside the
    ``results`` folder (relative to the current working directory).
    """

    def setUp(self):
        """Create the sample table, both identifiers and the benchmark table set."""
        sampler = T2DSampler()
        self.testTable = sampler.getTestTable()
        self.dlIdentifier = DistantSupervisionIdentifier()
        self.simpleIdentifier = SimpleIdentifier()
        #self.testTables20 = sampler.get20Tables()
        self.testTables = sampler.getTablesSubjectIdentification()

    def determineResultsFilename(self, filename):
        """
        Return a path inside ``results`` that does not exist yet.

        While ``results/<filename>`` exists, the part of ``filename`` after
        its last dot is read as an integer and incremented, so
        ``"run.csv.1"`` becomes ``"run.csv.2"``, ``"run.csv.3"``, ...

        :param filename: bare file name ending in a numeric suffix.
        :returns: ``os.path.join("results", <free file name>)``.
        :raises ValueError: if the name is taken and its last dot-separated
            component is not an integer.
        """
        import os
        while os.path.exists(os.path.join("results", filename)):
            stem, dot, counter = filename.rpartition(".")
            filename = "%s%s%d" % (stem, dot, int(counter) + 1)
        return os.path.join("results", filename)

    def resultsIterativePrinter(self, row, filename):
        """
        Append ``row`` as one CSV record to ``filename``.

        The parent folder of ``filename`` is created when it is missing, so
        the first write of a fresh checkout does not fail.
        """
        import csv
        import os
        folder = os.path.dirname(filename)
        if folder and not os.path.isdir(folder):
            os.makedirs(folder)
        with open(filename, 'a') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow(row)

    def distantLearningIdentifier(self, rowsToAnalyze, rowsFromCache, tables, support, connectivity):
        """
        Run the identifier on every table and log one CSV row per table.

        :param rowsToAnalyze: value reported in the ``rowsToAnalyze`` column;
            replaced by ``rowsFromCache`` whenever that is not ``None``.
        :param rowsFromCache: forwarded to ``identifySubjectColumn`` and used
            in the results file name.
        :param tables: iterable of tables exposing ``id``, ``getData()`` and
            ``isSubjectColumn(colNumber)``.
        :param support: forwarded to ``identifySubjectColumn``.
        :param connectivity: forwarded to ``identifySubjectColumn``.
        """
        resultsFilename = "rows_%s.support_%s.connectivity_%s.hot.csv.1" % (rowsFromCache, support, connectivity)
        resultsFilename = self.determineResultsFilename(resultsFilename)

        header = ["tableId", "rowsToAnalyze", "tableSize", "subjectColumnIdx",
                  "identifiedCorrectly", "executionTimeFull", "executionTimePure",
                  "queryTime", "disambiguationTime"]
        self.resultsIterativePrinter(header, resultsFilename)

        if rowsFromCache is not None:
            rowsToAnalyze = rowsFromCache

        for table in tables:
            colNumber = self.dlIdentifier.identifySubjectColumn(
                table,
                rowsFromCache=rowsFromCache,
                support=support,
                connectivity=connectivity)
            identifiedCorrectly = table.isSubjectColumn(colNumber)
            tableSize = len(table.getData())
            # The identifier's agdistisTime is what the header calls
            # "disambiguationTime".
            result = [table.id, rowsToAnalyze, tableSize, colNumber,
                      identifiedCorrectly,
                      self.dlIdentifier.executionTimeFull,
                      self.dlIdentifier.executionTimePure,
                      self.dlIdentifier.queryTime,
                      self.dlIdentifier.agdistisTime]
            self.resultsIterativePrinter(result, resultsFilename)

    # Disabled tests; they still use an older three-argument call form of
    # distantLearningIdentifier.
    # def testDistantLearningIdentifierOne(self):
    #     self.distantLearningIdentifier(20, [self.testTable], "testOneTable.20rows.csv.1")

    # def testDistantLearningIdentifierTwenty(self):
    #     self.distantLearningIdentifier(20, self.testTables20, "test20tables.20rows.csv.1")

    def testDistantLearningIdentifierAll(self):
        """
        Sweep ``support`` and ``connectivity`` over 0, 10, ..., 90 on all
        test tables, with 20 rows per table.

        Results recorded by the original author:

            With 1 row only!
            Tables analyzed: 900
            Subject Column Identified Correctly: 762
            Precision: 0.846666666667

            Tables analyzed: 1687
            Subject Column Identified Correctly: 1461
            Precision: 0.866034380557
        """
        #for rowsFromCache in range(1, 20):
        for connectivity in range(0, 100, 10):
            for support in range(0, 100, 10):
                self.distantLearningIdentifier(20, 20, self.testTables, support, connectivity)