import unittest


class SupportIdentifierTestCase(unittest.TestCase):
    # T2DSampler and SupportIdentifier are project-local classes; their import
    # paths are not part of this snippet.

    def setUp(self):
        sampler = T2DSampler()
        self.testTable = sampler.getTestTable()
        self.scIdentifier = SupportIdentifier()
        self.testTables = sampler.getTablesSubjectIdentificationGoldStandard()

    def testSupportIdentifier(self):
        """Observed macro accuracy for different support settings
        (values recorded from earlier runs):

        support = 0    100 | 0.327868852459
        support = 0.8  97  | 0.327868852459
        support = 0.8  30  | 0.459016393443
        support = 0.8  40  | 0.409836065574
        support = 10   70  | 0.475409836066
        """
        correctly = 0
        supportFloor = 10
        supportCeil = 70
        for table in self.testTables:
            # NOTE: the argument order here (ceil before floor) is kept from the
            # original; verify it against identifySubjectColumn's signature.
            subjectColumn = self.scIdentifier.identifySubjectColumn(
                table, supportCeil, supportFloor
            )
            if table.isSubjectColumn(subjectColumn):
                correctly += 1
        macroAccuracy = float(correctly) / len(self.testTables)
        print(correctly)
        print(len(self.testTables))
        print(macroAccuracy)
class Support(FeatureInterface):
    def __init__(self):
        self.supportIdentifier = SupportIdentifier()

    def calculate(self, column, columnIndex, table):
        support = self.supportIdentifier.getSupport(table)
        return support[columnIndex]
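# A hypothetical sketch (not project code) of how a FeatureInterface
# implementation such as Support could be consumed: one score per column,
# using the (column, columnIndex, table) signature of calculate() above.
# That table.getData() returns rows is taken from DistantSupervisionIdentifier
# below; the featureVector helper itself is made up for illustration.
def featureVector(feature, table):
    columns = list(zip(*table.getData()))  # transpose rows into columns
    return [feature.calculate(column, columnIndex, table)
            for columnIndex, column in enumerate(columns)]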
class SupportConnectivityIdentifier(object):
    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.connectivityIdentifier = ConnectivityIdentifier()
        self.supportIdentifier = SupportIdentifier()

    def identifySubjectColumn(self, table, supportFloor=10, supportCeil=70,
                              connectivityThreshold=0.01, alpha=0.5):
        connectivities = self.connectivityIdentifier.getConnectivity(
            table, applyWeights=False
        )
        supports = self.supportIdentifier.getSupport(table)

        # Zero out scores that fall outside the allowed ranges.
        supports = [
            support if supportFloor < support < supportCeil else 0
            for support in supports
        ]
        connectivities = [
            connectivity if connectivity > connectivityThreshold else 0
            for connectivity in connectivities
        ]

        # Bring supports and connectivities onto the same scale.
        connectivities = [connectivity * 100 for connectivity in connectivities]
        # supports = [support / 10 for support in supports]

        # Combine the two signals: alpha * support + (1 - alpha) * connectivity.
        consups = [
            alpha * support + (1 - alpha) * connectivity
            for support, connectivity in zip(supports, connectivities)
        ]
        return consups.index(max(consups))
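# Taken on its own, the score-combination step above is easy to trace with
# made-up numbers. This is a minimal, self-contained sketch (no project
# classes); the support and connectivity values are hypothetical.
supports = [55, 5, 80, 30]                   # per-column supports
connectivities = [0.20, 0.005, 0.10, 0.30]   # per-column connectivities
supportFloor, supportCeil = 10, 70
connectivityThreshold = 0.01
alpha = 0.5

supports = [s if supportFloor < s < supportCeil else 0 for s in supports]
connectivities = [c if c > connectivityThreshold else 0 for c in connectivities]
connectivities = [c * 100 for c in connectivities]  # same 0-100 scale as supports

consups = [alpha * s + (1 - alpha) * c for s, c in zip(supports, connectivities)]
print(consups)                      # [37.5, 0.0, 5.0, 30.0]
print(consups.index(max(consups)))  # 0 -> column 0 is picked as subject column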
import time


class DistantSupervisionIdentifier(object):
    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.agdistis = AgdistisTableIdentifier()
        self.supportIdentifier = SupportIdentifier()
        # NOTE: identifySubjectColumn also relies on self.propertyTableSearch,
        # which the original snippet never initializes.

    def identifySubjectColumn(self, table, rowsToAnalyze=20, rowsFromCache=None,
                              support=0, connectivity=0, threshold=0):
        """Identify the subject column of a table via distant supervision.

        rowsToAnalyze -- how many rows should be evaluated
        rowsFromCache -- can be used to reduce the number of rows read from the cache
        connectivity  -- the minimum number of relations a subject column must
                         have (an absolute number)
        threshold     -- the percentage of analyzed rows (relative to the total
                         number of rows) for which the same subject column must
                         be identified, e.g. 80 means the same subject column
                         was identified for 80% of the rows
        """
        tableData = table.getData()
        tableHeader = table.getHeader()
        tableId = table.id
        numberOfRows = len(tableData)
        numberOfColumns = len(tableData[0])
        self.logger.debug(tableId)

        # Timing bookkeeping.
        self.executionStartTimePoint = 0
        self.executionEndTimePoint = 0
        self.executionTimeFull = 0
        self.executionTimePure = 0  # without querying and disambiguation
        self.queryTime = 0
        self.agdistisTime = 0
        self.executionStartTimePoint = time.time()

        # Identify entities.
        # TODO: get the score from AGDISTIS.
        agdistisStartTimePoint = time.time()
        entities = self.agdistis.disambiguateTable(table)
        agdistisEndTimePoint = time.time()
        self.agdistisTime = agdistisEndTimePoint - agdistisStartTimePoint

        # TODO: rename columnScores to supports.
        columnScores = self.supportIdentifier.calculateSupport(entities)
        # The support-based approach ends here: refactor it into its own class.
        relations = self.propertyTableSearch.findRelationsForTable(table, entities)

        # Development notes from the original:
        # - Try a pure connectivity approach: calculate the connectivity for all
        #   rows and then take the average. What we have is a boolean
        #   classifier; a linear combination is better.
        # - Ten-fold cross validation (or the inverse); just try different
        #   weights: a * connectivity + (1 - a) * support is equivalent to
        #   a * connectivity + b * support.
        # - For the combination -->

        # Tally how often each column was picked as the subject column.
        # NOTE: subjectColumns (the per-row picks) is never defined in the
        # original snippet; it is presumably produced from `entities` and
        # `relations` by a per-row identification step that is missing here.
        subjectColumnScores = [0] * numberOfColumns
        for subjectColumn in subjectColumns:
            if subjectColumn is not None:
                subjectColumnScores[subjectColumn] += 1
        # Normalize to percentages.
        for columnIndex, subjectColumnScore in enumerate(subjectColumnScores):
            subjectColumnScores[columnIndex] = (
                float(subjectColumnScore) / numberOfRows * 100
            )
        # Keep the columns that reach the threshold. The original had this line
        # commented out and flagged as wrong; it is restored here so the method
        # returns a value.
        subjectColumn = [
            columnIndex
            for columnIndex, columnScore in enumerate(subjectColumnScores)
            if columnScore >= threshold
        ]

        self.executionEndTimePoint = time.time()
        self.executionTimeFull = self.executionEndTimePoint - self.executionStartTimePoint
        self.executionTimePure = self.executionTimeFull - self.queryTime - self.agdistisTime

        if len(subjectColumn) <= 0:
            return None
        return subjectColumn[0]
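# A small, self-contained illustration of the voting scheme that
# identifySubjectColumn builds on: tally per-row subject-column picks, convert
# the tallies to percentages, and keep the columns that reach the threshold.
# All values here are made up.
numberOfColumns = 3
subjectColumns = [0, 0, 1, 0, None, 0]  # hypothetical per-row picks, six rows
numberOfRows = len(subjectColumns)
threshold = 50  # percent

subjectColumnScores = [0] * numberOfColumns
for pick in subjectColumns:
    if pick is not None:
        subjectColumnScores[pick] += 1
subjectColumnScores = [float(s) / numberOfRows * 100 for s in subjectColumnScores]
candidates = [i for i, score in enumerate(subjectColumnScores) if score >= threshold]
print(candidates)  # [0] -> column 0 was picked for ~66.7% of the rows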