Exemplo n.º 1
0
class SupportIdentifierTestCase(unittest.TestCase):
    """Reports how often SupportIdentifier picks the expected subject column."""

    def setUp(self):
        """Create the sampler-provided tables and the identifier under test."""
        sampler = T2DSampler()
        self.testTable = sampler.getTestTable()
        self.scIdentifier = SupportIdentifier()
        self.testTables = sampler.getTablesSubjectIdentificationGoldStandard()

    def testSupportIdentifier(self):
        """
           Print the number of correctly identified tables, the number of
           tables and the resulting macro accuracy.

           Results recorded by the original author (presumably the two support
           bounds followed by the macro accuracy -- confirm):

           support = 0 100 | 0.327868852459
           support = 0.8 97 | 0.327868852459
           support = 0.8 30 | 0.459016393443
           support = 0.8 40 | 0.409836065574
           support = 10 70 | 0.475409836066
        """
        # Fail with a readable message instead of a ZeroDivisionError below.
        self.assertTrue(len(self.testTables) > 0, "no test tables to evaluate")

        # The bounds do not depend on the table, so they are set once.
        supportFloor = 10
        supportCeil = 70

        correctly = 0
        for table in self.testTables:
            # NOTE(review): the ceiling is passed before the floor; confirm this
            # matches the parameter order of SupportIdentifier.identifySubjectColumn.
            subjectColumn = self.scIdentifier.identifySubjectColumn(table, supportCeil, supportFloor)
            if table.isSubjectColumn(subjectColumn):
                correctly += 1

        macroAccuracy = float(correctly) / len(self.testTables)
        # Parenthesised single-argument print behaves the same on Python 2 and 3.
        print(correctly)
        print(len(self.testTables))
        print(macroAccuracy)
Exemplo n.º 2
0
class Support(FeatureInterface):
    """Feature whose value is one column's entry in the table's support list."""

    def __init__(self):
        self.supportIdentifier = SupportIdentifier()

    def calculate(self, column, columnIndex, table):
        """
            Return the support value stored at columnIndex for the given table.

            column -- not used by this feature
            columnIndex -- position of the column within the table
            table -- table handed to SupportIdentifier.getSupport
        """
        return self.supportIdentifier.getSupport(table)[columnIndex]
Exemplo n.º 3
0
class SupportConnectivityIdentifier(object):
    """Chooses a subject column from a weighted mix of support and connectivity."""

    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.connectivityIdentifier = ConnectivityIdentifier()
        self.supportIdentifier = SupportIdentifier()

    def identifySubjectColumn(self, table, supportFloor=10, supportCeil=70, connectivityThreshold=0.01, alpha=0.5):
        """
            Return the index of the column with the highest combined score.

            supportFloor / supportCeil -- a support value counts only when it lies
                strictly between these bounds, otherwise it is treated as 0
            connectivityThreshold -- a connectivity value counts only when it is
                strictly greater than this, otherwise it is treated as 0
            alpha -- weight of the support term; the connectivity term gets 1 - alpha

            When several columns share the highest score the first one wins.
        """
        rawConnectivities = self.connectivityIdentifier.getConnectivity(table, applyWeights=False)
        rawSupports = self.supportIdentifier.getSupport(table)

        # Supports outside the open interval (supportFloor, supportCeil) are zeroed.
        keptSupports = []
        for value in rawSupports:
            inBand = supportFloor < value < supportCeil
            keptSupports.append(value if inBand else 0)

        # Connectivities are thresholded and then multiplied by 100 so that both
        # terms are on the same scale before they are mixed.
        scaledConnectivities = []
        for value in rawConnectivities:
            kept = value if value > connectivityThreshold else 0
            scaledConnectivities.append(kept * 100)

        blended = [
            alpha * keptSupports[position] + (1 - alpha) * scaled
            for position, scaled in enumerate(scaledConnectivities)
        ]
        return blended.index(max(blended))
Exemplo n.º 4
0
 def setUp(self):
     """Create the sampler-provided tables and the identifier under test."""
     t2dSampler = T2DSampler()
     self.testTable = t2dSampler.getTestTable()
     self.scIdentifier = SupportIdentifier()
     self.testTables = t2dSampler.getTablesSubjectIdentificationGoldStandard()
Exemplo n.º 5
0
 def __init__(self):
     """Create the logger and the two helper objects stored on the instance."""
     # Logger is requested under this module's name.
     self.logger = Logger().getLogger(__name__)
     self.agdistis = AgdistisTableIdentifier()
     self.supportIdentifier = SupportIdentifier()
Exemplo n.º 6
0
class DistantSupervisionIdentifier(object):
    """
        Work-in-progress subject column identifier that runs AGDISTIS
        disambiguation on a table and then computes support and relations.

        FIXME: identifySubjectColumn cannot run to completion with the code
        visible here; see the FIXME notes inside the method.
    """

    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.agdistis = AgdistisTableIdentifier()
        self.supportIdentifier = SupportIdentifier()

    def identifySubjectColumn(
        self, table, rowsToAnalyze=20, rowsFromCache=None, support=0, connectivity=0, threshold=0
    ):
        """
            Identify the subject column of a table and record timing statistics
            on the instance (executionTimeFull, executionTimePure, queryTime,
            agdistisTime and the start/end time points).

            rowsToAnalyze -- how many rows should be evaluated
            rowsFromCache -- can be used to reduce number of rows to be read from cache
            support -- not described by the original author
            connectivity -- a number of relations subject column should have at least (absolute number)
            threshold -- percentage of subject columns identified inside the analyzed part of the table (divided by the total number of rows), i.e. 80% means that the same subject column identified for 80% of rows

            NOTE(review): none of the keyword parameters above is read by the
            current body; threshold appears only in a commented-out line.

            Intended result: the first identified column index, or None when no
            column was identified.
        """
        tableData = table.getData()
        tableHeader = table.getHeader()  # currently unused
        tableId = table.id
        numberOfRows = len(tableData)
        numberOfColumns = len(tableData[0])

        self.logger.debug(tableId)

        # Reset the timing statistics of any previous run.
        self.executionStartTimePoint = 0
        self.executionEndTimePoint = 0
        self.executionTimeFull = 0
        self.executionTimePure = 0  # without querying and disambiguation
        self.queryTime = 0
        self.agdistisTime = 0

        self.executionStartTimePoint = time.time()
        # identify entities
        # TODO: get the score from agdistis
        agdistisStartTimePoint = time.time()
        entities = self.agdistis.disambiguateTable(table)
        agdistisEndTimePoint = time.time()
        self.agdistisTime = agdistisEndTimePoint - agdistisStartTimePoint

        # TODO: rename columnScores to supports
        columnScores = self.supportIdentifier.calculateSupport(entities)  # currently unused
        # Support based approach ends here: refactor into class
        # FIXME: self.propertyTableSearch is not assigned in __init__, so this
        # raises AttributeError unless it is set from outside the class.
        relations = self.propertyTableSearch.findRelationsForTable(table, entities)  # currently unused

        # Design notes kept from the original author:
        # - Make just a connectivity approach.
        # - Calculate the connectivity for all the rows and then take the average.
        # - What we have is a boolean classifier; a linear combination is better.
        # - Ten-fold cross validation (or inverse).
        # - Try different weights: a*connectivity + (1-a)*support, which is
        #   equivalent to a*connectivity + b*support.

        # The two leftover ipdb.set_trace() breakpoints that used to sit in this
        # method were removed: they halted every call in an interactive debugger.

        # Count how often each column was chosen as subject column.
        # FIXME: subjectColumns is not defined anywhere in the visible code
        # (NameError); the step that produces it appears to be missing.
        subjectColumnScores = [0] * numberOfColumns
        for subjectColumn in subjectColumns:
            if subjectColumn is not None:
                subjectColumnScores[subjectColumn] += 1

        # Normalize the counts to a percentage of the number of rows.
        for columnIndex, subjectColumnScore in enumerate(subjectColumnScores):
            subjectColumnScores[columnIndex] = float(subjectColumnScore) / numberOfRows * 100

        # The original author marked the following selection as WRONG:
        # subjectColumn = [columnIndex for columnIndex, columnScore in enumerate(subjectColumnScores) if columnScore >= threshold]

        self.executionEndTimePoint = time.time()
        self.executionTimeFull = self.executionEndTimePoint - self.executionStartTimePoint
        self.executionTimePure = self.executionTimeFull - self.queryTime - self.agdistisTime

        # FIXME: subjectColumn is only ever bound as the loop variable above, so
        # it is not the list that len() and [0] expect here.
        if len(subjectColumn) <= 0:
            return None
        else:
            return subjectColumn[0]
Exemplo n.º 7
0
 def __init__(self):
     """Create the SupportIdentifier stored in self.supportIdentifier."""
     self.supportIdentifier = SupportIdentifier()
Exemplo n.º 8
0
 def __init__(self):
     """Create the logger and the connectivity and support identifiers stored on the instance."""
     # Logger is requested under this module's name.
     self.logger = Logger().getLogger(__name__)
     self.connectivityIdentifier = ConnectivityIdentifier()
     self.supportIdentifier = SupportIdentifier()