# Example #1
# 0
class DistantSupervisionIdentifier(object):
    """Pick the subject column of a table from per-row subject column votes.

    ``identifySubjectColumn`` runs these steps:

    1. ``self.agdistis.disambiguateTable(table)``
    2. ``self.supportIdentifier.calculateSupport(entities)``
    3. ``self.propertyTableSearch.findRelationsForTable(table, entities)``
    4. ``self._identifyRowSubjectColumns(...)`` -- one vote per row
    5. the votes are turned into per-column percentages and the best column
       is selected.

    NOTE(review): ``propertyTableSearch`` is not created in ``__init__``; it
    has to be assigned to the instance before ``identifySubjectColumn`` is
    called.

    After a call the instance exposes timing attributes (seconds, measured
    with ``time.time()``): ``executionTimeFull``, ``agdistisTime``,
    ``queryTime`` and ``executionTimePure``. ``queryTime`` is only reset by
    this class, never increased.
    """

    def __init__(self):
        self.logger = Logger().getLogger(__name__)
        self.agdistis = AgdistisTableIdentifier()
        self.supportIdentifier = SupportIdentifier()

    def identifySubjectColumn(
        self, table, rowsToAnalyze=20, rowsFromCache=None, support=0, connectivity=0, threshold=0
    ):
        """
            Return the index of the subject column of the table, or None.

            table -- object providing getData() (sequence of rows) and an id attribute
            rowsToAnalyze -- how many rows should be evaluated (accepted for compatibility, not used by this method)
            rowsFromCache -- can be used to reduce number of rows to be read from cache (accepted for compatibility, not used by this method)
            support -- forwarded unchanged to _identifyRowSubjectColumns
            connectivity -- a number of relations subject column should have at least (absolute number); forwarded unchanged to _identifyRowSubjectColumns
            threshold -- percentage of subject columns identified inside the analyzed part of the table (divided by the total number of rows), i.e. 80% means that the same subject column identified for 80% of rows

            None is returned for a table without rows, when no row voted for
            any column, or when no column reaches the threshold. When several
            columns qualify, the one with the highest percentage wins (the
            lowest index on a tie).

            Raises AttributeError when propertyTableSearch was never assigned
            and NotImplementedError while _identifyRowSubjectColumns is not
            implemented.
        """
        tableData = table.getData()
        numberOfRows = len(tableData)

        self.logger.debug(table.id)

        # Reset all timers first so an early return or an exception never
        # leaves measurements of a previous call behind.
        self.executionStartTimePoint = 0
        self.executionEndTimePoint = 0
        self.executionTimeFull = 0
        self.executionTimePure = 0  # without querying and disambiguation
        self.queryTime = 0
        self.agdistisTime = 0

        # An empty table has no first row to measure and would divide by
        # zero during normalisation.
        if numberOfRows == 0:
            return None
        numberOfColumns = len(tableData[0])

        # Fail before the (timed) disambiguation call instead of after it.
        propertyTableSearch = getattr(self, "propertyTableSearch", None)
        if propertyTableSearch is None:
            raise AttributeError(
                "propertyTableSearch must be assigned before calling identifySubjectColumn()"
            )

        self.executionStartTimePoint = time.time()

        # identify entities
        # TODO: get the score from agdistis
        agdistisStartTimePoint = time.time()
        entities = self.agdistis.disambiguateTable(table)
        self.agdistisTime = time.time() - agdistisStartTimePoint

        supports = self.supportIdentifier.calculateSupport(entities)
        # Support based approach ends here: refactor into class
        relations = propertyTableSearch.findRelationsForTable(table, entities)

        rowSubjectColumns = self._identifyRowSubjectColumns(
            table, entities, supports, relations, support, connectivity
        )
        subjectColumnScores = self._tallySubjectColumnVotes(
            rowSubjectColumns, numberOfColumns, numberOfRows
        )
        subjectColumn = self._selectSubjectColumn(subjectColumnScores, threshold)

        self.executionEndTimePoint = time.time()
        self.executionTimeFull = self.executionEndTimePoint - self.executionStartTimePoint
        self.executionTimePure = self.executionTimeFull - self.queryTime - self.agdistisTime

        return subjectColumn

    def _identifyRowSubjectColumns(self, table, entities, supports, relations, support, connectivity):
        """Return one subject column vote (column index or None) per row.

        This step was missing from the original implementation, which read
        an undefined ``subjectColumns`` variable. The notes left in its place
        were:

        - make just a connectivity approach;
        - calculate the connectivity for all the rows and then take average;
        - or try a linear combination a*connectivity + (1-a)*support and
          validate the weights with ten-fold cross validation.

        NOTE(review): the structure of ``entities``, ``supports`` and
        ``relations`` is defined by the collaborators and is not visible
        here, so no default is guessed; implement or override this method.
        """
        raise NotImplementedError(
            "per-row subject column identification is not implemented"
        )

    @staticmethod
    def _tallySubjectColumnVotes(rowSubjectColumns, numberOfColumns, numberOfRows):
        """Return, per column, the percentage of all rows that voted for it."""
        votes = [0] * numberOfColumns
        for columnIndex in rowSubjectColumns:
            if columnIndex is not None:
                votes[columnIndex] += 1
        # float() keeps true division under Python 2 as well.
        return [float(voteCount) / numberOfRows * 100 for voteCount in votes]

    @staticmethod
    def _selectSubjectColumn(subjectColumnScores, threshold):
        """Return the best scoring column that reaches threshold, or None.

        A column without any vote never qualifies; otherwise threshold=0
        would always select column 0.
        """
        bestColumn = None
        bestScore = 0
        for columnIndex, columnScore in enumerate(subjectColumnScores):
            # Strict '>' keeps the lowest index on ties and skips 0% columns.
            if columnScore >= threshold and columnScore > bestScore:
                bestColumn = columnIndex
                bestScore = columnScore
        return bestColumn