示例#1
0
    def getAllNumericalRangeDiffScore(self, onePrimaryKey,
                                      allNumericalfieldRangeMap):
        """Score ``onePrimaryKey`` against every numerical field of another table.

        Does nothing when ``onePrimaryKey`` is not a key of
        ``allNumericalfieldRangeMap``. Otherwise, for every other key
        ``tbFd`` whose table prefix (the text before the first '.') differs
        from the primary key's, and for which no score is stored yet in
        either order, elements 1-4 of both map entries are converted to int
        and passed, interleaved (A0, B0, A1, B1, ...), to
        ``self.getRangeMetricNumericalUsed``.

        Each result is stored in
        ``self.allNumericalPairsRangeDifferenceScoreMap`` under
        ``(onePrimaryKey, tbFd)`` and appended, wrapped in a
        ``fieldPairSim``, to ``self.allNumericalRangeDifferenceScoreTripleLst``.

        Args:
            onePrimaryKey: field identifier of the form 'table.field'.
            allNumericalfieldRangeMap: maps 'table.field' identifiers to a
                sequence whose elements 1..4 are int-convertible
                (presumably range percentiles -- confirm with the producer).

        Returns:
            None; results are recorded on ``self``.
        """
        if onePrimaryKey not in allNumericalfieldRangeMap:
            return

        scoreMap = self.allNumericalPairsRangeDifferenceScoreMap
        # Loop-invariant pieces of the primary key, computed once.
        primaryTable = onePrimaryKey.split('.')[0]
        percentileListA = allNumericalfieldRangeMap[onePrimaryKey][1:]

        for tbFd in allNumericalfieldRangeMap:
            # The pair was already scored in one order or the other.
            if (onePrimaryKey, tbFd) in scoreMap or (tbFd,
                                                     onePrimaryKey) in scoreMap:
                continue
            # Never match a field against itself or its own table.
            if tbFd == onePrimaryKey or tbFd.split('.')[0] == primaryTable:
                continue

            percentileListB = allNumericalfieldRangeMap[tbFd][1:]
            rangeDiffScore = self.getRangeMetricNumericalUsed(
                int(percentileListA[0]), int(percentileListB[0]),
                int(percentileListA[1]), int(percentileListB[1]),
                int(percentileListA[2]), int(percentileListB[2]),
                int(percentileListA[3]), int(percentileListB[3]))
            scoreMap[(onePrimaryKey, tbFd)] = rangeDiffScore
            self.allNumericalRangeDifferenceScoreTripleLst.append(
                fieldPairSim(onePrimaryKey, tbFd, rangeDiffScore))
示例#2
0
 def readLastNLineFileTsvThreeColumnToLst(self,  lastNLine, inputFile):
     """Parse trailing rows of a three-column TSV file into ``fieldPairSim`` objects.

     Every selected row is split on tabs; the first three columns are
     stripped, lower-cased and passed (as strings) to ``fieldPairSim``.

     Args:
         lastNLine: how far from the end of the file the selection starts.
         inputFile: path of the tab-separated file to read.

     Returns:
         A ``blist`` of ``fieldPairSim`` objects, in file order.

     Raises:
         IndexError: if a selected row has fewer than three columns.
     """
     lstTriples = blist()
     # The context manager closes the file even if reading fails.
     with open(inputFile) as f:
         lineList = f.readlines()
     print ("The last line is:", len(lineList))
     # NOTE(review): the slice stops at -1, so the final line of the file is
     # never parsed and at most lastNLine - 1 rows are returned. Kept as-is
     # because callers may rely on it -- confirm whether this is intended.
     for row in lineList[-1 * lastNLine:-1]:
         columns = row.split('\t')  # split once instead of once per column
         prA = columns[0].strip().lower()
         prB = columns[1].strip().lower()
         matchingRatio = columns[2].strip().lower()
         lstTriples.append(fieldPairSim(prA, prB, matchingRatio))
     return lstTriples
示例#3
0
    def getAllNumericalBucketdotProductsScore(self, rangeDiffThd,
                                              inputBucketSizeNum,
                                              oneRangeDiffResPair,
                                              allNumericalValuesMap):
        """Compute the bucket dot-product score of one field pair and record it.

        The distinct values of both fields are reduced to the set of
        integers ``int(float(val))`` that are accepted by
        ``preprocess().is_number`` and lie in ``[0, 200000000000)``. The
        two sets, the bucket size and the width of their combined range are
        passed to ``self.bucketDotProduct`` (last argument ``True``). The
        result is appended as a ``fieldPairSim`` to
        ``self.allNumericalBucketDPScoreTripleLst``.

        Args:
            rangeDiffThd: not used by this method; kept for interface
                compatibility.
            inputBucketSizeNum: bucket size, converted with ``int``.
            oneRangeDiffResPair: indexable pair; elements 0 and 1 identify
                the two fields (converted with ``str``).
            allNumericalValuesMap: maps field identifiers to their values.

        Returns:
            None; the result is recorded on ``self``.

        Raises:
            KeyError: if a field is missing from ``allNumericalValuesMap``.
            ValueError: if either field has no value in the accepted range
                (the original failed here too, via ``min()`` on an empty
                set, with a less helpful message).
        """
        preproc = preprocess()
        upperBound = 200000000000  # exclusive upper limit for accepted values

        def collectBoundedInts(values):
            # Distinct, non-negative, bounded integer forms of the values.
            kept = set()
            for val in set(values):
                if not preproc.is_number(val):
                    continue
                intVal = int(float(val))  # convert once instead of three times
                if 0 <= intVal < upperBound:
                    kept.add(intVal)
            return kept

        fieldX = str(oneRangeDiffResPair[0])
        setX = collectBoundedInts(allNumericalValuesMap[fieldX])
        fieldY = str(oneRangeDiffResPair[1])
        setY = collectBoundedInts(allNumericalValuesMap[fieldY])

        if not setX or not setY:
            raise ValueError(
                'no usable numerical values for pair ' + fieldX + ' / ' +
                fieldY)

        # Width of the union of both value ranges, used to size the buckets.
        rangeXY = max(max(setX), max(setY)) - min(min(setX), min(setY)) + 1
        bdpRes = self.bucketDotProduct(
            setX, setY, int(inputBucketSizeNum), rangeXY,
            True)  # normalized bucket dot product score
        self.allNumericalBucketDPScoreTripleLst.append(
            fieldPairSim(fieldX, fieldY, bdpRes))
示例#4
0
    def getNonumericalCosSimiRecordWiseScalingMethod(
            self, pair, tbFieldAllNonNumericalValuesMap, prefixLength,
            partFetchNum, ratioPruning, recordPrSimiThreshold,
            finalNonNumericalOutputDir):
        """Compare two non-numerical columns partition by partition and record their matching ratio.

        ``pair`` is a string 'fieldA-fieldB'; both names are lower-cased
        and looked up in ``tbFieldAllNonNumericalValuesMap``. When both are
        present, each column is cut into slices of ``partFetchNum`` values
        and every A-slice / B-slice combination is handed to
        ``self.filterColumnsWithSample``. As soon as one combination's
        above-threshold count, divided by the mean slice length, falls
        below ``ratioPruning``, the pair is written to
        ``pruneResults/prunePairs.tsv`` and abandoned.

        For a pair that is not pruned and has at least one entry beyond the
        header, a ``fieldPairSim`` carrying the matching ratio is appended
        to ``self.lstTopPairsTobeAllMatched`` and the three result columns
        are written to ``<TABLEA>__<fieldA>-<TABLEB>__<fieldB>.tsv``.

        Independently of the above, whenever the number of collected pairs
        is a non-zero multiple of 30, or at least one day has passed since
        ``self.recordPartialResultTimeStart``, the collected pairs are
        written via ``sortAndWritetoFile`` into ``partResultOutput``.

        Fixes relative to the previous revision:
          * the B-side accumulator was initialised under one spelling and
            incremented under another, raising UnboundLocalError on the
            first slice combination;
          * match counts were ``set()`` of a single string (its characters)
            instead of the distinct entries after the header;
          * the ratio mixed unrelated numerators and denominators and
            never used the B-side match count;
          * an empty column raised ZeroDivisionError when computing the
            number of slices;
          * the output file is now closed even if writing fails.

        Args:
            pair: 'tableA.fieldA-tableB.fieldB'.
            tbFieldAllNonNumericalValuesMap: maps 'table.field' to its values.
            prefixLength: forwarded to ``filterColumnsWithSample``.
            partFetchNum: slice length; also forwarded to the filter.
            ratioPruning: minimum per-slice match ratio to keep the pair.
            recordPrSimiThreshold: forwarded to ``filterColumnsWithSample``.
            finalNonNumericalOutputDir: directory receiving all output files.

        Returns:
            None; results are recorded on ``self`` and on disk.
        """
        comRdFileObj = commonReadFile()
        pruneDir = finalNonNumericalOutputDir + '/' + 'pruneResults'
        os.makedirs(pruneDir, exist_ok=True)

        fieldNames = pair.strip().split('-')
        prA = fieldNames[0].lower()  # tb.field A
        prB = fieldNames[1].lower()

        if prA in tbFieldAllNonNumericalValuesMap and prB in tbFieldAllNonNumericalValuesMap:
            lstValA = tbFieldAllNonNumericalValuesMap[prA]
            lstValB = tbFieldAllNonNumericalValuesMap[prB]

            # Output columns; element 0 of each one is its header.
            pairsNameLstA = blist([])
            pairsNameLstB = blist([])
            cosResLst = blist([])
            pairsNameLstA.append(str(prA))
            pairsNameLstB.append(str(prB))
            cosResLst.append('Cosine Similarity')

            lenA = len(lstValA)
            lenB = len(lstValB)
            # An empty column has no slices (and must not divide by zero).
            numAPartition = ceil(lenA / min(partFetchNum, lenA)) if lenA else 0
            numBPartition = ceil(lenB / min(partFetchNum, lenB)) if lenB else 0

            pruned = False
            laAllLenWhole = 0
            lbAllLenWhole = 0
            for i in range(numAPartition):
                # Slicing clamps at the end of the list, so the last
                # (shorter) slice needs no special case.
                lsValACur = lstValA[i * partFetchNum:(i + 1) * partFetchNum]

                for j in range(numBPartition):
                    lsValBCur = lstValB[j * partFetchNum:(j + 1) *
                                        partFetchNum]

                    [
                        countTrueComparePairsEvery, countAboveThreholdEvery,
                        laAllLen, lbAlllen
                    ] = self.filterColumnsWithSample(False, lsValACur,
                                                     lsValBCur, prefixLength,
                                                     partFetchNum,
                                                     recordPrSimiThreshold,
                                                     pairsNameLstA,
                                                     pairsNameLstB, cosResLst)
                    laAllLenWhole += laAllLen
                    lbAllLenWhole += lbAlllen

                    meanSliceLen = (len(lsValACur) + len(lsValBCur)) / 2
                    if countAboveThreholdEvery / meanSliceLen < ratioPruning:
                        # Too few matches in this combination: drop the pair.
                        pruned = True
                        filetmp = pruneDir + '/' + 'prunePairs.tsv'
                        rowStr = prA + '\t' + prB + '\t' + '\n'
                        comRdFileObj.writeStrRowToFileAppend(filetmp, rowStr)
                        break
                if pruned:
                    break

            # More than the header alone means something was collected.
            if (not pruned) and len(pairsNameLstA) > 1:
                writeWholeLst = blist([])
                writeWholeLst.append(pairsNameLstA)
                writeWholeLst.append(pairsNameLstB)
                writeWholeLst.append(cosResLst)

                # matching ratio = 1/2 * (la / l_alla + lb / l_allb), where
                # la / lb count the distinct entries after the header.
                # NOTE(review): the *Whole totals add the lengths reported
                # for every slice combination, so a column split into
                # several slices may be counted more than once -- confirm
                # against filterColumnsWithSample.
                laMatchLen = len(set(pairsNameLstA[1:]))
                lbMatchLen = len(set(pairsNameLstB[1:]))
                matchingRatio = 0.5 * (laMatchLen / laAllLenWhole +
                                       lbMatchLen / lbAllLenWhole)
                self.lstTopPairsTobeAllMatched.append(
                    fieldPairSim(prA, prB, matchingRatio))

                partsA = prA.split('.')
                partsB = prB.split('.')
                outFile2 = (str(partsA[0]).upper() + '__' + str(partsA[1]) +
                            '-' + str(partsB[0]).upper() + '__' +
                            str(partsB[1]))
                with open(
                        finalNonNumericalOutputDir + '/' + outFile2 + '.tsv',
                        'w') as fd:
                    comRdFileObj.writeListsColumnsToFileAppendWriterTsv(
                        fd, writeWholeLst)

        # Because a full run is slow, periodically dump what has been found.
        self.recordPartialResultTimeEnd = time.time()
        collectedCount = len(self.lstTopPairsTobeAllMatched)
        elapsed = (self.recordPartialResultTimeEnd -
                   self.recordPartialResultTimeStart)
        if ((collectedCount != 0 and collectedCount % 30 == 0)
                or elapsed >= 86400):  # 86400 seconds = 1 day
            self.recordPartialResultTimeStart = time.time()
            partDir = finalNonNumericalOutputDir + '/' + 'partResultOutput'
            os.makedirs(partDir, exist_ok=True)
            comRdFileObj = commonReadFile()
            comRdFileObj.sortAndWritetoFile(
                self.lstTopPairsTobeAllMatched,
                partDir + '/' + 'partRatioScoreAllResult00' +
                str(len(self.lstTopPairsTobeAllMatched)) + '.tsv')
示例#5
0
    def getSamplesNonumericalCosSimiRecordWise(self, pair, samplesFlag,
                                               tbFieldAllNonNumericalValuesMap,
                                               prefixLength, sampleRecordsNum,
                                               recordPrSimiThreshold,
                                               finalNonNumericalOutputDir):
        """Compare two non-numerical columns record-wise and record their matching ratio.

        With ``samplesFlag`` true, ``pair`` is a string 'fieldA-fieldB'
        whose parts are lower-cased; otherwise it is an object exposing
        ``fieldA`` and ``fieldB``. When both fields are keys of
        ``tbFieldAllNonNumericalValuesMap`` their values are passed to
        ``self.filterColumnsWithSample``. If that leaves at least one entry
        beyond the header, a ``fieldPairSim`` with the matching ratio is
        appended to ``self.lstTopPairsTobeAllMatched`` and the three result
        columns are written to
        ``<TABLEA>__<fieldA>-<TABLEB>__<fieldB>.tsv``. That file goes to
        ``Intermediate_DirFiles[0]`` when ``samplesFlag`` is true and to
        ``finalNonNumericalOutputDir`` otherwise.

        Independently of the above, whenever the number of collected pairs
        is a non-zero multiple of 200 they are written via
        ``sortAndWritetoFile`` into ``Intermediate_DirFiles[2]``.

        Fixes relative to the previous revision:
          * the output file was only opened when ``samplesFlag`` was true,
            so the non-sample path raised UnboundLocalError on ``fd``;
          * match counts were ``set()`` of a single string (its characters)
            instead of the distinct entries after the header;
          * the output file is now closed even if writing fails.

        Args:
            pair: 'tableA.fieldA-tableB.fieldB' string, or an object with
                ``fieldA`` / ``fieldB`` attributes (see above).
            samplesFlag: selects the ``pair`` format and the output
                directory; also forwarded to ``filterColumnsWithSample``.
            tbFieldAllNonNumericalValuesMap: maps 'table.field' to its values.
            prefixLength: forwarded to ``filterColumnsWithSample``.
            sampleRecordsNum: forwarded to ``filterColumnsWithSample``.
            recordPrSimiThreshold: forwarded to ``filterColumnsWithSample``.
            finalNonNumericalOutputDir: output directory for the non-sample
                path.

        Returns:
            None; results are recorded on ``self`` and on disk.
        """
        comRdFileObj = commonReadFile()

        if (samplesFlag):
            fieldNames = pair.strip().split('-')
            prA = fieldNames[0].lower()  # tb.field A
            prB = fieldNames[1].lower()
        else:
            prA = pair.fieldA.strip()
            prB = pair.fieldB.strip()

        if prA in tbFieldAllNonNumericalValuesMap and prB in tbFieldAllNonNumericalValuesMap:
            lsValA = tbFieldAllNonNumericalValuesMap[prA]
            lsValB = tbFieldAllNonNumericalValuesMap[prB]

            # Output columns; element 0 of each one is its header.
            pairsNameLstA = blist([])
            pairsNameLstB = blist([])
            cosResLst = blist([])
            pairsNameLstA.append(str(prA))
            pairsNameLstB.append(str(prB))
            cosResLst.append('Cosine Similarity')

            [countTruePairs, countAboveThrehold,
             laAllLen, lbAlllen] = self.filterColumnsWithSample(
                 samplesFlag, lsValA, lsValB, prefixLength, sampleRecordsNum,
                 recordPrSimiThreshold, pairsNameLstA, pairsNameLstB,
                 cosResLst)

            # More than the header alone means something was collected.
            if len(pairsNameLstA) > 1:
                writeWholeLst = blist([])
                writeWholeLst.append(pairsNameLstA)
                writeWholeLst.append(pairsNameLstB)
                writeWholeLst.append(cosResLst)

                # matching ratio = 1/2 * (la / l_alla + lb / l_allb), where
                # la / lb count the distinct entries after the header.
                laMatchLen = len(set(pairsNameLstA[1:]))
                lbMatchLen = len(set(pairsNameLstB[1:]))
                matchingRatio = 0.5 * (laMatchLen / laAllLen +
                                       lbMatchLen / lbAlllen)
                self.lstTopPairsTobeAllMatched.append(
                    fieldPairSim(prA, prB, matchingRatio))

                partsA = prA.split('.')
                partsB = prB.split('.')
                outFile2 = (str(partsA[0]).upper() + '__' + str(partsA[1]) +
                            '-' + str(partsB[0]).upper() + '__' +
                            str(partsB[1]))

                if (samplesFlag):  # sample results have their own directory
                    finalNonNumericalOutputDir = Intermediate_DirFiles[0]
                # Opened on both paths; previously only the sample path did.
                with open(
                        finalNonNumericalOutputDir + '/' + outFile2 + '.tsv',
                        'w') as fd:
                    comRdFileObj.writeListsColumnsToFileAppendWriterTsv(
                        fd, writeWholeLst)

        # Because a full run is slow, periodically dump what has been found.
        collectedCount = len(self.lstTopPairsTobeAllMatched)
        if collectedCount != 0 and collectedCount % 200 == 0:
            # makedirs also creates the intermediate 'second' directory.
            # NOTE(review): the file below goes to Intermediate_DirFiles[2],
            # which is presumably this same directory -- confirm.
            os.makedirs(
                'intermediateOutput/nonNumericalInterOutput/second/partResultOutput',
                exist_ok=True)
            comRdFileObj = commonReadFile()
            comRdFileObj.sortAndWritetoFile(
                self.lstTopPairsTobeAllMatched,
                Intermediate_DirFiles[2] + '/' + 'partRatioScoreAllResult00' +
                str(len(self.lstTopPairsTobeAllMatched)) + '.tsv')