Пример #1
0
 def clusterSameFloor(self, childList):
     """Group consecutive children whose links have similar word counts.

     Each child is summarised by the mean ``wordNum`` of its links. Candidate
     split thresholds are tried from largest to smallest; for each one the
     children are cut into consecutive runs wherever a child's mean differs
     from the running mean of the current run by at least the threshold.
     The first split whose runs all satisfy ``self.getChaotic(run) < self.ct``
     is returned.

     Returns:
         A list with one entry per run, each a list of the ``href`` values of
         the links under that run's children, or ``None`` when no threshold
         produces an acceptable split.
     """
     # Mean ``wordNum`` of the links under each child.
     means = []
     for node in childList:
         counts = [anchor['wordNum'] for anchor in getLink(node)]
         means.append(sum(counts)/len(counts))
     total = len(means)

     # prefix[k] holds the sum of the first k means, so the mean of
     # means[lo:hi] is a single subtraction and division.
     prefix = [0]
     for value in means:
         prefix.append(prefix[-1] + value)

     def runMean(lo, hi):
         return (prefix[hi] - prefix[lo] + 0.0) / (hi - lo)

     # Every deviation of a child from the mean of a preceding run is a
     # candidate threshold; 0 is always included as the last resort.
     thresholds = [0]
     for lo in range(total - 1):
         for hi in range(lo + 1, total):
             thresholds.append(abs(means[hi] - runMean(lo, hi)))
     thresholds = sorted(thresholds, reverse=True)

     for threshold in thresholds:
         groups = []
         begin = 0
         for pos in range(1, total):
             if abs(means[pos] - runMean(begin, pos)) >= threshold:
                 groups.append(childList[begin:pos])
                 begin = pos
         groups.append(childList[begin:])
         # The list (not a generator) keeps getChaotic evaluated for every run.
         if not all([self.getChaotic(group) < self.ct for group in groups]):
             continue
         return [
             [anchor['href'] for node in group for anchor in getLink(node)]
             for group in groups
         ]
     return None
Пример #2
0
def clusterLink(domTree, clusterList, md1, md2):
    """Recursively decide whether the links under ``domTree`` form one cluster.

    Clusters discovered below ``domTree`` are appended to ``clusterList`` in
    place. ``md1`` and ``md2`` are the two measure objects consulted through
    ``isOneCluster`` and ``md1.clusterSameFloor``.

    Returns:
        ``True`` when the whole sub-tree can still be treated as a single
        cluster (nothing was emitted for it), ``False`` once clusters for
        this sub-tree have been added to ``clusterList``.
    """
    # Keep only the children that contain links; children on which getLink
    # fails are skipped.
    linkedChildren = []
    for node in domTree.children:
        try:
            if not getLink(node):
                continue
        except:
            continue
        linkedChildren.append(node)

    if not linkedChildren or not getLink(domTree):
        return True

    # Recurse first: children emit their own clusters before this node does.
    verdicts = [
        clusterLink(node, clusterList, md1, md2) for node in linkedChildren
    ]

    if all(verdicts):
        if md1.isOneCluster(domTree) and md2.isOneCluster([domTree]):
            return True
        clusterList += md1.clusterSameFloor(linkedChildren, md2)
        return False

    # Some child already split: cluster each maximal run of unsplit children
    # separately, flushing the run whenever a split child is reached.
    pending = []
    for node, single in zip(linkedChildren, verdicts):
        if single:
            pending.append(node)
        else:
            clusterList += md1.clusterSameFloor(pending, md2)
            pending = []
    clusterList += md1.clusterSameFloor(pending, md2)
    return False
Пример #3
0
 def getMinGap(self, dom1, dom2):
     """Return the smallest ``getDis`` value between links of two sub-trees.

     Every link found under ``dom1`` is compared with every link found under
     ``dom2``. Returns ``None`` when either sub-tree has no links.
     """
     firstLinks = getLink(dom1)
     secondLinks = getLink(dom2)
     smallest = None
     if firstLinks and secondLinks:
         # Seed with the first pair so the running minimum starts from a
         # real distance.
         smallest = self.getDis(firstLinks[0], secondLinks[0])
     pairs = ((a, b) for a in firstLinks for b in secondLinks)
     for a, b in pairs:
         smallest = min(self.getDis(a, b), smallest)
     return smallest
def findSubTree(dom, linkSet):
    """Return the last descendant of ``dom`` whose links cover ``linkSet``.

    Descendants are visited in the order ``dom.descendants`` yields them;
    each one whose link ``href`` values form a superset of ``linkSet``
    replaces the current answer. ``dom`` itself is returned when no
    descendant qualifies.
    """
    match = dom
    for node in dom.descendants:
        hrefs = {anchor['href'] for anchor in getLink(node)}
        if linkSet.issubset(hrefs):
            match = node
    return match
Пример #5
0
def run(dataDir):
    """Tally navigation and non-navigation links over the ``.html`` files in ``dataDir``.

    A link whose ``cluster`` attribute is ``'nav'`` or ``'list'`` counts as
    navigation; any other link counts as non-navigation. The distinct
    ``clusterindex`` values seen for each kind are collected in ``naviDic``.
    Links for which the attribute lookup fails are counted as non-navigation
    with index ``-1``.

    Processing stops at the first file that cannot be opened or parsed.
    The tallies are not returned by the code visible here.
    """
    htmlList = [
        name for name in listdir(dataDir) if name.split('.')[-1] == 'html'
    ]
    naviNum = 0
    nNaviNum = 0
    naviDic = {'navi': set(), 'nNavi': set()}
    for html in htmlList:
        try:
            # Close the file as soon as it has been parsed.
            with open(path.join(dataDir, html)) as htmlFile:
                dom = bs(htmlFile)
        except Exception as e:
            print(e)
            break
        for link in getLink(dom):
            try:
                if link['cluster'] == 'nav' or link['cluster'] == 'list':
                    naviNum += 1
                    naviDic['navi'].add(link['clusterindex'])
                else:
                    nNaviNum += 1
                    naviDic['nNavi'].add(link['clusterindex'])
            except Exception:
                # NOTE(review): if 'cluster' is present but 'clusterindex'
                # lookup fails, the link has already been counted above and
                # is counted again here - confirm whether that is intended.
                naviDic['nNavi'].add(-1)
                nNaviNum += 1
Пример #6
0
def clusterLinkAgg(dom, para=1):
    """Agglomeratively merge the links of ``dom`` using ``singleLink`` gaps.

    Starts with one cluster per link and repeatedly merges the pair of
    clusters with the smallest ``singleLink`` gap, stopping when only one
    cluster is left or the smallest gap exceeds ``md.dt``.

    Returns:
        The list of clusters (each a list of links), or ``None`` if the
        initial gap computation of a round raises.
    """
    md = ldi.linkDistance(dom, para)
    clusters = [[anchor] for anchor in getLink(dom)]
    while len(clusters) > 1:
        # Seed the search with the first pair. A 2-D variant (singleLink2D
        # with threshold md.dt*1.414) was tried here previously.
        keep, absorb = 0, 1
        try:
            bestGap = singleLink(clusters[keep], clusters[absorb], md)
        except:
            print(len(clusters))
            return
        size = len(clusters)
        for left in range(size):
            for right in range(left + 1, size):
                candidate = singleLink(clusters[left], clusters[right], md)
                if candidate < bestGap:
                    bestGap = candidate
                    keep, absorb = left, right
        if bestGap > md.dt:
            break
        # Fold the second cluster of the closest pair into the first.
        clusters[keep] += clusters[absorb]
        clusters.pop(absorb)
    return clusters
Пример #7
0
def runClusterLink(dom, md1, md2):
    """Cluster the links of ``dom`` with ``clusterLink`` and return the non-empty clusters.

    When ``clusterLink`` reports that the whole tree is a single cluster,
    all links of ``dom`` (if any) are added as one cluster.
    """
    found = []
    wholeTreeIsOne = clusterLink(dom, found, md1, md2)
    if wholeTreeIsOne:
        allLinks = getLink(dom)
        if allLinks:
            found.append(allLinks)
    return [group for group in found if group]
Пример #8
0
def main():
    """Print per-link word counts for one page, before and after ``linkChaotic``.

    The page is ``../../data/clean_eval/<argv[1]>.html``. The first line
    printed is the number of whitespace-separated words in each link's text;
    the second is each link's ``wordNum`` attribute.
    """
    htmlPath = '../../data/clean_eval/' + sys.argv[1] + '.html'
    # Close the file once parsed instead of leaking the handle.
    with open(htmlPath) as htmlFile:
        dom = bs(htmlFile)
    linkList = getLink(dom)
    print([len(link.text.split()) for link in linkList])
    # Result is not used; presumably the constructor sets 'wordNum' on the
    # links read below - TODO confirm.
    linkChaotic(dom)
    print([link['wordNum'] for link in linkList])
Пример #9
0
    def clusterSameFloor(self, childList, md2):
        """Split sibling nodes into clusters of links.

        Walks ``childList`` pairwise. A new cluster starts at a child when
        its minimum gap to the previous child exceeds ``self.dt``, or when
        ``md2.isOneCluster`` rejects the span of siblings from the previous
        child through the current one. Otherwise the child's links join the
        current cluster.

        Returns:
            A list of clusters, each being the list of links of the
            children grouped together; ``[]`` for an empty ``childList``.
        """
        if not childList:
            return []

        groups = []
        current = getLink(childList[0])
        for prev, node in zip(childList, childList[1:]):
            following = list(prev.next_siblings)
            # Previous child plus every sibling up to and including `node`.
            span = [prev] + following[:following.index(node) + 1]
            tooFar = self.getMinGap(node, prev) > self.dt
            if tooFar or not md2.isOneCluster(span):
                groups.append(current)
                current = getLink(node)
            else:
                current += getLink(node)
        groups.append(current)
        return groups
Пример #10
0
 def getChaotic(self, domTreeList):
     """Return the variance of ``wordNum`` over all links in ``domTreeList``.

     Sub-trees on which ``getLink`` fails are skipped. Returns ``None`` when
     no links are found.
     """
     collected = []
     for tree in domTreeList:
         try:
             collected += getLink(tree)
         except:
             continue
     if not collected:
         return None
     counts = np.array([float(anchor['wordNum']) for anchor in collected])
     return np.var(counts)
Пример #11
0
 def isOneCluster(self, domTree):
     """Tell whether the link-bearing children of ``domTree`` stay within ``self.dt``.

     Computes ``getMinGap`` for each pair of consecutive children that
     contain links and returns ``True`` when the largest such gap (0 if
     there are fewer than two such children) does not exceed ``self.dt``.
     Children on which ``getLink`` fails are ignored.
     """
     linked = []
     for node in domTree.children:
         try:
             if getLink(node):
                 linked.append(node)
         except:
             continue
     widest = 0
     for prev, node in zip(linked, linked[1:]):
         widest = max(self.getMinGap(prev, node), widest)
     return widest <= self.dt
Пример #12
0
def clusterLinkKM(dom, k=2):
    """Cluster the links of ``dom`` into ``k`` groups with KMeans on their ``index``.

    Returns:
        A list of ``k`` lists of links (some may be empty), or ``[]`` when
        ``dom`` has no links.
    """
    # Result is not used below; the call is kept in case it has side
    # effects on ``dom`` - TODO confirm.
    ldi.linkDistance(dom, 1)
    linkList = getLink(dom)
    # One-dimensional feature per link. Earlier experiments (left as
    # comments in the original) also used link.parent['index'],
    # link['link_color'] and link['link_size'].
    features = [[anchor['index']] for anchor in linkList]
    if not features:
        return []
    labels = KMeans(k).fit_predict(array(features))
    buckets = [[] for _ in range(k)]
    for label, anchor in zip(labels, linkList):
        buckets[label].append(anchor)
    return buckets
 def denThres(self, domTree):
     """Return the link-density threshold for ``domTree``.

     Link words are counted from each link's text plus, for every ``img``
     inside the link, its ``alt`` text; a link without images and an image
     without ``alt`` each contribute the one-word placeholder ``'string'``.
     The threshold is ``alpha * (linkWords + beta) /
     (textWords + linkWords + beta)`` where ``textWords`` is the word count
     of ``domTree.text``.
     """
     texts = []
     for anchor in getLink(domTree):
         texts.append(anchor.text)
         if not anchor.find_all('img'):
             texts.append('string')
         for img in anchor.find_all('img'):
             texts.append(img['alt'] if 'alt' in img.attrs else 'string')
     linkWords = sum(len(text.split()) for text in texts)
     textWords = float(len(domTree.text.split()))
     return self.alpha * (linkWords + self.beta) / (
         textWords + linkWords + self.beta)
 def getLinkDen(self, subDomList):
     """Return the link density over all sub-trees in ``subDomList``.

     Link words are counted from each link's text plus, for every ``img``
     inside the link, its ``alt`` text; a link without images and an image
     without ``alt`` each contribute the one-word placeholder ``'string'``.
     The density is ``(linkWords + beta) / (textWords + linkWords + beta)``.

     ``textWords`` is summed over every sub-tree in the list. Previously
     only the text of the last sub-tree was counted (and an empty list
     raised ``NameError``), although links were gathered from all of them.

     Raises:
         ZeroDivisionError: if there are no words at all and ``self.beta``
             is zero.
     """
     linkStrList = []
     textWordNum = 0
     for subDom in subDomList:
         textWordNum += len(subDom.text.split())
         for link in getLink(subDom):
             linkStrList.append(link.text)
             imgList = link.find_all('img')
             if not imgList:
                 linkStrList.append('string')
             for img in imgList:
                 if 'alt' in img.attrs:
                     linkStrList.append(img['alt'])
                 else:
                     linkStrList.append('string')
     linkStrLen = sum(len(linkStr.split()) for linkStr in linkStrList)
     return (linkStrLen + self.beta) / (
         float(textWordNum) + linkStrLen + self.beta)
Пример #15
0
 def disThres(self, domTree, alpha=1):
     """Pick a distance threshold from the gaps between consecutive links.

     Collects the non-zero ``getDis`` values between each link and its
     predecessor (plus a 0 entry), sorts them in descending order and scores
     the candidate at rank ``i`` as
     ``gap * count * alpha + i * largestGap``. The gap with the lowest
     score (earliest rank on ties) is returned.
     """
     linkList = getLink(domTree)
     gaps = [0]
     for pos in range(1, len(linkList)):
         gap = self.getDis(linkList[pos], linkList[pos - 1])
         if gap:
             gaps.append(gap)
     gaps.sort(reverse=True)

     count = len(gaps)
     largest = gaps[0]
     scores = [
         gap * count * alpha + rank * largest
         for rank, gap in enumerate(gaps)
     ]
     # min() keeps the first of equally low scores, like a strict '<' scan.
     best = min(range(count), key=scores.__getitem__)
     return gaps[best]
Пример #16
0
def clusterLink1D(dom, para=1):
    """Cut the links of ``dom`` into consecutive clusters at large gaps.

    Links are taken in the order ``getLink`` returns them. A new cluster
    starts at a link whose ``md.getDis`` distance to the previous link
    exceeds ``md.dt``.

    Returns:
        A list of non-empty clusters (lists of links) that together contain
        every link exactly once, or ``[]`` when ``dom`` has no links.
    """
    md = ldi.linkDistance(dom, para)
    linkList = getLink(dom)
    if not linkList:
        return []
    linkClusterList = []
    current = [linkList[0]]
    for i in range(1, len(linkList)):
        if md.getDis(linkList[i], linkList[i - 1]) > md.dt:
            linkClusterList.append(current)
            # The link after the gap opens the next cluster. Resetting to
            # an empty list here used to drop that link from every cluster.
            current = [linkList[i]]
        else:
            current.append(linkList[i])
    linkClusterList.append(current)
    return linkClusterList
Пример #17
0
def clusterLinkSC(dom,
                  k=2,
                  affinity='nearest_neighbors',
                  n_neighbors=1,
                  gamma=0.021):
    """Cluster the links of ``dom`` into ``k`` groups with spectral clustering.

    The single feature per link is its ``index`` attribute.

    Returns:
        A list of ``k`` lists of links (some may be empty), or ``[]`` when
        ``dom`` has no links.
    """
    # Result is not used below; the call is kept in case it has side
    # effects on ``dom`` - TODO confirm.
    ldi.linkDistance(dom, 1)
    linkList = getLink(dom)
    # Earlier experiments (left as comments in the original) also used
    # link.parent['index'], link['link_color'] and link['link_size'].
    features = [[anchor['index']] for anchor in linkList]
    if not features:
        return []
    # NOTE(review): `affinity` and `n_neighbors` are accepted but not
    # forwarded; only `k` and `gamma` reach SpectralClustering.
    model = SpectralClustering(k, gamma=gamma)
    labels = model.fit_predict(array(features))
    buckets = [[] for _ in range(k)]
    for label, anchor in zip(labels, linkList):
        buckets[label].append(anchor)
    return buckets
def runK(dataDir, clmd):
    """Score a k-based clustering method on every ``.html`` file in ``dataDir``.

    For each page, ``k`` is the number of distinct ``clusterindex`` values
    found on its links, and ``clmd(dom, k)`` is called to cluster the page.
    Pages for which ``clmd`` raises are reported with a traceback and
    skipped.

    Args:
        dataDir: directory containing the pages.
        clmd: callable taking ``(dom, k)`` and returning a list of clusters.

    Returns:
        ``[ariList, amiList]`` with one ``cm.ari`` / ``cm.ami`` score per
        successfully clustered page.
    """
    htmlList = [
        html for html in listdir(dataDir) if html.split('.')[-1] == 'html'
    ]
    ariList = []
    amiList = []
    for html in htmlList:
        # Close the file once parsed instead of leaking the handle.
        with open(path.join(dataDir, html)) as htmlFile:
            dom = bs(htmlFile)
        clusterIndexSet = set()
        for link in getLink(dom):
            if 'clusterindex' in link.attrs:
                clusterIndexSet.add(link['clusterindex'])
        try:
            linkClusterList = clmd(dom, len(clusterIndexSet))
        except Exception:
            print(tb.format_exc())
            continue
        ariList.append(cm.ari(linkClusterList))
        amiList.append(cm.ami(linkClusterList))
    return [ariList, amiList]
Пример #19
0
def clusterLinkDB(dom, para=0.5):
    """Cluster the links of ``dom`` with DBSCAN on their ``index`` attribute.

    DBSCAN is run with ``md.dt`` as its first argument and
    ``min_samples=1``.

    Returns:
        A list of non-empty clusters (lists of links); ``[]`` when ``dom``
        has no links; the plain list of links when ``md.dt`` is falsy.
    """
    md = ldi.linkDistance(dom, para)
    linkList = getLink(dom)
    # Earlier experiments (left as comments in the original) also used
    # link.parent['index'], link['link_color'] and link['link_size'].
    features = [[anchor['index']] for anchor in linkList]
    if not features:
        return []
    if not md.dt:
        # NOTE(review): this returns the flat list of links, not a list of
        # clusters like every other return path - confirm what callers
        # expect here.
        return linkList
    # A radius of md.dt*1.414 was tried here previously.
    labels = DBSCAN(md.dt, min_samples=1).fit_predict(array(features))
    buckets = [[] for _ in range(len(set(labels)))]
    for label, anchor in zip(labels, linkList):
        buckets[label].append(anchor)
    return [bucket for bucket in buckets if bucket]
def main():
    """Run the SVM link-classification experiment and print averaged scores.

    Loads pages ``1.html`` .. ``100.html`` from ``htmlFileDir``, builds a
    feature matrix and label list with ``genMatrix``, then for each training
    ratio repeats 100 shuffled train/test rounds and prints the mean of the
    four values returned by ``metric`` (accumulated below as precision,
    recall, f1_score and accuracy).

    Written for Python 2: it relies on ``range`` returning a list and on the
    ``print`` statement.
    """
    # The first assignment is immediately overwritten by the second.
    htmlFileDir = '../../data/cleanEval'
    htmlFileDir = '../../data/SSD/Big5/techweb.com'
    #htmlFileDir = '../../data/SSD/myriad40'
    # The string literal below is a disabled KMeans experiment; it is never
    # executed.
    '''
    for num in range(0, 10):
        htmlFilePath = path.join(htmlFileDir, str(num+1)+'.html')
        try:
            (oLinkMatrix, oGroundList, oClusterIndex) = genMatrix([bs(open(htmlFilePath))])
            scaler = MinMaxScaler()
            linkMatrix = scaler.fit_transform(oLinkMatrix)
            est = KMeans(n_clusters=2)
            y = est.fit_predict(linkMatrix)
            print metric(y, oGroundList)
        except:
            print(tb.format_exc())
            continue
    '''
    domList = []
    total = 0
    testRatio = 0.5
    # With search False the grid-search branch below is never taken.
    search = False
    # Pages that fail to open or parse are silently skipped; `total` counts
    # the links of the pages that loaded.
    for num in range(0, 100):
        htmlFilePath = path.join(htmlFileDir, str(num + 1) + '.html')
        try:
            domList.append(bs(open(htmlFilePath)))
            total += len(getLink(domList[-1]))
        except:
            continue
    print(total)
    (oLinkMatrix, oGroundList, oClusterIndex) = genMatrix(domList)

    # Pair each feature row with its label so both are shuffled together.
    dataList = [[oLinkMatrix[i], oGroundList[i]]
                for i in range(0, len(oGroundList))]
    job_n = 16
    # Ratios 0.01..0.10 in steps of 0.01, then 0.10..1.00 in steps of 0.10;
    # 0.10 appears twice because both ranges contain 10. Adding two ranges
    # requires Python 2, where range() returns a list.
    trainRatioList = [
        float(i) / 100 for i in range(1, 11) + range(10, 101, 10)
    ]
    for trainRatio in trainRatioList:
        turnNum = 100
        turn = 0
        precision = recall = f1_score = accuracy = 0
        # Only rounds that finish without an exception are counted.
        # NOTE(review): a round that always raises makes this loop spin
        # forever, since the bare except just retries.
        while turn < turnNum:
            try:
                random.shuffle(dataList)
                linkMatrix = [dataList[i][0] for i in range(0, len(dataList))]
                groundList = [dataList[i][1] for i in range(0, len(dataList))]
                # Rows [0:upperBound] are the training set; the test set
                # starts after testBound.
                testBound = int(testRatio * len(groundList))
                upperBound = int(trainRatio * (1 - testRatio) *
                                 len(groundList))
                # The scaler is fitted on the full matrix, test rows
                # included.
                scaler = StandardScaler()
                linkMatrix = scaler.fit_transform(linkMatrix)
                grid = None
                if search:
                    C_range = np.logspace(-2, 10, 13)
                    gamma_range = np.logspace(-9, 3, 13)
                    param_grid = dict(gamma=gamma_range, C=C_range)
                    cv = StratifiedShuffleSplit(groundList[0:upperBound],
                                                n_iter=5,
                                                test_size=0.2,
                                                random_state=42)
                    grid = GridSearchCV(SVC(),
                                        param_grid=param_grid,
                                        cv=cv,
                                        n_jobs=job_n)
                    grid.fit(linkMatrix[0:upperBound],
                             groundList[0:upperBound])
                    clf = SVC(C=grid.best_params_['C'],
                              gamma=grid.best_params_['gamma'])
                else:
                    # C is computed but not passed to SVC below, which uses
                    # C=1. A zero label sum raises here and the round is
                    # retried.
                    C = float(upperBound / sum(groundList[0:upperBound]))
                    clf = SVC(C=1, gamma=0.10, kernel='rbf')
                clf.fit(linkMatrix[0:upperBound], groundList[0:upperBound])
                # NOTE(review): the slice starts at testBound + 1, so the
                # row at index testBound is neither trained on nor tested -
                # confirm whether that is intended.
                predict = clf.predict(linkMatrix[testBound + 1:])
                tmp = metric(predict, groundList[testBound + 1:])
                precision += tmp[0]
                recall += tmp[1]
                f1_score += tmp[2]
                accuracy += tmp[3]
                turn += 1
            except:
                continue
        # Mean of each score over the turnNum successful rounds.
        print "%s, %s, %s, %s" % (precision / turnNum, recall / turnNum,
                                  f1_score / turnNum, accuracy / turnNum)
def calLinkRatio(dom, smooth=0.0):
    """Return the ratio of link count to word count in ``dom``.

    ``smooth`` is added to both the numerator and the denominator. With the
    default ``smooth`` of 0.0 a ``dom`` without any text raises
    ``ZeroDivisionError``.
    """
    linkCount = len(getLink(dom))
    wordCount = len(dom.text.split())
    return (linkCount + smooth) / (wordCount + smooth)