def singleObjectPatternFiltering(patterns, websiteLocation, supervisedFileLocation, preprocessType="None"):
    """Keep only the (lp, rp) patterns that re-extract the gold value on at
    least one supervised page of the website.

    Args:
        patterns: iterable of (leftPattern, rightPattern) tuples.
        websiteLocation: root directory containing one folder per page.
        supervisedFileLocation: file name (relative to each page folder)
            holding the supervised gold value.
        preprocessType: "NUM" replaces number words in the page content
            before extraction; any other value leaves the content untouched.

    Returns:
        List of (lp, rp) tuples that matched on at least one page.
    """
    # The page list depends only on the website, so fetch it once instead of
    # once per pattern as the original did.
    pages = getAllPagesInsideWebsite(websiteLocation)
    output = []
    for (lp, rp) in patterns:
        matched = False
        for page in pages:
            exactPageLocation = page + "/page.html"
            contentList = readFileContentInList(page + "/" + supervisedFileLocation)
            # The gold value exists only when exactly one supervised line is present.
            singleObj = contentList[0] if len(contentList) == 1 else ""
            # Collapse whitespace runs so the comparison ignores layout.
            goldContent = " ".join(singleObj.split())
            pageContent = readPlainHtmlPageContent(exactPageLocation)
            if preprocessType == "NUM":
                pageContent = replaceNumWordsInStr(pageContent)
            results = makeSingleObjectExtractions(pageContent, lp, rp)
            if goldContent in results:
                # Only "at least one page matched" is observable, so one hit
                # is enough to keep the pattern.
                matched = True
                break
        if matched:
            output.append((lp, rp))
    return output
def singleObjectPatternFiltering(patterns, websiteLocation, supervisedFileLocation, artificialSeedSet, threshold=1000, preprocessType="None"):
    """Collect, for every (lp, rp) pattern, the extractions computed on each
    supervised page alongside the gold value and the page's artificial seeds,
    then delegate the filtering decision to getPatternsFromStats.

    Args:
        patterns: iterable of (leftPattern, rightPattern) tuples.
        websiteLocation: root directory containing one folder per page.
        supervisedFileLocation: file name (relative to each page folder)
            holding the supervised gold value.
        artificialSeedSet: dict mapping page path -> artificial seed values.
        threshold: cutoff forwarded to makeSingleObjectExtractions.
        preprocessType: accepted for signature parity with the other
            filtering variants; currently unused in this function.

    Returns:
        The patterns selected by getPatternsFromStats from the gathered stats.
    """
    # Pages depend only on the website: fetch once, not once per pattern.
    pages = getAllPagesInsideWebsite(websiteLocation)
    stats = {}
    # Stats are keyed by a 1-based pattern index, as in the original.
    for patternIndex, pattern in enumerate(patterns, start=1):
        (lp, rp) = pattern  # unpack once per pattern, not once per page
        resultsPerPattern = []
        for page in pages:
            exactPageLocation = page + "/page.html"
            contentList = readFileContentInList(page + "/" + supervisedFileLocation)
            # The gold value exists only when exactly one supervised line is present.
            singleObj = contentList[0] if len(contentList) == 1 else ""
            goldContent = " ".join(singleObj.split())
            expected = [goldContent]
            pageContent = readPlainHtmlPageContent(exactPageLocation)
            computed = makeSingleObjectExtractions(pageContent, lp, rp, threshold)
            resultsPerPattern.append((computed, expected, artificialSeedSet[page]))
        stats[patternIndex] = (pattern, resultsPerPattern)
    return getPatternsFromStats(stats)
def clusterPatternFiltering(patterns, websiteLocation, supervisedFileLocation, artificialClusters, preprocessType="None"):
    """Collect, for every (lp, mp, rp) cluster pattern, the extracted cluster
    elements on each supervised page alongside the gold cluster and the
    flattened artificial clusters, then let getPatternsFromStats filter.

    Args:
        patterns: iterable of (leftPattern, middlePattern, rightPattern) tuples.
        websiteLocation: root directory containing one folder per page.
        supervisedFileLocation: file name (relative to each page folder)
            listing the supervised cluster elements, one per line.
        artificialClusters: dict mapping page path -> list of artificial
            clusters (each a list of elements).
        preprocessType: accepted for signature parity with the other
            filtering variants; currently unused in this function.

    Returns:
        The patterns selected by getPatternsFromStats from the gathered stats.
    """
    # Pages depend only on the website: fetch once, not once per pattern.
    pages = getAllPagesInsideWebsite(websiteLocation)
    stats = {}
    # Stats are keyed by a 1-based pattern index, as in the original.
    for patternIndex, pattern in enumerate(patterns, start=1):
        (lp, mp, rp) = pattern  # unpack once per pattern, not once per page
        resultsPerPattern = []
        for page in pages:
            exactPageLocation = page + "/page.html"
            contentList = readFileContentInList(page + "/" + supervisedFileLocation)
            # Collapse whitespace runs so comparisons ignore layout.
            expected = [" ".join(item.split()) for item in contentList]
            expectedArtificial = [
                " ".join(item.split())
                for sublist in artificialClusters[page]
                for item in sublist
            ]
            pageContent = readPlainHtmlPageContent(exactPageLocation)
            clusters = getClusterInsideLeftRightPattern(pageContent, lp, mp, rp)
            computed = []
            for cluster in clusters:
                computed.extend(getElementsOfCluster(cluster, mp))
            resultsPerPattern.append((computed, expected, expectedArtificial))
        stats[patternIndex] = (pattern, resultsPerPattern)
    return getPatternsFromStats(stats)
from utils import writeTripletPatternsAsCsv
from ClusterContextExtraction import getClusterContexts
from RegExpPatternFilteringModule import clusterPatternFiltering
from utils import appendPreprocessType, processNumInContext
from SeedExpansionModule import addArtificialClusterSeeds

# For each supervised website, gather per-page cluster contexts and build
# artificial cluster seeds keyed by page path.
# NOTE(review): getWebsiteLocations, supervisedDataLocation and
# supervisedFileName are defined elsewhere in this file (not visible here).
websiteLocations = getWebsiteLocations(supervisedDataLocation)
print(websiteLocations)
for websiteLocation in websiteLocations:
    pages = getAllPagesInsideWebsite(websiteLocation)
    contexts = []
    artificialClusters = {}
    for page in pages:
        print(page)
        print(websiteLocation)
        exactPageLocation = page + "/page.html"
        clusterElements = readFileContentInList(page + "/" + supervisedFileName)
        pageContent = readPlainHtmlPageContent(exactPageLocation)
        contextsPerPage = getClusterContexts(pageContent, clusterElements)
        contexts.append(contextsPerPage)
        # Artificial seeds augment the supervised cluster for later filtering.
        artificialClusterPerPage = addArtificialClusterSeeds(pageContent, clusterElements)
        artificialClusters[page] = artificialClusterPerPage
        # Kept for code past this chunk that may read it (was only used by
        # now-removed debug prints within this view).
        realCluster = "\n".join(clusterElements)
from FileUtil import getAllPagesInsideWebsite, readPlainHtmlPageContent
from FileUtil import readFileContentInList
from utils import getSingleObjectContexts, writePairPatternsAsCsv
from SingleObjectPatternsLearningUtil import learnPatterns
from utils import appendPreprocessType
from utils import processNumInContext

# For each supervised website, gather the contexts surrounding the single
# supervised object on every page, then learn extraction patterns from them.
# NOTE(review): getWebsiteLocations, supervisedDataLocation and
# supervisedFileName are defined elsewhere in this file (not visible here).
websiteLocations = getWebsiteLocations(supervisedDataLocation)
print(websiteLocations)
for websiteLocation in websiteLocations:
    pages = getAllPagesInsideWebsite(websiteLocation)
    singleObjectContexts = []
    singleObjList = []
    for page in pages:
        exactPageLocation = page + "/page.html"
        contentList = readFileContentInList(page + "/" + supervisedFileName)
        # The gold object exists only when exactly one supervised line is present.
        singleObj = contentList[0] if len(contentList) == 1 else ""
        # Collapse repeated spaces inside the supervised title.
        singleObj = " ".join(singleObj.split())
        pageContent = readPlainHtmlPageContent(exactPageLocation)
        contextsPerPage = getSingleObjectContexts(pageContent, singleObj)
        if contextsPerPage:
            singleObjectContexts.append(contextsPerPage)
            singleObjList.append(singleObj)
    print("You know what...")
    print("Single object contexts are:- ")
    print(singleObjectContexts)
    patterns = learnPatterns(singleObjectContexts)