예제 #1
def test(sites, keywords):
    sitesPaths = [os.path.join('nowe_filmy', 'test', site) for site in sites]

    siteTexts = [readSource(site) for site in sitesPaths] 

    print '*' * 80
    print 'TEST'
    filmDescriptions = {}
    newKeywords = []

    for (i, text) in enumerate(siteTexts):
        nohtmlText = re.sub('<[^<>]*>', ' ', text)
        plainText = re.sub('\s+', ' ', nohtmlText)

        print 'search site:', sites[i]
        filmName = sites[i][:-4]  # cut off .htm
        filmDescriptions[filmName] = {}

        foundKeywords = fillDescription(filmDescriptions[filmName], plainText, keywords + newKeywords)
        while foundKeywords != []:
            newKeywords += foundKeywords
            foundKeywords = fillDescription(filmDescriptions[filmName], plainText, keywords + newKeywords)

    return filmDescriptions
예제 #2
def retrieveFrom(name, fullDescr):
    '''Return the path, inside the source called name, selected for fullDescr.

    The source is read with readSource and parsed with bs. The keywords
    taken from fullDescr are located with keywordsPath, and the resulting
    best path is shortened by the number of levels reported by
    getBestPathShift.
    '''
    content = readSource(name)
    soup = bs(content)
    keywords = getKeywords(fullDescr)
    usedKeywordPaths, keywordsBestPath = keywordsPath(soup, keywords)

    shift = getBestPathShift(fullDescr, keywordsBestPath, usedKeywordPaths)

    bestPath = keywordsBestPath
    # NOTE(review): the body of this loop was missing from the file, which
    # made the function a syntax error. Dropping one trailing path element
    # per step is an assumption based on the name "shift" - confirm against
    # the original project. A non-positive shift leaves the path untouched.
    for _ in range(shift):
        bestPath = bestPath[:-1]

    return bestPath
예제 #3
def findObject(name, descr):
    '''Try to find an object in the source identified by name (URL or file
    name) using the description descr.

    Returns a tuple (subPaths, subtree): subPaths are the element paths
    with the leading len(bestPath) - 1 components cut off, and subtree is
    the result of soupFromPath for the best path. Returns None when an
    IndexError is raised while the paths are being computed.
    '''
    content = readSource(name)
    soup = bs(content)

    try:
        elementsFullPaths, bestPath = findPath(soup, descr)
        # Keep the last component of bestPath as the first component of
        # every relative path.
        cutSize = len(bestPath) - 1
        subPaths = [path[cutSize:] for path in elementsFullPaths]
    except IndexError:
        return None

    return subPaths, soupFromPath(soup, bestPath)
예제 #4
def findPredecessor(sources, names):
    siteTexts = [readSource(source) for source in sources] 

    predecessors = {
        'unigram': {},
        'bigram': {},
        'trigram': {}

    for (i, text) in enumerate(siteTexts):
        nohtmlText = re.sub('<[^<>]*>', ' ', text)
        plainText = re.sub('\s+', ' ', nohtmlText)

        print 'search site:', sources[i]
        for name in names:
            print 'search name:', name
            nameIndexes = findIndexes(plainText, name)

            for index in nameIndexes:
                partialText = plainText[:index]
                words = findThreeLastWords(predecessors, partialText)

                if words[0][-1] == ',':
                    textWithOmitted = omitPrevious(partialText, 4)
                    words = findThreeLastWords(predecessors, textWithOmitted)
                    for word in words:
                        for key in predecessors.iterkeys():
                            if word in predecessors[key]:
                                predecessors[key][word] += 1
                    fillPredecessors(predecessors, words)

    bestName = chooseBestName(predecessors)
    return cleanName(bestName)
예제 #5
def elementInSoup(subSoup, element, usedPaths):
    print element
    subPath = getPath(subSoup)
    if element['keyVisible'] == True:
        findSoup = subSoup.find(text=re.compile(element['key']))
        if findSoup is None:
            return None, False, usedPaths
        findPath = getPath(findSoup)[:len(subPath)]
        if matchPath(element['path'], findPath):
            return {
                'key': element['key'],
                'value': findSoup.text
            }, True, usedPaths + findPath
            print 'Sciezki sie nie zgadzaja'
            return None, False, usedPaths

    return ({'key': u'Reżyseria', 'value': 'Al Bundy'}, True, usedPaths)

def visitRestTree(fullDescr, subSoup, usedPaths):
    '''Return a fixed placeholder entry together with subSoup and usedPaths.

    fullDescr is not consulted; the first item of the returned tuple is
    always the same hard-coded key/value pair.
    '''
    placeholder = {}
    placeholder['key'] = u'Reżyseria'
    placeholder['value'] = 'Al Bundy'
    return placeholder, subSoup, usedPaths

def retrieveFrom(name, fullDescr):
    '''Return the path, inside the source called name, selected for fullDescr.

    The source is read with readSource and parsed with bs. The keywords
    taken from fullDescr are located with keywordsPath, and the resulting
    best path is shortened by the number of levels reported by
    getBestPathShift.
    '''
    content = readSource(name)
    soup = bs(content)
    keywords = getKeywords(fullDescr)
    usedKeywordPaths, keywordsBestPath = keywordsPath(soup, keywords)

    shift = getBestPathShift(fullDescr, keywordsBestPath, usedKeywordPaths)

    bestPath = keywordsBestPath
    # NOTE(review): the body of this loop was missing from the file, which
    # made the function a syntax error. Dropping one trailing path element
    # per step is an assumption based on the name "shift" - confirm against
    # the original project. A non-positive shift leaves the path untouched.
    for _ in range(shift):
        bestPath = bestPath[:-1]

    return bestPath

def getKeywords(fullDescr):
    '''Return the keys of all description entries whose key is visible.

    fullDescr is an iterable of dicts that carry at least 'key' and
    'keyVisible'. The result keeps the order of fullDescr.
    '''
    # The statement that collected the key was missing from the file, so
    # the function failed to parse; the keys are what getBestPathShift
    # later matches against descr['key'].
    return [el['key'] for el in fullDescr if el['keyVisible']]

def keywordsPath(soup, keywords):
    '''Find where each keyword occurs in soup.

    Every keyword is wrapped in a description entry whose value is the
    keyword and whose key is an artificial, index-based name; the entries
    are then passed to findPath.

    Returns a tuple (usedKeywordsPaths, bestPath): a dict mapping each
    keyword that got a non-empty result to that result, and the best path
    reported by findPath.
    '''
    descrEl = []
    for (i, keyword) in enumerate(keywords):
        # The surrounding append call was missing from the file, leaving
        # these two entries as a syntax error.
        descrEl.append({
            'key': '%d_____' % i,
            'value': keyword
        })

    elementPaths, bestPath = findPath(soup, descrEl)

    usedKeywordsPaths = {}
    for (i, keyword) in enumerate(keywords):
        if elementPaths[i] != []:
            usedKeywordsPaths[keyword] = elementPaths[i]
    return (usedKeywordsPaths, bestPath)

def getBestPathShift(fullDescr, keywordsBestPath, keywordsPaths):
    keywords = keywordsPaths.keys()
    descrPaths = [descr['path'] for descr in fullDescr if descr['key'] in keywordsPaths]
    matchingPath = reduce(matchPath, descrPaths)
    print 'mp:', matchingPath
    cutPath = cutAllMatch(matchingPath)
    shift = len(cutPath) - 1
    print 'Matching path shift =', shift
    return shift

def matchPath(path1, path2, strict=False):
    '''Compare two paths component by component.

    A path is a sequence of (name, index) pairs. Only the first
    min(len(path1), len(path2)) components are compared, each with check.
    On success the list of (name, index) pairs reported by check is
    returned.
    '''
    minLength = min(len(path1), len(path2))
    matchingPath = []
    for i in range(minLength):
        p1 = path1[i]
        p2 = path2[i]
        nameCheck = check(p1[0], p2[0], strict)
        indCheck = check(p1[1], p2[1], strict)
        if not nameCheck[0] or not indCheck[0]:
            # NOTE(review): the body of this branch was missing from the
            # file, which made the function a syntax error. Returning an
            # empty (falsy) list is an assumption based on callers using
            # the result as a truth value - confirm against the original.
            return []
        matchingPath.append((nameCheck[1], indCheck[1]))

    return matchingPath

def cutAllMatch(path):
    '''Return a copy of path without its trailing wildcard components.

    Components are removed from the end for as long as the second item of
    the last component equals '_'. The argument itself is not modified.
    '''
    pathCopy = path[:]
    # The loop body was missing from the file; removing the last component
    # is the only body that lets this loop terminate.
    while pathCopy and pathCopy[-1][1] == '_':
        pathCopy.pop()

    return pathCopy

def check(x1, x2, strict=False):
    '''Compare two path items and return (matches, merged item).

    In non-strict mode the string '_' acts as a wildcard: if either item
    is '_', the items match and the merged item is '_'. Otherwise, and
    always in strict mode, the items match only when they are equal, and
    the merged item is x1.
    '''
    # Both 'else' lines were missing from the file, so the wildcard branch
    # was unreachable and non-strict calls returned None.
    if not strict and (x1 == '_' or x2 == '_'):
        return (True, '_')
    return (x1 == x2, x1)

if __name__ == '__main__':
    wstydPaths, wstydSubSoup = findObject('filmy\\wstyd.htm', wstydDes)
    wstydFullDescr = prepareDescription(wstydSubSoup, wstydDes, wstydPaths)

    #cubaPaths, cubaSubSoup = findObject('filmy\\cuba_isla_of_music.htm', cubaDes)
    #cubaFullDescr = prepareDescription(cubaSubSoup, cubaDes, cubaPaths)

    ostatniaPaths, ostatniaSubSoup = findObject('filmy\\ostatnia_milosc_na_ziemi.htm', ostatniaDes)
    ostatniaFullDescr = prepareDescription(ostatniaSubSoup, ostatniaDes, ostatniaPaths)

    descriptions = [wstydFullDescr, ostatniaFullDescr]
    mergedDescription = merge(descriptions)

    for d in mergedDescription:
        print '------'
        for k, v in d.iteritems():
            print k, ':', v
        print '------'

    print '--------------'
    retrievedPath = retrieveFrom('filmy\\zapiski.htm', mergedDescription)
    print retrievedPath
    cnt = readSource('filmy\\zapiski.htm')
    soup = bs(cnt)
    subSoup = soupFromPath(soup, retrievedPath)

    print 'get data'
    print getData(subSoup, mergedDescription)
예제 #6
def getSoup(name):
    '''Read the source identified by name and return it parsed by bs.'''
    return bs(readSource(name))