Exemplos de clearHTML em Python, exemplos de strip_html.clearHTML em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: real_title2.py Projeto: ctwiz/sourcereader

def stripsplitTitle(full_title):
    """Pick the most title-like part of an HTML page title.

    The title is cleaned with clearHTML and split on common separators
    (``|``, ``-``, ``::``, ``>>``, ``/`` and their entity forms, each
    surrounded by whitespace).

    * If it splits, the block with the most space-separated words (more
      than one word) is returned stripped; ``''`` is returned when no block
      has more than one word.  Page confidence is incremented.
    * Otherwise the title is run through check_rt against the page's raw
      HTML.  When the phrase that comes back covers at least half of the
      title's words it is returned (confidence + 1); if not, the first child
      of the page's <h1> is used when it is non-empty, falling back to the
      cleaned title (confidence - 1).

    Side effects: prints debug output and mutates
    ``infoModule.info.page['confidence']``.
    """
    ## beautiful soup replaces html entities, so this needs to put them back in
    full_title = full_title.replace('~1.314392653~', '&')
    full_title = clearHTML(full_title)
    print("RTS:Full_title I'm being given : " + full_title)
    ## Regex for title to split on ##
    sp = re.compile(r'''\s+(&raquo;|\>\>|&#187;|&laquo;|\<\<|&#171;|\||&#124;|\-\-|::|-|&#8211;|/)\s+''', re.I)
    print("RT2:Beginning to split title.  Unsplit title:   \"" + full_title + "\"",2)
    blocks = re.split(sp, full_title)

    if len(blocks) > 1:
        ## Find longest block of title (measured in words) ##
        longest = 1
        longestb = ''
        for b in blocks:
            word_count = len(b.split(' '))
            if len(b) > 1 and word_count > longest:
                longest = word_count
                longestb = b
        print("RT2:longestblock:  FOUND!    \"" + longestb.strip() + "\"",2)
        #found title in story, confidence + 1
        infoModule.info.page['confidence'] += 1
        return longestb.strip()

    stripped_title = full_title.strip()
    print("RT2:title unsplittable, running through contig title and returning:  \"" + full_title.strip() + "\"",2)
    tit = check_rt(stripped_title, infoModule.info.page['rawHTML'])
    print("RT2:title coming back from check_rt : " + tit)
    ## logic that splits titles and takes up the H1
    tit_words = tit.split(' ')
    title_words = stripped_title.split(' ')
    print("RT2: length of tit.split " + str(len(tit_words)))
    print(tit_words)
    print("RT2: length of full_title.strip.split " + str(len(title_words)))
    print(stripped_title)
    print(title_words)
    # float() forces true division: under Python 2 int / int truncates to
    # 0 or 1, which made the 0.50 threshold meaningless.
    quotient = float(len(tit_words)) / len(title_words)
    quotient_limit = 0.50
    print(" QUOTIENT : " + str(quotient))
    if quotient >= quotient_limit:
        #found title in story, confidence + 1
        infoModule.info.page['confidence'] = infoModule.info.page['confidence'] + 1
        return tit

    #reverting to h1 or title tag, confidence - 1
    infoModule.info.page['confidence'] = infoModule.info.page['confidence'] - 1
    soup = BeautifulSoup(''.join(infoModule.info.page['rawHTML']))
    testStr = soup.h1
    pprint.pprint(testStr)
    # soup.h1 is None when the page has no <h1>; check that before touching
    # .contents.  An <h1> that exists but is empty is handled the same way.
    if testStr is None or len(testStr.contents) == 0:
        return full_title
    clearedH1 = clearHTML(str(testStr.contents[0])).strip()
    if clearedH1 != '':
        print("clearedH1 ~" + clearedH1 + "~")
        return clearedH1
    return full_title

Exemplo n.º 2

0

Exibir arquivo

Arquivo: largestBlock.py Projeto: ctwiz/sourcereader

def bodyExtract(url):
    """Fetch *url* and run the body extractor over its HTML-stripped text.

    Stores the fetched HTML, the title, the meta description (when one is
    found) and the stripped text in ``infoModule.info.page``; the extractor
    result is published through the module-level ``be_results`` and returned.
    """
    global be_results

    infoModule.info.site['body_extractor_no_date'] = True

    infoModule.info.page['rawHTML'] = fetchPage(url)
    htmlTitle()
    infoModule.info.page['title'] = real_title2.realTitle()
    print(infoModule.info.page['title'])

    # The group captures the start of the description's content attribute,
    # up to and including its fourth run of whitespace.
    description_hit = re.search(
        'meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"',
        infoModule.info.page['rawHTML'],
        re.S | re.I)
    if description_hit is not None:
        decoded_description = description_hit.group(1).decode('utf-8')
        infoModule.info.page['meta_description'] = decoded_description
        print("meta_description: " + infoModule.info.page['meta_description'])

    stripped_text = strip_html.clearHTML(infoModule.info.page['rawHTML'])
    infoModule.info.page['plainText'] = stripped_text
    be_results = body_extractor.extract(
        infoModule.info.page['plainText'], doAsciiConvert=False)
    if be_results != None:
        print(be_results)
    return be_results

Exemplo n.º 3

0

Exibir arquivo

Arquivo: largestBlock.py Projeto: dpgailey/sourcereader

def bodyExtract(url):
    """Download *url*, strip its HTML and hand the text to body_extractor.

    Along the way the raw HTML, the title, an optional meta description and
    the plain text are written into ``infoModule.info.page``.  The extractor
    output is kept in the global ``be_results`` and also returned.
    """
    global be_results

    infoModule.info.site['body_extractor_no_date'] = True

    infoModule.info.page['rawHTML'] = fetchPage(url)
    htmlTitle()
    infoModule.info.page['title'] = real_title2.realTitle()
    print(infoModule.info.page['title'])

    # Only the leading part of the description (through its fourth
    # whitespace run) is captured by the group.
    meta_pattern = 'meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"'
    meta_match = re.search(meta_pattern, infoModule.info.page['rawHTML'],
                           re.I | re.S)
    if meta_match is not None:
        infoModule.info.page['meta_description'] = \
            meta_match.group(1).decode('utf-8')
        print("meta_description: " + infoModule.info.page['meta_description'])

    plain_text = strip_html.clearHTML(infoModule.info.page['rawHTML'])
    infoModule.info.page['plainText'] = plain_text
    be_results = body_extractor.extract(infoModule.info.page['plainText'],
                                        doAsciiConvert=False)
    if be_results != None:
        print(be_results)
    return be_results

Exemplo n.º 4

0

Exibir arquivo

Arquivo: find_new_entities_ut.py Projeto: ctwiz/bawdy

    row=cr.fetch_row(1,1)
    if row == ():
        break
    print(row)
    url = row[0]['url']
    infoModule.info.page['url'] = url
    infoModule.info.page['title'] = 'whatevs'
    log.plog('testing clearHTML', 1)
    print 'Attempting URL: ' + url + "\r\n"
    urlSocket = urllib.urlopen(url)
    html = urlSocket.read()	
	
    if (len(sys.argv) > 1 and sys.argv[1]) == 'features':
        html = strip_html.clearHTMLFeatures(html)
    else:
        html = strip_html.clearHTML(html)
    find_new_entities.find_new_entities(html)
    print '========================================================='
    
# blockedWords test: fetch one fixed URL, strip its HTML with
# strip_html.clearHTML, print the result and pass it to find_new_entities.
# NOTE(review): the loop has no break or exit visible here, so the same page
# is fetched over and over until the process is interrupted -- confirm this
# is intended.
while (1):
    url = 'http://news.ycombinator.com/item?id=2092273'
    infoModule.info.page['url'] = url
    # placeholder title; the value is not derived from the fetched page
    infoModule.info.page['title'] = 'whatevs'
    print 'Attempting URL: ' + url + "\r\n"
    urlSocket = urllib.urlopen(url)
    html = urlSocket.read()	
    html = strip_html.clearHTML(html)
    print html

    find_new_entities.find_new_entities(html)

Exemplo n.º 5

0

Exibir arquivo

def stripsplitTitle(full_title):
    """Pick the most title-like part of an HTML page title.

    The title is cleaned with clearHTML and split on common separators
    (``|``, ``-``, ``::``, ``>>``, ``/`` and their entity forms, each
    surrounded by whitespace).

    * If it splits, the block with the most space-separated words (more
      than one word) is returned stripped; ``''`` is returned when no block
      has more than one word.  Page confidence is incremented.
    * Otherwise the title is run through check_rt against the page's raw
      HTML.  When the phrase that comes back covers at least half of the
      title's words it is returned (confidence + 1); if not, the first child
      of the page's <h1> is used when it is non-empty, falling back to the
      cleaned title (confidence - 1).

    Side effects: prints debug output and mutates
    ``infoModule.info.page['confidence']``.
    """
    ## beautiful soup replaces html entities, so this needs to put them back in
    full_title = full_title.replace('~1.314392653~', '&')
    full_title = clearHTML(full_title)
    print("RTS:Full_title I'm being given : " + full_title)
    ## Regex for title to split on ##
    sp = re.compile(
        r'''\s+(&raquo;|\>\>|&#187;|&laquo;|\<\<|&#171;|\||&#124;|\-\-|::|-|&#8211;|/)\s+''',
        re.I)
    print(
        "RT2:Beginning to split title.  Unsplit title:   \"" + full_title +
        "\"", 2)
    blocks = re.split(sp, full_title)

    if len(blocks) > 1:
        ## Find longest block of title (measured in words) ##
        longest = 1
        longestb = ''
        for b in blocks:
            word_count = len(b.split(' '))
            if len(b) > 1 and word_count > longest:
                longest = word_count
                longestb = b
        print("RT2:longestblock:  FOUND!    \"" + longestb.strip() + "\"", 2)
        #found title in story, confidence + 1
        infoModule.info.page['confidence'] += 1
        return longestb.strip()

    stripped_title = full_title.strip()
    print(
        "RT2:title unsplittable, running through contig title and returning:  \""
        + full_title.strip() + "\"", 2)
    tit = check_rt(stripped_title, infoModule.info.page['rawHTML'])
    print("RT2:title coming back from check_rt : " + tit)
    ## logic that splits titles and takes up the H1
    tit_words = tit.split(' ')
    title_words = stripped_title.split(' ')
    print("RT2: length of tit.split " + str(len(tit_words)))
    print(tit_words)
    print("RT2: length of full_title.strip.split " +
          str(len(title_words)))
    print(stripped_title)
    print(title_words)
    # float() forces true division: under Python 2 int / int truncates to
    # 0 or 1, which made the 0.50 threshold meaningless.
    quotient = float(len(tit_words)) / len(title_words)
    quotient_limit = 0.50
    print(" QUOTIENT : " + str(quotient))
    if quotient >= quotient_limit:
        #found title in story, confidence + 1
        infoModule.info.page[
            'confidence'] = infoModule.info.page['confidence'] + 1
        return tit

    #reverting to h1 or title tag, confidence - 1
    infoModule.info.page[
        'confidence'] = infoModule.info.page['confidence'] - 1
    soup = BeautifulSoup(''.join(infoModule.info.page['rawHTML']))
    testStr = soup.h1
    pprint.pprint(testStr)
    # soup.h1 is None when the page has no <h1>; check that before touching
    # .contents.  An <h1> that exists but is empty is handled the same way.
    if testStr is None or len(testStr.contents) == 0:
        return full_title
    clearedH1 = clearHTML(str(testStr.contents[0])).strip()
    if clearedH1 != '':
        print("clearedH1 ~" + clearedH1 + "~")
        return clearedH1
    return full_title

Exemplo n.º 6

0

Exibir arquivo

def check_rt(title, html):
    """Find the longest run of consecutive title words present in the page text.

    Both *title* and *html* are passed through clearHTML; the cleaned page
    text is split into lines.  For every line, each starting word of the
    title is tried in turn and extended word by word for as long as the
    space-joined phrase is found in the line (substring search).  The longest
    phrase per line is recorded.

    Returns the phrase with the highest word count across all lines; ties
    are won by the later line.  Returns ``''`` when nothing matched.

    Side effect: prints debug output for every line examined.
    """
    print("<br><br>title I'm getting before clearHTML : " + title +
          "<br><br><br>")
    # Best effort: work on decoded text when possible, otherwise keep
    # whatever clearHTML returned.
    try:
        lines = clearHTML(html).decode('utf-8')
    except Exception:
        lines = clearHTML(html)
    title = clearHTML(title)
    lines = re.sub("\n\n+", '\n', lines)
    lines = re.split("\n+", lines)
    print("<br><br>title I'm being passed after clearHTML : " + title +
          "<br><br>")
    matches = []
    title_array = re.split(r"\s+", title)
    title_len = len(title_array)
    for k, line in enumerate(lines):
        print(title)
        print(str(k) + " " + line)
        # longest phrase matched on this line, and its length in words
        phrase = ''
        word_match_count = 0
        # title_pointer: first word of the candidate phrase
        # title_inc: last word of the candidate phrase
        title_pointer = 0
        title_inc = 0
        while title_pointer < title_len:
            # the candidate phrase is rebuilt from title_pointer each time
            words_to_match = ''
            while title_inc < title_len:
                # add the next word, then drop the trailing space for testing
                words_to_match += title_array[title_inc] + " "
                match_this = words_to_match[:-1]
                try:
                    match_this = match_this.decode('utf-8')
                except Exception:
                    # leave the phrase as-is when it cannot be decoded
                    pass
                # handle some encoding fall-through: a failed comparison
                # counts as "not found"
                try:
                    cluster_match = line.find(match_this)
                except Exception:
                    cluster_match = -1
                if cluster_match < 0:
                    break
                # keep the phrase if it is longer than the best so far
                candidate_len = title_inc - title_pointer + 1
                if word_match_count < candidate_len:
                    word_match_count = candidate_len
                    phrase = match_this
                # it matched, so try again with one more word
                title_inc += 1
            # the phrase reached the end of the title: nothing longer exists
            if title_inc == title_len:
                break
            # restart the search from the word after the previous start
            title_pointer += 1
            title_inc = title_pointer
        # record (line number, match count, phrase) for this line
        matches.append((k, word_match_count, phrase))
    # Highest match count wins, later line on ties.  This is the same element
    # the original put first by re-sorting the list after every line.
    return max(matches, key=itemgetter(1, 0))[2]

Exemplo n.º 7

0

Exibir arquivo

Arquivo: storyFetcher.py Projeto: ctwiz/sourcereader

def fetchStory(url):
    """Fetch *url* and extract its title, outline, images and videos.

    Steps, in order:
      1. Download the page (5 second read timeout); bail out via failOn on
         open errors, non 200/301/302/303 responses, non text/html content,
         read timeouts and bodies over 500000 characters.
      2. Decode the body using the encoding guessed by chardet, switching to
         windows-1252 when characters from ``windows_trouble_list`` show up.
      3. Fill ``infoModule.info.page`` with meta description, HTML title,
         title, plain text, outline and image HTML.
      4. Collect images and videos.

    Returns a JSON string ``{"status": "OK", "story": {...}}`` with keys
    ``title``, ``outline``, ``url``, ``images`` and ``videos``, or whatever
    failOn returns when the page is rejected.
    """
    infoModule.info.page['url'] = url
    log.plog("fetching " + url, 2)
    request_obj = urllib2.Request(url)
    request_obj.add_header('Referer', 'http://www.google.com/')
    request_obj.add_header('User-agent', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)')
    try:
        websock = urllib2.urlopen(request_obj)
    except IOError:
        log.plog('could not open ' + url, 4)
        return failOn('could not open ' + url)

    # The finally clause releases the connection on every path, including
    # the early failure returns.
    try:
        responseCode = websock.getcode()
        headerInfo = websock.info()
        pprint.pprint(headerInfo)
        log.plog('urllib2 response code: ' + str(responseCode), 2)
        if responseCode not in (200, 301, 302, 303):
            log.plog('got failure response code from server', 4)
            return failOn('got failure response code from server')
        contentType = headerInfo.gettype()
        if contentType not in ('text/html', 'text/html, text/html'):
            log.plog('content type: ' + contentType + '. not fetching', 4)
            return failOn('content type: ' + contentType + '. not fetching')

        readWithTimeout = timeout.TimeoutFunction(websock.read, 5)
        try:
            infoModule.info.page['rawHTML'] = readWithTimeout()
        except timeout.TimeoutFunctionException:
            log.plog("timeout while trying to fetch " + url, 101)
            return failOn('read timeout ' + url)
        redirURL = websock.geturl()
    finally:
        websock.close()

    if redirURL != url:
        log.plog('redirected to ' + redirURL, 2)
        url = redirURL
        #redirected urls need to be blocked too

    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        return failOn('article length exceeds 500k, probably not html')

    windows_trouble_list = [u'\x93', u'\x92', u'\x91', u'\x96', u'\x94']
    cd = chardet.detect(infoModule.info.page['rawHTML'])
    encoding = cd['encoding']
    if encoding is None:
        # chardet could not guess an encoding; leave the body undecoded
        # instead of crashing on decode(None).
        log.plog('could not detect server encoding', 3)
    elif encoding != 'ascii':
        log.plog('Server encoding: ' + encoding, 2)
        oldHTML = infoModule.info.page['rawHTML']
        infoModule.info.page['rawHTML'] = oldHTML.decode(encoding)
        decoded = infoModule.info.page['rawHTML']
        # these code points in the decoded text indicate the bytes were
        # really windows-1252
        if any(decoded.find(trouble) >= 0 for trouble in windows_trouble_list):
            log.plog('this is actually windows-1252', 3)
            infoModule.info.page['rawHTML'] = oldHTML.decode('windows-1252')

    # some configuration options
    infoModule.info.page['meta_description'] = ''
    meta_search = re.search(r'meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"', infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search is not None:
        infoModule.info.page['meta_description'] = meta_search.group(1)
        log.plog("meta_description: " + infoModule.info.page['meta_description'], 2)

    log.plog('======================================= TITLE ================================', 2)
    # get title
    #set HTMLTitle first
    HTMLTitle = re.search(r'<title>(.*?)<\/title>', infoModule.info.page['rawHTML'], re.S | re.I)
    if HTMLTitle is not None:
        infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1)
        log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2)
    else:
        infoModule.info.page['HTMLTitle'] = ""
    title = find_title.findTitle()
    if title != False:
        infoModule.info.page['title'] = title
        log.plog('title from regex', 2)
    # NOTE(review): the if/else below always assigns page['title'], so the
    # regex title set just above is overwritten -- confirm whether this was
    # meant to be an elif chain.
    if 'potential_title' in infoModule.info.page and len(infoModule.info.page['potential_title']) > 0:
        infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['potential_title'])
        log.plog('title from potential_title', 2)
    else:
        infoModule.info.page['title'] = real_title2.realTitle()
        if infoModule.info.page['title'] == False:
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log.plog('using html title', 2)
        else:
            log.plog('title from realTitle', 2)

    if infoModule.info.page['title'] == '':
        log.plog('could not find title for page. Setting to HTML Title', 4)
        infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']

    #clear html from title
    infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['title'])
    log.plog('final title: ' + infoModule.info.page['title'], 2)

    log.plog('======================================= OUTLINE ================================', 2)
    ## fetch outline
    #remove special case elements from the html.  These are lines or blocks of code that cause
    #problems if left in
    infoModule.info.page['plainText'] = strip_html.removeSpecialCases(infoModule.info.page['rawHTML'])
    infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['plainText'])
    #clearHTML can't take out title, because title gets passed to clearHTML, but it should be removed here
    infoModule.info.page['plainText'] = re.sub('<title.*?</title.*?>', '', infoModule.info.page['plainText'], 0, re.I | re.S | re.M)

    log.plog('searching for body using body extractor', 2)
    infoModule.info.site['body_extractor_no_date'] = True
    outline = body_extractor.extract(infoModule.info.page['plainText'], doAsciiConvert=False)
    # keep the full page for image/video discovery; rawHTML may be narrowed below
    infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML']
    if outline != None:
        abbreviatedHTML = html_body_extractor.html_body_extractor(infoModule.info.page['rawHTML'], outline)
        if abbreviatedHTML != None:
            infoModule.info.page['rawHTML'] = abbreviatedHTML
        infoModule.info.page['outline'] = outline
        #use largestBlock to strip leading dom elements off that seem extraneous
        infoModule.info.page['outline'] = largestBlock.removePreceedingBlocks(infoModule.info.page['imageHTML'], infoModule.info.page['outline'])
    else:
        log.plog('could not create an outline for this story!', 5)
        infoModule.info.page['outline'] = ''

    # outline must be at least minOutlineLen
    minOutlineLen = 255
    if 0 < len(infoModule.info.page['outline']) < minOutlineLen:
        log.plog('outline too short, assuming failure', 3)
        infoModule.info.page['outline'] = ''

    log.plog('======================================= IMAGES ================================', 2)
    #find images
    imageArray = find_all_images.findImages(infoModule.info.page['imageHTML'], url)
    if imageArray == None:
        log.plog('could not find image', 3)
        imageArray = ''

    log.plog('======================================= VIDEOS ================================', 2)
    ###look for videos
    allVideosJSON = find_all_videos.find_all_videos(infoModule.info.page['imageHTML'])

    allVideos = json.loads(allVideosJSON)
    if len(allVideos) > 0:
        log.plog('found video embed', 2)
        print(allVideosJSON)

    #if no outline and no images and no videos, then no story.
    # The original compared the parsed video list to '' and dropped failOn's
    # result, so this guard could never take effect.
    if infoModule.info.page['outline'] == '' and (imageArray == '' or imageArray == []) and not allVideos:
        return failOn('nothing found')

    #largest image if no outline must be at least minImageSize on each side to make it an image page
    largestImageDimensions = 0
    largestImage = []
    for image in imageArray:
        area = image['width'] * image['height']
        if area > largestImageDimensions:
            largestImage = image
            largestImageDimensions = area

    print(largestImage)
    minImageSize = 400
    if infoModule.info.page['outline'] == '' and allVideos == [] and (largestImage == [] or largestImage['width'] < minImageSize or largestImage['height'] < minImageSize):
        return(failOn('no story or video found, and largest image less than min size'))

    storyObj = {}
    storyObj['title'] = infoModule.info.page['title']
    storyObj['outline'] = unicodeMapper.clearCurlies(infoModule.info.page['outline'])
    storyObj['url'] = url
    storyObj['images'] = imageArray
    storyObj['videos'] = allVideos
    returnVal = {"status" : 'OK', "story" : storyObj}
    return json.dumps(returnVal)

Exemplo n.º 8

0

Exibir arquivo

Arquivo: scanPage.py Projeto: ctwiz/sourcereader

def scanPage():
    siteDB = infoModule.info.site['database']

    if 'url' not in infoModule.info.page:
        log.plog('scan page called without url', 4)
        os._exit(0)
        
    urlBlockerQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".urlBlocker", infoModule.info.site['dblink'])
    while True:
        urlBlocker = urlBlockerQ.fetch_row(1,1)
        if urlBlocker == ():
            break
        blockTest = re.search(urlBlocker[0]['regex'], infoModule.info.page['url'])
        if blockTest != None:
            log.plog('url ' + infoModule.info.page['url'] + " matches urlBlocker " + urlBlocker[0]['regex'], 2)
            os._exit(0)
                
    log.plog("fetching " + infoModule.info.page['url'], 2)
    try:
        socket = urllib.urlopen(infoModule.info.page['url'])
    except IOError:
        log.plog('could not open ' + infoModule.info.page['url'], 4)
        return False
    responseCode = socket.getcode()
    log.plog('urllib response code: ' + str(responseCode), 2)
    if responseCode != 200 and responseCode != 302 and responseCode != 301 and responseCode != 303:
        log.plog('got failure response code from server', 4)
        return False
    headerInfo = socket.info()
    contentType = headerInfo.gettype()
    if contentType != 'text/html' and contentType != 'text/html, text/html':
        log.plog('content type: ' + contentType + '. not fetching', 4)
        return False
    # put in to account for WSJ -dpg
    if re.search("wsj\.com", infoModule.info.page['url'], re.S | re.M | re.I):
        infoModule.info.page['rawHTML'] = wsjAuthHack(infoModule.info.page['url'])
    elif re.search("nytimes\.com", infoModule.info.page['url'], re.S | re.M | re.I):
        infoModule.info.page['rawHTML'] = nytAuthHack(infoModule.info.page['url'])
    else:
       	infoModule.info.page['rawHTML'] = socket.read()
    redirURL = socket.geturl()
    if redirURL != infoModule.info.page['url']:
        log.plog('redirected to ' + redirURL, 2)
        infoModule.info.page['url'] = redirURL
        #redirected urls need to be blocked too
        urlBlockerQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".urlBlocker", infoModule.info.site['dblink'])
        while True:
            urlBlocker = urlBlockerQ.fetch_row(1,1)
            if urlBlocker == ():
                break
            blockTest = re.search(urlBlocker[0]['regex'], infoModule.info.page['url'])
            if blockTest != None:
                log.plog('url ' + infoModule.info.page['url'] + " matches urlBlocker " + urlBlocker[0]['regex'], 2)
                os._exit(0)

        ### and short url needs to be blocked
        #do not read links that have only one string in them
        linkParts = urlparse.urlparse(infoModule.info.page['url']) 
        shortPath = re.search('^/\w+/*$', linkParts[2])
        lp = linkParts[2]
        if shortPath != None:
            log.plog("link excluded because it only has a short path of characters: %s" % linkParts[2], 2)
            os._exit(0)

    ## anything in htmlBlacklist?
    htmlBlacklistQ = mysql_tools.mysqlQuery("select regex from " + siteDB + ".htmlBlacklist", infoModule.info.site['dblink'])
    while True:
        htmlBlacklist = htmlBlacklistQ.fetch_row(1,1)
        if htmlBlacklist == ():
            break
        badSeedHTML = re.search(htmlBlacklist[0]['regex'], infoModule.info.page['rawHTML'])
        if badSeedHTML != None:
            log.plog('html matches htmlBlocker regex: ' + htmlBlacklist[0]['regex'], 3)
            os._exit(0) 
    
    ###################################
    #special case for feedburner sources
    #ernst does not like special cases
    ###################################
    infoModule.info.page['url'] = re.sub('\?.*utm_source.*$', '', infoModule.info.page['url'])
    
    #check AGAIN to see if url is already in system
    escURL = infoModule.info.page['url'].replace("'", "\\'")
    urlCheckQ = mysql_tools.mysqlQuery("select sub_id from " + siteDB + ".newsroom where url='" + escURL + "'", infoModule.info.site['dblink'])
    #don't exit, return false so that a new story can be tried
    if urlCheckQ.num_rows() > 0:
        log.plog("scanpage-url already in newsroom: %s" % infoModule.info.page['url'] , 2)
        log.plog("newsroom_id: " + str(urlCheckQ.fetch_row(1,1)))
        return False
    urlCheckQ = mysql_tools.mysqlQuery("select sub_id from " + siteDB + ".subs where url='" + escURL + "'", infoModule.info.site['dblink'])
    if urlCheckQ.num_rows() > 0:
        
        log.plog("scanpage-url already in subs: %s" % infoModule.info.page['url'], 2)
        log.plog("sub_id: " + str(urlCheckQ.fetch_row(1,1)))
        return False


    ## if source is '0', try to find source
    if infoModule.info.source['source_id'] == '0':
        sourceRegexQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".sources where url_regex != ''", infoModule.info.site['dblink'])
        while True:
            sourceRegex = sourceRegexQ.fetch_row(1,1)
            if sourceRegex == ():
                break
            urlTest = re.search(sourceRegex[0]['url_regex'], infoModule.info.page['url'])
            if urlTest != None:
                log.plog('found source via regex: ' + sourceRegex[0]['title'], 2)
                infoModule.info.source = sourceRegex[0]
                for i in infoModule.info.source.keys():
                    ## this is sort of hack-y, but stupid python returns None for null
                    if infoModule.info.source[i] == None:
                        infoModule.info.source[i] = ''

                break
    
    ## maybe check last modified header and don't get stories older than 7 days?
    '''possibleAgeInDays = dateGuesser.urlDateGuesser(infoModule.info.page['url'])
    if possibleAgeInDays != None:
        log.plog("age of story might be: " + str(possibleAgeInDays) + " based on " + infoModule.info.page['url'], 2)
        if int(possibleAgeInDays) > 5:
            log.plog("story is " + str(possibleAgeInDays) + " days old.  Not reading", 2)
            return False
'''
    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        os._exit(0)

    #add meta description into the mix
    infoModule.info.page['meta_description'] = ''
    meta_search = re.search('meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"', infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search != None:
        infoModule.info.page['meta_description'] = meta_search.group(1).decode('utf-8')
        log.plog("meta_description: " + infoModule.info.page['meta_description'], 2)
        

    log.plog('======================================= TITLE ================================', 2)
    # get title
    #set HTMLTitle first
    HTMLTitle = re.search('<title>(.*?)<\/title>', infoModule.info.page['rawHTML'], re.S | re.I)
    if HTMLTitle != None:
        infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1)
        log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2)
    else:
        infoModule.info.page['HTMLTitle'] = ""
    title = find_title.findTitle()
    if title != False:
        infoModule.info.page['title'] = title
        log.plog('title from regex', 2)
    if 'potential_title' in infoModule.info.page and len(infoModule.info.page['potential_title']) > 0:
        infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['potential_title'])
        log.plog('title from potential_title', 2)
    else:
        infoModule.info.page['title'] = real_title2.realTitle()
        if infoModule.info.page['title'] == False:
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log.plog('using html title', 2)
        else: 
            log.plog('title from realTitle', 2)

    if infoModule.info.page['title'] == '':
        log.plog('could not find title for page. Setting to HTML Title', 4)
        infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']

    #clear html from title
    infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['title'])
    #also titleCase the title
    #infoModule.info.page['title'] = infoModule.info.page['title'].title()
    log.plog('final title: ' + infoModule.info.page['title'], 2)


    log.plog('======================================= OUTLINE ================================', 2)        
    ## fetch outline
    if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
        infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures(infoModule.info.page['rawHTML'])
    else:
        infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['rawHTML'])
    
    outline = False
    #this toggle allows for ignoring regex in favor of body_extractor
    if infoModule.info.site['skipBodyRegex'] == False:
        storySearch = timeout.TimeoutFunction(find_story.findStoryViaRegex, 2)
        try:
            outline = storySearch()
            #set html block used for imaage, author and links to be what outline returns
            if outline != False:
                infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML'];
                infoModule.info.page['rawHTML'] = outline
        except TimeoutFunctionException:
            outline = False
            log.plog("ERROR regex timed out for %s" % infoModule.info.source['story_start_marker'], 5)

    #outline = find_story.findStoryViaRegex()
    if outline != False:
        if infoModule.info.page['promoter'] == '0' and infoModule.info.source['source_id'] != '0' and 'source_format' in infoModule.info.source and len(infoModule.info.source['source_format']) > 0:
            #link scoring only happens on rss feeds
            ## parse links in page only in regex block if we have regex
            log.plog('======================================= LINK SCORING ================================', 2)
            links.linkScoring(outline, 'subs')
            links.linkScoring(outline, 'newsroom')
            log.plog('======================================= OUTBOUND LINKS ================================', 2)
            #don't go more than one level deep on blind stories
            links.outboundLinks(outline)
        

        if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
            infoModule.info.page['outline'] = strip_html.clearHTMLFeatures(outline)
        else:
            infoModule.info.page['outline'] = strip_html.clearHTML(outline)
    else:
        log.plog('searching for body using body extractor', 2)
        outline = body_extractor.extract(infoModule.info.page['plainText'])
        if outline != False:
            infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML'];
            abbreviatedHTML = html_body_extractor.html_body_extractor(infoModule.info.page['rawHTML'], outline)
            if abbreviatedHTML != None:
                infoModule.info.page['rawHTML'] = abbreviatedHTML
            infoModule.info.page['outline'] = outline
        else:
            log.plog('could not create an outline for this story!', 5)
            os._exit(0)
        ## parse links in page - no regex, so look in rawHTML for links
        ## if there are widgetBlockers, first clear them from the html
        linkHTML = infoModule.info.page['rawHTML']
        widgetBlacklistQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".widgetBlacklist", infoModule.info.site['dblink'])
        while True:
            widgetBlacklist = widgetBlacklistQ.fetch_row(1,1)
            if widgetBlacklist == ():
                break
            if isinstance(linkHTML, str) == False:
                log.plog('linkHTML is not string', 5)
                os._exit(0)
            wblMatch = re.search(widgetBlacklist[0]['start_text'] + '.*?' + widgetBlacklist[0]['end_text'], linkHTML, re.S | re.I)
            if wblMatch != None:
                log.plog("found widget blacklist for " + widgetBlacklist[0]['start_text'] + '.*?' + widgetBlacklist[0]['end_text'], 2)
                linkHTML = linkHTML.replace(wblMatch.group(0), '')
                mysql_tools.mysqlQuery("update " + siteDB + ".widgetBlacklist set hits=hits+1 where widget_id=" + widgetBlacklist[0]['widget_id'], infoModule.info.site['dblink'])
                
        if infoModule.info.page['promoter'] == '0' and infoModule.info.source['source_id'] != '0' and 'source_format' in infoModule.info.source and len(infoModule.info.source['source_format']) > 0:
            #link scoring only happens on rss feeds
            log.plog('======================================= LINK SCORING ================================', 2)                
            links.linkScoring(linkHTML, 'subs')
            links.linkScoring(linkHTML, 'newsroom')
            log.plog('======================================= OUTBOUND LINKS ================================', 2)
            #don't go more than one level deep on blind stories
            links.outboundLinks(linkHTML)



    log.plog('======================================= IMAGES ================================', 2)
    #find images        
    if 'image_start_marker' in infoModule.info.source:
        image_start_marker = infoModule.info.source['image_start_marker']
    else:
        image_start_marker = ''

    if 'image_end_marker' in infoModule.info.source:
        image_end_marker = infoModule.info.source['image_end_marker']
    else:
        image_end_marker = ''
    imageArray = find_images.findImages(infoModule.info.page['imageHTML'], image_start_marker, image_end_marker)
    if imageArray == None:
        log.plog('could not find image', 3)    
    else:
        x = imageArray[0]
        y = imageArray[1]
        imageURL = imageArray[2]

        if imageURL == '':
            log.plog('could not find image', 3)
        else:
            log.plog('image found: ' + imageURL, 2)
            infoModule.info.page['largestImage'] = imageURL
            infoModule.info.page['maxSize'] = x * y

    log.plog('======================================= IMAGE CREDIT ================================', 2)
    ## image credit if any
    infoModule.info.page['imageSource'] = ''
    if 'image_source_start_marker' in infoModule.info.source and 'image_source_end_marker' in infoModule.info.source:
        imageSource = find_credit.findCredit(infoModule.info.page['rawHTML'], infoModule.info.source['image_source_start_marker'], infoModule.info.source['image_source_end_marker'])
        if imageSource != False:
            infoModule.info.page['imageSource'] = imageSource

    log.plog('======================================= VIDEOS ================================', 2)
    ###look for videos
    videoLink = find_video.findVideoEmbed(infoModule.info.page['rawHTML'])

    if videoLink == False:
        infoModule.info.page['vlink'] = ''
    else:
        log.plog('found video embed', 2)
        infoModule.info.page['vlink'] = videoLink
        vthumb = find_video.findVideoThumb(videoLink)
        if vthumb == False:
            infoModule.info.page['vthumb'] = ''
        else:
            log.plog('found video thumb', 2)
            infoModule.info.page['vthumb'] = vthumb

    log.plog('======================================= AUTHOR ================================', 2)    
    ##author in story?
    if 'author_start_marker' in infoModule.info.source and 'author_end_marker' in infoModule.info.source:
        author = find_author.findAuthor()
        if author != False:
            author = strip_html.clearHTML(author)
            infoModule.info.page['author'] = author
        else:
            infoModule.info.page['author'] = ''
    else:
        infoModule.info.page['author'] = ''
            
    log.plog('======================================= ENTITIES ================================', 2)
    #### find entities
    entities.entityFinder(infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'], True)
    nickname = False
    while nickname is False: 
        try:
            entities.nicknameFinder(infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'], True)
	    nickname = True
        except:
            pass
    ## test cityToTeam
    #cityToTeam.getRelevantEntity()

    entities.setPrimo()

    #### chop outline to 500 chars unless featured
    if 'featured_source' not in infoModule.info.source or infoModule.info.source['featured_source'] == '0':
        infoModule.info.page['outline'] = infoModule.info.page['outline'][0:500] + '...'
    
    if len(infoModule.info.entityList) < 1:
        log.plog("no entities found in story!", 5)
        os._exit(0)

    log.plog('======================================= UNKNOWN ENTITIES ================================', 2)
    ## any unknown entities?
    entityFixedString = infoModule.info.page['title'] + ' ' + infoModule.info.page['outline']
    entityFixedString = entityFixedString.replace("'s", "")
    entityFixedString = re.sub('\W+', ' ', entityFixedString)
    
    find_new_entities.find_new_entities(entityFixedString)
    ## page must have at least one non-hidden entity            
    invisibleTypesQuery = mysql_tools.mysqlQuery("select mptype_id from db_topics.mptypes where visibility='invisible'", infoModule.info.site['dblink'])
    invisibleTypes = ''
    sep = ''
    while True:
        oneType = invisibleTypesQuery.fetch_row(1,1)
        if oneType == ():
            break
        invisibleTypes = invisibleTypes + sep + oneType[0]['mptype_id']
        sep = ','

    sep = ''
    cclist = ''
    for eKey in infoModule.info.entityList.keys():
        cclist = cclist + sep + str(eKey)
        sep = ','


    sql = "select celeb_id from db_topics.celebs where celeb_id in (" + cclist + ") and mptype_id not in (" + invisibleTypes + ")"
    nonHiddenEntitiesQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
    if nonHiddenEntitiesQ.num_rows() == 0:
        log.plog('no non-hidden entities found in story!', 4)
        os._exit(0)
     
    newSubId = addStory.addStory()
    if newSubId == False:
        log.plog('addStory failed', 5)
        
    else:
        log.plog("Story added.  New sub_id: " + str(newSubId), 2)
        
    os._exit(0)

Exemplo n.º 9

0

Exibir arquivo

Arquivo: bawdy.py Projeto: dpgailey/bawdy

                    'HTMLTitle']
                log('using html title')
            else:
                log('title from realTitle')

        if infoModule.info.page['title'] == '':
            log('could not find title for page. Setting to HTML Title', 4)
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
        #clear html from title
    except Exception, e:
        log("Couldn't pass real_title2.realTitle()")

    log("ging to strip_html")

    try:
        infoModule.info.page['title'] = strip_html.clearHTML(
            infoModule.info.page['title'])
    except:
        log("couldn't pass strip_html for title")
        sys.exit(1)
    log('final title: ' + infoModule.info.page['title'])
    title = real_title2.realTitle()
    infoModule.info.page['title'] = title
    text_page = strip_html.clearHTML(full_page)
    body_text = extract(text_page)
    if body_text != None:
        body_text = body_text[0:450] + "..."
    h = HTMLParser.HTMLParser()
    try:
        be_text = h.unescape(body_text)
    except:
        be_text = body_text

Exemplo n.º 10

0

Exibir arquivo

Arquivo: regexSuggestions.py Projeto: ctwiz/sourcereader

def scanPage(step):
    """Fetch the current page and try stored story regexes until one matches.

    Downloads ``infoModule.info.page['url']``, stores the HTML in
    ``infoModule.info.page['rawHTML']`` and its stripped form in
    ``infoModule.info.page['plainText']``, then walks the
    ``story_start_marker`` / ``story_end_marker`` pairs of the ``sources``
    table, most frequently used first, beginning at offset ``step``.  Each
    pair is written into ``infoModule.info.source`` and handed to
    ``find_story.findStoryViaRegex`` (wrapped in a 2 second timeout).

    On the first pair that yields an outline, prints the step counter (which
    is one past the offset of the matching pair), both markers with ``<``
    escaped, and the cleaned outline with non-ASCII characters written as XML
    character references.  If the candidates run out, prints ``no match``.

    step -- zero-based offset into the popularity-ordered list of marker pairs.

    Calls ``sys.exit()`` when no url is set or when the page is longer than
    500000 characters.
    """
    if 'url' not in infoModule.info.page:
        log.plog('scan page called without url', 4)
        sys.exit()

    log.plog("fetching " + infoModule.info.page['url'], 2)
    response = urllib.urlopen(infoModule.info.page['url'])
    try:
        infoModule.info.page['rawHTML'] = response.read()
        redirURL = response.geturl()
    finally:
        # always release the connection, even when the read fails
        response.close()
    if redirURL != infoModule.info.page['url']:
        log.plog('redirected to ' + redirURL, 2)
        infoModule.info.page['url'] = redirURL

    ## maybe check last modified header and don't get stories older than 7 days?

    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        sys.exit()

    print(infoModule.info.page['url'])

    featured = ('featured_source' in infoModule.info.source
                and infoModule.info.source['featured_source'] == '1')

    ## fetch outline
    if featured:
        infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures(infoModule.info.page['rawHTML'])
    else:
        infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['rawHTML'])

    outline = False
    while True:
        #pick out most popular regex
        sql = "select count(*) as common, story_start_marker, story_end_marker from " + siteDB + ".sources where story_start_marker != '' group by story_start_marker order by count(*) desc limit %d,1" % step
        regexQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
        # stop when the query failed or there are no more candidates
        if regexQ == False or regexQ.num_rows() == 0:
            break
        regex = regexQ.fetch_row(1,1)
        # markers are stored with escaped slashes ("\/"); undo that before use
        infoModule.info.source['story_start_marker'] = regex[0]['story_start_marker'].replace('\\/', '/')
        infoModule.info.source['story_end_marker'] = regex[0]['story_end_marker'].replace('\\/', '/')
        storySearch = timeout.TimeoutFunction(find_story.findStoryViaRegex, 2)
        try:
            outline = storySearch()
        except Exception:
            # a timeout or a broken regex only disqualifies this candidate;
            # SystemExit / KeyboardInterrupt are no longer swallowed here
            outline = False

        # the counter advances even on a hit, so the value printed below is
        # the offset to resume from, not the offset that matched
        step += 1
        if outline != False:
            break

    if outline != False:
        startMarker = infoModule.info.source['story_start_marker'].replace('<', '&lt;')
        endMarker = infoModule.info.source['story_end_marker'].replace('<', '&lt;')
        if featured:
            infoModule.info.page['outline'] = strip_html.clearHTMLFeatures(outline)
        else:
            infoModule.info.page['outline'] = strip_html.clearHTML(outline)
        infoModule.info.page['outline'] = infoModule.info.page['outline'].decode('utf-8')
        infoModule.info.page['outline'] = infoModule.info.page['outline'].encode('ascii', 'xmlcharrefreplace')
        print(str(step))
        print(startMarker)
        print(endMarker)
        print(infoModule.info.page['outline'])
    else:
        print("no match")

Exemplo n.º 11

0

Exibir arquivo

Arquivo: real_title2.py Projeto: ctwiz/sourcereader

def check_rt(title, html):
    """Return the longest run of consecutive title words found on one page line.

    ``title`` and ``html`` are both passed through ``clearHTML``.  The cleaned
    page is split into lines, and for every line the longest sequence of
    consecutive whitespace-separated title words that occurs verbatim in that
    line is recorded.  The phrase belonging to the line with the highest word
    count is returned; when several lines tie, the last of them wins.  Within
    a line, the first phrase reaching the highest count is kept.

    title -- candidate title text (may contain markup).
    html  -- page markup to search.

    Debug output (wrapped in ``<br>`` tags) is printed along the way.
    """
    print("<br><br>title I'm getting before clearHTML : " + title + "<br><br><br>")
    #lines = clearCurlies(clearHTML(html))
    # strip the page once; fall back to the undecoded text when it cannot be
    # decoded as UTF-8 (or is not a byte string at all)
    cleaned = clearHTML(html)
    try:
        lines = cleaned.decode('utf-8')
    except (UnicodeError, AttributeError):
        lines = cleaned
    title = clearHTML(title)
    lines = re.sub("\n\n+", '\n', lines)
    lines = re.split("\n+", lines)
    print("<br><br>title I'm being passed after clearHTML : " + title + "<br><br>")
    matches = []
    title_array = re.split(r"\s+", title)
    title_len = len(title_array)
    for k, line in enumerate(lines):
        print(title)
        print(str(k) + " " + line)
        # best phrase found in this line and how many words it spans
        phrase = ''
        word_match_count = 0
        # title_pointer marks the first word of the candidate phrase,
        # title_inc the last word currently included in it
        title_pointer = 0
        title_inc = 0
        while title_pointer < title_len:
            # the candidate phrase is rebuilt from scratch for every start word
            words_to_match = ''
            while title_inc < title_len:
                # grow the phrase by one word; drop the trailing space to test
                words_to_match += title_array[title_inc] + " "
                match_this = words_to_match[:-1]
                try:
                    match_this = match_this.decode('utf-8')
                except (UnicodeError, AttributeError):
                    pass
                # mixing encoded and decoded text can fail; count as no match
                try:
                    cluster_match = line.find(match_this)
                except (UnicodeError, TypeError):
                    cluster_match = -1
                if cluster_match < 0:
                    break
                # only a strictly longer phrase replaces the current best
                span = title_inc - title_pointer + 1
                if word_match_count < span:
                    word_match_count = span
                    phrase = match_this
                # it matched, so try again with one more word appended
                title_inc += 1
            # the phrase reached the end of the title: nothing longer exists
            if title_inc == title_len:
                break
            # otherwise restart the search from the next title word
            title_pointer += 1
            title_inc = title_pointer
        # record (line number, match count, phrase) for this line
        matches.append((k, word_match_count, phrase))
    # highest word count wins, ties go to the highest line number; picking the
    # maximum once replaces re-sorting the whole list after every line
    return max(matches, key=itemgetter(1, 0))[2]

Exemplo n.º 12

0

Exibir arquivo

Arquivo: bawdy.py Projeto: ctwiz/bawdy

                infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
                log('using html title')
            else:
                log('title from realTitle')

        if infoModule.info.page['title'] == '':
            log('could not find title for page. Setting to HTML Title', 4)
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
        #clear html from title
    except Exception, e:
        log("Couldn't pass real_title2.realTitle()")

    log("ging to strip_html")

    try:
        infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['title'])
    except:
        log("couldn't pass strip_html for title")
        sys.exit(1)
    log('final title: ' + infoModule.info.page['title'])
    title = real_title2.realTitle()
    infoModule.info.page['title'] = title
    text_page = strip_html.clearHTML(full_page)
    body_text = extract(text_page)
    if body_text != None:
        body_text = body_text[0:450]+"..."
    h = HTMLParser.HTMLParser()
    try:
        be_text = h.unescape(body_text)
    except: 
        be_text = body_text

Exemplo n.º 13

0

Exibir arquivo

Arquivo: regexTester_feed.py Projeto: ctwiz/sourcereader

def scanPage():
    """Fetch the current page and print what the source's regexes extract.

    Downloads ``infoModule.info.page['url']`` into
    ``infoModule.info.page['rawHTML']`` and prints, as HTML fragments, the
    (possibly redirected) URL, the image credit, the author and the story
    outline, each only when the corresponding markers are configured on
    ``infoModule.info.source`` and the finder returns a result.  The values
    found are also stored on ``infoModule.info.page`` under ``imageSource``,
    ``author``, ``plainText`` and ``outline``.

    Title, image, video and link-scoring steps are not run by this function.

    Calls ``sys.exit()`` when no url is set or when the page is longer than
    500000 characters.
    """
    if 'url' not in infoModule.info.page:
        log.plog('scan page called without url', 4)
        sys.exit()

    log.plog("fetching " + infoModule.info.page['url'], 2)
    response = urllib.urlopen(infoModule.info.page['url'])
    try:
        infoModule.info.page['rawHTML'] = response.read()
        redirURL = response.geturl()
    finally:
        # always release the connection, even when the read fails
        response.close()
    if redirURL != infoModule.info.page['url']:
        log.plog('redirected to ' + redirURL, 2)
        infoModule.info.page['url'] = redirURL

    ## maybe check last modified header and don't get stories older than 7 days?

    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        sys.exit()

    print("<b>URL</b> <a href=\"" + infoModule.info.page['url'] + "\">" + infoModule.info.page['url'] + "</a><br />")

    ## image credit if any
    infoModule.info.page['imageSource'] = ''
    if 'image_source_start_marker' in infoModule.info.source and 'image_source_end_marker' in infoModule.info.source:
        imageSource = find_credit.findCredit(infoModule.info.page['rawHTML'], infoModule.info.source['image_source_start_marker'], infoModule.info.source['image_source_end_marker'])
        if imageSource != False and imageSource is not None:
            infoModule.info.page['imageSource'] = imageSource
            print("<b>Image Credit:</b> " + imageSource + "<br />")

    ##author in story?
    infoModule.info.page['author'] = ''
    if 'author_start_marker' in infoModule.info.source and 'author_end_marker' in infoModule.info.source:
        author = find_author.findAuthor()
        if author != False:
            # keep markup in the author text from being rendered by the browser
            author = author.replace('<', '&lt;')
            infoModule.info.page['author'] = author
            print("<b>Author:</b> " + author + "<br />")

    featured = ('featured_source' in infoModule.info.source
                and infoModule.info.source['featured_source'] == '1')

    ## fetch outline
    if featured:
        infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures(infoModule.info.page['rawHTML'])
    else:
        infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['rawHTML'])

    outline = find_story.findStoryViaRegex()
    if outline != False:
        if featured:
            infoModule.info.page['outline'] = strip_html.clearHTMLFeatures(outline)
        else:
            infoModule.info.page['outline'] = strip_html.clearHTML(outline)
        print("<b>Outline:</b> " + infoModule.info.page['outline'] + "<br />")

    print("<hr>")

Exemplo n.º 14

0

Exibir arquivo

Arquivo: scanPage.py Projeto: dpgailey/sourcereader

def scanPage():
    siteDB = infoModule.info.site['database']

    if 'url' not in infoModule.info.page:
        log.plog('scan page called without url', 4)
        os._exit(0)

    urlBlockerQ = mysql_tools.mysqlQuery(
        "select * from " + siteDB + ".urlBlocker",
        infoModule.info.site['dblink'])
    while True:
        urlBlocker = urlBlockerQ.fetch_row(1, 1)
        if urlBlocker == ():
            break
        blockTest = re.search(urlBlocker[0]['regex'],
                              infoModule.info.page['url'])
        if blockTest != None:
            log.plog(
                'url ' + infoModule.info.page['url'] + " matches urlBlocker " +
                urlBlocker[0]['regex'], 2)
            os._exit(0)

    log.plog("fetching " + infoModule.info.page['url'], 2)
    try:
        socket = urllib.urlopen(infoModule.info.page['url'])
    except IOError:
        log.plog('could not open ' + infoModule.info.page['url'], 4)
        return False
    responseCode = socket.getcode()
    log.plog('urllib response code: ' + str(responseCode), 2)
    if responseCode != 200 and responseCode != 302 and responseCode != 301 and responseCode != 303:
        log.plog('got failure response code from server', 4)
        return False
    headerInfo = socket.info()
    contentType = headerInfo.gettype()
    if contentType != 'text/html' and contentType != 'text/html, text/html':
        log.plog('content type: ' + contentType + '. not fetching', 4)
        return False
    # put in to account for WSJ -dpg
    if re.search("wsj\.com", infoModule.info.page['url'], re.S | re.M | re.I):
        infoModule.info.page['rawHTML'] = wsjAuthHack(
            infoModule.info.page['url'])
    elif re.search("nytimes\.com", infoModule.info.page['url'],
                   re.S | re.M | re.I):
        infoModule.info.page['rawHTML'] = nytAuthHack(
            infoModule.info.page['url'])
    else:
        infoModule.info.page['rawHTML'] = socket.read()
    redirURL = socket.geturl()
    if redirURL != infoModule.info.page['url']:
        log.plog('redirected to ' + redirURL, 2)
        infoModule.info.page['url'] = redirURL
        #redirected urls need to be blocked too
        urlBlockerQ = mysql_tools.mysqlQuery(
            "select * from " + siteDB + ".urlBlocker",
            infoModule.info.site['dblink'])
        while True:
            urlBlocker = urlBlockerQ.fetch_row(1, 1)
            if urlBlocker == ():
                break
            blockTest = re.search(urlBlocker[0]['regex'],
                                  infoModule.info.page['url'])
            if blockTest != None:
                log.plog(
                    'url ' + infoModule.info.page['url'] +
                    " matches urlBlocker " + urlBlocker[0]['regex'], 2)
                os._exit(0)

        ### and short url needs to be blocked
        #do not read links that have only one string in them
        linkParts = urlparse.urlparse(infoModule.info.page['url'])
        shortPath = re.search('^/\w+/*$', linkParts[2])
        lp = linkParts[2]
        if shortPath != None:
            log.plog(
                "link excluded because it only has a short path of characters: %s"
                % linkParts[2], 2)
            os._exit(0)

    ## anything in htmlBlacklist?
    htmlBlacklistQ = mysql_tools.mysqlQuery(
        "select regex from " + siteDB + ".htmlBlacklist",
        infoModule.info.site['dblink'])
    while True:
        htmlBlacklist = htmlBlacklistQ.fetch_row(1, 1)
        if htmlBlacklist == ():
            break
        badSeedHTML = re.search(htmlBlacklist[0]['regex'],
                                infoModule.info.page['rawHTML'])
        if badSeedHTML != None:
            log.plog(
                'html matches htmlBlocker regex: ' + htmlBlacklist[0]['regex'],
                3)
            os._exit(0)

    ###################################
    #special case for feedburner sources
    #ernst does not like special cases
    ###################################
    infoModule.info.page['url'] = re.sub('\?.*utm_source.*$', '',
                                         infoModule.info.page['url'])

    #check AGAIN to see if url is already in system
    escURL = infoModule.info.page['url'].replace("'", "\\'")
    urlCheckQ = mysql_tools.mysqlQuery(
        "select sub_id from " + siteDB + ".newsroom where url='" + escURL +
        "'", infoModule.info.site['dblink'])
    #don't exit, return false so that a new story can be tried
    if urlCheckQ.num_rows() > 0:
        log.plog(
            "scanpage-url already in newsroom: %s" %
            infoModule.info.page['url'], 2)
        log.plog("newsroom_id: " + str(urlCheckQ.fetch_row(1, 1)))
        return False
    urlCheckQ = mysql_tools.mysqlQuery(
        "select sub_id from " + siteDB + ".subs where url='" + escURL + "'",
        infoModule.info.site['dblink'])
    if urlCheckQ.num_rows() > 0:

        log.plog(
            "scanpage-url already in subs: %s" % infoModule.info.page['url'],
            2)
        log.plog("sub_id: " + str(urlCheckQ.fetch_row(1, 1)))
        return False

    ## if source is '0', try to find source
    if infoModule.info.source['source_id'] == '0':
        sourceRegexQ = mysql_tools.mysqlQuery(
            "select * from " + siteDB + ".sources where url_regex != ''",
            infoModule.info.site['dblink'])
        while True:
            sourceRegex = sourceRegexQ.fetch_row(1, 1)
            if sourceRegex == ():
                break
            urlTest = re.search(sourceRegex[0]['url_regex'],
                                infoModule.info.page['url'])
            if urlTest != None:
                log.plog('found source via regex: ' + sourceRegex[0]['title'],
                         2)
                infoModule.info.source = sourceRegex[0]
                for i in infoModule.info.source.keys():
                    ## this is sort of hack-y, but stupid python returns None for null
                    if infoModule.info.source[i] == None:
                        infoModule.info.source[i] = ''

                break

    ## maybe check last modified header and don't get stories older than 7 days?
    '''possibleAgeInDays = dateGuesser.urlDateGuesser(infoModule.info.page['url'])
    if possibleAgeInDays != None:
        log.plog("age of story might be: " + str(possibleAgeInDays) + " based on " + infoModule.info.page['url'], 2)
        if int(possibleAgeInDays) > 5:
            log.plog("story is " + str(possibleAgeInDays) + " days old.  Not reading", 2)
            return False
'''
    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        os._exit(0)

    #add meta description into the mix
    infoModule.info.page['meta_description'] = ''
    meta_search = re.search(
        'meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"',
        infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search != None:
        infoModule.info.page['meta_description'] = meta_search.group(1).decode(
            'utf-8')
        log.plog(
            "meta_description: " + infoModule.info.page['meta_description'], 2)

    log.plog(
        '======================================= TITLE ================================',
        2)
    # get title
    #set HTMLTitle first
    HTMLTitle = re.search('<title>(.*?)<\/title>',
                          infoModule.info.page['rawHTML'], re.S | re.I)
    if HTMLTitle != None:
        infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1)
        log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2)
    else:
        infoModule.info.page['HTMLTitle'] = ""
    title = find_title.findTitle()
    if title != False:
        infoModule.info.page['title'] = title
        log.plog('title from regex', 2)
    if 'potential_title' in infoModule.info.page and len(
            infoModule.info.page['potential_title']) > 0:
        infoModule.info.page['title'] = strip_html.clearHTML(
            infoModule.info.page['potential_title'])
        log.plog('title from potential_title', 2)
    else:
        infoModule.info.page['title'] = real_title2.realTitle()
        if infoModule.info.page['title'] == False:
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log.plog('using html title', 2)
        else:
            log.plog('title from realTitle', 2)

    if infoModule.info.page['title'] == '':
        log.plog('could not find title for page. Setting to HTML Title', 4)
        infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']

    #clear html from title
    infoModule.info.page['title'] = strip_html.clearHTML(
        infoModule.info.page['title'])
    #also titleCase the title
    #infoModule.info.page['title'] = infoModule.info.page['title'].title()
    log.plog('final title: ' + infoModule.info.page['title'], 2)

    log.plog(
        '======================================= OUTLINE ================================',
        2)
    ## fetch outline
    if 'featured_source' in infoModule.info.source and infoModule.info.source[
            'featured_source'] == '1':
        infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures(
            infoModule.info.page['rawHTML'])
    else:
        infoModule.info.page['plainText'] = strip_html.clearHTML(
            infoModule.info.page['rawHTML'])

    outline = False
    #this toggle allows for ignoring regex in favor of body_extractor
    if infoModule.info.site['skipBodyRegex'] == False:
        storySearch = timeout.TimeoutFunction(find_story.findStoryViaRegex, 2)
        try:
            outline = storySearch()
            #set html block used for imaage, author and links to be what outline returns
            if outline != False:
                infoModule.info.page['imageHTML'] = infoModule.info.page[
                    'rawHTML']
                infoModule.info.page['rawHTML'] = outline
        except TimeoutFunctionException:
            outline = False
            log.plog(
                "ERROR regex timed out for %s" %
                infoModule.info.source['story_start_marker'], 5)

    #outline = find_story.findStoryViaRegex()
    if outline != False:
        if infoModule.info.page['promoter'] == '0' and infoModule.info.source[
                'source_id'] != '0' and 'source_format' in infoModule.info.source and len(
                    infoModule.info.source['source_format']) > 0:
            #link scoring only happens on rss feeds
            ## parse links in page only in regex block if we have regex
            log.plog(
                '======================================= LINK SCORING ================================',
                2)
            links.linkScoring(outline, 'subs')
            links.linkScoring(outline, 'newsroom')
            log.plog(
                '======================================= OUTBOUND LINKS ================================',
                2)
            #don't go more than one level deep on blind stories
            links.outboundLinks(outline)

        if 'featured_source' in infoModule.info.source and infoModule.info.source[
                'featured_source'] == '1':
            infoModule.info.page['outline'] = strip_html.clearHTMLFeatures(
                outline)
        else:
            infoModule.info.page['outline'] = strip_html.clearHTML(outline)
    else:
        log.plog('searching for body using body extractor', 2)
        outline = body_extractor.extract(infoModule.info.page['plainText'])
        if outline != False:
            infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML']
            abbreviatedHTML = html_body_extractor.html_body_extractor(
                infoModule.info.page['rawHTML'], outline)
            if abbreviatedHTML != None:
                infoModule.info.page['rawHTML'] = abbreviatedHTML
            infoModule.info.page['outline'] = outline
        else:
            log.plog('could not create an outline for this story!', 5)
            os._exit(0)
        ## parse links in page - no regex, so look in rawHTML for links
        ## if there are widgetBlockers, first clear them from the html
        linkHTML = infoModule.info.page['rawHTML']
        widgetBlacklistQ = mysql_tools.mysqlQuery(
            "select * from " + siteDB + ".widgetBlacklist",
            infoModule.info.site['dblink'])
        while True:
            widgetBlacklist = widgetBlacklistQ.fetch_row(1, 1)
            if widgetBlacklist == ():
                break
            if isinstance(linkHTML, str) == False:
                log.plog('linkHTML is not string', 5)
                os._exit(0)
            wblMatch = re.search(
                widgetBlacklist[0]['start_text'] + '.*?' +
                widgetBlacklist[0]['end_text'], linkHTML, re.S | re.I)
            if wblMatch != None:
                log.plog(
                    "found widget blacklist for " +
                    widgetBlacklist[0]['start_text'] + '.*?' +
                    widgetBlacklist[0]['end_text'], 2)
                linkHTML = linkHTML.replace(wblMatch.group(0), '')
                mysql_tools.mysqlQuery(
                    "update " + siteDB +
                    ".widgetBlacklist set hits=hits+1 where widget_id=" +
                    widgetBlacklist[0]['widget_id'],
                    infoModule.info.site['dblink'])

        if infoModule.info.page['promoter'] == '0' and infoModule.info.source[
                'source_id'] != '0' and 'source_format' in infoModule.info.source and len(
                    infoModule.info.source['source_format']) > 0:
            #link scoring only happens on rss feeds
            log.plog(
                '======================================= LINK SCORING ================================',
                2)
            links.linkScoring(linkHTML, 'subs')
            links.linkScoring(linkHTML, 'newsroom')
            log.plog(
                '======================================= OUTBOUND LINKS ================================',
                2)
            #don't go more than one level deep on blind stories
            links.outboundLinks(linkHTML)

    log.plog(
        '======================================= IMAGES ================================',
        2)
    #find images
    if 'image_start_marker' in infoModule.info.source:
        image_start_marker = infoModule.info.source['image_start_marker']
    else:
        image_start_marker = ''

    if 'image_end_marker' in infoModule.info.source:
        image_end_marker = infoModule.info.source['image_end_marker']
    else:
        image_end_marker = ''
    imageArray = find_images.findImages(infoModule.info.page['imageHTML'],
                                        image_start_marker, image_end_marker)
    if imageArray == None:
        log.plog('could not find image', 3)
    else:
        x = imageArray[0]
        y = imageArray[1]
        imageURL = imageArray[2]

        if imageURL == '':
            log.plog('could not find image', 3)
        else:
            log.plog('image found: ' + imageURL, 2)
            infoModule.info.page['largestImage'] = imageURL
            infoModule.info.page['maxSize'] = x * y

    log.plog(
        '======================================= IMAGE CREDIT ================================',
        2)
    ## image credit if any
    infoModule.info.page['imageSource'] = ''
    if 'image_source_start_marker' in infoModule.info.source and 'image_source_end_marker' in infoModule.info.source:
        imageSource = find_credit.findCredit(
            infoModule.info.page['rawHTML'],
            infoModule.info.source['image_source_start_marker'],
            infoModule.info.source['image_source_end_marker'])
        if imageSource != False:
            infoModule.info.page['imageSource'] = imageSource

    log.plog(
        '======================================= VIDEOS ================================',
        2)
    ###look for videos
    videoLink = find_video.findVideoEmbed(infoModule.info.page['rawHTML'])

    if videoLink == False:
        infoModule.info.page['vlink'] = ''
    else:
        log.plog('found video embed', 2)
        infoModule.info.page['vlink'] = videoLink
        vthumb = find_video.findVideoThumb(videoLink)
        if vthumb == False:
            infoModule.info.page['vthumb'] = ''
        else:
            log.plog('found video thumb', 2)
            infoModule.info.page['vthumb'] = vthumb

    log.plog(
        '======================================= AUTHOR ================================',
        2)
    ##author in story?
    if 'author_start_marker' in infoModule.info.source and 'author_end_marker' in infoModule.info.source:
        author = find_author.findAuthor()
        if author != False:
            author = strip_html.clearHTML(author)
            infoModule.info.page['author'] = author
        else:
            infoModule.info.page['author'] = ''
    else:
        infoModule.info.page['author'] = ''

    log.plog(
        '======================================= ENTITIES ================================',
        2)
    #### find entities
    entities.entityFinder(
        infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'],
        True)
    nickname = False
    while nickname is False:
        try:
            entities.nicknameFinder(
                infoModule.info.page['title'] + ' ' +
                infoModule.info.page['outline'], True)
            nickname = True
        except:
            pass
    ## test cityToTeam
    #cityToTeam.getRelevantEntity()

    entities.setPrimo()

    #### chop outline to 500 chars unless featured
    if 'featured_source' not in infoModule.info.source or infoModule.info.source[
            'featured_source'] == '0':
        infoModule.info.page[
            'outline'] = infoModule.info.page['outline'][0:500] + '...'

    if len(infoModule.info.entityList) < 1:
        log.plog("no entities found in story!", 5)
        os._exit(0)

    log.plog(
        '======================================= UNKNOWN ENTITIES ================================',
        2)
    ## any unknown entities?
    entityFixedString = infoModule.info.page[
        'title'] + ' ' + infoModule.info.page['outline']
    entityFixedString = entityFixedString.replace("'s", "")
    entityFixedString = re.sub('\W+', ' ', entityFixedString)

    find_new_entities.find_new_entities(entityFixedString)
    ## page must have at least one non-hidden entity
    invisibleTypesQuery = mysql_tools.mysqlQuery(
        "select mptype_id from db_topics.mptypes where visibility='invisible'",
        infoModule.info.site['dblink'])
    invisibleTypes = ''
    sep = ''
    while True:
        oneType = invisibleTypesQuery.fetch_row(1, 1)
        if oneType == ():
            break
        invisibleTypes = invisibleTypes + sep + oneType[0]['mptype_id']
        sep = ','

    sep = ''
    cclist = ''
    for eKey in infoModule.info.entityList.keys():
        cclist = cclist + sep + str(eKey)
        sep = ','

    sql = "select celeb_id from db_topics.celebs where celeb_id in (" + cclist + ") and mptype_id not in (" + invisibleTypes + ")"
    nonHiddenEntitiesQ = mysql_tools.mysqlQuery(sql,
                                                infoModule.info.site['dblink'])
    if nonHiddenEntitiesQ.num_rows() == 0:
        log.plog('no non-hidden entities found in story!', 4)
        os._exit(0)

    newSubId = addStory.addStory()
    if newSubId == False:
        log.plog('addStory failed', 5)

    else:
        log.plog("Story added.  New sub_id: " + str(newSubId), 2)

    os._exit(0)

Exemplo n.º 15

0

Exibir arquivo

def scanPage(step):
    """Try the most common story regexes against the current page.

    Fetches ``infoModule.info.page['url']``, stores the body in
    ``infoModule.info.page['rawHTML']`` and a tag-stripped copy in
    ``infoModule.info.page['plainText']``.  It then walks the
    ``story_start_marker`` / ``story_end_marker`` pairs stored in the
    ``sources`` table, ordered from most to least frequently used, starting at
    offset ``step``, until ``find_story.findStoryViaRegex`` returns something
    other than ``False`` or the query yields no more rows.

    On a hit it prints the next offset to try (``step`` has already been
    advanced past the matching row), the HTML-escaped markers and the cleaned
    outline; otherwise it prints ``no match``.

    The process exits via ``sys.exit()`` when no url is set or when the
    fetched document is longer than 500000 characters.
    """
    if 'url' not in infoModule.info.page:
        log.plog('scan page called without url', 4)
        sys.exit()

    log.plog("fetching " + infoModule.info.page['url'], 2)
    response = urllib.urlopen(infoModule.info.page['url'])
    try:
        infoModule.info.page['rawHTML'] = response.read()
        redirURL = response.geturl()
    finally:
        # always release the connection, even if the read fails
        response.close()
    if redirURL != infoModule.info.page['url']:
        log.plog('redirected to ' + redirURL, 2)
        infoModule.info.page['url'] = redirURL

    ## maybe check last modified header and don't get stories older than 7 days?

    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        sys.exit()

    print(infoModule.info.page['url'])

    featured = ('featured_source' in infoModule.info.source
                and infoModule.info.source['featured_source'] == '1')

    ## fetch outline
    if featured:
        infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures(
            infoModule.info.page['rawHTML'])
    else:
        infoModule.info.page['plainText'] = strip_html.clearHTML(
            infoModule.info.page['rawHTML'])

    outline = False
    while True:
        # pick out the next most popular regex
        sql = ("select count(*) as common, story_start_marker, story_end_marker from "
               + siteDB +
               ".sources where story_start_marker != '' group by story_start_marker order by count(*) desc limit %d,1"
               % step)
        regexQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
        if regexQ == False or regexQ.num_rows() == 0:
            break
        regex = regexQ.fetch_row(1, 1)
        # markers are stored with escaped slashes; un-escape them before use
        infoModule.info.source['story_start_marker'] = regex[0][
            'story_start_marker'].replace('\\/', '/')
        infoModule.info.source['story_end_marker'] = regex[0][
            'story_end_marker'].replace('\\/', '/')
        storySearch = timeout.TimeoutFunction(find_story.findStoryViaRegex, 2)
        try:
            outline = storySearch()
        except Exception:
            # A timeout or a broken pattern just means "this regex did not
            # match"; KeyboardInterrupt / SystemExit are deliberately not
            # swallowed here.
            outline = False

        # step is advanced even on a hit, so the value printed below is the
        # offset to resume from, not the offset that matched
        step += 1
        if outline != False:
            break

    if outline != False:
        startMarker = infoModule.info.source['story_start_marker'].replace(
            '<', '&lt;')
        endMarker = infoModule.info.source['story_end_marker'].replace(
            '<', '&lt;')
        if featured:
            cleaned = strip_html.clearHTMLFeatures(outline)
        else:
            cleaned = strip_html.clearHTML(outline)
        # turn every non-ascii character into an XML character reference
        infoModule.info.page['outline'] = cleaned.decode('utf-8').encode(
            'ascii', 'xmlcharrefreplace')
        print(str(step))
        print(startMarker)
        print(endMarker)
        print(infoModule.info.page['outline'])
    else:
        print("no match")

Exemplo n.º 16

0

Exibir arquivo

    row = cr.fetch_row(1, 1)
    if row == ():
        break
    print(row)
    url = row[0]['url']
    infoModule.info.page['url'] = url
    infoModule.info.page['title'] = 'whatevs'
    log.plog('testing clearHTML', 1)
    print 'Attempting URL: ' + url + "\r\n"
    urlSocket = urllib.urlopen(url)
    html = urlSocket.read()

    if (len(sys.argv) > 1 and sys.argv[1]) == 'features':
        html = strip_html.clearHTMLFeatures(html)
    else:
        html = strip_html.clearHTML(html)
    find_new_entities.find_new_entities(html)
    print '========================================================='

# blockedWords test: repeatedly fetch one fixed page, strip its markup,
# echo the text and feed it to the new-entity finder.  Loops forever.
while True:
    url = 'http://news.ycombinator.com/item?id=2092273'
    infoModule.info.page['title'] = 'whatevs'
    infoModule.info.page['url'] = url
    print('Attempting URL: ' + url + "\r\n")
    urlSocket = urllib.urlopen(url)
    html = strip_html.clearHTML(urlSocket.read())
    print(html)

    find_new_entities.find_new_entities(html)

Exemplo n.º 17

0

Exibir arquivo

Arquivo: body_extractor_ut.py Projeto: dpgailey/sourcereader

        return False
        
    retval = websock.read()
    return retval
    

if __name__ == '__main__':
    # With no URL on the command line, run the unit tests; otherwise fetch
    # the given URL and print the title, meta description and outline.
    if len(sys.argv) <= 1:
        unittest.main()
    else:
        url = sys.argv[1]

        infoModule.info.site['body_extractor_no_date'] = True
        infoModule.info.page['rawHTML'] = fetchPage(url)
        htmlTitle()
        infoModule.info.page['title'] = real_title2.realTitle()
        print(infoModule.info.page['title'])

        infoModule.info.page['plainText'] = strip_html.clearHTML(
            infoModule.info.page['rawHTML'])
        infoModule.info.site['body_extractor_no_date'] = True
        infoModule.info.page['meta_description'] = ''
        # capture the first four whitespace-separated chunks of the meta description
        meta_search = re.search(
            'meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"',
            infoModule.info.page['rawHTML'],
            re.I | re.S)
        if meta_search is not None:
            infoModule.info.page['meta_description'] = meta_search.group(1)
            print("meta_description: " +
                  infoModule.info.page['meta_description'])

        outline = unicodeMapper.clearCurlies(
            extract(infoModule.info.page['plainText'], doAsciiConvert=False))

        print(outline)

Exemplo n.º 18

0

Exibir arquivo

def fetchStory(url):
    """Fetch *url* and return a JSON string describing the story found there.

    The page is downloaded with a browser-like User-agent, decoded using the
    encoding reported by ``chardet`` (falling back to windows-1252 when
    tell-tale windows-1252 control characters show up after decoding), and
    then mined for a title, an outline (body text), images and video embeds.
    Intermediate results are stored in ``infoModule.info.page`` under the keys
    ``url``, ``rawHTML``, ``meta_description``, ``HTMLTitle``, ``title``,
    ``plainText``, ``imageHTML`` and ``outline``.

    Args:
        url: address of the page to fetch.

    Returns:
        On success, ``json.dumps`` of
        ``{"status": "OK", "story": {title, outline, url, images, videos}}``
        where ``url`` is the final address after redirects.  On any failure
        (cannot open, bad response code, non-HTML content type, read timeout,
        oversized document, nothing usable found) the return value of
        ``failOn(<reason>)``.
    """
    infoModule.info.page['url'] = url
    log.plog("fetching " + url, 2)
    request_obj = urllib2.Request(url)
    request_obj.add_header('Referer', 'http://www.google.com/')
    request_obj.add_header(
        'User-agent',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)'
    )
    try:
        websock = urllib2.urlopen(request_obj)
    except IOError:
        log.plog('could not open ' + url, 4)
        return failOn('could not open ' + url)

    # everything that touches the open response lives in this try so the
    # connection is closed on every exit path, including the early returns
    try:
        responseCode = websock.getcode()
        headerInfo = websock.info()
        pprint.pprint(headerInfo)
        log.plog('urllib2 response code: ' + str(responseCode), 2)
        if responseCode not in (200, 301, 302, 303):
            log.plog('got failure response code from server', 4)
            return failOn('got failure response code from server')
        contentType = headerInfo.gettype()
        if contentType not in ('text/html', 'text/html, text/html'):
            log.plog('content type: ' + contentType + '. not fetching', 4)
            return failOn('content type: ' + contentType + '. not fetching')

        # give the server 5 seconds to deliver the body
        readWithTimeout = timeout.TimeoutFunction(websock.read, 5)
        try:
            infoModule.info.page['rawHTML'] = readWithTimeout()
        except timeout.TimeoutFunctionException:
            log.plog("timeout while trying to fetch " + url, 101)
            return failOn('read timeout ' + url)
        redirURL = websock.geturl()
    finally:
        websock.close()

    if redirURL != url:
        log.plog('redirected to ' + redirURL, 2)
        url = redirURL
        #redirected urls need to be blocked too

    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        return failOn('article length exceeds 500k, probably not html')

    # C1 control characters that only appear when windows-1252 text has been
    # decoded with the wrong charset (curly quotes / dashes)
    windows_trouble_list = [u'\x93', u'\x92', u'\x91', u'\x96', u'\x94']
    cd = chardet.detect(infoModule.info.page['rawHTML'])
    encoding = cd['encoding']
    if encoding is None:
        # chardet could not make a guess; leave the bytes undecoded rather
        # than crash on the concatenation / decode below
        log.plog('could not detect server encoding', 3)
    elif encoding != 'ascii':
        log.plog('Server encoding: ' + encoding, 2)
        oldHTML = infoModule.info.page['rawHTML']
        infoModule.info.page['rawHTML'] = oldHTML.decode(encoding)
        if any(trouble in infoModule.info.page['rawHTML']
               for trouble in windows_trouble_list):
            log.plog('this is actually windows-1252', 3)
            infoModule.info.page['rawHTML'] = oldHTML.decode('windows-1252')

    # some configuration options
    infoModule.info.page['meta_description'] = ''
    # capture the first four whitespace-separated chunks of the description
    meta_search = re.search(
        r'meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"',
        infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search is not None:
        infoModule.info.page['meta_description'] = meta_search.group(1)
        log.plog(
            "meta_description: " + infoModule.info.page['meta_description'], 2)

    log.plog(
        '======================================= TITLE ================================',
        2)
    # get title
    #set HTMLTitle first
    HTMLTitle = re.search(r'<title>(.*?)</title>',
                          infoModule.info.page['rawHTML'], re.S | re.I)
    if HTMLTitle is not None:
        infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1)
        log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2)
    else:
        infoModule.info.page['HTMLTitle'] = ""

    # Preference order: regex title, then potential_title, then realTitle,
    # then the raw <title> tag.  This has to be a single if/elif/else chain,
    # otherwise a title found by regex is immediately overwritten.
    title = find_title.findTitle()
    if title != False:
        infoModule.info.page['title'] = title
        log.plog('title from regex', 2)
    elif 'potential_title' in infoModule.info.page and len(
            infoModule.info.page['potential_title']) > 0:
        infoModule.info.page['title'] = strip_html.clearHTML(
            infoModule.info.page['potential_title'])
        log.plog('title from potential_title', 2)
    else:
        infoModule.info.page['title'] = real_title2.realTitle()
        if infoModule.info.page['title'] == False:
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log.plog('using html title', 2)
        else:
            log.plog('title from realTitle', 2)

    if infoModule.info.page['title'] == '':
        log.plog('could not find title for page. Setting to HTML Title', 4)
        infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']

    #clear html from title
    infoModule.info.page['title'] = strip_html.clearHTML(
        infoModule.info.page['title'])
    log.plog('final title: ' + infoModule.info.page['title'], 2)

    log.plog(
        '======================================= OUTLINE ================================',
        2)
    ## fetch outline
    #remove special case elements from the html.  These are lines or blocks of code that cause
    #problems if left in
    infoModule.info.page['plainText'] = strip_html.removeSpecialCases(
        infoModule.info.page['rawHTML'])
    infoModule.info.page['plainText'] = strip_html.clearHTML(
        infoModule.info.page['plainText'])
    #clearHTML can't take out title, because title gets passed to clearHTML, but it should be removed here
    infoModule.info.page['plainText'] = re.sub(
        '<title.*?</title.*?>', '', infoModule.info.page['plainText'], 0,
        re.I | re.S | re.M)

    log.plog('searching for body using body extractor', 2)
    infoModule.info.site['body_extractor_no_date'] = True
    outline = body_extractor.extract(infoModule.info.page['plainText'],
                                     doAsciiConvert=False)
    # keep the full document around for image / video discovery, since
    # rawHTML may be narrowed to the story body below
    infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML']
    # other callers of body_extractor.extract test its result against False,
    # so treat both None and False as "no outline"
    if outline is not None and outline is not False:
        abbreviatedHTML = html_body_extractor.html_body_extractor(
            infoModule.info.page['rawHTML'], outline)
        if abbreviatedHTML is not None:
            infoModule.info.page['rawHTML'] = abbreviatedHTML
        #use largestBlock to strip leading dom elements off that seem extraneous
        infoModule.info.page['outline'] = largestBlock.removePreceedingBlocks(
            infoModule.info.page['imageHTML'], outline)
    else:
        # not fatal: a page can still qualify through its images or videos
        log.plog('could not create an outline for this story!', 5)
        infoModule.info.page['outline'] = ''

    # outline must be at least minOutlineLen
    minOutlineLen = 255
    if 0 < len(infoModule.info.page['outline']) < minOutlineLen:
        log.plog('outline too short, assuming failure', 3)
        infoModule.info.page['outline'] = ''

    log.plog(
        '======================================= IMAGES ================================',
        2)
    #find images
    imageArray = find_all_images.findImages(infoModule.info.page['imageHTML'],
                                            url)
    if imageArray is None:
        log.plog('could not find image', 3)
        # '' (not []) is what has always been reported in the JSON for this case
        imageArray = ''

    log.plog(
        '======================================= VIDEOS ================================',
        2)
    ###look for videos
    allVideosJSON = find_all_videos.find_all_videos(
        infoModule.info.page['imageHTML'])

    allVideos = json.loads(allVideosJSON)
    if len(allVideos) > 0:
        log.plog('found video embed', 2)
        print(allVideosJSON)

    #if no outline and no images and no videos, then no story
    # (allVideos is decoded JSON, so test for emptiness instead of comparing
    # it to '', and actually return the failure)
    if infoModule.info.page['outline'] == '' and not imageArray and not allVideos:
        return failOn('nothing found')

    #largest image if no outline must be at least minImageSize on each side to make it an image page
    largestImageDimensions = 0
    largestImage = []
    for image in imageArray:
        if image['width'] * image['height'] > largestImageDimensions:
            largestImage = image
            largestImageDimensions = image['width'] * image['height']

    print(largestImage)
    minImageSize = 400
    if infoModule.info.page['outline'] == '' and allVideos == [] and (
            largestImage == [] or largestImage['width'] < minImageSize
            or largestImage['height'] < minImageSize):
        return (failOn(
            'no story or video found, and largest image less than min size'))

    status = 'OK'
    storyObj = {}
    storyObj['title'] = infoModule.info.page['title']
    storyObj['outline'] = unicodeMapper.clearCurlies(
        infoModule.info.page['outline'])
    storyObj['url'] = url
    storyObj['images'] = imageArray
    storyObj['videos'] = allVideos
    returnVal = {"status": status, "story": storyObj}
    output = json.dumps(returnVal)
    return output

Exemplo n.º 19

0

Exibir arquivo

def scanPage():
    """Fetch the current page and print an HTML report of what was extracted.

    Reads the address from ``infoModule.info.page['url']``, downloads it and,
    as far as the code visible here goes, fills these
    ``infoModule.info.page`` keys: ``rawHTML``, ``url`` (updated when the
    request was redirected), ``imageSource``, ``author``, ``plainText`` and,
    when the story regex matches, ``outline``.

    Printed output: a link to the URL, the author (when one is found), the
    outline (when one is found) and a closing ``<hr>``.

    Calls ``sys.exit()`` when no url is set or when the downloaded document is
    longer than 500000 characters.
    """
    if 'url' not in infoModule.info.page:
        log.plog('scan page called without url', 4)
        sys.exit()

    log.plog("fetching " + infoModule.info.page['url'], 2)
    socket = urllib.urlopen(infoModule.info.page['url'])
    infoModule.info.page['rawHTML'] = socket.read()
    # geturl() gives the address actually served, which differs after a redirect
    redirURL = socket.geturl()
    if redirURL != infoModule.info.page['url']:
        log.plog('redirected to ' + redirURL, 2)
        infoModule.info.page['url'] = redirURL

    ## maybe check last modified header and don't get stories older than 7 days?

    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        sys.exit()

    print "<b>URL</b> <a href=\"" + infoModule.info.page[
        'url'] + "\">" + infoModule.info.page['url'] + "</a><br />"
    #print "<b>URL</b> " + infoModule.info.page['url'] + "<br />"
    # (disabled) get title
    #title = find_title.findTitle()
    #if title != False:
    #    infoModule.info.page['title'] = title
    #elif 'potential_title' in infoModule.info.page and len(infoModule.info.page['potential_title']) > 0:
    #    infoModule.info.page['title'] = infoModule.info.page['potential_title']
    #else:
    #    log.plog('no title found!', 3)
    #    sys.exit()

    ##print infoModule.info.page['title']

    # (disabled) find images
    #if 'image_start_marker' in infoModule.info.source:
    #    image_start_marker = infoModule.info.source['image_start_marker']
    #else:
    #    image_start_marker = ''

    #if 'image_end_marker' in infoModule.info.source:
    #    image_end_marker = infoModule.info.source['image_end_marker']
    #else:
    #    image_end_marker = ''
    #imageArray = find_images.findImages(infoModule.info.page['rawHTML'], image_start_marker, image_end_marker)
    #x = imageArray[0]
    #y = imageArray[1]
    #imageURL = imageArray[2]

    #if imageURL == '':
    #    log.plog('could not find image', 3)
    #else:
    #    log.plog('image found: ' + imageURL, 2)
    #    infoModule.info.page['largestImage'] = imageURL
    #    infoModule.info.page['maxSize'] = x * y

    ## image credit if any
    # default to an empty credit; only overwritten when both markers are
    # configured and findCredit returns something other than False / None
    infoModule.info.page['imageSource'] = ''
    if 'image_source_start_marker' in infoModule.info.source and 'image_source_end_marker' in infoModule.info.source:
        imageSource = find_credit.findCredit(
            infoModule.info.page['rawHTML'],
            infoModule.info.source['image_source_start_marker'],
            infoModule.info.source['image_source_end_marker'])
        if imageSource != False and imageSource != None:
            infoModule.info.page['imageSource'] = imageSource
            #print "<b>Image Credit:</b> " + imageSource + "<br />"

    ### (disabled) look for videos
    #videoHunter = find_video.youtube()

    #videoLink = videoHunter.getURL(infoModule.info.page['rawHTML'])
    #if videoLink == False:
    #    log.plog('no video found', 2)
    #    infoModule.info.page['vlink'] = ''
    #else:
    #    log.plog('found video embed', 2)
    #    infoModule.info.page['vlink'] = videoLink

    ## (disabled) parse links in page
    #links.linkScoring(infoModule.info.page['rawHTML'], 'subs')
    #links.linkScoring(infoModule.info.page['rawHTML'], 'newsroom')
    #links.outboundLinks(infoModule.info.page['rawHTML'])

    ##author in story?
    # the author is only looked up when both author markers are configured;
    # in every other case page['author'] ends up as ''
    if 'author_start_marker' in infoModule.info.source and 'author_end_marker' in infoModule.info.source:
        author = find_author.findAuthor()
        if author != False:
            # escape '<' so the author string cannot open a tag in the printed HTML
            author = author.replace('<', '&lt;')
            infoModule.info.page['author'] = author
            print "<b>Author:</b> " + author + "<br />"
        else:
            infoModule.info.page['author'] = ''
    else:
        infoModule.info.page['author'] = ''

    ## fetch outline
    # featured sources (featured_source == '1') go through clearHTMLFeatures,
    # everything else through clearHTML
    if 'featured_source' in infoModule.info.source and infoModule.info.source[
            'featured_source'] == '1':
        infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures(
            infoModule.info.page['rawHTML'])
    else:
        infoModule.info.page['plainText'] = strip_html.clearHTML(
            infoModule.info.page['rawHTML'])

    # findStoryViaRegex takes no arguments here; presumably it reads the page
    # and markers from infoModule.info -- TODO confirm
    outline = find_story.findStoryViaRegex()
    if outline != False:
        if 'featured_source' in infoModule.info.source and infoModule.info.source[
                'featured_source'] == '1':
            infoModule.info.page['outline'] = strip_html.clearHTMLFeatures(
                outline)
        else:
            infoModule.info.page['outline'] = strip_html.clearHTML(outline)
        print "<b>Outline:</b> " + infoModule.info.page['outline'] + "<br />"

    print "<hr>"