Example #1
def getEntities(searchText, title, jsonOut=True, byID=False):
    infoModule.info.page['outline'] = searchText
    infoModule.info.page['title'] = title

    entities.entityFinder(title + ' ' + searchText, True)
    entities.nicknameFinder(title + ' ' + searchText, True, True)
    entities.setPrimo()
    #pprint.pprint(infoModule.info.entityList)
    
    entityList = infoModule.info.entityList
    res = ''  #default so res is always defined, even when jsonOut is False
    if jsonOut == True:
        #res = json.dumps(infoModule.info.entityList)
        #pprint.pprint(res)
        if len(entityList) > 0:
            ents = entityList.keys()

            # hacky JSON building; python's json module output differs slightly from what we need
            res = '['
            for ids in ents:
                entityName = entityLib.entityLibrary(ids, 'entityName')
                celeb_type = entityLib.entityLibrary(ids, 'celeb_type')
                linkPath   = entityLib.entityLibrary(ids, 'linkPath')
                entityURL   = entityLib.entityLibrary(ids, 'lookupUrl')
                if byID:
                    #swap URL for ID
                    entityURL = str(ids)
            
                if celeb_type != 'hidden':
                    if 'nameUsed' in entityList[ids]:
                        #this means there's a nickname response
                        res = res + '{"id":"' + str(entityURL) + '","name":"' + str(entityName) + '","primo":"' + entityList[ids]['primo'] + '","type":"' + str(celeb_type) + '","nameUsed":"' + entityList[ids]['nameUsed'] + '",' + '"links":[{"from":"celebrifi.com","url":"' + str(linkPath) + '","icon":"http://informifi.com/fi_icon.png"},{"from":"politifi.com","url":"' + str(linkPath) + '","icon":"http://informifi.com/fi_icon.png"}]},'
                    else:
                        res = res + '{"id":"' + str(entityURL) + '","name":"' + str(entityName) + '","primo":"' + entityList[ids]['primo'] + '","type":"' + str(celeb_type) + '",' + '"links":[{"from":"celebrifi.com","url":"' + str(linkPath) + '","icon":"http://informifi.com/fi_icon.png"},{"from":"politifi.com","url":"' + str(linkPath) + '","icon":"http://informifi.com/fi_icon.png"}]},'
            res = res[:-1] + ']'
            #catch for nothing but hiddens
            if res == ']':
                res = ''
            #pprint.pprint(res)
        else:
            res = ''
    return res
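
A note on the hand-built JSON above: any entity name containing a double quote produces invalid output, and the manual comma trimming is easy to break. Below is a minimal sketch of the same payload built with the standard json module instead; buildEntityJSON is a hypothetical helper, key order may differ from the handmade string (likely the mismatch the comment above refers to), and entityLib is assumed to be the same module getEntities already uses.

import json

def buildEntityJSON(entityList, byID=False):
    #same structure as the string concatenation in getEntities, but
    #json.dumps handles quoting and escaping of names for us
    out = []
    for ids in entityList.keys():
        celeb_type = entityLib.entityLibrary(ids, 'celeb_type')
        if celeb_type == 'hidden':
            continue
        linkPath = str(entityLib.entityLibrary(ids, 'linkPath'))
        entry = {
            'id': str(ids) if byID else str(entityLib.entityLibrary(ids, 'lookupUrl')),
            'name': str(entityLib.entityLibrary(ids, 'entityName')),
            'primo': entityList[ids]['primo'],
            'type': str(celeb_type),
            'links': [{'from': 'celebrifi.com', 'url': linkPath,
                       'icon': 'http://informifi.com/fi_icon.png'},
                      {'from': 'politifi.com', 'url': linkPath,
                       'icon': 'http://informifi.com/fi_icon.png'}],
        }
        if 'nameUsed' in entityList[ids]:
            #nickname response, same optional key as the handmade version
            entry['nameUsed'] = entityList[ids]['nameUsed']
        out.append(entry)
    #empty string when nothing visible remains, matching getEntities
    return json.dumps(out) if out else ''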
Example #2
    else:
        infoModule.info.page['outline'] = strip_html.clearHTML(outline)
else:
    log.plog('searching for body using body extractor', 2)
    outline = body_extractor.extract(infoModule.info.page['plainText'])
    if outline != False:
        infoModule.info.page['outline'] = outline
    else:
        log.plog('could not create an outline for this story!', 5)
        sys.exit()
'''
#add nickname for testing
infoModule.info.page['title'] = 'Elizabeth Smart: Cop Found Me But "Walked Away"'
infoModule.info.page['outline'] = '(AP)  Elizabeth Smart told jurors Tuesday how a Salt Lake City police detective tried to see behind her veil but backed down when the man accused of kidnapping her said her face was hidden for religious reasons. "I was mad at myself, that I didn\'t say anything," she said on her second day of testimony in the federal trial of Brian David Mitchell. "I felt terrible that the detective hadn\'t pushed harder and had just walked away." Smart, now 23, was 14 when she was taken at knifepoint in June 2002 while sleeping. Nine months later, motorists spotted her walking in a Salt Lake City suburb with Mitchell. Mitchell, 57, faces life in prison if he is convicted of kidnapping and unlawful transportation of a minor across state lines with the intent to engage in criminal sexual activity.'

entities.entityFinder(infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'], True)
print "entity IDs found:"
pprint.pprint(infoModule.info.entityList)

entities.nicknameFinder(infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'], True)

print "entity IDs found after nicknames:"
pprint.pprint(infoModule.info.entityList)

entities.setPrimo()

print "entity state after setPrimo"
pprint.pprint(infoModule.info.entityList)
print "Total Entities Found:"
print(len(infoModule.info.entityList))
Example #3
def scanPage():
    siteDB = infoModule.info.site['database']

    if 'url' not in infoModule.info.page:
        log.plog('scan page called without url', 4)
        os._exit(0)
        
    urlBlockerQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".urlBlocker", infoModule.info.site['dblink'])
    while True:
        urlBlocker = urlBlockerQ.fetch_row(1,1)
        if urlBlocker == ():
            break
        blockTest = re.search(urlBlocker[0]['regex'], infoModule.info.page['url'])
        if blockTest != None:
            log.plog('url ' + infoModule.info.page['url'] + " matches urlBlocker " + urlBlocker[0]['regex'], 2)
            os._exit(0)
                
    log.plog("fetching " + infoModule.info.page['url'], 2)
    try:
        socket = urllib.urlopen(infoModule.info.page['url'])
    except IOError:
        log.plog('could not open ' + infoModule.info.page['url'], 4)
        return False
    responseCode = socket.getcode()
    log.plog('urllib response code: ' + str(responseCode), 2)
    if responseCode != 200 and responseCode != 302 and responseCode != 301 and responseCode != 303:
        log.plog('got failure response code from server', 4)
        return False
    headerInfo = socket.info()
    contentType = headerInfo.gettype()
    if contentType != 'text/html' and contentType != 'text/html, text/html':
        log.plog('content type: ' + contentType + '. not fetching', 4)
        return False
    # put in to account for WSJ -dpg
    if re.search("wsj\.com", infoModule.info.page['url'], re.S | re.M | re.I):
        infoModule.info.page['rawHTML'] = wsjAuthHack(infoModule.info.page['url'])
    elif re.search("nytimes\.com", infoModule.info.page['url'], re.S | re.M | re.I):
        infoModule.info.page['rawHTML'] = nytAuthHack(infoModule.info.page['url'])
    else:
        infoModule.info.page['rawHTML'] = socket.read()
    redirURL = socket.geturl()
    if redirURL != infoModule.info.page['url']:
        log.plog('redirected to ' + redirURL, 2)
        infoModule.info.page['url'] = redirURL
        #redirected urls need to be blocked too
        urlBlockerQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".urlBlocker", infoModule.info.site['dblink'])
        while True:
            urlBlocker = urlBlockerQ.fetch_row(1,1)
            if urlBlocker == ():
                break
            blockTest = re.search(urlBlocker[0]['regex'], infoModule.info.page['url'])
            if blockTest != None:
                log.plog('url ' + infoModule.info.page['url'] + " matches urlBlocker " + urlBlocker[0]['regex'], 2)
                os._exit(0)

        ### and short url needs to be blocked
        #do not read links that have only one string in them
        linkParts = urlparse.urlparse(infoModule.info.page['url']) 
        shortPath = re.search('^/\w+/*$', linkParts[2])
        lp = linkParts[2]
        if shortPath != None:
            log.plog("link excluded because it only has a short path of characters: %s" % linkParts[2], 2)
            os._exit(0)

    ## anything in htmlBlacklist?
    htmlBlacklistQ = mysql_tools.mysqlQuery("select regex from " + siteDB + ".htmlBlacklist", infoModule.info.site['dblink'])
    while True:
        htmlBlacklist = htmlBlacklistQ.fetch_row(1,1)
        if htmlBlacklist == ():
            break
        badSeedHTML = re.search(htmlBlacklist[0]['regex'], infoModule.info.page['rawHTML'])
        if badSeedHTML != None:
            log.plog('html matches htmlBlocker regex: ' + htmlBlacklist[0]['regex'], 3)
            os._exit(0) 
    
    ###################################
    #special case for feedburner sources
    #ernst does not like special cases
    ###################################
    infoModule.info.page['url'] = re.sub('\?.*utm_source.*$', '', infoModule.info.page['url'])
    
    #check AGAIN to see if url is already in system
    escURL = infoModule.info.page['url'].replace("'", "\\'")
    urlCheckQ = mysql_tools.mysqlQuery("select sub_id from " + siteDB + ".newsroom where url='" + escURL + "'", infoModule.info.site['dblink'])
    #don't exit, return false so that a new story can be tried
    if urlCheckQ.num_rows() > 0:
        log.plog("scanpage-url already in newsroom: %s" % infoModule.info.page['url'] , 2)
        log.plog("newsroom_id: " + str(urlCheckQ.fetch_row(1,1)))
        return False
    urlCheckQ = mysql_tools.mysqlQuery("select sub_id from " + siteDB + ".subs where url='" + escURL + "'", infoModule.info.site['dblink'])
    if urlCheckQ.num_rows() > 0:
        
        log.plog("scanpage-url already in subs: %s" % infoModule.info.page['url'], 2)
        log.plog("sub_id: " + str(urlCheckQ.fetch_row(1,1)))
        return False


    ## if source is '0', try to find source
    if infoModule.info.source['source_id'] == '0':
        sourceRegexQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".sources where url_regex != ''", infoModule.info.site['dblink'])
        while True:
            sourceRegex = sourceRegexQ.fetch_row(1,1)
            if sourceRegex == ():
                break
            urlTest = re.search(sourceRegex[0]['url_regex'], infoModule.info.page['url'])
            if urlTest != None:
                log.plog('found source via regex: ' + sourceRegex[0]['title'], 2)
                infoModule.info.source = sourceRegex[0]
                for i in infoModule.info.source.keys():
                    ## this is sort of hack-y, but stupid python returns None for null
                    if infoModule.info.source[i] == None:
                        infoModule.info.source[i] = ''

                break
    
    ## maybe check last modified header and don't get stories older than 7 days?
    '''possibleAgeInDays = dateGuesser.urlDateGuesser(infoModule.info.page['url'])
    if possibleAgeInDays != None:
        log.plog("age of story might be: " + str(possibleAgeInDays) + " based on " + infoModule.info.page['url'], 2)
        if int(possibleAgeInDays) > 5:
            log.plog("story is " + str(possibleAgeInDays) + " days old.  Not reading", 2)
            return False
'''
    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        os._exit(0)

    #add meta description into the mix
    infoModule.info.page['meta_description'] = ''
    meta_search = re.search('meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"', infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search != None:
        infoModule.info.page['meta_description'] = meta_search.group(1).decode('utf-8')
        log.plog("meta_description: " + infoModule.info.page['meta_description'], 2)
        

    log.plog('======================================= TITLE ================================', 2)
    # get title
    #set HTMLTitle first
    HTMLTitle = re.search('<title>(.*?)<\/title>', infoModule.info.page['rawHTML'], re.S | re.I)
    if HTMLTitle != None:
        infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1)
        log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2)
    else:
        infoModule.info.page['HTMLTitle'] = ""
    title = find_title.findTitle()
    if title != False:
        infoModule.info.page['title'] = title
        log.plog('title from regex', 2)
    if 'potential_title' in infoModule.info.page and len(infoModule.info.page['potential_title']) > 0:
        infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['potential_title'])
        log.plog('title from potential_title', 2)
    else:
        infoModule.info.page['title'] = real_title2.realTitle()
        if infoModule.info.page['title'] == False:
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log.plog('using html title', 2)
        else: 
            log.plog('title from realTitle', 2)

    if infoModule.info.page['title'] == '':
        log.plog('could not find title for page. Setting to HTML Title', 4)
        infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']

    #clear html from title
    infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['title'])
    #also titleCase the title
    #infoModule.info.page['title'] = infoModule.info.page['title'].title()
    log.plog('final title: ' + infoModule.info.page['title'], 2)


    log.plog('======================================= OUTLINE ================================', 2)        
    ## fetch outline
    if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
        infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures(infoModule.info.page['rawHTML'])
    else:
        infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['rawHTML'])
    
    outline = False
    #this toggle allows for ignoring regex in favor of body_extractor
    if infoModule.info.site['skipBodyRegex'] == False:
        storySearch = timeout.TimeoutFunction(find_story.findStoryViaRegex, 2)
        try:
            outline = storySearch()
            #set html block used for image, author and links to be what outline returns
            if outline != False:
                infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML']
                infoModule.info.page['rawHTML'] = outline
        except TimeoutFunctionException:
            outline = False
            log.plog("ERROR regex timed out for %s" % infoModule.info.source['story_start_marker'], 5)

    #outline = find_story.findStoryViaRegex()
    if outline != False:
        if infoModule.info.page['promoter'] == '0' and infoModule.info.source['source_id'] != '0' and 'source_format' in infoModule.info.source and len(infoModule.info.source['source_format']) > 0:
            #link scoring only happens on rss feeds
            ## parse links in page only in regex block if we have regex
            log.plog('======================================= LINK SCORING ================================', 2)
            links.linkScoring(outline, 'subs')
            links.linkScoring(outline, 'newsroom')
            log.plog('======================================= OUTBOUND LINKS ================================', 2)
            #don't go more than one level deep on blind stories
            links.outboundLinks(outline)
        

        if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
            infoModule.info.page['outline'] = strip_html.clearHTMLFeatures(outline)
        else:
            infoModule.info.page['outline'] = strip_html.clearHTML(outline)
    else:
        log.plog('searching for body using body extractor', 2)
        outline = body_extractor.extract(infoModule.info.page['plainText'])
        if outline != False:
            infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML']
            abbreviatedHTML = html_body_extractor.html_body_extractor(infoModule.info.page['rawHTML'], outline)
            if abbreviatedHTML != None:
                infoModule.info.page['rawHTML'] = abbreviatedHTML
            infoModule.info.page['outline'] = outline
        else:
            log.plog('could not create an outline for this story!', 5)
            os._exit(0)
        ## parse links in page - no regex, so look in rawHTML for links
        ## if there are widgetBlockers, first clear them from the html
        linkHTML = infoModule.info.page['rawHTML']
        widgetBlacklistQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".widgetBlacklist", infoModule.info.site['dblink'])
        while True:
            widgetBlacklist = widgetBlacklistQ.fetch_row(1,1)
            if widgetBlacklist == ():
                break
            if isinstance(linkHTML, str) == False:
                log.plog('linkHTML is not string', 5)
                os._exit(0)
            wblMatch = re.search(widgetBlacklist[0]['start_text'] + '.*?' + widgetBlacklist[0]['end_text'], linkHTML, re.S | re.I)
            if wblMatch != None:
                log.plog("found widget blacklist for " + widgetBlacklist[0]['start_text'] + '.*?' + widgetBlacklist[0]['end_text'], 2)
                linkHTML = linkHTML.replace(wblMatch.group(0), '')
                mysql_tools.mysqlQuery("update " + siteDB + ".widgetBlacklist set hits=hits+1 where widget_id=" + widgetBlacklist[0]['widget_id'], infoModule.info.site['dblink'])
                
        if infoModule.info.page['promoter'] == '0' and infoModule.info.source['source_id'] != '0' and 'source_format' in infoModule.info.source and len(infoModule.info.source['source_format']) > 0:
            #link scoring only happens on rss feeds
            log.plog('======================================= LINK SCORING ================================', 2)                
            links.linkScoring(linkHTML, 'subs')
            links.linkScoring(linkHTML, 'newsroom')
            log.plog('======================================= OUTBOUND LINKS ================================', 2)
            #don't go more than one level deep on blind stories
            links.outboundLinks(linkHTML)



    log.plog('======================================= IMAGES ================================', 2)
    #find images        
    if 'image_start_marker' in infoModule.info.source:
        image_start_marker = infoModule.info.source['image_start_marker']
    else:
        image_start_marker = ''

    if 'image_end_marker' in infoModule.info.source:
        image_end_marker = infoModule.info.source['image_end_marker']
    else:
        image_end_marker = ''
    imageArray = find_images.findImages(infoModule.info.page['imageHTML'], image_start_marker, image_end_marker)
    if imageArray == None:
        log.plog('could not find image', 3)    
    else:
        x = imageArray[0]
        y = imageArray[1]
        imageURL = imageArray[2]

        if imageURL == '':
            log.plog('could not find image', 3)
        else:
            log.plog('image found: ' + imageURL, 2)
            infoModule.info.page['largestImage'] = imageURL
            infoModule.info.page['maxSize'] = x * y

    log.plog('======================================= IMAGE CREDIT ================================', 2)
    ## image credit if any
    infoModule.info.page['imageSource'] = ''
    if 'image_source_start_marker' in infoModule.info.source and 'image_source_end_marker' in infoModule.info.source:
        imageSource = find_credit.findCredit(infoModule.info.page['rawHTML'], infoModule.info.source['image_source_start_marker'], infoModule.info.source['image_source_end_marker'])
        if imageSource != False:
            infoModule.info.page['imageSource'] = imageSource

    log.plog('======================================= VIDEOS ================================', 2)
    ###look for videos
    videoLink = find_video.findVideoEmbed(infoModule.info.page['rawHTML'])

    if videoLink == False:
        infoModule.info.page['vlink'] = ''
    else:
        log.plog('found video embed', 2)
        infoModule.info.page['vlink'] = videoLink
        vthumb = find_video.findVideoThumb(videoLink)
        if vthumb == False:
            infoModule.info.page['vthumb'] = ''
        else:
            log.plog('found video thumb', 2)
            infoModule.info.page['vthumb'] = vthumb

    log.plog('======================================= AUTHOR ================================', 2)    
    ##author in story?
    if 'author_start_marker' in infoModule.info.source and 'author_end_marker' in infoModule.info.source:
        author = find_author.findAuthor()
        if author != False:
            author = strip_html.clearHTML(author)
            infoModule.info.page['author'] = author
        else:
            infoModule.info.page['author'] = ''
    else:
        infoModule.info.page['author'] = ''
            
    log.plog('======================================= ENTITIES ================================', 2)
    #### find entities
    entities.entityFinder(infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'], True)
    nickname = False
    while nickname is False: 
        try:
            entities.nicknameFinder(infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'], True)
            nickname = True
        except:
            pass
    ## test cityToTeam
    #cityToTeam.getRelevantEntity()

    entities.setPrimo()

    #### chop outline to 500 chars unless featured
    if 'featured_source' not in infoModule.info.source or infoModule.info.source['featured_source'] == '0':
        infoModule.info.page['outline'] = infoModule.info.page['outline'][0:500] + '...'
    
    if len(infoModule.info.entityList) < 1:
        log.plog("no entities found in story!", 5)
        os._exit(0)

    log.plog('======================================= UNKNOWN ENTITIES ================================', 2)
    ## any unknown entities?
    entityFixedString = infoModule.info.page['title'] + ' ' + infoModule.info.page['outline']
    entityFixedString = entityFixedString.replace("'s", "")
    entityFixedString = re.sub('\W+', ' ', entityFixedString)
    
    find_new_entities.find_new_entities(entityFixedString)
    ## page must have at least one non-hidden entity            
    invisibleTypesQuery = mysql_tools.mysqlQuery("select mptype_id from db_topics.mptypes where visibility='invisible'", infoModule.info.site['dblink'])
    invisibleTypes = ''
    sep = ''
    while True:
        oneType = invisibleTypesQuery.fetch_row(1,1)
        if oneType == ():
            break
        invisibleTypes = invisibleTypes + sep + oneType[0]['mptype_id']
        sep = ','

    sep = ''
    cclist = ''
    for eKey in infoModule.info.entityList.keys():
        cclist = cclist + sep + str(eKey)
        sep = ','


    sql = "select celeb_id from db_topics.celebs where celeb_id in (" + cclist + ") and mptype_id not in (" + invisibleTypes + ")"
    nonHiddenEntitiesQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
    if nonHiddenEntitiesQ.num_rows() == 0:
        log.plog('no non-hidden entities found in story!', 4)
        os._exit(0)
     
    newSubId = addStory.addStory()
    if newSubId == False:
        log.plog('addStory failed', 5)
        
    else:
        log.plog("Story added.  New sub_id: " + str(newSubId), 2)
        
    os._exit(0)
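
One hazard in scanPage above: the nicknameFinder retry loop pairs a bare except with no attempt limit, so a persistent failure spins forever. Below is a bounded variant, a sketch under the assumption that a few retries are all that was intended; findNicknamesWithRetry is a hypothetical helper, and entities and log are the modules already used above.

def findNicknamesWithRetry(text, maxAttempts=3):
    #retry entities.nicknameFinder a bounded number of times rather than
    #looping on a bare except until it happens to succeed
    for attempt in range(maxAttempts):
        try:
            entities.nicknameFinder(text, True)
            return True
        except Exception:
            log.plog('nicknameFinder failed, attempt %d of %d'
                     % (attempt + 1, maxAttempts), 3)
    return False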
Example #4
def scanPage():
    siteDB = infoModule.info.site['database']

    if 'url' not in infoModule.info.page:
        log.plog('scan page called without url', 4)
        os._exit(0)

    urlBlockerQ = mysql_tools.mysqlQuery(
        "select * from " + siteDB + ".urlBlocker",
        infoModule.info.site['dblink'])
    while True:
        urlBlocker = urlBlockerQ.fetch_row(1, 1)
        if urlBlocker == ():
            break
        blockTest = re.search(urlBlocker[0]['regex'],
                              infoModule.info.page['url'])
        if blockTest != None:
            log.plog(
                'url ' + infoModule.info.page['url'] + " matches urlBlocker " +
                urlBlocker[0]['regex'], 2)
            os._exit(0)

    log.plog("fetching " + infoModule.info.page['url'], 2)
    try:
        socket = urllib.urlopen(infoModule.info.page['url'])
    except IOError:
        log.plog('could not open ' + infoModule.info.page['url'], 4)
        return False
    responseCode = socket.getcode()
    log.plog('urllib response code: ' + str(responseCode), 2)
    if responseCode != 200 and responseCode != 302 and responseCode != 301 and responseCode != 303:
        log.plog('got failure response code from server', 4)
        return False
    headerInfo = socket.info()
    contentType = headerInfo.gettype()
    if contentType != 'text/html' and contentType != 'text/html, text/html':
        log.plog('content type: ' + contentType + '. not fetching', 4)
        return False
    # put in to account for WSJ -dpg
    if re.search("wsj\.com", infoModule.info.page['url'], re.S | re.M | re.I):
        infoModule.info.page['rawHTML'] = wsjAuthHack(
            infoModule.info.page['url'])
    elif re.search("nytimes\.com", infoModule.info.page['url'],
                   re.S | re.M | re.I):
        infoModule.info.page['rawHTML'] = nytAuthHack(
            infoModule.info.page['url'])
    else:
        infoModule.info.page['rawHTML'] = socket.read()
    redirURL = socket.geturl()
    if redirURL != infoModule.info.page['url']:
        log.plog('redirected to ' + redirURL, 2)
        infoModule.info.page['url'] = redirURL
        #redirected urls need to be blocked too
        urlBlockerQ = mysql_tools.mysqlQuery(
            "select * from " + siteDB + ".urlBlocker",
            infoModule.info.site['dblink'])
        while True:
            urlBlocker = urlBlockerQ.fetch_row(1, 1)
            if urlBlocker == ():
                break
            blockTest = re.search(urlBlocker[0]['regex'],
                                  infoModule.info.page['url'])
            if blockTest != None:
                log.plog(
                    'url ' + infoModule.info.page['url'] +
                    " matches urlBlocker " + urlBlocker[0]['regex'], 2)
                os._exit(0)

        ### and short url needs to be blocked
        #do not read links that have only one string in them
        linkParts = urlparse.urlparse(infoModule.info.page['url'])
        shortPath = re.search('^/\w+/*$', linkParts[2])
        lp = linkParts[2]
        if shortPath != None:
            log.plog(
                "link excluded because it only has a short path of characters: %s"
                % linkParts[2], 2)
            os._exit(0)

    ## anything in htmlBlacklist?
    htmlBlacklistQ = mysql_tools.mysqlQuery(
        "select regex from " + siteDB + ".htmlBlacklist",
        infoModule.info.site['dblink'])
    while True:
        htmlBlacklist = htmlBlacklistQ.fetch_row(1, 1)
        if htmlBlacklist == ():
            break
        badSeedHTML = re.search(htmlBlacklist[0]['regex'],
                                infoModule.info.page['rawHTML'])
        if badSeedHTML != None:
            log.plog(
                'html matches htmlBlocker regex: ' + htmlBlacklist[0]['regex'],
                3)
            os._exit(0)

    ###################################
    #special case for feedburner sources
    #ernst does not like special cases
    ###################################
    infoModule.info.page['url'] = re.sub('\?.*utm_source.*$', '',
                                         infoModule.info.page['url'])

    #check AGAIN to see if url is already in system
    escURL = infoModule.info.page['url'].replace("'", "\\'")
    urlCheckQ = mysql_tools.mysqlQuery(
        "select sub_id from " + siteDB + ".newsroom where url='" + escURL +
        "'", infoModule.info.site['dblink'])
    #don't exit, return false so that a new story can be tried
    if urlCheckQ.num_rows() > 0:
        log.plog(
            "scanpage-url already in newsroom: %s" %
            infoModule.info.page['url'], 2)
        log.plog("newsroom_id: " + str(urlCheckQ.fetch_row(1, 1)))
        return False
    urlCheckQ = mysql_tools.mysqlQuery(
        "select sub_id from " + siteDB + ".subs where url='" + escURL + "'",
        infoModule.info.site['dblink'])
    if urlCheckQ.num_rows() > 0:

        log.plog(
            "scanpage-url already in subs: %s" % infoModule.info.page['url'],
            2)
        log.plog("sub_id: " + str(urlCheckQ.fetch_row(1, 1)))
        return False

    ## if source is '0', try to find source
    if infoModule.info.source['source_id'] == '0':
        sourceRegexQ = mysql_tools.mysqlQuery(
            "select * from " + siteDB + ".sources where url_regex != ''",
            infoModule.info.site['dblink'])
        while True:
            sourceRegex = sourceRegexQ.fetch_row(1, 1)
            if sourceRegex == ():
                break
            urlTest = re.search(sourceRegex[0]['url_regex'],
                                infoModule.info.page['url'])
            if urlTest != None:
                log.plog('found source via regex: ' + sourceRegex[0]['title'],
                         2)
                infoModule.info.source = sourceRegex[0]
                for i in infoModule.info.source.keys():
                    ## this is sort of hack-y, but stupid python returns None for null
                    if infoModule.info.source[i] == None:
                        infoModule.info.source[i] = ''

                break

    ## maybe check last modified header and don't get stories older than 7 days?
    '''possibleAgeInDays = dateGuesser.urlDateGuesser(infoModule.info.page['url'])
    if possibleAgeInDays != None:
        log.plog("age of story might be: " + str(possibleAgeInDays) + " based on " + infoModule.info.page['url'], 2)
        if int(possibleAgeInDays) > 5:
            log.plog("story is " + str(possibleAgeInDays) + " days old.  Not reading", 2)
            return False
'''
    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        os._exit(0)

    #add meta description into the mix
    infoModule.info.page['meta_description'] = ''
    meta_search = re.search(
        'meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"',
        infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search != None:
        infoModule.info.page['meta_description'] = meta_search.group(1).decode(
            'utf-8')
        log.plog(
            "meta_description: " + infoModule.info.page['meta_description'], 2)

    log.plog(
        '======================================= TITLE ================================',
        2)
    # get title
    #set HTMLTitle first
    HTMLTitle = re.search('<title>(.*?)<\/title>',
                          infoModule.info.page['rawHTML'], re.S | re.I)
    if HTMLTitle != None:
        infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1)
        log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2)
    else:
        infoModule.info.page['HTMLTitle'] = ""
    title = find_title.findTitle()
    if title != False:
        infoModule.info.page['title'] = title
        log.plog('title from regex', 2)
    if 'potential_title' in infoModule.info.page and len(
            infoModule.info.page['potential_title']) > 0:
        infoModule.info.page['title'] = strip_html.clearHTML(
            infoModule.info.page['potential_title'])
        log.plog('title from potential_title', 2)
    else:
        infoModule.info.page['title'] = real_title2.realTitle()
        if infoModule.info.page['title'] == False:
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log.plog('using html title', 2)
        else:
            log.plog('title from realTitle', 2)

    if infoModule.info.page['title'] == '':
        log.plog('could not find title for page. Setting to HTML Title', 4)
        infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']

    #clear html from title
    infoModule.info.page['title'] = strip_html.clearHTML(
        infoModule.info.page['title'])
    #also titleCase the title
    #infoModule.info.page['title'] = infoModule.info.page['title'].title()
    log.plog('final title: ' + infoModule.info.page['title'], 2)

    log.plog(
        '======================================= OUTLINE ================================',
        2)
    ## fetch outline
    if 'featured_source' in infoModule.info.source and infoModule.info.source[
            'featured_source'] == '1':
        infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures(
            infoModule.info.page['rawHTML'])
    else:
        infoModule.info.page['plainText'] = strip_html.clearHTML(
            infoModule.info.page['rawHTML'])

    outline = False
    #this toggle allows for ignoring regex in favor of body_extractor
    if infoModule.info.site['skipBodyRegex'] == False:
        storySearch = timeout.TimeoutFunction(find_story.findStoryViaRegex, 2)
        try:
            outline = storySearch()
            #set html block used for image, author and links to be what outline returns
            if outline != False:
                infoModule.info.page['imageHTML'] = infoModule.info.page[
                    'rawHTML']
                infoModule.info.page['rawHTML'] = outline
        except TimeoutFunctionException:
            outline = False
            log.plog(
                "ERROR regex timed out for %s" %
                infoModule.info.source['story_start_marker'], 5)

    #outline = find_story.findStoryViaRegex()
    if outline != False:
        if infoModule.info.page['promoter'] == '0' and infoModule.info.source[
                'source_id'] != '0' and 'source_format' in infoModule.info.source and len(
                    infoModule.info.source['source_format']) > 0:
            #link scoring only happens on rss feeds
            ## parse links in page only in regex block if we have regex
            log.plog(
                '======================================= LINK SCORING ================================',
                2)
            links.linkScoring(outline, 'subs')
            links.linkScoring(outline, 'newsroom')
            log.plog(
                '======================================= OUTBOUND LINKS ================================',
                2)
            #don't go more than one level deep on blind stories
            links.outboundLinks(outline)

        if 'featured_source' in infoModule.info.source and infoModule.info.source[
                'featured_source'] == '1':
            infoModule.info.page['outline'] = strip_html.clearHTMLFeatures(
                outline)
        else:
            infoModule.info.page['outline'] = strip_html.clearHTML(outline)
    else:
        log.plog('searching for body using body extractor', 2)
        outline = body_extractor.extract(infoModule.info.page['plainText'])
        if outline != False:
            infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML']
            abbreviatedHTML = html_body_extractor.html_body_extractor(
                infoModule.info.page['rawHTML'], outline)
            if abbreviatedHTML != None:
                infoModule.info.page['rawHTML'] = abbreviatedHTML
            infoModule.info.page['outline'] = outline
        else:
            log.plog('could not create an outline for this story!', 5)
            os._exit(0)
        ## parse links in page - no regex, so look in rawHTML for links
        ## if there are widgetBlockers, first clear them from the html
        linkHTML = infoModule.info.page['rawHTML']
        widgetBlacklistQ = mysql_tools.mysqlQuery(
            "select * from " + siteDB + ".widgetBlacklist",
            infoModule.info.site['dblink'])
        while True:
            widgetBlacklist = widgetBlacklistQ.fetch_row(1, 1)
            if widgetBlacklist == ():
                break
            if isinstance(linkHTML, str) == False:
                log.plog('linkHTML is not string', 5)
                os._exit(0)
            wblMatch = re.search(
                widgetBlacklist[0]['start_text'] + '.*?' +
                widgetBlacklist[0]['end_text'], linkHTML, re.S | re.I)
            if wblMatch != None:
                log.plog(
                    "found widget blacklist for " +
                    widgetBlacklist[0]['start_text'] + '.*?' +
                    widgetBlacklist[0]['end_text'], 2)
                linkHTML = linkHTML.replace(wblMatch.group(0), '')
                mysql_tools.mysqlQuery(
                    "update " + siteDB +
                    ".widgetBlacklist set hits=hits+1 where widget_id=" +
                    widgetBlacklist[0]['widget_id'],
                    infoModule.info.site['dblink'])

        if infoModule.info.page['promoter'] == '0' and infoModule.info.source[
                'source_id'] != '0' and 'source_format' in infoModule.info.source and len(
                    infoModule.info.source['source_format']) > 0:
            #link scoring only happens on rss feeds
            log.plog(
                '======================================= LINK SCORING ================================',
                2)
            links.linkScoring(linkHTML, 'subs')
            links.linkScoring(linkHTML, 'newsroom')
            log.plog(
                '======================================= OUTBOUND LINKS ================================',
                2)
            #don't go more than one level deep on blind stories
            links.outboundLinks(linkHTML)

    log.plog(
        '======================================= IMAGES ================================',
        2)
    #find images
    if 'image_start_marker' in infoModule.info.source:
        image_start_marker = infoModule.info.source['image_start_marker']
    else:
        image_start_marker = ''

    if 'image_end_marker' in infoModule.info.source:
        image_end_marker = infoModule.info.source['image_end_marker']
    else:
        image_end_marker = ''
    imageArray = find_images.findImages(infoModule.info.page['imageHTML'],
                                        image_start_marker, image_end_marker)
    if imageArray == None:
        log.plog('could not find image', 3)
    else:
        x = imageArray[0]
        y = imageArray[1]
        imageURL = imageArray[2]

        if imageURL == '':
            log.plog('could not find image', 3)
        else:
            log.plog('image found: ' + imageURL, 2)
            infoModule.info.page['largestImage'] = imageURL
            infoModule.info.page['maxSize'] = x * y

    log.plog(
        '======================================= IMAGE CREDIT ================================',
        2)
    ## image credit if any
    infoModule.info.page['imageSource'] = ''
    if 'image_source_start_marker' in infoModule.info.source and 'image_source_end_marker' in infoModule.info.source:
        imageSource = find_credit.findCredit(
            infoModule.info.page['rawHTML'],
            infoModule.info.source['image_source_start_marker'],
            infoModule.info.source['image_source_end_marker'])
        if imageSource != False:
            infoModule.info.page['imageSource'] = imageSource

    log.plog(
        '======================================= VIDEOS ================================',
        2)
    ###look for videos
    videoLink = find_video.findVideoEmbed(infoModule.info.page['rawHTML'])

    if videoLink == False:
        infoModule.info.page['vlink'] = ''
    else:
        log.plog('found video embed', 2)
        infoModule.info.page['vlink'] = videoLink
        vthumb = find_video.findVideoThumb(videoLink)
        if vthumb == False:
            infoModule.info.page['vthumb'] = ''
        else:
            log.plog('found video thumb', 2)
            infoModule.info.page['vthumb'] = vthumb

    log.plog(
        '======================================= AUTHOR ================================',
        2)
    ##author in story?
    if 'author_start_marker' in infoModule.info.source and 'author_end_marker' in infoModule.info.source:
        author = find_author.findAuthor()
        if author != False:
            author = strip_html.clearHTML(author)
            infoModule.info.page['author'] = author
        else:
            infoModule.info.page['author'] = ''
    else:
        infoModule.info.page['author'] = ''

    log.plog(
        '======================================= ENTITIES ================================',
        2)
    #### find entities
    entities.entityFinder(
        infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'],
        True)
    nickname = False
    while nickname is False:
        try:
            entities.nicknameFinder(
                infoModule.info.page['title'] + ' ' +
                infoModule.info.page['outline'], True)
            nickname = True
        except:
            pass
    ## test cityToTeam
    #cityToTeam.getRelevantEntity()

    entities.setPrimo()

    #### chop outline to 500 chars unless featured
    if 'featured_source' not in infoModule.info.source or infoModule.info.source[
            'featured_source'] == '0':
        infoModule.info.page[
            'outline'] = infoModule.info.page['outline'][0:500] + '...'

    if len(infoModule.info.entityList) < 1:
        log.plog("no entities found in story!", 5)
        os._exit(0)

    log.plog(
        '======================================= UNKNOWN ENTITIES ================================',
        2)
    ## any unknown entities?
    entityFixedString = infoModule.info.page[
        'title'] + ' ' + infoModule.info.page['outline']
    entityFixedString = entityFixedString.replace("'s", "")
    entityFixedString = re.sub('\W+', ' ', entityFixedString)

    find_new_entities.find_new_entities(entityFixedString)
    ## page must have at least one non-hidden entity
    invisibleTypesQuery = mysql_tools.mysqlQuery(
        "select mptype_id from db_topics.mptypes where visibility='invisible'",
        infoModule.info.site['dblink'])
    invisibleTypes = ''
    sep = ''
    while True:
        oneType = invisibleTypesQuery.fetch_row(1, 1)
        if oneType == ():
            break
        invisibleTypes = invisibleTypes + sep + oneType[0]['mptype_id']
        sep = ','

    sep = ''
    cclist = ''
    for eKey in infoModule.info.entityList.keys():
        cclist = cclist + sep + str(eKey)
        sep = ','

    sql = "select celeb_id from db_topics.celebs where celeb_id in (" + cclist + ") and mptype_id not in (" + invisibleTypes + ")"
    nonHiddenEntitiesQ = mysql_tools.mysqlQuery(sql,
                                                infoModule.info.site['dblink'])
    if nonHiddenEntitiesQ.num_rows() == 0:
        log.plog('no non-hidden entities found in story!', 4)
        os._exit(0)

    newSubId = addStory.addStory()
    if newSubId == False:
        log.plog('addStory failed', 5)

    else:
        log.plog("Story added.  New sub_id: " + str(newSubId), 2)

    os._exit(0)
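
timeout.TimeoutFunction and TimeoutFunctionException are imported from elsewhere and not shown here. On Unix, wrappers like this are commonly built on SIGALRM; the sketch below shows that pattern as an assumption about the approach, not necessarily the project's actual implementation.

import signal

class TimeoutFunctionException(Exception):
    pass

class TimeoutFunction:
    #wrap a callable so it raises TimeoutFunctionException if it runs
    #longer than timeout seconds (Unix-only: relies on SIGALRM)
    def __init__(self, function, timeout):
        self.function = function
        self.timeout = timeout

    def handle_timeout(self, signum, frame):
        raise TimeoutFunctionException()

    def __call__(self, *args):
        old = signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.timeout)
        try:
            result = self.function(*args)
        finally:
            #always cancel the alarm and restore the previous handler
            signal.alarm(0)
            signal.signal(signal.SIGALRM, old)
        return result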
Example #5
        infoModule.info.page['outline'] = outline
    else:
        log.plog('could not create an outline for this story!', 5)
        sys.exit()
'''
#add nickname for testing
infoModule.info.page[
    'title'] = 'Elizabeth Smart: Cop Found Me But "Walked Away"'
infoModule.info.page[
    'outline'] = '(AP)  Elizabeth Smart told jurors Tuesday how a Salt Lake City police detective tried to see behind her veil but backed down when the man accused of kidnapping her said her face was hidden for religious reasons. "I was mad at myself, that I didn\'t say anything," she said on her second day of testimony in the federal trial of Brian David Mitchell. "I felt terrible that the detective hadn\'t pushed harder and had just walked away." Smart, now 23, was 14 when she was taken at knifepoint in June 2002 while sleeping. Nine months later, motorists spotted her walking in a Salt Lake City suburb with Mitchell. Mitchell, 57, faces life in prison if he is convicted of kidnapping and unlawful transportation of a minor across state lines with the intent to engage in criminal sexual activity.'

entities.entityFinder(
    infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'],
    True)
print "entity IDs found:"
pprint.pprint(infoModule.info.entityList)

entities.nicknameFinder(
    infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'],
    True)

print "entity IDs found after nicknames:"
pprint.pprint(infoModule.info.entityList)

entities.setPrimo()

print "entity state after setPrimo"
pprint.pprint(infoModule.info.entityList)
print "Total Entities Found:"
print(len(infoModule.info.entityList))
Example #6
def getEntities(searchText, title, jsonOut=True, byID=False):
    infoModule.info.page['outline'] = searchText
    infoModule.info.page['title'] = title

    entities.entityFinder(title + ' ' + searchText, True)
    entities.nicknameFinder(title + ' ' + searchText, True, True)
    entities.setPrimo()
    #pprint.pprint(infoModule.info.entityList)

    entityList = infoModule.info.entityList
    res = ''  #default so res is always defined, even when jsonOut is False
    if jsonOut == True:
        #res = json.dumps(infoModule.info.entityList)
        #pprint.pprint(res)
        if len(entityList) > 0:
            ents = entityList.keys()

            # hacky JSON building; python's json module output differs slightly from what we need
            res = '['
            for ids in ents:
                entityName = entityLib.entityLibrary(ids, 'entityName')
                celeb_type = entityLib.entityLibrary(ids, 'celeb_type')
                linkPath = entityLib.entityLibrary(ids, 'linkPath')
                entityURL = entityLib.entityLibrary(ids, 'lookupUrl')
                if byID:
                    #swap URL for ID
                    entityURL = str(ids)

                if celeb_type != 'hidden':
                    if 'nameUsed' in entityList[ids]:
                        #this means there's a nickname response
                        res = res + '{"id":"' + str(
                            entityURL
                        ) + '","name":"' + str(
                            entityName
                        ) + '","primo":"' + entityList[ids]['primo'] + '","type":"' + str(
                            celeb_type
                        ) + '","nameUsed":"' + entityList[ids][
                            'nameUsed'] + '",' + '"links":[{"from":"celebrifi.com","url":"' + str(
                                linkPath
                            ) + '","icon":"http://informifi.com/fi_icon.png"},{"from":"politifi.com","url":"' + str(
                                linkPath
                            ) + '","icon":"http://informifi.com/fi_icon.png"}]},'
                    else:
                        res = res + '{"id":"' + str(
                            entityURL
                        ) + '","name":"' + str(
                            entityName
                        ) + '","primo":"' + entityList[ids]['primo'] + '","type":"' + str(
                            celeb_type
                        ) + '",' + '"links":[{"from":"celebrifi.com","url":"' + str(
                            linkPath
                        ) + '","icon":"http://informifi.com/fi_icon.png"},{"from":"politifi.com","url":"' + str(
                            linkPath
                        ) + '","icon":"http://informifi.com/fi_icon.png"}]},'
            res = res[:-1] + ']'
            #catch for nothing but hiddens
            if res == ']':
                res = ''
            #pprint.pprint(res)
        else:
            res = ''
    return res
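
A quick way to exercise getEntities is to round-trip its output through json.loads, which also surfaces the unescaped-quote weakness of the handmade serialization. A hypothetical driver, assuming the surrounding modules are importable and using the test strings from Example #2:

import json

res = getEntities('Elizabeth Smart told jurors Tuesday how a Salt Lake City police detective tried to see behind her veil',
                  'Elizabeth Smart: Cop Found Me But "Walked Away"')
if res:
    #json.loads raises ValueError if a name broke the hand-built JSON
    for entity in json.loads(res):
        print('%s (%s)' % (entity['name'], entity['type']))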