def getEntities(searchText, title, jsonOut=True, byID=False):
    """Find the entities mentioned in a title and body and describe them as JSON.

    The title and text are stored on ``infoModule.info.page`` and run through
    ``entities.entityFinder``, ``entities.nicknameFinder`` and
    ``entities.setPrimo``; the result is read back from
    ``infoModule.info.entityList``.

    Args:
        searchText: Body text to scan; stored as ``page['outline']``.
        title: Title to scan; stored as ``page['title']``.
        jsonOut: When true, return the JSON description. When false nothing
            is returned; the detected entities are still available on
            ``infoModule.info.entityList``.
        byID: When true, each record's ``"id"`` is the entity id instead of
            the entity's ``lookupUrl``.

    Returns:
        A compact JSON array string with one object per entity whose
        ``celeb_type`` is not ``'hidden'``, the empty string when there is no
        such entity, or ``None`` when ``jsonOut`` is false.
    """
    # Local import: keeps the block self-contained whatever the file imports.
    import json

    infoModule.info.page['outline'] = searchText
    infoModule.info.page['title'] = title

    fullText = title + ' ' + searchText
    entities.entityFinder(fullText, True)
    entities.nicknameFinder(fullText, True, True)
    entities.setPrimo()
    entityList = infoModule.info.entityList

    if not jsonOut:
        return None

    def quote(value):
        # Escapes quotes, backslashes and control characters; non-ASCII text
        # is passed through unchanged, as the hand-built version did.
        return json.dumps(value, ensure_ascii=False)

    # The layout (key order, no whitespace) is assembled by hand because
    # consumers expect this exact shape; only the values go through json.
    icon = quote('http://informifi.com/fi_icon.png')
    records = []
    for ids in entityList:
        entityName = entityLib.entityLibrary(ids, 'entityName')
        celeb_type = entityLib.entityLibrary(ids, 'celeb_type')
        linkPath = entityLib.entityLibrary(ids, 'linkPath')
        entityURL = entityLib.entityLibrary(ids, 'lookupUrl')
        if byID:
            # swap URL for ID
            entityURL = ids
        if celeb_type == 'hidden':
            continue

        found = entityList[ids]
        fields = [
            '"id":' + quote(str(entityURL)),
            '"name":' + quote(str(entityName)),
            '"primo":' + quote(found['primo']),
            '"type":' + quote(str(celeb_type)),
        ]
        if 'nameUsed' in found:
            # a nickname matched; report the name that was actually used
            fields.append('"nameUsed":' + quote(found['nameUsed']))

        url = quote(str(linkPath))
        links = ','.join(
            '{"from":"' + site + '","url":' + url + ',"icon":' + icon + '}'
            for site in ('celebrifi.com', 'politifi.com'))
        fields.append('"links":[' + links + ']')
        records.append('{' + ','.join(fields) + '}')

    # Nothing found, or nothing but hidden entities.
    if not records:
        return ''
    return '[' + ','.join(records) + ']'
else: infoModule.info.page['outline'] = strip_html.clearHTML(outline) else: log.plog('searching for body using body extractor', 2) outline = body_extractor.extract(infoModule.info.page['plainText']) if outline != False: infoModule.info.page['outline'] = outline else: log.plog('could not create an outline for this story!', 5) sys.exit() ''' #add nickname for testing infoModule.info.page['title'] = 'Elizabeth Smart: Cop Found Me But "Walked Away"' infoModule.info.page['outline'] = '(AP) Elizabeth Smart told jurors Tuesday how a Salt Lake City police detective tried to see behind her veil but backed down when the man accused of kidnapping her said her face was hidden for religious reasons. "I was mad at myself, that I didn\'t say anything," she said on her second day of testimony in the federal trial of Brian David Mitchell. "I felt terrible that the detective hadn\'t pushed harder and had just walked away." Smart, now 23, was 14 when she was taken at knifepoint in June 2002 while sleeping. Nine months later, motorists spotted her walking in a Salt Lake City suburb with Mitchell. Mitchell, 57, faces life in prison if he is convicted of kidnapping and unlawful transportation of a minor across state lines with the intent to engage in criminal sexual activity.' entities.entityFinder(infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'], True) print "entity IDs found:" pprint.pprint(infoModule.info.entityList) entities.nicknameFinder(infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'], True) print "entity IDs found after nicknames:" pprint.pprint(infoModule.info.entityList) entities.setPrimo() print "entity state after setPrimo" pprint.pprint(infoModule.info.entityList) print "Total Entities Found:" print(len(infoModule.info.entityList))
def scanPage():
    """Fetch the page at infoModule.info.page['url'], analyse it and add it as a story.

    Reads its inputs from infoModule.info (site, page, source) and writes the
    extracted fields back onto infoModule.info.page: rawHTML, meta_description,
    HTMLTitle, title, plainText, outline, imageHTML, largestImage/maxSize,
    imageSource, vlink, vthumb and author. Finishes by calling
    addStory.addStory().

    Returns False when the URL cannot be opened, the response code is not
    200/301/302/303, the content type is not text/html, or the URL is already
    in the newsroom or subs table.

    Calls os._exit(0), ending the whole process, when: no url is set, the URL
    matches a urlBlocker regex, the URL path is a single short segment, the
    HTML matches an htmlBlacklist regex, the HTML is longer than 500000, no
    outline can be built, no entities (or no non-hidden entities) are found,
    and after the addStory attempt.

    NOTE(review): the nesting below was reconstructed from a listing whose
    indentation had been lost; places where the nesting could not be decided
    from the code alone are marked NOTE(review) and need confirming.
    NOTE(review): a second definition of scanPage appears later in this
    listing; if both live in one module the later one is the one that runs.
    """
    siteDB = infoModule.info.site['database']

    if 'url' not in infoModule.info.page:
        log.plog('scan page called without url', 4)
        os._exit(0)

    # Reject the URL if any urlBlocker regex matches it. The loop ends when
    # fetch_row returns an empty tuple.
    urlBlockerQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".urlBlocker", infoModule.info.site['dblink'])
    while True:
        urlBlocker = urlBlockerQ.fetch_row(1,1)
        if urlBlocker == ():
            break
        blockTest = re.search(urlBlocker[0]['regex'], infoModule.info.page['url'])
        if blockTest != None:
            log.plog('url ' + infoModule.info.page['url'] + " matches urlBlocker " + urlBlocker[0]['regex'], 2)
            os._exit(0)

    log.plog("fetching " + infoModule.info.page['url'], 2)
    # `socket` here is the response object returned by urllib.urlopen.
    try:
        socket = urllib.urlopen(infoModule.info.page['url'])
    except IOError:
        log.plog('could not open ' + infoModule.info.page['url'], 4)
        return False
    responseCode = socket.getcode()
    log.plog('urllib response code: ' + str(responseCode), 2)
    if responseCode != 200 and responseCode != 302 and responseCode != 301 and responseCode != 303:
        log.plog('got failure response code from server', 4)
        return False
    headerInfo = socket.info()
    contentType = headerInfo.gettype()
    if contentType != 'text/html' and contentType != 'text/html, text/html':
        log.plog('content type: ' + contentType + '. not fetching', 4)
        return False

    # put in to account for WSJ -dpg
    # For wsj.com / nytimes.com the HTML comes from the auth helpers rather
    # than from the response opened above.
    if re.search("wsj\.com", infoModule.info.page['url'], re.S | re.M | re.I):
        infoModule.info.page['rawHTML'] = wsjAuthHack(infoModule.info.page['url'])
    elif re.search("nytimes\.com", infoModule.info.page['url'], re.S | re.M | re.I):
        infoModule.info.page['rawHTML'] = nytAuthHack(infoModule.info.page['url'])
    else:
        infoModule.info.page['rawHTML'] = socket.read()

    redirURL = socket.geturl()
    if redirURL != infoModule.info.page['url']:
        log.plog('redirected to ' + redirURL, 2)
        infoModule.info.page['url'] = redirURL
        #redirected urls need to be blocked too
        urlBlockerQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".urlBlocker", infoModule.info.site['dblink'])
        while True:
            urlBlocker = urlBlockerQ.fetch_row(1,1)
            if urlBlocker == ():
                break
            blockTest = re.search(urlBlocker[0]['regex'], infoModule.info.page['url'])
            if blockTest != None:
                log.plog('url ' + infoModule.info.page['url'] + " matches urlBlocker " + urlBlocker[0]['regex'], 2)
                os._exit(0)

    ### and short url needs to be blocked
    #do not read links that have only one string in them
    linkParts = urlparse.urlparse(infoModule.info.page['url'])
    shortPath = re.search('^/\w+/*$', linkParts[2])
    # NOTE(review): lp is assigned but not used anywhere in this function.
    lp = linkParts[2]
    if shortPath != None:
        log.plog("link excluded because it only has a short path of characters: %s" % linkParts[2], 2)
        os._exit(0)

    ## anything in htmlBlacklist?
    htmlBlacklistQ = mysql_tools.mysqlQuery("select regex from " + siteDB + ".htmlBlacklist", infoModule.info.site['dblink'])
    while True:
        htmlBlacklist = htmlBlacklistQ.fetch_row(1,1)
        if htmlBlacklist == ():
            break
        badSeedHTML = re.search(htmlBlacklist[0]['regex'], infoModule.info.page['rawHTML'])
        if badSeedHTML != None:
            log.plog('html matches htmlBlocker regex: ' + htmlBlacklist[0]['regex'], 3)
            os._exit(0)

    ###################################
    #special case for feedburner sources
    #ernst does not like special cases
    ###################################
    # Drops the query string from its '?' onward when it contains utm_source.
    infoModule.info.page['url'] = re.sub('\?.*utm_source.*$', '', infoModule.info.page['url'])

    #check AGAIN to see if url is already in system
    # NOTE(review): the queries below are built by string concatenation and
    # only single quotes in the URL are escaped; a backslash in the URL is
    # not handled. Consider a parameterised query if mysql_tools offers one.
    escURL = infoModule.info.page['url'].replace("'", "\\'")
    urlCheckQ = mysql_tools.mysqlQuery("select sub_id from " + siteDB + ".newsroom where url='" + escURL + "'", infoModule.info.site['dblink'])
    #don't exit, return false so that a new story can be tried
    if urlCheckQ.num_rows() > 0:
        log.plog("scanpage-url already in newsroom: %s" % infoModule.info.page['url'] , 2)
        log.plog("newsroom_id: " + str(urlCheckQ.fetch_row(1,1)))
        return False
    urlCheckQ = mysql_tools.mysqlQuery("select sub_id from " + siteDB + ".subs where url='" + escURL + "'", infoModule.info.site['dblink'])
    if urlCheckQ.num_rows() > 0:
        log.plog("scanpage-url already in subs: %s" % infoModule.info.page['url'], 2)
        log.plog("sub_id: " + str(urlCheckQ.fetch_row(1,1)))
        return False

    ## if source is '0', try to find source
    if infoModule.info.source['source_id'] == '0':
        sourceRegexQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".sources where url_regex != ''", infoModule.info.site['dblink'])
        while True:
            sourceRegex = sourceRegexQ.fetch_row(1,1)
            if sourceRegex == ():
                break
            urlTest = re.search(sourceRegex[0]['url_regex'], infoModule.info.page['url'])
            if urlTest != None:
                log.plog('found source via regex: ' + sourceRegex[0]['title'], 2)
                # The first matching row replaces infoModule.info.source.
                infoModule.info.source = sourceRegex[0]
                for i in infoModule.info.source.keys():
                    ## this is sort of hack-y, but stupid python returns None for null
                    if infoModule.info.source[i] == None:
                        infoModule.info.source[i] = ''
                break

    ## maybe check last modified header and don't get stories older than 7 days?
    # The string literal below is disabled code kept as a no-op statement.
    '''possibleAgeInDays = dateGuesser.urlDateGuesser(infoModule.info.page['url'])
    if possibleAgeInDays != None:
        log.plog("age of story might be: " + str(possibleAgeInDays) + " based on " + infoModule.info.page['url'], 2)
        if int(possibleAgeInDays) > 5:
            log.plog("story is " + str(possibleAgeInDays) + " days old. Not reading", 2)
            return False
    '''
    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        os._exit(0)

    #add meta description into the mix
    infoModule.info.page['meta_description'] = ''
    # The capture group keeps the part of the description up to and including
    # its fourth run of whitespace; descriptions with fewer runs do not match.
    meta_search = re.search('meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"', infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search != None:
        infoModule.info.page['meta_description'] = meta_search.group(1).decode('utf-8')
        log.plog("meta_description: " + infoModule.info.page['meta_description'], 2)

    log.plog('======================================= TITLE ================================', 2)
    # get title
    #set HTMLTitle first
    HTMLTitle = re.search('<title>(.*?)<\/title>', infoModule.info.page['rawHTML'], re.S | re.I)
    if HTMLTitle != None:
        infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1)
        log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2)
    else:
        infoModule.info.page['HTMLTitle'] = ""
    title = find_title.findTitle()
    if title != False:
        infoModule.info.page['title'] = title
        log.plog('title from regex', 2)
    # NOTE(review): nesting ambiguous here. As laid out, the if/else below
    # always overwrites the regex title set just above; confirm whether this
    # else was meant to pair with `if title != False` instead.
    if 'potential_title' in infoModule.info.page and len(infoModule.info.page['potential_title']) > 0:
        infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['potential_title'])
        log.plog('title from potential_title', 2)
    else:
        infoModule.info.page['title'] = real_title2.realTitle()
        if infoModule.info.page['title'] == False:
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log.plog('using html title', 2)
        else:
            log.plog('title from realTitle', 2)

    if infoModule.info.page['title'] == '':
        log.plog('could not find title for page. Setting to HTML Title', 4)
        infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']

    #clear html from title
    infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['title'])
    #also titleCase the title
    #infoModule.info.page['title'] = infoModule.info.page['title'].title()
    log.plog('final title: ' + infoModule.info.page['title'], 2)

    log.plog('======================================= OUTLINE ================================', 2)
    ## fetch outline
    if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
        infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures(infoModule.info.page['rawHTML'])
    else:
        infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['rawHTML'])

    outline = False
    #this toggle allows for ignoring regex in favor of body_extractor
    if infoModule.info.site['skipBodyRegex'] == False:
        # The regex search goes through timeout.TimeoutFunction (second
        # argument 2); a TimeoutFunctionException is treated as "no outline".
        storySearch = timeout.TimeoutFunction(find_story.findStoryViaRegex, 2)
        try:
            outline = storySearch()
            #set html block used for image, author and links to be what outline returns
            if outline != False:
                # Keep the full page for the image search, then narrow
                # rawHTML to the matched story block.
                infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML'];
                infoModule.info.page['rawHTML'] = outline
        except TimeoutFunctionException:
            outline = False
            log.plog("ERROR regex timed out for %s" % infoModule.info.source['story_start_marker'], 5)

    #outline = find_story.findStoryViaRegex()
    if outline != False:
        if infoModule.info.page['promoter'] == '0' and infoModule.info.source['source_id'] != '0' and 'source_format' in infoModule.info.source and len(infoModule.info.source['source_format']) > 0:
            #link scoring only happens on rss feeds
            ## parse links in page only in regex block if we have regex
            log.plog('======================================= LINK SCORING ================================', 2)
            links.linkScoring(outline, 'subs')
            links.linkScoring(outline, 'newsroom')
            # NOTE(review): unclear whether the outbound-link step belongs
            # inside this condition or should run for every story; confirm.
            log.plog('======================================= OUTBOUND LINKS ================================', 2)
            #don't go more than one level deep on blind stories
            links.outboundLinks(outline)

        if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
            infoModule.info.page['outline'] = strip_html.clearHTMLFeatures(outline)
        else:
            infoModule.info.page['outline'] = strip_html.clearHTML(outline)
    else:
        log.plog('searching for body using body extractor', 2)
        outline = body_extractor.extract(infoModule.info.page['plainText'])
        if outline != False:
            infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML'];
            abbreviatedHTML = html_body_extractor.html_body_extractor(infoModule.info.page['rawHTML'], outline)
            if abbreviatedHTML != None:
                infoModule.info.page['rawHTML'] = abbreviatedHTML
            infoModule.info.page['outline'] = outline
        else:
            log.plog('could not create an outline for this story!', 5)
            os._exit(0)
        ## parse links in page - no regex, so look in rawHTML for links
        ## if there are widgetBlockers, first clear them from the html
        linkHTML = infoModule.info.page['rawHTML']
        widgetBlacklistQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".widgetBlacklist", infoModule.info.site['dblink'])
        while True:
            widgetBlacklist = widgetBlacklistQ.fetch_row(1,1)
            if widgetBlacklist == ():
                break
            if isinstance(linkHTML, str) == False:
                log.plog('linkHTML is not string', 5)
                os._exit(0)
            # start_text and end_text are used as regex fragments; only the
            # first match per blacklist row is cut out of linkHTML.
            wblMatch = re.search(widgetBlacklist[0]['start_text'] + '.*?' + widgetBlacklist[0]['end_text'], linkHTML, re.S | re.I)
            if wblMatch != None:
                log.plog("found widget blacklist for " + widgetBlacklist[0]['start_text'] + '.*?' + widgetBlacklist[0]['end_text'], 2)
                linkHTML = linkHTML.replace(wblMatch.group(0), '')
                mysql_tools.mysqlQuery("update " + siteDB + ".widgetBlacklist set hits=hits+1 where widget_id=" + widgetBlacklist[0]['widget_id'], infoModule.info.site['dblink'])

        if infoModule.info.page['promoter'] == '0' and infoModule.info.source['source_id'] != '0' and 'source_format' in infoModule.info.source and len(infoModule.info.source['source_format']) > 0:
            #link scoring only happens on rss feeds
            log.plog('======================================= LINK SCORING ================================', 2)
            links.linkScoring(linkHTML, 'subs')
            links.linkScoring(linkHTML, 'newsroom')
            log.plog('======================================= OUTBOUND LINKS ================================', 2)
            #don't go more than one level deep on blind stories
            links.outboundLinks(linkHTML)

    log.plog('======================================= IMAGES ================================', 2)
    #find images
    if 'image_start_marker' in infoModule.info.source:
        image_start_marker = infoModule.info.source['image_start_marker']
    else:
        image_start_marker = ''
    if 'image_end_marker' in infoModule.info.source:
        image_end_marker = infoModule.info.source['image_end_marker']
    else:
        image_end_marker = ''
    imageArray = find_images.findImages(infoModule.info.page['imageHTML'], image_start_marker, image_end_marker)
    if imageArray == None:
        log.plog('could not find image', 3)
    else:
        # imageArray is indexed as (x, y, url); x * y is stored as maxSize.
        x = imageArray[0]
        y = imageArray[1]
        imageURL = imageArray[2]
        if imageURL == '':
            log.plog('could not find image', 3)
        else:
            log.plog('image found: ' + imageURL, 2)
            infoModule.info.page['largestImage'] = imageURL
            infoModule.info.page['maxSize'] = x * y

    log.plog('======================================= IMAGE CREDIT ================================', 2)
    ## image credit if any
    infoModule.info.page['imageSource'] = ''
    if 'image_source_start_marker' in infoModule.info.source and 'image_source_end_marker' in infoModule.info.source:
        imageSource = find_credit.findCredit(infoModule.info.page['rawHTML'], infoModule.info.source['image_source_start_marker'], infoModule.info.source['image_source_end_marker'])
        if imageSource != False:
            infoModule.info.page['imageSource'] = imageSource

    log.plog('======================================= VIDEOS ================================', 2)
    ###look for videos
    videoLink = find_video.findVideoEmbed(infoModule.info.page['rawHTML'])
    if videoLink == False:
        infoModule.info.page['vlink'] = ''
    else:
        log.plog('found video embed', 2)
        infoModule.info.page['vlink'] = videoLink
    # NOTE(review): unclear whether the thumbnail lookup belongs inside the
    # else above; as laid out it is also called with videoLink == False.
    vthumb = find_video.findVideoThumb(videoLink)
    if vthumb == False:
        infoModule.info.page['vthumb'] = ''
    else:
        log.plog('found video thumb', 2)
        infoModule.info.page['vthumb'] = vthumb

    log.plog('======================================= AUTHOR ================================', 2)
    ##author in story?
    if 'author_start_marker' in infoModule.info.source and 'author_end_marker' in infoModule.info.source:
        author = find_author.findAuthor()
        if author != False:
            author = strip_html.clearHTML(author)
            infoModule.info.page['author'] = author
        else:
            infoModule.info.page['author'] = ''
    else:
        infoModule.info.page['author'] = ''

    log.plog('======================================= ENTITIES ================================', 2)
    #### find entities
    entities.entityFinder(infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'], True)
    # NOTE(review): retries nicknameFinder until it succeeds and swallows
    # every exception; if the call keeps failing this loop never ends.
    nickname = False
    while nickname is False:
        try:
            entities.nicknameFinder(infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'], True)
            nickname = True
        except:
            pass
    ## test cityToTeam
    #cityToTeam.getRelevantEntity()
    entities.setPrimo()

    #### chop outline to 500 chars unless featured
    # '...' is appended even when the outline is shorter than 500 characters.
    if 'featured_source' not in infoModule.info.source or infoModule.info.source['featured_source'] == '0':
        infoModule.info.page['outline'] = infoModule.info.page['outline'][0:500] + '...'

    if len(infoModule.info.entityList) < 1:
        log.plog("no entities found in story!", 5)
        os._exit(0)

    log.plog('======================================= UNKNOWN ENTITIES ================================', 2)
    ## any unknown entities?
    # Strip possessive 's and collapse every run of non-word characters to a
    # single space before looking for new entities.
    entityFixedString = infoModule.info.page['title'] + ' ' + infoModule.info.page['outline']
    entityFixedString = entityFixedString.replace("'s", "")
    entityFixedString = re.sub('\W+', ' ', entityFixedString)
    find_new_entities.find_new_entities(entityFixedString)

    ## page must have at least one non-hidden entity
    invisibleTypesQuery = mysql_tools.mysqlQuery("select mptype_id from db_topics.mptypes where visibility='invisible'", infoModule.info.site['dblink'])
    # Build comma-separated id lists for the two IN (...) clauses below.
    invisibleTypes = ''
    sep = ''
    while True:
        oneType = invisibleTypesQuery.fetch_row(1,1)
        if oneType == ():
            break
        invisibleTypes = invisibleTypes + sep + oneType[0]['mptype_id']
        sep = ','

    sep = ''
    cclist = ''
    for eKey in infoModule.info.entityList.keys():
        cclist = cclist + sep + str(eKey)
        sep = ','

    # NOTE(review): if no invisible types exist the query contains
    # "not in ()" - presumably invalid SQL; confirm against the database.
    sql = "select celeb_id from db_topics.celebs where celeb_id in (" + cclist + ") and mptype_id not in (" + invisibleTypes + ")"
    nonHiddenEntitiesQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
    if nonHiddenEntitiesQ.num_rows() == 0:
        log.plog('no non-hidden entities found in story!', 4)
        os._exit(0)

    newSubId = addStory.addStory()
    if newSubId == False:
        log.plog('addStory failed', 5)
    else:
        log.plog("Story added. New sub_id: " + str(newSubId), 2)
    # NOTE(review): placed at function level (exit after either outcome);
    # confirm it was not meant to sit inside the else above.
    os._exit(0)
def scanPage():
    """Fetch the page at infoModule.info.page['url'], analyse it and add it as a story.

    Reads its inputs from infoModule.info (site, page, source) and writes the
    extracted fields back onto infoModule.info.page: rawHTML, meta_description,
    HTMLTitle, title, plainText, outline, imageHTML, largestImage/maxSize,
    imageSource, vlink, vthumb and author. Finishes by calling
    addStory.addStory().

    Returns False when the URL cannot be opened, the response code is not
    200/301/302/303, the content type is not text/html, or the URL is already
    in the newsroom or subs table.

    Calls os._exit(0), ending the whole process, when: no url is set, the URL
    matches a urlBlocker regex, the URL path is a single short segment, the
    HTML matches an htmlBlacklist regex, the HTML is longer than 500000, no
    outline can be built, no entities (or no non-hidden entities) are found,
    and after the addStory attempt.

    NOTE(review): the nesting below was reconstructed from a listing whose
    indentation had been lost; places where the nesting could not be decided
    from the code alone are marked NOTE(review) and need confirming.
    NOTE(review): an earlier definition of scanPage appears in this listing;
    if both live in one module this later one is the one that runs.
    """
    siteDB = infoModule.info.site['database']

    if 'url' not in infoModule.info.page:
        log.plog('scan page called without url', 4)
        os._exit(0)

    # Reject the URL if any urlBlocker regex matches it. The loop ends when
    # fetch_row returns an empty tuple.
    urlBlockerQ = mysql_tools.mysqlQuery(
        "select * from " + siteDB + ".urlBlocker",
        infoModule.info.site['dblink'])
    while True:
        urlBlocker = urlBlockerQ.fetch_row(1, 1)
        if urlBlocker == ():
            break
        blockTest = re.search(urlBlocker[0]['regex'],
                              infoModule.info.page['url'])
        if blockTest != None:
            log.plog(
                'url ' + infoModule.info.page['url'] + " matches urlBlocker " +
                urlBlocker[0]['regex'], 2)
            os._exit(0)

    log.plog("fetching " + infoModule.info.page['url'], 2)
    # `socket` here is the response object returned by urllib.urlopen.
    try:
        socket = urllib.urlopen(infoModule.info.page['url'])
    except IOError:
        log.plog('could not open ' + infoModule.info.page['url'], 4)
        return False
    responseCode = socket.getcode()
    log.plog('urllib response code: ' + str(responseCode), 2)
    if responseCode != 200 and responseCode != 302 and responseCode != 301 and responseCode != 303:
        log.plog('got failure response code from server', 4)
        return False
    headerInfo = socket.info()
    contentType = headerInfo.gettype()
    if contentType != 'text/html' and contentType != 'text/html, text/html':
        log.plog('content type: ' + contentType + '. not fetching', 4)
        return False

    # put in to account for WSJ -dpg
    # For wsj.com / nytimes.com the HTML comes from the auth helpers rather
    # than from the response opened above.
    if re.search("wsj\.com", infoModule.info.page['url'],
                 re.S | re.M | re.I):
        infoModule.info.page['rawHTML'] = wsjAuthHack(
            infoModule.info.page['url'])
    elif re.search("nytimes\.com", infoModule.info.page['url'],
                   re.S | re.M | re.I):
        infoModule.info.page['rawHTML'] = nytAuthHack(
            infoModule.info.page['url'])
    else:
        infoModule.info.page['rawHTML'] = socket.read()

    redirURL = socket.geturl()
    if redirURL != infoModule.info.page['url']:
        log.plog('redirected to ' + redirURL, 2)
        infoModule.info.page['url'] = redirURL
        #redirected urls need to be blocked too
        urlBlockerQ = mysql_tools.mysqlQuery(
            "select * from " + siteDB + ".urlBlocker",
            infoModule.info.site['dblink'])
        while True:
            urlBlocker = urlBlockerQ.fetch_row(1, 1)
            if urlBlocker == ():
                break
            blockTest = re.search(urlBlocker[0]['regex'],
                                  infoModule.info.page['url'])
            if blockTest != None:
                log.plog(
                    'url ' + infoModule.info.page['url'] +
                    " matches urlBlocker " + urlBlocker[0]['regex'], 2)
                os._exit(0)

    ### and short url needs to be blocked
    #do not read links that have only one string in them
    linkParts = urlparse.urlparse(infoModule.info.page['url'])
    shortPath = re.search('^/\w+/*$', linkParts[2])
    # NOTE(review): lp is assigned but not used anywhere in this function.
    lp = linkParts[2]
    if shortPath != None:
        log.plog(
            "link excluded because it only has a short path of characters: %s"
            % linkParts[2], 2)
        os._exit(0)

    ## anything in htmlBlacklist?
    htmlBlacklistQ = mysql_tools.mysqlQuery(
        "select regex from " + siteDB + ".htmlBlacklist",
        infoModule.info.site['dblink'])
    while True:
        htmlBlacklist = htmlBlacklistQ.fetch_row(1, 1)
        if htmlBlacklist == ():
            break
        badSeedHTML = re.search(htmlBlacklist[0]['regex'],
                                infoModule.info.page['rawHTML'])
        if badSeedHTML != None:
            log.plog(
                'html matches htmlBlocker regex: ' + htmlBlacklist[0]['regex'],
                3)
            os._exit(0)

    ###################################
    #special case for feedburner sources
    #ernst does not like special cases
    ###################################
    # Drops the query string from its '?' onward when it contains utm_source.
    infoModule.info.page['url'] = re.sub('\?.*utm_source.*$', '',
                                         infoModule.info.page['url'])

    #check AGAIN to see if url is already in system
    # NOTE(review): the queries below are built by string concatenation and
    # only single quotes in the URL are escaped; a backslash in the URL is
    # not handled. Consider a parameterised query if mysql_tools offers one.
    escURL = infoModule.info.page['url'].replace("'", "\\'")
    urlCheckQ = mysql_tools.mysqlQuery(
        "select sub_id from " + siteDB + ".newsroom where url='" + escURL +
        "'", infoModule.info.site['dblink'])
    #don't exit, return false so that a new story can be tried
    if urlCheckQ.num_rows() > 0:
        log.plog(
            "scanpage-url already in newsroom: %s" %
            infoModule.info.page['url'], 2)
        log.plog("newsroom_id: " + str(urlCheckQ.fetch_row(1, 1)))
        return False
    urlCheckQ = mysql_tools.mysqlQuery(
        "select sub_id from " + siteDB + ".subs where url='" + escURL + "'",
        infoModule.info.site['dblink'])
    if urlCheckQ.num_rows() > 0:
        log.plog(
            "scanpage-url already in subs: %s" % infoModule.info.page['url'],
            2)
        log.plog("sub_id: " + str(urlCheckQ.fetch_row(1, 1)))
        return False

    ## if source is '0', try to find source
    if infoModule.info.source['source_id'] == '0':
        sourceRegexQ = mysql_tools.mysqlQuery(
            "select * from " + siteDB + ".sources where url_regex != ''",
            infoModule.info.site['dblink'])
        while True:
            sourceRegex = sourceRegexQ.fetch_row(1, 1)
            if sourceRegex == ():
                break
            urlTest = re.search(sourceRegex[0]['url_regex'],
                                infoModule.info.page['url'])
            if urlTest != None:
                log.plog('found source via regex: ' + sourceRegex[0]['title'],
                         2)
                # The first matching row replaces infoModule.info.source.
                infoModule.info.source = sourceRegex[0]
                for i in infoModule.info.source.keys():
                    ## this is sort of hack-y, but stupid python returns None for null
                    if infoModule.info.source[i] == None:
                        infoModule.info.source[i] = ''
                break

    ## maybe check last modified header and don't get stories older than 7 days?
    # The string literal below is disabled code kept as a no-op statement.
    '''possibleAgeInDays = dateGuesser.urlDateGuesser(infoModule.info.page['url'])
    if possibleAgeInDays != None:
        log.plog("age of story might be: " + str(possibleAgeInDays) + " based on " + infoModule.info.page['url'], 2)
        if int(possibleAgeInDays) > 5:
            log.plog("story is " + str(possibleAgeInDays) + " days old. Not reading", 2)
            return False
    '''
    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        os._exit(0)

    #add meta description into the mix
    infoModule.info.page['meta_description'] = ''
    # The capture group keeps the part of the description up to and including
    # its fourth run of whitespace; descriptions with fewer runs do not match.
    meta_search = re.search(
        'meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"',
        infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search != None:
        infoModule.info.page['meta_description'] = meta_search.group(1).decode(
            'utf-8')
        log.plog(
            "meta_description: " + infoModule.info.page['meta_description'], 2)

    log.plog(
        '======================================= TITLE ================================',
        2)
    # get title
    #set HTMLTitle first
    HTMLTitle = re.search('<title>(.*?)<\/title>',
                          infoModule.info.page['rawHTML'], re.S | re.I)
    if HTMLTitle != None:
        infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1)
        log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2)
    else:
        infoModule.info.page['HTMLTitle'] = ""
    title = find_title.findTitle()
    if title != False:
        infoModule.info.page['title'] = title
        log.plog('title from regex', 2)
    # NOTE(review): nesting ambiguous here. As laid out, the if/else below
    # always overwrites the regex title set just above; confirm whether this
    # else was meant to pair with `if title != False` instead.
    if 'potential_title' in infoModule.info.page and len(
            infoModule.info.page['potential_title']) > 0:
        infoModule.info.page['title'] = strip_html.clearHTML(
            infoModule.info.page['potential_title'])
        log.plog('title from potential_title', 2)
    else:
        infoModule.info.page['title'] = real_title2.realTitle()
        if infoModule.info.page['title'] == False:
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log.plog('using html title', 2)
        else:
            log.plog('title from realTitle', 2)

    if infoModule.info.page['title'] == '':
        log.plog('could not find title for page. Setting to HTML Title', 4)
        infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']

    #clear html from title
    infoModule.info.page['title'] = strip_html.clearHTML(
        infoModule.info.page['title'])
    #also titleCase the title
    #infoModule.info.page['title'] = infoModule.info.page['title'].title()
    log.plog('final title: ' + infoModule.info.page['title'], 2)

    log.plog(
        '======================================= OUTLINE ================================',
        2)
    ## fetch outline
    if 'featured_source' in infoModule.info.source and infoModule.info.source[
            'featured_source'] == '1':
        infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures(
            infoModule.info.page['rawHTML'])
    else:
        infoModule.info.page['plainText'] = strip_html.clearHTML(
            infoModule.info.page['rawHTML'])

    outline = False
    #this toggle allows for ignoring regex in favor of body_extractor
    if infoModule.info.site['skipBodyRegex'] == False:
        # The regex search goes through timeout.TimeoutFunction (second
        # argument 2); a TimeoutFunctionException is treated as "no outline".
        storySearch = timeout.TimeoutFunction(find_story.findStoryViaRegex, 2)
        try:
            outline = storySearch()
            #set html block used for image, author and links to be what outline returns
            if outline != False:
                # Keep the full page for the image search, then narrow
                # rawHTML to the matched story block.
                infoModule.info.page['imageHTML'] = infoModule.info.page[
                    'rawHTML']
                infoModule.info.page['rawHTML'] = outline
        except TimeoutFunctionException:
            outline = False
            log.plog(
                "ERROR regex timed out for %s" %
                infoModule.info.source['story_start_marker'], 5)

    #outline = find_story.findStoryViaRegex()
    if outline != False:
        if infoModule.info.page['promoter'] == '0' and infoModule.info.source[
                'source_id'] != '0' and 'source_format' in infoModule.info.source and len(
                    infoModule.info.source['source_format']) > 0:
            #link scoring only happens on rss feeds
            ## parse links in page only in regex block if we have regex
            log.plog(
                '======================================= LINK SCORING ================================',
                2)
            links.linkScoring(outline, 'subs')
            links.linkScoring(outline, 'newsroom')
            # NOTE(review): unclear whether the outbound-link step belongs
            # inside this condition or should run for every story; confirm.
            log.plog(
                '======================================= OUTBOUND LINKS ================================',
                2)
            #don't go more than one level deep on blind stories
            links.outboundLinks(outline)

        if 'featured_source' in infoModule.info.source and infoModule.info.source[
                'featured_source'] == '1':
            infoModule.info.page['outline'] = strip_html.clearHTMLFeatures(
                outline)
        else:
            infoModule.info.page['outline'] = strip_html.clearHTML(outline)
    else:
        log.plog('searching for body using body extractor', 2)
        outline = body_extractor.extract(infoModule.info.page['plainText'])
        if outline != False:
            infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML']
            abbreviatedHTML = html_body_extractor.html_body_extractor(
                infoModule.info.page['rawHTML'], outline)
            if abbreviatedHTML != None:
                infoModule.info.page['rawHTML'] = abbreviatedHTML
            infoModule.info.page['outline'] = outline
        else:
            log.plog('could not create an outline for this story!', 5)
            os._exit(0)
        ## parse links in page - no regex, so look in rawHTML for links
        ## if there are widgetBlockers, first clear them from the html
        linkHTML = infoModule.info.page['rawHTML']
        widgetBlacklistQ = mysql_tools.mysqlQuery(
            "select * from " + siteDB + ".widgetBlacklist",
            infoModule.info.site['dblink'])
        while True:
            widgetBlacklist = widgetBlacklistQ.fetch_row(1, 1)
            if widgetBlacklist == ():
                break
            if isinstance(linkHTML, str) == False:
                log.plog('linkHTML is not string', 5)
                os._exit(0)
            # start_text and end_text are used as regex fragments; only the
            # first match per blacklist row is cut out of linkHTML.
            wblMatch = re.search(
                widgetBlacklist[0]['start_text'] + '.*?' +
                widgetBlacklist[0]['end_text'], linkHTML, re.S | re.I)
            if wblMatch != None:
                log.plog(
                    "found widget blacklist for " +
                    widgetBlacklist[0]['start_text'] + '.*?' +
                    widgetBlacklist[0]['end_text'], 2)
                linkHTML = linkHTML.replace(wblMatch.group(0), '')
                mysql_tools.mysqlQuery(
                    "update " + siteDB +
                    ".widgetBlacklist set hits=hits+1 where widget_id=" +
                    widgetBlacklist[0]['widget_id'],
                    infoModule.info.site['dblink'])

        if infoModule.info.page['promoter'] == '0' and infoModule.info.source[
                'source_id'] != '0' and 'source_format' in infoModule.info.source and len(
                    infoModule.info.source['source_format']) > 0:
            #link scoring only happens on rss feeds
            log.plog(
                '======================================= LINK SCORING ================================',
                2)
            links.linkScoring(linkHTML, 'subs')
            links.linkScoring(linkHTML, 'newsroom')
            log.plog(
                '======================================= OUTBOUND LINKS ================================',
                2)
            #don't go more than one level deep on blind stories
            links.outboundLinks(linkHTML)

    log.plog(
        '======================================= IMAGES ================================',
        2)
    #find images
    if 'image_start_marker' in infoModule.info.source:
        image_start_marker = infoModule.info.source['image_start_marker']
    else:
        image_start_marker = ''
    if 'image_end_marker' in infoModule.info.source:
        image_end_marker = infoModule.info.source['image_end_marker']
    else:
        image_end_marker = ''
    imageArray = find_images.findImages(infoModule.info.page['imageHTML'],
                                        image_start_marker, image_end_marker)
    if imageArray == None:
        log.plog('could not find image', 3)
    else:
        # imageArray is indexed as (x, y, url); x * y is stored as maxSize.
        x = imageArray[0]
        y = imageArray[1]
        imageURL = imageArray[2]
        if imageURL == '':
            log.plog('could not find image', 3)
        else:
            log.plog('image found: ' + imageURL, 2)
            infoModule.info.page['largestImage'] = imageURL
            infoModule.info.page['maxSize'] = x * y

    log.plog(
        '======================================= IMAGE CREDIT ================================',
        2)
    ## image credit if any
    infoModule.info.page['imageSource'] = ''
    if 'image_source_start_marker' in infoModule.info.source and 'image_source_end_marker' in infoModule.info.source:
        imageSource = find_credit.findCredit(
            infoModule.info.page['rawHTML'],
            infoModule.info.source['image_source_start_marker'],
            infoModule.info.source['image_source_end_marker'])
        if imageSource != False:
            infoModule.info.page['imageSource'] = imageSource

    log.plog(
        '======================================= VIDEOS ================================',
        2)
    ###look for videos
    videoLink = find_video.findVideoEmbed(infoModule.info.page['rawHTML'])
    if videoLink == False:
        infoModule.info.page['vlink'] = ''
    else:
        log.plog('found video embed', 2)
        infoModule.info.page['vlink'] = videoLink
    # NOTE(review): unclear whether the thumbnail lookup belongs inside the
    # else above; as laid out it is also called with videoLink == False.
    vthumb = find_video.findVideoThumb(videoLink)
    if vthumb == False:
        infoModule.info.page['vthumb'] = ''
    else:
        log.plog('found video thumb', 2)
        infoModule.info.page['vthumb'] = vthumb

    log.plog(
        '======================================= AUTHOR ================================',
        2)
    ##author in story?
    if 'author_start_marker' in infoModule.info.source and 'author_end_marker' in infoModule.info.source:
        author = find_author.findAuthor()
        if author != False:
            author = strip_html.clearHTML(author)
            infoModule.info.page['author'] = author
        else:
            infoModule.info.page['author'] = ''
    else:
        infoModule.info.page['author'] = ''

    log.plog(
        '======================================= ENTITIES ================================',
        2)
    #### find entities
    entities.entityFinder(
        infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'],
        True)
    # NOTE(review): retries nicknameFinder until it succeeds and swallows
    # every exception; if the call keeps failing this loop never ends.
    nickname = False
    while nickname is False:
        try:
            entities.nicknameFinder(
                infoModule.info.page['title'] + ' ' +
                infoModule.info.page['outline'], True)
            nickname = True
        except:
            pass
    ## test cityToTeam
    #cityToTeam.getRelevantEntity()
    entities.setPrimo()

    #### chop outline to 500 chars unless featured
    # '...' is appended even when the outline is shorter than 500 characters.
    if 'featured_source' not in infoModule.info.source or infoModule.info.source[
            'featured_source'] == '0':
        infoModule.info.page[
            'outline'] = infoModule.info.page['outline'][0:500] + '...'

    if len(infoModule.info.entityList) < 1:
        log.plog("no entities found in story!", 5)
        os._exit(0)

    log.plog(
        '======================================= UNKNOWN ENTITIES ================================',
        2)
    ## any unknown entities?
    # Strip possessive 's and collapse every run of non-word characters to a
    # single space before looking for new entities.
    entityFixedString = infoModule.info.page[
        'title'] + ' ' + infoModule.info.page['outline']
    entityFixedString = entityFixedString.replace("'s", "")
    entityFixedString = re.sub('\W+', ' ', entityFixedString)
    find_new_entities.find_new_entities(entityFixedString)

    ## page must have at least one non-hidden entity
    invisibleTypesQuery = mysql_tools.mysqlQuery(
        "select mptype_id from db_topics.mptypes where visibility='invisible'",
        infoModule.info.site['dblink'])
    # Build comma-separated id lists for the two IN (...) clauses below.
    invisibleTypes = ''
    sep = ''
    while True:
        oneType = invisibleTypesQuery.fetch_row(1, 1)
        if oneType == ():
            break
        invisibleTypes = invisibleTypes + sep + oneType[0]['mptype_id']
        sep = ','

    sep = ''
    cclist = ''
    for eKey in infoModule.info.entityList.keys():
        cclist = cclist + sep + str(eKey)
        sep = ','

    # NOTE(review): if no invisible types exist the query contains
    # "not in ()" - presumably invalid SQL; confirm against the database.
    sql = "select celeb_id from db_topics.celebs where celeb_id in (" + cclist + ") and mptype_id not in (" + invisibleTypes + ")"
    nonHiddenEntitiesQ = mysql_tools.mysqlQuery(sql,
                                                infoModule.info.site['dblink'])
    if nonHiddenEntitiesQ.num_rows() == 0:
        log.plog('no non-hidden entities found in story!', 4)
        os._exit(0)

    newSubId = addStory.addStory()
    if newSubId == False:
        log.plog('addStory failed', 5)
    else:
        log.plog("Story added. New sub_id: " + str(newSubId), 2)
    # NOTE(review): placed at function level (exit after either outcome);
    # confirm it was not meant to sit inside the else above.
    os._exit(0)
log.plog('searching for body using body extractor', 2) outline = body_extractor.extract(infoModule.info.page['plainText']) if outline != False: infoModule.info.page['outline'] = outline else: log.plog('could not create an outline for this story!', 5) sys.exit() ''' #add nickname for testing infoModule.info.page[ 'title'] = 'Elizabeth Smart: Cop Found Me But "Walked Away"' infoModule.info.page[ 'outline'] = '(AP) Elizabeth Smart told jurors Tuesday how a Salt Lake City police detective tried to see behind her veil but backed down when the man accused of kidnapping her said her face was hidden for religious reasons. "I was mad at myself, that I didn\'t say anything," she said on her second day of testimony in the federal trial of Brian David Mitchell. "I felt terrible that the detective hadn\'t pushed harder and had just walked away." Smart, now 23, was 14 when she was taken at knifepoint in June 2002 while sleeping. Nine months later, motorists spotted her walking in a Salt Lake City suburb with Mitchell. Mitchell, 57, faces life in prison if he is convicted of kidnapping and unlawful transportation of a minor across state lines with the intent to engage in criminal sexual activity.' entities.entityFinder( infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'], True) print "entity IDs found:" pprint.pprint(infoModule.info.entityList) entities.nicknameFinder( infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'], True) print "entity IDs found after nicknames:" pprint.pprint(infoModule.info.entityList) entities.setPrimo() print "entity state after setPrimo" pprint.pprint(infoModule.info.entityList)
def _escapeJsonString(value):
    """Escape the characters that would break a double-quoted JSON string."""
    # Backslash must be handled first so the escapes added below are not
    # themselves re-escaped.
    return (value.replace('\\', '\\\\')
            .replace('"', '\\"')
            .replace('\n', '\\n')
            .replace('\r', '\\r')
            .replace('\t', '\\t'))


def _entityToJson(entityId, entityName, celebType, linkPath, entry):
    """Serialize one entity to the compact JSON object the API emits.

    `entry` is the entity's record from infoModule.info.entityList; it must
    contain 'primo' and may contain 'nameUsed' (present when the entity was
    matched through a nickname).
    """
    parts = [
        '{"id":"', _escapeJsonString(entityId),
        '","name":"', _escapeJsonString(entityName),
        '","primo":"', _escapeJsonString(entry['primo']),
        '","type":"', _escapeJsonString(celebType),
        '",',
    ]
    if 'nameUsed' in entry:
        # this means there's a nickname response
        parts.append('"nameUsed":"' + _escapeJsonString(entry['nameUsed']) + '",')
    safeLink = _escapeJsonString(linkPath)
    links = ','.join(
        '{"from":"' + site + '","url":"' + safeLink +
        '","icon":"http://informifi.com/fi_icon.png"}'
        for site in ('celebrifi.com', 'politifi.com'))
    parts.append('"links":[' + links + ']}')
    return ''.join(parts)


def getEntities(searchText, title, jsonOut=True, byID=False):
    """Find the entities mentioned in a story and return them as JSON text.

    Stores `searchText` and `title` on infoModule.info.page, runs the entity
    finder, the nickname finder and setPrimo over "title + ' ' + searchText",
    then serializes infoModule.info.entityList.

    Parameters:
        searchText -- body/outline text to scan.
        title      -- story title; scanned together with searchText.
        jsonOut    -- when true, build the JSON array string.
        byID       -- when true, the "id" field holds str(entity id) instead
                      of the entity's 'lookupUrl'.

    Returns:
        A JSON array string with one object per entity whose celeb_type is
        not 'hidden', or '' when there is nothing to report (no entities,
        only hidden entities, or jsonOut is false).
    """
    infoModule.info.page['outline'] = searchText
    infoModule.info.page['title'] = title
    fullText = title + ' ' + searchText
    entities.entityFinder(fullText, True)
    entities.nicknameFinder(fullText, True, True)
    entities.setPrimo()
    entityList = infoModule.info.entityList

    # NOTE(review): the original only assigned `res` inside the JSON branch,
    # so the jsonOut=False path had no well-defined string result. Returning
    # '' keeps the return type uniform -- confirm no caller relies on None.
    if not jsonOut or not entityList:
        return ''

    # hacky JSON building; python's json module outputs slightly different
    # than what we need
    objects = []
    # Snapshot the keys so the lookups below cannot disturb the iteration.
    for ids in list(entityList):
        entityName = entityLib.entityLibrary(ids, 'entityName')
        celeb_type = entityLib.entityLibrary(ids, 'celeb_type')
        linkPath = entityLib.entityLibrary(ids, 'linkPath')
        entityURL = entityLib.entityLibrary(ids, 'lookupUrl')
        if byID:
            # swap URL for ID
            entityURL = str(ids)
        if celeb_type == 'hidden':
            continue
        objects.append(_entityToJson(str(entityURL), str(entityName),
                                     str(celeb_type), str(linkPath),
                                     entityList[ids]))

    # nothing but hiddens -> empty string rather than '[]'
    if not objects:
        return ''
    return '[' + ','.join(objects) + ']'