def stripsplitTitle(full_title):
    ## Regex for title to split on ##
    ## beautiful soup replaces html entities, so this needs to put them back in
    full_title = full_title.replace('~1.314392653~', '&')
    #full_title = clearCurlies(full_title)
    full_title = clearHTML(full_title)
    print("RTS:Full_title I'm being given : " + full_title)
    sp = re.compile('''\s+(»|\>\>|»|«|\<\<|«|\||||\-\-|::|-|–|/)\s+''', re.I)
    print("RT2:Beginning to split title. Unsplit title: \"" + full_title + "\"", 2)
    blocks = re.split(sp, full_title)
    ## Find longest block of title ##
    #print(blocks)
    #print("length : " + str(len(blocks)))
    if len(blocks) > 1:
        longest = 1
        longestb = ''
        for b in blocks:
            if len(b) > 1 and len(b.split(' ')) > longest:
                longest = len(b.split(' '))
                longestb = b
        print("RT2:longestblock: FOUND! \"" + longestb.strip() + "\"", 2)
        #found title in story, confidence + 1
        infoModule.info.page['confidence'] += 1
        return longestb.strip()
    else:
        print("RT2:title unsplittable, running through contig title and returning: \"" + full_title.strip() + "\"", 2)
        tit = check_rt(full_title.strip(), infoModule.info.page['rawHTML'])
        print("RT2:title coming back from check_rt : " + tit)
        ## logic that splits titles and takes up the H1
        print("RT2: length of tit.split " + str(len(tit.split(' '))))
        print(tit.split(' '))
        print("RT2: length of full_title.strip.split " + str(len(full_title.strip().split(' '))))
        print(full_title.strip())
        print(full_title.strip().split(' '))
        # float division: plain integer division would floor the ratio to 0 under Python 2
        quotient = float(len(tit.split(' '))) / len(full_title.strip().split(' '))
        quotient_limit = 0.50
        print(" QUOTIENT : " + str(quotient))
        if quotient >= quotient_limit:
            #found title in story, confidence + 1
            infoModule.info.page['confidence'] = infoModule.info.page['confidence'] + 1
            return tit
        else:
            #reverting to h1 or title tag, confidence - 1
            infoModule.info.page['confidence'] = infoModule.info.page['confidence'] - 1
            soup = BeautifulSoup(''.join(infoModule.info.page['rawHTML']))
            testStr = soup.h1
            pprint.pprint(testStr)
            # check for None before touching .contents, otherwise a missing h1 raises AttributeError
            if testStr is not None and len(testStr.contents) > 0:
                clearedH1 = clearHTML(str(testStr.contents[0])).strip()
                if clearedH1 != '':
                    print "clearedH1 ~" + clearedH1 + "~"
                    return clearedH1
                else:
                    return full_title
            else:
                #h1 was empty, though it existed
                return full_title
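# NOTE: minimal, self-contained sketch of the longest-block heuristic used by
# stripsplitTitle, with a trimmed-down ASCII-only separator pattern; the sample
# title, the helper name and the simplified regex are illustrative assumptions,
# not part of this module.
import re

TITLE_SEPARATORS = re.compile(r'\s+(>>|<<|\||--|::|-|/)\s+')

def longest_block(full_title):
    """Return the block of a delimited title that carries the most words."""
    blocks = re.split(TITLE_SEPARATORS, full_title)
    if len(blocks) <= 1:
        return full_title.strip()
    longest, longest_block_text = 1, ''
    for b in blocks:
        words = len(b.split(' '))
        if len(b) > 1 and words > longest:
            longest, longest_block_text = words, b
    return longest_block_text.strip()

if __name__ == '__main__':
    print(longest_block("Quarterback signs four-year deal - Example Sports News"))
    # -> "Quarterback signs four-year deal"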
def bodyExtract(url):
    global be_results
    infoModule.info.site['body_extractor_no_date'] = True
    infoModule.info.page['rawHTML'] = fetchPage(url)
    htmlTitle()
    infoModule.info.page['title'] = real_title2.realTitle()
    print infoModule.info.page['title']
    #sys.exit()
    meta_search = re.search('meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"', infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search != None:
        infoModule.info.page['meta_description'] = meta_search.group(1).decode('utf-8')
        print "meta_description: " + infoModule.info.page['meta_description']
    infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['rawHTML'])
    be_results = body_extractor.extract(infoModule.info.page['plainText'], doAsciiConvert=False)
    if be_results != None:
        print be_results
    return be_results
    row = cr.fetch_row(1, 1)
    if row == ():
        break
    print(row)
    url = row[0]['url']
    infoModule.info.page['url'] = url
    infoModule.info.page['title'] = 'whatevs'
    log.plog('testing clearHTML', 1)
    print 'Attempting URL: ' + url + "\r\n"
    urlSocket = urllib.urlopen(url)
    html = urlSocket.read()
    if len(sys.argv) > 1 and sys.argv[1] == 'features':
        html = strip_html.clearHTMLFeatures(html)
    else:
        html = strip_html.clearHTML(html)
    find_new_entities.find_new_entities(html)
    print '========================================================='

# blockedWords test
while (1):
    url = 'http://news.ycombinator.com/item?id=2092273'
    infoModule.info.page['url'] = url
    infoModule.info.page['title'] = 'whatevs'
    print 'Attempting URL: ' + url + "\r\n"
    urlSocket = urllib.urlopen(url)
    html = urlSocket.read()
    html = strip_html.clearHTML(html)
    print html
    find_new_entities.find_new_entities(html)
def check_rt(title, html):
    print("<br><br>title I'm getting before clearHTML : " + title + "<br><br><br>")
    #lines = clearCurlies(clearHTML(html))
    try:
        lines = clearHTML(html).decode('utf-8')
    except:
        lines = clearHTML(html)
    title = clearHTML(title)
    lines = re.sub("\n\n+", '\n', lines)
    lines = re.split("\n+", lines)
    print("<br><br>title I'm being passed after clearHTML : " + title + "<br><br>")
    matches = []
    title_array = re.split("\s+", title)
    for k in range(len(lines)):
        print title
        print str(k) + " " + lines[k]
        # capture each phrase
        phrase = ''
        # start with the title pointer at zero for each line
        title_pointer = 0
        # title incrementor
        title_inc = 0
        # start with word_match_count at zero for each line
        word_match_count = 0
        # while title pointer hasn't reached the end of the title, continue
        while (title_pointer < len(title_array)):
            # this begins at each line
            # words to match begins at nothing for each line
            words_to_match = ''
            # while the title buffer is not at the end of the title
            while (title_inc < len(title_array)):
                # words to match should add each word with a space at the end
                words_to_match += title_array[title_inc] + " "
                # chopping off the last space for testing
                match_this = words_to_match[:-1]
                try:
                    match_this = match_this.decode('utf-8')
                except:
                    pass
                # try and match this cluster of words in the line
                #print("line : " + str(k) + ", matching : " + match_this + ", title_pointer : " + str(title_pointer) + ", title_inc : " + str(title_inc) + ", title_pntr_word: " + title_array[title_pointer] + ", title_inc_word : " + title_array[title_inc])
                #handle some encoding fall-through
                try:
                    cluster_match = lines[k].find(match_this)
                except:
                    cluster_match = -1
                if cluster_match >= 0:
                    #print("Found")
                    # if this matched cluster is larger than the last, then we replace total count
                    # add one for a match
                    if (word_match_count < (title_inc - title_pointer + 1)):
                        word_match_count = (title_inc - title_pointer + 1)
                        phrase = match_this
                    # great, it matched, now let's try adding a word and matching that
                    title_inc += 1
                else:
                    break
            #increment pointer once title_inc has made it through, or failed.
            if (title_inc == len(title_array)):
                break
            # This starts the next search from the word after the pointer was just starting at
            title_pointer += 1
            title_inc = title_pointer
        # append results to our list [line number, match count]
        matches.append((k, word_match_count, phrase))
    matches = sorted(matches, key=itemgetter(1, 0), reverse=True)
    # returns the highest match title
    return matches[0][2]
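# NOTE: a compact, self-contained illustration of what check_rt does -- scan
# each plain-text line for the longest run of consecutive title words that
# appears verbatim in it. The helper name and sample strings are assumptions
# for demonstration only, not part of this module.
def longest_title_run(title, lines):
    """Return the longest contiguous slice of title words found in any line."""
    words = title.split()
    best = ''
    for line in lines:
        for start in range(len(words)):
            for end in range(start + 1, len(words) + 1):
                candidate = ' '.join(words[start:end])
                if candidate in line and len(candidate.split()) > len(best.split()):
                    best = candidate
    return best

if __name__ == '__main__':
    body = ["Posted by staff", "The mayor opens a new bridge downtown on Friday"]
    print(longest_title_run("Mayor opens a new bridge downtown | City News", body))
    # -> "opens a new bridge downtown"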
def fetchStory(url):
    siteDB = 'peepbuzz'
    infoModule.info.page['url'] = url
    log.plog("fetching " + url, 2)
    request_obj = urllib2.Request(url)
    request_obj.add_header('Referer', 'http://www.google.com/')
    request_obj.add_header('User-agent', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)')
    try:
        websock = urllib2.urlopen(request_obj)
    except IOError:
        log.plog('could not open ' + url, 4)
        return failOn('could not open ' + url)
    responseCode = websock.getcode()
    headerInfo = websock.info()
    pprint.pprint(headerInfo)
    log.plog('urllib2 response code: ' + str(responseCode), 2)
    if responseCode != 200 and responseCode != 302 and responseCode != 301 and responseCode != 303:
        log.plog('got failure response code from server', 4)
        return failOn('got failure response code from server')
    contentType = headerInfo.gettype()
    if contentType != 'text/html' and contentType != 'text/html, text/html':
        log.plog('content type: ' + contentType + '. not fetching', 4)
        return failOn('content type: ' + contentType + '. not fetching')
    readWithTimeout = timeout.TimeoutFunction(websock.read, 5)
    #infoModule.info.page['rawHTML'] = websock.read()
    try:
        infoModule.info.page['rawHTML'] = readWithTimeout()
    except timeout.TimeoutFunctionException:
        log.plog("timeout while trying to fetch " + url, 101)
        return failOn('read timeout ' + url)
    redirURL = websock.geturl()
    if redirURL != url:
        log.plog('redirected to ' + redirURL, 2)
        url = redirURL
        #redirected urls need to be blocked too
    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        return failOn('article length exceeds 500k, probably not html')
    windows_trouble_list = [u'\x93', u'\x92', u'\x91', u'\x96', u'\x94']
    cd = chardet.detect(infoModule.info.page['rawHTML'])
    if cd['encoding'] != 'ascii':
        log.plog('Server encoding: ' + cd['encoding'], 2)
        oldHTML = infoModule.info.page['rawHTML']
        infoModule.info.page['rawHTML'] = infoModule.info.page['rawHTML'].decode(cd['encoding'])
        windows_chars_in_html = [trouble for trouble in windows_trouble_list if infoModule.info.page['rawHTML'].find(trouble) >= 0]
        if len(windows_chars_in_html) > 0:
            #windows = infoModule.info.page['rawHTML'].find(u'\x93')
            log.plog('this is actually windows-1252', 3)
            infoModule.info.page['rawHTML'] = oldHTML.decode('windows-1252')
    # some configuration options
    infoModule.info.page['meta_description'] = ''
    meta_search = re.search('meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"', infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search != None:
        infoModule.info.page['meta_description'] = meta_search.group(1)
        log.plog("meta_description: " + infoModule.info.page['meta_description'], 2)
    log.plog('======================================= TITLE ================================', 2)
    # get title
    #set HTMLTitle first
    HTMLTitle = re.search('<title>(.*?)<\/title>', infoModule.info.page['rawHTML'], re.S | re.I)
    if HTMLTitle != None:
        infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1)
        log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2)
    else:
        infoModule.info.page['HTMLTitle'] = ""
    title = find_title.findTitle()
    if title != False:
        infoModule.info.page['title'] = title
        log.plog('title from regex', 2)
    if 'potential_title' in infoModule.info.page and len(infoModule.info.page['potential_title']) > 0:
        infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['potential_title'])
        log.plog('title from potential_title', 2)
    else:
        infoModule.info.page['title'] = real_title2.realTitle()
        if infoModule.info.page['title'] == False:
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log.plog('using html title', 2)
        else:
            log.plog('title from realTitle', 2)
    if infoModule.info.page['title'] == '':
        log.plog('could not find title for page. Setting to HTML Title', 4)
        infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
    #clear html from title
    infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['title'])
    #also titleCase the title
    #infoModule.info.page['title'] = infoModule.info.page['title'].title()
    log.plog('final title: ' + infoModule.info.page['title'], 2)
    #cd = chardet.detect(infoModule.info.page['title'])
    #if cd['encoding'] != 'ascii':
    #    log.plog('title encoding: ' + cd['encoding'], 2)
    #    oldTitle = infoModule.info.page['title']
    #    infoModule.info.page['title'] = infoModule.info.page['title'].decode(cd['encoding'])
    #    windows_chars_in_html = [trouble for trouble in windows_trouble_list if infoModule.info.page['title'].find(trouble) >= 0]
    #    if len(windows_chars_in_html) > 0:
    #        #windows = infoModule.info.page['rawHTML'].find(u'\x93')
    #        log.plog('title is actually windows-1252', 3)
    #        infoModule.info.page['title'] = oldTitle.decode('windows-1252')
    log.plog('======================================= OUTLINE ================================', 2)
    ## fetch outline
    #remove special case elements from the html. These are lines or blocks of code that cause
    #problems if left in
    infoModule.info.page['plainText'] = strip_html.removeSpecialCases(infoModule.info.page['rawHTML'])
    infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['plainText'])
    #clearHTML can't take out title, because title gets passed to clearHTML, but it should be removed here
    infoModule.info.page['plainText'] = re.sub('<title.*?</title.*?>', '', infoModule.info.page['plainText'], 0, re.I | re.S | re.M)
    outline = False
    #this toggle allows for ignoring regex in favor of body_extractor
    log.plog('searching for body using body extractor', 2)
    infoModule.info.site['body_extractor_no_date'] = True
    outline = body_extractor.extract(infoModule.info.page['plainText'], doAsciiConvert=False)
    infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML']
    if outline != None:
        abbreviatedHTML = html_body_extractor.html_body_extractor(infoModule.info.page['rawHTML'], outline)
        if abbreviatedHTML != None:
            infoModule.info.page['rawHTML'] = abbreviatedHTML
        infoModule.info.page['outline'] = outline
        #use largestBlock to strip leading dom elements off that seem extraneous
        infoModule.info.page['outline'] = largestBlock.removePreceedingBlocks(infoModule.info.page['imageHTML'], infoModule.info.page['outline'])
    else:
        log.plog('could not create an outline for this story!', 5)
        infoModule.info.page['outline'] = ''
        #return failOn('could not create an outline for this story!')
    # outline must be at least minOutlineLen
    minOutlineLen = 255
    if len(infoModule.info.page['outline']) > 0 and len(infoModule.info.page['outline']) < minOutlineLen:
        log.plog('outline too short, assuming failure', 3)
        infoModule.info.page['outline'] = ''
    log.plog('======================================= IMAGES ================================', 2)
    #find images
    image_start_marker = ''
    image_end_marker = ''
    imageArray = find_all_images.findImages(infoModule.info.page['imageHTML'], url)
    if imageArray == None:
        log.plog('could not find image', 3)
        imageArray = ''
    log.plog('======================================= VIDEOS ================================', 2)
    ###look for videos
    allVideosJSON = find_all_videos.find_all_videos(infoModule.info.page['imageHTML'])
    allVideos = json.loads(allVideosJSON)
    if len(allVideos) > 0:
        log.plog('found video embed', 2)
        print allVideosJSON
    #if no outline and no images over x by y and no videos, then no story
    if infoModule.info.page['outline'] == '' and (imageArray == '' or imageArray == []) and allVideos == []:
        failOn('nothing found')
    #largest image if no outline must be at least 450 x 450 to make it an image page
    largestImageDimensions = 0
    largestImage = []
    for image in imageArray:
        if image['width'] * image['height'] > largestImageDimensions:
            largestImage = image
            largestImageDimensions = image['width'] * image['height']
    print largestImage
    minImageSize = 400
    if infoModule.info.page['outline'] == '' and allVideos == [] and (largestImage == [] or largestImage['width'] < minImageSize or largestImage['height'] < minImageSize):
        return(failOn('no story or video found, and largest image less than min size'))
    status = 'OK'
    storyObj = {}
    storyObj['title'] = infoModule.info.page['title']
    storyObj['outline'] = unicodeMapper.clearCurlies(infoModule.info.page['outline'])
    storyObj['url'] = url
    storyObj['images'] = imageArray
    storyObj['videos'] = allVideos
    returnVal = {"status": status, "story": storyObj}
    output = json.dumps(returnVal)
    return output
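# NOTE: the encoding handling in fetchStory (trust chardet's guess first, then
# fall back to windows-1252 when the decoded text still carries cp1252
# smart-quote code points) isolated into a small sketch. This is not the
# module's actual API; it only assumes the chardet package is available.
import chardet

WINDOWS_TROUBLE = [u'\x93', u'\x92', u'\x91', u'\x96', u'\x94']

def decode_html(raw):
    """Decode raw page bytes, preferring chardet but falling back to cp1252."""
    guess = chardet.detect(raw)['encoding'] or 'ascii'
    if guess == 'ascii':
        return raw
    decoded = raw.decode(guess)
    if any(ch in decoded for ch in WINDOWS_TROUBLE):
        # the smart-quote range survived the decode, so the page is really cp1252
        decoded = raw.decode('windows-1252')
    return decoded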
def scanPage():
    siteDB = infoModule.info.site['database']
    if 'url' not in infoModule.info.page:
        log.plog('scan page called without url', 4)
        os._exit(0)
    urlBlockerQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".urlBlocker", infoModule.info.site['dblink'])
    while True:
        urlBlocker = urlBlockerQ.fetch_row(1, 1)
        if urlBlocker == ():
            break
        blockTest = re.search(urlBlocker[0]['regex'], infoModule.info.page['url'])
        if blockTest != None:
            log.plog('url ' + infoModule.info.page['url'] + " matches urlBlocker " + urlBlocker[0]['regex'], 2)
            os._exit(0)
    log.plog("fetching " + infoModule.info.page['url'], 2)
    try:
        socket = urllib.urlopen(infoModule.info.page['url'])
    except IOError:
        log.plog('could not open ' + infoModule.info.page['url'], 4)
        return False
    responseCode = socket.getcode()
    log.plog('urllib response code: ' + str(responseCode), 2)
    if responseCode != 200 and responseCode != 302 and responseCode != 301 and responseCode != 303:
        log.plog('got failure response code from server', 4)
        return False
    headerInfo = socket.info()
    contentType = headerInfo.gettype()
    if contentType != 'text/html' and contentType != 'text/html, text/html':
        log.plog('content type: ' + contentType + '. not fetching', 4)
        return False
    # put in to account for WSJ -dpg
    if re.search("wsj\.com", infoModule.info.page['url'], re.S | re.M | re.I):
        infoModule.info.page['rawHTML'] = wsjAuthHack(infoModule.info.page['url'])
    elif re.search("nytimes\.com", infoModule.info.page['url'], re.S | re.M | re.I):
        infoModule.info.page['rawHTML'] = nytAuthHack(infoModule.info.page['url'])
    else:
        infoModule.info.page['rawHTML'] = socket.read()
    redirURL = socket.geturl()
    if redirURL != infoModule.info.page['url']:
        log.plog('redirected to ' + redirURL, 2)
        infoModule.info.page['url'] = redirURL
        #redirected urls need to be blocked too
        urlBlockerQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".urlBlocker", infoModule.info.site['dblink'])
        while True:
            urlBlocker = urlBlockerQ.fetch_row(1, 1)
            if urlBlocker == ():
                break
            blockTest = re.search(urlBlocker[0]['regex'], infoModule.info.page['url'])
            if blockTest != None:
                log.plog('url ' + infoModule.info.page['url'] + " matches urlBlocker " + urlBlocker[0]['regex'], 2)
                os._exit(0)
    ### and short url needs to be blocked
    #do not read links that have only one string in them
    linkParts = urlparse.urlparse(infoModule.info.page['url'])
    shortPath = re.search('^/\w+/*$', linkParts[2])
    lp = linkParts[2]
    if shortPath != None:
        log.plog("link excluded because it only has a short path of characters: %s" % linkParts[2], 2)
        os._exit(0)
    ## anything in htmlBlacklist?
    htmlBlacklistQ = mysql_tools.mysqlQuery("select regex from " + siteDB + ".htmlBlacklist", infoModule.info.site['dblink'])
    while True:
        htmlBlacklist = htmlBlacklistQ.fetch_row(1, 1)
        if htmlBlacklist == ():
            break
        badSeedHTML = re.search(htmlBlacklist[0]['regex'], infoModule.info.page['rawHTML'])
        if badSeedHTML != None:
            log.plog('html matches htmlBlocker regex: ' + htmlBlacklist[0]['regex'], 3)
            os._exit(0)
    ###################################
    #special case for feedburner sources
    #ernst does not like special cases
    ###################################
    infoModule.info.page['url'] = re.sub('\?.*utm_source.*$', '', infoModule.info.page['url'])
    #check AGAIN to see if url is already in system
    escURL = infoModule.info.page['url'].replace("'", "\\'")
    urlCheckQ = mysql_tools.mysqlQuery("select sub_id from " + siteDB + ".newsroom where url='" + escURL + "'", infoModule.info.site['dblink'])
    #don't exit, return false so that a new story can be tried
    if urlCheckQ.num_rows() > 0:
        log.plog("scanpage-url already in newsroom: %s" % infoModule.info.page['url'], 2)
        log.plog("newsroom_id: " + str(urlCheckQ.fetch_row(1, 1)))
        return False
    urlCheckQ = mysql_tools.mysqlQuery("select sub_id from " + siteDB + ".subs where url='" + escURL + "'", infoModule.info.site['dblink'])
    if urlCheckQ.num_rows() > 0:
        log.plog("scanpage-url already in subs: %s" % infoModule.info.page['url'], 2)
        log.plog("sub_id: " + str(urlCheckQ.fetch_row(1, 1)))
        return False
    ## if source is '0', try to find source
    if infoModule.info.source['source_id'] == '0':
        sourceRegexQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".sources where url_regex != ''", infoModule.info.site['dblink'])
        while True:
            sourceRegex = sourceRegexQ.fetch_row(1, 1)
            if sourceRegex == ():
                break
            urlTest = re.search(sourceRegex[0]['url_regex'], infoModule.info.page['url'])
            if urlTest != None:
                log.plog('found source via regex: ' + sourceRegex[0]['title'], 2)
                infoModule.info.source = sourceRegex[0]
                for i in infoModule.info.source.keys():
                    ## this is sort of hack-y, but stupid python returns None for null
                    if infoModule.info.source[i] == None:
                        infoModule.info.source[i] = ''
                break
    ## maybe check last modified header and don't get stories older than 7 days?
    '''possibleAgeInDays = dateGuesser.urlDateGuesser(infoModule.info.page['url'])
    if possibleAgeInDays != None:
        log.plog("age of story might be: " + str(possibleAgeInDays) + " based on " + infoModule.info.page['url'], 2)
        if int(possibleAgeInDays) > 5:
            log.plog("story is " + str(possibleAgeInDays) + " days old. Not reading", 2)
            return False
    '''
    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        os._exit(0)
    #add meta description into the mix
    infoModule.info.page['meta_description'] = ''
    meta_search = re.search('meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"', infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search != None:
        infoModule.info.page['meta_description'] = meta_search.group(1).decode('utf-8')
        log.plog("meta_description: " + infoModule.info.page['meta_description'], 2)
    log.plog('======================================= TITLE ================================', 2)
    # get title
    #set HTMLTitle first
    HTMLTitle = re.search('<title>(.*?)<\/title>', infoModule.info.page['rawHTML'], re.S | re.I)
    if HTMLTitle != None:
        infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1)
        log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2)
    else:
        infoModule.info.page['HTMLTitle'] = ""
    title = find_title.findTitle()
    if title != False:
        infoModule.info.page['title'] = title
        log.plog('title from regex', 2)
    if 'potential_title' in infoModule.info.page and len(infoModule.info.page['potential_title']) > 0:
        infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['potential_title'])
        log.plog('title from potential_title', 2)
    else:
        infoModule.info.page['title'] = real_title2.realTitle()
        if infoModule.info.page['title'] == False:
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log.plog('using html title', 2)
        else:
            log.plog('title from realTitle', 2)
    if infoModule.info.page['title'] == '':
        log.plog('could not find title for page. Setting to HTML Title', 4)
        infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
    #clear html from title
    infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['title'])
    #also titleCase the title
    #infoModule.info.page['title'] = infoModule.info.page['title'].title()
    log.plog('final title: ' + infoModule.info.page['title'], 2)
    log.plog('======================================= OUTLINE ================================', 2)
    ## fetch outline
    if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
        infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures(infoModule.info.page['rawHTML'])
    else:
        infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['rawHTML'])
    outline = False
    #this toggle allows for ignoring regex in favor of body_extractor
    if infoModule.info.site['skipBodyRegex'] == False:
        storySearch = timeout.TimeoutFunction(find_story.findStoryViaRegex, 2)
        try:
            outline = storySearch()
            #set html block used for image, author and links to be what outline returns
            if outline != False:
                infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML']
                infoModule.info.page['rawHTML'] = outline
        except TimeoutFunctionException:
            outline = False
            log.plog("ERROR regex timed out for %s" % infoModule.info.source['story_start_marker'], 5)
    #outline = find_story.findStoryViaRegex()
    if outline != False:
        if infoModule.info.page['promoter'] == '0' and infoModule.info.source['source_id'] != '0' and 'source_format' in infoModule.info.source and len(infoModule.info.source['source_format']) > 0:
            #link scoring only happens on rss feeds
            ## parse links in page only in regex block if we have regex
            log.plog('======================================= LINK SCORING ================================', 2)
            links.linkScoring(outline, 'subs')
            links.linkScoring(outline, 'newsroom')
            log.plog('======================================= OUTBOUND LINKS ================================', 2)
            #don't go more than one level deep on blind stories
            links.outboundLinks(outline)
        if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
            infoModule.info.page['outline'] = strip_html.clearHTMLFeatures(outline)
        else:
            infoModule.info.page['outline'] = strip_html.clearHTML(outline)
    else:
        log.plog('searching for body using body extractor', 2)
        outline = body_extractor.extract(infoModule.info.page['plainText'])
        if outline != False:
            infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML']
            abbreviatedHTML = html_body_extractor.html_body_extractor(infoModule.info.page['rawHTML'], outline)
            if abbreviatedHTML != None:
                infoModule.info.page['rawHTML'] = abbreviatedHTML
            infoModule.info.page['outline'] = outline
        else:
            log.plog('could not create an outline for this story!', 5)
            os._exit(0)
        ## parse links in page - no regex, so look in rawHTML for links
        ## if there are widgetBlockers, first clear them from the html
        linkHTML = infoModule.info.page['rawHTML']
        widgetBlacklistQ = mysql_tools.mysqlQuery("select * from " + siteDB + ".widgetBlacklist", infoModule.info.site['dblink'])
        while True:
            widgetBlacklist = widgetBlacklistQ.fetch_row(1, 1)
            if widgetBlacklist == ():
                break
            if isinstance(linkHTML, str) == False:
                log.plog('linkHTML is not string', 5)
                os._exit(0)
            wblMatch = re.search(widgetBlacklist[0]['start_text'] + '.*?' + widgetBlacklist[0]['end_text'], linkHTML, re.S | re.I)
            if wblMatch != None:
                log.plog("found widget blacklist for " + widgetBlacklist[0]['start_text'] + '.*?' + widgetBlacklist[0]['end_text'], 2)
                linkHTML = linkHTML.replace(wblMatch.group(0), '')
                mysql_tools.mysqlQuery("update " + siteDB + ".widgetBlacklist set hits=hits+1 where widget_id=" + widgetBlacklist[0]['widget_id'], infoModule.info.site['dblink'])
        if infoModule.info.page['promoter'] == '0' and infoModule.info.source['source_id'] != '0' and 'source_format' in infoModule.info.source and len(infoModule.info.source['source_format']) > 0:
            #link scoring only happens on rss feeds
            log.plog('======================================= LINK SCORING ================================', 2)
            links.linkScoring(linkHTML, 'subs')
            links.linkScoring(linkHTML, 'newsroom')
            log.plog('======================================= OUTBOUND LINKS ================================', 2)
            #don't go more than one level deep on blind stories
            links.outboundLinks(linkHTML)
    log.plog('======================================= IMAGES ================================', 2)
    #find images
    if 'image_start_marker' in infoModule.info.source:
        image_start_marker = infoModule.info.source['image_start_marker']
    else:
        image_start_marker = ''
    if 'image_end_marker' in infoModule.info.source:
        image_end_marker = infoModule.info.source['image_end_marker']
    else:
        image_end_marker = ''
    imageArray = find_images.findImages(infoModule.info.page['imageHTML'], image_start_marker, image_end_marker)
    if imageArray == None:
        log.plog('could not find image', 3)
    else:
        x = imageArray[0]
        y = imageArray[1]
        imageURL = imageArray[2]
        if imageURL == '':
            log.plog('could not find image', 3)
        else:
            log.plog('image found: ' + imageURL, 2)
            infoModule.info.page['largestImage'] = imageURL
            infoModule.info.page['maxSize'] = x * y
    log.plog('======================================= IMAGE CREDIT ================================', 2)
    ## image credit if any
    infoModule.info.page['imageSource'] = ''
    if 'image_source_start_marker' in infoModule.info.source and 'image_source_end_marker' in infoModule.info.source:
        imageSource = find_credit.findCredit(infoModule.info.page['rawHTML'], infoModule.info.source['image_source_start_marker'], infoModule.info.source['image_source_end_marker'])
        if imageSource != False:
            infoModule.info.page['imageSource'] = imageSource
    log.plog('======================================= VIDEOS ================================', 2)
    ###look for videos
    videoLink = find_video.findVideoEmbed(infoModule.info.page['rawHTML'])
    if videoLink == False:
        infoModule.info.page['vlink'] = ''
    else:
        log.plog('found video embed', 2)
        infoModule.info.page['vlink'] = videoLink
        vthumb = find_video.findVideoThumb(videoLink)
        if vthumb == False:
            infoModule.info.page['vthumb'] = ''
        else:
            log.plog('found video thumb', 2)
            infoModule.info.page['vthumb'] = vthumb
    log.plog('======================================= AUTHOR ================================', 2)
    ##author in story?
    if 'author_start_marker' in infoModule.info.source and 'author_end_marker' in infoModule.info.source:
        author = find_author.findAuthor()
        if author != False:
            author = strip_html.clearHTML(author)
            infoModule.info.page['author'] = author
        else:
            infoModule.info.page['author'] = ''
    else:
        infoModule.info.page['author'] = ''
    log.plog('======================================= ENTITIES ================================', 2)
    #### find entities
    entities.entityFinder(infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'], True)
    nickname = False
    while nickname is False:
        try:
            entities.nicknameFinder(infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'], True)
            nickname = True
        except:
            pass
    ## test cityToTeam
    #cityToTeam.getRelevantEntity()
    entities.setPrimo()
    #### chop outline to 500 chars unless featured
    if 'featured_source' not in infoModule.info.source or infoModule.info.source['featured_source'] == '0':
        infoModule.info.page['outline'] = infoModule.info.page['outline'][0:500] + '...'
    if len(infoModule.info.entityList) < 1:
        log.plog("no entities found in story!", 5)
        os._exit(0)
    log.plog('======================================= UNKNOWN ENTITIES ================================', 2)
    ## any unknown entities?
    entityFixedString = infoModule.info.page['title'] + ' ' + infoModule.info.page['outline']
    entityFixedString = entityFixedString.replace("'s", "")
    entityFixedString = re.sub('\W+', ' ', entityFixedString)
    find_new_entities.find_new_entities(entityFixedString)
    ## page must have at least one non-hidden entity
    invisibleTypesQuery = mysql_tools.mysqlQuery("select mptype_id from db_topics.mptypes where visibility='invisible'", infoModule.info.site['dblink'])
    invisibleTypes = ''
    sep = ''
    while True:
        oneType = invisibleTypesQuery.fetch_row(1, 1)
        if oneType == ():
            break
        invisibleTypes = invisibleTypes + sep + oneType[0]['mptype_id']
        sep = ','
    sep = ''
    cclist = ''
    for eKey in infoModule.info.entityList.keys():
        cclist = cclist + sep + str(eKey)
        sep = ','
    sql = "select celeb_id from db_topics.celebs where celeb_id in (" + cclist + ") and mptype_id not in (" + invisibleTypes + ")"
    nonHiddenEntitiesQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
    if nonHiddenEntitiesQ.num_rows() == 0:
        log.plog('no non-hidden entities found in story!', 4)
        os._exit(0)
    newSubId = addStory.addStory()
    if newSubId == False:
        log.plog('addStory failed', 5)
    else:
        log.plog("Story added. New sub_id: " + str(newSubId), 2)
    os._exit(0)
def scanPage(step):
    if 'url' not in infoModule.info.page:
        log.plog('scan page called without url', 4)
        sys.exit()
    log.plog("fetching " + infoModule.info.page['url'], 2)
    socket = urllib.urlopen(infoModule.info.page['url'])
    infoModule.info.page['rawHTML'] = socket.read()
    redirURL = socket.geturl()
    if redirURL != infoModule.info.page['url']:
        log.plog('redirected to ' + redirURL, 2)
        infoModule.info.page['url'] = redirURL
    ## maybe check last modified header and don't get stories older than 7 days?
    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        sys.exit()
    print infoModule.info.page['url']
    ## fetch outline
    if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
        infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures(infoModule.info.page['rawHTML'])
    else:
        infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['rawHTML'])
    hit = False
    outline = False
    originalStep = step
    while hit == False:
        #pick out most popular regex
        sql = "select count(*) as common, story_start_marker, story_end_marker from " + siteDB + ".sources where story_start_marker != '' group by story_start_marker order by count(*) desc limit %d,1" % step
        regexQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
        if regexQ == False:
            break
        if regexQ.num_rows() == 0:
            break
        regex = regexQ.fetch_row(1, 1)
        infoModule.info.source['story_start_marker'] = regex[0]['story_start_marker']
        infoModule.info.source['story_end_marker'] = regex[0]['story_end_marker']
        infoModule.info.source['story_end_marker'] = infoModule.info.source['story_end_marker'].replace('\/', '/')
        infoModule.info.source['story_start_marker'] = infoModule.info.source['story_start_marker'].replace('\/', '/')
        storySearch = timeout.TimeoutFunction(find_story.findStoryViaRegex, 2)
        try:
            outline = storySearch()
        except:
            outline = False
        if outline != False:
            hit = True
        step += 1
    if outline != False:
        startMarker = infoModule.info.source['story_start_marker'].replace('&lt;', '<')
        endMarker = infoModule.info.source['story_end_marker'].replace('&lt;', '<')
        if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
            infoModule.info.page['outline'] = strip_html.clearHTMLFeatures(outline)
        else:
            infoModule.info.page['outline'] = strip_html.clearHTML(outline)
        infoModule.info.page['outline'] = infoModule.info.page['outline'].decode('utf-8')
        infoModule.info.page['outline'] = infoModule.info.page['outline'].encode('ascii', 'xmlcharrefreplace')
        print str(step)
        print startMarker
        print endMarker
        print infoModule.info.page['outline']
    else:
        print "no match"
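# NOTE: illustrative sketch only -- the step loop above pulls its start/end
# markers from MySQL in popularity order and treats them as regex fragments;
# this database-free helper shows the same "try marker pairs until one encloses
# a block" idea but treats the markers as literal strings. All names and the
# sample HTML are assumptions.
import re

def first_enclosed_block(html, marker_pairs):
    """Return the text between the first (start, end) marker pair that matches."""
    for start, end in marker_pairs:
        match = re.search(re.escape(start) + '(.*?)' + re.escape(end), html, re.S)
        if match:
            return match.group(1)
    return None

if __name__ == '__main__':
    pairs = [('<div id="story">', '</div>'), ('<article>', '</article>')]
    print(first_enclosed_block('<article>Body text here.</article>', pairs))
    # -> 'Body text here.'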
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log('using html title')
        else:
            log('title from realTitle')
        if infoModule.info.page['title'] == '':
            log('could not find title for page. Setting to HTML Title', 4)
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
        #clear html from title
    except Exception, e:
        log("Couldn't pass real_title2.realTitle()")
        log("going to strip_html")
    try:
        infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['title'])
    except:
        log("couldn't pass strip_html for title")
        sys.exit(1)
    log('final title: ' + infoModule.info.page['title'])
    title = real_title2.realTitle()
    infoModule.info.page['title'] = title
    text_page = strip_html.clearHTML(full_page)
    body_text = extract(text_page)
    if body_text != None:
        body_text = body_text[0:450] + "..."
    h = HTMLParser.HTMLParser()
    try:
        be_text = h.unescape(body_text)
    except:
        be_text = body_text
def scanPage():
    if 'url' not in infoModule.info.page:
        log.plog('scan page called without url', 4)
        sys.exit()
    log.plog("fetching " + infoModule.info.page['url'], 2)
    socket = urllib.urlopen(infoModule.info.page['url'])
    infoModule.info.page['rawHTML'] = socket.read()
    redirURL = socket.geturl()
    if redirURL != infoModule.info.page['url']:
        log.plog('redirected to ' + redirURL, 2)
        infoModule.info.page['url'] = redirURL
    ## maybe check last modified header and don't get stories older than 7 days?
    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        sys.exit()
    print "<b>URL</b> <a href=\"" + infoModule.info.page['url'] + "\">" + infoModule.info.page['url'] + "</a><br />"
    # get title
    #title = find_title.findTitle()
    #if title != False:
    #    infoModule.info.page['title'] = title
    #elif 'potential_title' in infoModule.info.page and len(infoModule.info.page['potential_title']) > 0:
    #    infoModule.info.page['title'] = infoModule.info.page['potential_title']
    #else:
    #    log.plog('no title found!', 3)
    #    sys.exit()
    ##print infoModule.info.page['title']
    #find images
    #if 'image_start_marker' in infoModule.info.source:
    #    image_start_marker = infoModule.info.source['image_start_marker']
    #else:
    #    image_start_marker = ''
    #if 'image_end_marker' in infoModule.info.source:
    #    image_end_marker = infoModule.info.source['image_end_marker']
    #else:
    #    image_end_marker = ''
    #imageArray = find_images.findImages(infoModule.info.page['rawHTML'], image_start_marker, image_end_marker)
    #x = imageArray[0]
    #y = imageArray[1]
    #imageURL = imageArray[2]
    #if imageURL == '':
    #    log.plog('could not find image', 3)
    #else:
    #    log.plog('image found: ' + imageURL, 2)
    #    infoModule.info.page['largestImage'] = imageURL
    #    infoModule.info.page['maxSize'] = x * y
    ## image credit if any
    infoModule.info.page['imageSource'] = ''
    if 'image_source_start_marker' in infoModule.info.source and 'image_source_end_marker' in infoModule.info.source:
        imageSource = find_credit.findCredit(infoModule.info.page['rawHTML'], infoModule.info.source['image_source_start_marker'], infoModule.info.source['image_source_end_marker'])
        if imageSource != False and imageSource != None:
            infoModule.info.page['imageSource'] = imageSource
            print "<b>Image Credit:</b> " + imageSource + "<br />"
    ###look for videos
    #videoHunter = find_video.youtube()
    #videoLink = videoHunter.getURL(infoModule.info.page['rawHTML'])
    #if videoLink == False:
    #    log.plog('no video found', 2)
    #    infoModule.info.page['vlink'] = ''
    #else:
    #    log.plog('found video embed', 2)
    #    infoModule.info.page['vlink'] = videoLink
    ## parse links in page
    #links.linkScoring(infoModule.info.page['rawHTML'], 'subs')
    #links.linkScoring(infoModule.info.page['rawHTML'], 'newsroom')
    #links.outboundLinks(infoModule.info.page['rawHTML'])
    ##author in story?
    if 'author_start_marker' in infoModule.info.source and 'author_end_marker' in infoModule.info.source:
        author = find_author.findAuthor()
        if author != False:
            author = author.replace('&lt;', '<')
            infoModule.info.page['author'] = author
            print "<b>Author:</b> " + author + "<br />"
        else:
            infoModule.info.page['author'] = ''
    else:
        infoModule.info.page['author'] = ''
    ## fetch outline
    if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
        infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures(infoModule.info.page['rawHTML'])
    else:
        infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['rawHTML'])
    outline = find_story.findStoryViaRegex()
    if outline != False:
        if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
            infoModule.info.page['outline'] = strip_html.clearHTMLFeatures(outline)
        else:
            infoModule.info.page['outline'] = strip_html.clearHTML(outline)
        print "<b>Outline:</b> " + infoModule.info.page['outline'] + "<br />"
    print "<hr>"
def scanPage(): siteDB = infoModule.info.site['database'] if 'url' not in infoModule.info.page: log.plog('scan page called without url', 4) os._exit(0) urlBlockerQ = mysql_tools.mysqlQuery( "select * from " + siteDB + ".urlBlocker", infoModule.info.site['dblink']) while True: urlBlocker = urlBlockerQ.fetch_row(1, 1) if urlBlocker == (): break blockTest = re.search(urlBlocker[0]['regex'], infoModule.info.page['url']) if blockTest != None: log.plog( 'url ' + infoModule.info.page['url'] + " matches urlBlocker " + urlBlocker[0]['regex'], 2) os._exit(0) log.plog("fetching " + infoModule.info.page['url'], 2) try: socket = urllib.urlopen(infoModule.info.page['url']) except IOError: log.plog('could not open ' + infoModule.info.page['url'], 4) return False responseCode = socket.getcode() log.plog('urllib response code: ' + str(responseCode), 2) if responseCode != 200 and responseCode != 302 and responseCode != 301 and responseCode != 303: log.plog('got failure response code from server', 4) return False headerInfo = socket.info() contentType = headerInfo.gettype() if contentType != 'text/html' and contentType != 'text/html, text/html': log.plog('content type: ' + contentType + '. not fetching', 4) return False # put in to account for WSJ -dpg if re.search("wsj\.com", infoModule.info.page['url'], re.S | re.M | re.I): infoModule.info.page['rawHTML'] = wsjAuthHack( infoModule.info.page['url']) elif re.search("nytimes\.com", infoModule.info.page['url'], re.S | re.M | re.I): infoModule.info.page['rawHTML'] = nytAuthHack( infoModule.info.page['url']) else: infoModule.info.page['rawHTML'] = socket.read() redirURL = socket.geturl() if redirURL != infoModule.info.page['url']: log.plog('redirected to ' + redirURL, 2) infoModule.info.page['url'] = redirURL #redirected urls need to be blocked too urlBlockerQ = mysql_tools.mysqlQuery( "select * from " + siteDB + ".urlBlocker", infoModule.info.site['dblink']) while True: urlBlocker = urlBlockerQ.fetch_row(1, 1) if urlBlocker == (): break blockTest = re.search(urlBlocker[0]['regex'], infoModule.info.page['url']) if blockTest != None: log.plog( 'url ' + infoModule.info.page['url'] + " matches urlBlocker " + urlBlocker[0]['regex'], 2) os._exit(0) ### and short url needs to be blocked #do not read links that have only one string in them linkParts = urlparse.urlparse(infoModule.info.page['url']) shortPath = re.search('^/\w+/*$', linkParts[2]) lp = linkParts[2] if shortPath != None: log.plog( "link excluded because it only has a short path of characters: %s" % linkParts[2], 2) os._exit(0) ## anything in htmlBlacklist? 
htmlBlacklistQ = mysql_tools.mysqlQuery( "select regex from " + siteDB + ".htmlBlacklist", infoModule.info.site['dblink']) while True: htmlBlacklist = htmlBlacklistQ.fetch_row(1, 1) if htmlBlacklist == (): break badSeedHTML = re.search(htmlBlacklist[0]['regex'], infoModule.info.page['rawHTML']) if badSeedHTML != None: log.plog( 'html matches htmlBlocker regex: ' + htmlBlacklist[0]['regex'], 3) os._exit(0) ################################### #special case for feedburner sources #ernst does not like special cases ################################### infoModule.info.page['url'] = re.sub('\?.*utm_source.*$', '', infoModule.info.page['url']) #check AGAIN to see if url is already in system escURL = infoModule.info.page['url'].replace("'", "\\'") urlCheckQ = mysql_tools.mysqlQuery( "select sub_id from " + siteDB + ".newsroom where url='" + escURL + "'", infoModule.info.site['dblink']) #don't exit, return false so that a new story can be tried if urlCheckQ.num_rows() > 0: log.plog( "scanpage-url already in newsroom: %s" % infoModule.info.page['url'], 2) log.plog("newsroom_id: " + str(urlCheckQ.fetch_row(1, 1))) return False urlCheckQ = mysql_tools.mysqlQuery( "select sub_id from " + siteDB + ".subs where url='" + escURL + "'", infoModule.info.site['dblink']) if urlCheckQ.num_rows() > 0: log.plog( "scanpage-url already in subs: %s" % infoModule.info.page['url'], 2) log.plog("sub_id: " + str(urlCheckQ.fetch_row(1, 1))) return False ## if source is '0', try to find source if infoModule.info.source['source_id'] == '0': sourceRegexQ = mysql_tools.mysqlQuery( "select * from " + siteDB + ".sources where url_regex != ''", infoModule.info.site['dblink']) while True: sourceRegex = sourceRegexQ.fetch_row(1, 1) if sourceRegex == (): break urlTest = re.search(sourceRegex[0]['url_regex'], infoModule.info.page['url']) if urlTest != None: log.plog('found source via regex: ' + sourceRegex[0]['title'], 2) infoModule.info.source = sourceRegex[0] for i in infoModule.info.source.keys(): ## this is sort of hack-y, but stupid python returns None for null if infoModule.info.source[i] == None: infoModule.info.source[i] = '' break ## maybe check last modified header and don't get stories older than 7 days? '''possibleAgeInDays = dateGuesser.urlDateGuesser(infoModule.info.page['url']) if possibleAgeInDays != None: log.plog("age of story might be: " + str(possibleAgeInDays) + " based on " + infoModule.info.page['url'], 2) if int(possibleAgeInDays) > 5: log.plog("story is " + str(possibleAgeInDays) + " days old. 
Not reading", 2) return False ''' if len(infoModule.info.page['rawHTML']) > 500000: log.plog("article length exceeds 500k, probably not html", 2) os._exit(0) #add meta description into the mix infoModule.info.page['meta_description'] = '' meta_search = re.search( 'meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"', infoModule.info.page['rawHTML'], re.I | re.S) if meta_search != None: infoModule.info.page['meta_description'] = meta_search.group(1).decode( 'utf-8') log.plog( "meta_description: " + infoModule.info.page['meta_description'], 2) log.plog( '======================================= TITLE ================================', 2) # get title #set HTMLTitle first HTMLTitle = re.search('<title>(.*?)<\/title>', infoModule.info.page['rawHTML'], re.S | re.I) if HTMLTitle != None: infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1) log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2) else: infoModule.info.page['HTMLTitle'] = "" title = find_title.findTitle() if title != False: infoModule.info.page['title'] = title log.plog('title from regex', 2) if 'potential_title' in infoModule.info.page and len( infoModule.info.page['potential_title']) > 0: infoModule.info.page['title'] = strip_html.clearHTML( infoModule.info.page['potential_title']) log.plog('title from potential_title', 2) else: infoModule.info.page['title'] = real_title2.realTitle() if infoModule.info.page['title'] == False: infoModule.info.page['title'] = infoModule.info.page['HTMLTitle'] log.plog('using html title', 2) else: log.plog('title from realTitle', 2) if infoModule.info.page['title'] == '': log.plog('could not find title for page. Setting to HTML Title', 4) infoModule.info.page['title'] = infoModule.info.page['HTMLTitle'] #clear html from title infoModule.info.page['title'] = strip_html.clearHTML( infoModule.info.page['title']) #also titleCase the title #infoModule.info.page['title'] = infoModule.info.page['title'].title() log.plog('final title: ' + infoModule.info.page['title'], 2) log.plog( '======================================= OUTLINE ================================', 2) ## fetch outline if 'featured_source' in infoModule.info.source and infoModule.info.source[ 'featured_source'] == '1': infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures( infoModule.info.page['rawHTML']) else: infoModule.info.page['plainText'] = strip_html.clearHTML( infoModule.info.page['rawHTML']) outline = False #this toggle allows for ignoring regex in favor of body_extractor if infoModule.info.site['skipBodyRegex'] == False: storySearch = timeout.TimeoutFunction(find_story.findStoryViaRegex, 2) try: outline = storySearch() #set html block used for imaage, author and links to be what outline returns if outline != False: infoModule.info.page['imageHTML'] = infoModule.info.page[ 'rawHTML'] infoModule.info.page['rawHTML'] = outline except TimeoutFunctionException: outline = False log.plog( "ERROR regex timed out for %s" % infoModule.info.source['story_start_marker'], 5) #outline = find_story.findStoryViaRegex() if outline != False: if infoModule.info.page['promoter'] == '0' and infoModule.info.source[ 'source_id'] != '0' and 'source_format' in infoModule.info.source and len( infoModule.info.source['source_format']) > 0: #link scoring only happens on rss feeds ## parse links in page only in regex block if we have regex log.plog( '======================================= LINK SCORING ================================', 2) links.linkScoring(outline, 'subs') links.linkScoring(outline, 'newsroom') log.plog( 
'======================================= OUTBOUND LINKS ================================', 2) #don't go more than one level deep on blind stories links.outboundLinks(outline) if 'featured_source' in infoModule.info.source and infoModule.info.source[ 'featured_source'] == '1': infoModule.info.page['outline'] = strip_html.clearHTMLFeatures( outline) else: infoModule.info.page['outline'] = strip_html.clearHTML(outline) else: log.plog('searching for body using body extractor', 2) outline = body_extractor.extract(infoModule.info.page['plainText']) if outline != False: infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML'] abbreviatedHTML = html_body_extractor.html_body_extractor( infoModule.info.page['rawHTML'], outline) if abbreviatedHTML != None: infoModule.info.page['rawHTML'] = abbreviatedHTML infoModule.info.page['outline'] = outline else: log.plog('could not create an outline for this story!', 5) os._exit(0) ## parse links in page - no regex, so look in rawHTML for links ## if there are widgetBlockers, first clear them from the html linkHTML = infoModule.info.page['rawHTML'] widgetBlacklistQ = mysql_tools.mysqlQuery( "select * from " + siteDB + ".widgetBlacklist", infoModule.info.site['dblink']) while True: widgetBlacklist = widgetBlacklistQ.fetch_row(1, 1) if widgetBlacklist == (): break if isinstance(linkHTML, str) == False: log.plog('linkHTML is not string', 5) os._exit(0) wblMatch = re.search( widgetBlacklist[0]['start_text'] + '.*?' + widgetBlacklist[0]['end_text'], linkHTML, re.S | re.I) if wblMatch != None: log.plog( "found widget blacklist for " + widgetBlacklist[0]['start_text'] + '.*?' + widgetBlacklist[0]['end_text'], 2) linkHTML = linkHTML.replace(wblMatch.group(0), '') mysql_tools.mysqlQuery( "update " + siteDB + ".widgetBlacklist set hits=hits+1 where widget_id=" + widgetBlacklist[0]['widget_id'], infoModule.info.site['dblink']) if infoModule.info.page['promoter'] == '0' and infoModule.info.source[ 'source_id'] != '0' and 'source_format' in infoModule.info.source and len( infoModule.info.source['source_format']) > 0: #link scoring only happens on rss feeds log.plog( '======================================= LINK SCORING ================================', 2) links.linkScoring(linkHTML, 'subs') links.linkScoring(linkHTML, 'newsroom') log.plog( '======================================= OUTBOUND LINKS ================================', 2) #don't go more than one level deep on blind stories links.outboundLinks(linkHTML) log.plog( '======================================= IMAGES ================================', 2) #find images if 'image_start_marker' in infoModule.info.source: image_start_marker = infoModule.info.source['image_start_marker'] else: image_start_marker = '' if 'image_end_marker' in infoModule.info.source: image_end_marker = infoModule.info.source['image_end_marker'] else: image_end_marker = '' imageArray = find_images.findImages(infoModule.info.page['imageHTML'], image_start_marker, image_end_marker) if imageArray == None: log.plog('could not find image', 3) else: x = imageArray[0] y = imageArray[1] imageURL = imageArray[2] if imageURL == '': log.plog('could not find image', 3) else: log.plog('image found: ' + imageURL, 2) infoModule.info.page['largestImage'] = imageURL infoModule.info.page['maxSize'] = x * y log.plog( '======================================= IMAGE CREDIT ================================', 2) ## image credit if any infoModule.info.page['imageSource'] = '' if 'image_source_start_marker' in infoModule.info.source and 
'image_source_end_marker' in infoModule.info.source: imageSource = find_credit.findCredit( infoModule.info.page['rawHTML'], infoModule.info.source['image_source_start_marker'], infoModule.info.source['image_source_end_marker']) if imageSource != False: infoModule.info.page['imageSource'] = imageSource log.plog( '======================================= VIDEOS ================================', 2) ###look for videos videoLink = find_video.findVideoEmbed(infoModule.info.page['rawHTML']) if videoLink == False: infoModule.info.page['vlink'] = '' else: log.plog('found video embed', 2) infoModule.info.page['vlink'] = videoLink vthumb = find_video.findVideoThumb(videoLink) if vthumb == False: infoModule.info.page['vthumb'] = '' else: log.plog('found video thumb', 2) infoModule.info.page['vthumb'] = vthumb log.plog( '======================================= AUTHOR ================================', 2) ##author in story? if 'author_start_marker' in infoModule.info.source and 'author_end_marker' in infoModule.info.source: author = find_author.findAuthor() if author != False: author = strip_html.clearHTML(author) infoModule.info.page['author'] = author else: infoModule.info.page['author'] = '' else: infoModule.info.page['author'] = '' log.plog( '======================================= ENTITIES ================================', 2) #### find entities entities.entityFinder( infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'], True) nickname = False while nickname is False: try: entities.nicknameFinder( infoModule.info.page['title'] + ' ' + infoModule.info.page['outline'], True) nickname = True except: pass ## test cityToTeam #cityToTeam.getRelevantEntity() entities.setPrimo() #### chop outline to 500 chars unless featured if 'featured_source' not in infoModule.info.source or infoModule.info.source[ 'featured_source'] == '0': infoModule.info.page[ 'outline'] = infoModule.info.page['outline'][0:500] + '...' if len(infoModule.info.entityList) < 1: log.plog("no entities found in story!", 5) os._exit(0) log.plog( '======================================= UNKNOWN ENTITIES ================================', 2) ## any unknown entities? entityFixedString = infoModule.info.page[ 'title'] + ' ' + infoModule.info.page['outline'] entityFixedString = entityFixedString.replace("'s", "") entityFixedString = re.sub('\W+', ' ', entityFixedString) find_new_entities.find_new_entities(entityFixedString) ## page must have at least one non-hidden entity invisibleTypesQuery = mysql_tools.mysqlQuery( "select mptype_id from db_topics.mptypes where visibility='invisible'", infoModule.info.site['dblink']) invisibleTypes = '' sep = '' while True: oneType = invisibleTypesQuery.fetch_row(1, 1) if oneType == (): break invisibleTypes = invisibleTypes + sep + oneType[0]['mptype_id'] sep = ',' sep = '' cclist = '' for eKey in infoModule.info.entityList.keys(): cclist = cclist + sep + str(eKey) sep = ',' sql = "select celeb_id from db_topics.celebs where celeb_id in (" + cclist + ") and mptype_id not in (" + invisibleTypes + ")" nonHiddenEntitiesQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink']) if nonHiddenEntitiesQ.num_rows() == 0: log.plog('no non-hidden entities found in story!', 4) os._exit(0) newSubId = addStory.addStory() if newSubId == False: log.plog('addStory failed', 5) else: log.plog("Story added. New sub_id: " + str(newSubId), 2) os._exit(0)
def scanPage(step):
    if 'url' not in infoModule.info.page:
        log.plog('scan page called without url', 4)
        sys.exit()
    log.plog("fetching " + infoModule.info.page['url'], 2)
    socket = urllib.urlopen(infoModule.info.page['url'])
    infoModule.info.page['rawHTML'] = socket.read()
    redirURL = socket.geturl()
    if redirURL != infoModule.info.page['url']:
        log.plog('redirected to ' + redirURL, 2)
        infoModule.info.page['url'] = redirURL
    ## maybe check last modified header and don't get stories older than 7 days?
    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        sys.exit()
    print infoModule.info.page['url']

    ## fetch outline
    if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
        infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures(infoModule.info.page['rawHTML'])
    else:
        infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['rawHTML'])

    hit = False
    outline = False
    originalStep = step
    while hit == False:
        #pick out most popular regex
        sql = "select count(*) as common, story_start_marker, story_end_marker from " + siteDB + ".sources where story_start_marker != '' group by story_start_marker order by count(*) desc limit %d,1" % step
        regexQ = mysql_tools.mysqlQuery(sql, infoModule.info.site['dblink'])
        if regexQ == False:
            break
        if regexQ.num_rows() == 0:
            break
        regex = regexQ.fetch_row(1, 1)
        infoModule.info.source['story_start_marker'] = regex[0]['story_start_marker']
        infoModule.info.source['story_end_marker'] = regex[0]['story_end_marker']
        infoModule.info.source['story_end_marker'] = infoModule.info.source['story_end_marker'].replace('\/', '/')
        infoModule.info.source['story_start_marker'] = infoModule.info.source['story_start_marker'].replace('\/', '/')
        storySearch = timeout.TimeoutFunction(find_story.findStoryViaRegex, 2)
        try:
            outline = storySearch()
        except:
            outline = False
        if outline != False:
            hit = True
        step += 1

    if outline != False:
        # unescape '&lt;' in the stored markers so the printed markers are readable
        startMarker = infoModule.info.source['story_start_marker'].replace('&lt;', '<')
        endMarker = infoModule.info.source['story_end_marker'].replace('&lt;', '<')
        if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
            infoModule.info.page['outline'] = strip_html.clearHTMLFeatures(outline)
        else:
            infoModule.info.page['outline'] = strip_html.clearHTML(outline)
        infoModule.info.page['outline'] = infoModule.info.page['outline'].decode('utf-8')
        infoModule.info.page['outline'] = infoModule.info.page['outline'].encode('ascii', 'xmlcharrefreplace')
        print str(step)
        print startMarker
        print endMarker
        print infoModule.info.page['outline']
    else:
        print "no match"
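# Hedged usage sketch (not part of the original file): scanPage() reads the target URL from
# infoModule.info.page['url'] and takes a starting offset into the popularity-ranked list of
# story_start_marker / story_end_marker pairs; it keeps bumping the offset itself until a pair
# matches or the sources table is exhausted, so a driver only needs to seed the URL and the
# first step. The command-line handling here is an assumption for illustration.
if __name__ == '__main__' and len(sys.argv) > 1:
    infoModule.info.page['url'] = sys.argv[1]
    scanPage(0)  # start from the most common marker pair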
    row = cr.fetch_row(1, 1)
    if row == ():
        break
    print(row)
    url = row[0]['url']
    infoModule.info.page['url'] = url
    infoModule.info.page['title'] = 'whatevs'
    log.plog('testing clearHTML', 1)
    print 'Attempting URL: ' + url + "\r\n"
    urlSocket = urllib.urlopen(url)
    html = urlSocket.read()
    if len(sys.argv) > 1 and sys.argv[1] == 'features':
        html = strip_html.clearHTMLFeatures(html)
    else:
        html = strip_html.clearHTML(html)
    find_new_entities.find_new_entities(html)
    print '========================================================='

# blockedWords test
while (1):
    url = 'http://news.ycombinator.com/item?id=2092273'
    infoModule.info.page['url'] = url
    infoModule.info.page['title'] = 'whatevs'
    print 'Attempting URL: ' + url + "\r\n"
    urlSocket = urllib.urlopen(url)
    html = urlSocket.read()
    html = strip_html.clearHTML(html)
    print html
    find_new_entities.find_new_entities(html)
        return False
    retval = websock.read()
    return retval


if __name__ == '__main__':
    if len(sys.argv) > 1:
        url = sys.argv[1]
        infoModule.info.site['body_extractor_no_date'] = True
        infoModule.info.page['rawHTML'] = fetchPage(url)
        htmlTitle()
        infoModule.info.page['title'] = real_title2.realTitle()
        print infoModule.info.page['title']
        #sys.exit()
        infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['rawHTML'])
        infoModule.info.site['body_extractor_no_date'] = True
        infoModule.info.page['meta_description'] = ''
        meta_search = re.search('meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"', infoModule.info.page['rawHTML'], re.I | re.S)
        if meta_search != None:
            infoModule.info.page['meta_description'] = meta_search.group(1)
            print "meta_description: " + infoModule.info.page['meta_description']
        outline = extract(infoModule.info.page['plainText'], doAsciiConvert=False)
        outline = unicodeMapper.clearCurlies(outline)
        print outline
    else:
        unittest.main()
def fetchStory(url):
    siteDB = 'peepbuzz'
    infoModule.info.page['url'] = url
    log.plog("fetching " + url, 2)
    request_obj = urllib2.Request(url)
    request_obj.add_header('Referer', 'http://www.google.com/')
    request_obj.add_header('User-agent', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)')
    try:
        websock = urllib2.urlopen(request_obj)
    except IOError:
        log.plog('could not open ' + url, 4)
        return failOn('could not open ' + url)

    responseCode = websock.getcode()
    headerInfo = websock.info()
    pprint.pprint(headerInfo)
    log.plog('urllib2 response code: ' + str(responseCode), 2)
    if responseCode != 200 and responseCode != 302 and responseCode != 301 and responseCode != 303:
        log.plog('got failure response code from server', 4)
        return failOn('got failure response code from server')

    contentType = headerInfo.gettype()
    if contentType != 'text/html' and contentType != 'text/html, text/html':
        log.plog('content type: ' + contentType + '. not fetching', 4)
        return failOn('content type: ' + contentType + '. not fetching')

    readWithTimeout = timeout.TimeoutFunction(websock.read, 5)
    #infoModule.info.page['rawHTML'] = websock.read()
    try:
        infoModule.info.page['rawHTML'] = readWithTimeout()
    except timeout.TimeoutFunctionException:
        log.plog("timeout while trying to fetch " + url, 101)
        return failOn('read timeout ' + url)

    redirURL = websock.geturl()
    if redirURL != url:
        log.plog('redirected to ' + redirURL, 2)
        url = redirURL
        #redirected urls need to be blocked too

    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        return failOn('article length exceeds 500k, probably not html')

    windows_trouble_list = [u'\x93', u'\x92', u'\x91', u'\x96', u'\x94']
    cd = chardet.detect(infoModule.info.page['rawHTML'])
    if cd['encoding'] != 'ascii':
        log.plog('Server encoding: ' + cd['encoding'], 2)
        oldHTML = infoModule.info.page['rawHTML']
        infoModule.info.page['rawHTML'] = infoModule.info.page['rawHTML'].decode(cd['encoding'])
        windows_chars_in_html = [trouble for trouble in windows_trouble_list if infoModule.info.page['rawHTML'].find(trouble) >= 0]
        if len(windows_chars_in_html) > 0:
            #windows = infoModule.info.page['rawHTML'].find(u'\x93')
            log.plog('this is actually windows-1252', 3)
            infoModule.info.page['rawHTML'] = oldHTML.decode('windows-1252')

    # some configuration options
    infoModule.info.page['meta_description'] = ''
    meta_search = re.search('meta name="description" content="(.*?\s+.*?\s+.*?\s+.*?\s+).*?"', infoModule.info.page['rawHTML'], re.I | re.S)
    if meta_search != None:
        infoModule.info.page['meta_description'] = meta_search.group(1)
        log.plog("meta_description: " + infoModule.info.page['meta_description'], 2)

    log.plog('======================================= TITLE ================================', 2)
    # get title
    #set HTMLTitle first
    HTMLTitle = re.search('<title>(.*?)<\/title>', infoModule.info.page['rawHTML'], re.S | re.I)
    if HTMLTitle != None:
        infoModule.info.page['HTMLTitle'] = HTMLTitle.group(1)
        log.plog('html title found: ' + infoModule.info.page['HTMLTitle'], 2)
    else:
        infoModule.info.page['HTMLTitle'] = ""

    title = find_title.findTitle()
    if title != False:
        infoModule.info.page['title'] = title
        log.plog('title from regex', 2)
    if 'potential_title' in infoModule.info.page and len(infoModule.info.page['potential_title']) > 0:
        infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['potential_title'])
        log.plog('title from potential_title', 2)
    else:
        infoModule.info.page['title'] = real_title2.realTitle()
        if infoModule.info.page['title'] == False:
            infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']
            log.plog('using html title', 2)
        else:
            log.plog('title from realTitle', 2)

    if infoModule.info.page['title'] == '':
        log.plog('could not find title for page. Setting to HTML Title', 4)
        infoModule.info.page['title'] = infoModule.info.page['HTMLTitle']

    #clear html from title
    infoModule.info.page['title'] = strip_html.clearHTML(infoModule.info.page['title'])
    #also titleCase the title
    #infoModule.info.page['title'] = infoModule.info.page['title'].title()
    log.plog('final title: ' + infoModule.info.page['title'], 2)
    #cd = chardet.detect(infoModule.info.page['title'])
    #if cd['encoding'] != 'ascii':
    #    log.plog('title encoding: ' + cd['encoding'], 2)
    #    oldTitle = infoModule.info.page['title']
    #    infoModule.info.page['title'] = infoModule.info.page['title'].decode(cd['encoding'])
    #    windows_chars_in_html = [trouble for trouble in windows_trouble_list if infoModule.info.page['title'].find(trouble) >= 0]
    #    if len(windows_chars_in_html) > 0:
    #        #windows = infoModule.info.page['rawHTML'].find(u'\x93')
    #        log.plog('title is actually windows-1252', 3)
    #        infoModule.info.page['title'] = oldTitle.decode('windows-1252')

    log.plog('======================================= OUTLINE ================================', 2)
    ## fetch outline
    #remove special case elements from the html. These are lines or blocks of code that cause
    #problems if left in
    infoModule.info.page['plainText'] = strip_html.removeSpecialCases(infoModule.info.page['rawHTML'])
    infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['plainText'])
    #clearHTML can't take out title, because title gets passed to clearHTML, but it should be removed here
    infoModule.info.page['plainText'] = re.sub('<title.*?</title.*?>', '', infoModule.info.page['plainText'], 0, re.I | re.S | re.M)
    outline = False
    #this toggle allows for ignoring regex in favor of body_extractor
    log.plog('searching for body using body extractor', 2)
    infoModule.info.site['body_extractor_no_date'] = True
    outline = body_extractor.extract(infoModule.info.page['plainText'], doAsciiConvert=False)
    infoModule.info.page['imageHTML'] = infoModule.info.page['rawHTML']
    if outline != None:
        abbreviatedHTML = html_body_extractor.html_body_extractor(infoModule.info.page['rawHTML'], outline)
        if abbreviatedHTML != None:
            infoModule.info.page['rawHTML'] = abbreviatedHTML
        infoModule.info.page['outline'] = outline
        #use largestBlock to strip leading dom elements off that seem extraneous
        infoModule.info.page['outline'] = largestBlock.removePreceedingBlocks(infoModule.info.page['imageHTML'], infoModule.info.page['outline'])
    else:
        log.plog('could not create an outline for this story!', 5)
        infoModule.info.page['outline'] = ''
        #return failOn('could not create an outline for this story!')

    # outline must be at least minOutlineLen
    minOutlineLen = 255
    if len(infoModule.info.page['outline']) > 0 and len(infoModule.info.page['outline']) < minOutlineLen:
        log.plog('outline too short, assuming failure', 3)
        infoModule.info.page['outline'] = ''

    log.plog('======================================= IMAGES ================================', 2)
    #find images
    image_start_marker = ''
    image_end_marker = ''
    imageArray = find_all_images.findImages(infoModule.info.page['imageHTML'], url)
    if imageArray == None:
        log.plog('could not find image', 3)
        imageArray = ''

    log.plog('======================================= VIDEOS ================================', 2)
    ###look for videos
    allVideosJSON = find_all_videos.find_all_videos(infoModule.info.page['imageHTML'])
    allVideos = json.loads(allVideosJSON)
    if len(allVideos) > 0:
        log.plog('found video embed', 2)
        print allVideosJSON

    #if no outline and no images over x by y and no videos, then no story
    if infoModule.info.page['outline'] == '' and (imageArray == '' or imageArray == []) and allVideos == []:
        return failOn('nothing found')

    #largest image if no outline must be at least minImageSize x minImageSize to make it an image page
    largestImageDimensions = 0
    largestImage = []
    for image in imageArray:
        if image['width'] * image['height'] > largestImageDimensions:
            largestImage = image
            largestImageDimensions = image['width'] * image['height']
    print largestImage
    minImageSize = 400
    if infoModule.info.page['outline'] == '' and allVideos == [] and (largestImage == [] or largestImage['width'] < minImageSize or largestImage['height'] < minImageSize):
        return failOn('no story or video found, and largest image less than min size')

    status = 'OK'
    storyObj = {}
    storyObj['title'] = infoModule.info.page['title']
    storyObj['outline'] = unicodeMapper.clearCurlies(infoModule.info.page['outline'])
    storyObj['url'] = url
    storyObj['images'] = imageArray
    storyObj['videos'] = allVideos
    returnVal = {"status": status, "story": storyObj}
    output = json.dumps(returnVal)
    return output
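# Hedged usage sketch (not part of the original file): on success fetchStory() returns a JSON
# string whose "story" object carries the title, outline, url, images and videos collected
# above, so a caller can decode it as below. The command-line handling, and the assumption
# that failure paths also return a JSON string with a non-"OK" status, are illustrative only.
if __name__ == '__main__' and len(sys.argv) > 1:
    result = fetchStory(sys.argv[1])
    decoded = json.loads(result)
    if decoded.get('status') == 'OK':
        print decoded['story']['title']
        print decoded['story']['outline']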
def scanPage():
    if 'url' not in infoModule.info.page:
        log.plog('scan page called without url', 4)
        sys.exit()
    log.plog("fetching " + infoModule.info.page['url'], 2)
    socket = urllib.urlopen(infoModule.info.page['url'])
    infoModule.info.page['rawHTML'] = socket.read()
    redirURL = socket.geturl()
    if redirURL != infoModule.info.page['url']:
        log.plog('redirected to ' + redirURL, 2)
        infoModule.info.page['url'] = redirURL
    ## maybe check last modified header and don't get stories older than 7 days?
    if len(infoModule.info.page['rawHTML']) > 500000:
        log.plog("article length exceeds 500k, probably not html", 2)
        sys.exit()

    print "<b>URL</b> <a href=\"" + infoModule.info.page['url'] + "\">" + infoModule.info.page['url'] + "</a><br />"
    #print "<b>URL</b> " + infoModule.info.page['url'] + "<br />"

    # get title
    #title = find_title.findTitle()
    #if title != False:
    #    infoModule.info.page['title'] = title
    #elif 'potential_title' in infoModule.info.page and len(infoModule.info.page['potential_title']) > 0:
    #    infoModule.info.page['title'] = infoModule.info.page['potential_title']
    #else:
    #    log.plog('no title found!', 3)
    #    sys.exit()
    ##print infoModule.info.page['title']

    #find images
    #if 'image_start_marker' in infoModule.info.source:
    #    image_start_marker = infoModule.info.source['image_start_marker']
    #else:
    #    image_start_marker = ''
    #if 'image_end_marker' in infoModule.info.source:
    #    image_end_marker = infoModule.info.source['image_end_marker']
    #else:
    #    image_end_marker = ''
    #imageArray = find_images.findImages(infoModule.info.page['rawHTML'], image_start_marker, image_end_marker)
    #x = imageArray[0]
    #y = imageArray[1]
    #imageURL = imageArray[2]
    #if imageURL == '':
    #    log.plog('could not find image', 3)
    #else:
    #    log.plog('image found: ' + imageURL, 2)
    #    infoModule.info.page['largestImage'] = imageURL
    #    infoModule.info.page['maxSize'] = x * y

    ## image credit if any
    infoModule.info.page['imageSource'] = ''
    if 'image_source_start_marker' in infoModule.info.source and 'image_source_end_marker' in infoModule.info.source:
        imageSource = find_credit.findCredit(infoModule.info.page['rawHTML'], infoModule.info.source['image_source_start_marker'], infoModule.info.source['image_source_end_marker'])
        if imageSource != False and imageSource != None:
            infoModule.info.page['imageSource'] = imageSource
            #print "<b>Image Credit:</b> " + imageSource + "<br />"

    ###look for videos
    #videoHunter = find_video.youtube()
    #videoLink = videoHunter.getURL(infoModule.info.page['rawHTML'])
    #if videoLink == False:
    #    log.plog('no video found', 2)
    #    infoModule.info.page['vlink'] = ''
    #else:
    #    log.plog('found video embed', 2)
    #    infoModule.info.page['vlink'] = videoLink

    ## parse links in page
    #links.linkScoring(infoModule.info.page['rawHTML'], 'subs')
    #links.linkScoring(infoModule.info.page['rawHTML'], 'newsroom')
    #links.outboundLinks(infoModule.info.page['rawHTML'])

    ##author in story?
    if 'author_start_marker' in infoModule.info.source and 'author_end_marker' in infoModule.info.source:
        author = find_author.findAuthor()
        if author != False:
            # escape angle brackets so any markup in the author string shows up literally in the debug output
            author = author.replace('<', '&lt;')
            infoModule.info.page['author'] = author
            print "<b>Author:</b> " + author + "<br />"
        else:
            infoModule.info.page['author'] = ''
    else:
        infoModule.info.page['author'] = ''

    ## fetch outline
    if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
        infoModule.info.page['plainText'] = strip_html.clearHTMLFeatures(infoModule.info.page['rawHTML'])
    else:
        infoModule.info.page['plainText'] = strip_html.clearHTML(infoModule.info.page['rawHTML'])

    outline = find_story.findStoryViaRegex()
    if outline != False:
        if 'featured_source' in infoModule.info.source and infoModule.info.source['featured_source'] == '1':
            infoModule.info.page['outline'] = strip_html.clearHTMLFeatures(outline)
        else:
            infoModule.info.page['outline'] = strip_html.clearHTML(outline)
        print "<b>Outline:</b> " + infoModule.info.page['outline'] + "<br />"
    print "<hr>"