def extract_old(text):
    """Legacy body extractor: find the story in *text* by line-length stats.

    Splits the text into lines and flags lines whose length exceeds
    average + 1.2 * stddev (and that are not title-case, and have enough
    periods per periodCase) as the start of a story cluster.  Contiguous
    clusters are collected into ``blocks``.

    Returns the first collected block as a string, or False when no
    block was found.  (Mixed return types are kept for caller
    compatibility.)
    """
    lines = re.split('\n', text)
    lengths = empty(len(lines))
    for i in range(len(lines)):
        lengths[i] = len(lines[i])
    total = sum(lengths)
    average = total / len(lines)
    longest = amax(lengths)
    deviation = std(lengths)
    w_count = word_count(lines)
    groups = grouper(lengths)
    deviants = []
    dates = find_date(lines)
    comments = find_comments(lines)
    start = False
    end_count = 0
    large_block = []
    blocks = []
    period_ratio = []
    # Looking for beginning of story based on cluster of long lines
    period_ratio = periodCase(lines)
    for i in range(len(lines)):
        if lengths[i] > average + (deviation * 1.2) \
           and titleCase(lines[i]) == False \
           and period_ratio[i] > 0.005:
            start = True
            end_count = 0
        elif start == True:
            end_count += 1
            # two consecutive short lines end the current cluster;
            # clusters shorter than 2 lines are discarded as noise
            if end_count > 1:
                if len(large_block) < 2:
                    large_block = []
                else:
                    # NOTE(review): `+=` extends blocks with the cluster's
                    # individual lines, so blocks[0] below is a single line
                    # (a string), not a list of lines. If whole-cluster
                    # blocks were intended this should be
                    # blocks.append(large_block) — confirm before changing,
                    # as the len() comparisons below depend on it.
                    blocks += large_block
                    large_block = []
        if start == True:
            large_block.append(lines[i])
            deviants.append(i)
    if len(blocks) == 0:
        return False
    # Maybe use second block instead of first
    if len(blocks) > 1 and \
       len(blocks[0]) < 3 and \
       len(blocks[1]) > len(blocks[0]):
        blocks[0] = blocks[1]
    # join instead of repeated += (avoids quadratic string concatenation;
    # result is identical)
    story = ''.join(blocks[0])
    return story
def extract(text, doAsciiConvert=True):
    """Extract the article body from raw page text (Python 2).

    Pipeline: normalize the text into lines, zero out junk lines (runs of
    long "words"), then use length-outlier statistics together with detected
    titles, dates, comment markers and the page meta description to choose a
    start line (``start_bod``) and an end, and pull the body via get_body().

    Side effects / external state: reads infoModule.info.page ('title',
    'meta_description') and infoModule.info.site ('body_extractor_no_date',
    'preserve_breaks'); may decrement infoModule.info.page['confidence'].
    Returns the story as one string, or None on any failure.

    NOTE(review): relies on many module-level helpers (long_letter_count,
    grouper, find_be_title, find_date, find_outliers, contiguous_outliers,
    find_comments, count_outliers, get_body, find_group, triple_ratio) whose
    contracts are not visible here.
    """
    #log.plog("BE: Initialized", 2)
    #find meta description tag if possible
    #convert to ascii is problematic for utf-8 process stacks, but may be needed for the *fi sites
    if doAsciiConvert:
        text = convertToAscii(text)
    # pipe characters are treated as line separators
    text = re.sub("\|", '\n', text)
    #remove all empty lines
    # this will significantly up the average line length for the outliers calculation and may
    text = re.sub("\n\s*\n+", '\n', text, 0, re.S | re.M)
    try:
        text = text.decode('utf-8')
    except:
        pass
    lines = re.split('\n', text)
    #remove blank lines
    if doAsciiConvert:
        title = convertToAscii(infoModule.info.page['title'])
    else:
        title = infoModule.info.page['title']
    lengths = [0] * len(lines)
    long_count = long_letter_count(lines)
    for i in range(len(lines)):
        #really? No-one thought to strip whitespace?
        lines[i] = lines[i].strip()
        print str(i) + ': ' + lines[i]
        if long_count[i] < 2:
            print("@@ " + str(i) + " : " + str(long_count[i]) + " : " + lines[i], 2)
            lengths[i] = len(lines[i])
        else:
            # lines with >=2 over-long letter runs are treated as junk:
            # zero length keeps them out of the outlier statistics
            lengths[i] = 0
    total = sum(lengths)
    average = total / len(lines)
    # each helper failure below is fatal for extraction: print and bail
    try:
        longest = amax(lengths)
    except:
        print "amax failed"
        return None
    try:
        deviation = std(lengths)
    except:
        print "std failed"
        return None
    try:
        groups = grouper(lengths)
    except:
        print "grouper failed"
        return None
    try:
        titles = find_be_title(lines, title)
    except:
        print "titles failed"
        return None
    try:
        dates = find_date(lines, titles)
    except:
        print "find_date failed"
        return None
    print "dates:"
    print dates
    try:
        outliers = find_outliers(average, lengths, deviation, dates)
    except:
        print "find_outliers failed"
        return None
    print "outliers:"
    print outliers
    #find contiguous outliers as likely indicator of where the body is
    contiguous_list = contiguous_outliers(outliers)
    try:
        comments = find_comments(lines, outliers)
    except:
        print "find_comments failed"
        return None
    #print("titles: ", titles)
    #print("dates: ", dates)
    #print("outliers: ", outliers)
    #print("comments: ", comments)
    blocks = []
    if not dates and not title and not comments and len(lines) < 20:
        #log.plog(("BE: FAILED: No dates titles or comments returning nothing"), 2)
        if 'body_extractor_no_date' in infoModule.info.site and infoModule.info.site['body_extractor_no_date'] == True:
            pass
        else:
            return None
    if not dates and not titles:
        #log.plog(("BE: FAILED: No dates and no titles found. Returning Nothing"), 2)
        if 'body_extractor_no_date' in infoModule.info.site and infoModule.info.site['body_extractor_no_date'] == True:
            pass
        else:
            return None
    if not dates:
        #log.plog(("BE: FAILED : No dates found. This should be corrected shortly"), 2)
        # no date found. Drop confidence
        infoModule.info.page['confidence'] -= 1
        if 'body_extractor_no_date' in infoModule.info.site and infoModule.info.site['body_extractor_no_date'] == True:
            # for peepbuzz case, body_extractor can ignore failure to find date
            dates = [0]
        else:
            return None
    #date = first date, UNLESS there is a contiguous outlier closer to a later date
    date = dates[0]
    if len(dates) > 1 and len(contiguous_list) > 0:
        #distance from first date to contiguous block:
        first_distance = abs(contiguous_list[0] - dates[0])
        print "first distance: " + str(first_distance)
        for i in range(1, len(dates)):
            next_distance = abs(contiguous_list[0] - dates[i])
            #log.plog('next distance: ' + str(next_distance), 2)
            if next_distance < first_distance:
                # date[i] is closer. use it.
                date = dates[i]
                #log.plog('using date ' + str(date), 2)
                break
    if len(outliers) > 0:
        outlier = outliers[0]
    else:
        #log.plog('no outliers found', 3)
        return None
    found = 0
    ###### meta description testing ##########
    meta_description_outliers = []
    meta_description_lines = []
    # a meta description match that starts at position 0 is better than one that is later in the string
    if 'meta_description' in infoModule.info.page and infoModule.info.page['meta_description'] != '':
        print "meta description is: " + infoModule.info.page['meta_description']
        #test for meta descriptions
        for i in range(len(outliers)):
            #print str(outliers[i]) + " " + lines[outliers[i]]
            outlier_position = lines[outliers[i]].find(infoModule.info.page['meta_description'])
            if outlier_position == 0:
                print "meta in outliers (perfect match): " + lines[outliers[i]]
                # position-0 matches are preferred: insert at the front
                meta_description_outliers.insert(0, outliers[i])
            elif outlier_position > 0:
                print "meta in outliers: " + lines[outliers[i]]
                meta_description_outliers.append(outliers[i])
        for i in range(len(lines)):
            #print str(i) + " " + lines[i]
            line_position = lines[i].find(infoModule.info.page['meta_description'])
            if line_position == 0:
                print "meta in lines: " + lines[i]
                meta_description_lines.insert(0, i)
            elif line_position > 0:
                print "meta in lines: " + lines[i]
                meta_description_lines.append(i)
    #log.plog(("BE:DCTFO: Finding date closest to first outlier"), 2)
    if titles:
        # walk outliers past the title; keep the last date before each
        for i in range(len(outliers)):
            if outliers[i] > titles[0]:
                for k in range(len(dates)):
                    print(dates[k], outliers[i], titles[0])
                    if dates[k] < outliers[i]:
                        date = dates[k]
                        outlier = outliers[i]
                    else:
                        #log.plog(("BE: found date : " + str(date) + " and outlier : " + str(outlier) + " using titles[0] : " + str(titles[0])), 2)
                        found = 1
            if found == 1:
                break
    else:
        for i in range(len(outliers)):
            for k in range(len(dates)):
                if dates[k] >= outliers[i]:
                    found = 1
                    break
                if dates[k] < outliers[i]:
                    outlier = outliers[i]
                    date = dates[k]
            if found == 1:
                #log.plog(("BE: found date : " + str(date) + " and outlier : " + str(outlier) + " without title"), 2)
                break
    if found == 0:
        #log.plog(("BE: Couldn't get suitable dates -- we are defaulting to date : " + str(date) + " and outlier : "+str(outlier)), 2)
        pass
    # no comments, then try and get the body from date on
    if titles:
        #log.plog(("BE: titles found, assigning start_bod"), 2)
        if titles[0] > date and count_outliers(date, titles[0], outliers, lines) < 2:
            if count_outliers(date, titles[0], outliers, lines) >= count_outliers(titles[0], len(lines), outliers, lines):
                #log.plog("BE: title after date, and outliers between date and title are greater than rest of body", 2)
                start_bod = date
            else:
                #log.plog("BE: title after date, and no outliers between date and title", 2)
                start_bod = titles[0]
            start_bod = start_bod + 1
        elif titles[0] < date and count_outliers(titles[0], date, outliers, lines) < 2:
            #print (titles[0], date, outliers, count_outliers(titles[0], date, outliers,lines))
            #if meta is in between title and date, sway the decision towards using the meta
            meta_lines_in_range = [meta_line for meta_line in meta_description_lines if meta_line > titles[0] and meta_line < date]
            meta_outliers_in_range = [meta_out for meta_out in meta_description_outliers if meta_out > titles[0] and meta_out < date]
            if len(meta_outliers_in_range) > 0:
                #log.plog('BE: title before date, meta_outlier in between', 2)
                start_bod = meta_outliers_in_range[0]
            elif len(meta_lines_in_range) > 0:
                #log.plog('BE: title before date, meta_line in between', 2)
                start_bod = meta_lines_in_range[0]
            elif count_outliers(titles[0], date, outliers, lines) >= count_outliers(date, len(lines), outliers, lines):
                #log.plog("BE: title before date, and outliers are greater than the rest of the body", 2)
                start_bod = titles[0]
            else:
                #log.plog("BE: title before date, and no outliers between title and date", 2)
                start_bod = date
        else:
            #date appears at end of body
            #print str(titles[0])
            #print date
            #print "count outliers" + str(count_outliers(titles[0], date, outliers, lines))
            #print "count outliers" + str(count_outliers(date, titles[0], outliers, lines))
            title_to_date = count_outliers(titles[0], date, outliers, lines)
            date_to_title = count_outliers(date, titles[0], outliers, lines)
            #log.plog("BE: date appears at end of body", 2)
            if title_to_date >= date_to_title:
                #log.plog("more outliers moving from title to date", 2);
                start_bod = titles[0]
            else:
                #log.plog("more outliers moving from date to title", 2);
                start_bod = date
            start_bod = start_bod + 1
    else:
        #log.plog("BE: no titles present, using date closest to first outlier as marker", 2)
        #print outliers[0]
        #print date
        outliers_to_end = count_outliers(date, len(lines), outliers, lines)
        outliers_before_date = count_outliers(outliers[0], date, outliers, lines)
        if outliers_to_end < 2 and outliers_before_date > outliers_to_end:
            #log.plog('date to end: not enough outliers compared to first outlier to date - LOW CONFIDENCE', 2)
            start_bod = outliers[0]
        else:
            start_bod = date
    #sys.exit()
    #log.plog(("BE: start_bod : "+str(start_bod)), 2)
    if not comments:
        #log.plog(("BE:NC: no comments, trying to get body from date and titles on if no outliers before date"), 2)
        o_count = count_outliers(start_bod, len(lines), outliers, lines)
        if o_count > 2:
            blocks = get_body(start_bod, len(lines), lines, dates)
            if 'preserve_breaks' in infoModule.info.site and infoModule.info.site['preserve_breaks'] == True:
                story = "[linebreak]".join(blocks)
            else:
                story = " ".join(blocks)
            #log.plog(("BE: got story, returning"), 2)
            return story
        else:
            #log.plog(("BE:NC: FAILED: no blocks found"), 2)
            return None
    # comments found
    # we know titles most usually appear before body, so we try and rely on this before anything else
    elif titles:
        #log.plog("BE: using titles to go through list of comments to find suitable outliers", 2)
        for i in range(len(comments)):
            if comments[i] > outlier:
                #log.plog("BE: Attempting start_bod: "+ str(start_bod) + " Comments: " + str(comments[i]) + " Outliers: " + str(count_outliers(start_bod, comments[i], outliers, lines)), 2)
                if count_outliers(start_bod, comments[i], outliers, lines) > 1 or count_outliers(start_bod, comments[i], outliers, lines) > len(outliers) - 1:
                    # is meta in outliers?
                    # if so, start there, not start bod
                    for meta_outlier in meta_description_outliers:
                        if meta_outlier > start_bod and meta_outlier < comments[i]:
                            #log.plog("shifting forward to meta tag at line " + str(meta_outlier), 2)
                            start_bod = meta_outlier
                            break
                    blocks = get_body(start_bod, comments[i], lines, dates)
                    print "get body " + str(start_bod) + ' ' + str(comments[i])
                    break
    else:
        #log.plog(("BE: comments found"), 2)
        # This next line causes a problem with pages whose body is one line or less,
        # like a link to another story especially when there are comments with long content
        ##log.plog(("BE: Attempting to count outliers (0, "+str(start_bod)+", outliers, lines) and got : " + str(count_outliers(0, start_bod, outliers, lines))), 2)
        if (count_outliers(0, start_bod, outliers, lines) >= 2):
            # if above 1, we may have the story above the date
            # then, grab first outlier until dates[0]
            #log.plog(("BE:OBFD: Outliers before first found date"), 2)
            # this means the previous start is now the new end
            end = start_bod
            begin = find_group(outliers[0], groups)
            # if comment tag is found and it is less than the first date, we'll use that as the new end
            if comments[0] > begin and comments[0] < end:
                end = comments[0]
            blocks = get_body(begin, end, lines, dates)
        else:
            # we are trying to pick out the date closest to the first outlier,
            # to get rid of any dates tohat may occur outside of the body
            # this checks to see if a date comes after the first outlier
            #log.plog(("BE:DCTFO: going through list of comments to find suitable place where outliers might be"), 2)
            for i in range(len(comments)):
                if comments[i] > outlier:
                    #log.plog("BE:DCTFO: Attempting start_bod: "+ str(start_bod+1) + " Comments: " + str(comments[i]) + " Outliers: " + str(count_outliers(start_bod, comments[i], outliers, lines)), 2)
                    # we are now going through the list of comments to find a suitable place where outliers might be
                    # This number of count_outliers seems to be a problem, where >0 pulls some false positives and >1 pulls some false negatives
                    # There has to be another metric to determine validity of content
                    if count_outliers(start_bod + 1, comments[i], outliers, lines) > 1:
                        #if meta description tag in lines or outliers is within two rows of start bod, use that instead
                        if start_bod in meta_description_outliers or start_bod in meta_description_lines:
                            #log.plog("BE:DCTFO: found meta description at start_bod", 2)
                            pass
                        elif start_bod - 1 in meta_description_outliers or start_bod - 1 in meta_description_lines:
                            #log.plog("BE:DCTFO: found meta description one behind start_bod", 2)
                            start_bod = start_bod - 1
                        elif start_bod - 2 in meta_description_outliers or start_bod - 2 in meta_description_lines:
                            #log.plog("BE:DCTFO: found meta description two behind start_bod", 2)
                            start_bod = start_bod - 2
                        else:
                            start_bod = start_bod + 1
                        blocks = get_body(start_bod, comments[i], lines, dates)
                        break
    if not blocks:
        # missed the body altogether, let's just grab from the date on
        # is there a match for meta_description in the outliers? try that
        if len(meta_description_outliers) > 0:
            #log.plog("BE: FAILED, using meta description outlier", 2)
            start_bod = meta_description_outliers[0]
        else:
            #log.plog(("BE: FAILED: missed the body altogether, let's just grab from the start : "+ str(start_bod) + " to " + str(len(lines))), 2)
            pass
        blocks = get_body(start_bod, len(lines), lines, dates)
    if len(blocks) == 0:
        #log.plog(("BE: FAILED: I couldn't pull anything, returning to None"), 2)
        return None
    body = []
    for i in range(len(blocks)):
        #title shouldn't be in blocks, unless block is really much longer than title
        # (200 chars)
        if not re.search(title, blocks[i]) or len(blocks[i]) > 200 + len(title):
            body.append(blocks[i])
    if body:
        ## for zezt, try preserving line breaks
        if 'preserve_breaks' in infoModule.info.site and infoModule.info.site['preserve_breaks'] == True:
            story = "[linebreak]".join(body)
        else:
            story = " ".join(body)
        three_word_ratio = triple_ratio(story)
        #log.plog("three_word_ratio: " + str(three_word_ratio), 2)
        if three_word_ratio < 0.08 or three_word_ratio > 0.3:
            #log.plog("BE: Not English!", 3)
            return None
        story = re.sub("\n", "", story)
        story = re.sub("<.*?>", "", story)
        #log.plog(("BE: I found something, returning story"), 2)
        #log.plog((story), 2)
        #if 'preserve_breaks' in infoModule.info.site and infoModule.info.site['preserve_breaks'] == True:
        # story = story.replace("[linebreak]", "<br />\n ")
        #manual fix inserted by ernst, 1/29/11
        story = re.sub(r'Email Sent! You have successfully emailed the post.', '', story)
        return story
    else:
        #log.plog("BE: no body")
        return None
def extract(text, doAsciiConvert = True):
    """Extract the article body from raw page text (Python 2).

    NOTE(review): this is a token-for-token duplicate redefinition of the
    extract() defined earlier in this file (only intra-line spacing
    differs).  Being defined later, THIS copy is the one in effect at
    import time; confirm whether the earlier copy can be removed.

    Pipeline: normalize the text into lines, zero out junk lines (runs of
    long "words"), then use length-outlier statistics together with detected
    titles, dates, comment markers and the page meta description to choose a
    start line (``start_bod``) and an end, and pull the body via get_body().

    Reads infoModule.info.page ('title', 'meta_description') and
    infoModule.info.site ('body_extractor_no_date', 'preserve_breaks');
    may decrement infoModule.info.page['confidence'].  Returns the story
    as one string, or None on any failure.
    """
    #log.plog("BE: Initialized", 2)
    #find meta description tag if possible
    #convert to ascii is problematic for utf-8 process stacks, but may be needed for the *fi sites
    if doAsciiConvert:
        text = convertToAscii(text)
    # pipe characters are treated as line separators
    text = re.sub("\|", '\n', text)
    #remove all empty lines
    # this will significantly up the average line length for the outliers calculation and may
    text = re.sub("\n\s*\n+", '\n', text, 0, re.S | re.M)
    try:
        text = text.decode('utf-8')
    except:
        pass
    lines = re.split('\n', text)
    #remove blank lines
    if doAsciiConvert:
        title = convertToAscii(infoModule.info.page['title'])
    else:
        title = infoModule.info.page['title']
    lengths = [0]*len(lines)
    long_count = long_letter_count(lines)
    for i in range(len(lines)):
        #really? No-one thought to strip whitespace?
        lines[i] = lines[i].strip()
        print str(i) + ': ' + lines[i]
        if long_count[i] < 2 :
            print("@@ " + str(i) + " : " + str(long_count[i]) + " : " + lines[i], 2)
            lengths[i] = len(lines[i])
        else:
            # lines with >=2 over-long letter runs are treated as junk:
            # zero length keeps them out of the outlier statistics
            lengths[i] = 0
    total = sum(lengths)
    average = total / len(lines)
    # each helper failure below is fatal for extraction: print and bail
    try:
        longest = amax(lengths)
    except:
        print "amax failed"
        return None
    try:
        deviation = std(lengths)
    except:
        print "std failed"
        return None
    try:
        groups = grouper(lengths)
    except:
        print "grouper failed"
        return None
    try:
        titles = find_be_title(lines, title)
    except:
        print "titles failed"
        return None
    try:
        dates = find_date(lines, titles)
    except:
        print "find_date failed"
        return None
    print "dates:"
    print dates
    try:
        outliers = find_outliers(average, lengths, deviation, dates)
    except:
        print "find_outliers failed"
        return None
    print "outliers:"
    print outliers
    #find contiguous outliers as likely indicator of where the body is
    contiguous_list = contiguous_outliers(outliers)
    try:
        comments = find_comments(lines, outliers)
    except:
        print "find_comments failed"
        return None
    #print("titles: ", titles)
    #print("dates: ", dates)
    #print("outliers: ", outliers)
    #print("comments: ", comments)
    blocks = []
    if not dates and not title and not comments and len(lines)<20:
        #log.plog(("BE: FAILED: No dates titles or comments returning nothing"), 2)
        if 'body_extractor_no_date' in infoModule.info.site and infoModule.info.site['body_extractor_no_date'] == True:
            pass
        else:
            return None
    if not dates and not titles:
        #log.plog(("BE: FAILED: No dates and no titles found. Returning Nothing"), 2)
        if 'body_extractor_no_date' in infoModule.info.site and infoModule.info.site['body_extractor_no_date'] == True:
            pass
        else:
            return None
    if not dates:
        #log.plog(("BE: FAILED : No dates found. This should be corrected shortly"), 2)
        # no date found. Drop confidence
        infoModule.info.page['confidence'] -= 1
        if 'body_extractor_no_date' in infoModule.info.site and infoModule.info.site['body_extractor_no_date'] == True:
            # for peepbuzz case, body_extractor can ignore failure to find date
            dates = [0]
        else:
            return None
    #date = first date, UNLESS there is a contiguous outlier closer to a later date
    date = dates[0]
    if len(dates) > 1 and len(contiguous_list) > 0:
        #distance from first date to contiguous block:
        first_distance = abs(contiguous_list[0] - dates[0])
        print "first distance: " + str(first_distance)
        for i in range(1,len(dates)):
            next_distance = abs(contiguous_list[0] - dates[i])
            #log.plog('next distance: ' + str(next_distance), 2)
            if next_distance < first_distance:
                # date[i] is closer. use it.
                date = dates[i]
                #log.plog('using date ' + str(date), 2)
                break
    if len(outliers) > 0:
        outlier = outliers[0]
    else:
        #log.plog('no outliers found', 3)
        return None
    found = 0
    ###### meta description testing ##########
    meta_description_outliers = []
    meta_description_lines = []
    # a meta description match that starts at position 0 is better than one that is later in the string
    if 'meta_description' in infoModule.info.page and infoModule.info.page['meta_description'] != '':
        print "meta description is: " + infoModule.info.page['meta_description']
        #test for meta descriptions
        for i in range(len(outliers)):
            #print str(outliers[i]) + " " + lines[outliers[i]]
            outlier_position = lines[outliers[i]].find(infoModule.info.page['meta_description'])
            if outlier_position == 0:
                print "meta in outliers (perfect match): " + lines[outliers[i]]
                # position-0 matches are preferred: insert at the front
                meta_description_outliers.insert(0, outliers[i])
            elif outlier_position > 0:
                print "meta in outliers: " + lines[outliers[i]]
                meta_description_outliers.append(outliers[i])
        for i in range(len(lines)):
            #print str(i) + " " + lines[i]
            line_position = lines[i].find(infoModule.info.page['meta_description'])
            if line_position == 0:
                print "meta in lines: " + lines[i]
                meta_description_lines.insert(0, i)
            elif line_position > 0:
                print "meta in lines: " + lines[i]
                meta_description_lines.append(i)
    #log.plog(("BE:DCTFO: Finding date closest to first outlier"), 2)
    if titles:
        # walk outliers past the title; keep the last date before each
        for i in range(len(outliers)):
            if outliers[i] > titles[0]:
                for k in range(len(dates)):
                    print(dates[k], outliers[i], titles[0])
                    if dates[k] < outliers[i]:
                        date = dates[k]
                        outlier = outliers[i]
                    else:
                        #log.plog(("BE: found date : " + str(date) + " and outlier : " + str(outlier) + " using titles[0] : " + str(titles[0])), 2)
                        found = 1
            if found == 1:
                break
    else:
        for i in range(len(outliers)):
            for k in range(len(dates)):
                if dates[k] >= outliers[i]:
                    found = 1
                    break
                if dates[k] < outliers[i]:
                    outlier = outliers[i]
                    date = dates[k]
            if found == 1:
                #log.plog(("BE: found date : " + str(date) + " and outlier : " + str(outlier) + " without title"), 2)
                break
    if found == 0:
        #log.plog(("BE: Couldn't get suitable dates -- we are defaulting to date : " + str(date) + " and outlier : "+str(outlier)), 2)
        pass
    # no comments, then try and get the body from date on
    if titles:
        #log.plog(("BE: titles found, assigning start_bod"), 2)
        if titles[0] > date and count_outliers(date, titles[0], outliers, lines) < 2:
            if count_outliers(date, titles[0], outliers, lines) >= count_outliers(titles[0], len(lines), outliers, lines):
                #log.plog("BE: title after date, and outliers between date and title are greater than rest of body", 2)
                start_bod = date
            else:
                #log.plog("BE: title after date, and no outliers between date and title", 2)
                start_bod = titles[0]
            start_bod = start_bod + 1
        elif titles[0] < date and count_outliers(titles[0], date, outliers, lines) < 2 :
            #print (titles[0], date, outliers, count_outliers(titles[0], date, outliers,lines))
            #if meta is in between title and date, sway the decision towards using the meta
            meta_lines_in_range = [meta_line for meta_line in meta_description_lines if meta_line > titles[0] and meta_line < date]
            meta_outliers_in_range = [meta_out for meta_out in meta_description_outliers if meta_out > titles[0] and meta_out < date]
            if len(meta_outliers_in_range) > 0:
                #log.plog('BE: title before date, meta_outlier in between', 2)
                start_bod = meta_outliers_in_range[0]
            elif len(meta_lines_in_range) > 0:
                #log.plog('BE: title before date, meta_line in between', 2)
                start_bod= meta_lines_in_range[0]
            elif count_outliers(titles[0], date, outliers,lines) >= count_outliers(date, len(lines), outliers, lines):
                #log.plog("BE: title before date, and outliers are greater than the rest of the body", 2)
                start_bod = titles[0]
            else:
                #log.plog("BE: title before date, and no outliers between title and date", 2)
                start_bod = date
        else:
            #date appears at end of body
            #print str(titles[0])
            #print date
            #print "count outliers" + str(count_outliers(titles[0], date, outliers, lines))
            #print "count outliers" + str(count_outliers(date, titles[0], outliers, lines))
            title_to_date = count_outliers(titles[0], date, outliers, lines)
            date_to_title = count_outliers(date, titles[0], outliers, lines)
            #log.plog("BE: date appears at end of body", 2)
            if title_to_date >= date_to_title:
                #log.plog("more outliers moving from title to date", 2);
                start_bod = titles[0]
            else:
                #log.plog("more outliers moving from date to title", 2);
                start_bod = date
            start_bod = start_bod + 1
    else:
        #log.plog("BE: no titles present, using date closest to first outlier as marker", 2)
        #print outliers[0]
        #print date
        outliers_to_end = count_outliers(date, len(lines), outliers, lines)
        outliers_before_date = count_outliers(outliers[0], date, outliers, lines)
        if outliers_to_end < 2 and outliers_before_date > outliers_to_end:
            #log.plog('date to end: not enough outliers compared to first outlier to date - LOW CONFIDENCE', 2)
            start_bod = outliers[0]
        else:
            start_bod = date
    #sys.exit()
    #log.plog(("BE: start_bod : "+str(start_bod)), 2)
    if not comments:
        #log.plog(("BE:NC: no comments, trying to get body from date and titles on if no outliers before date"), 2)
        o_count = count_outliers(start_bod, len(lines), outliers, lines)
        if o_count > 2:
            blocks = get_body(start_bod, len(lines), lines, dates)
            if 'preserve_breaks' in infoModule.info.site and infoModule.info.site['preserve_breaks'] == True:
                story = "[linebreak]".join(blocks)
            else:
                story = " ".join(blocks)
            #log.plog(("BE: got story, returning"), 2)
            return story
        else:
            #log.plog(("BE:NC: FAILED: no blocks found"), 2)
            return None
    # comments found
    # we know titles most usually appear before body, so we try and rely on this before anything else
    elif titles:
        #log.plog("BE: using titles to go through list of comments to find suitable outliers", 2)
        for i in range(len(comments)):
            if comments[i] > outlier:
                #log.plog("BE: Attempting start_bod: "+ str(start_bod) + " Comments: " + str(comments[i]) + " Outliers: " + str(count_outliers(start_bod, comments[i], outliers, lines)), 2)
                if count_outliers(start_bod, comments[i], outliers, lines) > 1 or count_outliers(start_bod, comments[i], outliers, lines) > len(outliers)-1:
                    # is meta in outliers?
                    # if so, start there, not start bod
                    for meta_outlier in meta_description_outliers:
                        if meta_outlier > start_bod and meta_outlier < comments[i]:
                            #log.plog("shifting forward to meta tag at line " + str(meta_outlier), 2)
                            start_bod = meta_outlier
                            break
                    blocks = get_body(start_bod, comments[i], lines, dates)
                    print "get body " + str(start_bod) + ' ' + str(comments[i])
                    break
    else:
        #log.plog(("BE: comments found"), 2)
        # This next line causes a problem with pages whose body is one line or less,
        # like a link to another story especially when there are comments with long content
        ##log.plog(("BE: Attempting to count outliers (0, "+str(start_bod)+", outliers, lines) and got : " + str(count_outliers(0, start_bod, outliers, lines))), 2)
        if (count_outliers(0, start_bod, outliers, lines) >= 2):
            # if above 1, we may have the story above the date
            # then, grab first outlier until dates[0]
            #log.plog(("BE:OBFD: Outliers before first found date"), 2)
            # this means the previous start is now the new end
            end = start_bod
            begin = find_group(outliers[0], groups)
            # if comment tag is found and it is less than the first date, we'll use that as the new end
            if comments[0] > begin and comments[0] < end:
                end = comments[0]
            blocks = get_body(begin, end, lines, dates)
        else:
            # we are trying to pick out the date closest to the first outlier,
            # to get rid of any dates tohat may occur outside of the body
            # this checks to see if a date comes after the first outlier
            #log.plog(("BE:DCTFO: going through list of comments to find suitable place where outliers might be"), 2)
            for i in range(len(comments)):
                if comments[i] > outlier:
                    #log.plog("BE:DCTFO: Attempting start_bod: "+ str(start_bod+1) + " Comments: " + str(comments[i]) + " Outliers: " + str(count_outliers(start_bod, comments[i], outliers, lines)), 2)
                    # we are now going through the list of comments to find a suitable place where outliers might be
                    # This number of count_outliers seems to be a problem, where >0 pulls some false positives and >1 pulls some false negatives
                    # There has to be another metric to determine validity of content
                    if count_outliers(start_bod+1, comments[i], outliers, lines) > 1:
                        #if meta description tag in lines or outliers is within two rows of start bod, use that instead
                        if start_bod in meta_description_outliers or start_bod in meta_description_lines:
                            #log.plog("BE:DCTFO: found meta description at start_bod", 2)
                            pass
                        elif start_bod - 1 in meta_description_outliers or start_bod - 1 in meta_description_lines:
                            #log.plog("BE:DCTFO: found meta description one behind start_bod", 2)
                            start_bod = start_bod - 1
                        elif start_bod - 2 in meta_description_outliers or start_bod - 2 in meta_description_lines:
                            #log.plog("BE:DCTFO: found meta description two behind start_bod", 2)
                            start_bod = start_bod - 2
                        else:
                            start_bod = start_bod + 1
                        blocks = get_body(start_bod, comments[i], lines, dates)
                        break
    if not blocks:
        # missed the body altogether, let's just grab from the date on
        # is there a match for meta_description in the outliers? try that
        if len(meta_description_outliers) > 0:
            #log.plog("BE: FAILED, using meta description outlier", 2)
            start_bod = meta_description_outliers[0]
        else:
            #log.plog(("BE: FAILED: missed the body altogether, let's just grab from the start : "+ str(start_bod) + " to " + str(len(lines))), 2)
            pass
        blocks = get_body(start_bod, len(lines), lines, dates)
    if len(blocks) == 0:
        #log.plog(("BE: FAILED: I couldn't pull anything, returning to None"), 2)
        return None
    body = []
    for i in range(len(blocks)):
        #title shouldn't be in blocks, unless block is really much longer than title
        # (200 chars)
        if not re.search(title, blocks[i]) or len(blocks[i]) > 200 + len(title):
            body.append(blocks[i])
    if body :
        ## for zezt, try preserving line breaks
        if 'preserve_breaks' in infoModule.info.site and infoModule.info.site['preserve_breaks'] == True:
            story = "[linebreak]".join(body)
        else:
            story = " ".join(body)
        three_word_ratio = triple_ratio(story)
        #log.plog("three_word_ratio: " + str(three_word_ratio), 2)
        if three_word_ratio < 0.08 or three_word_ratio > 0.3:
            #log.plog("BE: Not English!", 3)
            return None
        story = re.sub("\n", "", story)
        story = re.sub("<.*?>", "", story)
        #log.plog(("BE: I found something, returning story"), 2)
        #log.plog((story), 2)
        #if 'preserve_breaks' in infoModule.info.site and infoModule.info.site['preserve_breaks'] == True:
        # story = story.replace("[linebreak]", "<br />\n ")
        #manual fix inserted by ernst, 1/29/11
        story = re.sub(r'Email Sent! You have successfully emailed the post.', '', story)
        return story
    else:
        #log.plog("BE: no body")
        return None