Example #1
def extract_old(text):
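    """Legacy extractor: pull the story body out of raw page text.

    Heuristic, as implemented below: split the text into lines, flag lines
    that are much longer than average, not title-cased and containing at
    least a trace of periods, then collect runs of such lines into blocks
    and return the first (or, when the first is very short, the second)
    block as the story.  Returns False when no block is found.  Assumes
    `re`, numpy's `empty`/`amax`/`std` and the helper functions used below
    are importable in this module.
    """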
    lines = re.split('\n', text)

    lengths = empty(len(lines))
    for i in range(len(lines)):
        lengths[i] = len(lines[i])

    total = sum(lengths)
    average = total / len(lines)
    longest = amax(lengths)
    deviation = std(lengths)


    w_count = word_count(lines)
    groups = grouper(lengths)
    deviants = []
    dates = find_date(lines)
    comments = find_comments(lines)

    start = False
    end_count = 0
    large_block = []
    blocks = []
    period_ratio = []

    # Looking for beginning of story based on cluster of long lines
    period_ratio = periodCase(lines)
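    # A line is treated as body text when it is noticeably longer than
    # average (mean + 1.2 standard deviations), is not title-cased and has
    # a small but non-zero ratio of periods; two consecutive misses close
    # the current block.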
    for i in range(len(lines)):
        if lengths[i] > average + (deviation * 1.2) \
           and titleCase(lines[i]) == False \
           and period_ratio[i] > 0.005:
            start = True
            end_count = 0
        elif start == True:
            end_count += 1

        if end_count > 1:
            if len(large_block) < 2:
                large_block = []
            else:
                # Keep each run of long lines as its own block (a list of
                # lines); the code below indexes blocks[0] / blocks[1] as
                # whole blocks.
                blocks.append(large_block)
                large_block = []

        if start == True:
            large_block.append(lines[i])
            deviants.append(i)

    if len(blocks) == 0:
        return False
    # Maybe use second block instead of first
    if len(blocks) > 1 and \
       len(blocks[0]) < 3 and \
       len(blocks[1]) > len(blocks[0]): 
        blocks[0] = blocks[1]

    # Join the block's lines with spaces so words at line boundaries do not
    # run together (mirrors the " ".join used by the newer extract()).
    story = ' '.join(blocks[0])

    return story
Example #2
def extract(text, doAsciiConvert=True):
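    """Extract the story body from raw page text.

    Rough flow, as implemented below: normalise the text, measure line
    lengths, find outlier (unusually long) lines, locate title, date and
    comment markers, choose a start line from those signals (preferring a
    line that matches the page's meta description where available), then
    join the selected range into a single story string.  Returns None when
    no plausible body is found.  Relies on module-level state in
    infoModule.info and on the helper functions referenced below.
    """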
    #log.plog("BE: Initialized", 2)
    #find meta description tag if possible
    #convert to ascii is problematic for utf-8 process stacks, but may be needed for the *fi sites
    if doAsciiConvert:
        text = convertToAscii(text)

    text = re.sub("\|", '\n', text)
    #remove all empty lines
    # this will significantly up the average line length for the outliers calculation and may
    text = re.sub("\n\s*\n+", '\n', text, 0, re.S | re.M)
    try:
        text = text.decode('utf-8')
    except:
        pass
    lines = re.split('\n', text)
    #remove blank lines

    if doAsciiConvert:
        title = convertToAscii(infoModule.info.page['title'])
    else:
        title = infoModule.info.page['title']
    lengths = [0] * len(lines)
    long_count = long_letter_count(lines)
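    # Lines that long_letter_count (an external helper) flags with a count
    # of 2 or more are zeroed out below so they do not inflate the average
    # line length or the outlier threshold.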

    for i in range(len(lines)):
        #really? No-one thought to strip whitespace?
        lines[i] = lines[i].strip()
        print str(i) + ': ' + lines[i]
        if long_count[i] < 2:
            print "@@ " + str(i) + " : " + str(long_count[i]) + " : " + lines[i]
            lengths[i] = len(lines[i])
        else:
            lengths[i] = 0

    total = sum(lengths)
    average = total / len(lines)
    try:
        longest = amax(lengths)
    except:
        print "amax failed"
        return None
    try:
        deviation = std(lengths)
    except:
        print "std failed"
        return None
    try:
        groups = grouper(lengths)
    except:
        print "grouper failed"
        return None
    try:
        titles = find_be_title(lines, title)
    except:
        print "titles failed"
        return None
    try:
        dates = find_date(lines, titles)
    except:
        print "find_date failed"
        return None
    print "dates:"
    print dates
    try:
        outliers = find_outliers(average, lengths, deviation, dates)
    except:
        print "find_outliers failed"
        return None
    print "outliers:"
    print outliers
    #find contiguous outliers as likely indicator of where the body is
    contiguous_list = contiguous_outliers(outliers)
    try:
        comments = find_comments(lines, outliers)
    except:
        print "find_comments failed"
        return None
        #print("titles: ", titles)
        #print("dates: ", dates)
        #print("outliers: ", outliers)
        #print("comments: ", comments)

    blocks = []

    if not dates and not title and not comments and len(lines) < 20:
        #log.plog(("BE: FAILED: No dates titles or comments returning nothing"), 2)
        if 'body_extractor_no_date' in infoModule.info.site and infoModule.info.site[
                'body_extractor_no_date'] == True:
            pass
        else:
            return None
    if not dates and not titles:
        #log.plog(("BE: FAILED: No dates and no titles found. Returning Nothing"), 2)
        if 'body_extractor_no_date' in infoModule.info.site and infoModule.info.site[
                'body_extractor_no_date'] == True:
            pass
        else:
            return None
    if not dates:
        #log.plog(("BE: FAILED : No dates found. This should be corrected shortly"), 2)
        # no date found.  Drop confidence
        infoModule.info.page['confidence'] -= 1
        if 'body_extractor_no_date' in infoModule.info.site and infoModule.info.site[
                'body_extractor_no_date'] == True:
            # for peepbuzz case, body_extractor can ignore failure to find date
            dates = [0]
        else:
            return None

    #date = first date, UNLESS there is a contiguous outlier closer to a later date
    date = dates[0]
    if len(dates) > 1 and len(contiguous_list) > 0:
        #distance from first date to contiguous block:
        first_distance = abs(contiguous_list[0] - dates[0])
        print "first distance: " + str(first_distance)
        for i in range(1, len(dates)):
            next_distance = abs(contiguous_list[0] - dates[i])
            #log.plog('next distance: ' + str(next_distance), 2)
            if next_distance < first_distance:
                # date[i] is closer.  use it.
                date = dates[i]
                #log.plog('using date ' + str(date), 2)
                break

    if len(outliers) > 0:
        outlier = outliers[0]
    else:
        #log.plog('no outliers found', 3)
        return None
    found = 0

    ###### meta description testing ##########
    meta_description_outliers = []
    meta_description_lines = []
    # a meta description match that starts at position 0 is better than one that is later in the string

    if 'meta_description' in infoModule.info.page and infoModule.info.page[
            'meta_description'] != '':
        print "meta description is: " + infoModule.info.page['meta_description']
        #test for meta descriptions
        for i in range(len(outliers)):
            #print str(outliers[i]) + " " + lines[outliers[i]]
            outlier_position = lines[outliers[i]].find(
                infoModule.info.page['meta_description'])
            if outlier_position == 0:
                print "meta in outliers (perfect match): " + lines[outliers[i]]
                meta_description_outliers.insert(0, outliers[i])
            elif outlier_position > 0:
                print "meta in outliers: " + lines[outliers[i]]
                meta_description_outliers.append(outliers[i])
        for i in range(len(lines)):
            #print str(i) + " " + lines[i]
            line_position = lines[i].find(
                infoModule.info.page['meta_description'])
            if line_position == 0:
                print "meta in lines: " + lines[i]
                meta_description_lines.insert(0, i)
            elif line_position > 0:
                print "meta in lines: " + lines[i]
                meta_description_lines.append(i)

    #log.plog(("BE:DCTFO: Finding date closest to first outlier"), 2)
    if titles:
        for i in range(len(outliers)):
            if outliers[i] > titles[0]:
                for k in range(len(dates)):
                    print(dates[k], outliers[i], titles[0])
                    if dates[k] < outliers[i]:
                        date = dates[k]
                        outlier = outliers[i]
                    else:
                        #log.plog(("BE: found date : " + str(date) + " and outlier : " + str(outlier) + " using titles[0] : " + str(titles[0])), 2)
                        found = 1
            if found == 1:
                break
    else:
        for i in range(len(outliers)):
            for k in range(len(dates)):
                if dates[k] >= outliers[i]:
                    found = 1
                    break
                if dates[k] < outliers[i]:
                    outlier = outliers[i]
                    date = dates[k]
            if found == 1:
                #log.plog(("BE: found date : " + str(date) + " and outlier : " + str(outlier) + " without title"), 2)
                break
    if found == 0:
        #log.plog(("BE: Couldn't get suitable dates -- we are defaulting to  date : " + str(date) + " and outlier : "+str(outlier)), 2)
        pass

    # no comments, then try and get the body from date on
    if titles:
        #log.plog(("BE: titles found, assigning start_bod"), 2)
        if titles[0] > date and count_outliers(date, titles[0], outliers,
                                               lines) < 2:
            if count_outliers(date, titles[0],
                              outliers, lines) >= count_outliers(
                                  titles[0], len(lines), outliers, lines):
                #log.plog("BE: title after date, and outliers between date and title are greater than rest of body", 2)
                start_bod = date
            else:
                #log.plog("BE: title after date, and no outliers between date and title", 2)
                start_bod = titles[0]
                start_bod = start_bod + 1
        elif titles[0] < date and count_outliers(titles[0], date, outliers,
                                                 lines) < 2:
            #print (titles[0], date, outliers, count_outliers(titles[0], date, outliers,lines))
            #if meta is in between title and date, sway the decision towards using the meta
            meta_lines_in_range = [
                meta_line for meta_line in meta_description_lines
                if meta_line > titles[0] and meta_line < date
            ]
            meta_outliers_in_range = [
                meta_out for meta_out in meta_description_outliers
                if meta_out > titles[0] and meta_out < date
            ]
            if len(meta_outliers_in_range) > 0:
                #log.plog('BE: title before date, meta_outlier in between', 2)
                start_bod = meta_outliers_in_range[0]
            elif len(meta_lines_in_range) > 0:
                #log.plog('BE: title before date, meta_line in between', 2)
                start_bod = meta_lines_in_range[0]
            elif count_outliers(titles[0], date, outliers,
                                lines) >= count_outliers(
                                    date, len(lines), outliers, lines):
                #log.plog("BE: title before date, and outliers are greater than the rest of the body", 2)
                start_bod = titles[0]
            else:
                #log.plog("BE: title before date, and no outliers between title and date", 2)
                start_bod = date
        else:
            #date appears at end of body
            #print str(titles[0])
            #print date
            #print "count outliers" + str(count_outliers(titles[0], date, outliers, lines))
            #print "count outliers" + str(count_outliers(date, titles[0], outliers, lines))
            title_to_date = count_outliers(titles[0], date, outliers, lines)
            date_to_title = count_outliers(date, titles[0], outliers, lines)
            #log.plog("BE: date appears at end of body", 2)
            if title_to_date >= date_to_title:
                #log.plog("more outliers moving from title to date", 2);
                start_bod = titles[0]
            else:
                #log.plog("more outliers moving from date to title", 2);
                start_bod = date
            start_bod = start_bod + 1
    else:
        #log.plog("BE: no titles present, using date closest to first outlier as marker", 2)
        #print outliers[0]
        #print date
        outliers_to_end = count_outliers(date, len(lines), outliers, lines)
        outliers_before_date = count_outliers(outliers[0], date, outliers,
                                              lines)
        if outliers_to_end < 2 and outliers_before_date > outliers_to_end:
            #log.plog('date to end: not enough outliers compared to first outlier to date - LOW CONFIDENCE', 2)
            start_bod = outliers[0]
        else:
            start_bod = date
    #sys.exit()

    #log.plog(("BE: start_bod : "+str(start_bod)), 2)
    if not comments:
        #log.plog(("BE:NC: no comments, trying to get body from date and titles on if no outliers before date"), 2)
        o_count = count_outliers(start_bod, len(lines), outliers, lines)
        if o_count > 2:
            blocks = get_body(start_bod, len(lines), lines, dates)
            if 'preserve_breaks' in infoModule.info.site and infoModule.info.site[
                    'preserve_breaks'] == True:
                story = "[linebreak]".join(blocks)
            else:
                story = " ".join(blocks)
            #log.plog(("BE: got story, returning"), 2)
            return story
        else:
            #log.plog(("BE:NC: FAILED: no blocks found"), 2)
            return None
    # comments found

    # we know titles most usually appear before body, so we try and rely on this before anything else
    elif titles:
        #log.plog("BE: using titles to go through list of comments to find suitable outliers", 2)
        for i in range(len(comments)):
            if comments[i] > outlier:
                #log.plog("BE: Attempting start_bod: "+ str(start_bod) + " Comments: " + str(comments[i]) + " Outliers: " + str(count_outliers(start_bod, comments[i], outliers, lines)), 2)
                if count_outliers(start_bod, comments[i], outliers,
                                  lines) > 1 or count_outliers(
                                      start_bod, comments[i], outliers,
                                      lines) > len(outliers) - 1:
                    # is meta in outliers?
                    # if so, start there, not start bod
                    for meta_outlier in meta_description_outliers:
                        if meta_outlier > start_bod and meta_outlier < comments[
                                i]:
                            #log.plog("shifting forward to meta tag at line " + str(meta_outlier), 2)
                            start_bod = meta_outlier
                            break
                    blocks = get_body(start_bod, comments[i], lines, dates)
                    print "get body " + str(start_bod) + ' ' + str(comments[i])
                    break
    else:
        #log.plog(("BE: comments found"), 2)
        # This next line causes a problem with pages whose body is one line or less,
        # like a link to another story especially when there are comments with long content
        ##log.plog(("BE: Attempting to count outliers (0, "+str(start_bod)+", outliers, lines) and got : " + str(count_outliers(0, start_bod, outliers, lines))), 2)
        if (count_outliers(0, start_bod, outliers, lines) >= 2):
            # if above 1, we may have the story above the date
            # then, grab first outlier until dates[0]
            #log.plog(("BE:OBFD: Outliers before first found date"), 2)
            # this means the previous start is now the new end
            end = start_bod
            begin = find_group(outliers[0], groups)
            # if comment tag is found and it is less than the first date, we'll use that as the new end
            if comments[0] > begin and comments[0] < end:
                end = comments[0]
            blocks = get_body(begin, end, lines, dates)
        else:
            # we are trying to pick out the date closest to the first outlier,
            # to get rid of any dates that may occur outside of the body
            # this checks to see if a date comes after the first outlier
            #log.plog(("BE:DCTFO: going through list of comments to find suitable place where outliers might be"), 2)
            for i in range(len(comments)):
                if comments[i] > outlier:
                    #log.plog("BE:DCTFO: Attempting start_bod: "+ str(start_bod+1) + " Comments: " + str(comments[i]) + " Outliers: " + str(count_outliers(start_bod, comments[i], outliers, lines)), 2)
                    # we are now going through the list of comments to find a suitable place where outliers might be
                    # This number of count_outliers seems to be a problem, where >0 pulls some false positives and >1 pulls some false negatives
                    # There has to be another metric to determine validity of content
                    if count_outliers(start_bod + 1, comments[i], outliers,
                                      lines) > 1:
                        #if meta description tag in lines or outliers is within two rows of start bod, use that instead
                        if start_bod in meta_description_outliers or start_bod in meta_description_lines:
                            #log.plog("BE:DCTFO: found meta description at start_bod", 2)
                            pass
                        elif start_bod - 1 in meta_description_outliers or start_bod - 1 in meta_description_lines:
                            #log.plog("BE:DCTFO: found meta description one behind start_bod", 2)
                            start_bod = start_bod - 1
                        elif start_bod - 2 in meta_description_outliers or start_bod - 2 in meta_description_lines:
                            #log.plog("BE:DCTFO: found meta description two behind start_bod", 2)
                            start_bod = start_bod - 2
                        else:
                            start_bod = start_bod + 1
                        blocks = get_body(start_bod, comments[i], lines, dates)

                        break
    if not blocks:
        # missed the body altogether, let's just grab from the date on
        # is there a match for meta_description in the outliers?  try that
        if len(meta_description_outliers) > 0:
            #log.plog("BE: FAILED, using meta description outlier", 2)
            start_bod = meta_description_outliers[0]
        else:
            #log.plog(("BE: FAILED: missed the body altogether, let's just grab from the start : "+ str(start_bod) + " to " + str(len(lines))), 2)
            pass
        blocks = get_body(start_bod, len(lines), lines, dates)

    if len(blocks) == 0:
        #log.plog(("BE: FAILED: I couldn't pull anything, returning to None"), 2)
        return None
    body = []
    for i in range(len(blocks)):
        #title shouldn't be in blocks, unless block is really much longer than title
        # (200 chars)
        # Escape the title so any regex metacharacters in it match literally.
        if not re.search(re.escape(title),
                         blocks[i]) or len(blocks[i]) > 200 + len(title):
            body.append(blocks[i])

    if body:
        ## for zezt, try preserving line breaks
        if 'preserve_breaks' in infoModule.info.site and infoModule.info.site[
                'preserve_breaks'] == True:
            story = "[linebreak]".join(body)
        else:
            story = " ".join(body)

        three_word_ratio = triple_ratio(story)
        #log.plog("three_word_ratio: " + str(three_word_ratio), 2)
        if three_word_ratio < 0.08 or three_word_ratio > 0.3:
            #log.plog("BE: Not English!", 3)
            return None
        story = re.sub("\n", "", story)
        story = re.sub("<.*?>", "", story)
        #log.plog(("BE: I found something, returning story"), 2)
        #log.plog((story), 2)
        #if 'preserve_breaks' in infoModule.info.site and infoModule.info.site['preserve_breaks'] == True:
        #    story = story.replace("[linebreak]", "<br />\n&nbsp;&nbsp;")
        #manual fix inserted by ernst, 1/29/11
        story = re.sub(r'Email Sent! You have successfully emailed the post.',
                       '', story)
        return story
    else:
        #log.plog("BE: no body")
        return None
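# A minimal, hypothetical usage sketch (not part of the original module).
# It assumes the module's missing imports (`re`, numpy's `amax`/`std`,
# `infoModule`, and the helper functions used above) are available, and that
# infoModule.info.page / infoModule.info.site were populated upstream:
#
#     infoModule.info.page['title'] = 'Some Article Title'
#     infoModule.info.page['meta_description'] = ''
#     infoModule.info.page['confidence'] = 5
#     infoModule.info.site = {}
#
#     raw_text = open('page_dump.txt').read()
#     story = extract(raw_text, doAsciiConvert=True)
#     if story is None:
#         print "body extraction failed"
#     else:
#         print story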