import logging
import re

# fuzzydate, util, pats, intervening() and parse_byline_parts() are assumed
# to be provided elsewhere in this package.


def strip_date(s):
    """ remove all date/time bits from text """
    d, dspan = fuzzydate.parse_date(s)
    if dspan is not None:
        s = s[:dspan[0]] + s[dspan[1]:]
    t, tspan = fuzzydate.parse_time(s)
    if tspan is not None:
        s = s[:tspan[0]] + s[tspan[1]:]
    if tspan is not None or dspan is not None:
        # TODO: strip leftover "on" "at" etc...
        s = re.compile(r'\b(on|at|published|posted)\b[:]?', re.IGNORECASE).sub('', s)
    return s
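# A worked illustration of strip_date (a sketch; assumes fuzzydate recognises
# this particular date/time wording):
#
#   strip_date(u"Posted on 3 June 2011 at 2:15pm by Fred Bloggs")
#
# parse_date() removes "3 June 2011", parse_time() removes "2:15pm", and the
# final regex mops up the leftover "Posted", "on" and "at", leaving roughly
# u"   by Fred Bloggs" (surplus whitespace is the caller's problem).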
def parse_byline(candidate, all, headline_node):
    """ evaluate one candidate element as a byline, returning (authors, score) """
    authors = []
    score = 0.0
    txt = util.render_text(candidate)
    txt = u' '.join(txt.split()).strip()
    if len(txt) > 200:
        return (authors, score)
    logging.debug("byline: consider <%s> '%s'" % (candidate.tag, txt[:75]))

#    if candidate.tag == 'a':
#        score += eval_author_link(candidate)

    # split up using html structure
    parts = util.iter_text(candidate)

    # pass 1: check for and strip out parts with dates & times
    # TODO: this is a bit ruthless - could lose names if in same block
    parts2 = []
    for txt, el in parts:
        is_pubdate_frag = False
        if pats.pubdate['pubdate_indicator'].search(txt):
            is_pubdate_frag = True
        d, dspan = fuzzydate.parse_date(txt)
        if dspan is not None:
            logging.debug(" +0.1 contains date")
            score += 0.1
            is_pubdate_frag = True
        t, tspan = fuzzydate.parse_time(txt)
        if tspan is not None:
            logging.debug(" +0.1 contains time")
            score += 0.1
            is_pubdate_frag = True
        if not is_pubdate_frag:
            parts2.append((txt, el))

    # pass 2: split up text on likely separators - "and", "with", "in",
    # or any run of non-alphabetic chars
    # (capturing groups are included in the results)
    split_pat = re.compile(r'((?:\b(?:and|with|in)\b)|(?:[^-_.\w\s]+))',
                           re.IGNORECASE | re.UNICODE)
    parts3 = []
    for txt, el in parts2:
        for frag in split_pat.split(txt):
            parts3.append((frag.strip(), el))

    # pass 3: split out indicatives ("by", "posted by" etc)
    parts4 = []
    for txt, el in parts3:
        for frag in pats.byline['indicative'].split(txt):
            parts4.append((frag, el))

    # clean up
    parts4 = [(txt.strip(), el) for txt, el in parts4]
    parts4 = [(txt, el) for txt, el in parts4 if txt != u'']

    # now run through classifying and collecting authors
    # (the classifier's score is added to the date/time points
    # accumulated above, so they aren't discarded)
    authors, part_score = parse_byline_parts(parts4)
    score += part_score

    # TEST: likely-looking class or id
    if pats.byline['classes'].search(candidate.get('class', '')):
        logging.debug(" +1 likely class")
        score += 1.0
    if pats.byline['classes'].search(candidate.get('id', '')):
        logging.debug(" +1 likely id")
        score += 1.0

    # TEST: directly after headline?
    between = intervening(headline_node, candidate, all)
    if between is not None and len(between) == 0:
        logging.debug(" +0.5 directly after headline")
        score += 0.5

    logging.debug(" total: %.3f" % (score,))
    return (authors, score)
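# A worked example of the passes above (a sketch; assumes fuzzydate and the
# pats regexes behave as described, and that the indicative pattern keeps
# "By" as its own fragment when splitting):
#
#   <p class="byline">By Fred Bloggs and Jane Doe <span>3 June 2011</span></p>
#
# util.iter_text() yields two parts. pass 1 drops the "3 June 2011" part
# (+0.1 for the date - and note the TODO: any name sharing that block would
# be lost too). pass 2 splits the survivor on "and", pass 3 splits "By" off
# "Fred Bloggs", and the clean-up leaves roughly
#   [("By", el), ("Fred Bloggs", el), ("and", el), ("Jane Doe", el)]
# for parse_byline_parts() to classify. The likely-looking class="byline"
# then adds a further +1.0 to the score.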