def extract_date(txt):
    """Parse free-form text into a datetime, or return None on failure.

    txt -- a string that may contain a (possibly partial) date/time.
    Fields missing from the text are filled with fixed defaults
    (day=1, midnight).  NOTE(review): the filler supplies no month,
    so month-less dates presumably fail and yield None -- confirm
    against fuzzydate's semantics.
    """
    # TODO: provide default timezone based on guessed country (prob from domain name)
    filler = fuzzydate.fuzzydate(day=1, hour=0, minute=0, second=0, microsecond=0)
    fd = fuzzydate.parse_datetime(txt)
    try:
        return fd.datetime(filler)
    except Exception:
        # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed; any parse/assembly failure -> None
        return None
def extract_date(txt):
    """Parse free-form text into a datetime, or return None on failure.

    txt -- a string that may contain a (possibly partial) date/time.
    Fields missing from the text are filled with fixed defaults
    (day=1, midnight).  NOTE(review): the filler supplies no month,
    so month-less dates presumably fail and yield None -- confirm
    against fuzzydate's semantics.
    """
    # TODO: provide default timezone based on guessed country (prob from domain name)
    filler = fuzzydate.fuzzydate(day=1, hour=0, minute=0, second=0, microsecond=0)
    fd = fuzzydate.parse_datetime(txt)
    try:
        return fd.datetime(filler)
    except Exception:
        # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed; any parse/assembly failure -> None
        return None
def testExamplesFromWild(self):
    """Run parse_datetime over real-world samples and compare to expectations."""
    # fixed defaults so partially-specified dates compare deterministically
    defaults = fuzzydate(month=1, day=1, hour=0, minute=0, second=0, microsecond=0)
    for example in self.examples_from_the_wild:
        raw = example[0]
        got = parse_datetime(raw).datetime(defaults)
        if example[1] is None:
            expected = None
        else:
            expected = datetime.datetime(*example[1])
        self.assertEqual(got, expected,
                         "'%s': expected '%s', got '%s')" % (raw, expected, got))
def extract_pubdate(doc, url, headline_linenum):
    """ returns date,linenum

    Heuristically find an article's publication date.

    doc -- parsed html tree (lxml-style: .findall/.get/.getparent etc)
    url -- the article url (date slugs in the url are trusted outright)
    headline_linenum -- source line of the headline element, used for
        a proximity bonus; pass <=0 to disable that test.

    Despite the docstring summary above, the actual return is
    (datetime, node): the winning datetime plus the element it came
    from, or (datetime, None) when the date was taken from the url
    slug, or (None, None) when nothing was found.
    """
    # candidates maps date -> best-scoring {'datetime','score','node'} seen for that date
    candidates = {}
    logging.debug("extracting pubdate")
    # TODO: try some definitive meta tags first?
    # "DCSext.articleFirstPublished"
    # "DC.date.issued"
    # "last-modified"
    # check for date in slug
    for pat in pats.pubdate['url_datefmts']:
        m = pat.search(url)
        if m is not None:
            # a date embedded in the url slug is considered definitive
            d = datetime.datetime(int(m.group('year')), int(m.group('month')), int(m.group('day')))
            logging.debug(" using %s from url" % (d,))
            return d, None
    # collect dates from likely-looking <meta> tags; used later as a
    # corroboration signal for dates found in the page body
    meta_dates = set()
    for meta in doc.findall('.//meta'):
        n = meta.get('name', meta.get('property', ''))
        if pats.pubdate['metatags'].search(n):
            logging.debug(" date: consider meta name='%s' content='%s'" % (n, meta.get('content', '')))
            fuzzy = fuzzydate.parse_datetime(meta.get('content', ''))
            if not fuzzy.empty_date():
                meta_dates.add(fuzzy.date(fuzzydate.fuzzydate(day=1)))
    # if len(meta_dates)==1:
    #     # only one likely-looking <meta> entry - lets go with it
    #     d = list(meta_dates)[0]
    #     logging.debug(" using %s from <meta>" % (d,))
    #     return d,None
    # start looking through whole page
    for e in util.tags(doc, 'p', 'span', 'div', 'li', 'td', 'th', 'h4', 'h5', 'h6', 'font'):
        # normalise whitespace in the element's full text
        txt = unicode(e.text_content()).strip()
        txt = u' '.join(txt.split())
        # discard anything too short or long
        if len(txt) < 6 or len(txt) > 150:
            continue
        score = 1
        dt = extract_date(txt)
        if dt is None:
            continue
        logging.debug(" date: considering %s '%s'" % (e.tag, txt))
        # TEST: proximity to headline in html
        if headline_linenum > 0 and e.sourceline > 0:
            dist = e.sourceline - headline_linenum
            # asymmetric window: dates usually appear just after the headline
            if dist > -10 and dist < 25:
                logging.debug(" near headline")
                score += 1
        # TEST: likely class or id?
        if pats.pubdate['classes'].search(e.get('class', '')):
            logging.debug(" likely class")
            score += 1
        if pats.pubdate['classes'].search(e.get('id', '')):
            logging.debug(" likely id")
            score += 1
        # in byline is also a good indicator
        if pats.byline['classes'].search(e.get('class', '')):
            logging.debug(" likely class")
            score += 1
        if pats.byline['classes'].search(e.get('id', '')):
            logging.debug(" likely id")
            score += 1
        # TEST: also appears in likely <meta> tags?
        if dt.date() in meta_dates:
            logging.debug(" appears in <meta>")
            score += 1
        # TEST: not within likely-looking comment container?
        # walk ancestors looking for a comment-section wrapper
        in_comment = False
        foo = e.getparent()
        while foo is not None:
            if pats.pubdate['comment_classes'].search(foo.get('class', '')):
                in_comment = True
                break
            foo = foo.getparent()
        if not in_comment:
            logging.debug(" not inside likely comment")
            score += 1
        # TEST: indicative text? ("posted on" , "last updated" etc...)
        if pats.pubdate['pubdate_indicator'].search(txt):
            logging.debug(" text indicative of pubdate")
            score += 1
        # TEST: date appears in url? eg "http://blah.com/blahblah-20100801-blah.html"
        # patterns allow an optional separator and optional zero-padding
        if re.compile("%d[-_/.]?0?%d[-_/.]?0?%d" % (dt.year, dt.month, dt.day)).search(url):
            logging.debug(" full date appears in url")
            score += 2
        elif re.compile("%d[-_/.]?0?%d" % (dt.year, dt.month)).search(url):
            logging.debug(" year and month appear in url")
            score += 1
        # keep only the highest-scoring element for each distinct date
        if dt.date() not in candidates or score > candidates[dt.date()]['score']:
            candidates[dt.date()] = {'datetime': dt, 'score': score, 'node': e}
    if not candidates:
        return None, None
    # pick the candidate with the highest score
    out = sorted(candidates.items(), key=lambda item: item[1]['score'], reverse=True)
    # print "========="
    # pprint( out[:5] )
    # print "========="
    best = out[0][1]
    return best['datetime'], best['node']
def extract_pubdate(doc, url, headline_linenum):
    """ returns date,linenum

    Heuristically find an article's publication date.

    doc -- parsed html tree (lxml-style: .findall/.get/.getparent etc)
    url -- the article url (date slugs in the url are trusted outright)
    headline_linenum -- source line of the headline element, used for
        a proximity bonus; pass <=0 to disable that test.

    Despite the docstring summary above, the actual return is
    (datetime, node): the winning datetime plus the element it came
    from, or (datetime, None) when the date was taken from the url
    slug, or (None, None) when nothing was found.
    """
    # candidates maps date -> best-scoring {'datetime','score','node'} seen for that date
    candidates = {}
    logging.debug("extracting pubdate")
    # TODO: try some definitive meta tags first?
    # "DCSext.articleFirstPublished"
    # "DC.date.issued"
    # "last-modified"
    # check for date in slug
    for pat in pats.pubdate['url_datefmts']:
        m = pat.search(url)
        if m is not None:
            # a date embedded in the url slug is considered definitive
            d = datetime.datetime(int(m.group('year')), int(m.group('month')), int(m.group('day')))
            logging.debug(" using %s from url" % (d, ))
            return d, None
    # collect dates from likely-looking <meta> tags; used later as a
    # corroboration signal for dates found in the page body
    meta_dates = set()
    for meta in doc.findall('.//meta'):
        n = meta.get('name', meta.get('property', ''))
        if pats.pubdate['metatags'].search(n):
            logging.debug(" date: consider meta name='%s' content='%s'" % (n, meta.get('content', '')))
            fuzzy = fuzzydate.parse_datetime(meta.get('content', ''))
            if not fuzzy.empty_date():
                meta_dates.add(fuzzy.date(fuzzydate.fuzzydate(day=1)))
    # if len(meta_dates)==1:
    #     # only one likely-looking <meta> entry - lets go with it
    #     d = list(meta_dates)[0]
    #     logging.debug(" using %s from <meta>" % (d,))
    #     return d,None
    # start looking through whole page
    for e in util.tags(doc, 'p', 'span', 'div', 'li', 'td', 'th', 'h4', 'h5', 'h6', 'font'):
        # normalise whitespace in the element's full text
        txt = unicode(e.text_content()).strip()
        txt = u' '.join(txt.split())
        # discard anything too short or long
        if len(txt) < 6 or len(txt) > 150:
            continue
        score = 1
        dt = extract_date(txt)
        if dt is None:
            continue
        logging.debug(" date: considering %s '%s'" % (e.tag, txt))
        # TEST: proximity to headline in html
        if headline_linenum > 0 and e.sourceline > 0:
            dist = e.sourceline - headline_linenum
            # asymmetric window: dates usually appear just after the headline
            if dist > -10 and dist < 25:
                logging.debug(" near headline")
                score += 1
        # TEST: likely class or id?
        if pats.pubdate['classes'].search(e.get('class', '')):
            logging.debug(" likely class")
            score += 1
        if pats.pubdate['classes'].search(e.get('id', '')):
            logging.debug(" likely id")
            score += 1
        # in byline is also a good indicator
        if pats.byline['classes'].search(e.get('class', '')):
            logging.debug(" likely class")
            score += 1
        if pats.byline['classes'].search(e.get('id', '')):
            logging.debug(" likely id")
            score += 1
        # TEST: also appears in likely <meta> tags?
        if dt.date() in meta_dates:
            logging.debug(" appears in <meta>")
            score += 1
        # TEST: not within likely-looking comment container?
        # walk ancestors looking for a comment-section wrapper
        in_comment = False
        foo = e.getparent()
        while foo is not None:
            if pats.pubdate['comment_classes'].search(foo.get('class', '')):
                in_comment = True
                break
            foo = foo.getparent()
        if not in_comment:
            logging.debug(" not inside likely comment")
            score += 1
        # TEST: indicative text? ("posted on" , "last updated" etc...)
        if pats.pubdate['pubdate_indicator'].search(txt):
            logging.debug(" text indicative of pubdate")
            score += 1
        # TEST: date appears in url? eg "http://blah.com/blahblah-20100801-blah.html"
        # patterns allow an optional separator and optional zero-padding
        if re.compile("%d[-_/.]?0?%d[-_/.]?0?%d" % (dt.year, dt.month, dt.day)).search(url):
            logging.debug(" full date appears in url")
            score += 2
        elif re.compile("%d[-_/.]?0?%d" % (dt.year, dt.month)).search(url):
            logging.debug(" year and month appear in url")
            score += 1
        # keep only the highest-scoring element for each distinct date
        if dt.date() not in candidates or score > candidates[dt.date()]['score']:
            candidates[dt.date()] = {'datetime': dt, 'score': score, 'node': e}
    if not candidates:
        return None, None
    # pick the candidate with the highest score
    out = sorted(candidates.items(), key=lambda item: item[1]['score'], reverse=True)
    # print "========="
    # pprint( out[:5] )
    # print "========="
    best = out[0][1]
    return best['datetime'], best['node']