def __init__(self, feed, cachename='cache'):
    self.error = False
    if type(feed) is list:
        self.rss_entries = list()
        for f in feed:
            p = feedparser.parse(f)
            if p.bozo:
                self.error = True
            else:
                self.rss_entries.append(p['entries'])
        # Interleave the feeds
        self.rss_entries = _interleave(self.rss_entries)
    else:
        p = feedparser.parse(feed)
        if p.bozo:
            self.error = True
        else:
            self.rss_entries = p['entries']
    if self.error:
        self.parser = None
        self.cache = None
        self.fetch = None
        self.rss_entries = None
        self.entries = None
        return
    self.parser = AdvancedHTMLParser.IndexedAdvancedHTMLParser(indexNames=False)
    #self.parser.addIndexOnAttribute('property')
    self.cache = dict()
    self.cachename = cachename
    self.fetch = fetch.Fetcher()
    _cachedir = _config['cachedir']
    try:
        with open(f"{_cachedir}/{cachename}.cache", 'rb') as f:
            self.cache = pickle.load(f)
        # Expire cached items older than five days; iterate over a copy of
        # the keys since we delete from the dict as we go.
        for x in list(self.cache.keys()):
            pubdate = self.cache[x]['published']
            if datetime.datetime.now() - pubdate > datetime.timedelta(days=5):
                del self.cache[x]
    except (OSError, pickle.PickleError):
        # No cache yet, or it is unreadable; start fresh.
        pass
    if len(self.rss_entries) == 0:
        print("ERROR: empty rss list")
    self.entries = list()
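# _interleave is called above but not defined in this excerpt. A minimal
# sketch of the round-robin merge that the "Interleave the feeds" comment
# suggests might look like this (an assumption, not necessarily the
# project's actual implementation):
import itertools

def _interleave(lists):
    # Take one entry from each feed in turn until every feed is exhausted.
    result = []
    for group in itertools.zip_longest(*lists):
        result.extend(e for e in group if e is not None)
    return result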
def league_table(url, cache):
    # Conditional GET: send the cached etag so the server can answer 304.
    if url in cache:
        entry = cache[url]
        headers = { 'If-None-Match': entry['etag'] }
    else:
        entry = None
        headers = None
    f = fetch.Fetcher()
    resp = f.get(url, headers=headers)
    if resp.status_code == 304:
        return entry['value']
    if resp.status_code != 200:
        print(f"{resp.status_code} on {url}")
        return None
    if entry and resp.headers.get('etag') == entry['etag']:
        return entry['value']
    print(f"Cache miss {url}", flush=True)
    parser = AdvancedHTMLParser.IndexedAdvancedHTMLParser()
    parser.parseStr(resp.text)
    tables = parser.getElementsByTagName('table')
    rows = tables[0].getAllChildNodes().getElementsByTagName('tr')
    table = []
    # Skip the header and trailing rows.
    for row in rows[1:-1]:
        data = row.getAllChildNodes().getElementsByTagName('td')
        r = []
        # Indices of the table cells we keep.
        for d in [2, 3, 4, 5, 6, 7, 8, 10]:
            r.append(data[d].textContent)
        r.append(row.attributes['class'] == "gs-o-table__row--break")
        table.append(r)
    time = parser.getElementsByTagName('time')
    time = dateutil.parser.isoparse(time[0].attributes['datetime'])
    league = parser.getElementsByTagName('h1')
    league = league[0].textContent
    value = (league, time, table)
    cache[url] = dict(value=value, etag=resp.headers.get('etag'))
    return value
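# A short usage sketch for the conditional-GET cache pattern above: the cache
# is a plain dict keyed by URL, so it can be persisted with pickle between
# runs. The file path and URL here are hypothetical.
import pickle

try:
    with open('tables.cache', 'rb') as fp:
        cache = pickle.load(fp)
except (OSError, pickle.PickleError):
    cache = {}

result = league_table('https://example.org/some/league/table', cache)
if result:
    league, fetched_at, rows = result

# league_table() updates the dict in place, so re-pickle it afterwards.
with open('tables.cache', 'wb') as fp:
    pickle.dump(cache, fp)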
def process_records(queue, rule, wb):
    newqueue = []
    for record in queue:
        maybesave(wb, queue)
        url = record.get("url")
        try:
            (fp, filename) = io.get_tempfile()
            f = fetch.Fetcher(mode=record.get("mode"), url=url, filename=filename)
            url = get_url(f, wb, host_filter=rule.get("host_filter"))
            filename = f.filename
            # consider retrying the fetch if it failed
            if f.error and fetch.err.is_temporal(f.error):
                if not record.get("retry"):
                    record["retry"] = True
                    queue.append(record)
            if record.get("mode") == fetch.Fetcher.SPIDER:
                with open(filename, 'r') as infile:
                    data = infile.read()
                urls = spider.unbox_it_to_ss(spider.findall(data, url))
                urls = urlrewrite.rewrite_urls(url, urls)
                (newqueue, wb) = qualify_urls(url, urls, rule, newqueue, wb)
            if record.get("mode") == fetch.Fetcher.FETCH:
                shutil.move(filename,
                            io.safe_filename(urlrewrite.url_to_filename(url)))
        except (fetch.DuplicateUrlWarning, fetch.UrlRedirectsOffHost):
            pass
        except KeyboardInterrupt:
            # Save the unprocessed remainder of the queue before exiting.
            q = queue[queue.index(record):]
            q.extend(newqueue)
            save_session(wb, queue=q)
            sys.exit(1)
        except Exception as exc:
            log_exc(exc, url, wb)
        finally:
def cricket_scorecard_table(url, cache):
    if url in cache:
        entry = cache[url]
        headers = { 'If-None-Match': entry['etag'] }
    else:
        entry = None
        headers = None
    f = fetch.Fetcher()
    resp = f.get(url, headers=headers)
    if resp.status_code == 304:
        return entry['value']
    if resp.status_code != 200:
        print(f"{resp.status_code} on {url}")
        return None
    if entry and resp.headers.get('etag') == entry['etag']:
        return entry['value']
    print(f"Cache miss {url}", flush=True)
    parser = AdvancedHTMLParser.IndexedAdvancedHTMLParser()
    parser.parseStr(resp.text)
    article = parser.getElementsByTagName('article')[0]
    match = _get_span(article, 'sp-c-fixture__title')
    home_fix = article.getElementsByClassName('sp-c-fixture__team--time-home')
    away_fix = article.getElementsByClassName('sp-c-fixture__team--time-away')
    home_name = _get_span(home_fix, 'sp-c-fixture__team-name-trunc', 'abbr')
    away_name = _get_span(away_fix, 'sp-c-fixture__team-name-trunc', 'abbr')
    home_scores = _get_span(home_fix, 'sp-c-fixture__cricket-score',
                            as_list=True, ignore='gs-u-vh')
    away_scores = _get_span(away_fix, 'sp-c-fixture__cricket-score',
                            as_list=True, ignore='gs-u-vh')
    # Normalize scores: collapse non-breaking spaces and "123 - 4" to "123-4".
    if home_scores:
        home_scores = [ l.replace('\xa0', ' ').replace(' - ', '-').strip()
                        for l in home_scores ]
    else:
        home_scores = []
    if away_scores:
        away_scores = [ l.replace('\xa0', ' ').replace(' - ', '-').strip()
                        for l in away_scores ]
    else:
        away_scores = []
    status = _get_span(article, 'sp-c-fixture__win-message')
    innings = []
    for number in range(4):
        bats = parser.getElementById(f"batting-table{number+1}")
        if bats:
            bats = bats.getChildren()
        else:
            continue
        bowls = parser.getElementById(f"bowling-table{number+1}")
        if bowls:
            bowls = bowls.getChildren()
        falls = parser.getElementById(f"fall-of-wicket-table{number+1}")
        if falls:
            falls = falls.getChildren()
        title = _get_span(bats, 'gs-u-align-left', 'h2')
        bat_foot = bats.getElementsByTagName('tfoot')
        tot_overs = _get_span(bat_foot, 'qa-overs')
        tot_runs = _get_span(bat_foot, 'qa-runs', ignore='gs-u-vh')
        bat_body = bats.getElementsByTagName('tbody')
        bat_lines = []
        for r in bat_body.getElementsByTagName('tr'):
            vals = _get_span(r, 'gs-o-table__cell', as_list=True)
            bat_lines.append(vals)
        # The fall-of-wickets table may be absent for an innings in progress.
        fall_lines = []
        if falls:
            falls_body = falls.getElementsByTagName('tbody')
            for r in falls_body.getElementsByTagName('tr'):
                vals = _get_span(r, 'gs-o-table__cell', as_list=True)
                fall_lines.append(vals)
        innings.append(dict(
            name=title,
            runs=tot_runs,
            overs=tot_overs,
            batting=bat_lines,
            falls=fall_lines,
        ))
    # The event metadata is a <dt>/<dd> list; pair terms with values.
    metas = parser.getElementById('event-meta').getChildren()
    metas1 = [ k.replace(':', '').lower()
               for k in _get_span(metas, tag_name='dt', as_list=True) ]
    metas2 = _get_span(metas, tag_name='dd', as_list=True)
    metas = dict(zip(metas1, metas2))
    toss = metas['toss']
    toss = toss.replace(' won the ', ' won ').replace(' and decided to ', ': ')
    venue = metas['venue']
    if ',' in venue:
        _, _, venue = venue.rpartition(',')
        venue = venue.strip()
    table = dict(
        match=match,
        home_name=home_name,
        home_scores=home_scores,
        away_name=away_name,
        away_scores=away_scores,
        status=status,
        innings=innings,
        toss=toss,
        venue=venue,
    )
    cache[url] = dict(value=table, etag=resp.headers.get('etag'))
    return table
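# _get_span is used throughout these scrapers but is not defined in this
# excerpt. A minimal sketch of what it might look like, with the signature
# inferred from the call sites above (argument names, defaults, and the
# semantics of ignore/sub_class are assumptions, not the project's actual
# implementation):
def _get_span(node, class_name=None, tag_name=None, index=0,
              as_list=False, ignore=None, sub_class=None):
    # Collect candidates by class name, optionally narrowed by tag name,
    # falling back to a plain tag-name lookup.
    if class_name is not None:
        els = node.getElementsByClassName(class_name)
        if tag_name is not None:
            els = [e for e in els if e.tagName == tag_name]
    else:
        els = node.getElementsByTagName(tag_name)
    # Descend into sub-elements when a sub_class is given.
    if sub_class is not None:
        els = [s for e in els for s in e.getElementsByClassName(sub_class)]
    # Drop elements carrying the "ignore" class (e.g. visually-hidden text).
    if ignore is not None:
        els = [e for e in els if ignore not in (e.getAttribute('class') or '')]
    texts = [e.textContent.strip() for e in els]
    if as_list:
        return texts or None
    return texts[index] if len(texts) > index else None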
def football_gossip_entries(url, cache):
    seen = []
    if url in cache:
        entry = cache[url]
        etag = entry['etag']
        if 'seen' in entry:
            seen = entry['seen']
        headers = { 'If-None-Match': entry['etag'] }
    else:
        etag = None
        entry = None
        headers = None
    f = fetch.Fetcher()
    # "seen" works around the CDN sending different etags occasionally:
    # probe with HEAD first, and if the new etag is one we have already
    # processed, adopt it without refetching the body.
    resp = f.head(url, headers=headers)
    if resp.status_code == 304:
        return entry['value']
    if resp.status_code == 200:
        newetag = resp.headers.get('etag')
        if newetag in seen:
            cache[url]['etag'] = newetag
            return entry['value']
    resp = f.get(url, headers=headers)
    if resp.status_code == 304:
        return entry['value']
    if resp.status_code != 200:
        print(f"{resp.status_code} on {url}")
        return None
    if entry and (resp.headers.get('etag') == etag):
        return entry['value']
    print(f"Cache miss {url}", flush=True)
    # Remember the last ten etags seen for this url.
    seen.append(resp.headers.get('etag'))
    seen = seen[-10:]
    parser = AdvancedHTMLParser.IndexedAdvancedHTMLParser()
    parser.parseStr(resp.content.decode("utf-8"))
    paragraphs = []
    div = parser.getElementById('story-body')
    children = div.getAllChildNodes().getElementsByTagName('p')
    for p in children:
        # Split each paragraph into a bold lead-in (head), the body text
        # (line), and the linked source attribution (tail).
        head = ""
        line = ""
        tail = ""
        first = True
        for c in p.childBlocks:
            if type(c) is str:
                if c:
                    if first:
                        head += c
                    else:
                        line += c
            else:
                if c.nodeName == 'b' and first:
                    head += c.textContent
                    first = False
                elif c.nodeName == 'a':
                    tail = c.textContent
                else:
                    line += c.textContent
        line = line.replace('\xa0', ' ').strip()
        line = line.replace('()', '').strip()
        while line.endswith('(') or line.endswith(')'):
            line = line[:-1]
        head = head.replace('\xa0', ' ').strip()
        tail = tail.replace('(', '')
        tail = tail.replace(')', '').strip()
        # Only keep fully-formed entries; wrap the attribution in
        # parentheses once we know it is non-empty.
        if head and line and tail:
            paragraphs.append((head, line, f"({tail})"))
    cache[url] = dict(value=paragraphs, etag=resp.headers.get('etag'), seen=seen)
    return paragraphs
def fixtures_table(url, cache):
    if url in cache:
        entry = cache[url]
        headers = { 'If-None-Match': entry['etag'] }
    else:
        entry = None
        headers = None
    f = fetch.Fetcher()
    resp = f.get(url, headers=headers)
    if resp.status_code == 304:
        return entry['value']
    if resp.status_code != 200:
        print(f"{resp.status_code} on {url}")
        return None
    if entry and resp.headers.get('etag') == entry['etag']:
        return entry['value']
    print(f"Cache miss {url}", flush=True)
    parser = AdvancedHTMLParser.IndexedAdvancedHTMLParser()
    parser.parseStr(resp.text)
    divs = parser.getElementsByClassName('qa-match-block')
    table = []
    for div_row in divs:
        children = div_row.getAllChildNodes()
        league = children.getElementsByTagName('h3')
        league = league[0].textContent.upper()
        if league not in _config['football_fixture_leagues']:
            continue
        round_ = children.getElementsByTagName('h4')
        if len(round_):
            round_ = round_[0].textContent
        else:
            round_ = None
        block = []
        matches = children.getElementsByTagName('ul')
        matches = matches[0].getAllChildNodes().getElementsByTagName('li')
        for match in matches:
            nodes = match.getAllChildNodes()
            home_team = _get_span(nodes, 'sp-c-fixture__team-name-trunc', 'abbr', 0)
            away_team = _get_span(nodes, 'sp-c-fixture__team-name-trunc', 'abbr', 1)
            home_goals = _get_span(nodes, 'sp-c-fixture__number--home')
            away_goals = _get_span(nodes, 'sp-c-fixture__number--away')
            kickoff = _get_span(nodes, 'sp-c-fixture__block--time')
            status = _get_span(nodes, 'sp-c-fixture__aside')
            if not status:
                status = _get_span(nodes, 'sp-c-fixture__status')
            if status:
                if "abandoned" in status:
                    home_goals = "A"
                    away_goals = "A"
                    kickoff = None
                status = status.replace("Match postponed -", "")
                status = status.replace("Match abandoned -", "")
                status = status.replace(" mins", "min")
                # Strip non-breaking spaces from the source HTML.
                status = status.replace('\xa0', '')
            block.append(dict(
                home_team=home_team,
                away_team=away_team,
                home_goals=home_goals,
                away_goals=away_goals,
                status=status,
                kickoff=kickoff,
            ))
        table.append(dict(
            league=league,
            round_=round_,
            matches=block,
        ))
    cache[url] = dict(value=table, etag=resp.headers.get('etag'))
    return table
def cricket_fixtures_table(url, cache):
    if url in cache:
        entry = cache[url]
        headers = { 'If-None-Match': entry['etag'] }
    else:
        entry = None
        headers = None
    f = fetch.Fetcher()
    resp = f.get(url, headers=headers)
    if resp.status_code == 304:
        return entry['value']
    if resp.status_code != 200:
        print(f"{resp.status_code} on {url}")
        return None
    if entry and resp.headers.get('etag') == entry['etag']:
        return entry['value']
    print(f"Cache miss {url}", flush=True)
    parser = AdvancedHTMLParser.IndexedAdvancedHTMLParser()
    parser.parseStr(resp.text)
    spans = parser.getElementsByClassName('qa-fixture-block')
    table = []
    for span_row in spans:
        children = span_row.getAllChildNodes()
        series = children.getElementsByTagName('h3')
        series = series[1].textContent.upper()
        if series not in _config['cricket_series']:
            continue
        block = []
        matches = children.getElementsByTagName('ul')
        matches = matches[0].getAllChildNodes().getElementsByTagName('li')
        for match in matches:
            nodes = match.getAllChildNodes()
            link = nodes.getElementsByTagName('a')
            if link:
                link = link[0].getAttribute('href')
            else:
                link = None
            home_team = _get_span(nodes, 'sp-c-head-to-head__team-name-trunc', 'abbr', 0)
            away_team = _get_span(nodes, 'sp-c-head-to-head__team-name-trunc', 'abbr', 1)
            home_score = _get_span(nodes, 'sp-c-head-to-head__home-team-scores',
                                   sub_class='sp-c-head-to-head__cricket-score',
                                   as_list=True)
            away_score = _get_span(nodes, 'sp-c-head-to-head__away-team-scores',
                                   sub_class='sp-c-head-to-head__cricket-score',
                                   as_list=True)
            status = _get_span(nodes, 'sp-c-head-to-head__status')
            title = _get_span(nodes, 'sp-c-head-to-head__title')
            venue = _get_span(nodes, 'sp-c-head-to-head__venue')
            time = _get_span(nodes, 'qa-score-time')
            home_batting = _get_span(nodes, 'sp-c-head-to-head__team-indicator--home')
            away_batting = _get_span(nodes, 'sp-c-head-to-head__team-indicator--away')
            block.append(dict(
                home_team=home_team,
                away_team=away_team,
                home_score=home_score,
                away_score=away_score,
                status=status,
                link=link,
                title=title,
                time=time,
                venue=venue,
                home_batting=bool(home_batting),
                away_batting=bool(away_batting),
            ))
        table.append(dict(
            series=series,
            matches=block,
        ))
    cache[url] = dict(value=table, etag=resp.headers.get('etag'))
    return table
def main(argv=None):
    import sys
    # http://www.python.org/doc/2.4.4/lib/module-time.html
    import time

    if argv is None:
        argv = sys.argv
    options, args = parse_options(argv[1:])
    update_parameters(options.parameter)
    step_list = options.steps
    try:
        rootdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        if os.getcwd() != rootdir:
            raise Fatal("The GISTEMP procedure must be run from the root "
                        "directory of the project.\nPlease change directory "
                        "to %s and try again." % rootdir)

        # Carry out preflight checks and fetch missing files.
        import fetch
        fetcher = fetch.Fetcher()
        fetcher.fetch()

        # Create all the temporary directories we're going to use.
        for d in ['log', 'result', 'work']:
            mkdir(d)

        step_fn = {
            '0': run_step0,
            '1': run_step1,
            '2': run_step2,
            '3': run_step3,
            '3c': run_step3c,
            '4': run_step4,
            '5': run_step5,
        }
        # Record start time now, and ending times for each step.
        start_time = time.time()

        cannot = [s for s in step_list if s not in step_fn]
        if cannot:
            raise Fatal("Can't run steps %s" % str(cannot))

        # Create a message for stdout.
        if len(step_list) == 1:
            logit = "STEP %s" % step_list[0]
        else:
            assert len(step_list) >= 2
            try:
                t = [str(s) for s in range(int(step_list[0]),
                                           int(step_list[-1]) + 1)]
            except ValueError:
                t = []
            if step_list == t:
                logit = "STEPS %s to %s" % (step_list[0], step_list[-1])
            else:
                logit = "STEPS %s" % ', '.join(step_list)
        log("====> %s ====" % logit)
        data = None
        for step in step_list:
            data = step_fn[step](data)
        # Consume the data in whatever the last step was, in order to
        # write its output, and hence suck data through the whole
        # pipeline.
        for _ in data:
            pass
        end_time = time.time()
        log("====> Timing Summary ====")
        log("Run took %.1f seconds" % (end_time - start_time))
        return 0
    except Fatal as err:
        sys.stderr.write(str(err))
        sys.stderr.write('\n')
        return 2
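# The step functions behave as generator stages: each takes the previous
# step's iterator and yields transformed records, so no work happens until
# the final loop consumes the last stage. A toy illustration of the pattern
# (these stage names are hypothetical, not the project's actual steps):
def stage_a(data):
    for n in range(3):
        yield n  # produce records lazily

def stage_b(data):
    for n in data:
        yield n * 10  # pull from the previous stage on demand

pipeline = stage_b(stage_a(None))
for _ in pipeline:  # consuming the last stage drives the whole chain
    pass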
def fetcher(self):
    if not self._fetcher:
        import fetch
        self._fetcher = fetch.Fetcher(self.handler)
    return self._fetcher
def dl_input_files():
    import fetch
    fetcher = fetch.Fetcher()
    fetcher.fetch()