# Standard-library and Django imports these functions rely on; the
# project-local names (models, settings, load_article, get_all_article_urls,
# SlackClient, etc.) come from the surrounding package and are imported
# elsewhere in the original sources.
import logging
import os
import subprocess
import sys
import time
import traceback
from datetime import datetime, timedelta

from django.db.models import Q

logger = logging.getLogger(__name__)


def update_article(article):
    parsed_article = load_article(article.url)
    if parsed_article is None:
        return
    to_store = unicode(parsed_article).encode('utf8')
    t = datetime.now()
    logger.debug('Article parsed; trying to store')
    v, boring, diff_info = add_to_git_repo(to_store,
                                           url_to_filename(article.url),
                                           article)
    if v:
        logger.info('Modifying! new blob: %s', v)
        v_row = models.Version(v=v,
                               boring=boring,
                               title=parsed_article.title,
                               byline=parsed_article.byline,
                               date=t,
                               article=article,
                               )
        v_row.diff_info = diff_info
        v_row.save()
        if not boring:
            article.last_update = t
            article.save()
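# url_to_filename() is called above but not defined in this file. A minimal
# sketch of the mapping it presumably performs -- URL to a repo-relative,
# filesystem-safe path -- where the details are an assumption, not the
# project's actual implementation:
def url_to_filename(url):
    # Drop the scheme; the remaining host/path becomes the path of the
    # tracked file inside the article's git repo.
    return url.split('://', 1)[-1].rstrip('/')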
def update_versions(do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)
    update_priority = lambda x: x.minutes_since_check() * 1. / get_update_delay(x.minutes_since_update())
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)
    logger.info('Checking %s of %s articles', len(articles), total_articles)
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)', article.minutes_since_update(),
                     article.minutes_since_check(), update_priority(article),
                     i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)
                logger.error(traceback.format_exc())
        article.save()
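# get_update_delay() is used by the priority formula above
# (minutes_since_check / get_update_delay(minutes_since_update)) but is not
# defined in this file. It only makes sense if the delay grows as an article
# goes longer without changing. A hypothetical schedule with that shape; the
# actual thresholds are an assumption:
def get_update_delay(minutes_since_update):
    # Check fresh articles often, then back off toward daily checks.
    if minutes_since_update < 60 * 3:
        return 15           # first three hours: every 15 minutes
    elif minutes_since_update < 60 * 24:
        return 60           # first day: hourly
    elif minutes_since_update < 60 * 24 * 7:
        return 60 * 6       # first week: every six hours
    else:
        return 60 * 24      # older than a week: daily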
def update_article(article):
    parsed_article = load_article(article.url)
    if parsed_article is None:
        return
    to_store = unicode(parsed_article).encode('utf8')
    t = datetime.now()
    logger.debug('Article parsed; trying to store')
    v, boring, diff = add_to_git_repo(to_store,
                                      url_to_filename(article.url),
                                      article)
    if v:
        logger.info('Modifying! new blob: %s', v)
        v_row = models.Version(v=v,
                               boring=boring,
                               title=parsed_article.title,
                               byline=parsed_article.byline,
                               date=t,
                               article=article,
                               )
        v_row.diff_info = get_diff_info(diff)
        v_row.diff_details_json = diff
        v_row.update_severity(save=False)
        if not boring:
            article.last_update = t
        v_row.save()
        article.save()
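# get_diff_info() is not defined in this file, and the variants disagree on
# its signature: this version passes a precomputed diff, while
# add_to_git_repo() further down calls get_diff_info(previous, data). A
# minimal sketch of the two-argument form, assuming it just summarizes how
# much text changed (the field names are hypothetical):
import difflib

def get_diff_info(old, new):
    matcher = difflib.SequenceMatcher(a=old, b=new)
    chars_added = chars_removed = 0
    for op, i1, i2, j1, j2 in matcher.get_opcodes():
        if op in ('insert', 'replace'):
            chars_added += j2 - j1
        if op in ('delete', 'replace'):
            chars_removed += i2 - i1
    return {'chars_added': chars_added, 'chars_removed': chars_removed}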
def update_versions(todays_repo, do_all=False):
    logger.info('Looking for articles to check')
    # For memory issues, restrict to the last year of articles
    threshold = datetime.now() - timedelta(days=366)
    article_query = models.Article.objects.exclude(git_dir='old').filter(
        Q(last_update__gt=threshold) | Q(initial_date__gt=threshold))
    articles = list(article_query)
    total_articles = len(articles)
    update_priority = lambda x: x.minutes_since_check() * 1. / get_update_delay(x.minutes_since_update())
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)
    logger.info('Checking %s of %s articles', len(articles), total_articles)
    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'], models.GIT_DIR + todays_repo)
    except subprocess.CalledProcessError as e:
        print >> sys.stderr, 'Error on initial gc!'
        print >> sys.stderr, 'Output was """'
        print >> sys.stderr, e.output
        print >> sys.stderr, '"""'
        raise
    logger.info('Done with gc!')
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)', article.minutes_since_update(),
                     article.minutes_since_check(), update_priority(article),
                     i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        # isn't this inherent in update_priority being > 1 above?
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)
                logger.error(traceback.format_exc())
        article.save()
def update_articles(todays_git_dir):
    logger.info('Starting scraper; looking for new URLs')
    all_urls = get_all_article_urls()
    logger.info('Got all %s urls; storing to database' % len(all_urls))
    for i, url in enumerate(all_urls):
        logger.debug('Woo: %d/%d is %s' % (i + 1, len(all_urls), url))
        # Looks like it skips URLs longer than 255?
        if len(url) > 255:  # Icky hack, but otherwise they're truncated in DB.
            continue
        # Is there an index on this column?
        if not models.Article.objects.filter(url=url).count():
            logger.debug('Adding Article {0}'.format(url))
            models.Article(url=url, git_dir=todays_git_dir).save()
    logger.info('Done storing to database')
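# The two comments in update_articles() ask about the schema. A minimal
# sketch of the Article model this code appears to assume: a 255-character
# url column (hence the length check above) that wants an index so the
# filter(url=url) existence test stays cheap. Field details are assumptions;
# note this is django.db.models, not the project-local `models` module used
# elsewhere in this file.
from django.db import models as dj_models

class Article(dj_models.Model):
    url = dj_models.CharField(max_length=255, db_index=True)
    git_dir = dj_models.CharField(max_length=255)
    initial_date = dj_models.DateTimeField(auto_now_add=True)
    last_update = dj_models.DateTimeField(null=True, blank=True)
    last_check = dj_models.DateTimeField(null=True, blank=True)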
def is_boring(old, new):
    oldu = canonicalize(old.decode('utf8'))
    newu = canonicalize(new.decode('utf8'))
    if oldu.splitlines()[1:] == newu.splitlines()[1:]:
        return True
    for charset in CHARSET_LIST:
        try:
            if oldu.encode(charset) == new:
                logger.debug('Boring!')
                return True
        except UnicodeEncodeError:
            pass
    return False
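# canonicalize() and CHARSET_LIST are not defined in this file. The charset
# loop treats a change as boring when the old text re-encoded in some other
# charset equals the new bytes -- i.e. only the encoding moved, not the
# content. A sketch of what the missing pieces might look like; both
# definitions are assumptions:
CHARSET_LIST = ('us-ascii', 'iso-8859-1', 'windows-1252', 'utf8')

def canonicalize(text):
    # Normalize line endings and trailing whitespace so cosmetic churn
    # doesn't register as a new version.
    return '\n'.join(line.rstrip() for line in text.splitlines()) + '\n'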
def update_versions(todays_repo, do_all=False):
    articles = list(models.Article.objects.all())
    total_articles = len(articles)
    update_priority = lambda x: x.minutes_since_check() * 1. / get_update_delay(x.minutes_since_update())
    articles = sorted([a for a in articles if update_priority(a) > 1 or do_all],
                      key=update_priority, reverse=True)
    logger.info('Checking %s of %s articles', len(articles), total_articles)
    # Do git gc at the beginning, so if we're falling behind and killed
    # it still happens and I don't run out of quota. =)
    logger.info('Starting with gc:')
    try:
        run_git_command(['gc'], models.GIT_DIR + todays_repo)
    except subprocess.CalledProcessError as e:
        print >> sys.stderr, 'Error on initial gc!'
        print >> sys.stderr, 'Output was """'
        print >> sys.stderr, e.output
        print >> sys.stderr, '"""'
        raise
    logger.info('Done!')
    for i, article in enumerate(articles):
        logger.debug('Woo: %s %s %s (%s/%s)', article.minutes_since_update(),
                     article.minutes_since_check(), update_priority(article),
                     i + 1, len(articles))
        delay = get_update_delay(article.minutes_since_update())
        if article.minutes_since_check() < delay and not do_all:
            continue
        logger.info('Considering %s', article.url)
        article.last_check = datetime.now()
        try:
            update_article(article)
        except Exception as e:
            if isinstance(e, subprocess.CalledProcessError):
                logger.error('CalledProcessError when updating %s', article.url)
                logger.error(repr(e.output))
            else:
                logger.error('Unknown exception when updating %s', article.url)
                logger.error(traceback.format_exc())
        article.save()
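# Taken together, update_articles() and update_versions() read like the body
# of a Django management command run from cron. A sketch of plausible wiring
# (the repo naming and the entry point are assumptions, not the project's
# actual code):
from django.core.management.base import BaseCommand

class Command(BaseCommand):
    help = 'Scrape article URLs and commit changed versions to git'

    def handle(self, *args, **options):
        todays_repo = datetime.now().strftime('%Y-%m')  # hypothetical naming
        update_articles(todays_repo)
        update_versions(todays_repo)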
def update_article(article):
    parsed_article = load_article(article.url)
    if parsed_article is None:
        return
    to_store = unicode(parsed_article).encode('utf8')
    t = datetime.now()
    logger.debug('Article parsed; trying to store')
    v, boring, diff_info = add_to_git_repo(to_store,
                                           article.filename(),
                                           article)
    if v:
        logger.info('Modifying! new blob: %s', v)
        v_row = models.Version(v=v,
                               boring=boring,
                               title=parsed_article.title,
                               byline=parsed_article.byline,
                               date=t,
                               article=article,
                               )
        v_row.diff_info = diff_info
        v_row.save()
        if not boring:
            article.last_update = t
            article.save()
            # Notify the Slack user who submitted this article, if any.
            # (.filter().first() rather than .get(): there may be no
            # StandaloneArticle for this URL, and .get() would raise.)
            team = models.SlackBotTeam.objects.get()
            standalone = models.StandaloneArticle.objects.filter(
                url=article.url).first()
            if standalone:
                user = standalone.added_by
                link = '/article-history/%s' % article.url
                link = '%s%s' % (settings.WEBAPP_URL, link)
                bot_text = 'I see changes in %s (%s)' % (article.url, link)
                if not user.startswith('web_frontend'):
                    client = SlackClient(team.bot_access_token)
                    dm = client.api_call(method='im.open', user=user)
                    channel = (dm.get('channel') or {}).get('id')
                    channel = channel if channel else user
                    client.api_call(method='chat.postMessage',
                                    channel=channel,
                                    text=bot_text)
def run_git_command(command, git_dir, max_timeout=15):
    """Run a git command like ['show', filename] and return the output.

    First, wait up to max_timeout seconds for the index.lock file to go away.
    If the index.lock file remains, raise an IndexLockError.

    Still have a race condition if two programs run this at the same time.
    """
    end_time = time.time() + max_timeout
    delay = 0.1
    lock_file = os.path.join(git_dir, '.git/index.lock')
    while os.path.exists(lock_file):
        if time.time() < end_time - delay:
            time.sleep(delay)
        else:
            raise IndexLockError('Git index.lock file exists for %s seconds'
                                 % max_timeout)
    logger.debug('GIT call: %s' % command)
    output = subprocess.check_output([GIT_PROGRAM] + command,
                                     cwd=git_dir,
                                     stderr=subprocess.STDOUT)
    return output
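# run_git_command() references two module-level names that aren't defined in
# this file. Plausible definitions (an assumption, not the project's actual
# code), plus a usage example:
GIT_PROGRAM = 'git'

class IndexLockError(OSError):
    pass

# Usage: fetch the committed copy of a tracked file, waiting out a transient
# index.lock held by a concurrent git process.
#   previous = run_git_command(['show', 'HEAD:' + filename],
#                              article.full_git_dir)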
def is_boring(old, new):
    oldu = canonicalize(old.decode('utf8'))
    newu = canonicalize(new.decode('utf8'))

    def extra_canonical(s):
        """Ignore changes in whitespace or the date line"""
        # This is fragile: depending on the text looking a particular way!
        nondate_portion = s.split('\n', 1)[1]
        return nondate_portion.split()

    if extra_canonical(oldu) == extra_canonical(newu):
        return True

    # This seems kind of fragile. Are we 100% sure that differences between
    # these encodings are unimportant? Also, how does this relate to non-latin
    # text?
    for charset in CHARSET_LIST:
        try:
            if oldu.encode(charset) == new:
                logger.debug('Boring!')
                return True
        except UnicodeEncodeError:
            pass
    return False
def add_to_git_repo(data, filename, article):
    start_time = time.time()
    # Don't use full path because it can exceed the maximum filename length
    # full_path = os.path.join(models.GIT_DIR, filename)
    os.chdir(article.full_git_dir)
    mkdir_p(os.path.dirname(filename))

    boring = False
    diff_info = None
    try:
        previous = run_git_command(['show', 'HEAD:' + filename],
                                   article.full_git_dir)
    except subprocess.CalledProcessError as e:
        if (e.output.endswith("does not exist in 'HEAD'\n") or
            e.output.endswith("exists on disk, but not in 'HEAD'.\n")):
            already_exists = False
        else:
            raise
    else:
        already_exists = True

    with open(filename, 'w') as f:
        f.write(data)

    if already_exists:
        if previous == data:
            logger.debug('Article matches current version in repo')
            return None, None, None

        # Now check how many times this same version has appeared before
        my_hash = run_git_command(['hash-object', filename],
                                  article.full_git_dir).strip()
        commits = [v.v for v in article.versions()]
        if len(commits) > 2:
            logger.debug('Checking for duplicates among %s commits',
                         len(commits))

            def get_hash(version):
                """Return the SHA1 hash of filename in a given version"""
                output = run_git_command(['ls-tree', '-r', version, filename],
                                         article.full_git_dir)
                return output.split()[2]

            hashes = map(get_hash, commits)
            number_equal = sum(1 for h in hashes if h == my_hash)
            logger.debug('Got %s', number_equal)
            if number_equal >= 2:  # Refuse to list a version more than twice
                run_git_command(['checkout', filename], article.full_git_dir)
                return None, None, None

        if is_boring(previous, data):
            boring = True
        else:
            diff_info = get_diff_info(previous, data)

    run_git_command(['add', filename], article.full_git_dir)
    if not already_exists:
        commit_message = 'Adding file %s' % filename
    else:
        commit_message = 'Change to %s' % filename
    logger.debug('Running git commit... %s', time.time() - start_time)
    run_git_command(['commit', filename, '-m', commit_message],
                    article.full_git_dir)
    logger.debug('git revlist... %s', time.time() - start_time)

    # Now figure out what the commit ID was.
    # I would like this to be "git rev-list HEAD -n1 filename"
    # unfortunately, this command is slow: it doesn't abort after the
    # first line is output. Without filename, it does abort; therefore
    # we do this and hope no intervening commit occurs.
    # (looks like the slowness is fixed in git HEAD)
    v = run_git_command(['rev-list', 'HEAD', '-n1'],
                        article.full_git_dir).strip()
    logger.debug('done %s', time.time() - start_time)
    return v, boring, diff_info
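# mkdir_p() is called above but not defined in this file. The usual
# Python 2 "mkdir -p" idiom, as a sketch (the project's own helper may
# differ):
import errno

def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as e:
        # An already-existing directory is fine; re-raise anything else.
        if e.errno != errno.EEXIST or not os.path.isdir(path):
            raise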