def calc_feed_stats(session: Session, feed_name: str) -> dict:
    page_urls = [
        page_url.url for page_url in session.query(PageURL).filter(
            PageURL.feed_name == feed_name)
    ]
    n_pages = len(page_urls)
    if n_pages:
        n_lines = count(
            session.query(PageLine).filter(PageLine.line != '',
                                           PageLine.url.in_(page_urls)))
    else:
        n_lines = 0
    if n_lines:
        n_parsed = count(
            session.query(ParsedPageLine).filter(
                ParsedPageLine.url.in_(page_urls)))
    else:
        n_parsed = 0
    if n_parsed:
        n_approved = count(
            session.query(Tweet).filter(
                Tweet.status == TweetReviewStatus.approved,
                Tweet.url.in_(page_urls),
            ))
    else:
        n_approved = 0
    return {
        'feed_name': feed_name,
        'n_pages': n_pages,
        'n_lines': n_lines,
        'n_parsed': n_parsed,
        'n_approved': n_approved,
    }

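# `count` is used throughout these functions but defined elsewhere in the
# repository. A minimal sketch, assuming it simply wraps SQLAlchemy's
# Query.count(); the real helper may differ.
def count(query) -> int:
    # Query.count() issues a SELECT COUNT(*) wrapping the given query.
    return query.count()
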
def migrate_posted_tweets(session: Session, cur, table_posted: str):
    t0 = time.time()
    logger.info('Migrating %s', table_posted)
    n_in = 0
    n_out = 0
    cur.execute('SELECT url, line, parsed, status, edited, tweet, inserted '
                f'FROM {table_posted}')
    for url, line, parsed, status_str, edited, tweet, inserted in cur:
        n_in += 1
        logger.debug('%d %s', n_in, url)
        if not count(
                session.query(PostedTweet).filter(PostedTweet.text == tweet)):
            posted_tweet = PostedTweet(
                url=url,
                line=line,
                parsed=parsed,
                status=convert_status(status_str),
                edited=edited,
                text=tweet,
                inserted=inserted,
            )
            session.add(posted_tweet)
            if n_in % 10000 == 0:
                logger.info('%d flush', n_in)
                session.flush()
            n_out += 1
    logger.info('commit')
    session.commit()
    logger.info(
        'Migrated %s: %d -> %d in %ds',
        table_posted,
        n_in,
        n_out,
        time.time() - t0,
    )

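# Hedged sketch of `convert_status`, used by migrate_posted_tweets() above to
# map the legacy status strings onto the TweetReviewStatus enum. The legacy
# values are an assumption here; unknown strings fall back to `none`.
def convert_status(status_str: str) -> TweetReviewStatus:
    try:
        # Assumes the legacy strings match the enum member names.
        return TweetReviewStatus[status_str]
    except KeyError:
        return TweetReviewStatus.none
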
def save_page_urls(
    session: Session,
    feed_name: str,
    page_urls: List[str],
    mtime: datetime.datetime,
):
    counter = 0
    for url in set(page_urls):
        if count(session.query(PageURL).filter(PageURL.url == url)):
            continue
        try:
            # Keep the URL string and the ORM object in separate names so the
            # error log below always shows the URL that failed.
            page_url = PageURL(url=url, feed_name=feed_name, inserted=mtime)
            session.add(page_url)
            counter += 1
        except Exception as e:
            logger.error(
                'Error while inserting new URL in the db %s %s',
                url,
                e,
            )
    session.commit()
    logger.info(
        'done %s %d urls inserted',
        feed_name.ljust(40),
        counter,
    )

def match_pages(
    pages: List[Tuple[Page]],
    session_factory,
    keyword_lists: Sequence[Sequence[str]],
    param_hash: str,
):
    session = scoped_session(session_factory)
    for (page,) in pages:
        if count(
                session.query(PageLine).filter(
                    PageLine.url == page.url,
                    PageLine.param_hash == param_hash)):
            continue
        logger.info('Matched %s', page.url)
        inserted = False
        for line in page.text.splitlines():
            if contains_keyword_from_each_list(line, keyword_lists):
                page_line = PageLine(
                    url=page.url,
                    line=line.strip(),
                    param_hash=param_hash,
                )
                session.add(page_line)
                inserted = True
        if not inserted:
            page_line = PageLine(url=page.url, param_hash=param_hash)
            session.add(page_line)
        session.commit()
    session.close()

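# Hedged sketch of `contains_keyword_from_each_list`, which match_pages()
# relies on. The assumed semantics: a line matches only if it contains at
# least one keyword from every list, using a case-insensitive substring test.
def contains_keyword_from_each_list(
    line: str,
    keyword_lists: Sequence[Sequence[str]],
) -> bool:
    line_lower = line.lower()
    return all(
        any(keyword.lower() in line_lower for keyword in keywords)
        for keywords in keyword_lists
    )
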
def migrate_pages(session: Session, cur, table_pages: str):
    t0 = time.time()
    logger.info('Migrating %s', table_pages)
    n_in = 0
    n_out = 0
    cur.execute(f'SELECT url, text, inserted FROM {table_pages}')
    for url, text, inserted in cur:
        n_in += 1
        logger.debug('%d %s', n_in, url)
        if not count(session.query(Page).filter(Page.url == url)):
            page = Page(
                url=url,
                text=text,
                inserted=inserted,
            )
            session.add(page)
            if n_in % 10000 == 0:
                logger.info('%d flush', n_in)
                session.flush()
            n_out += 1
    logger.info('commit')
    session.commit()
    logger.info(
        'Migrated %s: %d -> %d in %ds',
        table_pages,
        n_in,
        n_out,
        time.time() - t0,
    )

def parse_page_lines(
    page_lines: List[Tuple[PageLine]],
    session_factory,
    pattern: str,
    param_hash: str,
):
    rx = regex.compile(pattern)
    session = scoped_session(session_factory)
    for (page_line, ) in page_lines:
        if count(
                session.query(ParsedPageLine).filter(
                    ParsedPageLine.line == page_line.line,
                    ParsedPageLine.param_hash == param_hash,
                )):
            continue
        logger.info('Parsed %s', page_line.url)
        for parsed in parse_line(rx, page_line.line):
            parsed_page_line = ParsedPageLine(
                url=page_line.url,
                line=page_line.line,
                parsed=parsed,
                param_hash=param_hash,
            )
            session.add(parsed_page_line)
        session.commit()
    session.close()

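# Hedged sketch of `parse_line`, assumed to yield one string per match of the
# compiled pattern in the given line. parse_page_lines() only requires an
# iterable of strings; the real parser may post-process the captures
# differently.
def parse_line(rx, line: str):
    for m in rx.finditer(line):
        # Prefer the first capturing group if the pattern defines one,
        # otherwise yield the whole match.
        yield (m.group(1) if rx.groups else m.group(0)).strip()
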
def main(config: dict):
    feeds = config['feeds']
    dates = [
        datetime.datetime.fromisoformat(d) for d in config['archive_dates']
    ]
    session = create_session(config['db']['url'])
    for feed in feeds:
        if not feed.get('name') or not feed.get('url'):
            continue
        for date in dates:
            if count(
                    session.query(ArchivedPageURL).filter(
                        ArchivedPageURL.feed_url == feed['url'],
                        ArchivedPageURL.date == date,
                    )):
                continue
            archived_url = find_closest_snapshot_url(feed['url'], date)
            archived_page_url = ArchivedPageURL(
                feed_url=feed['url'],
                archived_url=archived_url,
                date=date,
            )
            session.add(archived_page_url)
            session.commit()
    session.close()

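# Hedged sketch of `find_closest_snapshot_url`, assuming it queries the
# Wayback Machine availability API for the snapshot closest to the requested
# date and returns its URL (empty string when nothing was archived). The real
# implementation may use a different endpoint or add retry logic.
import requests  # assumed dependency of this sketch


def find_closest_snapshot_url(url: str, date: datetime.datetime) -> str:
    res = requests.get(
        'https://archive.org/wayback/available',
        params={'url': url, 'timestamp': date.strftime('%Y%m%d%H%M%S')},
        timeout=30,
    )
    res.raise_for_status()
    closest = res.json().get('archived_snapshots', {}).get('closest', {})
    return closest.get('url', '')
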
def main(config: dict, secrets: dict, interactive: bool, dry_run: bool):
    session = create_session(config['db']['url'])
    approved_tweets = session.query(Tweet).filter(
        Tweet.status == TweetReviewStatus.approved)
    posted_tweets = session.query(PostedTweet).all()
    posted_tweets_parsed = [t.parsed for t in posted_tweets]
    pending_tweets = [
        t for t in approved_tweets if t.parsed not in posted_tweets_parsed
    ]
    total_approved_tweets = count(approved_tweets)
    total_posted_tweets = len(posted_tweets)
    total_pending_tweets = len(pending_tweets)
    logger.info('Number of approved tweets: %d', total_approved_tweets)
    logger.info('Number of posted tweets: %d', total_posted_tweets)
    logger.info('Number of tweets to post: %d', total_pending_tweets)
    if not total_pending_tweets:
        logger.warning('Nothing to do, all tweets have already been posted')
        return
    i = random.randint(0, total_pending_tweets - 1)
    tweet = pending_tweets[i]
    template_str = deep_get(config, ['post_tweet', 'tweet_template'],
                            default='${text} ${url}')
    text = Template(template_str).substitute(text=tweet.text, url=tweet.url)
    logger.warning(
        '%d/%d/%d posting tweet "%s"',
        i,
        total_pending_tweets,
        total_approved_tweets,
        text,
    )
    if interactive:
        inp = input('Are you sure you want to post this tweet? [y/N] ')
        if inp != 'y':
            print('Bailing out!')
            return
    status_id = post_tweet(text, secrets, dry_run)
    if not status_id:
        return
    posted_tweet = PostedTweet.from_tweet(tweet, text, status_id)
    session.add(posted_tweet)
    session.commit()
    name = config['post_tweet']['profile_name']
    description = Template(
        config['post_tweet']['profile_description_template']).substitute(
            n_posted=total_posted_tweets + 1,
            n_approved=total_approved_tweets)
    logger.warning(
        'Updating profile, name: "%s", description: "%s"',
        name,
        description,
    )
    update_profile(name, description, secrets, dry_run)
    session.close()

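# Hedged sketch of the `deep_get` helper used above to read a nested config
# key with a fallback. Assumed behavior: walk the mapping key by key and
# return `default` as soon as a key is missing.
def deep_get(d: dict, keys: List[str], default=None):
    for key in keys:
        if not isinstance(d, dict) or key not in d:
            return default
        d = d[key]
    return d
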
def main(config: dict, cache_path: Path, approved: bool = False):
    session = create_session(config['db']['url'])
    if approved:
        tweets = session.query(Tweet).filter(
            Tweet.status == TweetReviewStatus.approved)
    else:
        tweets = session.query(PostedTweet).all()
    for tweet in tweets:
        exported_tweet = print_export_tweet(cache_path, tweet)
        if exported_tweet and not count(
                session.query(ExportedTweet).filter(
                    ExportedTweet.text == exported_tweet.text)):
            session.add(exported_tweet)
            session.flush()
    session.commit()
    session.close()

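# Hedged sketch of `create_session`, called by the entry points above. The
# assumption is that it builds an engine from the configured database URL and
# returns a Session bound to it; the real helper may also create the tables
# or configure pooling.
def create_session(db_url: str) -> Session:
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    engine = create_engine(db_url)
    return sessionmaker(bind=engine)()
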
def migrate_exported_tweets(session: Session, cur, table_print_export: str):
    t0 = time.time()
    logger.info('Migrating %s', table_print_export)
    n_in = 0
    n_out = 0
    cur.execute(
        'SELECT url, text, title, description, image_path, domain, timestamp, '
        f'inserted FROM {table_print_export}')
    for (
            url,
            text,
            title,
            description,
            image_path,
            domain,
            timestamp,
            inserted,
    ) in cur:
        n_in += 1
        logger.debug('%d %s', n_in, url)
        if not count(
                session.query(ExportedTweet).filter(
                    ExportedTweet.text == text)):
            exported_tweet = ExportedTweet(
                url=url,
                text=text,
                title=title,
                description=description,
                image_path=image_path,
                domain=domain,
                timestamp=timestamp,
                inserted=inserted,
            )
            session.add(exported_tweet)
            if n_in % 10000 == 0:
                logger.info('%d flush', n_in)
                session.flush()
            n_out += 1
    logger.info('commit')
    session.commit()
    logger.info(
        'Migrated %s: %d -> %d in %ds',
        table_print_export,
        n_in,
        n_out,
        time.time() - t0,
    )

def check_posted_tweets(session, api, screen_name: str,
                        max_id: Optional[int] = None) -> Optional[int]:
    logger.info('Fetching user timeline, max_id=%s', max_id)
    statuses = api.GetUserTimeline(screen_name=screen_name,
                                   count=100,
                                   max_id=max_id)
    last_id = None
    for status in statuses:
        last_id = status.id
        logger.info('Checking %d "%s"', status.id, status.full_text)
        if count(
                session.query(PostedTweet).filter(
                    PostedTweet.status_id == status.id)):
            continue
        m = re.match(r'(?P<raw_text>.+) https://t\.co/\w+$', status.full_text)
        if not m:
            logger.warning('Unexpected tweet text format: %d "%s"',
                           status.id, status.full_text)
            continue
        raw_text = html.unescape(m.group('raw_text'))
        try:
            posted_tweet = (session.query(PostedTweet).filter(
                PostedTweet.text.like(f'{raw_text}%')).one_or_none())
        except MultipleResultsFound:
            logger.error(
                'Multiple tweets with the same text found: %d "%s"',
                status.id,
                raw_text,
            )
            continue
        if posted_tweet:
            if not posted_tweet.status_id:
                logger.warning('Updating status id: %d "%s"', status.id,
                               raw_text)
                posted_tweet.status_id = status.id
        else:
            logger.warning('Adding: %d "%s"', status.id, raw_text)
            new_posted_tweet = PostedTweet.from_status(status)
            session.add(new_posted_tweet)
    session.commit()
    return last_id

def migrate_parsed_page_lines(session: Session, cur, table_parsed: str):
    t0 = time.time()
    logger.info('Migrating %s', table_parsed)
    n_in = 0
    n_out = 0
    cur.execute(
        f'SELECT url, line, parsed, param_hash, inserted FROM {table_parsed}')
    for url, line, parsed, param_hash, inserted in cur:
        n_in += 1
        logger.debug('%d %s', n_in, url)
        if not count(
                session.query(ParsedPageLine).filter(
                    ParsedPageLine.url == url,
                    ParsedPageLine.line == line,
                    ParsedPageLine.param_hash == param_hash,
                )):
            parsed_page_line = ParsedPageLine(
                url=url,
                line=line,
                parsed=parsed,
                param_hash=param_hash,
                inserted=inserted,
            )
            session.add(parsed_page_line)
            if n_in % 10000 == 0:
                logger.info('%d flush', n_in)
                session.flush()
            n_out += 1
    logger.info('commit')
    session.commit()
    logger.info(
        'Migrated %s: %d -> %d in %ds',
        table_parsed,
        n_in,
        n_out,
        time.time() - t0,
    )

def migrate_archived_page_urls(session: Session, cur, table_archives: str):
    t0 = time.time()
    logger.info('Migrating %s', table_archives)
    n_in = 0
    n_out = 0
    cur.execute(
        f'SELECT feed_url, archived_url, date, inserted FROM {table_archives}')
    for feed_url, archived_url, date, inserted in cur:
        n_in += 1
        logger.debug('%d %s', n_in, archived_url)
        if not count(
                session.query(ArchivedPageURL).filter(
                    ArchivedPageURL.feed_url == feed_url,
                    ArchivedPageURL.archived_url == archived_url,
                    ArchivedPageURL.date == date,
                )):
            archived_page_url = ArchivedPageURL(
                feed_url=feed_url,
                archived_url=archived_url,
                date=date,
                inserted=inserted,
            )
            session.add(archived_page_url)
            if n_in % 10000 == 0:
                logger.info('%d flush', n_in)
                session.flush()
            n_out += 1
    logger.info('commit')
    session.commit()
    logger.info(
        'Migrated %s: %d -> %d in %ds',
        table_archives,
        n_in,
        n_out,
        time.time() - t0,
    )

def main(config, review_all: bool, incl_approved: bool):
    session = create_session(config['db']['url'])
    parsed_page_lines = session.query(ParsedPageLine).filter(
        ParsedPageLine.parsed != '')
    reviewed_tweets = session.query(Tweet).filter(
        Tweet.status != TweetReviewStatus.none)
    approved_tweets = [
        t for t in reviewed_tweets if t.status == TweetReviewStatus.approved
    ]
    rejected_tweets = [
        t for t in reviewed_tweets if t.status == TweetReviewStatus.rejected
    ]
    if review_all:
        pending_parsed_page_lines = parsed_page_lines
    else:
        reviewed_tweets_parsed = [tweet.parsed for tweet in reviewed_tweets]
        pending_parsed_page_lines = parsed_page_lines.filter(
            ParsedPageLine.parsed.notin_(reviewed_tweets_parsed))
    pending_tweets = [
        Tweet.from_parsed_page_line(parsed_page_line)
        for parsed_page_line in pending_parsed_page_lines
    ]
    if not review_all:
        if incl_approved:
            pending_tweets += approved_tweets
        else:
            invalid_approved_tweets = [t for t in approved_tweets if t.invalid]
            pending_tweets += invalid_approved_tweets
    total_pending_tweets = len(pending_tweets)
    logger.info('Number of matching lines: %d',
                session.query(PageLine).count())
    logger.info('Number of parsed tweets: %d', count(parsed_page_lines))
    logger.info('Number of approved tweets: %d', len(approved_tweets))
    logger.info('Number of rejected tweets: %d', len(rejected_tweets))
    logger.info('Number of tweets to review: %d', total_pending_tweets)
    i = 0
    while i < len(pending_tweets):
        tweet = pending_tweets[i]
        print_tweet(
            tweet,
            i=i + 1,
            total=total_pending_tweets,
            highlight=True,
        )
        inp = None
        while inp is None or inp not in ('y', 'n', 'e', 'q', 's', 'p', ''):
            inp = rlinput('Do you like this tweet? '
                          '"y" or Enter = yes, '
                          '"n" = no, '
                          '"e" = edit, '
                          '"s" = skip (ask next time again), '
                          '"p" = show previous tweet, '
                          '"q" = quit \n'
                          '> ')
        if inp == 'q':
            break
        if inp == 's':
            i = i + 1
            continue
        if inp == 'p':
            i = max(i - 1, 0)
            continue
        if inp in ('y', ''):
            tweet.status = TweetReviewStatus.approved
        elif inp == 'n':
            tweet.status = TweetReviewStatus.rejected
        elif inp == 'e':
            edited_text = None
            while edited_text is None:
                edited_text = rlinput(
                    'Enter new text or delete it to reject the tweet.\n> ',
                    tweet.edited or tweet.parsed,
                )
            tweet.edited = edited_text
            if edited_text == '':
                tweet.status = TweetReviewStatus.rejected
            else:
                tweet.status = TweetReviewStatus.approved
        else:
            raise NotImplementedError('Invalid input')
        if inspect(tweet).transient:
            session.add(tweet)
        session.commit()
        i = i + 1
    session.close()

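# Hedged sketch of `rlinput`, the prompt helper used by the review loop above.
# The assumption is that it behaves like input() but pre-fills the line with
# `prefill` via the readline startup hook so the reviewer can edit it in place.
def rlinput(prompt: str, prefill: str = '') -> str:
    import readline  # standard library; assumed available on this platform

    readline.set_startup_hook(lambda: readline.insert_text(prefill))
    try:
        return input(prompt)
    finally:
        readline.set_startup_hook()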