Example No. 1
def calc_feed_stats(session: Session, feed_name: str) -> dict:
    page_urls = [
        page_url.url for page_url in session.query(PageURL).filter(
            PageURL.feed_name == feed_name)
    ]
    n_pages = len(page_urls)
    # Short-circuit: with no pages there is nothing to count, and the
    # potentially large IN queries below can be skipped entirely.
    if n_pages:
        n_lines = count(
            session.query(PageLine).filter(PageLine.line != '',
                                           PageLine.url.in_(page_urls)))
    else:
        n_lines = 0
    if n_lines:
        n_parsed = count(
            session.query(ParsedPageLine).filter(
                ParsedPageLine.url.in_(page_urls)))
    else:
        n_parsed = 0
    if n_parsed:
        n_approved = count(
            session.query(Tweet).filter(
                Tweet.status == TweetReviewStatus.approved,
                Tweet.url.in_(page_urls),
            ))
    else:
        n_approved = 0
    return {
        'feed_name': feed_name,
        'n_pages': n_pages,
        'n_lines': n_lines,
        'n_parsed': n_parsed,
        'n_approved': n_approved,
    }
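
The count helper used throughout these examples is not part of the listing. A minimal sketch of what it presumably does, assuming it simply wraps SQLAlchemy's Query.count():

from sqlalchemy.orm import Query


def count(query: Query) -> int:
    # Presumed implementation: delegate to SQLAlchemy's SELECT COUNT(*).
    # Most call sites only use the result's truthiness, so an
    # EXISTS-based check would work just as well and can be cheaper.
    return query.count()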
Example No. 2
def migrate_posted_tweets(session: Session, cur, table_posted: str):
    t0 = time.time()
    logger.info('Migrating %s', table_posted)
    n_in = 0
    n_out = 0
    cur.execute('SELECT url, line, parsed, status, edited, tweet, inserted '
                f'FROM {table_posted}')
    for url, line, parsed, status_str, edited, tweet, inserted in cur:
        n_in += 1
        logger.debug('%d %s', n_in, url)
        # Deduplicate by tweet text: skip rows that were already migrated.
        if not count(
                session.query(PostedTweet).filter(PostedTweet.text == tweet)):
            posted_tweet = PostedTweet(
                url=url,
                line=line,
                parsed=parsed,
                status=convert_status(status_str),
                edited=edited,
                text=tweet,
                inserted=inserted,
            )
            session.add(posted_tweet)
            if n_in % 10000 == 0:
                logger.info('%d flush', n_in)
                session.flush()
            n_out += 1
    logger.info('commit')
    session.commit()
    logger.info(
        'Migrated %s: %d -> %d in %ds',
        table_posted,
        n_in,
        n_out,
        time.time() - t0,
    )
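
A sketch of how the migrate_* functions in this listing might be wired together. Assumptions: the legacy data lives in an SQLite file (any DB-API cursor supporting execute and iteration, e.g. psycopg2's, would work the same way), and create_session is the helper used in the later examples; the URL, file name, and table name are hypothetical:

import sqlite3

# Hypothetical wiring; the database URL, file name and table name
# are assumptions, not taken from the repository.
session = create_session('postgresql:///covid_chance')
conn = sqlite3.connect('legacy.sqlite')
cur = conn.cursor()
try:
    migrate_posted_tweets(session, cur, 'posted_tweets')
finally:
    conn.close()
    session.close()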
Example No. 3
def save_page_urls(
    session: Session,
    feed_name: str,
    page_urls: List[str],
    mtime: datetime.datetime,
):
    counter = 0
    for url in set(page_urls):
        # Skip URLs that are already stored.
        if count(session.query(PageURL).filter(PageURL.url == url)):
            continue
        try:
            page_url = PageURL(url=url,
                               feed_name=feed_name,
                               inserted=mtime)
            session.add(page_url)
            counter += 1
        except Exception as e:
            logger.error(
                'Error while inserting new URL in the db %s %s',
                url,
                e,
            )
    session.commit()
    logger.info(
        'done %s %d urls inserted',
        feed_name.ljust(40),
        counter,
    )
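
save_page_urls issues one COUNT query per URL. A possible variant (a sketch, not the repository's code, reusing the names from the example above) that fetches the already-stored URLs in a single query and inserts the rest in bulk:

def save_page_urls_batch(
    session: Session,
    feed_name: str,
    page_urls: List[str],
    mtime: datetime.datetime,
):
    # One round trip to find duplicates instead of one COUNT per URL.
    existing = {
        row.url
        for row in session.query(PageURL.url).filter(
            PageURL.url.in_(page_urls))
    }
    new_urls = set(page_urls) - existing
    session.add_all(
        PageURL(url=url, feed_name=feed_name, inserted=mtime)
        for url in new_urls)
    session.commit()
    logger.info('done %s %d urls inserted', feed_name.ljust(40),
                len(new_urls))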
Example No. 4
def match_pages(
    pages: List[Tuple[Page]],
    session_factory,
    keyword_lists: Sequence[Sequence[str]],
    param_hash: str,
):
    session = scoped_session(session_factory)
    for (page,) in pages:
        if count(
            session.query(PageLine).filter(
                PageLine.url == page.url, PageLine.param_hash == param_hash
            )
        ):
            continue
        logger.info('Matching %s', page.url)
        inserted = False
        for line in page.text.splitlines():
            if contains_keyword_from_each_list(line, keyword_lists):
                page_line = PageLine(
                    url=page.url, line=line.strip(), param_hash=param_hash
                )
                session.add(page_line)
                inserted = True
        if not inserted:
            # Insert an empty marker row so this page is recognized as
            # processed and skipped on the next run.
            page_line = PageLine(url=page.url, param_hash=param_hash)
            session.add(page_line)
    session.commit()
    session.close()
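
contains_keyword_from_each_list is not defined in this listing; judging by its name and how it is used, it presumably requires the line to mention at least one keyword from every list. A sketch under that assumption:

from typing import Sequence


def contains_keyword_from_each_list(
        line: str, keyword_lists: Sequence[Sequence[str]]) -> bool:
    # Assumed semantics: at least one keyword from each list must occur
    # in the line (case-insensitive substring match).
    line_lower = line.lower()
    return all(
        any(keyword.lower() in line_lower for keyword in keywords)
        for keywords in keyword_lists)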
Example No. 5
def migrate_pages(session: Session, cur, table_pages: str):
    t0 = time.time()
    logger.info('Migrating %s', table_pages)
    n_in = 0
    n_out = 0
    cur.execute(f'SELECT url, text, inserted FROM {table_pages}')
    for url, text, inserted in cur:
        n_in += 1
        logger.debug('%d %s', n_in, url)
        if not count(session.query(Page).filter(Page.url == url)):
            page = Page(
                url=url,
                text=text,
                inserted=inserted,
            )
            session.add(page)
            if n_in % 10000 == 0:
                logger.info('%d flush', n_in)
                session.flush()
            n_out += 1
    logger.info('commit')
    session.commit()
    logger.info(
        'Migrated %s: %d -> %d in %ds',
        table_pages,
        n_in,
        n_out,
        time.time() - t0,
    )
Example No. 6
def parse_page_lines(
    page_lines: List[Tuple[PageLine]],
    session_factory,
    pattern: str,
    param_hash: str,
):
    rx = regex.compile(pattern)
    session = scoped_session(session_factory)
    for (page_line,) in page_lines:
        if count(
                session.query(ParsedPageLine).filter(
                    ParsedPageLine.line == page_line.line,
                    ParsedPageLine.param_hash == param_hash,
                )):
            continue
        logger.info('Parsing %s', page_line.url)
        for parsed in parse_line(rx, page_line.line):
            parsed_page_line = ParsedPageLine(
                url=page_line.url,
                line=page_line.line,
                parsed=parsed,
                param_hash=param_hash,
            )
            session.add(parsed_page_line)
    session.commit()
    session.close()
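
parse_line is also not shown. Given that it receives a compiled pattern (from the third-party regex module, a superset of re) and is iterated over, a plausible sketch is that it yields the text of each match:

from typing import Iterator


def parse_line(rx, line: str) -> Iterator[str]:
    # rx is a compiled pattern (regex.compile above). Assumed behavior:
    # yield the matched substring of every non-overlapping match.
    for m in rx.finditer(line):
        yield m.group()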
Example No. 7
def main(config: dict):
    feeds = config['feeds']
    dates = [
        datetime.datetime.fromisoformat(d) for d in config['archive_dates']
    ]
    session = create_session(config['db']['url'])
    for feed in feeds:
        if not feed.get('name') or not feed.get('url'):
            continue
        for date in dates:
            if count(
                    session.query(ArchivedPageURL).filter(
                        ArchivedPageURL.feed_url == feed['url'],
                        ArchivedPageURL.date == date,
                    )):
                continue
            archived_url = find_closest_snapshot_url(feed['url'], date)
            archived_page_url = ArchivedPageURL(
                feed_url=feed['url'],
                archived_url=archived_url,
                date=date,
            )
            session.add(archived_page_url)
            session.commit()
    session.close()
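
find_closest_snapshot_url is undefined here; given the ArchivedPageURL model, it plausibly asks the Internet Archive's Wayback Machine availability API for the snapshot closest to a date. A sketch under that assumption (the endpoint and response shape follow the documented Wayback availability API; the timeout value is arbitrary):

import datetime
from typing import Optional

import requests


def find_closest_snapshot_url(url: str,
                              date: datetime.datetime) -> Optional[str]:
    # Ask the Wayback Machine for the snapshot closest to the date;
    # return None when the page was never archived.
    res = requests.get(
        'https://archive.org/wayback/available',
        params={'url': url, 'timestamp': date.strftime('%Y%m%d')},
        timeout=30,
    )
    res.raise_for_status()
    closest = res.json().get('archived_snapshots', {}).get('closest')
    return closest['url'] if closest else None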
Example No. 8
def main(config: dict, secrets: dict, interactive: bool, dry_run: bool):
    session = create_session(config['db']['url'])
    approved_tweets = session.query(Tweet).filter(
        Tweet.status == TweetReviewStatus.approved)
    posted_tweets = session.query(PostedTweet).all()
    posted_tweets_parsed = {t.parsed for t in posted_tweets}
    pending_tweets = [
        t for t in approved_tweets if t.parsed not in posted_tweets_parsed
    ]
    total_approved_tweets = count(approved_tweets)
    total_posted_tweets = len(posted_tweets)
    total_pending_tweets = len(pending_tweets)

    logger.info('Number of approved tweets: %d', total_approved_tweets)
    logger.info('Number of posted tweets:   %d', total_posted_tweets)
    logger.info('Number of tweets to post:  %d', total_pending_tweets)

    if not total_pending_tweets:
        logger.warning('Nothing to do, all tweets have already been posted')
        return

    i = random.randint(0, total_pending_tweets - 1)
    tweet = pending_tweets[i]
    template_str = deep_get(config, ['post_tweet', 'tweet_template'],
                            default='${text} ${url}')
    text = Template(template_str).substitute(text=tweet.text, url=tweet.url)

    logger.warning(
        '%d/%d/%d posting tweet "%s"',
        i,
        total_pending_tweets,
        total_approved_tweets,
        text,
    )
    if interactive:
        inp = input('Are you sure you want to post this tweet? [y/N] ')
        if inp != 'y':
            print('Bailing out!')
            return
    status_id = post_tweet(text, secrets, dry_run)
    if not status_id:
        return

    posted_tweet = PostedTweet.from_tweet(tweet, text, status_id)
    session.add(posted_tweet)
    session.commit()

    name = config['post_tweet']['profile_name']
    description = Template(
        config['post_tweet']['profile_description_template']).substitute(
            n_posted=total_posted_tweets + 1, n_approved=total_approved_tweets)
    logger.warning(
        'Updating profile, name: "%s", description: "%s"',
        name,
        description,
    )
    update_profile(name, description, secrets, dry_run)
    session.close()
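
deep_get, used above to read the tweet template with a default, is a common nested-dict lookup helper; presumably something like:

from typing import Any, Sequence


def deep_get(d: dict, keys: Sequence[str], default: Any = None) -> Any:
    # Assumed semantics: walk nested dicts key by key and return the
    # default as soon as a key is missing.
    for key in keys:
        if not isinstance(d, dict) or key not in d:
            return default
        d = d[key]
    return d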
Example No. 9
def main(config: dict, cache_path: Path, approved: bool = False):
    session = create_session(config['db']['url'])
    if approved:
        tweets = session.query(Tweet).filter(
            Tweet.status == TweetReviewStatus.approved)
    else:
        tweets = session.query(PostedTweet).all()
    for tweet in tweets:
        exported_tweet = print_export_tweet(cache_path, tweet)
        if exported_tweet and not count(
                session.query(ExportedTweet).filter(
                    ExportedTweet.text == exported_tweet.text)):
            session.add(exported_tweet)
            session.flush()
    session.commit()
    session.close()
Example No. 10
def migrate_exported_tweets(session: Session, cur, table_print_export: str):
    t0 = time.time()
    logger.info('Migrating %s', table_print_export)
    n_in = 0
    n_out = 0
    cur.execute(
        'SELECT url, text, title, description, image_path, domain, timestamp, '
        f'inserted FROM {table_print_export}')
    for (
            url,
            text,
            title,
            description,
            image_path,
            domain,
            timestamp,
            inserted,
    ) in cur:
        n_in += 1
        logger.debug('%d %s', n_in, url)
        if not count(
                session.query(ExportedTweet).filter(
                    ExportedTweet.text == text)):
            exported_tweet = ExportedTweet(
                url=url,
                text=text,
                title=title,
                description=description,
                image_path=image_path,
                domain=domain,
                timestamp=timestamp,
                inserted=inserted,
            )
            session.add(exported_tweet)
            if n_in % 10000 == 0:
                logger.info('%d flush', n_in)
                session.flush()
            n_out += 1
    logger.info('commit')
    session.commit()
    logger.info(
        'Migrated %s: %d -> %d in %ds',
        table_print_export,
        n_in,
        n_out,
        time.time() - t0,
    )
Example No. 11
def check_posted_tweets(session,
                        api,
                        screen_name: str,
                        max_id: Optional[int] = None) -> Optional[int]:
    logger.info('Fetching user timeline, max_id=%s', max_id)
    statuses = api.GetUserTimeline(screen_name=screen_name,
                                   count=100,
                                   max_id=max_id)
    last_id = None
    for status in statuses:
        last_id = status.id
        logger.info('Checking %d "%s"', status.id, status.full_text)
        if count(
                session.query(PostedTweet).filter(
                    PostedTweet.status_id == status.id)):
            continue
        m = re.match(r'(?P<raw_text>.+) https://t\.co/\w+$', status.full_text)
        if not m:
            # Not every status ends with a t.co link; skip those
            # instead of crashing on m.group().
            logger.warning('Unexpected tweet format: %d "%s"', status.id,
                           status.full_text)
            continue
        raw_text = html.unescape(m.group('raw_text'))
        try:
            posted_tweet = (session.query(PostedTweet).filter(
                PostedTweet.text.like(f'{raw_text}%')).one_or_none())
        except MultipleResultsFound:
            logger.error(
                'Multiple tweets with the same text found: %d "%s"',
                status.id,
                raw_text,
            )
            continue
        if posted_tweet:
            if not posted_tweet.status_id:
                logger.warning('Updating status id: %d "%s"', status.id,
                               raw_text)
                posted_tweet.status_id = status.id
        else:
            logger.warning('Adding: %d "%s"', status.id, raw_text)
            new_posted_tweet = PostedTweet.from_status(status)
            session.add(new_posted_tweet)
    session.commit()
    return last_id
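
check_posted_tweets handles a single page of the timeline and returns the id of the last status it saw, so a caller can page backwards. A hypothetical driver loop (not from the repository; note that status.full_text requires python-twitter's extended tweet mode):

import twitter  # python-twitter

api = twitter.Api(
    consumer_key='...',  # credentials elided
    consumer_secret='...',
    access_token_key='...',
    access_token_secret='...',
    tweet_mode='extended',  # needed for status.full_text
)
max_id = None
while True:
    last_id = check_posted_tweets(session, api, 'covid_chance', max_id)
    if last_id is None:
        break  # empty page: reached the end of the timeline
    max_id = last_id - 1  # max_id is inclusive; step past the last status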
Example No. 12
def migrate_parsed_page_lines(session: Session, cur, table_parsed: str):
    t0 = time.time()
    logger.info('Migrating %s', table_parsed)
    n_in = 0
    n_out = 0
    cur.execute(
        f'SELECT url, line, parsed, param_hash, inserted FROM {table_parsed}')
    for url, line, parsed, param_hash, inserted in cur:
        n_in += 1
        logger.debug('%d %s', n_in, url)
        if not count(
                session.query(ParsedPageLine).filter(
                    ParsedPageLine.url == url,
                    ParsedPageLine.line == line,
                    ParsedPageLine.param_hash == param_hash,
                )):
            parsed_page_line = ParsedPageLine(
                url=url,
                line=line,
                parsed=parsed,
                param_hash=param_hash,
                inserted=inserted,
            )
            session.add(parsed_page_line)
            if n_in % 10000 == 0:
                logger.info('%d flush', n_in)
                session.flush()
            n_out += 1
    logger.info('commit')
    session.commit()
    logger.info(
        'Migrated %s: %d -> %d in %ds',
        table_parsed,
        n_in,
        n_out,
        time.time() - t0,
    )
Example No. 13
def migrate_archived_page_urls(session: Session, cur, table_archives: str):
    t0 = time.time()
    logger.info('Migrating %s', table_archives)
    n_in = 0
    n_out = 0
    cur.execute(
        f'SELECT feed_url, archived_url, date, inserted FROM {table_archives}')
    for feed_url, archived_url, date, inserted in cur:
        n_in += 1
        logger.debug('%d %s', n_in, archived_url)
        if not count(
                session.query(ArchivedPageURL).filter(
                    ArchivedPageURL.feed_url == feed_url,
                    ArchivedPageURL.archived_url == archived_url,
                    ArchivedPageURL.date == date,
                )):
            archived_page_url = ArchivedPageURL(
                feed_url=feed_url,
                archived_url=archived_url,
                date=date,
                inserted=inserted,
            )
            session.add(archived_page_url)
            if n_in % 10000 == 0:
                logger.info('%d flush', n_in)
                session.flush()
            n_out += 1
    logger.info('commit')
    session.commit()
    logger.info(
        'Migrated %s: %d -> %d in %ds',
        table_archives,
        n_in,
        n_out,
        time.time() - t0,
    )
Example No. 14
def main(config, review_all: bool, incl_approved: bool):
    session = create_session(config['db']['url'])

    parsed_page_lines = session.query(ParsedPageLine).filter(
        ParsedPageLine.parsed != '')
    reviewed_tweets = session.query(Tweet).filter(
        Tweet.status != TweetReviewStatus.none)
    approved_tweets = [
        t for t in reviewed_tweets if t.status == TweetReviewStatus.approved
    ]
    rejected_tweets = [
        t for t in reviewed_tweets if t.status == TweetReviewStatus.rejected
    ]
    if review_all:
        pending_parsed_page_lines = parsed_page_lines
    else:
        reviewed_tweets_parsed = [t.parsed for t in reviewed_tweets]
        pending_parsed_page_lines = parsed_page_lines.filter(
            ParsedPageLine.parsed.notin_(reviewed_tweets_parsed))
    pending_tweets = [
        Tweet.from_parsed_page_line(parsed_page_line)
        for parsed_page_line in pending_parsed_page_lines
    ]
    if not review_all:
        if incl_approved:
            pending_tweets += approved_tweets
        else:
            invalid_approved_tweets = [t for t in approved_tweets if t.invalid]
            pending_tweets += invalid_approved_tweets
    total_pending_tweets = len(pending_tweets)

    logger.info('Number of matching lines:   %d',
                session.query(PageLine).count())
    logger.info('Number of parsed tweets:    %d', count(parsed_page_lines))
    logger.info('Number of approved tweets:  %d', len(approved_tweets))
    logger.info('Number of rejected tweets:  %d', len(rejected_tweets))
    logger.info('Number of tweets to review: %d', total_pending_tweets)

    i = 0
    while i < len(pending_tweets):
        tweet = pending_tweets[i]
        print_tweet(
            tweet,
            i=i + 1,
            total=total_pending_tweets,
            highlight=True,
        )
        inp = None
        while inp is None or (inp not in ('y', 'n', 'e', 'q', 's', 'p', '')):
            inp = rlinput('Do you like this tweet? '
                          '"y" or Enter = yes, '
                          '"n" = no, '
                          '"e" = edit, '
                          '"s" = skip (ask next time again), '
                          '"p" = show previous tweet, '
                          '"q" = quit \n'
                          '> ')
        if inp == 'q':
            break
        if inp == 's':
            i = i + 1
            continue
        if inp == 'p':
            i = max(i - 1, 0)
            continue
        if inp in ('y', ''):
            tweet.status = TweetReviewStatus.approved
        elif inp == 'n':
            tweet.status = TweetReviewStatus.rejected
        elif inp == 'e':
            edited_text = None
            while edited_text is None:
                edited_text = rlinput(
                    'Enter new text or delete it to reject the tweet.\n> ',
                    tweet.edited or tweet.parsed,
                )
            tweet.edited = edited_text
            if edited_text == '':
                tweet.status = TweetReviewStatus.rejected
            else:
                tweet.status = TweetReviewStatus.approved
        else:
            raise NotImplementedError('Invalid input')
        if inspect(tweet).transient:
            session.add(tweet)
        session.commit()
        i = i + 1
    session.close()
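
rlinput, used above to prompt with an editable prefilled value, is commonly implemented with the readline startup hook; a sketch (not necessarily the repository's version):

import readline


def rlinput(prompt: str, prefill: str = '') -> str:
    # input() with an editable, prefilled default: the startup hook
    # injects the prefill into the line buffer before the prompt.
    readline.set_startup_hook(lambda: readline.insert_text(prefill))
    try:
        return input(prompt)
    finally:
        readline.set_startup_hook()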