Exemplo n.º 1
0
def tag_pairs_page():
    """Serve the pair-tagging page (GET) and store submitted tags (POST).

    A ``user`` cookie is required; without it the response is 401.

    GET:
        * With ``stats`` in the query string, return a JSON document with
          the current user and the number of documents in
          ``tests_db['pairs']``.
        * Otherwise pick a base entry (``feed`` / ``index`` query arguments,
          or a random entry flagged both ``newsbreaker`` and ``politics``),
          load every entry of the same date carrying those flags, build all
          pairs that exclude the base, sample at most ``maxtests`` of them
          (default ``MAX_TESTS_PAIRS``) and render ``tag_pairs.html``.

    POST:
        Collect the form fields whose name matches ``regex_test``, keep the
        ones answered ``'1'`` or ``'-1'``, insert one document per answer
        into ``tests_db['pairs']`` and redirect to ``get_base_for_pairs``.

    Returns:
        A Flask response object, a JSON string, or a ``(body, status)``
        tuple for error conditions.
    """
    global newsbreaker_initialized

    # Function-scope imports keep this block self-contained.
    from html import escape
    from itertools import combinations

    if 'user' not in request.cookies:
        return 'User must be logged in to tag news', 401

    # Initialise the topic model once, on the first request that needs it.
    if not newsbreaker_initialized:
        newsbreaker.init(
            os.path.join(metadata_folder, 'topic_model'),
            'topic_model.pkl',
            'vocab.txt'
        )
        newsbreaker_initialized = True

    user = request.cookies['user']

    if request.method == 'GET':
        if 'stats' in request.args:
            # Cursor.count() was removed in PyMongo 4; count_documents()
            # is the documented replacement.
            n = tests_db['pairs'].count_documents({})
            return json.dumps({'user': user, 'count': n}, indent=2)

        try:
            basefeed = request.args.get('feed') or choice(feeds).name

            # Look the feed up here, inside the try, so that an unknown feed
            # is reported as a 400 even when an explicit index is supplied
            # (the `or` below would otherwise skip the lookup).
            feed_metadata = entries_metadata[basefeed]

            baseindex = int(
                request.args.get('index') or choice(
                    [
                        entry
                        for entry in feed_metadata
                        if entry.data.get('newsbreaker') and
                        entry.data.get('politics')
                    ]
                ).index
            )

        except KeyError as e:
            # The feed name comes from the query string: escape it before
            # echoing it back in the response body.
            return escape('Invalid feed: %s' % e.args, quote=False), 400

        except ValueError as e:
            # int()'s message embeds the user-supplied index text.
            return escape('Invalid index: %s' % e.args, quote=False), 400

        except IndexError:
            # choice() had nothing to choose from (no feeds, or no entry in
            # the feed flagged both newsbreaker and politics).
            return 'No entries available to tag', 400

        for base_metadata in feed_metadata:
            if base_metadata.index == baseindex:
                break
        else:
            return '%s|%d not found' % (basefeed, baseindex), 400

        if not base_metadata.data.get('newsbreaker', False):
            return '%s|%d isn\'t a filtered entry' % (basefeed, baseindex), 400

        selected_date = base_metadata.data['date']

        # Get all entries in the same date as base that are breakable, and
        # get them as BreakableEntries. The result is materialised because
        # it is walked twice below; if load_entries returns a lazy iterator
        # the first walk would otherwise consume part of it.
        day_entries = list(load_entries(
            metadata_folder,
            it=(
                (
                    feed,
                    (
                        entry
                        for entry in entries_metadata[feed.name]
                        if entry.data.get('date') == selected_date and
                        entry.data.get('newsbreaker', False) and
                        entry.data.get('politics')
                    )
                )
                for feed in feeds
            )
        ))

        # Retrieve base as BreakableEntry
        for base in day_entries:
            if base.feedname == basefeed and base.index == baseindex:
                break
        else:
            return 'Base entry isn\'t breakable', 500

        # Get the WHAT distance of all entries with base
        scored_entries = [
            (entry, entry.what_distance(base))
            for entry in day_entries
        ]

        # Get all pairs of entries that are not base, ordered by
        # dist_e1 + dist_e2 ascending.
        # NOTE(review): sample() below picks a random subset in random
        # order, so this ordering does not reach the template - confirm
        # whether "closest pairs first" was the intent.
        tests = [
            (e1, e2)
            for e1, e2, _ in sorted(
                (
                    (e1, e2, d1 + d2)
                    for (e1, d1), (e2, d2) in combinations(scored_entries, 2)
                    if e1 != base and e2 != base
                ),
                key=lambda t: t[2]
            )
        ]

        # Sample at most `maxtests` pairs; a missing, unparsable or negative
        # value falls back to MAX_TESTS_PAIRS (sample() rejects negatives).
        try:
            max_tests = int(request.args.get('maxtests'))
        except (TypeError, ValueError):
            max_tests = MAX_TESTS_PAIRS
        else:
            if max_tests < 0:
                max_tests = MAX_TESTS_PAIRS

        tests = sample(tests, min(len(tests), max_tests))

        if not tests:
            # No more entries in this date
            return '%s|%d cannot be used for tagger' % (basefeed, baseindex), 400

        # Only filtered entries remain now
        return render_template(
            'tag_pairs.html',
            title='Pairs tagger',
            base=_entry_fields(base),
            tests=[
                [_entry_fields(e1), _entry_fields(e2)]
                for e1, e2 in tests
            ],
            enumerate=enumerate,  # pass it to jinja
        )

    elif request.method == 'POST':
        if 'base' not in request.form:
            return 'Base not found in form inputs', 400

        # (field name, answer) for every form field that names a test.
        answers = [
            (name, str(value))  # str it just in case
            for name, value in request.form.items()
            if regex_test.fullmatch(name)
        ]

        # The redirect asks for as many tests as were shown, whether or not
        # they were answered.
        response = redirect(
            url_for('get_base_for_pairs') + '?maxtests=%d' % len(answers)
        )

        # Only '1' / '-1' count as answers; they are stored as res 1 / 0.
        documents = [
            {
                'base': request.form['base'],
                'e1': request.form[name + 'e1'],
                'e2': request.form[name + 'e2'],
                'res': int(value == '1'),  # 0 or 1
                'user': user,
            }
            for name, value in answers
            if value in ('-1', '1')
        ]

        # Don't insert to tests_db without any values.
        if documents:
            tests_db['pairs'].insert_many(documents)

        return response


def _entry_fields(entry):
    """Return the fields of *entry* that are passed to tag_pairs.html."""
    return {
        'feedname': entry.feedname,
        'index': entry.index,
        'title': entry.title,
        'content': entry.content,
    }
Exemplo n.º 2
0
def tag_pairs_page():
    """Serve the pair-tagging page (GET) and store submitted tags (POST).

    A ``user`` cookie is required; without it the response is 401.

    GET:
        * With ``stats`` in the query string, return a JSON document with
          the current user and the number of documents in
          ``tests_db['pairs']``.
        * Otherwise pick a base entry (``feed`` / ``index`` query arguments,
          or a random entry flagged both ``newsbreaker`` and ``politics``),
          load every entry of the same date carrying those flags, build all
          pairs that exclude the base, sample at most ``maxtests`` of them
          (default ``MAX_TESTS_PAIRS``) and render ``tag_pairs.html``.

    POST:
        Collect the form fields whose name matches ``regex_test``, keep the
        ones answered ``'1'`` or ``'-1'``, insert one document per answer
        into ``tests_db['pairs']`` and redirect to ``get_base_for_pairs``.

    Returns:
        A Flask response object, a JSON string, or a ``(body, status)``
        tuple for error conditions.
    """
    global newsbreaker_initialized

    # Function-scope imports keep this block self-contained.
    from html import escape
    from itertools import combinations

    if 'user' not in request.cookies:
        return 'User must be logged in to tag news', 401

    # Initialise the topic model once, on the first request that needs it.
    if not newsbreaker_initialized:
        newsbreaker.init(os.path.join(metadata_folder, 'topic_model'),
                         'topic_model.pkl', 'vocab.txt')
        newsbreaker_initialized = True

    user = request.cookies['user']

    if request.method == 'GET':
        if 'stats' in request.args:
            # Cursor.count() was removed in PyMongo 4; count_documents()
            # is the documented replacement.
            n = tests_db['pairs'].count_documents({})
            return json.dumps({'user': user, 'count': n}, indent=2)

        try:
            basefeed = request.args.get('feed') or choice(feeds).name

            # Look the feed up here, inside the try, so that an unknown feed
            # is reported as a 400 even when an explicit index is supplied
            # (the `or` below would otherwise skip the lookup).
            feed_metadata = entries_metadata[basefeed]

            baseindex = int(
                request.args.get('index') or choice(
                    [
                        entry
                        for entry in feed_metadata
                        if entry.data.get('newsbreaker') and
                        entry.data.get('politics')
                    ]
                ).index
            )

        except KeyError as e:
            # The feed name comes from the query string: escape it before
            # echoing it back in the response body.
            return escape('Invalid feed: %s' % e.args, quote=False), 400

        except ValueError as e:
            # int()'s message embeds the user-supplied index text.
            return escape('Invalid index: %s' % e.args, quote=False), 400

        except IndexError:
            # choice() had nothing to choose from (no feeds, or no entry in
            # the feed flagged both newsbreaker and politics).
            return 'No entries available to tag', 400

        for base_metadata in feed_metadata:
            if base_metadata.index == baseindex:
                break
        else:
            return '%s|%d not found' % (basefeed, baseindex), 400

        if not base_metadata.data.get('newsbreaker', False):
            return '%s|%d isn\'t a filtered entry' % (basefeed, baseindex), 400

        selected_date = base_metadata.data['date']

        # Get all entries in the same date as base that are breakable, and
        # get them as BreakableEntries. The result is materialised because
        # it is walked twice below; if load_entries returns a lazy iterator
        # the first walk would otherwise consume part of it.
        day_entries = list(load_entries(
            metadata_folder,
            it=(
                (
                    feed,
                    (
                        entry
                        for entry in entries_metadata[feed.name]
                        if entry.data.get('date') == selected_date and
                        entry.data.get('newsbreaker', False) and
                        entry.data.get('politics')
                    )
                )
                for feed in feeds
            )
        ))

        # Retrieve base as BreakableEntry
        for base in day_entries:
            if base.feedname == basefeed and base.index == baseindex:
                break
        else:
            return 'Base entry isn\'t breakable', 500

        # Get the WHAT distance of all entries with base
        scored_entries = [(entry, entry.what_distance(base))
                          for entry in day_entries]

        # Get all pairs of entries that are not base, ordered by
        # dist_e1 + dist_e2 ascending.
        # NOTE(review): sample() below picks a random subset in random
        # order, so this ordering does not reach the template - confirm
        # whether "closest pairs first" was the intent.
        tests = [
            (e1, e2)
            for e1, e2, _ in sorted(
                ((e1, e2, d1 + d2)
                 for (e1, d1), (e2, d2) in combinations(scored_entries, 2)
                 if e1 != base and e2 != base),
                key=lambda t: t[2])
        ]

        # Sample at most `maxtests` pairs; a missing, unparsable or negative
        # value falls back to MAX_TESTS_PAIRS (sample() rejects negatives).
        try:
            max_tests = int(request.args.get('maxtests'))
        except (TypeError, ValueError):
            max_tests = MAX_TESTS_PAIRS
        else:
            if max_tests < 0:
                max_tests = MAX_TESTS_PAIRS

        tests = sample(tests, min(len(tests), max_tests))

        if not tests:
            # No more entries in this date
            return '%s|%d cannot be used for tagger' % (basefeed,
                                                        baseindex), 400

        # Only filtered entries remain now
        return render_template(
            'tag_pairs.html',
            title='Pairs tagger',
            base=_entry_fields(base),
            tests=[[_entry_fields(e1), _entry_fields(e2)]
                   for e1, e2 in tests],
            enumerate=enumerate,  # pass it to jinja
        )

    elif request.method == 'POST':
        if 'base' not in request.form:
            return 'Base not found in form inputs', 400

        # (field name, answer) for every form field that names a test.
        answers = [
            (name, str(value))  # str it just in case
            for name, value in request.form.items()
            if regex_test.fullmatch(name)
        ]

        # The redirect asks for as many tests as were shown, whether or not
        # they were answered.
        response = redirect(
            url_for('get_base_for_pairs') + '?maxtests=%d' % len(answers))

        # Only '1' / '-1' count as answers; they are stored as res 1 / 0.
        documents = [{
            'base': request.form['base'],
            'e1': request.form[name + 'e1'],
            'e2': request.form[name + 'e2'],
            'res': int(value == '1'),  # 0 or 1
            'user': user,
        } for name, value in answers if value in ('-1', '1')]

        # Don't insert to tests_db without any values.
        if documents:
            tests_db['pairs'].insert_many(documents)

        return response


def _entry_fields(entry):
    """Return the fields of *entry* that are passed to tag_pairs.html."""
    return {
        'feedname': entry.feedname,
        'index': entry.index,
        'title': entry.title,
        'content': entry.content,
    }
Exemplo n.º 3
0
    index = int(index)  # just in case

    for entry in entries:
        if entry.feedname == feedname and entry.index == index:
            return entry
    else:
        raise KeyError((feedname, index))


# Script entry point. Expects three command-line arguments, read below as
# viz, feedname and index. Only the set-up part of the script is visible in
# this excerpt.
if __name__ == '__main__':
    viz = sys.argv[1]  # NOTE(review): meaning of `viz` is not shown here
    feedname = sys.argv[2]
    index = sys.argv[3]  # kept as a string here; passed to get_entry below

    # Initialise the topic model from <folder>/topic_model.
    folder = 'data'
    init(os.path.join(folder, 'topic_model'), 'topic_model.pkl', 'vocab.txt')

    entries = load_entries(folder)

    # Keep only the entries whose data has a truthy 'politics' value.
    entries = [entry for entry in entries if entry.data.get('politics')]

    # MongoClient() is built with no arguments, i.e. pymongo's default
    # connection settings.
    from pymongo import MongoClient
    mongo_client = MongoClient()

    # Database named "distances".
    dists_db = mongo_client.distances

    # Collection name is the three CLI arguments joined with underscores
    # (viz_feedname_index); attribute access on the database yields it.
    collection = '_'.join(sys.argv[1:4])
    col = getattr(dists_db, collection)

    # Look up the base entry for the requested feed name and index.
    base = get_entry(feedname, index)