Пример #1
0
def get_base_for_pairs():
    """Render a page of candidate base entries for the pairs tagger.

    Requires a ``user`` cookie. Candidates are the entries of every feed in
    ``feeds`` whose metadata has truthy ``newsbreaker`` and ``politics``
    values, optionally restricted to the ``date`` query argument. A sample of
    at most ``maxentries`` of them (default ``MAX_BASE_FOR_PAIRS``), rounded
    down to a multiple of three, is handed to the template in rows of three.

    Returns:
        The rendered ``get_base_for_pairs.html`` template, or a
        ``(message, status)`` tuple: 401 when the cookie is missing, 400 when
        no entry can be offered.
    """
    if 'user' not in request.cookies:
        return 'User must be logged in to tag news', 401

    entries = {feed: list(entries_metadata[feed.name]) for feed in feeds}

    if 'date' in request.args:
        wanted_date = request.args['date']
        entries = {
            feed: [e for e in feed_entries if e.data.get('date') == wanted_date]
            for feed, feed_entries in entries.items()
        }

    # Load only the entries that are both filtered and tagged as politics.
    entries = load_entries(
        metadata_folder,
        it=(
            (
                feed,
                (
                    entry
                    for entry in feed_entries
                    if entry.data.get('newsbreaker', False)
                    and entry.data.get('politics')
                ),
            )
            for feed, feed_entries in entries.items()
        ),
    )

    try:
        max_entries = int(request.args.get('maxentries'))
    except (TypeError, ValueError):
        # Argument missing (None) or not a number: fall back to the default.
        max_entries = MAX_BASE_FOR_PAIRS

    # Never ask for more entries than exist, nor for a negative amount
    # (sample() raises on both), and keep a multiple of three so that the
    # rows below are always complete.
    max_entries = max(0, min(len(entries), max_entries))
    max_entries -= max_entries % 3

    tests = sample(entries, max_entries)

    if not tests:
        # No more entries in this date
        return '%s cannot be used for tagger' % request.args.get('date'), 400

    # Only filtered entries remain now

    rows = [
        [
            {
                'feedname': entry.feedname,
                'index': entry.index,
                'title': entry.title,
                'content': entry.content,
            }
            for entry in tests[start:start + 3]
        ]
        # len(tests) is a multiple of three, so every slice holds 3 entries
        for start in range(0, len(tests), 3)
    ]

    return render_template(
        'get_base_for_pairs.html',
        title='Base for pairs',
        tests=rows,
        maxtests=request.args.get('maxtests'),  # for tag_pairs_page
        enumerate=enumerate,  # pass it to jinja
    )
Пример #2
0

def get_entry(name, sep='|'):
    """Return the entry from the global ``entries`` identified by *name*.

    *name* has the form ``<feedname><sep><index>``; the index part is
    converted with ``int``.

    Raises:
        KeyError: if no entry has that feedname and index.
        ValueError: if *name* does not split into exactly two parts or the
            index part is not an integer.
    """
    feedname, raw_index = name.split(sep)
    wanted_index = int(raw_index)

    found = next(
        (
            candidate
            for candidate in entries
            if candidate.feedname == feedname
            and candidate.index == wanted_index
        ),
        None,
    )
    if found is None:
        raise KeyError(name)
    return found


if __name__ == '__main__':
    # Command line: <action> <feedname> <index> [multiple]
    # <action> is 'tests' to run the tests; anything else prints the threshold.
    action_tests = 'tests' == sys.argv[1]

    base_feedname, base_index = sys.argv[2], sys.argv[3]
    base_name = '|'.join((base_feedname, base_index))
    multiple = None if len(sys.argv) < 5 else int(sys.argv[4])

    # Module-level list that get_entry() searches.
    entries = load_entries('data')

    if not action_tests:
        print('Threshold:', threshold(base_name))
    elif multiple:
        run_tests(base_name, multiple=int(multiple))
    else:
        # No multiple given (or 0): rely on run_tests' own default.
        run_tests(base_name)
Пример #3
0
def tag_politics_page():
    """Serve the politics tagging form (GET) and store its answers (POST).

    Requires a ``user`` cookie; without it a 401 is returned.

    GET:
        * With a ``stats`` query argument, return a JSON string holding the
          user, the number of stored politics tests and the ratio of them
          tagged as politics.
        * Otherwise render ``tag_politics.html`` with a sample of at most
          ``maxtests`` entries (default ``MAX_TESTS_POLITICS``) whose
          metadata has a truthy ``newsbreaker`` value.

    POST:
        Store every form answer whose value is ``'1'`` or ``'-1'`` in
        ``tests_db['politics']`` and redirect back to this page asking for
        as many tests as were submitted.

    Returns:
        The response for the cases above, a ``(message, status)`` tuple on
        error, or ``None`` for any other HTTP method.
    """
    if 'user' not in request.cookies:
        return 'User must be logged in to tag news', 401

    user = request.cookies['user']

    if request.method == 'GET':
        if 'stats' in request.args:
            collection = tests_db['politics']
            n = collection.find().count()
            # POST below stores ``res`` as a boolean, which a plain
            # {'res': 1} filter is not expected to match in MongoDB; accept
            # both the integer and the boolean spelling of a positive answer.
            positives = collection.find({'res': {'$in': [1, True]}}).count()
            return json.dumps(
                {
                    'user': user,
                    # No tests stored yet: report 0 instead of dividing by 0.
                    'politics_ratio': positives / n if n else 0,
                    'count': n,
                },
                indent=2)

        try:
            maxtests = int(request.args.get('maxtests'))
        except (TypeError, ValueError):
            # Argument missing (None) or not a number: use the default.
            maxtests = MAX_TESTS_POLITICS

        candidates = [
            e for feed in feeds for e in entries_metadata[feed.name]
            if e.data.get('newsbreaker', False)
        ]
        # Bound the sample size by the filtered population (not by the count
        # of all metadata) and by 0, since sample() raises on a size that is
        # negative or larger than the population.
        tests = sample(candidates, max(0, min(len(candidates), maxtests)))

        # Only filtered entries remain now

        if not tests:
            return 'Not a single entry to tag!', 500

        tests = load_entries(metadata_folder,
                             it=((feed, (entry for entry in tests
                                         if entry.feedname == feed.name))
                                 for feed in feeds))

        return render_template(
            'tag_politics.html',
            title='Politics tagger',
            tests=[{
                'feedname': entry.feedname,
                'index': entry.index,
                'title': entry.title,
                'content': entry.content,
            } for entry in tests],
            enumerate=enumerate,  # pass it to jinja
        )

    elif request.method == 'POST':
        # Form fields whose name matches regex_test carry the answers.
        answers = [
            (k, str(v))  # str it just in case
            for k, v in request.form.items()
            if regex_test.fullmatch(k)
        ]

        response = redirect(
            url_for('tag_politics_page') + '?maxtests=%d' % len(answers))

        documents = [
            {
                'entry': request.form[testname + 'entry'],
                'res': testvalue == '1',  # '-1' -> False, '1' -> True
                'user': user,
            }
            for testname, testvalue in answers
            if testvalue in ('-1', '1')  # any other value is not stored
        ]

        # don't insert to tests_db without any values
        if documents:
            tests_db['politics'].insert_many(documents)

        return response
Пример #4
0
def tag_pairs_page():
    """Serve the pairs tagging form (GET) and store its answers (POST).

    Requires a ``user`` cookie; without it a 401 is returned. The first
    authenticated call also initialises ``newsbreaker``.

    GET:
        * With a ``stats`` query argument, return a JSON string with the
          user and the number of stored pair tests.
        * Otherwise pick the base entry from the ``feed`` / ``index`` query
          arguments (chosen with ``choice`` when absent), load the entries
          sharing its date that have truthy ``newsbreaker`` and ``politics``
          metadata, and render ``tag_pairs.html`` with at most ``maxtests``
          (default ``MAX_TESTS_PAIRS``) pairs of entries other than the base.

    POST:
        Store every form answer whose value is ``'1'`` or ``'-1'`` in
        ``tests_db['pairs']`` and redirect to ``get_base_for_pairs``.

    Returns:
        The response for the cases above, a ``(message, status)`` tuple on
        error, or ``None`` for any other HTTP method.
    """
    global newsbreaker_initialized

    if 'user' not in request.cookies:
        return 'User must be logged in to tag news', 401

    if not newsbreaker_initialized:
        newsbreaker.init(os.path.join(metadata_folder, 'topic_model'),
                         'topic_model.pkl', 'vocab.txt')

        newsbreaker_initialized = True

    user = request.cookies['user']

    def entry_fields(entry):
        # The subset of an entry that the template needs.
        return {
            'feedname': entry.feedname,
            'index': entry.index,
            'title': entry.title,
            'content': entry.content,
        }

    if request.method == 'GET':
        if 'stats' in request.args:
            n = tests_db['pairs'].find().count()
            return json.dumps({'user': user, 'count': n}, indent=2)

        try:
            basefeed = request.args.get('feed') or choice(feeds).name
            baseindex = int(
                request.args.get('index') or choice(
                    [
                        entry
                        for entry in entries_metadata[basefeed]
                        if entry.data.get('newsbreaker')
                        and entry.data.get('politics')
                    ]
                ).index
            )

        except KeyError as e:
            return 'Invalid feed: %s' % e.args, 400

        except ValueError as e:
            return 'Invalid index: %s' % e.args, 400

        except IndexError:
            # choice() had nothing to choose from.
            return 'No entry available to be used as base', 400

        for base_metadata in entries_metadata[basefeed]:
            if base_metadata.index == baseindex:
                break
        else:
            return '%s|%d not found' % (basefeed, baseindex), 400

        if not base_metadata.data.get('newsbreaker', False):
            return '%s|%d isn\'t a filtered entry' % (basefeed, baseindex), 400

        selected_date = base_metadata.data['date']

        # Get all entries in the same date as base
        # that are breakable, and get them as BreakableEntries
        day_entries = load_entries(
            metadata_folder,
            it=(
                (
                    feed,
                    (
                        entry
                        for entry in entries_metadata[feed.name]
                        if entry.data.get('date') == selected_date
                        and entry.data.get('newsbreaker', False)
                        and entry.data.get('politics')
                    ),
                )
                for feed in feeds
            ),
        )

        # Retrieve base as BreakableEntry
        for base in day_entries:
            if base.feedname == basefeed and base.index == baseindex:
                break
        else:
            return 'Base entry isn\'t breakable', 500

        # Get the WHAT distance of all entries with base
        day_entries = [(entry, entry.what_distance(base))
                       for entry in day_entries]

        # Get all pairs of entries that are not base
        # ordering by dist_e1 + dist_e2 ascending
        # NOTE(review): the ordered list is then passed to sample(); if that
        # is random.sample the ordering does not favour closer pairs.
        # Confirm the intent.
        tests = [
            (e1, e2)
            for e1, e2, _ in sorted(((e1, e2, d1 + d2)
                                     for i, (e1, d1) in enumerate(day_entries)
                                     for j, (e2, d2) in enumerate(day_entries)
                                     if i < j and e1 != base and e2 != base),
                                    key=lambda t: t[2])
        ]

        # Get a sample of as much as request.args.get('maxtests', MAX_TESTS_PAIRS)
        try:
            max_tests = int(request.args.get('maxtests'))
        except (TypeError, ValueError):
            # Argument missing (None) or not a number: use the default.
            max_tests = MAX_TESTS_PAIRS

        # sample() raises on a negative size, so clamp at 0.
        tests = sample(tests, max(0, min(len(tests), max_tests)))

        if not tests:
            # No more entries in this date
            return '%s|%d cannot be used for tagger' % (basefeed,
                                                        baseindex), 400

        # Only filtered entries remain now

        return render_template(
            'tag_pairs.html',
            title='Pairs tagger',
            base=entry_fields(base),
            tests=[[entry_fields(e1), entry_fields(e2)] for e1, e2 in tests],
            enumerate=enumerate,  # pass it to jinja
        )

    elif request.method == 'POST':
        if 'base' not in request.form:
            return 'Base not found in form inputs', 400

        # Form fields whose name matches regex_test carry the answers.
        answers = [
            (k, str(v))  # str it just in case
            for k, v in request.form.items()
            if regex_test.fullmatch(k)
        ]

        response = redirect(
            url_for('get_base_for_pairs') + '?maxtests=%d' % len(answers))

        documents = [
            {
                'base': request.form['base'],
                'e1': request.form[testname + 'e1'],
                'e2': request.form[testname + 'e2'],
                'res': int(testvalue == '1'),  # 0 or 1
                'user': user,
            }
            for testname, testvalue in answers
            if testvalue in ('-1', '1')  # any other value is not stored
        ]

        # don't insert to tests_db without any values
        if documents:
            tests_db['pairs'].insert_many(documents)

        return response
Пример #5
0
def tag_politics_page():
    """Serve the politics tagging form (GET) and store its answers (POST).

    Requires a ``user`` cookie; without it a 401 is returned.

    GET:
        * With a ``stats`` query argument, return a JSON string holding the
          user, the number of stored politics tests and the ratio of them
          tagged as politics.
        * Otherwise render ``tag_politics.html`` with a sample of at most
          ``maxtests`` entries (default ``MAX_TESTS_POLITICS``) whose
          metadata has a truthy ``newsbreaker`` value.

    POST:
        Store every form answer whose value is ``'1'`` or ``'-1'`` in
        ``tests_db['politics']`` and redirect back to this page asking for
        as many tests as were submitted.

    Returns:
        The response for the cases above, a ``(message, status)`` tuple on
        error, or ``None`` for any other HTTP method.
    """
    if 'user' not in request.cookies:
        return 'User must be logged in to tag news', 401

    user = request.cookies['user']

    if request.method == 'GET':
        if 'stats' in request.args:
            collection = tests_db['politics']
            n = collection.find().count()
            # POST below stores ``res`` as a boolean, which a plain
            # {'res': 1} filter is not expected to match in MongoDB; accept
            # both the integer and the boolean spelling of a positive answer.
            positives = collection.find({'res': {'$in': [1, True]}}).count()
            return json.dumps(
                {
                    'user': user,
                    # No tests stored yet: report 0 instead of dividing by 0.
                    'politics_ratio': positives / n if n else 0,
                    'count': n,
                }, indent=2
            )

        try:
            maxtests = int(request.args.get('maxtests'))
        except (TypeError, ValueError):
            # Argument missing (None) or not a number: use the default.
            maxtests = MAX_TESTS_POLITICS

        candidates = [
            e
            for feed in feeds
            for e in entries_metadata[feed.name]
            if e.data.get('newsbreaker', False)
        ]
        # Bound the sample size by the filtered population (not by the count
        # of all metadata) and by 0, since sample() raises on a size that is
        # negative or larger than the population.
        tests = sample(
            candidates,
            max(0, min(len(candidates), maxtests))
        )

        # Only filtered entries remain now

        if not tests:
            return 'Not a single entry to tag!', 500

        tests = load_entries(
            metadata_folder,
            it=(
                (
                    feed,
                    (
                        entry
                        for entry in tests
                        if entry.feedname == feed.name
                    )
                )
                for feed in feeds
            )
        )

        return render_template(
            'tag_politics.html',
            title='Politics tagger',
            tests=[
                {
                    'feedname': entry.feedname,
                    'index': entry.index,
                    'title': entry.title,
                    'content': entry.content,
                }
                for entry in tests
            ],
            enumerate=enumerate,  # pass it to jinja
        )

    elif request.method == 'POST':
        # Form fields whose name matches regex_test carry the answers.
        answers = [
            (k, str(v))  # str it just in case
            for k, v in request.form.items()
            if regex_test.fullmatch(k)
        ]

        response = redirect(
            url_for('tag_politics_page') + '?maxtests=%d' % len(answers)
        )

        documents = [
            {
                'entry': request.form[testname + 'entry'],
                'res': testvalue == '1',  # '-1' -> False, '1' -> True
                'user': user,
            }
            for testname, testvalue in answers
            if testvalue in ('-1', '1')  # any other value is not stored
        ]

        # don't insert to tests_db without any values
        if documents:
            tests_db['politics'].insert_many(documents)

        return response
Пример #6
0
def tag_pairs_page():
    """Serve the pairs tagging form (GET) and store its answers (POST).

    Requires a ``user`` cookie; without it a 401 is returned. The first
    authenticated call also initialises ``newsbreaker``.

    GET:
        * With a ``stats`` query argument, return a JSON string with the
          user and the number of stored pair tests.
        * Otherwise pick the base entry from the ``feed`` / ``index`` query
          arguments (chosen with ``choice`` when absent), load the entries
          sharing its date that have truthy ``newsbreaker`` and ``politics``
          metadata, and render ``tag_pairs.html`` with at most ``maxtests``
          (default ``MAX_TESTS_PAIRS``) pairs of entries other than the base.

    POST:
        Store every form answer whose value is ``'1'`` or ``'-1'`` in
        ``tests_db['pairs']`` and redirect to ``get_base_for_pairs``.

    Returns:
        The response for the cases above, a ``(message, status)`` tuple on
        error, or ``None`` for any other HTTP method.
    """
    global newsbreaker_initialized

    if 'user' not in request.cookies:
        return 'User must be logged in to tag news', 401

    if not newsbreaker_initialized:
        newsbreaker.init(
            os.path.join(metadata_folder, 'topic_model'),
            'topic_model.pkl',
            'vocab.txt'
        )

        newsbreaker_initialized = True

    user = request.cookies['user']

    def entry_fields(entry):
        # The subset of an entry that the template needs.
        return {
            'feedname': entry.feedname,
            'index': entry.index,
            'title': entry.title,
            'content': entry.content,
        }

    if request.method == 'GET':
        if 'stats' in request.args:
            n = tests_db['pairs'].find().count()
            return json.dumps(
                {
                    'user': user,
                    'count': n
                }, indent=2
            )

        try:
            basefeed = request.args.get('feed') or choice(feeds).name
            baseindex = int(
                request.args.get('index') or choice(
                    [
                        entry
                        for entry in entries_metadata[basefeed]
                        if entry.data.get('newsbreaker')
                        and entry.data.get('politics')
                    ]
                ).index
            )

        except KeyError as e:
            return 'Invalid feed: %s' % e.args, 400

        except ValueError as e:
            return 'Invalid index: %s' % e.args, 400

        except IndexError:
            # choice() had nothing to choose from.
            return 'No entry available to be used as base', 400

        for base_metadata in entries_metadata[basefeed]:
            if base_metadata.index == baseindex:
                break
        else:
            return '%s|%d not found' % (basefeed, baseindex), 400

        if not base_metadata.data.get('newsbreaker', False):
            return '%s|%d isn\'t a filtered entry' % (basefeed, baseindex), 400

        selected_date = base_metadata.data['date']

        # Get all entries in the same date as base
        # that are breakable, and get them as BreakableEntries
        day_entries = load_entries(
            metadata_folder,
            it=(
                (
                    feed,
                    (
                        entry
                        for entry in entries_metadata[feed.name]
                        if entry.data.get('date') == selected_date
                        and entry.data.get('newsbreaker', False)
                        and entry.data.get('politics')
                    )
                )
                for feed in feeds
            )
        )

        # Retrieve base as BreakableEntry
        for base in day_entries:
            if base.feedname == basefeed and base.index == baseindex:
                break
        else:
            return 'Base entry isn\'t breakable', 500

        # Get the WHAT distance of all entries with base
        day_entries = [
            (entry, entry.what_distance(base))
            for entry in day_entries
        ]

        # Get all pairs of entries that are not base
        # ordering by dist_e1 + dist_e2 ascending
        # NOTE(review): the ordered list is then passed to sample(); if that
        # is random.sample the ordering does not favour closer pairs.
        # Confirm the intent.
        tests = [
            (e1, e2)

            for e1, e2, _ in sorted(
                (
                    (e1, e2, d1 + d2)
                    for i, (e1, d1) in enumerate(day_entries)
                    for j, (e2, d2) in enumerate(day_entries)
                    if i < j and e1 != base and e2 != base
                ), key=lambda t: t[2]
            )
        ]

        # Get a sample of as much as request.args.get('maxtests', MAX_TESTS_PAIRS)
        try:
            max_tests = int(request.args.get('maxtests'))
        except (TypeError, ValueError):
            # Argument missing (None) or not a number: use the default.
            max_tests = MAX_TESTS_PAIRS

        # sample() raises on a negative size, so clamp at 0.
        tests = sample(
            tests,
            max(0, min(len(tests), max_tests))
        )

        if not tests:
            # No more entries in this date
            return '%s|%d cannot be used for tagger' % (basefeed, baseindex), 400

        # Only filtered entries remain now

        return render_template(
            'tag_pairs.html',
            title='Pairs tagger',

            base=entry_fields(base),

            tests=[
                [entry_fields(e1), entry_fields(e2)]
                for e1, e2 in tests
            ],

            enumerate=enumerate,  # pass it to jinja
        )

    elif request.method == 'POST':
        if 'base' not in request.form:
            return 'Base not found in form inputs', 400

        # Form fields whose name matches regex_test carry the answers.
        answers = [
            (k, str(v))  # str it just in case
            for k, v in request.form.items()
            if regex_test.fullmatch(k)
        ]

        response = redirect(
            url_for('get_base_for_pairs') + '?maxtests=%d' % len(answers)
        )

        documents = [
            {
                'base': request.form['base'],
                'e1': request.form[testname + 'e1'],
                'e2': request.form[testname + 'e2'],
                'res': int(testvalue == '1'),  # 0 or 1
                'user': user,
            }
            for testname, testvalue in answers
            if testvalue in ('-1', '1')  # any other value is not stored
        ]

        # don't insert to tests_db without any values
        if documents:
            tests_db['pairs'].insert_many(documents)

        return response
Пример #7
0
def get_base_for_pairs():
    """Render a page of candidate base entries for the pairs tagger.

    Requires a ``user`` cookie. Candidates are the entries of every feed in
    ``feeds`` whose metadata has truthy ``newsbreaker`` and ``politics``
    values, optionally restricted to the ``date`` query argument. A sample of
    at most ``maxentries`` of them (default ``MAX_BASE_FOR_PAIRS``), rounded
    down to a multiple of three, is handed to the template in rows of three.

    Returns:
        The rendered ``get_base_for_pairs.html`` template, or a
        ``(message, status)`` tuple: 401 when the cookie is missing, 400 when
        no entry can be offered.
    """
    if 'user' not in request.cookies:
        return 'User must be logged in to tag news', 401

    entries = {
        feed: list(entries_metadata[feed.name])
        for feed in feeds
    }

    if 'date' in request.args:
        wanted_date = request.args['date']
        entries = {
            feed: [e for e in feed_entries if e.data.get('date') == wanted_date]
            for feed, feed_entries in entries.items()
        }

    # Load only the entries that are both filtered and tagged as politics.
    entries = load_entries(
        metadata_folder,
        it=(
            (
                feed,
                (
                    entry
                    for entry in feed_entries
                    if entry.data.get('newsbreaker', False)
                    and entry.data.get('politics')
                )
            )
            for feed, feed_entries in entries.items()
        )
    )

    try:
        max_entries = int(request.args.get('maxentries'))
    except (TypeError, ValueError):
        # Argument missing (None) or not a number: fall back to the default.
        max_entries = MAX_BASE_FOR_PAIRS

    # Never ask for more entries than exist, nor for a negative amount
    # (sample() raises on both), and keep a multiple of three so that the
    # rows below are always complete.
    max_entries = max(0, min(len(entries), max_entries))
    max_entries -= max_entries % 3

    tests = sample(entries, max_entries)

    if not tests:
        # No more entries in this date
        return '%s cannot be used for tagger' % request.args.get('date'), 400

    # Only filtered entries remain now

    rows = [
        [
            {
                'feedname': entry.feedname,
                'index': entry.index,
                'title': entry.title,
                'content': entry.content,
            }
            for entry in tests[start:start + 3]
        ]
        # len(tests) is a multiple of three, so every slice holds 3 entries
        for start in range(0, len(tests), 3)
    ]

    return render_template(
        'get_base_for_pairs.html',
        title='Base for pairs',

        tests=rows,

        maxtests=request.args.get('maxtests'),  # for tag_pairs_page

        enumerate=enumerate,  # pass it to jinja
    )
Пример #8
0
        if entry.feedname == feedname and entry.index == index:
            return entry
    else:
        raise KeyError(name)


if __name__ == '__main__':
    # Command line: <feedname> <index> [threshold]
    base_feedname, base_index = sys.argv[1], sys.argv[2]
    base_name = '|'.join((base_feedname, base_index))
    # for network; 0.6 unless a third argument is given
    threshold = 0.6 if len(sys.argv) < 4 else float(sys.argv[3])

    client = MongoClient()
    db = client.distances

    entries = load_entries('data')
    feed_names = sorted(set(entry.feedname for entry in entries))
    base_entry = get_entry(base_name)

    # Network
    result = {
        'threshold': threshold,
        'title': base_entry.title,
        'feedNames': feed_names,
        'nodes': [],
        'links': [],
    }

    # Every document of the 'network_<ids>' collection for the base entry.
    network_collection = getattr(
        db, 'network_%s' % entry_ids(base_entry, sep='_')
    )
    dists = list(network_collection.find())
Пример #9
0
    for entry in entries:
        if entry.feedname == feedname and entry.index == index:
            return entry
    else:
        raise KeyError((feedname, index))


if __name__ == '__main__':
    # Command line: <viz> <feedname> <index>
    viz, feedname, index = sys.argv[1], sys.argv[2], sys.argv[3]

    folder = 'data'
    init(os.path.join(folder, 'topic_model'), 'topic_model.pkl', 'vocab.txt')

    # Filter entries: only politics
    entries = [
        candidate
        for candidate in load_entries(folder)
        if candidate.data.get('politics')
    ]

    from pymongo import MongoClient
    mongo_client = MongoClient()

    dists_db = mongo_client.distances

    # The collection name is the three command-line arguments joined by '_'.
    collection = '_'.join((viz, feedname, index))
    col = getattr(dists_db, collection)

    base = get_entry(feedname, index)
    base_date = to_datetime(base.data['date'])