def get_base_for_pairs():
    """Render a page of candidate base entries for the pairs tagger.

    Requires a ``user`` cookie; otherwise a 401 response is returned.

    Query arguments read from ``request.args``:
        date: optional; when present only entries whose ``data['date']``
            equals it are considered.
        maxentries: optional upper bound on the number of entries shown.
            Missing or non-integer values fall back to
            ``MAX_BASE_FOR_PAIRS``.
        maxtests: passed through untouched to the template.

    Only entries flagged both ``newsbreaker`` and ``politics`` are used.
    The entries are shown in groups of three, so the sample size is rounded
    down to a multiple of three.

    Returns:
        The rendered ``get_base_for_pairs.html`` template, or a
        ``(message, status)`` tuple with status 401 or 400.
    """
    if 'user' not in request.cookies:
        return 'User must be logged in to tag news', 401

    entries_by_feed = {
        feed: list(entries_metadata[feed.name])
        for feed in feeds
    }

    if 'date' in request.args:
        wanted_date = request.args['date']
        entries_by_feed = {
            feed: [e for e in feed_entries if e.data.get('date') == wanted_date]
            for feed, feed_entries in entries_by_feed.items()
        }

    # NOTE(review): the result is used with len() and sample() below, so
    # load_entries is presumably returning a sequence -- confirm.
    entries = load_entries(
        metadata_folder,
        it=(
            (
                feed,
                (
                    entry for entry in feed_entries
                    if entry.data.get('newsbreaker', False)
                    and entry.data.get('politics')
                ),
            )
            for feed, feed_entries in entries_by_feed.items()
        ),
    )

    try:
        max_entries = int(request.args.get('maxentries'))
    except (TypeError, ValueError):
        # TypeError: argument missing (None); ValueError: not an integer.
        max_entries = MAX_BASE_FOR_PAIRS

    # Clamp to [0, len(entries)]: sample() raises ValueError on a negative
    # size, which would surface as a 500 for e.g. ?maxentries=-1.
    max_entries = max(0, min(len(entries), max_entries))
    # Entries are displayed in rows of three.
    max_entries -= max_entries % 3

    tests = sample(entries, max_entries)

    if not tests:
        # No more entries in this date
        return '%s cannot be used for tagger' % request.args.get('date'), 400

    # Only filtered entries remain now
    return render_template(
        'get_base_for_pairs.html',
        title='Base for pairs',
        tests=[
            [
                {
                    'feedname': entry.feedname,
                    'index': entry.index,
                    'title': entry.title,
                    'content': entry.content,
                }
                for entry in tests[start:start + 3]
            ]
            # len(tests) is a multiple of 3, so every slice is a full row.
            for start in range(0, len(tests), 3)
        ],
        maxtests=request.args.get('maxtests'),  # for tag_pairs_page
        enumerate=enumerate,  # pass it to jinja
    )
def get_entry(name, sep='|'):
    """Look up an entry in the module-level ``entries`` by its name.

    Args:
        name: identifier of the form ``<feedname><sep><index>``.
        sep: separator between the feed name and the integer index.

    Returns:
        The first entry whose ``feedname`` and ``index`` both match.

    Raises:
        KeyError: if no entry matches; the key is ``name``.
        ValueError: if ``name`` does not split into exactly two parts or
            the index part is not an integer.
    """
    feedname, raw_index = name.split(sep)
    wanted_index = int(raw_index)

    found = next(
        (
            candidate for candidate in entries
            if candidate.feedname == feedname
            and candidate.index == wanted_index
        ),
        None,
    )
    if found is None:
        raise KeyError(name)
    return found


if __name__ == '__main__':
    # Usage: <script> tests|<anything else> <feedname> <index> [multiple]
    action_tests = sys.argv[1] == 'tests'  # else, threshold

    base_feedname = sys.argv[2]
    base_index = sys.argv[3]
    base_name = '%s|%s' % (base_feedname, base_index)

    # Optional fourth argument; only forwarded to run_tests when truthy.
    multiple = None
    if len(sys.argv) >= 5:
        multiple = int(sys.argv[4])

    entries = load_entries('data')

    if not action_tests:
        print('Threshold:', threshold(base_name))
    elif multiple:
        run_tests(base_name, multiple=int(multiple))
    else:
        run_tests(base_name)
def tag_politics_page():
    """Serve and process the politics tagging page.

    Requires a ``user`` cookie; otherwise a 401 response is returned.

    GET:
        * ``?stats`` returns a JSON string with the user, the number of
          stored politics tags and the ratio of positive ones
          (``null`` when nothing has been tagged yet).
        * Otherwise renders ``tag_politics.html`` with a random sample of
          at most ``maxtests`` entries flagged ``newsbreaker`` (missing or
          non-integer ``maxtests`` falls back to ``MAX_TESTS_POLITICS``).
          Returns ``('Not a single entry to tag!', 500)`` when the sample
          is empty.

    POST:
        Stores every form answer whose value is ``'-1'`` or ``'1'`` in
        ``tests_db['politics']`` and redirects back to this page with
        ``maxtests`` set to the number of submitted answers.
    """
    if 'user' not in request.cookies:
        return 'User must be logged in to tag news', 401

    user = request.cookies['user']

    if request.method == 'GET':
        if 'stats' in request.args:
            # NOTE(review): Cursor.count() no longer exists in PyMongo 4;
            # switch to count_documents() if the driver is upgraded.
            collection = tests_db['politics']
            n = collection.find().count()
            # The POST branch stores 'res' as a bool, and MongoDB equality
            # does not match booleans against the number 1, so accept both.
            positives = collection.find(
                {'res': {'$in': [1, True]}}
            ).count()
            return json.dumps(
                {
                    'user': user,
                    # Avoid ZeroDivisionError on an empty collection.
                    'politics_ratio': positives / n if n else None,
                    'count': n,
                },
                indent=2,
            )

        try:
            maxtests = int(request.args.get('maxtests'))
        except (TypeError, ValueError):
            # TypeError: argument missing (None); ValueError: not an integer.
            maxtests = MAX_TESTS_POLITICS

        candidates = [
            e
            for feed in feeds
            for e in entries_metadata[feed.name]
            if e.data.get('newsbreaker', False)
        ]
        # Bound the sample by the filtered population (and by zero):
        # sample() raises ValueError when asked for more items than it has
        # or for a negative amount.
        tests = sample(candidates, max(0, min(len(candidates), maxtests)))

        # Only filtered entries remain now
        if not tests:
            return 'Not a single entry to tag!', 500

        tests = load_entries(
            metadata_folder,
            it=(
                (
                    feed,
                    (entry for entry in tests if entry.feedname == feed.name),
                )
                for feed in feeds
            ),
        )

        return render_template(
            'tag_politics.html',
            title='Politics tagger',
            tests=[
                {
                    'feedname': entry.feedname,
                    'index': entry.index,
                    'title': entry.title,
                    'content': entry.content,
                }
                for entry in tests
            ],
            enumerate=enumerate,  # pass it to jinja
        )

    elif request.method == 'POST':
        answers = [
            (k, str(v))  # str it just in case
            for k, v in request.form.items()
            if regex_test.fullmatch(k)
        ]

        response = redirect(
            url_for('tag_politics_page') + '?maxtests=%d' % len(answers)
        )

        if not answers:
            return response

        # Keep only explicit answers: '-1' -> False, '1' -> True.
        documents = [
            {
                'entry': request.form[testname + 'entry'],
                'res': testvalue == '1',
                'user': user,
            }
            for testname, testvalue in answers
            if testvalue in ('-1', '1')
        ]

        if not documents:
            return response  # don't insert to tests_db without any values

        tests_db['politics'].insert_many(documents)

        return response
def tag_pairs_page():
    """Serve and process the pairs tagging page.

    Requires a ``user`` cookie; otherwise a 401 response is returned.
    On first use ``newsbreaker.init`` is called with the topic model files
    under ``metadata_folder``.

    GET:
        * ``?stats`` returns a JSON string with the user and the number of
          stored pair tags.
        * Otherwise a base entry is chosen from ``feed``/``index`` (each
          picked at random when missing), every ``newsbreaker`` +
          ``politics`` entry of the same date is loaded, and
          ``tag_pairs.html`` is rendered with a random sample of at most
          ``maxtests`` pairs of non-base entries (missing or non-integer
          ``maxtests`` falls back to ``MAX_TESTS_PAIRS``).
          Invalid input yields a ``(message, 400)`` tuple; a base that is
          not among the loaded entries yields ``(message, 500)``.

    POST:
        Requires a ``base`` form field (400 otherwise). Stores every form
        answer whose value is ``'-1'`` or ``'1'`` in ``tests_db['pairs']``
        and redirects to ``get_base_for_pairs`` with ``maxtests`` set to
        the number of submitted answers.
    """
    global newsbreaker_initialized

    if 'user' not in request.cookies:
        return 'User must be logged in to tag news', 401

    if not newsbreaker_initialized:
        newsbreaker.init(
            os.path.join(metadata_folder, 'topic_model'),
            'topic_model.pkl',
            'vocab.txt',
        )
        newsbreaker_initialized = True

    user = request.cookies['user']

    if request.method == 'GET':
        if 'stats' in request.args:
            n = tests_db['pairs'].find().count()
            return json.dumps({'user': user, 'count': n}, indent=2)

        try:
            basefeed = request.args.get('feed') or choice(feeds).name
            # Look the feed up inside the try block even when an index was
            # supplied, so an unknown feed is always reported as a 400
            # instead of escaping as an uncaught KeyError further down.
            feed_entries = entries_metadata[basefeed]

            requested_index = request.args.get('index')
            if requested_index:
                baseindex = int(requested_index)
            else:
                eligible = [
                    entry for entry in feed_entries
                    if entry.data.get('newsbreaker')
                    and entry.data.get('politics')
                ]
                baseindex = int(choice(eligible).index)
        except KeyError as e:
            return 'Invalid feed: %s' % e.args, 400
        except ValueError as e:
            return 'Invalid index: %s' % e.args, 400
        except IndexError:
            # choice() raises IndexError on an empty sequence: there is
            # nothing to pick a base from.
            return 'No entry can be used as base', 400

        base_metadata = next(
            (meta for meta in feed_entries if meta.index == baseindex),
            None,
        )
        if base_metadata is None:
            return '%s|%d not found' % (basefeed, baseindex), 400

        if not base_metadata.data.get('newsbreaker', False):
            return '%s|%d isn\'t a filtered entry' % (basefeed, baseindex), 400

        selected_date = base_metadata.data['date']

        # Get all entries in the same date as base that are breakable.
        # Materialised as a list because it is iterated more than once.
        day_entries = list(load_entries(
            metadata_folder,
            it=(
                (
                    feed,
                    (
                        entry for entry in entries_metadata[feed.name]
                        if entry.data.get('date') == selected_date
                        and entry.data.get('newsbreaker', False)
                        and entry.data.get('politics')
                    ),
                )
                for feed in feeds
            ),
        ))

        # Retrieve base among the loaded entries
        base = next(
            (
                entry for entry in day_entries
                if entry.feedname == basefeed and entry.index == baseindex
            ),
            None,
        )
        if base is None:
            return 'Base entry isn\'t breakable', 500

        # WHAT distance of every non-base entry to base
        others = [
            (entry, entry.what_distance(base))
            for entry in day_entries
            if entry != base
        ]

        # All unordered pairs of non-base entries,
        # ordered by dist_e1 + dist_e2 ascending.
        # NOTE(review): sample() below draws at random, so this ordering
        # does not influence which pairs are shown -- confirm the intent.
        scored_pairs = sorted(
            (
                (e1, e2, d1 + d2)
                for i, (e1, d1) in enumerate(others)
                for e2, d2 in others[i + 1:]
            ),
            key=lambda t: t[2],
        )
        tests = [(e1, e2) for e1, e2, _ in scored_pairs]

        try:
            max_tests = int(request.args.get('maxtests'))
        except (TypeError, ValueError):
            # TypeError: argument missing (None); ValueError: not an integer.
            max_tests = MAX_TESTS_PAIRS

        # Clamp at zero: sample() raises ValueError on a negative size.
        tests = sample(tests, max(0, min(len(tests), max_tests)))

        if not tests:
            # No more entries in this date
            return '%s|%d cannot be used for tagger' % (basefeed, baseindex), 400

        def as_dict(entry):
            # Fields handed to the template for one entry.
            return {
                'feedname': entry.feedname,
                'index': entry.index,
                'title': entry.title,
                'content': entry.content,
            }

        # Only filtered entries remain now
        return render_template(
            'tag_pairs.html',
            title='Pairs tagger',
            base=as_dict(base),
            tests=[[as_dict(e1), as_dict(e2)] for e1, e2 in tests],
            enumerate=enumerate,  # pass it to jinja
        )

    elif request.method == 'POST':
        if 'base' not in request.form:
            return 'Base not found in form inputs', 400

        answers = [
            (k, str(v))  # str it just in case
            for k, v in request.form.items()
            if regex_test.fullmatch(k)
        ]

        response = redirect(
            url_for('get_base_for_pairs') + '?maxtests=%d' % len(answers)
        )

        if not answers:
            return response

        # Keep only explicit answers; 'res' is stored as 0 or 1.
        documents = [
            {
                'base': request.form['base'],
                'e1': request.form[testname + 'e1'],
                'e2': request.form[testname + 'e2'],
                'res': int(testvalue == '1'),
                'user': user,
            }
            for testname, testvalue in answers
            if testvalue in ('-1', '1')
        ]

        if not documents:
            return response  # don't insert to tests_db without any values

        tests_db['pairs'].insert_many(documents)

        return response
def tag_politics_page():
    """Serve and process the politics tagging page.

    Requires a ``user`` cookie; otherwise a 401 response is returned.

    GET:
        * ``?stats`` returns a JSON string with the user, the number of
          stored politics tags and the ratio of positive ones
          (``null`` when nothing has been tagged yet).
        * Otherwise renders ``tag_politics.html`` with a random sample of
          at most ``maxtests`` entries flagged ``newsbreaker`` (missing or
          non-integer ``maxtests`` falls back to ``MAX_TESTS_POLITICS``).
          Returns ``('Not a single entry to tag!', 500)`` when the sample
          is empty.

    POST:
        Stores every form answer whose value is ``'-1'`` or ``'1'`` in
        ``tests_db['politics']`` and redirects back to this page with
        ``maxtests`` set to the number of submitted answers.
    """
    if 'user' not in request.cookies:
        return 'User must be logged in to tag news', 401

    user = request.cookies['user']

    if request.method == 'GET':
        if 'stats' in request.args:
            # NOTE(review): Cursor.count() no longer exists in PyMongo 4;
            # switch to count_documents() if the driver is upgraded.
            collection = tests_db['politics']
            n = collection.find().count()
            # The POST branch stores 'res' as a bool, and MongoDB equality
            # does not match booleans against the number 1, so accept both.
            positives = collection.find(
                {'res': {'$in': [1, True]}}
            ).count()
            return json.dumps(
                {
                    'user': user,
                    # Avoid ZeroDivisionError on an empty collection.
                    'politics_ratio': positives / n if n else None,
                    'count': n,
                },
                indent=2,
            )

        try:
            maxtests = int(request.args.get('maxtests'))
        except (TypeError, ValueError):
            # TypeError: argument missing (None); ValueError: not an integer.
            maxtests = MAX_TESTS_POLITICS

        candidates = [
            e
            for feed in feeds
            for e in entries_metadata[feed.name]
            if e.data.get('newsbreaker', False)
        ]
        # Bound the sample by the filtered population (and by zero):
        # sample() raises ValueError when asked for more items than it has
        # or for a negative amount.
        tests = sample(candidates, max(0, min(len(candidates), maxtests)))

        # Only filtered entries remain now
        if not tests:
            return 'Not a single entry to tag!', 500

        tests = load_entries(
            metadata_folder,
            it=(
                (
                    feed,
                    (entry for entry in tests if entry.feedname == feed.name),
                )
                for feed in feeds
            ),
        )

        return render_template(
            'tag_politics.html',
            title='Politics tagger',
            tests=[
                {
                    'feedname': entry.feedname,
                    'index': entry.index,
                    'title': entry.title,
                    'content': entry.content,
                }
                for entry in tests
            ],
            enumerate=enumerate,  # pass it to jinja
        )

    elif request.method == 'POST':
        answers = [
            (k, str(v))  # str it just in case
            for k, v in request.form.items()
            if regex_test.fullmatch(k)
        ]

        response = redirect(
            url_for('tag_politics_page') + '?maxtests=%d' % len(answers)
        )

        if not answers:
            return response

        # Keep only explicit answers: '-1' -> False, '1' -> True.
        documents = [
            {
                'entry': request.form[testname + 'entry'],
                'res': testvalue == '1',
                'user': user,
            }
            for testname, testvalue in answers
            if testvalue in ('-1', '1')
        ]

        if not documents:
            return response  # don't insert to tests_db without any values

        tests_db['politics'].insert_many(documents)

        return response
def tag_pairs_page():
    """Serve and process the pairs tagging page.

    Requires a ``user`` cookie; otherwise a 401 response is returned.
    On first use ``newsbreaker.init`` is called with the topic model files
    under ``metadata_folder``.

    GET:
        * ``?stats`` returns a JSON string with the user and the number of
          stored pair tags.
        * Otherwise a base entry is chosen from ``feed``/``index`` (each
          picked at random when missing), every ``newsbreaker`` +
          ``politics`` entry of the same date is loaded, and
          ``tag_pairs.html`` is rendered with a random sample of at most
          ``maxtests`` pairs of non-base entries (missing or non-integer
          ``maxtests`` falls back to ``MAX_TESTS_PAIRS``).
          Invalid input yields a ``(message, 400)`` tuple; a base that is
          not among the loaded entries yields ``(message, 500)``.

    POST:
        Requires a ``base`` form field (400 otherwise). Stores every form
        answer whose value is ``'-1'`` or ``'1'`` in ``tests_db['pairs']``
        and redirects to ``get_base_for_pairs`` with ``maxtests`` set to
        the number of submitted answers.
    """
    global newsbreaker_initialized

    if 'user' not in request.cookies:
        return 'User must be logged in to tag news', 401

    if not newsbreaker_initialized:
        newsbreaker.init(
            os.path.join(metadata_folder, 'topic_model'),
            'topic_model.pkl',
            'vocab.txt',
        )
        newsbreaker_initialized = True

    user = request.cookies['user']

    if request.method == 'GET':
        if 'stats' in request.args:
            n = tests_db['pairs'].find().count()
            return json.dumps({'user': user, 'count': n}, indent=2)

        try:
            basefeed = request.args.get('feed') or choice(feeds).name
            # Look the feed up inside the try block even when an index was
            # supplied, so an unknown feed is always reported as a 400
            # instead of escaping as an uncaught KeyError further down.
            feed_entries = entries_metadata[basefeed]

            requested_index = request.args.get('index')
            if requested_index:
                baseindex = int(requested_index)
            else:
                eligible = [
                    entry for entry in feed_entries
                    if entry.data.get('newsbreaker')
                    and entry.data.get('politics')
                ]
                baseindex = int(choice(eligible).index)
        except KeyError as e:
            return 'Invalid feed: %s' % e.args, 400
        except ValueError as e:
            return 'Invalid index: %s' % e.args, 400
        except IndexError:
            # choice() raises IndexError on an empty sequence: there is
            # nothing to pick a base from.
            return 'No entry can be used as base', 400

        base_metadata = next(
            (meta for meta in feed_entries if meta.index == baseindex),
            None,
        )
        if base_metadata is None:
            return '%s|%d not found' % (basefeed, baseindex), 400

        if not base_metadata.data.get('newsbreaker', False):
            return '%s|%d isn\'t a filtered entry' % (basefeed, baseindex), 400

        selected_date = base_metadata.data['date']

        # Get all entries in the same date as base that are breakable.
        # Materialised as a list because it is iterated more than once.
        day_entries = list(load_entries(
            metadata_folder,
            it=(
                (
                    feed,
                    (
                        entry for entry in entries_metadata[feed.name]
                        if entry.data.get('date') == selected_date
                        and entry.data.get('newsbreaker', False)
                        and entry.data.get('politics')
                    ),
                )
                for feed in feeds
            ),
        ))

        # Retrieve base among the loaded entries
        base = next(
            (
                entry for entry in day_entries
                if entry.feedname == basefeed and entry.index == baseindex
            ),
            None,
        )
        if base is None:
            return 'Base entry isn\'t breakable', 500

        # WHAT distance of every non-base entry to base
        others = [
            (entry, entry.what_distance(base))
            for entry in day_entries
            if entry != base
        ]

        # All unordered pairs of non-base entries,
        # ordered by dist_e1 + dist_e2 ascending.
        # NOTE(review): sample() below draws at random, so this ordering
        # does not influence which pairs are shown -- confirm the intent.
        scored_pairs = sorted(
            (
                (e1, e2, d1 + d2)
                for i, (e1, d1) in enumerate(others)
                for e2, d2 in others[i + 1:]
            ),
            key=lambda t: t[2],
        )
        tests = [(e1, e2) for e1, e2, _ in scored_pairs]

        try:
            max_tests = int(request.args.get('maxtests'))
        except (TypeError, ValueError):
            # TypeError: argument missing (None); ValueError: not an integer.
            max_tests = MAX_TESTS_PAIRS

        # Clamp at zero: sample() raises ValueError on a negative size.
        tests = sample(tests, max(0, min(len(tests), max_tests)))

        if not tests:
            # No more entries in this date
            return '%s|%d cannot be used for tagger' % (basefeed, baseindex), 400

        def as_dict(entry):
            # Fields handed to the template for one entry.
            return {
                'feedname': entry.feedname,
                'index': entry.index,
                'title': entry.title,
                'content': entry.content,
            }

        # Only filtered entries remain now
        return render_template(
            'tag_pairs.html',
            title='Pairs tagger',
            base=as_dict(base),
            tests=[[as_dict(e1), as_dict(e2)] for e1, e2 in tests],
            enumerate=enumerate,  # pass it to jinja
        )

    elif request.method == 'POST':
        if 'base' not in request.form:
            return 'Base not found in form inputs', 400

        answers = [
            (k, str(v))  # str it just in case
            for k, v in request.form.items()
            if regex_test.fullmatch(k)
        ]

        response = redirect(
            url_for('get_base_for_pairs') + '?maxtests=%d' % len(answers)
        )

        if not answers:
            return response

        # Keep only explicit answers; 'res' is stored as 0 or 1.
        documents = [
            {
                'base': request.form['base'],
                'e1': request.form[testname + 'e1'],
                'e2': request.form[testname + 'e2'],
                'res': int(testvalue == '1'),
                'user': user,
            }
            for testname, testvalue in answers
            if testvalue in ('-1', '1')
        ]

        if not documents:
            return response  # don't insert to tests_db without any values

        tests_db['pairs'].insert_many(documents)

        return response
def get_base_for_pairs():
    """Render a page of candidate base entries for the pairs tagger.

    Requires a ``user`` cookie; otherwise a 401 response is returned.

    Query arguments read from ``request.args``:
        date: optional; when present only entries whose ``data['date']``
            equals it are considered.
        maxentries: optional upper bound on the number of entries shown.
            Missing or non-integer values fall back to
            ``MAX_BASE_FOR_PAIRS``.
        maxtests: passed through untouched to the template.

    Only entries flagged both ``newsbreaker`` and ``politics`` are used.
    The entries are shown in groups of three, so the sample size is rounded
    down to a multiple of three.

    Returns:
        The rendered ``get_base_for_pairs.html`` template, or a
        ``(message, status)`` tuple with status 401 or 400.
    """
    if 'user' not in request.cookies:
        return 'User must be logged in to tag news', 401

    entries_by_feed = {
        feed: list(entries_metadata[feed.name])
        for feed in feeds
    }

    if 'date' in request.args:
        wanted_date = request.args['date']
        entries_by_feed = {
            feed: [e for e in feed_entries if e.data.get('date') == wanted_date]
            for feed, feed_entries in entries_by_feed.items()
        }

    # NOTE(review): the result is used with len() and sample() below, so
    # load_entries is presumably returning a sequence -- confirm.
    entries = load_entries(
        metadata_folder,
        it=(
            (
                feed,
                (
                    entry for entry in feed_entries
                    if entry.data.get('newsbreaker', False)
                    and entry.data.get('politics')
                ),
            )
            for feed, feed_entries in entries_by_feed.items()
        ),
    )

    try:
        max_entries = int(request.args.get('maxentries'))
    except (TypeError, ValueError):
        # TypeError: argument missing (None); ValueError: not an integer.
        max_entries = MAX_BASE_FOR_PAIRS

    # Clamp to [0, len(entries)]: sample() raises ValueError on a negative
    # size, which would surface as a 500 for e.g. ?maxentries=-1.
    max_entries = max(0, min(len(entries), max_entries))
    # Entries are displayed in rows of three.
    max_entries -= max_entries % 3

    tests = sample(entries, max_entries)

    if not tests:
        # No more entries in this date
        return '%s cannot be used for tagger' % request.args.get('date'), 400

    # Only filtered entries remain now
    return render_template(
        'get_base_for_pairs.html',
        title='Base for pairs',
        tests=[
            [
                {
                    'feedname': entry.feedname,
                    'index': entry.index,
                    'title': entry.title,
                    'content': entry.content,
                }
                for entry in tests[start:start + 3]
            ]
            # len(tests) is a multiple of 3, so every slice is a full row.
            for start in range(0, len(tests), 3)
        ],
        maxtests=request.args.get('maxtests'),  # for tag_pairs_page
        enumerate=enumerate,  # pass it to jinja
    )
if entry.feedname == feedname and entry.index == index: return entry else: raise KeyError(name) if __name__ == '__main__': base_feedname = sys.argv[1] base_index = sys.argv[2] base_name = '%s|%s' % (base_feedname, base_index) threshold = float(sys.argv[3]) if len(sys.argv) >= 4 else 0.6 # for network client = MongoClient() db = client.distances entries = load_entries('data') feed_names = sorted({entry.feedname for entry in entries}) base_entry = get_entry(base_name) # Network result = {} result['threshold'] = threshold result['title'] = base_entry.title result['feedNames'] = feed_names result['nodes'] = [] result['links'] = [] dists = list( getattr(db, 'network_%s' % entry_ids(base_entry, sep='_')).find() )
for entry in entries: if entry.feedname == feedname and entry.index == index: return entry else: raise KeyError((feedname, index)) if __name__ == '__main__': viz = sys.argv[1] feedname = sys.argv[2] index = sys.argv[3] folder = 'data' init(os.path.join(folder, 'topic_model'), 'topic_model.pkl', 'vocab.txt') entries = load_entries(folder) # Filter entries: only politics entries = [entry for entry in entries if entry.data.get('politics')] from pymongo import MongoClient mongo_client = MongoClient() dists_db = mongo_client.distances collection = '_'.join(sys.argv[1:4]) col = getattr(dists_db, collection) base = get_entry(feedname, index) base_date = to_datetime(base.data['date'])