def feature_to_numeric(features):
    """Resolve each (type, feature) key in *features* to its database id.

    Looks up rows in ``feature_map_test`` matching the (type, feature)
    pairs and returns a dict mapping the row ``id`` to the value stored
    under that pair in *features*.

    :param features: mapping whose keys are ``(type, feature)`` 2-tuples.
    :return: dict of ``{row_id: features[(type, feature)]}``.
    """
    corpus = RedditMySQLCorpus()
    corpus.setup(**(cred.kwargs))
    keys = list(features)
    numeric = {}
    # Query in batches of 100 keys to keep each WHERE clause bounded.
    for start in range(0, len(keys), 100):
        batch = keys[start:start + 100]
        # Use driver-side parameterization (%s placeholders + params tuple,
        # as the INSERT elsewhere in this file already does) instead of the
        # original hand-rolled backslash/quote escaping, which is
        # injection-prone and easy to get wrong.
        where_clause = 'WHERE ' + '\n OR '.join(
            ' (`type` = %s AND `feature` = %s) ' for _ in batch)
        params = tuple(v for k in batch for v in (k[0], k[1]))
        rows = corpus.run_sql(
            'SELECT `id`, `type`, `feature` FROM `feature_map_test` '
            + where_clause, params)
        for row in rows:
            key = (row['type'], row['feature'])
            # Guard against rows matched by the clause but absent from the
            # input mapping (e.g. collation-insensitive matches).
            if key in features:
                numeric[row['id']] = features[key]
    return numeric
def feature_to_numeric(features):
    """Resolve each (type, feature) key in *features* to its database id.

    Queries ``feature_map_test`` for the given (type, feature) pairs in
    batches and returns ``{row_id: features[(type, feature)]}``.
    """
    corpus = RedditMySQLCorpus()
    corpus.setup(**(cred.kwargs))

    def _escape(text):
        # Same escaping, same order, as the original: backslash first,
        # then newline, then single quote.
        return text.replace('\\', '\\\\').replace('\n', '\\n').replace("'", "\\'")

    clauses = [
        " (`type` = '%s' AND `feature` = '%s') " % (key[0], _escape(key[1]))
        for key in features
    ]

    numeric = {}
    batch_size = 100
    for start in range(0, len(features), batch_size):
        where_clause = 'WHERE ' + '\n OR'.join(clauses[start:start + batch_size])
        query = ('SELECT `id`, `type`, `feature` FROM `feature_map_test` '
                 + where_clause)
        for row in corpus.run_sql(query, None):
            key = (row['type'], row['feature'])
            if key in features:
                numeric[row['id']] = features[key]
    return numeric
corpus.setup(**(cred.kwargs)) corpus.create() pool = multiprocessing.Pool(16) print('set up pool') chunk = 100 for reddit in ['worldnews', 'quantum', 'netsec', 'uwaterloo']: j = 0 i = 0 if reddit == 'worldnews': j = 275000 while True: print('j=%d' % j) rows = corpus.run_sql('SELECT `comment`.`id` AS `id`, `body` AS `text` FROM `comment` ' 'LEFT JOIN `submission` ON (`comment`.`submission_id`=`submission`.`id`) ' 'LEFT JOIN `reddit` ON (`submission`.`reddit_id`=`reddit`.`id`) ' 'WHERE `reddit`.`name`= \'%s\'' 'LIMIT %d, %d' % (reddit, j, chunk), None) if len(rows) == 0: break #for row in rows: # atuple = gen_feature(row) # pprint.pprint(atuple) it = pool.imap_unordered(gen_feature, rows) while True: try: atuple = it.next() corpus.run_sql('INSERT IGNORE INTO `comment_sparse_feature1` (`id`, `vector`) VALUES (%s, %s)', atuple) i += 1 #print('i=%d' % i)
pool = multiprocessing.Pool(2) print('set up pool') chunk = 200 feature_set = set() for reddit in [ 'worldnews', 'quantum', 'netsec', 'uwaterloo', 'gaming', 'news', 'AskReddit' ]: j = 0 while True: print('j=%d' % j) rows = corpus.run_sql( 'SELECT `body_pos` AS `text` FROM `comment_pos` ' 'LEFT JOIN `comment` ON (`comment`.`id`=`comment_pos`.`id`) ' 'LEFT JOIN `submission` ON (`comment`.`submission_id`=`submission`.`id`) ' 'LEFT JOIN `reddit` ON (`submission`.`reddit_id`=`reddit`.`id`) ' 'WHERE `reddit`.`name`= \'%s\'' 'LIMIT %d, %d' % (reddit, j, chunk), None) if len(rows) == 0: break it = pool.imap_unordered(gen_feature, rows, 10) new_feature_set = set() while True: try: atuple = it.next() new_feature_set = new_feature_set.union(atuple) except StopIteration: break print('calc difference') new_feature_set.difference_update(feature_set)
if __name__ == '__main__': corpus = RedditMySQLCorpus() corpus.setup(**(cred.kwargs)) corpus.create() pool = multiprocessing.Pool(2) print('set up pool') chunk = 200 feature_set = set() for reddit in ['worldnews', 'quantum', 'netsec', 'uwaterloo', 'gaming', 'news', 'AskReddit']: j = 0 while True: print('j=%d' % j) rows = corpus.run_sql('SELECT `body_pos` AS `text` FROM `comment_pos` ' 'LEFT JOIN `comment` ON (`comment`.`id`=`comment_pos`.`id`) ' 'LEFT JOIN `submission` ON (`comment`.`submission_id`=`submission`.`id`) ' 'LEFT JOIN `reddit` ON (`submission`.`reddit_id`=`reddit`.`id`) ' 'WHERE `reddit`.`name`= \'%s\'' 'LIMIT %d, %d' % (reddit, j, chunk), None) if len(rows) == 0: break it = pool.imap_unordered(gen_feature, rows, 10) new_feature_set = set() while True: try: atuple = it.next() new_feature_set = new_feature_set.union(atuple) except StopIteration: break print('calc difference') new_feature_set.difference_update(feature_set) print('difference calced')
'lix': (0, 100), 'rix': (0, 10) } indices = [ 'ari', 'flesch_reading_ease', 'flesch_kincaid_grade_level', 'gunning_fog_index', 'smog_index', 'coleman_liau_index', 'lix', 'rix' ] for i in indices: bin_width = (limits[i][1] - limits[i][0]) / 20 if bin_width <= 1: bin_width = 1 result = corpus.run_sql( 'SELECT COUNT(*) AS count, FLOOR(FLOOR(%s)/%s)*%s AS bin ' 'FROM comment_feature_read ' 'LEFT JOIN comment ON (comment.id=comment_feature_read.id) LEFT JOIN submission ON (submission.id=comment.submission_id) LEFT JOIN reddit ON (reddit.id=submission.reddit_id) ' 'WHERE reddit.name = \'netsec\' ' 'GROUP BY bin ORDER BY bin' % (i, bin_width, bin_width), None) print(i) old_values = [(r['bin'], r['count']) for r in result] values = [] for a, b in old_values: if a >= limits[i][0] and a <= limits[i][1]: values.append((a, b)) graph.hist_prebin('data/%s_hist' % i, values, bin_width, i, 'Frequency', 'Frequency of %s values' % i) result = corpus.run_sql( 'SELECT * FROM comment_feature_read ' 'LEFT JOIN comment ON (comment.id=comment_feature_read.id) LEFT JOIN submission ON (submission.id=comment.submission_id) LEFT JOIN reddit ON (reddit.id=submission.reddit_id) '
corpus.setup(**(cred.kwargs)) corpus.create() pool = multiprocessing.Pool(16) print('set up pool') chunk = 100 for reddit in ['worldnews', 'quantum', 'netsec', 'uwaterloo']: j = 0 i = 0 if reddit == 'worldnews': j = 275000 while True: print('j=%d' % j) rows = corpus.run_sql( 'SELECT `comment`.`id` AS `id`, `body` AS `text` FROM `comment` ' 'LEFT JOIN `submission` ON (`comment`.`submission_id`=`submission`.`id`) ' 'LEFT JOIN `reddit` ON (`submission`.`reddit_id`=`reddit`.`id`) ' 'WHERE `reddit`.`name`= \'%s\'' 'LIMIT %d, %d' % (reddit, j, chunk), None) if len(rows) == 0: break #for row in rows: # atuple = gen_feature(row) # pprint.pprint(atuple) it = pool.imap_unordered(gen_feature, rows) while True: try: atuple = it.next() corpus.run_sql( 'INSERT IGNORE INTO `comment_sparse_feature1` (`id`, `vector`) VALUES (%s, %s)', atuple) i += 1
'smog_index': (0, 20), 'coleman_liau_index': (-30, 30), 'lix': (0, 100), 'rix': (0, 10) } indices = ['ari', 'flesch_reading_ease', 'flesch_kincaid_grade_level', 'gunning_fog_index', 'smog_index', 'coleman_liau_index', 'lix', 'rix'] for i in indices: bin_width = (limits[i][1] - limits[i][0])/20 if bin_width <= 1: bin_width = 1 result = corpus.run_sql('SELECT COUNT(*) AS count, FLOOR(FLOOR(%s)/%s)*%s AS bin ' 'FROM comment_feature_read ' 'LEFT JOIN comment ON (comment.id=comment_feature_read.id) LEFT JOIN submission ON (submission.id=comment.submission_id) LEFT JOIN reddit ON (reddit.id=submission.reddit_id) ' 'WHERE reddit.name = \'netsec\' ' 'GROUP BY bin ORDER BY bin' % (i, bin_width, bin_width), None) print(i) old_values = [ (r['bin'], r['count']) for r in result ] values = [] for a, b in old_values: if a >= limits[i][0] and a <= limits[i][1]: values.append((a, b)) graph.hist_prebin('data/%s_hist' % i, values, bin_width, i, 'Frequency', 'Frequency of %s values' % i) result = corpus.run_sql('SELECT * FROM comment_feature_read ' 'LEFT JOIN comment ON (comment.id=comment_feature_read.id) LEFT JOIN submission ON (submission.id=comment.submission_id) LEFT JOIN reddit ON (reddit.id=submission.reddit_id) ' 'WHERE reddit.name = \'netsec\' '