def train_classifiers(user):
    global args
    corpus = RedditMySQLCorpus()
    corpus.setup(**(cred.kwargs))
    cls = {}
    # Train one PPM classifier per subreddit on this user's documents
    for sr in args.subreddits:
        document = corpus.get_train_documents(args.type, user['username'],
                                              sr, args.c[0]).encode('utf-8')
        cl = RedditPPM()
        cl.train(document)
        cls[sr] = cl
    del corpus
    return cls
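# A minimal usage sketch (an illustration, not part of the original script):
# given a row from the user list with a 'username' key, build that user's
# per-subreddit PPM classifiers. Assumes `args` has already been parsed by
# the argparse block in __main__; the username below is hypothetical.
def _example_train_usage():
    user = {'username': 'example_user'}
    classifiers = train_classifiers(user)
    for sr, cl in classifiers.items():
        print('trained PPM classifier for r/%s: %r' % (sr, cl))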
def feature_to_numeric(features):
    corpus = RedditMySQLCorpus()
    corpus.setup(**(cred.kwargs))
    # Build one WHERE term per (type, feature) pair, escaping backslashes,
    # newlines, and quotes for the SQL string literal
    where_list = []
    for k in features:
        where_list.append(' (`type` = \'%s\' AND `feature` = \'%s\') '
                          % (k[0], k[1].replace('\\', '\\\\')
                                       .replace('\n', '\\n')
                                       .replace('\'', '\\\'')))
    # Query the feature map in batches of 100 terms to keep statements short
    numeric = {}
    for x in range(0, len(features), 100):
        where_clause = 'WHERE ' + '\n OR'.join(where_list[x:x + 100])
        #print(where_clause)
        rows = corpus.run_sql(
            'SELECT `id`, `type`, `feature` FROM `feature_map_test` '
            + where_clause, None)
        for row in rows:
            if (row['type'], row['feature']) not in features:
                continue
            numeric[row['id']] = features[(row['type'], row['feature'])]
    return numeric
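# Usage sketch (illustrative only): feature_to_numeric() expects a dict keyed
# by (type, feature) tuples, in the same shape the n-gram extraction below
# produces, and returns {feature_id: value} using the ids stored in the
# `feature_map_test` table. The example keys and counts are hypothetical.
def _example_feature_lookup():
    features = {
        ('pos', 'the'): 3,        # word feature with a count of 3
        ('npos2', 'the cat'): 1,  # word-bigram feature with a count of 1
    }
    numeric = feature_to_numeric(features)
    print(numeric)  # e.g. {17: 3, 942: 1}, depending on the table contents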
        '\\1', atuple['text'], flags=re.MULTILINE)
    aset = set()
    wngram = ngram.get_word_ngrams(text)
    for n in wngram['ngram_word']:
        for k in wngram['ngram_word'][n]:
            aset.add(('npos%d' % n, ' '.join(k)))
    words, clean_words = ngram.get_words(text)
    for word in words:
        aset.add(('pos', word))
    return aset

if __name__ == '__main__':
    corpus = RedditMySQLCorpus()
    corpus.setup(**(cred.kwargs))
    corpus.create()
    pool = multiprocessing.Pool(2)
    print('set up pool')
    chunk = 200
    feature_set = set()
    for reddit in ['worldnews', 'quantum', 'netsec', 'uwaterloo',
                   'gaming', 'news', 'AskReddit']:
        j = 0
        while True:
            print('j=%d' % j)
            rows = corpus.run_sql(
        ranklist.append(rank)
    return ranklist

if __name__ == '__main__':
    # Command-line interface: document type, number of users to evaluate (n),
    # a document-count parameter (c), and one or more subreddits
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--type', choices=['submission', 'comment'],
                        required=True)
    parser.add_argument('n', type=int, nargs=1)
    parser.add_argument('c', type=int, nargs=1)
    parser.add_argument('subreddits', type=str, nargs='+')
    args = parser.parse_args(sys.argv[1:])

    corpus = RedditMySQLCorpus(multiprocessing.cpu_count())
    corpus.setup(**(cred.kwargs))
    corpus.create()

    print(args.n, args.c, args.subreddits)
    userlist = corpus.get_user_list(args.type, args.c[0], args.subreddits)
    userlist = userlist[:args.n[0]]
    print('Got users')
    pprint.pprint(userlist)

    # Fetch the grouped test documents for each subreddit up front, then
    # drop the shared connection before forking workers
    print('Downloading document list')
    corpora = {}
    for sr in args.subreddits:
        corpora[sr] = corpus.get_test_grouped_documents(args.type, sr)
        print('Downloaded %s' % sr)
    del corpus

    pool = multiprocessing.Pool(multiprocessing.cpu_count())
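    # Sketch of how the pool is presumably used next (an assumption; the
    # original script continues past this excerpt): train each user's
    # classifiers in parallel, then rank candidate authors against the
    # grouped test documents in `corpora`.
    #
    #   cls_per_user = pool.map(train_classifiers, userlist)
    #   pool.close()
    #   pool.join()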
from main import cred

def fn(arg):
    # Compute readability statistics for one row and return them as a
    # flat tuple, prefixed with the row id
    st = time.clock()
    read = feature.simple.get_read_stats(arg['text'].encode('ascii', 'ignore'))
    et = time.clock()
    #tprint(('stats took %s' % (et - st)))
    return (arg['id'], read['ari'], read['flesch_reading_ease'],
            read['flesch_kincaid_grade_level'], read['gunning_fog_index'],
            read['smog_index'], read['coleman_liau_index'],
            read['lix'], read['rix'])

corpus1 = RedditMySQLCorpus()
corpus1.setup(**(cred.kwargs))
corpus1.create()

corpus2 = RedditMySQLCorpus()
corpus2.setup(**(cred.kwargs))
corpus2.create()

fp = open('data/results.csv')
for i in range(1000, 100000, 1000):
    st = time.clock()
    dgt = DataGetThread(corpus1, 'SELECT id, body AS text FROM comment',