def train_classifiers(user):
    """Train one PPM classifier per subreddit from this user's documents.

    Parameters
    ----------
    user : dict
        Mapping that must contain a ``'username'`` key identifying the
        author whose documents are used as training data.

    Returns
    -------
    dict
        Maps each subreddit name to its trained ``RedditPPM`` classifier.
    """
    global args
    corpus = RedditMySQLCorpus()
    corpus.setup(**(cred.kwargs))
    cls = {}
    # FIX: the original iterated a bare `subreddits` name, which is a
    # NameError unless it happens to exist at module level; the sibling
    # implementation of this function uses `args.subreddits`, and `global
    # args` is already declared here, so use the same source of truth.
    for sr in args.subreddits:
        document = corpus.get_train_documents(
            args.type, user['username'], sr, args.c[0]).encode('utf-8')
        cl = RedditPPM()
        cl.train(document)
        cls[sr] = cl
    # Release the DB connection object explicitly before returning.
    del corpus
    return cls
def train_classifiers(user):
    """Build a trained RedditPPM classifier for every requested subreddit.

    ``user`` is a mapping with a ``'username'`` entry.  Returns a dict
    keyed by subreddit name whose values are the trained classifiers.
    """
    global args
    corpus = RedditMySQLCorpus()
    corpus.setup(**(cred.kwargs))
    classifiers = {}
    for subreddit in args.subreddits:
        text = corpus.get_train_documents(
            args.type, user['username'], subreddit, args.c[0])
        model = RedditPPM()
        model.train(text.encode('utf-8'))
        classifiers[subreddit] = model
    # Explicitly drop the corpus object before handing back the result.
    del corpus
    return classifiers
def feature_to_numeric(features):
    """Translate (type, feature) keys into numeric feature-map ids.

    Looks each ``(type, feature)`` pair up in the ``feature_map_test``
    table and returns a dict mapping the table's numeric ``id`` to the
    value stored under that pair in ``features``.
    """
    corpus = RedditMySQLCorpus()
    corpus.setup(**(cred.kwargs))

    def _quote(text):
        # Hand-rolled escaping for embedding a value in a SQL literal.
        # NOTE(review): this is string-built SQL; prefer parameterized
        # queries if run_sql supports placeholders — only \, newline and '
        # are escaped here.
        return text.replace('\\', '\\\\').replace('\n', '\\n').replace('\'', '\\\'')

    where_list = [
        ' (`type` = \'%s\' AND `feature` = \'%s\') ' % (kind, _quote(feat))
        for kind, feat in features
    ]

    numeric = {}
    # Query in batches of 100 predicates to keep each statement bounded.
    for start in range(0, len(features), 100):
        where_clause = 'WHERE ' + '\n OR'.join(where_list[start:start + 100])
        rows = corpus.run_sql(
            'SELECT `id`, `type`, `feature` FROM `feature_map_test` ' + where_clause,
            None)
        for row in rows:
            key = (row['type'], row['feature'])
            if key in features:
                numeric[row['id']] = features[key]
    return numeric
def feature_to_numeric(features):
    """Map each (type, feature) pair in ``features`` to its numeric id.

    The ``feature_map_test`` table assigns an integer ``id`` to every
    (type, feature) pair; the result is keyed by that id and carries the
    value ``features`` stored for the pair.
    """
    corpus = RedditMySQLCorpus()
    corpus.setup(**(cred.kwargs))

    predicates = []
    for ftype, fname in features:
        # Escape backslash, newline and single quote for the SQL literal.
        # NOTE(review): string-built SQL — parameterized queries would be
        # safer if run_sql accepts placeholders.
        escaped = (fname.replace('\\', '\\\\')
                        .replace('\n', '\\n')
                        .replace('\'', '\\\''))
        predicates.append(
            ' (`type` = \'%s\' AND `feature` = \'%s\') ' % (ftype, escaped))

    numeric = {}
    batch = 100  # predicates per SELECT, to bound statement size
    for offset in range(0, len(features), batch):
        clause = 'WHERE ' + '\n OR'.join(predicates[offset:offset + batch])
        rows = corpus.run_sql(
            'SELECT `id`, `type`, `feature` FROM `feature_map_test` ' + clause,
            None)
        for row in rows:
            pair = (row['type'], row['feature'])
            if pair not in features:
                continue
            numeric[row['id']] = features[pair]
    return numeric
words = ngram.get_word_ngram(text, n=1, clean=False) words = { k[0]: words[k] for k in words} for word in words: featurize[('w', word)] = words[word] lex = lexical.get_symbol_dist(text) for k in lex['lex']: featurize[('l', k)] = lex['lex'][k] featurize = feature_to_numeric(featurize) featurize = [(k, featurize[k]) for k in featurize] featurize = sorted(featurize, key=itemgetter(0)) vector = ' '.join(['%d:%d' % (i, j) for i, j in featurize]) return (atuple['id'], vector) if __name__ == '__main__': corpus = RedditMySQLCorpus() corpus.setup(**(cred.kwargs)) corpus.create() pool = multiprocessing.Pool(16) print('set up pool') chunk = 100 for reddit in ['worldnews', 'quantum', 'netsec', 'uwaterloo']: j = 0 i = 0 if reddit == 'worldnews': j = 275000 while True: print('j=%d' % j) rows = corpus.run_sql('SELECT `comment`.`id` AS `id`, `body` AS `text` FROM `comment` ' 'LEFT JOIN `submission` ON (`comment`.`submission_id`=`submission`.`id`) ' 'LEFT JOIN `reddit` ON (`submission`.`reddit_id`=`reddit`.`id`) '
atuple['text'], flags=re.MULTILINE) aset = set() wngram = ngram.get_word_ngrams(text) for n in wngram['ngram_word']: for k in wngram['ngram_word'][n]: aset.add(('npos%d' % n, ' '.join(k))) words, clean_words = ngram.get_words(text) for word in words: aset.add(('pos', word)) return set(aset) if __name__ == '__main__': corpus = RedditMySQLCorpus() corpus.setup(**(cred.kwargs)) corpus.create() pool = multiprocessing.Pool(2) print('set up pool') chunk = 200 feature_set = set() for reddit in [ 'worldnews', 'quantum', 'netsec', 'uwaterloo', 'gaming', 'news', 'AskReddit' ]: j = 0 while True: print('j=%d' % j) rows = corpus.run_sql( 'SELECT `body_pos` AS `text` FROM `comment_pos` '
from main import cred def fn(arg): st = time.clock() read = feature.simple.get_read_stats(arg['text'].encode('ascii', 'ignore')) et = time.clock() #tprint(('stats took %s' % (et - st))) return (arg['id'], read['ari'], read['flesch_reading_ease'], read['flesch_kincaid_grade_level'], read['gunning_fog_index'], read['smog_index'], read['coleman_liau_index'], read['lix'], read['rix']) corpus1 = RedditMySQLCorpus() corpus1.setup(**(cred.kwargs)) corpus1.create() corpus2 = RedditMySQLCorpus() corpus2.setup(**(cred.kwargs)) corpus2.create() fp = open('data/results.csv') for i in range(1000, 100000, 1000): st = time.clock() dgt = DataGetThread(corpus1, 'SELECT id, body AS text FROM comment', None,
st = time.clock() read = feature.simple.get_read_stats(arg['text'].encode('ascii', 'ignore')) et = time.clock() #tprint(('stats took %s' % (et - st))) return (arg['id'], read['ari'], read['flesch_reading_ease'], read['flesch_kincaid_grade_level'], read['gunning_fog_index'], read['smog_index'], read['coleman_liau_index'], read['lix'], read['rix']) corpus1 = RedditMySQLCorpus() corpus1.setup(**(cred.kwargs)) corpus1.create() corpus2 = RedditMySQLCorpus() corpus2.setup(**(cred.kwargs)) corpus2.create() fp = open('data/results.csv') for i in range(1000, 100000, 1000): st = time.clock() dgt = DataGetThread(corpus1, 'SELECT id, body AS text FROM comment', None, limit=i) tt = [] dgt.start()