    words = { k[0]: words[k] for k in words }
    for word in words:
        featurize[('w', word)] = words[word]
    lex = lexical.get_symbol_dist(text)
    for k in lex['lex']:
        featurize[('l', k)] = lex['lex'][k]
    featurize = feature_to_numeric(featurize)
    featurize = [ (k, featurize[k]) for k in featurize ]
    featurize = sorted(featurize, key=itemgetter(0))
    # Serialize as a sparse 'index:value' vector, sorted by feature index.
    vector = ' '.join([ '%d:%d' % (i, j) for i, j in featurize ])
    return (atuple['id'], vector)

if __name__ == '__main__':
    corpus = RedditMySQLCorpus()
    corpus.setup(**(cred.kwargs))
    corpus.create()

    pool = multiprocessing.Pool(16)
    print('set up pool')

    chunk = 100
    for reddit in [ 'worldnews', 'quantum', 'netsec', 'uwaterloo' ]:
        j = 0
        i = 0
        if reddit == 'worldnews':
            j = 275000  # presumably resumes a partially completed run
        while True:
            print('j=%d' % j)
            rows = corpus.run_sql('SELECT `comment`.`id` AS `id`, `body` AS `text` FROM `comment` '
                                  'LEFT JOIN `submission` ON (`comment`.`submission_id`=`submission`.`id`) '
                                  'LEFT JOIN `reddit` ON (`submission`.`reddit_id`=`reddit`.`id`) '
                                  'WHERE `reddit`.`name`= \'%s\''
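# Hypothetical sketch, not from the original script: feature_to_numeric is
# called above but its definition is not shown in this excerpt. It presumably
# maps each (type, name) feature key to a stable integer index so the
# features can be serialized as sparse 'index:value' pairs. The global index
# table below is an assumption, not the original implementation.
_feature_index = {}

def feature_to_numeric(featurize):
    numeric = {}
    for key, value in featurize.items():
        if key not in _feature_index:
            # First time this feature key is seen: assign the next free index.
            _feature_index[key] = len(_feature_index)
        numeric[_feature_index[key]] = value
    return numeric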
def fn(arg):
    st = time.clock()
    read = feature.simple.get_read_stats(arg['text'].encode('ascii', 'ignore'))
    et = time.clock()
    #tprint(('stats took %s' % (et - st)))
    return (arg['id'], read['ari'], read['flesch_reading_ease'],
            read['flesch_kincaid_grade_level'], read['gunning_fog_index'],
            read['smog_index'], read['coleman_liau_index'],
            read['lix'], read['rix'])

corpus1 = RedditMySQLCorpus()
corpus1.setup(**(cred.kwargs))
corpus1.create()

corpus2 = RedditMySQLCorpus()
corpus2.setup(**(cred.kwargs))
corpus2.create()

# Mode was omitted in the original; 'w' is assumed here since this file
# collects the benchmark results.
fp = open('data/results.csv', 'w')
for i in range(1000, 100000, 1000):
    st = time.clock()
    dgt = DataGetThread(corpus1, 'SELECT id, body AS text FROM comment',
                        None, limit=i)
    tt = []
    dgt.start()
    # Distinct loop variable: reusing i here would clobber the outer
    # benchmark loop counter.
    for n in range(0, multiprocessing.cpu_count()):
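# Hypothetical sketch, not from the original script: time.clock(), used in
# fn above, was deprecated in Python 3.3 and removed in 3.8. On current
# interpreters the same timing is written with time.perf_counter().
# fn_timed is an illustrative name, not a function from the original code.
def fn_timed(arg):
    st = time.perf_counter()
    read = feature.simple.get_read_stats(arg['text'].encode('ascii', 'ignore'))
    et = time.perf_counter()
    print('stats took %f s' % (et - st))
    return (arg['id'], read['ari'], read['flesch_reading_ease'],
            read['flesch_kincaid_grade_level'], read['gunning_fog_index'],
            read['smog_index'], read['coleman_liau_index'],
            read['lix'], read['rix'])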
                  flags=re.MULTILINE)
    aset = set()
    wngram = ngram.get_word_ngrams(text)
    for n in wngram['ngram_word']:
        for k in wngram['ngram_word'][n]:
            aset.add(('npos%d' % n, ' '.join(k)))
    words, clean_words = ngram.get_words(text)
    for word in words:
        aset.add(('pos', word))
    return aset

if __name__ == '__main__':
    corpus = RedditMySQLCorpus()
    corpus.setup(**(cred.kwargs))
    corpus.create()

    pool = multiprocessing.Pool(2)
    print('set up pool')

    chunk = 200
    feature_set = set()
    for reddit in [ 'worldnews', 'quantum', 'netsec', 'uwaterloo',
                    'gaming', 'news', 'AskReddit' ]:
        j = 0
        while True:
            print('j=%d' % j)
            rows = corpus.run_sql(
                'SELECT `body_pos` AS `text` FROM `comment_pos` '
                'LEFT JOIN `comment` ON (`comment`.`id`=`comment_pos`.`id`) '
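# Hypothetical sketch, not from the original scripts: the queries above
# splice the subreddit name into the SQL text via '%s' string formatting.
# Whether corpus.run_sql supports bound parameters is unknown; assuming
# direct access to a DB-API cursor (MySQLdb/PyMySQL style), the same query
# could be issued with driver-level parameter binding instead:
sql = ('SELECT `comment`.`id` AS `id`, `body` AS `text` FROM `comment` '
       'LEFT JOIN `submission` ON (`comment`.`submission_id`=`submission`.`id`) '
       'LEFT JOIN `reddit` ON (`submission`.`reddit_id`=`reddit`.`id`) '
       'WHERE `reddit`.`name` = %s')
cursor.execute(sql, (reddit,))  # cursor: assumed DB-API cursor, not shown above
rows = cursor.fetchall()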