def countPages(extractor, limit=-1):
    """Count the pages yielded by extractor, reporting every printerval pages."""
    page_idx = 0
    printerval = 1000
    togo = printerval
    t = Timer()
    for ex in extractor:
        page_idx = page_idx + 1
        if togo <= 0:
            togo = printerval
            print "Counted " + str(page_idx) + " pages"
        togo = togo - 1
        if limit > 0 and page_idx > limit:
            break
    t.printTime()
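# Usage sketch for countPages (hedged): the dump path is a placeholder, and the
# extractor is assumed to be built the same way scan() builds it below.
#
#   extractor = wikiutils.WikiTextExtractor('/path/to/enwiki-pages-articles.xml')
#   countPages(extractor, limit=100000)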
def scan(extractor_filename, methods=[], startIdx=0, skipevery=1, offset=0):
    """Stream pages out of the dump, run each requested per-page method on a
    ScanContext, and commit to the database every commit_ceil pages."""
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    page_idx = 0
    dbconn = dbconnect.MySQLDatabaseConnection.connectionWithConfiguration('local')
    extractor = wikiutils.WikiTextExtractor(extractor_filename)
    commit_idx = 0
    commit_ceil = 1000
    commit_timer = Timer()
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    thisdir = "/".join(os.path.realpath(__file__).split("/")[:-1]) + "/"
    id2word = gensim.corpora.Dictionary.load_from_text(thisdir + 'lda/results_wordids.txt.bz2')
    lda = gensim.models.ldamodel.LdaModel.load(thisdir + 'lda/lda_1000')
    ctx = ScanContext(extractor, dbconn, id2word, lda)
    for ex in extractor:
        title = extractor.titleForCurrentPage()
        # Skip pages before startIdx, pages outside this worker's shard, and
        # pages in non-article namespaces
        skip_scan = False
        skip_scan = skip_scan or (page_idx < startIdx)
        skip_scan = skip_scan or (page_idx + offset) % skipevery != 0
        skip_scan = skip_scan or any([title.startswith(ignore + ":") for ignore in ignore_namespaces])
        if not skip_scan:
            try:
                for m in methods:
                    functionDict[m](ctx)
            except Exception as e:
                print e
        page_idx += 1
        commit_idx += 1
        if commit_idx >= commit_ceil:
            commit_idx = 0
            dbconn.connection.commit()
            commit_timer.printTime()
            commit_timer = Timer()
            if offset == 0:
                print "\tScanning page %d: %s" % (page_idx, extractor.titleForCurrentPage())
    dbconn.connection.commit()
    print "Scan complete!"
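# Usage sketch for scan (hedged): functionDict is assumed to map each name in
# `methods` to a per-page handler that takes a ScanContext; the key 'iambs'
# below is illustrative, not a confirmed entry. skipevery/offset shard the
# scan, so four workers would share skipevery=4 with offsets 0..3:
#
#   scan('/path/to/enwiki-pages-articles.xml', methods=['iambs'],
#        skipevery=4, offset=0)  # worker 0 of 4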
def findIambsForPages(ptext, pageID):
    """Collect iambic pentameter runs from every sentence of a page's text."""
    iambic_runs = []
    paragraphs = ptext.split("\n")
    t = Timer()
    for paragraph in paragraphs:
        t.begin("makeblob")
        blob = textblob.TextBlob(paragraph)
        t.end("makeblob")
        t.begin("find iambs")
        for sen in blob.sentences:
            ## Get the runs for each sentence
            ## Each sentence is just a raw, nasty string of the code-stripped sentence
            iambic_runs = iambic_runs + wordutils.extract_iambic_pentameter(sen.string)
        t.end("find iambs")
    # t.printTime()
    return iambic_runs
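# Usage sketch for findIambsForPages (hedged): ptext is the plain,
# markup-stripped page text with one paragraph per line; pageID is carried
# through for bookkeeping but unused in the body above.
#
#   runs = findIambsForPages(page_text, page_id)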
def extract_iambic_pentameter(sentence):
    """Slide a window over the sentence and collect every exactly-10-syllable
    iambic run as a PoemLine. Relies on module-level helpers (make_safe,
    is_iambic, line_sylcount, optionsForWords, PoemLine), the pronunciation
    dict d, the POS-remapping dict pos_map, and parse()."""
    t = Timer()
    iambic_runs = []
    run_start_index = 0
    run_end_index = 0
    # [x[1] for x in parse(sentence, chunks=False, tokenize=False).split()[0]]
    # City of Tshwane; Metropolitan Municipality official website
    ## Get just the parts of speech of just the first sentence
    sentence = " ".join(sentence.strip().split())
    words = sentence.split()
    if len(words) < 2:
        return iambic_runs
    has_parsed_sentence = False
    for w in range(len(words)):
        raw_word = words[w]
        safe_word = make_safe(raw_word)
        run_end_index = w + 1
        if safe_word in d:
            t.begin("safety")
            raw_previous_words = words[run_start_index:run_end_index]
            safe_words = map(make_safe, raw_previous_words)
            t.end("safety")
            if is_iambic(safe_words):
                if line_sylcount(safe_words) >= 10:
                    if line_sylcount(safe_words) == 10:
                        ## Optimization: don't do the expensive sentence parsing until you need to
                        t.begin("parse")
                        if not has_parsed_sentence:
                            t.begin("parse-itself")
                            pos = parse(sentence).split()[0]
                            t.end("parse-itself")
                            pos = filter(lambda x: re.match('^[\w-]+', x[1]) is not None, pos)
                            chnk = [x[2] for x in pos]
                            pos = [x[1] for x in pos]
                            pos = [p if p not in pos_map else pos_map[p] for p in pos]
                            has_parsed_sentence = True
                            if len(words) != len(pos):
                                # print("Skipping sentence ''" + sentence + "''")
                                return iambic_runs
                        t.end("parse")
                        t.begin("other")
                        hasVerb = u'VB' in pos[run_start_index:run_end_index]
                        leadChunk = None
                        if run_start_index > 0:
                            if chnk[run_start_index] == chnk[run_start_index - 1]:
                                leadChunk = chnk[run_start_index]
                        lagChunk = None
                        if run_end_index < len(pos) - 1:
                            if chnk[run_end_index] == chnk[run_end_index + 1]:
                                lagChunk = chnk[run_end_index]
                        t.end("other")
                        t.begin("make-options")
                        options = optionsForWords(words, pos, chnk, run_start_index, run_end_index)
                        t.end("make-options")
                        t.begin("append")
                        newrun = PoemLine(" ".join(raw_previous_words), pos[run_start_index:run_end_index], options=options)
                        iambic_runs.append(newrun)
                        t.end("append")
                    ## The window holds 10+ syllables: advance the start until
                    ## the remaining window is iambic again
                    run_start_index += 1
                    t.begin("advance")
                    while run_end_index > run_start_index:
                        raw_previous_words = words[run_start_index:run_end_index]
                        if is_iambic(map(make_safe, raw_previous_words)):
                            t.end("advance")
                            break
                        run_start_index += 1
                    t.end("advance")
            else:
                run_start_index = run_end_index
        else:
            run_start_index = run_end_index
    # if iambic_runs:
    #     t.printTime()
    return iambic_runs
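# Usage sketch for extract_iambic_pentameter (hedged): returns one PoemLine per
# exactly-10-syllable iambic window whose words all appear in the pronunciation
# dict d. PoemLine's fields aren't shown here, so the loop below only reprs it:
#
#   runs = extract_iambic_pentameter(u"A thing of beauty is a joy forever")
#   for run in runs:
#       print run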
import argparse
from util.benchmarking import Timer
import wikibard.wikibard as wikibard
import db.dbreader as dbreader
import db.dbconnect as dbconnect
import random
import codecs

parser = argparse.ArgumentParser(description="Write a bunch of poems, see how long it takes")
parser.add_argument('count', type=int, help="number of poems to write")
parser.add_argument('--output', type=str, default=None, help="output file")

if __name__ == '__main__':
    args = parser.parse_args()
    t = Timer()
    t.begin("poems")
    outf = None
    if args.output:
        outf = codecs.open(args.output, 'w', 'utf-8')
    dbconfig = dbconnect.MySQLDatabaseConnection.dbconfigForName('local')
    dbconn = dbconnect.MySQLDatabaseConnection.connectionWithConfiguration('local')
    cursor = dbconn.connection.cursor()
    query = (
        """SELECT view_counts.id FROM view_counts INNER JOIN page_categories"""
        """ ON page_categories.id = view_counts.id WHERE view_counts.count < 202917"""
        """ ORDER BY view_counts.count DESC LIMIT %s;"""
    )
    values = (10000, )
    cursor.execute(query, values)
    res = cursor.fetchall()
    for i in range(args.count):
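        # A hedged sketch of the truncated benchmark body: poemForPageID is a
        # hypothetical wikibard entry point (the real signature isn't shown in
        # this file), and 'neruda' is a placeholder style name. The poem is
        # assumed to come back as a list of unicode lines.
        pageID = random.choice(res)[0]
        poem = wikibard.poemForPageID(pageID, 'neruda', dbconfig)
        if outf:
            outf.write(u"\n".join(poem) + u"\n\n")
    t.end("poems")
    t.printTime()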