def findIambsForPages(ptext, pageID):
    """Collect every iambic-pentameter run found in a page's text.

    ptext  -- the code-stripped page text; paragraphs separated by newlines.
    pageID -- page identifier (currently unused here; kept for the caller's
              interface).
    Returns a list of runs as produced by wordutils.extract_iambic_pentameter.
    """
    runs = []
    timer = Timer()
    for paragraph in ptext.split("\n"):
        timer.begin("makeblob")
        blob = textblob.TextBlob(paragraph)
        timer.end("makeblob")
        timer.begin("find iambs")
        # Each sentence is just a raw, nasty string of the code-stripped
        # sentence; scan each one independently for iambic runs.
        for sentence in blob.sentences:
            runs.extend(wordutils.extract_iambic_pentameter(sentence.string))
        timer.end("find iambs")
    # timer.printTime()
    return runs
def extract_iambic_pentameter(sentence):
    """Scan *sentence* for word runs that form a 10-syllable iambic line.

    Walks the sentence word by word, growing a candidate run
    [run_start_index, run_end_index). When the run is iambic and totals
    exactly 10 syllables it is recorded as a PoemLine (with POS tags and
    substitution options); runs that overshoot 10 syllables or break the
    iambic pattern are reset.

    Returns a list of PoemLine objects (empty for sentences of < 2 words
    or when the POS tagging doesn't line up with the tokenization).
    """
    t = Timer()
    iambic_runs = []
    run_start_index = 0
    run_end_index = 0
    # [x[1] for x in parse(sentence, chunks=False, tokenize=False).split()[0]]
    # City of Tshwane; Metropolitan Municipality official website
    ## Get just the parts of speech of just the first sentence

    # Normalize whitespace so word indices line up with the parser's tokens.
    sentence = " ".join(sentence.strip().split())
    words = sentence.split()
    if len(words) < 2:
        return iambic_runs

    has_parsed_sentence = False
    for w in range(len(words)):
        raw_word = words[w]
        safe_word = make_safe(raw_word)
        run_end_index = w + 1
        # Only consider words present in the pronunciation dictionary `d`.
        if safe_word in d:
            t.begin("safety")
            raw_previous_words = words[run_start_index:run_end_index]
            # FIX: materialize the map — under Python 3 a bare map() is a
            # one-shot iterator and would be exhausted by is_iambic() before
            # the line_sylcount() calls below ever saw a word.
            safe_words = list(map(make_safe, raw_previous_words))
            t.end("safety")
            if is_iambic(safe_words):
                if line_sylcount(safe_words) >= 10:
                    if line_sylcount(safe_words) == 10:
                        ## Optimization: Don't do the expensive sentence
                        ## parsing until you need to
                        t.begin("parse")
                        if not has_parsed_sentence:
                            t.begin("parse-itself")
                            pos = parse(sentence).split()[0]
                            t.end("parse-itself")
                            # FIX: materialize the filter — it is iterated
                            # twice below (chnk, then pos); a Python 3 filter
                            # object would be empty on the second pass.
                            pos = list(filter(
                                lambda x: re.match(r'^[\w-]+', x[1]) is not None,
                                pos))
                            chnk = [x[2] for x in pos]
                            pos = [x[1] for x in pos]
                            # Remap tags through pos_map where applicable.
                            pos = [p if p not in pos_map else pos_map[p]
                                   for p in pos]
                            has_parsed_sentence = True
                            # Tokenization mismatch: indices would be wrong,
                            # bail out of this sentence entirely.
                            if len(words) != len(pos):
                                # print("Skipping sentence ''" + sentence + "''")
                                return iambic_runs
                        t.end("parse")

                        t.begin("other")
                        # NOTE(review): hasVerb / leadChunk / lagChunk are
                        # computed but not used in this visible block —
                        # presumably consumed in an earlier revision; confirm
                        # before removing.
                        hasVerb = u'VB' in pos[run_start_index:run_end_index]
                        leadChunk = None
                        if run_start_index > 0:
                            if chnk[run_start_index] == chnk[run_start_index - 1]:
                                leadChunk = chnk[run_start_index]
                        lagChunk = None
                        if run_end_index < len(pos) - 1:
                            if chnk[run_end_index] == chnk[run_end_index + 1]:
                                lagChunk = chnk[run_end_index]
                        t.end("other")

                        t.begin("make-options")
                        options = optionsForWords(words, pos, chnk,
                                                  run_start_index,
                                                  run_end_index)
                        t.end("make-options")

                        t.begin("append")
                        newrun = PoemLine(" ".join(raw_previous_words),
                                          pos[run_start_index:run_end_index],
                                          options=options)
                        iambic_runs.append(newrun)
                        t.end("append")

                        # Slide the window start forward to look for the next
                        # (overlapping) iambic run ending at the same word.
                        run_start_index += 1
                        t.begin("advance")
                        # FIX: original condition was
                        # `run_end_index > run_end_index` — a self-comparison
                        # that is always False, so this advance loop was dead
                        # code. Compare against run_start_index instead.
                        while run_end_index > run_start_index:
                            raw_previous_words = words[run_start_index:run_end_index]
                            if is_iambic(list(map(make_safe, raw_previous_words))):
                                break
                            run_start_index += 1
                        t.end("advance")
                    else:
                        # Overshot 10 syllables (a polysyllabic word jumped
                        # past the target) — restart the run after this word.
                        run_start_index = run_end_index
            else:
                # Run broke the iambic pattern — restart after this word.
                run_start_index = run_end_index
    # if iambic_runs:
    #     t.printTime()
    return iambic_runs
# --- Script tail: sample random page IDs from the DB and write poems ---

# Optional UTF-8 output file; writing is skipped entirely when --output
# was not supplied.
outf = None
if args.output:
    outf = codecs.open(args.output, 'w', 'utf-8')

# Open a connection to the 'local' MySQL database configuration.
dbconfig = dbconnect.MySQLDatabaseConnection.dbconfigForName('local')
dbconn = dbconnect.MySQLDatabaseConnection.connectionWithConfiguration('local')
cursor = dbconn.connection.cursor()

# Candidate pages: joined to page_categories, below a hard-coded
# view-count ceiling (202917), ordered most-viewed first, capped by the
# LIMIT parameter (10000 IDs).
query = (
    """SELECT view_counts.id FROM view_counts INNER JOIN page_categories"""
    """ ON page_categories.id = view_counts.id WHERE view_counts.count < 202917"""
    """ ORDER BY view_counts.count DESC LIMIT %s;"""
)
values = (10000, )
cursor.execute(query, values)
res = cursor.fetchall()

# Generate args.count poems, each from one randomly chosen candidate page.
for i in range(args.count):
    random_id = random.sample(res, 1)[0][0]
    # random_id = 35607283
    poem = wikibard.poemForPageID(random_id, 'elizabethan', dbconfig, multi=True)
    # Resolve each poem line's ID back to its display text.
    lines = [dbreader.textForLineID(dbconn, p['id']) for p in poem]
    if outf:
        # Write: page title, blank line, poem lines, then a 3-line gap
        # before the next poem.
        outf.write(dbreader.pageTitleForPageID(dbconn, random_id))
        outf.write(u'\n\n')
        for l in lines:
            outf.write(l)
            outf.write(u'\n')
        outf.write(u"\n\n\n")

if outf:
    outf.close()

# NOTE(review): `t` is not defined in this chunk and no matching
# t.begin("poems") is visible — presumably a Timer created earlier in the
# file; confirm the begin/end pairing.
t.end("poems")
t.printTime()