Пример #1
0
def countPages(extractor, limit=-1):
    """Count the pages yielded by `extractor`, printing progress as it goes.

    A `Timer` is created before iteration starts and its `printTime()` is
    called once iteration finishes.

    Args:
        extractor: Iterable that yields one item per page.
        limit: Maximum number of pages to consume. Any value <= 0 means
            "no limit" and the extractor is read to exhaustion.

    Returns:
        The number of pages that were counted.
    """
    printerval = 1000  # print a progress line every `printerval` pages
    page_idx = 0
    t = Timer()
    for _ in extractor:
        page_idx += 1
        if page_idx % printerval == 0:
            # Single-argument print() behaves the same on Python 2 and 3.
            print("Counted " + str(page_idx) + " pages")
        # Stop as soon as `limit` pages have been consumed (not limit + 1).
        if 0 < limit <= page_idx:
            break
    t.printTime()
    return page_idx
Пример #2
0
def scan(extractor_filename, methods=None, startIdx=0, skipevery=1, offset=0):
    """Run the selected scan methods over every eligible page of a dump.

    For each page yielded by the `WikiTextExtractor`, every name in `methods`
    is looked up in `functionDict` and called with a shared `ScanContext`.
    The database connection is committed every 1000 pages (skipped pages
    count towards that total) and once more when iteration ends.

    Args:
        extractor_filename: Path handed to `wikiutils.WikiTextExtractor`.
        methods: Iterable of keys into `functionDict`; `None` means none.
        startIdx: Pages with an index below this value are skipped.
        skipevery: Only pages where `(index + offset) % skipevery == 0`
            are scanned.
        offset: Shift applied to the page index for the `skipevery` test.
            Progress lines are printed only when this is 0.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    if methods is None:
        methods = []
    page_idx = 0
    dbconn = dbconnect.MySQLDatabaseConnection.connectionWithConfiguration('local')
    extractor = wikiutils.WikiTextExtractor(extractor_filename)
    commit_idx = 0
    commit_ceil = 1000
    commit_timer = Timer()
    # Titles starting with "<namespace>:" are never scanned; a tuple lets
    # str.startswith test all prefixes in a single call.
    ignore_prefixes = tuple(
        namespace + ":"
        for namespace in 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    )
    # Model files live in an "lda" directory next to this source file.
    thisdir = os.path.dirname(os.path.realpath(__file__))
    id2word = gensim.corpora.Dictionary.load_from_text(os.path.join(thisdir, 'lda', 'results_wordids.txt.bz2'))
    lda = gensim.models.ldamodel.LdaModel.load(os.path.join(thisdir, 'lda', 'lda_1000'))
    ctx = ScanContext(extractor, dbconn, id2word, lda)

    for ex in extractor:
        title = extractor.titleForCurrentPage()
        skip_scan = (
            page_idx < startIdx
            or (page_idx + offset) % skipevery != 0
            or title.startswith(ignore_prefixes)
        )
        if not skip_scan:
            try:
                for m in methods:
                    functionDict[m](ctx)
            except Exception:
                # Best effort: a failing page must not abort the whole scan,
                # but keep the traceback and page identity for diagnosis.
                logging.exception("Scan failed on page %d (%s)", page_idx, title)

        page_idx += 1
        commit_idx += 1
        if commit_idx >= commit_ceil:
            commit_idx = 0
            dbconn.connection.commit()
            commit_timer.printTime()
            commit_timer = Timer()
            if offset == 0:
                print("\tScanning page %d: %s" % (page_idx, extractor.titleForCurrentPage()))
    # Flush whatever was written since the last periodic commit.
    dbconn.connection.commit()
    print("Scan complete!")
Пример #3
0
def findIambsForPages(ptext, pageID):
    """Collect the iambic runs found in every sentence of a page's text.

    The text is split on newlines; each piece is wrapped in a
    `textblob.TextBlob`, and each of its sentences is passed to
    `wordutils.extract_iambic_pentameter`.

    Args:
        ptext: Page text; paragraphs are assumed to be newline-separated.
        pageID: Not used by this function; kept for interface compatibility.

    Returns:
        A single flat list with the runs of all sentences, in document order.
    """
    iambic_runs = []
    t = Timer()

    for paragraph in ptext.split("\n"):
        t.begin("makeblob")
        blob = textblob.TextBlob(paragraph)
        t.end("makeblob")

        t.begin("find iambs")
        for sen in blob.sentences:
            # `sen.string` is the sentence as a plain string. Extend in place
            # rather than rebuilding the accumulated list for each sentence.
            iambic_runs.extend(wordutils.extract_iambic_pentameter(sen.string))
        t.end("find iambs")

    return iambic_runs
Пример #4
0
def extract_iambic_pentameter(sentence):
	"""Find runs of consecutive words in `sentence` that are iambic with ten syllables.

	A window `[run_start_index, run_end_index)` is slid over the
	whitespace-separated words. The window is reset whenever a word is missing
	from `d` or the window stops satisfying `is_iambic`. Each time the window
	is iambic and `line_sylcount` reports exactly ten syllables, a `PoemLine`
	is recorded. Once the window holds ten or more syllables its start is
	advanced until the remaining words are iambic again (or the window is
	empty).

	The sentence is only run through `parse` when the first ten-syllable
	window is found, since parsing is the expensive step. If the parse does
	not yield exactly one tag per word the sentence is abandoned.

	Args:
		sentence: The sentence as a plain string.

	Returns:
		A list of `PoemLine` objects, possibly empty.
	"""
	t = Timer()

	iambic_runs = []
	run_start_index = 0

	# Normalise whitespace so `words` and the parsed string agree.
	sentence = " ".join(sentence.split())
	words = sentence.split()
	if len(words) < 2:
		return iambic_runs

	# Filled in lazily by the first ten-syllable window.
	pos = None
	chnk = None

	for w, raw_word in enumerate(words):
		run_end_index = w + 1

		if make_safe(raw_word) not in d:
			# A word missing from `d` cannot be part of a run; restart after it.
			run_start_index = run_end_index
			continue

		t.begin("safety")
		raw_previous_words = words[run_start_index:run_end_index]
		# A real list (not a lazy map) because it is consumed more than once.
		safe_words = [make_safe(word) for word in raw_previous_words]
		t.end("safety")

		if not is_iambic(safe_words):
			run_start_index = run_end_index
			continue

		syllable_count = line_sylcount(safe_words)
		if syllable_count < 10:
			# Still too short; keep growing the window.
			continue

		if syllable_count == 10:
			## Optimization: Don't do the expensive sentence parsing until you need to
			t.begin("parse")
			if pos is None:
				t.begin("parse-itself")
				parsed = parse(sentence).split()[0]
				t.end("parse-itself")
				# Keep only tokens whose tag starts with a word character or '-'.
				tagged = [tok for tok in parsed if re.match(r'^[\w-]+', tok[1]) is not None]
				chnk = [tok[2] for tok in tagged]
				pos = [pos_map[tok[1]] if tok[1] in pos_map else tok[1] for tok in tagged]
				if len(words) != len(pos):
					# Tags cannot be aligned with the words; skip the sentence.
					return iambic_runs
			t.end("parse")

			t.begin("make-options")
			options = optionsForWords(words, pos, chnk, run_start_index, run_end_index)
			t.end("make-options")

			t.begin("append")
			newrun = PoemLine(" ".join(raw_previous_words), pos[run_start_index:run_end_index], options=options)
			iambic_runs.append(newrun)
			t.end("append")

		# The window is full: drop its first word, then keep dropping words
		# until what is left is iambic again or the window is empty.
		run_start_index += 1
		t.begin("advance")
		while run_end_index > run_start_index:
			remaining = words[run_start_index:run_end_index]
			if is_iambic([make_safe(word) for word in remaining]):
				break
			run_start_index += 1
		t.end("advance")

	return iambic_runs
Пример #5
0
import argparse
from util.benchmarking import Timer
import wikibard.wikibard as wikibard
import db.dbreader as dbreader
import db.dbconnect as dbconnect
import random
import codecs

# Command-line interface: a required poem count and an optional output path.
parser = argparse.ArgumentParser(
    description="Write a bunch of poems, see how long it takes",
)
# Positional: how many poems to generate.
parser.add_argument(
    'count',
    type=int,
    help="number of poems to write",
)
# Optional: file to write to; stays None when the flag is not given.
parser.add_argument(
    '--output',
    type=str,
    default=None,
    help="output file",
)

if __name__ == '__main__':
    args = parser.parse_args()
    t = Timer()
    t.begin("poems")
    outf = None
    if args.output:
        outf = codecs.open(args.output, 'w', 'utf-8')
    dbconfig = dbconnect.MySQLDatabaseConnection.dbconfigForName('local')
    dbconn = dbconnect.MySQLDatabaseConnection.connectionWithConfiguration('local')
    cursor = dbconn.connection.cursor()
    query = (
        """SELECT view_counts.id FROM view_counts INNER JOIN page_categories"""
        """ ON page_categories.id = view_counts.id WHERE view_counts.count < 202917"""
        """ ORDER BY view_counts.count DESC LIMIT %s;"""
        )
    values = (10000, )
    cursor.execute(query, values)
    res = cursor.fetchall()
    for i in range(args.count):