示例#1
0
lang = 'en'

# Quick sanity check: HTML comments in wiki markup should be stripped by
# wiki2sentences; print whatever sentences survive, then bail out early.
sentences,tags = wpTextExtractor.wiki2sentences("<!-- See  -->\n<!-- PLEASE DO NOT CHANGE OBAMA'S NAME -->", determine_splitter(lang), True)
for s in sentences:
	print s
sys.exit(0)  # NOTE: everything below is dead code while this early exit is in place

#topics = ['Inauguration_of_Barack_Obama', 'Bill_Clinton', 'Black_Saturday_bushfires', 'Estradiol','Emma_Frost','Influenza','James','Brett_Favre']
topics = ['Barack_Obama']
shown = {}
shown2 = {}
shown3 = {}
for article in topics:
	# Resolve the revision id current at `date`, fetch that revision's raw
	# markup, and re-join the extracted sentences one per line.
	revid = wikipydia.query_revid_by_date(article, lang, date)
	print revid
	wikimarkup = wikipydia.query_text_raw_by_revid(revid, lang)['text']
	sentences,tags = wpTextExtractor.wiki2sentences(wikimarkup, determine_splitter(lang), True)
	wikimarkup = '\n'.join(sentences)
	print wikimarkup.encode('utf-8')

	# Scan for leftover {{...}} template markup that survived extraction.
	# NOTE(review): as visible here the loop body never modifies `wikimarkup`,
	# so any match would repeat forever -- presumably the (not shown)
	# continuation substitutes the match away; confirm before running.
	while True:
		m = re.search(r'{{[^{}]*}}', wikimarkup)
		if not m:
			break
		# Show roughly 100 characters of context on either side of the match,
		# clamped to the string boundaries.
		ss = m.start() - 100
		if ss < 0:
			ss = 0
		ee = m.end() + 100
		if ee > len(wikimarkup):
			ee = len(wikimarkup)
		#print wikimarkup[ss:m.start()], m.group(), wikimarkup[m.end():ee]
示例#2
0
def fetch_articles_on_date(topics, date, lang, output_dir, upperlimit, dryrun, retry=5, wait=5):
	if os.path.exists(output_dir):
		if not os.path.isdir(output_dir):
			sys.stderr.write(output_dir + " is not a directory\n")
			sys.exit(1)
	else:
		os.makedirs(output_dir)

	mark = {}
	success = 0
	articles = {}
	mark = {}
	for article, values in topics.items():
		if success >= upperlimit:
			break
		title = article

		# resolve redirects
		if not wikipydia.query_exists(title, lang):
			continue
		title = wikipydia.query_redirects(title, lang).replace(' ','_')

		if title in mark:
			continue
		mark[title] = True

		# the file prefix for output files
		file_prefix = urllib.quote(title.replace(' ','_').encode('utf8'), safe="%") # force / to be quoted and % not to be quoted
		if file_prefix.startswith('.'):
			file_prefix = "%2E" + file_prefix[1:]

		if dryrun:
			print file_prefix
			success += 1
			continue

		done = False
		no_retry = 0
		while not done and no_retry < retry:
			try:
				revid = values['thenid']
				if revid == 0:
					revid = wikipydia.query_revid_by_date_fallback(title, lang, date)
				wikimarkup = wikipydia.query_text_raw_by_revid(revid, lang)['text']
				done = True
			except:
				no_retry += 1
				time.sleep(wait)

		if not wikimarkup:
			print 'Retrieving', title, 'failed'
			print 'RevID:', revid
			print 'Date:', date.isoformat()
			continue
		try:
			sentences, tags, citations = wpTextExtractor.wiki2sentences(wikimarkup, determine_splitter(lang), True, True)
		except:
			sys.stdout.flush()
			sys.stdout.write('Failed retrieving the text from ' + title + '\n')
			traceback.print_exc()
			sys.stdout.flush()
			continue

		# substitute angle brackets with html-like character encodings
		#sentences = [re.sub('<', '&lt;', re.sub('>', '&gt;', s)) for s in sentences]
		#sentences.insert(0, urllib.unquote(file_prefix.replace('_',' ')) + '.')
		output_filename = os.path.join(output_dir, file_prefix + '.sentences')
		output = write_lines_to_file(output_filename, sentences)
		output_filename = os.path.join(output_dir, file_prefix + '.tags')
		output = write_lines_to_file(output_filename, tags)
		success += 1

		priorid = values['priorid']
		if priorid == 0:
			priorid = wikipydia.query_revid_by_date_fallback(title, lang, date - datetime.timedelta(days=15))
		articles[title] = {'score': values['score'], 'thenid': revid, 'priorid': priorid}
		sys.stderr.write('.')
	sys.stderr.write('\n')

	if not dryrun:
		if len(articles) > 1 or (len(articles) == 1 and output_dir != '.'):
			write_articles(articles, topics, os.path.join(output_dir, date.strftime('%Y-%m-%d') + '.articles.list'))