Пример #1
0
def read_topics(topic_file, topics):
	"""Load topic records from a UTF-8 text file into the *topics* dict.

	Every non-blank line is split on whitespace and read as
	``title score [thenid [priorid]]``.  When ``thenid`` or ``priorid``
	is absent it is looked up with
	``wikipydia.query_revid_by_date_fallback`` (for ``date`` and for
	``date`` minus 15 days respectively) and stored as a string.

	NOTE(review): ``lang`` and ``date`` are not parameters of this
	function; the fallback branches resolve them from the enclosing
	module scope -- confirm they are defined there before relying on
	lines with fewer than four fields.

	Args:
		topic_file: path of the topic file to read.
		topics: dict updated in place; each title maps to a dict with
			the keys 'score' (int), 'thenid' and 'priorid' (strings).

	Raises:
		IndexError: a non-blank line has no score field.
		ValueError: the score field is not an integer.
	"""
	# Mode "r" rather than "rt": codecs.open() appends "b" itself, and the
	# resulting "rtb" is rejected by newer interpreters.  The with-block
	# guarantees the handle is closed even if a line fails to parse.
	with codecs.open(topic_file, "r", "utf-8") as f:
		for line in f:
			fields = line.split()
			if not fields:
				# blank (or whitespace-only) line: nothing to record
				continue
			title = fields[0]
			score = int(fields[1])
			if len(fields) >= 3:
				thenid = fields[2]
			else:
				# the page to look up is the title just parsed
				thenid = str(wikipydia.query_revid_by_date_fallback(title, lang, date))
			if len(fields) >= 4:
				priorid = fields[3]
			else:
				priorid = str(wikipydia.query_revid_by_date_fallback(title, lang, date - datetime.timedelta(days=15)))

			topics[title] = {'score': score, 'thenid': thenid, 'priorid': priorid}
Пример #2
0
def convert_topics(filename, lang):
	"""Print the topics listed in *filename* as an HTML table on stdout.

	Each input line is a title optionally followed by a space- or
	tab-separated page-view count.  Titles for which
	``wikipydia.query_exists`` returns a false value are skipped; the
	remaining ones are replaced by the result of
	``wikipydia.query_redirects`` and emitted as one table row with a
	"View Now" link.  When the file is named ``YYYY-MM-DD.topics`` the
	row also gets "View Then / Prior / Diff" links for the revision ids
	looked up for that date and for 15 days earlier.  For any language
	other than 'en' matching Google Translate links are added.

	Args:
		filename: path of the topic file; its basename may encode a date.
		lang: Wikipedia language code used as the host-name prefix.

	Raises:
		IOError: the file cannot be opened (previously masked by an
			UnboundLocalError raised from the cleanup code).
	"""
	# A basename of the form YYYY-MM-DD.topics supplies the snapshot date.
	date = None
	topics_re = re.compile(r'^([0-9]{4})-([0-9]{2})-([0-9]{2})\.topics$')
	m = topics_re.match(os.path.basename(filename))
	if m:
		date = datetime.date(int(m.group(1)), int(m.group(2)), int(m.group(3)))

	topic_line_re1 = re.compile("^(.+) ([0-9]+)$")
	topic_line_re2 = re.compile("^([^\t]+)\t([0-9]+)$")

	# Pieces shared by every link of a row.
	index_url = 'http://' + lang + '.wikipedia.org/w/index.php'
	translate_prefix = 'http://translate.google.com/translate?hl=en&sl=' + lang + '&tl=en&u=http%3A%2F%2F' + lang + '.wikipedia.org%2F'

	# Single-argument parenthesised print behaves identically as a
	# Python 2 statement, so the output is unchanged.
	with open(filename, 'r') as f:
		print("<table>")
		print("<tr><th>Rank</th><th>Titles</th><th>Actions</th></tr>")
		# The rank is the physical line number, so skipped titles leave gaps.
		for lineno, line in enumerate(f, 1):
			line = line.rstrip('\n')
			m = topic_line_re1.match(line) or topic_line_re2.match(line)
			if m:
				title = m.group(1)
				pageviews = int(m.group(2))
			else:
				title = line
				pageviews = None
			title = title.decode('utf8')
			if not wikipydia.query_exists(title, lang):
				continue
			title = wikipydia.query_redirects(title, lang)
			title = title.encode('utf8')
			escaped_title = urllib.quote(title.replace(' ','_'), safe="%") # force / to be quoted and % not to be quoted
			# The title is written as HTML text, so markup characters must be escaped.
			html_title = title.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
			# A count of zero is treated like a missing count (no score shown).
			if pageviews:
				print('<tr><td>%d</td><td><a href="http://%s.wikipedia.org/wiki/%s" target="view">%s<span class="score">%d</span></a></td>' % (lineno, lang, escaped_title, html_title, pageviews))
			else:
				print('<tr><td>%d</td><td><a href="http://%s.wikipedia.org/wiki/%s" target="view">%s</a></td>' % (lineno, lang, escaped_title, html_title))

			print('<td><span class="more">more</span><ul class="subnav">')
			print('\t<li><a href="http://%s.wikipedia.org/wiki/%s" target="view">View Now</a></li>' % (lang, escaped_title))
			if date:
				thenid = str(wikipydia.query_revid_by_date_fallback(title, lang, date))
				priorid = str(wikipydia.query_revid_by_date_fallback(title, lang, date - datetime.timedelta(days=15)))
				print('\t<li><a href="' + index_url + '?oldid=' + thenid + '" target="viewthen">View Then</a></li>')
				# A prior id of "0" gets plain text instead of links
				# (presumably "no earlier revision found" -- confirm).
				if priorid == "0":
					print('\t<li>View Prior</li>')
					print('\t<li>View Diff</li>')
				else:
					print('\t<li><a href="' + index_url + '?oldid=' + priorid + '" target="viewprior">View Prior</a></li>')
					print('\t<li><a href="' + index_url + '?diff=' + thenid + '&oldid=' + priorid + '" target="viewdiff">View Diff</a></li>')
			if lang != 'en':
				print('\t<li><a href="' + translate_prefix + 'wiki%2F' + escaped_title + '" target="translate">Translate Now</a></li>')
				if date:
					# NOTE(review): the "?" and "&" inside the u= value are not
					# percent-encoded; kept as-is, verify against the translate service.
					print('\t<li><a href="' + translate_prefix + 'w%2Findex.php?oldid=' + thenid + '" target="translatethen">Translate Then</a></li>')
					if priorid == "0":
						print('\t<li>Translate Prior</li>')
						print('\t<li>Translate Diff</li>')
					else:
						print('\t<li><a href="' + translate_prefix + 'w%2Findex.php?oldid=' + priorid + '" target="translateprior">Translate Prior</a></li>')
						print('\t<li><a href="' + translate_prefix + 'w%2Findex.php?diff=' + thenid + '&oldid=' + priorid + '" target="translatediff">Translate Diff</a></li>')
			print("</ul></td></tr>")
		print("</table>")
Пример #3
0
def fetch_articles_on_date(topics, date, lang, output_dir, upperlimit, dryrun, retry=5, wait=5):
	"""Fetch the dated revision of each topic and write its sentences and tags.

	For every title in *topics* (stopping after *upperlimit* successes)
	the title is passed through ``wikipydia.query_redirects``, the wiki
	markup of its 'thenid' revision is downloaded, split with
	``wpTextExtractor.wiki2sentences`` and written to
	``<output_dir>/<quoted title>.sentences`` and ``.tags``.  Finally the
	processed articles are handed to ``write_articles`` together with
	the path ``<output_dir>/<YYYY-MM-DD>.articles.list``.

	Args:
		topics: dict mapping a title to a dict with the keys 'score',
			'thenid' and 'priorid'.
		date: datetime.date used for fallback revision lookups and for
			the name of the article list.
		lang: Wikipedia language code passed to the wikipydia queries.
		output_dir: destination directory, created when missing.  The
			process exits with status 1 if the path exists but is not
			a directory.
		upperlimit: maximum number of articles counted as successful.
		dryrun: when true, only print each output file prefix; nothing
			is downloaded or written.
		retry: maximum number of download attempts per article.
		wait: seconds to sleep after every failed attempt.
	"""
	if os.path.exists(output_dir):
		if not os.path.isdir(output_dir):
			sys.stderr.write(output_dir + " is not a directory\n")
			sys.exit(1)
	else:
		os.makedirs(output_dir)

	success = 0
	articles = {}
	seen = set()  # resolved titles already handled (several topics may redirect to one page)
	for article, values in topics.items():
		if success >= upperlimit:
			break

		# resolve redirects
		if not wikipydia.query_exists(article, lang):
			continue
		title = wikipydia.query_redirects(article, lang).replace(' ','_')

		if title in seen:
			continue
		seen.add(title)

		# the file prefix for output files
		file_prefix = urllib.quote(title.encode('utf8'), safe="%") # force / to be quoted and % not to be quoted
		if file_prefix.startswith('.'):
			# avoid creating hidden files
			file_prefix = "%2E" + file_prefix[1:]

		if dryrun:
			print(file_prefix)
			success += 1
			continue

		# Reset per article: otherwise a failed download would reuse the
		# previous article's markup (or hit an unbound name on the first one).
		revid = None
		wikimarkup = None
		done = False
		no_retry = 0
		while not done and no_retry < retry:
			try:
				revid = values['thenid']
				# ids read from a topic file are strings, so accept "0" as well as 0
				if revid in (0, '0'):
					revid = wikipydia.query_revid_by_date_fallback(title, lang, date)
				wikimarkup = wikipydia.query_text_raw_by_revid(revid, lang)['text']
				done = True
			except Exception:
				# best effort: any query failure is retried, but
				# KeyboardInterrupt / SystemExit are no longer swallowed
				no_retry += 1
				time.sleep(wait)

		if not wikimarkup:
			print('Retrieving %s failed' % (title,))
			print('RevID: %s' % (revid,))
			print('Date: %s' % (date.isoformat(),))
			continue
		try:
			sentences, tags, citations = wpTextExtractor.wiki2sentences(wikimarkup, determine_splitter(lang), True, True)
		except Exception:
			sys.stdout.flush()
			sys.stdout.write('Failed retrieving the text from ' + title + '\n')
			traceback.print_exc()
			sys.stdout.flush()
			continue

		write_lines_to_file(os.path.join(output_dir, file_prefix + '.sentences'), sentences)
		write_lines_to_file(os.path.join(output_dir, file_prefix + '.tags'), tags)
		success += 1

		priorid = values['priorid']
		if priorid in (0, '0'):
			priorid = wikipydia.query_revid_by_date_fallback(title, lang, date - datetime.timedelta(days=15))
		articles[title] = {'score': values['score'], 'thenid': revid, 'priorid': priorid}
		sys.stderr.write('.')  # progress indicator
	sys.stderr.write('\n')

	if not dryrun:
		# A single article written into the current directory does not get a list file.
		if len(articles) > 1 or (len(articles) == 1 and output_dir != '.'):
			write_articles(articles, topics, os.path.join(output_dir, date.strftime('%Y-%m-%d') + '.articles.list'))