Exemplo n.º 1
0
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('--host', default='glossify.io',
		help='server name where the database lives')
	parser.add_argument('--lang',
		help='2-character language code')
	parser.add_argument('--min-index', default=None, type=int,
		help='minimum word index')
	parser.add_argument('--max-index', default=200000, type=int,
		help='maximum word index')
	parser.add_argument('--remove', action='store_true',
		help='remove all documents before beginning?')
	args = parser.parse_args()

	# Get the words
	print "Getting words"
	word_list = get_word_list(args.lang, min_index=args.min_index, max_index=args.max_index)
	print_pricing_info(word_list)

	# Connect to DB
	print "Connecting to DB"
	db = dbutils.DBConnect(args.host, 'tlemberg', 'tlemberg')
	coll_name = "word_list_%s_forward" % args.lang
	coll = db[coll_name]

	print "Removing documents"
	if args.remove:
		coll.remove({})

	print "Translating"
	buf = dbutils.DBWriteBuffer(coll)
	progress = perf.ProgressDisplay(len(word_list))
	for word_list_chunk in dbutils.chunk_list(word_list, 1000):
		n_failures = 0
		while n_failures < MAX_FAILURES:
			try:
				tx_dict = pooled_translate([tup[0] for tup in word_list_chunk], args.lang, 'en')
				break
			except Exception as e:
				traceback.print_exc()
				print "SSL Exception. Retrying."
				n_failures += 1
				if n_failures == MAX_FAILURES:
					raise Exception('Too many SSL Exceptions. Giving up.')
		for (word, count) in word_list_chunk:
			tx = tx_dict[word]
			buf.append({
				'word': word,
				'count': count,
				'tx': tx,
			})
			progress.advance(1)
	buf.flush()

	print "Done"
Exemplo n.º 2
0
def get_excerpt_dictionary():

	# Authenticate the user
	user_profile = verify_auth_token()
	if user_profile is None:
		return json_result({
			'success': 0,
			'error'  : 'authentication failed',
		})

	try:
		# Read the parameters
		lang = request.form['lang']
	except KeyError:
		# Return failure if the arguments don't exist
		return json_result({
			'success': 0,
			'error'  : 'invalid parameters',
		})

	email = user_profile['email']

	excerpts = mongo.db.excerpts.find({
		'lang': lang,
		'email': email })

	phrase_ids = []
	for excerpt in excerpts:
		phrase_ids += excerpt['phrase_ids']

	print len(set(phrase_ids))

	coll = mongo.db["phrases_%s" % lang]

	d = {}
	for phrase_id_chunk in dbutils.chunk_list(phrase_ids, 1000):
		cursor = coll.find({ '_id': { '$in': phrase_id_chunk } })
		d.update(dictionary.create_dictionary_from_cursor(lang, cursor))

	print len(set(d.keys()))

	return json_result({
		'success': 1,
		'result' : d,
	})