def main():
    """Translate a language's word list into English and store it in the DB.

    Steps, in order:
      1. Parse the command-line options.
      2. Fetch the word list for ``--lang`` (bounded by ``--min-index`` /
         ``--max-index``) and print pricing information for it.
      3. Connect to the database on ``--host`` and select the
         ``word_list_<lang>_forward`` collection, emptying it first when
         ``--remove`` is given.
      4. Translate the words to ``'en'`` in chunks of 1000, retrying each
         chunk up to ``MAX_FAILURES`` times, and write one document per
         word (``word``, ``count``, ``tx``) through a write buffer.

    Raises:
        Exception: if translating a single chunk fails ``MAX_FAILURES``
            times in a row.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', default='glossify.io',
                        help='server name where the database lives')
    # NOTE(review): --lang has no default and is not required; when omitted
    # args.lang is None and is passed straight through below. Confirm whether
    # it should be made mandatory.
    parser.add_argument('--lang', help='2-character language code')
    parser.add_argument('--min-index', default=None, type=int,
                        help='minimum word index')
    parser.add_argument('--max-index', default=200000, type=int,
                        help='maximum word index')
    parser.add_argument('--remove', action='store_true',
                        help='remove all documents before beginning?')
    args = parser.parse_args()

    # Get the words
    print("Getting words")
    word_list = get_word_list(args.lang,
                              min_index=args.min_index,
                              max_index=args.max_index)
    print_pricing_info(word_list)

    # Connect to DB
    print("Connecting to DB")
    # NOTE(review): the two string literals look like hard-coded credentials;
    # consider loading them from configuration or the environment.
    db = dbutils.DBConnect(args.host, 'tlemberg', 'tlemberg')
    coll = db["word_list_%s_forward" % args.lang]

    # Only announce the removal when it actually happens.
    if args.remove:
        print("Removing documents")
        coll.remove({})

    print("Translating")
    buf = dbutils.DBWriteBuffer(coll)
    progress = perf.ProgressDisplay(len(word_list))
    for word_list_chunk in dbutils.chunk_list(word_list, 1000):
        words = [tup[0] for tup in word_list_chunk]

        # Retry the translation call; the attempt budget is per chunk.
        # Every exception type is treated as retryable, although the
        # messages only mention SSL.
        for _ in range(MAX_FAILURES):
            try:
                tx_dict = pooled_translate(words, args.lang, 'en')
                break
            except Exception:
                traceback.print_exc()
                print("SSL Exception. Retrying.")
        else:
            # No attempt succeeded (or MAX_FAILURES allows none at all).
            raise Exception('Too many SSL Exceptions. Giving up.')

        for (word, count) in word_list_chunk:
            buf.append({
                'word': word,
                'count': count,
                'tx': tx_dict[word],
            })
            progress.advance(1)

    # Push out whatever is still sitting in the write buffer.
    buf.flush()
    print("Done")
def get_excerpt_dictionary():
    """Build a dictionary from the phrases referenced by the user's excerpts.

    Authenticates the caller, reads ``lang`` from the request form, collects
    the ``phrase_ids`` of every excerpt stored for that user and language,
    looks those ids up in the ``phrases_<lang>`` collection in chunks of
    1000, and merges the per-chunk results of
    ``dictionary.create_dictionary_from_cursor`` into one dict.

    Returns:
        The value of ``json_result`` for one of:
          * ``{'success': 0, 'error': 'authentication failed'}`` when
            ``verify_auth_token()`` returns ``None``;
          * ``{'success': 0, 'error': 'invalid parameters'}`` when the form
            has no ``lang`` field;
          * ``{'success': 1, 'result': <merged dict>}`` otherwise.
    """
    # Authenticate the user
    user_profile = verify_auth_token()
    if user_profile is None:
        return json_result({
            'success': 0,
            'error': 'authentication failed',
        })

    # Read the parameters; fail if the argument doesn't exist
    try:
        lang = request.form['lang']
    except KeyError:
        return json_result({
            'success': 0,
            'error': 'invalid parameters',
        })

    email = user_profile['email']
    excerpts = mongo.db.excerpts.find({'lang': lang, 'email': email})

    # Collect each phrase id once, in first-seen order. Excerpts can share
    # phrases, and sending duplicates to MongoDB only inflates the $in lists
    # and the number of chunks without changing which documents match.
    seen = set()
    phrase_ids = []
    for excerpt in excerpts:
        for phrase_id in excerpt['phrase_ids']:
            if phrase_id not in seen:
                seen.add(phrase_id)
                phrase_ids.append(phrase_id)
    print(len(phrase_ids))  # debug: number of distinct phrase ids

    # NOTE(review): `lang` comes straight from the request and is
    # interpolated into a collection name; consider validating it against
    # the set of supported language codes.
    coll = mongo.db["phrases_%s" % lang]

    d = {}
    for phrase_id_chunk in dbutils.chunk_list(phrase_ids, 1000):
        cursor = coll.find({'_id': {'$in': phrase_id_chunk}})
        d.update(dictionary.create_dictionary_from_cursor(lang, cursor))
    print(len(d))  # debug: number of entries in the merged dictionary

    return json_result({
        'success': 1,
        'result': d,
    })