def test_starting_out(self):
    translator = Translator((100, 100))
    self.assertEqual(translator.global_origin_vector, Vector2D([50, 50]))
    self.assertEqual(translator.spacing, 10)
    translator = Translator((400, 400))
    self.assertEqual(translator.spacing, 40)
def test_starting_out(self):
    translator = Translator((100, 100))
    v1 = Vector2D([1, 1])
    translation1 = translator.translate(v1)
    self.assertEqual(translation1, Vector2D([60, 40]))
    v2 = Vector2D([-20, 0])
    translation2 = translator.translate(v2)
    self.assertEqual(translation2, Vector2D([-150, 50]))
def train(self, model: Seq2Seq, discriminator: Discriminator,
          src_file_names: List[str], tgt_file_names: List[str],
          unsupervised_big_epochs: int, print_every: int, save_every: int,
          num_words_in_batch: int, max_length: int, teacher_forcing: bool,
          save_file: str="model", n_unsupervised_batches: int=None,
          enable_unsupervised_backtranslation: bool=False):
    if self.main_optimizer is None or self.discriminator_optimizer is None:
        logger.info("Initializing optimizers...")
        self.main_optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                                         lr=self.main_lr, betas=self.main_betas)
        self.discriminator_optimizer = optim.RMSprop(discriminator.parameters(), lr=self.discriminator_lr)
    for big_epoch in range(unsupervised_big_epochs):
        src_batch_gen = BatchGenerator(src_file_names, num_words_in_batch, max_len=max_length,
                                       vocabulary=self.vocabulary, language="src",
                                       max_batch_count=n_unsupervised_batches)
        tgt_batch_gen = BatchGenerator(tgt_file_names, num_words_in_batch, max_len=max_length,
                                       vocabulary=self.vocabulary, language="tgt",
                                       max_batch_count=n_unsupervised_batches)
        logger.debug("Src batch:" + str(next(iter(src_batch_gen))))
        logger.debug("Tgt batch:" + str(next(iter(tgt_batch_gen))))
        timer = time.time()
        main_loss_total = 0
        discriminator_loss_total = 0
        epoch = 0
        for src_batch, tgt_batch in zip(src_batch_gen, tgt_batch_gen):
            model.train()
            discriminator_loss, losses = self.train_batch(model, discriminator, src_batch,
                                                          tgt_batch, teacher_forcing)
            main_loss = sum(losses)
            main_loss_total += main_loss
            discriminator_loss_total += discriminator_loss
            if epoch % save_every == 0 and epoch != 0:
                save_model(model, discriminator, self.main_optimizer,
                           self.discriminator_optimizer, save_file + ".pt")
            if epoch % print_every == 0 and epoch != 0:
                main_loss_avg = main_loss_total / print_every
                discriminator_loss_avg = discriminator_loss_total / print_every
                main_loss_total = 0
                discriminator_loss_total = 0
                diff = time.time() - timer
                timer = time.time()
                translator = Translator(model, self.vocabulary, self.use_cuda)
                logger.debug("Auto: " + translator.translate_sentence("you can prepare your meals here .", "src", "src"))
                logger.debug("Translated: " + translator.translate_sentence("you can prepare your meals here .", "src", "tgt"))
                logger.info('%s big epoch, %s epoch, %s sec, %.4f main loss, '
                            '%.4f discriminator loss, current losses: %s' %
                            (big_epoch, epoch, diff, main_loss_avg, discriminator_loss_avg, losses))
            epoch += 1
        save_model(model, discriminator, self.main_optimizer,
                   self.discriminator_optimizer, save_file + ".pt")
        if enable_unsupervised_backtranslation:
            self.current_translation_model = Translator(model, self.vocabulary, self.use_cuda)
            model = copy.deepcopy(model)
def __init__(self, fileName):
    self.prog = None
    self.iProgLine = 0
    self.symCommand = ""
    self.binCommand = ""
    self.nCommand = 0
    self.isComment = False
    self.symCommandType = -1
    self.translator = Translator()
    self.symTable = SymTable()
    self.newFile = fileName + ".hack"
    return None
def test_starting_out(self):
    translator = Translator((100, 100))
    v1 = Vector2D([1, 1])
    self.assertTrue(translator.is_in_range(v1))
    translator.update(Vector2D([10, 0]), 1)
    self.assertFalse(translator.is_in_range(v1))
    translator.update(Vector2D([0, 0]), 1)
    v2 = Vector2D([0, 5])
    self.assertTrue(translator.is_in_range(v2))
    v2.y += 1
    self.assertFalse(translator.is_in_range(v2))
def main():
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    use_cuda = torch.cuda.is_available()
    logging.info("Use CUDA: " + str(use_cuda))
    _, _, vocabulary = collect_vocabularies(
        src_vocabulary_path=opt.src_vocabulary,
        tgt_vocabulary_path=opt.tgt_vocabulary,
        all_vocabulary_path=opt.all_vocabulary,
        reset=False)
    if opt.src_to_tgt_dict is not None and opt.tgt_to_src_dict is not None:
        translator = WordByWordModel(opt.src_to_tgt_dict, opt.tgt_to_src_dict, vocabulary, opt.max_length)
    else:
        model, _, _, _ = load_model(opt.model, use_cuda)
        translator = Translator(model, vocabulary, use_cuda)
    input_filename = opt.input
    output_filename = opt.output
    lang = opt.lang
    tgt_lang = "src" if lang == "tgt" else "tgt"
    logging.info("Writing output...")
    with open(input_filename, "r", encoding="utf-8") as r, open(output_filename, "w", encoding="utf-8") as w:
        for line in r:
            translated = translator.translate_sentence(line, lang, tgt_lang)
            logging.debug(translated)
            w.write(translated + "\n")
def init_zero_supervised(vocabulary, save_file, use_cuda):
    model, discriminator = build_model(
        max_length=opt.max_length,
        output_size=vocabulary.size(),
        rnn_size=opt.rnn_size,
        encoder_n_layers=opt.layers,
        decoder_n_layers=opt.layers,
        dropout=opt.dropout,
        use_cuda=use_cuda,
        enable_embedding_training=bool(opt.sv_embedding_training),
        discriminator_hidden_size=opt.discriminator_hidden_size,
        bidirectional=bool(opt.bidirectional),
        use_attention=bool(opt.attention))
    if opt.src_embeddings is not None:
        load_embeddings(model,
                        src_embeddings_filename=opt.src_embeddings,
                        tgt_embeddings_filename=opt.tgt_embeddings,
                        vocabulary=vocabulary)
    model = model.cuda() if use_cuda else model
    discriminator = discriminator.cuda() if use_cuda else discriminator
    print_summary(model)
    trainer = Trainer(
        vocabulary,
        max_length=opt.max_length,
        use_cuda=use_cuda,
        discriminator_lr=opt.discriminator_lr,
        main_lr=opt.sv_learning_rate,
        main_betas=(opt.adam_beta1, 0.999),
    )
    if opt.sv_load_from:
        model, discriminator, main_optimizer, discriminator_optimizer = load_model(
            opt.sv_load_from, use_cuda)
        trainer.main_optimizer = main_optimizer
        trainer.discriminator_optimizer = discriminator_optimizer
    else:
        pair_file_names = [
            (opt.train_src_bi, opt.train_tgt_bi),
        ]
        trainer.train_supervised(model, discriminator, pair_file_names, vocabulary,
                                 num_words_in_batch=opt.sv_num_words_in_batch,
                                 max_length=opt.max_length,
                                 save_file=save_file,
                                 big_epochs=opt.supervised_epochs,
                                 print_every=opt.print_every,
                                 save_every=opt.save_every,
                                 max_batch_count=opt.n_supervised_batches)
    for param in model.parameters():
        param.requires_grad = False
    return Translator(model, vocabulary, use_cuda)
def run(self, interface, **kwargs):
    book = ChoseBook().run(interface, **kwargs)
    if not book.are_all_words_processed():
        interface.display_info(
            "The book is not fully processed. Please mark known words first. ")
        return
    translator = Translator()
    unknown_words_cnt = len(book.unknown_words)
    all_flashcards = defaultdict(list)
    for idx, word in enumerate(book.unknown_words):
        translation_units = translator.get_translation(word.stored_word)
        prompt = self.get_translation_choice_prompt(
            idx, unknown_words_cnt, translation_units)
        multiple_input_processor = MultipleInputProcessor(
            IntInRangeInputProcessor(valid_range=(0, len(translation_units))))
        choices = interface.get_input(
            prompt, input_processor=multiple_input_processor)
        for choice in choices:
            chosen_translation_unit = translation_units[choice]
            all_flashcards[", ".join(
                chosen_translation_unit.words)].append(", ".join(
                    chosen_translation_unit.meanings))
        word.mark_if_known(True)
    all_flashcards_in_final_format = [
        f"{key}={'/'.join(val)}" for key, val in all_flashcards.items()
    ]
    book.flashcards.extend(all_flashcards_in_final_format)
    db = Database()
    db.store_book(book)
    interface.display_info("FINISHED MAKING FLASHCARDS")
def test_starting_out(self):
    translator = Translator((100, 100))
    translator.update(Vector2D([0, 0]), 0.5)
    self.assertEqual(translator.spacing, 5)
    self.assertEqual(translator.local_origin_vector, translator.global_origin_vector)
    translator.update(Vector2D([5, 5]), 0.5)
    self.assertEqual(translator.spacing, 5)
    self.assertEqual(translator.local_origin_vector, Vector2D([75, 25]))
    translator.update(Vector2D([5, 5]), 2)
    self.assertEqual(translator.spacing, 20)
    self.assertEqual(translator.local_origin_vector, Vector2D([150, -50]))
def train_supervised(self, model, discriminator, pair_file_names, vocabulary: Vocabulary, *,
                     num_words_in_batch, big_epochs, max_length, max_batch_count=None,
                     save_every=100, print_every=100, save_file="model"):
    if self.main_optimizer is None:
        logger.info("Initializing optimizers...")
        self.main_optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                                         lr=self.main_lr, betas=self.main_betas)
        self.discriminator_optimizer = optim.RMSprop(discriminator.parameters(), lr=self.discriminator_lr)
    for big_epoch in range(big_epochs):
        batch_gen = BilingualBatchGenerator(pair_file_names, max_length, num_words_in_batch,
                                            vocabulary, languages=["src", "tgt"],
                                            max_batch_count=max_batch_count)
        timer = time.time()
        loss_total = 0
        epoch = 0
        model.train()
        for src_batch, tgt_batch in batch_gen:
            logger.debug("Src batch: " + str(src_batch))
            logger.debug("Tgt batch: " + str(tgt_batch))
            loss = self.train_supervised_batch(model, src_batch, tgt_batch)
            Batch.print_pair(src_batch, tgt_batch, self.vocabulary, "src-tgt")
            logger.debug("Loss: " + str(loss))
            loss_total += loss
            if epoch % save_every == 0 and epoch != 0:
                save_model(model, discriminator, self.main_optimizer,
                           self.discriminator_optimizer, save_file + "_supervised.pt")
            if epoch % print_every == 0 and epoch != 0:
                print_loss_avg = loss_total / print_every
                loss_total = 0
                diff = time.time() - timer
                timer = time.time()
                translator = Translator(model, self.vocabulary, self.use_cuda)
                logger.debug("Translated: " + translator.translate_sentence("you can prepare your meals here .", "src", "tgt"))
                logger.info('%s big epoch, %s epoch, %s sec, %.4f main loss' %
                            (big_epoch, epoch, diff, print_loss_avg))
            epoch += 1
        save_model(model, discriminator, self.main_optimizer,
                   self.discriminator_optimizer, save_file + "_supervised.pt")
def test_starting_out(self):
    translator = Translator((100, 100))
    self.assertEqual(translator.global_origin_vector, Vector2D([50, 50]))
    translator.update(Vector2D([10, 0]), 1)
    self.assertEqual(translator.spacing, 10)
    self.assertEqual(translator.local_origin_vector, Vector2D([150, 50]))
    translator.update(Vector2D([5, 5]), 1)
    self.assertEqual(translator.spacing, 10)
    self.assertEqual(translator.local_origin_vector, Vector2D([100, 0]))
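The assertions above pin down the coordinate mapping fairly precisely: the global origin sits at the centre of the screen, one grid step defaults to a tenth of the screen width, update(offset, zoom) rescales the spacing and re-anchors a local origin, translate maps grid coordinates to screen pixels with the y axis flipped, and is_in_range checks whether the translated point falls inside the screen rectangle. The actual Translator and Vector2D sources are not shown in this section, so the sketch below is only inferred from the tests (with a throwaway Vector2D stand-in); it is not the project's implementation.

# Minimal sketch of a Translator consistent with the tests above (an inference, not project code).
class Vector2D:
    """Tiny stand-in for the project's Vector2D (assumed interface)."""
    def __init__(self, coords):
        self.x, self.y = coords

    def __eq__(self, other):
        return self.x == other.x and self.y == other.y

    def __repr__(self):
        return f"Vector2D([{self.x}, {self.y}])"


class Translator:
    """Maps grid coordinates to screen pixels for a fixed-size window."""
    def __init__(self, dimensions):
        self.width, self.height = dimensions
        # Assumption: one grid step is a tenth of the screen width.
        self.base_spacing = self.width / 10
        self.spacing = self.base_spacing
        # Assumption: the global origin sits at the centre of the screen.
        self.global_origin_vector = Vector2D([self.width / 2, self.height / 2])
        self.local_origin_vector = self.global_origin_vector

    def update(self, offset, zoom):
        """Re-anchor the local origin after panning by `offset` grid units and zooming."""
        self.spacing = self.base_spacing * zoom
        self.local_origin_vector = Vector2D([
            self.global_origin_vector.x + offset.x * self.spacing,
            self.global_origin_vector.y - offset.y * self.spacing,
        ])

    def translate(self, vector):
        """Convert a grid vector to screen pixels (y axis points down on screen)."""
        return Vector2D([
            self.local_origin_vector.x + vector.x * self.spacing,
            self.local_origin_vector.y - vector.y * self.spacing,
        ])

    def is_in_range(self, vector):
        """True if the translated point lies inside the screen rectangle."""
        point = self.translate(vector)
        return 0 <= point.x <= self.width and 0 <= point.y <= self.height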
def main():
    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger("unmt")
    logger.propagate = False
    fh = logging.FileHandler(opt.log_file)
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    logger.addHandler(fh)
    logger.addHandler(ch)
    use_cuda = torch.cuda.is_available()
    logger.info("Use CUDA: " + str(use_cuda))
    _, _, vocabulary = collect_vocabularies(
        src_vocabulary_path=opt.src_vocabulary,
        tgt_vocabulary_path=opt.tgt_vocabulary,
        all_vocabulary_path=opt.all_vocabulary,
        src_file_names=(opt.train_src_mono, ),
        tgt_file_names=(opt.train_tgt_mono, ),
        src_max_words=opt.src_vocab_size,
        tgt_max_words=opt.tgt_vocab_size,
        reset=bool(opt.reset_vocabularies))
    if opt.src_to_tgt_dict is not None and opt.tgt_to_src_dict is not None:
        zero_model = WordByWordModel(opt.src_to_tgt_dict, opt.tgt_to_src_dict, vocabulary, opt.max_length)
    elif opt.bootstrapped_model is not None:
        model, discriminator, _, _ = load_model(opt.bootstrapped_model, use_cuda)
        for param in model.parameters():
            param.requires_grad = False
        zero_model = Translator(model, vocabulary, use_cuda)
    elif opt.train_src_bi is not None and opt.train_tgt_bi is not None:
        zero_model = init_zero_supervised(vocabulary, opt.save_model, use_cuda)
    else:
        assert False, "Zero model was not initialized"
    trainer = Trainer(vocabulary,
                      max_length=opt.max_length,
                      use_cuda=use_cuda,
                      discriminator_lr=opt.discriminator_lr,
                      main_lr=opt.learning_rate,
                      main_betas=(opt.adam_beta1, 0.999),)
    trainer.current_translation_model = zero_model
    model, discriminator = build_model(
        max_length=opt.max_length,
        output_size=vocabulary.size(),
        rnn_size=opt.rnn_size,
        encoder_n_layers=opt.layers,
        decoder_n_layers=opt.layers,
        dropout=opt.dropout,
        use_cuda=use_cuda,
        enable_embedding_training=bool(opt.usv_embedding_training),
        discriminator_hidden_size=opt.discriminator_hidden_size,
        bidirectional=bool(opt.bidirectional),
        use_attention=bool(opt.attention))
    if opt.src_embeddings is not None:
        load_embeddings(model,
                        src_embeddings_filename=opt.src_embeddings,
                        tgt_embeddings_filename=opt.tgt_embeddings,
                        vocabulary=vocabulary)
    model = model.cuda() if use_cuda else model
    print_summary(model)
    print_summary(discriminator)
    discriminator = discriminator.cuda() if use_cuda else discriminator
    if opt.usv_load_from:
        model, discriminator, main_optimizer, discriminator_optimizer = load_model(opt.usv_load_from, use_cuda)
        trainer.main_optimizer = main_optimizer
        trainer.discriminator_optimizer = discriminator_optimizer
    trainer.train(model, discriminator,
                  src_file_names=[opt.train_src_mono, ],
                  tgt_file_names=[opt.train_tgt_mono, ],
                  unsupervised_big_epochs=opt.unsupervised_epochs,
                  num_words_in_batch=opt.usv_num_words_in_batch,
                  print_every=opt.print_every,
                  save_every=opt.save_every,
                  save_file=opt.save_model,
                  n_unsupervised_batches=opt.n_unsupervised_batches,
                  enable_unsupervised_backtranslation=opt.enable_unsupervised_backtranslation,
                  teacher_forcing=bool(opt.teacher_forcing),
                  max_length=opt.max_length)
def __init__(self):
    self.cal = Calculator()
    self.data_maps = DataMaps()
    self.translator = Translator()
class Parser:
    def __init__(self, fileName):
        self.prog = None
        self.iProgLine = 0
        self.symCommand = ""
        self.binCommand = ""
        self.nCommand = 0
        self.isComment = False
        self.symCommandType = -1
        self.translator = Translator()
        self.symTable = SymTable()
        self.newFile = fileName + ".hack"
        return None

    def addData(self, data):
        self.prog = data
        return None

    def advance(self):
        # strip leading/trailing whitespace, remove "\n" characters
        currentLine = self.prog[self.iProgLine].strip()
        self.iProgLine += 1
        # do not parse empty lines and comments
        if ('//' in currentLine):
            i = currentLine.find('//')
            currentLine = currentLine[:i].strip()
        if ('/*' in currentLine and '*/' in currentLine):
            # drop an inline block comment, keeping the text around it
            i = currentLine.find('/*')
            j = currentLine.find('*/')
            currentLine = (currentLine[:i] + currentLine[j+2:]).strip()
        if ('/*' in currentLine):
            self.isComment = True
            return False
        if ('*/' in currentLine):
            self.isComment = False
            return False
        if (self.isComment):
            return False
        if (not currentLine):
            return False
        if (currentLine.startswith("(")):
            return False
        self.symCommand = currentLine
        # select correct type of instruction
        if (currentLine.startswith("@")):
            self.symCommandType = cType.A_COMMAND
        else:
            self.symCommandType = cType.C_COMMAND
        self.nCommand += 1
        return True

    def symbol(self):
        value = ""
        if (self.symCommandType == cType.A_COMMAND):
            value = self.symCommand.strip("@")
            if (not value.isdigit()):
                if (self.symTable.contains(value)):
                    value = self.symTable.getAddress(value)
                else:
                    value = self.symTable.addEntry(value)
        return '{0:016b}'.format(int(value))

    def instructions(self):
        if (self.symCommandType != cType.C_COMMAND):
            return ("", "", "")
        i = self.symCommand.find('=')
        j = self.symCommand.find(';')
        if (i == -1 and j == -1):
            dest = 'null'
            comp = self.symCommand
            jump = 'null'
            return (dest, comp, jump)
        if (i == -1):
            dest = 'null'
            comp = self.symCommand[:j]
            jump = self.symCommand[j+1:]
            return (dest, comp, jump)
        if (j == -1):
            dest = self.symCommand[:i]
            comp = self.symCommand[i+1:]
            jump = 'null'
            return (dest, comp, jump)
        dest = self.symCommand[:i]
        comp = self.symCommand[i+1:j]
        jump = self.symCommand[j+1:]
        return (dest, comp, jump)

    def commandToBinary(self):
        if (self.symCommandType == cType.C_COMMAND):
            (dest, comp, jump) = self.instructions()
            cCommandBinary = self.translator.translate(dest, comp, jump)
            return '111' + cCommandBinary
        else:
            return self.symbol()

    def parse(self):
        self.symTable.addData(self.prog)
        self.symTable.findLabels()
        hackFile = open(self.newFile, 'w')
        while (self.iProgLine < len(self.prog)):
            if (self.advance()):
                binCommand = self.commandToBinary()
                hackFile.write(binCommand)
                hackFile.write("\n")
        return
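For context, a minimal driver for this Parser might look like the following. The input file name "Add.asm" is only an example, and Translator, SymTable, and cType are assumed to be importable from the same assembler project; only the calls visible in the class above are exercised.

# Hypothetical usage sketch for the Parser class above (file name is an example).
with open("Add.asm", "r") as asm_file:
    program_lines = asm_file.readlines()

parser = Parser("Add")         # output will be written to "Add.hack"
parser.addData(program_lines)  # hand the raw source lines to the parser
parser.parse()                 # resolve symbols, translate, and write the .hack file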
from typing import List

from src.translator import Translator

DEFAULT_BF_SOURCES = 'etc/hello_world.bf'


def read_sources(filename: str) -> List[str]:
    """Read a source file character by character.

    :param filename: path to the source file
    :return: its content as a list of characters
    """
    bf_sources: List[str] = []
    with open(filename, 'r') as source:
        for line in source:
            for char in line:
                bf_sources.append(char)
    return bf_sources


if __name__ == '__main__':
    sources = read_sources(DEFAULT_BF_SOURCES)
    sources = Translator.sanitize(sources)
    c_sources = Translator.bf_to_c(sources)
    print(''.join(c_sources))
def runf1(conn, args):
    # evaluation dataset
    # english context so that answer is in english
    data = MLQADataset(args.dataset, 'en', args.langQuestion)
    # initialize searcher
    init(conn, 'wiki', args)
    # initialise reader
    print("Reader")
    reader = Reader(model="models/distilbert-base-uncased-distilled-squad/",
                    tokenizer="models/distilbert-uncased-my-tok")
    # initialise translator
    print("Translator")
    languages = {args.langQuestion, args.langSearch, 'en'}
    translator = Translator(languages)
    print("Translating between: {}".format(str(languages)))
    counters = {'f1': [], 'tally': 0, 'score': []}
    for doc in data.get():
        questionSearch = translator(doc['question'], args.langQuestion, args.langSearch)
        #print("questionSearch ", questionSearch.encode('utf-8'))
        search(conn, questionSearch, args.langSearch)
        if args.langSearch == 'en':
            questionRead = questionSearch
        else:
            questionRead = translator(doc['question'], args.langQuestion, 'en')
        #print("questionRead ", questionRead.encode('utf-8'))
        # recv = {'search':[{'id':qid, 'docs':[{'context':'...', 'title':'...', 'score':score}]}]
        bestScore = 0
        recv = recvall(conn)
        for n, docSearch in enumerate(recv['search'][0]['docs']):
            # reader answers the question given each context
            #print("n: ", n)
            #print("contextSearch ", docSearch['context'].encode('utf-8'))
            contextRead = translator(docSearch['context'], args.langSearch, 'en')
            #print("contextRead ", contextRead.encode('utf-8'))
            _, answerRead, score = reader(questionRead, contextRead)
            if score >= bestScore:
                bestScore = score
                bestAnswer = answerRead
                bestContext = contextRead
        #print("goldAnswer: ", doc['answer'].encode('utf-8'))
        #print("Answer: ", bestAnswer.encode('utf-8'))
        counters['f1'].append(f1_drqa(bestAnswer, doc['answer']))
        counters['tally'] += 1
        counters['score'].append(bestScore)
        # test
        if args.stop != 0 and counters['tally'] >= args.stop:
            print("Stopping at: ", counters['tally'])
            break
        #if i > 1:
        #    break
    f1 = np.array(counters['f1'])
    exact_match = f1[f1 == 1.0].sum() / f1.size
    print("Exact match: {}".format(exact_match))
    print("F1 mean: {}".format(f1.mean()))
    print("Mean score: {}".format(sum(counters['score']) / counters['tally']))
    print("Total: {}".format(counters['tally']))
    if args.save_as:
        print("Writing to: ", args.save_as)
        with open(args.save_as, "w") as fp:
            json.dump(counters, fp)
    close(conn, args.stop_server)
    return f1.mean()
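f1_drqa is not defined in this section. Judging by the exact-match computation above (f1 == 1.0), it is presumably the standard DrQA/SQuAD token-overlap F1; the sketch below follows that convention and is an assumption, not the project's own implementation.

# Assumed token-level F1, in the style of the DrQA/SQuAD evaluation scripts.
import re
import string
from collections import Counter


def normalize_answer(text):
    """Lowercase, strip punctuation and articles, and collapse whitespace."""
    text = text.lower()
    text = "".join(ch for ch in text if ch not in set(string.punctuation))
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    return " ".join(text.split())


def f1_drqa(prediction, ground_truth):
    """Harmonic mean of token precision and recall between prediction and gold answer."""
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)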
def main_train(): # Build argument parser parser = argparse.ArgumentParser(description='Train a table to text model') # Training corpus corpora_group = parser.add_argument_group('training corpora', 'Corpora related arguments; specify either unaligned or' ' aligned training corpora') # "Languages (type,path)" corpora_group.add_argument('--src_corpus_params', type=str, default='table, ./data/processed_data/train/train.box', help='the source unaligned corpus (type,path). Type = text/table') corpora_group.add_argument('--trg_corpus_params', type=str, default='text, ./data/processed_data/train/train.article', help='the target unaligned corpus (type,path). Type = text/table') corpora_group.add_argument('--src_para_corpus_params', type=str, default='', help='the source corpus of parallel data(type,path). Type = text/table') corpora_group.add_argument('--trg_para_corpus_params', type=str, default='', help='the target corpus of parallel data(type,path). Type = text/table') # Maybe add src/target type (i.e. text/table) corpora_group.add_argument('--corpus_mode', type=str, default='mono', help='training mode: "mono" (unsupervised) / "para" (supervised)') corpora_group.add_argument('--max_sentence_length', type=int, default=50, help='the maximum sentence length for training (defaults to 50)') corpora_group.add_argument('--cache', type=int, default=100000, help='the cache size (in sentences) for corpus reading (defaults to 1000000)') # Embeddings/vocabulary embedding_group = parser.add_argument_group('embeddings', 'Embedding related arguments; either give pre-trained embeddings,' ' or a vocabulary and embedding dimensionality to' ' randomly initialize them') embedding_group.add_argument('--metadata_path', type=str, default='', required=True, help='Path for bin file created in pre-processing phase, ' 'containing BPEmb related metadata.') # Architecture architecture_group = parser.add_argument_group('architecture', 'Architecture related arguments') architecture_group.add_argument('--layers', type=int, default=2, help='the number of encoder/decoder layers (defaults to 2)') architecture_group.add_argument('--hidden', type=int, default=600, help='the number of dimensions for the hidden layer (defaults to 600)') architecture_group.add_argument('--dis_hidden', type=int, default=150, help='Number of dimensions for the discriminator hidden layers') architecture_group.add_argument('--n_dis_layers', type=int, default=2, help='Number of discriminator layers') architecture_group.add_argument('--disable_bidirectional', action='store_true', help='use a single direction encoder') architecture_group.add_argument('--disable_backtranslation', action='store_true', help='disable backtranslation') architecture_group.add_argument('--disable_field_loss', action='store_true', help='disable backtranslation') architecture_group.add_argument('--disable_discriminator', action='store_true', help='disable discriminator') architecture_group.add_argument('--shared_enc', action='store_true', help='share enc for both directions') architecture_group.add_argument('--shared_dec', action='store_true', help='share dec for both directions') # Denoising denoising_group = parser.add_argument_group('denoising', 'Denoising related arguments') denoising_group.add_argument('--denoising_mode', type=int, default=1, help='0/1/2 = disabled/old/new') denoising_group.add_argument('--word_shuffle', type=int, default=3, help='shuffle words (only relevant in new mode)') denoising_group.add_argument('--word_dropout', type=float, default=0.1, help='randomly 
remove words (only relevant in new mode)') denoising_group.add_argument('--word_blank', type=float, default=0.2, help='randomly blank out words (only relevant in new mode)') # Optimization optimization_group = parser.add_argument_group('optimization', 'Optimization related arguments') optimization_group.add_argument('--batch', type=int, default=50, help='the batch size (defaults to 50)') optimization_group.add_argument('--learning_rate', type=float, default=0.0002, help='the global learning rate (defaults to 0.0002)') optimization_group.add_argument('--dropout', metavar='PROB', type=float, default=0.3, help='dropout probability for the encoder/decoder (defaults to 0.3)') optimization_group.add_argument('--param_init', metavar='RANGE', type=float, default=0.1, help='uniform initialization in the specified range (defaults to 0.1, 0 for module specific default initialization)') optimization_group.add_argument('--iterations', type=int, default=300000, help='the number of training iterations (defaults to 300000)') # Model saving saving_group = parser.add_argument_group('model saving', 'Arguments for saving the trained model') saving_group.add_argument('--save', metavar='PREFIX', help='save models with the given prefix') saving_group.add_argument('--save_interval', type=int, default=0, help='save intermediate models at this interval') # Logging/validation logging_group = parser.add_argument_group('logging', 'Logging and validation arguments') logging_group.add_argument('--log_interval', type=int, default=100, help='log at this interval (defaults to 1000)') logging_group.add_argument('--dbg_print_interval', type=int, default=1000, help='log at this interval (defaults to 1000)') logging_group.add_argument('--src_valid_corpus', type=str, default='') logging_group.add_argument('--trg_valid_corpus', type=str, default='') logging_group.add_argument('--print_level', type=str, default='info', help='logging level [debug | info]') # Other misc_group = parser.add_argument_group('misc', 'Misc. arguments') misc_group.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)') misc_group.add_argument('--cuda', type=str, default='cpu', help='device for training. 
default value: "cpu"') misc_group.add_argument('--bleu_device', type=str, default='', help='device for calculating BLEU scores in case a validation dataset is given') # Parse arguments args = parser.parse_args() logger = logging.getLogger() if args.print_level == 'debug': logging.basicConfig(stream=sys.stderr, level=logging.DEBUG) elif args.print_level == 'info': logging.basicConfig(stream=sys.stderr, level=logging.INFO) elif args.print_level == 'warning': logging.basicConfig(stream=sys.stderr, level=logging.WARNING) else: logging.basicConfig(stream=sys.stderr, level=logging.CRITICAL) # Validate arguments if args.src_corpus_params is None or args.trg_corpus_params is None: print("Must supply corpus") sys.exit(-1) args.src_corpus_params = args.src_corpus_params.split(',') args.trg_corpus_params = args.trg_corpus_params.split(',') assert len(args.src_corpus_params) == 2 assert len(args.trg_corpus_params) == 2 src_type, src_corpus_path = args.src_corpus_params trg_type, trg_corpus_path = args.trg_corpus_params src_type = src_type.strip() src_corpus_path = src_corpus_path.strip() trg_type = trg_type.strip() trg_corpus_path = trg_corpus_path.strip() assert src_type != trg_type assert (src_type in ['table', 'text']) and (trg_type in ['table', 'text']) corpus_size = get_num_lines(src_corpus_path + '.content') # Select device if torch.cuda.is_available(): device = torch.device(args.cuda) else: device = torch.device('cpu') if args.bleu_device == '': args.bleu_device = device current_time = str(datetime.datetime.now().timestamp()) run_dir = 'run_' + current_time + '/' train_log_dir = 'logs/train/' + run_dir + args.save valid_log_dir = 'logs/valid/' + run_dir + args.save train_writer = SummaryWriter(train_log_dir) valid_writer = SummaryWriter(valid_log_dir) # Create optimizer lists src2src_optimizers = [] trg2trg_optimizers = [] src2trg_optimizers = [] trg2src_optimizers = [] # Method to create a module optimizer and add it to the given lists def add_optimizer(module, directions=()): if args.param_init != 0.0: for param in module.parameters(): param.data.uniform_(-args.param_init, args.param_init) optimizer = torch.optim.Adam(module.parameters(), lr=args.learning_rate) for direction in directions: direction.append(optimizer) return optimizer assert os.path.isfile(args.metadata_path) metadata = torch.load(args.metadata_path) bpemb_en = metadata.init_bpe_module() word_dict: BpeWordDict = torch.load(metadata.word_dict_path) field_dict: LabelDict = torch.load(metadata.field_dict_path) args.hidden = bpemb_en.dim + bpemb_en.dim // 2 if not args.disable_bidirectional: args.hidden *= 2 # Load embedding and/or vocab # word_dict = BpeWordDict.get(vocab=bpemb_en.words) w_sos_id = {'text': word_dict.bos_index, 'table': word_dict.sot_index} word_embeddings = nn.Embedding(len(word_dict), bpemb_en.dim, padding_idx=word_dict.pad_index) nn.init.normal_(word_embeddings.weight, 0, 0.1) nn.init.constant_(word_embeddings.weight[word_dict.pad_index], 0) with torch.no_grad(): word_embeddings.weight[:bpemb_en.vs, :] = torch.from_numpy(bpemb_en.vectors) word_embedding_size = word_embeddings.weight.data.size()[1] word_embeddings = word_embeddings.to(device) word_embeddings.weight.requires_grad = False logger.debug('w_embeddings is running on cuda: %d', next(word_embeddings.parameters()).is_cuda) # field_dict: LabelDict = torch.load('./data/processed_data/train/field.dict') field_embeddings = nn.Embedding(len(field_dict), bpemb_en.dim // 2, padding_idx=field_dict.pad_index) nn.init.normal_(field_embeddings.weight, 0, 0.1) 
nn.init.constant_(field_embeddings.weight[field_dict.pad_index], 0) field_embedding_size = field_embeddings.weight.data.size()[1] field_embeddings = field_embeddings.to(device) field_embeddings.weight.requires_grad = True logger.debug('f_embeddings is running on cuda: %d', next(word_embeddings.parameters()).is_cuda) src_encoder_word_embeddings = word_embeddings trg_encoder_word_embeddings = word_embeddings src_encoder_field_embeddings = field_embeddings trg_encoder_field_embeddings = field_embeddings src_decoder_word_embeddings = word_embeddings trg_decoder_word_embeddings = word_embeddings src_decoder_field_embeddings = field_embeddings trg_decoder_field_embeddings = field_embeddings src_generator = LinearGenerator(args.hidden, len(word_dict), len(field_dict)).to(device) if args.shared_dec: trg_generator = src_generator add_optimizer(src_generator, (src2src_optimizers, trg2src_optimizers, trg2trg_optimizers, src2trg_optimizers)) else: trg_generator = LinearGenerator(args.hidden, len(word_dict), len(field_dict)).to(device) add_optimizer(src_generator, (src2src_optimizers, trg2src_optimizers)) add_optimizer(trg_generator, (trg2trg_optimizers, src2trg_optimizers)) logger.debug('src generator is running on cuda: %d', next(src_generator.parameters()).is_cuda) logger.debug('trg generator is running on cuda: %d', next(src_generator.parameters()).is_cuda) # Build encoder src_enc = RNNEncoder(word_embedding_size=word_embedding_size, field_embedding_size=field_embedding_size, hidden_size=args.hidden, bidirectional=not args.disable_bidirectional, layers=args.layers, dropout=args.dropout).to(device) if args.shared_enc: trg_enc = src_enc add_optimizer(src_enc, (src2src_optimizers, src2trg_optimizers, trg2trg_optimizers, trg2src_optimizers)) else: trg_enc = RNNEncoder(word_embedding_size=word_embedding_size, field_embedding_size=field_embedding_size, hidden_size=args.hidden, bidirectional=not args.disable_bidirectional, layers=args.layers, dropout=args.dropout).to(device) add_optimizer(src_enc, (src2src_optimizers, src2trg_optimizers)) add_optimizer(trg_enc, (trg2trg_optimizers, trg2src_optimizers)) logger.debug('encoder model is running on cuda: %d', next(src_enc.parameters()).is_cuda) # Build decoders src_dec = RNNAttentionDecoder(word_embedding_size=word_embedding_size, field_embedding_size=field_embedding_size, hidden_size=args.hidden, layers=args.layers, dropout=args.dropout, input_feeding=False).to(device) if args.shared_dec: trg_dec = src_dec add_optimizer(src_dec, (src2src_optimizers, trg2src_optimizers, trg2trg_optimizers, src2trg_optimizers)) else: trg_dec = RNNAttentionDecoder(word_embedding_size=word_embedding_size, field_embedding_size=field_embedding_size, hidden_size=args.hidden, layers=args.layers, dropout=args.dropout, input_feeding=False).to(device) add_optimizer(src_dec, (src2src_optimizers, trg2src_optimizers)) add_optimizer(trg_dec, (trg2trg_optimizers, src2trg_optimizers)) logger.debug('decoder model is running on cuda: %d', next(src_dec.parameters()).is_cuda) logger.debug('attention model is running on cuda: %d', next(src_dec.attention.parameters()).is_cuda) discriminator = None if (args.corpus_mode == 'mono') and not args.disable_discriminator: discriminator = Discriminator(args.hidden, args.dis_hidden, args.n_dis_layers, args.dropout) discriminator = discriminator.to(device) # Build translators src2src_translator = Translator("src2src", encoder_word_embeddings=src_encoder_word_embeddings, decoder_word_embeddings=src_decoder_word_embeddings, 
encoder_field_embeddings=src_encoder_field_embeddings, decoder_field_embeddings=src_decoder_field_embeddings, generator=src_generator, src_word_dict=word_dict, trg_word_dict=word_dict, src_field_dict=field_dict, trg_field_dict=field_dict, src_type=src_type, trg_type=src_type, w_sos_id=w_sos_id[src_type], bpemb_en=bpemb_en, encoder=src_enc, decoder=src_dec, discriminator=discriminator, denoising=args.denoising_mode, device=device, max_word_shuffle_distance=args.word_shuffle, word_dropout_prob=args.word_dropout, word_blanking_prob=args.word_blank) src2trg_translator = Translator("src2trg", encoder_word_embeddings=src_encoder_word_embeddings, decoder_word_embeddings=trg_decoder_word_embeddings, encoder_field_embeddings=src_encoder_field_embeddings, decoder_field_embeddings=trg_decoder_field_embeddings, generator=trg_generator, src_word_dict=word_dict, trg_word_dict=word_dict, src_field_dict=field_dict, trg_field_dict=field_dict, src_type=src_type, trg_type=trg_type, w_sos_id=w_sos_id[trg_type], bpemb_en=bpemb_en, encoder=src_enc, decoder=trg_dec, discriminator=discriminator, denoising=0, device=device, max_word_shuffle_distance=args.word_shuffle, word_dropout_prob=args.word_dropout, word_blanking_prob=args.word_blank) trg2trg_translator = Translator("trg2trg", encoder_word_embeddings=trg_encoder_word_embeddings, decoder_word_embeddings=trg_decoder_word_embeddings, encoder_field_embeddings=trg_encoder_field_embeddings, decoder_field_embeddings=trg_decoder_field_embeddings, generator=trg_generator, src_word_dict=word_dict, trg_word_dict=word_dict, src_field_dict=field_dict, trg_field_dict=field_dict, src_type=trg_type, trg_type=trg_type, w_sos_id=w_sos_id[trg_type], bpemb_en=bpemb_en, encoder=trg_enc, decoder=trg_dec, discriminator=discriminator, denoising=args.denoising_mode, device=device, max_word_shuffle_distance=args.word_shuffle, word_dropout_prob=args.word_dropout, word_blanking_prob=args.word_blank) trg2src_translator = Translator("trg2src", encoder_word_embeddings=trg_encoder_word_embeddings, decoder_word_embeddings=src_decoder_word_embeddings, encoder_field_embeddings=trg_encoder_field_embeddings, decoder_field_embeddings=src_decoder_field_embeddings, generator=src_generator, src_word_dict=word_dict, trg_word_dict=word_dict, src_field_dict=field_dict, trg_field_dict=field_dict, src_type=trg_type, trg_type=src_type, w_sos_id=w_sos_id[src_type], bpemb_en=bpemb_en, encoder=trg_enc, decoder=src_dec, discriminator=discriminator, denoising=0, device=device, max_word_shuffle_distance=args.word_shuffle, word_dropout_prob=args.word_dropout, word_blanking_prob=args.word_blank) # Build trainers trainers = [] iters_per_epoch = int(np.ceil(corpus_size / args.batch)) print("CORPUS_SIZE = %d | BATCH_SIZE = %d | ITERS_PER_EPOCH = %d" % (corpus_size, args.batch, iters_per_epoch)) if args.corpus_mode == 'mono': f_content = open(src_corpus_path + '.content', encoding=args.encoding, errors='surrogateescape') f_labels = open(src_corpus_path + '.labels', encoding=args.encoding, errors='surrogateescape') src_corpus_path = data.CorpusReader(f_content, f_labels, max_sentence_length=args.max_sentence_length, cache_size=args.cache) f_content = open(trg_corpus_path + '.content', encoding=args.encoding, errors='surrogateescape') f_labels = open(trg_corpus_path + '.labels', encoding=args.encoding, errors='surrogateescape') trg_corpus_path = data.CorpusReader(f_content, f_labels, max_sentence_length=args.max_sentence_length, cache_size=args.cache) if not args.disable_discriminator: disc_trainer = 
DiscTrainer(device, src_corpus_path, trg_corpus_path, src_enc, trg_enc, src_encoder_word_embeddings, src_encoder_field_embeddings, word_dict, field_dict, discriminator, args.learning_rate, batch_size=args.batch) trainers.append(disc_trainer) src2src_trainer = Trainer(translator=src2src_translator, optimizers=src2src_optimizers, corpus=src_corpus_path, batch_size=args.batch, iters_per_epoch=iters_per_epoch) trainers.append(src2src_trainer) if not args.disable_backtranslation: trgback2src_trainer = Trainer(translator=trg2src_translator, optimizers=trg2src_optimizers, corpus=data.BacktranslatorCorpusReader(corpus=src_corpus_path, translator=src2trg_translator), batch_size=args.batch, iters_per_epoch=iters_per_epoch) trainers.append(trgback2src_trainer) trg2trg_trainer = Trainer(translator=trg2trg_translator, optimizers=trg2trg_optimizers, corpus=trg_corpus_path, batch_size=args.batch, iters_per_epoch=iters_per_epoch) trainers.append(trg2trg_trainer) if not args.disable_backtranslation: srcback2trg_trainer = Trainer(translator=src2trg_translator, optimizers=src2trg_optimizers, corpus=data.BacktranslatorCorpusReader(corpus=trg_corpus_path, translator=trg2src_translator), batch_size=args.batch, iters_per_epoch=iters_per_epoch) trainers.append(srcback2trg_trainer) elif args.corpus_mode == 'para': fsrc_content = open(src_corpus_path + '.content', encoding=args.encoding, errors='surrogateescape') fsrc_labels = open(src_corpus_path + '.labels', encoding=args.encoding, errors='surrogateescape') ftrg_content = open(trg_corpus_path + '.content', encoding=args.encoding, errors='surrogateescape') ftrg_labels = open(trg_corpus_path + '.labels', encoding=args.encoding, errors='surrogateescape') corpus = data.CorpusReader(fsrc_content, fsrc_labels, trg_word_file=ftrg_content, trg_field_file=ftrg_labels, max_sentence_length=args.max_sentence_length, cache_size=args.cache) src2trg_trainer = Trainer(translator=src2trg_translator, optimizers=src2trg_optimizers, corpus=corpus, batch_size=args.batch, iters_per_epoch=iters_per_epoch) trainers.append(src2trg_trainer) # Build validators if args.src_valid_corpus != '' and args.trg_valid_corpus != '': with ExitStack() as stack: src_content_vfile = stack.enter_context(open(args.src_valid_corpus + '.content', encoding=args.encoding, errors='surrogateescape')) src_labels_vfile = stack.enter_context(open(args.src_valid_corpus + '.labels', encoding=args.encoding, errors='surrogateescape')) trg_content_vfile = stack.enter_context(open(args.trg_valid_corpus + '.content', encoding=args.encoding, errors='surrogateescape')) trg_labels_vfile = stack.enter_context(open(args.trg_valid_corpus + '.labels', encoding=args.encoding, errors='surrogateescape')) src_content = src_content_vfile.readlines() src_labels = src_labels_vfile.readlines() trg_content = trg_content_vfile.readlines() trg_labels = trg_labels_vfile.readlines() assert len(src_content) == len(trg_content) == len(src_labels) == len(trg_labels), \ "Validation sizes do not match {} {} {} {}".format(len(src_content), len(trg_content), len(src_labels), len(trg_labels)) src_content = [list(map(int, line.strip().split())) for line in src_content] src_labels = [list(map(int, line.strip().split())) for line in src_labels] trg_content = [list(map(int, line.strip().split())) for line in trg_content] trg_labels = [list(map(int, line.strip().split())) for line in trg_labels] cache = [] for src_sent, src_label, trg_sent, trg_label in zip(src_content, src_labels, trg_content, trg_labels): if 0 < len(src_sent) <= 
args.max_sentence_length and 0 < len(trg_sent) <= args.max_sentence_length: cache.append((src_sent, src_label, trg_sent, trg_label)) src_content, src_labels, trg_content, trg_labels = zip(*cache) src2trg_validator = Validator(src2trg_translator, src_content, trg_content, src_labels, trg_labels) if args.corpus_mode == 'mono': src2src_validator = Validator(src2src_translator, src_content, src_content, src_labels, src_labels) trg2src_validator = Validator(trg2src_translator, trg_content, src_content, trg_labels, src_labels) trg2trg_validator = Validator(trg2trg_translator, trg_content, trg_content, trg_labels, trg_labels) del src_content del src_labels del trg_content del trg_labels else: src2src_validator = None src2trg_validator = None trg2src_validator = None trg2trg_validator = None # Build loggers loggers = [] semi_loggers = [] if args.corpus_mode == 'mono': if not args.disable_backtranslation: loggers.append(Logger('Source to target (backtranslation)', srcback2trg_trainer, src2trg_validator, None, args.encoding, short_name='src2trg_bt', train_writer=train_writer, valid_writer=valid_writer)) loggers.append(Logger('Target to source (backtranslation)', trgback2src_trainer, trg2src_validator, None, args.encoding, short_name='trg2src_bt', train_writer=train_writer, valid_writer=valid_writer)) loggers.append(Logger('Source to source', src2src_trainer, src2src_validator, None, args.encoding, short_name='src2src', train_writer=train_writer, valid_writer=valid_writer)) loggers.append(Logger('Target to target', trg2trg_trainer, trg2trg_validator, None, args.encoding, short_name='trg2trg', train_writer=train_writer, valid_writer=valid_writer)) elif args.corpus_mode == 'para': loggers.append(Logger('Source to target', src2trg_trainer, src2trg_validator, None, args.encoding, short_name='src2trg_para', train_writer=train_writer, valid_writer=valid_writer)) # Method to save models def save_models(name): # torch.save(src2src_translator, '{0}.{1}.src2src.pth'.format(args.save, name)) # torch.save(trg2trg_translator, '{0}.{1}.trg2trg.pth'.format(args.save, name)) torch.save(src2trg_translator, '{0}.{1}.src2trg.pth'.format(args.save, name)) if args.corpus_mode == 'mono': torch.save(trg2src_translator, '{0}.{1}.trg2src.pth'.format(args.save, name)) ref_string_path = args.trg_valid_corpus + '.str.content' if not os.path.isfile(ref_string_path): print("Creating ref file... 
[%s]" % (ref_string_path)) with ExitStack() as stack: fref_content = stack.enter_context( open(args.trg_valid_corpus + '.content', encoding=args.encoding, errors='surrogateescape')) fref_str_content = stack.enter_context( open(ref_string_path, mode='w', encoding=args.encoding, errors='surrogateescape')) for line in fref_content: ref_ids = [int(idstr) for idstr in line.strip().split()] ref_str = bpemb_en.decode_ids(ref_ids) fref_str_content.write(ref_str + '\n') print("Ref file created!") # Training for curr_iter in range(1, args.iterations + 1): print_dbg = (0 != args.dbg_print_interval) and (curr_iter % args.dbg_print_interval == 0) for trainer in trainers: trainer.step(print_dbg=print_dbg, include_field_loss=not args.disable_field_loss) if args.save is not None and args.save_interval > 0 and curr_iter % args.save_interval == 0: save_models('it{0}'.format(curr_iter)) if curr_iter % args.log_interval == 0: print() print('[{0}] TRAIN-STEP {1} x {2}'.format(args.save, curr_iter, args.batch)) for logger in loggers: logger.log(curr_iter) if curr_iter % iters_per_epoch == 0: save_models('it{0}'.format(curr_iter)) print() print('[{0}] VALID-STEP {1}'.format(args.save, curr_iter)) for logger in loggers: if logger.validator is not None: logger.validate(curr_iter) model = '{0}.{1}.src2trg.pth'.format(args.save, 'it{0}'.format(curr_iter)) bleu_thread = threading.Thread(target=calc_bleu, args=(model, args.save, args.src_valid_corpus, args.trg_valid_corpus + '.str.result', ref_string_path, bpemb_en, curr_iter, args.bleu_device, valid_writer)) bleu_thread.start() if args.cuda == args.bleu_device or args.bleu_device == 'cpu': bleu_thread.join() save_models('final') train_writer.close() valid_writer.close()
def main():
    parser = argparse.ArgumentParser(description="translate.py")
    parser.add_argument("--eval_splits", type=str, nargs="+", default=["val", ],
                        choices=["val", "test"],
                        help="evaluate on val/test set, yc2 only has val")
    parser.add_argument("--res_dir", required=True, help="path to dir containing model .pt file")
    parser.add_argument("--batch_size", type=int, default=100, help="batch size")

    # beam search configs
    parser.add_argument("--use_beam", action="store_true", help="use beam search, otherwise greedy search")
    parser.add_argument("--beam_size", type=int, default=2, help="beam size")
    parser.add_argument("--n_best", type=int, default=1, help="stop searching when get n_best from beam search")
    parser.add_argument("--min_sen_len", type=int, default=5, help="minimum length of the decoded sentences")
    parser.add_argument("--max_sen_len", type=int, default=30, help="maximum length of the decoded sentences")
    parser.add_argument("--block_ngram_repeat", type=int, default=0, help="block repetition of ngrams during decoding.")
    parser.add_argument("--length_penalty_name", default="none", choices=["none", "wu", "avg"], help="length penalty to use.")
    parser.add_argument("--length_penalty_alpha", type=float, default=0.,
                        help="Google NMT length penalty parameter (higher = longer generation)")
    parser.add_argument("--eval_tool_dir", type=str, default="./densevid_eval")
    parser.add_argument("--no_cuda", action="store_true")
    parser.add_argument("--seed", default=2019, type=int)
    parser.add_argument("--debug", action="store_true")

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda

    # random seed
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)

    checkpoint = torch.load(os.path.join(opt.res_dir, "model.chkpt"))

    # add some of the train configs
    train_opt = checkpoint["opt"]  # EDict(load_json(os.path.join(opt.res_dir, "model.cfg.json")))
    for k in train_opt.__dict__:
        if k not in opt.__dict__:
            setattr(opt, k, getattr(train_opt, k))
    print("train_opt", train_opt)

    decoding_strategy = "beam{}_lp_{}_la_{}".format(
        opt.beam_size, opt.length_penalty_name, opt.length_penalty_alpha) if opt.use_beam else "greedy"
    save_json(vars(opt),
              os.path.join(opt.res_dir, "{}_eval_cfg.json".format(decoding_strategy)),
              save_pretty=True)

    if opt.dset_name == "anet":
        reference_files_map = {
            "val": [os.path.join(opt.data_dir, e) for e in
                    ["anet_entities_val_1_para.json", "anet_entities_val_2_para.json"]],
            "test": [os.path.join(opt.data_dir, e) for e in
                     ["anet_entities_test_1_para.json", "anet_entities_test_2_para.json"]]}
    else:  # yc2
        reference_files_map = {"val": [os.path.join(opt.data_dir, "yc2_val_anet_format_para.json")]}

    for eval_mode in opt.eval_splits:
        print("Start evaluating {}".format(eval_mode))

        # add 10 at max_n_sen to make the inference stage use all the segments
        eval_data_loader = get_data_loader(opt, eval_mode=eval_mode)
        eval_references = reference_files_map[eval_mode]

        # setup model
        translator = Translator(opt, checkpoint)

        pred_file = os.path.join(opt.res_dir, "{}_pred_{}.json".format(decoding_strategy, eval_mode))
        pred_file = os.path.abspath(pred_file)
        if not os.path.exists(pred_file):
            json_res = run_translate(eval_data_loader, translator, opt=opt)
            save_json(json_res, pred_file, save_pretty=True)
        else:
            print("Using existing prediction file at {}".format(pred_file))

        # COCO language evaluation
        lang_file = pred_file.replace(".json", "_lang.json")
        eval_command = ["python", "para-evaluate.py", "-s", pred_file, "-o", lang_file,
                        "-v", "-r"] + eval_references
        subprocess.call(eval_command, cwd=opt.eval_tool_dir)

        # basic stats
        stat_filepath = pred_file.replace(".json", "_stat.json")
        eval_stat_cmd = ["python", "get_caption_stat.py", "-s", pred_file, "-r", eval_references[0],
                         "-o", stat_filepath, "-v"]
        subprocess.call(eval_stat_cmd, cwd=opt.eval_tool_dir)

        # repetition evaluation
        rep_filepath = pred_file.replace(".json", "_rep.json")
        eval_rep_cmd = ["python", "evaluateRepetition.py", "-s", pred_file,
                        "-r", eval_references[0], "-o", rep_filepath]
        subprocess.call(eval_rep_cmd, cwd=opt.eval_tool_dir)

        metric_filepaths = [lang_file, stat_filepath, rep_filepath]
        all_metrics = merge_dicts([load_json(e) for e in metric_filepaths])
        all_metrics_filepath = pred_file.replace(".json", "_all_metrics.json")
        save_json(all_metrics, all_metrics_filepath, save_pretty=True)

        print("pred_file {} lang_file {}".format(pred_file, lang_file))
        print("[Info] Finished {}.".format(eval_mode))
from fastapi import FastAPI, HTTPException

from src.models import Answer
from src.qa_model import QAModel
from src.translator import Translator

app = FastAPI()

biobert_path = 'BioBertFolder/biobert_v1.0_pubmed_pmc/'
bert_fnn_weights = 'assets/models/bertffn_crossentropy/bertffn'
embedding_file = 'assets/Float16EmbeddingsExpanded5-27-19.pkl'

qa_model = QAModel(biobert_path, bert_fnn_weights, embedding_file)
translator = Translator(creds_path='gct_creds.json')


@app.get('/api/v1/ask', response_model=Answer)
async def ask(question: str, lang: str):
    if lang == 'uk':
        question = translator.translate(question, target="en")
        # return only 1 answer
        orig_result = qa_model.predict(question)[0]
        trans_result = translator.translate(orig_result)
        return {"original_answer": orig_result, "translated_answer": trans_result}
    elif lang == "en":
        result = qa_model.predict(question)[0]
        return {"original_answer": result, "translated_answer": result}
    else:
        raise HTTPException(400, "Only uk(Ukrainian) and en(English) languages are supported!")
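One lightweight way to exercise this endpoint locally is FastAPI's TestClient. The import path app.main below is an assumption (point it at wherever the app object above actually lives), and the BioBERT assets must be present for QAModel to load.

from fastapi.testclient import TestClient

from app.main import app  # assumed module path for the service defined above

client = TestClient(app)
response = client.get("/api/v1/ask",
                      params={"question": "What is aspirin used for?", "lang": "en"})
print(response.status_code)  # 200 on success, 400 for unsupported languages
print(response.json())       # {"original_answer": ..., "translated_answer": ...}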
def eval_language_metrics(checkpoint, eval_data_loader, opt, model=None, eval_mode="val"):
    """eval_mode can only be set to `val` here, as setting to `test` is cheating
    0, run inference
    1, Get METEOR, BLEU1-4, CIDEr scores
    2, Get vocab size, sentence length
    """
    translator = Translator(opt, checkpoint, model=model)
    json_res = run_translate(eval_data_loader, translator, opt=opt)
    res_filepath = os.path.abspath(
        opt.save_model + "_tmp_greedy_pred_{}.json".format(eval_mode))
    save_json(json_res, res_filepath, save_pretty=True)

    if opt.dset_name == "anet":
        reference_files_map = {
            "val": [
                os.path.join(opt.data_dir, e) for e in [
                    "anet_entities_val_1_para.json",
                    "anet_entities_val_2_para.json"
                ]
            ],
            "test": [
                os.path.join(opt.data_dir, e) for e in [
                    "anet_entities_test_1_para.json",
                    "anet_entities_test_2_para.json"
                ]
            ]
        }
    else:  # yc2
        reference_files_map = {
            "val": [os.path.join(opt.data_dir, "yc2_val_anet_format_para.json")]
        }

    # COCO language evaluation
    eval_references = reference_files_map[eval_mode]
    lang_filepath = res_filepath.replace(".json", "_lang.json")
    eval_cmd = [
        "python", "para-evaluate.py", "-s", res_filepath, "-o", lang_filepath,
        "-v", "-r"
    ] + eval_references
    subprocess.call(eval_cmd, cwd=opt.eval_tool_dir)

    # basic stats
    stat_filepath = res_filepath.replace(".json", "_stat.json")
    eval_stat_cmd = [
        "python", "get_caption_stat.py", "-s", res_filepath, "-r",
        eval_references[0], "-o", stat_filepath, "-v"
    ]
    subprocess.call(eval_stat_cmd, cwd=opt.eval_tool_dir)

    # repetition evaluation
    rep_filepath = res_filepath.replace(".json", "_rep.json")
    eval_rep_cmd = [
        "python", "evaluateRepetition.py", "-s", res_filepath, "-r",
        eval_references[0], "-o", rep_filepath
    ]
    subprocess.call(eval_rep_cmd, cwd=opt.eval_tool_dir)

    # save results
    logger.info("Finished eval {}.".format(eval_mode))
    metric_filepaths = [lang_filepath, stat_filepath, rep_filepath]
    all_metrics = merge_dicts([load_json(e) for e in metric_filepaths])
    all_metrics_filepath = res_filepath.replace(".json", "_all_metrics.json")
    save_json(all_metrics, all_metrics_filepath, save_pretty=True)

    return all_metrics, [res_filepath, all_metrics_filepath]
def get_json_from_translator(self, xml, prolog=None):
    document_tree = DocumentTree(xml, prolog)
    translator = Translator(document_tree, loads('{}'))
    json = translator.get_json()
    return json
import sys
import json

from src.lexer import Lexer
from src.parser import Parser
from src.translator import Translator

if len(sys.argv) == 3:
    f = open(sys.argv[1], 'r')
    content = f.read()
    f = open(sys.argv[2], 'r')
    config = json.loads(f.read())
    lexer = Lexer(content)
    parser = Parser(lexer)
    document_tree = parser.get_document_tree()
    translator = Translator(document_tree, config)
    output_json = translator.get_json()
    output_file = open("output.json", "w")
    output_file.write(output_json)
    output_file.close()
else:
    print("Usage: python main.py <file_to_translate> <config_file>")