Example #1
# [29] START
model_path = "uni_joint_2layer_gelu_synth.pth.tar"
# [29] END

# [30] START
model_synth_swe = Trans_model(embedding_size, src_vocab_size, trg_vocab_size,
                              src_pad_idx, num_heads, num_encoder_layers,
                              num_decoder_layers, forward_expansion, dropout,
                              max_len, device, 'gelu')
# [30] END

# [31] START
sp.Load(swedish_model)
criterion = nn.CrossEntropyLoss(ignore_index=sp.pad_id())
optimizer = optim.Adam(model_synth_swe.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
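# With step_size=1, each scheduler.step() call multiplies the learning rate by gamma=0.95.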
# [31] END

# [32] START
if load_model:
    checkpoint = torch.load(model_path, map_location='cpu')
    model_synth_swe.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
model_synth_swe.to(device)
# [32] END

# [33] START
translate_sentence(model_synth_swe, sent1, device, sami_model, swedish_model)
# [33] END
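
# Illustrative sketch (not one of the original cells): the checkpoint loaded in
# cell [32] is assumed to be a dict with 'state_dict' and 'optimizer' keys,
# which could have been written along these lines:
#
#   torch.save({'state_dict': model_synth_swe.state_dict(),
#               'optimizer': optimizer.state_dict()},
#              model_path)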
Example #2
model = Trans_model(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
    'gelu'
)

sp.Load(swedish_model)
criterion = nn.CrossEntropyLoss(ignore_index=sp.pad_id())
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

load_model = False

if load_model:
    checkpoint = torch.load(model_path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])           
    optimizer.load_state_dict(checkpoint['optimizer'])
model.to(device)


sentence = "Sámediggi lea sámiid álbmotválljen orgána Norggas."

sentence2 = "Deaŧalaš lea gozihit álgoálbmotoli nationála ja riikkaidgaskasaš forain."
def main(DATA, MAX_LINES, MAX_PADDING, MIN_LEN_SENTENCE, SIZE_VOCAB, SHOW_SENTENCES, LR, EPOCHS):

    if DATA == 'europarl':
        URLS=["http://www.statmt.org/europarl/v10/training-monolingual/europarl-v10.es.tsv.gz"]
        FILES = ["europarl-v10.es.tsv.gz"]
        CORPORA = ["europarl-v10.es.tsv"]

    elif DATA == 'newscarl':
        URLS=[
        "http://data.statmt.org/news-crawl/es/news.2007.es.shuffled.deduped.gz",
        "http://data.statmt.org/news-crawl/es/news.2008.es.shuffled.deduped.gz",
        "http://data.statmt.org/news-crawl/es/news.2009.es.shuffled.deduped.gz",
        "http://data.statmt.org/news-crawl/es/news.2010.es.shuffled.deduped.gz",
        "http://data.statmt.org/news-crawl/es/news.2011.es.shuffled.deduped.gz",
        "http://data.statmt.org/news-crawl/es/news.2012.es.shuffled.deduped.gz",
        "http://data.statmt.org/news-crawl/es/news.2013.es.shuffled.deduped.gz",
        "http://data.statmt.org/news-crawl/es/news.2014.es.shuffled.deduped.gz",
        "http://data.statmt.org/news-crawl/es/news.2015.es.shuffled.deduped.gz",
        "http://data.statmt.org/news-crawl/es/news.2016.es.shuffled.deduped.gz",
        "http://data.statmt.org/news-crawl/es/news.2017.es.shuffled.deduped.gz",
        "http://data.statmt.org/news-crawl/es/news.2018.es.shuffled.deduped.gz",
        "http://data.statmt.org/news-crawl/es/news.2019.es.shuffled.deduped.gz"
      ]

        FILES=[
          "news.2007.es.shuffled.deduped.gz",
          "news.2008.es.shuffled.deduped.gz",
          "news.2009.es.shuffled.deduped.gz",
          "news.2010.es.shuffled.deduped.gz",
          "news.2011.es.shuffled.deduped.gz",
          "news.2012.es.shuffled.deduped.gz",
          "news.2013.es.shuffled.deduped.gz",
          "news.2014.es.shuffled.deduped.gz",
          "news.2015.es.shuffled.deduped.gz",
          "news.2016.es.shuffled.deduped.gz",
          "news.2017.es.shuffled.deduped.gz",
          "news.2018.es.shuffled.deduped.gz",
          "news.2019.es.shuffled.deduped.gz"
        ]

        CORPORA=[
          "news.2007.es.shuffled.deduped",
          "news.2008.es.shuffled.deduped",
          "news.2009.es.shuffled.deduped",
          "news.2010.es.shuffled.deduped",
          "news.2011.es.shuffled.deduped",
          "news.2012.es.shuffled.deduped",
          "news.2013.es.shuffled.deduped",
          "news.2014.es.shuffled.deduped",
          "news.2015.es.shuffled.deduped",
          "news.2016.es.shuffled.deduped",
          "news.2017.es.shuffled.deduped",
          "news.2018.es.shuffled.deduped",
          "news.2019.es.shuffled.deduped"
        ]

    else:
        raise ValueError('Unknown DATA value: {}'.format(DATA))

    print('File download') #---------------------------------------------
    for u, f in zip(URLS, FILES):
        print(u)
        sys.stdout.flush()
        if path.exists(f):
            print('File {} already downloaded'.format(f))
        else:
            wget.download(u, './'+f)


    print('Unzip files') #----------------------------------------------
    for f, c in zip(FILES, CORPORA):
        print(f)
        sys.stdout.flush()
        if path.exists(c):
            print('File {} already unzipped'.format(c))
        else:
            with gzip.open(f, 'rb') as f_in:
                with open(c, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)


    print('Join all into one') #---------------------------------------------------
    sys.stdout.flush()
    with open('corpus.es', 'wb') as outfile:
        for fname in CORPORA:
            print('Joining file:', fname)
            sys.stdout.flush()
            with open(fname, 'rb') as infile:
                for line in infile:
                    outfile.write(line)


    print('Delete auxiliary files') #-----------------------------------------------
    sys.stdout.flush()
    for f in CORPORA:
        os.remove(f)

    print('Reduced File') #--------------------------------------------------------
    sys.stdout.flush()

    with open('corpus.es', 'rb') as cfile:
      with open('corpus.reduced.es', 'wb') as rfile:
        count = 0
        while count < MAX_LINES:
          line = cfile.readline()
          rfile.write(line)
          count += 1
        print('Number of lines in the reduced file:', count)


    print('Read Data') #----------------------------------------------------------
    sys.stdout.flush()
    FILE = 'corpus.reduced.es'
    data = []

    with open(FILE, 'rb') as corpus_file:
      Lines = corpus_file.readlines()
      for line in Lines:
        data.append(line)


    print('Preprocessing the Data') #----------------------------------------------
    sys.stdout.flush()

    print(data[50])
    print(sentence_to_words(data[50]))
    sys.stdout.flush()

    preprocessed_data = []
    perc = 0

    for ind, s in enumerate(data):
      words = sentence_to_words(s)
      if MIN_LEN_SENTENCE <= len(words) <= MAX_PADDING:
        preprocessed_data.append(words)
      if ind > perc:
        print('{}/{} sentences preprocessed'.format(ind, len(data)))
        sys.stdout.flush()
        perc += SHOW_SENTENCES

    print('Length of data read: ', len(data))
    print('Length after preprocessing: ', len(preprocessed_data))
    print(preprocessed_data[50])
    sys.stdout.flush()


    print('Build Dictionary') #---------------------------------------------------
    sys.stdout.flush()

    FED = 20  # number of elements we will show
    word_dict, complete_dict = build_dict(preprocessed_data, SIZE_VOCAB)
    list_dict = list(word_dict.keys())
    print('First {} elements of the dictionary: {}'.format(FED, list_dict[:FED]))
    sys.stdout.flush()

    print('Selected {}/{} words'.format(SIZE_VOCAB, len(complete_dict)))
    sys.stdout.flush()

    # Add XXX as 0 in the dictionary, so that when a custom sentence is entered, XXX marks the word the model has to guess
    word_dict['XXX'] = 0
    word_dict['PAD'] = 1
    word_dict['INFREQ'] = 2
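
    # Illustrative example with hypothetical ids: after convert_and_pad_data, a custom
    # sentence like ['el', 'XXX', 'palabrainfrecuente'] could become [17, 0, 2, 1, 1, ...]
    # padded to MAX_PADDING, where 0 marks the word to guess, 2 stands in for
    # infrequent words and 1 is padding.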


    print('Convert and Pad') #-----------------------------------------------------
    sys.stdout.flush()

    int_data, int_data_len = convert_and_pad_data(word_dict, preprocessed_data, MAX_PADDING)
    print(int_data[50], int_data_len[50])
    sys.stdout.flush()


    print('Extract Word') #-------------------------------------------------------
    sys.stdout.flush()

    # Check there is no sentence with all 2's
    int_data_pre = [d for d, lend in zip(int_data, int_data_len) if len(set(d[:lend])) > 1]
    len_data_pre = [lend for d, lend in zip(int_data, int_data_len) if len(set(d[:lend])) > 1]
    print('{} of the {} sentences were only 2\'s'.format(len(int_data)-len(int_data_pre), len(int_data)))
    sys.stdout.flush()

    word_masked = []
    # Copy each sentence so that masking does not also modify int_data_pre in place
    masked_data = [list(s) for s in int_data_pre]

    for idx, (sentence, len_sentence) in enumerate(zip(int_data_pre, len_data_pre)):
      acceptable_value = False

      while not acceptable_value:
        idx_word = random.randint(0, len_sentence-1)
        if int_data_pre[idx][idx_word] != 2:
          acceptable_value = True

      word_masked.append(int_data_pre[idx][idx_word])  # save the extracted word
      masked_data[idx][idx_word] = 0  # replace it with the XXX token (0)

    print(masked_data[50])
    print(word_masked[50])
    sys.stdout.flush()


    print('Split Train, Valid and Test') #-----------------------------------------
    sys.stdout.flush()

    train_x, valid_x, train_y, valid_y, train_len, valid_len = train_test_split(masked_data, word_masked, len_data_pre, test_size=0.25, random_state=42)
    valid_x, test_x, valid_y, test_y, valid_len, test_len = train_test_split(valid_x, valid_y, valid_len, test_size=0.4, random_state=42)
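
    # test_size=0.25 holds out 25% of the data; the second split then takes 40% of that
    # 25% for test, so the final proportions are roughly 75% train, 15% valid, 10% test.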

    print('train: ', len(train_x), len(train_y), len(train_len))
    print('valid: ', len(valid_x), len(valid_y), len(valid_len))
    print('test: ', len(test_x), len(test_y), len(test_len))
    sys.stdout.flush()


    print('Cleaning Variables') #--------------------------------------------------
    sys.stdout.flush()
    preprocessed_data = None
    list_dict = None
    int_data = None
    int_data_len = None
    int_data_pre = None
    len_data_pre = None
    masked_data = None
    word_masked = None
    data = None
    Lines = None
    complete_dict = None


    # Training ------------------------------------------------------------------
    BATCH_SIZE = 128
    d_model = 256
    heads = 8
    N = 6

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    sys.stdout.flush()

    print('Preparing Input Masks')
    sys.stdout.flush()
    msk_input_loader = create_mask(train_x, BATCH_SIZE)
    valid_msk_input_loader = create_mask(valid_x, BATCH_SIZE)

    print('Preparing Train Data Loaders')
    sys.stdout.flush()
    train_torch_x = torch.tensor(train_x).clone()
    train_torch_y = torch.tensor(train_y).clone()
    train_sample_ds = torch.utils.data.TensorDataset(train_torch_x, train_torch_y)
    train_loader = torch.utils.data.DataLoader(train_sample_ds, batch_size=BATCH_SIZE)

    print('Preparing Validation Data Loaders')
    sys.stdout.flush()
    valid_torch_x = torch.tensor(valid_x).clone()
    valid_torch_y = torch.tensor(valid_y).clone()
    valid_sample_ds = torch.utils.data.TensorDataset(valid_torch_x, valid_torch_y)
    valid_loader = torch.utils.data.DataLoader(valid_sample_ds, batch_size=BATCH_SIZE)

    print('Initialize Model')
    sys.stdout.flush()
    model = Transformer(SIZE_VOCAB, d_model, N, heads).to(device)
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
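    # Only weight matrices (p.dim() > 1) get Xavier-uniform initialization here;
    # bias vectors keep PyTorch's defaults.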

    print('Define Loss and Optimizer')
    sys.stdout.flush()
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LR, betas=(0.9, 0.98), eps=1e-9)
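    # betas=(0.9, 0.98) and eps=1e-9 are the Adam settings used in the original
    # Transformer paper ("Attention Is All You Need").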

    mask_sample_ds = None
    train_torch_x = None
    train_torch_y = None
    train_sample_ds = None

    print('Going to train')
    sys.stdout.flush()
    losses, valid_losses = train(model, train_loader, msk_input_loader, valid_loader, valid_msk_input_loader, EPOCHS, optimizer, loss_function, device)


    # Testing ------------------------------------------------------------------
    print('Preparing Input Masks')
    msk_test_input_loader = create_mask(test_x, BATCH_SIZE)

    print('Preparing Data Loaders')
    test_torch_x = torch.tensor(test_x).clone()
    test_torch_y = torch.tensor(test_y).clone()
    test_sample_ds = torch.utils.data.TensorDataset(test_torch_x, test_torch_y)
    test_loader = torch.utils.data.DataLoader(test_sample_ds, batch_size=BATCH_SIZE)

    test_torch_x = None
    test_torch_y = None
    test_sample_ds = None

    print('Going to test')
    test_loss = evaluate(model, test_loader, msk_test_input_loader, loss_function, BATCH_SIZE, device)
    print(test_loss)
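    # Assuming evaluate() returns the average cross-entropy, math.exp(test_loss)
    # would give an approximate test perplexity.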


    # Custom Sentence ------------------------------------------------------------
    test_sentences = ["Ha habido una XXX en Colombia durant la presentación del presidente",
          "Todas las tropas han sido XXX a America",
          "Estaba pensando que quizas XXX deberías hacerlo",
          "Todo lo que llevo esta dentro de mí XXX"]

    for custom_sentence in test_sentences:
      resulting_word = guess_word(custom_sentence)

      print('Initial Sentence: \t {}'.format(custom_sentence))
      print('Word Guessed: \t\t {}'.format(resulting_word))