Example #1
from tensorflow.keras.utils import to_categorical
import numpy as np
import os
import sys

project_path = os.path.sep.join(
    os.path.abspath(__file__).split(os.path.sep)[:-3])
if project_path not in sys.path:
    sys.path.append(project_path)

from examples.utils.data_helper import read_data, sents2sequences
from examples.nmt.model import define_nmt
from examples.utils.model_helper import plot_attention_weights
from examples.utils.logger import get_logger

logger = get_logger("examples.nmt.train", os.path.join('..', '..', 'logs'))


def get_data(train_size, random_seed=100):
    """ Getting randomly shuffled training / testing data """
    en_text = read_data(
        os.path.join(project_path, 'data', 'small_vocab_en.txt'))
    fr_text = read_data(
        os.path.join(project_path, 'data', 'small_vocab_fr.txt'))
    logger.info('Length of text: {}'.format(len(en_text)))

    # Wrap each target sentence in start/end tokens; strip the final '.'
    # (and any space before it) so 'eos' is always a separate token.
    fr_text = [
        'sos ' + sent[:-1].strip() + ' eos .' if sent.endswith('.')
        else 'sos ' + sent + ' eos .'
        for sent in fr_text
    ]
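
The listing truncates get_data here. A minimal sketch of how the split could finish, assuming the shuffle-and-slice behavior that the docstring and the train_size / random_seed parameters suggest (not the repository's actual code):

    # Sketch: seed the RNG, shuffle indices, keep the first train_size pairs.
    np.random.seed(random_seed)
    inds = np.random.permutation(len(en_text))[:train_size]
    tr_en_text = [en_text[i] for i in inds]
    tr_fr_text = [fr_text[i] for i in inds]
    return tr_en_text, tr_fr_text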
Example #2
import numpy as np
import os
import sys

# os.environ["PWD"] can be unset (e.g. on Windows); os.getcwd() is portable.
project_path = os.getcwd()
if project_path not in sys.path:
    sys.path.append(project_path)

from examples.utils.data_helper import read_data, sents2sequences, get_data
from examples.nmt_bidirectional.model import define_nmt
from examples.utils.model_helper import plot_attention_weights
from examples.utils.logger import get_logger
from examples.utils.config import Config

config = Config()

logger = get_logger("examples.nmt_bidirectional.train", config.LOGS_DIR)

batch_size = 64
hidden_size = 96
en_timesteps, fr_timesteps = 20, 20


def preprocess_data(en_tokenizer, fr_tokenizer, en_text, fr_text, en_timesteps,
                    fr_timesteps):
    """ Preprocessing data and getting a sequence of word indices """

    en_seq = sents2sequences(en_tokenizer,
                             en_text,
                             reverse=False,
                             padding_type='pre',
                             pad_length=en_timesteps)
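
sents2sequences is imported from examples.utils.data_helper, whose source is not shown. Judging by the call above, it likely wraps Keras tokenization and padding; a minimal equivalent sketch under that assumption (the real helper may differ):

from tensorflow.keras.preprocessing.sequence import pad_sequences

def sents2sequences_sketch(tokenizer, sentences, reverse=False,
                           padding_type='post', pad_length=None):
    # Map words to integer indices with a fitted Keras Tokenizer.
    seqs = tokenizer.texts_to_sequences(sentences)
    if reverse:
        # Optionally reverse each sequence (a common trick for encoder inputs).
        seqs = [seq[::-1] for seq in seqs]
    # Pad or truncate every sequence to pad_length on the requested side.
    return pad_sequences(seqs, padding=padding_type, maxlen=pad_length)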
Example #3
from tensorflow.keras.utils import to_categorical
import numpy as np
import os
import sys

project_path = os.path.sep.join(
    os.path.abspath(__file__).split(os.path.sep)[:-3])
if project_path not in sys.path:
    sys.path.append(project_path)

from examples.utils.data_helper import read_data, sents2sequences
from examples.nmt_bidirectional.model import define_nmt
from examples.utils.model_helper import plot_attention_weights
from examples.utils.logger import get_logger

logger = get_logger("examples.nmt_bidirectional.train",
                    os.path.join('..', '..', 'logs'))


def get_data(train_size, random_seed=100):
    """ Getting randomly shuffled training / testing data """
    en_text = read_data(
        os.path.join(project_path, 'data', 'small_vocab_en.txt'))
    fr_text = read_data(
        os.path.join(project_path, 'data', 'small_vocab_fr.txt'))
    logger.info('Length of text: {}'.format(len(en_text)))

    # Wrap each target sentence in start/end tokens; strip the final '.'
    # (and any space before it) so 'eos' is always a separate token.
    fr_text = [
        'sos ' + sent[:-1].strip() + ' eos .' if sent.endswith('.')
        else 'sos ' + sent + ' eos .'
        for sent in fr_text
    ]
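
The to_categorical import at the top of this example suggests the padded target indices are later one-hot encoded for the decoder. A small self-contained demo of that step (toy values, not from the listing):

import numpy as np
from tensorflow.keras.utils import to_categorical

fr_seq = np.array([[1, 5, 2, 0]])                # one padded index sequence
fr_onehot = to_categorical(fr_seq, num_classes=8)
print(fr_onehot.shape)                           # -> (1, 4, 8)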
Example #4
from tensorflow.keras.utils import to_categorical
import numpy as np
import os
import sys

project_path = os.path.sep.join(
    os.path.abspath(__file__).split(os.path.sep)[:-3])
if project_path not in sys.path:
    sys.path.append(project_path)

from examples.utils.data_helper import read_data, sents2sequences
from examples.nmt.model import define_nmt
from examples.utils.model_helper import plot_attention_weights
from examples.utils.logger import get_logger

base_dir = os.path.sep.join(os.path.abspath(__file__).split(os.path.sep)[:-3])
logger = get_logger("examples.nmt.train", os.path.join(base_dir, 'logs'))

batch_size = 64
hidden_size = 96
en_timesteps, fr_timesteps = 20, 20


def get_data(train_size, random_seed=100):
    """ Getting randomly shuffled training / testing data """
    en_text = read_data(
        os.path.join(project_path, 'data', 'small_vocab_en.txt'))
    fr_text = read_data(
        os.path.join(project_path, 'data', 'small_vocab_fr.txt'))
    logger.info('Length of text: {}'.format(len(en_text)))

    # Wrap each target sentence in start/end tokens, as in Examples #1 and #3.
    fr_text = [
        'sos ' + sent[:-1].strip() + ' eos .' if sent.endswith('.')
        else 'sos ' + sent + ' eos .'
        for sent in fr_text
    ]

Example #5

import numpy as np
import os
import sys

# os.environ["PWD"] can be unset (e.g. on Windows); os.getcwd() is portable.
project_path = os.getcwd()
if project_path not in sys.path:
    sys.path.append(project_path)

from examples.utils.data_helper import read_data, sents2sequences, get_data
from examples.nmt.model import define_nmt
from examples.utils.model_helper import plot_attention_weights
from examples.utils.logger import get_logger
from examples.utils.config import Config

config = Config()

logger = get_logger("examples.nmt.train", config.LOGS_DIR)

batch_size = 64
hidden_size = 96
en_timesteps, fr_timesteps = 20, 20


def preprocess_data(en_tokenizer, fr_tokenizer, en_text, fr_text, en_timesteps,
                    fr_timesteps):
    """ Preprocessing data and getting a sequence of word indices """

    en_seq = sents2sequences(en_tokenizer,
                             en_text,
                             reverse=False,
                             padding_type='pre',
                             pad_length=en_timesteps)
Example #6

import numpy as np
import os
import sys

# os.environ["PWD"] can be unset (e.g. on Windows); os.getcwd() is portable.
project_path = os.getcwd()
if project_path not in sys.path:
    sys.path.append(project_path)

from examples.utils.data_helper import read_data, sents2sequences, get_data
from examples.nmt.model import define_nmt
from examples.utils.model_helper import plot_attention_weights
from examples.utils.logger import get_logger
from examples.utils.config import Config

config = Config()

logger = get_logger("examples.nmt.train_with_none", config.LOGS_DIR)

batch_size = 64
hidden_size = 96


def preprocess_data(en_tokenizer, fr_tokenizer, en_text, fr_text, en_timesteps, fr_timesteps):
    """ Preprocessing data and getting a sequence of word indices """

    en_seq = sents2sequences(en_tokenizer, en_text, reverse=False, padding_type='pre', pad_length=en_timesteps)
    fr_seq = sents2sequences(fr_tokenizer, fr_text, pad_length=fr_timesteps)
    logger.info('Vocabulary size (English): {}'.format(np.max(en_seq) + 1))
    logger.info('Vocabulary size (French): {}'.format(np.max(fr_seq) + 1))
    logger.debug('En text shape: {}'.format(en_seq.shape))
    logger.debug('Fr text shape: {}'.format(fr_seq.shape))
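
For context, a hypothetical end-to-end call of preprocess_data, assuming the tokenizers are standard Keras Tokenizer instances and assuming the function returns the two padded arrays (its return statement is not shown above; tr_en_text / tr_fr_text stand in for a training split):

from tensorflow.keras.preprocessing.text import Tokenizer

en_tokenizer = Tokenizer(oov_token='UNK')
fr_tokenizer = Tokenizer(oov_token='UNK')
en_tokenizer.fit_on_texts(tr_en_text)
fr_tokenizer.fit_on_texts(tr_fr_text)

en_seq, fr_seq = preprocess_data(
    en_tokenizer, fr_tokenizer, tr_en_text, tr_fr_text, 20, 20)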
Example #7
import argparse
import os
import sys

project_path = os.path.abspath('.')
if project_path not in sys.path:
    sys.path.append(project_path)

from examples.utils.data_helper import read_data, sents2sequences
from examples.nmt_bidirectional.model import define_nmt
from examples.utils.model_helper import plot_attention_weights
from examples.utils.logger import get_logger

# base_dir = os.path.sep.join(os.path.abspath(__file__).split(os.path.sep)[:-3])
base_dir = os.path.abspath('.')

parser = argparse.ArgumentParser()
parser.add_argument('-s', '--source', help='Source language train file')
parser.add_argument('-t', '--target', help='Target language train file')
arguments = parser.parse_args()

logger = get_logger("examples.bidirectional_nmt.train",
                    os.path.join(base_dir, 'logs'))

batch_size = 16
hidden_size = 96
en_timesteps, fr_timesteps = 15, 15


def get_data(train_ratio, random_seed=100):
    """ Getting randomly shuffled training / testing data """
    en_text = read_data(arguments.source)
    fr_text = read_data(arguments.target)
    logger.info('Length of text: {}'.format(len(en_text)))

    # Wrap each target sentence in start/end tokens; note this variant keeps
    # no trailing ' .' after 'eos', unlike Examples #1 and #3.
    fr_text = [
        'sos ' + sent[:-1].strip() + ' eos' if sent.endswith('.')
        else 'sos ' + sent + ' eos'
        for sent in fr_text
    ]
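
Unlike the earlier examples, this get_data takes a ratio rather than an absolute count. The function is truncated here, but under the same shuffle-and-slice assumption as the sketch in Example #1, the only extra step would be deriving the count (a sketch, not the repository's code):

    # Sketch: convert the requested ratio into an absolute sample count,
    # then shuffle and slice exactly as in the Example #1 sketch.
    train_size = int(len(en_text) * train_ratio)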