Example No. 1
class HapaxLegomera(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.TK = NISTTokenizer()
        self.punct = re.compile('^[^a-zA-Z0-9_]$')

    def compile_counts(self, X, *_):
        word_counts = Counter()
        for sent in X:
            tokens = self.TK.tokenize(sent, lowercase=True)

            for i, token in enumerate(tokens):
                if not self.punct.match(token):
                    word_counts.update([token])

        return word_counts

    def fit(self, X, *_):
        return self

    def transform(self, X, *_):
        word_counts = self.compile_counts(X)
        result = []
        for sent in X:
            features = defaultdict(int)
            tokens = self.TK.tokenize(sent, lowercase=True)
            for i, token in enumerate(tokens):
                if not self.punct.match(token):
                    if word_counts[token] == 1:
                        features['hapax_legomera'] += 1
                    elif word_counts[token] == 2:
                        features['hapax_dislegomera'] += 1
            result.append(features)
        return result
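Since transform returns a list of feature dicts, a transformer like this would typically feed a DictVectorizer. A minimal usage sketch, assuming the class above is in scope (the sample sentences are made up):

from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

# HapaxLegomera emits one dict of counts per sentence;
# DictVectorizer turns the list of dicts into a numeric feature matrix.
pipeline = Pipeline([
    ('hapax', HapaxLegomera()),
    ('vect', DictVectorizer(sparse=False)),
])

sentences = ['The cat sat on the mat.', 'A dog barked at a cat once.']
features = pipeline.fit_transform(sentences)
print(features.shape)  # (n_sentences, n_feature_types)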
Example No. 2
class WordTokenizer2(BaseEstimator, TransformerMixin):
    def __init__(self,
                 char_level=False,
                 strip_punctuation=False,
                 ngram_range=(1, 1)):
        self.TK = NISTTokenizer()
        self.word_index = dict()
        self.index_word = dict()
        self.char_level = char_level
        self.ngram_range = ngram_range
        self.strip_punctuation = strip_punctuation
        self.punct = re.compile('^[^a-zA-Z0-9_]$')

    def fit(self, X, *_):
        i = 1
        for sent in X:
            tokens = self.TK.tokenize(sent, lowercase=True)
            for t in tokens:
                if self.strip_punctuation:
                    if not self.punct.match(t):
                        if t not in self.word_index:
                            self.word_index[t] = i
                            self.index_word[i] = t
                            i += 1

                else:
                    if t not in self.word_index:
                        self.word_index[t] = i
                        self.index_word[i] = t
                        i += 1

        return self

    def transform(self, X, *_):

        #returns sequence of form [1,2,3,4]

        sequences = []
        for sent in X:
            seq = []
            tokens = self.TK.tokenize(sent, lowercase=True)
            for t in tokens:
                if self.strip_punctuation:
                    if not self.punct.match(t):
                        if t in self.word_index:
                            seq.append(self.word_index[t])

                else:
                    if t in self.word_index:
                        seq.append(self.word_index[t])

            sequences.append(seq)

        return sequences
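A minimal usage sketch for this tokenizer, assuming the class above is in scope (the sentences are hypothetical; ids are assigned in first-seen order, so the exact values depend on how the NIST tokenizer splits the input):

tok = WordTokenizer2(strip_punctuation=True)
tok.fit(['The cat sat.', 'The dog barked!'])
# vocabulary built in first-seen order, punctuation skipped:
# the=1, cat=2, sat=3, dog=4, barked=5
print(tok.transform(['The cat barked.']))  # expected: [[1, 2, 5]]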
Example No. 3
 def transform(self, X, y=None):
     sliw = []
     tokenizer = NISTTokenizer()
     for i in range(X.shape[0]):
         tokens = tokenizer.tokenize(X[i, :][0], lowercase=True)
         sliw.append(len(tokens))
     return np.array(sliw).reshape(-1, 1)
Example No. 4
 def transform(self, X, y=None):
     awl = []
     tokenizer = NISTTokenizer()
     for i in range(X.shape[0]):
         tokens = tokenizer.tokenize(X[i, :][0], lowercase=True)
         awl.append(np.mean([len(w) for w in tokens if not _punctuation.match(w)]))
     return np.array(awl).reshape(-1, 1)
Example No. 5
 def transform(self, X, y=None):
     fsf = []
     tokenizer = NISTTokenizer()
     for i in range(X.shape[0]):
         tokens = tokenizer.tokenize(X[i, :][0], lowercase=True)
         fsf.append(len(list(filter(lambda x: x[1] == '.', nltk.pos_tag(tokens)))))
     return np.array(fsf).reshape(-1, 1)
Example No. 6
def tokenise(caption, lower = True):
    # instantiate the NIST tokenizer
    nist = NISTTokenizer()
    if lower:
        caption = caption.lower()    
    caption = nist.tokenize(caption)
    return caption
Example No. 7
 def transform(self, X, y=None):
     ndw = []
     tokenizer = NISTTokenizer()
     for i in range(X.shape[0]):
         tokens = tokenizer.tokenize(X[i, :][0], lowercase=True)
         ndw.append(len(set([w for w in tokens if not _punctuation.match(w)])))
     return np.array(ndw).reshape(-1, 1)
Example No. 8
class NltkNistTokenizer(Tokenizer):
    def __init__(self) -> None:
        super().__init__()
        self._base_tokenizer = NISTTokenizer()

    def tokenize_text(self, text: str) -> List[str]:
        return self._base_tokenizer.tokenize(text)
Example No. 9
def word_tokenize(sentence):
    tokenizer = NISTTokenizer()
    sentence = ' '.join(tokenizer.tokenize(sentence))
    # Rejoin special tokens that were tokenized by mistake: e.g. "<PERSON_1>" -> "< PERSON _ 1 >"
    for match in re.finditer(r'< (?:[A-Z]+ _ )+\d+ >', sentence):
        sentence = sentence.replace(match.group(),
                                    ''.join(match.group().split()))
    return sentence
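For illustration, a sketch of what the rejoining step does, using a made-up placeholder token (output shown approximately):

print(word_tokenize('Call <PERSON_1> tomorrow.'))
# the NIST tokenizer splits the placeholder into '< PERSON _ 1 >';
# the regex above stitches it back together, giving roughly:
# 'Call <PERSON_1> tomorrow .'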
Example No. 10
 def transform(self, X, y=None):
     hl = []
     tokenizer = NISTTokenizer()
     for i in range(X.shape[0]):
         tokens = tokenizer.tokenize(X[i, :][0], lowercase=True)
         c = Counter([w for w in tokens if not _punctuation.match(w)])
         hl.append(len([w for w, n in c.items() if n == 1]))
     return np.array(hl).reshape(-1, 1)
Example No. 11
def build_word_list():
    nist = NISTTokenizer()
    L = sys.stdin.read()
    stop_words = set(stopwords.words('english'))
    words = (nist.tokenize(L, lowercase=True))
    words = [word for word in words if word not in stop_words]
    words = [word for word in words if word.isalpha()]
    words = [word for word in words if 3 < len(word) < 9]
    return words
Example No. 12
def __old__get_train_test_split(corpus,
                                annotations,
                                n_splits=5,
                                train_test_split=0.8,
                                cutoff=3):
    annotations_no_tweet = annotations.drop(labels='Tweet', axis=1)
    to_drop = annotations_no_tweet[annotations_no_tweet.sum(
        axis=1) > cutoff].index
    annotations.drop(labels=to_drop, axis=0, inplace=True)

    train_idx = set()
    test_idx = set()

    for i in range(1, cutoff + 1, 1):
        an = annotations[annotations.sum(axis=1) == i]
        train_sample = an.sample(frac=train_test_split)
        train_idx.update(train_sample.index)
        test_idx.update(set(an.index).difference(train_sample.index))

    annotations.loc[list(train_idx), 'set'] = 'train'
    annotations.loc[list(test_idx), 'set'] = 'test'

    kf = KFold(n_splits=n_splits)
    train_data = annotations[annotations.set == 'train']
    for i, (train_idx, test_idx) in enumerate(kf.split(train_data)):
        fold_id = 'fold_{}'.format(i + 1)
        annotations[fold_id] = None
        col_id = annotations.columns.get_loc(fold_id)
        annotations.iloc[train_idx, col_id] = 'train'
        annotations.iloc[test_idx, col_id] = 'test'

    tokenizer = NISTTokenizer()
    annotations.Tweet = annotations.Tweet.apply(
        lambda x: ' '.join(tokenizer.tokenize(x, lowercase=True)))

    annotations.to_csv('moral-dataset-{}.csv'.format(corpus))

    elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=False)

    embeds = []
    for i in range(0, annotations.shape[0] // 100 + 1):
        print('Computing embeddings for [{} .. {})'.format(
            i * 100, (i + 1) * 100))
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            tweets = annotations[['Tweet']].iloc[(i * 100):(i + 1) * 100, :]
            if tweets.shape[0] > 0:
                elmo_tweet_embeddings = session.run(
                    elmo(tf.squeeze(tf.cast(tweets.values, tf.string)),
                         signature='default',
                         as_dict=True)['default'])
                embeds.append(
                    pd.DataFrame(index=tweets.index,
                                 data=elmo_tweet_embeddings))
    all_embeds = pd.concat(embeds, axis=0)
    all_embeds.to_csv('moral-dataset-{}_elmo_embeddings.csv'.format(corpus))
Example No. 13
def get_train_test_split(corpus,
                         annotations,
                         n_splits=5,
                         train_test_split=0.8,
                         cutoff=3):
    annotations_no_tweet = annotations.drop(labels='Tweet', axis=1)
    grps = annotations_no_tweet.apply(lambda v: ''.join(map(str, v)),
                                      axis=1).to_frame(0).groupby(0)[0]
    test_idx = grps.apply(lambda g: g.sample(frac=1 - train_test_split)
                          ).index.get_level_values(1)
    train_idx = set(annotations_no_tweet.index).difference(test_idx)

    annotations.loc[list(train_idx), 'set'] = 'train'
    annotations.loc[list(test_idx), 'set'] = 'test'

    train_grps = annotations_no_tweet.loc[train_idx, :].apply(lambda v: ''.join(map(str, v)), axis=1) \
        .to_frame(0).groupby(0)[0]
    for i in range(n_splits):
        fold_test_idx = train_grps.apply(
            lambda g: g.sample(frac=1 / n_splits)).index.get_level_values(1)
        fold_train_idx = set(train_idx).difference(fold_test_idx)
        fold_id = 'fold_{}'.format(i + 1)
        annotations[fold_id] = None
        annotations.loc[fold_train_idx, fold_id] = 'train'
        annotations.loc[fold_test_idx, fold_id] = 'test'

    tokenizer = NISTTokenizer()
    annotations.Tweet = annotations.Tweet.apply(
        lambda x: ' '.join(tokenizer.tokenize(x, lowercase=True)))

    annotations.to_csv('moral-dataset-{}.csv'.format(corpus))

    elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=False)

    embeds = []
    for i in range(0, annotations.shape[0] // 100 + 1):
        print('Computing embeddings for [{} .. {})'.format(
            i * 100, (i + 1) * 100))
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            tweets = annotations[['Tweet']].iloc[(i * 100):(i + 1) * 100, :]
            if tweets.shape[0] > 0:
                elmo_tweet_embeddings = session.run(
                    elmo(tf.squeeze(tf.cast(tweets.values, tf.string)),
                         signature='default',
                         as_dict=True)['default'])
                embeds.append(
                    pd.DataFrame(index=tweets.index,
                                 data=elmo_tweet_embeddings))
    all_embeds = pd.concat(embeds, axis=0)
    all_embeds.to_csv('moral-dataset-{}_elmo_embeddings.csv'.format(corpus))
Example No. 14
class WordIndexer(BaseEstimator, TransformerMixin):
    """
    code modified from https://github.com/adventuresinML/adventures-in-ml-code/blob/master/keras_lstm.py
    """

    def __init__(self, reverse=False):
        self.TK = NISTTokenizer()
        self.word2idx = None
        self.sent_size = 0
        self.reverse = reverse

    def build_vocab(self, X, *_):
        counter = Counter()
        max_len = 0
        for sent in X:
            tokens = self.TK.tokenize(sent, lowercase=True)
            if len(tokens) > max_len:
                max_len = len(tokens)
            counter.update(tokens)

        sort_by_counts = sorted(counter.items(), key=lambda x: x[1])
        words, counts = zip(*sort_by_counts)

        word2idx = dict(zip(words, range(1, len(words) + 1)))
        return word2idx, max_len


    def fit(self, X, *_):
        self.word2idx, self.sent_size = self.build_vocab(X)
        return self

    def transform(self, X, *_):
        # one row per sentence; each position holds the id of the corresponding token
        vec = np.zeros((len(X), self.sent_size + 25))
        for i, sent in enumerate(X):
            tokens = self.TK.tokenize(sent, lowercase=True)
            for j, tok in enumerate(tokens):
                vec[i][j] = self.word2idx[tok]
        return vec
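A minimal usage sketch for WordIndexer, with hypothetical sentences; transform assumes every token it sees was also present during fit:

indexer = WordIndexer()
indexer.fit(['the cat sat', 'the dog sat'])
vectors = indexer.transform(['the cat sat'])
print(vectors.shape)    # (1, longest_training_sentence + 25)
print(vectors[0][:3])   # the first three positions hold the token ids; the rest stay 0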
Example No. 15
def loadData(dir_path):
    data = AutoVivification()
    for root, dirs, files in os.walk(dir_path):
        for file_name in files:
            if not file_name.endswith('.xml'):
                continue
            suffix = file_name.find('.xml')
            # language of the data
            lang = lang_map[file_name[:suffix].split('_')[2].lower()]
            # data type: train/dev/test
            data_type = os.path.basename(root)
            # data domain: restaurant, laptop, ...
            domain = os.path.basename(os.path.dirname(root))
            # subtask
            task = os.path.basename(os.path.dirname(os.path.dirname(root)))

            tokenizer = NISTTokenizer()

            tree = ET.parse(os.path.join(root, file_name))
            revs = tree.getroot()
            for rev in revs:
                for sents in rev:
                    for sent in sents:
                        text = None
                        ops = []
                        for c in sent:
                            if c.tag == 'text':
                                #text = tokenizer.tokenize(c.text, escape = False, return_str = True)
                                text = tokenizer.tokenize(c.text,
                                                          return_str=True)
                            elif c.tag == 'Opinions':
                                for op in c:
                                    ops.append(op.attrib)
                        if not ops:
                            continue
                        data[data_type][lang][task][domain][text] = ops
    return data['train'], data['dev'], data['test']
Example No. 16
def process_tweets(pair_id, target_pair, n_splits):
    target_pair['set'] = None
    target_pair.loc[target_pair['Test/Train/Dev'].isin(['Train', 'Dev']), 'set'] = 'train'
    target_pair.loc[pd.isnull(target_pair['set']), 'set'] = 'test'

    kf = KFold(n_splits=n_splits)
    target_pair_train = target_pair[target_pair.set == 'train']
    for i, (train_idx, test_idx) in enumerate(kf.split(target_pair_train)):
        fold_id = 'fold_{}'.format(i+1)
        target_pair[fold_id] = None
        col_id = target_pair.columns.get_loc(fold_id)
        target_pair.iloc[train_idx, col_id] = 'train'
        target_pair.iloc[test_idx, col_id] = 'test'

    tokenizer = NISTTokenizer()
    target_pair.Tweet = target_pair.Tweet.apply(
        lambda x: ' '.join(tokenizer.tokenize(x, lowercase=True)))

    target_pair.rename(columns={'Stance 1': 'Target 1',
                                'Stance 2': 'Target 2'}, inplace=True)
    target_pair.to_csv('tweets-{}.csv'.format(pair_id))

    elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=False)

    embeds = []
    print('There are {} tweets in pair {}'.format(target_pair.shape[0], pair_id))
    for i in range(0, target_pair.shape[0] // 100 + 1):
        print('Computing embeddings for [{} .. {})'.format(i*100, (i+1)*100))
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            tweets = target_pair[['Tweet']].iloc[(i*100):(i+1)*100, :]
            if tweets.shape[0] > 0:
                elmo_tweet_embeddings = session.run(elmo(tf.squeeze(tf.cast(tweets.values, tf.string)),
                                                         signature='default', as_dict=True)['default'])
                embeds.append(pd.DataFrame(index=tweets.index, data=elmo_tweet_embeddings))
    all_embeds = pd.concat(embeds, axis=0)
    print('There are {} embeddings in pair {}'.format(all_embeds.shape[0], pair_id))
    all_embeds.to_csv('tweets-{}_elmo_embeddings.csv'.format(pair_id))
Example No. 17
 def transform(self, X, y=None):
     tokenizer = NISTTokenizer()
     tokenized = np.array([' '.join(tokenizer.tokenize(X[i, :][0], lowercase=True)) for i in range(X.shape[0])])
     vectorizer = CountVectorizer(ngram_range=(1, self.n_grams), max_features=500, stop_words='english')
     return vectorizer.fit_transform(tokenized).toarray()
Example No. 18
class SentenceFeatures(BaseEstimator, TransformerMixin):
    """
    Extract sentence features in format supporting Pipelines.

    Uses the top 10 discriminating features from the Simaki (2018) paper:
    'Evaluating stance-annotated sentences from the Brexit
    Blog Corpus: A quantitative linguistic analysis'

    These are:
    1. Average word length
    2. Conjunction frequency
    3. Sentence length in words
    4. Comma frequency
    5. Full stop frequency
    6. Hapax Legomena (number of words appearing in utterance only once)
    7. Number of different words used
    8. Sentence length in characters
    9. Punctuation frequency
    10. Hapax dislegomena (number of words appearing in utterance only twice)
    """
    def __init__(self):
        self.TK = NISTTokenizer()
        self.punct = re.compile('^[^a-zA-Z0-9_]$')

    def fit(self, *_):
        return self

    def transform(self, X, *_):
        result = []
        for sent in X:
            #print(sent)
            features = defaultdict(int)
            num_words = len(sent.split())
            tokens = self.TK.tokenize(sent, lowercase=True)
            tags = nltk.pos_tag(tokens)
            features['sent length/words'] = num_words
            counts = Counter()
            for i, token in enumerate(tokens):

                if self.punct.match(token):
                    features['punctuation'] += 1
                    if token == ',':
                        features['comma'] += 1
                    if token == '.':
                        features['period'] += 1

                else:
                    if tags[i][1] == 'CC':
                        features['conjunctions'] += 1

                    num_chars = len(re.sub(r'\W', '', token))
                    features['mean word length'] += num_chars
                    features['sent length/chars'] += num_chars
                    counts.update([token])

            features['mean word length'] /= num_words
            features['hapax legomera'] = sum(
                [1 for k, v in counts.items() if v == 1])
            features['hapax dislegomera'] = sum(
                [1 for k, v in counts.items() if v == 2])
            #print(counts)
            features['different words'] = len(counts.keys())
            result.append(features)
            #print(features)
        return result
Example No. 19
    seq2seq = t.train(
        seq2seq,
        train,
        num_epochs=6,
        optimizer=optimizer,
        teacher_forcing_ratio=0.6,
        teacher_forcing_half_life=5000,
        resume=opt.resume,
    )

predictor = Predictor(seq2seq, input_vocab, output_vocab)
loss, acc = Evaluator(loss=loss).evaluate(
    seq2seq,
    torchtext.data.TabularDataset(path=opt.test_path,
                                  format="tsv",
                                  fields=[("src", src), ("tgt", tgt)]),
)
logging.info("Loss: {}, Acc: {}".format(loss, acc))

import nltk
nltk.download('perluniprops')

from nltk.tokenize.nist import NISTTokenizer
nist = NISTTokenizer()

while True:
    seq_str = input("Type in a source sequence:")
    seq = nist.tokenize(seq_str.strip(), lowercase=False)
    print(predictor.predict(seq))
Example No. 20
# instantiate the NIST tokenizer
nist = NISTTokenizer()

# list the files in the annotations data folder (if the structure of the database is unchanged from how it was
# downloaded from the official website)
annotations = [os.path.join(text_path, x) for x in os.listdir(text_path)]
annotations.sort()
# load the training annotations
train_cap = json.load(open(annotations[0]))
train_cap = train_cap['annotations']

train_dict = defaultdict(list)
for x in train_cap:
    key = str(x['image_id'])
    # pad short image ids with 0s so they match with the keys used later on
    while len(key) < 6:
        key = '0' + key
    train_dict[key] = train_dict[key] + [x]

coco_dict = defaultdict(int)

for x in train_dict.keys():
    for y in train_dict[x]:
        caption = y['caption'].lower()
        caption = nist.tokenize(caption)
        for z in caption:
            z = ''.join([ch for ch in z if ch not in string.punctuation])
            if z:
                coco_dict[z] += 1

save_obj(coco_dict, os.path.join(dict_loc, 'coco_frequency'))
Example No. 21
def run(n_splits=5):
    """
    Prepare the Brexit Blog Corpus data set for analysis
    :param n_splits: int, the number of train/test splits to generate, default=5
    :return:
    """
    print('Reading and processing the xlsx file...', end='')
    brexit_blog_corpus = pd.read_excel('brexit_blog_corpus.xlsx')

    # fix up some typos
    brexit_blog_corpus.replace('concession/contrarines', np.nan, inplace=True)
    brexit_blog_corpus.replace('hypotheticallity',
                               'hypotheticality',
                               inplace=True)

    # unfortunately, quite a few utterances are duplicates :(
    clean_dataset = brexit_blog_corpus.drop_duplicates(subset='Utterance')

    stance_columns = [
        'Stance category', 'second stance category', 'third', 'fourth', 'fifth'
    ]

    clean_dataset = clean_dataset[['Utterance ID No', 'Utterance'] +
                                  stance_columns].set_index('Utterance ID No')

    # extract the stance categories and do some cleaning
    stance_categories = set(clean_dataset[stance_columns].values.flatten())
    stance_categories.discard(np.nan)
    stance_categories = sorted(list(stance_categories))
    stance_categories = [
        w.replace(' ', '-').replace('/', '-') for w in stance_categories
    ]

    # one-hot encode the assigned stance labels
    mlb = MultiLabelBinarizer()
    k_hot_encoded_stances = mlb.fit_transform(
        [x[~pd.isnull(x)] for x in clean_dataset[stance_columns].values])
    k_hot_encoded_stances = pd.DataFrame(index=clean_dataset.index,
                                         data=k_hot_encoded_stances,
                                         columns=list(mlb.classes_))
    k_hot_encoded_stances.columns = stance_categories

    # join the one-hot encoded labels and utterances back together again
    clean_dataset_one_hot = clean_dataset[['Utterance', 'Stance category']] \
        .join(k_hot_encoded_stances)
    print('done.')

    print('Tokenising the utterances...', end='')
    # tokenize the Utterance
    tokenizer = NISTTokenizer()
    clean_dataset_one_hot.Utterance = clean_dataset_one_hot.Utterance.apply(
        lambda x: ' '.join(tokenizer.tokenize(x, lowercase=True)))
    print('done.')

    print('Constructing train/test split and saving to disk...', end='')
    # split the data into train and test sets in the ratio 80:20
    stance_columns = set(clean_dataset_one_hot.columns).difference(
        ['Utterance', 'Stance category'])
    stance_columns = sorted(list(stance_columns))

    # first split the data in two to get train and test sets
    reset_seeds()

    X_train, X_test, y_train, y_test = \
        train_test_split(clean_dataset_one_hot['Utterance'],
                         clean_dataset_one_hot[stance_columns],
                         test_size=0.2,
                         stratify=clean_dataset_one_hot['Stance category'])

    y_train['set'] = 'train'
    y_test['set'] = 'test'

    dataset = pd.concat([
        pd.DataFrame(data={
            'Utterance': X_train
        }).join(y_train),
        pd.DataFrame(data={
            'Utterance': X_test
        }).join(y_test)
    ],
                        axis=0)

    dataset.to_csv('bbc_dataset.csv')
    print('done.')

    print('Constructing the cv folds and saving to disk...', end='')
    X_train_folds = pd.DataFrame(
        index=X_train.index,
        columns=['fold_{}'.format(i) for i in range(1, n_splits + 1)])
    skf = StratifiedKFold(n_splits=n_splits)
    y = clean_dataset_one_hot.loc[y_train.index, 'Stance category']
    for i, (train_idx,
            test_idx) in enumerate(skf.split(np.zeros(X_train.shape[0]), y)):
        X_train_folds.iloc[train_idx, i] = 'train'
        X_train_folds.iloc[test_idx, i] = 'test'

    X_train_folds.to_csv('bbc_dataset_folds.csv')
    print('done.')

    print('Pre-computing the ELMO embeddings and saving to disk...', end='')

    elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=False)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        elmo_train_embeddings = session.run(
            elmo(tf.squeeze(tf.cast(X_train.values, tf.string)),
                 signature='default',
                 as_dict=True)['default'])
        elmo_train_embeddings = pd.DataFrame(index=X_train.index,
                                             data=elmo_train_embeddings)
        elmo_train_embeddings.to_csv('bbc_elmo_train_embeddings.csv')

        elmo_test_embeddings = session.run(
            elmo(tf.squeeze(tf.cast(X_test.values, tf.string)),
                 signature='default',
                 as_dict=True)['default'])
        elmo_test_embeddings = pd.DataFrame(index=X_test.index,
                                            data=elmo_test_embeddings)
        elmo_test_embeddings.to_csv('bbc_elmo_test_embeddings.csv')

    print('done.')
Example No. 22
def nist_tokenize(sentence):
    nist = NISTTokenizer()
    return ' '.join(nist.tokenize(sentence))
target_test_file = "target_test.txt"

line_pairs = []

nist = NISTTokenizer()

with open(source_file, "r") as source, open(target_file, "r") as target:
    for src, tgt in zip(source, target):
        target_parsed = tgt.split('\t')[2]
        if not(MIN_LEN < len(target_parsed.split()) < MAX_LEN):
            continue
        source_parsed = src.split('\t')[2]
        if not(MIN_LEN < len(source_parsed.split()) < MAX_LEN):
            continue

        # nist.tokenize returns a list of tokens, so filter on its length directly
        target_tokenised = nist.tokenize(target_parsed, lowercase=True)
        if not(MIN_LEN < len(target_tokenised) < MAX_LEN):
            continue
        source_tokenised = nist.tokenize(source_parsed, lowercase=True)
        if not(MIN_LEN < len(source_tokenised) < MAX_LEN):
            continue

        source_joined = " ".join(source_tokenised) + "\n"
        target_joined = " ".join(target_tokenised) + "\n"
        line_pairs.append((source_joined, target_joined))

test_lines = len(line_pairs) // 4
train_lines = len(line_pairs) - test_lines

print("Lines in train:", train_lines)
print("Lines in test:", test_lines)