Example #1
def get_frequency_dict(lang_code, lang_name):
	print_status("Creating frequency dictionaries...")

	frequency_dict = dict()

	# Load data
	for root, dirs, files in os.walk('datasets/monolingual-' + lang_code):
		if ('.DS_Store' in files):
			files.remove('.DS_Store')
		for f in files:
			print(f)
			filepath = os.path.join(root, f)
			with open(filepath, 'rt', encoding='utf8') as file:
				text = file.read()

			# Clean XML tags
			cleantext = BeautifulSoup(text, "lxml").text

			module = importlib.import_module("spacy.lang." + lang_code)
			nlp = getattr(module, lang_name)() if module is not None else spacy.language.Language()
			tokenizer = nlp.Defaults.create_tokenizer(nlp)
			tokens = list(tokenizer(cleantext))

			for word in tokens:
				word = word.text.lower()

				if is_other(word):
					continue
				frequency_dict[word] = frequency_dict.get(word, 0) + 1
	return frequency_dict
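
For reference, the counting step can be written more compactly with collections.Counter; a minimal sketch, assuming tokens and the is_other() filter behave as in the snippet above:

from collections import Counter

def count_words(tokens, is_other):
    # Lower-case each spaCy token and skip whatever the is_other() filter flags
    words = (t.text.lower() for t in tokens)
    return Counter(w for w in words if not is_other(w))

Counter supports dict-style access, so the result can be used wherever frequency_dict is used.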
Example #2
def get_probability_dict(frequency_dict):
	print_status("Creating probability dictionaries...")
	smoothing_factor = 1
	nr_of_tokens = sum(frequency_dict.values())
	nr_of_distinct_words = len(frequency_dict.keys())
	probability_dict = dict()
	for k, v in frequency_dict.items():
		probability_dict[k] = (v + smoothing_factor) / (nr_of_tokens + smoothing_factor * nr_of_distinct_words)
	probability_dict['OOV'] = smoothing_factor / (nr_of_tokens + smoothing_factor * nr_of_distinct_words)
	return probability_dict
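
To see the add-one (Laplace) smoothing at work on a toy input: with frequency_dict = {'la': 3, 'casa': 1}, nr_of_tokens is 4 and nr_of_distinct_words is 2, so P('la') = (3 + 1) / (4 + 1 * 2) = 4/6 ≈ 0.67 and the out-of-vocabulary mass is P('OOV') = 1 / 6 ≈ 0.17. A quick check (the toy dictionary is made up):

probs = get_probability_dict({'la': 3, 'casa': 1})
print(probs['la'])    # 0.666...
print(probs['casa'])  # 0.333...
print(probs['OOV'])   # 0.166...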
Example #3
 def get_ngrams(self):
     unigrams = []
     bigrams = []
     trigrams = []
     fourgrams = []
     fivegrams = []
     sixgrams = []
     print_status("Creating n-grams...")
     j = 0
     for token in self.tokens_dict.keys():
         if type(token) is float:
             print(f"ERROR : unknown token {token}")
             continue
         chars = list(
             pad_sequence(str(token),
                          pad_left=True,
                          left_pad_symbol="<w>",
                          pad_right=True,
                          right_pad_symbol="</w>",
                          n=self.n))
         ngrams = list(everygrams(chars, max_len=self.n))
         for ngram in ngrams:
             if (len(ngram) == 1 and self.n == 2):
                 for i in range(self.tokens_dict[token]):
                     unigrams.append(ngram)
             if (len(ngram) == 2 and self.n <= 3):
                 for i in range(self.tokens_dict[token]):
                     bigrams.append(ngram)
             if (len(ngram) == 3 and self.n <= 4):
                 for i in range(self.tokens_dict[token]):
                     trigrams.append(ngram)
             if (len(ngram) == 4 and self.n <= 5):
                 for i in range(self.tokens_dict[token]):
                     fourgrams.append(ngram)
             if (len(ngram) == 5 and self.n <= 6):
                 for i in range(self.tokens_dict[token]):
                     fivegrams.append(ngram)
             if (len(ngram) == 6 and self.n <= 6):
                 for i in range(self.tokens_dict[token]):
                     sixgrams.append(ngram)
         if j % max(1, len(self.tokens_dict) // 10) == 0:
             print(f"token {j} of {len(self.tokens_dict)}")
         j += 1
     return unigrams + bigrams + trigrams + fourgrams + fivegrams + sixgrams
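
To see what a single padded token contributes, the NLTK helpers used above can be run in isolation; a minimal sketch with n = 2 and a made-up word:

from nltk.util import everygrams, pad_sequence

chars = list(pad_sequence("sol", n=2,
                          pad_left=True, left_pad_symbol="<w>",
                          pad_right=True, right_pad_symbol="</w>"))
print(chars)                               # ['<w>', 's', 'o', 'l', '</w>']
print(list(everygrams(chars, max_len=2)))  # all character unigrams and bigrams of the padded word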
Example #4
def get_tokenized_sentences(lang_code, lang_name):

    tokenizedFile = []
    # Initialize tokenizer
    module = importlib.import_module("spacy.lang." + lang_code)
    nlp = getattr(
        module,
        lang_name)() if module is not None else spacy.language.Language()
    tokenizer = nlp.Defaults.create_tokenizer(nlp)

    # Load data
    print_status("Creating tokenized sentences from dataset...")
    for root, dirs, files in os.walk('datasets/monolingual-' + lang_code):
        if ('.DS_Store' in files):
            files.remove('.DS_Store')
        for f in files:
            print(f)
            filepath = os.path.join(root, f)
            with open(filepath, 'rt', encoding='utf8') as file:
                text = file.read()

            # Clean XML tags
            cleantext = BeautifulSoup(text, "lxml").text

            # Split in sentences
            sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s",
                                 cleantext)

            # Split in tokens
            for s in sentences:
                word_tokens = []
                tokens = list(tokenizer(s))
                for t in tokens:
                    t = t.text.lower()
                    if (not is_other(t)):
                        word_tokens.append(t)

                tokenizedFile.append(word_tokens)

    return tokenizedFile
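
The regular expression above splits on whitespace that follows a '.' or '?', while the negative lookbehinds keep abbreviations such as 'Mr.' or 'i.e.' from ending a sentence; a small illustration with made-up text:

import re

pattern = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
text = "Mr. Smith bought cheapsite.com for 1.5 million dollars. Did he mind? He did not."
print(re.split(pattern, text))
# ['Mr. Smith bought cheapsite.com for 1.5 million dollars.', 'Did he mind?', 'He did not.']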
Example #5
 def get_ngrams(self):
     unigrams = []
     bigrams = []
     trigrams = []
     print_status("Creating n-grams...")
     j = 0
     for sent in self.tokens_arr:
         words = list(
             pad_sequence(sent,
                          pad_left=True,
                          left_pad_symbol="<s>",
                          pad_right=True,
                          right_pad_symbol="</s>",
                          n=self.n))
         ngrams = list(everygrams(words, max_len=self.n))
         for ngram in ngrams:
             if (len(ngram) == 1 and self.n == 2):
                 unigrams.append(ngram)
             if (len(ngram) == 2 and self.n <= 3):
                 bigrams.append(ngram)
         if j % max(1, len(self.tokens_arr) // 10) == 0:
             print(f"token {j} of {len(self.tokens_arr)}")
         j += 1
     return unigrams + bigrams + trigrams
Example #6
from sklearn.metrics import f1_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from tools.utils import save_predictions
import sys

PREDICTIONS_PATH = './results/predictions/'

# Get evaluation dataset from keyboard
if len(sys.argv) == 1:
	print("Please enter evaluation dataset: 'dev', 'test' or 'test-original'")
	exit(1)
evaluation_dataset = sys.argv[1]

# Get predictions data
print_status("Getting predictions data...")
if (evaluation_dataset == 'dev'):
	predictionsFileName = PREDICTIONS_PATH + 'mBERT_predictions_dev.out' # validation
if (evaluation_dataset == 'test'):
	predictionsFileName = PREDICTIONS_PATH + 'mBERT_predictions_test.out' # test

# Get predictions
file = open(predictionsFileName, 'rt', encoding='utf8')
y = []
for line in file:
	# Skip empty lines and '# sent_enum' header lines; strip the trailing newline and split on tab
	if (line.strip() != '' and '# sent_enum' not in line):
		line = line.rstrip('\n')
		splits = line.split("\t")
		pred = splits[1]
		y.append(pred)
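
The imports at the top suggest the collected predictions y are then scored against gold labels; a minimal sketch of that step, assuming a list t of gold labels read in the same way from the annotated dataset (t is not built in this excerpt):

print(f1_score(t, y, average='weighted'))  # weighted F1 across the label set
cm = confusion_matrix(t, y)
ConfusionMatrixDisplay(cm).plot()          # plot the confusion matrix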
Example #7
import pandas as pd
import sys
import os

DICTIONARIES_PATH = "./dictionaries/word-level/"

# Get language codes and evaluation dataset from keyboard
if len(sys.argv) == 1:
    print("Please give two letter language codes as arg, for example en es")
    print("Please enter evaluation dataset: 'dev', 'test' or 'test-original'")
    exit(1)
lang1_code = sys.argv[1]
lang2_code = sys.argv[2]
evaluation_dataset = sys.argv[3]

# Get dictionaries
print_status("Getting dictionaries...")
lang1_path = DICTIONARIES_PATH + 'probability_dict_' + lang1_code + '.csv'
lang2_path = DICTIONARIES_PATH + 'probability_dict_' + lang2_code + '.csv'
if (os.path.exists(lang1_path) and os.path.exists(lang2_path)):
    probability_lang1_df = pd.read_csv(lang1_path, encoding='utf-16')
    probability_lang1_dict = probability_lang1_df.set_index(
        'word')['probability'].to_dict()

    probability_lang2_df = pd.read_csv(lang2_path, encoding='utf-16')
    probability_lang2_dict = probability_lang2_df.set_index(
        'word')['probability'].to_dict()
else:
    print("Please run: python train_probability.py " + lang1_code + " " +
          lang2_code)

# Get data
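
Once loaded, each dictionary can be queried per token, falling back to the 'OOV' entry that get_probability_dict (Example #2) adds for unseen words; a minimal sketch with a made-up token and illustrative label names:

word = 'casa'
p1 = probability_lang1_dict.get(word, probability_lang1_dict['OOV'])
p2 = probability_lang2_dict.get(word, probability_lang2_dict['OOV'])
label = 'lang1' if p1 >= p2 else 'lang2'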
Example #8
if len(sys.argv) == 1:
    print("Please give two letter language codes as arg, for example en es")
    print("Please enter evaluation dataset: 'dev', 'test' or 'test-original'")
    print("Please enter n value")
    exit(1)
lang1_code = sys.argv[1]
lang2_code = sys.argv[2]
evaluation_dataset = sys.argv[3]
n = int(sys.argv[4])

if n not in (2, 3, 4, 5, 6):
    print("n should be 2, 3, 4, 5 or 6")
    exit(1)

# Get dictionaries
print_status("Getting dictionaries...")
lang1_path = CHAR_LEVEL_DICTIONARIES_PATH + str(
    n) + '_grams_dict_' + lang1_code + '.csv'
lang2_path = CHAR_LEVEL_DICTIONARIES_PATH + str(
    n) + '_grams_dict_' + lang2_code + '.csv'
if (os.path.exists(lang1_path) and os.path.exists(lang2_path)):
    frequency_lang1_df = pd.read_csv(lang1_path,
                                     encoding='utf-16',
                                     converters={"word": ast.literal_eval})
    frequency_lang1_dict = frequency_lang1_df.set_index(
        'word')['frequency'].to_dict()

    frequency_lang2_df = pd.read_csv(lang2_path,
                                     encoding='utf-16',
                                     converters={"word": ast.literal_eval})
    frequency_lang2_dict = frequency_lang2_df.set_index(
        'word')['frequency'].to_dict()
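
The ast.literal_eval converter is what turns the n-gram column, presumably stored as the string representation of a tuple, back into real tuples when the CSV is read; a quick illustration with a made-up cell value:

import ast

cell = "('<w>', 'c', 'a')"       # a character trigram as it would appear in the CSV column
print(ast.literal_eval(cell))    # ('<w>', 'c', 'a')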
Example #9
lang1 = sys.argv[1]
lang1_code = langs()[lang1]['code']
lang1_name = langs()[lang1]['name']

# Lang 2
lang2 = sys.argv[2]
lang2_code = langs()[lang2]['code']
lang2_name = langs()[lang2]['name']

# Full training: also write probability dictionaries
fullTraining = sys.argv[3] == 'probability'

# Create frequency dictionaries
frequency_lang1_dict = get_frequency_dict(lang1_code, lang1_name)
frequency_lang2_dict = get_frequency_dict(lang2_code, lang2_name)

# Probability dict
probability_lang1_dict = get_probability_dict(frequency_lang1_dict)
if (fullTraining):
	write_dict(WORD_LEVEL_DICTIONARIES_PATH, frequency_lang1_dict, 'frequency_dict_' + lang1_code, probability_lang1_dict, 'probability_dict_' + lang1_code)
else:
	write_dict(WORD_LEVEL_DICTIONARIES_PATH, frequency_lang1_dict, 'frequency_dict_' + lang1_code)

probability_lang2_dict = get_probability_dict(frequency_lang2_dict)
if (fullTraining):
	write_dict(WORD_LEVEL_DICTIONARIES_PATH, frequency_lang2_dict, 'frequency_dict_' + lang2_code, probability_lang2_dict, 'probability_dict_' + lang2_code)
else:
	write_dict(WORD_LEVEL_DICTIONARIES_PATH, frequency_lang2_dict, 'frequency_dict_' + lang2_code)
print_status('Done!')

Example #10
# sources: https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0
# https://github.com/kapadias/mediumposts/blob/master/nlp/published_notebooks/Introduction%20to%20Topic%20Modeling.ipynb

WORD_LEVEL_DICTIONARIES_PATH = "./dictionaries/word-level/"

# Get language codes and evaluation dataset from keyboard
if len(sys.argv) == 1:
	print("Please give two letter language codes as arg, for example en es")
	print("Please enter evaluation dataset: 'dev', 'test' or 'test-original'")
	exit(1)
lang1_code = sys.argv[1]
lang2_code = sys.argv[2]
evaluation_dataset = sys.argv[3]

print_status("Getting dictionaries...")
lang1_path = WORD_LEVEL_DICTIONARIES_PATH + 'probability_dict_' + lang1_code + '.csv'
lang2_path = WORD_LEVEL_DICTIONARIES_PATH + 'probability_dict_' + lang2_code + '.csv'
if (os.path.exists(lang1_path) and os.path.exists(lang2_path)):
	probability_lang1_df = pd.read_csv(lang1_path, encoding='utf-16')
	probability_lang1_dict = probability_lang1_df.set_index('word')['probability'].to_dict()

	probability_lang2_df = pd.read_csv(lang2_path, encoding='utf-16')
	probability_lang2_dict = probability_lang2_df.set_index('word')['probability'].to_dict()
	print_status("Dictionaries ready!")
else:
	print("Please run: python train_probability.py " + lang1_code + " " + lang2_code)

# Get training dictionaries
print_status("Getting tokenized sentences...")
lang1_path_tokenized = './dictionaries/word-level/tokenized_sentences_' + lang1_code + '.p'
Example #11
from sklearn.linear_model import LogisticRegression
import pandas as pd
import sys
import os

# Get language codes and evaluation dataset from keyboard
if len(sys.argv) == 1:
    print("Please give two letter language codes as arg, for example en es")
    print("Please enter evaluation dataset: 'dev', 'test' or 'test-original'")
    exit(1)
lang1_code = sys.argv[1]
lang2_code = sys.argv[2]
evaluation_dataset = sys.argv[3]

# Get training dictionaries
print_status("Getting tokenized sentences...")
lang1_path_tokenized = './dictionaries/word-level/tokenized_sentences_' + lang1_code + '.p'
lang2_path_tokenized = './dictionaries/word-level/tokenized_sentences_' + lang2_code + '.p'

if (os.path.exists(lang1_path_tokenized)
        and os.path.exists(lang2_path_tokenized)):
    tokenized_sentences_lang1 = pd.read_pickle(lang1_path_tokenized)
    tokenized_sentences_lang2 = pd.read_pickle(lang2_path_tokenized)
else:
    print("Please run: python train_ngrams_word.py " + lang1_code + " " +
          lang2_code + " 2")

# Flatten lists, so we have a long array of strings (words)
tokenized_sentences_lang1 = [
    item for sent in tokenized_sentences_lang1 for item in sent
][:100000]
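
The flattened word lists are presumably turned into labelled training data for the LogisticRegression imported above; a minimal sketch, assuming tokenized_sentences_lang2 is flattened the same way and using a simple character n-gram vectorizer (the feature choice and the 'lang1'/'lang2' labels are assumptions, not taken from this excerpt):

from sklearn.feature_extraction.text import CountVectorizer

# Words from both languages, with one language label per word
words = tokenized_sentences_lang1 + tokenized_sentences_lang2
labels = ['lang1'] * len(tokenized_sentences_lang1) + ['lang2'] * len(tokenized_sentences_lang2)

# Character n-gram features and a logistic-regression classifier
vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1, 3))
X = vectorizer.fit_transform(words)
clf = LogisticRegression(max_iter=1000)
clf.fit(X, labels)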
Example #12
# Lang 1
lang1 = sys.argv[1]
lang1_code = langs()[lang1]['code']
lang1_name = langs()[lang1]['name']

# Lang 2
lang2 = sys.argv[2]
lang2_code = langs()[lang2]['code']
lang2_name = langs()[lang2]['name']


# Get frequency dictionaries
lang1_path = WORD_LEVEL_DICTIONARIES_PATH + 'frequency_dict_' + lang1_code + '.csv'
lang2_path = WORD_LEVEL_DICTIONARIES_PATH + 'frequency_dict_' + lang2_code + '.csv'
if (os.path.exists(lang1_path) and os.path.exists(lang2_path)):
	print_status('Getting dictionaries...')
	frequency_lang1_df = pd.read_csv(lang1_path, encoding='utf-16')
	frequency_lang1_dict = frequency_lang1_df.set_index('word')['frequency'].to_dict()

	frequency_lang2_df = pd.read_csv(lang2_path, encoding='utf-16')
	frequency_lang2_dict = frequency_lang2_df.set_index('word')['frequency'].to_dict()
else:
	print("Please run: python train_probability.py " + lang1_code + " " + lang2_code)


# Create ngrams frequency dictionaries
ns = [
	2,
	3,
	4,
	5,
Example #13
else:
    tokenized_sentences_lang1 = get_tokenized_sentences(lang1_code, lang1_name)
    with open(lang1_path, 'wb') as fp:
        pickle.dump(tokenized_sentences_lang1, fp)

lang2_path = WORD_LEVEL_DICTIONARIES_PATH + 'tokenized_sentences_' + lang2_code + '.p'
if (os.path.exists(lang2_path)):
    tokenized_sentences_lang2 = pd.read_pickle(lang2_path)
else:
    tokenized_sentences_lang2 = get_tokenized_sentences(lang2_code, lang2_name)
    with open(lang2_path, 'wb') as fp:
        pickle.dump(tokenized_sentences_lang2, fp)

# Train n gram model
ns = [
    2,
    3,
]
for n in ns:
    print_status('Training word ngrams model... n=' + str(n))
    model_lang1 = NGramModel(n)
    model_lang1.train(tokenized_sentences_lang1)
    write_dict(WORD_LEVEL_DICTIONARIES_PATH, model_lang1.freq_dist,
               str(n) + '_grams_word_dict_' + lang1_code)

    model_lang2 = NGramModel(n)
    model_lang2.train(tokenized_sentences_lang2)
    write_dict(WORD_LEVEL_DICTIONARIES_PATH, model_lang2.freq_dist,
               str(n) + '_grams_word_dict_' + lang2_code)

print_status('Done!')
Example #14
from sklearn.svm import LinearSVC
import pandas as pd
import sys
import os

# Get language codes and evaluation dataset from keyboard
if len(sys.argv) == 1:
    print("Please give two letter language codes as arg, for example en es")
    print("Please enter evaluation dataset: 'dev', 'test' or 'test-original'")
    exit(1)
lang1_code = sys.argv[1]
lang2_code = sys.argv[2]
evaluation_dataset = sys.argv[3]

# Get training dictionaries
print_status("Getting tokenized sentences...")
lang1_path_tokenized = './dictionaries/word-level/tokenized_sentences_' + lang1_code + '.p'
lang2_path_tokenized = './dictionaries/word-level/tokenized_sentences_' + lang2_code + '.p'

if (os.path.exists(lang1_path_tokenized)
        and os.path.exists(lang2_path_tokenized)):
    tokenized_sentences_lang1 = pd.read_pickle(lang1_path_tokenized)
    tokenized_sentences_lang2 = pd.read_pickle(lang2_path_tokenized)
else:
    print("Please run: python train_ngrams_word.py " + lang1_code + " " +
          lang2_code + " 2")

# Flatten lists, so we have a long array of strings (words)
tokenized_sentences_lang1 = [
    item for sent in tokenized_sentences_lang1 for item in sent
][:100000]