import importlib
import os
import re

import spacy
from bs4 import BeautifulSoup


def get_frequency_dict(lang_code, lang_name):
    print_status("Creating frequency dictionaries...")
    frequency_dict = dict()

    # Load data
    for root, dirs, files in os.walk('datasets/monolingual-' + lang_code):
        if ('.DS_Store' in files):
            files.remove('.DS_Store')
        for f in files:
            print(f)
            filepath = os.path.join(root, f)
            file = open(filepath, 'rt', encoding='utf8')
            text = file.read()
            file.close()

            # Clean XML tags
            cleantext = BeautifulSoup(text, "lxml").text

            # Build the spaCy tokenizer for this language (spaCy v2-style API)
            module = importlib.import_module("spacy.lang." + lang_code)
            nlp = getattr(module, lang_name)() if module is not None else spacy.language.Language()
            tokenizer = nlp.Defaults.create_tokenizer(nlp)

            # Count token frequencies, skipping 'other' tokens (punctuation, numbers, etc.)
            tokens = list(tokenizer(cleantext))
            for word in tokens:
                word = word.text.lower()
                if is_other(word):
                    continue
                if word in frequency_dict:
                    frequency_dict[word] += 1
                else:
                    frequency_dict[word] = 1

    return frequency_dict
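# These snippets rely on an is_other() helper that is not shown here. The
# following is a minimal sketch of what it is assumed to do (treat tokens with
# no alphabetic characters, plus URLs, @mentions and hashtags, as 'other');
# the exact checks are assumptions, not the original implementation.
def is_other(word):
    # Tokens with no letters at all: numbers, punctuation, emoji, symbols
    if not any(c.isalpha() for c in word):
        return True
    # Social-media artifacts and URLs
    if word.startswith('@') or word.startswith('#') or word.startswith('http'):
        return True
    return False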
def get_tokenized_sentences(lang_code, lang_name):
    tokenizedFile = []

    # Initialize tokenizer
    module = importlib.import_module("spacy.lang." + lang_code)
    nlp = getattr(module, lang_name)() if module is not None else spacy.language.Language()
    tokenizer = nlp.Defaults.create_tokenizer(nlp)

    # Load data
    print_status("Creating tokenized sentences from dataset...")
    for root, dirs, files in os.walk('datasets/monolingual-' + lang_code):
        if ('.DS_Store' in files):
            files.remove('.DS_Store')
        for f in files:
            print(f)
            filepath = os.path.join(root, f)
            file = open(filepath, 'rt', encoding='utf8')
            text = file.read()
            file.close()

            # Clean XML tags
            cleantext = BeautifulSoup(text, "lxml").text

            # Split into sentences
            sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", cleantext)

            # Split into tokens, keeping only non-'other' tokens
            for s in sentences:
                word_tokens = []
                tokens = list(tokenizer(s))
                for t in tokens:
                    t = t.text.lower()
                    if (not is_other(t)):
                        word_tokens.append(t)
                tokenizedFile.append(word_tokens)

    return tokenizedFile
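# A minimal usage sketch (not part of the original code). The lang_name passed
# in must match the language class inside spacy.lang.<lang_code>, e.g. 'en' ->
# English, 'es' -> Spanish; the English/Spanish pair here is a hypothetical
# example, not necessarily the pair used in the original experiments.
frequency_dict_lang1 = get_frequency_dict('en', 'English')
frequency_dict_lang2 = get_frequency_dict('es', 'Spanish')
tokenized_sentences_lang1 = get_tokenized_sentences('en', 'English')
print(len(tokenized_sentences_lang1), 'tokenized sentences')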
        # (Inside the `for word in words:` loop; this branch handles non-empty tokens.)
        word = word.lower()

        # Get lang1 prob
        if word in probability_lang1_dict:
            prob_lang1 = probability_lang1_dict[word]
        else:
            prob_lang1 = probability_lang1_dict['OOV']

        # Get lang2 prob
        if word in probability_lang2_dict:
            prob_lang2 = probability_lang2_dict[word]
        else:
            prob_lang2 = probability_lang2_dict['OOV']

        # Assign class based on regex or class with highest prob
        if (is_other(word)):
            lang = 'other'
        else:
            if (prob_lang1 >= prob_lang2):
                lang = 'lang1'
            else:
                lang = 'lang2'

        y.append(lang)
        predictions_dict[word] = lang
    else:
        y.append('')

if (evaluation_dataset == 'test-original'):
    save_predictions(y, './results/predictions/' + lang1_code + '-' + lang2_code +
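# Sketch of how the probability dictionaries used above could be built from the
# frequency dictionaries. This is an assumption (the original construction is
# not shown here): each word maps to its relative frequency, and an 'OOV' entry
# provides a small fallback probability for unseen words. build_probability_dict
# and the oov_prob value are hypothetical.
def build_probability_dict(frequency_dict, oov_prob=1e-10):
    total = sum(frequency_dict.values())
    probability_dict = {word: count / total for word, count in frequency_dict.items()}
    probability_dict['OOV'] = oov_prob
    return probability_dict

probability_lang1_dict = build_probability_dict(frequency_dict_lang1)
probability_lang2_dict = build_probability_dict(frequency_dict_lang2)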
    # (Inside the loop over lines of the test file.)
    if (line.strip() != ''):
        token = line.rstrip('\n')
        words.append(token.lower())
    else:
        words.append('')
file.close()

# Choose language with highest probability for each word based on ngrams
y = []
predictions_dict = dict()
counter = 0
print_status("Classifying...")
for word in words:
    if (word != ''):
        word = word.lower()
        if is_other(word):
            lang = 'other'
        else:
            prob_lang1 = model_lang1.get_word_log_prob(word)
            prob_lang2 = model_lang2.get_word_log_prob(word)
            if (prob_lang1 >= prob_lang2):
                lang = 'lang1'
            else:
                lang = 'lang2'
        y.append(lang)
        predictions_dict[word] = lang

    if counter % 10000 == 0:
        print(f"{counter} of {len(words)}")
    counter += 1
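# Sketch of the word-level interface assumed for model_lang1/model_lang2 above.
# The original n-gram model class is not shown here, so CharNgramModel is a
# hypothetical stand-in: a simple character n-gram scorer with add-one
# smoothing, just enough to illustrate the get_word_log_prob(word) call.
import math
from collections import Counter

class CharNgramModel:
    def __init__(self, frequency_dict, n=2):
        self.n = n
        self.counts = Counter()
        for word, freq in frequency_dict.items():
            padded = ' ' + word + ' '
            for i in range(len(padded) - n + 1):
                self.counts[padded[i:i + n]] += freq
        self.total = sum(self.counts.values())
        self.vocab_size = len(self.counts)

    def get_word_log_prob(self, word):
        padded = ' ' + word + ' '
        log_prob = 0.0
        for i in range(len(padded) - self.n + 1):
            count = self.counts[padded[i:i + self.n]]
            # Add-one smoothing keeps unseen character n-grams at a finite log prob
            log_prob += math.log((count + 1) / (self.total + self.vocab_size))
        return log_prob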
        # (Inside the loop over lines of the test file; tokens of the current
        # sentence accumulate in `s` until an empty line closes the sentence.)
        s.append(token.lower())
    else:
        sentences.append(s)
        s = []
file.close()

y = []
predictions_dict = dict()
for tokens in sentences:
    if (len(tokens) > 0):
        # Separate 'lang' words from 'other' words
        lang_tokens = []
        other_indexes = []
        for i in range(len(tokens)):
            if (is_other(tokens[i])):
                other_indexes.append(i)
            else:
                lang_tokens.append(tokens[i])

        # For sentences with 'lang1', 'lang2' and 'other' words
        if (len(lang_tokens) > 0):
            y_sentence = identifier.identify(lang_tokens)
            for index in other_indexes:
                y_sentence.insert(index, 'other')
        # For sentences that are made up only of 'other' words
        else:
            y_sentence = []
            for index in other_indexes:
                y_sentence.append('other')

        for i in range(len(tokens)):
            predictions_dict[tokens[i]] = y_sentence[i]
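# Small illustration of the re-insertion step above (hypothetical tokens and
# labels). Because other_indexes is collected in ascending order, inserting
# 'other' back at those same indexes restores the original positions, so the
# final labels stay aligned with the tokens.
tokens = ['gracias', ',', 'see', 'you', 'tomorrow', '!']
other_indexes = [1, 5]
y_sentence = ['lang2', 'lang1', 'lang1', 'lang1']   # labels for the non-'other' tokens
for index in other_indexes:
    y_sentence.insert(index, 'other')
assert y_sentence == ['lang2', 'other', 'lang1', 'lang1', 'lang1', 'other']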
# Original test set
for line in file:
    # Remove empty lines, lines starting with # sent_enum, \n and split on tab
    if (line.strip() != ''):
        token = line.rstrip('\n')
        words.append(token.lower())
    else:
        words.append('')
file.close()

# Remove 'other' words
print_status("Removing 'other' data...")
words_not_other = []
for word in words:
    if (word != '' and not is_other(word)):
        words_not_other.append(word)

# Convert the collection of words to a matrix of character n-gram features
print_status("Counting ngrams...")
# vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5), binary=True)
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 5), binary=True)
vectorized_train_data = vectorizer.fit_transform(X_train)
vectorized_dev_data = vectorizer.transform(words_not_other)

# Create and fit the LDA model (LDA is assumed to be sklearn's LatentDirichletAllocation)
print_status("Training LDA...")
number_topics = 2
lda_model = LDA(n_components=number_topics, max_iter=100, random_state=123)
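# Sketch of the remaining LDA steps, assuming sklearn's LatentDirichletAllocation
# (the original continuation is not shown here): fit the two-topic model on the
# training matrix, infer topic distributions for the dev words, and take the
# dominant topic as the language label. Which topic corresponds to lang1 vs.
# lang2 must be resolved separately, e.g. by checking known words from each
# language; the mapping below is hypothetical.
lda_model.fit(vectorized_train_data)
topic_distributions = lda_model.transform(vectorized_dev_data)

y = []
for dist in topic_distributions:
    # Hypothetical mapping: topic 0 -> 'lang1', topic 1 -> 'lang2'
    y.append('lang1' if dist[0] >= dist[1] else 'lang2')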
        # (Inside the loop over lines of the test file; tokens of the current
        # sentence accumulate in `s` until an empty line closes the sentence.)
        token = line.rstrip('\n')
        s.append(token.lower())
    else:
        sentences.append(s)
        s = []
file.close()

# Choose language with highest probability for each word based on ngrams
y = []
predictions_dict = dict()
counter = 0
print_status("Classifying...")
for s in sentences:
    if (len(s) == 0):
        continue
    for word_index in range(len(s)):
        if is_other(s[word_index]):
            lang = 'other'
        else:
            prob_lang1 = model_lang1.get_word_log_prob(s, word_index)
            prob_lang2 = model_lang2.get_word_log_prob(s, word_index)
            if (prob_lang1 >= prob_lang2):
                lang = 'lang1'
            else:
                lang = 'lang2'
        y.append(lang)
        predictions_dict[s[word_index]] = lang

    if counter % 10000 == 0:
        print(f"{counter} of {len(sentences)}")
    counter += 1
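# Sketch of the sentence-level interface assumed for model_lang1/model_lang2 in
# this last snippet (an assumption; the original class is not shown here).
# Unlike the word-internal scorer sketched earlier, get_word_log_prob(sentence,
# word_index) here conditions on the preceding token, i.e. a word-bigram model
# with add-one smoothing. WordBigramModel is a hypothetical stand-in.
import math
from collections import Counter

class WordBigramModel:
    def __init__(self, tokenized_sentences):
        self.context_counts = Counter()
        self.bigram_counts = Counter()
        for sentence in tokenized_sentences:
            padded = ['<s>'] + sentence
            for i in range(1, len(padded)):
                self.context_counts[padded[i - 1]] += 1
                self.bigram_counts[(padded[i - 1], padded[i])] += 1
        self.vocab_size = len(self.context_counts) + 1

    def get_word_log_prob(self, sentence, word_index):
        previous = sentence[word_index - 1] if word_index > 0 else '<s>'
        word = sentence[word_index]
        count_bigram = self.bigram_counts[(previous, word)]
        count_context = self.context_counts[previous]
        # Add-one smoothing keeps unseen bigrams at a finite log probability
        return math.log((count_bigram + 1) / (count_context + self.vocab_size))

# Hypothetical usage, tying in the tokenized sentences built earlier:
model_lang1 = WordBigramModel(get_tokenized_sentences('en', 'English'))
model_lang2 = WordBigramModel(get_tokenized_sentences('es', 'Spanish'))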