import os
import pickle as pkl

from libvoikko import Voikko  # Python bindings for the Voikko Finnish NLP library


def lemmatize_file(filename):
    '''Lemmatize a file sentence by sentence and write the result to a
    sibling file, returning the new file's name.'''
    print('lemmatizing ' + filename)
    v = Voikko("fi")
    lemmatized_filename = filename + '_lemmatized'
    with open(filename, 'r') as f, open(lemmatized_filename, 'w') as lemmatized_file:
        for sentence in f:
            sent_toks = v.tokens(sentence)
            words_baseform = []
            for word in sent_toks:
                # tokenType 1 = word token; everything else (punctuation,
                # whitespace) is passed through unchanged.
                if word.tokenType == 1:
                    word_analyzed = v.analyze(word.tokenText)
                    if len(word_analyzed) > 0:
                        # Take the base form of the first analysis.
                        words_baseform.append(word_analyzed[0].get('BASEFORM'))
                    else:
                        # No analysis found; keep the surface form.
                        words_baseform.append(word.tokenText)
                else:
                    words_baseform.append(word.tokenText)
            # Whitespace tokens are preserved, so joining with '' restores
            # the original spacing.
            lemmatized_file.write(''.join(words_baseform))
    v.terminate()
    return lemmatized_filename
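As a quick sanity check, the lemmatizer can be run on a small sample and the first line compared before and after. The file name corpus.txt below is just a placeholder for any UTF-8 text file with one sentence per line:

lemmatized = lemmatize_file('corpus.txt')
with open('corpus.txt') as original, open(lemmatized) as result:
    print('before:', original.readline().strip())
    print('after: ', result.readline().strip())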
def read_data(file_path):
    '''Read the data into a list of lowercased words, caching the result in
    a word file so that later runs can skip tokenization.'''
    if os.path.exists(file_path + '_words'):
        print('reading from word file...')
        with open(file_path + '_words', 'r') as f:
            words = f.read().split('\n')
        return words
    print('reading from data file...')
    v = Voikko("fi")
    with open(file_path) as f:
        # Keep word (tokenType 1) and punctuation (tokenType 2) tokens.
        words = [
            word.tokenText.lower()
            for word in v.tokens(f.read())
            if word.tokenType == 1 or word.tokenType == 2
        ]
    v.terminate()
    # Cache the word list, one word per line.
    with open(file_path + '_words', 'w') as word_file:
        word_file.write('\n'.join(words))
    return words
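sentence_to_index below expects a word-to-index dictionary, which this section never constructs. A minimal sketch of one common construction, assuming a frequency-ranked vocabulary where index 0 is reserved for out-of-vocabulary words (matching the 0 fallback in sentence_to_index); the build_dictionary name, the 'UNK' key, and the vocabulary_size parameter are all assumptions, not part of the original code:

import collections

def build_dictionary(words, vocabulary_size=50000):
    # Rank words by frequency; index 0 stays reserved for
    # out-of-vocabulary words ('UNK'), as sentence_to_index expects.
    counts = collections.Counter(words).most_common(vocabulary_size - 1)
    dictionary = {'UNK': 0}
    for word, _ in counts:
        dictionary[word] = len(dictionary)
    return dictionary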
def sentence_to_index(index_file, file_path, dictionary):
    '''Read sentences from a file and replace each word with its index in
    the dictionary (0 for out-of-vocabulary words).'''
    print("converting sentences to indices...")
    v = Voikko("fi")
    index_sentences = []
    with open(file_path) as f:
        for sentence in f:
            # Same tokenization as read_data: words and punctuation, lowercased.
            words = [
                word.tokenText.lower()
                for word in v.tokens(sentence)
                if word.tokenType == 1 or word.tokenType == 2
            ]
            # Words missing from the dictionary map to index 0.
            index_words = [dictionary.get(word, 0) for word in words]
            index_sentences.append(index_words)
    v.terminate()
    # Save the sentence indices into index_file with the highest
    # available pickle protocol.
    with open(index_file, 'wb') as index_f:
        pkl.dump(index_sentences, index_f, -1)
    return index_sentences
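Putting the three functions together, a rough end-to-end sketch of the pipeline (the file names are placeholders, and build_dictionary is the hypothetical helper sketched above):

lemmatized = lemmatize_file('corpus.txt')
words = read_data(lemmatized)
dictionary = build_dictionary(words)
index_sentences = sentence_to_index(lemmatized + '_indices', lemmatized, dictionary)
print(index_sentences[0])  # first sentence as a list of word indices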