def tokenize_all_sentences_in_directory(self, directory) -> list[str]:
    """Read every file in the directory and split its contents into sentences."""
    sentences = []
    for file in FileUtil.get_files_in_directory(directory):
        text = FileUtil.read_textfile_into_string(file, self._dataset.encoding())
        # sent_tokenize defaults to its English model; switch to the Italian
        # model when the dataset is Italian.
        if self._italian:
            sentences += sent_tokenize(text, language="italian")
        else:
            sentences += sent_tokenize(text)
    return sentences
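The sentence splitting above delegates to NLTK's sent_tokenize, which loads a pretrained Punkt model per language. A minimal standalone sketch, with the text inlined here instead of read via FileUtil:

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")  # one-time download of the Punkt sentence models

text = "Il gatto dorme. Il cane abbaia."
print(sent_tokenize(text, language="italian"))
# -> ['Il gatto dorme.', 'Il cane abbaia.']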
def iterate_files(tokenizer, preprocessor, folder):
    for file in FileUtil.get_files_in_directory(folder, True):
        file_representation = tokenizer.tokenize(file)
        file_representation.preprocess(preprocessor)
        for word in file_representation.token_list:
            # spaCy may split a word into several tokens, each with its own
            # lemma; join them into a single lemma string in that case.
            lemma = [token.lemma_ for token in spacy_lemmatizer(word)]
            if len(lemma) > 1:
                log.info(f"More than one lemma {lemma} for \"{word}\". "
                         f"Using \"{''.join(lemma)}\" as lemma")
            lemma = "".join(lemma)
            if word in word_to_lemma_map:
                if word_to_lemma_map[word] != lemma:
                    log.info(f"Different duplicate lemma for {word}: "
                             f"{word_to_lemma_map[word]} <-> {lemma}")
            else:
                word_to_lemma_map[word] = lemma
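Here spacy_lemmatizer is presumably a loaded spaCy pipeline. A small sketch of how the word-to-lemma map is built; the model name it_core_news_sm is an assumption (any spaCy pipeline with a lemmatizer works), and the printed lemmas are illustrative:

import spacy

spacy_lemmatizer = spacy.load("it_core_news_sm")  # assumed model

word_to_lemma_map = {}
for word in ["andando", "case"]:
    # Each word is run through the pipeline on its own, mirroring the loop above.
    lemma = "".join(token.lemma_ for token in spacy_lemmatizer(word))
    word_to_lemma_map.setdefault(word, lemma)

print(word_to_lemma_map)  # e.g. {'andando': 'andare', 'case': 'casa'}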
def embedd_all_files_in_directory(self, directory):
    all_filenames = FileUtil.get_files_in_directory(directory)
    all_embeddings = []
    for filename in all_filenames:
        try:
            file_representation = self._tokenize_and_preprocess(filename)
        except (FileNotFoundError, IsADirectoryError, PermissionError, UnicodeDecodeError) as e:
            log.info(f"SKIPPED: Error on reading or tokenizing {filename}: {e}")
            continue
        except JavaSyntaxError as j:
            log.info(f"SKIPPED: JavaSyntaxError on tokenizing {filename} "
                     f"(Note: code files need to be compilable): {j.at}")
            continue
        except (JavaParserError, LexerError) as j:
            log.info(f"SKIPPED: Error on tokenizing {filename} "
                     f"(Note: code files need to be compilable): {j}")
            continue
        file_embedding = self._create_embeddings(file_representation)
        if file_embedding:
            all_embeddings.append(file_embedding)
        else:
            log.info(f"No embedding for {filename}")
    return all_embeddings
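The Java-specific exceptions match those of the javalang library: its parser raises JavaSyntaxError (which carries the offending position in .at) and JavaParserError, and its tokenizer raises LexerError. Assuming that is the parser behind _tokenize_and_preprocess, a sketch of triggering and catching them:

import javalang
from javalang.parser import JavaParserError, JavaSyntaxError
from javalang.tokenizer import LexerError

broken_source = "class Broken { int x = ; }"  # invalid initializer
try:
    javalang.parse.parse(broken_source)
except JavaSyntaxError as j:
    print(f"Syntax error at {j.at}")  # position of the unexpected token
except (JavaParserError, LexerError) as j:
    print(f"Parse/lex error: {j}")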