Example #1
 from typing import List
 from nltk.tokenize import sent_tokenize

 def tokenize_all_sentences_in_directory(self, directory) -> List[str]:
     """Split every text file in the given directory into sentences."""
     sentences = []
     for file in FileUtil.get_files_in_directory(directory):
         text = FileUtil.read_textfile_into_string(file, self._dataset.encoding())
         # sent_tokenize defaults to the English Punkt model; switch to Italian when configured
         if self._italian:
             sentences += sent_tokenize(text, language="italian")
         else:
             sentences += sent_tokenize(text)
     return sentences
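
The example relies on NLTK's sent_tokenize, which needs the Punkt sentence models installed once per environment; below is a minimal sketch of that dependency and of the language switch used above (the sample sentences are made up for illustration):

 # Minimal illustration of the sent_tokenize calls used above.
 import nltk
 from nltk.tokenize import sent_tokenize

 nltk.download("punkt")  # Punkt sentence models, required once

 print(sent_tokenize("First sentence. Second sentence."))
 # ['First sentence.', 'Second sentence.']
 print(sent_tokenize("Prima frase. Seconda frase.", language="italian"))
 # ['Prima frase.', 'Seconda frase.']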
 def iterate_files(tokenizer, preprocessor, folder):
     """Fill word_to_lemma_map with a lemma for every token found in the folder's files."""
     for file in FileUtil.get_files_in_directory(folder, True):
         file_representation = tokenizer.tokenize(file)
         file_representation.preprocess(preprocessor)
         for word in file_representation.token_list:
             # spaCy may split one word into several tokens; collect every lemma
             lemma = [token.lemma_ for token in spacy_lemmatizer(word)]
             if len(lemma) > 1:
                 log.info(
                     f"More than one lemma {lemma} for \"{word}\". Using \"{''.join(lemma)}\" as lemma"
                 )
             lemma = "".join(lemma)
             if word in word_to_lemma_map:
                 if word_to_lemma_map[word] != lemma:
                     log.info(
                         f"Conflicting lemma for duplicate word {word}: {word_to_lemma_map[word]} <-> {lemma}"
                     )
             else:
                 word_to_lemma_map[word] = lemma
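
For context, a minimal sketch of what the spacy_lemmatizer call above is assumed to be: a loaded spaCy pipeline applied to one word at a time, whose tokens each expose a lemma_ attribute (the model name it_core_news_sm is an assumption, chosen to match the Italian setting of Example #1):

 # Hypothetical stand-in for spacy_lemmatizer; the model name is an assumption.
 import spacy

 spacy_lemmatizer = spacy.load("it_core_news_sm")

 def lemma_of(word: str) -> str:
     # One surface form can yield several tokens, hence the join, as in the loop above
     return "".join(token.lemma_ for token in spacy_lemmatizer(word))

 print(lemma_of("andando"))  # e.g. "andare"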
Example #3
 def embedd_all_files_in_directory(self, directory):
     """Create one embedding per readable, parseable file; skip files that fail."""
     all_filenames = FileUtil.get_files_in_directory(directory)
     all_embeddings = []
     for filename in all_filenames:
         try:
             file_representation = self._tokenize_and_preprocess(filename)
         except (FileNotFoundError, IsADirectoryError, PermissionError, UnicodeDecodeError) as e:
             log.info(f"SKIPPED: Error on reading or tokenizing {filename}: {e}")
             continue
         except JavaSyntaxError as j:
             log.info(f"SKIPPED: JavaSyntaxError on tokenizing {filename} (Note: code files need to be compilable): {j.at}")
             continue
         except (JavaParserError, LexerError) as j:
             log.info(f"SKIPPED: Error on tokenizing {filename} (Note: code files need to be compilable): {j}")
             continue
         file_embedding = self._create_embeddings(file_representation)
         if file_embedding:
             all_embeddings.append(file_embedding)
         else:
             log.info(f"No embedding for {filename}")
     return all_embeddings
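
The Java-specific exceptions caught above (JavaSyntaxError, JavaParserError, LexerError) match the names exposed by the javalang library, so _tokenize_and_preprocess presumably parses Java sources with it; below is a small sketch of how such errors arise on a non-compilable file (the javalang origin and the sample snippets are assumptions, not taken from the example):

 # Hypothetical sketch of where the Java exceptions above can come from,
 # assuming the tokenizer wraps the javalang parser.
 import javalang
 from javalang.parser import JavaParserError, JavaSyntaxError
 from javalang.tokenizer import LexerError

 def try_parse(java_source: str):
     try:
         return javalang.parse.parse(java_source)
     except JavaSyntaxError as e:
         print(f"Syntax error at {e.at}")  # .at names the offending token, as logged above
     except (JavaParserError, LexerError) as e:
         print(f"Parser or lexer error: {e}")
     return None

 try_parse("class Ok { void m() {} }")  # parses fine
 try_parse("class Broken { void m( }")  # malformed parameter list raises JavaSyntaxError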