def test_build_corpus_features_en(self):
        """Build English corpus features from one raw file and check suffixes.

        Takes the first entry that ``os.listdir`` reports in the ``en``
        sub-folder of RAW_CORPUS_ROOT, builds a CorpusFeatures from it and
        expects more than five suffixes in the n-grams collector.
        """
        en_folder = os.path.join(RAW_CORPUS_ROOT, 'en')
        first_entry = os.listdir(en_folder)[0]
        source_file = os.path.join(en_folder, first_entry)

        dictionary = DetailedDictionary.read_from_file(source_file)
        features = CorpusFeatures('en', EnAlphabet, source_file)
        features.build(dictionary)

        suffix_count = len(features.ngrams_collector.suffixes)
        self.assertGreater(suffix_count, 5)
Exemplo n.º 2
0
    def read_corpus_by_lang(
            cls,
            folder: str = CORPUS_ROOT,
            ignore_cached: bool = False,
            read_cached_only: bool = False) -> List[CorpusFeatures]:
        """
        Load (or build) one combined CorpusFeatures per language folder.

        Works like read_corpus_by_text, except that all texts found under
        "<folder>/raw/<lang>/" are merged into a single corpus which is cached
        as "<folder>/features/<lang>.json".

        :param folder: root folder holding the "raw" and "features" sub-folders
        :param ignore_cached: when True, existing cache files are not read
        :param read_cached_only: when True, a corpus is never (re)built; a
            language without a usable cache file is left out of the result
        :return: the corpora that were loaded or built, one per language
        """
        raw_root = os.path.join(folder, 'raw')
        cache_root = os.path.join(folder, 'features')
        if not os.path.isdir(cache_root):
            os.mkdir(cache_root)

        data = []  # type: List[CorpusFeatures]
        for language in os.listdir(raw_root):
            lang_folder = os.path.join(raw_root, language)
            if not os.path.isdir(lang_folder):
                continue

            # e.g. '.../features/fr.json'
            feature_path = os.path.join(cache_root, f'{language}.json')
            corpus = None

            if not ignore_cached and os.path.isfile(feature_path):
                try:
                    cf = CorpusFeatures.load_from_file(feature_path)
                    if cf.version == CorpusFeatures.ACTUAL_VERSION:
                        corpus = cf
                    else:
                        # cache written by another version: report and ignore it
                        print(
                            f'File "{feature_path}" has version "{cf.version}"'
                        )
                except Exception as e:
                    # an unreadable cache is reported and treated as a miss
                    print(f'Error loading "{feature_path}": {e}')

            if not (corpus or read_cached_only):
                # no usable cache: build from the raw texts and cache the result
                corpus = CorpusFeatures(
                    language, alphabet_by_code[language], lang_folder)
                lang_dictionary = DetailedDictionary.read_from_folder(
                    lang_folder)
                corpus.build(lang_dictionary)
                corpus.save_to_file(feature_path)

            if not corpus:
                continue
            corpus.multifile = True
            corpus.cache_file_path = feature_path
            data.append(corpus)
        return data
    def test_colorize_corpuses(self):
        """Render a morph-colorized PNG next to every raw ``.txt`` corpus file.

        For each language sub-folder of RAW_CORPUS_ROOT the matching list of
        corpora is taken from ``CorpusFeatures.load_from_folder(CORPUS_ROOT)``.
        Every ``.txt`` file is tokenized, encoded with the corpus whose
        ``corpus_path`` equals the file path, and the result is passed to
        ``colorize_morph_vectors`` together with the ``.png`` target name.

        Raises KeyError if a language folder has no entry in the loaded
        corpora and IndexError if no corpus matches a text file.
        """
        corpus_by_lang = CorpusFeatures.load_from_folder(CORPUS_ROOT)
        for lang_folder in os.listdir(RAW_CORPUS_ROOT):
            subfolder = os.path.join(RAW_CORPUS_ROOT, lang_folder)
            if not os.path.isdir(subfolder):
                continue
            corpuses = corpus_by_lang[lang_folder]

            for file_name in os.listdir(subfolder):
                full_path = os.path.join(subfolder, file_name)
                if not os.path.isfile(full_path) or not file_name.endswith('.txt'):
                    continue
                with open(full_path, 'r', encoding='utf-8') as fr:
                    text = fr.read()
                # split() without a separator returns [] for empty or
                # whitespace-only text and never yields empty tokens, whereas
                # split(' ') always returns at least [''] - which made the
                # emptiness check below unreachable.
                words = text.split()
                if not words:
                    continue

                cf = [c for c in corpuses if c.corpus_path == full_path][0]
                new_file_name = os.path.splitext(full_path)[0] + '.png'

                vectorized = cf.encode_words_by_morphs(words)
                self.colorize_morph_vectors(new_file_name, vectorized)
 def test_find_morphs(self):
     """Check that find_dict_morphs() assigns a non-empty root to a word.

     A three-word dictionary and a hand-made set of prefix/suffix n-grams
     are installed on a fresh CorpusFeatures; after find_dict_morphs() the
     first word card must have a root of non-zero length.
     """
     cf = CorpusFeatures('en', EnAlphabet, '')
     cf.dictionary = DetailedDictionary()
     cf.dictionary.words = [
         WordCard(*card_args)
         for card_args in (('deprived', 10), ('prived', 6), ('deprive', 5))
     ]
     cf.dictionary.words_total = len(cf.dictionary.words)
     cf.all_words = set(card.word for card in cf.dictionary.words)

     collector = MarginNgramsCollector(cf.alphabet, cf.dictionary)
     cf.ngrams_collector = collector
     for prefix in (MarginNgram('de', 1, 3, 1), MarginNgram('in', 1, 2, 1)):
         collector.prefixes.append(prefix)
     for suffix in (MarginNgram('ion', -1, 3, 1), MarginNgram('d', -1, 4, 1)):
         collector.suffixes.append(suffix)

     cf.find_dict_morphs()

     first_card = cf.dictionary.words[0]
     self.assertGreater(len(first_card.root), 0)
Exemplo n.º 5
0
    def featurize_folder(self, corpus_folder: str) -> Dict[str, List[Tuple[str, List[float]]]]:
        """
        Featurize every corpus found in a corpus folder, grouped by language.

        The folder layout has to match the one expected by
        CorpusFeatures.load_from_folder(), i.e.:
         raw/en/file1.txt ...
         features/en/file1.txt ...

        Languages whose corpus list is empty are left out of the result.

        :param corpus_folder: root folder passed to load_from_folder()
        :return: language -> list of featurize_corpus() results, one per corpus
        """
        features_by_lang = {}  # type: Dict[str, List[Tuple[str, List[float]]]]
        loaded = CorpusFeatures.load_from_folder(corpus_folder)
        for lang, corpuses in loaded.items():
            if corpuses:
                features_by_lang[lang] = [
                    self.featurize_corpus(corpus) for corpus in corpuses
                ]
        return features_by_lang
 def test_build_all_features(self):
     """Loading CORPUS_ROOT must yield more than three entries."""
     loaded_count = len(CorpusFeatures.load_from_folder(CORPUS_ROOT))
     self.assertGreater(loaded_count, 3)
 def test_load_corpus_features_en(self):
     """Load the cached English features file and check its suffix count."""
     features_file = os.path.join(FEATURES_CORPUS_ROOT, 'en', 'features.json')
     loaded = CorpusFeatures.load_from_file(features_file)
     suffix_count = len(loaded.ngrams_collector.suffixes)
     self.assertGreater(suffix_count, 5)
Exemplo n.º 8
0
 def get_cached_corpus_by_path(cls, path: str) -> CorpusFeatures:
     """Return the CorpusFeatures stored in the file at *path*.

     Thin wrapper over CorpusFeatures.load_from_file(); ``cls`` is not used.
     """
     cached_corpus = CorpusFeatures.load_from_file(path)
     return cached_corpus
Exemplo n.º 9
0
    def read_corpus_by_text(cls,
                            folder: str = CORPUS_ROOT,
                            ignore_cached: bool = False,
                            read_cached_only: bool = False,
                            file_name_only: str = '') -> List[CorpusFeatures]:
        """
        Load (or build) one CorpusFeatures per raw text file.

        "folder" should have the following structure:
         - raw
           - <lang_1>
             - <file_1_1>.txt  # source text - words in lowercase, space-separated
             ..
           - <lang_N>
         - features
           - <lang_1>
             - <file_1_1>.json  # JSON-encoded CorpusFeatures for file_1_1.txt
             ..

        :param folder: root folder holding the "raw" and "features" sub-folders
        :param ignore_cached: when True, existing cache files are not read
        :param read_cached_only: when True, a corpus is never (re)built; a
            text without a usable cache file is left out of the result
        :param file_name_only: when non-empty, only files with exactly this
            name are processed
        :return: the corpora that were loaded or built, one per text file
        """
        raw_root = os.path.join(folder, 'raw')
        cache_root = os.path.join(folder, 'features')
        if not os.path.isdir(cache_root):
            os.mkdir(cache_root)

        data = []  # type: List[CorpusFeatures]
        for language in os.listdir(raw_root):
            # now we are somewhere like '.../raw/fr/'
            lang_folder = os.path.join(raw_root, language)
            if not os.path.isdir(lang_folder):
                continue

            feature_subfolder = os.path.join(cache_root, language)
            for file_name in os.listdir(lang_folder):
                if file_name_only and file_name != file_name_only:
                    continue
                # '.../raw/fr/file01.txt'
                full_path = os.path.join(lang_folder, file_name)
                is_text_file = (os.path.isfile(full_path)
                                and file_name.endswith('.txt'))
                if not is_text_file:
                    continue

                # '.../features/fr/file01.json' - the "cached" feature file
                base_name = os.path.splitext(file_name)[0]
                feature_path = os.path.join(feature_subfolder,
                                            base_name + '.json')

                corpus = None  # type: Optional[CorpusFeatures]
                if not ignore_cached and os.path.isfile(feature_path):
                    try:
                        cf = CorpusFeatures.load_from_file(feature_path)
                        if cf.version == CorpusFeatures.ACTUAL_VERSION:
                            corpus = cf
                        else:
                            # cache written by another version: report, ignore
                            print(
                                f'File "{feature_path}" has version "{cf.version}"'
                            )
                    except Exception as e:
                        # an unreadable cache is reported and treated as a miss
                        print(f'Error loading "{feature_path}": {e}')

                if not (corpus or read_cached_only):
                    # no usable cache: build from the raw text ...
                    corpus = CorpusFeatures(
                        language, alphabet_by_code[language], full_path)
                    text_dictionary = DetailedDictionary.read_from_file(
                        full_path)
                    corpus.build(text_dictionary)
                    # ... and cache the result
                    if not os.path.isdir(feature_subfolder):
                        os.mkdir(feature_subfolder)
                    corpus.save_to_file(feature_path)

                if not corpus:
                    continue
                corpus.cache_file_path = feature_path
                data.append(corpus)
        return data