Пример #1
0
    def divide_works(self, corpus):
        """Use the TLGU work-breaking option to split corpus files into one
        file per individual work.

        :param corpus: Either 'tlg' or 'phi5'.
        :raises ValueError: If ``corpus`` is not a recognized corpus name.
        TODO: Maybe incorporate this into ``convert_corpus()``
        TODO: Write test for this
        """
        if corpus == 'tlg':
            orig_dir_rel = '~/cltk_data/originals/tlg'
            works_dir_rel = '~/cltk_data/greek/text/tlg/individual_works'
            file_prefix = 'TLG'
            latin = False
        elif corpus == 'phi5':
            orig_dir_rel = '~/cltk_data/originals/phi5'
            works_dir_rel = '~/cltk_data/latin/text/phi5/individual_works'
            file_prefix = 'LAT'
            latin = True  # this is for the optional TLGU argument to convert()
        else:
            # Previously an unrecognized corpus fell through and crashed
            # below with UnboundLocalError; fail fast with a clear message.
            raise ValueError("Corpus '{}' not available. Choose from 'tlg' or 'phi5'.".format(corpus))

        orig_dir = os.path.expanduser(orig_dir_rel)
        works_dir = os.path.expanduser(works_dir_rel)
        if not os.path.exists(works_dir):
            os.makedirs(works_dir)

        files = os.listdir(orig_dir)
        texts = [x for x in files if x.endswith('.TXT') and x.startswith(file_prefix)]

        for file in texts:
            orig_file_path = os.path.join(orig_dir, file)
            new_file_path = os.path.join(works_dir, file)

            try:
                self.convert(orig_file_path, new_file_path, divide_works=True, latin=latin)
                logger.info('Writing files at %s to %s.', orig_file_path, works_dir)
            except Exception as err:
                logger.error('Failed to convert files: %s.', err)
Пример #2
0
    def _check_distributed_corpora_file(self):
        """Check '~/cltk_data/distributed_corpora.yaml' for any custom,
        distributed corpora that the user wants to load locally.

        :return: List of dicts (``origin``, ``type``, ``name``) for each
            user-defined corpus whose language matches ``self.language``.
        TODO: write check or try if `cltk_data` dir is not present
        """
        if self.testing:
            distributed_corpora_fp = os.path.expanduser('~/cltk_data/test_distributed_corpora.yaml')
        else:
            distributed_corpora_fp = os.path.expanduser('~/cltk_data/distributed_corpora.yaml')

        try:
            with open(distributed_corpora_fp) as file_open:
                corpora_dict = yaml.safe_load(file_open)
        except FileNotFoundError:
            logger.info('Distributed_corpora.yaml file not found.')
            return []
        except yaml.parser.ParserError as parse_err:
            logger.debug('Yaml parsing error: %s' % parse_err)
            return []

        # `yaml.safe_load` returns None for an empty document; without this
        # guard the loop below would raise TypeError.
        if not corpora_dict:
            return []

        user_defined_corpora = []
        for corpus_name, about in corpora_dict.items():
            if about['language'].lower() == self.language:
                user_defined_corpus = dict()
                user_defined_corpus['origin'] = about['origin']
                user_defined_corpus['type'] = about['type']
                user_defined_corpus['name'] = corpus_name
                user_defined_corpora.append(user_defined_corpus)

        return user_defined_corpora
Пример #3
0
    def _long_by_position(self, syllable, sentence):
        """Check if syllable is long by position. Long by position includes:
        1) Next syllable begins with two consonants, unless those consonants
        are a stop + liquid combination
        2) Next syllable begins with a double consonant
        3) Syllable ends with a consonant and the next syllable begins with a
        consonant
        :param syllable: Current syllable
        :param sentence: Current sentence
        :return: True if syllable is long by position, else False
        :rtype : bool
        """
        try:
            next_syll = sentence[sentence.index(syllable) + 1]
            # Case 1: next syllable opens with two single consonants. The
            # docstring exempts only a stop + liquid *pair*; the previous
            # `not in stops and not in liquids` test (De Morgan slip)
            # excluded any cluster containing either a stop or a liquid.
            if (next_syll[0] in self.sing_cons and next_syll[1] in
                    self.sing_cons) and not (next_syll[0] in self.stops and
                                             next_syll[1] in self.liquids):
                return True
            # Case 2: next syllable begins with a double consonant
            elif syllable[-1] in self.vowels and next_syll[0] in \
                    self.doub_cons:
                return True
            # Case 3: consonant-final syllable before consonant-initial one
            elif syllable[-1] in self.sing_cons and next_syll[0] in \
                    self.sing_cons:
                return True
            else:
                return False
        except IndexError:
            # Final syllable of the sentence has no successor.
            logger.info("IndexError while checking if syllable '%s' is long. Continuing.", syllable)
        return False
Пример #4
0
    def __init__(self, language, testing=False):
        """Setup corpus importing.

        `testing` is a hack to check a tmp .yaml file to look at or local corpus. This keeps from overwriting
        local. A better idea is probably to refuse to overwrite the .yaml.

        :param language: Language name; lower-cased internally.
        :param testing: When True, read the test .yaml instead of the real one.
        """
        self.language = language.lower()

        assert isinstance(testing, bool), '`testing` parameter must be boolean type'
        self.testing = testing

        self.user_defined_corpora = self._setup_language_variables()

        # if user_defined_corpora, then we need to add these to the corpus.py objects
        if self.user_defined_corpora:
            logger.info('User-defined corpus found for "{}" language'.format(self.language))
            try:
                # Look up first so the debug messages below are only emitted
                # when core corpora actually exist (lookup may KeyError).
                self.official_corpora = LANGUAGE_CORPORA[self.language]
                logger.debug('Core corpora also found for "{}" language'.format(self.language))
                logger.debug('Combining the user-defined and the core corpora')
                # Copy rather than alias: appending to the registry list
                # itself would mutate the shared LANGUAGE_CORPORA in place.
                self.all_corpora = list(self.official_corpora)
                self.all_corpora.extend(self.user_defined_corpora)
            except KeyError:
                logger.debug('Nothing in the official repos '
                             'for "{}" language. Make the all_corpora solely '
                             'from the .yaml'.format(self.language))
                self.all_corpora = list(self.user_defined_corpora)
        else:
            logger.info('No user-defined corpora found for "{}" language'.format(self.language))
            self.all_corpora = LANGUAGE_CORPORA[self.language]
Пример #5
0
    def divide_works(self, corpus):
        """Use the TLGU work-breaking option to split corpus files into one
        file per individual work.

        :param corpus: Either 'tlg' or 'phi5'.
        :raises ValueError: If ``corpus`` is not a recognized corpus name.
        TODO: Maybe incorporate this into ``convert_corpus()``
        TODO: Write test for this
        """
        if corpus == 'tlg':
            orig_dir_rel = '~/cltk_data/originals/tlg'
            works_dir_rel = '~/cltk_data/greek/text/tlg/individual_works'
            file_prefix = 'TLG'
            latin = False
        elif corpus == 'phi5':
            orig_dir_rel = '~/cltk_data/originals/phi5'
            works_dir_rel = '~/cltk_data/latin/text/phi5/individual_works'
            file_prefix = 'LAT'
            latin = True  # this is for the optional TLGU argument to convert()
        else:
            # Previously an unrecognized corpus fell through and crashed
            # below with UnboundLocalError; fail fast with a clear message.
            raise ValueError("Corpus '{}' not available. Choose from 'tlg' or 'phi5'.".format(corpus))

        orig_dir = os.path.expanduser(orig_dir_rel)
        works_dir = os.path.expanduser(works_dir_rel)
        if not os.path.exists(works_dir):
            os.makedirs(works_dir)

        files = os.listdir(orig_dir)
        texts = [x for x in files if x.endswith('.TXT') and x.startswith(file_prefix)]

        for file in texts:
            orig_file_path = os.path.join(orig_dir, file)
            new_file_path = os.path.join(works_dir, file)

            try:
                self.convert(orig_file_path, new_file_path, divide_works=True, latin=latin)
                logger.info('Writing files at %s to %s.', orig_file_path, works_dir)
            except Exception as err:
                logger.error('Failed to convert files: %s.', err)
Пример #6
0
    def _check_distributed_corpora_file(self):
        """Check '~/cltk_data/distributed_corpora.yaml' for any custom,
        distributed corpora that the user wants to load locally.

        :return: List of dicts (``git_remote``, ``name``, ``type``) for each
            user-defined corpus whose language matches ``self.language``.
        TODO: write check or try if `cltk_data` dir is not present
        """
        if self.testing:
            distributed_corpora_fp = os.path.expanduser('~/cltk_data/test_distributed_corpora.yaml')
        else:
            distributed_corpora_fp = os.path.expanduser('~/cltk_data/distributed_corpora.yaml')

        try:
            with open(distributed_corpora_fp) as file_open:
                corpora_dict = yaml.safe_load(file_open)
        except FileNotFoundError:
            logger.info('Distributed_corpora.yaml file not found.')
            return []
        except yaml.parser.ParserError as parse_err:
            logger.debug('Yaml parsing error: %s' % parse_err)
            return []

        # `yaml.safe_load` returns None for an empty document; without this
        # guard the loop below would raise TypeError.
        if not corpora_dict:
            return []

        user_defined_corpora = []
        for corpus_name, about in corpora_dict.items():
            if about['language'].lower() == self.language:
                user_defined_corpus = dict()
                user_defined_corpus['git_remote'] = about['git_remote']
                user_defined_corpus['name'] = corpus_name
                user_defined_corpus['type'] = about['type']
                user_defined_corpora.append(user_defined_corpus)

        return user_defined_corpora
Пример #7
0
    def _long_by_position(self, syllable, sentence):
        """Check if syllable is long by position.

        Long by position includes:
        1) Next syllable begins with two consonants, unless those consonants
        are a stop + liquid combination
        2) Next syllable begins with a double consonant
        3) Syllable ends with a consonant and the next syllable begins with a
        consonant
        :param syllable: Current syllable
        :param sentence: Current sentence
        :return: True if syllable is long by position, else False
        :rtype : bool
        """
        try:
            next_syll = sentence[sentence.index(syllable) + 1]
            # Case 1: next syllable opens with two single consonants. Only a
            # stop + liquid *pair* is exempt; the previous `not in stops and
            # not in liquids` test (De Morgan slip) excluded any cluster
            # containing either a stop or a liquid.
            if (next_syll[0] in self.sing_cons and next_syll[1]
                    in self.sing_cons) and not (next_syll[0] in self.stops and
                                                next_syll[1] in self.liquids):
                return True
            # Case 2: next syllable begins with a double consonant
            elif syllable[-1] in self.vowels and next_syll[0] in self.doub_cons:
                return True
            # Case 3: consonant-final syllable before consonant-initial one
            elif syllable[-1] in self.sing_cons and (next_syll[0]
                                                     in self.sing_cons):
                return True
            else:
                return False
        except IndexError:
            # Final syllable of the sentence has no successor.
            logger.info(
                "IndexError while checking if syllable '%s' is long. Continuing.",
                syllable)
        return False
Пример #8
0
    def __init__(self, language, testing=False):
        """Setup corpus importing.

        `testing` is a hack to check a tmp .yaml file to look at or local corpus. This keeps from overwriting
        local. A better idea is probably to refuse to overwrite the .yaml.

        :param language: Language name; lower-cased internally.
        :param testing: When True, read the test .yaml instead of the real one.
        """
        self.language = language.lower()

        assert isinstance(testing, bool), '`testing` parameter must be boolean type'
        self.testing = testing

        self.user_defined_corpora = self._setup_language_variables()

        # if user_defined_corpora, then we need to add these to the corpus.py objects
        if self.user_defined_corpora:
            logger.info('User-defined corpus found for "{}" language'.format(self.language))
            try:
                # Look up first so the debug messages below are only emitted
                # when core corpora actually exist (lookup may KeyError).
                self.official_corpora = LANGUAGE_CORPORA[self.language]
                logger.debug('Core corpora also found for "{}" language'.format(self.language))
                logger.debug('Combining the user-defined and the core corpora')
                # Copy rather than alias: appending to the registry list
                # itself would mutate the shared LANGUAGE_CORPORA in place.
                self.all_corpora = list(self.official_corpora)
                self.all_corpora.extend(self.user_defined_corpora)
            except KeyError:
                logger.debug('Nothing in the official repos '
                             'for "{}" language. Make the all_corpora solely '
                             'from the .yaml'.format(self.language))
                self.all_corpora = list(self.user_defined_corpora)
        else:
            logger.info('No user-defined corpora found for "{}" language'.format(self.language))
            self.all_corpora = LANGUAGE_CORPORA[self.language]
Пример #9
0
    def _make_syllables(self, sentences_words):
        """Divide the word tokens into a list of syllables.

        Note that a syllable in this instance is defined as a vocalic group
        (i.e., vowel or a diphthong). This means that all syllables which are
        not the last syllable in the word will end with a vowel or diphthong.
        TODO: Determine whether a CLTK syllabifier could replace this
        :param sentences_words: Text to be tokenized and syllabified
        :return: Syllabified words, grouped by sentence
        :rtype : list
        """
        text = self._tokenize(sentences_words)
        all_syllables = []
        for sentence in text:
            syll_per_sent = []
            for word in sentence:
                syll_start = 0  # Begins syllable iterator
                syll_per_word = []
                cur_letter_in = 0  # Begins general iterator
                while cur_letter_in < len(word):
                    letter = word[cur_letter_in]
                    # Two adjacent letters that form a diphthong close the
                    # current syllable after the second letter.
                    if (cur_letter_in != len(word) - 1) and \
                       (word[cur_letter_in] + word[cur_letter_in + 1]) \
                       in self.diphthongs:
                        cur_letter_in += 1
                        # Syllable ends with a diphthong
                        syll_per_word.append(word[syll_start:cur_letter_in +
                                                  1])
                        syll_start = cur_letter_in + 1
                    elif (letter in self.vowels) or (letter
                                                     in self.long_vowels):
                        # Syllable ends with a vowel
                        syll_per_word.append(word[syll_start:cur_letter_in +
                                                  1])
                        syll_start = cur_letter_in + 1
                    cur_letter_in += 1
                try:
                    last_vowel = syll_per_word[-1][-1]  # Last vowel of a word
                    # Modifies general iterator to accommodate consonants after
                    # the last syllable in a word; walk back from the word end.
                    cur_letter_in = len(word) - 1
                    # Contains all of the consonants after the last vowel in a word
                    leftovers = ''
                    while word[cur_letter_in] != last_vowel:
                        if word[cur_letter_in] != '.':
                            # Adds consonants to leftovers (prepended to keep order)
                            leftovers = word[cur_letter_in] + leftovers
                        cur_letter_in -= 1
                    # Adds leftovers to last syllable in a word
                    syll_per_word[-1] += leftovers
                    syll_per_sent.append(syll_per_word)
                except IndexError:
                    # A word with no vowels yields an empty syll_per_word;
                    # log and skip the word rather than crash.
                    logger.info(
                        "IndexError while making syllables of '%s'. Continuing.",
                        word)
            all_syllables.append(syll_per_sent)
        return all_syllables
Пример #10
0
 def test_contribs_find_write_contribs(self):
     """Test contrib writing function."""
     contribs_path = 'contributors.md'
     # Start from a clean slate; a missing file is fine.
     try:
         os.remove(contribs_path)
     except FileNotFoundError:
         logger.info("No file to remove at '%s'. Continuing.", contribs_path)
     find_write_contribs()
     self.assertTrue(os.path.isfile(contribs_path))
Пример #11
0
 def test_contribs_write_contribs(self):
     """Test the Contributors file-writing method."""
     out_path = 'contributors.md'
     # Remove any stale output from a previous run.
     try:
         os.remove(out_path)
     except FileNotFoundError:
         logger.info("No file to remove at '%s'. Continuing.", out_path)
     Contributors().write_contribs()
     self.assertTrue(os.path.isfile(out_path))
Пример #12
0
 def test_contribs_find_write_contribs(self):
     """Test contrib writing function."""
     target = 'contributors.md'
     # Delete leftover output so the assertion below is meaningful.
     try:
         os.remove(target)
     except FileNotFoundError:
         logger.info("No file to remove at '%s'. Continuing.", target)
     find_write_contribs()
     self.assertTrue(os.path.isfile(target))
Пример #13
0
 def test_contribs_write_contribs(self):
     """Test the Contributors file-writing method."""
     target = 'contributors.md'
     # Delete leftover output so the assertion below is meaningful.
     try:
         os.remove(target)
     except FileNotFoundError:
         logger.info("No file to remove at '%s'. Continuing.", target)
     writer = Contributors()
     writer.write_contribs()
     self.assertTrue(os.path.isfile(target))
Пример #14
0
    def _what_os(self):
        """Get operating system.

        :return: One of 'linux', 'mac', or 'windows'.
        :raises OSError: On an unrecognized platform. Previously an unknown
            platform left ``_platform`` unbound and crashed with
            UnboundLocalError at the log call below.
        """
        if platform == "linux" or platform == "linux2":
            _platform = 'linux'
        elif platform == "darwin":
            _platform = 'mac'
        elif platform == "win32":
            _platform = 'windows'
        else:
            raise OSError("Unsupported operating system: '{}'.".format(platform))
        logger.info("Detected '{}' operating system.".format(_platform))

        return _platform
Пример #15
0
 def load_replacement_patterns(self):
     """Load the configured dictionary module and return its DICTIONARY.

     Checks for availability of the specified dictionary under the user's
     cltk_data models directory.
     """
     module_file = self.dictionary + '.py'
     models_dir = self.language + '_models_cltk'
     dict_path = os.path.expanduser(
         os.path.join('~/cltk_data', self.language, 'model', models_dir,
                      'semantics', module_file))
     logger.info('Loading lemmata or synonyms. This may take a minute.')
     loader = importlib.machinery.SourceFileLoader(module_file, dict_path)
     semantics_module = types.ModuleType(loader.name)
     loader.exec_module(semantics_module)
     return semantics_module.DICTIONARY
Пример #16
0
    def _make_syllables(self, sentences_words):
        """Divide the word tokens into a list of syllables.

        Note that a syllable in this instance is defined as a vocalic group
        (i.e., vowel or a diphthong). This means that all syllables which are
        not the last syllable in the word will end with a vowel or diphthong.
        TODO: Determine whether a CLTK syllabifier could replace this
        :param sentences_words: Text to be tokenized and syllabified
        :return: Syllabified words, grouped by sentence
        :rtype : list
        """
        text = self._tokenize(sentences_words)
        all_syllables = []
        for sentence in text:
            syll_per_sent = []
            for word in sentence:
                syll_start = 0  # Begins syllable iterator
                syll_per_word = []
                cur_letter_in = 0  # Begins general iterator
                while cur_letter_in < len(word):
                    letter = word[cur_letter_in]
                    # Two adjacent letters that form a diphthong close the
                    # current syllable after the second letter.
                    if (cur_letter_in != len(word) - 1) and \
                       (word[cur_letter_in] + word[cur_letter_in + 1]) \
                       in self.diphthongs:
                        cur_letter_in += 1
                        # Syllable ends with a diphthong
                        syll_per_word.append(word[syll_start:cur_letter_in + 1])
                        syll_start = cur_letter_in + 1
                    elif (letter in self.vowels) or (letter in self.long_vowels):
                        # Syllable ends with a vowel
                        syll_per_word.append(word[syll_start:cur_letter_in + 1])
                        syll_start = cur_letter_in + 1
                    cur_letter_in += 1
                try:
                    last_vowel = syll_per_word[-1][-1]  # Last vowel of a word
                    # Modifies general iterator to accommodate consonants after
                    # the last syllable in a word; walk back from the word end.
                    cur_letter_in = len(word) - 1
                    # Contains all of the consonants after the last vowel in a word
                    leftovers = ''
                    while word[cur_letter_in] != last_vowel:
                        if word[cur_letter_in] != '.':
                            # Adds consonants to leftovers (prepended to keep order)
                            leftovers = word[cur_letter_in] + leftovers
                        cur_letter_in -= 1
                    # Adds leftovers to last syllable in a word
                    syll_per_word[-1] += leftovers
                    syll_per_sent.append(syll_per_word)
                except IndexError:
                    # A word with no vowels yields an empty syll_per_word;
                    # log and skip the word rather than crash.
                    logger.info("IndexError while making syllables of '%s'. Continuing.", word)
            all_syllables.append(syll_per_sent)
        return all_syllables
Пример #17
0
def write_contribs(def_dict_list):
    """Write to file, in current dir, 'contributors.md'."""
    note = '# Contributors\nCLTK Core authors, ordered alphabetically by first name\n\n'
    parts = [note]
    # Build the markdown body: one section per contributor, one bullet
    # per module they touched.
    for contrib in def_dict_list:
        parts.append('## ' + contrib + '\n')
        for module in def_dict_list[contrib]:
            parts.append('* ' + module + '\n')
        parts.append('\n')
    file_name = 'contributors.md'
    with open(file_name, 'w') as file_open:
        file_open.write(''.join(parts))
    logger.info('Wrote contribs file at "%s".', file_name)
Пример #18
0
 def _check_install(self):
     """Check if tlgu installed, if not install it.

     Fix: the original used ``try/except/else`` around the non-sudo build;
     since ``subprocess.call`` returns an exit code instead of raising, the
     ``else`` clause ran on every successful call, so ``sudo make install``
     executed even after a successful non-sudo install. Sudo is now tried
     only when the non-sudo install fails.
     """
     try:
         subprocess.check_output(['which', 'tlgu'])
     except Exception as exc:
         logger.info('TLGU not installed: %s', exc)
         logger.info('Installing TLGU.')
         if not subprocess.check_output(['which', 'gcc']):
             logger.error('GCC seems not to be installed.')
         else:
             tlgu_path_rel = '~/cltk_data/greek/software/greek_software_tlgu'
             tlgu_path = os.path.expanduser(tlgu_path_rel)
             if not self.testing:
                 print('Do you want to install TLGU? To continue, press Return. To exit, Control-C.')
                 input()
             else:
                 print('Automated or test build, skipping keyboard input confirmation for installation of TLGU.')
             try:
                 p_out = subprocess.call('cd {0} && make install'.format(tlgu_path), shell=True)
             except Exception as exc:
                 logger.error('TLGU install failed: %s', exc)
                 return
             if p_out == 0:
                 logger.info('TLGU installed.')
                 return
             logger.error('TLGU install without sudo failed.')
             # Retry with root access, needed on Linux for '/usr/local/bin'.
             p_out = subprocess.call('cd {0} && sudo make install'.format(tlgu_path), shell=True)
             if p_out == 0:
                 logger.info('TLGU installed.')
             else:
                 logger.error('TLGU install with sudo failed.')
Пример #19
0
    def make(self):
        """Build the Lapos program via ``make``.

        Runs ``make`` in the user's lapos software dir and reports the
        outcome on stdout and in the log.
        """
        # NOTE(review): original comment read '#! for linux install Clan' —
        # intent unclear; presumably a platform note. TODO confirm.
        fp = os.path.expanduser('~/cltk_data/multilingual/software/lapos')
        p_out = subprocess.call('cd {} && make'.format(fp),
                                shell=True,
                                stdout=subprocess.DEVNULL)

        if p_out == 0:
            print('Lapos built successfully.')
            # Fixed log message: was 'Lapos build successfully.' —
            # inconsistent with the print above.
            logger.info('Lapos built successfully.')
        else:
            print('Lapos did not build successfully.')
            logger.error('Lapos did not build successfully.')
Пример #20
0
def write_contribs(def_dict_list: Dict[str, List[str]]) -> None:
    """Write to file, in current dir, 'contributors.md'."""
    note = '# Contributors\nCLTK Core authors, ordered alphabetically by first name\n\n'  # type: str  # pylint: disable=line-too-long
    sections = [note]  # type: List[str]
    # One '##' section per contributor, one bullet per credited module.
    for contrib in def_dict_list:
        sections.append('## ' + contrib + '\n')
        for module in def_dict_list[contrib]:
            sections.append('* ' + module + '\n')
        sections.append('\n')
    file_name = 'contributors.md'  # type: str
    with open(file_name, 'w') as file_open:  # type: IO
        file_open.write(''.join(sections))
    logger.info('Wrote contribs file at "%s".', file_name)
Пример #21
0
 def _check_install():
     """Check if tlgu installed, if not install it.

     Fix: the original used ``try/except/else`` around the non-sudo build;
     since ``subprocess.call`` returns an exit code instead of raising, the
     ``else`` clause ran on every successful call, so ``sudo make install``
     executed even after a successful non-sudo install. Sudo is now tried
     only when the non-sudo install fails.
     """
     try:
         subprocess.check_output(['which', 'tlgu'])
     except Exception as exc:
         logger.info('TLGU not installed: %s', exc)
         logger.info('Installing TLGU.')
         if not subprocess.check_output(['which', 'gcc']):
             logger.error('GCC seems not to be installed.')
         else:
             tlgu_path_rel = '~/cltk_data/greek/software/greek_software_tlgu'
             tlgu_path = os.path.expanduser(tlgu_path_rel)
             try:
                 p_out = subprocess.call('cd {0} && make install'.format(tlgu_path), shell=True)
             except Exception as exc:
                 logger.error('TLGU install failed: %s', exc)
                 return
             if p_out == 0:
                 logger.info('TLGU installed.')
                 return
             logger.error('TLGU install without sudo failed.')
             # Retry with root access, needed on Linux for '/usr/local/bin'.
             p_out = subprocess.call('cd {0} && sudo make install'.format(tlgu_path), shell=True)
             if p_out == 0:
                 logger.info('TLGU installed.')
             else:
                 logger.error('TLGU install with sudo failed.')
Пример #22
0
 def write_contribs(self):
     """Write to file, in current dir, 'contributors.md'."""
     note = '# Contributors\nCLTK Core authors, ordered alphabetically by first name\n\n'
     parts = [note]
     # One '##' section per contributor in self.credits, one bullet per module.
     for contrib in self.credits:
         parts.append('## ' + contrib + '\n')
         for module in self.credits[contrib]:
             parts.append('* ' + module + '\n')
         parts.append('\n')
     file_name = 'contributors.md'
     with open(file_name, 'w') as file_open:
         file_open.write(''.join(parts))
     logger.info('Wrote contribs file at "%s".', file_name)
Пример #23
0
 def test_write_contribs(self):
     """Test file writer for contribs module."""
     contribs_path = 'contributors.md'
     # Remove any stale file from a previous run.
     try:
         os.remove(contribs_path)
     except FileNotFoundError:
         logger.info("No file to remove at '%s'. Continuing.", contribs_path)
     # Build a minimal contributors mapping and write it out.
     def_dict = defaultdict(list)
     def_dict['key'].extend(['val1', 'val2'])
     write_contribs(def_dict)
     self.assertTrue(os.path.isfile(contribs_path))
Пример #24
0
 def test_write_contribs(self):
     """Test file writer for contribs module."""
     out_file = 'contributors.md'
     try:
         # Clear any leftover output before writing anew.
         os.remove(out_file)
     except FileNotFoundError:
         logger.info("No file to remove at '%s'. Continuing.", out_file)
     # Minimal fixture: a single contributor with two credited modules.
     contribs_dict = defaultdict(list)
     for value in ('val1', 'val2'):
         contribs_dict['key'].append(value)
     write_contribs(contribs_dict)
     self.assertTrue(os.path.isfile(out_file))
Пример #25
0
 def _git_user_defined_corpus(self,
                              corpus_name,
                              corpus_type,
                              uri: str,
                              branch='master'):
     """Clone or update a git repo defined by user.

     :param corpus_name: Display name of the corpus.
     :param corpus_type: Corpus category; used as a directory name.
     :param uri: Git remote, e.g. ending in 'latin_corpus_newton_example.git'.
     :param branch: Git branch to clone.
     TODO: This code is very redundant with what's in import_corpus(),
     could be refactored.
     """
     type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
     type_dir = os.path.expanduser(type_dir_rel)
     repo_name = uri.split('/')[-1]  # eg, 'latin_corpus_newton_example.git'
     # Strip a trailing '.git' suffix. `rstrip('.git')` was wrong: rstrip
     # removes any trailing run of the characters '.', 'g', 'i', 't',
     # mangling names such as 'corpus_digit.git' -> 'corpus_di'.
     if repo_name.endswith('.git'):
         repo_name = repo_name[:-len('.git')]
     target_dir = os.path.join(type_dir, repo_name)
     target_file = os.path.join(type_dir, repo_name, 'README.md')
     # check if corpus already present; if not, clone
     if not os.path.isfile(target_file):
         if not os.path.isdir(type_dir):
             os.makedirs(type_dir)
         try:
             msg = "Cloning '{}' from '{}'".format(corpus_name, uri)
             logger.info(msg)
             Repo.clone_from(uri,
                             target_dir,
                             branch=branch,
                             depth=1,
                             progress=ProgressPrinter())
         except CorpusImportError as corpus_imp_err:
             msg = "Git clone of '{}' failed: '{}'".format(
                 uri, corpus_imp_err)
             logger.error(msg)
     # if corpus is present, pull latest
     else:
         try:
             repo = Repo(target_dir)
             assert not repo.bare  # or: assert repo.exists()
             git_origin = repo.remotes.origin
             msg = "Pulling latest '{}' from '{}'.".format(corpus_name, uri)
             logger.info(msg)
             git_origin.pull()
         except CorpusImportError as corpus_imp_err:
             msg = "Git pull of '{}' failed: '{}'".format(
                 uri, corpus_imp_err)
             logger.error(msg)
Пример #26
0
 def _setup_language_variables(self):
     """Check for availability of corpora for a language.
     TODO: Make the selection of available languages dynamic from dirs
     within ``corpora`` which contain a ``corpora.py`` file.
     """
     # Custom (distributed) corpora are consulted in every case.
     user_defined_corpora = self._check_distributed_corpora_file()
     if self.language in AVAILABLE_LANGUAGES or user_defined_corpora:
         return user_defined_corpora
     # Neither official nor user-defined corpora exist for this language.
     msg = 'Corpora not available (either core or user-defined) for the "{}" language.'.format(self.language)
     logger.info(msg)
     raise CorpusImportError(msg)
Пример #27
0
    def _retrieve_morpheus_entry(self, word):
        """Return Morpheus entry for word

        Entry format: [(head word, tag, macronized form)]

        :param word: unmacronized, lowercased word
        :ptype word: string
        :return: Morpheus entry in tuples
        :rtype : list
        """
        entry = self.macron_data.get(word)
        # Missing key and empty entry are both logged; only a missing key
        # yields None (an empty list is passed back to the caller).
        if not entry:
            logger.info('No Morpheus entry found for {}.'.format(word))
            if entry is None:
                return None
        return entry
Пример #28
0
    def _retrieve_morpheus_entry(self, word):
        """Return Morpheus entry for word

        Entry format: [(head word, tag, macronized form)]

        :param word: unmacronized, lowercased word
        :ptype word: string
        :return: Morpheus entry in tuples
        :rtype : list
        """
        entry = self.macron_data.get(word)
        if entry is None:
            # No key at all for this word.
            logger.info('No Morpheus entry found for {}.'.format(word))
            return None
        if len(entry) == 0:
            # Key present but empty; the empty list is still returned.
            logger.info('No Morpheus entry found for {}.'.format(word))
        return entry
Пример #29
0
 def _setup_language_variables(self):
     """Check for availability of corpora for a language.
     TODO: Make the selection of available languages dynamic from dirs
     within ``corpora`` which contain a ``corpora.py`` file.
     """
     user_defined_corpora = self._check_distributed_corpora_file()
     # Fail only when the language has neither official nor custom corpora.
     if self.language not in AVAILABLE_LANGUAGES and not user_defined_corpora:
         msg = 'Corpora not available (either core or user-defined) for the "{}" language.'.format(self.language)
         logger.info(msg)
         raise CorpusImportError(msg)
     return user_defined_corpora
Пример #30
0
 def write_concordance_from_string(self, text, name):
     """A reworking of write_concordance_from_file(). Refactor these.

     Builds a concordance from ``text`` and writes it to
     '~/cltk_data/user_data/concordance_<name>.txt'.
     """
     list_of_lists = self._build_concordance(text)
     user_data_rel = '~/cltk_data/user_data'
     user_data = os.path.expanduser(user_data_rel)
     if not os.path.isdir(user_data):
         os.makedirs(user_data)
     file_path = os.path.join(user_data, 'concordance_' + name + '.txt')
     # Join once instead of quadratic `+=` accumulation.
     concordance_output = ''.join(line + '\n'
                                  for word_list in list_of_lists
                                  for line in word_list)
     try:
         with open(file_path, 'w') as open_file:
             open_file.write(concordance_output)
             # Lazy %-args instead of eager `%` formatting.
             logger.info("Wrote concordance to '%s'.", file_path)
     except IOError as io_error:
         # The caught exception was previously bound but never reported.
         logger.error("Failed to write concordance to '%s'. Error: %s",
                      file_path, io_error)
Пример #31
0
 def write_concordance_from_string(self, text, name):
     """A reworking of write_concordance_from_file(). Refactor these.

     Builds a concordance from ``text`` and writes it to
     '~/cltk_data/user_data/concordance_<name>.txt'.
     """
     list_of_lists = self._build_concordance(text)
     user_data_rel = '~/cltk_data/user_data'
     user_data = os.path.expanduser(user_data_rel)
     if not os.path.isdir(user_data):
         os.makedirs(user_data)
     file_path = os.path.join(user_data, 'concordance_' + name + '.txt')
     # Join once instead of quadratic `+=` accumulation.
     concordance_output = ''.join(line + '\n'
                                  for word_list in list_of_lists
                                  for line in word_list)
     try:
         with open(file_path, 'w') as open_file:
             open_file.write(concordance_output)
             # Lazy %-args instead of eager `%` formatting.
             logger.info("Wrote concordance to '%s'.", file_path)
     except IOError as io_error:
         # The caught exception was previously bound but never reported.
         logger.error("Failed to write concordance to '%s'. Error: %s",
                      file_path, io_error)
Пример #32
0
 def _check_corpus_availability(self, corpus_name):
     """Check whether a corpus is available for import.
     :type corpus_name: str
     :param corpus_name: Name of available corpus.
     :return: The matching corpus dict, or None if unavailable.
     """
     try:
         corpora = LANGUAGE_CORPORA[self.language]
     except KeyError as key_error:
         # A missing language raises KeyError (the old `except NameError`
         # never fired, and `corpora` was then unbound below). Also pass
         # the log args individually — a single tuple arg does not satisfy
         # two '%s' placeholders.
         logger.error('Corpus not available for language %s: %s',
                      self.language, key_error)
         return None
     corpus_properties = None
     for corpus in corpora:
         if corpus['name'] == corpus_name:
             corpus_properties = corpus
     if not corpus_properties:
         logger.info("Corpus '%s' not available for the '%s' language.",
                     corpus_name, self.language)
     return corpus_properties
Пример #33
0
 def _check_corpus_availability(self, corpus_name):
     """Check whether a corpus is available for import.
     :type corpus_name: str
     :param corpus_name: Name of available corpus.
     :return: The matching corpus dict, or None if unavailable.
     """
     try:
         corpora = LANGUAGE_CORPORA[self.language]
     except KeyError as key_error:
         # A missing language raises KeyError (the old `except NameError`
         # never fired, and `corpora` was then unbound below). Also pass
         # the log args individually — a single tuple arg does not satisfy
         # two '%s' placeholders.
         logger.error('Corpus not available for language %s: %s',
                      self.language, key_error)
         return None
     corpus_properties = None
     for corpus in corpora:
         if corpus['name'] == corpus_name:
             corpus_properties = corpus
     if not corpus_properties:
         logger.info("Corpus '%s' not available for the '%s' language.",
                     corpus_name,
                     self.language)
     return corpus_properties
Пример #34
0
def write_concordance_from_string(text: str, name: str) -> None:
    """A reworking of write_concordance_from_file(). Refactor these.

    Build a concordance from ``text`` and write it, one entry per line,
    to '~/cltk_data/user_data/concordance_<name>.txt'.

    :param text: Text from which the concordance is built.
    :param name: Suffix for the output filename.
    """
    list_of_lists = build_concordance(text)  # type: List[List[str]]
    user_data_rel = '~/cltk_data/user_data'  # type: str
    user_data = os.path.expanduser(user_data_rel)  # type: str
    if not os.path.isdir(user_data):
        os.makedirs(user_data)
    file_path = os.path.join(user_data,
                             'concordance_' + name + '.txt')  # type: str
    # str.join is linear; repeated '+=' concatenation can be quadratic.
    concordance_output = ''.join(line + '\n'
                                 for word_list in list_of_lists
                                 for line in word_list)  # type: str
    try:
        with open(file_path, 'w') as open_file:
            open_file.write(concordance_output)
            logger.info("Wrote concordance to '%s'.", file_path)
    except IOError as io_error:
        logger.error("Failed to write concordance to '%s'. Error: %s",
                     file_path, io_error)
Пример #35
0
 def _copy_dir_recursive(src_rel, dst_rel):
     """Copy the contents of one directory to another.

     The ``dst_rel`` directory cannot already exist.
     Source: http://stackoverflow.com/a/1994840
     TODO: Move this to file_operations.py module.

     :type src_rel: str
     :param src_rel: Directory to be copied.
     :type dst_rel: str
     :param dst_rel: Directory to be created with contents of ``src_rel``.
     """
     source_path = os.path.expanduser(src_rel)
     destination_path = os.path.expanduser(dst_rel)
     try:
         shutil.copytree(source_path, destination_path)
     except OSError as os_error:
         # Anything other than "source is not a directory" is unexpected.
         if os_error.errno != errno.ENOTDIR:
             raise
         # Fall back to a flat file copy for a non-directory source.
         shutil.copy(source_path, destination_path)
     logger.info('Files copied from %s to %s', source_path, destination_path)
Пример #36
0
 def _copy_dir_recursive(src_rel, dst_rel):
     """Recursively copy a directory tree to a new location.

     The destination directory must not already exist.
     Source: http://stackoverflow.com/a/1994840
     TODO: Move this to file_operations.py module.

     :type src_rel: str
     :param src_rel: Directory to be copied.
     :type dst_rel: str
     :param dst_rel: Directory to be created with contents of ``src_rel``.
     """
     src, dst = (os.path.expanduser(p) for p in (src_rel, dst_rel))
     try:
         shutil.copytree(src, dst)
         logger.info('Files copied from %s to %s', src, dst)
     except OSError as exc:
         if exc.errno != errno.ENOTDIR:
             raise
         # Source was a single file, not a directory: plain copy instead.
         shutil.copy(src, dst)
         logger.info('Files copied from %s to %s', src, dst)
Пример #37
0
    def _macronize_word(self, word):
        """Return macronized word.

        :param word: (word, tag)
        :ptype word: tuple
        :return: (word, tag, macronized_form)
        :rtype : tuple
        """
        head_word = word[0]
        tag = word[1]
        if tag is None:
            logger.info('Tagger {} could not tag {}.'.format(self.tagger, head_word))
            return head_word, tag, head_word
        elif tag == 'U--------':
            return (head_word, tag.lower(), head_word)
        else:
            entries = self._retrieve_morpheus_entry(head_word)
            if entries is None:
                return head_word, tag.lower(), head_word
            matched_entry = [entry for entry in entries if entry[0] == tag.lower()]
            if len(matched_entry) == 0:
                logger.info('No matching Morpheus entry found for {}.'.format(head_word))
                return head_word, tag.lower(), entries[0][2]
            elif len(matched_entry) == 1:
                return head_word, tag.lower(), matched_entry[0][2].lower()
            else:
                logger.info('Multiple matching entries found for {}.'.format(head_word))
                return head_word, tag.lower(), matched_entry[1][2].lower()
Пример #38
0
    def _macronize_word(self, word):
        """Return macronized word.

        :param word: (word, tag)
        :ptype word: tuple
        :return: (word, tag, macronized_form)
        :rtype : tuple
        """
        head_word = word[0]
        tag = word[1]
        if tag is None:
            logger.info('Tagger {} could not tag {}.'.format(
                self.tagger, head_word))
            return head_word, tag, head_word
        elif tag == 'U--------':
            return (head_word, tag.lower(), head_word)
        else:
            entries = self._retrieve_morpheus_entry(head_word)
            if entries is None:
                return head_word, tag.lower(), head_word
            matched_entry = [
                entry for entry in entries if entry[0] == tag.lower()
            ]
            if len(matched_entry) == 0:
                logger.info('No matching Morpheus entry found for {}.'.format(
                    head_word))
                return head_word, tag.lower(), entries[0][2]
            elif len(matched_entry) == 1:
                return head_word, tag.lower(), matched_entry[0][2].lower()
            else:
                logger.info('Multiple matching entries found for {}.'.format(
                    head_word))
                return head_word, tag.lower(), matched_entry[1][2].lower()
Пример #39
0
 def _check_install(self):
     """Check if tlgu is installed; if not, compile and install it.

     Looks for the ``tlgu`` binary on the PATH.  When absent, builds it
     from the sources under the CLTK data dir with ``make install``,
     retrying with ``sudo`` only when the unprivileged install fails
     (for Linux needing root access to '/usr/local/bin').
     """
     try:
         subprocess.check_output(['which', 'tlgu'])
     except Exception as exc:
         logger.info('TLGU not installed: %s', exc)
         logger.info('Installing TLGU.')
         if not subprocess.check_output(['which', 'gcc']):
             logger.error('GCC seems not to be installed.')
             return
         tlgu_path_rel = get_cltk_data_dir() + '/greek/software/greek_software_tlgu'
         tlgu_path = os.path.expanduser(tlgu_path_rel)
         if not self.testing:
             print('Do you want to install TLGU?')
             print('To continue, press Return. To exit, Control-C.')
             input()
         else:
             print(
                 'Automated or test build, skipping keyboard input confirmation for installation of TLGU.'
             )
         try:
             command = 'cd {0} && make install'.format(tlgu_path)
             print('Going to run command:', command)
             p_out = subprocess.call(command, shell=True)
         except Exception as exc:
             logger.error('TLGU install failed: %s', exc)
             return
         if p_out == 0:
             logger.info('TLGU installed.')
             return
         logger.error('TLGU install without sudo failed.')
         # Fix: the original used ``try/except/else`` here, so the sudo
         # retry below ran whenever the first call raised no exception --
         # including after a *successful* unprivileged install.  It now
         # runs only when the first ``make install`` returned non-zero.
         if not self.testing:
             print(
                 'Could not install without root access. Do you want to install TLGU with sudo?'
             )
             command = 'cd {0} && sudo make install'.format(tlgu_path)
             print('Going to run command:', command)
             print('To continue, press Return. To exit, Control-C.')
             input()
             p_out = subprocess.call(command, shell=True)
         else:
             command = 'cd {0} && sudo make install'.format(tlgu_path)
             p_out = subprocess.call(command, shell=True)
         if p_out == 0:
             logger.info('TLGU installed.')
         else:
             logger.error('TLGU install with sudo failed.')
Пример #40
0
 def _git_user_defined_corpus(self, corpus_name, corpus_type, uri: str, branch='master'):
     """Clone or update a git repo defined by user.
     TODO: This code is very redundant with what's in import_corpus(),
     could be refactored.

     :param corpus_name: Name of the corpus (used in log messages).
     :param corpus_type: Subdirectory under the language dir (e.g. 'text').
     :param uri: Git remote URI; the local repo dir name is derived from it.
     :param branch: Git branch to clone.
     """
     type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
     type_dir = os.path.expanduser(type_dir_rel)
     repo_name = uri.split('/')[-1]  # eg, 'latin_corpus_newton_example.git'
     # Fix: ``rstrip('.git')`` strips any trailing '.', 'g', 'i', 't'
     # characters (e.g. 'corpus_git.git' -> 'corpus_'), not the suffix.
     if repo_name.endswith('.git'):
         repo_name = repo_name[:-len('.git')]
     target_dir = os.path.join(type_dir, repo_name)
     target_file = os.path.join(type_dir, repo_name, 'README.md')
     # check if corpus already present
     # if not, clone
     if not os.path.isfile(target_file):
         if not os.path.isdir(type_dir):
             os.makedirs(type_dir)
         try:
             msg = "Cloning '{}' from '{}'".format(corpus_name, uri)
             logger.info(msg)
             Repo.clone_from(uri, target_dir, branch=branch, depth=1,
                             progress=ProgressPrinter())
         except CorpusImportError as corpus_imp_err:
             msg = "Git clone of '{}' failed: '{}'".format(uri, corpus_imp_err)
             logger.error(msg)
     # if corpus is present, pull latest
     else:
         try:
             repo = Repo(target_dir)
             assert not repo.bare  # or: assert repo.exists()
             git_origin = repo.remotes.origin
             msg = "Pulling latest '{}' from '{}'.".format(corpus_name, uri)
             logger.info(msg)
             git_origin.pull()
         except CorpusImportError as corpus_imp_err:
             msg = "Git pull of '{}' failed: '{}'".format(uri, corpus_imp_err)
             logger.error(msg)
Пример #41
0
def write_concordance_from_file(filepaths: Union[str, List[str]],
                                name: str) -> None:
    """This calls the modified ConcordanceIndex, taken and modified from
    the NLTK, and writes to disk a file named 'concordance_' + name at
    '~/cltk_data/user_data/'.

    TODO: Add language (here or in class), lowercase option, stemming/
    lemmatization, else?

    :type filepaths: str or list
    :param filepaths: Filepath of text(s) to be used in concordance.
    :rtype : str
    """
    assert isinstance(filepaths, (str, list))
    # Normalize to a list so single- and multi-file input share one path.
    if isinstance(filepaths, str):
        filepaths = [filepaths]
    text = ''.join(read_file(path) for path in filepaths)  # type: str
    list_of_lists = build_concordance(text)  # type: List[List[str]]
    user_data_rel = '~/cltk_data/user_data'  # type: str
    user_data = os.path.expanduser(user_data_rel)  # type: str
    if not os.path.isdir(user_data):
        os.makedirs(user_data)
    file_path = os.path.join(user_data, 'concordance_' + name + '.txt')
    concordance_output = ''.join(
        line + '\n'
        for word_list in list_of_lists
        for line in word_list)  # type: str
    try:
        with open(file_path, 'w') as open_file:  # type: IO
            open_file.write(concordance_output)
            logger.info("Wrote concordance to '%s'.", file_path)
    except IOError as io_error:
        logger.error("Failed to write concordance to '%s'. Error: %s",
                     file_path, io_error)
Пример #42
0
    def write_concordance_from_file(self, filepaths, name):
        """This calls my modified ConcordanceIndex, taken and modified from
        the NLTK, and writes to disk a file named 'concordance_' + name at
        '~/cltk_data/user_data/'.

        TODO: Add language (here or in class), lowercase option, stemming/
        lemmatization, else?

        :type filepaths: str or list
        :param filepaths: Filepath of text(s) to be used in concordance.
        :rtype : str
        """
        assert isinstance(filepaths, (str, list))
        if isinstance(filepaths, str):
            filepath = filepaths
            text = self._read_file(filepath)
        elif isinstance(filepaths, list):
            text = ''
            for filepath in filepaths:
                text += self._read_file(filepath)
        list_of_lists = self._build_concordance(text)
        user_data_rel = '~/cltk_data/user_data'
        user_data = os.path.expanduser(user_data_rel)
        if not os.path.isdir(user_data):
            os.makedirs(user_data)
        file_path = os.path.join(user_data, 'concordance_' + name + '.txt')
        concordance_output = ''
        for word_list in list_of_lists:
            for line in word_list:
                concordance_output += line + '\n'
        try:
            with open(file_path, 'w') as open_file:
                open_file.write(concordance_output)
                logger.info("Wrote concordance to '%s'.", file_path)
        except IOError as io_error:
            # Fix: the caught exception was previously bound but never
            # logged, leaving failures undiagnosable; include it here.
            logger.error("Failed to write concordance to '%s'. Error: %s",
                         file_path, io_error)
Пример #43
0
    def index_corpus(self):
        """Make a Whoosh index out of a pre-processed corpus, ie TLG, PHI5,
        or PHI7.

        TLG takes almost 13 min; PHI5 1.5 min.
        To setup index parameters
        >>> cltk_index = CLTKIndex('latin', 'phi5')  # 1.5 min, 363 docs
        >>> cltk_index = CLTKIndex('latin', 'phi5', chunk='work')  # 2 min, 837 docs
        >>> cltk_index = CLTKIndex('greek', 'tlg')  # 13 min, 1823 docs
        >>> cltk_index = CLTKIndex('greek', 'tlg', chunk='work')  #15.5 min, 6625 docs

        # And to start indexing:
        >>> cltk_index.index_corpus()

        TODO: Prevent overwriting. Ask user to rm old dir before re-indexing.
        TODO: Add option for lemmatizing.
        TODO: Add for figure out lower() options.
        TODO: Process TLG through forthcoming normalize().
        TODO: Add name to each index.
        TODO: Turn off any language-specific mods (eg, stemming, case) that
        Whoosh might be doing by default.
        """

        # Setup index dir
        schema = Schema(path=ID(stored=True),
                        author=TEXT(stored=True),
                        content=TEXT)
        try:
            _index = create_in(self.index_path, schema)
        except FileNotFoundError:
            # Index directory does not exist yet: create it, then retry once.
            os.makedirs(self.index_path)
            _index = create_in(self.index_path, schema)
        writer = _index.writer()

        # Setup corpus to be indexed
        if self.lang == 'greek' and self.corpus == 'tlg':
            corpus_path = os.path.expanduser(
                '~/cltk_data/greek/text/tlg/plaintext/')
            if self.chunk == 'work':
                corpus_path = os.path.expanduser(
                    '~/cltk_data/greek/text/tlg/individual_works/')
        elif self.lang == 'latin' and self.corpus == 'phi5':
            corpus_path = os.path.expanduser(
                '~/cltk_data/latin/text/phi5/plaintext/')
            if self.chunk == 'work':
                corpus_path = os.path.expanduser(
                    '~/cltk_data/latin/text/phi5/individual_works/')
        assert os.path.isdir(corpus_path), 'Corpus does not exist in the following location: "%s". Use CLTK Corpus Importer and TLGU to create transformed corpus.' % corpus_path  # pylint: disable=line-too-long

        files = os.listdir(corpus_path)
        # Keep corpus files only and strip the '.TXT' extension, leaving
        # ids like 'TLG0007' / 'LAT0474' for the author-map lookups below.
        if self.lang == 'greek' and self.corpus == 'tlg':
            files = [f[:-4] for f in files if f.startswith('TLG')]
            corpus_index = TLG_AUTHOR_MAP
        elif self.lang == 'latin' and self.corpus == 'phi5':
            files = [f[:-4] for f in files if f.startswith('LAT')]
            corpus_index = PHI5_AUTHOR_MAP

        time_0 = time.time()
        logger.info("Commencing indexing of %s documents of '%s' corpus." % (len(files), self.corpus))  # pylint: disable=line-too-long
        logger.info('Index will be written to: "%s".' % self.index_path)
        if self.chunk == 'author':
            # One Whoosh document per author file.
            for count, file in enumerate(files, 1):

                try:
                    if self.lang == 'greek' and self.corpus == 'tlg':
                        # Drop the 'TLG' prefix for the author-map lookup.
                        file = file[3:]
                        author = corpus_index[file]
                        path = os.path.join(corpus_path, 'TLG' + file + '.TXT')
                    if self.lang == 'latin' and self.corpus == 'phi5':
                        author = corpus_index[file]
                        path = os.path.join(corpus_path, file + '.TXT')
                except KeyError as key_error:
                    # Files starting with 'LAT9999' have no author-map entry
                    # and are deliberately skipped; any other unknown id is
                    # a real error and is re-raised.
                    if file.startswith('LAT9999'):
                        continue
                    logger.error(key_error)
                    raise

                with open(path) as file_open:
                    content = file_open.read()
                writer.add_document(path=path, author=author, content=content)

                if count % 100 == 0:
                    logger.info('Indexed doc %s.' % count)

        if self.chunk == 'work':
            # One Whoosh document per individual-work file.
            for count, file in enumerate(files, 1):
                try:
                    if self.lang == 'greek' and self.corpus == 'tlg':
                        path = os.path.join(corpus_path, file + '.TXT')
                        # Strips the 'TLG' prefix and the last 8 chars of
                        # the filename to get the author id -- presumably a
                        # work suffix like '-001.txt'; TODO confirm shape.
                        author = corpus_index[file[3:-8]]
                    if self.lang == 'latin' and self.corpus == 'phi5':
                        path = os.path.join(corpus_path, file + '.TXT')
                        author = corpus_index[file[:-8]]
                except KeyError as key_error:
                    if file.startswith('LAT9999'):
                        continue
                    logger.error(key_error)
                    raise

                with open(path) as file_open:
                    content = file_open.read()

                writer.add_document(path=path, author=author, content=content)
                if count % 100 == 0:
                    logger.info('Indexed doc %s.' % count)
        logger.info('Commencing to commit changes.')
        writer.commit()

        time_1 = time.time()
        elapsed = time_1 - time_0
        logger.info('Finished indexing all documents in %s seconds (averaging %s docs per sec.)' % (elapsed, (len(files) / elapsed)))  # pylint: disable=line-too-long
Пример #44
0
 def import_corpus(self, corpus_name, local_path=None, branch='master'):  # pylint: disable=R0912
     """Download a remote or load local corpus into dir ``~/cltk_data``.
     TODO: maybe add ``from git import RemoteProgress``
     TODO: refactor this, it's getting kinda long
     :type corpus_name: str
     :param corpus_name: The name of an available corpus.
     :param local_path: str
     :param local_path: A filepath, required when importing local corpora.
     :param branch: What Git branch to clone.
     """
     corpus_properties = self._get_corpus_properties(corpus_name)
     try:
         location = corpus_properties['location']
     except KeyError:
         # No 'location' key: a user-defined corpus with its own git
         # remote; hand off to the dedicated downloader.
         # git_uri = corpus_properties['git_remote']
         # Fix: was ``corpus_properties['']`` (an empty-string key, which
         # always raises KeyError); corpora store their name under 'name'.
         git_name = corpus_properties['name']
         git_uri = corpus_properties['origin']
         git_type = corpus_properties['type']
         # pass this off to a special downloader just for custom urls
         self._git_user_defined_corpus(git_name, git_type, git_uri)
         return
     corpus_type = corpus_properties['type']
     if location == 'remote':
         # git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
         git_uri = corpus_properties['origin']
         type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
         type_dir = os.path.expanduser(type_dir_rel)
         target_dir = os.path.join(type_dir, corpus_name)
         target_file = os.path.join(type_dir, corpus_name, 'README.md')
         # check if corpus already present
         # if not, clone
         if not os.path.isfile(target_file):
             if not os.path.isdir(type_dir):
                 os.makedirs(type_dir)
             try:
                 msg = "Cloning '{}' from '{}'".format(corpus_name, git_uri)
                 logger.info(msg)
                 Repo.clone_from(git_uri, target_dir, branch=branch, depth=1,
                                 progress=ProgressPrinter())
             except CorpusImportError as corpus_imp_err:
                 msg = "Git clone of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
                 logger.error(msg)
         # if corpus is present, pull latest
         else:
             try:
                 repo = Repo(target_dir)
                 assert not repo.bare  # or: assert repo.exists()
                 git_origin = repo.remotes.origin
                 msg = "Pulling latest '{}' from '{}'.".format(corpus_name, git_uri)
                 logger.info(msg)
                 git_origin.pull()
             except CorpusImportError as corpus_imp_err:
                 msg = "Git pull of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
                 logger.error(msg)
     elif location == 'local':
         msg = "Importing from local path: '{}'".format(local_path)
         logger.info(msg)
         if corpus_name in ('phi5', 'phi7', 'tlg'):
             if corpus_name == 'phi5':
                 # normalize path for checking dir
                 if local_path.endswith('/'):
                     local_path = local_path[:-1]
                 # check for right corpus dir
                 if os.path.split(local_path)[1] != 'PHI5':
                     logger.info("Directory must be named 'PHI5'.")
             if corpus_name == 'phi7':
                 # normalize local_path for checking dir
                 if local_path.endswith('/'):
                     local_path = local_path[:-1]
                 # check for right corpus dir
                 if os.path.split(local_path)[1] != 'PHI7':
                     logger.info("Directory must be named 'PHI7'.")
             if corpus_name == 'tlg':
                 # normalize path for checking dir
                 if local_path.endswith('/'):
                     local_path = local_path[:-1]
                 # check for right corpus dir
                 if os.path.split(local_path)[1] != 'TLG_E':
                     logger.info("Directory must be named 'TLG_E'.")
             # move the dir-checking commands into a function
             data_dir = os.path.expanduser(CLTK_DATA_DIR)
             originals_dir = os.path.join(data_dir, 'originals')
             # check for `originals` dir; if not present mkdir
             if not os.path.isdir(originals_dir):
                 os.makedirs(originals_dir)
                 msg = "Wrote directory at '{}'.".format(originals_dir)
                 logger.info(msg)
             tlg_originals_dir = os.path.join(data_dir,
                                              'originals',
                                              corpus_name)
             # check for `originals/<corpus_name>`; if pres, delete
             if os.path.isdir(tlg_originals_dir):
                 shutil.rmtree(tlg_originals_dir)
                 msg = "Removed directory at '{}'.".format(tlg_originals_dir)
                 logger.info(msg)
             # copy_dir requires that target
             if not os.path.isdir(tlg_originals_dir):
                 self._copy_dir_recursive(local_path, tlg_originals_dir)
Пример #45
0
Файл: word.py Проект: reuf/cltk
                                  line_tokenize)
from nltk.tokenize.stanford import StanfordTokenizer
from nltk.tokenize.texttiling import TextTilingTokenizer
#from nltk.tokenize.toktok   import ToktokTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

from cltk.utils.cltk_logger import logger

# Optional dependency: Arabic tokenization requires the `pyarabic` package.
do_arabic = False
try:
    import pyarabic.araby as araby
    do_arabic = True
except ImportError:
    # Fix: dropped the redundant `pass` that followed the logging call.
    logger.info(
        'Arabic not supported. Install `pyarabic` library to tokenize Arabic.')

__author__ = [
    'Patrick J. Burns <*****@*****.**>',
    'Kyle P. Johnson <*****@*****.**>',
    'Natasha Voake <*****@*****.**>'
]
__license__ = 'MIT License. See LICENSE.'


class WordTokenizer:  # pylint: disable=too-few-public-methods
    """Tokenize according to rules specific to a given language."""
    def __init__(self, language):
        """Take language as argument to the class. Check availability and
        setup class variables."""
Пример #46
0
    def convert(self, input_path=None, output_path=None, markup=None,
                break_lines=False, divide_works=False, latin=False,
                extra_args=None):
        """Run the ``tlgu`` command-line tool to convert one corpus file.

        :param input_path: TLG filepath to convert.
        :param output_path: filepath of new converted text.
        :param markup: Specificity of inline markup. Default None removes all
        numerical markup; 'full' gives most detailed, with reference numbers
        included before each text line.
        :param break_lines: No spaces; removes line ends and hyphens before an
         ID code; hyphens and spaces before page and column ends are retained.
        :param divide_works: Each work (book) is output as a separate file in
        the form output_file-xxx.txt; if an output file is not specified, this
         option has no effect.
        :param latin: Primarily Latin text (PHI). Some TLG texts, notably
        doccan1.txt and doccan2.txt are mostly roman texts lacking explicit
        language change codes. Setting this option will force a change to
        Latin text after each citation block is encountered.
        :param extra_args: Any other tlgu args to be passed, in list form and
        without dashes, e.g.: ['p', 'b', 'B'].
        """
        # setup file paths
        input_path = os.path.expanduser(input_path)
        output_path = os.path.expanduser(output_path)

        # check input path exists
        assert os.path.isfile(input_path), 'File {0} does not exist.'.format(input_path)

        # setup tlgu flags
        tlgu_options = []
        if markup == 'full':
            # Fix: was a list comprehension used only for its append
            # side effects; extend() says what is meant.
            tlgu_options.extend(['v', 'w', 'x', 'y', 'z'])
        if break_lines:
            tlgu_options.append('N')
        if divide_works:
            tlgu_options.append('W')
        if latin:
            tlgu_options.append('r')
        # setup extra args
        if extra_args is None:
            extra_args = []
        else:
            try:
                extra_args = list(extra_args)
            except Exception as exc:
                logger.error("Argument 'extra_args' must be a list: %s.", exc)
                raise
        # assemble and de-duplicate all tlgu flags
        tlgu_options = list(set(tlgu_options + extra_args))
        if tlgu_options:
            tlgu_flags = '-' + ' -'.join(tlgu_options)
        else:
            tlgu_flags = ''
        # make tlgu call
        tlgu_call = 'tlgu {0} {1} {2}'.format(tlgu_flags,
                                              input_path,
                                              output_path)
        logger.info(tlgu_call)
        try:
            p_out = subprocess.call(tlgu_call, shell=True)
            if p_out == 1:
                logger.error('Failed to convert %s to %s.',
                             input_path,
                             output_path)
        except Exception as exc:
            logger.error('Failed to convert %s to %s: %s',
                         input_path,
                         output_path,
                         exc)
            raise
Пример #47
0
 def import_corpus(self, corpus_name, local_path=None):  # pylint: disable=R0912
     """Download a remote or load local corpus into dir ``~/cltk_data``.
     TODO: maybe add ``from git import RemoteProgress``
     TODO: refactor this, it's getting kinda long
     :type corpus_name: str
     :param corpus_name: The name of an available corpus.
     :param local_path: str
     :param local_path: A filepath, required when importing local corpora.
     """
     # NOTE(review): _check_corpus_availability may return None for an
     # unknown corpus; the subscription below would then raise TypeError.
     corpus_properties = self._check_corpus_availability(corpus_name)
     location = corpus_properties['location']
     corpus_type = corpus_properties['type']
     if location == 'remote':
         git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
         #self._download_corpus(corpus_type, corpus_name, path)
         type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language,
                                     corpus_type)
         type_dir = os.path.expanduser(type_dir_rel)
         target_dir = os.path.join(type_dir, corpus_name)
         target_file = os.path.join(type_dir, corpus_name, 'README.md')
         # check if corpus already present
         # if not, clone
         if not os.path.isfile(target_file):
             if not os.path.isdir(type_dir):
                 os.makedirs(type_dir)
             try:
                 logger.info("Cloning '%s' from '%s'" %
                             (corpus_name, git_uri))
                 Repo.clone_from(git_uri, target_dir, depth=1)
             except Exception as e:
                 # Fix: args were passed as a single tuple for two %s
                 # placeholders, which breaks the log-message formatting;
                 # pass them separately.
                 logger.error("Git clone of '%s' failed: '%s'",
                              git_uri, e)
         # if corpus is present, pull latest
         else:
             try:
                 repo = Repo(target_dir)
                 assert not repo.bare  # or: assert repo.exists()
                 o = repo.remotes.origin
                 logger.info("Pulling latest '%s' from '%s'." %
                             (corpus_name, git_uri))
                 o.pull()
             except Exception as e:
                 logger.error("Git pull of '%s' failed: '%s'" %
                              (git_uri, e))
     elif location == 'local':
         logger.info("Importing from local path: '%s'", local_path)
         if corpus_name in ('phi5', 'phi7', 'tlg'):
             if corpus_name == 'phi5':
                 # normalize path for checking dir
                 if local_path.endswith('/'):
                     local_path = local_path[:-1]
                 # check for right corpus dir
                 if os.path.split(local_path)[1] != 'PHI5':
                     logger.info("Directory must be named 'PHI5'.")
             if corpus_name == 'phi7':
                 # normalize local_path for checking dir
                 if local_path.endswith('/'):
                     local_path = local_path[:-1]
                 # check for right corpus dir
                 if os.path.split(local_path)[1] != 'PHI7':
                     logger.info("Directory must be named 'PHI7'.")
             if corpus_name == 'tlg':
                 # normalize path for checking dir
                 if local_path.endswith('/'):
                     local_path = local_path[:-1]
                 # check for right corpus dir
                 if os.path.split(local_path)[1] != 'TLG_E':
                     logger.info("Directory must be named 'TLG_E'.")
             # move the dir-checking commands into a function
             data_dir = os.path.expanduser(CLTK_DATA_DIR)
             originals_dir = os.path.join(data_dir, 'originals')
             # check for `originals` dir; if not present mkdir
             if not os.path.isdir(originals_dir):
                 os.makedirs(originals_dir)
                 logger.info("Wrote directory at '%s'.", originals_dir)
             tlg_originals_dir = os.path.join(data_dir, 'originals',
                                              corpus_name)
             # check for `originals/<corpus_name>`; if pres, delete
             if os.path.isdir(tlg_originals_dir):
                 shutil.rmtree(tlg_originals_dir)
                 logger.info("Removed directory at '%s'.",
                             tlg_originals_dir)
             # copy_dir requires that target
             if not os.path.isdir(tlg_originals_dir):
                 self._copy_dir_recursive(local_path, tlg_originals_dir)
Пример #48
0
 def import_corpus(self, corpus_name, local_path=None, branch='master'):  # pylint: disable=R0912
     """Download a remote or load local corpus into dir ``~/cltk_data``.

     TODO: maybe add ``from git import RemoteProgress``
     TODO: refactor this, it's getting kinda long

     :type corpus_name: str
     :param corpus_name: The name of an available corpus.
     :param local_path: str
     :param local_path: A filepath, required when importing local corpora.
     :param branch: What Git branch to clone.
     """
     corpus_properties = self._get_corpus_properties(corpus_name)
     location = corpus_properties['location']
     corpus_type = corpus_properties['type']
     if location == 'remote':
         # All remote corpora are cloned from the cltk GitHub organization.
         git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
         type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
         type_dir = os.path.expanduser(type_dir_rel)
         target_dir = os.path.join(type_dir, corpus_name)
         target_file = os.path.join(type_dir, corpus_name, 'README.md')
         # The README acts as a marker that the corpus was already cloned:
         # if absent, clone fresh; otherwise pull the latest changes.
         if not os.path.isfile(target_file):
             if not os.path.isdir(type_dir):
                 os.makedirs(type_dir)
             try:
                 msg = "Cloning '{}' from '{}'".format(corpus_name, git_uri)
                 logger.info(msg)
                 # Shallow clone (depth=1): full git history is not needed.
                 Repo.clone_from(git_uri, target_dir, branch=branch, depth=1,
                                 progress=ProgressPrinter())
             except CorpusImportError as corpus_imp_err:
                 msg = "Git clone of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
                 logger.error(msg)
         # if corpus is present, pull latest
         else:
             try:
                 repo = Repo(target_dir)
                 assert not repo.bare  # or: assert repo.exists()
                 git_origin = repo.remotes.origin
                 msg = "Pulling latest '{}' from '{}'.".format(corpus_name, git_uri)
                 logger.info(msg)
                 git_origin.pull()
             except CorpusImportError as corpus_imp_err:
                 msg = "Git pull of '{}' failed: '{}'".format(git_uri, corpus_imp_err)
                 logger.error(msg)
     elif location == 'local':
         msg = "Importing from local path: '{}'".format(local_path)
         logger.info(msg)
         # TLG/PHI corpora are proprietary, so they must be supplied locally.
         if corpus_name in ('phi5', 'phi7', 'tlg'):
             # Normalize the path before inspecting its final component.
             if local_path.endswith('/'):
                 local_path = local_path[:-1]
             # Each corpus is expected in a conventionally named directory;
             # a mismatch is only logged, not fatal.
             expected_dir_names = {'phi5': 'PHI5',
                                   'phi7': 'PHI7',
                                   'tlg': 'TLG_E'}
             expected_dir = expected_dir_names[corpus_name]
             if os.path.split(local_path)[1] != expected_dir:
                 logger.info("Directory must be named '%s'.", expected_dir)
             data_dir = os.path.expanduser(CLTK_DATA_DIR)
             originals_dir = os.path.join(data_dir, 'originals')
             # check for `originals` dir; if not present mkdir
             if not os.path.isdir(originals_dir):
                 os.makedirs(originals_dir)
                 msg = "Wrote directory at '{}'.".format(originals_dir)
                 logger.info(msg)
             corpus_originals_dir = os.path.join(data_dir,
                                                 'originals',
                                                 corpus_name)
             # Delete any stale `originals/<corpus_name>` so that the new
             # import fully replaces the old copy.
             if os.path.isdir(corpus_originals_dir):
                 shutil.rmtree(corpus_originals_dir)
                 msg = "Removed directory at '{}'.".format(corpus_originals_dir)
                 logger.info(msg)
             # _copy_dir_recursive requires that the target not yet exist.
             if not os.path.isdir(corpus_originals_dir):
                 self._copy_dir_recursive(local_path, corpus_originals_dir)
Пример #49
0
from cltk.tokenize.word import WordTokenizer
from cltk.stop.arabic.stops import STOPS_LIST as ARABIC_STOPS
from cltk.utils.cltk_logger import logger

try:
    import pyarabic.araby as araby
except ImportError:
    logger.info(
        'Arabic not supported. Install `pyarabic` library to strip diacritics.'
    )
    pass


def stopwords_filter(string):
    """Remove Arabic stop words from ``string``.

    :param string: Raw Arabic text.
    :return: The word tokens of ``string`` that are not stop words.
    :rtype: list
    """
    # Strip tashkeel (diacritics) because the stop words list contains
    # voweled words.
    text = araby.strip_tashkeel(string)
    word_tokenizer = WordTokenizer("arabic")
    tokens = word_tokenizer.tokenize(text)

    # Build the set once so each membership test is O(1) instead of
    # scanning the whole stop list per token.
    stops = set(ARABIC_STOPS)
    return [token for token in tokens if token not in stops]
Пример #50
0
    def convert(self,
                input_path=None,
                output_path=None,
                markup=None,
                rm_newlines=False,
                divide_works=False,
                latin=False,
                extra_args=None):
        """Run a ``tlgu`` conversion of one file.

        :param input_path: TLG filepath to convert.
        :param output_path: filepath of new converted text.
        :param markup: Specificity of inline markup. Default None removes all
        numerical markup; 'full' gives most detailed, with reference numbers
        included before each text line.
        :param rm_newlines: No spaces; removes line ends and hyphens before an
         ID code; hyphens and spaces before page and column ends are retained.
        :param divide_works: Each work (book) is output as a separate file in
        the form output_file-xxx.txt; if an output file is not specified, this
         option has no effect.
        :param latin: Primarily Latin text (PHI). Some TLG texts, notably
        doccan1.txt and doccan2.txt are mostly roman texts lacking explicit
        language change codes. Setting this option will force a change to
        Latin text after each citation block is encountered.
        :param extra_args: Any other tlgu args to be passed, in list form and
        without dashes, e.g.: ['p', 'b', 'B'].
        :raises FileNotFoundError: If ``input_path`` does not exist.
        """
        # setup file paths
        input_path = os.path.expanduser(input_path)
        output_path = os.path.expanduser(output_path)

        # Raise (not assert) so the check also runs under `python -O`,
        # which strips assert statements.
        if not os.path.isfile(input_path):
            raise FileNotFoundError(
                'File {0} does not exist.'.format(input_path))

        # Map the keyword options onto tlgu's single-letter flags.
        tlgu_options = []
        if markup == 'full':
            tlgu_options.extend(['v', 'w', 'x', 'y', 'z'])
        if rm_newlines:
            tlgu_options.append('N')
        if divide_works:
            tlgu_options.append('W')
        if latin:
            tlgu_options.append('r')
        # setup extra args
        if extra_args is None:
            extra_args = []
        else:
            try:
                extra_args = list(extra_args)
            except Exception as exc:
                logger.error("Argument 'extra_args' must be a list: %s.", exc)
                raise
        # Assemble and deduplicate all tlgu flags.
        tlgu_options = list(set(tlgu_options + extra_args))
        if tlgu_options:
            tlgu_flags = '-' + ' -'.join(tlgu_options)
        else:
            tlgu_flags = ''
        # make tlgu call
        # NOTE(review): shell=True with unquoted, interpolated paths breaks
        # on paths containing spaces and is shell-injection-prone; inputs
        # here come from the local user only. Consider an argument list
        # with shell=False.
        tlgu_call = 'tlgu {0} {1} {2}'.format(tlgu_flags, input_path,
                                              output_path)
        logger.info(tlgu_call)
        try:
            p_out = subprocess.call(tlgu_call, shell=True)
            # Any nonzero exit status is a failure, not just 1.
            if p_out != 0:
                logger.error('Failed to convert %s to %s.', input_path,
                             output_path)
        except Exception as exc:
            logger.error('Failed to convert %s to %s: %s', input_path,
                         output_path, exc)
            raise
Пример #51
0
    def index_corpus(self):
        """Make a Whoosh index out of a pre-processed corpus, ie TLG, PHI5,
        or PHI7.

        TLG takes almost 13 min; PHI5 1.5 min.
        To setup index parameters
        >>> # cltk_index = CLTKIndex('latin', 'phi5')  # 1.5 min, 363 docs
        >>> # cltk_index = CLTKIndex('latin', 'phi5', chunk='work')  # 2 min, 837 docs
        >>> # cltk_index = CLTKIndex('greek', 'tlg')  # 13 min, 1823 docs
        >>> # cltk_index = CLTKIndex('greek', 'tlg', chunk='work')  #15.5 min, 6625 docs

        # And to start indexing:
        >>> # cltk_index.index_corpus()

        TODO: Prevent overwriting. Ask user to rm old dir before re-indexing.
        TODO: Add option for lemmatizing.
        TODO: Add for figure out lower() options.
        TODO: Process TLG through forthcoming normalize().
        TODO: Add name to each index.
        TODO: Turn off any language-specific mods (eg, stemming, case) that
        Whoosh might be doing by default.
        """

        # Setup index dir
        # Schema: store path and author so they can be returned with hits;
        # the document body is indexed but not stored.
        schema = Schema(path=ID(stored=True),
                        author=TEXT(stored=True),
                        content=TEXT)
        # EAFP: if the index dir is missing, create it and retry once.
        try:
            _index = create_in(self.index_path, schema)
        except FileNotFoundError:
            os.makedirs(self.index_path)
            _index = create_in(self.index_path, schema)
        writer = _index.writer()

        # Setup corpus to be indexed
        # Author-level chunks read from 'plaintext/'; work-level chunks
        # read from 'individual_works/'.
        if self.lang == 'greek' and self.corpus == 'tlg':
            corpus_path = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext/')
            if self.chunk == 'work':
                corpus_path = os.path.expanduser('~/cltk_data/greek/text/tlg/individual_works/')
        elif self.lang == 'latin' and self.corpus == 'phi5':
            corpus_path = os.path.expanduser('~/cltk_data/latin/text/phi5/plaintext/')
            if self.chunk == 'work':
                corpus_path = os.path.expanduser('~/cltk_data/latin/text/phi5/individual_works/')
        # NOTE(review): corpus_path (and files/corpus_index below) is only
        # bound for the two lang/corpus combinations above; any other combo
        # would raise NameError here rather than a clear message.
        assert os.path.isdir(corpus_path), 'Corpus does not exist in the following location: "%s". Use CLTK Corpus Importer and TLGU to create transformed corpus.' % corpus_path  # pylint: disable=line-too-long

        # Keep only corpus text files, with the '.TXT' extension stripped.
        files = os.listdir(corpus_path)
        if self.lang == 'greek' and self.corpus == 'tlg':
            files = [f[:-4] for f in files if f.startswith('TLG')]
            corpus_index = TLG_AUTHOR_MAP
        elif self.lang == 'latin' and self.corpus == 'phi5':
            files = [f[:-4] for f in files if f.startswith('LAT')]
            corpus_index = PHI5_AUTHOR_MAP

        time_0 = time.time()
        logger.info("Commencing indexing of %s documents of '%s' corpus." % (len(files), self.corpus))  # pylint: disable=line-too-long
        logger.info('Index will be written to: "%s".' % self.index_path)
        # One document per author file.
        if self.chunk == 'author':
            for count, file in enumerate(files, 1):

                try:
                    if self.lang == 'greek' and self.corpus == 'tlg':
                        # Drop the 'TLG' prefix; the author map is keyed by
                        # the bare numeric ID.
                        file = file[3:]
                        author = corpus_index[file]
                        path = os.path.join(corpus_path, 'TLG' + file + '.TXT')
                    if self.lang == 'latin' and self.corpus == 'phi5':
                        author = corpus_index[file]
                        path = os.path.join(corpus_path, file + '.TXT')
                except KeyError as key_error:
                    # 'LAT9999' files have no author-map entry — skipped
                    # (presumably non-author corpus files; TODO confirm).
                    if file.startswith('LAT9999'):
                        continue
                    logger.error(key_error)
                    raise

                with open(path) as file_open:
                    content = file_open.read()
                writer.add_document(path=path,
                                    author=author,
                                    content=content)

                if count % 100 == 0:
                    logger.info('Indexed doc %s.' % count)

        # One document per individual work; the author-map key is recovered
        # by stripping the prefix and the trailing per-work suffix.
        if self.chunk == 'work':
            for count, file in enumerate(files, 1):
                try:
                    if self.lang == 'greek' and self.corpus == 'tlg':
                        path = os.path.join(corpus_path, file + '.TXT')
                        author = corpus_index[file[3:-8]]
                    if self.lang == 'latin' and self.corpus == 'phi5':
                        path = os.path.join(corpus_path, file + '.TXT')
                        author = corpus_index[file[:-8]]
                except KeyError as key_error:
                    if file.startswith('LAT9999'):
                        continue
                    logger.error(key_error)
                    raise

                with open(path) as file_open:
                    content = file_open.read()

                writer.add_document(path=path,
                                    author=author,
                                    content=content)
                if count % 100 == 0:
                    logger.info('Indexed doc %s.' % count)
        logger.info('Commencing to commit changes.')
        # Commit flushes all buffered documents to the on-disk index.
        writer.commit()

        time_1 = time.time()
        elapsed = time_1 - time_0
        logger.info('Finished indexing all documents in %s seconds (averaging %s docs per sec.)' % (elapsed, (len(files) / elapsed)))  # pylint: disable=line-too-long
Пример #52
0
    def make_syllables(self, sentences_words):
        """Divide the word tokens into a list of syllables. Note that a syllable
        in this instance is defined as a vocalic group (i.e., a vowel or a
        diphthong). This means that all syllables which are not the last
        syllable in the word will end with a vowel or diphthong.

        TODO: Determine whether Luke Hollis's module at
        `cltk.stem.latin.syllabifier could replace this method.`

        :param sentences_words: A list of sentences with tokenized words.
        :return: Syllabified words
        :rtype : list
        """
        all_syllables = []
        for sentence in sentences_words:
            syll_per_sent = []
            for word in sentence:
                syll_start = 0  # Begins syllable iterator
                syll_per_word = []
                cur_letter_in = 0  # Begins general iterator
                # First pass: cut the word after every vowel or diphthong,
                # so each collected syllable ends in a vocalic group.
                while cur_letter_in < len(word):
                    letter = word[cur_letter_in]
                    if not cur_letter_in == len(word) - 1:
                        # Check the two-letter window for a diphthong before
                        # treating the current letter as a lone vowel.
                        if word[cur_letter_in] + word[cur_letter_in + 1] in self.diphthongs:
                            cur_letter_in += 1
                            # Syllable ends with a diphthong
                            syll_per_word.append(
                                word[syll_start:cur_letter_in + 1])
                            syll_start = cur_letter_in + 1
                        elif (letter in self.vowels) or \
                             (letter in self.long_vowels):
                            # Syllable ends with a vowel
                            syll_per_word.append(
                                word[syll_start:cur_letter_in + 1])
                            syll_start = cur_letter_in + 1
                    elif (letter in self.vowels) or \
                         (letter in self.long_vowels):
                        # Syllable ends with a vowel
                        syll_per_word.append(
                            word[syll_start:cur_letter_in + 1])
                        syll_start = cur_letter_in + 1
                    cur_letter_in += 1
                # Second pass: append any consonants trailing after the last
                # vowel onto the word's final syllable.
                try:
                    last_vowel = syll_per_word[-1][-1]  # Last vowel of a word
                    # Modifies general iterator for consonants after the last
                    # syllable in a word.
                    cur_letter_in = len(
                        word) - 1
                    # Contains all of the consonants after the last vowel in a
                    # word
                    leftovers = u''
                    # Walk backwards from the end of the word until the last
                    # vowel is reached, collecting the letters passed over.
                    while word[cur_letter_in] != last_vowel:
                        # '.' characters are skipped, not copied into the
                        # syllable (presumably punctuation left by
                        # tokenization — TODO confirm).
                        if word[cur_letter_in] != u'.':
                            # Adds consonants to leftovers
                            leftovers = word[cur_letter_in] + leftovers
                        cur_letter_in -= 1
                    # Adds leftovers to last syllable in a word
                    syll_per_word[-1] += leftovers
                    syll_per_sent.append(syll_per_word)
                except IndexError:
                    # A word with no vowel leaves syll_per_word empty, so the
                    # [-1] lookup raises; the word is logged and omitted from
                    # the sentence's syllable list.
                    logger.info("IndexError while making syllables of '%s'. Continuing.", word)
            all_syllables.append(syll_per_sent)
        return all_syllables