Example #1
 def separar_silabas(palavra, separador):
     """
     Função que separa silabas da palavra indicada na chamada da função. O
     usuário ainda pode escolher que tipo de separador ele deseja para poder
     ficar mais amigável ao seu código.
     """
     # TODO: implement the function natively for syllable processing
     from pyphen import Pyphen
     _palavra_sep = palavra.lower()
     dic = Pyphen(lang="pt_BR")
     _palavra_sep = dic.inserted(_palavra_sep)
     if separador == "-":
         return _palavra_sep
     _palavra_sep = str(_palavra_sep).replace("-", separador)
     return _palavra_sep
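
A quick usage sketch of the function above (hypothetical calls; they assume the pyphen package and its pt_BR dictionary are installed, and the exact split depends on that dictionary):

print(separar_silabas("computador", "."))  # e.g. "com.pu.ta.dor"
print(separar_silabas("Paralelo", "-"))    # e.g. "pa-ra-le-lo"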
Example #2
    def syllable_count(self, text, lang='en_US'):
        """
        Function to calculate the number of syllables in a text.
        I/P - a text
        O/P - number of syllables
        """
        if text is None:
            return 0

        text = text.lower()
        text = "".join(x for x in text if x not in exclude)

        if len(text) == 0:
            return 0

        dic = Pyphen(lang=lang)
        count = 0
        for word in text.split(' '):
            word_hyphenated = dic.inserted(word)
            count += max(1, word_hyphenated.count("-") + 1)
        return count
Example #3
    def syllable_count(self, text, lang='en_US'):
        """
        Function to calculate the number of syllables in a text.
        I/P - a text
        O/P - number of syllables
        """
        if isinstance(text, bytes):
            text = text.decode(self.text_encoding)

        text = text.lower()
        text = self.remove_punctuation(text)

        if not text:
            return 0

        dic = Pyphen(lang=lang)
        count = 0
        for word in text.split(' '):
            word_hyphenated = dic.inserted(word)
            count += max(1, word_hyphenated.count("-") + 1)
        return count
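
The snippets above are methods of a larger class (`exclude` and `remove_punctuation` are defined elsewhere). A self-contained sketch of the same counting idea, assuming only pyphen and its en_US dictionary (the exact total depends on that dictionary):

from pyphen import Pyphen

def rough_syllable_count(text, lang="en_US"):
    # Hyphenation points + 1 per word is a cheap syllable estimate.
    dic = Pyphen(lang=lang)
    words = (w.strip(".,;:!?\"'()") for w in text.lower().split())
    return sum(max(1, dic.inserted(w).count("-") + 1) for w in words if w)

print(rough_syllable_count("Readability formulas need syllable counts."))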
Example #5
def n_syllables(doc: Doc):
    """
    Return number of syllables per token
    """

    dic = Pyphen(lang=doc.lang_)

    def count_syl(token: Token):
        word_hyphenated = dic.inserted(token.lower_)
        return max(1, word_hyphenated.count("-") + 1)

    return [count_syl(token) for token in doc._._filtered_tokens]
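
`doc._._filtered_tokens` is a custom spaCy extension registered elsewhere in that project. A minimal sketch of the same per-token count over a plain spaCy Doc, assuming only spaCy and pyphen are installed (no trained model needed):

import spacy
from pyphen import Pyphen

nlp = spacy.blank("en")  # tokenizer-only pipeline
doc = nlp("Hyphenation gives a cheap syllable estimate")
dic = Pyphen(lang="en_US")
print([max(1, dic.inserted(t.lower_).count("-") + 1) for t in doc if t.is_alpha])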
Example #6
    def syllable_counts(self, text):
        """
        Calculates number of syllables per token
        Punctuation is removed before tokenization
        """
        text = Text.to_text(text)
        if not text.text:
            return 0
        dic = Pyphen(lang=self.__lang)

        def count_syl(token):
            word_hyphenated = dic.inserted(token.lower())
            return max(1, word_hyphenated.count("-") + 1)

        return [count_syl(token) for token in text.tokens_without_punctuation]
Example #7
class textstatistics:
    __lang = "en_US"
    text_encoding = "utf-8"
    __easy_word_sets = {}
    __punctuation_regex = re.compile(f'[{re.escape(string.punctuation)}]')

    def __init__(self):
        self.set_lang(self.__lang)

    def _cache_clear(self):
        caching_methods = [
            method for method in dir(self) if callable(getattr(self, method))
            and hasattr(getattr(self, method), "cache_info")
        ]

        for method in caching_methods:
            getattr(self, method).cache_clear()

    def set_lang(self, lang):
        self.__lang = lang
        self.pyphen = Pyphen(lang=self.__lang)
        self._cache_clear()

    @lru_cache(maxsize=128)
    def char_count(self, text, ignore_spaces=True):
        """
        Function to return the total character count of a text.
        Whitespace is ignored by default; pass `ignore_spaces=False`
        to include it in the count.
        """
        if ignore_spaces:
            text = text.replace(" ", "")
        return len(text)

    @lru_cache(maxsize=128)
    def letter_count(self, text, ignore_spaces=True):
        """
        Function to return the total letter count of a text
        (punctuation is stripped). Whitespace is ignored by default;
        pass `ignore_spaces=False` to include it in the count.
        """
        if ignore_spaces:
            text = text.replace(" ", "")
        return len(self.remove_punctuation(text))

    @classmethod
    def remove_punctuation(cls, text):
        return cls.__punctuation_regex.sub('', text)

    @lru_cache(maxsize=128)
    def lexicon_count(self, text, removepunct=True):
        """
        Function to return the total lexicon count (number of words) in a text
        """
        if removepunct:
            text = self.remove_punctuation(text)
        count = len(text.split())
        return count

    @lru_cache(maxsize=128)
    def syllable_count(self, text, lang=None):
        """
        Function to calculate the number of syllables in a text.
        I/P - a text
        O/P - number of syllables
        """
        if lang:
            warnings.warn(
                "The 'lang' argument has been moved to "
                "'textstat.set_lang(<lang>)'. This argument will be removed "
                "in the future.", DeprecationWarning)
        if isinstance(text, bytes):
            text = text.decode(self.text_encoding)

        text = text.lower()
        text = self.remove_punctuation(text)

        if not text:
            return 0

        count = 0
        for word in text.split(' '):
            count += len(self.pyphen.positions(word)) + 1
        return count

    @lru_cache(maxsize=128)
    def sentence_count(self, text):
        """
        Sentence count of a text
        """
        ignore_count = 0
        sentences = re.split(r' *[\.\?!][\'"\)\]]*[ |\n](?=[A-Z])', text)
        for sentence in sentences:
            if self.lexicon_count(sentence) <= 2:
                ignore_count += 1
        return max(1, len(sentences) - ignore_count)

    @lru_cache(maxsize=128)
    def avg_sentence_length(self, text):
        try:
            asl = float(self.lexicon_count(text) / self.sentence_count(text))
            return legacy_round(asl, 1)
        except ZeroDivisionError:
            return 0.0

    @lru_cache(maxsize=128)
    def avg_syllables_per_word(self, text, interval=None):
        syllable = self.syllable_count(text)
        words = self.lexicon_count(text)
        try:
            if interval:
                syllables_per_word = float(syllable) * interval / float(words)
            else:
                syllables_per_word = float(syllable) / float(words)
            return legacy_round(syllables_per_word, 1)
        except ZeroDivisionError:
            return 0.0

    @lru_cache(maxsize=128)
    def avg_character_per_word(self, text):
        try:
            letters_per_word = float(
                self.char_count(text) / self.lexicon_count(text))
            return legacy_round(letters_per_word, 2)
        except ZeroDivisionError:
            return 0.0

    @lru_cache(maxsize=128)
    def avg_letter_per_word(self, text):
        try:
            letters_per_word = float(
                self.letter_count(text) / self.lexicon_count(text))
            return legacy_round(letters_per_word, 2)
        except ZeroDivisionError:
            return 0.0

    @lru_cache(maxsize=128)
    def avg_sentence_per_word(self, text):
        try:
            sentence_per_word = float(
                self.sentence_count(text) / self.lexicon_count(text))
            return legacy_round(sentence_per_word, 2)
        except ZeroDivisionError:
            return 0.0

    @lru_cache(maxsize=128)
    def flesch_reading_ease(self, text):
        sentence_length = self.avg_sentence_length(text)
        s_interval = 100 if self.__get_lang_root() in ['es', 'it'] else None
        syllables_per_word = self.avg_syllables_per_word(text, s_interval)
        flesch = (
            self.__get_lang_cfg("fre_base") - float(
                self.__get_lang_cfg("fre_sentence_length") * sentence_length) -
            float(
                self.__get_lang_cfg("fre_syll_per_word") * syllables_per_word))
        return legacy_round(flesch, 2)

    @lru_cache(maxsize=128)
    def flesch_kincaid_grade(self, text):
        sentence_length = self.avg_sentence_length(text)
        syllables_per_word = self.avg_syllables_per_word(text)
        flesch = (float(0.39 * sentence_length) +
                  float(11.8 * syllables_per_word) - 15.59)
        return legacy_round(flesch, 1)

    @lru_cache(maxsize=128)
    def polysyllabcount(self, text):
        count = 0
        for word in text.split():
            wrds = self.syllable_count(word)
            if wrds >= 3:
                count += 1
        return count

    @lru_cache(maxsize=128)
    def smog_index(self, text):
        sentences = self.sentence_count(text)

        if sentences >= 3:
            try:
                poly_syllab = self.polysyllabcount(text)
                smog = ((1.043 * (30 * (poly_syllab / sentences))**.5) +
                        3.1291)
                return legacy_round(smog, 1)
            except ZeroDivisionError:
                return 0.0
        else:
            return 0.0

    @lru_cache(maxsize=128)
    def coleman_liau_index(self, text):
        letters = legacy_round(self.avg_letter_per_word(text) * 100, 2)
        sentences = legacy_round(self.avg_sentence_per_word(text) * 100, 2)
        coleman = float((0.058 * letters) - (0.296 * sentences) - 15.8)
        return legacy_round(coleman, 2)

    @lru_cache(maxsize=128)
    def automated_readability_index(self, text):
        chrs = self.char_count(text)
        words = self.lexicon_count(text)
        sentences = self.sentence_count(text)
        try:
            a = float(chrs) / float(words)
            b = float(words) / float(sentences)
            readability = ((4.71 * legacy_round(a, 2)) +
                           (0.5 * legacy_round(b, 2)) - 21.43)
            return legacy_round(readability, 1)
        except ZeroDivisionError:
            return 0.0

    @lru_cache(maxsize=128)
    def linsear_write_formula(self, text):
        easy_word = 0
        difficult_word = 0
        text_list = text.split()[:100]

        for word in text_list:
            if self.syllable_count(word) < 3:
                easy_word += 1
            else:
                difficult_word += 1

        text = ' '.join(text_list)

        number = float(
            (easy_word * 1 + difficult_word * 3) / self.sentence_count(text))

        if number <= 20:
            number -= 2

        return number / 2

    @lru_cache(maxsize=128)
    def difficult_words(self, text, syllable_threshold=2):
        return len(self.difficult_words_list(text, syllable_threshold))

    @lru_cache(maxsize=128)
    def difficult_words_list(self, text, syllable_threshold=2):
        words = set(re.findall(r"[\w\='‘’]+", text.lower()))
        diff_words = [
            word for word in words
            if self.is_difficult_word(word, syllable_threshold)
        ]
        return diff_words

    @lru_cache(maxsize=128)
    def is_difficult_word(self, word, syllable_threshold=2):
        easy_word_set = self.__get_lang_easy_words()
        if word in easy_word_set:
            return False
        if self.syllable_count(word) < syllable_threshold:
            return False
        return True

    @lru_cache(maxsize=128)
    def is_easy_word(self, word, syllable_threshold=2):
        return not self.is_difficult_word(word, syllable_threshold)

    @lru_cache(maxsize=128)
    def dale_chall_readability_score(self, text):
        word_count = self.lexicon_count(text)
        count = word_count - self.difficult_words(text, syllable_threshold=0)

        try:
            per = float(count) / float(word_count) * 100
        except ZeroDivisionError:
            return 0.0

        difficult_words = 100 - per

        score = ((0.1579 * difficult_words) +
                 (0.0496 * self.avg_sentence_length(text)))

        if difficult_words > 5:
            score += 3.6365
        return legacy_round(score, 2)

    @lru_cache(maxsize=128)
    def gunning_fog(self, text):
        try:
            syllable_threshold = self.__get_lang_cfg("syllable_threshold")
            per_diff_words = (self.difficult_words(
                text, syllable_threshold=syllable_threshold) /
                              self.lexicon_count(text) * 100)

            grade = 0.4 * (self.avg_sentence_length(text) + per_diff_words)
            return legacy_round(grade, 2)
        except ZeroDivisionError:
            return 0.0

    @lru_cache(maxsize=128)
    def lix(self, text):
        words = text.split()

        words_len = len(words)
        long_words = len([wrd for wrd in words if len(wrd) > 6])

        per_long_words = (float(long_words) * 100) / words_len
        asl = self.avg_sentence_length(text)
        lix = asl + per_long_words

        return legacy_round(lix, 2)

    @lru_cache(maxsize=128)
    def rix(self, text):
        """
        A Rix ratio is simply the number of long words divided by
        the number of assessed sentences.
        rix = LW/S
        """
        words = text.split()
        long_words_count = len([wrd for wrd in words if len(wrd) > 6])
        sentences_count = self.sentence_count(text)

        try:
            rix = long_words_count / sentences_count
        except ZeroDivisionError:
            rix = 0.00

        return legacy_round(rix, 2)

    @lru_cache(maxsize=128)
    def spache_readability(self, text, float_output=True):
        """
        Function to calculate SPACHE readability formula for young readers.
        I/P - a text
        O/P - the Spache Readability Index/Grade Level
              (float by default; int if `float_output=False`)
        """
        total_no_of_words = self.lexicon_count(text)
        count_of_sentences = self.sentence_count(text)
        asl = total_no_of_words / count_of_sentences
        pdw = (self.difficult_words(text) / total_no_of_words) * 100
        spache = (0.141 * asl) + (0.086 * pdw) + 0.839
        if not float_output:
            return int(spache)
        else:
            return spache

    @lru_cache(maxsize=128)
    def dale_chall_readability_score_v2(self, text):
        """
        Function to calculate New Dale Chall Readability formula.
        I/P - a text
        O/P - the New Dale-Chall Readability Index/Grade Level (rounded float)
        """
        total_no_of_words = self.lexicon_count(text)
        count_of_sentences = self.sentence_count(text)
        asl = total_no_of_words / count_of_sentences
        pdw = (self.difficult_words(text) / total_no_of_words) * 100
        raw_score = 0.1579 * (pdw) + 0.0496 * asl
        adjusted_score = raw_score
        if raw_score > 0.05:
            adjusted_score = raw_score + 3.6365
        return legacy_round(adjusted_score, 2)

    @lru_cache(maxsize=128)
    def text_standard(self, text, float_output=None):

        grade = []

        # Appending Flesch Kincaid Grade
        lower = legacy_round(self.flesch_kincaid_grade(text))
        upper = math.ceil(self.flesch_kincaid_grade(text))
        grade.append(int(lower))
        grade.append(int(upper))

        # Appending Flesch Reading Ease
        score = self.flesch_reading_ease(text)
        if score < 100 and score >= 90:
            grade.append(5)
        elif score < 90 and score >= 80:
            grade.append(6)
        elif score < 80 and score >= 70:
            grade.append(7)
        elif score < 70 and score >= 60:
            grade.append(8)
            grade.append(9)
        elif score < 60 and score >= 50:
            grade.append(10)
        elif score < 50 and score >= 40:
            grade.append(11)
        elif score < 40 and score >= 30:
            grade.append(12)
        else:
            grade.append(13)

        # Appending SMOG Index
        lower = legacy_round(self.smog_index(text))
        upper = math.ceil(self.smog_index(text))
        grade.append(int(lower))
        grade.append(int(upper))

        # Appending Coleman_Liau_Index
        lower = legacy_round(self.coleman_liau_index(text))
        upper = math.ceil(self.coleman_liau_index(text))
        grade.append(int(lower))
        grade.append(int(upper))

        # Appending Automated_Readability_Index
        lower = legacy_round(self.automated_readability_index(text))
        upper = math.ceil(self.automated_readability_index(text))
        grade.append(int(lower))
        grade.append(int(upper))

        # Appending Dale_Chall_Readability_Score
        lower = legacy_round(self.dale_chall_readability_score(text))
        upper = math.ceil(self.dale_chall_readability_score(text))
        grade.append(int(lower))
        grade.append(int(upper))

        # Appending Linsear_Write_Formula
        lower = legacy_round(self.linsear_write_formula(text))
        upper = math.ceil(self.linsear_write_formula(text))
        grade.append(int(lower))
        grade.append(int(upper))

        # Appending Gunning Fog Index
        lower = legacy_round(self.gunning_fog(text))
        upper = math.ceil(self.gunning_fog(text))
        grade.append(int(lower))
        grade.append(int(upper))

        # Finding the Readability Consensus based upon all the above tests
        d = Counter(grade)
        final_grade = d.most_common(1)
        score = final_grade[0][0]

        if float_output:
            return float(score)
        else:
            lower_score = int(score) - 1
            upper_score = lower_score + 1
            return "{}{} and {}{} grade".format(lower_score,
                                                get_grade_suffix(lower_score),
                                                upper_score,
                                                get_grade_suffix(upper_score))

    @lru_cache(maxsize=128)
    def reading_time(self, text, ms_per_char=14.69):
        """
        Function to calculate reading time (Demberg & Keller, 2008)
        I/P - a text
        O/P - reading time in seconds
        """
        words = text.split()
        nchars = map(len, words)
        rt_per_word = map(lambda nchar: nchar * ms_per_char, nchars)
        reading_time = sum(list(rt_per_word))

        return legacy_round(reading_time / 1000, 2)

    # Spanish readability tests
    @lru_cache(maxsize=128)
    def fernandez_huerta(self, text):
        '''
        Fernandez Huerta readability score
        https://legible.es/blog/lecturabilidad-fernandez-huerta/
        '''
        sentence_length = self.avg_sentence_length(text)
        syllables_per_word = self.avg_syllables_per_word(text)

        f_huerta = (206.85 - float(60 * syllables_per_word) -
                    float(1.02 * sentence_length))
        return legacy_round(f_huerta, 1)

    @lru_cache(maxsize=128)
    def szigriszt_pazos(self, text):
        '''
        Szigriszt Pazos readability score (1992)
        https://legible.es/blog/perspicuidad-szigriszt-pazos/
        '''
        syllables = self.syllable_count(text)
        total_words = self.lexicon_count(text)
        total_sentences = self.sentence_count(text)

        s_p = (self.__get_lang_cfg("fre_base") - 62.3 *
               (syllables / total_words) - (total_words / total_sentences))

        return legacy_round(s_p, 2)

    @lru_cache(maxsize=128)
    def gutierrez_polini(self, text):
        '''
        Gutiérrez de Polini index
        https://legible.es/blog/comprensibilidad-gutierrez-de-polini/
        '''
        total_words = self.lexicon_count(text)
        total_letters = self.letter_count(text)
        total_sentences = self.sentence_count(text)

        gut_pol = (95.2 - 9.7 * (total_letters / total_words) - 0.35 *
                   (total_words / total_sentences))

        return legacy_round(gut_pol, 2)

    @lru_cache(maxsize=128)
    def crawford(self, text):
        '''
        Crawford index
        https://legible.es/blog/formula-de-crawford/
        '''
        total_sentences = self.sentence_count(text)
        total_words = self.lexicon_count(text)
        total_syllables = self.syllable_count(text)

        # Calculating sentences and syllables per 100 words
        sentences_per_words = 100 * (total_sentences / total_words)
        syllables_per_words = 100 * (total_syllables / total_words)

        craw_years = (-0.205 * sentences_per_words +
                      0.049 * syllables_per_words - 3.407)

        return legacy_round(craw_years, 1)

    def __get_lang_cfg(self, key):
        """ Read as get lang config """
        default = langs.get("en")
        config = langs.get(self.__get_lang_root(), default)
        return config.get(key, default.get(key))

    def __get_lang_root(self):
        return self.__lang.split("_")[0]

    def __get_lang_easy_words(self):
        lang = self.__get_lang_root()
        if lang not in self.__easy_word_sets:
            try:
                easy_word_set = {
                    ln.decode("utf-8").strip()
                    for ln in pkg_resources.resource_stream(
                        "textstat",
                        f"resources/{lang}/easy_words.txt",
                    )
                }
            except FileNotFoundError:
                warnings.warn(
                    "There is no easy words vocabulary for "
                    f"{self.__lang}, using english.",
                    Warning,
                )
                easy_word_set = {
                    ln.decode("utf-8").strip()
                    for ln in pkg_resources.resource_stream(
                        "textstat", "resources/en/easy_words.txt")
                }
            self.__easy_word_sets[lang] = easy_word_set
        return self.__easy_word_sets[lang]
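
The class above is the core of the textstat package; `langs`, `legacy_round`, `get_grade_suffix` and the bundled easy-word lists are module-level helpers not shown here. A hedged usage sketch through the published package, which exposes the same methods on a shared instance:

import textstat  # pip install textstat

sample = "This is a short sample. It has exactly two sentences."
textstat.set_lang("en_US")
print(textstat.syllable_count(sample))
print(textstat.flesch_reading_ease(sample))
print(textstat.text_standard(sample, float_output=True))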
Example #8
def _shyphenate_text(dic: pyphen.Pyphen, text: str) -> str:
    if len(text) < 5:
        return text
    else:
        return " ".join(
            dic.inserted(word, hyphen=SOFT_HYPHEN) for word in text.split(" "))
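
SOFT_HYPHEN is not defined in the snippet; presumably it is U+00AD. A self-contained sketch of the same idea, with the break points made visible for inspection:

import pyphen

SOFT_HYPHEN = "\u00ad"  # assumption: U+00AD, invisible unless a line actually wraps
dic = pyphen.Pyphen(lang="en_US")
text = "hyphenation example"
shy = " ".join(dic.inserted(word, hyphen=SOFT_HYPHEN) for word in text.split(" "))
print(shy.replace(SOFT_HYPHEN, "|"))  # e.g. "hy|phen|ation ex|am|ple"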
Example #9
def read_data(args, config):
  '''read data sets, construct all needed structures and update the config'''
  if args.ssm == '1': config.ssm = 1
  
  hyphenator = Pyphen(lang=args.lang)

  def my_characters(word):
    return ['⎡'] + list(word) + ['⎦']

  if args.is_train == '1':
    if not os.path.exists(args.save_dir):
      os.makedirs(args.save_dir)
    with open(os.path.join(
        args.save_dir, args.prefix + '-data.pkl'), 'wb') as data_file:
      word_data = open(os.path.join(args.data_dir, 'train.txt'), 'r').read() \
                  .replace('\n', args.eos).split()
      words = list(set(word_data))
      
      characters = set()
      word_lens_in_char = []

      for word in words:
        chars = my_characters(word)
        word_lens_in_char.append(len(chars))
        for char in chars:
          characters.add(char)

      chars_list = list(characters)
      pickle.dump(
          (word_data, words, word_lens_in_char, chars_list), data_file)

  else:
    with open(os.path.join(
        args.save_dir, args.prefix + '-data.pkl'), 'rb') as data_file:
      word_data, words, word_lens_in_char, chars_list = \
          pickle.load(data_file)

  word_data_size, word_vocab_size = len(word_data), len(words)
  print('data has %d words, %d unique' % (word_data_size, word_vocab_size))
  config.word_vocab_size = word_vocab_size
  config.num_sampled = int(word_vocab_size * 0.2)

  word_to_ix = { word:i for i,word in enumerate(words) }
  ix_to_word = { i:word for i,word in enumerate(words) }

  def get_word_raw_data(input_file):
    data = open(input_file, 'r').read().replace('\n', args.eos).split()
    return [word_to_ix[w] for w in data]

  train_raw_data = get_word_raw_data(os.path.join(args.data_dir, 'train.txt'))
  valid_raw_data = get_word_raw_data(os.path.join(args.data_dir, 'valid.txt'))
  test_raw_data = get_word_raw_data(os.path.join(args.data_dir, 'test.txt'))

  char_vocab_size = len(chars_list)
  max_word_len = int(np.percentile(word_lens_in_char, 100))
  config.max_word_len = max_word_len
  print('data has %d unique chars' % char_vocab_size)
  print('max word length in chars is set to', max_word_len)

  # a fake character for zero-padding
  zero_pad_char = ' '
  chars_list.insert(0, zero_pad_char)
  char_vocab_size += 1
  config.char_vocab_size = char_vocab_size

  char_to_ix = { char:i for i,char in enumerate(chars_list) }
  ix_to_char = { i:char for i,char in enumerate(chars_list) }

  word_ix_to_char_ixs = {}
  for word in words:
    word_ix = word_to_ix[word]
    word_in_chars = my_characters(word)
    word_in_chars += [zero_pad_char] * (max_word_len - len(word_in_chars))
    word_ix_to_char_ixs[word_ix] = \
        [char_to_ix[char] for char in word_in_chars]

  return train_raw_data, valid_raw_data, test_raw_data, word_ix_to_char_ixs
Example #10
from utils.web.slack_api import parse_config
from utils.web.slack_api.big_emoji import resize_image, resize_gif
from utils.web.slack_api.text_to_emoji import text_to_emoji
from utils.web.servers.core import register_cmd, SlackInfo, slack_api, app, init_slack_api, gen_help_str, \
    send_to_channel, request_in_loop, no_dm, run_in_executor

__author__ = 'acushner'

from utils.web.servers.incident import IncidentInfo, init_incident_store

_admins = parse_config().admin_id_name_map
DEFAULT_SIZE_MULT = 6.
MAX_SIZE_MULT = 15.

_pyphen = Pyphen(lang='en')


@register_cmd
@no_dm
async def embiggen(si: SlackInfo):
    """emoji [size_multiple]
    [_size_multiple_]: multiple to scale up/down emoji size by
    only works on custom emoji due to download issues from slack"""
    emoji, *rest = si.argstr.split()
    mult = min(MAX_SIZE_MULT, float(first(rest, DEFAULT_SIZE_MULT)))
    if mult <= 0:
        return text(f'invalid mult: {mult}')

    all_emoji = await slack_api.get_emoji()
    try:
Example #11
File: process.py Project: ADFD/adfd
def hyphenate(text, hyphen='&shy;'):
    py = Pyphen(lang='de_de')
    words = text.split(' ')
    return ' '.join([py.inserted(word, hyphen=hyphen) for word in words])
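
A hypothetical call (the exact result depends on pyphen's German dictionary):

print(hyphenate("Silbentrennung verbessert den Blocksatz"))
# -> something like "Sil&shy;ben&shy;tren&shy;nung ver&shy;bes&shy;sert den Block&shy;satz"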
Example #12
class Syllable:
    def __init__(self, syl):
        self.id = ID.next()
        self.text = syl
        self.phonemes = []
        self.stressed = False

    def __str__(self):
        arr = []
        for ph in self.phonemes:
            arr.append(ph.text)
        return u'{} ({})'.format(self.text, arr)


hyp = Pyphen(lang='pl_PL')

ph_map = {
    'I': 'y',
    'en': u'ę',
    'on': u'ą',
    'v': 'w',
    'S': 'sz',
    'Z': u'ż',
    'si': u'ś',
    'zi': u'ź',
    'x': 'h',
    'ts': 'c',
    'tS': 'cz',
    'dZ': u'dż',
    'ni': u'ń',
Example #13
class GermaLemma(object):
    """
    Lemmatizer for German language text main class.
    """
    pyphen_dic = Pyphen(lang='de')

    def __init__(self, **kwargs):
        """
        Initialize GermaLemma lemmatizer. By default, it will load the lemmatizer data from 'data/lemmata.pickle'. You
        can also pass a manual lemmata dictionary via `lemmata` or load a corpus in CONLL09 format via `tiger_corpus`
        or load pickled lemmatizer data from `pickle`.
        Force usage of pattern.de module by setting `use_pattern_module` to True (or False for not using). By default,
        it will try to use pattern.de if it is installed.
        """
        if 'lemmata' in kwargs:
            self.lemmata = kwargs['lemmata']
            if 'lemmata_lower' in kwargs:
                self.lemmata_lower = kwargs['lemmata_lower']
            else:
                self.lemmata_lower = {
                    pos:
                    {token.lower(): lemma
                     for token, lemma in pos_lemmata}
                    for pos, pos_lemmata in self.lemmata.items()
                }
        elif 'tiger_corpus' in kwargs:
            self.lemmata, self.lemmata_lower = self.load_corpus_lemmata(
                kwargs['tiger_corpus'])
        elif 'pickle' in kwargs:
            self.load_from_pickle(kwargs['pickle'])
        else:
            try:
                self.load_from_pickle(DEFAULT_LEMMATA_PICKLE)
            except FileNotFoundError:
                self.load_from_pickle(
                    os.path.join(sys.prefix, DEFAULT_LEMMATA_PICKLE))

        self.pattern_module = None
        use_pattern_module = kwargs.get('use_pattern_module', None)
        if use_pattern_module in (True, None):
            try:
                self.pattern_module = import_module('pattern.de')
            except ImportError:
                if use_pattern_module is True:
                    raise ImportError('pattern.de module could not be loaded')

    def find_lemma(self, w, pos_tag):
        """
        Find a lemma for word `w` that has a Part-of-Speech tag `pos_tag`. `pos_tag` should be a valid STTS tagset tag
        (see http://www.ims.uni-stuttgart.de/forschung/ressourcen/lexika/TagSets/stts-table.html) or a simplified form
        with:
        - 'N' for nouns
        - 'V' for verbs
        - 'ADJ' for adjectives
        - 'ADV' for adverbs
        All other tags will raise a ValueError("Unsupported POS tag")!
        Return the lemma or, if no lemma was found, return `w`.
        """
        if not w:  # do not process empty strings
            return w

        if pos_tag == 'NE':  # if word is a name, it already is the lemma
            return w

        if pos_tag.startswith('N') or pos_tag.startswith('V'):
            pos = pos_tag[0]
        elif pos_tag.startswith('ADJ') or pos_tag.startswith('ADV'):
            pos = pos_tag[:3]
        else:
            raise ValueError("Unsupported POS tag")

        # look if we can directly find `w` in the lemmata dictionary
        res = self.dict_search(w, pos)

        if not res and self.pattern_module:  # try to use pattern.de module
            res_pattern = self._lemma_via_patternlib(w, pos)
            if res_pattern != w:
                res = res_pattern

        if not res:
            # try to split nouns that are made of composita
            if pos == 'N':
                res = self._composita_lemma(w) or w
            else:
                res = w

            # try to lemmatize adjectives using prevalent German language adjective suffixes
            if pos == 'ADJ':
                res = self._adj_lemma(res)

        # nouns always start with a capital letter
        if pos == 'N':
            if len(res) > 1 and res[0].islower():
                res = res[0].upper() + res[1:]
        else:  # all other forms are lower-case
            res = res.lower()

        return res

    def dict_search(self, w, pos, use_lower=False):
        """
        Lemmata dictionary lookup for word `w` with POS tag `pos`.
        Return lemma if found, else None.
        """
        pos_lemmata = self.lemmata_lower[pos] if use_lower else self.lemmata[
            pos]

        return pos_lemmata.get(w, None)

    def _adj_lemma(self, w):
        """
        Try to lemmatize adjectives using prevalent German language adjective suffixes. Return possibly lemmatized
        adjective.
        """
        for full, reduced in ADJ_SUFFIXES_DICT.items():
            if w.endswith(full):
                return w[:-len(full)] + reduced

        return w

    def _composita_lemma(self, w):
        """
        Try to split a word `w` that is possibly made of composita.
        Return the lemma if found, else return None.
        """

        # find most important split position first when a hyphen is used in the word
        try:
            split_positions = [w.rfind('-') + 1]
        except ValueError:
            split_positions = []

        # add possible split positions by using Pyphen's hyphenation positions
        split_positions.extend([
            p for p in self.pyphen_dic.positions(w) if p not in split_positions
        ])

        # now split `w` by hyphenation step by step
        for hy_pos in split_positions:
            # split in left and right parts (start and end of the strings)
            left, right = w[:hy_pos], w[hy_pos:]

            # look if the right part can be found in the lemmata dictionary
            # if we have a noun, a lower case match will also be accepted
            if left and right and not right.endswith('innen'):
                res = self.dict_search(right,
                                       'N',
                                       use_lower=right[0].islower())
                if res:
                    # concatenate the left side with the found partial lemma
                    if left[-1] == '-':
                        res = left + res.capitalize()
                    else:
                        res = left + res.lower()

                    if w.isupper():
                        return res.upper()
                    else:
                        return res

        return None

    def _lemma_via_patternlib(self, w, pos):
        """
        Try to find a lemma for word `w` that has a Part-of-Speech tag `pos_tag` by using pattern.de module's functions.
        Return the lemma or `w` if lemmatization was not possible with pattern.de
        """
        if not self.pattern_module:
            raise RuntimeError('pattern.de module not loaded')

        if pos == 'NP':  # singularize noun
            return self.pattern_module.singularize(w)
        elif pos.startswith('V'):  # get infinitive of verb
            return self.pattern_module.conjugate(w)
        elif pos.startswith('ADJ') or pos.startswith(
                'ADV'):  # get baseform of adjective or adverb
            return self.pattern_module.predicative(w)

        return w

    @classmethod
    def load_corpus_lemmata(cls, corpus_file):
        lemmata = defaultdict(dict)
        lemmata_lower = defaultdict(dict)

        with codecs.open(corpus_file, encoding="utf-8") as f:
            for line in f:
                parts = line.split()
                if len(parts) == 15:
                    token, lemma = parts[1:3]
                    pos = parts[4]
                    cls.add_to_lemmata_dicts(lemmata, lemmata_lower, token,
                                             lemma, pos)

        return lemmata, lemmata_lower

    @staticmethod
    def add_to_lemmata_dicts(lemmata, lemmata_lower, token, lemma, pos):
        for pos_prefix in VALID_POS_PREFIXES:
            if pos.startswith(pos_prefix):
                if token not in lemmata[pos_prefix]:
                    lemmata[pos_prefix][token] = lemma
                if lemma not in lemmata[pos_prefix]:  # for quicker lookup
                    lemmata[pos_prefix][lemma] = lemma

                if pos_prefix == 'N':
                    token_lower = token.lower()
                    if token_lower not in lemmata_lower[pos_prefix]:
                        lemmata_lower[pos_prefix][token_lower] = lemma
                    lemma_lower = lemma.lower()
                    if lemma_lower not in lemmata_lower[pos_prefix]:
                        lemmata_lower[pos_prefix][lemma_lower] = lemma

                return

    def save_to_pickle(self, pickle_file):
        with open(pickle_file, 'wb') as f:
            pickle.dump((self.lemmata, self.lemmata_lower), f, protocol=2)

    def load_from_pickle(self, pickle_file):
        with open(pickle_file, 'rb') as f:
            self.lemmata, self.lemmata_lower = pickle.load(f)
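
A hedged usage sketch, assuming the class is installed as the published germalemma package together with its bundled lemmata pickle (the import path is an assumption):

from germalemma import GermaLemma

lemmatizer = GermaLemma()
print(lemmatizer.find_lemma('Feinstaubbelastungen', 'N'))  # e.g. 'Feinstaubbelastung'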
Example #14
def read_data(args, config):
    '''read data sets, construct all needed structures and update the config'''
    if args.ssm == '1': config.ssm = 1

    hyphenator = Pyphen(lang=args.dict)

    def my_syllables(word):
        return hyphenator.inserted(word).split('-')

    if args.is_train == '1':
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        with open(os.path.join(args.save_dir, args.prefix + '-data.pkl'),
                  'wb') as data_file:
            word_data = open(
                os.path.join(args.data_dir, 'train.txt'), 'r').read() \
                .replace('\n', args.eos).split()
            words = list(set(word_data))

            syllables = set()
            word_lens_in_syl = []

            for word in words:
                syls = my_syllables(word)
                word_lens_in_syl.append(len(syls))
                for syl in syls:
                    syllables.add(syl)

            syls_list = list(syllables)
            pickle.dump((word_data, words, word_lens_in_syl, syls_list),
                        data_file)
    else:
        with open(os.path.join(args.save_dir, args.prefix + '-data.pkl'),
                  'rb') as data_file:
            word_data, words, word_lens_in_syl, syls_list = pickle.load(
                data_file)

    word_data_size, word_vocab_size = len(word_data), len(words)
    print('data has %d words, %d unique' % (word_data_size, word_vocab_size))
    config.word_vocab_size = word_vocab_size
    config.num_sampled = int(word_vocab_size * 0.2)

    word_to_ix = {word: i for i, word in enumerate(words)}
    ix_to_word = {i: word for i, word in enumerate(words)}

    def get_word_raw_data(input_file):
        data = open(input_file, 'r').read().replace('\n', args.eos).split()
        return [word_to_ix[w] for w in data]

    train_raw_data = get_word_raw_data(os.path.join(args.data_dir,
                                                    'train.txt'))
    valid_raw_data = get_word_raw_data(os.path.join(args.data_dir,
                                                    'valid.txt'))
    test_raw_data = get_word_raw_data(os.path.join(args.data_dir, 'test.txt'))

    syl_vocab_size = len(syls_list)
    max_word_len = int(np.percentile(word_lens_in_syl, 100))
    config.max_word_len = max_word_len
    print('data has %d unique syllables' % syl_vocab_size)
    print('max word length in syllables is set to', max_word_len)

    # a fake syllable for zero-padding
    zero_pad_syl = ' '
    syls_list.insert(0, zero_pad_syl)
    syl_vocab_size += 1
    config.syl_vocab_size = syl_vocab_size

    syl_to_ix = {syl: i for i, syl in enumerate(syls_list)}
    ix_to_syl = {i: syl for i, syl in enumerate(syls_list)}

    word_ix_to_syl_ixs = {}
    for word in words:
        word_ix = word_to_ix[word]
        word_in_syls = my_syllables(word)
        word_in_syls += [zero_pad_syl] * (max_word_len - len(word_in_syls))
        word_ix_to_syl_ixs[word_ix] = [syl_to_ix[syl] for syl in word_in_syls]

    return train_raw_data, valid_raw_data, test_raw_data, word_ix_to_syl_ixs
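
For illustration, the my_syllables segmentation used above boils down to the following (an en_US dictionary is assumed here; in the snippet, args.dict selects the language at run time):

from pyphen import Pyphen

hyphenator = Pyphen(lang='en_US')
print(hyphenator.inserted('segmentation').split('-'))  # e.g. ['seg', 'men', 'ta', 'tion']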
Example #15
 def set_lang(self, lang):
     self.__lang = lang
     self.pyphen = Pyphen(lang=self.__lang)
     self._cache_clear()
Example #16
class CustomGermaLemma(object):
    """
    Lemmatizer for German language text main class.
    """
    pyphen_dic = Pyphen(lang='de')

    def __init__(self, **kwargs):
        if ('tiger_corpus' in kwargs):
            self.lemmata, self.lemmata_lower = self.load_corpus_lemmata(
                kwargs['tiger_corpus'])
        elif ('pickle' in kwargs):
            self.load_from_pickle(kwargs['pickle'])
        self.pattern_module = import_module('pattern.de')
        self.iwnlpLemmatizer = CustomIWNLPLemmatizer(
            join(FILE_PATH, "lib", "IWNLP.Lemmatizer_20170501.json"))

    def find_lemma(self, w, pos, props=None):
        # do not process empty strings
        if (not (w)):
            raise ValueError("Empty String!")
        # valid POS tags: NOUN, VERB, ADJ, ADV, AUX
        elif (not (pos in ["NOUN", "VERB", "ADJ", "ADV", "AUX"])):
            return w
        iwnlpLemmas = self.iwnlpLemmatizer.lemmatize(w, pos)
        if (iwnlpLemmas): return iwnlpLemmas, None

        if (pos.startswith('N') or pos.startswith('V')):
            pos = pos[0]
        elif (pos.startswith('ADJ') or pos.startswith('ADV')):
            pos = pos[:3]
        elif (pos == "AUX"):
            pos = "V"
        # look if we can directly find `w` in the lemmata dictionary
        res = self.dict_search(w, pos)
        composita = None
        if (not (res)):
            # try to split nouns that are made of composita
            if (pos == 'N'):
                compositaRes = self._composita_lemma(w)
                res = compositaRes[0]
                if (len(compositaRes) > 1):
                    composita = compositaRes[1:]

            # try to lemmatize adjectives using prevalent German language adjective suffixes
            elif pos == 'ADJ':
                res = self._adj_lemma(w)

        # try to use pattern.de module
        if (not (res) and props and self.pattern_module):
            res_pattern = self._lemma_via_patternlib(w, pos, props)
            if res_pattern != w:
                res = res_pattern

        if (res):
            # nouns always start with a capital letter
            if (pos == 'N'):
                if len(res) > 1 and res[0].islower():
                    res = res[0].upper() + res[1:]
            else:
                res = res.lower()
            return [res], composita

        return res, composita

    def dict_search(self, w, pos, use_lower=False):
        """
        Lemmata dictionary lookup for word `w` with POS tag `pos`.
        Return lemma if found, else None.
        """
        pos_lemmata = self.lemmata_lower[pos] if use_lower else self.lemmata[
            pos]

        return pos_lemmata.get(w, None)

    def _adj_lemma(self, w):
        """
        Try to lemmatize adjectives using prevalent German language adjective suffixes. Return possibly lemmatized
        adjective.
        """
        for full, reduced in ADJ_SUFFIXES_DICT.items():
            if w.endswith(full):
                return w[:-len(full)] + reduced

        return None

    def _composita_lemma(self, w):
        """
        Try to split a word `w` that is possibly made of composita.
        Return the lemma if found, else return None.
        """
        # find most important split position first, only right part needs to exist
        try:
            split_positions = [w.rfind('-') + 1]
        except ValueError:
            split_positions = []
        split_positions.extend([
            p for p in self.pyphen_dic.positions(w) if p not in split_positions
        ])

        for hy_pos in split_positions:
            left, right = w[:hy_pos], w[hy_pos:]
            if (left and right and not (right.endswith('innen'))):
                resRight = self.dict_search(right,
                                            'N',
                                            use_lower=right[0].islower())
                if (not (resRight)):
                    resRight = self.iwnlpLemmatizer.lemmatize(right, "NOUN")
                    if (resRight): resRight = resRight[0]
                if resRight:
                    resLeft = self.dict_search(left,
                                               'N',
                                               use_lower=left[0].islower())
                    if (not (resLeft)):
                        resLeft = self.iwnlpLemmatizer.lemmatize(left, "NOUN")
                        if (resLeft): resLeft = resLeft[0]
                    if (not (resLeft)):
                        resLeft = self.dict_search(left[:-1],
                                                   'N',
                                                   use_lower=left[0].islower())
                    if (not (resLeft)):
                        resLeft = self.iwnlpLemmatizer.lemmatize(
                            left[:-1], "NOUN")
                        if (resLeft): resLeft = resLeft[0]
                    # concatenate the left side with the found partial lemma
                    if left[-1] == '-':
                        res = left + resRight.capitalize()
                    else:
                        res = left + resRight.lower()

                    resList = []
                    if w.isupper():
                        resList.append(res.upper())
                    else:
                        resList.append(res.capitalize())
                    resList.append(resRight.capitalize())
                    if (resLeft): resList.append(resLeft.capitalize())
                    return resList

        # try other split positions, both parts need to exist
        split_positions = [
            i for i in range(3,
                             len(w) - 2) if not (i in split_positions)
        ]

        for hy_pos in split_positions:
            left, right = w[:hy_pos], w[hy_pos:]
            if (left and right and not (right.endswith('innen'))):
                resRight = self.dict_search(right,
                                            'N',
                                            use_lower=right[0].islower())
                if (not (resRight)):
                    resRight = self.iwnlpLemmatizer.lemmatize(right, "NOUN")
                    if (resRight): resRight = resRight[0]
                resLeft = self.dict_search(left,
                                           'N',
                                           use_lower=left[0].islower())
                if (not (resLeft)):
                    resLeft = self.iwnlpLemmatizer.lemmatize(left, "NOUN")
                    if (resLeft): resLeft = resLeft[0]
                if (not (resLeft)):
                    resLeft = self.dict_search(left[:-1],
                                               'N',
                                               use_lower=left[0].islower())
                if (not (resLeft)):
                    resLeft = self.iwnlpLemmatizer.lemmatize(left[:-1], "NOUN")
                    if (resLeft): resLeft = resLeft[0]
                if (resRight and resLeft):
                    res = left + resRight.lower()
                    resList = []
                    if w.isupper():
                        resList.append(res.upper())
                    else:
                        resList.append(res.capitalize())
                    resList.append(resRight.capitalize())
                    resList.append(resLeft.capitalize())
                    return resList

        return [None]

    def _lemma_via_patternlib(self, w, pos, props={}):
        """
        Try to find a lemma for word `w` that has a Part-of-Speech tag `pos_tag` by using pattern.de module's functions.
        Return the lemma or `w` if lemmatization was not possible with pattern.de
        """
        if (not (self.pattern_module)):
            raise RuntimeError('pattern.de module not loaded')
        if (pos.startswith('N') and "number" in props
                and props["number"] != "Sg"):  # pos == 'NP': singularize noun
            return self.pattern_module.singularize(w)
        elif (pos.startswith('V') and "form" in props
              and props["form"] != "INF"):  # get infinitive of verb
            return self.pattern_module.conjugate(w)
        elif (pos.startswith('ADJ')
              or pos.startswith('ADV')):  # get baseform of adjective or adverb
            return self.pattern_module.predicative(w)

        return w

    @staticmethod
    def add_to_lemmata_dicts(lemmata, lemmata_lower, token, lemma, pos):
        for pos_prefix in VALID_POS_PREFIXES:
            if pos.startswith(pos_prefix):
                if token not in lemmata[pos_prefix]:
                    lemmata[pos_prefix][token] = lemma
                if lemma not in lemmata[pos_prefix]:  # for quicker lookup
                    lemmata[pos_prefix][lemma] = lemma

                if pos_prefix == 'N':
                    token_lower = token.lower()
                    if token_lower not in lemmata_lower[pos_prefix]:
                        lemmata_lower[pos_prefix][token_lower] = lemma
                    lemma_lower = lemma.lower()
                    if lemma_lower not in lemmata_lower[pos_prefix]:
                        lemmata_lower[pos_prefix][lemma_lower] = lemma

                return

    @classmethod
    def load_corpus_lemmata(cls, corpus_file):
        lemmata = defaultdict(dict)
        lemmata_lower = defaultdict(dict)

        with codecs.open(corpus_file, encoding="utf-8") as f:
            for line in f:
                parts = line.split()
                if len(parts) == 15:
                    token, lemma = parts[1:3]
                    pos = parts[4]
                    cls.add_to_lemmata_dicts(lemmata, lemmata_lower, token,
                                             lemma, pos)

        return lemmata, lemmata_lower

    def save_to_pickle(self, pickle_file):
        with open(pickle_file, 'wb') as f:
            pickle.dump((self.lemmata, self.lemmata_lower), f, protocol=2)

    def load_from_pickle(self, pickle_file):
        with open(pickle_file, 'rb') as f:
            self.lemmata, self.lemmata_lower = pickle.load(f)
Example #17
class ContentCleaner:
    def __init__(self, dataset, content_column):
        self.dataset = dataset.reset_index()
        self.content_column = content_column
        self.dic = Pyphen(lang='en_US')

        self.process_data()

    def __str__(self):
        return """
            This class takes a raw dataset and builds a clean NLP
            dataset with engineered features out of it
        """

    def lower_case(self):
        self.dataset[self.content_column] = self.dataset[
            self.content_column].str.lower()

    def remove_html_tags(self):
        cleanr = re.compile('<.*?>.*<.*>')
        self.dataset[self.content_column] = [
            re.sub(cleanr, '', r) for r in self.dataset[self.content_column]
        ]

    def stem_words(self):
        """
        https://stackoverflow.com/questions/38763007/how-to-use-spacy-lemmatizer-to-get-a-word-into-basic-form
        """
        print("Stemming Words")
        for i, row in tqdm(self.dataset.iterrows()):
            stemmed_string = ""
            content_row = nlp(row["content"])
            for word in content_row:
                stemmed_string += " " + word.lemma_
            self.dataset.loc[i, "content"] = stemmed_string

    def remove_stop_words(self):
        print("Removing Stop Words")
        for i, row in tqdm(self.dataset.iterrows()):
            sentence_sans_stop_words = ""
            content_row = nlp(row["content"])

            for word in content_row:
                if word.is_stop is False:
                    sentence_sans_stop_words += " " + word.text
            self.dataset.loc[i, "content"] = sentence_sans_stop_words
            self.dataset.loc[i, "num_words"] = len(content_row)

    def count_adjectives(self):
        """
        see:
        https://spacy.io/api/annotation
        https://spacy.io/usage/linguistic-features
        """
        print("Counting Adjectives")
        for i, row in tqdm(self.dataset.iterrows()):
            adjective_count = 0
            content_row = nlp(row["content"])

            for word in content_row:
                if word.pos_ == "ADJ":
                    adjective_count += 1
            self.dataset.loc[i, "adjectives"] = adjective_count

    def biggest_word(self):
        """
        Taken from https://github.com/shivam5992/textstat
        """
        self.dic = Pyphen(lang='en_US')
        print("Finding Biggest Words")
        for i, row in tqdm(self.dataset.iterrows()):
            biggest_word = 0
            content_row = nlp(row["content"])

            for word in content_row:
                word_hyphenated = self.dic.inserted(word.text)
                word_size = max(1, word_hyphenated.count("-") + 1)
                if word_size > biggest_word:
                    biggest_word = word_size

            self.dataset.loc[i, "biggest_word_syllables"] = biggest_word

    def readability_score(self):
        """
        Taken from - https://github.com/shivam5992/textstat
        
        Based on The Flesch Reading Ease formula
        """
        def avg_sentence_length(text):
            sentences = re.split(r' *[\.\?!][\'"\)\]]*[ |\n](?=[A-Z])', text)
            ignore_count = 0
            sentence_lengths = []
            for sentence in sentences:
                if len(sentence.split(" ")) <= 2:
                    ignore_count += 1
                else:
                    sentence_lengths.append(len(sentence.split(" ")))
            sentence_count = max(1, len(sentences) - ignore_count)
            sentence_length_mean = sum(sentence_lengths)
            return sentence_length_mean / sentence_count

        def avg_syllables_per_word(text):
            words = nlp(text)
            syllables = []
            self.dic = Pyphen(lang='en_US')

            for word in words:
                word_hyphenated = self.dic.inserted(word.text)
                syllables.append(max(1, word_hyphenated.count("-") + 1))
            return sum(syllables) / len(words)

        def legacy_round(number, points=0):
            p = 10**points
            return float(
                math.floor((number * p) + math.copysign(0.5, number))) / p

        # code from https://github.com/shivam5992/textstat
        print("Assessing Readability Score")
        for i, row in tqdm(self.dataset.iterrows()):
            sentence_length = avg_sentence_length(row["content"])
            syllables_per_word = avg_syllables_per_word(row["content"])
            flesch = (206.835 - float(1.015 * sentence_length) -
                      float(84.6 * syllables_per_word))
            Flesch_reading_score = legacy_round(flesch, 2)
            self.dataset.loc[i, "flesch_reading_score"] = Flesch_reading_score

    def count_alliteration(self):
        print("Counting Alliteration")
        for i, row in tqdm(self.dataset.iterrows()):
            repeat_letter = None
            consecutive = False
            alliteration_count = 0

            if len(row["content"]) > 0:

                words = row["content"].split(" ")
                for word in words:
                    if len(word) > 0:
                        # Start of new alliteration
                        if (str(word)[0] == repeat_letter
                                and consecutive is False):
                            alliteration_count += 1
                            repeat_letter = str(word)[0]
                            consecutive = True
                        # In the middle of a consecutive streak of alliteration
                        elif str(word)[0] == repeat_letter and consecutive:
                            repeat_letter = str(word)[0]

                        # End of an alliteration
                        elif str(word)[0] != repeat_letter:
                            repeat_letter = str(word)[0]
                            consecutive = False
                self.dataset.loc[i, "alliteration"] = alliteration_count

            else:
                self.dataset.loc[i, "alliteration"] = 0

    def process_data(self):
        self.count_alliteration()
        self.count_adjectives()
        self.biggest_word()
        self.readability_score()
        self.remove_html_tags()
        self.lower_case()
        self.remove_stop_words()
        self.stem_words()
Example #18
import logging
logging.basicConfig(filename="log.txt",
    level=logging.INFO,
    format="%(asctime)s %(message)s")

db, db_c = db_init()
app = Flask(__name__)
app.config["JSONIFY_PRETTYPRINT_REGULAR"] = True
app.secret_key = load_config("secret_key")
login_manager = LoginManager(app)

game_nwords = int(load_config("nwords"))
wordlist_file = "res/wordlists/de.txt"
words = [l.split() for l in open(wordlist_file).readlines()]
article_choices = ["der", "die", "das"]
hyphen_dic = Pyphen(lang="de_DE")
story_users = load_story_users()
story_filenames = find_story_filenames()


@login_manager.user_loader
def load_user(username):
    db_c.execute("SELECT username,displayname,hash from users WHERE username=?", [username])
    data = db_c.fetchone()
    if data is None:
        return None
    return User(data[0], data[1], data[2])


@login_manager.unauthorized_handler
def unauthorized_callback():
Example #19
import nltk
from nltk.corpus import words
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict
# from spellchecker import SpellChecker
# nltk.download('words')
# nltk.download('cmudict')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
import math
import re
import string
import syllables
from pyphen import Pyphen
d = cmudict.dict()
dic = Pyphen(lang="en")

# tool = language_tool_python.LanguageTool('en-US')


def removePunctuation(text):
    result = ""

    for char in text:
        if char in (".", ",", "!", "?", "؟", "،", "\","
                    "/", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+",
                    ":", ";", "<", ">", "=", "[", "]", "^", "_", "`", "{", "}",
                    "|", "~"):
            continue
        if char in ("-", "\n", "\r", "\t"):
            char = " "
Example #20
    def __init__(self, dataset, content_column):
        self.dataset = dataset.reset_index()
        self.content_column = content_column
        self.dic = Pyphen(lang='en_US')

        self.process_data()
Example #21
import textstat
from sklearn.preprocessing import label_binarize
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import pkg_resources
import ast
import spacy
#from collections import Counter
from pyphen import Pyphen
import pickle
#import xgboost

# load the spaCy language model; it must be downloaded separately
nlp = spacy.load('en_core_web_md')
pyphen_dic = Pyphen(lang='en')

# set word lists to be used

## This corpus comes from the Cambridge English Corpus of spoken English and includes
## all the NGSL and SUP words needed to get 90% coverage.
NGSL_wordlist = set([
    ln.decode('utf-8').strip() for ln in pkg_resources.resource_stream(
        'financial_readability', 'word_lists/NGSL_wordlist.txt')
])

## The Business Service List 1.0, also known as the BSL (Browne, C. & Culligan, B., 2016) is a list of approximately 1700 words
## that occur with very high frequency within the domain of general business English. Based on a 64.5 million word corpus of business
## texts, newspapers, journals and websites, the BSL 1.0 version gives approximately 97% coverage of general business English materials
## when learned in combination with the 2800 words of core general English in the New General Service List or NGSL (Browne, C., Culligan, B., and Phillips, J. 2013)
BSL_wordlist = set([
Example #22
 def __init__(self, language):
     self.pyphen = Pyphen(lang=language)
Example #23
def count_syllables(word):
    pyphen_dic = Pyphen(lang='en')
    syllabled_word = pyphen_dic.inserted(word)
    return syllabled_word.count('-') + 1
Example #24
                    type=str,
                    help='E-Mail subject related to survey mails')
parser.add_argument('-notxt',
                    action='store_true',
                    help='Disable saving of results to txt file')
parser.add_argument('-nobar',
                    action='store_true',
                    help='Disable plotting of bar plots')
parser.add_argument('-nopie',
                    action='store_true',
                    help='Disable plotting of pie plots')
args = parser.parse_args()

### CONFIG - SET VARIABLES AND DEFAULTS HERE ###
#pyphen dictionary
german_dict = Pyphen(lang='de_DE')
#e-mail information
login = args.login
password = args.password
pop_server = (args.pop_server if args.pop_server else 'pop3.web.de')
filter_subject = (args.subject if args.subject else 'Evaluation')
#file information
write_txt = not args.notxt
write_bars = not args.nobar
write_pies = not args.nopie
txt_file_name = 'results.txt'
bar_file_name = 'result_bars.pdf'
pie_file_name = 'result_pies.pdf'
#allowed text lengths until new line for plot labels
pie_wrap_len = 19
bar_wrap_len = 10
Example #25
def DataPreprocessing(data, train=1):

    global docCount

    #EXTRACTING DENSE FEATURES
    sentiment = np.array([])
    word_count = np.array([])
    char_count = np.array([])
    sent_count = np.array([])
    syl_count = np.array([])
    mention_count = np.array([])
    url_count = np.array([])
    special_count = np.array([])
    cat_count = np.array([])
    dic = Pyphen(lang='en')
    for text in data["tweet"]:
        blob = TextBlob(text)

        #OPTIONAL SPELLING CORRECTION
        #data.loc[docCount,"tweet"]=str(blob.correct())
        #print(data.loc[docCount,"tweet"],type(data.loc[docCount,"tweet"]))

        url_count = np.append(url_count, blob.words.count("URL"))
        mention_count = np.append(mention_count, blob.words.count("USER"))
        cat_count = np.append(cat_count, sum(c == '#' for c in text))
        special_count = np.append(
            special_count,
            sum(not c.isalnum() and c != ' ' and c != '@' and c != '#'
                for c in text))
        syl_count = np.append(
            syl_count,
            len(TextBlob(dic.inserted(text).replace('-', ' ')).words))
        char_count = np.append(char_count, len(text))
        word_count = np.append(word_count, len(blob.words))
        sent_count = np.append(sent_count, len(blob.sentences))
        sentiment = np.append(sentiment, blob.sentiment.polarity)
        docCount += 1

    #INITIALIZING STEMMER AND STOP WORD CORPUS
    stop_words = set(stopwords.words('english'))
    porter_stemmer = PorterStemmer()

    #POS TAGGING
    POS = CMUTweetTagger.runtagger_parse(data["tweet"])
    POSDictionary = {
        "N": "nn",
        "O": "pro",
        "S": "np",
        "^": "nnps",
        "Z": "nnpz",
        "L": "vl",
        "M": "nv",
        "V": "md",
        "A": "adj",
        "R": "adv",
        "!": "int",
        "D": "det",
        "P": "ppt",
        "&": "cc",
        "T": "rp",
        "X": "ex",
        "Y": "exv",
        "#": "cat",
        "@": "tar",
        "~": "dsc",
        ",": "punc",
        "$": "num",
        "U": "url",
        "E": "emo",
        "G": "abr"
    }

    #PREPROCESSING (REMOVE STOP WORDS AND STEMMING)
    docCount = 0
    for doc in POS:
        filtered_sentence = []
        for word in doc:
            if word[0] not in stop_words:
                filtered_sentence.append(porter_stemmer.stem(
                    word[0]))  #+'_'+POSDictionary[word[1]])
        data.loc[docCount, "tweet"] = filtered_sentence
        data.loc[docCount, "tweet"] = " ".join(data.loc[docCount, "tweet"])
        docCount += 1

    #REPLACING LABEL (subtask) WITH INTEGER
    if (train == 1):
        data['label'] = data['subtask'].factorize()[0]
    data['sentiment'] = sentiment + 1
    data['sent_count'] = sent_count
    data['word_count'] = word_count
    data['syl_count'] = syl_count
    data['url_count'] = url_count
    data['mention_count'] = mention_count
    data['cat_count'] = cat_count
    data['special_count'] = special_count

    #SEPARATING FEATURES AND LABELS
    X = data[[
        'tweet', 'sentiment', 'sent_count', 'word_count', 'syl_count',
        'url_count', 'mention_count', 'special_count', 'cat_count'
    ]]
    if train == 1:
        y = data['label']
    else:
        y = None
    return X, y
Example #26
 def __init__(self, lang='en_US'):
     self.dic = Pyphen(lang=lang)
Example #27
import re
from pandocfilters import Para, Str, toJSONFilter, walk
from pyphen import Pyphen

dic = Pyphen(lang='en_US', left=3, right=3)

word_detection_pattern = re.compile(r'\w{7,}', re.UNICODE)

def inpara(key, value, format, meta):
    if key == 'Para':
        return Para(walk(value, hyphenate, format, meta))

def hyphenate(key, value, format, meta):
    if key == 'Str':
        return Str(word_detection_pattern.sub(
            lambda match: dic.inserted(match.group(0), hyphen='\u00ad'),  # soft hyphen
            value))

if __name__ == "__main__":
    toJSONFilter(inpara)
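
For a quick check of the hyphenation callback outside pandoc, reusing dic and word_detection_pattern from the filter above (visible hyphens instead of the soft hyphen; only words of 7+ letters are touched, and left=3, right=3 suppresses breaks near word edges):

print(word_detection_pattern.sub(
    lambda match: dic.inserted(match.group(0), hyphen='-'),
    "hyphenation improves justified paragraphs"))
# long words gain visible break points, e.g. "hyphenation" -> "hyphen-ation"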
Example #28
def count_syllables(word):
    return max(1, len(Pyphen(lang='en_US').hd.positions(word)) + 1)
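
The one-liner above rebuilds the dictionary on every call and reaches into the internal hd attribute. A sketch of the same count through the public positions() API, with a single module-level instance:

from pyphen import Pyphen

_dic = Pyphen(lang='en_US')

def count_syllables(word):
    # hyphenation points + 1 approximates the syllable count
    return max(1, len(_dic.positions(word)) + 1)

print(count_syllables("syllable"))  # typically 3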