Example #1
import pyphen

def _shyphenated(dic: pyphen.Pyphen, word: str) -> str:
    """Insert soft hyphens into a word, dropping any break point that
    falls within two characters of the start or end of the word."""
    word = dic.inserted(word, hyphen=SOFT_HYPHEN)
    # The slice widths assume SOFT_HYPHEN is a five-character string,
    # such as the HTML entity "&shy;".
    if word[2:7] == SOFT_HYPHEN:
        word = word[:2] + word[7:]
    if word[-7:-2] == SOFT_HYPHEN:
        word = word[:-7] + word[-2:]
    return word
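A minimal usage sketch. SOFT_HYPHEN is defined elsewhere in the source project; the slice widths above ([2:7] and [-7:-2]) imply a five-character value such as the HTML entity "&shy;", which is assumed here:

SOFT_HYPHEN = "&shy;"  # assumed value; the project defines this constant elsewhere

dic = pyphen.Pyphen(lang="en_US")
print(_shyphenated(dic, "hyphenation"))  # soft hyphens at break points, except near the word edges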
Example #2
    def syllable_count(self, text, lang=None):
        """
        Function to calculate syllable words in a text.
        I/P - a text
        O/P - number of syllable words
        """
        if lang:
            warnings.warn(
                "The 'lang' argument has been moved to "
                "'textstat.set_lang(<lang>)'. This argument will be removed "
                "in the future.",
                DeprecationWarning
            )
        if isinstance(text, bytes):
            text = text.decode(self.text_encoding)

        text = text.lower()
        text = self.remove_punctuation(text)

        if not text:
            return 0

        dic = Pyphen(lang=self.__lang)
        count = 0
        for word in text.split(' '):
            word_hyphenated = dic.inserted(word)
            count += max(1, word_hyphenated.count("-") + 1)
        return count
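The count relies on pyphen inserting a hyphen at every hyphenation point, so syllables are estimated as hyphens + 1. A minimal standalone sketch of the same trick (assuming the en_US dictionary that ships with pyphen):

from pyphen import Pyphen

dic = Pyphen(lang="en_US")
word = dic.inserted("hyphenation")  # e.g. "hy-phen-ation"
print(word.count("-") + 1)          # hyphenation points + 1 ~= syllable count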
Example #3
    def _count_syllables(self, word):
        """Counting the syllables in a word."""
        dic = Pyphen(lang=self.pyphen_language)
        word = dic.inserted(word)
        s_count = word.count("-") + 1

        return s_count
Example #4
def count_syllables(word):
    # 'en_EN' in the original is not a standard pyphen locale;
    # 'en_US' (or 'en_GB') is almost certainly what was intended.
    dic = Pyphen(lang='en_US')
    word_hyphenated = dic.inserted(word)
    # A hyphen already present in the word can end up flanked by inserted
    # hyphens; collapse such runs back to a single hyphen before counting.
    word_hyphenated = word_hyphenated.replace("---", "-")
    syllables = max(1, word_hyphenated.count("-") + 1)
    return syllables
Example #5
def syllable_count(text):
    text = text.lower()
    # 'exclude' is assumed to be a module-level collection of punctuation
    # characters, e.g. set(string.punctuation).
    text = "".join(x for x in text if x not in exclude)
    dic = Pyphen(lang='ru_RU')
    count = 0
    for word in text.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    return count
Example #6
    def split_syllables(self, lang='fr'):
        """
        Split the text into syllables. If the text is not in French, the
        language must be specified, and it must be available in the pyphen
        module (pyphen.LANGUAGES.keys()).
        :param lang: language of the text
        :return: returns `self` so that calls can be chained
        """
        dic = Pyphen(lang=lang)
        self.text = ' '.join(dic.inserted(w, '/') for w in self.text.split()).replace('-/', '-')
        return self
Example #7
    def count_syllables(self):
        """
        Count the syllables in a text, using the Pyphen dictionary to split
        the words into syllables, then counting them with the regular
        expression below.
        :return: the number of syllables found
        """
        dic = Pyphen(lang='en')
        text = ' '.join(dic.inserted(w, ' ') for w in self.text.split())
        return len(re.findall(r'(\w+|\,|\;|\b\.|\:|\?)', text))
Example #8
    def split_syllables(self, lang='en'):
        """
        Split the text into syllables. If the text is not in English, the
        language must be specified, and it must be available in the pyphen
        module (pyphen.LANGUAGES.keys()).
        :param lang: language of the text
        :return: returns `text` in the requested form
        """
        dic = Pyphen(lang=lang)
        text = ' '.join(dic.inserted(w, ' ') for w in self.text.split())
        text = re.sub(r'(\w+|\,|\;|\:|\.\W+\b)', r'\1-', text)
        text = re.sub(r'(\s)', r'', text)
        return text
Example #9
def separar_silabas(palavra, separador):
    """
    Split the syllables of the word passed to the function. The caller
    can also choose which separator to use, so the output fits their
    code better.
    """
    # TODO: implement syllable processing natively
    from pyphen import Pyphen
    _palavra_sep = palavra.lower()
    dic = Pyphen(lang="pt_BR")
    _palavra_sep = dic.inserted(_palavra_sep)
    if separador == "-":
        return _palavra_sep
    _palavra_sep = str(_palavra_sep).replace("-", separador)
    return _palavra_sep
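A usage sketch (the pt_BR dictionary ships with pyphen by default):

print(separar_silabas("computador", "-"))  # e.g. "com-pu-ta-dor"
print(separar_silabas("computador", "."))  # e.g. "com.pu.ta.dor"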
Example #11
def count_syl_line(line):
    """Count the total number of syllables in a line (-1 for an empty line)."""
    from pyphen import Pyphen
    dic = Pyphen(lang='en_US')
    if line == '':
        return -1
    # Sum per word directly; the original built a dict keyed by word,
    # which silently merged repeated words in the line.
    return sum(len(dic.inserted(word).split("-")) for word in line.split(" "))
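A usage sketch:

print(count_syl_line("the quick brown fox"))  # total syllables in the line
print(count_syl_line(""))                     # -1 sentinel for an empty line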
Example #12
def Syllable_Count(s_word):
    # Guard before any string operations; the original checked for None
    # only after calling .lower(), which would already have raised.
    if not s_word:
        return 0
    exclude = list(string.punctuation)
    s_word = s_word.lower()
    s_word = "".join(x for x in s_word if x not in exclude)
    if not s_word:
        return 0
    dic = Pyphen(lang='en_US')
    count = 0
    for word in s_word.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    return count
Example #13
def Syllable_Count(s_word, lang, measures):
    # Guard before any string operations; the original checked for None
    # only after calling .lower(), which would already have raised.
    if s_word is None:
        measures["no_of_syllables"] = 0
        return
    exclude = list(string.punctuation)
    s_word = s_word.lower()
    s_word = "".join(x for x in s_word if x not in exclude)
    if len(s_word) == 0:
        measures["no_of_syllables"] = 0
        return
    dic = Pyphen(lang=lang)
    count = 0
    for word in s_word.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    measures["no_of_syllables"] = count
Example #14
    def syllable_count(self, text, lang='en_US'):
        """
        Function to calculate syllable words in a text.
        I/P - a text
        O/P - number of syllable words
        """
        text = text.lower()
        text = "".join(x for x in text if x not in exclude)

        if not text:
            return 0

        dic = Pyphen(lang=lang)
        count = 0
        for word in text.split(' '):
            word_hyphenated = dic.inserted(word)
            count += max(1, word_hyphenated.count("-") + 1)
        return count
Example #15
def syllable_count(text, lang='en_US'):
    """
    Function to calculate syllable words in a text.
    I/P - a text
    O/P - number of syllable words
    """
    text = text.lower()
    text = delete_mask_return_sen(text)
    text = remove_punctuation(text).strip()
    if not text:
        return 0
    dic = Pyphen(lang=lang)
    count = 0
    for word in text.split(' '):
        if word:
            word_hyphenated = dic.inserted(word)
            count += max(1, word_hyphenated.count("-") + 1)
    return count
Example #16
File: textstat.py Project: matifq/textstat
    def syllable_count(self, text, lang='en_US'):
        """
        Function to calculate syllable words in a text.
        I/P - a text
        O/P - number of syllable words
        """
        if isinstance(text, bytes):
            text = text.decode(self.text_encoding)

        text = text.lower()
        text = self.remove_punctuation(text)

        if not text:
            return 0

        dic = Pyphen(lang=lang)
        count = 0
        for word in text.split(' '):
            word_hyphenated = dic.inserted(word)
            count += max(1, word_hyphenated.count("-") + 1)
        return count
Example #18
def DataPreprocessing(data, train=1):

    global docCount

    #EXTRACTING DENSE FEATURES
    sentiment = np.array([])
    word_count = np.array([])
    char_count = np.array([])
    sent_count = np.array([])
    syl_count = np.array([])
    mention_count = np.array([])
    url_count = np.array([])
    special_count = np.array([])
    cat_count = np.array([])
    dic = Pyphen(lang='en')
    for text in data["tweet"]:
        blob = TextBlob(text)

        #OPTIONAL SPELLING CORRECTION
        #data.loc[docCount,"tweet"]=str(blob.correct())
        #print(data.loc[docCount,"tweet"],type(data.loc[docCount,"tweet"]))

        url_count = np.append(url_count, blob.words.count("URL"))
        mention_count = np.append(mention_count, blob.words.count("USER"))
        cat_count = np.append(cat_count, sum(c == '#' for c in text))
        special_count = np.append(
            special_count,
            sum(not c.isalnum() and c != ' ' and c != '@' and c != '#'
                for c in text))
        syl_count = np.append(
            syl_count,
            len(TextBlob(dic.inserted(text).replace('-', ' ')).words))
        char_count = np.append(char_count, len(text))
        word_count = np.append(word_count, len(blob.words))
        sent_count = np.append(sent_count, len(blob.sentences))
        sentiment = np.append(sentiment, blob.sentiment.polarity)
        docCount += 1

    #INITIALIZING STEMMER AND STOP WORD CORPUS
    stop_words = set(stopwords.words('english'))
    porter_stemmer = PorterStemmer()

    #POS TAGGING
    POS = CMUTweetTagger.runtagger_parse(data["tweet"])
    POSDictionary = {
        "N": "nn",
        "O": "pro",
        "S": "np",
        "^": "nnps",
        "Z": "nnpz",
        "L": "vl",
        "M": "nv",
        "V": "md",
        "A": "adj",
        "R": "adv",
        "!": "int",
        "D": "det",
        "P": "ppt",
        "&": "cc",
        "T": "rp",
        "X": "ex",
        "Y": "exv",
        "#": "cat",
        "@": "tar",
        "~": "dsc",
        ",": "punc",
        "$": "num",
        "U": "url",
        "E": "emo",
        "G": "abr"
    }

    #PREPROCESSING (REMOVE STOP WORDS AND STEMMING)
    docCount = 0
    for doc in POS:
        filtered_sentence = []
        for word in doc:
            if word[0] not in stop_words:
                filtered_sentence.append(porter_stemmer.stem(
                    word[0]))  #+'_'+POSDictionary[word[1]])
        data.loc[docCount, "tweet"] = filtered_sentence
        data.loc[docCount, "tweet"] = " ".join(data.loc[docCount, "tweet"])
        docCount += 1

    #REPLACING LABEL (subtask) WITH INTEGER
    if train == 1:
        data['label'] = data['subtask'].factorize()[0]
    data['sentiment'] = sentiment + 1
    data['sent_count'] = sent_count
    data['word_count'] = word_count
    data['syl_count'] = syl_count
    data['url_count'] = url_count
    data['mention_count'] = mention_count
    data['cat_count'] = cat_count
    data['special_count'] = special_count

    #SEPARATING FEATURES AND LABELS
    X = data[[
        'tweet', 'sentiment', 'sent_count', 'word_count', 'syl_count',
        'url_count', 'mention_count', 'special_count', 'cat_count'
    ]]
    if train == 1:
        y = data['label']
    else:
        y = None
    return X, y
Example #19
def count_syllables(word):
    pyphen_dic = Pyphen(lang='en')
    syllabled_word = pyphen_dic.inserted(word)
    return syllabled_word.count('-') + 1
Example #20
def _shyphenate_text(dic: pyphen.Pyphen, text: str) -> str:
    # Very short texts are returned untouched; they are too short to hyphenate.
    if len(text) < 5:
        return text
    return " ".join(
        dic.inserted(word, hyphen=SOFT_HYPHEN) for word in text.split(" "))
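A usage sketch, under the same SOFT_HYPHEN assumption as in Example #1:

import pyphen

SOFT_HYPHEN = "&shy;"  # assumed value, as in Example #1

dic = pyphen.Pyphen(lang="en_US")
print(_shyphenate_text(dic, "readability and hyphenation"))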
Example #21
class ContentCleaner:
    def __init__(self, dataset, content_column):
        self.dataset = dataset.reset_index()
        self.content_column = content_column
        self.dic = Pyphen(lang='en_US')

        self.process_data()

    def __str__(self):
        return """
            This class takes a raw dataset of data and builds a
            clean NLP dataset with features of out of it
        """

    def lower_case(self):
        self.dataset[self.content_column] = self.dataset[
            self.content_column].str.lower()

    def remove_html_tags(self):
        # Note: this pattern strips everything from the first tag to the
        # last, including the text between tags; the usual tag-only
        # pattern would be '<.*?>'.
        cleanr = re.compile('<.*?>.*<.*>')
        self.dataset[self.content_column] = [
            re.sub(cleanr, '', r) for r in self.dataset[self.content_column]
        ]

    def stem_words(self):
        """
        https://stackoverflow.com/questions/38763007/how-to-use-spacy-lemmatizer-to-get-a-word-into-basic-form
        """
        print("Stemming Words")
        for i, row in tqdm(self.dataset.iterrows()):
            stemmed_string = ""
            content_row = nlp(row["content"])
            for word in content_row:
                stemmed_string += " " + word.lemma_
            self.dataset.loc[i, "content"] = stemmed_string

    def remove_stop_words(self):
        print("Removing Stop Words")
        for i, row in tqdm(self.dataset.iterrows()):
            sentence_sans_stop_words = ""
            content_row = nlp(row["content"])

            for word in content_row:
                if word.is_stop is False:
                    sentence_sans_stop_words += " " + word.text
            self.dataset.loc[i, "content"] = sentence_sans_stop_words
            self.dataset.loc[i, "num_words"] = len(content_row)

    def count_adjectives(self):
        """
        see:
        https://spacy.io/api/annotation
        https://spacy.io/usage/linguistic-features
        """
        print("Counting Adjectives")
        for i, row in tqdm(self.dataset.iterrows()):
            adjective_count = 0
            content_row = nlp(row["content"])

            for word in content_row:
                if word.pos_ == "ADJ":
                    adjective_count += 1
            self.dataset.loc[i, "adjectives"] = adjective_count

    def biggest_word(self):
        """
        Taken from https://github.com/shivam5992/textstat
        """
        self.dic = Pyphen(lang='en_US')
        print("Finding Biggest Words")
        for i, row in tqdm(self.dataset.iterrows()):
            biggest_word = 0
            content_row = nlp(row["content"])

            for word in content_row:
                word_hyphenated = self.dic.inserted(word.text)
                word_size = max(1, word_hyphenated.count("-") + 1)
                if word_size > biggest_word:
                    biggest_word = word_size

            self.dataset.loc[i, "biggest_word_syllables"] = biggest_word

    def readability_score(self):
        """
        Taken from https://github.com/shivam5992/textstat

        Based on the Flesch Reading Ease formula.
        """
        def avg_sentence_length(text):
            sentences = re.split(r' *[\.\?!][\'"\)\]]*[ |\n](?=[A-Z])', text)
            ignore_count = 0
            sentence_lengths = []
            for sentence in sentences:
                # Fragments of two words or fewer are not counted as sentences
                if len(sentence.split(" ")) <= 2:
                    ignore_count += 1
                else:
                    sentence_lengths.append(len(sentence.split(" ")))
            sentence_count = max(1, len(sentences) - ignore_count)
            total_words = sum(sentence_lengths)
            return total_words / sentence_count

        def avg_syllables_per_word(text):
            # Use the 'text' argument; the original read row["content"]
            # from the enclosing loop and ignored its own parameter.
            words = nlp(text)
            syllables = []
            self.dic = Pyphen(lang='en_US')

            for word in words:
                word_hyphenated = self.dic.inserted(word.text)
                syllables.append(max(1, word_hyphenated.count("-") + 1))
            return sum(syllables) / len(words)

        def legacy_round(number, points=0):
            p = 10**points
            return float(
                math.floor((number * p) + math.copysign(0.5, number))) / p

        # code from https://github.com/shivam5992/textstat
        print("Assessing Readability Score")
        for i, row in tqdm(self.dataset.iterrows()):
            sentence_length = avg_sentence_length(row["content"])
            syllables_per_word = avg_syllables_per_word(row["content"])
            flesch = (206.835 - float(1.015 * sentence_length) -
                      float(84.6 * syllables_per_word))
            Flesch_reading_score = legacy_round(flesch, 2)
            self.dataset.loc[i, "flesch_reading_score"] = Flesch_reading_score

    def count_alliteration(self):
        print("Counting Alliteration")
        for i, row in tqdm(self.dataset.iterrows()):
            repeat_letter = None
            consecutive = False
            alliteration_count = 0

            if len(row["content"]) > 0:

                words = row["content"].split(" ")
                for word in words:
                    if len(word) > 0:
                        # Start of new alliteration
                        if str(word)[0] == repeat_letter and consecutive is False:
                            alliteration_count += 1
                            repeat_letter = str(word)[0]
                            consecutive = True
                        # In the middle of a consecutive streak of alliteration
                        elif str(word)[0] == repeat_letter and consecutive:
                            repeat_letter = str(word)[0]

                        # End of an alliteration
                        elif str(word)[0] != repeat_letter:
                            repeat_letter = str(word)[0]
                            consecutive = False
                self.dataset.loc[i, "alliteration"] = alliteration_count

            else:
                self.dataset.loc[i, "alliteration"] = 0

    def process_data(self):
        self.count_alliteration()
        self.count_adjectives()
        self.biggest_word()
        self.readability_score()
        self.remove_html_tags()
        self.lower_case()
        self.remove_stop_words()
        self.stem_words()
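A usage sketch, assuming the module-level setup the class relies on is in place (pandas, re, math, tqdm, and nlp as a loaded spaCy pipeline, e.g. nlp = spacy.load('en_core_web_sm')). Note that several methods hard-code the "content" column name, so the column must be called exactly that:

import pandas as pd

df = pd.DataFrame({"content": [
    "The quick brown fox jumps over the lazy dog. It barks back.",
]})
cleaner = ContentCleaner(df, "content")  # __init__ runs process_data()
print(cleaner.dataset[["flesch_reading_score", "biggest_word_syllables"]])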
Example #22
File: process.py Project: ADFD/adfd
def hyphenate(text, hyphen='&shy;'):
    py = Pyphen(lang='de_de')
    words = text.split(' ')
    return ' '.join([py.inserted(word, hyphen=hyphen) for word in words])
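A usage sketch; with the default hyphen the words come back with '&shy;' entities, ready for HTML output:

print(hyphenate('Silbentrennung'))              # e.g. 'Sil&shy;ben&shy;tren&shy;nung'
print(hyphenate('Silbentrennung', hyphen='-'))  # e.g. 'Sil-ben-tren-nung'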