import pyphen

# SOFT_HYPHEN is assumed to be the 5-character HTML entity "&shy;"; the
# 5-character slices below only make sense for a marker of that length.
SOFT_HYPHEN = "&shy;"


def _shyphenated(dic: pyphen.Pyphen, word: str) -> str:
    word = dic.inserted(word, hyphen=SOFT_HYPHEN)
    # Drop a break point that falls after only the first two or before only
    # the last two characters of the word.
    if word[2:7] == SOFT_HYPHEN:
        word = word[:2] + word[7:]
    if word[-7:-2] == SOFT_HYPHEN:
        word = word[:-7] + word[-2:]
    return word
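# A minimal usage sketch, assuming the SOFT_HYPHEN definition above and an
# installed pyphen en_US dictionary:
_dic = pyphen.Pyphen(lang="en_US")
print(_shyphenated(_dic, "hyphenation"))  # no break marker within two characters of either end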
def syllable_count(self, text, lang=None):
    """Calculate the number of syllables in `text`."""
    if lang:
        warnings.warn(
            "The 'lang' argument has been moved to "
            "'textstat.set_lang(<lang>)'. This argument will be removed "
            "in the future.",
            DeprecationWarning
        )
    if isinstance(text, bytes):
        text = text.decode(self.text_encoding)
    text = text.lower()
    text = self.remove_punctuation(text)
    if not text:
        return 0

    dic = Pyphen(lang=self.__lang)
    count = 0
    for word in text.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    return count
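# Usage sketch: the deprecation message above refers to the textstat package,
# which exposes this method at module level, so with textstat installed:
import textstat
textstat.set_lang("en")
print(textstat.syllable_count("The quick brown fox jumps over the lazy dog"))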
def _count_syllables(self, word):
    """Count the syllables in a word."""
    dic = Pyphen(lang=self.pyphen_language)
    word = dic.inserted(word)
    s_count = word.count("-") + 1
    return s_count
def count_syllables(word):
    # Necessary for the syllable count.
    from pyphen import Pyphen
    dic = Pyphen(lang='en_US')
    word_hyphenated = dic.inserted(word)
    # A hyphen inside the original word ends up flanked by inserted hyphens
    # ("---"); reduce those runs back to a single hyphen before counting.
    word_hyphenated = word_hyphenated.replace("---", "-")
    syllables = max(1, word_hyphenated.count("-") + 1)
    return syllables
import string

from pyphen import Pyphen

exclude = set(string.punctuation)


def syllable_count(text):
    text = text.lower()
    text = "".join(x for x in text if x not in exclude)
    dic = Pyphen(lang='ru_RU')
    count = 0
    for word in text.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    return count
def split_syllables(self, lang='fr'):
    """
    Split the text into syllables. If the text is not in French, the
    language must be specified, and it must be available in the pyphen
    module (pyphen.LANGUAGES.keys()).

    :param lang: language of the text
    :return: returns `self` so that operations can be chained
    """
    dic = Pyphen(lang=lang)
    self.text = ' '.join(dic.inserted(w, '/') for w in self.text.split()).replace('-/', '-')
    return self
def count_syllables(self):
    """
    Count the syllables of the text, using the Pyphen dictionary to break
    the words into syllables, then counting the syllables with the regular
    expression below.

    :return: the number of syllables found
    """
    dic = Pyphen(lang='en')
    text = ' '.join(dic.inserted(w, ' ') for w in self.text.split())
    return len(re.findall(r'(\w+|\,|\;|\b\.|\:|\?)', text))
def split_syllables(self, lang='en'):
    """
    Split the text into syllables. If the text is not in English, the
    language must be specified, and it must be available in the pyphen
    module (pyphen.LANGUAGES.keys()).

    :param lang: language of the text
    :return: returns `text` in the requested form
    """
    dic = Pyphen(lang=lang)
    text = ' '.join(dic.inserted(w, ' ') for w in self.text.split())
    text = re.sub(r'(\w+|\,|\;|\:|\.\W+\b)', r'\1-', text)
    text = re.sub(r'(\s)', r'', text)
    return text
def separar_silabas(palavra, separador):
    """
    Split the syllables of the word passed to the function. The caller can
    also choose which separator to use, to fit their own code more
    naturally.
    """
    # TODO: implement syllable processing natively.
    from pyphen import Pyphen
    dic = Pyphen(lang="pt_BR")
    _palavra_sep = dic.inserted(palavra.lower())
    if separador == "-":
        return _palavra_sep
    return _palavra_sep.replace("-", separador)
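# Usage sketch; the exact split depends on the installed pt_BR dictionary:
print(separar_silabas("computador", "."))  # e.g. "com.pu.ta.dor"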
def count_syl_line(line):
    """Count the syllables in a line by hyphenating each word with Pyphen."""
    from pyphen import Pyphen
    from re import split
    if line == '':
        return -1
    dic = Pyphen(lang='en_US')
    # Sum per occurrence rather than into a dict keyed by word, so that a
    # word appearing twice in the line is counted twice.
    return int(sum(len(split("-", dic.inserted(word)))
                   for word in split(" ", line)))
import string

from pyphen import Pyphen


def Syllable_Count(s_word):
    # Check for None before any string methods are called on the argument.
    if s_word is None:
        return 0
    exclude = list(string.punctuation)
    s_word = s_word.lower()
    s_word = "".join(x for x in s_word if x not in exclude)
    if len(s_word) == 0:
        return 0
    dic = Pyphen(lang='en_US')
    count = 0
    for word in s_word.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    return count
def Syllable_Count(s_word, lang, measures):
    # Check for None before any string methods are called on the argument.
    if s_word is None:
        measures["no_of_syllables"] = 0
        return
    exclude = list(string.punctuation)
    s_word = s_word.lower()
    s_word = "".join(x for x in s_word if x not in exclude)
    if len(s_word) == 0:
        measures["no_of_syllables"] = 0
        return
    dic = Pyphen(lang=lang)
    count = 0
    for word in s_word.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    measures["no_of_syllables"] = count
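# Usage sketch for the out-parameter variant above: the count is written into
# the caller-supplied dict instead of being returned.
measures = {}
Syllable_Count("hello world", "en_US", measures)
print(measures["no_of_syllables"])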
def syllable_count(self, text, lang='en_US'):
    """Calculate the number of syllables in `text`."""
    text = text.lower()
    # `exclude` is assumed to be a module-level collection of punctuation
    # characters, e.g. set(string.punctuation).
    text = "".join(x for x in text if x not in exclude)
    if not text:
        return 0
    dic = Pyphen(lang=lang)
    count = 0
    for word in text.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    return count
def syllable_count(text, lang='en_US'):
    """Calculate the number of syllables in `text`."""
    text = text.lower()
    # `delete_mask_return_sen` and `remove_punctuation` are project-specific
    # helpers assumed to be defined elsewhere in the module.
    text = delete_mask_return_sen(text)
    text = remove_punctuation(text).strip()
    if not text:
        return 0
    dic = Pyphen(lang=lang)
    count = 0
    for word in text.split(' '):
        if word:
            word_hyphenated = dic.inserted(word)
            count += max(1, word_hyphenated.count("-") + 1)
    return count
def syllable_count(self, text, lang='en_US'):
    """Calculate the number of syllables in `text`."""
    if isinstance(text, bytes):
        text = text.decode(self.text_encoding)
    text = text.lower()
    text = self.remove_punctuation(text)
    if not text:
        return 0
    dic = Pyphen(lang=lang)
    count = 0
    for word in text.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    return count
def DataPreprocessing(data, train=1):
    global docCount
    # EXTRACTING DENSE FEATURES
    sentiment = np.array([])
    word_count = np.array([])
    char_count = np.array([])
    sent_count = np.array([])
    syl_count = np.array([])
    mention_count = np.array([])
    url_count = np.array([])
    special_count = np.array([])
    cat_count = np.array([])
    dic = Pyphen(lang='en')
    for text in data["tweet"]:
        blob = TextBlob(text)
        # OPTIONAL SPELLING CORRECTION
        # data.loc[docCount, "tweet"] = str(blob.correct())
        # print(data.loc[docCount, "tweet"], type(data.loc[docCount, "tweet"]))
        url_count = np.append(url_count, blob.words.count("URL"))
        mention_count = np.append(mention_count, blob.words.count("USER"))
        cat_count = np.append(cat_count, sum(c == '#' for c in text))
        special_count = np.append(
            special_count,
            sum(not c.isalnum() and c != ' ' and c != '@' and c != '#'
                for c in text))
        syl_count = np.append(
            syl_count,
            len(TextBlob(dic.inserted(text).replace('-', ' ')).words))
        char_count = np.append(char_count, len(text))
        word_count = np.append(word_count, len(blob.words))
        sent_count = np.append(sent_count, len(blob.sentences))
        sentiment = np.append(sentiment, blob.sentiment.polarity)
        docCount += 1

    # INITIALIZING STEMMER AND STOP WORD CORPUS
    stop_words = set(stopwords.words('english'))
    porter_stemmer = PorterStemmer()

    # POS TAGGING
    POS = CMUTweetTagger.runtagger_parse(data["tweet"])
    POSDictionary = {
        "N": "nn", "O": "pro", "S": "np", "^": "nnps", "Z": "nnpz",
        "L": "vl", "M": "nv", "V": "md", "A": "adj", "R": "adv",
        "!": "int", "D": "det", "P": "ppt", "&": "cc", "T": "rp",
        "X": "ex", "Y": "exv", "#": "cat", "@": "tar", "~": "dsc",
        ",": "punc", "$": "num", "U": "url", "E": "emo", "G": "abr"
    }

    # PREPROCESSING (REMOVE STOP WORDS AND STEMMING)
    docCount = 0
    for doc in POS:
        filtered_sentence = []
        for word in doc:
            if word[0] not in stop_words:
                filtered_sentence.append(porter_stemmer.stem(word[0]))  # + '_' + POSDictionary[word[1]]
        data.loc[docCount, "tweet"] = filtered_sentence
        data.loc[docCount, "tweet"] = " ".join(data.loc[docCount, "tweet"])
        docCount += 1

    # REPLACING LABEL (subtask) WITH INTEGER
    if train == 1:
        data['label'] = data['subtask'].factorize()[0]
    data['sentiment'] = sentiment + 1
    data['sent_count'] = sent_count
    data['word_count'] = word_count
    data['syl_count'] = syl_count
    data['url_count'] = url_count
    data['mention_count'] = mention_count
    data['cat_count'] = cat_count
    data['special_count'] = special_count

    # SEPARATING FEATURES AND LABELS
    X = data[[
        'tweet', 'sentiment', 'sent_count', 'word_count', 'syl_count',
        'url_count', 'mention_count', 'special_count', 'cat_count'
    ]]
    if train == 1:
        y = data['label']
    else:
        y = None
    return X, y
from pyphen import Pyphen


def count_syllables(word):
    pyphen_dic = Pyphen(lang='en')
    syllabled_word = pyphen_dic.inserted(word)
    return syllabled_word.count('-') + 1
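# Quick sanity check; exact counts depend on pyphen's en dictionary:
for w in ("cat", "python", "hyphenation"):
    print(w, count_syllables(w))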
def _shyphenate_text(dic: pyphen.Pyphen, text: str) -> str:
    # Texts shorter than five characters are left unhyphenated.
    if len(text) < 5:
        return text
    return " ".join(
        dic.inserted(word, hyphen=SOFT_HYPHEN) for word in text.split(" "))
# Assumes `re`, `math`, `tqdm`, and `Pyphen` are imported at module level and
# that `nlp` is a loaded spaCy pipeline (e.g. nlp = spacy.load("en_core_web_sm")).
class ContentCleaner:
    def __init__(self, dataset, content_column):
        self.dataset = dataset.reset_index()
        self.content_column = content_column
        self.dic = Pyphen(lang='en_US')
        self.process_data()

    def __str__(self):
        return """
        This class takes a raw dataset of data and builds a clean NLP
        dataset with features out of it
        """

    def lower_case(self):
        self.dataset[self.content_column] = self.dataset[
            self.content_column].str.lower()

    def remove_html_tags(self):
        # Match individual HTML tags non-greedily.
        cleanr = re.compile('<.*?>')
        self.dataset[self.content_column] = [
            re.sub(cleanr, '', r) for r in self.dataset[self.content_column]
        ]

    def stem_words(self):
        """
        https://stackoverflow.com/questions/38763007/how-to-use-spacy-lemmatizer-to-get-a-word-into-basic-form
        """
        print("Stemming Words")
        for i, row in tqdm(self.dataset.iterrows()):
            stemmed_string = ""
            content_row = nlp(row["content"])
            for word in content_row:
                stemmed_string += " " + word.lemma_
            self.dataset.loc[i, "content"] = stemmed_string

    def remove_stop_words(self):
        print("Removing Stop Words")
        for i, row in tqdm(self.dataset.iterrows()):
            sentence_sans_stop_words = ""
            content_row = nlp(row["content"])
            for word in content_row:
                if word.is_stop is False:
                    sentence_sans_stop_words += " " + word.text
            self.dataset.loc[i, "content"] = sentence_sans_stop_words
            self.dataset.loc[i, "num_words"] = len(content_row)

    def count_adjectives(self):
        """
        see: https://spacy.io/api/annotation
        https://spacy.io/usage/linguistic-features
        """
        print("Counting Adjectives")
        for i, row in tqdm(self.dataset.iterrows()):
            adjective_count = 0
            content_row = nlp(row["content"])
            for word in content_row:
                if word.pos_ == "ADJ":
                    adjective_count += 1
            self.dataset.loc[i, "adjectives"] = adjective_count

    def biggest_word(self):
        """
        Taken from https://github.com/shivam5992/textstat
        """
        self.dic = Pyphen(lang='en_US')
        print("Finding Biggest Words")
        for i, row in tqdm(self.dataset.iterrows()):
            biggest_word = 0
            content_row = nlp(row["content"])
            for word in content_row:
                word_hyphenated = self.dic.inserted(word.text)
                word_size = max(1, word_hyphenated.count("-") + 1)
                if word_size > biggest_word:
                    biggest_word = word_size
            self.dataset.loc[i, "biggest_word_syllables"] = biggest_word

    def readability_score(self):
        """
        Taken from - https://github.com/shivam5992/textstat
        Based on The Flesch Reading Ease formula
        """
        def avg_sentence_length(text):
            sentences = re.split(r' *[\.\?!][\'"\)\]]*[ |\n](?=[A-Z])', text)
            ignore_count = 0
            sentence_lengths = []
            for sentence in sentences:
                if len(sentence.split(" ")) <= 2:
                    ignore_count += 1
                else:
                    sentence_lengths.append(len(sentence.split(" ")))
            sentence_count = max(1, len(sentences) - ignore_count)
            return sum(sentence_lengths) / sentence_count

        def avg_syllables_per_word(text):
            words = nlp(text)
            syllables = []
            self.dic = Pyphen(lang='en_US')
            for word in words:
                word_hyphenated = self.dic.inserted(word.text)
                syllables.append(max(1, word_hyphenated.count("-") + 1))
            return sum(syllables) / len(words)

        def legacy_round(number, points=0):
            p = 10 ** points
            return float(
                math.floor((number * p) + math.copysign(0.5, number))) / p

        # code from https://github.com/shivam5992/textstat
        print("Assessing Readability Score")
        for i, row in tqdm(self.dataset.iterrows()):
            sentence_length = avg_sentence_length(row["content"])
            syllables_per_word = avg_syllables_per_word(row["content"])
            flesch = (206.835 - float(1.015 * sentence_length) -
                      float(84.6 * syllables_per_word))
            Flesch_reading_score = legacy_round(flesch, 2)
            self.dataset.loc[i, "flesch_reading_score"] = Flesch_reading_score

    def count_alliteration(self):
        print("Counting Alliteration")
        for i, row in tqdm(self.dataset.iterrows()):
            repeat_letter = None
            consecutive = False
            alliteration_count = 0
            if len(row["content"]) > 0:
                words = row["content"].split(" ")
                for word in words:
                    if len(word) > 0:
                        # Start of a new alliteration
                        if str(word)[0] == repeat_letter and consecutive is False:
                            alliteration_count += 1
                            repeat_letter = str(word)[0]
                            consecutive = True
                        # In the middle of a consecutive streak of alliteration
                        elif str(word)[0] == repeat_letter and consecutive:
                            repeat_letter = str(word)[0]
                        # End of an alliteration
                        elif str(word)[0] != repeat_letter:
                            repeat_letter = str(word)[0]
                            consecutive = False
                self.dataset.loc[i, "alliteration"] = alliteration_count
            else:
                self.dataset.loc[i, "alliteration"] = 0

    def process_data(self):
        self.count_alliteration()
        self.count_adjectives()
        self.biggest_word()
        self.readability_score()
        self.remove_html_tags()
        self.lower_case()
        self.remove_stop_words()
        self.stem_words()
from pyphen import Pyphen


def hyphenate(text, hyphen='\u00ad'):  # default is the Unicode soft hyphen
    py = Pyphen(lang='de_DE')
    words = text.split(' ')
    return ' '.join([py.inserted(word, hyphen=hyphen) for word in words])
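# Usage sketch: soft hyphens are invisible in most terminals, so replace them
# with a visible marker when inspecting the output.
print(hyphenate("Dampfschifffahrt").replace('\u00ad', '|'))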