def analyze_text(self, text):
    words = get_words(text)
    char_count = int(get_char_count(words))
    word_count = int(len(words))
    sentences = get_sentences(text)
    len_sentences = len(sentences)
    sentence_count = int(len_sentences)
    # sentence_count = int(len(get_sentences(text)))
    syllable_count = count_syllables(words)
    complexwords_count = count_complex_words(text.decode('utf-8'))
    avg_words_p_sentence = word_count / sentence_count
    encoding_dict = detect_encoding(self.filename)

    self.analyzedVars = {
        'filename': self.filename,
        # 'text_truncated': text[:200].replace("\n", " "),
        'words': words,
        'char_cnt': float(char_count),
        'word_cnt': float(word_count),
        'sentence_cnt': float(sentence_count),
        'syllable_cnt': float(syllable_count),
        'complex_word_cnt': float(complexwords_count),
        'avg_words_p_sentence': float(avg_words_p_sentence),
        'encoding': encoding_dict['encoding'],
        'encoding_confidence': encoding_dict['confidence']
    }
def analyze_text(self, text):
    words = get_words(text)
    char_count = get_char_count(words)
    word_count = len(words)
    sentence_count = len(get_sentences(text))
    syllable_count = count_syllables(words)
    complexwords_count = count_complex_words(text)
    avg_words_p_sentence = word_count / sentence_count

    self.analyzedVars = {
        'words': words,
        'char_cnt': float(char_count),
        'word_cnt': float(word_count),
        'sentence_cnt': float(sentence_count),
        'syllable_cnt': float(syllable_count),
        'complex_word_cnt': float(complexwords_count),
        'avg_words_p_sentence': float(avg_words_p_sentence)
    }

    outData = {
        'char_cnt': float(char_count),
        'word_cnt': float(word_count),
        'sentence_cnt': float(sentence_count),
        'syllable_cnt': float(syllable_count),
        'complex_word_cnt': float(complexwords_count),
        'avg_words_p_sentence': float(avg_words_p_sentence)
    }

    return outData
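# The analyze_text variants above rely on helper functions (get_words,
# get_sentences, get_char_count, count_syllables, count_complex_words) whose
# definitions are not shown here. A minimal sketch of what they might look
# like, assuming a regex tokenizer and a vowel-group syllable heuristic; these
# are hypothetical stand-ins, not the original implementations:

import re


def get_words(text):
    # Split on non-word characters and drop empty tokens.
    return [w for w in re.split(r"\W+", text) if w]


def get_sentences(text):
    # Naive sentence split on ., ! and ?.
    return [s for s in re.split(r"[.!?]+", text) if s.strip()]


def get_char_count(words):
    # Total number of characters across all words.
    return sum(len(w) for w in words)


def count_syllables(words):
    # Each maximal run of vowels in a word counts as one syllable (minimum 1).
    total = 0
    for word in words:
        groups = re.findall(r"[aeiouy]+", word.lower())
        total += max(1, len(groups))
    return total


def count_complex_words(text):
    # "Complex" words are conventionally those with three or more syllables.
    return sum(1 for w in get_words(text)
               if len(re.findall(r"[aeiouy]+", w.lower())) >= 3)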
def syllables(self, line):
    """
    Calculation of a syllable score for a line:
    num of syllables in the line / max_syllables param
    :param line: target line, str
    :return: syllable score, float
    """
    count = count_syllables(line)
    return count / self.params['max_syllables']
def preprocess_lexicon(df, language, phon_column="PhonDISC", word_column="Word",
                       vowels="IE{VQU@i#$u312456789cq0~", n=5, smoothing=.01,
                       match_on="phones"):
    """Preprocess Celex dataframe."""
    df['num_phones'] = df[phon_column].apply(lambda x: len(x))
    df['num_sylls_est'] = df[phon_column].apply(
        lambda x: utils.count_syllables(x, language=language, vowels=vowels))
    # Remove words estimated to have <1 syllable.
    df = df[df['num_sylls_est'] > 0]
    original_counts = obtain_length_distribution(df, match_on=match_on)

    # Preprocess (English).
    df_processed = utils.preprocess_for_analysis(
        df, word_column=word_column, phon_column=phon_column).reset_index()
    unique_counts = obtain_length_distribution(df_processed, match_on=match_on)
    print(len(df_processed))

    # Build n-gram model.
    print("Creating phonotactic model...")
    unique_wordforms = list(df_processed[phon_column])
    model = create_model(unique_wordforms, n=n, smoothing=smoothing)

    # Obtain surprisal estimates (surprisal is the negated log probability).
    df_processed['log_prob'] = df_processed[phon_column].apply(
        lambda x: model.evaluate(x)[2])
    df_processed['surprisal'] = df_processed['log_prob'].apply(lambda x: -x)
    df['log_prob'] = df[phon_column].apply(lambda x: model.evaluate(x)[2])
    df['surprisal'] = df['log_prob'].apply(lambda x: -x)

    # Save dataframes to file.
    """
    print("Saving dataframes to file...")
    print("data/processed/{lang1}/{lang2}_all_reals_{n}phone.csv".format(lang1=language, lang2=language, n=n))
    df.to_csv("data/processed/{lang1}/{lang2}_all_reals_{n}phone.csv".format(lang1=language, lang2=language, n=n))
    print("data/processed/{lang1}/{lang2}_lemmas_processed_{n}phone.csv".format(lang1=language, lang2=language, n=n))
    df_processed.to_csv("data/processed/{lang1}/{lang2}_lemmas_processed_{n}phone.csv".format(lang1=language, lang2=language, n=n))
    """

    return {
        'model': model,
        'original_counts': original_counts,
        'unique_counts': unique_counts,
        'original_lexicon': unique_wordforms
    }
def transform(self, X, **transform_params):
    ease_scores = []
    for article in X:
        number_of_sentences = len(article)
        number_of_words = 0
        number_of_syllables = 0
        for sentence in article:
            words = sentence.split()
            number_of_words += len(words)
            number_of_syllables += sum(
                map(lambda x: utils.count_syllables(x), words))
        ease_scores.append(
            utils.flesch_kincaid_ease_score(number_of_sentences,
                                            number_of_words,
                                            number_of_syllables))
    return normalize(np.array(ease_scores).reshape(len(ease_scores), 1))
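# utils.flesch_kincaid_ease_score is not shown above. Given its arguments
# (sentence, word, and syllable counts), it presumably applies the standard
# Flesch Reading Ease formula. A minimal sketch under that assumption; the
# actual helper in utils may differ:

def flesch_kincaid_ease_score(number_of_sentences, number_of_words,
                              number_of_syllables):
    # Flesch Reading Ease:
    # 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
    return (206.835
            - 1.015 * (number_of_words / number_of_sentences)
            - 84.6 * (number_of_syllables / number_of_words))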
def analyze_text(self, text):
    words = get_words(text)
    char_count = get_char_count(words)
    word_count = len(words)
    sentence_count = len(get_sentences(text))
    syllable_count = count_syllables(words)
    complexwords_count = count_complex_words(text)
    avg_words_p_sentence = word_count / sentence_count

    self.analyzedVars = {
        'words': words,
        'char_cnt': float(char_count),
        'word_cnt': float(word_count),
        'sentence_cnt': float(sentence_count),
        'syllable_cnt': float(syllable_count),
        'complex_word_cnt': float(complexwords_count),
        'avg_words_p_sentence': float(avg_words_p_sentence)
    }
def analyze_text(self, text):
    words = get_words(text)
    char_count = get_char_count(words)
    words_count = len(words)
    sentence_count = len(get_sentences(text))
    syllable_count = count_syllables(words)
    print("syllable_count:", syllable_count)
    complex_words_count = count_complex_words(text)
    avg_words_per_sentence = int(words_count / sentence_count)
    print("avg_words_per_sentence", avg_words_per_sentence)

    self.ana_vars = {
        'words': words,
        'char_count': float(char_count),
        'words_count': float(words_count),
        'sentence_count': float(sentence_count),
        'syllable_count': float(syllable_count),
        'complex_words_count': float(complex_words_count),
        'avg_words_per_sentence': float(avg_words_per_sentence)
    }
def average_syllables(target_list):
    """
    Counts the average number of syllables in lyrics for specified artists and/or genres.
    :param target_list: list of identifiers (artists and/or genres), str
    :return: dictionary {identifier: average number of syllables}, dict
    """
    syllables = {}
    for ident in target_list:
        original_bars = split_file(f'data/{ident}.txt')
        count = 0
        excluded = 0
        for line in original_bars:
            syls = count_syllables(line)
            if syls > 3:
                count += syls
            else:
                # Lines with three or fewer syllables are excluded from the average.
                excluded += 1
        syllables[ident] = count / (len(original_bars) - excluded)
    return syllables
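# In this snippet (and in the syllables/syllable_match helpers) count_syllables
# takes a whole line of text rather than a list of words. A minimal per-line
# sketch using the same vowel-group heuristic; hypothetical, the project's own
# implementation may differ:

import re


def count_syllables(line):
    total = 0
    for word in re.findall(r"[a-z']+", line.lower()):
        # Each maximal run of vowels is treated as one syllable (minimum 1).
        total += max(1, len(re.findall(r"[aeiouy]+", word)))
    return total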
def analyze_text(self, text):
    words = get_words(text)
    char_count = int(get_char_count(words))
    word_count = int(len(words))
    sentence_count = int(len(get_sentences(text)))
    syllable_count = count_syllables(words)
    complexwords_count = count_complex_words(text)
    avg_words_p_sentence = word_count / sentence_count

    self.analyzedVars = {
        'filename': self.filename,
        # 'text_truncated': text[:200].replace("\n", " "),
        'words': words,
        'char_cnt': float(char_count),
        'word_cnt': float(word_count),
        'sentence_cnt': float(sentence_count),
        'syllable_cnt': float(syllable_count),
        'complex_word_cnt': float(complexwords_count),
        'avg_words_p_sentence': float(avg_words_p_sentence)
    }
def flesch_kincaid_score(article):
    xml_url = '&titles='.join([xml_api_url, article])
    try:
        xml = requests.get(xml_url).content
        bs = BeautifulSoup(xml)
        try:
            # Convert NavigableString to str after encoding.
            text = str(bs.find('extract').contents[0].encode('utf-8'))
            # Strip trailing non-article sections and any remaining headings.
            non_text = ['== See also ==\n', '== References ==\n',
                        ' === Further references ===\n',
                        '== External links ==\n', '== Notes ==\n']
            for ele in non_text:
                text = text.split(ele, 1)[0]
            text = re.sub('==.*==', '', text)
            words = get_words(text)
            syllableCount = count_syllables(text)
            sentences = get_sentences(text)
            # Flesch Reading Ease formula.
            fk = 206.835 - 1.015 * len(words) / len(sentences) - 84.6 * syllableCount / len(words)
            return float(format(fk, '.2f'))
        except:
            print('Error while computing fk score of ' + article)
            print(format_exc())
    except:
        print('Error while fetching xml content of ' + article)
        print(format_exc())
def syllable_match(line1, line2):
    if count_syllables(line1) == count_syllables(line2):
        return 1
    else:
        return 0
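# Example usage of syllable_match with a hypothetical pair of lyric lines,
# assuming a per-line count_syllables like the sketch above: the helper acts
# as a crude meter check, returning 1 when the two lines carry the same
# syllable count and 0 otherwise.

line_a = "I got the keys to the city tonight"
line_b = "We ride the beat and we never look back"
print(syllable_match(line_a, line_b))  # 1 if the syllable counts match, else 0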