def analyze_text(self, text):
        words = get_words(text)
        char_count = get_char_count(words)
        word_count = len(words)
        sentence_count = len(get_sentences(text))
        syllable_count = count_syllables(words)
        # text is already a str here (get_words consumes it directly above),
        # so it is passed straight through without decoding.
        complexwords_count = count_complex_words(text)
        avg_words_p_sentence = word_count / sentence_count
        encoding_dict = detect_encoding(self.filename)

        self.analyzedVars = {
            'filename': self.filename,
            # 'text_truncated': text[:200].replace("\n", " "),
            'words': words,
            'char_cnt': float(char_count),
            'word_cnt': float(word_count),
            'sentence_cnt': float(sentence_count),
            'syllable_cnt': float(syllable_count),
            'complex_word_cnt': float(complexwords_count),
            'avg_words_p_sentence': float(avg_words_p_sentence),
            'encoding': encoding_dict['encoding'],
            'encoding_confidence': encoding_dict['confidence']
        }
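Every example on this page calls count_syllables without defining it. For reference, here is a minimal sketch of the usual vowel-group heuristic, assuming the convention above where the helper takes a list of words; real implementations differ in their edge cases.

import re

def count_syllables(words):
    """Rough syllable count: one syllable per contiguous vowel group,
    with a correction for a silent final 'e'."""
    total = 0
    for word in words:
        w = word.lower()
        n = len(re.findall(r'[aeiouy]+', w))
        # "cake" has a silent final 'e'; "-le" endings ("table") do not.
        if n > 1 and w.endswith('e') and not w.endswith('le'):
            n -= 1
        total += max(n, 1)  # every word counts at least one syllable
    return total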
Example #2

    def analyze_text(self, text):
        words = get_words(text)
        char_count = get_char_count(words)
        word_count = len(words)
        sentence_count = len(get_sentences(text))
        syllable_count = count_syllables(words)
        complexwords_count = count_complex_words(text)
        avg_words_p_sentence = word_count / sentence_count

        self.analyzedVars = {
            'words': words,
            'char_cnt': float(char_count),
            'word_cnt': float(word_count),
            'sentence_cnt': float(sentence_count),
            'syllable_cnt': float(syllable_count),
            'complex_word_cnt': float(complexwords_count),
            'avg_words_p_sentence': float(avg_words_p_sentence)
        }

        outData = {
            'char_cnt': float(char_count),
            'word_cnt': float(word_count),
            'sentence_cnt': float(sentence_count),
            'syllable_cnt': float(syllable_count),
            'complex_word_cnt': float(complexwords_count),
            'avg_words_p_sentence': float(avg_words_p_sentence)
        }
        return outData
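Note that outData repeats the numeric fields of self.analyzedVars but omits the raw 'words' list, so the caller receives only the scalar statistics.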
Example #3

    def syllables(self, line):
        """
        Calculate a syllable score for a line: the number of syllables
        in the line divided by the max_syllables parameter.
        :param line: target line, str
        :return: syllable score, float
        """
        count = count_syllables(line)
        return count / self.params['max_syllables']
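With max_syllables set to 16, for instance, a 12-syllable line scores 12 / 16 = 0.75, and lines longer than the cap score above 1.0.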
Example #4

def preprocess_lexicon(df,
                       language,
                       phon_column="PhonDISC",
                       word_column="Word",
                       vowels="IE{VQU@i#$u312456789cq0~",
                       n=5,
                       smoothing=.01,
                       match_on="phones"):
    """Preprocess Celex dataframe."""
    df['num_phones'] = df[phon_column].apply(len)
    df['num_sylls_est'] = df[phon_column].apply(
        lambda x: utils.count_syllables(x, language=language, vowels=vowels))

    # Remove words estimated to have <1 syllable.
    df = df[df['num_sylls_est'] > 0]

    original_counts = obtain_length_distribution(df, match_on=match_on)

    # Preprocess the lexicon for analysis.
    df_processed = utils.preprocess_for_analysis(
        df, word_column=word_column, phon_column=phon_column).reset_index()
    unique_counts = obtain_length_distribution(df_processed, match_on=match_on)

    print("Processed wordforms:", len(df_processed))

    # Build n-gram model.
    print("Creating phonotactic model...")
    unique_wordforms = list(df_processed[phon_column])
    model = create_model(unique_wordforms, n=n, smoothing=smoothing)

    # Obtain surprisal estimates (negative log probability).
    df_processed['log_prob'] = df_processed[phon_column].apply(
        lambda x: model.evaluate(x)[2])
    df_processed['surprisal'] = df_processed['log_prob'].apply(lambda x: -x)
    df['log_prob'] = df[phon_column].apply(lambda x: model.evaluate(x)[2])
    df['surprisal'] = df['log_prob'].apply(lambda x: -x)

    # Saving to file is disabled; re-enable the block below if needed.
    """
    print("Saving dataframes to file...")
    print("data/processed/{lang1}/{lang2}_all_reals_{n}phone.csv".format(lang1=language, lang2=language, n=n))
    df.to_csv("data/processed/{lang1}/{lang2}_all_reals_{n}phone.csv".format(lang1=language, lang2=language, n=n))
    print("data/processed/{lang1}/{lang2}_lemmas_processed_{n}phone.csv".format(lang1=language, lang2=language, n=n))
    df_processed.to_csv("data/processed/{lang1}/{lang2}_lemmas_processed_{n}phone.csv".format(lang1=language, lang2=language, n=n))
    """
    return {
        'model': model,
        'original_counts': original_counts,
        'unique_counts': unique_counts,
        'original_lexicon': unique_wordforms
    }
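A hypothetical invocation, assuming df is a Celex-style dataframe with 'Word' and 'PhonDISC' columns for the target language:

# Hypothetical call; column and language names depend on the Celex export.
results = preprocess_lexicon(df, language='english')
phonotactic_model = results['model']
wordforms = results['original_lexicon']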
Example #5

    def transform(self, X, **transform_params):
        ease_scores = []
        for article in X:
            number_of_sentences = len(article)
            number_of_words = 0
            number_of_syllables = 0
            for sentence in article:
                words = sentence.split()
                number_of_words += len(words)
                number_of_syllables += sum(
                    utils.count_syllables(word) for word in words)

            ease_scores.append(
                utils.flesch_kincaid_ease_score(number_of_sentences,
                                                number_of_words,
                                                number_of_syllables))
        return normalize(np.array(ease_scores).reshape(-1, 1))
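utils.flesch_kincaid_ease_score is not shown here. Assuming it implements the standard Flesch reading-ease formula (the same one Example #10 applies inline), it would look roughly like:

def flesch_kincaid_ease_score(sentence_count, word_count, syllable_count):
    """Standard Flesch reading-ease; higher scores mean easier text."""
    return (206.835
            - 1.015 * (word_count / sentence_count)
            - 84.6 * (syllable_count / word_count))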
Example #6

    def analyze_text(self, text):
        words = get_words(text)
        char_count = get_char_count(words)
        word_count = len(words)
        sentence_count = len(get_sentences(text))
        syllable_count = count_syllables(words)
        complexwords_count = count_complex_words(text)
        avg_words_p_sentence = word_count / sentence_count

        self.analyzedVars = {
            'words': words,
            'char_cnt': float(char_count),
            'word_cnt': float(word_count),
            'sentence_cnt': float(sentence_count),
            'syllable_cnt': float(syllable_count),
            'complex_word_cnt': float(complexwords_count),
            'avg_words_p_sentence': float(avg_words_p_sentence)
        }
Example #7

    def analyze_text(self, text):
        words = get_words(text)
        char_count = get_char_count(words)
        words_count = len(words)
        sentence_count = len(get_sentences(text))
        syllable_count = count_syllables(words)
        print("syllable_count:", syllable_count)
        complex_words_count = count_complex_words(text)
        avg_words_per_sentence = int(words_count / sentence_count)
        print("avg_words_per_sentence", avg_words_per_sentence)
        self.ana_vars = {
            'words': words,
            'char_count': float(char_count),
            'words_count': float(words_count),
            'sentence_count': float(sentence_count),
            'syllable_count': float(syllable_count),
            'complex_words_count': float(complex_words_count),
            'avg_words_per_sentence': float(avg_words_per_sentence)
        }
Example #8

def average_syllables(target_list):
    """
    Counts the average number of syllables in lyrics for specified artists and/or genres.

    :param target_list: list of identifiers (artist and/or genres), str
    :return: dictionary {identifier: average number of syllables}, dict
    """
    syllables = {}
    for ident in target_list:
        original_bars = split_file(f'data/{ident}.txt')
        count = 0
        excluded = 0
        for line in original_bars:
            syls = count_syllables(line)
            if syls > 3:
                count += syls
            else:
                excluded += 1
        syllables[ident] = count / (len(original_bars) - excluded)
    return syllables
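Assuming the lyric files exist under data/, a call would look like this (the identifiers are hypothetical):

# Hypothetical identifiers; each needs a matching data/<ident>.txt file.
averages = average_syllables(['artist_a', 'genre_b'])
# -> {'artist_a': 11.2, 'genre_b': 8.7}, for example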
Example #9

    def analyze_text(self, text):
        words = get_words(text)
        char_count = get_char_count(words)
        word_count = len(words)
        sentence_count = len(get_sentences(text))
        syllable_count = count_syllables(words)
        complexwords_count = count_complex_words(text)
        avg_words_p_sentence = word_count / sentence_count

        self.analyzedVars = {
            'filename': self.filename,
            # 'text_truncated': text[:200].replace("\n", " "),
            'words': words,
            'char_cnt': float(char_count),
            'word_cnt': float(word_count),
            'sentence_cnt': float(sentence_count),
            'syllable_cnt': float(syllable_count),
            'complex_word_cnt': float(complexwords_count),
            'avg_words_p_sentence': float(avg_words_p_sentence)
        }
Example #10

# Requires requests, bs4 and traceback.format_exc; xml_api_url and the
# get_words/count_syllables/get_sentences helpers come from the
# surrounding module.
def flesch_kincaid_score(article):
    # 'article' is the page title appended to the API query.
    xml_url = '&titles='.join([xml_api_url, article])
    try:
        xml = requests.get(xml_url).content
        bs = BeautifulSoup(xml, 'html.parser')

        try:
            # Convert the NavigableString extract to a plain string.
            text = str(bs.find('extract').contents[0])
            non_text = ['== See also ==\n', '== References ==\n', ' === Further references ===\n', '== External links ==\n', '== Notes ==\n']
            for ele in non_text:
                text = text.split(ele, 1)[0]
            text = re.sub('==.*==', '', text)
            words = get_words(text)
            syllableCount = count_syllables(text)
            sentences = get_sentences(text)
            fk = 206.835 - 1.015 * len(words) / len(sentences) - 84.6 * syllableCount / len(words)
            return round(fk, 2)
        except Exception:
            print('Error while computing fk score of ' + article)
            print(format_exc())

    except Exception:
        print('Error while fetching xml content of ' + article)
        print(format_exc())
Example #11

def syllable_match(line1, line2):
    """Return 1 if the two lines have the same syllable count, else 0."""
    return int(count_syllables(line1) == count_syllables(line2))