Example No. 1
    def readability_analysis(self, text):
        words = text.split()
        wrd_dic = {}
        for wrd in words:
            wrd = "".join(a for a in wrd if a not in punctuation)
            wrd_dic[wrd] = textstat.syllable_count(wrd)
        wrd_dic = [b for b in wrd_dic if wrd_dic[b] >= 5]

        flesch_reading_ease = textstat.flesch_reading_ease(text)

        if flesch_reading_ease > 100:
            flesch_reading_ease = 100
        elif flesch_reading_ease < 0:
            flesch_reading_ease = 0

        syllable_count = textstat.syllable_count(text)
        avg_syllables_per_word = textstat.avg_syllables_per_word(text)
        avg_letter_per_word = textstat.avg_letter_per_word(text)

        readability = {
            "flesch_reading_ease": flesch_reading_ease,
            "avg_syllables_per_word": avg_syllables_per_word,
            "syllable_count": syllable_count,
            "avg_letter_per_word": avg_letter_per_word,
        }

        grade, score = self.readability_grade(readability)
        readability['grade'] = grade
        readability['score'] = score
        readability['difficult_words'] = wrd_dic
        return readability
Example No. 2
def synonym_replacement(input_string, copy_string):
    # accumulators for matched words and their replacements
    # (assumed local here; the original snippet uses them without defining them)
    words_with_synonyms = []
    the_synonyms = []
    # loop through each token (word, comma, period) in the input file
    for word in word_tokenize(input_string):
        # find a set of synonyms for each word
        synonym_set = wordnet.synsets(word)
        # if there is a synonym for that word
        if synonym_set:
            #get the synonym with the lowest syllable count
            synonym = find_lowest_syl_count(synonym_set)

            # get just the first synonym
            #synonym = synonym_set[0].lemmas()[0].name()

            # if the synonym has fewer syllables than the word
            if textstat.syllable_count(synonym) < textstat.syllable_count(
                    word):
                words_with_synonyms.append(word)
                #view synonym changes
                #print(word, "-->", synonym, "\n")
                the_synonyms.append(synonym)

    # replace all the words with their corresponding synonyms
    i = 0
    while i < len(words_with_synonyms):
        copy_string = [
            w.replace(words_with_synonyms[i], the_synonyms[i])
            for w in copy_string
        ]
        i += 1

    # join the list into one string
    FINAL_STRING = " ".join(copy_string)
    return FINAL_STRING
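A minimal usage sketch for the function above (illustrative only, not from the original project): it assumes find_lowest_syl_count from Example No. 21 below is also in scope, and passes the text both as the string to tokenize and as the word list to rewrite.

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from textstat.textstat import textstat

sample = "The automobile accelerated with considerable velocity."
print(synonym_replacement(sample, sample.split()))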
Example No. 3
def generatehaiku(url):
    authheader = "Basic " + base64.b64encode(os.environ['IMAGGA_API_KEY'] + ":" + os.environ['IMAGGA_API_SECRET'])
    headers = {'accept': "application/json", 'authorization':authheader}
    imaggaurl = "http://api.imagga.com/v1/tagging?url={}".format(url)
    r = requests.get(imaggaurl, headers=headers)
    imgtags = r.json()['results'][0]['tags']
    tags = []
    for tag in imgtags:
        tags.append([tag['tag'], int(textstat.syllable_count(tag['tag']))])
    tagsbysyllable = {}
    for tag in tags:
        key = tag[1]
        value = tag[0]
        if key not in tagsbysyllable:
            tagsbysyllable[key] = list()
        tagsbysyllable[key].append(value)
    random.seed(url)
    haikuline1=nsyllables(5,tagsbysyllable) 
    haikuline2=nsyllables(7, haikuline1[1])
    haikuline3=nsyllables(5, haikuline2[1])
    return render_template('haiku.html',
                           url=url, 
                           haikuline1=haikuline1[0],
                           haikuline2=haikuline2[0],
                           haikuline3=haikuline3[0])
Example No. 4
def text_analytics(text):
    if textstat.sentence_count(text) != 0:
        lexicon = textstat.lexicon_count(text) #word count
        sent = textstat.sentence_count(text) #sentence count
        syll = textstat.syllable_count(text) #syllable count
        flesch = textstat.flesch_reading_ease(text) #flesch score
        smog = textstat.smog_index(text) #SMOG index
        fog = textstat.gunning_fog(text) #FOG index
        dale = textstat.dale_chall_readability_score(text) #grade level
        ari = textstat.automated_readability_index(text) #grade level
        cl = textstat.coleman_liau_index(text) #grade level

        flesch1 = lexicon*flesch
        flesch2 = sent*flesch
        flesch3 = syll*flesch
        smog1 = lexicon*smog
        smog2 = sent*smog
        smog3 = syll*smog
        fog1 = lexicon*fog
        fog2 = sent*fog
        fog3 = syll*fog
        dale1 = lexicon*dale
        dale2 = sent*dale
        dale3 = syll*dale
        ari1 = lexicon*ari
        ari2 = sent*ari
        ari3 = syll*ari
        cl1 = lexicon*cl
        cl2 = sent*cl
        cl3 = syll*cl
        x = [lexicon, sent, syll, flesch, smog, fog, dale, ari, cl,
             flesch1, flesch2, flesch3, smog1, smog2, smog3,
             fog1, fog2, fog3, dale1, dale2, dale3,
             ari1, ari2, ari3, cl1, cl2, cl3]
    else:
        x = []  # no sentences detected, so no features to compute
    return x
Example No. 5
def get_special_metrics(text):
    blob = TextBlob(text)
    main = {
        "statistics": {
            "syllables": textstat.syllable_count(text),
            "words": textstat.lexicon_count(text),
            "characters": textstat.char_count(text),
            "polysyllables": textstat.polysyllabcount(text),
            "average letter per word": textstat.avg_letter_per_word(text),
            "average sentence length": textstat.avg_sentence_length(text),
            "average sentence per word": textstat.avg_sentence_per_word(text),
            "sentences": textstat.sentence_count(text),
        },
        "difficulty": {
            "flesch reading ease": textstat.flesch_reading_ease(text),
            "smog index": textstat.smog_index(text),
            "flesch kincaid grade": textstat.flesch_kincaid_grade(text),
            "coleman liau index": textstat.coleman_liau_index(text),
            #'automated readability index': textstat.automated_readability_index(text),
            #'dale chall readability score': textstat.dale_chall_readability_score(text),
            #'difficult words': textstat.difficult_words(text),
            #'linsear write formula': textstat.linsear_write_formula(text),
            "gunning fog": textstat.gunning_fog(text),
        },
        "sentiments": {"polarity": blob.sentiment.polarity, "subjectivity": blob.sentiment.subjectivity},
    }

    return main
Example No. 6
def calculate_statistics(lyrics):
    """
    Calculates statistics based on the text_raw of the lyrics.
    :return: Annotated lyrics containing information about the songs
    """
    logging.info("Calculating Statistics")
    from textstat.textstat import textstat
    for idx, song in tqdm(enumerate(lyrics), total=len(lyrics)):
        try:
            song["num_syllables"] = textstat.syllable_count(song["text_raw"])
            song["num_words"] = textstat.lexicon_count(song["text_raw"])
            song["num_sentences"] = textstat.sentence_count(song["text_raw"])
            song["flesch_score"] = textstat.flesch_reading_ease(
                song["text_raw"])
            song["flesch_kincaid_level"] = textstat.flesch_kincaid_grade(
                song["text_raw"])
            song["fog_score"] = textstat.gunning_fog(song["text_raw"])
            # note: despite the key name, this stores the Dale-Chall readability
            # score rather than a count of difficult words
            song[
                "num_difficult_words"] = textstat.dale_chall_readability_score(
                    song["text_raw"])
        except Exception as e:
            logging.error(
                "Something bad happened in the current song ! Skipping it... \n{}"
                .format(song))
            logging.exception(e)
    return lyrics
Example No. 7
def _get_base_textstats(no_code_text):
    """
    Find basic text statistics
    :param no_code_text: Text we are analyzing
    :return: list: List of results
    """
    results = []
    group_by = 'Basic Text Statistics'
    num_chars = len(no_code_text)
    num_lower = sum(1 for c in no_code_text if c.islower())
    num_upper = sum(1 for c in no_code_text if c.isupper())
    num_letters = sum(1 for c in no_code_text if c.isalpha())
    num_numbers = sum(1 for c in no_code_text if c.isdigit())
    num_alphanum = sum(1 for c in no_code_text if c.isalnum())
    num_otherchars = num_chars - num_alphanum
    results.append(TextFeature('Number of characters', num_chars, group_by))
    results.append(TextFeature('Number of letters', num_letters, group_by))
    results.append(TextFeature('Number of numbers', num_numbers, group_by))
    results.append(TextFeature('Number of other characters', num_otherchars, group_by))
    character_counts = Counter(no_code_text.lower())
    for c in sorted(character_counts.items()):
        try:
            results.append(TextFeature('Character count for "{}"'.format(c[0].encode('unicode_escape')), c[1], group_by))
        except AttributeError:
            results.append(TextFeature('Character count for "{}"'.format(c[0]), c[1], group_by))

    results.append(TextFeature('Number of syllables', textstat.syllable_count(no_code_text), group_by))
    results.append(TextFeature('Lexicon Count (without punctuation)', textstat.lexicon_count(no_code_text, True), group_by))
    results.append(TextFeature('Lexicon Count (with punctuation)', textstat.lexicon_count(no_code_text, False), group_by))
    results.append(TextFeature('Number of lower case characters', num_lower, group_by))
    results.append(TextFeature('Number of upper case characters', num_upper, group_by))
    return results
Example No. 8
def _schedule_words(text, rate):
    """Determine the time at which to speak each word for slow dictation.

    :returns list: A list of floats, starting from 0 and monotonically
    increasing, corresponding to the time at which to say each word.
    """
    num_syllables = textstat.syllable_count(text)
    total_dictation_time = num_syllables * NUM_WORDS_PER_SYLLABLE / rate
    total_dictation_seconds = total_dictation_time * 60.0

    words = text.split()
    time_per_word = total_dictation_seconds / len(text.split())

    current_time = 0.0
    for word in words:
        # There's no technical reason to round, it's just nice to not have to
        # worry about making a pretty string representation of the list of
        # timings.
        yield round(current_time, 2)

        word_length = time_per_word
        punctuation_pause = PUNCTUATION_PAUSES.get(word[-1])
        if punctuation_pause:
            word_length += time_per_word * punctuation_pause
        current_time += word_length
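The generator above relies on two module-level constants that the snippet does not show. A hedged sketch of plausible values and a call (the constant values and the sample sentence are illustrative assumptions, not taken from the original project):

from textstat.textstat import textstat

NUM_WORDS_PER_SYLLABLE = 1 / 1.4   # assumed: ~1.4 syllables per average English word
PUNCTUATION_PAUSES = {",": 0.5, ".": 1.0, "?": 1.0, "!": 1.0}  # assumed pause factors

# rate is in words per minute; the schedule starts at 0.0 seconds
print(list(_schedule_words("Speak slowly, please. Thank you.", rate=100)))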
Example No. 9
def get_special_metrics(text):
    blob = TextBlob(text)
    main = {
        'statistics': {
            'syllables': textstat.syllable_count(text),
            'words': textstat.lexicon_count(text),
            'characters': textstat.char_count(text),
            'polysyllables': textstat.polysyllabcount(text),
            'average letter per word': textstat.avg_letter_per_word(text),
            'average sentence length': textstat.avg_sentence_length(text),
            'average sentence per word': textstat.avg_sentence_per_word(text),
            'sentences': textstat.sentence_count(text)
        },
        'difficulty': {
            'flesch reading ease': textstat.flesch_reading_ease(text),
            'smog index': textstat.smog_index(text),
            'flesch kincaid grade': textstat.flesch_kincaid_grade(text),
            'coleman liau index': textstat.coleman_liau_index(text),
            #'automated readability index': textstat.automated_readability_index(text),
            #'dale chall readability score': textstat.dale_chall_readability_score(text),
            #'difficult words': textstat.difficult_words(text),
            #'linsear write formula': textstat.linsear_write_formula(text),
            'gunning fog': textstat.gunning_fog(text)
        },
        'sentiments': {
            'polarity': blob.sentiment.polarity,
            'subjectivity': blob.sentiment.subjectivity
        }
    }

    return main
Example No. 10
def haiku(text):
    words = text.split()

    for word in words:
        try:
            CMU_DICT[re.sub(r'[^\w\s]', '', word.lower())]
        except Exception as ex:
            #print ex
            return

    syllables = [
        int(math.ceil(textstat.syllable_count(word))) for word in words
    ]
    if sum(syllables) != 17: return

    syl_line = [0, 0, 0]
    haiku_lines = ['', '', '']

    for word, syllable_count in zip(words, syllables):
        if syl_line[0] < 5:
            syl_line[0] += syllable_count
            haiku_lines[0] += word + ' '
        elif syl_line[0] == 5 and syl_line[1] < 7:
            syl_line[1] += syllable_count
            haiku_lines[1] += word + ' '
        elif syl_line[0] == 5 and syl_line[1] == 7 and syl_line[2] < 5:
            syl_line[2] += syllable_count
            haiku_lines[2] += word + ' '

        # ain't a haiku,
        if syl_line[0] > 5 or syl_line[2] > 5 or syl_line[1] > 7: return

    # If haiku return haiku as string
    if syl_line == [5, 7, 5]:
        return ('%s\n%s\n%s' % tuple(haiku_lines))[:-1]
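The checker above depends on CMU_DICT and a few imports that are not shown; a minimal sketch of that setup, assuming NLTK's cmudict corpus (the sample text is illustrative):

import math
import re

from nltk.corpus import cmudict
from textstat.textstat import textstat

CMU_DICT = cmudict.dict()  # words missing from the dictionary make haiku() bail out early

text = "An old silent pond a frog jumps into the pond splash silence again"
print(haiku(text))  # the formatted 5-7-5 haiku, or None if the syllable counts don't line up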
Example No. 11
 def do_text_stats(self, text):
     ### Syllable Count
     syllable_count = textstat.syllable_count(text)
     ### Lexicon Count
     lexicon_count = textstat.lexicon_count(text, True)
     ### Sentence Count
     sentence_count = textstat.sentence_count(text)
     ### The Flesch Reading Ease formula
     try:
         flesch_reading_ease = textstat.flesch_reading_ease(text)
     except TypeError as e:
         flesch_reading_ease = None
     #* 90-100 : Very Easy
     #* 80-89 : Easy
     #* 70-79 : Fairly Easy
     #* 60-69 : Standard
     #* 50-59 : Fairly Difficult
     #* 30-49 : Difficult
     #* 0-29 : Very Confusing
     ### The Flesch-Kincaid Grade Level
     try:
         flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
     except TypeError as e:
         flesch_kincaid_grade = None
     ## The Fog Scale (Gunning FOG Formula)
     gunning_fog = textstat.gunning_fog(text)
     ### The SMOG Index
     smog_index = textstat.smog_index(text)
     ### Automated Readability Index
     automated_readability_index = textstat.automated_readability_index(
         text)
     ### The Coleman-Liau Index
     try:
         coleman_liau_index = textstat.coleman_liau_index(text)
     except TypeError as e:
         coleman_liau_index = None
     ### Linsear Write Formula
     linsear_write_formula = textstat.linsear_write_formula(text)
     ### Dale-Chall Readability Score
     dale_chall_readability_score = textstat.dale_chall_readability_score(
         text)
     ### Readability Consensus based upon all the above tests
     try:
         text_standard = textstat.text_standard(text)
     except TypeError as e:
         text_standard = None
     return {
         "syllable_count": syllable_count,
         "lexicon_count": lexicon_count,
         "sentence_count": sentence_count,
         "flesch_reading_ease": flesch_reading_ease,
         "flesch_kincaid_grade": flesch_kincaid_grade,
         "gunning_fog": gunning_fog,
         "smog_index": smog_index,
         "automated_readability_index": automated_readability_index,
         "coleman_liau_index": coleman_liau_index,
         "linsear_write_formula": linsear_write_formula,
         "dale_chall_readability_score": dale_chall_readability_score,
         "text_standard": text_standard
     }
Example No. 12
def flesch_kincaid_score(text):
	sylCount = textstat.syllable_count(text)
	wordCount = len(text.split())
	sentenceCount = textstat.sentence_count(text)

	print "Syl count - %s, word count - %s, sentenceCount - %s " % (sylCount,wordCount,sentenceCount)

	return (0.39 * (float(wordCount) / sentenceCount) + 11.8 * (float(sylCount) / wordCount) - 15.59)
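For a quick sanity check, the hand-rolled grade above can be compared against textstat's own implementation (a sketch; the sample sentence is illustrative):

from textstat.textstat import textstat

sample = "The quick brown fox jumps over the lazy dog. It was very quick indeed."
print(flesch_kincaid_score(sample))
print(textstat.flesch_kincaid_grade(sample))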
Example No. 13
def composition(text, file):
    char_count = textstat.char_count(text)
    syll_count = textstat.syllable_count(text)
    lex_count = textstat.lexicon_count(text)
    sent_count = textstat.sentence_count(text)
    file.write(
        '\nChar count : %d\nSyllable count : %d \nLexicon count : %d \nSentence count : %d'
        % (char_count, syll_count, lex_count, sent_count))
Example No. 14
def average_syllables_per_word(text):
    """
    :type text: Text
    :param text: The text to be analysed
    :rtype float
    :returns Average syllables per word
    """
    return textstat.syllable_count(text.text) / len(text.tokens_alphabetic)
Example No. 15
File: haiku.py Project: rski/steely
def split_by_syllables(syls, words):
    s_count = 0
    split = 0
    while s_count < syls and split < len(words):
        s_count += round(textstat.syllable_count(words[split]))
        split += 1
    if s_count != syls:
        raise ValueError("Words do not evenly split")
    return words[:split], words[split:]
Example No. 16
def nsyl(word):
    ''' Return the number of syllables in word.'''
    try:
        res = [
            len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]
        ][0]
    except KeyError:
        res = np.round(textstat.syllable_count(word))
    return res
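The lookup table d above is the CMU Pronouncing Dictionary; a minimal sketch of the usual setup, assuming NLTK's cmudict corpus is available:

import numpy as np
from nltk.corpus import cmudict
from textstat.textstat import textstat

d = cmudict.dict()  # maps a lowercase word to its list of ARPAbet pronunciations

print(nsyl("fragmentation"))  # counted from the stress digits of the first pronunciation
print(nsyl("zzyzzx"))         # unknown word: falls back to textstat.syllable_count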
Example No. 17
def other_features_(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features.

    This is modified to only include those features in the final
    model."""

    sentiment = sentiment_analyzer.polarity_scores(tweet)

    words = preprocess(tweet)  #Get text only

    syllables = textstat.syllable_count(words)  #count syllables in words
    num_chars = sum(len(w) for w in words)  #num chars in words
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables + 0.001)) / float(num_words + 0.001), 4)
    num_unique_terms = len(set(words.split()))

    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(
        float(0.39 * float(num_words) / 1.0) + float(11.8 * avg_syl) - 15.59,
        1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(
        206.835 - 1.015 * (float(num_words) / 1.0) - (84.6 * float(avg_syl)),
        2)

    twitter_objs = count_twitter_objs(tweet)  #Count #, @, and http://

    lyric = False
    #index=0
    for t in tweets:
        new_t = preprocess(t)
        #print(new_t)
        for l in lyrics:
            l = preprocess(l)
            if new_t == l:
                #print(new_t)
                #print(tweets.iloc[index])
                #df.loc[index, "class"]=2
                lyric = True
                #df.set_value(index,'class',2)
                #df.to_csv("labeled_data.csv", index=False)
                #print("done", df["class"].iloc[index])
        #index=index+1

    features = [
        FKRA, FRE, syllables, num_chars, num_chars_total, num_terms, num_words,
        num_unique_terms, sentiment['compound'], lyric
    ]
    #features = pandas.DataFrame(features)
    return features
Example No. 18
def get_stats(sentence):
	syllables = textstat.syllable_count(sentence)
	words = textstat.lexicon_count(sentence, True)
	sentence_count = textstat.sentence_count(sentence)

	if sentence_count > 0:
		text_standard = textstat.text_standard(sentence)
	else:
		text_standard = EMPTY_TEXT_STANDARD

	text_standard = fix_grammar_errors(text_standard)

	return combine(syllables, words, sentence_count, text_standard)
Example No. 19
def get_avg_syl_count(row, is_title):
    """
    Function to get the average number of syllables per word. Here, row refers to the article being considered.

    Args:

        row: The row of data to be considered. In this case a row of a `pandas` `DataFrame`.

        is_title: A boolean value indicating whether or not this average syllable count is for the title.

    Returns:
        An average syllable count for the provided row.

    Raises:
        Additional errors may be thrown by dependencies.
   """

    if is_title:
        syl = textstat.syllable_count(row.Title)
        return syl / row.titleWordCount
    syl = textstat.syllable_count(row.Body)
    return syl / row.wordCount
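A minimal sketch of applying the helper above row-wise with pandas (the DataFrame contents and the derived word-count columns are illustrative assumptions):

import pandas as pd
from textstat.textstat import textstat

df = pd.DataFrame({
    "Title": ["Readability matters"],
    "Body": ["Short sentences with short words are easier to read."],
})
df["titleWordCount"] = df["Title"].str.split().str.len()
df["wordCount"] = df["Body"].str.split().str.len()

# axis=1 hands each row to the helper; extra keyword arguments are forwarded
df["avgTitleSyllables"] = df.apply(get_avg_syl_count, axis=1, is_title=True)
df["avgBodySyllables"] = df.apply(get_avg_syl_count, axis=1, is_title=False)
print(df[["avgTitleSyllables", "avgBodySyllables"]])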
Example No. 20
def flesch_kincaid(tokenized_sentences):
    total_syllables = 0
    total_words = 0
    total_sentences = len(tokenized_sentences)
    for tokenized_sent in tokenized_sentences:
        for token in tokenized_sent:
            total_words += 1
            total_syllables += textstat.syllable_count(token)

    # note: this is the Flesch Reading Ease formula (not the Flesch-Kincaid
    # grade), scaled to roughly 0-1 by the division at the end
    score = 206.835
    score -= 1.015 * (total_words / total_sentences)
    score -= 84.6 * (total_syllables / total_words)
    return score / 100
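A minimal sketch of the expected input shape, using NLTK tokenizers (an assumption, since the original does not show how tokenized_sentences is built):

from nltk.tokenize import sent_tokenize, word_tokenize
from textstat.textstat import textstat

text = "Short words help. Long, polysyllabic vocabulary hinders readability."
tokenized_sentences = [word_tokenize(s) for s in sent_tokenize(text)]
print(flesch_kincaid(tokenized_sentences))  # Flesch Reading Ease, scaled down by 100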
Example No. 21
def find_lowest_syl_count(syn_list):
    #12 is the largest number of syllables in one English word
    lowest_count = 12
    lowest_count_word = ""

    for synset in syn_list:
        word = synset.name().split('.')[0]
        count = textstat.syllable_count(word)

        if count < lowest_count:
            lowest_count = count
            lowest_count_word = word

    return lowest_count_word
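A minimal usage sketch (illustrative only): pick the synset whose head lemma has the fewest syllables.

from nltk.corpus import wordnet
from textstat.textstat import textstat

synonym_set = wordnet.synsets("automobile")
print(find_lowest_syl_count(synonym_set))  # typically 'car', the shortest candidate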
Example No. 22
def haiku_friend():
    haiku = []
    while True:
        line1 = raw_input("Enter the first line (5 syllables!) ")
        if math.ceil(textstat.syllable_count(line1)) == 5:
            haiku.append(line1)
            break
        else:
            print "Your line should contain exactly 5 syllables!"

    while True:
        line2 = raw_input("Enter the second line (7 syllables!) ")
        if math.ceil(textstat.syllable_count(line2)) == 7:
            haiku.append(line2)
            break
        else:
            print "Your line should contain exactly 7 syllables!"

    while True:
        line3 = raw_input("Enter the third line (5 syllables again!) ")
        if math.ceil(textstat.syllable_count(line3)) == 5:
            haiku.append(line3)
            break
        else:
            print "Your line should contain exactly 5 syllables!"

    os.system('clear')
    print "Here is your haiku:\n"
    print '\n'.join(haiku)

    do_over = raw_input(
        "\nPress any key to make another haiku, or press 'n' to leave :'( ")
    if do_over == 'n':
        quit()
    else:
        haiku_friend()
Example No. 23
def make_haiku(model, artist_name):
    haiku_scheme = [5, 7, 5]
    char_limit = 140 - len(artist_name) - 3
    haiku_poem = [artist_name + ' - ']
    for h in haiku_scheme:
        while True:
            sentence = model.make_short_sentence(char_limit,
                                                 max_overlap_total=3)
            if sentence:
                syllables = ceil(textstat.syllable_count(sentence))
                if syllables == h:
                    haiku_poem.append(sentence)
                    break
    haiku = '\n'.join(haiku_poem)
    return haiku
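The model argument above is a Markov-chain sentence generator; a minimal sketch using the markovify package (assumed here, since the original does not show how the model is built, and lyrics.txt is a placeholder corpus):

from math import ceil

import markovify
from textstat.textstat import textstat

with open("lyrics.txt") as f:
    model = markovify.Text(f.read())

print(make_haiku(model, "Some Artist"))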
Example No. 24
def get_syll_stats(segments, feats_dict):
    """
    Computes statistics related to number of syllables present in each word in transcript.
    :param segments: list of segments, where each segment is a list of words
    :param feats_dict: dictionary to store computed feature values
    """
    syll_count_list = []
    for segment in segments:
        for word in segment:
            syll_count_list.append(textstat.syllable_count(word))
    feats_dict['syll_mean'] = np.mean(syll_count_list) if syll_count_list else float('nan')
    feats_dict['syll_median'] = np.median(syll_count_list) if syll_count_list else float('nan')
    feats_dict['syll_stdev'] = np.std(syll_count_list) if syll_count_list else float('nan')
    feats_dict['syll_min'] = min(syll_count_list) if syll_count_list else float('nan')
    feats_dict['syll_max'] = max(syll_count_list) if syll_count_list else float('nan')
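A minimal usage sketch for the feature extractor above (the segments are illustrative):

import numpy as np
from textstat.textstat import textstat

segments = [["hello", "there"], ["syllable", "statistics", "example"]]
feats = {}
get_syll_stats(segments, feats)
print(feats)  # syll_mean, syll_median, syll_stdev, syll_min, syll_max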
Example No. 25
def averageSyllable(text):

    count = 0
    no_of_words = 0

    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    no_of_words = len(words)

    for word in words:
        count += textstat.syllable_count(word)

    if no_of_words != 0:
        return float(count) / no_of_words
    else:
        return 0.5
Example No. 26
def gettingFeatures(text):
    text = text.lower()

    #words / syllables / sentences count
    wordCount = len(text.split())
    syllables = textstat.syllable_count(text)
    sentences = textstat.sentence_count(text)
    try:
        #ReadabilityScore
        readabilityScore = 206.835 - 1.015 * (wordCount / sentences) - 84.6 * (
            syllables / wordCount)
        #ReadabilityGrade
        ReadabilityGrade = 0.39 * (wordCount / sentences) + 11.8 * (
            syllables / wordCount) - 15.59
    except:
        readabilityScore = 0
        ReadabilityGrade = 0
    print(readabilityScore, ReadabilityGrade)
    #Direction Count
    #private String[] direction = {"here", "there", "over there", "beyond", "nearly", "opposite", "under", "above", "to the left", "to the right", "in the distance"};
    DiractionCount = 0
    DiractionCount = text.count("here") + text.count("there") + text.count(
        "over there") + text.count("beyond") + text.count(
            "nearly") + text.count("opposite") + text.count(
                "under") + text.count("to the left") + text.count(
                    "to the right") + text.count("in the distance")
    #Exemplify count
    #private String[] exemplify = {"chiefly", "especially", "for instance", "in particular", "markedly", "namely", "particularly", "including", "specifically", "such as"};
    Exemplify = 0
    Exemplify = text.count("chiefly") + text.count("especially") + text.count(
        "for instance") + text.count("in particular") + text.count(
            "markedly") + text.count("namely") + text.count(
                "particularly") + text.count("incluiding") + text.count(
                    "specifically") + text.count("such as")

    try:
        #words per sentence (average)
        WPS = 0
        parts = [len(l.split()) for l in re.split(r'[?!.]', text) if l.strip()]
        WPS = sum(parts) / len(parts)  #number of words per sentence
    except:
        WPS = 0
    #print(wordCount, readabilityScore, ReadabilityGrade, DiractionCount, WPS, Exemplify)
    return numpy.array([
        wordCount, readabilityScore, ReadabilityGrade, DiractionCount, WPS,
        Exemplify
    ])
Example No. 27
def percentageOfOneSyllable(text):

    count = 0
    no_of_words = 0

    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    no_of_words = len(words)

    for word in words:
        if (textstat.syllable_count(word) == 1):
            count += 1

    if no_of_words != 0:
        return float(count * 100) / no_of_words
    else:
        return 0.5
Example No. 28
def hodorify_syllables(word):

    new_word = ''
    index = 0

    #count syllables in each word
    syl_count = int(textstat.syllable_count(word))

    #get initial and final punctuation
    word_start = 0
    for word_start in range(0, len(word)):
        if word[word_start].isalpha():
            break

    word_end = len(word)
    for word_end in range(len(word), 1, -1):
        if word[word_end - 1].isalpha():
            break

    #split word from punctuation
    pre_text = word[:word_start]
    post_text = word[word_end:]
    word = word[word_start:word_end]

    #for each syllable of the word, print a ho-dor syllable
    for syl_index in range(0, syl_count):

        hodex = syl_index % (len(hodorian_syllables) - 1)
        dordex = 0

        for dordex in range(0, len(hodorian_syllables[0])):
            if word[index + dordex].isupper():
                new_word += hodorian_syllables[hodex][dordex].upper()

            else:
                new_word += hodorian_syllables[hodex][dordex].lower()

        index += len(hodorian_syllables[0])

    #only print the final r at the end of the word
    if hodex == 1:
        if word[index].isupper():
            new_word += hodorian_syllables[2].upper()
        else:
            new_word += hodorian_syllables[2]
    return (pre_text + new_word + post_text)
Example No. 29
    def __init__(self, text):

        self.sent_count = 0  # no of sentences
        self.word_count = 0  # no of words
        self.char_count = 0  # no of chars, no space
        self.syll_count = 0  # no of syllables
        self.comp_count = 0  # no of words which have three or more than three syllables

        # 0. loop text, add space after period if there is not
        text_list = text.split()
        text_list_len = len(text_list)
        for i in range(text_list_len):
            if "." in text_list[i]:
                text_list[i] = text_list[i].replace(".", ". ")
        text = " ".join(text_list)

        # 1. parse text into separate sentences based on punc
        from nltk.tokenize import sent_tokenize

        # from 'Models' tab and select 'punkt'
        sentences = sent_tokenize(text.decode("utf8"))
        # 2. cal total sentences
        self.sent_count = len(sentences)
        # 3. remove pun
        import string

        table = string.maketrans("", "")
        punc_removed_text = text.translate(table, string.punctuation)
        # 6. cal total digits
        self.char_count = len(punc_removed_text) - punc_removed_text.count(" ")
        # 4. split text into a list of separate word
        text_list = punc_removed_text.split()
        # 5. cal total words
        self.word_count = len(text_list)
        # 7. cal total syllables
        # 8. cal complex words

        from textstat.textstat import textstat

        for i in text_list:
            each_syll = textstat.syllable_count(i.decode("utf8"))
            if each_syll == 0:
                each_syll = 1
            self.syll_count += each_syll
            if each_syll >= 3:
                self.comp_count += 1
Example No. 30
def main() :
  for arg in sys.argv[1:]:
    with open(arg) as f:
      text = f.read()

    with open(arg + '.readability.snip','w') as f:
       f.write ("syllable_count : %s\n" % textstat.syllable_count(text))
       f.write ("lexicon_count : %s\n" % textstat.lexicon_count(text))
       f.write ("sentence_count : %s\n" % textstat.sentence_count(text))
       f.write ("difficult_words : %s\n" % textstat.difficult_words(text))
       f.write ("flesch_reading_ease : %s\n" % textstat.flesch_reading_ease(text))
       f.write ("flesch_kincaid_grade : %s\n" % textstat.flesch_kincaid_grade(text))
       f.write ("smog_index : %s\n" % textstat.smog_index(text))
       f.write ("automated_readability_index : %s\n" % textstat.automated_readability_index(text))
       f.write ("coleman_liau_index : %s\n" % textstat.coleman_liau_index(text))
       f.write ("linsear_write_formula : %s\n" % textstat.linsear_write_formula(text))
       f.write ("dale_chall_readability_score : %s\n" % textstat.dale_chall_readability_score(text))
Example No. 31
def _get_base_textstats(no_code_text):
    """
    Find basic text statistics
    :param no_code_text: Text we are analyzing
    :return: list: List of results
    """
    results = []
    group_by = 'Basic Text Statistics'
    num_chars = len(no_code_text)
    num_lower = sum(1 for c in no_code_text if c.islower())
    num_upper = sum(1 for c in no_code_text if c.isupper())
    num_letters = sum(1 for c in no_code_text if c.isalpha())
    num_numbers = sum(1 for c in no_code_text if c.isdigit())
    num_alphanum = sum(1 for c in no_code_text if c.isalnum())
    num_otherchars = num_chars - num_alphanum
    results.append(TextFeature('Number of characters', num_chars, group_by))
    results.append(TextFeature('Number of letters', num_letters, group_by))
    results.append(TextFeature('Number of numbers', num_numbers, group_by))
    results.append(
        TextFeature('Number of other characters', num_otherchars, group_by))
    character_counts = Counter(no_code_text.lower())
    for c in sorted(character_counts.items()):
        try:
            results.append(
                TextFeature(
                    'Character count for "{}"'.format(
                        c[0].encode('unicode_escape')), c[1], group_by))
        except AttributeError:
            results.append(
                TextFeature('Character count for "{}"'.format(c[0]), c[1],
                            group_by))

    results.append(
        TextFeature('Number of syllables',
                    textstat.syllable_count(no_code_text), group_by))
    results.append(
        TextFeature('Lexicon Count (without punctuation)',
                    textstat.lexicon_count(no_code_text, True), group_by))
    results.append(
        TextFeature('Lexicon Count (with punctuation)',
                    textstat.lexicon_count(no_code_text, False), group_by))
    results.append(
        TextFeature('Number of lower case characters', num_lower, group_by))
    results.append(
        TextFeature('Number of upper case characters', num_upper, group_by))
    return results
Example No. 32
def hodorify_syllables(word):

  new_word = ''
  index = 0

  #count syllables in each word
  syl_count = int(textstat.syllable_count(word))

  #get initial and final punctuation
  word_start = 0
  for word_start in range(0,len(word)):
    if word[word_start].isalpha():
      break

  word_end = len(word)
  for word_end in range(len(word),1,-1):
    if word[word_end-1].isalpha():
      break  

  #split word from punctuation
  pre_text  = word[:word_start]
  post_text = word[word_end:]
  word      = word[word_start:word_end]

  #for each syllable of the word, print a ho-dor syllable
  for syl_index in range(0,syl_count):

    hodex   = syl_index % (len(hodorian_syllables)-1)
    dordex  = 0

    for dordex in range(0,len(hodorian_syllables[0])):
      if word[index+dordex].isupper():
        new_word += hodorian_syllables[hodex][dordex].upper()

      else:
        new_word += hodorian_syllables[hodex][dordex].lower()
      
    index += len(hodorian_syllables[0])

  #only print the final r at the end of the word
  if hodex == 1:
    if word[index].isupper():
      new_word += hodorian_syllables[2].upper()
    else:
      new_word += hodorian_syllables[2]
  return(pre_text+new_word+post_text)
Example No. 33
def _words_in_chars(chars, method):
    text = "".join(c for c, _ in chars)
    if method == "ncra":
        # The NCRA defines a "word" to be 1.4 syllables, which is the average
        # number of syllables per English word.
        syllables_per_word = 1.4
        # For some reason, textstat returns syllable counts such as a
        # one-syllable word like "the" being 0.9 syllables.
        syllables_in_text = textstat.syllable_count(text) / 0.9
        return syllables_in_text * (1 / syllables_per_word)
    elif method == "traditional":
        # Formal definition; see https://en.wikipedia.org/wiki/Words_per_minute
        return len(text) / 5
    elif method == "spaces":
        return len([i for i in text.split() if i])
    else:
        assert False, "bad wpm method: " + method
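A minimal usage sketch; chars is a sequence of pairs whose first element is a character (the second element is ignored by this helper), so dummy timestamps are used here:

from textstat.textstat import textstat

chars = [(c, 0.0) for c in "the quick brown fox jumps over the lazy dog"]
print(_words_in_chars(chars, "traditional"))  # characters / 5
print(_words_in_chars(chars, "spaces"))       # whitespace-separated tokens
print(_words_in_chars(chars, "ncra"))         # syllable-based estimate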
Example No. 34
def scores_cal_ori(text):

    char_count_value = textstat.char_count(text, ignore_spaces=True)
    lexicon_count_value = textstat.lexicon_count(text, removepunct=True)
    syllable_count_value = textstat.syllable_count(text)
    sentence_count_value = textstat.sentence_count(text)
    avg_sentence_length_value = textstat.avg_sentence_length(text)
    avg_syllables_per_word_value = textstat.avg_syllables_per_word(text)
    avg_letter_per_word_value = textstat.avg_letter_per_word(text)
    avg_sentence_per_word_value = textstat.avg_sentence_per_word(text)
    flesch_kincaid_grade_value = textstat.flesch_kincaid_grade(text)
    smog_index_value = textstat.smog_index(text)
    gunning_fog_value = textstat.gunning_fog(text)
    difficult_words_value = textstat.difficult_words(text)
    dale_chall_value = textstat.dale_chall_readability_score(text)
    polysyllab_value = textstat.polysyllabcount(text)
    return (char_count_value, lexicon_count_value, syllable_count_value,
            sentence_count_value, avg_sentence_length_value,
            avg_syllables_per_word_value, avg_letter_per_word_value,
            avg_sentence_per_word_value, flesch_kincaid_grade_value,
            smog_index_value, gunning_fog_value, difficult_words_value,
            dale_chall_value, polysyllab_value)
Example No. 35
def nsyllables(numsyls, tags):
    """ Finds an n-syllable phrase in a dict
        with keys for number of syllables, with values of
        lists of applicable words,  eg:

         {5: ['polysyllabic', 'proletariat'],
          4: ['polyganol', 'pollywantsa'],
          3: ['Pauly Shore', 'polishing'],
          2: ['poly', 'goner'],
          1: ['Paul', 'pole', 'Pawn']}
    """
    if numsyls <= 0:
        return [randomonesyl(), tags]
    if numsyls in tags and len(tags[numsyls]) >= 1:
        return [tags[numsyls].pop(0), tags]
    part = nsyllables(numsyls - 1, tags)[0]
    remainder = nsyllables(numsyls - int(textstat.syllable_count(part)) - 1, tags)[0]
    return [str(part) + " " + str(remainder), tags]
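A minimal usage sketch (illustrative words, integer keys as the lookup in the code expects); the function above is assumed to be in scope:

# keys are syllable counts, values are candidate words or phrases
tags = {5: ["hippopotamus"], 3: ["elephant", "buffalo"], 2: ["zebra", "lion"], 1: ["ox"]}

phrase, remaining = nsyllables(5, tags)
print(phrase)     # 'hippopotamus', popped straight from the 5-syllable bucket
print(remaining)  # the same dict, with that entry consumed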
Example No. 36
def analyse_json(json_text):
    # consider moving this to be a feature of Transcript in the other module

    df_witnesses = pd.DataFrame(columns=['html_file_location', 'witness_name',
                                         'syllable_count','lexicon_count',
                                         'sentence_count',
                                         'syllables_per_word',
                                         'gunning_fog', 'smog_index',
                                         'text_standard'],
                      index=[])

    trscrpt = json.loads(json_text)
    if 'witnesses' in trscrpt:
        witnesses = trscrpt['witnesses']


        for s in trscrpt['all_sections']:
            if 'speaker' in s and 'person' in s['speaker'] and \
                    s['speaker']['person']['speaker_type']=='witness':
                witness =  witnesses[s['speaker']['person']['name']]
                witness.setdefault('all_text', []).append(s['spoken_text'])

        for i, p in enumerate(witnesses):
            if 'all_text' in witnesses[p]:
                witness_text = '\n\n'.join(witnesses[p]['all_text'])
                if len(witness_text) > 0:
                    stats_data = {'html_file_location': trscrpt['html_file_location'],
                                  'witness_name': p,
                                  'syllable_count': textstat.syllable_count(witness_text),
                                  'lexicon_count': textstat.lexicon_count(witness_text),
                                  'sentence_count': textstat.sentence_count(witness_text),
                                  'syllables_per_word': textstat.avg_syllables_per_word(witness_text),
                                  'gunning_fog': textstat.gunning_fog(witness_text),
                                  'smog_index': textstat.smog_index(witness_text),
                                  'text_standard': textstat.text_standard(witness_text)}
                    df_witnesses.loc['witness_%i' % i] = stats_data
                else:
                    df_witnesses.loc['witness_%i' % i, 'html_file_location'] = trscrpt['html_file_location']
                    df_witnesses.loc['witness_%i' % i, 'witness_name'] = p
            else:
                df_witnesses.loc['witness_%i' % i, 'html_file_location'] = trscrpt['html_file_location']
                df_witnesses.loc['witness_%i' % i, 'witness_name'] = p

    return df_witnesses
Example No. 37
def analyseText():
    values = request.get_json()
    required = [ 'inputText' ]
    if not all(k in values for k in required):
        return 'Missing values', 400

    text = values['inputText']
    result = {
        'syllable_count': textstat.syllable_count(text),
        'lexicon_count': textstat.lexicon_count(text),
        'sentence_count': textstat.sentence_count(text),
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'gunning_fog': textstat.gunning_fog(text),
        'smog_index': textstat.smog_index(text),
        'automated_readability_index': textstat.automated_readability_index(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'linsear_write_formula': textstat.linsear_write_formula(text),
        'dale_chall_readability_score': textstat.dale_chall_readability_score(text)
    }

    return jsonify(result), 200
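The view above expects a JSON body with an inputText field; a minimal request sketch using Flask's test client (the /analyse route and the app object are assumptions, since the decorator and app setup are not shown):

# assumed registration, e.g.:
#   app.add_url_rule('/analyse', view_func=analyseText, methods=['POST'])
client = app.test_client()
resp = client.post('/analyse', json={'inputText': 'Short sentences are easy to read.'})
print(resp.status_code, resp.get_json()['flesch_reading_ease'])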
Example No. 38
def feature_readability(essay):
    syllable_count = textstat.syllable_count(essay)
    #syllable count
    flesch_reading_ease = textstat.flesch_reading_ease(essay)
    #readability of the document, a score between 0 and 100
    smog_index = textstat.smog_index(essay)
    #SMOG index, reflects how readable the document is; more precise and easier to compute
    flesch_kincaid_index = textstat.flesch_kincaid_grade(essay)
    #grade score, i.e. US school grade level
    coleman_liau_index = textstat.coleman_liau_index(essay)
    #returns the grade level of the text
    automated_readability_index = textstat.automated_readability_index(essay)
    #automated readability index, approximates the grade needed to understand the text
    dale_chall_readability_score = textstat.dale_chall_readability_score(essay)
    #returns a grade level based on the most common English words
    difficult_words = textstat.difficult_words(essay)

    linsear_write_formula = textstat.linsear_write_formula(essay)
    #returns the grade level of the text
    gunning_fog = textstat.gunning_fog(essay)
    #fog index, reflects how hard the text is to read
    return syllable_count, flesch_reading_ease, smog_index, flesch_kincaid_index, coleman_liau_index, automated_readability_index, dale_chall_readability_score, difficult_words, linsear_write_formula, gunning_fog
Example No. 39
def text_stats(corpus):
    tk = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True).tokenize
    toks = [tk(entry) for entry in corpus]

    # get function words
    with open('function_words.txt') as file:
        funcs = file.read().split(',')
        funcs = [f.strip() for f in funcs]

    amb = ambiguity(toks)       # calculate ambiguity

    matrix = [["Chars/Word", "Lexical Diversity", "Lexical Density", "Function Words", "Syllables", "ARI"]]
    for tokens, sentence in zip(toks, corpus):
        unique = set(tokens)
        avchar = 0
        lexdiv = 0
        lexden = 0
        nfunc = 0
        numsyl = 0
        ari = 0

        if len(sentence) > 1:
            lexdiv = len(unique) / len(tokens)                                  # Lexical Diversity
            lexden = len([x for x in tokens if x not in funcs]) / len(tokens)   # Lexical Density
            numsyl = textstat.syllable_count(sentence) / len(tokens) / 10       # Number of syllables in text
            # may be a bit dodgy without punctuation
            ari = abs(textstat.automated_readability_index(sentence)) / 14      # Automated Readability index
        for t in tokens:
            avchar += len(t) / len(tokens) / len(sentence)                      # Average num chars
            if t in funcs:
                nfunc += 1 / len(tokens)                                        # Number of function words

        matrix.append([avchar, lexdiv, lexden, nfunc, numsyl, ari])

    matrix = [m + [a] for m, a in zip(matrix, amb)]

    return np.array(matrix)
Example No. 40
#all={1: 'CC', 2: 'CD', 3: 'DT', 4: 'EX', 5: 'FW', 6: 'IN', 7: 'JJ', 8: 'JJR', 9: 'JJS', 10: 'LS', 11: 'MD', 12: 'NN', 13: 'NNS', 14: 'NNP', 15: 'NNPS', 16: 'PDT', 17: 'POS', 18: 'PRP', 19: 'PRP$', 20: 'RB', 21: 'RBR', 22: 'RBS', 23: 'RP', 24: 'SYM', 25: 'TO', 26: 'UH', 27: 'VB', 28: 'VBD', 29: 'VBG', 30: 'VBN', 31: 'VBP', 32: 'VBZ', 33: 'WDT', 34: 'WP', 35: 'WP$', 36: 'WRB', 'NN': 12, 'FW': 5, 'PRP': 18, 'RB': 20, 'NNS': 13, 'NNP': 14, 'PRP$': 19, 'WRB': 36, 'CC': 1, 'PDT': 16, 'VBN': 30, 'WP$': 35, 'JJS': 9, 'JJR': 8, 'SYM': 24, 'VBP': 31, 'WDT': 33, 'JJ': 7, 'VBG': 29, 'WP': 34, 'VBZ': 32, 'DT': 3, 'POS': 17, 'TO': 25, 'LS': 10, 'VB': 27, 'RBS': 22, 'RBR': 21, 'EX': 4, 'IN': 6, 'RP': 23, 'CD': 2, 'VBD': 28, 'MD': 11, 'NNPS': 15, 'UH': 26,  '.':37 , 37:'.' , ':':38, 38:':','-NONE-':39,39:'-NONE-' , ',':40, 40:','}
#ui=[]
print("grammer for the essay's")
for index in range(len(df)):
     p=df.essay[index]
     p1=nltk.word_tokenize(p.lower())
     p2=nltk.pos_tag(p1)
     counts=Counter(tag for p1,tag in p2)
     print(counts)
     total = sum(counts.values())
     print(dict((word, float(count)/total) for word,count in counts.items()))
     print("")
print("readability/complexity")     
for index in range(len(df)):
    r=df.essay[index]
    print(textstat.syllable_count(r))    
    print(textstat.readability_consensus(r))
    print("")
    #print(textstat.flesch_reading_ease(r))
    #print(textstat.flesch_kincaid_grade(r))
    
    
"""for index in range(len(df)):
    r=df.essay[index]     
    for words in r.split():
        words1 = [w1 for w1 in words if not w1 in stopwords.words("english")]
        print(words1)"""
        
#Example
print("normalizing values")
ranger = interp1d([1,512],[1,10])
Example No. 41
def get_mnemonic_syllables(mn):
    return sum([textstat.syllable_count(a) for a in mn.split()])
Example No. 42
WAV_FILE = path.join(path.dirname(path.realpath(__file__)), "abc.wav")

# use "test.wav" as the audio source
r = sr.Recognizer()
with sr.WavFile(WAV_FILE) as source:
    audio = r.record(source) # read the entire WAV file

# recognize speech using Google Speech Recognition
try:
    # for testing purposes, we're just using the default API key
    # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
    # instead of `r.recognize_google(audio)`
    print("Google Speech Recognition thinks you said " + r.recognize_google(audio))
    words = r.recognize_google(audio).split()
    for word in words:
        print ("%%", word, textstat.syllable_count(word))
    print(textstat.syllable_count(r.recognize_google(audio)))
except sr.UnknownValueError:
    print("Google Speech Recognition could not understand audio")
except sr.RequestError as e:
    print("Could not request results from Google Speech Recognition service; {0}".format(e))
'''
# recognize speech using Wit.ai
WIT_AI_KEY = "INSERT WIT.AI API KEY HERE" # Wit.ai keys are 32-character uppercase alphanumeric strings
try:
    print("Wit.ai thinks you said " + r.recognize_wit(audio, key=WIT_AI_KEY))
except sr.UnknownValueError:
    print("Wit.ai could not understand audio")
except sr.RequestError as e:
    print("Could not request results from Wit.ai service; {0}".format(e))
Example No. 43
#!/bin/python

import sys, string, os
from textstat.textstat import textstat

inputfile = ''
test_data = ""

script_name = sys.argv[0]
inputfile = sys.argv[1]

with open(inputfile) as myfile:
	test_data="".join(line.rstrip() for line in myfile)

var1 = str(textstat.flesch_reading_ease(test_data))
var2 = str(textstat.smog_index(test_data))
var3 = str(textstat.flesch_kincaid_grade(test_data))
var4 = str(textstat.coleman_liau_index(test_data))
var5 = str(textstat.automated_readability_index(test_data))
var6 = str(textstat.dale_chall_readability_score(test_data))
var7 = str(textstat.difficult_words(test_data))
var8 = str(textstat.linsear_write_formula(test_data))
var9 = str(textstat.gunning_fog(test_data))
var10 = str(textstat.readability_consensus(test_data))
var11 = str(textstat.syllable_count(test_data))
var12 = str(textstat.lexicon_count(test_data, 1))
var13 = str(textstat.sentence_count(test_data))

print(var1 + ',' + var2 + ',' + var3 + ',' + var4 + ',' + var5 + ',' + var6 + ',' + var7 + ',' + var8 + ',' + var9 + ',' + var10 + ',' + var11 + ',' + var12 + ',' + var13)
Example No. 44
         FAM_scores.append(MIN_FAM)
     else:
         FAM_scores.append(float(map[words[i]][1]))
         
     if (float(map[words[i]][2]) == 0):
         IMG_scores.append(MIN_IMG)
     else:
         IMG_scores.append(float(map[words[i]][2]))
         
     if (float(map[words[i]][3]) == 0):
         CONC_scores.append(MIN_CONC)
     else:
         CONC_scores.append(float(map[words[i]][3]))
         
     if (float(map[words[i]][4]) == 0):
         NSYL_scores.append(textstat.syllable_count(words[i]))
     else:
         NSYL_scores.append(float(map[words[i]][4]))
         
     if (float(map[words[i]][5]) == 0):
         FREQ_scores.append(MIN_FREQ)
     else:
         FREQ_scores.append(float(map[words[i]][5]))
         
     if (float(map[words[i]][6]) == 0):
         AOA_scores.append(MAX_AOA)
     else:
         AOA_scores.append(float(map[words[i]][6]))
         
 else :
     FAM_scores.append(MIN_FAM)
Example No. 45
# semicolon_count
try:
  semicolon_count = count_semicolon(AB) 
except:
  warning_message = 1

# comma_count
try:
  comma_count = count_comma(AB) 
except:
  warning_message = 1

# num_syllables
try: 
  num_syllables = textstat.syllable_count(AB)
except: 
  warning_message = 1

# word_count
try:
  word_count = textstat.lexicon_count(AB) 
except: 
  warning_message = 1

# avg_word_len
try: 
  avg_word_len = avg_word_length(AB) 
except:
  warning_message = 1
  
Example No. 46
                    # Build Dataset
                    try:
                        cur = {
                            "title": title,
                            "artist": artist,
                            "year": year,
                            "pos": pos,
                            "lyrics": lyrics,
                            "tags": get_tags(artist),
                            "sentiment": sent_analyzer.polarity_scores(lyrics_repl),
                            "f_k_grade": ts.flesch_kincaid_grade(lyrics_repl),
                            "flesch_index": ts.flesch_reading_ease(lyrics_repl),
                            "fog_index": ts.gunning_fog(lyrics_repl),
                            "difficult_words": ts.difficult_words(lyrics_repl),
                            "num_syllables": ts.syllable_count(lyrics_repl),
                            "num_words": ts.lexicon_count(lyrics_repl, True),
                            "num_lines": ts.sentence_count(lyrics_repl),
                            "num_dupes": count_dupes(lyrics)
                        }
                        # print cur
                        dataset.append(cur)
                    except Exception, e:
                        print e

            except Exception, e:
                print "Exception occurred for " + artist + ' - ' + title
                print e

    outfile = "years/" + str(year) + '.txt'
    dir = os.path.dirname(outfile)
Example No. 47
from syllabify import syllabify
import count_syl as cs
from textstat.textstat import textstat

a = syllabify('hello')
# this doesn't really work: syllabify expects ARPAbet phoneme sequences, not English words

b = cs.count_syllables('accident')
# this script seems to work pretty well, but gives lower and upper bound

c = textstat.syllable_count('fragmentation')
# ^ works well

def get_mnemonic_syllables(mn):
    return sum([textstat.syllable_count(a) for a in mn.split()])
Example No. 48
from nltk.corpus import wordnet as wn
from textstat.textstat import textstat
from pattern.en import conjugate, PRESENT, PARTICIPLE 

"""
Run with `python collect_squattings.py > unfiltered_squattings.txt`
"""

# Collect present participle (ending in 'ing') of single-syllable verbs
verbs = list(wn.all_synsets('v'))
squattings = []

for item in verbs:
    for verb in item.lemmas():
        syllables = round(textstat.syllable_count(verb.name()))
        if syllables == 1.0:
            squat = verb.name().replace("_", " ")
            squatting = conjugate(squat, PRESENT+PARTICIPLE)
            print squatting.encode('utf8')
            


Example No. 49
from nltk.corpus import wordnet as wn
from textstat.textstat import textstat

"""
Run with `python collect_naked.py > unfiltered_naked.txt`
"""

adjectives = list(wn.all_synsets('a')) + list(wn.all_synsets('s'))
nakeds = []

# Collect all two-syllable adjectives
for item in adjectives:
    for adj in item.lemmas():
        syllables = round(textstat.syllable_count(adj.name()))
        if syllables == 2.0:
            naked = adj.name().replace("_", " ")
            nakeds.append(naked)

# Uniques only
nakeds = set(nakeds)


for naked in nakeds:
    print naked.encode('utf8')
Example No. 50
from textstat.textstat import textstat

"""
Run with `python collect_hearts.py > unfiltered_hearts.txt`
"""

# Find a specific synset of "body" 
body = wn._synset_from_pos_and_offset('n',5216365) 

# Keep searching for part_meronyms until there are no more
def collect_parts_recursive(body, collection):
    parts = body.part_meronyms()

    for part in parts:
        collection.append(part)
        collect_parts_recursive(part, collection)

    return collection

all_parts = collect_parts_recursive(body, [])

# Collect heart replacements
hearts = []
for part in all_parts:
    heart = part.name().split('.')[0]
    heart = heart.replace("_", " ")
    syllables = round(textstat.syllable_count(heart))
    if syllables == 1.0:
        print heart