def run_syllabify(args):
    """Syllabify an input stream line by line and write the result out.

    Each line from ``args.infile`` is stripped and split on single spaces.
    Every resulting piece is passed to
    ``syllabifier.orthographic_syllabify(piece, args.lang)``; the syllables
    of one piece are joined with a space, the pieces of one line are joined
    with a space, and the line is written to ``args.outfile`` followed by a
    newline.

    Args:
        args: object exposing ``infile`` (iterable of text lines),
            ``outfile`` (object with a ``write`` method) and ``lang``.
    """
    for raw_line in args.infile:
        syllabified_words = []
        for word in raw_line.strip().split(' '):
            syllables = syllabifier.orthographic_syllabify(word, args.lang)
            syllabified_words.append(' '.join(syllables))
        args.outfile.write(' '.join(syllabified_words) + '\n')
def other_features(self, tweet):
    """Build the hand-crafted feature vector for one tweet.

    Supports ``self.lang == 'en'`` and ``self.lang == 'hi'``.  The two
    languages differ only in how the sentiment values and the syllable
    count are obtained; every other feature is computed the same way.

    Args:
        tweet: raw tweet text.

    Returns:
        A list of 17 values, in this order: FKRA, FRE, syllables, avg_syl,
        num_chars, num_chars_total, num_terms, num_words, num_unique_terms,
        four sentiment values, twitter_objs[2], twitter_objs[1],
        twitter_objs[0], retweet.
        ``None`` when ``self.lang`` is neither 'en' nor 'hi'.

    NOTE(review): compared with the previous implementation, the Hindi
    ``syllables`` value and the ``retweet`` flag can differ (see the inline
    comments); anything fitted on the old feature values should be re-checked.
    """
    if self.lang == 'en':
        scores = self.sentiment_analyzer.polarity_scores(tweet)
        words = self.preprocess(tweet)  # Get text only
        syllables = textstat.syllable_count(words)
        sentiment_features = [
            scores['neg'],
            scores['pos'],
            scores['neu'],
            scores['compound'],
        ]
    elif self.lang == 'hi':
        prediction = self.sentiment_analyzer.predict(tweet)
        words = self.preprocess(tweet)
        # Sum the syllables of every token.  The previous code took the
        # length of the list of per-token results, i.e. the number of
        # tokens rather than the number of syllables.
        syllables = sum(
            len(syllabifier.orthographic_syllabify(token, self.lang))
            for token in hi_tokenizer(input=words, language_code=self.lang)
        )
        class_scores = prediction[2]
        score_0 = class_scores[0].tolist()
        score_1 = class_scores[1].tolist()
        score_2 = class_scores[2].tolist()
        # Slots mirror the English branch (neg, pos, neu, compound) --
        # presumably index 0/1/2 are neg/neu/pos; TODO confirm against the
        # sentiment model.
        # NOTE(review): the fourth value uses index 1 twice and never uses
        # index 2; kept exactly as before because the intended formula is
        # not evident from this code -- confirm.
        sentiment_features = [
            score_0,
            score_2,
            score_1,
            score_1 - score_0 + score_1,
        ]
    else:
        # Unsupported language: keep the historical behaviour of
        # returning None.
        return None

    tokens = words.split()
    # ``words`` is a string, so the old per-element length sum was simply
    # its total length (whitespace included).
    num_chars = len(words)
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(tokens)
    # The 0.001 offsets avoid a division by zero when there are no words.
    avg_syl = round((syllables + 0.001) / (num_words + 0.001), 4)
    num_unique_terms = len(set(tokens))

    # Modified FK grade, where avg words per sentence is just num words / 1.
    FKRA = round(0.39 * num_words + 11.8 * avg_syl - 15.59, 1)
    # Modified FRE score, where the sentence count is fixed to 1.
    FRE = round(206.835 - 1.015 * num_words - 84.6 * avg_syl, 2)

    twitter_objs = self.count_twitter_objs(tweet)

    # Match "rt" as a whole token, ignoring case.  The previous substring
    # test on the string also fired on words such as "party" or "heart".
    retweet = int(any(token.lower() == "rt" for token in tokens))

    return [
        FKRA, FRE, syllables, avg_syl, num_chars, num_chars_total,
        num_terms, num_words, num_unique_terms,
    ] + sentiment_features + [
        twitter_objs[2], twitter_objs[1], twitter_objs[0], retweet,
    ]
def getSyllables(word, lang):
    """Return ``syllabifier.orthographic_syllabify(word, lang)`` unchanged.

    Args:
        word: the word to split.
        lang: language code forwarded to the syllabifier.
    """
    syllables = syllabifier.orthographic_syllabify(word, lang)
    return syllables