def getData(self, params):
    '''Return the most frequent values of the 'lang' column as a dataframe.

    params['top'] is converted with int() and used as the number of rows
    to keep.

    Values of 'lang' that appear in the 'Abbreviation' column of
    language.csv are replaced by the matching entry from the file's second
    column; any other value is kept unchanged.

    Returns a DataFrame with the columns 'Language' and 'Count', in the
    order produced by value_counts(), truncated to the first `top` rows.
    '''
    top = int(params['top'])
    counts = conn.createDataframe()['lang'].value_counts()

    language = pd.read_csv('language.csv', sep=',', encoding='utf-8')
    # Build the lookup column-wise in one pass. Indexing each row with
    # `.iloc[i][0]` relies on integer keys being treated as positions on a
    # label-indexed Series, which newer pandas versions deprecate.
    languageAbbr = dict(zip(language.iloc[:, 0], language.iloc[:, 1]))
    # Materialise the known abbreviations once so that every membership
    # test below is O(1) instead of rebuilding a list per item.
    knownAbbr = set(language['Abbreviation'])

    lang = [languageAbbr[item] if item in knownAbbr else item
            for item in counts.index]
    df = pd.DataFrame({'Language': lang, 'Count': list(counts)},
                      columns=['Language', 'Count'])
    return df[:top]
Repeating letter e.g hungrryyy for hungry Punctuation ''' stopWords = get_stop_words('en') stopWords.append('at_user') stopWords.append('url') return stopWords def featureVector(tweet): featureVectorList = [] regex = r'^[a-zA-Z][a-zA-Z0-9]*$' for char in tweet: if tweet is not None: char = tweet.split() char = replacefn(char).strip('\'"?,.') #chech if the word starts with an alphabet alphebet = re.search(regex, char) if char not in stopWord() or alphebet is not None: featureVectorList.append(char.lower()) return featureVectorList df = conn.createDataframe() k = df['text'].map(lambda x:processing(x)) for num in np.arange(len(k)): x = k.iloc[num] print featureVector(x) #print df['tweet'].map(lambda x: featureVector(x))