import spacy
from tqdm import tqdm
from textblob import TextBlob, Word
from sklearn.feature_extraction import DictVectorizer

# NOTE: get_word_list, Tweet_processing, and save_data are project-local
# helpers (Tweet_processing comes from tweet_processing.py); their import
# paths are assumed to be available in this module.


def extract_stylo_features(tweets, save_file_name):
    """
    Extract stylometric features from the corpus.
    tweets: a list of tweets from the data file
    save_file_name: name of the file to which extracted feature vectors are saved
    Return: the name of the file that contains the feature vectors
    """
    pos_word_list = get_word_list(name="pos")
    neg_word_list = get_word_list(name="neg")
    bad_word_list = get_word_list(name="bad")
    modal_list = get_word_list(name="modal")
    print("Extracting stylometric features")
    nlp = spacy.load('en_core_web_sm')
    stylo_features = []
    for tweet in tqdm(tweets):
        features = {}
        blob_text = TextBlob(tweet)
        token_list = blob_text.words

        # getting the number of tokens
        num_tkns = len(token_list)
        features['num_tkns'] = num_tkns

        # getting the number of sentences
        num_snts = len(blob_text.sentences)
        features['num_snts'] = num_snts

        # getting average sentence length (guard against empty tweets)
        avg_sent_len = round(num_tkns / max(num_snts, 1))
        features['ave_snt_len'] = avg_sent_len

        # getting average token length
        sum_tkn_len = sum(len(token) for token in token_list)
        avg_token_len = round(sum_tkn_len / max(num_tkns, 1))
        features['ave_tnk_len'] = avg_token_len

        # counting POS tags (Penn Treebank tag sets)
        pos_tags = blob_text.tags
        nn_count = 0
        adj_count = 0
        v_count = 0
        adv_count = 0
        prn_count = 0
        verb_tags = ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")
        noun_tags = ("NNP", "NN", "NNS", "NNPS")
        adj_tags = ("JJ", "JJR", "JJS")
        adv_tags = ("RB", "RBR", "RBS", "RP")
        prn_tags = ("PRP", "PRP$")
        for token, tag in pos_tags:
            if tag in verb_tags:
                v_count += 1
            elif tag in noun_tags:
                nn_count += 1
            elif tag in adj_tags:
                adj_count += 1
            elif tag in prn_tags:
                prn_count += 1
            elif tag in adv_tags:
                adv_count += 1
        features['num_nn'] = nn_count
        features['num_adj'] = adj_count
        features['num_vrb'] = v_count
        features['num_adv'] = adv_count
        features['num_prn'] = prn_count

        # getting tweet polarity score
        features['polarity'] = round(blob_text.sentiment.polarity, 1)

        # getting tweet subjectivity score
        features['subjectivity'] = round(blob_text.sentiment.subjectivity, 1)

        # counting special token types
        dit_count = 0
        spc_count = 0
        cap_count = 0
        pos_word_count = 0
        neg_word_count = 0
        bad_word_count = 0
        modal_verb_count = 0
        for token in token_list:
            blob_word = Word(token)
            # tokens that are digits
            if blob_word.isdigit():
                dit_count += 1
            # tokens made of special characters (neither digits nor letters)
            if not blob_word.isdigit() and not blob_word.isalpha():
                spc_count += 1
            # tokens containing capital characters
            if not blob_word.islower():
                cap_count += 1
            # tokens that have positive sentiment
            if blob_word in pos_word_list:
                pos_word_count += 1
            # tokens that have negative sentiment
            if blob_word in neg_word_list:
                neg_word_count += 1
            # tokens which are profane words
            if blob_word in bad_word_list:
                bad_word_count += 1
            # tokens which are modal verbs
            if blob_word in modal_list:
                modal_verb_count += 1
        features['num_dit'] = dit_count
        features['num_spc'] = spc_count
        features['num_cap'] = cap_count
        features['num_pos_w'] = pos_word_count
        features['num_neg_w'] = neg_word_count
        # profanity and modal-verb counts were computed but never stored in
        # the original; the key names below are assumed
        features['num_bad_w'] = bad_word_count
        features['num_mod_v'] = modal_verb_count

        # getting the number of emojis
        num_emojis = len(Tweet_processing.get_emojis(tweet))  # from tweet_processing.py
        features['num_emo'] = num_emojis

        # getting the number of hashtags
        num_hashtags = len(Tweet_processing.get_hashtags(tweet))  # from tweet_processing.py
        features['num_htg'] = num_hashtags

        # getting the number of user mentions
        num_users = len(Tweet_processing.get_users(tweet))  # from tweet_processing.py
        features['num_users'] = num_users

        # getting the number of named entities
        doc = nlp(tweet)
        features['num_ents'] = len(list(doc.ents))

        stylo_features.append(features)

    # transform the extracted feature dicts into a dense feature matrix
    dict_vtrz = DictVectorizer(sparse=False)
    stylometric = dict_vtrz.fit_transform(stylo_features)
    print(stylometric.shape)

    # save feature vectors
    save_file = save_data(stylometric, save_file_name)
    print(save_file)
    return save_file
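
# Usage sketch (illustrative, not part of the original source): a minimal,
# hedged example of calling extract_stylo_features on two toy tweets. The
# sample tweets and the output file name "stylo_features.pkl" are
# assumptions; the actual on-disk format depends on the project's save_data
# helper.
if __name__ == "__main__":
    sample_tweets = [
        "Loving the new release!!! #python @dev_team",
        "This update is terrible... 3 crashes today :(",
    ]
    saved_path = extract_stylo_features(sample_tweets, "stylo_features.pkl")
    print("Feature vectors written to:", saved_path)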