import spacy
from tqdm import tqdm
from textblob import TextBlob, Word
from sklearn.feature_extraction import DictVectorizer

# NOTE: get_word_list, Tweet_processing, and save_data are project-local
# helpers (Tweet_processing comes from tweet_processing.py); their import
# paths are assumed to be available in this module.


def extract_stylo_features(tweets, save_file_name):
    """
    Extract stylometric features from the corpus.
    tweets: a list of tweets from the data file
    save_file_name: name of the file to which extracted feature vectors are saved
    Return: the name of the file that contains the feature vectors
    """
    pos_word_list = get_word_list(name="pos")
    neg_word_list = get_word_list(name="neg")
    bad_word_list = get_word_list(name="bad")
    modal_list = get_word_list(name="modal")
    print("Extracting stylometric features")
    nlp = spacy.load('en_core_web_sm')
    stylo_features = []
    for tweet in tqdm(tweets):
        features = {}
        blob_text = TextBlob(tweet)
        token_list = blob_text.words

        # getting the number of tokens
        num_tkns = len(token_list)
        features['num_tkns'] = num_tkns

        # getting the number of sentences
        num_snts = len(blob_text.sentences)
        features['num_snts'] = num_snts

        # getting average sentence length (guard against empty tweets)
        avg_sent_len = round(num_tkns / max(num_snts, 1))
        features['ave_snt_len'] = avg_sent_len

        # getting average token length
        sum_tkn_len = sum(len(token) for token in token_list)
        avg_token_len = round(sum_tkn_len / max(num_tkns, 1))
        features['ave_tnk_len'] = avg_token_len

        # counting POS tags (Penn Treebank tag sets)
        pos_tags = blob_text.tags
        nn_count = 0
        adj_count = 0
        v_count = 0
        adv_count = 0
        prn_count = 0
        verb_tags = ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")
        noun_tags = ("NNP", "NN", "NNS", "NNPS")
        adj_tags = ("JJ", "JJR", "JJS")
        adv_tags = ("RB", "RBR", "RBS", "RP")
        prn_tags = ("PRP", "PRP$")
        for token, tag in pos_tags:
            if tag in verb_tags:
                v_count += 1
            elif tag in noun_tags:
                nn_count += 1
            elif tag in adj_tags:
                adj_count += 1
            elif tag in prn_tags:
                prn_count += 1
            elif tag in adv_tags:
                adv_count += 1
        features['num_nn'] = nn_count
        features['num_adj'] = adj_count
        features['num_vrb'] = v_count
        features['num_adv'] = adv_count
        features['num_prn'] = prn_count

        # getting tweet polarity score
        features['polarity'] = round(blob_text.sentiment.polarity, 1)

        # getting tweet subjectivity score
        features['subjectivity'] = round(blob_text.sentiment.subjectivity, 1)

        # counting special token types
        dit_count = 0
        spc_count = 0
        cap_count = 0
        pos_word_count = 0
        neg_word_count = 0
        bad_word_count = 0
        modal_verb_count = 0
        for token in token_list:
            blob_word = Word(token)
            # tokens that are digits
            if blob_word.isdigit():
                dit_count += 1
            # tokens made of special characters (neither digits nor letters)
            if not blob_word.isdigit() and not blob_word.isalpha():
                spc_count += 1
            # tokens containing capital characters
            if not blob_word.islower():
                cap_count += 1
            # tokens that have positive sentiment
            if blob_word in pos_word_list:
                pos_word_count += 1
            # tokens that have negative sentiment
            if blob_word in neg_word_list:
                neg_word_count += 1
            # tokens which are profane words
            if blob_word in bad_word_list:
                bad_word_count += 1
            # tokens which are modal verbs
            if blob_word in modal_list:
                modal_verb_count += 1
        features['num_dit'] = dit_count
        features['num_spc'] = spc_count
        features['num_cap'] = cap_count
        features['num_pos_w'] = pos_word_count
        features['num_neg_w'] = neg_word_count
        # profanity and modal-verb counts were computed but never stored in
        # the original; the key names below are assumed
        features['num_bad_w'] = bad_word_count
        features['num_mod_v'] = modal_verb_count

        # getting the number of emojis
        num_emojis = len(Tweet_processing.get_emojis(tweet))  # from tweet_processing.py
        features['num_emo'] = num_emojis

        # getting the number of hashtags
        num_hashtags = len(Tweet_processing.get_hashtags(tweet))  # from tweet_processing.py
        features['num_htg'] = num_hashtags

        # getting the number of user mentions
        num_users = len(Tweet_processing.get_users(tweet))  # from tweet_processing.py
        features['num_users'] = num_users

        # getting the number of named entities
        doc = nlp(tweet)
        features['num_ents'] = len(list(doc.ents))

        stylo_features.append(features)

    # transform the extracted feature dicts into a dense feature matrix
    dict_vtrz = DictVectorizer(sparse=False)
    stylometric = dict_vtrz.fit_transform(stylo_features)
    print(stylometric.shape)

    # save feature vectors
    save_file = save_data(stylometric, save_file_name)
    print(save_file)
    return save_file
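
# Usage sketch (illustrative, not part of the original source): a minimal,
# hedged example of calling extract_stylo_features on two toy tweets. The
# sample tweets and the output file name "stylo_features.pkl" are
# assumptions; the actual on-disk format depends on the project's save_data
# helper.
if __name__ == "__main__":
    sample_tweets = [
        "Loving the new release!!! #python @dev_team",
        "This update is terrible... 3 crashes today :(",
    ]
    saved_path = extract_stylo_features(sample_tweets, "stylo_features.pkl")
    print("Feature vectors written to:", saved_path)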