import math
from collections import Counter
from itertools import groupby

import pandas as pd
from lexicalrichness import LexicalRichness

# util: repo-internal helper module providing rank_dataframe_column


def compute_lexical_richness(events, by=["event_type"], extent=["discontiguous_triggers"], preproc=None):
    """
    Compute lexical richness measures of unit attributes.

    :param events: list of Event objects.
    :param by: attribute names to group the metric by, e.g., event_type or subtype.
    :param extent: extent of the text getter functions on Event.
        Default: full event trigger including discontiguous parts.
    :param preproc: list of preprocessing functions that take a string of text as input.
    :return: DataFrame of lexical richness metrics, one row per attribute group.
    """
    print(
        f"Computing lexical richness of {str(extent).upper()} grouped by {str(by).upper()} "
        f"with preprocessing: {str(preproc)}"
    )

    # Collect text by attribute. itertools.groupby only groups consecutive
    # items, so sort by the same key first; the key must be a tuple (not a
    # generator) so that equal attribute values actually compare equal.
    def group_key(event):
        return tuple(getattr(event, attrib_n) for attrib_n in by)

    all_text = {}
    for attrib_values, g in groupby(sorted(events, key=group_key), key=group_key):
        attrib_name = ".".join(str(v) for v in attrib_values)
        for event in g:
            text = event.get_extent_text(extent=extent)
            if preproc:
                for preproc_func in preproc:
                    text = preproc_func(text)
            all_text.setdefault(attrib_name, []).append(text)

    # Compute lexical diversity by attribute.
    d = []
    for attrib_name, texts in all_text.items():
        # Mean mention type-token ratio (a variant of mean segment TTR,
        # Johnson 1944, with annotation mentions instead of fixed segments)
        # was dropped here: mention-level TTR is nearly always 1.

        # Lexical entropy over mention strings.
        p, lns = Counter(texts), float(len(texts))
        entropy = -sum(count / lns * math.log(count / lns, 2) for count in p.values())

        # Metrics on all mentions joined together.
        text = " ".join(texts)
        lr = LexicalRichness(text)
        d.append({
            "annotation_type": attrib_name,
            "cttr": lr.cttr,
            "entropy": entropy,
            "dugast": lr.Dugast,
            "type_count": lr.terms,
            "token_count": lr.words,
            "herdan": lr.Herdan,
            "summer": lr.Summer,
            "maas": lr.Maas,  # low sensitivity; lower score = richer
            "ttr": lr.ttr,
            "rttr": lr.rttr,
            "mtld": lr.mtld(threshold=0.72),  # length-corrected, mid sensitivity
            "msttr": lr.msttr(segment_window=25),  # length-corrected, mid sensitivity
            "mattr": lr.mattr(window_size=25),  # length-corrected, mid sensitivity
            "hdd": lr.hdd(draws=42),  # length-corrected, high sensitivity
        })

    df_lr = pd.DataFrame(d)
    # Invert Maas for plotting (lower Maas score means more richness).
    df_lr["maas_inv"] = df_lr["maas"] * -1.0
    rec_metrics = ["maas", "hdd", "mtld"]  # metrics recommended in McCarthy & Jarvis (2010)

    # Add rank columns for easy comparison.
    df_lr = util.rank_dataframe_column(df_lr, ascending=False)
    df_lr["maas_rank"] = df_lr["maas"].rank().astype(int)  # Maas is inverted: rank ascending
    df_lr = df_lr.drop(labels=["annotation_type_rank"], axis=1)  # no ranking on the index column

    # Nicer output.
    df_lr = df_lr.sort_index(axis=1)  # sort columns alphabetically
    rank_cols = [c for c in df_lr if "_rank" in c and "_count" not in c]
    df_lr["rank_all"] = df_lr[rank_cols].sum(axis=1).rank().astype(int)  # rank of summed metric ranks
    df_lr["rank_maas_hdd_mtld"] = (
        df_lr[[m + "_rank" for m in rec_metrics]].sum(axis=1).rank().astype(int)
    )  # combine the recommended metrics
    df_lr = df_lr.set_index("annotation_type")
    df_lr = df_lr.sort_values(by="rank_maas_hdd_mtld")  # sort by combined recommended metrics
    return df_lr
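# Usage sketch for compute_lexical_richness (hypothetical, not from the repo):
# the Event class and util.rank_dataframe_column are repo internals not shown
# above, so this sketch substitutes a minimal Event stand-in and a guessed util
# stub that adds one "<column>_rank" column per column. Texts are generated
# long enough (>42 tokens per group) for hdd(draws=42) and the 25-token
# msttr/mattr windows.
import random


class Event:
    """Hypothetical stand-in exposing the two members the function uses."""

    def __init__(self, event_type, text):
        self.event_type = event_type
        self._text = text

    def get_extent_text(self, extent):
        # The real Event selects text by extent; this stub ignores it.
        return self._text


class util:  # drop this guessed stub when the repo's util module is importable
    @staticmethod
    def rank_dataframe_column(df, ascending=False):
        for c in list(df.columns):
            df[c + "_rank"] = df[c].rank(ascending=ascending).astype(int)
        return df


random.seed(0)
vocab = ["market", "shares", "stake", "merger", "deal", "price",
         "firm", "board", "offer", "unit", "profit", "sale"]
events = [Event(t, " ".join(random.choices(vocab, k=6)))
          for t in ("buy", "sell") for _ in range(20)]
df = compute_lexical_richness(events, by=["event_type"], preproc=[str.lower])
print(df[["token_count", "type_count", "mtld", "rank_maas_hdd_mtld"]])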
def calculate_lexical_richness_measure(self, text, window_size=200, threshold=0.72):
    lex = LexicalRichness(text)
    self.measures['mattr'] = lex.mattr(window_size=window_size)  # moving-average type-token ratio
    self.measures['mtld'] = lex.mtld(threshold=threshold)  # measure of textual lexical diversity
    return self.measures
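# Minimal host-class sketch for the method above (hypothetical): the class the
# method belongs to is not shown in this snippet, so LexicalMeasures below only
# supplies the self.measures dict it writes into. Note that mattr() needs at
# least window_size tokens, so short inputs need a smaller window than the
# default of 200.
from lexicalrichness import LexicalRichness


class LexicalMeasures:
    def __init__(self):
        self.measures = {}

    # reuse the module-level function above as the class's method
    calculate_lexical_richness_measure = calculate_lexical_richness_measure


m = LexicalMeasures()
print(m.calculate_lexical_richness_measure(
    "the quick brown fox jumps over the lazy dog", window_size=5))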
import re

import nltk
import numpy as np
import pandas as pd
from lexicalrichness import LexicalRichness
# VADER via NLTK (assumed import path; requires the vader_lexicon resource)
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def process_speech(transcribe_df, r_config):
    """
    Prepare speech features.

    Args:
        transcribe_df: Transcribed dataframe
        r_config: raw config file object

    Returns:
        Dataframe of speech features
    """
    err_transcribe = transcribe_df[r_config.err_reason].iloc[0]
    transcribe = transcribe_df[r_config.nlp_transcribe].iloc[0]
    total_time = transcribe_df[r_config.nlp_totalTime].iloc[0]
    master_url = transcribe_df['dbm_master_url'].iloc[0]

    # Bail out on failed transcriptions before touching the transcript text.
    # (The original passed an undefined error_txt; the error reason is used.)
    if err_transcribe != 'Pass':
        df_speech = empty_speech(r_config, master_url, err_transcribe)
        return df_speech

    # Clean transcript: drop commas, keep word tokens and sentence punctuation.
    transcribe = transcribe.replace(",", "")
    transcribe = " ".join(re.findall(r"[\w']+|[.!?]", transcribe))

    speech_dict = {}
    nltk_download()

    sentences = nltk.tokenize.sent_tokenize(transcribe)
    words_all = nltk.tokenize.word_tokenize(transcribe)
    num_sentences = len(sentences)
    speech_dict[r_config.nlp_numSentences] = num_sentences

    # Singular pronouns per answer/sentence. Count whole tokens, not
    # substrings, so 'I' in 'It' or 'me' in 'time' is not miscounted.
    sing_count = words_all.count('I') + words_all.count('me') + words_all.count('my')
    speech_dict[r_config.nlp_singPronPerAns] = sing_count if len(words_all) > 0 else np.nan
    speech_dict[r_config.nlp_singPronPerSen] = divide_var(
        speech_dict[r_config.nlp_singPronPerAns], num_sentences)

    tagged = nltk.pos_tag(transcribe.split())
    tagged_df = pd.DataFrame(tagged, columns=['word', 'pos_tag'])
    all_POSs = tagged_df['pos_tag'].tolist()

    # Past tense per answer/sentence
    speech_dict[r_config.nlp_pastTensePerAns] = (
        all_POSs.count('VBD') if len(words_all) > 0 else np.nan)
    speech_dict[r_config.nlp_pastTensePerSen] = divide_var(
        speech_dict[r_config.nlp_pastTensePerAns], num_sentences)

    # Pronouns per answer/sentence
    pronounsPerAns = all_POSs.count('PRP') + all_POSs.count('PRP$')
    speech_dict[r_config.nlp_pronounsPerAns] = (
        pronounsPerAns if len(words_all) > 0 else np.nan)
    speech_dict[r_config.nlp_pronounsPerSen] = divide_var(
        speech_dict[r_config.nlp_pronounsPerAns], num_sentences)

    # Verbs per answer/sentence
    verbPerAns = (all_POSs.count('VB') + all_POSs.count('VBD') + all_POSs.count('VBG')
                  + all_POSs.count('VBN') + all_POSs.count('VBP') + all_POSs.count('VBZ'))
    speech_dict[r_config.nlp_verbsPerAns] = verbPerAns if len(words_all) > 0 else np.nan
    speech_dict[r_config.nlp_verbsPerSen] = divide_var(
        speech_dict[r_config.nlp_verbsPerAns], num_sentences)

    # Adjectives per answer/sentence
    adjectivesAns = all_POSs.count('JJ') + all_POSs.count('JJR') + all_POSs.count('JJS')
    speech_dict[r_config.nlp_adjectivesPerAns] = (
        adjectivesAns if len(words_all) > 0 else np.nan)
    speech_dict[r_config.nlp_adjectivesPerSen] = divide_var(
        speech_dict[r_config.nlp_adjectivesPerAns], num_sentences)

    # Nouns per answer/sentence
    nounsAns = all_POSs.count('NN') + all_POSs.count('NNP') + all_POSs.count('NNS')
    speech_dict[r_config.nlp_nounsPerAns] = nounsAns if len(words_all) > 0 else np.nan
    speech_dict[r_config.nlp_nounsPerSen] = divide_var(
        speech_dict[r_config.nlp_nounsPerAns], num_sentences)

    # Sentiment: mean VADER compound valence over sentences
    vader = SentimentIntensityAnalyzer()
    sentence_valences = [vader.polarity_scores(s)['compound'] for s in sentences]
    speech_dict[r_config.nlp_sentiment_mean] = (
        np.mean(sentence_valences) if len(sentence_valences) > 0 else np.nan)

    # MATTR over the punctuation-stripped answer; with window_size equal to
    # the full token count this reduces to plain TTR.
    non_punc = [value for value in words_all if value not in ['.', '!', '?']]
    non_punc_as_str = " ".join(non_punc)
    lex = LexicalRichness(non_punc_as_str)
    speech_dict[r_config.nlp_mattr] = (
        lex.mattr(window_size=lex.words) if lex.words > 0 else np.nan)

    # Number of words per minute (total_time in seconds)
    speech_dict[r_config.nlp_wordsPerMin] = divide_var(len(non_punc), total_time) * 60
    speech_dict[r_config.nlp_totalTime] = total_time
    speech_dict['dbm_master_url'] = master_url

    df_speech = pd.DataFrame([speech_dict])
    return df_speech
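# Usage sketch for process_speech (hypothetical, not from the repo): the real
# r_config schema and the helpers nltk_download / divide_var / empty_speech
# live elsewhere in this codebase, so guessed stand-ins are defined here, and
# every nlp_* config attribute is simply mapped to an output column of the same
# name. empty_speech is only reached on the error path and is left undefined.
from types import SimpleNamespace


def nltk_download():
    # Stand-in helper: fetch the NLTK resources the function needs, covering
    # both the old and new resource names across NLTK versions.
    for pkg in ('punkt', 'punkt_tab', 'averaged_perceptron_tagger',
                'averaged_perceptron_tagger_eng', 'vader_lexicon'):
        nltk.download(pkg, quiet=True)


def divide_var(numerator, denominator):
    # Stand-in helper: division that degrades to NaN instead of raising.
    return numerator / denominator if denominator else np.nan


_keys = ['numSentences', 'singPronPerAns', 'singPronPerSen', 'pastTensePerAns',
         'pastTensePerSen', 'pronounsPerAns', 'pronounsPerSen', 'verbsPerAns',
         'verbsPerSen', 'adjectivesPerAns', 'adjectivesPerSen', 'nounsPerAns',
         'nounsPerSen', 'sentiment_mean', 'mattr', 'wordsPerMin', 'totalTime',
         'transcribe']
r_config = SimpleNamespace(err_reason='err_reason',
                           **{'nlp_' + k: 'nlp_' + k for k in _keys})

transcribe_df = pd.DataFrame([{
    'err_reason': 'Pass',
    'nlp_transcribe': "I went to the store. It was my first trip this week!",
    'nlp_totalTime': 12.5,  # seconds of audio
    'dbm_master_url': 'https://example.com/recordings/001',
}])
print(process_speech(transcribe_df, r_config).T)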