Example #1
def generate_master_summary(pd_text):
    # normalise the message dates before any grouping
    pd_text = date_processing(pd_text)

    pd_text = add_block_conv(pd_text)
    pd_text['index_holder'] = range(0, len(pd_text))
    first_conversation = pd_text.groupby(['block_conv']).first()
    all_potential_initiation = first_conversation[
        'index_holder'].values.tolist()
    (nr_outgoing_again_Index, nr_incoming_again_Index, guy_initiation_index,
     girl_initiation_index) = summary_initiation_count(pd_text,
                                                       all_potential_initiation)
    list_of_entire_text = generate_daily_text_group(pd_text)
    all_dates = pd_text['Message_Day'].drop_duplicates()

    # sentiment of each day's combined text (this step may be dropped later)

    text_info = bert_sentiment(list_of_entire_text)
    text_info_sentiment = [x[1] for x in text_info]

    #text_info_sentiment=len(all_dates)*[1]
    text_sentiment_pair = list(zip(all_dates, text_info_sentiment))

    #special keywords
    custom_stopwords = identify_custom_stopwords(list_of_entire_text)
    pd_master = pd.DataFrame(columns=[
        'Date', 'Start Time', 'End Time', 'Text sent by Me',
        'Text Sent by partner', 'raw ratio', 'adjusted text ratio',
        'word count by me', 'word count by partner', 'word ratio',
        'Total Text', 'Initiator', 'Ender', 'initiate with new topic',
        'Emoji_partner', 'Emoji_me', 'Attachment_partner', 'Attachment_me',
        'Topic', 'Response', 'adjust_word_ratio', 'total_minutes_partner',
        'total_minutes_me', 'hours_in_touch_partner', 'hours_in_tough_me',
        'gnat_abs', 'gnat_perctg', 'response_time_partner', 'reponse_time_me',
        'valid word count by me', 'valid word count by partner',
        'valid word ratio',
    ])

    for index_date in range(0, len(all_dates)):
        print("analysing day ", index_date)
        date = all_dates.iloc[index_date]
        all_date_time = pd_text.loc[pd_text['Message_Day'] == date,
                                    'Message Date']
        initial_time_index = all_dates.index[index_date]

        all_date_time.reset_index(drop=True, inplace=True)
        initial_time = all_date_time[0]
        end_time = all_date_time[len(all_date_time) - 1]
        pd_day_text = pd_text.loc[pd_text.Message_Day == date, ]
        pd_day_text.reset_index(drop=True, inplace=True)

        # build this day's row of analytics (also used for the block study)
        pd_master = generating_analytical(
            pd_day_text, pd_master, date, initial_time, initial_time_index,
            end_time, nr_incoming_again_Index, guy_initiation_index,
            nr_outgoing_again_Index, girl_initiation_index,
            text_sentiment_pair, custom_stopwords)

    return pd_master, nr_outgoing_again_Index, nr_incoming_again_Index, guy_initiation_index, girl_initiation_index
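
A minimal usage sketch for the function above. The module name text_analysis, the file messages.csv, and its exact column layout are assumptions for illustration; any DataFrame carrying the raw message fields the helpers expect (at least a 'Message Date' column plus the message text and direction) should work the same way.

# Usage sketch -- module name and input file are hypothetical.
import pandas as pd

from text_analysis import generate_master_summary

pd_text = pd.read_csv("messages.csv")  # parsed chat export (assumed format)

(pd_master, nr_outgoing_again_Index, nr_incoming_again_Index,
 guy_initiation_index, girl_initiation_index) = generate_master_summary(pd_text)

# pd_master holds one row per conversation day.
print(pd_master[['Date', 'Total Text', 'Initiator']].head())

The four index lists returned alongside pd_master are exactly what the reporting function in the next example expects as arguments.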
Example #2
def summary_analytical(pd_text, pd_master, file_name, nr_outgoing_again_Index,
                       nr_incoming_again_Index, guy_initiation_index,
                       girl_initiation_index):
    all_figure = []
    file_name = file_name + ".pdf"
    pdf = matplotlib.backends.backend_pdf.PdfPages(file_name)

    # sent message ratio
    incoming_msg_count, outgoing_msg_count, fig = count_number_of_incoming_outcoming(
        date_processing(pd_text), pd_master, to_graph=True)
    all_figure.append(fig)

    #total words
    pd_master.loc[pd_master['valid word count by me'] == 0,
                  'valid word count by me'] = 1
    pd_master.loc[pd_master['word count by me'] == 0, 'word count by me'] = 1

    pd_master['valid_word_rolling_ratio'] = pd_master[
        'valid word count by partner'] / pd_master['valid word count by me']
    pd_master['valid_word_rolling_ratio'] = pd_master[
        'valid_word_rolling_ratio'].rolling(7).median()
    pd_master['word_rolling_ratio'] = pd_master[
        'word count by partner'] / pd_master['word count by me']
    pd_master['word_rolling_ratio'] = pd_master['word_rolling_ratio'].rolling(
        7).median()

    fig2, axs = plt.subplots(2, sharex=True, squeeze=True)
    axs[0].plot(pd_master['Date'],
                pd_master['word count by me'].rolling(7).mean(),
                '-r',
                label='me')
    axs[0].plot(pd_master['Date'],
                pd_master['word count by partner'].rolling(7).mean(),
                '-b',
                label='partner')
    leg = axs[0].legend()
    axs[0].set_title('Avg words by partner vs Avg words by me')
    axs[1].plot(pd_master['Date'],
                pd_master['word_rolling_ratio'],
                '-r',
                label='normal ratio')
    axs[1].plot(pd_master['Date'],
                pd_master['valid_word_rolling_ratio'],
                '-b',
                label='valid words only ratio')
    leg = axs[1].legend()
    axs[1].set_title('Rolling Word Ratio - by Partner / by Me ')
    fig2.autofmt_xdate()
    all_figure.append(fig2)

    # 2b: minutes spent texting
    pd_master.loc[pd_master['total_minutes_me'] == 0, 'total_minutes_me'] = 1
    pd_master['minutes_ratio'] = pd_master[
        'total_minutes_partner'] / pd_master['total_minutes_me']
    pd_master['minutes_ratio'] = pd_master['minutes_ratio'].rolling(7).median()
    pd_master.loc[pd_master['minutes_ratio'] == 0, 'minutes_ratio'] = 1

    # 2c: hours in touch
    pd_master['hours_in_touch_ratio'] = pd_master['hours_in_touch_partner'] / (
        pd_master['hours_in_tough_me'])
    pd_master['hours_in_touch_ratio'] = pd_master[
        'hours_in_touch_ratio'].rolling(7).mean()
    fig2c, axs = plt.subplots(2, sharex=True, squeeze=True)
    axs[0].plot(pd_master['Date'],
                pd_master['hours_in_tough_me'].rolling(7).mean(),
                '-r',
                label='me')
    axs[0].plot(pd_master['Date'],
                pd_master['hours_in_touch_partner'].rolling(7).mean(),
                '-b',
                label='partner')
    leg = axs[0].legend()
    axs[0].set_title(
        'Hour points with messages from partner vs from me')
    axs[1].plot(pd_master['Date'], pd_master['hours_in_touch_ratio'])
    axs[1].set_title('Rolling Hour Points Ratio - by Partner / by Me ')
    fig2c.autofmt_xdate()
    all_figure.append(fig2c)

    #new initiator chart
    first_conversation = pd_text.groupby(['block_conv']).first()
    initial_type = []
    for x in first_conversation['index_holder']:
        initial_type.append(
            label_conversation_initiation(x, nr_incoming_again_Index,
                                          guy_initiation_index,
                                          girl_initiation_index,
                                          nr_outgoing_again_Index))
    first_conversation['initial_type'] = initial_type
    initition_summary = first_conversation.groupby(
        ['Message_Day', 'initial_type'])['Type'].count().unstack()
    initition_summary = initition_summary.fillna(0)

    # make sure every initiation type exists as a column, even if never observed
    for initiation_type in ['INCOM', 'NR_INCOM', 'OUTGO', 'NR_OUTGO']:
        if initiation_type not in initition_summary.columns:
            initition_summary[initiation_type] = 0

    fig3, axs = plt.subplots(2, 2)
    #axs.set_title('Breakdown of initiation type (rolling 7d)')
    axs[0, 0].plot(initition_summary.index,
                   initition_summary.INCOM.rolling(7).sum())
    axs[0, 0].set_title('New Initiation from Partner')
    axs[0, 0].xaxis.set_visible(False)
    axs[0, 1].plot(initition_summary.index,
                   initition_summary.NR_INCOM.rolling(7).sum(), 'tab:orange')
    axs[0, 1].set_title('Second Initiation from Partner')
    axs[0, 1].xaxis.set_visible(False)
    axs[1, 0].plot(initition_summary.index,
                   initition_summary.OUTGO.rolling(7).sum(), 'tab:green')
    axs[1, 0].set_title('New Initiation from Me')
    axs[1, 1].plot(initition_summary.index,
                   initition_summary.NR_OUTGO.rolling(7).sum(), 'tab:red')
    axs[1, 1].set_title('Second Initiation from Me')
    fig3.suptitle('Breakdown of initiation type (rolling 7d)', fontsize=18)
    fig3.autofmt_xdate()
    all_figure.append(fig3)

    #gnat tendency
    pd_master['rolling_gnat_perctg'] = pd_master['gnat_perctg'].rolling(
        7).mean()
    fig3a = plt.figure()
    plt.plot(pd_master['Date'], pd_master['rolling_gnat_perctg'])
    plt.title(
        'Text Gnat Tendency -- Percentage of conversations initiated repeatedly'
    )
    fig3a.autofmt_xdate()
    all_figure.append(fig3a)

    #initiator
    him_initiation = nr_incoming_again_Index + guy_initiation_index
    her_initiation = girl_initiation_index + nr_outgoing_again_Index
    first_conversation = pd_text.groupby(['block_conv']).first()
    first_conversation = categorization_him_me(first_conversation,
                                               him_initiation, her_initiation)
    first_conversation_summary = first_conversation.groupby(['Message_Day'
                                                             ]).mean()
    fig8 = plt.figure()
    plt.plot(first_conversation_summary.index,
             first_conversation_summary['initiation_score'].rolling(7).mean())
    plt.title('Partner Initiation of Conversation (Score)')
    fig8.autofmt_xdate()
    all_figure.append(fig8)

    # count emoji and attachments (multimedia)
    pd_master['total_attachment_him'] = pd_master['Emoji_partner'] + pd_master[
        'Attachment_partner']
    pd_master['total_attachment_her'] = pd_master['Emoji_me'] + pd_master[
        'Attachment_me']
    pd_master['total_attachment_him_average'] = pd_master[
        'total_attachment_him'].rolling(7).sum()
    pd_master['total_attachment_her_average'] = pd_master[
        'total_attachment_her'].rolling(7).sum()

    # guard the denominator of the rolling multimedia ratio against division by zero
    pd_master.loc[pd_master.total_attachment_her_average == 0,
                  'total_attachment_her_average'] = 1
    pd_master[
        'mms_ratio'] = pd_master['total_attachment_him_average'] / pd_master[
            'total_attachment_her_average']
    pd_master['mms_ratio'] = pd_master['mms_ratio'].rolling(7).median()

    fig4c, axs = plt.subplots(2, sharex=True, squeeze=True)
    axs[0].plot(pd_master['Date'],
                pd_master['total_attachment_her_average'],
                '-r',
                label='me')
    axs[0].plot(pd_master['Date'],
                pd_master['total_attachment_him_average'],
                '-b',
                label='partner')
    leg = axs[0].legend()
    axs[0].set_title('No. of Multimedia Sent by Partner vs by Me')
    axs[1].plot(pd_master['Date'], pd_master['mms_ratio'])
    axs[1].set_title('Multimedia Send Ratio: by Partner / by Me')
    fig4c.autofmt_xdate()
    all_figure.append(fig4c)

    #count sentiment: maybe in the future
    #pd_master['raw_sentiment']=pd_master['Response'].apply(clean_sentiment)
    pd_master['rolling_response_sentiment'] = pd_master['Response'].apply(
        float).rolling(7).mean()
    fig5 = plt.figure()
    plt.plot(pd_master['Date'], pd_master['rolling_response_sentiment'])
    plt.title('Conversation Sentiment Index - higher is better (beta version)')
    fig5.autofmt_xdate()
    all_figure.append(fig5)

    #holy grail
    pd_raw = add_block_conv(pd_text)
    fig6 = holy_grail_analysis(pd_raw,
                               method='normal',
                               conversation_cutoff=5,
                               rolling_avg=10)
    all_figure.append(fig6)

    pd_master['raw ratio'].replace(np.inf, 1, inplace=True)
    mms_ratio_mean = pd_master['mms_ratio'].mean()
    pd_master['mms_ratio'].replace(np.nan, mms_ratio_mean, inplace=True)

    pd_master['final_score'] = (
        pd_master['valid_word_rolling_ratio'] + pd_master['raw ratio'] +
        pd_master['minutes_ratio'] + pd_master['hours_in_touch_ratio'] +
        pd_master['mms_ratio'] - pd_master['rolling_gnat_perctg'])

    fig5d = plt.figure()
    plt.plot(pd_master['Date'],
             pd_master['final_score'],
             '-b',
             label='current')
    plt.plot(pd_master['Date'],
             pd_master['final_score'].rolling(7).mean(),
             '-r',
             label='7d avg')
    plt.title('Ultimate Text Indicator')
    leg = plt.legend()
    fig5d.autofmt_xdate()
    all_figure = [fig5d] + all_figure

    #finally all of them
    for fig in all_figure:  ## will open an empty extra figure :(
        pdf.savefig(fig)
    pdf.close()
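
Continuing the usage sketch from Example #1, the figures above can then be rendered into a single PDF report. Because summary_analytical groups pd_text by 'block_conv' and reads 'index_holder', the sketch repeats on the caller's frame the same pre-processing that generate_master_summary applies internally; the module name and the output file name are again assumptions.

# Continuation of the sketch from Example #1 -- module name is hypothetical.
from text_analysis import add_block_conv, date_processing, summary_analytical

# summary_analytical expects 'block_conv' and 'index_holder' on pd_text,
# so the same helpers used inside generate_master_summary are applied here.
pd_text = add_block_conv(date_processing(pd_text))
pd_text['index_holder'] = range(len(pd_text))

# Writes every figure built above into "relationship_report.pdf".
summary_analytical(pd_text, pd_master, "relationship_report",
                   nr_outgoing_again_Index, nr_incoming_again_Index,
                   guy_initiation_index, girl_initiation_index)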
Example #3
def summary_analytical(pd_text, pd_master, file_name, nr_outgoing_again_Index,
                       nr_incoming_again_Index, guy_initiation_index,
                       girl_initiation_index):
    all_figure = []
    file_name = file_name + ".pdf"
    pdf = matplotlib.backends.backend_pdf.PdfPages(file_name)

    # sent message ratio
    incoming_msg_count, outgoing_msg_count, fig = count_number_of_incoming_outcoming(
        date_processing(pd_text), to_graph=True)
    all_figure.append(fig)
    #total words
    pd_master['total_word_by_me'] = pd_master['word count by me'].rolling(
        7).mean()
    pd_master['total_word_by_him'] = pd_master[
        'word count by partner'].rolling(7).mean()
    pd_master['word_rolling_ratio'] = pd_master[
        'total_word_by_him'] / pd_master['total_word_by_me']
    fig2, axs = plt.subplots(2, sharex=True, squeeze=True)
    axs[0].plot(pd_master['Date'],
                pd_master['total_word_by_me'],
                '-r',
                label='me')
    axs[0].plot(pd_master['Date'],
                pd_master['total_word_by_him'],
                '-b',
                label='partner')
    leg = axs[0].legend()
    axs[0].set_title('Avg words by partner vs Avg words by me')
    axs[1].plot(pd_master['Date'], pd_master['word_rolling_ratio'])
    axs[1].set_title('Rolling Word Ratio - by Partner / by Me ')
    fig2.autofmt_xdate()
    all_figure.append(fig2)

    #initiator
    him_initiation = nr_incoming_again_Index + guy_initiation_index
    her_initiation = girl_initiation_index + nr_outgoing_again_Index
    first_conversation = pd_text.groupby(['block_conv']).first()
    first_conversation = categorization_him_me(first_conversation,
                                               him_initiation, her_initiation)
    first_conversation_summary = first_conversation.groupby(['Message_Day'
                                                             ]).mean()
    fig3 = plt.figure()
    plt.plot(first_conversation_summary.index,
             first_conversation_summary['initiation_score'].rolling(7).mean())
    plt.title('Partner Initiation of Conversation (Score)')
    fig3.autofmt_xdate()
    all_figure.append(fig3)

    # count emoji and attachments
    pd_master['total_attachment_him'] = pd_master['Emoji_partner'] + pd_master[
        'Attachment_partner']
    pd_master['total_attachment_her'] = pd_master['Emoji_me'] + pd_master[
        'Attachment_me']

    pd_master['total_attachment_him_average'] = pd_master[
        'total_attachment_him'].rolling(7).sum()
    pd_master['total_attachment_her_average'] = pd_master[
        'total_attachment_her'].rolling(7).sum()

    fig4 = plt.figure()
    plt.plot(pd_master['Date'],
             pd_master['total_attachment_him_average'],
             '-b',
             label='partner')
    plt.plot(pd_master['Date'],
             pd_master['total_attachment_her_average'],
             '-r',
             label='me')
    fig4.autofmt_xdate()
    plt.title('No. of Multimedia Sent')
    leg = plt.legend()
    all_figure.append(fig4)

    #count sentiment
    #pd_master['raw_sentiment']=pd_master['Response'].apply(clean_sentiment)
    fig5 = plt.figure()
    plt.plot(pd_master['Date'],
             pd_master['Response'].apply(float).rolling(7).mean())
    plt.title('Conversation Sentiment Index - higher is better (beta version)')
    fig5.autofmt_xdate()
    all_figure.append(fig5)

    #holy grail
    pd_raw = add_block_conv(pd_text)
    fig6 = holy_grail_analysis(pd_raw,
                               method='normal',
                               conversation_cutoff=5,
                               rolling_avg=10)
    all_figure.append(fig6)

    #find topics for the holy grail:
    '''
    file_name=file_name.replace(".pdf", "_topic_analysis.csv")
    final_score=scoring_holy_grail_normal(pd_raw)
    range_len=int(len(final_score)*0.3)
    topic_analysis_list=[]
    for i in range(0,range_len):
        print (i)
        top_block_number=final_score.index[i]
        score=final_score.iloc[i]
        all_content=pd_raw.loc[pd_raw.block_conv==top_block_number, 'Text'].values.tolist()
        all_content=[x for x in all_content if x==x]
        all_content=[x for x in all_content if not 'www.' in x]
        date_of_conversation=pd_text.loc[pd_text.block_conv==top_block_number, 'Message Date'].iloc[0]
        keywords=summary_topic(all_content)
        summary=paragraph_summary(all_content)
        topic_analysis_list.append((date_of_conversation, score,keywords,summary ))
    pd.DataFrame(topic_analysis_list).to_csv(file_name)
    '''
    #finally all of them
    for fig in all_figure:  ## will open an empty extra figure :(
        pdf.savefig(fig)
    pdf.close()