# Script excerpt; argparse is standard library, while hlp, pr, weeklyinfo,
# filterfields, ghelper and plots are project-local modules assumed to be
# imported at the top of this script.
import argparse


def main():
    parser = argparse.ArgumentParser('Script to generate distribution of edge '
                                     'weights/degrees for all participants')
    parser.add_argument('-m', '-M', type=str, required=True,
                        help='location of the message file')
    parser.add_argument('-mt', '-MT', type=str, default='all',
                        help='types of messages to plot, currently supports '
                             'one of the following: sms, fb, twitter, or all')
    parser.add_argument('-r', '-R', type=str, required=True,
                        help='survey file')
    parser.add_argument('-s', '-S', type=str, required=True,
                        help='folder to store data in, leading / required')
    parser.add_argument('-p', '-P', action='store_true',
                        help='flag to generate plots')
    args = parser.parse_args()

    survey_file = args.r
    message_file = args.m
    m_type = args.mt
    folder_to_store = args.s
    generate_plots = args.p

    # week boundaries derived from the survey data
    wi = weeklyinfo()
    week_info = wi.getweeklyfo(survey_file)

    # filter the messages down to the requested type(s)
    ff = filterfields(message_file)
    filtered_data = []
    if m_type == 'all':
        for message_type in ['sms', 'fb_message']:
            filtered_data.extend(ff.filterbyequality(pr.m_type, message_type))
    else:
        filtered_data = ff.filterbyequality(pr.m_type, m_type)

    _, links_tuple, _, pid_dict = hlp.creategraph(filtered_data, filterType=m_type)

    # weekly degree and edge-weight distributions, dumped to disk
    gh = ghelper()
    plt = plots()
    weekly_deg_dist, _ = gh.getweeklydistributions(pid_dict, filtered_data, message_type=m_type,
                                                   is_degree=True, week_info=week_info)
    hlp.dumpvariable(weekly_deg_dist, 'weekly_deg_dist.dict', folder_to_store)
    weekly_ew_dist, _ = gh.getweeklydistributions(pid_dict, filtered_data, message_type=m_type,
                                                  is_degree=False, week_info=week_info)
    hlp.dumpvariable(weekly_ew_dist, 'weekly_ew_dist.dict', folder_to_store)

    if generate_plots:
        plt.plotweeklyprogression(weekly_deg_dist, folder_to_store + 'deg_',
                                  'No. of friends', 'Week No.', 'Friends')
        plt.plotweeklyprogression(weekly_ew_dist, folder_to_store + 'ew_',
                                  'No. of messages exchanged', 'Week No.', 'Messages')

    print 'done...'
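# Example invocation (a sketch: the script name and file paths are
# hypothetical, the flags are the ones defined in main() above):
#   python weekly_distributions.py -m messages.csv -mt all -r survey.csv -s /data/output/ -p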
# Script excerpt; argparse is standard library, while hlp, pr, weeklyinfo,
# filterfields, ghelper, plots and vadersenti are project-local modules assumed
# to be imported at the top of this script.
import argparse


def main():
    parser = argparse.ArgumentParser('Script to perform sentiment analysis using VADER')
    parser.add_argument('-m', '-M', type=str, required=True,
                        help='Location of the message file')
    parser.add_argument('-mt', '-MT', type=str, required=True, nargs='+',
                        help='types of messages to filter')
    parser.add_argument('-f', '-F', type=str, required=True,
                        help='filename where data is stored, no extension needed')
    parser.add_argument('-s', '-S', type=str, required=True,
                        help='location of folder to store the file, ends with a /')
    parser.add_argument('-p', '-P', action='store_true',
                        help='flag to store polarities separately')
    parser.add_argument('-w', '-W', type=str, required=False,
                        help='conduct weekly analysis, path to the survey data for '
                             'creating week information')
    parser.add_argument('-l', '-L', type=str, nargs='+', required=True,
                        help='the filters to use, make one or more choices: seenB, wasB, didB')
    parser.add_argument('-lf', '-LF', type=str, nargs='+', required=True,
                        help='location of filtered data, from runSurveyStats.py, '
                             'in same order as -l/L flag')
    args = parser.parse_args()

    message_file = args.m
    message_types = args.mt
    filename_to_store = args.f
    location_to_store = args.s
    separate_polarity_score = args.p
    survey_file = args.w
    filters_chosen = args.l
    filter_files = args.lf

    catch_all_data = hlp.getfilterdata(filters_chosen, filter_files, catch_all=True)

    if separate_polarity_score and survey_file is not None:
        print 'Cannot have separate polarity scores and weekly analysis together, ' \
              'please remove the -p/-P flag'
        return

    if survey_file is not None:
        wi = weeklyinfo()
        week_dates = wi.getweeklyfo(survey_file)
        gh = ghelper()

    # gather the messages of the requested types
    ff = filterfields(message_file)
    data = []
    for message_type in message_types:
        data.extend(ff.filterbyequality(pr.m_type, message_type))

    # single label used wherever a message-type name is expected downstream
    mt_label = 'all' if len(message_types) > 1 else message_types[0]
    pid_dict = hlp.getuniqueparticipants(data, mt_label)

    sentiment_analyzer = vadersenti(data[1:])
    returned_data = sentiment_analyzer.compilesentiment(pr.m_content,
                                                        separate_sentiment_list=separate_polarity_score)
    if separate_polarity_score:
        hlp.dumpvariable(returned_data, filename_to_store + '.data', location_to_store)
    else:
        header = pr.message_header + ['pos', 'neg', 'neu', 'compound']
        final_data = [header] + returned_data
        hlp.writecsv(final_data, location_to_store + filename_to_store + '.csv')
        # weekly analysis only applies when a survey file was provided,
        # since week_dates and gh only exist in that case
        if survey_file is not None:
            weekly_data = gh.filterweeklydata(pid_dict, returned_data, week_dates, mt_label)
            hlp.dumpvariable(weekly_data, 'weekly_data.dict', location_to_store)
            summarized_sentiment = {}
            for pid in weekly_data:
                summarized_sentiment[pid] = {}
                participant_data = weekly_data[pid]
                for week_no in participant_data:
                    summarized_sentiment[pid][week_no] = \
                        sentiment_analyzer.summarizesentiment(participant_data[week_no],
                                                              separate_in_out=True,
                                                              message_type=mt_label)
            hlp.dumpvariable(summarized_sentiment, 'weekly_summarized_sentiment.dict', location_to_store)
            plt = plots()
            overlay_data = gh.createbullyingoverlay(catch_all_data, week_dates, ff)
            plt.plotweeklyprogression(summarized_sentiment, location_to_store, 'Sentiment Progress',
                                      'Week', 'Sentiment Value',
                                      sentiment_legend=['Positive', 'Negative', 'Neutral'],
                                      overlay_data=overlay_data)

    print 'done'
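# Example invocation (a sketch: script name and paths are hypothetical, the
# flags are the ones defined in main() above):
#   python vader_sentiment.py -m messages.csv -mt sms fb_message -f sentiment_scores \
#       -s /data/output/ -w survey.csv -l seenB wasB -lf seenB_data.p wasB_data.p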
# Script excerpt; argparse and deepcopy are standard library, while hlp, pr,
# weeklyinfo, filterfields, ghelper and raw_features, along with the helpers
# flip_bullying_overlay, get_pid_level_features and get_week_features, are
# assumed to be imported/defined elsewhere in this script.
import argparse
from copy import deepcopy


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '-M', type=str, required=True,
                        help='Message list file')
    parser.add_argument('-r', '-R', type=str, required=True,
                        help='survey file')
    parser.add_argument('-p', '-P', type=str, required=True,
                        help='PID dict inverted')
    parser.add_argument('-b', '-B', type=str, required=True,
                        help='bullying dictionary')
    parser.add_argument('-o', '-O', type=str, required=True,
                        help='Output folder')
    parser.add_argument('-l', '-L', type=str, nargs='+',
                        help='Filters chosen')
    parser.add_argument('-f', '-F', type=str, nargs='+',
                        help='Filter files')
    args = parser.parse_args()

    output_folder = args.o
    message_data = hlp.recovervariable(args.m)
    pid_dict = hlp.recovervariable(args.p)
    filters_chosen = args.l
    filter_files = args.f
    catch_all_data = hlp.getfilterdata(filters_chosen, filter_files, catch_all=True)

    wi = weeklyinfo()
    weekly_info = wi.getweeklyfo(args.r)
    ff = filterfields()
    gh = ghelper()
    bullying_overlay = gh.createbullyingoverlay(catch_all_data, weekly_info, ff)
    bullying_overlay = flip_bullying_overlay(bullying_overlay, weekly_info.keys())

    pid_list = pid_dict.keys()
    pid_list.sort()

    # leave-one-participant-out: each pid in turn becomes the testing set
    for pid in pid_list:
        training_set_final = []
        testing_set_final = []
        pid_list_training = deepcopy(pid_list)
        pid_list_training.remove(pid)

        # messages sent or received by the held-out pid form the testing data;
        # everything that remains is the training data
        ff.setdata(message_data)
        testing_raw_data = ff.filterbyequality(pr.m_source, pid_dict[pid]) + \
                           ff.filterbyequality(pr.m_target, pid_dict[pid])
        ff.removebyequality(pr.m_source, pid_dict[pid])
        ff.removebyequality(pr.m_target, pid_dict[pid])
        training_raw_data = ff.getdata()

        # scoring factors are learned on the training data only
        fe = raw_features(data=None)
        _, _ = fe.get_scoring_factors(training_raw_data)

        training_weekly_data = {}
        for training_pid in pid_list_training:
            training_weekly_data[training_pid] = {}
            data_to_use = ff.filterbyequality(pr.m_source, pid_dict[training_pid]) + \
                          ff.filterbyequality(pr.m_target, pid_dict[training_pid])
            if 0 == len(data_to_use):
                print 'no data found, probably filtered into the testing set, Training PID: ' + \
                      training_pid + ', Testing PID: ' + pid
                continue
            pid_weekly_w_bullying, global_in_degree, global_out_degree, global_in_ew, global_out_ew, \
                incoming_ss, outgoing_ss = get_pid_level_features(data_to_use, weekly_info, ff,
                                                                  bullying_overlay, pid_dict,
                                                                  training_pid, fe)
            for week_no in pid_weekly_w_bullying:
                fr_in_degree, fr_out_degree, fr_in_ew, \
                    fr_out_ew, fr_in_senti, fr_out_senti, \
                    current_in_ss, current_out_ss = get_week_features(pid_weekly_w_bullying, week_no, fe,
                                                                      global_in_degree, global_out_degree,
                                                                      global_in_ew, global_out_ew,
                                                                      incoming_ss, outgoing_ss,
                                                                      pid_dict[training_pid])
                training_set_final.append(
                    [training_pid, week_no,
                     fr_in_senti[0], fr_in_senti[1], fr_in_senti[2],
                     fr_out_senti[0], fr_out_senti[1], fr_out_senti[2],
                     fr_in_degree, fr_out_degree, fr_in_ew, fr_out_ew,
                     current_in_ss, current_out_ss,
                     pid_weekly_w_bullying[week_no]['label']])

        # testing pid
        pid_weekly_w_bullying, global_in_degree, global_out_degree, \
            global_in_ew, global_out_ew, incoming_ss, outgoing_ss = \
            get_pid_level_features(testing_raw_data, weekly_info, ff, bullying_overlay,
                                   pid_dict, pid, fe)
        for week_no in pid_weekly_w_bullying:
            fr_in_degree, fr_out_degree, fr_in_ew, \
                fr_out_ew, fr_in_senti, fr_out_senti, \
                current_in_ss, current_out_ss = get_week_features(pid_weekly_w_bullying, week_no, fe,
                                                                  global_in_degree, global_out_degree,
                                                                  global_in_ew, global_out_ew,
                                                                  incoming_ss, outgoing_ss, pid_dict[pid])
            testing_set_final.append(
                [pid, week_no,
                 fr_in_senti[0], fr_in_senti[1], fr_in_senti[2],
                 fr_out_senti[0], fr_out_senti[1], fr_out_senti[2],
                 fr_in_degree, fr_out_degree, fr_in_ew, fr_out_ew,
                 current_in_ss, current_out_ss,
                 pid_weekly_w_bullying[week_no]['label']])

        # write one training/testing CSV pair per held-out participant
        header = ['pid', 'wkno',
                  'frWInSenPos', 'frWInSenNeu', 'frWInSenNeg',
                  'frWOutSenPos', 'frWOutSenNeu', 'frWOutSenNeg',
                  'frInDegO', 'frOutDegO', 'frInEdgeO', 'frOutEdgeO',
                  'inSenSc', 'outSenSc', 'label']
        training_set_final = [header] + training_set_final
        testing_set_final = [header] + testing_set_final
        hlp.writecsv(training_set_final, output_folder + pid + '_tr.csv')
        hlp.writecsv(testing_set_final, output_folder + pid + '_ts.csv')
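# The per-pid loop above implements leave-one-participant-out evaluation. A
# minimal, self-contained sketch of that split (illustrative names only, not
# the project's API):
def leave_one_participant_out(pid_list):
    """Yield (training_pids, testing_pid) pairs, one pair per participant."""
    for testing_pid in pid_list:
        training_pids = [p for p in pid_list if p != testing_pid]
        yield training_pids, testing_pid

# e.g. for pid_list = ['P1', 'P2', 'P3'] the first pair is (['P2', 'P3'], 'P1')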
# Script excerpt; argparse and os are standard library, while hlp, pr, e (error
# strings), weeklyinfo, filterfields, ghelper and plots are project-local
# modules assumed to be imported at the top of this script.
import argparse
import os


def main():
    parser = argparse.ArgumentParser('Script to generate a CDF comparing the degrees of our participants')
    parser.add_argument('-l', '-L', type=str, nargs='+', required=True,
                        help='the filters to use, make one or more choices: seenB, wasB, didB')
    parser.add_argument('-f', '-F', type=str, nargs='+', required=True,
                        help='location of filtered data, from runSurveyStats.py, '
                             'in the same order as -l/L flag')
    parser.add_argument('-m', '-M', type=str, required=True,
                        help='location of the message file')
    parser.add_argument('-mt', '-MT', type=str, default='sms',
                        help='type of message we are filtering, default: sms')
    parser.add_argument('-n', '-N', action='store_true',
                        help='flag indicates that processing should include participants which did not '
                             'witness anything mentioned in the values passed for flags -l/L')
    parser.add_argument('-a', '-A', action='store_true',
                        help='flag indicates that processing should include a plot of all participants')
    parser.add_argument('-s', '-S', type=str, required=True,
                        help='folder to store in, leading /')
    parser.add_argument('-r', '-R', type=str, required=True,
                        help='survey file')
    args = parser.parse_args()

    filters_chosen = args.l
    for filter_v in filters_chosen:
        if filter_v not in ['seenB', 'didB', 'wasB']:
            raise Exception('filter value was not from the ones specified')
    filter_files = args.f
    assert len(filter_files) == len(filters_chosen), e.len_filter_file_ne_len_filters_chosen
    include_other_participants = args.n
    include_all_participants = args.a
    location_to_store = args.s
    if not os.path.exists(location_to_store):
        os.mkdir(location_to_store)
    message_file = args.m
    message_type = args.mt
    survey_file = args.r

    wi = weeklyinfo()
    week_info = wi.getweeklyfo(survey_file)
    gh = ghelper()
    plt = plots()

    # get the filtered messages
    ff = filterfields(message_file)
    filtered_data = []
    if message_type == 'all':
        # separate loop variable so message_type still reads 'all' below
        for mtype in ['sms', 'fb', 'twitter']:
            filtered_data.extend(ff.filterbyequality(pr.m_type, mtype))
    else:
        filtered_data = ff.filterbyequality(pr.m_type, message_type)

    # generate the links and the graph for the filtered data
    links, links_tuple, graph_obj, pid_dict = hlp.creategraph(filtered_data, filterType=message_type)

    # get the pids from the chosen filters
    bullying_pid_dict = hlp.getfilterdata(filters_chosen, filter_files)
    cumulative_bully_pid = hlp.getfilterdata(filters_chosen, filter_files, cumulative_list=True)

    # get all the information from the filters
    catch_all_data = hlp.getfilterdata(filters_chosen, filter_files, catch_all=True)

    # generate the distributions for in degree/edge weight and tabulate them
    in_distributions = gh.generatedistributions(graph_obj, bullying_pid_dict, include_all_participants,
                                                include_other_participants, pid_dict, message_type,
                                                cumulative_bully_pid, in_dist=True)
    in_distributions_ew = gh.generatedistributions(graph_obj, bullying_pid_dict, include_all_participants,
                                                   include_other_participants, pid_dict, message_type,
                                                   cumulative_bully_pid, in_dist=True, is_degree=False)
    plt.generatetablehist(in_distributions, location_to_store + 'in_degree_table.csv', generate_totals=True)
    plt.generatetablehist(in_distributions_ew, location_to_store + 'in_edge_weight.csv', generate_totals=True)

    # generate the distributions for out degree/edge weight and tabulate them
    out_distributions = gh.generatedistributions(graph_obj, bullying_pid_dict, include_all_participants,
                                                 include_other_participants, pid_dict, message_type,
                                                 cumulative_bully_pid, in_dist=False)
    out_distributions_ew = gh.generatedistributions(graph_obj, bullying_pid_dict, include_all_participants,
                                                    include_other_participants, pid_dict, message_type,
                                                    cumulative_bully_pid, in_dist=False,
                                                    is_degree=False)  # edge weights, mirroring the in-dist call
    plt.generatetablehist(out_distributions, location_to_store + 'out_degree_table.csv', generate_totals=True)
    plt.generatetablehist(out_distributions_ew, location_to_store + 'out_edge_weight.csv', generate_totals=True)

    # line plot of degrees
    weekly_dist_degrees, _ = gh.getweeklydistributions(pid_dict, filtered_data, message_type=message_type,
                                                       is_degree=True, week_info=week_info)
    overlay_info = gh.createbullyingoverlay(catch_all_data, week_info, ff)
    plt.plotweeklyprogression(weekly_dist_degrees, location_to_store + 'deg_', 'No. of friends',
                              'Week No.', 'Friends', overlay_data=overlay_info)

    # line plot of weights
    weekly_dist_ew, _ = gh.getweeklydistributions(pid_dict, filtered_data, message_type=message_type,
                                                  is_degree=False, week_info=week_info)
    overlay_info = gh.createbullyingoverlay(catch_all_data, week_info, ff)
    plt.plotweeklyprogression(weekly_dist_ew, location_to_store + 'ew_', 'No. of messages exchanged',
                              'Week No.', 'Messages', overlay_data=overlay_info)

    print 'TADAAA!'
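# Example invocation (a sketch: script name and paths are hypothetical, the
# flags are the ones defined in main() above):
#   python degree_cdf.py -l seenB wasB -f seenB_data.p wasB_data.p -m messages.csv \
#       -mt all -s /data/output/ -r survey.csv -n -a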