def main():
    parser = argparse.ArgumentParser('Filter out people who have 0 communication weeks greater than the threshold')
    parser.add_argument('-f', '-F', type=str, required=True)
    parser.add_argument('-ti', '-TI', type=int, required=True, help='Incoming threshold')
    parser.add_argument('-to', '-TO', type=int, required=True, help='Outgoing threshold')
    parser.add_argument('-s', '-S', type=str, required=True, help='storage folder with /')
    parser.add_argument('-sf', '-SF', type=str, required=True, help='file name for storage')
    args = parser.parse_args()

    flipped_dict = hlp.recovervariable(args.f)
    incoming_th = args.ti
    outgoing_th = args.to
    location_to_store = args.s
    filename = args.sf

    to_remove = []
    for pid in flipped_dict:
        if flipped_dict[pid][0] >= incoming_th and flipped_dict[pid][1] >= outgoing_th:
            to_remove.append(pid)
            print 'REMOVED: ', pid, flipped_dict[pid]
        else:
            print 'NOT REMOVED: ', pid, flipped_dict[pid]
    print 'Removed ', len(to_remove), ' out of a total of ', len(flipped_dict), ' participants'
    hlp.dumpvariable(to_remove, filename, location_to_store)
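# The hlp.dumpvariable / hlp.recovervariable calls used throughout these scripts
# are assumed to be thin pickle wrappers living in the project's helper module;
# a minimal sketch under that assumption (names and signatures inferred from the
# call sites in this file, not confirmed by the source):
import cPickle as pickle

def dumpvariable(variable, filename, location_to_store=''):
    # serialize any picklable object to <location_to_store><filename>
    with open(location_to_store + filename, 'wb') as f:
        pickle.dump(variable, f)

def recovervariable(filepath):
    # load an object previously written by dumpvariable
    with open(filepath, 'rb') as f:
        return pickle.load(f)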
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '-M', required=True, help='Message file')
    parser.add_argument('-p', '-P', action='store_true')
    parser.add_argument('-s', '-S', required=True, help='filename to store polarity in, no extension needed')
    parser.add_argument('-f', '-F', required=True, help='folder to store the files in, ending with /')
    parser.add_argument('-n', '-N', required=False, nargs=2, type=int, default=[0, 2],
                        help='the neutral threshold, first value is min, second is max')
    args = parser.parse_args()

    messagefile = args.m
    location_to_store = args.f
    file_to_store = args.s
    separate_sentiment = args.p
    neutral_limit = args.n

    message_data = hlp.readcsv(messagefile)
    message_header = message_data[0]
    message_data = message_data[1:]

    # score every message with AFINN, using the given neutral band
    afinn = afinnsenti(data=message_data, neutral_threshold=neutral_limit)
    data = afinn.compilesentiment(separate_sentiment_list=separate_sentiment, field_no=nd.m_content)
    if separate_sentiment:
        hlp.dumpvariable(data, file_to_store + '.list', location_to_store)
    else:
        message_header.append('score')
        message_header.append('label')
        final_data = [message_header] + data
        hlp.writecsv(final_data, location_to_store + file_to_store + '.csv', delimiter_sym=',')
def individual_reciprocity_analysis(labelled_data, pid_dict, location_to_store):
    reciprocity_info = {}
    ff = filterfields()
    ff.setdata(labelled_data)
    polarity_data = {}
    for pid in pid_dict:
        print 'Working with PID: ', pid, '(', pid_dict[pid], ')'
        messages_by_participant = ff.filterbyequality(pr.m_source, pid)
        messages_to_participant = ff.filterbyequality(pr.m_target, pid)
        polarity_data[pid] = __get_polarity_composition(messages_by_participant + messages_to_participant, pid)
        reciprocity_info[pid] = {}
        n = len(messages_by_participant)
        idx = 0
        for message in messages_by_participant:
            print 'idx=' + str(idx) + '/' + str(n)
            idx += 1
            # find the reply (if any) closest in time to this sent message
            closest_message = find_closest_message(message, messages_to_participant, ff)
            target_type = 'P' if message[pr.m_target_type] == 'participant' else 'NP'
            target = message[pr.m_target]
            if target_type not in reciprocity_info[pid]:
                reciprocity_info[pid][target_type] = {}
            if target not in reciprocity_info[pid][target_type]:
                reciprocity_info[pid][target_type][target] = __basic_reciprocity_dict()
            sent_message_type = message[-1]
            reply_message_type = 'X' if closest_message is None else closest_message[-1]
            reciprocity_info[pid][target_type][target][sent_message_type][reply_message_type] += 1
        print 'saving checkpoint...'
        hlp.dumpvariable([reciprocity_info, pid, pid_dict], 'checkpoint.chp', location_to_store)
        print 'saved!'
    return reciprocity_info, polarity_data
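# Hedged sketch of the __basic_reciprocity_dict helper referenced above; its
# shape is inferred from the reciprocity_dict literal in find_reciprocity
# below (reply polarity P/N/U, or X for no reply, counted per sent polarity):
def __basic_reciprocity_dict():
    return {'P': {'P': 0, 'N': 0, 'U': 0, 'X': 0},
            'N': {'P': 0, 'N': 0, 'U': 0, 'X': 0},
            'U': {'P': 0, 'N': 0, 'U': 0, 'X': 0}}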
def find_reciprocity(labelled_data, location_to_store):
    ff = filterfields()
    ff.setdata(labelled_data)
    messages_sent_by_participants = ff.filterbyequality(pr.m_source_type, 'participant')
    # counts of reply polarity (P/N/U, or X for no reply) conditioned on sent polarity
    reciprocity_dict = {'P': {'P': 0, 'U': 0, 'N': 0, 'X': 0},
                        'N': {'P': 0, 'U': 0, 'N': 0, 'X': 0},
                        'U': {'P': 0, 'U': 0, 'N': 0, 'X': 0}}
    n = len(messages_sent_by_participants)
    idx = 1
    message_pairs = []
    for message in messages_sent_by_participants:
        print 'at message ', idx, ' of ', n
        idx += 1
        reply_message = find_closest_message(message, ff)
        sent_message_type = message[-1]
        if reply_message is None:
            reply_message_type = 'X'
        else:
            reply_message_type = reply_message[-1]
        reciprocity_dict[sent_message_type][reply_message_type] += 1
        message_pairs.append((message, reply_message))
        if idx % 500 == 0:
            print 'saving...'
            hlp.dumpvariable([idx, reciprocity_dict, message_pairs, messages_sent_by_participants],
                             'checkpoint.chp', location_to_store)
    print 'done... out of the loop'
    to_use = {'P': '+', 'N': '-', 'U': 'u', 'X': 'null'}
    for sent_type in reciprocity_dict:
        recvd_types = reciprocity_dict[sent_type]
        for recvd_type in recvd_types:
            print 'N(' + to_use[recvd_type] + '|' + to_use[sent_type] + ')=', recvd_types[recvd_type]
    return reciprocity_dict, message_pairs
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '-M', required=True, help='Message file')
    parser.add_argument('-p', '-P', action='store_true')
    parser.add_argument('-s', '-S', required=True, help='filename to store polarity in, no extension needed')
    parser.add_argument('-f', '-F', required=True, help='folder to store the files in, ending with /')
    args = parser.parse_args()

    messagefile = args.m
    location_to_store = args.f
    file_to_store = args.s
    separate_sentiment = args.p

    message_data = hlp.readcsv(messagefile)
    message_header = message_data[0]
    message_data = message_data[1:]

    vader = vadersenti(data=message_data)
    data = vader.compilesentiment(separate_sentiment_list=separate_sentiment)
    if separate_sentiment:
        hlp.dumpvariable(data, file_to_store + '.list', location_to_store)
    else:
        message_header.extend(['pos', 'neg', 'neu', 'compound'])
        final_data = [message_header] + data
        hlp.writecsv(final_data, location_to_store + file_to_store + '.csv', delimiter_sym=',')
def main():
    parser = argparse.ArgumentParser('Script to generate statistics about message types')
    # add arguments
    parser.add_argument('-d', '-D', type=str, required=True, help='location of file to work with')
    parser.add_argument('-s', '-S', type=str, required=True, help='folder to store the results, ending with /')
    parser.add_argument('-f', '-F', type=str, required=True, help='filename to store data in')
    parser.add_argument('-w', '-W', type=int, default=0,
                        help='what threshold to classify missing, default 0, integer value needed')
    # get arguments
    args = parser.parse_args()
    filename = args.d
    threshold_missing = args.w
    location_to_store = args.s
    filepath = args.f

    data = hlp.recovervariable(filename)
    missing_week_dict, per_week_msgs = hlp.missingweeks(data, threshold_value=threshold_missing)
    flipped_dict = flipdict(missing_week_dict)
    printsummary(missing_week_dict,
                 'No. of participants with less than ' + str(threshold_missing) + ' data points in ',
                 len(data.keys()), per_week_msgs)
    hlp.dumpvariable(missing_week_dict, filepath, location_to_store)
    hlp.dumpvariable(flipped_dict, 'flipped_' + filepath, location_to_store)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--messageFile', type=str, required=True)
    parser.add_argument('-mt', '--messageTypes', type=str, nargs='+')
    parser.add_argument('-o', '--outputFolder', type=str, required=True)
    parser.add_argument('-of', '--outputFile', type=str, required=True)
    parser.add_argument('-pd', '--participantDictionary', type=str)
    parser.add_argument('-i', '--ignoreParticipants', type=str)
    parser.add_argument('-mc', '--messageTypeConvert', type=str, nargs='*')
    args = parser.parse_args()

    message_file = args.messageFile
    message_types = args.messageTypes
    output_folder = args.outputFolder
    output_file = args.outputFile
    pid_dict = args.participantDictionary
    ignore_pids = args.ignoreParticipants
    message_type_conversions = args.messageTypeConvert

    ff = filterfields(message_file)
    ff.setdata(ff.getdata()[1:])  # drop the csv header row
    to_set_data = []

    # extract the relevant data
    for message_type in message_types:
        to_set_data.extend(ff.filterbyequality(pr.m_type, message_type))
    ff.setdata(to_set_data)

    if ignore_pids is not None:
        ignore_pids = hlp.recovervariable(ignore_pids)
        for pid in ignore_pids:
            ff.removebyequality(pr.m_source, pid)
            ff.removebyequality(pr.m_target, pid)

    # set the pid to normal id dictionary
    if pid_dict is None:
        pid_dict = hlp.getuniqueparticipants(ff.getdata(), mtype='all', separate_pid_npid=True)

    # replace the message type names with the ones provided
    if message_type_conversions is not None:
        for idx in range(0, len(message_type_conversions), 2):
            message_to_convert = message_type_conversions[idx]
            to_convert_to = message_type_conversions[idx + 1]
            ff.replacebyequality(pr.m_type, message_to_convert, to_convert_to)

    message_types = ff.getuniqueelements(pr.m_type)
    coded_participant_list = pid_dict[pr.participant['all']].values()
    storage_dict = initiatestorage(coded_participant_list, message_types)
    storage_dict = getperparticipantinout(ff.getdata(), storage_dict, pid_dict)
    plotperparticipantbar(storage_dict, 'Participant ID', '# of Messages', message_types,
                          'Per Participant Messages', output_folder + output_file)
    hlp.dumpvariable(pid_dict, 'pid_dict.dict', output_folder)
    hlp.dumpvariable(ff.getdata(), 'messageData.list', output_folder)
def main():
    # positional arguments (inferred from the recovervariable calls below):
    # [1] messages pickle, [2] pid_dict pickle, [3] week_dict pickle, [4] message type
    messages = hlp.recovervariable(sys.argv[1])
    pid_dict = hlp.recovervariable(sys.argv[2])
    week_dict = hlp.recovervariable(sys.argv[3])
    m_type = sys.argv[4]
    participants = pid_dict[pr.participant[m_type]]
    non_participants = pid_dict[pr.nparticipant[m_type]]
    graph_objs = weekgraphs(week_dict, participants, non_participants)
    hlp.dumpvariable(graph_objs, 'week_graph_objs')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-w', '-W', type=str, required=True)
    parser.add_argument('-o', '-O', type=str, required=True)
    args = parser.parse_args()
    word_file = args.w
    output_folder = args.o

    with open(word_file, 'r') as f:
        d = f.readlines()
    word_list = [x.strip() for x in d]

    cf = canonical_form(word_list)
    words_in_dict, words_not_in_dict = cf.words_in_dict()
    print 'Total word list: ', len(word_list), ' Words present in dict: ', len(words_in_dict), \
        ' Not in dict: ', len(words_not_in_dict)

    to_write_in_dict = ''
    for word in words_in_dict:
        to_write_in_dict += word + '\n'

    # try to find canonical forms for the words the dictionary does not know
    cf.set_word_list(words_not_in_dict)
    correct_form, missed_words = cf.get_canonical_form()
    print 'Could not find canonical forms for ', len(missed_words), ' out of a total of ', len(words_not_in_dict)

    to_write_canonical = ''
    to_substitute = {}
    for right_form, other_values in correct_form.iteritems():
        to_write_canonical += right_form
        for word in other_values:
            to_write_canonical += ' ' + word
            to_substitute[word] = right_form
        to_write_canonical += '\n'

    to_write_missed = ''
    for word in missed_words:
        to_write_missed += word + '\n'

    with open(output_folder + 'found_in_dict.txt', 'w') as f:
        f.write(to_write_in_dict)
    with open(output_folder + 'canonical_form.txt', 'w') as f:
        f.write(to_write_canonical)
    with open(output_folder + 'not_found_anywhere.txt', 'w') as f:
        f.write(to_write_missed)
    hlp.dumpvariable(to_substitute, 'substitution_dict.dict', output_folder)
    print 'Done writing...'
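# Illustration of the files written above (words hypothetical): found_in_dict.txt
# holds one dictionary word per line; each line of canonical_form.txt is
# "<canonical> <variant> <variant> ...", e.g. "hello helo helloo"; and
# substitution_dict.dict maps each variant back to its canonical form,
# e.g. {'helo': 'hello', 'helloo': 'hello'}.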
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '-D', required=True, help='labelled data from validate_balance_theory.py')
    parser.add_argument('-f', '-F', required=True, help='folder to save the data in')
    parser.add_argument('-w', '-W', required=False, help='survey file for weekly data processing')
    args = parser.parse_args()
    data_file = args.d
    location_to_store = args.f
    weekly_surveys = args.w

    all_data = hlp.recovervariable(data_file)
    labelled_data = all_data[2]
    pid_dict = all_data[3]

    if weekly_surveys is None:
        reciprocity_info, polarity_info = individual_reciprocity_analysis(labelled_data,
                                                                          pid_dict['participants'],
                                                                          location_to_store)
        analyze_info(reciprocity_info, pid_dict, location_to_store, 'pr_overall.csv')
        analyze_polarity(polarity_info, pid_dict, location_to_store, 'polarity_overall.csv')
        hlp.dumpvariable([reciprocity_info, labelled_data, pid_dict, polarity_info],
                         'reciprocity_info_overall.dict', location_to_store)
    else:
        # working with bimonthly data
        months2 = [[1, 2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20, 21, 22, 23, 24, 25]]
        wi = weeklyinfo()
        weekly_info = wi.getweeklyfo(weekly_surveys)
        ff = filterfields()
        weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff)
        idx = 1
        for bi_month in months2:
            print 'For weeks: ', bi_month
            bi_month_data = []
            for weekno in bi_month:
                bi_month_data.extend(weekly_data[weekno])
            reciprocity_info, polarity_info = individual_reciprocity_analysis(bi_month_data,
                                                                              pid_dict['participants'],
                                                                              location_to_store)
            analyze_info(reciprocity_info, pid_dict, location_to_store, 'pr_bimonthly_' + str(idx) + '.csv')
            analyze_polarity(polarity_info, pid_dict, location_to_store,
                             'polarity_bimonthly_' + str(idx) + '.csv')
            hlp.dumpvariable([reciprocity_info, labelled_data, pid_dict, polarity_info],
                             'reciprocity_info_bimonthly_' + str(idx) + '.data', location_to_store)
            idx += 1
    print 'tadaa!'
def main():
    parser = argparse.ArgumentParser('Script to generate distribution of edge weights/degrees '
                                     'for all participants')
    parser.add_argument('-m', '-M', type=str, required=True, help='location of the message file')
    parser.add_argument('-mt', '-MT', type=str, default='all',
                        help='types of messages to plot, currently supports one of the following: '
                             'sms, fb, twitter, or all')
    parser.add_argument('-r', '-R', type=str, required=True, help='survey file')
    parser.add_argument('-s', '-S', type=str, required=True, help='folder to store data in, leading / required')
    parser.add_argument('-p', '-P', action='store_true', help='flag to generate plots')
    args = parser.parse_args()

    survey_file = args.r
    message_file = args.m
    m_type = args.mt
    folder_to_store = args.s
    generate_plots = args.p

    wi = weeklyinfo()
    week_info = wi.getweeklyfo(survey_file)
    ff = filterfields(message_file)
    filtered_data = []
    if m_type == 'all':
        for message_type in ['sms', 'fb_message']:
            filtered_data.extend(ff.filterbyequality(pr.m_type, message_type))
    else:
        filtered_data = ff.filterbyequality(pr.m_type, m_type)

    _, links_tuple, _, pid_dict = hlp.creategraph(filtered_data, filterType=args.mt)
    gh = ghelper()
    plt = plots()
    weekly_deg_dist, _ = gh.getweeklydistributions(pid_dict, filtered_data, message_type=args.mt,
                                                   is_degree=True, week_info=week_info)
    hlp.dumpvariable(weekly_deg_dist, 'weekly_deg_dist.dict', folder_to_store)
    weekly_ew_dist, _ = gh.getweeklydistributions(pid_dict, filtered_data, message_type=args.mt,
                                                  is_degree=False, week_info=week_info)
    hlp.dumpvariable(weekly_ew_dist, 'weekly_ew_dist.dict', folder_to_store)
    if generate_plots:
        plt.plotweeklyprogression(weekly_deg_dist, folder_to_store + 'deg_', 'No. of friends',
                                  'Week No.', 'Friends')
        plt.plotweeklyprogression(weekly_ew_dist, folder_to_store + 'ew_', 'No. of messages exchanged',
                                  'Week No.', 'Messages')
    print 'done...'
def main(sql_path, variable_path):
    s_obj = surveys()
    # raw survey rows exported from the sql database
    data = s_obj.importsqlascsv(sql_path, 'survey')
    hlp.dumpvariable(data, 'survey_list.list', variable_path)
    hlp.writecsv(data, variable_path + 'survey_list.csv')

    # interpreted answers, without and with the raw responses retained
    ndata = s_obj.interpretanswers(data)
    hlp.dumpvariable(ndata, 'survey_list_interpret.list', variable_path)
    hlp.writecsv(ndata, variable_path + 'survey_list_interpret.csv')
    ndata_wR = s_obj.interpretanswers(data, True)
    hlp.dumpvariable(ndata_wR, 'survey_list_with_response_interpret.list', variable_path)
    hlp.writecsv(ndata_wR, variable_path + 'survey_list_with_response_interpret.csv')

    data_dict = s_obj.datatodict(ndata)
    hlp.dumpvariable(data_dict, 'survey_dict_interpret.dict', variable_path)
    data_wR_dict = s_obj.datatodict(ndata_wR)
    hlp.dumpvariable(data_wR_dict, 'survey_dict_with_response_interpret.dict', variable_path)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '-D', required=True, help='labelled data from validate_balance_theory.py')
    parser.add_argument('-f', '-F', required=True, help='folder to save the data in')
    parser.add_argument('-w', '-W', required=False, help='survey file for weekly data processing')
    args = parser.parse_args()
    data_file = args.d
    location_to_store = args.f
    weekly_surveys = args.w

    all_data = hlp.recovervariable(data_file)
    labelled_data = all_data[2]
    pid_dict = all_data[3]

    if weekly_surveys is None:
        reciprocity_dict, message_pairs = find_reciprocity(labelled_data, location_to_store)
        hlp.dumpvariable([reciprocity_dict, message_pairs], 'reciprocity_counts_msgPairs_overall',
                         location_to_store)
    else:
        months2 = [[1, 2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20, 21, 22, 23, 24, 25]]
        wi = weeklyinfo()
        weekly_info = wi.getweeklyfo(weekly_surveys)
        ff = filterfields()
        weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff)
        idx = 1
        for bi_month in months2:
            print 'For weeks: ', bi_month
            bi_month_data = []
            for weekno in bi_month:
                bi_month_data.extend(weekly_data[weekno])
            reciprocity_dict, message_pairs = find_reciprocity(bi_month_data, location_to_store)
            hlp.dumpvariable([reciprocity_dict, message_pairs],
                             'reciprocity_counts_msgPairs_bimonthly_' + str(idx) + '.data', location_to_store)
            idx += 1  # advance the bimonthly file index
def main():
    parser = argparse.ArgumentParser('Script to process the survey data')
    parser.add_argument('-i', '-I', type=str, required=True, help='Path to the input dictionary')
    parser.add_argument('-q', '-Q', type=str, required=True, nargs=1,
                        help='Q Types - seenB: seen bullying, didB: did bullying, '
                             'other: others used my account, wasB: was bullied')
    parser.add_argument('-a', '-A', type=str, required=False, nargs='*',
                        help='optional, what answers to filter for')
    parser.add_argument('-s', '-S', type=str, required=True,
                        help='path to save the variables at, with leading /')
    parser.add_argument('-f', '-F', type=str)
    parser.add_argument('-f1q', '-F1Q', type=str, nargs=1, help='first level filter question')
    parser.add_argument('-f1a', '-F1A', type=str, nargs='*', help='first level filter answers', required=False)
    args = parser.parse_args()

    ip_filepath = args.i
    qno = args.q[0]
    answers = args.a
    op_filepath = args.s
    op_filename = args.f
    filterQ = args.f1q
    filterA = args.f1a

    print 'Processing...'
    res = filtersurvey(ip_filepath, qno, answers)
    to_save = {}
    print 'done'

    if filterQ is not None:
        filterQ = filterQ[0]
        print 'second level filtering argument exists, filtering...'
        for ans in res.keys():
            temp = filtersurvey(res[ans], filterQ, filterA, is_data=True)
            for ans1 in temp.keys():
                to_save[(ans, ans1)] = temp[ans1]
        print 'done'
    else:
        to_save = res
    hlp.dumpvariable(to_save, op_filename, op_filepath)
def main():
    parser = argparse.ArgumentParser('Script to generate statistics on bullying data')
    parser.add_argument('-i', '-I', type=str, required=True,
                        help='Path to the input dictionary containing bullying information')
    parser.add_argument('-m', '-M', type=str, required=True, help='Path to the messages file, should be a csv')
    parser.add_argument('-s', '-S', type=str, required=True,
                        help='Directory where results are stored, with a leading /')
    parser.add_argument('-f', '-F', type=str, required=True, help='File name')
    parser.add_argument('-p', '-P', type=str, required=True, help='Participant type')
    args = parser.parse_args()

    bullying_data = hlp.recovervariable(args.i)
    message_path = args.m
    save_dir = args.s
    save_f = args.f
    p_type = args.p

    res = {}
    for key in bullying_data.keys():
        res[key] = getstats(message_path, bullying_data[key], p_type)
    hlp.dumpvariable(res, save_f, save_dir)
def get_message_counts(old_dataset, new_dataset, sorted_week_list, weekly_info, hash_to_pid_dict,
                       ff_obj, location_to_store, do_debug):
    in_out_message_dict = {}
    for pid_hash in hash_to_pid_dict:
        print '\n\n'
        old_pid_out_week_counts, old_out, old_out_week = __get_weekly_counts(
            old_dataset, pr.m_source, pid_hash, weekly_info, ff_obj, sorted_week_list, pid_hash, True)
        old_pid_in_week_counts, old_in, old_in_week = __get_weekly_counts(
            old_dataset, pr.m_target, pid_hash, weekly_info, ff_obj, sorted_week_list, pid_hash, True)
        new_pid_out_week_counts, new_out, new_out_week = __get_weekly_counts(
            new_dataset, pr.m_source, pid_hash, weekly_info, ff_obj, sorted_week_list, pid_hash)
        new_pid_in_week_counts, new_in, new_in_week = __get_weekly_counts(
            new_dataset, pr.m_target, pid_hash, weekly_info, ff_obj, sorted_week_list, pid_hash)
        in_out_message_dict[hash_to_pid_dict[pid_hash]] = [[old_pid_in_week_counts, old_pid_out_week_counts],
                                                           [new_pid_in_week_counts, new_pid_out_week_counts]]
        print 'Sums: o_o, n_o, o_i, n_i: ', sum(old_pid_out_week_counts), sum(new_pid_out_week_counts), \
            sum(old_pid_in_week_counts), sum(new_pid_in_week_counts)
        print 'Checking the numbers for ' + hash_to_pid_dict[pid_hash] + '(' + str(pid_hash) + ')'
        for week in sorted_week_list:
            if len(old_out_week[week]) > len(new_out_week[week]):
                print '***For week ' + str(week) + ' found old_out_week > new_out_week: ', \
                    len(old_out_week[week]), len(new_out_week[week])
                if do_debug:
                    __old_new_compare(old_out_week[week], new_out_week[week])
            if len(old_in_week[week]) > len(new_in_week[week]):
                print '***For week ' + str(week) + ' found old_in_week > new_in_week: ', \
                    len(old_in_week[week]), len(new_in_week[week])
                if do_debug:
                    __old_new_compare(old_in_week[week], new_in_week[week])
        hlp.dumpvariable([old_out, old_out_week, old_in, old_in_week,
                          new_out, new_out_week, new_in, new_in_week],
                         hash_to_pid_dict[pid_hash] + '.data', location_to_store)
    return in_out_message_dict
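# Hedged sketch of the __old_new_compare debugging helper invoked above; it is
# assumed to print the messages that appear in the old weekly slice but are
# missing from the new one, so count mismatches can be inspected by hand:
def __old_new_compare(old_week_messages, new_week_messages):
    for message in old_week_messages:
        if message not in new_week_messages:
            print 'in old but not in new: ', message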
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '-F', type=str, required=True, help='weekly dict')
    parser.add_argument('-p', '-P', type=str, required=True, help='pid')
    parser.add_argument('-w', '-W', type=int, nargs='+', help='list of weeks')
    parser.add_argument('-o', '-O', type=str, help='folder to store the output')
    parser.add_argument('-s', '-S', action='store_true', help='separate out the incoming and outgoing messages')
    parser.add_argument('-io', type=str)
    args = parser.parse_args()

    week_dict_file = args.f
    pid = args.p
    weeks = args.w
    location_to_store = args.o
    separate_in_out = args.s
    show_in_out = args.io

    week_data_dict = hlp.recovervariable(week_dict_file)
    participant_data = {pid: {}}
    for week_no in weeks:
        reduced_data = getspecificdata(week_data_dict, pid, week_no, separate_in_out)
        if reduced_data is None:
            print 'No data found, or some error occurred...'
            continue
        participant_data[pid] = reduced_data
        print '\n\n\n\n\nData summary for PID:', pid, ' week_no: ', week_no
        printmessages(reduced_data, separate_in_out, show_in_out)
    if location_to_store is not None:
        hlp.dumpvariable(participant_data, pid + '.data', location_to_store)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '-D', required=True, help='labelled csv')
    parser.add_argument('-f', '-F', required=True, help='folder to save the data in')
    args = parser.parse_args()
    data_file = args.d
    location_to_store = args.f

    all_afinn_data = hlp.readcsv(data_file)
    labelled_data = hlp.processafinnsentiment(all_afinn_data)
    csv_header = ['pid', 'm_type',
                  'in_pos', 'in_neg', 'in_neu',
                  'out_pos', 'out_neg', 'out_neu',
                  'in_deg_part', 'in_deg_nonpart', 'out_deg_part', 'out_deg_nonpart']
    pol_dist, complete_in_out = distribution_polarity(labelled_data)
    print '***For Complete Dataset***'
    print 'Incoming(P, N, U): ', complete_in_out['in']
    print 'Outgoing(P, N, U): ', complete_in_out['out']
    hlp.dumpvariable([pol_dist, complete_in_out], 'polarity_in_out.dict', location_to_store)

    to_store_csv = [csv_header]
    for pid in pol_dist:
        pid_data = pol_dist[pid]
        for m_type in pid_data:
            m_data = pid_data[m_type]
            csv_line = __summarize_data(m_data)
            final_csv_line = [pid, m_type]
            final_csv_line.extend(csv_line)
            to_store_csv.append(final_csv_line)
    hlp.writecsv(to_store_csv, location_to_store + 'polarity_in_out.csv')
def main():
    # positional arguments (inferred from the usage below): [1] message file,
    # [2] csv output path or '-', [3] static graph output path or '-',
    # [4] dynamic graph file prefix or '-', [5] storage folder, [6] message type
    ff = filterfields(sys.argv[1])
    print 'filtering...'
    filtered_data = ff.filterbyequality(pr.m_type, sys.argv[6])
    hlp.dumpvariable(filtered_data, 'filtered_' + sys.argv[6], sys.argv[5])
    print 'done'
    if sys.argv[2] != '-':
        writecsv(sys.argv[2], filtered_data)
    if sys.argv[3] != '-':
        links, link_tuple, graph_obj, pid_dict = hlp.creategraph(filtered_data)
        hlp.dumpvariable(links, 'static_links', sys.argv[5])
        hlp.dumpvariable(link_tuple, 'static_links_tuple', sys.argv[5])
        hlp.dumpvariable(graph_obj, 'static_graph_obj', sys.argv[5])
        hlp.dumpvariable(pid_dict, 'pid_dict', sys.argv[5])
        graph_obj.writegraph(sys.argv[3])
    if sys.argv[4] != '-':
        to_write_edge, to_write_nodes, week_dict, pid_dict, week_content = hlp.creategraph(filtered_data, False)
        writetofile(sys.argv[4] + '_el.csv', to_write_edge)
        writetofile(sys.argv[4] + '_nl.csv', to_write_nodes)
        hlp.dumpvariable(week_dict, 'dynamic_week_dict', sys.argv[5])
        hlp.dumpvariable(pid_dict, 'pid_dict', sys.argv[5])
        hlp.dumpvariable(week_content, 'week_content', sys.argv[5])
random.seed(seed)
for datum in smsdata:
    ts_before.append(datum[pr.m_content])

data = []
s_obj = sentiment()
tr_set = s_obj.createtrainingset(tr_before)
ts_set = s_obj.createtestingset(ts_before, testing_has_labels=False)
print 'classifier training'
s_obj.trainclassifier(tr_set)
predictions = []
feature_set = []
print 'making predictions'
idx = 1
for datum in ts_set:
    # individualprediction is assumed to return an NLTK-style probability
    # distribution over the 'pos' and 'neg' labels
    res = s_obj.individualprediction(datum)
    print idx, ts_before[idx - 1], '***pos: ', res.prob('pos'), ' *** neg: ', res.prob('neg')
    smsdata[idx - 1].append(res)
    idx += 1
hlp.dumpvariable(smsdata, 'results')
print 'woot!'
def main():
    parser = argparse.ArgumentParser('Script to perform sentiment analysis using VADER')
    parser.add_argument('-m', '-M', type=str, required=True, help='Location of the message file')
    parser.add_argument('-mt', '-MT', type=str, required=True, nargs='+', help='types of messages to filter')
    parser.add_argument('-f', '-F', type=str, required=True,
                        help='filename where data is stored, no extension needed')
    parser.add_argument('-s', '-S', type=str, required=True,
                        help='location of folder to store the file, ends with a /')
    parser.add_argument('-p', '-P', action='store_true', help='flag to store polarities separately')
    parser.add_argument('-w', '-W', type=str, required=False,
                        help='conduct weekly analysis, path to the survey data for creating week information')
    parser.add_argument('-l', '-L', type=str, nargs='+', required=True,
                        help='the filters to use, make one or more choices: seenB, wasB, didB')
    parser.add_argument('-lf', '-LF', type=str, nargs='+', required=True,
                        help='location of filtered data, from runSurveyStats.py, in same order as -l/L flag')
    args = parser.parse_args()

    message_file = args.m
    message_types = args.mt
    filename_to_store = args.f
    location_to_store = args.s
    separate_polarity_score = args.p
    survey_file = args.w
    filters_chosen = args.l
    filter_files = args.lf

    catch_all_data = hlp.getfilterdata(filters_chosen, filter_files, catch_all=True)
    if separate_polarity_score and survey_file is not None:
        print 'Cannot have separate polarity scores and weekly analysis together, ' \
              'please remove the -p/-P flag'
        return

    if survey_file is not None:
        wi = weeklyinfo()
        week_dates = wi.getweeklyfo(survey_file)
        gh = ghelper()

    ff = filterfields(message_file)
    data = []
    for message_type in message_types:
        data.extend(ff.filterbyequality(pr.m_type, message_type))
    pid_dict = hlp.getuniqueparticipants(data, 'all' if len(message_types) > 1 else message_types[0])

    sentiment_analyzer = vadersenti(data[1:])
    returned_data = sentiment_analyzer.compilesentiment(pr.m_content,
                                                        separate_sentiment_list=separate_polarity_score)
    if separate_polarity_score:
        hlp.dumpvariable(returned_data, filename_to_store + '.data', location_to_store)
    else:
        header = pr.message_header + ['pos', 'neg', 'neu', 'compound']
        final_data = [header] + returned_data
        hlp.writecsv(final_data, location_to_store + filename_to_store + '.csv')

    if survey_file is not None:
        weekly_data = gh.filterweeklydata(pid_dict, returned_data, week_dates,
                                          'all' if len(message_types) > 1 else message_types[0])
        hlp.dumpvariable(weekly_data, 'weekly_data.dict', location_to_store)
        summarized_sentiment = {}
        for pid in weekly_data:
            summarized_sentiment[pid] = {}
            participant_data = weekly_data[pid]
            for week_no in participant_data:
                summarized_sentiment[pid][week_no] = sentiment_analyzer.summarizesentiment(
                    participant_data[week_no], separate_in_out=True, message_type=message_type)
        hlp.dumpvariable(summarized_sentiment, 'weekly_summarized_sentiment.dict', location_to_store)
        plt = plots()
        overlay_data = gh.createbullyingoverlay(catch_all_data, week_dates, ff)
        plt.plotweeklyprogression(summarized_sentiment, location_to_store, 'Sentiment Progress', 'Week',
                                  'Sentiment Value', sentiment_legend=['Positive', 'Negative', 'Neutral'],
                                  overlay_data=overlay_data)
    print 'done'
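# Example invocation (script and file names hypothetical; flags as defined above):
#   python vader_sentiment.py -m messages.csv -mt sms fb_message -f vader_scores \
#       -s results/ -w survey.csv -l seenB wasB -lf seenB.dict wasB.dict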
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '-M', required=True, help='Sentiment Message file')
    parser.add_argument('-t', '-T', action='store_true',
                        help='Sentiment type flag, if used then vader, else afinn')
    parser.add_argument('-f', '-F', required=True, help='Folder to store checkpoints, and final result')
    parser.add_argument('-w', '-W', required=False, help='Per week/month analysis')
    args = parser.parse_args()

    message_file = args.m
    sentiment_type = args.t
    location_to_store = args.f
    survey_file = args.w

    # get message data, only sms and fb_message
    ff = filterfields(message_file)
    ff.setdata(ff.getdata()[1:])
    sms_data = ff.filterbyequality(pr.m_type, 'sms')
    pid_dict_sms = hlp.getuniqueparticipants2(sms_data)
    fb_message_data = ff.filterbyequality(pr.m_type, 'fb_message')
    pid_dict_fb = hlp.getuniqueparticipants2(fb_message_data)
    message_data = sms_data + fb_message_data

    # put the labels on
    labelled_data = hlp.processvadersentiment(message_data, label_only=False) if sentiment_type else \
        hlp.processafinnsentiment(message_data, label_only=False)

    if survey_file is not None:
        wi = weeklyinfo()
        weekly_info = wi.getweeklyfo(survey_file)
        weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff)

    # get the pid_dict for easier handling
    pid_dict = hlp.getuniqueparticipants2(labelled_data)
    if survey_file is not None:
        over_sent, in_sent, out_sent, xtick, ytick = per_participant_sentiment(weekly_data,
                                                                               pid_dict['participants'])
        __plot_imshow(over_sent, 'Participant', 'Week #', xtick, ytick,
                      location_to_store + 'sent_imshow_over.pdf')
        __plot_imshow(in_sent, 'Participant', 'Week #', xtick, ytick,
                      location_to_store + 'sent_imshow_in.pdf')
        __plot_imshow(out_sent, 'Participant', 'Week #', xtick, ytick,
                      location_to_store + 'sent_imshow_out.pdf')

    print '***SMS***'
    print 'P: ', len(pid_dict_sms['participants'].values()), ' NP: ', len(pid_dict_sms['nonparticipants'].values())
    print '***FB***'
    print 'P: ', len(pid_dict_fb['participants'].values()), ' NP: ', len(pid_dict_fb['nonparticipants'].values())
    print '***OVERALL***'
    print 'P: ', len(pid_dict['participants'].values()), ' NP: ', len(pid_dict['nonparticipants'].values())

    summary_src_trg = summarize_message_by_src_trg(labelled_data)
    print '***Message Distribution***'
    for m_type_1 in summary_src_trg:
        print m_type_1, summary_src_trg[m_type_1]

    if survey_file is not None:
        week_list = weekly_data.keys()
        week_list.sort()
        # this is not good, as there aren't enough triads
        months = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16], [17, 18, 19, 20],
                  [21, 22, 23, 24, 25]]
        # this has at least 8 triads, always, use this
        months2 = [[1, 2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20, 21, 22, 23, 24, 25]]
        month_idx = 1
        for month in months2:
            labelled_data = []
            for week in month:
                labelled_data.extend(weekly_data[week])
            general_graph, random_graph = conduct_triad_analysis(labelled_data, pid_dict)
            frac_triad = general_graph[3]
            summary_triad = general_graph[2]
            frac_triad_rand = random_graph[3]
            summary_triad_rand = random_graph[2]
            print '** Months ', 2 * month_idx - 1, 2 * month_idx, ': ', month, ' ***'
            print 'len(LD): ', len(labelled_data)
            for summary in frac_triad:
                print summary, 'Study: ', frac_triad[summary], '(', len(summary_triad[summary]), ')', \
                    ' Random: ', frac_triad_rand[summary], '(', len(summary_triad_rand[summary]), ')'
            words_list, short_list = word_count(labelled_data)
            toWrite_wl_csv = create_word_count_csv(words_list)
            hlp.writecsv(toWrite_wl_csv,
                         location_to_store + 'word_list_' + str(2 * month_idx - 1) + '-' +
                         str(2 * month_idx) + '.csv', delimiter_sym=',')
            for mtype in words_list:
                counted_words = Counter(words_list[mtype])
                counted_short = Counter(short_list[mtype])
                print '***For ' + mtype + ' ***'
                print 'Top 20 words: ', __get_top_word_sentiment(counted_words.most_common(20))
                print 'Top 20 short: ', counted_short.most_common(20)
                print '\n\n'
            hlp.dumpvariable([general_graph, random_graph, labelled_data, pid_dict],
                             'month_' + str(month_idx) + '.list', location_to_store)
            month_idx += 1
    else:
        print 'len(LD): ', len(labelled_data)
        words_list, short_list = word_count(labelled_data)
        toWrite_wl_csv = create_word_count_csv(words_list)
        hlp.writecsv(toWrite_wl_csv, location_to_store + 'word_list.csv', delimiter_sym=',')
        for mtype in words_list:
            counted_words = Counter(words_list[mtype])
            counted_short = Counter(short_list[mtype])
            print '***For ' + mtype + ' ***'
            print 'Top 20 words: ', __get_top_word_sentiment(counted_words.most_common(20))
            print 'Top 20 short: ', counted_short.most_common(20)
            print '\n\n'
        general_graph, random_graph = conduct_triad_analysis(labelled_data, pid_dict)
        frac_triad = general_graph[3]
        summary_triad = general_graph[2]
        frac_triad_rand = random_graph[3]
        summary_triad_rand = random_graph[2]
        for summary in frac_triad:
            print summary, 'Study: ', frac_triad[summary], '(', len(summary_triad[summary]), ')', \
                ' Random: ', frac_triad_rand[summary], '(', len(summary_triad_rand[summary]), ')'
        hlp.dumpvariable([general_graph, random_graph, labelled_data, pid_dict], 'Overall.list',
                         location_to_store)

    pos, neg, neu = get_polarity_directionality(labelled_data)
    print '***Polarity Distribution***'
    print 'Positive: \n', pos
    print 'Negative: \n', neg
    print 'Neutral: \n', neu

    in_m, out_m, in_d, out_d = get_count_degrees_messages_directed(labelled_data, pid_dict['participants'])
    print '***Incoming Messages***'
    print 'Total: ', sum(in_m), 'Mean: ', np.mean(in_m), 'Std. dev.: ', np.std(in_m)
    print '***Outgoing Messages***'
    print 'Total: ', sum(out_m), 'Mean: ', np.mean(out_m), 'Std. dev.: ', np.std(out_m)
    print '***In Degree***'
    print 'Total: ', sum(in_d), 'Mean: ', np.mean(in_d), 'Std. dev.: ', np.std(in_d)
    print '***Out Degree***'
    print 'Total: ', sum(out_d), 'Mean: ', np.mean(out_d), 'Std. dev.: ', np.std(out_d)

    print '***COUNTS***'
    plot_messages_degree([in_m, out_m], '# of Messages', 'Cumulative Participant Prob.',
                         location_to_store + 'in_out_messages.pdf')
    plot_messages_degree([in_d, out_d], 'Degree', 'Cumulative Participant Prob.',
                         location_to_store + 'in_out_degree.pdf', True)
    print 'TADAA!!'