def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '-M', type=str, required=True, help='Message file')
    parser.add_argument('-p', '-P', type=str, required=True, help='pid_dict')
    parser.add_argument('-o', '-O', type=str, required=True, help='output folder')
    parser.add_argument('-of', '-OF', type=str, required=True, help='output file')
    args = parser.parse_args()

    message_data = hlp.recovervariable(args.m)
    pid_dict = hlp.recovervariable(args.p)
    output_folder = args.o
    output_file = args.of

    # count messages per edge and write the edge list plus the PID mapping
    edge_dict = getedgecount(message_data, pid_dict)
    edge_list = converttolist(edge_dict)
    hlp.writecsv([['source', 'target', 'pos', 'neu', 'neg']] + edge_list,
                 output_folder + output_file)
    hlp.writecsv([['PID', 'Coded ID']] + convertpiddicttolist(pid_dict),
                 output_folder + 'pid_dict_list.csv')
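# convertpiddicttolist is not shown in this file. A minimal hypothetical
# sketch, assuming pid_dict maps each PID to its coded ID, matching the
# ['PID', 'Coded ID'] header written above.
def convertpiddicttolist(pid_dict):
    return [[pid, coded_id] for pid, coded_id in pid_dict.items()]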
def main():
    # positional arguments: messages file, pid_dict file, week_dict file, message type
    messages = hlp.recovervariable(sys.argv[1])
    pid_dict = hlp.recovervariable(sys.argv[2])
    week_dict = hlp.recovervariable(sys.argv[3])
    m_type = sys.argv[4]

    participants = pid_dict[pr.participant[m_type]]
    non_participants = pid_dict[pr.nparticipant[m_type]]

    # build one graph object per week and persist the collection
    graph_objs = weekgraphs(week_dict, participants, non_participants)
    hlp.dumpvariable(graph_objs, 'week_graph_objs')
def main():
    parser = argparse.ArgumentParser('Script to generate statistics about message types')
    # add arguments
    parser.add_argument('-d', '-D', type=str, required=True, help='location of file to work with')
    parser.add_argument('-s', '-S', type=str, required=True, help='folder to store the results, ending with /')
    parser.add_argument('-f', '-F', type=str, required=True, help='filename to store data in')
    parser.add_argument('-w', '-W', type=int, default=0,
                        help='integer threshold for classifying a week as missing (default 0)')
    # get arguments
    args = parser.parse_args()
    filename = args.d
    threshold_missing = args.w
    location_to_store = args.s
    filepath = args.f

    data = hlp.recovervariable(filename)
    missing_week_dict, per_week_msgs = hlp.missingweeks(data, threshold_value=threshold_missing)
    flipped_dict = flipdict(missing_week_dict)
    printsummary(missing_week_dict,
                 'No. of participants with less than ' + str(threshold_missing) + ' data points in ',
                 len(data.keys()), per_week_msgs)
    hlp.dumpvariable(missing_week_dict, filepath, location_to_store)
    hlp.dumpvariable(flipped_dict, 'flipped_' + filepath, location_to_store)
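# flipdict is defined elsewhere. A hedged sketch under the assumption that
# missing_week_dict maps week_no -> (pids missing incoming, pids missing
# outgoing) and that the flipped form counts, per PID, the weeks below the
# threshold in each direction, matching the flipped_dict[pid][0]/[1] usage
# in the filtering script below.
def flipdict(missing_week_dict):
    flipped = {}
    for week_no, (in_pids, out_pids) in missing_week_dict.items():
        for pid in in_pids:
            flipped.setdefault(pid, [0, 0])[0] += 1
        for pid in out_pids:
            flipped.setdefault(pid, [0, 0])[1] += 1
    return flipped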
def main():
    parser = argparse.ArgumentParser('Filter out participants whose number of '
                                     'zero-communication weeks meets or exceeds the thresholds')
    parser.add_argument('-f', '-F', type=str, required=True)
    parser.add_argument('-ti', '-TI', type=int, required=True, help='Incoming threshold')
    parser.add_argument('-to', '-TO', type=int, required=True, help='Outgoing threshold')
    parser.add_argument('-s', '-S', type=str, required=True, help='storage folder, ending with /')
    parser.add_argument('-sf', '-SF', type=str, required=True, help='file name for storage')
    args = parser.parse_args()

    flipped_dict = hlp.recovervariable(args.f)
    incoming_th = args.ti
    outgoing_th = args.to
    location_to_store = args.s
    filename = args.sf

    # flag participants whose missing-week counts meet both thresholds
    to_remove = []
    for pid in flipped_dict:
        if flipped_dict[pid][0] >= incoming_th and flipped_dict[pid][1] >= outgoing_th:
            to_remove.append(pid)
            print 'REMOVED: ', pid, flipped_dict[pid]
        else:
            print 'NOT REMOVED: ', pid, flipped_dict[pid]
    print 'Removed ', len(to_remove), ' out of a total of ', len(flipped_dict.keys()), 'participants'
    hlp.dumpvariable(to_remove, filename, location_to_store)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '-M', type=str, required=True)
    parser.add_argument('-l', '-L', type=str, required=True)
    parser.add_argument('-o', '-O', type=str, required=False)
    args = parser.parse_args()

    message_data = hlp.recovervariable(args.m)
    lexicon_file = args.l
    output_file = args.o

    with open(lexicon_file, 'r') as f:
        lexicon_data = f.readlines()

    pct_words_covered, words_not_present, common_words = get_effective_coverage(lexicon_data, message_data)
    print 'pct words covered by vader: ', pct_words_covered
    print 'words not present: ', words_not_present

    # optionally write the uncovered words, one per line
    if output_file is not None:
        with open(output_file, 'w') as f:
            f.write(''.join(word + '\n' for word in words_not_present))
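# get_effective_coverage is implemented elsewhere; a hedged sketch of what it
# is assumed to compute. Assumptions: the VADER lexicon file is tab-separated
# with the token in the first column, and MSG_TEXT_IDX (hypothetical) is the
# column holding the message body in each row of message_data.
MSG_TEXT_IDX = 5  # hypothetical column index of the message body

def get_effective_coverage(lexicon_data, message_data):
    lexicon_words = set(line.split('\t')[0] for line in lexicon_data)
    message_words = set()
    for row in message_data:
        message_words.update(row[MSG_TEXT_IDX].lower().split())
    if not message_words:
        return 0.0, set(), set()
    common_words = message_words & lexicon_words
    words_not_present = message_words - lexicon_words
    pct_words_covered = len(common_words) / float(len(message_words))
    return pct_words_covered, words_not_present, common_words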
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--messageFile', type=str, required=True)
    parser.add_argument('-mt', '--messageTypes', type=str, nargs='+')
    parser.add_argument('-o', '--outputFolder', type=str, required=True)
    parser.add_argument('-of', '--outputFile', type=str, required=True)
    parser.add_argument('-pd', '--participantDictionary', type=str)
    parser.add_argument('-i', '--ignoreParticipants', type=str)
    parser.add_argument('-mc', '--messageTypeConvert', type=str, nargs='*')
    args = parser.parse_args()

    message_file = args.messageFile
    message_types = args.messageTypes
    output_folder = args.outputFolder
    output_file = args.outputFile
    pid_dict = args.participantDictionary
    ignore_pids = args.ignoreParticipants
    message_type_conversions = args.messageTypeConvert

    ff = filterfields(message_file)
    ff.setdata(ff.getdata()[1:])  # drop the header row
    to_set_data = []

    # extract the relevant data
    for message_type in message_types:
        to_set_data.extend(ff.filterbyequality(pr.m_type, message_type))
    ff.setdata(to_set_data)

    # drop messages involving ignored participants
    if ignore_pids is not None:
        ignore_pids = hlp.recovervariable(ignore_pids)
        for pid in ignore_pids:
            ff.removebyequality(pr.m_source, pid)
            ff.removebyequality(pr.m_target, pid)

    # set the pid to normal id dictionary
    if pid_dict is None:
        pid_dict = hlp.getuniqueparticipants(ff.getdata(), mtype='all', separate_pid_npid=True)
    else:
        # a path was given on the command line, so load the stored dictionary
        pid_dict = hlp.recovervariable(pid_dict)

    # replace the message type names with the ones provided
    if message_type_conversions is not None:
        for idx in range(0, len(message_type_conversions), 2):
            message_to_convert = message_type_conversions[idx]
            to_convert_to = message_type_conversions[idx + 1]
            ff.replacebyequality(pr.m_type, message_to_convert, to_convert_to)

    message_types = ff.getuniqueelements(pr.m_type)
    coded_participant_list = pid_dict[pr.participant['all']].values()
    storage_dict = initiatestorage(coded_participant_list, message_types)
    storage_dict = getperparticipantinout(ff.getdata(), storage_dict, pid_dict)
    plotperparticipantbar(storage_dict, 'Participant ID', '# of Messages', message_types,
                          'Per Participant Messages', output_folder + output_file)
    hlp.dumpvariable(pid_dict, 'pid_dict.dict', output_folder)
    hlp.dumpvariable(ff.getdata(), 'messageData.list', output_folder)
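# initiatestorage is not shown here; a minimal sketch, assuming it builds the
# per-participant counters that getperparticipantinout later increments:
# one zeroed bucket per message type for every coded participant ID.
def initiatestorage(coded_participant_list, message_types):
    return {coded_id: {m_type: 0 for m_type in message_types}
            for coded_id in coded_participant_list}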
def filtersurvey(dict_path, qno, answers, is_data=False):
    # dict_path may be a path to a stored dict or the data itself
    data = dict_path if is_data else hlp.recovervariable(dict_path)
    survey_obj = surveystats(data)
    if answers is None:
        res = survey_obj.processdict(sInfo.surveyQType[qno])
    else:
        # compute stats separately for each requested answer
        res = {}
        for ans in answers:
            res[ans] = survey_obj.processdict(sInfo.surveyQType[qno], ans)
    return res
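# Example calls (hypothetical file name, question number, and answer codes):
# stats restricted to the answers '1' and '2' for question 4,
#   res = filtersurvey('survey.dict', 4, ['1', '2'])
# and over all answers,
#   res_all = filtersurvey('survey.dict', 4, None)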
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '-D', required=True, help='labelled data from validate_balance_theory.py')
    parser.add_argument('-f', '-F', required=True, help='folder to save the data in')
    parser.add_argument('-w', '-W', required=False, help='survey file for weekly data processing')
    args = parser.parse_args()

    data_file = args.d
    location_to_store = args.f
    weekly_surveys = args.w

    all_data = hlp.recovervariable(data_file)
    labelled_data = all_data[2]
    pid_dict = all_data[3]

    if weekly_surveys is None:
        # overall analysis across the full study period
        reciprocity_info, polarity_info = individual_reciprocity_analysis(
            labelled_data, pid_dict['participants'], location_to_store)
        analyze_info(reciprocity_info, pid_dict, location_to_store, 'pr_overall.csv')
        analyze_polarity(polarity_info, pid_dict, location_to_store, 'polarity_overall.csv')
        hlp.dumpvariable([reciprocity_info, labelled_data, pid_dict, polarity_info],
                         'reciprocity_info_overall.dict', location_to_store)
    else:
        # working with bimonthly data
        months2 = [[1, 2, 3, 4, 5, 6, 7, 8],
                   [9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20, 21, 22, 23, 24, 25]]
        wi = weeklyinfo()
        weekly_info = wi.getweeklyfo(weekly_surveys)
        ff = filterfields()
        weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff)

        idx = 1
        for bi_month in months2:
            print 'For weeks: ', bi_month
            bi_month_data = []
            for weekno in bi_month:
                bi_month_data.extend(weekly_data[weekno])
            reciprocity_info, polarity_info = individual_reciprocity_analysis(
                bi_month_data, pid_dict['participants'], location_to_store)
            analyze_info(reciprocity_info, pid_dict, location_to_store,
                         'pr_bimonthly_' + str(idx) + '.csv')
            analyze_polarity(polarity_info, pid_dict, location_to_store,
                             'polarity_bimonthly_' + str(idx) + '.csv')
            hlp.dumpvariable([reciprocity_info, labelled_data, pid_dict, polarity_info],
                             'reciprocity_info_bimonthly_' + str(idx) + '.data', location_to_store)
            idx += 1
    print 'tadaa!'
def main():
    parse = argparse.ArgumentParser('Script to create plots of graph statistics')
    parse.add_argument('-i', '-I', type=str, required=True, help='path to graph statistics data')
    parse.add_argument('-o', '-O', type=str, required=True,
                       help='directory to store the generated graphs, without trailing /')
    parse.add_argument('-f', '-F', type=str, default='mean',
                       help='function to use; currently supports everything in the statistics package')
    args = parse.parse_args()

    ip_file = args.i
    op_dir = args.o
    # look the function up by name instead of calling eval on user input
    func = getattr(statistics, args.f)

    data = hlp.recovervariable(ip_file)
    dpath = op_dir + '_' + args.f + '/'
    if not os.path.exists(dpath):
        os.mkdir(dpath)
    for ans in data.keys():
        print 'DK:', ans
        plotindividual(data[ans], func, dpath)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '-D', required=True, help='labelled data from validate_balance_theory.py')
    parser.add_argument('-f', '-F', required=True, help='folder to save the data in')
    parser.add_argument('-w', '-W', required=False, help='survey file for weekly data processing')
    args = parser.parse_args()

    data_file = args.d
    location_to_store = args.f
    weekly_surveys = args.w

    all_data = hlp.recovervariable(data_file)
    labelled_data = all_data[2]
    pid_dict = all_data[3]

    if weekly_surveys is None:
        reciprocity_dict, message_pairs = find_reciprocity(labelled_data, location_to_store)
        hlp.dumpvariable([reciprocity_dict, message_pairs],
                         'reciprocity_counts_msgPairs_overall', location_to_store)
    else:
        # working with bimonthly data
        months2 = [[1, 2, 3, 4, 5, 6, 7, 8],
                   [9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20, 21, 22, 23, 24, 25]]
        wi = weeklyinfo()
        weekly_info = wi.getweeklyfo(weekly_surveys)
        ff = filterfields()
        weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff)

        idx = 1
        for bi_month in months2:
            print 'For weeks: ', bi_month
            bi_month_data = []
            for weekno in bi_month:
                bi_month_data.extend(weekly_data[weekno])
            reciprocity_dict, message_pairs = find_reciprocity(bi_month_data, location_to_store)
            hlp.dumpvariable([reciprocity_dict, message_pairs],
                             'reciprocity_counts_msgPairs_bimonthly_' + str(idx) + '.data',
                             location_to_store)
            idx += 1  # advance the bimonthly index so output files are not overwritten
def main():
    parse = argparse.ArgumentParser('Script to generate statistics on bullying data')
    parse.add_argument('-i', '-I', type=str, required=True,
                       help='Path to the input dictionary containing bullying information')
    parse.add_argument('-m', '-M', type=str, required=True,
                       help='Path to the messages file, should be a csv')
    parse.add_argument('-s', '-S', type=str, required=True,
                       help='Directory where results are stored, with a trailing /')
    parse.add_argument('-f', '-F', type=str, required=True, help='File name')
    parse.add_argument('-p', '-P', type=str, required=True, help='Participant type')
    args = parse.parse_args()

    bullying_data = hlp.recovervariable(args.i)
    message_path = args.m
    save_dir = args.s
    save_f = args.f
    p_type = args.p

    # compute statistics for each key in the bullying dictionary
    res = {}
    for key in bullying_data.keys():
        res[key] = getstats(message_path, bullying_data[key], p_type)
    hlp.dumpvariable(res, save_f, save_dir)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '-F', type=str, required=True, help='weekly dict')
    parser.add_argument('-p', '-P', type=str, required=True, help='pid')
    parser.add_argument('-w', '-W', type=int, nargs='+', help='list of weeks')
    parser.add_argument('-o', '-O', type=str, help='folder to store the output')
    parser.add_argument('-s', '-S', action='store_true',
                        help='separate out the incoming and outgoing messages')
    parser.add_argument('-io', type=str)
    args = parser.parse_args()

    week_dict_file = args.f
    pid = args.p
    weeks = args.w
    location_to_store = args.o
    separate_in_out = args.s
    show_in_out = args.io

    week_data_dict = hlp.recovervariable(week_dict_file)
    participant_data = {pid: {}}
    for week_no in weeks:
        reduced_data = getspecificdata(week_data_dict, pid, week_no, separate_in_out)
        if reduced_data is None:
            print 'No data found, or some error occurred...'
            continue
        participant_data[pid] = reduced_data
        print '\n\n\n\n\nData summary for PID:', pid, ' week_no: ', week_no
        printmessages(reduced_data, separate_in_out, show_in_out)
    if location_to_store is not None:
        hlp.dumpvariable(participant_data, pid + '.data', location_to_store)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '-M', type=str, required=True, help='Message list file')
    parser.add_argument('-r', '-R', type=str, required=True, help='survey file')
    parser.add_argument('-p', '-P', type=str, required=True, help='PID dict inverted')
    parser.add_argument('-b', '-B', type=str, required=True, help='bullying dictionary')
    parser.add_argument('-o', '-O', type=str, required=True, help='Output folder')
    parser.add_argument('-l', '-L', type=str, nargs='+', help='Filters chosen')
    parser.add_argument('-f', '-F', type=str, nargs='+', help='Filter files')
    args = parser.parse_args()

    output_folder = args.o
    message_data = hlp.recovervariable(args.m)
    pid_dict = hlp.recovervariable(args.p)
    filters_chosen = args.l
    filter_files = args.f

    catch_all_data = hlp.getfilterdata(filters_chosen, filter_files, catch_all=True)
    wi = weeklyinfo()
    weekly_info = wi.getweeklyfo(args.r)
    ff = filterfields()
    gh = ghelper()
    bullying_overlay = gh.createbullyingoverlay(catch_all_data, weekly_info, ff)
    bullying_overlay = flip_bullying_overlay(bullying_overlay, weekly_info.keys())

    pid_list = pid_dict.keys()
    pid_list.sort()

    # leave-one-participant-out: each PID in turn becomes the test set
    for pid in pid_list:
        training_set_final = []
        testing_set_final = []
        pid_list_training = deepcopy(pid_list)
        pid_list_training.remove(pid)

        # split raw messages into the held-out participant and everyone else
        ff.setdata(message_data)
        testing_raw_data = ff.filterbyequality(pr.m_source, pid_dict[pid]) + \
                           ff.filterbyequality(pr.m_target, pid_dict[pid])
        ff.removebyequality(pr.m_source, pid_dict[pid])
        ff.removebyequality(pr.m_target, pid_dict[pid])
        training_raw_data = ff.getdata()

        # fit the scoring factors on training data only
        fe = raw_features(data=None)
        _, _ = fe.get_scoring_factors(training_raw_data)

        training_weekly_data = {}
        for training_pid in pid_list_training:
            training_weekly_data[training_pid] = {}
            data_to_use = ff.filterbyequality(pr.m_source, pid_dict[training_pid]) + \
                          ff.filterbyequality(pr.m_target, pid_dict[training_pid])
            if 0 == len(data_to_use):
                print 'no data found, probably filtered into the testing set, Training PID: ' + \
                      training_pid + ', Testing PID: ' + pid
                continue
            pid_weekly_w_bullying, global_in_degree, global_out_degree, global_in_ew, global_out_ew, \
                incoming_ss, outgoing_ss = get_pid_level_features(data_to_use, weekly_info, ff,
                                                                  bullying_overlay, pid_dict,
                                                                  training_pid, fe)
            for week_no in pid_weekly_w_bullying:
                fr_in_degree, fr_out_degree, fr_in_ew, \
                    fr_out_ew, fr_in_senti, fr_out_senti, \
                    current_in_ss, current_out_ss = get_week_features(pid_weekly_w_bullying, week_no, fe,
                                                                      global_in_degree, global_out_degree,
                                                                      global_in_ew, global_out_ew,
                                                                      incoming_ss, outgoing_ss,
                                                                      pid_dict[training_pid])
                training_set_final.append(
                    [training_pid, week_no,
                     fr_in_senti[0], fr_in_senti[1], fr_in_senti[2],
                     fr_out_senti[0], fr_out_senti[1], fr_out_senti[2],
                     fr_in_degree, fr_out_degree, fr_in_ew, fr_out_ew,
                     current_in_ss, current_out_ss,
                     pid_weekly_w_bullying[week_no]['label']])

        # testing pid
        pid_weekly_w_bullying, global_in_degree, global_out_degree, \
            global_in_ew, global_out_ew, incoming_ss, outgoing_ss = \
            get_pid_level_features(testing_raw_data, weekly_info, ff,
                                   bullying_overlay, pid_dict, pid, fe)
        for week_no in pid_weekly_w_bullying:
            fr_in_degree, fr_out_degree, fr_in_ew, \
                fr_out_ew, fr_in_senti, fr_out_senti, \
                current_in_ss, current_out_ss = get_week_features(pid_weekly_w_bullying, week_no, fe,
                                                                  global_in_degree, global_out_degree,
                                                                  global_in_ew, global_out_ew,
                                                                  incoming_ss, outgoing_ss, pid_dict[pid])
            testing_set_final.append(
                [pid, week_no,
                 fr_in_senti[0], fr_in_senti[1], fr_in_senti[2],
                 fr_out_senti[0], fr_out_senti[1], fr_out_senti[2],
                 fr_in_degree, fr_out_degree, fr_in_ew, fr_out_ew,
                 current_in_ss, current_out_ss,
                 pid_weekly_w_bullying[week_no]['label']])

        # write the per-participant train/test splits
        header = ['pid', 'wkno', 'frWInSenPos', 'frWInSenNeu', 'frWInSenNeg',
                  'frWOutSenPos', 'frWOutSenNeu', 'frWOutSenNeg',
                  'frInDegO', 'frOutDegO', 'frInEdgeO', 'frOutEdgeO',
                  'inSenSc', 'outSenSc', 'label']
        training_set_final = [header] + training_set_final
        testing_set_final = [header] + testing_set_final
        hlp.writecsv(training_set_final, output_folder + pid + '_tr.csv')
        hlp.writecsv(testing_set_final, output_folder + pid + '_ts.csv')
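# Hedged usage sketch: the <pid>_tr.csv / <pid>_ts.csv pairs written above
# implement leave-one-participant-out evaluation. A downstream consumer might
# split features from labels like this, using the header layout written above
# and assuming the feature columns are numeric.
import csv

def load_split(path):
    with open(path) as f:
        rows = list(csv.reader(f))
    header, body = rows[0], rows[1:]
    features = [[float(v) for v in r[2:-1]] for r in body]  # drop pid, wkno, label
    labels = [r[-1] for r in body]
    return features, labels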
def __init__(self, folder_to_look='./'):
    # load the word corpus and precompute frequency and lookup structures
    self.word_corpus = hlp.recovervariable(folder_to_look + 'all_words.list')
    self.total_len = len(self.word_corpus) + 0.0  # float, for later division
    self.word_freq = self.word_frequency(self.word_corpus)
    self.kb_neighbor = self.__keyboard_neighborhood()
    self.len_dict = self.__create_word_len_dict()
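# word_frequency is implemented elsewhere in the class; a minimal sketch,
# shown as a standalone function, assuming it maps each word to its relative
# frequency in the corpus.
from collections import Counter

def word_frequency(word_corpus):
    counts = Counter(word_corpus)
    total = float(len(word_corpus))
    return {word: n / total for word, n in counts.items()}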