Example #1
def main():
    parser = argparse.ArgumentParser('Filter out people whose number of communication weeks meets or exceeds the thresholds')

    parser.add_argument('-f', '-F', type=str, required=True)
    parser.add_argument('-ti', '-TI', type=int, required=True,
                        help='Incoming threshold')
    parser.add_argument('-to', '-TO', type=int, required=True,
                        help='Outgoing threshold')
    parser.add_argument('-s', '-S', type=str, required=True,
                        help='storage folder, ending with /')
    parser.add_argument('-sf', '-SF', type=str, required=True,
                        help='file name for storage')

    args = parser.parse_args()

    flipped_dict = hlp.recovervariable(args.f)
    incoming_th = args.ti
    outgoing_th = args.to
    location_to_store = args.s
    filename = args.sf

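    # a participant is flagged for removal only when BOTH weekly counts meet
    # their thresholds; flipped_dict[pid] is assumed to hold
    # [incoming_week_count, outgoing_week_count]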
    to_remove = []
    for pid in flipped_dict:
        if flipped_dict[pid][0] >= incoming_th and flipped_dict[pid][1] >= outgoing_th:
            to_remove.append(pid)
            print 'REMOVED: ', pid, flipped_dict[pid]
        else:
            print 'NOT REMOVED: ', pid, flipped_dict[pid]

    print 'Removed ', len(to_remove), ' out of a total of ', len(flipped_dict), ' participants'

    hlp.dumpvariable(to_remove, filename, location_to_store)
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-m', '-M', required=True,
                        help='Message file')
    parser.add_argument('-p', '-P', action='store_true')
    parser.add_argument('-s', '-S', required=True,
                        help='filename to store polarity in, no extension needed')
    parser.add_argument('-f', '-F', required=True,
                        help='folder to store the files in, ending with /')
    parser.add_argument('-n', '-N', required=False, nargs=2, type=int, default=[0, 2],
                        help='the neutral threshold, first value is min, second is max')

    args = parser.parse_args()

    messagefile = args.m
    location_to_store = args.f
    file_to_store = args.s
    separate_sentiment = args.p
    neutral_limit = args.n


    message_data = hlp.readcsv(messagefile)
    message_header = message_data[0]
    message_data = message_data[1:]

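    # AFINN scoring; neutral_limit gives the [min, max] band of scores that
    # afinnsenti presumably labels neutral (see the -n help text above)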
    afinn = afinnsenti(data=message_data, neutral_threshold=neutral_limit)
    data = afinn.compilesentiment(separate_sentiment_list=separate_sentiment, field_no=nd.m_content)
    if separate_sentiment:
        hlp.dumpvariable(data, file_to_store+'.list', location_to_store)
    else:
        message_header.append('score')
        message_header.append('label')
        final_data = [message_header] + data
        hlp.writecsv(final_data, location_to_store + file_to_store + '.csv', delimiter_sym=',')
def individual_reciprocity_analysis(labelled_data, pid_dict, location_to_store):
    reciprocity_info = {}
    ff = filterfields()
    ff.setdata(labelled_data)
    polarity_data = {}
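    # for each message the participant sent, find the closest incoming
    # message as its reply and tally sent-label -> reply-label counts
    # ('X' = no reply found)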
    for pid in pid_dict:
        print 'Working with PID: ', pid, '(', pid_dict[pid], ')'
        messages_by_participant = ff.filterbyequality(pr.m_source, pid)
        messages_to_participant = ff.filterbyequality(pr.m_target, pid)
        polarity_data[pid] = __get_polarity_composition(messages_by_participant+messages_to_participant, pid)
        reciprocity_info[pid] = {}
        n = len(messages_by_participant)
        idx = 0
        for message in messages_by_participant:
            print 'idx=' + str(idx) + '/' + str(n)
            idx += 1
            closest_message = find_closest_message(message, messages_to_participant, ff)
            target_type = 'P' if message[pr.m_target_type] == 'participant' else 'NP'
            target = message[pr.m_target]
            if target_type not in reciprocity_info[pid]:
                reciprocity_info[pid][target_type] = {}
            if target not in reciprocity_info[pid][target_type]:
                reciprocity_info[pid][target_type][target] = __basic_reciprocity_dict()
            sent_message_type = message[-1]
            reply_message_type = 'X' if closest_message is None else closest_message[-1]
            reciprocity_info[pid][target_type][target][sent_message_type][reply_message_type] += 1
        print 'saving checkpoint...'
        hlp.dumpvariable([reciprocity_info, pid, pid_dict], 'checkpoint.chp', location_to_store)
        print 'saved!'
    return reciprocity_info, polarity_data
def find_reciprocity(labelled_data, location_to_store):
    ff = filterfields()
    ff.setdata(labelled_data)
    messages_sent_by_participants = ff.filterbyequality(pr.m_source_type, 'participant')
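    # reciprocity_dict[sent_label][reply_label]: P/N/U are the sentiment
    # labels of a sent message and its closest reply; 'X' counts messages
    # that never received a reply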
    reciprocity_dict = {'P': {'P': 0, 'U': 0, 'N': 0, 'X': 0},
                        'N': {'P': 0, 'U': 0, 'N': 0, 'X': 0},
                        'U': {'P': 0, 'U': 0, 'N': 0, 'X': 0}}
    n = len(messages_sent_by_participants)
    idx = 1
    message_pairs = []
    for message in messages_sent_by_participants:
        print 'at message ', idx, ' of ', n
        idx += 1
        reply_message = find_closest_message(message, ff)
        sent_message_type = message[-1]
        if reply_message is None:
            reply_message_type = 'X'
        else:
            reply_message_type = reply_message[-1]
        reciprocity_dict[sent_message_type][reply_message_type] += 1
        message_pairs.append((message, reply_message))
        if idx % 500 == 0:
            print 'saving...'
            hlp.dumpvariable([idx, reciprocity_dict, message_pairs, messages_sent_by_participants],
                             'checkpoint.chp', location_to_store)
    print 'done... out of the loop'
    to_use = {'P': '+', 'N': '-', 'U': 'u', 'X': 'null'}
    for sent_type in reciprocity_dict:
        recvd_types = reciprocity_dict[sent_type]
        for recvd_type in recvd_types:
            print 'N('+to_use[recvd_type]+'|'+to_use[sent_type]+')=', recvd_types[recvd_type]

    return reciprocity_dict, message_pairs
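
# a minimal usage sketch (assumes each row of labelled_data carries a trailing
# P/N/U sentiment label, as produced by the labelling scripts in this
# collection; the path is hypothetical):
#
#   all_data = hlp.recovervariable('<path-to-labelled.data>')
#   counts, pairs = find_reciprocity(all_data[2], './results/')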
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-m', '-M', required=True,
                        help='Message file')
    parser.add_argument('-p', '-P', action='store_true')
    parser.add_argument('-s', '-S', required=True,
                        help='filename to store polarity in, no extension needed')
    parser.add_argument('-f', '-F', required=True,
                        help='folder to store the files in, ending with /')

    args = parser.parse_args()
    messagefile = args.m
    location_to_store = args.f
    file_to_store = args.s
    separate_sentiment = args.p

    message_data = hlp.readcsv(messagefile)
    message_header = message_data[0]
    message_data = message_data[1:]

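    # VADER produces four scores per message: pos, neg, neu, and compound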
    vader = vadersenti(data=message_data)
    data = vader.compilesentiment(separate_sentiment_list=separate_sentiment)
    if separate_sentiment:
        hlp.dumpvariable(data, file_to_store+'.list', location_to_store)
    else:
        message_header.append('pos')
        message_header.append('neg')
        message_header.append('neu')
        message_header.append('compound')
        final_data = [message_header] + data
        hlp.writecsv(final_data, location_to_store + file_to_store + '.csv', delimiter_sym=',')
def main():
    parser = argparse.ArgumentParser('Script to generate statistics about message types')

    # add arguments
    parser.add_argument('-d', '-D', type=str, required=True,
                        help='location of file to work with')
    parser.add_argument('-s', '-S', type=str, required=True,
                        help='folder to store the results, ending with /')
    parser.add_argument('-f', '-F', type=str, required=True,
                        help='filename to store data in')
    parser.add_argument('-w', '-W', type=int, default=0,
                        help='what threshold to classify missing, default 0, Integer value needed')

    # get arguments
    args = parser.parse_args()
    filename = args.d
    threshold_missing = args.w
    location_to_store = args.s
    filepath = args.f

    data = hlp.recovervariable(filename)

    missing_week_dict, per_week_msgs = hlp.missingweeks(data, threshold_value=threshold_missing)
    flipped_dict = flipdict(missing_week_dict)
    printsummary(missing_week_dict, 'No. of participants with less than '+
                 str(threshold_missing)+' data points in ', len(data.keys()), per_week_msgs)
    hlp.dumpvariable(missing_week_dict, filepath, location_to_store)
    hlp.dumpvariable(flipped_dict, 'flipped_'+filepath, location_to_store)
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-f', '--messageFile', type=str, required=True)
    parser.add_argument('-mt', '--messageTypes', type=str, nargs='+')
    parser.add_argument('-o', '--outputFolder', type=str, required=True)
    parser.add_argument('-of', '--outputFile', type=str, required=True)
    parser.add_argument('-pd', '--participantDictionary', type=str)
    parser.add_argument('-i', '--ignoreParticipants', type=str)
    parser.add_argument('-mc', '--messageTypeConvert', type=str, nargs='*')

    args = parser.parse_args()

    message_file = args.messageFile
    message_types = args.messageTypes
    output_folder = args.outputFolder
    output_file = args.outputFile
    pid_dict = args.participantDictionary
    ignore_pids = args.ignoreParticipants
    message_type_conversions = args.messageTypeConvert

    ff = filterfields(message_file)
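    # drop the csv header row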
    ff.setdata(ff.getdata()[1:])

    to_set_data = []

    # extract the relevant data
    for message_type in message_types:
        to_set_data.extend(ff.filterbyequality(pr.m_type, message_type))

    ff.setdata(to_set_data)

    if ignore_pids is not None:
        ignore_pids = hlp.recovervariable(ignore_pids)
        for pid in ignore_pids:
            ff.removebyequality(pr.m_source, pid)
            ff.removebyequality(pr.m_target, pid)


    # set the pid to normal id dictionary
    if pid_dict is None:
        pid_dict = hlp.getuniqueparticipants(ff.getdata(), mtype='all', separate_pid_npid=True)

    # replace the message type names with the ones provided
    if message_type_conversions is not None:
        for idx in range(0, len(message_type_conversions), 2):
            message_to_convert = message_type_conversions[idx]
            to_convert_to = message_type_conversions[idx+1]
            ff.replacebyequality(pr.m_type, message_to_convert, to_convert_to)

    message_types = ff.getuniqueelements(pr.m_type)
    coded_participant_list = pid_dict[pr.participant['all']].values()
    storage_dict = initiatestorage(coded_participant_list, message_types)
    storage_dict = getperparticipantinout(ff.getdata(), storage_dict, pid_dict)
    plotperparticipantbar(storage_dict, 'Participant ID', '# of Messages', message_types, 'Per Participant Messages',
                          output_folder+output_file)
    hlp.dumpvariable(pid_dict, 'pid_dict.dict', output_folder)
    hlp.dumpvariable(ff.getdata(), 'messageData.list', output_folder)
def main():
    messages = hlp.recovervariable(sys.argv[1])
    pid_dict = hlp.recovervariable(sys.argv[2])
    week_dict = hlp.recovervariable(sys.argv[3])
    m_type = sys.argv[4]
    participants = pid_dict[pr.participant[m_type]]
    non_participants = pid_dict[pr.nparticipant[m_type]]
    graph_objs = weekgraphs(week_dict, participants, non_participants)
    hlp.dumpvariable(graph_objs, 'week_graph_objs')
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-w', '-W', type=str, required=True)
    parser.add_argument('-o', '-O', type=str, required=True)

    args = parser.parse_args()

    word_file = args.w
    output_folder = args.o

    f = open(word_file, 'r')
    d = f.readlines()
    f.close()

    word_list = [x.strip() for x in d]

    cf = canonical_form(word_list)
    words_in_dict, words_not_in_dict = cf.words_in_dict()
    print 'Total word list: ', len(word_list), ' Words present in dict: ', len(words_in_dict), \
        ' Not in dict: ', len(words_not_in_dict)
    to_write_in_dict = ''
    for word in words_in_dict:
        to_write_in_dict += word + '\n'
    cf.set_word_list(words_not_in_dict)
    correct_form, missed_words = cf.get_canonical_form()
    print 'Could not find canonical forms for ', len(missed_words), ' out of a total of ', len(words_not_in_dict)
    to_write_canonical = ''
    to_substitute = {}
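    # each canonical-form line is written as: canonical word followed by the
    # variants it covers; to_substitute maps each variant to its canonical form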
    for right_form, other_values in correct_form.iteritems():
        to_write_canonical += right_form
        for word in other_values:
            to_write_canonical += ' '+word
            to_substitute[word] = right_form
        to_write_canonical += '\n'
    to_write_missed = ''
    for word in missed_words:
        to_write_missed += word + '\n'

    with open(output_folder + 'found_in_dict.txt', 'w') as f:
        f.write(to_write_in_dict)

    with open(output_folder + 'canonical_form.txt', 'w') as f:
        f.write(to_write_canonical)

    with open(output_folder + 'not_found_anywhere.txt', 'w') as f:
        f.write(to_write_missed)

    hlp.dumpvariable(to_substitute, 'substitution_dict.dict', output_folder)

    print 'Done writing...'
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-d', '-D', required=True,
                        help='labelled data from validate_balance_theory.py')
    parser.add_argument('-f', '-F', required=True,
                        help='folder to save the data in')
    parser.add_argument('-w', '-W', required=False,
                        help='survey file for weekly data processing')

    args = parser.parse_args()
    data_file = args.d
    location_to_store = args.f
    weekly_surveys = args.w

    all_data = hlp.recovervariable(data_file)
    labelled_data = all_data[2]
    pid_dict = all_data[3]
    if weekly_surveys is None:
        reciprocity_info, polarity_info = individual_reciprocity_analysis(labelled_data, pid_dict['participants'],
                                                                          location_to_store)
        analyze_info(reciprocity_info, pid_dict, location_to_store, 'pr_overall.csv')
        analyze_polarity(polarity_info, pid_dict, location_to_store, 'polarity_overall.csv')
        hlp.dumpvariable([reciprocity_info, labelled_data, pid_dict, polarity_info],
                         'reciprocity_info_overall.dict', location_to_store)
    else:
        # working with bimonthly data
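        # weeks grouped into three bimonthly buckets: 1-8, 9-16, and 17-25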
        months2 = [[1, 2, 3, 4, 5, 6, 7, 8],
                   [9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20, 21, 22, 23, 24, 25]]
        wi = weeklyinfo()
        weekly_info = wi.getweeklyfo(weekly_surveys)
        ff = filterfields()
        weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff)
        idx = 1
        for bi_month in months2:
            print 'For weeks: ', bi_month
            bi_month_data = []
            for weekno in bi_month:
                bi_month_data.extend(weekly_data[weekno])
            reciprocity_info, polarity_info = individual_reciprocity_analysis(bi_month_data, pid_dict['participants'],
                                                                              location_to_store)
            analyze_info(reciprocity_info, pid_dict, location_to_store, 'pr_bimonthly_'+str(idx)+'.csv')
            analyze_polarity(polarity_info, pid_dict, location_to_store, 'polarity_bimonthly_'+str(idx)+'.csv')
            hlp.dumpvariable([reciprocity_info, labelled_data, pid_dict, polarity_info],
                             'reciprocity_info_bimonthly_'+str(idx)+'.data', location_to_store)
            idx += 1

    print 'tadaa!'
def main():
    parser = argparse.ArgumentParser('Script to generate distribution '
                                     'of edge weights/degrees for all '
                                     'participants')
    parser.add_argument('-m', '-M', type=str, required=True,
                        help='location of the message file')
    parser.add_argument('-mt', '-MT', type=str, default='all',
                        help='types of messages to plot, currently supports '
                             'one of the following: sms, fb, twitter, or all')
    parser.add_argument('-r', '-R', type=str, required=True,
                        help='survey file')
    parser.add_argument('-s', '-S', type=str, required=True,
                        help='folder to store data in, trailing / required')
    parser.add_argument('-p', '-P', action='store_true',
                        help='flag to generate plots')

    args = parser.parse_args()
    survey_file = args.r
    message_file = args.m
    m_type = args.mt
    folder_to_store = args.s
    generate_plots = args.p

    wi = weeklyinfo()
    week_info = wi.getweeklyfo(survey_file)

    ff = filterfields(message_file)
    filtered_data = []
    if m_type == 'all':
        for message_type in ['sms', 'fb_message']:
            filtered_data.extend(ff.filterbyequality(pr.m_type, message_type))
    else:
        filtered_data = ff.filterbyequality(pr.m_type, m_type)
    _, links_tuple, _, pid_dict = hlp.creategraph(filtered_data, filterType=args.mt)
    gh = ghelper()
    plt = plots()
    weekly_deg_dist, _ = gh.getweeklydistributions(pid_dict, filtered_data, message_type=args.mt,
                                                   is_degree=True, week_info=week_info)
    hlp.dumpvariable(weekly_deg_dist, 'weekly_deg_dist.dict', folder_to_store)
    weekly_ew_dist, _ = gh.getweeklydistributions(pid_dict, filtered_data, message_type=args.mt,
                                                  is_degree=False, week_info=week_info)
    hlp.dumpvariable(weekly_ew_dist, 'weekly_ew_dist.dict', folder_to_store)
    if generate_plots:
        plt.plotweeklyprogression(weekly_deg_dist, folder_to_store + 'deg_', 'No. of friends',
                                  'Week No.', 'Friends')
        plt.plotweeklyprogression(weekly_ew_dist, folder_to_store + 'ew_', 'No. of messages exchanged',
                                  'Week No.', 'Messages')

    print 'done...'
Example #12
def main(sql_path, variable_path):
    s_obj = surveys()

    data = s_obj.importsqlascsv(sql_path, 'survey')
    hlp.dumpvariable(data, 'survey_list.list', variable_path)
    hlp.writecsv(data, variable_path+'survey_list.csv')

    ndata = s_obj.interpretanswers(data)
    hlp.dumpvariable(ndata, 'survey_list_interpret.list', variable_path)
    hlp.writecsv(ndata, variable_path+'survey_list_interpret.csv')

    ndata_wR = s_obj.interpretanswers(data, True)
    hlp.dumpvariable(ndata_wR, 'survey_list_with_response_interpret.list', variable_path)
    hlp.writecsv(ndata_wR, variable_path+'survey_list_with_response_interpret.csv')

    data_dict = s_obj.datatodict(ndata)
    hlp.dumpvariable(data_dict, 'survey_dict_interpret.dict', variable_path)

    data_wR_dict = s_obj.datatodict(ndata_wR)
    hlp.dumpvariable(data_wR_dict, 'survey_dict_with_response_interpret.dict', variable_path)
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-d', '-D', required=True,
                        help='labelled data from validate_balance_theory.py')
    parser.add_argument('-f', '-F', required=True,
                        help='folder to save the data in')
    parser.add_argument('-w', '-W', required=False,
                        help='survey file for weekly data processing')


    args = parser.parse_args()
    data_file = args.d
    location_to_store = args.f
    weekly_surveys = args.w

    all_data = hlp.recovervariable(data_file)
    labelled_data = all_data[2]
    pid_dict = all_data[3]

    if weekly_surveys is None:
        reciprocity_dict, message_pairs = find_reciprocity(labelled_data, location_to_store)
        hlp.dumpvariable([reciprocity_dict, message_pairs], 'reciprocity_counts_msgPairs_overall', location_to_store)
    else:
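        # bimonthly grouping of weeks: 1-8, 9-16, and 17-25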
        months2 = [[1, 2, 3, 4, 5, 6, 7, 8],
                   [9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20, 21, 22, 23, 24, 25]]
        wi = weeklyinfo()
        weekly_info = wi.getweeklyfo(weekly_surveys)
        ff = filterfields()
        weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff)
        idx = 1
        for bi_month in months2:
            print 'For weeks: ', bi_month
            bi_month_data = []
            for weekno in bi_month:
                bi_month_data.extend(weekly_data[weekno])
            reciprocity_dict, message_pairs = find_reciprocity(bi_month_data, location_to_store)
            hlp.dumpvariable([reciprocity_dict, message_pairs],
                             'reciprocity_counts_msgPairs_bimonthly_'+str(idx)+'.data', location_to_store)
def main():
    parser = argparse.ArgumentParser('Script to process the survey data')
    parser.add_argument('-i', '-I', type=str, required=True,
                        help='Path to the input dictionary')
    parser.add_argument('-q', '-Q', type=str, required=True, nargs=1,
                        help='Q Types - seenB: seen bullying, didB: did bullying, other: others used my account, '
                             'wasB: was bullied')
    parser.add_argument('-a', '-A', type=str, required=False, nargs='*',
                        help='optional, what answers to filter for')
    parser.add_argument('-s', '-S', type=str, required=True,
                        help='path to save the variables at, with trailing /')
    parser.add_argument('-f', '-F', type=str)
    parser.add_argument('-f1q', '-F1Q', type=str, nargs=1,
                        help='first level filter question')
    parser.add_argument('-f1a', '-F1A', type=str, nargs='*',
                        help='first level filter answers', required=False)
    args = parser.parse_args()

    ip_filepath = args.i
    qno = args.q[0]
    answers = args.a
    op_filepath = args.s
    op_filename = args.f
    filterQ = args.f1q
    filterA = args.f1a
    print 'Processing...'
    res = filtersurvey(ip_filepath, qno, answers)
    to_save = {}
    print 'done'
    if filterQ is not None:
        filterQ = filterQ[0]
        print 'second level filtering argument exists, filtering...'
        for ans in res.keys():
            temp = filtersurvey(res[ans], filterQ, filterA, is_data=True)
            for ans1 in temp.keys():
                to_save[(ans, ans1)] = temp[ans1]
        print 'done'
    else:
        to_save = res
    hlp.dumpvariable(to_save, op_filename, op_filepath)
def main():
    parse = argparse.ArgumentParser('Script to generate statistics on bullying data')
    parse.add_argument('-i', '-I', type=str, required=True,
                       help='Path to the input dictionary containing bullying information')
    parse.add_argument('-m', '-M', type=str, required=True,
                       help='Path to the messages file, should be a csv')
    parse.add_argument('-s', '-S', type=str, required=True,
                       help='Directory where results are stored, with a trailing /')
    parse.add_argument('-f', '-F', type=str, required=True,
                       help='File name')
    parse.add_argument('-p', '-P', type=str, required=True,
                       help='Participant type')
    args = parse.parse_args()
    bullying_data = hlp.recovervariable(args.i)
    message_path = args.m
    save_dir = args.s
    save_f = args.f
    p_type = args.p
    res = {}
    for key in bullying_data.keys():
        res[key] = getstats(message_path, bullying_data[key], p_type)
    hlp.dumpvariable(res, save_f, save_dir)
def get_message_counts(old_dataset, new_dataset, sorted_week_list, weekly_info, hash_to_pid_dict, ff_obj,
                       location_to_store, do_debug):
    in_out_message_dict = {}
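    # result: pid -> [[old_in_weekly_counts, old_out_weekly_counts],
    #                 [new_in_weekly_counts, new_out_weekly_counts]]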
    # do_debug = True
    for pid_hash in hash_to_pid_dict:
        print '\n\n'
        old_pid_out_week_counts, old_out, old_out_week = __get_weekly_counts(old_dataset, pr.m_source, pid_hash,
                                                                             weekly_info, ff_obj, sorted_week_list,
                                                                             pid_hash, True)
        old_pid_in_weeks_counts, old_in, old_in_week = __get_weekly_counts(old_dataset, pr.m_target, pid_hash,
                                                                           weekly_info, ff_obj, sorted_week_list,
                                                                           pid_hash, True)
        new_pid_out_weeks_counts, new_out, new_out_week = __get_weekly_counts(new_dataset, pr.m_source, pid_hash,
                                                                              weekly_info, ff_obj, sorted_week_list,
                                                                              pid_hash)
        new_pid_in_weeks_counts, new_in, new_in_week = __get_weekly_counts(new_dataset, pr.m_target, pid_hash,
                                                                           weekly_info, ff_obj, sorted_week_list,
                                                                           pid_hash)
        in_out_message_dict[hash_to_pid_dict[pid_hash]] = [[old_pid_in_weeks_counts, old_pid_out_week_counts],
                                                           [new_pid_in_weeks_counts, new_pid_out_weeks_counts]]
        print 'Sums: o_o, n_o, o_i, n_i: ', sum(old_pid_out_week_counts), sum(new_pid_out_weeks_counts), \
            sum(old_pid_in_weeks_counts), sum(new_pid_in_weeks_counts)
        print 'Checking the numbers for ' + hash_to_pid_dict[pid_hash] + '(' + str(pid_hash) + ')'
        for week in sorted_week_list:
            if len(old_out_week[week]) > len(new_out_week[week]):
                print '***For week ' + str(week) + ' found old_out_week > new_out_week: ', len(old_out_week[week]), \
                    len(new_out_week[week])
            if do_debug:
                __old_new_compare(old_out_week[week], new_out_week[week])
            if len(old_in_week[week]) > len(new_in_week[week]):
                print '***For week ' + str(week) + ' found old_in_week > new_in_week: ', len(old_in_week[week]), \
                    len(new_in_week[week])
            if do_debug:
                __old_new_compare(old_in_week[week], new_in_week[week])

        hlp.dumpvariable([old_out, old_out_week, old_in, old_in_week, new_out, new_out_week, new_in, new_in_week],
                         hash_to_pid_dict[pid_hash] + '.data', location_to_store)
    return in_out_message_dict
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-f', '-F', type=str, required=True,
                        help='weekly dict')
    parser.add_argument('-p', '-P', type=str, required=True,
                        help='pid')
    parser.add_argument('-w', '-W', type=int, nargs='+',
                        help='list of weeks')
    parser.add_argument('-o', '-O', type=str,
                        help='folder to store the output')
    parser.add_argument('-s', '-S', action='store_true',
                        help='separate out the incoming and outgoing messages')
    parser.add_argument('-io', type=str)

    args = parser.parse_args()
    week_dict_file = args.f
    pid = args.p
    weeks = args.w
    location_to_store = args.o
    separate_in_out = args.s
    show_in_out = args.io

    week_data_dict = hlp.recovervariable(week_dict_file)
    participant_data = {pid: {}}
    for week_no in weeks:
        reduced_data = getspecificdata(week_data_dict, pid, week_no, separate_in_out)
        if reduced_data is None:
            print 'No data found, or some error occurred...'
            continue
        else:
            participant_data[pid] = reduced_data
            print '\n\n\n\n\nData summary for PID:', pid, ' week_no: ', week_no
            printmessages(reduced_data, separate_in_out, show_in_out)
    if location_to_store is not None:
        hlp.dumpvariable(participant_data, pid+'.data', location_to_store)
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-d', '-D', required=True,
                        help='labelled csv')
    parser.add_argument('-f', '-F', required=True,
                        help='folder to save the data in')
    args = parser.parse_args()
    data_file = args.d
    location_to_store = args.f

    all_afinn_data = hlp.readcsv(data_file)
    labelled_data = hlp.processafinnsentiment(all_afinn_data)
    csv_header = ['pid', 'm_type',
                  'in_pos', 'in_neg', 'in_neu',
                  'out_pos', 'out_neg', 'out_neu',
                  'in_deg_part', 'in_deg_nonpart',
                  'out_deg_part', 'out_deg_nonpart']
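    # one csv row per (pid, m_type) pair, summarized by __summarize_data below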

    pol_dist, complete_in_out = distribution_polarity(labelled_data)
    print '***For Complete Dataset***'
    print 'Incoming(P, N, U): ', complete_in_out['in']
    print 'Outgoing(P, N, U): ', complete_in_out['out']
    hlp.dumpvariable([pol_dist, complete_in_out], 'polarity_in_out.dict', location_to_store)

    to_store_csv = [csv_header]

    for pid in pol_dist:
        pid_data = pol_dist[pid]
        for m_type in pid_data:
            m_data = pid_data[m_type]
            csv_line = __summarize_data(m_data)
            final_csv_line = [pid, m_type]
            final_csv_line.extend(csv_line)
            to_store_csv.append(final_csv_line)
    hlp.writecsv(to_store_csv, location_to_store+'polarity_in_out.csv')
Example #19
def main():
    ff = filterfields(sys.argv[1])
    print 'filtering...'
    filtered_data = ff.filterbyequality(pr.m_type, sys.argv[6])
    hlp.dumpvariable(filtered_data, 'filtered_'+sys.argv[6], sys.argv[5])
    print 'done'
    if sys.argv[2] != '-':
        writecsv(sys.argv[2], filtered_data)
    if sys.argv[3] != '-':
        links, link_tuple, graph_obj, pid_dict = hlp.creategraph(filtered_data)
        hlp.dumpvariable(links, 'static_links', sys.argv[5])
        hlp.dumpvariable(link_tuple, 'static_links_tuple', sys.argv[5])
        hlp.dumpvariable(graph_obj, 'static_graph_obj', sys.argv[5])
        hlp.dumpvariable(pid_dict, 'pid_dict', sys.argv[5])
        graph_obj.writegraph(sys.argv[3])
    if sys.argv[4] != '-':
        to_write_edge, to_write_nodes, week_dict, pid_dict, week_content = hlp.creategraph(filtered_data, False)
        writetofile(sys.argv[4]+'_el.csv', to_write_edge)
        writetofile(sys.argv[4]+'_nl.csv', to_write_nodes)
        hlp.dumpvariable(week_dict, 'dynamic_week_dict', sys.argv[5])
        hlp.dumpvariable(pid_dict, 'pid_dict', sys.argv[5])
        hlp.dumpvariable(week_content, 'week_content', sys.argv[5])
random.seed(seed)
ts_before = []  # raw message contents, used below as the unlabelled test set
for datum in smsdata:
    ts_before.append(datum[pr.m_content])

data = []

s_obj = sentiment()
tr_set = s_obj.createtrainingset(tr_before)
ts_set = s_obj.createtestingset(ts_before, testing_has_labels=False)
print 'classifier training'
s_obj.trainclassifier(tr_set)
predictions = []
feature_set = []
print 'making predictions'
# print 'Accuracy: ', s_obj.getaccuracy(ts_set)
idx = 1
for datum in ts_set:
    res = s_obj.individualprediction(datum)
    print idx, ts_before[idx-1], '***pos: ', res.prob('pos'), ' *** neg: ', res.prob('neg')
    smsdata[idx-1].append(res)
    idx += 1

hlp.dumpvariable(smsdata, 'results')
print 'woot!'
def main():
    parser = argparse.ArgumentParser('Script to perform sentiment analysis using VADER')

    parser.add_argument('-m', '-M', type=str, required=True,
                        help='Location of the message file')
    parser.add_argument('-mt', '-MT', type=str, required=True, nargs='+',
                        help='types of messages to filter')
    parser.add_argument('-f', '-F', type=str, required=True,
                        help='filename where data is stored, no extension needed')
    parser.add_argument('-s', '-S', type=str, required=True,
                        help='location of folder to store the file, ends with a /')
    parser.add_argument('-p', '-P', action='store_true',
                        help='flag to store polarities separately')
    parser.add_argument('-w', '-W', type=str, required=False,
                        help='conduct weekly analysis, path to the survey data for '
                             'creating week information')
    parser.add_argument('-l', '-L', type=str, nargs='+', required=True,
                        help='the filters to use, make one or more choices: seenB, wasB, didB')
    parser.add_argument('-lf', '-LF', type=str, nargs='+', required=True,
                        help='location of filtered data, from runSurveyStats.py, in same order as -l/L flag')

    args = parser.parse_args()
    message_file = args.m
    message_types = args.mt
    filename_to_store = args.f
    location_to_store = args.s
    separate_polarity_score = args.p
    survey_file = args.w
    filters_chosen = args.l
    filter_files = args.lf

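    # catch_all_data combines the chosen bullying filters (seenB/wasB/didB)
    # into one source for the overlay plotted below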
    catch_all_data = hlp.getfilterdata(filters_chosen, filter_files, catch_all=True)

    if separate_polarity_score and survey_file is not None:
        print 'Cannot have separate polarity scores and weekly analysis together, ' \
              'please remove the -p/-P flag'
        return

    if survey_file is not None:
        wi = weeklyinfo()
        week_dates = wi.getweeklyfo(survey_file)
        gh = ghelper()
    ff = filterfields(message_file)
    data = []
    for message_type in message_types:
        data.extend(ff.filterbyequality(pr.m_type, message_type))
    pid_dict = hlp.getuniqueparticipants(data, 'all' if len(message_types) > 1 else message_types[0])
    sentiment_analyzer = vadersenti(data[1:])
    returned_data = sentiment_analyzer.compilesentiment(pr.m_content, separate_sentiment_list=separate_polarity_score)
    if separate_polarity_score:
        hlp.dumpvariable(returned_data, filename_to_store + '.data', location_to_store)
    else:
        header = pr.message_header + ['pos', 'neg', 'neu', 'compound']
        final_data = [header] + returned_data
        hlp.writecsv(final_data, location_to_store + filename_to_store + '.csv')
        # gh and week_dates exist only when a survey file was supplied, so
        # guard the weekly analysis to avoid a NameError
        if survey_file is not None:
            weekly_data = gh.filterweeklydata(pid_dict, returned_data, week_dates,
                                              'all' if len(message_types) > 1 else message_types[0])
            hlp.dumpvariable(weekly_data, 'weekly_data.dict', location_to_store)
            summarized_sentiment = {}
            for pid in weekly_data:
                summarized_sentiment[pid] = {}
                participant_data = weekly_data[pid]
                for week_no in participant_data:
                    # note: message_type here is the last entry of message_types
                    summarized_sentiment[pid][week_no] = sentiment_analyzer.summarizesentiment(
                        participant_data[week_no], separate_in_out=True, message_type=message_type)
            hlp.dumpvariable(summarized_sentiment, 'weekly_summarized_sentiment.dict', location_to_store)
            plt = plots()
            overlay_data = gh.createbullyingoverlay(catch_all_data, week_dates, ff)
            plt.plotweeklyprogression(summarized_sentiment, location_to_store, 'Sentiment Progress', 'Week',
                                      'Sentiment Value', sentiment_legend=['Positive', 'Negative', 'Neutral'],
                                      overlay_data=overlay_data)

    print 'done'
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-m', '-M', required=True,
                        help='Sentiment Message file')
    parser.add_argument('-t', '-T', action='store_true',
                        help='Sentiment type flag, if used then vader, else afinn')
    parser.add_argument('-f', '-F', required=True,
                        help='Folder to store checkpoints, and final result')
    parser.add_argument('-w', '-W', required=False,
                        help='Per week/month analysis')

    args = parser.parse_args()
    message_file = args.m
    sentiment_type = args.t
    location_to_store = args.f
    survey_file = args.w

    # get message data, only sms and fb_message
    ff = filterfields(message_file)
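    # drop the csv header row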
    ff.setdata(ff.getdata()[1:])
    sms_data = ff.filterbyequality(pr.m_type, 'sms')
    pid_dict_sms = hlp.getuniqueparticipants2(sms_data)
    fb_message_data = ff.filterbyequality(pr.m_type, 'fb_message')
    pid_dict_fb = hlp.getuniqueparticipants2(fb_message_data)
    message_data = sms_data + fb_message_data

    # put the labels on
    labelled_data = hlp.processvadersentiment(message_data, label_only=False) if sentiment_type else \
        hlp.processafinnsentiment(message_data, label_only=False)

    if survey_file is not None:
        wi = weeklyinfo()
        weekly_info = wi.getweeklyfo(survey_file)
        weekly_data = hlp.divideintoweekly(labelled_data, weekly_info, ff)

    #__temp_testing_for_discrepancy(labelled_data, weekly_data)

    # get the pid_dict for easier handling
    pid_dict = hlp.getuniqueparticipants2(labelled_data)
    if survey_file is not None:
        over_sent, in_sent, out_sent, xtick, ytick = per_participant_sentiment(weekly_data, pid_dict['participants'])
        __plot_imshow(over_sent, 'Participant', 'Week #', xtick, ytick, location_to_store+'sent_imshow_over.pdf')
        __plot_imshow(in_sent, 'Participant', 'Week #', xtick, ytick, location_to_store+'sent_imshow_in.pdf')
        __plot_imshow(out_sent, 'Participant', 'Week #', xtick, ytick, location_to_store+'sent_imshow_out.pdf')

    print '***SMS***'
    print 'P: ', len(pid_dict_sms['participants'].values()), ' NP: ', len(pid_dict_sms['nonparticipants'].values())

    print '***FB***'
    print 'P: ', len(pid_dict_fb['participants'].values()), 'NP: ', len(pid_dict_fb['nonparticipants'].values())

    print '***OVERALL***'
    print 'P: ', len(pid_dict['participants'].values()), 'NP: ', len(pid_dict['nonparticipants'].values())

    summary_src_trg = summarize_message_by_src_trg(labelled_data)
    print '***Message Distribution***'
    for m_type_1 in summary_src_trg:
        print m_type_1, summary_src_trg[m_type_1]

    if survey_file is not None:
        week_list = weekly_data.keys()
        week_list.sort()
        # this is not good, as there aren't enough triads
        months = [[1, 2, 3, 4],
                  [5, 6, 7, 8],
                  [9, 10, 11, 12],
                  [13, 14, 15, 16],
                  [17, 18, 19, 20],
                  [21, 22, 23, 24, 25]]
        # this has at least 8 triads, always, use this
        months2 = [[1, 2, 3, 4, 5, 6, 7, 8],
                   [9, 10, 11, 12, 13, 14, 15, 16],
                   [17, 18, 19, 20, 21, 22, 23, 24, 25]]
        month_idx = 1
        for month in months2:
            labelled_data = []
            for week in month:
                labelled_data.extend(weekly_data[week])
            general_graph, random_graph = conduct_triad_analysis(labelled_data, pid_dict)
            frac_triad = general_graph[3]
            summary_triad = general_graph[2]
            frac_triad_rand = random_graph[3]
            summary_triad_rand = random_graph[2]
            print '** Months ', 2*month_idx-1, 2*month_idx, ': ', month, ' ***'
            print 'len(LD): ', len(labelled_data)
            for summary in frac_triad:
                print summary, 'Study: ', frac_triad[summary], '(', len(summary_triad[summary]), ')', ' Random: ', \
                    frac_triad_rand[summary], '(', len(summary_triad_rand[summary]), ')'
            words_list, short_list = word_count(labelled_data)
            toWrite_wl_csv = create_word_count_csv(words_list)
            hlp.writecsv(toWrite_wl_csv, location_to_store+'word_list_'+str(2*month_idx-1)+'-'+str(2*month_idx)+'.csv',
                         delimiter_sym=',')
            for mtype in words_list:
                counted_words = Counter(words_list[mtype])
                counted_short = Counter(short_list[mtype])
                print '***For '+mtype+' ***'
                print 'Top 20 words: ', __get_top_word_sentiment(counted_words.most_common(20))
                print 'Top 20 short: ', counted_short.most_common(20)
                print '\n\n'
            hlp.dumpvariable([general_graph, random_graph, labelled_data, pid_dict], 'month_'+str(month_idx)+'.list', location_to_store)
            month_idx += 1
    else:
        print 'len(LD): ', len(labelled_data)
        words_list, short_list = word_count(labelled_data)
        toWrite_wl_csv = create_word_count_csv(words_list)
        hlp.writecsv(toWrite_wl_csv, location_to_store+'word_list.csv', delimiter_sym=',')
        for mtype in words_list:
            counted_words = Counter(words_list[mtype])
            counted_short = Counter(short_list[mtype])
            print '***For '+mtype+' ***'
            print 'Top 20 words: ', __get_top_word_sentiment(counted_words.most_common(20))
            print 'Top 20 short: ', counted_short.most_common(20)
            print '\n\n'
        general_graph, random_graph = conduct_triad_analysis(labelled_data, pid_dict)
        frac_triad = general_graph[3]
        summary_triad = general_graph[2]
        frac_triad_rand = random_graph[3]
        summary_triad_rand = random_graph[2]
        for summary in frac_triad:
            print summary, 'Study: ', frac_triad[summary], '(', len(summary_triad[summary]), ')', ' Random: ', \
                frac_triad_rand[summary], '(', len(summary_triad_rand[summary]), ')'
        hlp.dumpvariable([general_graph, random_graph, labelled_data, pid_dict], 'Overall.list', location_to_store)
        # plot_degree_dist(general_graph[4], 'Degree(d)', '# of Participants with Degree d')
        pos, neg, neu = get_polarity_directionality(labelled_data)
        print '***Polarity Distribution***'
        print 'Positive: \n', pos
        print 'Negative: \n', neg
        print 'Neutral: \n', neu

        in_m, out_m, in_d, out_d = get_count_degrees_messages_directed(labelled_data, pid_dict['participants'])
        print '***Incoming Messages***'
        print 'Total: ', sum(in_m), 'Mean: ', np.mean(in_m), 'Std. dev.: ', np.std(in_m)
        print '***Outgoing Messages***'
        print 'Total: ', sum(out_m), 'Mean: ', np.mean(out_m), 'Std. dev.: ', np.std(out_m)
        print '***In Degree***'
        print 'Total: ', sum(in_d), 'Mean: ', np.mean(in_d), 'Std. dev.: ', np.std(in_d)
        print '***Out Degree***'
        print 'Total: ', sum(out_d), 'Mean: ', np.mean(out_d), 'Std. dev.: ', np.std(out_d)
        print '***COUNTS***'
        plot_messages_degree([in_m, out_m], '# of Messages', 'Cumulative Participant Prob.',
                      location_to_store+'in_out_messages.pdf')
        # plot_messages_degree(out_m, '# of Outgoing Messages', 'Cumulative Participant Prob.',
        #               location_to_store+'out_messages.pdf')
        plot_messages_degree([in_d, out_d], 'Degree', 'Cumulative Participant Prob.',
                      location_to_store+'in_out_degree.pdf', True)
        # plot_messages_degree(out_d, 'Out Degree', 'Cumulative Participant Prob.',
        #               location_to_store+'out_degree.pdf', True)
    print 'TADAA!!'