def main():
    parser = __define_process_parser()
    old_dataset_file, new_dataset_mapped, missing_data, \
    survey_file, location_to_store = __define_process_parser(True, parser)

    old_dataset = hlp.readcsv(old_dataset_file, delimiter_sym=',', remove_first=True)
    new_dataset = hlp.readcsv(new_dataset_mapped, delimiter_sym=',', remove_first=True)
    old_data_missing = hlp.readcsv(missing_data, delimiter_sym=',', remove_first=True)
    old_missing = __dictify(0, old_data_missing)
    wi = weeklyinfo()
    week_info = wi.getweeklyfo(survey_file)
    week_list = week_info.keys()
    bullying_positives = __find_positive_survey(survey_file, week_info)
    if bullying_positives is None:
        print 'Exiting...'
        exit()

    ff = filterfields()
    old_data_weekly = hlp.divideintoweekly(old_dataset, week_info, ff, date_field=pr.m_time_sent)
    new_data_weekly = hlp.divideintoweekly(new_dataset, week_info, ff, date_field=nd.m_timecreated)
    bullying_res = [['pid_hash', 'survey_id', 'time_of_survey', 'n_old', 'n_new', 'raw', 'semi', 'ordered', 'other']]
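    # For each positive survey response, gather messages from the previous, current,
    # and next study weeks (clamped to the study period) and compare old vs. new counts.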
    for datum in bullying_positives:
        bullying_week = datum[-1]
        prev_week = bullying_week - 1 if bullying_week > min(week_list) else min(week_list)
        next_week = bullying_week + 1 if bullying_week < max(week_list) else max(week_list)
        old_data_pos = old_data_weekly[prev_week] + old_data_weekly[bullying_week] + old_data_weekly[next_week]
        new_data_pos = new_data_weekly[prev_week] + new_data_weekly[bullying_week] + new_data_weekly[next_week]
        pid_hash = datum[s_i.s_participant]
        n_old, n_new, nfr_dict = compare_old_new(old_data_pos, new_data_pos, old_missing, pid_hash, ff)
        temp = [pid_hash, datum[s_i.s_id], datum[s_i.s_time], n_old, n_new, nfr_dict['raw'], nfr_dict['semi'],
                nfr_dict['ordered'], nfr_dict['other']]
        bullying_res.append(temp)
    hlp.writecsv(bullying_res, location_to_store+'bullying_res.csv', delimiter_sym=',')
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("-d", "-D", required=True, help="the dataset")
    parser.add_argument("-m", "-M", required=True, help="mapping of the hashes")
    parser.add_argument("-f", "-F", required=True, help="folder to store the output in")
    parser.add_argument("-o", "-O", action="store_true", help="flag to indicate that we have the old dataset")

    args = parser.parse_args()

    dataset_file = args.d
    mapping_hash_file = args.m
    location_to_store = args.f
    # TODO: integrate the old dataset processing
    is_old = args.o
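    # Example invocation (illustrative file names, script name is a placeholder):
    #   python <script>.py -d new_dataset.csv -m hash_mapping.csv -f output/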

    new_dataset = hlp.readcsv(dataset_file, delimiter_sym=",", remove_first=True)
    mapping_hash = hlp.readcsv(mapping_hash_file, delimiter_sym=",", remove_first=True)

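    # Column order in the mapping file is assumed to be (participant id, hashed id).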
    pid_dict = {datum[1]: datum[0] for datum in mapping_hash}

    in_m, out_m, in_d, out_d = get_degree_message_count(new_dataset, pid_dict)
    vbt.plot_messages_degree(
        [in_m.values(), out_m.values()],
        "# of Messages",
        "Cumulative Participant Prob.",
        location_to_store + "in_out_messages.pdf",
    )
    vbt.plot_messages_degree(
        [in_d.values(), out_d.values()],
        "Degree",
        "Cumulative Participant Prob.",
        location_to_store + "in_out_degree.pdf",
        True,
    )
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-o', '-O', required=True, help='Old dataset csv')
    parser.add_argument('-n', '-N', required=True, help='New dataset csv')
    parser.add_argument('-s', '-S', required=True, help='Survey file')
    parser.add_argument('-p', '-P', required=True, help='folder to store figures in, should end with /')
    parser.add_argument('-m', '-M', required=True, help='Master hash mapping csv')
    parser.add_argument('-mt', '-MT', required=True, nargs='+', help='Types of messages to look for')
    parser.add_argument('-d', '-D', action='store_true', help='Flag to debug')

    args = parser.parse_args()

    old_dataset_file = args.o
    new_dataset_file = args.n
    survey_file = args.s
    location_to_store = args.p
    master_hash_csv = args.m
    message_types = args.mt
    do_debug = args.d

    print 'Reading data...'
    master_csv = hlp.readcsv(master_hash_csv, delimiter_sym=',', remove_first=True)
    master_dict = {datum[1]: datum[0] for datum in master_csv}

    ff = filterfields()

    filtered_old = []
    filtered_new = []

    old_dataset = hlp.readcsv(old_dataset_file, delimiter_sym=',', remove_first=True)
    new_dataset = hlp.readcsv(new_dataset_file, delimiter_sym=',', remove_first=True)

    print 'Filtering message types'
    for message_type in message_types:
        filtered_old.extend(ff.filterbyequality(pr.m_type, message_type, data=old_dataset))
        filtered_new.extend(ff.filterbyequality(pr.m_type, message_type, data=new_dataset))

    wi = weeklyinfo()
    weekly_info = wi.getweeklyfo(survey_file)
    week_list = weekly_info.keys()
    week_list.sort()

    print 'Creating in out dictionary'
    in_out_message_dict = get_message_counts(filtered_old, filtered_new, week_list, weekly_info, master_dict, ff,
                                             location_to_store, do_debug)

    print 'Plotting...'
    for pid in in_out_message_dict:
        print pid
        plot_distribution(in_out_message_dict[pid][0][0], in_out_message_dict[pid][0][1],
                          in_out_message_dict[pid][1][0], in_out_message_dict[pid][1][1], week_list, pid,
                          location_to_store)
    print 'TADAA!!'
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-m', '-M', required=True,
                        help='Message file')
    parser.add_argument('-p', '-P', action='store_true')
    parser.add_argument('-s', '-S', required=True,
                        help='filename to store polarity in, no extension needed')
    parser.add_argument('-f', '-F', required=True,
                        help='folder to store the files in, ending with /')
    parser.add_argument('-n', '-N', required=False, nargs=2, type=int, default=[0, 2],
                        help='the neutral threshold, first value is min, second is max')

    args = parser.parse_args()

    messagefile = args.m
    location_to_store = args.f
    file_to_store = args.s
    separate_sentiment = args.p
    neutral_limit = args.n

    message_data = hlp.readcsv(messagefile)
    message_header = message_data[0]
    message_data = message_data[1:]

    afinn = afinnsenti(data=message_data, neutral_threshold=neutral_limit)
    data = afinn.compilesentiment(separate_sentiment_list=separate_sentiment, field_no=nd.m_content)
    if separate_sentiment:
        hlp.dumpvariable(data, file_to_store+'.list', location_to_store)
    else:
        message_header.append('score')
        message_header.append('label')
        final_data = [message_header] + data
        hlp.writecsv(final_data, location_to_store + file_to_store + '.csv', delimiter_sym=',')
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-m', '-M', required=True,
                        help='Message file')
    parser.add_argument('-p', '-P', action='store_true')
    parser.add_argument('-s', '-S', required=True,
                        help='filename to store polarity in, no extension needed')
    parser.add_argument('-f', '-F', required=True,
                        help='folder to store the files in, ending with /')

    args = parser.parse_args()
    messagefile = args.m
    location_to_store = args.f
    file_to_store = args.s
    separate_sentiment = args.p

    message_data = hlp.readcsv(messagefile)
    message_header = message_data[0]
    message_data = message_data[1:]

    vader = vadersenti(data=message_data)
    data = vader.compilesentiment(separate_sentiment_list=separate_sentiment)
    if separate_sentiment:
        hlp.dumpvariable(data, file_to_store+'.list', location_to_store)
    else:
        message_header.append('pos')
        message_header.append('neg')
        message_header.append('neu')
        message_header.append('compound')
        final_data = [message_header] + data
        hlp.writecsv(final_data, location_to_store + file_to_store + '.csv', delimiter_sym=',')
def __find_positive_survey(survey_file, week_info):
    week_no = week_info.keys()
    week_no.sort()

    ff = filterfields()
    s_obj = surveys()

    survey_data = hlp.readcsv(survey_file, delimiter_sym=',')
    n_data = s_obj.interpretanswers(survey_data, True)
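    # Keep answers to question '4' (assumed to be the bullying item) from the interpreted responses.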
    bullying_positives = ff.filterbyequality(s_i.s_qno, '4', data=n_data[1:])

    new_bullying_positives = []
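    # Tag each positive response with the study week whose date range contains the
    # survey timestamp; abort if a response falls outside every week.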
    for datum in bullying_positives:
        datetime_of_survey = ff.converttodate(datum[s_i.s_time])
        found_match = False
        for week in week_no:
            (start_date, end_date) = week_info[week]
            if start_date <= datetime_of_survey <= end_date:
                datum.append(week)
                new_bullying_positives.append(datum)
                found_match = True
                break
        if not found_match:
            print 'Something funky happened...', datum
            return None
    return new_bullying_positives
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '-D', help='Dataset', required=True)
    parser.add_argument('-n', '-N', help='Flag to indicate new dataset', action='store_true')
    args = parser.parse_args()
    dataset_file = args.d
    new_dataset = args.n
    dataset = hlp.readcsv(dataset_file, delimiter_sym=',', remove_first=True)
    type_dict = {}
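    # Tally messages per type; the type column index differs between the old and new schemas.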
    for datum in dataset:
        m_type = datum[nd.m_type] if new_dataset else datum[pr.m_type]
        if m_type not in type_dict:
            type_dict[m_type] = 0
        type_dict[m_type] += 1
    sorted_types = type_dict.keys()
    sorted_types.sort()
    total = sum(type_dict.values())
    for keyn in sorted_types:
        print keyn + ': ' + str(type_dict[keyn])
    print 'total: '+str(total)
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-d', '-D', required=True,
                        help='labelled csv')
    parser.add_argument('-f', '-F', required=True,
                        help='folder to save the data in')
    args = parser.parse_args()
    data_file = args.d
    location_to_store = args.f

    all_afinn_data = hlp.readcsv(data_file)
    labelled_data = hlp.processafinnsentiment(all_afinn_data)
    csv_header = ['pid', 'm_type',
                  'in_pos', 'in_neg', 'in_neu',
                  'out_pos', 'out_neg', 'out_neu',
                  'in_deg_part', 'in_deg_nonpart',
                  'out_deg_part', 'out_deg_nonpart']

    pol_dist, complete_in_out = distribution_polarity(labelled_data)
    print '***For Complete Dataset***'
    print 'Incoming(P, N, U): ', complete_in_out['in']
    print 'Outgoing(P, N, U): ', complete_in_out['out']
    hlp.dumpvariable([pol_dist, complete_in_out], 'polarity_in_out.dict', location_to_store)

    to_store_csv = [csv_header]

    for pid in pol_dist:
        pid_data = pol_dist[pid]
        for m_type in pid_data:
            m_data = pid_data[m_type]
            csv_line = __summarize_data(m_data)
            final_csv_line = [pid, m_type]
            final_csv_line.extend(csv_line)
            to_store_csv.append(final_csv_line)
    hlp.writecsv(to_store_csv, location_to_store+'polarity_in_out.csv')
from sentimentanalysis import sentiment
from filterByField import filterfields
from basicInfo import twitterdataset as td
from basicInfo import privateInfo as pr
import helper as hlp
import random

data = hlp.readcsv('../ignore_data/Sentiment_Twitter.csv')
data = data[1:]
ff = filterfields('../ignore_data/messages.csv')
smsdata = ff.filterbyequality(pr.m_type, 'sms')

k = len(data)
l = len(smsdata)
seed = 254
random.seed(seed)
tr_n = 1000000
ts_n = 30
idx = 0
tr_before = []
ts_before = []

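# Build the training set by sampling labelled tweets uniformly at random (with replacement).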
while idx < tr_n:
    i = random.randint(0, k - 1)  # randint is inclusive on both ends; k would index past the last tweet
    datum = data[i]
    tweet_type = td.sentiment_dict[datum[td.sentiment]]
    tweet_content = datum[td.sentiment_text]
    tr_before.append((tweet_content, tweet_type))
    idx += 1

random.seed(seed)
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-o', '-O', help='Old Dataset', required=True)
    parser.add_argument('-n', '-N', help='New Dataset', required=True)
    parser.add_argument('-f', '-F', help='Folder to store results in, ending with /', required=True)
    parser.add_argument('-p', '-P', help='text file with list of people who were ordered to be removed', required=True)
    parser.add_argument('-s', '-S', help='text file with list of people who were semi-consented', required=True)

    args = parser.parse_args()

    old_dataset_file = args.o
    new_dataset_file = args.n
    location_to_store = args.f
    ordered_removed_file = args.p
    semi_consented_file = args.s

    print '***Reading data from arguments...'
    old_dataset = hlp.readcsv(old_dataset_file, delimiter_sym=',', remove_first=True)
    new_dataset = hlp.readcsv(new_dataset_file, delimiter_sym=',')
    new_dataset_dictionary = generate_new_dataset_dictionary(new_dataset[1:])
    new_dataset_msg_id_dictionary = generate_new_dataset_dictionary(new_dataset[1:], use_m_id=True)
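    # The consent files are expected to hold a Python literal (e.g. a list of participant
    # hashes), which is eval()'d directly below.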
    with open(ordered_removed_file, 'r') as f:
        ordered_removed = eval(f.read())
    with open(semi_consented_file, 'r') as f:
        semi_consented = eval(f.read())

    print '***Filtering old data within dates of study...'
    ff = filterfields()
    old_dataset_within_dates = ff.filterbetweendates(ff.converttodate(pr.start_datetime),
                                                     ff.converttodate(pr.end_datetime), data_to_work=old_dataset,
                                                     right_equality=True, date_field=pr.m_time_sent)
    old_dataset = old_dataset_within_dates
    old_dataset_counts = {}
    for datum in old_dataset:
        m_type = datum[pr.m_type]
        if m_type not in old_dataset_counts:
            old_dataset_counts[m_type] = 0
        old_dataset_counts[m_type] += 1
    print '*** OLD DATASET COUNTS***', old_dataset_counts
    print '***Finding mapping...'
    mapping_dict = {}
    inverted_mapping_dict = {}
    missed_dict = {}
    no_reason = []
    counts_no_match = {'ord': {'sms': 0, 'fb_message': 0, 'twitter_status': 0, 'twitter_message': 0,
                               'fb_activity': 0, 'fb_like': 0, 'fb_comment': 0},
                       'semi': {'sms': 0, 'fb_message': 0, 'twitter_status': 0, 'twitter_message': 0, 'fb_activity': 0,
                                'fb_like': 0, 'fb_comment': 0},
                       'no': {'sms': 0, 'fb_message': 0, 'twitter_status': 0, 'twitter_message': 0, 'fb_activity': 0,
                              'fb_like': 0, 'fb_comment': 0}}
    counts_match = {'sms': 0, 'fb_message': 0, 'twitter_status': 0, 'twitter_message': 0, 'fb_activity': 0,
                    'fb_like': 0, 'fb_comment': 0}
    no_reason_counts = {}
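    # Try to locate every old message in the new dataset; matches are recorded in
    # mapping_dict, misses are classified by consent status (ordered removed,
    # semi consented, or no known reason).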
    for datum in old_dataset:
        m_result, msg_val = message_exists(datum, new_dataset_dictionary, ff)
        if m_result:
            mapping_dict[datum[pr.msg_id]] = msg_val
            if msg_val[1] not in inverted_mapping_dict:
                inverted_mapping_dict[msg_val[1]] = []
            inverted_mapping_dict[msg_val[1]].append(datum[pr.msg_id])
            m_type = datum[pr.m_type]
            if m_type in counts_match:
                counts_match[m_type] += 1
        else:
            src = datum[pr.m_source]
            trg = datum[pr.m_target]
            m_type = datum[pr.m_type]
            if src in ordered_removed or trg in ordered_removed:
                reason = 'ordered removed'
                if m_type in counts_no_match['ord']:
                    counts_no_match['ord'][m_type] += 1
            elif src in semi_consented or trg in semi_consented:
                reason = 'semi consented'
                if m_type in counts_no_match['semi']:
                    counts_no_match['semi'][m_type] += 1
            else:
                reason = ''
                temp = datum
                temp.append(msg_val)
                no_reason.append(temp)
                if m_type in counts_no_match['no']:
                    counts_no_match['no'][m_type] += 1
                if m_type not in no_reason_counts.keys():
                    no_reason_counts[m_type] = {}
                if msg_val not in no_reason_counts[m_type].keys():
                    no_reason_counts[m_type][msg_val] = 0
                no_reason_counts[m_type][msg_val] += 1
            missed_dict[datum[pr.msg_id]] = [msg_val, datum[pr.m_type], reason]
    print '\n\n**NOT FOUND**'
    for key_v in counts_no_match.keys():
        print key_v
        print counts_no_match[key_v]
    print '\n\n**NO REASON**'
    for key_v in no_reason_counts.keys():
        print key_v
        print no_reason_counts[key_v]
    print '\n\n**FOUND**', counts_match
    print '***Creating new dataset with mappings...'
    new_dataset_header = new_dataset[0]
    new_dataset_header.append('Old Message IDs')
    final_dataset = [new_dataset_header]
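    # Attach the matched old message id(s), if any, to every row of the new dataset.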
    for new_msg_id in new_dataset_msg_id_dictionary.keys():
        datum = new_dataset_msg_id_dictionary[new_msg_id]
        old_msg_id = [''] if new_msg_id not in inverted_mapping_dict else inverted_mapping_dict[new_msg_id]
        datum.extend(old_msg_id)
        final_dataset.append(datum)

    print '***Writing data...'
    hlp.writecsv(final_dataset, location_to_store + 'new_old_mapped_hashed_dataset.csv', delimiter_sym=',')
    mapping_dict_list = [[x, mapping_dict[x][0], mapping_dict[x][1]] for x in mapping_dict]
    mapping_header = [['old_id', 'cosine_val', 'new_id']]
    mapping_header.extend(mapping_dict_list)
    hlp.writecsv(mapping_header, location_to_store + 'old_to_new_mapping.csv', delimiter_sym=',')
    missed_dict_list = [[x, missed_dict[x][0], missed_dict[x][1], missed_dict[x][2]] for x in missed_dict]
    missed_header = [['old_id', 'Reason', 'm_type', 'Explanation']]
    missed_header.extend(missed_dict_list)
    hlp.writecsv(missed_header, location_to_store + 'old_not_found.csv', delimiter_sym=',')
    hlp.writecsv(no_reason, location_to_store + 'old_not_found_no_reason.csv', delimiter_sym=',')
    print 'TADAA!!!'