Example #1
def aggregate_by_period(dataframe):
    before_convention = Counter()
    convention = Counter()
    after_convention = Counter()
    for i, time in enumerate(dataframe['Full Date']):
        # Convert time to a string so the dates can be compared lexicographically
        # to determine which period the row belongs to (zero-padded ISO dates required)
        time = str(time)
        if (time >= "1792-06-10") and (time <= "1792-08-10"):
            before_convention = before_convention + dataframe['ngrams'].iloc[i]
        if (time >= "1792-09-20") and (time < "1793-06-02"):
            convention = convention + dataframe['ngrams'].iloc[i]
        if (time >= "1793-06-02") and (time <= "1793-08-02"):
            after_convention = after_convention + dataframe['ngrams'].iloc[i]

    before_convention_tfidf = compute_tfidf(before_convention, num_speeches,
                                            doc_freq)
    convention_tfidf = compute_tfidf(convention, num_speeches, doc_freq)
    after_convention_tfidf = compute_tfidf(after_convention, num_speeches,
                                           doc_freq)

    before_convention_df = pd.DataFrame.from_dict(before_convention_tfidf,
                                                  orient="index")
    convention_df = pd.DataFrame.from_dict(convention_tfidf, orient="index")
    after_convention_df = pd.DataFrame.from_dict(after_convention_tfidf,
                                                 orient="index")

    #period_df = pd.DataFrame([before_convention, convention, after_convention])
    #write_to_excel(period_df, 'periods.xlsx')

    period_df = [
        before_convention_tfidf, convention_tfidf, after_convention_tfidf
    ]
    return period_df
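
# compute_tfidf is assumed rather than defined in these examples. A minimal sketch
# consistent with its call sites -- compute_tfidf(counts, num_speeches, doc_freq)
# returning a {term: score} dict -- might look like this; the log-based idf weighting
# is an assumption, not necessarily the original formula:
import math


def compute_tfidf(term_counts, num_docs, doc_freq):
    """Map each term to tf * idf, with idf = log(num_docs / doc_freq[term])."""
    tfidf = {}
    for term, tf in term_counts.items():
        df = doc_freq.get(term, 1)  # guard against terms missing from doc_freq
        tfidf[term] = tf * math.log(float(num_docs) / df)
    return tfidf
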
Example #2
def aggregate_by_period(dataframe):
    before_convention = Counter()
    convention = Counter()
    after_convention = Counter()
    for time in dataframe:
        if (time >= "1792-06-10") and (time <= "1792-08-10"):
            before_convention = before_convention + dataframe[time]
        if (time >= "1792-09-20") and (time < "1793-06-02"):
            convention = convention + dataframe[time]
        if (time >= "1793-06-02") and (time <= "1793-08-02"):
            after_convention = after_convention + dataframe[time]

    before_convention_tfidf = compute_tfidf(before_convention, num_speeches,
                                            doc_freq)
    convention_tfidf = compute_tfidf(convention, num_speeches, doc_freq)
    after_convention_tfidf = compute_tfidf(after_convention, num_speeches,
                                           doc_freq)

    before_convention_df = pd.DataFrame.from_dict(before_convention_tfidf,
                                                  orient="index")
    convention_df = pd.DataFrame.from_dict(convention_tfidf, orient="index")
    after_convention_df = pd.DataFrame.from_dict(after_convention_tfidf,
                                                 orient="index")

    period_df = [
        before_convention_tfidf, convention_tfidf, after_convention_tfidf
    ]
    return period_df
def distance_analysis():

	by_speaker = pickle.load(open("byspeaker.pickle", "rb"))
	speakernumwords = pickle.load(open("speakernumwords.pickle", "rb"))

	speaker_tfidf = {}
	for speaker in by_speaker:
		counter = by_speaker[speaker]
		counter = convert_keys_to_string(counter)
		# Tried v >= 3, but some speakers never use any bigram more than three times
		freq = {k: v for k, v in counter.items() if (v >= 2)}
		tfidf = compute_tfidf(freq, num_speeches, doc_freq)
		speaker_tfidf[speaker] = tfidf

	robespierre = speaker_tfidf["Maximilien-Francois-Marie-Isidore-Joseph de Robespierre"]

	speaker_dist = {}

	for speaker in speaker_tfidf:
		if str(speaker) != "Maximilien-Francois-Marie-Isidore-Joseph de Robespierre":
			print(speaker_tfidf[speaker])
			dist = 1 - cosine_similarity(robespierre, speaker_tfidf[speaker])
			speaker_dist[speaker] = dist


	w = csv.writer(open("dist_to_robespierre_withlimit.csv", "w"))
	for key, val in speaker_dist.items():
		w.writerow([key,val])
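
# cosine_similarity above operates on sparse {term: weight} dicts rather than arrays
# (it is not the scikit-learn function of the same name). A minimal sketch under that
# assumption:
import math


def cosine_similarity(vec_a, vec_b):
    """Dot product over the product of the vector norms, iterating shared keys only."""
    dot = sum(weight * vec_b[term] for term, weight in vec_a.items() if term in vec_b)
    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)
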
def counts_and_tfidf(Girondins, Montagnards, gir_docs, mont_docs, num_speeches, bigram_doc_freq):
	
	# Computes the tfidf scores within each group
	gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
	mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

	store_to_pickle(gir_tfidf, "gir_tfidf.pickle")
	store_to_pickle(mont_tfidf, "mont_tfidf.pickle")

	# Stores the tf_idf vectors in Excel
	df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient = "index")
	write_to_excel(df_gir_tfidf, 'gir_tfidf.xlsx')

	df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient = "index")
	write_to_excel(df_mont_tfidf, 'mont_tfidf.xlsx')

	# Combines the tfidf vectors of both parties into one file
	df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
	df_tfidf_combined = df_tfidf_combined.transpose()
	df_tfidf_combined.columns = ["Girondins", "Montagnards"]
	write_to_excel(df_tfidf_combined, 'combined_tfidf.xlsx')

	# Restricts bigrams by v, the number of times the bigram appears, and optionally by
	# gir_docs/mont_docs, the number of speakers in each group who use the bigram.
	# These dict names could be changed to make the restrictions explicit.
	Girondins_restricted = {k: v for k, v in Girondins.items() if (v >= 10)}  #and (len(gir_docs[k]) > 1)}
	Montagnards_restricted = {k: v for k, v in Montagnards.items() if (v >= 10)}  #and (len(mont_docs[k]) > 1)}

	store_to_pickle(Girondins_restricted, "Girondins_restricted.pickle")
	store_to_pickle(Montagnards_restricted, "Montagnards_restricted.pickle")

	gir_tfidf = compute_tfidf(Girondins_restricted, num_speeches, bigram_doc_freq)
	mont_tfidf = compute_tfidf(Montagnards_restricted, num_speeches, bigram_doc_freq)

	# Stores the restricted Girondins and Montagnards frequency vectors and tf-idfs in the same document
	df_combined = pd.DataFrame([Girondins_restricted, Montagnards_restricted])
	df_combined = df_combined.transpose()
	df_combined.columns = ["Girondins", "Montagnards"]
	write_to_excel(df_combined, 'combined_frequency_restricted.xlsx')

	df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
	df_tfidf_combined = df_tfidf_combined.transpose()
	df_tfidf_combined.columns = ["Girondins", "Montagnards"]
	write_to_excel(df_tfidf_combined, 'combined_tfidf_restricted.xlsx')
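
# store_to_pickle and write_to_excel are assumed utilities. Plausible sketches
# consistent with how they are called above (and with the ExcelWriter pattern used
# verbatim elsewhere in these examples):
import pickle

import pandas as pd


def store_to_pickle(obj, filename):
    with open(filename, 'wb') as handle:
        pickle.dump(obj, handle, protocol=0)


def write_to_excel(dataframe, filename):
    writer = pd.ExcelWriter(filename)
    dataframe.to_excel(writer, 'Sheet1')
    writer.save()
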
def create_tfidf_vectors(dataframe):
	speeches = dataframe['concat_speeches'].tolist()
	ngrams = []
	for unit in speeches:
		ngrams.append(compute_ngrams(unit, 2))
	ngrams_to_add = pd.Series(ngrams)
	dataframe['ngrams'] = ngrams_to_add.values
	tfidf = []
	for element in ngrams:
		tfidf.append(compute_tfidf(element, num_speeches, doc_freq))
	tfidf_to_add = pd.Series(tfidf)
	dataframe['tfidf'] = tfidf_to_add.values
	return dataframe
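
# compute_ngrams(text, n) is assumed to return a Counter of n-gram tuples. A minimal
# sketch with naive whitespace tokenization; the original tokenizer and any text
# cleaning are unknown:
from collections import Counter


def compute_ngrams(text, n):
    tokens = text.split()
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))
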
Example #6
def create_tfidf_vectors(dataframe):
    tfidf = {}
    for element in dataframe:
        tfidf[element] = compute_tfidf(dataframe[element], num_speeches,
                                       doc_freq)
    return tfidf
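
# Example usage, assuming by_speaker maps speaker names to bigram Counters and that
# num_speeches and doc_freq exist at module level as in the other examples:
#
#     by_speaker = pickle.load(open("byspeaker.pickle", "rb"))
#     speaker_tfidf = create_tfidf_vectors(by_speaker)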
Example #7
def compute_distances(dataframe, period, gir_dict, mont_dict, gir_mont_diff):
    period_vector = []
    if (period == 'aggregation') or (period == 'speaker'):
        period_vector = list(dataframe.keys())
        period_vector = pd.Series(period_vector)
        speaker_num_speeches = pickle.load(
            open("speaker_num_speeches.pickle", "rb"))
        """period_vector = pd.Series(period_vector)
		tfidf_scores = dataframe['tfidf'].tolist()"""
    else:
        periods = ["Before convention", "Convention", "After convention"]
        period_vector = pd.Series(periods)
        # This assumes that tfidf_scores for the periods is a list, not a pandas DataFrame

    gir_dist = []
    mont_dist = []
    gir_mont_diff_dist = []
    # This for loop is contingent on tfidf_scores being a list
    for element in dataframe:
        """print type(element)
		print type(dataframe[element])
		to_compare = dataframe[element]"""
        if period == 'speaker':
            #gir = pickle.load(open("Girondins.pickle", "rb"))
            #mont = pickle.load(open("Montagnards.pickle", "rb"))

            # Consider dividing by number of speeches, to normalize
            # Maintain num of speeches per group and number of chars per group

            gir = pickle.load(open("Girondins.pickle", "rb"))
            mont = pickle.load(open("Montagnards.pickle", "rb"))

            with open('gir_speeches_noplein.txt', 'r') as f:
                gir_num_speeches = int(f.read())

            with open('mont_speeches_noplein.txt', 'r') as f:
                mont_num_speeches = int(f.read())

            speakers_to_analyze = load_list(
                "Girondins and Montagnards New Mod.xlsx")
            party = speakers_to_analyze.loc[element, "Party"]

            print(element)
            print(party)
            print(type(dataframe[element]))

            if party == 'Girondins':
                gir = gir - dataframe[element]
            if party == 'Montagnards':
                mont = mont - dataframe[element]

            # Normalizing by number of speeches
            #gir_normalized = normalize_by_speeches(gir, gir_num_speeches)
            #mont_normalized = normalize_by_speeches(mont, mont_num_speeches)

            gir_dict = convert_keys_to_string(
                compute_tfidf(gir, num_speeches, doc_freq))
            mont_dict = convert_keys_to_string(
                compute_tfidf(mont, num_speeches, doc_freq))
            gir_mont_diff = compute_difference_withplein(gir_dict, mont_dict)

            # Resets the Gir and Mont vectors to their unnormalized version
            #gir_dict_unnormalized = convert_keys_to_string(compute_tfidf(gir, num_speeches, doc_freq))
            #mont_dict_unnormalized = convert_keys_to_string(compute_tfidf(mont, num_speeches, doc_freq))

            w = csv.writer(open("gir_mont_diff.csv", "w"))
            for key, val in gir_mont_diff.items():
                w.writerow([key, val])

            # Normalizing the speaker data as well
            #speaker_speeches = speaker_num_speeches[element]
            #speaker_dict = normalize_by_speeches(dataframe[element], speaker_speeches)

            speaker_dict = dataframe[element]

            tfidf_speaker = compute_tfidf(speaker_dict, num_speeches, doc_freq)

            to_compare = convert_keys_to_string(tfidf_speaker)
        elif period == 'aggregation':
            to_compare = convert_keys_to_string(dataframe[element])
        else:
            to_compare = convert_keys_to_string(element)
        # Checks whether the tfidf_scores vector exists; if not, default distance values are assigned.
        # This was particularly relevant because one speaker had a tfidf_scores vector of length 0.
        if len(to_compare) > 0:
            #Normalized
            gir_dist.append(1 - cosine_similarity(gir_dict, to_compare))
            mont_dist.append(1 - cosine_similarity(mont_dict, to_compare))

            #Unnormalized
            #gir_dist.append(1 - cosine_similarity(gir_dict_unnormalized, to_compare))
            #mont_dist.append(1 - cosine_similarity(mont_dict_unnormalized, to_compare))

            gir_mont_diff_dist.append(
                cosine_similarity(gir_mont_diff, to_compare))
        else:
            gir_dist.append(1)
            mont_dist.append(1)
            gir_mont_diff_dist.append(0)

    # Merges the distance lists and creates a comprehensive dataframe to return
    mont_dist = pd.Series(mont_dist)
    gir_dist = pd.Series(gir_dist)
    gir_mont_diff_dist = pd.Series(gir_mont_diff_dist)
    comp_df = pd.DataFrame(
        [period_vector, gir_dist, mont_dist, gir_mont_diff_dist])
    comp_df = comp_df.transpose()
    comp_df.columns = [
        period, 'distance to gir', 'distance to mont', 'distance to diff'
    ]
    return comp_df
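
# load_list is assumed to read the speaker spreadsheet into a DataFrame indexed by
# speaker name with a "Party" column, matching the .loc[element, "Party"] lookups
# above. A plausible sketch; the 'Name' index column is an assumption:
import pandas as pd


def load_list(filename):
    speakers = pd.read_excel(filename)
    return speakers.set_index('Name')
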
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker,
              Girondins, Montagnards, Plein):
    speaker_names = set()
    speaker_num_speeches = {}
    speaker_char_count = {}
    speakers_to_consider = []

    bigrams_to_speeches = {}
    bigrams_to_speakers = {}
    bigram_doc_freq = collections.defaultdict()

    gir_num_speeches = 0
    mont_num_speeches = 0
    gir_docs = {}
    mont_docs = {}
    plein_docs = {}

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speaker_name in speakers_to_consider:
        print(speaker_name)
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech = Counter()
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (
                    speaker_name == speechid_to_speaker[identity]):
                # Keeps track of the number of speeches per speaker as well as the number of characters spoken by each speaker
                # To potentially establish a cutoff for analysis purposes
                augment(speaker_num_speeches, speaker_name)
                if speaker_name in speaker_char_count:
                    speaker_char_count[speaker_name] += len(
                        raw_speeches[identity])
                else:
                    speaker_char_count[speaker_name] = len(
                        raw_speeches[identity])

                indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)

                for bigram in indv_speech_bigram:
                    augment(bigram_doc_freq, bigram)

                    # Maintains a list of speeches in which given bigrams are spoken in
                    if bigram in bigrams_to_speeches:
                        bigrams_to_speeches[bigram].append(identity)
                    else:
                        bigrams_to_speeches[bigram] = []
                        bigrams_to_speeches[bigram].append(identity)
                    if bigram in bigrams_to_speakers:
                        bigrams_to_speakers[bigram].add(speaker_name)
                    else:
                        bigrams_to_speakers[bigram] = set()
                        bigrams_to_speakers[bigram].add(speaker_name)

                # Augments the relevant variables according to the party the speaker belongs to
                if party == "Girondins":
                    gir_num_speeches += 1
                    gir_docs = check_num_speakers(indv_speech_bigram,
                                                  speaker_name, gir_docs)
                    try:
                        Girondins = Girondins + indv_speech_bigram
                    except NameError:
                        Girondins = indv_speech_bigram
                else:
                    mont_num_speeches += 1
                    mont_docs = check_num_speakers(indv_speech_bigram,
                                                   speaker_name, mont_docs)
                    try:
                        Montagnards = Montagnards + indv_speech_bigram
                    except NameError:
                        Montagnards = indv_speech_bigram

                #speech = speech + indv_speech_bigram

    # Stores the bigram Counter object for each individual speaker
    # pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
    # with open(pickle_filename, 'wb') as handle:
    #     pickle.dump(speech, handle, protocol=0)

    # Stores the bigrams_to_speeches document in Excel
    df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches,
                                                    orient="index")
    write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')
    pickle_filename = "bigrams_to_speakers.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(bigrams_to_speakers, handle, protocol=0)

    pickle_filename = "bigrams_to_speeches.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(bigrams_to_speeches, handle, protocol=0)

    pickle_filename = "bigrams_to_speeches.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(bigrams_to_speeches, handle, protocol=0)

    pickle_filename = "gir_docs.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(gir_docs, handle, protocol=0)

    pickle_filename = "mont_docs.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(mont_docs, handle, protocol=0)

    # bigrams_to_speakers = pickle.load(open("bigrams_to_speakers.pickle", "rb"))
    # bigrams_to_speeches = pickle.load(open("bigrams_to_speeches.pickle", "rb"))

    # gir_docs = pickle.load(open("gir_docs.pickle", "rb"))
    # mont_docs = pickle.load(open("mont_docs.pickle", "rb"))

    # Girondins = pickle.load(open("Girondins_withlimit.pickle", "rb"))
    # Montagnards = pickle.load(open("Montagnards_withlimit.pickle", "rb"))

    bigram_num_speakers = []
    bigram_num_speeches = []
    bigram_total_freq = []
    bg_speeches = {}
    bigrams = []
    speeches = []
    speakers = []
    for bigram in bigrams_to_speeches:
        if (Girondins[bigram] >= 10) or (Montagnards[bigram] >= 10):
            bigram_num_speakers.append(len(bigrams_to_speakers[bigram]))
            bigram_num_speeches.append(len(bigrams_to_speeches[bigram]))
            bigram_total_freq.append(Girondins[bigram] + Montagnards[bigram])
            bigrams.append(str(bigram))
            speeches.append(str(bigrams_to_speeches[bigram]))
            speakers.append(str(bigrams_to_speakers[bigram]))

    bg_num_speakers = pd.DataFrame(bigram_num_speakers,
                                   columns=['Num Speakers'])
    bg_num_speeches = pd.DataFrame(bigram_num_speeches,
                                   columns=['Num Speeches'])
    bg_total_freq = pd.DataFrame(bigram_total_freq, columns=['Total count'])
    bgs = pd.DataFrame(bigrams, columns=["Bigram"])
    speech = pd.DataFrame(speeches, columns=["Speechids"])
    speaker = pd.DataFrame(speakers, columns=["Speakers"])

    bigram_info = pd.DataFrame()
    bigram_info = pd.concat([
        bgs, speech, speaker, bg_num_speeches, bg_num_speakers, bg_total_freq
    ],
                            axis=1)
    writer = pd.ExcelWriter("bigram_info.xlsx")
    bigram_info.to_excel(writer, 'Sheet1')
    writer.save()

    w = csv.writer(open("bigrams_to_speeches_noplein.csv", "w"))
    for key, val in bigrams_to_speeches.items():
        w.writerow([key, val])

    bigrams_to_speakers_noplein_sorted = sorted(bigrams_to_speakers.items(),
                                                key=lambda x: len(x[1]),
                                                reverse=True)
    w = csv.writer(open("bigrams_to_speakers_noplein_sorted.csv", "w"))
    for item in bigrams_to_speakers_noplein_sorted:
        w.writerow([item[0], item[1]])

    # Computes the tf_idf scores for each bigram and for both the Girondins and Montaganards vectors
    # num_speeches = 4479
    # bigram_doc_freq = pickle.load(open("bigram_doc_freq_noplein_withlimit.pickle", 'rb'))

    with open('gir_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % gir_num_speeches)
    with open('mont_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % mont_num_speeches)
    print(num_speeches)

    with open('speaker_num_speeches_withlimit.pickle', 'wb') as handle:
        pickle.dump(speaker_num_speeches, handle, protocol=0)

    with open('speaker_char_count_withlimit.pickle', 'wb') as handle:
        pickle.dump(speaker_char_count, handle, protocol=0)

    w = csv.writer(open("speaker_num_speeches_withlimit.csv", "w"))
    for key, val in speaker_num_speeches.items():
        w.writerow([key, val])

    w = csv.writer(open("speaker_char_count_withlimit.csv", "w"))
    for key, val in speaker_char_count.items():
        w.writerow([key, val])

    # Write the number of speeches and doc_frequency to memory for use in further analysis
    with open('num_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % num_speeches)
    df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index")
    write_to_excel(df_doc_freq, 'doc_freq.xlsx')

    with open("bigram_doc_freq_noplein_withlimit.pickle", 'wb') as handle:
        pickle.dump(bigram_doc_freq, handle, protocol=0)

    # # Girondins = {k:v for k,v in Girondins.items() if (v >= 10)} #and (len(gir_docs[k]) > 1)}
    # # Montagnards = {k:v for k,v in Montagnards.items() if (v >= 10)} #and (len(mont_docs[k]) > 1)}

    # with open("Girondins_withlimit.pickle", 'wb') as handle:
    # 	pickle.dump(Girondins, handle, protocol = 0)
    # with open("Montagnards_withlimit.pickle", 'wb') as handle:
    # 	pickle.dump(Montagnards, handle, protocol = 0)
    # gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    # mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

    # """with open("gir_tfidf.pickle", 'wb') as handle:
    # 	pickle.dump(gir_tfidf, handle, protocol = 0)
    # with open("mont_tfidf.pickle", 'wb') as handle:
    # 	pickle.dump(mont_tfidf, handle, protocol = 0)"""

    # # Computes the distance between the tf_idf vectors
    # #compute_distance(gir_tfidf, mont_tfidf)

    # # Stores the tf_idf vectors
    # df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient = "index")
    # #df_gir_tfidf.columns = ['Bigrams', 'tfidf']
    # write_to_excel(df_gir_tfidf, 'gir_tfidf_withlimit.xlsx')
    # df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient = "index")
    # #df_mont_tfidf.columns = ['Bigrams', 'tfidf']
    # write_to_excel(df_mont_tfidf, 'mont_tfidf_withlimit.xlsx')

    # df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    # df_tfidf_combined = df_tfidf_combined.transpose()
    # df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    # write_to_excel(df_tfidf_combined, 'combined_tfidf_withlimit.xlsx')

    # Restricts the Girondins and Montagnards frequency analysis to bigrams that appear
    # at least 10 times, and optionally to those used in a minimum number of speeches
    # print(gir_docs)
    Girondins = {k: v
                 for k, v in Girondins.items()
                 if (v >= 10)}  #and (len(gir_docs[k]) > 1)}
    df_girondins = pd.DataFrame.from_dict(Girondins, orient="index")
    write_to_excel(df_girondins, "Girondins_counts_withlimit.xlsx")

    Montagnards = {k: v
                   for k, v in Montagnards.items()
                   if (v >= 10)}  #and (len(mont_docs[k]) > 1)}
    df_montagnards = pd.DataFrame.from_dict(Montagnards, orient="index")
    write_to_excel(df_montagnards, "Montagnards_counts_withlimit.xlsx")

    gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

    # # Normalizes the vectors and computes the distance between them
    # #normalized = normalize_dicts(Girondins, Montagnards)
    # #compute_distance(normalized[0], normalized[1])

    # Stores the Girondins and Montagnards frequency vectors in the same document
    df_combined = pd.DataFrame([Girondins, Montagnards])
    df_combined = df_combined.transpose()
    df_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_combined, 'combined_frequency_withlimit.xlsx')

    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf_withlimit.xlsx')
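
# augment and check_num_speakers are assumed helpers. Sketches consistent with their
# call sites above: augment increments a per-key count, and check_num_speakers
# records which speakers use each bigram:
def augment(counts, key):
    if key in counts:
        counts[key] += 1
    else:
        counts[key] = 1


def check_num_speakers(speech_ngrams, speaker, docs):
    for ngram in speech_ngrams:
        if ngram in docs:
            docs[ngram].add(speaker)
        else:
            docs[ngram] = set([speaker])
    return docs
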
Example #9
def data_clean(iteration, train_columns, bigram_speeches, unigram_speeches,
               bigram_freq, unigram_freq, bigram_doc_freq, unigram_doc_freq,
               num_speeches):
    classification = []
    data_set = []
    speeches = []
    speakers = []

    speechid_to_speaker = pickle.load(
        open("speechid_to_speaker_store.pickle", "rb"))
    speakers_to_analyze = pickle.load(
        open("speakers_to_analyze_store.pickle", "rb"))
    ### Should I do this once for all the data and then split it into test and train? That way all the data is based on the same bigrams. Or is that
    ### bad because then the training data is connected to the test data via the tfidf calculations?
    for speechid in bigram_speeches:
        speaker = speechid_to_speaker[speechid]

        # Create a vector of speechids in the correct order, to later check which speeches were/were not correctly classified
        speeches.append(speechid)
        speakers.append(speaker)

        if speakers_to_analyze.loc[speaker, "Party"] == "Girondins":
            classification.append(0)
        else:
            classification.append(1)

        # Feature selection taking place here
        # Analysis accounts for bigrams and unigrams
        if iteration == "train":
            # Restricting features according to how many times they appear
            bigram_input = {
                k: v
                for k, v in bigram_speeches[speechid].items()
                if (bigram_freq[k] >= 20)
            }
            unigram_input = {
                k: v
                for k, v in unigram_speeches[speechid].items()
                if (unigram_freq[k] >= 62)
            }

            bigram_scores = compute_tfidf(bigram_input, num_speeches,
                                          bigram_doc_freq)
            unigram_scores = compute_tfidf(unigram_input, num_speeches,
                                           unigram_doc_freq)
        else:
            bigram_input = {
                k: v
                for k, v in bigram_speeches[speechid].items()
                if (k in train_columns)
            }
            unigram_input = {
                k: v
                for k, v in unigram_speeches[speechid].items()
                if (k in train_columns)
            }

            bigram_scores = compute_tfidf(bigram_input, num_speeches,
                                          bigram_doc_freq)
            unigram_scores = compute_tfidf(unigram_input, num_speeches,
                                           unigram_doc_freq)

        merge_scores = bigram_scores.copy()
        merge_scores.update(unigram_scores)

        data_set.append(merge_scores)

    # Remove data from memory to clear space for other computations
    speechid_to_speaker = None
    speakers_to_analyze = None
    bigram_speeches = None
    unigram_speeches = None
    bigram_input = None
    unigram_input = None
    bigram_scores = None
    unigram_scores = None

    data = pd.DataFrame(data_set)
    data_set = None
    data = data.fillna(0)

    return ([data, classification, speeches, speakers])
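
# A hedged sketch of how data_clean's output might feed a classifier; scikit-learn
# and logistic regression are assumptions, since these examples never name the model:
from sklearn.linear_model import LogisticRegression


def train_classifier(train_data, train_labels):
    """Fit a linear classifier on the tf-idf feature matrix returned by data_clean."""
    model = LogisticRegression()
    model.fit(train_data, train_labels)
    return model
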
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker,
              Girondins, Montagnards, Plein):
    speaker_names = set()
    speaker_num_speeches = {}
    speaker_char_count = {}
    speakers_to_consider = []

    bigrams_to_speeches = collections.defaultdict()
    bigram_doc_freq = collections.defaultdict()

    gir_num_speeches = 0
    mont_num_speeches = 0
    plein_num_speeches = 0
    gir_docs = {}
    mont_docs = {}
    plein_docs = {}

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        if (date >= "1792-09-20") and (date <= "1793-06-02"):
            # Keeps track of the number of speeches per speaker as well as the number of characters spoken by each speaker
            # To potentially establish a cutoff for analysis purposes
            speaker_name = speechid_to_speaker[identity]
            party = ""
            if speaker_name in speakers_to_consider:
                party = speakers_to_analyze.loc[speaker_name, "Party"]
            else:
                party = "Plein"
            augment(speaker_num_speeches, speaker_name)
            if speaker_name in speaker_char_count:
                speaker_char_count[speaker_name] += len(raw_speeches[identity])
            else:
                speaker_char_count[speaker_name] = len(raw_speeches[identity])
            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
            for bigram in indv_speech_bigram:
                augment(bigram_doc_freq, bigram)

                # Maintains a list of speeches in which given bigrams are spoken in
                if bigram in bigrams_to_speeches:
                    bigrams_to_speeches[bigram].append(identity)
                else:
                    bigrams_to_speeches[bigram] = []
                    bigrams_to_speeches[bigram].append(identity)

            # Augments the relevant variables according to the party the speaker belongs to
            if party == "Girondins":
                gir_num_speeches += 1
                gir_docs = check_num_speakers(indv_speech_bigram, speaker_name,
                                              gir_docs)
                try:
                    Girondins = Girondins + indv_speech_bigram
                except NameError:
                    Girondins = indv_speech_bigram
            elif party == "Montagnards":
                mont_num_speeches += 1
                mont_docs = check_num_speakers(indv_speech_bigram,
                                               speaker_name, mont_docs)
                try:
                    Montagnards = Montagnards + indv_speech_bigram
                except NameError:
                    Montagnards = indv_speech_bigram
            # Creates a Plein category that is neither Girondins nor Montagnards, to better
            # understand speakers who are not distinctly one or the other
            else:
                plein_num_speeches += 1
                plein_docs = check_num_speakers(indv_speech_bigram,
                                                speaker_name, plein_docs)
                try:
                    Plein = Plein + indv_speech_bigram
                except NameError:
                    Plein = indv_speech_bigram

                #speech = speech + indv_speech_bigram

        # Stores the bigram Counter object for each individual speaker
        # pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
        # with open(pickle_filename, 'wb') as handle:
        #     pickle.dump(speech, handle, protocol=0)

    # Stores the bigrams_to_speeches document in Excel
    # df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient="index")
    # write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')

    # Computes the tf_idf scores for each bigram and for both the Girondins and Montaganards vectors
    num_speeches = gir_num_speeches + mont_num_speeches + plein_num_speeches
    print(num_speeches)

    with open('speaker_num_speeches_withplein.pickle', 'wb') as handle:
        pickle.dump(speaker_num_speeches, handle, protocol=0)

    with open('speaker_char_count_withplein.pickle', 'wb') as handle:
        pickle.dump(speaker_char_count, handle, protocol=0)

    w = csv.writer(open("speaker_num_speeches_withplein.csv", "w"))
    for key, val in speaker_num_speeches.items():
        w.writerow([key, val])

    w = csv.writer(open("speaker_char_count_withplein.csv", "w"))
    for key, val in speaker_char_count.items():
        w.writerow([key, val])

    # Write the number of speeches and doc_frequency to memory for use in further analysis
    with open('num_speeches_withplein.txt', 'w') as f:
        f.write('%d' % num_speeches)
    df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index")
    write_to_excel(df_doc_freq, 'doc_freq.xlsx')

    with open("bigram_doc_freq_withplein.pickle", 'wb') as handle:
        pickle.dump(bigram_doc_freq, handle, protocol=0)

    with open("Girondins_withplein.pickle", 'wb') as handle:
        pickle.dump(Girondins, handle, protocol=0)
    with open("Montagnards_withplein.pickle", 'wb') as handle:
        pickle.dump(Montagnards, handle, protocol=0)
    with open("Plein.pickle", 'wb') as handle:
        pickle.dump(Plein, handle, protocol=0)
    gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)
    plein_tfidf = compute_tfidf(Plein, num_speeches, bigram_doc_freq)
    """with open("gir_tfidf.pickle", 'wb') as handle:
		pickle.dump(gir_tfidf, handle, protocol = 0)
	with open("mont_tfidf.pickle", 'wb') as handle:
		pickle.dump(mont_tfidf, handle, protocol = 0)"""

    # Computes the distance between the tf_idf vectors
    #compute_distance(gir_tfidf, mont_tfidf)

    # Stores the tf_idf vectors
    df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient="index")
    #df_gir_tfidf.columns = ['Bigrams', 'tfidf']
    write_to_excel(df_gir_tfidf, 'gir_tfidf_withplein.xlsx')
    df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient="index")
    #df_mont_tfidf.columns = ['Bigrams', 'tfidf']
    write_to_excel(df_mont_tfidf, 'mont_tfidf_withplein.xlsx')
    df_plein_tfidf = pd.DataFrame.from_dict(plein_tfidf, orient="index")
    #df_plein_tfidf.columns = ['Bigrams', 'tfidf']
    write_to_excel(df_plein_tfidf, 'plein_tfidf.xlsx')

    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf_withplein.xlsx')

    # Restricts the Girondins and Montagnards frequency analysis to bigrams that appear
    # at least 3 times, and optionally to those used in a minimum number of speeches
    Girondins = {k: v
                 for k, v in Girondins.items()
                 if (v >= 3)}  #and (len(gir_docs[k]) > 1)}
    df_girondins = pd.DataFrame.from_dict(Girondins, orient="index")
    write_to_excel(df_girondins, "Girondins_counts_withplein.xlsx")

    Montagnards = {k: v
                   for k, v in Montagnards.items()
                   if (v >= 3)}  #and (len(mont_docs[k]) > 1)}
    df_montagnards = pd.DataFrame.from_dict(Montagnards, orient="index")
    write_to_excel(df_montagnards, "Montagnards_counts_withplein.xlsx")

    # Normalizes the vectors and computes the distance between them
    #normalized = normalize_dicts(Girondins, Montagnards)
    #compute_distance(normalized[0], normalized[1])

    # Stores the Girondins and Montagnards frequency vectors in the same document
    df_combined = pd.DataFrame([Girondins, Montagnards])
    df_combined = df_combined.transpose()
    df_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_combined, 'combined_frequency.xlsx')
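
# remove_diacritic is assumed to strip accents so speaker names compare consistently;
# a common sketch using unicodedata (the exact normalization used is an assumption).
# Note the call sites expect a byte string, hence the .decode('utf-8') there:
import unicodedata


def remove_diacritic(text):
    """Return an ASCII byte string with combining accents dropped."""
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
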
Example #11
def build_vectors(raw_speeches, speechid_to_speaker, speaker_list, speakers_to_analyze, gir_tfidf, mont_tfidf, num_speeches, doc_freq):
	speaker_ngrams = {}
	speakers_to_consider = []
	speaker_distances = collections.defaultdict()
	chronology = collections.defaultdict(dict)

	for speaker in speakers_to_analyze.index.values:
		speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

	for identity in raw_speeches:
		date = re.findall(date_regex, str(identity))[0]
		speaker_name = speechid_to_speaker[identity]
		if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name in speakers_to_consider):
			indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
			if speaker_name in speaker_ngrams:
				speaker_ngrams[speaker_name] = speaker_ngrams[speaker_name] + indv_speech_bigram
			else:
				speaker_ngrams[speaker_name] = indv_speech_bigram
		"""
		if speaker_name in chronology:
			pairing = chronology[speaker_name]
			for bigram in indv_speech_bigram:
				if bigram in pairing:
					pairing[bigram].append([identity, indv_speech_bigram[bigram]])
				else:
					pairing[bigram] = [identity, indv_speech_bigram[bigram]]
		else:
			chronology[speaker_name] = {}
			pairing = chronology[speaker_name]
			for bigram in indv_speech_bigram:
				pairing[bigram] = []
				# stores the unique speechid alongside the number of times that bigram is said in that speech for each bigram
				pairing[bigram] = [identity, indv_speech_bigram[bigram]]"""

	
	## Need tf-idf vectors for gir and mont
	## Need the doc_freq from the previous calculations
	## Compute tf-idf for individual speakers
	## Compute cosine distance based on those vectors (dot product over the product of the vector norms)
	## Compute cosine similarity against the difference between the two group vectors (A - B)
	## A - B: if positive, more like A; if negative, more like B

	## Create a tf vector for each speech and store it, so speech vectors can simply be added
	## Separately store a single idf vector

	#########

	gir_dict = convert_keys_to_string(gir_tfidf)
	mont_dict = convert_keys_to_string(mont_tfidf)
	doc_freq_dict = convert_keys_to_string(doc_freq)
	gir_mont_diff = compute_difference(gir_dict, mont_dict)
	#gir_dict = gir_tfidf
	#print gir_dict
	#mont_dict = mont_tfidf
	for speaker in speaker_ngrams:
		speaker_dict = convert_keys_to_string(speaker_ngrams[speaker])
		to_compare = compute_tfidf(speaker_dict, num_speeches, doc_freq_dict)
		gir_dist = cosine_similarity(gir_dict, to_compare)
		mont_dist = cosine_similarity(mont_dict, to_compare)
		# Need to actually compute the distance
		gir_mont_diff_dist = cosine_similarity(gir_mont_diff, to_compare)
		speaker_distances[speaker] = [gir_dist, mont_dist, gir_mont_diff_dist]

	"""
	#speaker_dict = {(str(k),v) for k,v in speaker_ngrams['Francois Chabot']}
	speaker_dict = convert_keys_to_string(speaker_ngrams['Francois Chabot'])
	to_compare = compute_tfidf(speaker_dict, num_speeches, doc_freq)
	gir_dist = cosine_similarity(gir_dict, to_compare)
	df = pd.DataFrame([to_compare, gir_dict])
	df = df.transpose()
	write_to_excel(df, "Francois Chabot Test.xlsx")"""

	
	"""for speaker in speaker_ngrams:
		#to_compare = {k:v for k,v in speaker_ngrams[speaker].items() if (v >= 3)}
		to_compare = speaker_ngrams[speaker]
		gir_dict = gir_tfidf
		mont_dict = mont_tfidf
		gir_normalized = normalize_dicts(to_compare, gir_dict)
		gir_dist = 	compute_distance(gir_normalized[0], gir_normalized[1])
		to_compare = speaker_ngrams[speaker]
		mont_normalized = normalize_dicts(to_compare, mont_dict)
		mont_dist = compute_distance(mont_normalized[0], mont_normalized[1])
		speaker_distances[speaker] = [gir_dist, mont_dist]"""

	

	
	pickle_filename_3 = "speaker_ngrams.pickle"
	with open(pickle_filename_3, 'wb') as handle:
		pickle.dump(speaker_ngrams, handle, protocol = 0)

	df = pd.DataFrame.from_dict(speaker_distances)
	df = df.transpose()
	df.columns = ["dist to Girondins", "dist to Montagnards", "dist to difference"]
	filename = "freq_dist_map.xlsx"
	writer = pd.ExcelWriter(filename)
	df.to_excel(writer, 'Sheet1')
	writer.save()

	pickle_filename = "freq_dist.pickle"
	with open(pickle_filename, 'wb') as handle:
		pickle.dump(speaker_distances, handle, protocol = 0)

	"""df2 = pd.DataFrame.from_dict(chronology)
def compute_distances(dataframe, period, gir_dict, mont_dict, plein_dict,
                      gir_mont_diff):
    period_vector = []
    if (period == 'aggregation') or (period == 'speaker'):
        period_vector = list(dataframe.keys())
        period_vector = pd.Series(period_vector)
        """period_vector = pd.Series(period_vector)
		tfidf_scores = dataframe['tfidf'].tolist()"""
    else:
        periods = ["Before convention", "Convention", "After convention"]
        period_vector = pd.Series(periods)
        # This assumes that tfidf_scores for the periods is a list, not a pandas DataFrame

    gir_dist = []
    mont_dist = []
    plein_dist = []
    gir_mont_diff_dist = []
    # This for loop is contingent on tfidf_scores being a list
    for element in dataframe:
        """print type(element)
		print type(dataframe[element])
		to_compare = dataframe[element]"""
        print(element)
        if period == 'speaker':
            #gir = pickle.load(open("Girondins.pickle", "rb"))
            #mont = pickle.load(open("Montagnards.pickle", "rb"))
            gir = pickle.load(open("Girondins_withplein.pickle", "rb"))
            mont = pickle.load(open("Montagnards_withplein.pickle", "rb"))
            speakers_to_analyze = load_list(
                "Girondins and Montagnards New Mod.xlsx")
            party = speakers_to_analyze.loc[element, "Party"]
            if party == 'Girondins':
                gir = gir - dataframe[element]
            if party == 'Montagnards':
                print "here"
                mont = mont - dataframe[element]
            gir_dict = convert_keys_to_string(
                compute_tfidf(gir, num_speeches, doc_freq))
            mont_dict = convert_keys_to_string(
                compute_tfidf(mont, num_speeches, doc_freq))
            gir_mont_diff = compute_difference_withplein(gir_dict, mont_dict)
            tfidf_speaker = compute_tfidf(dataframe[element], num_speeches,
                                          doc_freq)
            to_compare = convert_keys_to_string(tfidf_speaker)
        elif period == 'aggregation':
            to_compare = convert_keys_to_string(dataframe[element])
        else:
            to_compare = convert_keys_to_string(element)
        # Checks whether the tfidf_scores vector exists; if not, default distance values are assigned.
        # This was particularly relevant because one speaker had a tfidf_scores vector of length 0.
        if len(to_compare) > 0:
            gir_dist.append(1 - cosine_similarity(gir_dict, to_compare))
            mont_dist.append(1 - cosine_similarity(mont_dict, to_compare))
            plein_dist.append(1 - cosine_similarity(plein_dict, to_compare))
            gir_mont_diff_dist.append(
                cosine_similarity(gir_mont_diff, to_compare))
        else:
            gir_dist.append(1)
            mont_dist.append(1)
            plein_dist.append(1)
            gir_mont_diff_dist.append(0)

    # Merges the distance lists and creates a comprehensive dataframe to return
    gir_dist = pd.Series(gir_dist)
    mont_dist = pd.Series(mont_dist)
    plein_dist = pd.Series(plein_dist)
    gir_mont_diff_dist = pd.Series(gir_mont_diff_dist)
    comp_df = pd.DataFrame(
        [period_vector, gir_dist, mont_dist, gir_mont_diff_dist, plein_dist])
    comp_df = comp_df.transpose()
    comp_df.columns = [
        period, 'distance to gir', 'distance to mont', 'distance to diff',
        'distance to plein'
    ]
    return comp_df
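
# Example invocation, assuming the by-speaker Counter dict and the group tf-idf
# dicts have been built as in the earlier examples (names are illustrative):
#
#     by_speaker = pickle.load(open("byspeaker.pickle", "rb"))
#     comp_df = compute_distances(by_speaker, 'speaker', gir_dict, mont_dict,
#                                 plein_dict, gir_mont_diff)
#     write_to_excel(comp_df, 'speaker_distances.xlsx')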