예제 #1
0
def compute_distances(dataframe, period, gir_dict, mont_dict, gir_mont_diff):
    """Tabulate how far each tf-idf vector lies from the reference vectors.

    Args:
        dataframe: When ``period`` is ``'month'``, ``'date'`` or ``'speaker'``
            this must support ``dataframe[column].tolist()`` and provide the
            matching label column (``'Year-Month'``, ``'Date'`` or
            ``'Speaker'``) plus a ``'tfidf'`` column. For any other ``period``
            it is iterated directly and is expected to be a plain list of
            tf-idf vectors (presumably one per convention period).
        period: Selects the label column and names the first output column.
        gir_dict: Reference vector passed to ``cosine_similarity``.
        mont_dict: Reference vector passed to ``cosine_similarity``.
        gir_mont_diff: Reference difference vector passed to
            ``cosine_similarity``.

    Returns:
        A pandas DataFrame with the columns ``period``, ``'distance to gir'``,
        ``'distance to mont'`` and ``'distance to diff'``. The first two
        distances are ``1 - cosine_similarity(...)``; the last column holds
        the raw ``cosine_similarity`` against ``gir_mont_diff``.
    """
    label_column = None
    for known_period, column in (('month', 'Year-Month'), ('date', 'Date'),
                                 ('speaker', 'Speaker')):
        if period == known_period:
            label_column = column
            break

    if label_column is not None:
        period_vector = pd.Series(dataframe[label_column].tolist())
        tfidf_scores = dataframe['tfidf'].tolist()
    else:
        # Fallback: the caller hands over the vectors themselves as a list.
        period_vector = pd.Series(
            ["Before convention", "Convention", "After convention"])
        tfidf_scores = dataframe

    # One (gir, mont, diff) triple per vector, in iteration order.
    triples = []
    for raw_vector in tfidf_scores:
        candidate = convert_keys_to_string(raw_vector)
        if len(candidate) == 0:
            # An empty vector (seen for at least one speaker) cannot be
            # compared, so it gets the default distances instead.
            triples.append((1, 1, 0))
            continue
        to_gir = 1 - cosine_similarity(gir_dict, candidate)
        to_mont = 1 - cosine_similarity(mont_dict, candidate)
        to_diff = cosine_similarity(gir_mont_diff, candidate)
        triples.append((to_gir, to_mont, to_diff))

    gir_series = pd.Series([triple[0] for triple in triples])
    mont_series = pd.Series([triple[1] for triple in triples])
    diff_series = pd.Series([triple[2] for triple in triples])

    # Each series becomes a row first; transposing turns them into columns.
    comp_df = pd.DataFrame(
        [period_vector, gir_series, mont_series, diff_series]).transpose()
    comp_df.columns = [
        period, 'distance to gir', 'distance to mont', 'distance to diff'
    ]
    return comp_df
def distance_analysis():
    """Write every other speaker's cosine distance to Robespierre to a CSV.

    Reads the per-speaker counts from ``byspeaker.pickle``, drops entries
    that occur fewer than two times, converts what is left to tf-idf vectors
    with ``compute_tfidf`` and writes one ``speaker, distance`` row per
    speaker to ``dist_to_robespierre_withlimit.csv``, where ``distance`` is
    ``1 - cosine_similarity(robespierre_vector, speaker_vector)``.

    Uses the names ``num_speeches`` and ``doc_freq`` from the enclosing
    module scope; they are not defined in this function.

    Raises:
        KeyError: If Robespierre's name is not a key of the loaded data.
    """
    # A single spelling is used both for the lookup and for the exclusion
    # test below, so Robespierre is never compared with himself.
    # NOTE(review): the ASCII "Francois" spelling is the one the lookup has
    # always used; confirm it matches the keys stored in the pickle.
    robespierre_name = (
        "Maximilien-Francois-Marie-Isidore-Joseph de Robespierre")

    with open("byspeaker.pickle", "rb") as pickle_file:
        by_speaker = pickle.load(pickle_file)

    speaker_tfidf = {}
    for speaker in by_speaker:
        counter = convert_keys_to_string(by_speaker[speaker])
        # Tried doing v >= 3 but there are some speakers who do not say any
        # bigrams more than 3 times
        freq = {k: v for k, v in counter.items() if v >= 2}
        speaker_tfidf[speaker] = compute_tfidf(freq, num_speeches, doc_freq)

    robespierre = speaker_tfidf[robespierre_name]

    speaker_dist = {}
    for speaker in speaker_tfidf:
        if speaker == robespierre_name:
            continue
        print(speaker_tfidf[speaker])
        speaker_dist[speaker] = 1 - cosine_similarity(
            robespierre, speaker_tfidf[speaker])

    # The context manager guarantees the rows are flushed and the file is
    # closed even if writing fails part-way through.
    with open("dist_to_robespierre_withlimit.csv", "w") as csv_file:
        writer = csv.writer(csv_file)
        for key, val in speaker_dist.items():
            writer.writerow([key, val])
예제 #3
0
def compute_distances(dataframe, period, gir_dict, mont_dict, gir_mont_diff):
    """Tabulate distances between each entry and the Girondin/Montagnard vectors.

    Behaviour depends on ``period``:

    * ``'speaker'``: ``dataframe`` maps speaker -> raw counts. For every
      speaker the group totals are loaded from ``Girondins.pickle`` and
      ``Montagnards.pickle``, the speaker's own counts are subtracted from
      the totals of the party listed for them in
      ``"Girondins and Montagnards New Mod.xlsx"``, and fresh tf-idf vectors
      are computed. The ``gir_dict``, ``mont_dict`` and ``gir_mont_diff``
      arguments are therefore replaced on each iteration. The current
      difference vector is also written to ``gir_mont_diff.csv``; the file is
      overwritten per speaker, so it ends up holding the last speaker's.
    * ``'aggregation'``: ``dataframe`` maps label -> tf-idf vector and the
      reference vectors passed in are used as given.
    * anything else: ``dataframe`` is iterated directly as a list of tf-idf
      vectors (presumably one per convention period label).

    Uses ``num_speeches`` and ``doc_freq`` from the enclosing module scope.

    Args:
        dataframe: Mapping or list as described above.
        period: Mode selector; also the name of the first output column.
        gir_dict: Girondin reference vector (ignored in speaker mode).
        mont_dict: Montagnard reference vector (ignored in speaker mode).
        gir_mont_diff: Difference reference vector (ignored in speaker mode).

    Returns:
        A pandas DataFrame with the columns ``period``, ``'distance to gir'``,
        ``'distance to mont'`` and ``'distance to diff'``. The first two
        distances are ``1 - cosine_similarity(...)``; the last column is the
        raw ``cosine_similarity`` against the difference vector.

    Raises:
        KeyError: In speaker mode, if a speaker is missing from the party
            table (raised by the ``.loc`` lookup).
    """
    if (period == 'aggregation') or (period == 'speaker'):
        period_vector = pd.Series(list(dataframe.keys()))
    else:
        period_vector = pd.Series(
            ["Before convention", "Convention", "After convention"])

    gir_dist = []
    mont_dist = []
    gir_mont_diff_dist = []
    # The party table is only read from, so it is loaded once on first use
    # instead of being re-parsed for every speaker.
    speakers_to_analyze = None

    for element in dataframe:
        if period == 'speaker':
            # The group totals are reloaded for every speaker so that the
            # subtraction below always starts from the complete totals.
            with open("Girondins.pickle", "rb") as pickle_file:
                gir = pickle.load(pickle_file)
            with open("Montagnards.pickle", "rb") as pickle_file:
                mont = pickle.load(pickle_file)

            if speakers_to_analyze is None:
                speakers_to_analyze = load_list(
                    "Girondins and Montagnards New Mod.xlsx")
            party = speakers_to_analyze.loc[element, "Party"]

            print(element)
            print(party)
            print(type(dataframe[element]))

            # Leave the speaker's own counts out of their party's totals.
            if party == 'Girondins':
                gir = gir - dataframe[element]
            if party == 'Montagnards':
                mont = mont - dataframe[element]

            gir_dict = convert_keys_to_string(
                compute_tfidf(gir, num_speeches, doc_freq))
            mont_dict = convert_keys_to_string(
                compute_tfidf(mont, num_speeches, doc_freq))
            gir_mont_diff = compute_difference_withplein(gir_dict, mont_dict)

            # Closing the file each time makes sure the rows reach disk
            # before the next speaker overwrites them.
            with open("gir_mont_diff.csv", "w") as csv_file:
                writer = csv.writer(csv_file)
                for key, val in gir_mont_diff.items():
                    writer.writerow([key, val])

            tfidf_speaker = compute_tfidf(dataframe[element], num_speeches,
                                          doc_freq)
            to_compare = convert_keys_to_string(tfidf_speaker)
        elif period == 'aggregation':
            to_compare = convert_keys_to_string(dataframe[element])
        else:
            to_compare = convert_keys_to_string(element)

        # An empty vector (seen for at least one speaker) cannot be compared,
        # so it gets the default distances instead.
        if len(to_compare) > 0:
            gir_dist.append(1 - cosine_similarity(gir_dict, to_compare))
            mont_dist.append(1 - cosine_similarity(mont_dict, to_compare))
            gir_mont_diff_dist.append(
                cosine_similarity(gir_mont_diff, to_compare))
        else:
            gir_dist.append(1)
            mont_dist.append(1)
            gir_mont_diff_dist.append(0)

    # Each series becomes a row first; transposing turns them into columns.
    comp_df = pd.DataFrame([
        period_vector,
        pd.Series(gir_dist),
        pd.Series(mont_dist),
        pd.Series(gir_mont_diff_dist),
    ]).transpose()
    comp_df.columns = [
        period, 'distance to gir', 'distance to mont', 'distance to diff'
    ]
    return comp_df
def compute_distances(dataframe, period, gir_dict, mont_dict, plein_dict,
                      gir_mont_diff):
    """Tabulate distances to the Girondin, Montagnard and plein vectors.

    Behaviour depends on ``period``:

    * ``'speaker'``: ``dataframe`` maps speaker -> raw counts. For every
      speaker the group totals are loaded from ``Girondins_withplein.pickle``
      and ``Montagnards_withplein.pickle``, the speaker's own counts are
      subtracted from the totals of the party listed for them in
      ``"Girondins and Montagnards New Mod.xlsx"``, and fresh tf-idf vectors
      are computed. The ``gir_dict``, ``mont_dict`` and ``gir_mont_diff``
      arguments are therefore replaced on each iteration; ``plein_dict`` is
      always used as passed in.
    * ``'aggregation'``: ``dataframe`` maps label -> tf-idf vector and the
      reference vectors passed in are used as given.
    * anything else: ``dataframe`` is iterated directly as a list of tf-idf
      vectors (presumably one per convention period label).

    Every iterated element is printed. Uses ``num_speeches`` and ``doc_freq``
    from the enclosing module scope.

    Args:
        dataframe: Mapping or list as described above.
        period: Mode selector; also the name of the first output column.
        gir_dict: Girondin reference vector (ignored in speaker mode).
        mont_dict: Montagnard reference vector (ignored in speaker mode).
        plein_dict: Reference vector behind the ``'distance to plein'``
            column.
        gir_mont_diff: Difference reference vector (ignored in speaker mode).

    Returns:
        A pandas DataFrame with the columns ``period``, ``'distance to gir'``,
        ``'distance to mont'``, ``'distance to diff'`` and
        ``'distance to plein'``. All distances are
        ``1 - cosine_similarity(...)`` except ``'distance to diff'``, which is
        the raw ``cosine_similarity`` against the difference vector.

    Raises:
        KeyError: In speaker mode, if a speaker is missing from the party
            table (raised by the ``.loc`` lookup).
    """
    if (period == 'aggregation') or (period == 'speaker'):
        period_vector = pd.Series(list(dataframe.keys()))
    else:
        period_vector = pd.Series(
            ["Before convention", "Convention", "After convention"])

    gir_dist = []
    mont_dist = []
    plein_dist = []
    gir_mont_diff_dist = []
    # The party table is only read from, so it is loaded once on first use
    # instead of being re-parsed for every speaker.
    speakers_to_analyze = None

    for element in dataframe:
        print(element)
        if period == 'speaker':
            # The group totals are reloaded for every speaker so that the
            # subtraction below always starts from the complete totals.
            with open("Girondins_withplein.pickle", "rb") as pickle_file:
                gir = pickle.load(pickle_file)
            with open("Montagnards_withplein.pickle", "rb") as pickle_file:
                mont = pickle.load(pickle_file)

            if speakers_to_analyze is None:
                speakers_to_analyze = load_list(
                    "Girondins and Montagnards New Mod.xlsx")
            party = speakers_to_analyze.loc[element, "Party"]

            # Leave the speaker's own counts out of their party's totals.
            if party == 'Girondins':
                gir = gir - dataframe[element]
            if party == 'Montagnards':
                print("here")
                mont = mont - dataframe[element]

            gir_dict = convert_keys_to_string(
                compute_tfidf(gir, num_speeches, doc_freq))
            mont_dict = convert_keys_to_string(
                compute_tfidf(mont, num_speeches, doc_freq))
            gir_mont_diff = compute_difference_withplein(gir_dict, mont_dict)
            tfidf_speaker = compute_tfidf(dataframe[element], num_speeches,
                                          doc_freq)
            to_compare = convert_keys_to_string(tfidf_speaker)
        elif period == 'aggregation':
            to_compare = convert_keys_to_string(dataframe[element])
        else:
            to_compare = convert_keys_to_string(element)

        # An empty vector (seen for at least one speaker) cannot be compared,
        # so it gets the default distances instead.
        if len(to_compare) > 0:
            gir_dist.append(1 - cosine_similarity(gir_dict, to_compare))
            mont_dist.append(1 - cosine_similarity(mont_dict, to_compare))
            plein_dist.append(1 - cosine_similarity(plein_dict, to_compare))
            gir_mont_diff_dist.append(
                cosine_similarity(gir_mont_diff, to_compare))
        else:
            gir_dist.append(1)
            mont_dist.append(1)
            plein_dist.append(1)
            gir_mont_diff_dist.append(0)

    # Each series becomes a row first; transposing turns them into columns.
    comp_df = pd.DataFrame([
        period_vector,
        pd.Series(gir_dist),
        pd.Series(mont_dist),
        pd.Series(gir_mont_diff_dist),
        pd.Series(plein_dist),
    ]).transpose()
    comp_df.columns = [
        period, 'distance to gir', 'distance to mont', 'distance to diff',
        'distance to plein'
    ]
    return comp_df