def test_partial_load_start_idx_specified_only(self):
    user_byte_arr1 = bytearray([120, 3, 255, 0, 100])
    user_byte_arr2 = bytearray([110, 3, 255, 90])
    utt_byte_arr1 = bytearray([99, 44, 33])
    utt_byte_arr2 = bytearray([110, 200, 220, 28])

    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world",
                  user=User(name="alice", meta={'user_binary_data': user_byte_arr1}),
                  meta={'utt_binary_data': utt_byte_arr1}),
        Utterance(id="1", text="my name is bob",
                  user=User(name="bob", meta={'user_binary_data': user_byte_arr2}),
                  meta={'utt_binary_data': utt_byte_arr2}),
        Utterance(id="2", text="this is a test",
                  user=User(name="charlie")),
    ])

    corpus1.dump('test_corpus', './')
    corpus2 = Corpus(filename="test_corpus", utterance_start_index=1)

    self.assertEqual(len(list(corpus2.iter_utterances())), 2)
    self.assertEqual(corpus1.get_utterance("1"), corpus2.get_utterance("1"))
    self.assertEqual(corpus1.get_utterance("2"), corpus2.get_utterance("2"))
def test_dump_and_load_with_binary(self):
    """
    Dump a corpus containing speakers with binary metadata and utterances with binary metadata.
    Check that the dumped corpus is successfully loaded with the same data.
    """
    speaker_byte_arr1 = bytearray([120, 3, 255, 0, 100])
    speaker_byte_arr2 = bytearray([110, 3, 255, 90])
    utt_byte_arr1 = bytearray([99, 44, 33])
    utt_byte_arr2 = bytearray([110, 200, 220, 28])

    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world",
                  speaker=Speaker(id="alice",
                                  meta={'speaker_binary_data': speaker_byte_arr1, 'index': 99}),
                  meta={'utt_binary_data': utt_byte_arr1}),
        Utterance(id="1", text="my name is bob",
                  speaker=Speaker(id="bob", meta={'speaker_binary_data': speaker_byte_arr2}),
                  meta={'utt_binary_data': utt_byte_arr2}),
        Utterance(id="2", text="this is a test",
                  speaker=Speaker(id="charlie")),
    ])

    alice = corpus1.get_speaker("alice")
    bob = corpus1.get_speaker("bob")

    corpus1.dump('test_corpus', './')
    corpus2 = Corpus(filename="test_corpus")

    alice2 = corpus2.get_speaker("alice")
    bob2 = corpus2.get_speaker("bob")

    self.assertEqual(alice.meta, alice2.meta)
    self.assertEqual(corpus1.get_utterance('0').meta, corpus2.get_utterance('0').meta)
    self.assertEqual(bob.meta, bob2.meta)
    self.assertEqual(corpus1.get_utterance('1').meta, corpus2.get_utterance('1').meta)
def print_corpus(c: Corpus) -> None:
    leaves = get_corpus_leaf_ids(c)
    for leaf_id in leaves:
        # Walk up the reply chain from the leaf utterance to the conversation root
        utt = c.get_utterance(leaf_id)
        chain = [utt]
        while utt.reply_to:
            utt = c.get_utterance(utt.reply_to)
            chain.append(utt)

        # Print the chain root-first, indenting one level per reply
        depth = ""
        print("this conversation is", len(chain), "utterances long.")
        for utterance in reversed(chain):
            print(depth + utterance.text.replace("\n", " "))
            depth += "--> "
        print("\n")
def add_title_to_root(corpus: Corpus):
    for conversation in corpus.iter_conversations():
        utterance = corpus.get_utterance(conversation.id)
        title = conversation.retrieve_meta('title')
        if title is None:
            title = ''
        if utterance.text is None:
            utterance.text = title
        else:
            utterance.text = title + ' ' + utterance.text
def transform(self, corpus: Corpus) -> Corpus:
    corpus = copy.deepcopy(corpus)
    for convo in corpus.iter_conversations():
        if 'rank' in convo.meta.keys():
            raise Exception("'rank' is already a key in this conversation's meta! aborting")
        # Rank each conversation by the total character length of its utterances
        t = 0
        for id in convo._utterance_ids:
            u = corpus.get_utterance(id)
            t += len(u.text)
        convo.meta['rank'] = t
    return corpus
from gensim.models import Word2Vec
from convokit import Corpus, Speaker, Utterance
from tqdm import tqdm

tartan_corpus = Corpus(filename="../tartan_corpus")

# gather all user utterances as training data
# 360,345 utterances
utterances = []
utt_id = tartan_corpus.get_utterance_ids()
for _id in tqdm(utt_id):
    utt = tartan_corpus.get_utterance(_id)
    if 'user' in utt._id:
        utterances.append(utt.text.split(" "))

# w2v model training
model = Word2Vec(utterances, min_count=1)
model.save("model/w2v_all.model")
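# (Not in the original script) A minimal sketch of reloading and probing the saved
# model; the query word "hello" is only an illustration and assumes it occurred in
# the training utterances.
reloaded = Word2Vec.load("model/w2v_all.model")
print(reloaded.wv.most_similar("hello", topn=5))  # nearest neighbours by cosine similarity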
def run_stats(transformed_corpus: Corpus):
    male_speaking = 0
    male_speaking_about_female = 0
    male_speaking_about_female_romantic = 0
    male_speaking_not_about_female = 0
    female_speaking = 0
    female_speaking_about_male = 0
    female_speaking_about_male_romantic = 0
    female_speaking_not_about_male = 0
    romantic = 0
    not_romantic = 0

    utterance_ids = transformed_corpus.get_utterance_ids()
    for uid in utterance_ids:
        utt = transformed_corpus.get_utterance(uid)

        # First get whether it's a male or female speaker
        if 'gender' in utt.user.meta:
            speaker_gender = utt.user.meta['gender'].lower()
        elif 'sex' in utt.user.meta:
            speaker_gender = utt.user.meta['sex'].lower()
        else:
            speaker_gender = ''  # neither field present; treat as unknown
        if speaker_gender == "male":
            male_speaking += 1
        if speaker_gender == "female":
            female_speaking += 1

        # Then get whether the utterance is a male speaking about a female:
        if utt.meta["male_about_female"]:
            male_speaking_about_female += 1
            # And whether it was romantic
            if utt.meta["contains_romantic"]:
                male_speaking_about_female_romantic += 1
        else:
            male_speaking_not_about_female += 1

        # Then get whether the utterance is a female speaking about a male:
        if utt.meta["female_about_male"]:
            female_speaking_about_male += 1
            # And whether it was romantic
            if utt.meta["contains_romantic"]:
                female_speaking_about_male_romantic += 1
        else:
            female_speaking_not_about_male += 1

        # Then register whether the utt is romantic, period.
        rom = utt.meta["contains_romantic"]
        if rom:
            romantic += 1
        else:
            not_romantic += 1

    # Creating percentages - help with graphs later
    perc_male_about_female = (float(male_speaking_about_female) / float(male_speaking)) * 100
    perc_male_about_female_rom = (float(male_speaking_about_female_romantic) / float(male_speaking_about_female)) * 100
    perc_female_about_male = (float(female_speaking_about_male) / float(female_speaking)) * 100
    perc_female_about_male_rom = (float(female_speaking_about_male_romantic) / float(female_speaking_about_male)) * 100

    print('male_speaking: ', male_speaking)
    print('male_speaking_about_female: ', male_speaking_about_female)
    print('male_speaking_about_female_romantic: ', male_speaking_about_female_romantic)
    print('pct male utterances about females', perc_male_about_female)
    print('pct male utterances about females that are romantic', perc_male_about_female_rom)
    print('male_speaking_not_about_female: ', male_speaking_not_about_female)
    print('\n')
    print('female_speaking: ', female_speaking)
    print('female_speaking_about_male: ', female_speaking_about_male)
    print('female_speaking_about_male_romantic: ', female_speaking_about_male_romantic)
    print('pct female utterances about males', perc_female_about_male)
    print('pct female utterances about males that are romantic', perc_female_about_male_rom)
    print('female_speaking_not_about_male: ', female_speaking_not_about_male)
    print('\n')
    print('romantic: ', romantic)
    print('not_romantic: ', not_romantic)
def filter_winning_arguments_corpus(corpus: Corpus):
    utterance_ids = corpus.get_utterance_ids()

    # We want the original post made by OP, the challengers' comments, and all of OP's
    # responses to the challengers. These three lists hold the utterance ids for the
    # original posts, challenger comments, and OP replies respectively.
    opPost = []
    challengerComments = []
    opReplies = []
    for iD in utterance_ids:
        utt = corpus.get_utterance(iD)
        root = corpus.get_utterance(utt.conversation_id)
        if utt.id == utt.conversation_id:
            opPost.append(iD)
        if utt.speaker.id != root.speaker.id and utt.meta['success'] in (0, 1):
            challengerComments.append(iD)
        if utt.id != utt.conversation_id and utt.speaker.id == root.speaker.id \
                and utt.meta['success'] in (0, 1):
            opReplies.append(iD)

    # Subset challenger comments and OP replies for later use
    # (into successful and unsuccessful arguments)
    challengerPos = [iD for iD in challengerComments if corpus.get_utterance(iD).meta['success'] == 1]
    challengerNeg = [iD for iD in challengerComments if corpus.get_utterance(iD).meta['success'] == 0]

    # These are OP's replies to successful and unsuccessful challengers
    opReplyPos = [iD for iD in opReplies if corpus.get_utterance(iD).meta['success'] == 1]
    opReplyNeg = [iD for iD in opReplies if corpus.get_utterance(iD).meta['success'] == 0]

    subset = opPost + challengerComments + opReplies

    # Collect the utterances for the subset of ids. This subset separates OP comments
    # and challenger utterances from all other comments in every conversation (thread).
    utterance_list = [corpus.get_utterance(iD) for iD in subset]
    corpus = Corpus(utterances=utterance_list)
    return corpus
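# (Not in the original) A hedged usage sketch of the filter above; "winning-args-corpus"
# is assumed to be the ConvoKit Change My View corpus whose utterances carry the
# 'success' metadata this function expects.
from convokit import Corpus, download

cmv_corpus = Corpus(filename=download("winning-args-corpus"))
filtered = filter_winning_arguments_corpus(cmv_corpus)
print(len(filtered.get_utterance_ids()), "utterances kept")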
# In 18
prompt_type_assignment_df.head()

# In 19
# noinspection PyTypeChecker
ps = PolitenessStrategies(verbose=1000)
awry_corpus = ps.transform(awry_corpus)

# In 20
utterance_ids = awry_corpus.get_utterance_ids()
rows = []
for uid in utterance_ids:
    rows.append(awry_corpus.get_utterance(uid).meta["politeness_strategies"])
politeness_strategies = pd.DataFrame(rows, index=utterance_ids)

# In 21
politeness_strategies.head(10)

# In 22
# first, we need to directly map comment IDs to their conversations. We'll build a DataFrame to do this
comment_ids = []
convo_ids = []
timestamps = []
page_ids = []
for conversation in awry_corpus.iter_conversations():
    for comment in conversation.iter_utterances():
import string

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from convokit import Corpus, download

nltk.download('punkt')
nltk.download('stopwords')  # needed for stopwords.words('english') below

corpus = Corpus(filename=download("subreddit-creepypasta"))

corpusTXT = open("corpus.txt", "w")
utter_ids = corpus.get_utterance_ids()
length = len(utter_ids)

# Print the posts from the subreddit to a file
i = 0
while i < 2:
    corpusTXT.write(corpus.get_utterance(utter_ids[i]).text)
    i += 1
corpusTXT.close()

corpusTXT_2 = open("corpus.txt", "r")
text = corpusTXT_2.read()

# Tokenize all of corpus.txt
nltk_sentences = sent_tokenize(text)
tokenized_sents = [word_tokenize(i) for i in nltk_sentences]

new_sents = []
stop_words = set(stopwords.words('english'))
whitespace = ' '
punctuation = string.punctuation
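# (Assumption) The original script is not shown past this point; one plausible
# continuation, given the new_sents / stop_words / punctuation setup above, is to
# filter stop words and punctuation out of each tokenized sentence:
for sent in tokenized_sents:
    filtered = [w for w in sent if w.lower() not in stop_words and w not in punctuation]
    if filtered:
        new_sents.append(filtered)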
import pickle
from collections import namedtuple

from convokit import Corpus, download

corpus = Corpus(filename=download("friends-corpus"))

Caption = namedtuple('Caption', ['character', 'message', 'startTime', 'endTime', 'comments'])

captions = []
i = 1
while True:
    convoNumber = '{:0>2}'.format(i)
    try:
        convo = corpus.get_conversation(f"s08_e14_c{convoNumber}_u001")
        for utterance_id in convo.get_utterance_ids():
            utterance = corpus.get_utterance(utterance_id)
            if utterance.retrieve_meta("caption") is None:
                continue
            startTime, endTime, _ = utterance.retrieve_meta("caption")
            captions.append(Caption(utterance.speaker.id, utterance.text,
                                    startTime // 1000, endTime // 1000, None))
        i += 1
    except KeyError:
        break  # there are no more conversations

captionsPath = "./data/friends/captions.pkl"
with open(captionsPath, 'wb') as captionsFile:
    pickle.dump(captions, captionsFile)
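# (Not in the original) Minimal check that the dump round-trips: reload the pickled
# captions from the same path and inspect the first entry.
with open(captionsPath, 'rb') as captionsFile:
    loaded = pickle.load(captionsFile)
print(len(loaded), "captions;", loaded[0] if loaded else "none")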
def convo_length(self, corpus: Corpus, convo):
    t = 0
    for id in convo._utterance_ids:
        u = corpus.get_utterance(id)
        t += len(u.text)
    return t