def createMatrix(tweetText, tokenizer, maxVectorLength, pretrainedModel,
                 hatebase_dic):
    """Build a 2-D embedding matrix combining BERT token ids and the
    hatebase dictionary metric.

    Args:
        tweetText (str): The tweet text. Non-string values (NaN floats from
            empty cells) are converted with str().
        tokenizer (object): Tokenizer of the chosen pretrained BERT model.
        maxVectorLength (int): Maximum vector length of the embeddings.
        pretrainedModel (str): Which pretrained BERT model to use.
        hatebase_dic (dataframe): Hatebase dictionary as a pandas dataframe.

    Returns:
        torch.Tensor: Shape (1, 2*maxVectorLength) after unsqueeze — BERT
        encoding concatenated with the zero-padded hate metric.
    """
    # Empty cells in the data arrive as float NaN; coerce them to strings.
    if isinstance(tweetText, float):
        print("Float tweet found in data: \"" + str(tweetText)
              + "\" --> interpreting it as string with str(tweet)")
        tweetText = str(tweetText)

    raw_encoding = torch.tensor(
        tokenizer.encode(tweetText, max_length=maxVectorLength))
    # Length of the *unpadded* encoding drives the stretch of the metric.
    vlength = raw_encoding.size()[0]
    encoding = padWithZeros(raw_encoding, maxVectorLength)

    hate_scores = hatesearch(tweetText, hatebase_dic)
    hateMetric = padWithZeros(stretch(hate_scores, vlength), maxVectorLength)

    return torch.cat((encoding, hateMetric), 0).unsqueeze(0)
def test_HateSearch(self):
    """Tests that hatesearch produces the expected score tensor for one
    example tweet.

    NOTE(review): self.assertEqual on two multi-element tensors raises
    "Boolean value of Tensor with more than one element is ambiguous";
    element-wise comparison via torch.equal is required instead.
    """
    input_tweet = "how could i be a f*g but i like bitches please tell me"
    function_output = hatesearch(data=input_tweet)
    ideal_output = torch.tensor([
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 87.5849, 0.0000, 0.0000,
        85.0000, 25.0000, 0.0000, 0.0000, 0.0000
    ])
    # torch.equal checks shape and all elements at once.
    self.assertTrue(torch.equal(function_output, ideal_output))
def test_HateSearch(self):
    """Tests the hatesearch function from M1_5_dictionary_approach_tweetlevel
    creates correct tensors based on one example.
    """
    tweet = "how could i be a f*g but i like bitches please tell me"
    expected = torch.tensor([
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        87.58489525909593, 0.0000, 0.0000,
        85.0000, 25.0000, 0.0000, 0.0000, 0.0000,
    ])
    actual = hatesearch(data=tweet)
    # Exact element-wise equality, including shape.
    self.assertTrue(torch.equal(actual, expected))
def createMatrix(tweetText, tokenizer, maxVectorLength, pretrainedModel,
                 hatebase_dic):
    """Build a 2-D embedding matrix combining BERT token ids and the
    hatebase dictionary metric.

    Args:
        tweetText (str): The tweet text. Non-string values (NaN floats from
            empty cells) are converted with str().
        tokenizer (object): Tokenizer of the chosen pretrained BERT model.
        maxVectorLength (int): Maximum vector length of the embeddings.
        pretrainedModel (str): Which pretrained BERT model to use.
        hatebase_dic (dataframe): Hatebase dictionary as a pandas dataframe.

    Returns:
        torch.Tensor: BERT encoding concatenated with the zero-padded hate
        metric, with a leading batch dimension of 1.
    """
    if isinstance(tweetText, float):
        # Empty values in the data are read as float NaN.
        print("Float tweet found in data: \"" + str(tweetText)
              + "\" --> interpreting it as string with str(tweet)")
        tweetText = str(tweetText)

    # BUGFIX: vlength must be the length of the *raw* (unpadded) encoding.
    # Previously it was taken after padWithZeros, so it always equaled
    # maxVectorLength and stretch() received the wrong target length.
    # Also use torch.tensor (not torch.Tensor) so integer token ids keep
    # an integer dtype, consistent with the other createMatrix variant.
    raw_encoding = torch.tensor(
        tokenizer.encode(tweetText, max_length=maxVectorLength))
    vlength = raw_encoding.size()[0]
    encoding = padWithZeros(raw_encoding, maxVectorLength)

    hateMetric = padWithZeros(
        stretch(hatesearch(tweetText, hatebase_dic), vlength),
        maxVectorLength)
    matrix = torch.cat((encoding, hateMetric), 0).unsqueeze(0)
    return matrix