Example #1
def createMatrix(tweetText, tokenizer, maxVectorLength, pretrainedModel,
                 hatebase_dic):
    """Creates a matrix of word embeddings based on the embeddings of BERT and the dictionary approach.

    Args: 
        tweetText (str): The tweet as string.
        tokenizer (object): The instantiated tokenizer of the specific pretrained BERT model.
        maxVectorLength (int): The specified maximum vector length of the embeddings. 
        pretrainedModel (str): To specify which pretrained Bert model to use.
        hatebase_dic (dataframe): The hatebase dictionary as pandas dataframe.
    
    Returns: 
        matrix (torch tensor): The twodimensional matrix with BERT embeddings on the first and our dictionary approach on the second dimension. 
    """
    if isinstance(tweetText, float):  # an empty tweet is read in as NaN, i.e. a float
        print("Float tweet found in data: \"" + str(tweetText) +
              "\" --> interpreting it as string with str(tweet)")

    tweetText = str(tweetText)  # cast so empty (NaN) tweets are handled as strings

    raw_encoding = torch.tensor(
        tokenizer.encode(tweetText, max_length=maxVectorLength))

    vlength = raw_encoding.size()[0]

    encoding = padWithZeros(raw_encoding, maxVectorLength)

    hateMetric = padWithZeros(
        stretch(hatesearch(tweetText, hatebase_dic), vlength), maxVectorLength)

    matrix = torch.cat((encoding, hateMetric), 0).unsqueeze(0)

    # embeddings[i] = matrix
    return matrix
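
createMatrix relies on two helpers that are not shown on this page, padWithZeros and stretch. As a rough orientation only, here is a minimal sketch of how they might behave, assuming padWithZeros right-pads a 1-D tensor with zeros to maxVectorLength and stretch resamples the per-word hate metric to the token count; the project's real implementations may differ.

import torch
import torch.nn.functional as F


def padWithZeros(vector, maxVectorLength):
    # Hypothetical sketch: right-pad a 1-D tensor with zeros up to
    # maxVectorLength, truncating if it is already longer.
    padded = torch.zeros(maxVectorLength)
    length = min(vector.size(0), maxVectorLength)
    padded[:length] = vector[:length]
    return padded


def stretch(metric, targetLength):
    # Hypothetical sketch: resample a 1-D per-word metric to targetLength
    # entries so it lines up with the BERT token positions.
    resampled = F.interpolate(metric.view(1, 1, -1).float(), size=targetLength,
                              mode="nearest")
    return resampled.view(-1)
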
 def test_HateSearch(self):
     input_tweet = "how could i be a f*g but i like bitches please tell me"
     function_output = hatesearch(data=input_tweet)
     ideal_output = torch.tensor([
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 87.5849, 0.0000, 0.0000,
         85.0000, 25.0000, 0.0000, 0.0000, 0.0000
     ])
     self.assertTrue(torch.equal(function_output, ideal_output))
Example #3
 def test_HateSearch(self):
     """Tests the hatesearch function from M1_5_dictionary_approach_tweetlevel creates correct tensors based on one example.
     """
     input_tweet = "how could i be a f*g but i like bitches please tell me"
     function_output = hatesearch(data = input_tweet)
     ideal_output = torch.tensor([ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000, 87.58489525909593,  0.0000,  0.0000,
     85.0000, 25.0000,  0.0000,  0.0000,  0.0000])
     self.assertTrue(torch.equal(function_output, ideal_output))
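
Both tests pin hatesearch to one expected output: a 1-D tensor with one value per whitespace-separated word of the tweet (13 words, 13 values). The tests pass only data=, so the module apparently falls back to a dictionary it loads itself, while createMatrix passes hatebase_dic explicitly. Purely as an illustration, a hypothetical sketch of such a per-word lookup follows; the column names "term" and "average_offensiveness" are assumptions and may not match the real Hatebase export.

import torch


def hatesearch(data, hatebase_dic):
    # Hypothetical sketch: score each whitespace-separated word of the tweet
    # with an offensiveness value from the Hatebase dataframe, 0.0 for words
    # that are not found. The column names used here are assumptions.
    scores = []
    for word in data.split():
        match = hatebase_dic[hatebase_dic["term"].str.lower() == word.lower()]
        scores.append(float(match["average_offensiveness"].iloc[0])
                      if not match.empty else 0.0)
    return torch.tensor(scores)
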
def createMatrix(tweetText, tokenizer, maxVectorLength, pretrainedModel,
                 hatebase_dic):

    if isinstance(tweetText, float):  # an empty tweet is read in as NaN, i.e. a float
        print("Float tweet found in data: \"" + str(tweetText) +
              "\" --> interpreting it as string with str(tweet)")

    tweetText = str(tweetText)  # cast so empty (NaN) tweets are handled as strings

    raw_encoding = torch.Tensor(
        tokenizer.encode(tweetText, max_length=maxVectorLength))

    vlength = raw_encoding.size()[0]  # token count before padding

    encoding = padWithZeros(raw_encoding, maxVectorLength)
    hateMetric = padWithZeros(
        stretch(hatesearch(tweetText, hatebase_dic), vlength), maxVectorLength)

    matrix = torch.cat((encoding, hateMetric), 0).unsqueeze(0)

    # embeddings[i] = matrix
    return matrix
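
To round this off, a hedged usage sketch: it assumes a Hugging Face transformers tokenizer and a Hatebase export loaded with pandas, and the file name, model name, and maximum length of 64 are placeholders rather than values taken from the original project.

import pandas as pd
from transformers import BertTokenizer

# Placeholder inputs: any pretrained BERT tokenizer and a Hatebase export as a
# pandas DataFrame ("hatebase_dictionary.csv" is an assumed file name).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
hatebase_dic = pd.read_csv("hatebase_dictionary.csv")

matrix = createMatrix("this is an example tweet",
                      tokenizer=tokenizer,
                      maxVectorLength=64,
                      pretrainedModel="bert-base-uncased",
                      hatebase_dic=hatebase_dic)
print(matrix.shape)  # one row combining the padded token ids and the hate metric
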