Пример #1
0
def prepare_data(data, clean_char=True):
    '''
    Lazily convert labelled text records into index-encoded training pairs.

    data: iterable of tuples. Records longer than two fields are read as
        (text, id, label, ...) -- the social-media (Twitter + Reddit) layout;
        two-field records are read as (text, label) -- the EFcamdat layout.
    clean_char: when true, the text is passed through strip_cl_chars and then
        strip_emoticons before tokenizing (presumably removing control
        characters and replacing emojis/emoticons with a placeholder --
        confirm against those helpers).

    Yields one (word_indices, label_index) tuple per record, where
    word_indices is the list of w2i lookups for every token produced by
    word_tokenize and label_index is the t2i lookup of the record's label.
    Nothing is shuffled and the input is not modified by this function.
    '''
    for record in data:
        # Text always lives in the first field; cleaning is optional.
        raw_text = record[0]
        if clean_char:
            raw_text = strip_emoticons(strip_cl_chars(raw_text))
        tokens = word_tokenize(raw_text)

        # The label position depends on the record layout (see docstring).
        label = record[2] if len(record) > 2 else record[1]

        # Model input format: ([word_idx, word_idx, ...], label_idx)
        word_indices = [w2i[token] for token in tokens]
        yield (word_indices, t2i[label])
Пример #2
0
def shuffle_prepare_data(data, clean_char=True, num_labs=6):
    '''
    Shuffle data in place, then split it into parallel lists of samples and labels.

    data: mutable sequence of tuples with the format
        (text, id, label, user, time). Only index 0 (text) and index 2
        (label) are read. The sequence itself is reordered by the shuffle.
    clean_char: if True (default), every text is passed through
        strip_cl_chars and then strip_emoticons (intended to remove control
        characters and replace all emojis and emoticons with '<emo> ').
    num_labs: number of labels / classes to use.
        6 (default) keeps the labels exactly as they appear in the data;
        4 merges a1/a2 -> 'a' and b1/b2 -> 'b', keeping 'c1' and 'c2';
        3 merges a1/a2 -> 'A', b1/b2 -> 'B' and c1/c2 -> 'C'.

    Returns (samples, labels), two lists aligned by position.

    Raises ValueError if num_labs is not 3, 4 or 6 (checked before the data
    is shuffled, so a rejected call leaves data untouched), and KeyError if
    a label is missing from the selected mapping.
    '''
    lab3_mapping = {
        'a1': 'A',
        'a2': 'A',
        'b1': 'B',
        'b2': 'B',
        'c1': 'C',
        'c2': 'C',
    }
    lab4_mapping = {
        'a1': 'a',
        'a2': 'a',
        'b1': 'b',
        'b2': 'b',
        'c1': 'c1',
        'c2': 'c2',
    }

    # Resolve the label mapping up front: an unsupported num_labs used to
    # fall through every branch and crash at `return` with an
    # UnboundLocalError, after the caller's data had already been shuffled.
    if num_labs == 6:
        mapping = None  # labels are used unmapped
    elif num_labs == 4:
        mapping = lab4_mapping
    elif num_labs == 3:
        mapping = lab3_mapping
    else:
        raise ValueError(
            'num_labs must be 3, 4 or 6, got {!r}'.format(num_labs))

    # Shuffling data (in place)
    np.random.shuffle(data)

    if clean_char:
        samples = [strip_emoticons(strip_cl_chars(tup[0])) for tup in data]
    else:
        samples = [tup[0] for tup in data]

    if mapping is None:
        labels = [tup[2] for tup in data]
    else:
        labels = [mapping[tup[2]] for tup in data]

    return samples, labels
Пример #3
0
def shuffle_prepare_data(data, clean_char=True):
    '''
    Shuffle data in place and return it as two aligned lists: samples and labels.

    data: mutable sequence of tuples. Records with more than two fields are
        read as (text, id, label, ...) -- the Twitter + Reddit layout; records
        with two fields are read as (text, label) -- the Efcamdat layout.
        The sequence itself is reordered by the shuffle.
    clean_char: if true (default), every text is passed through
        strip_cl_chars and then strip_emoticons (intended to remove control
        characters and replace all emojis and emoticons with '<emo> ').

    Returns (samples, labels), where labels[i] belongs to samples[i].
    '''
    np.random.shuffle(data)

    def _sample_text(record):
        # Text is always the first field; cleaning is optional.
        text = record[0]
        return strip_emoticons(strip_cl_chars(text)) if clean_char else text

    samples = [_sample_text(record) for record in data]

    # Label position depends on the record layout (see docstring).
    labels = [record[2] if len(record) > 2 else record[1] for record in data]

    return samples, labels