def prepare_data(data, clean_char=True):
    '''
    Lazily convert raw samples into (word index list, label index) pairs.

    data is an iterable of tuples whose first element is the text.
    The label is taken from position 2 when the tuple has more than two
    elements (the original notes describe this as social media data in
    the format (text, id, label, user, time)), otherwise from position 1
    (described as EF data in the format (text, label)).

    If clean_char is True, the text is passed through strip_cl_chars and
    then strip_emoticons before tokenization. According to the original
    notes these remove control characters and replace emojis/emoticons
    with a placeholder token -- confirm against those helpers.

    This is a generator: nothing is shuffled and nothing is computed
    until it is iterated. One pair is yielded per input tuple, in input
    order, regardless of text length.
    '''
    for entry in data:
        text = entry[0]
        if clean_char:
            text = strip_emoticons(strip_cl_chars(text))
        tokens = word_tokenize(text)

        # Longer tuples carry the label at index 2, two-element tuples at index 1.
        label_pos = 2 if len(entry) > 2 else 1
        label = entry[label_pos]

        # Output format per sample:
        #   words -> [word_idx, word_idx, word_idx, ...]
        #   label -> label_idx
        # Words are looked up before the label, as in the original order.
        token_ids = [w2i[token] for token in tokens]
        yield token_ids, t2i[label]
def shuffle_prepare_data(data, clean_char=True, num_labs=6):
    '''
    Shuffle data in place and split it into parallel sample/label lists.

    data is a mutable sequence of tuples with the text at index 0 and
    the label at index 2; the original notes give the format as
    (text, id, label, user, time). The sequence is shuffled in place
    with np.random.shuffle, so the caller's object is reordered.

    If clean_char is True (default), each text is passed through
    strip_cl_chars and then strip_emoticons. According to the original
    notes these remove control characters and replace emojis/emoticons
    with '<emo> ' -- confirm against those helpers.

    num_labs is the number of labels / classes to use:
      6 (default) -- labels are returned unmapped from data
      4           -- 'a1'/'a2' -> 'a', 'b1'/'b2' -> 'b', 'c1' and 'c2' kept
      3           -- 'a1'/'a2' -> 'A', 'b1'/'b2' -> 'B', 'c1'/'c2' -> 'C'

    Returns (samples, labels), two lists in the same (shuffled) order.

    Raises ValueError if num_labs is not 3, 4 or 6. This is checked
    before shuffling, so an invalid call leaves data untouched.
    Raises KeyError if a label is missing from the selected mapping.
    '''
    # NOTE(review): another function named shuffle_prepare_data appears
    # later in this file; if both live in the same module, the later
    # definition replaces this one.

    # Coarser label sets are obtained by collapsing the six original labels.
    label_mappings = {
        3: {'a1': 'A', 'a2': 'A', 'b1': 'B', 'b2': 'B', 'c1': 'C', 'c2': 'C'},
        4: {'a1': 'a', 'a2': 'a', 'b1': 'b', 'b2': 'b', 'c1': 'c1', 'c2': 'c2'},
    }

    # Fail early with a clear message instead of reaching the return
    # statement with `labels` never assigned.
    if num_labs != 6 and num_labs not in label_mappings:
        raise ValueError(
            'num_labs must be 3, 4 or 6, got {!r}'.format(num_labs)
        )

    # Shuffling data (in place)
    np.random.shuffle(data)

    if clean_char:
        samples = [strip_emoticons(strip_cl_chars(tup[0])) for tup in data]
    else:
        samples = [tup[0] for tup in data]

    if num_labs == 6:
        labels = [tup[2] for tup in data]
    else:
        mapping = label_mappings[num_labs]
        labels = [mapping[tup[2]] for tup in data]

    return samples, labels
def shuffle_prepare_data(data, clean_char=True):
    '''
    Shuffle data in place and return parallel lists of samples and labels.

    data is a mutable sequence of tuples with the text at index 0 and
    the label at index 1. It is reordered in place by np.random.shuffle.

    If clean_char is True (default), each text is passed through
    strip_cl_chars; no emoticon handling is applied in this function.

    NOTE(review): the original notes described the tuples as
    (text, id, label, user, time), but the label is read from index 1,
    which matches a (text, label) layout -- confirm against the caller.

    Returns (samples, labels) in the shuffled order.
    '''
    np.random.shuffle(data)

    samples, labels = [], []
    for row in data:
        text = row[0]
        samples.append(strip_cl_chars(text) if clean_char else text)
    for row in data:
        labels.append(row[1])

    return samples, labels