def get_corpus(num_samples):
    """
    Load a randomly drawn subset of the sent 140 corpus.

    The entries are drawn at random because the original data seems to be
    biased by position in the file.

    :param num_samples: size of the random sample to extract. A negative value
        means that the whole corpus is returned (in shuffled order) instead of
        just a section of it.
    :return: The subset, as a list of <tweet> tab <pos-tags> entries
    """
    # Read inside a context manager so the file handle is closed
    # deterministically instead of being leaked.
    with open(root + 'Data/Corpora/batches/tokenized.tsv', encoding='utf-8') as corpus_file:
        all_data = [tweet.strip() for tweet in corpus_file]

    # A negative request means "everything": sample the full population size.
    sample_size = len(all_data) if num_samples < 0 else num_samples
    # NOTE(review): `sample` is presumably random.sample, which raises
    # ValueError when sample_size exceeds len(all_data) -- confirm with callers.
    all_data = sample(all_data, sample_size)

    # The first field returned by get_tweet is dropped; the remaining fields
    # are re-joined with tabs.
    return [u'\t'.join(get_tweet(tweet)[1:]) for tweet in all_data]
def get_raw_semeval_data(text, meta):
    """
    Extract preprocessed semeval data, consisting of tweets and POS-tags as
    instances, and strings as labels.

    The two files are read in lockstep, one line of each per instance; reading
    stops at the end of the shorter file. Each text line must hold exactly four
    tab-separated fields (tweet, POS-tags and two ignored fields) and each meta
    line exactly three (two ignored fields and the label). Instances are keyed
    by "<tweet> tab <pos-tags>", so a repeated tweet/POS pair keeps only the
    label seen last.

    :param text: location of tweet text
    :param meta: location of meta data
    :return: Training or test data in the classic (X, y) format, as the keys
        and values of the instance -> label mapping
    :raises ValueError: if a line does not have the expected number of
        tab-separated fields
    """
    tweets = {}
    # The encoding is passed by keyword: as a third positional argument it
    # would be taken as `buffering` by the built-in open(). The context
    # manager also closes both files, which were previously leaked.
    with open(text, 'r', encoding='utf-8') as text_file, \
            open(meta, 'r', encoding='utf-8') as meta_file:
        for line_1, line_2 in zip(text_file, meta_file):
            tweet, pos, _, _ = line_1.strip().split('\t')
            _, _, sent = line_2.strip().split('\t')
            # get_tweet receives the label, tweet and POS-tags as a single
            # tab-joined entry and hands back the three processed fields.
            sent, tweet, pos = get_tweet(u'\t'.join((sent, tweet, pos)))
            tweets[u'\t'.join([tweet, pos])] = sent
    return tweets.keys(), tweets.values()