Пример #1
0
def get_corpus(num_samples):
    """
    Load a randomly drawn subset of the sent 140 corpus.

    The entries are drawn at random because the original data seems to have a
    bias on position.

    :param num_samples: size of the random sample to extract. A negative value
    means that everything gets offered (in random order) instead of just a section
    :return: The subset, as list of <tweet> tab <pos-tags> entries
    """
    # Read all lines inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(root+'Data/Corpora/batches/tokenized.tsv', encoding='utf-8') as corpus_file:
        all_data = [tweet.strip() for tweet in corpus_file]
    if num_samples < 0:
        # Sampling the full length yields every entry, in shuffled order.
        all_data = sample(all_data, len(all_data))
    else:
        # NOTE(review): if `sample` is random.sample, asking for more entries
        # than the corpus holds raises ValueError -- confirm callers respect this.
        all_data = sample(all_data, num_samples)
    # Drop the first field returned by get_tweet and keep the rest, tab-joined.
    return [u'\t'.join(get_tweet(tweet)[1:]) for tweet in all_data]
Пример #2
0
def get_raw_semeval_data(text, meta):
    """
    Extracting preprocessed semeval data, consisting of tweets and POS-tags as instances, and strings as labels.

    Both files are read in lockstep, one line each per tweet. Every line of the
    text file must hold exactly four tab-separated fields (tweet, POS-tags and
    two ignored ones); every line of the meta file must hold exactly three
    (two ignored ones and the sentiment). If one file is longer than the
    other, the surplus lines are silently ignored. Instances are keyed by
    <tweet> tab <pos-tags>, so a repeated instance keeps only its last label.

    :param text: location of tweet text
    :param meta: location of meta data
    :return: Training or test data in the classic (X, y) format, as the keys
    and values of one dictionary (so both are in matching order)
    :raises ValueError: if a line does not have the expected number of fields
    """
    tweets = {}
    # The encoding is passed by keyword: as a third positional argument it
    # would be taken as `buffering` by the built-in open(). The context
    # manager guarantees that both files are closed again.
    with open(text, 'r', encoding='utf-8') as text_file, \
            open(meta, 'r', encoding='utf-8') as meta_file:
        for line_1, line_2 in zip(text_file, meta_file):
            tweet, pos, _, _ = line_1.strip().split('\t')
            _, _, sent = line_2.strip().split('\t')
            # get_tweet takes one tab-joined <sent, tweet, pos> record and
            # hands back the three fields again.
            sent, tweet, pos = get_tweet(u'\t'.join((sent, tweet, pos)))
            tweets[u'\t'.join([tweet, pos])] = sent
    return tweets.keys(), tweets.values()