Example #1
def recover_from_csv(csvfilename):
    progress = 0

    for row in csv_reader(csvfilename):
        progress += 1

        if progress % 1000 == 0:
            cl.progress('%d record(s) have been recovered' % progress)

        yield row

        # If this tweet has retweets on record, try to fetch up to 100 of them
        # from the Twitter API and yield each one as an additional row.
        if int(row['retweets']):
            try:
                retweets = twapi.GetRetweets(int(row['id']), count=100)
            except Exception:
                cl.warning('Error: %s' % get_exc_line())
            else:
                for tweet in retweets:
                    yield {
                        'id': tweet.id_str,
                        'text': row['text'],
                        'timestamp': tweet.created_at,
                        'likes': tweet.favorite_count,
                        'retweets': tweet.retweet_count,
                        'replies': None,
                        'url': None,
                        'html': None,
                        'user': merge_whitespaces(tweet.user.screen_name),
                        'fullname': merge_whitespaces(tweet.user.name)
                    }

Example #2
def preprocess_csv(csvfilename, tweet_min_length, user_min_tweets,
                   remove_duplicates):
    cl.progress('Preprocessing file: %s' % csvfilename)
    preprocessor = TWLDAPreprocessor()
    grouped_tweets = collections.defaultdict(list)
    grouped_tweets_source = collections.defaultdict(list)

    for row in csv_reader(csvfilename):
        user = row['user']
        result = preprocessor.preprocess(row['text'])

        if len(result) >= tweet_min_length:
            result = ' '.join(result)

            if remove_duplicates and result in grouped_tweets[user]:
                continue

            grouped_tweets[user].append(result)
            grouped_tweets_source[user].append(row['text'].strip())

    grouped_tweets = {
        u: t
        for u, t in grouped_tweets.items() if len(t) >= user_min_tweets
    }
    return grouped_tweets, grouped_tweets_source

Example #3
def preprocess_csv(csvfilename):
    cl.progress('Preprocessing file: %s' % csvfilename)

    grouped_tweets = collections.defaultdict(list)

    for row in csv_reader(csvfilename):
        grouped_tweets[row['user']].append(row['text'])

    for user in grouped_tweets:
        yield {'id': user, 'text': '  '.join(grouped_tweets[user])}

Example #4
def load_user_info(userinfofile):
    user_info = {}

    for row in csv_reader(userinfofile):
        row['favourites_count'] = int(row['favourites_count'])
        row['followers_count'] = int(row['followers_count'])
        row['friends_count'] = int(row['friends_count'])
        row['listed_count'] = int(row['listed_count'])
        row['statuses_count'] = int(row['statuses_count'])
        row['verified'] = ast.literal_eval(row['verified'])
        user_info[row['screen_name']] = row

    return user_info

Example #5
def load_all(modeldesc, sourcedesc):
    modelfilename = model_file('ldamodel-%s' % modeldesc)
    ldamodel = LdaMulticore.load(modelfilename)

    corpus = file_read_json(model_file('ldacorpus-%s.json' % modeldesc))

    prep_items = file_read_json(data_source_file(sourcedesc + '.prep.json'))

    sourcefilename = data_source_file(sourcedesc + '.csv')
    reader = csv_reader(sourcefilename)
    source_texts = {row['id']: row['text'] for row in reader}

    return ldamodel, corpus, prep_items, source_texts

Example #6
def random_sampler(csvfilename, amount):
    cl.section('Data Random Sampler')
    cl.info('Random sampling file: %s' % csvfilename)
    cl.info('Amount: %d' % amount)

    csvfilename = data_source_file(csvfilename)
    data = list(csv_reader(csvfilename))

    random.shuffle(data)
    data = data[:amount]

    exportfilename = name_with_title_suffix(csvfilename, '-sample-%d' % amount)
    export_csv(data, exportfilename)

Example #7
def preprocess_csv(csvfilename,
                   *,
                   preprocessor_cls=TextPreprocessor,
                   custom_stop_words=None,
                   lem_ignore_patterns=None):
    cl.progress('Preprocessing file: %s' % csvfilename)

    preprocessor = preprocessor_cls(custom_stop_words=custom_stop_words,
                                    lem_ignore_patterns=lem_ignore_patterns)

    for row in csv_reader(csvfilename):
        result = preprocessor.preprocess(row['text'])

        if result:
            yield row['id'], result

Example #8
def get_usernames(tweets_file):
    return list(
        set(row['user'] for row in csv_reader(data_source_file(tweets_file))))

Example #9
# The matching "try" block is not part of this excerpt; it presumably reads
# the range of files to process, falling back to the full list here.
except Exception:
    from_id = 0
    to_id = len(file_list)
# Initial parameters
fold_size = 11
missingness_flag = [10, 20, 30, 40, 50]  # percentages (t%) of missing data to inject
seed = 42
cat_cols = [1, 2, 5, 6, 8, 10, 11, 12, 13]
num_cols = [0, 3, 4, 7, 9]
# Main program
for i_file in range(from_id, to_id):
    file_name = file_list[i_file]
    print(datetime.datetime.now(), "File {}: {}".format(i_file, file_name))

    for i in range(1, fold_size):
        (D_train, D_test) = csv_reader(data_K_Fold, file_name, i, method='original_data', missingness=None)
        x_train = D_train[:, :(D_train.shape[1] - 1)]
        y_train = D_train[:, -1]
        x_test = D_test[:, :(D_test.shape[1] - 1)]
        y_test = D_test[:, -1]
        for missingness in missingness_flag:
            missingness /= 100
            # Inject the chosen fraction of missing values (NaN) into the train
            # and test features; degrade_dataset also returns the corresponding masks.
            cx_train, cx_train_mask = degrade_dataset(x_train, missingness, seed, np.nan)
            cx_test, cx_test_mask = degrade_dataset(x_test, missingness, seed, np.nan)

            cx_tr = np.c_[cx_train, y_train]
            cx_te = np.c_[cx_test, y_test]

            mask_tr = np.c_[cx_train_mask, np.ones(y_train.shape)]
            mask_te = np.c_[cx_test_mask, np.ones(y_test.shape)]
            # Here we preprocess the data by applying a one-hot encoding to the
            # categorical variables. We get the encoded dataset in three different
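            # --- Illustrative sketch only (assumed, not from the original
            # script): one way to implement the encoding described above is
            # scikit-learn's ColumnTransformer + OneHotEncoder, one-hot
            # encoding the categorical columns (cat_cols) and passing the
            # numeric ones (num_cols) through unchanged. `sparse_output`
            # requires scikit-learn >= 1.2; handling of the NaNs injected by
            # degrade_dataset (e.g. imputation) is omitted from this sketch.
            from sklearn.compose import ColumnTransformer
            from sklearn.preprocessing import OneHotEncoder

            encoder = ColumnTransformer(
                [('onehot',
                  OneHotEncoder(handle_unknown='ignore', sparse_output=False),
                  cat_cols)],
                remainder='passthrough')
            x_train_enc = encoder.fit_transform(x_train)  # fit on the clean features
            x_test_enc = encoder.transform(x_test)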