def recover_from_csv(csvfilename):
    """Re-yield rows from an exported CSV and fetch retweets for each original tweet."""
    progress = 0
    for row in csv_reader(csvfilename):
        progress += 1
        if progress % 1000 == 0:
            cl.progress('%d record(s) have been recovered' % progress)
        yield row
        # For tweets that were retweeted, query the Twitter API for up to 100
        # retweets and yield them as additional records.
        if int(row['retweets']):
            try:
                retweets = twapi.GetRetweets(int(row['id']), count=100)
            except Exception:
                cl.warning('Error: %s' % get_exc_line())
            else:
                for tweet in retweets:
                    yield {
                        'id': tweet.id_str,
                        'text': row['text'],
                        'timestamp': tweet.created_at,
                        'likes': tweet.favorite_count,
                        'retweets': tweet.retweet_count,
                        'replies': None,
                        'url': None,
                        'html': None,
                        'user': merge_whitespaces(tweet.user.screen_name),
                        'fullname': merge_whitespaces(tweet.user.name)
                    }
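# The csv_reader helper used by the functions in this file is not shown in
# this excerpt. A minimal sketch, assuming it simply yields each row of a
# UTF-8 CSV file as a dict keyed by the header row (the real project helper
# may do more, e.g. dialect handling):
import csv

def csv_reader_sketch(csvfilename):
    """Yield each row of a CSV file as a dict (hypothetical stand-in for csv_reader)."""
    with open(csvfilename, newline='', encoding='utf-8') as csvfile:
        yield from csv.DictReader(csvfile)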
def preprocess_csv(csvfilename, tweet_min_length, user_min_tweets,
                   remove_duplicates):
    """Group preprocessed tweets by user, keeping only users with enough tweets."""
    cl.progress('Preprocessing file: %s' % csvfilename)
    preprocessor = TWLDAPreprocessor()
    grouped_tweets = collections.defaultdict(list)
    grouped_tweets_source = collections.defaultdict(list)
    for row in csv_reader(csvfilename):
        user = row['user']
        result = preprocessor.preprocess(row['text'])
        # Keep only tweets that still have enough tokens after preprocessing.
        if len(result) >= tweet_min_length:
            result = ' '.join(result)
            if remove_duplicates and result in grouped_tweets[user]:
                continue
            grouped_tweets[user].append(result)
            grouped_tweets_source[user].append(row['text'].strip())
    # Drop users that do not have enough qualifying tweets.
    grouped_tweets = {
        u: t for u, t in grouped_tweets.items() if len(t) >= user_min_tweets
    }
    return grouped_tweets, grouped_tweets_source
def preprocess_csv(csvfilename):
    cl.progress('Preprocessing file: %s' % csvfilename)
    grouped_tweets = collections.defaultdict(list)
    for row in csv_reader(csvfilename):
        grouped_tweets[row['user']].append(row['text'])
    for user in grouped_tweets:
        yield {'id': user, 'text': ' '.join(grouped_tweets[user])}
def load_user_info(userinfofile):
    user_info = {}
    for row in csv_reader(userinfofile):
        row['favourites_count'] = int(row['favourites_count'])
        row['followers_count'] = int(row['followers_count'])
        row['friends_count'] = int(row['friends_count'])
        row['listed_count'] = int(row['listed_count'])
        row['statuses_count'] = int(row['statuses_count'])
        row['verified'] = ast.literal_eval(row['verified'])
        user_info[row['screen_name']] = row
    return user_info
def load_all(modeldesc, sourcedesc):
    """Load a trained LDA model, its corpus, the preprocessed items and the source texts."""
    # LdaMulticore is gensim's parallel LDA implementation
    # (from gensim.models import LdaMulticore).
    modelfilename = model_file('ldamodel-%s' % modeldesc)
    ldamodel = LdaMulticore.load(modelfilename)
    corpus = file_read_json(model_file('ldacorpus-%s.json' % modeldesc))
    prep_items = file_read_json(data_source_file(sourcedesc + '.prep.json'))
    sourcefilename = data_source_file(sourcedesc + '.csv')
    reader = csv_reader(sourcefilename)
    source_texts = {row['id']: row['text'] for row in reader}
    return ldamodel, corpus, prep_items, source_texts
def random_sampler(csvfilename, amount):
    cl.section('Data Random Sampler')
    cl.info('Random sampling file: %s' % csvfilename)
    cl.info('Amount: %d' % amount)
    csvfilename = data_source_file(csvfilename)
    data = list(csv_reader(csvfilename))
    random.shuffle(data)
    data = data[:amount]
    exportfilename = name_with_title_suffix(csvfilename, '-sample-%d' % amount)
    export_csv(data, exportfilename)
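# export_csv and name_with_title_suffix are project helpers not shown in this
# excerpt. A minimal sketch of plausible implementations, assuming export_csv
# writes a list of dicts with csv.DictWriter and name_with_title_suffix inserts
# a suffix before the file extension (hypothetical, not the project's code):
import csv
import os

def name_with_title_suffix_sketch(filename, suffix):
    """e.g. ('tweets.csv', '-sample-100') -> 'tweets-sample-100.csv'."""
    root, ext = os.path.splitext(filename)
    return root + suffix + ext

def export_csv_sketch(rows, exportfilename):
    """Write an iterable of dicts to a CSV file, headed by the first row's keys."""
    rows = list(rows)
    if not rows:
        return
    with open(exportfilename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)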
def preprocess_csv(csvfilename, *, preprocessor_cls=TextPreprocessor,
                   custom_stop_words=None, lem_ignore_patterns=None):
    cl.progress('Preprocessing file: %s' % csvfilename)
    preprocessor = preprocessor_cls(custom_stop_words=custom_stop_words,
                                    lem_ignore_patterns=lem_ignore_patterns)
    for row in csv_reader(csvfilename):
        result = preprocessor.preprocess(row['text'])
        if result:
            yield row['id'], result
def get_usernames(tweets_file):
    return list(
        set(row['user'] for row in csv_reader(data_source_file(tweets_file))))
except:
    # Fall back to processing the whole file list.
    from_id = 0
    to_id = len(file_list)

# Initial parameters
fold_size = 11
missingness_flag = [10, 20, 30, 40, 50]  # t% missing data
seed = 42
cat_cols = [1, 2, 5, 6, 8, 10, 11, 12, 13]
num_cols = [0, 3, 4, 7, 9]

# Main program
for i_file in range(from_id, to_id):
    file_name = file_list[i_file]
    print(datetime.datetime.now(), "File {}: {}".format(i_file, file_name))
    for i in range(1, fold_size):
        # Load the i-th fold of the original (complete) dataset.
        (D_train, D_test) = csv_reader(data_K_Fold, file_name, i,
                                       method='original_data',
                                       missingness=None)
        x_train = D_train[:, :(D_train.shape[1] - 1)]
        y_train = D_train[:, -1]
        x_test = D_test[:, :(D_test.shape[1] - 1)]
        y_test = D_test[:, -1]
        for missingness in missingness_flag:
            missingness /= 100
            # Inject the requested fraction of missing values into the
            # features, keeping the labels fully observed.
            cx_train, cx_train_mask = degrade_dataset(x_train, missingness,
                                                      seed, np.nan)
            cx_test, cx_test_mask = degrade_dataset(x_test, missingness,
                                                    seed, np.nan)
            cx_tr = np.c_[cx_train, y_train]
            cx_te = np.c_[cx_test, y_test]
            mask_tr = np.c_[cx_train_mask, np.ones(y_train.shape)]
            mask_te = np.c_[cx_test_mask, np.ones(y_test.shape)]
            # Here we preprocess the data, applying a one-hot encoding to the
            # categorical variables. We get the encoded dataset three different
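# degrade_dataset is not defined in this excerpt. A minimal sketch of the
# masking step it appears to perform: hide a fraction `missingness` of the
# feature entries uniformly at random (MCAR-style) and return the corrupted
# copy together with a binary mask (1 = observed, 0 = missing). This is a
# hypothetical implementation, not necessarily the one used above.
import numpy as np

def degrade_dataset_sketch(x, missingness, seed, fill_value=np.nan):
    """Return (corrupted_x, mask) with roughly `missingness` fraction of entries hidden."""
    rng = np.random.default_rng(seed)
    mask = (rng.random(x.shape) >= missingness).astype(float)  # 1 = kept, 0 = hidden
    corrupted = np.where(mask == 1, x.astype(float), fill_value)
    return corrupted, mask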