def contains_keywords(self, labels):
    df = get_parsed_data(usecols=['id', 'contains_keywords'])
    labels = pd.merge(labels, df, left_on='tweet_id', right_on='id', how='inner')
    return labels[labels.contains_keywords]
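# Hedged usage sketch (illustrative only): contains_keywords() expects a `labels`
# DataFrame with a `tweet_id` column and keeps only rows whose tweet matched the
# keyword list in the parsed data. `labels` should not already contain a
# `contains_keywords` column, otherwise the merge would add suffixes. The `handler`
# object below is hypothetical.
#
#   labels = pd.DataFrame({'tweet_id': [123, 456], 'label': ['positive', 'neutral']})
#   labels_with_keywords = handler.contains_keywords(labels)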
def get_usernames(data_type='annotation', num=5000, seed=42):
    if data_type == 'annotation':
        df = pd.read_pickle(annotation_data)
        df = df.dropna(subset=['username'])
        df = df.drop_duplicates(subset=['username'])
        usernames = df.username.sample(num, random_state=seed)
    elif data_type == 'raw':
        df = get_parsed_data(usecols=['user.screen_name'])
        df = df.dropna(subset=['user.screen_name'])
        df = df.drop_duplicates(subset=['user.screen_name'])
        usernames = df['user.screen_name'].sample(num, random_state=seed)
    else:
        raise ValueError(f'Unknown data_type "{data_type}"')
    return usernames
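# Hedged usage sketch (assumes the module-level `annotation_data` path and
# `get_parsed_data` are configured for the local data layout; values are illustrative):
#
#   # 1,000 distinct screen names drawn from the raw parsed data
#   raw_users = get_usernames(data_type='raw', num=1000, seed=7)
#
#   # 5,000 distinct usernames from the annotation pickle (defaults)
#   annotation_users = get_usernames()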
def prepare_predict(args):
    # paths
    date_key = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    f_out_folder = os.path.join(find_project_root(), 'data', 'other', 'prepare_prediction', date_key)
    f_path_txt = os.path.join(f_out_folder, 'text.csv')
    f_path_meta = os.path.join(f_out_folder, 'id_created_at.csv')
    if os.path.isdir(f_out_folder):
        raise Exception(f'Folder {f_out_folder} already exists')
    os.makedirs(f_out_folder)
    # read data
    logger.info('Reading raw data...')
    df = get_parsed_data(num_files=None, s_date=args.start_date, e_date=args.end_date, usecols=['id', 'created_at', 'text'])
    if args.start_date is not None:
        df = df[df.created_at > args.start_date]
    if args.end_date is not None:
        df = df[df.created_at < args.end_date]
    logger.info('Sorting...')
    df = df.sort_values('created_at')
    if args.anonymize:
        def anonymize(df_chunk):
            return df_chunk.apply(ProcessTweet.anonymize_text, url_filler=args.url_filler, user_filler=args.user_filler)
        logger.info('Anonymize...')
        anonymize_delayed = joblib.delayed(anonymize)
        num_cores = max(multiprocessing.cpu_count() - 1, 1)
        parallel = joblib.Parallel(n_jobs=num_cores)
        res = parallel(anonymize_delayed(df_chunk) for df_chunk in tqdm(np.array_split(df['text'], num_cores), unit='chunk'))
        df['text'] = pd.concat(res)
    # write data
    logger.info(f'Writing text column to {f_path_txt}...')
    df[['text']].to_csv(f_path_txt, index=False, quoting=csv.QUOTE_NONE)
    logger.info(f'Writing id/created_at column to {f_path_meta}...')
    df[['id', 'created_at']].to_csv(f_path_meta, index=False)
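# Hedged sketch of the namespace prepare_predict() consumes. The real CLI parser is
# not shown in this module; the attribute names follow the accesses inside the
# function, while the concrete values (dates, filler tokens) are assumptions.
#
#   from argparse import Namespace
#   args = Namespace(
#       start_date='2020-03-01',   # lower bound on created_at (exclusive)
#       end_date='2020-04-01',     # upper bound on created_at
#       anonymize=True,            # replace URLs/user mentions before export
#       url_filler='<url>',        # assumed placeholder token
#       user_filler='@user',       # assumed placeholder token
#   )
#   prepare_predict(args)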
def prepare(self):
    parser = ArgParseDefault(description='Prepare text to be translated')
    logger = logging.getLogger(__name__)
    parser.add_argument('--geo-only', dest='geo_only', action='store_true', help='Only use geo-tagged data')
    parser.add_argument('-d', '--dtype', dest='dtype', choices=['original', 'anonymized', 'encrypted'], default='anonymized', help='Data source type')
    parser.add_argument('-l', '--limit', dest='limit', type=int, default=-1, help='If set, only extract random subsample')
    args = parser.parse_args(sys.argv[2:])
    # load data
    logger.info('Loading data...')
    df = get_parsed_data(dtype=args.dtype, usecols=['id', 'text', 'is_duplicate', 'has_place', 'has_coordinates', 'is_retweet'])
    # filter
    if args.geo_only:
        df = df[(df.has_place | df.has_coordinates) & (~df.is_duplicate) & (~df.is_retweet)]
    else:
        df = df[(~df.is_duplicate) & (~df.is_retweet)]
    if args.limit > 0:
        df = df.sample(args.limit)
    # write data
    folder = os.path.join(find_folder('other'), 'translations')
    if not os.path.isdir(folder):
        os.makedirs(folder)
    f_path = os.path.join(folder, 'prepare_{}.csv'.format(get_df_hash(df)[:5]))
    logger.info('Writing {:,} records to file {}...'.format(len(df), f_path))
    df[['id', 'text']].to_csv(f_path, index=False)
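# Hedged CLI sketch: prepare() parses sys.argv[2:], i.e. it is wired up as a
# subcommand of the repo's command-line entry point. Assuming a hypothetical
# entry script `main.py` that dispatches to it:
#
#   python main.py prepare --geo-only --dtype anonymized --limit 100000
#
# This writes an id/text CSV into data/other/translations/ for later translation.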
def main(args):
    """
    This script creates new files in preprocess/data/other/pretrain with tweets which should
    be used for pretraining language models. It excludes training data and duplicates.
    """
    # load data
    logger.info('Reading data...')
    usecols = ['id', 'text', 'lang', 'token_count', 'is_retweet', 'contains_keywords']
    df = get_parsed_data(usecols=usecols, num_files=args.num_files)
    logger.info(f'...loaded a total of {len(df):,} tweets')
    # Filter retweets
    if 'retweets' in args.filters:
        logger.info('Filter retweets...')
        num_before = len(df)
        df = df[~df.is_retweet]
        num_after = len(df)
        logger.info(f'... {num_after:,} remaining (removed {num_before - num_after:,})')
    # Filter by keyword
    if 'contains_keywords' in args.filters:
        logger.info('Filter contains_keywords...')
        num_before = len(df)
        df = df[df.contains_keywords]
        num_after = len(df)
        logger.info(f'... {num_after:,} remaining (removed {num_before - num_after:,})')
    # Filter lang
    if args.lang is not None:
        logger.info(f'Filter lang {args.lang}...')
        num_before = len(df)
        df = df[df.lang == args.lang]
        num_after = len(df)
        logger.info(f'... {num_after:,} remaining (removed {num_before - num_after:,})')
    # Filter min tokens
    if args.min_tokens > 0:
        logger.info(f'Filter has >={args.min_tokens} tokens...')
        num_before = len(df)
        df = df[df.token_count >= args.min_tokens]
        num_after = len(df)
        logger.info(f'... {num_after:,} remaining (removed {num_before - num_after:,})')
    # Generate cleared text column to filter for duplicates
    logger.info('Remove duplicates...')
    num_before = len(df)
    df.loc[:, 'text_cleared'] = df.text.apply(generate_text_cleared)
    df = df.drop_duplicates(subset=['text_cleared'])
    num_after = len(df)
    logger.info(f'... {num_after:,} remaining (removed {num_before - num_after:,})')
    # Shuffle
    logger.info('Shuffle...')
    df = df.sample(frac=1)
    # Write output files
    num_lines = len(df)
    logger.info(f'Collected total of {num_lines:,} examples')
    num_train = max(int(0.8 * num_lines), num_lines - int(2e5))
    ts = datetime.datetime.now().strftime('%Y_%m_%d-%H-%M_%s')
    for (_s, _e), _type in zip([(None, num_train), (num_train, None)], ['train', 'dev']):
        _df = df[_s:_e]
        logger.info(f'Writing {len(_df):,} examples for {_type} data...')
        output_folder = os.path.join(find_folder('other'), 'pretrain', f'run_{ts}', _type)
        if not os.path.isdir(output_folder):
            os.makedirs(output_folder)
        if args.no_parallel:
            num_cpus = 1
        else:
            num_cpus = max(multiprocessing.cpu_count() - 1, 1)
        parallel = joblib.Parallel(n_jobs=num_cpus)
        write_output_file_delayed = joblib.delayed(write_output_file)
        res = parallel((
            write_output_file_delayed(
                _df.iloc[i:(i + args.max_examples_per_file)],
                os.path.join(output_folder, f'pretrain_{_type}_{j:03}.txt'))
            for j, i in enumerate(trange(0, len(_df), args.max_examples_per_file))))
        logger.info(f'Successfully wrote {len(res):,} file(s) to folder {output_folder}')
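# Hedged sketch of an argparse setup that would supply the attributes main() reads
# (num_files, filters, lang, min_tokens, no_parallel, max_examples_per_file). Only the
# attribute names are taken from the function body; the flag names and defaults below
# are assumptions.
#
#   parser = ArgParseDefault(description='Prepare pretraining data')
#   parser.add_argument('--num-files', dest='num_files', type=int, default=None)
#   parser.add_argument('--filters', nargs='+', default=['retweets', 'contains_keywords'])
#   parser.add_argument('--lang', default=None)
#   parser.add_argument('--min-tokens', dest='min_tokens', type=int, default=3)
#   parser.add_argument('--no-parallel', dest='no_parallel', action='store_true')
#   parser.add_argument('--max-examples-per-file', dest='max_examples_per_file', type=int, default=int(1e6))
#   main(parser.parse_args())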
def run(size=None, langs=None, include_replies=False, anonymize=True, contains_keywords=False,
        min_token_count=3, mode='monthly', seed=None, extend=False, bin_size=None,
        min_date=None, max_date=None):
    if bin_size is None:
        logger.info('Creating sample of size {:,}...'.format(size))
    else:
        logger.info('Creating sample of size {} or bin size {:,}...'.format(size, bin_size))
    df = get_parsed_data(
        usecols=['id', 'text', 'created_at', 'lang', 'is_reply', 'has_quote', 'token_count'],
        contains_keywords=contains_keywords,
        s_date=min_date,
        e_date=max_date)
    logger.info(f'Read {len(df):,} samples. Filtering...')
    flags = ''
    # Filter by date
    if min_date is not None or max_date is not None:
        logger.info('Filtering by dates...')
        df = df.set_index('created_at')[min_date:max_date].reset_index()
    # Min token count
    if isinstance(min_token_count, int):
        logger.info('Filtering by min_token_count...')
        df = df[df.token_count > min_token_count]
    if not include_replies:
        # by default filter replies
        df = df[~df.is_reply]
    else:
        logger.info('Including replies...')
        flags += '_include_replies'
    # Contains keywords
    if contains_keywords:
        # data was already filtered in get_parsed_data
        logger.info('Filtered for contains_keywords...')
        flags += '_contains_keywords'
    # Filter by language
    if isinstance(langs, list):
        if len(langs) > 0:
            logger.info('Filtering for languages {}...'.format(','.join(langs)))
            df = df[df.lang.isin(langs)]
            flags += '_langs_{}'.format(','.join(langs))
    # Filter previously sampled tweets
    if extend:
        logger.info('Extending previously sampled data...')
        flags += '_extended'
        df_sampled = get_sampled_data()
        df = df[~df.id.isin(df_sampled.tweet_id)]
        df = df[~df.text.isin(df_sampled.tweet_text)]
    # is_duplicate only marks duplicates before replacing <url> and @user tokens
    logger.info('Final screening for duplicates...')
    df['text_cleared'] = df.text.str.replace(r'@<user>|<url>', '', regex=True)
    df['text_cleared'] = df.text_cleared.str.strip()
    df = df.drop_duplicates(subset=['text_cleared'])
    df = df.drop(['text_cleared'], axis=1)  # release memory
    logger.info(f'... {len(df):,} rows in filtered data')
    generator = SampleGenerator(seed=seed)
    sample = pd.DataFrame()
    if mode == 'monthly':
        if extend:
            logger.info('Extending sample by evenly spread months based on seed "{}"...'.format(generator.seed))
            logger.info('Reading unavailable tweets...')
            df_unavailable = get_uploaded_batched_data('unavailable')
            unused_ids = set(df_unavailable['tweet_id'])
            unused = df[df.id.isin(unused_ids)].copy()
            unbins, _ = generator.create_month_bins(unused)
            logger.info('Reading available tweets...')
            df_available = get_uploaded_batched_data('available')
            used_ids = set(df_available['tweet_id'])
            used = df[df.id.isin(used_ids)].copy()
            ubins, _ = generator.create_month_bins(used)
            logger.info('Generating sample...')
        else:
            unbins = None
            ubins = None
            logger.info('Generating sample by evenly spread months...')
        bins, bin_type = generator.create_month_bins(df)
        sample = generator.create_sample(bins, size=size, bins_unused=unbins, bins_used=ubins, bin_size=bin_size)
    elif mode == 'random':
        logger.info('Generating random sample...')
        sample = generator.random_sample(df, size)
    # anonymize
    if anonymize:
        logger.info('Anonymizing sample...')
        sample.loc[:, 'text'] = sample.text.apply(ProcessTweet.anonymize_text)
    generator.write_sample(
        sample, mode,
        size=('bin' + str(bin_size)) if size is None else size,
        min_date=min_date, max_date=max_date, flags=flags)
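# Hedged usage sketch: drawing a monthly-stratified sample of English tweets,
# excluding replies and tweets already sampled in earlier batches. All values are
# illustrative; extend=True assumes previously uploaded/sampled batches exist so
# get_sampled_data() and get_uploaded_batched_data() can be read.
#
#   run(size=20000,
#       langs=['en'],
#       contains_keywords=True,
#       min_token_count=3,
#       mode='monthly',
#       seed=42,
#       extend=True,
#       min_date='2020-01-01',
#       max_date='2020-06-30')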