Example #1
def contains_keywords(self, labels):
    df = get_parsed_data(usecols=['id', 'contains_keywords'])
    labels = pd.merge(labels,
                      df,
                      left_on='tweet_id',
                      right_on='id',
                      how='inner')
    return labels[labels.contains_keywords]
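The merge-and-mask pattern above can be reproduced with plain pandas; the toy frames below are illustrative stand-ins for the labels and parsed data (only the column names mirror the ones used above).

import pandas as pd

# Toy stand-ins for the labels frame and the parsed-data frame used above.
labels = pd.DataFrame({'tweet_id': [1, 2, 3], 'label': ['a', 'b', 'c']})
parsed = pd.DataFrame({'id': [1, 2, 3], 'contains_keywords': [True, False, True]})

merged = pd.merge(labels, parsed, left_on='tweet_id', right_on='id', how='inner')
print(merged[merged.contains_keywords])  # keep only rows flagged True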
def get_usernames(data_type='annotation', num=5000, seed=42):
    if data_type == 'annotation':
        df = pd.read_pickle(annotation_data)
        df = df.dropna(subset=['username'])
        df = df.drop_duplicates(subset=['username'])
        usernames = df.username.sample(num, random_state=seed)
    elif data_type == 'raw':
        df = get_parsed_data(usecols=['user.screen_name'])
        df = df.dropna(subset=['user.screen_name'])
        df = df.drop_duplicates(subset=['user.screen_name'])
        usernames = df['user.screen_name'].sample(num, random_state=seed)
    else:
        raise ValueError(f'Unsupported data_type: {data_type}')
    return usernames
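A minimal, self-contained sketch of the dropna / drop_duplicates / sample chain used by get_usernames; the frame below is made up, whereas the real data comes from annotation_data or get_parsed_data.

import pandas as pd

# Made-up frame standing in for the pickled annotation data.
df = pd.DataFrame({'username': ['alice', 'bob', None, 'alice', 'carol']})
df = df.dropna(subset=['username']).drop_duplicates(subset=['username'])
usernames = df.username.sample(2, random_state=42)  # reproducible subsample
print(usernames.tolist())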
Example #3
def prepare_predict(args):
    # paths
    date_key = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    f_out_folder = os.path.join(find_project_root(), 'data', 'other',
                                'prepare_prediction', date_key)
    f_path_txt = os.path.join(f_out_folder, 'text.csv')
    f_path_meta = os.path.join(f_out_folder, 'id_created_at.csv')
    if os.path.isdir(f_out_folder):
        raise Exception(f'Folder {f_out_folder} already exists')
    os.makedirs(f_out_folder)
    # read data
    logger.info('Reading raw data...')
    df = get_parsed_data(num_files=None,
                         s_date=args.start_date,
                         e_date=args.end_date,
                         usecols=['id', 'created_at', 'text'])
    if args.start_date is not None:
        df = df[df.created_at > args.start_date]
    if args.end_date is not None:
        df = df[df.created_at < args.end_date]
    logger.info('Sorting...')
    df = df.sort_values('created_at')
    if args.anonymize:

        def anonymize(df_chunk):
            return df_chunk.apply(ProcessTweet.anonymize_text,
                                  url_filler=args.url_filler,
                                  user_filler=args.user_filler)

        logger.info('Anonymize...')
        anonymize_delayed = joblib.delayed(anonymize)
        num_cores = max(multiprocessing.cpu_count() - 1, 1)
        parallel = joblib.Parallel(n_jobs=num_cores)
        res = parallel(
            anonymize_delayed(df_chunk)
            for df_chunk in tqdm(np.array_split(df['text'], num_cores),
                                 unit='chunk'))
        df['text'] = pd.concat(res)
    # write data
    logger.info(f'Writing text column to {f_path_txt}...')
    df[['text']].to_csv(f_path_txt, index=False, quoting=csv.QUOTE_NONE)
    logger.info(f'Writing id/created_at column to {f_path_meta}...')
    df[['id', 'created_at']].to_csv(f_path_meta, index=False)
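The chunked joblib anonymization in prepare_predict can be exercised in isolation; anonymize_chunk below is only a stand-in for ProcessTweet.anonymize_text, which is not shown here.

import multiprocessing

import joblib
import numpy as np
import pandas as pd

def anonymize_chunk(chunk):
    # Stand-in for ProcessTweet.anonymize_text: mask @mentions and URLs.
    chunk = chunk.str.replace(r'@\w+', '@<user>', regex=True)
    return chunk.str.replace(r'https?://\S+', '<url>', regex=True)

texts = pd.Series(['hi @bob see https://example.com', 'plain tweet'] * 4)
# Avoid empty chunks when there are more cores than rows.
num_chunks = min(max(multiprocessing.cpu_count() - 1, 1), len(texts))
res = joblib.Parallel(n_jobs=num_chunks)(
    joblib.delayed(anonymize_chunk)(chunk)
    for chunk in np.array_split(texts, num_chunks))
texts = pd.concat(res)
print(texts.head())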
Example #4
def prepare(self):
    parser = ArgParseDefault(description='Prepare text to be translated')
    logger = logging.getLogger(__name__)
    parser.add_argument('--geo-only', dest='geo_only', action='store_true', help='Only use geo-tagged data')
    parser.add_argument('-d', '--dtype', dest='dtype', choices=['original', 'anonymized', 'encrypted'], default='anonymized', help='Data source type')
    parser.add_argument('-l', '--limit', dest='limit', type=int, default=-1, help='If set, only extract random subsample')
    args = parser.parse_args(sys.argv[2:])
    # load data
    logger.info('Loading data...')
    df = get_parsed_data(dtype=args.dtype, usecols=['id', 'text', 'is_duplicate', 'has_place', 'has_coordinates', 'is_retweet'])
    # filter
    if args.geo_only:
        df = df[(df.has_place | df.has_coordinates) & (~df.is_duplicate) & (~df.is_retweet)]
    else:
        df = df[(~df.is_duplicate) & (~df.is_retweet)]
    if args.limit > 0:
        df = df.sample(args.limit)
    # write data
    folder = os.path.join(find_folder('other'), 'translations')
    if not os.path.isdir(folder):
        os.makedirs(folder)
    f_path = os.path.join(folder, 'prepare_{}.csv'.format(get_df_hash(df)[:5]))
    logger.info('Writing {:,} records to file {}...'.format(len(df), f_path))
    df[['id', 'text']].to_csv(f_path, index=False)
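get_df_hash is project-specific and not shown; a plausible stand-in that derives a short content hash for the output filename (it may not match the project's implementation) is:

import hashlib

import pandas as pd

def df_hash(df):
    # Hash the CSV representation of the frame; only a stand-in for get_df_hash.
    return hashlib.md5(df.to_csv(index=False).encode('utf-8')).hexdigest()

df = pd.DataFrame({'id': [1, 2], 'text': ['a', 'b']})
print('prepare_{}.csv'.format(df_hash(df)[:5]))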
def main(args):
    """
    This script creates new files in preprocess/data/other/pretrain containing tweets to be used for pretraining language models.
    It excludes training data and duplicates.
    """
    # load data
    logger.info('Reading data...')
    usecols = [
        'id', 'text', 'lang', 'token_count', 'is_retweet', 'contains_keywords'
    ]
    df = get_parsed_data(usecols=usecols, num_files=args.num_files)
    logger.info(f'...loaded a total of {len(df):,} tweets')

    # Filter retweets
    if 'retweets' in args.filters:
        logger.info(f'Filter retweets...')
        num_before = len(df)
        df = df[~df.is_retweet]
        num_after = len(df)
        logger.info(
            f'... {num_after:,} remaining (removed {num_before-num_after:,})')

    # Filtering by keyword
    if 'contains_keywords' in args.filters:
        logger.info(f'Filter contains_keywords...')
        num_before = len(df)
        df = df[df.contains_keywords]
        num_after = len(df)
        logger.info(
            f'... {num_after:,} remaining (removed {num_before-num_after:,})')

    # filter lang
    if args.lang is not None:
        logger.info(f'Filter lang {args.lang}...')
        num_before = len(df)
        df = df[df.lang == args.lang]
        num_after = len(df)
        logger.info(
            f'... {num_after:,} remaining (removed {num_before-num_after:,})')

    # filter min tokens
    if args.min_tokens > 0:
        logger.info(f'Filter has >={args.min_tokens} tokens...')
        num_before = len(df)
        df = df[df.token_count >= args.min_tokens]
        num_after = len(df)
        logger.info(
            f'... {num_after:,} remaining (removed {num_before-num_after:,})')

    # generate text column to filter for duplicates
    logger.info('Remove duplicates...')
    num_before = len(df)
    df.loc[:, 'text_cleared'] = df.text.apply(generate_text_cleared)
    df = df.drop_duplicates(subset=['text_cleared'])
    num_after = len(df)
    logger.info(
        f'... {num_after:,} remaining (removed {num_before-num_after:,})')

    # shuffle
    logger.info('Shuffle...')
    df = df.sample(frac=1)

    # write output file
    num_lines = len(df)
    logger.info(f'Collected total of {num_lines:,} examples')
    num_train = max(int(0.8 * num_lines), num_lines - int(2e5))
    ts = datetime.datetime.now().strftime('%Y_%m_%d-%H-%M_%S')
    for (_s, _e), _type in zip([(None, num_train), (num_train, None)],
                               ['train', 'dev']):
        _df = df[_s:_e]
        logger.info(f'Writing {len(_df):,} examples for {_type} data...')
        output_folder = os.path.join(find_folder('other'), 'pretrain',
                                     f'run_{ts}', _type)
        if not os.path.isdir(output_folder):
            os.makedirs(output_folder)
        if args.no_parallel:
            num_cpus = 1
        else:
            num_cpus = max(multiprocessing.cpu_count() - 1, 1)
        parallel = joblib.Parallel(n_jobs=num_cpus)
        write_output_file_delayed = joblib.delayed(write_output_file)
        res = parallel((write_output_file_delayed(
            _df.iloc[i:(i + args.max_examples_per_file)],
            os.path.join(output_folder, f'pretrain_{_type}_{j:03}.txt'))
                        for j, i in enumerate(
                            trange(0, len(_df), args.max_examples_per_file))))
        logger.info(
            f'Successfully wrote {len(res):,} file(s) to folder {output_folder}'
        )
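The split above keeps at least 80% of the examples for training and caps the dev set at 200,000 examples; a quick check of the arithmetic:

# num_train = max(0.8 * n, n - 200,000): a plain 80/20 split for small corpora,
# a dev set capped at 200k examples for large ones.
for n in (10_000, 500_000, 5_000_000):
    num_train = max(int(0.8 * n), n - int(2e5))
    print(f'{n:,} -> {num_train:,} train / {n - num_train:,} dev')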
Example #6
def run(size=None,
        langs=None,
        include_replies=False,
        anonymize=True,
        contains_keywords=False,
        min_token_count=3,
        mode='monthly',
        seed=None,
        extend=False,
        bin_size=None,
        min_date=None,
        max_date=None):
    if bin_size is None:
        logger.info('Creating sample of size {:,}...'.format(size))
    else:
        logger.info('Creating sample of size {:,} or bin size {:,}...'.format(
            size, bin_size))
    df = get_parsed_data(usecols=[
        'id', 'text', 'created_at', 'lang', 'is_reply', 'has_quote',
        'token_count'
    ],
                         contains_keywords=contains_keywords,
                         s_date=min_date,
                         e_date=max_date)
    logger.info(f'Read {len(df):,} samples. Filtering...')
    flags = ''
    # Filter by date
    if min_date is not None or max_date is not None:
        logger.info('Filtering by dates...')
        df = df.set_index('created_at')[min_date:max_date].reset_index()
    # Min token count
    if isinstance(min_token_count, int):
        logger.info('Filtering by min_token_count...')
        df = df[df.token_count > min_token_count]
    if not include_replies:
        # by default filter replies
        df = df[~df.is_reply]
    else:
        logger.info('Including replies...')
        flags += '_include_replies'
    # Contains keywords
    if contains_keywords:
        # data was already filtered in get_parsed_data
        logger.info('Filtered for contains_keywords...')
        flags += '_contains_keywords'
    # Filter by language
    if isinstance(langs, list):
        if len(langs) > 0:
            logger.info('Filtering for languages {}...'.format(
                ','.join(langs)))
            df = df[df.lang.isin(langs)]
            flags += '_langs_{}'.format(','.join(langs))
    # Filter previous
    if extend:
        logger.info('Extending previous sampled data...')
        flags += '_extended'
        df_sampled = get_sampled_data()
        df = df[~df.id.isin(df_sampled.tweet_id)]
        df = df[~df.text.isin(df_sampled.tweet_text)]
    # is_duplicate only marks duplicates before replacing <url> and @user tokens
    logger.info('Final screening for duplicates...')
    df['text_cleared'] = df.text.str.replace(r'@<user>|<url>', '', regex=True)
    df['text_cleared'] = df.text_cleared.str.strip()
    df = df.drop_duplicates(subset=['text_cleared'])
    df = df.drop(['text_cleared'], axis=1)  # release memory
    logger.info(f'... {len(df):,} rows in filtered data')
    generator = SampleGenerator(seed=seed)
    sample = pd.DataFrame()
    if mode == 'monthly':
        if extend:
            logger.info(
                'Extending sample by evenly spread months based on seed "{}"...'
                .format(generator.seed))
            logger.info('Reading unavailable tweets...')
            df_unavailable = get_uploaded_batched_data('unavailable')
            unused_ids = set(df_unavailable['tweet_id'])
            unused = df[df.id.isin(unused_ids)].copy()
            unbins, _ = generator.create_month_bins(unused)
            logger.info('Reading available tweets...')
            df_available = get_uploaded_batched_data('available')
            used_ids = set(df_available['tweet_id'])
            used = pd.DataFrame(df[df.id.isin(used_ids)].copy())
            ubins, _ = generator.create_month_bins(used)
            logger.info('Generating sample...')
        else:
            unbins = None
            ubins = None
            logger.info('Generating sample by evenly spread months...')
        bins, bin_type = generator.create_month_bins(df)
        sample = generator.create_sample(bins,
                                         size=size,
                                         bins_unused=unbins,
                                         bins_used=ubins,
                                         bin_size=bin_size)
    elif mode == 'random':
        logger.info('Generating random sample...')
        sample = generator.random_sample(df, size)
    # anonymize
    if anonymize:
        logger.info('Anonymizing sample...')
        sample.loc[:, 'text'] = sample.text.apply(ProcessTweet.anonymize_text)
    generator.write_sample(sample,
                           mode,
                           size=('bin' +
                                 str(bin_size)) if size is None else size,
                           min_date=min_date,
                           max_date=max_date,
                           flags=flags)
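SampleGenerator and its month bins are not shown here; a minimal per-month sampling sketch in plain pandas (the column names mirror the ones above, and bin_size is illustrative) would look like:

import pandas as pd

df = pd.DataFrame({
    'created_at': pd.to_datetime(
        ['2020-01-05', '2020-01-20', '2020-02-03', '2020-02-28']),
    'text': ['a', 'b', 'c', 'd'],
})
bin_size = 1  # sample up to bin_size tweets from every calendar month
sample = (df.groupby(pd.Grouper(key='created_at', freq='M'), group_keys=False)
            .apply(lambda g: g.sample(min(len(g), bin_size), random_state=42)))
print(sample)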