示例#1
0
 def init(self):
     """Run the ``init`` subcommand.

     Parses the options found from ``sys.argv[2]`` onwards and passes the
     project name and the template flag to ``utils.task_helpers.init``.
     """
     from utils.task_helpers import init as init_project
     cli = ArgParseDefault(description='Initialize project')
     cli.add_argument(
         '-p', '--project',
         dest='project',
         type=str,
         default='',
         required=False,
         help='Name of project to initialize')
     cli.add_argument(
         '--template',
         dest='template',
         default=False,
         action='store_true',
         help='Initialize project manually.')
     # The first two argv entries are skipped (presumably script name and
     # subcommand -- confirm against the dispatcher).
     opts = cli.parse_args(sys.argv[2:])
     init_project(opts.project, opts.template)
示例#2
0
 def sync(self):
     """Run the ``sync`` subcommand.

     Parses the options found from ``sys.argv[2]`` onwards and forwards the
     chosen data type and the optional day window to
     ``utils.task_helpers.sync``.
     """
     from utils.task_helpers import sync as sync_data
     cli = ArgParseDefault(description='Sync project data from S3')
     cli.add_argument(
         '-s', '--source',
         default='all',
         required=False,
         choices=['all', 'streaming', 'annotation', 'media'],
         help='Type of data to be synced. By default sync all data belonging to this project.')
     cli.add_argument(
         '-l', '--last',
         type=int,
         required=False,
         help='Sync streaming data of last n days')
     opts = cli.parse_args(sys.argv[2:])
     sync_data(data_type=opts.source, last_n_days=opts.last)
示例#3
0
def parse_args():
    """Parse the command line and return the parsed arguments.

    The only option is the mandatory ``--run_name``.
    """
    cli = ArgParseDefault()
    cli.add_argument(
        '--run_name',
        help='Run name to plot confusion matrix for',
        required=True)
    return cli.parse_args()
def parse_args():
    """Parse the plotting options from the command line.

    All options are optional and fall back to the defaults listed below.
    Returns the parsed arguments.
    """
    cli = ArgParseDefault()
    # Plain string options: (flag, default value, help text).
    string_options = (
        ('--bucket_name', 'cb-tpu-us-central1', 'Bucket name'),
        ('--project_name', 'covid-bert-v2', 'Project name'),
        ('--run_prefix', 'ct_bert_v2_eval', 'Run prefix'),
        ('--metric', 'f1_macro', 'Metric to plot'),
    )
    for flag, fallback, description in string_options:
        cli.add_argument(flag, default=fallback, help=description)
    cli.add_argument('-v', '--version', help='Plot version', default=6, type=int)
    return cli.parse_args()
示例#5
0
def parse_args():
    """Parse the command line and return the parsed arguments.

    ``--bucket_name`` is mandatory; ``--model_class`` is restricted to the keys
    of ``PRETRAINED_MODELS`` and ``--output`` accepts one or more model types.
    """
    cli = ArgParseDefault()
    cli.add_argument('--bucket_name', help='Bucket name', required=True)
    cli.add_argument('--init_checkpoint', help='Path to checkpoint')
    cli.add_argument(
        '--model_class',
        choices=PRETRAINED_MODELS.keys(),
        default='bert_large_uncased_wwm',
        help='Model class to use')
    cli.add_argument(
        '--project_name',
        default='covid-bert',
        help='Name of subfolder in Google bucket')
    cli.add_argument(
        '--output',
        nargs='+',
        choices=['tf_hub', 'huggingface'],
        default=['tf_hub', 'huggingface'],
        help='Generate output for those model types')
    return cli.parse_args()
示例#6
0
def parse_args():
    """Parse the mandatory input/output folder options and return them."""
    cli = ArgParseDefault()
    cli.add_argument('--input_folder', help='Path to input folder.', required=True)
    cli.add_argument('--output_folder', help='Path to output folder.', required=True)
    return cli.parse_args()
示例#7
0
 def __init__(self):
     """Configure logging and dispatch to the requested subcommand.

     The first command-line argument is taken as the name of a method on
     this object and that method is called without arguments. If the name
     does not resolve to a public, callable attribute, the help text is
     printed and the process exits with status 1.
     """
     logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)-5.5s] [%(name)-12.12s]: %(message)s')
     parser = ArgParseDefault(
             description='',
             usage=USAGE_DESC)
     parser.add_argument('command', help='Subcommand to run')
     # Only the first argument is parsed here; the remaining ones are left
     # untouched for the invoked handler.
     args = parser.parse_args(sys.argv[1:2])
     # A bare ``hasattr`` check also accepts dunder/private names (e.g.
     # ``__init__``, which would recurse forever) and non-callable
     # attributes, so both are ruled out explicitly.
     handler = None
     if not args.command.startswith('_'):
         handler = getattr(self, args.command, None)
     if not callable(handler):
         print('Unrecognized command')
         parser.print_help()
         sys.exit(1)
     handler()
示例#8
0
def parse_args():
    """Parse the heatmap plotting options and return the parsed arguments.

    Every option is optional and has a default.
    """
    cli = ArgParseDefault()
    cli.add_argument('--run_prefix', help='Prefix to plot heatmap', default='wwm_v2')
    cli.add_argument('-y', help='Y-axis column', default='train_batch_size')
    cli.add_argument('-x', help='X-axis column', default='learning_rate')
    cli.add_argument('--metric', help='Metric to plot', default='f1_macro')
    cli.add_argument('-v', '--version', help='Plot version', default=1, type=int)
    return cli.parse_args()
示例#9
0
def parse_args():
    """Parse the command line and return the parsed arguments.

    ``--bucket_name`` is mandatory; the remaining options have defaults.
    """
    cli = ArgParseDefault()
    cli.add_argument(
        '--bucket_name',
        help='Bucket name',
        required=True)
    cli.add_argument(
        '--project_name',
        help='Project name',
        default='covid-bert')
    cli.add_argument(
        '--run_prefix',
        help='Prefix to plot heatmap',
        default='wwm_v2')
    cli.add_argument(
        '-v', '--version',
        help='Plot version',
        default=2,
        type=int)
    return cli.parse_args()
示例#10
0
def parse_args():
    """Parse the command line and return the parsed arguments.

    ``--bucket_name`` is mandatory; the remaining options have defaults.
    """
    cli = ArgParseDefault()
    cli.add_argument('--bucket_name', help='Bucket name', required=True)
    cli.add_argument('--project_name', help='Project name', default='covid-bert')
    cli.add_argument('--run_prefix', help='Run prefix', default='eval_wwm_v4')
    cli.add_argument('--metric', help='Metric to plot', default='f1_macro')
    cli.add_argument('-v', '--version', help='Plot version', default=2, type=int)
    return cli.parse_args()
示例#11
0
def parse_args():
    """Parse the command line and return the parsed arguments.

    No option is mandatory; ``--bucket_name`` and ``--project_name`` have no
    explicit default.
    """
    cli = ArgParseDefault()
    cli.add_argument('--run_prefix', help='Prefix to plot heatmap', default='eval_wwm_v2')
    cli.add_argument('--bucket_name', help='Bucket name')
    cli.add_argument('--project_name', help='Project name (subfolder in bucket)')
    cli.add_argument('--metric', help='Metric to plot', default='f1_macro')
    cli.add_argument('-v', '--version', help='Plot version', default=9, type=int)
    return cli.parse_args()
示例#12
0
 def prepare_predict(self):
     """Run the ``prepare_predict`` subcommand.

     Parses the options found from ``sys.argv[2]`` onwards and hands the
     whole parsed result to ``utils.task_helpers.prepare_predict``.
     """
     from utils.task_helpers import prepare_predict as run_prepare_predict
     # The description literal is kept exactly as written, including the
     # whitespace carried in by the line continuation.
     cli = ArgParseDefault(description='Prepare data for prediction with the text-classification library. \
             This function generates two files (1 for text 1 for IDs/created_at) under data/other. The text.csv file can then be predicted.')
     cli.add_argument(
         '--start_date',
         default=None,
         required=False,
         help='Filter start date')
     cli.add_argument(
         '--end_date',
         default=None,
         required=False,
         help='Filter end date')
     add_bool_arg(
         cli,
         'anonymize',
         help='Replace usernames and URLs with filler (@user and <url>)',
         default=True)
     cli.add_argument(
         '--url_filler',
         default='<url>',
         required=False,
         help='Filler for urls (if anonymize)')
     cli.add_argument(
         '--user_filler',
         default='@user',
         required=False,
         help='Filler for user names (if anonymize)')
     opts = cli.parse_args(sys.argv[2:])
     run_prepare_predict(opts)
def parse_args():
    """Parse the checkpoint conversion options and return the parsed arguments.

    ``--input_checkpoint`` and ``--output_checkpoint`` are mandatory.
    ``--num_heads`` is converted to ``int`` so that a value given on the
    command line has the same type as the default.
    """
    parser = ArgParseDefault()
    parser.add_argument('--input_checkpoint',
                        required=True,
                        help='Path to v1 checkpoint')
    parser.add_argument('--output_checkpoint',
                        required=True,
                        help='Path to checkpoint to be written out.')
    # Without ``type=int`` a user-supplied value would arrive as ``str`` while
    # the default is an ``int``.
    parser.add_argument('--num_heads',
                        default=16,
                        type=int,
                        help='Number of attention heads')
    # NOTE(review): the two defaults below are module-level constants; a value
    # passed on the command line arrives as a plain string -- confirm whether
    # these options are meant to be overridden from the CLI at all.
    parser.add_argument('--name_replacements',
                        default=BERT_NAME_REPLACEMENTS,
                        help='Name replacements')
    parser.add_argument('--name_permutations',
                        default=BERT_PERMUTATIONS,
                        help='Name permutations')
    args = parser.parse_args()
    return args
示例#14
0
 def batch(self):
     """Run the ``batch`` subcommand.

     Parses the options found from ``sys.argv[2]`` onwards, then either
     prints sampling stats (``--stats-only``) or generates a new batch
     through ``SampleGenerator``.
     """
     from utils.processing.sample_tweets import SampleGenerator
     cli = ArgParseDefault(description='Generate new batch for labelling. As a result a new csv will be created in `data/2_sampled/batch_{batch_id}/`')
     cli.add_argument(
         '-N', '--num_tweets',
         default=None,
         type=int,
         help='The number of tweets to be generated in new batch')
     cli.add_argument(
         '-b', '--batch',
         default=None,
         type=int,
         help='The batch id to be generated, default: Automatically find next batch')
     cli.add_argument(
         '--ignore-previous',
         dest='ignore_previous',
         default=False,
         action='store_true',
         help='Also sample tweets from old batches which were not annotated')
     cli.add_argument(
         '--stats-only',
         dest='stats_only',
         default=False,
         action='store_true',
         help='Show stats only')
     opts = cli.parse_args(sys.argv[2:])
     generator = SampleGenerator()
     if not opts.stats_only:
         generator.generate_batch(num_tweets=opts.num_tweets, batch_id=opts.batch, ignore_previous=opts.ignore_previous)
     else:
         generator.stats(ignore_previous=opts.ignore_previous)
示例#15
0
def parse_args():
    """Parse the command line and return the parsed arguments.

    ``--input_file`` and ``--output_folder`` are mandatory; ``--config_file``
    defaults to ``config.json``.
    """
    cli = ArgParseDefault()
    cli.add_argument(
        '--input_file',
        help='Path to input file.',
        required=True)
    cli.add_argument(
        '--output_folder',
        help='Path to output folder.',
        required=True)
    cli.add_argument(
        '--config_file',
        help='Needs to be placed in output folder. Overrides the default config.json',
        default="config.json",
        required=False)
    return cli.parse_args()
示例#16
0
 def parse(self):
     """Run the ``parse`` subcommand.

     Parses the options found from ``sys.argv[2]`` onwards and forwards them
     as keyword arguments to ``utils.processing.parse_tweets.run``.
     """
     import utils.processing.parse_tweets as parse_tweets
     cli = ArgParseDefault(description='Preprocess raw data to create parquet files in `data/1_parsed`')
     cli.add_argument(
         '--no-parallel',
         dest='no_parallel',
         default=False,
         action='store_true',
         help='Do not run in parallel')
     cli.add_argument(
         '--extend',
         dest='extend',
         default=False,
         action='store_true',
         help='Extend existing parsed data')
     cli.add_argument(
         '--ray_num_cpus',
         default=None,
         type=int,
         help='Limit the number of worker processes for Ray during the memory intensive merge phase (by default using maximum worker processes)')
     add_bool_arg(cli, 'extract_retweets', help='Extract top-level retweets', default=True)
     add_bool_arg(cli, 'extract_quotes', help='Extract top-level quotes', default=True)
     add_bool_arg(cli, 'omit_last_day', help='Omit parsing data from the last day', default=True)
     opts = cli.parse_args(sys.argv[2:])
     parse_tweets.run(
         no_parallel=opts.no_parallel,
         extract_retweets=opts.extract_retweets,
         extract_quotes=opts.extract_quotes,
         extend=opts.extend,
         omit_last_day=opts.omit_last_day,
         ray_num_cpus=opts.ray_num_cpus)
示例#17
0
def parse_args():
    """Parse the plotting options and return the parsed arguments.

    Every option is optional and has a default.
    """
    cli = ArgParseDefault()
    cli.add_argument('--run_prefix', help='Prefix to plot heatmap', default='eval_wwm_v2')
    cli.add_argument('--metric', help='Metric to plot', default='accuracy')
    cli.add_argument('-v', '--version', help='Plot version', default=10, type=int)
    return cli.parse_args()
示例#18
0
def parse_args():
    """Parse the prediction options and return the parsed arguments.

    ``--run_name`` and ``--bucket_name`` are mandatory. Input can be given as
    a literal text, as txt files or as tfrecord files; the boolean switches
    are registered through ``add_bool_arg``.
    """
    cli = ArgParseDefault(usage=USAGE_DESCRIPTION)

    # Model location.
    cli.add_argument(
        '--run_name',
        help='Finetune run name. The model will be loaded from gs://{bucket_name}/{project_name}/finetune/runs/{run_name}.',
        required=True)
    cli.add_argument('--bucket_name', help='Bucket name', required=True)
    cli.add_argument(
        '--project_name',
        help='Name of subfolder in Google bucket',
        default='covid-bert',
        required=False)

    # Input sources.
    cli.add_argument(
        '--input_text',
        help='Predict arbitrary input text and print prediction to stdout',
        required=False)
    cli.add_argument(
        '--input_txt_files',
        help='Predict text from local txt files. One example per line.',
        required=False,
        nargs='+')
    cli.add_argument(
        '--input_tfrecord_files',
        help='Predict text from tfrecord files (local or on bucket).',
        required=False,
        nargs='+')

    # Hardware and model settings.
    cli.add_argument('--tpu_ip', help='IP-address of the TPU', required=False)
    cli.add_argument(
        '--model_class',
        help='Model class to use',
        choices=PRETRAINED_MODELS.keys(),
        default='bert_large_uncased_wwm')
    cli.add_argument('--num_gpus', help='Number of GPUs to use', type=int, default=1)
    cli.add_argument('--eval_batch_size', help='Eval batch size', type=int, default=32)
    cli.add_argument('--label_name', help='Name of label to predicted', type=str, default='label')

    # Boolean switches.
    add_bool_arg(cli, 'use_tf_hub', help='Use TF-Hub to initialize model', default=False)
    add_bool_arg(cli, 'interactive_mode', help='Interactive mode', default=False)
    add_bool_arg(
        cli,
        'use_tpu',
        help='Use TPU (only works when using input_tfrecord_files stored on a Google bucket)',
        default=False)
    return cli.parse_args()
示例#19
0
def parse_args():
    """Parse the text-cleaning options and return the parsed arguments.

    ``--input_file`` and ``--output_file`` are mandatory. The filler options
    set the replacement strings, the numeric options are converted to ``int``
    and the boolean switches are registered through ``add_bool_arg``.
    """
    parser = ArgParseDefault()
    parser.add_argument('--input_file', required=True, help='Path to input file.')
    parser.add_argument('--output_file', required=True, help='Path to output file.')
    parser.add_argument('--username_filler', default='@user', type=str, help='Username filler (ignored when replace_username option is false)')
    parser.add_argument('--url_filler', default='http://domain.com', type=str, help='URL filler (ignored when replace_urls option is false)')
    parser.add_argument('--email_filler', default='*****@*****.**', type=str, help='Email filler (ignored when replace_email option is false)')
    parser.add_argument('--digibok', default='keep', type=str, help='Handling of digibok_ids. "keep", "remove" or "auto". Last option relies on other settings in script')
    parser.add_argument('--min_alphawords', default=2, type=int, help='The minimum number of letter-only words with a length of at least 2. Keeps empty lines.')
    # ``type=int`` keeps a value given on the command line consistent with the
    # integer default; without it the value would arrive as ``str``.
    parser.add_argument('--max_words_in_section', required=False, default=1000, type=int, help='After reaching this maximum number of words, the next paragraph will be split into a new section.')

    #parser.add_argument('--num_logged_samples', default=10, type=int, help='Log first n samples to output')
    #add_bool_arg(parser, 'run_in_parallel', default=True, help='Run script in parallel')
    add_bool_arg(parser, 'replace_usernames', default=False, help='Replace usernames with filler. Mainly for tweets')
    add_bool_arg(parser, 'replace_urls', default=False, help='Replace URLs with filler')
    add_bool_arg(parser, 'replace_email', default=True, help='Replace emails with filler')
    add_bool_arg(parser, 'fix_unicode', default=True, help='Use ftfy to fix and standardise unicode. Converts it all to valid utf-8')
    # The help text used to claim "On by default" although the default is False.
    add_bool_arg(parser, 'asciify_emojis', default=False, help='Asciify emojis. Mainly useful for social media')
    add_bool_arg(parser, 'replace_multiple_usernames', default=False, help='Replace "@user @user" with "2 <username_filler>". Mainly for use on tweets')
    add_bool_arg(parser, 'standardize', default=True, help='Standardize text. Remove all control characters.')
    add_bool_arg(parser, 'replace_multiple_urls', default=False, help='Replace "http://... http://.." with "2 <url_filler>". Mainly for use on tweets')
    add_bool_arg(parser, 'remove_unicode_symbols', default=True, help='After preprocessing remove characters which belong to unicode category "So"')
    add_bool_arg(parser, 'remove_accented_characters', default=False, help='Remove accents/asciify everything. Probably not recommended.')
    add_bool_arg(parser, 'standardize_punctuation', default=True, help='Standardize (asciifyi) special punctuation')
    add_bool_arg(parser, 'do_lower_case', default=False, help='Convert text to lower case')
    args = parser.parse_args()
    return args
示例#20
0
 def split(self):
     """Run the ``split`` subcommand.

     Parses the options found from ``sys.argv[2]`` onwards and forwards them
     as keyword arguments to ``utils.task_helpers.train_dev_test_split``.
     """
     from utils.task_helpers import train_dev_test_split
     cli = ArgParseDefault(description='Split annotated data into training and test data set')
     cli.add_argument(
         '--question',
         default='sentiment',
         required=False,
         type=str,
         help='Which data to load (has to be a valid question tag)')
     cli.add_argument(
         '--name',
         default='',
         required=False,
         type=str,
         help='In case there are multiple cleaned labelled data output files give name of file (without csv ending), default: No name provided (works only if a single file is present).')
     cli.add_argument(
         '--balanced-labels',
         dest='balanced_labels',
         default=False,
         action='store_true',
         help='Ensure equal label balance')
     cli.add_argument(
         '--all-questions',
         dest='all_questions',
         default=False,
         action='store_true',
         help='Generate files for all available question tags. This overwrites the `question` argument. Default: False.')
     cli.add_argument(
         '--label-tags',
         dest='label_tags',
         default=[],
         required=False,
         nargs='+',
         help='Only select examples with certain label tags')
     cli.add_argument(
         '--has-label',
         dest='has_label',
         default='',
         required=False,
         help='Only select examples which have also been tagged with certain label')
     cli.add_argument(
         '--dev-size',
         dest='dev_size',
         default=0.2,
         required=False,
         type=float,
         help='Fraction of dev size')
     cli.add_argument(
         '--test-size',
         dest='test_size',
         default=0.2,
         required=False,
         type=float,
         help='Fraction of test size')
     cli.add_argument(
         '--seed',
         default=42,
         required=False,
         type=int,
         help='Random state split')
     opts = cli.parse_args(sys.argv[2:])
     train_dev_test_split(
         question=opts.question,
         dev_size=opts.dev_size,
         test_size=opts.test_size,
         seed=opts.seed,
         name=opts.name,
         balanced_labels=opts.balanced_labels,
         all_questions=opts.all_questions,
         label_tags=opts.label_tags,
         has_label=opts.has_label)
示例#21
0
def parse_args():
    """Parse the preprocessing options and return the parsed arguments.

    Value options are registered first; the boolean switches follow, taken
    from a table and registered through ``add_bool_arg`` in table order.
    """
    cli = ArgParseDefault()
    cli.add_argument(
        '--input_txt_files',
        help='Input txt files to process.',
        nargs='+',
        type=str)
    cli.add_argument(
        '--model_class',
        help='Model class',
        choices=PRETRAINED_MODELS.keys(),
        default='bert_large_uncased_wwm')
    cli.add_argument('--run_prefix', help='Run prefix')
    cli.add_argument(
        '--max_seq_length',
        help='Maximum sequence length',
        type=int,
        default=96)
    cli.add_argument(
        '--username_filler',
        help='Username filler',
        type=str,
        default='twitteruser')
    cli.add_argument(
        '--url_filler',
        help='URL filler (ignored when replace_urls option is false)',
        type=str,
        default='twitterurl')

    # (flag name, default value, help text) -- order determines help order.
    bool_flags = (
        ('replace_usernames', True, 'Replace usernames with filler'),
        ('replace_urls', True, 'Replace URLs with filler'),
        ('asciify_emojis', True, 'Asciifyi emojis'),
        ('replace_multiple_usernames', True,
         'Replace "@user @user" with "2 <username_filler>"'),
        ('replace_multiple_urls', True,
         'Replace "http://... http://.." with "2 <url_filler>"'),
        ('remove_unicode_symbols', True,
         'After preprocessing remove characters which belong to unicode category "So"'),
        ('standardize_punctuation', True,
         'Standardize (asciifyi) special punctuation'),
        ('remove_accented_characters', False,
         'Remove accents/asciify everything. Probably not recommended.'),
        ('write_preprocessed_file', True, 'Write preprocess output file'),
        ('run_in_parallel', True, 'Run script in parallel'),
    )
    for flag_name, flag_default, flag_help in bool_flags:
        add_bool_arg(cli, flag_name, default=flag_default, help=flag_help)

    parsed = cli.parse_args()
    return parsed
示例#22
0
def parse_args():
    """Parse the finetuning options and return the parsed arguments.

    ``--finetune_data`` and ``--bucket_name`` are mandatory; every other
    option is optional. The multi-line help literals are kept exactly as
    written, including the whitespace carried in by the line continuations.
    """
    cli = ArgParseDefault()

    # Data and infrastructure.
    cli.add_argument(
        '--finetune_data',
        help=
        'Finetune data folder sub path. Path has to be in gs://{bucket_name}/{project_name}/finetune/finetune_data/{finetune_data}.\
                    This folder includes a meta.json (containing meta info about the dataset), and a file label_mapping.json. \
                    TFrecord files (train.tfrecords and dev.tfrecords) should be located in a \
                    subfolder gs://{bucket_name}/{project_name}/finetune/finetune_data/{finetune_data}/tfrecords/',
        required=True)
    cli.add_argument('--bucket_name', help='Bucket name', required=True)
    cli.add_argument('--tpu_ip', help='IP-address of the TPU', required=False)
    cli.add_argument('--tpu_name', help='Name of the TPU (required for pods)', required=False)
    cli.add_argument('--run_prefix', help='Prefix to be added to all runs. Useful to group runs')
    cli.add_argument('--project_name', help='Name of subfolder in Google bucket', default='covid-bert')
    cli.add_argument(
        '--model_class',
        help='Model class to use',
        choices=PRETRAINED_MODELS.keys(),
        default='bert_large_uncased_wwm')
    cli.add_argument('--num_gpus', help='Number of GPUs to use', type=int, default=1)

    # Checkpoint initialisation.
    cli.add_argument(
        '--init_checkpoint',
        help=
        'Run name to initialize checkpoint from. Example: "run2/ctl_step_8000.ckpt-8". \
            By default using a pretrained model from gs://{bucket_name}/pretrained_models/',
        default=None)
    cli.add_argument(
        '--init_checkpoint_index',
        help='Checkpoint index. This argument is ignored and only added for reporting.',
        type=int)

    # Training schedule.
    cli.add_argument(
        '--repeats',
        help='Number of times the script should run. Default is 1',
        type=int,
        default=1)
    cli.add_argument('--num_epochs', help='Number of epochs', type=int, default=3)
    cli.add_argument(
        '--limit_train_steps',
        help='Limit the number of train steps per epoch. Useful for testing.',
        type=int)
    cli.add_argument(
        '--limit_eval_steps',
        help='Limit the number of eval steps per epoch. Useful for testing.',
        type=int)
    cli.add_argument('--train_batch_size', help='Training batch size', type=int, default=32)
    cli.add_argument('--eval_batch_size', help='Eval batch size', type=int, default=32)
    cli.add_argument('--learning_rate', help='Learning rate', type=float, default=2e-5)
    cli.add_argument('--end_lr', help='Final learning rate', type=float, default=0)
    cli.add_argument(
        '--warmup_proportion',
        help='Learning rate warmup proportion',
        type=float,
        default=0.1)
    cli.add_argument('--max_seq_length', help='Maximum sequence length', type=int, default=96)
    cli.add_argument(
        '--early_stopping_epochs',
        help='Stop when loss hasn\'t decreased during n epochs',
        type=int,
        default=-1)
    cli.add_argument(
        '--optimizer_type',
        help='Optimizer',
        type=str,
        choices=['adamw', 'lamb'],
        default='adamw')
    cli.add_argument(
        '--dtype',
        help='Data type',
        type=str,
        choices=['fp32', 'bf16', 'fp16'],
        default='fp32')
    cli.add_argument(
        '--steps_per_loop',
        help='Steps per loop (unavailable for Keras fit in TF 2.2, will be added in later version)',
        type=int,
        default=10)
    cli.add_argument(
        '--validation_freq',
        help='Validation frequency. Run eval after specified epochs. Default: After every epoch',
        nargs='+',
        type=int,
        default=None)
    cli.add_argument(
        '--time_history_log_steps',
        help='Frequency with which to log timing information with TimeHistory.',
        type=int,
        default=10)

    # Boolean switches.
    add_bool_arg(cli, 'use_tpu', help='Use TPU', default=True)
    add_bool_arg(cli, 'save_model', help='Save model checkpoint(s)', default=True)
    return cli.parse_args()
def parse_args():
    """Parse the filtering options and return the parsed arguments.

    ``--filters`` always yields a list (one or more of the allowed filter
    names) and ``--min_tokens`` is converted to ``int``, so values given on
    the command line have the same types as the defaults.
    """
    parser = ArgParseDefault()
    # ``nargs='+'`` makes a user-supplied value a list, matching the list
    # default; without it a single bare string would be stored instead.
    parser.add_argument('--filters',
                        choices=['retweets', 'contains_keywords'],
                        default=['retweets'],
                        nargs='+',
                        help="Apply filters")
    parser.add_argument('--lang', default='en', help="Filter language")
    # ``type=int`` keeps command-line values consistent with the int default.
    parser.add_argument('--min_tokens',
                        default=3,
                        type=int,
                        help="Min num tokens")
    parser.add_argument('--max_examples_per_file',
                        default=int(1e6),
                        type=int,
                        help="Max examples per file")
    parser.add_argument('--num_files',
                        default=None,
                        type=int,
                        help="Only read n files from file")
    parser.add_argument('--no_parallel',
                        action='store_true',
                        default=False,
                        help='Do not run in parallel')
    args = parser.parse_args()
    return args
def parse_args():
    """Parse the tfrecord-creation options and return the parsed arguments.

    ``--run_name`` is mandatory; all other options have defaults. The
    multi-line help literals are kept exactly as written, including the
    whitespace carried in by the line continuations.
    """
    cli = ArgParseDefault()
    cli.add_argument(
        '--run_name',
        help=
        'Run name to create tf record files for. Run folder has to be located under \
            data/pretrain/{run_name}/preprocessed/ and must contain one or multiple txt files. May also contain train and dev subfolders with txt files.',
        required=True)
    cli.add_argument('--max_seq_length', help='Maximum sequence length', type=int, default=96)
    cli.add_argument(
        '--model_class',
        help='Model class to use',
        choices=PRETRAINED_MODELS.keys(),
        default='bert_large_uncased_wwm')
    cli.add_argument(
        '--dupe_factor',
        help='Number of times to duplicate the input data (with different masks).',
        type=int,
        default=10)
    cli.add_argument(
        '--short_seq_prob',
        help='Probability of creating sequences which are shorter than the maximum length.',
        type=float,
        default=0.1)
    cli.add_argument(
        '--max_predictions_per_seq',
        help='Maximum number of masked LM predictions per sequence.',
        type=int,
        default=14)
    cli.add_argument('--random_seed', help='Random seed', type=int, default=42)
    cli.add_argument('--masked_lm_prob', help='Masked LM probabibility', type=float, default=0.15)
    cli.add_argument(
        '--gzipped',
        help='Create gzipped tfrecords files',
        default=False,
        action='store_true')
    cli.add_argument(
        '--num_logged_samples',
        help='Log first n samples to output',
        type=int,
        default=10)
    cli.add_argument(
        '--max_num_cpus',
        help=
        'Adapt this number based on the available memory/size of input files. \
            This code was tested on a machine with a lot of memory (250GB). Decrease this number if you run into memory issues.',
        type=int,
        default=10)
    add_bool_arg(cli, 'run_in_parallel', help='Run script in parallel', default=True)
    parsed = cli.parse_args()
    return parsed
示例#25
0
 def stats(self):
     """Run the ``stats`` subcommand.

     ``sys.argv[2]`` selects the stats type. ``annotation`` and
     ``annotator_outliers`` parse further options from ``sys.argv[3:]`` and
     pass them as keyword arguments to ``utils.task_helpers.stats``; every
     other type is passed on by name only.
     """
     from utils.task_helpers import stats as run_stats
     selector = ArgParseDefault(description='Output various stats about project', usage=STATS_USAGE_DESC)
     selector.add_argument(
         'command',
         choices=['all', 'overview', 'sample', 'annotation', 'annotator_outliers', 'annotation_cleaned'],
         help='Subcommand to run')
     kind = selector.parse_args(sys.argv[2:3]).command

     if kind == 'annotation':
         sub = ArgParseDefault(description='Print stats about annotations')
         sub.add_argument(
             '-m', '--mode',
             default='all',
             required=False,
             type=str,
             choices=['all', 'mturk', 'local', 'public', 'other', '*'],
             help='Print stats for certain annotation modes only.')
         sub_opts = sub.parse_args(sys.argv[3:])
         run_stats('annotation', **vars(sub_opts))
         return

     if kind == 'annotator_outliers':
         sub = ArgParseDefault(description='Find annotators which have under-performed compared to others')
         sub.add_argument(
             '-m', '--mode',
             default='mturk',
             required=False,
             type=str,
             choices=['mturk', 'local', 'public', 'other'],
             help='Print stats for certain annotation modes only.')
         sub.add_argument(
             '-b', '--batch-name',
             dest='batch_name',
             default='*',
             required=False,
             type=str,
             help='Only analyse for specific local/mturk batch name (this looks for a pattern in filename). Default: All data')
         sub.add_argument(
             '--agreement-cutoff',
             dest='agreement_cutoff',
             default=3,
             required=False,
             type=float,
             help='Z-value cutoff for inter-worker agreement deviation')
         sub.add_argument(
             '--time-cutoff',
             dest='time_cutoff',
             default=3,
             required=False,
             type=float,
             help='Z-value cutoff for average task duration per worker')
         sub.add_argument(
             '--min-tasks',
             dest='min_tasks',
             default=3,
             required=False,
             type=int,
             help='Min tasks for worker to have completed before considered as outlier')
         sub.add_argument(
             '--min-comparisons-count',
             dest='min_comparisons_count',
             default=20,
             required=False,
             type=int,
             help='Min number of questions to compare for a worker needed to compute agreement score')
         sub_opts = sub.parse_args(sys.argv[3:])
         run_stats('annotator_outliers', **vars(sub_opts))
         return

     # Remaining stats types take no further options.
     run_stats(kind)
示例#26
0
 def sample(self):
     """Run the ``sample`` subcommand.

     Parses the options found from ``sys.argv[2]`` onwards and forwards them
     as keyword arguments to ``utils.processing.sample_tweets.run``.
     """
     import utils.processing.sample_tweets as sample_tweets
     cli = ArgParseDefault(description='Sample cleaned data to generate `data/2_sampled`')
     cli.add_argument(
         '-s', '--size',
         dest='size',
         required=True,
         type=int,
         help='Number of tweets to sample')
     cli.add_argument(
         '-bs', '--bin_size',
         required=False,
         type=int,
         help='Number of tweets per bin')
     cli.add_argument(
         '-m', '--mode',
         default='random',
         required=False,
         choices=['monthly', 'random'],
         help='Sampling mode. Random: Sample randomly. Monthly: Try to sample evenly within months.')
     cli.add_argument(
         '-l', '--langs',
         default=[],
         required=False,
         nargs='+',
         help='Filter by language(s)')
     cli.add_argument(
         '--contains_keywords',
         action='store_true',
         default=False,
         help='Only sample from tweets which include keywords')
     cli.add_argument(
         '--min_token_count',
         default=3,
         required=False,
         type=int,
         help='Minimum number of tokens')
     cli.add_argument(
         '--include_replies',
         action='store_true',
         default=False,
         help='Include replies')
     cli.add_argument(
         '--seed',
         default=None,
         required=False,
         type=int,
         help='Random state split')
     cli.add_argument(
         '--extend',
         action='store_true',
         help='Extending existing sample given by seed by removing already labelled tweets. If size is <= original sample size this has no effect except removing labelled tweets')
     add_bool_arg(cli, 'anonymize', help='Replace usernames and URLs with filler (@user and <url>)', default=True)
     cli.add_argument(
         '--max_date',
         default=None,
         required=False,
         help='Sample until date (YYYY-MM-DD), default: No max')
     cli.add_argument(
         '--min_date',
         default=None,
         required=False,
         help='Sample from date (YYYY-MM-DD), default: No min')
     opts = cli.parse_args(sys.argv[2:])
     sample_tweets.run(
         size=opts.size,
         contains_keywords=opts.contains_keywords,
         anonymize=opts.anonymize,
         min_token_count=opts.min_token_count,
         langs=opts.langs,
         include_replies=opts.include_replies,
         mode=opts.mode,
         seed=opts.seed,
         extend=opts.extend,
         bin_size=opts.bin_size,
         min_date=opts.min_date,
         max_date=opts.max_date)
def parse_args():
    """Declare the command-line options of this script and parse them.

    Options are registered on an ``ArgParseDefault`` parser in two passes:
    first the value-carrying options, then the on/off switches, which are
    added through the ``add_bool_arg`` helper (defined elsewhere).

    Returns:
        Whatever ``parser.parse_args()`` yields for the current command line.
    """
    cli = ArgParseDefault()

    # (flag, keyword arguments) pairs, registered in exactly this order.
    # The run of spaces inside the --input_data help text is kept unchanged
    # so the string handed to the parser stays the same.
    value_options = [
        ('--input_data', dict(
            required=True,
            help='Path to folder with txt files. '
                 '            Folder may contain train/dev/test subfolders. Each txt file contains the text of a single tweet per line.',
        )),
        ('--run_prefix', dict(
            help='Prefix to be added to all runs. Useful to identify runs',
        )),
        ('--model_class', dict(
            default='bert_large_uncased_wwm',
            choices=PRETRAINED_MODELS.keys(),
            help='Model class',
        )),
        ('--username_filler', dict(
            default='twitteruser',
            type=str,
            help='Username filler',
        )),
        ('--url_filler', dict(
            default='twitterurl',
            type=str,
            help='URL filler (ignored when replace_urls option is false)',
        )),
        ('--num_logged_samples', dict(
            default=10,
            type=int,
            help='Log first n samples to output',
        )),
    ]
    for flag, options in value_options:
        cli.add_argument(flag, **options)

    # (name, default, help) triples for the boolean switches.
    switches = [
        ('run_in_parallel', True, 'Run script in parallel'),
        ('replace_usernames', True, 'Replace usernames with filler'),
        ('replace_urls', True, 'Replace URLs with filler'),
        ('asciify_emojis', True, 'Asciifyi emojis'),
        ('replace_multiple_usernames', True,
         'Replace "@user @user" with "2 <username_filler>"'),
        ('replace_multiple_urls', True,
         'Replace "http://... http://.." with "2 <url_filler>"'),
        ('remove_unicode_symbols', True,
         'After preprocessing remove characters which belong to unicode category "So"'),
        ('remove_accented_characters', False,
         'Remove accents/asciify everything. Probably not recommended.'),
        ('standardize_punctuation', True,
         'Standardize (asciifyi) special punctuation'),
    ]
    for switch_name, switch_default, switch_help in switches:
        add_bool_arg(cli, switch_name, default=switch_default, help=switch_help)

    return cli.parse_args()
示例#28
0
def parse_args():
    """Declare the dataset-preparation options and parse the command line.

    Value-carrying options are added to an ``ArgParseDefault`` parser first;
    the boolean text-cleaning switches follow and are registered through the
    ``add_bool_arg`` helper (defined elsewhere).

    Returns:
        The result of ``parser.parse_args()``.
    """
    cli = ArgParseDefault()

    # (flag, keyword arguments) pairs, registered in exactly this order.
    # The run of spaces inside the --finetune_datasets help text is kept
    # unchanged so the string handed to the parser stays the same.
    value_options = [
        ('--finetune_datasets', dict(
            type=str,
            nargs='+',
            help='Finetune dataset(s) to process. These correspond to folder names in data/finetune. '
                 '            Data should be located in data/finetune/originals/{finetune_dataset}/[train.tsv/dev.tsv/test.tsv]. By default runs all datasets.',
        )),
        ('--model_class', dict(
            default='bert_large_uncased_wwm',
            choices=PRETRAINED_MODELS.keys(),
            help='Model class',
        )),
        ('--run_prefix', dict(
            help='Prefix to be added to all runs. Useful to identify runs',
        )),
        ('--max_seq_length', dict(
            default=96,
            type=int,
            help='Maximum sequence length',
        )),
        ('--username_filler', dict(
            default='twitteruser',
            type=str,
            help='Username filler',
        )),
        ('--url_filler', dict(
            default='twitterurl',
            type=str,
            help='URL filler (ignored when replace_urls option is false)',
        )),
    ]
    for flag, options in value_options:
        cli.add_argument(flag, **options)

    # (name, default, help) triples for the boolean switches.
    switches = [
        ('replace_usernames', True, 'Replace usernames with filler'),
        ('replace_urls', True, 'Replace URLs with filler'),
        ('asciify_emojis', True, 'Asciifyi emojis'),
        ('replace_multiple_usernames', True,
         'Replace "@user @user" with "2 <username_filler>"'),
        ('replace_multiple_urls', True,
         'Replace "http://... http://.." with "2 <url_filler>"'),
        ('standardize_punctuation', True,
         'Standardize (asciifyi) special punctuation'),
        ('remove_unicode_symbols', True,
         'After preprocessing remove characters which belong to unicode category "So"'),
        ('remove_accented_characters', False,
         'Remove accents/asciify everything. Probably not recommended.'),
    ]
    for switch_name, switch_default, switch_help in switches:
        add_bool_arg(cli, switch_name, default=switch_default, help=switch_help)

    parsed = cli.parse_args()
    return parsed
def parse_args():
    """Declare the pretraining-run options and parse the command line.

    Value-carrying options are registered on an ``ArgParseDefault`` parser
    from a single table; the ``use_tpu`` and ``do_eval`` switches are added
    afterwards through the ``add_bool_arg`` helper (defined elsewhere).

    NOTE(review): ``--load_mlm_nsp_weights`` and ``--set_trainstep`` are
    registered without a ``type``. Assuming ``ArgParseDefault`` behaves like
    ``argparse.ArgumentParser``, their values arrive as raw strings --
    confirm that the consumers expect that.

    Returns:
        The result of ``parser.parse_args()``.
    """
    cli = ArgParseDefault()

    # (flag, keyword arguments) pairs, registered in exactly this order.
    # The run of spaces inside the --init_checkpoint help text is kept
    # unchanged so the string handed to the parser stays the same.
    value_options = [
        ('--tpu_ip', dict(required=False, help='IP-address of the TPU')),
        ('--bucket_name', dict(required=True, help='Bucket name')),
        ('--tpu_name', dict(required=False, help='Name of the TPU')),
        ('--tpu_name_project', dict(
            required=False,
            help='Name of the TPU project',
        )),
        ('--pretrain_data', dict(
            required=True,
            type=str,
            help='Folder which contains pretrain data. Should be located under gs://{bucket_name}/{project_name}/pretrain/pretrain_data/',
        )),
        ('--run_prefix', dict(
            help='Prefix to be added to all runs. Useful to group runs',
        )),
        ('--model_class', dict(
            default='bert_large_uncased_wwm',
            choices=PRETRAINED_MODELS.keys(),
            help='Model class to use',
        )),
        ('--project_name', dict(
            default='covid-bert',
            help='Name of subfolder in Google bucket',
        )),
        ('--num_gpus', dict(
            default=1,
            type=int,
            help='Number of GPUs to use',
        )),
        ('--eval_steps', dict(
            default=1000,
            type=int,
            help='Number eval steps to run (only active when --do_eval flag is provided)',
        )),
        ('--init_checkpoint', dict(
            default=None,
            help='Run name to initialize checkpoint from. Example: "run2/ctl_step_8000.ckpt-8". or "run2/pretrained/bert_model_8000.ckpt-8". The first contains the mlm/nsp layers. '
                 '            By default using a pretrained model from gs://{bucket_name}/pretrained_models/',
        )),
        ('--load_mlm_nsp_weights', dict(
            default=None,
            help="If set to True it will load the mlm/nsp-layers. The init_checkpoint should then be set to a model containing these. Usually in base run-directory named 'ctl_step*'.",
        )),
        ('--set_trainstep', dict(
            default=None,
            help="If set this will set the trainstep. This is only needed when restarting from an old checkpoint and you would like to get the scheduler/optimiser to start at the correct point.",
        )),
        ('--optimizer_type', dict(
            default='adamw',
            choices=['adamw', 'lamb'],
            type=str,
            help='Optimizer',
        )),
        ('--train_batch_size', dict(
            default=32,
            type=int,
            help='Training batch size',
        )),
        ('--eval_batch_size', dict(
            default=32,
            type=int,
            help='Eval batch size',
        )),
        ('--num_epochs', dict(
            default=3,
            type=int,
            help='Number of epochs',
        )),
        ('--num_steps_per_epoch', dict(
            default=1000,
            type=int,
            help='Number of steps per epoch',
        )),
        ('--warmup_steps', dict(
            default=10000,
            type=int,
            help='Warmup steps',
        )),
        ('--warmup_proportion', dict(
            default=None,
            type=float,
            help='If set overwrites warmup_steps.',
        )),
        ('--learning_rate', dict(
            default=2e-5,
            type=float,
            help='Learning rate',
        )),
        ('--end_lr', dict(
            default=0,
            type=float,
            help='Final learning rate',
        )),
        ('--max_seq_length', dict(
            default=96,
            type=int,
            help='Maximum sequence length. Sequences longer than this will be truncated, and sequences shorter than this will be padded.',
        )),
        ('--max_predictions_per_seq', dict(
            default=14,
            type=int,
            help='Maximum predictions per sequence_output.',
        )),
        ('--dtype', dict(
            default='fp32',
            choices=['fp32', 'bf16', 'fp16'],
            type=str,
            help='Data type',
        )),
        ('--steps_per_loop', dict(
            default=10,
            type=int,
            help='Steps per loop',
        )),
        ('--time_history_log_steps', dict(
            default=1000,
            type=int,
            help='Frequency with which to log timing information with TimeHistory.',
        )),
    ]
    for flag, options in value_options:
        cli.add_argument(flag, **options)

    # (name, default, help) triples for the boolean switches.
    switches = [
        ('use_tpu', True, 'Use TPU'),
        ('do_eval', False,
         'Run evaluation (make sure eval data is present in tfrecords folder)'),
    ]
    for switch_name, switch_default, switch_help in switches:
        add_bool_arg(cli, switch_name, default=switch_default, help=switch_help)

    return cli.parse_args()
def parse_args():
    """Declare the pretraining-run options and parse the command line.

    Every option is registered on an ``ArgParseDefault`` parser with one call
    per line; the ``use_tpu`` and ``do_eval`` switches are added last through
    the ``add_bool_arg`` helper (defined elsewhere).

    Returns:
        The result of ``parser.parse_args()``.
    """
    cli = ArgParseDefault()

    # Where to run and where the data lives.
    cli.add_argument('--tpu_ip', required=True, help='IP-address of the TPU')
    cli.add_argument('--bucket_name', required=True, help='Bucket name')
    cli.add_argument('--tpu_name', required=False, help='Name of the TPU')
    cli.add_argument('--tpu_name_project', required=False, help='Name of the TPU project')
    cli.add_argument('--pretrain_data', required=True, type=str, help='Folder which contains pretrain data. Should be located under gs://{bucket_name}/{project_name}/pretrain/pretrain_data/')

    # Run identification and model selection.
    cli.add_argument('--run_prefix', help='Prefix to be added to all runs. Useful to group runs')
    cli.add_argument('--model_class', default='bert_large_uncased_wwm', choices=PRETRAINED_MODELS.keys(), help='Model class to use')
    cli.add_argument('--project_name', default='covid-bert', help='Name of subfolder in Google bucket')
    cli.add_argument('--num_gpus', default=1, type=int, help='Number of GPUs to use')
    cli.add_argument('--eval_steps', default=1000, type=int, help='Number eval steps to run (only active when --do_eval flag is provided)')

    # Optimisation schedule.
    cli.add_argument('--optimizer_type', default='adamw', choices=['adamw', 'lamb'], type=str, help='Optimizer')
    cli.add_argument('--train_batch_size', default=32, type=int, help='Training batch size')
    cli.add_argument('--eval_batch_size', default=32, type=int, help='Eval batch size')
    cli.add_argument('--num_epochs', default=3, type=int, help='Number of epochs')
    cli.add_argument('--num_steps_per_epoch', default=1000, type=int, help='Number of steps per epoch')
    cli.add_argument('--warmup_steps', default=10000, type=int, help='Warmup steps')
    cli.add_argument('--learning_rate', default=2e-5, type=float, help='Learning rate')
    cli.add_argument('--end_lr', default=0, type=float, help='Final learning rate')

    # Input shape, numeric precision and logging cadence.
    cli.add_argument('--max_seq_length', default=96, type=int, help='Maximum sequence length. Sequences longer than this will be truncated, and sequences shorter than this will be padded.')
    cli.add_argument('--max_predictions_per_seq', default=14, type=int, help='Maximum predictions per sequence_output.')
    cli.add_argument('--dtype', default='fp32', choices=['fp32', 'bf16', 'fp16'], type=str, help='Data type')
    cli.add_argument('--steps_per_loop', default=10, type=int, help='Steps per loop')
    cli.add_argument('--time_history_log_steps', default=1000, type=int, help='Frequency with which to log timing information with TimeHistory.')

    # Boolean switches.
    add_bool_arg(cli, 'use_tpu', default=True, help='Use TPU')
    add_bool_arg(cli, 'do_eval', default=False, help='Run evaluation (make sure eval data is present in tfrecords folder)')

    return cli.parse_args()