def analyze_dataset_intent(dataset_dir: Path):
    """Log summary statistics for an intent-classification dataset.

    Reads ``train.json``, ``eval.json`` and ``test_release.json`` from
    *dataset_dir* and logs, per split: example counts, a sample record,
    intent distribution (train/val only) and vocabulary sizes, plus
    cross-split vocabulary overlap.

    :param dataset_dir: directory containing the three split JSON files.
    """
    train_data = json_load(dataset_dir / "train.json")
    train_intent_counter = Counter(map(lambda d: d["intent"], train_data))
    train_token_counter = Counter(
        chain.from_iterable(map(lambda d: d["text"].split(), train_data)))
    logger.info(
        f"Analyzing training data...\n"
        f"# Training data: {len(train_data)}\n"
        f"Example training data format: {json.dumps(train_data[0], indent=2)}\n\n"
        f"# Intents: {len(train_intent_counter)}\n"
        # BUG FIX: removed a stray "]" that was printed right after the
        # intent distribution.
        f"Intent distribution: {pretty_list(train_intent_counter.most_common())}\n"
        f"# Unique words: {len(train_token_counter)}")

    val_data = json_load(dataset_dir / "eval.json")
    val_intent_counter = Counter(map(lambda d: d["intent"], val_data))
    val_token_counter = Counter(
        chain.from_iterable(map(lambda d: d["text"].split(), val_data)))
    logger.info(
        f"Analyzing validation data...\n"
        f"# Validation data: {len(val_data)}\n"
        f"Example validation data format: {json.dumps(val_data[0], indent=2)}\n\n"
        f"# Intents: {len(val_intent_counter)}\n"
        f"Intent distribution: {pretty_list(val_intent_counter.most_common())}\n"
        f"# Unique words: {len(val_token_counter)}")

    # Sanity check: train and validation must cover the same intent label set.
    assert set(train_intent_counter.elements()) == set(
        val_intent_counter.elements())

    test_data = json_load(dataset_dir / "test_release.json")
    test_token_counter = Counter(
        chain.from_iterable(map(lambda d: d["text"].split(), test_data)))
    logger.info(
        f"Analyzing testing data...\n"
        f"# Testing data: {len(test_data)}\n"
        # BUG FIX: label said "validation" in the testing section (copy-paste).
        f"Example testing data format: {json.dumps(test_data[0], indent=2)}\n\n"
        f"# Unique words: {len(test_token_counter)}")

    train_tokens = set(train_token_counter.keys())
    val_tokens = set(val_token_counter.keys())
    test_tokens = set(test_token_counter.keys())
    # BUG FIX: "common words" are the intersection of the three vocabularies;
    # the original used union ("|"), which counts *all* words instead.
    common_words = train_tokens & val_tokens & test_tokens
    logger.info(
        "\n"
        f"# Common words: {len(common_words)}\n"
        f"# In val but not in train: {len(val_tokens - train_tokens)}\n"
        f"# In test but not in train: {len(test_tokens - train_tokens)}")
def from_pretrained(cls, model_dir: Path):
    """Build a vocabulary/embedding pair from pretrained files.

    Loads ``dictionary.json`` (word list) and ``word_vectors.npy`` from
    *model_dir*, then reserves index 0 for PAD (all-zeros vector) and
    index 1 for OOV (mean of all pretrained vectors).
    """
    vocab = json_load(model_dir / "dictionary.json")
    vectors = np.load(model_dir / "word_vectors.npy")

    dim = vectors.shape[1]
    pad_row = np.zeros((1, dim))
    oov_row = vectors.mean(axis=0).reshape(1, -1)

    vocab = [cls.PAD_TOKEN, cls.OOV_TOKEN] + vocab
    vectors = np.concatenate([pad_row, oov_row, vectors])
    return cls(vocab, vectors)
def __init__(self, **kwargs):
    """Initialize JSON-file-backed storage.

    Loads tasks from ``self.path`` if the file exists. The file may hold
    either a dict keyed by (int) task id, or a list of task dicts; a list
    is normalized into a dict keyed by ``int(task['id'])``. The loaded
    data is immediately persisted back via ``self._save()``.
    """
    super(JSONStorage, self).__init__(**kwargs)
    tasks = {}
    if os.path.exists(self.path):
        tasks = json_load(self.path, int_keys=True)
    if len(tasks) == 0:
        self.data = {}
    elif isinstance(tasks, dict):
        self.data = tasks
    elif isinstance(tasks, list):
        # BUG FIX: the original checked `isinstance(self.data, list)`, but
        # self.data is not assigned yet on this path — the list branch was
        # unreachable and list-formatted files left self.data unset.
        self.data = {int(task['id']): task for task in tasks}
    self._save()
def main(args):
    """Analyze a span-QA dataset: context length stats plus per-split
    paragraph/question statistics and (for public splits) answer-span
    consistency checks.

    :param args: parsed CLI args; must provide ``dataset_dir``.
    """
    logger.info(args)

    logger.info("Analyzing context.json...")
    context = json_load(args.dataset_dir / "context.json")
    logger.info(f"#contexts: {len(context)}")
    # Character-level length distribution over all contexts; 510 is the
    # usable budget of a 512-token BERT input (minus [CLS]/[SEP]).
    context_lengths = sorted(map(len, context))
    logger.info("\n".join([
        # TYPO FIX: "lengthes" -> "lengths" in user-visible output.
        "About the lengths (character level)",
        f"mean:\t{sum(context_lengths) / len(context):.2f}",
        f"min:\t{min(context_lengths)}",
        f"max:\t{max(context_lengths)}",
        f"90%:\t{context_lengths[int(len(context) * 0.9)]}",
        f">510:\t{sum(map(lambda t: len(t) > 510, context))} / {len(context)}",
    ]))

    def print_counter(counter):
        # Render a Counter as a sorted "{ key -> count }" listing.
        return "{\n" + "\n".join(f"  {k} -> {v}"
                                 for k, v in sorted(counter.items())) + "\n}"

    def analyze(json_name, is_private=False):
        logger.info(f"Analyzing {json_name}...")
        data = json_load(args.dataset_dir / json_name)
        # BUG FIX: this runs for every split, not only training data.
        logger.info(f"#examples: {len(data)}")
        num_paragraph_counter = Counter(
            map(lambda d: len(d["paragraphs"]), data))
        logger.info(
            f"About the related paragraphs: {print_counter(num_paragraph_counter)}"
        )
        # Bucket question lengths into bins of 10 characters.
        question_length_counter = Counter(
            map(lambda d: len(d["question"]) // 10, data))
        logger.info(
            f"About the question lengths: {print_counter(question_length_counter)}"
        )
        if not is_private:
            # Every gold answer must match the text at its annotated span.
            assert all(
                all(a["text"] == context[d["relevant"]][a["start"]:a["start"] +
                                                        len(a["text"])]
                    for a in d["answers"]) for d in data)

    analyze("train.json")
    analyze("public.json")
    analyze("private.json", is_private=True)
def main(args):
    """Shuffle ``train.json`` and split it into train/validation files.

    Writes ``train_splitted.json`` and ``val_splitted.json`` next to the
    source file, sized by ``args.train_ratio``.
    """
    set_seed(args.seed)

    source_path = args.dataset_dir / "train.json"
    logger.info(f"Loading training data from {source_path}...")
    all_data = json_load(source_path)

    logger.info("Random shuffling the data...")
    random.shuffle(all_data)

    num_train = int(args.train_ratio * len(all_data))
    num_val = len(all_data) - num_train
    logger.info(f"Splitting the dataset into [{num_train}, {num_val}] sizes")

    train_data = all_data[:num_train]
    val_data = all_data[num_train:]
    json_dump(train_data, args.dataset_dir / "train_splitted.json")
    json_dump(val_data, args.dataset_dir / "val_splitted.json")
def __init__(
    self,
    contexts: List[str],
    data: List[dict],
    tokenizer: Optional[BertTokenizer] = None,
    test: bool = False,
    include_nonrelevant=0,
    split_name: str = "no_name",
    cache_dir: Optional[Path] = None,
    skip_preprocess: Optional[bool] = False,
):
    """Dataset wrapper that tokenizes/preprocesses (context, example)
    pairs, with an optional on-disk JSON cache.

    The cache file name is derived from *split_name* and
    *include_nonrelevant*; when *skip_preprocess* is set, only raw
    attributes are stored and no preprocessing happens.
    """
    super().__init__()
    self._contexts = contexts
    self._raw_data = data
    self.tokenizer = tokenizer
    self.test = test
    self.split_name = split_name

    if skip_preprocess:
        return

    # A cache location exists only when both a cache dir and a split name
    # were provided.
    if cache_dir and split_name:
        cache_path = (
            cache_dir /
            f"_{split_name}_preprocessed_{include_nonrelevant}.json")
    else:
        cache_path = None

    if cache_path and cache_path.is_file():
        # Cache hit: reuse the previously preprocessed examples.
        logger.info(
            f"Loading cached preprocessed dataset from {cache_path}...")
        self.data = json_load(cache_path)
        return

    # Cache miss (or caching disabled): preprocess from scratch.
    self.data = self.preprocess_dataset(
        self.tokenizer,
        contexts,
        data,
        include_nonrelevant=include_nonrelevant,
        test=self.test,
    )
    if cache_path:
        logger.info(
            f"Saving cached preprocessed dataset to {cache_path}...")
        json_dump(self.data, cache_path)
def analyze(json_name, is_private=False):
    """Log statistics for one dataset split and, unless *is_private*,
    verify every answer span matches its annotated offsets.

    Relies on enclosing-scope names: ``args``, ``context`` and
    ``print_counter``.
    """
    logger.info(f"Analyzing {json_name}...")
    data = json_load(args.dataset_dir / json_name)
    # BUG FIX: this function is also called on public/private splits, so
    # the count should not be labeled "training".
    logger.info(f"#examples: {len(data)}")
    num_paragraph_counter = Counter(
        map(lambda d: len(d["paragraphs"]), data))
    logger.info(
        f"About the related paragraphs: {print_counter(num_paragraph_counter)}"
    )
    # Bucket question lengths into bins of 10 characters.
    question_length_counter = Counter(
        map(lambda d: len(d["question"]) // 10, data))
    logger.info(
        f"About the question lengths: {print_counter(question_length_counter)}"
    )
    if not is_private:
        # Each gold answer text must equal the slice of its relevant
        # context at [start, start + len(text)).
        assert all(
            all(a["text"] == context[d["relevant"]][a["start"]:a["start"] +
                                                    len(a["text"])]
                for a in d["answers"]) for d in data)
def create_project_dir(cls, project_name, args):
    """
    Create project directory in args.root_dir/project_name, and initialize there all required files
    If some files are missed, restore them from defaults.
    If config files are specified by args, copy them in project directory
    :param project_name:
    :param args:
    :return:
    """
    # NOTE(review): local name `dir` shadows the Python builtin throughout.
    dir = cls.get_project_dir(project_name, args)
    if args.force:
        # --force: wipe any previous project content before re-init.
        delete_dir_content(dir)
    os.makedirs(dir, exist_ok=True)

    # Base config: user-supplied file, else the packaged default schema.
    # config = json_load(args.config_path) if args.config_path else json_load(find_file('default_config.json'))
    config = json_load(
        args.config_path) if args.config_path else json_load(
            'utils/schema/default_config.json')

    def already_exists_error(what, path):
        # Helper that aborts init when a file would be overwritten
        # without --force.
        raise RuntimeError(
            '{path} {what} already exists. Use "--force" option to recreate it.'
            .format(path=path, what=what))

    input_path = args.input_path or config.get('input_path')

    # save label config
    config_xml = 'config.xml'
    config_xml_path = os.path.join(dir, config_xml)
    label_config_file = args.label_config or config.get('label_config')
    if label_config_file:
        # An explicit label config was given: copy it into the project.
        copy2(label_config_file, config_xml_path)
        print(label_config_file + ' label config copied to ' + config_xml_path)
    else:
        if os.path.exists(config_xml_path) and not args.force:
            already_exists_error('label config', config_xml_path)
        if not input_path:
            # create default config with polygons only if input data is not set
            default_label_config = 'examples/adv_region_image/config.xml'
            copy2(default_label_config, config_xml_path)
            print(default_label_config + ' label config copied to ' +
                  config_xml_path)
        else:
            # Input data supplied but no label config: start from an
            # empty <View>.
            with io.open(config_xml_path, mode='w') as fout:
                fout.write('<View></View>')
            print('Empty config has been created in ' + config_xml_path)

    # Config stores the relative file name, not the absolute path.
    config['label_config'] = config_xml

    if args.source:
        # External input source specified on the command line.
        config['source'] = {
            'type': args.source,
            'path': args.source_path,
            'params': args.source_params
        }
    else:
        # save tasks.json
        tasks_json = 'tasks.json'
        tasks_json_path = os.path.join(dir, tasks_json)
        if input_path:
            # Convert the user's input data into the tasks format.
            tasks = cls._load_tasks(input_path, args, config_xml_path)
        else:
            tasks = {}
        with io.open(tasks_json_path, mode='w') as fout:
            json.dump(tasks, fout, indent=2)
        config['input_path'] = tasks_json
        config['source'] = {
            'name': 'Tasks',
            'type': 'tasks-json',
            'path': os.path.abspath(tasks_json_path)
        }
        logger.debug(
            '{tasks_json_path} input file with {n} tasks has been created from {input_path}'
            .format(tasks_json_path=tasks_json_path,
                    n=len(tasks),
                    input_path=input_path))

    if args.target:
        # External output target specified on the command line.
        config['target'] = {
            'type': args.target,
            'path': args.target_path,
            'params': args.target_params
        }
    else:
        # Default target: a local "completions" directory in the project.
        completions_dir = os.path.join(dir, 'completions')
        if os.path.exists(completions_dir) and not args.force:
            already_exists_error('output dir', completions_dir)
        if os.path.exists(completions_dir):
            # Reachable only with --force at this point.
            delete_dir_content(completions_dir)
            print(completions_dir + ' output dir already exists. Clear it.')
        else:
            os.makedirs(completions_dir, exist_ok=True)
            print(completions_dir + ' output dir has been created.')
        config['output_dir'] = 'completions'
        config['target'] = {
            'name': 'Completions',
            'type': 'completions-dir',
            'path': os.path.abspath(completions_dir)
        }

    # Normalize ml_backends to a list, then append any CLI-provided URLs.
    if 'ml_backends' not in config or not isinstance(
            config['ml_backends'], list):
        config['ml_backends'] = []
    if args.ml_backends:
        for url in args.ml_backends:
            config['ml_backends'].append(
                cls._create_ml_backend_params(url, project_name))

    # Optional server / sampling overrides from the command line.
    if args.sampling:
        config['sampling'] = args.sampling
    if args.port:
        config['port'] = args.port
    if args.host:
        config['host'] = args.host
    if args.allow_serving_local_files:
        config['allow_serving_local_files'] = True

    # create config.json
    config_json = 'config.json'
    config_json_path = os.path.join(dir, config_json)
    if os.path.exists(config_json_path) and not args.force:
        already_exists_error('config', config_json_path)
    with io.open(config_json_path, mode='w') as f:
        json.dump(config, f, indent=2)

    print('')
    print('Label Studio has been successfully initialized. Check project states in '
          + dir)
    print('Start the server: label-studio start ' + dir)
    return dir
def from_json(cls, context_json: Path, data_json: Path, **kwargs):
    """Alternate constructor: load contexts and examples from JSON files
    and forward any extra keyword arguments to the main constructor."""
    return cls(json_load(context_json), json_load(data_json), **kwargs)
def load(cls, json_path: Path, **kwargs):
    """Alternate constructor: build an instance from a JSON file,
    forwarding extra keyword arguments to the main constructor."""
    return cls(json_load(json_path), **kwargs)
def _load_ids(self):
    """Restore the id->item map from the ids file and rebuild the inverse
    key->id map, when file persistence is enabled and the file exists."""
    if self._save_to_file_enabled and os.path.exists(self._ids_file):
        self._ids_keys_map = json_load(self._ids_file, int_keys=True)
        # FIX: renamed comprehension variables — the original shadowed the
        # builtin `id`.
        self._keys_ids_map = {
            item['key']: item_id
            for item_id, item in self._ids_keys_map.items()
        }
def get(self, id):
    """Return the object stored as ``<id>.json`` under ``self.path``,
    or None when no such file exists."""
    item_path = os.path.join(self.path, '{}.json'.format(id))
    if not os.path.exists(item_path):
        return None
    return json_load(item_path)
def _get_objects(self):
    """Reload ``self.data`` from the backing JSON file and return an
    iterator over its keys rendered as strings."""
    self.data = json_load(self.path, int_keys=True)
    return map(str, self.data)
def items(self):
    """Yield ``(key, object)`` pairs, loading each object from its own
    ``<key>.json`` file under ``self.path``."""
    for key in self.ids():
        item_path = os.path.join(self.path, '{}.json'.format(key))
        yield key, json_load(item_path)
def load(cls, config_json):
    """Alternate constructor: parse a JSON config file into an instance."""
    config = json_load(config_json)
    return cls(config)