Example #1
def do_niv(X_test, X_train, T_train, Y_train, n_niv_params, dataset_name,
           fold_idx):
    niv_filename = 'niv_' + dataset_name
    fold_name = 'fold' + str(fold_idx + 1)
    niv_vars = load_json(niv_filename)
    survived_vars = niv_vars.get(fold_name) if niv_vars else None

    if survived_vars:
        print('Stored NIV:', survived_vars)
        X_test = X_test[survived_vars]
        X_train = X_train[survived_vars]
    else:
        niv_start_time = time.time()
        print('Start NIV variable selection')

        survived_vars = niv_variable_selection(X_train, Y_train, T_train,
                                               n_niv_params)
        print('NIV:', list(survived_vars))

        X_train = X_train[survived_vars]
        X_test = X_test[survived_vars]

        niv_end_time = time.time()
        print('NIV time:', niv_end_time - niv_start_time)

        if niv_vars:
            niv_vars.update({fold_name: survived_vars.tolist()})
        else:
            niv_vars = {fold_name: survived_vars.tolist()}
        save_json(niv_filename, niv_vars)

    return X_test, X_train
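The examples in this listing all lean on small load_json / save_json helpers defined in each project's own utils; they are not shown here, and even their argument order varies (Example #1 passes the filename first, while several later examples pass the data first). A minimal sketch of what such helpers typically look like, assuming they simply wrap json.load / json.dump and that a missing file yields None:

import json

def load_json(path):
    # Return the parsed JSON at `path`, or None if the file does not exist yet
    # (Example #1 relies on a falsy result to trigger fresh NIV selection).
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        return None

def save_json(path, data, indent=2):
    # Write `data` to `path` as pretty-printed JSON (filename-first order,
    # matching Example #1; other projects here call save_json(data, path)).
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=indent)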
Example #2
def merge_datasets(dataset_filenames, out_filename):
    # the assumption here is that the abstract text and the detected entities are identical
    # and that only the relations differ;
    # for ChemProt and DrugProt this is the case, see compare_datasets.py
    datasets = []
    for filename in dataset_filenames:
        datasets.append(utils.read_json(filename))
    # map all datasets to CPR-X
    for ds in datasets:
        map_to_cpr(ds)
    merged = datasets[0]
    for ds in datasets[1:]:
        for article_id, article in ds.items():

            # if this article is unique to this dataset, just add it
            # but make sure its relations are converted to CPR-X
            if article_id not in merged:
                merged[article_id] = article
            else:
                print(f'merging {article_id}')
                merge_article(merged[article_id], article)

    # stats
    total_relation = 0
    for article_id, article in merged.items():
        for sent in article['abstract']:
            total_relation += len(sent['relations'])
    print(f'number of relations in merged dataset: {total_relation}')

    utils.save_json(out_filename, merged)
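The per-article merge above delegates to a merge_article helper that is not included in the snippet. A minimal sketch under the stated assumptions (abstracts are identical and sentence-aligned, so merging reduces to unioning each sentence's relations); the real helper may do additional consistency checks:

def merge_article(target, other):
    # Sketch only: assumes both articles carry the same sentence-aligned
    # 'abstract' list and that merging means unioning per-sentence relations.
    for tgt_sent, oth_sent in zip(target['abstract'], other['abstract']):
        for rel in oth_sent['relations']:
            if rel not in tgt_sent['relations']:
                tgt_sent['relations'].append(rel)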
Example #3
def main(base_model_name, weights_file, image_source, predictions_file, img_format='jpg'):
    # load samples
    if os.path.isfile(image_source):
        image_dir, samples = image_file_to_json(image_source)
    else:
        image_dir = image_source
        samples = image_dir_to_json(image_dir, img_type=img_format)  # honor the img_format argument instead of hard-coding 'jpg'

    # build model and load weights
    nima = Nima(base_model_name, weights=None)
    nima.build()
    nima.nima_model.load_weights(weights_file)

    # initialize data generator
    data_generator = TestDataGenerator(samples, image_dir, 64, 10, nima.preprocessing_function(),
                                       img_format=img_format)

    # get predictions
    predictions = predict(nima.nima_model, data_generator)

    # calc mean scores and add to samples
    for i, sample in enumerate(samples):
        sample['mean_score_prediction'] = calc_mean_score(predictions[i])

    print(json.dumps(samples, indent=2))

    if predictions_file is not None:
        save_json(samples, predictions_file)
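calc_mean_score is not shown either; NIMA predicts a 10-bucket score distribution per image, and the helper is assumed to collapse that distribution to its expected value. A sketch under that assumption:

import numpy as np

def calc_mean_score(score_dist):
    # Assumed behaviour: normalize the 10-bucket distribution and take the
    # expected score on the 1..10 scale.
    score_dist = np.array(score_dist, dtype=float)
    score_dist = score_dist / score_dist.sum()
    return float((score_dist * np.arange(1, 11)).sum())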
Example #4
 def save_area_btn(self):
     _area_dict = {
         'name': 'area1',
         'areaName': '',
         'area': '{"abs": [], "points": []}',
         'alertType': '1',
         'day': '1111111',
         'hour': '0,24',
         'minute': '0'
     }
     _name = "123"
     _H_start = '0'
     _H_end = '24'
     _sec = '1'
     _points = self.main.win.get_draw_point()
     _weekday = '1111111'
     _abs = []
     _1080points = normalize_points([p for p in _points],
                                    self.main.canvasHandler.get_size())
     _1080points.append(_1080points[0])
     for i in range(len(_1080points) - 1):
         x1, y1 = _1080points[i]
         x2, y2 = _1080points[i + 1]
         if x1 - x2 == 0:
             a = float('inf')
             b = 0
         else:
             a = round((y1 - y2) / (x1 - x2), 3)
             b = round(y1 - x1 * a, 3)
         _abs.append((a, b))
     _1080points.pop()
     _area = json.dumps({'abs': _abs, 'points': _1080points})
     _area_dict['area'] = _area
     save_json('./area.txt', _area_dict)
     self.main.win.add_point(_points[0])
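The loop above stores, for every polygon edge, the slope/intercept pair (a, b) of the line y = a*x + b, with vertical edges flagged as a = inf. The same computation as a standalone sketch with made-up points, to show the shape of the 'abs' field that ends up in area.txt:

points = [(0, 0), (100, 50), (100, 150)]  # hypothetical normalized polygon
closed = points + [points[0]]             # close the polygon, as the method does
abs_coeffs = []
for (x1, y1), (x2, y2) in zip(closed, closed[1:]):
    if x1 == x2:
        a, b = float('inf'), 0            # vertical edge
    else:
        a = round((y1 - y2) / (x1 - x2), 3)
        b = round(y1 - x1 * a, 3)
    abs_coeffs.append((a, b))
print(abs_coeffs)  # [(0.5, 0.0), (inf, 0), (1.5, 0.0)]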
Example #5
 def config(self):
     self._set_backup_folder()
     self._set_bucket()
     self._set_time_interval()
     self._set_salt()
     self._set_control_key()
     self._create_password_test_file()
     self._stat_cache = StatCache(self._stat_cache_dir, self._backup_folder)
     self._object_db = ObjectDB(self._object_db_path)
     config = {
         "backup_folder": self._backup_folder,
         "bucket": self._bucket,
         "time_interval": self._time_interval,
     }
     save_json(config, self._CONFIG_FILEPATH)
     print(config)
Example #6
def select_relations(visualgenome_path, house_objects_path, model_path):
    ''' Select relations about how often attributes belong to objects of house domain '''
    attribute_frequency = load_json(
        join(visualgenome_path, 'attribute_frequencies.json'))
    groups = classification(attribute_frequency.values(), model_path)
    save_json(groups, join(visualgenome_path, 'attribute_classes.json'))
    if 'others' in groups: del groups['others']
    attribute_knowledge, relations = extract_knowledge(attribute_frequency,
                                                       groups)
    house_objects = {
        v.replace(' ', '_'): k['dbpedia_uri']
        for v, k in load_json(house_objects_path).items()
    }
    save_json(attribute_knowledge,
              join(visualgenome_path, 'attribute_knowledge.json'))
    create_triples(relations, house_objects, visualgenome_path)
Example #7
File: dataset.py Project: damengde/aloof
def create_dataset(option, frame_raw_path, frame_parsed_path):
    ''' Create a dataset of frame triples according to a Validator (by core, by synset or by embeddings) '''
    obj_validator = None
    if option == 'core':
        frame_types_path = join(dirname(__file__), '../../resource/frames/annotations_frame_types/')
        obj_validator = Validator_By_Core(frame_types_path)
    elif option == 'synset':
        frame_elements_path = join(dirname(__file__), '../../resource/frames/annotations_frame_elements/')
        obj_validator = Validator_By_Synset(frame_elements_path)
    elif option == 'embeddings':
        embeddings_path = join(dirname(__file__), '../../resource/embeddings/googlenews_negative300')
        obj_validator = Validator_By_Embeddings(embeddings_path)
    else:
        logging.error('Unknown "%s" option of frame validator' % option)

    if obj_validator:
        frame_instances = read_folder_frames(frame_raw_path, delete_repetition=False)
        filtered_frames = filter_instances(frame_instances, obj_validator)
        prototypical_frames = find_prototypical_instances(filtered_frames)

        save_json(prototypical_frames, join(frame_parsed_path, 'frame_instances.json'))
        logging.info('Selected %s prototypical frames' % len(prototypical_frames))
Example #8
def get_relations(object_name, relation, conceptnet_path, limit=100):
    ''' Get relations of an object through the ConceptNet RESTful API '''
    base_query = 'http://api.conceptnet.io/query?node=/c/en/%s&rel=/r/%s&offset=%d&limit=%d'
    data = {}
    flag = True
    index = 0

    while flag:
        try:
            data = requests.get(base_query %
                                (object_name, relation, index, limit)).json()
            save_json(
                data,
                join(conceptnet_path,
                     '%s_%s_%d.json' % (object_name, relation, index)))
        except:
            query = base_query % (object_name, relation, index, limit)
            logging.error('Corrupted JSON file in "%s"' % query)

        if 'view' in data and 'nextPage' in data['view']:
            index += limit
        else:
            flag = False
Example #9
def create_dataset(visualgenome_raw_path, visualgenome_parsed_path):
    ''' Create a dataset of objects and their attributes using VisualGenome dataset '''
    visualgenome_data = load_json(
        join(visualgenome_raw_path, 'attributes.json'))
    attribute_synsets = load_json(
        join(visualgenome_raw_path, 'attribute_synsets.json'))
    frequency_data = {}

    for image in visualgenome_data:
        objects = set()
        for attribute_data in image['attributes']:
            if 'attributes' in attribute_data and len(
                    set(attribute_data['synsets'])) == 1:
                object_name = attribute_data['synsets'][0]
                assigned = assign_attribute(object_name,
                                            attribute_data['attributes'],
                                            attribute_synsets, frequency_data)
                if assigned and object_name not in objects:
                    objects.add(object_name)
                    frequency_data[object_name]['images'] += 1

    logging.info('Size: %s objects selected' % len(frequency_data))
    save_json(frequency_data,
              join(visualgenome_parsed_path, 'attribute_frequencies.json'))
Example #10
 def save_metrics(self, latest_metrics, type_path) -> None:
     self.metrics[type_path].append(latest_metrics)
     save_json(self.metrics, self.metrics_save_path)
Example #11
def generate_summaries_or_translations(
    data_dir: str,
    out_dir: str,
    model_path: str,
    config_path: str,
    batch_size: int = 8,
    device: str = DEFAULT_DEVICE,
    fp16=False,
    task="summarization",
    prefix=None,
    max_source_length=1024,
    max_target_length=142,
    eval_beams=5,
    eval_max_gen_length=142,
    n_obs=-1,
    type_path="test",
    num_return_sequences=1,
    distill=None,
    num_layers=None,
    do_encoder=False,
    do_decoder=False,
    **generate_kwargs,
) -> Dict:

    out_dir = Path(out_dir)
    save_path = out_dir.joinpath(
        f"rank_{utils.distributed_utils.get_rank()}_output.json")

    if num_return_sequences > eval_beams:
        eval_beams = num_return_sequences

    ### Define BART model
    # Config from "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-cnn/config.json"
    # Vocab modified to 50265 to be consistent with facebook/bart-large default
    config = BartConfig(**json.load(open(config_path, "r")))
    config.fp16 = fp16
    model = BartForConditionalGeneration.from_pretrained(
        model_path, config=config).to(device)

    # if distilling, change model
    if distill == "sft":
        model = distill_sft(model, num_layers, do_encoder, do_decoder)

    if fp16:
        model = model.half()
    model.eval()

    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    logger.info(f"Inferred tokenizer type: {tokenizer.__class__}"
                )  # if this is wrong, check config.model_type.

    start_time = time.time()
    # update config with task specific params
    use_task_specific_params(model, task)
    if prefix is None:
        prefix = prefix or getattr(model.config, "prefix", "") or ""

    ds = Seq2SeqDataset(tokenizer,
                        data_dir,
                        max_source_length,
                        max_target_length,
                        type_path=type_path,
                        n_obs=n_obs,
                        prefix=prefix)

    # I set shuffle=True for a more accurate progress bar.
    # If all the longest samples are first, the prog bar estimate is too high at the beginning.
    is_distributed = utils.distributed_utils.get_world_size() > 1
    sampler = ds.make_sortish_sampler(batch_size,
                                      distributed=is_distributed,
                                      add_extra_examples=False,
                                      shuffle=True)
    data_loader = DataLoader(ds,
                             sampler=sampler,
                             batch_size=batch_size,
                             collate_fn=ds.collate_fn)

    results = []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            t0 = time.time()

            summaries = model.generate(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                use_cache=True,
                num_return_sequences=num_return_sequences,
                num_beams=eval_beams,
                max_length=eval_max_gen_length,
                num_beam_groups=1,
                output_scores=False,
                return_dict_in_generate=False,
                encoder_no_repeat_ngram_size=0,
                diversity_penalty=0.0,
                **generate_kwargs,
            )
            preds = tokenizer.batch_decode(summaries,
                                           skip_special_tokens=True,
                                           clean_up_tokenization_spaces=False)
            ids = batch["ids"]
            if num_return_sequences > 1:
                preds = chunks(
                    preds, num_return_sequences
                )  # batch size chunks, each of size num_return_seq

            eval_time = time.time() - t0
            for i, pred in enumerate(preds):
                store_time = eval_time if i == 0 else None  #only store latency for element 0 of every batch
                results.append(
                    dict(pred=pred, id=ids[i].item(), eval_time=store_time))

    save_json(results, save_path)
    runtime = int(time.time() - start_time)  # seconds
    num_replicas = sampler.num_replicas if is_distributed else 1
    n_obs = len(results)
    return results, num_replicas, dict(n_obs=n_obs,
                                       eval_only_runtime=runtime,
                                       seconds_per_sample=round(
                                           runtime / n_obs, 4))
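The chunks call in the generation loop regroups the flat prediction list into one sublist per input example when num_return_sequences > 1; a minimal sketch of such a helper (the project presumably ships its own):

def chunks(lst, n):
    # Split a flat list into consecutive sublists of length n,
    # e.g. chunks(['a', 'b', 'c', 'd'], 2) -> [['a', 'b'], ['c', 'd']].
    return [lst[i:i + n] for i in range(0, len(lst), n)]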
Example #12
def run_generate(verbose=True):
    """

    Takes input text, generates output, and then using reference calculates the BLEU scores.

    The results are saved to a file and returned to the caller, and printed out unless ``verbose=False`` is passed.

    Args:
        verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): print results to stdout

    Returns:
        a tuple: ``(scores, params)``
        - ``scores``: a dict of scores data ``{'bleu': 39.6501, 'n_obs': 2000, 'runtime': 186, 'seconds_per_sample': 0.093}``
        - ``params``: a dict of custom params, e.g. ``{'num_beams': 5, 'length_penalty': 0.8}``
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("model_path",
                        type=str,
                        help="like facebook/bart-large-cnn or path to ckpt")
    parser.add_argument("config_path", type=str, help="path to config")
    parser.add_argument("data_dir", type=str, help="like cnn_dm/test.source")
    parser.add_argument("save_path", type=str, help="where to save summaries")
    parser.add_argument("--type_path",
                        type=str,
                        required=False,
                        default="test",
                        help="like cnn_dm/test.target")
    parser.add_argument("--device",
                        type=str,
                        required=False,
                        default=DEFAULT_DEVICE,
                        help="cuda, cuda:1, cpu etc.")
    parser.add_argument("--prefix",
                        type=str,
                        required=False,
                        default=None,
                        help="will be added to the begininng of src examples")
    parser.add_argument("--task",
                        type=str,
                        default="summarization",
                        help="used for task_specific_params + metrics")
    parser.add_argument("--bs",
                        type=int,
                        default=8,
                        required=False,
                        help="batch size")
    parser.add_argument("--n_obs",
                        type=int,
                        default=None,
                        required=False,
                        help="How many observations. Defaults to all.")
    parser.add_argument("--num_return_sequences",
                        type=int,
                        default=1,
                        required=False,
                        help="How many sequences to return")
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--dump-args",
                        action="store_true",
                        help="print the custom hparams with the results")
    parser.add_argument(
        "--info",
        nargs="?",
        type=str,
        const=datetime_now(),
        help=
        "use in conjunction w/ --dump-args to print with the results whatever other info you'd like, e.g. lang=en-ru. If no value is passed, the current datetime string will be used.",
    )
    parser.add_argument("--eval_max_gen_length",
                        type=int,
                        default=None,
                        help="never generate more than n tokens")
    parser.add_argument(
        "--eval_beams",
        type=int,
        default=None,
        required=False,
        help="# beams to use. 0 corresponds to not using beam search.")
    parser.add_argument(
        "--max_source_length",
        default=1024,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--max_target_length",
        default=142,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument(
        "--sync_timeout",
        type=int,
        default=600,
        required=False,
        help=
        "How long should master process wait for other processes to finish.",
    )
    parser.add_argument("--debug", action="store_true")
    parser.add_argument('--json-summary',
                        type=str,
                        default="results/dllogger.json",
                        help='If provided, the json summary will be written to '
                        'the specified file.')
    parser.add_argument(
        '--distill',
        type=str,
        default=None,
        help="string indicating how model is distilled, only sft supported",
        choices=["sft", None])
    parser.add_argument(
        '--layers',
        type=str,
        default=None,
        help=
        "string indicating which teacher layers remain, split by '-' (ex. 0-6-11)"
    )
    parser.add_argument('--do_encoder',
                        action="store_true",
                        default=False,
                        help="if true encoder distilled")
    parser.add_argument('--do_decoder',
                        action="store_true",
                        default=False,
                        help="if true decoder distilled")

    dist = parser.add_argument_group('distributed setup')
    dist.add_argument('--local_rank',
                      type=int,
                      default=os.getenv('LOCAL_RANK', 0),
                      help='Used for multi-process training.')

    start_time = time.time()

    # Unspecified args like --num_beams=2 --decoder_start_token_id=4 are passed to model.generate
    args, rest = parser.parse_known_args()
    parsed_args = parse_numeric_n_bool_cl_kwargs(rest)

    if args.local_rank <= 0:
        print(args)
        print(rest)

    # Initialize device and distributed backend
    utils.distributed_utils.init_distributed(args.device == "cuda")
    if utils.distributed_utils.get_world_size() > 1:
        utils.distributed_utils.set_affinity(args.local_rank)
        torch.cuda.set_device(args.local_rank)

    if Path(args.json_summary).exists():
        warnings.warn(
            f"json_summary {args.json_summary} will be overwritten unless you type ctrl-c."
        )

    if utils.distributed_utils.get_rank() == 0:
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    if parsed_args and verbose:
        print(f"parsed the following generate kwargs: {parsed_args}")

    Path(args.save_path).parent.mkdir(exist_ok=True)
    json_save_path = Path(args.save_path + "/tmp")
    Path(json_save_path).mkdir(exist_ok=True)  # this handles locking.

    if args.layers:
        num_layers = len(args.layers.split('-'))
    else:
        num_layers = None

    results, num_replicas, runtime_metrics = generate_summaries_or_translations(
        args.data_dir,
        json_save_path,
        args.model_path,
        args.config_path,
        batch_size=args.bs,
        device=args.device,
        fp16=args.fp16,
        task=args.task,
        prefix=args.prefix,
        eval_beams=args.eval_beams,
        max_source_length=args.max_source_length,
        max_target_length=args.max_target_length,
        eval_max_gen_length=args.eval_max_gen_length,
        n_obs=args.n_obs,
        type_path=args.type_path,
        num_return_sequences=args.num_return_sequences,
        distill=args.distill,
        num_layers=num_layers,
        do_encoder=args.do_encoder,
        do_decoder=args.do_decoder,
        **parsed_args,
    )

    if args.local_rank <= 0:
        save_path = Path(args.save_path)
        save_path.mkdir(exist_ok=True)
        partial_results = gather_results_from_each_node(
            num_replicas, json_save_path, args.sync_timeout)
        preds, time_list = combine_partial_results(partial_results)
        if args.num_return_sequences > 1:
            save_path = save_path.joinpath("pseudolabel_results.json")
            print(
                f"Saving aggregated results at {save_path}, intermediate in {json_save_path}/"
            )
            save_json(preds, save_path)
            return
        tgt_file = Path(args.data_dir).joinpath(args.type_path + ".target")
        labels = [x.rstrip() for x in open(tgt_file).readlines()][:len(preds)]

        # Calculate metrics, save metrics,  and save _generations.txt
        calc_bleu = "translation" in args.task
        score_fn = calculate_bleu if calc_bleu else calculate_rouge
        metric_name = "bleu" if calc_bleu else "rouge"
        metrics: Dict = score_fn(preds, labels)
        metrics["n_obs"] = len(preds)
        runtime = time.time() - start_time
        metrics["seconds_per_sample"] = round(runtime / metrics["n_obs"], 4)
        metrics["n_gpus"] = num_replicas
        metrics.update(runtime_metrics)

        time_list.sort()
        metrics["inference_latency_mean"] = np.mean(time_list)
        metrics["inference_latency_conf_50"] = max(
            time_list[:int(len(time_list) * 0.50)])
        metrics["inference_latency_conf_90"] = max(
            time_list[:int(len(time_list) * 0.90)])
        metrics["inference_latency_conf_95"] = max(
            time_list[:int(len(time_list) * 0.95)])
        metrics["inference_latency_conf_99"] = max(
            time_list[:int(len(time_list) * 0.99)])
        metrics["inference_latency_conf_100"] = max(
            time_list[:int(len(time_list) * 1)])
        metrics["inference_throughput_mean"] = len(preds) * 1.0 / sum(
            time_list)

        metrics_save_path = save_path.joinpath(
            f"{args.type_path}_{metric_name}.json")
        save_json(metrics, metrics_save_path, indent=None)
        dllogger.log(step=tuple(), data=metrics)
        print(metrics)
        write_txt_file(preds,
                       save_path.joinpath(f"{args.type_path}_generations.txt"))
        if args.debug:
            write_txt_file(labels,
                           save_path.joinpath(f"{args.type_path}.target"))
        else:
            shutil.rmtree(json_save_path)

    dllogger.flush()
Example #13
#!/usr/bin/env python3

import sys

from utils.cr_utils import get_sample
from utils.utils import save_json

sample = get_sample(10000, {'has-references': True})
save_json(sample, sys.argv[1])

references = [r for item in sample for r in item.get('reference', [])]
doi_publ = [r for r in references if r.get('doi-asserted-by') == 'publisher']
doi_cr_str = [
    r for r in references if r.get('doi-asserted-by') == 'crossref' and (
        'year' in r or 'author' in r)
]
doi_cr_uns = [
    r for r in references if r.get('doi-asserted-by') == 'crossref'
    and 'year' not in r and 'author' not in r
]
no_match_str = [
    r for r in references if 'DOI' not in r and ('year' in r or 'author' in r)
]
no_match_uns = [
    r for r in references
    if 'DOI' not in r and 'year' not in r and 'author' not in r
]

print(','.join([
    str(len(e) / len(references))
    for e in [doi_publ, doi_cr_uns, doi_cr_str, no_match_uns, no_match_str]
]))
Example #14
def save_dataset(ref_strings, file_path):
    save_json(ref_strings, file_path)
    logging.info('Dataset written to {}'.format(file_path))
Example #15
    with Pool(config.THREADS) as p:
        results = p.map(
            matcher.match,
            [r['reference']['unstructured'] for r in refs_unstructured])
    [
        d.update({'sbmv_unstructured': {
            'DOI': r[0],
            'score': r[1]
        }}) for d, r in zip(refs_unstructured, results)
    ]

    matcher = matching.openurl_query_matcher.Matcher()
    with Pool(config.THREADS) as p:
        results = p.map(matcher.match,
                        [r['reference'] for r in refs_structured])
    [d.update({'open_url': r[0]}) for d, r in zip(refs_structured, results)]

    matcher = matching.stq_matcher.Matcher()
    with Pool(config.THREADS) as p:
        results = p.map(
            matcher.match,
            [r['reference']['unstructured'] for r in refs_unstructured])
    [
        d.update({'simple_text_query': r[0]})
        for d, r in zip(refs_unstructured, results)
    ]

    [d.update({'gt': ''}) for d in data]

    save_json(data, args.output)
Example #16
#!/usr/bin/env python3

import matching.cr_search_validation_matcher
import sys

from evaluation.link_metrics import LinkMetricsResults
from multiprocessing import Pool
from utils.utils import read_json, save_json

dataset = read_json(sys.argv[1])['dataset']

matcher = matching.cr_search_validation_matcher.Matcher(0.4, 0.34, [])
with Pool(10) as p:
    results = p.map(matcher.match,
                    [item.get('ref_string') for item in dataset])

for item, target in zip(dataset, results):
    item['target_test']['DOI'] = target[0]
save_json(dataset, sys.argv[2])

link_results = LinkMetricsResults(dataset)
print(','.join([str(link_results.get(m))
                for m in ['precision', 'recall', 'F1']]))
Example #17
def main(ent_file, rel_file, abs_file, out_file, verbose=False):
    processed_data = process_dataset(ent_file, rel_file, abs_file, verbose)
    utils.save_json(out_file, processed_data)
Example #18
def save_sample_data(sample_data, file_path):
    save_json(sample_data, file_path)
    logging.info('Sample data written to {}'.format(file_path))
Example #19
        'Neutral': 2,
        'Positive': 3,
        'Extremely Positive': 4,
    }  # hard coding

    train_dic, dev_dic, test_dic = {}, {}, {}

    train_dic["texts"] = list(train["OriginalTweet"])
    train_dic["categories"] = [cat2idx[cat] for cat in train['Sentiment']]

    dev_dic["texts"] = list(dev["OriginalTweet"])
    dev_dic["categories"] = [cat2idx[cat] for cat in dev['Sentiment']]

    test_dic["texts"] = list(test["OriginalTweet"])
    test_dic["categories"] = [cat2idx[cat] for cat in test['Sentiment']]

    train_idx = list(train.index)
    dev_idx = list(dev.index)

    try:
        os.mkdir(f"{_data_root}/info")
    except FileExistsError:
        pass

    save_json(f"{_data_root}/train.json", train_dic)
    save_json(f"{_data_root}/dev.json", dev_dic)
    save_json(f"{_data_root}/test.json", test_dic)
    save_json(f"{_data_root}/info/cat2idx.json", cat2idx)
    save_json(f"{_data_root}/info/train_idx.json", train_idx)
    save_json(f"{_data_root}/info/dev_idx.json", dev_idx)