def _info(self):
    """Build the DatasetInfo: texts with noun-phrase spans, NP relations,
    coreference clusters and annotation metadata."""
    preposition_labels = [
        "about", "for", "with", "from", "among", "by", "on", "at",
        "during", "of", "member(s) of", "in", "after", "under", "to",
        "into", "before", "near", "outside", "around", "between",
        "against", "over", "inside",
    ]
    # A noun-phrase span, addressed both by character and token offsets.
    np_record = {
        "text": datasets.Value("string"),
        "first_char": datasets.Value("int32"),
        "last_char": datasets.Value("int32"),
        "first_token": datasets.Value("int32"),
        "last_token": datasets.Value("int32"),
        "id": datasets.Value("string"),
    }
    # A prepositional link between two noun phrases.
    relation_record = {
        "anchor": datasets.Value("string"),
        "complement": datasets.Value("string"),
        "preposition": datasets.features.ClassLabel(names=preposition_labels),
        "complement_coref_cluster_id": datasets.Value("string"),
    }
    # A coreference cluster and the ids of its member NPs.
    coref_record = {
        "id": datasets.Value("string"),
        "members": datasets.Sequence(datasets.Value("string")),
        "np_type": datasets.features.ClassLabel(
            names=["standard", "time/date/measurement", "idiomatic"]),
    }
    annotation_metadata = {
        "annotators": {
            "coref_worker": datasets.Value("int32"),
            "consolidator_worker": datasets.Value("int32"),
            "np-relations_worker": datasets.Sequence(datasets.Value("int32")),
        },
        "url": datasets.Value("string"),
        "source": datasets.Value("string"),
    }
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features({
            "id": datasets.Value("string"),
            "text": datasets.Value("string"),
            "tokens": datasets.Sequence(datasets.Value("string")),
            "nps": [np_record],
            "np_relations": [relation_record],
            "coref": [coref_record],
            "metadata": annotation_metadata,
        }),
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """Describe the token-classification schema: tokens with BIO NER tags."""
    # Entity categories; each one contributes a B- and an I- tag, in
    # exactly the order listed here.
    entity_types = [
        "PERSON", "ORG", "GPE", "LOC", "NAT_REL_POL", "EVENT",
        "LANGUAGE", "WORK_OF_ART", "DATETIME", "PERIOD", "MONEY",
        "QUANTITY", "NUMERIC", "ORDINAL", "FACILITY",
    ]
    bio_tags = ["O"]
    for entity in entity_types:
        bio_tags += ["B-" + entity, "I-" + entity]
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features({
            "id": datasets.Value("int32"),
            "tokens": datasets.Sequence(datasets.Value("string")),
            "ner_ids": datasets.Sequence(datasets.Value("int32")),
            "space_after": datasets.Sequence(datasets.Value("bool")),
            "ner_tags": datasets.Sequence(
                datasets.features.ClassLabel(names=bio_tags)),
        }),
        # No canonical (input, target) pair for as_supervised=True.
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """DatasetInfo: token sequences paired with Penn-Treebank-style POS tags."""
    # PTB tagset, including punctuation tags plus HYPH, -LRB-/-RRB- and
    # the combined NN|SYM tag.
    ptb_tags = [
        '"', "''", "#", "$", "(", ")", ",", ".", ":", "``",
        "CC", "CD", "DT", "EX", "FW", "HYPH", "IN", "JJ", "JJR",
        "JJS", "LS", "-LRB-", "MD", "NN", "NNP", "NNPS", "NNS",
        "NN|SYM", "PDT", "POS", "PRP", "PRP$", "-RRB-", "RB", "RBR",
        "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN",
        "VBP", "VBZ", "WDT", "WP", "WP$", "WRB",
    ]
    # "position" and "sid" exist in the raw data but are deliberately
    # not exposed in this schema.
    return datasets.DatasetInfo(
        features=datasets.Features({
            "tokens": datasets.Sequence(datasets.Value("string")),
            "pos_tags": datasets.Sequence(
                datasets.features.ClassLabel(names=ptb_tags)),
        }),
        supervised_keys=None,
    )
def _info(self):
    """DatasetInfo for per-game message sequences with deception labels."""

    def country_seq():
        # Fresh ClassLabel feature per column to avoid sharing instances.
        return datasets.Sequence(datasets.ClassLabel(names=_PLAYABLE_COUNTRIES))

    def int_seq():
        return datasets.Sequence(datasets.Value("int64"))

    features = datasets.Features({
        "messages": datasets.Sequence(datasets.Value("string")),
        # Sender's own truth/lie annotation per message.
        "sender_labels": datasets.Sequence(
            datasets.ClassLabel(names=["false", "true"])),
        # Receiver's perception; may be left unannotated.
        "receiver_labels": datasets.Sequence(
            datasets.ClassLabel(names=["false", "true", "noannotation"])),
        "speakers": country_seq(),
        "receivers": country_seq(),
        "absolute_message_index": int_seq(),
        "relative_message_index": int_seq(),
        "seasons": datasets.Sequence(datasets.ClassLabel(names=_SEASONS)),
        "years": datasets.Sequence(datasets.ClassLabel(names=_YEARS)),
        "game_score": datasets.Sequence(datasets.ClassLabel(names=_GAME_SCORE)),
        "game_score_delta": datasets.Sequence(
            datasets.ClassLabel(names=_GAME_SCORE_DELTA)),
        "players": country_seq(),
        "game_id": datasets.Value("int64"),
    })
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=None,
        homepage=_HOMEPAGE,
        citation=_CITATION,
    )
def _info(self):
    """DatasetInfo: Korean text with Sejong-style POS tags and NER tags."""
    pos_labels = [
        "SO", "SS", "VV", "XR", "VCP", "JC", "VCN", "JKB", "MM", "SP",
        "XSN", "SL", "NNP", "NP", "EP", "JKQ", "IC", "XSA", "EC", "EF",
        "SE", "XPN", "ETN", "SH", "XSV", "MAG", "SW", "ETM", "JKO",
        "NNB", "MAJ", "NNG", "JKV", "JKC", "VA", "NR", "JKG", "VX",
        "SF", "JX", "JKS", "SN",
    ]
    # NER scheme uses underscore-separated B_ tags and one shared I tag.
    ner_labels = ["I", "O", "B_OG", "B_TI", "B_LC", "B_DT", "B_PS"]
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features({
            "text": datasets.Value("string"),
            "annot_text": datasets.Value("string"),
            "tokens": datasets.Sequence(datasets.Value("string")),
            "pos_tags": datasets.Sequence(
                datasets.features.ClassLabel(names=pos_labels)),
            "ner_tags": datasets.Sequence(
                datasets.features.ClassLabel(names=ner_labels)),
        }),
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """DatasetInfo for CoNLL-2000 chunking: tokens, POS tags, chunk tags."""
    pos_labels = [
        "''", "#", "$", "(", ")", ",", ".", ":", "``", "CC", "CD",
        "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "MD", "NN", "NNP",
        "NNPS", "NNS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS",
        "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP",
        "VBZ", "WDT", "WP", "WP$", "WRB",
    ]
    # IOB2 chunk labels: each phrase type yields a B- and an I- tag.
    phrase_types = ["ADJP", "ADVP", "CONJP", "INTJ", "LST", "NP",
                    "PP", "PRT", "SBAR", "UCP", "VP"]
    chunk_labels = ["O"]
    for phrase in phrase_types:
        chunk_labels += ["B-" + phrase, "I-" + phrase]
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features({
            "id": datasets.Value("string"),
            "tokens": datasets.Sequence(datasets.Value("string")),
            "pos_tags": datasets.Sequence(
                datasets.features.ClassLabel(names=pos_labels)),
            "chunk_tags": datasets.Sequence(
                datasets.features.ClassLabel(names=chunk_labels)),
        }),
        supervised_keys=None,
        homepage="https://www.clips.uantwerpen.be/conll2000/chunking/",
        citation=_CITATION,
    )
def _info(self):
    """DatasetInfo: source/MT segments plus span-level and token-level
    MQM quality annotations."""
    # Character-span annotations over the MT output.
    span_annotation = {
        "segment_id": datasets.Sequence(datasets.Value("int32")),
        "annotation_start": datasets.Sequence(datasets.Value("int32")),
        "annotation_length": datasets.Sequence(datasets.Value("int32")),
        "severity": datasets.ClassLabel(names=["minor", "major", "critical"]),
        "severity_weight": datasets.Value("float32"),
        "category": datasets.ClassLabel(names=_ANNOTATION_CATEGORIES),
    }
    # Token-index annotations over the tokenized MT output.
    token_annotation = {
        "segment_id": datasets.Sequence(datasets.Value("int32")),
        "first_token": datasets.Sequence(datasets.Value("int32")),
        "last_token": datasets.Sequence(datasets.Value("int32")),
        "token_after_gap": datasets.Sequence(datasets.Value("int32")),
        "severity": datasets.ClassLabel(names=["minor", "major", "critical"]),
        "category": datasets.ClassLabel(names=_ANNOTATION_CATEGORIES),
    }
    features = datasets.Features({
        "document_id": datasets.Value("string"),
        "source_segments": datasets.Sequence(datasets.Value("string")),
        "source_tokenized": datasets.Sequence(datasets.Value("string")),
        "mt_segments": datasets.Sequence(datasets.Value("string")),
        "mt_tokenized": datasets.Sequence(datasets.Value("string")),
        "annotations": datasets.Sequence(span_annotation),
        "token_annotations": datasets.Sequence(token_annotation),
        # Triply-nested int index; exact semantics defined by the loader.
        "token_index": datasets.Sequence(
            datasets.Sequence(datasets.Sequence(datasets.Value("int32")))),
        "total_words": datasets.Value("int32"),
    })
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """Build the DatasetInfo for a WebNLG-style configuration.

    The English ("en") configuration carries one extra field per
    lexicalisation entry ("lexicalization"); everything else is shared
    between the configurations, so the common schema is built once
    instead of being duplicated branch-by-branch.
    """
    lex_fields = {
        "comment": datasets.Value("string"),
        "lid": datasets.Value("string"),
        "text": datasets.Value("string"),
        "template": datasets.Value("string"),
        "sorted_triple_sets": datasets.Sequence(datasets.Value("string")),
    }
    if self.config.name == "en":
        # Only present in the en version.
        lex_fields["lexicalization"] = datasets.Value("string")
    features = datasets.Features({
        "category": datasets.Value("string"),
        "size": datasets.Value("int32"),
        "eid": datasets.Value("string"),
        "original_triple_sets": datasets.Sequence(
            {"otriple_set": datasets.Sequence(datasets.Value("string"))}),
        "modified_triple_sets": datasets.Sequence(
            {"mtriple_set": datasets.Sequence(datasets.Value("string"))}),
        "shape": datasets.Value("string"),
        "shape_type": datasets.Value("string"),
        "lex": datasets.Sequence(lex_fields),
    })
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        # No canonical (input, target) pair for as_supervised=True.
        supervised_keys=None,
        homepage=_HOMEPAGE,
        citation=_CITATION,
        license=_LICENSE,
    )
def _info(self):
    """Build the DatasetInfo for one of the four configurations.

    The "dialogues" and "turns" configurations share an identical
    per-frame schema; it is produced by the nested helper below so the
    large structure is defined only once.
    """

    def frames_feature():
        # Per-frame annotation attached to an utterance.  Several parts
        # are optional in the raw data; original dictionaries were
        # flattened into Sequences of key/value fields.
        return datasets.Sequence({
            "service": datasets.Value("string"),
            "slots": datasets.Sequence({
                "slot": datasets.Value("string"),
                "start": datasets.Value("int32"),
                "exclusive_end": datasets.Value("int32"),
            }),
            # optional
            "state": {
                "active_intent": datasets.Value("string"),
                "requested_slots": datasets.Sequence(datasets.Value("string")),
                # slot_values was originally a dictionary
                "slot_values": datasets.Sequence({
                    "slot_name": datasets.Value("string"),
                    "slot_value_list": datasets.Sequence(datasets.Value("string")),
                }),
            },
            "actions": datasets.Sequence({
                "act": datasets.ClassLabel(names=_ALL_ACTS),
                # optional
                "slot": datasets.Value("string"),
                # optional
                "canonical_values": datasets.Sequence(datasets.Value("string")),
                # optional
                "values": datasets.Sequence(datasets.Value("string")),
            }),
            # optional.  Arrow doesn't like Sequences of Sequences for
            # default values so we need a Sequence of Features of Sequences.
            "service_results": datasets.Sequence({
                "service_results_list": datasets.Sequence(
                    # originally each list item was a dictionary (optional)
                    {
                        "service_slot_name": datasets.Value("string"),
                        "service_canonical_value": datasets.Value("string"),
                    }),
            }),
            # optional
            "service_call": {
                "method": datasets.Value("string"),
                # parameters was originally a dictionary
                "parameters": datasets.Sequence({
                    "parameter_slot_name": datasets.Value("string"),
                    "parameter_canonical_value": datasets.Value("string"),
                }),
            },
        })

    if self.config.name == "slot_description":
        # Service schema descriptions rather than dialogue data.
        features = datasets.Features({
            "service_name": datasets.Value("string"),
            "description": datasets.Value("string"),
            "slots": datasets.Sequence({
                "name": datasets.Value("string"),
                "description": datasets.Value("string"),
                "is_categorical": datasets.Value("bool"),
                "possible_values": datasets.Sequence(datasets.Value("string")),
            }),
            "intents": datasets.Sequence({
                "name": datasets.Value("string"),
                "description": datasets.Value("string"),
                "is_transactional": datasets.Value("bool"),
                "required_slots": datasets.Sequence(datasets.Value("string")),
                # optional_slots was originally a dictionary
                "optional_slots": datasets.Sequence({
                    "slot_name": datasets.Value("string"),
                    "slot_value": datasets.Value("string"),
                }),
                "result_slots": datasets.Sequence(datasets.Value("string")),
            }),
        })
    elif self.config.name == "dialogues":
        # One example per dialogue; turns nested inside.
        features = datasets.Features({
            "dialogue_id": datasets.Value("string"),
            "services": datasets.Sequence(datasets.Value("string")),
            "turns": datasets.Sequence({
                "speaker": datasets.ClassLabel(names=["USER", "SYSTEM"]),
                "utterance": datasets.Value("string"),
                "frames": frames_feature(),
            }),
        })
    elif self.config.name == "turns":
        # One example per turn; same frame schema as "dialogues".
        features = datasets.Features({
            "dialogue_id": datasets.Value("string"),
            "services": datasets.Sequence(datasets.Value("string")),
            "speaker": datasets.ClassLabel(names=["USER", "SYSTEM"]),
            "utterance": datasets.Value("string"),
            "frames": frames_feature(),
        })
    elif self.config.name == "slots":
        # Flattened slot-filling view with concatenated history strings.
        features = datasets.Features({
            "dialogue_id": datasets.Value("string"),
            "services": datasets.Sequence(datasets.Value("string")),
            "speaker": datasets.ClassLabel(names=["USER", "SYSTEM"]),
            "utterance": datasets.Value("string"),
            "history": datasets.Value("string"),
            "name": datasets.Value("string"),
            "description": datasets.Value("string"),
            "value": datasets.Value("string"),
            "service+description+history": datasets.Value("string"),
        })
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,  # defined above; differs per configuration
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """Build the DatasetInfo for the selected XGLUE task configuration."""

    def text():
        # Fresh string feature per column.
        return datasets.Value("string")

    name = self.config.name
    if name == "ner":
        features = {
            "words": datasets.Sequence(text()),
            "ner": datasets.Sequence(datasets.features.ClassLabel(names=[
                "O", "B-PER", "I-PER", "B-ORG", "I-ORG",
                "B-LOC", "I-LOC", "B-MISC", "I-MISC",
            ])),
        }
    elif name == "pos":
        features = {
            "words": datasets.Sequence(text()),
            "pos": datasets.Sequence(datasets.features.ClassLabel(names=[
                "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ",
                "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT",
                "SCONJ", "SYM", "VERB", "X",
            ])),
        }
    elif name == "mlqa":
        # Extractive QA: answers carry start offsets and answer strings.
        features = {
            "context": text(),
            "question": text(),
            "answers": datasets.features.Sequence({
                "answer_start": datasets.Value("int32"),
                "text": text(),
            }),
        }
    elif name == "nc":
        features = {
            "news_title": text(),
            "news_body": text(),
            "news_category": datasets.ClassLabel(names=[
                "foodanddrink", "sports", "travel", "finance", "lifestyle",
                "news", "entertainment", "health", "video", "autos",
            ]),
        }
    elif name == "xnli":
        features = {
            "premise": text(),
            "hypothesis": text(),
            "label": datasets.features.ClassLabel(
                names=["entailment", "neutral", "contradiction"]),
        }
    elif name == "paws-x":
        features = {
            "sentence1": text(),
            "sentence2": text(),
            "label": datasets.features.ClassLabel(names=["different", "same"]),
        }
    elif name == "qadsm":
        features = {
            "query": text(),
            "ad_title": text(),
            "ad_description": text(),
            "relevance_label": datasets.features.ClassLabel(names=["Bad", "Good"]),
        }
    elif name == "wpr":
        # NOTE: "relavance_label" spelling is kept — it is the released
        # column name and must not be corrected here.
        features = {
            "query": text(),
            "web_page_title": text(),
            "web_page_snippet": text(),
            "relavance_label": datasets.features.ClassLabel(
                names=["Bad", "Fair", "Good", "Excellent", "Perfect"]),
        }
    elif name == "qam":
        features = {
            "question": text(),
            "answer": text(),
            "label": datasets.features.ClassLabel(names=["False", "True"]),
        }
    elif name == "qg":
        features = {"answer_passage": text(), "question": text()}
    elif name == "ntg":
        features = {"news_body": text(), "news_title": text()}
    return datasets.DatasetInfo(
        description=_XGLUE_DESCRIPTION,
        features=datasets.Features(features),
        homepage=self.config.url,
        citation=self.config.citation + "\n" + _XGLUE_CITATION,
    )
def _info(self):
    """DatasetInfo for the SOFC-Exp-style corpus: sentence and token
    offsets, nested BIO entity/slot tags, span links, spans and
    experiment frames."""
    # BIO entity tags, grouped as all B-*, then all I-*, then O
    # (this grouping matches the released label order).
    entity_types = ["DEVICE", "EXPERIMENT", "MATERIAL", "VALUE"]
    entity_bio = (["B-" + t for t in entity_types]
                  + ["I-" + t for t in entity_types]
                  + ["O"])
    slot_types = [
        "anode_material", "cathode_material", "conductivity",
        "current_density", "degradation_rate", "device",
        "electrolyte_material", "experiment_evoking_word", "fuel_used",
        "interlayer_material", "interconnect_material",
        "open_circuit_voltage", "power_density", "resistance",
        "support_material", "thickness", "time_of_operation", "voltage",
        "working_temperature",
    ]
    slot_bio = (["B-" + s for s in slot_types]
                + ["I-" + s for s in slot_types]
                + ["O"])
    # Participant labels allowed in the top-level "slots" feature
    # (no conductivity / thickness / experiment_evoking_word here).
    frame_participants = [
        "anode_material", "cathode_material", "current_density",
        "degradation_rate", "device", "electrolyte_material", "fuel_used",
        "interlayer_material", "open_circuit_voltage", "power_density",
        "resistance", "support_material", "time_of_operation", "voltage",
        "working_temperature",
    ]
    # Participants inside "experiments" additionally include
    # "conductivity" — the two lists genuinely differ in the original.
    experiment_participants = [
        "anode_material", "cathode_material", "current_density",
        "degradation_rate", "conductivity", "device",
        "electrolyte_material", "fuel_used", "interlayer_material",
        "open_circuit_voltage", "power_density", "resistance",
        "support_material", "time_of_operation", "voltage",
        "working_temperature",
    ]

    def offset_pair():
        # Fresh begin/end character-offset record.
        return {
            "begin_char_offset": datasets.Value("int64"),
            "end_char_offset": datasets.Value("int64"),
        }

    features = datasets.Features({
        "text": datasets.Value("string"),
        "sentence_offsets": datasets.features.Sequence(offset_pair()),
        "sentences": datasets.features.Sequence(datasets.Value("string")),
        "sentence_labels": datasets.features.Sequence(datasets.Value("int64")),
        "token_offsets": datasets.features.Sequence(
            {"offsets": datasets.features.Sequence(offset_pair())}),
        "tokens": datasets.features.Sequence(
            datasets.features.Sequence(datasets.Value("string"))),
        "entity_labels": datasets.features.Sequence(
            datasets.features.Sequence(
                datasets.features.ClassLabel(names=entity_bio))),
        "slot_labels": datasets.features.Sequence(
            datasets.features.Sequence(
                datasets.features.ClassLabel(names=slot_bio))),
        "links": datasets.Sequence({
            "relation_label": datasets.features.ClassLabel(names=[
                "coreference", "experiment_variation",
                "same_experiment", "thickness"]),
            "start_span_id": datasets.Value("int64"),
            "end_span_id": datasets.Value("int64"),
        }),
        "slots": datasets.features.Sequence({
            "frame_participant_label": datasets.features.ClassLabel(
                names=frame_participants),
            "slot_id": datasets.Value("int64"),
        }),
        "spans": datasets.features.Sequence({
            "span_id": datasets.Value("int64"),
            # Empty string marks an unlabeled span.
            "entity_label": datasets.features.ClassLabel(
                names=["", "DEVICE", "MATERIAL", "VALUE"]),
            "sentence_id": datasets.Value("int64"),
            "experiment_mention_type": datasets.features.ClassLabel(
                names=["", "current_exp", "future_work",
                       "general_info", "previous_work"]),
            "begin_char_offset": datasets.Value("int64"),
            "end_char_offset": datasets.Value("int64"),
        }),
        "experiments": datasets.features.Sequence({
            "experiment_id": datasets.Value("int64"),
            "span_id": datasets.Value("int64"),
            "slots": datasets.features.Sequence({
                "frame_participant_label": datasets.features.ClassLabel(
                    names=experiment_participants),
                "slot_id": datasets.Value("int64"),
            }),
        }),
    })
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """DatasetInfo for GermEval 2014 NER: tokens with outer and nested
    BIO tags over one shared label inventory.

    The two tag columns used identical, hand-written 25-entry label
    lists; the list is now generated once (O, then B-/I- pairs for each
    base class and its -deriv / -part variants, in the original order).
    """
    label_names = ["O"]
    for base in ("LOC", "ORG", "OTH", "PER"):
        for suffix in ("", "deriv", "part"):
            label_names += ["B-" + base + suffix, "I-" + base + suffix]
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features({
            "id": datasets.Value("string"),
            "source": datasets.Value("string"),
            "tokens": datasets.Sequence(datasets.Value("string")),
            # First-level (outer) entity annotation.
            "ner_tags": datasets.Sequence(
                datasets.features.ClassLabel(names=list(label_names))),
            # Second-level annotation for entities nested inside others.
            "nested_ner_tags": datasets.Sequence(
                datasets.features.ClassLabel(names=list(label_names))),
        }),
        supervised_keys=None,
        homepage="https://sites.google.com/site/germeval2014ner/",
        citation=_CITATION,
    )
def _info(self):
    """Build the DatasetInfo for the selected configuration.

    "experiments" mode is a simple (question, candidate, label)
    classification schema.  The other two schemas share their
    article/topic/question-type columns — previously written out twice —
    and differ only in how candidate answers are stored.
    """
    if self.config.mode == "experiments":
        features = datasets.Features({
            "question": datasets.Value("string"),
            "candidate": datasets.Value("string"),
            "label": datasets.ClassLabel(names=["0", "1"]),
        })
    else:
        def topic_feature():
            return datasets.ClassLabel(names=[
                "MUSIC", "TV", "TRAVEL", "ART", "SPORT", "COUNTRY",
                "MOVIES", "HISTORICAL EVENTS", "SCIENCE", "FOOD",
            ])

        def q_types_feature():
            # Question word per question; empty string when untyped.
            return datasets.Sequence(datasets.ClassLabel(names=[
                "what", "why", "when", "who", "where", "how", ""]))

        # Columns common to both non-experiment schemas.
        common = {
            "section": datasets.Value("string"),
            "question": datasets.Value("string"),
            "article": datasets.Value("string"),
            "is_paraphrase": datasets.Value("bool"),
            "topic": topic_feature(),
        }
        if self.config.type_ == "answer_selection":
            features = datasets.Features(dict(
                common,
                answers=datasets.Sequence(datasets.Value("int32")),
                candidates=datasets.Sequence(datasets.Value("string")),
                q_types=q_types_feature(),
            ))
        else:
            features = datasets.Features(dict(
                common,
                q_types=q_types_feature(),
                candidate_list=datasets.Sequence({
                    "article": datasets.Value("string"),
                    "section": datasets.Value("string"),
                    "candidates": datasets.Sequence(datasets.Value("string")),
                    "answers": datasets.Sequence(datasets.Value("int32")),
                }),
            ))
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def benchmark_iterating():
    """Benchmark iteration speed over a generated dataset, before and
    after shuffling, and write the timings to RESULTS_FILE_PATH as JSON.

    Fix: the "after shuffling" progress message had an unbalanced
    parenthesis ("(after shuffling"); it now reads correctly.
    """
    times = {"num examples": SPEED_TEST_N_EXAMPLES}
    # (function, kwargs) pairs timed on the freshly generated dataset.
    functions = [
        (read, {"length": SMALL_TEST}),
        (read, {"length": SPEED_TEST_N_EXAMPLES}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 10}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 100}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 1_000}),
        (read_formatted, {"type": "numpy", "length": SMALL_TEST}),
        (read_formatted, {"type": "pandas", "length": SMALL_TEST}),
        (read_formatted, {"type": "torch", "length": SMALL_TEST}),
        (read_formatted, {"type": "tensorflow", "length": SMALL_TEST}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 10}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 1_000}),
    ]
    # Subset re-timed after shuffling.
    functions_shuffled = [
        (read, {"length": SMALL_TEST}),
        (read, {"length": SPEED_TEST_N_EXAMPLES}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 10}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 100}),
        (read_batch, {"length": SPEED_TEST_N_EXAMPLES, "batch_size": 1_000}),
        (read_formatted, {"type": "numpy", "length": SMALL_TEST}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 10}),
        (read_formatted_batch, {"type": "numpy", "length": SMALL_TEST, "batch_size": 1_000}),
    ]
    with tempfile.TemporaryDirectory() as tmp_dir:
        print("generating dataset")
        features = datasets.Features({
            "list": datasets.Sequence(datasets.Value("float32")),
            "numbers": datasets.Value("float32"),
        })
        dataset = generate_example_dataset(
            os.path.join(tmp_dir, "dataset.arrow"),
            features,
            num_examples=SPEED_TEST_N_EXAMPLES,
            seq_shapes={"list": (100,)},
        )
        print("first set of iterations")
        for func, kwargs in functions:
            print(func.__name__, str(kwargs))
            times[func.__name__ + " " + " ".join(str(v) for v in kwargs.values())] = func(
                dataset, **kwargs)
        print("shuffling dataset")
        dataset = dataset.shuffle()
        print("Second set of iterations (after shuffling)")
        for func, kwargs in functions_shuffled:
            print("shuffled ", func.__name__, str(kwargs))
            times["shuffled " + func.__name__ + " " + " ".join(str(v) for v in kwargs.values())] = func(
                dataset, **kwargs)
    with open(RESULTS_FILE_PATH, "wb") as f:
        f.write(json.dumps(times).encode("utf-8"))
def _info(self):
    """Build the DatasetInfo for the selected XTREME task.

    Most tasks expose plain string columns derived from the config's
    text features; QA tasks add an answers structure, while udpos and
    PAN-X replace the schema entirely with token-classification features.
    """
    # Default: one string column per configured text feature.
    features = {
        key: datasets.Value("string")
        for key in six.iterkeys(self.config.text_features)
    }
    if "answers" in features:
        # Extractive-QA answers: parallel start offsets and texts.
        features["answers"] = datasets.features.Sequence({
            "answer_start": datasets.Value("int32"),
            "text": datasets.Value("string"),
        })
    if self.config.name.startswith("PAWS-X"):
        features["label"] = datasets.Value("string")
    if self.config.name == "XNLI":
        features["gold_label"] = datasets.Value("string")
    if self.config.name.startswith("udpos"):
        upos_labels = [
            "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN",
            "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM",
            "VERB", "X",
        ]
        features = datasets.Features({
            "token": datasets.Value("string"),
            "pos_tag": datasets.features.ClassLabel(names=upos_labels),
        })
    if self.config.name.startswith("PAN-X"):
        features = datasets.Features({
            "tokens": datasets.Sequence(datasets.Value("string")),
            "ner_tags": datasets.Sequence(datasets.features.ClassLabel(names=[
                "O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC",
            ])),
            "langs": datasets.Sequence(datasets.Value("string")),
        })
    return datasets.DatasetInfo(
        description=self.config.description + "\n" + _DESCRIPTION,
        features=datasets.Features(features),
        # No canonical (input, target) pair for as_supervised=True.
        supervised_keys=None,
        homepage="https://github.com/google-research/xtreme" + "\t" + self.config.url,
        citation=self.config.citation + "\n" + _CITATION,
    )
def _info(self):
    """Build the DatasetInfo for the ADE-corpus configurations.

    The two relation-extraction configurations (drug/effect and
    drug/dosage) were structurally identical and differed only in the
    name of the second entity, so the schema is built by one helper.
    """

    def relation_features(entity):
        # text + drug + <entity>, with character index spans for both.
        def char_spans():
            return datasets.Sequence({
                "start_char": datasets.Value("int32"),
                "end_char": datasets.Value("int32"),
            })

        return datasets.Features({
            "text": datasets.Value("string"),
            "drug": datasets.Value("string"),
            entity: datasets.Value("string"),
            "indexes": {
                "drug": char_spans(),
                entity: char_spans(),
            },
        })

    if self.config.name == configs["classification"]:
        features = datasets.Features({
            "text": datasets.Value("string"),
            "label": datasets.features.ClassLabel(names=["Not-Related", "Related"]),
        })
    if self.config.name == configs["RE_ade"]:
        features = relation_features("effect")
    if self.config.name == configs["RE_dosage"]:
        features = relation_features("dosage")
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=None,
        homepage="https://www.sciencedirect.com/science/article/pii/S1532046412000615",
        citation=_CITATION,
    )
def _info(self):
    """DatasetInfo: UD-style annotations (POS, morphology, dependencies)
    plus BIO NER tags per sentence."""
    upos_labels = [
        "NUM", "CCONJ", "PRON", "VERB", "INTJ", "AUX", "ADJ", "PROPN",
        "PART", "ADV", "PUNCT", "ADP", "NOUN", "X", "DET", "SYM", "SCONJ",
    ]
    dependency_labels = [
        "parataxis", "mark", "nummod", "discourse", "compound:prt",
        "reparandum", "vocative", "list", "obj", "dep", "det", "obl:loc",
        "flat", "iobj", "cop", "expl", "obl", "conj", "nmod", "root",
        "acl:relcl", "goeswith", "appos", "fixed", "obl:tmod", "xcomp",
        "advmod", "nmod:poss", "aux", "ccomp", "amod", "cc", "advcl",
        "nsubj", "punct", "case",
    ]
    ner_labels = [
        "O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC",
        "B-MISC", "I-MISC",
    ]
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features({
            "sent_id": datasets.Value("string"),
            "text": datasets.Value("string"),
            "tok_ids": datasets.Sequence(datasets.Value("int64")),
            "tokens": datasets.Sequence(datasets.Value("string")),
            "lemmas": datasets.Sequence(datasets.Value("string")),
            "pos_tags": datasets.Sequence(
                datasets.features.ClassLabel(names=upos_labels)),
            "morph_tags": datasets.Sequence(datasets.Value("string")),
            # NOTE(review): presumably dependency-head token ids — verify
            # against the generator before relying on this.
            "dep_ids": datasets.Sequence(datasets.Value("int64")),
            "dep_labels": datasets.Sequence(
                datasets.ClassLabel(names=dependency_labels)),
            "ner_tags": datasets.Sequence(
                datasets.features.ClassLabel(names=ner_labels)),
        }),
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """Token-level schema for a German CoNLL-2003-style corpus: STTS
    part-of-speech tags, syntactic chunk tags and BIO NER tags."""
    # STTS tagset, see
    # https://www.ims.uni-stuttgart.de/forschung/ressourcen/lexika/germantagsets/
    stts_names = [
        "ADJA", "ADJD", "ADV", "APPR", "APPRART", "APPO", "APZR", "ART",
        "CARD", "FM", "ITJ", "KOUI", "KOUS", "KON", "KOKOM", "NN", "NE",
        "PDS", "PDAT", "PIS", "PIAT", "PIDAT", "PPER", "PPOSS", "PPOSAT",
        "PRELS", "PRELAT", "PRF", "PWS", "PWAT", "PWAV", "PAV", "PTKZU",
        "PTKNEG", "PTKVZ", "PTKANT", "PTKA", "TRUNC", "VVFIN", "VVIMP",
        "VVINF", "VVIZU", "VVPP", "VAFIN", "VAIMP", "VAINF", "VAPP",
        "VMFIN", "VMINF", "VMPP", "XY", "$,", "$.", "$(",
    ]
    # Order kept verbatim — the trailing NC/PC/VC entries really do list
    # I- before B-, and the integer ids must not change.
    chunk_names = [
        "O", "B-ADJP", "I-ADJP", "B-ADVP", "I-ADVP", "B-CONJP", "I-CONJP",
        "B-INTJ", "I-INTJ", "B-LST", "I-LST", "B-NP", "I-NP", "B-PP",
        "I-PP", "B-PRT", "I-PRT", "B-SBAR", "I-SBAR", "B-UCP", "I-UCP",
        "B-VP", "I-VP", "I-NC", "B-NC", "I-PC", "B-PC", "I-VC", "B-VC",
    ]
    # Includes the non-standard "[PAD]" and "X" entries used by this corpus.
    ner_names = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH",
    ]
    features = datasets.Features(
        {
            "id": datasets.Value("string"),
            "tokens": datasets.Sequence(datasets.Value("string")),
            "pos_tags": datasets.Sequence(datasets.features.ClassLabel(names=stts_names)),
            "chunk_tags": datasets.Sequence(datasets.features.ClassLabel(names=chunk_names)),
            "ner_tags": datasets.Sequence(datasets.features.ClassLabel(names=ner_names)),
        }
    )
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=None,
        homepage="https://www.aclweb.org/anthology/W03-0419/",
        citation=_CITATION,
    )
def _info(self):
    """Build the feature schema for the selected FLUE task.

    CLS/XNLI are text classification with a categorical label; WSD-V is
    token-level verb sense disambiguation; every other task gets plain
    text columns with an int32 label.

    Returns:
        datasets.DatasetInfo: schema plus per-config homepage/citation.
    """
    if self.config.name in ("CLS", "XNLI"):
        # Plain-text columns declared by the config, plus a class label.
        # Dicts are iterated directly instead of via the Py2-era
        # six.iterkeys helper — identical behavior on Python 3.
        features = {
            text_feature: datasets.Value("string")
            for text_feature in self.config.text_features
        }
        features[self.config.label_column] = datasets.features.ClassLabel(names=self.config.label_classes)
        features["idx"] = datasets.Value("int32")
    elif self.config.name == "WSD-V":
        # Token-level task: every text feature is a sequence of tokens.
        features = {
            text_feature: datasets.Sequence(datasets.Value("string"))
            for text_feature in self.config.text_features
        }
        # Label order fixes each tag's integer id — do not reorder.
        features["fine_pos_tags"] = datasets.Sequence(
            datasets.features.ClassLabel(
                names=[
                    "DET", "P+D", "CC", "VS", "P", "CS", "NC", "NPP",
                    "ADJWH", "VINF", "VPP", "ADVWH", "PRO", "V", "CLO",
                    "PREF", "VPR", "PROREL", "ADV", "PROWH", "N", "DETWH",
                    "ADJ", "P+PRO", "ET", "VIMP", "CLS", "PONCT", "I", "CLR",
                ]
            )
        )
        features["pos_tags"] = datasets.Sequence(
            datasets.features.ClassLabel(
                names=[
                    "V", "PREF", "P+D", "I", "A", "P+PRO", "PRO", "P",
                    "anonyme", "D", "C", "CL", "ET", "PONCT", "ADV", "N",
                ]
            )
        )
        features["disambiguate_tokens_ids"] = datasets.Sequence(datasets.Value("int32"))
        features["disambiguate_labels"] = datasets.Sequence(datasets.Value("string"))
        features["idx"] = datasets.Value("string")
    else:
        # Remaining tasks: raw text columns with an integer label.
        features = {
            text_feature: datasets.Value("string")
            for text_feature in self.config.text_features
        }
        features[self.config.label_column] = datasets.Value("int32")
        features["idx"] = datasets.Value("int32")
    return datasets.DatasetInfo(
        description=_FLUE_DESCRIPTION,
        features=datasets.Features(features),
        homepage=self.config.url,
        citation=self.config.citation + "\n" + _FLUE_CITATION,
    )
def _info(self):
    """Declare the (partial) PubMed/MEDLINE citation schema.

    Several fields present in the raw XML are deliberately omitted
    because their formatting is too inconsistent across records — the
    commented-out entries below record what was left out and why.  The
    built schema is also handed to ``self.fill_keys_from_features``,
    presumably to cache the expected XML keys for parsing; confirm
    against the rest of the file.
    """
    # Common date record (reused for DateCompleted, DateRevised and History).
    Date = {
        "Year": datasets.Value("int32"),
        "Month": datasets.Value("int32"),
        "Day": datasets.Value("int32"),
    }
    # MeSH descriptor/qualifier pair.
    MeshHeading = {
        "DescriptorName": datasets.Value("string"),
        "QualifierName": datasets.Value("string")
    }
    MedlineJournalInfo = {
        "Country": datasets.Value("string"),
        # Too inconsistent
        # 'MedlineTA': datasets.Value('string'),
        # 'NlmUniqueID': datasets.Value('string'),
        # 'ISSNLinking': datasets.Value('string'),
    }
    Chemical = {
        "RegistryNumber": datasets.Value("string"),
        "NameOfSubstance": datasets.Value("string"),
    }
    # Too inconsistent in the data to be used
    # Journal = {
    #     'ISSN': datasets.Value('string'),
    #     'JournalIssue': {
    #         'Volume': datasets.Value('string'),
    #         'Issue': datasets.Value('string'),
    #     },
    #     # 'PubDate': Date,
    #     'Title': datasets.Value('string'),
    #     'ISOAbbreviation': datasets.Value('string')
    # }
    Author = {
        "LastName": datasets.Value("string"),
        "ForeName": datasets.Value("string"),
        "Initials": datasets.Value("string"),
        "CollectiveName": datasets.Value("string"),
    }
    Reference = {
        "Citation": datasets.Value("string"),
        "CitationId": datasets.Value("int32"),
    }
    Grant = {
        "GrantID": datasets.Value("string"),
        "Agency": datasets.Value("string"),
        "Country": datasets.Value("string"),
    }
    Article = {
        # 'Journal': Journal,
        "Abstract": {
            "AbstractText": datasets.Value("string")
        },
        "ArticleTitle": datasets.Value("string"),
        # Too inconistent
        # 'Pagination': {'MedlinePgn': datasets.Value('string')},
        "AuthorList": {
            "Author": datasets.Sequence(Author)
        },
        "Language": datasets.Value("string"),
        "GrantList": {
            "Grant": datasets.Sequence(Grant),
        },
        "PublicationTypeList": {
            "PublicationType": datasets.Sequence(datasets.Value("string"))
        },
    }
    features = datasets.Features({
        "MedlineCitation": {
            "PMID": datasets.Value("int32"),
            "DateCompleted": Date,
            "NumberOfReferences": datasets.Value("int32"),
            "DateRevised": Date,
            "Article": Article,
            "MedlineJournalInfo": MedlineJournalInfo,
            "ChemicalList": {
                "Chemical": datasets.Sequence(Chemical)
            },
            "CitationSubset": datasets.Value("string"),
            "MeshHeadingList": {
                "MeshHeading": datasets.Sequence(MeshHeading),
            },
        },
        "PubmedData": {
            "ArticleIdList": datasets.Sequence(
                {"ArticleId": datasets.Sequence(datasets.Value("string"))}),
            "PublicationStatus": datasets.Value("string"),
            "History": {
                "PubMedPubDate": datasets.Sequence(Date)
            },
            "ReferenceList": datasets.Sequence(Reference),
        },
    })
    # Cache the expected keys on the builder for later use during parsing.
    self.fill_keys_from_features(features)
    return datasets.DatasetInfo(
        # This is the description that will appear on the datasets page.
        description=_DESCRIPTION,
        # This defines the different columns of the dataset and their types
        features= features,  # Here we define them above because they are different between the two configurations
        # If there's a common (input, target) tuple from the features,
        # specify them here. They'll be used if as_supervised=True in
        # builder.as_dataset.
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage=_HOMEPAGE,
        # License for the dataset if available
        license=_LICENSE,
        # Citation for the dataset
        citation=_CITATION,
    )
def _info(self):
    """Schema for the NER corpus; the tag inventory depends on the
    configuration suffix (-7 and -8 drop/merge the GPE entity classes)."""
    if self.config.name.endswith("-7"):
        entity_types = ["PER", "ORG", "PROD", "LOC", "DRV", "EVT", "MISC"]
    elif self.config.name.endswith("-8"):
        entity_types = ["PER", "ORG", "PROD", "LOC", "GPE", "DRV", "EVT", "MISC"]
    else:
        entity_types = [
            "PER", "ORG", "GPE_LOC", "PROD", "LOC", "GPE_ORG", "DRV", "EVT", "MISC",
        ]
    # BIO scheme: "O" first, then a B-/I- pair per type; order fixes the ids.
    ner_names = ["O"] + [f"{prefix}-{etype}" for etype in entity_types for prefix in ("B", "I")]
    ner_tags = datasets.Sequence(datasets.features.ClassLabel(names=ner_names))
    pos_names = [
        "NOUN", "PUNCT", "ADP", "NUM", "SYM", "SCONJ", "ADJ", "PART", "DET",
        "CCONJ", "PROPN", "PRON", "X", "ADV", "INTJ", "VERB", "AUX",
    ]
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(
            {
                "idx": datasets.Value("string"),
                "lang": datasets.Value("string"),
                "text": datasets.Value("string"),
                "tokens": datasets.Sequence(datasets.Value("string")),
                "lemmas": datasets.Sequence(datasets.Value("string")),
                "pos_tags": datasets.Sequence(datasets.features.ClassLabel(names=pos_names)),
                "ner_tags": ner_tags,
            }
        ),
        supervised_keys=None,
        homepage=_HOMEPAGE,
        citation=_CITATION,
    )
def _info(self):
    """Token-level NER schema with domain-specific entity classes
    (locations, organizations, times, event triggers, …) in BIO format."""
    entity_types = [
        "DATE", "DISASTER_TYPE", "DISTANCE", "DURATION", "LOCATION",
        "LOCATION_CITY", "LOCATION_ROUTE", "LOCATION_STOP", "LOCATION_STREET",
        "NUMBER", "ORGANIZATION", "ORGANIZATION_COMPANY", "ORG_POSITION",
        "PERSON", "TIME", "TRIGGER",
    ]
    # "O" first, then B-/I- pairs per type; list order fixes the label ids.
    ner_names = ["O"] + [f"{prefix}-{etype}" for etype in entity_types for prefix in ("B", "I")]
    features = datasets.Features(
        {
            "id": datasets.Value("string"),
            "tokens": datasets.Sequence(datasets.Value("string")),
            "ner_tags": datasets.Sequence(datasets.features.ClassLabel(names=ner_names)),
        }
    )
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        # No canonical (input, target) pair for token classification.
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """Schema depends on the configuration: manually labeled sentence
    alignments, the auto_acl sentence pairs, or the full auto-aligned
    article dump with paragraph/sentence alignment tables."""
    if self.config.name == "manual":
        # Human-annotated alignment judgments for sentence pairs.
        features = datasets.Features(
            {
                "alignment_label": datasets.ClassLabel(names=["notAligned", "aligned"]),
                "normal_sentence_id": datasets.Value("string"),
                "simple_sentence_id": datasets.Value("string"),
                "normal_sentence": datasets.Value("string"),
                "simple_sentence": datasets.Value("string"),
            }
        )
    elif self.config.name == "auto_acl":
        # Bare sentence pairs, no ids or labels.
        features = datasets.Features(
            {
                "normal_sentence": datasets.Value("string"),
                "simple_sentence": datasets.Value("string"),
            }
        )
    else:

        def article_side(side):
            # Per-article metadata plus its sentence list, for one side
            # ("normal" or "simple") of the alignment.
            return {
                f"{side}_article_id": datasets.Value("int32"),
                f"{side}_article_title": datasets.Value("string"),
                f"{side}_article_url": datasets.Value("string"),
                f"{side}_article_content": datasets.Sequence(
                    {
                        f"{side}_sentence_id": datasets.Value("string"),
                        f"{side}_sentence": datasets.Value("string"),
                    }
                ),
            }

        def alignment_pairs(level):
            # Aligned (normal id, simple id) pairs at the given level
            # ("paragraph" or "sentence").
            return datasets.Sequence(
                {
                    f"normal_{level}_id": datasets.Value("string"),
                    f"simple_{level}_id": datasets.Value("string"),
                }
            )

        features = datasets.Features(
            {
                "example_id": datasets.Value("string"),
                "normal": article_side("normal"),
                "simple": article_side("simple"),
                "paragraph_alignment": alignment_pairs("paragraph"),
                "sentence_alignment": alignment_pairs("sentence"),
            }
        )
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=None,
        homepage="https://github.com/chaojiang06/wiki-auto",
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """Minimal schema: a single feature "a", a sequence of records with
    one string field "b"."""
    schema = datasets.Features({"a": datasets.Sequence({"b": datasets.Value("string")})})
    return datasets.DatasetInfo(
        features=schema,
        # No default supervised_keys.
        supervised_keys=None,
    )
def _info(self):
    """Schema for a token-level NER corpus with a large fine-grained
    (48-type) BIO tag inventory."""
    entity_types = [
        "academic", "academic_person", "aircraft", "album_person", "anatomy",
        "animal", "architect_person", "capital", "chemical", "clothes",
        "country", "culture", "currency", "date", "food", "genre",
        "government", "government_person", "language", "location",
        "material", "measure", "medical", "military", "military_person",
        "nation", "newspaper", "organization", "organization_person",
        "person", "production_art_music", "production_art_music_person",
        "quantity", "religion", "science", "shape", "ship", "software",
        "space", "space_person", "sport", "sport_name", "sport_person",
        "structure", "subject", "tech", "train", "vehicle",
    ]
    # "O" first, then a B-/I- pair per type; list order fixes the label ids.
    ner_names = ["O"] + [f"{prefix}-{etype}" for etype in entity_types for prefix in ("B", "I")]
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(
            {
                "id": datasets.Value("string"),
                "tokens": datasets.Sequence(datasets.Value("string")),
                "ner_tags": datasets.Sequence(datasets.features.ClassLabel(names=ner_names)),
            }
        ),
        supervised_keys=None,
        # Homepage of the dataset for documentation
        homepage=_HOMEPAGE,
        # License for the dataset if available
        license=_LICENSE,
        # Citation for the dataset
        citation=_CITATION,
    )
def _info(self):
    """Build the feature schema for the selected CLUE task.

    The single/pair classification tasks share a generic text+label
    layout; the remaining tasks each declare a bespoke schema.

    Raises:
        NotImplementedError: for configuration names not covered below.
    """
    if self.config.name in ["afqmc", "tnews", "iflytek", "cmnli", "diagnostics", "ocnli"]:
        # Generic classification: config-declared text columns + label.
        features = {
            text_feature: datasets.Value("string")
            for text_feature in self.config.text_features.keys()
        }
        # Configs without label_classes are treated as regression targets.
        if self.config.label_classes:
            features["label"] = datasets.features.ClassLabel(names=self.config.label_classes)
        else:
            features["label"] = datasets.Value("float32")
        features["idx"] = datasets.Value("int32")
    elif self.config.name == "cluewsc2020":
        # Winograd-style coreference: two marked spans in one sentence.
        features = {
            "idx": datasets.Value("int32"),
            "text": datasets.Value("string"),
            "label": datasets.features.ClassLabel(names=["true", "false"]),
            "target": {
                "span1_text": datasets.Value("string"),
                "span2_text": datasets.Value("string"),
                "span1_index": datasets.Value("int32"),
                "span2_index": datasets.Value("int32"),
            },
        }
    elif self.config.name == "csl":
        # Keyword/abstract matching.
        features = {
            "idx": datasets.Value("int32"),
            "corpus_id": datasets.Value("int32"),
            "abst": datasets.Value("string"),
            "label": datasets.features.ClassLabel(names=self.config.label_classes),
            "keyword": datasets.Sequence(datasets.Value("string")),
        }
    elif self.config.name in ["cmrc2018", "drcd"]:
        # SQuAD-style extractive question answering.
        features = {
            "id": datasets.Value("string"),
            "context": datasets.Value("string"),
            "question": datasets.Value("string"),
            "answers": datasets.Sequence(
                {
                    "text": datasets.Value("string"),
                    "answer_start": datasets.Value("int32"),
                }
            ),
        }
    elif self.config.name == "chid":
        # Idiom cloze: choose candidate idioms for blanks in the content.
        features = {
            "idx": datasets.Value("int32"),
            "candidates": datasets.Sequence(datasets.Value("string")),
            "content": datasets.Sequence(datasets.Value("string")),
            "answers": datasets.features.Sequence(
                {
                    "text": datasets.Value("string"),
                    "candidate_id": datasets.Value("int32"),
                }
            ),
        }
    elif self.config.name == "c3":
        # Multiple-choice reading comprehension.
        features = {
            "id": datasets.Value("int32"),
            "context": datasets.Sequence(datasets.Value("string")),
            "question": datasets.Value("string"),
            "choice": datasets.Sequence(datasets.Value("string")),
            "answer": datasets.Value("string"),
        }
    else:
        raise NotImplementedError(
            "This task is not implemented. If you believe"
            " this task was recently added to the CLUE benchmark, "
            "please open a GitHub issue and we will add it.")
    return datasets.DatasetInfo(
        description=_CLUE_DESCRIPTION,
        features=datasets.Features(features),
        homepage=self.config.url,
        citation=self.config.citation + "\n" + _CLUE_CITATION,
    )
from processing_image import Preprocess from utils import Config """ USAGE: ``python extracting_data.py -i <img_dir> -o <dataset_file>.datasets <batch_size>`` """ TEST = False CONFIG = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned") DEFAULT_SCHEMA = datasets.Features( OrderedDict( { "attr_ids": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), "attr_probs": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), "boxes": datasets.Array2D((CONFIG.MAX_DETECTIONS, 4), dtype="float32"), "img_id": datasets.Value("int32"), "obj_ids": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), "obj_probs": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), "roi_features": datasets.Array2D((CONFIG.MAX_DETECTIONS, 2048), dtype="float32"), "sizes": datasets.Sequence(length=2, feature=datasets.Value("float32")), "preds_per_image": datasets.Value(dtype="int32"), } ) ) class Extract: def __init__(self, argv=sys.argv[1:]):
def _info(self):
    """Schema: tokens with UPOS tags, Universal Dependencies relations
    (base set plus language-specific subtypes) and a per-token language
    label drawn from the configured test paths."""
    upos_names = [
        "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
        "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X",
    ]
    base_dep_names = [
        "acl", "acl:relcl", "advcl", "advcl:cleft", "advmod", "advmod:emph",
        "advmod:lmod", "amod", "appos", "aux", "aux:aspect", "aux:pass",
        "aux:q", "aux:tense", "case", "case:dec", "case:pref", "case:suff",
        "cc", "cc:preconj", "ccomp", "ccomp:agent", "ccomp:obj", "clf",
        "compound", "compound:lvc", "compound:prt", "compound:redup",
        "compound:svc", "conj", "cop", "csubj", "csubj:cop", "csubj:pass",
        "dep", "dep:comp", "det", "det:numgov", "det:nummod", "det:poss",
        "discourse", "dislocated", "det:predet", "expl", "expl:impers",
        "expl:pass", "expl:pv", "expl:subj", "fixed", "flat", "flat:foreign",
        "flat:name", "goeswith", "iobj", "list", "mark", "mark:advb",
        "mark:comp", "mark:relcl", "nmod", "nmod:comp", "nmod:part",
        "nmod:poss", "nmod:tmod", "nmod:npmod", "nsubj", "nsubj:cop",
        "nsubj:pass", "nummod", "nummod:gov", "obj", "obj:lvc", "obl",
        "obl:agent", "obl:arg", "obl:lmod", "obl:mod", "obl:loc",
        "obl:tmod", "obl:npmod", "obl:patient", "orphan", "parataxis",
        "punct", "reparandum", "root", "vocative", "xcomp", "xcomp:obj",
        "xcomp:obl",
    ]
    extra_dep_names = [
        "acl:appos", "acl:inf", "acl:part", "advcl:arg", "advcl:cond",
        "advmod:cc", "amod:advmod", "aux:caus", "aux:neg", "case:voc",
        "ccomp:obl", "ccomp:pred", "compound:conjv", "compound:nv",
        "compound:plur", "conj:expl", "csubj:cleft", "iobj:agent",
        "iobj:loc", "mark:prt", "nmod:advmod", "nmod:appos", "nsubj:caus",
        "nsubj:nc", "obj:agent", "obl:abl", "obl:ben", "obl:cmpr",
        "obl:inst", "obl:pmod", "obl:prep", "obl:soc", "xcomp:adj",
        "xcomp:pred",
    ]
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(
            {
                "id": datasets.Value("string"),
                "tokens": datasets.Sequence(datasets.Value("string")),
                "pos_tags": datasets.Sequence(
                    datasets.features.ClassLabel(names=upos_names)
                ),
                # Base relations followed by the extra subtypes; the
                # concatenation order fixes each label's integer id.
                "dependency_tags": datasets.Sequence(
                    datasets.features.ClassLabel(names=base_dep_names + extra_dep_names)
                ),
                # One language label per token; iterating the dict yields
                # the same key order as .keys() in the original.
                "lang": datasets.Sequence(
                    datasets.features.ClassLabel(names=list(testing_path))
                ),
            }
        ),
        supervised_keys=None,
        homepage="https://www.aclweb.org/anthology/W03-0419/",
        citation=_CITATION,
    )
def _info(self):
    """Build the DatasetInfo for the selected AMI configuration.

    All configurations share the word/segment alignment features below;
    the audio/file features and the description suffix depend on which
    headset/microphone configuration was selected.

    Raises:
        ValueError: if ``self.config.name`` is not a known configuration.
    """
    # Word- and segment-level transcription alignments, common to all configs.
    features_dict = {
        "word_ids": datasets.Sequence(datasets.Value("string")),
        "word_start_times": datasets.Sequence(datasets.Value("float")),
        "word_end_times": datasets.Sequence(datasets.Value("float")),
        "word_speakers": datasets.Sequence(datasets.Value("string")),
        "segment_ids": datasets.Sequence(datasets.Value("string")),
        "segment_start_times": datasets.Sequence(datasets.Value("float")),
        "segment_end_times": datasets.Sequence(datasets.Value("float")),
        "segment_speakers": datasets.Sequence(datasets.Value("string")),
        "words": datasets.Sequence(datasets.Value("string")),
        "channels": datasets.Sequence(datasets.Value("string")),
    }
    if self.config.name == "headset-single":
        # One path column plus one decoded-audio column.
        features_dict.update({"file": datasets.Value("string")})
        features_dict.update(
            {"audio": datasets.features.Audio(sampling_rate=16_000)})
        config_description = (
            "Close talking audio of single headset. "
            "This configuration only includes audio belonging to the "
            "headset of the person currently speaking.")
    elif self.config.name == "microphone-single":
        features_dict.update({"file": datasets.Value("string")})
        features_dict.update(
            {"audio": datasets.features.Audio(sampling_rate=16_000)})
        config_description = (
            "Far field audio of single microphone. "
            "This configuration only includes audio belonging the first microphone, "
            "*i.e.* 1-1, of the microphone array.")
    elif self.config.name == "headset-multi":
        features_dict.update(
            {f"file-{i}": datasets.Value("string") for i in range(4)})
        # NOTE(review): this second update reuses the same "file-{i}" keys and
        # therefore overwrites the string features added just above, leaving
        # only Audio features — unlike the single-* configs, which keep
        # separate "file" and "audio" keys. Looks unintentional; confirm
        # whether these should be distinct keys (e.g. "audio-{i}").
        features_dict.update({
            f"file-{i}": datasets.features.Audio(sampling_rate=16_000)
            for i in range(4)
        })
        config_description = (
            "Close talking audio of four individual headset. "
            "This configuration includes audio belonging to four individual headsets."
            " For each annotation there are 4 audio files 0, 1, 2, 3.")
    elif self.config.name == "microphone-multi":
        features_dict.update(
            {f"file-1-{i}": datasets.Value("string") for i in range(1, 8)})
        # NOTE(review): same key-overwrite pattern as "headset-multi" above —
        # the Audio features replace the string path features; verify intent.
        features_dict.update({
            f"file-1-{i}": datasets.features.Audio(sampling_rate=16_000)
            for i in range(1, 8)
        })
        config_description = (
            "Far field audio of microphone array. "
            "This configuration includes audio of "
            "the first microphone array 1-1, 1-2, ..., 1-8.")
    else:
        raise ValueError(
            f"Configuration {self.config.name} does not exist.")
    return datasets.DatasetInfo(
        description=_DESCRIPTION + config_description,
        features=datasets.Features(features_dict),
        homepage=_URL,
        citation=_CITATION,
    )
def _info(self):
    """Schema for MultiWOZ 2.2 dialogues: per-turn utterances with
    frame-level dialogue state, span annotations and dialogue acts."""
    # Character-span slot annotations inside an utterance.
    slot_spans = datasets.Sequence(
        {
            "slot": datasets.Value("string"),
            "value": datasets.Value("string"),
            "start": datasets.Value("int32"),
            "exclusive_end": datasets.Value("int32"),
            "copy_from": datasets.Value("string"),
            "copy_from_value": datasets.Sequence(datasets.Value("string")),
        }
    )
    # Cumulative dialogue state tracked per service frame.
    frame_state = {
        "active_intent": datasets.Value("string"),
        "requested_slots": datasets.Sequence(datasets.Value("string")),
        "slots_values": datasets.Sequence(
            {
                "slots_values_name": datasets.Value("string"),
                "slots_values_list": datasets.Sequence(datasets.Value("string")),
            }
        ),
    }
    frames = datasets.Sequence(
        {
            "service": datasets.Value("string"),
            "state": frame_state,
            "slots": slot_spans,
        }
    )
    dialogue_acts = datasets.Features(
        {
            "dialog_act": datasets.Sequence(
                {
                    "act_type": datasets.Value("string"),
                    "act_slots": datasets.Sequence(
                        datasets.Features(
                            {
                                "slot_name": datasets.Value("string"),
                                "slot_value": datasets.Value("string"),
                            }
                        ),
                    ),
                }
            ),
            "span_info": datasets.Sequence(
                {
                    "act_type": datasets.Value("string"),
                    "act_slot_name": datasets.Value("string"),
                    "act_slot_value": datasets.Value("string"),
                    "span_start": datasets.Value("int32"),
                    "span_end": datasets.Value("int32"),
                }
            ),
        }
    )
    features = datasets.Features(
        {
            "dialogue_id": datasets.Value("string"),
            "services": datasets.Sequence(datasets.Value("string")),
            "turns": datasets.Sequence(
                {
                    "turn_id": datasets.Value("string"),
                    "speaker": datasets.ClassLabel(names=["USER", "SYSTEM"]),
                    "utterance": datasets.Value("string"),
                    "frames": frames,
                    "dialogue_acts": dialogue_acts,
                }
            ),
        }
    )
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=None,
        homepage="https://github.com/budzianowski/multiwoz/tree/master/data/MultiWOZ_2.2",
        license=_LICENSE,
        citation=_CITATION,
    )