def create_fields(
    header: List[str],
    to_lower: bool = False,
    sen_column: str = "sen",
    tokenize_columns: Optional[List[str]] = None,
    convert_numerical: bool = False,
    tokenizer: Optional[PreTrainedTokenizer] = None,
) -> List[Tuple[str, Field]]:
    """
    Creates one field per column of a corpus header.

    Columns listed in ``tokenize_columns`` get a ``Field`` (batch-first,
    with lengths included). Every other column gets a ``RawField`` that
    is explicitly marked as non-target.

    Parameters
    ----------
    header : List[str]
        Column names of the corpus, in column order.
    to_lower : bool, optional
        Passed as ``lower`` to the ``Field`` of each tokenized column.
        Defaults to False.
    sen_column : str, optional
        Column that is tokenized when ``tokenize_columns`` is not
        provided. Defaults to ``sen``.
    tokenize_columns : List[str], optional
        Names of the columns that get a ``Field``. If None or empty,
        ``[sen_column]`` is used instead.
    convert_numerical : bool, optional
        If True, the ``RawField`` columns get a preprocessing pipeline
        that turns strings consisting solely of decimal digits into
        ``int``; all other values are passed through unchanged.
        Defaults to False, in which case no preprocessing is attached.
    tokenizer : PreTrainedTokenizer, optional
        If provided, it is attached to every tokenized ``Field`` with
        ``attach_tokenizer``.

    Returns
    -------
    fields : List[Tuple[str, Field]]
        ``(column name, field)`` pairs, in the order of ``header``.
    """
    if not tokenize_columns:
        tokenize_columns = [sen_column]

    pipeline = None
    if convert_numerical:

        def preprocess_field(s: Union[str, int]) -> Union[str, int]:
            # `isdecimal` instead of `isdigit`: the latter is also True for
            # characters such as "²" that `int()` refuses, which would
            # raise a ValueError here instead of leaving the value as-is.
            if isinstance(s, str) and s.isdecimal():
                return int(s)
            return s

        pipeline = Pipeline(convert_token=preprocess_field)

    fields = []
    for column in header:
        if column in tokenize_columns:
            field = Field(batch_first=True, include_lengths=True, lower=to_lower)
            if tokenizer is not None:
                attach_tokenizer(field, tokenizer)
        else:
            field = RawField(preprocessing=pipeline)
            field.is_target = False

        fields.append((column, field))

    return fields
def import_corpus(
    path: str,
    header: Optional[List[str]] = None,
    header_from_first_line: bool = False,
    to_lower: bool = False,
    vocab_path: Optional[str] = None,
    vocab_from_corpus: bool = False,
    sen_column: str = "sen",
) -> TabularDataset:
    """
    Imports a tab-separated corpus file as a ``TabularDataset``.

    The column named ``sen_column`` holds the raw sentences and is
    tokenized. A column named ``labels`` is split on whitespace and each
    item is converted to ``int``. Every other column is kept as a raw
    value, where strings consisting solely of decimal digits are
    converted to ``int``.

    Parameters
    ----------
    path : str
        Path to corpus file
    header : List[str], optional
        Optional list of attribute names of each column. If not provided
        and ``header_from_first_line`` is False, the corpus is assumed
        to consist of a single column named ``sen_column``.
    header_from_first_line : bool, optional
        Use the first line of the corpus as the attribute names of the
        corpus. Only used to derive the header when ``header`` is not
        provided; the first line is skipped while reading the corpus
        whenever this flag is True.
    to_lower : bool, optional
        Transform the sentence column to lower case, defaults to False.
    vocab_path : str, optional
        Path to the model vocabulary, which should be a file containing
        a vocab entry at each line.
    vocab_from_corpus : bool, optional
        Create a new vocabulary from the tokens of the corpus itself.
        If set to True `vocab_path` does not need to be provided.
        Defaults to False.
    sen_column : str, optional
        Name of the corpus column containing the raw sentences.
        Defaults to `sen`.

    Returns
    -------
    corpus : TabularDataset
        A TabularDataset containing the parsed sentences and optional
        labels

    Raises
    ------
    AssertionError
        If ``sen_column`` is not part of the corpus header.
    """
    if header is None:
        if header_from_first_line:
            # NOTE(review): read as UTF-8 on the assumption that torchtext's
            # TabularDataset reads the file as UTF-8 too, so the header does
            # not depend on the locale encoding -- confirm against torchtext.
            with open(path, encoding="utf-8") as f:
                header = f.readline().strip().split("\t")
        else:
            # A headerless corpus has a single sentence column; name it after
            # `sen_column` so a non-default column name passes the check below.
            header = [sen_column]

    # Raised explicitly (rather than via `assert`) so the check is not
    # stripped under `python -O`; the exception type is kept for callers.
    if sen_column not in header:
        raise AssertionError(f"`{sen_column}` should be part of corpus_header!")

    def preprocess(s: str) -> Union[str, int]:
        # `isdecimal` instead of `isdigit`: the latter is also True for
        # characters such as "²" that `int()` refuses to parse.
        return int(s) if s.isdecimal() else s

    def parse_labels(s: str) -> List[int]:
        return [int(label) for label in s.split()]

    pipeline = Pipeline(convert_token=preprocess)

    fields = {}
    for field in header:
        if field == sen_column:
            fields[field] = Field(
                batch_first=True, include_lengths=True, lower=to_lower
            )
        elif field == "labels":
            fields[field] = Field(use_vocab=False, tokenize=parse_labels)
        else:
            fields[field] = RawField(preprocessing=pipeline)
            fields[field].is_target = False

    corpus = TabularDataset(
        fields=list(fields.items()),
        format="tsv",
        path=path,
        skip_header=header_from_first_line,
        # Disable quote handling so quote characters are read literally.
        csv_reader_params={"quotechar": None},
    )

    # The current torchtext Vocab does not allow a fixed vocab order
    if vocab_path is not None or vocab_from_corpus:
        attach_vocab(corpus, vocab_path or path, sen_column=sen_column)

    return corpus