Example #1
    def train_data(self) -> pd.DataFrame:
        """

        Returns
        -------

        """
        if self.train_data_ is None:
            logger.info('Loading training dataset')
            self.train_data_ = pd.read_csv(self.train_path, usecols=self.cols,
                                           nrows=TEST_ROWS)

            self.train_data_.dropna(inplace=True)
            self.train_data_['name'] = self.train_data_['name'].str.lower()

            self.train_data_['alternate_names'] = self.train_data_[
                'alternate_names'].str.lower()

            self.train_data_['n_alternate_names'] = self.train_data_[
                'alternate_names'].str.split(',').apply(len)

            self.train_data_['len_name'] = self.train_data_['name'].str.len()

            # keep only toponyms whose name has more than 2 and at most
            # "max_chars" characters.
            self.train_data_ = self.train_data_[
                (self.train_data_['len_name'] <= self.max_chars)
                & (self.train_data_['len_name'] > 2)].reset_index(drop=True)

        return self.train_data_
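
The method above uses a lazy-loading pattern: the backing attribute (train_data_) starts as None and is only populated on first access. Below is a minimal, generic sketch of the same idea, assuming the method is exposed as a property; the class name is made up for illustration.

import pandas as pd

class LazyFrame:
    """Minimal illustration of the lazy-loading pattern used above."""

    def __init__(self, path: str):
        self.path = path
        self._df = None                    # nothing loaded yet

    @property
    def df(self) -> pd.DataFrame:
        if self._df is None:               # load only on the first access
            self._df = pd.read_csv(self.path)
        return self._df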
Example #2
    def __init__(self,
                 train_fname: str,
                 val_fname: str,
                 verbose: int = 0,
                 max_chars: int = 32,
                 save_tokenizer: bool = True,
                 tokenizer_params: Optional[dict] = None,
                 train_sampler_params: Optional[dict] = None,
                 val_sampler_params: Optional[dict] = None):
        """

        Parameters
        ----------
        train_fname
        val_fname
        verbose
        max_chars
        save_tokenizer
        tokenizer_params
        train_sampler_params
        val_sampler_params
        """
        self.train_path = os.path.join(DirConf.DATA_DIR, train_fname)
        self.val_path = os.path.join(DirConf.DATA_DIR, val_fname)

        assert os.path.exists(self.train_path)
        assert os.path.exists(self.val_path)

        self.verbose = verbose
        self.max_chars = max_chars

        self.save_tokenizer = save_tokenizer

        self.cols = ['name', 'alternate_names']

        self.train_data_: Optional[pd.DataFrame] = None
        self.val_data_: Optional[pd.DataFrame] = None

        self.tokenizer_params = {'name': 'ngram_tokenizer', 'maxlen': 30,
                                 'filters': '', 'lower': True, 'split': ' ',
                                 'char_level': False, 'num_words': 20_000,
                                 'oov_token': '<OOV>'}

        if tokenizer_params:
            self.tokenizer_params.update(tokenizer_params)

        logger.info('Loading tokenizer')
        TokenizerClass = getattr(tokenizers,
                                 underscore_to_camel(
                                     self.tokenizer_params.pop('name')))
        self.tokenizer = TokenizerClass(**self.tokenizer_params)

        self.train_sampler_params = train_sampler_params
        self.val_sampler_params = val_sampler_params

        self.train_sampler = None
        self.val_sampler = None
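
The constructor resolves the tokenizer class dynamically: the 'name' entry is popped from the parameter dict, converted from snake_case to CamelCase and looked up in the tokenizers module. The underscore_to_camel helper is not included in these examples; the snippet below is a hypothetical re-implementation of that mapping, for illustration only.

# Hypothetical re-implementation of the snake_case -> CamelCase resolution;
# the real underscore_to_camel helper is not part of these examples.
def underscore_to_camel(name: str) -> str:
    return ''.join(part.capitalize() for part in name.split('_'))

params = {'name': 'ngram_tokenizer', 'maxlen': 30, 'num_words': 20_000}
class_name = underscore_to_camel(params.pop('name'))  # -> 'NgramTokenizer'
# TokenizerClass = getattr(tokenizers, class_name)
# tokenizer = TokenizerClass(**params)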
Example #3
    def tokenize_external_data(self):
        """

        Returns
        -------

        """
        logger.info('Creating Sequences for External Test dataset')
        #  ========== Procedure for the Test Set ===================

        for col_name in self.external_cols:
            if col_name == 'target':
                # obviously, we don't want to tokenize the target
                continue

            logger.info(f'Creating N-grams for column name: "{col_name}".')

            self.data[f'{col_name}_ngrams'] = self.data[
                col_name].progress_apply(self.tokenizer.get_ngrams)

            logger.info(f'Converting column name: "{col_name}" to sequences')

            self.data[f'{col_name}_seq'] = self.data[
                f'{col_name}_ngrams'].progress_apply(
                lambda x: self.tokenizer.texts_to_sequences(texts=[x])[0])

            logger.info(f'Padding sequences for column name: "{col_name}".')
            self.data[f'{col_name}_seq'] = self.data[
                f'{col_name}_seq'].progress_apply(self.tokenizer.pad_single)

        self.data_ = self.data_[self.output_cols]

        return self.data_
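
The project's own tokenizer API (get_ngrams, texts_to_sequences, pad_single) is not included in these examples, but the overall text -> sequence -> padded-sequence workflow mirrors the standard Keras preprocessing utilities. For reference, a rough analogy using Keras follows; it is not the actual implementation and assumes tensorflow is installed.

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pretend each string is the space-joined n-gram representation of a name.
texts = ['ath the hen ens', 'the hes ess ssa sal alo lon oni nik iki']

tok = Tokenizer(num_words=20_000, oov_token='<OOV>', filters='', lower=True)
tok.fit_on_texts(texts)

seqs = tok.texts_to_sequences(texts)              # lists of integer ids
padded = pad_sequences(seqs, maxlen=30, padding='post')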
Example #4
    def save(self):
        """

        :return:
        """
        self.name = self.build_outfile()

        tokenizer_json = self.to_json()

        path = os.path.join(DirConf.MODELS_DIR, self.name)

        exp_logger.info(f'Storing Unigram Tokenizer at: {path}')

        with io.open(path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(tokenizer_json, ensure_ascii=True))
Example #5
    def __init__(self,
                 fname: str,
                 tokenizer_fname: str,
                 use_external: bool = False,
                 verbose: int = 0,
                 max_chars: int = 32,
                 sampler_params: Optional[dict] = None,
                 ):
        """
        Parameters
        ----------
        fname : str
            The filename of the dataset that we want to prepare for model
            evaluation.
        tokenizer_fname : str
            The filename of the already-fitted tokenizer that we want to use
            for the conversion of the toponyms and the alternate_names to
            sequences.
        use_external : bool
            Whether the file is an external, tab-separated test set that
            already contains a 'target' column.
        verbose : int
            Verbosity level.
        max_chars : int
            Maximum allowed number of characters for a toponym name.
        sampler_params : dict, optional
            Parameters for the evaluation sampler.
        """
        self.path = os.path.join(DirConf.DATA_DIR, fname)

        assert os.path.exists(self.path)

        self.verbose = verbose
        self.max_chars = max_chars

        self.cols = ['name', 'alternate_names']

        self.data_: Optional[pd.DataFrame] = None

        logger.info(f'Loading fitted tokenizer: {tokenizer_fname}')
        self.tokenizer = tokenizers.load_tokenizer(name=tokenizer_fname)

        self.test_sampler = None

        self.use_external = use_external
        self.sampler_params = sampler_params

        self.external_cols = ['name', 'alternate_name', 'target']

        self.output_cols = ['name', 'alternate_name', 'name_seq',
                            'alternate_name_seq', 'target']
Example #6
    def filter_latin_related_records(self) -> pd.DataFrame:
        """

        Returns
        -------

        """
        exp_logger.info('Filtering Latin Names')
        self.data_ = self.data[self.data['name_alphabet'] == "LATIN"]
        exp_logger.info(
            f'Number of Records after filtering: {len(self.data_)}')

        # filter only the alternate names that are written in LATIN
        exp_logger.info('Filtering Latin Alternate Names')
        self.data['alt_names_seq'] = self.data.apply(
            lambda row: [
                n for n, ab in zip(row['alt_names_seq'],
                                   row['alternate_names_alphabet'])
                if ab == 'LATIN'
            ],
            axis=1)

        # replace the alternate_names with those that are only written in LATIN
        self.data['alternate_names'] = self.data['alt_names_seq'].apply(
            lambda l: ', '.join(l) if l else None)

        return self.data
Example #7
    def data(self) -> pd.DataFrame:
        """
        This method loads lazily the dataset that we will use for the model
        evaluation.
        Returns
        -------
        pd.DataFrame :
            A pandas dataframe that
        """
        if self.data_ is None:
            if self.use_external:
                logger.info('Loading external test dataset')
                self.data_ = pd.read_csv(self.path,
                                         usecols=self.external_cols,
                                         nrows=TEST_ROWS, sep='\t')

                self.data_['target'] = self.data_['target'].astype(int)
            else:
                logger.info('Loading test dataset')
                self.data_ = pd.read_csv(self.path, usecols=self.cols,
                                         nrows=TEST_ROWS)

            self.data_.dropna(inplace=True)

            logger.info(f'Dataset size: {len(self.data_)}')

            self.data_['name'] = self.data_['name'].str.lower()

            self.data_['len_name'] = self.data_['name'].str.len()

            # keep only toponyms whose name has more than 2 and at most
            # "max_chars" characters.
            self.data_ = self.data_[
                (self.data_['len_name'] <= self.max_chars)
                & (self.data_['len_name'] > 2)].reset_index(drop=True)

            if not self.use_external:
                # since we don't have a ready external dataset for testing
                #  we need to pre-process the raw dataset.
                self.data_['alternate_names'] = self.data_[
                    'alternate_names'].str.lower()

                self.data_['n_alternate_names'] = self.data_[
                    'alternate_names'].str.split(',').apply(len)

        return self.data_
Example #8
    def run(self):
        """

        Returns
        -------

        """
        # get the alternate names as a list for each record
        self.data['alt_names_seq'] = self.data['alternate_names'].apply(
            lambda x: x.split(',') if x else [])

        self.data['len_name'] = self.data['name'].apply(len)

        exp_logger.info(f'Keeping records with Name Length '
                        f'of at most {self.max_name_chars} characters')

        self.data_ = self.data[
            self.data['len_name'] <= self.max_name_chars].reset_index(
                drop=True)

        exp_logger.info('Detecting Alphabet for all Names')
        # detect the alphabet for the name
        self.data['name_alphabet'] = self.data['name'].progress_apply(
            self.detect_alphabet)

        exp_logger.info('Converting non frequent alphabets to "UND"')
        alphabet_counts = self.data['name_alphabet'].value_counts()
        non_frequent_alpha = {
            i: 'UND'
            for i in alphabet_counts[alphabet_counts.values < 10].index
        }
        self.data['name_alphabet'] = self.data['name_alphabet'].apply(
            lambda x: non_frequent_alpha.get(x, x))

        exp_logger.info('Detecting Alphabet for all Alternate Names')
        # get the alphabet for each alternate name
        self.data['alternate_names_alphabet'] = self.data[
            'alt_names_seq'].progress_apply(
                lambda l: [self.detect_alphabet(n) for n in l])

        if self.only_latin:
            self.filter_latin_related_records()  # filters self.data

        self.data['n_alt_names'] = self.data['alt_names_seq'].progress_apply(
            len)

        self.data['n_alt_gte'] = self.data['n_alt_names'] >= self.n_alternates

        if self.show_plots:
            exp_logger.info('Creating Plots')
            self.create_plots()

        datasets = self.split_records()

        if self.save_data:
            exp_logger.info('Saving Datasets')

            for data_type, data in datasets.items():
                exp_logger.info(f'Saving {data_type}')

                ab = 'latin' if self.only_latin else 'global'
                shuffle = 'stratified' if self.stratified_split else 'random'

                outfile = f'n_alternates_{self.n_alternates}+_{ab}_' \
                          f'{shuffle}_split_{data_type}.csv'.strip().lower()

                outfile = os.path.join(self.data_dir, outfile)

                data[self.basic_cols].to_csv(outfile,
                                             encoding='utf8',
                                             index=False)

        return datasets
Example #9
    def split_records(self) -> Dict[str, pd.DataFrame]:
        """

        Returns
        -------

        """
        data_size = len(self.data)
        test_size = int(self.test_size * data_size)
        val_size = int(self.val_size * data_size)

        if not self.stratified_split:

            exp_logger.info('Random Split into Train-Val and Test')
            X_train_val, X_test = train_test_split(self.data[self.basic_cols],
                                                   test_size=test_size,
                                                   shuffle=True,
                                                   random_state=2020,
                                                   stratify=None)

            exp_logger.info('Random Split into Train and Val')
            X_train, X_val = train_test_split(X_train_val,
                                              test_size=val_size,
                                              shuffle=True,
                                              random_state=2020,
                                              stratify=None)

        else:
            if self.only_latin:
                exp_logger.info('Using Name Length as stratification factor')
                stratify_column = self.data['len_name']
            else:
                exp_logger.info('Using Name Alphabet as stratification factor')
                stratify_column = self.data['name_alphabet']

            exp_logger.info('Stratified Split into Train-Val and Test')

            # y_train_val will be used for the stratification in the second
            # split.
            X_train_val, X_test, y_train_val, _ = train_test_split(
                self.data[self.basic_cols],
                stratify_column,
                test_size=test_size,
                shuffle=True,
                random_state=2020,
                stratify=stratify_column)

            exp_logger.info('Stratified Split into Train and Val')
            X_train, X_val = train_test_split(X_train_val,
                                              test_size=val_size,
                                              shuffle=True,
                                              random_state=2020,
                                              stratify=y_train_val)

        exp_logger.info(f'X_train-val size: {X_train_val.shape[0]}')
        exp_logger.info(f'X_train size: {X_train.shape[0]}')
        exp_logger.info(f'X_val size: {X_val.shape[0]}')
        exp_logger.info(f'X_test size: {X_test.shape[0]}')

        return dict(X_train_val=X_train_val,
                    X_train=X_train,
                    X_val=X_val,
                    X_test=X_test)
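
For reference, below is a self-contained sketch of the same two-stage stratified split pattern on toy data; the column names and sizes are made up, only the train_test_split mechanics match the method above.

import pandas as pd
from sklearn.model_selection import train_test_split

# Toy frame: 'alphabet' plays the role of the stratification factor.
df = pd.DataFrame({'name': [f'name_{i}' for i in range(100)],
                   'alphabet': ['LATIN'] * 80 + ['CYRILLIC'] * 20})
strat = df['alphabet']

# First split: hold out a test set, keeping the stratification labels of the
# remaining rows (y_train_val) so they can drive the second split.
X_train_val, X_test, y_train_val, _ = train_test_split(
    df, strat, test_size=0.2, shuffle=True, random_state=2020, stratify=strat)

# Second split: carve a validation set out of the train-val portion.
X_train, X_val = train_test_split(
    X_train_val, test_size=0.1, shuffle=True, random_state=2020,
    stratify=y_train_val)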
Example #10
    def tokenize_raw_data(self):
        """
        1) Creates n-grams from the toponyms
        2) Creates n-grams for each of the variations
        Returns
        -------

        """
        #  ========== Procedure for the Test Set ===================
        logger.info('Creating N-grams for test toponyms')

        self.data['toponym_ngrams'] = self.data['name'].progress_apply(
            self.tokenizer.get_ngrams)

        self.data['alternate_names'] = self.data['alternate_names'].str.split(
            ',')

        logger.info('Creating N-grams for test alternate-names')
        self.data['variations_ngrams'] = self.data[
            'alternate_names'].progress_apply(self.tokenizer.texts_to_ngrams)

        logger.info('Converting test toponyms to sequences')

        self.data['toponym_seqs'] = self.data['toponym_ngrams'].progress_apply(
            lambda x: self.tokenizer.texts_to_sequences(texts=[x])[0])

        logger.info('Padding test toponym sequences')
        self.data['toponym_seqs'] = self.data['toponym_seqs'].progress_apply(
            self.tokenizer.pad_single)

        logger.info('Converting test alternate-names to sequences')
        self.data['variations_seqs'] = self.data[
            'variations_ngrams'].progress_apply(
            self.tokenizer.texts_to_sequences)

        logger.info('Padding test alternate-names sequences')
        self.data['variations_seqs'] = self.data[
            'variations_seqs'].progress_apply(self.tokenizer.pad)

        if self.verbose > 0:
            logger.info(
                f'N-gram index length: {len(self.tokenizer.word_index)}')
            logger.info('\nExample Transformation')
            logger.info(self.data.loc[0])
            logger.info(self.data.loc[0]['variations_seqs'])
Example #11
    def tokenize_data(self):
        """
        1) Creates n-grams from the toponyms
        2) Creates n-grams for each of the variations
        Returns
        -------

        """

        logger.info('Creating N-grams for training toponyms')
        # convert each toponym to its n-gram representation
        self.train_data['toponym_ngrams'] = self.train_data[
            'name'].progress_apply(self.tokenizer.get_ngrams)

        # convert each variation of each toponym to its n-gram representation
        self.train_data['alternate_names'] = self.train_data[
            'alternate_names'].str.split(',')

        logger.info('Creating N-grams for training alternate-names')
        self.train_data['variations_ngrams'] = self.train_data[
            'alternate_names'].progress_apply(self.tokenizer.texts_to_ngrams)

        # collect (flatten out) all the n-grams (toponyms and variations);
        # these are needed in order to fit the tokenizer.
        all_train_names = list()
        for row in self.train_data['variations_ngrams']:
            all_train_names.extend(row)

        all_train_names += list(self.train_data['toponym_ngrams'])

        # fit the tokenizer on all the training n-grams; this builds the
        # vocabulary (word_index) used for the sequence conversion.
        logger.info('Fitting tokenizer to training-data')
        self.tokenizer.fit_on_texts(texts=all_train_names)

        # using the fitted tokenizer, convert the train toponyms to sequences
        logger.info('Converting training toponyms to sequences')
        self.train_data['toponym_seqs'] = self.train_data[
            'toponym_ngrams'].progress_apply(
            lambda x: self.tokenizer.texts_to_sequences(texts=[x])[0])

        logger.info('Padding training toponym sequences')
        # pad the sequences to the max length
        self.train_data['toponym_seqs'] = self.train_data[
            'toponym_seqs'].progress_apply(self.tokenizer.pad_single)

        # using the fitted tokenizer, convert the variations of each toponym
        # to sequences
        logger.info('Converting training alternate-names to sequences')
        self.train_data['variations_seqs'] = self.train_data[
            'variations_ngrams'].progress_apply(
            self.tokenizer.texts_to_sequences)

        logger.info('Padding training alternate-names sequences')
        self.train_data['variations_seqs'] = self.train_data[
            'variations_seqs'].progress_apply(self.tokenizer.pad)

        #  ========== Same Procedure for the Validation Set ===================
        logger.info('Creating N-grams for validation toponyms')

        self.val_data['toponym_ngrams'] = self.val_data['name'].progress_apply(
            self.tokenizer.get_ngrams)

        self.val_data['alternate_names'] = self.val_data[
            'alternate_names'].str.split(',')

        logger.info('Creating N-grams for validation alternate-names')
        self.val_data['variations_ngrams'] = self.val_data[
            'alternate_names'].progress_apply(self.tokenizer.texts_to_ngrams)

        logger.info('Converting validation toponyms to sequences')

        self.val_data['toponym_seqs'] = self.val_data[
            'toponym_ngrams'].progress_apply(
            lambda x: self.tokenizer.texts_to_sequences(texts=[x])[0])

        logger.info('Padding validation toponym sequences')
        self.val_data['toponym_seqs'] = self.val_data[
            'toponym_seqs'].progress_apply(
            self.tokenizer.pad_single)

        logger.info('Converting validation alternate-names to sequences')
        self.val_data['variations_seqs'] = self.val_data[
            'variations_ngrams'].progress_apply(
            self.tokenizer.texts_to_sequences)

        logger.info('Padding validation alternate-names sequences')
        self.val_data['variations_seqs'] = self.val_data[
            'variations_seqs'].progress_apply(self.tokenizer.pad)

        if self.verbose > 0:
            print(f'N-gram index length: {len(self.tokenizer.word_index)}')
            print('\nExample Transformation')
            print(self.val_data.loc[0])
            print(self.val_data.loc[0]['variations_seqs'])

        if self.save_tokenizer:
            print('Saving Tokenizer')
            self.tokenizer.save()
            print('Tokenizer saved')
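
Pieced together, the examples above suggest the following training-side flow; the class name and file names below are assumptions for illustration, not part of the original code.

# Hypothetical end-to-end usage, assuming the __init__ of Example #2 and the
# tokenize_data() of Example #11 belong to the same class.
prep = TrainDataPreparation(
    train_fname='train.csv',
    val_fname='val.csv',
    max_chars=32,
    tokenizer_params={'maxlen': 40, 'num_words': 30_000},  # override defaults
)

# Builds n-grams, fits the tokenizer and creates the padded sequences; with
# save_tokenizer=True (the default) the fitted tokenizer is also written to
# disk via the save() method shown in Example #4.
prep.tokenize_data()
train_df = prep.train_data      # assumed to be exposed as a lazy property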