def test_flatten_data(self):
    raw_data = pandas.read_csv(join(TEST_DATA_PATH, "raw_test_data.csv.xz"),
                               index_col=0, keep_default_na=False)
    assert_frame_equal(flatten_df_by_column(raw_data, Columns.Split, Columns.Token, str.split),
                       self.flat_data)
    assert_frame_equal(flatten_df_by_column(self.custom_data, Columns.Split, Columns.Token,
                                            str.split), self.flat_custom_data)
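For context, here is a minimal sketch of what flatten_df_by_column is assumed to do in these examples: apply a function to one column and repeat each row once per resulting token. The name flatten_df_by_column_sketch and the index reset are assumptions for illustration, not the library's actual implementation.

import pandas


def flatten_df_by_column_sketch(df, column, new_column, apply_function):
    """Hypothetical sketch: repeat each row once per token from apply_function."""
    rows = []
    for _, row in df.iterrows():
        for token in apply_function(row[column]):
            new_row = row.copy()
            new_row[new_column] = token  # one output row per token
            rows.append(new_row)
    # Resetting the index is an assumption; the real function may keep it.
    return pandas.DataFrame(rows).reset_index(drop=True)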
Example #2
    def check_identifiers(
        self,
        identifiers: List[str],
    ) -> Dict[int, Dict[str, List[Candidate]]]:
        """
        Check tokens from identifiers for typos.

        :param identifiers: List of identifiers to check.
        :return: Dictionary of corrections, grouped by the index of the corresponding \
                 identifier in `identifiers` and by the typoed tokens that have \
                 correction suggestions.
        """
        identifiers_positions = defaultdict(list)
        for i, identifier in enumerate(identifiers):
            identifiers_positions[identifier].append(i)
        unique_identifiers = sorted(identifiers_positions.keys())
        df = pandas.DataFrame(columns=[IDENTIFIER_INDEX_COLUMN, Columns.Split])
        df[IDENTIFIER_INDEX_COLUMN] = range(len(unique_identifiers))
        df[Columns.Split] = [
            " ".join(self.parser.split(identifier))
            for identifier in unique_identifiers
        ]
        df = flatten_df_by_column(df, Columns.Split, Columns.Token, str.split)
        df = df[df[Columns.Token].str.len() >= self.config["min_token_length"]]
        suggestions = self.corrector.suggest(
            df, n_candidates=self.config["n_candidates"], return_all=False)
        suggestions = self.filter_suggestions(df, suggestions)
        grouped_suggestions = defaultdict(dict)
        for index, row in df.iterrows():
            if index in suggestions.keys():
                for pos in identifiers_positions[unique_identifiers[
                        row[IDENTIFIER_INDEX_COLUMN]]]:
                    grouped_suggestions[pos][row[Columns.Token]] = \
                        suggestions[index]
        return grouped_suggestions
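A hedged usage sketch for this version of check_identifiers. The variable checker and the example identifiers are hypothetical; the point is the shape of the result, which maps positions in the input list to typoed tokens and their candidate corrections.

# Hypothetical usage; `checker` is an instance of the class defining
# check_identifiers above, and the identifiers are made-up examples.
identifiers = ["get_valeu", "readFile", "get_valeu"]
corrections = checker.check_identifiers(identifiers)
# Expected shape: {position_in_identifiers: {typoed_token: [Candidate, ...]}}.
# Duplicates are deduplicated before checking, then every original position
# (here 0 and 2) receives the same corrections.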
Example #3
    def check_identifiers(
        self,
        identifiers: List[str],
    ) -> Dict[int, Dict[str, List[Tuple[str, float]]]]:
        """
        Check tokens from identifiers for typos.

        :param identifiers: List of identifiers to check.
        :return: Dictionary of corrections, grouped by the index of the corresponding \
                 identifier in `identifiers` and by the typoed tokens that have \
                 correction suggestions.
        """
        df = pandas.DataFrame(columns=[self.INDEX_COLUMN, Columns.Split])
        df[self.INDEX_COLUMN] = range(len(identifiers))
        df[Columns.Split] = [
            " ".join(self.parser.split(i)) for i in identifiers
        ]
        df = flatten_df_by_column(df, Columns.Split, Columns.Token, str.split)
        suggestions = self.model.suggest(df,
                                         n_candidates=self.n_candidates,
                                         return_all=False)
        suggestions = self.filter_suggestions(df, suggestions)
        grouped_suggestions = defaultdict(dict)
        for index, row in df.iterrows():
            if index in suggestions.keys():
                grouped_suggestions[row[self.INDEX_COLUMN]][row[Columns.Token]] =\
                    suggestions[index]
        return grouped_suggestions
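A hedged sketch of consuming this variant's output, where each suggestion is a (correction, confidence) tuple; the variable names are illustrative, and the ordering of candidates is assumed to be best-first.

# Hypothetical post-processing of the value returned by check_identifiers above.
for identifier_index, token_suggestions in grouped_suggestions.items():
    for token, candidates in token_suggestions.items():
        # Assuming candidates are ordered best-first by the model.
        best_correction, confidence = candidates[0]
        print(identifier_index, token, "->", best_correction, confidence)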
Example #4
def prepare_data(
        params: Optional[Mapping[str, Any]] = None) -> pandas.DataFrame:
    """
    Generate all the necessary data from the raw dataset of split identifiers.

    Brief algorithm description:
    1. Derive a vocabulary for typo correction: a set of tokens that are considered
       correctly spelled. All typo corrections will belong to this vocabulary.
       It consists of the most frequent tokens (based on the given statistics).
    2. Save the vocabulary and the statistics for a given number of the most frequent
       tokens for future use.
    3. Filter the raw data, keeping only identifiers that consist of vocabulary tokens.
       The result is a dataset of tokens that are considered correct. It will be used
       for creating artificial misspelling cases for training and testing the corrector model.
    4. Save the prepared dataset, if needed.

    :param params: Dictionary with parameters for data preparation. Used fields are:
                   data_dir: Directory to put all derived data to.
                   dataset_url: URL to download the raw dataset from if input_path is \
                                missing or does not exist.
                   input_path: Path to a .csv dump of the input dataframe. It should contain \
                               the column Columns.Split. If None or the file doesn't exist, \
                               the dataset will be downloaded from dataset_url.
                   frequency_column: Name of the column with identifier frequencies. If not \
                                     specified, every split is considered to have frequency 1.
                   vocabulary_size: Number of most frequent tokens to take as the vocabulary.
                   frequencies_size: Number of most frequent tokens to save frequencies info \
                                     for. This information will be used by the corrector as \
                                     features for these tokens when they are checked. If not \
                                     specified, frequencies for all present tokens will be saved.
                   raw_data_filename: Name of the .csv file in data_dir to put the raw dataset \
                                      into in case it is downloaded.
                   vocabulary_filename: Name of the .csv file in data_dir to save the vocabulary to.
                   frequencies_filename: Name of the .csv file in data_dir to save the frequencies to.
    :return: Dataset baked for training the typos correction.
    """
    if params is None:
        params = deepcopy(defaults_for_preparation)
    else:
        params = merge_dicts(defaults_for_preparation, params)

    raw_data_path = params["input_path"]
    if raw_data_path is None or not os.path.exists(raw_data_path):
        raw_data_path = os.path.join(params["data_dir"],
                                     params["raw_data_filename"])
        _download_url(params["dataset_url"], raw_data_path)

    data = pandas.read_csv(raw_data_path, index_col=0)
    if params["frequency_column"] not in data.columns:
        data[Columns.Frequency] = 1
    else:
        data = data.rename(
            columns={params["frequency_column"]: Columns.Frequency})

    # Expand dataframe by splits (repeat rows for every token in splits)
    data[Columns.Split] = data[Columns.Split].astype(str)
    flat_data = flatten_df_by_column(data,
                                     Columns.Split,
                                     Columns.Token,
                                     apply_function=lambda x: x.split())

    # Collect statistics for tokens
    stats = flat_data[[Columns.Frequency,
                       Columns.Token]].groupby([Columns.Token]).sum()
    stats = stats.sort_values(by=Columns.Frequency, ascending=False)

    # Derive new vocabulary for future use
    frequencies_tokens = set(
        stats.index[:(params["frequencies_size"] or len(stats))])
    vocabulary_tokens = set(stats.index[:params["vocabulary_size"]])
    print_frequencies(
        vocabulary_tokens, stats,
        os.path.join(params["data_dir"], params["vocabulary_filename"]))
    print_frequencies(
        frequencies_tokens, stats,
        os.path.join(params["data_dir"], params["frequencies_filename"]))

    # Leave only splits that contain tokens from vocabulary
    prepared_data = filter_splits(flat_data, vocabulary_tokens)[[
        Columns.Frequency, Columns.Split, Columns.Token
    ]]
    return prepared_data
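For context, a minimal sketch of what filter_splits is assumed to do here, based on the docstring ("keeping only identifiers that consist of vocabulary tokens"); the name filter_splits_sketch marks it as an assumption rather than the real implementation.

def filter_splits_sketch(flat_data, vocabulary_tokens):
    """Hypothetical sketch: keep rows whose whole split is in the vocabulary."""
    keep = flat_data[Columns.Split].apply(
        lambda split: all(token in vocabulary_tokens for token in split.split()))
    return flat_data[keep]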
Example #5
def prepare_data(
        config: Optional[Mapping[str, Any]] = None) -> pandas.DataFrame:
    """
    Generate all the necessary data from the raw dataset of split identifiers.

    Brief algorithm description:
    1. Derive a vocabulary for typo correction: a set of tokens that are considered
       correctly spelled. All typo corrections will belong to this vocabulary.
       It consists of the most frequent tokens (based on the given statistics).
    2. Save the vocabulary and the statistics for a given number of the most frequent
       tokens for future use.
    3. Filter the raw data, keeping only identifiers that consist of vocabulary tokens.
       The result is a dataset of tokens that are considered correct. It will be used
       for creating artificial misspelling cases for training and testing the corrector model.
    4. Save the prepared dataset, if needed.

    :param config: Dictionary with parameters for data preparation. Used fields are:
                   data_dir: Directory to put all derived data to.
                   dataset_url: URL to download the raw dataset from if input_path is \
                                missing or does not exist.
                   input_path: Path to a .csv dump of the input dataframe. It should contain \
                               the column Columns.Split. If None or the file doesn't exist, \
                               the dataset will be downloaded from dataset_url.
                   frequency_column: Name of the column with identifier frequencies. If not \
                                     specified, every split is considered to have frequency 1.
                   vocabulary_size: Number of most frequent tokens to take as the vocabulary.
                   frequencies_size: Number of most frequent tokens to save frequencies info \
                                     for. This information will be used by the corrector as \
                                     features for these tokens when they are checked. If not \
                                     specified, frequencies for all present tokens will be saved.
                   raw_data_filename: Name of the .csv file in data_dir to put the raw dataset \
                                      into in case it is downloaded.
                   vocabulary_filename: Name of the .csv file in data_dir to save the vocabulary to.
                   frequencies_filename: Name of the .csv file in data_dir to save the frequencies to.
                   prepared_filename: Name of the .csv file in data_dir to save the prepared \
                                      dataset to.
    :return: Dataset baked for training the typos correction.
    """
    log = logging.getLogger("prepare_data")
    if config is None:
        config = {}
    config = merge_dicts(DEFAULT_CORRECTOR_CONFIG["preparation"], config)

    os.makedirs(config["data_dir"], exist_ok=True)
    raw_data_path = config["input_path"]
    if raw_data_path is None or not os.path.exists(raw_data_path):
        raw_data_path = os.path.join(config["data_dir"],
                                     config["raw_data_filename"])
        log.warning("raw dataset was not found, downloading from %s to %s",
                    config["dataset_url"], raw_data_path)
        _download_url(config["dataset_url"], raw_data_path)

    data = pandas.read_csv(raw_data_path, index_col=0, keep_default_na=False)
    log.debug("raw dataset shape: %s", data.shape)
    if config["frequency_column"] not in data.columns:
        log.info("frequency column is not found. Set all frequencies to 1")
        data[Columns.Frequency] = 1
    else:
        log.info("frequency column `%s` is found", config["frequency_column"])
        data = data.rename(
            columns={config["frequency_column"]: Columns.Frequency})

    # Expand dataframe by splits (repeat rows for every token in splits)
    data[Columns.Split] = data[Columns.Split].astype(str)
    log.debug("expand data by splits")
    flat_data = flatten_df_by_column(data,
                                     Columns.Split,
                                     Columns.Token,
                                     apply_function=lambda x: x.split())
    log.debug("expanded data shape %s", flat_data.shape)

    log.info("collect statistics for tokens")
    stats = flat_data[[Columns.Frequency,
                       Columns.Token]].groupby([Columns.Token]).sum()
    stats = stats.sort_values(by=Columns.Frequency,
                              ascending=False)[Columns.Frequency]

    log.info("derive the new vocabulary")
    frequencies = stats.iloc[:(
        config["frequencies_size"] or len(stats))].to_dict()
    log.info("tokens with frequencies data size: %d", len(frequencies))
    vocabulary = stats.iloc[:config["vocabulary_size"]].to_dict()
    log.info("vocabulary size: %d", len(vocabulary))
    vocabulary_filepath = os.path.join(config["data_dir"],
                                       config["vocabulary_filename"])
    print_frequencies(vocabulary, vocabulary_filepath)
    log.info("vocabulary saved to %s", vocabulary_filepath)
    frequencies_filepath = os.path.join(config["data_dir"],
                                        config["frequencies_filename"])
    print_frequencies(frequencies, frequencies_filepath)
    log.info("tokens with frequencies data are saved to %s",
             frequencies_filepath)

    # Leave only splits that contain tokens from vocabulary
    prepared_data = filter_splits(flat_data, set(
        vocabulary.keys()))[[Columns.Frequency, Columns.Split, Columns.Token]]
    prepared_data.reset_index(drop=True, inplace=True)
    log.info("final dataset shape: %s", prepared_data.shape)
    if config["prepared_filename"] is not None:
        prepared_data_filepath = os.path.join(config["data_dir"],
                                              config["prepared_filename"])
        prepared_data.to_csv(prepared_data_filepath)
        log.info("final dataset is saved to %s", prepared_data_filepath)
    return prepared_data
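A hedged usage sketch for prepare_data; the directory, file names and sizes are illustrative only, and every key comes from the config fields documented above.

# Hypothetical call; paths and sizes are made up for illustration.
prepared = prepare_data({
    "data_dir": "typos_data",              # all derived files go here
    "input_path": "typos_data/raw.csv",    # downloaded from dataset_url if missing
    "frequency_column": "num_occurrences",  # renamed to Columns.Frequency
    "vocabulary_size": 10000,
    "frequencies_size": 100000,
    "prepared_filename": "prepared.csv",
})
# `prepared` keeps only Columns.Frequency, Columns.Split and Columns.Token.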