Example #1
    def get_languages(self):
        """
        Find which languages are predicted for the WiLI dataset.

        Returns
        -------
        languages : list of str
            Each str is an ISO 639-3 code
        """
        languages = set()
        # Read data
        data = wili.load_data()
        logging.info("Finished loading data")
        for set_name in ['test', 'train']:
            x_set_name = 'x_{}'.format(set_name)
            bar = progressbar.ProgressBar(redirect_stdout=True,
                                          max_value=len(data[x_set_name]))
            for i, el in enumerate(data[x_set_name]):
                try:
                    predicted = self.predict(el)
                except Exception as e:
                    predicted = 'UNK'
                    logging.error({'message': 'Exception in get_languages',
                                   'error': e})
                languages.add(predicted)
                bar.update(i + 1)
            bar.finish()
        return sorted(list(languages))
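This method sits on a classifier class and relies on module-level imports that the snippet omits. A minimal sketch of the assumed context (the import path and the DummyClassifier name are illustrative, not lidtk's actual API):

import logging

import progressbar  # provided by the progressbar2 package

from lidtk.data import wili  # assumed import path for the WiLI loader


class DummyClassifier:
    """Hypothetical stand-in exposing the predict() interface used above."""

    def predict(self, text):
        # A real classifier would return an ISO 639-3 code for the text.
        return 'eng'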
Example #2
def main(start, end):
    """Run."""
    # Read data
    data = wili.load_data()
    logging.info("Finished loading data")

    lang_amounts = {}
    for paragraph, label in zip(data['x_train'], data['y_train']):
        if label not in lang_amounts:
            lang_amounts[label] = []
        chars_in_range = 0
        for char in paragraph:
            if start <= ord(char) <= end:
                chars_in_range += 1
        amount = float(chars_in_range) / len(paragraph)
        lang_amounts[label].append(amount)

    for key in lang_amounts.keys():
        lang_amounts[key] = np.array(lang_amounts[key]).mean() * 100

    print('Label    Chars in range [{} - {}]'.format(start, end))
    print('-' * 80)
    lang_a = sorted(lang_amounts.items(), key=lambda n: n[1], reverse=True)
    for i, (label, chars_in_range) in enumerate(lang_a, start=1):
        print('{:>3}. {:<10}  {:>5.2f}%'.format(i, label, chars_in_range))
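A possible invocation, assuming main is exposed as a plain function (the real script may wrap it in a CLI). This measures how much of each language's training text falls in the Cyrillic block, which spans U+0400 to U+04FF:

# Fraction of characters per language inside the Cyrillic block.
main(0x0400, 0x04FF)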
Example #3
def main(start: int, end: int) -> None:
    """Run."""
    # Read data
    data = wili.load_data()
    logger.info("Finished loading data")

    lang_amounts = {}  # type: Dict[str, List[Any]]
    for paragraph, label in zip(data["x_train"], data["y_train"]):
        if label not in lang_amounts:
            lang_amounts[label] = []
        chars_in_range = 0
        for char in paragraph:
            if start <= ord(char) <= end:
                chars_in_range += 1
        amount = float(chars_in_range) / len(paragraph)
        lang_amounts[label].append(amount)

    for key in lang_amounts.keys():
        lang_amounts[key] = np.array(lang_amounts[key]).mean() * 100

    print(f"Label    Chars in range [{start} - {end}]")
    print("-" * 80)
    lang_a = sorted(lang_amounts.items(), key=lambda n: n[1], reverse=True)
    for i, (label, chars_in_range_t) in enumerate(lang_a, start=1):
        print(f"{i:>3}. {label:<10}  {chars_in_range_t:>5.2f}%")
Example #4
    def get_languages(self) -> List[str]:
        """
        Find which languages are predicted for the WiLI dataset.

        Returns
        -------
        languages : List[str]
            Each str is an ISO 639-3 code
        """
        languages = set()
        # Read data
        data = wili.load_data()
        logger.info("Finished loading data")
        for set_name in ["test", "train"]:
            x_set_name = f"x_{set_name}"
            bar = progressbar.ProgressBar(redirect_stdout=True,
                                          max_value=len(data[x_set_name]))
            for i, el in enumerate(data[x_set_name]):
                try:
                    predicted = self.predict(el)
                except Exception as e:
                    predicted = "UNK"
                    logger.error({
                        "message": "Exception in get_languages",
                        "error": e
                    })
                languages.add(predicted)
                bar.update(i + 1)
            bar.finish()
        return sorted(languages)
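Note the small cleanup over Example #1: sorted() accepts any iterable and always returns a list, so the explicit list() wrapper there is unnecessary:

>>> sorted({"deu", "eng", "fra"})
['deu', 'eng', 'fra']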
Example #5
def main(coverage: float, metric: int, unicode_cutoff: int, set_name: str = "train"):
    """
    Train and test character distance models.

    Parameters
    ----------
    coverage : float
    metric : int
        Specify a function
    unicode_cutoff : int
    set_name : str
        Define on which set to evaluate
    """
    metrics = [
        ido,  # 0
        distance.braycurtis,  # 1
        distance.canberra,  # 2
        distance.chebyshev,  # 3 - l_infty
        distance.cityblock,  # 4
        distance.correlation,  # 5
        distance.cosine,  # 6
        distance.euclidean,  # 7
        distance.sqeuclidean,  # 8
        scipy.stats.entropy,  # 9
    ]
    metric_function = metrics[metric]

    # Read data
    data = wili.load_data()
    logger.info("Finished loading data")

    # Train
    trained = train(data, unicode_cutoff, coverage, metric_function)

    # Create model for each language and store it
    out_tmp = get_counts_by_lang(
        trained["common_chars"], trained["char_counter_by_lang"]
    )
    language_models, chars = out_tmp
    model_filename = "~/.lidtk/models/char_dist_{metric}_{cutoff}.pickle".format(
        metric=metric_function.__name__, cutoff=unicode_cutoff
    )
    model_filename = os.path.expanduser(model_filename)
    with open(model_filename, "wb") as handle:
        model_info = {"language_models": language_models, "chars": chars}
        pickle.dump(model_info, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Evaluate
    cm_filepath = "char_{metric}_{coverage}_{cutoff}_{set_name}.cm.csv".format(
        metric=metric_function.__name__,
        coverage=coverage,
        cutoff=unicode_cutoff,
        set_name=set_name,
    )
    cfg = lidtk.utils.load_cfg()
    cm_filepath = os.path.join(cfg["artifacts_path"], cm_filepath)
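The snippet breaks off before the evaluation step, and it leans on imports not shown: from scipy.spatial import distance, scipy.stats, os, pickle, plus project-local helpers (ido, train, get_counts_by_lang). A hypothetical invocation, with illustrative argument values:

# Hypothetical values; metric=6 selects distance.cosine from the list above.
main(coverage=0.8, metric=6, unicode_cutoff=0x20000, set_name="test")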
Example #6
def main(config_file):
    config = load_cfg(config_file)
    data = wili.load_data()
    ret = get_features(config, data)
    analyze_vocabulary(ret)
    print("First 20 samplex of x_train:")
    print(ret['xs']['x_train'][0])
    filepath = config['feature-extraction']['serialization_path']
    with open(filepath, 'wb') as handle:
        logging.info("Store model to '{}'".format(filepath))
        pickle.dump(ret['vectorizer'],
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)
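Reading the serialized vectorizer back is the mirror image, a plain pickle load from the configured path; a minimal sketch:

import pickle

with open(filepath, 'rb') as handle:
    vectorizer = pickle.load(handle)
# The restored vectorizer transforms new text exactly as during training.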
Example #7
def main(config_file: str) -> None:
    config = load_cfg(config_file)
    data = wili.load_data()
    ret = get_features(config, data)
    analyze_vocabulary(ret)
    print("First 20 samplex of x_train:")
    print(ret["xs"]["x_train"][0])
    filepath = config["feature-extraction"]["serialization_path"]
    with open(filepath, "wb") as handle:
        logger.info(f"Store model to '{filepath}'")
        pickle.dump(ret["vectorizer"],
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)
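The module-level logger that this and the other annotated examples call into is conventionally set up once per module:

import logging

logger = logging.getLogger(__name__)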
Example #8
File: __init__.py Project: ml-lab/lidtk
def train(config, data=None):
    """
    Train a neural network.

    Parameters
    ----------
    config : dict
    data : dict, optional (default: wili)
    """
    cfg = lidtk.utils.load_cfg(config)
    if data is None:
        # Read data
        data = wili.load_data()
        logging.info("Finished loading data")
    nn_module = imp.load_source('nn_module', cfg['classifier']['script_path'])
    model = nn_module.create_model(lidtk.features.get_dim(cfg),
                                   len(set(data['y_train'])))
    print(model.summary())
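imp.load_source is deprecated (and removed as of Python 3.12); if the snippet ever needs porting, the importlib equivalent looks roughly like this:

import importlib.util

spec = importlib.util.spec_from_file_location(
    'nn_module', cfg['classifier']['script_path'])
nn_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(nn_module)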
Example #9
def train(config: str, data: Optional[Dict[Any, Any]] = None) -> None:
    """
    Train a neural network.

    Parameters
    ----------
    config : str
    data : Optional[Dict[Any, Any]], optional (default: wili)
    """
    assert config is not None, "Run lidtk.utils.load_cfg(config)"
    cfg = lidtk.utils.load_cfg(config)
    if data is None:
        # Read data
        data = wili.load_data()
        logger.info("Finished loading data")
    nn_module = imp.load_source("nn_module", cfg["classifier"]["script_path"])
    model = nn_module.create_model(  # type: ignore
        lidtk.features.get_dim(cfg),
        len(set(data["y_train"]))  # type: ignore
    )
    print(model.summary())
Example #10
    def eval_wili(self, result_file, languages=None, eval_unk=False):
        """
        Evaluate the classifier on WiLI.

        Parameters
        ----------
        result_file : str
            Path to a file where the results will be stored
        languages : list, optional (default: All languages)
            Filter languages by this list
        eval_unk : bool, optional (default: False)
        """
        # Read data
        data = wili.load_data()
        logging.info("Finished loading data")
        times = []
        bar = progressbar.ProgressBar(redirect_stdout=True,
                                      max_value=len(data['x_test']))
        result_filepath = os.path.abspath(result_file)
        logging.info("Write results to {}".format(result_filepath))
        results = {'meta': {}}
        now = datetime.datetime.now()
        results['meta']['experiment_start'] = ('{:%Y-%m-%d %H:%M:%S}'
                                               .format(now))
        cl_results = {}
        if languages is None:
            eval_unk = False
        with open(result_filepath, 'w') as filepointer:
            for i, (el, label_t) in enumerate(zip(data['x_test'],
                                                  data['y_test'])):
                if languages is not None:
                    if label_t not in languages:
                        if eval_unk:
                            print('UNK')
                        else:
                            continue
                    else:
                        print(label_t)
                try:
                    t0 = time.time()
                    predicted = self.predict(el)
                    t1 = time.time()
                    times.append(t1 - t0)
                    bar.update(i + 1)
                    if label_t != predicted:
                        if label_t not in cl_results:
                            cl_results[label_t] = {}
                        if predicted not in cl_results[label_t]:
                            cl_results[label_t][predicted] = []
                        identifier = 'test_{}'.format(i)
                        cl_results[label_t][predicted].append([identifier,
                                                               el])
                except Exception as e:  # catch them all
                    logging.error({'message': 'Exception in eval_wili',
                                   'error': e})
                    predicted = 'UNK-exception'
                filepointer.write(predicted + '\n')
        bar.finish()
        results['cl_results'] = cl_results
        times = np.array(times)
        print("Average time per 10**6 elements: {:.2f}s"
              .format(times.mean() * 10**6))
        results['time_per_10*6'] = times.mean() * 10**6
        logfile = result_filepath + '.json'
        results['meta']['hardware'] = lidtk.utils.get_hardware_info()
        results['meta']['software'] = lidtk.utils.get_software_info()
        with io.open(logfile, 'w', encoding='utf8') as f:
            f.write(json.dumps(results,
                               indent=4,
                               sort_keys=True,
                               ensure_ascii=False))
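A hypothetical call, restricting evaluation to two languages and scoring everything else as UNK; clf stands for any classifier instance that exposes predict():

clf.eval_wili('predictions.txt',
              languages=['deu', 'eng'],
              eval_unk=True)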
Example #11
    def eval_wili(self,
                  result_file: str,
                  languages: Optional[List[str]] = None,
                  eval_unk: bool = False) -> None:
        """
        Evaluate the classifier on WiLI.

        Parameters
        ----------
        result_file : str
            Path to a file where the results will be stored
        languages : List[str], optional (default: All languages)
            Filter languages by this list
        eval_unk : bool, optional (default: False)
        """
        # Read data
        data = wili.load_data()
        logger.info("Finished loading data")
        times = []
        bar = progressbar.ProgressBar(redirect_stdout=True,
                                      max_value=len(data["x_test"]))
        result_filepath = os.path.abspath(result_file)
        logger.info(f"Write results to {result_filepath}")
        results: Dict[str, Any] = {"meta": {}}
        now = datetime.datetime.now()
        results["meta"]["experiment_start"] = f"{now:%Y-%m-%d %H:%M:%S}"
        cl_results = {}  # type: Dict[str, Dict[str, List[Any]]]
        if languages is None:
            eval_unk = False
        with open(result_filepath, "w") as filepointer:
            for i, (el,
                    label_t) in enumerate(zip(data["x_test"], data["y_test"])):
                if languages is not None:
                    if label_t not in languages:
                        if eval_unk:
                            print("UNK")
                        else:
                            continue
                    else:
                        print(label_t)
                try:
                    t0 = time.time()
                    predicted = self.predict(el)
                    t1 = time.time()
                    times.append(t1 - t0)
                    bar.update(i + 1)
                    if label_t != predicted:
                        if label_t not in cl_results:
                            cl_results[label_t] = {}
                        if predicted not in cl_results[label_t]:
                            cl_results[label_t][predicted] = []
                        identifier = f"test_{i}"
                        cl_results[label_t][predicted].append([identifier, el])
                except Exception as e:  # catch them all
                    logger.error({
                        "message": "Exception in eval_wili",
                        "error": e
                    })
                    predicted = "UNK-exception"
                filepointer.write(predicted + "\n")
        bar.finish()
        results["cl_results"] = cl_results
        times_arr = np.array(times)
        print(
            f"Average time per 10**6 elements: {times_arr.mean() * 10 ** 6:.2f}s"
        )
        results["time_per_10*6"] = times_arr.mean() * 10**6
        logfile = result_filepath + ".json"
        results["meta"]["hardware"] = lidtk.utils.get_hardware_info()
        results["meta"]["software"] = lidtk.utils.get_software_info()
        with open(logfile, "w", encoding="utf8") as f:
            f.write(
                json.dumps(results,
                           indent=4,
                           sort_keys=True,
                           ensure_ascii=False))
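The JSON sidecar written next to the result file can be inspected afterwards; a minimal sketch using the keys the method actually writes:

import json

with open(result_filepath + ".json", encoding="utf8") as f:
    results = json.load(f)
print(results["meta"]["experiment_start"])
print(results["time_per_10*6"])  # average seconds per 10**6 predictions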