Example #1
    def __init__(self, idx2labelname_mapping: Optional[Dict[int, str]] = None):
        """

        Parameters
        ----------
        idx2labelname_mapping : Optional[Dict[int, str]]
            Mapping from class index to label name. If this is not provided,
            then the class indices are used in all the reports.
        """
        super(PrecisionRecallFMeasure, self).__init__()
        self.idx2labelname_mapping = idx2labelname_mapping
        self.msg_printer = Printer()
        self.classification_metrics_utils = ClassificationMetricsUtils(
            idx2labelname_mapping=idx2labelname_mapping
        )

        # setup counters to calculate true positives, false positives,
        # false negatives and true negatives
        # The keys are the different class indices in the dataset and the
        # values are the number of true positives, false positives, false negatives
        # and true negatives for the dataset

        self.tp_counter = {}
        self.fp_counter = {}
        self.fn_counter = {}
        self.tn_counter = {}
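
A minimal construction sketch for this metric; the import path below is only a guess at sciwing's module layout, and the label mapping is an invented example.

# Hypothetical usage sketch; import path and labels are assumptions.
from sciwing.metrics.precision_recall_fmeasure import PrecisionRecallFMeasure

metric = PrecisionRecallFMeasure(
    idx2labelname_mapping={0: "background", 1: "title", 2: "author"}
)
# The counters start out empty and are filled as batches are scored.
print(metric.tp_counter)  # {}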
Example #2
def info_model(model: str, *, silent: bool = True) -> Dict[str, Any]:
    """Generate info about a specific model.

    model (str): Model name or path.
    silent (bool): Don't print anything, just return.
    RETURNS (dict): The model meta.
    """
    msg = Printer(no_print=silent, pretty=not silent)
    if util.is_package(model):
        model_path = util.get_package_path(model)
    else:
        model_path = Path(model)
    meta_path = model_path / "meta.json"
    if not meta_path.is_file():
        msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path)
    if model_path.resolve() != model_path:
        meta["source"] = str(model_path.resolve())
    else:
        meta["source"] = str(model_path)
    return {
        k: v
        for k, v in meta.items()
        if k not in ("accuracy", "performance", "speed")
    }
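
A short usage sketch, assuming a pipeline package is installed; the package name here is an illustration, not a value from the original project.

# "en_core_web_sm" is an assumed installed pipeline package.
meta = info_model("en_core_web_sm", silent=True)
print(meta["lang"], meta["version"])  # keys come from the package's meta.json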
Example #3
def download_file(url: str, dest_filename: str) -> None:
    """ Download a file from the given url

    Parameters
    ----------
    url : str
        The url from which the file will be downloaded
    dest_filename : str
        The destination filename

    """
    # NOTE the stream=True parameter below
    msg_printer = Printer()
    block_size = 65536
    r = requests.get(url, stream=True)
    total_size = int(r.headers.get("content-length", 0))
    written = 0
    with open(dest_filename, "wb") as f:
        for chunk in tqdm(
                r.iter_content(chunk_size=block_size),
                total=math.ceil(total_size / block_size),
                desc=f"Downloading from {url}",
        ):
            if chunk:  # filter out keep-alive new chunks
                written = written + len(chunk)
                f.write(chunk)
    msg_printer.good(f"Finished downloading {url} to {dest_filename}")
Example #4
def op_iter(
        data: List[Example],
        pre: List[PreProcessor],
        verbose: bool = True) -> Iterator[Tuple[int, Example, Dict[str, Any]]]:
    """Iterate over list of examples for an operation
    yielding tuples of (example hash, example, preprocessed outputs)

    Args:
        data (List[Example]): List of examples to iterate
        pre (List[PreProcessor]): List of preprocessors to run
        verbose (bool, optional): Show verbose output.

    Yields:
        Iterator[Tuple[int, Example, Dict[str, Any]]]: Tuples of (example hash, example, preprocessed outputs)
    """
    msg = Printer(no_print=not verbose, hide_animation=not verbose)
    preprocessed_outputs: Dict[Example, Dict[str, Any]] = defaultdict(dict)
    for processor in pre:
        with msg.loading(f"\t=> Running preprocessor {processor.name}..."):
            processor_outputs = list(processor(data))
            msg.good("Done")

        for example, output in zip(data, processor_outputs):
            preprocessed_outputs[example][processor.name] = output

    for example in data:
        yield hash(example), example.copy(
            deep=True), preprocessed_outputs[example]
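
A usage sketch, assuming recon-style Example objects and a preprocessor list already exist; `examples` and `preprocessors` are illustrative names.

# Illustrative loop; `examples` and `preprocessors` are assumed to exist.
for example_hash, example, preprocessed in op_iter(examples, preprocessors, verbose=False):
    # `preprocessed` maps each preprocessor name to its output for this example
    print(example_hash, list(preprocessed.keys()))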
Example #5
    def __init__(self, datasets_manager: DatasetsManager):
        """

        Parameters
        ----------
        datasets_manager : DatasetsManager
            The dataset manager managing the labels and other information
        """
        super(PrecisionRecallFMeasure,
              self).__init__(datasets_manager=datasets_manager)
        self.datasets_manager = datasets_manager
        self.idx2labelname_mapping = None
        self.msg_printer = Printer()
        self.classification_metrics_utils = ClassificationMetricsUtils()
        self.label_namespace = self.datasets_manager.label_namespaces[0]
        self.normalized_probs_namespace = "normalized_probs"
        self.label_numericalizer = self.datasets_manager.namespace_to_numericalizer[
            self.label_namespace]

        # setup counters to calculate true positives, false positives,
        # false negatives and true negatives
        # The keys are the different class indices in the dataset and the
        # values are the number of true positives, false positives, false negatives
        # and true negatives for the dataset

        self.tp_counter = {}
        self.fp_counter = {}
        self.fn_counter = {}
        self.tn_counter = {}
Example #6
def evaluate(
    model,
    data_path,
    gpu_id=-1,
    gold_preproc=False,
    displacy_path=None,
    displacy_limit=25,
    return_scores=False,
):
    """
    Evaluate a model. To render a sample of parses in an HTML file, set an
    output directory as the displacy_path argument.
    """
    msg = Printer()
    util.fix_random_seed()
    if gpu_id >= 0:
        util.use_gpu(gpu_id)
    util.set_env_log(False)
    data_path = util.ensure_path(data_path)
    displacy_path = util.ensure_path(displacy_path)
    if not data_path.exists():
        msg.fail("Evaluation data not found", data_path, exits=1)
    if displacy_path and not displacy_path.exists():
        msg.fail("Visualization output directory not found", displacy_path, exits=1)
    corpus = GoldCorpus(data_path, data_path)
    nlp = util.load_model(model)
    dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
    begin = timer()
    scorer = nlp.evaluate(dev_docs, verbose=False)
    end = timer()
    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
    results = {
        "Time": "%.2f s" % (end - begin),
        "Words": nwords,
        "Words/s": "%.0f" % (nwords / (end - begin)),
        "TOK": "%.2f" % scorer.token_acc,
        "POS": "%.2f" % scorer.tags_acc,
        "UAS": "%.2f" % scorer.uas,
        "LAS": "%.2f" % scorer.las,
        "NER P": "%.2f" % scorer.ents_p,
        "NER R": "%.2f" % scorer.ents_r,
        "NER F": "%.2f" % scorer.ents_f,
    }
    msg.table(results, title="Results")

    if displacy_path:
        docs, golds = zip(*dev_docs)
        render_deps = "parser" in nlp.meta.get("pipeline", [])
        render_ents = "ner" in nlp.meta.get("pipeline", [])
        render_parses(
            docs,
            displacy_path,
            model_name=model,
            limit=displacy_limit,
            deps=render_deps,
            ents=render_ents,
        )
        msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path)
    if return_scores:
        return scorer.scores
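
A usage sketch against this spaCy v2-era helper; the model name and data path are placeholders and the data must be in spaCy's old JSON training format.

# Placeholder arguments for illustration.
scores = evaluate(
    model="en_core_web_sm",
    data_path="dev.json",
    gpu_id=-1,
    return_scores=True,
)
print(scores.get("ents_f"))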
Example #7
File: common.py Project: yyht/sciwing
def cached_path(path: Union[pathlib.Path, str],
                url: str,
                unzip=True) -> pathlib.Path:

    if isinstance(path, str):
        path = pathlib.Path(path)
    msg_printer = Printer()
    if path.is_file() or path.is_dir():
        msg_printer.info(f"{path} exists.")
        return path

    download_file(url=url, dest_filename=str(path))

    if unzip:
        if zipfile.is_zipfile(str(path)):
            extract_zip(filename=str(path), destination_dir=str(path.parent))
        if tarfile.is_tarfile(str(path)):
            if "tar" in path.suffix:
                mode = "r"
            elif "gz" in path.suffix:
                mode = "r:gz"
            else:
                mode = "r"

            extract_tar(filename=str(path),
                        destination_dir=str(path.parent),
                        mode=mode)

    return path
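
A usage sketch; the path and URL below are placeholders rather than real sciwing resources. If the path already exists, the function returns immediately without downloading.

import pathlib

# Placeholder path and URL for illustration only.
data_dir = cached_path(
    path=pathlib.Path("/tmp/model_data.tar.gz"),
    url="https://example.com/model_data.tar.gz",
    unzip=True,
)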
Example #8
    def __init__(
        self,
        encoder: nn.Module,
        encoding_dim: int,
        num_classes: int,
        classification_layer_bias: bool,
    ):
        """ SimpleClassifier is a linear classifier head on top of any encoder

        Parameters
        ----------
        encoder : nn.Module
            Any encoder that takes in instances
        encoding_dim : int
            The encoding dimension
        num_classes : int
            The number of classes
        classification_layer_bias : bool
            Whether to add a bias to the classification layer.
            This is set to False only for debugging purposes.
        """
        super(SimpleClassifier, self).__init__()
        self.encoder = encoder
        self.encoding_dim = encoding_dim
        self.num_classes = num_classes
        print(self.num_classes)
        self.classification_layer_bias = classification_layer_bias
        self.classification_layer = nn.Linear(
            encoding_dim, num_classes, bias=self.classification_layer_bias)
        self._loss = CrossEntropyLoss()
        self.msg_printer = Printer()
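
A minimal construction sketch; the LSTM below is only a stand-in encoder, since any nn.Module that produces vectors of size encoding_dim would do, and the class count is invented.

import torch.nn as nn

# Stand-in encoder and made-up class count for illustration.
encoder = nn.LSTM(input_size=100, hidden_size=50, batch_first=True)
classifier = SimpleClassifier(
    encoder=encoder,
    encoding_dim=50,
    num_classes=23,
    classification_layer_bias=True,
)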
Example #9
def profile(model, inputs=None, n_texts=10000):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    Input should be formatted as one JSON object per line with a key "text".
    It can either be provided as a JSONL file, or be read from sys.stdin.
    If no input file is specified, the IMDB dataset is loaded via Thinc.
    """
    msg = Printer()
    if inputs is not None:
        inputs = _read_inputs(inputs, msg)
    if inputs is None:
        n_inputs = 25000
        with msg.loading("Loading IMDB dataset via Thinc..."):
            imdb_train, _ = thinc.extra.datasets.imdb()
            inputs, _ = zip(*imdb_train)
        msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
        inputs = inputs[:n_inputs]
    with msg.loading("Loading model '{}'...".format(model)):
        nlp = load_model(model)
    msg.good("Loaded model '{}'".format(model))
    texts = list(itertools.islice(inputs, n_texts))
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    msg.divider("Profile stats")
    s.strip_dirs().sort_stats("time").print_stats()
Example #10
def info(
    model: Optional[str] = None,
    *,
    markdown: bool = False,
    silent: bool = True,
    exclude: Optional[List[str]] = None,
) -> Union[str, dict]:
    msg = Printer(no_print=silent, pretty=not silent)
    if not exclude:
        exclude = []
    if model:
        title = f"Info about pipeline '{model}'"
        data = info_model(model, silent=silent)
    else:
        title = "Info about spaCy"
        data = info_spacy()
    raw_data = {k.lower().replace(" ", "_"): v for k, v in data.items()}
    if "Pipelines" in data and isinstance(data["Pipelines"], dict):
        data["Pipelines"] = ", ".join(f"{n} ({v})"
                                      for n, v in data["Pipelines"].items())
    markdown_data = get_markdown(data, title=title, exclude=exclude)
    if markdown:
        if not silent:
            print(markdown_data)
        return markdown_data
    if not silent:
        table_data = {k: v for k, v in data.items() if k not in exclude}
        msg.table(table_data, title=title)
    return raw_data
Example #11
def print_textcats_auc_per_cat(msg: Printer,
                               scores: Dict[str, Dict[str, float]]) -> None:
    msg.table(
        [(k, f"{v:.2f}") for k, v in scores.items()],
        header=("", "ROC AUC"),
        aligns=("l", "r"),
        title="Textcat ROC AUC (per label)",
    )
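
Calling this only needs a wasabi Printer and a mapping of label names to AUC scores; the values below are made up. Note that the function formats the values directly, so plain floats are expected.

from wasabi import Printer

# Made-up AUC values for illustration.
auc_scores = {"POSITIVE": 0.93, "NEGATIVE": 0.88}
print_textcats_auc_per_cat(Printer(), auc_scores)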
Example #12
def validate_config_for_pretrain(config: Config, msg: Printer) -> None:
    if "tok2vec" not in config["nlp"]["pipeline"]:
        msg.warn(
            "No tok2vec component found in the pipeline. If your tok2vec "
            "component has a different name, you may need to adjust the "
            "tok2vec_model reference in the [pretraining] block. If you don't "
            "have a tok2vec component, make sure to add it to your [components] "
            "and the pipeline specified in the [nlp] block, so you can pretrain "
            "weights for it.")
Example #13
def get_build_formats(formats: List[str]) -> Tuple[bool, bool]:
    supported = ["sdist", "wheel", "none"]
    for form in formats:
        if form not in supported:
            msg = Printer()
            err = f"Unknown build format: {form}. Supported: {', '.join(supported)}"
            msg.fail(err, exits=1)
    if not formats or "none" in formats:
        return (False, False)
    return ("sdist" in formats, "wheel" in formats)
Example #14
def print_prf_per_type(msg: Printer, scores: Dict[str, Dict[str, float]],
                       name: str, type: str) -> None:
    data = [(k, f"{v['p']*100:.2f}", f"{v['r']*100:.2f}", f"{v['f']*100:.2f}")
            for k, v in scores.items()]
    msg.table(
        data,
        header=("", "P", "R", "F"),
        aligns=("l", "r", "r", "r"),
        title=f"{name} (per {type})",
    )
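
A quick sketch of how this might be called; the per-label precision/recall/F1 values (on a 0-1 scale) are invented for illustration.

from wasabi import Printer

# Invented scores for two entity types.
ner_scores = {
    "PERSON": {"p": 0.91, "r": 0.87, "f": 0.89},
    "ORG": {"p": 0.78, "r": 0.74, "f": 0.76},
}
print_prf_per_type(Printer(), ner_scores, "NER", "type")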
Example #15
def convert_sectlabel_to_json(filename: str) -> Dict:
    """ Converts the secthead file into more readable json format

    Parameters
    ----------
    filename : str
        The sectlabel file name available at WING-NUS website

    Returns
    -------
    Dict[str, Any]
        text
            The text of the line
        label
            The label of the file
        file_no
            A unique file number
        line_count
            A line count within the file

    """
    file_count = 1
    line_count = 1
    output_json = {"parse_sect": []}
    msg_printer = Printer()

    with open(filename) as fp:
        for line in tqdm(fp, desc="Converting SectLabel File to JSON"):
            line = line.replace("\n", "")

            # if the line is empty then the next line is the beginning of a new file
            if not line:
                file_count += 1
                continue

            fields = line.split()
            line_content = fields[0]  # first column contains the content text
            line_content = line_content.replace(
                "|||", " "
            )  # every word in the line is separated by |||
            label = fields[-1]  # the last column contains the label
            line_json = {
                "text": line_content,
                "label": label,
                "file_no": file_count,
                "line_count": line_count,
            }
            line_count += 1

            output_json["parse_sect"].append(line_json)

    msg_printer.good("Finished converting sect label file to JSON")
    return output_json
Example #16
def cached_path(path: pathlib.Path, url: str, unzip=True) -> pathlib.Path:

    msg_printer = Printer()
    if path.is_file() or path.is_dir():
        msg_printer.info(f"{path} exists.")
        return path

    download_file(url=url, dest_filename=f"{str(path)}.zip")

    if unzip:
        extract_zip(filename=f"{path}.zip", destination_dir=str(path.parent))
Example #17
def setup_gpu(use_gpu: int, silent=None) -> None:
    """Configure the GPU and log info."""
    if silent is None:
        local_msg = Printer()
    else:
        local_msg = Printer(no_print=silent, pretty=not silent)
    if use_gpu >= 0:
        local_msg.info(f"Using GPU: {use_gpu}")
        require_gpu(use_gpu)
    else:
        local_msg.info("Using CPU")
        if gpu_is_available():
            local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
Example #18
class Output:
    def __init__(self, stages, *args, **kwargs):
        self.stages = stages
        self.context = None
        self.printer = Printer()

    def success(self, title="", text="", show=True, spaced=False, exits=None):
        self.write(
            self.printer.text(title=self.with_prefix(title), text=text, color=MESSAGES.GOOD, icon=MESSAGES.GOOD,
                              show=show, spaced=spaced, exits=exits, no_print=True)
        )

    def info(self, title="", text="", show=True, spaced=False, exits=None):
        self.write(
            self.printer.text(title=self.with_prefix(title), text=text, color=MESSAGES.INFO, icon=MESSAGES.INFO,
                              show=show, spaced=spaced, exits=exits, no_print=True)
        )

    def error(self, title="", text="", show=True, spaced=False, exits=None):
        self.write(
            self.printer.text(title=self.with_prefix(title), text=text, color=MESSAGES.FAIL, icon=MESSAGES.FAIL,
                              show=show, spaced=spaced, exits=exits, no_print=True)
        )

    def warning(self, title="", text="", show=True, spaced=False, exits=None):
        self.write(
            self.printer.text(title=self.with_prefix(title), text=text, color=MESSAGES.WARN, icon=MESSAGES.WARN,
                              show=show, spaced=spaced, exits=exits, no_print=True)
        )

    def set_description(self, *args, **kwargs):
        pass

    def close(self, *args, **kwargs):
        pass

    def write(self, text):
        click.echo(text)

    def set_context(self, context: str):
        self.context = context

    def line_prefix(self) -> str:
        return f"[{self.context}] " if self.context else ""

    def with_prefix(self, title) -> str:
        return f"{self.line_prefix()} {title}"

    def __iter__(self):
        return iter(self.stages)
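
A usage sketch; the stage names and messages are placeholders, and the class itself relies on the module-level imports (click and wasabi's MESSAGES constants) assumed by the original source.

# Placeholder stages and messages for illustration.
output = Output(stages=["download", "build", "publish"])
output.set_context("build")
output.info("Starting", text="compiling assets")
output.success("Done")
for stage in output:  # __iter__ yields the configured stages
    print(stage)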
Example #19
def run_on_all_states(f, index_slice=None):
    if index_slice is not None:
        states = list(us.STATES)[index_slice]
    else:
        states = list(us.STATES)
    run_task = catch_errors(f)
    results = [run_task(state) for state in states]

    successes = sum(result is Result.Success for result in results)
    errors = sum(result is Result.Error for result in results)
    printer = Printer()
    printer.info("Final result:")
    printer.info(f"{successes} were created successfully. {errors} errored.")
    printer.table(
        list(
            zip(
                [name for name in states],
                [
                    str(result) if result is not None else "Error"
                    for result in results
                ],
            )),
        header=("State", "Created"),
        divider=True,
    )
Example #20
File: evaluate.py Project: EricM2/venv
def print_prf_per_type(msg: Printer, scores: Dict[str, Dict[str, float]],
                       name: str, type: str) -> None:
    data = []
    for key, value in scores.items():
        row = [key]
        for k in ("p", "r", "f"):
            v = value[k]
            row.append(f"{v * 100:.2f}" if isinstance(v, (int, float)) else v)
        data.append(row)
    msg.table(
        data,
        header=("", "P", "R", "F"),
        aligns=("l", "r", "r", "r"),
        title=f"{name} (per {type})",
    )
Example #21
    def __init__(
            self,
            encoder: nn.Module,
            encoding_dim: int,
            num_classes: int,
            classification_layer_bias: bool = True,
            label_namespace: str = "label",
            datasets_manager: DatasetsManager = None,
            device: Union[torch.device, str] = torch.device("cpu"),
    ):
        """ SimpleClassifier is a linear classifier head on top of any encoder

        Parameters
        ----------
        encoder : nn.Module
            Any encoder that takes in lines and produces a single vector
            for every line.
        encoding_dim : int
            The encoding dimension
        num_classes : int
            The number of classes
        classification_layer_bias : bool
            Whether to add a bias to the classification layer.
            This is set to False only for debugging purposes.
        label_namespace : str
            The namespace used for labels in the dataset
        datasets_manager: DatasetsManager
            The datasets manager for the model
        device: torch.device
            The device on which the model is run
        """
        super(SimpleClassifier, self).__init__()
        self.encoder = encoder
        self.encoding_dim = encoding_dim
        self.num_classes = num_classes
        self.classification_layer_bias = classification_layer_bias
        self.classification_layer = nn.Linear(
            self.encoding_dim,
            num_classes,
            bias=self.classification_layer_bias)
        self._loss = CrossEntropyLoss()
        self.label_namespace = label_namespace
        self.datasets_manager = datasets_manager
        self.label_numericalizer = self.datasets_manager.namespace_to_numericalizer[
            self.label_namespace]
        self.device = torch.device(device) if isinstance(device,
                                                         str) else device
        self.msg_printer = Printer()
Example #22
def convert(
    input_path: Union[str, Path],
    output_dir: Union[str, Path],
    *,
    file_type: str = "json",
    n_sents: int = 1,
    seg_sents: bool = False,
    model: Optional[str] = None,
    morphology: bool = False,
    merge_subtokens: bool = False,
    converter: str = "auto",
    ner_map: Optional[Path] = None,
    lang: Optional[str] = None,
    concatenate: bool = False,
    silent: bool = True,
    msg: Optional[Printer],
) -> None:
    if not msg:
        msg = Printer(no_print=silent)
    ner_map = srsly.read_json(ner_map) if ner_map is not None else None
    doc_files = []
    for input_loc in walk_directory(Path(input_path), converter):
        with input_loc.open("r", encoding="utf-8") as infile:
            input_data = infile.read()
        # Use converter function to convert data
        func = CONVERTERS[converter]
        docs = func(
            input_data,
            n_sents=n_sents,
            seg_sents=seg_sents,
            append_morphology=morphology,
            merge_subtokens=merge_subtokens,
            lang=lang,
            model=model,
            no_print=silent,
            ner_map=ner_map,
        )
        doc_files.append((input_loc, docs))
    if concatenate:
        all_docs = itertools.chain.from_iterable([docs for _, docs in doc_files])
        doc_files = [(input_path, all_docs)]
    for input_loc, docs in doc_files:
        if file_type == "json":
            data = [docs_to_json(docs)]
            len_docs = len(data)
        else:
            db = DocBin(docs=docs, store_user_data=True)
            len_docs = len(db)
            data = db.to_bytes()
        if output_dir == "-":
            _print_docs_to_stdout(data, file_type)
        else:
            if input_loc != input_path:
                subpath = input_loc.relative_to(input_path)
                output_file = Path(output_dir) / subpath.with_suffix(f".{file_type}")
            else:
                output_file = Path(output_dir) / input_loc.parts[-1]
                output_file = output_file.with_suffix(f".{file_type}")
            _write_docs_to_file(data, output_file, file_type)
            msg.good(f"Generated output file ({len_docs} documents): {output_file}")
Example #23
def conllu_to_docs(
    input_data,
    n_sents=10,
    append_morphology=False,
    ner_map=None,
    merge_subtokens=False,
    no_print=False,
    **_
):
    """
    Convert conllu files into JSON format for use with train cli.
    append_morphology parameter enables appending morphology to tags, which is
    useful for languages such as Spanish, where UD tags are not so rich.

    Extract NER tags if available and convert them so that they follow
    BILUO and the Wikipedia scheme
    """
    MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
    msg = Printer(no_print=no_print)
    n_sents_info(msg, n_sents)
    sent_docs = read_conllx(
        input_data,
        append_morphology=append_morphology,
        ner_tag_pattern=MISC_NER_PATTERN,
        ner_map=ner_map,
        merge_subtokens=merge_subtokens,
    )
    sent_docs_to_merge = []
    for sent_doc in sent_docs:
        sent_docs_to_merge.append(sent_doc)
        if len(sent_docs_to_merge) % n_sents == 0:
            yield Doc.from_docs(sent_docs_to_merge)
            sent_docs_to_merge = []
    if sent_docs_to_merge:
        yield Doc.from_docs(sent_docs_to_merge)
Example #24
def dashboard(data_dir: Path) -> None:
    """Calculate statistics on a Corpus

    Args:
        data_dir (Path): Path to data folder
    """
    msg: Printer = Printer()

    # with msg.loading("Loading Corpus from Disk"):
    corpus = Corpus.from_disk(data_dir)
    # msg.good("Done")

    ner_stats = corpus.apply(get_ner_stats)

    # external_stylesheets = [
    #     "https://codepen.io/chriddyp/pen/bWLwgP.css",
    #     "https://cdn.jsdelivr.net/npm/[email protected]/dist/css/uikit.min.css"
    # ]

    # external_scripts = [
    #     "https://cdn.jsdelivr.net/npm/[email protected]/dist/js/uikit.min.js",
    #     "https://cdn.jsdelivr.net/npm/[email protected]/dist/js/uikit-icons.min.js"
    # ]

    # app = dash.Dash(__name__, external_stylesheets=external_stylesheets, external_scripts=external_scripts)

    # def generate_bar_chart_stats(id: str, ner_stats: NERStats, name: str = None):
    #     return dcc.Graph(
    #         id=id,
    #         figure={
    #             'data': [
    #                 go.Bar(
    #                     x = list(ner_stats.n_annotations_per_type.values()),
    #                     y = list(ner_stats.n_annotations_per_type.keys()),
    #                     orientation='h'
    #                 )
    #             ],
    #             'layout': {
    #                 'title': name or id.capitalize()
    #             }
    #         }
    #     )

    # app.layout = html.Div(children=[
    #     html.Div(className="uk-child-width-1-2@s uk-grid-match")
    #     html.H1(className="" children='Recon NER Dashboard'),

    #     html.Div(children='''
    #         This dashboard shows statistics for all your data.
    #     '''),

    #     html.Div(children=[
    #         generate_bar_chart_stats("train", ner_stats["train"]),
    #         generate_bar_chart_stats("dev", ner_stats["dev"]),
    #         generate_bar_chart_stats("test", ner_stats["test"]),
    #         generate_bar_chart_stats("all", ner_stats["all"])
    #     ], style={'columnCount': 4})
    # ])

    uvicorn.run(app, port=9090)
Example #25
def handle_scores_per_type(
    scores: Dict[str, Any],
    data: Dict[str, Any] = {},
    *,
    spans_key: str = "sc",
    silent: bool = False,
) -> Dict[str, Any]:
    msg = Printer(no_print=silent, pretty=not silent)
    if "morph_per_feat" in scores:
        if scores["morph_per_feat"]:
            print_prf_per_type(msg, scores["morph_per_feat"], "MORPH", "feat")
            data["morph_per_feat"] = scores["morph_per_feat"]
    if "dep_las_per_type" in scores:
        if scores["dep_las_per_type"]:
            print_prf_per_type(msg, scores["dep_las_per_type"], "LAS", "type")
            data["dep_las_per_type"] = scores["dep_las_per_type"]
    if "ents_per_type" in scores:
        if scores["ents_per_type"]:
            print_prf_per_type(msg, scores["ents_per_type"], "NER", "type")
            data["ents_per_type"] = scores["ents_per_type"]
    if f"spans_{spans_key}_per_type" in scores:
        if scores[f"spans_{spans_key}_per_type"]:
            print_prf_per_type(
                msg, scores[f"spans_{spans_key}_per_type"], "SPANS", "type"
            )
            data[f"spans_{spans_key}_per_type"] = scores[f"spans_{spans_key}_per_type"]
    if "cats_f_per_type" in scores:
        if scores["cats_f_per_type"]:
            print_prf_per_type(msg, scores["cats_f_per_type"], "Textcat F", "label")
            data["cats_f_per_type"] = scores["cats_f_per_type"]
    if "cats_auc_per_type" in scores:
        if scores["cats_auc_per_type"]:
            print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"])
            data["cats_auc_per_type"] = scores["cats_auc_per_type"]
    return scores
Example #26
def pull_state_graph(state, include_data=False):
    printer = Printer()
    fips = state.fips
    name = state.name
    with printer.loading(f"Downloading shapefile for {name}..."):
        df = geopandas.read_file(
            "http://www2.census.gov/geo/tiger/TIGER2010/BG/"
            f"2010/tl_2010_{fips}_bg10.zip")
    df.set_index("GEOID10", inplace=True, drop=False)
    if include_data:
        with printer.loading(f"Downloading block group data for {name}..."):
            data = data_for_state(fips, "block group")
        data.set_index("geoid", inplace=True)
        df = df.join(data)
    with printer.loading(f"Creating graph for {name}..."):
        graph = gerrychain.Graph.from_geodataframe(df)
    return graph, df
Example #27
def main(uri, table_path, schema, write_mode):
    msg = Printer()
    project_id, dataset_id, _ = table_path.split(".")
    config = Config(project_id=project_id, dataset_id=dataset_id)
    client = config.client()
    table_ref = str_to_bq_ref(table_path)

    load_job_config = bq.LoadJobConfig()
    load_job_config.schema = client.schema_from_json(schema)
    load_job_config.source_format = bq.SourceFormat.NEWLINE_DELIMITED_JSON
    load_job_config.ignore_unknown_values = True
    load_job_config.write_disposition = "WRITE_APPEND"
    load_job_config.max_bad_records = 100

    assert write_mode in ["CREATE_NEW", "WRITE_APPEND"]
    table_id = table_path.split(".")[-1]
    exists = any([
        table_id == table.table_id
        for table in client.list_tables(client.dataset(dataset_id))
    ])

    if exists and write_mode == "CREATE_NEW":
        msg.info(f"{table_path} already exists. Write_mode: {write_mode}")
        client.delete_table(table_ref)
        table = bq.Table(table_ref, schema=client.schema_from_json(schema))
        client.create_table(table)

    load_job = client.load_table_from_uri(uri,
                                          table_ref,
                                          job_config=load_job_config)
    with msg.loading("Loading data..."):
        load_job.result()
    msg.good("Data succesfully loaded!")
Example #28
def extract_tar(filename: str, destination_dir: str, mode="r"):
    """ Extracts tar, targz and other files

    Parameters
    ----------
    filename : str
        The tar zipped file
    destination_dir : str
        The destination directory in which the files should be placed
    mode : str
        A valid tar mode. You can refer to https://docs.python.org/3/library/tarfile.html
        for the different modes.

    Returns
    -------

    """
    msg_printer = Printer()
    try:
        with msg_printer.loading(
                f"Unzipping file {filename} to {destination_dir}"):
            stdout.flush()
            with tarfile.open(filename, mode) as t:
                t.extractall(destination_dir)

        msg_printer.good(
            f"Finished extracting {filename} to {destination_dir}")
    except tarfile.ExtractError:
        msg_printer.fail(f"Could not extract {filename} to {destination_dir}")
Example #29
def convert_parscit_to_conll(
    parscit_train_filepath: pathlib.Path, ) -> List[Dict[str, Any]]:
    """ Convert the parscit data available at
    "https://github.com/knmnyn/ParsCit/blob/master/crfpp/traindata/parsCit.train.data"
    to a dummy CoNLL version.
    This is done so that we can use it with AllenNLP's built-in conll2003
    dataset reader

    Parameters
    ----------------
    parscit_train_filepath: pathlib.Path
        The path where the train file path is stored

    """
    printer = Printer()
    citation_string = []
    word_tags = []
    output_list = []
    with printer.loading(
            f"Converting {parscit_train_filepath.name} to conll format"):
        with open(str(parscit_train_filepath), "r", encoding="latin-1") as fp:
            for line in fp:
                if bool(line.strip()):
                    fields = line.strip().split()
                    word = fields[0]
                    tag = fields[-1]
                    word = word.strip()
                    tag = f"{tag.strip()}"
                    word_tag = " ".join([word] + [tag] * 3)
                    citation_string.append(word)
                    word_tags.append(word_tag)
                else:
                    citation_string = " ".join(citation_string)
                    output_list.append({
                        "word_tags": word_tags,
                        "citation_string": citation_string
                    })
                    citation_string = []
                    word_tags = []

    printer.good(
        f"Successfully converted {parscit_train_filepath.name} to conll format"
    )
    return output_list
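
A usage sketch; the local path to the downloaded ParsCit training file is a placeholder.

import pathlib

# Placeholder path to the downloaded parsCit.train.data file.
records = convert_parscit_to_conll(pathlib.Path("/tmp/parsCit.train.data"))
print(len(records), records[0]["citation_string"][:60])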
Example #30
def corpus_trainer(cb, cpt, custom):
    from chatterbot.trainers import ChatterBotCorpusTrainer
    
    trainer = ChatterBotCorpusTrainer(cb)

    if custom:
        for mode in custom.split():
            try:
                trainer.train("chatterbot.corpus.english.{}".format(mode))
            except FileNotFoundError:
                from wasabi import Printer
                msg = Printer()
                msg.fail("That corpus doesn't exist!")
                return -1

        print("all done training masta!")
    elif cpt:
        trainer.train("chatterbot.corpus.english")
        print("all done training masta!")
Example #31
File: info.py Project: spacy-io/spaCy
def info(model=None, markdown=False, silent=False):
    """
    Print info about spaCy installation. If a model shortcut link is
    specified as an argument, print model information. Flag --markdown
    prints details in Markdown for easy copy-pasting to GitHub issues.
    """
    msg = Printer()
    if model:
        if util.is_package(model):
            model_path = util.get_package_path(model)
        else:
            model_path = util.get_data_path() / model
        meta_path = model_path / "meta.json"
        if not meta_path.is_file():
            msg.fail("Can't find model meta.json", meta_path, exits=1)
        meta = srsly.read_json(meta_path)
        if model_path.resolve() != model_path:
            meta["link"] = path2str(model_path)
            meta["source"] = path2str(model_path.resolve())
        else:
            meta["source"] = path2str(model_path)
        if not silent:
            title = "Info about model '{}'".format(model)
            model_meta = {
                k: v for k, v in meta.items() if k not in ("accuracy", "speed")
            }
            if markdown:
                print_markdown(model_meta, title=title)
            else:
                msg.table(model_meta, title=title)
        return meta
    data = {
        "spaCy version": about.__version__,
        "Location": path2str(Path(__file__).parent.parent),
        "Platform": platform.platform(),
        "Python version": platform.python_version(),
        "Models": list_models(),
    }
    if not silent:
        title = "Info about spaCy"
        if markdown:
            print_markdown(data, title=title)
        else:
            msg.table(data, title=title)
    return data
Example #32
File: link.py Project: spacy-io/spaCy
def link(origin, link_name, force=False, model_path=None):
    """
    Create a symlink for models within the spacy/data directory. Accepts
    either the name of a pip package, or the local path to the model data
    directory. Linking models allows loading them via spacy.load(link_name).
    """
    msg = Printer()
    if util.is_package(origin):
        model_path = util.get_package_path(origin)
    else:
        model_path = Path(origin) if model_path is None else Path(model_path)
    if not model_path.exists():
        msg.fail(
            "Can't locate model data",
            "The data should be located in {}".format(path2str(model_path)),
            exits=1,
        )
    data_path = util.get_data_path()
    if not data_path or not data_path.exists():
        spacy_loc = Path(__file__).parent.parent
        msg.fail(
            "Can't find the spaCy data path to create model symlink",
            "Make sure a directory `/data` exists within your spaCy "
            "installation and try again. The data directory should be located "
            "here:".format(path=spacy_loc),
            exits=1,
        )
    link_path = util.get_data_path() / link_name
    if link_path.is_symlink() and not force:
        msg.fail(
            "Link '{}' already exists".format(link_name),
            "To overwrite an existing link, use the --force flag",
            exits=1,
        )
    elif link_path.is_symlink():  # does a symlink exist?
        # NB: It's important to check for is_symlink here and not for exists,
        # because invalid/outdated symlinks would return False otherwise.
        link_path.unlink()
    elif link_path.exists():  # does it exist otherwise?
        # NB: Check this last because valid symlinks also "exist".
        msg.fail(
            "Can't overwrite symlink '{}'".format(link_name),
            "This can happen if your data directory contains a directory or "
            "file of the same name.",
            exits=1,
        )
    details = "%s --> %s" % (path2str(model_path), path2str(link_path))
    try:
        symlink_to(link_path, model_path)
    except:  # noqa: E722
        # This is quite dirty, but just making sure other errors are caught.
        msg.fail(
            "Couldn't link model to '{}'".format(link_name),
            "Creating a symlink in spacy/data failed. Make sure you have the "
            "required permissions and try re-running the command as admin, or "
            "use a virtualenv. You can still import the model as a module and "
            "call its load() method, or create the symlink manually.",
        )
        msg.text(details)
        raise
    msg.good("Linking successful", details)
    msg.text("You can now load the model via spacy.load('{}')".format(link_name))
Example #33
def debug_data(
    lang,
    train_path,
    dev_path,
    base_model=None,
    pipeline="tagger,parser,ner",
    ignore_warnings=False,
    ignore_validation=False,
    verbose=False,
    no_format=False,
):
    msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)

    # Make sure all files and paths exists if they are needed
    if not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)

    # Initialize the model and pipeline
    pipeline = [p.strip() for p in pipeline.split(",")]
    if base_model:
        nlp = load_model(base_model)
    else:
        lang_cls = get_lang_class(lang)
        nlp = lang_cls()

    msg.divider("Data format validation")
    # Load the data in one go – might take a while but okay in this case
    train_data = _load_file(train_path, msg)
    dev_data = _load_file(dev_path, msg)

    # Validate data format using the JSON schema
    # TODO: update once the new format is ready
    train_data_errors = []  # TODO: validate_json
    dev_data_errors = []  # TODO: validate_json
    if not train_data_errors:
        msg.good("Training data JSON format is valid")
    if not dev_data_errors:
        msg.good("Development data JSON format is valid")
    for error in train_data_errors:
        msg.fail("Training data: {}".format(error))
    for error in dev_data_errors:
        msg.fail("Develoment data: {}".format(error))
    if (train_data_errors or dev_data_errors) and not ignore_validation:
        sys.exit(1)

    # Create the gold corpus to be able to better analyze data
    with msg.loading("Analyzing corpus..."):
        train_data = read_json_object(train_data)
        dev_data = read_json_object(dev_data)
        corpus = GoldCorpus(train_data, dev_data)
        train_docs = list(corpus.train_docs(nlp))
        dev_docs = list(corpus.dev_docs(nlp))
    msg.good("Corpus is loadable")

    # Create all gold data here to avoid iterating over the train_docs constantly
    gold_data = _compile_gold(train_docs, pipeline)
    train_texts = gold_data["texts"]
    dev_texts = set([doc.text for doc, gold in dev_docs])

    msg.divider("Training stats")
    msg.text("Training pipeline: {}".format(", ".join(pipeline)))
    for pipe in [p for p in pipeline if p not in nlp.factories]:
        msg.fail("Pipeline component '{}' not available in factories".format(pipe))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
    msg.text("{} training docs".format(len(train_docs)))
    msg.text("{} evaluation docs".format(len(dev_docs)))

    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn("{} training examples also in evaluation data".format(overlap))
    else:
        msg.good("No overlap between training and evaluation data")
    if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
        text = "Low number of examples to train from a blank model ({})".format(
            len(train_docs)
        )
        if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
            msg.fail(text)
        else:
            msg.warn(text)
        msg.text(
            "It's recommended to use at least {} examples (minimum {})".format(
                BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
            ),
            show=verbose,
        )

    msg.divider("Vocab & Vectors")
    n_words = gold_data["n_words"]
    msg.info(
        "{} total {} in the data ({} unique)".format(
            n_words, "word" if n_words == 1 else "words", len(gold_data["words"])
        )
    )
    most_common_words = gold_data["words"].most_common(10)
    msg.text(
        "10 most common words: {}".format(
            _format_labels(most_common_words, counts=True)
        ),
        show=verbose,
    )
    if len(nlp.vocab.vectors):
        msg.info(
            "{} vectors ({} unique keys, {} dimensions)".format(
                len(nlp.vocab.vectors),
                nlp.vocab.vectors.n_keys,
                nlp.vocab.vectors_length,
            )
        )
    else:
        msg.info("No word vectors present in the model")

    if "ner" in pipeline:
        # Get all unique NER labels present in the data
        labels = set(label for label in gold_data["ner"] if label not in ("O", "-"))
        label_counts = gold_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        has_low_data_warning = False
        has_no_neg_warning = False
        has_ws_ents_error = False

        msg.divider("Named Entity Recognition")
        msg.info(
            "{} new {}, {} existing {}".format(
                len(new_labels),
                "label" if len(new_labels) == 1 else "labels",
                len(existing_labels),
                "label" if len(existing_labels) == 1 else "labels",
            )
        )
        missing_values = label_counts["-"]
        msg.text(
            "{} missing {} (tokens with '-' label)".format(
                missing_values, "value" if missing_values == 1 else "values"
            )
        )
        if new_labels:
            labels_with_counts = [
                (label, count)
                for label, count in label_counts.most_common()
                if label != "-"
            ]
            labels_with_counts = _format_labels(labels_with_counts, counts=True)
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text(
                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
            )

        if gold_data["ws_ents"]:
            msg.fail("{} invalid whitespace entity spans".format(gold_data["ws_ents"]))
            has_ws_ents_error = True

        for label in new_labels:
            if label_counts[label] <= NEW_LABEL_THRESHOLD:
                msg.warn(
                    "Low number of examples for new label '{}' ({})".format(
                        label, label_counts[label]
                    )
                )
                has_low_data_warning = True

                with msg.loading("Analyzing label distribution..."):
                    neg_docs = _get_examples_without_label(train_docs, label)
                if neg_docs == 0:
                    msg.warn(
                        "No examples for texts WITHOUT new label '{}'".format(label)
                    )
                    has_no_neg_warning = True

        if not has_low_data_warning:
            msg.good("Good amount of examples for all labels")
        if not has_no_neg_warning:
            msg.good("Examples without occurences available for all labels")
        if not has_ws_ents_error:
            msg.good("No entities consisting of or starting/ending with whitespace")

        if has_low_data_warning:
            msg.text(
                "To train a new entity type, your data should include at "
                "least {} insteances of the new label".format(NEW_LABEL_THRESHOLD),
                show=verbose,
            )
        if has_no_neg_warning:
            msg.text(
                "Training data should always include examples of entities "
                "in context, as well as examples without a given entity "
                "type.",
                show=verbose,
            )
        if has_ws_ents_error:
            msg.text(
                "As of spaCy v2.1.0, entity spans consisting of or starting/ending "
                "with whitespace characters are considered invalid."
            )

    if "textcat" in pipeline:
        msg.divider("Text Classification")
        labels = [label for label in gold_data["textcat"]]
        model_labels = _get_labels_from_model(nlp, "textcat")
        new_labels = [l for l in labels if l not in model_labels]
        existing_labels = [l for l in labels if l in model_labels]
        msg.info(
            "Text Classification: {} new label(s), {} existing label(s)".format(
                len(new_labels), len(existing_labels)
            )
        )
        if new_labels:
            labels_with_counts = _format_labels(
                gold_data["textcat"].most_common(), counts=True
            )
            msg.text("New: {}".format(labels_with_counts), show=verbose)
        if existing_labels:
            msg.text(
                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
            )

    if "tagger" in pipeline:
        msg.divider("Part-of-speech Tagging")
        labels = [label for label in gold_data["tags"]]
        tag_map = nlp.Defaults.tag_map
        msg.info(
            "{} {} in data ({} {} in tag map)".format(
                len(labels),
                "label" if len(labels) == 1 else "labels",
                len(tag_map),
                "label" if len(tag_map) == 1 else "labels",
            )
        )
        labels_with_counts = _format_labels(
            gold_data["tags"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)
        non_tagmap = [l for l in labels if l not in tag_map]
        if not non_tagmap:
            msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
        for label in non_tagmap:
            msg.fail(
                "Label '{}' not found in tag map for language '{}'".format(
                    label, nlp.lang
                )
            )

    if "parser" in pipeline:
        msg.divider("Dependency Parsing")
        labels = [label for label in gold_data["deps"]]
        msg.info(
            "{} {} in data".format(
                len(labels), "label" if len(labels) == 1 else "labels"
            )
        )
        labels_with_counts = _format_labels(
            gold_data["deps"].most_common(), counts=True
        )
        msg.text(labels_with_counts, show=verbose)

    msg.divider("Summary")
    good_counts = msg.counts[MESSAGES.GOOD]
    warn_counts = msg.counts[MESSAGES.WARN]
    fail_counts = msg.counts[MESSAGES.FAIL]
    if good_counts:
        msg.good(
            "{} {} passed".format(
                good_counts, "check" if good_counts == 1 else "checks"
            )
        )
    if warn_counts:
        msg.warn(
            "{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
        )
    if fail_counts:
        msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))

    if fail_counts:
        sys.exit(1)
Example #34
File: train.py Project: spacy-io/spaCy
def train(
    lang,
    output_path,
    train_path,
    dev_path,
    raw_text=None,
    base_model=None,
    pipeline="tagger,parser,ner",
    vectors=None,
    n_iter=30,
    n_early_stopping=None,
    n_examples=0,
    use_gpu=-1,
    version="0.0.0",
    meta_path=None,
    init_tok2vec=None,
    parser_multitasks="",
    entity_multitasks="",
    noise_level=0.0,
    eval_beam_widths="",
    gold_preproc=False,
    learn_tokens=False,
    verbose=False,
    debug=False,
):
    """
    Train or update a spaCy model. Requires data to be formatted in spaCy's
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """
    msg = Printer()
    util.fix_random_seed()
    util.set_env_log(verbose)

    # Make sure all files and paths exists if they are needed
    train_path = util.ensure_path(train_path)
    dev_path = util.ensure_path(dev_path)
    meta_path = util.ensure_path(meta_path)
    output_path = util.ensure_path(output_path)
    if raw_text is not None:
        raw_text = list(srsly.read_jsonl(raw_text))
    if not train_path or not train_path.exists():
        msg.fail("Training data not found", train_path, exits=1)
    if not dev_path or not dev_path.exists():
        msg.fail("Development data not found", dev_path, exits=1)
    if meta_path is not None and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)
    meta = srsly.read_json(meta_path) if meta_path else {}
    if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
        msg.warn(
            "Output directory is not empty",
            "This can lead to unintended side effects when saving the model. "
            "Please use an empty directory or a different path instead. If "
            "the specified output path doesn't exist, the directory will be "
            "created for you.",
        )
    if not output_path.exists():
        output_path.mkdir()

    # Take dropout and batch size as generators of values -- dropout
    # starts high and decays sharply, to force the optimizer to explore.
    # Batch size starts at 1 and grows, so that we make updates quickly
    # at the beginning of training.
    dropout_rates = util.decaying(
        util.env_opt("dropout_from", 0.2),
        util.env_opt("dropout_to", 0.2),
        util.env_opt("dropout_decay", 0.0),
    )
    batch_sizes = util.compounding(
        util.env_opt("batch_from", 100.0),
        util.env_opt("batch_to", 1000.0),
        util.env_opt("batch_compound", 1.001),
    )

    if not eval_beam_widths:
        eval_beam_widths = [1]
    else:
        eval_beam_widths = [int(bw) for bw in eval_beam_widths.split(",")]
        if 1 not in eval_beam_widths:
            eval_beam_widths.append(1)
        eval_beam_widths.sort()
    has_beam_widths = eval_beam_widths != [1]

    # Set up the base model and pipeline. If a base model is specified, load
    # the model and make sure the pipeline matches the pipeline setting. If
    # training starts from a blank model, initialize the language class.
    pipeline = [p.strip() for p in pipeline.split(",")]
    msg.text("Training pipeline: {}".format(pipeline))
    if base_model:
        msg.text("Starting with base model '{}'".format(base_model))
        nlp = util.load_model(base_model)
        if nlp.lang != lang:
            msg.fail(
                "Model language ('{}') doesn't match language specified as "
                "`lang` argument ('{}') ".format(nlp.lang, lang),
                exits=1,
            )
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
        nlp.disable_pipes(*other_pipes)
        for pipe in pipeline:
            if pipe not in nlp.pipe_names:
                nlp.add_pipe(nlp.create_pipe(pipe))
    else:
        msg.text("Starting with blank model '{}'".format(lang))
        lang_cls = util.get_lang_class(lang)
        nlp = lang_cls()
        for pipe in pipeline:
            nlp.add_pipe(nlp.create_pipe(pipe))

    if learn_tokens:
        nlp.add_pipe(nlp.create_pipe("merge_subtokens"))

    if vectors:
        msg.text("Loading vector from model '{}'".format(vectors))
        _load_vectors(nlp, vectors)

    # Multitask objectives
    multitask_options = [("parser", parser_multitasks), ("ner", entity_multitasks)]
    for pipe_name, multitasks in multitask_options:
        if multitasks:
            if pipe_name not in pipeline:
                msg.fail(
                    "Can't use multitask objective without '{}' in the "
                    "pipeline".format(pipe_name)
                )
            pipe = nlp.get_pipe(pipe_name)
            for objective in multitasks.split(","):
                pipe.add_multitask_objective(objective)

    # Prepare training corpus
    msg.text("Counting training words (limit={})".format(n_examples))
    corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
    n_train_words = corpus.count_train()

    if base_model:
        # Start with an existing model, use default optimizer
        optimizer = create_default_optimizer(Model.ops)
    else:
        # Start with a blank model, call begin_training
        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)

    nlp._optimizer = None

    # Load in pre-trained weights
    if init_tok2vec is not None:
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
        msg.text("Loaded pretrained tok2vec for: {}".format(components))

    # fmt: off
    row_head = ["Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS"]
    row_widths = [3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7]
    if has_beam_widths:
        row_head.insert(1, "Beam W.")
        row_widths.insert(1, 7)
    row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
    # fmt: on
    print("")
    msg.row(row_head, **row_settings)
    msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
                )
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        sgd=optimizer,
                        drop=next(dropout_rates),
                        losses=losses,
                    )
                    if raw_text:
                        # If raw text is available, perform 'rehearsal' updates,
                        # which use unlabelled data to reduce overfitting.
                        raw_batch = list(next(raw_batches))
                        nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
                    if not int(os.environ.get("LOG_FRIENDLY", 0)):
                        pbar.update(sum(len(doc) for doc in docs))
                    words_seen += sum(len(doc) for doc in docs)
            with nlp.use_params(optimizer.averages):
                util.set_env_log(False)
                epoch_model_path = output_path / ("model%d" % i)
                nlp.to_disk(epoch_model_path)
                nlp_loaded = util.load_model_from_path(epoch_model_path)
                for beam_width in eval_beam_widths:
                    for name, component in nlp_loaded.pipeline:
                        if hasattr(component, "cfg"):
                            component.cfg["beam_width"] = beam_width
                    dev_docs = list(
                        corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
                    )
                    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
                    start_time = timer()
                    scorer = nlp_loaded.evaluate(dev_docs, debug)
                    end_time = timer()
                    if use_gpu < 0:
                        gpu_wps = None
                        cpu_wps = nwords / (end_time - start_time)
                    else:
                        gpu_wps = nwords / (end_time - start_time)
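                        # When training on GPU, reload the saved model on CPU so that
                        # CPU words-per-second can be timed separately from GPU WPS.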
                        with Model.use_device("cpu"):
                            nlp_loaded = util.load_model_from_path(epoch_model_path)
                            for name, component in nlp_loaded.pipeline:
                                if hasattr(component, "cfg"):
                                    component.cfg["beam_width"] = beam_width
                            dev_docs = list(
                                corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
                            )
                            start_time = timer()
                            scorer = nlp_loaded.evaluate(dev_docs)
                            end_time = timer()
                            cpu_wps = nwords / (end_time - start_time)
                    acc_loc = output_path / ("model%d" % i) / "accuracy.json"
                    srsly.write_json(acc_loc, scorer.scores)

                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
                    meta["spacy_version"] = ">=%s" % about.__version__
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                        meta["accuracy"] = scorer.scores
                    else:
                        meta.setdefault("beam_accuracy", {})
                        meta.setdefault("beam_speed", {})
                        meta["beam_accuracy"][beam_width] = scorer.scores
                        meta["beam_speed"][beam_width] = {
                            "nwords": nwords,
                            "cpu": cpu_wps,
                            "gpu": gpu_wps,
                        }
                    meta["vectors"] = {
                        "width": nlp.vocab.vectors_length,
                        "vectors": len(nlp.vocab.vectors),
                        "keys": nlp.vocab.vectors.n_keys,
                        "name": nlp.vocab.vectors.name,
                    }
                    meta.setdefault("name", "model%d" % i)
                    meta.setdefault("version", version)
                    meta_loc = output_path / ("model%d" % i) / "meta.json"
                    srsly.write_json(meta_loc, meta)
                    util.set_env_log(verbose)

                    progress = _get_progress(
                        i,
                        losses,
                        scorer.scores,
                        beam_width=beam_width if has_beam_widths else None,
                        cpu_wps=cpu_wps,
                        gpu_wps=gpu_wps,
                    )
                    msg.row(progress, **row_settings)
                # Early stopping
                if n_early_stopping is not None:
                    current_score = _score_for_model(meta)
                    if current_score < best_score:
                        iter_since_best += 1
                    else:
                        iter_since_best = 0
                        best_score = current_score
                    if iter_since_best >= n_early_stopping:
                        msg.text(
                            "Early stopping, best iteration "
                            "is: {}".format(i - iter_since_best)
                        )
                        msg.text(
                            "Best score = {}; Final iteration "
                            "score = {}".format(best_score, current_score)
                        )
                        break
    finally:
        with nlp.use_params(optimizer.averages):
            final_model_path = output_path / "model-final"
            nlp.to_disk(final_model_path)
        msg.good("Saved model to output directory", final_model_path)
        with msg.loading("Creating best model..."):
            best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
        msg.good("Created best model", best_model_path)
Example #35
0
def validate():
    """
    Validate that the currently installed version of spaCy is compatible
    with the installed models. Should be run after `pip install -U spacy`.
    """
    msg = Printer()
    with msg.loading("Loading compatibility table..."):
        r = requests.get(about.__compatibility__)
        if r.status_code != 200:
            msg.fail(
                "Server error ({})".format(r.status_code),
                "Couldn't fetch compatibility table.",
                exits=1,
            )
    msg.good("Loaded compatibility table")
    compat = r.json()["spacy"]
    version = about.__version__
    version = version.rsplit(".dev", 1)[0]
    current_compat = compat.get(version)
    if not current_compat:
        msg.fail(
            "Can't find spaCy v{} in compatibility table".format(version),
            about.__compatibility__,
            exits=1,
        )
    all_models = set()
    for spacy_v, models in dict(compat).items():
        all_models.update(models.keys())
        for model, model_vs in models.items():
            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
    model_links = get_model_links(current_compat)
    model_pkgs = get_model_pkgs(current_compat, all_models)
    incompat_links = {l for l, d in model_links.items() if not d["compat"]}
    incompat_models = {d["name"] for _, d in model_pkgs.items() if not d["compat"]}
    incompat_models.update(
        [d["name"] for _, d in model_links.items() if not d["compat"]]
    )
    na_models = [m for m in incompat_models if m not in current_compat]
    update_models = [m for m in incompat_models if m in current_compat]
    spacy_dir = Path(__file__).parent.parent

    msg.divider("Installed models (spaCy v{})".format(about.__version__))
    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))

    if model_links or model_pkgs:
        header = ("TYPE", "NAME", "MODEL", "VERSION", "")
        rows = []
        for name, data in model_pkgs.items():
            rows.append(get_model_row(current_compat, name, data, msg))
        for name, data in model_links.items():
            rows.append(get_model_row(current_compat, name, data, msg, "link"))
        msg.table(rows, header=header)
    else:
        msg.text("No models found in your current environment.", exits=0)
    if update_models:
        msg.divider("Install updates")
        msg.text("Use the following commands to update the model packages:")
        cmd = "python -m spacy download {}"
        print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
    if na_models:
        msg.text(
            "The following models are not available for spaCy "
            "v{}: {}".format(about.__version__, ", ".join(na_models))
        )
    if incompat_links:
        msg.text(
            "You may also want to overwrite the incompatible links using the "
            "`python -m spacy link` command with `--force`, or remove them "
            "from the data directory. "
            "Data path: {path}".format(path=path2str(get_data_path()))
        )
    if incompat_models or incompat_links:
        sys.exit(1)
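
To make the lookup in validate() above concrete, here is a minimal sketch of the compatibility check against the table it downloads. The dictionary follows the shape used in the code above (spaCy version, then model name, then a list of compatible model versions), but the data and the `is_compatible` helper are made up for illustration:

compat = {
    "2.1.0": {"en_core_web_sm": ["2.1.0"], "de_core_news_sm": ["2.1.0"]},
    "2.0.18": {"en_core_web_sm": ["2.0.0"]},
}  # made-up data in the shape of compat["spacy"] from the real table

def is_compatible(spacy_version, model_name, model_version):
    """True if the installed model version is listed for this spaCy version."""
    current_compat = compat.get(spacy_version, {})
    return model_version in current_compat.get(model_name, [])

print(is_compatible("2.1.0", "en_core_web_sm", "2.1.0"))  # True
print(is_compatible("2.1.0", "en_core_web_sm", "2.0.0"))  # False -> update needed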
Example #36
0
def pretrain(
    texts_loc,
    vectors_model,
    output_dir,
    width=96,
    depth=4,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
    dropout=0.2,
    n_iter=1000,
    batch_size=3000,
    max_length=500,
    min_length=5,
    seed=0,
    n_save_every=None,
):
    """
    Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
    using an approximate language-modelling objective. Specifically, we load
    pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
    vectors which match the pre-trained ones. The weights are saved to a directory
    after each epoch. You can then pass a path to one of these pre-trained weights
    files to the 'spacy train' command.

    This technique may be especially helpful if you have little labelled data.
    However, it's still quite experimental, so your mileage may vary.

    To load the weights back in during 'spacy train', you need to ensure
    all settings are the same between pretraining and training. The API and
    errors around this need some improvement.
    """
    config = dict(locals())
    msg = Printer()
    util.fix_random_seed(seed)

    has_gpu = prefer_gpu()
    msg.info("Using GPU" if has_gpu else "Not using GPU")

    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
        msg.good("Created output directory")
    srsly.write_json(output_dir / "config.json", config)
    msg.good("Saved settings to config.json")

    # Load texts from file or stdin
    if texts_loc != "-":  # reading from a file
        texts_loc = Path(texts_loc)
        if not texts_loc.exists():
            msg.fail("Input text file doesn't exist", texts_loc, exits=1)
        with msg.loading("Loading input texts..."):
            texts = list(srsly.read_jsonl(texts_loc))
        msg.good("Loaded input texts")
        random.shuffle(texts)
    else:  # reading from stdin
        msg.text("Reading input text from stdin...")
        texts = srsly.read_jsonl("-")

    with msg.loading("Loading model '{}'...".format(vectors_model)):
        nlp = util.load_model(vectors_model)
    msg.good("Loaded model '{}'".format(vectors_model))
    pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
    model = create_pretraining_model(
        nlp,
        Tok2Vec(
            width,
            embed_rows,
            conv_depth=depth,
            pretrained_vectors=pretrained_vectors,
            bilstm_depth=0,  # Requires PyTorch. Experimental.
            cnn_maxout_pieces=3,  # You can try setting this higher
            subword_features=True,  # Set to False for Chinese etc
        ),
    )
    optimizer = create_default_optimizer(model.ops)
    tracker = ProgressTracker(frequency=10000)
    msg.divider("Pre-training tok2vec layer")
    row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
    msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)

    def _save_model(epoch, is_temp=False):
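        # Write the current (averaged) tok2vec weights to disk and append an
        # entry to log.jsonl. Called at the end of every epoch, and mid-epoch
        # with is_temp=True when n_save_every is set.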
        is_temp_str = ".temp" if is_temp else ""
        with model.use_params(optimizer.averages):
            with (output_dir / ("model%d%s.bin" % (epoch, is_temp_str))).open(
                "wb"
            ) as file_:
                file_.write(model.tok2vec.to_bytes())
            log = {
                "nr_word": tracker.nr_word,
                "loss": tracker.loss,
                "epoch_loss": tracker.epoch_loss,
                "epoch": epoch,
            }
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

    for epoch in range(n_iter):
        for batch_id, batch in enumerate(
            util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
        ):
            docs = make_docs(
                nlp,
                [text for (text, _) in batch],
                max_length=max_length,
                min_length=min_length,
            )
            loss = make_update(
                model, docs, optimizer, objective=loss_func, drop=dropout
            )
            progress = tracker.update(epoch, loss, docs)
            if progress:
                msg.row(progress, **row_settings)
                if texts_loc == "-" and tracker.words_per_epoch[epoch] >= 10 ** 7:
                    break
            if n_save_every and (batch_id % n_save_every == 0):
                _save_model(epoch, is_temp=True)
        _save_model(epoch)
        tracker.epoch_loss = 0.0
        if texts_loc != "-":
            # Reshuffle the texts if texts were loaded from a file
            random.shuffle(texts)
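
The docstring above describes the objective loosely: the tok2vec layer is trained to predict vectors that match the pre-trained ones. Below is a minimal NumPy sketch of a cosine-style loss in that spirit. It is an illustration rather than the Thinc code spaCy uses; `cosine_loss` is a made-up helper, and the shapes (4 tokens, width 96) simply echo the defaults in the signature above:

import numpy as np

def cosine_loss(predicted, target, eps=1e-8):
    """Mean (1 - cosine similarity) between predicted and target token vectors."""
    p = predicted / (np.linalg.norm(predicted, axis=1, keepdims=True) + eps)
    t = target / (np.linalg.norm(target, axis=1, keepdims=True) + eps)
    return float(np.mean(1.0 - np.sum(p * t, axis=1)))

pred = np.random.rand(4, 96)  # 4 tokens, width=96 as in the defaults above
gold = np.random.rand(4, 96)  # stand-in for the pre-trained vectors
print(cosine_loss(pred, gold))
print(cosine_loss(gold, gold))  # ~0.0 when predictions match the vectors exactly

The model%d.bin files written by _save_model can then be passed to the training command through the init_tok2vec path used in the training example earlier in this listing.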
Example #37
0
File: package.py Project: spacy-io/spaCy
def package(input_dir, output_dir, meta_path=None, create_meta=False, force=False):
    """
    Generate Python package for model data, including meta and required
    installation files. A new directory will be created in the specified
    output directory, and model data will be copied over. If --create-meta is
    set and a meta.json already exists in the output directory, the existing
    values will be used as the defaults in the command-line prompt.
    """
    msg = Printer()
    input_path = util.ensure_path(input_dir)
    output_path = util.ensure_path(output_dir)
    meta_path = util.ensure_path(meta_path)
    if not input_path or not input_path.exists():
        msg.fail("Can't locate model data", input_path, exits=1)
    if not output_path or not output_path.exists():
        msg.fail("Output directory not found", output_path, exits=1)
    if meta_path and not meta_path.exists():
        msg.fail("Can't find model meta.json", meta_path, exits=1)

    meta_path = meta_path or input_path / "meta.json"
    if meta_path.is_file():
        meta = srsly.read_json(meta_path)
        if not create_meta:  # only print if user doesn't want to overwrite
            msg.good("Loaded meta.json from file", meta_path)
        else:
            meta = generate_meta(input_dir, meta, msg)
    for key in ("lang", "name", "version"):
        if key not in meta or meta[key] == "":
            msg.fail(
                "No '{}' setting found in meta.json".format(key),
                "This setting is required to build your package.",
                exits=1,
            )
    model_name = meta["lang"] + "_" + meta["name"]
    model_name_v = model_name + "-" + meta["version"]
    main_path = output_path / model_name_v
    package_path = main_path / model_name

    if package_path.exists():
        if force:
            shutil.rmtree(path2str(package_path))
        else:
            msg.fail(
                "Package directory already exists",
                "Please delete the directory and try again, or use the "
                "`--force` flag to overwrite existing directories. "
                "Path: {path}".format(path=path2str(package_path)),
                exits=1,
            )
    Path.mkdir(package_path, parents=True)
    shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
    create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
    create_file(main_path / "setup.py", TEMPLATE_SETUP)
    create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
    create_file(package_path / "__init__.py", TEMPLATE_INIT)
    msg.good("Successfully created package '{}'".format(model_name_v), main_path)
    msg.text("To build the package, run `python setup.py sdist` in this directory.")
Example #38
0
# coding: utf8
from __future__ import print_function

# NB! This breaks in plac on Python 2!!
# from __future__ import unicode_literals

if __name__ == "__main__":
    import plac
    import sys
    from wasabi import Printer
    from spacy.cli import download, link, info, package, train, pretrain, convert
    from spacy.cli import init_model, profile, evaluate, validate, debug_data

    msg = Printer()

    commands = {
        "download": download,
        "link": link,
        "info": info,
        "train": train,
        "pretrain": pretrain,
        "debug-data": debug_data,
        "evaluate": evaluate,
        "convert": convert,
        "package": package,
        "init-model": init_model,
        "profile": profile,
        "validate": validate,
    }
    if len(sys.argv) == 1:
        msg.info("Available commands", ", ".join(commands), exits=1)