示例#1
0
def diff():
    """Git-like diff tool comparing tokenizer output with the reference sentences.

    Loads the raw corpus and the sentence corpus, wraps every raw document in
    ``Doc`` and, for each document whose prediction differs from the reference
    ``'sentences'`` list, prints:

    * ``Document <path>`` with the corpus file path,
    * ``+ <sentence>`` (green) for every reference sentence that is not in the
      prediction,
    * ``- <sentence>`` (red) for every predicted sentence that is not in the
      reference,

    followed by blank separator lines. Documents whose prediction equals the
    reference produce no output at all.
    """
    click.secho("Loading corpus...")
    raw, sents = load_raw_corpus(False), load_sentence_corpus()

    y_true = [doc['sentences'] for doc in sents]
    y_pred = [Doc(doc) for doc in raw]
    paths = file_paths()

    # Walk the three parallel sequences in lockstep rather than by index;
    # iteration stops at the shortest of them.
    # NOTE(review): the `!=` and `in` tests below rely on how Doc compares
    # against plain lists/strings - confirm Doc supports these comparisons.
    for true_sents, pred_doc, path in zip(y_true, y_pred, paths):
        if true_sents != pred_doc:
            click.secho(f"Document {path}")

            # Reference sentences the tokenizer failed to produce.
            for s_true in true_sents:
                if s_true not in pred_doc:
                    click.secho(f"+ {s_true}", fg="green")

            click.secho()

            # Predicted sentences that do not exist in the reference.
            for s_pred in pred_doc:
                if s_pred not in true_sents:
                    click.secho(f"- {s_pred}", fg="red")

            # Separator lines belong to the differing document only, so
            # matching documents no longer emit stray blank lines.
            click.secho()
            click.secho()
示例#2
0
def build():
    """Build a ML based SBD.

    Steps performed:

    1. Load the raw corpus and the sentence corpus.
    2. Build one ``Doc`` per raw document and collect ``span_features()`` for
       every span, together with the ``is_eos`` target computed against the
       matching document's ``'sentences'``.
    3. Cross-validate the model pipeline with F1 scoring and print each fold.
    4. Fit the model on all data, print the most important features and save
       the model via ``save_model``.

    Raises:
        Exception: If the number of feature rows differs from the number of
            targets (e.g. the two corpora do not line up).
    """

    raw_corpus = load_raw_corpus(False)
    sent_corpus = load_sentence_corpus(False)

    # Build each Doc once and reuse it for both features and targets instead
    # of constructing it twice per raw document.
    docs = [Doc(raw) for raw in raw_corpus]

    features = flatten([[span.span_features() for span in doc.spans]
                        for doc in docs])
    # Targets are paired through zip, so a shorter sentence corpus yields fewer
    # targets than features and is caught by the sanity check below.
    y = flatten([[is_eos(span, sent['sentences']) for span in doc.spans]
                 for doc, sent in zip(docs, sent_corpus)])

    if len(features) != len(y):
        raise Exception(
            f"Sanity check failed feature list length {len(features)} whereas target list length {len(y)}."
        )

    sbd_model = create_model()

    scores = cross_val_score(sbd_model.pipeline, features, y, scoring="f1")

    for i, score in enumerate(scores):
        click.secho(f"Fold {i + 1}: {score:.4f}", fg="yellow")

    sbd_model.fit(features, y)

    # Single source of truth for how many features are listed, so the heading
    # can no longer disagree with the number of rows printed.
    top_n = 20

    click.secho(f"\nTop {top_n} Features")
    # The second pipeline step exposes the importances, the first one the
    # feature names they index into.
    feature_importance = sbd_model.pipeline.steps[1][1].feature_importances_
    feature_names = sbd_model.pipeline.steps[0][1].feature_names_

    # argsort is ascending; reverse it to list the most important first.
    for idx in feature_importance.argsort()[::-1][:top_n]:
        click.secho(
            f"    {feature_names[idx]}: {feature_importance[idx]:.4f}",
            fg="yellow")

    save_model(sbd_model)
示例#3
0
def diff(verbose):
    """Git-like diff tool comparing tokenizer output with the reference sentences.

    For every corpus document the predicted sentences (``str`` of each item
    yielded by ``Doc``) are compared with the reference ``'sentences'`` list
    through ``Differ.compare``. Each document with at least one difference is
    printed as its file path, a two-column table and the reference sentence
    count. Sentences found only in the reference go to the "true" column,
    sentences found only in the prediction go to the "predict" column.

    Args:
        verbose: When greater than 0, every run of matching sentences is
            summarised in the table as a ``<n> sentences...`` row.
    """
    click.secho("Loading corpus...")
    raw, sents = load_raw_corpus(False), load_sentence_corpus()

    y_true = [doc['sentences'] for doc in sents]

    y_pred = [[str(s) for s in Doc(doc)] for doc in raw]

    paths = file_paths()

    differ = Differ()

    for t, p, f in zip(y_true, y_pred, paths):

        table = Table(show_header=True,
                      header_style="bold magenta",
                      show_edge=False)
        table.add_column("true", style="dim", width=100)
        table.add_column("predict", style="dim", width=100)

        table.columns[0].style = "green"
        table.columns[1].style = "red"

        ndiff = 0
        match = 0
        for sent in differ.compare(p, t):
            # '? ' lines are intraline hints emitted by Differ; they are not
            # sentences of either input and must not be counted as matches.
            if sent.startswith('? '):
                continue

            only_in_true = sent.startswith('+')
            only_in_pred = sent.startswith('-')

            if not (only_in_true or only_in_pred):
                # Sentence present in both prediction and reference.
                match += 1
                continue

            # Flush the pending run of matching sentences before the
            # differing row so the table keeps the original ordering.
            if match > 0 and verbose > 0:
                summary = f"[blue]{match} sentences...[/blue]"
                table.add_row(summary, summary)
                match = 0

            # sent[2:] strips the two-character Differ prefix.
            if only_in_true:
                table.add_row(sent[2:], "")
            else:
                table.add_row("", sent[2:])
            ndiff += 1

        # Trailing run of matching sentences after the last difference.
        if match > 0 and verbose > 0:
            summary = f"[blue]{match} sentences...[/blue]"
            table.add_row(summary, summary)

        # Only documents that actually differ are reported.
        if ndiff > 0:
            console.print(f)
            console.print(table)
            console.print(f"[blue]{len(t)} sentences...[/blue]")
            console.print()
示例#4
0
def evaluate(v):
    """Evaluate IoU metric for different SBD algorithms over our stock dataset.

    Three sentence splitters are scored in turn against the reference
    ``'sentences'`` of the sentence corpus: ``NLTKPunctTokenizer``,
    ``RegexpSentenceTokenizer`` and the ``Doc``-based splitter (reported as
    ``MLBasedTokenizer``).

    Args:
        v: When greater than 0, the corpus file paths are handed to
            ``iou_eval``; otherwise ``None`` is passed in their place.
    """
    click.secho("Loading corpus...")
    raw, sents = load_raw_corpus(False), load_sentence_corpus()

    punkt_splitter = NLTKPunctTokenizer()
    regexp_splitter = RegexpSentenceTokenizer()

    def doc_splitter(text):
        # Sentence texts as produced by the Doc pipeline.
        return [s.text for s in Doc(text).sents]

    reference = [doc['sentences'] for doc in sents]

    # Evaluation order matches the reporting order of the command.
    candidates = (
        ("NLTKPunctTokenizer", punkt_splitter),
        ("RegexpSentenceTokenizer", regexp_splitter),
        ("MLBasedTokenizer", doc_splitter),
    )

    for label, split in candidates:
        predicted = [split(doc) for doc in raw]

        if v > 0:
            paths = file_paths()
        else:
            paths = None

        iou_eval(label, reference, predicted, paths)
示例#5
0
def validate(v, base_path):
    """Run consistency checks over the raw, sentence and annotated corpora.

    Checks, in order:

    1. Print the document count of each corpus. When the annotated and
       sentence corpora differ in size, list the symmetric difference of their
       file names and print a warning (execution continues).
    2. Span check: every reference sentence must be a substring of its raw
       document.
    3. Span order check: reference sentences must occur in the raw document in
       the order they are listed, each one after the end of the previous one.
    4. Every annotated document must carry the same ``'sentences'`` as the
       sentence-corpus document with the same file name.

    Checks 2-4 report the first offending item and terminate the process with
    ``sys.exit(1)``.

    Args:
        v: Not used by this function.
        base_path: Forwarded to the corpus loaders and to ``file_paths``.
    """
    from sadedegel.dataset import load_raw_corpus, load_sentence_corpus, load_annotated_corpus, file_paths, \
        CorpusTypeEnum

    click.secho("Corpus loading...")
    raw = load_raw_corpus(False, base_path)
    sents = load_sentence_corpus(False, base_path)
    anno = load_annotated_corpus(False, base_path)

    click.secho(".done.", fg="green")
    click.secho(f"Number of News Documents (raw): {len(raw)}".rjust(50))
    click.secho(
        f"Number of News Documents (sentences): {len(sents)}".rjust(50))
    click.secho(f"Number of News Documents (annotated): {len(anno)}".rjust(50))

    if len(anno) != len(sents):
        anno_files = file_paths(CorpusTypeEnum.ANNOTATED, True, True,
                                base_path)
        sent_files = file_paths(CorpusTypeEnum.SENTENCE, True, True, base_path)

        click.secho(
            "\nSymmetric Difference between sentences & annotated corpus.")

        # Names present in exactly one of the two corpora.
        for name in set(anno_files).symmetric_difference(set(sent_files)):
            click.secho(f"{name}".rjust(50))
        click.secho(".warn", fg="yellow")

    # File names reported on failure must come from the same base_path the
    # corpora were loaded from; listed once and reused by both span checks.
    raw_files = file_paths(base_path=base_path)

    click.secho("\nPerforming span checks...")

    for raw_doc, sent_doc, file in zip(raw, sents, raw_files):
        for i, sent in enumerate(sent_doc['sentences']):
            if sent not in raw_doc:
                logger.error(
                    f"""{sent}[{i}] \n\t\t is not a span in raw document \n {raw_doc} \n\n Corpus file: {file}
                """)
                sys.exit(1)

    click.secho(".done", fg="green")

    click.secho("\nPerforming span order checks...")

    for raw_doc, sent_doc, file in zip(raw, sents, raw_files):

        start = 0
        for i, sent in enumerate(sent_doc['sentences']):

            idx = raw_doc.find(sent, start)

            if idx == -1:
                logger.error(
                    f"""{sent}[{i}] \n\t\t is potential our of order in "sentences" array of sentence corpus\n {raw_doc} \n\n Corpus file: {file}
                    """)
                sys.exit(1)
            else:
                # Continue searching right after this match. Advancing from
                # the match position (not from the previous cursor) accounts
                # for text between sentences, so a sentence that appears
                # earlier in the document is correctly reported.
                start = idx + len(sent)

    click.secho(".done", fg="green")

    click.secho("\nComparing annotated corpus with sentences corpus...")

    anno_names = file_paths(CorpusTypeEnum.ANNOTATED,
                            noext=True,
                            use_basename=True,
                            base_path=base_path)
    sents_names = file_paths(CorpusTypeEnum.SENTENCE,
                             noext=True,
                             use_basename=True,
                             base_path=base_path)

    # Index both corpora by extension-less base file name.
    anno_dict = dict(zip(anno_names, anno))
    sents_dict = dict(zip(sents_names, sents))

    match = 0

    for _name, _anno in anno_dict.items():
        sent = sents_dict[_name]

        if sent['sentences'] != _anno['sentences']:
            click.secho(
                f"\nSentences in annotated corpus {_name} doesn't match with document in sentence corpus."
            )
            sys.exit(1)
        else:
            match += 1

    click.secho(f".done ({match}/{len(anno_dict)})", fg="green")