Example 1
from typing import Callable, List, Optional, Tuple

import numpy as np
from nltk import tokenize


# Corpus and preprocess_only_words are assumed to be provided by the enclosing
# module (a corpus container and a helper that keeps only word tokens).
def lix(corpus: Corpus, _: str,
        callback: Callable) -> Optional[Tuple[np.ndarray, List[str]]]:
    """
    Readability index LIX
    https://en.wikipedia.org/wiki/Lix_(readability_test)
    """
    corpus = preprocess_only_words(corpus)
    tokenizer = tokenize.PunktSentenceTokenizer()

    def lix_index(document, tokens):
        callback()
        # if the text is a single sentence, scores will be high
        sentences = len(tokenizer.tokenize(document))
        words = len(tokens)
        long_words = len([token for token in tokens if len(token) > 6])
        try:
            return words / sentences + (long_words * 100 / words)
        except ZeroDivisionError:
            return 0

    return (
        np.c_[[
            lix_index(d, tokens)
            for d, tokens in zip(corpus.documents, corpus.tokens)
        ]],
        ["LIX index"],
    )
Example 2
class PunktSentenceTokenizer(BaseTokenizer):
    """ 根据句子分词. This example. Another example. → (This example.), (Another example.) """
    tokenizer = tokenize.PunktSentenceTokenizer()
    name = '句子'  # display name; '句子' means "Sentence"

    @wait_nltk_data
    def __init__(self):
        super().__init__()
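The behaviour described in the docstring can be reproduced directly with NLTK,
independent of the BaseTokenizer wrapper (a minimal sketch):

from nltk import tokenize

tokenizer = tokenize.PunktSentenceTokenizer()
print(tokenizer.tokenize("This example. Another example."))
# ['This example.', 'Another example.']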
Example 3
class PunktSentenceTokenizer(BaseTokenizer):
    """ Split by full-stop, keeping entire sentences. """
    tokenizer = tokenize.PunktSentenceTokenizer()
    name = 'Sentence'

    @wait_nltk_data  # project decorator; presumably waits for the required NLTK data before init
    def __init__(self):
        super().__init__()
Example 4
def tokenization(sample_text):
    # train_text is assumed to be a raw-text training corpus defined elsewhere in
    # the module; PunktSentenceTokenizer learns sentence boundaries from it.
    cust_tokenizer = tokenize.PunktSentenceTokenizer(train_text)
    # filter() is assumed to be a project-specific cleaning helper defined
    # elsewhere (note that it shadows the built-in filter).
    text = filter(sample_text)
    # Tokenize the cleaned text into sentences.
    return cust_tokenizer.tokenize(text)
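A self-contained variant, with an illustrative training string standing in for
the undefined train_text and the custom filter step dropped (Punkt's
unsupervised training only becomes reliable on a large corpus):

from nltk import tokenize

train_text = ("Dr. Smith visited the U.S. last year. He gave several talks. "
              "Prof. Jones joined him for the final talk.")

custom_tokenizer = tokenize.PunktSentenceTokenizer(train_text)
print(custom_tokenizer.tokenize("The workshop went well. Attendance was high."))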
Example 5
class PunktSentenceTokenizer(BaseTokenizer):
    """ Split by full-stop, keeping entire sentences. """
    tokenizer = tokenize.PunktSentenceTokenizer()
    name = 'Sentence'
Example 6
import re
from typing import List, Tuple, Dict, Set, Pattern

from nltk import tokenize

from bionorm.common.models import SpeciesMention
from bionorm.normalizers.gene.GNormPlus.models import (
    GNormPaper, GNormSpeciesAnnotation, SpeciesAnnotationPlacement, GNormGeneMention,
    GNormPassage,
)

HUMAN_ID = '9606'
TaxonomyFrequency = Dict[str, float]
HumanViruses = Set[str]
GeneWithoutSpPrefix = Set[str]
PrefixMap = Dict[str, Pattern[str]]

SENTENCE_TOKENIZER = tokenize.PunktSentenceTokenizer()  # shared, module-level sentence splitter


def assign_species(paper: GNormPaper, taxonomy_frequency: TaxonomyFrequency, human_viruses: HumanViruses,
                   gene_without_sp_prefix: GeneWithoutSpPrefix, prefix_map: PrefixMap):
    # Tally species mentions across passages, weighting mentions found in the
    # title twice as heavily as mentions elsewhere.
    species_to_num_hash: Dict[str, float] = {}
    for passage in paper.passages:  # type: GNormPassage
        for species in passage.species:  # type: SpeciesMention
            if species.id is None:
                continue
            ID = species.id
            weight = 1.0
            if passage.name == 'title':
                weight = 2.0

            if ID in species_to_num_hash:
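The excerpt stops mid-loop here. For illustration, the title-vs-body weighting
it sets up can be shown in isolation, with plain tuples and dicts standing in
for the GNorm model classes (the helper name and sample data below are made up):

from typing import Dict, List, Tuple

def count_weighted_species(passages: List[Tuple[str, List[str]]]) -> Dict[str, float]:
    # Each passage is (passage_name, [species ids]); mentions in the title
    # count double, mirroring the weights chosen in assign_species above.
    counts: Dict[str, float] = {}
    for name, species_ids in passages:
        weight = 2.0 if name == 'title' else 1.0
        for species_id in species_ids:
            counts[species_id] = counts.get(species_id, 0.0) + weight
    return counts

print(count_weighted_species([('title', ['9606']), ('abstract', ['9606', '10090'])]))
# {'9606': 3.0, '10090': 1.0}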