def adapt(markup):
    """Return a new Markup with the same text and an adapted span list.

    The spans are passed through four helpers in a fixed order, and every
    intermediate result is materialized as a list before the next step:
    split_overlapping_spans, strip_spans (with the QUOTES + BRACKETS +
    DASHES + SPACES character set), filter_empty_spans and adapt_spans
    (with TYPES).

    NOTE(review): the helpers are defined outside this function; judging by
    their names they split overlapping spans, trim the listed characters
    from span edges and drop spans that end up empty -- confirm against
    their definitions.
    """
    source = list(markup.spans)
    text = markup.text
    trim_chars = QUOTES + BRACKETS + DASHES + SPACES

    split = list(split_overlapping_spans(source))
    trimmed = list(strip_spans(split, text, trim_chars))
    non_empty = list(filter_empty_spans(trimmed))
    adapted = list(adapt_spans(non_empty, text, TYPES))
    return Markup(text, adapted)
def adapt(markup):
    """Return a new Markup whose spans are stripped and then adapted.

    The spans go through strip_spans with the QUOTES + DOT + SPACES
    character set and then through adapt_spans with TYPES; each result is
    materialized as a list.
    """
    # Sample annotations motivating the strip step (sample texts are kept
    # verbatim; the dashes presumably underline the annotated span, which
    # ends with a quote and/or a dot):
    #
    #   Чувашской Республики".
    #   ----------------------
    #
    #   год Чарльза Дарвина»
    #       ----------------
    text = markup.text
    stripped = list(strip_spans(markup.spans, text, QUOTES + DOT + SPACES))
    return Markup(text, list(adapt_spans(stripped, text, TYPES)))
def adapt(markup):
    """Return a new Markup whose spans are stripped twice and then adapted.

    The spans are chained through strip_spans (DOT + SPACES),
    strip_spans_bounds (QUOTES + SPACES) and adapt_spans (TYPES) without
    materializing the intermediate results; the final result is turned
    into a list when the Markup is built.

    NOTE(review): how strip_spans_bounds differs from strip_spans is not
    visible here -- confirm against their definitions.
    """
    # The annotations contain extra spaces and dots inside spans
    # (sample texts are kept verbatim, dashes underline the span):
    #
    #   News Corp .
    #   -----------
    #
    #   « Русал »
    #   ---------
    text = markup.text
    dot_stripped = strip_spans(markup.spans, text, DOT + SPACES)
    bound_stripped = strip_spans_bounds(dot_stripped, text, QUOTES + SPACES)
    adapted = adapt_spans(bound_stripped, text, TYPES)
    return Markup(text, list(adapted))
def adapt_spans(spans, text, types):
    """Select, convert and clean up spans, returning them as a list.

    Steps, in order: select_type_spans and convert_span_types (both driven
    by ``types``), strip_spans with QUOTES, filter_misaligned_spans against
    the tokens produced by tokenize(text), and filter_overlapping.

    Args:
        spans: iterable of spans to adapt.
        text: the text the spans refer to.
        types: passed to select_type_spans and convert_span_types.

    Returns:
        A list with the spans that remain after all steps.
    """
    typed = convert_span_types(select_type_spans(spans, types), types)

    # Seen in mitie and sometimes in deeppavlov.
    unquoted = list(strip_spans(typed, text, QUOTES))

    # Cases listed for the token-alignment filter (sample texts verbatim).
    # ne5 typos in span.stop:
    #   Magna Internationa -> Magna International
    #   Горсове -> Горсовет
    # Tokenizer errors:
    #   поезд Москва-Баку
    #   Yahoo!.
    tokens = list(tokenize(text))
    aligned = list(filter_misaligned_spans(unquoted, tokens))

    # ne5 bug, two annotated spans overlap:
    #   Бражский район Подмосковья
    #   --------------
    #            -----------------
    return list(filter_overlapping(aligned))
def adapt(markup):
    """Return a new Markup whose spans are stripped and then adapted.

    The spans are fed through strip_spans with the QUOTES + SPACES + DOT
    character set and then through adapt_spans with TYPES. The intermediate
    result is not materialized; only the final one is turned into a list.
    """
    text = markup.text
    trim_chars = QUOTES + SPACES + DOT
    adapted = adapt_spans(
        strip_spans(markup.spans, text, trim_chars),
        text,
        TYPES,
    )
    return Markup(text, list(adapted))
def adapt(markup):
    """Return a new Markup whose spans are stripped and then adapted.

    The spans go through strip_spans with the QUOTES + SPACES character set
    and then through adapt_spans with TYPES; each result is materialized as
    a list before it is used.
    """
    text = markup.text
    stripped = list(strip_spans(markup.spans, text, QUOTES + SPACES))
    adapted = list(adapt_spans(stripped, text, TYPES))
    return Markup(text, adapted)
def adapt_spans(spans, text):
    """Pre-process spans and hand them to the generic adapter.

    Steps, in order, each materialized as a list: adapt_overlapping_spans,
    strip_spans with the QUOTES + BRACKETS + DASHES character set, and
    filter_empty_spans. The resulting list is then passed to adapt_spans_
    together with ``text`` and TYPES.

    NOTE(review): adapt_spans_ is not defined in the visible code;
    presumably it is the generic three-argument span adapter imported
    under an alias -- confirm against the import block.

    Args:
        spans: iterable of spans to adapt.
        text: the text the spans refer to.

    Returns:
        Whatever adapt_spans_ returns for the pre-processed spans.
    """
    resolved = list(adapt_overlapping_spans(spans, text))
    trimmed = list(strip_spans(resolved, text, QUOTES + BRACKETS + DASHES))
    non_empty = list(filter_empty_spans(trimmed))
    # non_empty is a fresh list owned by this function and not used again,
    # so it is passed on directly instead of being copied a second time.
    return adapt_spans_(non_empty, text, TYPES)
def adapt_overlapping_spans(spans, text):
    """Split overlapping spans, strip them with SPACES and filter empties.

    The spans are chained through split_overlapping_spans, strip_spans
    (with the SPACES character set) and filter_empty_spans. Nothing is
    materialized here: the return value is whatever filter_empty_spans
    returns.

    Args:
        spans: iterable of spans to process.
        text: the text the spans refer to.
    """
    pieces = split_overlapping_spans(spans)
    return filter_empty_spans(strip_spans(pieces, text, SPACES))