示例#1
0
def test_custom_handler():
    """A handler registered for the "HI" norm decides what ``findall`` returns.

    The handler receives the searched text plus the match offsets and returns
    a ``(start, stop, value)`` triple; here the value is the matched slice
    with " is OK" appended.
    """
    def append_ok(text, start, stop, norm):
        matched = text[start:stop]
        return start, stop, matched + " is OK"

    handler_spec = ("HI", True, append_ok)
    searcher = TextSearch("ignore", "norm", handlers=[handler_spec])
    searcher.add("hi", "HI")

    found = searcher.findall("hi HI")
    assert found == ['hi is OK', 'HI is OK']
示例#2
0
 def __init__(self):
     """Load the JSON dictionary from disk and index it in a TextSearch.

     The parsed JSON is kept on ``self.contractdict`` and the search object
     on ``self.searching``. The path is relative to the current working
     directory.
     """
     dict_path = 'svo_extraction/contract_dict.json'
     with open(dict_path, mode="r", encoding="utf-8") as handle:
         self.contractdict = json.load(handle)

     # Case-ignoring search that yields the mapped value ("norm") for a hit.
     self.searching = searcher = TextSearch("ignore", "norm")
     searcher.add(self.contractdict)
示例#3
0
class ContractText():
    """Rewrite text using the mapping stored in ``contract_dict.json``.

    NOTE(review): the names suggest the JSON maps contractions to their
    expanded forms; the file's content is not visible here, so confirm.
    """

    def __init__(self):
        """Read the JSON mapping and register it with a TextSearch instance."""
        dict_path = 'svo_extraction/contract_dict.json'
        with open(dict_path, mode="r", encoding="utf-8") as handle:
            self.contractdict = json.load(handle)

        # Case-ignoring search that yields the mapped value ("norm") for a hit.
        self.searching = searcher = TextSearch("ignore", "norm")
        searcher.add(self.contractdict)

    def uncontract(self, text: str):
        """Return ``text`` with every registered key replaced by its value."""
        replaced = self.searching.replace(text)
        return replaced
示例#4
0
def ts_replacer(a, b):
    """Build a case-sensitive TextSearch mapping spelling variants of ``a`` to ``b``.

    ``a`` and ``b`` are passed through ``normalize`` (expected to yield a
    sequence of words, e.g. ["some", "thing"] / ["another", "thing"]). For
    each candidate separator the halftitle, camelCase, UPPER, Title and
    lower renderings of ``a`` are registered with the matching rendering of
    ``b``. Finally the raw pair ``(a, b)`` is registered as given.

    Returns the populated ``TextSearch``.
    """
    # lowercase letters before are allowed, how about a second one...
    # ts = TextSearch("sensitive", "norm", ALPHANUM - ALPHA_LOWER, ALPHANUM)
    ts = TextSearch("sensitive", "norm", BOUNDS, BOUNDS)

    # First separator (in this fixed order) that occurs in b, or "" if none.
    found_sep_in_b = next(
        (candidate for candidate in (" ", "-", "_", ".") if candidate in b), "")

    src_words = normalize(a)
    dst_words = normalize(b)

    # questions like:
    # - prefer camelCase for word2 when word1 is lowercase
    # - prefer Halftitle, PascalCase/Titlecase for word2 when word1 is titlecase
    # etc
    # below... lower order means higher prio
    for sep in (".", "_", "-", found_sep_in_b, ""):
        # halftitle: only the very first character is title-cased
        half_src = src_words[0][0].title() + sep.join(src_words)[1:]
        half_dst = dst_words[0][0].title() + sep.join(dst_words)[1:]
        ts.add(half_src, half_dst)

        # camelCase: first word lowered, remaining words title-cased
        camel_src = sep.join(
            [src_words[0].lower()] + [w.title() for w in src_words[1:]])
        camel_dst = sep.join(
            [dst_words[0].lower()] + [w.title() for w in dst_words[1:]])
        ts.add(camel_src, camel_dst)

        # easy cases: the same casing function applied to every word
        for convert in (str.upper, str.title, str.lower):
            cased_src = sep.join([convert(w) for w in src_words])
            cased_dst = sep.join([convert(w) for w in dst_words])
            ts.add(cased_src, cased_dst)

    # ts.add("SomeThing", "AnotherThing")
    ts.add(a, b)

    return ts
示例#5
0
def test_foreign_chars():
    """With ``replace_foreign_chars=True`` "a" and "á" are interchangeable.

    A single rule "á" -> "A" must be reachable through both spellings for
    membership, ``contains``, ``findall``, ``find_overlapping`` and
    ``replace``.
    """
    searcher = TextSearch("ignore", "norm", replace_foreign_chars=True)
    searcher.add("á", "A")

    spellings = ("a", "á")
    for spelling in spellings:
        assert spelling in searcher
    for spelling in spellings:
        assert searcher.contains(spelling)
    for spelling in spellings:
        assert searcher.findall(spelling)
    for spelling in spellings:
        assert searcher.find_overlapping(spelling)
    for spelling in spellings:
        assert searcher.replace(spelling) == "A"
示例#6
0
 def setup(self):
     """Create ``self.tokenizer`` and register rules according to the instance flags.

     Order of registration: base cases, currencies, protected words, then
     the optional http handler, domain handler, contractions and
     abbreviations.
     """
     self.tokenizer = TextSearch("sensitive", "norm", set(), set())
     self.add_base_cases()
     self.add_currencies()
     self.add_words(self.protected_words)

     if self.handle_http:
         self.tokenizer.add_http_handler(keep_result=True)
         # Record an explanation for each URL prefix.
         for prefix in ("http://", "https://", "www."):
             self.explain_dict[prefix] = (
                 "regex: when it finds '{}' it will stop after it finds a space."
                 .format(prefix))

     if self.handle_domains:
         self.add_domain_handler()

     if self.contractions:
         # A plain True means "use the built-in dictionaries"; any other
         # truthy value is used as the contraction mapping itself.
         if self.contractions == True:
             defaults = {}
             defaults.update(contractions_dict)
             defaults.update(leftovers_dict)
             self.contractions = defaults
         self.add_words(self.contractions)

     if self.abbrevs:
         self.add_words(self.abbrevs)
示例#7
0
def test_serializable():
    """Results produced with ``dict`` as the return type can be dumped as JSON."""
    searcher = TextSearch("sensitive", dict)
    searcher.add("hi")

    matches = searcher.findall("hi")
    assert matches
    # json.dumps raises on non-serializable content and returns a non-empty
    # string otherwise.
    assert json.dumps(matches)
示例#8
0
def test_replace():
    """``replace`` swaps a registered word for its norm and leaves the rest alone."""
    searcher = TextSearch("sensitive", "norm")
    searcher.add("hi", "HI")

    rewritten = searcher.replace("test hi test")
    assert rewritten == "test HI test"
示例#9
0
def test_replace_insensitive_keep_casing():
    """In "insensitive" mode the replacement takes on the casing of the match."""
    searcher = TextSearch("insensitive", "norm")
    searcher.add("hi", "bye")

    title_cased = searcher.replace("test Hi test")
    assert title_cased == "test Bye test"

    upper_cased = searcher.replace("test HI test")
    assert upper_cased == "test BYE test"
示例#10
0
def test_not_overlap_3():
    """``findall`` takes the longer rule first and does not reuse matched text."""
    searcher = TextSearch("ignore", "norm")
    for phrase in ("a", "a a"):
        searcher.add(phrase)

    found = searcher.findall("a a a")
    assert found == ["a a", "a"]
示例#11
0
def test_add_dict():
    """``add`` accepts a dict; keys are the search terms, values the norms."""
    mapping = {"hi": "greeting", "bye": "bye", "goodbye": "bye"}
    searcher = TextSearch("smart", "norm")
    searcher.add(mapping)

    found = searcher.findall("hi bye goodbye")
    assert found == ["greeting", "bye", "bye"]
示例#12
0
def test_twitter():
    """The twitter handler returns @mentions and #hashtags as whole matches."""
    searcher = TextSearch("ignore", "norm")
    searcher.add_twitter_handler(keep_result=True)

    for token in ("@hello", "#hello"):
        assert searcher.findall(token) == [token]
示例#13
0
class Tokenizer:
    """Rule-based word and sentence tokenizer built on a ``TextSearch`` instance.

    Every rule is a text replacement registered on ``self.tokenizer``:
    tokens are separated by inserting spaces and sentences by inserting
    newlines, after which the replaced text is simply ``split``.
    ``self.explain_dict`` maps the source text of each rule to a
    human-readable reason, which ``explain`` exposes.
    """

    def __init__(
        self,
        handle_http=False,
        handle_domains=False,
        numbers=True,
        combine_punctuation=True,
        eol="\n",
        currencies=("$", ),
        protected_words=None,
        contractions=True,
        language="en",
        abbrevs=ABBREVS,
    ):
        """Store the configuration and build the rule set via ``setup``.

        Args:
            handle_http: register the http handler and explain URL prefixes.
            handle_domains: register every known top-level domain as a rule.
            numbers: keep "<digit>," and "<digit>." together.
            combine_punctuation: collapse runs of "!", ".", "?", "-" and
                newlines.
            eol: stored on the instance; not used by the code in this class.
            currencies: currency symbols that become separate tokens.
            protected_words: words (or a mapping) that must not be split.
            contractions: ``True`` for the built-in dictionaries, a mapping
                for custom ones, or a falsy value to disable them.
            language: only "en" may be combined with contractions.
            abbrevs: abbreviations registered as protected words.

        Raises:
            ValueError: if contractions are requested for a non-English
                language.
        """
        # set() set() should fallback to just using __iter__ of automaton for a speedboost
        if language != "en" and contractions:
            raise ValueError(
                "No contractions known for languages other than English.")
        self.contractions = contractions
        self.tokenizer = None
        self.handle_http = handle_http
        self.handle_domains = handle_domains
        self.combine_punctuation = combine_punctuation
        self.numbers = numbers
        self.eol = eol
        self.currencies = currencies or []
        self.protected_words = protected_words or []
        self.abbrevs = abbrevs
        self.explain_dict = {}
        self.setup()

    def setup(self):
        """Create ``self.tokenizer`` and register all rules enabled by the flags."""
        self.tokenizer = TextSearch("sensitive", "norm", set(), set())
        self.add_base_cases()
        self.add_currencies()
        self.add_words(self.protected_words)
        if self.handle_http:
            self.tokenizer.add_http_handler(keep_result=True)
            for word in ["http://", "https://", "www."]:
                self.explain_dict[
                    word] = "regex: when it finds '{}' it will stop after it finds a space.".format(
                        word)
        if self.handle_domains:
            self.add_domain_handler()
        if self.contractions:
            # Equality (not identity) is kept on purpose so that values equal
            # to True, such as 1, keep selecting the built-in dictionaries.
            if self.contractions == True:
                self.contractions = {}
                self.contractions.update(contractions_dict)
                self.contractions.update(leftovers_dict)
            self.add_words(self.contractions)
        if self.abbrevs:
            self.add_words(self.abbrevs)

    def add_words(self, words):
        """Register words as-is, upper-cased and with an upper-cased first letter.

        Args:
            words: a dict ``{source: target}``, a list/set/tuple of strings
                (each mapped to itself), or an iterable of
                ``(source, target)`` pairs. An empty or ``None`` value is a
                no-op.
        """
        if not words:
            return

        REASON_AS_IS = "protected word: adds word as is, prevents splitting it."
        REASON_UPPER = "protected word: adds word uppercased, prevents splitting it."
        REASON_TITLE = "protected word: adds word titlecased, prevents splitting it."

        if isinstance(words, dict):
            pairs = words.items()
        elif isinstance(words, (list, set, tuple)):
            # Materialise first: a set cannot be indexed, so peeking at the
            # first element must go through a list.
            pairs = list(words)
            if isinstance(pairs[0], str):
                pairs = [(x, x) for x in pairs]
        else:
            pairs = words

        for x, y in pairs:
            self.add(x, y, REASON_AS_IS)
            self.add(x.upper(), y.upper(), REASON_UPPER)
            # Both sides need a first character to capitalise.
            if x and y:
                self.add(x[0].upper() + x[1:], y[0].upper() + y[1:],
                         REASON_TITLE)

    def add_domain_handler(self):
        """Register every purely alphabetic TLD known to ``tldextract`` as a kept token."""
        import re
        from tldextract.tldextract import TLD_EXTRACTOR

        valid_re = re.compile("^[a-zA-Z.]+$")
        tlds = ["." + x for x in TLD_EXTRACTOR.tlds if valid_re.match(x)]

        for x in tlds:
            self.add(x, x,
                     "Added by domain handler, keeps the token existing.")

    def add_base_cases(self):
        """Register the built-in punctuation, number and newline rules.

        The registration order matters: later ``add``/``split`` calls for
        the same source text (for example "...") replace earlier ones.
        """
        if self.numbers:
            for x in "0123456789":
                self.keep(x + ",")
                self.keep(x + ".")

        # self.tokenizer.add(" !", " ! ")

        if self.combine_punctuation:
            # combine multiples
            R_COMBINE = "combine punctuation: merges '{}' into '{}' and starts a new sentence."
            for s in "!.?-":
                # Runs of 2..9 identical marks all collapse to three marks.
                # Only a run of dashes does not start a new sentence.
                e = s * 3
                end = " \n" if s != "-" else " "
                for i in range(2, 10):
                    c = s * i
                    self.add(c, " {}{}".format(e, end),
                             R_COMBINE.format(c, e + end))

            for i in range(2, 10):
                # self.tokenizer.add("\n" * i, "$<EOS>$")
                self.add("\n" * i, " \n ", "merges newlines")

        for s in "!.?-\n":
            self.add(s, " " + s + "\n",
                     "Splits on '{}' and creating a new sentence.".format(s))

        self.split("- ")

        self.split("...")

        # does not work
        # self.tokenizer.add_regex_handler(["!?"], "[!]+[?]+[!?]+", True, return_value=" !? ")

        self.split("!?")
        self.split("!?!")
        self.split("!!?")
        self.split("!??")
        self.split("?!!")
        self.split("?!?")
        self.split("??!")

        for x in string.ascii_letters:
            self.keep(" " + x + ".")

        # for x in string.ascii_letters:
        #     self.tokenizer.add("\n" + x, "\n" + x)

        for s in ":;,":
            # The template must be filled in, otherwise the stored
            # explanation contains a literal '{}'.
            self.split(s, "Splits on '{}' (punctuation)".format(s))

        # quotes (make sure we add all the exceptions)
        self.split("'")
        self.split('"')

    # NOTE: the docstrings of keep/split/drop/strip are read at runtime via
    # ``__doc__`` to build the default explanation, so their text is part of
    # the program's output and must not be reworded casually.

    def keep(self, x, reason=None):
        """ Whenever it finds x, it will not add whitespace. Prevents direct tokenization. """
        self.tokenizer.add(x, x)
        self.explain_dict[x] = reason or "keep:" + self.keep.__doc__.replace(
            "x", repr(x)).rstrip()

    def split(self, x, reason=None):
        """ Whenever it finds x, it will surround it by whitespace, thus creating a token. """
        self.tokenizer.add(x, " {} ".format(x))
        self.explain_dict[x] = (
            reason
            or "split:" + self.split.__doc__.replace("x", repr(x)).rstrip())

    def drop(self, x, reason=None):
        """ Whenever it finds x, it will remove it but add a split."""
        self.tokenizer.add(x, " ")
        self.explain_dict[x] = reason or "drop:" + self.drop.__doc__.replace(
            "x", repr(x)).rstrip()

    def strip(self, x, reason=None):
        """ Whenever it finds x, it will remove it without splitting. """
        self.tokenizer.add(x, "")
        self.explain_dict[x] = (
            reason
            or "strip:" + self.strip.__doc__.replace("x", repr(x)).rstrip())

    def add(self, x, y, reason):
        """Register the replacement ``x`` -> ``y`` and record ``reason`` for it."""
        self.tokenizer.add(x, y)
        self.explain_dict[x] = reason

    def explain(self, char_or_chars):
        """Describe every rule whose source text contains ``char_or_chars``.

        Returns:
            A list of ``{"from", "to", "explanation"}`` dicts, one per
            matching rule, or a single ``{"explanation": ...}`` dict when no
            rule matches. The two shapes are kept for backward
            compatibility.
        """
        keys = [x for x in self.tokenizer._root_dict if char_or_chars in x]
        if not keys:
            return {
                "explanation":
                "No explanation, meaning there is nothing specified for the input"
            }
        return [{
            "from": x,
            "to": self.tokenizer._root_dict[x],
            "explanation": self.explain_dict[x]
        } for x in keys]

    def remove(self, x):
        """Remove the rule for ``x`` and its explanation, if the rule exists."""
        if x in self.tokenizer:
            self.tokenizer.remove(x)
            # A rule registered directly on the tokenizer has no explanation
            # entry; that must not make removal fail.
            self.explain_dict.pop(x, None)

    def add_currencies(self):
        """Split off each currency symbol and protect "<currency><digit><,|.>".

        The digit rules are registered for every configured currency; with
        no currencies configured nothing is registered.
        """
        for currency in self.currencies:
            self.split(currency)

            for num in "0123456789":
                # to prevent the . and , from being treated as punct
                for punc in ",.":
                    s = "{currency}{num}{punc}".format(currency=currency,
                                                       num=num,
                                                       punc=punc)
                    r = " {currency} {num}{punc}".format(currency=currency,
                                                         num=num,
                                                         punc=punc)
                    self.add(s, r,
                             "protecting currency from being seen as a number.")

    def word_tokenize(self, z, return_entities=False, to_lower=False):
        """Tokenize ``z`` into a flat list of word tokens.

        Args:
            z: the text to tokenize. A space is prepended so that rules
                starting with a space can match at the very beginning.
            return_entities: also return the second value produced by
                ``TextSearch.replace(..., return_entities=True)``.
            to_lower: lower-case every returned token.

        Returns:
            The token list, or ``(tokens, entities)`` when
            ``return_entities`` is true.
        """
        if return_entities:
            a, b = self.tokenizer.replace(" " + z, return_entities=True)
            res = a.split()
        else:
            b = None
            res = self.tokenizer.replace(" " + z).split()
        if to_lower:
            res = [x.lower() for x in res]
        if return_entities:
            return res, b
        return res

    def word_newlined_tokenize(self, z):
        """Tokenize ``z`` into one flat list with "\\n" between sentences.

        Returns an empty list when ``z`` contains no sentence.
        """
        tokens = []
        for index, sentence in enumerate(self.sent_tokenize(z)):
            if index:
                tokens.append("\n")
            tokens.extend(sentence)
        return tokens

    def sent_tokenize(self, z):
        """Tokenize ``z`` into a list of sentences, each a list of tokens.

        Sentences are the non-blank newline-separated pieces of the replaced
        text.
        """
        return [
            x.split() for x in self.tokenizer.replace(z).split("\n")
            if x.strip()
        ]
示例#14
0
def test_regex_overlap():
    """The regex handler for "last " wins over the plain rule "last"."""
    searcher = TextSearch("insensitive", "object")
    searcher.add_regex_handler(["last "], r"\d", keep_result=True)
    searcher.add("last")

    first_match = searcher.findall("last 5")[0]
    assert first_match.norm == "last 5"
示例#15
0
    return jsonify(res)

    # used for debug
    # return jsonify({"uuid":jsonfile['0']})


# Script entry point: read the file locations from params.json, build the
# module-level TextSearch instance, then start the Flask development server.
if __name__ == '__main__':
    # NOTE: the string below is a bare expression statement, not a docstring;
    # it has no runtime effect and only serves as an inline design note.
    """
    The TextSearch class instance is loaded at the init of the flask
    API for performance reasons, meaning that all the inputs matrices are 
    stored as in-memory objects
    Then at each call of the API, the searchTop method is called and uses 
    the in-memory matrices to make the proximity calculation
    
    The largest file - document matrix - is 
        (n documents) x (m vect components) 64 bit float
    
    64 bits is not necessary, this can be optimized. 
    
    
    """

    # params.json is resolved relative to the current working directory and
    # must provide the three keys read below.
    with open("params.json", 'r') as stream:
        params = json.load(stream)

    modelfile = params['modelfile']
    docmatrixfile = params['docmatrixfile']
    textfile = params['textfile']
    # Built once here and bound at module level.
    # NOTE(review): presumably read by the request handlers — confirm.
    textSearch = TextSearch(modelfile, docmatrixfile, textfile)
    # NOTE(review): debug=True turns on Flask's debugger/reloader; confirm
    # this entry point is never used for a production deployment.
    app.run(debug=True)
示例#16
0
def test_insensitive_object():
    """An "object" result exposes the end offset of a case-insensitive match."""
    searcher = TextSearch("insensitive", "object")
    searcher.add("hi")

    first_match = searcher.findall("HI")[0]
    assert first_match.end == 2
示例#17
0
def test_regex_norm():
    """A regex handler with ``keep_result=True`` returns prefix plus regex match."""
    searcher = TextSearch("insensitive", "norm")
    searcher.add_regex_handler(["last "], r"\d", keep_result=True)

    found = searcher.findall("last 5")
    assert found == ["last 5"]
示例#18
0
def test_not_overlap():
    """A URL yields exactly one match with the http handler and a plain "http://" rule."""
    searcher = TextSearch("ignore", "norm")
    searcher.add("http://")
    searcher.add_http_handler(True)

    found = searcher.findall("https://vks.ai")
    assert len(found) == 1
示例#19
0
def test_postfix_regex():
    """With ``prefix=False`` the regex is matched before the keyword."""
    searcher = TextSearch("ignore", "norm")
    searcher.add_regex_handler(
        ["products"], r"\d+ ", keep_result=True, prefix=False)

    found = searcher.findall("90 products")
    assert found == ["90 products"]
示例#20
0
def test_overlap():
    """``find_overlapping`` reports both short matches and the long one."""
    searcher = TextSearch("ignore", "norm")
    for phrase in ("hi", "hi hi"):
        searcher.add(phrase)

    overlapping = searcher.find_overlapping("hi hi")
    assert len(overlapping) == 3
示例#21
0
def test_http():
    """The http handler with ``keep_result=True`` returns the whole URL."""
    url = "http://google.com"
    searcher = TextSearch("ignore", "norm")
    searcher.add_http_handler(keep_result=True)

    assert searcher.findall(url) == [url]
示例#22
0
def test_repr():
    """``repr`` is non-empty, with and without two empty sets as extra arguments."""
    for extra_args in ((), (set(), set())):
        instance = TextSearch("ignore", "match", *extra_args)
        assert repr(instance)
示例#23
0
def test_http_no_keep():
    """With ``keep_result=False`` the URL is consumed and nothing inside it matches."""
    searcher = TextSearch("ignore", "norm")
    searcher.add_http_handler(keep_result=False)
    searcher.add("google")

    found = searcher.findall("http://google.com")
    assert found == []
示例#24
0
def test_ignore_match():
    """In "ignore" case mode the registered spelling is returned for any casing."""
    searcher = TextSearch("ignore", "match")
    searcher.add("hi")

    expectations = (
        ("hi", ["hi"]),
        ("HI", ["hi"]),
        ("asdf", []),
    )
    for text, expected in expectations:
        assert searcher.findall(text) == expected
示例#25
0
def get_ts():
    """Return a new case-insensitive, object-returning TextSearch loaded with ``nlp_registry``."""
    searcher = TextSearch("insensitive", "object")
    searcher.add(nlp_registry)
    return searcher
示例#26
0
def test_add_list():
    """``add`` accepts a list of words and each of them becomes searchable."""
    vocabulary = ["hi", "bye", "hello"]
    searcher = TextSearch("smart", "match")
    searcher.add(vocabulary)

    found = searcher.findall("hi bye hello")
    assert found == ["hi", "bye", "hello"]
示例#27
0
def test_sensitive_match():
    """In "sensitive" mode only the exact casing of the registered word matches."""
    searcher = TextSearch("sensitive", "object")
    searcher.add("hi")

    same_case = searcher.findall("hi")
    assert same_case

    other_case = searcher.findall("HI")
    assert not other_case
示例#28
0
def test_not_overlap_2():
    """``replace`` applies the longest rule instead of the shorter one twice."""
    searcher = TextSearch("ignore", "norm")
    for source, target in (("hi", "HI"), ("hi hi", "h h")):
        searcher.add(source, target)

    rewritten = searcher.replace("hi hi")
    assert rewritten == "h h"
示例#29
0
# Slang expansions; extended below with the entries of ``unsafe_dict``.
slang_dict = {
    "ima": "I am going to",
    "gonna": "going to",
    "gotta": "got to",
    "wanna": "want to",
    "woulda": "would have",
    "gimme": "give me",
    "asap": "as soon as possible",
    "u": "you",
    "r ": "are ",
}
slang_dict.update(unsafe_dict)

# Four independent, case-ignoring replacers. Each starts from the
# contractions and differs only in which extra dictionaries are layered on.

# contractions only
ts_basic = TextSearch("ignore", "norm")
ts_basic.add(contractions_dict)

# contractions + slang
ts_slang = TextSearch("ignore", "norm")
ts_slang.add(contractions_dict)
ts_slang.add(slang_dict)

# contractions + leftovers
ts_leftovers = TextSearch("ignore", "norm")
ts_leftovers.add(contractions_dict)
ts_leftovers.add(leftovers_dict)

# contractions + leftovers + slang
ts_leftovers_slang = TextSearch("ignore", "norm")
ts_leftovers_slang.add(contractions_dict)
ts_leftovers_slang.add(leftovers_dict)
ts_leftovers_slang.add(slang_dict)
示例#30
0
def test_smart_match():
    """"smart" mode reports the casing of a match and whether it is exact.

    With only "hi" registered, "hi" is an exact lower-case match while "HI"
    and "Hi" match inexactly as upper/title. After registering "hI" that
    spelling matches exactly with case "mixed".
    """
    searcher = TextSearch("smart", "object")

    def first_match(text):
        return searcher.findall(text)[0]

    searcher.add("hi")
    expectations = (
        ("hi", "lower", True),
        ("HI", "upper", False),
        ("Hi", "title", False),
    )
    for text, expected_case, expected_exact in expectations:
        assert first_match(text).case == expected_case
        assert bool(first_match(text).is_exact) is expected_exact

    searcher.add("hI")
    assert first_match("hI").case == "mixed"
    assert first_match("hI").is_exact