Example #1
    def predict(self, prompt: str) -> Dict[str, float]:
        vocab_size = len(self.unigram_counts)
        prompt_tokens = tokenize(prompt)[(-self.history - 1) :]
        prompt_tokens = [""] * max(0, self.n - 1 - len(prompt_tokens)) + prompt_tokens

        def mle(token: str) -> float:
            p_log = math.log(1.0)
            tokens = [*prompt_tokens, token]
            for ngram in ngrams(tokens, self.n):
                unigram_count = self.unigram_counts.get(ngram[0], 0)
                ngram_count = self.ngram_counts.get(ngram, 0)

                if self.smoothing is Smoothing.LAPLACE:
                    p_log = p_log + math.log((ngram_count + 1) / (unigram_count + vocab_size))
                elif self.smoothing is Smoothing.GOOD_TURING:
                    if unigram_count == 0:
                        p_log = p_log + math.log(1 / len(self.tokens))
                    else:
                        p_log = p_log + math.log((ngram_count or (1 / len(self.tokens))) / unigram_count)
                else:
                    if ngram_count == 0:
                        return 0
                    p_log = p_log + math.log(ngram_count / unigram_count)

            return math.exp(p_log)  # p_log accumulates natural logs, so invert with exp(), not 2**

        follower_odds = {k: mle(k) for k in self.unigram_counts.keys()}

        return collections.OrderedDict((sorted(follower_odds.items(), key=lambda item: item[1], reverse=True)))
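The predict method above relies on tokenize and ngrams helpers that are not part of this snippet. A minimal sketch of what they might look like, assuming whitespace tokenization and a simple sliding window (the names match the calls above, but the behavior here is an assumption, not the original implementation):

from typing import Iterator, List, Tuple

def tokenize(text: str) -> List[str]:
    # Assumed behavior: lowercase, whitespace-separated tokens.
    return text.lower().split()

def ngrams(tokens: List[str], n: int) -> Iterator[Tuple[str, ...]]:
    # Slide a window of length n over the token list.
    for i in range(len(tokens) - n + 1):
        yield tuple(tokens[i : i + n])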
Example #2
    def fit(self, text: str) -> None:
        tokens = tokenize(text)

        self.ngram_follower_counts = defaultdict(lambda: defaultdict(int))
        for ngram in ngrams(tokens, self.n + 1):
            ngram, follower = ngram[: self.n], ngram[-1]
            self.ngram_follower_counts[ngram][follower] += 1
Example #3
def tokenize_simple_test_2():
    string = "this is a simple test"
    gold = ["this", "is", "a", "simple", "test"]
    tokens = nlp.tokenize(string)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
Example #4
    def ptb_inputs_test11(self):
        string = "1. Buy a new Chevrolet (37%-owned in the U.S..) . 15%"
        gold = [
            "1",
            ".",
            "Buy",
            "a",
            "new",
            "Chevrolet",
            "-LRB-",
            "37",
            "%",
            "-",
            "owned",
            "in",
            "the",
            "U.S.",
            ".",
            "-RRB-",
            ".",
            "15",
            "%",
        ]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #5
def get_skipgram(tweets, out_folder, nIn, kIn):
    # Tokenization and preprocessing (if not yet done) must be done here: when the analyzer
    # receives a callable, it will not perform tokenization; see the scikit-learn documentation.
    tweet_tokenized = []
    for t in tweets:
        tweet_tokenized.append(nlp.tokenize(t))
    skipper = functools.partial(skipgrams, n=nIn, k=kIn)
    vectorizer = TfidfVectorizer(
        analyzer=skipper,
        #stop_words=nlp.stopwords,  # We do better when we keep stopwords
        use_idf=True,
        smooth_idf=False,
        norm=None,  # no normalization is applied
        decode_error='replace',
        max_features=10000,
        min_df=5,
        max_df=0.501)
    # for t in cleaned_tweets:
    #     tweetTokens = word_tokenize(t)
    #     skipgram_feature_matrix.append(list(skipper(tweetTokens)))

    # Fit the text into the vectorizer.
    logger.info("\tgenerating skip-gram vectors, n={}, k={}, {}".format(
        nIn, kIn, datetime.datetime.now()))
    tfidf = vectorizer.fit_transform(tweet_tokenized).toarray()
    logger.info("\t\t complete, dim={}, {}".format(tfidf.shape,
                                                   datetime.datetime.now()))
    vocab = {v: i for i, v in enumerate(vectorizer.get_feature_names())}
    idf_vals = vectorizer.idf_
    idf_dict = {i: idf_vals[i]
                for i in vocab.values()
                }  # keys are indices; values are IDF scores
    pickle.dump(vocab,
                open(out_folder + "/" + SKIPGRAM_FEATURES_VOCAB + ".pk", "wb"))
    return tfidf, vocab
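As the comment at the top of get_skipgram notes, a callable analyzer means TfidfVectorizer does no tokenization of its own: each (already tokenized) document is passed straight to the callable. A small illustration of the skip-gram callable in isolation, assuming skipper wraps nltk.util.skipgrams as in the snippet above (the sample tokens are made up):

import functools
from nltk.util import skipgrams

skipper = functools.partial(skipgrams, n=2, k=1)
print(list(skipper(["new", "york", "city", "tonight"])))
# [('new', 'york'), ('new', 'city'), ('york', 'city'), ('york', 'tonight'), ('city', 'tonight')]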
Example #6
    def ptb_inputs_test13(self):
        string = "Diamond (``Not even the chair'') lives near Udaipur " "(84km). {1. A potential Palmer trade:}"
        gold = [
            "Diamond",
            "-LRB-",
            "``",
            "Not",
            "even",
            "the",
            "chair",
            "''",
            "-RRB-",
            "lives",
            "near",
            "Udaipur",
            "-LRB-",
            "84km",
            "-RRB-",
            ".",
            "-LCB-",
            "1",
            ".",
            "A",
            "potential",
            "Palmer",
            "trade",
            ":",
            "-RCB-",
        ]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #7
    def ptb_inputs_test12(self):
        string = "I like you ;-) but do you care :(. I'm happy ^_^ but " "shy (x.x)!"
        gold = [
            "I",
            "like",
            "you",
            ";--RRB-",
            "but",
            "do",
            "you",
            "care",
            ":-LRB-",
            ".",
            "I",
            "'m",
            "happy",
            "^_^",
            "but",
            "shy",
            "-LRB-x.x-RRB-",
            "!",
        ]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #8
    def ptb_inputs_test17(self):
        string = "Kenneth liked Windows 3.1, Windows 3.x, and Mesa A.B " "as I remember things."
        gold = [
            "Kenneth",
            "liked",
            "Windows",
            "3.1",
            ",",
            "Windows",
            "3.x",
            ",",
            "and",
            "Mesa",
            "A.B",
            "as",
            "I",
            "remember",
            "things",
            ".",
        ]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #9
    def ptb_inputs_test15(self):
        string = "You can get a B.S. or a B. A. or a Ph.D (sometimes a " "Ph. D) from Stanford."
        gold = [
            "You",
            "can",
            "get",
            "a",
            "B.S.",
            "or",
            "a",
            "B.",
            "A.",
            "or",
            "a",
            "Ph.D",
            "-LRB-",
            "sometimes",
            "a",
            "Ph.",
            "D",
            "-RRB-",
            "from",
            "Stanford",
            ".",
        ]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #10
    def ptb_inputs_test22(self):
        string = "I like: \u2022wine, \u0095cheese, \u2023salami, & " "\u2043speck."
        gold = [
            "I",
            "like",
            ":",
            "\u2022",
            "wine",
            ",",
            "\u2022",
            "cheese",
            ",",
            "\u2023",
            "salami",
            ",",
            "&",
            "\u2043",
            "speck",
            ".",
        ]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #12
def tokenize_sgml_test_2_no_normalize():
    nlp.get_global_PTB_config().strict_ptb3 = False
    sent2 = "Panasonic brand products are produced by Samsung Electronics " "Co. Ltd. Sanyo products aren't."
    gold = [
        "Panasonic",
        "brand",
        "products",
        "are",
        "produced",
        "by",
        "Samsung",
        "Electronics",
        "Co.",
        "Ltd.",
        ".",
        "Sanyo",
        "products",
        "are",
        "n't",
        ".",
    ]

    tokens = nlp.tokenize(sent2)
    print(tokens)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
Example #13
def process_nlp_sentence(s_in):
    out = []
    # Get tokenization
    tokens = nlp.tokenize(s_in)
    dependencies = dependency(s_in)
    
    # All tokens are TOKEN, tokens relate concepts
    for t in tokens:
        out.append("<\"{}\" --> TOKEN>.".format(t))
        out.append("<(*,\"{0}\",{0}) --> RELATES>.".format(t))
        
    # The tokens together are a sentence
    sent = "(*,\"{}\")".format("\", \"".join(tokens))
    out.append("<{} --> SENTENCE>.".format(sent))
    out.append("<{} --> (*,USER,SAYS)>. :|:".format(sent))
    out.append("<{} <-> {}>.".format(sent, quote(raw)))
    
    # Process dependencies
    for D in dependencies:                                  
        (t1, pos1), d, (t2, pos2) = D                       # Get all terms separately
        i1 = "<(*,{},{}) --> INSIDE>".format(t1,sent)       # get term for t1 in sentence
        i2 = "<(*,{},{}) --> INSIDE>".format(t2,sent)       # get term for t2 in sentence
        p1 = "<{} --] {}>.".format(i1,pos1)                 # i1 has part of speech pos1
        p2 = "<{} --] {}>.".format(i2,pos2)                 # i2 has part of speech pos2
        d0 = "<(&&,{},{}) --> {}>.".format(i1,i2,d)         # p1 and p2 have dependency d
        out += [p1,p2,d0]                                   # output p1, p2, and d0
    
    return out
Example #14
    def testTokenize_singleSentenceWithPunctuation(self):
        tokens = nlp.tokenize(
            "As far as I can see, this is  a pipe made in 1965.")
        self.assertEqual(tokens, [
            "As", "far", "as", "I", "can", "see", ",", "this", "is", "a",
            "pipe", "made", "in", "1965."
        ])
Example #15
    def ptb_inputs_test14(self):
        string = "No. I like No. 24 and no.47."
        gold = ["No", ".", "I", "like", "No.", "24", "and", "no.", "47", "."]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #16
    def ptb_inputs_test1(self):
        string = "This is a sentence."
        gold = ["This", "is", "a", "sentence", "."]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #17
def vocabulary(data):
    result = set()
    for record in data:
        text = str(record['text']) + ' ' + str(record['summary'])
        tokens = nlp.tokenize(text)
        for token in tokens:
            result.add(token.lower())
    return result
Example #18
    def ptb_inputs_test4(self):
        string = "The Iron Age (ca. 1300 – ca. 300 BC)."
        gold = ["The", "Iron", "Age", "-LRB-", "ca.", "1300", "--", "ca.", "300", "BC", "-RRB-", "."]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #19
    def ptb_inputs_test18(self):
        string = "I like programming in F# more than C#."
        gold = ["I", "like", "programming", "in", "F#", "more", "than", "C#", "."]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #20
    def ptb_inputs_test16(self):
        string = "@Harry_Styles didn`t like Mu`ammar al-Qaddafi"
        gold = ["@Harry_Styles", "did", "n`t", "like", "Mu`ammar", "al-Qaddafi"]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #21
    def ptb_inputs_test20(self):
        string = "I lived in O\u2019Malley and read OK! Magazine."
        gold = ["I", "lived", "in", "O'Malley", "and", "read", "OK!", "Magazine", "."]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #22
    def ptb_inputs_test5(self):
        string = "Indo\u00ADnesian ship\u00ADping \u00AD"
        gold = ["Indonesian", "shipping", "-"]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #24
def get_website_html_pages_tokens(website_names, filter_type):
    result = {}
    html_doc_id = 0

    for website_name in website_names:
        with open(f'classifier/html_docs/{website_name}_html_docs_complete.json', 'r', encoding='utf-8') as f:
            html_docs_list = json.load(f)
            f.close()

        for html_doc in html_docs_list:
            if filter_type == 'true' and html_doc[0] and '<!DOCTYPE' in html_doc[0] and (html_doc[1] == True):
                result[html_doc_id] = [html_doc[2], html_doc[3], nlp.tokenize(html_doc[0])]
            elif filter_type == 'all' and html_doc[0] and '<!DOCTYPE' in html_doc[0]:
                result[html_doc_id] = [html_doc[2], html_doc[3], nlp.tokenize(html_doc[0])]
            html_doc_id = html_doc_id + 1
        print(f'Get {website_name.upper()} tokens.')
    return result
Example #27
    def ptb_inputs_test6(self):
        string = "Gimme a phone, I'm gonna call."
        gold = ["Gim", "me", "a", "phone", ",", "I", "'m", "gon", "na", "call", "."]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #28
    def ptb_inputs_test21(self):
        string = "I lived in O\u0092Malley and read OK! Magazine."
        # /* invalid unicode codepoint, but inherit from cp1252 */
        gold = ["I", "lived", "in", "O'Malley", "and", "read", "OK!", "Magazine", "."]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #29
def tokenize_sgml_test_1_no_normalize():
    nlp.get_global_PTB_config().normalize_parentheses = False
    nlp.get_global_PTB_config().normalize_brackets = False
    sent1 = (
        "Significant improvements in peak FEV1 were demonstrated "
        "with tiotropium/olodaterol 5/2 \u03BCg (p = 0.008), 5/5 \u03BCg "
        "(p = 0.012), and 5/10 \u03BCg (p < 0.0001) versus tiotropium "
        "monotherapy [51]."
    )
    gold = [
        "Significant",
        "improvements",
        "in",
        "peak",
        "FEV1",
        "were",
        "demonstrated",
        "with",
        "tiotropium/olodaterol",
        "5/2",
        "\u03BCg",
        "(",
        "p",
        "=",
        "0.008",
        ")",
        ",",
        "5/5",
        "\u03BCg",
        "(",
        "p",
        "=",
        "0.012",
        ")",
        ",",
        "and",
        "5/10",
        "\u03BCg",
        "(",
        "p",
        "<",
        "0.0001",
        ")",
        "versus",
        "tiotropium",
        "monotherapy",
        "[",
        "51",
        "]",
        ".",
    ]

    tokens = nlp.tokenize(sent1)
    # assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        print(token, gold_token.encode("utf-8"))
        assert bytes(token) == gold_token.encode("utf-8")
Example #30
def tokenize_sgml_test_10_no_normalize():
    sent10 = "&lt;[email protected]&gt; [email protected] " "<*****@*****.**>"

    gold = ["&lt;[email protected]&gt;", "*****@*****.**", "<*****@*****.**>"]

    tokens = nlp.tokenize(sent10)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
Example #31
def filter_english(data, min_english=MIN_ENGLISH):
    """
    Remove songs that are mostly non-English
    """
    rows = []
    song_words = []
    for i, row in data.iterrows():
        text = row.song_darklyrics.strip()
        words = tokenize(text)
        english_words = tokenize(text, english_only=True)
        is_english = len(english_words) > min_english * len(words)
        if is_english:
            rows.append(i)
            song_words.append(' '.join(english_words))
    print('Non-English songs removed:', len(data) - len(rows))
    data = data.loc[rows]
    data['song_words'] = song_words
    return data
Example #32
    def ptb_inputs_test18(self):
        string = "I like programming in F# more than C#."
        gold = [
            "I", "like", "programming", "in", "F#", "more", "than", "C#", "."
        ]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #33
    def fit(self, text: str) -> None:
        self.tokens = tokenize(text)

        self.unigram_counts = dict()
        for unigram in self.tokens:
            self.unigram_counts[unigram] = self.unigram_counts.get(unigram, 0) + 1

        self.ngram_counts = dict()
        for ngram in ngrams(self.tokens, self.n):
            self.ngram_counts[ngram] = self.ngram_counts.get(ngram, 0) + 1
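A design note on the fit above: the two counting loops are exactly what collections.Counter provides. A hypothetical stand-alone equivalent (fit_counts is not part of the original class, just an illustration):

from collections import Counter
from typing import List, Tuple

def fit_counts(tokens: List[str], n: int) -> Tuple[Counter, Counter]:
    # Count unigrams and n-grams in one pass each.
    unigram_counts = Counter(tokens)
    ngram_counts = Counter(tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1))
    return unigram_counts, ngram_counts

print(fit_counts(["a", "rose", "is", "a", "rose"], 2))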
Example #34
    def ptb_inputs_test6(self):
        string = "Gimme a phone, I'm gonna call."
        gold = [
            "Gim", "me", "a", "phone", ",", "I", "'m", "gon", "na", "call", "."
        ]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #35
    def ptb_inputs_test8(self):
        string = "I said at 4:45pm."
        gold = ["I", "said", "at", "4:45", "pm", "."]
        tokens = nlp.tokenize(string)
        print("SYSTEM:", tokens)
        print("GOLD  :", gold)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #37
    def ptb_inputs_test23(self):
        string = "I don't give a f**k about your sh*tty life."
        gold = ["I", "do", "n't", "give", "a", "f", "**", "k", "about", "your", "sh", "*", "tty", "life", "."]
        tokens = nlp.tokenize(string)
        print("SYSTEM:", tokens)
        print("GOLD  :", gold)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #38
    def ptb_inputs_test16(self):
        string = "@Harry_Styles didn`t like Mu`ammar al-Qaddafi"
        gold = [
            "@Harry_Styles", "did", "n`t", "like", "Mu`ammar", "al-Qaddafi"
        ]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #39
    def ptb_inputs_test4(self):
        string = "The Iron Age (ca. 1300 – ca. 300 BC)."
        gold = [
            "The", "Iron", "Age", "-LRB-", "ca.", "1300", "--", "ca.", "300",
            "BC", "-RRB-", "."
        ]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #40
def tokenize_sgml_test_4_no_normalize():
    nlp.get_global_PTB_config().normalize_spaces = False
    sent4 = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Strict//EN" ' '"http://www.w3.org/TR/html4/strict.dtd">'

    gold = ['<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Strict//EN" ' '"http://www.w3.org/TR/html4/strict.dtd">']
    # spaces go to &nbsp; \u00A0

    tokens = nlp.tokenize(sent4)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
Example #41
    def ptb_inputs_test20(self):
        string = "I lived in O\u2019Malley and read OK! Magazine."
        gold = [
            "I", "lived", "in", "O'Malley", "and", "read", "OK!", "Magazine",
            "."
        ]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #42
    def ptb_inputs_test10(self):
        string = "You `paid' US$170,000?!\nYou should've paid only$16.75."
        gold = [
            "You", "`", "paid", "'", "US$", "170,000", "?!", "You", "should",
            "'ve", "paid", "only", "$", "16.75", "."
        ]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #43
    def ptb_inputs_test11(self):
        string = "1. Buy a new Chevrolet (37%-owned in the U.S..) . 15%"
        gold = [
            "1", ".", "Buy", "a", "new", "Chevrolet", "-LRB-", "37", "%", "-",
            "owned", "in", "the", "U.S.", ".", "-RRB-", ".", "15", "%"
        ]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #44
def get_website_html_pages_tokens(website_name):
    with open(f'classifier/html_docs/{website_name}_html_docs.json',
              'r',
              encoding='utf-8') as f:
        html_docs_list = json.load(f)
        f.close()

    for html_doc in html_docs_list:
        if html_doc[0] and '<!DOCTYPE' in html_doc[0]:
            X_TRAIN.append(nlp.tokenize(html_doc[0]))
            Y_TRAIN.append(html_doc[1])
Example #45
def tokenize():
    data = request.get_data()
    if not data:
        return json_error('empty request')

    try:
        text = data.decode('utf-8')
    except UnicodeDecodeError as err:
        return json_error(str(err))

    return jsonify(tokens=nlp.tokenize(text))
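Assuming the view function above is registered as a Flask POST route (the /tokenize path and port below are assumptions, not taken from the snippet), a client call might look like:

import requests

resp = requests.post("http://localhost:5000/tokenize",
                     data="This is a simple test".encode("utf-8"))
print(resp.json())  # e.g. {"tokens": ["This", "is", "a", "simple", "test"]}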
Example #46
    def ptb_inputs_test24(self):
        string = "First sentence.... Second sentence."
        # The tests that come with corenlp state that ". . . . Second" should
        # be "...", ".", "Second". However, the actual CoreNLP tokenizer
        # and our tokenizer produce "...", "Second" so I am going with that.
        gold = ["First", "sentence", "...", "Second", "sentence", "."]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #48
    def ptb_inputs_test22(self):
        string = "I like: \u2022wine, \u0095cheese, \u2023salami, & " \
                 "\u2043speck."
        gold = [
            "I", "like", ":", "\u2022", "wine", ",", "\u2022", "cheese", ",",
            "\u2023", "salami", ",", "&", "\u2043", "speck", "."
        ]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #49
def tokenize_sgml_test_8_no_normalize():
    nlp.get_global_PTB_config().escape_forward_slash_asterisk = False
    sent8 = "<a href=\"http:\\\\it's\\here\"> <quote orig_author='some " "\"dude'/> <not sgmltag"

    gold = ['<a href="http:\\\\it\'s\\here">', "<quote orig_author='some \"dude'/>", "<", "not", "sgmltag"]

    tokens = nlp.tokenize(sent8)
    print(tokens)
    print(gold)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
Example #50
    def ptb_inputs_test21(self):
        string = "I lived in O\u0092Malley and read OK! Magazine."
        # /* invalid unicode codepoint, but inherit from cp1252 */
        gold = [
            "I", "lived", "in", "O'Malley", "and", "read", "OK!", "Magazine",
            "."
        ]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #51
    def predict(self, prompt: str) -> Dict[str, float]:
        prompt_ngram = tuple(tokenize(prompt))[-self.n :]
        prompt_ngram = ("",) * max(0, self.n - len(prompt_ngram)) + prompt_ngram

        follower_odds: Dict[str, int] = defaultdict(int)

        for neighbor in self.ngram_follower_counts.keys():
            neighbor_distance = ngram_distance(prompt_ngram, neighbor, self.metrics)
            for follower, follower_count in self.ngram_follower_counts[neighbor].items():
                follower_odds[follower] += follower_count * (self.n - neighbor_distance)

        return collections.OrderedDict((sorted(follower_odds.items(), key=lambda item: item[1], reverse=True)))
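This predict depends on an ngram_distance helper and a self.metrics setting that are not shown. One plausible, purely hypothetical reading is a position-wise mismatch count between two equal-length n-grams; the real helper may weight positions or use metrics differently:

from typing import Tuple

def ngram_distance(a: Tuple[str, ...], b: Tuple[str, ...], metrics=None) -> int:
    # Hypothetical sketch: count positions where the two n-grams disagree.
    return sum(1 for x, y in zip(a, b) if x != y)

print(ngram_distance(("the", "quick", "fox"), ("the", "lazy", "fox")))  # 1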
Example #52
    def ptb_inputs_test17(self):
        string = "Kenneth liked Windows 3.1, Windows 3.x, and Mesa A.B " \
                 "as I remember things."
        gold = [
            "Kenneth", "liked", "Windows", "3.1", ",", "Windows", "3.x", ",",
            "and", "Mesa", "A.B", "as", "I", "remember", "things", "."
        ]
        tokens = nlp.tokenize(string)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #53
def tokenize_sgml_test_10_no_normalize():
    sent10 = "&lt;[email protected]&gt; [email protected] " \
             "<*****@*****.**>"

    gold = [
        "&lt;[email protected]&gt;", "*****@*****.**",
        "<*****@*****.**>"
    ]

    tokens = nlp.tokenize(sent10)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
Example #54
def tokenize_sgml_test_5_no_normalize():
    sent5 = "Hi! <foo bar=\"baz xy = foo !$*) 422\" > <?PITarget " \
            "PIContent?> <?PITarget PIContent> Hi!"

    gold = [
        "Hi", "!", "<foo bar=\"baz xy = foo !$*) 422\" >",
        "<?PITarget PIContent?>", "<?PITarget PIContent>", "Hi", "!"
    ]
    tokens = nlp.tokenize(sent5)
    print(tokens)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
Example #55
def tokenize_sgml_test_5_no_normalize():
    sent5 = 'Hi! <foo bar="baz xy = foo !$*) 422" > <?PITarget ' "PIContent?> <?PITarget PIContent> Hi!"

    gold = [
        "Hi",
        "!",
        '<foo bar="baz xy = foo !$*) 422" >',
        "<?PITarget PIContent?>",
        "<?PITarget PIContent>",
        "Hi",
        "!",
    ]
    tokens = nlp.tokenize(sent5)
    print(tokens)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
Example #56
def tokenize_sgml_test_6_no_space_normalize():
    sent6 = (
        '<?xml version="1.0" encoding="UTF-8" ?>\n<?xml-stylesheet '
        'type="text/xsl" href="style.xsl"?>\n<book '
        'xml:id="simple_book" '
        'xmlns="http://docbook.org/ns/docbook" version="5.0">\n'
    )

    gold = [
        '<?xml version="1.0" encoding="UTF-8" ?>',
        '<?xml-stylesheet type="text/xsl" href="style.xsl"?>',
        '<book xml:id="simple_book" ' 'xmlns="http://docbook.org/ns/docbook" version="5.0">',
    ]

    tokens = nlp.tokenize(sent6)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
Example #57
def tokenize_sgml_test_3_no_normalize():
    nlp.get_global_PTB_config().normalize_parentheses = False
    sent3 = (
        "Oesophageal acid exposure (% time <pH 4) was similar in "
        "patients with or without complications (19.2% v 19.3% p>0.05)."
    )

    gold = [
        "Oesophageal",
        "acid",
        "exposure",
        "(",
        "%",
        "time",
        "<",
        "pH",
        "4",
        ")",
        "was",
        "similar",
        "in",
        "patients",
        "with",
        "or",
        "without",
        "complications",
        "(",
        "19.2",
        "%",
        "v",
        "19.3",
        "%",
        "p",
        ">",
        "0.05",
        ")",
        ".",
    ]

    tokens = nlp.tokenize(sent3)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
Example #58
def tokenize_sgml_test_7_no_normalize():
    sent7 = (
        '<chapter xml:id="chapter_1"><?php echo $a; ?>\n<!-- This '
        'is an SGML/XML comment "Hi!" -->\n<p> </p> <p-fix / >'
    )

    gold = [
        '<chapter xml:id="chapter_1">',
        "<?php echo $a; ?>",
        '<!-- This is an SGML/XML comment "Hi!" -->',
        "<p>",
        "</p>",
        "<p-fix / >",
    ]

    tokens = nlp.tokenize(sent7)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
Example #59
    def ptb_inputs_test7(self):
        string = '"John & Mary\'s dog," Jane thought (to herself).\n"' "What a #$%!\na- ``I like AT&T''.\""
        gold = [
            "``",
            "John",
            "&",
            "Mary",
            "'s",
            "dog",
            ",",
            "''",
            "Jane",
            "thought",
            "-LRB-",
            "to",
            "herself",
            "-RRB-",
            ".",
            "``",
            "What",
            "a",
            "#",
            "$",
            "%",
            "!",
            "a",
            "-",
            "``",
            "I",
            "like",
            "AT&T",
            "''",
            ".",
            "''",
        ]
        tokens = nlp.tokenize(string)
        print("SYSTEM:", tokens)
        print("GOLD  :", gold)

        for token, gold_token in zip(tokens, gold):
            assert bytes(token) == gold_token.encode("utf-8")
        assert len(tokens) == len(gold)
Example #60
def tokenize_sgml_test_11_no_normalize():
    sent11 = "<DOC> <DOCID> nyt960102.0516 </DOCID><STORYID cat=w " "pri=u> A0264 </STORYID> <SLUG fv=ttj-z> "
    # this is a MUC7 document

    gold = [
        "<DOC>",
        "<DOCID>",
        "nyt960102",
        ".0516",
        "</DOCID>",
        "<STORYID cat=w pri=u>",
        "A0264",
        "</STORYID>",
        "<SLUG fv=ttj-z>",
    ]

    tokens = nlp.tokenize(sent11)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")