Example #1
 def create_token_vocab(self):
     factory = utils.VocabularyFactory(reserved=["<eos>"])
     for act in self.ontology.act.f2i:
         factory.update(nltk.casual_tokenize(act))
     for slot in self.ontology.slot.f2i:
         factory.update(nltk.casual_tokenize(slot))
     return factory.get_vocab()
Example #2
 def create_act_slot_tensor(self, vocab):
     act_slots = []
     for as_idx, (act, slot) in self.ontology.act_slot.i2f.items():
         tokens = (list(nltk.casual_tokenize(act)) +
                   list(nltk.casual_tokenize(slot)) + ["<eos>"])
         tokens = [vocab[token] for token in tokens]
         act_slots.append((as_idx, torch.LongTensor(tokens)))
     act_slots = list(sorted(act_slots, key=lambda x: x[0]))
     act_slots = utils.pad_stack([act_slot[1] for act_slot in act_slots])
     return act_slots
Example #3
def gen_lit_eval(s):
    if isinstance(s, str):
        if ('["' in s or "['" in s) and ('"]' in s or "']" in s):
            new_s = ast.literal_eval(s)
        else:
            new_s = s
        if isinstance(new_s, str):
            return [nltk.casual_tokenize(s.lower())]
        else:
            return [nltk.casual_tokenize(i.lower()) for i in new_s]
    return []
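A quick, hypothetical usage sketch for gen_lit_eval (assumes ast and nltk are imported in the enclosing module; outputs shown are approximate):

print(gen_lit_eval('["Book a table", "Thanks!"]'))  # roughly [['book', 'a', 'table'], ['thanks', '!']]
print(gen_lit_eval("Book a table"))                 # roughly [['book', 'a', 'table']]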
Example #4
 def tensorize_turn_label_asv(self, asv: ActSlotValue):
     if asv == self.asv_pad:
         return self.tensorize_processed_tokens(("<pad>",))
     if asv.act == "inform":
         slot, value = asv.slot, asv.value
     elif asv.act == "request":
         slot, value = "request", asv.value
     else:
         raise RuntimeError(f"unexpected act: {asv.act}")
     tokens = (list(nltk.casual_tokenize(slot)) + ["="] +
               list(nltk.casual_tokenize(value)) + ["<eos>"])
     return self.tensorize_processed_tokens(tokens)
Example #5
def interactive_shell(model, casual=False):
    """Creates interactive shell to play with model
    Args:
        model: instance of NERModel
        casual: If we should use the nltk casual tokenize
    """
    model.logger.info("""
        This is an interactive mode.
        To exit, enter 'exit'.
        You can enter a sentence like
        input> If you have a java.io.InputStream object, how should you process that object and produce a String?"""
                      )

    while True:
        sentence = input("input> ")

        if casual:
            words_raw = casual_tokenize(sentence.strip())
        else:
            words_raw = [
                l.strip()
                for l in re.findall(CODE_TOKENISATION_REGEX, sentence.strip())
                if len(l.strip()) > 0
            ]

        if words_raw == ["exit"]:
            break

        preds = model.predict(words_raw)
        if isinstance(preds, tuple):
            preds = preds[0]

        print(' '.join(['%s_%s' % (w, t) for w, t in zip(words_raw, preds)]))
Example #6
        def process_sent(sentence):
            if casual:
                words_raw = casual_tokenize(sentence.strip())
            else:
                words_raw = [
                    l.strip() for l in re.findall(CODE_TOKENISATION_REGEX,
                                                  sentence.strip())
                    if len(l.strip()) > 0
                ]

            predictions = model.predict(
                [marked_code_tokens_regex.sub(r"\1", w) for w in words_raw])
            if isinstance(predictions, tuple):
                out = [{
                    'word': str(w),
                    'tag': str(t),
                    'language': str(int(lid))
                } for w, (t, lid) in zip(words_raw, zip(*predictions))]
            else:
                out = [{
                    'word': str(w),
                    'tag': str(t)
                } for w, t in zip(words_raw, predictions)]
            print(json.dumps(out))
            return out
Example #7
def tokenize(sentence):
    """Takes a string and returns a list of tokens using NLTK"""
    tokens = nltk.casual_tokenize(
        sentence, preserve_case=False)  # Tokenize the input, all lowercase
    post_process(tokens)
    tokens.append(
        sentence.lower())  # Needed for recognizing Talk to VTA Chat Bot phrase
    log.info("Tokens: %s", tokens)
    return tokens
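For reference, a minimal illustration of the underlying tokenizer call this wraps (output is approximate and depends on the NLTK version; post_process and log are project-specific helpers not shown here):

nltk.casual_tokenize("Talk to the VTA Chat Bot!", preserve_case=False)
# roughly ['talk', 'to', 'the', 'vta', 'chat', 'bot', '!']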
Example #8
def tokenize_no_punct_all_lower(txt):
    txt_tokenize = casual_tokenize(txt,
                                   preserve_case=False,
                                   strip_handles=True)
    txt_tokenize = [
        word for word in txt_tokenize if re.sub(r"\-", "", word).isalpha()
    ]
    txt_tokenize = [word for word in txt_tokenize if word not in stop]
    return txt_tokenize
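A hypothetical call, assuming stop is an English stop-word set such as set(nltk.corpus.stopwords.words('english')) (an assumption, since the snippet does not define it):

tokenize_no_punct_all_lower("The #NLP demo from @someuser is well-known!")
# roughly ['demo', 'well-known']: the handle is stripped, the hashtag and punctuation fail the isalpha filter, and stop words are removed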
Example #9
def process_line_of_code(line: str) -> str:
    words = casual_tokenize(line.strip())
    new_words = list()
    for word in words:
        candidate = ''.join(map(lambda w: ps.stem(w), split_camel_case(word)))
        for new_word in candidate.split("."):
            new_words.append(new_word)
    processed = ' '.join(new_words)
    return processed
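The snippet leans on two names that are not defined here; a plausible setup (an assumption, not the original code) is an NLTK Porter stemmer plus a camel-case splitter:

from nltk.stem import PorterStemmer
import re

ps = PorterStemmer()  # assumed stemmer instance

def split_camel_case(word):
    # assumed helper: break before each upper-case run, e.g. "getUserName" -> ["get", "User", "Name"]
    return re.sub(r'(?<=[a-z0-9])(?=[A-Z])', ' ', word).split()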
Example #10
def tokenise(text):
    """
    Called before any processing of the text has occurred.
    """

    tokens = []
    for word in nltk.casual_tokenize(text, preserve_case=False):
        # filter stop words and numbers at this stage
        if word not in stopWords and not word.isnumeric():
            tokens.append(word)

    return tokens
Example #11
def tokenize(text):
    '''
    :param text: a doc with multiple sentences, type: str
    return a word list, type: list
    https://textminingonline.com/dive-into-nltk-part-ii-sentence-tokenize-and-word-tokenize
    e.g.
    Input: 'It is a nice day. I am happy.'
    Output: ['it', 'is', 'a', 'nice', 'day', 'i', 'am', 'happy']
    '''
    tokens = []
    for word in nltk.casual_tokenize(text, preserve_case=False):
        if word not in stop_words and not word.isnumeric():
            tokens.append(word)
    return tokens
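A quick check against the docstring example (hypothetical; stop_words is assumed to be defined elsewhere in the module). Note that casual_tokenize keeps punctuation tokens such as '.', so they appear in the output unless they are in stop_words:

stop_words = set()  # assumed for this illustration
tokenize('It is a nice day. I am happy.')
# roughly ['it', 'is', 'a', 'nice', 'day', '.', 'i', 'am', 'happy', '.']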
Example #12
def probability_of_fragment(chain, fragment):
    """Return the probability of a fragment occurring"""
    words = [
        word.lower()
        for word in nltk.casual_tokenize(fragment, preserve_case=False)
    ]
    if not words or words[0] not in chain:
        return 0
    totalProb = 1
    for i, word in enumerate(words):
        if i == 0:
            continue
        if words[i] not in chain[words[i - 1]]["dst"].keys():
            return 0
        totalProb = totalProb * chain[words[i - 1]]["dst"][words[i]]["prob"]
    return totalProb
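A hypothetical chain structure this function expects, inferred from how it indexes the dictionary:

chain = {
    'the': {'dst': {'cat': {'prob': 0.5}, 'dog': {'prob': 0.5}}},
    'cat': {'dst': {'sat': {'prob': 1.0}}},
}
probability_of_fragment(chain, 'The cat sat')  # roughly 0.5 * 1.0 = 0.5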
Example #13
def tokenize_SO_row(row_, tag_name='body', all_as_code=False):
    row_ = BeautifulSoup(row_, HTML_PARSER).find(tag_name)
    text__ = [(tag.text, 'Code' if tag.name == 'pre' or tag.name == 'code' else 'NL')
              for tag in row_.childGenerator() if isinstance(tag, bs4.element.Tag)]
    text___ = list()
    for (body_, kind_) in text__:
        if kind_ == 'NL' and not all_as_code:
            toks_ = [casual_tokenize(s) for s in sent_tokenize(body_)]
        elif all_as_code or kind_ == 'Code':
            toks_ = [
                [l.strip()
                 for l in re.findall(CODE_TOKENISATION_REGEX,
                                     line.strip())
                 if len(l.strip()) > 0]
                for line in body_.split('\n')
            ]
        text___ += toks_
    return text___
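A hypothetical call (assumes HTML_PARSER is something like 'html.parser' and that BeautifulSoup, sent_tokenize, and casual_tokenize are imported as in the enclosing module; code blocks would additionally need the project's CODE_TOKENISATION_REGEX):

row = '<body><p>How do I read a file in Python?</p></body>'
tokenize_SO_row(row)
# roughly [['How', 'do', 'I', 'read', 'a', 'file', 'in', 'Python', '?']]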
Example #14
File: cky.py Project: sachan17/CKY-Parser
def parse(sent):
    files = tb.fileids()
    data = list(tb.parsed_sents(files))

    P_grammar, P_non_terms, P_vocab, P_term_parents, P_parents_count = pcfg.pcfg(
        data)

    words = casual_tokenize(str(sent))
    scores, backs = cky_parsing(words, copy(P_grammar), copy(P_non_terms),
                                copy(P_vocab), copy(P_term_parents),
                                copy(P_parents_count))
    start = Tree(Nonterminal('S'), [])
    if scores[0][len(words)][Nonterminal('S')] == 0:
        start = get_start(scores, len(words))
    predicted_tree = build_tree(start, 0, len(words), backs, P_non_terms)
    clean_tree(predicted_tree)
    predicted_tree.un_chomsky_normal_form()
    print('Parsed Tree')
    print(predicted_tree)
Example #15
 def __init__(self, name_of_corpus='corpus'):
     self.name = name_of_corpus
     self.text = open('corpus/' + self.name + '.txt').read()
     self.text_tok = nltk.casual_tokenize(self.text)
     if not os.path.exists('corpus/' + self.name + '_word_int_dict.json'):
         print('create dictionaries')
         self.word_to_int = {}
         self.createdict()
         self.int_to_word = {v: k for k, v in self.word_to_int.items()}
         self.save_text_toke()
         self.save_int_to_word_dict()
         self.save_word_to_int_dict()
     else:
         print('load existing dictionary')
         with open('corpus/' + self.name + '_int_word_dict.json',
                   'r') as int_word_dict:
             self.int_to_word = json.load(int_word_dict)
         with open('corpus/' + self.name + '_word_int_dict.json',
                   'r') as word_int_dict:
             self.word_to_int = json.load(word_int_dict)
Example #16
def code_tag(snippet, context_=None, context_only=True, freq_context_=None, casual=False):
    if casual:
        tokenised = [casual_tokenize(s) for s in snippet.split('\n')]
    else:
        tokenised = [
            [l.strip()
             for l in re.findall(CODE_TOKENISATION_REGEX,
                                 line.strip())
             if len(l.strip()) > 0]
            for line in snippet.split('\n')
        ]
    result = list()
    context_ = dict() if context_ is None else context_
    if context_only:
        assert freq_context_ is not None
        for tokens in tokenised:
            tagged, context_ = annotate_line_using_only_context(tokens, context_, freq_context_)
            result.append(tagged)
    else:
        for tokens in tokenised:
            tagged, context_ = annotate_line(tokens, context_)
            result.append(tagged)
    return result, context_
Example #17
def sent_lit_eval(s):
    if s:
        return nltk.casual_tokenize(s.lower())
    else:
        return []
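A hypothetical one-line check (assuming nltk is imported in the module):

sent_lit_eval('Hello there!')  # roughly ['hello', 'there', '!']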
Example #18
        if location >= limit:
            break
        row = BeautifulSoup(row, HTML_PARSER).find('body')
        text = [(tag.text, 'Code' if tag.name == 'pre' or tag.name == 'code' else 'NL')
                for tag in row.childGenerator() if isinstance(tag, bs4.element.Tag)]
        text_ = list()
        context = dict()
        for i, (body, kind) in enumerate(text):
            if kind == 'Code':
                toks, context = code_tag(body, context, frequency, freq_context)
                if language_id:
                    toks = [[(tok, tag, 1) for tok, tag in s] for s in toks]
                text_.append((i, toks))
        for i, (body, kind) in enumerate(text):
            if kind == 'NL':
                toks = [pos_tag(casual_tokenize(s), tagset="universal") for s in sent_tokenize(body)]
                toks = [[
                    tuple([w, t if t not in ['NOUN', 'VERB'] or w not in context.keys() else context[w]] +
                          ([0 if t not in ['NOUN', 'VERB'] or w not in context.keys() else 1]
                           if language_id else []))
                    for w, t in s] for s in toks]
                text_.append((i, toks))
        text_ = [t for _, t in sorted(text_, key=lambda p: p[0])]
        text = [[[t for t in s] for s in p if len(s) > 0] for p in text_]
        formatted_output = ''.join(['\n'.join([
            '%s %s %d' % t if language_id else '%s %s' % t for t in s]) + '\n\n' for p in text for s in p])
        with open('./data/corpora/SO%s/%s.txt'
                  % (('_Freq' if frequency else '') + ('_Id' if language_id else ''), output_name),
                  'a', encoding='utf-8') as f:
            f.write(formatted_output)
Example #19
__author__ = 'piorkja1'

import nltk
from io import open

stem = open('acro-yoga-reddit-comments.txt', 'r')
wtokens = nltk.word_tokenize(stem.read())
stem = open('acro-yoga-reddit-comments.txt', 'r')
ctokens = nltk.casual_tokenize(stem.read())
stem = open('acro-yoga-reddit-comments.txt', 'r')
stokens = nltk.sent_tokenize(stem.read())

print("word tokens = ", wtokens)
print("casual tokens = ", ctokens)
print("sentence tokens = ", stokens)

file = open("tokenoutput.txt", "w", encoding="utf-8")
file.write(unicode("word tokens " + '\n'))
count = 0
for item in wtokens:
    file.write(unicode("%s\n" % item))
    count += 1
    if count > 100:
        break

count = 0
file.write(unicode("\n\ncasual tokens " + '\n'))
for item in ctokens:
    file.write(unicode("%s\n" % item))
    count += 1
    if count > 100:
        break
Example #20
def parse_cc(lucid_data_,
             cc_block,
             file_out,
             with_l_id_,
             freq_context_=None,
             fuzzy_k=3):
    observed_ = set()
    cc_block_children = [
        k_ for k_ in cc_block.keys() if k_.startswith('Child')
    ]
    for cc_block_child_key in cc_block_children:
        cc_block_child = cc_block[cc_block_child_key]
        if cc_block_child.startswith('Snippet'):
            for annotated_line in cc_block[cc_block_child]:
                line = [
                    tuple([p[0], p[1] if p[1] not in blacklist else '.'] +
                          ([1] if with_l_id_ else []))
                    for p in annotated_line['Tokens']
                ]
                line_2 = copy.deepcopy(line)
                code_toks = [(p[0], p[1]) for p in copy.deepcopy(line)
                             if p[1] != "comment" and p[1] != "keyword"]
                if with_l_id_:
                    for val, tag, l_id in line:
                        if tag == "comment":
                            toks = [
                                pos_tag([
                                    t for t in casual_tokenize(s)
                                    if t not in ['/', '\\', '*']
                                ],
                                        tagset="universal") for s in
                                sent_tokenize(cc_block['CommentText'].strip())
                            ]
                            new_toks = list()
                            for sent in toks:
                                new_sent = list()
                                for val_, tag_ in sent:
                                    try:
                                        short_list = fuzzy_match(
                                            val_, [p[0] for p in code_toks],
                                            fuzzy_k)
                                        just_list = [p[0] for p in short_list]
                                        new_tag = mode([
                                            p[1] for p in code_toks
                                            if p[0] in just_list
                                        ])
                                        if new_tag.isupper():
                                            new_tag = tag_
                                        new_sent.append(
                                            (val_, new_tag,
                                             0 if new_tag.isupper()
                                             or new_tag == '.' else 1))
                                    except IndexError:
                                        new_sent.append((val_, tag_, 0))
                                new_toks.append(new_sent)
                            toks = new_toks
                            if freq_context_ is not None:
                                new_toks = list()
                                for sent in toks:
                                    new_sent = list()
                                    for val_, tag_, _ in sent:
                                        try:
                                            new_tag = \
                                                sorted(freq_context_[val_].items(), reverse=True,
                                                       key=lambda p: p[1])[0][0]
                                            if new_tag.isupper():
                                                new_tag = tag_
                                            new_sent.append(
                                                (val_, new_tag,
                                                 0 if new_tag.isupper()
                                                 or new_tag == '.' else 1))
                                        except IndexError:
                                            new_sent.append((val_, tag_, 0))
                                    new_toks.append(new_sent)
                                toks = new_toks
                            formatted_output = ''.join([
                                '\n'.join(['%s %s %d' % t for t in s]) + '\n\n'
                                for s in toks
                            ])
                            file_out.write(formatted_output)
                            line_2.remove((val, tag, l_id))
                        elif tag == "string_literal":
                            line_2.remove((val, tag, l_id))
                            line_2.append((val.replace('\n', ' '), tag, l_id))
                    file_out.write('\n'.join(
                        ['%s %s %d' % (v, t, l)
                         for v, t, l in line_2]) + '\n\n')
                else:
                    for val, tag in line:
                        if tag == "comment":
                            toks = [
                                pos_tag([
                                    t for t in casual_tokenize(s)
                                    if t not in ['/', '\\', '*']
                                ],
                                        tagset="universal") for s in
                                sent_tokenize(cc_block['CommentText'].strip())
                            ]
                            new_toks = list()
                            for sent in toks:
                                new_sent = list()
                                for val_, tag_ in sent:
                                    try:
                                        short_list = fuzzy_match(
                                            val_, [p[0] for p in code_toks],
                                            fuzzy_k)
                                        just_list = [p[0] for p in short_list]
                                        new_tag = mode([
                                            p[1] for p in code_toks
                                            if p[0] in just_list
                                        ])
                                        if new_tag.isupper():
                                            new_tag = tag_
                                        new_sent.append((val_, new_tag))
                                    except IndexError:
                                        new_sent.append((val_, tag_))
                                new_toks.append(new_sent)
                            toks = new_toks
                            if freq_context_ is not None:
                                new_toks = list()
                                for sent in toks:
                                    new_sent = list()
                                    for val_, tag_ in sent:
                                        try:
                                            new_tag = \
                                                sorted(freq_context_[val_].items(), reverse=True, key=lambda p: p[1])[
                                                    0][
                                                    0]
                                            if new_tag.isupper():
                                                new_tag = tag_
                                            new_sent.append((val_, new_tag))
                                        except IndexError:
                                            new_sent.append((val_, tag_))
                                    new_toks.append(new_sent)
                                toks = new_toks
                            formatted_output = ''.join([
                                '\n'.join(['%s %s' % t for t in s]) + '\n\n'
                                for s in toks
                            ])
                            file_out.write(formatted_output)
                            line_2.remove((val, tag))
                        elif tag == "string_literal":
                            line_2.remove((val, tag))
                            line_2.append((val.replace('\n', ' '), tag))
                    file_out.write(
                        '\n'.join(['%s %s' % (v, t)
                                   for v, t in line_2]) + '\n\n')
        else:
            observed_.add(cc_block_child)
            inner_observed = parse_cc(lucid_data_, lucid_data_[cc_block_child],
                                      file_out, with_l_id_)
            observed_.update(inner_observed)
    if len(cc_block['CommentText']) > 0:
        toks = [
            pos_tag(
                [t for t in casual_tokenize(s) if t not in ['/', '\\', '*']],
                tagset="universal")
            for s in sent_tokenize(cc_block['CommentText'].strip())
        ]
        if freq_context_ is not None:
            if with_l_id_:
                new_toks = list()
                for sent in toks:
                    new_sent = list()
                    for val, tag in sent:
                        try:
                            new_tag = sorted(freq_context_[val].items(),
                                             reverse=True,
                                             key=lambda p: p[1])[0][0]
                            if new_tag.isupper():
                                new_tag = tag
                            new_sent.append(
                                (val, new_tag, 0 if new_tag.isupper()
                                 or new_tag == '.' else 1))
                        except IndexError:
                            new_sent.append((val, tag, 0))
                    new_toks.append(new_sent)
                toks = new_toks
            else:
                new_toks = list()
                for sent in toks:
                    new_sent = list()
                    for val, tag in sent:
                        try:
                            new_tag = sorted(freq_context_[val].items(),
                                             reverse=True,
                                             key=lambda p: p[1])[0][0]
                            if new_tag.isupper():
                                new_tag = tag
                            new_sent.append((val, new_tag))
                        except IndexError:
                            new_sent.append((val, tag))
                    new_toks.append(new_sent)
                toks = new_toks
        if with_l_id_:
            formatted_output = ''.join([
                '\n'.join(['%s %s %d' % t for t in s]) + '\n\n' for s in toks
            ])
        else:
            formatted_output = ''.join(
                ['\n'.join(['%s %s' % t for t in s]) + '\n\n' for s in toks])
        file_out.write(formatted_output)
    return observed_
Example #21
def ctx_lit_eval(s):
    if isinstance(s, str):
        sents = s.split("|")
        return [nltk.casual_tokenize(i.lower()) for i in sents]
    return []
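A hypothetical call showing the pipe-separated input this expects (assuming nltk is imported in the module):

ctx_lit_eval('How are you?|Fine, thanks.')
# roughly [['how', 'are', 'you', '?'], ['fine', ',', 'thanks', '.']]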