def create_token_vocab(self):
    factory = utils.VocabularyFactory(reserved=["<eos>"])
    for act in self.ontology.act.f2i:
        factory.update(nltk.casual_tokenize(act))
    for slot in self.ontology.slot.f2i:
        factory.update(nltk.casual_tokenize(slot))
    return factory.get_vocab()

def create_act_slot_tensor(self, vocab):
    act_slots = []
    for as_idx, (act, slot) in self.ontology.act_slot.i2f.items():
        tokens = (list(nltk.casual_tokenize(act))
                  + list(nltk.casual_tokenize(slot))
                  + ["<eos>"])
        tokens = [vocab[token] for token in tokens]
        act_slots.append((as_idx, torch.LongTensor(tokens)))
    act_slots = sorted(act_slots, key=lambda x: x[0])
    act_slots = utils.pad_stack([act_slot[1] for act_slot in act_slots])
    return act_slots

def gen_lit_eval(s):
    if isinstance(s, str):
        # Strings that look like Python list literals (e.g. '["a", "b"]') are
        # parsed with ast.literal_eval; everything else stays a plain string.
        if ('["' in s or "['" in s) and ('"]' in s or "']" in s):
            new_s = ast.literal_eval(s)
        else:
            new_s = s
        if isinstance(new_s, str):
            return [nltk.casual_tokenize(new_s.lower())]
        else:
            return [nltk.casual_tokenize(i.lower()) for i in new_s]
    return []

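# Hedged usage sketch for gen_lit_eval above (the example strings are invented):
# a list-literal-looking string is split into one token list per element, while
# a plain string yields a single token list.
example_list = gen_lit_eval('["Good morning!", "How are you?"]')
# roughly [['good', 'morning', '!'], ['how', 'are', 'you', '?']]
example_plain = gen_lit_eval("Good morning!")
# roughly [['good', 'morning', '!']]
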
def tensorize_turn_label_asv(self, asv: ActSlotValue):
    if asv == self.asv_pad:
        return self.tensorize_processed_tokens(("<pad>",))
    if asv.act == "inform":
        slot, value = asv.slot, asv.value
    elif asv.act == "request":
        slot, value = "request", asv.value
    else:
        raise RuntimeError(f"unexpected act: {asv.act}")
    tokens = (list(nltk.casual_tokenize(slot))
              + ["="]
              + list(nltk.casual_tokenize(value))
              + ["<eos>"])
    return self.tensorize_processed_tokens(tokens)

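# Hedged illustration of the token sequences built above (the slot/value names
# are invented, DSTC2-style): an inform ASV such as (act="inform", slot="food",
# value="italian") yields ["food", "=", "italian", "<eos>"], while a request
# ASV such as (act="request", value="phone") yields
# ["request", "=", "phone", "<eos>"].
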
def interactive_shell(model, casual=False):
    """Creates an interactive shell to play with the model.

    Args:
        model: instance of NERModel
        casual: whether to use nltk's casual_tokenize instead of the
            code-aware regex tokenisation
    """
    model.logger.info("""
This is an interactive mode.
To exit, enter 'exit'.
You can enter a sentence like
input> If you have a java.io.InputStream object, how should you process that object and produce a String?""")
    while True:
        sentence = input("input> ")
        if casual:
            words_raw = casual_tokenize(sentence.strip())
        else:
            words_raw = [
                l.strip()
                for l in re.findall(CODE_TOKENISATION_REGEX, sentence.strip())
                if len(l.strip()) > 0
            ]
        if words_raw == ["exit"]:
            break
        preds = model.predict(words_raw)
        if isinstance(preds, tuple):
            preds = preds[0]
        print(' '.join(['%s_%s' % (w, t) for w, t in zip(words_raw, preds)]))

def process_sent(sentence):
    if casual:
        words_raw = casual_tokenize(sentence.strip())
    else:
        words_raw = [
            l.strip()
            for l in re.findall(CODE_TOKENISATION_REGEX, sentence.strip())
            if len(l.strip()) > 0
        ]
    predictions = model.predict(
        [marked_code_tokens_regex.sub(r"\1", w) for w in words_raw])
    if isinstance(predictions, tuple):
        out = [{'word': str(w), 'tag': str(t), 'language': str(int(lid))}
               for w, (t, lid) in zip(words_raw, zip(*predictions))]
    else:
        out = [{'word': str(w), 'tag': str(t)}
               for w, t in zip(words_raw, predictions)]
    print(json.dumps(out))
    return out

def tokenize(sentence):
    """Takes a string and returns a list of tokens using NLTK."""
    # Tokenize the input, all lowercase.
    tokens = nltk.casual_tokenize(sentence, preserve_case=False)
    post_process(tokens)
    # Append the whole lowercased sentence: needed for recognizing the
    # "Talk to VTA Chat Bot" phrase.
    tokens.append(sentence.lower())
    log.info("Tokens: %s", tokens)
    return tokens

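# Hedged usage sketch (post_process and log are defined elsewhere in this
# module, so the exact output depends on them); the key point is that the last
# element is always the whole lowercased sentence, which lets callers match
# multi-word phrases such as "talk to vta chat bot".
example_tokens = tokenize("Talk to VTA Chat Bot")
# roughly ['talk', 'to', 'vta', 'chat', 'bot', 'talk to vta chat bot']
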
def tokenize_no_punct_all_lower(txt):
    txt_tokenize = casual_tokenize(txt, preserve_case=False, strip_handles=True)
    txt_tokenize = [
        word for word in txt_tokenize
        if re.sub(r"\-", "", word).isalpha()
    ]
    txt_tokenize = [word for word in txt_tokenize if word not in stop]
    return txt_tokenize

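# Hedged usage sketch, assuming `stop` is a module-level collection of English
# stopwords (e.g. nltk.corpus.stopwords.words('english')): handles are
# stripped, non-alphabetic tokens dropped, and stopwords removed.
example_clean = tokenize_no_punct_all_lower("@bob Check out the write-up at 10am!")
# roughly ['check', 'write-up']
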
def process_line_of_code(line: str) -> str:
    words = casual_tokenize(line.strip())
    new_words = list()
    for word in words:
        # Split camel case, stem each piece, then split dotted names on '.'.
        candidate = ''.join(ps.stem(w) for w in split_camel_case(word))
        for new_word in candidate.split("."):
            new_words.append(new_word)
    processed = ' '.join(new_words)
    return processed

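# process_line_of_code relies on `ps` and `split_camel_case`, which live
# elsewhere in this project; a minimal sketch of what they are assumed to look
# like (a Porter stemmer plus a simple camel-case splitter):
import re
from nltk.stem import PorterStemmer

ps = PorterStemmer()

def split_camel_case(word):
    # "getUserName" -> ["get", "User", "Name"]; non-camel-case tokens pass through.
    parts = re.findall(r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z0-9]+|[A-Z]+|\S", word)
    return parts if parts else [word]
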
def tokenise(text):
    """Called before any processing of the text has occurred."""
    tokens = []
    for word in nltk.casual_tokenize(text, preserve_case=False):
        # Filter out stopwords and numbers at this stage.
        if word not in stopWords and not word.isnumeric():
            tokens.append(word)
    return tokens

def tokenize(text):
    '''
    :param text: a doc with multiple sentences, type: str
    :return: a word list (stopwords and numbers removed), type: list
    https://textminingonline.com/dive-into-nltk-part-ii-sentence-tokenize-and-word-tokenize
    e.g. (assuming stop_words is NLTK's English stopword list)
    Input: 'It is a nice day. I am happy.'
    Output: ['nice', 'day', '.', 'happy', '.']
    '''
    tokens = []
    for word in nltk.casual_tokenize(text, preserve_case=False):
        if word not in stop_words and not word.isnumeric():
            tokens.append(word)
    return tokens

def probability_of_fragment(chain, fragment):
    """Return the probability of a fragment occurring."""
    words = [
        word.lower()
        for word in nltk.casual_tokenize(fragment, preserve_case=False)
    ]
    if words[0] not in chain.keys():
        return 0
    totalProb = 1
    for i, word in enumerate(words):
        if i == 0:
            continue
        if words[i] not in chain[words[i - 1]]["dst"].keys():
            return 0
        totalProb = totalProb * chain[words[i - 1]]["dst"][words[i]]["prob"]
    return totalProb

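# Hedged usage sketch: judging from the lookups above, the chain is a dict of
# the form {word: {"dst": {next_word: {"prob": p}, ...}}, ...}; the example
# chain below is invented.
example_chain = {
    "the": {"dst": {"cat": {"prob": 0.5}, "dog": {"prob": 0.5}}},
    "cat": {"dst": {"sat": {"prob": 1.0}}},
}
probability_of_fragment(example_chain, "The cat sat")   # 0.5 * 1.0 = 0.5
probability_of_fragment(example_chain, "The bird sat")  # 0 (unseen transition)
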
def tokenize_SO_row(row_, tag_name='body', all_as_code=False):
    row_ = BeautifulSoup(row_, HTML_PARSER).find(tag_name)
    text__ = [(tag.text, 'Code' if tag.name in ('pre', 'code') else 'NL')
              for tag in row_.childGenerator()
              if isinstance(tag, bs4.element.Tag)]
    text___ = list()
    for (body_, kind_) in text__:
        if kind_ == 'NL' and not all_as_code:
            toks_ = [casual_tokenize(s) for s in sent_tokenize(body_)]
        elif all_as_code or kind_ == 'Code':
            toks_ = [
                [l.strip()
                 for l in re.findall(CODE_TOKENISATION_REGEX, line.strip())
                 if len(l.strip()) > 0]
                for line in body_.split('\n')
            ]
        text___ += toks_
    return text___

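# Hedged usage sketch (the HTML fragment is invented; HTML_PARSER and
# CODE_TOKENISATION_REGEX are module-level constants defined elsewhere): prose
# children of the body are sentence- and casual-tokenised, code children are
# split line by line with the code regex, one token list per sentence/line.
example_row = "<body><p>Use a loop.</p><pre>for i in range(3):\n    print(i)</pre></body>"
tokenize_SO_row(example_row)
# roughly [['Use', 'a', 'loop', '.'],
#          [tokens of 'for i in range(3):'],
#          [tokens of 'print(i)']]
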
def parse(sent):
    files = tb.fileids()
    data = list(tb.parsed_sents(files))
    P_grammar, P_non_terms, P_vocab, P_term_parents, P_parents_count = pcfg.pcfg(data)
    words = casual_tokenize(str(sent))
    scores, backs = cky_parsing(words, copy(P_grammar), copy(P_non_terms),
                                copy(P_vocab), copy(P_term_parents),
                                copy(P_parents_count))
    start = Tree(Nonterminal('S'), [])
    if scores[0][len(words)][Nonterminal('S')] == 0:
        start = get_start(scores, len(words))
    predicted_tree = build_tree(start, 0, len(words), backs, P_non_terms)
    clean_tree(predicted_tree)
    predicted_tree.un_chomsky_normal_form()
    print('Parsed Tree')
    print(predicted_tree)

def __init__(self, name_of_corpus='corpus'):
    self.name = name_of_corpus
    self.text = open('corpus/' + self.name + '.txt').read()
    self.text_tok = nltk.casual_tokenize(self.text)
    if not os.path.exists('corpus/' + self.name + '_word_int_dict.json'):
        print('create dictionaries')
        self.word_to_int = {}
        self.createdict()
        self.int_to_word = {v: k for k, v in self.word_to_int.items()}
        self.save_text_toke()
        self.save_int_to_word_dict()
        self.save_word_to_int_dict()
    else:
        print('load existing dictionary')
        with open('corpus/' + self.name + '_int_word_dict.json', 'r') as int_word_dict:
            self.int_to_word = json.load(int_word_dict)
        with open('corpus/' + self.name + '_word_int_dict.json', 'r') as word_int_dict:
            self.word_to_int = json.load(word_int_dict)

def code_tag(snippet, context_=None, context_only=True, freq_context_=None, casual=False):
    if casual:
        tokenised = [casual_tokenize(s) for s in snippet.split('\n')]
    else:
        tokenised = [
            [l.strip()
             for l in re.findall(CODE_TOKENISATION_REGEX, line.strip())
             if len(l.strip()) > 0]
            for line in snippet.split('\n')
        ]
    result = list()
    context_ = dict() if context_ is None else context_
    if context_only:
        assert freq_context_ is not None
        for tokens in tokenised:
            tagged, context_ = annotate_line_using_only_context(tokens, context_, freq_context_)
            result.append(tagged)
    else:
        for tokens in tokenised:
            tagged, context_ = annotate_line(tokens, context_)
            result.append(tagged)
    return result, context_

def sent_lit_eval(s):
    if s:
        return nltk.casual_tokenize(s.lower())
    else:
        return []

if location >= limit:
    break
row = BeautifulSoup(row, HTML_PARSER).find('body')
text = [(tag.text, 'Code' if tag.name in ('pre', 'code') else 'NL')
        for tag in row.childGenerator()
        if isinstance(tag, bs4.element.Tag)]
text_ = list()
context = dict()
for i, (body, kind) in enumerate(text):
    if kind == 'Code':
        toks, context = code_tag(body, context, frequency, freq_context)
        if language_id:
            toks = [[(tok, tag, 1) for tok, tag in s] for s in toks]
        text_.append((i, toks))
for i, (body, kind) in enumerate(text):
    if kind == 'NL':
        toks = [pos_tag(casual_tokenize(s), tagset="universal")
                for s in sent_tokenize(body)]
        toks = [[tuple([w, t if t not in ['NOUN', 'VERB'] or w not in context.keys()
                        else context[w]]
                       + ([0 if t not in ['NOUN', 'VERB'] or w not in context.keys() else 1]
                          if language_id else []))
                 for w, t in s]
                for s in toks]
        text_.append((i, toks))
text_ = [t for _, t in sorted(text_, key=lambda p: p[0])]
text = [[[t for t in s] for s in p if len(s) > 0] for p in text_]
formatted_output = ''.join(
    ['\n'.join(['%s %s %d' % t if language_id else '%s %s' % t for t in s]) + '\n\n'
     for p in text for s in p])
with open('./data/corpora/SO%s/%s.txt'
          % (('_Freq' if frequency else '') + ('_Id' if language_id else ''), output_name),
          'a', encoding='utf-8') as f:
    f.write(formatted_output)

__author__ = 'piorkja1'

import nltk
from io import open

stem = open('acro-yoga-reddit-comments.txt', 'r')
wtokens = nltk.word_tokenize(stem.read())
stem = open('acro-yoga-reddit-comments.txt', 'r')
ctokens = nltk.casual_tokenize(stem.read())
stem = open('acro-yoga-reddit-comments.txt', 'r')
stokens = nltk.sent_tokenize(stem.read())

print("word tokens = ", wtokens)
print("casual tokens = ", ctokens)
print("sentence tokens = ", stokens)

file = open("tokenoutput.txt", "w", encoding="utf-8")
file.write(unicode("word tokens " + '\n'))
count = 0
for item in wtokens:
    file.write(unicode("%s\n" % item))
    count += 1
    if count > 100:
        break
count = 0
file.write(unicode("\n\ncasual tokens " + '\n'))
for item in ctokens:
    file.write(unicode("%s\n" % item))
    count += 1
    if count > 100:
        break

def parse_cc(lucid_data_, cc_block, file_out, with_l_id_, freq_context_=None, fuzzy_k=3):
    observed_ = set()
    cc_block_children = [
        k_ for k_ in cc_block.keys() if k_.startswith('Child')
    ]
    for cc_block_child_key in cc_block_children:
        cc_block_child = cc_block[cc_block_child_key]
        if cc_block_child.startswith('Snippet'):
            for annotated_line in cc_block[cc_block_child]:
                line = [
                    tuple([p[0], p[1] if p[1] not in blacklist else '.']
                          + ([1] if with_l_id_ else []))
                    for p in annotated_line['Tokens']
                ]
                line_2 = copy.deepcopy(line)
                code_toks = [(p[0], p[1]) for p in copy.deepcopy(line)
                             if p[1] != "comment" and p[1] != "keyword"]
                if with_l_id_:
                    for val, tag, l_id in line:
                        if tag == "comment":
                            toks = [
                                pos_tag([t for t in casual_tokenize(s)
                                         if t not in ['/', '\\', '*']],
                                        tagset="universal")
                                for s in sent_tokenize(cc_block['CommentText'].strip())
                            ]
                            # Re-tag comment words by fuzzy-matching them against the
                            # code tokens of the same line.
                            new_toks = list()
                            for sent in toks:
                                new_sent = list()
                                for val_, tag_ in sent:
                                    try:
                                        short_list = fuzzy_match(
                                            val_, [p[0] for p in code_toks], fuzzy_k)
                                        just_list = [p[0] for p in short_list]
                                        new_tag = mode([p[1] for p in code_toks
                                                        if p[0] in just_list])
                                        if new_tag.isupper():
                                            new_tag = tag_
                                        new_sent.append(
                                            (val_, new_tag,
                                             0 if new_tag.isupper() or new_tag == '.' else 1))
                                    except IndexError:
                                        new_sent.append((val_, tag_, 0))
                                new_toks.append(new_sent)
                            toks = new_toks
                            if freq_context_ is not None:
                                new_toks = list()
                                for sent in toks:
                                    new_sent = list()
                                    for val_, tag_ in sent:
                                        try:
                                            new_tag = sorted(
                                                freq_context_[val_].items(),
                                                reverse=True,
                                                key=lambda p: p[1])[0][0]
                                            if new_tag.isupper():
                                                new_tag = tag_
                                            new_sent.append(
                                                (val_, new_tag,
                                                 0 if new_tag.isupper() or new_tag == '.' else 1))
                                        except IndexError:
                                            new_sent.append((val_, tag_, 0))
                                    new_toks.append(new_sent)
                                toks = new_toks
                            formatted_output = ''.join([
                                '\n'.join(['%s %s %d' % t for t in s]) + '\n\n'
                                for s in toks
                            ])
                            file_out.write(formatted_output)
                            line_2.remove((val, tag, l_id))
                        elif tag == "string_literal":
                            line_2.remove((val, tag, l_id))
                            line_2.append((val.replace('\n', ' '), tag, l_id))
                    file_out.write('\n'.join(
                        ['%s %s %d' % (v, t, l) for v, t, l in line_2]) + '\n\n')
                else:
                    for val, tag in line:
                        if tag == "comment":
                            toks = [
                                pos_tag([t for t in casual_tokenize(s)
                                         if t not in ['/', '\\', '*']],
                                        tagset="universal")
                                for s in sent_tokenize(cc_block['CommentText'].strip())
                            ]
                            new_toks = list()
                            for sent in toks:
                                new_sent = list()
                                for val_, tag_ in sent:
                                    try:
                                        short_list = fuzzy_match(
                                            val_, [p[0] for p in code_toks], fuzzy_k)
                                        just_list = [p[0] for p in short_list]
                                        new_tag = mode([p[1] for p in code_toks
                                                        if p[0] in just_list])
                                        if new_tag.isupper():
                                            new_tag = tag_
                                        # No language id in this branch: keep (token, tag)
                                        # pairs so the '%s %s' formatting below works.
                                        new_sent.append((val_, new_tag))
                                    except IndexError:
                                        new_sent.append((val_, tag_))
                                new_toks.append(new_sent)
                            toks = new_toks
                            if freq_context_ is not None:
                                new_toks = list()
                                for sent in toks:
                                    new_sent = list()
                                    for val_, tag_ in sent:
                                        try:
                                            new_tag = sorted(
                                                freq_context_[val_].items(),
                                                reverse=True,
                                                key=lambda p: p[1])[0][0]
                                            if new_tag.isupper():
                                                new_tag = tag_
                                            new_sent.append((val_, new_tag))
                                        except IndexError:
                                            new_sent.append((val_, tag_))
                                    new_toks.append(new_sent)
                                toks = new_toks
                            formatted_output = ''.join([
                                '\n'.join(['%s %s' % t for t in s]) + '\n\n'
                                for s in toks
                            ])
                            file_out.write(formatted_output)
                            line_2.remove((val, tag))
                        elif tag == "string_literal":
                            line_2.remove((val, tag))
                            line_2.append((val.replace('\n', ' '), tag))
                    file_out.write(
                        '\n'.join(['%s %s' % (v, t) for v, t in line_2]) + '\n\n')
        else:
            observed_.add(cc_block_child)
            inner_observed = parse_cc(lucid_data_, lucid_data_[cc_block_child],
                                      file_out, with_l_id_)
            # Accumulate the children observed by the recursive call.
            observed_.update(inner_observed)
    if len(cc_block['CommentText']) > 0:
        toks = [
            pos_tag([t for t in casual_tokenize(s) if t not in ['/', '\\', '*']],
                    tagset="universal")
            for s in sent_tokenize(cc_block['CommentText'].strip())
        ]
        if freq_context_ is not None:
            if with_l_id_:
                new_toks = list()
                for sent in toks:
                    new_sent = list()
                    for val, tag in sent:
                        try:
                            new_tag = sorted(freq_context_[val].items(),
                                             reverse=True,
                                             key=lambda p: p[1])[0][0]
                            if new_tag.isupper():
                                new_tag = tag
                            new_sent.append(
                                (val, new_tag,
                                 0 if new_tag.isupper() or new_tag == '.' else 1))
                        except IndexError:
                            new_sent.append((val, tag, 0))
                    new_toks.append(new_sent)
                toks = new_toks
            else:
                new_toks = list()
                for sent in toks:
                    new_sent = list()
                    for val, tag in sent:
                        try:
                            new_tag = sorted(freq_context_[val].items(),
                                             reverse=True,
                                             key=lambda p: p[1])[0][0]
                            if new_tag.isupper():
                                new_tag = tag
                            new_sent.append((val, new_tag))
                        except IndexError:
                            new_sent.append((val, tag))
                    new_toks.append(new_sent)
                toks = new_toks
        if with_l_id_:
            formatted_output = ''.join([
                '\n'.join(['%s %s %d' % t for t in s]) + '\n\n'
                for s in toks
            ])
        else:
            formatted_output = ''.join(
                ['\n'.join(['%s %s' % t for t in s]) + '\n\n' for s in toks])
        file_out.write(formatted_output)
    return observed_

def ctx_lit_eval(s):
    if isinstance(s, str):
        sents = s.split("|")
        return [nltk.casual_tokenize(i.lower()) for i in sents]
    return []

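# Hedged usage sketch: contexts are stored as a single '|'-separated string, e.g.
#   ctx_lit_eval("Hello there|How are you?")
#   # -> [['hello', 'there'], ['how', 'are', 'you', '?']]
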