def tokens(s):
    if tokenizer == 'words':
        return tokenize(s)
    elif tokenizer == 'raw':
        return s
    else:
        raise ValueError("Unknown tokenizer {}".format(tokenizer))
def parse_rpn(expr):
    '''Convert an infix expression to RPN via the shunting-yard algorithm.'''
    tokens_list = tokens.tokenize(expr)
    result, tmp = [], []  # output queue and operator stack
    i, n = 0, len(tokens_list)
    while i < n:
        token = tokens_list[i]
        if token.ttype in [tokens.Token.INT, tokens.Token.DOUBLE]:
            result.append(token)
        elif token.ttype == tokens.Token.ID:
            # an identifier followed by '(' is a function call; defer it
            if (i + 1) < n and tokens_list[i + 1].ttype == tokens.Token.PAREN_LEFT:
                tmp.append(token)
            else:
                result.append(token)
        elif token.ttype == tokens.Token.PAREN_LEFT:
            tmp.append(token)
        elif token.ttype in tokens.Token.OPERATORS:
            # pop operators of equal or higher priority before pushing
            op_priority = operator_priority(token)
            while tmp and operator_priority(tmp[-1]) >= op_priority:
                result.append(tmp.pop())
            tmp.append(token)
        elif token.ttype == tokens.Token.PAREN_RIGHT:
            while tmp and tmp[-1].ttype != tokens.Token.PAREN_LEFT:
                result.append(tmp.pop())
            tmp.pop()  # discard the matching '('
            # if a function name precedes the '(', emit it now
            if tmp and tmp[-1].ttype == tokens.Token.ID:
                result.append(tmp.pop())
        i += 1
    while tmp:
        result.append(tmp.pop())
    return result
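# A minimal usage sketch for parse_rpn. It assumes the tokens module produces
# Token objects with a .value attribute and that operator_priority ranks '*'
# above '+' -- both are assumptions about code not shown in this snippet:
#
#   rpn = parse_rpn('1 + 2 * 3')
#   print(' '.join(t.value for t in rpn))  # expected output: 1 2 3 * +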
def emphasize(self, line) -> None:
    if len(line) <= 2:
        return
    first_modifier = line[1]
    second_modifier = None
    if str.isalpha(line[2]):
        second_modifier = line[2]
    self.last_line_length = len(line) - 2
    first_markup, second_markup = '{}', None
    if first_modifier == 'B':
        first_markup = BOLD_MARKUP
    elif first_modifier == 'I':  # was comparing first_markup here, a typo
        first_markup = ITALICS_MARKUP
    if second_modifier == 'B':
        second_markup = BOLD_MARKUP
    elif second_modifier == 'I':
        second_markup = ITALICS_MARKUP
    elif second_modifier == 'R':
        second_markup = '{}'
    tokens = tokenize(line)
    result = ''
    for i in range(1, len(tokens)):
        # odd tokens take the first markup, even tokens the second (if set)
        if i % 2 == 1:
            result += first_markup.format(tokens[i])
        elif second_markup:
            result += second_markup.format(tokens[i])
        else:
            result += first_markup.format(tokens[i])
    _result = self.replace_special_symbols(result)
    self.last_line_length = len(self.clean_html(_result))
    self.body += _result
def parse(code, cx):
    toks = [tokens.tok(t) for t in tokens.tokenize(code)] + [None]
    p = Parser(toks, cx.s)
    tree = p.expr()
    if p.tl[:1] != [None]:
        raise ParseException()
    if len(p.tl) > 1:
        cx.warn('not all tokens used')
    return tree
def tokens(sent, tokenizer='word'):
    if tokenizer == 'word':
        return tokenize(sent['raw'])
    elif tokenizer == 'word-clean':
        return sent['tokens']
    elif tokenizer == 'char':
        return list(sent['raw'])
    elif tokenizer == 'phon':
        # remove spaces/punctuation, and lowercase
        return [c.lower() for c in sent['raw'] if c in string.letters]
def generate_positional_score(all_tokens, soup):
    '''Goes through all the tokens for a given webpage and determines each
    token's positional score by examining the webpage's soup object. Scores
    are assigned from the POSITIONAL_SCORING_METRIC dictionary.

    @param all_tokens: all the tokens appearing in a given webpage
    @param soup: the BeautifulSoup object for the webpage in question
    '''
    result = defaultdict(float)
    header_tokens = tokens.tokenize(get_all_header_strings(soup))
    try:
        title_tokens = tokens.tokenize(soup.title.get_text())
    except AttributeError:
        # the page has no <title> element
        title_tokens = dict()
    for token, freq in all_tokens.items():
        result[token] += POSITIONAL_SCORING_METRIC['body']
        if token in header_tokens.keys():
            result[token] += POSITIONAL_SCORING_METRIC['header']
        if token in title_tokens.keys():
            result[token] += POSITIONAL_SCORING_METRIC['title']
    return result
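# Illustrative arithmetic, assuming POSITIONAL_SCORING_METRIC holds weights
# such as {'body': 1.0, 'header': 2.0, 'title': 3.0} (hypothetical values):
# a token appearing in the body, a header, and the title would score
# 1.0 + 2.0 + 3.0 = 6.0, while a body-only token would score 1.0.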
def file(path):
    directory = os.path.dirname(os.path.realpath(os.path.abspath(path)))
    file = open(path, "r+")
    lines = file.readlines()
    linenum = 0
    while linenum < len(lines):
        lines[linenum] = tokens.tokenize(lines[linenum], directory=relpath(path))
        linenum += 1
    content = '\n'.join(lines)
    exec(content)
def read_corpus_into_db(crawled_dict):
    '''Using the bookkeeping dictionary, writes all of the webpage
    information into the database.

    @param crawled_dict: the bookkeeping dictionary read from bookkeeping.json
    '''
    for path, url in crawled_dict.items():
        print("Processing:", path, ' ', url)
        soup = create_soup(open(RAW_DATA_FOLDER + '/' + path))
        try:
            title = soup.title.get_text()
        except AttributeError:
            title = ''
        all_tokens = tokens.tokenize(soup.get_text())
        print("Tokenized: ", title)
        database.write_webpage(path, url, title, all_tokens)
def mgs_require(module_name):
    file = open(module_name, "r+")
    lines = file.readlines()
    lines.insert(0, "from functions import * \n")
    linenum = 0
    while linenum < len(lines):
        lines[linenum] = tokens.tokenize(lines[linenum], directory=relpath(module_name))
        linenum += 1
    source = '\n'.join(lines)
    module = types.ModuleType(module_name)
    exec(source, module.__dict__)
    sys.modules[module_name] = module
    return module
def parse_log(lines):
    """
    Args:
        lines (iterable of str)
            Assumes the trailing newline is part of the line
    Returns:
        (list of LogLineVertex,
         dict(str -> list of LogLineVertex),
         dict(str -> list of LogLineVertex))
        List of the actual log line vertices, dict from tags to the
        vertices that have it, dict from ids to the vertices that have it
    """
    log_lines = []
    tag_map = {}
    id_map = {}
    for line_number, line in enumerate(lines):
        line = line.rstrip()
        ret = _parse_log_line(line)
        # TODO(trevor) for now skip non conforming lines
        if ret is None:
            continue
        (dt, thread_id, msg) = ret

        # 1. Create a log line vertex for each line
        vertex = LogLineVertex(line, msg, line_number, thread_id, dt)
        log_lines.append(vertex)

        # 2. Look for tokens, then create separate tag and id maps.
        # Each map is from token to list of associated vertices
        tokens = tokenize(msg)
        for (token, token_type) in tokens:
            if token_type == TokenType.TAG:
                this_map = tag_map
            elif token_type == TokenType.ID:
                this_map = id_map
            else:
                # Skip stop words and spaces
                continue
            # We use a set here so that a line which has a word more than
            # once doesn't get counted twice
            this_set = this_map.get(token, set())
            this_set.add(vertex)
            this_map[token] = this_set
    # Turn the sets back to lists
    _dict_map(tag_map, lambda s: list(s))
    _dict_map(id_map, lambda s: list(s))
    return (log_lines, tag_map, id_map)
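# A hedged usage sketch for parse_log; it assumes _parse_log_line accepts
# lines shaped like "<time> [<thread>] <message>" (the real format lives in
# code not shown here), and 'app.log' is a hypothetical file name:
#
#   with open('app.log') as f:
#       vertices, tag_map, id_map = parse_log(f)
#   print(len(vertices), 'lines;', len(tag_map), 'tags;', len(id_map), 'ids')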
def parse(s):
    toks = tokens.tokenize(s)
    # an assignment needs at least: identifier, '=', expression, stop token
    if (len(toks) >= 4 and toks[0].isIdentifier() and toks[1].isSymbol("=")):
        varname = toks[0].value
        toks.pop(0)  # drop the identifier
        toks.pop(0)  # drop the '='
        value = parse_expression(toks)
        if not toks[0].isStop():
            raise InputError("Expected operator or end of input", toks[0])
        variables[varname] = value
        print("%s = %g" % (varname, value))
    else:
        result = parse_expression(toks)
        if not toks[0].isStop():
            raise InputError("Expected operator or end of input", toks[0])
        print("==> %g" % result)
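# An illustrative session, assuming the tokenizer yields identifier, symbol,
# and stop tokens as used above (inputs and printed outputs are hypothetical):
#
#   parse("x = 2 + 3")   # prints: x = 5
#   parse("x * 4")       # prints: ==> 20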
def new_subheader(self, subheader_title: str) -> None:
    if self.paragraph:
        self.body += '</div>'
        self.paragraph = False
    if self.subheader:
        self.body += '</div>'
        self.subheader = False
    self.subheader = True
    _title = tokenize(subheader_title.replace('.SS', '').strip())[0]
    self.body += '<h3 id="{}_{}">{}</h3>'.format(self.h_id - 1, self.sub_id, _title)
    self.body += '<div style="padding-left: 4em;">'
    self.contents[str(self.h_id - 1)]['sub'][str(self.sub_id)] = _title
    self.sub_id += 1
def encode_line(line, cmudict, syll_mgr):
    global total
    global success
    total += 1
    line = tokens.clean(line)
    words = tokens.tokenize(line)
    words = tokens.fixtokens(words)
    words = tokens.hyphen(words, cmudict.syll_dict)
    encs = array.array('H')
    for word in words:
        sylls = cmudict.get_syllables(word.lower())
        if sylls is None or len(sylls) == 0:
            continue
        for syll in sylls[0]:
            enc = syll_mgr.get_encoding(syll)
            if enc != syllables.unknown_encoding:
                encs.append(enc)
    return encs
def cmd(self, source):
    '''
    The standard interpreter loop. Reads input, parses it, evaluates it,
    and writes the result to stdout. Continues to run until the user exits
    by sending an EOF.
    '''
    # exit when an EOF is received
    if isinstance(source, EOFError):
        sys.stdout.write(os.linesep)
        return True
    elif isinstance(source, KeyboardInterrupt):
        # clear the line and reset the source when an interrupt is received
        self.prompt = self.standard_prompt
        self.source = u''
        sys.stdout.write(os.linesep)
        return

    # otherwise, parse and evaluate the source code
    try:
        self.source += source
        # evaluate every entered expression sequentially
        for result in parse(tokens.tokenize(self.source)):
            self.stdout.write(util.to_string(evaluate(result, self.env)) + os.linesep)
        # reset the prompt and source
        self.prompt = self.standard_prompt
        self.source = u''
    # allow the user to finish entering a correct expression
    except errors.ParserError:
        self.prompt = self.continue_prompt
        self.source += os.linesep
    # write all other problems and clear source
    except Exception, e:
        traceback.print_exc(file=self.stdout)
        # reset the source and prompt for the next parse
        self.source = u''
        self.prompt = self.standard_prompt
def new_header(self, header_title: str) -> None:
    if self.paragraph:
        self.body += '</div>'
        self.paragraph = False
    if self.subheader:
        self.body += '</div>'
        self.subheader = False
    if self.header:
        self.body += '</div>'
        self.header = False
    self.header = True
    _title = tokenize(header_title.replace('.SH', '').strip())[0]
    self.body += '<h2 id="{}">{}</h2>'.format(self.h_id, _title)
    self.body += '<div style="padding-left: 4em;">'
    self.contents[str(self.h_id)] = {'title': _title, 'sub': {}}
    self.h_id += 1
    self.sub_id = 1
def encode_line(line, cmudict, syll_mgr):
    global total
    global success
    total += 1
    line = tokens.clean(line)
    words = tokens.tokenize(line)
    words = tokens.fixtokens(words)
    words = tokens.hyphen(words, cmudict.syll_dict)
    encs = []
    for word in words:
        sylls = cmudict.get_syllables(word.lower())
        if sylls is None or len(sylls) == 0:
            return None
        for syll in sylls[0]:
            enc = syll_mgr.get_encoding(syll)
            if enc != syllables.unknown_encoding:
                encs.append(enc)
    labels = [0] * syll_mgr.get_size()
    for enc in encs:
        labels[enc] = 1
    success += 1
    return labels
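# Unlike the array-based variant above, this encode_line returns a multi-hot
# vector over the whole syllable vocabulary (or None when a word is out of
# vocabulary). A quick sanity check might look like this, with hypothetical
# inputs:
#
#   labels = encode_line('hello world', cmudict, syll_mgr)
#   if labels is not None:
#       print(sum(labels), 'distinct syllables set')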
def format_log_line_vertex(vertex):
    """
    Formats the log line as a string with ids and tags highlighted.
    Args:
        vertex (vertex.LogLineVertex)
    Returns:
        str
    """
    msg = ""
    # Tokenize the message
    for (token, token_type) in tokenize(vertex.message):
        fmt = token
        if token_type is TokenType.TAG:
            fmt = format_tag(fmt)
        elif token_type is TokenType.ID:
            fmt = format_id(fmt)
        msg += "%s" % fmt
    # Rebuild the log message now
    ret = "%s [%s] %s" % (vertex.time, format_tag(vertex.thread_id), msg)
    return ret
def handle_query(query_str):
    '''Called when a query is presented. Tokenizes the query and generates
    the results for it.

    @param query_str: the string entered by the user which represents the query
    '''
    query_tokens = tokens.tokenize(query_str)
    all_hits = defaultdict()
    if len(query_tokens) == 0:
        raise KeyError()
    # single-term queries skip the merging step entirely
    if len(query_tokens.items()) == 1:
        for query in query_tokens.keys():
            top_fifteen = dict(
                sorted(scripts.database.get_webpages(query).items(),
                       key=lambda x: x[1], reverse=True)[:15])
            return create_engine_info(top_fifteen)
    for query in query_tokens.keys():
        hits = scripts.database.get_webpages(query)
        if hits is None or hits == {}:
            continue
        all_hits[query] = dict(
            sorted(hits.items(), key=lambda x: x[1], reverse=True)[:50])
    top_fifteen = get_top_fifteen(all_hits)
    return create_engine_info(top_fifteen)
def get_words(self, sentence):
    """
    Tokenizes a sentence into individual words.
    Converts Unicode punctuation into ASCII if that option is set.
    Ignores sentences with Unicode if that option is set.
    Returns an empty list of words if the sentence has Unicode and
    that is not allowed.
    """
    if not isinstance(sentence, unicode):
        raise ValueError("All sentences should be Unicode-encoded!")
    sentence = sentence.strip().lower()

    if self.break_replacement:
        sentence = convert_linebreaks(sentence)
    if self.remove_variation_selectors:
        sentence = remove_variation_selectors(sentence)

    # Split into words using simple whitespace splitting and convert
    # Unicode. This is done to prevent word splitting issues with
    # twokenize and Unicode
    words = sentence.split()
    if True:  # if not JAPAN:
        converted_words = []
        for w in words:
            accept_sentence, c_w = self.convert_unicode_word(w)
            # Unicode word detected and not allowed
            if not accept_sentence:
                return []
            else:
                converted_words.append(c_w)
        sentence = ' '.join(converted_words)

    words = tokenize(sentence)
    words = [process_word(w) for w in words]
    return words
def parse_file(self, path):
    '''Reads a file, parses it, and returns the AST.'''
    with open(os.path.abspath(path), 'r') as f:
        return parse(tokens.tokenize(util.file_char_iter(f)))
def evaluate(sexp, env):
    '''
    Given an Atom or list, evaluates it using the given environment
    (global by default) and returns the result as represented in our
    language constructs.
    '''
    # symbol
    if isinstance(sexp, lang.Symbol):
        # look it up in the environment for its value
        return env[sexp]

    # atom (not a literal list)
    elif not lang.Cons.is_list(sexp):
        # it's a generic atom and evaluates to itself
        return sexp

    # list
    else:
        # we can't evaluate functions that have nothing in them
        if len(sexp) == 0:
            raise errors.ApplicationError('nothing to apply')

        # evaluate functions using their arguments
        function = evaluate(sexp.car, env)
        args = sexp.cdr

        # make sure our first item evaluated to a function
        if not isinstance(function, lang.Callable):
            raise errors.ApplicationError('wrong type to apply: ' + str(function))

        # quote
        if function is primitives.quote:
            # return the argument unevaluated
            util.ensure_args(args, num_required=1)
            return args.car

        # quasiquote
        elif function is primitives.quasiquote:
            util.ensure_args(args, num_required=1)
            return quasiquote_evaluate(args.car, env)

        # function
        elif function is primitives.lambda_:
            util.ensure_args(args, num_required=2)
            arg_symbols = args.car
            body = args.cdr.car
            # return a function with the current environment as the parent
            return lang.Function(evaluate, env, arg_symbols, body)

        # macro
        elif function is primitives.macro:
            util.ensure_args(args, num_required=2)
            arg_symbols = args.car
            body = args.cdr.car
            # return a macro with the given symbols and body
            return lang.Macro(evaluate, env, arg_symbols, body)

        # macro expand
        elif function is primitives.expand:
            util.ensure_args(args, num_required=1, is_variadic=True)
            # evaluate to get the macro and its arguments
            m = evaluate(args.car, env)
            arg_expressions = [evaluate(arg, env) for arg in args.cdr]
            # make sure we got a macro
            util.ensure_type(lang.Macro, m)
            return m(evaluate, env, *arg_expressions)

        # define
        elif function is primitives.define:
            util.ensure_args(args, num_required=2)
            symbol = args.car
            value = args.cdr.car
            # make sure we're defining to a symbol
            util.ensure_type(lang.Symbol, symbol)
            # evaluate the argument, map the symbol to the result in the
            # current environment, then return the evaluated value. this
            # allows for chains of definitions, or simultaneous variable
            # assignments to the same value.
            result = evaluate(value, env)
            env[symbol] = result
            # set the function or macro name if possible
            if isinstance(result, lang.Callable):
                result.name(symbol.value)
            return result

        # cond
        elif function is primitives.cond:
            for tup in args:
                # if e is not a list, len() raises an error for us
                if len(tup) != 2:
                    # make sure each is a list of exactly two expressions
                    s = 'expected 2 expressions, got ' + str(len(tup))
                    raise errors.IncorrectArgumentCountError(s)
                # first and second list items are condition and result
                condition = tup.car
                result = tup.cdr.car
                # evaluate and return the result if condition is true
                if evaluate(condition, env):
                    return evaluate(result, env)
            # if no condition was true, the result is undefined
            raise errors.ApplicationError('at least one condition must ' +
                                          'evaluate to ' + tokens.TRUE)

        # logical and
        elif function is primitives.and_:
            util.ensure_args(args, num_required=2, is_variadic=True)
            # evaluate the arguments, returning the last one if none were
            # #f, otherwise the first #f encountered
            last_item = None
            for item in args:
                last_item = evaluate(item, env)
                if last_item is False:
                    break
            return last_item

        # logical or
        elif function is primitives.or_:
            util.ensure_args(args, num_required=2, is_variadic=True)
            # evaluate the arguments, returning the first one that's not #f,
            # otherwise the last evaluated item
            last_item = None
            for item in args:
                last_item = evaluate(item, env)
                if last_item is not False:
                    break
            return last_item

        # eval
        elif function is primitives.eval_:
            util.ensure_args(args, num_required=1)
            # evaluate the given s-expression and return it
            return evaluate(evaluate(args.car, env), env)

        # load
        elif function is primitives.load:
            util.ensure_args(args, num_required=1)
            util.ensure_type(basestring, args.car)
            # evaluate every expression in the file in sequence, top to bottom
            with open(os.path.abspath(args.car), 'r') as f:
                for result in parse(tokens.tokenize(util.file_char_iter(f))):
                    evaluate(result, env)
            # return that we were successful
            return True

        # evaluate macros
        elif isinstance(function, lang.Macro):
            # evaluate the expanded form of the macro in the current environment
            return evaluate(function(evaluate, env, *args), env)

        else:
            # evaluate args and call the function with them
            return function(evaluate, *[evaluate(arg, env) for arg in args])
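# A hedged sketch of driving evaluate through the parse_ helper defined later
# in this file; 'global_env' is a hypothetical environment name and the
# printed results are illustrative:
#
#   for sexp in parse_(u'(define x 5) (+ x 2)'):
#       print evaluate(sexp, global_env)   # prints 5, then 7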
from sets import Set

import sys

import cmudict
import tokens

cmudict = cmudict.CMUDict()
wordset = Set([])
for line in sys.stdin:
    text = line.split("\t")[0]
    words = tokens.tokenize(text)  # tokenize only the text column, not the raw line
    words = tokens.fixtokens(words)
    words = tokens.hyphen(words, cmudict.syll_dict)
    for word in words:
        wordset.add(word)
for word in sorted(wordset):
    print(word)
def batch_sents(batcher, sents):
    return batcher.batch_inp(
        list(batcher.mapper.transform([tokenize(s) for s in sents])))
def processed_input(str_to_show):
    content = input(str_to_show)
    content = tokens.tokenize(content)
    return content
def parse(self, lines):
    """Parse the given lines of text into an AST that represents the
    simple HTML document in the text. Raises a ParseError for parsing
    problems and a TokenizeError for tokenization problems."""
    self.tokens = tokenize(lines, True)
    return Elems(self._elems())
from rnn import RNN
from tokens import tokenize

f = open('input.txt', 'r')
sentences = list(filter(None, f.read().split('\n\n')))
f.close()

# build character-level input/target sequences, skipping unknown tokens
tokens = []
X = []
Y = []
for i in range(len(sentences)):
    tokens.append([])
    for j in range(len(sentences[i])):
        if tokenize(sentences[i][j]) != "UNKNOWN":
            tokens[i].append(tokenize(sentences[i][j]))
    X.append(tokens[i][:-1])
    Y.append(tokens[i][1:])

model = RNN(61, 200, 10)
model.load('uwv.pkl')
model.train(X, Y, 0.1, 10, 1)
import json
import pickle
import sys
import urllib
import urllib2

#import lang_detector_em as lang_detector
import lang_detector
import tokens
from sklearn.feature_extraction.text import CountVectorizer

params = {"q": sys.argv[1], "lang": "en", "rpp": 100, "result_type": "recent"}
api_base_url = "http://search.twitter.com/search.json"
raw_data = json.load(
    urllib2.urlopen(api_base_url + "?" + urllib.urlencode(params)))

# Load Tweets
tweets = []
for i in raw_data["results"]:
    tweets.append(tokens.tokenize(i["text"]))

# Filter out Non-English Tweets
tweets = lang_detector.filter_tweets(tweets)

# Load Classifier
f = open(sys.argv[2], 'rb')
classifier = pickle.load(f)
f.close()

vectorizer = CountVectorizer()
X = vectorizer.fit_transform([" ".join(t) for t in tweets])
y = classifier.predict_proba(X)
print "Positive, Negative :", 100 * sum(y) / sum(sum(y))
from rnn import RNN
from tokens import tokenize, detokenize

model = RNN(61, 200)
model.load('uwv.pkl')
# print(model.U.shape)

# generate(start_token, num_of_chars, top_k random predictions)
text = model.generate(tokenize('F'), 300, 3)
for i in text:
    print(detokenize(i), end="")
import sys
from ast import literal_eval

import cmudict
import languagemodel
import syllables
import tokens

# build a language model of word -> [enc, enc, ...]
sm = languagemodel.SyllableModel()
cmudict = cmudict.CMUDict()
syll_mgr = syllables.syllables()
print('Starting')
count = 0
for line in sys.stdin:
    parts = line.split('\t')
    text = parts[0]
    syllables = literal_eval(parts[1][:-1])  # strip the trailing newline
    text = tokens.clean(text)
    words = tokens.tokenize(text)
    words = tokens.fixtokens(words)
    words = tokens.hyphen(words, cmudict.syll_dict)
    clean = []
    for word in words:
        if word != ',':
            clean.append(word)
    count += 1
    if len(clean) != len(syllables):
        print("Line #: " + str(count))
        print(clean)
        print(syllables)
        continue
    for i in range(len(clean)):
        encs = []
        for s in syllables[i]:
""") def help(): print(""" Following are the all the commands that are avalible except the ones built into the functions.py libraries() -This command shows you all libraries that are used in fuctions.py and are loaded into this at the start of this program's execution file() -This command can execute your file and takes a single argument that being the path to the file terminal() -This command executes commands directly on your terminal so that you wouldn't have to exit and enter program repeatedly. exit() or quit() -Used simply to exit or quit the program. """) def terminal(command): os.system(command) while True: given = input('>>>') given = tokens.tokenize(given) try: exec(given) except Exception as error: print(''.join( format_exception(etype=type(error), value=error, tb=error.__traceback__)))
def parse(s):
    toks = tokens.tokenize(s)
    result = parse_expression(toks)
    if not toks[0].isStop():
        raise InputError("Expected operator or end of input", toks[0])
    return result
def parse_(s):
    '''Parse a string into a list of the S-expressions it describes.'''
    util.ensure_type(basestring, s)
    return lang.Cons.build(*parse(tokens.tokenize(s)))