def command_query(self, *args):
    """
    Literal SQL query
    """
    # Make sure the query is in double inverted commas
    raw_string = " ".join(args)
    re_match = re.search(r'^([^"]*)"(([^\]\"|[^"])+)"([^"]*)$', raw_string)
    if re_match is None:
        yield from self.send_to_client(
            "The query to be executed needs to be properly wrapped in "
            "double inverted commas."
        )
        return

    # E.g., query this "is a" test
    pre_args = re_match.group(1)   # 'this '
    query = re_match.group(2)      # 'is a'
    post_args = re_match.group(4)  # ' test'
    logger.debug(
        "Client issued 'query'. "
        "<pre_args: {}, query: {}, post_args: {}>".format(pre_args, query,
                                                          post_args)
    )

    request = {
        'command': 'literal_query',
        'query': query,
        'limit': self.config['per_page']
    }
    yield from self.send_to_server(request)

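# A minimal standalone sketch (not part of the original module) showing how
# the quoted-query regex above splits a command line into pre_args, query and
# post_args. The sample input string is hypothetical.
import re

demo = 'this "is a" test'
m = re.search(r'^([^"]*)"(([^\]\"|[^"])+)"([^"]*)$', demo)
if m is not None:
    print(m.group(1))  # 'this '
    print(m.group(2))  # 'is a'
    print(m.group(4))  # ' test'
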
def tokenise(sentence):
    ## General pre-processing
    # Remove common URL patterns from our sentence.
    logger.debug("Tokenising: '" + sentence + "'")
    sentence = re.sub(r'https?://[^ ]*', '', sentence)
    logger.debug("URLs removed: '" + sentence + "'")

    # For now, just use the default nltk tokeniser
    # TODO: Try the pyenchant tokeniser
    tokenised = nltk.tokenize.word_tokenize(sentence)

    ## General token processing
    # Split words with slashes into multiple tokens
    temp = []
    for token in tokenised:
        split = token.split("/")
        temp = temp + split
    tokenised = temp

    # Remove punctuation, then whitespace.
    tokenised = [token.strip("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~").strip()
                 for token in tokenised]

    # Drop empty tokens and return
    return [token for token in tokenised if token != '']

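# A hypothetical usage sketch of tokenise() (assumes nltk and its 'punkt'
# tokeniser data are installed); the exact tokens depend on the nltk version.
tokens = tokenise("Check the docs at https://example.com if they're good/useful!")
print(tokens)
# Expected, roughly: ['Check', 'the', 'docs', 'at', 'if', 'they', 're',
#                     'good', 'useful']
# (the URL is stripped, "good/useful" is split on the slash, and the
#  apostrophe and '!' are removed as punctuation)
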
def _receive_to_queue(self):
    try:
        while True:
            msg = yield from self.websocket.recv()
            if msg is None:
                logger.info("[{}] Client connection closed.".format(
                    self.websocket.remote_ip))
                break

            # Attempt to parse JSON
            try:
                msg = json.loads(msg)
            except ValueError:
                logger.error("[{}] Bad input from client. "
                             "(Could not parse JSON)".format(
                                 self.websocket.remote_ip))
                break

            yield from self.input_queue.put(msg)
            logger.info("[{}] [RECV] {}".format(self.websocket.remote_ip,
                                                msg))
    except CancelledError:
        logger.debug("[{}] CancelledError on receiver -- "
                     "Should not be happening.".format(
                         self.websocket.remote_ip))

def _send_from_queue(self):
    preview_length = 80

    # ======================================
    @asyncio.coroutine
    def execute(msg):
        msg_preview = (msg[0:preview_length].replace('\n', '\\n')
                       .replace('\r', '\\r'))
        if len(msg) > preview_length:
            msg_preview += "..."

        self.writer.write(msg.encode())
        yield from self.writer.drain()
        logger.info("[{}] [SEND] {}".format(self.remote_ip, msg_preview))
    # ======================================

    try:
        while True:
            msg = yield from self.output_queue.get()
            yield from execute(msg)
    except CancelledError:
        logger.debug("[{}] Cancelling sender...".format(self.remote_ip))

        # Goodbye, client
        yield from self.output_queue.put("Server closing connection -- "
                                         "Goodbye.")
        while self.output_queue.qsize() > 0:
            msg = self.output_queue.get_nowait()
            yield from execute(msg)

def run_parser(self):
    # We're starting to communicate with the server; put in a preliminary
    # motd command.
    # yield from self.client_input.put(b'motd\r\n')
    yield from self.command_motd()

    # Keep reading and processing client_input/raw_output
    communication_tasks = [asyncio.async(self._read_client_input()),
                           asyncio.async(self._read_server_output())]
    try:
        yield from asyncio.wait(communication_tasks,
                                return_when=FIRST_COMPLETED)
    except CancelledError:
        logger.debug("[{}] Cancelling parser...".format(self.remote_ip))

    # If we're here, either the parser was cancelled or the client raised
    # a TelnetExit
    got_exception = None
    for task in communication_tasks:
        if task.done():
            e = task.exception()
            if isinstance(e, Exception):
                got_exception = e
        else:
            task.cancel()
            yield from task

    if got_exception is not None:
        raise got_exception

def has_spelling_language_unique(tokenised, language):
    for token in tokenised:
        if word_in_dictionary_unique(token, language):
            logger.debug(
                "Found unique word '{0}' for language: {1}".format(token,
                                                                   language)
            )
            return True
    return False

def command_search(self, *args):
    """
    FTS search request
    """
    if len(args) == 0:
        yield from self.send_to_client("Syntax is:\r\n\r\n"
                                       "db search \"<query>\" "
                                       "<page number>")
        return

    # Make sure the query is in double inverted commas
    raw_string = " ".join(args)
    re_match = re.search(r'^([^"]*)"(([^\]\"|[^"])+)"([^"]*)$', raw_string)
    if re_match is None:
        yield from self.send_to_client(
            "The search string needs to be properly wrapped in double "
            "inverted commas."
        )
        return

    # E.g., this "is a" test
    pre_args = re_match.group(1)   # 'this '
    query = re_match.group(2)      # 'is a'
    post_args = re_match.group(4)  # ' test'
    logger.debug(
        "Client issued 'search'. "
        "<pre_args: {}, query: {}, post_args: {}>".format(pre_args, query,
                                                          post_args)
    )

    post_args = post_args.strip()
    if post_args != '':
        try:
            page_number = int(post_args)
        except ValueError:
            yield from self.send_to_client(
                "'{}' is not a valid page number.\r\n\r\n"
                "Syntax is:\r\n\r\n"
                "db search \"<query>\" <page number>".format(post_args)
            )
            return
    else:
        page_number = 1

    # Transitional: The server's search function currently takes an
    # encoded URL string (for some reason...)
    request = {
        'command': 'search',
        'query': "s={}&p={}".format(query, page_number),
        'perpage': self.config['per_page']
    }
    yield from self.send_to_server(request)

def tokenise_as_list(self, text):
    """
    Memoised version of the tokeniser; returns a list instead of an iterator.

    This cache should not need to be cleared; any modifications to the
    tokeniser will only take effect on restart
    """
    logger.debug('Running OCETokeniser: {}'.format(text))

    tokenised_list = []
    token_open = False
    token_start = 0
    token_end = 0
    for char in text:
        if re.match(r'[^a-zA-Z0-9]', char):
            # The character is non-alphanumeric. If it is within the
            # ASCII range, close the current token and yield it. If
            # not, yield it as a new token.
            if ord(char) <= 127:
                if token_open:
                    # Was in token. Yield token and advance cursors
                    # to next character.
                    token_open = False
                    tokenised_list.append(
                        self.yield_token(text, token_start, token_end))
                    token_end += 1
                    token_start = token_end
                else:
                    # Was not in token. Advance cursors in tandem.
                    token_end += 1
                    token_start += 1
            else:
                if token_open:
                    # Was in token. Yield and advance start cursor.
                    token_open = False
                    tokenised_list.append(
                        self.yield_token(text, token_start, token_end))
                    token_start = token_end

                # Yield one more character (the non-ASCII one)
                token_end += 1
                tokenised_list.append(
                    self.yield_token(text, token_start, token_end))
                token_start = token_end
        else:
            # In a token. Move the end cursor.
            token_open = True
            token_end += 1

    # Yield the last token
    if token_open:
        tokenised_list.append(
            self.yield_token(text, token_start, token_end))

    return tokenised_list

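# Behaviour sketch (illustrative only): ASCII non-alphanumerics act as token
# separators, while each non-ASCII character is emitted as its own
# single-character token. For a hypothetical input like "ok, 谢谢" the loop
# above would yield tokens covering "ok", "谢" and "谢" (the exact return
# values depend on yield_token's contract).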
def _watch_client(self, client):
    """
    Resolves the given future when the specified client provides some input.

    The future will contain a reference to the client as well, so we know
    exactly who gave us the input.
    (We wouldn't get this information if we were waiting only on each
    client's bare get_input_async())
    """
    try:
        message = yield from client.get_input_async()
        return client, message
    except CancelledError:
        logger.debug("_watch_client cancelled: We either lost the client or "
                     "are shutting down.")

def exec_find_features(self, feature_name):
    """
    Finds all language-tagged records in the corpus which match a certain
    classifier feature
    """
    labelled = self.provider.fetch_search_results("has:language", 0, 0)
    raw_data = labelled["results"]
    for datum in raw_data:
        if self.langid.extract_features(datum["content"])[feature_name]:
            logger.debug(
                "Record {0} matches '{1}': {2}.\nLabel is {3}".format(
                    datum["rowid"], feature_name, datum["content"],
                    datum["language"]
                )
            )

def _read_server_output(self):
    task = None
    try:
        while True:
            task = asyncio.async(self.raw_output.get())
            msg = yield from task
            task = asyncio.async(self.parse_server_output(msg))
            yield from task
    except CancelledError:
        logger.debug("[{}] Cancelling parser's output loop...".format(
            self.remote_ip))
        # Note: wait_for doesn't work here, for some reason.
        yield from asyncio.wait([task])

def word_in_dictionary(word, language):
    dictionaries = spelling_dictionaries[language]

    # Checks common variants of the word
    to_check = [word, word.upper(), word.lower(), word.title()]
    for lang_variant, dictionary in dictionaries.items():
        for variant in to_check:
            if dictionary.check(variant):
                logger.debug(
                    "Found '{0}' in dictionary: '{1}'".format(variant,
                                                              lang_variant)
                )
                return True
    return False

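# A hypothetical sketch of how spelling_dictionaries might be populated with
# pyenchant Dict objects (the language codes and variants shown here are
# illustrative, not taken from the original module; pyenchant and the
# corresponding system dictionaries are assumed to be installed).
import enchant

spelling_dictionaries = {
    'en': {
        'en_GB': enchant.Dict('en_GB'),
        'en_US': enchant.Dict('en_US'),
    },
}

# word_in_dictionary('colour', 'en') would then return True as soon as either
# variant dictionary accepts the word (or one of its case variants).
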
def _receive_to_queue(self):
    try:
        while True:
            msg = yield from self.reader.readline()

            # "If the EOF was received and the internal buffer is empty,
            # return an empty bytes object."
            if msg == b"":
                logger.info("[{}] Client connection closed.".format(
                    self.remote_ip))
                break

            logger.info("[{}] [RECV] {}".format(self.remote_ip, msg))
            yield from self.input_queue.put(msg)
    except CancelledError:
        logger.debug("[{}] Cancelling receiver...".format(self.remote_ip))

def _read_client_input(self):
    try:
        while True:
            msg = yield from self.client_input.get()
            try:
                # Msg is a byte string
                msg = msg.decode()
                yield from self.parse_client_input(command_table, msg)
            except UnicodeDecodeError:
                # But it might contain undecodable characters
                # (E.g., interrupts)
                yield from self._handle_undecodable(msg)
    except CancelledError:
        logger.debug("[{}] Cancelling parser's input loop...".format(
            self.remote_ip))

def prep_tokens_for_spellcheck(tokenised):
    # Specific pre-processing steps needed for spellcheckers

    # Drop non-printable characters and numerals
    # (TBH, we should probably drop even more)
    printable = set(string.printable) - set("0123456789")
    tokenised = [''.join([char for char in token if char in printable])
                 for token in tokenised]

    # Words to ignore, including the empty string
    blacklist = {""}
    # Twitter jargon (should always be in uppercase?)
    blacklist.add("USERNAME")
    blacklist.add("RT")
    tokenised = [token for token in tokenised if token not in blacklist]

    logger.debug("Tokens for spellchecking: [" + '], ['.join(tokenised) + "]")
    return tokenised

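# A small usage sketch (the sample tokens are hypothetical): non-printable
# characters and digits are dropped from each token, and blacklisted tokens
# ("", "RT", "USERNAME") are removed entirely.
print(prep_tokens_for_spellcheck(["RT", "USERNAME", "héllo", "abc123", "2020"]))
# Expected: ['hllo', 'abc']
# ('é' and the digits are stripped; '2020' becomes '' and is then blacklisted)
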
def _send_from_queue(self):
    try:
        while True:
            msg = yield from self.output_queue.get()
            msg = json.dumps(msg)
            msg_preview = msg[0:80]
            msg = base64.b64encode(zlib.compress(msg.encode())).decode()

            if not self.websocket.open:
                logger.error(
                    "[{}] Send error: Socket closed unexpectedly.".format(
                        self.websocket.remote_ip))
                break

            yield from self.websocket.send(msg)
            logger.info("[{}] [SEND] {}...".format(self.websocket.remote_ip,
                                                   msg_preview))
    except CancelledError:
        logger.debug("[{}] Cancelling sender...".format(
            self.websocket.remote_ip))

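# A minimal sketch of how a client could reverse the framing used above:
# base64-decode, zlib-decompress, then parse the JSON payload. The helper
# name and the sample message are hypothetical.
import base64
import json
import zlib

def decode_server_message(wire_msg):
    """Inverse of the zlib + base64 framing used by _send_from_queue."""
    return json.loads(zlib.decompress(base64.b64decode(wire_msg)).decode())

# Hypothetical round trip:
wire = base64.b64encode(
    zlib.compress(json.dumps({'command': 'motd'}).encode())).decode()
print(decode_server_message(wire))  # -> {'command': 'motd'}
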
def _execute_literal_statements(self, statements):
    """
    Given a List of single SQL statements to execute, does so.
    """
    start_time = timeit.default_timer()
    try:
        assert (type(statements) is list and len(statements) > 0)

        results = []
        with self.engine.begin() as connection:
            for statement in statements:
                raw = connection.execute(statement)
                if raw.returns_rows:
                    results += raw.fetchall()

        time = "{:.3f}".format(timeit.default_timer() - start_time)
        return results, float(time)
    except Exception as e:
        logger.debug(e)
        raise e

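# A hypothetical usage sketch: `provider` stands in for whatever object owns
# _execute_literal_statements (and its SQLAlchemy engine); the statement and
# table name are illustrative only. The return value is a (rows, seconds)
# pair.
rows, seconds = provider._execute_literal_statements(
    ["SELECT COUNT(*) FROM records"])
print(rows, seconds)  # e.g. [(1234,)] 0.005
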
def suffixes_as_list(self, text, search_mode):
    """
    Memoised version of the suffixer; returns a list instead of an iterator.

    This cache should not need to be cleared; any modifications to the
    tokeniser will only take effect on restart
    """
    logger.debug("Running OCESuffixer: {}, search_mode: {}".format(
        text, search_mode))

    # Perform any pre-processing that might be appropriate
    text = self._preprocess_text(text)

    # The start and end values given by OCETokeniser are in bytes,
    # but we prefer to work with characters; we'll convert them in the
    # loop.
    b_text = text.encode('utf-8')
    main_tokenised = self.main_tokeniser.tokenise_as_list(text)
    if search_mode:
        return main_tokenised

    tokenised_list = []
    for token, b_start, b_end in main_tokenised:
        if len(token) == 1:
            # One character token. No need to extract suffixes.
            continue

        # Byte position -> Character position
        c_before = len(b_text[:b_start].decode('utf-8'))

        # Skip the first suffix (i.e., the whole token)
        c_start = c_before + 1
        c_end = c_before + len(token)
        for suffix_start in range(c_start, c_end):
            tokenised_list.append(
                self.yield_token(text, suffix_start, c_end))

    return tokenised_list

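# For illustration only (separate from the original module): the proper
# suffixes that the loop above extracts for a single token, ignoring the
# (text, start, end) bookkeeping done by yield_token.
token = "cats"
print([token[i:] for i in range(1, len(token))])  # ['ats', 'ts', 's']
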
def check_pinyin(word):
    """
    Returns True if a word looks like (Mandarin) pinyin
    (http://pinyin.info/rules/initials_finals.html)
    :param word:
    :return:
    """
    # logger.debug("Checking for pinyin: " + word)

    # Step 0: See if it is one of a few exceptions without an initial:
    # a, o, e, ai, ei, ao, ou, an, ang, en, eng
    pattern = r"(a([io]|ng?)?|ou?|e(i|ng?)?)$"
    if re.match(pattern, word) is not None:
        logger.debug("'" + word + "' looks like valid pinyin. (No initial)")
    else:
        # Step 1: Parse initial/final
        pattern = r"([bpmfdtnlgkhrjqxwy]|[zcs]h?)(.*)"
        match = re.match(pattern, word)
        if match is None:
            # logger.debug("Initial was not valid: " + word)
            return False
        initial = match.group(1)
        final = match.group(2)
        # logger.debug("Initial: " + initial + "; Final: " + final)

        # Step 2: Check final
        # a, ai, ao, an, ang
        a_pattern = r"a([io]|ng?)?"
        # o, ou, ong
        o_pattern = r"o(u|ng)?"
        # e, ei, en, eng
        e_pattern = r"e(i|ng?)?"
        # u, ua, uo, uai, ui, uan, uang, un, ueng*
        # *: romanised as w + eng
        u_pattern = r"u(a(i|ng?)?|o|i|n)?"
        # i, ia, ie, iao, iu, ian, iang, in, ing, iong
        i_pattern = r"i(a(o|ng?)?|e|u|ng?|ong)?"
        # v, ve
        v_pattern = r"ve?"

        # Final may end with a tone number (liberally, 0-5 including light
        # tone)
        final = final.rstrip("012345")

        if final.startswith("a"):
            pattern = a_pattern
        elif final.startswith("o"):
            pattern = o_pattern
        elif final.startswith("e"):
            pattern = e_pattern
        elif final.startswith("u"):
            pattern = u_pattern
        elif final.startswith("i"):
            pattern = i_pattern
        elif final.startswith("v"):
            pattern = v_pattern
        else:
            # logger.debug("Final was not valid: " + word)
            return False

        if re.match(pattern, final) is None:
            # logger.debug("Final was not valid: " + word)
            return False

        # Step 3: See if initial and final are compatible
        if final in valid_pinyin[initial]:
            logger.debug("'" + word + "' looks like valid pinyin.")
        else:
            # logger.debug("Initial-Final combination was not valid: " + word)
            return False

    # Step 4: Check it against our other dictionaries; better safe than
    # sorry?
    # We expect to see pinyin for 'zh' and 'sge' records
    other_languages = [x for x in spelling_dictionaries.keys()
                       if x != 'zh' and x != 'sge']
    for language in other_languages:
        if word_in_dictionary(word, language):
            logger.debug(
                "... But found '{0}' for language: {1}".format(word, language)
            )
            return False

    # Step 5: Our word didn't fail on any of the short circuit checks; take
    # it as valid pinyin
    return True

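# Illustrative calls (the first two assume valid_pinyin and
# spelling_dictionaries are set up as the module expects; the last one fails
# before those tables are consulted):
# check_pinyin("zhang1")  ->  True   (initial 'zh' + final 'ang')
# check_pinyin("ang")     ->  True   (stand-alone final, no initial)
# check_pinyin("xyz")     ->  False  (no recognisable final)
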
def iterate_controller(self):
    """
    In each iteration of the loop, we:
    1) Watch all registered clients, grabbing the first bit(s) of input to
       come through. If any new clients are registered, restart the loop to
       include them too.
    2) Process the input and send it back to the client

    The beauty of coroutines is that we are guaranteed synchronous operation
    until we `yield from`, which blocks until something does happen (which
    prevents our pseudo-infinite loop above from chewing up resources)
    """
    # Watch new clients, stop watching dropped clients.
    # self.client_watch is a list of tuples (ClientInterface, Future)
    # that will be updated to represent all watched clients for this
    # iteration of the loop.
    watched_clients = []
    watched_client_futures = []
    for x in self.client_watch:
        if x[0] not in self.clients:
            # Goodbye
            x[1].cancel()
            yield from x[1]
            continue
        watched_clients.append(x[0])
        watched_client_futures.append(x[1])

    unwatched_clients = [client for client in self.clients
                         if client not in watched_clients]
    for client in unwatched_clients:
        # Hello
        watched_clients.append(client)
        watched_client_futures.append(
            asyncio.async(self._watch_client(client)))

    self.client_watch = [(watched_clients[x], watched_client_futures[x])
                         for x in range(len(watched_clients))]

    # Add the watcher for new/lost clients
    # On completion, this future will return True.
    # self.clients_changed will also be in the list of done tasks.
    watched_client_futures.append(self.clients_changed)

    # Begin the watch
    shutdown_this_watch = False
    restart_this_watch = False
    logger.debug("Watcher: Watch begun. {} registered client(s).".format(
        len(watched_clients)))
    client_watcher = asyncio.wait(watched_client_futures,
                                  return_when=FIRST_COMPLETED)
    done, _ = yield from client_watcher

    # Now deal with the ones which completed.
    # We are NOT guaranteed to have only one completed task here,
    # and we are NOT guaranteed that pending futures will stay
    # incomplete before the watch ends.
    for task in done:
        if task == self.clients_changed:
            logger.debug("Watcher: Clients changed. "
                         "Now have {} client(s).".format(len(self.clients)))
            self.clients_changed = asyncio.Future()
        else:
            client, request = task.result()
            logger.debug("Watcher: Received client request: {}".format(
                str(request)))

            # Remove the watch here; a new future will be generated for
            # this client by the next iteration of the loop
            self.client_watch.remove((client, task))

            # If the client wanted a shutdown or restart, hold the request
            # until the end of the watch
            try:
                return_message = self.exec_command(request)
                yield from client.put_output_async(return_message)
            except oce.exceptions.ShutdownInterrupt:
                shutdown_this_watch = True
            except oce.exceptions.RestartInterrupt:
                restart_this_watch = True

    logger.debug("Watcher: Watch ended.")
    if shutdown_this_watch:
        raise oce.exceptions.ShutdownInterrupt
    elif restart_this_watch:
        raise oce.exceptions.RestartInterrupt
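
# A minimal standalone sketch (modern async/await syntax, not the
# generator-based coroutines used above) of the same FIRST_COMPLETED watch
# pattern: wait on one future per client plus a "clients changed" future,
# then handle only the tasks that actually completed. All names here are
# hypothetical.
import asyncio

async def watch_once(client_queues, clients_changed):
    watchers = {asyncio.ensure_future(q.get()): name
                for name, q in client_queues.items()}
    done, pending = await asyncio.wait(
        list(watchers) + [clients_changed],
        return_when=asyncio.FIRST_COMPLETED)
    # A real controller would keep these alive for the next iteration;
    # this sketch simply cancels whatever did not complete.
    for task in pending:
        task.cancel()
    for task in done:
        if task is clients_changed:
            print("client set changed")
        else:
            print(watchers[task], "sent", task.result())

async def main():
    queues = {"alice": asyncio.Queue(), "bob": asyncio.Queue()}
    changed = asyncio.get_running_loop().create_future()
    await queues["alice"].put("motd")
    await watch_once(queues, changed)  # prints: alice sent motd

asyncio.run(main())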