def parseHTML(self, htmlString):
    """Tokenize htmlString and rebuild this browser's render list.

    Feeds htmlString one character at a time through a
    TokenizerStateMachine.  Each emitted token is routed either to
    handleLinkToken (for <link> start tags, e.g. stylesheets) or to a
    TokenHandler that builds the element tree.  The render list is then
    rebuilt from the resulting tree and printed.
    """
    tsm = TokenizerStateMachine()
    tokenHandler = TokenHandler()
    tokenHandler.rootUrl = self.rootUrl
    i = 0
    while i < len(htmlString):
        # handleCharacter returns how far to advance the cursor
        # (presumably the number of characters consumed — TODO confirm).
        i += tsm.handleCharacter(htmlString[i])
        if tsm.currentEmittedToken is not None:
            token = tsm.currentEmittedToken
            if isinstance(token, StartTagToken) and token.name == "link":
                # <link> tags are resolved against self.rootUrl separately.
                self.handleLinkToken(token)
            else:
                tokenHandler.processToken(token)
            # Mark the token as consumed so it is not processed twice.
            tsm.currentEmittedToken = None
    # Rebuild the render state from scratch for this document.
    self.strList.clear()
    self.renderList.clear()
    self.renderObjects.clear()
    self.fillRenderList(tokenHandler.elementTreeRoot, self.renderList)
    for s in self.renderList:
        print(s)
def test_handleStartTagTokenHandlesFirstStartTag():
    """The first start tag becomes the sole child of the tree root."""
    handler = TokenHandler("TestRootUrl/")
    assert len(handler.elementTreeRoot.children) == 0
    handler.handleStartTagToken(StartTagToken("html"))
    children = handler.elementTreeRoot.children
    assert len(children) == 1
    assert children[0].name == "html"
class Statement(object):
    """Abstract base class for all interpreter statements.

    Subclasses implement execute(); createStatement() is the factory that
    dispatches on the current token to build the right concrete statement.
    """

    def __init__(self, tokens):
        self.t = TokenHandler(tokens)

    def createStatement(self, tokens):
        """Return the concrete Statement matching the current token.

        Raises ParserException when the token starts no known statement.
        """
        currenttoken = self.t.getCurrentToken()
        if currenttoken == "begin":
            return CompoundStatement(tokens)
        if self.t.isVariable(currenttoken):
            return Assignment(tokens)
        if currenttoken == "print":
            return Print(tokens)
        if currenttoken == "if":
            return IF(tokens)
        if currenttoken == "while":
            return While(tokens)
        raise ParserException('\';\' expected. Got \'' + currenttoken + '\' instead.')

    def execute(self, skip):
        raise NotImplementedError("Subclass must implement abstract method")
def convertTokenListToHTMLElementTree(self, url, tokenList):
    """Build an HTML element tree from tokenList.

    The root url is extracted from url because the TokenHandler needs it
    to resolve possible relative links.

    Returns the root element of the resulting tree.
    """
    rootUrl = self.extractRootUrl(url)
    tokenHandler = TokenHandler(rootUrl)
    for token in tokenList:
        tokenHandler.processToken(token)
    return tokenHandler.elementTreeRoot
def test_getAbsoluteUrl():
    """Absolute urls pass through unchanged; relative ones get the root prefixed."""
    rootUrl = "TestRootUrl/"
    handler = TokenHandler(rootUrl)
    # Urls that already carry a scheme must be returned verbatim.
    passthrough = [
        "http://google.com",
        "https://i.ytimg.com/vi/nrIDL7h9MFQ/hqdefault.jpg?sqp=-oaymwEYCNIBEHZIVfKriqkDCwgBFQAAiEIYAXAB&rs=AOn4CLByrwt1ptJWI5zGkLOZhJpyrFeCSw",
    ]
    for url in passthrough:
        assert handler.getAbsoluteUrl(url) == url
    # Relative urls are resolved against the root (leading slash dropped).
    assert handler.getAbsoluteUrl("ARelativeUrl") == rootUrl + "ARelativeUrl"
    assert handler.getAbsoluteUrl("/RelativeWithSlash") == rootUrl + "RelativeWithSlash"
def get_annotated_comments(self):
    """Annotate every comment token with its polarity tag.

    Tokens present in the polarity dictionary (self.kms) are rendered as
    "token/polarity ".  Unknown tokens are either normalised first via
    EDBNormaliser (when self.normalised is set), falling back to
    "token/UNK " when normalisation fails, or tagged UNK directly.

    Returns a tuple (self.comments, dct) where dct maps each comment key
    to the list of annotated comment strings.
    """
    dct = {}
    for comment_key, comments_val in self.comments.items():
        # comment[1] is the raw comment text; comment[0] is presumably an
        # ordinal — TODO confirm against the comments source.
        tokens_comments = [TokenHandler().word_tokenizer(comment[1])
                           for comment in comments_val]
        if comment_key not in dct:
            dct[comment_key] = []
        for tokens_comment in tokens_comments:
            result = ''
            for ctr_token, token in enumerate(tokens_comment):
                if token in self.kms:
                    result += "{}/{} ".format(token, self.kms[token])
                elif self.normalised:
                    # Bigram context: previous token, or "<s>" at the
                    # start of the comment.
                    prev_token = tokens_comment[ctr_token - 1] if ctr_token > 0 else "<s>"
                    normalised_token = EDBNormaliser(
                        (prev_token, token), self.kms, self.cpd_pickled).normalise_token()
                    if normalised_token is None:
                        result += "{}/{} ".format(token, 'UNK')
                    else:
                        result += self.get_token_polarity(normalised_token)
                else:
                    result += "{}/{} ".format(token, 'UNK')
            dct[comment_key].append(result)
            print(result)
    return self.comments, dct
def test_processTokenHandlesClosingTags():
    """Properly closed tags end up as siblings directly under the root."""
    handler = TokenHandler("TestRootUrl/")
    for name in ("p", "a"):
        handler.processToken(StartTagToken(name))
        handler.processToken(EndTagToken(name))
    root = handler.elementTreeRoot
    assert root.name == "#root#"
    assert len(root.children) == 2
    assert [child.name for child in root.children] == ["p", "a"]
def test_processTokenProcessesIndependentTags():
    """Void (br) and self-closing (img) tags do not swallow later elements."""
    br = StartTagToken("br")
    img = StartTagToken("img")
    img.isSelfClosing = True
    handler = TokenHandler("TestRootUrl/")
    for token in (br, img, StartTagToken("html"), EndTagToken("html")):
        handler.processToken(token)
    children = handler.elementTreeRoot.children
    assert len(children) == 3
    assert [child.name for child in children] == ["br", "img", "html"]
def __init__(self) -> None:
    '''
    Initializes the program and sets the class variables that are used
    as the initial values across the program.

    Required command line arguments:
        GitHub URL (https://github.com/{Username}/{Repository})
    Optional command line arguments:
        GitHub Personal Access Token
    '''
    # Command line arguments, excluding the script filename itself.
    self.args = sys.argv[1:]
    # Helper that writes and reads tokens to/from tokens.txt.
    self.th = TokenHandler()
    # Parsed-argument state; filled in later by the argument handling.
    self.githubURL = None
    self.githubUser = None
    self.githubRepo = None
    self.githubToken = None
    self.githubTokenList = None  # pulled from keys.txt
    # Database-specific state; filled in when the connection is opened.
    self.dbCursor = None
    self.dbConnection = None
class While(Statement):
    # Parses and executes: while <condition> do <statement>

    def __init__(self, tokens):
        self.t = TokenHandler(tokens)

    def execute(self, skip):
        """Execute the while loop; returns the consumed tokens via resetTokens().

        skip: when True the statement is parsed but its effects are
        suppressed (same convention as the other Statement subclasses).
        """
        self.t.match("while", self.t.tokens)
        conditiontokens = []
        # ConditionIsTrue evaluates the condition and fills conditiontokens
        # with the tokens it consumed, so the condition can be re-evaluated
        # on each iteration below.
        conditionistrue = self.t.ConditionIsTrue(self.t.tokens, conditiontokens)
        self.t.match("do", self.t.tokens)
        tokensexecuted = None
        while(conditionistrue):
            # Run the loop body against a copy of the remaining tokens.
            copytokens = list(self.t.tokens)
            s = self.createStatement(copytokens)
            tokensexecuted = s.execute(skip)
            # Re-evaluate the condition from the saved condition tokens,
            # capturing them again for the next iteration.
            copytokens = list(conditiontokens)
            conditiontokens = None
            conditiontokens = []
            conditionistrue = self.t.ConditionIsTrue(copytokens, conditiontokens)
            # NOTE(review): this appears to pop the re-evaluated condition
            # tokens off the executed-token log so they are not replayed —
            # confirm against TokenHandler.executedtokens semantics.
            size = len(self.t.executedtokens)
            i = size - 1
            while(i >= size - len(conditiontokens)):
                self.t.executedtokens.pop(i)
                i -= 1
        # Advance past the (single) body statement before handing back.
        self.t.moveAhead(tokensexecuted)
        return self.t.resetTokens()
def test_processTokenHandlesUnacceptableTags():
    """Unknown tag names are dropped; only acceptable tags reach the tree."""
    names = ["p", "NoHTMLTagShouldEverHaveThisName", "body", "a"]
    handler = TokenHandler("TestRootUrl/")
    # Open the tags in order, then close them in reverse (proper nesting).
    for name in names:
        handler.processToken(StartTagToken(name))
    for name in reversed(names):
        handler.processToken(EndTagToken(name))
    assert len(handler.elementTreeRoot.children) == 1
    assert handler.elementTreeRoot.children[0].name == "p"
class Print(Statement):
    """Executes: print <token>."""

    def __init__(self, tokens):
        self.t = TokenHandler(tokens)

    def execute(self, skip):
        """Consume the print statement; print its value unless skipping."""
        handler = self.t
        handler.match("print", handler.tokens)
        nexttoken = handler.getCurrentToken()
        if not skip:
            print(handler.readTokenValue(nexttoken))
        handler.match(nexttoken, handler.tokens)
        return handler.resetTokens()
class ProgramStatement(Statement):
    """Top-level statement: program <name> <compound statement>."""

    # Name of the program, recorded when execute() runs without skip.
    programName = None

    def __init__(self, tokens):
        self.t = TokenHandler(tokens)

    def execute(self, skip):
        """Parse the program header, then run the program body."""
        handler = self.t
        handler.match("program", handler.tokens)
        nameToken = handler.getCurrentToken()
        if not skip:
            self.programName = nameToken
        handler.match(nameToken, handler.tokens)
        # The body is a compound statement over a copy of the remaining tokens.
        body = CompoundStatement(list(handler.tokens))
        handler.moveAhead(body.execute(skip))
        return handler.resetTokens()
def get_political_news_comment(self):
    """Fetch and tokenize political news comments from the database.

    Returns a dict mapping comment_id -> list of (ordinal, tokenized
    lower-cased comment text) tuples, in fetch order.
    """
    conn = self.create_connection()
    with conn:
        cur = conn.cursor()
        # NOTE(review): "nc2" is a bare, unquoted identifier and the two
        # news_category predicates contradict each other — this query
        # looks broken; confirm against the t_news schema before changing.
        sql = "select comment_id, comment from t_comments where comment_id " \
              "in (select news_id from t_news where news_category = nc2 " \
              "and news_category = 1)"
        cur.execute(sql)
        data = cur.fetchall()
        cur.close()
        comments = {}
        for ctr_cmmnt, content in enumerate(data):
            comment_id = content[0].strip()
            comment = ' '.join(TokenHandler().word_tokenizer(
                content[1].strip().lower()))
            # setdefault collapses the duplicated if/else append branches.
            comments.setdefault(comment_id, []).append((ctr_cmmnt, comment))
        return comments
resulting_file = input('File name as an output: ') # Read configuration configs = {} f = open('System.conf', 'r') for line in f.readlines(): cfg = line.strip().split('=') configs[cfg[0].strip()] = cfg[1].strip() # Get political news comments comments = NewsHandler(configs['dbnews']).get_political_news_comment() # Handle bigram model first to ease next computation f = open('political_comment_corpus.txt', 'r') corpus = TokenHandler().word_tokenizer(f.readline()) lm = bigrams(corpus) # MLE only -> change to Lidstone smoothing # cfd_pickled = pickle.dumps(nltk.ConditionalFreqDist(lm)) # cpd = ConditionalProbDist(pickle.loads(cfd_pickled), MLEProbDist) # cpd_pickled = pickle.dumps(cpd) # Lidstone smoothing cfd_pickled = pickle.dumps(nltk.ConditionalFreqDist(lm)) lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1) cpd = ConditionalProbDist(pickle.loads(cfd_pickled), lidstone_estimator) # Annotate each comment before_annotated, after_annotated = PolarityAnnotator(comments, configs['dbkbbi_cleaned_offline'], cpd, normalised=isNormalised)\ .get_annotated_comments()
class SSLMetrics:
    '''
    Entry point for the SSL Metrics tool.

    Call this tool in the command line as:
        python SSLMetrics.py {GitHub URL} {Optional Token}
    '''

    def __init__(self) -> None:
        '''
        Initializes the program and sets the class variables that are used
        as the initial values across the program.

        Required command line arguments:
            GitHub URL (https://github.com/{Username}/{Repository})
        Optional command line arguments:
            GitHub Personal Access Token
        '''
        self.args = sys.argv[1:]  # All of the command line args excluding the filename
        self.githubURL = None
        self.githubUser = None
        self.githubRepo = None
        self.githubToken = None
        self.githubTokenList = None  # This is pulled from keys.txt
        self.dbCursor = None  # Database specific variable
        self.dbConnection = None  # Database specific variables
        self.th = TokenHandler()  # Class instance to write and read tokens to tokens.txt

    def parseArgs(self) -> None:
        '''
        Parse the command line arguments and load the GitHub token list.

        Exits via sys.exit when more than two arguments are given or when
        the URL argument is missing.  When a token argument is present it
        is persisted through the TokenHandler (keys.txt) before the stored
        token list is read back; otherwise the previously stored tokens
        are used, falling back to no token at all.
        '''
        if len(self.args) > 2:
            sys.exit("Too Many Args")
        try:
            self.githubURL = self.args[0]
        except IndexError:
            sys.exit("No URL Arg")
        try:
            self.githubToken = self.args[1]
            self.th.write(token=self.githubToken)
            self.githubTokenList = self.th.read()
        except IndexError:
            # There was no token as an arg; fall back to the stored tokens.
            self.githubTokenList = self.th.read()
            try:
                self.githubToken = self.githubTokenList[0]
            except IndexError:
                # No stored tokens either; proceed without authentication.
                pass

    def stripURL(self) -> None:
        '''
        Split the GitHub URL argument into username and repository.

        Exits via sys.exit when the URL does not contain "github.com/" or
        has too many path segments.  Further checks are made on the URL in
        the GitHubAPI.py file: a URL may pass these tests and still fail
        down the road.
        '''
        if self.githubURL.find("github.com/") == -1:
            sys.exit("Invalid URL Arg")
        foo = self.githubURL.split("/")
        if len(foo) > 5:
            sys.exit("Invalid URL Arg")
        # The last two path segments are taken as user and repository.
        self.githubUser = foo[-2]
        self.githubRepo = foo[-1]

    def launch(self) -> None:
        '''
        Open the database connection and start the analysis.
        '''
        self.dbCursor, self.dbConnection = sqlite_database.open_connection(
            self.githubRepo
        )  # Unsure of what this code does due to lack of knowledge on how the database works
        Master.Logic(username=self.githubUser,
                     repository=self.githubRepo,
                     token=self.githubToken,
                     tokenList=self.githubTokenList,
                     cursor=self.dbCursor,
                     connection=self.dbConnection).program()

    def get_Args(self) -> list:
        '''
        Returns the class variable args.
        '''
        return self.args

    def get_GitHubURL(self) -> str:
        '''
        Returns the class variable githubURL.
        '''
        return self.githubURL

    def get_GitHubUser(self) -> str:
        '''
        Returns the class variable githubUser.
        '''
        return self.githubUser

    def get_GitHubRepo(self) -> str:
        '''
        Returns the class variable githubRepo.
        '''
        return self.githubRepo

    def get_DbCursor(self) -> Cursor:
        '''
        Returns the class variable dbCursor.
        '''
        return self.dbCursor

    def get_DbConnection(self) -> Connection:
        '''
        Returns the class variable dbConnection.
        '''
        return self.dbConnection
from Statement import ProgramStatement
from TokenHandler import TokenHandler

# Tokenize the source file, then parse and run the program it contains.
filename = "prog1.txt"
tokens = []
t = TokenHandler(tokens)
t.create_Tokens(filename)
program = ProgramStatement(t.tokens)
# False: actually execute the statements rather than skipping them.
program.execute(False)
        # NOTE(review): these lines are the tail of a method whose definition
        # starts above this chunk — indentation reconstructed; verify.
        f.write(grammar.grammar.start_symbol + "\n")
        f.write(view)
        directive_handler.code_generator.print_code()

    @staticmethod
    def empty_files(file_out, file_error):
        # Truncate both output files by opening them in write mode.
        with(open(file=file_out, mode="w")):
            pass
        with(open(file=file_error, mode="w")):
            pass


# Module-level wiring: build the lexer, grammar, parser and compiler, then
# compile the default input file.
DEFAULT_FILE_IN_NAME = "scanner.txt"
DEFAULT_FILE_OUT_NAME = "parsetree.txt"
DEFAULT_FILE_ERROR_NAME = "error.txt"

c_lexical_dfa = CLexicalDFA.make_c_lexical_dfa()
# Token types that should not appear in the printed output.
not_printing_tokens = [CTokenType.WHITE_SPACE, CTokenType.COMMENT]
c_token_handler = TokenHandler(c_lexical_dfa, not_printing_tokens)
grammar = LL1Grammar(Grammar.make_grammar(compressed_grammar))
# for prod in grammar.grammar.prods:
#     print("{}->{}".format(grammar.grammar.prods[prod].non_terminal, grammar.grammar.prods[prod].rhses))
#
# for f in grammar.first_sets["simple-expression"]:
#     print(str(f))
# print(grammar.grammar.compress())
parser = Parser(grammar)
parse_handler = ParserHandler(parser)
compiler = Compiler(c_token_handler, parse_handler)
compiler.compile(DEFAULT_FILE_IN_NAME, DEFAULT_FILE_OUT_NAME, DEFAULT_FILE_ERROR_NAME)
def test_isRelativeUrl():
    """Urls without a scheme are treated as relative."""
    handler = TokenHandler("TestRootUrl/")
    cases = [
        ("http://google.com", False),
        ("ARelativeUrl", True),
        ("/RelativeWithSlash", True),
    ]
    for url, expected in cases:
        assert handler.isRelativeUrl(url) == expected
class IF(Statement):
    """Executes: if <condition> then <statement> else <statement>."""

    def __init__(self, tokens):
        self.t = TokenHandler(tokens)

    def execute(self, skip):
        """Execute the if statement; returns the consumed tokens via resetTokens().

        When skip is True the whole statement is consumed without effect:
        the then-branch tokens are matched through, and the else-branch is
        parsed with skip=True.
        """
        self.t.match("if", self.t.tokens)
        currenttoken = self.t.getCurrentToken()
        if skip:
            # Consume everything up to "else", then skip-execute the
            # else-branch so the token stream stays in sync.
            while currenttoken != "else":
                self.t.match(currenttoken, self.t.tokens)
                currenttoken = self.t.getCurrentToken()
            self.t.match("else", self.t.tokens)
            copytokens = list(self.t.tokens)
            s = self.createStatement(copytokens)
            tokensexecuted = s.execute(skip)
            # BUGFIX: was self.moveAhead(...) — Statement defines no
            # moveAhead, so this path raised AttributeError.
            self.t.moveAhead(tokensexecuted)
            # The statement is fully consumed in skip mode; falling through
            # to the condition evaluation would re-parse past-the-end tokens.
            return self.t.resetTokens()
        if self.t.ConditionIsTrue(self.t.tokens, outcondition=[]):
            # Condition true: execute the then-branch, then skip the
            # else-branch (execute with skip=True keeps the stream in sync).
            self.t.match("then", self.t.tokens)
            copytokens = list(self.t.tokens)
            s = self.createStatement(copytokens)
            tokensexecuted = s.execute(skip)
            self.t.moveAhead(tokensexecuted)
            self.t.match("else", self.t.tokens)
            copytokens = list(self.t.tokens)
            s = self.createStatement(copytokens)
            tokensexecuted = s.execute(True)
            self.t.moveAhead(tokensexecuted)
        else:
            # Condition false: consume the then-branch tokens, then
            # actually execute the else-branch.
            currenttoken = self.t.getCurrentToken()
            while currenttoken != "else":
                self.t.match(currenttoken, self.t.tokens)
                currenttoken = self.t.getCurrentToken()
            self.t.match("else", self.t.tokens)
            copytokens = list(self.t.tokens)
            s = self.createStatement(copytokens)
            tokensexecuted = s.execute(skip)
            self.t.moveAhead(tokensexecuted)
        return self.t.resetTokens()
def __init__(self, tokens):
    # Each statement owns a TokenHandler over its token stream.
    self.t = TokenHandler(tokens)
class Assignment(Statement):
    """Executes: <variable> := <value> [<op> <value>]* with left-to-right
    evaluation (no operator precedence).

    The running value is written into TokenHandler.variables after the
    initial assignment and after every applied operator.
    """

    # Folds the next operand into the running value, per operator token.
    # (Plain dict of lambdas — indexed, not a descriptor, so no binding.)
    _APPLY = {
        "+": lambda acc, operand: acc + operand,
        "-": lambda acc, operand: acc - operand,
        "*": lambda acc, operand: acc * operand,
        "/": lambda acc, operand: acc / operand,
    }

    def __init__(self, tokens):
        self.t = TokenHandler(tokens)

    def execute(self, skip):
        """Execute the assignment; returns the consumed tokens via resetTokens().

        Raises VariableException when the first token is not a variable and
        ParserException on division by zero.  When skip is True the tokens
        are consumed but no value is computed or stored.
        """
        currenttoken = self.t.getCurrentToken()
        if not self.t.isVariable(currenttoken):
            raise VariableException("Not a variable: " + currenttoken)
        variablebeingassigned = currenttoken
        self.t.match(currenttoken, self.t.tokens)
        self.t.match(":=", self.t.tokens)
        currenttoken = self.t.getCurrentToken()
        variablevalue = None
        if not skip:
            variablevalue = self.t.readTokenValue(currenttoken)
            TokenHandler.variables[variablebeingassigned] = variablevalue
        self.t.match(currenttoken, self.t.tokens)
        currenttoken = self.t.getCurrentToken()
        while self.t.isMathOperator(currenttoken):
            # Dispatch table replaces four copy-pasted branches; unlike the
            # original, an unrecognized operator can no longer loop forever
            # (the token is always matched and the cursor advanced).
            op = currenttoken
            self.t.match(op, self.t.tokens)
            currenttoken = self.t.getCurrentToken()
            if not skip:
                operand = self.t.readTokenValue(currenttoken)
                if op == "/" and operand == 0:
                    raise ParserException("Can't divide by zero.")
                variablevalue = self._APPLY[op](variablevalue, operand)
                TokenHandler.variables[variablebeingassigned] = variablevalue
            self.t.match(currenttoken, self.t.tokens)
            currenttoken = self.t.getCurrentToken()
        return self.t.resetTokens()
class CompoundStatement(Statement):
    """Executes: begin <statement> [; <statement>]* end."""

    def __init__(self, tokens):
        self.t = TokenHandler(tokens)

    def execute(self, skip):
        """Execute each ';'-separated statement between begin and end."""
        handler = self.t
        handler.match("begin", handler.tokens)
        while True:
            # Each inner statement runs against a copy of the remaining
            # tokens; moveAhead then advances past what it consumed.
            statement = self.createStatement(list(handler.tokens))
            handler.moveAhead(statement.execute(skip))
            # A ';' means another statement follows; anything else ends
            # the compound statement.
            if handler.getCurrentToken() != ";":
                break
            handler.match(";", handler.tokens)
        handler.match("end", handler.tokens)
        return handler.resetTokens()