Exemplo n.º 1
0
    def parseHTML(self, htmlString):
        """
            parseHTML tokenizes the raw htmlString, builds the element tree
            from the emitted tokens and refreshes self.renderList.

            Every character is fed to a TokenizerStateMachine. Each token the
            state machine emits is handed to self.handleLinkToken when it is a
            StartTagToken named "link", and to a TokenHandler otherwise.
            Afterwards self.strList, self.renderList and self.renderObjects
            are cleared, self.renderList is refilled from the element tree
            through self.fillRenderList, and every entry is printed.
        """
        tsm = TokenizerStateMachine()
        tokenHandler = TokenHandler()
        tokenHandler.rootUrl = self.rootUrl

        i = 0
        length = len(htmlString)
        while i < length:
            # The value returned by handleCharacter is added to the index, so
            # the state machine decides how far the cursor moves.
            i += tsm.handleCharacter(htmlString[i])

            token = tsm.currentEmittedToken
            if token is None:
                continue

            if isinstance(token, StartTagToken) and token.name == "link":
                self.handleLinkToken(token)
            else:
                tokenHandler.processToken(token)
            # Reset the slot so the same token is not processed twice.
            tsm.currentEmittedToken = None

        self.strList.clear()
        self.renderList.clear()
        self.renderObjects.clear()
        self.fillRenderList(tokenHandler.elementTreeRoot, self.renderList)
        for s in self.renderList:
            print(s)
Exemplo n.º 2
0
def test_handleStartTagTokenHandlesFirstStartTag():
    """The first start tag handled becomes the single child of the tree root."""
    htmlToken = StartTagToken("html")
    handler = TokenHandler("TestRootUrl/")

    # A fresh handler starts with an empty root.
    assert len(handler.elementTreeRoot.children) == 0

    handler.handleStartTagToken(htmlToken)

    assert len(handler.elementTreeRoot.children) == 1
    firstChild = handler.elementTreeRoot.children[0]
    assert firstChild.name == "html"
Exemplo n.º 3
0
class Statement(object):
    """Base class for statements; wraps the given tokens in a TokenHandler."""

    def __init__(self, tokens):
        self.t = TokenHandler(tokens)

    def createStatement(self, tokens):
        """
        Return the statement object selected by this statement's current token.

        The current token is read from self.t; the new statement is built
        from the `tokens` argument. Checks are made in this order: "begin",
        variable, "print", "if", "while".

        Raises ParserException when the current token starts none of them.
        """
        keyword = self.t.getCurrentToken()

        if keyword == "begin":
            return CompoundStatement(tokens)

        if self.t.isVariable(keyword):
            return Assignment(tokens)

        if keyword == "print":
            return Print(tokens)

        if keyword == "if":
            return IF(tokens)

        if keyword == "while":
            return While(tokens)

        raise ParserException('\';\' expected. Got \'' + keyword + '\' instead.')

    def execute(self, skip):
        """Abstract hook; every subclass has to provide its own execute."""
        raise NotImplementedError("Subclass must implement abstract method")
Exemplo n.º 4
0
    def convertTokenListToHTMLElementTree(self, url, tokenList):
        """
        Feed every token of tokenList to a new TokenHandler and return the
        root of the element tree it built.

        The root url extracted from `url` is printed and handed to the
        TokenHandler, which needs it for handling possible relative links.
        """
        root = self.extractRootUrl(url)
        print(root)

        handler = TokenHandler(root)
        for tok in tokenList:
            handler.processToken(tok)

        return handler.elementTreeRoot
Exemplo n.º 5
0
def test_getAbsoluteUrl():
    """Absolute urls come back unchanged; relative ones get the root url prepended."""
    rootUrl = "TestRootUrl/"
    handler = TokenHandler(rootUrl)

    thumbnailUrl = "https://i.ytimg.com/vi/nrIDL7h9MFQ/hqdefault.jpg?sqp=-oaymwEYCNIBEHZIVfKriqkDCwgBFQAAiEIYAXAB&amp;rs=AOn4CLByrwt1ptJWI5zGkLOZhJpyrFeCSw"

    # (input url, expected absolute url), checked in this order
    cases = [
        ("http://google.com", "http://google.com"),
        (thumbnailUrl, thumbnailUrl),
        ("ARelativeUrl", rootUrl + "ARelativeUrl"),
        ("/RelativeWithSlash", rootUrl + "RelativeWithSlash"),
    ]
    for url, expected in cases:
        assert handler.getAbsoluteUrl(url) == expected
Exemplo n.º 6
0
 def get_annotated_comments(self):
     """
     Annotate every token of every comment and return (self.comments, annotated).

     `annotated` maps each key of self.comments to a list of strings, one per
     comment. A token found in self.kms is written as "token/<kms value> ".
     Any other token is written as "token/UNK " unless self.normalised is
     truthy; in that case EDBNormaliser is asked for a normalised token and,
     when it returns one, the text from self.get_token_polarity is used.
     Each annotated comment is also printed.
     """
     annotated = {}
     for comment_key, comments_val in self.comments.items():
         tokenised = [TokenHandler().word_tokenizer(entry[1]) for entry in comments_val]
         bucket = annotated.setdefault(comment_key, [])

         for tokens in tokenised:
             pieces = []
             for position, token in enumerate(tokens):
                 if token in self.kms.keys():
                     pieces.append("{}/{} ".format(token, self.kms[token]))
                 elif not self.normalised:
                     pieces.append("{}/{} ".format(token, 'UNK'))
                 else:
                     # The first token of a comment has "<s>" as its predecessor.
                     prev_token = tokens[position - 1] if position > 0 else "<s>"
                     normaliser = EDBNormaliser((prev_token, token), self.kms, self.cpd_pickled)
                     normalised_token = normaliser.normalise_token()
                     if normalised_token is None:
                         pieces.append("{}/{} ".format(token, 'UNK'))
                     else:
                         pieces.append(self.get_token_polarity(normalised_token))

             result = ''.join(pieces)
             bucket.append(result)
             print(result)
     return self.comments, annotated
Exemplo n.º 7
0
def test_processTokenHandlesClosingTags():
    """Two consecutive open/close pairs end up as two sibling children of the root."""
    tokens = []
    for tagName in ("p", "a"):
        tokens.append(StartTagToken(tagName))
        tokens.append(EndTagToken(tagName))

    handler = TokenHandler("TestRootUrl/")
    for tok in tokens:
        handler.processToken(tok)

    root = handler.elementTreeRoot
    assert root.name == "#root#"
    assert len(root.children) == 2
    assert root.children[0].name == "p"
    assert root.children[1].name == "a"
Exemplo n.º 8
0
def test_processTokenProcessesIndependentTags():
    """A "br" tag, a self-closing "img" tag and an html pair become three root children."""
    brToken = StartTagToken("br")
    imgToken = StartTagToken("img")
    imgToken.isSelfClosing = True
    tokens = [brToken, imgToken, StartTagToken("html"), EndTagToken("html")]

    handler = TokenHandler("TestRootUrl/")
    for tok in tokens:
        handler.processToken(tok)

    assert len(handler.elementTreeRoot.children) == 3
    for index, expectedName in enumerate(("br", "img", "html")):
        assert handler.elementTreeRoot.children[index].name == expectedName
Exemplo n.º 9
0
    def __init__(self) -> None:
        '''
        Set the initial values of the attributes used across the program.

        Required command line argument:
            GitHub URL (https://github.com/{Username}/{Repository})
        Optional command line argument:
            GitHub Personal Access Token
        '''
        # Every command line argument except the script name.
        self.args = sys.argv[1:]

        # GitHub details start out empty.
        self.githubURL = self.githubUser = self.githubRepo = None
        self.githubToken = self.githubTokenList = None

        # Database handles start out empty as well.
        self.dbCursor = self.dbConnection = None

        # Helper instance for writing and reading tokens.
        self.th = TokenHandler()
Exemplo n.º 10
0
class While(Statement):
    """Statement for the form: while <condition> do <statement>."""

    def __init__(self, tokens):
        self.t = TokenHandler(tokens)

    def execute(self, skip):
        """
        Match "while", evaluate the condition, match "do", then execute the
        body statement for as long as the condition evaluates to true.

        Returns whatever self.t.resetTokens() returns.

        NOTE(review): when the condition is false on entry the body is never
        executed and moveAhead receives None -- confirm TokenHandler.moveAhead
        copes with that.
        NOTE(review): the condition is evaluated even when `skip` is set.
        """
        handler = self.t
        handler.match("while", handler.tokens)

        condition = []
        keepLooping = handler.ConditionIsTrue(handler.tokens, condition)

        handler.match("do", handler.tokens)

        consumed = None
        while keepLooping:
            # The body is built from a copy, so every pass starts from the
            # same token position.
            body = self.createStatement(list(handler.tokens))
            consumed = body.execute(skip)

            # Re-evaluate the condition from the tokens collected last time.
            previousCondition = list(condition)
            condition = []
            keepLooping = handler.ConditionIsTrue(previousCondition, condition)

            # Remove the trailing executed tokens, one per collected
            # condition token.
            size = len(handler.executedtokens)
            for index in range(size - 1, size - len(condition) - 1, -1):
                handler.executedtokens.pop(index)

        handler.moveAhead(consumed)
        return handler.resetTokens()
Exemplo n.º 11
0
def test_processTokenHandlesUnacceptableTags():
    """Only the outer "p" element is attached to the root for this nested input."""
    # Tags are opened in this order and closed in the reverse order.
    nesting = ["p", "NoHTMLTagShouldEverHaveThisName", "body", "a"]
    tokens = [StartTagToken(tagName) for tagName in nesting]
    tokens += [EndTagToken(tagName) for tagName in reversed(nesting)]

    handler = TokenHandler("TestRootUrl/")
    for tok in tokens:
        handler.processToken(tok)

    rootChildren = handler.elementTreeRoot.children
    assert len(rootChildren) == 1
    assert rootChildren[0].name == "p"
Exemplo n.º 12
0
class Print(Statement):
    """Statement for the form: print <token>."""

    def __init__(self, tokens):
        self.t = TokenHandler(tokens)

    def execute(self, skip):
        """
        Match "print" and the token after it; unless `skip` is set, print the
        value read for that token.

        Returns whatever self.t.resetTokens() returns.
        """
        handler = self.t
        handler.match("print", handler.tokens)

        argument = handler.getCurrentToken()
        if not skip:
            value = handler.readTokenValue(argument)
            print(value)

        handler.match(argument, handler.tokens)
        return handler.resetTokens()
Exemplo n.º 13
0
class ProgramStatement(Statement):
    """Statement for the form: program <name> followed by a compound statement."""

    # Set on the instance by execute() when it is not skipping.
    programName = None

    def __init__(self, tokens):
        self.t = TokenHandler(tokens)

    def execute(self, skip):
        """
        Match "program" and the name token, then execute the rest of the
        tokens as a CompoundStatement.

        The name is stored in self.programName unless `skip` is set.
        Returns whatever self.t.resetTokens() returns.
        """
        handler = self.t
        handler.match("program", handler.tokens)

        nameToken = handler.getCurrentToken()
        if not skip:
            self.programName = nameToken
        handler.match(nameToken, handler.tokens)

        # The body works on its own copy of the remaining tokens.
        body = CompoundStatement(list(handler.tokens))
        consumed = body.execute(skip)
        handler.moveAhead(consumed)
        return handler.resetTokens()
Exemplo n.º 14
0
    def get_political_news_comment(self):
        """
        Fetch comments from the database and group them by comment id.

        Returns a dict mapping each stripped comment_id to a list of
        (running index, comment) tuples. Every comment is stripped,
        lower-cased, tokenised with TokenHandler().word_tokenizer and joined
        back with single spaces. The running index counts all fetched rows,
        starting at 0.
        """
        conn = self.create_connection()
        with conn:
            cursor = conn.cursor()
            # NOTE(review): `nc2` is neither quoted nor bound as a parameter;
            # confirm it is meant to be a column name.
            query = "select comment_id, comment from t_comments where comment_id " \
                    "in (select news_id from t_news where news_category = nc2 " \
                    "and news_category = 1)"
            cursor.execute(query)
            rows = cursor.fetchall()
            cursor.close()

            grouped = {}
            for index, row in enumerate(rows):
                key = row[0].strip()
                words = TokenHandler().word_tokenizer(row[1].strip().lower())
                text = ' '.join(words)
                grouped.setdefault(key, []).append((index, text))
            return grouped
Exemplo n.º 15
0
    resulting_file = input('File name as an output: ')

    # Read configuration
    configs = {}
    f = open('System.conf', 'r')
    for line in f.readlines():
        cfg = line.strip().split('=')
        configs[cfg[0].strip()] = cfg[1].strip()

    # Get political news comments
    comments = NewsHandler(configs['dbnews']).get_political_news_comment()

    # Handle bigram model first to ease next computation
    f = open('political_comment_corpus.txt', 'r')
    corpus = TokenHandler().word_tokenizer(f.readline())
    lm = bigrams(corpus)

    # MLE only -> change to Lidstone smoothing
    # cfd_pickled = pickle.dumps(nltk.ConditionalFreqDist(lm))
    # cpd = ConditionalProbDist(pickle.loads(cfd_pickled), MLEProbDist)
    # cpd_pickled = pickle.dumps(cpd)

    # Lidstone smoothing
    cfd_pickled = pickle.dumps(nltk.ConditionalFreqDist(lm))
    lidstone_estimator = lambda fd: LidstoneProbDist(fd, 0.01, fd.B() + 1)
    cpd = ConditionalProbDist(pickle.loads(cfd_pickled), lidstone_estimator)

    # Annotate each comment
    before_annotated, after_annotated = PolarityAnnotator(comments, configs['dbkbbi_cleaned_offline'], cpd, normalised=isNormalised)\
        .get_annotated_comments()
Exemplo n.º 16
0
class SSLMetrics:
    '''
    Entry point that runs the SSL Metrics tool.

    Call this tool in the command line as:
        python SSLMetrics.py {GitHub URL} {Optional Token}
    '''
    def __init__(self) -> None:
        '''
        Set the initial values of the attributes used across the program.

        Required command line argument:
            GitHub URL (https://github.com/{Username}/{Repository})
        Optional command line argument:
            GitHub Personal Access Token
        '''
        self.args = sys.argv[
            1:]  # All of the command line args excluding the filename
        self.githubURL = None
        self.githubUser = None
        self.githubRepo = None
        self.githubToken = None
        self.githubTokenList = None  # Filled by parseArgs() from self.th.read()
        self.dbCursor = None  # Database specific variable
        self.dbConnection = None  # Database specific variable
        self.th = TokenHandler(
        )  # Class instance used to write and read tokens

    def parseArgs(self) -> None:
        '''
        This is a REQUIRED method.

        Validate the command line arguments and load the token list.

        Exits with "Too Many Args" when more than two arguments are given and
        with "No URL Arg" when none is given. When a token is passed as the
        second argument it is written through self.th before the token list
        is read back; otherwise the first stored token, if any, is used.
        '''

        if len(self.args) > 2:
            sys.exit("Too Many Args")
        if not self.args:
            sys.exit("No URL Arg")

        self.githubURL = self.args[0]

        # Branch on the argument count instead of catching IndexError, so an
        # IndexError raised inside the token handler is no longer mistaken
        # for "no token argument".
        if len(self.args) == 2:
            self.githubToken = self.args[1]
            self.th.write(token=self.githubToken)
            self.githubTokenList = self.th.read()
        else:
            self.githubTokenList = self.th.read()
            if self.githubTokenList:
                self.githubToken = self.githubTokenList[0]

    def stripURL(self) -> None:
        '''
        This is a REQUIRED method.

        Extract the username and repository from the URL argument.

        The URL must contain "github.com/" followed by exactly two non-empty
        path segments ({Username}/{Repository}); a trailing slash is
        tolerated. Any other URL exits with "Invalid URL Arg".

        Further checks are made on the URL in the GitHubAPI.py file. A URL
        can pass the tests here and still make the program error out later.
        '''

        marker = "github.com/"
        start = self.githubURL.find(marker)
        if start == -1:
            sys.exit("Invalid URL Arg")

        # Only what follows the host matters for user/repo extraction.
        path = self.githubURL[start + len(marker):].rstrip("/")
        segments = path.split("/")

        # Too few, too many or empty segments cannot name a repository.
        if len(segments) != 2 or not all(segments):
            sys.exit("Invalid URL Arg")

        self.githubUser, self.githubRepo = segments

    def launch(self) -> None:
        '''
        This is a REQUIRED method.

        Open the database connection and begin the analysis.
        '''
        # open_connection is given the repository name; its result is
        # unpacked as (cursor, connection), in that order.
        self.dbCursor, self.dbConnection = sqlite_database.open_connection(
            self.githubRepo
        )
        Master.Logic(username=self.githubUser,
                     repository=self.githubRepo,
                     token=self.githubToken,
                     tokenList=self.githubTokenList,
                     cursor=self.dbCursor,
                     connection=self.dbConnection).program()

    def get_Args(self) -> list:
        '''
        Returns the instance attribute args.
        '''
        return self.args

    def get_GitHubURL(self) -> str:
        '''
        Returns the instance attribute githubURL.
        '''
        return self.githubURL

    def get_GitHubUser(self) -> str:
        '''
        Returns the instance attribute githubUser.
        '''
        return self.githubUser

    def get_GitHubRepo(self) -> str:
        '''
        Returns the instance attribute githubRepo.
        '''
        return self.githubRepo

    def get_DbCursor(self) -> Cursor:
        '''
        Returns the instance attribute dbCursor.
        '''
        return self.dbCursor

    def get_DbConnection(self) -> Connection:
        '''
        Returns the instance attribute dbConnection.
        '''
        return self.dbConnection
Exemplo n.º 17
0
from Statement import ProgramStatement
from TokenHandler import TokenHandler

# Script: tokenize the program file and execute it as a ProgramStatement.
filename = "prog1.txt"
tokens = []
t = TokenHandler(tokens)
# NOTE(review): presumably reads `filename` and fills t.tokens -- confirm in TokenHandler.
t.create_Tokens(filename)

# Execute with skip=False.
program = ProgramStatement(t.tokens)
program.execute(False)
Exemplo n.º 18
0
            f.write(grammar.grammar.start_symbol + "\n")
            f.write(view)

        directive_handler.code_generator.print_code()

    @staticmethod
    def empty_files(file_out, file_error):
        """Open file_out and then file_error in "w" mode and close each one right away."""
        for path in (file_out, file_error):
            with open(file=path, mode="w"):
                pass


# File names passed, in this order, to compiler.compile() at the bottom.
DEFAULT_FILE_IN_NAME = "scanner.txt"
DEFAULT_FILE_OUT_NAME = "parsetree.txt"
DEFAULT_FILE_ERROR_NAME = "error.txt"

# Token handler: built from the C lexical DFA and the token types listed in
# not_printing_tokens (white space and comments).
c_lexical_dfa = CLexicalDFA.make_c_lexical_dfa()
not_printing_tokens = [CTokenType.WHITE_SPACE, CTokenType.COMMENT]
c_token_handler = TokenHandler(c_lexical_dfa, not_printing_tokens)
# Grammar: built from compressed_grammar and wrapped as an LL1Grammar.
grammar = LL1Grammar(Grammar.make_grammar(compressed_grammar))
# Debugging aids, left disabled:
# for prod in grammar.grammar.prods:
#     print("{}->{}".format(grammar.grammar.prods[prod].non_terminal, grammar.grammar.prods[prod].rhses))
#
# for f in grammar.first_sets["simple-expression"]:
#     print(str(f))
# print(grammar.grammar.compress())
# Parser, its handler and the compiler are wired together, then the compiler
# is run on the default files.
parser = Parser(grammar)
parse_handler = ParserHandler(parser)
compiler = Compiler(c_token_handler, parse_handler)
compiler.compile(DEFAULT_FILE_IN_NAME, DEFAULT_FILE_OUT_NAME, DEFAULT_FILE_ERROR_NAME)
Exemplo n.º 19
0
def test_isRelativeUrl():
    """A url with a scheme is not relative; bare and slash-prefixed paths are."""
    handler = TokenHandler("TestRootUrl/")

    # (url, expected result), checked in this order
    expectations = [
        ("http://google.com", False),
        ("ARelativeUrl", True),
        ("/RelativeWithSlash", True),
    ]
    for url, expected in expectations:
        assert handler.isRelativeUrl(url) == expected
Exemplo n.º 20
0
class IF(Statement):
    """Statement for the form: if <condition> then <statement> else <statement>."""

    def __init__(self, tokens):
        self.t = TokenHandler(tokens)

    def _skipToElse(self):
        """Consume tokens one by one until the current token is "else"."""
        currenttoken = self.t.getCurrentToken()
        while currenttoken != "else":
            self.t.match(currenttoken, self.t.tokens)
            currenttoken = self.t.getCurrentToken()

    def _runBranch(self, skip):
        """Create the statement at the current position, execute it and move past it."""
        copytokens = list(self.t.tokens)
        s = self.createStatement(copytokens)
        tokensexecuted = s.execute(skip)
        self.t.moveAhead(tokensexecuted)

    def execute(self, skip):
        """
        Execute the if statement and return whatever self.t.resetTokens() returns.

        When `skip` is false and the condition is true, the "then" statement
        is executed and the "else" statement is walked with skip=True.
        Otherwise the tokens up to "else" are consumed and the "else"
        statement is executed with the caller's `skip` value.

        NOTE(review): the scan for "else" stops at the first "else" token, so
        a nested if/else inside the "then" part would end it early -- confirm
        whether nesting has to be supported.
        """
        self.t.match("if", self.t.tokens)

        # When skipping, the condition must not be evaluated: the scan below
        # consumes its tokens. This branch is now exclusive; it used to fall
        # through into the condition check after the whole statement had
        # already been consumed.
        if not skip and self.t.ConditionIsTrue(self.t.tokens, outcondition=[]):
            self.t.match("then", self.t.tokens)
            self._runBranch(skip)

            # The else branch still has to be consumed, without side effects.
            self.t.match("else", self.t.tokens)
            self._runBranch(True)
        else:
            self._skipToElse()
            self.t.match("else", self.t.tokens)
            # moveAhead is a TokenHandler method; the skip path used to call
            # it on the statement itself.
            self._runBranch(skip)

        return self.t.resetTokens()
Exemplo n.º 21
0
 def __init__(self, tokens):
     """Wrap the given tokens in a TokenHandler and keep it as self.t."""
     self.t = TokenHandler(tokens)
Exemplo n.º 22
0
class Assignment(Statement):
    """Statement for the form: <variable> := <value> {<operator> <value>}."""

    def __init__(self, tokens):
        self.t = TokenHandler(tokens)

    def execute(self, skip):
        """
        Match an assignment and, unless `skip` is set, store the result in
        TokenHandler.variables under the assigned variable's name.

        Operators are applied strictly left to right, and the stored value is
        updated after every operand.

        Raises VariableException when the first token is not a variable and
        ParserException on division by zero.
        Returns whatever self.t.resetTokens() returns.
        """
        handler = self.t

        target = handler.getCurrentToken()
        if not handler.isVariable(target):
            raise VariableException("Not a variable: " + target)
        handler.match(target, handler.tokens)
        handler.match(":=", handler.tokens)

        # First operand.
        token = handler.getCurrentToken()
        value = None
        if not skip:
            value = handler.readTokenValue(token)
            TokenHandler.variables[target] = value
        handler.match(token, handler.tokens)

        # Remaining "<operator> <operand>" pairs.
        token = handler.getCurrentToken()
        while handler.isMathOperator(token):
            if token in ("+", "-", "*", "/"):
                symbol = token
                handler.match(symbol, handler.tokens)
                token = handler.getCurrentToken()
                if not skip:
                    operand = handler.readTokenValue(token)
                    if symbol == "+":
                        value += operand
                    elif symbol == "-":
                        value -= operand
                    elif symbol == "*":
                        value *= operand
                    else:
                        if operand == 0:
                            raise ParserException("Can't divide by zero.")
                        value /= operand
                    TokenHandler.variables[target] = value
                handler.match(token, handler.tokens)

            token = handler.getCurrentToken()

        return handler.resetTokens()
Exemplo n.º 23
0
class CompoundStatement(Statement):
    """Statement for the form: begin <statement> {; <statement>} end."""

    def __init__(self, tokens):
        self.t = TokenHandler(tokens)

    def execute(self, skip):
        """
        Match "begin", execute statements separated by ";" and match "end".

        At least one statement is always executed.
        Returns whatever self.t.resetTokens() returns.
        """
        handler = self.t
        handler.match("begin", handler.tokens)
        # Result unused; the call is kept so the sequence of TokenHandler
        # calls stays the same.
        handler.getCurrentToken()

        while True:
            # Each statement works on its own copy of the remaining tokens.
            statement = self.createStatement(list(handler.tokens))
            consumed = statement.execute(skip)
            handler.moveAhead(consumed)

            if handler.getCurrentToken() != ";":
                break
            handler.match(";", handler.tokens)

        handler.match("end", handler.tokens)
        return handler.resetTokens()