示例#1
0
def parse_select_dependencies(string):
    """Parse a select query and return the dependencies

    Args:
        string(str): Input string to be parsed

    Returns:
        result(list of str): List of dependent tables
    """
    if string == '':
        return []

    # A dependency is the identifier that follows a FROM or JOIN keyword.
    table_ref = WordStart() + (_from | _join) + _db_name.setResultsName('table')
    matches = table_ref.setParseAction(lambda toks: toks.table).searchString(string)

    # searchString yields one result group per match; flatten them.
    tables = [name for group in matches for name in group]

    # Drop repeats while keeping first-seen order.
    deduped = deduplicate_with_order(tables)

    if not deduped:
        raise ParseException('No dependent table in select query')
    return deduped
示例#2
0
def parse_select_dependencies(string):
    """Parse a select query and return the dependencies

    Args:
        string(str): Input string to be parsed

    Returns:
        result(list of str): List of dependent tables
    """
    if string == "":
        return []

    # A dependent table is the identifier following a FROM or JOIN keyword.
    grammar = WordStart() + (_from | _join) + _db_name.setResultsName("table")
    grammar = grammar.setParseAction(lambda res: res.table)

    # Collect every match group into a single flat list.
    found = []
    for group in grammar.searchString(string):
        found.extend(group)

    # Remove duplicates while preserving first-occurrence order.
    tables = deduplicate_with_order(found)

    if not tables:
        raise ParseException("No dependent table in select query")
    return tables
示例#3
0
def remove_transactional(string):
    """Remove begin or commit from the statement

    Args:
        string(str): String to be processed

    Returns:
        result(str): String with begin and commit trimmed
    """
    begin = CaselessKeyword('BEGIN')
    commit = CaselessKeyword('COMMIT')
    transaction = WordStart() + (begin | commit)
    # suppress() drops the matched keyword so transformString removes it.
    return transaction.suppress().transformString(string)
示例#4
0
def remove_transactional(string):
    """Remove begin or commit from the statement

    Args:
        string(str): String to be processed

    Returns:
        result(str): String with begin and commit trimmed
    """
    # Match either keyword at a word boundary and strip it from the output.
    keyword = CaselessKeyword('BEGIN') | CaselessKeyword('COMMIT')
    return (WordStart() + keyword).suppress().transformString(string)
示例#5
0
    def __init__(self):
        """Build the pyparsing grammars used to recognise numbers that are
        attached to words, storing each grammar as an instance attribute."""
        # Character classes used by the grammars below.
        real_word_dashes = Word(pyparsing.alphas + "-")
        punctuation = Word(".!?:,;-")
        punctuation_no_dash = Word(".!?:,;")
        punctuation_reference_letter = Word(".:,;-")

        # Single-character building blocks (exact=1 matches one char).
        printable = Word(pyparsing.printables, exact=1)
        letter = Word(pyparsing.alphas, exact=1)
        letter_reference = punctuation_reference_letter + letter

        # A run of digits, optionally followed by a letter and then any
        # number of punctuation-plus-letter references.
        nums = (
            Word(pyparsing.nums)
            + Optional(letter)
            + ZeroOrMore(letter_reference)
        )

        # Tail of a word: optional closing brackets, optional punctuation
        # (no dash), then a word boundary.
        word_end = (
            pyparsing.ZeroOrMore(Word(")") | Word("}") | Word("]"))
            + Optional(punctuation_no_dash)
            + WordEnd()
        )

        # A letters/dashes word immediately followed by a number.
        self.single_number = WordStart() + real_word_dashes + nums + word_end

        # A word followed by one or more bracketed groups of numbers/dashes.
        self.single_number_parens = (
            printable
            + letter
            + Optional(punctuation_no_dash)
            + pyparsing.OneOrMore(
                Word("([{", exact=1)
                + pyparsing.OneOrMore(nums | Word("-"))
                + Word(")]}", exact=1)
            )
            + Optional(punctuation_no_dash)
            + word_end
        )

        # A word where a number precedes trailing punctuation.
        self.number_then_punctuation = (
            printable
            + letter
            + nums
            + punctuation
            + pyparsing.ZeroOrMore(nums | punctuation)
            + word_end
        )

        # A word where punctuation (no dash) precedes a trailing number.
        self.punctuation_then_number = (
            printable
            + letter
            + punctuation_no_dash
            + nums
            + pyparsing.ZeroOrMore(punctuation | nums)
            + word_end
        )
示例#6
0
    Literal,
    NotAny,
    Optional,
    Or,
    Regex,
    replaceWith,
    upcaseTokens,
    Word,
    WordEnd,
    WordStart,
    ZeroOrMore,
    CaselessKeyword,
    White,
)

# Word boundaries restricted to alphanumerics: a match must not be directly
# preceded/followed by another alphanumeric character.
alphanum_word_start = WordStart(wordChars=alphanums)
alphanum_word_end = WordEnd(wordChars=alphanums)

# A run of uppercase ASCII letters, and a regex for any non-uppercase char.
uppercase_word = Word("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
not_uppercase_word_regex = Regex("[^A-Z]")

dot_fanging_patterns = Combine(
    Optional(White()) + Or([
        # '.' - enclosed with ( and )
        CaselessLiteral("(.("),
        CaselessLiteral("(.)"),
        CaselessLiteral(").("),
        CaselessLiteral(").)"),
        CaselessLiteral("(."),
        CaselessLiteral(".("),
        # CaselessLiteral(")."), # this is commented and is NOT used to fang indicators b/c this may appear in real text
示例#7
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import copy

from pyparsing import alphanums, alphas, printables, nums, hexnums
from pyparsing import OneOrMore, Word, Combine, Optional, Or, Regex, WordStart, WordEnd, replaceWith, downcaseTokens, NotAny

# Word boundaries restricted to alphanumerics, so matches must stand alone.
alphanum_word_start = WordStart(wordChars=alphanums)
alphanum_word_end = WordEnd(wordChars=alphanums)
# NOTE: the label definition ignores the fact that labels should not end in a hyphen
label = Word(initChars=alphanums, bodyChars=alphanums + '-', max=63)
# todo: build out the domain_tld list with only valid tlds
tlds = ['aaa', 'aarp', 'abarth', 'abb', 'abbott', 'abbvie', 'abc', 'able', 'abogado', 'abudhabi', 'ac', 'academy', 'accenture', 'accountant', 'accountants', 'aco', 'active', 'actor', 'ad', 'adac', 'ads', 'adult', 'ae', 'aeg', 'aero', 'aetna', 'af', 'afamilycompany', 'afl', 'africa', 'ag', 'agakhan', 'agency', 'ai', 'aig', 'aigo', 'airbus', 'airforce', 'airtel', 'akdn', 'al', 'alfaromeo', 'alibaba', 'alipay', 'allfinanz', 'allstate', 'ally', 'alsace', 'alstom', 'am', 'americanexpress', 'americanfamily', 'amex', 'amfam', 'amica', 'amsterdam', 'analytics', 'android', 'anquan', 'anz', 'ao', 'aol', 'apartments', 'app', 'apple', 'aq', 'aquarelle', 'ar', 'arab', 'aramco', 'archi', 'army', 'arpa', 'art', 'arte', 'as', 'asda', 'asia', 'associates', 'at', 'athleta', 'attorney', 'au', 'auction', 'audi', 'audible', 'audio', 'auspost', 'author', 'auto', 'autos', 'avianca', 'aw', 'aws', 'ax', 'axa', 'az', 'azure', 'ba', 'baby', 'baidu', 'banamex', 'bananarepublic', 'band', 'bank', 'bar', 'barcelona', 'barclaycard', 'barclays', 'barefoot', 'bargains', 'baseball', 'basketball', 'bauhaus', 'bayern', 'bb', 'bbc', 'bbt', 'bbva', 'bcg', 'bcn', 'bd', 'be', 'beats', 'beauty', 'beer', 'bentley', 'berlin', 'best', 'bestbuy', 'bet', 'bf', 'bg', 'bh', 'bharti', 'bi', 'bible', 'bid', 'bike', 'bing', 'bingo', 'bio', 'biz', 'bj', 'black', 'blackfriday', 'blanco', 'blockbuster', 'blog', 'bloomberg', 'blue', 'bm', 'bms', 'bmw', 'bn', 'bnl', 'bnpparibas', 'bo', 'boats', 'boehringer', 'bofa', 'bom', 'bond', 'boo', 'book', 'booking', 'bosch', 'bostik', 'boston', 'bot', 'boutique', 'box', 'br', 'bradesco', 'bridgestone', 'broadway', 'broker', 'brother', 'brussels', 'bs', 'bt', 'budapest', 'bugatti', 'build', 'builders', 'business', 'buy', 'buzz', 'bv', 'bw', 'by', 'bz', 'bzh', 'ca', 'cab', 'cafe', 'cal', 'call', 'calvinklein', 'cam', 'camera', 'camp', 'cancerresearch', 'canon', 'capetown', 'capital', 'capitalone', 'car', 'caravan', 'cards', 'care', 'career', 'careers', 'cars', 'cartier', 'casa', 
'case', 'caseih', 'cash', 'casino', 'cat', 'catering', 'catholic', 'cba', 'cbn', 'cbre', 'cbs', 'cc', 'cd', 'ceb', 'center', 'ceo', 'cern', 'cf', 'cfa', 'cfd', 'cg', 'ch', 'chanel', 'channel', 'charity', 'chase', 'chat', 'cheap', 'chintai', 'christmas', 'chrome', 'chrysler', 'church', 'ci', 'cipriani', 'circle', 'cisco', 'citadel', 'citi', 'citic', 'city', 'cityeats', 'ck', 'cl', 'claims', 'cleaning', 'click', 'clinic', 'clinique', 'clothing', 'cloud', 'club', 'clubmed', 'cm', 'cn', 'co', 'coach', 'codes', 'coffee', 'college', 'cologne', 'com', 'comcast', 'commbank', 'community', 'company', 'compare', 'computer', 'comsec', 'condos', 'construction', 'consulting', 'contact', 'contractors', 'cooking', 'cookingchannel', 'cool', 'coop', 'corsica', 'country', 'coupon', 'coupons', 'courses', 'cr', 'credit', 'creditcard', 'creditunion', 'cricket', 'crown', 'crs', 'cruise', 'cruises', 'csc', 'cu', 'cuisinella', 'cv', 'cw', 'cx', 'cy', 'cymru', 'cyou', 'cz', 'dabur', 'dad', 'dance', 'data', 'date', 'dating', 'datsun', 'day', 'dclk', 'dds', 'de', 'deal', 'dealer', 'deals', 'degree', 'delivery', 'dell', 'deloitte', 'delta', 'democrat', 'dental', 'dentist', 'desi', 'design', 'dev', 'dhl', 'diamonds', 'diet', 'digital', 'direct', 'directory', 'discount', 'discover', 'dish', 'diy', 'dj', 'dk', 'dm', 'dnp', 'do', 'docs', 'doctor', 'dodge', 'dog', 'doha', 'domains', 'dot', 'download', 'drive', 'dtv', 'dubai', 'duck', 'dunlop', 'duns', 'dupont', 'durban', 'dvag', 'dvr', 'dz', 'earth', 'eat', 'ec', 'eco', 'edeka', 'edu', 'education', 'ee', 'eg', 'email', 'emerck', 'energy', 'engineer', 'engineering', 'enterprises', 'epost', 'epson', 'equipment', 'er', 'ericsson', 'erni', 'es', 'esq', 'estate', 'esurance', 'et', 'etisalat', 'eu', 'eurovision', 'eus', 'events', 'everbank', 'exchange', 'expert', 'exposed', 'express', 'extraspace', 'fage', 'fail', 'fairwinds', 'faith', 'family', 'fan', 'fans', 'farm', 'farmers', 'fashion', 'fast', 'fedex', 'feedback', 'ferrari', 'ferrero', 'fi', 'fiat', 
'fidelity', 'fido', 'film', 'final', 'finance', 'financial', 'fire', 'firestone', 'firmdale', 'fish', 'fishing', 'fit', 'fitness', 'fj', 'fk', 'flickr', 'flights', 'flir', 'florist', 'flowers', 'fly', 'fm', 'fo', 'foo', 'food', 'foodnetwork', 'football', 'ford', 'forex', 'forsale', 'forum', 'foundation', 'fox', 'fr', 'free', 'fresenius', 'frl', 'frogans', 'frontdoor', 'frontier', 'ftr', 'fujitsu', 'fujixerox', 'fun', 'fund', 'furniture', 'futbol', 'fyi', 'ga', 'gal', 'gallery', 'gallo', 'gallup', 'game', 'games', 'gap', 'garden', 'gb', 'gbiz', 'gd', 'gdn', 'ge', 'gea', 'gent', 'genting', 'george', 'gf', 'gg', 'ggee', 'gh', 'gi', 'gift', 'gifts', 'gives', 'giving', 'gl', 'glade', 'glass', 'gle', 'global', 'globo', 'gm', 'gmail', 'gmbh', 'gmo', 'gmx', 'gn', 'godaddy', 'gold', 'goldpoint', 'golf', 'goo', 'goodyear', 'goog', 'google', 'gop', 'got', 'gov', 'gp', 'gq', 'gr', 'grainger', 'graphics', 'gratis', 'green', 'gripe', 'grocery', 'group', 'gs', 'gt', 'gu', 'guardian', 'gucci', 'guge', 'guide', 'guitars', 'guru', 'gw', 'gy', 'hair', 'hamburg', 'hangout', 'haus', 'hbo', 'hdfc', 'hdfcbank', 'health', 'healthcare', 'help', 'helsinki', 'here', 'hermes', 'hgtv', 'hiphop', 'hisamitsu', 'hitachi', 'hiv', 'hk', 'hkt', 'hm', 'hn', 'hockey', 'holdings', 'holiday', 'homedepot', 'homegoods', 'homes', 'homesense', 'honda', 'honeywell', 'horse', 'hospital', 'host', 'hosting', 'hot', 'hoteles', 'hotels', 'hotmail', 'house', 'how', 'hr', 'hsbc', 'ht', 'hu', 'hughes', 'hyatt', 'hyundai', 'ibm', 'icbc', 'ice', 'icu', 'id', 'ie', 'ieee', 'ifm', 'ikano', 'il', 'im', 'imamat', 'imdb', 'immo', 'immobilien', 'in', 'inc', 'industries', 'infiniti', 'info', 'ing', 'ink', 'institute', 'insurance', 'insure', 'int', 'intel', 'international', 'intuit', 'investments', 'io', 'ipiranga', 'iq', 'ir', 'irish', 'is', 'iselect', 'ismaili', 'ist', 'istanbul', 'it', 'itau', 'itv', 'iveco', 'jaguar', 'java', 'jcb', 'jcp', 'je', 'jeep', 'jetzt', 'jewelry', 'jio', 'jll', 'jm', 'jmp', 'jnj', 'jo', 'jobs', 
'joburg', 'jot', 'joy', 'jp', 'jpmorgan', 'jprs', 'juegos', 'juniper', 'kaufen', 'kddi', 'ke', 'kerryhotels', 'kerrylogistics', 'kerryproperties', 'kfh', 'kg', 'kh', 'ki', 'kia', 'kim', 'kinder', 'kindle', 'kitchen', 'kiwi', 'km', 'kn', 'koeln', 'komatsu', 'kosher', 'kp', 'kpmg', 'kpn', 'kr', 'krd', 'kred', 'kuokgroup', 'kw', 'ky', 'kyoto', 'kz', 'la', 'lacaixa', 'ladbrokes', 'lamborghini', 'lamer', 'lancaster', 'lancia', 'lancome', 'land', 'landrover', 'lanxess', 'lasalle', 'lat', 'latino', 'latrobe', 'law', 'lawyer', 'lb', 'lc', 'lds', 'lease', 'leclerc', 'lefrak', 'legal', 'lego', 'lexus', 'lgbt', 'li', 'liaison', 'lidl', 'life', 'lifeinsurance', 'lifestyle', 'lighting', 'like', 'lilly', 'limited', 'limo', 'lincoln', 'linde', 'link', 'lipsy', 'live', 'living', 'lixil', 'lk', 'llc', 'loan', 'loans', 'locker', 'locus', 'loft', 'lol', 'london', 'lotte', 'lotto', 'love', 'lpl', 'lplfinancial', 'lr', 'ls', 'lt', 'ltd', 'ltda', 'lu', 'lundbeck', 'lupin', 'luxe', 'luxury', 'lv', 'ly', 'ma', 'macys', 'madrid', 'maif', 'maison', 'makeup', 'man', 'management', 'mango', 'map', 'market', 'marketing', 'markets', 'marriott', 'marshalls', 'maserati', 'mattel', 'mba', 'mc', 'mckinsey', 'md', 'me', 'med', 'media', 'meet', 'melbourne', 'meme', 'memorial', 'men', 'menu', 'merckmsd', 'metlife', 'mg', 'mh', 'miami', 'microsoft', 'mil', 'mini', 'mint', 'mit', 'mitsubishi', 'mk', 'ml', 'mlb', 'mls', 'mm', 'mma', 'mn', 'mo', 'mobi', 'mobile', 'mobily', 'moda', 'moe', 'moi', 'mom', 'monash', 'money', 'monster', 'mopar', 'mormon', 'mortgage', 'moscow', 'moto', 'motorcycles', 'mov', 'movie', 'movistar', 'mp', 'mq', 'mr', 'ms', 'msd', 'mt', 'mtn', 'mtr', 'mu', 'museum', 'mutual', 'mv', 'mw', 'mx', 'my', 'mz', 'na', 'nab', 'nadex', 'nagoya', 'name', 'nationwide', 'natura', 'navy', 'nba', 'nc', 'ne', 'nec', 'net', 'netbank', 'netflix', 'network', 'neustar', 'new', 'newholland', 'news', 'next', 'nextdirect', 'nexus', 'nf', 'nfl', 'ng', 'ngo', 'nhk', 'ni', 'nico', 'nike', 'nikon', 'ninja', 
'nissan', 'nissay', 'nl', 'no', 'nokia', 'northwesternmutual', 'norton', 'now', 'nowruz', 'nowtv', 'np', 'nr', 'nra', 'nrw', 'ntt', 'nu', 'nyc', 'nz', 'obi', 'observer', 'off', 'office', 'okinawa', 'olayan', 'olayangroup', 'oldnavy', 'ollo', 'om', 'omega', 'one', 'ong', 'onl', 'online', 'onyourside', 'ooo', 'open', 'oracle', 'orange', 'org', 'organic', 'origins', 'osaka', 'otsuka', 'ott', 'ovh', 'pa', 'page', 'panasonic', 'paris', 'pars', 'partners', 'parts', 'party', 'passagens', 'pay', 'pccw', 'pe', 'pet', 'pf', 'pfizer', 'pg', 'ph', 'pharmacy', 'phd', 'philips', 'phone', 'photo', 'photography', 'photos', 'physio', 'piaget', 'pics', 'pictet', 'pictures', 'pid', 'pin', 'ping', 'pink', 'pioneer', 'pizza', 'pk', 'pl', 'place', 'play', 'playstation', 'plumbing', 'plus', 'pm', 'pn', 'pnc', 'pohl', 'poker', 'politie', 'p**n', 'post', 'pr', 'pramerica', 'praxi', 'press', 'prime', 'pro', 'prod', 'productions', 'prof', 'progressive', 'promo', 'properties', 'property', 'protection', 'pru', 'prudential', 'ps', 'pt', 'pub', 'pw', 'pwc', 'py', 'qa', 'qpon', 'quebec', 'quest', 'qvc', 'racing', 'radio', 'raid', 're', 'read', 'realestate', 'realtor', 'realty', 'recipes', 'red', 'redstone', 'redumbrella', 'rehab', 'reise', 'reisen', 'reit', 'reliance', 'ren', 'rent', 'rentals', 'repair', 'report', 'republican', 'rest', 'restaurant', 'review', 'reviews', 'rexroth', 'rich', 'richardli', 'ricoh', 'rightathome', 'ril', 'rio', 'rip', 'rmit', 'ro', 'rocher', 'rocks', 'rodeo', 'rogers', 'room', 'rs', 'rsvp', 'ru', 'rugby', 'ruhr', 'run', 'rw', 'rwe', 'ryukyu', 'sa', 'saarland', 'safe', 'safety', 'sakura', 'sale', 'salon', 'samsclub', 'samsung', 'sandvik', 'sandvikcoromant', 'sanofi', 'sap', 'sarl', 'sas', 'save', 'saxo', 'sb', 'sbi', 'sbs', 'sc', 'sca', 'scb', 'schaeffler', 'schmidt', 'scholarships', 'school', 'schule', 'schwarz', 'science', 'scjohnson', 'scor', 'scot', 'sd', 'se', 'search', 'seat', 'secure', 'security', 'seek', 'select', 'sener', 'services', 'ses', 'seven', 'sew', 
'sex', 'sexy', 'sfr', 'sg', 'sh', 'shangrila', 'sharp', 'shaw', 'shell', 'shia', 'shiksha', 'shoes', 'shop', 'shopping', 'shouji', 'show', 'showtime', 'shriram', 'si', 'silk', 'sina', 'singles', 'site', 'sj', 'sk', 'ski', 'skin', 'sky', 'skype', 'sl', 'sling', 'sm', 'smart', 'smile', 'sn', 'sncf', 'so', 'soccer', 'social', 'softbank', 'software', 'sohu', 'solar', 'solutions', 'song', 'sony', 'soy', 'space', 'spiegel', 'sport', 'spot', 'spreadbetting', 'sr', 'srl', 'srt', 'st', 'stada', 'staples', 'star', 'starhub', 'statebank', 'statefarm', 'stc', 'stcgroup', 'stockholm', 'storage', 'store', 'stream', 'studio', 'study', 'style', 'su', 'sucks', 'supplies', 'supply', 'support', 'surf', 'surgery', 'suzuki', 'sv', 'swatch', 'swiftcover', 'swiss', 'sx', 'sy', 'sydney', 'symantec', 'systems', 'sz', 'tab', 'taipei', 'talk', 'taobao', 'target', 'tatamotors', 'tatar', 'tattoo', 'tax', 'taxi', 'tc', 'tci', 'td', 'tdk', 'team', 'tech', 'technology', 'tel', 'telefonica', 'temasek', 'tennis', 'teva', 'tf', 'tg', 'th', 'thd', 'theater', 'theatre', 'tiaa', 'tickets', 'tienda', 'tiffany', 'tips', 'tires', 'tirol', 'tj', 'tjmaxx', 'tjx', 'tk', 'tkmaxx', 'tl', 'tm', 'tmall', 'tn', 'to', 'today', 'tokyo', 'tools', 'top', 'toray', 'toshiba', 'total', 'tours', 'town', 'toyota', 'toys', 'tr', 'trade', 'trading', 'training', 'travel', 'travelchannel', 'travelers', 'travelersinsurance', 'trust', 'trv', 'tt', 'tube', 'tui', 'tunes', 'tushu', 'tv', 'tvs', 'tw', 'tz', 'ua', 'ubank', 'ubs', 'uconnect', 'ug', 'uk', 'unicom', 'university', 'uno', 'uol', 'ups', 'us', 'uy', 'uz', 'va', 'vacations', 'vana', 'vanguard', 'vc', 've', 'vegas', 'ventures', 'verisign', 'versicherung', 'vet', 'vg', 'vi', 'viajes', 'video', 'vig', 'viking', 'villas', 'vin', 'vip', 'virgin', 'visa', 'vision', 'vistaprint', 'viva', 'vivo', 'vlaanderen', 'vn', 'vodka', 'volkswagen', 'volvo', 'vote', 'voting', 'voto', 'voyage', 'vu', 'vuelos', 'wales', 'walmart', 'walter', 'wang', 'wanggou', 'warman', 'watch', 'watches', 
'weather', 'weatherchannel', 'webcam', 'weber', 'website', 'wed', 'wedding', 'weibo', 'weir', 'wf', 'whoswho', 'wien', 'wiki', 'williamhill', 'win', 'windows', 'wine', 'winners', 'wme', 'wolterskluwer', 'woodside', 'work', 'works', 'world', 'wow', 'ws', 'wtc', 'wtf', 'xbox', 'xerox', 'xfinity', 'xihuan', 'xin', 'xn--11b4c3d', 'xn--1ck2e1b', 'xn--1qqw23a', 'xn--2scrj9c', 'xn--30rr7y', 'xn--3bst00m', 'xn--3ds443g', 'xn--3e0b707e', 'xn--3hcrj9c', 'xn--3oq18vl8pn36a', 'xn--3pxu8k', 'xn--42c2d9a', 'xn--45br5cyl', 'xn--45brj9c', 'xn--45q11c', 'xn--4gbrim', 'xn--54b7fta0cc', 'xn--55qw42g', 'xn--55qx5d', 'xn--5su34j936bgsg', 'xn--5tzm5g', 'xn--6frz82g', 'xn--6qq986b3xl', 'xn--80adxhks', 'xn--80ao21a', 'xn--80aqecdr1a', 'xn--80asehdb', 'xn--80aswg', 'xn--8y0a063a', 'xn--90a3ac', 'xn--90ae', 'xn--90ais', 'xn--9dbq2a', 'xn--9et52u', 'xn--9krt00a', 'xn--b4w605ferd', 'xn--bck1b9a5dre4c', 'xn--c1avg', 'xn--c2br7g', 'xn--cck2b3b', 'xn--cg4bki', 'xn--clchc0ea0b2g2a9gcd', 'xn--czr694b', 'xn--czrs0t', 'xn--czru2d', 'xn--d1acj3b', 'xn--d1alf', 'xn--e1a4c', 'xn--eckvdtc9d', 'xn--efvy88h', 'xn--estv75g', 'xn--fct429k', 'xn--fhbei', 'xn--fiq228c5hs', 'xn--fiq64b', 'xn--fiqs8s', 'xn--fiqz9s', 'xn--fjq720a', 'xn--flw351e', 'xn--fpcrj9c3d', 'xn--fzc2c9e2c', 'xn--fzys8d69uvgm', 'xn--g2xx48c', 'xn--gckr3f0f', 'xn--gecrj9c', 'xn--gk3at1e', 'xn--h2breg3eve', 'xn--h2brj9c', 'xn--h2brj9c8c', 'xn--hxt814e', 'xn--i1b6b1a6a2e', 'xn--imr513n', 'xn--io0a7i', 'xn--j1aef', 'xn--j1amh', 'xn--j6w193g', 'xn--jlq61u9w7b', 'xn--jvr189m', 'xn--kcrx77d1x4a', 'xn--kprw13d', 'xn--kpry57d', 'xn--kpu716f', 'xn--kput3i', 'xn--l1acc', 'xn--lgbbat1ad8j', 'xn--mgb9awbf', 'xn--mgba3a3ejt', 'xn--mgba3a4f16a', 'xn--mgba7c0bbn0a', 'xn--mgbaakc7dvf', 'xn--mgbaam7a8h', 'xn--mgbab2bd', 'xn--mgbai9azgqp6j', 'xn--mgbayh7gpa', 'xn--mgbb9fbpob', 'xn--mgbbh1a', 'xn--mgbbh1a71e', 'xn--mgbc0a9azcg', 'xn--mgbca7dzdo', 'xn--mgberp4a5d4ar', 'xn--mgbgu82a', 'xn--mgbi4ecexp', 'xn--mgbpl2fh', 'xn--mgbt3dhd', 'xn--mgbtx2b', 
'xn--mgbx4cd0ab', 'xn--mix891f', 'xn--mk1bu44c', 'xn--mxtq1m', 'xn--ngbc5azd', 'xn--ngbe9e0a', 'xn--ngbrx', 'xn--node', 'xn--nqv7f', 'xn--nqv7fs00ema', 'xn--nyqy26a', 'xn--o3cw4h', 'xn--ogbpf8fl', 'xn--otu796d', 'xn--p1acf', 'xn--p1ai', 'xn--pbt977c', 'xn--pgbs0dh', 'xn--pssy2u', 'xn--q9jyb4c', 'xn--qcka1pmc', 'xn--qxam', 'xn--rhqv96g', 'xn--rovu88b', 'xn--rvc1e0am3e', 'xn--s9brj9c', 'xn--ses554g', 'xn--t60b56a', 'xn--tckwe', 'xn--tiq49xqyj', 'xn--unup4y', 'xn--vermgensberater-ctb', 'xn--vermgensberatung-pwb', 'xn--vhquv', 'xn--vuq861b', 'xn--w4r85el8fhu5dnra', 'xn--w4rs40l', 'xn--wgbh1c', 'xn--wgbl6a', 'xn--xhq521b', 'xn--xkc2al3hye2a', 'xn--xkc2dl3a5ee0h', 'xn--y9a3aq', 'xn--yfro4i67o', 'xn--ygbi2ammx', 'xn--zfr164b', 'xxx', 'xyz', 'yachts', 'yahoo', 'yamaxun', 'yandex', 'ye', 'yodobashi', 'yoga', 'yokohama', 'you', 'youtube', 'yt', 'yun', 'za', 'zappos', 'zara', 'zero', 'zip', 'zippo', 'zm', 'zone', 'zuerich', 'zw']
# Top-level domain: any one of the known TLD strings.
domain_tld = Or(tlds)
# Domain name: one or more dot-terminated labels followed by a TLD, anchored
# on alphanumeric word boundaries so it does not match inside larger words.
domain_name = alphanum_word_start + Combine(Combine(OneOrMore(label + ('.')))('domain_labels') + domain_tld('tld')) + alphanum_word_end

# One dotted-quad octet: a digit word whose integer value is below 256.
ipv4_section = Word(nums, asKeyword=True).addCondition(lambda tokens: int(tokens[0]) < 256)
# basically, the grammar below says: start any words that start with a '.' or a number; I want to match words that start with a '.' because this will fail later in the grammar and I do not want to match anything that start with a '.'
# FIX: the lookahead pattern is now a raw string; '\.\S' in a plain string
# relied on invalid escape sequences (SyntaxWarning in modern Python).
ipv4_address = alphanum_word_start + WordStart('.' + nums) + Combine((ipv4_section + '.') * 3 + ipv4_section) + NotAny(Regex(r'\.\S')) + alphanum_word_end

# One IPv6 group: 1-4 hexadecimal digits.
hexadectet = Word(hexnums, min=1, max=4)
ipv6_address_full = alphanum_word_start + Combine((hexadectet + ":") * 7 + hexadectet)
# todo: the ipv6_address_shortened grammar needs some fine-tuning so it doesn't pull in content too broadly
ipv6_address_shortened = Combine(OneOrMore(Or([hexadectet + Word(':'), Word(':')])) + hexadectet)
ipv6_address = Or([ipv6_address_full, ipv6_address_shortened]) + alphanum_word_end

# Parenthesised comment inside an email address.
complete_email_comment = Combine('(' + Word(printables.replace(')', '')) + ')')
# the complete_email_local_part grammar ignores the fact that characters like <<<(),:;<>@[\] >>> are possible in a quoted complete_email_local_part (and the double-quotes and backslash should be preceded by a backslash)
complete_email_local_part = Combine(Optional(complete_email_comment)('email_address_comment') + Word(alphanums + "!#$%&'*+-/=?^_`{|}~." + '"') + Optional(complete_email_comment)('email_address_comment'))
# Full address: local part, '@', then a domain name or a bracketed IP literal.
complete_email_address = Combine(complete_email_local_part('email_address_local_part') + "@" + Or([domain_name, '[' + ipv4_address + ']', '[IPv6:' + ipv6_address + ']'])('email_address_domain'))
示例#8
0
 def expr(self) -> ParserElement:
     """Return a grammar replacing a standalone ``---`` with an em dash."""
     em_dash = Keyword("---", identChars="-").setParseAction(replaceWith("—"))
     return WordStart() + em_dash + WordEnd()
示例#9
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from pyparsing import alphanums, alphas, printables, nums, hexnums
from pyparsing import OneOrMore, Word, Combine, Optional, Or, Regex, WordStart, WordEnd, replaceWith, downcaseTokens, NotAny

# Word boundaries restricted to alphanumerics, so matches must stand alone.
alphanum_word_start = WordStart(wordChars=alphanums)
alphanum_word_end = WordEnd(wordChars=alphanums)
# NOTE: the label definition ignores the fact that labels should not end in a hyphen
label = Word(initChars=alphanums, bodyChars=alphanums + '-', max=63)
# todo: build out the domain_tld list with only valid tlds
tlds = [
    'aaa', 'aarp', 'abarth', 'abb', 'abbott', 'abbvie', 'abc', 'able',
    'abogado', 'abudhabi', 'ac', 'academy', 'accenture', 'accountant',
    'accountants', 'aco', 'active', 'actor', 'ad', 'adac', 'ads', 'adult',
    'ae', 'aeg', 'aero', 'aetna', 'af', 'afamilycompany', 'afl', 'africa',
    'ag', 'agakhan', 'agency', 'ai', 'aig', 'aigo', 'airbus', 'airforce',
    'airtel', 'akdn', 'al', 'alfaromeo', 'alibaba', 'alipay', 'allfinanz',
    'allstate', 'ally', 'alsace', 'alstom', 'am', 'americanexpress',
    'americanfamily', 'amex', 'amfam', 'amica', 'amsterdam', 'analytics',
    'android', 'anquan', 'anz', 'ao', 'aol', 'apartments', 'app', 'apple',
    'aq', 'aquarelle', 'ar', 'arab', 'aramco', 'archi', 'army', 'arpa', 'art',
    'arte', 'as', 'asda', 'asia', 'associates', 'at', 'athleta', 'attorney',
    'au', 'auction', 'audi', 'audible', 'audio', 'auspost', 'author', 'auto',
    'autos', 'avianca', 'aw', 'aws', 'ax', 'axa', 'az', 'azure', 'ba', 'baby',
    'baidu', 'banamex', 'bananarepublic', 'band', 'bank', 'bar', 'barcelona',
    'barclaycard', 'barclays', 'barefoot', 'bargains', 'baseball',
    'basketball', 'bauhaus', 'bayern', 'bb', 'bbc', 'bbt', 'bbva', 'bcg',
    'bcn', 'bd', 'be', 'beats', 'beauty', 'beer', 'bentley', 'berlin', 'best',
    'bestbuy', 'bet', 'bf', 'bg', 'bh', 'bharti', 'bi', 'bible', 'bid', 'bike',
    'bing', 'bingo', 'bio', 'biz', 'bj', 'black', 'blackfriday', 'blanco',
示例#10
0
class SearchParser(object):
    """The parser for bauble.search.MapperSearch

    Class attributes define the pyparsing grammar; :meth:`parse_string`
    is the public entry point.
    """

    # Signed integer/float literal with an optional exponent.
    numeric_value = Regex(r'[-]?\d+(\.\d*)?([eE]\d+)?').setParseAction(
        NumericToken)('number')
    unquoted_string = Word(alphanums + alphas8bit + '%.-_*;:')
    # Quoted strings have their quotes stripped before tokenising.
    string_value = (quotedString.setParseAction(removeQuotes)
                    | unquoted_string).setParseAction(StringToken)('string')

    none_token = Literal('None').setParseAction(NoneToken)
    empty_token = Literal('Empty').setParseAction(EmptyToken)

    value_list = Forward()
    # |type|values| syntax for explicitly typed value lists.
    typed_value = (Literal("|") + unquoted_string + Literal("|") + value_list +
                   Literal("|")).setParseAction(TypedValueToken)

    value = (typed_value | numeric_value | none_token | empty_token
             | string_value).setParseAction(ValueToken)('value')
    # Values may be space-separated or comma-separated.
    value_list << Group(OneOrMore(value) ^ delimitedList(value)
                        ).setParseAction(ValueListAction)('value_list')

    domain = Word(alphas, alphanums)
    binop = oneOf('= == != <> < <= > >= not like contains has ilike '
                  'icontains ihas is')
    equals = Literal('=')
    star_value = Literal('*')
    domain_values = (value_list.copy())('domain_values')
    # Either "<domain> = *" or "<domain> <op> <values>", consuming the
    # whole input (stringEnd).
    domain_expression = (
        (domain + equals + star_value + stringEnd)
        | (domain + binop + domain_values + stringEnd)
    ).setParseAction(DomainExpressionAction)('domain_expression')

    # Boolean connectives, anchored on word boundaries so they do not
    # match inside identifiers.
    AND_ = WordStart() + (CaselessLiteral("AND") | Literal("&&")) + WordEnd()
    OR_ = WordStart() + (CaselessLiteral("OR") | Literal("||")) + WordEnd()
    NOT_ = WordStart() + (CaselessLiteral("NOT") | Literal('!')) + WordEnd()
    BETWEEN_ = WordStart() + CaselessLiteral("BETWEEN") + WordEnd()

    query_expression = Forward()('filter')
    # Dot-delimited identifier path.
    identifier = Group(delimitedList(Word(alphas + '_', alphanums + '_'),
                                     '.')).setParseAction(IdentifierToken)
    ident_expression = (
        Group(identifier + binop + value).setParseAction(IdentExpressionToken)
        | (Literal('(') + query_expression +
           Literal(')')).setParseAction(ParenthesisedQuery))
    between_expression = Group(identifier + BETWEEN_ + value + AND_ +
                               value).setParseAction(BetweenExpressionAction)
    # Operator precedence: NOT binds tightest, then AND, then OR.
    query_expression << infixNotation(
        (ident_expression | between_expression),
        [(NOT_, 1, opAssoc.RIGHT, SearchNotAction),
         (AND_, 2, opAssoc.LEFT, SearchAndAction),
         (OR_, 2, opAssoc.LEFT, SearchOrAction)])
    query = (domain + Keyword('where', caseless=True).suppress() +
             Group(query_expression) + stringEnd).setParseAction(QueryAction)

    # A statement is a full query, a domain expression, or a bare value list.
    statement = (query('query')
                 | domain_expression('domain')
                 | value_list('value_list')
                 ).setParseAction(StatementAction)('statement')

    def parse_string(self, text):
        '''request pyparsing object to parse text

        `text` can be either a query, or a domain expression, or a list of
        values. the `self.statement` pyparsing object parses the input text
        and return a pyparsing.ParseResults object that represents the input
        '''

        return self.statement.parseString(text)
示例#11
0
        result = set([replace(i) for i in result])
        logger.debug("result is now %s" % result)
        if None in result:
            logger.warn('removing None from result set')
            result = set(i for i in result if i is not None)
        return result


from pyparsing import (
    Word, alphas8bit, removeQuotes, delimitedList, Regex,
    OneOrMore, oneOf, alphas, alphanums, Group, Literal,
    CaselessLiteral, WordStart, WordEnd, srange,
    stringEnd, Keyword, quotedString,
    infixNotation, opAssoc, Forward)

# Plain word-boundary anchors using pyparsing's default word characters.
wordStart, wordEnd = WordStart(), WordEnd()


class SearchParser(object):
    """The parser for bauble.search.MapperSearch
    """

    numeric_value = Regex(
        r'[-]?\d+(\.\d*)?([eE]\d+)?'
        ).setParseAction(NumericToken)('number')
    unquoted_string = Word(alphanums + alphas8bit + '%.-_*;:')
    string_value = (
        quotedString.setParseAction(removeQuotes) | unquoted_string
        ).setParseAction(StringToken)('string')

    none_token = Literal('None').setParseAction(NoneToken)
示例#12
0
    NotAny,
    OneOrMore,
    Optional,
    Or,
    Regex,
    replaceWith,
    upcaseTokens,
    Word,
    WordEnd,
    WordStart,
    ZeroOrMore,
)

from data_lists import tlds, schemes

# Word boundaries restricted to alphanumerics, so matches must stand alone.
alphanum_word_start = WordStart(wordChars=alphanums)
alphanum_word_end = WordEnd(wordChars=alphanums)

# NOTE: the label definition ignores the fact that labels should not end in a hyphen
label = Word(initChars=alphanums, bodyChars=alphanums + '-', max=63)
# Top-level domain: any one of the known TLD strings.
domain_tld = Or(tlds)
# Domain name: dot-terminated labels (each dot must be followed by another
# label character) plus a TLD; the whole match is lowercased.
domain_name = (
    alphanum_word_start
    + Combine(
        Combine(OneOrMore(label + ('.' + FollowedBy(Word(alphanums + '-')))))('domain_labels') + domain_tld('tld')
    )
    + alphanum_word_end
).setParseAction(downcaseTokens)

ipv4_section = (
    Word(nums, asKeyword=True, max=3)
示例#13
0
def unqualified_identifier() -> Token:
    """Build the grammar for an unqualified identifier.

    Matches a word of alphanumerics/underscores starting at a letter
    boundary, records its source location, and validates the result via
    ``verify_identifier``.
    """
    word = WordStart(alphas) + Word(alphanums + "_") + WordEnd(alphanums + "_")
    return locatedExpr(word).setParseAction(verify_identifier).setName("Identifier")
示例#14
0
    NotAny,
    OneOrMore,
    Optional,
    Or,
    Regex,
    replaceWith,
    upcaseTokens,
    Word,
    WordEnd,
    WordStart,
    ZeroOrMore,
)

from data_lists import tlds, schemes

# Word boundaries restricted to alphanumerics, so matches must stand alone.
alphanum_word_start = WordStart(wordChars=alphanums)
alphanum_word_end = WordEnd(wordChars=alphanums)

# NOTE: the label definition ignores the fact that labels should not end in a hyphen
label = Word(initChars=alphanums, bodyChars=alphanums + '-', max=63)
# Top-level domain: any one of the known TLD strings.
domain_tld = Or(tlds)
# Domain name: dot-terminated labels (each dot must be followed by another
# label character) plus a TLD; the whole match is lowercased.
domain_name = (alphanum_word_start + Combine(
    Combine(OneOrMore(label + ('.' + FollowedBy(Word(alphanums + '-')))))
    ('domain_labels') + domain_tld('tld')) +
               alphanum_word_end).setParseAction(downcaseTokens)

# One dotted-quad octet: up to three digits, value below 256, normalised by
# stripping leading zeros.
ipv4_section = (Word(
    nums, asKeyword=True,
    max=3).setParseAction(lambda x: str(int(x[0]))).addCondition(
        lambda tokens: int(tokens[0]) < 256))
# basically, the grammar below says: start any words that start with a '.' or a number; I want to match words that start with a '.' because this will fail later in the grammar and I do not want to match anything that start with a '.'
示例#15
0
def WordBoundaries(grammar):
    """Anchor *grammar* at alphanumeric word boundaries on both sides."""
    start = WordStart(alphanums)
    end = WordEnd(alphanums)
    return start + grammar + end
示例#16
0
    def __init__(self, basedir: str = '.') -> None:
        """Construct the parser and build the full specification grammar.

        Args:
            basedir: base directory for specification files (stored on
                the instance; not accessed in this constructor).

        Note: throughout the grammar, pyparsing's ``-`` operator is an
        error-stopping And — once its left side matches, a failure on the
        right raises a parse error instead of backtracking.
        """
        self.__basedir = basedir
        # Parse results accumulated by the grammar's parse actions,
        # keyed by name.
        self.__specifications: Dict[str, Specification] = {}
        self.__pdus: Dict[str, PDU] = {}
        self.__refinements: Dict[str, Refinement] = {}

        # Generic
        comma = Suppress(Literal(','))
        comma.setName('","')
        semicolon = Suppress(Literal(';'))
        semicolon.setName('";"')

        # Comments (Ada-style "--" to end of line; ignored below)
        comment = Regex(r'--.*')

        # Names
        identifier = WordStart(alphanums) + Word(alphanums +
                                                 '_') + WordEnd(alphanums +
                                                                '_')
        identifier.setParseAction(verify_identifier)
        identifier.setName('Identifier')
        qualified_identifier = Optional(identifier + Literal('.')) - identifier
        qualified_identifier.setParseAction(lambda t: ''.join(t.asList()))
        attribute_designator = Keyword('First') | Keyword('Last') | Keyword(
            'Length')
        attribute_reference = identifier + Literal('\'') - attribute_designator
        attribute_reference.setParseAction(parse_attribute)
        attribute_reference.setName('Attribute')
        name = attribute_reference | identifier
        name.setName('Name')

        # Literals
        # Numerals may contain '_' group separators (e.g. 1_000).
        numeral = Word(nums) + ZeroOrMore(Optional(Word('_')) + Word(nums))
        numeral.setParseAction(
            lambda t: int(''.join(t.asList()).replace('_', '')))
        extended_digit = Word(nums + 'ABCDEF')
        based_numeral = extended_digit + ZeroOrMore(
            Optional('_') + extended_digit)
        # Based literal of the form <base>#<digits>#, e.g. 16#FF#.
        based_literal = numeral + Literal('#') - based_numeral - Literal('#')
        based_literal.setParseAction(
            lambda t: int(t[2].replace('_', ''), int(t[0])))
        numeric_literal = based_literal | numeral
        numeric_literal.setParseAction(lambda t: Number(t[0]))
        numeric_literal.setName('Number')
        literal = numeric_literal

        # Operators
        mathematical_operator = (Literal('**') | Literal('+') | Literal('-')
                                 | Literal('*')
                                 | Literal('/'))
        relational_operator = (Keyword('<=') | Keyword('>=') | Keyword('=')
                               | Keyword('/=')
                               | Keyword('<') | Keyword('>'))
        logical_operator = Keyword('and') | Keyword('or')

        # Expressions
        # Forward-declared: mathematical_expression is defined below via
        # infixNotation but already referenced by relation.
        mathematical_expression = Forward()
        relation = mathematical_expression + relational_operator - mathematical_expression
        relation.setParseAction(parse_relation)
        relation.setName('Relation')
        logical_expression = infixNotation(
            relation,
            [(logical_operator, 2, opAssoc.LEFT, parse_logical_expression)])
        logical_expression.setName('LogicalExpression')
        term = Keyword('null') | literal | name
        term.setParseAction(parse_term)
        mathematical_expression << infixNotation(
            term, [(mathematical_operator, 2, opAssoc.LEFT,
                    parse_mathematical_expression)])
        mathematical_expression.setName('MathematicalExpression')

        # Type Refinement
        value_constraint = Keyword('if') - logical_expression
        value_constraint.setParseAction(lambda t: t[1])
        type_refinement_definition = (
            Keyword('new') - qualified_identifier - Suppress(Literal('(')) -
            identifier - Suppress(Literal('=>')) -
            (Keyword('null') | qualified_identifier) - Suppress(Literal(')')) -
            Optional(value_constraint))
        type_refinement_definition.setName('Refinement')

        # Integer Types
        size_aspect = Keyword('Size') - Keyword('=>') - mathematical_expression
        size_aspect.setParseAction(parse_aspect)
        range_type_aspects = Keyword('with') - size_aspect
        range_type_aspects.setParseAction(parse_aspects)

        range_type_definition = (Keyword('range') - mathematical_expression -
                                 Suppress(Literal('..')) -
                                 mathematical_expression - range_type_aspects)
        range_type_definition.setName('RangeInteger')
        modular_type_definition = Keyword('mod') - mathematical_expression
        modular_type_definition.setName('ModularInteger')
        integer_type_definition = range_type_definition | modular_type_definition

        # Enumeration Types
        enumeration_literal = name
        # Positional form: literals are numbered by position (0, 1, ...).
        positional_enumeration = enumeration_literal + ZeroOrMore(
            comma - enumeration_literal)
        positional_enumeration.setParseAction(
            lambda t: [(k, Number(v)) for v, k in enumerate(t.asList())])
        # Named form: each literal gets an explicit "=> value".
        element_value_association = enumeration_literal + Keyword(
            '=>') - numeric_literal
        element_value_association.setParseAction(lambda t: (t[0], t[2]))
        named_enumeration = (element_value_association +
                             ZeroOrMore(comma - element_value_association))

        boolean_literal = Keyword('True') | Keyword('False')
        boolean_literal.setParseAction(lambda t: t[0] == 'True')
        # A bare "Always_Valid" (no "=> ...") defaults to True.
        boolean_aspect_definition = Optional(Keyword('=>') - boolean_literal)
        boolean_aspect_definition.setParseAction(lambda t:
                                                 (t if t else ['=>', True]))
        always_valid_aspect = Literal(
            'Always_Valid') - boolean_aspect_definition
        always_valid_aspect.setParseAction(parse_aspect)
        enumeration_aspects = Keyword('with') - delimitedList(
            size_aspect | always_valid_aspect)
        enumeration_aspects.setParseAction(parse_aspects)

        enumeration_type_definition = (
            Literal('(') - (named_enumeration | positional_enumeration) -
            Literal(')') - enumeration_aspects)
        enumeration_type_definition.setName('Enumeration')

        # Array Type
        unconstrained_array_definition = Keyword('array of') + name
        array_type_definition = unconstrained_array_definition
        array_type_definition.setName('Array')

        # Message Type
        first_aspect = Keyword('First') - Keyword(
            '=>') - mathematical_expression
        first_aspect.setParseAction(parse_aspect)
        length_aspect = Keyword('Length') - Keyword(
            '=>') - mathematical_expression
        length_aspect.setParseAction(parse_aspect)
        component_aspects = Keyword('with') - delimitedList(first_aspect
                                                            | length_aspect)
        component_aspects.setParseAction(parse_aspects)

        then = (Keyword('then') - (Keyword('null') | identifier) -
                Group(Optional(component_aspects)) -
                Group(Optional(value_constraint)))
        then.setParseAction(parse_then)
        then_list = then + ZeroOrMore(comma - then)
        then_list.setParseAction(lambda t: [t.asList()])
        component_list = Forward()
        message_type_definition = Keyword(
            'message') - component_list - Keyword('end message')
        message_type_definition.setName('Message')
        # Negative lookahead stops the component list from consuming the
        # "end message" terminator as a component.
        component_item = (~Keyword('end') + ~CaselessKeyword('Message') -
                          identifier + Literal(':') - name -
                          Optional(then_list) - semicolon)
        component_item.setParseAction(lambda t: Component(t[0], t[2], t[3]) if
                                      len(t) >= 4 else Component(t[0], t[2]))
        component_item.setName('Component')
        null_component_item = Keyword('null') - then - semicolon
        null_component_item.setParseAction(
            lambda t: Component(t[0], '', [t[1]]))
        null_component_item.setName('NullComponent')
        component_list << (Group(
            Optional(null_component_item) - component_item -
            ZeroOrMore(component_item)))
        component_list.setParseAction(lambda t: t.asList())

        # Types
        type_definition = (enumeration_type_definition
                           | integer_type_definition
                           | message_type_definition
                           | type_refinement_definition
                           | array_type_definition)
        type_declaration = (Keyword('type') - identifier - Keyword('is') -
                            type_definition - semicolon)
        type_declaration.setParseAction(parse_type)

        # Package
        basic_declaration = type_declaration
        package_declaration = (Keyword('package') - identifier -
                               Keyword('is') -
                               Group(ZeroOrMore(basic_declaration)) -
                               Keyword('end') - name - semicolon)
        package_declaration.setParseAction(
            lambda t: Package(t[1], t[3].asList()))

        # Context
        context_item = Keyword('with') - identifier - semicolon
        context_item.setParseAction(lambda t: t[1])
        context_clause = ZeroOrMore(context_item)
        context_clause.setParseAction(lambda t: Context(t.asList()))

        # Specification
        specification = Optional(context_clause + package_declaration)
        specification.setParseAction(lambda t: Specification(t[0], t[1])
                                     if len(t) == 2 else None)

        # Grammar
        self.__grammar = specification + StringEnd()
        self.__grammar.setParseAction(self.__evaluate_specification)
        self.__grammar.ignore(comment)
示例#17
0
)

from data_lists import (
    pre_attack_tactics,
    pre_attack_techniques,
    enterprise_attack_mitigations,
    enterprise_attack_tactics,
    enterprise_attack_techniques,
    mobile_attack_mitigations,
    mobile_attack_tactics,
    mobile_attack_techniques,
    tlds,
    schemes,
)

# Word-boundary guards: a match may not start or end in the middle of a
# longer alphanumeric token.
alphanum_word_start = WordStart(wordChars=alphanums)
alphanum_word_end = WordEnd(wordChars=alphanums)

# A DNS label: alphanumerics plus '-' and '_', at most 63 characters.
# NOTE: the label definition ignores the fact that labels should not end in a hyphen
label = Word(initChars=alphanums + '_', bodyChars=alphanums + '-_', max=63)
# Any of the known top-level domains imported from data_lists.tlds.
domain_tld = Or(tlds)
# One or more dot-separated labels followed by a known TLD, anchored at
# word boundaries; the matched text is lower-cased by downcaseTokens.
domain_name = (alphanum_word_start + Combine(
    Combine(OneOrMore(label + ('.' + FollowedBy(Word(alphanums + '-_')))))
    ('domain_labels') + domain_tld('tld')) +
               alphanum_word_end).setParseAction(downcaseTokens)

# One octet of an IPv4 address: up to three digits, normalized (leading
# zeros stripped via int round-trip) and constrained to be < 256.
ipv4_section = (Word(
    nums, asKeyword=True,
    max=3).setParseAction(lambda x: str(int(x[0]))).addCondition(
        lambda tokens: int(tokens[0]) < 256))
# The grammar below deliberately also starts matching at words that begin with
# a '.': such matches fail later in the grammar, which prevents '.'-prefixed
# text from being picked up as a valid address.
def ddlWord(string):
    """Match *string* case-insensitively as a whole word.

    Word characters are alphanumerics plus underscore; the WordStart /
    WordEnd anchors prevent matching inside a larger identifier.
    """
    word_chars = alphanums + "_"
    keyword = CaselessLiteral(string)
    return WordStart(word_chars) + keyword + WordEnd(word_chars)
示例#19
0
 def identifier(cls) -> Token:
     """Return a parser for an identifier token.

     Matches a run of alphanumerics and underscores anchored at word
     boundaries; the match is routed through the ``verify_identifier``
     parse action and named "Identifier" for error messages.
     """
     return ((WordStart(alphas) + Word(alphanums + "_") +
              WordEnd(alphanums + "_")
              ).setParseAction(verify_identifier).setName("Identifier"))