# Exercise 5b - Set complements
#
# Scenario: you're processing some DNA sequences and notice that some of
# them have been corrupted - they contain letters other than A, T, C and G!
# The regex below identifies the corrupted sequences so they can be removed.
from test_regex import test_regex

# Complemented set: matches any single character that is NOT A, T, C or G,
# so a sequence containing even one such character is flagged.
not_dna_regex = r'[^ATCG]'

corrupted_sequences = ['ATCHGA', '#*@!']    # should match
clean_sequences = ['AGGGGGCTAA', 'ACGAT']   # shouldn't match

test_regex('Not DNA regex', not_dna_regex, corrupted_sequences, clean_sequences)

# Notice that the regex matches non-alphabetic characters as well!
# Exercise 7
import re
from test_regex import test_regex

# Goal: match filenames ending in .doc and .odt
#   \.          a literal dot (escaped, so it does not mean "any character")
#   (doc|odt)   either of the two extensions
#   $           the extension must come at the end of the name
word_file_regex = r'\.(doc|odt)$'

word_files = ['math_girls.doc', 'mathgirls2.odt']       # should match
not_word_files = ['other.ddt', 'odt', 'mathgirlsodt']   # shouldn't match

test_regex('Word file regex', word_file_regex, word_files, not_word_files)
# Exercise 2 - Specifying custom sets of characters
#
# The previous slide introduced some metacharacters: \w, \d, \s
# Define custom character ranges that are equivalent to these,
# WITHOUT using metacharacters (escape sequences like \n are fine).
from test_regex import test_regex

# Exercise 2a: match definition of \d
digit_regex = r'[...]'

# Exercise 2b: match definition of \w (at least, for English)
wordlike_regex = r'[...]'

# Exercise 2c: match definition of \s
spacelike_regex = r'[...]'

# No need to edit below this point.
# Run this program to see if your answers were correct.
# Each entry: (label, regex, characters that should match,
#              characters that shouldn't match)
checks = [
    ('Digit regex', digit_regex, '0123456789', 'aZ \t\n$_'),
    ('Wordlike regex', wordlike_regex, 'ampBLY_058', ' \t\n-$'),
    ('Spacelike regex', spacelike_regex, ' \t\r\n', 'ahlB_-$9'),
]
for label, regex, accepted, rejected in checks:
    test_regex(label, regex, accepted, rejected)
# Exercise 7
import re
from test_regex import test_regex

# Write a regex that matches filenames ending in .doc and .odt
word_file_regex = r'...'

word_files = ['math_girls.doc', 'mathgirls2.odt']   # should match
# 'mathgirlsodt' has no dot before the extension; it is included so that an
# answer which forgets to escape the dot (where '.' means "any character")
# is caught rather than silently accepted.
not_word_files = ['other.ddt', 'odt', 'mathgirlsodt']   # shouldn't match

test_regex('Word file regex', word_file_regex, word_files, not_word_files)
import re
from test_regex import test_regex

# ---------------
# Exercise 6a
# ---------------
# Goal: match the words bad, baad, baaad, ... for any arbitrary number of a's.
#   \b ... \b   word boundaries, so the bleat has to be a whole word
#   a+          one or more a's between the b and the d
animal_farm_bleat_regex = r'\bba+d\b'

bleats = ['baaaaaaad', 'This is baaaaaaaaaaad!']        # should match these
non_bleats = ['bd', 'bacd', 'baaaaa', 'sinbaaaad']      # shouldn't match these
test_regex('Animal farm bleat', animal_farm_bleat_regex, bleats, non_bleats)

# ---------------
# Exercise 6b
# ---------------
# Goal: match the word baaad (3 a's) up to baaaaaaaaaad (10 a's).
#   a{3,10}     between three and ten a's, inclusive
animal_farm_short_bleat_regex = r'\bba{3,10}d\b'

short_bleats = ['baaaaaaad', 'This is baaaaaaaaaad!']   # should match these
non_short_bleats = [                                    # not these
    'bd',
    'bacd',
    'baaaaa',
    'sinbaaaad',
    'baad',
    'baaaaaaaaaaaad',
]
test_regex('Animal farm short bleat', animal_farm_short_bleat_regex,
           short_bleats, non_short_bleats)

# ---------------
# Exercise 5c - Set complements
#
# You're looking at a list of purchases again.
#   * Acme company uses product codes with A,B,C followed by three digits.
#   * Axeme company uses product codes X,Y,Z followed by three digits.
from test_regex import test_regex

# Task: extract lines with just Acme and Axeme's product codes,
# *using a set complement* rather than r'[ABCXYZ]...'
# Also, make sure that the product code occurs at the *end* of the line.
#
#   [^D-W]   any character except uppercase D through W
#   \d\d\d   exactly three digits
#   $        anchors the code to the end of the line; without it a code that
#            appears mid-line (see the last non-match below) is also accepted
acme_axeme_regex = r'[^D-W]\d\d\d$'

# No need to edit below this line
matches = [
    'This line contains Acme product code C180',
    'This line contains Axeme product X007',
]
nonmatches = [
    'This line contains Hugo\'s product code J982',
    'This line contains an incorrect Axeme product code Z48A',
    'This line contains Acme code C180 but it\'s not at the end.',
]
test_regex('Acme/Axeme regex', acme_axeme_regex, matches, nonmatches)

# Notice that we did something a bit dangerous here -
# the regex also matches product codes like '0000'.
# So, be careful when using set complements!
import re
from test_regex import test_regex

# ---------------
# Exercise 6a
# ---------------
# Write a regex that matches the words bad, baad, baaad
# for any arbitrary number of a's
animal_farm_bleat_regex = r'...'  # replace the ellipsis with your regex

bleats = ['baaaaaaad', 'This is baaaaaaaaaaad!']        # should match these
non_bleats = ['bd', 'bacd', 'baaaaa', 'sinbaaaad']      # shouldn't match these
test_regex('Animal farm bleat', animal_farm_bleat_regex, bleats, non_bleats)

# ---------------
# Exercise 6b
# ---------------
# Write a regex that matches the word baaad (3 a's)
# up to baaaaaaaaaad (10 a's).
animal_farm_short_bleat_regex = r'...'  # replace the ellipsis with your regex

short_bleats = ['baaaaaaad', 'This is baaaaaaaaaad!']   # should match these
non_short_bleats = [                                    # not these
    'bd',
    'bacd',
    'baaaaa',
    'sinbaaaad',
    'baad',
    'baaaaaaaaaaaad',
]
test_regex('Animal farm short bleat', animal_farm_short_bleat_regex,
           short_bleats, non_short_bleats)

# ---------------
# Exercise 2 - Specifying custom sets of characters
#
# We saw some metacharacters in the previous slide: \w, \d, \s
# Define custom character ranges that are equivalent to these,
# WITHOUT using metacharacters (escape sequences like \n are fine).

# Exercise 2a: match definition of \d
digit_regex = r'[0-9]'

# Exercise 2b: match definition of \w (at least, for English)
wordlike_regex = r'[a-zA-Z0-9_]'

# Exercise 2c: match definition of \s
# The ASCII definition of \s is [ \t\n\r\f\v]: besides space, tab, newline and
# carriage return it also covers form feed (\f) and vertical tab (\v), so
# both are listed here to make the set genuinely equivalent.
spacelike_regex = r'[ \t\n\r\f\v]'

# No need to edit below this point.
# Run this program to see if your answers were correct.
from test_regex import test_regex

test_regex('Digit regex', digit_regex, '0123456789', 'aZ \t\n$_')
test_regex('Wordlike regex', wordlike_regex, 'ampBLY_058', ' \t\n-$')
test_regex('Spacelike regex', spacelike_regex, ' \t\r\n', 'ahlB_-$9')
# Exercise 5a - Set complements
from test_regex import test_regex

# Both checks below use the same inputs: the three literal characters that
# should be matched, and two ordinary characters that should not.
literal_metacharacters = '^$.'
ordinary_characters = 'a1'

# This is the metacharacter_regex from Exercise 4d with its characters
# rearranged. Why doesn't it work as we wanted?
# (It is left in its non-working form on purpose.)
metacharacter_regex = r'[^$.]'
test_regex('Metacharacter regex', metacharacter_regex,
           literal_metacharacters, ordinary_characters)

# The regex edited to make it work, in some way OTHER than rearranging the
# characters again: every character is backslash-escaped, so the leading ^
# is a literal caret.
edited_metacharacter_regex = r'[\^\$\.]'
test_regex('Edited Metacharacter regex', edited_metacharacter_regex,
           literal_metacharacters, ordinary_characters)

# So the rule "within character sets [], metacharacters other than the
# backslash are matched as literal characters" holds, with one EXCEPTION:
# ^ means complement if it's the first character, as in [^...].
# Elsewhere in the set, it means the caret character.
# The purpose of this exercise is to illustrate that, within character
# sets [], metacharacters are matched as ordinary literal characters.
import re
from test_regex import test_regex

# Instructions: read the regex, and run the code to verify that
# it does indeed match the literal characters $, ^ and .
metacharacter_regex = r'[$^.]'

# Test the regex to make sure it matches '^', '$', '.' and not 'a' or '1'.
literal_metacharacters = '^$.'
ordinary_characters = 'a1'
test_regex('Metacharacter regex', metacharacter_regex,
           literal_metacharacters, ordinary_characters)
# Exercise 5a - Set complements
import re
from test_regex import test_regex

# Both checks below use the same inputs: the three literal characters that
# should be matched, and two ordinary characters that should not.
literal_metacharacters = '^$.'
ordinary_characters = 'a1'

# Let's rearrange the metacharacter_regex from Exercise 4d.
# Why doesn't it work as we wanted?
metacharacter_regex = r'[^$.]'
test_regex('Metacharacter regex', metacharacter_regex,
           literal_metacharacters, ordinary_characters)

# Edit the metacharacter regex below to make it work,
# in some way OTHER than rearranging the characters again.
edited_metacharacter_regex = r'[^$.]'
test_regex('Edited Metacharacter regex', edited_metacharacter_regex,
           literal_metacharacters, ordinary_characters)

# So the rule "within character sets [], metacharacters other than the
# backslash are matched as literal characters" holds, with one EXCEPTION:
# ^ means complement if it's the first character, as in [^...].
# Elsewhere in the set, it means the caret character.
# Exercise 5c - Set complements
#
# You're looking at a list of purchases again.
# Acme company uses product codes with A,B,C followed by three digits.
# Axeme company uses product codes X,Y,Z followed by three digits.
from test_regex import test_regex

# Write a regex to extract lines with just Acme and Axeme's product codes,
# *using a set complement* rather than r'[ABCXYZ]...'
# Also, make sure that the product code occurs at the *end* of the line.
#
# The trailing $ is what enforces the end-of-line requirement: without it,
# the complemented set plus three digits also matches a code sitting in the
# middle of a line, such as the C180 in the last non-match below.
acme_axeme_regex = r'[^D-W]\d\d\d$'

# No need to edit below this line
matches = [
    'This line contains Acme product code C180',
    'This line contains Axeme product X007',
]
nonmatches = [
    'This line contains Hugo\'s product code J982',
    'This line contains an incorrect Axeme product code Z48A',
    'This line contains Acme code C180 but it\'s not at the end.',
]

test_regex('Acme/Axeme regex', acme_axeme_regex, matches, nonmatches)

# Notice that we did something a bit dangerous here -
# the regex also matches product codes like '0000'.
# So, be careful when using set complements!