def test_file_path_unix_double_dots(self):
    """Test for Unix file paths using .."""
    self.assertEqual(tokenize_v2('..'), ['..'])
    self.assertEqual(tokenize_v2('../'), ['../'])
    self.assertEqual(tokenize_v2('../..'), ['../..'])
    self.assertEqual(tokenize_v2('/dev/..'), ['/dev/..'])
    self.assertEqual(tokenize_v2('../hello.txt'), ['../hello.txt'])
def test_class(self):
    """Test for classes."""
    self.assertEqual(tokenize_v2('TokenizerClass()'), ['TokenizerClass()'])
    self.assertEqual(tokenize_v2('TokenizerClass(object)'),
                     ['TokenizerClass(object)'])
    self.assertEqual(tokenize_v2('TokenizerClass(object.Object)'),
                     ['TokenizerClass(object.Object)'])
def test_remove_html_tags_attr(self):
    """Test for removing html tags with attributes."""
    self.assertEqual(
        tokenize_v2('<a href="https://google.com">hello</a>'), ['hello'])
    self.assertEqual(
        tokenize_v2('<a href="https://google.com" rel="nofollow" >hello</a>'),
        ['hello'])
def test_remove_html_tags(self):
    """Test for removing simple html tags."""
    self.assertEqual(tokenize_v2('<b>hello</b>'), ['hello'])
    self.assertEqual(tokenize_v2('<blockquote>hello</blockquote>'), ['hello'])
    self.assertEqual(tokenize_v2('<del>hello</del>'), ['hello'])
    self.assertEqual(tokenize_v2('<h1>hello</h1>'), ['hello'])
    self.assertEqual(tokenize_v2('<h2>hello</h2>'), ['hello'])
    self.assertEqual(tokenize_v2('<h3>hello</h3>'), ['hello'])
    self.assertEqual(tokenize_v2('<i>hello</i>'), ['hello'])
    self.assertEqual(tokenize_v2('<p>hello</p>'), ['hello'])
    self.assertEqual(tokenize_v2('<pre>hello</pre>'), ['hello'])
def tokenize_eval():
    """Tokenize and evaluate the implemented tokenizer on the annotated dataset."""
    data = pd.read_csv(os.path.join(RAW_DIR, 'TokenTagRaw.csv'))
    ovr_len_truth = 0
    ovr_len_tokens = 0
    ovr_accr = 0
    for id_ in data['Id']:
        # get truths and tokens, removing code tags if present
        truth = get_truths(id_)
        text = data[data['Id'] == id_]['Body'].values[0]
        tokens = [re.sub(r'</?code>', '', x) for x in tokenize_v2(text)]
        ovr_len_truth += len(truth)
        ovr_len_tokens += len(tokens)
        # evaluate a single post
        accr, pre, rec, f1_ = evaluate(tokens, truth)
        ovr_accr += accr
        print('Id: {}, precision: {:.3f}, recall: {:.3f}, f1: {:.3f}'.format(
            id_, pre, rec, f1_))
    # overall evaluation
    pre = ovr_accr / ovr_len_tokens
    rec = ovr_accr / ovr_len_truth
    f1_ = 2 * pre * rec / (pre + rec)
    print('Overall: precision: {:.3f}, recall: {:.3f}, f1: {:.3f}'.format(
        pre, rec, f1_))
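# NOTE: evaluate(tokens, truth) is a project helper not shown here. Based on
# how tokenize_eval() aggregates its first return value against
# ovr_len_tokens and ovr_len_truth, it is assumed to return the number of
# correctly recovered tokens plus per-post precision, recall and F1. The
# function below is only an illustrative sketch under that assumption, given
# a hypothetical name so it does not shadow the real helper.
def _evaluate_sketch(tokens, truth):
    """Hypothetical exact-match evaluation of predicted tokens vs. ground truth."""
    remaining = list(truth)
    matched = 0
    for tok in tokens:
        # count each ground-truth token at most once
        if tok in remaining:
            matched += 1
            remaining.remove(tok)
    pre = matched / len(tokens) if tokens else 0.0
    rec = matched / len(truth) if truth else 0.0
    f1_ = 2 * pre * rec / (pre + rec) if pre + rec else 0.0
    return matched, pre, rec, f1_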
def pos_irregular():
    """POS tagging on a sample of sentences with irregular tokens."""
    from nltk import pos_tag

    data = pd.read_csv(os.path.join(RAW_DIR, 'IrregularTokenSent.csv'))
    for str_ in data['Sentence']:
        print(pos_tag(tokenize_v2(str_)))
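# NOTE: nltk.pos_tag returns a list of (token, tag) pairs, so each printed
# line looks like [('open', 'VB'), ('/dev/sda1', 'NN'), ...]; the tags shown
# here are only illustrative and depend on the pretrained tagger, which
# requires nltk.download('averaged_perceptron_tagger').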
def test_mixed_7(self):
    """Mixed test case 7 (a weird one)."""
    in_string = 'C:\\WINDOWS\\$Hello world\\-txt hahaha lol.exe testing c: d: 0: c:\\ 0:\\'
    res = [
        'C:\\WINDOWS\\$Hello world\\-txt hahaha lol.exe', 'testing', 'c', ':',
        'd', ':', '0', ':', 'c:\\', '0', ':', '\\'
    ]
    self.assertEqual(tokenize_v2(in_string), res)
def test_mixed_5(self):
    """Mixed test case 5."""
    in_string = '[{testingdfig}] [e.g.] e.g i.e i.e. http://google.com google.com'
    res = [
        '[{testingdfig}]', '[e.g.]', 'e.g', 'i.e', 'i.e.',
        'http://google.com', 'google.com'
    ]
    self.assertEqual(tokenize_v2(in_string), res)
def test_mixed_4(self):
    """Mixed test case 4."""
    in_string = '555 obj.func() func(arg) oodp.method(arg) [hello] {world}'
    res = [
        '555', 'obj.func()', 'func(arg)', 'oodp.method(arg)', '[hello]',
        '{world}'
    ]
    self.assertEqual(tokenize_v2(in_string), res)
def test_mixed_3(self):
    """Mixed test case 3."""
    in_string = '_test_test $1.00 _test_ test_test $interpolateProvider ash6.sad34sdf'
    res = [
        '_test_test', '$', '1.00', '_test_', 'test_test',
        '$interpolateProvider', 'ash6.sad34sdf'
    ]
    self.assertEqual(tokenize_v2(in_string), res)
def test_mixed_2(self):
    """Mixed test case 2."""
    in_string = 'length-2 _test /nfs/an/disks/jj/home/dir/file.txt /dev/test/file.txt'
    res = [
        'length-2', '_test', '/nfs/an/disks/jj/home/dir/file.txt',
        '/dev/test/file.txt'
    ]
    self.assertEqual(tokenize_v2(in_string), res)
def test_mixed_1(self):
    """Mixed test case 1."""
    in_string = '<p>my string.</p><code>sfdsfdsfds\n\n\n\n\n\n(sdfdsfd)</code> function()'
    res = [
        'my', 'string', '.', '<code>sfdsfdsfds\n\n\n\n\n\n(sdfdsfd)</code>',
        'function()'
    ]
    self.assertEqual(tokenize_v2(in_string), res)
def test_mixed_8(self):
    """Mixed test case 8 (another weird one)."""
    in_string = 'C:\\.WINDOWS\\Hello world\\-txt.exe testing ... .. ../.. ../. ./.. . ./. ../'
    res = [
        'C:\\.WINDOWS\\Hello world\\-txt.exe', 'testing', '...', '..',
        '../..', '../.', './..', '.', './.', '../'
    ]
    self.assertEqual(tokenize_v2(in_string), res)
def test_file_path_windows_sent(self):
    """Test for recognizing Windows file paths in a sentence."""
    self.assertEqual(
        tokenize_v2('open file: C:\\Program Files\\Hello\\txt.exe and do'),
        ['open', 'file', ':', 'C:\\Program Files\\Hello\\txt.exe', 'and', 'do'])
def test_mixed_9(self):
    """Mixed test case 9."""
    in_string = 'C:\\WINDOWS\\Hello\\txt.exe testing /dev/test ../..' + \
        ' ../../test /../test http://google.com https://googl.com'
    res = [
        'C:\\WINDOWS\\Hello\\txt.exe', 'testing', '/dev/test', '../..',
        '../../test', '/../test', 'http://google.com', 'https://googl.com'
    ]
    self.assertEqual(tokenize_v2(in_string), res)
def test_mixed_10(self):
    """Mixed test case 10."""
    in_string = 'https://google.com/query#div?q=hello&a=test' + \
        ' http://google.com/query#div?q=hello&a=test google.com/query#div?q=hello&a=test'
    res = [
        'https://google.com/query#div?q=hello&a=test',
        'http://google.com/query#div?q=hello&a=test',
        'google.com/query#div?q=hello&a=test'
    ]
    self.assertEqual(tokenize_v2(in_string), res)
def tokenize_dataset():
    """Tokenize the whole dataset and print out most common tokens."""
    from collections import Counter

    # tokenize the whole dataset
    data = pd.read_csv(os.path.join(RAW_DIR, 'QueryResults.csv'))
    ovr_tokens = []
    for id_ in data['Id']:
        # get tokens, removing code tags if present
        text = data[data['Id'] == id_]['Body'].values[0]
        tokens = [re.sub(r'</?code>', '', x) for x in tokenize_v2(text)]
        ovr_tokens += tokens

    # print out the top non-English, non-digit/punctuation tokens
    counter = Counter(ovr_tokens)
    english_words = get_english_words()
    # drop keys that are English words or consist only of punctuation/digits
    drops = [x for x in counter.keys()
             if re.match(r"^[\W\d]+$", x) or x.lower() in english_words]
    for key in drops:
        counter.pop(key)
    print(counter.most_common(50))
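# NOTE: get_english_words() is another project helper not shown here; it is
# assumed to return a set of lowercase English words used to filter ordinary
# vocabulary out of the counts. Below is a minimal sketch under that
# assumption, using NLTK's words corpus and a hypothetical name.
def _get_english_words_sketch():
    """Hypothetical helper: lowercase English vocabulary as a set."""
    from nltk.corpus import words  # requires nltk.download('words')
    return set(word.lower() for word in words.words())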
def test_white_spaces_tokens(self):
    """Test for recognizing tokens with white spaces."""
    self.assertEqual(tokenize_v2(' hello '), ['hello'])
def test_function_sent(self):
    """Test for functions in a sentence."""
    self.assertEqual(
        tokenize_v2('declare functions fun1() and fun2() to'),
        ['declare', 'functions', 'fun1()', 'and', 'fun2()', 'to'])
def test_file_path_unix_sent(self):
    """Test for recognizing Unix paths in a sentence."""
    self.assertEqual(tokenize_v2('open /dev/sda1 and write'),
                     ['open', '/dev/sda1', 'and', 'write'])
def test_file_path_unix_mixed_dots(self):
    """Test for Unix file paths using both . and .."""
    self.assertEqual(tokenize_v2('./..'), ['./..'])
    self.assertEqual(tokenize_v2('../.'), ['../.'])
def test_class_sent(self):
    """Test for classes in a sentence."""
    self.assertEqual(
        tokenize_v2('create class Token() to store tokens'),
        ['create', 'class', 'Token()', 'to', 'store', 'tokens'])
def test_white_spaces(self):
    """Test for recognizing white spaces."""
    self.assertEqual(tokenize_v2(' \n \t \r '), [])
def test_file_path_unix_dir(self):
    """Test for Unix folder paths."""
    self.assertEqual(tokenize_v2('/home/nhanh/'), ['/home/nhanh/'])
def test_file_path_unix(self):
    """Test for recognizing Unix file paths."""
    self.assertEqual(tokenize_v2('/home/nhanh/hello.txt'),
                     ['/home/nhanh/hello.txt'])
def test_code_block_tokens(self):
    """Test for recognizing code blocks with surrounding tokens."""
    self.assertEqual(tokenize_v2('code<code>eval()</code>more'),
                     ['code', '<code>eval()</code>', 'more'])
def test_code_block_white_space(self):
    """Test for recognizing code blocks with white spaces."""
    self.assertEqual(tokenize_v2('<code> eval() </code>'),
                     ['<code> eval() </code>'])
def test_code_block(self):
    """Test for recognizing normal code blocks."""
    self.assertEqual(tokenize_v2('<code>eval()</code>'),
                     ['<code>eval()</code>'])
def test_code_empty(self):
    """Test for recognizing empty code blocks."""
    self.assertEqual(tokenize_v2('<code></code>'), ['<code></code>'])
def test_mixed_6(self):
    """Mixed test case 6."""
    in_string = 'test.com fdsfg <code> 2nd code</code><a href="sdgdsfdsfds">fdsfsdfdsf</a>'
    res = ['test.com', 'fdsfg', '<code> 2nd code</code>', 'fdsfsdfdsf']
    self.assertEqual(tokenize_v2(in_string), res)