Example #1
 def test_file_path_unix_double_dots(self):
     """Test for Unix file paths using .."""
     self.assertEqual(tokenize_v2('..'), ['..'])
     self.assertEqual(tokenize_v2('../'), ['../'])
     self.assertEqual(tokenize_v2('../..'), ['../..'])
     self.assertEqual(tokenize_v2('/dev/..'), ['/dev/..'])
     self.assertEqual(tokenize_v2('../hello.txt'), ['../hello.txt'])
Example #2
 def test_class(self):
     """Test for classes."""
     self.assertEqual(tokenize_v2('TokenizerClass()'), ['TokenizerClass()'])
     self.assertEqual(tokenize_v2('TokenizerClass(object)'),
                      ['TokenizerClass(object)'])
     self.assertEqual(tokenize_v2('TokenizerClass(object.Object)'),
                      ['TokenizerClass(object.Object)'])
Example #3
 def test_remove_html_tags_attr(self):
     """Test for removing html tags with attributes."""
     self.assertEqual(tokenize_v2('<a href="https://google.com">hello</a>'),
                      ['hello'])
     self.assertEqual(
         tokenize_v2(
             '<a href="https://google.com" rel="nofollow" >hello</a>'),
         ['hello'])
Example #4
 def test_remove_html_tags(self):
     """Test for removing simple html tags."""
     self.assertEqual(tokenize_v2('<b>hello</b>'), ['hello'])
     self.assertEqual(tokenize_v2('<blockquote>hello</blockquote>'),
                      ['hello'])
     self.assertEqual(tokenize_v2('<del>hello</del>'), ['hello'])
     self.assertEqual(tokenize_v2('<h1>hello</h1>'), ['hello'])
     self.assertEqual(tokenize_v2('<h2>hello</h2>'), ['hello'])
     self.assertEqual(tokenize_v2('<h3>hello</h3>'), ['hello'])
     self.assertEqual(tokenize_v2('<i>hello</i>'), ['hello'])
     self.assertEqual(tokenize_v2('<p>hello</p>'), ['hello'])
     self.assertEqual(tokenize_v2('<pre>hello</pre>'), ['hello'])
Example #5
def tokenize_eval():
    """Tokenize and evaluate the implemented tokenizer on the annotated dataset."""
    data = pd.read_csv(os.path.join(RAW_DIR, 'TokenTagRaw.csv'))

    ovr_len_truth = 0
    ovr_len_tokens = 0
    ovr_accr = 0
    for id_ in data['Id']:
        # get ground-truth and predicted tokens, stripping <code> tags if present
        truth = get_truths(id_)
        text = data[data['Id'] == id_]['Body'].values[0]
        tokens = [re.sub(r'</?code>', '', x) for x in tokenize_v2(text)]
        ovr_len_truth += len(truth)
        ovr_len_tokens += len(tokens)
        # evaluate single post
        accr, pre, rec, f1_ = evaluate(tokens, truth)
        ovr_accr += accr
        print('Id: {}, precision: {:.3f}, recall: {:.3f}, f1: {:.3f}'.format(
            id_, pre, rec, f1_))
    # overall evaluation
    pre = ovr_accr / ovr_len_tokens
    rec = ovr_accr / ovr_len_truth
    f1_ = 2 * pre * rec / (pre + rec)
    print('Overall: precision: {:.3f}, recall: {:.3f}, f1: {:.3f}'.format(
        pre, rec, f1_))
Example #6
def pos_irregular():
    """POS tagging on a sample of sentences with irregular tokens."""
    from nltk import pos_tag

    data = pd.read_csv(os.path.join(RAW_DIR, 'IrregularTokenSent.csv'))
    for str_ in data['Sentence']:
        print(pos_tag(tokenize_v2(str_)))
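Note: a minimal usage sketch for the POS-tagging step, assuming NLTK is installed and the project's tokenize_v2 is importable; the tagger model download is only needed once, and the sample sentence is made up for illustration:

import nltk
from nltk import pos_tag

nltk.download('averaged_perceptron_tagger')  # one-time tagger model download
# pos_tag() returns a list of (token, tag) tuples over the custom tokens
print(pos_tag(tokenize_v2('call obj.run() on /dev/sda1 before ../cleanup.sh')))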
Example #7
 def test_mixed_7(self):
     """Mixed test case 7 (a weird one)."""
     in_string = 'C:\\WINDOWS\\$Hello world\\-txt hahaha lol.exe testing c: d: 0: c:\\ 0:\\'
     res = [
         'C:\\WINDOWS\\$Hello world\\-txt hahaha lol.exe', 'testing', 'c',
         ':', 'd', ':', '0', ':', 'c:\\', '0', ':', '\\'
     ]
     self.assertEqual(tokenize_v2(in_string), res)
Example #8
 def test_mixed_5(self):
     """Mixed test case 5."""
     in_string = '[{testingdfig}] [e.g.] e.g i.e i.e. http://google.com google.com'
     res = [
         '[{testingdfig}]', '[e.g.]', 'e.g', 'i.e', 'i.e.',
         'http://google.com', 'google.com'
     ]
     self.assertEqual(tokenize_v2(in_string), res)
Example #9
 def test_mixed_4(self):
     """Mixed test case 4."""
     in_string = '555 obj.func() func(arg) oodp.method(arg) [hello] {world}'
     res = [
         '555', 'obj.func()', 'func(arg)', 'oodp.method(arg)', '[hello]',
         '{world}'
     ]
     self.assertEqual(tokenize_v2(in_string), res)
Example #10
 def test_mixed_3(self):
     """Mixed test case 3."""
     in_string = '_test_test $1.00 _test_ test_test $interpolateProvider ash6.sad34sdf'
     res = [
         '_test_test', '$', '1.00', '_test_', 'test_test',
         '$interpolateProvider', 'ash6.sad34sdf'
     ]
     self.assertEqual(tokenize_v2(in_string), res)
Example #11
 def test_mixed_2(self):
     """Mixed test case 2."""
     in_string = 'length-2 _test /nfs/an/disks/jj/home/dir/file.txt /dev/test/file.txt'
     res = [
         'length-2', '_test', '/nfs/an/disks/jj/home/dir/file.txt',
         '/dev/test/file.txt'
     ]
     self.assertEqual(tokenize_v2(in_string), res)
Example #12
 def test_mixed_1(self):
     """Mixed test case 1."""
     in_string = '<p>my string.</p><code>sfdsfdsfds\n\n\n\n\n\n(sdfdsfd)</code> function()'
     res = [
         'my', 'string', '.',
         '<code>sfdsfdsfds\n\n\n\n\n\n(sdfdsfd)</code>', 'function()'
     ]
     self.assertEqual(tokenize_v2(in_string), res)
Example #13
 def test_mixed_8(self):
     """Mixed test case 8 (another weird one)."""
     in_string = 'C:\\.WINDOWS\\Hello world\\-txt.exe testing ... .. ../.. ../. ./.. . ./. ../'
     res = [
         'C:\\.WINDOWS\\Hello world\\-txt.exe', 'testing', '...', '..',
         '../..', '../.', './..', '.', './.', '../'
     ]
     self.assertEqual(tokenize_v2(in_string), res)
Example #14
 def test_file_path_windows_sent(self):
     """Test for recognizing Windows file paths in a sentence."""
     self.assertEqual(
         tokenize_v2('open file: C:\\Program Files\\Hello\\txt.exe and do'),
         [
             'open', 'file', ':', 'C:\\Program Files\\Hello\\txt.exe',
             'and', 'do'
         ])
Example #15
 def test_mixed_9(self):
     """Mixed test case 9."""
     in_string = 'C:\\WINDOWS\\Hello\\txt.exe testing /dev/test ../..' + \
         ' ../../test /../test http://google.com https://googl.com'
     res = [
         'C:\\WINDOWS\\Hello\\txt.exe', 'testing', '/dev/test', '../..',
         '../../test', '/../test', 'http://google.com', 'https://googl.com'
     ]
     self.assertEqual(tokenize_v2(in_string), res)
Example #16
 def test_mixed_10(self):
     """Mixed test case 10."""
     in_string = 'https://google.com/query#div?q=hello&a=test' + \
         ' http://google.com/query#div?q=hello&a=test google.com/query#div?q=hello&a=test'
     res = [
         'https://google.com/query#div?q=hello&a=test',
         'http://google.com/query#div?q=hello&a=test',
         'google.com/query#div?q=hello&a=test'
     ]
     self.assertEqual(tokenize_v2(in_string), res)
Example #17
def tokenize_dataset():
    """Tokenize the whole dataset and print out most common tokens."""
    from collections import Counter

    # tokenize whole dataset
    data = pd.read_csv(os.path.join(RAW_DIR, 'QueryResults.csv'))
    ovr_tokens = []
    for id_ in data['Id']:
        # get tokens, stripping <code> tags if present
        text = data[data['Id'] == id_]['Body'].values[0]
        tokens = [re.sub(r'</?code>', '', x) for x in tokenize_v2(text)]
        ovr_tokens += tokens

    # print the top tokens that are not English words, digits or punctuation
    counter = Counter(ovr_tokens)
    english_words = get_english_words()
    # drop English words and keys consisting only of punctuation/digits
    drops = [x for x in counter.keys() if re.match(r"^[\W\d]+$", x)
             or x.lower() in english_words]
    for key in drops:
        counter.pop(key)
    print(counter.most_common(50))
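Note: get_english_words() is referenced but not defined in these examples. A plausible sketch, assuming it returns a lowercase set built from NLTK's words corpus (the corpus choice and the download call are assumptions):

import nltk
from nltk.corpus import words

def get_english_words():
    """Hypothetical helper: lowercase dictionary words for fast membership checks."""
    nltk.download('words', quiet=True)  # one-time corpus download
    return {w.lower() for w in words.words()}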
Example #18
 def test_white_spaces_tokens(self):
     """Test for recognizing tokens with white spaces."""
     self.assertEqual(tokenize_v2('      hello       '), ['hello'])
Example #19
 def test_function_sent(self):
     """Test for functions in a sentence."""
     self.assertEqual(
         tokenize_v2('declare functions fun1() and fun2() to'),
         ['declare', 'functions', 'fun1()', 'and', 'fun2()', 'to'])
Example #20
 def test_file_path_unix_sent(self):
     """Test for recognizing Unix paths in a sentence."""
     self.assertEqual(tokenize_v2('open /dev/sda1 and write'),
                      ['open', '/dev/sda1', 'and', 'write'])
Example #21
 def test_file_path_unix_mixed_dots(self):
     """Test for Unix file paths using both . and .."""
     self.assertEqual(tokenize_v2('./..'), ['./..'])
     self.assertEqual(tokenize_v2('../.'), ['../.'])
Example #22
 def test_class_sent(self):
     """Test for classes in a sentence."""
     self.assertEqual(
         tokenize_v2('create class Token() to store tokens'),
         ['create', 'class', 'Token()', 'to', 'store', 'tokens'])
Example #23
 def test_white_spaces(self):
     """Test for recognizing white spaces."""
     self.assertEqual(tokenize_v2(' \n \t \r '), [])
Example #24
 def test_file_path_unix_dir(self):
     """Test for Unix folder paths."""
     self.assertEqual(tokenize_v2('/home/nhanh/'), ['/home/nhanh/'])
Example #25
 def test_file_path_unix(self):
     """Test for recognizing Unix file paths."""
     self.assertEqual(tokenize_v2('/home/nhanh/hello.txt'),
                      ['/home/nhanh/hello.txt'])
Example #26
 def test_code_block_tokens(self):
     """Test for recognizing code blocks with surrounding tokens."""
     self.assertEqual(tokenize_v2('code<code>eval()</code>more'),
                      ['code', '<code>eval()</code>', 'more'])
Example #27
 def test_code_block_white_space(self):
     """Test for recognizing code blocks with white spaces."""
     self.assertEqual(tokenize_v2('<code> eval() </code>'),
                      ['<code> eval() </code>'])
Example #28
 def test_code_block(self):
     """Test for recognizing normal code blocks."""
     self.assertEqual(tokenize_v2('<code>eval()</code>'),
                      ['<code>eval()</code>'])
Example #29
 def test_code_empty(self):
     """Test for recognizing empty code blocks."""
     self.assertEqual(tokenize_v2('<code></code>'), ['<code></code>'])
Example #30
 def test_mixed_6(self):
     """Mixed test case 6."""
     in_string = 'test.com fdsfg <code> 2nd code</code><a href="sdgdsfdsfds">fdsfsdfdsf</a>'
     res = ['test.com', 'fdsfg', '<code> 2nd code</code>', 'fdsfsdfdsf']
     self.assertEqual(tokenize_v2(in_string), res)