Example #1
    def classify(self, test_path, output_file, filter_func, filter_words=None):
        results = []
        # score every file in the test directory and record its predicted class
        for line_counter, f in enumerate(sorted(os.listdir(test_path)), start=1):
            tokens = generate_tokens(os.path.join(test_path, f), filter_func, filter_words)
            spam_score = self.score_spam(tokens)
            ham_score = self.score_ham(tokens)
            test_class = SPAM if spam_score > ham_score else HAM
            correct_class = get_label_from_file(f)
            results.append(self.compile_output_line(
                line_counter, f, test_class, ham_score, spam_score, correct_class))
        with open(output_file, 'w') as f_to_write:
            f_to_write.write('\n'.join(results))
        return True
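Here `generate_tokens`, `score_spam`, `score_ham`, `compile_output_line`, `get_label_from_file`, `SPAM`, and `HAM` are helpers from the surrounding project and are not shown. A minimal sketch of what a word-level `generate_tokens` of this kind could look like (the body below is an assumption, not the project's actual implementation):

def generate_tokens(path, filter_func=None, filter_words=None):
    """Hypothetical sketch: collect lower-cased word tokens from a file."""
    tokens = []
    with open(path, encoding='utf-8', errors='ignore') as fh:
        for line in fh:
            for raw in line.split():
                token = raw.strip('.,;:!?()"\'').lower()
                if not token:
                    continue
                if filter_words and token in filter_words:
                    continue  # drop explicitly excluded words
                if filter_func and not filter_func(token):
                    continue  # drop words rejected by the filter callback
                tokens.append(token)
    return tokens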
Example #2
    def __init__(self, readline, offset=(0, 0), is_fast_parser=False):
        self.readline = readline
        # wrap the standard-library tokenizer around the readline callable
        self.gen = tokenize.generate_tokens(readline)
        self.offset = offset
        self.closed = False
        self.is_first = True
        self.push_backs = []

        # fast parser options
        self.is_fast_parser = is_fast_parser
        # placeholder token shaped like (type, string, start, end, line)
        self.current = self.previous = [None, None, (0, 0), (0, 0), '']
        self.in_flow = False
        self.new_indent = False
        self.parser_indent = self.old_parser_indent = 0
        self.is_decorator = False
        self.first_stmt = True
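This wrapper delegates to the standard-library `tokenize.generate_tokens`, which pulls source through the `readline` callable one line at a time and yields 5-tuples of `(type, string, start, end, line)`; the placeholder assigned to `self.current` mirrors that shape. A standalone demonstration of the underlying generator:

import io
import tokenize

source = "def add(a, b):\n    return a + b\n"

# io.StringIO supplies the readline callable that generate_tokens expects
for tok_type, tok_string, start, end, line in tokenize.generate_tokens(
        io.StringIO(source).readline):
    print(tokenize.tok_name[tok_type], repr(tok_string), start, end)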
Example #3
    def _get_path_until_cursor(self, start_pos=None):
        def fetch_line():
            if self._is_first:
                self._is_first = False
                self._line_length = self._column_temp
                line = self._first_line
            else:
                line = self.get_line(self._line_temp)
                self._line_length = len(line)
                line = line + '\n'
            # add lines with a backslash at the end
            while True:
                self._line_temp -= 1
                last_line = self.get_line(self._line_temp)
                if last_line and last_line[-1] == '\\':
                    line = last_line[:-1] + ' ' + line
                    self._line_length = len(last_line)
                else:
                    break
            # the tokenizer consumes the text right-to-left, so return it reversed
            return line[::-1]

        self._is_first = True
        self._line_temp, self._column_temp = start_cursor = start_pos
        self._first_line = self.get_line(self._line_temp)[:self._column_temp]

        open_brackets = ['(', '[', '{']
        close_brackets = [')', ']', '}']

        gen = tokenize.generate_tokens(fetch_line)
        string = ''
        level = 0
        force_point = False
        last_type = None
        try:
            for token_type, tok, start, end, line in gen:
                # re-insert the space between two adjacent name tokens
                if last_type == token_type == tokenize.NAME:
                    string += ' '

                # the text is reversed, so closing brackets open a level
                if level > 0:
                    if tok in close_brackets:
                        level += 1
                    if tok in open_brackets:
                        level -= 1
                elif tok == '.':
                    force_point = False
                elif force_point:
                    # the line is reversed, so '.' followed by digits would be
                    # lexed as a single float NUMBER token
                    if token_type == tokenize.NUMBER and tok[0] == '.':
                        force_point = False
                    else:
                        break
                elif tok in close_brackets:
                    level += 1
                elif token_type in [tokenize.NAME, tokenize.STRING]:
                    force_point = True
                elif token_type == tokenize.NUMBER:
                    pass
                else:
                    self._column_temp = self._line_length - end[1]
                    break

                x = start_pos[0] - end[0] + 1
                l = self.get_line(x)
                l = self._first_line if x == start_pos[0] else l
                start_cursor = x, len(l) - end[1]
                self._column_temp = self._line_length - end[1]
                string += tok
                last_type = token_type
        except tokenize.TokenError:
            debug.warning("Tokenize couldn't finish", sys.exc_info())

        # string can still contain spaces at the end
        return string[::-1].strip(), start_cursor
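The key idea above is that `fetch_line` hands each line to `tokenize.generate_tokens` reversed, so the tokenizer effectively walks backwards from the cursor; that is also why closing brackets increase the nesting level and why a dot can merge with digits into a float. A stripped-down, single-line sketch of the same trick (`path_before_cursor` is a hypothetical helper that only handles plain dotted names, with no brackets or continuation lines):

import io
import tokenize

def path_before_cursor(line, column):
    """Return the dotted name path that ends at `column` in `line`."""
    reversed_part = line[:column][::-1]
    gen = tokenize.generate_tokens(io.StringIO(reversed_part).readline)
    collected = ''
    last_type = None
    try:
        for tok_type, tok, _start, _end, _line in gen:
            if tok_type == tokenize.NAME and last_type == tokenize.NAME:
                break              # two adjacent names: the path has ended
            if tok_type == tokenize.NAME or tok == '.':
                collected += tok   # still reversed at this point
                last_type = tok_type
            else:
                break
    except tokenize.TokenError:
        pass                       # truncated input is expected here
    return collected[::-1]         # un-reverse to recover the real path

print(path_before_cursor("result = json.loads", 19))  # -> 'json.loads'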
Example #4
    def _get_path_until_cursor(self, start_pos=None):
        def fetch_line():
            line = self.get_line(self._line_temp)
            if self._is_first:
                self._is_first = False
                self._line_length = self._column_temp
                line = line[:self._column_temp]
            else:
                self._line_length = len(line)
                line = line + '\n'
            # add lines with a backslash at the end
            while True:
                self._line_temp -= 1
                last_line = self.get_line(self._line_temp)
                if last_line and last_line[-1] == '\\':
                    line = last_line[:-1] + ' ' + line
                    self._line_length = len(last_line)
                else:
                    break
            return line[::-1]

        self._is_first = True
        if start_pos is None:
            self._line_temp = self.position[0]
            self._column_temp = self.position[1]
        else:
            self._line_temp, self._column_temp = start_pos

        open_brackets = ['(', '[', '{']
        close_brackets = [')', ']', '}']

        gen = tokenize.generate_tokens(fetch_line)
        string = ''
        level = 0
        force_point = False
        last_type = None
        try:
            for token_type, tok, start, end, line in gen:
                # re-insert the space between two adjacent name tokens
                if last_type == token_type == tokenize.NAME:
                    string += ' '

                # the text is reversed, so closing brackets open a level
                if level > 0:
                    if tok in close_brackets:
                        level += 1
                    if tok in open_brackets:
                        level -= 1
                elif tok == '.':
                    force_point = False
                elif force_point:
                    # the line is reversed, so '.' followed by digits would be
                    # lexed as a single float NUMBER token
                    if token_type == tokenize.NUMBER and tok[0] == '.':
                        force_point = False
                    else:
                        break
                elif tok in close_brackets:
                    level += 1
                elif token_type in [tokenize.NAME, tokenize.STRING]:
                    force_point = True
                elif token_type == tokenize.NUMBER:
                    pass
                else:
                    self._column_temp = self._line_length - end[1]
                    break

                self._column_temp = self._line_length - end[1]
                string += tok
                last_type = token_type
        except tokenize.TokenError:
            debug.warning("Tokenize couldn't finish", sys.exc_info())

        # string can still contain spaces at the end
        return string[::-1].strip()
Example #5
    def test_tokenize_comment(self):
        self.assertIn(TokenType.COMMENT_IN_LINE, generate_tokens(self.code_js))
Example #6
    def test_tokenize_line_terminator(self):
        self.assertIn(TokenType.LINE_TERMINATOR, generate_tokens(self.code_js))
Example #7
    def test_tokenize_space(self):
        self.assertIn(TokenType.SPACE, generate_tokens(self.code_js))
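These three tests target a project-specific JavaScript tokenizer rather than the standard library: `TokenType` and `generate_tokens` come from the module under test, and `self.code_js` is a JavaScript source fixture. A minimal sketch of the interface the assertions imply (the enum members named in the tests are real; the regex-based body is an assumption):

import enum
import re

class TokenType(enum.Enum):
    COMMENT_IN_LINE = 'comment_in_line'
    LINE_TERMINATOR = 'line_terminator'
    SPACE = 'space'
    OTHER = 'other'

# order matters: comments must win over the generic OTHER pattern
_PATTERNS = [
    (TokenType.COMMENT_IN_LINE, re.compile(r'//[^\n]*')),
    (TokenType.LINE_TERMINATOR, re.compile(r'\n')),
    (TokenType.SPACE, re.compile(r'[^\S\n]+')),
    (TokenType.OTHER, re.compile(r'\S+')),
]

def generate_tokens(source):
    """Return the set of TokenTypes found in `source` (simplified)."""
    found = set()
    pos = 0
    while pos < len(source):
        for token_type, pattern in _PATTERNS:
            match = pattern.match(source, pos)
            if match:
                found.add(token_type)
                pos = match.end()
                break
        else:
            pos += 1  # unreachable with the patterns above; kept as a guard
    return found

code_js = "var x = 1; // the answer\n"
assert TokenType.COMMENT_IN_LINE in generate_tokens(code_js)
assert TokenType.LINE_TERMINATOR in generate_tokens(code_js)
assert TokenType.SPACE in generate_tokens(code_js)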