def extract_tokens(list_of_tweets_as_str, count_usernames=True,is_single=False): if is_single: list_of_words_in_tweet = set([word for word in list_of_tweets_as_str.lower().split() if all([not word.isdigit(),not isdecimal(word)])]) else: list_of_words_in_tweet = set([word for tweet in list_of_tweets_as_str for word in tweet.lower().split() if all([not word.isdigit(),not isdecimal(word)])]) list_of_words_in_tweet -= set(string.punctuation) list_of_words_in_tweet = {token.replace('-','').replace('_','').replace('.','').replace("'",'').replace('/','').replace('*','') for token in list_of_words_in_tweet if len(token)>3} list_of_words_in_tweet = {token if token not in standard_spelling else standard_spelling[token] for token in list_of_words_in_tweet} list_of_words_in_tweet = {lmtzr.lemmatize(token,'v' if len(token)>4 and token.endswith('ing') or token.endswith('ed') else 'n') for token in list_of_words_in_tweet} usernames = {token for token in list_of_words_in_tweet if isusername(token)} if count_usernames else {} hashtags = {token for token in list_of_words_in_tweet if token.startswith('#')} return ({token for token in list_of_words_in_tweet if all([token not in stopwords.words('english'),len(token)>3, not isusername(token),hasvowels(token), not token.startswith('#')])},usernames,hashtags)
def _tokenize(readline, encoding): lnum = parenlev = continued = 0 numchars = '0123456789' contstr, needcont = '', 0 contline = None indents = [0] # 'stashed' and 'async_*' are used for async/await parsing stashed = None async_def = False async_def_indent = 0 async_def_nl = False if encoding is not None: if encoding == "utf-8-sig": # BOM will already have been stripped. encoding = "utf-8" yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') while True: # loop over lines in stream try: line = readline() except StopIteration: line = b'' if encoding is not None: line = line.decode(encoding) lnum += 1 pos, max = 0, len(line) if contstr: # continued string if not line: raise TokenError("EOF in multi-line string", strstart) endmatch = endprog.match(line) if endmatch: pos = end = endmatch.end(0) yield TokenInfo(STRING, contstr + line[:end], strstart, (lnum, end), contline + line) contstr, needcont = '', 0 contline = None elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': yield TokenInfo(ERRORTOKEN, contstr + line, strstart, (lnum, len(line)), contline) contstr = '' contline = None continue else: contstr = contstr + line contline = contline + line continue elif parenlev == 0 and not continued: # new statement if not line: break column = 0 while pos < max: # measure leading whitespace if line[pos] == ' ': column += 1 elif line[pos] == '\t': column = (column // tabsize + 1) * tabsize elif line[pos] == '\f': column = 0 else: break pos += 1 if pos == max: break if line[pos] in '#\r\n': # skip comments or blank lines if line[pos] == '#': comment_token = line[pos:].rstrip('\r\n') nl_pos = pos + len(comment_token) yield TokenInfo(COMMENT, comment_token, (lnum, pos), (lnum, pos + len(comment_token)), line) yield TokenInfo(NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line) else: yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:], (lnum, pos), (lnum, len(line)), line) continue if column > indents[-1]: # count indents or dedents indents.append(column) yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line) while column < indents[-1]: if column not in indents: raise IndentationError( "unindent does not match any outer indentation level", ("<tokenize>", lnum, pos, line)) indents = indents[:-1] if async_def and async_def_indent >= indents[-1]: async_def = False async_def_nl = False async_def_indent = 0 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) if async_def and async_def_nl and async_def_indent >= indents[-1]: async_def = False async_def_nl = False async_def_indent = 0 else: # continued statement if not line: raise TokenError("EOF in multi-line statement", (lnum, 0)) continued = 0 while pos < max: pseudomatch = _compile(PseudoToken).match(line, pos) if pseudomatch: # scan for tokens start, end = pseudomatch.span(1) spos, epos, pos = (lnum, start), (lnum, end), end if start == end: continue token, initial = line[start:end], line[start] if token in _redir_check: yield TokenInfo(IOREDIRECT, token, spos, epos, line) elif (initial in numchars or # ordinary number (initial == '.' and token != '.' and token != '...')): yield TokenInfo(NUMBER, token, spos, epos, line) elif initial in '\r\n': if stashed: yield stashed stashed = None if parenlev > 0: yield TokenInfo(NL, token, spos, epos, line) else: yield TokenInfo(NEWLINE, token, spos, epos, line) if async_def: async_def_nl = True elif initial == '#': assert not token.endswith("\n") if stashed: yield stashed stashed = None yield TokenInfo(COMMENT, token, spos, epos, line) # Xonsh-specific Regex Globbing elif re.match(SearchPath, token): yield TokenInfo(SEARCHPATH, token, spos, epos, line) elif token in triple_quoted: endprog = _compile(endpats[token]) endmatch = endprog.match(line, pos) if endmatch: # all on one line pos = endmatch.end(0) token = line[start:pos] yield TokenInfo(STRING, token, spos, (lnum, pos), line) else: strstart = (lnum, start) # multiple lines contstr = line[start:] contline = line break elif initial in single_quoted or \ token[:2] in single_quoted or \ token[:3] in single_quoted: if token[-1] == '\n': # continued string strstart = (lnum, start) endprog = _compile(endpats[initial] or endpats[token[1]] or endpats[token[2]]) contstr, needcont = line[start:], 1 contline = line break else: # ordinary string yield TokenInfo(STRING, token, spos, epos, line) elif token.startswith('$') and token[1:].isidentifier(): yield TokenInfo(DOLLARNAME, token, spos, epos, line) elif initial.isidentifier(): # ordinary name if token in ('async', 'await'): if async_def: yield TokenInfo( ASYNC if token == 'async' else AWAIT, token, spos, epos, line) continue tok = TokenInfo(NAME, token, spos, epos, line) if token == 'async' and not stashed: stashed = tok continue if token == 'def' and (stashed and stashed.type == NAME and stashed.string == 'async'): async_def = True async_def_indent = indents[-1] yield TokenInfo(ASYNC, stashed.string, stashed.start, stashed.end, stashed.line) stashed = None if stashed: yield stashed stashed = None yield tok elif initial == '\\': # continued stmt continued = 1 else: if initial in '([{': parenlev += 1 elif initial in ')]}': parenlev -= 1 elif token in additional_parenlevs: parenlev += 1 if stashed: yield stashed stashed = None yield TokenInfo(OP, token, spos, epos, line) else: yield TokenInfo(ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line) pos += 1 if stashed: yield stashed stashed = None for indent in indents[1:]: # pop remaining indent levels yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
def _tokenize(readline, encoding): lnum = parenlev = continued = 0 numchars = '0123456789' contstr, needcont = '', 0 contline = None indents = [0] # 'stashed' and 'async_*' are used for async/await parsing stashed = None async_def = False async_def_indent = 0 async_def_nl = False if encoding is not None: if encoding == "utf-8-sig": # BOM will already have been stripped. encoding = "utf-8" yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') while True: # loop over lines in stream try: line = readline() except StopIteration: line = b'' if encoding is not None: line = line.decode(encoding) lnum += 1 pos, max = 0, len(line) if contstr: # continued string if not line: raise TokenError("EOF in multi-line string", strstart) endmatch = endprog.match(line) if endmatch: pos = end = endmatch.end(0) yield TokenInfo(STRING, contstr + line[:end], strstart, (lnum, end), contline + line) contstr, needcont = '', 0 contline = None elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': yield TokenInfo(ERRORTOKEN, contstr + line, strstart, (lnum, len(line)), contline) contstr = '' contline = None continue else: contstr = contstr + line contline = contline + line continue elif parenlev == 0 and not continued: # new statement if not line: break column = 0 while pos < max: # measure leading whitespace if line[pos] == ' ': column += 1 elif line[pos] == '\t': column = (column // tabsize + 1) * tabsize elif line[pos] == '\f': column = 0 else: break pos += 1 if pos == max: break if line[pos] in '#\r\n': # skip comments or blank lines if line[pos] == '#': comment_token = line[pos:].rstrip('\r\n') nl_pos = pos + len(comment_token) yield TokenInfo(COMMENT, comment_token, (lnum, pos), (lnum, pos + len(comment_token)), line) yield TokenInfo(NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line) else: yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:], (lnum, pos), (lnum, len(line)), line) continue if column > indents[-1]: # count indents or dedents indents.append(column) yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line) while column < indents[-1]: if column not in indents: raise IndentationError( "unindent does not match any outer indentation level", ("<tokenize>", lnum, pos, line)) indents = indents[:-1] if async_def and async_def_indent >= indents[-1]: async_def = False async_def_nl = False async_def_indent = 0 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) if async_def and async_def_nl and async_def_indent >= indents[-1]: async_def = False async_def_nl = False async_def_indent = 0 else: # continued statement if not line: raise TokenError("EOF in multi-line statement", (lnum, 0)) continued = 0 while pos < max: pseudomatch = _compile(PseudoToken).match(line, pos) if pseudomatch: # scan for tokens start, end = pseudomatch.span(1) spos, epos, pos = (lnum, start), (lnum, end), end if start == end: continue token, initial = line[start:end], line[start] if token in _redir_check: yield TokenInfo(IOREDIRECT, token, spos, epos, line) elif (initial in numchars or # ordinary number (initial == '.' and token != '.' and token != '...')): yield TokenInfo(NUMBER, token, spos, epos, line) elif initial in '\r\n': if stashed: yield stashed stashed = None if parenlev > 0: yield TokenInfo(NL, token, spos, epos, line) else: yield TokenInfo(NEWLINE, token, spos, epos, line) if async_def: async_def_nl = True elif initial == '#': assert not token.endswith("\n") if stashed: yield stashed stashed = None yield TokenInfo(COMMENT, token, spos, epos, line) # Xonsh-specific Regex Globbing elif re.match(SearchPath, token): yield TokenInfo(SEARCHPATH, token, spos, epos, line) elif token in triple_quoted: endprog = _compile(endpats[token]) endmatch = endprog.match(line, pos) if endmatch: # all on one line pos = endmatch.end(0) token = line[start:pos] yield TokenInfo(STRING, token, spos, (lnum, pos), line) else: strstart = (lnum, start) # multiple lines contstr = line[start:] contline = line break elif initial in single_quoted or \ token[:2] in single_quoted or \ token[:3] in single_quoted: if token[-1] == '\n': # continued string strstart = (lnum, start) endprog = _compile(endpats[initial] or endpats[token[1]] or endpats[token[2]]) contstr, needcont = line[start:], 1 contline = line break else: # ordinary string yield TokenInfo(STRING, token, spos, epos, line) elif token.startswith('$') and token[1:].isidentifier(): yield TokenInfo(DOLLARNAME, token, spos, epos, line) elif initial.isidentifier(): # ordinary name if token in ('async', 'await'): if async_def: yield TokenInfo( ASYNC if token == 'async' else AWAIT, token, spos, epos, line) continue tok = TokenInfo(NAME, token, spos, epos, line) if token == 'async' and not stashed: stashed = tok continue if token == 'def' and (stashed and stashed.type == NAME and stashed.string == 'async'): async_def = True async_def_indent = indents[-1] yield TokenInfo(ASYNC, stashed.string, stashed.start, stashed.end, stashed.line) stashed = None if stashed: yield stashed stashed = None yield tok elif token == '\\\n' or token == '\\\r\n': # continued stmt continued = 1 yield TokenInfo(ERRORTOKEN, token, spos, epos, line) elif initial == '\\': # continued stmt # for cases like C:\\path\\to\\file continued = 1 else: if initial in '([{': parenlev += 1 elif initial in ')]}': parenlev -= 1 elif token in additional_parenlevs: parenlev += 1 if stashed: yield stashed stashed = None yield TokenInfo(OP, token, spos, epos, line) else: yield TokenInfo(ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line) pos += 1 if stashed: yield stashed stashed = None for indent in indents[1:]: # pop remaining indent levels yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')