Example #1
    if tweet['id'] in tok_cache:
        #print "CACHE HIT    %s" % tweet['text']
        toks = pickle.loads(tok_cache[tweet['id']])
    else:
        #print "NEW ANALYSIS %s" % tweet['text']
        toks = tokenize_and_clean(tweet['text'], alignments=True)
        tok_cache[tweet['id']] = pickle.dumps(toks)
    tweet['toks'] = toks


import re
import pickle

import twokenize
from twokenize import regex_or
mycompile = lambda pat: re.compile(pat, re.UNICODE)
# junk tokens are a more aggressive cleaning assumption than usual.
JunkTok = mycompile(r'''^[^a-zA-Z0-9_@]+$''')
# don't form n-grams across phrase-boundary markers.
PhraseBoundaryTok = regex_or(r'''[.,“"'?!:;|-]+''', twokenize.Entity)
PhraseBoundaryTok = mycompile('^' + PhraseBoundaryTok + '$')
EdgePunctTok = mycompile('^' + twokenize.EdgePunct + '+$')


def tokenize_and_clean(msg, alignments):
    if alignments:
        toks = twokenize.tokenize(msg)
    else:
        toks = twokenize.simple_tokenize(msg)
    for i in range(len(toks)):
        toks[i] = toks[i].lower()
    inds = range(len(toks))
    #if len(inds) < len(toks): print "dropping junk", sorted(list(toks[i] for i in (set(range(len(toks)))-set(inds))))
    if alignments:
        return toks.subset(inds)
    else:
        return [toks[i] for i in inds]
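A minimal usage sketch under the snippet's own assumptions (the sample tweet text is invented; alignments=False is used so the result is a plain lowercased token list rather than the alignment-carrying object that twokenize.tokenize returns):

sample = "@user RT this!! so good http://example.com :) #nlp"
toks = tokenize_and_clean(sample, alignments=False)
print(toks)

# JunkTok and PhraseBoundaryTok are per-token tests, e.g. for deciding which
# tokens may participate in n-grams:
for t in toks:
    if PhraseBoundaryTok.match(t):
        print("phrase boundary: %r" % t)
    elif JunkTok.match(t):
        print("junk:            %r" % t)
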
Example #2
  # Fragment of a larger function: vocab, word_counts, doc_counts and replacements
  # come from the enclosing scope (defaultdict is collections.defaultdict).
  new_vocab = set()
  new_word_counts = defaultdict(int)
  new_doc_counts = defaultdict(int)
  
  for w in vocab:
    r = replacements.get(w,w)
    new_vocab.add(r)
    new_word_counts[r] += word_counts[w]
    new_doc_counts[r] += doc_counts[w]
  return new_vocab, new_word_counts, new_doc_counts, replacements
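For intuition, a self-contained rerun of the same merge with made-up data (the fragment above presumably sits inside a function that receives vocab, word_counts, doc_counts and replacements):

from collections import defaultdict

vocab = set(['u', 'you', 'lol'])
word_counts = {'u': 5, 'you': 3, 'lol': 7}
doc_counts = {'u': 4, 'you': 2, 'lol': 6}
replacements = {'u': 'you'}   # fold the variant 'u' into 'you'

new_vocab = set()
new_word_counts = defaultdict(int)
new_doc_counts = defaultdict(int)
for w in vocab:
    r = replacements.get(w, w)
    new_vocab.add(r)
    new_word_counts[r] += word_counts[w]
    new_doc_counts[r] += doc_counts[w]

print(sorted(new_vocab))        # ['lol', 'you']
print(new_word_counts['you'])   # 8 = 5 + 3, the counts of 'u' and 'you' merged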
  
  

import re

import twokenize
import emoticons   # companion module providing Emoticon_RE, used below

cur_user_words = []
last_username = None

Punct = twokenize.regex_or(twokenize.PunctChars, twokenize.Entity, twokenize.EdgePunct, r'[\*]')
Punct_RE = re.compile('^(%s)+$' % Punct, re.I|re.U)

def get_tokens(text):
  toks = twokenize.tokenize(text.lower())
  toks = [t for t in toks if not t.startswith('@') and t != 'rt' ]
  toks = ["-PUNCT-" if Punct_RE.search(t) and not emoticons.Emoticon_RE.search(t) else t for t in toks]
  # toks = [t.replace("#","") for t in toks]
  return toks

# for line in sys.stdin:
#   # print get_tokens(line)
#   print (" ".join(get_tokens(line))).encode('utf-8')
#   # print "\n" + "\n".join(get_tokens(line))
# sys.exit(0)
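A rough sketch of calling get_tokens directly (the tweet below is made up; exact token splits depend on twokenize):

example = u"RT @user: This is GREAT !!! :-) #winning"
print(u" ".join(get_tokens(example)))
# 'rt' and the @-mention are dropped, '!!!' and ':' collapse to -PUNCT-,
# and ':-)' survives because emoticons.Emoticon_RE matches it
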
Example #3
import re

import twokenize as tw   # the code below refers to the twokenize module as `tw`

# define our own slightly more limited version of the emoticon regex
# adapted from twokenize.py

# things we need to deal with separately
our_bfLeft = u"(0|[oO]|[vV]|\\$|[tT]|[xX]|;|@|\\^|\\*)".encode('utf-8')
our_basicface = "(?:" + our_bfLeft + tw.bfCenter + ")|" + tw.s3 + "|" + tw.s4 + "|" + tw.s5

pattern = tw.regex_or(
    # myleott: Standard version  :) :( :] :D :P
    "(?:>|)?" + tw.regex_or(tw.normalEyes, tw.wink) + tw.regex_or(tw.noseArea,"[Oo]") + tw.regex_or(tw.tongue+r"(?=\W|$|RT|rt|Rt)", tw.otherMouths+r"(?=\W|$|RT|rt|Rt)", tw.sadMouths, tw.happyMouths),

    # myleott: reversed version (: D:  use positive lookbehind to remove "(word):"
    # myleott: because eyes on the right side is more ambiguous with the standard usage of : ;
    tw.regex_or("(?<=(?: ))", "(?<=(?:^))") + tw.regex_or(tw.sadMouths,tw.happyMouths,tw.otherMouths) + tw.noseArea + tw.regex_or(tw.normalEyes, tw.wink) + "(?:<|)?",

    our_basicface,

    # myleott: o.O and O.o are two of the biggest sources of differences
    #          between this and the Java version. One little hack won't hurt...
    tw.oOEmote
)


pattern = unicode(pattern).decode('utf-8')
reg = re.compile(pattern, re.UNICODE)
# print reg

def get_emoticon_count(text):
    """
    Get an approximate number of emoticons contained in the input text.
    """
    # count non-overlapping matches of the emoticon pattern
    return len(reg.findall(text))
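
A quick check of the counter on an invented string:

if __name__ == '__main__':
    print(get_emoticon_count(u"great game :) :D but the ref... o.O"))
    # :) and :D match the standard branch and o.O matches tw.oOEmote,
    # so this should report 3 under the pattern above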