def test_hash_regex(self):
    """Hashing a serialized compiled pattern is deterministic: identical
    patterns produce identical digests, different patterns differ."""
    def fingerprint(pattern_text):
        # Serialize the compiled pattern and take its MD5 digest.
        return md5(datasets.utils.dumps(regex.Regex(pattern_text))).hexdigest()

    foo_digest = fingerprint("foo")
    bar_digest = fingerprint("bar")
    foo_digest_again = fingerprint("foo")
    self.assertEqual(foo_digest, foo_digest_again)
    self.assertNotEqual(foo_digest, bar_digest)
def get_people_winner(tweets, award_names):
    """For each person-category award in *award_names*, scan *tweets* for a
    winner mention and return a dict mapping award name -> best-guess name.

    Only awards listed in regex.Regex().people_award are considered;
    retweets (any tweet whose text contains 'RT') are skipped.
    """
    nlp = spacy.load('en')
    stop_words = set(stopwords.words('english'))  # NOTE(review): built but never read here
    reg = regex.Regex()
    results = {}
    for movie in award_names:
        # print(movie)
        if movie in reg.people_award:
            # if movie != 'best performance by an actor in a supporting role in a motion picture':
            #     continue
            search_term = reg.getRegex(movie)  # award-specific search pattern
            word_size = 2  # NOTE(review): unused local
            # print(search_term)
            result = []
            for tweet in tweets:
                text = tweet['text']
                if 'RT' not in tweet['text']:  # skip retweets
                    # text = text.lower()
                    if re.search(search_term, text):
                        # print(text)
                        winner = extract_award(text)  # candidate winner string
                        if winner:
                            result.append(winner)
                            # print(winner)
            # print(Counter(result).most_common())
            # Pick the most frequent candidate that passes either the
            # NER check (get_names) or the heuristic validate_name check;
            # falls back to "" (or the last candidate) if none pass.
            name = ""
            for res in Counter(result).most_common():
                name = res[0]
                if get_names(name, nlp) or validate_name(name):
                    break
            results[movie] = name
    return results
def read_from_file(self, fp):
    """Populate self.size, the three regex banks, and self.grid from *fp*.

    Expected layout: a size line, then three sections, each consisting of
    one separator line followed by 2*size + 1 pattern lines.
    """
    self.size = int(fp.readline())
    patterns_per_bank = self.size * 2 + 1
    for bank in range(3):
        fp.readline()  # consume the separator line before each bank
        bank_patterns = []
        for _ in range(patterns_per_bank):
            bank_patterns.append(regex.Regex(fp.readline().strip()))
        self.regex[bank] = bank_patterns
    self.grid = Grid(self.size)
def main():
    # Benchmark a custom Regex engine against the stdlib re module on one
    # pattern/input pair: print whether each matches, then how fast each runs.
    # (Python 2 code: print statements.)
    pattern = '(foo(ba)?)*(bar)+'
    test = 'foofoobabarbar'
    r = regex.Regex(pattern)    # custom implementation
    r2 = re.compile(pattern)    # stdlib reference implementation
    print "Custom implementation matches:", "yes" if r.match(test) else "no"
    # re.match anchors only at the start, so additionally require that the
    # match covers the whole string — presumably to mirror the custom
    # engine's full-match semantics (confirm against regex.Regex.match).
    m = r2.match(test)
    match = m is not None and m.group(0) == test
    print "Re module implementation matches:", "yes" if match else "no"
    # The double .format(): the first call turns '{{:.{precision}}}' into
    # '{:.3}', the second formats the measured timing with that spec.
    print "Custom implementation runs in {{:.{precision}}} microseconds".format(
        precision=3).format(benchmark_micros(lambda: r.match(test)))
    print "Re module implementation runs in {{:.{precision}}} microseconds".format(
        precision=3).format(benchmark_micros(lambda: r2.match(test)))
def get_allAwards(tweets, year):
    """Mine candidate award names out of raw tweets.

    Scans non-retweet tweets for award-announcement phrasing, extracts a
    candidate award string from each, drops obvious junk, then runs the
    ranked candidates through the merge/filter/sanitize pipeline and
    returns at most 26 award names.  (*year* is currently unused.)
    """
    reg = regex.Regex()  # kept for parity with the original (side effects, if any)
    cue = 'best'
    verb_cues = ['w[io]n', 'go(es)? to', 'went to', ':']
    candidates = []
    for tweet in tweets:
        raw = tweet['text']
        if 'RT' in raw:  # skip retweets
            continue
        text = raw.lower()
        if not re.search(cue, text):
            continue
        if not any(re.search(v, text) for v in verb_cues):
            continue
        extracted = extract_award(text)
        # Filter strings that are clearly not award names.
        if re.search('usa', extracted) or re.search('globe', extracted):
            continue
        if len(extracted.split()) < 4:
            continue
        if Counter(extracted)['-'] > 1:  # more than one hyphen
            continue
        candidates.append(extracted)
    ranked = Counter(candidates).most_common()
    ranked = merge_awards(ranked[:200])
    ranked = sorted(ranked, key=lambda pair: pair[1], reverse=True)
    ranked = filter_awards(ranked)
    ranked = sanitize(ranked[:100])
    ranked = final_sanitize(ranked)
    # Keep the top 26 names only.
    return [entry[0] for entry in ranked[:26]]
obj.co_lnotab,
        obj.co_freevars,
        obj.co_cellvars,
    )
    # Tail of a code-object pickling hook (its definition starts before this
    # excerpt): reduce the code object to CodeType(*args) and log it.
    pickler.save_reduce(CodeType, args, obj=obj)
    dill._dill.log.info("# Co")
    return


def copyfunc(func):
    # Shallow-copy a function: a new FunctionType sharing the original's
    # code, globals, name, defaults and closure.
    return types.FunctionType(func.__code__, func.__globals__, func.__name__,
                              func.__defaults__, func.__closure__)


try:
    import regex

    # Teach the pickler how to serialize compiled `regex` patterns by
    # reducing them to regex.compile(pattern, flags).
    @pklregister(type(regex.Regex("", 0)))
    def _save_regex(pickler, obj):
        dill._dill.log.info("Re: %s" % obj)
        args = (
            obj.pattern,
            obj.flags,
        )
        pickler.save_reduce(regex.compile, args, obj=obj)
        dill._dill.log.info("# Re")
        return
except ImportError:
    # The third-party `regex` package is optional; skip the hook without it.
    pass
def get_regex_from_pattern(self, patterns, patternid):
    """Resolve pattern *patternid* into a regex, expanding <reference>
    placeholders with other named ("helper_*") patterns; results are cached
    in self.regexs.

    NOTE(review): this excerpt is truncated — the function continues past
    the end of this chunk.
    """
    ret = None
    pattern = self.get_pattern_from_id(patternid)
    if pattern:
        ret = self.regexs.get(patternid, None)
        if ret is None:
            # Not cached yet: build it from the raw pattern text.
            if 'error' in pattern:
                del pattern['error']
            ret = pattern['pattern']
            if ret:
                # eg. iii hidas et ii carrucas
                # iiii hidis et i uirgata
                # u hidis
                # pro dimidia hida Hanc
                # Ibi habet abbas ii hidas et dimidiam in dominio et ii carrucas et uillani dimidiam hidam
                # hides:different units hid*: hida, uirgat*, ferdi*/ferlin*
                # ? 47b1: et ui agris
                # 41a2: iiii hidis et uirga et dimidia
                #
                # c bordarios x minus
                # iiii libras et iii solidos i denarium minus
                #
                measurement = self.get_pattern_from_key(
                    'helper_measurement')

                def replace_reference(match):
                    # Substitute one <ref> occurrence with the referenced
                    # helper pattern's regex text; records an error on the
                    # pattern when the reference cannot be resolved.
                    rep = match.group(0)
                    ref = match.group(1)
                    if ref == 'PATTERN':
                        return rep
                    # TODO: try other than helper_, not now as it might have
                    # accidental match
                    ref_pattern = self.get_pattern_from_key('helper_' + ref)
                    if ref_pattern:
                        rep = ref_pattern['pattern']
                    else:
                        if measurement:
                            # try singular helper
                            if ref.endswith('s'):
                                ref_pattern = self.get_pattern_from_key(
                                    ('helper_%s' % ref)[0:-1])
                                if ref_pattern:
                                    # let's apply measurement to this
                                    rep = ref_pattern['pattern']
                                    rep = measurement['pattern'].replace(
                                        '<PATTERN>', rep)
                        if not ref_pattern:
                            pattern[
                                'error'] = 'Reference to an unknown pattern: <%s>. Check the spelling.' % ref
                    return rep

                # Expand references until a fixed point is reached; bail out
                # after 100 rounds to guard against circular references.
                i = 0
                while 'error' not in pattern:
                    i += 1
                    before = ret
                    ret = re.sub(ur'<([^>]+)>', replace_reference, ret)
                    if i > 100:
                        pattern[
                            'error'] = 'Detected circular references in the pattern. E.g. p1 = <p2>; p2 = <p1>.'
                        break
                    if ret == before:
                        break

                # LOW LEVEL SYNTACTIC SUGAR
                if not 'error' in pattern:
                    # e.g. x (<number>)?
# y   (continuation of a comment split by the excerpt boundary: "x (<number>)? y")
# NOTE(review): this chunk is a mid-function fragment of
# get_regex_from_pattern; indentation below is reconstructed.
            while True:
                ret2 = ret
                # Fold the surrounding space into standalone optional groups:
                # " (...)? " -> "( (...))?" so the space is optional too.
                # Repeat until no more substitutions occur.
                ret = re.sub(ur'( |^)(\([^)]+\))\?( |$)', ur'(\1\2)?\3', ret2)
                if ret == ret2:
                    break
            # <person> habet <number> mansionem
            ret = ret.replace(ur'%', ur'\w*')
            # aliam = another
            # unam = one
            # dimidia = half
            # duabus = two
            ret = ret.replace(ur'7', ur'et')
            # Anchor the pattern on word boundaries unless already anchored.
            if ret[0] not in [ur'\b', '^']:
                ret = ur'\b' + ret
            if not ret.endswith(ur'\b'):
                ret = ret + ur'\b'
            try:
                # NOTE(review): the stdlib re module has no Regex attribute —
                # re.compile (or a regex-package alias) was likely intended;
                # confirm which module `re` is bound to here.
                ret = re.Regex(ret)
            except Exception, e:
                pattern['error'] = unicode(e)
            finally:
# NOTE(review): overlapping excerpt — duplicates the tail shown in the
# previous chunk and completes it; indentation below is reconstructed.
            if not 'error' in pattern:
                # e.g. x (<number>)? y
                while True:
                    ret2 = ret
                    # Fold surrounding spaces into standalone optional groups.
                    ret = re.sub(ur'( |^)(\([^)]+\))\?( |$)', ur'(\1\2)?\3', ret2)
                    if ret == ret2:
                        break
                # <person> habet <number> mansionem
                ret = ret.replace(ur'%', ur'\w*')
                # aliam = another
                # unam = one
                # dimidia = half
                # duabus = two
                ret = ret.replace(ur'7', ur'et')
                # Anchor on word boundaries unless already anchored.
                if ret[0] not in [ur'\b', '^']:
                    ret = ur'\b' + ret
                if not ret.endswith(ur'\b'):
                    ret = ret + ur'\b'
                try:
                    # NOTE(review): re.Regex does not exist in the stdlib re
                    # module — re.compile was probably intended; confirm.
                    ret = re.Regex(ret)
                except Exception, e:
                    pattern['error'] = unicode(e)
                finally:
                    # Cache whatever we ended up with (compiled regex, or the
                    # partially-expanded string when compilation failed).
                    self.regexs[patternid] = ret
        # A recorded error invalidates the result: return a regex that can
        # never match real input.
        if 'error' in pattern:
            ret = re.Regex('INVALID PATTERN')
    return ret
def get_presenters(tweets):
    """Guess up to three presenters per award from tweet text.

    Buckets non-retweet tweets that pair a 'present' cue with an award
    pattern, then extracts consecutive proper-noun pairs (via POS tagging)
    as candidate names.  Returns a dict award-name -> list of presenter
    names; awards with no evidence get the placeholder ["a", "e"].
    """
    search_terms = [r'[Pp]resent']
    stop_terms = [r'[Rr]epresent']  # so 'represent' doesn't count as 'present'
    award_dict = reg = regex.Regex().award_dict
    gg_stop_words = ['Globe', 'RT', 'http', 'Golden', 'Globes', 'GoldenGlobes', 'Goldenglobes', 'Goldenglobe', 'gg','golden globes', 'golden globe', 'goldenglobe','goldenglobes','gg2015','gg15','goldenglobe2015','goldenglobe15','goldenglobes2015','goldenglobes15', 'gg2013','gg13','goldenglobe2013','goldenglobe13','goldenglobes2013','goldenglobes13', 'rt', '2013', '2015']
    awards = list(award_dict.values())
    # Drop retweets.
    clean_data = []
    for x in tweets:
        if 'RT' not in x['text']:
            clean_data.append(x)
    # Bucket each tweet under every award it mentions alongside a
    # 'present' cue (and no 'represent').
    award_results = {}
    for x in clean_data:
        tweet = x["text"]
        for award_regex in awards:
            award = get_key(award_dict, award_regex)
            for search_term in search_terms:
                if re.search(search_term, tweet) and not re.search(stop_terms[0], tweet) and re.search(award_regex, tweet):
                    if award_results.get(award):
                        award_results[award].append(x['text'])
                    else:
                        award_results[award] = [x['text']]
                    break
    # Rebound with extra stop words for the name-extraction phase.
    gg_stop_words = ['Globe', 'RT', 'http', 'Golden', 'Globes', 'GoldenGlobes', 'Goldenglobes', 'Goldenglobe', 'gg','golden globes', 'golden globe', 'goldenglobe','goldenglobes','gg2015','gg15','goldenglobe2015','goldenglobe15','goldenglobes2015','goldenglobes15', 'gg2013','gg13','goldenglobe2013','goldenglobe13','goldenglobes2013','goldenglobes13', 'rt', '2013', '2015', 'Best', 'BEST', 'Present', 'Presents', 'Angeles']
    final = {}
    proper = []
    for award in award_results.keys():
        final[award] = []
        proper_bi = []
        for tweet in award_results[award]:
            bigrams = list(nltk.bigrams(nltk.word_tokenize(tweet)))
            text = nltk.word_tokenize(tweet)
            tagged_text = nltk.pos_tag(text)
            '''
            for single tokens:
            for token in tagged_text:
                if token[1] == "NNP" and token[0] not in gg_stop_words:
                    #print(token[0])
                    proper.append(token[0])
            '''
            # tag double words with pos and pull out the two-proper-nouns-in-a-row
            for bigram in bigrams:
                tagged_text = nltk.pos_tag(bigram)
                if tagged_text[0][1] == "NNP" and tagged_text[0][0] not in gg_stop_words and tagged_text[1][1] == "NNP" and tagged_text[1][0] not in gg_stop_words:
                    proper_bi.append((tagged_text[0][0], tagged_text[1][0]))
        most_common = Counter(proper_bi).most_common()
        presenter_count = 3  # maximum of 3 presenters
        i = 0
        while len(most_common) > 1 and i < len(most_common)-1:
            # combine any 3-name sets among the most common, eg "Sacha Baron Cohen". >3 names is not accounted for.
            if(most_common[i][0][1] == most_common[i+1][0][0]):
                if presenter_count > 0:
                    final[award].append(most_common[i][0][0] + ' ' + most_common[i][0][1] + ' ' + most_common[i+1][0][1])
                    del most_common[i]
                    del most_common[i]
                    presenter_count = presenter_count - 1
            else:
                # NOTE(review): indentation reconstructed — this else is
                # paired with the merge test above, so i only advances when
                # no 3-name merge happened at position i.
                i = i + 1
        # fill in the rest of the most common up to the top 3 (seems like max # of presenters is 3)
        while presenter_count > 0 and len(most_common) != 0:
            presenter_count = presenter_count - 1
            final[award].append(most_common[0][0][0] + ' ' + most_common[0][0][1])
            del most_common[0]
    # Placeholder for awards that collected no candidates.
    for award in award_dict.keys():
        if not award in final:
            final[award] = ["a", "e"]
    # print(final)
    return(final)
import re
import sys
import utils
import regex

# Script: fold every line of the input file (argv[1]) into one merged
# regex-structure map and print it.  Optional argv[2] sets the engine's
# util.parameters.
print("REGEX")
rgx = regex.Regex()
rgx.maxLength = 36
fileName = sys.argv[1]
#print(sys.argv[1])
if (len(sys.argv) == 3):
    rgx.util.parameters = sys.argv[2]
    #print(sys.argv[2])
#rgx.util.parameters = ''
# One entry per input line, trailing newline stripped.
lineList = [line.rstrip('\n') for line in open(fileName)]
#print('INPUT')
for i in range(len(lineList)):
    inputMap = rgx.regexStructure(lineList[i])
    if i == 0:
        # First line seeds the accumulator; later lines merge into it.
        mergeMap = inputMap
    else:
        mergeMap = rgx.merge(inputMap, mergeMap)
print(mergeMap)


def matchRegex():
    # NOTE(review): definition truncated — its body continues past the end
    # of this excerpt.
def get_film_winner(tweets,award_names): reg = regex.Regex() # print(reg.film_award) results={} for movie in award_names: # print(movie) if movie in reg.film_award: # if movie != 'best original song - motion picture': # continue search_term = reg.getRegex(movie) word_size = 2 # print(search_term) result = [] for tweet in tweets: text = tweet['text'] if 'RT' not in tweet['text']: if re.search(search_term, text): # print(text) A = re.findall(r'“(.*?)”', text) B = re.findall(r'"(.*?)"', text) if len(A)!=0: for sentence in A: if len(sentence.split())<10: result.append(sentence.lower()) if len(B)!=0: for sentence in B: if len(sentence.split())<10: result.append(sentence.lower()) if len(result)==0: for tweet in tweets: text = tweet['text'] if 'RT' not in tweet['text']: if re.search(search_term, text): C = re.findall(r'-(.*?)-', text) if len(C)!=0: # print(text) for sentence in C: if len(sentence.split())<5: result.append(sentence.strip()) # print(Counter(result)) name = "" for res in Counter(result).most_common(): name = res[0] if validate_film(name): name = res[0] break # name = Counter(result).most_common()[0][0] results[movie] = name return results # for key in results.keys(): # print(key,results[key]) # Counter(result).most_common() # tweets = json.load(open("gg2013.json")) # winner1 = get_film_winner(tweets) # winner2 = get_people_winner(tweets) # winner1.update(winner2) # print(len(winner1))
# if __name__ == '__main__': # mytable = ( # ('Joe', 'Clark', '1989'), # ('Charlie', 'Babbitt', '1988'), # ('Frank', 'Abagnale', '2002'), # ('Bill', 'Clark', '2009'), # ('Alan', 'Clark', '1804'), # ) # for row in sort_table(mytable, (1,0)): # print(row) """ END of SORTER ----------------------------------------------------------------------------- """ psq_re_f = regex.Regex(args.regex,regex.VERBOSE|regex.MULTILINE) psq_re_r = regex.Regex(regexrev,regex.VERBOSE|regex.MULTILINE) ref_seq_fh = open(args.fasta) ref_seq = [] line = (ref_seq_fh.readline()).strip() chr = re.sub('^>', '', line) line = (ref_seq_fh.readline()).strip() gquad_list = [] while True: while line.startswith('>') is False: ref_seq.append(line) line = (ref_seq_fh.readline()).strip() if line == '': break