import re
from urllib.parse import urlparse  # Python 2: from urlparse import urlparse

# segment() is assumed to come from the wordsegment package (or the
# equivalent ngrams-based segmenter used elsewhere in this repo).
from wordsegment import segment


def get_url_keywords(url):
    """Extract dictionary words hidden in a URL's domain and path."""
    az_re = re.compile('[a-zA-Z]')  # '{1}' was redundant on a one-char class
    # Drop the TLD and concatenate the remaining domain labels.
    domain_text = ''.join(urlparse(url).netloc.split('.')[:-1])
    # Join the path components, then keep only the alphabetic characters
    # (the separating spaces are discarded again by the letter filter).
    uri_str = ' '.join(urlparse(url).path.split('/'))
    uri_text = ''.join(az_re.findall(uri_str))
    keywords = segment('{}{}'.format(domain_text, uri_text))
    return keywords
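# A minimal usage sketch, assuming the wordsegment package (recent versions
# require load() to read the unigram data before the first segment() call);
# the URL and the expected output are illustrative only:
if __name__ == '__main__':
    from wordsegment import load
    load()
    print(get_url_keywords('http://choosespain.com/speedofart'))
    # likely output: ['choose', 'spain', 'speed', 'of', 'art']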
def find_best(text, ht, checker):
    """Pick the best segmentation of text, ranking candidates against ht."""
    if len(text) >= 40:
        # Long inputs: fall back to the cheaper greedy matcher.
        return ngrams.most_match(text, checker)
    candidates = ngrams.segment(text, checker)
    # Reset the memoization cache between calls so results computed for one
    # input/checker do not leak into the next. Deleting keys while iterating
    # over the dict raises RuntimeError on Python 3; clear() is safe on both.
    ngrams.segment.memo.clear()
    if len(candidates) == 1:
        return candidates[0]
    # rate_cmp builds the scoring key used to rank the remaining candidates.
    return max(candidates, key=rate_cmp(text, ht))
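# Why the cache reset above matters: ngrams.segment memoizes on a function
# attribute keyed by the text alone, so results scored with one checker would
# otherwise be reused for a different one. A minimal sketch of that pattern
# (memo_segment is a hypothetical stand-in, not the repo's API; checker is
# assumed to return a numeric word score, e.g. a log-probability):
def memo_segment(text, checker):
    if not text:
        return []
    if text not in memo_segment.memo:
        # Try every first-word split and keep the best-scoring segmentation.
        candidates = ([text[:i]] + memo_segment(text[i:], checker)
                      for i in range(1, len(text) + 1))
        memo_segment.memo[text] = max(
            candidates, key=lambda words: sum(checker(w) for w in words))
    return memo_segment.memo[text]
memo_segment.memo = {}
# As in find_best, call memo_segment.memo.clear() before switching checkers.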
if domaintested == list_size:
    list_size = len(domains_d)
    to_write += ("iteration " + str(passe) + ", " + str(tot) +
                 " domains tested, " + str(list_size - domaintested) +
                 " domains discovered\n")
    tot = 0
    passe += 1
    if passe > options.vertical:
        del domains_d[:]
domaintested += 1
if options.increment or options.splitter:
    decoupe = segment(prefixe)  # split the prefix into words where possible
    if len(decoupe) > 1 and len(decoupe) < 4:
        # The prefix splits into two or three parts; for each part, check
        # whether it is a number or a word and collect candidates per slot.
        i = -1
        allwords = {}
        for part in decoupe:
            i = i + 1
            allwords[i] = []
            allwords[i].append(part)
        disco = ""
        to_test = []
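# A minimal sketch of the splitter idea above (candidate_prefixes is a
# hypothetical helper, not this tool's real code): once a prefix splits into
# word slots, each slot can be varied independently and the combinations
# recombined into new candidate subdomain prefixes to test.
from itertools import product

def candidate_prefixes(allwords):
    """allwords maps slot index -> list of candidate words for that slot."""
    slots = [allwords[i] for i in sorted(allwords)]
    return [''.join(combo) for combo in product(*slots)]

# e.g. allwords = {0: ['web', 'dev'], 1: ['mail']} yields
# ['webmail', 'devmail'].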
from wordsegment import segment  # assumed import; the test file's header is not shown


def test_segment():
    assert segment('choosespain') == ['choose', 'spain']
    assert segment('thisisatest') == ['this', 'is', 'a', 'test']
    assert segment('wheninthecourseofhumaneventsitbecomesnecessary') == [
        'when', 'in', 'the', 'course', 'of', 'human', 'events', 'it',
        'becomes', 'necessary'
    ]
    assert segment('whorepresents') == ['who', 'represents']
    assert segment('expertsexchange') == ['experts', 'exchange']
    assert segment('speedofart') == ['speed', 'of', 'art']
    assert segment('nowisthetimeforallgood') == [
        'now', 'is', 'the', 'time', 'for', 'all', 'good'
    ]
    assert segment('itisatruthuniversallyacknowledged') == [
        'it', 'is', 'a', 'truth', 'universally', 'acknowledged'
    ]
    assert segment(
        'itwasabrightcolddayinaprilandtheclockswerestrikingthirteen') == [
            'it', 'was', 'a', 'bright', 'cold', 'day', 'in', 'april', 'and',
            'the', 'clocks', 'were', 'striking', 'thirteen'
        ]
    assert segment('itwasthebestoftimesitwastheworstoftimes'
                   'itwastheageofwisdomitwastheageoffoolishness') == [
        'it', 'was', 'the', 'best', 'of', 'times', 'it', 'was', 'the',
        'worst', 'of', 'times', 'it', 'was', 'the', 'age', 'of', 'wisdom',
        'it', 'was', 'the', 'age', 'of', 'foolishness'
    ]
    assert segment('asgregorsamsaawokeonemorningfromuneasydreamshefound'
                   'himselftransformedinhisbedintoagiganticinsect') == [
        'as', 'gregor', 'samsa', 'awoke', 'one', 'morning', 'from', 'uneasy',
        'dreams', 'he', 'found', 'himself', 'transformed', 'in', 'his',
        'bed', 'into', 'a', 'gigantic', 'insect'
    ]
    assert segment('inaholeinthegroundtherelivedahobbitnotanastydirtywet'
                   'holefilledwiththeendsofwormsandanoozysmellnoryetadry'
                   'baresandyholewithnothinginittositdownonortoeatitwasa'
                   'hobbitholeandthatmeanscomfort') == [
        'in', 'a', 'hole', 'in', 'the', 'ground', 'there', 'lived', 'a',
        'hobbit', 'not', 'a', 'nasty', 'dirty', 'wet', 'hole', 'filled',
        'with', 'the', 'ends', 'of', 'worms', 'and', 'an', 'oozy', 'smell',
        'nor', 'yet', 'a', 'dry', 'bare', 'sandy', 'hole', 'with', 'nothing',
        'in', 'it', 'to', 'sitdown', 'on', 'or', 'to', 'eat', 'it', 'was',
        'a', 'hobbit', 'hole', 'and', 'that', 'means', 'comfort'
    ]
    assert segment('faroutintheunchartedbackwatersoftheunfashionablee'
                   'ndofthewesternspiralarmofthegalaxyliesasmallunreg'
                   'ardedyellowsun') == [
        'far', 'out', 'in', 'the', 'uncharted', 'backwaters', 'of', 'the',
        'unfashionable', 'end', 'of', 'the', 'western', 'spiral', 'arm',
        'of', 'the', 'galaxy', 'lies', 'a', 'small', 'un', 'regarded',
        'yellow', 'sun'
    ]
# encoding: utf-8
import sys
import codecs

import chinese
import ngrams
import uniout

Pw = ngrams.Pdist(ngrams.datafile('ciku.txt'),
                  missingfn=ngrams.avoid_long_words)

with codecs.open(sys.argv[1], 'r', 'utf-8') as fin:
    with codecs.open(sys.argv[2], 'w', 'utf-8') as fout:
        for line in fin:
            start = -1
            for i in range(len(line)):
                if chinese.is_chinese(line[i]):
                    if start == -1:
                        start = i
                else:
                    if start != -1:
                        # A non-Chinese character ends the current run of
                        # consecutive Chinese characters. Because the line
                        # is never trimmed it always ends with '\n', so the
                        # last character of a line can never be Chinese.
                        text = line[start:i]
                        seg = ngrams.segment(text, Pw)
                        # Write the segmented run padded with a space on
                        # each side.
                        fout.write(u' ' + u' '.join(seg) + u' ')
                        start = -1
                    fout.write(line[i])
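# Usage: python <this_script>.py input.txt output.txt
#
# A minimal sketch of the is_chinese() helper assumed above (the repo's
# `chinese` module is not shown here; this version checks only the CJK
# Unified Ideographs block, U+4E00..U+9FFF):
def is_chinese(ch):
    return u'\u4e00' <= ch <= u'\u9fff'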