Exemplo n.º 1
0
def get_url_keywords(url):
    """Split the text of a URL's host and path into keywords.

    The network location contributes every dot-separated label except the
    last one, concatenated without separators. The path contributes only
    its ASCII letters. The two strings are joined and handed to
    ``segment``, whose result is returned unchanged.
    """
    parsed = urlparse(url)

    # Everything before the final dot of the network location.
    host_labels = parsed.netloc.split('.')
    host_text = ''.join(host_labels[:-1])

    # Slashes (and anything else that is not a-z / A-Z) are discarded, so
    # the path can be scanned directly.
    letter_re = re.compile('[a-zA-Z]{1}')
    path_text = ''.join(letter_re.findall(parsed.path))

    return segment('{}{}'.format(host_text, path_text))
Exemplo n.º 2
0
def get_url_keywords(url):
    """Return the result of ``segment`` on the letters of a URL.

    Input to ``segment`` is built from two pieces, in this order:
    the network location with its last dot-separated label removed and the
    remaining labels glued together, then the ASCII letters of the path.
    """
    pieces = urlparse(url)
    netloc_labels = pieces.netloc.split('.')
    del netloc_labels[-1:]  # drop the final label (no-op on an empty list)

    # Path separators become spaces, then only ASCII letters are kept.
    spaced_path = pieces.path.replace('/', ' ')
    path_letters = re.findall('[a-zA-Z]{1}', spaced_path)

    combined = '{}{}'.format(''.join(netloc_labels), ''.join(path_letters))
    return segment(combined)
def find_best(text, ht, checker):
    """Pick the preferred segmentation of ``text``.

    Texts of 40 or more characters are delegated to ``ngrams.most_match``.
    Shorter texts are segmented with ``ngrams.segment``; when that yields
    several candidates, the one ranked highest by the key function
    ``rate_cmp(text, ht)`` is returned.

    Args:
        text: The string to segment.
        ht: Passed through to ``rate_cmp`` together with ``text``.
        checker: Passed through to ``ngrams.most_match`` / ``ngrams.segment``.

    Returns:
        The result of ``ngrams.most_match`` for long texts, otherwise one
        element of the candidate sequence produced by ``ngrams.segment``.

    Raises:
        ValueError: If ``ngrams.segment`` returns an empty sequence
            (propagated from ``max``).
    """
    if len(text) >= 40:
        return ngrams.most_match(text, checker)

    candidates = ngrams.segment(text, checker)

    # Empty the memo table after every call. Deleting keys while iterating
    # ``memo.keys()`` raises RuntimeError on Python 3 (the view is live);
    # ``clear()`` empties the mapping in one step on both Python 2 and 3.
    # NOTE(review): assumes ``memo`` is a dict-like mapping -- confirm.
    ngrams.segment.memo.clear()

    if len(candidates) == 1:
        # A single candidate needs no ranking.
        return candidates[0]
    return max(candidates, key=rate_cmp(text, ht))
Exemplo n.º 4
0
        if domaintested == list_size:

            list_size = len(domains_d)
            to_write += "iteration " + str(passe) + ", " + str(tot) + " domains tested, " + str(list_size - domaintested) + " domains discovered\n"
            tot = 0
            passe += 1

            if passe > options.vertical:
                del(domains_d[:])

        domaintested += 1


     if options.increment or options.splitter :

        decoupe = segment(prefixe)    #split the prefix in different parts if possible
    

        if len(decoupe) > 1 and len(decoupe) < 4:        #  if we obtain different part in one suffixe

            i = -1
            allwords = {}

            for part in decoupe:    #for each parts we try to find if it's a number or a word

                i = i + 1
                allwords[i] = []
                allwords[i].append(part)

                disco = ""
                to_test = []
Exemplo n.º 5
0
def test_segment():
    """Check ``segment`` against known splits of unspaced English text."""
    # Each entry pairs an unspaced input with the exact token list expected.
    cases = [
        ('choosespain', ['choose', 'spain']),
        ('thisisatest', ['this', 'is', 'a', 'test']),
        (
            'wheninthecourseofhumaneventsitbecomesnecessary',
            [
                'when', 'in', 'the', 'course', 'of', 'human', 'events', 'it',
                'becomes', 'necessary',
            ],
        ),
        ('whorepresents', ['who', 'represents']),
        ('expertsexchange', ['experts', 'exchange']),
        ('speedofart', ['speed', 'of', 'art']),
        (
            'nowisthetimeforallgood',
            ['now', 'is', 'the', 'time', 'for', 'all', 'good'],
        ),
        (
            'itisatruthuniversallyacknowledged',
            ['it', 'is', 'a', 'truth', 'universally', 'acknowledged'],
        ),
        (
            'itwasabrightcolddayinaprilandtheclockswerestrikingthirteen',
            [
                'it', 'was', 'a', 'bright', 'cold', 'day', 'in', 'april',
                'and', 'the', 'clocks', 'were', 'striking', 'thirteen',
            ],
        ),
        (
            'itwasthebestoftimesitwastheworstoftimes'
            'itwastheageofwisdomitwastheageoffoolishness',
            [
                'it', 'was', 'the', 'best', 'of', 'times', 'it', 'was',
                'the', 'worst', 'of', 'times', 'it', 'was', 'the',
                'age', 'of', 'wisdom', 'it', 'was', 'the', 'age', 'of',
                'foolishness',
            ],
        ),
        (
            'asgregorsamsaawokeonemorningfromuneasydreamshefound'
            'himselftransformedinhisbedintoagiganticinsect',
            [
                'as', 'gregor', 'samsa', 'awoke', 'one', 'morning',
                'from', 'uneasy', 'dreams', 'he', 'found', 'himself',
                'transformed', 'in', 'his', 'bed', 'into', 'a',
                'gigantic', 'insect',
            ],
        ),
        (
            'inaholeinthegroundtherelivedahobbitnotanastydirtywet'
            'holefilledwiththeendsofwormsandanoozysmellnoryetadry'
            'baresandyholewithnothinginittositdownonortoeatitwasa'
            'hobbitholeandthatmeanscomfort',
            [
                'in', 'a', 'hole', 'in', 'the', 'ground', 'there',
                'lived', 'a', 'hobbit', 'not', 'a', 'nasty', 'dirty',
                'wet', 'hole', 'filled', 'with', 'the', 'ends', 'of',
                'worms', 'and', 'an', 'oozy', 'smell', 'nor', 'yet',
                'a', 'dry', 'bare', 'sandy', 'hole', 'with', 'nothing',
                'in', 'it', 'to', 'sitdown', 'on', 'or', 'to', 'eat',
                'it', 'was', 'a', 'hobbit', 'hole', 'and', 'that',
                'means', 'comfort',
            ],
        ),
        (
            'faroutintheunchartedbackwatersoftheunfashionablee'
            'ndofthewesternspiralarmofthegalaxyliesasmallunreg'
            'ardedyellowsun',
            [
                'far', 'out', 'in', 'the', 'uncharted', 'backwaters',
                'of', 'the', 'unfashionable', 'end', 'of', 'the',
                'western', 'spiral', 'arm', 'of', 'the', 'galaxy',
                'lies', 'a', 'small', 'un', 'regarded', 'yellow', 'sun',
            ],
        ),
    ]

    # Checked in order; the first mismatch fails the test.
    for unspaced, expected_words in cases:
        assert segment(unspaced) == expected_words
# encoding: utf-8

import sys
import chinese
import ngrams
import uniout 
import codecs

# Script: copy the UTF-8 text file named by sys.argv[1] to sys.argv[2],
# replacing every run of consecutive Chinese characters with its
# space-separated segmentation. All other characters are copied unchanged.

# Distribution built from the 'ciku.txt' data file; it is the second
# argument to every ngrams.segment call below.
Pw = ngrams.Pdist(ngrams.datafile('ciku.txt'), missingfn=ngrams.avoid_long_words)

with codecs.open(sys.argv[1], 'r', 'utf-8') as fin:
    with codecs.open(sys.argv[2], 'w', 'utf-8') as fout:
        for line in fin:
            # Index where the current run of Chinese characters began,
            # or -1 when not inside such a run.
            start = -1
            for i, char in enumerate(line):
                if chinese.is_chinese(char):
                    if start == -1:
                        start = i
                    continue

                if start != -1:
                    # A non-Chinese character ends the preceding run of
                    # Chinese characters: segment it and write the words
                    # with a space before and after.
                    seg = ngrams.segment(line[start:i], Pw)
                    fout.write(u' ' + u' '.join(seg) + u' ')
                    start = -1
                fout.write(char)

            if start != -1:
                # The line ended while still inside a Chinese run. This
                # happens when the last line of the file has no trailing
                # newline; without this flush the run would be dropped
                # from the output.
                seg = ngrams.segment(line[start:], Pw)
                fout.write(u' ' + u' '.join(seg) + u' ')