Пример #1
0
def bing_search(query, pages=10, search_type='Web'):
    """Run a Bing Search API query, caching the raw JSON response on disk.

    The response body is stored under ``html_path`` in a file named
    ``search_result_<query>.html`` (with ``/`` in the query replaced by
    ``_``).  If that file already exists it is parsed and returned without
    touching the network.

    :param query: search phrase; it is URL-quoted before being placed in
        the request URL.
    :param pages: number of result pages wanted; ``pages * 10`` is sent as
        the ``$top`` parameter.
    :param search_type: path component of the API endpoint (default
        ``'Web'``).
    :returns: the value found at ``['d']['results']`` in the decoded JSON
        response.
    """
    # Local import keeps this block self-contained; the file's import
    # section is not visible from here.
    import base64

    # Build the cache path once, from the *unquoted* query, so that the
    # existence check and the later write always refer to the same file.
    # (Previously the write used the URL-quoted query, so any query that
    # quoting changed was cached under a name that was never looked up.)
    cache_path = '%ssearch_result_%s.html' % (
        html_path, query.replace('/', '_'))

    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as cache_file:
            res = cache_file.read()
        data = json.loads(res)
        return data['d']['results']

    quoted_query = urllib.quote(query)

    # HTTP Basic credentials with an empty user name and the API key as the
    # password.  b64encode never inserts line breaks, unlike the 'base64'
    # string codec, which wraps long output and would corrupt the header.
    credentials = base64.b64encode(':%s' % bing_api_key)
    authorization = 'Basic %s' % credentials

    search_url = 'https://api.datamarket.azure.com/Bing/Search/v1/' \
                 '%(search_type)s?Query=%%27%(query)s%%27&$top=%(number)s&$format=json' % \
        {
            'search_type': search_type,
            'query': quoted_query,
            'number': pages * 10,
        }
    headers = {
        'Authorization': authorization,
    }

    # NOTE(review): gunzip/get/write_file are helpers defined elsewhere;
    # presumably the response arrives gzip-compressed -- confirm.
    res = gunzip(get(search_url, headers).read())
    write_file(cache_path, res, True)
    data = json.loads(res)
    return data['d']['results']
def get_all_corpus(website):
    """Return the word-split corpus for ``website``, building it on a miss.

    The corpus is cached in ``<corpus_path><website[0]>.txt``.  When that
    file exists its raw bytes are returned as-is; otherwise the Baidu and
    Bing corpora are fetched, joined with a newline, passed through
    ``split_word_only`` and the result is written to the cache file.

    :param website: indexable object whose first element names the cache
        file; the whole object is handed to the corpus helpers.
    :returns: the cached file contents, or the freshly split text.
    """
    # Progress output; kept with the original byte-string format.
    print('%s%s.txt' % (corpus_path, website[0]))

    corpus_file = u'%s%s.txt' % (corpus_path, website[0])
    if os.path.exists(corpus_file):
        # Context manager so the handle is closed deterministically.
        with open(corpus_file, 'rb') as cached:
            return cached.read()

    content = '%s\n%s' % (get_corpus_by_baidu(website),
                          get_corpus_by_bing(website))
    split_words = split_word_only(content)
    write_file(corpus_file, split_words, debug_flag)
    return split_words
Пример #3
0
def expand(word):
    """Return expanded content for ``word``, using an on-disk cache.

    The cache file lives at ``<expanded_words_path><word>``.  On a hit the
    file's raw bytes are returned.  On a miss the word is passed to
    ``search``, the result to ``get_content``, and the content is written
    to the cache file before being returned.

    :param word: term to expand; also used as the cache file name.
    :returns: cached file contents, or the freshly fetched content.
    """
    file_path = u'%s%s' % (expanded_words_path, word)
    if os.path.exists(file_path):
        # Context manager so the handle is closed deterministically.
        with open(file_path, 'rb') as cached:
            return cached.read()

    if debug_flag:
        print('wiki expanding ... ')

    content = get_content(search(word))
    write_file(file_path, content, True)
    return content
def baidu_search(query, pages=10, search_type='Web'):
    """Collect Baidu search results for ``query``, caching them as JSON.

    Results are cached under ``html_path`` in a file named
    ``search_result_<query>.html`` (with ``/`` in the query replaced by
    ``_``).  On a cache hit the file is decoded with ``json.loads`` and
    returned.  Otherwise ``baidu_search_single_search`` is called once per
    page index in ``range(pages)``, the per-page lists are concatenated,
    and the combined list is serialised to the cache file.

    :param query: search phrase.
    :param pages: number of result pages to fetch.
    :param search_type: forwarded unchanged to the per-page helper.
    :returns: the combined result list (on a hit, the JSON-decoded copy).
    """
    # Build the cache path once so the check, read and write cannot drift.
    cache_path = u'%ssearch_result_%s.html' % (
        html_path, query.replace('/', '_'))

    if os.path.exists(cache_path):
        # Context manager so the handle is closed deterministically.
        with open(cache_path, 'rb') as cache_file:
            return json.loads(cache_file.read())

    result_list = []
    for page in range(pages):
        result_list.extend(
            baidu_search_single_search(query, page, search_type))

    write_file(cache_path, json.dumps(result_list), True)
    return result_list
Пример #5
0
def split_word(sentence, filename='', cut_all=True, show_nominal=False):
    """Segment ``sentence`` and return the tokens separated by spaces.

    Each token is formatted as ``'%s '``, so the returned string ends with
    a trailing space whenever at least one token was produced.

    :param sentence: text to segment.
    :param filename: when not the empty string, the result is also passed
        to ``write_file`` under this name.
    :param cut_all: forwarded as the second positional argument of the
        segmenter.
    :param show_nominal: when true, ``pseg.cut`` is used instead of
        ``jieba.cut``.
    :returns: the space-separated token string.
    """
    # NOTE(review): cut_all is forwarded positionally to pseg.cut as well;
    # confirm pseg.cut treats its second argument the way jieba.cut does.
    if show_nominal:
        tokens = pseg.cut(sentence, cut_all)
    else:
        tokens = jieba.cut(sentence, cut_all)

    pieces = []
    for token in tokens:
        if debug_flag:
            print(token)
        pieces.append('%s ' % token)
    joined = ''.join(pieces)

    if filename != '':
        write_file(filename, joined)
    return joined