Example #1
    def POST(self):
        global _vm, _session

        _vm.attachCurrentThread()  # attach this worker thread to the JVM before any Lucene call
        user_data = json.loads(web.input()['data'])
        print user_data
        tgtype = user_data['type']
        if tgtype == 'user':
            tfield = 'author_index'
        elif tgtype == 'question':
            tfield = 'question_index'
        else:
            return ''
        with zh_iatd.create_searcher() as searcher:
            # same query twice, sorted by different integer fields;
            # the trailing True in SortField means descending order
            res1 = searcher.searcher.search(
                zh_iatd.create_query({
                    'type': 'answer',
                    tfield: user_data['index']
                }), 200, Sort(SortField('likes', SortField.Type.INT, True)))
            res2 = searcher.searcher.search(
                zh_iatd.create_query({
                    'type': 'answer',
                    tfield: user_data['index']
                }), 200, Sort(SortField('date', SortField.Type.INT, True)))
            # histogram: like counts of the top 200 answers by likes
            res1 = [
                zh_pganlz.document_to_obj(searcher.searcher.doc(
                    x.doc)).data.likes for x in res1.scoreDocs
            ]
            # graph: (date, likes) points for the 200 most recent answers
            res2 = [
                zh_pganlz.document_to_obj(searcher.searcher.doc(x.doc))
                for x in res2.scoreDocs
            ]
            res2 = [{'x': x.data.date, 'y': x.data.likes} for x in res2]
        return json.dumps({'histogram': res1, 'graph': res2})
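
The two searches above differ only in the sort field, so the duplication can be factored out. A minimal sketch under the same zh_iatd/PyLucene setup (search_answers_sorted is a hypothetical name, not part of zh_iatd):

def search_answers_sorted(searcher, tfield, index, sort_key, limit=200):
    # one query per call; reverse=True makes the integer sort descending
    query = zh_iatd.create_query({'type': 'answer', tfield: index})
    return searcher.searcher.search(
        query, limit, Sort(SortField(sort_key, SortField.Type.INT, True)))

res1 = search_answers_sorted(searcher, tfield, user_data['index'], 'likes')
res2 = search_answers_sorted(searcher, tfield, user_data['index'], 'date')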
Example #2
def main():
    _vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    query = BooleanQuery()
    query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
    query.add(TermQuery(Term('type', 'user')), BooleanClause.Occur.MUST)
    i = 0
    with zh_iatd.create_searcher() as searcher:
        with open('pagerank_data.txt', 'w') as fout:
            reslst = searcher.searcher.search(query, 100)
            initval = 1.0 / reslst.totalHits
            while len(reslst.scoreDocs) > 0:
                for x in reslst.scoreDocs:
                    realdoc = searcher.searcher.doc(x.doc)
                    obj = document_to_obj(realdoc)
                    if obj.data.followed_users is not None:
                        print '{0:8}'.format(i), '  user', obj.index, len(
                            obj.data.followed_users)
                        # one line per user: <index> <initial rank> <followee list>
                        fout.write('{0}\t{1}\t{2}\n'.format(
                            obj.index, initval, ' '.join(
                                (x.encode('utf8')
                                 for x in obj.data.followed_users))))
                    else:
                        print '{0:8}'.format(i), 'I user', obj.index
                    i += 1
                reslst = searcher.searcher.searchAfter(reslst.scoreDocs[-1],
                                                       query, 100)
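
The search/searchAfter loop above is the standard Lucene cursor pattern, and it reappears in the examples below. A minimal generator sketch (iter_all_docs is a hypothetical helper; it expects the raw IndexSearcher, i.e. searcher.searcher in these examples):

def iter_all_docs(searcher, query, page_size=100):
    # stream every matching document, one page at a time
    res = searcher.search(query, page_size)
    while len(res.scoreDocs) > 0:
        for sd in res.scoreDocs:
            yield searcher.doc(sd.doc)
        # resume strictly after the last hit of the previous page
        res = searcher.searchAfter(res.scoreDocs[-1], query, page_size)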
Example #3
def main():
    _vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    db_writer = zh_iatd.create_index_writer('.newdb')
    db_reader = zh_iatd.create_searcher(INDEXED_FOLDER)

    if len(sys.argv) < 2:
        res = db_reader.searcher.search(MatchAllDocsQuery(), 100)
        tot = 0
        while len(res.scoreDocs) > 0:
            for x in res.scoreDocs:
                realdoc = db_reader.searcher.doc(x.doc)
                obj = document_to_obj(realdoc)
                newdoc = obj_to_document(obj)
                db_writer.addDocument(newdoc)
                tot += 1
                sys.stdout.write('\r{0}'.format(tot))
                sys.stdout.flush()
            res = db_reader.searcher.searchAfter(res.scoreDocs[-1],
                                                 MatchAllDocsQuery(), 100)
    elif sys.argv[1] == 'mergerank':
        ranks = {}
        with open('prrank.txt', 'r') as fin:
            for x in fin:  # each line: <index> <rank>
                v = x.split()
                ranks[v[0]] = float(v[1])

        res = db_reader.searcher.search(MatchAllDocsQuery(), 100)
        tot = 0
        while len(res.scoreDocs) > 0:
            for x in res.scoreDocs:
                realdoc = db_reader.searcher.doc(x.doc)
                obj = document_to_obj(realdoc)
                if isinstance(obj, zh_pganlz.user) and obj.index in ranks:
                    obj.data.rank = ranks[obj.index]
                newdoc = obj_to_document(obj)
                db_writer.addDocument(newdoc)
                tot += 1
                sys.stdout.write('\r{0}'.format(tot))
                sys.stdout.flush()
            res = db_reader.searcher.searchAfter(res.scoreDocs[-1],
                                                 MatchAllDocsQuery(), 100)

    db_writer.commit()
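
Read together with Example #2, this is a pipeline: Example #2 dumps the follow graph to pagerank_data.txt, an external PageRank step is assumed to produce prrank.txt (one "<index> <rank>" pair per line), and the mergerank mode folds those ranks back into the fresh .newdb index. A hypothetical invocation, assuming this script is called rebuild_db.py:

    python rebuild_db.py            # plain copy of INDEXED_FOLDER into .newdb
    python rebuild_db.py mergerank  # copy plus PageRank scores from prrank.txt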
Example #4
def main():
    _vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    db_writer = zh_iatd.create_index_writer('.newdb')
    db_reader = zh_iatd.create_searcher(INDEXED_FOLDER)
    res = db_reader.searcher.search(MatchAllDocsQuery(), 100)
    tot = 0
    while len(res.scoreDocs) > 0:
        for x in res.scoreDocs:
            realdoc = db_reader.searcher.doc(x.doc)
            obj = document_to_obj(realdoc)
            if isinstance(obj, zh_pganlz.article):
                # migrate the legacy 'contents' field to 'text'
                if 'contents' in vars(obj.data):
                    obj.data.text = obj.data.contents
                    del obj.data.contents
            newdoc = obj_to_document(obj)
            db_writer.addDocument(newdoc)
            tot += 1
            sys.stdout.write('\r{0}'.format(tot))
            sys.stdout.flush()
        res = db_reader.searcher.searchAfter(res.scoreDocs[-1],
                                             MatchAllDocsQuery(), 100)
    db_writer.commit()
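
Examples #3 and #4 share one skeleton: scan the old index, adjust each decoded object, and write it to a new index. A hedged generic sketch (migrate_index is a hypothetical name; transform is any callable that mutates the object in place):

def migrate_index(reader, writer, transform, page_size=100):
    # copy every document, letting `transform` adjust the decoded object
    res = reader.searcher.search(MatchAllDocsQuery(), page_size)
    while len(res.scoreDocs) > 0:
        for sd in res.scoreDocs:
            obj = document_to_obj(reader.searcher.doc(sd.doc))
            transform(obj)
            writer.addDocument(obj_to_document(obj))
        res = reader.searcher.searchAfter(res.scoreDocs[-1],
                                          MatchAllDocsQuery(), page_size)
    writer.commit()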
Example #5
    def POST(self):
        def build_text_query(k, v):
            return QueryParser(k, WhitespaceAnalyzer()).parse(' '.join(
                jieba.lcut(v)))

        def build_anyterm_query(field, strv):
            res = BooleanQuery()
            for i in strv.split():
                res.add(TermQuery(Term(field, i)), BooleanClause.Occur.SHOULD)
            return res

        def get_query_result(sarc, dct):
            PAGE_SIZE = 10
            PAGE_JUMP = 10

            query = BooleanQuery()
            query.add(TermQuery(Term(zh_pganlz.LTPF_TYPE, '1')),
                      BooleanClause.Occur.MUST)
            page = 0
            sort_lists = []
            summ_set = set()
            exclus_set = None
            words = []
            for k, v in dct.items():
                if k in ('index', 'type', 'tag_indices', 'author_index'):
                    query.add(build_anyterm_query(k, dct[k]),
                              BooleanClause.Occur.MUST)
                elif k in ('text', 'contents', 'title', 'description',
                           'alias'):
                    words += jieba.lcut(v)
                    query.add(
                        build_text_query(k + zh_pganlz.LTPF_FOR_QUERY, dct[k]),
                        BooleanClause.Occur.MUST)

                elif k == 'raw':
                    query.add(
                        QueryParser('index',
                                    WhitespaceAnalyzer()).parse(dct[k]),
                        BooleanClause.Occur.MUST)
                elif k == 'enhraw':
                    # entries arrive in triples: a raw token, a prefix, and a
                    # phrase; the phrase is segmented with jieba and each word
                    # is glued to the prefix
                    pos = 0
                    reslst = []
                    for entry in v:
                        if pos == 2:
                            reslst += [
                                lastdoc + w.encode('utf8')
                                for w in jieba.cut(entry)
                            ]
                            pos = 0
                        else:
                            if pos == 0:
                                reslst.append(entry.encode('utf8'))
                            else:
                                lastdoc = entry.encode('utf8')
                            pos += 1
                    query.add(
                        QueryParser('index', WhitespaceAnalyzer()).parse(
                            ' '.join(reslst)), BooleanClause.Occur.MUST)

                elif k == 'page':
                    page = int(dct[k])
                elif k == 'sort':
                    for x in dct['sort']:
                        sort_type = SortField.Type.STRING
                        if 'type' in x.keys():
                            if x['type'] == 'int':
                                sort_type = SortField.Type.INT
                            elif x['type'] == 'float':
                                sort_type = SortField.Type.FLOAT
                        reverse = False
                        if 'reverse' in x.keys():
                            reverse = x['reverse']
                        sort_lists.append(
                            SortField(x['key'], sort_type, reverse))

                elif k == 'summarize':
                    summ_set = set(v)
                elif k == 'exclusive':
                    exclus_set = set(v)

            ressrt = Sort(*sort_lists)
            resdocs = sarc.searcher.search(query, PAGE_SIZE, ressrt)
            if page > 0:
                if resdocs.totalHits > page * PAGE_SIZE:
                    page -= 1
                    while page > PAGE_JUMP:
                        resdocs = sarc.searcher.searchAfter(
                            resdocs.scoreDocs[-1], query,
                            PAGE_SIZE * PAGE_JUMP, ressrt)
                        page -= PAGE_JUMP
                    if page > 0:
                        resdocs = sarc.searcher.searchAfter(
                            resdocs.scoreDocs[-1], query, PAGE_SIZE * page,
                            ressrt)
                    resdocs = sarc.searcher.searchAfter(
                        resdocs.scoreDocs[-1], query, PAGE_SIZE, ressrt)
                else:
                    resdocs.scoreDocs = []
            reslst = []
            for x in resdocs.scoreDocs:
                dictobj = zh_pganlz.obj_to_json(
                    zh_pganlz.document_to_obj(sarc.searcher.doc(x.doc)))
                if 'additional' in dct.keys():
                    # resolve cross references: for each requested spec, look
                    # up the object whose targetfield matches our sourcefield
                    adres = []
                    for spec in dct['additional']:
                        if isinstance(dictobj[spec['sourcefield']], list):
                            qlist = dictobj[spec['sourcefield']]
                        else:
                            qlist = [dictobj[spec['sourcefield']]]
                        cres = []
                        for qword in qlist:
                            if not isinstance(qword, (unicode, str)):
                                qword = str(qword)
                            searchres = sarc.searcher.search(
                                zh_iatd.create_query({
                                    'type': spec['type'],
                                    spec['targetfield']: qword
                                }), 1)
                            if searchres.totalHits == 0:
                                cres.append(None)
                            else:
                                if searchres.totalHits > 1:
                                    # multiple matches: warn, keep the first
                                    print spec, 'FOUND', searchres
                                cres.append(
                                    zh_pganlz.obj_to_json(
                                        zh_pganlz.document_to_obj(
                                            sarc.searcher.doc(
                                                searchres.scoreDocs[0].doc))))
                        adres.append(cres)
                for k, v in dictobj.items():
                    if k in summ_set:
                        dictobj[k + '_summary'] = summarize(
                            hyper_text(v).text, list(set(words)))
                if exclus_set is not None:
                    for k in dictobj.keys():
                        if k not in exclus_set:
                            del dictobj[k]
                if 'additional' in dct.keys():
                    dictobj['additional'] = adres
                reslst.append(dictobj)
            return {'total': resdocs.totalHits, 'data': reslst}

        global _vm

        _vm.attachCurrentThread()
        user_data = web.input()
        print user_data
        user_data = json.loads(user_data['data'])
        print user_data
        searcher = zh_iatd.create_searcher()
        print 'querys' in user_data
        if 'querys' in user_data:
            reslst = []
            for x in user_data['querys']:
                reslst.append(get_query_result(searcher, x))
            print len(reslst)
            print json.dumps({'results': reslst})
            return json.dumps({'results': reslst})
        else:
            res = get_query_result(searcher, user_data)  # run the query once
            print res
            return json.dumps(res)
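
The deep-pagination logic inside get_query_result deserves isolation: Lucene's search only returns the top of a sorted result set, so page N is reached by chaining searchAfter calls in coarse and fine jumps. The sketch below distills it (fetch_page is a hypothetical name; page_size and jump mirror PAGE_SIZE and PAGE_JUMP above):

def fetch_page(searcher, query, sort, page, page_size=10, jump=10):
    res = searcher.search(query, page_size, sort)
    if page == 0:
        return res.scoreDocs
    if res.totalHits <= page * page_size:
        return []  # requested page lies beyond the last hit
    page -= 1  # the initial search already covered page 0
    while page > jump:  # coarse jumps of `jump` pages at a time
        res = searcher.searchAfter(res.scoreDocs[-1], query,
                                   page_size * jump, sort)
        page -= jump
    if page > 0:  # the remaining pages in one finer jump
        res = searcher.searchAfter(res.scoreDocs[-1], query,
                                   page_size * page, sort)
    res = searcher.searchAfter(res.scoreDocs[-1], query, page_size, sort)
    return res.scoreDocs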
Example #6
def crawl_until_stop(session):
    global _vm, _stop, _stopped

    _vm.attachCurrentThread()

    db_writer = zh_iatd.create_index_writer()

    info_logger = external_console_logger('/tmp/zh_c_info')
    error_logger = external_console_logger('/tmp/zh_c_err')
    strategy = crawl_strategy()

    errcount = 0

    while not _stop:
        info_logger.write('  acquiring new tasks... ')
        task_reader = zh_iatd.create_searcher(TASK_FOLDER)
        # pending tasks still have finish_time == '0'
        default_query = BooleanQuery()
        default_query.add(TermQuery(Term('finish_time', '0')),
                          BooleanClause.Occur.MUST)
        strategy.process_query(default_query)
        idstart = task_reader.reader.numDocs()  # docid base for new tasks
        searchres = task_reader.searcher.search(default_query, 100)
        resdocs = [
            task_reader.searcher.doc(x.doc) for x in searchres.scoreDocs
        ]
        info_logger.write('got:{0} total:{1}\n'.format(searchres.totalHits,
                                                       idstart))
        task_reader.close()

        task_writer = zh_iatd.create_index_writer(TASK_FOLDER)

        for doct in resdocs:
            curt = task()
            curt.from_document(doct)
            crlt = curt.to_crawler_task()
            try:
                crlt.func(session, crlt)
            except Exception as e:
                info_logger.write('FAIL')
                error_logger.write(
                    '## ERROR ################################\n')
                zh_pganlz.print_object(crlt, out=error_logger)
                error_logger.write(
                    '-- stacktrace ---------------------------\n')
                error_logger.write(traceback.format_exc())
                errcount += 1
                error_logger.write('[Error count: {0}]\n'.format(errcount))

                # Lucene has no in-place update: drop the old task document
                # and re-add it with the failure count bumped
                task_writer.deleteDocuments(Term('docid', str(doct['docid'])))
                curt.fails += 1
                task_writer.addDocument(curt.to_document())
            else:
                if crlt.result_rep_obj is not None:
                    db_writer.deleteDocuments(crlt.result_query)
                    db_writer.addDocument(
                        zh_pganlz.obj_to_document(crlt.result_rep_obj))
                for x in crlt.result_new:
                    db_writer.addDocument(zh_pganlz.obj_to_document(x))
                db_writer.commit()

                task_writer.deleteDocuments(Term('docid', str(doct['docid'])))
                curt.finish_time = int(time.time())
                task_writer.addDocument(curt.to_document())
                for x in crlt.result_tasks:
                    newt = task()
                    newt.from_crawler_task(x)
                    newt.docid = idstart
                    idstart += 1
                    task_writer.addDocument(newt.to_document())
            if isinstance(crlt.prm_id, unicode):
                prids = crlt.prm_id.encode('utf8')
            else:
                prids = str(crlt.prm_id)
            info_logger.write(
                ' ~{0}(+{1}) -{2} {3}({4}, {5}, {6}, {7})\n'.format(
                    task_writer.numDocs(), len(crlt.result_tasks), curt.fails,
                    crlt.func.func_name[14:], prids, crlt.prm_start,
                    crlt.prm_pagesize, crlt.prm_extra))
            if _stop:
                break
            time.sleep(1)
        task_writer.close()
    info_logger.write('stopped\n')
    _stopped = True
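
Task state lives in the index itself: a task is pending while finish_time is '0' and done once it carries a real timestamp. Since Lucene documents are immutable, every state change above is a deleteDocuments followed by addDocument; IndexWriter.updateDocument(term, doc) performs the same delete-then-add in one call. A minimal sketch of the success path using it (mark_finished is a hypothetical helper):

def mark_finished(task_writer, curt, doct):
    # replace the stored task with a finished copy in a single call
    curt.finish_time = int(time.time())
    task_writer.updateDocument(Term('docid', str(doct['docid'])),
                               curt.to_document())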
Example #7
def index_images_until_stop(session, handler, lbound):
    global _stop, _stopped, _vm

    _vm.attachCurrentThread()
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(File(TASK_FOLDER))))
    query = BooleanQuery()
    # a purely negative BooleanQuery matches nothing, so the MUST_NOT
    # exclusion is paired with a match-all MUST clause
    query.add(TermQuery(Term('finish_time', '0')),
              BooleanClause.Occur.MUST_NOT)
    query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
    if lbound is not None:
        query.add(
            TermRangeQuery.newStringRange('finish_time', lbound, '9999999999',
                                          False, True),
            BooleanClause.Occur.MUST)
    sort = Sort(SortField('finish_time', SortField.Type.INT))
    tmpbk = None
    res = searcher.search(query, 100, sort)
    answer_content_searcher = zh_iatd.create_searcher()
    logger = external_console_logger('/tmp/zh_imgc_info')
    while not _stop:
        print 'got', len(res.scoreDocs), 'docs'
        for x in res.scoreDocs:
            try:
                imgsgot = 0
                realdoc = searcher.doc(x.doc)
                doctype = realdoc['func_name']
                objid = realdoc['id']
                logger.write(' ft:{0}'.format(realdoc['finish_time']))
                if doctype == 'user_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/people/{0}'.format(
                                    objid))), HTML_PARSER)
                    cover = soup.select(
                        '#ProfileHeader .ProfileHeader-userCover img')
                    if len(cover) > 0:
                        cover_img = cover[0]['src']
                        imgsgot += 1
                        handler(cover_img, ZH_IMGTYPE_USERINFO_COVER, objid)
                    avatar_img = soup.select(
                        '#ProfileHeader .ProfileHeader-main .UserAvatar img'
                    )[0]['src']
                    imgsgot += 1
                    handler(avatar_img, ZH_IMGTYPE_USER_AVATAR, objid)
                elif doctype == 'article_data':
                    jsondata = session.get_article_content_raw(objid)
                    if 'titleImage' in jsondata.keys():
                        cover_img = jsondata['titleImage']
                        if len(cover_img) > 0:
                            imgsgot += 1
                            handler(cover_img, ZH_IMGTYPE_ARTICLE_COVER, objid)
                    soup = bs4.BeautifulSoup(jsondata['content'], HTML_PARSER)
                    for img in soup.select('img'):
                        imgsgot += 1
                        handler(img['src'], ZH_IMGTYPE_IN_ARTICLE, objid)
                elif doctype == 'topic_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/topic/{0}/hot'.
                                format(objid))), HTML_PARSER)
                    topic_img = soup.select(
                        '.zu-main-content .topic-avatar .zm-entry-head-avatar-link img'
                    )[0]['src']
                    imgsgot += 1
                    handler(topic_img, ZH_IMGTYPE_TOPIC_ICON, objid)
                elif doctype == 'answer_comments' and realdoc['start'] == '0':
                    obj, q = zh_iatd.query_object(answer_content_searcher,
                                                  objid, zh_pganlz.answer)
                    for img in obj.data.text.as_soup().select('img'):
                        imgsgot += 1
                        handler(img['src'], ZH_IMGTYPE_IN_ANSWER, objid)
                elif doctype == 'question_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/question/{0}'.
                                format(objid))), HTML_PARSER)
                    for img in soup.select('#zh-question-detail img'):
                        imgsgot += 1
                        handler(img['src'], ZH_IMGTYPE_IN_QUESTION, objid)
                else:
                    logger.write('\n')
                    continue
                logger.write(' ({0}, +{1})\n'.format(doctype, imgsgot))
                if _stop:
                    break
                time.sleep(3)
            except Exception as e:
                logger.write('\n## ERROR ################################\n')
                logger.write(traceback.format_exc())
        if len(res.scoreDocs) > 0:
            tmpbk = res.scoreDocs[-1]  # bookmark for the next poll
        res = searcher.searchAfter(tmpbk, query, 100, sort)
    print 'stopped'
    _stopped = True
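
The tmpbk bookmark at the bottom is what turns the loop into a poll: each pass re-runs searchAfter from the last document seen, so only newer hits come back. One polling step, factored out (scan_new_hits is a hypothetical helper; passing None as the bookmark behaves like a plain search):

def scan_new_hits(searcher, query, sort, last, page_size=100):
    # fetch the hits after `last` and return them with the updated bookmark
    res = searcher.searchAfter(last, query, page_size, sort)
    docs = [searcher.doc(sd.doc) for sd in res.scoreDocs]
    if len(res.scoreDocs) > 0:
        last = res.scoreDocs[-1]
    return docs, last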