Пример #1
0
def main():
    """Dump every user's followed-user list to pagerank_data.txt.

    Iterates all documents whose 'type' field is 'user' and writes one
    tab-separated line per user: index, initial PageRank value, and the
    space-joined, UTF-8 encoded list of followed user ids.
    """
    # Start the JVM for PyLucene; headless since no GUI is needed.
    _vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    # MatchAll restricted to docs whose 'type' field is 'user'.
    query = BooleanQuery()
    query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
    query.add(TermQuery(Term('type', 'user')), BooleanClause.Occur.MUST)
    i = 0  # running count of processed users, for console output only
    with zh_iatd.create_searcher() as searcher:
        with open('pagerank_data.txt', 'w') as fout:
            reslst = searcher.searcher.search(query, 100)
            # Uniform initial rank: 1 / number of matching users.
            initval = 1.0 / reslst.totalHits
            # Page through the results 100 docs at a time.
            while len(reslst.scoreDocs) > 0:
                for x in reslst.scoreDocs:
                    realdoc = searcher.searcher.doc(x.doc)
                    obj = document_to_obj(realdoc)
                    if not obj.data.followed_users is None:
                        print '{0:8}'.format(i), '  user', obj.index, len(
                            obj.data.followed_users)
                        fout.write('{0}\t{1}\t{2}\n'.format(
                            obj.index, initval, ' '.join(
                                (x.encode('utf8')
                                 for x in obj.data.followed_users))))
                    else:
                        # 'I' marks users without followed-user data;
                        # they get no line in the output file.
                        print '{0:8}'.format(i), 'I user', obj.index
                    i += 1
                # Continue from the last doc of the previous page.
                reslst = searcher.searcher.searchAfter(reslst.scoreDocs[-1],
                                                       query, 100)
Пример #2
0
    def testFacetAndTopsMultiCollector(self):
        """Facet and top-docs collectors combined via MultiSuperCollector.

        99 docs spread over 10 facet values: value0..value8 occur 10x,
        value9 only 9x.
        """
        index = Index(path=self.tempdir, settings=LuceneSettings())
        for n in xrange(99):
            doc = createDocument(
                fields=[("field1", str(n)), ("field2", str(n) * 1000)],
                facets=[("facet1", "value%s" % (n % 10))])
            doc = index._facetsConfig.build(index._taxoWriter, doc)
            index._indexWriter.addDocument(doc)
        index.commit()
        index.close()
        # Reopen so the search sees the committed segments.
        index = Index(path=self.tempdir, settings=LuceneSettings())

        facetCollector = FacetSuperCollector(
            index._indexAndTaxonomy.taxoReader, index._facetsConfig,
            index._ordinalsReader)
        topsCollector = TopScoreDocSuperCollector(10, True)
        subCollectors = ArrayList().of_(SuperCollector)
        subCollectors.add(topsCollector)
        subCollectors.add(facetCollector)
        multi = MultiSuperCollector(subCollectors)
        index.search(MatchAllDocsQuery(), None, multi)

        self.assertEquals(99, topsCollector.topDocs(0).totalHits)
        self.assertEquals(10, len(topsCollector.topDocs(0).scoreDocs))
        topChildren = facetCollector.getTopChildren(10, "facet1", [])

        self.assertEquals(
            [('value0', 10), ('value1', 10), ('value2', 10), ('value3', 10),
             ('value4', 10), ('value5', 10), ('value6', 10), ('value7', 10),
             ('value8', 10), ('value9', 9)],
            [(label.label, label.value.intValue())
             for label in topChildren.labelValues])
Пример #3
0
 def searchWithFacets(cls, indexReader, taxoReader, facets_config):
     """Browse the whole index, counting facets for every live document.

     Returns a list of FacetResult instances.
     """
     # MatchAllDocsQuery matches every non-deleted doc, so the facet
     # counts cover the entire index ("browsing").
     return cls.searchWithQuery(MatchAllDocsQuery(), indexReader, taxoReader,
                                facets_config)
Пример #4
0
 def facets(self, facets, filterQueries, drilldownQueries=None, filter=None):
     """Collect facet counts over all docs passing the given filters.

     Generator protocol: the result is delivered via generatorReturn;
     the trailing yield makes this a generator as the framework expects.
     """
     collector = self._facetCollector() if facets else None
     combinedFilter = self._filterFor(filterQueries, filter=filter)
     if drilldownQueries:
         luceneQuery = self.createDrilldownQuery(MatchAllDocsQuery(),
                                                 drilldownQueries)
     else:
         luceneQuery = MatchAllDocsQuery()
     self._index.search(luceneQuery, combinedFilter, collector)
     generatorReturn(self._facetResult(collector, facets))
     yield
Пример #5
0
def main():
    """Rewrite the index into '.newdb'.

    With no CLI argument: copy every document unchanged.
    With 'mergerank': additionally merge PageRank values from prrank.txt
    into user objects before writing.
    """
    # Start the JVM for PyLucene; headless since no GUI is needed.
    _vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    db_writer = zh_iatd.create_index_writer('.newdb')
    db_reader = zh_iatd.create_searcher(INDEXED_FOLDER)

    if len(sys.argv) < 2:
        # Plain copy: page through all docs 100 at a time.
        res = db_reader.searcher.search(MatchAllDocsQuery(), 100)
        tot = 0
        while len(res.scoreDocs) > 0:
            for x in res.scoreDocs:
                realdoc = db_reader.searcher.doc(x.doc)
                obj = document_to_obj(realdoc)
                newdoc = obj_to_document(obj)
                db_writer.addDocument(newdoc)
                tot += 1
                # Progress counter, overwritten in place on one line.
                sys.stdout.write('\r{0}'.format(tot))
                sys.stdout.flush()
            res = db_reader.searcher.searchAfter(res.scoreDocs[-1],
                                                 MatchAllDocsQuery(), 100)
    elif sys.argv[1] == 'mergerank':
        # Load the id -> rank mapping; each line is "<id> <rank>".
        ranks = {}
        with open('prrank.txt', 'r') as fin:
            for x in fin.readlines():
                v = x.split()
                ranks[v[0]] = float(v[1])

        res = db_reader.searcher.search(MatchAllDocsQuery(), 100)
        tot = 0
        while len(res.scoreDocs) > 0:
            for x in res.scoreDocs:
                realdoc = db_reader.searcher.doc(x.doc)
                obj = document_to_obj(realdoc)
                if isinstance(obj, zh_pganlz.user):
                    # Attach the PageRank value when we have one.
                    if obj.index in ranks.keys():
                        obj.data.rank = ranks[obj.index]
                newdoc = obj_to_document(obj)
                db_writer.addDocument(newdoc)
                tot += 1
                sys.stdout.write('\r{0}'.format(tot))
                sys.stdout.flush()
            res = db_reader.searcher.searchAfter(res.scoreDocs[-1],
                                                 MatchAllDocsQuery(), 100)

    db_writer.commit()
Пример #6
0
 def testSearch(self):
     """Hit count goes from 0 to 1 after adding a single document."""
     collector = TotalHitCountSuperCollector()
     index = Index(path=self.tempdir, settings=LuceneSettings())
     matchAll = MatchAllDocsQuery()
     index.search(matchAll, None, collector)
     self.assertEquals(0, collector.getTotalHits())
     index._indexWriter.addDocument(document(name="one", price="2"))
     index.close()
     # Reopen the index so the added document becomes visible.
     index = Index(path=self.tempdir, settings=LuceneSettings())
     index.search(matchAll, None, collector)
     self.assertEquals(1, collector.getTotalHits())
Пример #7
0
def main():
    """Copy every document into a new index ('.newdb'), renaming the
    legacy 'contents' field of article objects to 'text'."""
    _vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    writer = zh_iatd.create_index_writer('.newdb')
    reader = zh_iatd.create_searcher(INDEXED_FOLDER)
    page = reader.searcher.search(MatchAllDocsQuery(), 100)
    count = 0
    # Page through all documents 100 at a time.
    while len(page.scoreDocs) > 0:
        for scoredoc in page.scoreDocs:
            obj = document_to_obj(reader.searcher.doc(scoredoc.doc))
            if isinstance(obj, zh_pganlz.article):
                # Migrate the old field name when present.
                if 'contents' in vars(obj.data):
                    obj.data.text = obj.data.contents
                    del obj.data.contents
            writer.addDocument(obj_to_document(obj))
            count += 1
            # Progress counter, overwritten in place on one line.
            sys.stdout.write('\r{0}'.format(count))
            sys.stdout.flush()
        page = reader.searcher.searchAfter(page.scoreDocs[-1],
                                           MatchAllDocsQuery(), 100)
    writer.commit()
Пример #8
0
def getQueryBuiler():
    """Build a BooleanQuery.Builder from the global ``args.search`` clauses.

    Each clause is a space-separated string:
        <occur> <kind> <field> <args...>
    where <occur> is one of must/should/filter/must_not and <kind> is one
    of query/intrange/termrange.  With no clauses at all, a single
    MatchAllDocsQuery MUST clause is added.

    Returns the (unbuilt) BooleanQuery.Builder.
    """
    boolean_query = BooleanQuery.Builder()

    if len(args.search) == 0:
        boolean_query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
        return boolean_query

    for clause in args.search:
        curSearch = clause.split(' ')

        # Reset per clause: previously an unrecognized kind either raised
        # NameError (first iteration) or silently reused the query built
        # by the previous iteration.
        query = None
        if curSearch[1] == 'query':
            parser = QueryParser(curSearch[2], analyzer)
            query = parser.parse(curSearch[3])
        elif curSearch[1] == 'intrange':
            query = IntPoint.newRangeQuery(curSearch[2], curSearch[3],
                                           curSearch[4])
        elif curSearch[1] == 'termrange':
            lowerDate = handleDate(curSearch[3], '%d/%b/%Y:%H:%M:%S')
            upperDate = handleDate(curSearch[4], '%d/%b/%Y:%H:%M:%S')
            query = TermRangeQuery.newStringRange(curSearch[2], lowerDate,
                                                  upperDate, True, True)
        if query is None:
            print('unknown search kind: {0}'.format(curSearch[1]))
            continue

        if curSearch[0] == 'must':
            boolean_query.add(query, BooleanClause.Occur.MUST)
        elif curSearch[0] == 'should':
            boolean_query.add(query, BooleanClause.Occur.SHOULD)
        elif curSearch[0] == 'filter':
            boolean_query.add(query, BooleanClause.Occur.FILTER)
        elif curSearch[0] == 'must_not':
            boolean_query.add(query, BooleanClause.Occur.MUST_NOT)
        else:
            print('raise exception')
            # raise Exception

    return boolean_query
Пример #9
0
 def _sinqleQuery(self, query):
     """Execute a composed query against its single result core and
     record the elapsed query time.

     Generator protocol: the result is delivered via generatorReturn.
     """
     start = time()
     coreName = query.resultsFrom
     luceneQuery = query.queryFor(core=coreName)
     if luceneQuery is None:
         # No core-specific query: browse everything.
         luceneQuery = MatchAllDocsQuery()
     response = yield self.any[coreName].executeQuery(
         luceneQuery=luceneQuery,
         facets=query.facetsFor(coreName),
         filterQueries=query.filterQueriesFor(coreName),
         drilldownQueries=query.drilldownQueriesFor(coreName),
         **query.otherKwargs())
     response.queryTime = millis(time() - start)
     generatorReturn(response)
Пример #10
0
 def getLastStampId(self, prefix='oai_dc', setSpec=None):
     """Return the stamp of the most recent record, or None when empty.

     Restricts to the given metadata prefix; when prefix is None,
     restricts to setSpec instead; with both None, all records count.
     """
     searcher = self._getSearcher()
     # Newest first: numeric stamp field, descending.
     stampSort = Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG,
                                True))
     if prefix is None:
         if setSpec is None:
             query = MatchAllDocsQuery()
         else:
             query = TermQuery(Term(SETS_FIELD, setSpec))
     else:
         query = TermQuery(Term(PREFIX_FIELD, prefix))
     hits = searcher.search(query, 1, stampSort)
     if hits.totalHits.value < 1:
         return None
     return _stampFromDocument(searcher.doc(hits.scoreDocs[0].doc))
Пример #11
0
 def testSearchTopDocs(self):
     """Top-2 collector reports all 3 hits but keeps only 2 score docs."""
     index = Index(path=self.tempdir, settings=LuceneSettings())
     index._indexWriter.addDocument(
         document(name="one", price="aap noot mies"))
     index._indexWriter.addDocument(
         document(name="two", price="aap vuur boom"))
     index._indexWriter.addDocument(
         document(name="three", price="noot boom mies"))
     index.close()
     # Reopen so the search sees the added documents.
     index = Index(path=self.tempdir, settings=LuceneSettings())
     collector = TopScoreDocSuperCollector(2, True)
     index.search(MatchAllDocsQuery(), None, collector)
     topDocs = collector.topDocs(0)
     self.assertEquals(3, collector.getTotalHits())
     self.assertEquals(3, topDocs.totalHits)
     self.assertEquals(2, len(topDocs.scoreDocs))
Пример #12
0
    def testCollectorFiltersTwoSimilar(self):
        """Two docs sharing an isformatof key collapse into one result;
        the surviving key is urn:2 and counts both documents."""
        self._addDocument("urn:1", 2, 1)
        self._addDocument("urn:2", 2, 2)
        top = TopScoreDocSuperCollector(100, True)
        dedup = DeDupFilterSuperCollector("__isformatof__", "__sort__", top)
        self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)
        topDocs = top.topDocs(0)
        self.assertEquals(1, topDocs.totalHits)
        self.assertEquals(1, len(topDocs.scoreDocs))

        dedupKey = dedup.keyForDocId(topDocs.scoreDocs[0].doc)
        identifier = self.lucene._index.getDocument(
            dedupKey.getDocId()).get(IDFIELD)
        self.assertEquals('urn:2', identifier)
        self.assertEquals(2, dedupKey.count)
Пример #13
0
 def testShouldAddResultsWithoutIsFormatOf(self):
     """Docs lacking an isformatof value all survive deduplication;
     only urn:1 and urn:3 (both isformatof=2) collapse: 11 docs -> 10."""
     self._addDocument("urn:1", 2)
     self._addDocument("urn:2", None)
     self._addDocument("urn:3", 2)
     for suffix in "456789A":
         self._addDocument("urn:" + suffix, None)
     self._addDocument("urn:B", None)  # trigger a merge
     top = TopScoreDocSuperCollector(100, True)
     dedup = DeDupFilterSuperCollector("__isformatof__", "__sort__", top)
     self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)
     self.assertEquals(10, top.topDocs(0).totalHits)
Пример #14
0
    def visitSEARCH_CLAUSE(self, node):
        """Convert a CQL SEARCH_CLAUSE parse node into a Lucene query.

        Dispatches on the node's first child:
        - SEARCH_TERM: unqualified term; '*' becomes MatchAllDocsQuery,
          otherwise term/phrase queries over the configured unqualified
          term fields are OR-ed together.
        - INDEX: qualified '<field> <relation> <term>' clause.
        - otherwise: a nested CQL_QUERY whose converted result is
          passed through unchanged.
        """
        # possible children:
        # CQL_QUERY
        # SEARCH_TERM
        # INDEX, RELATION, SEARCH_TERM
        firstChild = node.children[0].name
        results = CqlVisitor.visitSEARCH_CLAUSE(self, node)
        if firstChild == 'SEARCH_TERM':
            (unqualifiedRhs, ) = results
            if unqualifiedRhs == '*':
                return MatchAllDocsQuery()
            subQueries = []
            for fieldname, boost in self._unqualifiedTermFields:
                subQuery = self._termOrPhraseQuery(fieldname, unqualifiedRhs)
                # Drop phrase queries on fields that cannot support them.
                if isinstance(
                        subQuery, PhraseQuery
                ) and not self._fieldRegistry.phraseQueryPossible(fieldname):
                    continue
                subQuery.setBoost(boost)
                subQueries.append(subQuery)
            if len(subQueries) == 1:
                query = subQueries[0]
            else:
                # Zero or several candidates: OR them in a BooleanQuery.
                query = BooleanQuery()
                for subQuery in subQueries:
                    query.add(subQuery, BooleanClause.Occur.SHOULD)
            return query
        elif firstChild == 'INDEX':
            (left, (relation, boost), right) = results
            # '=='/'exact' (or '=' on an untokenized field) -> exact term.
            if relation in [
                    '==', 'exact'
            ] or (relation == '=' and self._fieldRegistry.isUntokenized(left)):
                query = TermQuery(self._createTerm(left, right))
            elif relation == '=':
                query = self._termOrPhraseQuery(left, right)
            elif relation in ['<', '<=', '>=', '>']:
                query = self._termRangeQuery(left, relation, right)
            else:
                raise UnsupportedCQL("'%s' not supported for the field '%s'" %
                                     (relation, left))

            query.setBoost(boost)
            return query
        else:
            # Nested CQL_QUERY: unwrap the single converted sub-query.
            ((query, ), ) = results
            return query
Пример #15
0
    def get_title_id_map(self):
        """Return (title -> id, id -> title) dicts covering every doc
        in the index."""
        # Ask for as many hits as there are docs so nothing is truncated.
        total = self.searcher.getIndexReader().numDocs()

        title_to_id = {}
        id_to_title = {}
        hits = self.searcher.search(MatchAllDocsQuery(), total)
        for scoredoc in hits.scoreDocs:
            stored = self.searcher.doc(scoredoc.doc)
            doc_id = int(stored['id'])
            doc_title = stored['title']
            title_to_id[doc_title] = doc_id
            id_to_title[doc_id] = doc_title

        return title_to_id, id_to_title
Пример #16
0
    def _multipleCoreQuery(self, query):
        """Execute a composed query that spans multiple cores.

        The non-result cores are queried first to narrow the key set;
        the result core is then searched with that key filter.  Facets
        requested for other cores are gathered afterwards, restricted to
        the keys actually collected from the result core's matches.
        Generator protocol: the result is delivered via generatorReturn.
        """
        t0 = time()
        resultCoreName = query.resultsFrom
        resultCoreKey = query.keyName(resultCoreName)
        otherCoreNames = [
            coreName for coreName in query.cores if coreName != resultCoreName
        ]

        # Fold each other core's matches into finalKeys (see _coreQueries).
        finalKeys = self._uniteFilter(query)
        for otherCoreName in otherCoreNames:
            finalKeys = self._coreQueries(otherCoreName, query, finalKeys)

        # finalKeys is None when the other cores impose no restriction.
        summaryFilter = None
        if finalKeys is not None:
            summaryFilter = KeyFilter(finalKeys, resultCoreKey)

        resultCoreQuery = self._luceneQueryForCore(resultCoreName, query)
        aggregateScoreCollector = self._createAggregateScoreCollector(
            query, resultCoreKey)
        keyCollector = KeySuperCollector(
            resultCoreKey) if self._multithreaded else KeyCollector(
                resultCoreKey)
        result = yield self.any[resultCoreName].executeQuery(
            luceneQuery=resultCoreQuery or MatchAllDocsQuery(),
            filter=summaryFilter,
            facets=query.facetsFor(resultCoreName),
            scoreCollector=aggregateScoreCollector,
            keyCollector=keyCollector,
            **query.otherKwargs())

        # Facets for the other cores, filtered to the keys present in the
        # result core's matches.
        for otherCoreName in otherCoreNames:
            if query.facetsFor(otherCoreName):
                keyFilter = KeyFilter(keyCollector.getCollectedKeys(),
                                      query.keyName(otherCoreName))
                result.drilldownData.extend(
                    (yield self.any[otherCoreName].facets(
                        facets=query.facetsFor(otherCoreName),
                        filterQueries=query.queriesFor(otherCoreName) +
                        query.uniteQueriesFor(otherCoreName),
                        drilldownQueries=query.drilldownQueriesFor(
                            otherCoreName),
                        filter=keyFilter)))

        result.queryTime = millis(time() - t0)
        generatorReturn(result)
Пример #17
0
 def testCollectorFiltersTwoTimesTwoSimilarOneNot(self):
     """Of five docs, the two isformatof groups (3 and 1) each collapse
     to one result: urn:2, urn:3 and urn:5 survive, counts 1, 2, 2."""
     self._addDocument("urn:1", 1, 2001)
     self._addDocument("urn:2", 3, 2009)  # result 2x
     self._addDocument("urn:3", 50, 2010)  # result 1x
     self._addDocument("urn:4", 3, 2001)
     self._addDocument("urn:5", 1, 2009)  # result 2x
     #expected: "urn:2', "urn:3" and "urn:5" in no particular order
     top = TopScoreDocSuperCollector(100, True)
     dedup = DeDupFilterSuperCollector("__isformatof__", "__sort__", top)
     self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)
     topDocs = top.topDocs(0)
     self.assertEquals(3, topDocs.totalHits)
     self.assertEquals(3, len(topDocs.scoreDocs))
     netDocIds = [
         dedup.keyForDocId(scoreDoc.doc).docId
         for scoreDoc in topDocs.scoreDocs
     ]
     identifiers = set(
         self.lucene._index.getDocument(docId).get(IDFIELD)
         for docId in netDocIds)
     self.assertEquals(set(["urn:2", "urn:3", "urn:5"]), identifiers)
     self.assertEquals(
         [1, 2, 2],
         sorted(dedup.keyForDocId(docId).count for docId in netDocIds))
Пример #18
0
 def _luceneQueryBuilder(self,
                         prefix,
                         sets=None,
                         setsMask=None,
                         partition=None):
     """Compose a BooleanQuery.Builder from the given record selectors.

     Falls back to a single MatchAllDocsQuery MUST clause when no
     selector contributed anything, so the built query never matches
     nothing by accident.
     """
     builder = BooleanQuery.Builder()
     clauses = 0
     if prefix:
         builder.add(TermQuery(Term(PREFIX_FIELD, prefix)),
                     BooleanClause.Occur.MUST)
         clauses += 1
     if sets:
         # Any one of the requested sets may match (SHOULD inside MUST).
         anySet = BooleanQuery.Builder()
         for setSpec in sets:
             anySet.add(TermQuery(Term(SETS_FIELD, setSpec)),
                        BooleanClause.Occur.SHOULD)
         builder.add(anySet.build(), BooleanClause.Occur.MUST)
         clauses += 1
     # Every masked set is mandatory on its own.
     for maskedSet in setsMask or []:
         builder.add(TermQuery(Term(SETS_FIELD, maskedSet)),
                     BooleanClause.Occur.MUST)
         clauses += 1
     if partition:
         rangeQueries = [
             IntPoint.newRangeQuery(HASH_FIELD, start, stop - 1)
             for start, stop in partition.ranges()
         ]
         if len(rangeQueries) == 1:
             partitionQuery = rangeQueries[0]
         else:
             anyRange = BooleanQuery.Builder()
             for rangeQuery in rangeQueries:
                 anyRange.add(rangeQuery, BooleanClause.Occur.SHOULD)
             partitionQuery = anyRange.build()
         builder.add(partitionQuery, BooleanClause.Occur.MUST)
         clauses += 1
     if clauses == 0:
         builder.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
     return builder
Пример #19
0
    def testFacetSuperCollector(self):
        """Top-10 facet values over 1000 docs spread across 100 values."""
        index = Index(path=self.tempdir, settings=LuceneSettings())
        for n in xrange(1000):
            doc = createDocument(
                fields=[("field1", str(n)), ("field2", str(n) * 1000)],
                facets=[("facet1", "value%s" % (n % 100))])
            doc = index._facetsConfig.build(index._taxoWriter, doc)
            index._indexWriter.addDocument(doc)
        index.close()
        # Reopen so the search sees the added documents.
        index = Index(path=self.tempdir, settings=LuceneSettings())

        collector = FacetSuperCollector(index._indexAndTaxonomy.taxoReader,
                                        index._facetsConfig,
                                        index._ordinalsReader)
        index.search(MatchAllDocsQuery(), None, collector)
        topChildren = collector.getTopChildren(10, "facet1", [])
        # All 100 facet values occur 10x; this collector yields these ten.
        self.assertEquals(
            [('value90', 10), ('value91', 10), ('value92', 10),
             ('value93', 10), ('value94', 10), ('value95', 10),
             ('value96', 10), ('value97', 10), ('value98', 10),
             ('value99', 10)],
            [(label.label, label.value.intValue())
             for label in topChildren.labelValues])
Пример #20
0
 def testSearchTopField(self):
     """Reverse sort on 'name' returns docs '2' and '3' as the top two."""
     index = Index(path=self.tempdir, settings=LuceneSettings())
     index._indexWriter.addDocument(
         document(__id__='1', name="one", price="aap noot mies"))
     index.commit()
     index._indexWriter.addDocument(
         document(__id__='2', name="two", price="aap vuur boom"))
     index.commit()
     index._indexWriter.addDocument(
         document(__id__='3', name="three", price="noot boom mies"))
     index.commit()
     index.close()
     # Reopen so the search sees the committed documents.
     index = Index(path=self.tempdir, settings=LuceneSettings())
     nameSort = Sort(SortField("name", SortField.Type.STRING, True))
     collector = TopFieldSuperCollector(nameSort, 2, True, False, True)
     index.search(MatchAllDocsQuery(), None, collector)
     topDocs = collector.topDocs(0)
     self.assertEquals(3, collector.getTotalHits())
     self.assertEquals(3, topDocs.totalHits)
     self.assertEquals(2, len(topDocs.scoreDocs))
     self.assertEquals(
         ['2', '3'],
         [index.getDocument(scoreDoc.doc).get("__id__")
          for scoreDoc in topDocs.scoreDocs])
Пример #21
0
 def get_all_docs(self, n_hits=1000):
     """Debug helper: fetch up to n_hits docs, processing each result."""
     hits = self.searcher.search(MatchAllDocsQuery(), n_hits)
     processed = []
     for scoredoc in hits.scoreDocs:
         processed.append(self._process_search_result(scoredoc))
     return processed
Пример #22
0
def index_images_until_stop(session, handler, lbound):
    """Crawl image URLs for finished task docs until _stop is set.

    Pages through task documents sorted by finish_time (optionally
    lower-bounded by lbound), scrapes the matching zhihu.com page per
    doc type, and calls handler(url, image_type, objid) for every image
    found.  Sets the module-level _stopped flag on exit.
    """
    global _stop, _stopped, _vm

    # This runs on a worker thread; attach it to the already-running JVM.
    _vm.attachCurrentThread()
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(File(TASK_FOLDER))))
    # All docs except unfinished ones (finish_time == '0').
    query = BooleanQuery()
    query.add(TermQuery(Term('finish_time', '0')),
              BooleanClause.Occur.MUST_NOT)
    query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
    if not lbound is None:
        # Resume strictly after lbound (exclusive lower bound).
        query.add(
            TermRangeQuery.newStringRange('finish_time', lbound, '9999999999',
                                          False, True),
            BooleanClause.Occur.MUST)
    sort = Sort(SortField('finish_time', SortField.Type.INT))
    tmpbk = None  # bookmark: last scoreDoc of the previous page
    res = searcher.search(query, 100, sort)
    answer_content_searcher = zh_iatd.create_searcher()
    logger = external_console_logger('/tmp/zh_imgc_info')
    while not _stop:
        print 'got', len(res.scoreDocs), 'docs'
        for x in res.scoreDocs:
            try:
                imgsgot = 0  # images found for this doc
                realdoc = searcher.doc(x.doc)
                doctype = realdoc['func_name']
                objid = realdoc['id']
                logger.write(' ft:{0}'.format(realdoc['finish_time']))
                if doctype == 'user_data':
                    # Profile page: optional cover image + avatar.
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/people/{0}'.format(
                                    objid))), HTML_PARSER)
                    cover = soup.select(
                        '#ProfileHeader .ProfileHeader-userCover img')
                    if len(cover) > 0:
                        cover_img = cover[0]['src']
                        imgsgot += 1
                        handler(cover_img, ZH_IMGTYPE_USERINFO_COVER, objid)
                    avatar_img = soup.select(
                        '#ProfileHeader .ProfileHeader-main .UserAvatar img'
                    )[0]['src']
                    imgsgot += 1
                    handler(avatar_img, ZH_IMGTYPE_USER_AVATAR, objid)
                elif doctype == 'article_data':
                    # Article: optional title image + every inline image.
                    jsondata = session.get_article_content_raw(objid)
                    if 'titleImage' in jsondata.keys():
                        cover_img = jsondata['titleImage']
                        if len(cover_img) > 0:
                            imgsgot += 1
                            handler(cover_img, ZH_IMGTYPE_ARTICLE_COVER, objid)
                    soup = bs4.BeautifulSoup(jsondata['content'], HTML_PARSER)
                    # NOTE(review): these inner loops rebind x (the outer
                    # scoreDoc); harmless since x is re-read next iteration.
                    for x in soup.select('img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_ARTICLE, objid)
                elif doctype == 'topic_data':
                    # Topic hot page: topic avatar image.
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/topic/{0}/hot'.
                                format(objid))), HTML_PARSER)
                    topic_img = soup.select(
                        '.zu-main-content .topic-avatar .zm-entry-head-avatar-link img'
                    )[0]['src']
                    imgsgot += 1
                    handler(topic_img, ZH_IMGTYPE_TOPIC_ICON, objid)
                elif doctype == 'answer_comments' and realdoc['start'] == '0':
                    # Only the first comment page (start == '0') per answer.
                    obj, q = zh_iatd.query_object(answer_content_searcher,
                                                  objid, zh_pganlz.answer)
                    for x in obj.data.text.as_soup().select('img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_ANSWER, objid)
                elif doctype == 'question_data':
                    # Question page: every image in the question detail.
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/question/{0}'.
                                format(objid))), HTML_PARSER)
                    for x in soup.select('#zh-question-detail img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_QUESTION, objid)
                else:
                    # Unknown doc type: log the bare line and move on.
                    logger.write('\n')
                    continue
                logger.write(' ({0}, +{1})\n'.format(doctype, imgsgot))
                if _stop:
                    break
                # Throttle scraping between documents.
                time.sleep(3)
            except Exception as e:
                # Best-effort crawler: log the traceback and continue.
                logger.write('\n## ERROR ################################\n')
                logger.write(traceback.format_exc())
        if len(res.scoreDocs) > 0:
            tmpbk = res.scoreDocs[-1]
        res = searcher.searchAfter(tmpbk, query, 100, sort)
    print 'stopped'
    _stopped = True
Пример #23
0
 def testCollectorTransparentlyDelegatesToNextCollector(self):
     """A lone document passes through the dedup filter untouched."""
     self._addDocument("urn:1", 2)
     top = TopScoreDocSuperCollector(100, True)
     dedup = DeDupFilterSuperCollector("__isformatof__", "__sort__", top)
     self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)
     self.assertEquals(1, top.topDocs(0).totalHits)
Пример #24
0
 def _collectKeys(self, filter, keyName, query):
     """Collect the keys named keyName for docs matching query
     (all docs when query is None), restricted by filter."""
     if self._multithreaded:
         collector = KeySuperCollector(keyName)
     else:
         collector = KeyCollector(keyName)
     self.search(query=query or MatchAllDocsQuery(), filterQuery=filter,
                 collector=collector)
     return collector.getCollectedKeys()
Пример #25
0
 def testSilentyYieldsWrongResultWhenFieldNameDoesNotMatch(self):
     """With a non-existent dedup field the document still comes through."""
     self._addDocument("urn:1", 2)
     top = TopScoreDocSuperCollector(100, True)
     dedup = DeDupFilterSuperCollector("__wrong_field__", "__sort__", top)
     self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)
     self.assertEquals(1, top.topDocs(0).totalHits)
Пример #26
0
 def testMatchAllQuery(self):
     """A bare '*' converts to a MatchAllDocsQuery."""
     expected = MatchAllDocsQuery()
     self.assertConversion(expected, '*')