Example #1
    def query_parser(self, default_fields, idx_name=LATEST_REVS):
        """
        Build a query parser for a list of default fields.
        """
        schema = self.schemas[idx_name]
        if len(default_fields) > 1:
            qp = MultifieldParser(default_fields, schema=schema)
        elif len(default_fields) == 1:
            qp = QueryParser(default_fields[0], schema=schema)
        else:
            raise ValueError("default_fields list must at least contain one field name")
        qp.add_plugin(RegexPlugin())

        def userid_pseudo_field_factory(fieldname):
            """generate a translator function, that searches for the userid
               in the given fieldname when provided with the username
            """
            def userid_pseudo_field(node):
                username = node.text
                users = user.search_users(**{NAME_EXACT: username})
                if users:
                    userid = users[0].meta[ITEMID]
                    node = WordNode(userid)
                    node.set_fieldname(fieldname)
                    return node
                return node
            return userid_pseudo_field
        qp.add_plugin(PseudoFieldPlugin(dict(
            # username:JoeDoe searches for revisions modified by JoeDoe
            username=userid_pseudo_field_factory(USERID),
            # assigned:JoeDoe searches for tickets assigned to JoeDoe
            assigned=userid_pseudo_field_factory(ASSIGNED_TO),
        )))
        return qp
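For reference, a minimal self-contained sketch of the PseudoFieldPlugin technique used above; the in-memory user table and the "1234abcd" item id are made up and stand in for the user.search_users() lookup:

from whoosh.fields import Schema, ID, TEXT
from whoosh.qparser import PseudoFieldPlugin, QueryParser
from whoosh.qparser.syntax import WordNode

schema = Schema(userid=ID, content=TEXT)

def username_to_userid(node):
    # hypothetical lookup; the original resolves names via user.search_users()
    fake_users = {u"JoeDoe": u"1234abcd"}
    if node.text in fake_users:
        node = WordNode(fake_users[node.text])
        node.set_fieldname("userid")
    return node

qp = QueryParser("content", schema=schema)
qp.add_plugin(PseudoFieldPlugin({"username": username_to_userid}))
print(qp.parse(u"username:JoeDoe"))  # becomes a term on the userid field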
Example #2
def query_search(indexdir, queries, n=10, function='BM25F'):
    ix = index.open_dir(indexdir)
    search_fields = ['resname', 'categories', 'address', 'city',
                     'state']  # search fields
    og = qparser.OrGroup.factory(0.9)
    qp = MultifieldParser(search_fields,
                          ix.schema,
                          termclass=query.Variations,
                          group=og)
    qp.add_plugin(DateParserPlugin(free=True))
    q = qp.parse(queries)
    result_index = []
    if function == 'BM25F':
        with ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2)) as s:
            rates = sorting.FieldFacet('rating', reverse=True)
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=n, sortedby=[scores, rates])
            k = min(len(results), n)
            for i in range(k):
                result_index.append(int(results[i]['ID']))
    if function == 'TF_IDF':
        with ix.searcher(weighting=scoring.TF_IDF()) as s:
            rates = sorting.FieldFacet('rating', reverse=True)
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=n, sortedby=[scores, rates])
            k = min(len(results), n)
            for i in range(k):
                result_index.append(int(results[i]['ID']))
    return result_index
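A minimal, self-contained sketch of the sorting used above (relevance score first, then rating descending), using an in-memory index with invented restaurant data instead of the original index directory:

from whoosh import scoring, sorting
from whoosh.fields import ID, NUMERIC, Schema, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import MultifieldParser, OrGroup

schema = Schema(ID=ID(stored=True), resname=TEXT, city=TEXT,
                rating=NUMERIC(float, stored=True, sortable=True))
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(ID=u"1", resname=u"Thai Palace", city=u"Phoenix", rating=4.5)
    w.add_document(ID=u"2", resname=u"Palace Diner", city=u"Phoenix", rating=3.0)

qp = MultifieldParser(["resname", "city"], ix.schema, group=OrGroup.factory(0.9))
q = qp.parse(u"palace phoenix")
with ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2)) as s:
    facets = [sorting.ScoreFacet(), sorting.FieldFacet("rating", reverse=True)]
    for hit in s.search(q, limit=10, sortedby=facets):
        print(hit["ID"], hit["rating"])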
Example #3
def searchPage(keyword, curPage=1, pagelen=10):
    with ix.searcher() as searcher:
        # res=dict()
        # parser = QueryParser('content', schema=ix.schema)
        hf = HtmlFormatter(tagname="code", classname="match", termclass="term")
        fragmenter = WholeFragmenter(charlimit=None)
        parser = MultifieldParser(["title", "content", 'createAt'],
                                  schema=ix.schema)
        parser.add_plugin(DateParserPlugin())
        q = parser.parse(keyword)
        page = searcher.search_page(q, curPage, pagelen)  #,terms=True
        page.results.fragmenter = fragmenter
        #page.results.fragmenter.charlimit=None
        page.results.formatter = hf
        # terms = page.results.matched_terms()
        # key=[ e for e in terms ][0][1].decode('UTF-8')
        resPage = dict(pagenum=page.pagenum,
                       pagecount=page.pagecount,
                       total=page.total,
                       posts=[])
        for hint in page:
            tmp = dict()
            tmp['title'] = hint.highlights("title", minscore=0)
            tmp['author'] = hint["author"]
            tmp['location'] = hint["location"].replace(os.sep,
                                                       '/').replace('//', '/')
            if tmp['location'].startswith('/'):
                tmp['location'] = tmp['location'][1:]
            tmp['summary'] = hint.highlights(
                "summary", minscore=0
            )  #hint["content"].replace(key,"<code>%s</code>" % key)

            resPage['posts'].append(tmp)
        return resPage
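As a small illustration of the formatter/fragmenter pair configured above, the standalone highlight() helper in whoosh.highlight can be fed a text, a set of matched terms, an analyzer, and the same WholeFragmenter/HtmlFormatter objects; the sample text and term below are made up:

from whoosh.analysis import StandardAnalyzer
from whoosh.highlight import HtmlFormatter, WholeFragmenter, highlight

hf = HtmlFormatter(tagname="code", classname="match", termclass="term")
text = u"whoosh highlighting wraps the matched terms in tags"
terms = frozenset([u"highlighting"])
# returns the whole text with "highlighting" wrapped in a <code class="..."> tag
print(highlight(text, terms, StandardAnalyzer(), WholeFragmenter(), hf))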
Example #4
def indexquery(name, www):
    if name is None:
        return []
    #print("Name: %s" % name)
    ix = index.open_dir("/var/www/restnames/index")
    qp = MultifieldParser([
        "commonname", "database", "tags", "name", "name_part", "country",
        "project", "url"
    ],
                          schema=ix.schema,
                          termclass=FuzzyTerm)
    qp.add_plugin(qparser.FuzzyTermPlugin())
    q = qp.parse(name)
    #q = Every()
    tempvar = []
    with ix.searcher() as searcher:
        results = searcher.search(q, limit=None)
        for hit in results:
            tempvar.append({
                'name': hit["name"],
                'commonname': hit["commonname"],
                'url': hit["url"]
            })
    if not www:
        return tempvar
    else:
        response = Response(
            render_template("searchresults.html", resultlist=tempvar))
        response.headers['content-type'] = 'text/html'
        return response
Example #5
    def getSpecialCasesResults(self, value, translation):
        """Takes in a query and compares it to hard-coded special cases.
        The special cases are for the "Miracle Letters"

        :param translation: The requested translation type
        :type translation: str
        :return: A list of ayah matches if there is a match, otherwise returns None
        :rtype: list, None
        """
        matchingAyahList = []
        for case in SPECIAL_CASES:
            if case[0] == value:
                value = case[1]
                matchingAyahList = case[2]

        if len(matchingAyahList) > 0:
            allowedResults = []
            for matchingAyah in matchingAyahList:
                allowedResults.append(
                    "surah_num:" + str(matchingAyah[0]) + " AND ayah_num:" + str(
                            matchingAyah[1]))
            parser = MultifieldParser(["surah_num", "ayah_num"], self._ix.schema)
            parser.remove_plugin_class(PhrasePlugin)
            parser.add_plugin(SequencePlugin())
            query = parser.parse(" OR ".join(allowedResults))
            with self._ix.searcher() as searcher:
                results = searcher.search(query, limit=7)
                return self._getResponseObjectFromParams(
                        value,
                        self._getMatchesFromResults(results, translation),
                        [],
                        []
                )
        else:
            return None
Example #6
	def search(self, queryEntered, page):
		title    = list()
		plot     = list()
		poster   = list()
		year     = list()
		director = list()
		genre    = list()
		actors   = list()
		tomato_score = list()

 		# JY for the sake of demonstrating ranking weight, not going to affect search much visibly. 
		#mw=MultiWeighting(BM25F(), tomato_score=FunctionWeighting(custom_weight)) # plot=BM25F(B=0.75, plot_B=1.0, K1=2.0), actors=BM25F(B=0.75, actors_B=1.0, K1=1.5), director=TF_IDF()  )
		with self.indexer.searcher(weighting=BM25F()) as search: 
			parser = MultifieldParser(['title', 'plot','actors', 'director', 'genre'], schema=self.indexer.schema, termclass=FuzzyTerm) #
			parser.add_plugin(plugins.FuzzyTermPlugin())
			parser.add_plugin(plugins.SequencePlugin())
			query = parser.parse(queryEntered)
			results = search.search_page(query, page, 20, sortedby = {'tomato_score'}, reverse=True) # 'tomato_score', 'year'

			for x in results:
				title.append(x['title'])
				plot.append(x['plot'])
				poster.append(x['poster'])
				tomato_score.append(x['tomato_score'])
				year.append(x['year'])
				director.append(x['director'])
				actors.append(x['actors'])
				genre.append(x['genre'])

		return title, plot, poster, tomato_score, year, actors, director, genre, results.pagecount if results.pagecount < 23 else 23 
Example #7
def generic(idx, qs=None, q=None, limit=5, parser=None, page=1):
    if qs is q is None:
        raise ValueError('cannot have a null querystring and query')

    if parser is None:
        parser = MultifieldParser(
                ['title', 'keywords', 'summary', 'content', 'author'], idx.schema, group=OrGroup)

    # add better date parsing support
    parser.add_plugin(DateParserPlugin())
    parser.remove_plugin_class(WildcardPlugin)

    with idx.searcher() as search:
        # generate the Query object
        if qs:
            query = parser.parse(qs)
        else:
            query = q

        facet = MultiFacet()
        facet.add_score()
        facet.add_field('modified', reverse=True)
        facet.add_field('title')

        results = search.search_page(query, pagenum=page, sortedby=facet, pagelen=limit)
        res = clean_results(idx, results, query)

        # pagination attributes on `search_page` method
        res.page_number = results.pagenum   # current page number
        res.page_total = results.pagecount  # total pages in results
        res.offset = results.offset         # first result of current page
        res.pagelen = results.pagelen       # the number of max results per page

    return res
Example #8
File: viewer.py, Project: jdemaris/digipal
def get_whoosh_parser(index):
    from whoosh.qparser import MultifieldParser, GtLtPlugin

    # TODO: only active columns
    term_fields = ['content', 'unitid']
    parser = MultifieldParser(term_fields, index.schema)
    parser.add_plugin(GtLtPlugin)
    return parser
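A minimal sketch of what GtLtPlugin adds, with a made-up two-field schema: '>', '>=', '<' and '<=' after a field prefix are rewritten into range queries:

from whoosh.fields import NUMERIC, Schema, TEXT
from whoosh.qparser import GtLtPlugin, QueryParser

schema = Schema(content=TEXT, unitid=NUMERIC(int))
parser = QueryParser("content", schema)
parser.add_plugin(GtLtPlugin())
# parses into an AND of a term on content and a numeric range on unitid
print(parser.parse(u"charter unitid:>=100"))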
Example #9
File: viewer.py, Project: kcl-ddh/digipal
def get_whoosh_parser(index):
    from whoosh.qparser import MultifieldParser, GtLtPlugin

    # TODO: only active columns
    term_fields = ['content', 'unitid']
    parser = MultifieldParser(term_fields, index.schema)
    parser.add_plugin(GtLtPlugin)
    return parser
Example #10
 def _create_parser(self, context):
     parser = MultifieldParser(self.field_boosts.keys(),
                               WhooshBackend.SCHEMA,
                               fieldboosts=self.field_boosts)
     parser.add_plugin(
         MetaKeywordPlugin(meta_keyword_parsers=self.meta_keyword_parsers,
                           context=context))
     return parser
Example #11
File: core.py, Project: STguerin/WhooshSQL
    def _query_keys(self, query, limit=None, plugin=None):
        parser = MultifieldParser(self.subscription.table.__searchable__, self.subscription.schema)
        if plugin:
            parser.add_plugin(plugin)

        pk = self.subscription.primary_key
        results = self.subscription.index.searcher().search(parser.parse(query), limit=limit)
        keys = [x[pk.name] for x in results]
        return keys
Example #12
def search_index(index_dir, schema, attributes, id_name, query):
    ix = index.open_dir(dir + "/" + index_dir)
    mqp = MultifieldParser(attributes, schema=schema, group=OrGroup)
    mqp.add_plugin(FuzzyTermPlugin)
    q = mqp.parse("*%s~3/2*" % (query))

    with ix.searcher() as s:
        results = s.search(q)
        return [r[id_name] for r in results]
Example #13
def person_query_search(indexdir, queries, user_id, E, n=10, function='BM25F'):
    prediction = user_cf(E, user_id, 3)
    ix = index.open_dir(indexdir)
    search_fields = ['resname', 'categories', 'address', 'city',
                     'state']  # search fields
    og = qparser.OrGroup.factory(0.9)
    qp = MultifieldParser(search_fields,
                          ix.schema,
                          termclass=query.Variations,
                          group=og)
    qp.add_plugin(DateParserPlugin(free=True))
    q = qp.parse(queries)
    result_index = []
    if function == 'BM25F':
        # with ix.searcher(weighting=scoring.BM25F(B=0.75, resname_B = 1.0, categories_B = 0.8, K1=1.2)) as s:
        # add weight for the resname and the categories_B
        with ix.searcher(weighting=scoring.BM25F(B=0.75, K1=1.2)) as s:
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=None, sortedby=[scores])
            m = len(results)
            if m != 0:
                relevance = np.zeros(m)
                expected = np.zeros(m)
                for i in range(m):
                    relevance[i] = -results[i].score
                relevance = (relevance - relevance.min()) / (relevance.max() -
                                                             relevance.min())
                # normalized score from 0 to 1
                for i in range(m):
                    expected[i] = relevance[i] * prediction[int(
                        results[i]['ID'])]
                indorder = np.argsort(expected)
                k = min(m, n)
                for i in range(k):
                    result_index.append(int(results[indorder[-1 - i]]['ID']))
    if function == 'TF_IDF':
        with ix.searcher(weighting=scoring.TF_IDF()) as s:
            scores = sorting.ScoreFacet()
            results = s.search(q, limit=None, sortedby=[scores])
            m = len(results)
            if m != 0:
                relevance = np.zeros(m)
                expected = np.zeros(m)
                for i in range(m):
                    relevance[i] = -results[i].score
                relevance = (relevance - relevance.min()) / (
                    relevance.max() - relevance.min()
                )  # normalized score from 0 to 1
                for i in range(m):
                    expected[i] = relevance[i] * prediction[int(
                        results[i]['ID'])]
                indorder = np.argsort(expected)
                k = min(m, n)
                for i in range(k):
                    result_index.append(int(results[indorder[-1 - i]]['ID']))
    return result_index
Example #14
def answer_query(query):
    with main_index.searcher() as searcher:
        parser = MultifieldParser(['title', 'summary'], main_index.schema, fieldboosts={'title': 5.0, 'summary': 0.2})
        parser.add_plugin(FuzzyTermPlugin())
        # tilde adds fuzzy parsing for 1 character and /1 requires the first letter to match
        query = parser.parse(unicode(query) + '~/1') 
        
        results = searcher.search(query, limit=100)
        tags = [r['tag'] for r in results]
    return tags
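A self-contained sketch of the fuzzy syntax mentioned in the comment above (the tilde allows edits, and the /1 suffix pins the first letter), using a throwaway in-memory index with invented data:

from whoosh.fields import ID, Schema, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import FuzzyTermPlugin, QueryParser

schema = Schema(tag=ID(stored=True), title=TEXT)
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(tag=u"python", title=u"python tutorials")

parser = QueryParser("title", ix.schema)
parser.add_plugin(FuzzyTermPlugin())
with ix.searcher() as s:
    hits = s.search(parser.parse(u"pythn~1/1"))  # one edit allowed, first letter must match
    print([hit["tag"] for hit in hits])  # -> ['python']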
Example #15
 def render_GET(self, request):
     section_path = '/'.join(request.postpath).strip('/')
     if not section_path:
         defer.returnValue(json.dumps({'status': 'error', 'message': 'unable to search root'}))
     
     section_name = request.postpath[0]
     
     ix = self._get_index(section_path)
     if not ix:
         defer.returnValue(json.dumps({'status': 'error', 'message': 'unknown index for %s' % section_path}))
     
     schema_settings = self._get_schema_settings(section_path)
     schema = schema_settings['schema']
     
     if 'schema' in request.args:
         if section_path in self.currently_indexing:
             yield self.currently_indexing[section_path]
         
         field_choices = schema_settings.get('field_choices', {})
         fields = {}
         
         for field in set(schema.names()):
             if isinstance(schema[field], KEYWORD) and field in field_choices:
                 fields[field] = sorted(x for x in field_choices[field] if x)
         
         defer.returnValue(json.dumps({'status': 'ok', 'schema': fields, 'type': schema_settings['type']}))
     
     if 'q' not in request.args:
         defer.returnValue(json.dumps({'status': 'error', 'message': 'missing q argument in url'}))
     q = unicode(request.args['q'][0])
     
     parser = MultifieldParser(['search_field'], schema=schema)
     parser.add_plugin(GtLtPlugin())
     query = parser.parse(q)
     
     with ix.searcher() as searcher:
         results = yield threads.deferToThread(searcher.search, query, limit=10000)
         #corrected = searcher.correct_query(query, q) # jesus this is bad for titles
         results = [x['linkitem'] for x in results]
     
     section = settings.SECTIONS[section_name]
     rootfolder = RootFolder(parent_path='', name='Search result for: %s' % q, urlname=self.name, date=0)
     rootfolder['content_type'] = section.levels[0].content_type
     
     for result in results:
         rootfolder.add_item(result)
     
     #if corrected.query != query:
     #    retval['suggestion'] = {
     #        'rel': 'suggested_query',
     #        'href': urlparse.urljoin(settings.BASE_URL, '/search/%s' % urllib.quote(section_path)) + '?%s' % urllib.urlencode({'q': corrected.string}),
     #        'suggested_query': corrected.string,
     #    }
     
     defer.returnValue(rootfolder.serialize())
Example #16
 def _create_parser(self, context):
     parser = MultifieldParser(
         self.field_boosts.keys(),
         WhooshBackend.SCHEMA,
         fieldboosts=self.field_boosts
     )
     parser.add_plugin(
         MetaKeywordPlugin(meta_keyword_parsers=self.meta_keyword_parsers,
                           context=context)
     )
     return parser
Example #17
File: search.py, Project: oii/ogre
    def query(self, s=None, is_curated=True, is_fiction=True, pagenum=1, allpages=False):
        '''
        Search for books using whoosh, or return first page from all
        '''
        if self.whoosh is None:
            return

        if not s:
            # default to list all authors
            query = Every('author')
        else:
            # create a search by author and title
            qp = MultifieldParser(['author', 'title'], self.whoosh.schema, group=OrGroup)

            # fuzzy query only if wildcard not present
            if '*' not in s:
                qp.add_plugin(FuzzyTermPlugin())

                # setup search terms for fuzzy match
                fuzzy_terms = []
                for w in s.split():
                    fuzzy_terms.append('{}~'.format(w))
                s = ' '.join(fuzzy_terms)

            # parse the search terms
            query = qp.parse(s)

        # only filter is_fiction / is_curated when true
        filters = []
        if is_curated is True:
            filters.append(Term('is_curated', is_curated))
        if is_fiction is True:
            filters.append(Term('is_fiction', is_fiction))
        qfilter = And(filters)

        with self.whoosh.searcher() as searcher:
            pagecount = None

            if allpages:
                # special search returning all pages upto pagenum
                results = searcher.search(query, filter=qfilter, limit=(self.pagelen * pagenum))
            else:
                # paginated search for specific page, or to feed infinite scroll
                results = searcher.search_page(query, int(pagenum), filter=qfilter, pagelen=self.pagelen)
                pagecount = results.pagecount

            output = [item.fields() for item in results]

            if pagecount is None:
                pagecount = int(math.ceil(float(len(output)) / self.pagelen))

        return {'results': output, 'pagecount': pagecount}
Example #18
    def _query(self):
        q_str = self.query_params['query']
        qp = MultifieldParser(
            ["content", "title", "correspondent", "tag", "type"],
            self.searcher.ixreader.schema)
        qp.add_plugin(DateParserPlugin())
        q = qp.parse(q_str)

        corrected_query = None
        corrected = self.searcher.correct_query(q, q_str)
        if corrected.query != q:
            corrected_query = corrected.string

        return q, corrected_query
Example #19
def inicia():
    pth = os.path.abspath(os.path.dirname(os.path.realpath(__file__)) + "/indiceJuego")
    if not os.path.exists(pth):
        os.mkdir(pth)
        esquemaJuego = Schema(titulo=KEYWORD(stored=True), descripcion=TEXT,
                              categorias=KEYWORD(stored=True), plataformas=KEYWORD(stored=True),
                              precio=NUMERIC(stored=True))
        indiceJuego = create_in(pth, esquemaJuego)
    else:
        indiceJuego = open_dir(pth)
 
    parser = MultifieldParser(["titulo"], schema=indiceJuego.schema)
    parser.add_plugin(FuzzyTermPlugin())
 
    return indiceJuego,parser
Example #20
class DBworldSearcher:

    def __init__(self, indexdir, fieldlist=["subject", "content"]):
        self.indexdir = indexdir
        ix = open_dir(indexdir)

        #self.parser = QueryParser("subject", self.ix.schema)
        self.parser = MultifieldParser(fieldlist, ix.schema)
        self.parser.add_plugin(DateParserPlugin())
        self.searcher = ix.searcher()

    def search(self, querytext, limit):
        myquery = self.parser.parse(querytext)
        results = self.searcher.search(myquery, limit=limit)
        return results
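For reference, a minimal sketch of the DateParserPlugin behaviour relied on above, with a made-up schema: once the plugin is added, DATETIME fields accept human-friendly date expressions and bracketed ranges:

from whoosh.fields import DATETIME, Schema, TEXT
from whoosh.qparser import MultifieldParser
from whoosh.qparser.dateparse import DateParserPlugin

schema = Schema(subject=TEXT, content=TEXT, date=DATETIME)
parser = MultifieldParser(["subject", "content"], schema)
parser.add_plugin(DateParserPlugin())
# "date:yesterday" or "date:[-1yr to now]" become date range queries
print(parser.parse(u"deadline date:[-1yr to now]"))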
Example #21
def search_in_index(search_kw, index):
    parser = MultifieldParser(["content", "title"], index.schema)
    parser.add_plugin(FuzzyTermPlugin())
    searcher = index.searcher()
    to_parse = ' '.join([i + '~0' for i in search_kw.split(' ')])
    myquery = parser.parse(to_parse)
    r = searcher.search(myquery)
    results = []
    for res in r:
        results.append(res['path'])
    corrector = searcher.corrector("content")
    suggestions = []
    for kw in search_kw.split(' '):
        suggestions.append(corrector.suggest(kw))
    searcher.close()
    return results, suggestions
Example #22
def query_page(ix, page, querystring, more_like_doc_id, more_like_doc_content):
    searcher = ix.searcher()
    try:
        if querystring:
            qp = MultifieldParser(
                ["content", "title", "correspondent", "tag", "type"],
                ix.schema)
            qp.add_plugin(DateParserPlugin())
            str_q = qp.parse(querystring)
            corrected = searcher.correct_query(str_q, querystring)
        else:
            str_q = None
            corrected = None

        if more_like_doc_id:
            docnum = searcher.document_number(id=more_like_doc_id)
            kts = searcher.key_terms_from_text('content',
                                               more_like_doc_content,
                                               numterms=20,
                                               model=classify.Bo1Model,
                                               normalize=False)
            more_like_q = query.Or([
                query.Term('content', word, boost=weight)
                for word, weight in kts
            ])
            result_page = searcher.search_page(more_like_q,
                                               page,
                                               filter=str_q,
                                               mask={docnum})
        elif str_q:
            result_page = searcher.search_page(str_q, page)
        else:
            raise ValueError(
                "Either querystring or more_like_doc_id is required.")

        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()

        if corrected and corrected.query != str_q:
            corrected_query = corrected.string
        else:
            corrected_query = None

        yield result_page, corrected_query
    finally:
        searcher.close()
Example #23
    def whoosh_search(self, query, searcher, index, info):
        ret = ''
        # run a whoosh search and display the hits
        # query applies to all fields in the schema
        # special query: ALL, ANY
        #
        limit = int(self.options['limit'] or '1000000')
        if query in ['ALL', 'ANY']:
            from whoosh.query.qcore import Every
            q = Every()
        else:
            from whoosh.qparser import MultifieldParser, GtLtPlugin

            # TODO: only active columns
            term_fields = [item[0] for item in index.schema.items()]
            parser = MultifieldParser(term_fields, index.schema)
            parser.add_plugin(GtLtPlugin)

            q = parser.parse(u'%s' % query)

        if query in ['ANY']:
            limit = 1

        afield = self.options['field']
        res = searcher.search(q, limit=limit)
        vs = {}
        for hit in res:
            if afield:
                # display only the unique value in the requested field
                vs[hit[afield]] = vs.get(hit[afield], 0) + 1
            else:
                # display all field, value in this record
                for k, v in hit.iteritems():
                    ret += '\t%-20s %s\n' % (k, repr(v)[0:30])
                ret += '\t' + ('-' * 20) + '\n'

        if vs:
            for v, c in vs.iteritems():
                ret += '\t%6s x %s\n' % (c, repr(v))

        info['results'] = ret
        info['result_size'] = len(res)

        ret += '\n\n%s documents found' % len(res)

        return ret
Example #24
    def whoosh_search(self, query, searcher, index, info):
        ret = ''
        # run a whoosh search and display the hits
        # query applies to all fields in the schema
        # special query: ALL, ANY
        #
        limit = int(self.options['limit'] or '1000000')
        if query in ['ALL', 'ANY']:
            from whoosh.query.qcore import Every
            q = Every()
        else:
            from whoosh.qparser import MultifieldParser, GtLtPlugin

            # TODO: only active columns
            term_fields = [item[0] for item in index.schema.items()]
            parser = MultifieldParser(term_fields, index.schema)
            parser.add_plugin(GtLtPlugin)

            q = parser.parse(u'%s' % query)

        if query in ['ANY']:
            limit = 1

        afield = self.options['field']
        res = searcher.search(q, limit=limit)
        vs = {}
        for hit in res:
            if afield:
                # display only the unique value in the requested field
                vs[hit[afield]] = vs.get(hit[afield], 0) + 1
            else:
                # display all field, value in this record
                for k, v in hit.iteritems():
                    ret += '\t%-20s %s\n' % (k, repr(v)[0:30])
                ret += '\t' + ('-' * 20) + '\n'

        if vs:
            for v, c in vs.iteritems():
                ret += '\t%6s x %s\n' % (c, repr(v))

        info['results'] = ret
        info['result_size'] = len(res)

        ret += '\n\n%s documents found' % len(res)

        return ret
Example #25
    def search(self, query_list, fields=None):

        with self.ix.searcher() as searcher:

            query_list2 = []
            for qq in query_list:
                if qq=='AND' or qq=='OR':
                    query_list2.append(qq)
                else:
                    query_list2.append(qq.lower())
            query_string = " ".join(query_list2)

            query = None
            if ":" in query_string:
                # If the user DOES specify a field,
                # setting the fields determines what fields
                # are searched with the free terms (no field)
                fields = ['title', 'content','owner_name','owner_email','github_user']
                query = MultifieldParser(fields, schema=self.ix.schema)
                est = pytz.timezone('America/New_York')
                query.add_plugin(DateParserPlugin(free=True, basedate=est.localize(datetime.utcnow())))
                query.add_plugin(GtLtPlugin())
                try:
                    query = query.parse(query_string)
                except:
                    # Because the DateParser plugin is an idiot
                    query_string2 = re.sub(r':(\w+)', r":'\g<1>'", query_string)
                    try:
                        query = query.parse(query_string2)
                    except:
                        print("parsing query %s failed"%(query_string))
                        print("parsing query %s also failed"%(query_string2))
                        query = query.parse('')

            else:
                # If the user does not specify a field,
                # these are the fields that are actually searched
                fields = ['url','title', 'content','owner_name','owner_email','github_user']
                query = MultifieldParser(fields, schema=self.ix.schema)
                est = pytz.timezone('America/New_York')
                query.add_plugin(DateParserPlugin(free=True, basedate=est.localize(datetime.utcnow())))
                query.add_plugin(GtLtPlugin())
                try:
                    query = query.parse(query_string)
                except:
                    print("parsing query %s failed"%(query_string))
                    query = query.parse('')
            parsed_query = "%s" % query
            print("query: %s" % parsed_query)
            results = searcher.search(query, terms=False, scored=True, groupedby="kind")
            search_result = self.create_search_result(results)

        return parsed_query, search_result
Example #26
 def get(self, search):
     ix = open_dir("index")
     with ix.searcher() as searcher:
         qp = MultifieldParser(['title', 'content', 'url'], ix.schema)
         qp.add_plugin(DateParserPlugin())
         query = qp.parse(search)
         results = searcher.search(query)
         self.write(
             tornado.escape.json_encode([{
                 'title':
                 r.get('title'),
                 'url':
                 r.get('url'),
                 'date':
                 r.get('date').strftime("%A, %d. %B %Y %I:%M%p"),
                 'hash':
                 r.get('hash', 'blank')
             } for r in results[:10]]))
         self.set_header('Content-Type', 'application/json')
Example #27
 def search_for_track(self, querystring):
     if len(querystring) >= 3:
         with self.ix.searcher() as searcher:
             collector = searcher.collector(limit=20)
             tlc = TimeLimitCollector(collector, timelimit=1.4, use_alarm=False)
             parser = MultifieldParser(["artist", "album", "title"], self.ix.schema)
             parser.add_plugin(qparser.FuzzyTermPlugin())
             myquery = parser.parse(querystring)
             try:
                 searcher.search_with_collector(myquery, tlc)
                 if len(tlc.results()) == 0:
                     myquery = parser.parse(" ".join(word + "~2" for word in querystring.split()))
                     searcher.search_with_collector(myquery, tlc)
             except TimeLimit:
                 logging.info("Time Limit for query reached!")
             logging.debug("czas zapytania: ", collector.runtime)
             ret = [self.__tracks[int(result["id"])] for result in tlc.results()]
             return ret
     else:
         return []
Example #28
    def page(self, page, limit):
        with self.engine.index.searcher() as searcher:
            parser = MultifieldParser(
                self.engine.search_fields,
                schema = self.engine.index.schema,
            )
            parser.add_plugin(GtLtPlugin())
            parser.add_plugin(PhrasePlugin())
            parser.add_plugin(FieldsPlugin())
            #parser.remove_plugin_class(WildcardPlugin)
            #parser.add_plugin(WildcardPlugin())
            parser.add_plugin(PrefixPlugin())

            whoosh_query = parser.parse(self.query.toString(self.engine))
            #print "============" + str(whoosh_query)
            results = searcher.search_page(whoosh_query, page, limit, sortedby = self.order)
            self.rows = results.total

            _results = []

            doc_class = self.engine.database.document

            for result in results:
                doc = doc_class(data = {field: result.get(field, None) for field in self.engine.stored_fields}, restore = True)
                _results.append(doc)

        return _results
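A small sketch of the WildcardPlugin/PrefixPlugin swap hinted at in the commented-out lines above: removing WildcardPlugin and adding PrefixPlugin keeps cheap trailing-asterisk prefix queries without general wildcard parsing (the schema is made up):

from whoosh.fields import Schema, TEXT
from whoosh.qparser import PrefixPlugin, QueryParser, WildcardPlugin

schema = Schema(content=TEXT)
parser = QueryParser("content", schema)
parser.remove_plugin_class(WildcardPlugin)
parser.add_plugin(PrefixPlugin())
print(parser.parse(u"doc*"))  # a Prefix query on content rather than a Wildcard query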
Example #29
def query_page(ix, querystring, page):
    searcher = ix.searcher()
    try:
        qp = MultifieldParser(
            ["content", "title", "correspondent", "tag", "type"], ix.schema)
        qp.add_plugin(DateParserPlugin())

        q = qp.parse(querystring)
        result_page = searcher.search_page(q, page)
        result_page.results.fragmenter = highlight.ContextFragmenter(
            surround=50)
        result_page.results.formatter = JsonFormatter()

        corrected = searcher.correct_query(q, querystring)
        if corrected.query != q:
            corrected_query = corrected.string
        else:
            corrected_query = None

        yield result_page, corrected_query
    finally:
        searcher.close()
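A minimal, self-contained sketch of the "did you mean" flow used above: correct_query() checks the parsed query's terms against the index and returns a Correction whose .string holds the corrected query text (the document content here is invented):

from whoosh.fields import ID, Schema, TEXT
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

schema = Schema(id=ID(stored=True), content=TEXT(spelling=True))
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(id=u"1", content=u"invoice correspondent tax")

with ix.searcher() as s:
    qp = QueryParser("content", ix.schema)
    q = qp.parse(u"correspondant")
    corrected = s.correct_query(q, u"correspondant")
    if corrected.query != q:
        print(corrected.string)  # suggests "correspondent" when within edit distance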
Example #30
 def query_parser(self, default_fields, idx_name=LATEST_REVS):
     """
     Build a query parser for a list of default fields.
     """
     schema = self.schemas[idx_name]
     if len(default_fields) > 1:
         qp = MultifieldParser(default_fields, schema=schema)
     elif len(default_fields) == 1:
         qp = QueryParser(default_fields[0], schema=schema)
     else:
         raise ValueError("default_fields list must at least contain one field name")
     qp.add_plugin(RegexPlugin())
     def username_pseudo_field(node):
         username = node.text
         users = user.search_users(**{NAME_EXACT: username})
         if users:
             userid = users[0].meta['userid']
             node = WordNode(userid)
             node.set_fieldname("userid")
             return node
         return node
     qp.add_plugin(PseudoFieldPlugin({'username': username_pseudo_field}))
     return qp
Example #31
def search_in_index(search_kw, index):
    '''
    search_kw: what the user typed into the search bar
    index: the opened index (the ix object from the preceding code)

    The function returns a dictionary with the keys:
        - results: a list of dictionaries, one per search result, with the best
        result first. Each dictionary has two keys: 'title' with the document
        title, and 'path' with the path (to the text document, for now)
        - suggestions: a dictionary giving a possible spelling correction for
        each word the user entered. It remains to be decided how to combine the
        suggestions for the different words into complete suggestions
    '''
    # use a MultifieldParser to search both the title and the content
    parser = MultifieldParser(["content", "title"], index.schema)
    # add a fuzzy-matching plugin so the search can go beyond exact words
    parser.add_plugin(FuzzyTermPlugin())
    searcher = index.searcher()
    # rewrite the user query into the syntax understood by the fuzzy-matching plugin
    to_parse = ' '.join([i + '~1' for i in search_kw.split(' ')])
    myquery = parser.parse(to_parse)
    # collect the results so the searcher can be closed afterwards
    r = searcher.search(myquery)
    results = []
    for res in r:
        results.append({'title': res['title'], 'path': res['path']})
    # set up the corrector and store what it suggests for each typed word
    corrector = searcher.corrector("content")
    suggestions = {}
    for kw in search_kw.split(' '):
        suggestions[kw] = corrector.suggest(kw)
    # close the searcher
    searcher.close()
    return {'results': results, 'suggestions': suggestions}
Example #32
 def search(self, queries, fuzzy = True, default_fields = [], max_results = None):
     if type(queries) != list:
         queries = [queries]
     if type(default_fields) != list:
         default_fields = [default_fields]
     if fuzzy and len(queries) == 1 and len(queries[0].split()) == 1 and ':' not in queries[0] and '*' not in queries[0]:
         queries = ['*%s*' % (queries[0])]
     for query in queries:
         if type(query) != unicode:
             query = query.decode('utf-8')
         log.msg('search query: %s' % (query))
         with self.ix.searcher() as searcher:
             parser = MultifieldParser(default_fields, self.ix.schema)
             parser.remove_plugin_class(plugins.WildcardPlugin)
             parser.add_plugin(WildcardPlugin)
             query = parser.parse(query)
             log.msg('search query parsed: %s' % (query))
             results = searcher.search(query, limit = None)
             count = 0
             for result in results:
                 yield result['oid']
                 count += 1
                 if max_results and count >= max_results:
                     break
Example #33
class ConceptSearcher:
    def __init__(self, tree_dict, tree_identity):
        self.ix = None
        self.parser = None
        self.id_ = tree_identity
        self._tree_dict = tree_dict
        self.get_schema()

    def search(self, query_string, limit=50, allowed_nodes: set = None):
        with self.ix.searcher() as searcher:
            query = self.parser.parse(query_string)

            if allowed_nodes is not None:
                allowed_nodes = {
                    doc_num
                    for doc, doc_num in zip(searcher.documents(),
                                            searcher.document_numbers())
                    if doc.get('fullname') in allowed_nodes
                }

            results = searcher.search(query, limit=limit, filter=allowed_nodes)
            return [r['fullname'] for r in results]

    def get_schema(self):
        schema_dir = os.path.join(cache_dir, self.id_)
        os.makedirs(schema_dir, exist_ok=True)

        if exists_in(schema_dir) and open_dir(schema_dir).doc_count() != 0:
            self.ix = open_dir(schema_dir)
            print('Existing index cache found. Loaded {} tree nodes. Hooray!'.
                  format(self.ix.doc_count()))

        else:
            print('No valid cache found. Building indexes...')
            now = time.time()
            self.__build_whoosh_index(schema_dir)
            print('Finished in {:.2f} seconds'.format(time.time() - now))

        self.parser = MultifieldParser(self.ix.schema.names(),
                                       schema=self.ix.schema)
        self.parser.add_plugin(FuzzyTermPlugin())

    def __build_whoosh_index(self, schema_dir):

        fields = dict(
            node=TEXT(),
            fullname=TEXT(stored=True),
            path=TEXT(),
            type=NGRAM(minsize=4),
            study=NGRAM(field_boost=10.0),
            name=NGRAMWORDS(minsize=3, field_boost=3.0),
            metadata=NGRAMWORDS(minsize=3),
        )
        schema = Schema(**fields)
        self.ix = create_in(schema_dir, schema)

        with self.ix.writer(procs=2, multisegment=True, limitmb=512) as writer:
            for key, value in self._tree_dict.items():
                writer.add_document(node=key.replace('\\',
                                                     ' ').replace('_', ' '),
                                    path=value.get('conceptPath'),
                                    fullname=key,
                                    type=value.get('type'),
                                    study=str(value.get('studyId')),
                                    name=str(value.get('name')),
                                    metadata=str(value.get('metadata')))
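A minimal sketch of the NGRAMWORDS idea used in the schema above: indexing character n-grams of each word lets short partial inputs match, which is what makes the tree-node search forgiving (the node name and value below are invented):

from whoosh.fields import ID, NGRAMWORDS, Schema
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import QueryParser

schema = Schema(fullname=ID(stored=True), name=NGRAMWORDS(minsize=3))
ix = RamStorage().create_index(schema)
with ix.writer() as w:
    w.add_document(fullname=u"\\Demographics\\Age", name=u"age at diagnosis")

with ix.searcher() as s:
    q = QueryParser("name", ix.schema).parse(u"diag")
    print([hit["fullname"] for hit in s.search(q)])  # the partial word still matches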
Example #34
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser.dateparse import DateParserPlugin

ix = open_dir("index")

with ix.searcher() as searcher:
    qp = MultifieldParser(['title', 'content', 'url'], ix.schema)
    qp.add_plugin(DateParserPlugin())
    query = qp.parse("qoqa")
    results = searcher.search(query, terms=True)
    for r in results:
        print(r.get('hash'))
        print(r.matched_terms())
        #print(r.highlights('title'))
Example #35
_string = sys.argv[1]
_mode = sys.argv[2]
normal = (_mode == "normal")

_distance = 0
if (normal is False):
    _distance = int(sys.argv[3])

with ix.searcher() as searcher:
    # og = qparser.OrGroup.factory(0.9)
    parser = MultifieldParser(["title", "sub_title", "author", "content"],
                              schema=ix.schema)
    # parser = qparser.QueryParser("content", ix.schema)
    parser.remove_plugin_class(qparser.PhrasePlugin)
    parser.add_plugin(qparser.SequencePlugin())

    if (normal):
        string = _string
        query = parser.parse(string)
    else:
        # proximity
        distance = _distance
        proximity_query = "\"" + _string + "\"" + '~' + str((1 + distance) * 3)
        query = parser.parse(proximity_query)

    # sys.stdout.buffer.write(query)
    sys.stdout.buffer.write(">>>>>>OUTPUT start<<<<<<".encode('utf-8'))
    results = searcher.search(query, limit=20)
    results.fragmenter.maxchars = 100
    # Show more context before and after
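A small self-contained sketch of the proximity setup above: with PhrasePlugin removed and SequencePlugin added, a quoted group followed by ~N parses as an ordered sequence whose terms may sit up to N positions apart (the query text is arbitrary):

from whoosh import qparser
from whoosh.fields import Schema, TEXT

schema = Schema(content=TEXT)
parser = qparser.QueryParser("content", schema)
parser.remove_plugin_class(qparser.PhrasePlugin)
parser.add_plugin(qparser.SequencePlugin())
print(parser.parse(u'"hello distant world"~3'))  # an ordered, slop-limited sequence query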
Example #36
class WhooshEngine(Engine):
    def __init__(self, config):
        self.schema = Schema(
            id=ID(unique=True),
            title=TEXT(stored=True, field_boost=3.0, analyzer=StandardAnalyzer() | NgramFilter(minsize=2, maxsize=3)),
            author=TEXT(stored=True),
            creation_date=DATETIME(stored=True),
            pages=STORED,
            content=TEXT(stored=True, analyzer=StandardAnalyzer(stoplist=None)),
            lang=TEXT(stored=True),
            size=STORED,
            tags=KEYWORD(stored=True, commas=True)
        )

        self.index_path = config['WHOOSH_INDEX']

        if not os.path.exists(self.index_path):
            os.mkdir(self.index_path)
            create_in(self.index_path, self.schema)

        self.indexer = open_dir(self.index_path)
        self.parser_content = MultifieldParser(["title", "content"], schema=self.schema)
        self.parser_content.add_plugin(DateParserPlugin())
        self.date_format = {
            'last_24h': u'-24h to now',
            'last_week': u'last week',
            'last_month_to_now': u'-1mo to now',
            'last_year_to_now': u"[-2yrs to now]"
        }

    def index_document(self, data):
        """
        Index a document
        :param data: set of data values per attributes to index
        :return: void
        """
        writer = self.indexer.writer()
        writer.add_document(
            id=unicode(data['Id']),
            author=data['Author'],
            title=data['Title'],
            creation_date=data['CreationDate'],
            content=unicode(data['Content'], 'utf-8'),
            pages=data['NumPages'],
            size=data['Size'],
            lang=data['Lang'],
            tags=unicode(','.join(data['Tags']), 'utf-8')
        )
        writer.commit()

    def update_document(self, id, data):
        """
        Update the document's attributes
        :param id: document's identifier
        :param data: set of values per attributes
        :return: void
        """
        writer = self.indexer.writer()
        data['id'] = unicode(id)
        writer.update_document(**data)
        writer.commit()

    def delete_document(self, id):
        """
        Delete a document by id
        :param id: document's identifier
        :return: void
        """
        self.indexer.delete_by_term('id', unicode(id))
        self.indexer.commit()

    def delete_index(self):
        """Delete de current index"""
        create_in(self.index_path, schema=self.schema)

    def search_ngram(self, args):
        """
        Get n-gram results: as the user types, return matching results
        :param args: the request args containing the query criteria (e.g. a title)
        :return data: a list of matching results
        """
        criteria = args['criteria']
        with self.indexer.searcher() as searcher:
            simple_parser = QueryParser("title", group=OrGroup, schema=self.schema)
            query = simple_parser.parse(criteria)
            response = searcher.search_page(query, pagenum=1)
            data = [result['title'] for result in response.results]
            return data

    def search_document(self, args):
        """
        Search all documents that match the query
        :param args:  params from request to query
        :return: query result with pagination
        """
        with self.indexer.searcher() as searcher:
            query = self.parser_content.parse(args['criteria'])
            filters = self.create_filters(args)

            response = searcher.search_page(query, pagenum=args['page'], filter=filters)

            return self.normalize_data(response)

    def create_filters(self, args):
        """
        :param args: params from request to filter
        :return: Search instance
        """
        allow_list = []
        if "creation_date" in args:
            # :creation date format ex: creation_date:[last week to now]
            date_query = 'creation_date:' + self.date_format[args["creation_date"]]
            allow_list.append(date_query)

        if "lang" in args:
            # :lang format ex: (lang:es OR lang:en)
            lang_list = ['lang:' + lang for lang in args['lang']]
            lang_query = ' OR '.join(lang_list)
            allow_list.append('(' + lang_query + ')')

        if "author" in args:
            # :author format ex: (author:Antonio)
            author_query = 'author:' + args['author']
            allow_list.append('(' + author_query + ')')

        if "tags" in args:
            # :tags format ex: (tag:'history' OR tag:'docker')
            tag_list = ['tags:' + tag for tag in args['tags']]
            tag_query = ' OR '.join(tag_list)
            allow_list.append('(' + tag_query + ')')
        # : all filters
        query_string = u' AND '.join(allow_list)
        return self.parser_content.parse(query_string) if query_string else None

    def normalize_data(self, response):
        """
        Normalize the response adding pagination
        :param response: the search response page from Whoosh
        :return: data normalized
        """
        data = {'items': [], 'id_list': []}
        response.results.fragmenter.surround = 80  #: summary length
        # page_result.results.fragmenter.maxchars = 300
        my_cf = highlight.SentenceFragmenter()
        # page_result.results.fragmenter = my_cf
        for result in response.results:
            # print result.title
            result_dict = dict(result)
            result_dict['summary'] = result.highlights("content", top=2)
            data['items'].append(result_dict)
            data['id_list'].append(int(result_dict['id']))

        data['total'] = response.total
        data['pages'] = response.pagecount
        data['page'] = response.pagenum
        return data

    def rebuild_index(self):
        shutil.rmtree(self.index_path)
        os.mkdir(self.index_path)
        create_in(self.index_path, schema=self.schema)
Example #37
# coding=utf-8
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser import FuzzyTermPlugin

idx_dir = 'lagou_idx'
ix = open_dir(idx_dir)
searcher = ix.searcher()

parser = MultifieldParser(["name", "desc"], schema=ix.schema)
parser.add_plugin(FuzzyTermPlugin())

# Query mixing OR, a fuzzy term (Pythn~2), and an explicit city field.
k = u'搜索 OR Pythn~2 city:上海'
q = parser.parse(k)

results = searcher.search_page(q, 1, pagelen=5)

print(u'{0} results found for keyword {1}, {2} returned: '.format(len(results), k, results.scored_length()))
for hit in results[:50]:
    print(hit['id'])
    print(hit['name'])
    # print(hit['city'])
    print(hit['com_name'])
    print('************')
Example #38
def main():
    """ The main loop for the program """
    g = Gui()
    ix = index.open_dir("indexdir")

    while True:
        event, values = g.window.read()
        g.window["_output_"]('')

        # close windows
        if event is None:
            break

        if event == '_SEARCH_' and values['TERM'] is not None:

            # the 'fieldboosts' parameter controls how much weight matches in each field carry
            qp = MultifieldParser(
                ["procTitle", "topics", "categories", "procContent"],
                termclass=Variations,
                schema=ix.schema,
                fieldboosts={
                    "procTitle": 1.5,
                    "categories": 1.3
                })
            qp.add_plugin(FuzzyTermPlugin())

            terms = str(values['TERM'])
            terms = terms.replace("title", "procTitle").replace("topic", "topics") \
                 .replace("category", "categories").replace("content", "procContent")

            # If the synonym option is enabled, rewrite the query to add synonyms, taking care
            # to pass boolean tokens through unchanged and to correctly carry over any field
            # prefixes the terms should be searched in.
            if values['syn_search']:
                with open("utils/wn_s.pl", "r") as f:
                    thesaurus = Thesaurus.from_file(f)
                termsWithSynonyms = []
                for term in terms.split(" "):
                    field = None
                    if ":" in term:
                        field = term.split(":")[0]
                        term = term.split(":")[1]
                    if term not in booleanTokens:
                        termSynonyms = thesaurus.synonyms(term)
                        if field is not None:
                            termSynonyms = [
                                f"{field}:{t}" for t in termSynonyms
                            ]
                            termSynonyms.append(f"{field}:{term}")
                        else:
                            termSynonyms.append(term)
                        termsWithSynonyms.append(" OR ".join(termSynonyms))
                    else:
                        termsWithSynonyms.append(term)
                terms = ' '.join(termsWithSynonyms)

            print("- Searching for >>> " + str(terms))

            # stem the query terms and append a tilde (fuzzy search) to the ones that actually changed
            words = terms.split(' ')
            stemmedWords = list()
            for word in words:
                stemmed = stem(word)
                if word != stemmed:
                    stemmedWords.append(stemmed + '~')
                else:
                    stemmedWords.append(stemmed)

            q = qp.parse(' '.join(stemmedWords))

            with ix.searcher() as searcher:
                if not values['syn_search']:
                    correction = searcher.correct_query(q=q,
                                                        qstring=terms,
                                                        maxdist=2)
                    if terms != correction.string:
                        print("- Did you mean >>> " + correction.string)
                results = searcher.search(q, terms=True)

                if not values['syn_search'] and results.is_empty():
                    print(
                        "- No relevant result has been found for query, trying corrected query"
                    )
                    results = searcher.search(qp.parse(correction.string))

                numb = 1
                if not results.is_empty():
                    for elem in results:
                        # print(elem)
                        print(
                            f"Result n.{numb} >>> Title: {str(elem['docTitle'])}\n\tScore: {str(elem.score)}\n"
                            f"\tLink to the page: {str(elem['pageUrl'])}\n")
                        numb += 1
                else:
                    print("- No relevant result has been found")
Example #39
File: core.py, Project: afreeorange/bock
class BockCore():
    def __init__(self, articles_path):
        """Attempt to initialize a folder with Markdown articles. If a git
        repo, create a search index and populate.

        Markdown Extension References
        * http://facelessuser.github.io/pymdown-extensions
        * https://pythonhosted.org/Markdown/extensions
        """
        self.article_repo = Repo(articles_path)
        self.articles_path = articles_path
        self.markdown_extensions = [
            'markdown.extensions.abbr',
            'markdown.extensions.attr_list',
            'markdown.extensions.def_list',
            'markdown.extensions.fenced_code',
            'markdown.extensions.footnotes',
            'markdown.extensions.tables',
            'markdown.extensions.smart_strong',
            'markdown.extensions.admonition',
            'markdown.extensions.codehilite',
            'markdown.extensions.headerid',
            'markdown.extensions.sane_lists',
            'markdown.extensions.smarty',
            'markdown.extensions.toc',
            'markdown.extensions.wikilinks',
            'pymdownx.betterem',
            'pymdownx.caret',
            'pymdownx.githubemoji',
            'pymdownx.headeranchor',
            'pymdownx.magiclink',
            'pymdownx.mark',
            'pymdownx.smartsymbols',
            'pymdownx.tasklist',
            'pymdownx.tilde',
            'pymdownx.critic',
        ]
        self.markdown_extensions_config = {
            'markdown.extensions.codehilite': {
                'css_class': 'code-highlight'
            }
        }
        self.__search_schema = Schema(
            title=ID(stored=True, unique=True),
            path=ID(stored=True),
            content=TEXT,
        )
        self.__search_parser = MultifieldParser(
            ['title', 'content'],
            schema=self.__search_schema,
        )
        self.__search_parser.add_plugin(FuzzyTermPlugin())
        self.__search_index = self.create_search_index()
        self.populate_search_index()

    # ------------------------ Article Functions ------------------------

    def markdown_to_html(self, text):
        """Converts a given Markdown string to HTML
        """
        return markdown.markdown(
            text=text,
            output_format='html5',
            extensions=self.markdown_extensions,
            extension_configs=self.markdown_extensions_config,
        )

    def raw_article(self, article_path):
        """Return the text contents of an article
        """
        with open(self.full_article_path(article_path)) as f:
            article_content = f.read()

        return article_content

    def processed_article(self, article_path):
        """Return the 'marked-down' HTML version of an article
        """
        return self.markdown_to_html(self.raw_article(article_path))

    def article_last_modified(self, article_path):
        """Return the last modified date of a given article in ISO8601 format
        """
        return str(
            arrow.get(
                os.stat(
                    self.full_article_path(article_path)
                ).st_mtime
            )
        )

    def article_last_modified_human(self, article_path):
        """Return the last modified date of a given article in a
        human-readable format
        """
        return arrow.get(
            self.article_last_modified(article_path)
        ).humanize()

    def is_article_modified(self, article_path):
        """Determine if the article is modified
        """
        if not os.path.isfile(self.full_article_path(article_path)):
            raise FileNotFoundError

        if article_path in self.list_of_uncommitted_articles:
            return True

        return False

    def get_article(self, article_path):
        """A convenience method that returns an object with a single
        article and associated metadata
        """
        return {
            'title': self.article_title(article_path),
            'html': self.processed_article(article_path),
            'raw': self.raw_article(article_path),
            'modified': self.article_last_modified(article_path),
            'modified_humanized': self.article_last_modified_human(
                article_path
            ),
            'uncommitted': self.is_article_modified(article_path),
        }

    @property
    def simple_list_of_articles(self):
        """Return a simple list of articles
        """
        return sorted([
            re.sub(
                r'^/?',
                '',
                _.replace(self.articles_path, '').replace('.md', '')
            )
            for _ in
            glob('{}/**/*.md'.format(self.articles_path))
        ])

    @property
    def list_of_articles(self):
        """Return a simple list of articles with information on
        whether they've been modified
        """
        uncommitted_list = self.list_of_uncommitted_articles
        simple_list = self.simple_list_of_articles

        return [
            {
                'title': _,
                'uncommitted': True if _ in uncommitted_list else False
            }
            for _
            in simple_list
        ]

    @property
    def alphabetized_list_of_articles(self):
        """Return an alphabetized list of articles with information on
        whether they've been modified
        """
        alphabetized_list = defaultdict(list)
        uncommitted_list = self.list_of_uncommitted_articles

        for _ in self.simple_list_of_articles:
            alphabetized_list[_[:1].upper()].append(
                {
                    'title': _,
                    'uncommitted': _ in uncommitted_list
                }
            )

        return alphabetized_list

    @property
    def list_of_uncommitted_articles(self):
        """Return a list of articles that have been modified
        """
        return [
            _.a_path.replace('.md', '')
            for _
            in self.article_repo.index.diff(None)
        ]

    def escape_html(self, text):
        html_escape_table = {
            '&': '&amp;',
            '"': '&quot;',
            "'": '&apos;',
            '>': '&gt;',
            '<': '&lt;',
        }

        return ''.join(html_escape_table.get(c, c) for c in text)

    # ------------------------ Path Functions ------------------------

    """
    Paths come in as

        folder one/folder two/some article

    In `helpers`, that path is always referred to as `article_path`.
    Need functions to turn `article_path` into

        # Article namespace only
        folder one/folder two

        # Article title only
        some article

        # Article title with extension
        some article.md

        # Full path to article file
        /path/to/repo/folder one/folder two/some article.md
    """

    def article_namespace(self, article_path):
        """Return only the article namespace without trailing slashes
        """
        match = re.match(r'^(.*)/.*$', article_path)
        return match.group(1).lstrip('/') if match else ''

    def article_title(self, article_path):
        """Return just the article title without a namespace.
        """
        match = re.match(r'^(.*/)?(.*)$', article_path)

        # TODO: Improve the regex and avoid this!
        if match.group(2)[-3:].upper() == '.MD':
            title = match.group(2)[:-3]
        else:
            title = match.group(2)

        return title

    def article_path_with_extension(self, article_path):
        """Silly, really: Just add a '.md' to the article's title
        """
        return '{}.md'.format(article_path)

    def full_article_path(self, article_path):
        """Return the full path to the article on disk
        """
        return '{}/{}/{}.md'.format(
            self.articles_path,
            self.article_namespace(article_path),
            self.article_title(article_path),
        )

    def article_title_with_extension(self, article_path):
        """Return the article title from the URL path with the ".md" extension
        """
        return '{}.md'.format(self.article_title(article_path))

    # ------------------------ Repository Functions ------------------------

    def get_commits(self, article_path):
        """Returns a list of commits as `Commit` objects for a
        given article title
        """
        return [
            _
            for _
            in self.article_repo.iter_commits(
                paths=self.article_path_with_extension(article_path)
            )
        ]

    def get_commit(self, article_path, sha):
        """Fetches a single `Commit` object for a given article title
        and commit SHA
        """
        commit = [
            _
            for _
            in self.get_commits(article_path)
            if _.hexsha == sha
        ]

        return commit[0] if commit else None

    def get_blob(self, article_path, commit):
        """Get the git blob for a given commit and article title
        """
        namespaces = article_path.split('/')

        if len(namespaces) == 1:
            blob = [
                _
                for _
                in commit.tree.blobs
                if _.name == self.article_title_with_extension(article_path)
            ]

        else:
            subtree_with_blob = commit.tree[namespaces[0]]

            for namespace in namespaces[1:-1:]:
                subtree_with_blob = subtree_with_blob[namespace]

            blob = [
                _
                for _
                in subtree_with_blob.blobs
                if _.name == self.article_title_with_extension(article_path)
            ]

        return blob[0] if blob else []

    def get_revision_list(self, article_path):
        """Get a list of revision objects for a given article title
        """
        revisions = []

        for commit in self.get_commits(article_path):
            committed_date = arrow.get(commit.committed_date)

            revisions.append({
                'id': commit.hexsha,
                'message': commit.message,
                'author': commit.author.name,
                'email': commit.author.email,
                'committed': str(committed_date),
                'committed_humanized': committed_date.humanize()
            })

        return revisions

    def get_revision(self, article_path, sha):
        """Get a single revision from a blob object for a given article
        title and commit ID
        """
        commit = self.get_commit(article_path, sha)

        if not commit:
            return None

        commit_date = arrow.get(commit.committed_date)
        blob = self.get_blob(article_path, commit)
        raw_article_content = (
            blob.data_stream.read().decode('UTF-8').replace('\u00a0', '')
            if blob
            else self.raw_article(article_path)
        )

        return {
            'title': self.article_title(article_path),
            'html': self.markdown_to_html(raw_article_content),
            'raw': raw_article_content,
            'committed': str(commit_date),
            'committed_humanized': commit_date.humanize(),
        }

    def get_diff(self, article_path, a, b):
        """Return a diff string between two revisions of a given
        article title.
        """
        revision_a = self.get_revision(article_path, a)
        revision_b = self.get_revision(article_path, b)

        unified_diff = '\n'.join(
            list(
                difflib.unified_diff(
                    revision_a['raw'].splitlines(),
                    revision_b['raw'].splitlines(),
                    fromfile='{}/{}'.format('a', article_path),
                    tofile='{}/{}'.format('b', article_path),
                    lineterm='',
                )
            )
        )

        diff_template = """diff --git a/{title} b/{title}
index {sha_a}..{sha_b} {file_mode}
{diff}
"""

        unified_diff = diff_template.format(
            title=article_path,
            diff=unified_diff,
            sha_a=a[0:7],
            sha_b=b[0:7],
            file_mode=oct(
                os.stat(self.full_article_path(article_path)).st_mode
            )[2:]
        )

        # Escape HTML special characters before returning the diff
        return self.escape_html(unified_diff)

    def pull_commits(self):
        """Pull all changes to the article repository from the default remote.
        An empty list denotes a successful pull.
        """
        errors = []

        try:
            self.article_repo.remote().pull()
        except Exception as e:
            errors.append(str(e))

        return errors

    # ------------------------ Search Functions ------------------------

    def create_search_index(self):
        """Create a search index in the articles path. The folder is named
        .search_index
        """
        document_path = self.articles_path
        schema = self.__search_schema

        index_path = '{}/.search_index'.format(document_path)
        if not os.path.exists(index_path):
            os.mkdir(index_path)

        logger.info('Creating index')
        search_index = index.create_in(index_path, schema)

        return search_index

    def update_index_with(self, entity):
        """Update the search index with either a single article title
        or a list of titles
        """
        writer = self.__search_index.writer()

        if not isinstance(entity, list):
            entity = [entity]

        for _ in entity:
            with open(self.full_article_path(_)) as f:
                try:
                    writer.update_document(
                        title=_,
                        path=self.full_article_path(_),
                        content=f.read()
                    )
                    logger.debug('Updated {}'.format(self.article_title(_)))

                except ValueError as e:
                    logger.error('Skipping {} ({})'.format(_, str(e)))

        writer.commit()

    def delete_from_index(self, article_path):
        logger.debug('Trying {}'.format(article_path))
        writer = self.__search_index.writer()
        writer.delete_by_term('title', article_path)

        logger.debug('Removed {}'.format(article_path))

        writer.commit()

    def populate_search_index(self):
        """Wraps the `update_index_with` function for the entire
        list of articles
        """
        self.update_index_with(self.simple_list_of_articles)

    def search_articles(self, query_string):
        """Searches the index with the given query string and returns
        an object with search results and metadata
        """
        if len(query_string) < 3:
            raise ValueError('Search query must be at least three characters long')

        search_results = {
            'query': query_string,
            'count': 0,
            'results': None
        }

        query = self.__search_parser.parse(query_string)

        with self.__search_index.searcher() as searcher:
            results = searcher.search(query, terms=True, limit=None)
            results.fragmenter.maxchars = 400
            results.fragmenter.surround = 100

            search_results['count'] = len(results)
            if search_results['count'] > 0:
                search_results['results'] = []

            for hit in results:
                with open(hit['path']) as f:
                    article_text = f.read()

                search_results['results'].append({
                    'title': hit['title'],
                    'content_matches': hit.highlights(
                        'content',
                        text=article_text
                    )
                })

        return search_results
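For reference, the indexing and fuzzy-search pattern used by the class above can be reduced to a small self-contained sketch; the schema fields, document, and paths below are illustrative stand-ins, not the class's own values.

import os
import tempfile

from whoosh import index
from whoosh.fields import Schema, ID, TEXT
from whoosh.qparser import MultifieldParser, FuzzyTermPlugin

# Illustrative schema: a stored, unique title plus stored path and content.
schema = Schema(
    title=ID(stored=True, unique=True),
    path=ID(stored=True),
    content=TEXT(stored=True),
)

# Create the index in a throwaway '.search_index' directory.
index_path = os.path.join(tempfile.mkdtemp(), '.search_index')
os.mkdir(index_path)
ix = index.create_in(index_path, schema)

# update_document() replaces any existing document with the same unique title.
writer = ix.writer()
writer.update_document(
    title=u'home',
    path=u'/tmp/home.md',
    content=u'Welcome to the wiki home page',
)
writer.commit()

# FuzzyTermPlugin enables the trailing-tilde syntax, e.g. wiki~1.
parser = MultifieldParser(['title', 'content'], schema=schema)
parser.add_plugin(FuzzyTermPlugin())
query = parser.parse(u'wiki~1')

with ix.searcher() as searcher:
    results = searcher.search(query, terms=True, limit=None)
    for hit in results:
        print(hit['title'], hit.highlights('content'))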
Example #40
0
class fetcher(object):
    
    def __init__(self, path):
        self.idxpath = path
        self.ix = open_dir(self.idxpath)
        self.query = MultifieldParser(['content','ctime'], schema=self.ix.schema)
        self.query.add_plugin(DateParserPlugin())
        self.sorter = MultiFacet(["ctime", ScoreFacet()])
        self.parser = ttp.Parser();
        self.dateparser = parser.parser();
        
    def fetch_thread_by_tid(self, retid):
        t1 = int(round(time.time() * 1000))
        tweets = []
        try :
            searcher = self.ix.searcher()
            results = searcher.documents(retweetid=retid)
            for r in results:
                tweet = json.loads(r['json'])
                tweet['created_at'] = self.dateparser.parse(tweet['created_at'])
                tweets.append(tweet)
        except Exception as e:
            print 'fetch_tweets error' + str(e)
        finally:
            searcher.close()
        t2 = int(round(time.time() * 1000))
        tweets = sorted(tweets, key=lambda x: x['created_at'], reverse=False)
        print '----> fetch tweets by retweet id ' + str(t2 - t1) + ' ms'
        return tweets
     
    def fetch_tweets_by_uid(self, uid):
        t1 = int(round(time.time() * 1000))
        try :
            searcher = self.ix.searcher()
            results = searcher.documents(ownerid=uid)
            tweets = []
            for r in results:
                tweet = json.loads(r['json'])
                tweet['user']['retweet_at'] = self.dateparser.parse(tweet['created_at'])
                tweet['created_at'] = self.dateparser.parse(tweet['created_at'])
                tweets.append(tweet)
        except Exception as e:
            print 'fetch_tweets error' + str(e)
        finally:
            searcher.close()
        t2 = int(round(time.time() * 1000))
        print '----> fetch tweets for the specified user costs ' + str(t2 - t1) + ' ms'
        return tweets
    
    def fetch_tweets_by_keyword(self, keyword, start, topk):
        
        print 'thread : '  + keyword
        
        t1 = int(round(time.time() * 1000))
        tweets = []
        users = []
        tweetids = {}
        qtext = unicode('ctime:[' + str(start) + ' to] AND ' + 'content:(' + keyword + ')')
        try :
            searcher = self.ix.searcher()
            q = self.query.parse(qtext)
            results = searcher.search(q)
            
            for r in results:
                t = json.loads(r['json'])
                tt = t;
                if 'retweeted_status' in t and t['retweeted_status'] is not None:
                    t = t['retweeted_status']
                tid = t['id_str']
                if tid not in tweetids:
                    
                    user = {
                        "id":tt['user']['id_str'], 
                        "retweet_time":self.dateparser.parse(tt['created_at']).strftime('%Y%m%d%H%M%S'),
                        "screen_name":tt['user']['screen_name'], 
                        "profile_image_url":tt['user']['profile_image_url'],
                        "followers_count":tt['user']['followers_count']
                    };
                    
                    users.append(user)
                    
                    tweet = {}
                    tweet['id'] = tid
                    tweet['text'] = t['text']
                    tweet['creator'] = {}
                    tweet['creator']['id'] = t['user']['id_str']
                    tweet['creator']['creator'] = t['user']['screen_name']
                    tweet['creator']['creator_img'] = t['user']['profile_image_url']
                    tweet['retweet_count'] = t['retweet_count']
                    tweet['created_at'] = self.dateparser.parse(t['created_at']).strftime('%Y%m%d%H%M%S')
                    tweet['retweet_history'] = [user]
                    tweet['rank'] = max(t['user']['followers_count'], tt['user']['followers_count']) * t['retweet_count']
                    tweetids[tid] = tweet
                    tweets.append(tweet)
                else :
                    user = {
                        "id":tt['user']['id_str'], 
                        "retweet_time":self.dateparser.parse(tt['created_at']).strftime('%Y%m%d%H%M%S'),
                        "screen_name":tt['user']['screen_name'], 
                        "profile_image_url":tt['user']['profile_image_url'],
                        "followers_count":tt['user']['followers_count']
                    };
                    users.append(user)
                    tweetids[tid]['retweet_history'].append(user)
                    tweetids[tid]['rank'] = max(tweetids[tid]['rank'], tt['user']['followers_count'] * t['retweet_count'])
                    print '--> update retweet history'
            
            tweets = sorted(tweets, key=lambda x: x['rank'], reverse=False)[:topk]
            tweets = sorted(tweets, key=lambda x: self.dateparser.parse(x['created_at']), reverse=False)
            
        except Exception as e:
            print 'error ' + str(e)
        finally:
            searcher.close()
        t2 = int(round(time.time() * 1000))
        print '----> fetch tweets for the specified user costs ' + str(t2 - t1) + ' ms'
        return (tweets, users)
    
    def fetch_retweeting_behavior(self, uid):
        
        tweets = self.fetch_tweets_by_uid(uid)
        
        print tweets
        
        glyph = {}
        glyph['threads'] = []
        glyph['users'] = []
        
        temp = []
        thread_tweets = []
        for tweet in tweets:
            tid = tweet['id']
            if 'retweeted_status' in tweet and tweet['retweeted_status'] is not None:
                tid = tweet['retweeted_status']['id']
            
            thread_retweets = self.fetch_thread_by_tid(unicode(tid))
            if(len(thread_retweets) == 0) :
                continue
            temp.append(tweet)
            thread_tweets.append(thread_retweets)
        
        tweets = temp
        
        behaviors = {}
        for i in range(len(tweets)):
            tweet = tweets[i]
            tid = tweet['id']
            if 'retweeted_status' in tweet and tweet['retweeted_status'] is not None:
                tid = tweet['retweeted_status']['id']
            
            thread = {}
            thread['content'] = []
            for tt in thread_tweets[i]:
                u = tt['user']
                if u['id'] not in behaviors:
                    behaviors[u['id']] = {};
                    behaviors[u['id']]['id'] = u['id']
                    behaviors[u['id']]['screen_name'] = u['screen_name']
                    behaviors[u['id']]['followers_count'] = u['followers_count']
                    behaviors[u['id']]['profile_image_url'] = u['profile_image_url']
                    behaviors[u['id']]['behavior'] = [0] * len(tweets);
                    behaviors[u['id']]['time'] = [''] * len(tweets);
                    behaviors[u['id']]['sentiments'] = [0] * len(tweets);
                    behaviors[u['id']]['rank'] = u['followers_count']
                    glyph['users'].append(behaviors[u['id']])
                behaviors[u['id']]['behavior'][i] = 1
                behaviors[u['id']]['time'][i] = tt['created_at'].strftime('%Y%m%d%H%M%S')
                behaviors[u['id']]['sentiments'][i] = sentiment(tweet['text'])
                thread['content'].append(behaviors[u['id']])
                        
            thread['name'] = tweet['text']
            thread['sentiment'] = sentiment(tweet['text'])
            thread['start'] = thread_tweets[i][0]['created_at'].strftime('%Y%m%d%H%M%S')
            thread['end'] = thread_tweets[i][len(thread_tweets[i]) - 1]['created_at'].strftime('%Y%m%d%H%M%S')
            glyph['threads'].append(thread)
        
        for userid in behaviors:
            behaviors[userid]['sentiment'] = 1.0 * sum(behaviors[userid]['sentiments']) / len(behaviors[userid]['sentiments'])
            del behaviors[userid]['sentiments']
        
        glyph['start'] = glyph['threads'][0]['start']
        glyph['end'] = glyph['threads'][len(glyph['threads']) - 1]['start']
        json.dump(glyph, open('./' + str(uid) + '.retweet' + '.json', 'wb'))
        
        return glyph
    
    def fetch_topic_behavior(self, uid):
        tags = {}
        tweets = self.fetch_tweets_by_uid(uid)
        for tweet in tweets:
            res = self.parser.parse(tweet['text'])
            if(len(res.tags) == 0):
                continue
            
            for tag in res.tags:
                if tag not in tags:
                    tags[tag] = tweet['created_at']
                else :
                    if tags[tag] > tweet['created_at']:
                        tags[tag] = tweet['created_at']
        
        glyph = {}
        glyph['start'] = '201210040000'
        glyph['end'] = '201210040600'
        glyph['threads'] = []
        glyph['users'] = []
        # construct thread
        behaviors = {}
        tid = 0
        for tag in tags:
            thread = {}
            thread['name'] = tag
            thread['start'] = tags[tag].strftime('%Y%m%d%H%M%S')
            (tweets, users) = self.fetch_tweets_by_keyword('#' + tag, thread['start'], 300)
            
            thread['content'] = []
            for t in tweets:
                t['time'] = [''] * len(tags.keys())
                t['time'][tid] = t['created_at']
                thread['content'].append(t)
                
            thread['end'] = thread['content'][len(thread['content']) - 1]['created_at']
            print thread['end']
            glyph['threads'].append(thread)
            
            for u in users:
                if u['id'] not in behaviors:
                    behaviors[u['id']] = {};
                    behaviors[u['id']]['id'] = u['id']
                    behaviors[u['id']]['screen_name'] = u['screen_name']
                    behaviors[u['id']]['followers_count'] = u['followers_count']
                    behaviors[u['id']]['behavior'] = [0] * len(tags.keys());
                    glyph['users'].append(behaviors[u['id']])
                    
                behaviors[u['id']]['behavior'][tid] = 1
            tid += 1
        
        glyph['start'] = glyph['threads'][0]['start']
        glyph['end'] = glyph['threads'][len(glyph['threads']) - 1]['start']
            
        json.dump(glyph, open('./' + str(uid) + '.topic' + '.json', 'wb'))
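The query string assembled in fetch_tweets_by_keyword combines an open-ended ctime range with a content clause, and it is parsed by the MultifieldParser that has DateParserPlugin attached in __init__. A minimal standalone sketch of that parsing step, with an illustrative schema and timestamp, might look like this:

from whoosh.fields import Schema, DATETIME, TEXT
from whoosh.qparser import MultifieldParser
from whoosh.qparser.dateparse import DateParserPlugin

# Illustrative schema matching the fields queried above.
schema = Schema(content=TEXT, ctime=DATETIME)

qp = MultifieldParser(['content', 'ctime'], schema=schema)
qp.add_plugin(DateParserPlugin())

# Open-ended date range combined with a keyword clause, in the same shape
# as the qtext built in fetch_tweets_by_keyword (timestamp is just an example).
q = qp.parse(u'ctime:[20121004000000 to] AND content:(sandy)')
print(q)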
Example #41
0
from whoosh.index import open_dir
from whoosh.qparser import QueryParser, MultifieldParser, FuzzyTermPlugin

ix = open_dir("./indexdir1/")  # same index directory used throughout this example
query_b = QueryParser('content', ix.schema).parse('cdk4')
with ix.searcher() as srch:
    res_b = srch.search(query_b, limit=10)
    for i in res_b:
        print(i['name'])

ix = open_dir("./indexdir1/")
query_b = QueryParser('name', ix.schema).parse('NCT01692496')
with ix.searcher() as srch:
    res_b = srch.search(query_b, limit=10)
    for i in res_b:
        print(i['name'])

mparser = MultifieldParser(["Title", "content"], schema=ix.schema)

mparser.add_plugin(FuzzyTermPlugin())

ix = open_dir("./indexdir1/")
q = mparser.parse('"Colon" "cancer" "BRAF" "V600E"')
out = []
with ix.searcher() as srch:
    res_b = srch.search(q, limit=100)
    for i in res_b:
        s = str(i['name'])[-11:]
        out.append(s)
out

eg = open('./topic_19.txt','w')

pos = 1
score = 10
Example #42
0
# coding=utf-8
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser
from whoosh.qparser import FuzzyTermPlugin

idx_dir = 'lagou_idx'
ix = open_dir(idx_dir)
searcher = ix.searcher()

parser = MultifieldParser(["name", "desc"], schema=ix.schema)
parser.add_plugin(FuzzyTermPlugin())

# Query mixing an OR, a fuzzy term (Pythn~2) and a city field filter.
# ('搜索' means "search"; '上海' is Shanghai.)
k = u'搜索 OR Pythn~2 city:上海'
q = parser.parse(k)

results = searcher.search_page(q, 1, pagelen=5)

print(u'{0} results found for keyword {1}, {2} returned: '.format(
    len(results), k, results.scored_length()))
for hit in results[:50]:
    print(hit['id'])
    print(hit['name'])
    # print(hit['city'])
    print(hit['com_name'])
    print('************')
Example #43
0
class Query(object):
    def __init__(self, index_name):
        self.index_name = index_name
        self.ix = open_dir(index_name)
        self.indexer = Indexer("./template/cache/")
        self.multiParser = MultifieldParser(["anchor", "content"],
                                            schema=self.indexer.schema)
        self.singleParser = QueryParser("content", schema=self.indexer.schema)
        self.site_re = re.compile(r'site:')
        self.filetype_re = re.compile(r'filetype:')
        self.type2fn = {
            "website": self.query_website,
            "image": self.query_image,
            "file": self.query_file,
            "document": self.query_document,
            "all": self.query_all,
        }
        self.searcher = self.ix.searcher(weighting=TFIDF_PR)
        self.singleParser.add_plugin(EveryPlugin)
        self.multiParser.add_plugin(EveryPlugin)

    def query_website(self, sentence, page, filetype=None, site=None):
        query = And(
            [self.multiParser.parse(sentence),
             Term("type", "website")])
        if filetype:
            query = And([query, Term("extension", filetype)])
        if site:
            query = And([query, Prefix("url", site)])
        results = self.searcher.search_page(query, page, terms=True)
        return results

    def query_image(self, sentence, page, filetype=None, site=None):
        query = And([self.singleParser.parse(sentence), Term("type", "image")])
        if filetype:
            query = And([query, Term("extension", filetype)])
        if site:
            query = And([query, Prefix("url", site)])
        results = self.searcher.search_page(query, page, terms=True)
        return results

    def query_file(self, sentence, page, filetype=None, site=None):
        query = And([self.singleParser.parse(sentence), Term("type", "file")])
        if filetype:
            query = And([query, Term("extension", filetype)])
        if site:
            query = And([query, Prefix("url", site)])
        results = self.searcher.search_page(query, page, terms=True)
        return results

    def query_document(self, sentence, page, filetype=None, site=None):
        query = And(
            [self.singleParser.parse(sentence),
             Term("type", "document")])
        if filetype:
            query = And([query, Term("extension", filetype)])
        if site:
            query = And([query, Prefix("url", site)])
        results = self.searcher.search_page(query, page, terms=True)
        return results

    def query_all(self, sentence, page, filetype=None, site=None):
        query = self.multiParser.parse(sentence)
        if filetype:
            query = And([query, Term("extension", filetype)])
        if site:
            query = And([query, Prefix("url", site)])
        results = self.searcher.search_page(query, page, terms=True)
        return results

    def query(self, sentence, target, page):
        site = None
        filetype = None
        self.log(sentence)
        if self.site_re.match(sentence):
            sentence = sentence[5:].strip()
            site = sentence.split(' ')[0]
            sentence = sentence[len(site):].strip()
            if len(urlparse(site)[0]) == 0:
                site = "%s%s" % ("http://", site)
        elif self.filetype_re.match(sentence):
            sentence = sentence[9:].strip()
            filetype = sentence.split(' ')[0]
            sentence = sentence[len(filetype):].strip()

        return self.type2fn[target](sentence, page, filetype, site)

    def log(self, sentence):
        with open('logging.txt', 'a') as f:
            f.write(datetime.datetime.now().strftime("%y%m%d%H%M%S") + " " +
                    sentence + "\n")
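The site:/filetype: handling in query() boils down to ANDing the parsed user query with Term and Prefix filters. A tiny standalone illustration of that composition, with a made-up schema, might look like this:

from whoosh.fields import Schema, ID, TEXT
from whoosh.qparser import QueryParser
from whoosh.query import And, Prefix, Term

# Made-up schema with the fields the filters touch.
schema = Schema(url=ID(stored=True), type=ID, extension=ID, content=TEXT)

parsed = QueryParser('content', schema=schema).parse(u'whoosh tutorial')
query = And([
    parsed,
    Term('type', u'document'),              # restrict to a result type
    Prefix('url', u'http://example.com'),   # emulate a site: filter
])
print(query)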
Example #44
0
    q_d = MultifieldParser(["title", "content", "extension", "url"],
                           i_d.schema,
                           group=og)
    q_e = MultifieldParser(["title", "content", "extension", "url"],
                           i_e.schema,
                           group=og)
    q_f = MultifieldParser(["title", "content", "extension", "url"],
                           i_f.schema,
                           group=og)
elif operator == 4:
    #print ("in oper 4")
    og = qparser.OrGroup.factory(0.9)
    q_a = MultifieldParser(["title", "content", "tags", "extension", "url"],
                           i_a.schema,
                           group=og)
    q_a.add_plugin(qparser.SequencePlugin("!(~(?P<slop>[1-9][0-9]*))?"))
    q_a.add_plugin(qparser.FuzzyTermPlugin())
    q_b = MultifieldParser(["title", "content", "extension", "url"],
                           i_b.schema,
                           group=og)
    q_b.add_plugin(qparser.SequencePlugin("!(~(?P<slop>[1-9][0-9]*))?"))
    q_b.add_plugin(qparser.FuzzyTermPlugin())
    q_c = MultifieldParser(["title", "content", "extension", "url"],
                           i_c.schema,
                           group=og)
    q_c.add_plugin(qparser.SequencePlugin("!(~(?P<slop>[1-9][0-9]*))?"))
    q_c.add_plugin(qparser.FuzzyTermPlugin())
    q_d = MultifieldParser(["title", "content", "extension", "url", "url"],
                           i_d.schema,
                           group=og)
    q_d.add_plugin(qparser.SequencePlugin("!(~(?P<slop>[1-9][0-9]*))?"))