示例#1
0
文件: govuk.py 项目: leifos/ifind
    def _parse_json_response(query, results):
        """
        Converts a GOV.uk JSON search payload into an ifind Response.

        Each entry of the payload's 'results' list becomes one ranked result.
        Ranks start at 1 and follow the order of the payload. Parsing stops
        as soon as the response holds query.top results.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.

        """
        response = Response(query.terms, query)
        payload = json.loads(results.text)

        # Links in the payload are not full URLs, so prefix them with the host.
        site_root = "https://www.gov.uk"

        for position, entry in enumerate(payload[u'results'], 1):
            # An entry without a description gets an empty summary.
            summary = entry.get(u'description', '')
            response.add_result(title=entry[u'title'],
                                url=site_root + entry[u'link'],
                                summary=summary,
                                rank=position)

            if len(response) == query.top:
                break

        return response
示例#2
0
    def _parse_xml_response(query, results):
        """
        Parses Wikipedia's XML response and returns as an ifind Response.

        Every 'Item' element yields one result built from its 'Text', 'Url'
        and 'Description' children. 'Text' and 'Url' are required; an item
        whose 'Description' is absent or has no text content gets an empty
        summary instead of aborting the whole parse.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.

        """
        def description_of(item):
            # Text of the first <Description> child, or '' when the element
            # is missing or empty (firstChild is None for an empty element).
            nodes = item.getElementsByTagName('Description')
            if not nodes or nodes[0].firstChild is None:
                return ''
            return nodes[0].firstChild.data

        response = Response(query.terms)

        xml_doc = xml.dom.minidom.parseString(results.content)

        # Use a separate name so the 'results' argument is not rebound.
        for item in xml_doc.getElementsByTagName('Item'):
            title = item.getElementsByTagName('Text')[0].firstChild.data
            url = item.getElementsByTagName('Url')[0].firstChild.data

            response.add_result(title=title,
                                url=url,
                                summary=description_of(item))

        return response
示例#3
0
    def _parse_json_response(query, results):
        """
        Parses Pipl's JSON response and returns as an ifind Response.

        One result is added per entry of the payload's 'records' list. The
        display name and source URL are required; the image URL is optional
        and is None when the record carries no usable image entry.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.
        """

        response = Response(query.terms)
        content = json.loads(results.text)

        for record in content[u'records']:
            name = record[u'names'][0][u'display']
            url = record[u'source'][u'url']

            # Best-effort image lookup. Only structural lookup failures are
            # treated as "no image"; anything else is allowed to propagate
            # rather than being silently swallowed.
            try:
                imageurl = record[u'images'][0][u'url']
            except (KeyError, IndexError, TypeError):
                imageurl = None

            summary = Pipl._build_summary(record)

            response.add_result(title=name,
                                url=url,
                                summary=summary,
                                imageurl=imageurl)

        return response
示例#4
0
    def _parse_xml_response(query, results):
        """
        Parses Wikipedia's XML response and returns as an ifind Response.

        Each "Item" element becomes one result. Its "Text" and "Url" children
        are required. A missing or empty "Description" child produces an
        empty summary rather than raising.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.

        """
        response = Response(query.terms)

        xml_doc = xml.dom.minidom.parseString(results.content)

        # Iterate under a distinct name so the 'results' argument stays intact.
        for item in xml_doc.getElementsByTagName("Item"):
            title = item.getElementsByTagName("Text")[0].firstChild.data
            url = item.getElementsByTagName("Url")[0].firstChild.data

            # An empty element has no child node, so firstChild is None there.
            description_nodes = item.getElementsByTagName("Description")
            description_text = description_nodes[0].firstChild if description_nodes else None
            summary = description_text.data if description_text is not None else ""

            response.add_result(title=title, url=url, summary=summary)

        return response
示例#5
0
    def _parse_json_response(query, results):
        """
        Converts a GOV.uk JSON search payload into an ifind Response.

        Each entry of the payload's "results" list contributes its title, its
        "web_url" and the description nested under "details". Parsing stops
        as soon as the response holds query.top results.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.

        """
        response = Response(query.terms)
        payload = json.loads(results.text)

        for entry in payload[u"results"]:
            description = entry[u"details"][u"description"]
            heading = entry[u"title"]
            link = entry[u"web_url"]

            response.add_result(title=heading, url=link, summary=description)

            # Stop once the requested number of results has been collected.
            if len(response) == query.top:
                break

        return response
示例#6
0
    def _parse_json_response(query, results):
        """
        Converts Facebook's JSON search payload into an ifind Response.

        Only user searches are handled, which is also the behaviour when the
        query has no result type. Any other result type yields an empty
        response.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.
        """

        response = Response(query.terms)
        payload = json.loads(results.text)

        # Let Facebook._check_errors inspect the payload for API errors first.
        Facebook._check_errors(payload)

        requested_type = query.result_type
        if requested_type and requested_type != 'user':
            # Other search types are not implemented yet.
            return response

        # Expected payload shape for a user search:
        #     {
        #        "data": [
        #           {"name": "John Doe", "id": "999999999999999"},
        #           {"name": "John Doe", "id": "88888888888888"}
        #        ],
        #        "paging": {"next": "long_url"}
        #     }

        # Profile links are built on this base; it will redirect to a
        # permanent user URL.
        profile_root = "https://www.facebook.com/app_scoped_user_id/"

        for person in payload[u'data']:
            display_name = person[u'name']
            person_id = person[u'id']
            profile_url = profile_root + person_id + '/'
            picture_url = "https://graph.facebook.com/{}/picture?type=normal".format(person_id)

            # The payload carries minimal information, so the snippet stays
            # empty. Filling it would need a further API query per user.
            response.add_result(title=display_name,
                                url=profile_url,
                                summary='',
                                imageurl=picture_url)

        return response
示例#7
0
    def _parse_json_response(self, query, results):
        """
        Converts Neutrinogeoaddress's JSON payload into an ifind Response.

        Each entry of the payload's 'locations' list becomes one result whose
        title is the address and whose URL points at Google Maps. The
        location fields are passed through as extra keyword arguments. An
        iframe URL is only built when self.google_api_key is set.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.
        """

        response = Response(query.terms, query)
        payload = json.loads(results.text)

        # Results aren't paginated, so there is never a further page.
        response.no_more_results = True

        maps_root = 'https://www.google.co.uk/maps/place/'
        # Spaces become '+' in the Google Maps URL.
        space_to_plus = maketrans(u' ', u'+')

        locations = payload.get(u'locations')
        if not locations:
            # Nothing to iterate over; hand back the empty response.
            return response

        for place in locations:
            address = place.get(u'address', '')
            latitude = place.get(u'latitude', '')
            longitude = place.get(u'longitude', '')
            country = place.get(u'country', '')
            country_code = place.get(u'country-code', '')
            city = place.get(u'city', '')
            postcode = place.get(u'postal-code', '')

            # The iframe_url must be placed in an iframe to render the map.
            iframe_url = self._build_iframe_url(address, space_to_plus) if self.google_api_key else None

            encoded_address = address.encode('utf-8').translate(space_to_plus)
            url = maps_root + encode_symbols(encoded_address)
            text = Neutrinogeoaddress._build_summary(address, city, country, postcode, latitude, longitude)

            response.add_result(title=address,
                                url=url,
                                summary=text,
                                imageurl=None,
                                address=address,
                                latitude=latitude,
                                longitude=longitude,
                                country=country,
                                country_code=country_code,
                                city=city,
                                postcode=postcode,
                                iframe_url=iframe_url)

        return response
示例#8
0
    def _parse_json_response(self, query, results):
        """
        Converts Googleplus's JSON payload into an ifind Response.

        The result type comes from the query and falls back to
        DEFAULT_RESULT_TYPE. 'people' and 'people+' produce one result per
        user, and 'people+' also builds a summary per user via
        self._build_person_summary. 'activities' produces one result per
        activity. Any other type yields an empty response.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.
        """

        response = Response(query.terms)
        payload = json.loads(results.text)

        result_type = query.result_type or DEFAULT_RESULT_TYPE

        if result_type in ('people', 'people+'):
            for person in payload[u'items']:
                name = person[u'displayName']
                url = person[u'url']
                imageurl = Googleplus._resize_image(person[u'image'][u'url'])

                # Only 'people+' fetches the extra per-person details.
                summary = self._build_person_summary(person[u'id']) if result_type == 'people+' else ''

                response.add_result(title=name, url=url, summary=summary, imageurl=imageurl)

        elif result_type == 'activities':
            for activity in payload[u'items']:
                title = activity[u'verb'] + ' ' + activity[u'title']
                url = activity[u'url']
                summary = Googleplus._build_activity_summary(activity)

                # An activity without an image keeps an empty image URL.
                try:
                    imageurl = Googleplus._resize_image(activity[u'image'][u'url'])
                except KeyError:
                    imageurl = ''

                response.add_result(title=title, url=url, summary=summary, imageurl=imageurl)

        return response
示例#9
0
    def _parse_json_response(query, results):
        """
        Parses Bing's JSON response and returns as an ifind Response.

        The query's result type selects which list of the payload is read:
        'web' (also used when no type is set) reads 'Web' and ranks results
        from 1; 'image' reads 'Image'; 'video' reads 'Video' and skips
        entries without a thumbnail URL. Any other type yields an empty
        response.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.

        """
        response = Response(query.terms)

        content = json.loads(results.text)

        result_type = query.result_type

        if result_type == 'web' or not result_type:
            web_results = content[u'd'][u'results'][0][u'Web']
            for rank, result in enumerate(web_results, 1):
                response.add_result(title=result[u'Title'],
                                    url=result[u'Url'],
                                    summary=result[u'Description'],
                                    rank=rank)

        elif result_type == 'image':
            for result in content[u'd'][u'results'][0][u'Image']:
                # Whole kilobytes. Floor division keeps this an integer under
                # both classic and true division semantics.
                file_size = str(int(result[u'FileSize']) // 1024)
                width = result[u'Width']
                height = result[u'Height']
                media_url = result[u'MediaUrl']
                thumb_url = result[u'Thumbnail'][u'MediaUrl']
                response.add_result(file_size=file_size,
                                    width=width,
                                    height=height,
                                    media_url=media_url,
                                    thumb_url=thumb_url)

        elif result_type == 'video':
            for result in content[u'd'][u'results'][0][u'Video']:
                run_time = Bing._get_video_length(int(result[u'RunTime']))
                title = result[u'Title']
                media_url = result[u'MediaUrl']
                thumb_url = result.get(u'Thumbnail', {}).get(u'MediaUrl', None)
                if thumb_url is None:
                    # Videos without a thumbnail are not reported.
                    continue
                response.add_result(title=title,
                                    media_url=media_url,
                                    run_time=run_time,
                                    thumb_url=thumb_url)

        return response
示例#10
0
文件: pipl.py 项目: leifos/ifind
    def _parse_json_response(query, results):
        """
        Parses Pipl's JSON response and returns as an ifind Response.

        One result is added per entry of the payload's 'records' list. When a
        record has no usable display name, the source URL is used as the
        title. The image URL is optional. The usernames, addresses,
        relationships, jobs, educations and tags entries are passed through
        as extra keyword arguments, and are None when absent.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.
        """

        response = Response(query.terms, query)
        content = json.loads(results.text)

        # Pipl only returns 20 results, so there are no more.
        response.no_more_results = True

        for record in content[u'records']:
            try:
                name = record[u'names'][0][u'display']
            except (KeyError, IndexError):
                # Missing key or empty 'names' list: fall back to the source
                # URL, tolerating a record whose 'source' entry is absent.
                name = (record.get(u'source') or {}).get(u'url', "_no title_")
            url = record[u'source'][u'url']

            # Best-effort image lookup. Only structural lookup failures mean
            # "no image"; other errors propagate instead of being hidden.
            try:
                imageurl = record[u'images'][0][u'url']
            except (KeyError, IndexError, TypeError):
                imageurl = None

            summary = Pipl._build_summary(record)

            # Kwargs below
            # Each keyword contains a list of (potentially empty) dictionary objects.
            usernames = record.get(u'usernames')
            addresses = record.get(u'addresses')
            relationships = record.get(u'relationships')
            jobs = record.get(u'jobs')
            educations = record.get(u'educations')
            tags = record.get(u'tags')

            response.add_result(title=name, url=url, summary=summary, imageurl=imageurl,
                                usernames=usernames, addresses=addresses, relationships=relationships,
                                jobs=jobs, educations=educations, tags=tags)

        return response
示例#11
0
文件: dummy.py 项目: Loptr250/ifind
    def _create_response(self, query):
        """
        Builds a canned ifind Response for the given query.

        Ten results are always produced. When query.terms equals one of the
        number words 'one' to 'ten', the results are those ten words in
        order; otherwise every result is 'rand'.

        Args:
            query (ifind Query): object encapsulating details of a search query.

        Returns:
            ifind Response: object holding the ten generated results.
        """
        response = Response(query.terms)

        number_words = ['one', 'two', 'three', 'four', 'five',
                        'six', 'seven', 'eight', 'nine', 'ten']

        if query.terms in number_words:
            labels = number_words
        else:
            labels = ['rand'] * 10

        for label in labels:
            response.add_result(label, 'www.' + label + '.com', label + ' ' + ' ' + label)

        return response
示例#12
0
文件: googlecse.py 项目: leifos/ifind
    def _parse_json_response(self, query, results):
        """
        Parses Google Custom Search's JSON response and returns as an ifind Response.

        When the payload carries a 'nextPageToken', the response's next_page
        is set to the query string extended with that token. Web results are
        ranked from 1 in payload order. A payload without an 'items' entry
        yields an empty response instead of raising KeyError.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.
        """

        response = Response(query.terms, query)
        content = json.loads(results.text)

        # The query object wasn't mutated earlier and the result type isn't passed to this function.
        # Use the query's result type, or the engine default when it has none.
        result_type = query.result_type or self.default_result_type

        # Check for a next page token.
        next_page_token = content.get(u'nextPageToken')
        if next_page_token:
            # A page token exists, create the URL which will fetch the next page
            response.next_page = "{}&pageToken={}".format(self._create_query_string(query), next_page_token)

        if result_type == 'web' or not query.result_type:
            # 'items' may be absent from the payload; treat that as no results.
            items = content.get(u'items') or []
            for rank, result in enumerate(items, 1):
                response.add_result(title=result[u'title'],
                                    url=result[u'link'],
                                    # Defensive: an item without a snippet gets an empty summary.
                                    summary=result.get(u'snippet', ''),
                                    rank=rank)

        return response
示例#13
0
    def _parse_json_response(query, results):
        """
        Converts Twitter's JSON search payload into an ifind Response.

        Each entry of the payload's 'statuses' list becomes one result: the
        title is the creation timestamp with its fifth field removed, the
        summary is the tweet text, and the URL is built from the user id and
        the tweet id. Parsing stops once the response holds query.top results.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.

        """
        response = Response(query.terms)
        payload = json.loads(results.text)

        for tweet in payload[u'statuses']:
            body = tweet[u'text']
            tweet_id = str(tweet[u'id'])
            author_id = tweet[u'user'][u'id_str']

            # TODO clean this up
            # Drop the fifth whitespace-separated field and rejoin the rest.
            stamp_fields = tweet[u'created_at'].split()
            stamp_fields.pop(4)
            posted_at = ' '.join(stamp_fields)

            status_url = 'https://www.twitter.com/{0}/status/{1}'.format(author_id, tweet_id)

            response.add_result(title=posted_at, url=status_url, summary=body)

            if len(response) == query.top:
                break

        return response
示例#14
0
    def _create_response(self, query):
        """
        Produces a fixed-size dummy ifind Response for a query.

        If query.terms is one of the words 'one' through 'ten', the ten
        results are named after those words. For any other terms, all ten
        results are named 'rand'.

        Args:
            query (ifind Query): object encapsulating details of a search query.

        Returns:
            ifind Response: object holding the ten generated results.
        """
        response = Response(query.terms)

        known_terms = [
            'one', 'two', 'three', 'four', 'five',
            'six', 'seven', 'eight', 'nine', 'ten',
        ]
        placeholder_terms = ['rand' for _ in range(10)]

        chosen = known_terms if query.terms in known_terms else placeholder_terms

        for term in chosen:
            site = 'www.' + term + '.com'
            blurb = term + ' ' + ' ' + term
            response.add_result(term, site, blurb)

        return response
示例#15
0
    def _parse_json_response(self, query, results):
        """
        Turns a Googleplus JSON payload into an ifind Response.

        The effective result type is the query's own, or DEFAULT_RESULT_TYPE
        when the query has none. People searches ('people', 'people+') add
        one result per user, and 'people+' also looks up a per-person summary.
        'activities' adds one result per activity. Other types add nothing.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.
        """

        response = Response(query.terms)
        content = json.loads(results.text)

        if query.result_type:
            result_type = query.result_type
        else:
            result_type = DEFAULT_RESULT_TYPE

        with_details = result_type == 'people+'

        if with_details or result_type == 'people':
            for user in content[u'items']:
                display_name = user[u'displayName']
                profile_url = user[u'url']
                avatar = Googleplus._resize_image(user[u'image'][u'url'])

                # Person details need a further lookup, done only for 'people+'.
                if with_details:
                    details = self._build_person_summary(user[u'id'])
                else:
                    details = ''

                response.add_result(title=display_name,
                                    url=profile_url,
                                    summary=details,
                                    imageurl=avatar)

        elif result_type == 'activities':
            for activity in content[u'items']:
                heading = activity[u'verb'] + ' ' + activity[u'title']
                activity_url = activity[u'url']
                details = Googleplus._build_activity_summary(activity)

                # Image is optional: a KeyError leaves the URL empty.
                try:
                    picture = Googleplus._resize_image(
                        activity[u'image'][u'url'])
                except KeyError:
                    picture = ''

                response.add_result(title=heading,
                                    url=activity_url,
                                    summary=details,
                                    imageurl=picture)

        return response
示例#16
0
    def _parse_json_response(query, results):
        """
        Converts a GOV.uk JSON search payload into an ifind Response.

        Each entry of the payload's 'results' list becomes one result whose
        URL is the entry's link prefixed with the GOV.uk host. Entries
        without a description get an empty summary. Parsing stops as soon as
        the response holds query.top results.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.

        """
        response = Response(query.terms)
        payload = json.loads(results.text)

        # Results do not provide a full link, so each one is prefixed with this.
        host = "https://www.gov.uk"

        for entry in payload[u'results']:
            # Results with no description exist; they get an empty summary.
            summary = entry.get(u'description', '')
            heading = entry[u'title']
            link = host + entry[u'link']

            response.add_result(title=heading, url=link, summary=summary)

            if len(response) == query.top:
                break

        return response
示例#17
0
文件: bing.py 项目: Loptr250/ifind
    def _parse_json_response(query, results):
        """
        Parses Bing's JSON response and returns as an ifind Response.

        Which list of the payload is read depends on the query's result type:
        'web' (or no type at all) reads 'Web' and assigns ranks starting at
        1; 'image' reads 'Image' and reports the file size in whole
        kilobytes; 'video' reads 'Video' and ignores entries that have no
        thumbnail URL. Other types leave the response empty.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.

        """
        response = Response(query.terms)

        content = json.loads(results.text)

        result_type = query.result_type

        if result_type == 'web' or not result_type:
            for rank, result in enumerate(content[u'd'][u'results'][0][u'Web'], 1):
                response.add_result(title=result[u'Title'], url=result[u'Url'],
                                    summary=result[u'Description'], rank=rank)

        elif result_type == 'image':
            for result in content[u'd'][u'results'][0][u'Image']:
                # Floor division: the size stays an integer number of
                # kilobytes regardless of the interpreter's '/' semantics.
                file_size = str(int(result[u'FileSize']) // 1024)
                width = result[u'Width']
                height = result[u'Height']
                media_url = result[u'MediaUrl']
                thumb_url = result[u'Thumbnail'][u'MediaUrl']
                response.add_result(file_size=file_size, width=width, height=height,
                                    media_url=media_url, thumb_url=thumb_url)

        elif result_type == 'video':
            for result in content[u'd'][u'results'][0][u'Video']:
                run_time = Bing._get_video_length(int(result[u'RunTime']))
                title = result[u'Title']
                media_url = result[u'MediaUrl']
                thumb_url = result.get(u'Thumbnail', {}).get(u'MediaUrl', None)
                if thumb_url is None:
                    # No thumbnail available: leave this video out.
                    continue
                response.add_result(title=title, media_url=media_url, run_time=run_time, thumb_url=thumb_url)

        return response
示例#18
0
文件: twitter.py 项目: leifos/ifind
    def _parse_json_response(self, query, results):
        """
        Parses Twitter's JSON response and returns as an ifind Response.

        Paging: when the payload's 'search_metadata' carries 'next_results',
        its parameters are turned into a dictionary and used to build the
        response's next_page URL; otherwise no_more_results is set.

        Each entry of 'statuses' becomes one result. The title combines the
        user's name, screen name and a reformatted creation time. Details of
        the user, the tweet's entities and its reply information are passed
        through as extra keyword arguments. Parsing stops once the response
        holds query.top results.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.

        """
        response = Response(query.terms, query)
        content = json.loads(results.text)

        # Check to see if there are more results.
        next_results = content[u'search_metadata'].get(u'next_results', '')
        if next_results:
            # Build a dictionary from the string found in u'next_results'.
            # The first character is dropped; the rest is 'key=value' pairs
            # joined by '&'.
            param_dic = {}
            for pair in next_results[1:].split('&'):
                if not pair:
                    # Ignore empty segments such as a trailing '&'.
                    continue
                # Split on the first '=' only, so a value that contains '='
                # is kept whole and a bare key maps to ''.
                key, _sep, value = pair.partition('=')
                param_dic[key] = value

            # Set the next page URL in the response.
            response.next_page = self._create_query_string(query, search_params=param_dic)
        else:
            # No more results, set the flag in the response
            response.no_more_results = True

        for result in content[u'statuses']:

            text = result[u'text']
            result_id = str(result[u'id'])

            # User dictionary; only the id is required, the rest may be None.
            author = result[u'user']
            user = {'user_id': author[u'id_str'],
                    'profile_image': author.get(u'profile_image_url'),
                    'geo_enabled': author.get(u'geo_enabled'),
                    'description': author.get(u'description'),
                    'follower_count': author.get(u'followers_count'),
                    'protected': author.get(u'protected'),
                    'location': author.get(u'location'),
                    'utc_offset': author.get(u'utc_offset'),
                    'time_zone': author.get(u'time_zone'),
                    'name': author.get(u'name'),
                    'screen_name': author.get(u'screen_name'),
                    'member_since': author.get(u'created_at')
            }

            # TODO clean this up
            stamp = result[u'created_at'].split()
            # Created at in format: '01 Jan, 2014 @ 20:23'
            created_at = "{} {}, {} @ {}".format(stamp[2], stamp[1], stamp[5], stamp[3][:-3])

            url = 'https://www.twitter.com/{0}/status/{1}'.format(user['user_id'], result_id)
            imageurl = user.get('profile_image')
            title = u"{} ({}) - {}".format(user['name'], user['screen_name'], created_at)

            # A tweet without an 'entities' entry simply has no entity lists.
            entities = result.get(u'entities') or {}

            # Kwargs below
            source = result.get(u'source')
            coordinates = result.get(u'coordinates')
            place = result.get(u'place')
            hashtags = entities.get(u'hashtags')
            reply_to_screen_name = result.get(u'in_reply_to_screen_name')
            reply_to_userid = result.get(u'in_reply_to_user_id_str')
            reply_to_status = result.get(u'in_reply_to_status_id_str')

            # List of links in the tweet. Each item in the list is a dictionary with keys:
            # u'url, u'indices', u'expanded_url, u'display_url'
            links = entities.get(u'urls')

            # List of media items in the tweet. Each item in the list is a dictionary with keys:
            # u'expanded_url', u'sizes', u'url', u'media_url_https',
            # u'id_str', u'indices', u'media_url', u'type', u'id', u'display_url'
            media = entities.get(u'media')

            # List of users mentioned in the tweet. Each item in the list is a dictionary with keys:
            # u'indices', 'u'screen_name', u'PSG_inside', u'id', u'name', u'id_str'
            user_mentions = entities.get(u'user_mentions')

            response.add_result(title=title, url=url, summary=text, imageurl=imageurl, stamp=stamp,
                                user_info=user, media=media, links=links, user_mentions=user_mentions,
                                source=source, coordinates=coordinates, place=place,
                                hashtags=hashtags, reply_to_screen_name=reply_to_screen_name,
                                reply_to_status=reply_to_status, reply_to_userid=reply_to_userid,
                                tweet_id=result_id)

            if len(response) == query.top:
                break

        return response
示例#19
0
    def _parse_whoosh_response(query, results):
        """
        Converts a page of Whoosh hits into an ifind Response.

        Each hit becomes one result. Its rank is its 1-based position on the
        page, offset by the number of results on the preceding pages. Blank
        titles are replaced by "Untitled". After the loop the response is
        given result_total, total_pages, results_on_page and actual_page.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : iterable page of hits; its pagenum, pagelen, pagecount
                and actual_page attributes and its length are read here.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.

        """

        response = Response(query.terms)

        for position, hit in enumerate(results, 1):
            title = hit["title"]
            title = title.strip() if title else "Untitled"

            # Position on this page plus everything on the preceding pages.
            rank = ((int(results.pagenum) - 1) * results.pagelen) + position

            url = "/treconomics/" + str(hit.docnum)

            summary = hit.highlights("content")
            trecid = hit["docid"].strip()
            source = hit["source"]

            response.add_result(title=title,
                                url=url,
                                summary=summary,
                                docid=trecid,
                                source=source,
                                rank=rank,
                                whooshid=hit.docnum,
                                score=hit.score)

        # The total is taken from the length of the results object
        # (results.pagecount was used here previously and judged incorrect).
        response.result_total = len(results)

        # Expose the paging details of the results object on the response.
        response.total_pages = results.pagecount
        response.results_on_page = results.pagelen
        response.actual_page = results.actual_page
        return response
示例#20
0
    def _request(self, query):
        """
        Issues a single request to the Whoosh index and returns the result as
        an ifind Response.

        The page number is read from query.skip and the page length from
        query.top. If caching is enabled and a Response for the requested
        page is already cached, it is unpickled and returned. Otherwise a
        single search fetching FORWARD_LOOK_PAGES * query.top hits is run,
        the hits are interleaved and split into per-page lists, and a
        Response is built for every list. Each of those Responses is written
        to the cache (when caching is enabled and the key is not yet
        present); the one built for the requested page is returned.

        Args:
            query (ifind Query): object encapsulating details of a search query.

        Returns:
            ifind Response: object encapsulating a search request's results.
            If the split produces no page lists, an empty Response for
            query.terms is returned.

        Raises:
            EngineException
            NOTE(review): nothing in this method raises EngineException
            explicitly; it could only come from a callee - confirm.

        Usage:
            Private method.

        """
        #try:
        query_terms = self.parser.parse(unicode(query.terms))
        page = query.skip
        pagelen = query.top

        with self.docIndex.searcher(weighting=self.scoring_model) as searcher:
            #invalid_page_no = True

            # The cache is keyed on the page number together with the raw query terms.
            cache_key = self.get_cache_key(page, query.terms)

            if self.use_cache and self.cache.exists(cache_key):
                # Cache hit: the stored value is a pickled Response.
                # NOTE(review): pickle.loads is only safe if the cache contents are trusted.
                return_response = self.cache.get(cache_key)
                return_response = pickle.loads(return_response)

                print "WhooshTRECNewsEngine found CACHED results"
            else:
                # Cache miss: fetch several pages' worth of hits in one search so that
                # the following pages can be cached as well.
                results = searcher.search_page(query_terms, page, pagelen=(FORWARD_LOOK_PAGES * pagelen))
                results.fragmenter = highlight.ContextFragmenter(maxchars=3000, surround=3000)
                results.formatter = highlight.HtmlFormatter()
                results.fragmenter.charlimit = 100000
                # Record the requested page on the results object; _parse_whoosh_response
                # is handed this object next.
                setattr(results, 'actual_page', page)

                ifind_response = self._parse_whoosh_response(query, results)
                interleaved_results = self.__interleave_results(ifind_response.results, pagelen)
                split_results = self.__split_results(query, interleaved_results)

                # page_counter starts at the requested page, so the first page list
                # (if any) is the one that gets returned.
                page_counter = page
                # Fallback returned when split_results yields nothing.
                return_response = Response(query.terms)

                for page_list in split_results:
                    response = Response(query.terms)

                    for hit in page_list:
                        response.add_result_object(hit)

                    # pagenum and total_pages are copied from the single underlying Whoosh
                    # search; actual_page carries the per-list page counter.
                    response.pagenum = results.pagenum
                    response.total_pages = results.pagecount
                    response.results_on_page = len(page_list)
                    response.actual_page = page_counter

                    loop_cache_key = self.get_cache_key(page_counter, query.terms)

                    # Never overwrite an entry that is already cached for this page.
                    if self.use_cache and not self.cache.exists(loop_cache_key):
                        response_str = pickle.dumps(response)
                        self.cache.set(loop_cache_key, response_str)

                    if page_counter == page:
                        return_response = response
                        #print "WhooshTRECNewsEngine found: " + str(len(results)) + " results for query: " + query.terms
                        #print "Page %d of %d - PageLength of %d" % (results.pagenum, results.pagecount, results.pagelen)

                    page_counter = page_counter + 1


            # The string literal below is an earlier implementation kept as dead code;
            # it is a bare expression statement and has no effect at runtime.
            """
            # If the user specifies a page number that's higher than the number of pages available,
            # this loop looks until a page number is found that contains results and uses that instead.
            # Prevents a horrible AttributeError exception later on!
            while invalid_page_no:
                try:
                    results = searcher.search_page(query_terms, page, pagelen)
                    invalid_page_no = False
                    setattr(results, 'actual_page', page)
                except ValueError:
                    page -= page

            results.fragmenter = highlight.ContextFragmenter(maxchars=300, surround=300)
            results.formatter = highlight.HtmlFormatter()
            results.fragmenter.charlimit = 100000
            print "WhooshTRECNewsEngine found: " + str(len(results)) + " results for query: " + query.terms
            print "Page %d of %d - PageLength of %d" % (results.pagenum, results.pagecount, results.pagelen)
            response = self._parse_whoosh_response(query, results)
            """
        #except:
        #    print "Error in Search Service: Whoosh TREC News search failed"

        return return_response
示例#21
0
def parse_response(reader, fieldname, analyzer, fragmenter, formatter, query, results, results_are_page=False):
    """
    Returns an ifind Response, given a query and set of results from Whoosh/Redis.
    Takes an ifind Query object and a list of SORTED results for the given query.

    If the page requested (query.skip) is < 0, page 1 is returned.
    If the page requested is greater than the number of available pages, the last page is returned.
    (When results_are_page is False, page selection is delegated to get_page().)

    Args:
        reader: index reader; only stored_fields(docnum) is called on it.
        fieldname: name of the field whose query terms are highlighted in the summary.
        analyzer: passed straight through to highlight().
        fragmenter: passed straight through to highlight().
        formatter: passed straight through to highlight().
        query (ifind Query): supplies terms, parsed_terms and top (the page length).
        results: either the full result list (handed to get_page()), or - when
            results_are_page is True - a (page, total_pages, page_results) sequence.
            Every individual result is indexed as [0] = Whoosh docnum, [1] = score.
        results_are_page (bool): True if results is already such a paged sequence.

    Returns:
        ifind Response: one entry per result on the selected page, with
        total_pages, results_on_page and actual_page set as attributes.
    """

    def get_term_list():
        # A plain unicode string is treated as one single term.
        if isinstance(query.parsed_terms, unicode):
            return [query.parsed_terms]

        # Keep only the terms that belong to the highlighted field. Comparing
        # fieldname with itself would accept the terms of every field.
        return [text for term_fieldname, text in query.parsed_terms.all_terms() if term_fieldname == fieldname]

    response = Response(query.terms)
    # NOTE(review): when results_are_page is True this is the length of the
    # (page, total_pages, page_results) sequence, not a hit count - confirm intent.
    response.results_total = len(results)

    if results_are_page:
        page = results[0]
        response.total_pages = results[1]
        results = results[2]
    else:
        page, response.total_pages, results = get_page(query, results)

    page_len = query.top

    # The term list is the same for every hit, so it is built once. It is not
    # built at all for an empty page, as no summary is generated in that case.
    highlight_terms = get_term_list() if results else []

    for position, result in enumerate(results, 1):
        # Rank is global across pages: offset of this page plus position on the page.
        rank = (page - 1) * page_len + position
        whoosh_docnum = result[0]
        score = result[1]
        stored_data = reader.stored_fields(whoosh_docnum)

        title = stored_data["title"]
        title = title.strip() if title else "Untitled Document"

        url = "/treconomics/{0}/".format(whoosh_docnum)
        trecid = stored_data["docid"].strip()
        source = stored_data["source"].strip()

        summary = highlight(stored_data["content"], highlight_terms, analyzer, fragmenter, formatter)
        summary = "{0}...".format(summary)

        response.add_result(
            title=title,
            url=url,
            summary=summary,
            docid=trecid,
            source=source,
            rank=rank,
            whooshid=whoosh_docnum,
            score=score,
        )

    # The following two lines are for compatibility purposes with the existing codebase.
    # Would really like to take these out.
    setattr(response, "results_on_page", len(results))
    setattr(response, "actual_page", page)

    return response
示例#22
0
    def _parse_whoosh_response(query, search_page, field, fragmenter, snippet_size):
        """
        Converts a page of Whoosh hits into an ifind Response.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            search_page: iterable page of Whoosh hits; its results, pagecount,
                pagelen and actual_page attributes are used as well.
            field: name of the stored field used for the summary and the content.
            fragmenter: fragmenter installed on search_page.results before highlighting.
            snippet_size: passed as `top` to each hit's highlights() call.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.

        """
        response = Response(query.terms)

        # Must be in place before any hit.highlights() call below.
        search_page.results.fragmenter = fragmenter

        for hit in search_page:
            # Missing, empty and whitespace-only titles all fall back to "Untitled".
            raw_title = hit["title"]
            heading = raw_title.strip() if raw_title else "Untitled"
            if heading == '':
                heading = "Untitled"

            position = hit.rank + 1
            link = "/treconomics/" + str(hit.docnum)
            snippet = hit.highlights(field, top=snippet_size)
            body = hit[field]
            trec_docid = hit["docid"].strip()
            origin = hit["source"]

            response.add_result(title=heading,
                                url=link,
                                summary=snippet,
                                docid=trec_docid,
                                source=origin,
                                rank=position,
                                whooshid=hit.docnum,
                                score=hit.score,
                                content=body)

        response.result_total = len(search_page)

        # Paging information copied from the Whoosh page onto the response.
        response.total_pages = search_page.pagecount
        response.results_on_page = search_page.pagelen
        response.actual_page = search_page.actual_page
        return response
示例#23
0
def parse_response(reader,
                   fieldname,
                   analyzer,
                   fragmenter,
                   formatter,
                   query,
                   results,
                   results_are_page=False):
    """
    Returns an ifind Response, given a query and set of results from Whoosh/Redis.
    Takes an ifind Query object and a list of SORTED results for the given query.

    If the page requested (query.skip) is < 0, page 1 is returned.
    If the page requested is greater than the number of available pages, the last page is returned.
    (When results_are_page is False, page selection is delegated to get_page().)

    Args:
        reader: index reader; only stored_fields(docnum) is called on it.
        fieldname: name of the field whose query terms are highlighted in the summary.
        analyzer: passed straight through to highlight().
        fragmenter: passed straight through to highlight().
        formatter: passed straight through to highlight().
        query (ifind Query): supplies terms, parsed_terms and top (the page length).
        results: either the full result list (handed to get_page()), or - when
            results_are_page is True - a (page, total_pages, page_results) sequence.
            Every individual result is indexed as [0] = Whoosh docnum, [1] = score.
        results_are_page (bool): True if results is already such a paged sequence.

    Returns:
        ifind Response: one entry per result on the selected page, with
        total_pages, results_on_page and actual_page set as attributes.
    """
    def get_term_list():
        # A plain unicode string is treated as one single term.
        if isinstance(query.parsed_terms, unicode):
            return [query.parsed_terms]

        # Only terms of the highlighted field are wanted. Testing fieldname
        # against itself is always true and would let every field's terms through.
        return [
            text for term_fieldname, text in query.parsed_terms.all_terms()
            if term_fieldname == fieldname
        ]

    response = Response(query.terms)
    # NOTE(review): when results_are_page is True this is the length of the
    # (page, total_pages, page_results) sequence, not a hit count - confirm intent.
    response.results_total = len(results)

    if results_are_page:
        page = results[0]
        response.total_pages = results[1]
        results = results[2]
    else:
        page, response.total_pages, results = get_page(query, results)

    page_len = query.top

    # Identical for every hit, so computed once; skipped entirely for an
    # empty page because no summary is produced then.
    highlight_terms = get_term_list() if results else []

    for position, result in enumerate(results, 1):
        # Rank is global across pages: offset of this page plus position on the page.
        rank = (page - 1) * page_len + position
        whoosh_docnum = result[0]
        score = result[1]
        stored_data = reader.stored_fields(whoosh_docnum)

        title = stored_data['title']
        title = title.strip() if title else "Untitled Document"

        url = "/treconomics/{0}/".format(whoosh_docnum)
        trecid = stored_data['docid'].strip()
        source = stored_data['source'].strip()

        summary = highlight(stored_data['content'], highlight_terms, analyzer,
                            fragmenter, formatter)
        summary = "{0}...".format(summary)

        response.add_result(title=title,
                            url=url,
                            summary=summary,
                            docid=trecid,
                            source=source,
                            rank=rank,
                            whooshid=whoosh_docnum,
                            score=score)

    # The following two lines are for compatibility purposes with the existing codebase.
    # Would really like to take these out.
    setattr(response, 'results_on_page', len(results))
    setattr(response, 'actual_page', page)

    return response
示例#24
0
    def _parse_json_response(self, query, results):
        """
        Turns the JSON body of a Googleplus reply into an ifind Response.

        The shape of the reply depends on the result type ('people',
        'activities' or 'person_lookup'); any other type yields a Response
        without results.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.
        """
        response = Response(query.terms, query)
        payload = json.loads(results.text)

        # The query carries no mutated result type, so fall back to the engine default.
        kind = query.result_type or self.default_result_type

        # If the API handed back a page token, remember the URL of the next page.
        page_token = payload.get(u'nextPageToken')
        if page_token:
            response.next_page = "{}&pageToken={}".format(self._create_query_string(query), page_token)

        if kind == 'people':
            # One result per user returned by a people search.
            for person in payload[u'items']:
                display_name = person[u'displayName']
                profile_url = person[u'url']
                avatar = Googleplus._resize_image(person[u'image'][u'url'])

                response.add_result(title=display_name, url=profile_url, summary='', imageurl=avatar,
                                    id=person[u'id'])

        elif kind == 'activities':
            # One result per activity; three dictionaries travel along as keyword arguments.
            activity_fields = ('url', 'verb', 'title', 'published', 'updated', 'kind', 'id')

            for activity in payload[u'items']:
                activity_dict = {name: activity.get(name) for name in activity_fields}

                actor = activity.get(u'actor')
                actor_dict = {
                    'display_name': actor.get(u'displayName'),
                    'url': actor.get(u'url'),
                    'image': actor.get(u'image').get(u'url'),
                    'id': actor.get(u'id'),
                }

                shared = activity.get(u'object')
                object_dict = {
                    'type': shared.get(u'objectType'),
                    'content': shared.get(u'content').encode('utf-8'),
                    'url': shared.get(u'url'),
                }

                heading = u"{}  ({})".format(activity_dict.get('title'),
                                            activity_dict.get('verb'))
                link = activity_dict.get('url')
                blurb = Googleplus._build_activity_summary(activity)
                avatar = Googleplus._resize_image(actor_dict.get('image'))

                # Attachments is a list of dictionaries with keys:
                # u'objectType', u'displayName', u'content', u'url' and potentially nested dictionaries,
                # such as u'embed', u'image', u'thumbnails (list of dicts).
                attachments = shared.get(u'attachments')

                response.add_result(title=heading, url=link, summary=blurb, imageurl=avatar,
                                    actor=actor_dict, object=object_dict, activity=activity_dict,
                                    attachments=attachments)

        elif kind == 'person_lookup':
            # A lookup describes exactly one person, so there is nothing to loop over.
            heading = payload[u'displayName']
            link = payload[u'url']
            avatar = payload[u'image'][u'url']
            blurb = Googleplus._build_person_summary(payload)

            # Optional profile details, handed over as a single keyword argument.
            person = {
                'about_me': payload.get(u'aboutMe'),
                'occupation': payload.get(u'occupation'),
                'verified': payload.get(u'verified'),
                'emails': payload.get(u'emails'),
                'circled_count': payload.get(u'circledByCount'),
                'is_plus_user': payload.get(u'isPlusUser'),
                'birthday': payload.get(u'birthday'),
                'bragging_rights': payload.get(u'braggingRights'),
                'skills': payload.get(u'skills'),
                'relationship_status': payload.get(u'relationshipStatus'),
                'places_lived': payload.get(u'placesLived'),
                'organizations': payload.get(u'organizations'),
                'tagline': payload.get(u'tagline'),
            }

            response.add_result(title=heading, url=link, summary=blurb, imageurl=avatar, person=person)

        return response
示例#25
0
    def _parse_json_response(self, query, results):
        """
        Turns the JSON body of a Companycheck reply into an ifind Response.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.
        """
        response = Response(query.terms, query)
        records = json.loads(results.text)

        # Results only carry a company/director number; the full link is this root plus that number.
        site_root = 'http://companycheck.co.uk/'

        # The query object is not mutated, so apply the engine default here when no type is set.
        kind = query.result_type or self.default_result_type

        # Everything Companycheck has is in this one reply; there are no further pages.
        response.no_more_results = True

        if kind == 'company' or not kind:
            # Company search: one result per company record.
            for company in records:
                name = company[u'name']
                number = company[u'number']
                link = site_root + 'company/' + str(number)
                blurb = Companycheck._build_company_summary(company)

                response.add_result(title=name, url=link, summary=blurb, imageurl=None,
                                    number=number, country=company[u'country'],
                                    address=company[u'address'], sic=company[u'sic'],
                                    status=company[u'status'])

        elif kind == 'director':
            # Director search: one result per director record.
            for director in records:
                name = director[u'name']
                number = director[u'number']
                link = site_root + 'director/' + str(number)
                details = Companycheck._build_director_summary(director)

                response.add_result(title=name, url=link, summary=details.get('summary'), imageurl=None,
                                    postcodes=details.get('postcode_list'), number=number)

        return response
示例#26
0
    def _parse_json_response(query, results):
        """
        Turns the JSON body of a Twitter search reply into an ifind Response.

        Stops adding results as soon as the response holds query.top of them.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.

        """
        # (key in our user dictionary, field name in Twitter's user object)
        optional_user_fields = (
            ('profile_image', u'profile_image_url'),
            ('geo_enabled', u'geo_enabled'),
            ('description', u'description'),
            ('follower_count', u'followers_count'),
            ('protected', u'protected'),
            ('location', u'location'),
            ('utc_offset', u'utc_offset'),
            ('time_zone', u'time_zone'),
            ('name', u'name'),
            ('screen_name', u'screen_name'),
            ('member_since', u'created_at'),
        )

        response = Response(query.terms)
        payload = json.loads(results.text)

        for tweet in payload[u'statuses']:
            text = tweet[u'text']
            tweet_id = str(tweet[u'id'])

            # User dictionary: the id is mandatory, everything else is optional.
            profile = tweet[u'user']
            user = {'user_id': profile[u'id_str']}
            for key, field in optional_user_fields:
                user[key] = profile.get(field)

            # TODO clean this up
            stamp = tweet[u'created_at'].split()
            # Created at in format: '01 Jan, 2014 @ 20:23'
            created_at = "{} {}, {} @ {}".format(stamp[2], stamp[1], stamp[5], stamp[3][:-3])

            link = 'https://www.twitter.com/{0}/status/{1}'.format(user['user_id'], tweet_id)
            avatar = user.get('profile_image')
            heading = u"{} ({}) - {}".format(user['name'], user['screen_name'], created_at)

            # Extra keyword arguments for the result object.
            source = tweet.get(u'source')
            coordinates = tweet.get(u'coordinates')
            place = tweet.get(u'place')

            entities = tweet.get(u'entities')
            hashtags = entities.get(u'hashtags')

            reply_to_screen_name = tweet.get(u'in_reply_to_screen_name')
            reply_to_userid = tweet.get(u'in_reply_to_user_id_str')
            reply_to_status = tweet.get(u'in_reply_to_status_id_str')

            # List of links in the tweet. Each item in the list is a dictionary with keys:
            # u'url, u'indices', u'expanded_url, u'display_url'
            links = entities.get(u'urls')

            # List of media items in the tweet. Each item in the list is a dictionary with keys:
            # u'expanded_url', u'sizes', u'url', u'media_url_https',
            # u'id_str', u'indices', u'media_url', u'type', u'id', u'display_url'
            media = entities.get(u'media')

            # List of users mentioned in the tweet. Each item in the list is a dictionary with keys:
            # u'indices', 'u'screen_name', u'PSG_inside', u'id', u'name', u'id_str'
            user_mentions = entities.get(u'user_mentions')

            response.add_result(title=heading, url=link, summary=text, imageurl=avatar, stamp=stamp,
                                user_info=user, media=media, links=links, user_mentions=user_mentions,
                                source=source, coordinates=coordinates, place=place,
                                hashtags=hashtags, reply_to_screen_name=reply_to_screen_name,
                                reply_to_status=reply_to_status, reply_to_userid=reply_to_userid)

            if len(response) == query.top:
                break

        return response
示例#27
0
    def _parse_json_response(query, results):
        """
        Turns the JSON body of a Companycheck reply into an ifind Response.

        The result type comes from the query, falling back to
        DEFAULT_RESULT_TYPE when the query does not set one.

        Args:
            query (ifind Query): object encapsulating details of a search query.
            results : requests library response object containing search results.

        Returns:
            ifind Response: object encapsulating a search request's results.

        Usage:
            Private method.
        """
        response = Response(query.terms)
        records = json.loads(results.text)
        site_root = 'http://companycheck.co.uk/'

        kind = query.result_type if query.result_type else DEFAULT_RESULT_TYPE

        if kind == 'company' or not kind:
            # Company search: one result per company record.
            for company in records:
                name = company[u'name']
                number = company[u'number']
                link = site_root + 'company/' + str(number)
                blurb = Companycheck._build_company_summary(company)

                response.add_result(title=name,
                                    url=link,
                                    summary=blurb,
                                    imageurl=None,
                                    number=number,
                                    country=company[u'country'],
                                    address=company[u'address'],
                                    sic=company[u'sic'],
                                    status=company[u'status'])

        elif kind == 'director':
            # Director search: one result per director record.
            for director in records:
                name = director[u'name']
                number = director[u'number']
                link = site_root + 'director/' + str(number)
                details = Companycheck._build_director_summary(director)

                response.add_result(title=name,
                                    url=link,
                                    summary=details.get('summary'),
                                    imageurl=None,
                                    postcodes=details.get('postcode_list'),
                                    number=number)

        return response