Пример #1
0
    def test_replace_white_space( self ):

        '''
        Checks that StringHelper.replace_white_space(), called with a single
            space as the replacement, turns a string containing runs of
            multiple spaces into the same words separated by single spaces.
        '''

        # input with runs of 2 to 5 spaces between words, and the result we
        #     expect once each run has been collapsed.
        raw_text = "one bird   two  shoes     and a cat"
        collapsed_text = "one bird two shoes and a cat"

        # call the method under test.
        actual_text = StringHelper.replace_white_space( raw_text, replace_with_IN = " " )

        # compare what we got to what we expected.
        self.assertEqual( actual_text, collapsed_text )
Пример #2
0
                # it is text - convert it to string.
                current_paragraph_text = unicode( paragraph_element )
            
            else:
            
                # not text - just grab all the text out of it.
                #current_paragraph_text = ' '.join( paragraph_element.findAll( text = True ) )
                current_paragraph_text = HTMLHelper.remove_html( str( paragraph_element ) )
                
            #-- END check to see if current element is text. --#

            # clean up - convert HTML entities
            current_paragraph_text = bs_helper.convert_html_entities( current_paragraph_text )
            
            # strip out extra white space
            current_paragraph_text = StringHelper.replace_white_space( current_paragraph_text )
            
            # got any paragraph text?
            current_paragraph_text = current_paragraph_text.strip()
            if ( ( current_paragraph_text != None ) and ( current_paragraph_text != "" ) ):
            
                # yes.  Add to paragraph text.
                paragraph_text_list.append( current_paragraph_text )
                
            #-- END check to see if any text. --#
        
        #-- END loop over paragraph elements. --#
        
        # convert paragraph list to string
        paragraph_text = ' '.join( paragraph_text_list )
        
Пример #3
0
    def get_unique_mention_string_list(self,
                                       replace_white_space_IN=False,
                                       *args,
                                       **kwargs):
        '''
        Collects the value of every mention attached to any
            DataSetCitationData row whose data_set_citation points at this
            instance, removes duplicates, and returns the values as a sorted
            list.

        Parameters:
        - replace_white_space_IN - when equal to True, each value that is not
            already collected is passed through
            StringHelper.replace_white_space() before it is stored.
            Defaults to False.
        - *args, **kwargs - accepted, but not used in this method.

        Returns:
        - list of the distinct mention values, sorted in ascending order.
        '''

        # distinct values collected so far.
        unique_values = set()

        # citation data rows related to this data set.
        citation_data_rows = DataSetCitationData.objects.filter(
            data_set_citation__data_set=self)

        for citation_row in citation_data_rows:

            # walk every mention hanging off of this citation data row.
            for current_mention in citation_row.datasetmention_set.all():

                value = current_mention.value

                # already collected?  Nothing more to do for this mention.
                #     Note that membership is tested on the raw value, before
                #     any white space clean-up.
                if value in unique_values:
                    continue

                #-- END check to see if already collected. --#

                # explicit comparison to True is kept on purpose, so only
                #     values equal to True turn on the clean-up.
                if replace_white_space_IN == True:
                    value = StringHelper.replace_white_space(value)

                #-- END check to see if we clean up white space. --#

                unique_values.add(value)

            #-- END loop over mentions. --#

        #-- END loop over citation data related to current data set --#

        # hand back the distinct values as a sorted list.
        return sorted(unique_values)