예제 #1
0
    def data(self, all_fields=True):
        """
            Build a dictionary describing this quotation and its context.

            A Document is created for both the citing and the cited URL, and
            a QuoteContext is built from ``self.citing_quote`` and the text of
            each document. Their fields are copied into the result under
            ``citing_``/``cited_`` prefixed keys.

            Args:
                all_fields: when True (default) every computed field is
                    returned; when False the bulky/diagnostic fields (raw
                    documents, full text, positions, elapsed time) are
                    removed so a smaller subset can be uploaded to the cloud.

            Returns:
                dict containing 'sha1', 'citing_url', 'cited_url', the
                prefixed document and quote-context fields,
                'citing_raw'/'cited_raw' (only if ``self.raw_output`` is
                truthy) and 'create_elapsed_time' (seconds since
                ``self.start_time``, formatted as a string with 5 decimals).
        """

        data_dict = {
            'sha1': self.hash(),
            'citing_url': self.citing_url,
            'cited_url': self.cited_url,
        }

        # Get text version of document
        citing_doc = Document(self.citing_url)
        cited_doc = Document(self.cited_url)

        # Fields copied from Document.data() and QuoteContext.data()
        document_fields = ('doc_type', 'text')
        quote_context_fields = (
            'context_before', 'context_after',
            'quote_length',
            'quote',
            'quote_start_position', 'quote_end_position',
            'context_start_position', 'context_end_position',
        )

        if self.raw_output:
            data_dict['citing_raw'] = citing_doc.raw()
            data_dict['cited_raw'] = cited_doc.raw()

        # Populate document fields. data() is fetched once per document
        # instead of once per field.
        # NOTE(review): assumes Document.data() is side-effect free and
        # returns the same mapping on every call -- confirm.
        citing_doc_data = citing_doc.data()
        cited_doc_data = cited_doc.data()
        for doc_field in document_fields:
            data_dict['citing_' + doc_field] = citing_doc_data[doc_field]
            data_dict['cited_' + doc_field] = cited_doc_data[doc_field]

        # Find context of quote from within text. The same quote
        # (self.citing_quote) is looked up in both documents.
        citing_context = QuoteContext(self.citing_quote, citing_doc.text())
        cited_context = QuoteContext(self.citing_quote, cited_doc.text())

        # NOTE(review): same assumption as above for QuoteContext.data().
        citing_context_data = citing_context.data()
        cited_context_data = cited_context.data()
        for field in quote_context_fields:
            data_dict['citing_' + field] = citing_context_data[field]
            data_dict['cited_' + field] = cited_context_data[field]

        # Stop Elapsed Timer
        elapsed_time = time.time() - self.start_time
        data_dict['create_elapsed_time'] = format(elapsed_time, '.5f')

        if not all_fields:
            # NOTE(review): 'cited_quote_length' is not in this list while
            # 'citing_quote_length' is -- kept as in the original; confirm
            # whether that asymmetry is intended.
            excluded_fields = (
                'cited_raw', 'citing_raw',
                'citing_text', 'cited_text',
                'citing_quote_length',
                'cited_quote_start_position', 'citing_quote_start_position',
                'cited_quote_end_position', 'citing_quote_end_position',
                'cited_context_start_position',
                'citing_context_start_position',
                'cited_context_end_position', 'citing_context_end_position',
                'create_elapsed_time',
            )

            for excluded_field in excluded_fields:
                # The raw fields only exist when self.raw_output is set, so
                # a default is required to avoid a KeyError here.
                data_dict.pop(excluded_field, None)

        return data_dict
예제 #2
0
 def raw(self):
     """Return the result of ``raw()`` on a Document built from ``self.url``."""
     return Document(self.url).raw()