def data(self, all_fields=True):
    """Build the record describing this quote and its context.

    Creates a ``Document`` for both the citing and the cited URL, copies
    each document's ``doc_type`` and ``text`` values, then uses
    ``QuoteContext`` to look up ``self.citing_quote`` in each document's
    text and records the resulting context fields.

    Args:
        all_fields: When True (default), return every computed field.
            When False, drop the raw/text/position/timing fields so a
            smaller subset can be uploaded to the cloud.

    Returns:
        dict: ``'sha1'``, ``'citing_url'`` and ``'cited_url'`` plus
        ``'citing_<field>'`` / ``'cited_<field>'`` entries. Also holds
        ``'create_elapsed_time'`` (seconds since ``self.start_time``,
        formatted with five decimals) unless ``all_fields`` is False.
    """
    data_dict = {
        'sha1': self.hash(),
        'citing_url': self.citing_url,
        'cited_url': self.cited_url,
    }

    # Get text version of document
    citing_doc = Document(self.citing_url)
    cited_doc = Document(self.cited_url)

    # Populate context fields with Document methods
    document_fields = ['doc_type', 'text']
    # NOTE: 'quote_length' is intentionally not collected at the moment.
    quote_context_fields = [
        'context_before', 'context_after',
        'quote',
        'quote_start_position', 'quote_end_position',
        'context_start_position', 'context_end_position',
    ]

    if self.raw_output:
        data_dict['citing_raw'] = citing_doc.raw()
        data_dict['cited_raw'] = cited_doc.raw()

    # Fetch each document's data once instead of once per copied field.
    citing_doc_data = citing_doc.data()
    cited_doc_data = cited_doc.data()
    for doc_field in document_fields:
        data_dict['citing_' + doc_field] = citing_doc_data[doc_field]
        data_dict['cited_' + doc_field] = cited_doc_data[doc_field]

    # Find context of quote from within text. The same quote
    # (self.citing_quote) is searched for in both documents.
    citing_context = QuoteContext(self.citing_quote, citing_doc.text())
    cited_context = QuoteContext(self.citing_quote, cited_doc.text())
    citing_context_data = citing_context.data()
    cited_context_data = cited_context.data()
    for field in quote_context_fields:
        data_dict['citing_' + field] = citing_context_data[field]
        data_dict['cited_' + field] = cited_context_data[field]

    # Stop Elapsed Timer
    elapsed_time = time.time() - self.start_time
    data_dict['create_elapsed_time'] = format(elapsed_time, '.5f')

    if not all_fields:
        # Not excluded: 'cited_cache_url', 'cited_archive_url'
        excluded_fields = [
            'cited_raw', 'citing_raw',
            'citing_text', 'cited_text',
            'citing_quote_length',
            'cited_quote_start_position', 'citing_quote_start_position',
            'cited_quote_end_position', 'citing_quote_end_position',
            'cited_context_start_position', 'citing_context_start_position',
            'cited_context_end_position', 'citing_context_end_position',
            'create_elapsed_time',
        ]
        for excluded_field in excluded_fields:
            # Some of these keys are optional (the raw fields exist only
            # when self.raw_output is set, and quote_length is not
            # collected), so a missing key must not raise KeyError.
            data_dict.pop(excluded_field, None)

    return data_dict
def raw(self):
    """Return the result of ``Document(self.url).raw()``."""
    return Document(self.url).raw()