Пример #1
0
def handle_work_item(processor, item):
    """ Render the PDF for the document behind a work item and store it.

        The work item and its local temp directory are provided and
        cleaned up by the process driver framework; when this function
        does not raise, the work item is also removed from the work
        queue.

        The PDF is only rendered once every page of the document has a
        PAGE_TEXT asset; otherwise NotReadyException is raised.  If
        handle_trial_account() reports that it handled the document,
        nothing is stored and None is returned.  Otherwise the rendered
        PDF replaces the content of the document's existing PDF asset,
        or a new PDF asset is created, and a one-element list holding
        that asset is returned.

    """
    source_asset = item['Asset-Instance']
    document = source_asset.related_document

    # Every page needs its OCR text before the PDF can be assembled.
    ocr_pages = document.pages.filter(
        assets__asset_class__name=models.AssetClass.PAGE_TEXT)
    num_ocr_pages = ocr_pages.count()
    if document.num_pages != num_ocr_pages:
        raise NotReadyException(
            "Postponing PDF generation, OCR not complete for pages")

    # Render into a scratch buffer, then copy the bytes into a fresh
    # stream for the consumers below.
    rendered = pdf.render_document(
        document,
        output_buffer=StringIO(),
        username=document.owner.username,
        title=document.title)
    pdf_stream = StringIO(rendered.getvalue())

    # Trial accounts get the PDF as an attachment and all associated
    # assets are deleted, so there is nothing left to store here.
    if handle_trial_account(document, pdf_stream.getvalue()):
        return

    # Classify the document based on the creation time of its PDF asset.
    aggregate_window = datetime.timedelta(0, UPLOAD_AGGREGATE_TIME_TRESHOLD)
    tag_document(document, aggregate_window)

    existing_pdfs = document.assets.filter(
        asset_class__name=models.AssetClass.DOCUMENT,
        mime_type__name=models.MimeType.PDF)

    if len(existing_pdfs) == 0:
        # First PDF for this document: create the asset from scratch.
        pdf_asset = operations.create_asset_from_stream(
            data_stream=pdf_stream,
            owner=item['Owner'],
            producer=processor,
            asset_class=models.AssetClass.DOCUMENT,
            related_document=document,
            file_name=document.title,
            parent=source_asset,
            child_number=1,
            mime_type=models.MimeType.PDF)
    else:
        # Re-use the first existing PDF asset and overwrite its content.
        pdf_asset = existing_pdfs[0]
        pdf_asset.producer = processor
        operations.upload_asset_stream(pdf_asset, pdf_stream)

    return [pdf_asset]
Пример #2
0
def handle_work_item(processor, item):
    """ Generate and store the PDF for a work item's document.

        The process driver framework provides the work item and cleans
        up its local temp directory.  If this function does not raise an
        exception the work item is also removed from the work queue.

        Raises NotReadyException while the number of pages with a
        PAGE_TEXT asset differs from the document's page count.  Returns
        None when handle_trial_account() handled the document, otherwise
        a one-element list with the updated or newly created PDF asset.

    """
    parent_asset = item["Asset-Instance"]
    document = parent_asset.related_document

    pages_with_text = document.pages.filter(
        assets__asset_class__name=models.AssetClass.PAGE_TEXT
    )
    if document.num_pages != pages_with_text.count():
        raise NotReadyException("Postponing PDF generation, OCR not complete for pages")

    render_buffer = pdf.render_document(
        document,
        output_buffer=StringIO(),
        username=document.owner.username,
        title=document.title,
    )
    pdf_stream = StringIO(render_buffer.getvalue())

    # trial account handling -- the PDF is sent as an attachment and all
    # associated assets are deleted
    trial_handled = handle_trial_account(document, pdf_stream.getvalue())
    if trial_handled:
        return

    # classify the document based on the creation time of its PDF asset
    tag_document(
        document, datetime.timedelta(seconds=UPLOAD_AGGREGATE_TIME_TRESHOLD)
    )

    pdf_assets = document.assets.filter(
        asset_class__name=models.AssetClass.DOCUMENT,
        mime_type__name=models.MimeType.PDF,
    )

    # An existing PDF asset is refreshed in place ...
    if len(pdf_assets) != 0:
        current_pdf = pdf_assets[0]
        current_pdf.producer = processor
        operations.upload_asset_stream(current_pdf, pdf_stream)
        return [current_pdf]

    # ... otherwise a new one is created as a child of the work item's asset.
    new_pdf = operations.create_asset_from_stream(
        data_stream=pdf_stream,
        owner=item["Owner"],
        producer=processor,
        asset_class=models.AssetClass.DOCUMENT,
        related_document=document,
        file_name=document.title,
        parent=parent_asset,
        child_number=1,
        mime_type=models.MimeType.PDF,
    )
    return [new_pdf]
Пример #3
0
    def test_create_asset_from_stream(self):
        """ Create an asset from an in-memory stream, publish it as a
            work item and validate it through self._validate_consumer().

        """
        asset = operations.create_asset_from_stream(
            owner        = self.user,
            producer     = self.producer,
            asset_class  = 'test_data',
            data_stream  = StringIO(TEST_DATA),
            file_name    = 'create_asset_from_string.txt',
            child_number = 0,
            mime_type    = 'text/plain' )

        # assertTrue replaces the deprecated assert_ alias, which newer
        # unittest versions no longer provide.
        self.assertTrue( asset is not None )

        operations.publish_work_item(asset)

        self._validate_consumer(asset, TEST_DATA)
Пример #4
0
    def test_tag_documents_by_time(self):
        """ Check how pdf_generator.tag_document() groups documents.

            Expectations encoded below:
              * doc1 gets exactly one tag, of class UPLOAD_AGGREGATE;
              * doc2, tagged 3 seconds later with a 1 second threshold,
                gets exactly one tag with a different label;
              * doc3, tagged another 3 seconds later with a 10 second
                threshold, gets the same tag as doc2.

        """
        def attach_pdf(document):
            # Give the document a PDF asset; every document in this test
            # receives an identical one.
            return operations.create_asset_from_stream(
                owner            = self.user,
                producer         = self.producer,
                asset_class      = models.AssetClass.DOCUMENT,
                data_stream      = StringIO('some pdf'),
                file_name        = 'create_asset_from_string.txt',
                child_number     = 1,
                related_document = document,
                mime_type        = models.MimeType.PDF )

        # create an unclassified document
        doc0 = operations.create_document( owner = self.user )
        attach_pdf(doc0)

        sleep(2)

        doc1 = operations.create_document( owner = self.user )
        pdf_generator.tag_document(doc1, datetime.timedelta(0, 1))
        attach_pdf(doc1)

        # do we have a new tag?
        self.assertEqual( doc1.tags.all().count(), 1 )
        tag1 = doc1.tags.all()[0]
        self.assertEqual( tag1.tag_class, models.Tag.UPLOAD_AGGREGATE )

        # sleep 3 sec
        sleep(3)

        doc2 = operations.create_document( owner = self.user )
        pdf_generator.tag_document(doc2, datetime.timedelta(0, 1))

        # is the second document tagged with a different tag?
        self.assertEqual( doc2.tags.all().count(), 1 )
        tag2 = doc2.tags.all()[0]
        self.assertNotEqual( tag2.label, tag1.label )

        attach_pdf(doc2)

        # sleep another 3 seconds and create the 3rd document,
        # but with a longer threshold
        sleep(3)

        doc3 = operations.create_document( owner = self.user )
        pdf_generator.tag_document(doc3, datetime.timedelta(0, 10))

        # did this one get tagged with the same tag?  The original only
        # re-checked doc2 here; doc3 is the document under test, so it
        # is checked as well before its first tag is read.
        self.assertEqual( doc2.tags.all().count(), 1 )
        self.assertEqual( doc3.tags.all().count(), 1 )
        self.assertEqual( tag2, doc3.tags.all()[0] )

        attach_pdf(doc3)
Пример #5
0
def handle_work_item( processor, work_item ):
    """ Pick up an uploaded email and break out each attachment we understand
        into its own upload work item.

        processor -- passed as the producer of every asset created here.
        work_item -- mapping providing 'Asset-Instance' (the asset of the
                     uploaded email) and 'Local-Path' (path of the file
                     holding the raw message).

        The message is parsed, its sender and To/Cc recipients are
        recorded, a Message row is fetched or created and marked READY,
        the owner's rules are applied, and an asset is created for each
        message part that has a known MIME type and a non-empty payload.
        Newly created assets are collected in asset_list.
    """
    asset_list   = []
    parent_asset = work_item['Asset-Instance']
    owner        = parent_asset.owner
    upload_class = manager(AssetClass).get( name = AssetClass.UPLOAD )
    message_part_class, created = manager(AssetClass).get_or_create(
                                        name = AssetClass.MESSAGE_PART )
    counter      = 0

    # Only the get-or-create side effect is needed; the rule itself is
    # not used further in this function.
    MessageRule.objects.get_or_create(
                                    owner = owner,
                                    type = MessageRule.CONVERSATION)

    # Close the file explicitly instead of leaking the handle.
    message_file = open(work_item['Local-Path'])
    try:
        raw_message = email.message_from_file(message_file)
    finally:
        message_file.close()

    # create the record for the sender
    sender = header_to_unicode(raw_message['From'])
    subject = header_to_unicode(raw_message['Subject'])
    in_reply_to = raw_message['In-Reply-To']
    message_date = raw_message['Date']
    message_id = raw_message['Message-ID']

    sender_name, sender_address_string = email.utils.parseaddr(sender)
    sender_name = sender_name.strip('\'')

    sender_address, sender_contact = get_or_create_adddress_and_contact(
                                        sender_name,
                                        sender_address_string,
                                        owner)

    # NOTE: conversation lookup/creation and classification are
    # currently disabled in this handler.

    # get the mailbox account address from asset filename
    # this isn't the same thing as "To:" address because the mail
    # may be sent as BCC and arrive to one of many mailboxes
    # associated to a single account
    mailbox_address = get_mailbox_address_from_asset(parent_asset)

    # The Date header may be missing, and parsedate() returns None when
    # it cannot parse the value; fall back to "now" in both cases rather
    # than letting time.mktime(None) raise.
    parsed_date = None
    if message_date:
        parsed_date = email.utils.parsedate(message_date)
    if parsed_date:
        message_date = datetime.datetime.fromtimestamp(
                                        time.mktime(parsed_date))
    else:
        message_date = datetime.datetime.now()

    in_reply_to_message = None
    if in_reply_to:
        in_reply_to_message, created = Message.objects.get_or_create(
                owner = owner,
                message_id = in_reply_to,
                mailbox_address = mailbox_address,
                defaults = {'status' : Message.STATUS_REFERENCE} )

    # A message could have been created based on a Reference from another
    # message. In this case it will only have owner, message_id and mailbox_address

    # This message starts off as REFERENCE (i.e. incomplete)
    message, created = Message.objects.get_or_create(
                    owner = owner,
                    message_id = message_id,
                    mailbox_address = mailbox_address,
                    defaults = {'status' : Message.STATUS_REFERENCE} )
    message.subject = subject or ''
    message.date = message_date
    message.reply_to = in_reply_to_message
    message.sender_address = sender_address

    # TODO: tag unread messages

    # NOTE(review): Cc recipients are stored in to_addresses together
    # with the To recipients, exactly as before -- confirm whether a
    # separate relation was intended.
    for header_name in ('to', 'cc'):
        header_values = raw_message.get_all(header_name)
        if not header_values:
            continue
        for recipient in email.utils.getaddresses(header_values):
            address, contact = get_or_create_adddress_and_contact(
                                                  recipient[0].strip('\''),
                                                  recipient[1],
                                                  owner)
            message.to_addresses.add(address)

    # Now that the message is completed, set its status to READY
    message.status = Message.STATUS_READY
    message.save()

    apply_rules(owner, message, raw_message)

    # tag all aggregates of this message
    tag = get_or_create_tag_from_asset(parent_asset)
    if tag:
        for aggregate in message.aggregates.all():
            aggregate.tags.add(tag)
            aggregate.save()


    for part in raw_message.walk():
        content_type = part.get_content_type()
        logging.info('part.get_content_type()=%s', content_type)
        # counter advances for every part, including skipped ones, so it
        # identifies the part's position in the walk.
        counter += 1

        known_types = manager(MimeType).filter(name = content_type)
        if len(known_types) == 0:
            logging.info('%s type is unrecognized. Ignoring this message part.', content_type)
            continue
        mime_type = known_types[0]

        # Decode the transfer encoding once and reuse the result below.
        raw_payload = part.get_payload(decode=True)
        if not raw_payload:
            logging.info('No payload of part %s. Ignoring this message part.', content_type)
            continue

        # Parts without consumers are only logged, not skipped.
        if not upload_class.has_consumers(content_type):
            logging.info('%s has no consumers for %s(%s)', upload_class, mime_type.name, content_type)

        file_name = part.get_filename() or 'part-%04d.%s' % (
            counter,
            mime_type.extension )

        # Added decode call to avoid errors like
        # UnicodeDecodeError: 'ascii' codec can't decode
        # byte 0x91 in position 3474: ordinal not in range(128)
        #TODO: figure out why
        payload = raw_payload.decode('utf8', 'ignore')

        # The summary is overwritten for each accepted part, so the last
        # one wins.
        message.summary = generate_conversation_summary(payload, mime_type)
        message.save()

        # We could already have an asset for this message. This happens when the same
        # message shows up in multiple IMAP folders eg. like Gmail does with labels
        try:
            asset = Asset.objects.get(owner = owner,
                                      child_number = counter,
                                      parent = parent_asset,
                                      asset_class = message_part_class)
        except Asset.DoesNotExist:
            asset_list.append(
                operations.create_asset_from_stream(
                    data_stream  = StringIO(raw_payload),
                    owner        = owner,
                    producer     = processor,
                    asset_class  = message_part_class,
                    file_name    = file_name,
                    parent       = parent_asset,
                    child_number = counter,
                    mime_type    = mime_type,
                    related_message = message ))