Exemplo n.º 1
0
 def __init__(self, rel_id=None):
     """Initializes the object for the directed communication relationship
     rel_id = (sender address,recipient address) if it is specified.

     When rel_id is None the relationship id stays unset and the message
     collection is left exactly as constructed.
     """
     self._db = DB()
     self._rel_id = None
     self._messages = MessageCollection.MessageCollection()

     # Identity test: None is a singleton, and "is not" cannot be influenced
     # by a custom __eq__/__ne__ on the rel_id object.
     if rel_id is not None:
         self.setRelationshipID(rel_id)
 def __init__(self, rel_id=None):
     """Initializes the object for the directed communication relationship
     rel_id = (sender address,recipient address) if it is specified.

     When rel_id is None the relationship id stays unset and the message
     collection is left exactly as constructed.
     """
     self._db = DB()
     self._rel_id = None
     self._messages = MessageCollection.MessageCollection()

     # Identity test: None is a singleton, and "is not" cannot be influenced
     # by a custom __eq__/__ne__ on the rel_id object.
     if rel_id is not None:
         self.setRelationshipID(rel_id)
Exemplo n.º 3
0
 def __init__(self, rel_id=None):
     """Initializes the object for the undirected communication relationship
     rel_id = (participant address 1,participant address 2) if it is specified.

     When rel_id is None only the empty containers below are created and
     setRelationshipID() is not called.
     """
     self._db = DB()
     self._rel_id = None
     self._directed_relationships = {}
     self._all_messages = MessageCollection.MessageCollection()
     self._thread_factory = MessageThreading.MessageThreadFactory()

     # No threads are built here; the attribute starts out as None.
     self._threads = None

     # Identity test: None is a singleton, and "is not" cannot be influenced
     # by a custom __eq__/__ne__ on the rel_id object.
     if rel_id is not None:
         self.setRelationshipID(rel_id)
Exemplo n.º 4
0
 def __init__(self, ego_address):
     """Initializes the ego network for the given ego email address.

     The alter list is the de-duplicated combination of the addresses
     returned by getSenders() and getRecipients(), queried in that order.
     """
     self._db = DB()
     self._ego_address = ego_address

     # Gather senders first, then append the recipients to the same list.
     contacts = self.getSenders()
     contacts += self.getRecipients()

     # Passing through a set drops addresses that appear in both roles.
     self._alters = list(set(contacts))
Exemplo n.º 5
0
class CommEgoNetwork(object):
    """Ego-centred view of the email communication network.

    The ego is identified by an email address. The alters are the addresses
    that the database wrapper reports as having sent email to, or received
    email from, the ego.
    """

    def __init__(self, ego_address):
        """Creates the database handle and loads the alters for ego_address."""
        self._db = DB()
        self._ego_address = ego_address
        self._refresh_alters()

    def _refresh_alters(self):
        """Rebuilds the de-duplicated alter list for the current ego address."""
        # Senders are queried before recipients, as the constructor always did.
        addresses = self.getSenders()
        addresses.extend(self.getRecipients())

        # Passing through a set drops addresses that appear in both roles.
        self._alters = list(set(addresses))

    def fullyObserved(self):
        """Returns True if the ego network is fully observed."""
        return self._db.fullyObserved(self._ego_address)

    def setEgoAddress(self, ego_address):
        """Sets the ego network email address to the address given.

        The alter list is rebuilt for the new address. Previously it kept the
        contacts of the old ego, so getAlters() and getCommRelationships()
        described the wrong network after a change of address.
        """
        self._ego_address = ego_address
        self._refresh_alters()

    def getSenders(self):
        """Returns a list of email addresses that sent email to the ego."""
        # Each row from the database wrapper carries the address at index 0.
        return [row[0] for row in
                self._db.getSendersForRecipient(self._ego_address)]

    def getRecipients(self):
        """Returns a list of email addresses that received email from the ego."""
        # Each row from the database wrapper carries the address at index 0.
        return [row[0] for row in
                self._db.getRecipientsForSender(self._ego_address)]

    def getAlters(self):
        """Returns a list of all email addresses that exchanged email with the ego."""
        return self._alters

    def getCommRelationships(self):
        """Returns a list of all communication relationships involving the ego.

        One CommRelationship is created per alter, keyed by the tuple
        (alter address, ego address).
        """
        return [
            CommRelationship.CommRelationship((alter, self._ego_address))
            for alter in self._alters
        ]

    def getRelationshipEmailCounts(self):
        """Returns a dictionary containing relationship email count data.

        The top level dictionary has two keys: 'ego' and 'data'. The value
        associated with 'ego' is the ego's email address. The value associated
        with 'data' is a list with one dictionary per communication
        relationship in the ego network. Each of those dictionaries has two
        keys: 'alter' (the alter's email address) and 'counts', a list holding
        [# of messages sent by the ego,
         # of direct messages sent by the ego,
         # of indirect messages sent by the ego,
         # of threaded messages sent by the ego,
         # of messages sent by the alter,
         # of direct messages sent by the alter,
         # of indirect messages sent by the alter,
         # of threaded messages sent by the alter].
        """
        ego = self._ego_address
        ego_data = {'ego': ego, 'data': []}

        for rel in self.getCommRelationships():
            # The alter is whichever participant of the relationship id is
            # not the ego.
            rel_id = rel.getRelationshipID()
            alter = rel_id[1] if rel_id[0] == ego else rel_id[0]

            counts = [
                rel.getNumberOfMsgsFromSender(ego),
                rel.getNumberOfDirectMsgsFromSender(ego),
                rel.getNumberOfIndirectMsgsFromSender(ego),
                rel.getNumberOfThreadedMsgsFromSender(ego),
                rel.getNumberOfMsgsFromSender(alter),
                rel.getNumberOfDirectMsgsFromSender(alter),
                rel.getNumberOfIndirectMsgsFromSender(alter),
                rel.getNumberOfThreadedMsgsFromSender(alter),
            ]
            ego_data['data'].append({'alter': alter, 'counts': counts})

        return ego_data

    def getNumberOfThreadedMsgs(self):
        """Returns the total number of threaded messages exchanged in the ego network.

        Message ids are pooled in a set, so an id that shows up in several
        threads or relationships is counted once.
        """
        mids = set()
        for rel in self.getCommRelationships():
            for thread in rel.getConversationThreads():
                # In-place update avoids copying the whole set per thread.
                mids.update(thread.getMessageIDs())

        return len(mids)
Exemplo n.º 6
0
class DirectedCommRelationship(object):
    """Messages sent from one address to another (a directed relationship).

    The relationship id is a (sender address, recipient address) tuple.
    """

    def __init__(self, rel_id=None):
        """Initializes the object for the directed communication relationship
        rel_id = (sender address,recipient address) if it is specified.
        """
        self._db = DB()
        self._rel_id = None
        self._messages = MessageCollection.MessageCollection()
        if rel_id is not None:
            self.setRelationshipID(rel_id)

    def __str__(self):
        """Creates a display string for the relationship id."""
        return str(self._rel_id)

    def setRelationshipID(self, rel_id):
        """Sets the message collection to those messages associated with the given
        directed relationship.

        The id is stored unconditionally. The message collection is only
        updated when the database lookup returns something other than None.
        """
        self._rel_id = rel_id

        # List of (message id, epoch secs) tuples for the directed relationship
        tlist = self._db.getDirectedCommRelationship(rel_id)

        # None marks a relationship the database does not know about
        if tlist is not None:
            self._messages.setMessageIDs([tup[0] for tup in tlist])

    def getRelationshipID(self):
        """Returns the relationship ID for the directed relationship."""
        return self._rel_id

    def getAllMessages(self):
        """Returns the message collection associated with the directed relationship."""
        return self._messages

    def _check_time_interval(self, time_interval):
        """Raises Exception if a time interval is given and valid_time_interval()
        rejects it. None (no interval) always passes.
        """
        if time_interval is not None and not valid_time_interval(time_interval):
            raise Exception("The specificed time interval is not valid!")

    def _messages_in(self, time_interval):
        """Yields the messages to consider: every message when time_interval is
        None, otherwise those accepted by within_time_interval().
        """
        for msg in self._messages:
            if time_interval is None or within_time_interval(
                    msg.Datetime, time_interval):
                yield msg

    def getSenderTokens(self, time_interval=None):
        """Returns a list of all sender tokens from the set of messages in the directed
        relationship. If a time interval is specified through a tuple of datetimes
        (interval_begin, interval_end), the sender tokens returned correspond to the
        messages that were sent within the time interval.

        Raises Exception if the time interval is not valid.
        """
        self._check_time_interval(time_interval)

        tokens = []
        for msg in self._messages_in(time_interval):
            tokens.extend(msg.getSenderTokens())
        return tokens

    def getNumberOfMsgs(self, time_interval=None):
        """Returns the number of messages associated with the directed relationship.
        If a time interval is specified through a tuple of datetimes (interval_begin,
        interval_end), the count returned corresponds to the number of messages that
        were sent within the time interval.

        The count is delegated to the message collection.
        """
        return self._messages.getNumberOfMsgs(time_interval)

    def getNumberOfDirectMsgs(self, time_interval=None):
        """Returns the number of messages where the recipient is listed in the TO field.
        If a time interval is specified through a tuple of datetimes (interval_begin,
        interval_end), the count returned corresponds to the number of relevant messages
        that were sent within the time interval.

        Raises Exception if the time interval is not valid.
        """
        self._check_time_interval(time_interval)

        # The recipient is the second element of the relationship id
        recip = self._rel_id[1]
        return sum(1 for msg in self._messages_in(time_interval)
                   if recip in msg.TO)

    def getNumberOfIndirectMsgs(self, time_interval=None):
        """Returns the number of messages where the recipient is listed in either the CC
        or BCC field. If the recipient is listed in the TO field as well, the TO field takes
        precedence. If a time interval is specified through a tuple of datetimes
        (interval_begin, interval_end), the count returned corresponds to the number of
        relevant messages that were sent within the time interval.

        Raises Exception if the time interval is not valid.
        """
        self._check_time_interval(time_interval)

        # The recipient is the second element of the relationship id
        recip = self._rel_id[1]
        return sum(
            1 for msg in self._messages_in(time_interval)
            if recip not in msg.TO and (recip in msg.CC or recip in msg.BCC)
        )
Exemplo n.º 7
0
 def __init__(self, ego_address):
     """Initializes the ego network for the given ego email address.

     The alter list is the de-duplicated combination of the addresses
     returned by getSenders() and getRecipients(), queried in that order.
     """
     self._db = DB()
     self._ego_address = ego_address

     # Gather senders first, then append the recipients to the same list.
     contacts = self.getSenders()
     contacts += self.getRecipients()

     # Passing through a set drops addresses that appear in both roles.
     self._alters = list(set(contacts))
Exemplo n.º 8
0
class CommEgoNetwork(object):
    """Ego-centred view of the email communication network.

    The ego is identified by an email address. The alters are the addresses
    that the database wrapper reports as having sent email to, or received
    email from, the ego.
    """

    def __init__(self, ego_address):
        """Creates the database handle and loads the alters for ego_address."""
        self._db = DB()
        self._ego_address = ego_address
        self._refresh_alters()

    def _refresh_alters(self):
        """Rebuilds the de-duplicated alter list for the current ego address."""
        # Senders are queried before recipients, as the constructor always did.
        addresses = self.getSenders()
        addresses.extend(self.getRecipients())

        # Passing through a set drops addresses that appear in both roles.
        self._alters = list(set(addresses))

    def fullyObserved(self):
        """Returns True if the ego network is fully observed."""
        return self._db.fullyObserved(self._ego_address)

    def setEgoAddress(self, ego_address):
        """Sets the ego network email address to the address given.

        The alter list is rebuilt for the new address. Previously it kept the
        contacts of the old ego, so getAlters() and getCommRelationships()
        described the wrong network after a change of address.
        """
        self._ego_address = ego_address
        self._refresh_alters()

    def getSenders(self):
        """Returns a list of email addresses that sent email to the ego."""
        # Each row from the database wrapper carries the address at index 0.
        return [row[0] for row in
                self._db.getSendersForRecipient(self._ego_address)]

    def getRecipients(self):
        """Returns a list of email addresses that received email from the ego."""
        # Each row from the database wrapper carries the address at index 0.
        return [row[0] for row in
                self._db.getRecipientsForSender(self._ego_address)]

    def getAlters(self):
        """Returns a list of all email addresses that exchanged email with the ego."""
        return self._alters

    def getCommRelationships(self):
        """Returns a list of all communication relationships involving the ego.

        One CommRelationship is created per alter, keyed by the tuple
        (alter address, ego address).
        """
        return [
            CommRelationship.CommRelationship((alter, self._ego_address))
            for alter in self._alters
        ]

    def getRelationshipEmailCounts(self):
        """Returns a dictionary containing relationship email count data.

        The top level dictionary has two keys: 'ego' and 'data'. The value
        associated with 'ego' is the ego's email address. The value associated
        with 'data' is a list with one dictionary per communication
        relationship in the ego network. Each of those dictionaries has two
        keys: 'alter' (the alter's email address) and 'counts', a list holding
        [# of messages sent by the ego,
         # of direct messages sent by the ego,
         # of indirect messages sent by the ego,
         # of threaded messages sent by the ego,
         # of messages sent by the alter,
         # of direct messages sent by the alter,
         # of indirect messages sent by the alter,
         # of threaded messages sent by the alter].
        """
        ego = self._ego_address
        ego_data = {'ego': ego, 'data': []}

        for rel in self.getCommRelationships():
            # The alter is whichever participant of the relationship id is
            # not the ego.
            rel_id = rel.getRelationshipID()
            alter = rel_id[1] if rel_id[0] == ego else rel_id[0]

            counts = [
                rel.getNumberOfMsgsFromSender(ego),
                rel.getNumberOfDirectMsgsFromSender(ego),
                rel.getNumberOfIndirectMsgsFromSender(ego),
                rel.getNumberOfThreadedMsgsFromSender(ego),
                rel.getNumberOfMsgsFromSender(alter),
                rel.getNumberOfDirectMsgsFromSender(alter),
                rel.getNumberOfIndirectMsgsFromSender(alter),
                rel.getNumberOfThreadedMsgsFromSender(alter),
            ]
            ego_data['data'].append({'alter': alter, 'counts': counts})

        return ego_data

    def getNumberOfThreadedMsgs(self):
        """Returns the total number of threaded messages exchanged in the ego network.

        Message ids are pooled in a set, so an id that shows up in several
        threads or relationships is counted once.
        """
        mids = set()
        for rel in self.getCommRelationships():
            for thread in rel.getConversationThreads():
                # In-place update avoids copying the whole set per thread.
                mids.update(thread.getMessageIDs())

        return len(mids)
class DirectedCommRelationship(object):
    """Messages sent from one address to another (a directed relationship).

    The relationship id is a (sender address, recipient address) tuple.
    """

    def __init__(self, rel_id=None):
        """Initializes the object for the directed communication relationship
        rel_id = (sender address,recipient address) if it is specified.
        """
        self._db = DB()
        self._rel_id = None
        self._messages = MessageCollection.MessageCollection()
        if rel_id is not None:
            self.setRelationshipID(rel_id)

    def __str__(self):
        """Creates a display string for the relationship id."""
        return str(self._rel_id)

    def setRelationshipID(self, rel_id):
        """Sets the message collection to those messages associated with the given
        directed relationship.

        The id is stored unconditionally. The message collection is only
        updated when the database lookup returns something other than None.
        """
        self._rel_id = rel_id

        # List of (message id, epoch secs) tuples for the directed relationship
        tlist = self._db.getDirectedCommRelationship(rel_id)

        # None marks a relationship the database does not know about
        if tlist is not None:
            self._messages.setMessageIDs([tup[0] for tup in tlist])

    def getRelationshipID(self):
        """Returns the relationship ID for the directed relationship."""
        return self._rel_id

    def getAllMessages(self):
        """Returns the message collection associated with the directed relationship."""
        return self._messages

    def _check_time_interval(self, time_interval):
        """Raises Exception if a time interval is given and valid_time_interval()
        rejects it. None (no interval) always passes.
        """
        if time_interval is not None and not valid_time_interval(time_interval):
            raise Exception("The specificed time interval is not valid!")

    def _messages_in(self, time_interval):
        """Yields the messages to consider: every message when time_interval is
        None, otherwise those accepted by within_time_interval().
        """
        for msg in self._messages:
            if time_interval is None or within_time_interval(
                    msg.Datetime, time_interval):
                yield msg

    def getSenderTokens(self, time_interval=None):
        """Returns a list of all sender tokens from the set of messages in the directed
        relationship. If a time interval is specified through a tuple of datetimes
        (interval_begin, interval_end), the sender tokens returned correspond to the
        messages that were sent within the time interval.

        Raises Exception if the time interval is not valid.
        """
        self._check_time_interval(time_interval)

        tokens = []
        for msg in self._messages_in(time_interval):
            tokens.extend(msg.getSenderTokens())
        return tokens

    def getNumberOfMsgs(self, time_interval=None):
        """Returns the number of messages associated with the directed relationship.
        If a time interval is specified through a tuple of datetimes (interval_begin,
        interval_end), the count returned corresponds to the number of messages that
        were sent within the time interval.

        The count is delegated to the message collection.
        """
        return self._messages.getNumberOfMsgs(time_interval)

    def getNumberOfDirectMsgs(self, time_interval=None):
        """Returns the number of messages where the recipient is listed in the TO field.
        If a time interval is specified through a tuple of datetimes (interval_begin,
        interval_end), the count returned corresponds to the number of relevant messages
        that were sent within the time interval.

        Raises Exception if the time interval is not valid.
        """
        self._check_time_interval(time_interval)

        # The recipient is the second element of the relationship id
        recip = self._rel_id[1]
        return sum(1 for msg in self._messages_in(time_interval)
                   if recip in msg.TO)

    def getNumberOfIndirectMsgs(self, time_interval=None):
        """Returns the number of messages where the recipient is listed in either the CC
        or BCC field. If the recipient is listed in the TO field as well, the TO field takes
        precedence. If a time interval is specified through a tuple of datetimes
        (interval_begin, interval_end), the count returned corresponds to the number of
        relevant messages that were sent within the time interval.

        Raises Exception if the time interval is not valid.
        """
        self._check_time_interval(time_interval)

        # The recipient is the second element of the relationship id
        recip = self._rel_id[1]
        return sum(
            1 for msg in self._messages_in(time_interval)
            if recip not in msg.TO and (recip in msg.CC or recip in msg.BCC)
        )
Exemplo n.º 10
0
 def __init__(self, message_id=None):
     """Initializes the message object and, if message_id is specified,
     loads that message through setMessageID().
     """
     self._db = DB()

     # Identity test: None is a singleton, and "is not" cannot be influenced
     # by a custom __eq__/__ne__ on the message_id object.
     if message_id is not None:
         self.setMessageID(message_id)
Exemplo n.º 11
0
class Message(object):

    def __init__(self, message_id=None):
        """Initializes the message object and, if message_id is specified,
        loads that message through setMessageID().
        """
        self._db = DB()

        # Identity test: None is a singleton, and "is not" cannot be influenced
        # by a custom __eq__/__ne__ on the message_id object.
        if message_id is not None:
            self.setMessageID(message_id)
        
    def __str__(self):
        """Creates a display string for printing the specified message attributes."""
    
        # Define the property order for display
        disp_props = ['MessageID', 'Datetime', 'EpochSecs', 'Sender', 'TO', 'CC', 
                      'BCC', 'Subject', 'Body']
    
        # Assemble the display string
        disp_string = ''
        for prop in disp_props:
        
            # Get the attribute if it exists
            try:
                attr = getattr(self,prop)
            except AttributeError:
                attr = 'not defined'
            
            # If the attribute is a list, build the string representation
            if type(attr) == list:
                astr = ''
                for item in attr:
                    astr += str(item) + ', '
                attr = astr[:-2]
            
            # Append to the display string
            disp_string += prop + ' : ' + str(attr) + '\n'
   
        return disp_string
        
    def setMessageID(self,message_id):
        """Sets the message object attributes to those returned by db.getMessage()."""
        
        # Get the message properties
        msg = self._db.getMessage(message_id)
        
        # Set the object properties
        for k,v in msg.items():
            setattr(self, k, v)
            
    def _trim_at_first_substring(self,sub,s):
        """Finds the first occurrence of sub in s. If sub is present, s is trimmed at the 
        starting location of sub and returned."""
        idx = s.find(sub)
        if idx > -1:
            s = s[:idx]
        return s
    
    def getSenderText(self):
        """Returns the filtered message body with text from previous messages removed.""" 
        
        # Get the message body
        body = self.Body

        # The following are heuristics for identifying sender text in the Enron email corpus
        
        # Remove the original message text if present
        body = self._trim_at_first_substring('-----Original Message-----',body)
        
        # Remove forwarded message text if present
        body = self._trim_at_first_substring('---------------------- Forwarded by',body)
        body = self._trim_at_first_substring('From:',body)
        body = self._trim_at_first_substring('To:',body)
        
        # Remove meeting text
        body = self._trim_at_first_substring('-----Original Appointment-----',body)
        
        # Remove the BlackBerry signature if present
        body = self._trim_at_first_substring('--------------------------\nSent from my BlackBerry Wireless Handheld',body)
        
        # remove random =20 entries in the message body
        body = re.sub(r'=20','',body)
    
        # remove random = that appear in the middle, at the beginning and at
        # the end of words
        body = re.sub(r'\b=\b','',body)
        body = re.sub(r'=\b','',body)
        body = re.sub(r'\b=','',body)        
        
        return body
        
    def getSenderTokens(self,lower=True):
        """Returns a list of tokens derived from the sender's text in the message body.
        If lower = True, the tokens will be returned in all lowercase."""

        # The regular expression defining the tokenizer.
        # Extracts sequences with <one or more letters>'<one or more letters> OR
        # <one or more letters>
        # The groups are non-capturing: with capturing groups a findall-based
        # tokenizer hands back tuples of groups instead of the matched strings.
        regexp = r"(?:[a-zA-Z]+'[a-zA-Z]+)|(?:[a-zA-Z]+)"

        # Extract the tokens
        tokens = nltk.regexp_tokenize(self.getSenderText(), regexp)

        # Lowercase the tokens if necessary. A list comprehension keeps the
        # documented list return type (map() yields an iterator on Python 3).
        if lower:
            tokens = [token.lower() for token in tokens]

        return tokens