示例#1
0
def test_apply_features():
    # Verifies the per-line feature matrix returned by `fs.apply_features`.
    sender = "John <*****@*****.**>"
    body = """This is John Doe

Tuesday @3pm suits. I'll chat to you then.

VP Research and Development, Xxxx Xxxx Xxxxx

555-226-2345

[email protected]"""
    # The expected matrix holds one row per non-empty line of `body`;
    # empty lines contribute no row.
    # NOTE(review): an earlier comment here claimed the first line is not
    # considered, yet five rows are expected for five non-empty lines.
    expected = [
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    ]
    extractors = fs.features(sender)
    baseline = fs.apply_features(body, extractors)
    eq_(baseline, expected)

    # With the line limit patched to 5 the outcome must be unchanged,
    # because empty lines are not counted.
    with patch.object(fs, "SIGNATURE_MAX_LINES", 5):
        extractors = fs.features(sender)
        limited = fs.apply_features(body, extractors)
        eq_(baseline, limited)
示例#2
0
def test_apply_features():
    # Checks the feature rows `fs.apply_features` builds for a short email.
    sender = 'John <*****@*****.**>'
    message = '''This is John Doe

Tuesday @3pm suits. I'll chat to you then.

VP Research and Development, Xxxx Xxxx Xxxxx

555-226-2345

[email protected]'''
    # One expected row for each non-empty line of `message`; empty lines
    # are not considered.
    # NOTE(review): the previous comment said the first line is skipped,
    # but the expectation below has a row for every non-empty line.
    expected_rows = [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
                     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                     [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                     [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    sender_features = fs.features(sender)
    first_run = fs.apply_features(message, sender_features)
    eq_(first_run, expected_rows)

    # Patching the line limit to 5 leaves the result untouched since empty
    # lines do not count towards it.
    with patch.object(fs, 'SIGNATURE_MAX_LINES', 5):
        sender_features = fs.features(sender)
        second_run = fs.apply_features(message, sender_features)
        eq_(first_run, second_run)
示例#3
0
def build_extraction_dataset(folder, dataset_filename,
                             sender_known=True):
    """Builds signature extraction dataset using emails in the `folder`.

    The emails in the `folder` should be annotated i.e. signature lines
    should be marked with `#sig#`.

    For every email that yields both a sender and a message, at most
    `SIGNATURE_MAX_LINES` lines are taken from the end of the message,
    last line first. Each of them is written as one comma-separated row:
    the values returned by `build_pattern` followed by the label, which is
    `1` for lines starting with `SIGNATURE_ANNOTATION` and `-1` otherwise.
    An existing `dataset_filename` is removed before writing.

    :param folder: directory whose entries are the annotated emails.
    :param dataset_filename: path of the dataset file to (re)create.
    :param sender_known: forwarded unchanged to `parse_msg_sender`.
    """
    if os.path.exists(dataset_filename):
        os.remove(dataset_filename)
    with open(dataset_filename, 'a') as dataset:
        for filename in os.listdir(folder):
            filename = os.path.join(folder, filename)
            sender, msg = parse_msg_sender(filename, sender_known)
            if not sender or not msg:
                continue
            # The feature list depends on the sender only, so it is built
            # once per email rather than once per line.
            sender_features = features(sender)
            lines = msg.splitlines()
            # Negative indexing walks the message from its last line up.
            for i in range(1, min(SIGNATURE_MAX_LINES,
                                   len(lines)) + 1):
                line = lines[-i]
                label = -1
                if line.startswith(SIGNATURE_ANNOTATION):
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                elif line.startswith(REPLY_ANNOTATION):
                    # Reply markers are stripped but keep the -1 label.
                    line = line[len(REPLY_ANNOTATION):]

                X = build_pattern(line, sender_features)
                X.append(label)
                labeled_pattern = ','.join([str(e) for e in X])
                dataset.write(labeled_pattern + '\n')
示例#4
0
def build_extraction_dataset(folder, dataset_filename, sender_known=True):
    """Builds signature extraction dataset using emails in the `folder`.

    The emails in the `folder` should be annotated i.e. signature lines
    should be marked with `#sig#`.

    Only the trailing lines of each message are used (no more than
    `SIGNATURE_MAX_LINES`, processed bottom-up). Every such line produces
    one comma-separated row made of the `build_pattern` values plus a
    final label: `1` when the line starts with `SIGNATURE_ANNOTATION`,
    `-1` in every other case. Emails for which `parse_msg_sender` gives an
    empty sender or message are skipped, and a pre-existing
    `dataset_filename` is deleted first.

    :param folder: directory whose entries are the annotated emails.
    :param dataset_filename: path of the dataset file to (re)create.
    :param sender_known: forwarded unchanged to `parse_msg_sender`.
    """
    if os.path.exists(dataset_filename):
        os.remove(dataset_filename)
    with open(dataset_filename, 'a') as dataset:
        for filename in os.listdir(folder):
            filename = os.path.join(folder, filename)
            sender, msg = parse_msg_sender(filename, sender_known)
            if not sender or not msg:
                continue
            # Same sender for every line of this email: build the feature
            # list a single time instead of inside the line loop.
            sender_features = features(sender)
            lines = msg.splitlines()
            for i in range(1, min(SIGNATURE_MAX_LINES, len(lines)) + 1):
                # lines[-1] is the last line, lines[-2] the one before, ...
                line = lines[-i]
                label = -1
                if line.startswith(SIGNATURE_ANNOTATION):
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                elif line.startswith(REPLY_ANNOTATION):
                    # The reply marker is removed; the label stays -1.
                    line = line[len(REPLY_ANNOTATION):]

                X = build_pattern(line, sender_features)
                X.append(label)
                labeled_pattern = ','.join([str(e) for e in X])
                dataset.write(labeled_pattern + '\n')
示例#5
0
def test_build_pattern():
    # Checks the single feature vector `fs.build_pattern` produces for a
    # whole signature-like text.
    sender = 'John <*****@*****.**>'
    signature = '''John Doe

VP Research and Development, Xxxx Xxxx Xxxxx

555-226-2345

[email protected]'''
    expected = [2, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1]
    sender_features = fs.features(sender)
    pattern = fs.build_pattern(signature, sender_features)
    eq_(pattern, expected)
示例#6
0
def test_build_extraction_dataset():
    emails_dir = os.path.join(EMAILS_DIR, 'P')
    dataset_path = os.path.join(TMP_DIR, 'extraction.data')
    # Start from a clean slate so rows of a previous run cannot leak in.
    if os.path.exists(dataset_path):
        os.remove(dataset_path)
    d.build_extraction_dataset(emails_dir, dataset_path, 1)
    test_data = SparseDataSet(dataset_path, labelsColumn=-1)
    # the result is a loadable signature extraction dataset
    # 32 comes from 3 emails in emails/P folder, 11 lines checked to be
    # a signature, one email has only 10 lines
    eq_(test_data.size(), 32)
    # one dataset column per feature
    eq_(len(features('')), test_data.numFeatures)
示例#7
0
def test_build_extraction_dataset():
    source_dir = os.path.join(EMAILS_DIR, 'P')
    target = os.path.join(TMP_DIR, 'extraction.data')
    # Remove leftovers from earlier runs before building the dataset.
    if os.path.exists(target):
        os.remove(target)
    d.build_extraction_dataset(source_dir, target, 1)
    test_data = SparseDataSet(target, labelsColumn=-1)
    # the result is a loadable signature extraction dataset
    # 32 comes from 3 emails in emails/P folder, 11 lines checked to be
    # a signature, one email has only 10 lines
    eq_(test_data.size(), 32)
    # the dataset exposes as many features as `features` returns
    eq_(len(features('')), test_data.numFeatures)
示例#8
0
def test_apply_features():
    # Checks the feature rows `fs.apply_features` builds for a signature.
    sender = 'John <*****@*****.**>'
    text = '''John Doe

VP Research and Development, Xxxx Xxxx Xxxxx

555-226-2345

[email protected]'''
    # One expected row per non-empty line of `text`; empty lines are not
    # considered.
    # NOTE(review): the previous comment said the first line is skipped,
    # but four rows are expected for four non-empty lines.
    expected = [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
                [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    sender_features = fs.features(sender)
    unpatched = fs.apply_features(text, sender_features)
    eq_(unpatched, expected)

    # With the line limit patched to 4 the rows stay the same, because
    # empty lines do not count towards the limit.
    with patch.object(fs, 'SIGNATURE_MAX_LINES', 4):
        sender_features = fs.features(sender)
        patched = fs.apply_features(text, sender_features)
        eq_(unpatched, patched)
示例#9
0
def test_build_extraction_dataset():
    filename = os.path.join(TMP_DIR, "extraction.data")
    # Drop any dataset left behind by a previous run.
    if os.path.exists(filename):
        os.remove(filename)
    d.build_extraction_dataset(os.path.join(EMAILS_DIR, "P"), filename, 1)

    # The last column holds the label; everything before it is features.
    loaded = genfromtxt(filename, delimiter=",")
    test_data = loaded[:, :-1]
    row_count, column_count = test_data.shape[0], test_data.shape[1]

    # the result is a loadable signature extraction dataset
    # 32 comes from 3 emails in emails/P folder, 11 lines checked to be
    # a signature, one email has only 10 lines
    eq_(row_count, 32)
    eq_(len(features("")), column_count)
示例#10
0
def test_build_extraction_dataset():
    emails_dir = os.path.join(EMAILS_DIR, 'P')
    filename = os.path.join(TMP_DIR, 'extraction.data')
    # Make sure the dataset is generated from scratch.
    if os.path.exists(filename):
        os.remove(filename)
    d.build_extraction_dataset(emails_dir, filename, 1)

    # Strip the trailing label column to keep only the feature values.
    file_data = genfromtxt(filename, delimiter=",")
    feature_matrix = file_data[:, :-1]

    # the result is a loadable signature extraction dataset
    # 32 comes from 3 emails in emails/P folder, 11 lines checked to be
    # a signature, one email has only 10 lines
    eq_(feature_matrix.shape[0], 32)
    eq_(len(features('')), feature_matrix.shape[1])
示例#11
0
def build_extraction_dataset(repetition,
                             source_folder,
                             emails,
                             dataset_filename,
                             sender_known=True):
    """Builds signature extraction dataset from the given `emails`.

    The emails should be annotated i.e. signature lines should be marked
    with `#sig#`.

    For every email, at most `SIGNATURE_MAX_LINES` lines are taken from
    the end of the message, last line first, and each is written as one
    comma-separated row: the `build_pattern` values followed by the label
    (`1` for lines starting with `SIGNATURE_ANNOTATION`, `-1` otherwise).

    :param repetition: suffix appended to `dataset_filename` to form the
        path of the file that is actually written.
    :param source_folder: prefix joined to each email name by plain string
        concatenation, so it has to end with a path separator.
    :param emails: iterable of email file names to process.
    :param dataset_filename: base path of the dataset file; an existing
        file at `dataset_filename + repetition` is removed first.
    :param sender_known: forwarded unchanged to `parse_msg_sender`.
    :returns: the path of the dataset file that was written.
    """
    dataset_filename = dataset_filename + repetition
    if os.path.exists(dataset_filename):
        os.remove(dataset_filename)
    with open(dataset_filename, 'a') as dataset:
        for email in emails:
            filename = source_folder + email
            sender, msg = parse_msg_sender(filename, sender_known)
            if not sender or not msg:
                continue

            # A fresh, empty '*_result' file is created for every email.
            # Writing the marked signature lines into it is currently
            # disabled (see the commented-out write below).
            result_filename = build_result_filename(filename)
            if os.path.exists(result_filename):
                os.remove(result_filename)
            with open(result_filename, 'a') as result:
                # The feature list depends on the sender only, so it is
                # built once per email rather than once per line.
                sender_features = features(sender)
                lines = msg.splitlines()
                # `range` works on both Python 2 and 3; negative indexing
                # walks the message from its last line up.
                for i in range(1, min(SIGNATURE_MAX_LINES, len(lines)) + 1):
                    line = lines[-i]
                    label = -1
                    if line.startswith(SIGNATURE_ANNOTATION):
                        label = 1
                        line = line[len(SIGNATURE_ANNOTATION):]
                        # result.write(line + '\n')
                    elif line.startswith(REPLY_ANNOTATION):
                        # Reply markers are stripped but keep the -1 label.
                        line = line[len(REPLY_ANNOTATION):]
                    X = build_pattern(line, sender_features)
                    X.append(label)
                    labeled_pattern = ','.join([str(e) for e in X])
                    dataset.write(labeled_pattern + '\n')
    return dataset_filename
示例#12
0
def build_extraction_dataset(folder, dataset_filename,
                             sender_known=True):
    """Builds signature extraction dataset using emails in the `folder`.

    The emails in the `folder` should be annotated i.e. signature lines
    should be marked with `#sig#`.

    For every email that yields both a sender and a message, at most
    `SIGNATURE_MAX_LINES` lines are taken from the end of the message,
    last line first. Each of them is written as one comma-separated row:
    the values returned by `build_pattern` followed by the label, which is
    `1` for lines starting with `SIGNATURE_ANNOTATION` and `-1` otherwise.
    An existing `dataset_filename` is removed before writing.

    :param folder: directory whose entries are the annotated emails.
    :param dataset_filename: path of the dataset file to (re)create.
    :param sender_known: forwarded unchanged to `parse_msg_sender`.
    """
    if os.path.exists(dataset_filename):
        os.remove(dataset_filename)
    with open(dataset_filename, 'a') as dataset:
        for filename in os.listdir(folder):
            filename = os.path.join(folder, filename)
            sender, msg = parse_msg_sender(filename, sender_known)
            if not sender or not msg:
                continue
            # Optional pre-processing step (disabled): uncomment to get
            # the body and sender file for later email extraction.
            # msg = process(msg, filename, sender)
            # continue

            # Optional debugging aid (disabled): to save the marked
            # signature part into a '*_result' file, wrap the loop below in
            #     result_filename = build_result_filename(filename)
            #     if os.path.exists(result_filename):
            #         os.remove(result_filename)
            #     with open(result_filename, 'a') as result:
            # and enable the `result.write` line inside it.

            # The feature list depends on the sender only, so it is built
            # once per email rather than once per line.
            sender_features = features(sender)
            lines = msg.splitlines()
            # `range` works on both Python 2 and 3; negative indexing walks
            # the message from its last line up.
            for i in range(1, min(SIGNATURE_MAX_LINES,
                                   len(lines)) + 1):
                line = lines[-i]
                label = -1
                if line.startswith(SIGNATURE_ANNOTATION):
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                    # result.write(line + '\n')
                elif line.startswith(REPLY_ANNOTATION):
                    # Reply markers are stripped but keep the -1 label.
                    line = line[len(REPLY_ANNOTATION):]
                X = build_pattern(line, sender_features)
                X.append(label)
                labeled_pattern = ','.join([str(e) for e in X])
                dataset.write(labeled_pattern + '\n')
示例#13
0
def build_extraction_dataset(folder, dataset_filename, sender_known=True):
    """Builds signature extraction dataset using emails in the `folder`.

    The emails in the `folder` should be annotated i.e. signature lines
    should be marked with `#sig#`.

    Only the trailing lines of each message are used (no more than
    `SIGNATURE_MAX_LINES`, processed bottom-up). Every such line produces
    one comma-separated row made of the `build_pattern` values plus a
    final label: `1` when the line starts with `SIGNATURE_ANNOTATION`,
    `-1` in every other case. Emails for which `parse_msg_sender` gives an
    empty sender or message are skipped, and a pre-existing
    `dataset_filename` is deleted first.

    :param folder: directory whose entries are the annotated emails.
    :param dataset_filename: path of the dataset file to (re)create.
    :param sender_known: forwarded unchanged to `parse_msg_sender`.
    """
    if os.path.exists(dataset_filename):
        os.remove(dataset_filename)
    with open(dataset_filename, 'a') as dataset:
        for filename in os.listdir(folder):
            filename = os.path.join(folder, filename)
            sender, msg = parse_msg_sender(filename, sender_known)
            if not sender or not msg:
                continue
            # Optional pre-processing step (disabled): uncomment to get
            # the body and sender file for later email extraction.
            # msg = process(msg, filename, sender)
            # continue

            # Optional debugging aid (disabled): to save the marked
            # signature part into a '*_result' file, wrap the loop below in
            #     result_filename = build_result_filename(filename)
            #     if os.path.exists(result_filename):
            #         os.remove(result_filename)
            #     with open(result_filename, 'a') as result:
            # and enable the `result.write` line inside it.

            # Same sender for every line of this email: build the feature
            # list a single time instead of inside the line loop.
            sender_features = features(sender)
            lines = msg.splitlines()
            # `range` is available on both Python 2 and 3.
            for i in range(1, min(SIGNATURE_MAX_LINES, len(lines)) + 1):
                # lines[-1] is the last line, lines[-2] the one before, ...
                line = lines[-i]
                label = -1
                if line.startswith(SIGNATURE_ANNOTATION):
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                    # result.write(line + '\n')
                elif line.startswith(REPLY_ANNOTATION):
                    # The reply marker is removed; the label stays -1.
                    line = line[len(REPLY_ANNOTATION):]
                X = build_pattern(line, sender_features)
                X.append(label)
                labeled_pattern = ','.join([str(e) for e in X])
                dataset.write(labeled_pattern + '\n')
示例#14
0
def extract_training_vectors(emails, csv_file, output_extraction_file):
    """Writes labeled feature rows for the selected emails of a CSV file.

    `csv_file` is read with `csv.DictReader`; the columns `filename`,
    `sender`, `origin` and `sig` are used. Rows whose `filename` is not in
    `emails` are skipped. For the remaining rows every distinct line of
    `origin` becomes one comma-separated row in `output_extraction_file`:
    the `build_pattern` values followed by the label, `1` when the line
    also occurs among the lines of `sig` and `0` otherwise.

    Lines are collected in a set, so duplicate lines of one email are
    written once and the order of the rows is not defined.

    :param emails: container of file names to include.
    :param csv_file: path of the CSV file to read.
    :param output_extraction_file: path of the file to write; it is
        overwritten.
    """
    with open(output_extraction_file, 'w') as dataset:
        with open(csv_file, 'r') as csvinput:
            reader = csv.DictReader(csvinput)
            for row in reader:
                if row['filename'] not in emails:
                    continue
                # The feature list depends on the sender only, so it is
                # built once per email rather than once per line.
                sender_features = features(row['sender'])
                lines = set(row['origin'].splitlines())
                sigs = set(row['sig'].splitlines())
                for line in lines:
                    label = 1 if line in sigs else 0
                    X = build_pattern(line, sender_features)
                    X.append(label)
                    labeled_pattern = ','.join([str(e) for e in X])
                    dataset.write(labeled_pattern + '\n')
示例#15
0
def build_detection_class(folder, dataset_filename, label, sender_known=True):
    """Appends one class of the signature detection dataset to a file.

    The signature detection dataset is made of patterns for two classes:
    * positive patterns (written with label 1)
    * negative patterns (written with label -1)

    Each email in `folder` contributes one comma-separated row, built from
    the whole message with all annotations removed and followed by
    `label`. Rows are appended to `dataset_filename`; emails for which
    `parse_msg_sender` returns `None` as sender or message are skipped.

    >>> build_detection_class('emails/P', 'train.data', 1)
    """
    with open(dataset_filename, 'a') as out:
        for entry in os.listdir(folder):
            path = os.path.join(folder, entry)
            sender, msg = parse_msg_sender(path, sender_known)
            if sender is not None and msg is not None:
                # Annotation markers must not leak into the features.
                cleaned = re.sub('|'.join(ANNOTATIONS), '', msg)
                row = build_pattern(cleaned, features(sender))
                row.append(label)
                out.write(','.join(map(str, row)) + '\n')
示例#16
0
def build_detection_class(folder, dataset_filename,
                          label, sender_known=True):
    """Builds one class of the signature detection dataset.

    The detection dataset contains patterns for two classes:
    * the positive class (rows labeled 1)
    * the negative class (rows labeled -1)

    A single pattern is built per email found in `folder`, from the
    message text stripped of all annotations, and appended together with
    `label` as a comma-separated row to `dataset_filename`. Emails whose
    sender or message is `None` are left out.

    >>> build_detection_class('emails/P', 'train.data', 1)
    """
    with open(dataset_filename, 'a') as dataset:
        for name in os.listdir(folder):
            email_path = os.path.join(folder, name)
            sender, msg = parse_msg_sender(email_path, sender_known)
            if sender is not None and msg is not None:
                # Drop the annotation markers before extracting features.
                plain_msg = re.sub('|'.join(ANNOTATIONS), '', msg)
                values = build_pattern(plain_msg, features(sender))
                values.append(label)
                row = ','.join(str(value) for value in values)
                dataset.write(row + '\n')
示例#17
0
def is_signature_line(line, sender, classifier):
    '''Tells whether `line` is classified as part of a signature.

    The pattern built for `line` is wrapped into a one-element
    `SparseDataSet`; the line counts as a signature line when the
    classifier's decision function for that element is above zero.
    '''
    pattern = build_pattern(line, features(sender))
    dataset = SparseDataSet([pattern])
    score = classifier.decisionFunc(dataset, 0)
    return score > 0
示例#18
0
def is_signature_line(line, sender, classifier):
    '''Checks if the line belongs to signature.

    Builds the feature pattern for `line` and returns the result of
    comparing `classifier.predict` on that pattern with zero.
    '''
    # Pass the single pattern as a one-row 2-D array (1 sample x N
    # features): scikit-learn style `predict` methods expect 2-D input and
    # reject a bare 1-D vector.
    # NOTE(review): assumes `classifier` follows the scikit-learn estimator
    # interface -- confirm against the callers.
    data = numpy.array(build_pattern(line, features(sender))).reshape(1, -1)
    return classifier.predict(data) > 0
示例#19
0
def is_signature_line(line, sender, classifier):
    '''Reports whether the classifier considers `line` a signature line.

    A single-pattern `SparseDataSet` is built from the features of `line`
    and the result is positive when `classifier.decisionFunc` for that
    pattern is greater than zero.
    '''
    sender_features = features(sender)
    single_pattern = [build_pattern(line, sender_features)]
    decision = classifier.decisionFunc(SparseDataSet(single_pattern), 0)
    return decision > 0
示例#20
0
def is_signature_line(line, sender, classifier):
    '''Checks if the line belongs to signature.

    The feature pattern of `line` is handed to `classifier.predict` and
    the prediction is compared with zero; that comparison is returned.
    '''
    pattern = build_pattern(line, features(sender))
    # A lone pattern is a 1-D vector, while scikit-learn style estimators
    # take a 2-D (n_samples, n_features) array, so reshape it to one row.
    # NOTE(review): assumes `classifier` is a scikit-learn style estimator
    # -- confirm against the callers.
    data = numpy.array(pattern).reshape(1, -1)
    return classifier.predict(data) > 0