def build_extraction_dataset(folder, dataset_filename, sender_known=True):
    """Builds signature extraction dataset using emails in the `folder`.

    The emails in the `folder` should be annotated i.e. signature lines
    should be marked with `#sig#`.

    For every email for which `parse_msg_sender` returns both a sender and a
    message, up to `SIGNATURE_MAX_LINES` trailing lines are visited, last
    line first. Each visited line produces one comma-separated row in
    `dataset_filename`: the values returned by `build_pattern` followed by
    the label (1 when the line starts with `SIGNATURE_ANNOTATION`, -1
    otherwise). A leading signature or reply annotation is stripped before
    the pattern is built.

    An existing `dataset_filename` is overwritten.
    """
    # Mode 'w' truncates an existing file, which replaces the former
    # exists()/remove()/open-for-append sequence with one call.
    with open(dataset_filename, 'w') as dataset:
        for name in os.listdir(folder):
            path = os.path.join(folder, name)
            sender, msg = parse_msg_sender(path, sender_known)
            if not sender or not msg:
                continue

            # The feature list depends only on the sender, so it is built
            # once per email rather than once per line.
            sender_features = features(sender)
            lines = msg.splitlines()
            for i in range(1, min(SIGNATURE_MAX_LINES, len(lines)) + 1):
                line = lines[-i]
                label = -1
                if line.startswith(SIGNATURE_ANNOTATION):
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                elif line.startswith(REPLY_ANNOTATION):
                    line = line[len(REPLY_ANNOTATION):]

                X = build_pattern(line, sender_features)
                X.append(label)
                dataset.write(','.join([str(e) for e in X]) + '\n')
def build_extraction_dataset(folder, dataset_filename, sender_known=True):
    """Builds signature extraction dataset using emails in the `folder`.

    The emails in the `folder` should be annotated i.e. signature lines
    should be marked with `#sig#`.

    Each email in `folder` is parsed with `parse_msg_sender`; emails without
    a sender or without a message are skipped. The last
    `SIGNATURE_MAX_LINES` lines of the message (fewer if the message is
    shorter) are processed from the bottom up, and every one of them is
    written to `dataset_filename` as a comma-separated pattern whose final
    value is the label: 1 for lines carrying `SIGNATURE_ANNOTATION`, -1 for
    all others. Annotation prefixes are removed before feature extraction.

    An existing `dataset_filename` is overwritten.
    """
    # Opening with 'w' discards previous content, so no separate
    # exists()/remove() step is needed.
    with open(dataset_filename, 'w') as dataset:
        for entry in os.listdir(folder):
            email_path = os.path.join(folder, entry)
            sender, msg = parse_msg_sender(email_path, sender_known)
            if not sender or not msg:
                continue

            # Built once per email: the features depend only on the sender.
            sender_features = features(sender)

            lines = msg.splitlines()
            count = min(SIGNATURE_MAX_LINES, len(lines))
            # Slice from an explicit start index so that a count of zero
            # (or less) selects nothing instead of the whole message.
            tail = lines[len(lines) - count:]
            for line in reversed(tail):
                label = -1
                if line.startswith(SIGNATURE_ANNOTATION):
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                elif line.startswith(REPLY_ANNOTATION):
                    line = line[len(REPLY_ANNOTATION):]

                X = build_pattern(line, sender_features)
                X.append(label)
                dataset.write(','.join([str(e) for e in X]) + '\n')
def test_build_pattern():
    # Checks the feature counts that build_pattern reports for a sample
    # signature when the features are derived from the given sender.
    # NOTE(review): the sample text is reproduced exactly as it appears in
    # this copy of the file; confirm its line breaks against the original.
    sample = '''John Doe VP Research and Development, Xxxx Xxxx Xxxxx 555-226-2345 [email protected]'''
    expected = [2, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1]

    sender_features = fs.features('John <*****@*****.**>')
    eq_(fs.build_pattern(sample, sender_features), expected)
def build_extraction_dataset(repetition, source_folder, emails,
                             dataset_filename, sender_known=True):
    """Builds signature extraction dataset using the given `emails`.

    The emails should be annotated i.e. signature lines should be marked
    with `#sig#`.

    :param repetition: suffix appended to `dataset_filename` to form the
        name of the file that is actually written.
    :param source_folder: prefix concatenated (as-is, without adding a path
        separator) with each entry of `emails` to form the email path.
    :param emails: iterable of email file names to process.
    :param dataset_filename: base name of the dataset file.
    :param sender_known: forwarded to `parse_msg_sender`.
    :return: the name of the dataset file that was written.

    For every email with a sender and a message, up to
    `SIGNATURE_MAX_LINES` trailing lines are visited, last line first, and
    each is written as a comma-separated pattern followed by its label
    (1 for lines starting with `SIGNATURE_ANNOTATION`, -1 otherwise).
    """
    dataset_filename = dataset_filename + repetition

    # Mode 'w' truncates an existing file, replacing the former
    # exists()/remove()/open-for-append sequence.
    with open(dataset_filename, 'w') as dataset:
        for email in emails:
            filename = source_folder + email
            sender, msg = parse_msg_sender(filename, sender_known)
            if not sender or not msg:
                continue

            # The `*_result` file is meant to hold the marked signature part.
            # NOTE(review): the write into it appears to be disabled in the
            # original, so the file is only (re)created empty here - confirm
            # whether that is intended.
            result_filename = build_result_filename(filename)
            with open(result_filename, 'w'):
                # The feature list depends only on the sender, so it is
                # built once per email rather than once per line.
                sender_features = features(sender)
                lines = msg.splitlines()
                # `range` instead of the Python-2-only `xrange`; at most
                # SIGNATURE_MAX_LINES indices are produced either way.
                for i in range(1, min(SIGNATURE_MAX_LINES, len(lines)) + 1):
                    line = lines[-i]
                    label = -1
                    if line.startswith(SIGNATURE_ANNOTATION):
                        label = 1
                        line = line[len(SIGNATURE_ANNOTATION):]
                    elif line.startswith(REPLY_ANNOTATION):
                        line = line[len(REPLY_ANNOTATION):]

                    X = build_pattern(line, sender_features)
                    X.append(label)
                    dataset.write(','.join([str(e) for e in X]) + '\n')

    return dataset_filename
def build_extraction_dataset(folder, dataset_filename, sender_known=True):
    """Builds signature extraction dataset using emails in the `folder`.

    The emails in the `folder` should be annotated i.e. signature lines
    should be marked with `#sig#`.

    For every email for which `parse_msg_sender` returns both a sender and a
    message, up to `SIGNATURE_MAX_LINES` trailing lines are visited, last
    line first. Each visited line produces one comma-separated row in
    `dataset_filename`: the values returned by `build_pattern` followed by
    the label (1 when the line starts with `SIGNATURE_ANNOTATION`, -1
    otherwise). A leading signature or reply annotation is stripped before
    the pattern is built.

    An existing `dataset_filename` is overwritten.
    """
    # Mode 'w' truncates an existing file, which replaces the former
    # exists()/remove()/open-for-append sequence with one call.
    with open(dataset_filename, 'w') as dataset:
        for name in os.listdir(folder):
            path = os.path.join(folder, name)
            sender, msg = parse_msg_sender(path, sender_known)
            if not sender or not msg:
                continue

            # Disabled alternative: calling `process(msg, path, sender)` and
            # then `continue` here pre-processes the emails into body and
            # sender files for later email extraction.
            #
            # Disabled alternative: the marked signature lines can be saved
            # into the `*_result` file named by `build_result_filename(path)`.

            # The feature list depends only on the sender, so it is built
            # once per email rather than once per line.
            sender_features = features(sender)
            lines = msg.splitlines()
            # `range` instead of the Python-2-only `xrange`; at most
            # SIGNATURE_MAX_LINES indices are produced either way.
            for i in range(1, min(SIGNATURE_MAX_LINES, len(lines)) + 1):
                line = lines[-i]
                label = -1
                if line.startswith(SIGNATURE_ANNOTATION):
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                elif line.startswith(REPLY_ANNOTATION):
                    line = line[len(REPLY_ANNOTATION):]

                X = build_pattern(line, sender_features)
                X.append(label)
                dataset.write(','.join([str(e) for e in X]) + '\n')
def build_extraction_dataset(folder, dataset_filename, sender_known=True):
    """Builds signature extraction dataset using emails in the `folder`.

    The emails in the `folder` should be annotated i.e. signature lines
    should be marked with `#sig#`.

    Each email in `folder` is parsed with `parse_msg_sender`; emails without
    a sender or without a message are skipped. The last
    `SIGNATURE_MAX_LINES` lines of the message (fewer if the message is
    shorter) are processed from the bottom up, and every one of them is
    written to `dataset_filename` as a comma-separated pattern whose final
    value is the label: 1 for lines carrying `SIGNATURE_ANNOTATION`, -1 for
    all others. Annotation prefixes are removed before feature extraction.

    An existing `dataset_filename` is overwritten.
    """
    # Opening with 'w' discards previous content, so no separate
    # exists()/remove() step is needed.
    with open(dataset_filename, 'w') as dataset:
        for entry in os.listdir(folder):
            email_path = os.path.join(folder, entry)
            sender, msg = parse_msg_sender(email_path, sender_known)
            if not sender or not msg:
                continue

            # Disabled alternative: `process(msg, email_path, sender)`
            # followed by `continue` pre-processes the emails into body and
            # sender files for later email extraction.
            #
            # Disabled alternative: saving the marked signature lines into
            # the `*_result` file given by `build_result_filename`.

            # Built once per email: the features depend only on the sender.
            sender_features = features(sender)

            lines = msg.splitlines()
            count = min(SIGNATURE_MAX_LINES, len(lines))
            # Slicing from an explicit start index avoids the Python-2-only
            # `xrange` and selects nothing when the count is zero or less.
            tail = lines[len(lines) - count:]
            for line in reversed(tail):
                label = -1
                if line.startswith(SIGNATURE_ANNOTATION):
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                elif line.startswith(REPLY_ANNOTATION):
                    line = line[len(REPLY_ANNOTATION):]

                X = build_pattern(line, sender_features)
                X.append(label)
                dataset.write(','.join([str(e) for e in X]) + '\n')
def extract_training_vectors(emails, csv_file, output_extraction_file):
    """Write labeled line patterns for the selected emails of a CSV file.

    `csv_file` is read with `csv.DictReader`; rows whose `filename` value is
    not in `emails` are skipped. For each remaining row, every distinct line
    of the `origin` column becomes one comma-separated row in
    `output_extraction_file`: the values returned by `build_pattern`
    followed by the label (1 if the line also occurs among the lines of the
    `sig` column, 0 otherwise).

    Distinct lines are emitted in order of first appearance, so the output
    is reproducible between runs. `output_extraction_file` is overwritten.
    """
    with open(output_extraction_file, 'w') as dataset:
        with open(csv_file, 'r') as csvinput:
            for row in csv.DictReader(csvinput):
                if row['filename'] not in emails:
                    continue

                # The feature list depends only on the sender, so it is
                # built once per row rather than once per line.
                sender_features = features(row['sender'])
                sigs = set(row['sig'].splitlines())

                seen = set()
                for line in row['origin'].splitlines():
                    # Each distinct line is emitted once, as before, but in
                    # a stable order instead of set iteration order.
                    if line in seen:
                        continue
                    seen.add(line)

                    label = 1 if line in sigs else 0
                    X = build_pattern(line, sender_features)
                    X.append(label)
                    dataset.write(','.join([str(e) for e in X]) + '\n')
def build_detection_class(folder, dataset_filename, label, sender_known=True):
    """Builds signature detection class.

    Signature detection dataset includes patterns for two classes:

    * class for positive patterns (goes with label 1)
    * class for negative patterns (goes with label -1)

    The patterns are built from the emails in `folder` and appended to the
    dataset file, one comma-separated row per email with `label` as the
    last value. Emails for which `parse_msg_sender` returns None for the
    sender or the message are skipped.

    >>> build_detection_class('emails/P', 'train.data', 1)
    """
    # The entries of ANNOTATIONS are joined into a single alternation; the
    # pattern is the same for every email, so it is compiled once up front.
    annotation_re = re.compile('|'.join(ANNOTATIONS))

    with open(dataset_filename, 'a') as dataset:
        for name in os.listdir(folder):
            path = os.path.join(folder, name)
            sender, msg = parse_msg_sender(path, sender_known)
            if sender is None or msg is None:
                continue

            # Annotation markers are removed so they do not influence the
            # features computed for the whole message.
            msg = annotation_re.sub('', msg)
            X = build_pattern(msg, features(sender))
            X.append(label)
            dataset.write(','.join([str(e) for e in X]) + '\n')
def build_detection_class(folder, dataset_filename, label, sender_known=True):
    """Builds signature detection class.

    Signature detection dataset includes patterns for two classes:

    * class for positive patterns (goes with label 1)
    * class for negative patterns (goes with label -1)

    The patterns are built from the emails in `folder` and appended to the
    dataset file. Every email for which `parse_msg_sender` returns both a
    sender and a message contributes one comma-separated row that ends
    with `label`.

    >>> build_detection_class('emails/P', 'train.data', 1)
    """
    # Compiled once: the alternation over ANNOTATIONS does not change
    # between emails.
    strip_annotations = re.compile('|'.join(ANNOTATIONS)).sub

    with open(dataset_filename, 'a') as dataset:
        for entry in os.listdir(folder):
            email_path = os.path.join(folder, entry)
            sender, msg = parse_msg_sender(email_path, sender_known)
            if sender is None or msg is None:
                continue

            # The pattern is computed over the message with all annotation
            # markers removed.
            cleaned = strip_annotations('', msg)
            X = build_pattern(cleaned, features(sender))
            X.append(label)
            dataset.write(','.join([str(e) for e in X]) + '\n')
def is_signature_line(line, sender, classifier):
    '''Tells whether `line` belongs to the signature.

    The pattern built for the line is wrapped in a one-row SparseDataSet and
    the result is whether the classifier's decision value for row 0 is
    greater than zero.
    '''
    pattern = build_pattern(line, features(sender))
    dataset = SparseDataSet([pattern])
    score = classifier.decisionFunc(dataset, 0)
    return score > 0
def is_signature_line(line, sender, classifier):
    '''Checks if the line belongs to signature.

    The pattern built for `line` is passed to `classifier.predict` as a
    single sample, and the prediction is compared against zero. The result
    is the outcome of that comparison and is truthy when the line is
    classified as part of the signature.
    '''
    pattern = build_pattern(line, features(sender))
    # predict() of scikit-learn style estimators expects a 2-D array of
    # shape (n_samples, n_features); a bare 1-D pattern is rejected by such
    # estimators, so the pattern is presented as exactly one row.
    # NOTE(review): assumes `classifier` follows that convention - confirm.
    data = numpy.array(pattern).reshape(1, -1)
    return classifier.predict(data) > 0
def is_signature_line(line, sender, classifier):
    '''Checks if the line belongs to signature.

    Builds the pattern for `line` using the features derived from `sender`,
    hands it to `classifier.predict` as one sample and returns whether the
    prediction is greater than zero (truthy for a signature line).
    '''
    # A single sample has to be given as one row of a 2-D array: estimators
    # with the scikit-learn predict() contract take input of shape
    # (n_samples, n_features) and do not accept a 1-D vector.
    # NOTE(review): assumes `classifier` follows that contract - confirm.
    sample = numpy.array(build_pattern(line, features(sender)))
    prediction = classifier.predict(sample.reshape(1, -1))
    return prediction > 0
def is_signature_line(line, sender, classifier):
    '''Reports whether the classifier places `line` in the signature.

    A SparseDataSet holding only the pattern of `line` is evaluated with the
    classifier's decision function at index 0; a positive value means the
    line is considered part of the signature.
    '''
    sender_features = features(sender)
    single_row = SparseDataSet([build_pattern(line, sender_features)])
    decision = classifier.decisionFunc(single_row, 0)
    return decision > 0