Exemplo n.º 1
0
def train(vine_data, data):
    """Train a decision tree on ``vine_data`` plus a random share of ``data``.

    ``data`` is shuffled in place. The training set is all of ``vine_data``
    topped up with shuffled ``data`` rows until it reaches 70% of the combined
    row count; the remaining ``data`` rows are used to measure accuracy.

    Side effects: prints both input sizes and the measured accuracy, and
    appends a ``total size`` line to ``../data/accurate.txt``.

    Args:
        vine_data: sequence of rows that always go into the training set.
        data: mutable sequence of rows, shuffled and then split between
            training and evaluation.

    Returns:
        The trained ``DecisionTreeClassifier``.
    """
    shuffle(data)
    print(len(vine_data))
    print(len(data))

    rate = 0.7
    boundary = int(rate * (len(vine_data) + len(data)))
    # When vine_data alone already exceeds the boundary the difference is
    # negative, and a negative slice index counts from the END of ``data``
    # rather than meaning "take nothing" -- so clamp it at zero.
    extra_rows = max(0, boundary - len(vine_data))

    # Only the write needs the file; do not keep it open while training.
    with open('../data/accurate.txt', 'a') as f:
        f.write('\ntotal size: ' + str(boundary) + '\n')

    cl2 = DecisionTreeClassifier(vine_data + data[:extra_rows])
    accurate2 = cl2.accuracy(data[extra_rows:])
    print(accurate2)
    return cl2
Exemplo n.º 2
0
def get_classifier():
    """Return the decision-tree classifier, building it only when needed.

    If ``output/classifier-random-tree.pickle`` exists the classifier comes
    from ``load_classifier()``; otherwise a new one is built from
    ``output/train-even.csv`` and handed to ``save_classifier()``.

    Returns:
        The loaded or newly built classifier.
    """
    # Single-argument print(...) prints the same text on Python 2 and 3,
    # whereas the bare print statement is a syntax error on Python 3.
    print('getting classifier')
    if os.path.isfile('output/classifier-random-tree.pickle'):
        return load_classifier()

    print('creating classifier')
    with open('output/train-even.csv', 'r') as trainingFile:
        cl = DecisionTreeClassifier(trainingFile, format="csv")
        save_classifier(cl)
    return cl
Exemplo n.º 3
0
def classifier(agent_name):
    """Train a decision tree on one agent's intents and dump it with joblib.

    Every user expression becomes a training sample labelled with the name of
    its intent. The model is written to
    ``/home/dev/Botzup/Botzup/<agent_name>.pkl``.

    Args:
        agent_name: value matched against the ``agent_name`` field of the
            ``intents`` collection; also used as the dump file's base name.
    """
    intent_docs = list(db.intents.find({'agent_name': agent_name}))

    # intent name -> expressions; a repeated intent name keeps the
    # expressions of the last document seen.
    expressions_by_intent = {}
    for doc in intent_docs:
        expressions_by_intent[doc['intent_name']] = doc['user_expressions']

    # expression -> intent name; an expression listed under several intents
    # ends up labelled with the last one.
    intent_by_expression = {}
    for intent_name, expressions in expressions_by_intent.items():
        for expression in expressions:
            intent_by_expression[expression] = intent_name

    train = list(intent_by_expression.items())
    clf = DecisionTreeClassifier(train)
    joblib.dump(clf, '/home/dev/Botzup/Botzup/' + agent_name + '.pkl')
class TestTreeTextClasiffier(luigi.Task):
    """Luigi task that trains a decision tree on POS/NEG tweets and prints its accuracy.

    NOTE(review): ``run`` never writes the target returned by ``output``, so
    the task presumably never registers as complete -- confirm whether that
    is intended.
    """
    lang = luigi.Parameter()

    def output(self):
        """Return the local target ``<abs path>/Data/tree_<lang>.clasi``."""
        conf = Conf()
        path = conf.getAbsPath()
        return luigi.LocalTarget('%s/Data/tree_%s.clasi' % (path, self.lang))

    def requires(self):
        """Depend on ``GenerateTextByLang`` for this task's language."""
        return [GenerateTextByLang(self.lang)]

    def run(self):
        """Build (text, label) pairs from the tweet file, train and print accuracy.

        Only tweets whose first tag contains ``POS`` or ``NEG`` are used; the
        label is the part of that tag before the first underscore.
        """
        ficheroTweets = None
        for task_input in self.input():
            # Inputs whose path contains "check" are not the tweet file.
            if "check" not in task_input.path:
                ficheroTweets = task_input.path

        lab = LabeledLineSentence(ficheroTweets, ides="String")
        all_train = []
        for tweet in lab:
            tag = tweet.tags
            if "POS" in tag[0] or "NEG" in tag[0]:
                phrase = ' '.join(str(x) for x in tweet.words)
                try:
                    text = phrase.encode('ascii', 'ignore')
                except UnicodeError:
                    # Best effort: a tweet that cannot be reduced to ASCII
                    # is skipped rather than aborting the whole run.
                    continue
                all_train.append((text, tag[0].split("_")[0]))

        # Use at most 2000 samples, but never assume more than were collected:
        # with a fixed 2000 a smaller corpus left the evaluation slice empty.
        leng = min(2000, len(all_train))
        train = int(leng * 0.80)
        shuffle(all_train)

        cl = DecisionTreeClassifier(all_train[:train])
        # Call form of print: valid on both Python 2 and Python 3.
        print(cl.accuracy(all_train[train:leng]))
Exemplo n.º 5
0
class TestDecisionTreeClassifier(unittest.TestCase):
    """Tests for ``DecisionTreeClassifier`` built from ``train_set``."""

    def setUp(self):
        """Build the classifier under test."""
        self.classifier = DecisionTreeClassifier(train_set)

    def test_classify(self):
        """A happy sentence is labelled positive; the training set is kept whole."""
        predicted = self.classifier.classify("I feel happy this morning")
        assert_equal(predicted, 'positive')
        assert_equal(len(self.classifier.train_set), len(train_set))

    def test_accuracy(self):
        """``accuracy`` returns a float."""
        score = self.classifier.accuracy(test_set)
        assert_true(isinstance(score, float))

    def test_update(self):
        """``update`` with one sample grows the training set by one."""
        size_before = len(self.classifier.train_set)
        self.classifier.update([("lorem ipsum", "positive")])
        size_after = len(self.classifier.train_set)
        assert_equal(size_before + 1, size_after)

    def test_custom_feature_extractor(self):
        """A custom extractor is accepted and the first label is unchanged."""
        custom_clf = DecisionTreeClassifier(train_set, custom_extractor)
        custom_clf.classify("Yay! I'm so happy it works.")
        first_label = custom_clf.train_features[0][1]
        assert_equal(first_label, 'positive')

    def test_pseudocode(self):
        """The pseudocode rendering contains at least one ``if``."""
        rendered = self.classifier.pseudocode()
        assert_true("if" in rendered)

    def test_pretty_format(self):
        """``pprint`` and ``pretty_format`` return the same unicode text."""
        via_pprint = self.classifier.pprint(width=60)
        via_pretty_format = self.classifier.pretty_format(width=60)
        assert_true(isinstance(via_pprint, unicode))
        assert_equal(via_pprint, via_pretty_format)

    def test_repr(self):
        """``repr`` reports the number of training instances."""
        expected = "<DecisionTreeClassifier trained on {0} instances>".format(
            len(train_set))
        assert_equal(repr(self.classifier), expected)
Exemplo n.º 6
0
class TestDecisionTreeClassifier(unittest.TestCase):
    """Exercises the public behaviour of ``DecisionTreeClassifier``."""

    def setUp(self):
        """Create the classifier from ``train_set``."""
        self.classifier = DecisionTreeClassifier(train_set)

    def test_classify(self):
        """Classifying a cheerful sentence yields ``'positive'``."""
        label = self.classifier.classify("I feel happy this morning")
        assert_equal(label, 'positive')
        stored_samples = len(self.classifier.train_set)
        assert_equal(stored_samples, len(train_set))

    def test_accuracy(self):
        """The accuracy on ``test_set`` is reported as a float."""
        assert_true(isinstance(self.classifier.accuracy(test_set), float))

    def test_update(self):
        """Adding one sample through ``update`` lengthens ``train_set`` by one."""
        before = len(self.classifier.train_set)
        extra_samples = [("lorem ipsum", "positive")]
        self.classifier.update(extra_samples)
        after = len(self.classifier.train_set)
        assert_equal(before + 1, after)

    def test_custom_feature_extractor(self):
        """Training with ``custom_extractor`` keeps the first sample's label."""
        clf = DecisionTreeClassifier(train_set, custom_extractor)
        clf.classify("Yay! I'm so happy it works.")
        assert_equal(clf.train_features[0][1], 'positive')

    def test_pseudocode(self):
        """``pseudocode`` output mentions ``if``."""
        assert_true("if" in self.classifier.pseudocode())

    def test_pretty_format(self):
        """``pprint`` is an alias-equivalent of ``pretty_format``."""
        printed = self.classifier.pprint(width=60)
        formatted = self.classifier.pretty_format(width=60)
        assert_true(isinstance(printed, unicode))
        assert_equal(printed, formatted)

    def test_repr(self):
        """The repr embeds the training-set size."""
        template = "<DecisionTreeClassifier trained on {0} instances>"
        assert_equal(repr(self.classifier), template.format(len(train_set)))
Exemplo n.º 7
0
    # Pair every test document with its expected label.
    test = list(zip(test_data, test_target))

    # --- Naive Bayes: the timer spans construction, accuracy and the
    # informative-feature output below. ---
    start_time = time.time()
    cl = NaiveBayesClassifier(train)
    # Compute accuracy
    print("NaiveBayes Accuracy: {0}".format(cl.accuracy(test)))

    # Show 10 most informative features
    cl.show_informative_features(10)
    print(cl.informative_features(10))
    elapsed_time = time.time() - start_time
    print(elapsed_time)

    # --- Decision tree: skipped entirely when ignoreDT is truthy. ---
    if (not ignoreDT):
        start_time = time.time()
        cl = DecisionTreeClassifier(train)
        print("DecisionTree Accuracy: {0}".format(cl.accuracy(test)))
        print(cl.pseudocode())
        elapsed_time = time.time() - start_time
        print(elapsed_time)

    # Restart the timer for the scikit-learn section that follows; the
    # imports are local to this function.
    start_time = time.time()
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.pipeline import Pipeline

    class StemmedCountVectorizer(CountVectorizer):
        """``CountVectorizer`` whose analyzer stems every token it produces."""

        def build_analyzer(self):
            """Return a callable mapping a document to its list of stemmed tokens."""
            base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()

            def stemmed_tokens(doc):
                # Run the stock analyzer first, then stem each token.
                return [stemmer.stem(token) for token in base_analyzer(doc)]

            return stemmed_tokens
Exemplo n.º 8
0
        tags.append(a[1])

print(len(words), len(tags))

# The first 800 samples go to the training list, the next 200 to the test list.
# NOTE(review): both sizes are hard-coded; fewer than 1000 samples raises
# IndexError here.
for i in range(1000):
    temp = (words[i], tags[i])
    if i < 800:
        train.append(temp)
    else:
        test.append(temp)
print(train)
print(test)

# Build the three classifiers from the same training data and compare them.
naive = NaiveBayesClassifier(train)
dtc = DecisionTreeClassifier(train)
mec = MaxEntClassifier(train)

print("NaiveBayesClassifier Accuracy: {0}".format(naive.accuracy(test)))
print("DecisionTreeClassifier Accuracy: {0}".format(dtc.accuracy(test)))
print("MaxEntClassifier Accuracy: {0}".format(mec.accuracy(test)))

cl = NaiveBayesClassifier(train)
print("NaiveBayesClassifier Accuracy: {0}".format(cl.accuracy(test)))
for i, sample in enumerate(test):
    # Each test entry is a (document, label) pair. Only the document is
    # classified; the previous code passed the whole pair, handing the
    # expected label to the classifier as part of its input.
    tag = cl.classify(sample[0])
    pred_tags.append(tag)
    if tag == test_tags[i]:
        count += 1
print(len(pred_tags), len(test_tags))
print(count)
Exemplo n.º 9
0
 def test_custom_feature_extractor(self):
     """A classifier built with ``custom_extractor`` keeps the first label."""
     custom_clf = DecisionTreeClassifier(train_set, custom_extractor)
     custom_clf.classify("Yay! I'm so happy it works.")
     first_label = custom_clf.train_features[0][1]
     assert_equal(first_label, 'positive')
Exemplo n.º 10
0
def _read_labelled_lines(path, label, limit):
    """Return up to ``limit`` ``(text, label)`` samples read from ``path``.

    Each line loses its first five characters and its last character; the
    text kept is whatever precedes the first occurrence of ``label`` in the
    remainder, with leading whitespace removed.
    """
    samples = []
    # The context manager closes the file even if reading a line fails.
    with open(path, 'r', encoding="utf8") as handle:
        for line in handle:
            if len(samples) == limit:
                break
            text = line[5:-1].split(label)[0].lstrip()
            samples.append((text, label))
    return samples


def Train():
    """Train a decision tree on four emotion files, report accuracy, pickle it.

    Takes the first 20 lines of each training file and of each test file,
    builds ``(text, emotion)`` samples, trains ``DecisionTreeClassifier`` on
    the training samples, prints its accuracy on the test samples and dumps
    the model to ``H:\\Project_CDAC\\Models\\model_NB.pkl``.

    Raises:
        OSError: if any of the hard-coded input or output paths cannot be
            opened.
    """
    value_set = 20  # samples taken from every file

    train_sources = [
        ("H:\\EmotionDetection\\Saif mohammad\\taining set\\anger-ratings-0to1.txt", 'anger'),
        ("H:\\EmotionDetection\\Saif mohammad\\taining set\\fear-ratings-0to1.txt", 'fear'),
        ("H:\\EmotionDetection\\Saif mohammad\\taining set\\joy-ratings-0to1.txt", 'joy'),
        ("H:\\EmotionDetection\\Saif mohammad\\taining set\\sadness-ratings-0to1.txt", 'sadness'),
    ]
    # NOTE(review): anger is read from the "with intensity labels" folder
    # while the other three come from "without intensity labels" -- confirm.
    test_sources = [
        ("H:\\EmotionDetection\\Saif mohammad\\test set\\with intensity labels\\anger.txt", 'anger'),
        ("H:\\EmotionDetection\\Saif mohammad\\test set\\without intensity labels\\fear.txt", 'fear'),
        ("H:\\EmotionDetection\\Saif mohammad\\test set\\without intensity labels\\joy.txt", 'joy'),
        ("H:\\EmotionDetection\\Saif mohammad\\test set\\without intensity labels\\sadness.txt", 'sadness'),
    ]

    train = []
    for path, emotion in train_sources:
        train.extend(_read_labelled_lines(path, emotion, value_set))

    test = []
    for path, emotion in test_sources:
        test.extend(_read_labelled_lines(path, emotion, value_set))

    model = DecisionTreeClassifier(train)
    # NOTE(review): the message and the dump file name say Naive Bayes, but
    # the model is a decision tree; both are left unchanged in case other
    # code depends on them.
    print("accuracy label of Naive Bayes Classifier:{:.4f}".format(model.accuracy(test)))
    print("Training completed....")
    # Dump the trained model
    with open("H:\\Project_CDAC\\Models\\model_NB.pkl", "wb") as fp:
        pickle.dump(model, fp)
    print("Serialization of model completed")
Exemplo n.º 11
0
# trains = []
# for i in range(int(size)):
#     trains.append(train[i])
# for i in range(250, int(size)+250):
#     trains.append(train[i])

trains = train

# Build and score only the classifier selected by `choice`; anything other
# than "1".."4" is rejected up front.
if choice not in ("1", "2", "3", "4"):
    print("Bad input!")

elif choice == "1":
    print("\n#NaiveBayesClassifier")
    cl1 = NaiveBayesClassifier(trains)
    print("Classifier: Naive Bayes -- Accuracy: ", cl1.accuracy(test), "\n")

elif choice == "2":
    print("\n#DecisionTreeClassifier")
    cl2 = DecisionTreeClassifier(trains)
    print("Classifier: Decision Tree -- Accuracy: ", cl2.accuracy(test), "\n")

elif choice == "3":
    print("\n#MaxEntClassifier")
    cl3 = MaxEntClassifier(trains)
    print("Classifier: Maximum Entropy -- Accuracy: ", cl3.accuracy(test),
          "\n")

else:
    # Only "4" can reach this branch.
    print("\n#NLTKClassifier")
    cl4 = NLTKClassifier(trains)
    print("Classifier: NLTK -- Accuracy: ", cl4.accuracy(test), "\n")
        r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
        '', row[0])
    key = key.replace('"', '')
    blob = TextBlob(unicode(key, 'utf-8', 'ignore'))
    key = ' '.join(blob.noun_phrases)
    value = row[1].upper()
    if count < 120:
        training_data.append((key, value))
    else:
        test_data.append((key, value))
    count += 1

# print is written in its call form with a single argument throughout, which
# produces the same output on Python 2 and is also valid on Python 3.
# The doubled space reproduces what the two-item print statement emitted.
print("Training Set is Processed ->  {0}".format(count))

print("Learning in progress ...")
classifier = DecisionTreeClassifier(training_data)
print("Classifier Ready")

# Measure accuracy on the held-out rows first, then add them to the model.
print(classifier.accuracy(test_data))
classifier.update(test_data)
print(
    classifier.classify(
        u"Prosthetic and Orthotic Care (POC), an independent prosthetics and orthotics company serving disabled individuals in Southern Illinois and Eastern Missouri, has discovered that an unauthorized individual has stolen the protected health information of 23,015 patients.The cyberattack occurred in June 2016, although POC only became aware of the hacking incident on July 10. The hacker gained access to patient data by exploiting security flaw in a third party software system that had been purchased by POC. The attack was conducted by a hacker operating under the name – TheDarkOverlord – who was also responsible for the cyberattacks on Athens Orthopedic Clinic and Midwest Orthopedics Group, in addition to a hack of as of yet unnamed health insurer. In total, the records of over 9.5 million patients are understood to have been obtained by the hacker.According to a breach notice issued by POC, the stolen data include names, addresses and other contact information, internal ID numbers, billing amounts, appointment dates, and diagnostic codes. Some patients also had their Social Security number, date of birth, procedure photographs, health insurer’s names, and other identification information stolen. The breach total number was included in the posting of the third party software vendor who was hacked and affected many medical clinics, practices and facilities."
        .encode('ascii', errors='ignore')))
"""
print "Writing Results to csv"
flag = True
with open('Human_resultsOnly - Copy.csv') as inp:
    with open('Taxonomy output.csv', 'wb') as oup:
        reader = csv.reader(inp)
        writer = csv.writer(oup)
Exemplo n.º 13
0
training_array_stemmed_without_sw = data.get_training_array_stemmed_without_sw(data)

# Fetch each variant of the test set and echo it under its own heading.
test_array = data.get_test_array(data)
print('test_array')
print(test_array)

test_array_without_sw = data.get_test_array_without_sw(data)
print('test_array_without_sw')
print(test_array_without_sw)

test_array_stemmed_without_sw = data.get_test_array_stemmed_without_sw(data)
print('test_array_stemmed_without_sw')
print(test_array_stemmed_without_sw)

# Decision tree: one classifier per pre-processing stage, each handed to
# classify_review.
print('\n************ DecisionTreeClassifier ********************\n')
for heading, training_rows in (
        ('Before pre-processing \n', training_array),
        ('\n After removing stop-words \n', training_array_without_sw),
        ('\n After stemming \n', training_array_stemmed_without_sw)):
    print(heading)
    cl = DecisionTreeClassifier(training_rows)
    classify_review(cl)

# Naive Bayes: the same comparison for the first two stages.
print('\n ************ NaiveBayesClassifier ********************\n')
for heading, training_rows in (
        ('Before pre-processing\n', training_array),
        ('\n After removing stop-words \n', training_array_without_sw)):
    print(heading)
    cl = NaiveBayesClassifier(training_rows)
    classify_review(cl)
print('\n After stemming \n')
Exemplo n.º 14
0
           ('Fantastic Mr Fox is an awesome film!', 'neg'),
           ('Dragonball Evolution is simply terrible!!', 'pos')]
"""
TextBlob provides a built-in classifiers module for creating a custom classifier.
So, let’s quickly import it and create a basic classifier.
"""

from textblob.classifiers import NaiveBayesClassifier
classifier = NaiveBayesClassifier(training)
nb_accuracy = classifier.accuracy(testing)
print(nb_accuracy)
"""
classifier.show_informative_features()
classifier.show_informative_features(3)

As, we can see that if the text contains “is”, 
then there is a high probability that the statement will be negative.
"""

# Classify a handful of sentences; the returned labels are not stored or
# printed here.
for sentence in ("the weather is terrible!",
                 "I am very happy today",
                 "This book was so helpful",
                 "I'm excited to try my new classifier.",
                 "This is an awesome library!"):
    classifier.classify(sentence)

## decision tree classifier
from textblob.classifiers import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier(training)
dt_accuracy = dt_classifier.accuracy(testing)
print(dt_accuracy)

###############################################################################
    if category == expected:
        correct += 1
stop_nbc = time.time()
elapsed = stop_nbc - start_nbc
total_tests = len(sentences)
print("Number of tests: " + str(total_tests))
print("Correct tests: " + str(correct))
# With no test sentences there is nothing to divide by; report 0.0 instead of
# raising ZeroDivisionError.
accuracy = correct / total_tests if total_tests else 0.0
print("Naive Bayes Classifier accuracy: " + str(accuracy))
print("Testing time (in seconds): " + str(elapsed))

print()
# Decision Tree Classifier
print("Training Decision Tree Classifier...")
start_dtc = time.time()
with open('training.json', 'r') as training:
    dtc = DecisionTreeClassifier(training, format="json")
stop_dtc = time.time()
print("Training Decision Tree Classifier completed...")
elapsed = stop_dtc - start_dtc
print("Training time (in seconds): " + str(elapsed))
print("Testing Decision Tree Classifier...")
correct = 0
start_dtc = time.time()
for i, sentence in enumerate(sentences):
    # Compare case-insensitively so label casing does not count as an error.
    category = str(dtc.classify(sentence)).lower()
    expected = str(categories[i]).lower()
    if category == expected:
        correct += 1
stop_dtc = time.time()
elapsed = stop_dtc - start_dtc
print("Number of tests: " + str(len(sentences)))
Exemplo n.º 16
0

# Give every split a fresh 0..n-1 index so positional and label lookups agree.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


# NOTE(review): `train` is serialised here but only (re)assigned a few lines
# below -- confirm it is already defined earlier in the file.
train_csv = pd.DataFrame.to_csv(train, index=False)
# The context manager closes the file even if the write fails, and the file
# is no longer truncated before the CSV text has been produced.
with open("train.csv", "w") as var:
    var.write(train_csv)

train = pd.concat([X_train['text'], y_train], axis=1)
with open('train.csv', 'r') as fp2:
    cl2 = DecisionTreeClassifier(fp2, format="csv")


# Classify every training text and keep the extracted features alongside.
pred_train = []
feature_train = []
true_train = train.label
for instance in train.text:
    feature_train.append(feats(instance))
    blob = TextBlob(instance, classifier=cl2)
    pred_train.append(int(float(blob.classify())))


# Count the predictions that match the true training label.
count = 0
for i, predicted in enumerate(pred_train):
    if predicted == y_train[i]:
        count += 1
Exemplo n.º 17
0
# print training_data_raw
# Keyed by the first column, so repeated texts keep the label of their last row.
training_data_set = {
    row[0]: row[1] for key, row in training_data_raw.iterrows()
}

# Pattern of URL-like substrings removed from every text before training.
url_pattern = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''

training_data = []
for key, value in training_data_set.iteritems():
    text_without_urls = re.sub(url_pattern, '', key)
    label = value.upper()
    training_data.append((unicode(text_without_urls, 'utf-8', 'ignore'), label))

classifier = DecisionTreeClassifier(training_data)

# print(classifier.classify(u"Prosthetic and Orthotic Care (POC), an independent prosthetics and orthotics company serving disabled individuals in Southern Illinois and Eastern Missouri, has discovered that an unauthorized individual has stolen the protected health information of 23,015 patients.The cyberattack occurred in June 2016, although POC only became aware of the hacking incident on July 10. The hacker gained access to patient data by exploiting security flaw in a third party software system that had been purchased by POC. The attack was conducted by a hacker operating under the name – TheDarkOverlord – who was also responsible for the cyberattacks on Athens Orthopedic Clinic and Midwest Orthopedics Group, in addition to a hack of as of yet unnamed health insurer. In total, the records of over 9.5 million patients are understood to have been obtained by the hacker.According to a breach notice issued by POC, the stolen data include names, addresses and other contact information, internal ID numbers, billing amounts, appointment dates, and diagnostic codes. Some patients also had their Social Security number, date of birth, procedure photographs, health insurer’s names, and other identification information stolen. The breach total number was included in the posting of the third party software vendor who was hacked and affected many medical clinics, practices and facilities.".encode('ascii', errors='ignore')))
# print(classifier.classify(u"Providence Health & Services in Oregon is notifying about 5,400 current and former patients that a former employee may have improperly accessed their patient records.Providence said in a statement Friday that it learned of the breach in May during an internal audit and had since fired the Portland-based employee.The audit found the worker had accessed health records between July 2012 and April 2016. It says the worker viewed demographic and medical treatment information, and may also have seen insurance information and Social Security numbers.".encode(encoding='ascii', errors='ignore')))

test_data_raw = pd.read_csv('Test_set.csv', header=0)

test_data = []
test_set = []
# Call form of print: same output on Python 2, and valid on Python 3 too.
print("Step in 1")
for key, row in test_data_raw.iterrows():
    # Take the first column of the row, decode it, then strip URL-like
    # substrings from it.
    item = row[0]
    item = unicode(item, 'utf-8', 'ignore')
    item = re.sub(
        r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''',
        '', item)
Exemplo n.º 18
0
    print(sentence.classify())

# Compute accuracy
print("Accuracy: {0}".format(cl.accuracy(test)))

# Show the most informative features (no count is passed, so the library
# default applies).
cl.show_informative_features()

# Classify a text file with a decision tree

train = [('Buy cheap drugs', 'spam'), ('Cheap viagra', 'spam'),
         ('Win 1000 dollar', 'spam'), ('Greatings, how are you?', 'ham'),
         ('What an awesome picture', 'ham'), ('Send me your adress', 'ham'),
         ('viagra', 'spam')]

tr = DecisionTreeClassifier(train)

# Read the whole file inside a context manager so the handle is closed as
# soon as the text is in memory.
# NOTE(review): the name `file` shadows a builtin on Python 2; it is kept in
# case later code refers to it.
with open("test.txt") as file:
    t = file.read()
print(type(t))
blob = TextBlob(t, classifier=tr)
# The value of this attribute access is discarded.
blob.tags
print(blob)

# Classify sentence by sentence, then the document as a whole.
for sentence in blob.sentences:
    print(sentence)
    print(sentence.classify())

print(blob.classify())
print(tr.pseudocode())
print(tr.pretty_format())
Exemplo n.º 19
0
# Fixed seed so the shuffle below picks the same reviews on every run.
random.seed(1)

train = [
    ('I love this sandwich.', 'pos'),
    ('This is an amazing place!', 'pos'),
    ('I feel very good about these beers.', 'pos'),
    ('This is my best work.', 'pos'),
    ("What an awesome view", 'pos'),
    ('I do not like this restaurant', 'neg'),
    ('I am tired of this stuff.', 'neg'),
    ("I can't deal with this", 'neg'),
    ('He is my sworn enemy!', 'neg'),
    ('My boss is horrible.', 'neg'),
]
test = [
    ('The beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg'),
]

cl = DecisionTreeClassifier(train)

# Movie review data: one (word list, category) pair per review file.
reviews = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
random.shuffle(reviews)
new_train = reviews[:100]
new_test = reviews[100:150]

# Extend the classifier with the additional training reviews.
cl.update(new_train)

# Accuracy over the hand-written test sentences plus the held-out reviews.
combined_test = test + new_test
accuracy = cl.accuracy(combined_test)
print("Accuracy: {0}".format(accuracy))
Exemplo n.º 20
0
 def setUp(self):
     """Build the decision-tree classifier the tests work on."""
     tree = DecisionTreeClassifier(train_set)
     self.classifier = tree
Exemplo n.º 21
0
                train.append( (ngrams, str(doc['class'])) )

            
        ## TEST SET -----------------
        # Collect (features, label) pairs from every document whose 'class'
        # field is not 1.
        test = []
        results = dbtest_set.find({'class':{'$ne':1}})  # {'class':{'$eq':1}}
        for doc in results:
            # param[0] / param[1] select the nested field used as the feature.
            if(param[1] not in ['bigrams','trigrams']):
                test.append( (doc[param[0]][param[1]], str(doc['class'])) )
            else:
                # join the ngrams together so we can use them
                ngrams = join_ngrams(doc[param[0]][param[1]])
                test.append( (ngrams, str(doc['class'])) )


        cl = DecisionTreeClassifier(train)
        # NOTE(review): `type` shadows the builtin for the rest of this scope.
        type = 'DecisionTree'
        # cl = NaiveBayesClassifier(train)
        # type = 'NaiveBayes'

        # wraps NLTK simply: return nltk.classify.accuracy(self.classifier,
        # test_features)
        # Accuracy is scaled to a percentage for the report below.
        acc = cl.accuracy(test) * 100
        print('Classifier Type      | ', type, ' with ', '.'.join(param))
        print('Accuracy, train/test | ', '=',  str(acc), '% ,', len(train), \
            '/', len(test))
        #cl.show_informative_features(30)
        print ('\n')
        print ('\n')

# The first 120 rows become training samples, every later row a test sample.
for key, row in training_data_raw.iterrows():
    # Strip URL-like substrings and double quotes, then reduce the text to
    # its noun phrases.
    key = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', '', row[0])
    key = key.replace('"', '')
    blob = TextBlob(unicode(key, 'utf-8', 'ignore'))
    key = ' '.join(blob.noun_phrases)
    value = row[1].upper()
    if count < 120:
        training_data.append((key, value))
    else:
        test_data.append((key, value))
    count += 1

# print is written in its call form with a single argument throughout, which
# produces the same output on Python 2 and is also valid on Python 3.
# The doubled space reproduces what the two-item print statement emitted.
print("Training Set is Processed ->  {0}".format(count))

print("Learning in progress ...")
classifier = DecisionTreeClassifier(training_data)
print("Classifier Ready")

# Measure accuracy on the held-out rows first, then add them to the model.
print(classifier.accuracy(test_data))
classifier.update(test_data)
print(classifier.classify(u"Prosthetic and Orthotic Care (POC), an independent prosthetics and orthotics company serving disabled individuals in Southern Illinois and Eastern Missouri, has discovered that an unauthorized individual has stolen the protected health information of 23,015 patients.The cyberattack occurred in June 2016, although POC only became aware of the hacking incident on July 10. The hacker gained access to patient data by exploiting security flaw in a third party software system that had been purchased by POC. The attack was conducted by a hacker operating under the name – TheDarkOverlord – who was also responsible for the cyberattacks on Athens Orthopedic Clinic and Midwest Orthopedics Group, in addition to a hack of as of yet unnamed health insurer. In total, the records of over 9.5 million patients are understood to have been obtained by the hacker.According to a breach notice issued by POC, the stolen data include names, addresses and other contact information, internal ID numbers, billing amounts, appointment dates, and diagnostic codes. Some patients also had their Social Security number, date of birth, procedure photographs, health insurer’s names, and other identification information stolen. The breach total number was included in the posting of the third party software vendor who was hacked and affected many medical clinics, practices and facilities.".encode('ascii', errors='ignore')))
"""
print "Writing Results to csv"
flag = True
with open('Human_resultsOnly - Copy.csv') as inp:
    with open('Taxonomy output.csv', 'wb') as oup:
        reader = csv.reader(inp)
        writer = csv.writer(oup)
        for row in reader:
            if flag:
                row.append("Taxonomy")
Exemplo n.º 23
-2
# Collapse the raw rows into text -> label; a repeated text keeps the label
# of its last row.
training_data_set = dict(
    (row[0], row[1]) for key, row in training_data_raw.iterrows())

# URL-like substrings matched by this pattern are removed from each text.
url_regex = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''

training_data = []
for key, value in training_data_set.iteritems():
    cleaned = re.sub(url_regex, '', key)
    upper_label = value.upper()
    sample = (unicode(cleaned, 'utf-8', 'ignore'), upper_label)
    training_data.append(sample)

classifier = DecisionTreeClassifier(training_data)

# print(classifier.classify(u"Prosthetic and Orthotic Care (POC), an independent prosthetics and orthotics company serving disabled individuals in Southern Illinois and Eastern Missouri, has discovered that an unauthorized individual has stolen the protected health information of 23,015 patients.The cyberattack occurred in June 2016, although POC only became aware of the hacking incident on July 10. The hacker gained access to patient data by exploiting security flaw in a third party software system that had been purchased by POC. The attack was conducted by a hacker operating under the name – TheDarkOverlord – who was also responsible for the cyberattacks on Athens Orthopedic Clinic and Midwest Orthopedics Group, in addition to a hack of as of yet unnamed health insurer. In total, the records of over 9.5 million patients are understood to have been obtained by the hacker.According to a breach notice issued by POC, the stolen data include names, addresses and other contact information, internal ID numbers, billing amounts, appointment dates, and diagnostic codes. Some patients also had their Social Security number, date of birth, procedure photographs, health insurer’s names, and other identification information stolen. The breach total number was included in the posting of the third party software vendor who was hacked and affected many medical clinics, practices and facilities.".encode('ascii', errors='ignore')))
# print(classifier.classify(u"Providence Health & Services in Oregon is notifying about 5,400 current and former patients that a former employee may have improperly accessed their patient records.Providence said in a statement Friday that it learned of the breach in May during an internal audit and had since fired the Portland-based employee.The audit found the worker had accessed health records between July 2012 and April 2016. It says the worker viewed demographic and medical treatment information, and may also have seen insurance information and Social Security numbers.".encode(encoding='ascii', errors='ignore')))

test_data_raw = pd.read_csv('Test_set.csv', header=0)

test_data=[]
test_set=[]
# Call form of print: same output on Python 2, and valid on Python 3 too.
print("Step in 1")
for key, row in test_data_raw.iterrows():
   # Take the first column of the row, decode it, strip URL-like substrings
   # and classify what is left.
   item = row[0]
   item = unicode(item, 'utf-8', 'ignore')
   item = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', '', item)
   classification = classifier.classify(item)
   #test_data.append({'Incident': item,  'Classification' : classification})