def classifyNonClusteredJira(self):
        columnName = 'C'
        for index, row in self.df.iterrows():
            clusterName = row['Labels']
            keyWords = row['KeyWords']

            if (clusterName in constantsObj.INITIAL_CLUSTERS):
                self.issueSet.append(({
                    "class": row['Labels'],
                    "sentence": keyWords
                }))

        for issue in self.issueSet:
            self.jiraTrainer.train(issue['sentence'], issue['class'])

        jiraClassifier = Classifier(self.jiraTrainer.data, tokenizer)

        for index, row in self.df.iterrows():
            clusterName = row['Labels']
            keyWords = row['KeyWords']
            if (clusterName not in constantsObj.INITIAL_CLUSTERS):
                identifiedCluster = jiraClassifier.classify(row['KeyWords'])[0][0]
                self.issueSet.append(({
                    "class": identifiedCluster,
                    "sentence": keyWords
                }))
                self.nonClusteredJirasAfterClusteringFile.write(
                    "%s --- %s\n" % (keyWords, identifiedCluster))
                '''writeIndex = columnName + str(index-2)
                self.activeWorkSheet[writeIndex] = identifiedCluster'''

        self.nonClusteredJirasAfterClusteringFile.close()
        return self.issueSet
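Nearly every example on this page follows the same train-then-classify cycle from the naiveBayesClassifier package. Below is a minimal, self-contained sketch of that cycle; the training sentences are borrowed from the commented-out sample set further down the page and are purely illustrative, and the comment on the result format reflects the convention the later examples rely on.

from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

# feed labelled texts to the trainer one at a time
trainer = Trainer(tokenizer)
trainer.train('eat to lose weight', 'health')
trainer.train('Syria is the main issue, Obama says', 'politics')

# build a classifier from the accumulated counts and score unseen text
classifier = Classifier(trainer.data, tokenizer)
classification = classifier.classify('you should not eat much')
print(classification)  # (category, score) pairs, sorted by probability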
Example No. 2
    def __call__(self, text):
        context = self.context
        request = self.request
        response = request.response
        catalog = context.portal_catalog

        bayesFilter = api.portal.get_registry_record(
            'i8d.content.browser.coverSetting.ICoverSetting.bayesFilter')

        trainingSet = []
        for line in bayesFilter.split('\n'):
            trainingSet.append({
                'category': 'hasKey',
                'text': safe_unicode(line)
            })

        trainer = Trainer(tokenizer)
        for record in trainingSet:
            trainer.train(record['text'], record['category'])
        classifier = Classifier(trainer.data, tokenizer)

        result = classifier.classify(safe_unicode(text))

        import pdb
        pdb.set_trace()
Example No. 3
  def train_spam_texts():
    # Reading dataset file
    dataset_lang = "ru"
    dataset_file = codecs.open(os.path.abspath(os.curdir) + "/data/assets/spam_texts.json", "r", "utf_8_sig")
    dataset_data = json.load(dataset_file)

    # Preparing adverts spam dataset
    prepared_dataset = []
    for idx, item in enumerate(dataset_data[dataset_lang]["adverts"]):
      prepared_dataset.append({
        "text": item["text"],
        "category": "adverts"
      })
    
    # Training
    # (Will be replaced by another library soon)
    advertsTrainer = Trainer(tokenizer)
    for one_dataset_item in prepared_dataset:
      advertsTrainer.train(one_dataset_item["text"], one_dataset_item["category"])
    adverts_classifier = Classifier(advertsTrainer.data, tokenizer)

    # Usage
    # classification = adverts_classifier.classify("рассылка")
    # category_chance = classification[0][1]
    # print(category_chance)
Example No. 4

    def classifyNewJiraToOneOfTheClusters(self, inputTrainingData, inputJira):
        for item in inputTrainingData:
            self.jiraTrainer.train(item['sentence'], item['class'])
        jiraClassifier = Classifier(self.jiraTrainer.data, tokenizer)
        clusterForInputJira = jiraClassifier.classify(inputJira)

        return clusterForInputJira
Example No. 5
    def getKeywords(self, html):

        text = self.getHtml2Text(html)
#        print text
        text = self.zhsJieba(text)

        # fetch the registry record
        reg = api.portal.get_registry_record('mingjing.content.browser.mjnetSetting.IMJNetSetting.catDict')
        trainSet = []
        for item in reg:
            key = item.split('|||')[0]
            for line in reg[item].split('\n'):
                zhsString = self.zhsJieba(line)
                trainSet.append({'category': key, 'text': zhsString})

        # classify the article with naive Bayes
        newsTrainer = Trainer(tokenizer)
        for news in trainSet:
            newsTrainer.train(news['text'].encode('utf-8'), news['category'])
        newsClassifier = Classifier(newsTrainer.data, tokenizer)
        classification = newsClassifier.classify(text)
        print classification
#        import pdb; pdb.set_trace()
        if classification[0][1] == 0.0:
            classification.insert(0, (u'n99', 0.0))
        result = []
        for item in classification:
            result.append(item[0])
        return result
Example No. 6

def classificationNB(index):
    '''
    Train the Naive Bayes classifier and classify data
    naiveBayesClassifier is used.
    https://github.com/muatik/naive-bayes-classifier
    '''
    # Initial training set from file
    trainset = []
    f = open('E:\\databases\\trainset.txt', 'r')
    for line in f:
        if len(line.strip()) == 0:
            continue
        line = line.strip().split()
        assert len(line) == 22
        trainset.append({
            'text': '%08d' % int(line[(index + 1) * 2]),
            'category': line[(index + 1) * 2 + 1]
        })
    pass  # for line in f
    f.close()

    # Train the classifier
    trainer = Trainer(tokenizer)
    for case in trainset:
        trainer.train(case['text'], case['category'])
    classifier = Classifier(trainer.data, tokenizer)

    # Classification for each of the rest sets
    for i in range(10):
        if index == i:
            continue
        print '%-2d ~ %-2d' % (index, i)
        # Read cases from the file and classify each case
        f = open('E:\\databases\\classification%02d.txt' % (i + 1), 'r')
        results = []
        count = 0
        for line in f:
            count += 1
            line = line.strip()
            if len(line) == 0:
                continue
            if count == 1:  # the first line -- title
                header = 'CAT%02d' % (index + 1)
                assert header not in line
                results.append('%s\t%s' % (line, header))
                continue
            pass  # if count == 1
            case = line.split()
            assert len(case) >= 4
            clf = classifier.classify(case[2])
            results.append('%s\t%s' % (line, clf))
        pass  # for line in f
        f.close()

        # Save the results back to the file
        f = open('E:\\databases\\classification%02d.txt' % (i + 1), 'w')
        for re in results:
            f.write('%s\n' % re)
        f.close()
    pass  # for i in range(10)
Example No. 7
    def train(self):
        """Train on base and FB data"""

        with open('res/data/base_data.csv', 'r') as csv_file:

            reader = csv.reader(csv_file)
            i = 0
            for line in reader:

                i += 1

                line_split = line
                read_dict = {}

                if i == 1 or len(line_split) <= 2 or len(line_split[0]) == 0:
                    continue

                read_dict['class'] = line_split[2].strip()
                # Accounting for our inconsistency in Spreadsheet
                if read_dict["class"] == "Real":
                    read_dict['text'] = line_split[6].strip()
                else:
                    read_dict['text'] = line_split[5].strip()

                print(read_dict)

                self.training_data.append(read_dict)

        print('---->>>>>><<<<<<<-------')

        with open('res/data/fb_data.csv', 'r') as csv_file:

            reader = csv.reader(csv_file)
            i = 0
            for line in reader:

                i += 1

                line_split = line
                read_dict = {}

                if i == 1 or len(line_split) <= 2:
                    continue

                read_dict['class'] = line_split[2].strip()
                read_dict['text'] = line_split[5].strip()

                print(read_dict)

                self.training_data.append(read_dict)

        #print training_data
        for data in self.training_data:
            self.newsTrainer.train(data['text'], data['class'])

        self.newsClassifier = Classifier(
            self.newsTrainer.data,
            tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))
Example No. 8

def classify(filename, size):

    trainingSet, testingSet = make_chronological_sets.create_sets(
        filename, size)

    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])

    classifier = Classifier(
        trainer.data, tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    mal_mal = 0
    mal_clean = 0
    clean_clean = 0
    clean_mal = 0

    for sample in testingSet:

        predicted = classifier.classify(sample['url'])[0][0]
        actual = sample['result']

        if predicted == 'malicious' and actual == 'malicious':
            mal_mal += 1
        elif predicted == 'malicious' and actual == 'clean':
            mal_clean += 1
        elif predicted == 'clean' and actual == 'clean':
            clean_clean += 1
        elif predicted == 'clean' and actual == 'malicious':
            clean_mal += 1

    prop_caught = float(mal_mal) / float(mal_mal + clean_mal)
    prop_missed = float(clean_mal) / float(mal_mal + clean_mal)

    ## Stuff to get proportions:

    # size = float(size)

    # mal_mal = float(mal_mal)/size
    # mal_clean = float(mal_clean)/size
    # clean_mal = float(clean_mal)/size
    # clean_clean = float(clean_clean)/size

    ## Confusion matrix stuff:

    # confusionMatrix = [['Actually malicious', mal_mal, clean_mal], ['Actually clean', mal_clean, clean_clean]]

    # print tabulate(confusionMatrix, headers=['', 'Predicted malicious', 'Predicted clean'])

    print "Total: " + str(mal_mal + mal_clean + clean_mal + clean_clean)
    print "Malware: " + str(mal_mal + clean_mal)
    print "Clean: " + str(mal_clean + clean_clean)
    print "Caught: " + str(mal_mal) + " (" + "{:.1%}".format(prop_caught) + ")"
    print "Missed: " + str(clean_mal) + " (" + "{:.1%}".format(
        prop_missed) + ")"
Example No. 9
def train_classifier(newsData_train):

    data_process = Data_process()

    for data in newsData_train:
        data_process.final_process(data['text'], data['category'])

    newsClassifier = Classifier(data_process, data_process.tokenizer)

    return newsClassifier
Example No. 10
def get_classer():
    newsTrainer = Trainer(tokenizer)

    for news in newsSet:
        newsTrainer.train(news['text'], news['category'])

    # When you have sufficient trained data, you are almost done and can start to use
    # a classifier.
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    return newsClassifier
Example No. 11
    def init(cls, lang='tr', namesCollection=NamesCollection, classifier=None):

        cls.lang = lang
        cls.namesCollection = namesCollection

        if classifier:
            cls.classifier = classifier
        else:
            cls.classifier = Classifier(CachedModel.get(lang), tokenizer)

        cls.initialized = True
Example No. 12
    def train(self):
        """Train on base and FB data"""

        # Run through each training example in data interface and
        # feed them into model
        for data_point in self.data_interface.arr:
            data_class = data_point[2].strip()  # Class is "Credibility"
            data_text = data_point[4].strip()  # Text is "Content URL"
            self.newsTrainer.train(data_text, data_class)

        self.newsClassifier = Classifier(self.newsTrainer.data, \
            tokenizer.Tokenizer(stop_words = [], signs_to_remove = ["?!#%&"]))
Example No. 13
def classify(filename, size, url, result):

    trainingSet = make_training_set.create_set(filename, size)

    trainer = Trainer(tokenizer.Tokenizer(stop_words = [], signs_to_remove = [""]))

    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])

    classifier = Classifier(trainer.data, tokenizer.Tokenizer(stop_words = [], signs_to_remove = [""]))

    print "Expected: " + result
    print classifier.classify(url)
Example No. 14
def post_logfile():
    if request.method == 'GET':
        log_file = request.args['symptom']
        print(log_file)
        diseaseclassifier = Trainer(tokenizer)  #STARTS CLASSIFIER
        with open("Dataset.csv", "r") as file:  #OPENS DATASET
            for i in file:  #FOR EACH LINE
                lines = i.strip().split(",")  #PARSE CSV <DISEASE> <SYMPTOM>
                diseaseclassifier.train(lines[1], lines[0])  #TRAINING
        diseaseclassifier = Classifier(diseaseclassifier.data, tokenizer)
        classification = diseaseclassifier.classify(log_file)  #CLASSIFY INPUT
        print classification

        return json.dumps(dict(classification))
Example No. 15
def create_naive_bayes_classifier(training_examples, training_annotations):
    print("creating naive bayes classifier")
    annotations = [categories[x] for x in training_annotations]

    news_trainer = Trainer(
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
    for example, annotation in zip(training_examples, annotations):
        news_trainer.train(example, annotation)
    classifier = Classifier(
        news_trainer.data,
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
    print("\t->done")
    return classifier
Example No. 16

    def train(self):
        with open('src/URL.csv', 'r') as csv_file:
            reader = csv_file.readlines()
            for line in reader:
                read_dict = {}
                line_split = line.split(',')
                if len(line_split) < 2 or len(line_split[0]) == 0:
                    continue
                read_dict['text'] = line_split[0].strip()
                read_dict['class'] = line_split[1].strip()

                self.training_data.append(read_dict)

        #print training_data
        for data in self.training_data:
            self.newsTrainer.train(data['text'], data['class'])

        self.newsClassifier = Classifier(self.newsTrainer.data, tokenizer.Tokenizer(stop_words = [], signs_to_remove = ["?!#%&"]))
Example No. 17
def post_logfile():
    if request.method == 'POST':
        log_file = request.args['symptom']
        print(log_file)
        diseaseclassifier = Trainer(tokenizer)  #STARTS CLASSIFIER
        with open("Dataset.csv", "r") as file:  #OPENS DATASET
            for i in file:  #FOR EACH LINE
                lines = i.strip().split(",")  #PARSE CSV <DISEASE> <SYMPTOM>
                diseaseclassifier.train(lines[1], lines[0])  #TRAINING
        diseaseclassifier = Classifier(diseaseclassifier.data, tokenizer)
        classification = diseaseclassifier.classify(log_file)  #CLASSIFY INPUT
        print classification
        result = []
        for item in classification:
            obj = CustomType(item[0], item[1])
            result.append(json.loads(obj.toJSON()))
        # return json.dumps(OrderedDict(classification))
        return json.dumps(result, indent=4)
Example No. 18
def classify(filename, size):

    trainingSet, testingSet = make_balanced_sets.create_sets(filename, size)

    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])

    classifier = Classifier(
        trainer.data, tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    mal_mal = 0
    mal_clean = 0
    clean_clean = 0
    clean_mal = 0

    for sample in testingSet:

        predicted = classifier.classify(sample['url'])[0][0]
        actual = sample['result']

        if predicted == 'malicious' and actual == 'malicious':
            mal_mal += 1
        elif predicted == 'malicious' and actual == 'clean':
            mal_clean += 1
        elif predicted == 'clean' and actual == 'clean':
            clean_clean += 1
        elif predicted == 'clean' and actual == 'malicious':
            clean_mal += 1

    size = float(size)

    mal_mal = float(mal_mal) / size
    mal_clean = float(mal_clean) / size
    clean_mal = float(clean_mal) / size
    clean_clean = float(clean_clean) / size

    confusionMatrix = [[mal_mal, clean_mal], [mal_clean, clean_clean]]

    pprint(confusionMatrix)
    print "Accuracy: " + str(mal_mal + clean_clean)
    print "False positives (predicted clean when malicious): " + str(clean_mal)
    print "False negatives (predicted malicious when clean): " + str(mal_clean)
Example No. 19
def create_nbc_nb_classifier(training_dataset):
    training_examples, training_annotations = training_dataset
    # training_annotations = [int(not bool(annotation)) for annotation in training_annotations]
    parsed_training_examples = [
        set(tokenize(example)) for example in training_examples
    ]

    tr = Trainer(
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
    for example, annotation in zip(parsed_training_examples,
                                   training_annotations):
        tr.train(example, annotation)

    print("number of tokens seen: %s" % len(tr.data.frequencies.keys()))
    return tr, Classifier(
        tr.data,
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
Example No. 20
    def neyronka(self, _str):
        newsTrainer = Trainer(tokenizer)
        with open('o', 'rt', encoding='utf8') as csvfile:
            # build the training set directly instead of eval()'ing a constructed string
            newsSet = []
            for i in csvfile.readlines():
                if i == '\n':
                    continue
                theme, text = i.split('***')
                newsSet.append({'text': text.strip(), 'category': theme})
            for news in newsSet:
                newsTrainer.train(news['text'], news['category'])
            newsClassifier = Classifier(newsTrainer.data, tokenizer)
            classification = newsClassifier.classify(_str)
            return sorted(classification, key=lambda x: -x[1])
Example No. 21

def determine(sentence):
    newsTrainer = Trainer(tokenizer)
    newsSet = []

    with open('data.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            newsSet.append({'fact': row['Fact'], 'decision': row['Decision']})

    for news in newsSet:
        newsTrainer.train(news['fact'], news['decision'])

    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    classification = newsClassifier.classify(sentence)
    # False
    false = classification[0][1]
    false = str(false).split('.')[0]
    # True
    true = classification[1][1]
    true = str(true).split('.')[0]
    data = [true, false]
    return data
Example No. 22
import json, os, sys, re
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier
##IMPORTS
'''
Usage:

    python GuessDisease.py "symptomA symptomB symptomC"
Example INPUT:
    python GuessDisease.py "agitation exhaustion vomit"
Example OUTPUT:

    {
    "disease": "influenza"
    }


'''

##SETTING UP
diseaseclassifier = Trainer(tokenizer)  #STARTS CLASSIFIER
with open("Dataset.csv", "r") as file:  #OPENS DATASET
    for i in file:  #FOR EACH LINE
        lines = i.strip().split(",")  #PARSE CSV <DISEASE> <SYMPTOM>
        diseaseclassifier.train(lines[1], lines[0])  #TRAINING
diseaseclassifier = Classifier(diseaseclassifier.data, tokenizer)
classification = diseaseclassifier.classify(sys.argv[1])  #CLASSIFY INPUT
print classification[0]  #PRINT CLASSIFICATION
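The docstring above promises JSON output of the form {"disease": ...}, while the script ends by printing the raw top tuple. A small sketch of that final conversion, assuming classify() returns (category, score) pairs as in the other examples and reusing the json import and the classification variable from the snippet above:

top_disease = classification[0][0]  #BEST-SCORING DISEASE
print json.dumps({"disease": top_disease.strip()})  #e.g. {"disease": "influenza"}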
Example No. 23

documentTrainer = Trainer(tokenizer)

documentSet = []


def getTextBasedOnDocumentID(documentID):
    ID = int(documentID.split('_')[1])
    line = linecache.getline('../2.document_set/document_set.csv', ID + 2)
    text = line.split(',"')[1]
    return text


for i in range(0, len(traincsv)):
    documentSet.append({
        'text': getTextBasedOnDocumentID(traincsv[i][0]),
        'category': traincsv[i][1]
    })

for documents in documentSet:
    documentTrainer.train(documents['text'], documents['category'])

newsClassifier = Classifier(documentTrainer.data, tokenizer)

for i in range(0, len(testcsv)):
    data = getTextBasedOnDocumentID(testcsv[i][0])
    classification = newsClassifier.classify(data)
    testcsv[i][1] = int(classification[0][0])
df = pd.DataFrame(testcsv)
df.to_csv("../5.evaluation_file/predicted_cat.csv", index=False)
#np.savetxt("./5.evaluation_file/predicted_cat.csv", testcsv,header="document_id,category" ,delimiter=",")
Example No. 24
def classify(input):
    twitter = Twitter()

    f = open("data.txt", "r")

    data = json.loads(f.read())

    gradeTrainer = Trainer(tokenizer)
    loadTrainer = Trainer(tokenizer)
    lectureTrainer = Trainer(tokenizer)

    print("Training grade ...")
    for subject in data:
        if subject["grade"] != "?":
            review = subject["comment"].replace('.', '\n').split("\n")
            for li in review:
                if len(li.strip()) != 0:
                    gradeTrainer.train(li, subject["grade"])

    print("Training load ...")
    for subject in data:
        if subject["load"] != "?":
            review = subject["comment"].replace('.', '\n').split("\n")
            for li in review:
                if len(li.strip()) != 0:
                    loadTrainer.train(li, subject["load"])

    print("Training lecture ...")
    for subject in data:
        if subject["lecture"] != "?":
            review = subject["comment"].replace('.', '\n').split("\n")
            for li in review:
                if len(li.strip()) != 0:
                    lectureTrainer.train(li, subject["lecture"])

    gradeClassifier = Classifier(gradeTrainer.data, tokenizer)
    loadClassifier = Classifier(loadTrainer.data, tokenizer)
    lectureClassifier = Classifier(lectureTrainer.data, tokenizer)

    input = u"" + input
    classify_input = []

    # keep only the content-bearing POS tags from the KoNLPy Twitter tagger
    content_tags = ('Noun', 'Verb', 'Adjective', 'Adverb',
                    'Exclamation', 'Alpha', 'KoreanParticle')
    for element in twitter.pos(input):
        if element[1] in content_tags:
            classify_input.append(element[0])

    text = " ".join(classify_input)

    print(text)

    gradeClassification = gradeClassifier.classify(text)
    loadClassification = loadClassifier.classify(text)
    lectureClassification = lectureClassifier.classify(text)

    print(
        "\n________________________________________GRADE________________________________________\n"
    )
    print(gradeClassification)
    print(
        "\n________________________________________LOAD_________________________________________\n"
    )
    print(loadClassification)
    print(
        "\n________________________________________LECTURE______________________________________\n"
    )
    print(lectureClassification)

    return gradeClassification, loadClassification, lectureClassification
Example No. 25
    def get(self):
        try:
            print "  "
            print "TestClassifier start"
            print "  "
            # load the stop words from the file into a list
            with open("stop_words.txt", "r") as ins:
                array = []
                for line in ins:
                    array.append((line.rstrip('\n')).decode('unicode-escape'))
            #print array
            newsTrainer = Trainer(
                tokenizer.Tokenizer(stop_words=array,
                                    signs_to_remove=["?!#%&_"]))

            hoy = date.today()

            query = News3.query(News3.date == hoy,
                                News3.news_from.IN([
                                    "uy_press",
                                ]), News3.category == "Política")

            # You need to train the system passing each text one by one to the trainer module.
            #newsSet =[
            #    {'text': 'not to eat too much is not enough to lose weight', 'category': 'health'},
            #    {'text': 'Russia try to invade Ukraine', 'category': 'politics'},
            #    {'text': 'do not neglect exercise', 'category': 'health'},
            #    {'text': 'Syria is the main issue, Obama says', 'category': 'politics'},
            #    {'text': 'eat to lose weight', 'category': 'health'},
            #    {'text': 'you should not eat much', 'category': 'health'}
            #]

            query2 = News3.query(News3.date == hoy,
                                 News3.news_from == "uy_press",
                                 News3.category == "deportes")

            query4 = News3.query(News3.date == hoy,
                                 News3.news_from == "uy_press",
                                 News3.category == "salud")

            #for news in newsSet:
            #    newsTrainer.train(news['text'], news['category'])
            c = 0
            #print query
            for i in query:
                print "  "
                print i.category
                newsTrainer.train(i.html, 'politica')
                #if c == 10: break
                c += 1

            #for i in query2:
            #	newsTrainer.train(i.html, 'deportes')
            #raise Exception('I know Python!')

            #for i in query4:
            #	newsTrainer.train(i.html, 'salud')

            # When you have sufficient trained data, you are almost done and can start to use
            # a classifier.

            # Now you have a classifier which can give a try to classifiy text of news whose
            # category is unknown, yet.
            query3 = News3.query(
                News3.date == hoy,
                News3.news_from.IN([
                    "el_pais",
                ]),
                News3.id.IN([0]),
            )

            ###
            newsClassifier = Classifier(
                newsTrainer.data,
                tokenizer.Tokenizer(stop_words=array,
                                    signs_to_remove=["?!#%&"]))
            #print unknownInstance
            classification = newsClassifier.classify(
                "Vidalín: No quiero que me llamen para saber qué tramite hay que hacer para poner un prostíbulo"
            )

            # the classification variable holds the detected categories sorted
            print " classification "
            print(classification)
        except:
            print traceback.format_exc()
Example No. 26
def article_keywords(article):
    keys = Keywords.objects.get(article=article)
    print keys
    l = [k.keyword for k in keys.keywords.all()]
    print " ".join(l)
    keyset = {'keyword': " ".join(l)}
    return keyset


if __name__ == '__main__':
    print "Starting testing of Bayes Classifer"
    labeled_articles = [
        (a, a.relevant)
        for a in Article.objects.all()[:(len(Article.objects.all()))]
    ]
    print labeled_articles
    featuresets = []
    for (article, relevant) in labeled_articles:
        r = article_keywords(article)
        featuresets.append((r, relevant))
    print featuresets
    train_set, test_set = featuresets[:(len(featuresets))], featuresets[(
        len(featuresets) - 2):]
    print train_set
    newsTrainer = Trainer(tokenizer)
    for f in train_set:
        newsTrainer.train(f[0]['keyword'], f[1])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    url = raw_input("Enter the url: ")
    testurl(url, newsClassifier)
Example No. 27
    if data[no][1] == 'negative':
        cls = data[no][3]
    else:
        cls = data[no][1]

    twi_cont = str_pre_process(data[no][10])

    struct = {'text': twi_cont, 'category': cls}
    #print twi_cont, cls
    train_twi.append(struct)

for twi in train_twi:
    trainer.train(twi['text'], twi['category'])

model = Classifier(trainer.data, tokenizer)

print "Testing..."

for no in range(12000, num_twi):
    twi_cont = str_pre_process(data[no][10])
    classification = model.classify(twi_cont)
    #print classification,
    test_twi.append(classification)

    if data[no][1] == 'negative':
        cls = data[no][3]
    else:
        cls = data[no][1]

    true_cls.append(cls)
Example No. 28
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

sentimentTrainer = Trainer(tokenizer)

# Get the training dataset.
with open('training.csv', 'r') as f:
    data = f.read()
trainset = data.splitlines()

for line in trainset:
    pos1 = line.find(',"')
    pos2 = line.find('",', pos1)
    if pos1 == -1:
        pos1 = line.find(',')
        pos2 = line.find(',', pos1 + 1)
        comment = line[pos1 + 1:pos2]
        sentiment = line[pos2 + 1:]
    else:
        comment = line[pos1 + 2:pos2 - 2]
        sentiment = line[pos2 + 2:]
    sentimentTrainer.train(comment, sentiment)

# Use the classifier.
sentimentClassifier = Classifier(sentimentTrainer.data, tokenizer)

# Classify an unknown review.
unknownInstance = "I don't like the app. It crashes everytime."
classification = sentimentClassifier.classify(unknownInstance)

print classification
Example No. 29
    {'symptoms': 'unresponsiveness', 'disease': 'dementia'},
    {'symptoms': 'lethargy', 'disease': 'dementia'},
    {'symptoms': 'agitation', 'disease': 'dementia'},
    {'symptoms': 'ecchymosis', 'disease': 'dementia'},
    {'symptoms': 'syncope', 'disease': 'dementia'},
    {'symptoms': 'rale', 'disease': 'dementia'},
    {'symptoms': 'unconscious state', 'disease': 'dementia'},
    {'symptoms': 'cough', 'disease': 'dementia'},
    {'symptoms': 'bedridden', 'disease': 'dementia'},
    {'symptoms': 'unsteady gait', 'disease': 'dementia'},
    

    ]



for news in newsSet:
    newsTrainer.train(news['symptoms'], news['disease'])

# When you have sufficient trained data, you are almost done and can start to use
# a classifier.
newsClassifier = Classifier(newsTrainer.data, tokenizer.Tokenizer(stop_words = [], signs_to_remove = ["?!#%&"]))

# Now you have a classifier which can give a try to classifiy text of news whose
# category is unknown, yet.
unknownInstance = "pain fever coughing"
classification = newsClassifier.classify(unknownInstance)

# the classification variable holds the possible categories sorted by 
# their probablity value
print(classification)
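As the comments above note, classification is sorted by probability, so reading off the single best match only takes the first pair. A minimal follow-up sketch using the same variable:

best_disease, best_score = classification[0]
print("most likely disease: %s (score: %s)" % (best_disease, best_score))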
Example No. 30
def get_model(trump, cruz, kasich, clinton, sanders):
    trainer = Trainer(tokenizer)

    base_dir = "/Users/Helicopter/Desktop/Logan_s/Courses/Stat_AI_ML/election/election_twitters/"
    candidates = {
        'trump': trump,
        'cruz': cruz,
        'kasich': kasich,
        'clinton': clinton,
        'sanders': sanders,
    }

    twiSet = []
    for category, addresses in candidates.items():
        for address in addresses:
            with open(base_dir + category + "/" + address, "r") as text_file:
                content = ""
                for line in text_file:
                    for word in line.split():
                        content = content + word + " "
            twiSet.append({'text': content, 'category': category})

    for twi in twiSet:
        trainer.train(twi['text'], twi['category'])

    newclassifier = Classifier(trainer.data, tokenizer)

    return newclassifier