Example #1
# Assumes elsewhere in the module: import random, pickle; import numpy as np;
# from sklearn import metrics; plus the project helpers getWebpageText()
# and NaiveBayesClassifier (see the sketch after this example).
def train_SaveClassifierRandom(posURLs, negURLs, classifierFileName):
    # Fetch the pages and flatten each into a single "title text" string
    posDocs = getWebpageText(posURLs)
    posDocs = [d['title'] + " " + d['text'] for d in posDocs if d]

    negDocs = getWebpageText(negURLs)
    negDocs = [d['title'] + " " + d['text'] for d in negDocs if d]

    posLen = len(posDocs)
    print(posLen)
    negLen = len(negDocs)
    print(negLen)
    posLabels = [1] * posLen
    negLabels = [0] * negLen

    dataSetDocs = posDocs + negDocs
    dataSetLabels = posLabels + negLabels

    # Shuffle documents and labels together, then split 70/30 into train/test
    dataDocLabels = list(zip(dataSetDocs, dataSetLabels))
    random.shuffle(dataDocLabels)

    sep = int(0.7 * len(dataDocLabels))
    trainingDocLabels = dataDocLabels[:sep]
    testDocLabels = dataDocLabels[sep:]

    trainingDocs = [d for d, _ in trainingDocLabels]
    trainingLabels = [l for _, l in trainingDocLabels]

    testDocs = [d for d, _ in testDocLabels]
    test_labels = [l for _, l in testDocLabels]

    classifier = NaiveBayesClassifier()

    trainingLabelsArr = np.array(trainingLabels)
    classifier.trainClassifier(trainingDocs, trainingLabelsArr)

    # Accuracy and per-class metrics on the training split ...
    print(classifier.score(trainingDocs, trainingLabelsArr))
    print(metrics.classification_report(trainingLabelsArr, classifier.predicted))

    # ... and on the held-out test split
    test_labelsArr = np.array(test_labels)
    print(classifier.score(testDocs, test_labelsArr))
    print(metrics.classification_report(test_labelsArr, classifier.predicted))

    # Persist the trained classifier for later reuse
    with open(classifierFileName, "wb") as classifierFile:
        pickle.dump(classifier, classifierFile)
    return classifier
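
The examples call into a project-specific NaiveBayesClassifier whose definition is not shown. A minimal sketch of what such a wrapper might look like, assuming a scikit-learn bag-of-words pipeline and the trainClassifier/score/predicted interface used here (the vectorizer choice and stop-word setting are illustrative assumptions, not the original code):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

class NaiveBayesClassifier:
    """Hypothetical wrapper matching the interface the examples assume."""
    def __init__(self):
        self.pipeline = Pipeline([
            ("tfidf", TfidfVectorizer(stop_words="english")),  # illustrative choice
            ("nb", MultinomialNB()),
        ])
        self.predicted = None  # the examples read this right after score()

    def trainClassifier(self, docs, labels):
        self.pipeline.fit(docs, labels)

    def score(self, docs, labels):
        # Cache predictions so classification_report can reuse them
        self.predicted = self.pipeline.predict(docs)
        return float(np.mean(self.predicted == labels))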
Example #2
def train_SaveClassifierFolder(posURLs, negURLs, classifierFileName):
    # Same assumed imports and helpers as Example #1. Here negURLs is a list
    # of URL groups ("folders"); each group is split 70/30 separately so that
    # both splits cover every group.
    posDocs = getWebpageText(posURLs)
    posDocs = [d['title'] + " " + d['text'] for d in posDocs if d]

    negDocsList = []
    for n in negURLs:
        negDocsList.append(getWebpageText(n))

    # Per-group 70/30 split for the negatives
    negTraining = []
    negTesting = []
    for nu in negDocsList:
        ns = int(len(nu) * 0.7)
        negTraining.extend(nu[:ns])
        negTesting.extend(nu[ns:])

    negTraining = [d['title'] + " " + d['text'] for d in negTraining if d]
    negTesting = [d['title'] + " " + d['text'] for d in negTesting if d]

    # Single 70/30 split for the positives
    posLen = len(posDocs)
    posSep = int(0.7 * posLen)
    posTraining = posDocs[:posSep]
    posTest = posDocs[posSep:]

    trainingDocs = posTraining + negTraining
    trainingLabels = [1] * len(posTraining) + [0] * len(negTraining)

    testingDocs = posTest + negTesting
    testingLabels = [1] * len(posTest) + [0] * len(negTesting)

    classifier = NaiveBayesClassifier()
    #classifier = SVMClassifier()

    trainingLabelsArr = np.array(trainingLabels)
    classifier.trainClassifier(trainingDocs, trainingLabelsArr)

    # Metrics on the training split, then on the held-out test split
    print(classifier.score(trainingDocs, trainingLabelsArr))
    print(metrics.classification_report(trainingLabelsArr, classifier.predicted))

    test_labelsArr = np.array(testingLabels)
    print(classifier.score(testingDocs, test_labelsArr))
    print(metrics.classification_report(test_labelsArr, classifier.predicted))

    # Persist the trained classifier for later reuse
    with open(classifierFileName, "wb") as classifierFile:
        pickle.dump(classifier, classifierFile)
    return classifier
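
Unlike Example #1, this variant expects negURLs to be a list of URL groups rather than a flat list. A hypothetical invocation (all URLs and the file name are placeholders):

posURLs = ["http://example.com/quake-report-1", "http://example.com/quake-report-2"]
negURLs = [
    ["http://example.com/sports-1", "http://example.com/sports-2"],    # group 1
    ["http://example.com/finance-1", "http://example.com/finance-2"],  # group 2
]
classifier = train_SaveClassifierFolder(posURLs, negURLs, "nb_classifier.pkl")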
Example #3
def train_SaveClassifier(posURLs, negURLs, classifierFileName):
    # Same assumed imports and helpers as Example #1. This variant trains on
    # body text only (no titles) via getWebpageText_NoURLs.
    #posDocs = getWebpageText(posURLs)
    posDocs = getWebpageText_NoURLs(posURLs)
    posDocs = [d['text'] for d in posDocs if d]

    #negDocs = getWebpageText(negURLs)
    negDocs = getWebpageText_NoURLs(negURLs)
    negDocs = [d['text'] for d in negDocs if d]

    # 70/30 split for each class
    posLen = len(posDocs)
    posSep = int(0.7 * posLen)
    posTraining = posDocs[:posSep]
    posTest = posDocs[posSep:]

    negLen = len(negDocs)
    negSep = int(0.7 * negLen)
    negTraining = negDocs[:negSep]
    negTest = negDocs[negSep:]

    trainingDocs = posTraining + negTraining
    trainingLabels = [1] * len(posTraining) + [0] * len(negTraining)

    testingDocs = posTest + negTest
    testingLabels = [1] * len(posTest) + [0] * len(negTest)

    classifier = NaiveBayesClassifier()
    #classifier = SVMClassifier()

    trainingLabelsArr = np.array(trainingLabels)
    classifier.trainClassifier(trainingDocs, trainingLabelsArr)

    # Metrics on the training split, then on the held-out test split
    print(classifier.score(trainingDocs, trainingLabelsArr))
    print(metrics.classification_report(trainingLabelsArr, classifier.predicted))

    test_labelsArr = np.array(testingLabels)
    print(classifier.score(testingDocs, test_labelsArr))
    print(metrics.classification_report(test_labelsArr, classifier.predicted))

    #print(classifier.classifier.feature_log_prob_)
    #print(classifier.classifier.coef_)

    # Persist the trained classifier for later reuse
    with open(classifierFileName, "wb") as classifierFile:
        pickle.dump(classifier, classifierFile)
    return classifier
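
Once any of these functions has saved a classifier, it can be restored and applied to new pages without retraining. A minimal sketch, assuming the NaiveBayesClassifier class and fetch helpers are importable where the pickle is loaded (newURLs, newLabels, and the file name are placeholders):

import pickle
import numpy as np

# Restore whatever was saved under classifierFileName
with open("nb_classifier.pkl", "rb") as f:
    classifier = pickle.load(f)

# Evaluate it on freshly fetched, labeled pages using the same interface
newDocs = [d['text'] for d in getWebpageText_NoURLs(newURLs) if d]
print(classifier.score(newDocs, np.array(newLabels)))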
Example #4
def main():

    # Seed URL sets for crawling; unused alternatives kept for reference
    #seedUrls = ["http://www.huffingtonpost.com/news/arab-spring/","http://www.opendemocracy.net/david-hayes/arab-spring-protest-power-prospect","http://www.washingtonpost.com/wp-srv/special/world/middle-east-protests/"]
    #seedUrls = ["http://www.ndtv.com/article/india/big-earthquake-in-sikkim-tremors-across-india-54-dead-over-100-injured-134537",
    #            "http://articles.timesofindia.indiatimes.com/2011-09-21/india/30184028_1_construction-site-teesta-urja-gangtok",
    #            "http://www.ndtv.com/article/india/quake-aftermath-many-villages-in-sikkim-still-cut-off-thousands-waiting-for-help-135132",
    #            "http://www.ndtv.com/article/india/12-dead-40-missing-at-sikkim-plant-hit-by-quake-135215"
    #            ]
    seedUrls = [
        "http://www.ndtv.com/topic/sikkim-earthquake",
        "http://zeenews.india.com/tags/Sikkim_earthquake.html",
        "http://earthquake-report.com/2011/09/18/very-strong-earthquake-in-sikkim-india/",
        "http://articles.timesofindia.indiatimes.com/2011-09-21/india/30184028_1_construction-site-teesta-urja-gangtok"
    ]
    #seedUrls = ["http://www.aljazeera.com/indepth/spotlight/anger-in-egypt/",
    #            "http://live.reuters.com/Event/Unrest_in_Egypt?Page=0",
    #            "http://www.guardian.co.uk/world/series/egypt-protests",
    #            "http://www.huffingtonpost.com/2012/06/24/egypt-uprising-election-timeline_n_1622773.html",
    #            "http://www.washingtonpost.com/wp-srv/world/special/egypt-transition-timeline/index.html",
    #            "http://botw.org/top/Regional/Africa/Egypt/Society_and_Culture/Politics/Protests_2011/"
    #            ]

    #topicKeywords = ['demonstrations','protest','elections','egypt','revolution','uprising','arab','spring','tunisia','libya','military']
    #topicKeywords = getTopicKeywords("manual-sikkim-earthquake-wikipedia.txt")

    # Load raw documents and their precomputed labels from disk
    urls_tokens = []
    title_tokens = []
    docs = getrawDocs("html_files2-balanced.txt", urls_tokens, title_tokens)
    #writeToFile(docs,"rawData.txt")
    print("raw docs extracted")
    docs_len = len(docs)
    #docs_tokens = getTokenizedDocs(docs)
    #labels = getLabels(docs_tokens, topicKeywords)
    #writeToFile(labels,"labels.txt")
    labels = getLabelsFromFile("labels2-balanced.txt")
    print(sum(labels))  # number of positive documents

    # 90/10 train/test split (documents are taken in file order)
    sep = int(docs_len * 0.9)
    trainingDocs = docs[:sep]
    trainingLabels = labels[:sep]
    testDocs = docs[sep:]
    test_labels = labels[sep:]

    classifier = NaiveBayesClassifier()
    #classifier = SVMClassifier()

    trainingLabelsArr = np.array(trainingLabels)
    classifier.trainClassifier(trainingDocs, trainingLabelsArr)
    #print(classifier.classifier.coef_)
    #print(classifier.ch2.get_support())

    # Evaluate on the held-out 10%
    test_labelsArr = np.array(test_labels)
    print(classifier.score(testDocs, test_labelsArr))
    #print(sum(classifier.predicted))
    print(metrics.classification_report(test_labelsArr, classifier.predicted))
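
The manual 90/10 slice above keeps whatever document order the input file has, so class balance in the test split is not guaranteed. A sketch of the same split done with scikit-learn's train_test_split, as an alternative to the original's slicing, which shuffles and can stratify on the labels:

from sklearn.model_selection import train_test_split

trainingDocs, testDocs, trainingLabels, test_labels = train_test_split(
    docs, labels, test_size=0.1, stratify=labels, random_state=42
)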