Example #1
0
class AINewsCorpus:
    """
    A corpus is a set of news articles (each with a title, content,
    and categories) that are used for training and comparison
    purposes. For training, the corpus provides the training
    examples. For comparison, the corpus provides the data for various
    measures like word frequency. This is important in the prediction
    process: we only want to predict a new article's categories based
    on word frequencies, and other measures, from the corpus; we don't
    want articles that have not been "vetted" (articles not part of
    the corpus) to contribute to these measures.

    A corpus can be "loaded" via C{load_corpus()} or "restored" via
    C{restore_corpus()}. The difference is the following: when loading a
    corpus, word frequencies are measured and stored in the database
    table C{wordlist_eval}; when restoring a corpus, word frequencies
    are simply retrieved from the database table C{wordlist}. In other
    words, we load a corpus when we are training or evaluating our
    training procedures, and we restore a corpus when we are
    predicting.
    """
    def __init__(self):
        self.txtpro = AINewsTextProcessor()
        self.cache_urls = {}

        #: A dictionary of word=>word freq in corpus
        self.dftext = {}

        #: A dictionary of word=>wordid
        self.idwords = {}

        #: A dictionary of wordid=>word
        self.wordids = {}

        self.db = AINewsDB()

        self.categories = ["AIOverview","Agents", "Applications", \
                 "CognitiveScience", "Education", "Ethics", "Games", "History", \
                 "Interfaces", "MachineLearning", "NaturalLanguage", "Philosophy", \
                 "Reasoning", "Representation", "Robots", "ScienceFiction", \
                 "Speech", "Systems", "Vision"]

        self.retained_db_docs = None

        self.restore_corpus()

    def compare_articles(self, article1, article2):
        dupcount1 = len(article1['duplicates'])
        dupcount2 = len(article2['duplicates'])
        relevance1 = article1['source_relevance']
        relevance2 = article2['source_relevance']
        cat_count1 = len(article1['categories'])
        cat_count2 = len(article2['categories'])
        if cmp(dupcount1, dupcount2) == 0:
            if cmp(relevance1, relevance2) == 0:
                return cmp(cat_count1, cat_count2)
            else:
                return cmp(relevance1, relevance2)
        else:
            return cmp(dupcount1, dupcount2)

    def get_tfidf(self, urlid, wordfreq):
        """
        Helper function to compute (and cache, keyed by urlid) the normalized
        tf-idf vector for a news story's word frequencies.
        @param  urlid: target news story's urlid.
        @type  urlid: C{int}
        @param  wordfreq: the story's word frequency distribution.
        @type  wordfreq: C{dict}
        """
        if urlid in self.cache_urls:
            return self.cache_urls[urlid]
        wordid_freq_pairs = {}
        for word in wordfreq:
            if word in self.dftext:
                wordid_freq_pairs[self.idwords[word]] = (wordfreq[word],
                                                         self.dftext[word])

        data = {}
        distsq = 0.0
        for wordid in wordid_freq_pairs:
            tfidf = math.log(wordid_freq_pairs[wordid][0] + 1, 2) * \
                    (math.log(self.corpus_count + 1, 2) - \
                    math.log(wordid_freq_pairs[wordid][1] + 1, 2))
            data[wordid] = tfidf
            distsq += tfidf * tfidf
        dist = math.sqrt(distsq)
        if dist > 1.0e-9:
            for key in data:
                data[key] /= dist
        self.cache_urls[urlid] = data
        return data

    def cos_sim(self, tfidf1, tfidf2):
        """
        A helper function to compute the cosine similarity between a
        news story and a centroid.
        @param  tfidf1: target news story tfidf vector.
        @type  tfidf1: C{dict}
        @param tfidf2: centroid tfidf vector.
        @type  tfidf2: C{dict}
        """
        sim = 0.0
        for key in tfidf1:
            if key in tfidf2:
                word = self.wordids[key]
                a = tfidf1[key]
                b = tfidf2[key]
                sim += a * b
        return sim

    def get_article(self, urlid, corpus=False):
        row = None
        if corpus:
            table = 'cat_corpus'
            cat_table = 'cat_corpus_cats'
            row = self.db.selectone("""select u.url, u.title, u.content
                from %s as u where u.urlid = %s""" % (table, urlid))

        else:
            table = 'urllist'
            cat_table = 'categories'
            row = self.db.selectone("""select u.url, u.title, u.content, u.summary, 
                u.pubdate, u.crawldate, u.processed, u.published, u.source,
                u.source_relevance, u.source_id, u.tfpn, u.image_url
                from %s as u where u.urlid = %s""" % \
                                        (table, urlid))
        if row != None and row[2] is not None:
            content = row[2]
            wordfreq = self.txtpro.simpletextprocess(urlid, content)
            summary = ""
            if not corpus: summary = row[3]
            processed = False
            if not corpus and row[6] == 1: processed = True
            published = False
            if not corpus and row[7] == 1: published = True
            pubdate = ""
            if not corpus: pubdate = row[4]
            crawldate = ""
            if not corpus: crawldate = row[5]
            source = ""
            if not corpus: source = row[8]
            tfpn = "xx"
            if not corpus: tfpn = row[11]
            source_relevance = 0
            if row[9]: source_relevance = int(row[9])
            categories = []
            cat_rows = self.db.selectall("""select category from %s
                where urlid = %s""" % (cat_table, urlid))
            for cat_row in cat_rows:
                categories.append(cat_row[0])
            return {
                'urlid': urlid,
                'url': row[0],
                'title': row[1],
                'content': content,
                'summary': summary,
                'pubdate': pubdate,
                'crawldate': crawldate,
                'processed': processed,
                'published': published,
                'source': source,
                'source_relevance': source_relevance,
                'source_id': row[10],
                'categories': categories,
                'duplicates': [],
                'tfpn': tfpn,
                'wordfreq': wordfreq,
                'image_url': row[12],
                'tfidf': self.get_tfidf(urlid, wordfreq)
            }
        else:
            return None

    def get_articles_daterange(self, date_start, date_end):
        articles = {}
        rows = self.db.selectall(
            """select urlid from urllist
            where pubdate >= %s and pubdate <= %s""", (date_start, date_end))
        for row in rows:
            articles[row[0]] = self.get_article(row[0])
        return articles

    def get_articles_idrange(self, urlid_start, urlid_end, corpus=False):
        articles = {}
        rows = self.db.selectall(
            """select urlid from urllist
            where urlid >= %s and urlid <= %s""", (urlid_start, urlid_end))
        for row in rows:
            art = self.get_article(row[0], corpus)
            if art is not None:
                articles[row[0]] = art
        return articles

    def get_unprocessed(self):
        articles = {}
        rows = self.db.selectall(
            "select urlid from urllist where processed = 0")
        for row in rows:
            articles[row[0]] = self.get_article(row[0])
        return articles

    def get_publishable(self):
        articles = []
        rows = self.db.selectall(
            "select urlid from urllist where "
            "publishable = 1 and published = 0 and pubdate != '0000-00-00'")
        for row in rows:
            articles.append(self.get_article(row[0]))
        return articles

    def get_published(self):
        articles = []
        rows = self.db.selectall(
            "select urlid from urllist where published = 1")
        for row in rows:
            articles.append(self.get_article(row[0]))
        return articles

    def mark_processed(self, articles):
        for article in articles:
            self.db.execute(
                "update urllist set processed = 1 where urlid = %s",
                article['urlid'])

    def mark_publishable(self, articles):
        for article in articles:
            self.db.execute(
                "update urllist set publishable = 1 where urlid = %s",
                article['urlid'])

    def mark_published(self, articles):
        for article in articles:
            self.db.execute(
                "update urllist set published = 1 where urlid = %s",
                article['urlid'])

    def restore_corpus(self):
        self.wordids = {}
        self.dftext = {}
        rows = self.db.selectall("select rowid, word, dftext from wordlist")
        for row in rows:
            self.wordids[row[0]] = row[1]
            self.idwords[row[1]] = row[0]
            self.dftext[row[1]] = row[2]
        self.corpus_count = self.db.selectone(
            "select count(*) from cat_corpus")[0]

    def add_freq_index(self, urlid, wordfreq, categories=[]):
        for word in wordfreq:
            self.wordcounts.setdefault(word, 0)
            self.wordcounts[word] += 1

    def commit_freq_index(self, table):
        self.dftext = {}
        self.wordids = {}
        for word in self.wordcounts:
            rowid = self.db.execute("insert into "+table+" (word, dftext) " + \
                "values(%s, %s)", (word, self.wordcounts[word]))
            self.wordids[rowid] = word
            self.idwords[word] = rowid
            self.dftext[word] = self.wordcounts[word]
        self.wordcounts = {}

    def load_corpus(self, ident, pct, debug=False, retain=False):
        if debug:
            print "Loading corpus..."
        source = ident.split(':')[0]
        name = ident.split(':')[1:]
        if source == "file":
            docs = self.load_file_corpus(name, debug)
        elif source == "db":
            docs = self.load_db_corpus(name, debug, retain)
        if debug: print

        random.shuffle(docs)
        offset = int(len(docs) * pct)
        if debug:
            print "Selecting random %d%% of corpus (%d docs)." % \
                    (pct * 100, offset)

        # sort train_corpus by urlid
        train_corpus = sorted(docs[0:offset], key=operator.itemgetter(0))
        self.corpus_count = len(train_corpus)

        # sort predict_corpus by urlid
        predict_corpus = sorted(docs[offset:offset+int(len(docs)*0.1)], \
                key=operator.itemgetter(0))

        self.db.execute("delete from wordlist_eval")
        self.db.execute("alter table wordlist_eval auto_increment = 0")
        self.wordids = {}
        self.wordcounts = {}
        self.cache_urls = {}
        for c in train_corpus:
            self.add_freq_index(c[0], c[1], c[2].split())
            if debug:
                sys.stdout.write('.')
                sys.stdout.flush()
        self.commit_freq_index('wordlist_eval')

        return (train_corpus, predict_corpus)

    def load_file_corpus(self, name, debug=False):
        wordsfile = paths['corpus.corpus_other'] + name[0] + ".mat.clabel"
        f = open(wordsfile, 'r')
        self.wordids = {}
        wordid = 1
        for line in f:
            self.wordids[int(wordid)] = line.strip()
            wordid += 1

        catsfile = paths['corpus.corpus_other'] + name[0] + ".mat.rlabel"
        f = open(catsfile, 'r')
        cats = {}
        uniqcats = set()
        docid = 0
        for line in f:
            cats[docid] = line.strip()
            uniqcats.add(line.strip())
            docid += 1
        self.categories = list(uniqcats)

        matfile = paths['corpus.corpus_other'] + name[0] + ".mat"
        f = open(matfile, 'r')
        f.readline()  # ignore first line
        docs = []
        docid = 0
        for line in f:
            wordfreq = {}
            for (wordid, freq) in izip(*[iter(line.split())] * 2):
                wordfreq[self.wordids[int(wordid)]] = int(float(freq))
            docs.append((docid, wordfreq, cats[docid]))
            docid += 1
            if debug:
                sys.stdout.write('.')
                sys.stdout.flush()
        return docs

    def load_db_corpus(self, name, debug=False, retain=False):
        rows = self.db.selectall("""select c.urlid, c.content,
            group_concat(cc.category separator ' ')
            from %s as c, %s as cc
            where c.urlid = cc.urlid
            group by c.urlid order by c.urlid desc""" % (name[0], name[1]))
        if debug: print "Processing %d articles..." % len(rows)
        if retain and self.retained_db_docs != None:
            return self.retained_db_docs
        docs = []
        for row in rows:
            wordfreq = self.txtpro.simpletextprocess(row[0], row[1])
            if wordfreq.N() > 0 and 'NotRelated' not in row[2].split(' '):
                docs.append((row[0], wordfreq, row[2]))
            if debug:
                sys.stdout.write('.')
                sys.stdout.flush()
        if retain:
            self.retained_db_docs = docs
        return docs
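
The docstring above distinguishes loading (training/evaluation, which rebuilds the wordlist_eval table) from restoring (prediction, which reads the wordlist table). A minimal usage sketch of the two paths follows; it is not part of the original module, and the urlids and the "db:cat_corpus:cat_corpus_cats" identifier are placeholders chosen to match the name[0]/name[1] convention of load_db_corpus().

# Hypothetical usage sketch (not in the original source).

# Training / evaluation path: shuffle the vetted corpus, keep pct of it as
# training docs, and rebuild word frequencies in the wordlist_eval table.
corpus = AINewsCorpus()
train_corpus, predict_corpus = corpus.load_corpus(
    "db:cat_corpus:cat_corpus_cats", pct=0.9, debug=True)

# Prediction path: restore_corpus() already ran in __init__, so word
# frequencies come from the wordlist table; tf-idf vectors of new articles
# can then be compared against each other or against category centroids.
art1 = corpus.get_article(12345)   # placeholder urlids
art2 = corpus.get_article(12346)
if art1 is not None and art2 is not None:
    print corpus.cos_sim(art1['tfidf'], art2['tfidf'])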
Example #2
0
class AINewsWekaClassifier:
    def __init__(self):
        self.txtpro = AINewsTextProcessor()

    def __save_bag_of_words(self, tid, fieldidx):
        # find all unique words in the arff 'title' field, remove stop
        # words, perform stemming, collect their frequencies
        phrases = []
        f = arff.load(open("%s%d.arff" % (paths['weka.training_arff_dir'], tid), 'r'))
        for record in f['data']:
            phrases.append(record[fieldidx])
        bag = self.txtpro.simpletextprocess(0, ' '.join(phrases))
        smallerbag = FreqDist()
        i = 0
        for word in bag:
            if i == 1000:
                break
            smallerbag[word] = bag[word]
            i += 1
        p = open("%sbag_of_words-%d.pickle" % (paths['weka.bag_of_words_dir'], fieldidx), 'w')
        pickle.dump(smallerbag, p)
        p.close()

    def __prepare_arff(self, tid):
        p = open("%sbag_of_words-0.pickle" % paths['weka.bag_of_words_dir'], 'r')
        bag_title = pickle.load(p)
        p.close()
        p = open("%sbag_of_words-1.pickle" % paths['weka.bag_of_words_dir'], 'r')
        bag_body = pickle.load(p)
        p.close()

        data = {'attributes': [], 'data': [], 'description': u'', 'relation': tid}
        for word in bag_title:
            data['attributes'].append(("title-%s" % word, 'NUMERIC'))
        for word in bag_body:
            data['attributes'].append(("body-%s" % word, 'NUMERIC'))
        data['attributes'].append(('class', ['yes', 'no']))

        f = arff.load(open("%s%d.arff" % (paths['weka.training_arff_dir'], tid), 'r'))
        for record in f['data']:
            record_bag_title = self.txtpro.simpletextprocess(0, record[0])
            record_bag_body = self.txtpro.simpletextprocess(0, record[1])
            record_data = []
            # iterate through original bag, figure out freq in this record's bag
            for word in bag_title:
                if word in record_bag_title:
                    record_data.append(record_bag_title[word])
                else:
                    record_data.append(0)
            for word in bag_body:
                if word in record_bag_body:
                    record_data.append(record_bag_body[word])
                else:
                    record_data.append(0)
            record_data.append(record[2])
            data['data'].append(record_data)

        fnew = open("%s%d-wordvec-nonsparse.arff" % \
                        (paths['weka.training_arff_dir'], tid), 'w')
        arff.dump(fnew, data)
        fnew.close()

        # convert to sparse format
        Popen(("java -cp %s weka.filters.unsupervised.instance.NonSparseToSparse " +
               "-i %s%d-wordvec-nonsparse.arff -o %s%d-wordvec.arff") % \
                  (paths['weka.weka_jar'],
                   paths['weka.training_arff_dir'], tid,
                   paths['weka.training_arff_dir'], tid),
              shell = True).communicate()

        remove("%s%d-wordvec-nonsparse.arff" % (paths['weka.training_arff_dir'], tid))
        
    # 1. load unprocessed arff files, from just one tid, from family_resemblance export
    # 2. gather all titles, parse into a bag of words
    # 3. save bag of words (list? need to keep the order) in a pickle file
    # 4. write new sparse arff files for each tid using this sorted bag of words
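    # Summary added for clarity (not in the original source): steps 1-3 above
    # correspond to __save_bag_of_words(tid, fieldidx), which pickles a
    # 1000-word FreqDist for the title (fieldidx 0) and body (fieldidx 1)
    # fields of one tid's arff file; step 4 corresponds to __prepare_arff(tid),
    # which rewrites each tid's arff as a numeric word-vector file and shells
    # out to Weka's NonSparseToSparse filter to produce the sparse version.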

    def __get_tids(self):
        tids = []
        files = listdir(paths['weka.training_arff_dir'])
        for f in files:
            m = re.match(r'^(\d+)\.arff$', f)
            if m:
                if m.group(1) == '0': continue
                tids.append(int(m.group(1)))
        return tids

    def train(self):
        tids = self.__get_tids()
        
        # all tid arffs have same entries, so use the first to grab the bag of words
        print "Saving bag of words..."
        self.__save_bag_of_words(tids[0], 0)
        self.__save_bag_of_words(tids[0], 1)

        for tid in sorted(tids):
            print "Preparing tid %d" % tid
            self.__prepare_arff(tid)

        for tid in sorted(tids):
            print "Spread subsampling for tid %d" % tid
            Popen(("java -cp %s weka.filters.supervised.instance.SpreadSubsample " +
                   "-M 1.0 -X 0.0 -S 1 -c last " +
                   "-i %s%d-wordvec.arff -o %s%d-wordvec-subsample.arff") % \
                      (paths['weka.weka_jar'],
                       paths['weka.training_arff_dir'], tid,
                       paths['weka.training_arff_dir'], tid),
                  shell = True).communicate()

            print "Training random forests for tid %d" % tid
            Popen(("java -cp %s %s %s -v " +
                   "-t %s%d-wordvec-subsample.arff -d %s%d.model") % \
                      (paths['weka.weka_jar'],
                       config['weka.classifier'],
                       config['weka.classifier_params'],
                       paths['weka.training_arff_dir'], tid,
                       paths['weka.training_arff_dir'], tid),
                  shell = True, stdout = PIPE).communicate()
            print out

    def train_experiment(self):
        model_scores = {}
        models = {'random-forest': ('weka.classifiers.trees.RandomForest', '-I 20 -K 0'),
                  'naive-bayes': ('weka.classifiers.bayes.NaiveBayes', ''),
                  'bayesnet': ('weka.classifiers.bayes.BayesNet', ''),
                  'j48': ('weka.classifiers.trees.J48', ''),
                  'knn': ('weka.classifiers.lazy.IBk', '-K 3')}

        tids = self.__get_tids()
        
        # all tid arffs have same entries, so use the first to grab the bag of words
        print "Saving bag of words..."
        self.__save_bag_of_words(tids[0], 0)
        self.__save_bag_of_words(tids[0], 1)

        for tid in sorted(tids):
            print "Preparing tid %d" % tid
            self.__prepare_arff(tid)

        for tid in sorted(tids):
            print "Spread subsampling for tid %d" % tid
            Popen(("java -cp %s weka.filters.supervised.instance.SpreadSubsample " +
                   "-M 1.0 -X 0.0 -S 1 -c last " +
                   "-i %s%d-wordvec.arff -o %s%d-wordvec-subsample.arff") % \
                      (paths['weka.weka_jar'],
                       paths['weka.training_arff_dir'], tid,
                       paths['weka.training_arff_dir'], tid),
                  shell = True).communicate()

        for tid in sorted(tids):
            model_scores[tid] = {}
            for model in models.keys():
                print "Training %s for tid %d" % (models[model][0], tid)
                (out, _) = Popen(("java -cp %s %s %s -v " +
                                  "-t %s%d-wordvec-subsample.arff -d %s%d.model") % \
                                     (paths['weka.weka_jar'],
                                      models[model][0], models[model][1],
                                      paths['weka.training_arff_dir'], tid,
                                      paths['weka.training_arff_dir'], tid),
                                 shell = True, stdout = PIPE).communicate()
                
                correct = 0.0
                for line in out.splitlines():
                    m = re.search(r'Correctly Classified Instances\s+\d+\s+(.*) %', line)
                    if m:
                        correct = float(m.group(1))
                        break
                model_scores[tid][model] = correct

        with open('training_experiment.csv', 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['model', 'tid', 'correct'])
            for tid in model_scores.keys():
                for model in model_scores[tid].keys():
                    writer.writerow([model, tid, model_scores[tid][model]])

    def __predict_arff(self):
        tids = self.__get_tids()

        # the testing file should always be 0.arff
        self.__prepare_arff(0)

        predictions = {}
        for tid in sorted(tids):
            predictions[tid] = []

            print "Predicting tid %d" % tid
            (out, err) = Popen(("java -cp %s %s " +
                                "-T %s0-wordvec.arff -l %s%d.model -p last") % \
                                   (paths['weka.weka_jar'],
                                    config['weka.classifier'],
                                    paths['weka.training_arff_dir'],
                                    paths['weka.training_arff_dir'], tid),
                               shell = True, stdout = PIPE).communicate()
            for line in out.splitlines():
                m = re.search(r'2:no\s+[12]:(no|yes)\s+\+?\s+(\d+\.?\d*)', line)
                if m:
                    answer = False
                    if m.group(1) == 'yes':
                        answer = True
                    conf = float(m.group(2))
                    if conf < 0.75:
                        answer = False
                    predictions[tid].append((answer, conf))
        return predictions

    def predict(self, articles):
        # modifies the provided articles dict

        data = {'attributes': [('title', 'STRING'),
                               ('body', 'STRING'),
                               ('class', ['yes', 'no'])],
                'data': [], 'description': u'', 'relation': '0'}

        for urlid in sorted(articles.keys()):
            title = re.sub(r'\W', ' ', articles[urlid]['title'])
            body = re.sub(r'\W', ' ', articles[urlid]['summary'])
            data['data'].append([title, body, 'no'])

        # make the testing file 0.arff
        fnew = open("%s0.arff" % paths['weka.training_arff_dir'], 'w')
        arff.dump(fnew, data)
        fnew.close()

        predictions = self.__predict_arff()

        for urlid in sorted(articles.keys()):
            articles[urlid]['categories'] = []

        tids = self.__get_tids()
        for tid in sorted(tids):
            for (i, urlid) in enumerate(sorted(articles.keys())):
                if predictions[tid][i][0]:
                    articles[urlid]['categories'].append(str(tid))
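
For context, a hedged driver sketch (not in the original source) showing how the classifier above is plausibly wired to the corpus class from Example #1: train() builds one Weka model per topic id (tid), and predict() fills in each article's 'categories' list in place. It assumes the paths and config entries point at a working Weka installation and that per-tid training arff files already exist.

# Hypothetical driver sketch; the corpus/classifier pairing and the article
# source (get_unprocessed) are assumptions, not part of the original code.
classifier = AINewsWekaClassifier()
classifier.train()                      # bag-of-words pickles, word vectors, one model per tid

corpus = AINewsCorpus()
articles = corpus.get_unprocessed()     # dict of urlid => article dict
classifier.predict(articles)            # appends str(tid) to each article's 'categories'
for urlid in sorted(articles.keys()):
    print urlid, articles[urlid]['categories']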
Example #3
0
class AINewsCorpus:
    """
    A corpus is a set of news articles (each with a title, content,
    and categories) that are used for training and comparison
    purposes. For training, the corpus provides the training
    examples. For comparison, the corpus provides the data for various
    measures like word frequency. This is important in the prediction
    process: we only want to predict a new article's categories based
    on word frequencies, and other measures, from the corpus; we don't
    want articles that have not been "vetted" (articles not part of
    the corpus) to contribute to these measures.

    A corpus can be "loaded" via C{load_corpus()} or "restored" via
    C{restore_corpus()}. The difference is the following: when loading a
    corpus, word frequencies are measured and stored in the database
    table C{wordlist_eval}; when restoring a corpus, word frequencies
    are simply retrieved from the database table C{wordlist}. In other
    words, we load a corpus when we are training or evaluating our
    training procedures, and we restore a corpus when we are
    predicting.
    """
    def __init__(self):
        self.txtpro = AINewsTextProcessor()
        self.cache_urls = {}

        #: A dictionary of word=>word freq in corpus
        self.dftext = {}

        #: A dictionary of word=>wordid
        self.idwords = {}

        #: A dictionary of wordid=>word
        self.wordids = {}

        self.db = AINewsDB()

        self.categories = ["AIOverview","Agents", "Applications", \
                 "CognitiveScience", "Education", "Ethics", "Games", "History", \
                 "Interfaces", "MachineLearning", "NaturalLanguage", "Philosophy", \
                 "Reasoning", "Representation", "Robots", "ScienceFiction", \
                 "Speech", "Systems", "Vision"]

        self.sources = {}
        rows = self.db.selectall("select parser, relevance from sources")
        for row in rows:
            self.sources[row[0].split('::')[0]] = int(row[1])

        self.retained_db_docs = None
        
        self.restore_corpus()

    def get_relevance(self, publisher):
        if re.search(r'via Google News', publisher):
            publisher = 'GoogleNews'
        return self.sources[publisher]

    def compare_articles(self, article1, article2):
        dupcount1 = len(article1['duplicates'])
        dupcount2 = len(article2['duplicates'])
        if article1['publisher'].find('User submitted') != -1:
            relevance1 = 200
        else:
            relevance1 = self.get_relevance(article1['publisher'])
        if article2['publisher'].find('User submitted') != -1:
            relevance2 = 200
        else:
            relevance2 = self.get_relevance(article2['publisher'])
        cat_count1 = len(article1['categories'])
        cat_count2 = len(article2['categories'])
        if cmp(dupcount1, dupcount2) == 0:
            if cmp(relevance1, relevance2) == 0:
                return cmp(cat_count1, cat_count2)
            else:
                return cmp(relevance1, relevance2)
        else:
            return cmp(dupcount1, dupcount2)

    def get_tfidf(self, urlid, wordfreq):
        """
        Helper function to compute (and cache, keyed by urlid) the normalized
        tf-idf vector for a news story's word frequencies.
        @param  urlid: target news story's urlid.
        @type  urlid: C{int}
        @param  wordfreq: the story's word frequency distribution.
        @type  wordfreq: C{dict}
        """
        if urlid in self.cache_urls:
            return self.cache_urls[urlid]
        wordid_freq_pairs = {}
        for word in wordfreq:
            if word in self.dftext:
                wordid_freq_pairs[self.idwords[word]] = (wordfreq[word], self.dftext[word])

        data = {}
        distsq = 0.0
        for wordid in wordid_freq_pairs:
            tfidf = math.log(wordid_freq_pairs[wordid][0] + 1, 2) * \
                    (math.log(self.corpus_count + 1, 2) - \
                    math.log(wordid_freq_pairs[wordid][1] + 1, 2))
            data[wordid] = tfidf
            distsq += tfidf * tfidf
        dist = math.sqrt(distsq)
        if dist > 1.0e-9:
            for key in data:
                data[key] /= dist
        self.cache_urls[urlid] = data
        return data

    def cos_sim(self, tfidf1, tfidf2):
        """
        A helper function to compute the cosine similarity between a
        news story and a centroid.
        @param  tfidf1: target news story tfidf vector.
        @type  tfidf1: C{dict}
        @param tfidf2: centroid tfidf vector.
        @type  tfidf2: C{dict}
        """
        sim = 0.0
        for key in tfidf1:
            if key in tfidf2:
                word = self.wordids[key]
                a = tfidf1[key]
                b = tfidf2[key]
                sim += a*b
        return sim

    def get_article(self, urlid, corpus = False):
        row = None
        if corpus:
            table = 'cat_corpus'
            cat_table = 'cat_corpus_cats'
            row = self.db.selectone("""select u.url, u.title, u.content
                from %s as u where u.urlid = %s""" % (table, urlid))

        else:
            table = 'urllist'
            cat_table = 'categories'
            row = self.db.selectone("""select u.url, u.title, u.content, u.summary, 
                u.pubdate, u.crawldate, u.processed, u.published, u.publisher
                from %s as u where u.urlid = %s""" % \
                                        (table, urlid))
        if row != None and row[2] is not None:
            wordfreq = self.txtpro.simpletextprocess(urlid, row[2])
            summary = ""
            if not corpus: summary = row[3]
            processed = False
            if not corpus and row[6] == 1: processed = True
            published = False
            if not corpus and row[7] == 1: published = True
            pubdate = ""
            if not corpus: pubdate = row[4]
            crawldate = ""
            if not corpus: crawldate = row[5]
            publisher = ""
            if not corpus: publisher = row[8]
            categories = []
            cat_rows = self.db.selectall("""select category from %s
                where urlid = %s""" % (cat_table, urlid))
            for cat_row in cat_rows:
                categories.append(cat_row[0])
            return {'urlid': urlid, 'url': row[0], 'title': row[1],
                    'content': trunc(row[2], max_pos=3000),
                    'content_all': row[2],
                    'summary': summary,
                    'pubdate': pubdate, 'crawldate': crawldate,
                    'processed': processed, 'published': published,
                    'publisher': publisher,
                    'categories': categories, 'duplicates': [],
                    'wordfreq': wordfreq, 'tfidf': self.get_tfidf(urlid, wordfreq)}
        else:
            return None

    def get_articles_daterange(self, date_start, date_end):
        articles = {}
        rows = self.db.selectall("""select urlid from urllist
            where pubdate >= %s and pubdate <= %s""", (date_start, date_end))
        for row in rows:
            articles[row[0]] = self.get_article(row[0])
        return articles

    def get_articles_idrange(self, urlid_start, urlid_end, corpus = False):
        articles = {}
        rows = self.db.selectall("""select urlid from urllist
            where urlid >= %s and urlid <= %s""", (urlid_start, urlid_end))
        for row in rows:
            art = self.get_article(row[0], corpus)
            if art is not None:
                articles[row[0]] = art
        return articles

    def get_unprocessed(self):
        articles = {}
        rows = self.db.selectall("select urlid from urllist where processed = 0")
        for row in rows:
            articles[row[0]] = self.get_article(row[0])
        return articles

    def get_publishable(self):
        articles = []
        rows = self.db.selectall("select urlid from urllist where "
                        "publishable = 1 and published = 0 and pubdate != '0000-00-00'")
        for row in rows:
            articles.append(self.get_article(row[0]))
        return articles

    def get_published(self):
        articles = []
        rows = self.db.selectall("select urlid from urllist where published = 1")
        for row in rows:
            articles.append(self.get_article(row[0]))
        return articles

    def mark_processed(self, articles):
        for article in articles:
            self.db.execute("update urllist set processed = 1 where urlid = %s",
                    article['urlid'])

    def mark_publishable(self, articles):
        for article in articles:
            self.db.execute("update urllist set publishable = 1 where urlid = %s",
                            article['urlid'])

    def mark_published(self, articles):
        for article in articles:
            self.db.execute("update urllist set published = 1 where urlid = %s",
                            article['urlid'])

    def restore_corpus(self):
        self.wordids = {}
        self.dftext = {}
        rows = self.db.selectall("select rowid, word, dftext from wordlist")
        for row in rows:
            self.wordids[row[0]] = row[1]
            self.idwords[row[1]] = row[0]
            self.dftext[row[1]] = row[2]
        self.corpus_count = self.db.selectone("select count(*) from cat_corpus")[0]

    def add_freq_index(self, urlid, wordfreq, categories = []):
        for word in wordfreq:
            self.wordcounts.setdefault(word, 0)
            self.wordcounts[word] += 1

    def commit_freq_index(self, table):
        self.dftext = {}
        self.wordids = {}
        for word in self.wordcounts:
            rowid = self.db.execute("insert into "+table+" (word, dftext) " + \
                "values(%s, %s)", (word, self.wordcounts[word]))
            self.wordids[rowid] = word
            self.idwords[word] = rowid
            self.dftext[word] = self.wordcounts[word]
        self.wordcounts = {}

    def load_corpus(self, ident, pct, debug = False, retain = False):
        if debug:
            print "Loading corpus..."
        source = ident.split(':')[0]
        name = ident.split(':')[1:]
        if source == "file":
            docs = self.load_file_corpus(name, debug)
        elif source == "db":
            docs = self.load_db_corpus(name, debug, retain)
        if debug: print

        random.shuffle(docs)
        offset = int(len(docs)*pct)
        if debug:
            print "Selecting random %d%% of corpus (%d docs)." % \
                    (pct * 100, offset)

        # sort train_corpus by urlid
        train_corpus = sorted(docs[0:offset], key=operator.itemgetter(0))
        self.corpus_count = len(train_corpus)

        # sort predict_corpus by urlid
        predict_corpus = sorted(docs[offset:offset+int(len(docs)*0.1)], \
                key=operator.itemgetter(0))

        self.db.execute("delete from wordlist_eval")
        self.db.execute("alter table wordlist_eval auto_increment = 0")
        self.wordids = {}
        self.wordcounts = {}
        self.cache_urls = {}
        for c in train_corpus:
            self.add_freq_index(c[0], c[1], c[2].split())
            if debug:
                sys.stdout.write('.')
                sys.stdout.flush()
        self.commit_freq_index('wordlist_eval')

        return (train_corpus, predict_corpus)

    def load_file_corpus(self, name, debug = False):
        wordsfile = paths['corpus.corpus_other'] + name[0] + ".mat.clabel"
        f = open(wordsfile, 'r')
        self.wordids = {}
        wordid = 1
        for line in f:
            self.wordids[int(wordid)] = line.strip()
            wordid += 1

        catsfile = paths['corpus.corpus_other'] + name[0] + ".mat.rlabel"
        f = open(catsfile, 'r')
        cats = {}
        uniqcats = set()
        docid = 0
        for line in f:
            cats[docid] = line.strip()
            uniqcats.add(line.strip())
            docid += 1
        self.categories = list(uniqcats)

        matfile = paths['corpus.corpus_other'] + name[0] + ".mat"
        f = open(matfile, 'r')
        f.readline() # ignore first line
        docs = []
        docid = 0
        for line in f:
            wordfreq = {}
            for (wordid, freq) in izip(*[iter(line.split())]*2):
                wordfreq[self.wordids[int(wordid)]] = int(float(freq))
            docs.append((docid, wordfreq, cats[docid]))
            docid += 1
            if debug:
                sys.stdout.write('.')
                sys.stdout.flush()
        return docs

    def load_db_corpus(self, name, debug = False, retain = False):
        rows = self.db.selectall("""select c.urlid, c.content,
            group_concat(cc.category separator ' ')
            from %s as c, %s as cc
            where c.urlid = cc.urlid
            group by c.urlid order by c.urlid desc""" % (name[0], name[1]))
        if debug: print "Processing %d articles..." % len(rows)
        if retain and self.retained_db_docs != None:
            return self.retained_db_docs
        docs = []
        for row in rows:
            wordfreq = self.txtpro.simpletextprocess(row[0], row[1])
            if wordfreq.N() > 0 and 'NotRelated' not in row[2].split(' '):
                docs.append((row[0], wordfreq, row[2]))
            if debug:
                sys.stdout.write('.')
                sys.stdout.flush()
        if retain:
            self.retained_db_docs = docs
        return docs
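
Both corpus variants share the same term weighting: get_tfidf() scores each word with a log-scaled term frequency times a log-scaled inverse document frequency, then L2-normalizes the vector so that cos_sim() reduces to a dot product over shared keys. A small self-contained sketch of that computation, using plain word-keyed dicts and made-up frequencies instead of the wordlist tables:

import math

def tfidf_vector(wordfreq, dftext, corpus_count):
    # Same weighting as get_tfidf(): log2(tf + 1) * (log2(N + 1) - log2(df + 1)),
    # followed by L2 normalization.
    data = {}
    distsq = 0.0
    for word, tf in wordfreq.items():
        if word not in dftext:
            continue
        w = math.log(tf + 1, 2) * \
            (math.log(corpus_count + 1, 2) - math.log(dftext[word] + 1, 2))
        data[word] = w
        distsq += w * w
    dist = math.sqrt(distsq)
    if dist > 1.0e-9:
        for word in data:
            data[word] /= dist
    return data

# Made-up document frequencies, for illustration only.
dftext = {'robot': 40, 'learning': 120, 'chess': 5}
v1 = tfidf_vector({'robot': 3, 'chess': 1}, dftext, corpus_count=1000)
v2 = tfidf_vector({'robot': 1, 'learning': 2}, dftext, corpus_count=1000)
print sum(v1[w] * v2[w] for w in v1 if w in v2)   # cosine similarity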
Example #4
0
class AINewsWekaClassifier:
    def __init__(self):
        self.txtpro = AINewsTextProcessor()

    def __save_bag_of_words(self, tid, fieldidx):
        # find all unique words in the arff 'title' field, remove stop
        # words, perform stemming, collect their frequencies
        phrases = []
        f = arff.load(
            open("%s%d.arff" % (paths['weka.training_arff_dir'], tid), 'r'))
        for record in f['data']:
            phrases.append(record[fieldidx])
        bag = self.txtpro.simpletextprocess(0, ' '.join(phrases))
        smallerbag = FreqDist()
        i = 0
        for word in bag:
            if i == 1000:
                break
            smallerbag[word] = bag[word]
            i += 1
        p = open(
            "%sbag_of_words-%d.pickle" %
            (paths['weka.bag_of_words_dir'], fieldidx), 'w')
        pickle.dump(smallerbag, p)
        p.close()

    def __prepare_arff(self, tid):
        p = open("%sbag_of_words-0.pickle" % paths['weka.bag_of_words_dir'],
                 'r')
        bag_title = pickle.load(p)
        p.close()
        p = open("%sbag_of_words-1.pickle" % paths['weka.bag_of_words_dir'],
                 'r')
        bag_body = pickle.load(p)
        p.close()

        data = {
            'attributes': [],
            'data': [],
            'description': u'',
            'relation': tid
        }
        for word in bag_title:
            data['attributes'].append(("title-%s" % word, 'NUMERIC'))
        for word in bag_body:
            data['attributes'].append(("body-%s" % word, 'NUMERIC'))
        data['attributes'].append(('class', ['yes', 'no']))

        f = arff.load(
            open("%s%d.arff" % (paths['weka.training_arff_dir'], tid), 'r'))
        for record in f['data']:
            record_bag_title = self.txtpro.simpletextprocess(0, record[0])
            record_bag_body = self.txtpro.simpletextprocess(0, record[1])
            record_data = []
            # iterate through original bag, figure out freq in this record's bag
            for word in bag_title:
                if word in record_bag_title:
                    record_data.append(record_bag_title[word])
                else:
                    record_data.append(0)
            for word in bag_body:
                if word in record_bag_body:
                    record_data.append(record_bag_body[word])
                else:
                    record_data.append(0)
            record_data.append(record[2])
            data['data'].append(record_data)

        fnew = open("%s%d-wordvec-nonsparse.arff" % \
                        (paths['weka.training_arff_dir'], tid), 'w')
        arff.dump(fnew, data)
        fnew.close()

        # convert to sparse format
        Popen(("java -cp %s weka.filters.unsupervised.instance.NonSparseToSparse " +
               "-i %s%d-wordvec-nonsparse.arff -o %s%d-wordvec.arff") % \
                  (paths['weka.weka_jar'],
                   paths['weka.training_arff_dir'], tid,
                   paths['weka.training_arff_dir'], tid),
              shell = True).communicate()

        remove("%s%d-wordvec-nonsparse.arff" %
               (paths['weka.training_arff_dir'], tid))

    # 1. load unprocessed arff files, from just one tid, from family_resemblance export
    # 2. gather all titles, parse into a bag of words
    # 3. save bag of words (list? need to keep the order) in a pickle file
    # 4. write new sparse arff files for each tid using this sorted bag of words

    def __get_tids(self):
        tids = []
        files = listdir(paths['weka.training_arff_dir'])
        for f in files:
            m = re.match(r'^(\d+)\.arff$', f)
            if m:
                if m.group(1) == '0': continue
                tids.append(int(m.group(1)))
        return tids

    def train(self):
        tids = self.__get_tids()

        # all tid arffs have same entries, so use the first to grab the bag of words
        print "Saving bag of words..."
        self.__save_bag_of_words(tids[0], 0)
        self.__save_bag_of_words(tids[0], 1)

        for tid in sorted(tids):
            print "Preparing tid %d" % tid
            self.__prepare_arff(tid)

        for tid in sorted(tids):
            print "Spread subsampling for tid %d" % tid
            Popen(("java -cp %s weka.filters.supervised.instance.SpreadSubsample " +
                   "-M 1.0 -X 0.0 -S 1 -c last " +
                   "-i %s%d-wordvec.arff -o %s%d-wordvec-subsample.arff") % \
                      (paths['weka.weka_jar'],
                       paths['weka.training_arff_dir'], tid,
                       paths['weka.training_arff_dir'], tid),
                  shell = True).communicate()

            print "Training random forests for tid %d" % tid
            Popen(("java -cp %s %s %s -v " +
                   "-t %s%d-wordvec-subsample.arff -d %s%d.model") % \
                      (paths['weka.weka_jar'],
                       config['weka.classifier'],
                       config['weka.classifier_params'],
                       paths['weka.training_arff_dir'], tid,
                       paths['weka.training_arff_dir'], tid),
                  shell = True, stdout = PIPE).communicate()
            print out

    def train_experiment(self):
        model_scores = {}
        models = {
            'random-forest': ('weka.classifiers.trees.RandomForest', '-I 20 -K 0'),
            'naive-bayes': ('weka.classifiers.bayes.NaiveBayes', ''),
            'bayesnet': ('weka.classifiers.bayes.BayesNet', ''),
            'j48': ('weka.classifiers.trees.J48', ''),
            'knn': ('weka.classifiers.lazy.IBk', '-K 3')
        }

        tids = self.__get_tids()

        # all tid arffs have same entries, so use the first to grab the bag of words
        print "Saving bag of words..."
        self.__save_bag_of_words(tids[0], 0)
        self.__save_bag_of_words(tids[0], 1)

        for tid in sorted(tids):
            print "Preparing tid %d" % tid
            self.__prepare_arff(tid)

        for tid in sorted(tids):
            print "Spread subsampling for tid %d" % tid
            Popen(("java -cp %s weka.filters.supervised.instance.SpreadSubsample " +
                   "-M 1.0 -X 0.0 -S 1 -c last " +
                   "-i %s%d-wordvec.arff -o %s%d-wordvec-subsample.arff") % \
                      (paths['weka.weka_jar'],
                       paths['weka.training_arff_dir'], tid,
                       paths['weka.training_arff_dir'], tid),
                  shell = True).communicate()

        for tid in sorted(tids):
            model_scores[tid] = {}
            for model in models.keys():
                print "Training %s for tid %d" % (models[model][0], tid)
                (out, _) = Popen(("java -cp %s %s %s -v " +
                                  "-t %s%d-wordvec-subsample.arff -d %s%d.model") % \
                                     (paths['weka.weka_jar'],
                                      models[model][0], models[model][1],
                                      paths['weka.training_arff_dir'], tid,
                                      paths['weka.training_arff_dir'], tid),
                                 shell = True, stdout = PIPE).communicate()

                correct = 0.0
                for line in out.splitlines():
                    m = re.search(
                        r'Correctly Classified Instances\s+\d+\s+(.*) %', line)
                    if m:
                        correct = float(m.group(1))
                        break
                model_scores[tid][model] = correct

        with open('training_experiment.csv', 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['model', 'tid', 'correct'])
            for tid in model_scores.keys():
                for model in model_scores[tid].keys():
                    writer.writerow([model, tid, model_scores[tid][model]])

    def __predict_arff(self):
        tids = self.__get_tids()

        # the testing file should always be 0.arff
        self.__prepare_arff(0)

        predictions = {}
        for tid in sorted(tids):
            predictions[tid] = []

            print "Predicting tid %d" % tid
            (out, err) = Popen(("java -cp %s %s " +
                                "-T %s0-wordvec.arff -l %s%d.model -p last") % \
                                   (paths['weka.weka_jar'],
                                    config['weka.classifier'],
                                    paths['weka.training_arff_dir'],
                                    paths['weka.training_arff_dir'], tid),
                               shell = True, stdout = PIPE).communicate()
            for line in out.splitlines():
                m = re.search(r'2:no\s+[12]:(no|yes)\s+\+?\s+(\d+\.?\d*)',
                              line)
                if m:
                    answer = False
                    if m.group(1) == 'yes':
                        answer = True
                    conf = float(m.group(2))
                    if conf < 0.75:
                        answer = False
                    predictions[tid].append((answer, conf))
        return predictions

    def predict(self, articles):
        # modifies the provided articles dict

        data = {
            'attributes': [('title', 'STRING'), ('body', 'STRING'),
                           ('class', ['yes', 'no'])],
            'data': [],
            'description': u'',
            'relation': '0'
        }

        for urlid in sorted(articles.keys()):
            title = re.sub(r'\W', ' ', articles[urlid]['title'])
            body = re.sub(r'\W', ' ', articles[urlid]['summary'])
            data['data'].append([title, body, 'no'])

        # make the testing file 0.arff
        fnew = open("%s0.arff" % paths['weka.training_arff_dir'], 'w')
        arff.dump(fnew, data)
        fnew.close()

        predictions = self.__predict_arff()

        for urlid in sorted(articles.keys()):
            articles[urlid]['categories'] = []

        tids = self.__get_tids()
        for tid in sorted(tids):
            for (i, urlid) in enumerate(sorted(articles.keys())):
                if predictions[tid][i][0]:
                    articles[urlid]['categories'].append(str(tid))
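
train_experiment() writes its accuracy figures to training_experiment.csv with one row per (model, tid) pair. A follow-up sketch (not part of the original code) for summarizing that file, for example to pick the best-scoring classifier per tid:

import csv
from collections import defaultdict

# Assumes training_experiment.csv was produced by train_experiment() above.
best = defaultdict(lambda: (None, -1.0))
with open('training_experiment.csv', 'r') as csvfile:
    for row in csv.DictReader(csvfile):
        tid = int(row['tid'])
        score = float(row['correct'])
        if score > best[tid][1]:
            best[tid] = (row['model'], score)

for tid in sorted(best.keys()):
    print "tid %d: %s (%.1f%% correctly classified)" % (tid, best[tid][0], best[tid][1])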