def __init__(self):
    """
    Read the run configuration and create the helper objects (database,
    corpus, duplicate detector, text processor and Weka classifier) that
    this instance works with.
    """
    self.debug = config['ainews.debug']

    # Articles published before C{earliest_date} fall outside the
    # configured period (C{ainews.period}, in days, counted back from today).
    self.today = date.today()
    period_days = int(config['ainews.period'])
    self.earliest_date = self.today - timedelta(days=period_days)

    # Collaborators are created in the same order as before, in case
    # their constructors have side effects.
    self.db = AINewsDB()
    self.corpus = AINewsCorpus()
    self.duplicates = AINewsDuplicates()
    self.txtpro = AINewsTextProcessor()
    self.weka = AINewsWekaClassifier()

    # Working state, filled in later.
    self.articles = {}
    self.semiauto_email_output = ""
def __init__(self):
    """
    Create the text processor and database handle, initialise the word
    index dictionaries, load the per-source relevance table and restore
    the corpus word frequencies via C{restore_corpus()}.
    """
    self.txtpro = AINewsTextProcessor()
    self.cache_urls = {}

    #: A dictionary of word=>word freq in corpus
    self.dftext = {}

    #: A dictionary of word=>wordid
    self.idwords = {}

    #: A dictionary of wordid=>word
    self.wordids = {}

    self.db = AINewsDB()

    self.categories = [
        "AIOverview", "Agents", "Applications", "CognitiveScience",
        "Education", "Ethics", "Games", "History", "Interfaces",
        "MachineLearning", "NaturalLanguage", "Philosophy", "Reasoning",
        "Representation", "Robots", "ScienceFiction", "Speech", "Systems",
        "Vision",
    ]

    # Map each parser name (the part before '::') to its integer relevance.
    self.sources = {}
    for source_row in self.db.selectall("select parser, relevance from sources"):
        parser_name = source_row[0].split('::')[0]
        self.sources[parser_name] = int(source_row[1])

    self.retained_db_docs = None

    self.restore_corpus()
def __init__(self):
    """
    Load configuration values and instantiate the database, corpus,
    duplicate-detection, text-processing and Weka classifier helpers.
    """
    self.debug = config["ainews.debug"]

    # The acceptance window ends today and starts C{ainews.period} days ago.
    self.today = date.today()
    window = timedelta(days=int(config["ainews.period"]))
    self.earliest_date = self.today - window

    # Helper construction order is unchanged.
    self.db = AINewsDB()
    self.corpus = AINewsCorpus()
    self.duplicates = AINewsDuplicates()
    self.txtpro = AINewsTextProcessor()
    self.weka = AINewsWekaClassifier()

    # Per-run state starts out empty.
    self.articles = {}
    self.semiauto_email_output = ""
def __init__(self):
    """
    Create the text processor and database handle, initialise the word
    index dictionaries and restore the corpus word frequencies via
    C{restore_corpus()}.
    """
    self.txtpro = AINewsTextProcessor()
    self.cache_urls = {}

    #: A dictionary of word=>word freq in corpus
    self.dftext = {}

    #: A dictionary of word=>wordid
    self.idwords = {}

    #: A dictionary of wordid=>word
    self.wordids = {}

    self.db = AINewsDB()

    self.categories = [
        "AIOverview", "Agents", "Applications", "CognitiveScience",
        "Education", "Ethics", "Games", "History", "Interfaces",
        "MachineLearning", "NaturalLanguage", "Philosophy", "Reasoning",
        "Representation", "Robots", "ScienceFiction", "Speech", "Systems",
        "Vision",
    ]

    self.retained_db_docs = None

    self.restore_corpus()
def __init__(self):
    """
    Read the run configuration, create the helper objects (database,
    corpus, duplicate detector, SVM classifier, text processor and
    summarizer) and build the topic-name to topic-id table.
    """
    self.debug = config['ainews.debug']

    # Articles older than C{ainews.period} days (counted back from today)
    # are outside the accepted window.
    self.today = date.today()
    period_days = int(config['ainews.period'])
    self.earliest_date = self.today - timedelta(days=period_days)

    # Helper construction order is unchanged.
    self.db = AINewsDB()
    self.corpus = AINewsCorpus()
    self.duplicates = AINewsDuplicates()
    self.svm_classifier = AINewsSVMClassifier()
    self.txtpro = AINewsTextProcessor()
    self.summarizer = AINewsSummarizer()

    # Per-run state starts out empty.
    self.articles = {}
    self.publishable_articles = []
    self.semiauto_email_output = ""

    # Topic ids are the positions of the names in this list (0..18).
    topic_names = [
        "AIOverview", "Agents", "Applications", "CognitiveScience",
        "Education", "Ethics", "Games", "History", "Interfaces",
        "MachineLearning", "NaturalLanguage", "Philosophy", "Reasoning",
        "Representation", "Robots", "ScienceFiction", "Speech", "Systems",
        "Vision",
    ]
    self.topicids = dict(
        (topic, position) for position, topic in enumerate(topic_names))
def __init__(self):
    """
    Set up the text processor, the database handle and the empty word
    index dictionaries, then call C{restore_corpus()} to populate them.
    """
    self.txtpro = AINewsTextProcessor()
    self.cache_urls = {}

    #: A dictionary of word=>word freq in corpus
    self.dftext = {}

    #: A dictionary of word=>wordid
    self.idwords = {}

    #: A dictionary of wordid=>word
    self.wordids = {}

    self.db = AINewsDB()

    known_categories = (
        "AIOverview", "Agents", "Applications", "CognitiveScience",
        "Education", "Ethics", "Games", "History", "Interfaces",
        "MachineLearning", "NaturalLanguage", "Philosophy", "Reasoning",
        "Representation", "Robots", "ScienceFiction", "Speech", "Systems",
        "Vision",
    )
    self.categories = list(known_categories)

    self.retained_db_docs = None

    self.restore_corpus()
class AINewsCorpus:
    """
    A corpus is a set of news articles (each with a title, content, and
    categories) that are used for training and comparison purposes. For
    training, the corpus provides the training examples. For comparison, the
    corpus provides the data for various measures like word frequency. This is
    important in the prediction process: we only want to predict a new
    article's categories based on word frequencies, and other measures, from
    the corpus; we don't want articles that have not been "vetted" (articles
    not part of the corpus) to contribute to these measures.

    A corpus can be "loaded" via C{load_corpus()} or "restored" via
    C{restore_corpus()}. The difference is the following: when loading a
    corpus, word frequencies are measured and stored in the database table
    C{wordlist_eval}; when restoring a corpus, word frequencies are simply
    retrieved from the database table C{wordlist}. In other words, we load a
    corpus when we are training or evaluating our training procedures, and we
    restore a corpus when we are predicting.
    """

    def __init__(self):
        self.txtpro = AINewsTextProcessor()
        self.cache_urls = {}

        #: A dictionary of word=>word freq in corpus
        self.dftext = {}

        #: A dictionary of word=>wordid
        self.idwords = {}

        #: A dictionary of wordid=>word
        self.wordids = {}

        #: A dictionary of word=>number of documents seen containing it;
        #: filled by C{add_freq_index()}, flushed by C{commit_freq_index()}
        self.wordcounts = {}

        self.db = AINewsDB()

        self.categories = ["AIOverview", "Agents", "Applications",
            "CognitiveScience", "Education", "Ethics", "Games", "History",
            "Interfaces", "MachineLearning", "NaturalLanguage", "Philosophy",
            "Reasoning", "Representation", "Robots", "ScienceFiction",
            "Speech", "Systems", "Vision"]

        self.retained_db_docs = None

        self.restore_corpus()

    def compare_articles(self, article1, article2):
        """
        Three-way comparison of two article dictionaries.

        Articles are ordered by number of duplicates, then by source
        relevance, then by number of categories.

        @param article1: first article (as returned by C{get_article()}).
        @type article1: C{dict}
        @param article2: second article.
        @type article2: C{dict}
        @return: -1, 0 or 1, like the built-in C{cmp}.
        @rtype: C{int}
        """
        key1 = (len(article1['duplicates']), article1['source_relevance'],
                len(article1['categories']))
        key2 = (len(article2['duplicates']), article2['source_relevance'],
                len(article2['categories']))
        # Lexicographic tuple comparison is equivalent to the nested
        # cmp() chain: the first differing field decides.
        return (key1 > key2) - (key1 < key2)

    def get_tfidf(self, urlid, wordfreq):
        """
        Helper function to retrieve the tfidf of each word based on the urlid.
        Only words present in C{self.dftext} contribute. The resulting vector
        is normalised to unit length (unless it is numerically zero) and
        cached per urlid in C{self.cache_urls}.

        @param urlid: target news story's urlid.
        @type urlid: C{int}
        @param wordfreq: mapping of word=>frequency in the story.
        @type wordfreq: C{dict}
        @return: mapping of wordid=>normalised tfidf weight.
        @rtype: C{dict}
        """
        if urlid in self.cache_urls:
            return self.cache_urls[urlid]
        wordid_freq_pairs = {}
        for word in wordfreq:
            if word in self.dftext:
                wordid_freq_pairs[self.idwords[word]] = \
                    (wordfreq[word], self.dftext[word])

        data = {}
        distsq = 0.0
        for wordid in wordid_freq_pairs:
            (term_freq, doc_freq) = wordid_freq_pairs[wordid]
            tfidf = math.log(term_freq + 1, 2) * \
                    (math.log(self.corpus_count + 1, 2) - \
                     math.log(doc_freq + 1, 2))
            data[wordid] = tfidf
            distsq += tfidf * tfidf
        dist = math.sqrt(distsq)
        # Skip normalisation for (near-)zero vectors to avoid dividing by 0.
        if dist > 1.0e-9:
            for key in data:
                data[key] /= dist
        self.cache_urls[urlid] = data
        return data

    def cos_sim(self, tfidf1, tfidf2):
        """
        A helper function to compute the cos similarity between news story
        and centroid. Both vectors are expected to be unit-normalised already
        (as produced by C{get_tfidf()}), so this is a plain dot product over
        the shared keys.

        @param tfidf1: target news story tfidf vector.
        @type tfidf1: C{dict}
        @param tfidf2: centroid tfidf vector.
        @type tfidf2: C{dict}
        @return: sum of products over the keys present in both vectors.
        @rtype: C{float}
        """
        sim = 0.0
        for key in tfidf1:
            if key in tfidf2:
                sim += tfidf1[key] * tfidf2[key]
        return sim

    def get_article(self, urlid, corpus=False):
        """
        Fetch one article and its derived data.

        @param urlid: id of the article to fetch.
        @type urlid: C{int}
        @param corpus: if true, read from the C{cat_corpus} tables, which
            only provide url, title and content; the remaining fields get
            neutral defaults. Otherwise read from C{urllist}.
        @type corpus: C{bool}
        @return: the article dictionary, or C{None} when no row exists or
            the row has no content.
        @rtype: C{dict} or C{None}
        """
        if corpus:
            table = 'cat_corpus'
            cat_table = 'cat_corpus_cats'
            row = self.db.selectone("""select u.url, u.title, u.content
                from %s as u where u.urlid = %s""" % (table, urlid))
        else:
            table = 'urllist'
            cat_table = 'categories'
            row = self.db.selectone("""select u.url, u.title, u.content,
                u.summary, u.pubdate, u.crawldate, u.processed, u.published,
                u.source, u.source_relevance, u.source_id, u.tfpn, u.image_url
                from %s as u where u.urlid = %s""" % (table, urlid))

        if row is None or row[2] is None:
            return None

        content = row[2]
        wordfreq = self.txtpro.simpletextprocess(urlid, content)

        # Defaults used for corpus rows, which carry only three columns.
        # Indexing past column 2 there would raise IndexError.
        summary = ""
        pubdate = ""
        crawldate = ""
        processed = False
        published = False
        source = ""
        source_relevance = 0
        source_id = None
        tfpn = "xx"
        image_url = None
        if not corpus:
            summary = row[3]
            pubdate = row[4]
            crawldate = row[5]
            processed = (row[6] == 1)
            published = (row[7] == 1)
            source = row[8]
            if row[9]:
                source_relevance = int(row[9])
            source_id = row[10]
            tfpn = row[11]
            image_url = row[12]

        cat_rows = self.db.selectall("""select category from %s
            where urlid = %s""" % (cat_table, urlid))
        categories = [cat_row[0] for cat_row in cat_rows]

        return {
            'urlid': urlid, 'url': row[0], 'title': row[1],
            'content': content, 'summary': summary,
            'pubdate': pubdate, 'crawldate': crawldate,
            'processed': processed, 'published': published,
            'source': source, 'source_relevance': source_relevance,
            'source_id': source_id,
            'categories': categories, 'duplicates': [], 'tfpn': tfpn,
            'wordfreq': wordfreq, 'image_url': image_url,
            'tfidf': self.get_tfidf(urlid, wordfreq)
        }

    def get_articles_daterange(self, date_start, date_end):
        """
        Return a dict of urlid=>article for articles in C{urllist} whose
        pubdate lies in [date_start, date_end]. Values may be C{None} for
        rows without content (see C{get_article()}).
        """
        articles = {}
        rows = self.db.selectall("""select urlid from urllist where
            pubdate >= %s and pubdate <= %s""", (date_start, date_end))
        for row in rows:
            articles[row[0]] = self.get_article(row[0])
        return articles

    def get_articles_idrange(self, urlid_start, urlid_end, corpus=False):
        """
        Return a dict of urlid=>article for urlids in
        [urlid_start, urlid_end]; articles that cannot be loaded are omitted.
        """
        articles = {}
        rows = self.db.selectall("""select urlid from urllist where
            urlid >= %s and urlid <= %s""", (urlid_start, urlid_end))
        for row in rows:
            art = self.get_article(row[0], corpus)
            if art is not None:
                articles[row[0]] = art
        return articles

    def get_unprocessed(self):
        """Return a dict of urlid=>article for rows with processed = 0."""
        articles = {}
        rows = self.db.selectall(
            "select urlid from urllist where processed = 0")
        for row in rows:
            articles[row[0]] = self.get_article(row[0])
        return articles

    def get_publishable(self):
        """
        Return a list of articles that are publishable, not yet published
        and have a non-zero pubdate.
        """
        rows = self.db.selectall(
            "select urlid from urllist where "
            "publishable = 1 and published = 0 and pubdate != '0000-00-00'")
        return [self.get_article(row[0]) for row in rows]

    def get_published(self):
        """Return a list of articles with published = 1."""
        rows = self.db.selectall(
            "select urlid from urllist where published = 1")
        return [self.get_article(row[0]) for row in rows]

    def mark_processed(self, articles):
        """Set processed = 1 in C{urllist} for each given article."""
        for article in articles:
            self.db.execute(
                "update urllist set processed = 1 where urlid = %s",
                article['urlid'])

    def mark_publishable(self, articles):
        """Set publishable = 1 in C{urllist} for each given article."""
        for article in articles:
            self.db.execute(
                "update urllist set publishable = 1 where urlid = %s",
                article['urlid'])

    def mark_published(self, articles):
        """Set published = 1 in C{urllist} for each given article."""
        for article in articles:
            self.db.execute(
                "update urllist set published = 1 where urlid = %s",
                article['urlid'])

    def restore_corpus(self):
        """
        Reload the word index and document frequencies from the C{wordlist}
        table and the corpus size from C{cat_corpus}.
        """
        self.wordids = {}
        self.dftext = {}
        rows = self.db.selectall("select rowid, word, dftext from wordlist")
        for row in rows:
            self.wordids[row[0]] = row[1]
            self.idwords[row[1]] = row[0]
            self.dftext[row[1]] = row[2]
        self.corpus_count = self.db.selectone(
            "select count(*) from cat_corpus")[0]

    def add_freq_index(self, urlid, wordfreq, categories=None):
        """
        Count one more document for every word occurring in C{wordfreq}.

        @param urlid: id of the document (currently unused).
        @param wordfreq: iterable of the words in the document.
        @param categories: categories of the document (currently unused).
        """
        for word in wordfreq:
            self.wordcounts[word] = self.wordcounts.get(word, 0) + 1

    def commit_freq_index(self, table):
        """
        Write the accumulated document frequencies to C{table}, rebuild the
        in-memory word index from the inserted row ids and reset the
        accumulator.

        @param table: name of the word list table to insert into.
        @type table: C{str}
        """
        self.dftext = {}
        self.wordids = {}
        for word in self.wordcounts:
            rowid = self.db.execute("insert into "+table+" (word, dftext) " + \
                "values(%s, %s)", (word, self.wordcounts[word]))
            self.wordids[rowid] = word
            self.idwords[word] = rowid
            self.dftext[word] = self.wordcounts[word]
        self.wordcounts = {}

    def load_corpus(self, ident, pct, debug=False, retain=False):
        """
        Load a corpus, split it randomly into training and prediction sets
        and rebuild the C{wordlist_eval} word index from the training set.

        @param ident: C{"file:<name>"} or C{"db:<table>:<cat_table>"}.
        @type ident: C{str}
        @param pct: fraction of the documents to use for training.
        @type pct: C{float}
        @param debug: print progress information.
        @type debug: C{bool}
        @param retain: passed on to C{load_db_corpus()}.
        @type retain: C{bool}
        @return: C{(train_corpus, predict_corpus)}, each sorted by urlid.
        @rtype: C{tuple}
        @raise ValueError: if C{ident} has an unknown source prefix.
        """
        # Single-argument parenthesised prints behave exactly like the
        # print statement under Python 2.
        if debug:
            print("Loading corpus...")
        parts = ident.split(':')
        source = parts[0]
        name = parts[1:]
        if source == "file":
            docs = self.load_file_corpus(name, debug)
        elif source == "db":
            docs = self.load_db_corpus(name, debug, retain)
        else:
            raise ValueError("unknown corpus source %r in %r "
                             "(expected 'file' or 'db')" % (source, ident))
        if debug:
            print("")

        random.shuffle(docs)
        offset = int(len(docs) * pct)
        if debug:
            print("Selecting random %d%% of corpus (%d docs)." % \
                (pct * 100, offset))

        # sort train_corpus by urlid
        train_corpus = sorted(docs[0:offset], key=operator.itemgetter(0))
        self.corpus_count = len(train_corpus)

        # sort predict_corpus by urlid
        predict_corpus = sorted(docs[offset:offset + int(len(docs) * 0.1)],
                                key=operator.itemgetter(0))

        self.db.execute("delete from wordlist_eval")
        self.db.execute("alter table wordlist_eval auto_increment = 0")
        self.wordids = {}
        self.wordcounts = {}
        self.cache_urls = {}
        for c in train_corpus:
            self.add_freq_index(c[0], c[1], c[2].split())
            if debug:
                sys.stdout.write('.')
                sys.stdout.flush()
        self.commit_freq_index('wordlist_eval')

        return (train_corpus, predict_corpus)

    def load_file_corpus(self, name, debug=False):
        """
        Load a corpus from the C{.mat}, C{.mat.clabel} and C{.mat.rlabel}
        files found under C{paths['corpus.corpus_other']}.

        @param name: list whose first element is the file base name.
        @type name: C{list}
        @return: list of C{(docid, wordfreq, category)} tuples.
        @rtype: C{list}
        """
        base = paths['corpus.corpus_other'] + name[0]

        # Column labels: one word per line, word ids start at 1.
        self.wordids = {}
        with open(base + ".mat.clabel", 'r') as f:
            for (wordid, line) in enumerate(f, 1):
                self.wordids[wordid] = line.strip()

        # Row labels: one category per document, doc ids start at 0.
        cats = {}
        uniqcats = set()
        with open(base + ".mat.rlabel", 'r') as f:
            for (docid, line) in enumerate(f):
                cats[docid] = line.strip()
                uniqcats.add(line.strip())
        self.categories = list(uniqcats)

        docs = []
        with open(base + ".mat", 'r') as f:
            f.readline()  # ignore first line
            for (docid, line) in enumerate(f):
                wordfreq = {}
                # Each line is a flat sequence of "wordid freq" pairs.
                tokens = line.split()
                for (wordid, freq) in zip(tokens[0::2], tokens[1::2]):
                    wordfreq[self.wordids[int(wordid)]] = int(float(freq))
                docs.append((docid, wordfreq, cats[docid]))
                if debug:
                    sys.stdout.write('.')
                    sys.stdout.flush()
        return docs

    def load_db_corpus(self, name, debug=False, retain=False):
        """
        Load a corpus from the database, skipping documents without words
        and documents categorised as C{NotRelated}.

        @param name: C{[table, category_table]}.
        @type name: C{list}
        @param retain: reuse the documents kept from a previous call (if
            any) and keep the documents loaded by this call.
        @type retain: C{bool}
        @return: list of C{(urlid, wordfreq, categories)} tuples, where
            C{categories} is a space-separated string.
        @rtype: C{list}
        """
        # Check the retained copy first so no query is run for nothing.
        if retain and self.retained_db_docs is not None:
            return self.retained_db_docs

        rows = self.db.selectall("""select c.urlid, c.content,
            group_concat(cc.category separator ' ')
            from %s as c, %s as cc
            where c.urlid = cc.urlid
            group by c.urlid order by c.urlid desc""" % (name[0], name[1]))
        if debug:
            print("Processing %d articles..." % len(rows))

        docs = []
        for row in rows:
            wordfreq = self.txtpro.simpletextprocess(row[0], row[1])
            if wordfreq.N() > 0 and 'NotRelated' not in row[2].split(' '):
                docs.append((row[0], wordfreq, row[2]))
            if debug:
                sys.stdout.write('.')
                sys.stdout.flush()
        if retain:
            self.retained_db_docs = docs
        return docs
class AINewsCorpus:
    """
    A corpus is a set of news articles (each with a title, content, and
    categories) that are used for training and comparison purposes. For
    training, the corpus provides the training examples. For comparison, the
    corpus provides the data for various measures like word frequency. This is
    important in the prediction process: we only want to predict a new
    article's categories based on word frequencies, and other measures, from
    the corpus; we don't want articles that have not been "vetted" (articles
    not part of the corpus) to contribute to these measures.

    A corpus can be "loaded" via C{load_corpus()} or "restored" via
    C{restore_corpus()}. The difference is the following: when loading a
    corpus, word frequencies are measured and stored in the database table
    C{wordlist_eval}; when restoring a corpus, word frequencies are simply
    retrieved from the database table C{wordlist}. In other words, we load a
    corpus when we are training or evaluating our training procedures, and we
    restore a corpus when we are predicting.
    """

    def __init__(self):
        self.txtpro = AINewsTextProcessor()
        self.cache_urls = {}

        #: A dictionary of word=>word freq in corpus
        self.dftext = {}

        #: A dictionary of word=>wordid
        self.idwords = {}

        #: A dictionary of wordid=>word
        self.wordids = {}

        #: A dictionary of word=>number of documents seen containing it;
        #: filled by C{add_freq_index()}, flushed by C{commit_freq_index()}
        self.wordcounts = {}

        self.db = AINewsDB()

        self.categories = ["AIOverview", "Agents", "Applications",
            "CognitiveScience", "Education", "Ethics", "Games", "History",
            "Interfaces", "MachineLearning", "NaturalLanguage", "Philosophy",
            "Reasoning", "Representation", "Robots", "ScienceFiction",
            "Speech", "Systems", "Vision"]

        #: A dictionary of parser name=>relevance, read from C{sources}
        self.sources = {}
        rows = self.db.selectall("select parser, relevance from sources")
        for row in rows:
            self.sources[row[0].split('::')[0]] = int(row[1])

        self.retained_db_docs = None

        self.restore_corpus()

    def get_relevance(self, publisher):
        """
        Look up the relevance of a publisher in C{self.sources}. Any
        publisher string containing "via Google News" is treated as the
        C{GoogleNews} source.

        @param publisher: publisher name of an article.
        @type publisher: C{str}
        @return: the relevance stored for that publisher.
        @rtype: C{int}
        @raise KeyError: if the publisher is not a known source.
        """
        if re.search(r'via Google News', publisher):
            publisher = 'GoogleNews'
        return self.sources[publisher]

    def _article_relevance(self, article):
        """Relevance of an article; user-submitted articles always get 200."""
        if article['publisher'].find('User submitted') != -1:
            return 200
        return self.get_relevance(article['publisher'])

    def compare_articles(self, article1, article2):
        """
        Three-way comparison of two article dictionaries.

        Articles are ordered by number of duplicates, then by publisher
        relevance, then by number of categories.

        @param article1: first article (as returned by C{get_article()}).
        @type article1: C{dict}
        @param article2: second article.
        @type article2: C{dict}
        @return: -1, 0 or 1, like the built-in C{cmp}.
        @rtype: C{int}
        """
        key1 = (len(article1['duplicates']),
                self._article_relevance(article1),
                len(article1['categories']))
        key2 = (len(article2['duplicates']),
                self._article_relevance(article2),
                len(article2['categories']))
        # Lexicographic tuple comparison is equivalent to the nested
        # cmp() chain: the first differing field decides.
        return (key1 > key2) - (key1 < key2)

    def get_tfidf(self, urlid, wordfreq):
        """
        Helper function to retrieve the tfidf of each word based on the urlid.
        Only words present in C{self.dftext} contribute. The resulting vector
        is normalised to unit length (unless it is numerically zero) and
        cached per urlid in C{self.cache_urls}.

        @param urlid: target news story's urlid.
        @type urlid: C{int}
        @param wordfreq: mapping of word=>frequency in the story.
        @type wordfreq: C{dict}
        @return: mapping of wordid=>normalised tfidf weight.
        @rtype: C{dict}
        """
        if urlid in self.cache_urls:
            return self.cache_urls[urlid]
        wordid_freq_pairs = {}
        for word in wordfreq:
            if word in self.dftext:
                wordid_freq_pairs[self.idwords[word]] = \
                    (wordfreq[word], self.dftext[word])

        data = {}
        distsq = 0.0
        for wordid in wordid_freq_pairs:
            (term_freq, doc_freq) = wordid_freq_pairs[wordid]
            tfidf = math.log(term_freq + 1, 2) * \
                    (math.log(self.corpus_count + 1, 2) - \
                     math.log(doc_freq + 1, 2))
            data[wordid] = tfidf
            distsq += tfidf * tfidf
        dist = math.sqrt(distsq)
        # Skip normalisation for (near-)zero vectors to avoid dividing by 0.
        if dist > 1.0e-9:
            for key in data:
                data[key] /= dist
        self.cache_urls[urlid] = data
        return data

    def cos_sim(self, tfidf1, tfidf2):
        """
        A helper function to compute the cos similarity between news story
        and centroid. Both vectors are expected to be unit-normalised already
        (as produced by C{get_tfidf()}), so this is a plain dot product over
        the shared keys.

        @param tfidf1: target news story tfidf vector.
        @type tfidf1: C{dict}
        @param tfidf2: centroid tfidf vector.
        @type tfidf2: C{dict}
        @return: sum of products over the keys present in both vectors.
        @rtype: C{float}
        """
        sim = 0.0
        for key in tfidf1:
            if key in tfidf2:
                sim += tfidf1[key] * tfidf2[key]
        return sim

    def get_article(self, urlid, corpus=False):
        """
        Fetch one article and its derived data.

        @param urlid: id of the article to fetch.
        @type urlid: C{int}
        @param corpus: if true, read from the C{cat_corpus} tables, which
            only provide url, title and content; the remaining fields get
            neutral defaults. Otherwise read from C{urllist}.
        @type corpus: C{bool}
        @return: the article dictionary, or C{None} when no row exists or
            the row has no content. C{content} is the text truncated with
            C{trunc(..., max_pos=3000)}; C{content_all} is the full text.
        @rtype: C{dict} or C{None}
        """
        if corpus:
            table = 'cat_corpus'
            cat_table = 'cat_corpus_cats'
            row = self.db.selectone("""select u.url, u.title, u.content
                from %s as u where u.urlid = %s""" % (table, urlid))
        else:
            table = 'urllist'
            cat_table = 'categories'
            row = self.db.selectone("""select u.url, u.title, u.content,
                u.summary, u.pubdate, u.crawldate, u.processed, u.published,
                u.publisher from %s as u where u.urlid = %s""" % \
                (table, urlid))

        if row is None or row[2] is None:
            return None

        wordfreq = self.txtpro.simpletextprocess(urlid, row[2])

        # Defaults used for corpus rows, which carry only three columns.
        summary = ""
        pubdate = ""
        crawldate = ""
        processed = False
        published = False
        publisher = ""
        if not corpus:
            summary = row[3]
            pubdate = row[4]
            crawldate = row[5]
            processed = (row[6] == 1)
            published = (row[7] == 1)
            publisher = row[8]

        cat_rows = self.db.selectall("""select category from %s
            where urlid = %s""" % (cat_table, urlid))
        categories = [cat_row[0] for cat_row in cat_rows]

        return {'urlid': urlid, 'url': row[0], 'title': row[1],
                'content': trunc(row[2], max_pos=3000),
                'content_all': row[2],
                'summary': summary,
                'pubdate': pubdate, 'crawldate': crawldate,
                'processed': processed, 'published': published,
                'publisher': publisher,
                'categories': categories, 'duplicates': [],
                'wordfreq': wordfreq,
                'tfidf': self.get_tfidf(urlid, wordfreq)}

    def get_articles_daterange(self, date_start, date_end):
        """
        Return a dict of urlid=>article for articles in C{urllist} whose
        pubdate lies in [date_start, date_end]. Values may be C{None} for
        rows without content (see C{get_article()}).
        """
        articles = {}
        rows = self.db.selectall("""select urlid from urllist where
            pubdate >= %s and pubdate <= %s""", (date_start, date_end))
        for row in rows:
            articles[row[0]] = self.get_article(row[0])
        return articles

    def get_articles_idrange(self, urlid_start, urlid_end, corpus=False):
        """
        Return a dict of urlid=>article for urlids in
        [urlid_start, urlid_end]; articles that cannot be loaded are omitted.
        """
        articles = {}
        rows = self.db.selectall("""select urlid from urllist where
            urlid >= %s and urlid <= %s""", (urlid_start, urlid_end))
        for row in rows:
            art = self.get_article(row[0], corpus)
            if art is not None:
                articles[row[0]] = art
        return articles

    def get_unprocessed(self):
        """Return a dict of urlid=>article for rows with processed = 0."""
        articles = {}
        rows = self.db.selectall(
            "select urlid from urllist where processed = 0")
        for row in rows:
            articles[row[0]] = self.get_article(row[0])
        return articles

    def get_publishable(self):
        """
        Return a list of articles that are publishable, not yet published
        and have a non-zero pubdate.
        """
        rows = self.db.selectall("select urlid from urllist where "
            "publishable = 1 and published = 0 and pubdate != '0000-00-00'")
        return [self.get_article(row[0]) for row in rows]

    def get_published(self):
        """Return a list of articles with published = 1."""
        rows = self.db.selectall(
            "select urlid from urllist where published = 1")
        return [self.get_article(row[0]) for row in rows]

    def mark_processed(self, articles):
        """Set processed = 1 in C{urllist} for each given article."""
        for article in articles:
            self.db.execute(
                "update urllist set processed = 1 where urlid = %s",
                article['urlid'])

    def mark_publishable(self, articles):
        """Set publishable = 1 in C{urllist} for each given article."""
        for article in articles:
            self.db.execute(
                "update urllist set publishable = 1 where urlid = %s",
                article['urlid'])

    def mark_published(self, articles):
        """Set published = 1 in C{urllist} for each given article."""
        for article in articles:
            self.db.execute(
                "update urllist set published = 1 where urlid = %s",
                article['urlid'])

    def restore_corpus(self):
        """
        Reload the word index and document frequencies from the C{wordlist}
        table and the corpus size from C{cat_corpus}.
        """
        self.wordids = {}
        self.dftext = {}
        rows = self.db.selectall("select rowid, word, dftext from wordlist")
        for row in rows:
            self.wordids[row[0]] = row[1]
            self.idwords[row[1]] = row[0]
            self.dftext[row[1]] = row[2]
        self.corpus_count = self.db.selectone(
            "select count(*) from cat_corpus")[0]

    def add_freq_index(self, urlid, wordfreq, categories=None):
        """
        Count one more document for every word occurring in C{wordfreq}.

        @param urlid: id of the document (currently unused).
        @param wordfreq: iterable of the words in the document.
        @param categories: categories of the document (currently unused).
        """
        for word in wordfreq:
            self.wordcounts[word] = self.wordcounts.get(word, 0) + 1

    def commit_freq_index(self, table):
        """
        Write the accumulated document frequencies to C{table}, rebuild the
        in-memory word index from the inserted row ids and reset the
        accumulator.

        @param table: name of the word list table to insert into.
        @type table: C{str}
        """
        self.dftext = {}
        self.wordids = {}
        for word in self.wordcounts:
            rowid = self.db.execute("insert into "+table+" (word, dftext) " + \
                "values(%s, %s)", (word, self.wordcounts[word]))
            self.wordids[rowid] = word
            self.idwords[word] = rowid
            self.dftext[word] = self.wordcounts[word]
        self.wordcounts = {}

    def load_corpus(self, ident, pct, debug=False, retain=False):
        """
        Load a corpus, split it randomly into training and prediction sets
        and rebuild the C{wordlist_eval} word index from the training set.

        @param ident: C{"file:<name>"} or C{"db:<table>:<cat_table>"}.
        @type ident: C{str}
        @param pct: fraction of the documents to use for training.
        @type pct: C{float}
        @param debug: print progress information.
        @type debug: C{bool}
        @param retain: passed on to C{load_db_corpus()}.
        @type retain: C{bool}
        @return: C{(train_corpus, predict_corpus)}, each sorted by urlid.
        @rtype: C{tuple}
        @raise ValueError: if C{ident} has an unknown source prefix.
        """
        # Single-argument parenthesised prints behave exactly like the
        # print statement under Python 2.
        if debug:
            print("Loading corpus...")
        parts = ident.split(':')
        source = parts[0]
        name = parts[1:]
        if source == "file":
            docs = self.load_file_corpus(name, debug)
        elif source == "db":
            docs = self.load_db_corpus(name, debug, retain)
        else:
            raise ValueError("unknown corpus source %r in %r "
                             "(expected 'file' or 'db')" % (source, ident))
        if debug:
            print("")

        random.shuffle(docs)
        offset = int(len(docs) * pct)
        if debug:
            print("Selecting random %d%% of corpus (%d docs)." % \
                (pct * 100, offset))

        # sort train_corpus by urlid
        train_corpus = sorted(docs[0:offset], key=operator.itemgetter(0))
        self.corpus_count = len(train_corpus)

        # sort predict_corpus by urlid
        predict_corpus = sorted(docs[offset:offset + int(len(docs) * 0.1)],
                                key=operator.itemgetter(0))

        self.db.execute("delete from wordlist_eval")
        self.db.execute("alter table wordlist_eval auto_increment = 0")
        self.wordids = {}
        self.wordcounts = {}
        self.cache_urls = {}
        for c in train_corpus:
            self.add_freq_index(c[0], c[1], c[2].split())
            if debug:
                sys.stdout.write('.')
                sys.stdout.flush()
        self.commit_freq_index('wordlist_eval')

        return (train_corpus, predict_corpus)

    def load_file_corpus(self, name, debug=False):
        """
        Load a corpus from the C{.mat}, C{.mat.clabel} and C{.mat.rlabel}
        files found under C{paths['corpus.corpus_other']}.

        @param name: list whose first element is the file base name.
        @type name: C{list}
        @return: list of C{(docid, wordfreq, category)} tuples.
        @rtype: C{list}
        """
        base = paths['corpus.corpus_other'] + name[0]

        # Column labels: one word per line, word ids start at 1.
        self.wordids = {}
        with open(base + ".mat.clabel", 'r') as f:
            for (wordid, line) in enumerate(f, 1):
                self.wordids[wordid] = line.strip()

        # Row labels: one category per document, doc ids start at 0.
        cats = {}
        uniqcats = set()
        with open(base + ".mat.rlabel", 'r') as f:
            for (docid, line) in enumerate(f):
                cats[docid] = line.strip()
                uniqcats.add(line.strip())
        self.categories = list(uniqcats)

        docs = []
        with open(base + ".mat", 'r') as f:
            f.readline()  # ignore first line
            for (docid, line) in enumerate(f):
                wordfreq = {}
                # Each line is a flat sequence of "wordid freq" pairs.
                tokens = line.split()
                for (wordid, freq) in zip(tokens[0::2], tokens[1::2]):
                    wordfreq[self.wordids[int(wordid)]] = int(float(freq))
                docs.append((docid, wordfreq, cats[docid]))
                if debug:
                    sys.stdout.write('.')
                    sys.stdout.flush()
        return docs

    def load_db_corpus(self, name, debug=False, retain=False):
        """
        Load a corpus from the database, skipping documents without words
        and documents categorised as C{NotRelated}.

        @param name: C{[table, category_table]}.
        @type name: C{list}
        @param retain: reuse the documents kept from a previous call (if
            any) and keep the documents loaded by this call.
        @type retain: C{bool}
        @return: list of C{(urlid, wordfreq, categories)} tuples, where
            C{categories} is a space-separated string.
        @rtype: C{list}
        """
        # Check the retained copy first so no query is run for nothing.
        if retain and self.retained_db_docs is not None:
            return self.retained_db_docs

        rows = self.db.selectall("""select c.urlid, c.content,
            group_concat(cc.category separator ' ')
            from %s as c, %s as cc
            where c.urlid = cc.urlid
            group by c.urlid order by c.urlid desc""" % (name[0], name[1]))
        if debug:
            print("Processing %d articles..." % len(rows))

        docs = []
        for row in rows:
            wordfreq = self.txtpro.simpletextprocess(row[0], row[1])
            if wordfreq.N() > 0 and 'NotRelated' not in row[2].split(' '):
                docs.append((row[0], wordfreq, row[2]))
            if debug:
                sys.stdout.write('.')
                sys.stdout.flush()
        if retain:
            self.retained_db_docs = docs
        return docs
def __init__(self):
    """Create the text processor this instance uses."""
    text_processor = AINewsTextProcessor()
    self.txtpro = text_processor
class AINewsWekaClassifier:
    """
    Trains and applies one Weka model per topic id ("tid").

    Training data is read from C{<tid>.arff} files in
    C{paths['weka.training_arff_dir']}. Each file is converted to a sparse
    bag-of-words arff, subsampled, and passed to the classifier named in
    C{config['weka.classifier']}. C{0.arff} is reserved for the articles to
    be classified.
    """

    #: Matches arff file names of the form C{<tid>.arff}; derived files such
    #: as C{<tid>-wordvec.arff} do not match.
    _TID_FILE_RE = re.compile(r'^(\d+)\.arff$')

    def __init__(self):
        self.txtpro = AINewsTextProcessor()

    def __save_bag_of_words(self, tid, fieldidx):
        """
        Build and pickle the bag of words for one field of an arff file.

        @param tid: topic id whose arff file is read.
        @type tid: C{int}
        @param fieldidx: index of the record field to collect words from
            (C{train()} uses 0 and 1; C{__prepare_arff()} treats them as
            title and body).
        @type fieldidx: C{int}
        """
        # find all unique words in the arff field, remove stop
        # words, perform stemming, collect their frequencies
        with open("%s%d.arff" % (paths['weka.training_arff_dir'], tid),
                  'r') as arff_file:
            f = arff.load(arff_file)
        phrases = [record[fieldidx] for record in f['data']]
        bag = self.txtpro.simpletextprocess(0, ' '.join(phrases))

        # keep only the first 1000 words in the bag's iteration order
        smallerbag = FreqDist()
        for (i, word) in enumerate(bag):
            if i == 1000:
                break
            smallerbag[word] = bag[word]

        with open("%sbag_of_words-%d.pickle" % \
                  (paths['weka.bag_of_words_dir'], fieldidx), 'w') as p:
            pickle.dump(smallerbag, p)

    def __load_bag_of_words(self, fieldidx):
        """
        Load the bag pickled by C{__save_bag_of_words()} for C{fieldidx}.

        NOTE: unpickling executes whatever the file contains; only files
        written by this class should live in C{weka.bag_of_words_dir}.
        """
        with open("%sbag_of_words-%d.pickle" % \
                  (paths['weka.bag_of_words_dir'], fieldidx), 'r') as p:
            return pickle.load(p)

    def __prepare_arff(self, tid):
        """
        Convert C{<tid>.arff} (title, body, class) into the sparse word
        vector file C{<tid>-wordvec.arff}, with one numeric attribute per
        word of the saved title and body bags.

        @param tid: topic id of the arff file to convert.
        @type tid: C{int}
        """
        bag_title = self.__load_bag_of_words(0)
        bag_body = self.__load_bag_of_words(1)

        data = {'attributes': [], 'data': [], 'description': u'',
                'relation': tid}
        for word in bag_title:
            data['attributes'].append(("title-%s" % word, 'NUMERIC'))
        for word in bag_body:
            data['attributes'].append(("body-%s" % word, 'NUMERIC'))
        data['attributes'].append(('class', ['yes', 'no']))

        with open("%s%d.arff" % (paths['weka.training_arff_dir'], tid),
                  'r') as arff_file:
            f = arff.load(arff_file)
        for record in f['data']:
            record_bag_title = self.txtpro.simpletextprocess(0, record[0])
            record_bag_body = self.txtpro.simpletextprocess(0, record[1])
            record_data = []
            # iterate through original bag, figure out freq in this
            # record's bag
            for word in bag_title:
                if word in record_bag_title:
                    record_data.append(record_bag_title[word])
                else:
                    record_data.append(0)
            for word in bag_body:
                if word in record_bag_body:
                    record_data.append(record_bag_body[word])
                else:
                    record_data.append(0)
            record_data.append(record[2])
            data['data'].append(record_data)

        with open("%s%d-wordvec-nonsparse.arff" % \
                  (paths['weka.training_arff_dir'], tid), 'w') as fnew:
            arff.dump(fnew, data)

        # convert to sparse format
        Popen(("java -cp %s weka.filters.unsupervised.instance.NonSparseToSparse " +
               "-i %s%d-wordvec-nonsparse.arff -o %s%d-wordvec.arff") % \
              (paths['weka.weka_jar'],
               paths['weka.training_arff_dir'], tid,
               paths['weka.training_arff_dir'], tid),
              shell=True).communicate()

        remove("%s%d-wordvec-nonsparse.arff" % \
               (paths['weka.training_arff_dir'], tid))

    # 1. load unprocessed arff files, from just one tid, from
    #    family_resemblance export
    # 2. gather all titles, parse into a bag of words
    # 3. save bag of words (list? need to keep the order) in a pickle file
    # 4. write new sparse arff files for each tid using this sorted bag of
    #    words

    def __get_tids(self):
        """
        Return the topic ids that have a C{<tid>.arff} file in the training
        directory, excluding 0 (the testing file), in directory order.

        @rtype: C{list} of C{int}
        """
        tids = []
        for f in listdir(paths['weka.training_arff_dir']):
            m = self._TID_FILE_RE.match(f)
            if m and m.group(1) != '0':
                tids.append(int(m.group(1)))
        return tids

    def __prepare_training_arffs(self, tids):
        """Save the bags of words and write the word vector arff per tid."""
        # all tid arffs have same entries, so use the first to grab the bag
        # of words
        print("Saving bag of words...")
        self.__save_bag_of_words(tids[0], 0)
        self.__save_bag_of_words(tids[0], 1)

        for tid in sorted(tids):
            print("Preparing tid %d" % tid)
            self.__prepare_arff(tid)

    def __spread_subsample(self, tid):
        """Write C{<tid>-wordvec-subsample.arff} using SpreadSubsample."""
        print("Spread subsampling for tid %d" % tid)
        Popen(("java -cp %s weka.filters.supervised.instance.SpreadSubsample " +
               "-M 1.0 -X 0.0 -S 1 -c last " +
               "-i %s%d-wordvec.arff -o %s%d-wordvec-subsample.arff") % \
              (paths['weka.weka_jar'],
               paths['weka.training_arff_dir'], tid,
               paths['weka.training_arff_dir'], tid),
              shell=True).communicate()

    def train(self):
        """
        Train the configured classifier for every tid and print Weka's
        output for each model.
        """
        tids = self.__get_tids()
        self.__prepare_training_arffs(tids)

        for tid in sorted(tids):
            self.__spread_subsample(tid)

            print("Training random forests for tid %d" % tid)
            # Capture stdout so the report can be printed below.
            (out, _) = Popen(("java -cp %s %s %s -v " +
                              "-t %s%d-wordvec-subsample.arff -d %s%d.model") % \
                             (paths['weka.weka_jar'],
                              config['weka.classifier'],
                              config['weka.classifier_params'],
                              paths['weka.training_arff_dir'], tid,
                              paths['weka.training_arff_dir'], tid),
                             shell=True, stdout=PIPE).communicate()
            print(out)

    def train_experiment(self):
        """
        Train several classifier types for every tid and write the
        percentage of correctly classified instances of each to
        C{training_experiment.csv}.
        """
        model_scores = {}
        models = {'random-forest': ('weka.classifiers.trees.RandomForest', '-I 20 -K 0'),
                  'naive-bayes': ('weka.classifiers.bayes.NaiveBayes', ''),
                  'bayesnet': ('weka.classifiers.bayes.BayesNet', ''),
                  'j48': ('weka.classifiers.trees.J48', ''),
                  'knn': ('weka.classifiers.lazy.IBk', '-K 3')}

        tids = self.__get_tids()
        self.__prepare_training_arffs(tids)

        for tid in sorted(tids):
            self.__spread_subsample(tid)

        for tid in sorted(tids):
            model_scores[tid] = {}
            for model in models.keys():
                print("Training %s for tid %d" % (models[model][0], tid))
                (out, _) = Popen(("java -cp %s %s %s -v " +
                                  "-t %s%d-wordvec-subsample.arff -d %s%d.model") % \
                                 (paths['weka.weka_jar'],
                                  models[model][0], models[model][1],
                                  paths['weka.training_arff_dir'], tid,
                                  paths['weka.training_arff_dir'], tid),
                                 shell=True, stdout=PIPE).communicate()

                # The first matching line of the report is used; the score
                # stays 0.0 when no such line is found.
                correct = 0.0
                for line in out.splitlines():
                    m = re.search(r'Correctly Classified Instances\s+\d+\s+(.*) %', line)
                    if m:
                        correct = float(m.group(1))
                        break

                model_scores[tid][model] = correct

        with open('training_experiment.csv', 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['model', 'tid', 'correct'])
            for tid in model_scores.keys():
                for model in model_scores[tid].keys():
                    writer.writerow([model, tid, model_scores[tid][model]])

    def __predict_arff(self):
        """
        Run every trained model on C{0.arff}.

        @return: mapping of tid=>list of C{(answer, confidence)} tuples, one
            per parsed prediction line; an answer is only C{True} for a
            'yes' prediction with confidence of at least 0.75.
        @rtype: C{dict}
        """
        tids = self.__get_tids()

        # the testing file should always be 0.arff
        self.__prepare_arff(0)

        predictions = {}
        for tid in sorted(tids):
            predictions[tid] = []

            print("Predicting tid %d" % tid)
            (out, err) = Popen(("java -cp %s %s " +
                                "-T %s0-wordvec.arff -l %s%d.model -p last") % \
                               (paths['weka.weka_jar'],
                                config['weka.classifier'],
                                paths['weka.training_arff_dir'],
                                paths['weka.training_arff_dir'], tid),
                               shell=True, stdout=PIPE).communicate()
            for line in out.splitlines():
                m = re.search(r'2:no\s+[12]:(no|yes)\s+\+?\s+(\d+\.?\d*)', line)
                if m:
                    answer = (m.group(1) == 'yes')
                    conf = float(m.group(2))
                    # low-confidence predictions count as 'no'
                    if conf < 0.75:
                        answer = False
                    predictions[tid].append((answer, conf))
        return predictions

    def predict(self, articles):
        """
        Classify the given articles and store the predicted topic ids (as
        strings) in each article's C{'categories'} list.

        @param articles: mapping of urlid=>article; each article needs
            C{'title'} and C{'summary'}. The dict is modified in place.
        @type articles: C{dict}
        """
        # modifies the provided articles dict
        urlids = sorted(articles.keys())

        data = {'attributes': [('title', 'STRING'),
                               ('body', 'STRING'),
                               ('class', ['yes', 'no'])],
                'data': [], 'description': u'', 'relation': '0'}

        for urlid in urlids:
            title = re.sub(r'\W', ' ', articles[urlid]['title'])
            body = re.sub(r'\W', ' ', articles[urlid]['summary'])
            data['data'].append([title, body, 'no'])

        # make the testing file 0.arff
        with open("%s0.arff" % paths['weka.training_arff_dir'], 'w') as fnew:
            arff.dump(fnew, data)

        predictions = self.__predict_arff()

        for urlid in urlids:
            articles[urlid]['categories'] = []

        tids = self.__get_tids()
        for tid in sorted(tids):
            # predictions are in the same (sorted urlid) order as the rows
            # written to 0.arff
            for (i, urlid) in enumerate(urlids):
                if predictions[tid][i][0]:
                    articles[urlid]['categories'].append(str(tid))
class AINewsPublisher(): def __init__(self): self.debug = config['ainews.debug'] self.today = date.today() self.earliest_date = self.today - timedelta(days = int(config['ainews.period'])) self.db = AINewsDB() self.corpus = AINewsCorpus() self.duplicates = AINewsDuplicates() self.svm_classifier = AINewsSVMClassifier() self.txtpro = AINewsTextProcessor() self.summarizer = AINewsSummarizer() self.articles = {} self.publishable_articles = [] self.semiauto_email_output = "" self.topicids = {"AIOverview":0, "Agents":1, "Applications":2, "CognitiveScience":3, "Education":4,"Ethics":5, "Games":6, "History":7, "Interfaces":8, "MachineLearning":9, "NaturalLanguage":10, "Philosophy":11, "Reasoning":12, "Representation":13, "Robots":14, "ScienceFiction":15,"Speech":16, "Systems":17, "Vision":18} def filter_and_process(self): self.articles = self.corpus.get_unprocessed() if len(self.articles) == 0: return # assume every article will be published; may be set to False from one # of the filtering processes below for urlid in self.articles: self.articles[urlid]['publish'] = True self.articles[urlid]['transcript'] = [] # filter by date for urlid in self.articles: if self.articles[urlid]['pubdate'] == None: # give a meaningful pubdate so that other code doesn't crash self.articles[urlid]['pubdate'] = self.today self.articles[urlid]['publish'] = False self.articles[urlid]['transcript'].append("Rejected due to bogus publication date.") elif self.articles[urlid]['pubdate'] < self.earliest_date: self.articles[urlid]['publish'] = False self.articles[urlid]['transcript'].append( ("Rejected because article is too old " + "(earliest valid date is %s while article was " + "published on %s") % (self.earliest_date.strftime('%F'), self.articles[urlid]['pubdate'].strftime('%F'))) # filter by blacklist (for urls) for urlid in self.articles: for black in blacklist_urls: if re.search(black, self.articles[urlid]['url']): self.articles[urlid]['publish'] = False 
self.articles[urlid]['transcript'].append( ("Rejected because url matched blacklisted url %s" % black)) break # filter by whitelist for urlid in self.articles: white_wordfreq = self.txtpro.whiteprocess(urlid, self.articles[urlid]['content']) self.articles[urlid]['white_wordfreq'] = white_wordfreq # require at least two different whitelisted terms # unless the article is user-submitted if len(white_wordfreq) < 2 \ and self.articles[urlid]['publisher'] != 'UserSubmitted': self.articles[urlid]['publish'] = False self.articles[urlid]['transcript'].append( 'Rejected due to only one or no whitelisted terms') # update categories based on SVM classifier predictions self.svm_classifier.predict(self.articles) # drop articles classified as 'NotRelated' unless the article # is user-submitted for urlid in self.articles: if 'NotRelated' in self.articles[urlid]['categories'] \ and self.articles[urlid]['publisher'] != 'UserSubmitted': self.articles[urlid]['publish'] = False self.articles[urlid]['transcript'].append( 'Rejected due to NotRelated classification') # drop articles with no categories (even if user-submitted) for urlid in self.articles: if len(self.articles[urlid]['categories']) == 0: self.articles[urlid]['publish'] = False self.articles[urlid]['transcript'].append( 'Rejected due to no selected categories') # filter out duplicates; some articles may have 'publish' set to False # by this function self.duplicates.filter_duplicates(self.articles) # add article summaries self.summarizer.summarize(self.corpus, self.articles) for urlid in self.articles: try: print urlid, self.articles[urlid]['publish'], \ self.articles[urlid]['title'], \ self.articles[urlid]['categories'], \ self.articles[urlid]['summary'] print except: pass for urlid in self.articles: # update article in database self.update_db(self.articles[urlid]) # mark each as processed self.corpus.mark_processed(self.articles.itervalues()) # save sorted list of articles to be read by AINewsPublisher; sort by # duplicate 
count (more = better), then relevance of source, # then by number of categories (more = better) unpublished_articles = sorted( filter(lambda x: x['publish'], self.articles.values()), cmp=lambda x,y: self.corpus.compare_articles(x, y), reverse = True) max_cat_count = int(config['publisher.max_cat_count']) max_count = int(config['publisher.max_count']) cat_counts = {} for cat in self.corpus.categories: cat_counts[cat] = 0 # choose stories such that no category has more than max_cat_count # members and no more than max_count stories have been selected # (independent of category); only one of the article's categories needs # to have "free space" self.publishable_articles = [] for article in unpublished_articles: if len(self.publishable_articles) == max_count: break free_cat = False for cat in article['categories']: if cat_counts[cat] < max_cat_count: free_cat = True break # if there is a free category or this article has only the # Applications category, then it can be published if free_cat or (article['categories'] == ['Applications']): self.publishable_articles.append(article) self.articles[article['urlid']]['transcript'].append('Published') self.articles[article['urlid']]['published'] = True for cat in article['categories']: cat_counts[cat] += 1 # record that these articles are publishable self.corpus.mark_publishable(self.publishable_articles) def update_db(self, article): self.db.execute("delete from categories where urlid = %s", article['urlid']) for cat in article['categories']: self.db.execute("insert into categories values (%s,%s)", (article['urlid'], cat)) self.db.execute("update urllist set summary = %s where urlid = %s", (article['summary'], article['urlid'])) def get_publishable_articles(self): publishable = self.corpus.get_publishable() self.publishable_articles = [] # drop "Applications" category if article has more categories for article in publishable: if len(article['categories']) > 1: article['categories'] = filter(lambda c: c != "Applications", 
article['categories']) self.publishable_articles.append(article) def mark_published(self): self.corpus.mark_published(self.publishable_articles) def generate_standard_output(self): """ Generate the stanard output for debuging on screen. """ txt = LatestNewsTxt() txt.news = self.publishable_articles savefile(paths['ainews.output'] + "std_output.txt", str(txt)) def generate_email_output(self): """ Generate the output for email format. """ email = LatestNewsEmail() email.date = self.today.strftime("%B %d, %Y") email.year = self.today.strftime("%Y") email.news = self.publishable_articles email.aitopic_urls = aitopic_urls email.topicids = self.topicids email_output = str(email) savefile(paths['ainews.output'] + "email_output.txt", email_output) self.semiauto_email_output = email_output def generate_pmwiki_all_output(self): pmwiki_all = AllNewsPmWiki() pmwiki_all.date = self.today.strftime("%B %d, %Y") pmwiki_all.year = self.today.strftime("%Y") pmwiki_all.news = self.articles.values() savefile(paths['ainews.output'] + "pmwiki_all.txt", str(pmwiki_all)) # Generate wiki metadata page for each article urlids_output = "" for urlid in self.articles: urlids_output += str(urlid) + '\n' article_wiki = ArticlePmWiki() article_wiki.year = self.today.strftime("%Y") article_wiki.dupthreshold = float(config['duplicates.threshold']) article_wiki.n = self.articles[urlid] savefile(paths['ainews.output'] + "aiarticles/%d" % urlid, str(article_wiki)) savefile(paths['ainews.output'] + "urlids_output.txt", urlids_output) def generate_pmwiki_published_output(self): """ Genereate the output with PmWiki page format. It needs to be further processed by AINewsPmwiki.php. 
""" pmwiki = LatestNewsPmWiki() pmwiki.date = self.today.strftime("%B %d, %Y") pmwiki.year = self.today.strftime("%Y") pmwiki.news = self.publishable_articles pmwiki.rater = True savefile(paths['ainews.output'] + "pmwiki_output.txt", str(pmwiki)) pmwiki.rater = False savefile(paths['ainews.output'] + "pmwiki_output_norater.txt", str(pmwiki)) def publish_email(self): """ Call AINewsEmail.php to send email through PHP Mail Server """ #cmd = 'php AINewsEmail.php' #Popen(cmd, shell = True, stdout = PIPE, stderr = STDOUT).communicate() self.publish_email_semiauto() def publish_email_semiauto(self): """ Create an AINewsSemiAutoEmail.html file for admin to click and semi-auto send it to the subscriber list. """ semiauto = """ <html> <body> <h1>AI Alert - SemiAuto Sender</h1> <form action="http://aaai.org/cgi-dada/mail.cgi?flavor=send_email" method='post'> <!-- <form action="welcome.php" method="post"> --> <input type='hidden' name='f' value='send_email' /> <input type='hidden' name='process' value='true' /> <input type='hidden' name='admin_list' value='alert' /> <input type='hidden' name='message_subject' value="%s" /> <input type='hidden' name='email_format' value='HTML' /> <textarea type='hidden' name="text_message_body">%s</textarea> <input type='submit' value='Submit Mailing List Message' /> </form> <h2>Please review the email below. If there are concerns, contact Bruce or Reid:</h2> <p> %s </p> </body> </html> """ % ("AI Alert - "+str(self.today.strftime("%B %d, %Y")), self.semiauto_email_output, self.semiauto_email_output) savefile(paths['ainews.html'] + "semiauto_email.html", semiauto) def publish_pmwiki(self): """ Call AINewsPmwiki.php to publish latest news to AAAI Pmwiki website. 
""" cmd = 'php AINewsPmwiki.php' Popen(cmd, shell = True).wait() def update_rss(self): rssitems = [] # insert latest news into rssitems for article in self.publishable_articles: rssitems.append(PyRSS2Gen.RSSItem( title = article['title'], link = article['url'], description = article['summary'], guid = PyRSS2Gen.Guid(article['url']), pubDate = datetime(article['pubdate'].year, \ article['pubdate'].month, article['pubdate'].day))) rssfile = paths['ainews.rss'] + "news.xml" publish_rss(rssfile, rssitems) topicrsses = ['overview', 'agent', 'apps', 'cogsci', 'edu', 'ethsoc', 'game', 'hist', 'interf', 'ml', 'nlp', 'phil', 'reason', 'rep', 'robot', 'scifi', 'speech', 'systems', 'vision'] topicitems = [] for i in range(len(topicrsses)): topicitems.append([]) urlset = set() for article in self.publishable_articles: if article['url'] in urlset: continue urlset.add(article['url']) for cat in article['categories']: topicid = self.topicids[cat] topicitems[topicid].append(PyRSS2Gen.RSSItem( title = article['title'], link = article['url'], description = article['summary'], guid = PyRSS2Gen.Guid(article['url']), pubDate = datetime(article['pubdate'].year, \ article['pubdate'].month, article['pubdate'].day))) for i in range(len(topicrsses)): rssfile = paths['ainews.rss'] + topicrsses[i]+'.xml' if len(topicitems[i]) != 0: publish_rss(rssfile, topicitems[i])
class AINewsPublisher(): def __init__(self): self.debug = config['ainews.debug'] self.today = date.today() self.earliest_date = self.today - timedelta( days=int(config['ainews.period'])) self.db = AINewsDB() self.corpus = AINewsCorpus() self.duplicates = AINewsDuplicates() self.txtpro = AINewsTextProcessor() self.weka = AINewsWekaClassifier() self.articles = {} self.semiauto_email_output = "" def filter_and_process(self): self.articles = self.corpus.get_unprocessed() if len(self.articles) == 0: return # assume every article will be published; may be set to False from one # of the filtering processes below for urlid in self.articles: self.articles[urlid]['publish'] = True self.articles[urlid]['transcript'] = [] # filter by date print "Filtering by date..." for urlid in self.articles: if self.articles[urlid]['pubdate'] == None: # give a meaningful pubdate so that other code doesn't crash self.articles[urlid]['pubdate'] = self.today self.articles[urlid]['publish'] = False self.articles[urlid]['transcript'].append( "Rejected due to bogus publication date.") elif self.articles[urlid]['pubdate'] < self.earliest_date: self.articles[urlid]['publish'] = False self.articles[urlid]['transcript'].append( ("Rejected because article is too old " + "(earliest valid date is %s while article was " + "published on %s") % (self.earliest_date.strftime('%F'), self.articles[urlid]['pubdate'].strftime('%F'))) # filter by blacklist (for urls) print "Filtering by blacklist..." for urlid in self.articles: for black in blacklist_urls: if re.search(black, self.articles[urlid]['url']): self.articles[urlid]['publish'] = False self.articles[urlid]['transcript'].append( ("Rejected because url matched blacklisted url %s" % black)) break # filter by whitelist print "Filtering by whitelist..." 
for urlid in self.articles: white_wordfreq = self.txtpro.whiteprocess( urlid, self.articles[urlid]['content']) self.articles[urlid]['white_wordfreq'] = white_wordfreq # require at least two different whitelisted terms # unless the article is user-submitted if len(white_wordfreq) < 2 \ and self.articles[urlid]['source'] != 'User Submitted': self.articles[urlid]['publish'] = False self.articles[urlid]['transcript'].append( 'Rejected due to only one or no whitelisted terms') # update categories based on classifier predictions print "Classifying..." self.weka.predict(self.articles) # drop articles with no categories print "Dropping articles with no categories..." for urlid in self.articles: if len(self.articles[urlid]['categories']) == 0: self.articles[urlid]['publish'] = False self.articles[urlid]['transcript'].append( 'Rejected due to no selected categories') # filter out duplicates; some articles may have 'publish' set to False # by this function print "Filtering duplicates..." self.duplicates.filter_duplicates(self.articles) for urlid in self.articles: print urlid, self.articles[urlid]['publish'], \ self.articles[urlid]['title'], \ self.articles[urlid]['categories'], \ self.articles[urlid]['summary'] print print "Grabbing images..." for urlid in self.articles: # grab and convert article image (if it exists) self.grab_convert_image(self.articles[urlid]) # update article in database self.update_db(self.articles[urlid]) # mark each as processed print "Marking as processed." 
self.corpus.mark_processed(self.articles.itervalues()) def grab_convert_image(self, article): if len(article['image_url']) == 0: article['image_path'] = '' return try: f = urllib2.urlopen(article['image_url']) img = open( "%s%s" % (paths['ainews.image_dir'], str(article['urlid'])), 'w') img.write(f.read()) img.close() # produces [urlid].jpg Popen("%s -format jpg -gravity Center -thumbnail 200x200 %s%s" % \ (paths['imagemagick.mogrify'], paths['ainews.image_dir'], str(article['urlid'])), shell = True).communicate() # remove [urlid] file (with no extension) remove("%s%s" % (paths['ainews.image_dir'], str(article['urlid']))) article[ 'image_path'] = "public://newsfinder_images/%s.jpg" % article[ 'urlid'] except Exception as e: print "Failed converting image for %d: %s" % (article['urlid'], e) article['image_path'] = '' def update_db(self, article): self.db.execute("delete from categories where urlid = %s", article['urlid']) for cat in article['categories']: self.db.execute("insert into categories values (%s,%s)", (article['urlid'], cat)) def generate_feed_import(self): """ Generate XML file for feed import on the Drupal site. 
""" xml = FeedImport() for article in self.articles.values(): article['source'] = re.sub(r'&', '&', article['source']) xml.news = self.articles.values() savefile(paths['ainews.output_xml'] + "news.xml", str(xml)) def generate_email_output(self): articles = [] try: f = urllib2.urlopen(paths['ainews.top_weekly_news_xml']) xml = etree.parse(f) for node in xml.iter("node"): print "Found", node.findtext("Title") published = node.findtext("Publication_date") articles.append({ 'title': node.findtext("Title"), 'source': node.findtext("Source"), 'topics': re.sub(r'/topic/', 'http://aitopics.org/topic/', node.findtext("Topics")), 'pubdate': date(int(published[0:4]), int(published[5:7]), int(published[8:10])), 'summary': re.sub( r'</p>(</blockquote>)?$', '', re.sub(r'^(<blockquote>)?<p>', '', node.findtext("Body"))), 'url': node.findtext("Original_link"), 'link': re.sub(r'/news/', 'http://aitopics.org/news/', node.findtext("Link")), 'image': re.sub( r'<img', '<img align="left" style="margin: 8px 8px 8px 0; border: 1px solid #ccc; padding: 5px; background: white;" ', node.findtext("Representative_image")) }) except Exception, e: print e email = LatestNewsEmail() email.date = self.today.strftime("%B %d, %Y") email.year = self.today.strftime("%Y") email.articles = articles email_output = str(email) return email_output
def __init__(self):
    """Create the AINewsTextProcessor instance and keep it on the object."""
    self.txtpro = AINewsTextProcessor()
class AINewsWekaClassifier: def __init__(self): self.txtpro = AINewsTextProcessor() def __save_bag_of_words(self, tid, fieldidx): # find all unique words in the arff 'title' field, remove stop # words, perform stemming, collect their frequencies phrases = [] f = arff.load( open("%s%d.arff" % (paths['weka.training_arff_dir'], tid), 'r')) for record in f['data']: phrases.append(record[fieldidx]) bag = self.txtpro.simpletextprocess(0, ' '.join(phrases)) smallerbag = FreqDist() i = 0 for word in bag: if i == 1000: break smallerbag[word] = bag[word] i += 1 p = open( "%sbag_of_words-%d.pickle" % (paths['weka.bag_of_words_dir'], fieldidx), 'w') pickle.dump(smallerbag, p) p.close() def __prepare_arff(self, tid): p = open("%sbag_of_words-0.pickle" % paths['weka.bag_of_words_dir'], 'r') bag_title = pickle.load(p) p.close() p = open("%sbag_of_words-1.pickle" % paths['weka.bag_of_words_dir'], 'r') bag_body = pickle.load(p) p.close() data = { 'attributes': [], 'data': [], 'description': u'', 'relation': tid } for word in bag_title: data['attributes'].append(("title-%s" % word, 'NUMERIC')) for word in bag_body: data['attributes'].append(("body-%s" % word, 'NUMERIC')) data['attributes'].append(('class', ['yes', 'no'])) f = arff.load( open("%s%d.arff" % (paths['weka.training_arff_dir'], tid), 'r')) for record in f['data']: record_bag_title = self.txtpro.simpletextprocess(0, record[0]) record_bag_body = self.txtpro.simpletextprocess(0, record[1]) record_data = [] # iterate through original bag, figure out freq in this record's bag for word in bag_title: if word in record_bag_title: record_data.append(record_bag_title[word]) else: record_data.append(0) for word in bag_body: if word in record_bag_body: record_data.append(record_bag_body[word]) else: record_data.append(0) record_data.append(record[2]) data['data'].append(record_data) fnew = open("%s%d-wordvec-nonsparse.arff" % \ (paths['weka.training_arff_dir'], tid), 'w') arff.dump(fnew, data) fnew.close() # convert to sparse format 
Popen(("java -cp %s weka.filters.unsupervised.instance.NonSparseToSparse " + "-i %s%d-wordvec-nonsparse.arff -o %s%d-wordvec.arff") % \ (paths['weka.weka_jar'], paths['weka.training_arff_dir'], tid, paths['weka.training_arff_dir'], tid), shell = True).communicate() remove("%s%d-wordvec-nonsparse.arff" % (paths['weka.training_arff_dir'], tid)) # 1. load unprocessed arff files, from just one tid, from family_resemblance export # 2. gather all titles, parse into a bag of words # 3. save bag of words (list? need to keep the order) in a pickle file # 4. write new sparse arff files for each tid using this sorted bag of words def __get_tids(self): tids = [] files = listdir(paths['weka.training_arff_dir']) for f in files: m = re.match(r'^(\d+).arff$', f) if m: if m.group(1) == '0': continue tids.append(int(m.group(1))) return tids def train(self): tids = self.__get_tids() # all tid arffs have same entries, so use the first to grab the bag of words print "Saving bag of words..." self.__save_bag_of_words(tids[0], 0) self.__save_bag_of_words(tids[0], 1) for tid in sorted(tids): print "Preparing tid %d" % tid self.__prepare_arff(tid) for tid in sorted(tids): print "Spread subsampling for tid %d" % tid Popen(("java -cp %s weka.filters.supervised.instance.SpreadSubsample " + "-M 1.0 -X 0.0 -S 1 -c last " + "-i %s%d-wordvec.arff -o %s%d-wordvec-subsample.arff") % \ (paths['weka.weka_jar'], paths['weka.training_arff_dir'], tid, paths['weka.training_arff_dir'], tid), shell = True).communicate() print "Training random forests for tid %d" % tid Popen(("java -cp %s %s %s -v " + "-t %s%d-wordvec-subsample.arff -d %s%d.model") % \ (paths['weka.weka_jar'], config['weka.classifier'], config['weka.classifier_params'], paths['weka.training_arff_dir'], tid, paths['weka.training_arff_dir'], tid), shell = True, stdout = PIPE).communicate() print out def train_experiment(self): model_scores = {} models = { 'random-forest': ('weka.classifiers.trees.RandomForest', '-I 20 -K 0'), 'naive-bayes': 
('weka.classifiers.bayes.NaiveBayes', ''), 'bayesnet': ('weka.classifiers.bayes.BayesNet', ''), 'j48': ('weka.classifiers.trees.J48', ''), 'knn': ('weka.classifiers.lazy.IBk', '-K 3') } tids = self.__get_tids() # all tid arffs have same entries, so use the first to grab the bag of words print "Saving bag of words..." self.__save_bag_of_words(tids[0], 0) self.__save_bag_of_words(tids[0], 1) for tid in sorted(tids): print "Preparing tid %d" % tid self.__prepare_arff(tid) for tid in sorted(tids): print "Spread subsampling for tid %d" % tid Popen(("java -cp %s weka.filters.supervised.instance.SpreadSubsample " + "-M 1.0 -X 0.0 -S 1 -c last " + "-i %s%d-wordvec.arff -o %s%d-wordvec-subsample.arff") % \ (paths['weka.weka_jar'], paths['weka.training_arff_dir'], tid, paths['weka.training_arff_dir'], tid), shell = True).communicate() for tid in sorted(tids): model_scores[tid] = {} for model in models.keys(): print "Training %s for tid %d" % (models[model][0], tid) (out, _) = Popen(("java -cp %s %s %s -v " + "-t %s%d-wordvec-subsample.arff -d %s%d.model") % \ (paths['weka.weka_jar'], models[model][0], models[model][1], paths['weka.training_arff_dir'], tid, paths['weka.training_arff_dir'], tid), shell = True, stdout = PIPE).communicate() correct = 0.0 for line in out.splitlines(): m = re.search( r'Correctly Classified Instances\s+\d+\s+(.*) %', line) if m: correct = float(m.group(1)) break model_scores[tid][model] = correct with open('training_experiment.csv', 'w') as csvfile: writer = csv.writer(csvfile) writer.writerow(['model', 'tid', 'correct']) for tid in model_scores.keys(): for model in model_scores[tid].keys(): writer.writerow([model, tid, model_scores[tid][model]]) def __predict_arff(self): tids = self.__get_tids() # the testing file should always be 0.arff self.__prepare_arff(0) predictions = {} for tid in sorted(tids): predictions[tid] = [] print "Predicting tid %d" % tid (out, err) = Popen(("java -cp %s %s " + "-T %s0-wordvec.arff -l %s%d.model -p last") % \ 
(paths['weka.weka_jar'], config['weka.classifier'], paths['weka.training_arff_dir'], paths['weka.training_arff_dir'], tid), shell = True, stdout = PIPE).communicate() for line in out.splitlines(): m = re.search(r'2:no\s+[12]:(no|yes)\s+\+?\s+(\d+\.?\d*)', line) if m: answer = False if m.group(1) == 'yes': answer = True conf = float(m.group(2)) if conf < 0.75: answer = False predictions[tid].append((answer, conf)) return predictions def predict(self, articles): # modifies the provided articles dict data = { 'attributes': [('title', 'STRING'), ('body', 'STRING'), ('class', ['yes', 'no'])], 'data': [], 'description': u'', 'relation': '0' } for urlid in sorted(articles.keys()): title = re.sub(r'\W', ' ', articles[urlid]['title']) body = re.sub(r'\W', ' ', articles[urlid]['summary']) data['data'].append([title, body, 'no']) # make the testing file 0.arff fnew = open("%s0.arff" % paths['weka.training_arff_dir'], 'w') arff.dump(fnew, data) fnew.close() predictions = self.__predict_arff() for urlid in sorted(articles.keys()): articles[urlid]['categories'] = [] tids = self.__get_tids() for tid in sorted(tids): for (i, urlid) in enumerate(sorted(articles.keys())): if predictions[tid][i][0]: articles[urlid]['categories'].append(str(tid))
class AINewsPublisher: def __init__(self): self.debug = config["ainews.debug"] self.today = date.today() self.earliest_date = self.today - timedelta(days=int(config["ainews.period"])) self.db = AINewsDB() self.corpus = AINewsCorpus() self.duplicates = AINewsDuplicates() self.txtpro = AINewsTextProcessor() self.weka = AINewsWekaClassifier() self.articles = {} self.semiauto_email_output = "" def filter_and_process(self): self.articles = self.corpus.get_unprocessed() if len(self.articles) == 0: return # assume every article will be published; may be set to False from one # of the filtering processes below for urlid in self.articles: self.articles[urlid]["publish"] = True self.articles[urlid]["transcript"] = [] # filter by date print "Filtering by date..." for urlid in self.articles: if self.articles[urlid]["pubdate"] == None: # give a meaningful pubdate so that other code doesn't crash self.articles[urlid]["pubdate"] = self.today self.articles[urlid]["publish"] = False self.articles[urlid]["transcript"].append("Rejected due to bogus publication date.") elif self.articles[urlid]["pubdate"] < self.earliest_date: self.articles[urlid]["publish"] = False self.articles[urlid]["transcript"].append( ( "Rejected because article is too old " + "(earliest valid date is %s while article was " + "published on %s" ) % (self.earliest_date.strftime("%F"), self.articles[urlid]["pubdate"].strftime("%F")) ) # filter by blacklist (for urls) print "Filtering by blacklist..." for urlid in self.articles: for black in blacklist_urls: if re.search(black, self.articles[urlid]["url"]): self.articles[urlid]["publish"] = False self.articles[urlid]["transcript"].append( ("Rejected because url matched blacklisted url %s" % black) ) break # filter by whitelist print "Filtering by whitelist..." 
for urlid in self.articles: white_wordfreq = self.txtpro.whiteprocess(urlid, self.articles[urlid]["content"]) self.articles[urlid]["white_wordfreq"] = white_wordfreq # require at least two different whitelisted terms # unless the article is user-submitted if len(white_wordfreq) < 2 and self.articles[urlid]["source"] != "User Submitted": self.articles[urlid]["publish"] = False self.articles[urlid]["transcript"].append("Rejected due to only one or no whitelisted terms") # update categories based on classifier predictions print "Classifying..." self.weka.predict(self.articles) # drop articles with no categories print "Dropping articles with no categories..." for urlid in self.articles: if len(self.articles[urlid]["categories"]) == 0: self.articles[urlid]["publish"] = False self.articles[urlid]["transcript"].append("Rejected due to no selected categories") # filter out duplicates; some articles may have 'publish' set to False # by this function print "Filtering duplicates..." self.duplicates.filter_duplicates(self.articles) for urlid in self.articles: print urlid, self.articles[urlid]["publish"], self.articles[urlid]["title"], self.articles[urlid][ "categories" ], self.articles[urlid]["summary"] print print "Grabbing images..." for urlid in self.articles: # grab and convert article image (if it exists) self.grab_convert_image(self.articles[urlid]) # update article in database self.update_db(self.articles[urlid]) # mark each as processed print "Marking as processed." 
self.corpus.mark_processed(self.articles.itervalues()) def grab_convert_image(self, article): if len(article["image_url"]) == 0: article["image_path"] = "" return try: f = urllib2.urlopen(article["image_url"]) img = open("%s%s" % (paths["ainews.image_dir"], str(article["urlid"])), "w") img.write(f.read()) img.close() # produces [urlid].jpg Popen( "%s -format jpg -gravity Center -thumbnail 200x200 %s%s" % (paths["imagemagick.mogrify"], paths["ainews.image_dir"], str(article["urlid"])), shell=True, ).communicate() # remove [urlid] file (with no extension) remove("%s%s" % (paths["ainews.image_dir"], str(article["urlid"]))) article["image_path"] = "public://newsfinder_images/%s.jpg" % article["urlid"] except Exception as e: print "Failed converting image for %d: %s" % (article["urlid"], e) article["image_path"] = "" def update_db(self, article): self.db.execute("delete from categories where urlid = %s", article["urlid"]) for cat in article["categories"]: self.db.execute("insert into categories values (%s,%s)", (article["urlid"], cat)) def generate_feed_import(self): """ Generate XML file for feed import on the Drupal site. 
""" xml = FeedImport() for article in self.articles.values(): article["source"] = re.sub(r"&", "&", article["source"]) xml.news = self.articles.values() savefile(paths["ainews.output_xml"] + "news.xml", str(xml)) def generate_email_output(self): articles = [] try: f = urllib2.urlopen(paths["ainews.top_weekly_news_xml"]) xml = etree.parse(f) for node in xml.iter("node"): print "Found", node.findtext("Title") published = node.findtext("Publication_date") articles.append( { "title": node.findtext("Title"), "source": node.findtext("Source"), "topics": re.sub(r"/topic/", "http://aitopics.org/topic/", node.findtext("Topics")), "pubdate": date(int(published[0:4]), int(published[5:7]), int(published[8:10])), "summary": re.sub( r"</p>(</blockquote>)?$", "", re.sub(r"^(<blockquote>)?<p>", "", node.findtext("Body")) ), "url": node.findtext("Original_link"), "link": re.sub(r"/news/", "http://aitopics.org/news/", node.findtext("Link")), "image": re.sub( r"<img", '<img align="left" style="margin: 8px 8px 8px 0; border: 1px solid #ccc; padding: 5px; background: white;" ', node.findtext("Representative_image"), ), } ) except Exception, e: print e email = LatestNewsEmail() email.date = self.today.strftime("%B %d, %Y") email.year = self.today.strftime("%Y") email.articles = articles email_output = str(email) return email_output