def __init__(self):
    self.debug = config['ainews.debug']
    self.today = date.today()
    self.earliest_date = self.today - timedelta(days=int(config['ainews.period']))
    self.db = AINewsDB()
    self.corpus = AINewsCorpus()
    self.duplicates = AINewsDuplicates()
    self.txtpro = AINewsTextProcessor()
    self.weka = AINewsWekaClassifier()

    self.articles = {}
    self.semiauto_email_output = ""
def evaluate():
    corpus = AINewsCorpus()
    print "urlid,length truewords,length justext,length goose,ld justext,ld goose"
    for filename in sorted(glob.glob("../../experiments/justext/*.true")):
        truetext = ents.convert(file(filename).read())
        truetext = re.sub(r'[^\w\s]', ' ',
                          trunc(truetext, max_pos=3000, ellipsis=False))
        truewords = re.split(r'\s+', truetext)
        urlid = filename[26:30]
        article = corpus.get_article(urlid)
        if article == None: continue
        articletext = re.sub(r'[^\w\s]', ' ',
                             trunc((article['content_all']).encode('ascii'),
                                   max_pos=3000, ellipsis=False))
        articlewords = re.split(r'\s+', articletext)
        goosecmd = ("cd /home/josh/aitopics/AINews/tools/goose; "
                    "/opt/maven/bin/mvn exec:java "
                    "-Dexec.mainClass=com.jimplush.goose.TalkToMeGoose "
                    "-Dexec.args='%s' -q 2>>/home/josh/log.txt") % article['url']
        (stdout, _) = Popen(goosecmd, shell=True, stdout=PIPE).communicate()
        goosetext = ents.convert(stdout.encode('ascii'))
        goosetext = re.sub(r'[^\w\s]', ' ',
                           trunc(goosetext, max_pos=3000, ellipsis=False))
        goosewords = re.split(r'\s+', goosetext)
        ld_1 = levenshtein_distance(truewords, articlewords) / float(len(truewords))
        ld_2 = levenshtein_distance(truewords, goosewords) / float(len(truewords))
        print "%s,%d,%d,%d,%.4f,%.4f" % \
            (urlid, len(truewords), len(articlewords), len(goosewords), ld_1, ld_2)
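
# Note: levenshtein_distance above comes from a helper module that is not shown
# here. A minimal word-level edit-distance sketch like the following would be
# compatible with how evaluate() normalizes the distance by len(truewords);
# the function name word_levenshtein is illustrative only.
def word_levenshtein(a, b):
    """Dynamic-programming edit distance over two lists of words."""
    prev = range(len(b) + 1)
    for i in range(1, len(a) + 1):
        cur = [i] + [0] * len(b)
        for j in range(1, len(b) + 1):
            cost = 0 if a[i-1] == b[j-1] else 1
            cur[j] = min(prev[j] + 1,       # deletion
                         cur[j-1] + 1,      # insertion
                         prev[j-1] + cost)  # substitution
        prev = cur
    return prev[len(b)]

# e.g. word_levenshtein("the robot arm".split(), "a robot arm".split()) == 1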
def evaluate(self, ident, inputdir):
    corpus = AINewsCorpus()
    (articles, _) = corpus.load_corpus(ident, 1.0, True)
    for (urlid, _, _) in articles:
        article = corpus.get_article(urlid, True)
        try:
            os.mkdir("%s/gold/%s" % (inputdir, urlid))
        except:
            pass
        f = open("%s/gold/%s/%s.fulltext" % (inputdir, urlid, urlid), 'w')
        f.write(article['content'])
        f.write("\n")
        f.close()
        f = open("%s/system/ots/%s.ots.system" % (inputdir, urlid), 'w')
        f.write("\n".join(self.summarize_single_ots(article)))
        f.write("\n")
        f.close()
        f = open("%s/system/tfidf/%s.tfidf.system" % (inputdir, urlid), 'w')
        f.write("\n".join(self.summarize_article(corpus, article, 4, False)))
        f.write("\n")
        f.close()
        print "Saved %s." % urlid
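
# A hypothetical invocation of the evaluation above, assuming it lives on
# AINewsSummarizer (it calls self.summarize_single_ots / self.summarize_article).
# The ident "summarizer-eval" and the inputdir are illustrative values only; the
# gold/, system/ots/ and system/tfidf/ subdirectories must already exist, and the
# resulting layout is suitable for a ROUGE-style gold-vs-system comparison.
if __name__ == "__main__":
    summarizer = AINewsSummarizer()
    summarizer.evaluate("summarizer-eval", "../../experiments/summaries")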
class AINewsSVMClassifier:
    def __init__(self):
        self.corpus = AINewsCorpus()

    def predict(self, articles):
        urlids = sorted(articles.keys())
        for urlid in articles:
            articles[urlid]['categories'] = []

        # produce the test input file
        f = open(paths['svm.svm_data']+'predict', 'w')
        for urlid in urlids:
            for cat in self.corpus.categories:
                articles[urlid]['cat_probs'] = {}
            tfidf = self.corpus.get_tfidf(urlid, articles[urlid]['wordfreq'])
            f.write("+1 ")
            for wordid in sorted(tfidf.keys()):
                f.write("%s:%f " % (wordid, tfidf[wordid]))
            f.write("\n")
        f.close()

        # predict each category plus NotRelated
        for cat in self.corpus.categories:
            cmd = 'svm-scale -r "%s" "%s" > "%s"' % \
                (paths['svm.svm_data']+cat+'.range',
                 paths['svm.svm_data']+'predict',
                 paths['svm.svm_data']+'predict-'+cat+'.scaled')
            Popen(cmd, shell = True).wait()
            cmd = 'svm-predict -b 1 "%s" "%s" "%s" > /dev/null' % \
                (paths['svm.svm_data']+'predict-'+cat+'.scaled',
                 paths['svm.svm_data']+cat+'.model',
                 paths['svm.svm_data']+'predict-'+cat+'.output')
            Popen(cmd, shell = True).wait()
            f = open(paths['svm.svm_data']+'predict-'+cat+'.output', 'r')
            lines = f.readlines()
            f.close()
            # first line of output file says "labels -1 1" or whatever;
            # the order could be different, so we have to check
            labels = re.match('labels (-?1) (-?1)', lines[0]).group(1,2)
            if labels[0] == '1':
                pos_label = 0
            else:
                pos_label = 1
            for i in range(1, len(lines)):
                (prediction, prob1, prob2) = \
                    re.match('(-?1) (\d\.?\d*e?-?\d*) (\d\.?\d*e?-?\d*)',
                             lines[i]).group(1,2,3)
                if pos_label == 0:
                    prob_yes = prob1
                else:
                    prob_yes = prob2
                articles[urlids[i-1]]['cat_probs'][cat] = prob_yes
                if prediction == '1':
                    articles[urlids[i-1]]['categories'].append(cat)

        for urlid in urlids:
            articles[urlid]['categories'] = sorted(articles[urlid]['categories'])

    def train(self, ident):
        (train_corpus, _) = self.corpus.load_corpus(ident, 1.0, True)
        self.generate_libsvm_input(train_corpus, 'train')
        print "Done generating SVM input."
        self.libsvm_train(False)

    def evaluate(self, ident, pct):
        for i in range(1):
            results = {}
            (train_corpus, predict_corpus) = \
                self.corpus.load_corpus(ident, float(pct), True, True)
            savepickle(paths['svm.svm_data_tmp']+'wordids.pkl', self.corpus.wordids)
            self.generate_libsvm_input(train_corpus, 'train')
            self.generate_libsvm_input(predict_corpus, 'predict')
            print "Done generating SVM input."
            results = self.libsvm_train(True)
            print "Iteration", i, ", pct", pct
            print results

    def generate_libsvm_input(self, corpus, suffix):
        train_labels = {}
        train_samples = {}
        for cat in self.corpus.categories:
            train_labels[cat] = []
            train_samples[cat] = []
        for c in corpus:
            cats = c[2].split(' ')
            for cat in self.corpus.categories:
                train_samples[cat].append(self.corpus.get_tfidf(c[0], c[1]))
                if cat in cats:
                    train_labels[cat].append("+1")
                else:
                    train_labels[cat].append("-1")
        for cat in self.corpus.categories:
            # do feature selection
            whole_fsc_dict, whole_imp_v = \
                cal_feat_imp(train_labels[cat], train_samples[cat])
            # choose top 9000 features
            fv = whole_imp_v[:9000]
            tr_sel_samp = select(train_samples[cat], fv)
            model = open(paths['svm.svm_data_tmp']+cat+'-'+suffix, 'w')
            for i in range(len(train_samples[cat])):
                model.write("%s " % train_labels[cat][i])
                for wordid in sorted(tr_sel_samp[i].iterkeys()):
                    model.write("%s:%f " % (wordid, tr_sel_samp[i][wordid]))
                model.write("\n")
            model.close()

    def libsvm_train(self, alsotest):
        results = {}
        # train each category plus NotRelated
        for cat in self.corpus.categories:
            if alsotest:
                sys.stdout.write("Training and testing " + cat + "... ")
            else:
                sys.stdout.write("Training " + cat + "... ")
            sys.stdout.flush()
            if alsotest:
                cmd = 'python svm-easy.py "%s" "%s"' % \
                    (paths['svm.svm_data_tmp']+cat+'-train',
                     paths['svm.svm_data_tmp']+cat+'-predict')
            else:
                cmd = 'python svm-easy.py "%s"' % \
                    (paths['svm.svm_data_tmp']+cat+'-train')
            (stdout, _) = Popen(cmd, shell = True, stdout=PIPE).communicate()
            if alsotest:
                m = re.match('.*Accuracy = (\d+).*', re.sub('\n', '', stdout))
                results[cat] = float(m.group(1))
                sys.stdout.write(str(results[cat]) + "\n")
                sys.stdout.flush()
        return results
class AINewsDuplicates:
    def __init__(self):
        self.corpus = AINewsCorpus()

    def filter_duplicates(self, articles):
        date_start = date.today() - timedelta(days = int(config['duplicates.days_back']))
        date_end = date.today()
        cutoff = float(config['duplicates.threshold'])
        all_articles = self.corpus.get_articles_daterange(date_start, date_end)
        duplicates = []
        similarities = {}
        urlids = sorted(all_articles.keys())
        for i in range(0, len(urlids) - 1):
            for j in range(i+1, len(urlids)):
                # only compare to articles that might be published this week
                if urlids[j] not in articles: continue
                tfidf1 = all_articles[urlids[i]]['tfidf']
                tfidf2 = all_articles[urlids[j]]['tfidf']
                similarity = self.corpus.cos_sim(tfidf1, tfidf2)
                if similarity >= cutoff:
                    # if article i has not been published
                    if not all_articles[urlids[i]]['published']:
                        add_to_duplicates(duplicates, urlids[i], urlids[j])
                        similarities[(urlids[i], urlids[j])] = similarity
                        similarities[(urlids[j], urlids[i])] = similarity
                    # if article i has already been published,
                    # then just don't publish article j
                    else:
                        articles[urlids[j]]['duplicates'] = \
                            [(urlids[i], all_articles[urlids[i]]['title'], similarity)]
                        if articles[urlids[j]]['publish']:
                            articles[urlids[j]]['publish'] = False
                            articles[urlids[j]]['transcript'].append(
                                ("Rejected because duplicate (sim=%.3f, " +
                                 "cutoff=%.3f) of already published article %s") %
                                (similarity, cutoff, str(urlids[i])))

        for dupset in duplicates:
            for urlid in dupset:
                if urlid in articles:
                    dupset2 = dupset.copy()
                    dupset2.remove(urlid)
                    articles[urlid]['duplicates'] = \
                        map(lambda u: (u, articles[u]['title'], similarities[(u,urlid)]),
                            filter(lambda u: u in articles and (u,urlid) in similarities,
                                   dupset2))
            sorted_dups = sorted(
                filter(lambda u: u in articles and articles[u]['publish'], dupset),
                cmp=lambda x,y: self.corpus.compare_articles(articles[x], articles[y]),
                reverse = True)
            if(len(sorted_dups) > 1):
                # first in sorted set is chosen; rest are dumped
                articles[sorted_dups[0]]['transcript'].append("Preferred over duplicates")
                for urlid in sorted_dups[1:]:
                    if articles[urlid]['publish']:
                        articles[urlid]['publish'] = False
                        articles[urlid]['transcript'].append(
                            ("Rejected because duplicate " +
                             "%s was chosen instead") % sorted_dups[0])
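
# AINewsCorpus.cos_sim() is defined elsewhere in NewsFinder; filter_duplicates()
# only relies on it returning a similarity in [0, 1] for two sparse tf-idf dicts
# keyed by word id. A minimal sketch of that behavior, for reference only (the
# name cos_sim_sketch is not part of the codebase):
import math

def cos_sim_sketch(tfidf1, tfidf2):
    """Cosine similarity between two {wordid: weight} dictionaries."""
    dot = sum(w * tfidf2.get(wordid, 0.0) for wordid, w in tfidf1.iteritems())
    norm1 = math.sqrt(sum(w * w for w in tfidf1.itervalues()))
    norm2 = math.sqrt(sum(w * w for w in tfidf2.itervalues()))
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0
    return dot / (norm1 * norm2)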
    ([323, 504], "Teleconference robot")]

duplist_stored = []
try:
    duplist_stored = loadpickle(paths['corpus.duplist'])
except:
    pass
notduplist_stored = set()
try:
    notduplist_stored = loadpickle(paths['corpus.notduplist'])
except:
    pass
duplists += duplist_stored

corpus = AINewsCorpus()
summarizer = AINewsSummarizer()
id_begin = 315
id_end = 1500

####################################
# idset records all the news id
####################################
idset = set()      # idset records all human selected news id
checklist = set()  # checklist records all human selected dup pairs
for dupset in duplists:
    for id in dupset[0]:
        idset.add(id)
    n = len(dupset[0])
    sortedlist = sorted(dupset[0])
    for i in range(n-1):
class AINewsPublisher():
    def __init__(self):
        self.debug = config['ainews.debug']
        self.today = date.today()
        self.earliest_date = self.today - timedelta(days = int(config['ainews.period']))
        self.db = AINewsDB()
        self.corpus = AINewsCorpus()
        self.duplicates = AINewsDuplicates()
        self.svm_classifier = AINewsSVMClassifier()
        self.txtpro = AINewsTextProcessor()
        self.summarizer = AINewsSummarizer()

        self.articles = {}
        self.publishable_articles = []
        self.semiauto_email_output = ""

        self.topicids = {"AIOverview": 0, "Agents": 1, "Applications": 2,
                         "CognitiveScience": 3, "Education": 4, "Ethics": 5,
                         "Games": 6, "History": 7, "Interfaces": 8,
                         "MachineLearning": 9, "NaturalLanguage": 10,
                         "Philosophy": 11, "Reasoning": 12, "Representation": 13,
                         "Robots": 14, "ScienceFiction": 15, "Speech": 16,
                         "Systems": 17, "Vision": 18}

    def filter_and_process(self):
        self.articles = self.corpus.get_unprocessed()

        if len(self.articles) == 0: return

        # assume every article will be published; may be set to False from one
        # of the filtering processes below
        for urlid in self.articles:
            self.articles[urlid]['publish'] = True
            self.articles[urlid]['transcript'] = []

        # filter by date
        for urlid in self.articles:
            if self.articles[urlid]['pubdate'] == None:
                # give a meaningful pubdate so that other code doesn't crash
                self.articles[urlid]['pubdate'] = self.today
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                    "Rejected due to bogus publication date.")
            elif self.articles[urlid]['pubdate'] < self.earliest_date:
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                    ("Rejected because article is too old " +
                     "(earliest valid date is %s while article was " +
                     "published on %s") %
                    (self.earliest_date.strftime('%F'),
                     self.articles[urlid]['pubdate'].strftime('%F')))

        # filter by blacklist (for urls)
        for urlid in self.articles:
            for black in blacklist_urls:
                if re.search(black, self.articles[urlid]['url']):
                    self.articles[urlid]['publish'] = False
                    self.articles[urlid]['transcript'].append(
                        ("Rejected because url matched blacklisted url %s" % black))
                    break

        # filter by whitelist
        for urlid in self.articles:
            white_wordfreq = self.txtpro.whiteprocess(urlid,
                                                      self.articles[urlid]['content'])
            self.articles[urlid]['white_wordfreq'] = white_wordfreq

            # require at least two different whitelisted terms
            # unless the article is user-submitted
            if len(white_wordfreq) < 2 \
                    and self.articles[urlid]['publisher'] != 'UserSubmitted':
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                    'Rejected due to only one or no whitelisted terms')

        # update categories based on SVM classifier predictions
        self.svm_classifier.predict(self.articles)

        # drop articles classified as 'NotRelated' unless the article
        # is user-submitted
        for urlid in self.articles:
            if 'NotRelated' in self.articles[urlid]['categories'] \
                    and self.articles[urlid]['publisher'] != 'UserSubmitted':
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                    'Rejected due to NotRelated classification')

        # drop articles with no categories (even if user-submitted)
        for urlid in self.articles:
            if len(self.articles[urlid]['categories']) == 0:
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                    'Rejected due to no selected categories')

        # filter out duplicates; some articles may have 'publish' set to False
        # by this function
        self.duplicates.filter_duplicates(self.articles)

        # add article summaries
        self.summarizer.summarize(self.corpus, self.articles)

        for urlid in self.articles:
            try:
                print urlid, \
                    self.articles[urlid]['publish'], \
                    self.articles[urlid]['title'], \
                    self.articles[urlid]['categories'], \
                    self.articles[urlid]['summary']
                print
            except:
                pass

        for urlid in self.articles:
            # update article in database
            self.update_db(self.articles[urlid])

        # mark each as processed
        self.corpus.mark_processed(self.articles.itervalues())

        # save sorted list of articles to be read by AINewsPublisher; sort by
        # duplicate count (more = better), then relevance of source,
        # then by number of categories (more = better)
        unpublished_articles = sorted(
            filter(lambda x: x['publish'], self.articles.values()),
            cmp=lambda x, y: self.corpus.compare_articles(x, y),
            reverse = True)

        max_cat_count = int(config['publisher.max_cat_count'])
        max_count = int(config['publisher.max_count'])
        cat_counts = {}
        for cat in self.corpus.categories:
            cat_counts[cat] = 0

        # choose stories such that no category has more than max_cat_count
        # members and no more than max_count stories have been selected
        # (independent of category); only one of the article's categories needs
        # to have "free space"
        self.publishable_articles = []
        for article in unpublished_articles:
            if len(self.publishable_articles) == max_count:
                break
            free_cat = False
            for cat in article['categories']:
                if cat_counts[cat] < max_cat_count:
                    free_cat = True
                    break
            # if there is a free category or this article has only the
            # Applications category, then it can be published
            if free_cat or (article['categories'] == ['Applications']):
                self.publishable_articles.append(article)
                self.articles[article['urlid']]['transcript'].append('Published')
                self.articles[article['urlid']]['published'] = True
                for cat in article['categories']:
                    cat_counts[cat] += 1

        # record that these articles are publishable
        self.corpus.mark_publishable(self.publishable_articles)

    def update_db(self, article):
        self.db.execute("delete from categories where urlid = %s", article['urlid'])
        for cat in article['categories']:
            self.db.execute("insert into categories values (%s,%s)",
                            (article['urlid'], cat))
        self.db.execute("update urllist set summary = %s where urlid = %s",
                        (article['summary'], article['urlid']))

    def get_publishable_articles(self):
        publishable = self.corpus.get_publishable()
        self.publishable_articles = []

        # drop "Applications" category if article has more categories
        for article in publishable:
            if len(article['categories']) > 1:
                article['categories'] = filter(lambda c: c != "Applications",
                                               article['categories'])
            self.publishable_articles.append(article)

    def mark_published(self):
        self.corpus.mark_published(self.publishable_articles)

    def generate_standard_output(self):
        """
        Generate the standard output for debugging on screen.
        """
        txt = LatestNewsTxt()
        txt.news = self.publishable_articles
        savefile(paths['ainews.output'] + "std_output.txt", str(txt))

    def generate_email_output(self):
        """
        Generate the output for email format.
        """
        email = LatestNewsEmail()
        email.date = self.today.strftime("%B %d, %Y")
        email.year = self.today.strftime("%Y")
        email.news = self.publishable_articles
        email.aitopic_urls = aitopic_urls
        email.topicids = self.topicids

        email_output = str(email)
        savefile(paths['ainews.output'] + "email_output.txt", email_output)
        self.semiauto_email_output = email_output

    def generate_pmwiki_all_output(self):
        pmwiki_all = AllNewsPmWiki()
        pmwiki_all.date = self.today.strftime("%B %d, %Y")
        pmwiki_all.year = self.today.strftime("%Y")
        pmwiki_all.news = self.articles.values()
        savefile(paths['ainews.output'] + "pmwiki_all.txt", str(pmwiki_all))

        # Generate wiki metadata page for each article
        urlids_output = ""
        for urlid in self.articles:
            urlids_output += str(urlid) + '\n'
            article_wiki = ArticlePmWiki()
            article_wiki.year = self.today.strftime("%Y")
            article_wiki.dupthreshold = float(config['duplicates.threshold'])
            article_wiki.n = self.articles[urlid]
            savefile(paths['ainews.output'] + "aiarticles/%d" % urlid,
                     str(article_wiki))
        savefile(paths['ainews.output'] + "urlids_output.txt", urlids_output)

    def generate_pmwiki_published_output(self):
        """
        Generate the output with PmWiki page format. It needs to be
        further processed by AINewsPmwiki.php.
        """
        pmwiki = LatestNewsPmWiki()
        pmwiki.date = self.today.strftime("%B %d, %Y")
        pmwiki.year = self.today.strftime("%Y")
        pmwiki.news = self.publishable_articles
        pmwiki.rater = True
        savefile(paths['ainews.output'] + "pmwiki_output.txt", str(pmwiki))
        pmwiki.rater = False
        savefile(paths['ainews.output'] + "pmwiki_output_norater.txt", str(pmwiki))

    def publish_email(self):
        """
        Call AINewsEmail.php to send email through PHP Mail Server
        """
        #cmd = 'php AINewsEmail.php'
        #Popen(cmd, shell = True, stdout = PIPE, stderr = STDOUT).communicate()
        self.publish_email_semiauto()

    def publish_email_semiauto(self):
        """
        Create an AINewsSemiAutoEmail.html file for admin to click and
        semi-auto send it to the subscriber list.
        """
        semiauto = """
<html>
<body>
<h1>AI Alert - SemiAuto Sender</h1>
<form action="http://aaai.org/cgi-dada/mail.cgi?flavor=send_email" method='post'>
<!-- <form action="welcome.php" method="post"> -->
<input type='hidden' name='f' value='send_email' />
<input type='hidden' name='process' value='true' />
<input type='hidden' name='admin_list' value='alert' />
<input type='hidden' name='message_subject' value="%s" />
<input type='hidden' name='email_format' value='HTML' />
<textarea type='hidden' name="text_message_body">%s</textarea>
<input type='submit' value='Submit Mailing List Message' />
</form>
<h2>Please review the email below. If there are concerns, contact Bruce or Reid:</h2>
<p>
%s
</p>
</body>
</html>
""" % ("AI Alert - " + str(self.today.strftime("%B %d, %Y")),
       self.semiauto_email_output, self.semiauto_email_output)
        savefile(paths['ainews.html'] + "semiauto_email.html", semiauto)

    def publish_pmwiki(self):
        """
        Call AINewsPmwiki.php to publish latest news to AAAI Pmwiki website.
        """
        cmd = 'php AINewsPmwiki.php'
        Popen(cmd, shell = True).wait()

    def update_rss(self):
        rssitems = []
        # insert latest news into rssitems
        for article in self.publishable_articles:
            rssitems.append(PyRSS2Gen.RSSItem(
                title = article['title'],
                link = article['url'],
                description = article['summary'],
                guid = PyRSS2Gen.Guid(article['url']),
                pubDate = datetime(article['pubdate'].year,
                                   article['pubdate'].month,
                                   article['pubdate'].day)))
        rssfile = paths['ainews.rss'] + "news.xml"
        publish_rss(rssfile, rssitems)

        topicrsses = ['overview', 'agent', 'apps', 'cogsci', 'edu', 'ethsoc',
                      'game', 'hist', 'interf', 'ml', 'nlp', 'phil', 'reason',
                      'rep', 'robot', 'scifi', 'speech', 'systems', 'vision']
        topicitems = []
        for i in range(len(topicrsses)):
            topicitems.append([])
        urlset = set()
        for article in self.publishable_articles:
            if article['url'] in urlset: continue
            urlset.add(article['url'])
            for cat in article['categories']:
                topicid = self.topicids[cat]
                topicitems[topicid].append(PyRSS2Gen.RSSItem(
                    title = article['title'],
                    link = article['url'],
                    description = article['summary'],
                    guid = PyRSS2Gen.Guid(article['url']),
                    pubDate = datetime(article['pubdate'].year,
                                       article['pubdate'].month,
                                       article['pubdate'].day)))
        for i in range(len(topicrsses)):
            rssfile = paths['ainews.rss'] + topicrsses[i] + '.xml'
            if len(topicitems[i]) != 0:
                publish_rss(rssfile, topicitems[i])
# This file is part of NewsFinder.
# https://github.com/joshuaeckroth/AINews
#
# Copyright (c) 2011 by the Association for the Advancement of
# Artificial Intelligence. This program and parts of it may be used and
# distributed without charge for non-commercial purposes as long as this
# notice is included.

import sys
import re

from AINewsConfig import paths
from AINewsCorpus import AINewsCorpus
from AINewsCentroidClassifier import AINewsCentroidClassifier

aicorpus = AINewsCorpus()

def dissim(tfidf1, tfidf2, category = None):
    d = 1.0 - aicorpus.cos_sim(tfidf1, tfidf2, category)
    if d < 0.1E-10:
        d = 0.0
    return d

if __name__ == "__main__":
    directory = sys.argv[1]
    ident = sys.argv[2]

    corpus = aicorpus.load_corpus(ident, 1.0)[0]
    centroid = AINewsCentroidClassifier(aicorpus)
    for category in aicorpus.categories:
        centroid.train_centroid(category, corpus, 'centroid_eval', True)
class AINewsPublisher():
    def __init__(self):
        self.debug = config['ainews.debug']
        self.today = date.today()
        self.earliest_date = self.today - timedelta(days=int(config['ainews.period']))
        self.db = AINewsDB()
        self.corpus = AINewsCorpus()
        self.duplicates = AINewsDuplicates()
        self.txtpro = AINewsTextProcessor()
        self.weka = AINewsWekaClassifier()

        self.articles = {}
        self.semiauto_email_output = ""

    def filter_and_process(self):
        self.articles = self.corpus.get_unprocessed()

        if len(self.articles) == 0: return

        # assume every article will be published; may be set to False from one
        # of the filtering processes below
        for urlid in self.articles:
            self.articles[urlid]['publish'] = True
            self.articles[urlid]['transcript'] = []

        # filter by date
        print "Filtering by date..."
        for urlid in self.articles:
            if self.articles[urlid]['pubdate'] == None:
                # give a meaningful pubdate so that other code doesn't crash
                self.articles[urlid]['pubdate'] = self.today
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                    "Rejected due to bogus publication date.")
            elif self.articles[urlid]['pubdate'] < self.earliest_date:
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                    ("Rejected because article is too old " +
                     "(earliest valid date is %s while article was " +
                     "published on %s") %
                    (self.earliest_date.strftime('%F'),
                     self.articles[urlid]['pubdate'].strftime('%F')))

        # filter by blacklist (for urls)
        print "Filtering by blacklist..."
        for urlid in self.articles:
            for black in blacklist_urls:
                if re.search(black, self.articles[urlid]['url']):
                    self.articles[urlid]['publish'] = False
                    self.articles[urlid]['transcript'].append(
                        ("Rejected because url matched blacklisted url %s" % black))
                    break

        # filter by whitelist
        print "Filtering by whitelist..."
        for urlid in self.articles:
            white_wordfreq = self.txtpro.whiteprocess(
                urlid, self.articles[urlid]['content'])
            self.articles[urlid]['white_wordfreq'] = white_wordfreq

            # require at least two different whitelisted terms
            # unless the article is user-submitted
            if len(white_wordfreq) < 2 \
                    and self.articles[urlid]['source'] != 'User Submitted':
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                    'Rejected due to only one or no whitelisted terms')

        # update categories based on classifier predictions
        print "Classifying..."
        self.weka.predict(self.articles)

        # drop articles with no categories
        print "Dropping articles with no categories..."
        for urlid in self.articles:
            if len(self.articles[urlid]['categories']) == 0:
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                    'Rejected due to no selected categories')

        # filter out duplicates; some articles may have 'publish' set to False
        # by this function
        print "Filtering duplicates..."
        self.duplicates.filter_duplicates(self.articles)

        for urlid in self.articles:
            print urlid, self.articles[urlid]['publish'], \
                self.articles[urlid]['title'], \
                self.articles[urlid]['categories'], \
                self.articles[urlid]['summary']
            print

        print "Grabbing images..."
        for urlid in self.articles:
            # grab and convert article image (if it exists)
            self.grab_convert_image(self.articles[urlid])

            # update article in database
            self.update_db(self.articles[urlid])

        # mark each as processed
        print "Marking as processed."
        self.corpus.mark_processed(self.articles.itervalues())

    def grab_convert_image(self, article):
        if len(article['image_url']) == 0:
            article['image_path'] = ''
            return
        try:
            f = urllib2.urlopen(article['image_url'])
            img = open("%s%s" % (paths['ainews.image_dir'],
                                 str(article['urlid'])), 'w')
            img.write(f.read())
            img.close()
            # produces [urlid].jpg
            Popen("%s -format jpg -gravity Center -thumbnail 200x200 %s%s" % \
                  (paths['imagemagick.mogrify'], paths['ainews.image_dir'],
                   str(article['urlid'])), shell = True).communicate()
            # remove [urlid] file (with no extension)
            remove("%s%s" % (paths['ainews.image_dir'], str(article['urlid'])))
            article['image_path'] = "public://newsfinder_images/%s.jpg" % article['urlid']
        except Exception as e:
            print "Failed converting image for %d: %s" % (article['urlid'], e)
            article['image_path'] = ''

    def update_db(self, article):
        self.db.execute("delete from categories where urlid = %s", article['urlid'])
        for cat in article['categories']:
            self.db.execute("insert into categories values (%s,%s)",
                            (article['urlid'], cat))

    def generate_feed_import(self):
        """
        Generate XML file for feed import on the Drupal site.
        """
        xml = FeedImport()
        for article in self.articles.values():
            # escape ampersands for XML output
            article['source'] = re.sub(r'&', '&amp;', article['source'])
        xml.news = self.articles.values()
        savefile(paths['ainews.output_xml'] + "news.xml", str(xml))

    def generate_email_output(self):
        articles = []
        try:
            f = urllib2.urlopen(paths['ainews.top_weekly_news_xml'])
            xml = etree.parse(f)
            for node in xml.iter("node"):
                print "Found", node.findtext("Title")
                published = node.findtext("Publication_date")
                articles.append({
                    'title': node.findtext("Title"),
                    'source': node.findtext("Source"),
                    'topics': re.sub(r'/topic/', 'http://aitopics.org/topic/',
                                     node.findtext("Topics")),
                    'pubdate': date(int(published[0:4]), int(published[5:7]),
                                    int(published[8:10])),
                    'summary': re.sub(r'</p>(</blockquote>)?$', '',
                                      re.sub(r'^(<blockquote>)?<p>', '',
                                             node.findtext("Body"))),
                    'url': node.findtext("Original_link"),
                    'link': re.sub(r'/news/', 'http://aitopics.org/news/',
                                   node.findtext("Link")),
                    'image': re.sub(r'<img',
                                    '<img align="left" style="margin: 8px 8px 8px 0; border: 1px solid #ccc; padding: 5px; background: white;" ',
                                    node.findtext("Representative_image"))})
        except Exception, e:
            print e

        email = LatestNewsEmail()
        email.date = self.today.strftime("%B %d, %Y")
        email.year = self.today.strftime("%Y")
        email.articles = articles

        email_output = str(email)
        return email_output
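
# A corresponding driver sketch for the newer, Weka-based publisher above.
# The ordering and the local "email_output.html" file name are assumptions; in
# practice the returned email HTML would be handed to whatever sends the weekly
# alert.
if __name__ == "__main__":
    publisher = AINewsPublisher()
    publisher.filter_and_process()     # filter, classify, grab images, store
    publisher.generate_feed_import()   # news.xml for the Drupal feed import
    email_html = publisher.generate_email_output()
    open("email_output.html", "w").write(email_html)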