Example No. 1
    def __init__(self):
        self.debug = config['ainews.debug']
        self.today = date.today()
        self.earliest_date = self.today - timedelta(
            days=int(config['ainews.period']))
        self.db = AINewsDB()
        self.corpus = AINewsCorpus()
        self.duplicates = AINewsDuplicates()
        self.txtpro = AINewsTextProcessor()
        self.weka = AINewsWekaClassifier()

        self.articles = {}
        self.semiauto_email_output = ""
Example No. 2
def evaluate():
    corpus = AINewsCorpus()
    print "urlid,length truewords,length justext,length goose,ld justtext,ld goose"
    for filename in sorted(glob.glob("../../experiments/justext/*.true")):
        truetext = ents.convert(file(filename).read())
        truetext = re.sub(r'[^\w\s]', ' ', trunc(truetext, max_pos=3000, ellipsis=False))
        truewords = re.split(r'\s+', truetext)
        urlid = filename[26:30]
        article = corpus.get_article(urlid)
        if article == None: continue
        articletext = re.sub(r'[^\w\s]', ' ', trunc((article['content_all']).encode('ascii'), max_pos=3000, ellipsis=False))
        articlewords = re.split(r'\s+', articletext)
        goosecmd = "cd /home/josh/aitopics/AINews/tools/goose; /opt/maven/bin/mvn exec:java -Dexec.mainClass=com.jimplush.goose.TalkToMeGoose -Dexec.args='%s' -q 2>>/home/josh/log.txt" % article['url']
        (stdout, _) = Popen(goosecmd, shell = True, stdout = PIPE).communicate()
        goosetext = ents.convert(stdout.encode('ascii'))
        goosetext = re.sub(r'[^\w\s]', ' ', trunc(goosetext, max_pos=3000, ellipsis=False))
        goosewords = re.split(r'\s+', goosetext)
        ld_1 = (levenshtein_distance(truewords, articlewords))/float(len(truewords))
        ld_2 = (levenshtein_distance(truewords, goosewords))/float(len(truewords))
        print "%s,%d,%d,%d,%.4f,%.4f" % \
            (urlid, len(truewords), len(articlewords), len(goosewords), ld_1, ld_2)
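
A note on the comparison above: levenshtein_distance is applied to lists of words rather than characters, and the result is divided by the number of ground-truth words, so 0.0 means a perfect extraction and values near 1.0 mean almost nothing was recovered. The helper itself is not shown in these examples; below is a minimal sketch of a word-level edit distance with that behavior (the function name and the sample text are illustrative, not the project's code).

# Sketch only: word-level Levenshtein distance, normalized as in evaluate() above.
def word_levenshtein(a, b):
    """Edit distance between two word lists (insertions, deletions, substitutions)."""
    prev = range(len(b) + 1)          # distances from a[:0] to every prefix of b
    for i in range(1, len(a) + 1):
        curr = [i] + [0] * len(b)
        for j in range(1, len(b) + 1):
            cost = 0 if a[i - 1] == b[j - 1] else 1
            curr[j] = min(prev[j] + 1,         # delete a word
                          curr[j - 1] + 1,     # insert a word
                          prev[j - 1] + cost)  # substitute a word
        prev = curr
    return prev[len(b)]

truewords = "the robot plays chess".split()
extracted = "robot plays chess online".split()
print word_levenshtein(truewords, extracted) / float(len(truewords))   # 0.5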
Example No. 3
    def __init__(self):
        self.debug = config["ainews.debug"]
        self.today = date.today()
        self.earliest_date = self.today - timedelta(days=int(config["ainews.period"]))
        self.db = AINewsDB()
        self.corpus = AINewsCorpus()
        self.duplicates = AINewsDuplicates()
        self.txtpro = AINewsTextProcessor()
        self.weka = AINewsWekaClassifier()

        self.articles = {}
        self.semiauto_email_output = ""
Example No. 4
 def evaluate(self, ident, inputdir):
     corpus = AINewsCorpus()
     (articles, _) = corpus.load_corpus(ident, 1.0, True)
     for (urlid,_,_) in articles:
         article = corpus.get_article(urlid, True)
         try:
             os.mkdir("%s/gold/%s" % (inputdir, urlid))
         except:
             pass
         f = open("%s/gold/%s/%s.fulltext" % (inputdir, urlid, urlid), 'w')
         f.write(article['content'])
         f.write("\n")
         f.close()
         f = open("%s/system/ots/%s.ots.system" % (inputdir, urlid), 'w')
         f.write("\n".join(self.summarize_single_ots(article)))
         f.write("\n")
         f.close()
         f = open("%s/system/tfidf/%s.tfidf.system" % (inputdir, urlid), 'w')
         f.write("\n".join(self.summarize_article(corpus, article, 4, False)))
         f.write("\n")
         f.close()
         print "Saved %s." % urlid
Example No. 5
    def __init__(self):
        self.debug = config['ainews.debug']
        self.today = date.today()
        self.earliest_date = self.today - timedelta(days = int(config['ainews.period']))
        self.db = AINewsDB()
        self.corpus = AINewsCorpus()
        self.duplicates = AINewsDuplicates()
        self.svm_classifier = AINewsSVMClassifier()
        self.txtpro = AINewsTextProcessor()
        self.summarizer = AINewsSummarizer()

        self.articles = {}
        self.publishable_articles = []
        self.semiauto_email_output = ""

        self.topicids = {"AIOverview":0, "Agents":1, "Applications":2,
           "CognitiveScience":3, "Education":4,"Ethics":5, 
           "Games":6, "History":7, "Interfaces":8, "MachineLearning":9,
           "NaturalLanguage":10, "Philosophy":11, "Reasoning":12,
           "Representation":13, "Robots":14, "ScienceFiction":15,"Speech":16,
           "Systems":17,  "Vision":18}
Example No. 6
 def __init__(self):
     self.corpus = AINewsCorpus()
Example No. 7
class AINewsSVMClassifier:
    def __init__(self):
        self.corpus = AINewsCorpus()

    def predict(self, articles):
        urlids = sorted(articles.keys())
        for urlid in articles:
            articles[urlid]['categories'] = []

        # produce the test input file
        f = open(paths['svm.svm_data']+'predict', 'w')
        for urlid in urlids:
            for cat in self.corpus.categories:
                articles[urlid]['cat_probs'] = {}
            tfidf = self.corpus.get_tfidf(urlid, articles[urlid]['wordfreq'])
            f.write("+1 ")
            for wordid in sorted(tfidf.keys()):
                f.write("%s:%f " % (wordid, tfidf[wordid]))
            f.write("\n")
        f.close()

        # predict each category plus NotRelated
        for cat in self.corpus.categories:
            cmd = 'svm-scale -r "%s" "%s" > "%s"' % \
                (paths['svm.svm_data']+cat+'.range', \
                paths['svm.svm_data']+'predict', \
                paths['svm.svm_data']+'predict-'+cat+'.scaled')
            Popen(cmd, shell = True).wait()
            cmd = 'svm-predict -b 1 "%s" "%s" "%s" > /dev/null' % \
                (paths['svm.svm_data']+'predict-'+cat+'.scaled', \
                paths['svm.svm_data']+cat+'.model',
                paths['svm.svm_data']+'predict-'+cat+'.output')
            Popen(cmd, shell = True).wait()
            f = open(paths['svm.svm_data']+'predict-'+cat+'.output', 'r')
            lines = f.readlines()
            f.close()
            # first line of output file says "labels -1 1" or whatever;
            # the order could be different, so we have to check
            labels = re.match('labels (-?1) (-?1)', lines[0]).group(1,2)
            if labels[0] == '1': pos_label = 0
            else: pos_label = 1
            for i in range(1, len(lines)):
                (prediction, prob1, prob2) = \
                        re.match('(-?1) (\d\.?\d*e?-?\d*) (\d\.?\d*e?-?\d*)', lines[i]).group(1,2,3)
                if pos_label == 0: prob_yes = prob1
                else: prob_yes = prob2
                articles[urlids[i-1]]['cat_probs'][cat] = prob_yes
                if prediction == '1':
                    articles[urlids[i-1]]['categories'].append(cat)

            for urlid in urlids:
                articles[urlid]['categories'] = sorted(articles[urlid]['categories'])

    def train(self, ident):
        (train_corpus, _) = self.corpus.load_corpus(ident, 1.0, True)
        self.generate_libsvm_input(train_corpus, 'train')
        print "Done generating SVM input."
        self.libsvm_train(False)

    def evaluate(self, ident, pct):
        for i in range(1):
            results = {}
            (train_corpus, predict_corpus) = self.corpus.load_corpus(ident, float(pct), True, True)
            savepickle(paths['svm.svm_data_tmp']+'wordids.pkl', self.corpus.wordids)
            self.generate_libsvm_input(train_corpus, 'train')
            self.generate_libsvm_input(predict_corpus, 'predict')
            print "Done generating SVM input."
            results = self.libsvm_train(True)
            print "Iteration", i, ", pct", pct
            print results
                                                                                                  
    def generate_libsvm_input(self, corpus, suffix):
        train_labels = {}
        train_samples = {}
        for cat in self.corpus.categories:
            train_labels[cat] = []
            train_samples[cat] = []
        for c in corpus:
            cats = c[2].split(' ')
            for cat in self.corpus.categories:
                train_samples[cat].append(self.corpus.get_tfidf(c[0], c[1]))
                if cat in cats:
                    train_labels[cat].append("+1")
                else:
                    train_labels[cat].append("-1")

        for cat in self.corpus.categories:
            # do feature selection
            whole_fsc_dict,whole_imp_v = cal_feat_imp(train_labels[cat], train_samples[cat])
            # choose top 9000 features
            fv = whole_imp_v[:9000]
            tr_sel_samp = select(train_samples[cat], fv)

            model = open(paths['svm.svm_data_tmp']+cat+'-'+suffix, 'w')
            for i in range(len(train_samples[cat])):
                model.write("%s " % train_labels[cat][i])
                for wordid in sorted(tr_sel_samp[i].iterkeys()):
                    model.write("%s:%f " % (wordid, tr_sel_samp[i][wordid]))
                model.write("\n")
            model.close()

    def libsvm_train(self, alsotest):
        results = {}
        # train each category plus NotRelated
        for cat in self.corpus.categories:
            if alsotest:
                sys.stdout.write("Training and testing " + cat + "... ")
            else:
                sys.stdout.write("Training " + cat + "... ")
            sys.stdout.flush()
            if alsotest:
                cmd = 'python svm-easy.py "%s" "%s"' % \
                    (paths['svm.svm_data_tmp']+cat+'-train',
                     paths['svm.svm_data_tmp']+cat+'-predict')
            else:
                cmd = 'python svm-easy.py "%s"' % (paths['svm.svm_data_tmp']+cat+'-train')
            (stdout, _) = Popen(cmd, shell = True, stdout=PIPE).communicate()
            if alsotest:
                m = re.match('.*Accuracy = (\d+).*', re.sub('\n', '', stdout))
                results[cat] = float(m.group(1))
                sys.stdout.write(str(results[cat]) + "\n")
                sys.stdout.flush()
        return results
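
Both predict() and generate_libsvm_input() above write each article as a single libsvm-format line: a class label followed by space-separated wordid:tfidf pairs in ascending word-id order. A self-contained sketch of that serialization with invented weights (this illustrates the file format only, not project code):

# Sketch only: one sparse vector in libsvm input format.
def libsvm_line(label, tfidf):
    """Render a {wordid: weight} dict as 'label id:weight id:weight ...'."""
    pairs = ["%s:%f" % (wordid, tfidf[wordid]) for wordid in sorted(tfidf.keys())]
    return "%s %s" % (label, " ".join(pairs))

tfidf = {12: 0.031, 7: 0.154, 88: 0.002}   # wordid -> tf-idf weight
print libsvm_line("+1", tfidf)
# +1 7:0.154000 12:0.031000 88:0.002000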
Example No. 8
 def __init__(self):
     self.corpus = AINewsCorpus()
Example No. 9
class AINewsDuplicates:
    def __init__(self):
        self.corpus = AINewsCorpus()

    def filter_duplicates(self, articles):
        date_start = date.today() - timedelta(days = int(config['duplicates.days_back']))
        date_end = date.today()
        cutoff = float(config['duplicates.threshold'])
        all_articles = self.corpus.get_articles_daterange(date_start, date_end)
        duplicates = []
        similarities = {}

        urlids = sorted(all_articles.keys())
        for i in range(0, len(urlids) - 1):
            for j in range(i+1, len(urlids)):
                # only compare to articles that might be published this week
                if urlids[j] not in articles: continue

                tfidf1 = all_articles[urlids[i]]['tfidf']
                tfidf2 = all_articles[urlids[j]]['tfidf']
                similarity = self.corpus.cos_sim(tfidf1, tfidf2)

                if similarity >= cutoff:
                    # if article i has not been published
                    if not all_articles[urlids[i]]['published']:
                        add_to_duplicates(duplicates, urlids[i], urlids[j])
                        similarities[(urlids[i], urlids[j])] = similarity
                        similarities[(urlids[j], urlids[i])] = similarity
                    # if article i has already been published,
                    # then just don't publish article j
                    else:
                        articles[urlids[j]]['duplicates'] = \
                                [(urlids[i], all_articles[urlids[i]]['title'], similarity)]
                        if articles[urlids[j]]['publish']:
                            articles[urlids[j]]['publish'] = False
                            articles[urlids[j]]['transcript'].append(
                                    ("Rejected because duplicate (sim=%.3f, " +
                                    "cutoff=%.3f) of already published article %s") % \
                                            (similarity, cutoff, str(urlids[i])))

        for dupset in duplicates:
            for urlid in dupset:
                if urlid in articles:
                    dupset2 = dupset.copy()
                    dupset2.remove(urlid)
                    articles[urlid]['duplicates'] = \
                            map(lambda u: (u, articles[u]['title'], similarities[(u,urlid)]),
                                filter(lambda u: u in articles and (u,urlid) in similarities, dupset2))

            sorted_dups = sorted(filter(lambda u: u in articles and articles[u]['publish'], dupset),
                    cmp=lambda x,y: self.corpus.compare_articles(articles[x], articles[y]),
                    reverse = True)
            if(len(sorted_dups) > 1):
                # first in sorted set is chosen; rest are dumped
                articles[sorted_dups[0]]['transcript'].append("Preferred over duplicates")

                for urlid in sorted_dups[1:]:
                    if articles[urlid]['publish']:
                        articles[urlid]['publish'] = False
                        articles[urlid]['transcript'].append(("Rejected because duplicate " +
                                "%s was chosen instead") % sorted_dups[0])
Example No. 10
    ([323, 504], "Teleconference robot")]

duplist_stored = []
try:
    duplist_stored = loadpickle(paths['corpus.duplist'])
except:
    pass

notduplist_stored = set()
try:
    notduplist_stored = loadpickle(paths['corpus.notduplist'])
except:
    pass
duplists += duplist_stored

corpus = AINewsCorpus()
summarizer = AINewsSummarizer()

id_begin = 315
id_end = 1500
####################################
# idset records all the news ids
####################################
idset = set()     # idset records all human-selected news ids
checklist = set() # checklist records all human-selected dup pairs
for dupset in duplists:
    for id in dupset[0]:
        idset.add(id)
    n = len(dupset[0])
    sortedlist = sorted(dupset[0])
    for i in range(n-1):
Example No. 11
class AINewsPublisher():
    def __init__(self):
        self.debug = config['ainews.debug']
        self.today = date.today()
        self.earliest_date = self.today - timedelta(days = int(config['ainews.period']))
        self.db = AINewsDB()
        self.corpus = AINewsCorpus()
        self.duplicates = AINewsDuplicates()
        self.svm_classifier = AINewsSVMClassifier()
        self.txtpro = AINewsTextProcessor()
        self.summarizer = AINewsSummarizer()

        self.articles = {}
        self.publishable_articles = []
        self.semiauto_email_output = ""

        self.topicids = {"AIOverview":0, "Agents":1, "Applications":2,
           "CognitiveScience":3, "Education":4,"Ethics":5, 
           "Games":6, "History":7, "Interfaces":8, "MachineLearning":9,
           "NaturalLanguage":10, "Philosophy":11, "Reasoning":12,
           "Representation":13, "Robots":14, "ScienceFiction":15,"Speech":16,
           "Systems":17,  "Vision":18}

    def filter_and_process(self):
        self.articles = self.corpus.get_unprocessed()

        if len(self.articles) == 0: return

        # assume every article will be published; may be set to False from one
        # of the filtering processes below
        for urlid in self.articles:
            self.articles[urlid]['publish'] = True
            self.articles[urlid]['transcript'] = []

        # filter by date
        for urlid in self.articles:
            if self.articles[urlid]['pubdate'] == None:
                # give a meaningful pubdate so that other code doesn't crash
                self.articles[urlid]['pubdate'] = self.today
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append("Rejected due to bogus publication date.")
            elif self.articles[urlid]['pubdate'] < self.earliest_date:
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                        ("Rejected because article is too old " +
                        "(earliest valid date is %s while article was " +
                        "published on %s") % (self.earliest_date.strftime('%F'),
                            self.articles[urlid]['pubdate'].strftime('%F')))

        # filter by blacklist (for urls)
        for urlid in self.articles:
            for black in blacklist_urls:
                if re.search(black, self.articles[urlid]['url']):
                    self.articles[urlid]['publish'] = False
                    self.articles[urlid]['transcript'].append(
                        ("Rejected because url matched blacklisted url %s" % black))
                    break

        # filter by whitelist
        for urlid in self.articles:
            white_wordfreq = self.txtpro.whiteprocess(urlid,
                    self.articles[urlid]['content'])
            self.articles[urlid]['white_wordfreq'] = white_wordfreq

            # require at least two different whitelisted terms
            # unless the article is user-submitted
            if len(white_wordfreq) < 2 \
                    and self.articles[urlid]['publisher'] != 'UserSubmitted':
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                        'Rejected due to only one or no whitelisted terms')

        # update categories based on SVM classifier predictions
        self.svm_classifier.predict(self.articles)

        # drop articles classified as 'NotRelated' unless the article
        # is user-submitted
        for urlid in self.articles:
            if 'NotRelated' in self.articles[urlid]['categories'] \
                    and self.articles[urlid]['publisher'] != 'UserSubmitted':
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                        'Rejected due to NotRelated classification')

        # drop articles with no categories (even if user-submitted)
        for urlid in self.articles:
            if len(self.articles[urlid]['categories']) == 0:
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                        'Rejected due to no selected categories')

        # filter out duplicates; some articles may have 'publish' set to False
        # by this function
        self.duplicates.filter_duplicates(self.articles)

        # add article summaries
        self.summarizer.summarize(self.corpus, self.articles)

        for urlid in self.articles:
            try:
                print urlid, self.articles[urlid]['publish'], \
                    self.articles[urlid]['title'], \
                    self.articles[urlid]['categories'], \
                    self.articles[urlid]['summary']
                print
            except:
                pass

        for urlid in self.articles:
            # update article in database
            self.update_db(self.articles[urlid])

        # mark each as processed
        self.corpus.mark_processed(self.articles.itervalues())

        # save sorted list of articles to be read by AINewsPublisher; sort by
        # duplicate count (more = better), then relevance of source,
        # then by number of categories (more = better)
        unpublished_articles = sorted(
                filter(lambda x: x['publish'], self.articles.values()),
                cmp=lambda x,y: self.corpus.compare_articles(x, y),
                reverse = True)

        max_cat_count = int(config['publisher.max_cat_count'])
        max_count = int(config['publisher.max_count'])
        cat_counts = {}
        for cat in self.corpus.categories:
            cat_counts[cat] = 0
        # choose stories such that no category has more than max_cat_count
        # members and no more than max_count stories have been selected
        # (independent of category); only one of the article's categories needs
        # to have "free space"
        self.publishable_articles = []
        for article in unpublished_articles:
            if len(self.publishable_articles) == max_count:
                break
            free_cat = False
            for cat in article['categories']:
                if cat_counts[cat] < max_cat_count:
                    free_cat = True
                    break
            # if there is a free category or this article has only the
            # Applications category, then it can be published
            if free_cat or (article['categories'] == ['Applications']):
                self.publishable_articles.append(article)
                self.articles[article['urlid']]['transcript'].append('Published')
                self.articles[article['urlid']]['published'] = True
                for cat in article['categories']:
                    cat_counts[cat] += 1

        # record that these articles are publishable
        self.corpus.mark_publishable(self.publishable_articles)

    def update_db(self, article):
        self.db.execute("delete from categories where urlid = %s", article['urlid'])
        for cat in article['categories']:
            self.db.execute("insert into categories values (%s,%s)",
                (article['urlid'], cat))
        self.db.execute("update urllist set summary = %s where urlid = %s",
                        (article['summary'], article['urlid']))

    def get_publishable_articles(self):
        publishable = self.corpus.get_publishable()

        self.publishable_articles = []

        # drop "Applications" category if article has more categories
        for article in publishable:
            if len(article['categories']) > 1:
                article['categories'] = filter(lambda c: c != "Applications",
                                               article['categories'])
            self.publishable_articles.append(article)


    def mark_published(self):
        self.corpus.mark_published(self.publishable_articles)

    def generate_standard_output(self): 
        """
        Generate the standard output for debugging on screen.
        """
        txt = LatestNewsTxt()
        txt.news = self.publishable_articles
        savefile(paths['ainews.output'] + "std_output.txt", str(txt))

    def generate_email_output(self):
        """
        Generate the output for email format.
        """
        email = LatestNewsEmail()
        email.date = self.today.strftime("%B %d, %Y")
        email.year = self.today.strftime("%Y")
        email.news = self.publishable_articles
        email.aitopic_urls = aitopic_urls
        email.topicids = self.topicids
        email_output = str(email)

        savefile(paths['ainews.output'] + "email_output.txt", email_output)
        self.semiauto_email_output = email_output

    def generate_pmwiki_all_output(self):
        pmwiki_all = AllNewsPmWiki()
        pmwiki_all.date = self.today.strftime("%B %d, %Y")
        pmwiki_all.year = self.today.strftime("%Y")
        pmwiki_all.news = self.articles.values()
        savefile(paths['ainews.output'] + "pmwiki_all.txt", str(pmwiki_all))

        # Generate wiki metadata page for each article
        urlids_output = ""
        for urlid in self.articles:
            urlids_output += str(urlid) + '\n'
            article_wiki = ArticlePmWiki()
            article_wiki.year = self.today.strftime("%Y")
            article_wiki.dupthreshold = float(config['duplicates.threshold'])
            article_wiki.n = self.articles[urlid]
            savefile(paths['ainews.output'] + "aiarticles/%d" % urlid,
                    str(article_wiki))
        savefile(paths['ainews.output'] + "urlids_output.txt", urlids_output)
        
    def generate_pmwiki_published_output(self):
        """
        Generate the output in PmWiki page format. It needs to be further
        processed by AINewsPmwiki.php.
        """
        pmwiki = LatestNewsPmWiki()
        pmwiki.date = self.today.strftime("%B %d, %Y")
        pmwiki.year = self.today.strftime("%Y")
        pmwiki.news = self.publishable_articles
        pmwiki.rater = True
        savefile(paths['ainews.output'] + "pmwiki_output.txt", str(pmwiki))
        pmwiki.rater = False
        savefile(paths['ainews.output'] + "pmwiki_output_norater.txt", str(pmwiki))

    def publish_email(self):
        """
        Call AINewsEmail.php to send email through PHP Mail Server
        """
        #cmd = 'php AINewsEmail.php'
        #Popen(cmd, shell = True, stdout = PIPE, stderr = STDOUT).communicate()
        self.publish_email_semiauto()
        
    def publish_email_semiauto(self):
        """
        Create an AINewsSemiAutoEmail.html file that an admin can open to review
        and semi-automatically send the alert to the subscriber list.
        """
        semiauto = """
        <html>
        <body>
        <h1>AI Alert - SemiAuto Sender</h1>
        <form action="http://aaai.org/cgi-dada/mail.cgi?flavor=send_email" method='post'>
        <!-- <form action="welcome.php" method="post"> -->
        <input type='hidden' name='f' value='send_email' />
        <input type='hidden' name='process' value='true' />
        <input type='hidden' name='admin_list' value='alert' />
        <input type='hidden' name='message_subject' value="%s" />
        <input type='hidden' name='email_format' value='HTML' />
        <textarea type='hidden' name="text_message_body">%s</textarea>
        <input type='submit' value='Submit Mailing List Message' />
        </form>
        <h2>Please review the email below. If there are concerns, contact Bruce or Reid:</h2>
        <p>
        %s
        </p>
        </body>
        </html>
        """ % ("AI Alert - "+str(self.today.strftime("%B %d, %Y")),
               self.semiauto_email_output, self.semiauto_email_output)
        savefile(paths['ainews.html'] + "semiauto_email.html", semiauto)

    def publish_pmwiki(self):
        """
        Call AINewsPmwiki.php to publish latest news to AAAI Pmwiki website.
        """
        cmd = 'php AINewsPmwiki.php'
        Popen(cmd, shell = True).wait()
        
    def update_rss(self):
        rssitems = []
        # insert latest news into rssitems
        for article in self.publishable_articles:
            rssitems.append(PyRSS2Gen.RSSItem(
                title = article['title'],
                link = article['url'],
                description = article['summary'],
                guid = PyRSS2Gen.Guid(article['url']),
                pubDate = datetime(article['pubdate'].year, \
                    article['pubdate'].month, article['pubdate'].day)))
            
        rssfile = paths['ainews.rss'] + "news.xml"
        publish_rss(rssfile, rssitems)
        
        
        topicrsses = ['overview', 'agent', 'apps', 'cogsci', 'edu', 'ethsoc', 
            'game', 'hist', 'interf', 'ml', 'nlp', 'phil', 'reason',
             'rep', 'robot', 'scifi', 'speech', 'systems',  'vision']
        topicitems = []
        for i in range(len(topicrsses)):
            topicitems.append([])
        urlset = set()
        for article in self.publishable_articles:
            if article['url'] in urlset: continue
            urlset.add(article['url'])
            for cat in article['categories']:
                topicid = self.topicids[cat]
                topicitems[topicid].append(PyRSS2Gen.RSSItem(
                        title = article['title'],
                        link = article['url'],
                        description = article['summary'],
                        guid = PyRSS2Gen.Guid(article['url']),
                        pubDate = datetime(article['pubdate'].year, \
                            article['pubdate'].month, article['pubdate'].day)))
            
        for i in range(len(topicrsses)):
            rssfile = paths['ainews.rss'] + topicrsses[i]+'.xml'
            if len(topicitems[i]) != 0:
                publish_rss(rssfile, topicitems[i])
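
The story selection near the end of filter_and_process() is a greedy pass over the ranked articles: it stops once max_count stories are chosen and skips any article whose categories are all full, i.e. already hold max_cat_count stories. A stripped-down sketch of that core rule, omitting the special case for articles whose only category is Applications (names and data are invented for illustration):

# Sketch only: greedy selection with a total cap and per-category caps.
def select_articles(ranked, max_count, max_cat_count):
    chosen = []
    cat_counts = {}
    for article in ranked:
        if len(chosen) == max_count:
            break
        # at least one of the article's categories must still have free space
        if any(cat_counts.get(cat, 0) < max_cat_count for cat in article['categories']):
            chosen.append(article)
            for cat in article['categories']:
                cat_counts[cat] = cat_counts.get(cat, 0) + 1
    return chosen

ranked = [{'title': 'A', 'categories': ['Robots']},
          {'title': 'B', 'categories': ['Robots']},
          {'title': 'C', 'categories': ['Robots', 'Vision']},
          {'title': 'D', 'categories': ['Vision']}]
print [a['title'] for a in select_articles(ranked, 3, 2)]   # ['A', 'B', 'C']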
Example No. 12
# This file is part of NewsFinder.
# https://github.com/joshuaeckroth/AINews
#
# Copyright (c) 2011 by the Association for the Advancement of
# Artificial Intelligence. This program and parts of it may be used and
# distributed without charge for non-commercial purposes as long as this
# notice is included.

import sys
import re
from AINewsConfig import paths
from AINewsCorpus import AINewsCorpus
from AINewsCentroidClassifier import AINewsCentroidClassifier

aicorpus = AINewsCorpus()

def dissim(tfidf1, tfidf2, category = None):
    d = 1.0 - aicorpus.cos_sim(tfidf1, tfidf2, category)
    if d < 0.1E-10: d = 0.0
    return d

if __name__ == "__main__":

    directory = sys.argv[1]
    ident = sys.argv[2]

    corpus = aicorpus.load_corpus(ident, 1.0)[0]

    centroid = AINewsCentroidClassifier(aicorpus)
    for category in aicorpus.categories:
        centroid.train_centroid(category, corpus, 'centroid_eval', True)
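
Both the duplicate filter in Example No. 9 and the dissim() helper above lean on AINewsCorpus.cos_sim, which is not shown in these examples. Assuming it is an ordinary cosine similarity over sparse {wordid: tfidf} dictionaries, a minimal stand-alone sketch looks like this, ignoring the optional category argument; the sample vectors are invented:

# Sketch only: cosine similarity over sparse tf-idf dicts (category argument ignored).
from math import sqrt

def cos_sim(tfidf1, tfidf2):
    dot = sum(tfidf1[w] * tfidf2[w] for w in tfidf1 if w in tfidf2)
    norm1 = sqrt(sum(v * v for v in tfidf1.values()))
    norm2 = sqrt(sum(v * v for v in tfidf2.values()))
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0
    return dot / (norm1 * norm2)

a = {1: 0.5, 2: 0.5, 3: 0.1}
b = {2: 0.4, 3: 0.1, 4: 0.7}
print "cos_sim = %.3f, dissim = %.3f" % (cos_sim(a, b), 1.0 - cos_sim(a, b))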
Example No. 13
class AINewsPublisher():
    def __init__(self):
        self.debug = config['ainews.debug']
        self.today = date.today()
        self.earliest_date = self.today - timedelta(
            days=int(config['ainews.period']))
        self.db = AINewsDB()
        self.corpus = AINewsCorpus()
        self.duplicates = AINewsDuplicates()
        self.txtpro = AINewsTextProcessor()
        self.weka = AINewsWekaClassifier()

        self.articles = {}
        self.semiauto_email_output = ""

    def filter_and_process(self):
        self.articles = self.corpus.get_unprocessed()

        if len(self.articles) == 0: return

        # assume every article will be published; may be set to False from one
        # of the filtering processes below
        for urlid in self.articles:
            self.articles[urlid]['publish'] = True
            self.articles[urlid]['transcript'] = []

        # filter by date
        print "Filtering by date..."
        for urlid in self.articles:
            if self.articles[urlid]['pubdate'] == None:
                # give a meaningful pubdate so that other code doesn't crash
                self.articles[urlid]['pubdate'] = self.today
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                    "Rejected due to bogus publication date.")
            elif self.articles[urlid]['pubdate'] < self.earliest_date:
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                    ("Rejected because article is too old " +
                     "(earliest valid date is %s while article was " +
                     "published on %s") %
                    (self.earliest_date.strftime('%F'),
                     self.articles[urlid]['pubdate'].strftime('%F')))

        # filter by blacklist (for urls)
        print "Filtering by blacklist..."
        for urlid in self.articles:
            for black in blacklist_urls:
                if re.search(black, self.articles[urlid]['url']):
                    self.articles[urlid]['publish'] = False
                    self.articles[urlid]['transcript'].append(
                        ("Rejected because url matched blacklisted url %s" %
                         black))
                    break

        # filter by whitelist
        print "Filtering by whitelist..."
        for urlid in self.articles:
            white_wordfreq = self.txtpro.whiteprocess(
                urlid, self.articles[urlid]['content'])
            self.articles[urlid]['white_wordfreq'] = white_wordfreq

            # require at least two different whitelisted terms
            # unless the article is user-submitted
            if len(white_wordfreq) < 2 \
                    and self.articles[urlid]['source'] != 'User Submitted':
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                    'Rejected due to only one or no whitelisted terms')

        # update categories based on classifier predictions
        print "Classifying..."
        self.weka.predict(self.articles)

        # drop articles with no categories
        print "Dropping articles with no categories..."
        for urlid in self.articles:
            if len(self.articles[urlid]['categories']) == 0:
                self.articles[urlid]['publish'] = False
                self.articles[urlid]['transcript'].append(
                    'Rejected due to no selected categories')

        # filter out duplicates; some articles may have 'publish' set to False
        # by this function
        print "Filtering duplicates..."
        self.duplicates.filter_duplicates(self.articles)

        for urlid in self.articles:
            print urlid, self.articles[urlid]['publish'], \
                self.articles[urlid]['title'], \
                self.articles[urlid]['categories'], \
                self.articles[urlid]['summary']
            print

        print "Grabbing images..."
        for urlid in self.articles:
            # grab and convert article image (if it exists)
            self.grab_convert_image(self.articles[urlid])

            # update article in database
            self.update_db(self.articles[urlid])

        # mark each as processed
        print "Marking as processed."
        self.corpus.mark_processed(self.articles.itervalues())

    def grab_convert_image(self, article):
        if len(article['image_url']) == 0:
            article['image_path'] = ''
            return
        try:
            f = urllib2.urlopen(article['image_url'])
            img = open(
                "%s%s" % (paths['ainews.image_dir'], str(article['urlid'])),
                'w')
            img.write(f.read())
            img.close()
            # produces [urlid].jpg
            Popen("%s -format jpg -gravity Center -thumbnail 200x200 %s%s" % \
                      (paths['imagemagick.mogrify'], paths['ainews.image_dir'],
                       str(article['urlid'])),
                  shell = True).communicate()
            # remove [urlid] file (with no extension)
            remove("%s%s" % (paths['ainews.image_dir'], str(article['urlid'])))
            article['image_path'] = "public://newsfinder_images/%s.jpg" % article['urlid']
        except Exception as e:
            print "Failed converting image for %d: %s" % (article['urlid'], e)
            article['image_path'] = ''

    def update_db(self, article):
        self.db.execute("delete from categories where urlid = %s",
                        article['urlid'])
        for cat in article['categories']:
            self.db.execute("insert into categories values (%s,%s)",
                            (article['urlid'], cat))

    def generate_feed_import(self):
        """
        Generate XML file for feed import on the Drupal site.
        """
        xml = FeedImport()
        for article in self.articles.values():
            article['source'] = re.sub(r'&', '&amp;', article['source'])
        xml.news = self.articles.values()
        savefile(paths['ainews.output_xml'] + "news.xml", str(xml))

    def generate_email_output(self):
        articles = []
        try:
            f = urllib2.urlopen(paths['ainews.top_weekly_news_xml'])
            xml = etree.parse(f)
            for node in xml.iter("node"):
                print "Found", node.findtext("Title")
                published = node.findtext("Publication_date")
                articles.append({
                    'title':
                    node.findtext("Title"),
                    'source':
                    node.findtext("Source"),
                    'topics':
                    re.sub(r'/topic/', 'http://aitopics.org/topic/',
                           node.findtext("Topics")),
                    'pubdate':
                    date(int(published[0:4]), int(published[5:7]),
                         int(published[8:10])),
                    'summary':
                    re.sub(
                        r'</p>(</blockquote>)?$', '',
                        re.sub(r'^(<blockquote>)?<p>', '',
                               node.findtext("Body"))),
                    'url':
                    node.findtext("Original_link"),
                    'link':
                    re.sub(r'/news/', 'http://aitopics.org/news/',
                           node.findtext("Link")),
                    'image':
                    re.sub(
                        r'<img',
                        '<img align="left" style="margin: 8px 8px 8px 0; border: 1px solid #ccc; padding: 5px; background: white;" ',
                        node.findtext("Representative_image"))
                })
        except Exception as e:
            print e

        email = LatestNewsEmail()
        email.date = self.today.strftime("%B %d, %Y")
        email.year = self.today.strftime("%Y")
        email.articles = articles
        email_output = str(email)

        return email_output
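
For context, a plausible driver would call the publisher methods above in pipeline order. The module name and the exact call sequence here are assumptions for illustration, not something shown in these examples:

# Sketch only: module name and ordering are assumed, not taken from the examples.
from AINewsPublisher import AINewsPublisher

publisher = AINewsPublisher()
publisher.filter_and_process()            # classify, filter, and mark articles as processed
publisher.generate_feed_import()          # write news.xml for the Drupal feed import
print publisher.generate_email_output()   # render the weekly alert email body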
Example No. 14
class AINewsPublisher:
    def __init__(self):
        self.debug = config["ainews.debug"]
        self.today = date.today()
        self.earliest_date = self.today - timedelta(days=int(config["ainews.period"]))
        self.db = AINewsDB()
        self.corpus = AINewsCorpus()
        self.duplicates = AINewsDuplicates()
        self.txtpro = AINewsTextProcessor()
        self.weka = AINewsWekaClassifier()

        self.articles = {}
        self.semiauto_email_output = ""

    def filter_and_process(self):
        self.articles = self.corpus.get_unprocessed()

        if len(self.articles) == 0:
            return

        # assume every article will be published; may be set to False from one
        # of the filtering processes below
        for urlid in self.articles:
            self.articles[urlid]["publish"] = True
            self.articles[urlid]["transcript"] = []

        # filter by date
        print "Filtering by date..."
        for urlid in self.articles:
            if self.articles[urlid]["pubdate"] == None:
                # give a meaningful pubdate so that other code doesn't crash
                self.articles[urlid]["pubdate"] = self.today
                self.articles[urlid]["publish"] = False
                self.articles[urlid]["transcript"].append("Rejected due to bogus publication date.")
            elif self.articles[urlid]["pubdate"] < self.earliest_date:
                self.articles[urlid]["publish"] = False
                self.articles[urlid]["transcript"].append(
                    (
                        "Rejected because article is too old "
                        + "(earliest valid date is %s while article was "
                        + "published on %s"
                    )
                    % (self.earliest_date.strftime("%F"), self.articles[urlid]["pubdate"].strftime("%F"))
                )

        # filter by blacklist (for urls)
        print "Filtering by blacklist..."
        for urlid in self.articles:
            for black in blacklist_urls:
                if re.search(black, self.articles[urlid]["url"]):
                    self.articles[urlid]["publish"] = False
                    self.articles[urlid]["transcript"].append(
                        ("Rejected because url matched blacklisted url %s" % black)
                    )
                    break

        # filter by whitelist
        print "Filtering by whitelist..."
        for urlid in self.articles:
            white_wordfreq = self.txtpro.whiteprocess(urlid, self.articles[urlid]["content"])
            self.articles[urlid]["white_wordfreq"] = white_wordfreq

            # require at least two different whitelisted terms
            # unless the article is user-submitted
            if len(white_wordfreq) < 2 and self.articles[urlid]["source"] != "User Submitted":
                self.articles[urlid]["publish"] = False
                self.articles[urlid]["transcript"].append("Rejected due to only one or no whitelisted terms")

        # update categories based on classifier predictions
        print "Classifying..."
        self.weka.predict(self.articles)

        # drop articles with no categories
        print "Dropping articles with no categories..."
        for urlid in self.articles:
            if len(self.articles[urlid]["categories"]) == 0:
                self.articles[urlid]["publish"] = False
                self.articles[urlid]["transcript"].append("Rejected due to no selected categories")

        # filter out duplicates; some articles may have 'publish' set to False
        # by this function
        print "Filtering duplicates..."
        self.duplicates.filter_duplicates(self.articles)

        for urlid in self.articles:
            print urlid, self.articles[urlid]["publish"], \
                self.articles[urlid]["title"], \
                self.articles[urlid]["categories"], \
                self.articles[urlid]["summary"]
            print

        print "Grabbing images..."
        for urlid in self.articles:
            # grab and convert article image (if it exists)
            self.grab_convert_image(self.articles[urlid])

            # update article in database
            self.update_db(self.articles[urlid])

        # mark each as processed
        print "Marking as processed."
        self.corpus.mark_processed(self.articles.itervalues())

    def grab_convert_image(self, article):
        if len(article["image_url"]) == 0:
            article["image_path"] = ""
            return
        try:
            f = urllib2.urlopen(article["image_url"])
            img = open("%s%s" % (paths["ainews.image_dir"], str(article["urlid"])), "w")
            img.write(f.read())
            img.close()
            # produces [urlid].jpg
            Popen(
                "%s -format jpg -gravity Center -thumbnail 200x200 %s%s"
                % (paths["imagemagick.mogrify"], paths["ainews.image_dir"], str(article["urlid"])),
                shell=True,
            ).communicate()
            # remove [urlid] file (with no extension)
            remove("%s%s" % (paths["ainews.image_dir"], str(article["urlid"])))
            article["image_path"] = "public://newsfinder_images/%s.jpg" % article["urlid"]
        except Exception as e:
            print "Failed converting image for %d: %s" % (article["urlid"], e)
            article["image_path"] = ""

    def update_db(self, article):
        self.db.execute("delete from categories where urlid = %s", article["urlid"])
        for cat in article["categories"]:
            self.db.execute("insert into categories values (%s,%s)", (article["urlid"], cat))

    def generate_feed_import(self):
        """
        Generate XML file for feed import on the Drupal site.
        """
        xml = FeedImport()
        for article in self.articles.values():
            article["source"] = re.sub(r"&", "&amp;", article["source"])
        xml.news = self.articles.values()
        savefile(paths["ainews.output_xml"] + "news.xml", str(xml))

    def generate_email_output(self):
        articles = []
        try:
            f = urllib2.urlopen(paths["ainews.top_weekly_news_xml"])
            xml = etree.parse(f)
            for node in xml.iter("node"):
                print "Found", node.findtext("Title")
                published = node.findtext("Publication_date")
                articles.append(
                    {
                        "title": node.findtext("Title"),
                        "source": node.findtext("Source"),
                        "topics": re.sub(r"/topic/", "http://aitopics.org/topic/", node.findtext("Topics")),
                        "pubdate": date(int(published[0:4]), int(published[5:7]), int(published[8:10])),
                        "summary": re.sub(
                            r"</p>(</blockquote>)?$", "", re.sub(r"^(<blockquote>)?<p>", "", node.findtext("Body"))
                        ),
                        "url": node.findtext("Original_link"),
                        "link": re.sub(r"/news/", "http://aitopics.org/news/", node.findtext("Link")),
                        "image": re.sub(
                            r"<img",
                            '<img align="left" style="margin: 8px 8px 8px 0; border: 1px solid #ccc; padding: 5px; background: white;" ',
                            node.findtext("Representative_image"),
                        ),
                    }
                )
        except Exception as e:
            print e

        email = LatestNewsEmail()
        email.date = self.today.strftime("%B %d, %Y")
        email.year = self.today.strftime("%Y")
        email.articles = articles
        email_output = str(email)

        return email_output
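
The feed parsing in generate_email_output() relies on an etree whose import is not shown; it could be lxml.etree or the standard xml.etree.ElementTree, since only parse(), iter(), and findtext() are used. A minimal stand-alone sketch with the Python 2.7 standard library and an invented sample document, assuming the node layout used above:

# Sketch only: sample XML is invented; field names follow the code above.
from StringIO import StringIO
from xml.etree import ElementTree as etree

sample = """<nodes>
  <node>
    <Title>Example headline</Title>
    <Source>Example Source</Source>
    <Publication_date>2013-01-15</Publication_date>
  </node>
</nodes>"""

xml = etree.parse(StringIO(sample))
for node in xml.iter("node"):
    published = node.findtext("Publication_date")
    print node.findtext("Title"), "from", node.findtext("Source"), \
        "(%s-%s-%s)" % (published[0:4], published[5:7], published[8:10])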