def trainModelFromTopics(s):
    """Crawl article links for every topic and hand them to the corpus generator.

    Reads ``s.topics``, ``s.domain``, ``s.corpus_dir``, ``s.verbose`` and
    ``s.logger``.  For each topic, ``s.crawl_links(topic, s.domain)`` is
    called and the returned links are gathered, in topic order, into one
    list.  That list is passed to ``NewsCorpusGenerator.generate_corpus``;
    the generator is created with the ``'mongo'`` backend argument and the
    ``'DomainModelCorpora'`` database name.

    Progress messages are logged only when ``s.verbose`` is truthy.
    Nothing is returned.
    """
    if s.verbose:
        s.logger.info("trainModelFromTopics : Topics :" + str(s.topics))

    # The generator is built before crawling starts, as in the original flow.
    corpus_builder = NewsCorpusGenerator(
        s.corpus_dir,
        'mongo',
        mongo_db_name='DomainModelCorpora',
        domain=s.domain
    )

    # 1. crawl the topics, flattening the per-topic results into one list
    article_links = [
        link
        for topic in s.topics
        for link in s.crawl_links(topic, s.domain)
    ]

    if s.verbose:
        summary = "Total %d links to extract" % len(article_links)
        s.logger.info(summary + " links ==>" + str(article_links))

    # 2. store results in mongoDB
    corpus_builder.generate_corpus(article_links)

    if s.verbose:
        stats = corpus_builder.get_stats()
        s.logger.info("trainModelFromTopics : Stats:" + str(stats))
# NOTE(review): `article_links`, `ex` and `get_links` are not defined in this
# section; they are presumably created earlier in the file -- confirm.
#
# Each stanza below looks up links for one category's search terms, prints
# how many links came back, and appends them to the shared `article_links`
# list.  `print(...)` is written in the parenthesised single-argument form so
# the output is unchanged on Python 2 and the code also parses on Python 3.

# Commodities
commodities_terms = ['silver', 'gold', 'commodities']
commo = get_links(commodities_terms, 'Commodities')
print(len(commo))
article_links.extend(commo)

# Fraud & Insider Trading
fraud_terms = ['insider trading', 'Ponzi Scheme', 'finance fraud']
fraud = get_links(fraud_terms, 'Fraud')
print(len(fraud))
article_links.extend(fraud)

# Litigation
lit_terms = ['company settlement', 'company litigation', 'company lawsuit']
lit = get_links(lit_terms, 'Litigation')
print(len(lit))
article_links.extend(lit)

# Earning Reports
er_terms = ['earning reports', 'quarterly results', 'financial statement']
er = get_links(er_terms, 'Earning_Reports')
print(len(er))
article_links.extend(er)

# Extract Content & Create Corpus
print("Total %d links to extract" % len(article_links))
ex.generate_corpus(article_links)
print(ex.get_stats())