Exemplo n.º 1
0
    def trainModelFromTopics(s):
        """Crawl the configured topics and generate a corpus from the links.

        Reads ``s.topics``, ``s.domain`` and ``s.corpus_dir``; gathers links
        for every topic through ``s.crawl_links`` and passes the combined
        list to a ``NewsCorpusGenerator``. Messages are written to
        ``s.logger`` only while ``s.verbose`` is truthy. Returns ``None``.
        """
        if s.verbose:
            s.logger.info("trainModelFromTopics : Topics :" + str(s.topics))

        # The generator is built before any crawling starts.
        generator = NewsCorpusGenerator(s.corpus_dir,
                                        'mongo',
                                        mongo_db_name='DomainModelCorpora',
                                        domain=s.domain)

        # 1. crawl the topics, flattening the per-topic results into one list
        article_links = [link
                         for topic in s.topics
                         for link in s.crawl_links(topic, s.domain)]
        if s.verbose:
            summary = "Total %d links to extract" % len(article_links)
            s.logger.info(summary + " links ==>" + str(article_links))

        # 2. hand the links to the generator (created with the 'mongo' option)
        generator.generate_corpus(article_links)
        if s.verbose:
            stats_text = str(generator.get_stats())
            s.logger.info("trainModelFromTopics : Stats:" + stats_text)
Exemplo n.º 2
0
def _collect_links(terms, category):
    """Fetch links for one category, report the count, and accumulate them.

    Calls ``get_links(terms, category)``, prints how many links came back,
    appends them to the module-level ``article_links`` and returns them so
    the caller can keep a per-category reference.
    """
    links = get_links(terms, category)
    # A parenthesised single argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(len(links))
    article_links.extend(links)
    return links


# Commodities
commodities_terms = ['silver', 'gold', 'commodities']
commo = _collect_links(commodities_terms, 'Commodities')

# Fraud & Insider Trading
fraud_terms = ['insider trading', 'Ponzi Scheme', 'finance fraud']
fraud = _collect_links(fraud_terms, 'Fraud')

# Litigation
lit_terms = ['company settlement', 'company litigation', 'company lawsuit']
lit = _collect_links(lit_terms, 'Litigation')

# Earning Reports
er_terms = ['earning reports', 'quarterly results', 'financial statement']
er = _collect_links(er_terms, 'Earning_Reports')

# Extract Content & Create Corpus
print("Total %d links to extract" % len(article_links))
ex.generate_corpus(article_links)
print(ex.get_stats())