Example #1
def get_news_sentence(self, answer):
    # Create a database of news articles about the subject of the question
    cg = NewsCorpusGenerator('temp_news_corpus', 'sqlite')
    links = cg.google_news_search(answer, 'Standard', 5)
    cg.generate_corpus(links)
    conn = sqlite3.connect('temp_news_corpus/corpus.db')
    news_strings = []
    for row in conn.execute('SELECT body FROM articles'):
        news_strings.append(
            str(row).decode('unicode_escape').encode('ascii', 'ignore'))
    conn.close()
    os.remove('temp_news_corpus/corpus.db')  # Remove the temporary database
    for n in news_strings[1:]:
        summary = summarize(n)
        if summary != u"" and summary != []:
            # str(row) leaves a '(u"' tuple prefix on the text; strip it if present
            if summary[0:3] == '(u"':
                return summary[3:]
            else:
                return summary
    return ''
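For comparison, here is a minimal standalone sketch of the same sqlite round-trip, assuming the corpus.db layout queried above (an articles table with a body column); the function name fetch_article_bodies is illustrative, not part of the library.

# Minimal standalone sketch of the sqlite workflow above, outside of any class.
# fetch_article_bodies is an illustrative name, not a library function.
import os
import sqlite3
from news_corpus_builder import NewsCorpusGenerator

def fetch_article_bodies(query, max_links=5):
    cg = NewsCorpusGenerator('temp_news_corpus', 'sqlite')
    links = cg.google_news_search(query, 'Standard', max_links)
    cg.generate_corpus(links)
    conn = sqlite3.connect('temp_news_corpus/corpus.db')
    # Reading row[0] returns the body text directly and avoids the tuple-repr
    # prefix that str(row) leaves behind in the example above
    bodies = [row[0] for row in conn.execute('SELECT body FROM articles')]
    conn.close()
    os.remove('temp_news_corpus/corpus.db')
    return bodies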
Example #2
    def trainModelFromTopics(s):
        if s.verbose:
            s.logger.info("trainModelFromTopics : Topics :" + str(s.topics))
        ncg = NewsCorpusGenerator(s.corpus_dir,
                                  'mongo',
                                  mongo_db_name='DomainModelCorpora',
                                  domain=s.domain)

        # 1. Crawl the topics and collect article links
        article_links = []
        for t in s.topics:
            article_links.extend(s.crawl_links(t, s.domain))
        if s.verbose:
            s.logger.info("Total %d links to extract ==> %s" %
                          (len(article_links), str(article_links)))

        # 2. Extract the content and store the corpus in MongoDB
        ncg.generate_corpus(article_links)
        if s.verbose:
            s.logger.info("trainModelFromTopics : Stats:" +
                          str(ncg.get_stats()))
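The same mongo-backed constructor can be exercised outside the class; the sketch below is only a rough outline, with a placeholder corpus directory, database name and topic list, the topic itself reused as its category label, and the domain keyword from the example omitted.

# Rough standalone sketch of the mongo-backed flow above; directory, database
# name and topics are placeholder values, not fixed ones.
from news_corpus_builder import NewsCorpusGenerator

ncg = NewsCorpusGenerator('corpus_dir', 'mongo',
                          mongo_db_name='DomainModelCorpora')

topics = ['machine learning', 'speech recognition']  # placeholder topics
article_links = []
for t in topics:
    # Links returned by google_news_search feed straight into generate_corpus
    article_links.extend(ncg.google_news_search(t, t, 10))

ncg.generate_corpus(article_links)
print ncg.get_stats()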
Example #3
# Commodities
commodities_terms = ['silver','gold','commodities']
commo = get_links(commodities_terms,'Commodities')
print len(commo)
article_links.extend(commo)


# Fraud & Insider Trading
fraud_terms = ['insider trading','Ponzi Scheme','finance fraud']
fraud = get_links(fraud_terms,'Fraud')
print len(fraud)
article_links.extend(fraud)

# Litigation 
lit_terms = ['company settlement','company litigation','company lawsuit']
lit = get_links(lit_terms,'Litigation')
print len(lit)
article_links.extend(lit)

# Earnings Reports
er_terms = ['earning reports','quarterly results','financial statement']
er = get_links(er_terms,'Earning_Reports')
print len(er)
article_links.extend(er)

# Extract Content & Create Corpus
print "Total %d links to extract" % len(article_links)
ex.generate_corpus(article_links)
print ex.get_stats()
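This snippet relies on a get_links helper and an ex generator instance defined elsewhere in the file; a plausible minimal reconstruction, assuming get_links simply fans each search term out to ex.google_news_search under a shared category label, could look like this:

# Hypothetical reconstruction of the get_links helper used above; the real
# definition is not shown in this excerpt.
from news_corpus_builder import NewsCorpusGenerator

ex = NewsCorpusGenerator('financial_news_data')
article_links = []

def get_links(terms, category, links_per_term=50):
    links = []
    for term in terms:
        # Every link retrieved for these terms gets the same category label
        links.extend(ex.google_news_search(term, category, links_per_term))
    return links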
Example #5
import os
from news_corpus_builder import NewsCorpusGenerator
from iab_cat_load import iab_tier2

# Location to save generated corpus
news_corpus_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               'news_data')

# Save results to sqlite or to files per article
ex = NewsCorpusGenerator(news_corpus_dir)

for subcategory, category in iab_tier2.iteritems():
    print 'Getting search result for [' + subcategory + '] in [' + category + ']'
    # Retrieve up to 100 links for this subcategory and label them with its tier-1 category
    links = ex.google_news_search(subcategory, category, 100)
    print 'saving...'
    # Generate and save corpus; skip subcategories whose extraction fails
    try:
        ex.generate_corpus(links)
    except Exception:
        pass
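iab_tier2 is loaded from the separate iab_cat_load module and is not shown here; judging from the loop it is a dict mapping an IAB tier-2 subcategory to its tier-1 category, so a tiny hypothetical stand-in for experimenting without that module might be:

# Hypothetical stand-in for iab_tier2; the real mapping is loaded from the
# separate iab_cat_load module and maps tier-2 subcategories to tier-1 categories.
iab_tier2 = {
    'Stocks': 'Personal Finance',
    'Dogs': 'Pets',
    'College Basketball': 'Sports',
}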