示例#1
0
文件: generate.py 项目: keho98/argos
def generate(num=200):
    """Write a seed dump of articles that come from the seed sources.

    Reads ``seed_sources.json`` from the directory containing this module,
    keeps the articles returned by ``load_articles()`` whose ``'source'``
    value is in that data, and dumps the first ``num`` of them to
    ``seed.json`` in the same directory.

    Args:
        num: Maximum number of articles to write.
    """
    this_dir = path.dirname(__file__)
    articles = load_articles()

    # Load seed sources; the context manager closes the handle promptly.
    with open(path.join(this_dir, 'seed_sources.json'), 'r') as src_file:
        srcs = json.load(src_file)

    # Filter down to articles from the specified sources.
    filtered = [article for article in articles if article['source'] in srcs]

    # Grab the maximum articles from the top.
    selected = filtered[:num]

    # Dump the results to another json. Closing the file guarantees the
    # data is flushed to disk before the function returns.
    with open(path.join(this_dir, 'seed.json'), 'w') as new_dump:
        json.dump(selected, new_dump)
示例#2
0
def generate(keywords, num=5000):
    """Save articles containing all of ``keywords`` as individual text files.

    Only the first ``num`` articles returned by ``load_articles()`` are
    examined. An article matches when the tokens of its ``'text'`` include
    every keyword. Each match is written, UTF-8 encoded, to
    ``unorganized_articles/<title>.txt`` under this module's directory.

    Args:
        keywords: Iterable of tokens that must all appear in an article.
        num: Maximum number of articles to examine.
    """
    this_dir = path.dirname(__file__)
    articles = load_articles()

    # Filter down to articles whose tokens contain every keyword.
    results = []
    articles = articles[:num]
    # Loop invariants: build the keyword set and the total only once.
    required = set(keywords)
    total = len(articles)
    for idx, article in enumerate(articles):
        article_words = tokenize(article['text'])
        if set(article_words).issuperset(required):
            results.append(article)
        progress_bar(idx / total * 100)

    # Store articles into separate text files.
    for article in results:
        article_path = 'unorganized_articles/{0}.txt'.format(article['title'])
        # Use a context manager so each file is flushed and closed instead
        # of leaking one handle per matching article.
        with open(path.join(this_dir, article_path), 'wb') as f:
            f.write(article['text'].encode('utf-8'))
示例#3
0
文件: generate.py 项目: keho98/argos
def generate(keywords, num=5000):
    """Save articles containing all of ``keywords`` as individual text files.

    Only the first ``num`` articles returned by ``load_articles()`` are
    examined. An article matches when the tokens of its ``'text'`` include
    every keyword. Each match is written, UTF-8 encoded, to
    ``unorganized_articles/<title>.txt`` under this module's directory.

    Args:
        keywords: Iterable of tokens that must all appear in an article.
        num: Maximum number of articles to examine.
    """
    this_dir = path.dirname(__file__)
    articles = load_articles()

    # Filter down to articles whose tokens contain every keyword.
    results = []
    articles = articles[:num]
    # Loop invariants: build the keyword set and the total only once.
    required = set(keywords)
    total = len(articles)
    for idx, article in enumerate(articles):
        article_words = tokenize(article['text'])
        if set(article_words).issuperset(required):
            results.append(article)
        progress_bar(idx / total * 100)

    # Store articles into separate text files.
    for article in results:
        article_path = 'unorganized_articles/{0}.txt'.format(article['title'])
        # Use a context manager so each file is flushed and closed instead
        # of leaking one handle per matching article.
        with open(path.join(this_dir, article_path), 'wb') as f:
            f.write(article['text'].encode('utf-8'))