def generate_idf_model():
    """Compute IDF scores for tokens and bigrams and store them as JSON.

    Iterates over the articles yielded by ``load_articles(50000)`` and counts,
    for every distinct token and bigram, the number of articles it occurs in
    (each article contributes at most once per term because the terms are
    de-duplicated with ``set``).  Two files are then written through
    ``data_path``: ``idf_tokens.json`` and ``idf_bigrams.json``.  Each holds
    ``{'articles': <article count>, 'terms': {<term>: <idf>}}`` with
    ``idf = log(article count / (1 + document frequency))``.

    NOTE(review): this file contains a second, later definition of
    ``generate_idf_model``; the later one is what the name ends up bound to.
    """
    tokens_global = defaultdict(int)
    bigrams_global = defaultdict(int)
    print("Extracting...")
    # Named ``article_count`` rather than ``articles`` so the local does not
    # shadow the module-level ``articles`` object used by ``topic_graph``.
    article_count = 0
    for i, article in enumerate(load_articles(50000)):
        for token in set(article['tokens']):
            tokens_global[token] += 1
        for bigram in set(article['bigrams']):
            bigrams_global[bigram] += 1
        if i % 100 == 0:
            print("Done: %s" % i)
        article_count += 1
    print("Calculating IDF...")
    for file_name, terms in (('tokens', tokens_global),
                             ('bigrams', bigrams_global)):
        data = {'articles': article_count}
        data['terms'] = dict()
        for term, count in terms.items():
            # Force true division: with two ints Python 2 floors the ratio,
            # which truncates the IDF and yields log(0) -> ValueError for a
            # term that occurs in every article.
            idf = math.log(float(article_count) / (1 + count))
            data['terms'][term] = idf
        with open(data_path('idf_%s.json' % file_name), 'wb') as fh:
            json.dump(data, fh)
def topic_graph():
    """Build a weighted term co-occurrence graph and export it as GEXF.

    Loads the ``'tokens'`` IDF model, then walks the records returned by
    ``articles.find(_limit=10000)``.  Articles whose ``article_url`` contains
    ``'spiegel.de/international'`` are ignored.  For every other article the
    first 15 entries of ``article_terms(model, article)`` are taken (each
    entry unpacks as a ``(term, score)`` pair; presumably ordered by
    relevance - confirm against ``article_terms``) and every unordered pair
    of them adds the product of the two scores to that pair's edge weight.

    Only the 20000 heaviest edges are put into the graph, which is written to
    ``topic_graph_abridged.gexf`` under ``data_path``.
    """
    print("Loading IDF model...")
    model = load_idf_model('tokens')
    print("Making a graph...")

    edge_weights = defaultdict(int)
    for article in articles.find(_limit=10000):
        if 'spiegel.de/international' not in article['article_url']:
            top_terms = article_terms(model, article)[:15]
            for first, second in combinations(top_terms, 2):
                term_a, score_a = first
                term_b, score_b = second
                # Order the endpoints so (a, b) and (b, a) share one key.
                pair = (max(term_a, term_b), min(term_a, term_b))
                edge_weights[pair] += score_a * score_b

    # Heaviest edges first; keep only the top 20000 for the abridged graph.
    ranked_edges = sorted(edge_weights.items(),
                          key=lambda item: item[1],
                          reverse=True)

    graph = nx.Graph()
    for (source, target), weight in ranked_edges[:20000]:
        graph.add_edge(source, target, weight=weight)
    nx.write_gexf(graph, data_path('topic_graph_abridged.gexf'))
def generate_idf_model():
    """Compute IDF scores for tokens and bigrams and store them as JSON.

    Iterates over the articles yielded by ``load_articles(50000)`` and counts,
    for every distinct token and bigram, the number of articles it occurs in
    (each article contributes at most once per term because the terms are
    de-duplicated with ``set``).  Two files are then written through
    ``data_path``: ``idf_tokens.json`` and ``idf_bigrams.json``.  Each holds
    ``{'articles': <article count>, 'terms': {<term>: <idf>}}`` with
    ``idf = log(article count / (1 + document frequency))``.

    NOTE(review): an earlier definition of ``generate_idf_model`` exists
    higher up in this file; this later one replaces it when the module loads.
    """
    tokens_global = defaultdict(int)
    bigrams_global = defaultdict(int)
    print("Extracting...")
    # Named ``article_count`` rather than ``articles`` so the local does not
    # shadow the module-level ``articles`` object used by ``topic_graph``.
    article_count = 0
    for i, article in enumerate(load_articles(50000)):
        for token in set(article['tokens']):
            tokens_global[token] += 1
        for bigram in set(article['bigrams']):
            bigrams_global[bigram] += 1
        if i % 100 == 0:
            print("Done: %s" % i)
        article_count += 1
    print("Calculating IDF...")
    for file_name, terms in (('tokens', tokens_global),
                             ('bigrams', bigrams_global)):
        data = {'articles': article_count}
        data['terms'] = dict()
        for term, count in terms.items():
            # Force true division: with two ints Python 2 floors the ratio,
            # which truncates the IDF and yields log(0) -> ValueError for a
            # term that occurs in every article.
            idf = math.log(float(article_count) / (1 + count))
            data['terms'][term] = idf
        with open(data_path('idf_%s.json' % file_name), 'wb') as fh:
            json.dump(data, fh)
def load_idf_model(type_):
    """Read a stored IDF model back from its JSON file.

    ``type_`` is interpolated into the file name ``idf_<type_>.json``, which
    is resolved through ``data_path``.  Returns whatever ``json.load``
    decodes from that file.
    """
    model_file = data_path('idf_%s.json' % type_)
    with open(model_file, 'rb') as handle:
        model = json.load(handle)
    return model