def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS,
                  predicate=None):
    """\
    Creates a word dictionary, a bag-of-words corpus and a TF-IDF weighted
    corpus from the cables provided by `src`.

    The following files inside `out_dir` are involved:

    * ``cables_wordids.pickle``: the filtered dictionary (written here).
    * ``cables_bow.mm``: the bag-of-words corpus. It is only *read* here;
      it is expected to be produced by the ``CorpusHandler`` during the
      second pass (TODO: confirm against ``CorpusHandler``).
    * ``cables_tfidf.mm``: the TF-IDF transformed corpus (written here).

    `src`
        The cable source. It is handed to ``handle_source`` twice (once to
        build the dictionary, once to build the corpus).
    `out_dir`
        Directory used for all files mentioned above.
    `no_below`
        Forwarded as ``no_below`` to the dictionary's ``filter_extremes``
        (default: 20).
    `keep_words`
        Forwarded as ``keep_n`` to the dictionary's ``filter_extremes``.
    `predicate`
        Optional predicate which is passed to ``handle_source`` in both
        passes, i.e. ``pred.origin_filter(pred.origin_germany)``.
        ``None`` (default) keeps the previous behaviour.
    """
    wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_filename = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm')
    # 1. Create word dict
    dct = Dictionary()
    dct_handler = DictionaryHandler(dct)
    handler = create_filter(dct_handler)
    handle_source(src, handler, predicate)
    dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    dct.save(wordid_filename)
    # 2. Reiterate through the cables and create the vector space.
    #    The dictionary was filtered above, so it must not be modified anymore.
    corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False)
    handler = create_filter(corpus_handler)
    handle_source(src, handler, predicate)
    # 3. Load corpus
    mm = MmCorpus(bow_filename)
    # 4. Create TF-IDF model
    tfidf = TfidfModel(mm, id2word=dct, normalize=True)
    # 5. Save the TF-IDF model
    MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
def generate_topicmaps(src, handle_media=False):
    """\
    Processes the cables provided by `src` and feeds them into several
    CTM / XTM handlers.

    The following files are opened via ``openfile`` (each with a ``.ctm`` and
    a ``.xtm`` variant): ``european-cables``, ``cables``, ``cable-content``
    and, if `handle_media` is enabled, ``media-iris``. ``slo_handler``
    receives the list of open files as well (presumably to register the files
    it opens itself -- verify against ``slo_handler``).

    All registered files are closed when this function returns, even if
    setting up the handlers or processing the source fails.

    `src`
        The cable source; passed to ``handle_source``.
    `handle_media`
        Indicates if an additional handler which writes to
        ``media-iris.ctm`` / ``media-iris.xtm`` should be installed
        (default: ``False``).
    """
    def tee(files, filename):
        # Opens `filename`.ctm and `filename`.xtm and returns a handler which
        # forwards events to a CTM and a XTM handler.
        # Each file is registered right after it has been opened, so it gets
        # closed even if opening the second file fails.
        ctm = openfile(filename + '.ctm')
        files.append(ctm)
        xtm = openfile(filename + '.xtm')
        files.append(xtm)
        return TeeCableHandler(create_ctm_handler(ctm), create_xtm_handler(xtm))

    files = []
    try:
        handlers = []
        european_handler = CableIdFilter(tee(files, 'european-cables'),
                                         pred.origin_filter(pred.origin_europe))
        all_cables_handler = tee(files, 'cables')
        handlers.append(
            DefaultMetadataOnlyFilter(
                DebitlyFilter(TeeCableHandler(european_handler, all_cables_handler))))
        handlers.append(slo_handler(files))
        handlers.append(ContentCableHandler(tee(files, 'cable-content')))
        if handle_media:
            ctm = openfile('media-iris.ctm')
            files.append(ctm)
            xtm = openfile('media-iris.xtm')
            files.append(xtm)
            # NOTE: ``handler`` refers to a module-level name here, not to a
            # local variable.
            h = DebitlyFilter(
                    MediaTitleResolver(
                        handler.TeeMapHandler(create_ctm_miohandler(ctm),
                                              create_xtm_miohandler(xtm))))
            handlers.append(h)
        handle_source(src, MultipleCableHandler(handlers))
    finally:
        # Close whatever has been opened so far, regardless of errors.
        for f in files:
            f.close()
def generate_corpus(src):
    """\
    Feeds the cables from `src` which pass the
    ``pred.origin_filter(pred.origin_germany)`` predicate into two
    ``CorpusHandler`` instances.

    Both handlers are created with ``'./'`` as first argument (presumably the
    output directory -- confirm against ``CorpusHandler``):

    * one with the prefix ``german_cables_metadata_``, wrapped into a
      ``DefaultMetadataOnlyFilter``
    * one with the prefix ``german_cables_``, which receives the events
      unfiltered

    `src`
        The cable source; passed to ``handle_source``.
    """
    metadata_corpus = DefaultMetadataOnlyFilter(
        CorpusHandler('./', prefix='german_cables_metadata_'))
    complete_corpus = CorpusHandler('./', prefix='german_cables_')
    tee_handler = TeeCableHandler(metadata_corpus, complete_corpus)
    german_origin = pred.origin_filter(pred.origin_germany)
    handle_source(src, tee_handler, german_origin)