示例#1
0
def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """\
    Builds a word dictionary, a bag-of-words corpus and a TF-IDF weighted
    corpus from the cables provided by `src`.

    The source is processed twice: the first pass fills the dictionary, the
    second pass writes the vector space using the (then frozen) dictionary.

    `src`
        The cable source; handed unchanged to ``handle_source``.
    `out_dir`
        Directory for the results. The dictionary is saved as
        ``cables_wordids.pickle`` and the TF-IDF corpus as
        ``cables_tfidf.mm``. The bag-of-words corpus is read back from
        ``cables_bow.mm``, which is expected to be written by the
        ``CorpusHandler`` during the second pass.
    `no_below`
        Forwarded as ``no_below`` to the dictionary's ``filter_extremes``
        (default: 20).
    `keep_words`
        Forwarded as ``keep_n`` to the dictionary's ``filter_extremes``.
    """
    dict_path = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_path = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_path = os.path.join(out_dir, 'cables_tfidf.mm')
    # No predicate by default; something like
    # pred.origin_filter(pred.origin_germany) could be used here instead.
    cable_predicate = None

    # Pass 1: collect the vocabulary, prune it and persist it.
    vocabulary = Dictionary()
    handle_source(src, create_filter(DictionaryHandler(vocabulary)), cable_predicate)
    vocabulary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    vocabulary.save(dict_path)

    # Pass 2: walk through the cables again and create the vector space.
    # The dictionary must not change anymore, hence allow_dict_updates=False.
    bow_writer = CorpusHandler(out_dir, dct=vocabulary, allow_dict_updates=False)
    handle_source(src, create_filter(bow_writer), cable_predicate)

    # Load the bag-of-words corpus and derive the TF-IDF model from it.
    bow_corpus = MmCorpus(bow_path)
    tfidf_model = TfidfModel(bow_corpus, id2word=vocabulary, normalize=True)

    # Persist the TF-IDF weighted corpus (not the model itself).
    MmCorpus.serialize(tfidf_path, tfidf_model[bow_corpus], progress_cnt=10000)
示例#2
0
def generate_topicmaps(src, handle_media=False):
    """\
    Processes the cables from `src` and writes them as CTM and XTM topic maps.

    The following file pairs (``.ctm`` and ``.xtm``) are opened via
    ``openfile``: ``european-cables``, ``cables`` and ``cable-content``.
    ``slo_handler`` receives the list of open files as well and presumably
    registers the files it opens there. If `handle_media` is true,
    ``media-iris.ctm`` and ``media-iris.xtm`` are written in addition.

    All files which were registered are closed when the function is left,
    regardless of whether the processing succeeded or raised an exception.

    `src`
        The cable source; handed unchanged to ``handle_source``.
    `handle_media`
        Indicates if the media IRIs topic maps should be generated
        (default: ``False``).
    """
    def tee(files, filename):
        # Opens `<filename>.ctm` and `<filename>.xtm` and returns a handler
        # which forwards the events to a CTM and an XTM handler.
        # Each file is registered right after it has been opened, so it gets
        # closed even if opening the second file fails.
        ctm = openfile(filename + '.ctm')
        files.append(ctm)
        xtm = openfile(filename + '.xtm')
        files.append(xtm)
        return TeeCableHandler(create_ctm_handler(ctm),
                               create_xtm_handler(xtm))

    files = []
    try:
        handlers = []
        european_handler = CableIdFilter(tee(files, 'european-cables'),
                                         pred.origin_filter(pred.origin_europe))
        all_cables_handler = tee(files, 'cables')
        handlers.append(
            DefaultMetadataOnlyFilter(
                DebitlyFilter(TeeCableHandler(european_handler,
                                              all_cables_handler))))
        handlers.append(slo_handler(files))
        handlers.append(ContentCableHandler(tee(files, 'cable-content')))
        if handle_media:
            ctm = openfile('media-iris.ctm')
            files.append(ctm)
            xtm = openfile('media-iris.xtm')
            files.append(xtm)
            h = DebitlyFilter(
                MediaTitleResolver(
                    handler.TeeMapHandler(create_ctm_miohandler(ctm),
                                          create_xtm_miohandler(xtm))))
            handlers.append(h)
        handle_source(src, MultipleCableHandler(handlers))
    finally:
        # Release the file handles even if setting up the handlers or
        # processing the cables failed.
        for f in files:
            f.close()
示例#3
0
def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """\
    Builds a word dictionary, a bag-of-words corpus and a TF-IDF weighted
    corpus from the cables provided by `src`.

    The source is processed twice: the first pass fills the dictionary, the
    second pass writes the vector space using the (then frozen) dictionary.

    `src`
        The cable source; handed unchanged to ``handle_source``.
    `out_dir`
        Directory for the results. The dictionary is saved as
        ``cables_wordids.pickle`` and the TF-IDF corpus as
        ``cables_tfidf.mm``. The bag-of-words corpus is read back from
        ``cables_bow.mm``, which is expected to be written by the
        ``CorpusHandler`` during the second pass.
    `no_below`
        Forwarded as ``no_below`` to the dictionary's ``filter_extremes``
        (default: 20).
    `keep_words`
        Forwarded as ``keep_n`` to the dictionary's ``filter_extremes``.
    """
    dict_path = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_path = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_path = os.path.join(out_dir, 'cables_tfidf.mm')
    # No predicate by default; something like
    # pred.origin_filter(pred.origin_germany) could be used here instead.
    cable_predicate = None

    # Pass 1: collect the vocabulary, prune it and persist it.
    vocabulary = Dictionary()
    handle_source(src, create_filter(DictionaryHandler(vocabulary)), cable_predicate)
    vocabulary.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    vocabulary.save(dict_path)

    # Pass 2: walk through the cables again and create the vector space.
    # The dictionary must not change anymore, hence allow_dict_updates=False.
    bow_writer = CorpusHandler(out_dir, dct=vocabulary, allow_dict_updates=False)
    handle_source(src, create_filter(bow_writer), cable_predicate)

    # Load the bag-of-words corpus and derive the TF-IDF model from it.
    bow_corpus = MmCorpus(bow_path)
    tfidf_model = TfidfModel(bow_corpus, id2word=vocabulary, normalize=True)

    # Persist the TF-IDF weighted corpus (not the model itself).
    MmCorpus.serialize(tfidf_path, tfidf_model[bow_corpus], progress_cnt=10000)
示例#4
0
def generate_corpus(src):
    """\
    Feeds the cables from `src` which pass the German origin filter into two
    corpus handlers, both created for the current directory.

    One ``CorpusHandler`` (prefix ``german_cables_metadata_``) is wrapped by
    ``DefaultMetadataOnlyFilter``, the other one (prefix ``german_cables_``)
    receives the events directly.

    `src`
        The cable source; handed unchanged to ``handle_source``.
    """
    # Corpus behind the metadata-only filter.
    metadata_corpus = CorpusHandler('./', prefix='german_cables_metadata_')
    metadata_branch = DefaultMetadataOnlyFilter(metadata_corpus)
    # Corpus which is fed without the metadata-only filter.
    plain_corpus = CorpusHandler('./', prefix='german_cables_')
    # Every event goes to both branches.
    both_branches = TeeCableHandler(metadata_branch, plain_corpus)
    handle_source(src, both_branches, pred.origin_filter(pred.origin_germany))