def main():
    parser = argparse.ArgumentParser(
        description='accumulates values of identical (document, author) pairs')
    parser.add_argument(
        '--raw-contribs',
        type=argparse.FileType('r'),
        help='path to input MatrixMarket raw contributions file (.mm/.mm.bz2)',
        required=True)
    parser.add_argument(
        '--acc-contribs',
        type=argparse.FileType('w'),
        help='path to output MatrixMarket accumulated contributions .mm file',
        required=True)

    args = parser.parse_args()
    input_raw_contribs_dump_path = args.raw_contribs.name
    output_acc_contribs_dump_path = args.acc_contribs.name

    logger.info('running with:\n{}'.format(
        pformat({
            'input_raw_contribs_dump_path': input_raw_contribs_dump_path,
            'output_acc_contribs_dump_path': output_acc_contribs_dump_path
        })))

    # load, accumulate & store contributions
    raw_contribs = MmCorpus(input_raw_contribs_dump_path)
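    # accumulate() (not shown here) merges repeated (document, author) entries within a document; the generator keeps this lazy, one document at a time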
    acc_contribs = (accumulate(raw_doc_contribs)
                    for raw_doc_contribs in raw_contribs)
    #MmCorpus.serialize(output_acc_contribs_dump_path, corpus=acc_contribs, progress_cnt=10000)
    MmWriter.write_corpus(output_acc_contribs_dump_path,
                          corpus=acc_contribs,
                          index=False,
                          progress_cnt=10000,
                          metadata=False)
Example #2
    def __init__(self, filename):
        """\

        `filename`
            The name of the target file.
        """
        mmw = MmWriter(filename)
        # write empty headers to the file (with enough space to be overwritten later)
        mmw.write_headers(-1, -1, -1) # will print 50 spaces followed by newline on the stats line
        self._mmw = mmw
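        # running corpus statistics, updated as documents are written; presumably used to overwrite the placeholder headers with the real counts at the end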
        self._num_docs = -1
        self._num_terms = 0
        self._num_nnz = 0 # number of non-zeroes in the sparse corpus matrix
def main(args):

    cp = SafeConfigParser()
    cp.read(args.config)
    base_dir = os.path.join(cp.get('general', 'local_data_dir'), args.dir)
    hadoop_base_dir = os.path.join(cp.get('general', 'hadoop_data_dir'), args.dir)


    word2index_file = os.path.join(base_dir, cp.get('LDA', 'word2index'))
    blei_corpus_file = os.path.join(base_dir, cp.get('LDA', 'blei_corpus'))
    doc2topic_file = os.path.join(base_dir, cp.get('LDA', 'doc2topic'))


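    # rebuild the gensim Dictionary from the word2index file: line number = token id, line content = token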
    dictionary = gensim.corpora.dictionary.Dictionary()
    id2Token = dict(enumerate(l[:-1] for l in open(word2index_file)))
    dictionary.token2id = {v: k for k, v in id2Token.items()}
    corpus = gensim.corpora.bleicorpus.BleiCorpus(blei_corpus_file, fname_vocab=word2index_file)


    time1 = time.time()
    model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                num_topics=args.dim,
                                id2word=dictionary,
                                workers=8,
                                chunksize=10000,
                                passes=1,
                                batch=False,
                                alpha='symmetric',
                                eta=None,
                                decay=0.5,
                                offset=1.0,
                                eval_every=10,
                                iterations=50,
                                gamma_threshold=0.001)
    time2 = time.time()
    print('training lda model took %0.3f minutes' % ((time2 - time1) / 60.0))
    model.save(os.path.join(base_dir, 'lda_model'))

    time1 = time.time()
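    # model[corpus] is a lazy transformation: each bag-of-words document is mapped to its topic distribution only as write_corpus iterates over it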
    matrix = model[corpus]
    MmWriter.write_corpus(doc2topic_file, matrix)
    time2 = time.time()
    print('creating lda vectors took %0.3f minutes' % ((time2 - time1) / 60.0))
Example #4
def main(args):

    cp = SafeConfigParser()
    cp.read(args.config)
    base_dir = os.path.join(cp.get('DEFAULT', 'data_path'), args.lang)
    hadoop_base_dir = os.path.join(cp.get('DEFAULT', 'hadoop_data_path'), args.lang)


    word2index_file = os.path.join(base_dir, cp.get('recommendation', 'word2index'))
    blei_corpus_file = os.path.join(base_dir, cp.get('recommendation', 'blei_corpus'))
    doc2topic_file = os.path.join(base_dir, cp.get('recommendation', 'doc2topic'))


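    # rebuild the gensim Dictionary from the word2index file: line number = token id, line content = token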
    dictionary = gensim.corpora.dictionary.Dictionary()
    id2Token = dict(enumerate(l[:-1] for l in open(word2index_file)))
    dictionary.token2id = {v: k for k, v in id2Token.items()}
    corpus = gensim.corpora.bleicorpus.BleiCorpus(blei_corpus_file, fname_vocab=word2index_file)


    time1 = time.time()
    model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                num_topics=args.dim,
                                id2word=dictionary,
                                workers=8,
                                chunksize=10000,
                                passes=1,
                                batch=False,
                                alpha='symmetric',
                                eta=None,
                                decay=0.5,
                                offset=1.0,
                                eval_every=10,
                                iterations=50,
                                gamma_threshold=0.001)
    time2 = time.time()
    logger.info('training lda model took %0.3f minutes' % ((time2 - time1) / 60.0))
    model.save(os.path.join(base_dir, 'lda_model'))

    time1 = time.time()
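    # model[corpus] is a lazy transformation: each bag-of-words document is mapped to its topic distribution only as write_corpus iterates over it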
    matrix = model[corpus]
    MmWriter.write_corpus(doc2topic_file, matrix)
    time2 = time.time()
    logger.info('creating lda vectors took %0.3f minutes' % ((time2 - time1) / 60.0))
Example #5
def main():
    parser = argparse.ArgumentParser(description='prunes contribs of a given author-document-contribs file, keeping only the top N contributions per author')
    parser.add_argument('--author-doc-contribs', type=argparse.FileType('r'), help='path to input contribution MatrixMarket file (.mm/.mm.bz2)', required=True)
    parser.add_argument('--pruned-contribs', type=argparse.FileType('w'), help='path to output MatrixMarket .mm file', required=True)
    parser.add_argument('--top-n-contribs', type=int, help='keep only the N contribs with highest values per author', required=True)
    
    args = parser.parse_args()
    input_author_doc_contribs_path = args.author_doc_contribs.name
    output_pruned_contribs_path = args.pruned_contribs.name
    top_n_contribs = args.top_n_contribs
    
    logger.info('running with:\n{}'.format(pformat({'input_author_doc_contribs_path':input_author_doc_contribs_path, 'output_pruned_contribs_path':output_pruned_contribs_path, 'top_n_contribs':top_n_contribs})))
        
    contribs = MmCorpus(input_author_doc_contribs_path)
    num_authors = contribs.num_docs
    num_docs = contribs.num_terms
    logger.info('processing contributions of {} authors, {} docs'.format(num_authors, num_docs))
    pruned_contribs = prune_contribs_of_authors(contribs, top_n_contribs)
    logger.info('writing pruned corpus')
    MmWriter.write_corpus(output_pruned_contribs_path, pruned_contribs, num_terms=num_docs, index=False, progress_cnt=10000, metadata=False)
Example #6
def main():
    parser = argparse.ArgumentParser(description='creates an id2author gensim dictionary, a document->authorid contributions MatrixMarket file and a binary article title file from a given WikiMedia *-pages-meta-history dump (considering only articles in mainspace!)')
    parser.add_argument('--history-dump', type=argparse.FileType('r'), help='path to input WikiMedia *-pages-meta-history file (.xml/.xml.bz2)', required=True)
    parser.add_argument('--id2author', type=argparse.FileType('w'), help='path to output text id2author dictionary (.txt/.txt.bz2)', required=True)
    parser.add_argument('--contribs', type=argparse.FileType('w'), help='path to output MatrixMarket contributions .mm file; also creates a binary article title file CONTRIBS.metadata.cpickle', required=True)
    parser.add_argument('--contribution-value', choices=CONTRIBUTION_VALUE_FUNCTIONS, help='calculated per-contribution value; choices: {}'.format(CONTRIBUTION_VALUE_FUNCTIONS.keys()), required=True)
    parser.add_argument("--namespace-prefixes", type=argparse.FileType('r'), help='file of namespace prefixes to ignore')    
        
    args = parser.parse_args()
    input_history_dump_path = args.history_dump.name
    output_id2author_path = args.id2author.name
    output_contribs_path = args.contribs.name
    contribution_value = args.contribution_value
    namespace_prefixes = read_lines(args.namespace_prefixes.name) if args.namespace_prefixes else ()
        
    logger.info('running with:\n{}'.format(pformat({'input_history_dump_path':input_history_dump_path, 'output_id2author_path':output_id2author_path, 'output_contribs_path':output_contribs_path, 'contribution_value':contribution_value, 'namespace_prefixes':namespace_prefixes})))        
            
    # build the id2author dictionary: maps author names of registered, non-bot authors to IDs and vice versa
    with smart_open(input_history_dump_path) as history_dump_file:    
        logger.info('generating author->id mappings')
        history_dump = xml_dump.Iterator.from_file(history_dump_file)
        # use gensim's id2word Dictionary as the id2author dictionary: authors correspond to terms
        id2author = Dictionary(get_revision_authors_of_pages(history_dump, namespace_prefixes))
        logger.info('found {} different authors'.format(len(id2author)))
        logger.info('removing non-registered authors')
        remove_from_dictionary(id2author, is_registered_user)
        logger.info('reduced to {} registered authors'.format(len(id2author)))
        logger.info('removing bots')
        remove_from_dictionary(id2author, is_not_bot_user)
        logger.info('reduced to {} registered non-bot authors'.format(len(id2author)))
        id2author.compactify()
        id2author.save_as_text(output_id2author_path)
        
    # compute & store (author id, revision value) entries for revisions by valid authors, for all articles
    with smart_open(input_history_dump_path) as history_dump_file: 
        logger.info('generating MatrixMarket representation per revision: (docid, authorid, value of revision)')
        history_dump = xml_dump.Iterator.from_file(history_dump_file)
        revision_value_fun = CONTRIBUTION_VALUE_FUNCTIONS[contribution_value]
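        # MetadataCorpus (not shown here) wraps the (doc, metadata) revision stream so that write_corpus(metadata=True) also emits the binary article title file CONTRIBS.metadata.cpickle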
        doc_auth_contribs = MetadataCorpus(get_revision_values(get_revisions_of_pages(history_dump, namespace_prefixes), id2author, revision_value_fun))
        MmWriter.write_corpus(output_contribs_path, corpus=doc_auth_contribs, num_terms=len(id2author), index=False, progress_cnt=10000, metadata=True)   
Example #7
def main():
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("--articles-dump",
                        type=argparse.FileType('r'),
                        help='path to input .xml.bz2 articles dump',
                        required=True)
    parser.add_argument("--out-prefix",
                        help='prefix of the generated output files',
                        required=True)
    parser.add_argument(
        "--keep-words",
        type=int,
        help='number of most frequent word types to keep (default {})',
        required=True)
    parser.add_argument(
        "--no-below",
        type=int,
        help='Keep only tokens which appear in at least NO_BELOW documents (default {})',
        required=True)
    parser.add_argument(
        "--no-above",
        type=float,
        help='Keep only tokens which appear in at most NO_ABOVE*CORPUSSIZE documents (default {})',
        required=True)
    parser.add_argument(
        "--article-min-tokens",
        type=int,
        help='Analyze only articles with >= ARTICLE_MIN_TOKENS tokens (default {}). Should be >= 1',
        required=True)
    parser.add_argument("--token-min-len",
                        type=int,
                        help='Consider only tokens of at least MIN chars',
                        required=True)
    parser.add_argument('--remove-stopwords',
                        action='store_true',
                        help='remove English stopwords using the gensim stop list')
    parser.add_argument("--namespace-prefixes",
                        type=argparse.FileType('r'),
                        help='file of namespace prefixes to ignore')

    args = parser.parse_args()
    input_articles_path = args.articles_dump.name
    output_prefix = args.out_prefix
    keep_words = args.keep_words
    no_below, no_above = args.no_below, args.no_above
    article_min_tokens = args.article_min_tokens
    token_min_len = args.token_min_len
    remove_stopwords = args.remove_stopwords
    namespace_prefixes = read_lines(
        args.namespace_prefixes.name) if args.namespace_prefixes else ()

    logger.info('running with:\n{}'.format(
        pformat({
            'input_articles_path': input_articles_path,
            'output_prefix': output_prefix,
            'keep_words': keep_words,
            'no_below': no_below,
            'no_above': no_above,
            'article_min_tokens': article_min_tokens,
            'token_min_len': token_min_len,
            'remove_stopwords': remove_stopwords,
            'namespace_prefixes': namespace_prefixes
        })))

    # generate & save the vocabulary
    logger.info('generating vocabulary')
    stopwords = STOPWORDS if remove_stopwords else ()
    corpus = MediaWikiCorpus(input_articles_path, article_min_tokens,
                             token_min_len, stopwords, namespace_prefixes)
    corpus.dictionary = Dictionary(corpus.get_texts())
    logger.info(
        'filtering dictionary: removing terms in less than {} docs'.format(
            no_below))
    corpus.dictionary.filter_extremes(no_below=no_below,
                                      no_above=1,
                                      keep_n=keep_words)
    logger.info(
        'filtering dictionary: removing terms in more than {} of all docs'.
        format(no_above))
    corpus.dictionary.filter_extremes(no_below=0,
                                      no_above=no_above,
                                      keep_n=keep_words)
    corpus.dictionary.compactify()
    output_id2word_path = output_prefix + '-id2word.txt'
    corpus.dictionary.save_as_text(output_id2word_path)

    # generate & save the bag-of-words model from the vocabulary
    logger.info('generating bag of words corpus')
    corpus.metadata = True
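    # with metadata enabled the corpus presumably yields (bow, metadata) pairs; write_corpus(metadata=True) then stores the metadata in a .metadata.cpickle file next to the .mm output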
    output_corpus_path = output_prefix + '.mm'
    #MmCorpus.serialize(output_corpus_path, corpus, progress_cnt=10000, metadata=True)
    MmWriter.write_corpus(output_corpus_path,
                          corpus=corpus,
                          index=False,
                          progress_cnt=10000,
                          metadata=True)