def wiki_dump_from_gs():
    """Build a Wikipedia-relations dump for the gold-standard mention vocabulary.

    Loads mention phrases from ``args.mentions``, looks each phrase up first in
    a Wikipedia ElasticSearch index (used only when ``args.host``, ``args.port``
    and ``args.index`` are all provided) and falls back to the online Wikipedia
    service when elastic is unavailable or returns no pages.  Matched pages are
    accumulated via ``add_page`` into the module-level ``result_dump``, which is
    finally written as JSON to ``args.output``.
    """
    logger.info('Starting, process will connect with ElasticSearch and online wikipedia site...')
    mentions_files = [args.mentions]
    dump_file = args.output
    vocab = load_mentions_vocab_from_files(mentions_files)

    if args.host and args.port and args.index:
        wiki_elastic = WikipediaRelationExtraction(WikipediaSearchMethod.ELASTIC,
                                                   host=args.host,
                                                   port=args.port,
                                                   index=args.index)
    else:
        logger.info(
            'Running without Wikipedia elastic search, Note that this will '
            'take much longer to process only using online service')
        wiki_elastic = None

    wiki_online = WikipediaRelationExtraction(WikipediaSearchMethod.ONLINE)

    for phrase in vocab:
        # Strip quote/backslash characters that would corrupt the search query.
        phrase = phrase.replace("'", "").replace('"', "").replace('\\', "").strip()
        logger.info('Try to retrieve \'%s\' from elastic search', phrase)
        pages = None
        if wiki_elastic:
            pages = wiki_elastic.get_phrase_related_pages(phrase)
        # Fall back to the online service when elastic is off or found nothing.
        # (`not pages.get_pages()` already covers the empty-list case; the old
        # extra len()==0 check was redundant.)
        if not pages or not pages.get_pages():
            logger.info('Not on elastic, retrieve \'%s\' from wiki online site', phrase)
            pages = wiki_online.get_phrase_related_pages(phrase)
        for search_page in pages.get_pages():
            add_page(search_page, phrase)

    # Log before writing so an error during the (potentially large) JSON dump
    # still reveals which file was being produced.
    logger.info('Saving dump to file-%s', dump_file)
    with open(dump_file, 'w') as myfile:
        json.dump(result_dump, myfile, default=json_dumper)
def glove_dump():
    """Pickle the GloVe embeddings restricted to the mention vocabulary.

    The vocabulary is read from ``args.mentions`` with stop-word filtering
    disabled; the matching GloVe vectors from ``args.glove`` are serialized as
    a ``[word_to_ix, embeddings]`` pair into ``args.output``.
    """
    vocabulary = load_mentions_vocab_from_files([args.mentions], False)
    word_to_ix, embeddings = load_glove_for_vocab(args.glove, vocabulary)

    logger.info('Words in vocabulary %d', len(vocabulary))
    logger.info('Found %d words from vocabulary', len(word_to_ix))

    destination = args.output
    with open(destination, 'wb') as handle:
        pickle.dump([word_to_ix, embeddings], handle)
    logger.info('Saving dump to file-%s', destination)
def vo_dump():
    """Dump the VerbOcean relations that match the event mention vocabulary.

    Loads the vocabulary from ``args.mentions`` (stop words filtered), keeps
    only the VerbOcean entries from ``args.vo`` whose key occurs in the
    vocabulary, and writes the resulting mapping as JSON to ``args.output``.
    """
    vocabulary = load_mentions_vocab_from_files([args.mentions], True)
    verbocean = VerboceanRelationExtraction.load_verbocean_file(args.vo)
    # Restrict the full VerbOcean table to terms present in the vocabulary.
    vo_for_vocab = {term: verbocean[term] for term in vocabulary if term in verbocean}

    logger.info('Found %d words from vocabulary', len(vo_for_vocab))
    logger.info('Preparing to save refDict output file')

    destination = args.output
    with open(destination, 'w') as handle:
        json.dump(vo_for_vocab, handle)
    logger.info('Done saved to-%s', destination)
def ref_dict_dump():
    """Dump the referent-dict entries that match the entity mention vocabulary.

    Loads the vocabulary from ``args.mentions`` (stop words filtered), keeps
    only the referent-dict entries from ``args.ref_dict`` whose key occurs in
    the vocabulary, and writes the resulting mapping as JSON to ``args.output``.
    """
    logger.info('Extracting referent dict dump, this may take a while...')
    vocabulary = load_mentions_vocab_from_files([args.mentions], True)
    reference_dict = ReferentDictRelationExtraction.load_reference_dict(args.ref_dict)

    # Restrict the full referent dict to terms present in the vocabulary.
    ref_dict_for_vocab = {term: reference_dict[term]
                          for term in vocabulary if term in reference_dict}

    logger.info('Found %d words from vocabulary', len(ref_dict_for_vocab))
    logger.info('Preparing to save refDict output file')

    destination = args.output
    with open(destination, 'w') as handle:
        json.dump(ref_dict_for_vocab, handle)
    logger.info('Done saved to-%s', destination)