Example #1
0
def load(tweets):
    """Bulk-index a list of raw tweet documents into Elasticsearch.

    Connects using host/port from ``config``, ensures the target index
    exists with the expected mapping (recreating the index when the
    mapping cannot be applied in place), then streams the tweets to the
    cluster in chunks of ``bulk_chunk_size``.

    Relies on module-level names: ``config``, ``idx_name``, ``doc_type``,
    ``tweet_mapping``, ``mapping``, ``id_field``, ``bulk_chunk_size``,
    ``get_tweet``, ``create_index``, ``Elasticsearch``,
    ``ElasticsearchException``, ``bulk``.
    """
    es = Elasticsearch(host=config.es_host, port=config.es_port)

    if es.indices.exists(idx_name):
        print('index {} already exists'.format(idx_name))
        try:
            es.indices.put_mapping(doc_type, tweet_mapping, idx_name)
        except ElasticsearchException as e:
            print('error adding mapping:\n' + str(e))
            # Mapping conflicts cannot be resolved in place: drop and recreate.
            es.indices.delete(idx_name)
            create_index(es, idx_name, mapping)
    else:
        print('index {} does not exist'.format(idx_name))
        create_index(es, idx_name, mapping)

    k = 0
    data = []
    tweets_len = len(tweets)
    for doc in tweets:
        tweet = get_tweet(doc)
        data.append({
            "_index": idx_name,
            "_type": doc_type,
            "_id": tweet[id_field],
            "_source": tweet
        })
        k += 1

        # Flush a full chunk, or whatever remains on the last tweet.
        if k % bulk_chunk_size == 0 or k == tweets_len:
            # FIX: the original referenced undefined `index_name` here while
            # the rest of this function uses `idx_name`; also converted the
            # Python 2 print statements to print() calls so the module parses
            # under Python 3.
            print("ElasticSearch bulk index (index: {INDEX}, type: {TYPE})...".format(
                INDEX=idx_name, TYPE=doc_type))
            success, _ = bulk(es, data)
            print('ElasticSearch indexed %d documents' % success)
            data = []
Example #2
0
def load(tweets):
    """Bulk-index tweets into Elasticsearch in chunks of ``bulk_chunk_size``.

    Ensures the target index exists with the expected mapping, recreating
    the index when the mapping cannot be applied in place.

    Relies on module-level names: ``config``, ``index_name``, ``doc_type``,
    ``tweet_mapping``, ``mapping``, ``id_field``, ``bulk_chunk_size``,
    ``get_tweet``, ``create_index``, ``Elasticsearch``,
    ``ElasticsearchException``, ``bulk``.
    """
    es = Elasticsearch(host=config.es_host, port=config.es_port)

    if es.indices.exists(index_name):
        print('index {} already exists'.format(index_name))
        try:
            es.indices.put_mapping(doc_type, tweet_mapping, index_name)
        except ElasticsearchException as e:
            print('error putting mapping:\n' + str(e))
            print('deleting index {}...'.format(index_name))
            # Mapping conflicts cannot be resolved in place: drop and recreate.
            es.indices.delete(index_name)
            create_index(es, index_name, mapping)
    else:
        print('index {} does not exist'.format(index_name))
        create_index(es, index_name, mapping)

    counter = 0
    bulk_data = []
    list_size = len(tweets)
    for doc in tweets:
        tweet = get_tweet(doc)
        bulk_data.append({
            "_index": index_name,
            "_type": doc_type,
            "_id": tweet[id_field],
            "_source": tweet
        })
        counter += 1

        # Flush a full chunk, or whatever remains on the last tweet.
        if counter % bulk_chunk_size == 0 or counter == list_size:
            # FIX: converted Python 2 print statements to print() calls so
            # the module parses under Python 3 (the rest of the function
            # already used print()).
            print("ElasticSearch bulk index (index: {INDEX}, type: {TYPE})...".format(
                INDEX=index_name, TYPE=doc_type))
            success, _ = bulk(es, bulk_data)
            print('ElasticSearch indexed %d documents' % success)
            bulk_data = []
Example #3
0
def load(doc):
    """Index a single tweet document and return the Elasticsearch response.

    Opens a client using host/port from ``config``, converts the raw doc
    with ``get_tweet``, and indexes it under its ``id_field`` value.
    """
    client = Elasticsearch(host=config.es_host, port=config.es_port)
    parsed = get_tweet(doc)
    return client.index(index=index_name,
                        doc_type=doc_type,
                        id=parsed[id_field],
                        body=parsed)
Example #4
0
def load(tweets):
    """Convert raw tweet docs via ``get_tweet`` and return them as a list.

    Unlike the other ``load`` variants in this file, this one does not talk
    to Elasticsearch at all; it only transforms the input documents.
    """
    # FIX: the original kept an unused `counter` and `list_size`; a simple
    # comprehension expresses the manual append loop directly.
    return [get_tweet(doc) for doc in tweets]
def load(tweets):
    """Bulk-index tweets into an AWS-hosted Elasticsearch cluster.

    Builds a version-appropriate mapping for the cluster, ensures the
    target index exists (recreating it when the mapping cannot be applied
    in place), then indexes the tweets in chunks of ``bulk_chunk_size``.

    Relies on module-level names: ``config``, ``awsauth``, ``index_name``,
    ``id_field``, ``bulk_chunk_size``, ``get_tweet``, ``get_tweet_mapping``,
    ``create_index``, ``Elasticsearch``, ``RequestsHttpConnection``,
    ``ElasticsearchException``, ``bulk``.
    """
    client = Elasticsearch(
        hosts=[{'host': config.es_host, 'port': config.es_port}],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection)

    # The mapping format depends on the server's major version.
    server_version = client.info()['version']['number']
    mapping_to_put = get_tweet_mapping(server_version)

    print(mapping_to_put)
    mapping = {'mappings': mapping_to_put}

    if client.indices.exists(index_name):
        print('index {} already exists'.format(index_name))
        try:
            client.indices.put_mapping(body=mapping_to_put, index=index_name)
        except ElasticsearchException as e:
            print('error putting mapping:\n' + str(e))
            print('deleting index {}...'.format(index_name))
            client.indices.delete(index_name)
            create_index(client, index_name, mapping)
    else:
        print('index {} does not exist'.format(index_name))
        create_index(client, index_name, mapping)

    total = len(tweets)
    pending = []
    # Accumulate bulk actions and flush every full chunk, plus whatever
    # remains when the last tweet is reached.
    for position, doc in enumerate(tweets, start=1):
        tweet = get_tweet(doc)
        pending.append({
            "_index": index_name,
            "_id": tweet[id_field],
            "_source": tweet
        })

        if position % bulk_chunk_size == 0 or position == total:
            print("ElasticSearch bulk index (index: {INDEX})...".format(
                INDEX=index_name))
            success, _ = bulk(client, pending)

            print('ElasticSearch indexed %d documents' % success)
            pending = []
Example #6
0
def load(tweets):
    """Bulk-index tweets into an AWS-hosted Elasticsearch cluster.

    Authenticates with SigV4 credentials from ``create_awsauth``, ensures
    the target index exists with the expected mapping (recreating the
    index when the mapping cannot be applied), then indexes the tweets in
    chunks of ``bulk_chunk_size``. Tweets that fail conversion are logged
    and skipped rather than aborting the whole batch.

    Relies on module-level names: ``config``, ``index_name``, ``doc_type``,
    ``tweet_mapping``, ``mapping``, ``id_field``, ``bulk_chunk_size``,
    ``get_tweet``, ``create_index``, ``create_awsauth``, ``Elasticsearch``,
    ``RequestsHttpConnection``, ``ElasticsearchException``, ``bulk``.
    """
    awsauth = create_awsauth()
    es = Elasticsearch(hosts=[{
        'host': config.es_host,
        'port': config.es_port
    }],
                       http_auth=awsauth,
                       use_ssl=True,
                       verify_certs=True,
                       connection_class=RequestsHttpConnection)

    if es.indices.exists(index_name):
        print('index {} already exists'.format(index_name))
        try:
            es.indices.put_mapping(doc_type, tweet_mapping, index_name)
        except ElasticsearchException as e:
            print('error putting mapping:\n' + str(e))
            print('deleting index {}...'.format(index_name))
            # Mapping conflicts cannot be resolved in place: drop and recreate.
            es.indices.delete(index_name)
            create_index(es, index_name, mapping)
    else:
        print('index {} does not exist'.format(index_name))
        create_index(es, index_name, mapping)

    counter = 0
    bulk_data = []
    list_size = len(tweets)
    for doc in tweets:
        try:
            tweet = get_tweet(doc)
            bulk_data.append({
                "_index": index_name,
                "_type": doc_type,
                "_id": tweet[id_field],
                "_source": tweet
            })
        except Exception as e:
            # Best-effort: skip individual bad tweets instead of failing the batch.
            print(
                "A single Tweet Doc failed to be loaded to Elasticsearch, tweet id is: "
                + doc['id_str'] + " Exception is: " + str(e))

        counter += 1

        # Flush a full chunk, or whatever remains on the last tweet.
        if counter % bulk_chunk_size == 0 or counter == list_size:
            # FIX: converted Python 2 print statements to print() calls so
            # the module parses under Python 3 (the rest of the function
            # already used print()).
            print("ElasticSearch bulk index (index: {INDEX}, type: {TYPE})...".format(
                INDEX=index_name, TYPE=doc_type))
            success, _ = bulk(es, bulk_data)
            print('ElasticSearch indexed %d documents' % success)
            bulk_data = []
def load(tweets):
    """PUT each tweet document individually to an Elasticsearch HTTP endpoint.

    Unlike the bulk variants in this file, this one issues one HTTP PUT per
    tweet against the module-level ``url`` with the module-level ``headers``.

    SECURITY NOTE(review): the original hard-coded HTTP Basic credentials
    inline; they are preserved here for backward compatibility but should be
    rotated and moved to environment variables or a secrets store.

    Relies on module-level names: ``url``, ``headers``, ``id_field``,
    ``get_tweet``, ``requests``, ``HTTPBasicAuth``.
    """
    # FIX: removed dead code — the original accumulated `bulk_data`,
    # `counter` and `list_size` that were never sent anywhere (no bulk()
    # call), plus large blocks of commented-out setup; `id` also shadowed
    # the builtin and is renamed to `doc_id`.
    for doc in tweets:
        tweet = get_tweet(doc)
        doc_id = tweet[id_field]

        requests.put(url + doc_id,
                     json=tweet,
                     headers=headers,
                     auth=HTTPBasicAuth('superturbo', 'M1nuteMa1d'))
Example #8
0
def load(tweets):
    """Bulk-index tweets, choosing the mapping by the cluster's version.

    Connects using host/port from ``config``, builds a version-appropriate
    tweet mapping via ``get_tweet_mapping``, ensures the target index exists
    (recreating it when the mapping cannot be applied in place), then
    indexes the tweets in chunks of ``bulk_chunk_size``.

    Relies on module-level names: ``config``, ``index_name``, ``doc_type``,
    ``id_field``, ``bulk_chunk_size``, ``get_tweet``, ``get_tweet_mapping``,
    ``create_index``, ``Elasticsearch``, ``ElasticsearchException``, ``bulk``.
    """
    es = Elasticsearch(host=config.es_host, port=config.es_port)
    # The mapping format depends on the server's major version.
    es_version_number = es.info()['version']['number']
    tweet_mapping = get_tweet_mapping(es_version_number)
    mapping = {doc_type: tweet_mapping}

    if es.indices.exists(index_name):
        print('index {} already exists'.format(index_name))
        try:
            es.indices.put_mapping(doc_type, tweet_mapping, index_name)
        except ElasticsearchException as e:
            print('error putting mapping:\n' + str(e))
            print('deleting index {}...'.format(index_name))
            # Mapping conflicts cannot be resolved in place: drop and recreate.
            es.indices.delete(index_name)
            create_index(es, index_name, mapping)
    else:
        print('index {} does not exist'.format(index_name))
        create_index(es, index_name, mapping)

    counter = 0
    bulk_data = []
    list_size = len(tweets)
    for doc in tweets:
        tweet = get_tweet(doc)
        bulk_data.append({
            "_index": index_name,
            "_type": doc_type,
            "_id": tweet[id_field],
            "_source": tweet
        })
        counter += 1

        # Flush a full chunk, or whatever remains on the last tweet.
        if counter % bulk_chunk_size == 0 or counter == list_size:
            # FIX: converted Python 2 print statements to print() calls so
            # the module parses under Python 3 (the rest of the function
            # already used print()).
            print("ElasticSearch bulk index (index: {INDEX}, type: {TYPE})...".format(
                INDEX=index_name, TYPE=doc_type))
            success, _ = bulk(es, bulk_data)
            print('ElasticSearch indexed %d documents' % success)
            bulk_data = []
Example #9
0
def load(tweets):
    """Bulk-index tweets over TLS using credentials from the environment.

    Creates an Elasticsearch connection using the environment variables
    ``ES_USER`` and ``ES_PASS``, ensures the target index exists with the
    expected mapping (recreating the index when the mapping cannot be
    applied in place), then indexes the tweets in chunks of
    ``bulk_chunk_size``.

    Relies on module-level names: ``config``, ``index_name``, ``doc_type``,
    ``tweet_mapping``, ``mapping``, ``id_field``, ``bulk_chunk_size``,
    ``get_tweet``, ``create_index``, ``Elasticsearch``,
    ``Urllib3HttpConnection``, ``ElasticsearchException``, ``bulk``.
    """
    es = Elasticsearch(connection_class=Urllib3HttpConnection, host=config.es_host, port=config.es_port, http_auth=(os.getenv('ES_USER', 'user'), os.getenv('ES_PASS')), use_ssl=True)
    if es.indices.exists(index_name):
        print('index {} already exists'.format(index_name))
        try:
            es.indices.put_mapping(doc_type, tweet_mapping, index_name)
        except ElasticsearchException as e:
            print('error putting mapping:\n' + str(e))
            print('deleting index {}...'.format(index_name))
            # Mapping conflicts cannot be resolved in place: drop and recreate.
            es.indices.delete(index_name)
            create_index(es, index_name, mapping)
    else:
        print('index {} does not exist'.format(index_name))
        create_index(es, index_name, mapping)

    counter = 0
    bulk_data = []
    list_size = len(tweets)
    for doc in tweets:
        print(doc)
        tweet = get_tweet(doc)
        bulk_data.append({
            "_index": index_name,
            "_type": doc_type,
            "_id": tweet[id_field],
            "_source": tweet
        })
        counter += 1

        # Flush a full chunk, or whatever remains on the last tweet.
        if counter % bulk_chunk_size == 0 or counter == list_size:
            # FIX: converted Python 2 print statements to print() calls so
            # the module parses under Python 3 (the rest of the function
            # already used print()).
            print("ElasticSearch bulk index (index: {INDEX}, type: {TYPE})...".format(
                INDEX=index_name, TYPE=doc_type))
            success, _ = bulk(es, bulk_data)
            print('ElasticSearch indexed %d documents' % success)
            bulk_data = []