def set_in_index(self, documentList):
    """
    Store the list of documents in the Elasticsearch index via HTTP APIs

    @type  documentList: List
    @param documentList: List of image layer JSON documents
    """
    # Get the Elasticsearch address from the config file
    cfg = config.load()
    # Store the document list in Elasticsearch
    es = ElasticSearch(cfg.search_options.get("address"))
    try:
        es.bulk_index(cfg.search_options.get("index"),
                      cfg.search_options.get("type"),
                      documentList,
                      id_field='id')
    except InvalidJsonResponseError:
        logger.debug("InvalidJsonResponseError!")
    except Timeout:
        logger.debug("Timeout!")
    except ConnectionError:
        logger.debug("ConnectionError!")
    except ElasticHttpError:
        logger.debug("ElasticHttpError!")
    except ElasticHttpNotFoundError:
        logger.debug("ElasticHttpNotFoundError!")
class ESLayers(object):
    """Implementation of Elastic Search as layers backend"""

    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def _transform(self, layer, version, layer_name):
        """Add some meta data fields which are ES specific"""
        layer = dict(layer)  # copy
        label = layer['label']
        del layer['label']
        return {
            'id': '%s/%s/%s' % (version, layer_name, label),
            'version': version,
            'name': layer_name,
            'label': label,
            'layer': layer
        }

    def bulk_put(self, layers, version, layer_name, root_label):
        """Store all layer objects"""
        self.es.bulk_index(
            settings.ELASTIC_SEARCH_INDEX, 'layer',
            map(lambda l: self._transform(l, version, layer_name), layers))

    def get(self, name, label, version):
        """Find the layer that matches these parameters"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'layer',
                                 version + '/' + name + '/' + label)
            return result['_source']['layer']
        except ElasticHttpNotFoundError:
            return None
def import_json_into_es(types, inputfolder, logger):
    """
    imports entities from the *name.json.bz2* files (one entity per line) into local elasticsearch

    :param types: json string like {'person': 'http://www.wikidata.org/entity/Q5', 'name': 'Wikidata-URI'}
    :param inputfolder:
    :param logger:
    :return:
    """
    es = ElasticSearch(config.ELASTICSEARCH_URL)

    try:
        es.delete_index('wikidata')
        es.create_index('wikidata')
        logger.info('rebuild index [wikidata]')
    except:
        logger.warning("can't delete wikidata index")

    # convert type dictionary
    wd_types = dict()
    for key in types.keys():
        value = int(types[key].split('/')[-1][1:])
        wd_types[value] = {'type': key,
                           'filename': path.join(inputfolder, '{}.json.bz2'.format(key))}

    # import each given type
    for key in wd_types:
        logger.info(wd_types[key])

        done = 0
        items = []

        for line in BZ2File(wd_types[key]['filename'], 'rb'):
            line = line.strip()
            item = loads(line)
            item['uri'] = 'http://wikidata.org/wiki/' + item['id']

            items.append(item)
            done += 1

            if done % 5000 == 0:
                es.bulk_index('wikidata', wd_types[key]['type'], items, id_field='id')
                items = []

            # if done % len(wd_types) / 10 == 0:  # log 10% steps
            #     logger.info('imported {}: {:,d} ({:,d})'.format(wd_types[key]['type'], done, 100*len(wd_types)/done))

            if done % 10000 == 0:
                logger.info('imported {}: {}'.format(wd_types[key]['type'], format(done, ',d')))

        if len(items) > 0:
            es.bulk_index('wikidata', wd_types[key]['type'], items, id_field='id')
        logger.info('imported {}: {}'.format(wd_types[key]['type'], format(done, ',d')))
def feed(index='monolith', type='downloads', es_port=9200):
    client = ElasticSearch('http://0.0.0.0:%d/' % es_port)
    platforms = ['Mac OS X', 'Windows 8', 'Ubuntu']

    # indexing a year of data (2012)
    first_day = datetime.datetime(2012, 1, 1)
    last_day = datetime.datetime(2012, 12, 31)
    day_range = last_day - first_day

    for month in range(1, 13):
        name = 'time_2012-%.2d' % month
        try:
            client.delete_index(name)
        except Exception:
            pass
        client.create_index(name, settings={
            'number_of_shards': 1,
            'number_of_replicas': 0,
            'analysis': {
                'analyzer': {
                    'default': {
                        'type': 'custom',
                        'tokenizer': 'keyword'
                    }
                }
            },
            'store': {
                'compress': {
                    'stored': 'true'
                }
            },
        })

    # indexing 100 apps
    for add_on in range(100):
        docs = defaultdict(list)
        for delta in range(day_range.days):
            date = first_day + datetime.timedelta(days=delta)
            data = {
                'date': date,
                'os': random.choice(platforms),
                'downloads_count': random.randint(1000, 1500),
                'users_count': random.randint(10000, 15000),
                'add_on': add_on + 1
            }
            docs[date.month].append(data)

        for month, values in docs.items():
            client.bulk_index('time_2012-%.2d' % month, type, values)

        sys.stdout.write('.')
        sys.stdout.flush()

    client.optimize('time_*', max_num_segments=1, wait_for_merge=True)
    client.flush()
    sys.stdout.write('\nDone!\n')
def index_data(data_source, index_name, doc_type):
    es = ElasticSearch(urls='http://localhost', port=9200)
    try:
        es.delete_index(index_name)
    except:
        pass
    es.create_index(index_name)
    try:
        es.bulk_index(index_name, doc_type, data_source)
    except:
        print("Error! Skipping Document...!")
        pass
class ESRegulations(object):
    """Implementation of Elastic Search as regulations backend"""

    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def get(self, label, version):
        """Find the regulation label + version"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                                 version + '/' + label)
            reg_node = result['_source']
            del reg_node['regulation']
            del reg_node['version']
            del reg_node['label_string']
            del reg_node['id']
            return reg_node
        except ElasticHttpNotFoundError:
            return None

    def _transform(self, reg, version):
        """Add some meta data fields which are ES specific"""
        node = dict(reg)  # copy
        node['version'] = version
        node['label_string'] = '-'.join(node['label'])
        node['regulation'] = node['label'][0]
        node['id'] = version + '/' + node['label_string']
        node['root'] = len(node['label']) == 1
        return node

    def bulk_put(self, regs, version, root_label):
        """Store all reg objects"""
        self.es.bulk_index(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                           map(lambda r: self._transform(r, version), regs))

    def listing(self, label=None):
        """List regulation version-label pairs that match this label (or are
        root, if label is None)"""
        if label is None:
            query = {'match': {'root': True}}
        else:
            query = {'match': {'label_string': label}}
        query = {'fields': ['label_string', 'version'], 'query': query}
        result = self.es.search(query, index=settings.ELASTIC_SEARCH_INDEX,
                                doc_type='reg_tree', size=100)
        return sorted((res['fields']['version'], res['fields']['label_string'])
                      for res in result['hits']['hits'])
def index_data(data_path, chunksize, index_name, doc_type):
    f = open(data_path)
    csvfile = pd.read_csv(f, iterator=True, chunksize=chunksize)
    es = ElasticSearch(urls='http://localhost', port=9200)
    try:
        es.delete_index(index_name)
    except:
        pass
    es.create_index(index_name)
    for i, df in enumerate(csvfile):
        records = df.where(pd.notnull(df), None).T.to_dict()
        records_list = [records[i] for i in records]
        try:
            es.bulk_index(index_name, doc_type, records_list)
        except:
            print("Error! Skipping chunk...!")
            pass
def indexNodes(self):
    es = ElasticSearch('http://0.0.0.0:9200')
    i = 0
    for file in os.listdir(self.dataIndexNodes):
        if file.endswith('.json'):
            with open(self.dataIndexNodes + file, "r") as f:
                nodes = json.loads(f.read())
                print ("Indexing Node data", self.dataIndexNodes + file, len(nodes))
                bulkCount = 0
                bulkAry = []
                for node in nodes:
                    i += 1
                    if i < 170000:
                        continue
                    bulkCount = bulkCount + 1
                    bulkAry.append(node)
                    if bulkCount == 1000:
                        es.bulk_index('nodes', 'node', bulkAry, id_field='id')
                        bulkCount = 0
                        bulkAry = []
                        print i
                if len(bulkAry) != 0:
                    es.bulk_index('nodes', 'node', bulkAry, id_field='id')
def send(self, messages):
    if self.type == '@type':
        self.type = messages[0].get('@type')
        logger.debug('Type is \'@type\' - setting it to %r', self.type)
    es = ElasticSearch('http://%s:%s' % (self.host, self.port))
    now = datetime.utcnow()
    index = now.strftime('logstash-%Y.%m.%d')
    result = es.bulk_index(index=index, doc_type=self.type, docs=messages)
    logger.debug('Elasticsearch bulk_index run returned with:\n\n%s\n',
                 pformat(result))
    return True
def load(args):
    """
    Load jobs from external data sources.
    """
    es = ElasticSearch(args.elastic_search_url)
    default_providers = {
        'github': providers.Github(),
        'indeed': providers.Indeed(args.indeed_api_key),
        'craigslist': providers.Craigslist()
    }
    chosen_providers = get_providers(
        args.providers, default_providers) or set(default_providers.values())
    excluded_providers = get_providers(
        args.exclude_providers, default_providers) or set()

    for provider in chosen_providers - excluded_providers:
        name = provider.name
        params = {
            'location': args.location,
            'query': args.query
        }
        data = provider.get(**params)
        tagline = '{name} data for location {location} and ' \
                  'query {query}'.format(name=name, **params)
        try:
            result = es.bulk_index(provider.name.lower(), 'job', data)
        except ValueError:
            print('Skipping {tagline}. 0 items found.'.format(tagline=tagline))
            continue
        num_items = len(result['items'])
        print('Loaded {tagline}. Result: {num_items} jobs in {time} '
              'seconds'.format(tagline=tagline, num_items=num_items,
                               time=result['took']))
class TestClient(unittest.TestCase):

    def setUp(self):
        super(TestClient, self).setUp()
        docs = []
        self.es_host = os.environ.get('ES_HOST', 'http://*****:*****

    @mock.patch('monolith.client.util.iterweeks')
    def test_datetime_ranges(self, _mock):
        "Test datetime ranges get converted to dates."
        client = self._make_one()
        start = datetime.datetime(2012, 1, 1, 12, 34, 56)
        end = datetime.datetime(2012, 1, 31, 12, 34, 56)
        list(client('downloads_count', start, end, interval='week'))
        self.assertEqual(_mock.call_args[0][0], datetime.date(2012, 1, 1))
        assert not isinstance(_mock.call_args[0][0], datetime.datetime)
        self.assertEqual(_mock.call_args[0][1], datetime.date(2012, 1, 31))
        assert not isinstance(_mock.call_args[0][1], datetime.datetime)

    def test_date_order(self):
        # Ensure fill doesn't change date ordering.
        client = self._make_one()
        prev_date = datetime.date(2000, 1, 1)
        # Addon 1 doesn't have downloads for every month and the client will
        # fill zeroes for the missing dates.
        hits = list(client('downloads_count', START, '2012-05-01',
                           interval='month', add_on='1'))
        for hit in hits:
            d = hit['date']
            assert prev_date < d
            prev_date = d
client = app.test_client()
ctx = app.test_request_context()
ctx.push()

es = ElasticSearch('http://localhost:9200/')

try:
    es.delete_index('cdpp')
except ElasticHttpNotFoundError:
    # we can safely ignore this, because it might be an initial run
    pass

res = db.session.query(Sign).all()
for r in res:
    d = r.__dict__
    d.pop('_sa_instance_state', None)

# bulk-index the cleaned signs
es.bulk_index('cdpp', 'sign', [r.__dict__ for r in res], id_field='id')

tablets = db.session.query(Tablet).all()
repr = []
for result in tablets:
    d = result.__dict__
    keys = ['medium', 'city', 'locality', 'period', 'sub_period',
            'text_vehicle', 'method', 'genre', 'museum_number']
    as_dict = {}
    for key in keys:
        value = getattr(result, key)
        if value:
            as_dict[key] = unicode(value)
    if result.rulers:
        as_dict['ruler'] = result.rulers[0].name
"type": "string", "index": "analyzed" }, "untouched": { "type": "string", "index": "not_analyzed" } } }, "topics": { "type": "multi_field", "fields": { "topics": { "type": "string", "index": "analyzed" }, "untouched": { "type": "string", "index": "not_analyzed" } } } } } } es.create_index(index, {"mappings": mapping}) # es.put_mapping(index, doc_type, mapping) es.bulk_index(index, doc_type, get_docs(fname), 'persistent_id')
def ES_bulk_insert(file_name):
    index_name = "geodata"
    doc_type = "data"
    # ElasticSearch URL
    ElasticSearch_URL = "http://localhost:9200/"
    file_path_name = raw_data_path + "/" + file_name
    t0 = time()
    # Bulk size used to import the records into Elasticsearch
    chunk_size = 5000
    txt_file = pd.read_csv(
        file_path_name,
        sep="\t",
        iterator=True,
        chunksize=chunk_size,
        header=None,
        names=[
            'geonameid', 'name', 'asciiname', 'alternatenames', 'latitude',
            'longitude', 'feature_class', 'feature_code', 'country_code',
            'cc2', 'admin1_code', 'admin2_code', 'admin3_code', 'admin4_code',
            'population', 'elevation', 'dem', 'timezone', 'modification_date'
        ],
        dtype={
            "geonameid": int64,
            "name": object,
            "asciiname": object,
            "alternatenames": object,
            "latitude": float64,
            "longitude": float64,
            "feature_class": object,
            "feature_code": object,
            "country_code": object,
            "cc2": object,
            "admin1_code": object,
            "admin2_code": object,
            "admin3_code": object,
            "admin4_code": object,
            "population": int64,
            "elevation": object,
            "dem": int64,
            "timezone": object,
            "modification_date": object
        })

    # Connecting to ElasticSearch
    es = ElasticSearch(ElasticSearch_URL)
    print("Data Import started for file ", file_name)

    # Insert the Dataframe to elasticsearch using bulk
    for i, df in enumerate(txt_file):
        print(i)
        records = df.where(pd.notnull(df), None).T.to_dict()
        list_records = [records[it] for it in records]
        try:
            es.bulk_index(index_name, doc_type, list_records)
        except:
            print("Error.. Skipping some records")
            pass

    print("File ", file_name, "imported in %.3fs" % (time() - t0))
except Exception, e:
    print e
else:
    print "Created flights"

s.put_mapping("flights", "flight", simplejson.loads('{"flight":{"properties":{"datum":{"type":"string","index":"not_analyzed","omit_norms":true,"index_options":"docs"},"type": { "type": "string", "index" : "not_analyzed" }, "duration":{"type":"double"},"end":{"properties":{"alt":{"type":"integer"},"dist":{"type":"float"},"speed":{"type":"integer"},"time":{"type":"date","format":"dateOptionalTime"},"town":{"type":"string","analyzer":"keyword"},"country":{"type":"string","analyzer":"keyword"}}},"flight":{"type":"string","store":true,"analyzer":"keyword"},"hex":{"type":"string","store":true,"analyzer":"keyword"},"id":{"type":"string","store":true},"radar":{"type":"string","store":true,"analyzer":"keyword"},"reg":{"type":"string","store":true,"analyzer":"keyword"},"route":{"properties":{"coordinates":{"type":"double"},"type":{"type":"string"}}},"start":{"properties":{"alt":{"type":"integer"},"dist":{"type":"float"},"speed":{"type":"integer"},"time":{"type":"date","format":"dateOptionalTime"},"town":{"type":"string","analyzer":"keyword"},"country":{"type":"string","analyzer":"keyword"}}}}}}'))


def md(a):
    a["datum"] = a["starttime"][:10]
    return a


def makets(a):
    for f in ("starttime", "endtime"):
        a[f] = maket(a[f])
    return a


d = simplejson.load(sys.stdin)
chunksize = 50

print "%s documents" % (len(d),)
for i in xrange(0, len(d), chunksize):
    s.bulk_index("flights", "flight", d[i:i + chunksize])
    print "inserted %s starting from %s" % (chunksize, i)
def build_es_index(raw_data_path):

    if "TOPOGRAM_TMP_PATH" in os.environ:
        tmp_path = os.environ.get('TOPOGRAM_TMP_PATH')
    else:
        tmp_path = '/tmp'

    # raw_data_path=os.path.join(raw_path,"data/datazip/selected/")
    pid_file = os.path.join(tmp_path, "csv_chunk")

    # config elasticsearch
    if "TOPOGRAM_ES_HOST" in os.environ:
        es_host = os.environ.get('TOPOGRAM_ES_HOST')
    else:
        es_host = 'http://localhost:9200/'

    # init ElasticSearch
    es = ElasticSearch(es_host)

    # size of CSV chunk to process
    chunksize = 1000

    # parse index name : 2 weeks per index to fasten search
    weeks = {}
    for r in xrange(1, 52, 2):
        weeks[r] = weeks[r + 1] = "weiboscope_" + str(r) + "_" + str(r + 1)
    # for w in weeks: print w, weeks[w]

    # init
    previous_chunk = 0
    t0 = time()

    for path, subdirs, files in os.walk(raw_data_path):

        # loop through each files
        i_file = 0
        for filename in files:
            # if i==1 : break

            file_is_ok = False

            # check if there is an ongoing task
            if filename[-10:] == "processing":
                file_is_ok = True
                # get previous
                with open(pid_file, "r") as pid:
                    previous_chunk = int(pid.read())  # get previous
                file_to_process_name = filename
            elif filename[-3:] == "zip" and filename[:4] == "week":  # get only zip files
                file_is_ok = True

            if file_is_ok == True:
                t1 = time()
                i_file += 1

                # flag the file
                zip_path = os.path.join(path, filename)
                # print zip_path

                if filename[-10:] != "processing":
                    os.rename(zip_path, zip_path + ".processing")
                    zip_path = os.path.join(path, filename + ".processing")

                raw_csvname = filename.split(".")[0] + ".csv"

                # read zipped csv files
                with zipfile.ZipFile(zip_path) as z:  # open zip
                    f = z.open(raw_csvname)  # read csv

                    csvfile = pd.read_csv(f, iterator=True, chunksize=chunksize)

                    week_number = filename.split(".")[0][4:]
                    index_name = weeks[int(week_number)]
                    # print index_name

                    for i, df in enumerate(csvfile):
                        if i <= previous_chunk:
                            print i, "%d files, already indexed %s" % (i_file, raw_csvname)
                        else:
                            print i, "%d files, now indexing %s" % (i_file, raw_csvname)

                            # fix the date formatting
                            df["created_at"] = df["created_at"].str.replace(" ", "T")

                            try:
                                # fix encoding
                                df["text"] = df["text"].str.decode("utf-8")

                                # convert dataframe to json object
                                records = df.where(pd.notnull(df), None).T.to_dict()

                                # convert json object to a list of json objects
                                list_records = [records[it] for it in records]

                                # insert into elasticsearch
                                try:
                                    es.bulk_index(index_name, "tweet", list_records)
                                except:
                                    print "error with elasticsearch"
                                    pass
                            except:
                                print "encoding problem..."
                                pass

                            with open(pid_file, "w") as pid:
                                pid.write(str(i))

                print "%s processed in %.3fs" % (raw_csvname, time() - t1)

                # flag the file : done
                # os.rename(zip_path, zip_path+".done")
                os.remove(zip_path)

                # reset counters
                previous_chunk = 0
                with open(pid_file, "w") as pid:
                    pid.write(str(0))

    print "Everything done in %.3fs" % (time() - t0)
DELIMITER = str(input5)
ERRORFILEOUTPUT = open(input6, "w")

print "Running..."

for line in FILENAME:
    fields = line.split(DELIMITER)
    data.append({
        "Filename": fields[0].strip(),
        "File Type": fields[1].strip(),
        "Language1": fields[2].strip(),
        "Language2": fields[3].strip(),
        "Language3": fields[4].strip()
    })
    try:
        conn.bulk_index(PROJECTNAME, INDEXNAME, data)
        correct_counter += 1
    except Exception as e:
        for i in data:
            failurelist.append(data)
            failed_count += 1
    data = []

if correct_counter != 0:
    print "\n", correct_counter, " rows were successfully loaded into ES \n"
elif correct_counter == 0:
    print "\n", correct_counter, " rows were loaded into elasticsearch.\n"

if (failed_count != 0) & (correct_counter != 0):
    print failed_count, "rows failed to load - check error output file to see the specific data"
elif (failed_count == 0) & (correct_counter != 0):
df["created_at"]=df["created_at"].str.replace(" ", "T") try : # fix encoding df["text"]=df["text"].str.decode("utf-8") # convert dataframe to json object records=df.where(pd.notnull(df), None).T.to_dict() # convert json object to a list of json objects list_records=[records[it] for it in records] # insert into elasticsearch try : es.bulk_index(index_name,"tweet",list_records) except : print "error with elasticsearch" pass except : print "encoding problem..." pass with open(pid_file, "w") as pid: pid.write(str(i)) print "%s processed in %.3fs"%(raw_csvname,time()-t1) # flag the file : done # os.rename(zip_path, zip_path+".done")
csv_filename = 'robinhood-daily-rets.csv'

# size of the bulk
chunksize = 5000

# parse csv with pandas
csvfile = pd.read_csv(csv_filename)

# init ElasticSearch
es = ElasticSearch('http://104.236.201.91:9200/')

# init index
try:
    es.delete_index("robinhood")
except:
    pass

es.create_index("robinhood")

# start bulk indexing
print("now indexing %s..." % (csv_filename))

records = csvfile.where(pd.notnull(csvfile), None).T.to_dict()
list_records = [records[it] for it in records]
try:
    es.bulk_index("robinhood", "myPortfolio", list_records)
except:
    print("error!, skipping a date")
    pass

print("done in %.3fs" % (time() - t0))
ERRORFILEOUTPUT = open(input6, "w")

print "Running..."

for line in FILENAME:
    fields = line.split(DELIMITER)
    if len(fields) == NUM_OF_FIELDS:
        data.append({
            "Filename": fields[0].strip(),
            "File Type": fields[1].strip(),
            "Language1": fields[2].strip(),
            "Language2": fields[3].strip(),
            "Language3": fields[4].strip()
        })
        try:
            conn.bulk_index(PROJECTNAME, INDEXNAME, data)
            correct_counter += 1
        except Exception as e:
            for i in data:
                failurelist.append(data)
                failed_count += 1
        data = []

if correct_counter != 0:
    print "\n", correct_counter, " rows were successfully loaded into ES \n"
elif correct_counter == 0:
    print "\n", correct_counter, " rows were loaded into elasticsearch.\n"

if (failed_count != 0) & (correct_counter != 0):
    print failed_count, "rows failed to load - check error output file to see the specific data"
elif (failed_count == 0) & (correct_counter != 0):
# open csv file
f = open(raw_data_path + csv_filename)  # read csv

# parse csv with pandas
csvfile = pd.read_csv(f, iterator=True, chunksize=chunksize)

# init ElasticSearch
es = ElasticSearch('http://localhost:9200/')

# init index
try:
    es.delete_index("weiboscope")
except:
    pass

es.create_index("weiboscope")

# start bulk indexing
print("now indexing %s..." % (csv_filename))
for i, df in enumerate(csvfile):
    print(i)
    records = df.where(pd.notnull(df), None).T.to_dict()
    list_records = [records[it] for it in records]
    try:
        es.bulk_index("weiboscope", "tweet", list_records)
    except:
        print("error!, skiping some tweets sorry")
        pass

print("done in %.3fs" % (time() - t0))
def documents_from_mails(mails):
    """Build document from mail"""
    for mail in mails:
        if 'Date' in mail.headers:  # Some mails seem broken.
            yield {
                '@source': 'stuff://',
                '@type': 'mailadmin',
                '@tags': [mail.headers['From']],
                '@fields': mail.headers,
                '@timestamp': parse_date(mail.headers['Date']),
                '@source_host': 'localhost',
                '@source_path': 'mail/admin ',
                '@message': mail.body,
                'id': mail.headers['Message-Id']
            }


if __name__ == '__main__':
    # Instantiate it with an url
    es = ElasticSearch(sys.argv[1])
    # Kibana need this kind of name
    NAME = 'logstash-2013.06.13'
    try:
        es.delete_index(NAME)
    except ElasticHttpNotFoundError:
        pass  # Nobody cares
    emails = mbox(sys.argv[2])
    for n, docs in enumerate(bulk_iterate(documents_from_mails(emails), 100)):
        es.bulk_index(NAME, 'mailadmin', docs)
        print(n)
    print es.refresh(NAME)