# csv2es.py — bulk-load a delimited (CSV) file into an Elasticsearch index.
import csv
import json
import click
from elasticsearch import Elasticsearch, helpers
from utils import echo, isperiod, t2i, get_fieldnames, time_interval, index_op
def docs_from_file(filename, idx_name, doc_type, id_field_idx,
                   quiet):
    """
    Return a generator factory for pulling rows from a given delimited file.

    :param filename: the name of the file to read from
    :param idx_name: index name
    :param doc_type: document type; when it contains 'ticket', an extra
        'ticket_time' field is computed from create_time/close_time
    :param id_field_idx: defaults to 'None', which means the id field will
        be generated automatically by ES. If assigned, it is a number
        giving the positional index (within the field list) of the field
        to use as the document id.
    :param quiet: don't output anything to the console when this is True
    """
    def all_docs():
        with open(filename, newline='') as doc_file:
            fields = get_fieldnames(doc_file)
            dict_reader = csv.DictReader(doc_file, fieldnames=fields)
            if 'ticket' in doc_type:
                fields.append("ticket_time")
            echo('Using the following ' + str(len(fields)) + ' fields:',
                 quiet)
            for field in fields:
                echo(field, quiet)
            # Start counting at 1 so progress messages are human-friendly.
            for i, row in enumerate(dict_reader, start=1):
                # Prepare meta info for each indexed document.
                meta = {
                    'index': idx_name,
                    'type': doc_type,
                }
                if id_field_idx is not None:
                    meta['id'] = row[fields[int(id_field_idx)]]
                # Convert time-interval strings to an integer in minutes.
                for k, v in row.items():
                    if isinstance(v, str) and isperiod(v):
                        row[k] = t2i(v)
                if 'ticket' in doc_type:
                    row['ticket_time'] = time_interval(row['create_time'],
                                                       row['close_time'],
                                                       '%m/%d/%Y %I:%M:%S %p')
                echo('Sending item %s to ES ...' % i, quiet)
                yield index_op(row, meta)
    return all_docs
@click.command()
@click.option('--host', default='http://127.0.0.1:9200/', required=False,
              help='The Elasticsearch host (http://elasticsearch-host:port/)')
@click.option('--index-name', required=True,
              help='Index name to load data into')
@click.option('--doc-type', default='doc', required=False,
              help='Document type the rows are indexed as')
@click.option('--csvfile', required=True,
              help='File to index from (or \'-\' for stdin)')
@click.option('--mapping-file', default=None, required=False,
              help='JSON mapping file to apply to the index')
@click.option('--id-field-idx', required=False,
              help='Which field to be document\'s ID field')
@click.option('--delete-index', is_flag=True, required=False,
              help='Delete existing index if it exists')
@click.option('--quiet', is_flag=True, required=False,
              help='Minimize console output')
def cli(host, index_name, csvfile, id_field_idx, delete_index, quiet,
        doc_type='doc', mapping_file=None):
    """
    Bulk import a delimited file into a target Elasticsearch instance.
    Common delimited files include things like CSV.

    Load a CSV file:
    csv2es --index-name myindex --doc-type mydoc --csvfile test.csv
    """
    echo('Using host: %s' % host, quiet)
    es = Elasticsearch(hosts=[host])
    # (Re)create the target index, optionally deleting an existing one first.
    if es.indices.exists(index=index_name):
        echo('Index %s already exist' % index_name, False)
        if delete_index:
            es.indices.delete(index=index_name)
            echo('Deleted: %s' % index_name, quiet)
            es.indices.create(index=index_name)
            echo('Created new index: %s' % index_name, quiet)
    else:
        es.indices.create(index=index_name)
        echo('Created new index: %s' % index_name, quiet)
    echo('Using document type: %s' % doc_type, quiet)
    if mapping_file:
        echo('Applying mapping from: %s' % mapping_file, quiet)
        with open(mapping_file) as f:
            mapping = json.load(f)
        es.indices.put_mapping(doc_type, mapping, index=index_name)
    # Build the document generator and stream everything via the bulk helper.
    action_g = docs_from_file(csvfile, index_name, doc_type,
                              id_field_idx, quiet)
    helpers.bulk(es, action_g())


if __name__ == "__main__":
    cli()