aggregator_functions.py
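'''
Helper functions for tagging lot documents stored in MongoDB (note,
material and area fields), aggregating tag counts, and filtering the
most common tags.
'''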
import pickle
import re

import pymongo
from bson.son import SON
from nltk.corpus import stopwords
from pandas import DataFrame

def area(dimlist):
    '''
    dimlist is a list of numeric strings (commas allowed as thousands
    separators). Returns the product of the second and fourth values,
    or None when the list does not hold exactly four entries.
    '''
    if len(dimlist) == 4:
        return float(dimlist[1].replace(',', '')) * float(dimlist[3].replace(',', ''))
    return None
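# Illustrative call (which four numbers the measurements strings actually
# contain is an assumption here):
#   area(['62.2', '91.4', '24.5', '36.0']) -> 91.4 * 36.0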

def tag_to_tags(string, omitted_words):
    '''
    Split a string into lowercase tags, dropping stop words.
    omitted_words should be a set, not a list.
    '''
    return list(set(string.lower().split()) - omitted_words)
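# Example:
#   tag_to_tags('Signed and dated on the reverse', {'and', 'on', 'the'})
#   -> ['signed', 'dated', 'reverse']  (order not guaranteed: set difference)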

def add_tags(json_entry, omitted_words):
    '''
    Add note_tags, material_tags and area fields to a single document.
    '''
    try:
        json_entry['note_tags'] = tag_to_tags(json_entry['lotNote'], omitted_words)
    except KeyError:
        json_entry['note_tags'] = []
    try:
        json_entry['material_tags'] = tag_to_tags(json_entry['materials'], omitted_words)
    except KeyError:
        json_entry['material_tags'] = []
    try:
        # The unescaped dot lets the pattern span '.' or ',' inside number-like tokens.
        json_entry['area'] = area(re.findall(r'\d+.\d+', json_entry['measurements']))
    except (KeyError, ValueError):
        # Missing measurements field, or a token that does not parse as a float.
        json_entry['area'] = None
    return json_entry
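# Sketch of the transformation on a hypothetical document (field names come from
# the code above; the sample values are assumptions):
#   add_tags({'lotNote': 'Signed lower left', 'materials': 'Oil on canvas'}, {'on'})
#   -> {'lotNote': 'Signed lower left', 'materials': 'Oil on canvas',
#       'note_tags': ['signed', 'lower', 'left'], 'material_tags': ['oil', 'canvas'],
#       'area': None}  (tag order not guaranteed)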

def iterate_over_db(db_name='asi_database', coll_name='asi_collection'):
    '''
    Tag every document in the collection in place.
    '''
    client = pymongo.MongoClient()
    coll = client[db_name][coll_name]
    omitted_words = set(stopwords.words('english'))
    for doc in coll.find({}):
        # Collection.save() is gone from modern PyMongo; replace the document by _id instead.
        coll.replace_one({'_id': doc['_id']}, add_tags(doc, omitted_words))
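# Usage note: iterate_over_db() rewrites every document, adding note_tags,
# material_tags and area. The NLTK stop word corpus must be available locally
# (e.g. via nltk.download('stopwords')) before calling it.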

def aggregate_tags(db_name='asi_database', coll_name='asi_collection', tag='material_tags'):
    '''
    Count how often each value of the given tag field occurs, most frequent
    first, and pickle the result.
    '''
    client = pymongo.MongoClient()
    coll = client[db_name][coll_name]
    # Materialise the cursor: a CommandCursor itself cannot be pickled.
    aggregation = list(coll.aggregate([
        {'$unwind': '$' + tag},
        {'$group': {'_id': '$' + tag, 'count': {'$sum': 1}}},
        {'$sort': SON([('count', -1), ('_id', -1)])}
    ]))
    with open(tag + '.pickle', 'wb') as out:
        pickle.dump(aggregation, out)
    return aggregation
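# The pickled list looks like (counts are illustrative):
#   [{'_id': 'canvas', 'count': 120}, {'_id': 'oil', 'count': 95}, ...]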

def load_filter_tags(exclude_tags='results/exclude.csv'):
    '''
    Parse the exclude file: tags to drop come first, one per line; a blank
    line switches to lines of duplicate tags that should be merged.
    '''
    exclude_words = []
    duplicate_sets = []
    exclude_set = True
    with open(exclude_tags, 'r') as f:
        for line in f:
            if exclude_set:
                if line.strip() == '':
                    exclude_set = False
                else:
                    exclude_words.append(line.strip())
            else:
                # Keep a real list (not a map object) so the tags can be indexed later.
                duplicate_sets.append(line.split())
    return exclude_words, duplicate_sets
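# Expected layout of exclude.csv (contents are assumptions; the format is
# inferred from the parser above):
#   signed
#   framed
#
#   canvas canvases
#   watercolor watercolour
# Everything before the blank line is excluded; on each later line the first
# tag absorbs the counts of the rest.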

def filter_tags(tag_pickle='results/material_tags.pickle', exclude_tags='results/exclude.csv', n=50):
    '''
    Return the n most frequent tags after merging duplicate spellings and
    dropping excluded words.
    '''
    exclude_words, duplicate_sets = load_filter_tags(exclude_tags)
    with open(tag_pickle, 'rb') as f:
        # The pickle holds the list of {'_id', 'count'} documents from aggregate_tags().
        t = DataFrame(pickle.load(f)).set_index('_id')
    for setn in duplicate_sets:
        # Fold the counts of the duplicate spellings into the first tag, then drop them.
        t.loc[setn[0]] += t.loc[setn[1:]].sum()
        t.drop(setn[1:], inplace=True)
    t.drop(exclude_words, inplace=True)
    t = t.sort_values('count', ascending=False)
    return t[:n].index
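# Example usage (paths are the module defaults):
#   top_materials = filter_tags(n=50)  # pandas Index of the 50 most common material tags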

def main():
    #aggregate_tags(db_name='asi-database')
    pass


if __name__ == "__main__":
    main()