def main(options):
    # get available defined fields from the Luke request handler
    data = req('%s/admin/luke?wt=json&show=schema&indent=true' % options.query_endpoint,
               **dict(show='schema'))
    fields = data['schema']['fields'].keys()

    # regenerate the term list if replacement is enabled or the file is missing
    if EXISTING_TERMS_REPLACE or not os.path.exists('term-freqs.txt'):
        # for each field retrieve the high/med/low freq terms
        with csv_writer('term-freqs.txt', ['field', 'type', 'token', 'freq']) as writer:
            for f in fields:
                try:
                    print 'Getting freqs for: %s' % f
                    # the terms component returns a flat [term, count, term, count, ...] list
                    rsp = req('%s/terms' % options.query_endpoint,
                              **{'terms.fl': f, 'terms.limit': RETRIEVE_MAX_TOKENS})
                    high_freq = dict(zip(rsp['terms'][f][0::2], rsp['terms'][f][1::2]))
                    write_terms(writer, f, 'high', high_freq)
                    if not high_freq:
                        continue

                    # medium band: terms at most half as frequent as the rarest high term
                    max_count = int(max(0.1, min(high_freq.values())) / 2) - 1
                    if max_count < 1:
                        continue
                    rsp = req('%s/terms' % options.query_endpoint,
                              **{'terms.fl': f, 'terms.limit': RETRIEVE_MAX_TOKENS,
                                 'terms.maxcount': max_count})
                    med_freq = dict(zip(rsp['terms'][f][0::2], rsp['terms'][f][1::2]))
                    write_terms(writer, f, 'med', med_freq)
                    if not med_freq:
                        continue

                    # low band: halve the threshold once more
                    max_count = max(int(max(0.1, min(med_freq.values())) / 2) - 1, 1)
                    rsp = req('%s/terms' % options.query_endpoint,
                              **{'terms.fl': f, 'terms.limit': RETRIEVE_MAX_TOKENS,
                                 'terms.maxcount': max_count})
                    low_freq = dict(zip(rsp['terms'][f][0::2], rsp['terms'][f][1::2]))
                    write_terms(writer, f, 'low', low_freq)
                except Exception:
                    print 'Error getting terms for: %s' % f
                    traceback.print_exc()
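# --- Helper sketches (not part of the original excerpt) ----------------------
# main() leans on helpers defined elsewhere in the script: req(), csv_writer(),
# write_terms() and the RETRIEVE_MAX_TOKENS / EXISTING_TERMS_REPLACE constants.
# What follows is a minimal, hypothetical reconstruction of req() and
# write_terms(), assuming req() is a thin urllib wrapper that returns the
# parsed JSON response from Solr.
import json
import urllib

def req(url, **kwargs):
    # append the query parameters and parse the JSON response (assumption)
    params = dict(kwargs)
    params.setdefault('wt', 'json')
    sep = '&' if '?' in url else '?'
    rsp = urllib.urlopen(url + sep + urllib.urlencode(params))
    return json.loads(rsp.read())

def write_terms(writer, field, freq_type, freq_map):
    # one CSV row per token: field, high/med/low band, token, frequency
    for token, freq in freq_map.items():
        writer.writerow([field, freq_type, token, freq])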
def main(options):
    # get available defined fields from the Luke request handler
    data = req('%s/admin/luke?wt=json&show=schema&indent=true' % options.query_endpoint,
               **dict(show='schema'))
    fields = data['schema']['fields'].keys()

    if EXISTING_TERMS_REPLACE or not os.path.exists('term-freqs.txt'):
        retrieve_term_freqs(options, fields)

    if EXISTING_TERMS_REPLACE or not os.path.exists('phrase-freqs.txt.2'):
        retrieve_pseudo_collocations(options, maxlen=[2, 5],
                                     stop_after_reaching=10000,
                                     output_name='phrase-freqs.txt')

    generate_field_queries(options)
    generate_wild_queries(options)
    generate_phrase_queries(options, length=2, input='phrase-freqs.txt.2')
    generate_phrase_queries(options, length=5, input='phrase-freqs.txt.5')
    generate_fuzzy_queries(options, length=1, input='phrase-freqs.txt.2')
    generate_fuzzy_queries(options, length=2, input='phrase-freqs.txt.2')
    generate_near_queries(options, length=2, input='phrase-freqs.txt.5')
    generate_near_queries(options, length=4, input='phrase-freqs.txt.5')
    generate_boolean_queries(options, 'AND', length=5, input='phrase-freqs.txt.2')
    generate_boolean_queries(options, 'AND', length=2, input='phrase-freqs.txt.2')
    generate_boolean_queries(options, 'OR', length=5, input='phrase-freqs.txt.2')
    generate_boolean_queries(options, 'OR', length=2, input='phrase-freqs.txt.2')
def retrieve_term_freqs(options, fields):
    # for each field retrieve the high/med/low freq terms
    fo, writer = csv_writer('term-freqs.txt', ['field', 'type', 'token', 'freq'])
    for f in fields:
        try:
            print 'Getting freqs for: %s' % f
            rsp = req('%s/terms' % options.query_endpoint,
                      **{'terms.fl': f, 'terms.limit': RETRIEVE_MAX_TOKENS})
            high_freq = dict(zip(rsp['terms'][f][0::2], rsp['terms'][f][1::2]))
            write_terms(writer, f, 'high', high_freq)
            if not high_freq:
                continue

            max_count = int(max(0.1, min(high_freq.values())) / 2) - 1
            if max_count < 1:
                continue
            rsp = req('%s/terms' % options.query_endpoint,
                      **{'terms.fl': f, 'terms.limit': RETRIEVE_MAX_TOKENS,
                         'terms.maxcount': max_count})
            med_freq = dict(zip(rsp['terms'][f][0::2], rsp['terms'][f][1::2]))
            write_terms(writer, f, 'med', med_freq)
            if not med_freq:
                continue

            max_count = max(int(max(0.1, min(med_freq.values())) / 2) - 1, 1)
            rsp = req('%s/terms' % options.query_endpoint,
                      **{'terms.fl': f, 'terms.limit': RETRIEVE_MAX_TOKENS,
                         'terms.maxcount': max_count})
            low_freq = dict(zip(rsp['terms'][f][0::2], rsp['terms'][f][1::2]))
            write_terms(writer, f, 'low', low_freq)
        except Exception:
            print 'Error getting terms for: %s' % f
            traceback.print_exc()
    fo.close()  # csv_writer hands back the file object; close it once done
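# retrieve_term_freqs() expects csv_writer() to hand back both the open file
# and a csv.writer (the first version of main() above used it as a context
# manager instead). A plausible tuple-returning sketch -- the tab delimiter is
# an assumption based on how the batch results are parsed later:
import csv

def csv_writer(path, header):
    # open the output file, emit the header row, and return both handles
    # so the caller can close the file once it is done writing
    fo = open(path, 'wb')
    writer = csv.writer(fo, delimiter='\t')
    writer.writerow(header)
    return fo, writer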
def main(options):
    # get available defined fields from the Luke request handler
    data = req('%s/admin/luke?wt=json&show=schema&indent=true' % options.query_endpoint,
               **dict(show='schema'))
    fields = data['schema']['fields'].keys()

    if EXISTING_TERMS_REPLACE or not os.path.exists('term-freqs.txt'):
        retrieve_term_freqs(options, fields)

    if EXISTING_TERMS_REPLACE or not os.path.exists('phrase-freqs.txt'):
        retrieve_pseudo_collocations(options, maxlen=2, stop_after_reaching=100000,
                                     output_name='phrase-freqs.txt')
    if EXISTING_TERMS_REPLACE or not os.path.exists('phrase5-freqs.txt'):
        retrieve_pseudo_collocations(options, maxlen=5, stop_after_reaching=100000,
                                     output_name='phrase5-freqs.txt')

    generate_field_queries(options)
    generate_wild_queries(options)
    generate_phrase_queries(options, length=2, input='phrase-freqs.txt')
    generate_phrase_queries(options, length=5, input='phrase5-freqs.txt')
    generate_fuzzy_queries(options, length=1, input='phrase-freqs.txt')
    generate_fuzzy_queries(options, length=2, input='phrase-freqs.txt')
    generate_near_queries(options, length=2, input='phrase5-freqs.txt')
    generate_near_queries(options, length=4, input='phrase5-freqs.txt')
    generate_boolean_queries(options, 'AND', length=5, input='phrase5-freqs.txt')
    generate_boolean_queries(options, 'AND', length=2, input='phrase5-freqs.txt')
    generate_boolean_queries(options, 'OR', length=5, input='phrase5-freqs.txt')
    generate_boolean_queries(options, 'OR', length=2, input='phrase5-freqs.txt')
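# The generate_*() query builders are not part of this excerpt. Purely as an
# illustration of their presumed shape, here is a hypothetical
# generate_phrase_queries() that turns the harvested collocations into quoted
# phrase queries (the real implementation and its output format may differ):
def generate_phrase_queries(options, length=2, input='phrase-freqs.txt'):
    # hypothetical: one quoted phrase query per harvested token
    fo, writer = csv_writer('phrase%s-queries.txt' % length, ['query'])
    for row in csv_reader(input, generic=True):
        if len(row) != 4:
            continue
        field, _, token, _ = row
        writer.writerow(['%s:"%s"' % (field, token)])
    fo.close()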
def retrieve_pseudo_collocations(options, max_time=600, maxlen=3,
                                 stop_after_reaching=100000, max_clauses=2,
                                 upper_limit='1.0', lower_limit='0.97',
                                 output_name='collocations-freqs.txt'):
    fo, writer = csv_writer(output_name, ['field', 'type', 'token', 'freq'])

    # collect the high-frequency terms we previously saved for the phrase fields
    terms = {}
    for fn in DISCOVER_PHRASES_FIELDS:
        terms[fn] = []
    for term in csv_reader('term-freqs.txt', generic=True):
        if len(term) != 4:
            continue
        if term[1] == 'high' and term[0] in DISCOVER_PHRASES_FIELDS:
            terms[term[0]].append(term[2])

    jobs = {}
    for fn in DISCOVER_PHRASES_FIELDS:
        if not terms[fn]:  # every field has a (possibly empty) entry; skip empty ones
            print 'skipping: %s as we have no data for it' % fn
            continue

        # register job
        rsp = req("%s/batch" % options.query_endpoint,
                  command="find-freq-phrases",
                  maxlen=maxlen,
                  upperLimit=upper_limit,
                  lowerLimit=lower_limit,
                  fields=fn,
                  maxClauses=max_clauses,
                  stopAfterReaching=stop_after_reaching,
                  )
        jobs[fn] = rsp['jobid']

        # first write terms to disk
        fi, tmpfile = tempfile.mkstemp()
        os.close(fi)  # we reopen by name below; release the raw descriptor
        fd = open(tmpfile, 'w')
        fd.write("\n".join(terms[fn]))
        fd.close()

        kwdata = dict(endpoint=options.query_endpoint, jobid=rsp['jobid'], tmpfile=tmpfile)
        run_cmd(["curl '%(endpoint)s/batch?command=receive-data&jobid=%(jobid)s'"
                 " --data-binary @%(tmpfile)s -H 'Content-type:text/txt; charset=utf-8'" % kwdata])

    # start processing
    rsp = req("%s/batch" % options.query_endpoint, command="start")

    # poll until every job finishes, fails, or we run out of time
    some_future = time.time() + max_time
    jobs_finished = {}
    while time.time() < some_future:
        if len(jobs) == 0:
            break
        for k, v in jobs.items():
            rsp = req("%s/batch" % options.query_endpoint, command="status", jobid=v)
            if rsp['job-status'] == 'failed':
                error("Failed executing: %s - %s" % (k, v))
            elif rsp['job-status'] == 'finished':
                print 'finished: %s' % k
                del jobs[k]
                jobs_finished[k] = v
            else:
                time.sleep(3)

    # download the results and merge them into the output CSV
    for k, v in jobs_finished.items():
        run_cmd(["curl -o %s '%s/batch?command=get-results&jobid=%s'"
                 % ('collocations.%s.freq' % k, options.query_endpoint, v)])
        with open('collocations.%s.freq' % k, 'r') as c_file:
            for line in c_file:
                data = line.strip().split('\t')
                if len(data) > 1:
                    writer.writerow([k, 'high', data[0], data[1]])
        os.remove('collocations.%s.freq' % k)
    fo.close()
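# run_cmd() just shells out (here: to curl). A minimal sketch, assuming it
# wraps subprocess and raises on a non-zero exit code:
import subprocess

def run_cmd(args):
    cmd = ' '.join(args)
    print 'running: %s' % cmd
    subprocess.check_call(cmd, shell=True)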
def retrieve_pseudo_collocations(options, maxlen=[3], stop_after_reaching=100000,
                                 max_clauses=2, output_name='collocations-freqs.txt'):
    maxlen = maxlen[:]  # work on a copy; lengths are removed as their tally fills up

    # collect the high-frequency terms we previously saved for the phrase fields
    terms = {}
    for fn in DISCOVER_PHRASES_FIELDS:
        terms[fn] = []
    fields = set()
    for term in csv_reader('term-freqs.txt', generic=True):
        if len(term) != 4:
            continue
        if term[1] == 'high' and term[0] in DISCOVER_PHRASES_FIELDS:
            terms[term[0]].append(term[2])
            fields.add(term[0])

    # per-length counters and per-length/per-field frequency maps
    tally = {}
    freqs = {}
    for x in maxlen:
        tally[x] = 0
        freqs[x] = {}
        for f in fields:
            freqs[x][f] = defaultdict(int)

    for field, field_terms in terms.items():  # renamed to avoid shadowing `terms`
        if len(maxlen) == 0:
            break
        for term in field_terms:
            rsp = req('%s/query' % options.query_endpoint, **{
                'q': '{}:"{}"'.format(field, term),
                'wt': 'json',
                'hl': 'true',
                'hl.fl': field,
                'hl.requireFieldMatch': 'true',
                'hl.simple.pre': '<em>',
                'hl.simple.post': '</em>',
            })
            if rsp['response'].get('numFound', 0) > 0:
                hls = extract_highlights(term, rsp['highlighting'])
                for f in maxlen[:]:  # iterate a copy; we may remove f below
                    for left in hls.get_all_left(f):
                        freqs[f][field][left] += 1
                        tally[f] += 1
                    for right in hls.get_all_right(f):
                        freqs[f][field][right] += 1
                        tally[f] += 1
                    if tally[f] >= stop_after_reaching:
                        maxlen.remove(f)

    # one output file per phrase length, e.g. phrase-freqs.txt.2
    for length, freqx in freqs.items():
        fo, writer = csv_writer('{}.{}'.format(output_name, length),
                                ['field', 'type', 'token', 'freq'])
        for field, vals in freqx.items():
            vs = sorted(vals.items(), key=lambda x: x[1], reverse=True)
            for v, freq in vs:
                try:
                    writer.writerow([field, 'high', v, freq])
                except UnicodeEncodeError:
                    try:
                        writer.writerow([field, 'high', v.encode('utf8'), freq])
                    except UnicodeEncodeError:
                        pass
        fo.close()
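# extract_highlights() is assumed to parse Solr's highlighting section and
# expose n-word windows around each <em>-marked hit. A hypothetical
# reconstruction matching the get_all_left()/get_all_right() calls above:
import re

class Highlights(object):
    def __init__(self, snippets):
        self.snippets = snippets

    def _windows(self, n, left):
        out = []
        for snippet in self.snippets:
            words = snippet.split()
            for i, w in enumerate(words):
                if '<em>' not in w:
                    continue
                # n-word window that ends (left) or starts (right) at the hit
                chunk = words[max(0, i - n + 1):i + 1] if left else words[i:i + n]
                if len(chunk) == n:
                    out.append(re.sub('</?em>', '', ' '.join(chunk)))
        return out

    def get_all_left(self, n):
        return self._windows(n, left=True)

    def get_all_right(self, n):
        return self._windows(n, left=False)

def extract_highlights(term, highlighting):
    # flatten Solr's {docid: {field: [snippet, ...]}} structure; `term` is kept
    # in the signature only to match the call site above
    snippets = []
    for doc in highlighting.values():
        for snips in doc.values():
            snippets.extend(snips)
    return Highlights(snippets)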