def _handle_tsnecoordinates_view(self, view, whoosh_query):
    """
    Handles a 2D t-SNE coordinates view for a query.

    Returns a dict of the form
    {'coordinates': [{'id': ..., 'coordinates': {'x': ..., 'y': ...}, 'text': ...}, ...]}
    with one entry per distinct hit id whose '2DtSNECoordinates' field yields at least one
    non-empty value. The result is stored in self.cache as JSON under a key derived from
    the view and the query; on a cache hit the JSON-decoded copy is returned instead of
    searching again.
    """
    # sort_keys makes the key independent of the view's dict ordering, so equal views always share one cache entry
    cache_key = hashlib.md5(json.dumps(view, sort_keys=True) + repr(whoosh_query)).hexdigest()
    cached_raw = self.cache.get(cache_key)
    if cached_raw is not None:
        return json.loads(cached_raw)

    coordinates = {}
    with self.whoosh_index.searcher() as searcher:
        hits = searcher.search(whoosh_query, limit=None)
        logger.debug(self.tracking_code + " whoosh search results: %s" % (repr(hits)))
        for hit in hits:
            if '2DtSNECoordinates' not in hit:
                continue
            refpoints = whooshutils.split_keywords(hit['2DtSNECoordinates'])
            hit_id = hit['id']
            sentence = hit['sentence']
            for refpoint in refpoints:
                if refpoint:
                    coordinate_splits = whooshutils.split_keywords(refpoint)
                    # Keyed by hit id, so the last non-empty value seen for an id wins
                    coordinates[hit_id] = {'x': coordinate_splits[0], 'y': coordinate_splits[1], 'text': sentence}

    result = {
        'coordinates': [{'id': i, 'coordinates': {'x': p['x'], 'y': p['y']}, 'text': p['text']} for i, p in coordinates.iteritems()]
    }
    self.cache.set(cache_key, json.dumps(result))
    return result
def _handle_tsnecoordinates_view(self, view, whoosh_query):
    """
    Handles a 2D t-SNE coordinates view for a query.

    Searches the index without a result limit and collects, per hit id, the x/y pair and
    sentence text of hits that contain a '2DtSNECoordinates' field. Returns
    {'coordinates': [{'id': ..., 'coordinates': {'x': ..., 'y': ...}, 'text': ...}, ...]}.
    """
    points_by_id = {}
    with self.whoosh_index.searcher() as searcher:
        hits = searcher.search(whoosh_query, limit=None)
        print >> sys.stderr, "whoosh search results: %s" % (repr(hits))
        for hit in hits:
            if '2DtSNECoordinates' not in hit:
                continue
            raw_points = whooshutils.split_keywords(hit['2DtSNECoordinates'])
            hit_id = hit['id']
            text = hit['sentence']
            for raw_point in raw_points:
                if not raw_point:
                    continue
                parts = whooshutils.split_keywords(raw_point)
                # Same key for every value of this hit: the last non-empty one is kept
                points_by_id[hit_id] = {'x': parts[0], 'y': parts[1], 'text': text}

    entries = []
    for hit_id, point in points_by_id.iteritems():
        entries.append({
            'id': hit_id,
            'coordinates': {'x': point['x'], 'y': point['y']},
            'text': point['text'],
        })
    return {'coordinates': entries}
def _handle_plottimeline_view(self, view, whoosh_query):
    """
    Handles a plot timeline view for a query.

    Reads 'clusterField' and 'entities' (a mapping of field name to values) from the view.
    If the view has a 'cooccurrences' key ('or' selects a disjunctive match of the entities,
    any other value a conjunctive one), the most frequent co-occurring values of the view's
    'cooccurrenceFields' are added to the entities, and the result also carries
    'numCooccurringEntities' and 'numIncludedCooccurringEntities'.

    Returns a dict whose 'timeline' entry maps entity field -> entity value -> year -> list
    of cluster field values found in hits that contain that entity value.
    """
    def find_cooccurrences(entities, cooc_fields, need_field, is_disjunctive):
        """
        Counts the values of cooc_fields in hits that match the entities and contain
        need_field, leaving out values that are already listed in entities. Returns the
        ((field, value), count) pairs with the highest counts, capped at
        self.plottimeline_max_cooccurring_entities, and the total number of counted pairs.
        """
        op = whoosh.query.Or if is_disjunctive else whoosh.query.And
        rel_query = op([whoosh.query.Term(ef, ev) for ef, evs in entities.iteritems() for ev in evs])
        cooc_counts = {}
        with self.whoosh_index.searcher() as searcher:
            hits = searcher.search(whoosh.query.And([whoosh_query, rel_query]), limit=None)
            for hit in hits:
                if need_field not in hit:
                    continue
                for cooc_field in cooc_fields:
                    # A hit need not contain every co-occurrence field; skip missing ones instead of raising KeyError
                    if cooc_field not in hit:
                        continue
                    known_values = entities.get(cooc_field, ())
                    for value in whooshutils.split_keywords(hit[cooc_field]):
                        if value not in known_values:
                            key = (cooc_field, value)
                            cooc_counts[key] = cooc_counts.get(key, 0) + 1
        ranked = sorted(cooc_counts.iteritems(), key=lambda item: item[1], reverse=True)
        return ranked[:self.plottimeline_max_cooccurring_entities], len(ranked)

    result = {}
    cluster_field = view['clusterField']
    entities = view['entities']
    if 'cooccurrences' in view:
        is_disjunctive = view['cooccurrences'] == 'or'
        cooc_entities, num_total_coocs = find_cooccurrences(entities, set(view['cooccurrenceFields']), cluster_field, is_disjunctive)
        # Work on a copy with set values so the view's own entity collections are left untouched
        entities = dict((ef, set(evs)) for ef, evs in entities.iteritems())
        for (entity_field, entity_value), entity_count in cooc_entities:
            entities.setdefault(entity_field, set()).add(entity_value)
        result['numCooccurringEntities'] = num_total_coocs
        result['numIncludedCooccurringEntities'] = len(cooc_entities)

    # Checking for cluster_field per hit below seems to be slightly faster (empirically) than including Every(cluster_field) in the query
    rel_query = whoosh.query.Or([whoosh.query.Term(ef, ev) for ef, evs in entities.iteritems() for ev in evs])
    timeline = dict((ef, dict((ev, {}) for ev in evs)) for ef, evs in entities.iteritems())
    with self.whoosh_index.searcher() as searcher:
        hits = searcher.search(whoosh.query.And([whoosh_query, rel_query]), limit=None)
        for hit in hits:
            if cluster_field not in hit:
                continue
            # NOTE(review): assumes every hit that has cluster_field also has an integer-like 'year' -- confirm
            year = int(hit['year'])
            cluster_values = set(whooshutils.split_keywords(hit[cluster_field]))
            for entity_field, entity_values in entities.iteritems():
                # The query is an Or over all entities, so a hit may lack some of the entity fields
                if entity_field not in hit:
                    continue
                hit_entity_values = set(whooshutils.split_keywords(hit[entity_field]))
                for entity_value in entity_values:
                    if entity_value in hit_entity_values:
                        timeline[entity_field][entity_value].setdefault(year, set()).update(cluster_values)

    # Turn the per-year sets into lists for the returned structure
    for entity_field, entity_values in entities.iteritems():
        field_timeline = timeline[entity_field]
        for entity_value in entity_values:
            field_timeline[entity_value] = dict((y, list(cvs)) for y, cvs in field_timeline[entity_value].iteritems())
    result['timeline'] = timeline
    return result
def find_cooccurrences(entities, cooc_fields, need_field, is_disjunctive):
    """
    Counts the values of cooc_fields in hits that match the entities and contain need_field,
    leaving out values that are already listed in entities.

    entities maps field names to values; they are combined with Or when is_disjunctive is
    true and with And otherwise, and the combination is And-ed with whoosh_query. Both
    self and whoosh_query are not parameters: they are taken from the enclosing scope.

    Returns a pair: the ((field, value), count) items with the highest counts, capped at
    self.plottimeline_max_cooccurring_entities, and the total number of counted items.
    """
    op = whoosh.query.Or if is_disjunctive else whoosh.query.And
    rel_query = op([
        whoosh.query.Term(ef, ev)
        for ef, evs in entities.iteritems()
        for ev in evs
    ])
    cooc_counts = {}
    with self.whoosh_index.searcher() as searcher:
        hits = searcher.search(whoosh.query.And([whoosh_query, rel_query]), limit=None)
        for hit in hits:
            if need_field not in hit:
                continue
            for cooc_field in cooc_fields:
                # A hit need not contain every co-occurrence field; skip missing ones instead of raising KeyError
                if cooc_field not in hit:
                    continue
                known_values = entities.get(cooc_field, ())
                for value in whooshutils.split_keywords(hit[cooc_field]):
                    if value not in known_values:
                        key = (cooc_field, value)
                        cooc_counts[key] = cooc_counts.get(key, 0) + 1
    ranked = sorted(cooc_counts.iteritems(), key=lambda item: item[1], reverse=True)
    return ranked[:self.plottimeline_max_cooccurring_entities], len(ranked)
def generate_field_counts(self, response, views, whoosh_query):
    """
    Handles all the count by field value views for a query. All values of a multiple-valued field are counted.

    For every view id in views, response[view_id] is set to {'counts': [...]}, where the list
    holds (value, count) pairs sorted by descending count. A value is counted at most once
    per hit. A view's 'field' is replaced by backend_domain_config.field_name_aliases(field)
    when that returns a truthy value. Nothing is returned; response is modified in place.
    """
    print >> sys.stderr, "generating field counts for fields: %s" % (
        ' '.join(v['field'] for v in views.itervalues()))
    # Resolve each view's field once up front instead of once per view for every hit
    view_fields = {}
    for view_id, view in views.iteritems():
        field = view['field']
        view_fields[view_id] = backend_domain_config.field_name_aliases(field) or field
        response[view_id] = {'counts': {}}
    with self.whoosh_index.searcher() as searcher:
        hits = searcher.search(whoosh_query, limit=None)
        print >> sys.stderr, "whoosh search results: %s" % (repr(hits))
        for hit in hits:
            for view_id, field in view_fields.iteritems():
                if field in hit:
                    counts = response[view_id]['counts']
                    # The set ensures a value repeated within one hit is counted only once
                    for value in set(whooshutils.split_keywords(hit[field])):
                        counts[value] = counts.get(value, 0) + 1
    for view_id in views:
        response[view_id]['counts'] = sorted(
            response[view_id]['counts'].items(), key=lambda item: item[1], reverse=True)
def generate_field_counts(self, response, views, whoosh_query):
    """
    Handles all the count by field value views for a query. All values of a multiple-valued field are counted.

    For every view id in views, response[view_id] is set to {'counts': [...]}, where the list
    holds (value, count) pairs sorted by descending count. A value is counted at most once
    per hit. A view's 'field' is replaced by domain_config.field_name_aliases(field) when
    that returns a truthy value. Nothing is returned; response is modified in place.
    """
    logger.debug(self.tracking_code + " generating field counts for fields: %s" % (' '.join(v['field'] for v in views.itervalues())))
    # Resolve each view's field once up front instead of once per view for every hit
    view_fields = {}
    for view_id, view in views.iteritems():
        field = view['field']
        view_fields[view_id] = domain_config.field_name_aliases(field) or field
        response[view_id] = {'counts': {}}
    logger.debug(self.tracking_code + " whoosh_query: " + repr(whoosh_query))
    logger.debug(self.tracking_code + " view: " + json.dumps(views))
    with self.whoosh_index.searcher() as searcher:
        hits = searcher.search(whoosh_query, limit=None)
        logger.info(self.tracking_code + " whoosh search results: %s" % (repr(hits)))
        for hit in hits:
            for view_id, field in view_fields.iteritems():
                if field in hit:
                    counts = response[view_id]['counts']
                    # The set ensures a value repeated within one hit is counted only once
                    for value in set(whooshutils.split_keywords(hit[field])):
                        counts[value] = counts.get(value, 0) + 1
    for view_id in views:
        response[view_id]['counts'] = sorted(
            response[view_id]['counts'].items(), key=lambda item: item[1], reverse=True)
def generate_field_counts(self, response, views, whoosh_query):
    """
    Handles all the count by field value views for a query. All values of a multiple-valued field are counted.

    Each response[view_id] becomes {'counts': [(value, count), ...]} ordered by descending
    count, where count is the number of hits whose view field contains the value.
    """
    for view_id in views.iterkeys():
        response[view_id] = {'counts': {}}

    with self.whoosh_index.searcher() as searcher:
        hits = searcher.search(whoosh_query, limit=None)
        print >> sys.stderr, "whoosh search results: %s" % (repr(hits))
        for hit in hits:
            for view_id, view in views.iteritems():
                field = view['field']
                if field not in hit:
                    continue
                tally = response[view_id]['counts']
                # Distinct values only, so one hit adds at most one to each value
                for value in set(whooshutils.split_keywords(hit[field])):
                    tally[value] = tally.get(value, 0) + 1

    for view_id in views.iterkeys():
        entry = response[view_id]
        entry['counts'] = sorted(entry['counts'].items(), key=lambda pair: pair[1], reverse=True)
def _handle_tsnecoordinates_view(self, view, whoosh_query):
    """
    Handles a 2D t-SNE coordinates view for a query.

    Every hit containing a '2DtSNECoordinates' field contributes, under its id, the x and y
    parts of its coordinate value together with its sentence. The result has the form
    {'coordinates': [{'id': ..., 'coordinates': {'x': ..., 'y': ...}, 'text': ...}, ...]}.
    """
    located = {}
    with self.whoosh_index.searcher() as searcher:
        hits = searcher.search(whoosh_query, limit=None)
        print >> sys.stderr, "whoosh search results: %s" % (repr(hits))
        for hit in hits:
            if '2DtSNECoordinates' in hit:
                values = whooshutils.split_keywords(hit['2DtSNECoordinates'])
                doc_id = hit['id']
                doc_text = hit['sentence']
                # Empty values are ignored; each remaining one replaces the entry for this id
                for value in (v for v in values if v):
                    xy = whooshutils.split_keywords(value)
                    located[doc_id] = {'x': xy[0], 'y': xy[1], 'text': doc_text}

    def as_entry(doc_id, point):
        return {
            'id': doc_id,
            'coordinates': {'x': point['x'], 'y': point['y']},
            'text': point['text'],
        }

    return {'coordinates': [as_entry(doc_id, point) for doc_id, point in located.iteritems()]}
def _handle_tsnecoordinates_view(self, view, whoosh_query):
    """
    Handles a 2D t-SNE coordinates view for a query.

    Returns a dict of the form
    {'coordinates': [{'id': ..., 'coordinates': {'x': ..., 'y': ...}, 'text': ...}, ...]}
    with one entry per distinct hit id whose '2DtSNECoordinates' field yields at least one
    non-empty value. The result is stored in self.cache as JSON under a key derived from
    the view and the query; on a cache hit the JSON-decoded copy is returned instead of
    searching again.
    """
    # Serialise the view with sorted keys: without it the key depends on dict ordering
    # and equal views could end up under different cache entries.
    key_material = json.dumps(view, sort_keys=True) + repr(whoosh_query)
    cache_key = hashlib.md5(key_material).hexdigest()
    cached_raw = self.cache.get(cache_key)
    if cached_raw is not None:
        return json.loads(cached_raw)

    coordinates = {}
    with self.whoosh_index.searcher() as searcher:
        hits = searcher.search(whoosh_query, limit=None)
        logger.debug(self.tracking_code + " whoosh search results: %s" %
                     (repr(hits)))
        for hit in hits:
            if '2DtSNECoordinates' not in hit:
                continue
            refpoints = whooshutils.split_keywords(
                hit['2DtSNECoordinates'])
            hit_id = hit['id']
            sentence = hit['sentence']
            for refpoint in refpoints:
                if refpoint:
                    coordinate_splits = whooshutils.split_keywords(
                        refpoint)
                    # Keyed by hit id, so the last non-empty value seen for an id wins
                    coordinates[hit_id] = {
                        'x': coordinate_splits[0],
                        'y': coordinate_splits[1],
                        'text': sentence
                    }

    result = {
        'coordinates': [{
            'id': i,
            'coordinates': {
                'x': p['x'],
                'y': p['y']
            },
            'text': p['text']
        } for i, p in coordinates.iteritems()]
    }
    self.cache.set(cache_key, json.dumps(result))
    return result
def find_cooccurrences(entities, cooc_fields, need_field, is_disjunctive):
    """
    Counts the values of cooc_fields in hits that match the entities and contain need_field,
    leaving out values that are already listed in entities.

    entities maps field names to values; they are combined with Or when is_disjunctive is
    true and with And otherwise, and the combination is And-ed with whoosh_query. Both
    self and whoosh_query are not parameters: they are taken from the enclosing scope.

    Returns a pair: the ((field, value), count) items with the highest counts, capped at
    self.plottimeline_max_cooccurring_entities, and the total number of counted items.
    """
    op = whoosh.query.Or if is_disjunctive else whoosh.query.And
    rel_query = op([whoosh.query.Term(ef, ev) for ef, evs in entities.iteritems() for ev in evs])
    cooc_counts = {}
    with self.whoosh_index.searcher() as searcher:
        hits = searcher.search(whoosh.query.And([whoosh_query, rel_query]), limit=None)
        for hit in hits:
            if need_field not in hit:
                continue
            # Only fields present in this hit can contribute; indexing a missing one would raise KeyError
            for cooc_field in (f for f in cooc_fields if f in hit):
                known_values = entities.get(cooc_field, ())
                for value in whooshutils.split_keywords(hit[cooc_field]):
                    if value not in known_values:
                        key = (cooc_field, value)
                        cooc_counts[key] = cooc_counts.get(key, 0) + 1
    ranked = sorted(cooc_counts.iteritems(), key=lambda item: item[1], reverse=True)
    return ranked[:self.plottimeline_max_cooccurring_entities], len(ranked)
def _handle_referencepointlinks_view(self, view, whoosh_query):
    """
    Handles a reference point links view for a query.

    For every hit, each pair of different reference points taken from two positions of the
    hit's 'referencePoints' field adds one to the count of that pair; a reference point
    repeated within a hit therefore contributes once per position. Hits without a
    'referencePoints' field are skipped.

    Returns {'links': [{'refpoints': (a, b), 'count': n}, ...]} where a < b.
    """
    link_counts = {}
    with self.whoosh_index.searcher() as searcher:
        hits = searcher.search(whoosh_query, limit=None)
        logger.debug(self.tracking_code + " whoosh search results: %s" % (repr(hits)))
        for hit in hits:
            # A hit without reference points has no links; indexing it would raise KeyError and abort the whole view
            if 'referencePoints' not in hit:
                continue
            refpoints = whooshutils.split_keywords(hit['referencePoints'])
            for i, refpoint1 in enumerate(refpoints):
                for refpoint2 in refpoints[i+1:]:
                    if refpoint1 != refpoint2:
                        # Use lexicographic order to guarantee unique choices of two distinct reference points
                        pair = (refpoint1, refpoint2) if refpoint1 < refpoint2 else (refpoint2, refpoint1)
                        link_counts[pair] = link_counts.get(pair, 0) + 1
    return {
        'links': [{'refpoints': p, 'count': c} for (p, c) in link_counts.iteritems()]
    }
def _handle_referencepointlinks_view(self, view, whoosh_query):
    """
    Handles a reference point links view for a query.

    For every hit, each pair of different reference points taken from two positions of the
    hit's 'referencePoints' field adds one to the count of that pair; a reference point
    repeated within a hit therefore contributes once per position. Hits without a
    'referencePoints' field are skipped.

    Returns {'links': [{'refpoints': (a, b), 'count': n}, ...]} where a < b.
    """
    link_counts = {}
    with self.whoosh_index.searcher() as searcher:
        hits = searcher.search(whoosh_query, limit=None)
        print >> sys.stderr, "whoosh search results: %s" % (repr(hits))
        for hit in hits:
            # A hit without reference points has no links; indexing it would raise KeyError and abort the whole view
            if 'referencePoints' not in hit:
                continue
            refpoints = whooshutils.split_keywords(hit['referencePoints'])
            for i, refpoint1 in enumerate(refpoints):
                for refpoint2 in refpoints[i + 1:]:
                    if refpoint1 != refpoint2:
                        # Use lexicographic order to guarantee unique choices of two distinct reference points
                        pair = (refpoint1, refpoint2) if refpoint1 < refpoint2 else (
                            refpoint2, refpoint1)
                        link_counts[pair] = link_counts.get(pair, 0) + 1
    return {
        'links': [{
            'refpoints': p,
            'count': c
        } for (p, c) in link_counts.iteritems()]
    }
def _handle_plottimeline_view(self, view, whoosh_query):
    """
    Handles a plot timeline view for a query.

    Reads 'clusterField' and 'entities' (a mapping of field name to values) from the view.
    If the view has a 'cooccurrences' key, its value must be 'and' or 'or' (anything else
    raises KeyError); the most frequent co-occurring values of the view's
    'cooccurrenceFields' are then added to the entities, and the result also carries
    'numCooccurringEntities' and 'numIncludedCooccurringEntities'.

    Returns a dict whose 'timeline' entry maps entity field -> entity value -> year -> list
    of cluster field values found in hits that contain that entity value.
    """
    def find_cooccurrences(entities, cooc_fields, need_field, is_disjunctive):
        """
        Counts the values of cooc_fields in hits that match the entities and contain
        need_field, leaving out values that are already listed in entities. Returns the
        ((field, value), count) pairs with the highest counts, capped at
        self.plottimeline_max_cooccurring_entities, and the total number of counted pairs.
        """
        op = whoosh.query.Or if is_disjunctive else whoosh.query.And
        rel_query = op([
            whoosh.query.Term(ef, ev)
            for ef, evs in entities.iteritems()
            for ev in evs
        ])
        cooc_counts = {}
        with self.whoosh_index.searcher() as searcher:
            hits = searcher.search(whoosh.query.And(
                [whoosh_query, rel_query]), limit=None)
            for hit in hits:
                if need_field not in hit:
                    continue
                for cooc_field in cooc_fields:
                    # A hit need not contain every co-occurrence field; skip missing ones instead of raising KeyError
                    if cooc_field not in hit:
                        continue
                    known_values = entities.get(cooc_field, ())
                    for value in whooshutils.split_keywords(hit[cooc_field]):
                        if value not in known_values:
                            key = (cooc_field, value)
                            cooc_counts[key] = cooc_counts.get(key, 0) + 1
        ranked = sorted(cooc_counts.iteritems(),
                        key=lambda item: item[1],
                        reverse=True)
        return ranked[:self.plottimeline_max_cooccurring_entities], len(ranked)

    result = {}
    cluster_field = view['clusterField']
    entities = view['entities']
    if 'cooccurrences' in view:
        is_disjunctive = {'and': False, 'or': True}[view['cooccurrences']]
        cooc_entities, num_total_coocs = find_cooccurrences(
            entities, set(view['cooccurrenceFields']), cluster_field,
            is_disjunctive)
        # Work on a copy with set values so the view's own entity collections are left untouched
        entities = dict((ef, set(evs)) for ef, evs in entities.iteritems())
        for (entity_field, entity_value), entity_count in cooc_entities:
            entities.setdefault(entity_field, set()).add(entity_value)
        result['numCooccurringEntities'] = num_total_coocs
        result['numIncludedCooccurringEntities'] = len(cooc_entities)

    # Checking for cluster_field per hit below seems to be slightly faster (empirically) than including Every(cluster_field) in the query
    rel_query = whoosh.query.Or([
        whoosh.query.Term(ef, ev)
        for ef, evs in entities.iteritems()
        for ev in evs
    ])
    timeline = dict((ef, dict((ev, {}) for ev in evs))
                    for ef, evs in entities.iteritems())
    with self.whoosh_index.searcher() as searcher:
        hits = searcher.search(whoosh.query.And([whoosh_query, rel_query]),
                               limit=None)
        for hit in hits:
            if cluster_field not in hit:
                continue
            # NOTE(review): assumes every hit that has cluster_field also has an integer-like 'year' -- confirm
            year = int(hit['year'])
            cluster_values = set(
                whooshutils.split_keywords(hit[cluster_field]))
            for entity_field, entity_values in entities.iteritems():
                # The query is an Or over all entities, so a hit may lack some of the entity fields
                if entity_field not in hit:
                    continue
                hit_entity_values = set(
                    whooshutils.split_keywords(hit[entity_field]))
                for entity_value in entity_values:
                    if entity_value in hit_entity_values:
                        timeline[entity_field][entity_value].setdefault(
                            year, set()).update(cluster_values)

    # Turn the per-year sets into lists for the returned structure
    for entity_field, entity_values in entities.iteritems():
        field_timeline = timeline[entity_field]
        for entity_value in entity_values:
            field_timeline[entity_value] = dict(
                (y, list(cvs))
                for y, cvs in field_timeline[entity_value].iteritems())
    result['timeline'] = timeline
    return result