def segment_unit(self, unit, args):
    '''
    Marks up, in the plain content of a unit, the first occurrence of each
    active pattern and records which patterns were found.

    unit: the text unit to segment. This method sets:
        unit.patterns: list of [patternid, segment] pairs; the segment is
            '' for a pattern which has a regex but was not marked up
        unit.match_conditions: False if an 'include' pattern was not found
            or an 'exclude' pattern was found, True otherwise
        unit.plain_content: every marked-up segment is wrapped in
            <span class="m"> (<span class="m ms"> if the pattern is
            highlighted)
    args: not used by this method

    Also increments self.stats['patterns'] and self.stats['groups'] for
    the patterns / groups found and, for highlighted patterns when
    'variants' is in self.toreturn, the counters in self.variants.
    '''
    patterns = self.get_patterns()
    unit.patterns = []
    found_groups = {}
    # characters introduced by the mark-up below: the span tags and the
    # '_' which temporarily stands for a space inside a marked-up segment
    rgx_matched = re.compile(u'[_<>]')
    unit.match_conditions = True

    # presumably populates unit.plain_content, which is read below
    self.get_plain_content_from_unit(unit)

    for pattern in patterns:
        patternid = pattern['id']
        if not patternid:
            continue
        condition = self.get_condition(pattern)
        if condition == 'ignore':
            continue

        # get regex from pattern
        rgx = self.get_regex_from_pattern(patterns, patternid)
        hilited = patternid in self.options['hilite']
        if not rgx:
            continue

        def markup_segment(match):
            segment = match.group(0)
            # leave the text untouched if it overlaps a segment that has
            # already been marked up
            # NOTE(review): the search starts at the second character of
            # the segment - confirm that skipping the first one is intended
            if rgx_matched.search(segment, 1):
                return segment

            # mark it up
            span = '<span class="m ms">' if hilited else '<span class="m">'
            unit.patterns.append([patternid, segment])
            rep = u'%s%s</span>' % (span, segment.replace(' ', '_'))

            # add variant
            if hilited and ('variants' in self.toreturn):
                variant = self.get_variant_from_segment(segment)
                self.variants[variant] = self.variants.get(variant, 0) + 1

            return rep

        # apply regex to unit (first occurrence only)
        len_before = len(unit.plain_content)
        unit.plain_content = rgx.sub(
            markup_segment, unit.plain_content, count=1)
        # a regex match does not count if markup_segment() left it as is;
        # the mark-up always makes the content longer
        found = len(unit.plain_content) != len_before

        if (condition == 'include' and not found) or \
                (condition == 'exclude' and found):
            unit.match_conditions = False

        if found:
            # the group is the pattern key without its numeric suffix
            found_groups[re.sub(u'-\\d+$', '', pattern['key'])] = 1
            dputils.inc_counter(self.stats['patterns'], pattern['id'], 1)
        else:
            unit.patterns.append([patternid, ''])

    # each group is counted at most once per unit
    for group in found_groups:
        self.stats['groups'][group] += 1

    if found_groups:
        # turn the temporary '_' back into spaces
        unit.plain_content = unit.plain_content.replace('_', ' ')
def get_elementid_from_xml_element(element, idcount, as_string=False):
    '''
    Returns the elementid as a list
    e.g. [(u'', u'clause'), (u'type', u'disposition')]
    or None if the element is not one of the white-listed types
    (clause, location, person) or has no data-dpt attribute at all.

    element: an xml element (etree)
    idcount: a dictionary, new for each enclosing text unit.
        Used to know which occurrence of an element we are seeing
        and generate a unique id. E.g. two same titles ('sheriff')
        marked up in the same way within the same entry
        => we need a count to differentiate them.
        We add (u'@o', u'2') to the second occurrence, etc.
    as_string: not used by this function.
    '''
    from django.utils.text import slugify

    element_text = utils.get_xml_element_text(element)

    # eg. parts: [(u'', u'clause'), (u'type', u'disposition')]
    # i.e. the data-dpt* attributes (except data-dpt-cat) without their
    # 'data-dpt' / 'data-dpt-' prefix
    parts = [
        (unicode(re.sub('data-dpt-?', '', k)), unicode(v))
        for k, v in element.attrib.iteritems()
        if k.startswith('data-dpt') and k != 'data-dpt-cat'
    ]

    # white list to filter the elements.
    # parts is empty when the element has no data-dpt attribute, so it is
    # tested before looking at the first part.
    if not parts or parts[0][1] not in ('clause', 'location', 'person'):
        return None

    # short texts are part of the id
    element_text = slugify(u'%s' % element_text.lower())
    if 0 < len(element_text) < 20:
        parts.append(['@text', element_text])

    order = dputils.inc_counter(idcount, repr(parts))
    if order > 1:
        # add (u'@o', u'2') if it is the 2nd occurence of this elementid
        parts.append((u'@o', u'%s' % order))

    return parts
def draw_internal(self):
    '''
    Builds self.drawing from self.points.

    Each point is a list: point[0] is the x range [from, to],
    point[1] the y value (or a list of y values) and point[2] the set
    of layers the point belongs to.

    Side effects: sets self.context['canvas'] and self.drawing,
    calls self.init_bands(), and updates self.mins[0], self.maxs[0],
    self.histogram and self.cat_hits. The points are modified in place
    (y replaced by its band, layers converted to a list).
    '''
    self.context['canvas'] = {'width': 500, 'height': 500}
    self.drawing = {
        'points': [],
        'x': [],
        'y': [],
        'bar_height': self.bar_height,
        'font_size': self.font_size,
        'label_margin': self.margin,
    }
    points = self.drawing['points']

    queries = self.queries.get_queries()
    self.drawing['colors'] = [query.get_color() for query in queries]
    self.drawing['summaries'] = [query.get_summary() for query in queries]

    # {'agreement': [10, 20]}
    self.init_bands()

    # process all records, ordered by the lower bound of their x range
    points_order = sorted(
        self.points.keys(),
        key=lambda cid: self.points[cid][0][0]
    )
    for cid in points_order:
        point = self.points[cid]
        found = any(point[2])
        x = point[0]
        ys = point[1]

        # update the min / max x
        if x[0] is not None and x[0] not in MAX_DATE_RANGE and (
                self.mins[0] is None or self.mins[0] > x[0]):
            self.mins[0] = x[0]
        if x[1] is not None and x[1] not in MAX_DATE_RANGE and (
                self.maxs[0] is None or self.maxs[0] < x[1]):
            self.maxs[0] = x[1]

        # update histogram: for each x covered by the range, one counter
        # per combination of layers
        layers_key = ','.join(['%s' % li for li in sorted(point[2])])
        for xi in range(x[0], x[1] + 1):
            hist = self.histogram.setdefault(xi, {})
            inc_counter(hist, layers_key)

        # convert y to numerical value
        if not isinstance(ys, list):
            ys = [ys]
        for v in ys:
            y = self.bands.get(v, 0)

            # add the points to the stack
            # NOTE(review): the same point list is updated and appended for
            # every y value, so with several y values all the appended
            # entries are one object holding the last y - confirm intended
            point[0] = x
            point[1] = y
            # convert layers from set to list to allow json serialisation
            point[2] = list(point[2])
            self.stack_point(point)
            points.append(point)

            # increment hits per category: [all points, points with a
            # truthy layer]
            hits = self.cat_hits.setdefault(v, [0, 0])
            hits[0] += 1
            if found:
                hits[1] += 1
def draw_internal(self):
    '''
    Builds self.drawing from self.points.

    Each point is a list: point[0] is the raw x value (a date, a locus
    or anything else, depending on self.fields[0]), point[1] the y value
    (or a list of y values) and point[2] the set of layers the point
    belongs to.

    Side effects: sets self.context['canvas'] and self.drawing,
    calls self.init_bands(), and updates self.mins[0], self.maxs[0],
    self.histogram, self.histogram_height and self.cat_hits.
    The points are modified in place (x replaced by a numerical range,
    y by its band, layers converted to a list).
    '''
    self.context['canvas'] = {'width': 500, 'height': 500}
    self.drawing = {
        'points': [],
        'x': [],
        'y': [],
        'bar_height': self.bar_height,
        'font_size': self.font_size,
        'label_margin': self.margin,
    }
    points = self.drawing['points']
    self.drawing['colors'] = [
        query.get_color() for query in self.queries.get_queries()
    ]

    # {'agreement': [10, 20]}
    self.init_bands()

    from digipal.utils import get_range_from_date, MAX_DATE_RANGE

    # process all records
    for point in self.points.values():
        found = any(point[2])
        x = point[0]
        ys = point[1]

        # convert x to numerical value
        if self.fields[0]['type'] == 'date':
            x = get_range_from_date(x)
        elif self.fields[0]['key'] == 'locus':
            # 12v => 25
            # i.e. twice the number, plus one if the last character is 'v'
            n = int(x[0:-1]) * 2
            if x[-1] == 'v':
                n += 1
            x = [n] * 2
        else:
            # ()TODO: other type than date for x
            x = 0

        # turn all x into range
        if not isinstance(x, list):
            x = [x, x]

        # update the min / max x
        if x[0] is not None and x[0] not in MAX_DATE_RANGE and (
                self.mins[0] is None or self.mins[0] > x[0]):
            self.mins[0] = x[0]
        if x[1] is not None and x[1] not in MAX_DATE_RANGE and (
                self.maxs[0] is None or self.maxs[0] < x[1]):
            self.maxs[0] = x[1]

        # update histogram: for each x covered by the range, one counter
        # per layer; histogram_height keeps the highest counter seen
        for xi in range(x[0], x[1] + 1):
            hist = self.histogram.setdefault(xi, {})
            for layer in point[2]:
                self.histogram_height = max(
                    inc_counter(hist, layer), self.histogram_height)

        # convert y to numerical value
        if not isinstance(ys, list):
            ys = [ys]
        for v in ys:
            y = self.bands.get(v, 0)

            # add the points to the stack
            # NOTE(review): the same point list is updated and appended for
            # every y value, so with several y values all the appended
            # entries are one object holding the last y - confirm intended
            point[0] = x
            point[1] = y
            # convert layers from set to list to allow json serialisation
            point[2] = list(point[2])
            self.stack_point(point)
            points.append(point)

            # increment hits per category: [all points, points with a
            # truthy layer]
            hits = self.cat_hits.setdefault(v, [0, 0])
            hits[0] += 1
            if found:
                hits[1] += 1
def draw_internal(self):
    '''
    Builds self.drawing from self.points.

    Each point is a list: point[0] is the x range [from, to],
    point[1] the y value (or a list of y values) and point[2] the set
    of layers the point belongs to.

    Side effects: sets self.context['canvas'] and self.drawing,
    calls self.init_bands(), and updates self.mins[0], self.maxs[0],
    self.histogram and self.cat_hits. The points are modified in place
    (y replaced by its band, layers converted to a list).
    '''
    self.context['canvas'] = {'width': 500, 'height': 500}
    self.drawing = {
        'points': [],
        'x': [],
        'y': [],
        'bar_height': self.bar_height,
        'font_size': self.font_size,
        'label_margin': self.margin,
    }
    points = self.drawing['points']

    queries = self.queries.get_queries()
    self.drawing['colors'] = [query.get_color() for query in queries]
    self.drawing['summaries'] = [query.get_summary() for query in queries]

    # {'agreement': [10, 20]}
    self.init_bands()

    # process all records, ordered by the lower bound of their x range
    points_order = sorted(
        self.points.keys(),
        key=lambda cid: self.points[cid][0][0]
    )
    for cid in points_order:
        point = self.points[cid]
        found = any(point[2])
        x = point[0]
        ys = point[1]

        # update the min / max x
        if x[0] is not None and x[0] not in MAX_DATE_RANGE and (
                self.mins[0] is None or self.mins[0] > x[0]):
            self.mins[0] = x[0]
        if x[1] is not None and x[1] not in MAX_DATE_RANGE and (
                self.maxs[0] is None or self.maxs[0] < x[1]):
            self.maxs[0] = x[1]

        # update histogram: for each x covered by the range, one counter
        # per combination of layers
        layers_key = ','.join(['%s' % li for li in sorted(point[2])])
        for xi in range(x[0], x[1] + 1):
            hist = self.histogram.setdefault(xi, {})
            inc_counter(hist, layers_key)

        # convert y to numerical value
        if not isinstance(ys, list):
            ys = [ys]
        for v in ys:
            y = self.bands.get(v, 0)

            # add the points to the stack
            # NOTE(review): the same point list is updated and appended for
            # every y value, so with several y values all the appended
            # entries are one object holding the last y - confirm intended
            point[0] = x
            point[1] = y
            # convert layers from set to list to allow json serialisation
            point[2] = list(point[2])
            self.stack_point(point)
            points.append(point)

            # increment hits per category: [all points, points with a
            # truthy layer]
            hits = self.cat_hits.setdefault(v, [0, 0])
            hits[0] += 1
            if found:
                hits[1] += 1