def command_unit(self):
    """List the units found in one TextContentXML record.

    Arguments, read with self.get_arg():
      1: id of the TextContentXML record. When empty, no id filter is
         applied and the first record of the query is used.
      2: unit type handed to get_all_units() (defaults to 'locus').
      3: optional location. When given, only the units accepted by
         dputils.is_unit_in_range() are listed and the repr of their
         whole content is printed as well.

    One line is printed per listed unit (unit id, content length, repr of
    the first 10 characters), then the number of listed units.
    """
    from digipal_text.models import TextContentXML
    from digipal_text.views.viewer import get_fragment_extent, get_all_units

    record_id = self.get_arg(1)
    lookup = {'id': record_id} if record_id else {}
    record = TextContentXML.objects.filter(**lookup).first()

    listed = 0
    if record:
        print(record)
        unit_type = self.get_arg(2, 'locus')
        location = self.get_arg(3, None)

        for unit in get_all_units(record.content, unit_type):
            unit_id = unit['unitid']
            # Skip the units that fall outside the requested location.
            if (location is not None and
                    not dputils.is_unit_in_range(unit_id, location)):
                continue

            listed += 1
            text = unit['content']
            print('%-10s %-5s %-10s' % (unit_id, len(text), repr(text[:10])))
            if location:
                print(repr(text))

    print('%s units' % listed)
def command_unit(self):
    """Print a summary of the units contained in a TextContentXML record.

    Arguments, read with self.get_arg():
      1: record id; if it is empty the query is not filtered and its
         first record is taken.
      2: type of unit to extract with get_all_units(), 'locus' by default.
      3: location (optional); restricts the listing to the units for
         which dputils.is_unit_in_range() is true and also prints the
         repr of the full content of each of them.

    Output: the record, one row per selected unit (id, length of the
    content, repr of its first 10 characters) and the count of rows.
    """
    from digipal_text.models import TextContentXML
    from digipal_text.views.viewer import get_fragment_extent, get_all_units

    criteria = {}
    rid = self.get_arg(1)
    if rid:
        criteria['id'] = rid
    text_xml = TextContentXML.objects.filter(**criteria).first()

    shown = 0
    if text_xml:
        print(text_xml)
        location_type = self.get_arg(2, 'locus')
        location = self.get_arg(3, None)
        row_format = '%-10s %-5s %-10s'

        for unit in get_all_units(text_xml.content, location_type):
            wanted = (location is None or
                      dputils.is_unit_in_range(unit['unitid'], location))
            if wanted:
                shown += 1
                row = (unit['unitid'], len(unit['content']),
                       repr(unit['content'][:10]))
                print(row_format % row)
                if location:
                    print(repr(unit['content']))

    print('%s units' % shown)
def command_search(self): if len(self.args) < 3: raise CommandError('Convert requires 2 arguments') from digipal.management.commands.utils import get_stats_from_xml_string from digipal_text.views.viewer import get_fragment_extent, get_all_units pattern = unicode(self.args[3]) #pattern = ur'.{1,30}ħ.{1,30}' pattern = ur'(?musi)#MSTART#(.*?)#MEND#' stats = {} cnt = 0 import regex as re all_entries = [] for tcx in TextContentXML.objects.filter( text_content__item_part_id=self.args[1], text_content__type__slug=self.args[2]): if 1: for match in re.findall(pattern, tcx.content): cnt += 1 if len(re.findall(ur'<p>', match)) > 1: print '>1' entries = re.findall(ur'"entry">(.*?)<', match) if entries: all_entries.extend(entries) if 0: units = get_all_units(tcx.content, 'entry') for unit in units: for match in re.findall(pattern, unit['content']): #print unit['unitid'], repr(match) #print repr(match) #print re.findall(ur'<p>', match) cnt += 1
def command_download(self): ret = ur'' recordid = self.args[1] unitid = '' if len(self.args) > 2: unitid = self.args[2] from digipal_text.models import TextContentXML from digipal_text.views.viewer import get_fragment_extent, get_all_units text_content_xml = TextContentXML.objects.get(id=recordid) content = text_content_xml.content suffix = '' if unitid: suffix = '-unit' units = get_all_units(content, 'entry') for unit in units: if unit['unitid'] == unitid: ret = ur'<root>%s</root>' % unit['content'] else: ret = content import regex if ret is None: ret = u'' # print repr(ret) file_name = 'tcx%s%s.xml' % (text_content_xml.id, suffix) from digipal.utils import write_file write_file(file_name, ret) print 'Written file %s ' % file_name
def command_download(self): ret = ur'' recordid = self.args[1] unitid = '' if len(self.args) > 2: unitid = self.args[2] from digipal_text.models import TextContentXML from digipal_text.views.viewer import get_fragment_extent, get_all_units text_content_xml = TextContentXML.objects.get(id=recordid) content = text_content_xml.content suffix = '' if unitid: suffix = '-unit' units = get_all_units(content, 'entry') for unit in units: if unit['unitid'] == unitid: ret = ur'<root>%s</root>' % unit['content'] else: ret = content import regex if ret is None: ret = u'' # ret = regex.sub(ur'(?musi)<span data-dpt="abbr">.*?</span>(<span data-dpt="exp">)', ur'\1', ret) # ret = regex.sub(ur'(?musi)<span data-dpt="hi" data-dpt-rend="su[pb]">(.*?)</span>', ur'\1', ret) # ret = regex.sub(ur'(?musi)<i>(.*?)</i>', ur'\1', ret) # print repr(ret) # for it in regex.findall('<span data-dpt="hi" data-dpt-rend="su[pb]">.*?</span>', ret): # print repr(it) # for it in regex.findall(ur'(?musi)qu[i1][i1]', ret): # print repr(it) if 0: ret = regex.sub(ur'(?musi)<span data-dpt="hi" data-dpt-rend="sup">([^<]+)</span>', ur'<sup>\1</sup>', ret) ret = regex.sub(ur'(?musi)<span data-dpt="hi" data-dpt-rend="sub">([^<]+)</span>', ur'<sub>\1</sub>', ret) ret = regex.sub(ur'(?musi)<span data-dpt="lb" data-dpt-src="ms"></span>', ur'<br/>', ret) ret = regex.sub(ur'(?musi)<span data-dpt="lb" data-dpt-src="prj"></span>', ur'<lb/>', ret) ret = regex.sub(ur'(?musi)<span data-dpt="abbr">(.*?)</span>', ur'<abbr>\1</abbr>', ret) ret = regex.sub(ur'(?musi)<span data-dpt="exp">(.*?)</span>', ur'<exp>\1</exp>', ret) # print repr(ret) file_name = 'tcx%s%s.xml' % (text_content_xml.id, suffix) from digipal.utils import write_file write_file(file_name, ret) print 'Written file %s ' % file_name
def command_search(self):
    """Print every match of a regular expression in the 'entry' units.

    Arguments (self.args[0] is not read here; presumably the command name):
      1: id of the item part the text contents belong to,
      2: slug of the text content type,
      3: the regular expression to search for (regex module syntax).

    The content of each matching TextContentXML record is split into
    'entry' units with get_all_units(); the repr of each match found in a
    unit is printed, followed by the total number of matches.

    Raises:
        CommandError: if fewer than three arguments follow the command.
    """
    # self.args[3] is read below, so at least four items are required.
    if len(self.args) < 4:
        raise CommandError('Search requires 3 arguments')

    from digipal_text.models import TextContentXML
    from digipal_text.views.viewer import get_all_units
    import regex as re

    pattern = unicode(self.args[3])
    # pattern = ur'.{1,30}ħ.{1,30}'

    cnt = 0
    records = TextContentXML.objects.filter(
        text_content__item_part_id=self.args[1],
        text_content__type__slug=self.args[2])
    for tcx in records:
        for unit in get_all_units(tcx.content, 'entry'):
            for match in re.findall(pattern, unit['content']):
                print(repr(match))
                cnt += 1

    print('%s occurences' % cnt)