def lookoutsideoftheline(linenumber: int, numberofextrawords: int, workid: str, searchobject: SearchObject, cursor) -> str:
    """
    grab a line and add the N words at the tail and head of the previous and next lines
    this will let you search for phrases that fall along a line break "και δη | και"

    if you wanted to look for 'ἀείδων Ϲπάρτηϲ' you need this individual line:

        2.1.374  δεξιτερὴν γὰρ ἀνέϲχε μετάρϲιον, ὡϲ πρὶν ἀείδων

    to extend out to:

        ὑφαίνων δεξιτερὴν γὰρ ἀνέϲχε μετάρϲιον ὡϲ πρὶν ἀείδων ϲπάρτηϲ

    :param linenumber:
    :param numberofextrawords:
    :param workid:
    :param searchobject:
    :param cursor:
    :return:
    """

    whitespace = ' '
    workdbname = workid[0:6]

    query = 'SELECT {wltmp} FROM {db} WHERE index BETWEEN %s AND %s ORDER BY index ASC'.format(
        wltmp=worklinetemplate, db=workdbname)
    data = (linenumber - 1, linenumber + 1)
    cursor.execute(query, data)
    results = cursor.fetchall()

    lines = [dblineintolineobject(r) for r in results]

    # will get key errors if there is no linenumber+/-1
    if len(lines) == 2:
        if lines[0].index == linenumber:
            lines = [makeablankline(workdbname, linenumber - 1)] + lines
        else:
            lines.append(makeablankline(workdbname, linenumber + 1))
    if len(lines) == 1:
        lines = [makeablankline(workdbname, linenumber - 1)] + lines
        lines.append(makeablankline(workdbname, linenumber + 1))

    text = list()
    for line in lines:
        wordsinline = line.wordlist(searchobject.usewordlist)
        if line.index == linenumber - 1:
            text = wordsinline[(numberofextrawords * -1):]
        elif line.index == linenumber:
            text += wordsinline
        elif line.index == linenumber + 1:
            text += wordsinline[0:numberofextrawords]

    aggregate = whitespace.join(text)
    aggregate = re.sub(r'\s\s', whitespace, aggregate)
    aggregate = ' {a} '.format(a=aggregate)

    return aggregate

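# Illustrative sketch (not part of the original module): one way a phrase that straddles
# a line break might be checked against the expanded text that lookoutsideoftheline()
# returns. The work id, the db line number, and the two-word padding below are
# hypothetical values chosen for the example; only lookoutsideoftheline() and re are real.
def _examplephrasestraddlescheck(searchobject: SearchObject, cursor) -> bool:
    expanded = lookoutsideoftheline(2048, 2, 'gr2045w001', searchobject, cursor)
    # 'ἀείδων ϲπάρτηϲ' can only match once the head of the next line has been appended
    return bool(re.search('ἀείδων ϲπάρτηϲ', expanded))
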
def brackethtmlifysearchfinds(listoflineobjects: list, searchobject: SearchObject, linehtmltemplate: str) -> list:
    """
    can't do comprehensions: require a thisline/previousline structure
    so you can call setcontinuationvalue()

    :param listoflineobjects:
    :param searchobject:
    :param linehtmltemplate:
    :return:
    """

    brackettypes = findactivebrackethighlighting(searchobject.session)
    continuationdict = {t: False for t in brackettypes}

    passage = list()
    lines = deque(listoflineobjects)

    try:
        previous = lines.popleft()
    except IndexError:
        previous = makeablankline('gr0000w000', -1)

    passage.append(linehtmltemplate.format(id=previous.getlineurl(), lc=previous.locus(),
                                           ft=previous.markeditorialinsersions(continuationdict)))

    while lines:
        ln = lines.popleft()
        passage.append(linehtmltemplate.format(id=ln.getlineurl(), lc=ln.locus(),
                                               ft=ln.markeditorialinsersions(continuationdict)))
        continuationdict = {t: setcontinuationvalue(ln, previous, continuationdict[t], t)
                            for t in brackettypes}
        previous = ln

    return passage

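# Illustrative sketch (not part of the original module): brackethtmlifysearchfinds() only
# needs a template containing the three placeholders it fills ({id}, {lc}, {ft}); the
# markup below is a simplified stand-in, not the template the application itself uses.
def _examplebracketedpassage(lineobjects: list, searchobject: SearchObject) -> str:
    template = '<span class="locus" id="{id}">{lc}</span> <span class="foundtext">{ft}</span><br />'
    return '\n'.join(brackethtmlifysearchfinds(lineobjects, searchobject, template))
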
def linesintoindex(lineobjects: List[dbWorkLine], activepoll) -> dict:
    """
    generate the concordance dictionary:

        { wordA: [(workid1, index1, locus1), (workid2, index2, locus2), ...], wordB: [...], ... }

        {'illic': [('lt0472w001', 2048, '68A.35')], 'carpitur': [('lt0472w001', 2048, '68A.35')], ...}

    :return:
    """

    # kill off titles and salutations: dangerous, as l1='t' has not been 100% ruled out as a valid body citation
    # lineobjects = [ln for ln in lineobjects if ln.l1 not in ['t', 'sa']]

    completeindex = dict()
    try:
        defaultwork = lineobjects[0].wkuinversalid
    except IndexError:
        return completeindex

    # clickable entries will break after too many words. Toggle between indexing methods by guessing N words per line
    # and then picking 'locus' when you have too many lineobjects: a nasty hack
    # a RangeError arises from jquery trying to push too many items onto its stack?
    # in which case if you had 32k indexlocationa and then indexlocationb and then ... you could avoid this?
    # pretty hacky, but it might work; then again, jquery might die after N of any kind, not just N of a specific kind

    if len(lineobjects) < hipparchia.config['CLICKABLEINDEXEDPASSAGECAP'] or hipparchia.config['CLICKABLEINDEXEDPASSAGECAP'] < 0:
        # [a] '<indexedlocation id="linenumbergr0032w008/31011">2.17.6</indexedlocation>' vs [b] just '2.17.6'
        indexingmethod = 'anchoredlocus'
    elif session['indexskipsknownwords']:
        indexingmethod = 'anchoredlocus'
    else:
        indexingmethod = 'locus'

    while lineobjects:
        try:
            line = lineobjects.pop()
            if activepoll:
                activepoll.remain(len(lineobjects))
        except IndexError:
            line = makeablankline(defaultwork, None)

        if line.index:
            words = line.indexablewordlist()
            for w in words:
                referencestyle = getattr(line, indexingmethod)
                try:
                    completeindex[w].append((line.wkuinversalid, line.index, referencestyle()))
                except KeyError:
                    completeindex[w] = [(line.wkuinversalid, line.index, referencestyle())]

    return completeindex

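# Illustrative sketch (not part of the original module): collapsing the concordance
# dictionary produced by linesintoindex() into simple per-word counts. It assumes only
# the {word: [(workid, index, locus), ...]} shape documented above; note that
# linesintoindex() consumes (pops from) the list it is given.
def _examplewordcounts(lineobjects: List[dbWorkLine]) -> dict:
    completeindex = linesintoindex(lineobjects, activepoll=None)
    return {word: len(locations) for word, locations in completeindex.items()}
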
def grableadingandlagging(hitline: dbWorkLine, searchobject: SearchObject, cursor, override=None) -> dict:
    """
    take a dbline and grab the N words in front of it and after it

    it would be a good idea to have an autocommit connection here?

    override was added so that the rewritten so of precomposedphraseandproximitysearch() can set 'seeking' as it wishes

    :param hitline:
    :param searchobject:
    :param cursor:
    :return:
    """

    so = searchobject
    # look out for off-by-one errors
    distance = so.distance + 1

    if override:
        seeking = override
    elif so.lemma:
        seeking = wordlistintoregex(so.lemma.formlist)
        so.usewordlist = 'polytonic'
    else:
        seeking = so.termone

    # expanded searchzone because "seeking" might be a multi-line phrase
    prev = grabonelinefromwork(hitline.authorid, hitline.index - 1, cursor)
    next = grabonelinefromwork(hitline.authorid, hitline.index + 1, cursor)
    prev = dbWorkLine(*prev)
    next = dbWorkLine(*next)

    searchzone = ' '.join([getattr(prev, so.usewordlist),
                           getattr(hitline, so.usewordlist),
                           getattr(next, so.usewordlist)])

    match = re.search(r'{s}'.format(s=seeking), searchzone)
    # but what if you just found 'paucitate' inside of 'paucitatem'?
    # you will have 'm' left over and this will throw off your distance-in-words count

    past = None
    upto = None
    lagging = list()
    leading = list()
    ucount = 0
    pcount = 0

    try:
        past = searchzone[match.end():].strip()
    except AttributeError:
        # AttributeError: 'NoneType' object has no attribute 'end'
        pass

    try:
        upto = searchzone[:match.start()].strip()
    except AttributeError:
        pass

    if upto:
        ucount = len([x for x in upto.split(' ') if x])
        lagging = [x for x in upto.split(' ') if x]

    if past:
        pcount = len([x for x in past.split(' ') if x])
        leading = [x for x in past.split(' ') if x]

    atline = hitline.index
    while ucount < distance + 1:
        atline -= 1
        try:
            previous = dblineintolineobject(grabonelinefromwork(hitline.authorid, atline, cursor))
        except TypeError:
            # 'NoneType' object is not subscriptable
            previous = makeablankline(hitline.authorid, -1)
            ucount = 999
        lagging = previous.wordlist(so.usewordlist) + lagging
        ucount += previous.wordcount()
    lagging = lagging[-1 * (distance - 1):]
    lagging = ' '.join(lagging)

    atline = hitline.index
    while pcount < distance + 1:
        atline += 1
        try:
            nextline = dblineintolineobject(grabonelinefromwork(hitline.authorid, atline, cursor))
        except TypeError:
            # 'NoneType' object is not subscriptable
            nextline = makeablankline(hitline.authorid, -1)
            pcount = 999
        leading += nextline.wordlist(so.usewordlist)
        pcount += nextline.wordcount()
    leading = leading[:distance - 1]
    leading = ' '.join(leading)

    returndict = {'lag': lagging, 'lead': leading}

    return returndict

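# Illustrative sketch (not part of the original module): stitching the output of
# grableadingandlagging() back around the hit line for display. The '<span>' highlight
# markup is a hypothetical choice for the example, not the formatting the application
# itself applies to search results.
def _examplehitincontext(hitline: dbWorkLine, searchobject: SearchObject, cursor) -> str:
    environs = grableadingandlagging(hitline, searchobject, cursor)
    focus = getattr(hitline, searchobject.usewordlist)
    return '{lag} <span class="highlight">{hit}</span> {lead}'.format(
        lag=environs['lag'], hit=focus, lead=environs['lead'])
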
def textmaker(author: str, work=None, passage=None, endpoint=None, citationdelimiter='|') -> JSON_STR:
    """
    build a text suitable for display

        "GET /textof/lt0474/024/20/30"

    :return:
    """

    probeforsessionvariables()

    dbconnection = ConnectionObject('autocommit')
    dbcursor = dbconnection.cursor()

    linesevery = hipparchia.config['SHOWLINENUMBERSEVERY']

    po = TextmakerInputParsingObject(author, work, passage, endpoint, citationdelimiter)

    ao = po.authorobject
    wo = po.workobject

    segmenttext = str()

    # consolewarning('po.passageaslist: {p}'.format(p=po.passageaslist))

    if ao and wo:
        # we have both an author and a work, maybe we also have a subset of the work
        if endpoint:
            firstlinenumber = finddblinefromincompletelocus(wo, po.passageaslist, dbcursor)
            lastlinenumber = finddblinefromincompletelocus(wo, po.endpointlist, dbcursor, findlastline=True)
            if firstlinenumber['code'] == 'success' and lastlinenumber['code'] == 'success':
                startline = firstlinenumber['line']
                endline = lastlinenumber['line']
                startlnobj = dblineintolineobject(grabonelinefromwork(ao.universalid, startline, dbcursor))
                stoplnobj = dblineintolineobject(grabonelinefromwork(ao.universalid, endline, dbcursor))
            else:
                msg = '"buildtexttospan/" could not find first and last: {a}w{b} - {c} TO {d}'
                consolewarning(msg.format(a=author, b=work, c=passage, d=endpoint))
                startlnobj = makeablankline(work, 0)
                stoplnobj = makeablankline(work, 1)
                startline = 0
                endline = 1
            segmenttext = 'from {a} to {b}'.format(a=startlnobj.shortlocus(), b=stoplnobj.shortlocus())
        elif not po.passageaslist:
            # whole work
            startline = wo.starts
            endline = wo.ends
        else:
            startandstop = textsegmentfindstartandstop(ao, wo, po.passageaslist, dbcursor)
            startline = startandstop['startline']
            endline = startandstop['endline']
        texthtml = buildtext(wo.universalid, startline, endline, linesevery, dbcursor)
    else:
        texthtml = str()

    if hipparchia.config['INSISTUPONSTANDARDANGLEBRACKETS']:
        texthtml = gtltsubstitutes(texthtml)

    if not segmenttext:
        segmenttext = '.'.join(po.passageaslist)

    if not ao or not wo:
        ao = makeanemptyauthor('gr0000')
        wo = makeanemptywork('gr0000w000')

    results = dict()
    results['authorname'] = avoidsmallvariants(ao.shortname)
    results['title'] = avoidsmallvariants(wo.title)
    results['structure'] = avoidsmallvariants(wo.citation())
    results['worksegment'] = segmenttext
    results['texthtml'] = texthtml

    results = json.dumps(results)

    dbconnection.connectioncleanup()

    return results

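# Illustrative sketch (not part of the original module): textmaker() returns a JSON string,
# so a caller or a test might decode it and pick out the keys assembled above. The
# author/work/passage values mirror the "GET /textof/lt0474/024/20/30" example in the
# docstring and are otherwise hypothetical; an active session is assumed, as the route
# machinery would normally provide one.
def _exampletextfetch() -> str:
    payload = json.loads(textmaker('lt0474', work='024', passage='20', endpoint='30'))
    return payload['texthtml']
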
def buildindexto(searchid: str, author: str, work=None, passage=None, endpoint=None, citationdelimiter='|',
                 justvocab=False) -> JSON_STR:
    """
    build a complete index to an author, work, or segment of a work

    :return:
    """

    probeforsessionvariables()

    pollid = validatepollid(searchid)

    starttime = time.time()

    progresspolldict[pollid] = ProgressPoll(pollid)
    progresspolldict[pollid].activate()

    dbconnection = ConnectionObject('autocommit')
    dbcursor = dbconnection.cursor()

    po = IndexmakerInputParsingObject(author, work, passage, endpoint, citationdelimiter)

    ao = po.authorobject
    wo = po.workobject
    psg = po.passageaslist
    stop = po.endpointlist

    if not work:
        wo = makeanemptywork('gr0000w000')

    # bool
    useheadwords = session['headwordindexing']

    allworks = list()
    output = list()
    cdict = dict()
    segmenttext = str()
    valid = True

    if ao and work and psg and stop:
        # segment of a work, from psg to stop
        start = psg
        firstlinenumber = finddblinefromincompletelocus(wo, start, dbcursor)
        lastlinenumber = finddblinefromincompletelocus(wo, stop, dbcursor, findlastline=True)
        if firstlinenumber['code'] == 'success' and lastlinenumber['code'] == 'success':
            cdict = {wo.universalid: (firstlinenumber['line'], lastlinenumber['line'])}
            startln = dblineintolineobject(grabonelinefromwork(ao.universalid, firstlinenumber['line'], dbcursor))
            stopln = dblineintolineobject(grabonelinefromwork(ao.universalid, lastlinenumber['line'], dbcursor))
        else:
            msg = '"indexspan/" could not find first and last: {a}w{b} - {c} TO {d}'
            consolewarning(msg.format(a=author, b=work, c=passage, d=endpoint))
            startln = makeablankline(work, 0)
            stopln = makeablankline(work, 1)
            valid = False
        segmenttext = 'from {a} to {b}'.format(a=startln.shortlocus(), b=stopln.shortlocus())
    elif ao and work and psg:
        # subsection of a work of an author
        progresspolldict[pollid].statusis('Preparing a partial index to {t}'.format(t=wo.title))
        startandstop = textsegmentfindstartandstop(ao, wo, psg, dbcursor)
        startline = startandstop['startline']
        endline = startandstop['endline']
        cdict = {wo.universalid: (startline, endline)}
    elif ao and work:
        # one work
        progresspolldict[pollid].statusis('Preparing an index to {t}'.format(t=wo.title))
        startline = wo.starts
        endline = wo.ends
        cdict = {wo.universalid: (startline, endline)}
    elif ao:
        # whole author
        allworks = ['{w} ⇒ {t}'.format(w=w.universalid[6:10], t=w.title) for w in ao.listofworks]
        allworks.sort()
        progresspolldict[pollid].statusis('Preparing an index to the works of {a}'.format(a=ao.shortname))
        for wkid in ao.listworkids():
            cdict[wkid] = (workdict[wkid].starts, workdict[wkid].ends)
    else:
        # we do not have a valid selection
        valid = False
        output = ['invalid input']

    if not stop:
        segmenttext = '.'.join(psg)

    if valid and justvocab:
        dbconnection.connectioncleanup()
        del progresspolldict[pollid]
        return cdict

    if valid:
        output = buildindextowork(cdict, progresspolldict[pollid], useheadwords, dbcursor)

    # get ready to send stuff to the page
    count = len(output)

    try:
        locale.setlocale(locale.LC_ALL, 'en_US')
        count = locale.format_string('%d', count, grouping=True)
    except locale.Error:
        count = str(count)

    progresspolldict[pollid].statusis('Preparing the index HTML')
    indexhtml = wordindextohtmltable(output, useheadwords)

    buildtime = time.time() - starttime
    buildtime = round(buildtime, 2)
    progresspolldict[pollid].deactivate()

    if not ao:
        ao = makeanemptyauthor('gr0000')

    results = dict()
    results['authorname'] = avoidsmallvariants(ao.shortname)
    results['title'] = avoidsmallvariants(wo.title)
    results['structure'] = avoidsmallvariants(wo.citation())
    results['worksegment'] = segmenttext
    results['elapsed'] = buildtime
    results['wordsfound'] = count
    results['indexhtml'] = indexhtml
    results['keytoworks'] = allworks
    results['newjs'] = supplementalindexjs()
    results = json.dumps(results)

    dbconnection.connectioncleanup()
    del progresspolldict[pollid]

    return results

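# Illustrative sketch (not part of the original module): when called with justvocab=True,
# buildindexto() short-circuits before any HTML is built and returns the cdict of
# {workid: (startline, endline)} ranges instead of JSON. The searchid and author id below
# are hypothetical, and an active session is assumed, as the route machinery would
# normally provide one.
def _exampleindexranges() -> dict:
    return buildindexto('abc123', 'gr0032', justvocab=True)
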
def subqueryphrasesearch(workerid, foundlineobjects: ListProxy, searchphrase: str, listofplacestosearch: ListProxy,
                         searchobject: SearchObject, dbconnection) -> ListProxy:
    """
    foundlineobjects, searchingfor, searchlist, commitcount, whereclauseinfo, activepoll

    use subquery syntax to grab multi-line windows of text for phrase searching

    line ends and line beginning issues can be overcome this way, but then you have plenty of
    bookkeeping to do to get the proper results focussed on the right line

    tablestosearch: ['lt0400', 'lt0022', ...]

    a search inside of Ar., Eth. Eud.:

        SELECT secondpass.index, secondpass.accented_line
        FROM (SELECT firstpass.index, firstpass.linebundle, firstpass.accented_line
            FROM (SELECT index, accented_line,
                concat(accented_line, ' ', lead(accented_line) OVER (ORDER BY index ASC)) AS linebundle
                FROM gr0086 WHERE ( (index BETWEEN 15982 AND 18745) ) ) firstpass
            ) secondpass
        WHERE secondpass.linebundle ~ %s  LIMIT 200

    a search in x., hell and x., mem less book 3 of hell and book 2 of mem:

        SELECT secondpass.index, secondpass.accented_line
        FROM (SELECT firstpass.index, firstpass.linebundle, firstpass.accented_line
            FROM (SELECT index, accented_line,
                concat(accented_line, ' ', lead(accented_line) OVER (ORDER BY index ASC)) AS linebundle
                FROM gr0032 WHERE ( (index BETWEEN 1 AND 7918) OR (index BETWEEN 7919 AND 11999) )
                AND ( (index NOT BETWEEN 1846 AND 2856) AND (index NOT BETWEEN 8845 AND 9864) ) ) firstpass
            ) secondpass
        WHERE secondpass.linebundle ~ %s  LIMIT 200

    :return:
    """
    # print('subqueryphrasesearch()')

    so = searchobject
    activepoll = so.poll

    # build an incomplete sfo that will handle everything other than iteratethroughsearchlist()
    sfo = returnsearchfncobject(workerid, foundlineobjects, listofplacestosearch, so, dbconnection, None)

    querytemplate = """
        SELECT secondpass.index, secondpass.{co} FROM
            (SELECT firstpass.index, firstpass.linebundle, firstpass.{co} FROM
                (SELECT index, {co}, concat({co}, ' ', lead({co}) OVER (ORDER BY index ASC)) AS linebundle
                    FROM {db} {whr} ) firstpass
            ) secondpass
        WHERE secondpass.linebundle ~ %s {lim}"""

    wheretemplate = """
        WHERE EXISTS
            (SELECT 1 FROM {tbl}_includelist_{a} incl WHERE incl.includeindex = {tbl}.index)
    """

    # substringsearch() needs the ability to CREATE TEMPORARY TABLE
    sfo.dbconnection.setreadonly(False)
    dbcursor = sfo.dbconnection.cursor()

    qcomb = QueryCombinator(searchphrase)
    # the last item is the full phrase: ('one two three four five', '')
    combinations = qcomb.combinations()
    combinations.pop()

    # lines start/end
    sp = re.sub(r'^\s', r'(^|\\s)', searchphrase)
    sp = re.sub(r'\s$', r'(\\s|$)', sp)
    # on the reasoning behind the following substitution see 'DEBUGGING notes: SQL oddities' above
    # sp = re.sub(r' ', r'\\s', sp)

    if not so.onehit:
        lim = ' LIMIT ' + str(so.cap)
    else:
        # the windowing problem means that '1' might be something that gets discarded
        lim = ' LIMIT 5'

    if so.redissearchlist:
        listofplacestosearch = True

    while listofplacestosearch and activepoll.gethits() <= so.cap:
        # sfo.getnextfnc() also takes care of the commitcount
        authortable = sfo.getnextfnc()
        sfo.updatepollremaining()

        if authortable:
            whr = str()
            r = so.indexrestrictions[authortable]
            if r['type'] == 'between':
                indexwedwhere = buildbetweenwhereextension(authortable, so)
                if indexwedwhere != '':
                    # indexwedwhere will come back with an extraneous ' AND'
                    indexwedwhere = indexwedwhere[:-4]
                    whr = 'WHERE {iw}'.format(iw=indexwedwhere)
            elif r['type'] == 'temptable':
                avoidcollisions = assignuniquename()
                q = r['where']['tempquery']
                q = re.sub('_includelist', '_includelist_{a}'.format(a=avoidcollisions), q)
                dbcursor.execute(q)
                whr = wheretemplate.format(tbl=authortable, a=avoidcollisions)

            query = querytemplate.format(db=authortable, co=so.usecolumn, whr=whr, lim=lim)
            data = (sp,)
            # print('subqueryphrasesearch() find indices() q,d:\n\t', query, data)
            dbcursor.execute(query, data)
            indices = [i[0] for i in dbcursor.fetchall()]
            # this will yield a bunch of windows: you need to find the centers; see 'while...' below

            locallineobjects = list()
            if indices:
                for i in indices:
                    query = 'SELECT {wtmpl} FROM {tb} WHERE index=%s'.format(wtmpl=worklinetemplate, tb=authortable)
                    data = (i,)
                    # print('subqueryphrasesearch() iterate through indices() q,d:\n\t', query, data)
                    dbcursor.execute(query, data)
                    locallineobjects.append(dblineintolineobject(dbcursor.fetchone()))

            locallineobjects.reverse()
            # debugging
            # for l in locallineobjects:
            #     print(l.universalid, l.locus(), getattr(l, so.usewordlist))

            gotmyonehit = False
            while locallineobjects and activepoll.gethits() <= so.cap and not gotmyonehit:
                # windows of indices come back: e.g., three lines that look like they match when only one matches [3131, 3132, 3133]
                # figure out which line is really the line with the goods
                # it is not nearly so simple as picking the 2nd element in any run of 3: you do not always get runs of 3,
                # and matches in subsequent lines mean that you really should check your work carefully; this is not an
                # especially costly operation relative to the whole search and esp. relative to the speed gains of using
                # a subquery search
                lineobject = locallineobjects.pop()
                if re.search(sp, getattr(lineobject, so.usewordlist)):
                    sfo.addnewfindstolistoffinds([lineobject])
                    activepoll.addhits(1)
                    if so.onehit:
                        gotmyonehit = True
                else:
                    try:
                        nextline = locallineobjects[0]
                    except IndexError:
                        nextline = makeablankline('gr0000w000', -1)

                    if lineobject.wkuinversalid != nextline.wkuinversalid or lineobject.index != (nextline.index - 1):
                        # you grabbed the next line on the pile (e.g., index = 9999), not the actual next line (e.g., index = 101)
                        # usually you won't get a hit by grabbing the next db line, but sometimes you do...
                        query = 'SELECT {wtmpl} FROM {tb} WHERE index=%s'.format(wtmpl=worklinetemplate, tb=authortable)
                        data = (lineobject.index + 1,)
                        # print('subqueryphrasesearch() "while locallineobjects..." loop q,d:\n\t', query, data)
                        dbcursor.execute(query, data)
                        try:
                            nextline = dblineintolineobject(dbcursor.fetchone())
                        except:
                            nextline = makeablankline('gr0000w000', -1)

                    for c in combinations:
                        tail = c[0] + '$'
                        head = '^' + c[1]
                        # debugging
                        # print('re', getattr(lo, so.usewordlist), tail, head, getattr(next, so.usewordlist))
                        t = False
                        h = False
                        try:
                            t = re.search(tail, getattr(lineobject, so.usewordlist))
                        except re.error:
                            pass
                        try:
                            h = re.search(head, getattr(nextline, so.usewordlist))
                        except re.error:
                            pass
                        if t and h:
                            sfo.addnewfindstolistoffinds([lineobject])
                            activepoll.addhits(1)
                            if so.onehit:
                                gotmyonehit = True
        else:
            # redis will return None for authortable if the set is now empty
            listofplacestosearch = None

    sfo.listcleanup()

    if sfo.needconnectioncleanup:
        sfo.dbconnection.connectioncleanup()

    return foundlineobjects

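# Illustrative sketch (not part of the original module): a pure-Python miniature of the
# 'linebundle' windowing that the subquery performs in SQL via
# concat(col, ' ', lead(col) OVER (ORDER BY index ASC)). It shows why a single phrase can
# produce two or three neighbouring index hits that later have to be re-centered on the
# actual line. 'lines' is assumed to be the ordered column text of consecutive db lines.
def _examplelinebundles(lines: list) -> list:
    # pair every line with its successor; the final line is paired with an empty string
    return [' '.join(pair).strip() for pair in zip(lines, lines[1:] + [''])]
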
def precomposedsqlsubqueryphrasesearch(so: SearchObject) -> List[dbWorkLine]:
    """
    use subquery syntax to grab multi-line windows of text for phrase searching

    line ends and line beginning issues can be overcome this way, but then you have plenty of
    bookkeeping to do to get the proper results focussed on the right line

    these searches take linear time: same basic time for any given scope regardless of the query
    """

    # rebuild the searchsqldict but this time pass through rewritequerystringforsubqueryphrasesearching()
    so.searchsqldict = searchlistintosqldict(so, so.phrase, subqueryphrasesearch=True)
    # debugmessage('precomposedsqlsubqueryphrasesearch() so.searchsqldict: {d}'.format(d=so.searchsqldict))

    # the windowed collection of lines; you will need to work to find the centers
    # windowing will increase the number of hits: 2+ lines per actual find
    initialhitlines = generatepreliminaryhitlist(so, recap=so.cap * 3)

    m = 'Generating final list of hits by searching among the {h} preliminary hits'
    so.poll.statusis(m.format(h=so.poll.gethits()))
    so.poll.sethits(0)

    sp = re.sub(r'^\s', r'(^|\\s)', so.phrase)
    sp = re.sub(r'\s$', r'(\\s|$)', sp)

    combinations = QueryCombinator(so.phrase)
    # the last item is the full phrase and it will have already been searched: ('one two three four five', '')
    combinations = combinations.combinations()
    combinations.pop()

    listoffinds = list()

    dbconnection = ConnectionObject()
    dbcursor = dbconnection.cursor()

    setofhits = set()

    while initialhitlines:
        # windows of indices come back: e.g., three lines that look like they match when only one matches [3131, 3132, 3133]
        # figure out which line is really the line with the goods
        # it is not nearly so simple as picking the 2nd element in any run of 3: you do not always get runs of 3,
        # and matches in subsequent lines mean that you really should check your work carefully; this is not an
        # especially costly operation relative to the whole search and esp. relative to the speed gains of using
        # a subquery search
        lineobject = initialhitlines.pop()
        if not so.onehit or lineobject.authorid not in setofhits:
            if re.search(sp, getattr(lineobject, so.usewordlist)):
                listoffinds.append(lineobject)
                so.poll.addhits(1)
                setofhits.add(lineobject.authorid)
            else:
                try:
                    nextline = initialhitlines[0]
                except IndexError:
                    nextline = makeablankline('gr0000w000', -1)

                if lineobject.wkuinversalid != nextline.wkuinversalid or lineobject.index != (nextline.index - 1):
                    # you grabbed the next line on the pile (e.g., index = 9999), not the actual next line (e.g., index = 101)
                    # usually you won't get a hit by grabbing the next db line, but sometimes you do...
                    query = 'SELECT {wtmpl} FROM {tb} WHERE index=%s'.format(wtmpl=worklinetemplate, tb=lineobject.authorid)
                    data = (lineobject.index + 1,)
                    dbcursor.execute(query, data)
                    try:
                        nextline = dblineintolineobject(dbcursor.fetchone())
                    except:
                        nextline = makeablankline('gr0000w000', -1)

                for c in combinations:
                    tail = c[0] + '$'
                    head = '^' + c[1]
                    t = False
                    h = False
                    try:
                        t = re.search(tail, getattr(lineobject, so.usewordlist))
                    except re.error:
                        pass
                    try:
                        h = re.search(head, getattr(nextline, so.usewordlist))
                    except re.error:
                        pass
                    if t and h:
                        listoffinds.append(lineobject)
                        so.poll.addhits(1)
                        setofhits.add(lineobject.authorid)

    dbconnection.connectioncleanup()

    return listoffinds

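# Illustrative sketch (not part of the original module): the seam test used by both
# subquery searches above, isolated into a single predicate. 'thisline' and 'nextline'
# are assumed to be the space-separated word strings (so.usewordlist) of two adjacent
# lines; only QueryCombinator and re are real names from the module.
def _examplestraddlematch(phrase: str, thisline: str, nextline: str) -> bool:
    combinations = QueryCombinator(phrase).combinations()
    combinations.pop()  # drop the full phrase: it has already been tested against single lines
    for tailpart, headpart in combinations:
        # the tail of the phrase must end this line and the head must begin the next one
        try:
            t = re.search(tailpart + '$', thisline)
            h = re.search('^' + headpart, nextline)
        except re.error:
            continue
        if t and h:
            return True
    return False
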