def rewritesqlsearchdictforlemmata(so: SearchObject) -> dict:
    """
    Clone every authortable entry of so.searchsqldict once per chunk of the
    lemmatized forms, swapping the 'data' member out for an ORed regex built
    from that chunk of forms.

    You have
        { table1: {query: q, data: d, temptable: t},
          table2: {query: q, data: d, temptable: t}, ... }
    and you return
        { table1_0: {query: q, data: regex_chunk_0, temptable: t},
          table1_1: {query: q, data: regex_chunk_1, temptable: t}, ... }

    e.g. 'gr0059' yields 'gr0059_20', 'gr0059_21', ... whose 'data' looks like
        '(^|\\s)δηλώϲητε(\\s|$)|(^|\\s)δηλώϲωϲι(\\s|$)|...'
    while 'query' and 'temptable' are copied through unchanged.

    :param so: the SearchObject; so.lemmaone.formlist supplies the forms
    :return: the rewritten {newtablename: {'data': ..., 'query': ..., 'temptable': ...}} dict
    """

    searchdict = so.searchsqldict
    terms = so.lemmaone.formlist

    # [fix] clamp to at least 1: a short formlist and/or a high WORKERS count
    # used to let int() floor this to 0, and then range(0, len(terms), 0)
    # raises "ValueError: range() arg 3 must not be zero"
    chunksize = max(1, min(int(len(terms) / (hipparchia.config['WORKERS'] * 2)), 25))

    newtablenames = '{t}_{c}'

    chunked = [terms[i:i + chunksize] for i in range(0, len(terms), chunksize)]
    chunked = [wordlistintoregex(c) for c in chunked]

    modifieddict = dict()
    for authortable in searchdict:
        for count, c in enumerate(chunked):
            target = dict()
            target['data'] = c
            target['query'] = searchdict[authortable]['query']
            target['temptable'] = searchdict[authortable]['temptable']
            modifieddict[newtablenames.format(t=authortable, c=count)] = target

    return modifieddict
def withinxlines(workdbname: str, searchobject: SearchObject, dbconnection) -> List[tuple]:
    """
    after finding x, look for y within n lines of x

    people who send phrases to both halves and/or a lot of regex will not
    always get what they want

    :param workdbname: name of the author table to search
    :param searchobject: the governing SearchObject
    :param dbconnection: an open db connection
    :return: the raw hit tuples that passed the proximity check
    """

    so = searchobject
    dbcursor = dbconnection.cursor()
    dbconnection.setautocommit()

    # you will only get session['maxresults'] back from substringsearch() unless you raise the cap
    # "Roman" near "Aetol" will get 3786 hits in Livy, but only maxresults will come
    # back for checking: but the Aetolians are likely not among those 200 or so passages...
    templimit = 2000000

    if so.lemma:
        step = hipparchia.config['LEMMACHUNKSIZE']
        allforms = so.lemma.formlist
        regexchunks = [wordlistintoregex(allforms[j:j + step]) for j in range(0, len(allforms), step)]
        hitlist = list()
        for chunk in regexchunks:
            hitlist.extend(substringsearch(chunk, workdbname, so, dbcursor, templimit))
    else:
        hitlist = list(substringsearch(so.termone, workdbname, so, dbcursor, templimit))

    # lemmatized proximity checks need the slower, form-aware path
    if so.lemmaone or so.lemmatwo:
        checker = lemmatizedwithinxlines
    else:
        checker = simplewithinxlines

    return checker(searchobject, hitlist, dbcursor)
def grableadingandlagging(hitline: dbWorkLine, searchobject: SearchObject, cursor, override=None) -> dict:
    """
    Take a dbline and grab the N words in front of it and after it.

    It would be a good idea to have an autocommit connection here?

    'override' was added so that the rewritten searchobject of
    precomposedphraseandproximitysearch() can set 'seeking' as it wishes.

    :param hitline: the dbWorkLine at the center of the search zone
    :param searchobject: the governing SearchObject
    :param cursor: a database cursor
    :param override: optional regex that supersedes the searchobject's own term
    :return: {'lag': words-before-the-hit, 'lead': words-after-the-hit}
    """

    so = searchobject
    # look out for off-by-one errors
    distance = so.distance + 1

    if override:
        seeking = override
    elif so.lemma:
        seeking = wordlistintoregex(so.lemma.formlist)
        so.usewordlist = 'polytonic'
    else:
        seeking = so.termone

    # expanded searchzone because "seeking" might be a multi-line phrase
    # [fix] locals renamed from 'prev'/'next': 'next' shadowed the builtin
    # NOTE(review): if grabonelinefromwork() returns None at a work boundary,
    # dbWorkLine(*...) will raise TypeError here — confirm callers never hit that
    previousline = grabonelinefromwork(hitline.authorid, hitline.index - 1, cursor)
    subsequentline = grabonelinefromwork(hitline.authorid, hitline.index + 1, cursor)
    previousline = dbWorkLine(*previousline)
    subsequentline = dbWorkLine(*subsequentline)
    searchzone = ' '.join([getattr(previousline, so.usewordlist),
                           getattr(hitline, so.usewordlist),
                           getattr(subsequentline, so.usewordlist)])

    match = re.search(r'{s}'.format(s=seeking), searchzone)
    # but what if you just found 'paucitate' inside of 'paucitatem'?
    # you will have 'm' left over and this will throw off your distance-in-words count

    past = None
    upto = None
    lagging = list()
    leading = list()
    ucount = 0
    pcount = 0

    try:
        past = searchzone[match.end():].strip()
    except AttributeError:
        # AttributeError: 'NoneType' object has no attribute 'end'
        pass

    try:
        upto = searchzone[:match.start()].strip()
    except AttributeError:
        pass

    if upto:
        ucount = len([x for x in upto.split(' ') if x])
        lagging = [x for x in upto.split(' ') if x]

    if past:
        pcount = len([x for x in past.split(' ') if x])
        leading = [x for x in past.split(' ') if x]

    # walk backwards through the work until enough lagging words are collected
    atline = hitline.index
    while ucount < distance + 1:
        atline -= 1
        try:
            previous = dblineintolineobject(grabonelinefromwork(hitline.authorid, atline, cursor))
        except TypeError:
            # 'NoneType' object is not subscriptable: ran off the start of the work
            previous = makeablankline(hitline.authorid, -1)
            ucount = 999
        lagging = previous.wordlist(so.usewordlist) + lagging
        ucount += previous.wordcount()
    lagging = lagging[-1 * (distance - 1):]
    lagging = ' '.join(lagging)

    # walk forwards through the work until enough leading words are collected
    atline = hitline.index
    while pcount < distance + 1:
        atline += 1
        try:
            nextline = dblineintolineobject(grabonelinefromwork(hitline.authorid, atline, cursor))
        except TypeError:
            # 'NoneType' object is not subscriptable: ran off the end of the work
            nextline = makeablankline(hitline.authorid, -1)
            pcount = 999
        leading += nextline.wordlist(so.usewordlist)
        pcount += nextline.wordcount()
    leading = leading[:distance - 1]
    leading = ' '.join(leading)

    returndict = {'lag': lagging, 'lead': leading}

    return returndict
def dynamicsqlsearchdispatcher(searchobject: SearchObject) -> List[dbWorkLine]:
    """
    Assign the search to multiprocessing workers and collect their finds.

    searchobject: <server.hipparchiaclasses.SearchObject object at 0x1102c15f8>
    activepoll: <server.hipparchiaclasses.ProgressPoll object at 0x1102c15f8>

    :param searchobject: a fully configured SearchObject
    :return: the found dbWorkLine objects
    """

    # clean out the pool if necessary before starting
    # this seems like the safest time for a reset of the pool: otherwise you could have workers working
    # but if you have a multi-user environment AND pool problems this code might make things worse
    cleanpoolifneeded()

    so = searchobject
    activepoll = so.poll

    # recompose 'searchingfor' (if it exists)
    # note that 'proximate' does not need as many checks
    if so.seeking:
        searchingfor = massagesearchtermsforwhitespace(so.seeking)
    else:
        searchingfor = str()

    # lunate sigmas / UV / JI issues
    # [fix] local renamed from the misspelled 'unomdifiedskg'
    unmodifiedskg = searchingfor
    unmodifiedprx = so.proximate

    # [fix] message read 'Loading the the dispatcher...'
    activepoll.statusis('Loading the dispatcher...')

    # of long-term interest is the new shared_memory module; using it will break the 3.6-3.7 installations
    # https://docs.python.org/3.8/library/multiprocessing.shared_memory.html#module-multiprocessing.shared_memory
    manager = Manager()
    founddblineobjects = manager.list()

    workers = setthreadcount()

    if so.redisresultlist and so.redissearchlist:
        listofplacestosearch = None
        buildredissearchlist(list(so.indexrestrictions.keys()), so.searchid)
    else:
        listofplacestosearch = manager.list(so.indexrestrictions.keys())

    activepoll.allworkis(len(so.searchlist))
    activepoll.remain(len(so.indexrestrictions.keys()))
    activepoll.sethits(0)

    # be careful about getting mp aware args into the function
    targetfunction = None
    argumentuple = None

    if so.searchtype == 'simple':
        activepoll.statusis('Executing a simple word search...')
        targetfunction = workonsimplesearch
        argumentuple = (founddblineobjects, listofplacestosearch, so)
    elif so.searchtype == 'simplelemma':
        activepoll.statusis('Executing a lemmatized word search for the {n} known forms of {w}...'.format(
            n=len(so.lemma.formlist), w=so.lemma.dictionaryentry))
        # don't search for every form at once (100+?)
        # instead build a list of tuples: [(ORed_regex_forms_part_01, authortable1), ...]
        chunksize = hipparchia.config['LEMMACHUNKSIZE']
        terms = so.lemma.formlist
        chunked = [terms[i:i + chunksize] for i in range(0, len(terms), chunksize)]
        chunked = [wordlistintoregex(c) for c in chunked]
        searchtuples = manager.list()
        masterlist = so.indexrestrictions.keys()
        for c in chunked:
            for item in masterlist:
                searchtuples.append((c, item))
        activepoll.allworkis(len(searchtuples))
        if so.usequeue:
            searchtuples = loadsearchqueue([t for t in searchtuples], workers)
        if so.redissearchlist:
            ptuples = [pickle.dumps(s) for s in searchtuples]
            buildredissearchlist(ptuples, so.searchid)
        targetfunction = workonsimplelemmasearch
        argumentuple = (founddblineobjects, searchtuples, so)
    elif so.searchtype == 'phrase':
        activepoll.statusis('Executing a phrase search.')
        so.leastcommon = findleastcommonterm(so.termone, so.accented)
        lccount = findleastcommontermcount(so.termone, so.accented)
        # need to figure out when it will be faster to go to subqueryphrasesearch() and when not to
        # logic + trial and error:
        #   e.g., any phrase involving λιποταξίου (e.g., γράψομαι λιποταξίου) can be very fast because that form appears 36x:
        #   you can find it in 1s but if you go through subqueryphrasesearch() you will spend about 17s per full TLG search
        # lccount = -1 if you are unaccented
        #   'if 0 < lccount < 500 or longestterm > 5' got burned badly with 'ἐξ ἀρχῆϲ πρῶτον'
        #   'or (lccount == -1 and longestterm > 6)' would take 1m to find διαφοραϲ ιδεαν via workonphrasesearch()
        #   but the same can be found in 16.45s via subqueryphrasesearch()
        # it looks like unaccented searches are very regularly faster via subqueryphrasesearch()
        #   when is this not true? being wrong about sqs() means spending an extra 10s;
        #   being wrong about phs() means an extra 40s...
        if 0 < lccount < 500:
            targetfunction = workonphrasesearch
            argumentuple = (founddblineobjects, listofplacestosearch, so)
        else:
            targetfunction = subqueryphrasesearch
            argumentuple = (founddblineobjects, so.termone, listofplacestosearch, so)
    elif so.searchtype == 'proximity':
        activepoll.statusis('Executing a proximity search...')
        if so.lemma or so.proximatelemma:
            pass
        elif so.accented or re.search(r'^[a-z]', so.termone) and so.near:
            # NOTE(review): precedence parses as "so.accented or (re.search(...) and so.near)";
            # confirm "(so.accented or re.search(...)) and so.near" was not intended
            # choose the necessarily faster option
            leastcommon = findleastcommonterm(unmodifiedskg + ' ' + unmodifiedprx, so.accented)
            if leastcommon != unmodifiedskg:
                tmp = so.termone
                so.termone = so.termtwo
                so.termtwo = tmp
        elif len(so.termtwo) > len(so.termone) and so.near:
            # look for the longest word first since that is probably the quicker route
            # but you can't swap searchingfor and proximate this way in a 'is not near' search without yielding the wrong focus
            tmp = so.termone
            so.termone = so.termtwo
            so.termtwo = tmp
        targetfunction = workonproximitysearch
        argumentuple = (founddblineobjects, listofplacestosearch, so)
    else:
        # impossible, but...
        workers = 0

    # non-parallel multiprocessing implementation across platforms: windows can't pickle a connection;
    # everyone else needs to pickle the connection
    if icanpickleconnections():
        # you need to give each job its own connection if you use a connection pool
        # otherwise there will be problems with threading
        # note that we are not yet taking care of connection types: 'autocommit', etc
        oneconnectionperworker = {i: ConnectionObject() for i in range(workers)}
    else:
        # will grab a connection later once inside of 'sfo'
        oneconnectionperworker = {i: None for i in range(workers)}

    # note that the following (when fully implemented...) does not produce speedups
    # operedisconnectionperworker = {i: establishredisconnection() for i in range(workers)}

    argumentswithconnections = [tuple([i] + list(argumentuple) + [oneconnectionperworker[i]])
                                for i in range(workers)]
    jobs = [Process(target=targetfunction, args=argumentswithconnections[i]) for i in range(workers)]

    for j in jobs:
        j.start()
    for j in jobs:
        j.join()

    if so.redisresultlist:
        foundlineobjects = loadredisresults(so.searchid)
    else:
        # foundlineobjects = [dblineintolineobject(item) for item in founddblineobjects]
        foundlineobjects = list(founddblineobjects)

    # [fix] use .get(): with workers == 0 the dict is empty and [0] raised KeyError
    if oneconnectionperworker.get(0):
        for c in oneconnectionperworker:
            oneconnectionperworker[c].connectioncleanup()

    return foundlineobjects
def executesearch(searchid: str, so=None, req=request) -> JSON_STR:
    """
    the interface to all of the other search functions

    tell me what you are looking for and i'll try to find it

    the results are returned in a json bundle that will be used to update the html on the page

    note that cosdistbysentence vector queries also flow through here: they need a hitdict

    overview:
        buildsearchobject() and then start modifying elements of the SearchObject
        build a search list via compilesearchlist()
        modify search list via flagexclusions()
        modify search list via calculatewholeauthorsearches()
        build search list restrictions via indexrestrictions()
        search via searchdispatcher()
        format results via buildresultobjects()

    :param searchid: a poll id string supplied by the client
    :param so: an optional prebuilt SearchObject
    :param req: the flask request (overridable for callers like singlewordsearch())
    :return: a json string for the page javascript to consume
    """

    pollid = validatepollid(searchid)

    if not so:
        # there is a so if singlewordsearch() sent you here
        probeforsessionvariables()
        so = buildsearchobject(pollid, req, session)

    frozensession = so.session

    # register a progress poll so the client can watch the search advance
    progresspolldict[pollid] = ProgressPoll(pollid)
    so.poll = progresspolldict[pollid]

    so.poll.activate()
    so.poll.statusis('Preparing to search')

    nosearch = True
    output = SearchOutputObject(so)

    allcorpora = ['greekcorpus', 'latincorpus', 'papyruscorpus', 'inscriptioncorpus', 'christiancorpus']
    activecorpora = [c for c in allcorpora if frozensession[c]]

    # a search is only possible if there is something to seek and somewhere to look
    if (len(so.seeking) > 0 or so.lemma or frozensession['tensorflowgraph'] or frozensession['topicmodel']) and activecorpora:
        so.poll.statusis('Compiling the list of works to search')
        so.searchlist = compilesearchlist(listmapper, frozensession)

        if so.searchlist:
            # do this before updatesearchlistandsearchobject() which collapses items and cuts your total
            workssearched = len(so.searchlist)

            # calculatewholeauthorsearches() + configurewhereclausedata()
            so = updatesearchlistandsearchobject(so)

            nosearch = False
            skg = None
            prx = None

            isgreek = re.compile('[α-ωϲἀἁἂἃἄἅἆἇᾀᾁᾂᾃᾄᾅᾆᾇᾲᾳᾴᾶᾷᾰᾱὰάἐἑἒἓἔἕὲέἰἱἲἳἴἵἶἷὶίῐῑῒΐῖῗὀὁὂὃὄὅόὸὐὑὒὓὔὕὖὗϋῠῡῢΰῦῧύὺᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇἤἢἥἣὴήἠἡἦἧὠὡὢὣὤὥὦὧᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷώὼ]')

            if so.lemmaone:
                # NOTE(review): the guard checks so.lemmaone but reads so.lemma.formlist —
                # confirm that so.lemma and so.lemmaone are aliases at this point
                so.termone = wordlistintoregex(so.lemma.formlist)
                skg = so.termone
                if re.search(isgreek, skg):
                    # 'v' is a problem because the lemmata list is going to send 'u'
                    # but the greek lemmata are accented
                    so.usecolumn = 'accented_line'

            if so.lemmatwo:
                so.termtwo = wordlistintoregex(so.lemmatwo.formlist)
                prx = so.termtwo
                if re.search(isgreek, prx):
                    so.usecolumn = 'accented_line'

            so.setsearchtype()

            thesearch = so.generatesearchdescription()
            htmlsearch = so.generatehtmlsearchdescription()

            # now that the SearchObject is built, do the search...
            hits = precomposedsqlsearch(so)
            so.poll.statusis('Putting the results in context')

            # hits is List[dbWorkLine]
            hitdict = sortresultslist(hits, so, authordict, workdict)

            if so.vectorquerytype == 'cosdistbylineorword':
                # print('executesearch(): h - cosdistbylineorword')
                # take these hits and head on over to the vector worker
                output = findabsolutevectorsfromhits(so, hitdict, workssearched)
                del progresspolldict[pollid]
                return output

            resultlist = buildresultobjects(hitdict, authordict, workdict, so)

            so.poll.statusis('Converting results to HTML')

            sandp = rewriteskgandprx(skg, prx, htmlsearch, so)
            skg = sandp['skg']
            prx = sandp['prx']
            htmlsearch = sandp['html']

            for r in resultlist:
                r.lineobjects = flagsearchterms(r, skg, prx, so)

            if so.context > 0:
                findshtml = htmlifysearchfinds(resultlist, so)
            else:
                findshtml = nocontexthtmlifysearchfinds(resultlist)

            if hipparchia.config['INSISTUPONSTANDARDANGLEBRACKETS']:
                findshtml = gtltsubstitutes(findshtml)

            findsjs = insertbrowserclickjs('browser')

            resultcount = len(resultlist)

            if resultcount < so.cap:
                hitmax = False
            else:
                hitmax = True

            output.title = thesearch
            output.found = findshtml
            output.js = findsjs
            output.setresultcount(resultcount, 'passages')
            output.setscope(workssearched)
            output.searchtime = so.getelapsedtime()
            output.thesearch = thesearch
            output.htmlsearch = htmlsearch
            output.hitmax = hitmax

    if nosearch:
        # nothing was searched: tell the user why
        if not activecorpora:
            output.reasons.append('there are no active databases')
        if len(so.seeking) == 0:
            output.reasons.append('there is no search term')
        if len(so.seeking) > 0 and len(so.searchlist) == 0:
            output.reasons.append('zero works match the search criteria')
        output.title = '(empty query)'
        output.setresultcount(0, 'passages')
        output.explainemptysearch()

    so.poll.deactivate()
    jsonoutput = json.dumps(output.generateoutput())

    # the poll has served its purpose; drop it so the dict does not grow without bound
    del progresspolldict[pollid]

    return jsonoutput
def withinxwords(workdbname: str, searchobject: SearchObject, dbconnection) -> List[dbWorkLine]:
    """
    after finding x, look for y within n words of x

    getting to y: find the search term x and slice it out of its line;
    then build forwards and backwards within the requisite range;
    then see if you get a match in the range

    if looking for 'paucitate' near 'imperator' you will find:
        'romani paucitate seruorum gloriatos itane tandem ne'
    this will become:
        'romani' + 'seruorum gloriatos itane tandem ne'

    :param workdbname: name of the author table to search
    :param searchobject: the governing SearchObject
    :param dbconnection: an open db connection
    :return: the raw hits that passed the within-x-words check
    """

    so = searchobject
    dbcursor = dbconnection.cursor()
    dbconnection.setautocommit()

    # you will only get session['maxresults'] back from substringsearch() unless you raise the cap
    # "Roman" near "Aetol" will get 3786 hits in Livy, but only maxresults will come
    # back for checking: but the Aetolians are likely not among those passages...
    templimit = 9999

    if so.lemma:
        step = hipparchia.config['LEMMACHUNKSIZE']
        allforms = so.lemma.formlist
        regexchunks = [wordlistintoregex(allforms[j:j + step]) for j in range(0, len(allforms), step)]
        candidates = list()
        for chunk in regexchunks:
            candidates.extend(substringsearch(chunk, workdbname, so, dbcursor, templimit))
        so.usewordlist = 'polytonic'
    else:
        candidates = list(substringsearch(so.termone, workdbname, so, dbcursor, templimit))

    fullmatches = list()
    for candidate in candidates:
        lineobject = dblineintolineobject(candidate)
        environs = grableadingandlagging(lineobject, so, dbcursor)
        # print(lineobject.universalid, so.termtwo, '\n\t[lag] ', environs['lag'], '\n\t[lead]', environs['lead'])
        foundbefore = re.search(so.termtwo, environs['lag'])
        foundafter = re.search(so.termtwo, environs['lead'])
        if so.near and (foundbefore or foundafter):
            # "is near": the second term showed up on either side
            fullmatches.append(candidate)
        elif not so.near and not foundbefore and not foundafter:
            # "is not near": the second term showed up on neither side
            fullmatches.append(candidate)

    return fullmatches
def precomposedphraseandproximitysearch(so: SearchObject) -> List[dbWorkLine]:
    """
    do a precomposedsqlsubqueryphrasesearch() and then search inside the
    results for part two...

    corner case tester: two line-enders: "non solum" + "temporum dignitatem"
        [12] Caesar, De Bello Gallico 7.54.4.3-4:
        '... ut non | solum in pristinum statum redissent, sed omnium tem- | porum dignitatem ...'

    corner case tester: two distant line-enders: "temporum dignitatem" + "obsides Galliae"
        (7.54.4.3 vs 7.55.2.1: the two phrases each span a line break and sit
        several lines apart)

    the old code will trick you by pretending it is doing a valid search even
    though it is not really set up to handle this situation and was not
    supposed to promise that it could do phrase+ [it's the
    phrase-spanning-two-lines bit that yields the problem since you do
    "lemma+" but have no handler for the multi-line issue]

        0.0.0-1.8.1: Sought all 19 known forms of »χώρα« within 1 lines of
        »μεγάλην δύναμιν«: 1 passage found (Ctesias F.5.47 only)

        1.8.2+: same search: 2 passages found (F.5.47 AND F.14.54, where
        'μεγάλην | δύναμιν' spans a line break)
    """

    #
    # initially do "within x lines"
    #

    # a phrase is any term with interior whitespace
    phrasefinder = re.compile(r'[^\s]\s[^\s]')

    if re.search(phrasefinder, so.seeking) and re.search(phrasefinder, so.proximate):
        secondsearch = precomposedsqlsubqueryphrasesearch
    elif not re.search(phrasefinder, so.seeking) and re.search(phrasefinder, so.proximate):
        # only the proximate half is a phrase: swap so the phrase is searched first
        # NOTE(review): 'swapseekingandproxmate' is presumably the method's actual (misspelled) name
        so.swapseekingandproxmate()
        so.swaplemmaoneandtwo()
        secondsearch = basicprecomposedsqlsearcher
    else:
        secondsearch = basicprecomposedsqlsearcher

    # stash the bits of the SearchObject that the first pass will clobber
    c = so.cap
    ps = so.proximate
    so.proximate = str()
    pl = so.lemmatwo
    so.lemmatwo = str()
    so.phrase = so.seeking
    firstterm = so.phrase

    # the intermediate hit list is allowed to be larger than the final cap
    so.cap = hipparchia.config['INTERMEDIATESEARCHCAP']

    initialhitlines = precomposedsqlsubqueryphrasesearch(so)

    # restore the stashed state and retarget the SearchObject at part two
    so.seeking = ps
    so.lemmaone = pl
    so.setsearchtype()
    so.cap = c

    if secondsearch == precomposedsqlsubqueryphrasesearch:
        so.phrase = ps
    else:
        so.phrase = str()

    # NOTE(review): 'perparesoforsecondsqldict' is presumably the helper's actual (misspelled) name
    so = perparesoforsecondsqldict(so, initialhitlines)

    so.searchsqldict = searchlistintosqldict(so, so.seeking)
    if so.lemmaone:
        so.searchsqldict = rewritesqlsearchdictforlemmata(so)

    so.poll.sethits(0)
    newhitlines = secondsearch(so)

    # index the first-pass hits by their unique line ids
    initialhitlinedict = {hl.uniqueid: hl for hl in initialhitlines}

    # every second-pass hit claims all line ids within so.distance of itself
    newhitlineids = set()
    for nhl in newhitlines:
        indices = list(range(nhl.index - so.distance, nhl.index + so.distance + 1))
        # NOTE(review): 'wkuinversalid' is presumably dbWorkLine's actual (misspelled) attribute name
        ids = ['{a}_{b}'.format(a=nhl.wkuinversalid, b=i) for i in indices]
        newhitlineids.update(ids)

    maybefinalhitines = list()
    if so.near:
        # "is near": keep first-pass hits that fall inside a second-pass zone
        maybefinalhitines = [initialhitlinedict[hl] for hl in initialhitlinedict
                             if hl in newhitlineids]
    elif not so.near:
        # "is not near": keep first-pass hits that fall outside every zone
        maybefinalhitines = [initialhitlinedict[hl] for hl in initialhitlinedict
                             if hl not in newhitlineids]

    #
    # if necessary, do "within x words": x lines hits will always be a superset of the
    # within-x-words hits, so we only have to pare the list down
    #

    if so.lemmaone:
        secondterm = wordlistintoregex(so.lemmaone.formlist)
    else:
        secondterm = so.seeking

    if so.scope == 'words':
        finalhitlines = paredowntowithinxwords(so, firstterm, secondterm, maybefinalhitines)
    else:
        finalhitlines = maybefinalhitines

    # to humor rewriteskgandprx()
    # but that formatting doesn't 100% work yet...
    so.termone = firstterm
    so.termtwo = secondterm
    so.lemmatwo = so.lemmaone

    return finalhitlines