Пример #1
0
def rebuildsearchobjectviasearchorder(so: SearchObject) -> SearchObject:
    """

    rewrite the searchobject so that you look for the less common things first

    the search object is modified in place and the same object is returned

    the cases, in order:

        two lemmata: the lemma whose querytotalwordcounts() result has the smaller '.t' goes first
        only one lemma (so.lemma or so.proximatelemma): nothing is reordered
        a "near" search that is accented or whose termone starts with [a-z]:
            findleastcommonterm() picks the rarer term; termone/termtwo are swapped
            unless the whitespace-massaged so.seeking is already the rarer one
        any other "near" search: the longer term goes first

    termone/termtwo are never swapped in an "is not near" search: that would change the focus of the search

    :param so: the search object to reorder
    :return: the same search object

    """

    if so.lemmaone and so.lemmatwo:
        hwone = querytotalwordcounts(so.lemmaone.dictionaryentry)
        hwtwo = querytotalwordcounts(so.lemmatwo.dictionaryentry)
        # '.t' is presumably the total count across the corpora - TODO confirm
        # NOTE(review): this swap is not conditioned on so.near; confirm that reordering lemmata
        # is safe for an "is not near" search
        if hwtwo.t < hwone.t:
            so.lemmaone, so.lemmatwo = so.lemmatwo, so.lemmaone
    elif so.lemma or so.proximatelemma:
        pass
    elif (so.accented or re.search(r'^[a-z]', so.termone)) and so.near:
        # the parentheses matter: 'and' binds tighter than 'or', so without them an accented
        # "is not near" search would get here and have its terms swapped
        # choose the necessarily faster option
        unmodifiedskg = massagesearchtermsforwhitespace(so.seeking)
        unmodifiedprx = so.proximate
        leastcommon = findleastcommonterm(unmodifiedskg + ' ' + unmodifiedprx,
                                          so.accented)
        if leastcommon != unmodifiedskg:
            so.termone, so.termtwo = so.termtwo, so.termone
    elif len(so.termtwo) > len(so.termone) and so.near:
        # look for the longest word first since that is probably the quicker route
        # but you can't swap searchingfor and proximate this way in a 'is not near' search without yielding the wrong focus
        so.termone, so.termtwo = so.termtwo, so.termone

    return so
Пример #2
0
def precomposedphraseandproximitysearch(so: SearchObject) -> List[dbWorkLine]:
    """

    run precomposedsqlsubqueryphrasesearch() for the first half of the search, then search
    around those hits for the second half

    how it goes:

        [a] decide how to run the second pass: as another phrase search if both so.seeking and
            so.proximate look like phrases (non-space, space, non-space); otherwise as a basic
            search, swapping seeking/proximate and the lemmata first if only so.proximate is a phrase
        [b] first pass: phrase search for so.seeking with so.cap temporarily set to
            INTERMEDIATESEARCHCAP and with proximate/lemmatwo blanked out
        [c] second pass: the old proximate (and its lemma) becomes seeking/lemmaone and is searched
            for via a fresh searchsqldict built by perparesoforsecondsqldict()
        [d] keep the first pass hits that are (near) or are not (not near) within so.distance
            lines of a second pass hit
        [e] if so.scope is 'words', pare the survivors down via paredowntowithinxwords()

    so is modified along the way; on exit termone, termtwo and lemmatwo are reset "to humor
    rewriteskgandprx()"

    corner case tester: two line-enders: non solum + temporum dignitatem

        [12]   Caesar, De Bello Gallico: book 7, chapter 54, section 4, line 2

        7.54.4.2 fortunam quamque in amplitudinem deduxisset, ut non
        7.54.4.3 solum in pristinum statum redissent, sed omnium tem-
        7.54.4.4 porum dignitatem et gratiam antecessisse viderentur.

    corner case tester: two distant line-enders: temporum dignitatem + obsides Galliae

        7.54.4.3 solum in pristinum statum redissent, sed omnium tem-
                 porum dignitatem et gratiam antecessisse viderentur.
                 his datis mandatis eos ab se dimisit.
        7.55.1.1 Noviodunum erat oppidum Haeduorum ad ripas
        7.55.2.1 Ligeris opportuno loco positum. huc Caesar omnes ob-
                 sides Galliae, frumentum, pecuniam publicam, suorum

    the old code will trick you by pretending it is doing a valid search even though it is not
    really set up to handle this situation and was not supposed to promise that it could do phrase+
    [the phrase-spanning-two-lines bit yields the problem: you do "lemma+" but have no handler
    for the multi-line issue]

    reference search: all 19 known forms of »χώρα« within 1 lines of »μεγάλην δύναμιν«,
    850 B.C.E. to 300 B.C.E.

        0.0.0-1.8.1: 1 passage (0.77s; 3,182 works)
            [1] Ctesias, Fragmenta: Volume-Jacoby#-F 3c,688,F, fragment 5, line 47

        1.8.2+: 2 passages (2.2s; 2,346 works)
            [1] Ctesias, Fragmenta: Volume-Jacoby#-F 3c,688,F, fragment 5, line 47
            [2] Ctesias, Fragmenta: Volume-Jacoby#-F 3c,688,F, fragment 14, line 54
                [»μεγάλην« ends line 54 and »δύναμιν« opens line 55; »χώραν« is in line 53]

    :param so: the search object; it is mutated
    :return: the surviving first pass hit lines

    """

    # [a] which searcher gets the second pass?
    phrasefinder = re.compile(r'[^\s]\s[^\s]')
    seekingisphrase = re.search(phrasefinder, so.seeking)
    proximateisphrase = re.search(phrasefinder, so.proximate)

    if seekingisphrase and proximateisphrase:
        secondsearch = precomposedsqlsubqueryphrasesearch
    else:
        if proximateisphrase:
            # only the proximate term is a phrase: make it the thing the first pass looks for
            so.swapseekingandproxmate()
            so.swaplemmaoneandtwo()
        secondsearch = basicprecomposedsqlsearcher

    # [b] first pass: initially do "within x lines"
    # set aside everything the phrase search should not see, then search for so.seeking alone
    originalcap = so.cap
    heldproximate = so.proximate
    so.proximate = str()
    heldlemmatwo = so.lemmatwo
    so.lemmatwo = str()
    so.phrase = so.seeking
    firstterm = so.phrase

    so.cap = hipparchia.config['INTERMEDIATESEARCHCAP']
    firstpasshits = precomposedsqlsubqueryphrasesearch(so)

    # [c] second pass: what was held back is now the thing sought
    so.seeking = heldproximate
    so.lemmaone = heldlemmatwo
    so.setsearchtype()
    so.cap = originalcap

    so.phrase = heldproximate if secondsearch == precomposedsqlsubqueryphrasesearch else str()

    so = perparesoforsecondsqldict(so, firstpasshits)
    so.searchsqldict = searchlistintosqldict(so, so.seeking)
    if so.lemmaone:
        so.searchsqldict = rewritesqlsearchdictforlemmata(so)

    so.poll.sethits(0)
    secondpasshits = secondsearch(so)

    # [d] every line id that sits within so.distance lines of a second pass hit
    firstpassbyid = {line.uniqueid: line for line in firstpasshits}
    neighbourids = set()
    for hit in secondpasshits:
        window = range(hit.index - so.distance, hit.index + so.distance + 1)
        neighbourids.update('{a}_{b}'.format(a=hit.wkuinversalid, b=i) for i in window)

    if so.near:
        # "is near"
        candidates = [line for uid, line in firstpassbyid.items() if uid in neighbourids]
    else:
        # "is not near"
        candidates = [line for uid, line in firstpassbyid.items() if uid not in neighbourids]

    # [e] if necessary, do "within x words": this only ever narrows the "within x lines" candidates
    secondterm = wordlistintoregex(so.lemmaone.formlist) if so.lemmaone else so.seeking

    if so.scope == 'words':
        finalhitlines = paredowntowithinxwords(so, firstterm, secondterm, candidates)
    else:
        finalhitlines = candidates

    # to humor rewriteskgandprx()
    # but that formatting doesn't 100% work yet...
    so.termone = firstterm
    so.termtwo = secondterm
    so.lemmatwo = so.lemmaone

    return finalhitlines
Пример #3
0
def precomposedsqlwithinxlinessearch(so: SearchObject) -> List[dbWorkLine]:
    """

    after finding x, look for y within n lines of x

    the first pass comes from generatepreliminaryhitlist(); the second pass is a
    basicprecomposedsqlsearcher() search for so.termtwo (or for the forms of so.lemmatwo)
    run against a fresh searchsqldict from perparesoforsecondsqldict()

    what comes back is the set of first pass lines that do ("near") or do not ("not near")
    fall within so.distance lines of a second pass hit

    people who send phrases to both halves and/or a lot of regex will not always get what they want

    note that this implementation is significantly slower than the standard
    withinxlines() + simplewithinxlines()

    :param so: the search object; searchsqldict (and lemmaone, if there is a lemmatwo) get rewritten
    :return: the first pass hit lines that satisfy the proximity condition

    """

    firstpasshits = generatepreliminaryhitlist(so)

    # the second query needs its own searchsqldict w/ a new temptable:
    #   sq = { table1: {query: q, data: d, temptable: t},
    #          table2: {query: q, data: d, temptable: t}, ...
    # so searchlistintosqldict() gets refed after being primed for a 'temptable' search;
    # the temptable follows the paradigm of wholeworktemptablecontents(), i.e., something like
    #   CREATE TEMPORARY TABLE in0f08_includelist AS
    #       SELECT values AS includeindex FROM unnest(ARRAY[768,769,770,...,766,767]) values

    so = perparesoforsecondsqldict(so, firstpasshits)

    so.searchsqldict = searchlistintosqldict(so, so.termtwo)
    if so.lemmatwo:
        so.lemmaone = so.lemmatwo
        so.searchsqldict = rewritesqlsearchdictforlemmata(so)

    template = 'Now searching among the {n} initial finds for {l}"{x}"'
    so.poll.statusis(template.format(n=len(firstpasshits), x=so.termtwo, l=str()))
    if so.lemmaone:
        # NOTE(review): this tests lemmaone, not lemmatwo; if the caller supplied a lemmaone but no
        # lemmatwo (and perparesoforsecondsqldict() does not clear it) the message names lemmaone
        # even though the query above was built from so.termtwo - confirm this is intended
        so.poll.statusis(
            template.format(n=len(firstpasshits),
                            x=so.lemmaone.dictionaryentry,
                            l="all forms of "))

    so.poll.sethits(0)
    secondpasshits = basicprecomposedsqlsearcher(so)

    # the second pass can return, e.g., in0001w0ig_493 and in0001w0ig_492, i.e., 2 lines that are
    # part of the same 'hit': so it can't be used directly but only checked against the first pass;
    # that's fine since "not near" would push us in this direction in any case

    firstpassbyid = {line.uniqueid: line for line in firstpasshits}

    # every line id within so.distance lines of something the second pass found
    neighbourids = set()
    for hit in secondpasshits:
        window = range(hit.index - so.distance, hit.index + so.distance + 1)
        neighbourids.update('{a}_{b}'.format(a=hit.wkuinversalid, b=i) for i in window)

    # "is near" keeps the lines inside a neighbourhood; "is not near" keeps the ones outside
    wantnear = bool(so.near)
    return [line for uid, line in firstpassbyid.items() if (uid in neighbourids) == wantnear]