Example No. 1
def issue_search(queries_list,
                 return_objects=False,
                 make_phrase=False,
                 case_sensitive=False):
    # Remove quotation marks
    queries = [q.replace("'", "").replace('"', '') for q in queries_list]

    if make_phrase:
        queries = ["\"" + q + "\"" for q in queries]

    ix = get_issue_index(case_sensitive=case_sensitive)
    parser = QueryParser("text", schema=ix.schema)
    parser.add_plugin(PhrasePlugin())

    with ix.searcher() as searcher:
        parsed_queries = [parser.parse(q) for q in queries]
        q = whoosh.query.Or(parsed_queries)
        results = searcher.search(q, limit=None)
        # print "   -", len(results), "results"
        if return_objects:
            return [
                LobbyingSpecificIssue.query.get(int(i['id'])) for i in results
            ]
        else:
            return [i['id'] for i in results]
Example No. 2
    def searchNote(self):
        """ Sorting criteria: "title > path > content"
            Search matches are organized into html source.
        """

        pattern = self.searchEdit.text()
        if not pattern:
            return
        results = []
        print("Searching using", pattern)
        with self.ix.searcher() as searcher:
            matches = []
            for f in ["title", "path", "content"]:
                queryp = QueryParser(f, self.ix.schema)
                queryp.add_plugin(RegexPlugin())
                # r"pattern" is the desired regex term format
                query = queryp.parse('r"' + pattern + '"')
                ms = searcher.search(query, limit=None) # default limit is 10!
                for m in ms:
                    if not m in matches:
                        matches.append(m)

            for r in matches:
                title = r['title']
                path = r['path']
                term = r.highlights("content")
                results.append([title, path, term])

            html = ""
            for title, path, hi in results:
                html += ("<p><a href='" + path + "'>" + title +
                         "</a><br/><span class='path'>" +
                         path + "</span><br/>" + hi + "</p>")
            self.searchView.setHtml(html)
            print("Finished searching", pattern)
Example No. 3
def get_html_correction(searcher, query_str, qp):
    exact_qp = QueryParser('exact', my_index.search_schema)
    exact_qp.add_plugin(DateParserPlugin())
    exact_qp = exact_qp.parse(query_str)
    try:
        corrected_query = searcher.correct_query(exact_qp, query_str, prefix=1)
    except:
        return ""

    for token in corrected_query.tokens:
        # is this some sort of bug with Whoosh? startchar:8, endchar:9 original:'tes?' the hell?
        if query_str[token.startchar:token.endchar] != token.original:
            return ""
        for variations in (uk_variations, us_variations):
            if token.original in variations and searcher.ixreader.frequency(
                    'exact', variations[token.original]) > 0:
                token.text = variations[token.original]
                break
        # not sure this code ever gets a chance to run due to above possible bug
        if re.search(r'\W', token.original):
            token.text = token.original
    corrected_query_str = replace_tokens(query_str, corrected_query.tokens)
    corrected_qp = QueryParser('stemmed', my_index.search_schema)
    corrected_qp.add_plugin(DateParserPlugin())
    corrected_qp = corrected_qp.parse(corrected_query_str)
    if corrected_qp == qp:
        return ""

    result = '<h3>Did you mean <a href="{}">{}</a>?</strong></h3>'.format(
        stateful_url_for('search_form', q_query=urlize(corrected_query_str)),
        corrected_query.format_string(
            highlight.HtmlFormatter(classname="change")))
    return result
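
The replace_tokens helper used above is not shown in this snippet. A minimal sketch of what it might look like, assuming each correction token carries startchar, endchar, and the corrected text (as the tokens on Whoosh's Correction object do):

def replace_tokens(query_str, tokens):
    # Splice each corrected token back into the original query string.
    # Work right-to-left so earlier character offsets stay valid.
    out = query_str
    for token in sorted(tokens, key=lambda t: t.startchar, reverse=True):
        out = out[:token.startchar] + token.text + out[token.endchar:]
    return out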
Example No. 4
    def search(self,
               user_query,
               ranking_function=scoring.BM25F(),
               phraseSearch=False):
        qp = QueryParser("body", schema=self.ix.schema)

        # Once you have a QueryParser object, you can call parse() on it to parse a query string into a query object:
        # default query lang:
        # If the user doesn’t explicitly specify AND or OR clauses:
        # by default, the parser treats the words as if they were connected by AND,
        # meaning all the terms must be present for a document to match
        # we will change this
        # to phrase search "<query>" - use quotes

        qp.add_plugin(qparser.GtLtPlugin)
        # qp.remove_plugin_class(qparser.PhrasePlugin)
        qp.add_plugin(qparser.PhrasePlugin)

        if phraseSearch == True:
            user_query = '"' + user_query + '"'

        query = qp.parse(user_query)
        print("# user_query", user_query, ", Query: ", query)
        print(query)

        with self.ix.searcher(weighting=ranking_function) as searcher:
            matches = searcher.search(query, limit=None)
            print("Total Number of Results:", len(matches))
            print("Number of scored and sorted docs in this Results object:",
                  matches.scored_length())
            results = [item.fields() for item in matches]

        resultsDF = pandas.DataFrame.from_dict(results)
        return (matches.docs(), resultsDF)
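
The comments in the example above note that the parser ANDs terms together by default and say "we will change this", but the grouping is never actually changed. A minimal sketch of switching to OR grouping, assuming the same index ix with a "body" field:

from whoosh import qparser
from whoosh.qparser import QueryParser

# OrGroup: any term may match; the factory(0.9) variant still rewards
# documents that match more of the query terms.
og = qparser.OrGroup.factory(0.9)
qp = QueryParser("body", schema=ix.schema, group=og)
print(qp.parse(u"render shade animate"))  # an Or(...) tree instead of And(...)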
Example No. 5
 def searchNote(self):
     pattern = self.searchEdit.text()
     qres = []
     with self.ix.searcher() as searcher:
         queryp = QueryParser("content", self.ix.schema)
         queryp.add_plugin(RegexPlugin())
         query = queryp.parse('r"' + pattern + '"')
                              # r"pattern" is the desired regex term format
         pathFacet = sorting.FieldFacet("path")
         scores = sorting.ScoreFacet()
         results = searcher.search(
             query, limit=None, sortedby=[pathFacet, scores])  # default limit is 10!
         for r in results:
             listItem = QListWidgetItem()
             title = r['title']
             text = r['path']
             term = r.highlights("content")
             qres.append([title, text, term])
         html = """
                 <style>
                     body { font-size: 14px; }
                     .path { font-size: 12px; color: #009933; }
                 </style>
                """
         for ti, te, hi in qres:
             html += ("<p><a href='" + te + "'>" + ti + 
                      "</a><br/><span class='path'>" + 
                     te + "</span><br/>" + hi + "</p>")
         self.searchView.setHtml(html)
Example No. 6
    def query(self, q):
        # parser
        qpcontent = QueryParser("fullText", schema=self.indexer.schema)
        qpanchor = QueryParser("anchorText", schema=self.indexer.schema)
        qpcontent.add_plugin(qparser.OperatorsPlugin())
        qpanchor.add_plugin(qparser.OperatorsPlugin())
        # query
        qcontent = qpcontent.parse(q)
        qanchor = qpanchor.parse(q)
        resWeb = []
        with self.indexer.searcher() as s:
            resContent = s.search(qcontent, limit=40)
            resAnchor = s.search(qanchor, limit=40)
            resFinal = resAnchor
            resFinal.upgrade_and_extend(resContent)

            respgMap = {}
            resAnchorMap = {}
            resTextMap = {}
            resURLMap = {}

            for r in resFinal:
                resURLMap[r['pageURL']] = r
            for r in resFinal:
                respgMap[r['pageURL']] = self.pgrank[self.allFilesMap[
                    r['pageURL']]]
                resAnchorMap[r['pageURL']] = r['anchorText']
                resTextMap[r['pageURL']] = r['fullText']

            supportedRes = []
            for r in respgMap:
                supportedRes.append((r, respgMap[r]))

            # consolidate the result by VSMSP algorithm
            ii = 0
            while ii + 10 < len(supportedRes):
                supportedRes[ii:ii + 10] = sorted(supportedRes[ii:ii + 10],
                                                  key=operator.itemgetter(1))
                ii = ii + 10

            supportedRes[ii:] = sorted(supportedRes[ii:],
                                       key=operator.itemgetter(1))

            for r in supportedRes:
                hts = self.__cleanhtml(resURLMap[r[0]].highlights("anchorText",
                                                                  top=3))
                hts = hts + self.__cleanhtml(resURLMap[r[0]].highlights(
                    "fullText", top=2))
                resWeb.append([resURLMap[r[0]]["title"], r[0], hts])
                """
                print(resURLMap[r[0]]["title"])
                print(r[0])
                print(r[1])
                print(self.__cleanhtml(resURLMap[r[0]].highlights("anchorText",top=3)))
                print(self.__cleanhtml(resURLMap[r[0]].highlights("fullText",top=2)))
                print("\n\n\n")
                """

        return resWeb
Example No. 7
def getLastNotice():
    """ @returns the last added notice to the Index """
    ix = open_dir("index")
    with ix.searcher() as searcher: 
        qp = QueryParser("content", ix.schema, group=OrGroup)
        qp.add_plugin(DateParserPlugin())
        query = qp.parse(u"date:'[18000101 to today]") 
        results = searcher.search(query, limit=1) 
        return results.fields(0)
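
getLastNotice relies on relevance ranking, so the single hit it returns is not necessarily the most recently added notice. A minimal sketch of sorting the same date-range query by the date field instead, assuming the index lives in "index" and its "date" field is sortable:

from whoosh import sorting
from whoosh.index import open_dir
from whoosh.qparser import QueryParser, OrGroup
from whoosh.qparser.dateparse import DateParserPlugin

ix = open_dir("index")
with ix.searcher() as searcher:
    qp = QueryParser("content", ix.schema, group=OrGroup)
    qp.add_plugin(DateParserPlugin())
    query = qp.parse(u"date:[18000101 to today]")
    results = searcher.search(query, limit=1,
                              sortedby=sorting.FieldFacet("date"),
                              reverse=True)  # newest first
    last_notice = results.fields(0) if len(results) else None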
Example No. 8
def searcher(index_path, query):
    ix = open_dir(index_path)
    searcher = ix.searcher()
    parser = QueryParser("content", ix.schema)
    parser.add_plugin(FuzzyTermPlugin())
    my_query = parser.parse(query)
    results = searcher.search(my_query, limit=None)
    for result in results:
        print(result['content'])
def QuerySent(query,schema,SentNum):
    # SetIndex("TestIndex")
    # use Fuzzy
    parser =QueryParser(None,schema)
    parser.add_plugin(MultifieldPlugin(["title","sent"]))
    # parser.add_plugin(FuzzyTermPlugin())
    myquery = parser.parse(query)
    results =searcher.search(myquery)
    acturalResult = ""
    for rs in results:
        if rs["sent"] == str(SentNum):
            return rs["content"]
Example No. 10
def findMailbyDate(indexes, input_user):
    ix = indexes.get("index_emails")
    qp = QueryParser("date", schema=ix.schema)
    qp.add_plugin(DateParserPlugin())
    date_now = datetime.now()
    date_input = datetime.strptime(input_user, "%Y%m%d")
    q = qp.parse(unicode("date:[" + str(date_input) + " to " + str(date_now) + "]"))
        
    with ix.searcher() as s:
        results = s.search(q)
        for result in results:
            print "Remitente: %s, Destinatarios: %s, Asunto: %s" % (result["sender_email"], result["recipient_emails"], result["subject"])
Example No. 11
	def search(self,text):
		parser = QueryParser("data", self.ix.schema,group=qparser.OrGroup)
		parser.add_plugin(qparser.FuzzyTermPlugin())
		parser.add_plugin(qparser.SequencePlugin())
		
		query = parser.parse(text)
		output=[]
		with self.ix.searcher() as searcher:    
			results = searcher.search(query,terms=True)
			for r in results:
				output.append( { 'icd': { 'name' : r['name'], 'icdcode': r['icdcode'] } } )
		return output
Example No. 12
def apartado_b(date1,date2):
    ix = open_dir("Index")
    dataFromResults = []
    with ix.searcher() as searcher:
        parser = QueryParser("fecha",ix.schema)
        parser.add_plugin(DateParserPlugin())
        query = u"date:[" + date1 + " to " + date2 + "]"
        print(query)
        query = parser.parse(query)
        results = searcher.search(query)
        for r in results:
            dataFromResults.append([r["titulo"],r["fecha"]])
    return dataFromResults
def searchFe(busqueda):
    ix = open_dir("index")
    searcher = ix.searcher()
    date = "{" + busqueda + " to]"
    parser = QueryParser("fecha", ix.schema)

    parser.add_plugin(DateParserPlugin(free=True))
    parser.add_plugin(GtLtPlugin())
    myquery = parser.parse(date)

    results = searcher.search(myquery)

    return results
Example No. 14
def search_index(words):
    xg_duanluo = []
    with ix.searcher() as s:
        qp = QueryParser('duanluo', schema=ix.schema, group=qparser.OrGroup)
        qp.remove_plugin_class(qparser.WildcardPlugin)
        qp.add_plugin(qparser.PrefixPlugin())

        for word in words:
            q = qp.parse(u'{}'.format(word))
            results = s.search(q, limit=10)
            for i in results:
                xg_duanluo.append((i['id'], i['duanluo']))
    return xg_duanluo
Example No. 15
async def word_count(query_str, ctx):
    ix = open_dir("indexdir")
    parser = QueryParser("title", ix.schema)
    parser.add_plugin(DateParserPlugin())
    query = parser.parse(query_str)
    print(query)
    with ix.searcher(weighting=scoring.BM25F) as searcher:
        results = searcher.search(query)
        embed = discord.Embed(title="Wordcount", color=discord.Color(0x3cd63d))
        for hit in results:
            embed.add_field(name="{}".format(hit["title"]),
                            value="Wordcount: **{}**".format(hit["wordcount"]),
                            inline=False)
        await ctx.send(embed=embed)
Example No. 16
def basic_search(query,
                 query_parse,
                 group=default_group,
                 facet=default_facet,
                 index=default_index):
    searcher = index.searcher()
    parser = QueryParser(query_parse, index.schema, group=group)
    parser.remove_plugin_class(qparser.PhrasePlugin)
    parser.add_plugin(qparser.SequencePlugin())
    parser.add_plugin(qparser.FuzzyTermPlugin())
    myquery = parser.parse(query)
    results = searcher.search(
        myquery, limit=None, sortedby=facet)  # limit caps the number of results (default 10); see the official docs linked at the top of the post
    print(results)
    return results
Example No. 17
def search_index(words):
    xg_words = []
    with ix.searcher() as s:
        qp = QueryParser('section', schema=ix.schema, group=qparser.OrGroup)

        # allow wildcard-style searches
        qp.remove_plugin_class(qparser.WildcardPlugin)
        qp.add_plugin(qparser.PrefixPlugin())

        for word in words:
            q = qp.parse(u'{}'.format(word))
            results = s.search(q, limit=10)
            for i in results:
                xg_words.append(i['section'])
    return xg_words
Example No. 18
    def _search(self, query='',  field=None,  index=None, terms=False, limit=None):
        ''' query (exact match) search '''
        index = index or self._default_index
        ix = self.get_index(index)
        fieldin = field or 'content'

        qp = QueryParser(fieldin, ix.schema)
        qp.add_plugin(ws.qparser.SingleQuotePlugin())
        query = qp.parse(query, normalize=False)
        with ix.searcher() as searcher:
            if terms is True:
                results = searcher.search(query, terms=True, limit=limit).matched_terms()
            else:
                results = list(searcher.search(query, limit=limit).items())

        return results
def ApartadoB(fecha):        
    ix = open_dir("index")
    qp = QueryParser("fecha", schema=ix.schema)
    query = unicode("'"+fecha+" to today'")
    qp.add_plugin(DateParserPlugin(free=True))
    q = qp.parse(query)
    print(q)
    s=ix.searcher()
    results = s.search(q)
    print(results)
    for n in results:
        print n.get("fecha")
        print n.get("remitente")
        print n.get("destinatarios")
        print n.get("asunto")
        print("*************\n")
    return results
Example No. 20
 def search(self, expr, limit=10000):
     with self._index.searcher() as searcher:
         query = QueryParser("raw", self._index.schema)
         query.add_plugin(FieldsPlugin())
         query.add_plugin(RangePlugin())
         query.add_plugin(GtLtPlugin())
         query.add_plugin(WildcardPlugin())
         query = query.parse(expr)
         for x in searcher.search(query, limit=limit):
             yield x
Example No. 21
 def search(self, expr, limit=10000):
     with self._index.searcher() as searcher:
         query = QueryParser("raw", self._index.schema)
         query.add_plugin(FieldsPlugin())
         query.add_plugin(RangePlugin())
         query.add_plugin(GtLtPlugin())
         query.add_plugin(WildcardPlugin())
         query = query.parse(expr)
         for x in searcher.search(query, limit=limit):
             yield x
Example No. 22
def match(query_str, idx, limit=40):
    ret_results = []

    query_words = words_get(query_str)
    if len(query_words) == 0:
        return ret_results

    with idx.searcher() as searcher:
        rome_facet = sorting.FieldFacet('rome')

        # Strict search, with forced correction
        parser = QueryParser('label', idx.schema)
        query = parser.parse(f'{query_str}')
        cor = searcher.correct_query(query, query_str)
        results = searcher.search(cor.query, limit=20, collapse=rome_facet)

        # Word-joker search
        parser = QueryParser('label', idx.schema)
        query = parser.parse(f'{query_str}*')
        results_partial = searcher.search(query, limit=20, collapse=rome_facet)
        results.upgrade_and_extend(results_partial)

        # Fuzzy search
        parser = QueryParser('label', idx.schema, termclass=CustomFuzzyTerm)
        parser.add_plugin(FuzzyTermPlugin())

        shortword = re.compile(r'\W*\b\w{1,3}\b')
        query_prep = shortword.sub('', query_str)
        query = parser.parse(query_prep)
        results_fuzzy = searcher.search(query,
                                        limit=limit,
                                        collapse=rome_facet)

        results.upgrade_and_extend(results_fuzzy)
        for res in results:
            ret_results.append({
                'id': res['rome'],
                'label': res['label'],
                'value': res['label'],
                'occupation': res['slug'],
                'source': res['source'],
                'score': res.score
            })

    return sorted(ret_results, key=lambda e: e['score'], reverse=True)
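
CustomFuzzyTerm is used above as the parser's termclass but is not defined in this snippet. A minimal sketch of such a class, assuming the intent is simply to raise the default edit distance used for fuzzy terms:

from whoosh.query import FuzzyTerm

class CustomFuzzyTerm(FuzzyTerm):
    # Identical to FuzzyTerm except that maxdist defaults to 2.
    def __init__(self, fieldname, text, boost=1.0, maxdist=2,
                 prefixlength=1, constantscore=True):
        super(CustomFuzzyTerm, self).__init__(fieldname, text, boost,
                                              maxdist, prefixlength,
                                              constantscore)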
Example No. 23
def search_index(words):
    xg_words = []
    with ix.searcher() as s:

        # group=qparser.OrGroup lets any query term match, instead of requiring every term to match
        qp = QueryParser('section', schema=ix.schema, group=qparser.OrGroup)

        # The next two lines enable wildcard-style searches, e.g. "窗前*月光"
        qp.remove_plugin_class(qparser.WildcardPlugin)
        qp.add_plugin(qparser.PrefixPlugin())

        for word in words:
            q = qp.parse(u'%s' % word)
            # limit: how many search results to return
            results = s.search(q, limit=10)
            for i in results:
                xg_words.append(i['section'])
                # print (word,i['section'])
    return xg_words
Example No. 24
    def test_custom_fields(self):
        obj = ObjectD(title=u'title', blurb='this is a blurb')
        db.session.add(obj)
        db.session.commit()

        self.assert_search_result(ObjectD, 'blurb') 
        self.assert_search_result(ObjectD, '/blog/%s' % obj.id, fields=['url'])

        ### Date ###
        from whoosh.qparser import QueryParser
        from whoosh.qparser.dateparse import DateParserPlugin

        # Instantiate a query parser / add the DateParserPlugin to the parser
        qp = QueryParser("date", ObjectD.pure_whoosh._index.schema)
        qp.add_plugin(DateParserPlugin())

        self.assert_search_result(ObjectD, qp.parse("created:today"))

        self.assert_search_no_result(ObjectD, 'what')        # Sanity check
Example No. 25
File: commands.py Project: maw/urla
    def run(args):
        query = unicode(" ".join(args.query))

        ix = open_dir("urla.index")

        qp = QueryParser("content", ix.schema)
        qp.add_plugin(DateParserPlugin())

        with ix.searcher() as searcher:
            parsed = qp.parse(query)
            print parsed

            results = searcher.search(parsed, sortedby="when", reverse=True,
                                      limit=None)

            for result in results:
                timestamp = result["when"].strftime("%Y-%m-%d")

                print "%s %s" % (timestamp, result["content"].encode("utf-8"))
Example No. 26
class Query(object):
    def __init__(self, index_name):
        self.index_name = index_name
        self.ix = open_dir(index_name)
        self.indexer = Indexer("./template/cache/")
        self.singleParser = QueryParser("word", schema=self.indexer.schema)
        self.searcher = self.ix.searcher()
        self.singleParser.add_plugin(EveryPlugin)

    def translate(self, word, level):
        query = self.singleParser.parse(word.lower())
        query = And([query, Term("level", level)])
        results = self.searcher.search(query)
        for result in results:
            self.log(word)
            return "%s [%s]" % (
                colorize(word, color='magenta'),
                colorize(result["translation"], color='white', bold=True))
        else:
            return word

    def query(self, sentence, level):
        ll = [[word_tokenize(w), ' '] for w in sentence.split()]
        words = list(itertools.chain(*list(itertools.chain(*ll))))
        str_ = ""
        for word in words:
            str_ += self.translate(word, level)

        return str_

    def process(self, filename, level):
        translated = ""
        with open(filename, 'r') as f:
            for line in f.readlines():
                translated += self.query(line, level)
                translated += '\n'
        return translated

    def log(self, sentence):
        with open('logging.txt', 'a') as f:
            f.write(datetime.datetime.now().strftime("%y%m%d%H%M%S") + " " +
                    sentence)
Example No. 27
def test_fuzzy_prefix():
    from whoosh import scoring

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT(spelling=True))

    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        # Match -> first
        w.add_document(title=u("First"),
                       content=u("This is the first document we've added!"))
        # No match
        w.add_document(
            title=u("Second"),
            content=u("The second one is even more interesting! filst"))
        # Match -> first
        w.add_document(title=u("Third"),
                       content=u("The world first line we've added!"))
        # Match -> zeroth
        w.add_document(
            title=u("Fourth"),
            content=u("The second one is alaways comes after zeroth!"))
        # Match -> fire is within 2 edits (transpose + delete) of first
        w.add_document(title=u("Fifth"), content=u("The fire is beautiful"))

    from whoosh.qparser import QueryParser, FuzzyTermPlugin
    parser = QueryParser("content", ix.schema)
    parser.add_plugin(FuzzyTermPlugin())
    q = parser.parse("first~2/3 OR zeroth", debug=False)

    assert isinstance(q, query.Or)
    ft = q[0]
    assert isinstance(ft, query.FuzzyTerm)
    assert ft.maxdist == 2
    assert ft.prefixlength == 3

    with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
        results = searcher.search(q)
        assert len(results) == 4
        assert (" ".join(sorted(
            hit["title"] for hit in results)) == "Fifth First Fourth Third")
Example No. 28
def search_langs(repos, q, limit=1000, **kw):
    index_ = get_langs_index(repos)
    qp = QueryParser("ini", schema=index_.schema)
    qp.add_plugin(GtLtPlugin())
    q = '{0} {1}'.format(
        q, ' '.join('{0}:"{1}"'.format(k, v) for k, v in kw.items()))

    def highlight(res):
        hl = res.highlights('ini', top=1)
        if hl:
            for line in hl.split('\n'):
                if '[[' in line:
                    return line.strip()

    with index_.searcher() as searcher:
        results = searcher.search(qp.parse(q), limit=limit)
        results.formatter = BracketFormatter()
        return (len(results), [
            Languoid(r['id'], r.get('iso'), r['name'], r['level'], r['fname'],
                     highlight(r)) for r in results
        ])
Example No. 29
def summary_search(queries_list, return_objects=False, make_phrase=False):
    queries = [unicode(q).replace("\'", "").replace('\"', '')
               for q in queries_list]

    if make_phrase:
        queries = ["\"" + q + "\"" for q in queries]

    create_summary_index()
    ix = get_summary_index()
    parser = QueryParser("summary", schema=ix.schema)
    parser.add_plugin(PhrasePlugin())

    with ix.searcher() as searcher:
        parsed_queries = [parser.parse(q) for q in queries]
        total_query = whoosh.query.Or(parsed_queries)

        results = searcher.search(total_query, limit=None)
        if return_objects:
            return [Bill.query.get(b['id']) for b in results]
        else:
            return [b['id'] for b in results]
Example No. 30
    def search():
        window = Toplevel()
        scrollbar = Scrollbar(window)
        scrollbar.pack(side = RIGHT, fill=Y)
        mylist3 = Listbox(window, width=100, height=20,yscrollcommand=scrollbar.set)
        scrollbar.config(command = mylist3.yview)
        
        iC = open_dir("indexCorreo")
        qp = QueryParser("fecha", schema=iC.schema)
        qp.add_plugin(DateParserPlugin())
        fecha = E1.get()
        date_now = datetime.now()
        date_email = datetime.strptime(fecha, "%Y%m%d")
        q = qp.parse(u"fecha:["+str(date_email)+"to"+str(date_now)+"]")
        
        with iC.searcher() as s:
            results = s.search(q)
            for result in results:
                mylist3.insert(END, "Remitente: %s, Destinatarios: %s, Asunto: %s" % (result["remitente"],result["destinatarios"],result["asunto"]))

        mylist3.pack(side = LEFT, fill = BOTH)
Example No. 31
async def search(query_str, ctx):
    ix = open_dir("indexdir")
    parser = QueryParser("content", ix.schema)
    parser.add_plugin(qparser.FuzzyTermPlugin())
    parser.add_plugin(GtLtPlugin())
    parser.add_plugin(DateParserPlugin())
    query = parser.parse(query_str)
    print(query)
    with ix.searcher(weighting=scoring.PL2) as searcher:
        results = searcher.search(query, limit=5)
        results.fragmenter = highlight.SentenceFragmenter()
        results.fragmenter.surround = 50
        results.fragmenter.maxchars = 10000
        results.formatter = DiscordBoldFormatter()
        embed = discord.Embed(
            title="Results",
            color=discord.Color(0x3cd63d),
            description="From search: **{}**".format(query_str))
        for hit in results:
            # embed.add_field(name="[{}]({})".format(hit["title"], hit["url"]), value="{}".format(hit.highlights("content")))
            embed.add_field(name="\u200b",
                            value=f"[{hit['title']}]({hit['url']})\n"
                            f"{hit.highlights('content', minscore=0)}",
                            inline=False)
    await ctx.send(embed=embed)
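
DiscordBoldFormatter is not defined in this snippet; a minimal sketch of what such a highlight formatter might look like, wrapping matched terms in Discord's **bold** markup:

from whoosh import highlight

class DiscordBoldFormatter(highlight.Formatter):
    # Wraps matched terms in **...** so Discord renders them bold.
    def format_token(self, text, token, replace=False):
        tokentext = highlight.get_text(text, token, replace)
        return "**%s**" % tokentext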
Example No. 32
def test_fuzzy_prefix():
    from whoosh import scoring

    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT(spelling=True))

    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        # Match -> first
        w.add_document(title=u("First"),
                       content=u("This is the first document we've added!"))
        # No match
        w.add_document(title=u("Second"),
                       content=u("The second one is even more interesting! filst"))
        # Match -> first
        w.add_document(title=u("Third"),
                       content=u("The world first line we've added!"))
        # Match -> zeroth
        w.add_document(title=u("Fourth"),
                       content=u("The second one is alaways comes after zeroth!"))
        # Match -> fire is within 2 edits (transpose + delete) of first
        w.add_document(title=u("Fifth"),
                       content=u("The fire is beautiful"))

    from whoosh.qparser import QueryParser, FuzzyTermPlugin
    parser = QueryParser("content", ix.schema)
    parser.add_plugin(FuzzyTermPlugin())
    q = parser.parse("first~2/3 OR zeroth", debug=False)

    assert isinstance(q, query.Or)
    ft = q[0]
    assert isinstance(ft, query.FuzzyTerm)
    assert ft.maxdist == 2
    assert ft.prefixlength == 3

    with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
        results = searcher.search(q)
        assert len(results) == 4
        assert (" ".join(sorted(hit["title"] for hit in results))
                == "Fifth First Fourth Third")
Example No. 33
def summary_search(queries_list, return_objects=False, make_phrase=False):
    queries = [
        unicode(q).replace("\'", "").replace('\"', '') for q in queries_list
    ]

    if make_phrase:
        queries = ["\"" + q + "\"" for q in queries]

    create_summary_index()
    ix = get_summary_index()
    parser = QueryParser("summary", schema=ix.schema)
    parser.add_plugin(PhrasePlugin())

    with ix.searcher() as searcher:
        parsed_queries = [parser.parse(q) for q in queries]
        total_query = whoosh.query.Or(parsed_queries)

        results = searcher.search(total_query, limit=None)
        if return_objects:
            return [Bill.query.get(b['id']) for b in results]
        else:
            return [b['id'] for b in results]
Example No. 34
  def __init__(self, url, headers, rows):
    self.url = url
    self.headers = headers
    self.rows = rows

    self.schema = Schema(
      name=TEXT(stored=False),
      alternative_names=TEXT(stored=False),
      id=ID(stored=True)
    )
    self.index = RamStorage().create_index(self.schema)

    for c in [NAME_HEADER, ALT_NAMES_HEADER, TYPE_HEADER]:
      assert c in self.headers, 'Required "{}" column not found in {}'.format(c, url)

    name_idx = self.headers.index(NAME_HEADER)
    alt_names_idx = self.headers.index(ALT_NAMES_HEADER)

    writer = self.index.writer()
    for idx, row in enumerate(self.rows):
      name = row[name_idx]
      alt_names = row[alt_names_idx]
      writer.add_document(
        name=str(name),
        alternative_names=str(alt_names),
        id=str(idx)
      )
    writer.commit()

    parser = QueryParser("name", self.index.schema)
    self.exact_name_query_parser = parser

    parser = QueryParser("name", self.index.schema)
    parser.add_plugin(FuzzyTermPlugin())
    self.name_query_parser = parser

    parser = QueryParser("alternative_names", self.index.schema)
    parser.add_plugin(FuzzyTermPlugin())
    self.alt_names_query_parser = parser
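
A short usage sketch for the parsers built above: exact lookups go through exact_name_query_parser, while the fuzzy parsers accept the term~n edit-distance syntax enabled by FuzzyTermPlugin. The lookup method below is illustrative, not part of the original class:

  def lookup(self, name, fuzzy=True):
    # Hypothetical helper: search by name, optionally allowing one edit of slack.
    parser = self.name_query_parser if fuzzy else self.exact_name_query_parser
    qtext = u"%s~1" % name if fuzzy else name
    with self.index.searcher() as searcher:
      results = searcher.search(parser.parse(qtext), limit=5)
      return [hit["id"] for hit in results]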
Example No. 35
    def searchNote(self):
        """ Sorting criteria: "title > path > content"
            Search matches are organized into html source.
        """

        pattern = self.searchEdit.text()
        if not pattern:
            return
        results = []

        with self.whoosh.ix.searcher() as searcher:
            matches = []
            for f in ["title", "path", "content"]:
                queryp = QueryParser(f, self.whoosh.ix.schema)
                queryp.add_plugin(RegexPlugin())
                # r"pattern" is the desired regex term format
                query = queryp.parse('r"' + pattern + '"')
                ms = searcher.search(query, limit=None)  # default limit is 10!
                for m in ms:
                    if not m in matches:
                        matches.append(m)

            for r in matches:
                title = r['title']
                path = r['path']
                term = r.highlights("content")
                results.append([title, path, term])

            html = """
                    <style>
                        body { font-size: 14px; }
                        .path { font-size: 12px; color: #009933; }
                    </style>
                   """
            for title, path, hi in results:
                html += ("<p><a href='" + path + "'>" + title +
                         "</a><br/><span class='path'>" + path +
                         "</span><br/>" + hi + "</p>")
            self.searchView.setHtml(html)
Example No. 36
def issue_search(queries_list, return_objects=False, make_phrase=False,
                 case_sensitive=False):
    # Remove quotation marks
    queries = [q.replace("'", "").replace('"', '') for q in queries_list]

    if make_phrase:
        queries = ["\"" + q + "\"" for q in queries]

    ix = get_issue_index(case_sensitive=case_sensitive)
    parser = QueryParser("text", schema=ix.schema)
    parser.add_plugin(PhrasePlugin())
    
    with ix.searcher() as searcher:
        parsed_queries = [parser.parse(q) for q in queries]
        q = whoosh.query.Or(parsed_queries)
        results = searcher.search(q, limit=None)
        # print "   -", len(results), "results"
        if return_objects:
            return [LobbyingSpecificIssue.query.get(int(i['id']))
                    for i in results]
        else:
            return [i['id'] for i in results]
Example No. 37
def search_date(entry):
    ix = index.open_dir("events")
    tk = Tk()
    scrollbar = Scrollbar(tk, orient="vertical")
    lb = Listbox(tk, width=50, height=20, yscrollcommand=scrollbar.set)
    scrollbar.config(command=lb.yview)

    scrollbar.pack(side="right", fill="y")
    lb.pack(side="left", fill="both", expand=True)
    date = str(entry)
    myquery = "date:<= " + date
    print("Myquery " + myquery)
    qp = QueryParser('fechaInicio', ix.schema)
    qp.add_plugin(DateParserPlugin())

    t = qp.parse(u"date:"+date)
    print(t)
    with ix.searcher() as s:
        results_t = s.search(t, limit=None)
        for r in results_t:
            lb.insert(END, "Categorias: " + r["categorias"], "Título: " + r["titulo"], "Fecha: " + r["fechaInicio"], "")

    tk.mainloop()
Example No. 38
    def searchNote(self):
        """ Sorting criteria: "title > path > content"
            Search matches are organized into html source.
        """

        pattern = self.searchEdit.text()
        if not pattern:
            return
        results = []
        print("Searching using", pattern)
        with self.ix.searcher() as searcher:
            matches = []
            queryp = QueryParser("content", self.ix.schema)
            # allow escaped quotes when regex searching
            queryp.add_plugin(
                RegexPlugin(expr=r'r"(?P<text>[^"\\]*(\\.[^"\\]*)*)"'))
            # ~~r"pattern" is the desired regex term format~~ Don't autoforce regexing
            query = queryp.parse(pattern)
            #print("durp durp", query)
            ms = searcher.search(query, limit=None)  # default limit is 10!
            for m in ms:
                #if not m in matches:
                matches.append(m)

            for r in matches:
                title = r['title']
                path = r['path']
                term = r.highlights("content")
                results.append([title, path, term])

            html = ""
            for title, path, hi in results:
                html += ("<p><a href='" + path + "'>" + title +
                         "</a><br/><span class='path'>" + path +
                         "</span><br/>" + hi + "</p>")
            self.searchView.setHtml(html)
            print("Finished searching", pattern)
Example No. 39
File: commands.py Project: maw/urla
    def run(args):
        ix = open_dir("urla.index")

        qp = QueryParser("content", ix.schema)
        qp.add_plugin(DateParserPlugin())

        while True:
            try:
                query = unicode(raw_input("> "))
            except EOFError:
                print
                sys.exit(0)

            with ix.searcher() as searcher:
                parsed = qp.parse(query)
                print parsed

                results = searcher.search(parsed)

                for result in results:
                    timestamp = result["when"].strftime("%Y-%m-%d")

                    print "%s %s" % (timestamp,
                                     result["content"].encode("utf-8"))
Example No. 40
    def searchNote(self):
        """ Sorting criteria: "title > path > content"
            Search matches are organized into html source.
        """

        pattern = self.searchEdit.text()
        if not pattern:
            return
        results = []
        print("Searching using", pattern)
        with self.ix.searcher() as searcher:
            matches = []
            queryp = QueryParser("content", self.ix.schema)
            # allow escaped quotes when regex searching
            queryp.add_plugin(RegexPlugin(expr=r'r"(?P<text>[^"\\]*(\\.[^"\\]*)*)"'))
            # ~~r"pattern" is the desired regex term format~~ Don't autoforce regexing
            query = queryp.parse(pattern)
            #print("durp durp", query)
            ms = searcher.search(query, limit=None) # default limit is 10!
            for m in ms:
                #if not m in matches:
                matches.append(m)

            for r in matches:
                title = r['title']
                path = r['path']
                term = r.highlights("content")
                results.append([title, path, term])

            html = ""
            for title, path, hi in results:
                html += ("<p><a href='" + path + "'>" + title +
                         "</a><br/><span class='path'>" +
                         path + "</span><br/>" + hi + "</p>")
            self.searchView.setHtml(html)
            print("Finished searching", pattern)
Example No. 41
def search_index(words):
    xg_part = []
    with ix.searcher() as s:
        # group = qparser.OrGroup lets any query term match, instead of requiring every term to match
        qp = QueryParser('part', schema=ix.schema, group=qparser.OrGroup)

        # The next two lines enable wildcard searches, e.g. “窗前*月光”
        qp.remove_plugin_class(qparser.WildcardPlugin)
        qp.add_plugin(qparser.PrefixPlugin())

        # random number
        num = random.randint(3, 7)

        for word in words:
            q = qp.parse(u'%s' % word)

            # limit: how many search results to return
            results = s.search(q, limit=num)
            count = 0
            for i in results:
                if count > 0:  # skip the first hit (avoid returning the entry itself)
                    xg_part.append((i['pid'], i['part']))
                count += 1
    return xg_part
Example No. 42
 def buscar_apartadob(self, query):
     if not self.indice:
         tkMessageBox.showerror('Error', 'No existe ningún índice.\nPor favor, cree un índice y reintente la búsqueda.')
     else:
         result = []
         indice = self.indice
         searcher = indice.searcher()
         
         qp = QueryParser('fecha', schema=self.schema)
         qp.add_plugin(FieldsPlugin())
         qp.add_plugin(RangePlugin())
         qp.add_plugin(GtLtPlugin())
         q = qp.parse(unicode(query))
         
         with searcher as s:
             busqueda = s.search(q)
             result = [[correo['remitente'], correo['destinatarios'], correo['asunto']] for correo in busqueda]
             
     return result
Example No. 43
 def buscar_indice(self, campo, query):
     if not self.indice:
         tkMessageBox.showerror('Error', 'No existe ningún índice.\nPor favor, cree un índice y reintente la búsqueda.')
     else:
         result = []
         indice = self.indice
         searcher = indice.searcher()
         
         qp = QueryParser(campo, schema=self.schema)
         qp.add_plugin(FieldsPlugin())
         qp.add_plugin(RangePlugin())
         qp.add_plugin(GtLtPlugin())
         q = qp.parse(unicode(query))
         
         with searcher as s:
             busqueda = s.search(q)
             result = [correo['numero'] for correo in busqueda]
     
     return result
Example No. 44
#     You can specify None for the schema to create a parser that does not analyze the text of the query, usually for testing purposes.
parser = QueryParser("content", ix.schema)  # ix.schema and schema are the same thing
print(len(parser.plugins), parser.plugins)  # 11
# [<whoosh.qparser.plugins.WhitespacePlugin>, <whoosh.qparser.plugins.WhitespacePlugin>, <whoosh.qparser.plugins.SingleQuotePlugin>,
#  <whoosh.qparser.plugins.FieldsPlugin>,     <whoosh.qparser.plugins.WildcardPlugin>,   <whoosh.qparser.plugins.PhrasePlugin>,
#  <whoosh.qparser.plugins.RangePlugin>,      <whoosh.qparser.plugins.GroupPlugin>,      <whoosh.qparser.plugins.OperatorsPlugin>,
#  <whoosh.qparser.plugins.BoostPlugin>,      <whoosh.qparser.plugins.EveryPlugin>]
## default_set(): Returns the default list of plugins to use.
print(len(parser.default_set()), parser.default_set())  # 10
# [<whoosh.qparser.plugins.WhitespacePlugin>, <whoosh.qparser.plugins.SingleQuotePlugin>, <whoosh.qparser.plugins.FieldsPlugin>,
#  <whoosh.qparser.plugins.WildcardPlugin>,   <whoosh.qparser.plugins.PhrasePlugin>,      <whoosh.qparser.plugins.RangePlugin>,
#  <whoosh.qparser.plugins.GroupPlugin>,      <whoosh.qparser.plugins.OperatorsPlugin>,   <whoosh.qparser.plugins.BoostPlugin>,
#  <whoosh.qparser.plugins.EveryPlugin>]
parser.remove_plugin_class(whoosh.qparser.plugins.WildcardPlugin)
print(len(parser.plugins), len(parser.default_set()))  # 10 10
parser.add_plugin(qparser.PrefixPlugin)
print(len(parser.plugins), len(parser.default_set()))  # 11 10
## parse(text, normalize=True, debug=False) Parses the input string and returns a :class:`whoosh.query.Query` object/tree.
query = parser.parse('document')
## search(q, **kwargs) Runs a :class:`whoosh.query.Query` object on this searcher and returns a :class:`Results` object.
# See :doc:`/searching` for more information.
results = searcher.search(query)  # search for "document" in the "content" field
print(results)  # <Top 1 Results for Term('content', 'document') runtime=0.0015511049998622184>
print(type(results))  # <class 'whoosh.searching.Results'>

## Query method 2: the two lines above are one way to do it; the single line below also works
## find(defaultfield, querystring, **kwargs)
results = searcher.find("title", "document")  # find documents whose title contains 'document'
print(results)  # <Top 2 Results for Term('title', 'document') runtime=0.0008875329999682435>
print(type(results))  # <class 'whoosh.searching.Results'>; same results as the first method above
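
The example above removes WildcardPlugin and adds PrefixPlugin but never parses a prefix query; a small sketch of what that enables, using the same parser and searcher:

# With PrefixPlugin active (and WildcardPlugin removed), a trailing *
# becomes a Prefix query instead of a Wildcard query.
query = parser.parse(u"doc*")
print(query)  # e.g. Prefix('content', 'doc')
results = searcher.search(query)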
Example No. 45
#     You can specify None for the schema to create a parser that does not analyze the text of the query, usually for testing purposes.
parser = QueryParser("content", ix.schema)  # ix.schema and schema are the same thing
print(len(parser.plugins), parser.plugins)  # 11
# [<whoosh.qparser.plugins.WhitespacePlugin>, <whoosh.qparser.plugins.WhitespacePlugin>, <whoosh.qparser.plugins.SingleQuotePlugin>,
#  <whoosh.qparser.plugins.FieldsPlugin>,     <whoosh.qparser.plugins.WildcardPlugin>,   <whoosh.qparser.plugins.PhrasePlugin>,
#  <whoosh.qparser.plugins.RangePlugin>,      <whoosh.qparser.plugins.GroupPlugin>,      <whoosh.qparser.plugins.OperatorsPlugin>,
#  <whoosh.qparser.plugins.BoostPlugin>,      <whoosh.qparser.plugins.EveryPlugin>]
## default_set(): Returns the default list of plugins to use.
print(len(parser.default_set()), parser.default_set())  # 10
# [<whoosh.qparser.plugins.WhitespacePlugin>, <whoosh.qparser.plugins.SingleQuotePlugin>, <whoosh.qparser.plugins.FieldsPlugin>,
#  <whoosh.qparser.plugins.WildcardPlugin>,   <whoosh.qparser.plugins.PhrasePlugin>,      <whoosh.qparser.plugins.RangePlugin>,
#  <whoosh.qparser.plugins.GroupPlugin>,      <whoosh.qparser.plugins.OperatorsPlugin>,   <whoosh.qparser.plugins.BoostPlugin>,
#  <whoosh.qparser.plugins.EveryPlugin>]
parser.remove_plugin_class(whoosh.qparser.plugins.WildcardPlugin)
print(len(parser.plugins), len(parser.default_set()))  # 10 10
parser.add_plugin(qparser.PrefixPlugin)
print(len(parser.plugins), len(parser.default_set()))  # 11 10
## parse(text, normalize=True, debug=False) Parses the input string and returns a :class:`whoosh.query.Query` object/tree.
query = parser.parse('document')
## search(q, **kwargs) Runs a :class:`whoosh.query.Query` object on this searcher and returns a :class:`Results` object.
# See :doc:`/searching` for more information.
results = searcher.search(query)  # search for "document" in the "content" field
print(
    results
)  # <Top 1 Results for Term('content', 'document') runtime=0.0015511049998622184>
print(type(results))  # <class 'whoosh.searching.Results'>

## Query method 2: the two lines above are one way to do it; the single line below also works
## find(defaultfield, querystring, **kwargs)
results = searcher.find("title", "document")  # find documents whose title contains 'document'
print(results)
Example No. 46
import datetime
import simplejson as json

from flask import Flask, request, render_template, Response

from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from whoosh.qparser.dateparse import DateParserPlugin

from search_index import TweetSchema

app = Flask(__name__)

search_index = open_dir("index")
parser = QueryParser("text", TweetSchema())
parser.add_plugin(DateParserPlugin())

class APIEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, (datetime.datetime, datetime.date)):
            return obj.isoformat()
        return json.JSONEncoder.default(self, obj)

def jsonify(**data):
    return Response(json.dumps(data, cls=APIEncoder), mimetype='application/json')

@app.route("/")
def index():
    return render_template("index.html")

@app.route("/search")
Example No. 47
def main():
    
    ix = open_dir("index")
    searcher = ix.searcher()
    
    

    docid=1
    if len(sys.argv)>=2:
        input=sys.argv[1]
    
    
    else:
        print 'You need to type a file name\nType python search.py filename'
        exit()
    

    queries=[]

    with open(input, 'r') as f:
        for line in f:
            queries.append(line)


    qp = QueryParser("desc", schema=ix.schema)

    qp.add_plugin(qparser.WildcardPlugin())
    qp.add_plugin(qparser.PrefixPlugin())
    qp.add_plugin(qparser.RegexPlugin())


    f= open('testresult.txt', 'w')


    for query in queries:
        myquery = qp.parse(query)
        
        
        searcher = ix.searcher()

        results = searcher.search(myquery, limit=400)

        print "\nYou are searching:"
        all_terms=list(myquery.iter_all_terms())
        print query.strip()
        print 'number of hits'
        print (len(results))


        if (len(results))==0:
            continue
        whooshresults =[int(x['index']) for x in results]

    # hits


        myresults=[x['index'] for x in results]

        vectorizer = TfidfVectorizer(encoding="latin-1", stop_words='english')

        corpus=[x['desc'] for x in results]

        dt=[x['dt'] for x in results]

        latlong=[x['latlong'] for x in results]


#pageranding part

        X = vectorizer.fit_transform(corpus)

        X= X.toarray()

        similarity=cosine_similarity(X,X)

        G = nx.Graph()
        for i in range(len(myresults)):
            G.add_node(myresults[i])



        for i in range(len(similarity)):
            for j in range(i+1,len(similarity[i])):
                
                delta=abs(dt[i]-dt[j])
                
                loci=latlong[i][1:-1].split(',')
                locj=latlong[j][1:-1].split(',')
                
                
                loci=(loci[0],loci[1])
                locj=(locj[0],locj[1])
                
                #if similarity[i][j]>=0.1:
                #    G.add_edge(myresults[i],myresults[j], weight=similarity[i][j])
                
                if similarity[i][j]>=0.18 and delta<datetime.timedelta(days=7) and vincenty(loci, locj).miles<100:
                    G.add_edge(myresults[i],myresults[j], weight=(similarity[i][j]))

        nxresult=sorted(nx.pagerank(G, alpha=0.85).items(), key=lambda x:-x[1])


        #query expansion

        keywords = [keyword for keyword, score in results.key_terms("desc", docs=30, numterms=(len(all_terms)+1))]



        #get expanded query

        keywords=[('desc', x) for x in keywords]
        newterms=[x for x in keywords if x not in all_terms]
        newterms=newterms+all_terms
        newterms=[x[0]+':'+x[1] for x in newterms]
        newterms=' '.join(newterms)

        #expanded query
        print 'Do you want search?'
        print newterms



        newquery = qp.parse(newterms)

        print newquery

        searcher = ix.searcher()
                    
        expansionresults = searcher.search(newquery, limit=1000)


        nxdesc=[]
        for j in nxresult[0:10]:
            doc=searcher.document(index=j[0])
            nxdesc.append(j[0]+'\t'+doc['desc'])





        printpagerank(nxresult)
        printtfidf(whooshresults)
        printquery_expansion(expansionresults, newquery)

        visualize(G,  nxresult)




        #writing output

        f.write('\n\n'+query)
        
        f.write("\n\nTop 10 query expansion results baseline result\n")
        f.write(('\n'.join([x['index']+'\t'+x['desc'] for x in results][0:10])).encode('utf8'))
        
        f.write("\n\nTop 10 ranking by nx pageranking\n")
        f.write(('\n'.join(nxdesc)).encode('utf8'))
        
        f.write("\n\nTop 10 query expansion results\n")
        f.write(('\n'.join([x['index']+'\t'+x['desc'] for x in expansionresults][0:10])).encode('utf8'))


    f.close()
Example No. 48
        abstract = TEXT
        authors = TEXT(stored=True)
        year = NUMERIC(stored=True)
        month = NUMERIC(stored=True)
        day = NUMERIC(stored=True)
        review = BOOLEAN(stored=True)
        journal = STORED
        volume = STORED
        pages = STORED
        
    ix = index.create_in(ABSTRACT_INDEX_PATH, Schema)


# query parser and searcher
parser = QueryParser('abstract',ix.schema)
parser.add_plugin(PhrasePlugin)
searcher = ix.searcher(weighting=BM25F)


# facet object for sorting abstracts by date (some have years but not dates)
datefacet = MultiFacet()
datefacet.add_field('year')
datefacet.add_field('month')
datefacet.add_field('day')


#Builds Query
def buildquery(keywords=None):
    # get keyword branch of query
    print "keywords (buildquery input) ==", keywords
    keywords = keywords.decode("utf-8")
Example No. 49
 def search(self, query):
     #self.message(u"Searching for: \"{}\".".format(query))
     parser = QueryParser(self.default_field, self.schema, group=OrGroup)
     parser.add_plugin(FuzzyTermPlugin())
     parsed_query = parser.parse(query)
     return self.searcher.search(parsed_query)
Example No. 50
def update_metabolites(db):
    """
    Find metabolites mentioned in new articles, and insert new records into the
    metabolite_abstract table in the database.
    
    (For each metabolite in the metabolite_info.txt file, search against the 
    temporary whoosh index containing only new articles.)
    """

    logger.debug('Scanning for metabolites')

    # Don't open the index until this enclosing function is called, because
    # we'll be deleting it and re-creating it in a previous state of the 
    # update process.
    ix = open_index(TEMP_METABOLITE_INDEX_PATH)
    cursor = getcursor(db)


    # query parser and searcher
    parser = QueryParser('abstract',ix.schema)
    parser.add_plugin(PhrasePlugin)
    searcher = ix.searcher(weighting=BM25F)


    #Get all common names so they don't repeat
    #outfile = open('metabolite2pubmed.txt','w') #mapping file
    common_name_set = set()
    with open('metabolite_info.txt')as f:
        for line in f:
            if line.startswith('HMDB'):
                synonym_line=f.next().strip()
                synonyms = synonym_line.split('\t')
                common_name = synonyms[0]
                #print(common_name)
                common_name_set.add(common_name)


    #search abstracts and write to metabolite2pubmed.txt
    with open('metabolite_info.txt') as f:
        for line in f:
            if line.startswith('HMDB'):
                #outfile.write(line) #Write ID to file (line 1)
                
                hmdb_id = line.strip()
                
                synonym_line = f.next().strip()
                #outfile.write(synonym_line)
                synonyms = synonym_line.split('\t')
                common_name = synonyms[0]
                printsyn = common_name + '\t'
                for s in synonyms:
                    if s in common_name_set and s != common_name:
                        synonyms.remove(s)
                        continue
                    if s == common_name:
                        continue
                    printsyn = printsyn + '\t' +s
                #outfile.write(printsyn+'\n') #Write synonyms to file (line 2)
                reference_line = f.next().strip()
                references = set(reference_line.split('\t'))
                if '\n' in references:
                    references.remove('\n')

                for name in synonyms:
                    query = '"' + name + '"' #performs complete query
                    results = get_abstracts(parser, searcher, query)  # searches with get_abstracts using "name" as the search keyword
                    for item in results:
                        references.add(str(item))


                rlist = list(references)
                
                insert_db_records(cursor, hmdb_id, rlist)
                
                #rline = '\t'.join(references) + '\n'
                #outfile.write(rline) #Write references to file (line 3)


    logger.info('updated metabolite-abstract links')
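
get_abstracts is called above but not shown. A minimal sketch of what it might do, assuming each indexed abstract stores its PubMed identifier in a pmid field:

def get_abstracts(parser, searcher, query):
    # Hypothetical helper: return the stored pmid of every abstract matching the query.
    q = parser.parse(unicode(query))
    results = searcher.search(q, limit=None)
    return [hit['pmid'] for hit in results]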
Example No. 51
def main(query: ("Query", 'option', 'q'), arg_sentence=None, ):
    # test_data = SENTENCES
    # test_data = get_test_data(config.TEST_DATA_CSV)
    if arg_sentence:
        test_data = [(arg_sentence, [])]
    else:
        test_data = [
            # ("Do you have something like the 2005 Zinfandel of Turley?".lower(), []),
            ("redd wine nappa chateau latoor", []),
            ("nappa valley", ['napa valley']),
            ("latour", ['chateau latour']),
            ("red chateu latour", ['red', 'chateau latour']),
            ("red", ['red']),
            ("red chateau lator", ['red', 'chateau latour']),
            ("cabernet sauvignon", ['cabernet sauvignon']),
            ("caubernet sauvignon", ['cabernet sauvignon']),
            ("cabernet savignon", ['cabernet sauvignon']),
            ("caubernet sauvignon", ['cabernet sauvignon']),
            ("how are yoou", []),
            ("chateu meru lator", ['merus', 'chateau latour']),
            ("chateau lator", ['chateau latour']),
            ("blak opul", ['black opal']),
            ("red caubernet sauvignon", ['red', 'cabernet sauvignon'])
        ]
    print()
    print()
    success = 0
    total = len(test_data)

    if query:
        with magia_search._searcher(weighting=scoring.TF_IDF()) as s:
            qp = QueryParser(TEXT_FIELD, schema=magia_search._schema)
            qp.add_plugin(FuzzyTermPlugin)
            q = qp.parse(query)
            magia_search.get_search_results(ix, s, q)
            sys.exit()

    failed = []
    for chunk, expected in test_data:
        orig_chunk = chunk
        print("Input chunk: {}".format(chunk))
        start_time = datetime.now()
        result = lookup_attributes(remove_stopwords(chunk))

        if sorted(result) == sorted(expected):
            success += 1
            cprint('Success', foreground="green", background="black")
        else:
            cprint('Fail', foreground="red", background="black")
            failed.append((chunk, result, expected))

        print('Completed in {}'.format(datetime.now() - start_time))
        print('Expected', expected)
        print('Got:', result)
        print('--------------')
        print()
    print("{}/{} tests passed. {}%".format(success, total, success * 100 // total))
    if failed:
        print()
        cprint('Failed', foreground="red", background="black")
        for chunk, result, expected in failed:
            print('*IN: {} *OUT: {} *EXPECTED: {}'.format(chunk, result, expected))
Example No. 52
def MultiFieldWordNetParser(fieldnames, schema, fieldboosts=None, expansion=1, **kwargs):
    p = QueryParser(None, schema, **kwargs)
    mfp = WordnetPlugin(fieldnames, fieldboosts=fieldboosts, expansion=expansion)
    p.add_plugin(mfp)
    return p
Example No. 53
for i in range(0,len(continents)):
	writer.add_document(city_name=capitalName[i],country_name=countryName[i],continent=continents[i],city_text=capitalText[i],country_text=countryText[i])
writer.commit()

###2
##find cities
with ix.searcher() as searcher:
	parser = QueryParser("city_text",ix.schema)
	#greek & roman -persian
	myquery = parser.parse('Greek AND Roman NOT Persian')
	results = searcher.search(myquery,limit=None)
	for result in results:
		print(result['city_name'])
	#shakespeare incl. misspelled
	parser.add_plugin(FuzzyTermPlugin())
	myquery = parser.parse(u'Shakespeare~3')
	results = searcher.search(myquery,limit=None)
	for result in results:
		print(result['city_name'])
	#located below sea level
	# parser.remove_plugin_class(PhrasePlugin)
	# parser.add_plugin(SequencePlugin())
	# myquery = parser.parse("located below sea level~10")
	# myquery = SpanNear.phrase("city_text",["located","below","sea","level"],slop=10)
	myquery = Phrase("city_text", [u"located", u"below", u"sea", u"level"], slop=10)
	results = searcher.search(myquery,limit=None)
	for result in results:
		print(result['city_name'])

###3
Exemplo n.º 54
0
from whoosh.index import create_in
from whoosh.fields import *
schema = Schema(title=TEXT(stored=True), content=TEXT)
ix = create_in("indexdir", schema)
writer = ix.writer()
writer.add_document(title=u"First document", content=u"This is the first document we've added!")
writer.add_document(title=u"Second document", content=u"The second one is even more interesting!")
writer.add_document(title=u"Third document", content=u"letter first, stamp second, mail third")
writer.add_document(title=u"Fourth document", content=u"stamp first, mail third")
writer.add_document(title=u"Fivth document", content=u"letter first,  mail third")
writer.add_document(title=u"Sixth document", content=u"letters first, stamps second, mail third")
writer.add_document(title=u"Seventh document", content=u"stamp first, letters second, mial third")
writer.commit()


from whoosh.qparser import QueryParser, FuzzyTermPlugin, PhrasePlugin, SequencePlugin
with ix.searcher() as searcher:
    parser = QueryParser(u"content", ix.schema)
    parser.add_plugin(FuzzyTermPlugin())
    parser.remove_plugin_class(PhrasePlugin)
    parser.add_plugin(SequencePlugin())
    query = parser.parse(u"Apple iphone 6")
    print query
    results = searcher.search(query)
    print "nb of results =", len(results)
    for r in results:
        print r
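    # The query above contains no "~" markers, so the fuzzy plugin is never used.
    # A query that does exercise it (sketch): "leter~" and "stap~" are each one edit
    # away from "letter" and "stamp" in the documents indexed above.
    fuzzy_query = parser.parse(u"leter~ AND stap~")
    print fuzzy_query
    for r in searcher.search(fuzzy_query):
        print r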
Exemplo n.º 55
0
def predict_TF_IDF(data, docs_per_q):
    # index docs
    exclude = set(string.punctuation)

    res = []

    for idx, row in data.iterrows():
        print row["id"]
        # get answers words
        w_A = set(utils.tokenize(row["answerA"]))
        w_B = set(utils.tokenize(row["answerB"]))
        w_C = set(utils.tokenize(row["answerC"]))
        w_D = set(utils.tokenize(row["answerD"]))

        sc_A = 0
        sc_B = 0
        sc_C = 0
        sc_D = 0

        q_punc = row["question"]  # first thing to debug if not working
        question = "".join(ch for ch in q_punc if ch not in exclude)
        qp = QueryParser("content", schema=schema, group=qparser.OrGroup)
        qp.add_plugin(qparser.FuzzyTermPlugin())
        qp.remove_plugin_class(qparser.PhrasePlugin)
        qp.add_plugin(qparser.SequencePlugin())
        q = qp.parse(unicode(question, "utf-8"))
        # q = qp.parse('physics')
        # cp = qparser.CompoundsPlugin( AndMaybe="&~")
        with ix.searcher() as s, ix.searcher(weighting=scoring.TF_IDF()) as scoring_searcher_tfidf:
            results = s.search(q, limit=docs_per_q)
            """
            u_id = unicode(uuid.uuid1())
            if not os.path.exists("/home/evan/Desktop/Kaggle/allen/glove/kaggle_allen/data/whoosh7/%s" % u_id):
                os.mkdir("/home/evan/Desktop/Kaggle/allen/glove/kaggle_allen/data/whoosh7/%s" % u_id)
            q_ix = index.create_in("/home/evan/Desktop/Kaggle/allen/glove/kaggle_allen/data/whoosh7/%s" % u_id, schema)
            q_writer = q_ix.writer()
            for document in results:
                q_writer.add_document(article_title=document['article_title'], content=document['content'])
            q_writer.commit()
            """
            # with q_ix.searcher(weighting=scoring.TF_IDF()) as scoring_searcher_tfidf
            for document in results:
                doc_parser = QueryParser("content", schema=schema)
                doc_q = doc_parser.parse(u"article_title:%s" % document["article_title"])
                for w in w_A:
                    try:
                        sc_A += (
                            scoring.TF_IDF()
                            .scorer(scoring_searcher_tfidf, "content", w)
                            .score(doc_q.matcher(scoring_searcher_tfidf))
                        )
                    except TermNotFound:
                        pass
                for w in w_B:
                    try:
                        sc_B += (
                            scoring.TF_IDF()
                            .scorer(scoring_searcher_tfidf, "content", w)
                            .score(doc_q.matcher(scoring_searcher_tfidf))
                        )
                    except TermNotFound:
                        pass
                for w in w_C:
                    try:
                        sc_C += (
                            scoring.TF_IDF()
                            .scorer(scoring_searcher_tfidf, "content", w)
                            .score(doc_q.matcher(scoring_searcher_tfidf))
                        )
                    except TermNotFound:
                        pass
                for w in w_D:
                    try:
                        sc_D += (
                            scoring.TF_IDF()
                            .scorer(scoring_searcher_tfidf, "content", w)
                            .score(doc_q.matcher(scoring_searcher_tfidf))
                        )
                    except TermNotFound:
                        pass

        res.append(["A", "B", "C", "D"][np.argmax([sc_A, sc_B, sc_C, sc_D])])

    return res
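# The four per-answer loops in predict_TF_IDF above are identical except for the word set
# they score. A more compact equivalent (sketch, reusing the names defined inside the
# function) collects the answers in a dict and scores them in one pass:
#
# answers = {key: set(utils.tokenize(row["answer" + key])) for key in "ABCD"}
# scores = dict.fromkeys("ABCD", 0.0)
# for key, words in answers.items():
#     for w in words:
#         try:
#             scores[key] += (scoring.TF_IDF()
#                             .scorer(scoring_searcher_tfidf, "content", w)
#                             .score(doc_q.matcher(scoring_searcher_tfidf)))
#         except TermNotFound:
#             pass
# res.append(max("ABCD", key=lambda k: scores[k]))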
Exemplo n.º 56
0
class Query(object):

    def __init__(self):
        self.ix = index.open_dir(config.index_file_path)
        self.ix2 = index.open_dir(config.index2_file_path)
        # Instantiate a query parser
        self.qp = QueryParser("content", self.ix.schema)

        # Add the DateParserPlugin to the parser
        self.qp.add_plugin(DateParserPlugin())
        self.qp.add_plugin(WildcardPlugin())
        self.qp.add_plugin(PrefixPlugin())
        self.qp.add_plugin(RegexPlugin())

    ## Parse the results returned by search into JSON-style data for front-end display
    def _results_todata(self, results):
        data = {}
        if isinstance(results, Results):
            data["total"] = results.estimated_length()
        elif isinstance(results, ResultsPage):
            data['total'] = results.total
        result_list = []
        for result in results:
            item = {}
            for key in result.keys():
                item[key] = result.get(key)
            import re
            match_class = re.compile('class="match term[0-9]"')
            item['description'] = match_class.sub(" ", str(result.highlights('content'))) \
                .replace(" ", "").replace("\r\n", "").replace("\n", "")
            item['description'] = self.truncate_description(
                item['description'])
            item['docnum'] = result.docnum
            result_list.append(item)
        data["results"] = result_list
        return data

    def _results_tohotdata(self, results):
        from datetime import datetime, timedelta
        now = datetime.now()
        daySeconds = 86400
        weekSeconds = daySeconds * 7
        monthSecond = daySeconds * 30
        data = {}
        if isinstance(results, Results):
            data["total"] = results.estimated_length()
        elif isinstance(results, ResultsPage):
            data['total'] = results.total
        result_list = []
        i = 0
        for result in results:
            i = i + 1
            item = {}
            for key in result.keys():
                item[key] = result.get(key)
            timespan = (now - item['publish_time']).total_seconds()
            if timespan > daySeconds:
                if timespan < weekSeconds:
                    item['hotScore'] = result.score * 1
                else:
                    item['hotScore'] = result.score * 0.5
            else:
                item['hotScore'] = result.score * 1.5
            import re
            match_class = re.compile('class="match term[0-9]"')
            item['description'] = match_class.sub(" ", str(result.highlights('content'))) \
                .replace(" ", "").replace("\r\n", "").replace("\n", "")
            item['description'] = self.truncate_description(
                item['description'])
            item['docnum'] = result.docnum
            result_list.append(item)
            if i == 100:
                result_list = sorted(result_list, key=lambda results: results['hotScore'])
        if i < 100:
            result_list = sorted(result_list, key=lambda results: results['hotScore'])
        data["results"] = result_list
        return data

    ## Search: returns one page of results per call
    def query_page(self, term, page_num, page_len, sort_type):

        with self.ix.searcher() as searcher:
            if sort_type == 1:  # default sorted
                results = searcher.search_page(self.qp.parse(
                    term), pagenum=page_num, pagelen=page_len,sortedby=ScoreFacet())
                #results2 = searcher.search_page(self.qp.parse(
                #    term), pagenum=page_num, pagelen=page_len, sortedby=ScoreAndTimeFacet())
                #self.generate_similarQuery(results,term)
            if sort_type == 2:  # sorted by custom hot value
                publish_time = FieldFacet("publish_time", reverse=True)
                results = searcher.search_page(self.qp.parse(
                    term), pagenum=page_num, pagelen=page_len, sortedby=publish_time)
            if sort_type == 3:  # sorted by time
                publish_time = FieldFacet("publish_time", reverse=True)
                results = searcher.search_page(self.qp.parse(
                    term), pagenum=page_num, pagelen=page_len, sortedby=ScoreAndTimeFacet())
            return self._results_todata(results), results.results.runtime

    ## Truncate the body text so the description does not get too long
    def truncate_description(self, description):
        """
        Truncate description to fit in result format.
        """
        if len(description) <= 160:
            return description
        cut_desc = description[:160]
        i = 160
        letter = description[i]
        length = len(description)
        while i < length - 1 and not (letter == ',' or letter == ',' or letter == '.' or letter == '。'):
            cut_desc += letter
            i = i + 1
            letter = description[i]
        cut_desc += letter
        # print(cut_desc)
        return cut_desc

    # Compute the TF-IDF score of a sentence
    def cal_TF_IDF(self,words,countKey):
        with self.ix.searcher(weighting=scoring.TF_IDF()) as searcher_tfidf:
            #words = list(jieba.cut(sentence))
            count = 0
            score = 0
            for word in words:
                #if word == u'的' or word == u'地' or word == u'和':
                #    continue
                count += 1
                try:
                    tf = searcher_tfidf.term_info('content', word).max_weight()
                except:
                    tf = 0.1
                score += searcher_tfidf.idf('content',word) * tf
            if count == 0:
                return 0
            else:
                return countKey * score / count
    
    
    def sentenceFind(self,sentence,terms):
        for term in terms:
            if sentence.find(term) != -1 :
                return 1
        return 0
    # From the collected candidate sentences, pick 5 high TF-IDF sentences that contain the query terms
    def generate_similarQuery(self, results, query_str):
        import re
        word_count = 0  # number of candidate sentences collected
        keywords = []
        items = []
        similarQuery = []
        terms = []
        keywords = re.split(" ", query_str)
        for keyword in keywords:
            temps = list(jieba.cut(keyword))
            for temp in temps:
                if len(temp) == 0 :
                    continue
                terms.append(temp)
        for result in results[0:9]:
            content_count = 0
            content = result.get('content')
            content = content.replace(" ", ",")
            sentences = re.split(r"[,|.|,|。|!|!|?|?|:|:|;|;|……|、]", content)
            for sentence in sentences:
                item = {}
                countKey = 0
                #for keyword in keywords:
                for term in terms:
                    #terms = jieba.cut(keyword)
                    #self.sentenceFind(sentence,terms)
                    #if sentence.find(keyword) != -1:
                    #if self.sentenceFind(sentence,keyword) != 0:
                    if sentence.find(term) != -1:
                        countKey += 1
                        continue
                if countKey == 0:
                    continue
                pattern = re.compile(r'[^\u4e00-\u9fa5]')
                sentence_cn = re.sub(pattern, '', sentence)
                words = list(jieba.cut(sentence_cn))
                if len(words) > 8 or len(words) < 3:
                    continue
                #score = self.cal_TF_IDF(re.sub(pattern, '', words))
                score = self.cal_TF_IDF(words,countKey)
                item['sentence'] = sentence
                item['score'] = score
                items.append(item)
                word_count += 1
                content_count += 1
                if content_count > 2:   # take at most 3 sentences from a single article
                    break
                if word_count >= 30:    # stop once 30 candidate sentences are collected
                    break
            if word_count >= 30:
                break
        #items = list(set(items))
        items.sort(key=lambda temp : temp['score'],reverse=True)

        count = 0
        
        #SentenceFilter = ""
        last_score = 0
        for item in items:
            if count >= 5:
                continue
            if last_score == item["score"]:
                continue
            similarQuery.append(item["sentence"])
            count += 1
            last_score = item['score']
        return similarQuery






    ## Generate a snippet based on the keywords
    def generate_snippet_from_keyword(self, content, keywords):
        content = content.replace(" ", "")
        import re
        sentences = re.split(r"[,|.|,|。|!|!|?|?]", content)
        snippet = ""
        count = 0
        for sentence in sentences:
            for keyword in keywords:
                if sentence.find(keyword) > 0:
                    # print(keyword, sentence)
                    snippet = snippet + "," + sentence
                    keywords.remove(keyword)
                    break
            if len(keywords) == 0:
                return snippet[1:] + "。"
        return snippet[1:] + "。"

    def get_hot_words(self):
        import re
        keywords = []
        searchitem = ''
        word = ''
        reader = self.ix2.reader()
        sentences = list(reader.field_terms('content'))
        for sentence in sentences:
            words = re.split(r"0xffff",sentence)
            for word in words:
                searchitem = searchitem + word + ' '
            searchitem = searchitem.strip()
            keywords.append(searchitem)
            searchitem = ''
        return keywords
    ## Recommend news based on hot keywords and generate a snippet for each result
    def recommend_news(self):
        data = {}
        total = 0
        result_list = []
        keywords = self.get_hot_words()
        data["results"] = result_list
        with self.ix.searcher() as searcher:
            for keyword in keywords:
                results = searcher.search(self.qp.parse(keyword), limit=1)
                # keywords = [keyword for keyword, score
                #             in results.key_terms("content", docs=10, numterms=5)]
                # print(keywords)
                item = {}
                for result in results:
                    total = total + 1
                    for key in result.keys():
                        item[key] = result.get(key)
                    item["keywords"] = [keyword[0] for keyword in searcher.key_terms([result.docnum], "content")]
                    item["snippet"] = self.generate_snippet_from_keyword(item['content'], item['keywords'])
                    print(item['snippet'])
                    result_list.append(item)
                    break
            data['total'] = total
            data['results'] = result_list
            return data

    def get_recommend_query(self, term):
        recom_query = []
        with self.ix.searcher() as searcher:
            results = searcher.search_page(
                self.qp.parse(term), pagenum=1, pagelen=10)
            recommends = self.generate_similarQuery(results, term)
            for recommend in recommends:
                item = {}
                item['term']  = recommend
                recom_query.append(item)
            # for result in results:
            #     #self.generate_similarQuery(results, term)
            #     item = {}
            #     item['term'] = result['title']
            #     recom_query.append(item)
        return recom_query

    def search_more_like_this(self, url, fieldname, top):
        with self.ix.searcher() as searcher:
            docnum = searcher.document_number(url=url)
            results = searcher.more_like(docnum, fieldname, text=None,
                                         top=top, numterms=5, model=Bo1Model,
                                         normalize=True, filter=None)

            return self._results_todata(results)
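# Minimal usage sketch of the Query class above (assumes the index directories configured
# in `config` exist and the documents carry "content", "publish_time" and "url" fields as
# used in the methods; the URL below is a placeholder):
#
# q = Query()
# data, runtime = q.query_page(u"keyword", page_num=1, page_len=10, sort_type=1)
# print(data["total"], runtime)
# related = q.search_more_like_this(url=u"http://example.com/article", fieldname="content", top=5)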
Exemplo n.º 57
0
def search(terms, limit=50, time_slice=None):

        
        big_tables = {}
        for i in cats:
            big_tables[i]=[]
            

        f = open("./search_results.html", "w+")
        master_str = "<!DOCTYPE html><html><style>hr {border: 4;width: 80%;}</style><title>Search Results [term(s): "+terms+"]</title><body><br>"
        ix = index.open_dir("cl_index", indexname="CL")
        w = ix.writer()
        qp = QueryParser("content", schema=w.schema)
        qp.add_plugin(DateParserPlugin())
        qp.add_plugin(GtLtPlugin())
        q = qp.parse(terms)
        
        with w.searcher() as s:
            results = s.search(q, limit=limit)
            if time_slice is not None:
                within = []
                start = int("".join(time_slice[0].split(":")))
                end = int("".join(time_slice[1].split(":")))
                if (0 <= start <= 2400) and (0 <= end <= 2400):
                    for res in results:
                        time = res["posted"]
                        if time.minute < 10:
                            t = int(str(time.hour)+"0"+ str(time.minute))
                        else:
                            t = int(str(time.hour)+ str(time.minute))
                        
                        if start < end and start <= t <=end:
                            within.append(res)
                        elif end < start and (start <= t or t <= end):
                            within.append(res)
                        else:
                            pass
                    
                    results = within
                else:
                    print "Invalid time slice, no results returned."
                    results = []
            print "%d search results" % len(results)
            print "--"*15
            for res in results:
                to_nums(res["liwc"], big_tables)
                master_str += to_html(res, True)
        
        master_str += "</body></html>"
        f.write(master_str)
        f.close()
        
        res_str = "<!DOCTYPE html><html><title>LIWC statistics for term(s): "+terms+"</title><body><br>"
        res_str += "<table><tr>"+("<th>Category&nbsp;</th><th>Average</th><th>Std Dev</th><th>Max&nbsp</th><th>Min&nbsp</th>"*3)+"</tr>"
        count = 0
        for_later = {}
        for j in big_tables.keys():
            vals = big_tables[j]
            outputs = []
            if len(vals) != 0:
                avg = sum(vals)/len(vals)
                outputs.append(round(avg,4))
                var = [(i-avg)**2 for i in vals]
                std = math.sqrt(sum(var)/len(var))
                outputs.append(round(std,4))
                outputs.append(round(max(vals),4))
                outputs.append(round(min(vals),4))
            else:
                outputs = ["NA","NA","NA","NA"]
            if count%3 == 0:
                res_str+= "<tr>"
            
            
            res_str += "<td>"+str(j)+"</td><td>"+str(outputs[0])+"</td><td>"+str(outputs[1])+"</td><td>"+str(outputs[2])+"</td><td>"+str(outputs[3])+"</td>"
            count +=1
            if count%3 == 0:
                res_str+= "</tr>"
            for_later[j] = outputs
        res_str+="</table>"
    
        if big_tables["WC"] == []:
            big_tables = ""
            res_str = "<!DOCTYPE html><html><title>LIWC statistics for term(s): "+terms+"</title><body><br>" 
            res_str += "<p>No matches found </p></body></html>"
        t = open("./search_averages.html", "w+")
        t.write(res_str)
        t.close()
        
        return res_str, for_later, master_str
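# Example invocation (sketch; assumes the "cl_index" directory and the LIWC categories in
# `cats` exist as used above, and that the search terms are illustrative). The time slice
# keeps only posts made overnight, between 22:00 and 06:00:
#
# stats_html, stats_by_cat, results_html = search("anxiety depression",
#                                                 limit=100,
#                                                 time_slice=("22:00", "06:00"))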