def get_not_processed_files(options):
    files = []

    # Files that have no file_url yet, or that are explicitly
    # flagged as not processed.
    filter = ~Q(file_url=Range(ANY, ANY)) | Q(status=FILE_STATUS_NOT_PROCESSED)

    if options.get("include_companies", None):
        filter = Q(ccvm=" ".join(
            options.get("include_companies", []))) & (filter)

    _logger.debug("Loading from database the files to be crawled...")

    paginator = CaravaggioSearchPaginator(
        query_string=str(filter), limit=1000, max_limit=1000).\
        models(BovespaCompanyFile).\
        select("ccvm", "doc_type", "fiscal_date", "version")

    while paginator.has_next():
        _logger.debug(
            "{0}/{1} files loaded from database...".format(
                paginator.get_loaded_docs(), paginator.get_hits()))
        paginator.next()
        files.extend([(d.ccvm, d.doc_type, d.fiscal_date, d.version)
                      for d in paginator.get_results()])

    # return [file for file in
    #         CaravaggioSearchQuerySet().models(BovespaCompanyFile).
    #         raw_search(str(filter)).
    #         values_list("ccvm", "doc_type", "fiscal_date", "version")]
    return files

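# For orientation: a hedged sketch of what the composed filter above serializes
# to. The FILE_STATUS_NOT_PROCESSED value is assumed here for illustration, and
# the exact parenthesization and escaping are up to solrq.
from solrq import ANY, Q, Range

FILE_STATUS_NOT_PROCESSED = "not_processed"  # assumed constant value

pending = ~Q(file_url=Range(ANY, ANY)) | Q(status=FILE_STATUS_NOT_PROCESSED)
print(pending)  # roughly: ((!file_url:[* TO *]) OR status:not_processed)
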
def load_accounts(valid_account_types):
    key = "command:gen_accountability_plan:load_accounts"

    # Get all the company accounts for the given
    # company and fiscal_date
    filter = Q(value=Range(0, "*", safe=True, boundaries="exclusive"))

    balance_type_filter = None
    for balance_type in valid_account_types:
        if balance_type_filter is None:
            balance_type_filter = Q(balance_type=balance_type)
        else:
            balance_type_filter |= Q(balance_type=balance_type)

    filter = filter & (balance_type_filter)

    # Group results by account number, keeping only the top document per group.
    paginator = CaravaggioSearchPaginator(
        str(filter),
        limit=5000,
        **{"group": "true",
           "group.field": "number_exact",
           "group.limit": 1},
        useFieldCache=True,
    ).models(BovespaAccount)

    accounts = {}
    while paginator.has_next():
        _logger.debug("{} accounts loaded from database...".format(
            paginator.get_loaded_docs()))
        paginator.next()
        for acc_number, details in paginator.get_results().items():
            accounts[acc_number] = (details[0].balance_type, details[0].name)

    return accounts

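# Side note on the amount filter above: with boundaries="exclusive", solrq
# should render the range with curly braces, so only strictly positive values
# match. A small illustration (output hedged; check your solrq version):
from solrq import Q, Range

print(Q(value=Range(0, "*", safe=True, boundaries="exclusive")))
# roughly: value:{0 TO *}
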
def get_not_processed_files(options, producer):
    filter = ~Q(file_url=Range(ANY, ANY)) | Q(status=FILE_STATUS_NOT_PROCESSED)

    if options.get("include_companies", None):
        filter = Q(ccvm=Value("({})".format(" ".join(
            options.get("include_companies", []))), safe=True)) & (filter)

    _logger.debug("Loading from database the files to be crawled...")

    paginator = (CaravaggioSearchPaginator(
        query_string=str(filter), limit=1000, max_limit=1000)
        .models(BovespaCompanyFile)
        .select("ccvm", "doc_type", "fiscal_date", "version"))

    while paginator.has_next():
        _logger.debug("{0}/{1} files loaded from database...".format(
            paginator.get_loaded_docs(), paginator.get_hits()))
        paginator.next()
        for d in paginator.get_results():
            params = {
                "ccvm": d.ccvm,
                "doc_type": d.doc_type,
                "fiscal_date": d.fiscal_date,
                "version": d.version,
            }
            producer.add_crawl_params(params, options)

def where(self, q, **kwargs):
    """Adds a conjunctive filter to a query.

    :param q: string or `solrq.Q` object
    :param kwargs: Arguments to construct a `solrq.Q` with
    :return: QueryBuilder object
    :rtype: :py:class:`QueryBuilder`
    """
    if isinstance(q, string_types):
        if self._raw_query is None:
            self._raw_query = []
        self._raw_query.append(q)
    elif isinstance(q, Q) or kwargs:
        if self._query is not None:
            raise ApiError(
                "Use .and_() or .or_() for an extant solrq.Q object")
        if kwargs:
            self._process_guid = self._process_guid or kwargs.get(
                "process_guid")
            q = Q(**kwargs)
        self._query = q
    else:
        raise ApiError(".where() only accepts strings or solrq.Q objects")
    return self

def __init__(self, **kwargs):
    if kwargs:
        self._query = Q(**kwargs)
    else:
        self._query = None
    self._raw_query = None
    self._process_guid = None

def load_accounts(ccvm, period):
    key = "{ccvm}_{period:%Y%m%d}".format(ccvm=ccvm, period=period)

    period_data = accounts_data_cache.get(key, None)
    if not period_data:
        period_data = {}

        # Get all the company accounts for the given
        # company and fiscal_date
        filter = Q(period=period) & Q(ccvm=ccvm)

        _logger.debug(
            "Loading from database the accounts for {} - {}...".format(
                ccvm, str(filter)))

        paginator = (CaravaggioSearchPaginator(
            query_string=str(filter),
            sort="version_exact asc, number_exact asc",
            limit=5000, max_limit=5000)
            .models(BovespaAccount)
            .select("version", "number", "name", "financial_info_type",
                    "balance_type", "comments", "amount"))

        while paginator.has_next():
            _logger.debug("{0}/{1} accounts loaded from database...".format(
                paginator.get_loaded_docs(), paginator.get_hits()))
            paginator.next()
            for d in paginator.get_results():
                _logger.debug("Raw Account: {}".format(d))
                balance_type_accounts = period_data.setdefault(
                    d.balance_type, {})
                financial_type_accounts = balance_type_accounts.setdefault(
                    d.financial_info_type, {})
                financial_type_accounts[d.number] = {
                    "number": d.number,
                    "name": d.name,
                    "comments": d.comments,
                    "financial_info_type": d.financial_info_type,
                    "balance_type": d.balance_type,
                    "amount": float(d.amount),
                }

        accounts_data_cache[key] = period_data

    return period_data

def buildQuery():
    """
    `faculteit` can occur multiple times, hence MultiDict.getlist; see:
    https://werkzeug.palletsprojects.com/en/1.0.x/datastructures/#werkzeug.datastructures.MultiDict

    NB: not implemented for now; a sketch of the multi-value handling follows
    below.

    This does not work (a duplicate keyword argument is a SyntaxError):
        return Q(text="amsterdam", type="master", faculteit="FEB", faculteit="FMG")
    This does work:
        return Q(text="amsterdam", type="master", faculteit="FEB") & Q(faculteit="FMG")
    """
    args = request.args  # renamed from `dict` to avoid shadowing the builtin
    args.getlist('faculteit')  # result deliberately unused for now, see above
    return Q(**args)

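# A minimal sketch of what the docstring above describes: folding the repeated
# `faculteit` values from MultiDict.getlist() into a single solrq.Q. The OR
# semantics and the `reduce` helper are assumptions, not part of the original
# code.
from functools import reduce
import operator

from solrq import Q


def build_query_with_multivalues(args):
    # Scalar params go straight into one Q (no duplicate kwargs possible).
    single = {k: v for k, v in args.items() if k != 'faculteit'}
    query = Q(**single) if single else None

    # Repeated params are combined as alternatives:
    # faculteit:FEB OR faculteit:FMG.
    faculties = args.getlist('faculteit')
    if faculties:
        faculteit_q = reduce(operator.or_, (Q(faculteit=f) for f in faculties))
        query = faculteit_q if query is None else query & faculteit_q
    return query
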
def has_files_to_be_processed(ccvm):
    filter = ~Q(file_url=Range(ANY, ANY)) | Q(status=FILE_STATUS_NOT_PROCESSED)
    filter = Q(ccvm=ccvm) & (filter)

    _logger.debug("Loading from database the files to be crawled...")

    paginator = (CaravaggioSearchPaginator(
        query_string=str(filter), limit=1, max_limit=1)
        .models(BovespaCompanyFile)
        .select("ccvm", "doc_type", "fiscal_date", "version"))

    if paginator.has_next():
        paginator.next()
        if paginator.get_hits() > 0:
            _logger.debug(
                "Company {0} HAS {1} FILES PENDING to be processed...".format(
                    ccvm, paginator.get_hits()))
            return True

    _logger.debug(
        "Company {0} has no files pending to be processed...".format(ccvm))
    return False

def specialisedDeeperSearch(queryIndex):
    results = []
    # print('\n SPECIALIZED DEEPER SEARCH')

    # Deeper NLP pipeline search
    for i in range(len(queryIndex)):
        text = queryIndex[i]['text']
        tokens = queryIndex[i]['tokens']
        stems = queryIndex[i]['stem']
        lemmas = queryIndex[i]['lemma']
        posTags = queryIndex[i]['posTag']
        nounPhrases = queryIndex[i]['nounPhrases']
        hypernyms = queryIndex[i]['hypernym']
        hyponyms = queryIndex[i]['hyponym']
        meronyms = queryIndex[i]['meronym']
        holonyms = queryIndex[i]['holonym']

        results.append(
            solr2.search(
                (Q(text=text) ^ 0.5) & (Q(stem=stems) ^ 0.5) &
                (Q(lemma=lemmas) ^ 4) & (Q(posTag=posTags) ^ 0.02) &
                (Q(nounPhrases=nounPhrases) ^ 5) &
                (Q(hypernym=hypernyms) ^ 2) & (Q(hyponym=hyponyms) ^ 0.5) &
                (Q(meronym=meronyms) ^ 0.5) & (Q(holonym=holonyms) ^ 0.5),
                sort='score desc', score=True, fl='*,score', rows=1))

    for i in range(len(results)):
        # print('\n-----------------------------------------------------------------------------------------------\n')
        # print("Saw {0} result(s).".format(len(results[i])), '\n')
        # print('Input sentence', i + 1, ': ', input[i], '\n')
        for result in results[i]:
            # print("The ID is '{0}'.".format(result['id']))
            # print("The Sentence is '{0}'.".format(result['text']))
            # print("The Score is '{0}'.".format(result['score']))
            # print('\n')
            print(result['text'])

    # Returns the text of the last matched result (assumes at least one hit).
    return result['text']

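# The `^` weights above are solrq's boost operator: `Q(field=value) ^ n` maps
# to a Solr term boost. A small illustration; the field names below are
# illustrative and the exact rendered string may vary between solrq versions.
from solrq import Q

weighted = (Q(lemma="run") ^ 4) & (Q(posTag="VB") ^ 0.02)
print(weighted)  # roughly: ((lemma:run^4) AND (posTag:VB^0.02))
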
def not_(self, q, **kwargs):
    """Adds a negative filter to a query.

    :param q: `solrq.Q` object
    :param kwargs: Arguments to construct a `solrq.Q` with
    :return: QueryBuilder object
    :rtype: :py:class:`QueryBuilder`
    """
    if kwargs:
        q = ~Q(**kwargs)

    if isinstance(q, Q):
        if self._query is None:
            self._query = q
        else:
            self._query = self._query & q
    else:
        raise ApiError(".not_() only accepts solrq.Q objects")
    return self  # matches the docstring and the other builder methods

def deeperSearch(queryIndex):
    results = []
    print('\n DEEPER SEARCH')

    # Deeper NLP pipeline search
    for i in range(len(queryIndex)):
        tokens = queryIndex[i]['tokens']
        stems = queryIndex[i]['stem']
        lemmas = queryIndex[i]['lemma']
        posTags = queryIndex[i]['posTag']
        nounPhrases = queryIndex[i]['nounPhrases']
        hypernyms = queryIndex[i]['hypernym']
        hyponyms = queryIndex[i]['hyponym']
        meronyms = queryIndex[i]['meronym']
        holonyms = queryIndex[i]['holonym']

        results.append(
            solr2.search(
                Q(tokens=tokens) & Q(stem=stems) & Q(lemma=lemmas) &
                Q(posTag=posTags) & Q(nounPhrases=nounPhrases) &
                Q(hypernym=hypernyms) & Q(hyponym=hyponyms) &
                Q(meronym=meronyms) & Q(holonym=holonyms),
                sort='score desc', score=True, fl='*,score'))

    for i in range(len(results)):
        print('\n-----------------------------------------------------------------------------------------------\n')
        print("Saw {0} result(s).".format(len(results[i])), '\n')
        print('Input sentence', i + 1, ': ', input[i], '\n')
        for result in results[i]:
            print("The ID is '{0}'.".format(result['id']))
            print("The Sentence is '{0}'.".format(result['text']))
            print("The Score is '{0}'.".format(result['score']))
            print('\n')

def and_(self, q, **kwargs):
    """Adds a conjunctive filter to a query.

    :param q: string or `solrq.Q` object
    :param kwargs: Arguments to construct a `solrq.Q` with
    :return: QueryBuilder object
    :rtype: :py:class:`QueryBuilder`
    """
    if isinstance(q, string_types):
        self.where(q)
    elif isinstance(q, Q) or kwargs:
        if kwargs:
            q = Q(**kwargs)
        if self._query is None:
            self._query = q
        else:
            self._query = self._query & q
    else:
        raise ApiError(".and_() only accepts strings or solrq.Q objects")
    return self

def or_(self, q, **kwargs):
    """Adds a disjunctive filter to a query.

    :param q: `solrq.Q` object
    :param kwargs: Arguments to construct a `solrq.Q` with
    :return: QueryBuilder object
    :rtype: :py:class:`QueryBuilder`
    """
    if kwargs:
        self._process_guid = self._process_guid or kwargs.get("process_guid")
        q = Q(**kwargs)

    if isinstance(q, Q):
        if self._query is None:
            self._query = q
        else:
            self._query = self._query | q
    else:
        raise ApiError(".or_() only accepts solrq.Q objects")
    return self

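# Taken together, __init__ / where / and_ / or_ / not_ form a small fluent
# builder over solrq.Q. A hedged usage sketch: the field names are
# illustrative, and the final query string would come from however the class
# renders self._query and self._raw_query.
qb = (QueryBuilder(process_name="chrome.exe")
      .and_(device_os="WINDOWS")
      .or_(process_name="firefox.exe")
      .not_(process_username="SYSTEM"))
print(qb._query)  # accumulated solrq.Q tree; raw string clauses sit in _raw_query
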
def field():
    rows = int(request.json.get('rows'))
    word_similar = request.json.get('word_similar')
    topic = request.json.get('topic')
    title = request.json.get('title')
    description = request.json.get('description')
    content = request.json.get('content')
    author = request.json.get('author')
    publish_date = request.json.get('publish_date')

    # Tokenize and, optionally, expand with similar words
    topic = ViTokenizer.tokenize(topic.strip()) if topic else ''
    author = author.strip().replace(' ', '_') \
        if (author and author.strip()) else ''
    publish_date = publish_date.strip() if publish_date else ''

    if word_similar:
        title = ws.find_word_similar(title.strip()) if title else ''
        description = ws.find_word_similar(
            description.strip()) if description else ''
        content = ws.find_word_similar(content.strip()) if content else ''
    else:
        title = ViTokenizer.tokenize(title.strip()) if title else ''
        description = ViTokenizer.tokenize(
            description.strip()) if description else ''
        content = ViTokenizer.tokenize(content.strip()) if content else ''

    # Convert to solrq.Q; empty fields fall back to a match-anything wildcard
    topic_q = Q(topic=topic) if topic else Q(topic="*")
    title_q = Q(title=title) if title else Q(title="*")
    description_q = Q(description=description) if description \
        else Q(description="*")
    content_q = Q(content=content) if content else Q(content="*")
    author_q = Q(author=author) if author else Q(author="*")
    publish_date_q = Q(publish_date=publish_date) if publish_date \
        else Q(publish_date="*")

    query = topic_q & title_q & author_q & description_q & content_q \
        & publish_date_q
    # Strip solrq's escaping and grouping so edismax sees plain terms
    query_q = str(query).replace('\\', '').replace('(', '').replace(')', '')
    print(query_q)

    result = solr.search(
        query_q,
        **{
            'rows': rows,
            'hl': 'true',
            'hl.method': 'original',
            'hl.simple.pre': '<mark style="background-color:#ffff0070;">',
            'hl.simple.post': '</mark>',
            'hl.highlightMultiTerm': 'true',
            'hl.fragsize': 100,
            'defType': 'edismax',
            'fl': '*, score',
            # 'bq': '{!func}linear(clicked, 0.01 ,0.0 )',
            'mm': 1,
            'ps': 3,
            'pf': 'topic^1 title^1 content^1 author^1 description^1 publish_date^1',
            'qf': 'topic^1 title^1 content^1 author^1 description^1 publish_date^1',
        })

    highlight = []
    for i in result.highlighting.values():
        highlight.append(i)
    # for i in highlight:
    #     print(i)

    return jsonify(results=list(result), highlight=highlight)

with open(stopWordsFile) as f:
    stopwords = f.read().splitlines()

cleanQuestionWords = []
for word in words:
    if word.lower() not in stopwords and word:
        cleanQuestionWords.append(word)
cleanWordsStr = ' '.join(cleanQuestionWords)

from solrq import Q, Proximity
import pysolr

solr = pysolr.Solr(solrURL)
# res = solr.search(cleanWordsStr.replace(' ', '+'))
res = solr.search(Q(text=Proximity(cleanWordsStr, 10)))
# print(len(res), ' results found')

# Just loop over it to access the results.
fout = open('wiki.result', 'w')
for r in res:
    # print("Reading wiki article : '{0}'.".format(r['title']))
    articleBody = r['text'].replace('[[', '').replace(']]', '')
    # remove_tags is an HTML-stripping helper defined elsewhere in the script
    fout.write(remove_tags(articleBody))
    break
fout.close()

log = headword.lower()

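# `Proximity` wraps the cleaned words in a Solr proximity clause: all terms
# must occur within the given distance of one another. A quick illustration,
# following solrq's documented rendering:
from solrq import Q, Proximity

print(Q(text=Proximity('solar eclipse', 10)))  # -> text:"solar eclipse"~10
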
import pysolr
import json
from solrq import Q
from pyvi import ViTokenizer

# Setup a Solr instance. The timeout is optional.
solr = pysolr.Solr('http://localhost:8983/solr/bkcv',
                   always_commit=True, timeout=100)

# general_text = ''
title = ViTokenizer.tokenize("Ba ôtô dàn hàng ngang vượt đèn đỏ")
# print(title)

# query = Q(title="\"{}\"".format(title)) ^ 2 \
#     | Q(description="Hải Phòng Khi tín hiệu đèn đỏ còn ở giây") ^ 1
query = Q(content="\"Nước Anh chính_thức phong_tỏa\"")
print(query)

results = solr.search(
    str(query).replace("\\", ""),
    **{
        'rows': 100000,
        'hl': 'true',
        'hl.method': 'original',
        'hl.simple.pre': '<mark style="background-color:#ffff0070;">',
        'hl.simple.post': '</mark>',
        'hl.highlightMultiTerm': 'true',
        'hl.fragsize': 100,
        'defType': 'edismax',
        'fl': '*, score',
        # 'bq': '{!func}linear(clicked, 0.01 ,0.0 )',
        # 'bq': '{!func}log(linear(clicked, 20 ,0.0 ))',
    })

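# The `.replace("\\", "")` above strips the escaping solrq applies to the
# quote characters. An alternative sketch, assuming the same phrase query:
# solrq's `Value` wrapper with safe=True skips escaping entirely, so no
# post-processing is needed.
from solrq import Q, Value

query = Q(content=Value('"Nước Anh chính_thức phong_tỏa"', safe=True))
print(query)  # quotes survive unescaped
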
def get_documents_in_node(
    node: SophoraNode,
    *,
    document_type: Optional[str] = None,
    sort_field: Optional[str] = "modificationDate_dt",
    sort_order: Optional[str] = "desc",
    max_age: Union[dt.timedelta, dt.datetime, None] = None,
    force_exact: bool = False,
) -> Generator[Dict, None, None]:
    """Request all Sophora documents in a specific node.

    Args:
        node (SophoraNode): Sophora node to request data for
        document_type (Optional[str], optional): Restrict results to this
            Sophora document type. Defaults to None.
        sort_field (Optional[str], optional): Solr field to sort by.
            Defaults to "modificationDate_dt".
        sort_order (Optional[str], optional): Sort direction.
            Defaults to "desc".
        max_age (Union[dt.timedelta, dt.datetime, None], optional): Only
            return documents modified after this age or date. Defaults to
            None.
        force_exact (bool, optional): If true, forces ``EXACT`` matching type
            instead of ``STARTS`` for the Sophora node, even if
            ``node.use_exact_search`` is ``False``. Defaults to False.

    Yields:
        Generator[Dict, None, None]: The parsed JSON of individual Sophora
            documents as retrieved from the API.
    """
    node_str = node.node
    use_exact = force_exact or node.use_exact_search
    if not use_exact:
        node_str = node_str + "/"

    params = {
        "structureNodePath": node_str,
    }

    if document_type is not None:
        params["documentType"] = document_type

    if sort_field is not None:
        params["sortField"] = sort_field

    if sort_order is not None:
        params["sortOrder"] = sort_order

    if max_age is not None:
        if isinstance(max_age, dt.timedelta) and max_age.total_seconds() > 0:
            max_age = -max_age
        elif isinstance(max_age, dt.datetime):
            max_age = max_age.astimezone(UTC)

        # Encode filter query with solrq
        params["filterQueries"] = Q(
            modificationDate_dt=Range(max_age, dt.timedelta()))

    url = _sophora_api_url(
        "getDocumentsByStructureNodePath",
        "EXACT" if use_exact else "STARTS",
        "1",
        "20",
    )
    logger.info("Paging through URL {}", url)

    while True:
        response = requests.get(url, params=params)
        response.raise_for_status()
        logger.debug(response.request.url)
        response_data = response.json()
        yield from response_data["data"]

        if response_data["moreLink"] is None:
            break
        else:
            url = response_data["moreLink"]["moreUrl"]
            # Remove badly unescaped query from URL
            # (stdlib ParseResult: _replace + geturl rebuild the cleaned URL)
            parsed = urlparse(url)
            url = parsed._replace(query="", fragment="").geturl()
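# The max_age branch above leans on solrq translating datetime/timedelta
# values into Solr's NOW-relative date-range syntax. A small illustration;
# the exact serialized string depends on solrq's version.
import datetime as dt

from solrq import Q, Range

fq = Q(modificationDate_dt=Range(-dt.timedelta(days=1), dt.timedelta()))
print(fq)  # something like: modificationDate_dt:[NOW-1DAYS... TO NOW...]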