def sparql_query(fallback: bool = False):
    """Query the senses and glosses of the lexeme ``self.lexeme_id``.

    ``self`` and ``usage_example`` are not parameters of this function; they
    are resolved from the enclosing scope.

    Args:
        fallback: When true, only English (``"en"``) glosses are requested.
            Otherwise the glosses are filtered on
            ``usage_example.record.language_code.value``.

    Returns:
        Whatever ``execute_sparql_query`` returns for the query.
    """
    # Fall back to English as gloss language when asked to; the record's
    # language code is only evaluated when it is actually needed.
    if fallback:
        gloss_language = "en"
    else:
        gloss_language = usage_example.record.language_code.value
    # NOTE: the in-query comment says "swedish", but the filter uses the
    # language selected above. The query text is otherwise unchanged.
    return execute_sparql_query(f'''
    SELECT ?sense ?gloss
    WHERE {{
      VALUES ?l {{wd:{self.lexeme_id}}}.
      ?l ontolex:sense ?sense.
      ?sense skos:definition ?gloss.
      # Get only the swedish gloss, exclude otherwise
      FILTER(LANG(?gloss) = "{gloss_language}")
      # Exclude lexemes without a linked QID from at least one sense
      # ?sense wdt:P5137 [].
    }}''')
def fetch_forms_without_an_example(self):
    """Fetch a random page of forms and store them on the instance.

    Builds a query for forms of lexemes in ``self.language_qid`` that have at
    least one sense with P5137, runs it with a random offset, wraps every
    returned binding in ``Form`` and stores the list in
    ``self.forms_without_an_example``.

    Raises:
        ValueError: If the response has no ``"results"`` key, or no
            ``"bindings"`` key below it.

    Calls ``exit()`` after printing a message when no forms were returned.
    """
    log = logging.getLogger(__name__)
    # title:Forms that have no example demonstrating them and that have at
    # least one sense with P5137 (item for this sense)
    offset = random.randint(20, 1000)
    log.info(f"random offset:{offset}")
    # NOTE(review): the MINUS block shares only ?lexeme with the outer
    # pattern (?form_with_example is a separate variable), so it looks like
    # it filters per lexeme rather than per form - confirm this is intended.
    query = f'''
    select ?lexeme ?form ?form_representation ?category
    (group_concat(distinct ?feature; separator = ",") as ?grammatical_features)
    WHERE {{
      ?lexeme dct:language wd:{self.language_qid.value};
              wikibase:lemma ?lemma;
              wikibase:lexicalCategory ?category;
              ontolex:lexicalForm ?form;
              ontolex:sense ?sense.
      ?sense wdt:P5137 [].
      ?form ontolex:representation ?form_representation;
            wikibase:grammaticalFeature ?feature.
      MINUS {{
        ?lexeme p:P5831 ?statement.
        ?statement ps:P5831 ?example;
                   pq:P6072 [];
                   pq:P5830 ?form_with_example.
      }}
    }}
    group by ?lexeme ?form ?form_representation ?category
    offset {offset}
    limit {config.number_of_forms_to_fetch}'''
    response = execute_sparql_query(query, debug=False)

    # Reset the list before validating, so a failed fetch leaves it empty.
    self.forms_without_an_example = []
    if "results" not in response:
        raise ValueError("Got no results dict from WD")
    if "bindings" not in response["results"]:
        raise ValueError("Got no bindings dict from WD")

    bindings = response["results"]['bindings']
    log.info(f"Got {len(bindings)} lexemes")
    for binding in bindings:
        form = Form(binding)
        log.info(f"appending {form} to list of forms")
        self.forms_without_an_example.append(form)

    if self.forms_without_an_example:
        log.info(
            f"Got {len(self.forms_without_an_example)} "
            f"forms from WDQS for language {self.language_code.name.title()}"
        )
    else:
        console.print(
            "Got no forms from Wikidata to work on for this language "
            "if you think this is a bug, please open an issue here "
            f"{tui.issue_url()}")
        exit()
def calculate_total_lexemes(self):
    """Count all lexemes in Wikidata and store the number on the instance.

    Runs a COUNT query over every ``ontolex:LexicalEntry``, extracts the
    number with ``wdqs.extract_count`` and assigns it to
    ``self.total_lexemes``. Nothing is returned.
    """
    query = f'''
    SELECT
    (COUNT(?l) as ?count)
    WHERE {{
      ?l a ontolex:LexicalEntry.
    }}'''
    total: int = wdqs.extract_count(execute_sparql_query(query))
    logging.debug(f"count:{total}")
    self.total_lexemes = total
def count_number_of_lexemes(self) -> int:
    """Return the number of lexemes whose language is ``self.language_qid``.

    Runs a COUNT query over every ``?l`` with
    ``dct:language wd:<self.language_qid.value>`` and extracts the number with
    ``wdqs.extract_count``.

    Returns:
        The count extracted from the query result.
    """
    logger = logging.getLogger(__name__)
    result = execute_sparql_query(f'''
    SELECT
    (COUNT(?l) as ?count)
    WHERE {{
      ?l dct:language wd:{self.language_qid.value}.
    }}''')
    logger.debug(f"result:{result}")
    count: int = wdqs.extract_count(result)
    # Log through the module logger created above instead of the root-level
    # logging.debug(), which would bypass it and may configure the root
    # logger implicitly.
    logger.debug(f"count:{count}")
    return count
def lookup_qid(self):
    """Look up the QID, if any, for this document id.

    Queries for an item whose P8433 value equals ``self.id`` and stores the
    first ``item`` value of the result set, as extracted by
    ``extract_the_first_wikibase_value_from_a_wdqs_result_set``, in
    ``self.document_qid``.
    """
    query = f"""
        SELECT ?item
        WHERE
        {{
          ?item wdt:P8433 "{self.id}".
        }}
        """
    response = execute_sparql_query(query)
    logging.info(f"result:{response}")
    qid = extract_the_first_wikibase_value_from_a_wdqs_result_set(
        response, "item")
    self.document_qid = qid
    logging.info(f"document_qid:{self.document_qid}")
def count_number_of_senses_with_P5137(self) -> int:
    """Return how many senses of the lexeme ``self.id`` have a P5137 value.

    Only senses that have at least one gloss (``skos:definition``) and at
    least one P5137 statement are matched.

    Returns:
        The count extracted from the query result by ``wdqs.extract_count``.
    """
    # COUNT(DISTINCT ...) is required: the pattern yields one row per
    # sense/gloss/P5137-value combination, so a plain COUNT(?sense) counts a
    # sense once for every gloss language and every linked item.
    result = execute_sparql_query(f'''
    SELECT
    (COUNT(DISTINCT ?sense) as ?count)
    WHERE {{
      VALUES ?l {{wd:{self.id}}}.
      ?l ontolex:sense ?sense.
      ?sense skos:definition ?gloss.
      # Exclude lexemes without a linked QID from at least one sense
      ?sense wdt:P5137 [].
    }}''')
    count: int = wdqs.extract_count(result)
    logging.debug(f"count:{count}")
    return count
def get_records(
        form: Form = None,
        lexemes: Lexemes = None
) -> List[UsageExample]:
    """Search Wikisource for usage examples of a form.

    The search runs through the ``wikibase:mwapi`` SPARQL service against the
    Wikisource of ``lexemes.language_code``. Every returned binding is wrapped
    in a ``WikisourceRecord`` and the records are handed to
    ``process_records``.

    Args:
        form: The form whose ``representation`` is used as the search string.
        lexemes: Supplies the language code used for the endpoint, the search
            language and the result limit.

    Returns:
        The result of ``process_records`` for the fetched records.

    Raises:
        ValueError: If ``form`` or ``lexemes`` is ``None``.
    """
    logger = logging.getLogger(__name__)
    if form is None:
        raise ValueError("form was None")
    if lexemes is None:
        raise ValueError("lexemes was None")
    # Languages with fast NLP support are allowed a different result limit.
    if lexemes.language_code in config.fast_nlp_languages:
        limit = config.wikisource_max_results_size_fast_nlp
    else:
        limit = config.wikisource_max_results_size_slow_nlp
    logger.info(
        f"Fetching usage examples from the "
        f"{lexemes.language_code.name.title()} Wikisource...")
    language = lexemes.language_code.value
    # search using sparql
    # borrowed from Scholia
    # thanks to Vigneron for the tip :)
    # The title URL is built from the same language subdomain as the search
    # endpoint; it was previously hard-coded to "br.wikisource.org".
    results = execute_sparql_query(f'''
    SELECT ?title ?titleUrl ?snippet WHERE {{
      SERVICE wikibase:mwapi {{
        bd:serviceParam wikibase:api "Search" .
        bd:serviceParam wikibase:endpoint "{language}.wikisource.org" .
        bd:serviceParam mwapi:srsearch "{form.representation}" .
        bd:serviceParam mwapi:language "{language}" .
        ?title wikibase:apiOutput mwapi:title .
        ?snippet_ wikibase:apiOutput "@snippet" .
      }}
      hint:Prior hint:runFirst "true" .
      BIND(CONCAT("https://{language}.wikisource.org/wiki/", ENCODE_FOR_URI(?title)) AS ?titleUrl)
      BIND(REPLACE(REPLACE(?snippet_, '</span>', ''), '<span class="searchmatch">', '') AS ?snippet)
    }}
    LIMIT {limit}
    ''')
    logger.debug(f"results:{results}")
    records = [
        WikisourceRecord(json=item, lexemes=lexemes)
        for item in results["results"]["bindings"]
    ]
    logger.info(f"Got {len(records)} records")
    # Only walk the records when debug output is actually enabled.
    if logger.isEnabledFor(logging.DEBUG):
        for record in records:
            logger.debug(record)
    return process_records(form=form, records=records, lexemes=lexemes)
def count_number_of_senses_with_p5137(self) -> int:
    """Return how many senses in ``self.language_qid`` have a P5137 value.

    Only senses that have at least one gloss (``skos:definition``) and at
    least one P5137 statement are matched.

    Returns:
        The count extracted from the query result by ``wdqs.extract_count``.
    """
    logger = logging.getLogger(__name__)
    # COUNT(DISTINCT ...) is required: the pattern yields one row per
    # sense/gloss/P5137-value combination, so a plain COUNT(?sense) counts a
    # sense once for every gloss language and every linked item.
    result = execute_sparql_query(f'''
    SELECT
    (COUNT(DISTINCT ?sense) as ?count)
    WHERE {{
      ?l dct:language wd:{self.language_qid.value}.
      ?l ontolex:sense ?sense.
      ?sense skos:definition ?gloss.
      # Exclude lexemes without a linked QID from at least one sense
      ?sense wdt:P5137 [].
    }}''')
    logger.debug(f"result:{result}")
    count: int = wdqs.extract_count(result)
    # Use the module logger consistently instead of the root-level
    # logging.debug().
    logger.debug(f"count:{count}")
    return count
def count_number_of_forms_without_an_example(self):
    """Count forms lacking an example and store the number on the instance.

    Counts the forms of lexemes in ``self.language_qid`` that have at least
    one sense with P5137 and no P5831 (usage example) statement on the lexeme.
    The number is assigned to ``self.number_of_forms_without_an_example``;
    nothing is returned.
    """
    # TODO fix this to count all senses in a given language
    # COUNT(DISTINCT ...) is required: joining forms with senses yields one
    # row per form/sense/P5137-value combination, so a plain COUNT(?form)
    # counts each form several times.
    result = execute_sparql_query(f'''
    SELECT
    (COUNT(DISTINCT ?form) as ?count)
    WHERE {{
      ?l dct:language wd:{self.language_qid.value}.
      ?l ontolex:lexicalForm ?form.
      ?l ontolex:sense ?sense.
      # exclude lexemes that already have at least one example
      MINUS {{?l wdt:P5831 ?example.}}
      # Exclude lexemes without a linked QID from at least one sense
      ?sense wdt:P5137 [].
    }}''')
    count: int = wdqs.extract_count(result)
    logging.debug(f"count:{count}")
    self.number_of_forms_without_an_example = count