def processURI(self):
    """
    Fill in ``shortUri``, ``word`` and ``tag`` for this URI.

    ``extractShortUri()`` runs first (presumably it populates
    ``self.shortUri`` -- confirm against its definition).

    * If the short URI contains an underscore it is treated as a
      multi-word expression: ``tag`` becomes -1 and ``word`` is not
      assigned.
    * Otherwise ``word`` is the space-joined output of
      ``uri_to_lemmas(self.shortUri)``, and ``tag`` is derived from
      ``fullUri``: 1 when the URI carries a noun marker, then -1 when the
      URI has exactly four slashes and no one-letter trailing segment.
      When neither rule matches, ``tag`` is left as it was.
    """
    self.extractShortUri()

    # Multi-word expressions are never considered as candidates.
    if "_" in self.shortUri:
        self.tag = -1
        return

    lemmas = uri_to_lemmas(self.shortUri)
    self.word = " ".join(lemmas)

    # Noun marker: either the last segment is "n" or an "n" segment
    # appears somewhere inside the URI.
    looks_like_noun = self.fullUri.endswith("/n") or "/n/" in self.fullUri
    if looks_like_noun:
        self.tag = 1

    # Four slashes without a single-character final segment means the URI
    # has no POS tag at all; this check runs last so it overrides the
    # noun tag.
    lacks_pos_tag = self.fullUri.count("/") == 4 and self.fullUri[-2] != "/"
    if lacks_pos_tag:
        self.tag = -1
def processURI(self):
    """
    Compute ``shortUri``, ``word`` and ``tag``.

    Calls ``extractShortUri()`` and then inspects ``self.shortUri``.
    A short URI containing ``'_'`` (a multi-word expression) is rejected:
    ``tag`` is set to -1 and the method returns without assigning ``word``.
    For any other short URI, ``word`` is built by joining
    ``uri_to_lemmas(self.shortUri)`` with spaces, and ``tag`` is set to 1
    for noun URIs and to -1 for URIs without a POS tag. ``tag`` is not
    modified when neither condition holds.
    """
    self.extractShortUri()

    if '_' in self.shortUri:
        # Multi-word expressions are not candidates.
        self.tag = -1
        return

    self.word = ' '.join(uri_to_lemmas(self.shortUri))

    uri = self.fullUri

    # NN POS tag: trailing '/n' or an embedded '/n/' segment.
    if uri.endswith('/n') or '/n/' in uri:
        self.tag = 1

    # No POS tag: exactly four slashes and the second-to-last character
    # is not a slash. Evaluated last, so it takes precedence.
    if uri.count('/') == 4 and uri[-2] != '/':
        self.tag = -1
def convert_to_solr(input_filename, output_filename):
    """
    Convert a JSON stream to a different JSON file that can be loaded into
    Solr.

    A JSON stream differs from standard JSON in that it contains several
    objects separated by line breaks.

    A Solr input file differs from standard JSON in a different way: it is
    represented as a single object with many fields. The values of these
    fields are the various different objects, but the key of each field
    must be "add".

    Having many values with the same key is incompatible with Python
    dictionaries, but is technically allowed by the JSON grammar. To create
    the output JSON file in Python, we have to write its components
    incrementally.

    Each record read from the stream gets three extra fields
    (``relLemmas``, ``startLemmas``, ``endLemmas``). Only records whose
    ``weight`` is greater than zero are written, with that weight as the
    Solr boost; a ``surfaceText`` of ``None`` is dropped from those records
    before serialising. The file ends with a ``"commit": {}`` entry.

    Args:
        input_filename: path of the JSON stream, passed to
            ``read_json_stream``.
        output_filename: path of the Solr JSON file to write (UTF-8,
            overwritten if it exists).
    """
    # The context manager guarantees the output file is flushed and closed,
    # including when reading or serialising a record raises part-way through.
    with codecs.open(output_filename, 'w', encoding='utf-8') as out:
        print("{", file=out)
        for info in read_json_stream(input_filename):
            boost = info['weight']

            # Handle searchable lemmas
            info['relLemmas'] = ''
            info['startLemmas'] = ' '.join(uri_to_lemmas(info['start']))
            info['endLemmas'] = ' '.join(uri_to_lemmas(info['end']))

            if boost > 0:
                # A null surfaceText is removed rather than sent to Solr.
                if 'surfaceText' in info and info['surfaceText'] is None:
                    del info['surfaceText']

                solr_struct = {'doc': info, 'boost': boost}
                # Every record is emitted under the repeated key "add", so
                # the fragment is assembled by hand instead of via a dict.
                solr_fragment = '\t"add": %s,' % json.dumps(solr_struct)
                print(solr_fragment, file=out)
        print('\t"commit": {}', file=out)
        print('}', file=out)
def convert_to_solr(input_filename, output_filename):
    """
    Convert a JSON stream to a different JSON file that can be loaded into
    Solr.

    A JSON stream differs from standard JSON in that it contains several
    objects separated by line breaks.

    A Solr input file differs from standard JSON in a different way: it is
    represented as a single object with many fields. The values of these
    fields are the various different objects, but the key of each field
    must be "add".

    Having many values with the same key is incompatible with Python
    dictionaries, but is technically allowed by the JSON grammar. To create
    the output JSON file in Python, we have to write its components
    incrementally.

    Each record read from the stream gets three extra fields
    (``relLemmas``, ``startLemmas``, ``endLemmas``). Only records whose
    ``weight`` is greater than zero are written, with that weight as the
    Solr boost; a ``surfaceText`` of ``None`` is dropped from those records
    before serialising. The file ends with a ``"commit": {}`` entry.

    Args:
        input_filename: path of the JSON stream, passed to
            ``read_json_stream``.
        output_filename: path of the Solr JSON file to write (UTF-8,
            overwritten if it exists).
    """
    # The context manager guarantees the output file is flushed and closed,
    # including when reading or serialising a record raises part-way through.
    with codecs.open(output_filename, "w", encoding="utf-8") as out:
        print("{", file=out)
        for info in read_json_stream(input_filename):
            boost = info["weight"]

            # Handle searchable lemmas
            info["relLemmas"] = ""
            info["startLemmas"] = " ".join(uri_to_lemmas(info["start"]))
            info["endLemmas"] = " ".join(uri_to_lemmas(info["end"]))

            if boost > 0:
                # A null surfaceText is removed rather than sent to Solr.
                if "surfaceText" in info and info["surfaceText"] is None:
                    del info["surfaceText"]

                solr_struct = {"doc": info, "boost": boost}
                # Every record is emitted under the repeated key "add", so
                # the fragment is assembled by hand instead of via a dict.
                solr_fragment = '\t"add": %s,' % json.dumps(solr_struct)
                print(solr_fragment, file=out)
        print('\t"commit": {}', file=out)
        print("}", file=out)