def processURI(self):
    """
    Fill in ``shortUri``, ``word`` and ``tag`` for this object.

    ``extractShortUri()`` is called first (presumably it populates
    ``self.shortUri`` -- confirm against its definition).  A short URI
    containing an underscore gets ``tag = -1`` and nothing else is computed.
    Otherwise ``word`` becomes the space-joined result of
    ``uri_to_lemmas(self.shortUri)`` and ``tag`` is derived from
    ``self.fullUri``.  ``tag`` is left untouched when neither of the two
    checks on ``fullUri`` matches.
    """
    self.extractShortUri()

    # We do not consider multi-word expressions as candidates.
    if "_" in self.shortUri:
        self.tag = -1
        return

    self.word = " ".join(uri_to_lemmas(self.shortUri))

    uri = self.fullUri

    # NN pos tag: the URI ends with "/n" or has an "/n/" segment.
    has_noun_marker = uri.endswith("/n") or "/n/" in uri
    if has_noun_marker:
        self.tag = 1

    # No pos tag.  Checked independently of the noun test above, so it can
    # overwrite a tag of 1 that was just assigned.
    lacks_pos_tag = uri.count("/") == 4 and uri[-2] != "/"
    if lacks_pos_tag:
        self.tag = -1
 def processURI(self):
     """
     Compute ``shortUri``, ``word`` and ``tag``.

     Calls ``extractShortUri()`` (presumably the step that sets
     ``self.shortUri`` -- confirm against its definition).  If the short
     URI contains an underscore, ``tag`` is set to -1 and the method
     returns immediately.  Otherwise ``word`` is the space-joined output
     of ``uri_to_lemmas(self.shortUri)`` and ``tag`` is chosen from the
     shape of ``self.fullUri``; if neither check below matches, ``tag``
     is not assigned here.
     """
     self.extractShortUri()

     # Multi-word expressions are not considered as candidates.
     if '_' in self.shortUri:
         self.tag = -1
         return

     lemmas = uri_to_lemmas(self.shortUri)
     self.word = ' '.join(lemmas)

     full = self.fullUri

     # NN pos tag
     if full.endswith('/n') or '/n/' in full:
         self.tag = 1

     # No pos tag.  This is a separate check, not an ``elif``: it may
     # override the value assigned just above.
     if full.count('/') == 4 and full[-2] != '/':
         self.tag = -1
Пример #3
0
def convert_to_solr(input_filename, output_filename):
    """
    Convert a JSON stream to a different JSON file that can be loaded into
    Solr.

    A JSON stream differs from standard JSON in that it contains several
    objects separated by line breaks.

    A Solr input file differs from standard JSON in a different way: it is
    represented as a single object with many fields. The values of these
    fields are the various different objects, but the key of each field
    must be "add".

    Having many values with the same key is incompatible with Python
    dictionaries, but is technically allowed by the JSON grammar. To create the
    output JSON file in Python, we have to write its components incrementally.

    Each record read from `input_filename` is indexed by 'weight', 'start'
    and 'end'. Every record gets 'relLemmas' (always the empty string),
    'startLemmas' and 'endLemmas' added to it; only records whose weight is
    greater than zero are written, with that weight used as the "boost".
    A 'surfaceText' value of None is dropped from a record before writing.

    Parameters:
        input_filename: path handed to `read_json_stream`.
        output_filename: path of the UTF-8 file to create or overwrite.

    Returns:
        None. The result is the file written at `output_filename`.
    """
    # Use a context manager so the output file is flushed and closed even
    # when reading or serializing a record raises part-way through.
    with codecs.open(output_filename, 'w', encoding='utf-8') as out:
        print("{", file=out)
        for info in read_json_stream(input_filename):
            boost = info['weight']

            # Handle searchable lemmas
            info['relLemmas'] = ''
            info['startLemmas'] = ' '.join(uri_to_lemmas(info['start']))
            info['endLemmas'] = ' '.join(uri_to_lemmas(info['end']))

            if boost > 0:
                if 'surfaceText' in info and info['surfaceText'] is None:
                    del info['surfaceText']

                solr_struct = {'doc': info, 'boost': boost}
                # Every record is emitted under the same "add" key, so the
                # fragment is written by hand rather than through a dict.
                solr_fragment = '\t"add": %s,' % json.dumps(solr_struct)
                print(solr_fragment, file=out)
        # The trailing "commit" entry follows the last comma-terminated
        # "add" fragment and closes the object.
        print('\t"commit": {}', file=out)
        print('}', file=out)
Пример #4
0
def convert_to_solr(input_filename, output_filename):
    """
    Convert a JSON stream to a different JSON file that can be loaded into
    Solr.

    A JSON stream differs from standard JSON in that it contains several
    objects separated by line breaks.

    A Solr input file differs from standard JSON in a different way: it is
    represented as a single object with many fields. The values of these
    fields are the various different objects, but the key of each field
    must be "add".

    Having many values with the same key is incompatible with Python
    dictionaries, but is technically allowed by the JSON grammar. To create the
    output JSON file in Python, we have to write its components incrementally.

    Each record read from `input_filename` is indexed by "weight", "start"
    and "end". Every record gets "relLemmas" (always the empty string),
    "startLemmas" and "endLemmas" added to it; only records whose weight is
    greater than zero are written, with that weight used as the "boost".
    A "surfaceText" value of None is dropped from a record before writing.

    Parameters:
        input_filename: path handed to `read_json_stream`.
        output_filename: path of the UTF-8 file to create or overwrite.

    Returns:
        None. The result is the file written at `output_filename`.
    """
    # Use a context manager so the output file is flushed and closed even
    # when reading or serializing a record raises part-way through.
    with codecs.open(output_filename, "w", encoding="utf-8") as out:
        print("{", file=out)
        for info in read_json_stream(input_filename):
            boost = info["weight"]

            # Handle searchable lemmas
            info["relLemmas"] = ""
            info["startLemmas"] = " ".join(uri_to_lemmas(info["start"]))
            info["endLemmas"] = " ".join(uri_to_lemmas(info["end"]))

            if boost > 0:
                if "surfaceText" in info and info["surfaceText"] is None:
                    del info["surfaceText"]

                solr_struct = {"doc": info, "boost": boost}
                # Every record is emitted under the same "add" key, so the
                # fragment is written by hand rather than through a dict.
                solr_fragment = '\t"add": %s,' % json.dumps(solr_struct)
                print(solr_fragment, file=out)
        # The trailing "commit" entry follows the last comma-terminated
        # "add" fragment and closes the object.
        print('\t"commit": {}', file=out)
        print("}", file=out)