Example #1
out = codecs.open(output_file, u"w", encoding=u"utf-8")
for line in codecs.open(input_file, u"r", encoding=u"utf-8"):
    counter += 1
    if counter < start:
        continue
    if counter > finish:
        break
    if counter % frequency == 0:
        if not filtering:
            out.write(line)
            saved_count += 1
        else:
            split = line.split(u"\t")
            wiki_coordinates = (float(split[0]), float(split[1]))
            name = u" ".join(eval(split[5])).strip()
            db_coordinates = get_coordinates(c, name)
            distance = []
            for candidate in db_coordinates:
                distance.append(great_circle(wiki_coordinates, (float(candidate[0]), float(candidate[1]))).kilometers)
            distance = sorted(distance)
            if distance[0] > max_distance:
                print(name, distance[0])
                filtered_count += 1
            else:
                out.write(line)
                saved_count += 1

print(u"Saved", saved_count, u"samples.")
if filtering:
    print(u"Filtered:", filtered_count, u"samples.")
Example #2
File: geoparse.py  Project: x3xiong/EUPEG
def geoparse(text, result_string, model, word_to_index):
    """
    This function allows one to geoparse text i.e. extract toponyms (place names) and disambiguate to coordinates.
    :param text: to be parsed
    :return: currently only prints results to the screen, feel free to modify to your task
    """
    nlp = spacy.load(u'en')  # the simple English model; swap in the full Spacy download if you have it installed
    # this DB can be downloaded using the GitHub link
    conn = sqlite3.connect(u'/opt/gsda/EUPEG/Geoparsers/camCoder/data/geonames.db').cursor()
    padding = nlp(u"0")[0]  # Do I need to explain? :-)

    doc = nlp(text)  # run Spacy tokenisation and NER over the text
    for entity in doc.ents:
        if entity.label_ in [u"GPE", u"FACILITY", u"LOC", u"FAC", u"LOCATION"]:
            name = entity.text if not entity.text.startswith(
                'the') else entity.text[4:].strip()
            start = entity.start_char if not entity.text.startswith(
                'the') else entity.start_char + 4
            end = entity.end_char
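            # near_inp holds the CONTEXT_LENGTH // 2 tokens immediately to the left and right
            # of the entity; far_inp holds the next CONTEXT_LENGTH // 2 tokens further out on
            # each side. Both windows are padded with the "0" token to a fixed length.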
            near_inp = pad_list(CONTEXT_LENGTH // 2, [x for x in doc[max(0, entity.start - CONTEXT_LENGTH // 2):entity.start]], True, padding) + \
                       pad_list(CONTEXT_LENGTH // 2, [x for x in doc[entity.end: entity.end + CONTEXT_LENGTH // 2]], False, padding)
            far_inp = pad_list(CONTEXT_LENGTH // 2, [x for x in doc[max(0, entity.start - CONTEXT_LENGTH):max(0, entity.start - CONTEXT_LENGTH // 2)]], True, padding) + \
                      pad_list(CONTEXT_LENGTH // 2, [x for x in doc[entity.end + CONTEXT_LENGTH // 2: entity.end + CONTEXT_LENGTH]], False, padding)
            map_vector = text2mapvec(doc=near_inp + far_inp,
                                     mapping=ENCODING_MAP_1x1,
                                     outliers=OUTLIERS_MAP_1x1,
                                     polygon_size=1,
                                     db=conn,
                                     exclude=name)

            context_words, entities_strings = [], []
            target_string = pad_list(TARGET_LENGTH,
                                     [x.text.lower() for x in entity], True,
                                     u'0')
            target_string = [
                word_to_index[x]
                if x in word_to_index else word_to_index[UNKNOWN]
                for x in target_string
            ]
            for words in [near_inp, far_inp]:
                for word in words:
                    if word.text.lower() in word_to_index:
                        vec = word_to_index[word.text.lower()]
                    else:
                        vec = word_to_index[UNKNOWN]
                    if word.ent_type_ in [
                            u"GPE", u"FACILITY", u"LOC", u"FAC", u"LOCATION"
                    ]:
                        entities_strings.append(vec)
                        context_words.append(word_to_index[u'0'])
                    elif word.is_alpha and not word.is_stop:
                        context_words.append(vec)
                        entities_strings.append(word_to_index[u'0'])
                    else:
                        context_words.append(word_to_index[u'0'])
                        entities_strings.append(word_to_index[u'0'])

            prediction = model.predict([
                np.array([context_words]),
                np.array([context_words]),
                np.array([entities_strings]),
                np.array([entities_strings]),
                np.array([map_vector]),
                np.array([target_string])
            ])
            prediction = index_to_coord(
                REVERSE_MAP_2x2[np.argmax(prediction[0])], 2)
            candidates = get_coordinates(conn, name)

            if len(candidates) == 0:
                # print(u"Don't have an entry for", name, u"in GeoNames")
                continue

            max_pop = candidates[0][2]
            best_candidate = []
            bias = 0.905  # Tweak the parameter depending on the domain you're working with.
            # Less than 0.9 suitable for ambiguous text, more than 0.9 suitable for less ambiguous locations, see paper
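            # Illustrative example (made-up numbers): with bias = 0.905, a candidate 120 km
            # from the prediction that holds the maximum population scores
            # 120 - (120 * 1.0) * 0.905 = 11.4, while a candidate only 40 km away but with a
            # population 500 times smaller scores about 40 - (40 * 0.002) * 0.905 = 39.93,
            # so the large, well-known place is still preferred.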
            for candidate in candidates:
                err = great_circle(
                    prediction, (float(candidate[0]), float(candidate[1]))).km
                best_candidate.append(
                    (err -
                     (err * max(1, candidate[2]) / max(1, max_pop)) * bias,
                     (float(candidate[0]), float(candidate[1]))))
            best_candidate = sorted(best_candidate, key=lambda pair: pair[0])[0]

            # England,, England,, 51.5,, -0.11,, 669,, 676 || - use evaluation script to test correctness
            # print name, start, end

            # print u"Coordinates:", best_candidate[1]
            name = name.encode('utf-8')
            one_toponym = "{0},,{1},,{2},,{3},,{4},,{5}||".format(
                name, name, best_candidate[1][0], best_candidate[1][1], start,
                end)
            result_string = result_string + one_toponym

    return result_string
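# A minimal usage sketch for geoparse() above, assuming a trained model and a
# word_to_index vocabulary have already been loaded; the sample text and the
# character offsets in the expected output are illustrative only:
#
#     sample = u"I visited Melbourne and then flew on to Perth."
#     print(geoparse(sample, u"", model, word_to_index))
#     # e.g. u"Melbourne,,Melbourne,,<lat>,,<lon>,,10,,19||Perth,,..."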
print(u'Crunching numbers, sit tight...')
# errors = codecs.open(u"errors.tsv", u"w", encoding=u"utf-8")
# Uncomment the above line for error diagnostics, also the section below.
conn = sqlite3.connect(u'../data/geonames.db')
file_name = u"data/eval_" + test_data + u".txt"
final_errors = []

print("Processing file..." + file_name)
for prediction, (y, name, context) in zip(
        model.predict_generator(
            generate_arrays_from_file(file_name, word_to_index, train=False),
            steps=int(check_output([u"wc", file_name]).split()[0]) // BATCH_SIZE,
            verbose=True), generate_strings_from_file(file_name)):
    prediction = index_to_coord(REVERSE_MAP_2x2[np.argmax(prediction)], 2)
    candidates = get_coordinates(conn.cursor(), name)

    if len(candidates) == 0:
        print(u"Don't have an entry for", name, u"in GeoNames")
        raise Exception(u"Check your database, buddy :-)")

    # candidates = [candidates[0]]  # Uncomment for population heuristic.
    # THE ABOVE IS THE POPULATION ONLY BASELINE IMPLEMENTATION

    #print("Prediction..."+" ".join(y)+"..."+name+".."+str(context))
    print("Prediction..." + " " + str(y) + "...." + name + ".." + context)
    best_candidate = []
    max_pop = candidates[0][2]
    bias = 0.905  # the Bias parameter in the paper
    for candidate in candidates:
        print(candidate)