Example #1
0
    # Resolve the tours data path; fall back to the bundled spots.json.
    path = args.path
    if not path:
        path = os.path.join(os.path.dirname(__file__), "../data/spots.json")
    cityspots = CitySpots.load(path)

    # Build a JSON-serializable summary: one record per city, listing its
    # city code and the codes of all spots in that city.
    cityspotsj = [
        {
            "id": cs.city.code,
            "spots": [s.code for s in cs.spots]
        }
        for cs in cityspots
    ]

    # Convert the loaded data into a document representation.
    cityspots_doc = CitySpots.to_doc(cityspots)

    print("show city spots corpus")
    cityspots_doc.show_vocab(limit=20)

    # Pickle the document next to the source data file.
    doc_path = os.path.join(os.path.dirname(path), "./cityspots_doc.pickle")
    p = PickleResource(doc_path)
    p.save(cityspots_doc)

    # Write the summary as UTF-8 JSON alongside the source data
    # (ensure_ascii=False keeps non-ASCII text readable in the output).
    j = json.dumps(cityspotsj, indent=2, ensure_ascii=False)
    data_path = os.path.join(os.path.dirname(path), "./cityspots.json")
    with open(data_path, "wb") as f:
        f.write(j.encode("utf-8"))
Example #2
0
    # Load the previously pickled document.
    loader = PickleResource(path)
    doc = loader.load()

    # Drop overly frequent vocabulary when a threshold was supplied.
    if args.freq > 0:
        doc.cut_frequent(args.freq)

    # Filter vocabulary by part-of-speech metadata (Japanese POS tags).
    doc.cut_pos({"pos": ["動詞", "副詞"], "class1": ["接尾", "副詞可能"], "class2": ["人名", "地域", "副詞可能"]})

    # Cut words outside the requested frequency bounds.
    if args.under > 0:
        doc.cut_under(args.under)
    if args.above > 0:
        doc.cut_above(args.above)

    # Remove words from an explicit ignore list, resolved relative to the
    # document's directory; first column of each row is the word.
    if args.ignore:
        ignore_path = os.path.join(os.path.dirname(path), args.ignore)
        for row in FileResource(ignore_path).load():
            doc.remove_vocab(row[0])

    doc.show_vocab(show_pos=True)

    # Optionally persist the edited document next to the original pickle,
    # with an "_edited" suffix on the base name.
    if args.save:
        base = os.path.splitext(os.path.basename(path))[0]
        doc_path = os.path.join(os.path.dirname(path), "./" + base + "_edited.pickle")
        pe = PickleResource(doc_path)
        pe.save(doc)