parser.add_argument("--maxN", dest="maxn", type=int, help="")
    parser.set_defaults(thread=cpu_count())
    parser.add_argument("-t", "--thread", dest="thread", type=int, help="the number of threads")
    parser.set_defaults(knn=1)
    parser.add_argument("-k", "--knn", dest="knn", type=int, help="use k of the nearest neighbors (default 1)")
    args = parser.parse_args()
    test_file = args.test
    topK = args.knn
    maxN = args.maxn
    if not args.modelfile1 or not args.modelfile2:
        print "Specify modelfile1 and modelfile2"
        quit(-1)

    logging.info("load trained model file")
    modelfile1 = args.modelfile1
    model1 = Sentence2Vec.load(modelfile1)
    modelfile2 = args.modelfile2
    model2 = Sentence2Vec.load(modelfile2)

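    # readSentence builds a {sentence id: category id} map from the training data.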
    sent_cat = readSentence(CatSentence(args.train, cont_col=3, split=args.split))
    test_sentences = CatSentence(test_file)
    confusion_mtx = {}

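    # Generator over the test set: yield sentence tuples and stop once more than maxN have been produced.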
    def prepare_sentences():
        count = 0
        for sent_tuple in test_sentences:
            yield sent_tuple
            count += 1
            if count > maxN:
                break
Example #2
if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
    current_dir = os.path.dirname(os.path.realpath(__file__))
    wikip_data = current_dir + "/" + wiki_name
    s2v_model_name = current_dir + "/" + model_dir + "/" + wiki_name + "_sent.model"
    if not os.path.exists(current_dir + "/" + model_dir):
        os.mkdir(current_dir + "/" + model_dir)
    if not os.path.isfile(wikip_data):
        logger.info("downloading Wikipedia data")
        urllib.urlretrieve(wiki_url, wikip_data)
        logger.info("downloaded in %s" % wikip_data)
    
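    # Stream sentences from the Wikipedia dump; train a new sentence2vec model only if no saved model exists.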
    sentences = WikiSentence(wikip_data)
    if not os.path.isfile(s2v_model_name):
        model = Sentence2Vec(sentences, iteration=10, model="cb", hs=1, negative=0, size=300, update_mode=0)
        model.save(s2v_model_name)
    else:
        model = Sentence2Vec.load(s2v_model_name)
    
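    # Interactive loop: read article titles from stdin and look up their sentence vectors in the model.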
    print "Input an article title (type EXIT to exit)"
    sys.stdout.write("Name: ")
    line = sys.stdin.readline()
    while line:
        line = utils.to_unicode(line.rstrip())
        if line == "EXIT":
            break
        try:
            if line in model.sent_no_hash:
                sent_no = model.sent_no_hash[line]
                sent_vec = model.sents[sent_no]
Example #3
        sent_cat[sent_id] = cat_id
    return sent_cat


if __name__ == "__main__":
    logging.basicConfig(
        format='%(asctime)s %(relativeCreated)d : %(threadName)s : %(levelname)s : %(message)s',
        level=logging.INFO)
    logging.info("running %s" % " ".join(sys.argv))
    program = os.path.basename(sys.argv[0])
    if len(sys.argv) < 2:
        print(globals()['__doc__'] % locals())
        sys.exit(1)

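    # Extend the base argument parser from Sentence2Vec with this script's own options.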
    parser = Sentence2Vec.arg_parser()
    parser.add_argument("--split",
                        dest="split",
                        action="store_true",
                        help="use this option for split training data",
                        default=False)
    parser.add_argument("--modelfile",
                        dest="modelfile",
                        type=str,
                        help="trained model file")
    parser.add_argument("--test", dest="test", type=str, help="test file")
    parser.set_defaults(maxn=sys.maxint)
    parser.add_argument("--maxN", dest="maxn", type=int, help="")
    parser.set_defaults(knn=1)
    parser.add_argument("-k",
                        "--knn",
    sent_cat = {}
    for tpl in sent:
        sent_id = tpl[1]
        cat_id = tpl[2]
        sent_cat[sent_id] = cat_id
    return sent_cat
    
if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s %(relativeCreated)d : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info("running %s" % " ".join(sys.argv))
    program = os.path.basename(sys.argv[0])
    if len(sys.argv) < 2:
        print(globals()['__doc__'] % locals())
        sys.exit(1)

    parser = Sentence2Vec.arg_parser()
    parser.add_argument("--split", dest="split", action="store_true", help="use this option for split training data", default=False)
    parser.add_argument("--modelfile", dest="modelfile", type=str, help="trained model file")
    parser.add_argument("--test", dest="test", type=str, help="test file")
    parser.set_defaults(maxn=sys.maxint)
    parser.add_argument("--maxN", dest="maxn", type=int, help="")
    parser.set_defaults(knn=1)
    parser.add_argument("-k","--knn", dest="knn", type=int, help="use k of the nearest neighbors (default 1)")
    args = parser.parse_args()
    test_file = args.test
    topK = args.knn
    maxN = args.maxn
    if not args.train:
        print "ERROR: specify training set"
        quit()
Example #5
    parser.add_argument("-k",
                        "--knn",
                        dest="knn",
                        type=int,
                        help="use k of the nearest neighbors (default 1)")
    args = parser.parse_args()
    test_file = args.test
    topK = args.knn
    maxN = args.maxn
    if not args.modelfile1 or not args.modelfile2:
        print "Specify modelfile1 and modelfile2"
        quit(-1)

    logging.info("load trained model file")
    modelfile1 = args.modelfile1
    model1 = Sentence2Vec.load(modelfile1)
    modelfile2 = args.modelfile2
    model2 = Sentence2Vec.load(modelfile2)

    sent_cat = readSentence(
        CatSentence(args.train, cont_col=3, split=args.split))
    test_sentences = CatSentence(test_file)
    confusion_mtx = {}

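    # Limit evaluation of the test file: stop once more than maxN sentences have been yielded.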
    def prepare_sentences():
        count = 0
        for sent_tuple in test_sentences:
            yield sent_tuple
            count += 1
            if count > maxN:
                break