import argparse
import logging
import random

import loader  # project-local module that reads the datasets
# test_model() is assumed to be defined elsewhere in this project.


def main():

    parser = argparse.ArgumentParser(description='Run Doc2Vec/GloVe then push those vectors into Scikit-Learn')
    
    parser.add_argument('-e', '--epochs', type=int, default=10, help='Number of epochs for training')
    parser.add_argument('-v', '--verbose', action="store_true", help="Show verbose output")
    
    parser.add_argument("-s", "--vecsize", type=int, default=100, help="Vector size")
    
    parser.add_argument("-k", default=5, type=int, help="K for cross-validation")
    parser.add_argument('--nostop', action="store_true", help='Remove stop words from the text')
    parser.add_argument('--stem', action="store_true", help='Apply stemming to the text')
    parser.add_argument("--dataset", default="sentiment140", help="Which dataset to use")
    
    parser.add_argument("--datapath", help="Path to chosen dataset, required for first use.")
    
    parser.add_argument("--dvSample", default=0.0001, type=float, help="Doc2Vec sampling.")
    parser.add_argument("--dvNegative", default=5, help="Doc2Vec negative.")
    parser.add_argument("--dvMinCount", default=1, help="Doc2Vec min_count.")
    parser.add_argument("--window", default=1, help="Doc2Vec window.")
    parser.add_argument("--dvWorkers", default=1, help="Doc2Vec workers.")
    
    parser.add_argument("--lsi", action="store_true", help="Use Latent Semantic Indexing.")
    
    parser.add_argument("--dataLength", default=None, type=int, help="Use to limit the number of examples used")
    parser.add_argument("--dataSample", default=None, type=float, help="Use to sample examples from data")
    
    
    parser.add_argument("--nTrees", default=15, type=int, help="Number of trees for Random Forests.")
    parser.add_argument("--rfFeatures", default="sqrt", choices=["sqrt","log2","auto","all"],
                        help="Number of features for Random Forests.")
    
    parser.add_argument("--learningRate", default=0.05, type=float,help="GloVe learning rate.")
    
    parser.add_argument("--pca", action="store_true", help="Use pca with GloVe vectors")
    parser.add_argument('--parallelism', '-p', action='store',
                        default=4,
                        help=('Number of parallel threads to use'))

    parser.add_argument("--embeddings", choices=["glove","doc2vec"], default="doc2vec", help="Methods to generate vectors from text")
    
    args = parser.parse_args()
    
    logging.basicConfig(format='%(asctime)-15s: %(message)s', level=logging.INFO)

    all_data = list(loader.read(args.dataset,
                                dataPath=args.datapath,
                                limit=args.dataLength,
                                sampleRate=args.dataSample))
    
    random.shuffle(all_data)
    test_model(all_data, args)
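
A minimal sketch of how this script might be launched, assuming it is saved as a file such as run_vectors.py (the filename and flag values are illustrative, not from the original project):

if __name__ == '__main__':
    # Hypothetical invocation:
    #   python run_vectors.py --embeddings doc2vec -e 20 -s 200 \
    #       --dataset sentiment140 --datapath ./data/training.csv
    main()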
Example #2
import json

# RigidObjectModel, RobotModelLink, loader, and get() are assumed to come
# from the surrounding Klamp't resource module.


def console_edit(name, value, type, description=None, world=None, frame=None):
    print("*********************************************************")
    print()
    print("Editing resource", name, "of type", type)
    print()
    if description is not None:
        print(description)
        print()
    if frame is not None:
        if isinstance(frame, (RigidObjectModel, RobotModelLink)):
            print("Reference frame:", frame.getName())
        else:
            print("Reference frame:", frame)
        print()
    print("*********************************************************")
    print("Current value:", value)
    choice = ''
    while choice not in ['y', 'n', 'q']:
        # Keep only the first character so "yes"/"no"/"quit" also work, and
        # guard against empty input, which crashed the original.
        choice = input("Do you wish to change it? (y/n/q) > ").strip().lower()[:1]
        if choice not in ['y', 'n', 'q']:
            print("Please enter y/n/q indicating yes/no/quit.")
    if choice == 'y':
        print("Enter the new desired value below.  You may use native text,")
        print("JSON strings, or file(fn) to indicate a file name.")
        data = input("New value > ")
        if data.startswith('{') or data.startswith('['):
            try:
                # json.loads can raise too, so it belongs inside the try block.
                jsonobj = json.loads(data)
                obj = loader.fromJson(jsonobj, type)
                return True, obj
            except Exception:
                input("Error loading from JSON, press enter to continue...")
                return False, value
        elif data.startswith('file('):
            try:
                obj = get(data[5:-1], type, doedit=False)
                if obj is None:
                    return False, value
                return True, obj
            except Exception:
                input("Error loading from file, press enter to continue...")
                return False, value
        else:
            try:
                obj = loader.read(type, data)
                return True, obj
            except Exception:
                input("Error loading from text, press enter to continue...")
                return False, value
    elif choice == 'n':
        print("Using current value.")
        print("*********************************************************")
        return False, value
    else:  # choice == 'q'
        return False, None
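
A minimal usage sketch, assuming the Klamp't "Config" resource type; the resource name, value, and description below are hypothetical:

# Ask the user to review or edit a 7-DOF configuration on the console.
changed, newvalue = console_edit("home_config", [0.0] * 7, "Config",
                                 description="Robot home configuration")
if changed:
    print("New value:", newvalue)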
Example #3
def load_table_data():
    """Read every (table, data) pair from the loader and load it into the DB."""
    log.debug("Loading table data...")
    for table, data in loader.read():
        log.debug(table)
        db_client.load(table, data)
    db_client.data_loaded = True
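
Here loader.read() is expected to yield (table, data) pairs. A minimal stand-in for local testing might look like this; the table names and rows are made up:

def read():
    # Yield (table_name, rows) pairs in the shape load_table_data() consumes.
    yield "users", [{"id": 1, "name": "Ada"}]
    yield "orders", [{"id": 7, "user_id": 1}]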
Example #4
def main():
    graph = loader.read()                               # loads graph
    print_checks_and_debug.print_nodes(graph)           # DEBUG
    simulate.simulate(graph)                            # simulation
Example #5
    # Method of a CNN sentence classifier; numpy (np), theano, theano.tensor (T),
    # train_test_split, and the layer/optimizer classes used below are assumed
    # to be imported at module level.
    def __init__(self, rng, batchsize=100, activation=relu):

        import loader
        (numsent, charcnt, wordcnt, maxwordlen, maxsenlen,
         kchr, kwrd, xchr, xwrd, y) = loader.read("tweets_clean.txt")

        dimword = 30
        dimchar = 5
        clword = 300
        clchar = 50
        kword = kwrd
        kchar = kchr

        (datatrainword, datatestword,
         datatrainchar, datatestchar,
         targettrain, targettest) = train_test_split(
            xwrd, xchr, y, random_state=1234, test_size=0.1)

        xtrainword = theano.shared(np.asarray(datatrainword, dtype='int16'),
                                   borrow=True)
        xtrainchar = theano.shared(np.asarray(datatrainchar, dtype='int16'),
                                   borrow=True)
        ytrain = theano.shared(np.asarray(targettrain, dtype='int8'),
                               borrow=True)
        xtestword = theano.shared(np.asarray(datatestword, dtype='int16'),
                                  borrow=True)
        xtestchar = theano.shared(np.asarray(datatestchar, dtype='int16'),
                                  borrow=True)
        ytest = theano.shared(np.asarray(targettest, dtype='int8'),
                              borrow=True)

        self.ntrainbatches = xtrainword.get_value(
            borrow=True).shape[0] // batchsize
        self.ntestbatches = xtestword.get_value(
            borrow=True).shape[0] // batchsize

        index = T.iscalar()
        xwrd = T.wmatrix('xwrd')
        xchr = T.wtensor3('xchr')
        y = T.bvector('y')
        train = T.iscalar('train')

        layercharembedinput = xchr

        layercharembed = EmbedIDLayer(rng,
                                      layercharembedinput,
                                      ninput=charcnt,
                                      noutput=dimchar)

        layer1input = layercharembed.output.reshape(
            (batchsize * maxsenlen, 1, maxwordlen, dimchar))

        layer1 = ConvolutionalLayer(rng,
                                    layer1input,
                                    filter_shape=(clchar, 1, kchar, dimchar),
                                    image_shape=(batchsize * maxsenlen, 1,
                                                 maxwordlen, dimchar))

        layer2 = MaxPoolingLayer(layer1.output,
                                 poolsize=(maxwordlen - kchar + 1, 1))

        layerwordembedinput = xwrd

        layerwordembed = EmbedIDLayer(rng,
                                      layerwordembedinput,
                                      ninput=wordcnt,
                                      noutput=dimword)

        layer3wordinput = layerwordembed.output.reshape(
            (batchsize, 1, maxsenlen, dimword))
        layer3charinput = layer2.output.reshape(
            (batchsize, 1, maxsenlen, clchar))

        layer3input = T.concatenate([layer3wordinput, layer3charinput], axis=3)

        layer3 = ConvolutionalLayer(rng,
                                    layer3input,
                                    filter_shape=(clword, 1, kword,
                                                  dimword + clchar),
                                    image_shape=(batchsize, 1, maxsenlen,
                                                 dimword + clchar),
                                    activation=activation)

        layer4 = MaxPoolingLayer(layer3.output,
                                 poolsize=(maxsenlen - kword + 1, 1))

        layer5input = layer4.output.reshape((batchsize, clword))

        layer5 = FullyConnectedLayer(rng,
                                     dropout(rng, layer5input, train),
                                     ninput=clword,
                                     noutput=50,
                                     activation=activation)

        layer6input = layer5.output

        layer6 = FullyConnectedLayer(rng,
                                     dropout(rng, layer6input, train, p=0.1),
                                     ninput=50,
                                     noutput=2,
                                     activation=None)

        result = Result(layer6.output, y)
        loss = result.negativeloglikelihood()
        accuracy = result.accuracy()
        params = (layer6.params
                  + layer5.params
                  + layer3.params
                  + layerwordembed.params
                  + layer1.params
                  + layercharembed.params)
        updates = RMSprop(learningrate=0.001, params=params).updates(loss)

        self.trainmodel = theano.function(
            inputs=[index],
            outputs=[loss, accuracy],
            updates=updates,
            givens={
                xwrd: xtrainword[index * batchsize:(index + 1) * batchsize],
                xchr: xtrainchar[index * batchsize:(index + 1) * batchsize],
                y: ytrain[index * batchsize:(index + 1) * batchsize],
                train: np.cast['int32'](1)
            })

        self.testmodel = theano.function(
            inputs=[index],
            outputs=[loss, accuracy],
            givens={
                xwrd: xtestword[index * batchsize:(index + 1) * batchsize],
                xchr: xtestchar[index * batchsize:(index + 1) * batchsize],
                y: ytest[index * batchsize:(index + 1) * batchsize],
                train: np.cast['int32'](0)
            })
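
Once constructed, the compiled Theano functions are typically driven by an epoch loop over the precomputed batch counts. A sketch, where ConvNet stands in for whatever class this __init__ belongs to and n_epochs is arbitrary:

import numpy as np

model = ConvNet(rng=np.random.RandomState(1234), batchsize=100)
n_epochs = 10
for epoch in range(n_epochs):
    for i in range(model.ntrainbatches):
        loss, acc = model.trainmodel(i)   # one RMSprop step on batch i
    scores = [model.testmodel(i) for i in range(model.ntestbatches)]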