Example #1
import argparse

import torch
import baseline
from torch.utils.data import Dataset, TensorDataset


def str2bool(v):
    """Parse a boolean CLI flag (mead-baseline ships an equivalent helper)."""
    return str(v).lower() in ('yes', 'true', 't', 'y', '1')


parser = argparse.ArgumentParser()
# --embeddings is referenced below via args.embeddings; the default mirrors
# the one used in the later examples
parser.add_argument('--embeddings', help='Pretrained embeddings file', default='/data/embeddings/GoogleNews-vectors-negative300.bin')
parser.add_argument('--transformer',
                    help='Are we using a Transformer (default is LSTM) LM',
                    type=str2bool,
                    default=False)
parser.add_argument("--device",
                    type=str,
                    default="cuda" if torch.cuda.is_available() else "cpu",
                    help="Device (cuda or cpu)")
args = parser.parse_known_args()[0]

embed_type = 'learned-positional' if args.transformer else 'default'

feature_desc = {
    'word': {
        'vectorizer':
        baseline.Token1DVectorizer(mxlen=-1, transform_fn=baseline.lowercase),
        'embed': {
            'embed_file': args.embeddings,
            'embed_type': embed_type,
            'unif': 0.05
        }
    }
}
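
# A minimal sketch of what feature_desc drives next: loading the pretrained
# embeddings named above. baseline.load_embeddings follows mead-baseline's
# API, but treat the exact call and the shape of its return value as
# assumptions to verify against your installed version.
embed_config = feature_desc['word']['embed']
loaded = baseline.load_embeddings('word',
                                  embed_file=embed_config['embed_file'],
                                  embed_type=embed_config['embed_type'],
                                  unif=embed_config['unif'])
embeddings = {'word': loaded['embeddings']}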


class DictionaryDatasetWrapper(Dataset):
    """Wrap a TensorDataset so each item comes out as ({feature dict}, y)."""

    def __init__(self, x, y):
        self.tensor_dataset = TensorDataset(x, y)

    def __getitem__(self, index):
        # Repackage the raw tensors as the dict-keyed features the model
        # expects; the body here follows the fuller version in Example #3,
        # since the original excerpt is cut off at this point
        x, y = self.tensor_dataset[index]
        return {'word': x.to(args.device)}, y.to(args.device)

    def __len__(self):
        return len(self.tensor_dataset)
Example #2
import baseline as bl

# `args` here is assumed to come from an argparse setup like the one in
# Example #3, extended with --model_type/--filts/--poolsz/--stacksz flags
pool_field = 'cmotsz' if args.model_type == 'default' else 'rnnsz'

model_params = {
    'model_type': args.model_type,
    'filtsz': args.filts,
    pool_field: args.poolsz
}

if args.stacksz is not None:
    model_params['hsz'] = args.stacksz


feature_desc = {
    'word': {
        'vectorizer': bl.Token1DVectorizer(mxlen=args.mxlen, transform_fn=bl.lowercase),
        'embed': {'file': args.embeddings, 'type': 'default', 'unif': 0.25}
    }
}
# Create a reader that uses our vectorizers to parse a TSV file
# with rows like:
# <label>\t<sentence>\n

vectorizers = {k: v['vectorizer'] for k, v in feature_desc.items()}
reader = bl.TSVSeqLabelReader(vectorizers,
                              clean_fn=bl.TSVSeqLabelReader.do_clean)

train_file = args.train
valid_file = args.valid
test_file = args.test
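
# A minimal sketch of the next steps: build vocabularies over all splits,
# then load batched data. build_vocab/load follow mead-baseline's reader API
# (load with vocabs/batchsz also appears in Example #4), but treat the exact
# return values as assumptions for your installed version.
vocabs, labels = reader.build_vocab([train_file, valid_file, test_file])
train_set = reader.load(train_file, vocabs=vocabs, batchsz=50)
valid_set = reader.load(valid_file, vocabs=vocabs, batchsz=50)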
Example #3
import argparse

import torch
import baseline
from torch.utils.data import Dataset, TensorDataset

parser = argparse.ArgumentParser()
parser.add_argument('--mxlen', help='Maximum post length (number of words) during training', type=int, default=100)
parser.add_argument('--train', help='Training file', default='../data/stsa.binary.phrases.train')
parser.add_argument('--valid', help='Validation file', default='../data/stsa.binary.dev')
parser.add_argument('--test', help='Testing file', default='../data/stsa.binary.test')
parser.add_argument('--embeddings', help='Pretrained embeddings file', default='/data/embeddings/GoogleNews-vectors-negative300.bin')
parser.add_argument('--ll', help='Log level', type=str, default='info')
parser.add_argument('--lr', help='Learning rate', type=float, default=0.001)
parser.add_argument("--device", type=str,
                    default="cuda" if torch.cuda.is_available() else "cpu",
                    help="Device (cuda or cpu)")
args = parser.parse_known_args()[0]


feature_desc = {
    'word': {
        'vectorizer': baseline.Token1DVectorizer(mxlen=100, transform_fn=baseline.lowercase),
        'embed': {'file': args.embeddings, 'type': 'default', 'unif': 0.25}
    }
}
# Create a reader that uses our vectorizers to parse a TSV file
# with rows like:
# <label>\t<sentence>\n
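
# The reader the comment above describes; its construction is not shown in
# this excerpt, so this mirrors the setup from Example #2
vectorizers = {k: v['vectorizer'] for k, v in feature_desc.items()}
reader = baseline.TSVSeqLabelReader(vectorizers,
                                    clean_fn=baseline.TSVSeqLabelReader.do_clean)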

class DictionaryDatasetWrapper(Dataset):
    def __init__(self, x, x_lengths, y):
        self.tensor_dataset = TensorDataset(x, x_lengths, y)

    def __getitem__(self, index):
        # Pull the raw tensors and repackage them as the ({features}, y) pair
        # the training loop expects, moving everything to the target device
        x, x_length, y = self.tensor_dataset[index]
        return {'word': x.to(args.device), "lengths": x_length.to(args.device)}, y.to(args.device)

    def __len__(self):
        return len(self.tensor_dataset)
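
# A minimal usage sketch (the tensors here are illustrative placeholders, not
# from the original): the wrapper works with a standard PyTorch DataLoader,
# whose default collation handles the dict-shaped features
from torch.utils.data import DataLoader

x = torch.zeros(4, 100, dtype=torch.long)          # hypothetical vectorized words
lengths = torch.full((4,), 100, dtype=torch.long)  # hypothetical true lengths
y = torch.zeros(4, dtype=torch.long)               # hypothetical labels
loader = DataLoader(DictionaryDatasetWrapper(x, lengths, y), batch_size=2)
for features, labels in loader:
    print(features['word'].shape, features['lengths'].shape, labels.shape)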
Example #4
"""Take a file of TSVs where the format is `label<\t>content` and convert it to an NPZ file of vectors from pretrained embeddings"""
import argparse
import os

import numpy as np
import baseline as bl

BP = '../data'
TRAIN = os.path.join(BP, 'stsa.binary.phrases.train')
VALID = os.path.join(BP, 'stsa.binary.dev')
TEST = os.path.join(BP, 'stsa.binary.test')
LABELS = os.path.join(BP, 'stsa.binary.labels')
W2V_GN_300 = '/data/embeddings/GoogleNews-vectors-negative300.bin'
VECTORIZERS = {'word': bl.Token1DVectorizer(mxlen=40)}
# convert_input below uses a reader the original excerpt never constructs;
# this mirrors the TSVSeqLabelReader setup from the earlier examples
reader = bl.TSVSeqLabelReader(VECTORIZERS, clean_fn=bl.TSVSeqLabelReader.do_clean)


def output_file(input_file):
    return input_file + '.npz'


def convert_input(file, embeddings, batchsz=50):
    batch_x = []
    batch_y = []
    dsz = embeddings.get_dsz()
    ts = reader.load(file, vocabs={'word': embeddings.vocab}, batchsz=batchsz)
    pg = bl.create_progress_bar(len(ts))
    for batch in pg(ts):
        x = batch['word']
        B, T = x.shape
        flat_x = x.reshape(B * T)
        # The original excerpt ends here; the rest is a plausible completion,
        # assuming the embeddings object exposes a `weights` matrix indexable
        # by token id (verify against your embeddings model's API)
        dense = embeddings.weights[flat_x].reshape(B, T, dsz)
        batch_x.append(dense)
        batch_y.append(batch['y'])
    np.savez(output_file(file), x=np.concatenate(batch_x), y=np.concatenate(batch_y))
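

# A minimal end-to-end sketch. bl.load_embeddings and its keyword arguments
# follow mead-baseline's API but are an assumption here; verify the call and
# the shape of its return value against your installed version.
if __name__ == '__main__':
    loaded = bl.load_embeddings('word', embed_file=W2V_GN_300, embed_type='default')
    for split in (TRAIN, VALID, TEST):
        convert_input(split, loaded['embeddings'])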