def _ldWordVectors(self, path, vocab):
    """
    Reads in word embeddings restricted to a subset of tokens.

    Each line of the file is expected to hold a token, a single space, and
    then the whitespace-separated components of its vector.

    Parameters
    ----------
    path : str
      path where word embeddings are saved
    vocab : { Object:int }
      which embeddings we should pull and their index

    Returns
    ----------
    embeddings : (N, D) numpy float array
      embedding matrix, N = len(vocab).  Rows of tokens that were not found
      in the file are left as zeros.  D is the length of the last vector
      that was read (0 when no token was found).

    Raises
    ----------
    ValueError
      if a vector component of a requested token cannot be parsed as float
    """
    wvecs = {}
    embDim = 0

    f = open_file(path, 'rt', encoding='utf8')
    try:
        for line in f:
            word, sep, _ = line.partition(' ')
            if not sep or word not in vocab:
                continue

            values = line.split()[1:]
            if not values:
                # A token without any components would otherwise reset
                # embDim to 0 and produce an unusable empty row.
                continue

            wemb = np.asarray([float(w) for w in values])
            embDim = wemb.shape[0]
            wvecs[vocab[word]] = wemb
    finally:
        # Release the handle even when a malformed line aborts the read.
        f.close()

    embeddings = np.zeros((len(vocab), embDim))
    for index, wemb in wvecs.items():
        embeddings[index, :] = wemb

    print('Loaded word vecs: %d unigrams found' % (len(wvecs)))

    return embeddings
def serialize(self, path):
    """
    Pickle this object to a file.

    Parameters
    ----------
    path : str
      where to write the pickled object; opened with open_file in 'wb' mode
    """
    outFile = open_file(path, 'wb')
    try:
        pickle.dump(self, outFile)
    finally:
        # Close the handle even if pickling fails part-way through.
        outFile.close()
def deserialize(path):
    """
    Load a pickled object from a file.

    Parameters
    ----------
    path : str
      file to read; opened with open_file in 'rb' mode

    Returns
    ----------
    classifier : Object
      whatever object was pickled into the file
    """
    f = open_file(path, 'rb')
    try:
        # SECURITY: pickle.load can execute arbitrary code while unpickling.
        # Only call this on files that come from a trusted source.
        classifier = pickle.load(f)
    finally:
        # Close the handle even if unpickling fails.
        f.close()
    return classifier
def loadData(path, depvars, proptest=None):
    """
    Read data.  Each line contains a single JSON record with the tweet text,
    labels it's been assigned, and train/dev/test fold.  Missing labels are
    denoted with null values (blank strings are treated as missing too).

    The file is read twice: a first pass discovers the dependent variables
    and which kind of fold assignment is present, a second pass splits the
    examples into train and test.

    Fold handling:
      * numeric folds: the highest index seen is the test fold, the rest are
        kept as given
      * an explicit 'dev' fold: train is fold 0, dev is fold 1
      * otherwise: 'train' examples are placed in a random fold

    Parameters
    ----------
    path : str
      path to data file, joined onto DATA_DIR
    depvars : [ str ]
      dependent variables to extract; if empty (or None), extracts all
      dependent variables found in the data
    proptest : float
      if set, an example without a 'fold' field is assigned to 'test' with
      this probability and to 'train' otherwise

    Returns
    ----------
    trainDocs : [ str ]
      documents to train on, just text
    testDocs : [ str ]
      documents to test on, text
    trainLabels : [ [ Object ] ]
      labels for train set, one for each dependent variable, None if missing
    testLabels : [ [ Object ] ]
      labels for test set
    folds : [ int ]
      fold each training example is placed in
    tuningFolds : [ int ]
      which folds to evaluate on
    depvars : [ str ]
      all dependent variables if depvars was empty
    alphabets : { str:{ Object:int } }
      dictionary of labels for each label type; the most frequent label of
      each type is swapped into index 0

    Raises
    ----------
    KeyError
      if a record has no 'fold' field and proptest is not set
    Exception
      if a record's fold is not 'train', 'dev', 'test' or numeric
    """

    def _isNumericFold(fold):
        # Folds may be ints or strings that start with digits.
        return isinstance(fold, int) or re.match(r'\d+', fold)

    def _cleanLabel(value):
        # Null and blank-string labels both count as missing.
        if value is None:
            return None
        if isinstance(value, str) and not value.strip():
            return None
        return value

    random.seed(SEED)
    np.random.seed(SEED)

    trainDocs = []
    testDocs = []
    trainLabels = []
    testLabels = []

    # See if we need to split into 5 folds, or if they are given explicitly.
    # When numeric folds are given, assumes highest index fold is the test fold.
    allDepVars = set()  # all dependent features in data
    hasDevFold = False  # tuning fold explicitly set
    testFoldIdx = -1

    f = open_file(os.path.join(DATA_DIR, path), 'rt')
    try:
        for ln in f:
            try:
                tweet = json.loads(ln)
            except ValueError:
                continue

            # keep track of all dependent variables
            allDepVars.update(tweet['label'].keys())

            if 'fold' not in tweet:
                continue
            fold = tweet['fold']
            if fold == 'dev':
                hasDevFold = True
            elif _isNumericFold(fold):
                testFoldIdx = max(testFoldIdx, int(fold))
    finally:
        f.close()

    if not depvars:
        depvars = sorted(allDepVars)

    # keep track of label frequency; built only once depvars is final
    labelCounts = [{} for _ in depvars]
    labelAlphabets = {v: Alphabet() for v in depvars}

    if testFoldIdx > -1:
        # folds are already numbered, treat highest as test fold
        NUM_FOLDS = 1 + testFoldIdx
        tuningFolds = list(range(NUM_FOLDS - 1))
    elif not hasDevFold:
        # assign to folds myself
        NUM_FOLDS = 5
        tuningFolds = list(range(NUM_FOLDS - 1))  # TODO these are not being assigned!
    else:
        NUM_FOLDS = 3
        tuningFolds = [1]

    testFoldIdx = NUM_FOLDS - 1

    folds = []

    f = open_file(os.path.join(DATA_DIR, path), 'rt')
    try:
        for ln in f:
            try:
                tweet = json.loads(ln)
            except ValueError:
                continue

            # make our own test set by rolling a die, if fold is not given
            if (proptest is not None) and ('fold' not in tweet):
                tweet['fold'] = 'test' if random.random() < proptest else 'train'

            fold = tweet['fold']
            labels = [_cleanLabel(tweet['label'].get(v)) for v in depvars]

            if 'text' in tweet:
                text = tweet['text']
            else:
                # pull out text from the embedded tweet
                text = tweet['tweet']['text']

            for v, label, counts in zip(depvars, labels, labelCounts):
                if label is not None:
                    labelAlphabets[v].put(label)
                    counts[label] = counts.get(label, 0) + 1

            if fold == 'train':
                trainDocs.append(text)
                trainLabels.append(labels)

                if hasDevFold:
                    # train is fold 0, dev is 1, test is 2
                    folds.append(0)
                else:
                    # TODO what if hasDevFold == False?? is this correct moving this into an else?
                    folds.append(np.random.randint(0, NUM_FOLDS - 1))
            elif fold == 'dev':
                # we have an explicitly set dev fold
                trainDocs.append(text)
                trainLabels.append(labels)
                folds.append(1)  # TODO is this correct now if hasDevFold?
            elif fold == 'test':
                testDocs.append(text)
                testLabels.append(labels)
            elif _isNumericFold(fold):
                if int(fold) == testFoldIdx:
                    testDocs.append(text)
                    testLabels.append(labels)
                else:
                    trainDocs.append(text)
                    trainLabels.append(labels)
                    folds.append(int(fold))
            else:
                # Should never hit this
                raise Exception('Example missing fold!', text, labels)
    finally:
        f.close()

    alphabets = {v: alpha._wToI for v, alpha in labelAlphabets.items()}

    # make the class with the most examples be the negative one.  May want to change this
    # eventually to let user set positive class.
    for counts, v in zip(labelCounts, depvars):
        if not counts:
            # No labelled example for this variable, so there is nothing to swap.
            continue

        wToI = alphabets[v]
        majWord = max((c, w) for w, c in counts.items())[1]
        oldNegWord = [w for w in wToI if wToI[w] == 0][0]
        wToI[majWord], wToI[oldNegWord] = wToI[oldNegWord], wToI[majWord]

    return trainDocs, testDocs, trainLabels, testLabels, folds, tuningFolds, depvars, alphabets
def write(self, *args):
    """
    Write to the wrapped stdout stream and, if a log path is set, append the
    same content to the log file.

    The log file is reopened in append mode and closed again on every call;
    the most recent handle is kept in self.outFile.

    Parameters
    ----------
    *args
      passed unchanged to the write method of both streams
    """
    self.stdoutFile.write(*args)

    if self.outPath:
        self.outFile = open_file(self.outPath, 'at')
        try:
            self.outFile.write(*args)
        finally:
            # Do not leak the handle if the write itself fails.
            self.outFile.close()