def get_attributes(args):
    """
    Gather all data necessary for metrics calculations.

    The set of publications is taken from the first of the following keys
    found in ``args``: 'query', 'bibcodes' or 'libid'.

    :param args: dictionary holding one of the keys 'query', 'bibcodes'
                 (an iterable of bibcode strings) or 'libid'
    :return: tuple ``(attr_list, Nciting, Nciting_ref)``: the attribute
             vectors (sorted with ``sort_list_of_lists(attr_list, 2)``),
             the number of distinct citing papers and the number of
             distinct refereed citing papers
    :raises ValueError: if ``args`` contains none of the supported keys
    """
    # Get publication information
    if 'query' in args:
        # If we were fed a query, gather the associated bibcodes
        bibcodes = get_publications_from_query(args['query'])
    elif 'bibcodes' in args:
        # Build a real list: the bibcodes are traversed several times below,
        # which a lazy ``map`` object would not survive
        bibcodes = [bibcode.strip() for bibcode in args['bibcodes']]
    elif 'libid' in args:
        # In theory we allow for retrieving bibcodes from private libraries
        # Clearly this will currently not be used
        bibcodes = get_bibcodes_from_private_library(args['libid'])
    else:
        raise ValueError(
            "get_attributes requires one of 'query', 'bibcodes' or 'libid'")
    # Now gather all usage data numbers from the MongoDB 'adsdata' collection,
    # keyed on bibcode
    ads_data = get_mongo_data(bibcodes=bibcodes)
    missing_bibcodes = [
        bibcode for bibcode in bibcodes if bibcode not in ads_data]
    # Only report an error when there actually is something to report
    if missing_bibcodes:
        app.logger.error("Bibcodes found with missing metadata: %s" %
                         ",".join(missing_bibcodes))
    # Keep only the bibcodes we have metadata for (order is preserved)
    bibcodes = [bibcode for bibcode in bibcodes if bibcode in ads_data]
    # Get precomputed and citation data
    metrics_data = get_metrics_data(bibcodes=bibcodes)
    # Get the number of (distinct) citing papers
    Nciting = len(set(itertools.chain.from_iterable(
        doc['citations'] for doc in metrics_data.values())))
    Nciting_ref = len(set(itertools.chain.from_iterable(
        doc['refereed_citations'] for doc in metrics_data.values())))
    # The attribute vectors will be used to calculate the metrics
    attr_list = make_vectors(bibcodes, ads_data, metrics_data)
    # We sort the entries in the attribute list on citation count, which
    # will make e.g. the calculation of 'h' trivial
    attr_list = sort_list_of_lists(attr_list, 2)

    return attr_list, Nciting, Nciting_ref
def train_model(train_file, dev_file, outpath, epochs):
    """
    Train a tagging model.

    :param train_file: training data, passed to ``utils.generate_input_data``
    :param dev_file: development data, passed to
                     ``utils.generate_input_data``
    :param outpath: location the model is saved to (``torch.save``) at the
                    end of every epoch
    :param epochs: number of passes over the training data
    :return: list ``[model, tags]`` with the trained model and the tag set
    """
    tags, trainloader, dev_x, dev_y = utils.generate_input_data(
        train_file, dev_file)

    V = len(utils.VOCAB)
    E = 50  # Embedding dimensions
    D_in = 5  # pp_w, p_w, words[i], n_w, nn_w
    H = 128  # hidden layers
    D_out = len(tags)  # out layer
    report_every = 2000  # print statistics every 2000 mini-batches

    # Construct our model by instantiating the tagger network
    model = Tagger_Net(V, E, D_in, H, D_out)

    # Move the model to the GPU once, before the optimizer is constructed,
    # rather than on every mini-batch
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    criterion = torch.nn.CrossEntropyLoss()  # cross entropy loss
    optimizer = torch.optim.Adam(model.parameters())  # ADAM

    for epoch in range(epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(trainloader):
            # wrap the inputs in Variable
            inputs, labels = Variable(inputs), Variable(labels)
            if use_cuda:
                inputs, labels = inputs.cuda(), labels.cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # ``Tensor.item()`` is the supported scalar accessor; fall back
            # to the legacy ``loss.data[0]`` on releases that predate it
            if hasattr(loss, 'item'):
                running_loss += loss.item()
            else:
                running_loss += loss.data[0]

            # print statistics
            if i % report_every == report_every - 1:
                # Separate names, so the batch labels are not overwritten
                dev_labels, dev_predicted = utils.get_metrics_data(
                    model, dev_x, dev_y)
                print('[%d, %5d] loss: %.3f dev: %.3f' %
                      (epoch + 1, i + 1, running_loss / report_every,
                       utils.accuracy(dev_labels, dev_predicted, tags)))
                running_loss = 0.0

        # Checkpoint every epoch
        torch.save(model, outpath)

    return [model, tags]
def get_attributes(args):
    """
    Gather all data necessary for metrics calculations.

    The set of publications is taken from the first of the following keys
    found in ``args``: 'query', 'bibcodes' or 'libid'. Bibcodes without
    metrics data, or with an author number equal to zero, are logged and
    left out of the attribute vectors.

    :param args: dictionary holding one of the keys 'query', 'bibcodes'
                 (an iterable of bibcode strings) or 'libid'
    :return: tuple ``(attr_list, Nciting, Nciting_ref)``: the attribute
             vectors (sorted with ``sort_list_of_lists(attr_list, 2)``),
             the number of distinct citing papers and the number of
             distinct papers citing the refereed papers in the set
    :raises ValueError: if ``args`` contains none of the supported keys
    """
    # Get publication information
    if 'query' in args:
        # If we were fed a query, gather the associated bibcodes
        bibcodes = get_publications_from_query(args['query'])
    elif 'bibcodes' in args:
        # Build a real list: the bibcodes are traversed several times below,
        # which a lazy ``map`` object would not survive
        bibcodes = [bibcode.strip() for bibcode in args['bibcodes']]
    elif 'libid' in args:
        # In theory we allow for retrieving bibcodes from private libraries
        # Clearly this will currently not be used
        bibcodes = get_bibcodes_from_private_library(args['libid'])
    else:
        raise ValueError(
            "get_attributes requires one of 'query', 'bibcodes' or 'libid'")
    # Get precomputed metrics data, key-ed on bibcode
    metrics_data = get_metrics_data(bibcodes=bibcodes)
    missing_bibcodes = [
        bibcode for bibcode in bibcodes if bibcode not in metrics_data]
    if missing_bibcodes:
        app.logger.error("Bibcodes found with missing metrics data: %s" %
                         ",".join(missing_bibcodes))
    # Keep only the bibcodes we have metrics data for (order is preserved)
    bibcodes = [bibcode for bibcode in bibcodes if bibcode in metrics_data]
    bibcodes_without_authnums = [
        doc['_id'] for doc in metrics_data.values() if doc['author_num'] == 0]
    if bibcodes_without_authnums:
        app.logger.error(
            "Bibcodes found with author number equal to zero: %s" %
            ",".join(bibcodes_without_authnums))
        # Set membership keeps this filter linear in the number of bibcodes
        excluded = set(bibcodes_without_authnums)
        bibcodes = [
            bibcode for bibcode in bibcodes if bibcode not in excluded]
    # Get the number of (distinct) citing papers
    # NOTE(review): both counts run over every record in metrics_data,
    # including records whose bibcode was excluded above -- confirm intended
    Nciting = len(set(itertools.chain.from_iterable(
        doc['citations'] for doc in metrics_data.values())))
    # Nciting_ref refers to citation to the refereed papers in the set.
    # The explicit comparison is kept on purpose: only a value equal to
    # True marks a record as refereed.
    Nciting_ref = len(set(itertools.chain.from_iterable(
        doc['citations'] for doc in metrics_data.values()
        if doc['refereed'] == True)))
    # The attribute vectors will be used to calculate the metrics
    attr_list = make_vectors(bibcodes, metrics_data)
    # We sort the entries in the attribute list on citation count, which
    # will make e.g. the calculation of 'h' trivial
    attr_list = sort_list_of_lists(attr_list, 2)

    return attr_list, Nciting, Nciting_ref
def train_model(train_file, dev_file, outpath, repr, epochs):
    """
    Train a BiLSTM tagging model on top of a chosen word representation.

    :param train_file: training data, passed to ``utils.generate_input_data``
    :param dev_file: development data, passed to
                     ``utils.generate_input_data``
    :param outpath: location the trained model is saved to (``torch.save``)
    :param repr: representation selector: "a" (``ReprA``), "b" (``ReprB``),
                 "c" (``ReprC``) or "d" (``ReprD``). The name shadows the
                 builtin but is part of the public signature.
    :param epochs: number of passes over the training data
    :return: list of dictionaries with the keys "Epoch", "Loss" and "Dev",
             collected during training
    :raises ValueError: if ``repr`` is not one of the supported selectors
    """
    # Reject unknown selectors before any data is loaded
    if repr not in ("a", "b", "c", "d"):
        raise ValueError(
            "unknown representation %r (expected 'a', 'b', 'c' or 'd')"
            % (repr,))

    train_stats = []
    if repr == "c":
        tags, train_batcher, X_dev, y_dev = utils.generate_input_data(
            train_file, dev_file, ixs=3, pretrained=True)
    else:
        tags, train_batcher, X_dev, y_dev = utils.generate_input_data(
            train_file, dev_file, pretrained=True, chars=True)

    V = len(utils.VOCAB)
    C = len(utils.CHARS)
    E = 25  # Char Embedding dimensions
    R = 50  # Representation dimensions
    H = 128  # hidden layers
    D_out = len(tags)  # out layer

    # Choose our representation
    if repr == "a":
        reprW = ReprA(V, R)  # part 1
    elif repr == "b":
        reprW = ReprB(V, E, R)  # part 2
    elif repr == "c":
        reprW = ReprC(V, R)  # part 3
    else:
        reprW = ReprD(C, V, R)  # part 4

    # Init our model
    model = BiLSTM_Tagger(reprW, H, D_out)

    # Move the model to the GPU once, before the optimizer is constructed,
    # rather than on every batch
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    criterion = torch.nn.CrossEntropyLoss()  # cross entropy loss
    optimizer = torch.optim.Adam(model.parameters())  # ADAM

    start = time.time()
    for epoch in range(epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_time = time.time()
        train_size = train_batcher.batch_count()
        data_count = 0
        for i, data in enumerate(train_batcher.get_batches(), 0):
            # NOTE(review): ``data`` is unpacked below as an
            # (inputs, labels) pair, so len(data) looks like it is always 2
            # rather than the number of examples -- confirm the intended
            # sampling interval
            data_count += len(data)
            if data_count > 500:
                dev_labels, dev_predicted = utils.get_metrics_data(
                    model, X_dev, y_dev)
                train_stats.append({
                    "Epoch": epoch + 1,
                    "Loss": running_loss / train_size,
                    "Dev": utils.accuracy(dev_labels, dev_predicted, tags)})
                data_count -= 500

            inputs, labels = data
            labels = Variable(torch.LongTensor(labels))
            if use_cuda:
                labels = labels.cuda()
            labels = torch.cat(labels)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            # ``Tensor.item()`` is the supported scalar accessor; fall back
            # to the legacy ``loss.data[0]`` on releases that predate it
            if hasattr(loss, 'item'):
                running_loss += loss.item()
            else:
                running_loss += loss.data[0]
            loss.backward()
            optimizer.step()

        # epoch stats
        end = time.time()
        dev_labels, dev_predicted = utils.get_metrics_data(
            model, X_dev, y_dev)
        print('Epoch [%d] loss: %.3f dev: %.3f epoch time %f runtime %f' %
              (epoch + 1, running_loss / train_size,
               utils.accuracy(dev_labels, dev_predicted, tags),
               end - epoch_time, end - start))

    # Save model
    torch.save(model, outpath)
    return train_stats