示例#1
0
def get_attributes(args):
    """
    Gather all data necessary for metrics calculations

    The publications are taken from the first of these keys found in `args`:
      'query'    -- resolved through get_publications_from_query
      'bibcodes' -- an iterable of strings; each one is stripped of
                    surrounding whitespace
      'libid'    -- resolved through get_bibcodes_from_private_library

    Bibcodes for which get_mongo_data returns no entry are logged as an
    error and left out of the rest of the calculation.

    Returns the tuple (attr_list, Nciting, Nciting_ref):
      attr_list   -- result of make_vectors, passed through
                     sort_list_of_lists with index 2
      Nciting     -- number of distinct entries over all 'citations' lists
                     in the metrics data
      Nciting_ref -- number of distinct entries over all
                     'refereed_citations' lists in the metrics data

    Raises ValueError when `args` contains none of the three keys.
    """
    # Get publication information
    if 'query' in args:
        # If we were fed a query, gather the associated bibcodes
        bibcodes = get_publications_from_query(args['query'])
    elif 'bibcodes' in args:
        # A real list (not a lazy map object): it is iterated several times
        bibcodes = [bibcode.strip() for bibcode in args['bibcodes']]
    elif 'libid' in args:
        # In theory we allow for retrieving bibcodes from private libraries
        # Clearly this will currently not be used
        bibcodes = get_bibcodes_from_private_library(args['libid'])
    else:
        # Without this branch 'bibcodes' would be unbound further down
        raise ValueError(
            "No publications specified: expected 'query', 'bibcodes' or 'libid'")
    # Now gather all usage data numbers from the MongoDB 'adsdata' collection,
    # keyed on bibcode
    ads_data = get_mongo_data(bibcodes=bibcodes)
    # A set gives constant-time membership tests for the two passes below
    known_bibcodes = set(ads_data.keys())
    missing_bibcodes = [b for b in bibcodes if b not in known_bibcodes]
    if missing_bibcodes:
        # Only report when something is actually missing
        app.logger.error("Bibcodes found with missing metadata: %s" % ",".join(missing_bibcodes))
    bibcodes = [b for b in bibcodes if b in known_bibcodes]
    # Get precomputed and citation data
    metrics_data = get_metrics_data(bibcodes=bibcodes)
    # Get the number of citing papers
    Nciting = len(set(itertools.chain.from_iterable(
        entry['citations'] for entry in metrics_data.values())))
    Nciting_ref = len(set(itertools.chain.from_iterable(
        entry['refereed_citations'] for entry in metrics_data.values())))
    # The attribute vectors will be used to calculate the metrics
    attr_list = make_vectors(bibcodes, ads_data, metrics_data)
    # We sort the entries in the attribute list on citation count, which
    # will make e.g. the calculation of 'h' trivial
    attr_list = sort_list_of_lists(attr_list, 2)

    return attr_list, Nciting, Nciting_ref
示例#2
0
def train_model(train_file, dev_file, outpath, epochs):
    """
    trains a tagging model

    The training batches and the dev set come from
    utils.generate_input_data(train_file, dev_file). A Tagger_Net is
    optimised with Adam on a cross entropy loss for `epochs` passes over the
    training batches. Every 2000 mini-batches the average running loss and
    the dev accuracy are printed, and at the end of every epoch the model is
    saved to `outpath` with torch.save.

    Returns the list [model, tags].
    """
    tags, trainloader, dev_x, dev_y = utils.generate_input_data(
        train_file, dev_file)

    V = len(utils.VOCAB)
    E = 50  # Embedding dimensions
    D_in = 5  # pp_w, p_w, words[i], n_w, nn_w
    H = 128  # hidden layers
    D_out = len(tags)  # out layer

    # Construct our model by instantiating the class defined above
    model = Tagger_Net(V, E, D_in, H, D_out)
    # Decide on the device once and move the model before the optimizer is
    # built, instead of re-checking and re-moving on every mini-batch
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()
    criterion = torch.nn.CrossEntropyLoss()  # cross entropy loss
    optimizer = torch.optim.Adam(model.parameters())  # ADAM

    for epoch in range(epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(trainloader):
            # Tensors are passed directly; the Variable wrapper has been a
            # no-op since PyTorch 0.4
            if use_cuda:
                inputs, labels = inputs.cuda(), labels.cuda()
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # print statistics
            # loss is a 0-dim tensor: .item() extracts the Python number
            # (indexing it with [0] raises on PyTorch >= 0.4)
            running_loss += loss.item()
            if i % 2000 == 1999:  # print every 2000 mini-batches
                # Separate names so the training labels are not shadowed
                dev_labels, dev_predicted = utils.get_metrics_data(
                    model, dev_x, dev_y)
                print('[%d, %5d] loss: %.3f dev: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000,
                       utils.accuracy(dev_labels, dev_predicted, tags)))
                running_loss = 0.0
        # Checkpoint every epoch
        torch.save(model, outpath)

    return [model, tags]
示例#3
0
def get_attributes(args):
    """
    Gather all data necessary for metrics calculations

    The publications are taken from the first of these keys found in `args`:
      'query'    -- resolved through get_publications_from_query
      'bibcodes' -- an iterable of strings; each one is stripped of
                    surrounding whitespace
      'libid'    -- resolved through get_bibcodes_from_private_library

    Bibcodes without an entry in the metrics data, and bibcodes whose entry
    has 'author_num' equal to zero, are logged as errors and left out of the
    attribute vectors.

    Returns the tuple (attr_list, Nciting, Nciting_ref):
      attr_list   -- result of make_vectors, passed through
                     sort_list_of_lists with index 2
      Nciting     -- number of distinct entries over the 'citations' lists
                     of all metrics records
      Nciting_ref -- the same count restricted to records whose 'refereed'
                     field equals True

    Raises ValueError when `args` contains none of the three keys.
    """
    # Get publication information
    if 'query' in args:
        # If we were fed a query, gather the associated bibcodes
        bibcodes = get_publications_from_query(args['query'])
    elif 'bibcodes' in args:
        # A real list (not a lazy map object): it is iterated several times
        bibcodes = [bibcode.strip() for bibcode in args['bibcodes']]
    elif 'libid' in args:
        # In theory we allow for retrieving bibcodes from private libraries
        # Clearly this will currently not be used
        bibcodes = get_bibcodes_from_private_library(args['libid'])
    else:
        # Without this branch 'bibcodes' would be unbound further down
        raise ValueError(
            "No publications specified: expected 'query', 'bibcodes' or 'libid'")
    # Get precomputed metrics data, key-ed on bibcode
    metrics_data = get_metrics_data(bibcodes=bibcodes)
    # A set gives constant-time membership tests for the passes below
    known_bibcodes = set(metrics_data.keys())
    missing_bibcodes = [b for b in bibcodes if b not in known_bibcodes]
    if missing_bibcodes:
        app.logger.error("Bibcodes found with missing metrics data: %s" % ",".join(missing_bibcodes))
    # Records that report no authors at all cannot be used either
    bibcodes_without_authnums = [entry['_id'] for entry in metrics_data.values()
                                 if entry['author_num'] == 0]
    if bibcodes_without_authnums:
        app.logger.error("Bibcodes found with author number equal to zero: %s" % ",".join(bibcodes_without_authnums))
    authorless = set(bibcodes_without_authnums)
    bibcodes = [b for b in bibcodes
                if b in known_bibcodes and b not in authorless]
    # Get the number of citing papers
    Nciting = len(set(itertools.chain.from_iterable(
        entry['citations'] for entry in metrics_data.values())))
    # Nciting_ref refers to citation to the refereed papers in the set
    # The explicit comparison with True is kept on purpose: the type stored
    # in 'refereed' is not visible here, so plain truthiness could select
    # different records
    Nciting_ref = len(set(itertools.chain.from_iterable(
        entry['citations'] for entry in metrics_data.values()
        if entry['refereed'] == True)))
    # The attribute vectors will be used to calculate the metrics
    attr_list = make_vectors(bibcodes, metrics_data)
    # We sort the entries in the attribute list on citation count, which
    # will make e.g. the calculation of 'h' trivial
    attr_list = sort_list_of_lists(attr_list, 2)

    return attr_list, Nciting, Nciting_ref
def train_model(train_file, dev_file, outpath, repr, epochs):
    """
    trains a tagging model

    `repr` selects the word representation fed to the BiLSTM_Tagger:
      "a" -> ReprA(V, R)
      "b" -> ReprB(V, E, R)
      "c" -> ReprC(V, R)
      "d" -> ReprD(C, V, R)
    (the parameter name shadows the builtin `repr`; it is kept because it is
    part of the signature callers use).

    The model is optimised with Adam on a cross entropy loss for `epochs`
    passes over the batches of the training batcher. After every epoch the
    loss, dev accuracy and timings are printed; once training is finished
    the model is saved to `outpath` with torch.save.

    Returns a list of dicts with the keys "Epoch", "Loss" and "Dev" that
    were recorded during training.

    Raises ValueError when `repr` is not one of "a", "b", "c" or "d".
    """
    # Fail early and clearly: an unknown value used to surface only after
    # the data was loaded, as an unbound 'reprW'
    if repr not in ("a", "b", "c", "d"):
        raise ValueError(
            "Unknown representation %r: expected 'a', 'b', 'c' or 'd'" % (repr,))

    train_stats = []
    if repr == "c":
        tags, train_batcher, X_dev, y_dev = utils.generate_input_data(train_file, dev_file, ixs=3, pretrained=True)
    else:
        tags, train_batcher, X_dev, y_dev = utils.generate_input_data(train_file, dev_file, pretrained=True, chars=True)
    V = len(utils.VOCAB)
    C = len(utils.CHARS)
    E = 25  # Char Embedding dimensions
    R = 50  # Representation dimensions
    H = 128  # hidden layers
    D_out = len(tags)  # out layer

    # Choose our representation
    if repr == "a":
        reprW = ReprA(V, R)  # part 1
    elif repr == "b":
        reprW = ReprB(V, E, R)  # part 2
    elif repr == "c":
        reprW = ReprC(V, R)  # part 3
    else:
        reprW = ReprD(C, V, R)  # part 4

    # Init our model
    model = BiLSTM_Tagger(reprW, H, D_out)
    # Decide on the device once and move the model before the optimizer is
    # built, instead of re-checking and re-moving on every batch
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()
    criterion = torch.nn.CrossEntropyLoss()  # cross entropy loss
    optimizer = torch.optim.Adam(model.parameters())  # ADAM
    start = time.time()

    for epoch in range(epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_time = time.time()
        train_size = train_batcher.batch_count()
        data_count = 0
        for data in train_batcher.get_batches():
            # NOTE(review): data is unpacked below as an (inputs, labels)
            # pair, so len(data) adds 2 per batch rather than the number of
            # examples -- confirm whether the 500 threshold was meant to
            # count examples. Behaviour is left unchanged here.
            data_count += len(data)
            if data_count > 500:
                # Separate names so the training labels are not shadowed
                dev_labels, dev_predicted = utils.get_metrics_data(model, X_dev, y_dev)
                train_stats.append({"Epoch": epoch + 1, "Loss": running_loss / train_size,
                                    "Dev": utils.accuracy(dev_labels, dev_predicted, tags)})
                data_count -= 500

            inputs, labels = data
            labels = torch.LongTensor(labels)
            if use_cuda:
                labels = labels.cuda()
            # One flat target vector for the loss. This matches the former
            # torch.cat over the rows of the label tensor, which current
            # PyTorch rejects for a single tensor argument
            # (assumes labels is at most 2-D -- TODO confirm)
            labels = labels.view(-1)
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            # loss is a 0-dim tensor: .item() extracts the Python number
            # (indexing it with [0] raises on PyTorch >= 0.4)
            running_loss += loss.item()
            loss.backward()
            optimizer.step()

        # epoch stats
        end = time.time()
        dev_labels, dev_predicted = utils.get_metrics_data(model, X_dev, y_dev)
        print('Epoch [%d] loss: %.3f dev: %.3f epoch time %f runtime %f' % (epoch + 1, running_loss / train_size,
                                                                            utils.accuracy(dev_labels, dev_predicted, tags),
                                                                            end - epoch_time, end - start))
    # Save model
    torch.save(model, outpath)
    return train_stats