Example #1
    def prepare(self, dim, sd):
        """
        Make torch Tensors from g2-`dim`-`sd` and infer labels.
        Args:
            dim:
            sd:

        Returns:

        """
        filename = 'g2-{}-{}.txt'.format(dim, sd)
        data = []
        target = []
        with open(os.path.join(self.root, filename)) as in_f:
            for i, line in enumerate(in_f):
                a, b = list(map(int, line.split())), 0 if i < 1024 else 1
                data.append(a)
                target.append(b)
        data = torch.Tensor(data)
        target = torch.Tensor(target)

        if self.stardardize:
            data = (data - 550) / 50

        return data, target
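A minimal usage sketch (the class name `G2Dataset` and its constructor arguments are assumptions; the label rule comes straight from the loop above):

# Hypothetical wrapper class exposing the prepare() method shown above.
dataset = G2Dataset(root='./data', stardardize=True)
data, target = dataset.prepare(dim=2, sd=50)
print(data.shape)       # e.g. torch.Size([2048, 2]) for a 2048-point g2 file
print(target.unique())  # tensor([0., 1.]): first 1024 rows -> 0, the rest -> 1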
Example #2
    def prepare(self, *select):
        """

        Args:
            *select:

        Returns:

        """
        datafile, labelfile = self.files(*select)
        data_filepath = os.path.join(self.root, datafile)
        label_filepath = os.path.join(self.root, labelfile)
        data = []
        target = []
        with open(data_filepath) as data_f, open(label_filepath) as label_f:
            for x, y in zip(data_f, it.islice(label_f, self.sync_files, None)):
                data.append(list(map(int, x.split())))
                target.append(int(y))
        data = torch.Tensor(data)
        target = torch.Tensor(target)

        if self.stardardize:
            data_mean = data.mean(dim=0, keepdim=True)
            data_std = data.std(dim=0, keepdim=True)
            data = (data - data_mean) / data_std

        return data, target
Example #3
def cbow(sentences, window_size):
    """
    Create data based on
    skip-gram approach aka
    (predict context word
    from target word).
    """

    data = []
    for pair in sentences:

        # Extract data
        doc_id, sentence = pair[0], pair[1]

        # For each index
        for i in range(window_size, len(sentence)-window_size):

            # Collect contexts
            context = [sentence[i - size]
                       for size in range(window_size, -window_size - 1, -1)
                       if size != 0]

            # Target
            target = sentence[i]

            # Add to data
            data.append((doc_id, context, target))

    return data
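A toy run showing the output format, one (doc_id, context, target) tuple per eligible position:

sentences = [(0, [10, 11, 12, 13, 14])]   # (doc_id, token ids)
print(cbow(sentences, window_size=2))
# [(0, [10, 11, 13, 14], 12)]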
Example #4
def skip_gram(sentences, window_size):
    """
    Create data based on
    skip-gram approach aka
    (predict context word
    from target word).
    """

    data = []
    for sentence in sentences:

        # For each index
        for i, index in enumerate(sentence):

            # Collect valid context indexes
            contexts = []
            for window in range(window_size):

                # left side
                if i-(window+1) >= 0:
                    contexts.append(sentence[i-(window+1)])
                # right side
                if i+(window+1) < len(sentence):
                    contexts.append(sentence[i+(window+1)])

            # Add to data
            for context in contexts:
                data.append((index, context))

    return data
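The same kind of toy input yields (target, context) pairs instead:

print(skip_gram([[10, 11, 12]], window_size=1))
# [(10, 11), (11, 10), (11, 12), (12, 11)]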
Example #5
    def get_paths(self):
        print('Identifying %s dataset.' % self.split)
        data = []
        labels = []

        # Get the corresponding label for each image.
        for line in self.lines:
            imgpath = line
            img_filename = ntpath.basename(imgpath)
            anno_filename = img_filename.replace('jpg', 'png')

            labpath = imgpath.replace('imgs', 'annos').replace(img_filename, anno_filename)

            if not os.path.exists(labpath):
                print('Could not find label for %s.' % imgpath)
                continue

            data.append(imgpath)
            labels.append(labpath)

        if self.split in ['train', 'val']:
            self.train_data = data
            self.train_labels = labels
            self.val_data = self.train_data[-self.val_samples:]
            self.val_labels = self.train_labels[-self.val_samples:]
            self.train_data = self.train_data[:-self.val_samples]
            self.train_labels = self.train_labels[:-self.val_samples]
        else:
            self.test_data = data
            self.test_labels = labels
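The string replacements above imply the following directory convention (a sketch of the assumed layout):

# image:      .../imgs/<name>.jpg
# annotation: .../annos/<name>.png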
Example #6
    def prepare(self):
        """
        Make torch Tensors from data and label files.
        Returns:

        """
        datafile = self.urls[0].rpartition('/')[2]
        data_filepath = os.path.join(self.root, datafile)
        data = []
        target = []
        with open(data_filepath) as data_f:
            for sample in data_f:
                x, y, label = tuple(map(float, sample.split()))
                data.append([x, y])
                target.append(int(label) - 1)
        data = torch.Tensor(data)
        target = torch.Tensor(target)

        if self.stardardize:
            data_mean = data.mean(dim=0, keepdim=True)
            data_std = data.std(dim=0, keepdim=True)
            data = (data - data_mean) / data_std

        return data, target
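With `stardardize` enabled, each of the two feature columns should come out with roughly zero mean and unit standard deviation; a quick sanity check, assuming the dataset object is called `ds` (hypothetical name):

data, target = ds.prepare()
print(data.mean(dim=0))  # approximately tensor([0., 0.])
print(data.std(dim=0))   # approximately tensor([1., 1.])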
Example #7
    parser = argparse.ArgumentParser()
    parser.add_argument("--fst_dset", type=str, help="first directory in shared dir")
    parser.add_argument("--snd_dset", type=str, help="second directory in shared dir")
    opt = parser.parse_args()
    print(opt)

    def load_image(infilename):
        img = Image.open(infilename)
        img.load()
        data = np.asarray(img, dtype="float64")
        return data

    path = '/share/se3/export/data/'+opt.fst_dset+'/'+opt.snd_dset+'/'
    data = []
    counter = 0
    for filename in os.listdir(path):
        if counter > 15000:
            break
        if counter % 1000 == 0:
            print('files read', counter)
        img = load_image(path + filename)
        img *= 255.0 / img.max()
        data.append(np.transpose(img))
        counter += 1
    data = torch.from_numpy(np.stack(data))
    torch.save(data, opt.fst_dset + '_' + opt.snd_dset + '.pt')
    print("Calculating Inception Score...")
    print(inception_score(data, cuda=False, batch_size=32, resize=True, splits=10))
Example #8
    def __init__(self,
                 root,
                 image_set,
                 year,
                 img_size,
                 shots=1,
                 phase=1,
                 shuffle=False):
        self.shuffle = shuffle
        self.img_size = img_size
        self.phase = phase
        subset = 'shots'
        self.shot_path = os.path.join(root, 'annotations',
                                      'instances_{}2014.json'.format(subset))
        self.shots = shots
        # if phase == 2:
        #     self.shots = shots * 3

        # name, paths
        self._year = year
        self._image_set = image_set
        self._data_path = root

        # load COCO API, classes, class <-> id mappings
        self._COCO = COCO(self._get_ann_file())
        self.json_data = self._COCO.dataset.copy()
        cats = self._COCO.loadCats(self._COCO.getCatIds())

        self._classes = tuple(
            ['__background__'] +
            [c['name'] for c in cats if c['name'] not in cfg.VOC_CLASSES] +
            [c['name'] for c in cats if c['name'] in cfg.VOC_CLASSES])

        self._class_to_coco_cat_id = dict(
            list(zip([c['name'] for c in cats], self._COCO.getCatIds())))
        self._image_index = self._load_image_set_index()

        # Some image sets are "views" (i.e. subsets) into others.
        # For example, minival2014 is a random 5000 image subset of val2014.
        # This mapping tells us where the view's images and proposals come from.
        self._view_map = {
            'minival2014': 'val2014',  # 5k val2014 subset
            'valminusminival2014': 'val2014',  # val2014 \setminus minival2014
            'valminuscapval2014': 'val2014',
            'capval2014': 'val2014',
            'captest2014': 'val2014',
            'shots2014': 'train2014'
        }
        coco_name = image_set + year  # e.g., "val2014"
        self._data_name = (self._view_map[coco_name]
                           if coco_name in self._view_map else coco_name)

        if phase == 1:
            self.metaclass = tuple(
                [c['name'] for c in cats if c['name'] not in cfg.VOC_CLASSES])
        else:
            self.metaclass = tuple(
                [c['name'] for c in cats if c['name'] not in cfg.VOC_CLASSES] +
                [c['name'] for c in cats if c['name'] in cfg.VOC_CLASSES])
        class_to_idx = dict(zip(self.metaclass, range(len(self.metaclass))))  # class to index mapping

        self.prndata = []
        self.prncls = []

        prn_image_pth = os.path.join(root, 'annotations',
                                     'prn_image_{}shots.pt'.format(shots))
        prn_mask_pth = os.path.join(root, 'annotations',
                                    'prn_mask_{}shots.pt'.format(shots))

        if os.path.exists(prn_image_pth) and os.path.exists(
                prn_mask_pth) and self.phase == 1:
            prn_image = torch.load(prn_image_pth)
            prn_mask = torch.load(prn_mask_pth)
        else:
            prn_image, prn_mask = self.get_prndata()

            torch.save(prn_image, prn_image_pth)
            torch.save(prn_mask, prn_mask_pth)

        for i in range(shots):
            cls = []
            data = []
            for n, key in enumerate(list(prn_image.keys())):
                img = torch.from_numpy(
                    np.array(prn_image[key][i % len(prn_image[key])]))
                img = img.unsqueeze(0)
                mask = torch.from_numpy(
                    np.array(prn_mask[key][i % len(prn_mask[key])]))
                mask = mask.unsqueeze(0)
                mask = mask.unsqueeze(3)
                imgmask = torch.cat([img, mask], dim=3)
                cls.append(class_to_idx[key])
                data.append(imgmask.permute(0, 3, 1, 2).contiguous())
            self.prncls.append(cls)
            self.prndata.append(torch.cat(data, dim=0))
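Following the tensor operations above, each element of `self.prndata` stacks one support example per class as a 4-channel tensor (RGB image plus the mask as a fourth channel). A hedged access sketch; the class name `COCOMeta` and the constructor values are placeholders:

dset = COCOMeta(root, 'shots', '2014', img_size=224, shots=5, phase=1)
support = dset.prndata[0]   # shape: (num_classes, 4, H, W)
classes = dset.prncls[0]    # class indices aligned with the first dimension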
Example #9
def read_langs(file_name, entity, can, ind2cand, max_line=None):
    logging.info(("Reading lines from {}".format(file_name)))
    # Read the file and split into lines
    data = []
    context = ""
    u = None
    r = None
    with open(file_name) as fin:
        cnt_ptr = 0
        cnt_voc = 0
        max_r_len = 0
        cnt_lin = 1
        for line in fin:
            line = line.strip()
            if line:
                nid, line = line.split(' ', 1)
                if '\t' in line:
                    u, r = line.split('\t')
                    context += str(u) + " "
                    contex_arr = context.split(' ')[LIMIT:]
                    r_index = []
                    gate = []
                    for key in r.split(' '):
                        if (key in entity):
                            index = [
                                loc for loc, val in enumerate(contex_arr)
                                if val == key
                            ]
                            if (index):
                                index = max(index)
                                gate.append(1)
                                cnt_ptr += 1
                            else:
                                index = len(contex_arr)
                                gate.append(0)
                                cnt_voc += 1
                            r_index.append(index)

                    if (len(r_index) == 0):
                        r_index = [
                            len(contex_arr),
                            len(contex_arr),
                            len(contex_arr),
                            len(contex_arr)
                        ]
                    if (len(r_index) == 1):
                        r_index.append(len(contex_arr))
                        r_index.append(len(contex_arr))
                        r_index.append(len(contex_arr))

                    if len(r_index) > max_r_len:
                        max_r_len = len(r_index)

                    data.append(
                        [" ".join(contex_arr) + " $$$$", can[r], r_index, r])
                    context += str(r) + " "
                else:
                    r = line
                    context += str(r) + " "
            else:
                cnt_lin += 1
                if (max_line and cnt_lin >= max_line):
                    break
                context = ""
    max_len = max([len(d[0].split(' ')) for d in data])
    logging.info("Pointer percentace= {} ".format(cnt_ptr /
                                                  (cnt_ptr + cnt_voc)))
    logging.info("Max responce Len: {}".format(max_r_len))
    logging.info("Max Input Len: {}".format(max_len))
    return data, max_len, max_r_len
Example #10
def read_langs(file_name, max_line=None):
    print(("Reading lines from {}".format(file_name)))
    data, context_arr, conv_arr, kb_arr = [], [], [], []
    max_resp_len = 0

    with open('data/MULTIWOZ2.1/global_entities.json') as f:
        global_entity = json.load(f)

    with open(file_name) as fin:
        cnt_lin, sample_counter = 1, 1
        for line in fin:
            line = line.strip()
            if line:
                if line[-1] == line[0] == "#":
                    line = line.replace("#", "")
                    task_type = line
                    continue

                nid, line = line.split(' ', 1)
                if '\t' in line:
                    u, r, gold_ent = line.split('\t')
                    gen_u = generate_memory(u, "$u", str(nid))
                    context_arr += gen_u
                    conv_arr += gen_u

                    # Get gold entity for each domain
                    gold_ent = ast.literal_eval(gold_ent)
                    ent_idx_restaurant, ent_idx_attraction, ent_idx_hotel = [], [], []
                    if task_type == "restaurant":
                        ent_idx_restaurant = gold_ent
                    elif task_type == "attraction":
                        ent_idx_attraction = gold_ent
                    elif task_type == "hotel":
                        ent_idx_hotel = gold_ent
                    ent_index = list(set(ent_idx_restaurant + ent_idx_attraction + ent_idx_hotel))

                    # Get local pointer position for each word in system response
                    ptr_index = []
                    for key in r.split():
                        index = [loc for loc, val in enumerate(context_arr) if (val[0] == key and key in ent_index)]
                        if (index):
                            index = max(index)
                        else:
                            index = len(context_arr)
                        ptr_index.append(index)

                    # Get global pointer labels for words in system response, the 1 in the end is for the NULL token
                    selector_index = [1 if (word_arr[0] in ent_index or word_arr[0] in r.split()) else 0 for word_arr in
                                      context_arr] + [1]

                    sketch_response, gold_sketch = generate_template(global_entity, r, gold_ent, kb_arr, task_type)

                    data_detail = {
                        'context_arr': list(context_arr + [['$$$$'] * MEM_TOKEN_SIZE]),  # $$$$ is NULL token
                        'response': r,
                        'sketch_response': sketch_response,
                        'gold_sketch': gold_sketch,
                        'ptr_index': ptr_index + [len(context_arr)],
                        'selector_index': selector_index,
                        'ent_index': ent_index,
                        'ent_idx_restaurant': list(set(ent_idx_restaurant)),
                        'ent_idx_attraction': list(set(ent_idx_attraction)),
                        'ent_idx_hotel': list(set(ent_idx_hotel)),
                        'conv_arr': list(conv_arr),
                        'kb_arr': list(kb_arr),
                        'id': int(sample_counter),
                        'ID': int(cnt_lin),
                        'domain': task_type}
                    data.append(data_detail)

                    gen_r = generate_memory(r, "$s", str(nid))
                    context_arr += gen_r
                    conv_arr += gen_r
                    if max_resp_len < len(r.split()):
                        max_resp_len = len(r.split())
                    sample_counter += 1
                else:
                    r = line
                    kb_info = generate_memory(r, "", str(nid))
                    context_arr = kb_info + context_arr
                    kb_arr += kb_info
            else:
                cnt_lin += 1
                context_arr, conv_arr, kb_arr = [], [], []
                if (max_line and cnt_lin >= max_line):
                    break

    return data, max_resp_len
Example #11
def read_langs(file_name, global_entity, type_dict, max_line=None):
    # print(("Reading lines from {}".format(file_name)))
    data, context_arr, conv_arr, kb_arr = [], [], [], []
    max_resp_len, sample_counter = 0, 0
    with open(file_name) as fin:
        cnt_lin = 1
        for line in fin:
            line = line.strip()
            if line:
                nid, line = line.split(' ', 1)
                # print("line", line)
                if '\t' in line:
                    u, r = line.split('\t')
                    gen_u = generate_memory(u, "$u", str(nid))
                    context_arr += gen_u
                    conv_arr += gen_u
                    ptr_index, ent_words = [], []

                    # Get local pointer position for each word in system response
                    for key in r.split():
                        if key in global_entity and key not in ent_words:
                            ent_words.append(key)
                        index = [
                            loc for loc, val in enumerate(context_arr)
                            if (val[0] == key and key in global_entity)
                        ]
                        index = max(index) if (index) else len(context_arr)
                        ptr_index.append(index)

                    # Get global pointer labels for words in system response, the 1 in the end is for the NULL token
                    selector_index = [
                        1 if (word_arr[0] in ent_words
                              or word_arr[0] in r.split()) else 0
                        for word_arr in context_arr
                    ] + [1]

                    sketch_response = generate_template(
                        global_entity, r, type_dict)

                    data_detail = {
                        'context_arr': list(context_arr + [['$$$$'] * MEM_TOKEN_SIZE]),  # $$$$ is NULL token
                        'response': r,
                        'sketch_response': sketch_response,
                        'ptr_index': ptr_index + [len(context_arr)],
                        'selector_index': selector_index,
                        'ent_index': ent_words,
                        'ent_idx_cal': [],
                        'ent_idx_nav': [],
                        'ent_idx_wet': [],
                        'conv_arr': list(conv_arr),
                        'kb_arr': list(kb_arr),
                        'id': int(sample_counter),
                        'ID': int(cnt_lin),
                        'domain': ""
                    }
                    data.append(data_detail)

                    gen_r = generate_memory(r, "$s", str(nid))
                    context_arr += gen_r
                    conv_arr += gen_r
                    if max_resp_len < len(r.split()):
                        max_resp_len = len(r.split())
                    sample_counter += 1
                else:
                    r = line
                    kb_info = generate_memory(r, "", str(nid))
                    context_arr = kb_info + context_arr
                    kb_arr += kb_info
            else:
                cnt_lin += 1
                context_arr, conv_arr, kb_arr = [], [], []
                if (max_line and cnt_lin >= max_line):
                    break

    return data, max_resp_len
Example #12
    def get_video_frames(self, video_index):
        front_vid_path, tac_path, pos_path, label_path, vid_frames, tac_frames, label = self.video_data[video_index]
        data = []
        for vid, tac, lab in zip(vid_frames, tac_frames, label):
            data.append(self.custom_getitem(front_vid_path, vid, lab, tac_path, pos_path, tac))
        return data, label_path
Example #13
def read_langs(file_name,
               gating_dict,
               SLOTS,
               dataset,
               lang,
               mem_lang,
               sequicity,
               training,
               max_line=None):
    print(("Reading from {}".format(file_name)))
    data = []
    max_resp_len, max_value_len = 0, 0
    domain_counter = {}
    with open(file_name) as f:
        dials = json.load(f)
        # create vocab first
        for dial_dict in dials:
            if (args["all_vocab"] or dataset == "train") and training:
                for ti, turn in enumerate(dial_dict["dialogue"]):
                    lang.index_words(turn["system_transcript"], 'utter')
                    lang.index_words(turn["transcript"], 'utter')
        # determine training data ratio, default is 100%
        if training and dataset == "train" and args["data_ratio"] != 100:
            random.Random(10).shuffle(dials)
            dials = dials[:int(len(dials) * 0.01 * args["data_ratio"])]

        cnt_lin = 1
        for dial_dict in dials:
            dialog_history = ""
            last_belief_dict = {}
            # Filtering and counting domains
            for domain in dial_dict["domains"]:
                if domain not in EXPERIMENT_DOMAINS:
                    continue
                if domain not in domain_counter.keys():
                    domain_counter[domain] = 0
                domain_counter[domain] += 1

            # Unseen domain setting
            if args["only_domain"] != "" and args[
                    "only_domain"] not in dial_dict["domains"]:
                continue
            if (args["except_domain"] != "" and dataset == "test" and args["except_domain"] not in dial_dict["domains"]) or \
               (args["except_domain"] != "" and dataset != "test" and [args["except_domain"]] == dial_dict["domains"]):
                continue

            # Reading data
            for ti, turn in enumerate(dial_dict["dialogue"]):
                turn_domain = turn["domain"]
                turn_id = turn["turn_idx"]
                turn_uttr = turn["system_transcript"] + " ; " + turn[
                    "transcript"]
                turn_uttr_strip = turn_uttr.strip()
                dialog_history += (turn["system_transcript"] + " ; " +
                                   turn["transcript"] + " ; ")
                source_text = dialog_history.strip()
                turn_belief_dict = fix_general_label_error(
                    turn["belief_state"], False, SLOTS)

                # Generate domain-dependent slot list
                slot_temp = SLOTS
                if dataset == "train" or dataset == "dev":
                    if args["except_domain"] != "":
                        slot_temp = [
                            k for k in SLOTS if args["except_domain"] not in k
                        ]
                        turn_belief_dict = OrderedDict([
                            (k, v) for k, v in turn_belief_dict.items()
                            if args["except_domain"] not in k
                        ])
                    elif args["only_domain"] != "":
                        slot_temp = [
                            k for k in SLOTS if args["only_domain"] in k
                        ]
                        turn_belief_dict = OrderedDict([
                            (k, v) for k, v in turn_belief_dict.items()
                            if args["only_domain"] in k
                        ])
                else:
                    if args["except_domain"] != "":
                        slot_temp = [
                            k for k in SLOTS if args["except_domain"] in k
                        ]
                        turn_belief_dict = OrderedDict([
                            (k, v) for k, v in turn_belief_dict.items()
                            if args["except_domain"] in k
                        ])
                    elif args["only_domain"] != "":
                        slot_temp = [
                            k for k in SLOTS if args["only_domain"] in k
                        ]
                        turn_belief_dict = OrderedDict([
                            (k, v) for k, v in turn_belief_dict.items()
                            if args["only_domain"] in k
                        ])

                turn_belief_list = [
                    str(k) + '-' + str(v) for k, v in turn_belief_dict.items()
                ]

                if (args["all_vocab"] or dataset == "train") and training:
                    mem_lang.index_words(turn_belief_dict, 'belief')

                class_label, generate_y, slot_mask, gating_label  = [], [], [], []
                start_ptr_label, end_ptr_label = [], []
                for slot in slot_temp:
                    if slot in turn_belief_dict.keys():
                        generate_y.append(turn_belief_dict[slot])

                        if turn_belief_dict[slot] == "dontcare":
                            gating_label.append(gating_dict["dontcare"])
                        elif turn_belief_dict[slot] == "none":
                            gating_label.append(gating_dict["none"])
                        else:
                            gating_label.append(gating_dict["ptr"])

                        if max_value_len < len(turn_belief_dict[slot]):
                            max_value_len = len(turn_belief_dict[slot])

                    else:
                        generate_y.append("none")
                        gating_label.append(gating_dict["none"])

                # The original content can be reconstructed from the ID and turn_idx
                data_detail = {
                    "ID": dial_dict["dialogue_idx"],
                    "domains": dial_dict["domains"],
                    "turn_domain": turn_domain,
                    "turn_id": turn_id,
                    "dialog_history": source_text,
                    "turn_belief": turn_belief_list,
                    "gating_label": gating_label,
                    "turn_uttr": turn_uttr_strip,
                    'generate_y': generate_y
                }
                data.append(data_detail)

                if max_resp_len < len(source_text.split()):
                    max_resp_len = len(source_text.split())

            cnt_lin += 1
            if (max_line and cnt_lin >= max_line):
                break

    # add t{} to the lang file
    if "t{}".format(max_value_len -
                    1) not in mem_lang.word2index.keys() and training:
        for time_i in range(max_value_len):
            mem_lang.index_words("t{}".format(time_i), 'utter')

    print("domain_counter", domain_counter)
    return data, max_resp_len, slot_temp
Example #14
def main():
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    global args, best_prec1
    best_prec1 = 0
    args = parser.parse_args()
    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    args.noise = not args.no_noise
    args.quant = not args.no_quantization
    args.act_quant = not args.no_act_quantization
    args.quant_edges = not args.no_quant_edges

    logging.info("saving to %s", save_path)
    logging.debug("run arguments: %s", args)

    if args.gpus is not None:
        args.gpus = [int(i) for i in args.gpus.split(',')]
        device = 'cuda:' + str(args.gpus[0])
        cudnn.benchmark = True
    else:
        device = 'cpu'
    dtype = torch.float32

    args.step_setup = None

    model = models.__dict__[args.model]
    model_config = {
        'scale': args.scale,
        'input_size': args.input_size,
        'dataset': args.dataset,
        'bitwidth': args.bitwidth,
        'quantize': args.quant,
        'noise': args.noise,
        'step': args.step,
        'depth': args.depth,
        'act_bitwidth': args.act_bitwidth,
        'act_quant': args.act_quant,
        'quant_edges': args.quant_edges,
        'step_setup': args.step_setup,
        'quant_epoch_step': args.quant_epoch_step,
        'quant_start_stage': args.quant_start_stage,
        'normalize': args.no_pre_process_normalize,
        'noise_mask': args.noise_mask
    }

    if args.model_config != '':
        model_config = dict(model_config, **literal_eval(args.model_config))

    # create model
    model = model(**model_config)
    logging.info("creating model %s", args.model)
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print("number of parameters: ", params)
    logging.info("created model with configuration: %s", model_config)
    print(model)

    data = None
    checkpoint_epoch = 0
    # optionally resume from a checkpoint
    if args.evaluate:
        if not os.path.isfile(args.evaluate):
            parser.error('invalid checkpoint: {}'.format(args.evaluate))
        checkpoint = torch.load(args.evaluate, map_location=device)
        load_model(model, checkpoint)
        logging.info("loaded checkpoint '%s' (epoch %s)", args.evaluate,
                     checkpoint['epoch'])

        print("loaded checkpoint {0} (epoch {1})".format(
            args.evaluate, checkpoint['epoch']))

    elif args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=device)
            if not args.start_from_zero:
                args.start_epoch = checkpoint['epoch'] - 1
            best_test = checkpoint['best_prec1']
            checkpoint_epoch = checkpoint['epoch']

            load_model(model, checkpoint)

            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        elif os.path.isdir(args.resume):
            checkpoint_path = os.path.join(args.resume, 'checkpoint.pth.tar')
            csv_path = os.path.join(args.resume, 'results.csv')
            print("=> loading checkpoint '{}'".format(checkpoint_path))
            checkpoint = torch.load(checkpoint_path, map_location=device)
            best_test = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_path, checkpoint['epoch']))
            data = []
            with open(csv_path) as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    data.append(row)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if args.gpus is not None:
        model = torch.nn.DataParallel(
            model, [args.gpus[0]]
        )  # Statistics need to be calculated on a single GPU to be consistent with data among multiple GPUs

    # Data loading code
    default_transform = {
        'train':
        get_transform(args.dataset,
                      input_size=args.input_size,
                      augment=True,
                      integer_values=args.quant_dataloader,
                      norm=not args.no_pre_process_normalize),
        'eval':
        get_transform(args.dataset,
                      input_size=args.input_size,
                      augment=False,
                      integer_values=args.quant_dataloader,
                      norm=not args.no_pre_process_normalize)
    }
    transform = getattr(model.module, 'input_transform', default_transform)

    val_data = get_dataset(args.dataset,
                           'val',
                           transform['eval'],
                           datasets_path=args.datapath)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.val_batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    train_data = get_dataset(args.dataset,
                             'train',
                             transform['train'],
                             datasets_path=args.datapath)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)

    statistics_train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.act_stats_batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.decay,
                                nesterov=True)
    model, criterion = model.to(device, dtype), criterion.to(device, dtype)
    if args.clr:
        scheduler = CyclicLR(optimizer,
                             base_lr=args.min_lr,
                             max_lr=args.max_lr,
                             step_size=args.epochs_per_step *
                             len(train_loader),
                             mode=args.mode)
    else:
        scheduler = MultiStepLR(optimizer,
                                milestones=args.schedule,
                                gamma=args.gamma)

    csv_logger = CsvLogger(filepath=save_path, data=data)
    csv_logger.save_params(sys.argv, args)
    csv_logger_training_stats = os.path.join(save_path, 'training_stats.csv')

    # pre-training activation and parameters statistics calculation ####
    if check_if_need_to_collect_statistics(model):
        for layer in model.modules():
            if isinstance(layer, actquant.ActQuantBuffers):
                layer.pre_training_statistics = True  # Turn on pre-training activation statistics calculation
        model.module.statistics_phase = True

        validate(
            statistics_train_loader,
            model,
            criterion,
            device,
            epoch=0,
            num_of_batches=80,
            stats_phase=True)  # Run validation on training set for statistics
        model.module.quantize.get_act_max_value_from_pre_calc_stats(
            list(model.modules()))
        _ = model.module.quantize.set_weight_basis(list(model.modules()), None)

        for layer in model.modules():
            if isinstance(layer, actquant.ActQuantBuffers):
                layer.pre_training_statistics = False  # Turn off pre-training activation statistics calculation
        model.module.statistics_phase = False

    else:  # Maximal activation values still need to be derived from loaded stats
        model.module.quantize.assign_act_clamp_during_val(list(
            model.modules()),
                                                          print_clamp_val=True)
        model.module.quantize.assign_weight_clamp_during_val(
            list(model.modules()), print_clamp_val=True)
        # model.module.quantize.get_act_max_value_from_pre_calc_stats(list(model.modules()))

    if args.gpus is not None:  # Return to Multi-GPU after statistics calculations
        model = torch.nn.DataParallel(model.module, args.gpus)
        model, criterion = model.to(device, dtype), criterion.to(device, dtype)

    # pre-training activation statistics calculation ####

    if args.evaluate:
        val_loss, val_prec1, val_prec5 = validate(val_loader,
                                                  model,
                                                  criterion,
                                                  device,
                                                  epoch=0)
        print("val_prec1: ", val_prec1)
        return

    # fast forward to curr stage
    for i in range(args.quant_start_stage):
        model.module.switch_stage(0)

    for epoch in trange(args.start_epoch, args.epochs + 1):

        if not isinstance(scheduler, CyclicLR):
            scheduler.step()

        #     scheduler.optimizer = optimizer
        train_loss, train_prec1, train_prec5 = train(
            train_loader,
            model,
            criterion,
            device,
            epoch,
            optimizer,
            scheduler,
            training_stats_logger=csv_logger_training_stats)

        for layer in model.modules():
            if isinstance(layer, actquant.ActQuantBuffers):
                layer.print_clamp()

        # evaluate on validation set

        val_loss, val_prec1, val_prec5 = validate(val_loader, model, criterion,
                                                  device, epoch)

        # remember best prec@1 and save checkpoint
        is_best = val_prec1 > best_prec1
        best_prec1 = max(val_prec1, best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': args.model,
                'config': args.model_config,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'layers_b_dict': model.module.layers_b_dict  # TODO: this doesn't work for multi-GPU - needs to be removed
            },
            is_best,
            path=save_path)
        # New type of logging
        csv_logger.write({
            'epoch': epoch + 1,
            'val_error1': 1 - val_prec1,
            'val_error5': 1 - val_prec5,
            'val_loss': val_loss,
            'train_error1': 1 - train_prec1,
            'train_error5': 1 - train_prec5,
            'train_loss': train_loss
        })
        csv_logger.plot_progress(title=args.model + str(args.depth))
        csv_logger.write_text(
            'Epoch {}: Best accuracy is {:.2f}% top-1'.format(
                epoch + 1, best_prec1 * 100.))
Example #15
    target_dir = 'data/modelnet40_2048_category'
    train_files = getDataFiles(os.path.join(base_dir, 'test_files.txt'))
    # print(train_files)
    # TEST_FILES = getDataFiles(os.path.join(BASE_DIR, 'data/modelnet40_ply_hdf5_2048/test_files.txt'))

    shape_names = []
    with open(os.path.join(base_dir, 'shape_names.txt'), 'r') as f:
        shape_names = [line.replace('\n', '') for line in f.readlines()]
    print(shape_names)

    data = []
    label = []
    for fn in range(len(train_files)):
        print('----' + str(fn) + '-----')
        current_data, current_label = loadDataFile(train_files[fn])
        data.append(current_data)
        label.append(current_label)
    data = np.concatenate(data, axis=0)
    label = np.concatenate(label, axis=0)
    print(data.shape)
    print(label.shape)

    phase = 'test'

    for i, shape in enumerate(shape_names):
        indices = np.asarray([ind for ind, l in enumerate(label) if l == i])
        shape_data = data[indices]
        dest_dir = os.path.join(target_dir, shape)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        with h5py.File(os.path.join(dest_dir, '%s_%s.h5' % (shape, phase)),
Example #16
def train_discriminator(dataset,
                        dataset_fp=None,
                        pretrained_model="gpt2-medium",
                        epochs=10,
                        learning_rate=0.0001,
                        batch_size=64,
                        log_interval=10,
                        save_model=False,
                        cached=False,
                        no_cuda=False,
                        output_fp='.'):
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"
    add_eos_token = pretrained_model.startswith("gpt2")

    if save_model:
        if not os.path.exists(output_fp):
            os.makedirs(output_fp)
    classifier_head_meta_fp = os.path.join(
        output_fp, "{}_classifier_head_meta.json".format(dataset))
    classifier_head_fp_pattern = os.path.join(
        output_fp, "{}_classifier_head_epoch".format(dataset) + "_{}.pt")

    print("Preprocessing {} dataset...".format(dataset))
    start = time.time()

    if dataset == "SST":
        idx2class = [
            "positive", "negative", "very positive", "very negative", "neutral"
        ]
        class2idx = {c: i for i, c in enumerate(idx2class)}

        discriminator = Discriminator(class_size=len(idx2class),
                                      pretrained_model=pretrained_model,
                                      cached_mode=cached,
                                      device=device).to(device)

        text = torchtext_data.Field()
        label = torchtext_data.Field(sequential=False)
        train_data, val_data, test_data = datasets.SST.splits(
            text,
            label,
            fine_grained=True,
            train_subtrees=True,
        )

        x = []
        y = []
        #preprocess dataset
        for i in trange(len(train_data), ascii=True):
            seq = TreebankWordDetokenizer().detokenize(
                vars(train_data[i])["text"])
            seq = discriminator.tokenizer.encode(seq)
            if add_eos_token:
                seq = [50256] + seq
            seq = torch.tensor(seq, device=device, dtype=torch.long)
            x.append(seq)
            y.append(class2idx[vars(train_data[i])["label"]])
        train_dataset = Dataset(x, y)

        test_x = []
        test_y = []
        for i in trange(len(test_data), ascii=True):
            seq = TreebankWordDetokenizer().detokenize(
                vars(test_data[i])["text"])
            seq = discriminator.tokenizer.encode(seq)
            if add_eos_token:
                seq = [50256] + seq
            seq = torch.tensor(seq, device=device, dtype=torch.long)
            test_x.append(seq)
            test_y.append(class2idx[vars(test_data[i])["label"]])
        test_dataset = Dataset(test_x, test_y)

        discriminator_meta = {
            "class_size": len(idx2class),
            "embed_size": discriminator.embed_size,
            "pretrained_model": pretrained_model,
            "class_vocab": class2idx,
            "default_class": 2,
        }

    elif dataset == "clickbait":
        idx2class = ["non_clickbait", "clickbait"]
        class2idx = {c: i for i, c in enumerate(idx2class)}

        discriminator = Discriminator(class_size=len(idx2class),
                                      pretrained_model=pretrained_model,
                                      cached_mode=cached,
                                      device=device).to(device)

        with open("datasets/clickbait/clickbait.txt") as f:
            data = []
            for i, line in enumerate(f):
                try:
                    data.append(eval(line))
                except:
                    print("Error evaluating line {}: {}".format(i, line))
                    continue
        x = []
        y = []
        with open("datasets/clickbait/clickbait.txt") as f:
            for i, line in enumerate(tqdm(f, ascii=True)):
                try:
                    d = eval(line)
                    seq = discriminator.tokenizer.encode(d["text"])

                    if len(seq) < max_length_seq:
                        if add_eos_token:
                            seq = [50256] + seq
                        seq = torch.tensor(seq,
                                           device=device,
                                           dtype=torch.long)
                    else:
                        print(
                            "Line {} is longer than maximum length {}".format(
                                i, max_length_seq))
                        continue
                    x.append(seq)
                    y.append(d["label"])
                except:
                    print("Error evaluating / tokenizing"
                          " line {}, skipping it".format(i))
                    pass

        full_dataset = Dataset(x, y)
        train_size = int(0.9 * len(full_dataset))
        test_size = len(full_dataset) - train_size
        train_dataset, test_dataset = torch.utils.data.random_split(
            full_dataset, [train_size, test_size])

        discriminator_meta = {
            "class_size": len(idx2class),
            "embed_size": discriminator.embed_size,
            "pretrained_model": pretrained_model,
            "class_vocab": class2idx,
            "default_class": 1,
        }

    elif dataset == "toxic":
        idx2class = ["non_toxic", "toxic"]
        class2idx = {c: i for i, c in enumerate(idx2class)}

        discriminator = Discriminator(class_size=len(idx2class),
                                      pretrained_model=pretrained_model,
                                      cached_mode=cached,
                                      device=device).to(device)

        x = []
        y = []
        with open("datasets/toxic/toxic_train.txt") as f:
            for i, line in enumerate(tqdm(f, ascii=True)):
                try:
                    d = eval(line)
                    seq = discriminator.tokenizer.encode(d["text"])

                    if len(seq) < max_length_seq:
                        if add_eos_token:
                            seq = [50256] + seq
                        seq = torch.tensor(seq,
                                           device=device,
                                           dtype=torch.long)
                    else:
                        print(
                            "Line {} is longer than maximum length {}".format(
                                i, max_length_seq))
                        continue
                    x.append(seq)
                    y.append(int(np.sum(d["label"]) > 0))
                except:
                    print("Error evaluating / tokenizing"
                          " line {}, skipping it".format(i))
                    pass

        full_dataset = Dataset(x, y)
        train_size = int(0.9 * len(full_dataset))
        test_size = len(full_dataset) - train_size
        train_dataset, test_dataset = torch.utils.data.random_split(
            full_dataset, [train_size, test_size])

        discriminator_meta = {
            "class_size": len(idx2class),
            "embed_size": discriminator.embed_size,
            "pretrained_model": pretrained_model,
            "class_vocab": class2idx,
            "default_class": 0,
        }

    else:  # if dataset == "generic":
        # This assumes the input dataset is a TSV with the following structure:
        # class \t text

        if dataset_fp is None:
            raise ValueError("When generic dataset is selected, "
                             "dataset_fp needs to be specified aswell.")

        idx2class = get_idx2class(dataset_fp)

        discriminator = Discriminator(class_size=len(idx2class),
                                      pretrained_model=pretrained_model,
                                      cached_mode=cached,
                                      device=device).to(device)

        full_dataset = get_generic_dataset(dataset_fp,
                                           discriminator.tokenizer,
                                           device,
                                           idx2class=idx2class,
                                           add_eos_token=add_eos_token)
        train_size = int(0.9 * len(full_dataset))
        test_size = len(full_dataset) - train_size
        train_dataset, test_dataset = torch.utils.data.random_split(
            full_dataset, [train_size, test_size])

        discriminator_meta = {
            "class_size": len(idx2class),
            "embed_size": discriminator.embed_size,
            "pretrained_model": pretrained_model,
            "class_vocab": {c: i
                            for i, c in enumerate(idx2class)},
            "default_class": 0,
        }

    end = time.time()
    print("Preprocessed {} data points".format(
        len(train_dataset) + len(test_dataset)))
    print("Data preprocessing took: {:.3f}s".format(end - start))

    if cached:
        print("Building representation cache...")

        start = time.time()

        train_loader = get_cached_data_loader(train_dataset,
                                              batch_size,
                                              discriminator,
                                              shuffle=True,
                                              device=device)

        test_loader = get_cached_data_loader(test_dataset,
                                             batch_size,
                                             discriminator,
                                             device=device)

        end = time.time()
        print("Building representation cache took: {:.3f}s".format(end -
                                                                   start))

    else:
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   collate_fn=collate_fn)
        test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                                  batch_size=batch_size,
                                                  collate_fn=collate_fn)

    if save_model:
        with open(classifier_head_meta_fp, "w") as meta_file:
            json.dump(discriminator_meta, meta_file)

    optimizer = optim.Adam(discriminator.parameters(), lr=learning_rate)

    test_losses = []
    test_accuracies = []

    for epoch in range(epochs):
        start = time.time()
        print("\nEpoch", epoch + 1)

        train_epoch(discriminator=discriminator,
                    data_loader=train_loader,
                    optimizer=optimizer,
                    epoch=epoch,
                    log_interval=log_interval,
                    device=device)
        test_loss, test_accuracy = evaluate_performance(
            data_loader=test_loader,
            discriminator=discriminator,
            device=device)

        end = time.time()
        print("Epoch took: {:.3f}s".format(end - start))

        test_losses.append(test_loss)
        test_accuracies.append(test_accuracy)

        print("\nExample prediction")
        predict(example_sentence,
                discriminator,
                idx2class,
                cached=cached,
                device=device)

        if save_model:
            # torch.save(discriminator.state_dict(),
            #           "{}_discriminator_{}.pt".format(
            #               args.dataset, epoch + 1
            #               ))
            torch.save(discriminator.get_classifier().state_dict(),
                       classifier_head_fp_pattern.format(epoch + 1))

    min_loss = float("inf")
    min_loss_epoch = 0
    max_acc = 0.0
    max_acc_epoch = 0
    print("Test performance per epoch")
    print("epoch\tloss\tacc")
    for e, (loss, acc) in enumerate(zip(test_losses, test_accuracies)):
        print("{}\t{}\t{}".format(e + 1, loss, acc))
        if loss < min_loss:
            min_loss = loss
            min_loss_epoch = e + 1
        if acc > max_acc:
            max_acc = acc
            max_acc_epoch = e + 1
    print("Min loss: {} - Epoch: {}".format(min_loss, min_loss_epoch))
    print("Max acc: {} - Epoch: {}".format(max_acc, max_acc_epoch))

    return discriminator, discriminator_meta
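A minimal invocation sketch for the generic case (the TSV path below is a placeholder; the file is expected to contain `class \t text` lines, as the comment in the generic branch notes):

discriminator, meta = train_discriminator(
    dataset="generic",
    dataset_fp="my_labels.tsv",   # hypothetical path
    pretrained_model="gpt2-medium",
    epochs=5,
    batch_size=64,
    save_model=True,
    output_fp="discriminator_out",
)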
Example #17
    def load_train_data_test(self):
        """
        todo 학습용 데이터 적재,
        학습에 들어갈 데이터 컬럼명 리스트 작성.
        전체 데이터 컬럼명 작성 (offset 포함된것)

        :return:
        """

        first_layer = ['user', 'product']
        second_layer = ['dense', 'sparse']
        third_layer = ['single', 'seq']

        for f_l in first_layer:
            file_member1 = OrderedDict()
            root_layer_col_name = OrderedDict()

            whole_root_layer_col_name = OrderedDict()
            for s_l in second_layer:
                file_member2 = OrderedDict()
                second_layer_col_name = OrderedDict()

                whole_second_layer_col_name = OrderedDict()
                for t_l in third_layer:

                    path = f"parquet_file/partitioned_data/train/{f_l}/{s_l}/{t_l}"
                    file_list = os.listdir(path)

                    file_list_py = [
                        file for file in file_list if file.endswith(".dms")
                    ]
                    data = list()

                    if file_list_py:
                        for file in file_list_py:
                            # if not empty
                            with open(f'{path}/{file}', 'rb') as f:

                                data.append(
                                    pd.read_parquet(f, engine='pyarrow'))

                        train_data_df = pd.concat(data, ignore_index=True)
                        train_data_df = train_data_df.set_index("idx")
                        # if f_l == 'user' and s_l =='sparse' and t_l =='single':
                        #     print(train_data_df.tail)
                        #     sys.exit()
                        train_data_df.to_csv(f"{f_l}{s_l}{t_l}.csv", mode='w')

                        if self.len == 0:
                            try:
                                self.len = train_data_df.shape[0]

                                if self.len < 1:
                                    raise Exception('empty train data')

                            except Exception as e:
                                print(e)
                                sys.exit(1)
                    else:
                        break

                    file_member2[t_l] = train_data_df.to_numpy()

                    # Keep only the columns that the model will use.
                    bad_list = ["offset"]

                    data = np.asarray(train_data_df.columns)
                    new_list = np.asarray(
                        [x for x in data if x not in bad_list])

                    second_layer_col_name[t_l] = new_list
                    whole_second_layer_col_name[t_l] = data

                file_member1[s_l] = file_member2
                root_layer_col_name[s_l] = second_layer_col_name
                whole_root_layer_col_name[s_l] = whole_second_layer_col_name

            self._data[f_l] = file_member1
            self.col_name[f_l] = root_layer_col_name

            # full list of column names
            self.whole_col_name[f_l] = whole_root_layer_col_name

        # label
        path = "parquet_file/partitioned_data/train/label"
        file_list = os.listdir(path)

        file_list_py = [file for file in file_list if file.endswith(".dms")]

        data = []  # reset: `data` still holds the column-name array from the loop above
        for file in file_list_py:
            # if not empty
            if file:
                with open(f'{path}/{file}', 'rb') as f:
                    data.append(pd.read_parquet(f, engine='pyarrow'))
            label_df = pd.concat(data, ignore_index=True)
            label_df = label_df.set_index("idx")

        label_df = label_df.to_numpy()

        self._label = label_df
Example #18
def read_langs(file_name, max_line=None):
    logging.info(("Reading lines from {}".format(file_name)))
    data = []
    contex_arr = []
    conversation_arr = []
    entity = {}
    u = None
    r = None
    with open(file_name) as fin:
        cnt_ptr = 0
        cnt_voc = 0
        max_r_len = 0
        cnt_lin = 1
        user_counter = 0
        system_counter = 0
        system_res_counter = 0
        KB_counter = 0
        dialog_counter = 0
        for line in fin:
            line = line.strip()
            if line:
                if '#' in line:
                    line = line.replace("#", "")
                    task_type = line
                    continue
                nid, line = line.split(' ', 1)
                if '\t' in line:
                    u, r, gold = line.split('\t')
                    user_counter += 1
                    system_counter += 1

                    gen_u = generate_memory(u, "$u", str(nid))
                    contex_arr += gen_u
                    conversation_arr += gen_u

                    r_index = []
                    gate = []
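                    # for each response token, point to its latest occurrence in
                    # memory (gate=1) or fall back to the vocabulary (gate=0)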
                    for key in r.split(' '):
                        index = [
                            loc for loc, val in enumerate(contex_arr)
                            if (val[0] == key)
                        ]
                        if (index):
                            index = max(index)
                            gate.append(1)
                            cnt_ptr += 1
                        else:
                            index = len(contex_arr)
                            gate.append(0)
                            cnt_voc += 1
                        r_index.append(index)
                        system_res_counter += 1

                    if len(r_index) > max_r_len:
                        max_r_len = len(r_index)
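                    # append a '$$$$' sentinel memory cell; words not found in
                    # memory point to it (their index equals len(contex_arr))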
                    contex_arr_temp = contex_arr + [['$$$$'] * MEM_TOKEN_SIZE]

                    ent_index_calendar = []
                    ent_index_navigation = []
                    ent_index_weather = []

                    gold = ast.literal_eval(gold)
                    if task_type == "weather":
                        ent_index_weather = gold
                    elif task_type == "schedule":
                        ent_index_calendar = gold
                    elif task_type == "navigate":
                        ent_index_navigation = gold

                    ent_index = list(
                        set(ent_index_calendar + ent_index_navigation +
                            ent_index_weather))
                    data.append([
                        contex_arr_temp, r, r_index, gate, ent_index,
                        list(set(ent_index_calendar)),
                        list(set(ent_index_navigation)),
                        list(set(ent_index_weather)),
                        list(conversation_arr)
                    ])

                    gen_r = generate_memory(r, "$s", str(nid))
                    contex_arr += gen_r
                    conversation_arr += gen_r
                else:
                    KB_counter += 1
                    r = line
                    for e in line.split(' '):
                        entity[e] = 0
                    contex_arr += generate_memory(r, "", str(nid))
            else:
                cnt_lin += 1
                entity = {}
                if (max_line and cnt_lin >= max_line):
                    break
                contex_arr = []
                conversation_arr = []
                dialog_counter += 1

    max_len = max([len(d[0]) for d in data])
    logging.info("Pointer percentace= {} ".format(cnt_ptr /
                                                  (cnt_ptr + cnt_voc)))
    logging.info("Max responce Len: {}".format(max_r_len))
    logging.info("Max Input Len: {}".format(max_len))
    logging.info("Avg. User Utterances: {}".format(user_counter * 1.0 /
                                                   dialog_counter))
    logging.info("Avg. Bot Utterances: {}".format(system_counter * 1.0 /
                                                  dialog_counter))
    logging.info("Avg. KB results: {}".format(KB_counter * 1.0 /
                                              dialog_counter))
    logging.info("Avg. responce Len: {}".format(system_res_counter * 1.0 /
                                                system_counter))

    print('Sample: ', data[1][0], data[1][1], data[1][2], data[1][3],
          data[1][4])
    return data, max_len, max_r_len
Пример #19
0
vgg = VGG()
vgg.load_weights("vgg16-00b39a1b.pth")
vgg.cuda()
vgg.eval()

print("load data")
imsize = 32
data = []
for i in range(100):
    l = os.listdir("data/" + str(i))
    l.sort()
    for f in l:
        data.append((
            np.asarray(
                PIL.Image.open("data/" + str(i) + "/" +
                               f).convert("RGB").copy()),
            i,
            "data/" + str(i) + "/" + f,
        ))

print("extract features")
batchsize = 100
featurefile = open("featurefile.txt", "w")


def forwarddata():
    for i in range(0, len(data) + 1 - batchsize, batchsize):
        batchlabel = np.zeros(batchsize, dtype=int)
        batchimage = np.zeros((batchsize, 3, imsize, imsize), dtype=float)
        for j in range(batchsize):
            image, label, name = data[i + j]
Пример #20
0
    def next_batch(self, train=True):
        data = []
        label = []
        if train:
            remaining = self.source_size - self.source_id
            start = self.source_id
            if remaining <= self.source_batch_size:
                for i in self.source_list[start:]:
                    data.append(self.source_text[i, :])
                    label.append(self.label_source[i, :])
                    self.source_id += 1
                self.source_list = random.sample(range(self.source_size),
                                                 self.source_size)
                self.source_id = 0
                for i in self.source_list[0:(self.source_batch_size -
                                              remaining)]:
                    data.append(self.source_text[i, :])
                    label.append(self.label_source[i, :])
                    self.source_id += 1
            else:
                for i in self.source_list[start:start +
                                          self.source_batch_size]:
                    data.append(self.source_text[i, :])
                    label.append(self.label_source[i, :])
                    self.source_id += 1
            remaining = self.target_size - self.target_id
            start = self.target_id
            if remaining <= self.target_batch_size:
                for i in self.target_list[start:]:
                    data.append(self.target_text[i, :])
                    # no target label
                    #label.append(self.label_target[i, :])
                    self.target_id += 1
                self.target_list = random.sample(range(self.target_size),
                                                 self.target_size)
                self.target_id = 0
                for i in self.target_list[0:self.target_batch_size -
                                          remaining]:
                    data.append(self.target_text[i, :])
                    #label.append(self.label_target[i, :])
                    self.target_id += 1
            else:
                for i in self.target_list[start:start +
                                          self.target_batch_size]:
                    data.append(self.target_text[i, :])
                    #label.append(self.label_target[i, :])
                    self.target_id += 1
        else:
            remaining = self.val_size - self.val_id
            start = self.val_id
            if remaining <= self.val_batch_size:
                for i in self.val_list[start:]:
                    data.append(self.val_text[i, :])
                    label.append(self.label_val[i, :])
                    self.val_id += 1
                self.val_list = random.sample(range(self.val_size),
                                              self.val_size)
                self.val_id = 0
                for i in self.val_list[0:self.val_batch_size - remaining]:
                    data.append(self.val_text[i, :])
                    label.append(self.label_val[i, :])
                    self.val_id += 1
            else:
                for i in self.val_list[start:start + self.val_batch_size]:
                    data.append(self.val_text[i, :])
                    label.append(self.label_val[i, :])
                    self.val_id += 1
        data = self.scaler.transform(np.vstack(data))
        label = np.vstack(label)
        return torch.from_numpy(data).float(), torch.from_numpy(label).float()
Пример #21
0
    def __init__(self, dir_path, transforms=None):

        self.dir_path = dir_path

        imgs = []
        paths = []

        anger_path = os.path.join(dir_path, '0')
        disgust_path = os.path.join(dir_path, '1')
        fear_path = os.path.join(dir_path, '2')
        happy_path = os.path.join(dir_path, '3')
        sad_path = os.path.join(dir_path, '4')
        surprise_path = os.path.join(dir_path, '5')
        neutral_path = os.path.join(dir_path, '6')

        paths.append(anger_path)
        paths.append(disgust_path)
        paths.append(fear_path)
        paths.append(happy_path)
        paths.append(sad_path)
        paths.append(surprise_path)
        paths.append(neutral_path)

        image_num = 0
        num0 = 0
        num1 = 0
        num2 = 0
        num3 = 0
        num4 = 0
        num5 = 0
        num6 = 0

        for i in range(7):
            gap = 5

            sequences = os.listdir(paths[i])
            sequences.sort()
            for sequence in sequences:
                txt_path = os.path.join(paths[i], sequence)
                data = []
                img_paths = []
                for line in open(txt_path, "r"):  # open the file and read it line by line
                    data.append(line[:-1])  # append each line (minus the trailing newline) to the list
                for k in range(len(data)):
                    if k == 0:
                        img_paths.append(data[k][2:])
                    else:
                        img_paths.append(data[k][1:])
                    temp = img_paths[k]
                    temp = temp.replace('\\', '/')  # normalize backslashes to forward slashes
                    img_paths[k] = temp

                for id in range(len(img_paths)):
                    if id % gap == 0:
                        # img_p = os.path.join('/home/ubuntu/Code/data/AffWild2/', img_paths[id])
                        img_p = os.path.join('/home/ubuntu/Code/data/',
                                             img_paths[id])
                        img = Image.open(os.path.join(img_p)).convert('RGB')
                        image_num += 1
                        imgs.append((img, i))  # imgs holds (image, label) samples
                        if image_num % 1000 == 0:
                            print(image_num)
        print('********************** total images:', image_num)

        self.imgs = imgs
        self.transform = transforms
Пример #22
0
def read_langs2(source_text,
                utterance,
                gating_dict,
                SLOTS,
                dataset,
                lang,
                mem_lang,
                sequicity,
                training,
                max_line=None):
    data = []
    max_resp_len, max_value_len = 0, 0
    domain_counter = {}
    if 1 == 1:
        # dials = json.load(f)
        dials = []
        # create vocab first
        for dial_dict in dials:
            if (args["all_vocab"] or dataset == "train") and training:
                assert True == False
                for ti, turn in enumerate(dial_dict["dialogue"]):
                    lang.index_words(turn["system_transcript"], 'utter')
                    lang.index_words(turn["transcript"], 'utter')
        # determine training data ratio, default is 100%
        if training and dataset == "train" and args["data_ratio"] != 100:
            random.Random(10).shuffle(dials)
            dials = dials[:int(len(dials) * 0.01 * args["data_ratio"])]

        cnt_lin = 1
        for dial_dict in ['placeholder']:
            dialog_history = source_text
            last_belief_dict = {}
            # Filtering and counting domains
            # for domain in dial_dict["domains"]:
            #     if domain not in EXPERIMENT_DOMAINS:
            #         continue
            #     if domain not in domain_counter.keys():
            #         domain_counter[domain] = 0
            #     domain_counter[domain] += 1

            # Unseen domain setting
            # if args["only_domain"] != "" and args["only_domain"] not in dial_dict["domains"]:
            #     continue
            # if (args["except_domain"] != "" and dataset == "test" and args["except_domain"] not in dial_dict[
            #     "domains"]) or \
            #         (args["except_domain"] != "" and dataset != "test" and [args["except_domain"]] == dial_dict[
            #             "domains"]):
            #     continue

            # Reading data
            for ti, turn in enumerate(['placeholder']):
                turn_domain = ''
                turn_id = '0'
                turn_uttr = utterance
                turn_uttr_strip = turn_uttr.strip()
                dialog_history += source_text
                source_text = dialog_history.strip()
                turn_belief_dict = {}

                # Generate domain-dependent slot list
                slot_temp = SLOTS
                # if dataset == "test":
                #     if args["except_domain"] != "":
                #         slot_temp = [k for k in SLOTS if args["except_domain"] in k]
                #         turn_belief_dict = OrderedDict(
                #             [(k, v) for k, v in turn_belief_dict.items() if args["except_domain"] in k])
                #     elif args["only_domain"] != "":
                #         slot_temp = [k for k in SLOTS if args["only_domain"] in k]
                #         turn_belief_dict = OrderedDict(
                #             [(k, v) for k, v in turn_belief_dict.items() if args["only_domain"] in k])

                # turn_belief_list = [str(k) + '-' + str(v) for k, v in turn_belief_dict.items()]
                turn_belief_list = []

                # if (args["all_vocab"] or dataset == "train") and training:
                #     mem_lang.index_words(turn_belief_dict, 'belief')

                class_label, generate_y, slot_mask, gating_label = [], [], [], []
                start_ptr_label, end_ptr_label = [], []
                # for slot in slot_temp:
                #     if slot in turn_belief_dict.keys():
                #         generate_y.append(turn_belief_dict[slot])
                #
                #         if turn_belief_dict[slot] == "dontcare":
                #             gating_label.append(gating_dict["dontcare"])
                #         elif turn_belief_dict[slot] == "none":
                #             gating_label.append(gating_dict["none"])
                #         else:
                #             gating_label.append(gating_dict["ptr"])
                #
                #         if max_value_len < len(turn_belief_dict[slot]):
                #             max_value_len = len(turn_belief_dict[slot])
                #
                #     else:
                #         generate_y.append("none")
                #         gating_label.append(gating_dict["none"])
                gating_label = [2] * 80
                generate_y = ['none'] * 80

                # the original content can be reconstructed from ID and turn_idx
                data_detail = {
                    "ID": "0",
                    "domains": [],
                    "turn_domain": "",
                    "turn_id": 0,
                    "dialog_history": source_text,
                    "turn_belief": [],
                    "gating_label": gating_label,
                    "turn_uttr": turn_uttr_strip,
                    'generate_y': generate_y
                }
                data.append(data_detail)

                if max_resp_len < len(source_text.split()):
                    max_resp_len = len(source_text.split())

            cnt_lin += 1
            if (max_line and cnt_lin >= max_line):
                break

    # add t{} to the lang file
    # if "t{}".format(max_value_len - 1) not in mem_lang.word2index.keys() and training:
    #     for time_i in range(max_value_len):
    #         mem_lang.index_words("t{}".format(time_i), 'utter')

    # print("domain_counter", domain_counter)
    return data, max_resp_len, slot_temp
Пример #23
0
    def calc_gradients_wrt_output_whole_network_all_tasks(
            self,
            loader,
            out_path,
            if_pretrained_imagenet=False,
            layers=layers_bn_afterrelu,
            neuron_nums=[
                64, 64, 64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512,
                512, 512
            ],
            if_rename_layers=True):
        print(
            "Warning! Assumes that the loader returns only instances of the i-th class in its i-th batch"
        )
        # for model in self.model.values():
        #     model.zero_grad()
        #     model.eval()

        # target_layer_names = [layer.replace('_', '.') for layer in layers]#layers_bn_afterrelu] #+ ['feature_extractor']
        target_layer_names = [
            layer.replace('_', '.') if if_rename_layers else layer
            for layer in layers
        ]

        # neuron_nums = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]

        def save_activation(activations, name, mod, inp, out):
            if name in target_layer_names:
                if_out_is_list = type(out) == list  #backbone output
                if if_out_is_list:
                    out = out[0]  #single-head cifar
                # print(out.shape)
                out.requires_grad_(True)
                # if 'bn1' in name:
                #     out = F.relu(out)
                out.retain_grad()
                activations[name] = out
                if if_out_is_list:
                    out = [out]
                return out

        activations = {}
        hooks = []
        for name, m in self.feature_extractor.named_modules():
            if name in target_layer_names:
                hooks.append(
                    m.register_forward_hook(
                        partial(save_activation, activations, name)))
        hooks.append(
            self.feature_extractor.register_forward_hook(
                partial(save_activation, activations, 'feature_extractor')))

        layer_names_for_pd = []
        neuron_indices_for_pd = []
        mean_grads_for_pd = defaultdict(list)
        if_already_saved_layer_names_and_neuron_indices = False
        n_classes = 10
        if if_pretrained_imagenet:
            n_classes = 1000
        iter_loader = iter(loader)
        for cond_idx in range(n_classes):
            print(cond_idx)
            batch = next(iter_loader)
            cur_grads = defaultdict(
                lambda: defaultdict(list)
            )  # layer -> neuron -> grads from every batch (i.e. 1 scalar per batch)
            ims, labels = batch
            if False:
                mask = (
                    labels == cond_idx
                )  #(labels != cond_idx)#np.array([True] * len(labels))#
                print(labels)
                ims_masked = ims[mask, ...]
                ims_masked = ims_masked.cuda()
            else:
                ims_masked = ims.cuda()
                print(labels)
            out = self.feature_extractor(ims_masked)
            if not if_pretrained_imagenet:
                #single-headed
                y = out[0]
                out_cond = self.model['all'].linear(y)
                out_cond[:, cond_idx].sum().backward()
            else:
                out[:, cond_idx].sum().backward()

            for layer_name in target_layer_names:
                print(layer_name)
                layer_grad = activations[layer_name].grad.detach().cpu()
                n_neurons = neuron_nums[target_layer_names.index(layer_name)]
                # print(layer_grad.shape[1], n_neurons)
                for target_neuron in range(n_neurons):
                    cur_grad = layer_grad[:, target_neuron]
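                    # conv layers: average the gradient over the spatial dims;
                    # FC layers have none, so the mean below fails and is skipped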
                    try:
                        cur_grad = cur_grad.mean(axis=(-1, -2))
                    except:
                        pass
                    # cur_grad = np.sign(cur_grad)
                    # cur_grad[cur_grad < 0] = 0
                    cur_grad = cur_grad.mean().item()
                    cur_grads[layer_name][target_neuron].append(cur_grad)
                    if not if_already_saved_layer_names_and_neuron_indices:
                        layer_names_for_pd.append(layer_name)
                        neuron_indices_for_pd.append(target_neuron)

                activations[layer_name].grad.zero_()

            if_already_saved_layer_names_and_neuron_indices = True  # is set after the first batch of the first cond_idx

            for layer_name in target_layer_names:
                n_neurons = neuron_nums[target_layer_names.index(layer_name)]
                for target_neuron in range(n_neurons):
                    grad_meaned = np.mean(cur_grads[layer_name][target_neuron])
                    mean_grads_for_pd[cond_idx].append(grad_meaned)

        for hook in hooks:
            hook.remove()

        data = []
        for i in range(len(neuron_indices_for_pd)):
            data.append([layer_names_for_pd[i], neuron_indices_for_pd[i]] +
                        [mg[i] for mg in mean_grads_for_pd.values()])
        df = pd.DataFrame(data,
                          columns=['layer_name', 'neuron_idx'] +
                          list(range(n_classes)))
        df.to_pickle(out_path)

        return df
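
The print at the top of this method assumes that the loader argument yields only instances of the i-th class in its i-th batch. A minimal sketch of such a loader is shown below, assuming a torchvision-style dataset that exposes a targets list; make_per_class_loader is an illustrative helper, not part of the original code:

import numpy as np
from torch.utils.data import DataLoader, Subset


def make_per_class_loader(dataset, n_classes, batch_size=32):
    """Yield one batch per class, in class order (illustrative sketch only)."""
    targets = np.asarray(dataset.targets)
    for c in range(n_classes):
        idx = np.where(targets == c)[0][:batch_size]
        # a single batch containing only class-c samples
        yield next(iter(DataLoader(Subset(dataset, idx.tolist()),
                                   batch_size=batch_size)))
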
Пример #24
0
    def init(root, num_query, num_train):
        data_list = [
            'data_batch_1',
            'data_batch_2',
            'data_batch_3',
            'data_batch_4',
            'data_batch_5',
            'test_batch',
        ]
        base_folder = 'cifar-10-batches-py'

        data = []
        targets = []

        for file_name in data_list:
            file_path = os.path.join(root, base_folder, file_name)
            with open(file_path, 'rb') as f:
                if sys.version_info[0] == 2:
                    entry = pickle.load(f)
                else:
                    entry = pickle.load(f, encoding='latin1')
                data.append(entry['data'])
                if 'labels' in entry:
                    targets.extend(entry['labels'])
                else:
                    targets.extend(entry['fine_labels'])

        data = np.vstack(data).reshape(-1, 3, 32, 32)
        data = data.transpose((0, 2, 3, 1))  # convert to HWC
        targets = np.array(targets)

        CIFAR10.ALL_IMG = data
        CIFAR10.ALL_TARGETS = targets

        # sort by class
        sort_index = CIFAR10.ALL_TARGETS.argsort()
        CIFAR10.ALL_IMG = CIFAR10.ALL_IMG[sort_index, :]
        CIFAR10.ALL_TARGETS = CIFAR10.ALL_TARGETS[sort_index]

        # (num_query / number of class) query images per class
        # (num_train / number of class) train images per class
        query_per_class = num_query // 10
        train_per_class = num_train // 10

        # permute indices (range 0 - 6000 per class)
        perm_index = np.random.permutation(CIFAR10.ALL_IMG.shape[0] // 10)
        query_index = perm_index[:query_per_class]
        train_index = perm_index[query_per_class:query_per_class +
                                 train_per_class]

        # repeat the per-class indices for all 10 classes; per-class offsets are
        # added below so they index into the class-sorted arrays
        query_index = np.tile(query_index, 10)
        train_index = np.tile(train_index, 10)
        inc_index = np.array(
            [i * (CIFAR10.ALL_IMG.shape[0] // 10) for i in range(10)])
        query_index = query_index + inc_index.repeat(query_per_class)
        train_index = train_index + inc_index.repeat(train_per_class)

        # split data, tags
        CIFAR10.QUERY_IMG = CIFAR10.ALL_IMG[query_index, :]
        CIFAR10.QUERY_TARGETS = CIFAR10.ALL_TARGETS[query_index]
        CIFAR10.TRAIN_IMG = CIFAR10.ALL_IMG[train_index, :]
        CIFAR10.TRAIN_TARGETS = CIFAR10.ALL_TARGETS[train_index]
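
The comments above describe the per-class split sizes. For context, a hedged usage sketch (the root path and sizes are illustrative, and it assumes init is exposed as a static method of the CIFAR10 class):

CIFAR10.init('./data', num_query=1000, num_train=5000)
print(CIFAR10.QUERY_IMG.shape)    # (1000, 32, 32, 3): 100 query images per class
print(CIFAR10.TRAIN_IMG.shape)    # (5000, 32, 32, 3): 500 train images per class
print(CIFAR10.QUERY_TARGETS[:5])  # labels aligned row-for-row with QUERY_IMG
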
Пример #25
0
def read_langs(file_name, max_line = None):
    print(("Reading lines from {}".format(file_name)))
    data, context_arr, kb_arr, kb_id = [], [], [], []
    max_resp_len = 0
    
    with open('data/KVR/kvret_entities.json') as f:
        global_entity = json.load(f)
        global_entity_list = {}
        for key in global_entity.keys():
            if key != 'poi':
                if key not in global_entity_list:
                    global_entity_list[key] = []
                global_entity_list[key] += [item.lower().replace(' ', '_') for item in global_entity[key]]
            else:
                #global_entity_list['poi'] = [d['poi'].lower().replace(' ', '_') for d in global_entity['poi']]
                for item in global_entity['poi']:
                    for k in item.keys():
                        if k == "type":
                            continue
                        if k not in global_entity_list:
                            global_entity_list[k] = []
                        global_entity_list[k] += [item[k].lower().replace(' ', '_')]
                    #global_entity_list['poi'] = [item[k].lower().replace(' ', '_') for k in item.keys()]
    
    with open(file_name) as fin:
        cnt_lin, sample_counter = 1, 1
        for line in fin:
            line = line.strip()
            if line:
                if '#' in line:
                    line = line.replace("#","")
                    task_type = line
                    continue

                nid, line = line.split(' ', 1)
                if '\t' in line:
                    u, r, gold_ent = line.split('\t')
                    context_arr.append(u.split(' '))
                
                    # Get gold entity for each domain
                    gold_ent = ast.literal_eval(gold_ent)
                    ent_idx_cal, ent_idx_nav, ent_idx_wet = [], [], []
                    if task_type == "weather": ent_idx_wet = gold_ent
                    elif task_type == "schedule": ent_idx_cal = gold_ent
                    elif task_type == "navigate": ent_idx_nav = gold_ent
                    ent_index = list(set(ent_idx_cal + ent_idx_nav + ent_idx_wet))

                    # Get entity set
                    entity_set, entity_type_set = generate_entity_set(kb_arr)
                    entity_set, entity_type_set = generate_entity_from_context(context_arr, global_entity_list, entity_set, entity_type_set)

                    # Get local pointer position for each word in system response
                    ptr_index = []
                    for key in r.split():
                        if key in entity_set:
                            index = entity_set.index(key)
                        else:
                            index = len(entity_set)
                        ptr_index.append(index)
       
                    sketch_response = generate_template(global_entity_list, r, gold_ent, entity_set, entity_type_set, task_type)

                    #add empty token
                    if len(entity_set) == 0:
                        entity_set.append("$$$$")
                        entity_type_set.append("empty_token")
                     
                    entity_set.append("$$$$")
                    entity_type_set.append("empty_token")
                    
                    #generate indicator
                    indicator = generate_indicator(context_arr, entity_set)
                    
                    #generate graph
                    graph = generate_graph(entity_set, relation_set, kb_arr)
                    
                    data_detail = {
                        'context_arr':list(context_arr),
                        'kb_arr':list(entity_set),
                        'response':r.split(' '),
                        'sketch_response':sketch_response.split(' '),
                        'ptr_index':ptr_index+[len(entity_set) - 1],
                        'indicator':indicator,
                        'ent_index':ent_index,
                        'ent_idx_cal':list(set(ent_idx_cal)),
                        'ent_idx_nav':list(set(ent_idx_nav)),
                        'ent_idx_wet':list(set(ent_idx_wet)),
                        'id':int(sample_counter),
                        'ID':int(cnt_lin),
                        'domain':task_type,
                        'graph':graph}
                    data.append(data_detail)
                    
                    context_arr.append(r.split(' '))

                    if max_resp_len < len(r.split()):
                        max_resp_len = len(r.split())
                    sample_counter += 1
                else:
                    kb_id.append(nid)
                    kb_info = line.split(' ')
                    kb_arr.append(kb_info)
                    if len(kb_info) != 5:
                        print(kb_info)
            else:
                cnt_lin += 1
                context_arr, kb_arr, kb_id = [], [], []
                if(max_line and cnt_lin >= max_line):
                    break

    return data, max_resp_len
Пример #26
0
    def __getitem__(self, index):

        ## randomly select a starting frame index T in [0, N - #sample_frames]
        N = self.num_frames[index]
        T = random.randint(0, N - self.opts.sample_frames)

        video = self.task_videos[index][0]

        ## load input and processed frames
        input_dir = os.path.join(self.opts.data_haze_dir, self.mode,
                                 "Rain_Haze", video)
        haze_dir = os.path.join(self.opts.data_haze_dir, self.mode, "Haze",
                                video)
        gt_dir = os.path.join(self.opts.data_haze_dir, self.mode, "GT", video)
        alpha_dir = os.path.join(self.opts.data_haze_dir, self.mode, "Alpha",
                                 video)
        trans_dir = os.path.join(self.opts.data_haze_dir, self.mode, "Trans",
                                 video)

        ## sample frames T+1 to T+#sample_frames
        frame_i = []
        frame_h = []
        frame_a = []
        frame_t = []
        frame_g = []

        for t in range(T + 1, T + self.opts.sample_frames + 1):
            frame_i.append(
                utils.read_img(os.path.join(input_dir, "%d.jpg" % t)))
            frame_h.append(utils.read_img(os.path.join(haze_dir,
                                                       "%d.jpg" % t)))
            frame_a.append(
                utils.read_img(os.path.join(alpha_dir, "%d.jpg" % t)))
            frame_t.append(
                utils.read_img(os.path.join(trans_dir, "%d.jpg" % t)))
            frame_g.append(utils.read_img(os.path.join(gt_dir, "%d.jpg" % t)))

        ## data augmentation
        if self.mode == 'train':
            if self.opts.geometry_aug:

                ## random scale
                H_in = frame_i[0].shape[0]
                W_in = frame_i[0].shape[1]

                sc = np.random.uniform(self.opts.scale_min,
                                       self.opts.scale_max)
                H_out = int(math.floor(H_in * sc))
                W_out = int(math.floor(W_in * sc))

                ## scaled size should be greater than opts.crop_size
                if H_out < W_out:
                    if H_out < self.opts.crop_size:
                        H_out = self.opts.crop_size
                        W_out = int(
                            math.floor(W_in * float(H_out) / float(H_in)))
                else:  ## W_out <= H_out
                    if W_out < self.opts.crop_size:
                        W_out = self.opts.crop_size
                        H_out = int(
                            math.floor(H_in * float(W_out) / float(W_in)))

                for t in range(self.opts.sample_frames):
                    frame_i[t] = cv2.resize(frame_i[t], (W_out, H_out))
                    frame_h[t] = cv2.resize(frame_h[t], (W_out, H_out))
                    frame_a[t] = cv2.resize(frame_a[t], (W_out, H_out))
                    frame_t[t] = cv2.resize(frame_t[t], (W_out, H_out))
                    frame_g[t] = cv2.resize(frame_g[t], (W_out, H_out))

            ## random crop
            cropper = RandomCrop(frame_i[0].shape[:2],
                                 (self.opts.crop_size, self.opts.crop_size))

            for t in range(self.opts.sample_frames):
                frame_i[t] = cropper(frame_i[t])
                frame_h[t] = cropper(frame_h[t])
                frame_a[t] = cropper(frame_a[t])
                frame_t[t] = cropper(frame_t[t])
                frame_g[t] = cropper(frame_g[t])

            if self.opts.geometry_aug:

                ### random rotate
                #rotate = random.randint(0, 3)
                #if rotate != 0:
                #    for t in range(self.opts.sample_frames):
                #        frame_i[t] = np.rot90(frame_i[t], rotate)
                #        frame_p[t] = np.rot90(frame_p[t], rotate)

                ## horizontal flip
                if np.random.random() >= 0.5:
                    for t in range(self.opts.sample_frames):
                        frame_i[t] = cv2.flip(frame_i[t], flipCode=0)
                        frame_h[t] = cv2.flip(frame_h[t], flipCode=0)
                        frame_t[t] = cv2.flip(frame_t[t], flipCode=0)
                        frame_a[t] = cv2.flip(frame_a[t], flipCode=0)
                        frame_g[t] = cv2.flip(frame_g[t], flipCode=0)

            if self.opts.order_aug:
                ## reverse temporal order
                if np.random.random() >= 0.5:
                    frame_i.reverse()
                    frame_h.reverse()
                    frame_a.reverse()
                    frame_t.reverse()
                    frame_g.reverse()

        elif self.mode == "test":
            ## resize image to avoid size mismatch after downsampling and upsampling
            H_i = frame_i[0].shape[0]
            W_i = frame_i[0].shape[1]

            H_o = int(
                math.ceil(float(H_i) / self.opts.size_multiplier) *
                self.opts.size_multiplier)
            W_o = int(
                math.ceil(float(W_i) / self.opts.size_multiplier) *
                self.opts.size_multiplier)

            for t in range(self.opts.sample_frames):
                frame_i[t] = cv2.resize(frame_i[t], (W_o, H_o))
                frame_h[t] = cv2.resize(frame_h[t], (W_o, H_o))
                frame_a[t] = cv2.resize(frame_a[t], (W_o, H_o))
                frame_t[t] = cv2.resize(frame_t[t], (W_o, H_o))
                frame_g[t] = cv2.resize(frame_g[t], (W_o, H_o))
        else:
            raise Exception("Unknown mode (%s)" % self.mode)

        ### convert (H, W, C) array to (C, H, W) tensor
        data = []
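        # the returned list interleaves five tensors per frame:
        # [input, haze, alpha, trans, GT] for each of the sample_frames frames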
        for t in range(self.opts.sample_frames):
            data.append(
                torch.from_numpy(frame_i[t].transpose(2, 0, 1).astype(
                    np.float32)).contiguous())
            data.append(
                torch.from_numpy(frame_h[t].transpose(2, 0, 1).astype(
                    np.float32)).contiguous())
            data.append(
                torch.from_numpy(frame_a[t].transpose(2, 0, 1).astype(
                    np.float32)).contiguous())
            data.append(
                torch.from_numpy(frame_t[t].transpose(2, 0, 1).astype(
                    np.float32)).contiguous())
            data.append(
                torch.from_numpy(frame_g[t].transpose(2, 0, 1).astype(
                    np.float32)).contiguous())
        return data
Пример #27
0
    def __getitem__(self, index):
        if self.training:
            index_ratio = int(self.ratio_index[index])
        else:
            index_ratio = index
        # get the anchor index for current sample index
        # here we set the anchor index to the last one
        # sample in this group
        #index = 32014 # temp hack for testing case where crop excluded gt boxes
        minibatch_db = self._roidb[index_ratio]
        blobs = []
        data = []
        padding_data = []
        im_info = []
        data_heights = []
        data_widths = []
        gt_boxes = []
        gt_boxes_padding = []
        num_boxes = []
        # check for duplicate tracks within same frame
        assert len(minibatch_db[0]['track_id']) == len(np.unique(minibatch_db[0]['track_id'])), \
                'Cannot have >1 track with same id in same frame.'
        assert len(minibatch_db[1]['track_id']) == len(np.unique(minibatch_db[1]['track_id'])), \
                'Cannot have >1 track with same id in same frame.'

        # Iterate through each entry in the sample tuple
        for ientry, entry in enumerate(minibatch_db):
            blobs.append(get_minibatch([entry], self._num_classes))
            data.append(torch.from_numpy(blobs[ientry]['data']))
            im_info.append(torch.from_numpy(blobs[ientry]['im_info']))
            data_heights.append(data[ientry].size(1))
            data_widths.append(data[ientry].size(2))
            # random shuffle the bounding boxes
            #np.random.shuffle(blobs[ientry]['gt_boxes'])
            if not self.training and blobs[ientry]['gt_boxes'].shape[0] == 0:
                blobs[ientry]['gt_boxes'] = np.ones((1, 6), dtype=np.float32)
            gt_boxes.append(torch.from_numpy(blobs[ientry]['gt_boxes']))
            if self.training:
                ########################################################
                # padding the input image to fixed size for each group #
                ########################################################
                # if the image needs to be cropped, crop to the target size
                ratio = self.ratio_list_batch[index]
                if self._roidb[index_ratio][0]['need_crop']:
                    if ratio < 1.:
                        # this means that data_width << data_height and we crop the height
                        min_y = int(torch.min(gt_boxes[ientry][:, 1]))
                        max_y = int(torch.max(gt_boxes[ientry][:, 3]))
                        trim_size = int(np.floor(data_widths[ientry] / ratio))
                        if trim_size > data_heights[ientry]:
                            trim_size = data_heights[ientry]
                        box_region = max_y - min_y + 1
                        if min_y == 0:
                            y_s = 0
                        else:
                            if (box_region - trim_size) < 0:
                                y_s_min = max(max_y - trim_size, 0)
                                y_s_max = min(min_y,
                                              data_heights[ientry] - trim_size)
                                if y_s_min == y_s_max:
                                    y_s = y_s_min
                                else:
                                    y_s = np.random.choice(
                                        range(y_s_min, y_s_max))
                            else:
                                y_s_add = int((box_region - trim_size) / 2)
                                if y_s_add == 0:
                                    y_s = min_y
                                else:
                                    y_s = np.random.choice(
                                        range(min_y, min_y + y_s_add))
                        # crop the image
                        data[ientry] = data[ientry][:, y_s:(y_s +
                                                            trim_size), :, :]
                        # shift y coordinate of gt_boxes
                        gt_boxes[ientry][:,
                                         1] = gt_boxes[ientry][:,
                                                               1] - float(y_s)
                        gt_boxes[ientry][:,
                                         3] = gt_boxes[ientry][:,
                                                               3] - float(y_s)
                        # update gt bounding box according to trim
                        gt_boxes[ientry][:, 1].clamp_(0, trim_size - 1)
                        gt_boxes[ientry][:, 3].clamp_(0, trim_size - 1)
                    else:
                        # data_width >> data_height so crop width
                        min_x = int(torch.min(gt_boxes[ientry][:, 0]))
                        max_x = int(torch.max(gt_boxes[ientry][:, 2]))
                        trim_size = int(np.ceil(data_heights[ientry] * ratio))
                        if trim_size > data_widths[ientry]:
                            trim_size = data_widths[ientry]
                        box_region = max_x - min_x + 1
                        if min_x == 0:
                            x_s = 0
                        else:
                            if (box_region - trim_size) < 0:
                                x_s_min = max(max_x - trim_size, 0)
                                x_s_max = min(min_x,
                                              data_widths[ientry] - trim_size)
                                if x_s_min == x_s_max:
                                    x_s = x_s_min
                                else:
                                    x_s = np.random.choice(
                                        range(x_s_min, x_s_max))
                            else:
                                x_s_add = int((box_region - trim_size) / 2)
                                if x_s_add == 0:
                                    x_s = min_x
                                else:
                                    x_s = np.random.choice(
                                        range(min_x, min_x + x_s_add))
                        # crop the image
                        data[ientry] = data[ientry][:, :,
                                                    x_s:(x_s + trim_size), :]

                        # shift x coordinate of gt_boxes[ientry]
                        gt_boxes[ientry][:,
                                         0] = gt_boxes[ientry][:,
                                                               0] - float(x_s)
                        gt_boxes[ientry][:,
                                         2] = gt_boxes[ientry][:,
                                                               2] - float(x_s)
                        # update gt bounding box according to the trim
                        gt_boxes[ientry][:, 0].clamp_(0, trim_size - 1)
                        gt_boxes[ientry][:, 2].clamp_(0, trim_size - 1)
                # based on the ratio, pad the image.
                if ratio < 1:
                    # data_width < data_height
                    trim_size = int(np.floor(data_widths[ientry] / ratio))
                    padding_data.append(torch.FloatTensor(int(np.ceil(data_widths[ientry] / ratio)),\
                            data_widths[ientry], 3).zero_())
                    padding_data[ientry][:data_heights[ientry], :, :] = data[
                        ientry][0]
                    im_info[ientry][0, 0] = padding_data[ientry].size(0)
                elif ratio > 1:
                    # data_width > data_height
                    padding_data.append(torch.FloatTensor(data_heights[ientry],\
                            int(np.ceil(data_heights[ientry] * ratio)), 3).zero_())
                    padding_data[ientry][:, :data_widths[ientry], :] = data[
                        ientry][0]
                    im_info[ientry][0, 1] = padding_data[ientry].size(1)
                else:
                    trim_size = min(data_heights[ientry], data_widths[ientry])
                    padding_data.append(
                        torch.FloatTensor(trim_size, trim_size, 3).zero_())
                    padding_data[ientry] = data[ientry][
                        0][:trim_size, :trim_size, :]
                    # gt_boxes[ientry].clamp_(0, trim_size)
                    gt_boxes[ientry][:, :4].clamp_(0, trim_size)
                    im_info[ientry][0, 0] = trim_size
                    im_info[ientry][0, 1] = trim_size
                # check the bounding boxes: drop those with zero width or height
                not_keep = (gt_boxes[ientry][:,0] \
                        == gt_boxes[ientry][:,2]) | (gt_boxes[ientry][:,1] == gt_boxes[ientry][:,3])
                keep = torch.nonzero(not_keep == 0).view(-1)

                gt_boxes_padding.append(
                    torch.FloatTensor(self.max_num_box,
                                      gt_boxes[ientry].size(1)).zero_())
                if keep.numel() != 0:
                    gt_boxes[ientry] = gt_boxes[ientry][keep]
                    num_boxes.append(
                        torch.LongTensor(
                            [min(gt_boxes[ientry].size(0),
                                 self.max_num_box)]).cuda())
                    curr_num_boxes = int(num_boxes[ientry][0])
                    gt_boxes_padding[ientry][:curr_num_boxes, :] = gt_boxes[
                        ientry][:curr_num_boxes]
                else:
                    num_boxes.append(torch.LongTensor(1).cuda().zero_())

                # permute trim_data to adapt to downstream processing
                padding_data[ientry] = padding_data[ientry].squeeze(0).permute(
                    2, 0, 1).contiguous()
                padding_data[ientry] = padding_data[ientry].unsqueeze(0)
                #im_info[ientry] = im_info[ientry].view(3)
                gt_boxes_padding[ientry] = gt_boxes_padding[ientry].unsqueeze(
                    0)
                num_boxes[ientry] = num_boxes[ientry].unsqueeze(0)

                #return padding_data, im_info, gt_boxes_padding, num_boxes
            else:
                data[ientry] = data[ientry].permute(0, 3, 1, 2).contiguous().\
                        view(3, data_heights[ientry], data_widths[ientry])
                data[ientry] = data[ientry].unsqueeze(0)
                #im_info[ientry] = im_info[ientry].view(3)

                #gt_boxes.append(torch.FloatTensor([1,1,1,1,1]))
                gt_boxes_padding.append(
                    torch.FloatTensor(self.max_num_box,
                                      gt_boxes[ientry].size(1)).zero_())
                #gt_boxes[ientry] = gt_boxes[ientry].unsqueeze(0)
                num_boxes.append(
                    torch.LongTensor(
                        [min(gt_boxes[ientry].size(0),
                             self.max_num_box)]).cuda())
                #num_boxes.append(torch.LongTensor(1).cuda().zero_())
                num_boxes[ientry] = num_boxes[ientry].unsqueeze(0)
                curr_num_boxes = int(num_boxes[ientry][0])
                gt_boxes_padding[ientry][:curr_num_boxes, :] = gt_boxes[
                    ientry][:curr_num_boxes]
                gt_boxes_padding[ientry] = gt_boxes_padding[ientry].unsqueeze(
                    0)

                #return data, im_info, gt_boxes, num_boxes
            if _DEBUG:
                if self.training:
                    print(gt_boxes_padding[ientry])
                    print(padding_data[ientry].size())
                    self._plot_image(padding_data[ientry].permute(0, 2, 3, 1),
                                     gt_boxes_padding[ientry],
                                     num_boxes[ientry])
                else:
                    print(gt_boxes[ientry])
                    print(data[ientry].size())
                    self._plot_image(data[ientry].permute(0, 2, 3, 1),
                                     gt_boxes[ientry], num_boxes[ientry])

        im_info_pair = torch.cat(im_info, dim=0)
        num_boxes = torch.cat(num_boxes, dim=0)
        if self.training:
            data_pair = torch.cat(padding_data, dim=0)
            gt_boxes_padding_pair = torch.cat(gt_boxes_padding, dim=0)
            return data_pair, im_info_pair, gt_boxes_padding_pair, num_boxes
        else:
            data_pair = torch.cat(data, dim=0)
            gt_boxes_padding_pair = torch.cat(gt_boxes_padding, dim=0)
            #gt_boxes = torch.cat(gt_boxes, dim=0)
            return data_pair, im_info_pair, gt_boxes_padding_pair, num_boxes
Пример #28
0
def read_langs(file_name, entity, cand2DLidx, idx2candDL, max_line=None):
    logging.info(("Reading lines from {}".format(file_name)))
    data = []
    content_arr = []
    #conversation_arr = []
    u = None
    r = None
    user_counter = 0
    system_counter = 0
    system_res_counter = 0
    KB_counter = 0
    dialog_counter = 0
    with open(file_name) as fin:
        #cnt_ptr = 0
        #cnt_voc = 0
        max_r_len = 0
        cnt_lin = 1
        time_counter = 1
        for line in fin:
            line = line.strip()
            if line:
                nid, line = line.split(' ', 1)
                if '\t' in line:
                    u, r = line.split('\t')
                    if u != '<SILENCE>': user_counter += 1
                    system_counter += 1
                    bot_action_idx = cand2DLidx[r]
                    bot_action = idx2candDL[bot_action_idx]

                    gen_u = generate_memory(u, "$u", str(time_counter))
                    content_arr += gen_u
                    #conversation_arr += gen_u

                    ent_query = {}
                    ent_query_idx = {}
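                    # for every entity token in the gold response, record its
                    # latest position in the memory (content_arr)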
                    for idx, key in enumerate(r.split(' ')):
                        if (key in entity):
                            index = [
                                loc for loc, val in enumerate(content_arr)
                                if (val[0] == key)
                            ]
                            if (index):
                                index = max(index)
                                #cnt_ptr += 1
                                ent_query_idx[bot_action.split(' ')
                                              [idx]] = index
                                ent_query[bot_action.split(' ')[idx]] = key
                            else:
                                print('[Wrong] Cannot find the entity')
                                exit(1)
                        system_res_counter += 1

                    if ent_query == {}:
                        ent_query = {'UNK': '$$$$'}
                        ent_query_idx = {'UNK': len(content_arr)}
                        content_arr_temp = content_arr + [['$$$$'] *
                                                          MEM_TOKEN_SIZE]
                    else:
                        content_arr_temp = content_arr
                    # ent = []
                    # for key in r.split(' '):
                    #     if(key in entity):
                    #         ent.append(key)

                    for ent in ent_query.keys():
                        data_item = {
                            'dialID': dialog_counter,
                            'turnID': system_counter,
                            'content_arr': content_arr_temp,
                            'bot_action': bot_action,
                            'bot_action_idx': bot_action_idx,
                            'ent_query': [ent, ent_query[ent]],
                            'ent_query_idx': [ent, ent_query_idx[ent]],
                            'gold_response': r
                        }
                        data.append(data_item)

                    #data.append([content_arr_temp,r,r_index,conversation_arr,ent])
                    gen_r = generate_memory(r, "$s", str(time_counter))
                    content_arr += gen_r
                    #conversation_arr += gen_r
                    time_counter += 1
                else:
                    KB_counter += 1
                    r = line
                    content_arr += generate_memory(r, "", "")
            else:
                cnt_lin += 1
                if (max_line and cnt_lin >= max_line):
                    break
                content_arr = []
                content_arr_temp = []
                #conversation_arr = []
                time_counter = 1
                dialog_counter += 1
    max_len = max([len(d['content_arr']) for d in data])
    logging.info("Nb of dialogs = {} ".format(dialog_counter))
    #logging.info("Pointer percentace= {} ".format(cnt_ptr/(cnt_ptr+cnt_voc)))
    logging.info("Max responce Len: {}".format(max_r_len))
    logging.info("Max Input Len: {}".format(max_len))
    logging.info("Avg. User Utterances: {}".format(user_counter * 1.0 /
                                                   dialog_counter))
    logging.info("Avg. Bot Utterances: {}".format(system_counter * 1.0 /
                                                  dialog_counter))
    logging.info("Avg. KB results: {}".format(KB_counter * 1.0 /
                                              dialog_counter))
    logging.info("Avg. responce Len: {}".format(system_res_counter * 1.0 /
                                                system_counter))

    print('Sample: ', data[5])
    return data, max_len
Пример #29
0
def read_langs(file_name,
               gating_dict,
               SLOTS,
               dataset,
               lang,
               mem_lang,
               sequicity,
               training,
               max_line=None):
    """
    Better name it construct_vocab?
    In fact, this function is the front line towards original data files.
    The 1st step to process data files. Convert them into python data-type.
    Params:
        SLOTS: contain slots from train, dev and test
        max_line: set the max number of dialogs that model deals with
    Returns:
        data: list of dicts, each element (one dict) is an abstract of each turn of all the dialogs.
              So the content is very redundant. See line 322.
        max_resp_len: the maximum length of dialog history
        slot_temp: The same as SLOTS in most conditions. slot_temp is different from SLOTS
                   ONLY when we do experiments on specific domains

    """
    print(("Reading from {}".format(file_name)))
    data = []
    max_resp_len, max_value_len = 0, 0
    domain_counter = {}  # distribution of domain in the datafiles
    with open(file_name) as f:
        dials = json.load(f)
        # create vocab first
        for dial_dict in dials:
            if (args["all_vocab"] or dataset == "train") and training:
                for ti, turn in enumerate(dial_dict["dialogue"]):
                    lang.index_words(turn["system_transcript"], 'utter')
                    lang.index_words(turn["transcript"], 'utter')
        # determine training data ratio, default is 100%
        if training and dataset == "train" and args["data_ratio"] != 100:
            random.Random(10).shuffle(dials)
            dials = dials[:int(len(dials) * 0.01 * args["data_ratio"])]

        cnt_lin = 1  # count the number of dialogs that have been processed
        for dial_dict in dials:
            dialog_history = ""
            last_belief_dict = {}
            # Filtering and counting domains
            for domain in dial_dict["domains"]:
                if domain not in EXPERIMENT_DOMAINS:
                    continue
                if domain not in domain_counter.keys():
                    domain_counter[domain] = 0
                domain_counter[domain] += 1
            ######
            # Unseen domain setting for zero-shot learning
            if args["only_domain"] != "" and args[
                    "only_domain"] not in dial_dict["domains"]:
                continue
            if (args["except_domain"] != "" and dataset == "test" and args["except_domain"] not in dial_dict["domains"]) or \
               (args["except_domain"] != "" and dataset != "test" and [args["except_domain"]] == dial_dict["domains"]):
                continue
            ######
            # Reading data
            for ti, turn in enumerate(dial_dict["dialogue"]):
                turn_domain = turn["domain"]
                turn_id = turn["turn_idx"]
                turn_uttr = turn["system_transcript"] + " ; " + turn[
                    "transcript"]
                turn_uttr_strip = turn_uttr.strip()
                dialog_history += (turn["system_transcript"] + " ; " +
                                   turn["transcript"] + " ; ")
                source_text = dialog_history.strip()
                '''Func below is very tricky. 0_0'''
                turn_belief_dict = fix_general_label_error(
                    turn["belief_state"], False, SLOTS)

                # Generate domain-dependent slot list
                slot_temp = SLOTS
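                # zero-shot setup: train/dev drop the slots of the held-out
                # ("except") domain, while test keeps only that domain's slots;
                # "only_domain" restricts every split to a single domain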
                if dataset == "train" or dataset == "dev":
                    if args["except_domain"] != "":
                        slot_temp = [
                            k for k in SLOTS if args["except_domain"] not in k
                        ]
                        turn_belief_dict = OrderedDict([
                            (k, v) for k, v in turn_belief_dict.items()
                            if args["except_domain"] not in k
                        ])
                    elif args["only_domain"] != "":
                        slot_temp = [
                            k for k in SLOTS if args["only_domain"] in k
                        ]
                        turn_belief_dict = OrderedDict([
                            (k, v) for k, v in turn_belief_dict.items()
                            if args["only_domain"] in k
                        ])
                else:
                    if args["except_domain"] != "":
                        slot_temp = [
                            k for k in SLOTS if args["except_domain"] in k
                        ]
                        turn_belief_dict = OrderedDict([
                            (k, v) for k, v in turn_belief_dict.items()
                            if args["except_domain"] in k
                        ])
                    elif args["only_domain"] != "":
                        slot_temp = [
                            k for k in SLOTS if args["only_domain"] in k
                        ]
                        turn_belief_dict = OrderedDict([
                            (k, v) for k, v in turn_belief_dict.items()
                            if args["only_domain"] in k
                        ])

                turn_belief_list = [
                    str(k) + '-' + str(v) for k, v in turn_belief_dict.items()
                ]

                if (args["all_vocab"] or dataset == "train") and training:
                    mem_lang.index_words(turn_belief_dict, 'belief')

                class_label, generate_y, slot_mask, gating_label = [], [], [], []
                start_ptr_label, end_ptr_label = [], []
                for slot in slot_temp:
                    if slot in turn_belief_dict.keys():
                        generate_y.append(
                            turn_belief_dict[slot]
                        )  # generate_y stores the true label of values for domain-slot!
                        # It also includes "none", so the length is fixed to len(SLOTS)
                        # Map the slot value to a gate class (ptr / dontcare / none),
                        # similar to the category labels used in ProPara
                        if turn_belief_dict[slot] == "dontcare":
                            gating_label.append(gating_dict["dontcare"])
                        elif turn_belief_dict[slot] == "none":
                            gating_label.append(gating_dict["none"])
                        else:
                            gating_label.append(gating_dict["ptr"])

                        if max_value_len < len(
                                turn_belief_dict[slot]
                        ):  # max_value_len: the maximum slot-value length seen across all dialogs
                            max_value_len = len(turn_belief_dict[slot])

                    else:
                        generate_y.append("none")
                        gating_label.append(gating_dict["none"])

                data_detail = {
                    "ID": dial_dict["dialogue_idx"],
                    "domains": dial_dict["domains"],
                    "turn_domain": turn_domain,
                    "turn_id": turn_id,
                    "dialog_history": source_text,
                    "turn_belief": turn_belief_list,
                    "gating_label": gating_label,
                    "turn_uttr": turn_uttr_strip,
                    'generate_y': generate_y
                }
                data.append(
                    data_detail
                )  # data_detail is appended per turn in each dialogue. len(data)=(#average turns * #dialogs)
                # Each data_detail is one raw training instance.

                if max_resp_len < len(source_text.split(
                )):  # max_resp_len: the maximum length of dialog history
                    max_resp_len = len(source_text.split())

            cnt_lin += 1  # count how many dialogs there are in the datafile
            if (max_line and cnt_lin >= max_line):
                break

    # Add t0 ... t{max_value_len-1} tokens to the mem_lang vocabulary
    # TODO: clarify what this is needed for
    if "t{}".format(max_value_len -
                    1) not in mem_lang.word2index.keys() and training:
        for time_i in range(max_value_len):
            mem_lang.index_words("t{}".format(time_i), 'utter')

    print("domain_counter", domain_counter)
    return data, max_resp_len, slot_temp  # slot_temp is different from SLOTS if we only do experiments on specific domains
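
A small self-contained sketch of the gate/value construction performed in the loop above; the `gating_dict`, slot names, and belief values below are illustrative, not the actual dataset schema:

# Sketch of the gate/value construction (illustrative names only).
gating_dict = {"ptr": 0, "dontcare": 1, "none": 2}
slot_temp = ["hotel-area", "hotel-parking", "train-day"]
turn_belief_dict = {"hotel-area": "centre", "hotel-parking": "dontcare"}

generate_y, gating_label = [], []
for slot in slot_temp:
    value = turn_belief_dict.get(slot, "none")
    generate_y.append(value)                      # value string to generate ("none" if absent)
    if value in ("dontcare", "none"):
        gating_label.append(gating_dict[value])   # handled by the gate alone
    else:
        gating_label.append(gating_dict["ptr"])   # value must be generated/copied

print(generate_y)     # ['centre', 'dontcare', 'none']
print(gating_label)   # [0, 1, 2]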
Пример #30
0
def train_discriminator(dataset,
                        train_dataset_fp=None,
                        valid_dataset_fp=None,
                        pretrained_model="gpt2-medium",
                        epochs=10,
                        batch_size=64,
                        log_interval=10,
                        save_model=False,
                        cached=False,
                        no_cuda=False,
                        reg_type=1):
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    print("Preprocessing {} dataset...".format(dataset))
    start = time.time()

    if dataset == "SST":
        idx2class = [
            "positive", "negative", "very positive", "very negative", "neutral"
        ]
        class2idx = {c: i for i, c in enumerate(idx2class)}

        discriminator = Discriminator(class_size=len(idx2class),
                                      pretrained_model=pretrained_model,
                                      cached_mode=cached,
                                      device=device,
                                      reg_type=reg_type).to(device)

        text = torchtext_data.Field()
        label = torchtext_data.Field(sequential=False)
        train_data, val_data, test_data = datasets.SST.splits(
            text,
            label,
            fine_grained=True,
            train_subtrees=True,
        )

        x = []
        y = []
        for i in trange(len(train_data), ascii=True):
            seq = TreebankWordDetokenizer().detokenize(
                vars(train_data[i])["text"])
            seq = discriminator.tokenizer.encode(seq)
            seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
            x.append(seq)
            y.append(class2idx[vars(train_data[i])["label"]])
        train_dataset = Dataset(x, y)

        test_x = []
        test_y = []
        for i in trange(len(test_data), ascii=True):
            seq = TreebankWordDetokenizer().detokenize(
                vars(test_data[i])["text"])
            seq = discriminator.tokenizer.encode(seq)
            seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
            test_x.append(seq)
            test_y.append(class2idx[vars(test_data[i])["label"]])
        test_dataset = Dataset(test_x, test_y)

        discriminator_meta = {
            "class_size": len(idx2class),
            "embed_size": discriminator.embed_size,
            "pretrained_model": pretrained_model,
            "class_vocab": class2idx,
            "default_class": 2,
        }

    elif dataset == "clickbait":
        idx2class = ["non_clickbait", "clickbait"]
        class2idx = {c: i for i, c in enumerate(idx2class)}

        discriminator = Discriminator(class_size=len(idx2class),
                                      pretrained_model=pretrained_model,
                                      cached_mode=cached,
                                      device=device).to(device)

        with open("datasets/clickbait/clickbait_train_prefix.txt") as f:
            data = []
            for i, line in enumerate(f):
                try:
                    data.append(eval(line))
                except:
                    print("Error evaluating line {}: {}".format(i, line))
                    continue
        x = []
        y = []
        with open("datasets/clickbait/clickbait_train_prefix.txt") as f:
            for i, line in enumerate(tqdm(f, ascii=True)):
                try:
                    d = eval(line)
                    seq = discriminator.tokenizer.encode(d["text"])

                    if len(seq) < max_length_seq:
                        seq = torch.tensor([50256] + seq,
                                           device=device,
                                           dtype=torch.long)
                    else:
                        print(
                            "Line {} is longer than maximum length {}".format(
                                i, max_length_seq))
                        continue
                    x.append(seq)
                    y.append(d["label"])
                except:
                    print("Error evaluating / tokenizing"
                          " line {}, skipping it".format(i))
                    pass

        full_dataset = Dataset(x, y)
        train_size = int(0.9 * len(full_dataset))
        test_size = len(full_dataset) - train_size
        train_dataset, test_dataset = torch.utils.data.random_split(
            full_dataset, [train_size, test_size])

        discriminator_meta = {
            "class_size": len(idx2class),
            "embed_size": discriminator.embed_size,
            "pretrained_model": pretrained_model,
            "class_vocab": class2idx,
            "default_class": 1,
        }

    elif dataset == "toxic":
        idx2class = ["non_toxic", "toxic"]
        class2idx = {c: i for i, c in enumerate(idx2class)}

        discriminator = Discriminator(class_size=len(idx2class),
                                      pretrained_model=pretrained_model,
                                      cached_mode=cached,
                                      device=device).to(device)

        x = []
        y = []
        with open("datasets/toxic/toxic_train.txt") as f:
            for i, line in enumerate(tqdm(f, ascii=True)):
                try:
                    d = eval(line)
                    seq = discriminator.tokenizer.encode(d["text"])

                    if len(seq) < max_length_seq:
                        seq = torch.tensor([50256] + seq,
                                           device=device,
                                           dtype=torch.long)
                    else:
                        print(
                            "Line {} is longer than maximum length {}".format(
                                i, max_length_seq))
                        continue
                    x.append(seq)
                    y.append(int(np.sum(d["label"]) > 0))
                except:
                    print("Error evaluating / tokenizing"
                          " line {}, skipping it".format(i))
                    pass

        full_dataset = Dataset(x, y)
        train_size = int(0.9 * len(full_dataset))
        test_size = len(full_dataset) - train_size
        train_dataset, test_dataset = torch.utils.data.random_split(
            full_dataset, [train_size, test_size])

        discriminator_meta = {
            "class_size": len(idx2class),
            "embed_size": discriminator.embed_size,
            "pretrained_model": pretrained_model,
            "class_vocab": class2idx,
            "default_class": 0,
        }

    else:  # if dataset == "generic":
        # This assumes the input dataset is a TSV with the following structure:
        # class \t text

        if train_dataset_fp is None:
            raise ValueError("When generic dataset is selected, "
                             "train_dataset_fp needs to be specified aswell.")
        if valid_dataset_fp is None:
            raise ValueError("When generic dataset is selected, "
                             "valid_dataset_fp needs to be specified aswell.")

        discriminator = Discriminator(pretrained_model=pretrained_model,
                                      cached_mode=cached,
                                      device=device).to(device)

        x = []
        y = []
        with open(train_dataset_fp) as f:
            csv_reader = csv.reader(f, delimiter="\t")
            for i, row in enumerate(tqdm(csv_reader, ascii=True)):
                if row:
                    label = float(row[0])
                    text = row[1]

                    try:
                        seq = discriminator.tokenizer.encode(text)
                        if (len(seq) < max_length_seq):
                            seq = torch.tensor([50256] + seq,
                                               device=device,
                                               dtype=torch.long)

                        else:
                            print("Line {} is longer than maximum length {}".
                                  format(i, max_length_seq))
                            continue

                        x.append(seq)
                        y.append(label)

                    except:
                        print(
                            "Error tokenizing line {}, skipping it".format(i))
                        pass

        train_dataset = Dataset(x, y)
        x = []
        y = []
        with open(valid_dataset_fp) as f:
            csv_reader = csv.reader(f, delimiter="\t")
            for i, row in enumerate(tqdm(csv_reader, ascii=True)):
                if row:
                    label = float(row[0])
                    text = row[1]

                    try:
                        seq = discriminator.tokenizer.encode(text)
                        if (len(seq) < max_length_seq):
                            seq = torch.tensor([50256] + seq,
                                               device=device,
                                               dtype=torch.long)

                        else:
                            print("Line {} is longer than maximum length {}".
                                  format(i, max_length_seq))
                            continue

                        x.append(seq)
                        y.append(label)

                    except:
                        print(
                            "Error tokenizing line {}, skipping it".format(i))
                        pass

        test_dataset = Dataset(x, y)

        discriminator_meta = {
            "embed_size": discriminator.embed_size,
            "pretrained_model": pretrained_model,
        }

    end = time.time()
    print("Preprocessed {} data points".format(
        len(train_dataset) + len(test_dataset)))
    print("Data preprocessing took: {:.3f}s".format(end - start))

    if cached:
        print("Building representation cache...")

        start = time.time()

        train_loader = get_cached_data_loader(train_dataset,
                                              batch_size,
                                              discriminator,
                                              shuffle=True,
                                              device=device)

        test_loader = get_cached_data_loader(test_dataset,
                                             batch_size,
                                             discriminator,
                                             device=device)

        end = time.time()
        print("Building representation cache took: {:.3f}s".format(end -
                                                                   start))

    else:
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   collate_fn=collate_fn)
        test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                                  batch_size=batch_size,
                                                  collate_fn=collate_fn)

    if save_model:
        with open("{}_classifier_head_meta.json".format(dataset),
                  "w") as meta_file:
            json.dump(discriminator_meta, meta_file)

    optimizer = optim.Adam(discriminator.parameters(), lr=0.0001)

    for epoch in range(epochs):
        start = time.time()
        print("\nEpoch", epoch + 1)

        train_epoch(discriminator=discriminator,
                    data_loader=train_loader,
                    optimizer=optimizer,
                    epoch=epoch,
                    log_interval=log_interval,
                    device=device)
        evaluate_performance(data_loader=test_loader,
                             discriminator=discriminator,
                             device=device)

        end = time.time()
        print("Epoch took: {:.3f}s".format(end - start))

        print("\nExample prediction")
        predict(example_sentence, discriminator, cached=cached, device=device)

        if save_model:
            # torch.save(discriminator.state_dict(),
            #           "{}_discriminator_{}.pt".format(
            #               args.dataset, epoch + 1
            #               ))
            torch.save(
                discriminator.get_classifier().state_dict(),
                "{}_classifier_head_epoch_{}.pt".format(dataset, epoch + 1))
Пример #31
0
ae = MNISTAE().cuda()
ae.load_state_dict(torch.load("mnist_conv_autoencoder_weights.pth"))

data = []
targets = []

n_samples = int(len(trainfolder) * 0.25)
counter = 0

for batch_x, batch_y in tqdm(trainloader):
    batch_x = batch_x.cuda().float()
    batch_x_preds = ae.encode(batch_x).detach().cpu().numpy()
    batch_y = batch_y.detach().cpu().numpy()

    for x, y in zip(batch_x_preds, batch_y):
        data.append(x.reshape(6400))
        targets.append(y)
        counter += 1

    if counter >= n_samples:
        break

data = np.array(data)
targets = np.array(targets)

data = data[:int(len(data) * 0.25)]
targets = targets[:int(len(targets) * 0.25)]

data = TSNE(n_components=2, perplexity=15, learning_rate=10,
            verbose=2).fit_transform(data)
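
A minimal plotting sketch for the 2-D embedding computed above, assuming `data` now has shape (N, 2) and `targets` holds integer class labels (plain matplotlib; nothing here depends on the autoencoder):

import matplotlib.pyplot as plt

def plot_embedding(embedding, labels):
    """Scatter a 2-D embedding, coloured by integer class label."""
    plt.figure(figsize=(6, 6))
    scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="tab10", s=5)
    plt.colorbar(scatter, label="class")
    plt.title("t-SNE of autoencoder codes")
    plt.show()

# e.g. plot_embedding(data, targets) after the fit_transform call above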
Пример #32
0
def main():
    args = parser.parse_args()

    if args.seed is None:
        args.seed = random.randint(1, 10000)
    print("Random Seed: ", args.seed)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpus:
        torch.cuda.manual_seed_all(args.seed)

    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    if args.gpus is not None:
        args.gpus = [int(i) for i in args.gpus.split(',')]
        device = 'cuda:' + str(args.gpus[0])
        cudnn.benchmark = True
    else:
        device = 'cpu'

    if args.type == 'float64':
        dtype = torch.float64
    elif args.type == 'float32':
        dtype = torch.float32
    elif args.type == 'float16':
        dtype = torch.float16
    else:
        raise ValueError('Wrong type!')  # TODO int8

    model = MobileNet2(input_size=args.input_size, scale=args.scaling)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    print(model)
    print('number of parameters: {}'.format(num_parameters))
    # TODO(3/30): haven't reviewed how the FLOPs are computed yet; the data loader has to be written by ourselves
    print('FLOPs: {}'.format(
        flops_benchmark.count_flops(
            MobileNet2, args.batch_size //
            len(args.gpus) if args.gpus is not None else args.batch_size,
            device, dtype, args.input_size, 3, args.scaling)))

    train_loader, val_loader = get_loaders(args.dataroot, args.batch_size,
                                           args.batch_size, args.input_size,
                                           args.workers)
    # define loss function (criterion) and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    if args.gpus is not None:
        model = torch.nn.DataParallel(model, args.gpus)
    model.to(device=device, dtype=dtype)
    criterion.to(device=device, dtype=dtype)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.decay,
                                nesterov=True)
    # TODO(3/30): CLR learning-rate range search; not studied yet
    if args.find_clr:
        find_bounds_clr(model,
                        train_loader,
                        optimizer,
                        criterion,
                        device,
                        dtype,
                        min_lr=args.min_lr,
                        max_lr=args.max_lr,
                        step_size=args.epochs_per_step * len(train_loader),
                        mode=args.mode,
                        save_path=save_path)
        return

    if args.clr:
        scheduler = CyclicLR(optimizer,
                             base_lr=args.min_lr,
                             max_lr=args.max_lr,
                             step_size=args.epochs_per_step *
                             len(train_loader),
                             mode=args.mode)
    else:
        scheduler = MultiStepLR(optimizer,
                                milestones=args.schedule,
                                gamma=args.gamma)

    best_test = 0

    # optionally resume from a checkpoint
    data = None
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=device)
            args.start_epoch = checkpoint['epoch'] - 1
            best_test = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        elif os.path.isdir(args.resume):
            checkpoint_path = os.path.join(args.resume, 'checkpoint.pth.tar')
            csv_path = os.path.join(args.resume, 'results.csv')
            print("=> loading checkpoint '{}'".format(checkpoint_path))
            checkpoint = torch.load(checkpoint_path, map_location=device)
            args.start_epoch = checkpoint['epoch'] - 1
            best_test = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_path, checkpoint['epoch']))
            data = []
            with open(csv_path) as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    data.append(row)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if args.evaluate:
        loss, top1, top5 = test(model, val_loader, criterion, device,
                                dtype)  # TODO
        return

    # TODO(3/30): what is this used for?
    csv_logger = CsvLogger(filepath=save_path, data=data)
    csv_logger.save_params(sys.argv, args)

    # TODO(3/30): seems redundant; figure out what it is doing
    claimed_acc1 = None
    claimed_acc5 = None
    if args.input_size in claimed_acc_top1:
        if args.scaling in claimed_acc_top1[args.input_size]:
            claimed_acc1 = claimed_acc_top1[args.input_size][args.scaling]
            claimed_acc5 = claimed_acc_top5[args.input_size][args.scaling]
            csv_logger.write_text(
                'Claimed accuracies are: {:.2f}% top-1, {:.2f}% top-5'.format(
                    claimed_acc1 * 100., claimed_acc5 * 100.))
    train_network(args.start_epoch, args.epochs, scheduler, model,
                  train_loader, val_loader, optimizer, criterion, device,
                  dtype, args.batch_size, args.log_interval, csv_logger,
                  save_path, claimed_acc1, claimed_acc5, best_test)
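
The `CyclicLR` and `find_bounds_clr` helpers above are project-specific, but the `MultiStepLR` fallback matches the standard PyTorch scheduler of that name. A toy sketch (assuming `torch.optim.lr_scheduler.MultiStepLR`) of how milestones and gamma decay the learning rate:

import torch
from torch.optim.lr_scheduler import MultiStepLR

# Dummy parameter and optimizer just to drive the scheduler.
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=0.1)
# The LR is multiplied by gamma once the epoch counter passes each milestone.
scheduler = MultiStepLR(optimizer, milestones=[3, 6], gamma=0.1)

for epoch in range(8):
    # ... one training epoch would go here ...
    optimizer.step()
    scheduler.step()
    print(epoch, optimizer.param_groups[0]["lr"])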
Пример #33
0
    def __init__(self, path, mode, args):
        data = []
        with open(os.path.join(path, mode)) as f:
            all_lines = f.readlines()
            for line in all_lines:
                ins = json.loads(line)
                data.append(ins)
        
        
        entityMarker = EntityMarker(args)
        tot_instance = len(data)

        # load rel2id and type2id
        if os.path.exists(os.path.join(path, "rel2id.json")):
            rel2id = json.load(open(os.path.join(path, "rel2id.json")))
        else:
            raise Exception("Error: There is no `rel2id.json` in "+ path +".")
        if os.path.exists(os.path.join(path, "type2id.json")):
            type2id = json.load(open(os.path.join(path, "type2id.json")))
        else:
            print("Warning: There is no `type2id.json` in "+ path +", If you want to train model using `OT`, `CT` settings, please firstly run `utils.py` to get `type2id.json`.")
    
        print("pre process " + mode)
        # pre process data
        self.input_ids = np.zeros((tot_instance, args.max_length), dtype=int)
        self.mask = np.zeros((tot_instance, args.max_length), dtype=int) 
        self.h_pos = np.zeros((tot_instance), dtype=int)
        self.t_pos = np.zeros((tot_instance), dtype=int)
        self.h_pos_l = np.zeros((tot_instance), dtype=int)
        self.t_pos_l = np.zeros((tot_instance), dtype=int)
        self.label = np.zeros((tot_instance), dtype=int)

        for i, ins in enumerate(data):
            self.label[i] = rel2id[ins["relation"]]
            # tokenize; only the "CM" mode returns the entity span end positions
            # (ph_l, pt_l), so default them to 0 for the other modes
            ph_l, pt_l = 0, 0
            if args.mode == "CM":
                ids, ph, pt, ph_l, pt_l = entityMarker.tokenize(data[i]["token"], data[i]['h']['pos'], data[i]['t']['pos'])
            elif args.mode == "OC":
                ids, ph, pt = entityMarker.tokenize(data[i]["token"], data[i]['h']['pos'], data[i]['t']['pos'], None, None, True, True)
            elif args.mode == "CT":
                h_type = "[unused%d]" % (type2id['subj_'+ins['h']['type']] + 10)
                t_type = "[unused%d]" % (type2id['obj_'+ins['t']['type']] + 10)
                ids, ph, pt = entityMarker.tokenize(data[i]["token"], data[i]['h']['pos'], data[i]['t']['pos'], h_type, t_type)
            elif args.mode == "OM":
                head = entityMarker.tokenizer.tokenize(ins['h']['name'])
                tail = entityMarker.tokenizer.tokenize(ins['t']['name'])
                h_first = ins['h']['pos'][0] < ins['t']['pos'][0]
                ids, ph, pt = entityMarker.tokenize_OMOT(head, tail, h_first)
            elif args.mode == "OT":
                h_type = "[unused%d]" % (type2id['subj_'+ins['h']['type']] + 10)
                t_type = "[unused%d]" % (type2id['obj_'+ins['t']['type']] + 10)
                h_first = ins['h']['pos'][0] < ins['t']['pos'][0]
                ids, ph, pt = entityMarker.tokenize_OMOT([h_type,], [t_type,], h_first)
            else:
                raise Exception("No such mode! Please make sure that `mode` takes the value in {CM,OC,CT,OM,OT}")

            length = min(len(ids), args.max_length)
            self.input_ids[i][0:length] = ids[0:length]
            self.mask[i][0:length] = 1
            self.h_pos[i] = min(ph, args.max_length-1) 
            self.t_pos[i] = min(pt, args.max_length-1) 
            self.h_pos_l[i] = min(ph_l, args.max_length) 
            self.t_pos_l[i] = min(pt_l, args.max_length) 
        print("The number of sentence in which tokenizer can't find head/tail entity is %d" % entityMarker.err)
Пример #34
0
def read_langs(file_name, max_line=None):
    print(("Reading lines from {}".format(file_name)))
    data, context_arr, conv_arr, kb_arr, domain_dict = [], [], [], [], {}
    max_resp_len = 0
    one_domain_cnt = 0
    node_list, list_object_node = [], []

    with open(file_name, encoding='utf-8') as fin:
        cnt_lin, sample_counter, node_idx = 1, 1, 0
        for line in fin:
            line = line.strip()
            if line:
                # Handle the domain header line
                if line.startswith("#"):
                    flag = 0  # marks whether the next token is a domain name
                    line = line.split()
                    for a in line:
                        if a == "#":  # 若是'#'则跳过
                            continue
                        if a.startswith("0"):  # 是domain的序号
                            domain_idx = int(a)
                            assert 5 >= domain_idx >= 0
                            # domain_l.append(domain_idx)
                            flag = 1
                            continue
                        if flag == 1:  # this token is the domain name
                            domain_dict[domain_idx] = a
                            assert 5 >= domains[a] >= 0
                            # domain_l.append(domains[a])
                            flag = 0
                            node_list.append([a, domain_idx, node_idx])
                            node_idx += 1
                            continue
                        dialog_id = a  # read the dialogue ID
                    domain_l = "全部"  # "全部" means "all domains"
                    continue

                # Process each line: either a KB (entity, attribute, value) triple or a query-answer pair
                nid, line = line.split(' ', 1)
                # Handle a query-answer pair
                if '\t' in line:
                    # split into user utterance / response / gold entities
                    u_seged, r_seged, gold_ent = line.split('\t')
                    # build a memory entry for each word of the user utterance
                    gen_u = generate_memory(u_seged, "$u", str(nid))
                    context_arr += gen_u
                    conv_arr += gen_u
                    for tri in gen_u:
                        node_list.append([tri, node_idx])
                        node_idx += 1

                    # Get gold entities for each domain
                    # eval would turn the string back into its original list/tuple/dict form;
                    # ast.literal_eval does this safely and raises if the string is not a valid literal
                    gold_ent = ast.literal_eval(gold_ent)
                    # ent_idx_restaurant, ent_idx_attraction, ent_idx_hotel = [], [], []
                    # if task_type == "restaurant":
                    #     ent_idx_restaurant = gold_ent
                    # elif task_type == "attraction":
                    #     ent_idx_attraction = gold_ent
                    # elif task_type == "hotel":
                    #     ent_idx_hotel = gold_ent
                    ent_index = list(set(gold_ent))

                    # Get local pointer position for each word in system response
                    ptr_index = []
                    for key in r_seged.split():
                        # Local pointer: if this response word is a gold entity from the context/KB, record its position
                        index = [
                            loc for loc, val in enumerate(context_arr)
                            if (val[0] == key and key in ent_index)
                        ]
                        # take the largest matching index; if there is none, use the context length
                        if index:
                            index = max(index)
                        else:
                            index = len(context_arr)
                        ptr_index.append(index)

                    # Get global pointer labels for words in system response, the 1 in the end is for the NULL token
                    # For each word in the user+KB context, mark 1 if it appears in the system response
                    # or is a gold entity, else 0; the trailing 1 is for the NULL token at the end
                    selector_index = [
                        1 if (word_arr[0] in ent_index
                              or word_arr[0] in r_seged.split()) else 0
                        for word_arr in context_arr
                    ] + [1]
                    # generate the sketch response
                    sketch_response, gold_sketch = generate_template(
                        r_seged, gold_ent, kb_arr, domain_dict, node_list)
                    # if len(domain_label) < 3:
                    #     domain_label.append(RiSA_PAD_token)
                    # assert len(domain_label) == 3
                    # put everything for this turn into a dict and append it to the data
                    data_detail = {
                        'context_arr':
                        list(context_arr +
                             [['$$$$'] *
                              MEM_TOKEN_SIZE]),  # $$$$ is NULL token
                        'response':
                        r_seged,
                        'sketch_response':
                        sketch_response,
                        'gold_sketch':
                        gold_sketch,
                        'ptr_index':
                        ptr_index + [len(context_arr)],
                        'selector_index':
                        selector_index,
                        'ent_index':
                        ent_index,
                        'conv_arr':
                        list(conv_arr),
                        'kb_arr':
                        list(kb_arr),
                        'id':
                        int(sample_counter),
                        'ID':
                        int(cnt_lin),
                        'domain':
                        domain_l
                    }
                    data.append(data_detail)
                    # Note: one sample is generated per turn; the gold response is appended to the context history for later turns
                    gen_r = generate_memory(r_seged, "$s", str(nid))
                    context_arr += gen_r
                    conv_arr += gen_r
                    for tri in gen_r:
                        node_list.append([tri, node_idx])
                        node_idx += 1
                    # track the maximum response length
                    if max_resp_len < len(r_seged.split()):
                        max_resp_len = len(r_seged.split())
                    sample_counter += 1
                # Handle a KB (entity, attribute, value) triple
                else:
                    r = line
                    kb_info = generate_memory(r, "", str(nid))
                    context_arr = kb_info + context_arr
                    kb_arr += kb_info
                    node_list.extend(kb_info)
                    node_idx += 1
            else:
                cnt_lin += 1
                context_arr, conv_arr, kb_arr, node_list, domain_dict = [], [], [], [], {}
                node_idx = 0
                if max_line and cnt_lin >= max_line:
                    break

    return data, max_resp_len
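
A tiny self-contained sketch of the local-pointer and global-selector construction above, with a made-up context, response, and gold-entity list (each memory entry is simplified to a one-word list):

context_arr = [["pizza_hut"], ["serves"], ["cheap"], ["pizza"], ["pizza_hut"]]
response = "pizza_hut is a cheap restaurant"
ent_index = ["pizza_hut", "cheap"]   # toy gold entities

# Local pointer: last position of each response word that is a gold entity,
# otherwise point past the end of the context (the NULL token).
ptr_index = []
for key in response.split():
    index = [loc for loc, val in enumerate(context_arr)
             if val[0] == key and key in ent_index]
    ptr_index.append(max(index) if index else len(context_arr))

# Global selector: 1 for context words that are gold entities or appear in
# the response, plus a final 1 for the NULL token.
selector_index = [1 if (w[0] in ent_index or w[0] in response.split()) else 0
                  for w in context_arr] + [1]

print(ptr_index)        # [4, 5, 5, 2, 5]
print(selector_index)   # [1, 0, 1, 0, 1, 1]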
Пример #35
0
def read_langs(file_name, SLOTS, dataset, lang, mem_lang, training, args):
    print(("Reading from {}".format(file_name)))
    data = []
    max_len_val_per_slot = 0
    max_len_slot_val = {}
    domain_counter = {}
    #count_noise = 0
    sorted_domainslots = sorted(SLOTS)
    sorted_in_domains = [
        i.split('-')[0] + "_DOMAIN" for i in sorted_domainslots
    ]
    sorted_in_slots = [i.split('-')[1] + "_SLOT" for i in sorted_domainslots]
    for ds in sorted_domainslots:
        max_len_slot_val[ds] = (1, "none")  # counting none/dontcare
    multival_count = 0

    with open(file_name) as f:
        dials = json.load(f)
        # create vocab first
        for dial_dict in dials:
            if (dataset == 'train' and training) or (args['pointer_decoder']):
                for ti, turn in enumerate(dial_dict["dialogue"]):
                    lang.index_words(turn["system_transcript"], 'utter')
                    lang.index_words(turn["transcript"], 'utter')

        for dial_dict in dials:
            last_belief_dict = {}
            # Filtering and counting domains
            for domain in dial_dict["domains"]:
                if domain not in domain_counter.keys():
                    domain_counter[domain] = 0
                domain_counter[domain] += 1

            # Reading data
            dialog_history = ''
            delex_dialog_history = ''
            prev_turn_belief_dict = {}

            for ti, turn in enumerate(dial_dict["dialogue"]):
                turn_id = turn["turn_idx"]
                if ti == 0:
                    user_sent = ' SOS ' + turn["transcript"] + ' EOS '
                    sys_sent = ''
                    dlx_user_sent = ' SOS ' + turn["delex_transcript"] + ' EOS '
                    dlx_sys_sent = ''
                else:
                    sys_sent = ' SOS ' + turn["system_transcript"] + ' EOS '
                    user_sent = 'SOS ' + turn["transcript"] + ' EOS '
                    dlx_sys_sent = ' SOS ' + turn[
                        "delex_system_transcript"] + ' EOS '
                    dlx_user_sent = 'SOS ' + turn["delex_transcript"] + ' EOS '
                turn_uttr = sys_sent + user_sent
                dialog_history += sys_sent
                delex_dialog_history += dlx_sys_sent
                dialog_history += user_sent
                delex_dialog_history += dlx_user_sent

                turn_belief_dict = fix_general_label_error(
                    turn["belief_state"], False, SLOTS)
                turn_belief_dict = fix_book_slot_name(turn_belief_dict, SLOTS)
                turn_belief_dict, multival_count = fix_multival(
                    turn_belief_dict, multival_count)
                turn_belief_dict = remove_none_value(turn_belief_dict)

                sorted_lenval, sorted_gates = get_sorted_lenval(
                    sorted_domainslots, turn_belief_dict, args['slot_gating'])
                sorted_in_domains2, sorted_in_slots2, sorted_generate_y, sorted_in_domainslots2_index = get_sorted_generate_y(
                    sorted_domainslots, sorted_lenval, turn_belief_dict)

                if args['auto_regressive']:
                    atrg_generate_y, sorted_in_domainslots2_index = get_atrg_generate_y(
                        sorted_domainslots, sorted_lenval, turn_belief_dict)
                else:
                    atrg_generate_y = None

                if args['delex_his']:
                    temp = dialog_history.split()
                    delex_temp = delex_dialog_history.split()
                    start_idx = [
                        i for i, t in enumerate(temp) if t == 'SOS'
                    ][-1]  # delex everything except the last user utterance
                    in_delex_dialog_history = ' '.join(delex_temp[:start_idx] +
                                                       temp[start_idx:])
                    if len(in_delex_dialog_history.split()) != len(
                            dialog_history.split()):
                        pdb.set_trace()
                    if (dataset == 'train'
                            and training) or (args['pointer_decoder']):
                        lang.index_words(in_delex_dialog_history, 'utter')

                turn_belief_list = [
                    str(k) + '-' + str(v) for k, v in turn_belief_dict.items()
                ]
                for k, v in turn_belief_dict.items():
                    if len(v.split()) > max_len_slot_val[k][0]:
                        max_len_slot_val[k] = (len(v.split()), v)

                if dataset == 'train' and training:
                    mem_lang.index_words(turn_belief_dict, 'belief')

                data_detail = {
                    "ID": dial_dict["dialogue_idx"],
                    "turn_id": turn_id,
                    "dialog_history": dialog_history.strip(),
                    "delex_dialog_history": in_delex_dialog_history.strip(),
                    "turn_belief": turn_belief_list,
                    "sorted_domainslots": sorted_domainslots,
                    "turn_belief_dict": turn_belief_dict,
                    "turn_uttr": turn_uttr.strip(),
                    'sorted_in_domains': sorted_in_domains,
                    'sorted_in_slots': sorted_in_slots,
                    'sorted_in_domains2': sorted_in_domains2,
                    'sorted_in_slots2': sorted_in_slots2,
                    'sorted_in_domainslots2_idx': sorted_in_domainslots2_index,
                    'sorted_lenval': sorted_lenval,
                    'sorted_gates': sorted_gates,
                    'sorted_generate_y': sorted_generate_y,
                    'atrg_generate_y': atrg_generate_y
                }
                data.append(data_detail)
                if len(sorted_lenval) > 0 and max(
                        sorted_lenval) > max_len_val_per_slot:
                    max_len_val_per_slot = max(sorted_lenval)
                prev_turn_belief_dict = turn_belief_dict

    print("domain_counter", domain_counter)
    print("multival_count", multival_count)

    return data, SLOTS, max_len_val_per_slot, max_len_slot_val
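
A short illustration of how the sorted domain-slot names above are split into domain and slot input tokens; the slot names here are examples only:

sorted_domainslots = sorted(["hotel-area", "train-day", "restaurant-food"])
sorted_in_domains = [ds.split('-')[0] + "_DOMAIN" for ds in sorted_domainslots]
sorted_in_slots = [ds.split('-')[1] + "_SLOT" for ds in sorted_domainslots]

print(sorted_domainslots)   # ['hotel-area', 'restaurant-food', 'train-day']
print(sorted_in_domains)    # ['hotel_DOMAIN', 'restaurant_DOMAIN', 'train_DOMAIN']
print(sorted_in_slots)      # ['area_SLOT', 'food_SLOT', 'day_SLOT']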
Пример #36
0
def main():
    args = parser.parse_args()

    if args.seed is None:
        args.seed = random.randint(1, 10000)
    print("Random Seed: ", args.seed)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpus:
        torch.cuda.manual_seed_all(args.seed)

    time_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    if args.evaluate:
        args.results_dir = '/tmp'
    if args.save == '':
        args.save = time_stamp
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    if args.gpus is not None:
        args.gpus = [int(i) for i in args.gpus.split(',')]
        device = 'cuda:' + str(args.gpus[0])
        cudnn.benchmark = True
    else:
        device = 'cpu'

    if args.type == 'float64':
        dtype = torch.float64
    elif args.type == 'float32':
        dtype = torch.float32
    elif args.type == 'float16':
        dtype = torch.float16
    else:
        raise ValueError('Wrong type!')  # TODO int8

    model = MobileNet2(input_size=args.input_size, scale=args.scaling)
    num_parameters = sum([l.nelement() for l in model.parameters()])
    print(model)
    print('number of parameters: {}'.format(num_parameters))
    print('FLOPs: {}'.format(
        flops_benchmark.count_flops(MobileNet2,
                                    args.batch_size // len(args.gpus) if args.gpus is not None else args.batch_size,
                                    device, dtype, args.input_size, 3, args.scaling)))

    train_loader, val_loader = get_loaders(args.dataroot, args.batch_size, args.batch_size, args.input_size,
                                           args.workers)
    # define loss function (criterion) and optimizer
    criterion = torch.nn.CrossEntropyLoss()
    if args.gpus is not None:
        model = torch.nn.DataParallel(model, args.gpus)
    model.to(device=device, dtype=dtype)
    criterion.to(device=device, dtype=dtype)

    optimizer = torch.optim.SGD(model.parameters(), args.learning_rate, momentum=args.momentum, weight_decay=args.decay,
                                nesterov=True)
    if args.find_clr:
        find_bounds_clr(model, train_loader, optimizer, criterion, device, dtype, min_lr=args.min_lr,
                        max_lr=args.max_lr, step_size=args.epochs_per_step * len(train_loader), mode=args.mode,
                        save_path=save_path)
        return

    if args.clr:
        scheduler = CyclicLR(optimizer, base_lr=args.min_lr, max_lr=args.max_lr,
                             step_size=args.epochs_per_step * len(train_loader), mode=args.mode)
    else:
        scheduler = MultiStepLR(optimizer, milestones=args.schedule, gamma=args.gamma)

    best_test = 0

    # optionally resume from a checkpoint
    data = None
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location=device)
            args.start_epoch = checkpoint['epoch'] - 1
            best_test = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        elif os.path.isdir(args.resume):
            checkpoint_path = os.path.join(args.resume, 'checkpoint.pth.tar')
            csv_path = os.path.join(args.resume, 'results.csv')
            print("=> loading checkpoint '{}'".format(checkpoint_path))
            checkpoint = torch.load(checkpoint_path, map_location=device)
            args.start_epoch = checkpoint['epoch'] - 1
            best_test = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(checkpoint_path, checkpoint['epoch']))
            data = []
            with open(csv_path) as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    data.append(row)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if args.evaluate:
        loss, top1, top5 = test(model, val_loader, criterion, device, dtype)  # TODO
        return

    csv_logger = CsvLogger(filepath=save_path, data=data)
    csv_logger.save_params(sys.argv, args)

    claimed_acc1 = None
    claimed_acc5 = None
    if args.input_size in claimed_acc_top1:
        if args.scaling in claimed_acc_top1[args.input_size]:
            claimed_acc1 = claimed_acc_top1[args.input_size][args.scaling]
            claimed_acc5 = claimed_acc_top5[args.input_size][args.scaling]
            csv_logger.write_text(
                'Claimed accuracies are: {:.2f}% top-1, {:.2f}% top-5'.format(claimed_acc1 * 100., claimed_acc5 * 100.))
    train_network(args.start_epoch, args.epochs, scheduler, model, train_loader, val_loader, optimizer, criterion,
                  device, dtype, args.batch_size, args.log_interval, csv_logger, save_path, claimed_acc1, claimed_acc5,
                  best_test)
Пример #37
0
def read_langs(file_name, max_line=None):
    logging.info(("Reading lines from {}".format(file_name)))
    # Read the file and split into lines
    data = []
    context = ""
    u = None  # u for user; r for response
    r = None
    with open(file_name) as fin:
        cnt_ptr = 0  # number of response words resolved via a pointer into the context
        cnt_voc = 0
        max_r_len = 0
        cnt_lin = 1  # count of dialogue samples
        for line in fin:
            line = line.strip()
            if line:  # an empty line marks the end of a dialogue sample
                nid, line = line.split(' ', 1)
                if '\t' in line:
                    u, r = line.split('\t')
                    context += str(u) + " "
                    # dialogue history for the current response; the response itself is appended for the next turn
                    contex_arr = context.split(' ')[LIMIT:]
                    r_index = []
                    gate = []
                    for key in r.split(' '):
                        index = [
                            loc for loc, val in enumerate(contex_arr)
                            if val == key
                        ]
                        if (index):
                            index = max(index)
                            gate.append(1)
                            cnt_ptr += 1
                        else:
                            index = len(contex_arr) - 1
                            gate.append(0)
                            cnt_voc += 1
                        r_index.append(index)

                    if len(r_index) > max_r_len:
                        max_r_len = len(r_index)
                    # TODO: why this way ???
                    data.append(
                        [" ".join(contex_arr) + "$$$$", r, r_index, gate])
                    context += str(r) + " "
                else:
                    r = line
                    if USEKB:
                        context += str(r) + " "
            else:
                cnt_lin += 1
                if (max_line and cnt_lin >= max_line):
                    break
                context = ""
    max_len = max([len(d[0].split(' ')) for d in data])
    avg_len = sum([len(d[0].split(' ')) for d in data]) / float(
        len([len(d[0].split(' ')) for d in data]))
    logging.info("Pointer percentace= {} ".format(cnt_ptr /
                                                  (cnt_ptr + cnt_voc)))
    logging.info("Max responce Len: {}".format(max_r_len))
    logging.info("Max Input Len: {}".format(max_len))
    logging.info("AVG Input Len: {}".format(avg_len))
    return data, max_len, max_r_len