Example No. 1
def test_zip():
    """Test contrib.tzip"""
    with closing(StringIO()) as our_file:
        a = range(9)
        b = [i + 1 for i in a]
        if sys.version_info[:1] < (3,):
            assert tzip(a, b, file=our_file) == zip(a, b)
        else:
            gen = tzip(a, b, file=our_file)
            assert gen != list(zip(a, b))
            assert list(gen) == list(zip(a, b))
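
For reference, tzip from tqdm.contrib behaves like the built-in zip while drawing a tqdm progress bar over the iteration; a minimal standalone sketch (not taken from the test above):

from tqdm.contrib import tzip

# yields the same pairs as zip(...) would, but renders a progress bar while iterating
for left, right in tzip(range(3), "abc"):
    print(left, right)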
Example No. 2
def create_mosaic(image_list, flows):
    assert len(image_list) > 0, "List is empty."

    _, hm_h, hm_w = heatmap_for_image(image_list[0]).shape
    img_w, img_h = Image.open(image_list[0]).size

    flows = flows * (hm_w / img_w, hm_h / img_h)
    acc_flows = np.cumsum(flows, axis=0)
    acc_flows = acc_flows.round().astype(int)

    min_x, min_y = acc_flows.min(axis=0)
    max_x, max_y = acc_flows.max(axis=0) + (hm_w, hm_h) - 1

    acc_flows -= (min_x, min_y)

    mos_h, mos_w = (max_y - min_y + 1, max_x - min_x + 1)
    hms_mosaic = np.zeros((2, mos_h, mos_w), dtype=float)
    pms_mosaic = np.zeros((1, mos_h, mos_w), dtype=float)
    emb_mosaic = np.zeros((2, mos_h, mos_w), dtype=float)
    off_mosaic = np.zeros((2, mos_h, mos_w), dtype=float)
    mask = np.full((mos_h, mos_w), np.finfo(float).tiny)
    annotations = []

    for image, (tx, ty) in tzip(image_list, acc_flows):
        heatmaps, part_heatmaps, offsets, embeddings = mat_data_for_image(
            image)
        hms_mosaic[:, ty:ty + hm_h, tx:tx + hm_w] += heatmaps
        pms_mosaic[:, ty:ty + hm_h, tx:tx + hm_w] += part_heatmaps
        emb_mosaic[:, ty:ty + hm_h, tx:tx + hm_w] += embeddings
        off_mosaic[:, ty:ty + hm_h, tx:tx + hm_w] += offsets
        mask[ty:ty + hm_h, tx:tx + hm_w] += 1.0

        if (locations := annotation_for_image(image)) is not None:
            locations = locations * (hm_w / img_w, hm_h / img_h) + (tx, ty)
            annotations.extend(locations)
Example No. 3
def ParseFiles(group, files, invalid):

    Cols = [
        '系所代碼', '准考證號碼', '姓名', '學校', '一上班排百分', '一下班排百分', '二上班排百分', '二下班排百分',
        '三上班排百分', '一上組排百分', '一下組排百分', '二上組排百分', '二下組排百分', '三上組排百分', '一上校排百分',
        '一下校排百分', '二上校排百分', '二下校排百分', '三上校排百分', 'ParserInfo', 'FilePath'
    ]
    L = []
    for g, f in tzip(group, files):
        with open(f, 'rb') as ff:
            pdf = pdftotext.PDF(ff)
        try:
            L.append(getattr(SSP, f'Parser_{g}').parse_info(pdf, f) + [f])
        except Exception as e:
            print(f, g, e)
            L.append(
                list(SSP.ScoreSheetParser.parse_generic_info(pdf, f)) +
                [np.nan] * 16 +
                [f"Parser {g} failed! Or it's a new pattern!? ", f])
    for f in invalid:
        t = f.split('/')
        L.append([t[1], t[2]] + [np.nan] * 17 + ['Raw File Not Exist!!', f])
    data = pd.DataFrame(L)
    data.columns = Cols
    data.to_csv('Out.csv', index=False)
Example No. 4
def print_out_bleu_and_meteor_score(predicted_path, expected_path):

    scores = [('BLEU SCORE-1: ', []), ('BLEU SCORE-2: ', []),
              ('BLEU SCORE-3: ', []), ('BLEU SCORE-4: ', []),
              ('METEOR SCORE: ', [])]

    with open(predicted_path, 'r') as fp_pred, open(expected_path,
                                                    'r') as fp_exp:
        for prediction, expected in tzip(fp_pred, fp_exp):
            # strip the trailing newline before tokenizing
            prediction = prediction.strip().split(' ')
            expected_list = expected.strip().split(' ')

            # NLTK's sentence_bleu/meteor_score take (references, hypothesis):
            # references is a list of reference token lists, the hypothesis a token list.
            scores[0][1].append(
                sentence_bleu([expected_list], prediction, weights=(1, 0, 0, 0)))
            scores[1][1].append(
                sentence_bleu([expected_list], prediction, weights=(0, 1, 0, 0)))
            scores[2][1].append(
                sentence_bleu([expected_list], prediction, weights=(0, 0, 1, 0)))
            scores[3][1].append(
                sentence_bleu([expected_list], prediction, weights=(0, 0, 0, 1)))
            scores[4][1].append(meteor_score([expected_list], prediction))

    for score in scores:
        print(score[0] + str(sum(score[1]) / len(score[1])))

    return 0
Example No. 5
def sphere_log(data, scales=range(5, 9, 1), anisotropy_factor=5.0):
    data = asarray(data)
    scales = asarray(scales)

    log = empty((len(scales), ) + data.shape, dtype=data.dtype)
    for slog, scale in tzip(log, scales):
        slog[...] = scale**2 * gaussian_laplace(
            data, asarray([scale / anisotropy_factor, scale, scale]))
    peaks = local_minima(log)  # SZYX

    peaks_subset, peaks_list, threshold = get_peaks_subset(log, peaks, scales)
    return peaks_subset, peaks_list, log, peaks, threshold
Example No. 6
def main(args):
    output_path = set_output_path(args.output_folder)
    detector = set_detector(args.detector)
    epoch_limits = get_epoch_limits(detector)

    #### Load counts and ancillary data from the catalogue
    (
        detids,
        obsids,
        obs_dates,
        det_obs_modes,
        det_rates,
        det_rates_err,
        det_filters,
    ) = load_data(args.sources_table, detector)

    #### Create pseudospectra
    default_rmf_file = set_rmf_file(
        obs_dates[0], det_obs_modes[0], detector, epoch_limits
    )
    spec_channels, spec_energies, spec_quality, spec_grouping = set_default_spectrum(
        default_rmf_file
    )

    for (
        detid,
        obsid,
        obs_date,
        det_obs_mode,
        det_rate,
        det_rate_err,
        det_filter,
    ) in tzip(
        detids, obsids, obs_dates, det_obs_modes, det_rates, det_rates_err, det_filters
    ):
        if not_detected(det_rate):
            continue

        rsp_file = set_rmf_file(obs_date, det_obs_mode, detector, epoch_limits)
        arf_file = set_arf_file(detector, det_filter)
        spec_rate, spec_rate_err = set_spec_count_rates(
            spec_energies, det_rate, det_rate_err
        )

        spec = set_spec_fits(
            spec_channels, spec_rate, spec_rate_err, spec_quality, spec_grouping
        )
        spec = update_spec_fits_header(
            spec, detector, det_filter, rsp_file, arf_file, len(spec_channels)
        )

        save_spec(spec, obsid, detid, detector, output_path)
Example No. 7
    def process(self):
        with open(os.path.join(self.raw_dir, self.name), 'rb') as f:
            data = pickle.load(f)
        self.graphs = []
        self.labels = []
        self.all_seqs = []
        self.max_seq_length = 0
        self.max_node_id = 0
        self.max_num_unique_node = 0

        for sequences, y in tzip(data[0], data[1]):
            i = 0
            nodes = {}  # dict{15: 0, 16: 1, 18: 2, ...}
            senders = []
            unique_nodes = []
            for node in sequences:
                if node not in nodes:
                    nodes[node] = i
                    unique_nodes.append([node])
                    i += 1
                senders.append(nodes[node])
            receivers = senders[:]

            del senders[-1]  # the last item is a receiver
            del receivers[0]  # the first item is a sender
            g = dgl.graph((senders, receivers), num_nodes=len(unique_nodes))
            g.ndata['x'] = torch.tensor(unique_nodes, dtype=torch.long)
            g.edata['w'] = torch.ones(g.num_edges(), dtype=torch.float)

            # print(f"\n{g.nodes()}, {g.edges()}, {g.ndata['x'].squeeze()}")

            self.graphs.append(g)
            self.all_seqs.append(sequences)
            self.labels.append(y)

            if max(sequences) > self.max_node_id:
                self.max_node_id = max(sequences)

            if len(unique_nodes) > self.max_num_unique_node:
                self.max_num_unique_node = len(unique_nodes)

            if len(sequences) > self.max_seq_length:
                self.max_seq_length = len(sequences)

        # Convert the label list to tensor for saving.
        self.num_graphs = len(self.graphs)
        self.num_labels = len(self.labels)
        self.max_labels = max(self.labels)
        self.labels = torch.LongTensor(self.labels)
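
To make the session-to-graph conversion above easier to follow, here is a tiny standalone sketch of the same node-relabelling idea; the click sequence is made up, and only dgl and torch are assumed to be installed:

import dgl
import torch

seq = [15, 16, 15, 18]                 # hypothetical click sequence
nodes = {}                             # original id -> consecutive index, e.g. {15: 0, 16: 1, 18: 2}
senders = []
for node in seq:
    if node not in nodes:
        nodes[node] = len(nodes)
    senders.append(nodes[node])
receivers = senders[1:]                # each click points to its successor
senders = senders[:-1]
g = dgl.graph((senders, receivers), num_nodes=len(nodes))
g.ndata['x'] = torch.tensor([[n] for n in nodes], dtype=torch.long)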
Example No. 8
def run_vis_completion():
    def concat_tile(im_list_2d):
        return cv2.vconcat(
            [cv2.hconcat(im_list_h) for im_list_h in im_list_2d])

    def scale_to_height(img, height):
        """幅が指定した値になるように、アスペクト比を固定して、リサイズする。
        """
        h, w = img.shape[:2]
        width = round(w * (height / h))
        dst = cv2.resize(img, dsize=(width, height))

        return dst

    dir_path = pathlib.Path('./sample_code/sample_images/input')
    dpaths = sorted(list(dir_path.glob('./*.png')))
    cpaths = sorted(list(dir_path.glob('./*.jpg')))
    print(dpaths)
    vis_list = list()
    for dpath, cpath in tzip(dpaths, cpaths):
        color = cv2.imread(str(cpath), cv2.IMREAD_COLOR)
        depth = cv2.imread(str(dpath), cv2.IMREAD_ANYDEPTH)
        depth_c = one_image_completion(str(dpath))

        depth_vis = cv2.convertScaleAbs(np.repeat(depth[:, :, None], 3,
                                                  axis=2),
                                        alpha=(255.0 / 65535.0))
        depth_c_vis = cv2.convertScaleAbs(np.repeat(depth_c[:, :, None],
                                                    3,
                                                    axis=2),
                                          alpha=(255.0 / 65535.0))
        vis = cv2.vconcat([color, depth_vis, depth_c_vis])
        vis = scale_to_height(vis, 800)
        vis_list.append(vis)

        cv2.imshow('out', vis)
        cv2.waitKey(100)

    dt_now = datetime.datetime.now()
    dt_str = dt_now.strftime('%Y-%m-%d-%H')

    vis_all = cv2.hconcat(vis_list)
    cv2.imwrite('./sample_code/sample_images/output/' + dt_str + '.png',
                vis_all)
    cv2.imshow('vis all', vis_all)
    cv2.waitKey(0)
Example No. 9
    def recommend(self, userlist_path, out_path):
        model = self._get_model()
        user_list = np.load(
            open(conf.root + 'res/user_list_' + str(self.max_len) + '.npy',
                 'rb'))
        with open(out_path, 'w') as fout:
            users = [u.strip() for u in open(userlist_path)]
            seens = self._get_seens(users)  # positive list
            test_X = self._get_articles(users)
            mp_list = self.mp.get_recommend_list(
                seens, topn=self.topn
            )  # how to handle user cold start? => for now, fall back to the most-popular list

            for user, articles in tzip(users, test_X):
                recommend = []
                left = []
                if len(articles) == 0:  # no reading history yet -> use most popular (MP)
                    recommend = mp_list
                else:  # has reading history -> use KNN over similar users
                    pred = model.kneighbors([articles])
                    sim_users = pred[1][0]
                    dist = pred[0][0]
                    for i, u in enumerate(sim_users):
                        if dist[i] != 0:
                            recommend += self.dictionary[user_list[u]]

                if user in self.dictionary.keys():
                    recommend = list(
                        set(recommend) - set(self.dictionary[user]))
                else:
                    recommend = list(set(recommend))

        #         np.random.shuffle(recommend)

                if len(recommend) < 100:
                    recommend += mp_list
                recommend = list(set(recommend))

                fout.write('%s %s\n' % (user, ' '.join(recommend[:100])))
Example No. 10
def main(args):
    spec_folder = Path(args.spec_folder)
    results_folder = Path(args.results_folder)
    lastsource_file = Path(args.file_lastsource)

    first_source = get_last_source_fit(lastsource_file)
    if first_source == 0:
        check_results_folder(results_folder)

    obsids, detids, redshifts, nhgals = get_sources_data(
        args.sources_table, args.racol, args.deccol, args.zcol, args.nhcol,
        first_source)

    for obsid, detid, z, nh, current_source in tzip(obsids, detids,
                                                    redshifts, nhgals,
                                                    count(first_source)):
        try:
            fit_detection(z, nh, obsid, detid, results_folder, spec_folder,
                          args.fixgamma)
            update_last_source_fit(current_source + 1, lastsource_file)
        except Exception as e:
            logging.error(e)
            logging.error(f"Something went wrong fitting detection {detid}")
Example No. 11
def create_panorama(image_list, flows, image_size=(1024, 1024)):
    assert len(image_list) > 0, "List is empty."

    img_w, img_h = Image.open(image_list[0]).size
    hm_w, hm_h = image_size

    flows = flows * (hm_w / img_w, hm_h / img_h)
    flows = flows.round().astype(int)
    acc_flows = np.cumsum(flows, axis=0)
    acc_flows = acc_flows.astype(int)

    min_x, min_y = acc_flows.min(axis=0)
    max_x, max_y = acc_flows.max(axis=0) + (hm_w, hm_h) - 1

    acc_flows -= (min_x, min_y)

    shape = (max_y - min_y + 1, max_x - min_x + 1, 3)
    panorama = np.zeros(shape, dtype=np.uint8)

    first_img = read_image(image_list[0],
                           reshape_size=(hm_w, hm_h))[:, :int(hm_w / 2)]
    tx, ty = acc_flows[0]
    panorama[ty:ty + hm_h, tx:tx + int(hm_w / 2)] = first_img

    last_img = read_image(image_list[-1],
                          reshape_size=(hm_w, hm_h))[:, int(hm_w / 2):]
    tx, ty = acc_flows[-1]
    panorama[ty:ty + hm_h, tx + int(hm_w / 2):tx + hm_w] = last_img

    for image, (tx, ty), (dx, _) in tzip(image_list, acc_flows, flows):
        img = read_image(
            image, reshape_size=image_size)[:, hm_w - dx - int(hm_w / 2):hm_w -
                                            int(hm_w / 2)]
        panorama[ty:ty + hm_h, tx + hm_w - dx - int(hm_w / 2):tx + hm_w -
                 int(hm_w / 2)] = img

    return panorama
Example No. 12
def zipper(iterable1: Iterable, iterable2: Iterable, verbose: bool, **kwargs):
    if not verbose:
        return zip(iterable1, iterable2)

    return tzip(iterable1, iterable2, **kwargs)
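
A hypothetical call site for the wrapper above (records, labels and the desc string are illustrative, not part of the original code); extra keyword arguments are forwarded to tzip:

records = ["a", "b", "c"]
labels = [1, 2, 3]

# verbose=True shows a tqdm progress bar; verbose=False degrades to a plain zip
for record, label in zipper(records, labels, verbose=True, desc="pairing"):
    print(record, label)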
Example No. 13
def create_csv(
    orig_tsv_file,
    csv_file,
    data_folder,
    accented_letters=False,
    language="en",
):
    """
    Creates the csv file from the Common Voice tsv annotation file.

    Arguments
    ---------
    orig_tsv_file : str
        Path to the Common Voice tsv file (standard file).
    data_folder : str
        Path of the CommonVoice dataset.
    accented_letters : bool, optional
        Defines if accented letters will be kept as individual letters or
        transformed to the closest non-accented letters.

    Returns
    -------
    None
    """

    # Check if the given files exists
    if not os.path.isfile(orig_tsv_file):
        msg = "\t%s doesn't exist, verify your dataset!" % (orig_tsv_file)
        logger.info(msg)
        raise FileNotFoundError(msg)

    # We load and skip the header
    with open(orig_tsv_file, "r") as tsv_f:
        loaded_csv = tsv_f.readlines()[1:]
    nb_samples = len(loaded_csv)

    msg = "Preparing CSV files for %s samples ..." % (str(nb_samples))
    logger.info(msg)

    # Adding some Prints
    msg = "Creating csv lists in %s ..." % (csv_file)
    logger.info(msg)

    csv_lines = [[
        "ID",
        "duration",
        "wav",
        "wav_format",
        "wav_opts",
        "spk_id",
        "spk_id_format",
        "spk_id_opts",
        "wrd",
        "wrd_format",
        "wrd_opts",
        "char",
        "char_format",
        "char_opts",
    ]]

    # Start processing lines
    total_duration = 0.0
    for line in tzip(loaded_csv):

        line = line[0]

        # Path is at indice 1 in Common Voice tsv files. And .mp3 files
        # are located in datasets/lang/clips/
        mp3_path = data_folder + "/clips/" + line.split("\t")[1]
        file_name = mp3_path.split(".")[-2].split("/")[-1]
        spk_id = line.split("\t")[0]
        snt_id = file_name

        # Reading the signal (to retrieve duration in seconds)
        if os.path.isfile(mp3_path):
            info = torchaudio.info(mp3_path)
        else:
            msg = "\tError loading: %s" % (str(len(file_name)))
            logger.info(msg)
            continue

        duration = info.num_frames / info.sample_rate
        total_duration += duration

        # Getting transcript
        words = line.split("\t")[2]

        # !! Language specific cleaning !!
        # Important: feel free to specify the text normalization
        # corresponding to your alphabet.

        if language in ["en", "fr", "it", "rw"]:
            words = re.sub("[^'A-Za-z0-9À-ÖØ-öø-ÿЀ-ӿ]+", " ", words).upper()
        elif language == "ar":
            HAMZA = "\u0621"
            ALEF_MADDA = "\u0622"
            ALEF_HAMZA_ABOVE = "\u0623"
            letters = ("ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ" + HAMZA +
                       ALEF_MADDA + ALEF_HAMZA_ABOVE)
            words = re.sub("[^" + letters + "]+", " ", words).upper()

        # Remove accents if specified
        if not accented_letters:
            nfkd_form = unicodedata.normalize("NFKD", words)
            words = "".join(
                [c for c in nfkd_form if not unicodedata.combining(c)])
            words = words.replace("'", " ")

        # Remove multiple spaces
        words = re.sub(" +", " ", words)

        # Remove spaces at the beginning and the end of the sentence
        words = words.lstrip().rstrip()

        # Getting chars
        chars = words.replace(" ", "_")
        chars = " ".join([char for char in chars][:])

        # Remove too short sentences (or empty):
        if len(words) < 3:
            continue

        # Composition of the csv_line
        csv_line = [
            snt_id,
            str(duration),
            mp3_path,
            "wav",
            "",
            spk_id,
            "string",
            "",
            str(words),
            "string",
            "",
            str(chars),
            "string",
            "",
        ]

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

    # Writing the csv lines
    with open(csv_file, mode="w", encoding="utf-8") as csv_f:
        csv_writer = csv.writer(csv_f,
                                delimiter=",",
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)

        for line in csv_lines:
            csv_writer.writerow(line)

    # Final prints
    msg = "%s sucessfully created!" % (csv_file)
    logger.info(msg)
    msg = "Number of samples: %s " % (str(len(loaded_csv)))
    logger.info(msg)
    msg = "Total duration: %s Hours" % (str(round(total_duration / 3600, 2)))
    logger.info(msg)
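
Note that calling tzip with a single iterable, as in tzip(loaded_csv) above, yields 1-tuples, which is why the loop body starts with line = line[0]; a minimal illustration:

from tqdm.contrib import tzip

for row in tzip(["first\tline", "second\tline"]):
    line = row[0]          # each row is a 1-tuple, e.g. ("first\tline",)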
Example No. 14
train_hist['total_time'] = []
print('training start!')
start_time = time.time()
real = torch.ones(args.batch_size, 1, args.input_size // 4,
                  args.input_size // 4).to(device)
fake = torch.zeros(args.batch_size, 1, args.input_size // 4,
                   args.input_size // 4).to(device)
for epoch in range(args.train_epoch):
    epoch_start_time = time.time()
    G_decoder.train()
    G_scheduler.step()
    D_scheduler.step()
    Disc_losses = []
    Gen_losses = []
    Con_losses = []
    for (x, _), (y, _) in tzip(train_loader_src, train_loader_tgt):
        e = y[:, :, :, args.input_size:]
        y = y[:, :, :, :args.input_size]
        x, y, e = x.to(device), y.to(device), e.to(device)

        # train D
        D_optimizer.zero_grad()

        D_real = D(y)
        D_real_loss = BCE_loss(D_real, real)

        G_ = G_decoder(x)[0]
        D_fake = D(G_)
        D_fake_loss = BCE_loss(D_fake, fake)

        D_edge = D(e)
Example No. 15
def run(
    dir_MRI="data/ALFA_PET",
    dir_PET="data/ALFA_PET",
    dir_RR="data/Atlas/CL_2mm",
    outfile="data/ALFA_PET/Quant_realigned.csv",
    glob_PET="*_PET.nii.gz",
    glob_MRI="*_MRI.nii.gz",
):
    """
    Args:
      dir_MRI (str or Path): MRI directory
      dir_PET (str or Path): PET directory
      dir_RR (str or Path): Reference regions ROIs directory
        (standard Centiloid RR from GAAIN Centiloid website: 2mm, nifti)
      outfile (str or Path): Output quantification file
    Returns:
      fname (list[str])
      greyCerebellum (list[float])
      wholeCerebellum (list[float])
      wholeCerebellumBrainStem (list[float])
      pons (list[float])
    """
    # PET & MR images lists
    s_PET_dir = list(tmap(gunzip, Path(dir_PET).glob(glob_PET), leave=False))
    s_MRI_dir = list(tmap(gunzip, Path(dir_MRI).glob(glob_MRI), leave=False))
    if len(s_PET_dir) != len(s_MRI_dir):
        raise IndexError("Different number of PET and MR images")

    eng = get_matlab()
    dir_spm = fspath(Path(eng.which("spm")).parent)

    for d_PET, d_MRI in tzip(s_PET_dir, s_MRI_dir):
        with tic("Step 0: Reorient PET subject"):
            eng.f_acpcReorientation(d_PET, nargout=0)

        with tic("Step 0: Reorient MRI subject"):
            eng.f_acpcReorientation(d_MRI, nargout=0)

        with tic("Step 1: CorregisterEstimate"):
            eng.f_1CorregisterEstimate(d_MRI, dir_spm, nargout=0)
        # Check Reg

        with tic("Step 2: CorregisterEstimate"):
            eng.f_2CorregisterEstimate(d_MRI, d_PET, nargout=0)
        # Check Reg

        with tic("Step 3: Segment"):
            eng.f_3Segment(d_MRI, dir_spm, nargout=0)

        with tic("Step 4: Normalise"):
            d_file_norm = fspath(
                Path(d_MRI).parent / ("y_" + Path(d_MRI).name))
            eng.f_4Normalise(d_file_norm, d_MRI, d_PET, nargout=0)

    s_PET = list(
        map(
            fspath,
            Path(dir_PET).glob("w" + (glob_PET[:-3] if glob_PET.lower().
                                      endswith(".gz") else glob_PET))))
    res = eng.f_Quant_centiloid(s_PET, fspath(dir_RR), nargout=5)
    if outfile:
        with open(outfile, "w") as fd:
            f = csv_writer(fd)
            f.writerow(("Fname", "GreyCerebellum", "WholeCerebellum",
                        "WholeCerebellumBrainStem", "Pons"))
            f.writerows(zip(*res))
    return res
Example No. 16
    def applySmartBatching(self, data, mask, target= None, index= None, text= "Iteration:"):
        data = np.stack(data)
        mask = np.stack(mask)
        if (target is None) == (index is None):
            logging.warning("Provide exactly one of target or index.")

        def getArrayLength(x):
            return sum(x != 0)

        length_array = np.apply_along_axis(getArrayLength, np.stack(data).ndim - 1, np.stack(data))
        while length_array.ndim > 1:
            length_array = np.max(length_array, axis=1)
        sort_idx = length_array.argsort()
        length_array = length_array[sort_idx]
        data = data[sort_idx]
        mask = mask[sort_idx]
        if target is not None and index is None:
            target = target[sort_idx]
        elif index is not None and target is None:
            index = index[sort_idx]
        else:
            logging.warning("Provide exactly one of target or index.")

        data_batch = list()
        mask_batch = list()
        if target is not None and index is None:
            target_batch = list()
        elif index is not None and target is None:
            index_batch = list()
        else:
            logging.warning("Provide exactly one of target or index.")

        pbar = tqdm(total=len(data), desc="Apply dynamic batching")
        while len(data) > 0:
            to_take = min(self.train_batchSize, len(data))
            select = random.randint(0, len(data) - to_take)
            max_batch_len = max(length_array[select:select + to_take])
            data_batch += [torch.tensor(data[select:select + to_take][..., :max_batch_len], dtype=torch.long)]
            mask_batch += [torch.tensor(mask[select:select + to_take][..., :max_batch_len], dtype=torch.long)]
            if target is not None and index is None:
                target_batch += [torch.tensor(target[select:select + to_take], dtype=torch.long)]
            elif index is not None and target is None:
                index_batch += [torch.tensor(index[select:select + to_take], dtype=torch.long)]
            else:
                logging.error("Provide exactly one of target or index.")
            length_array = np.delete(length_array, np.s_[select:select + to_take], 0)
            data = np.delete(data, np.s_[select:select + to_take], 0)
            mask = np.delete(mask, np.s_[select:select + to_take], 0)
            if target is not None and index is None:
                target = np.delete(target, np.s_[select:select + to_take], 0)
            elif index is not None and target is None:
                index = np.delete(index, np.s_[select:select + to_take], 0)
            else:
                logging.warning("Provide exactly one of target or index.")
            pbar.update(to_take)
        pbar.close()
        if target is not None and index is None:
            return tzip(data_batch, mask_batch, target_batch, desc=text)
        elif index is not None and target is None:
            return tzip(data_batch, mask_batch, index_batch, desc=text)
        else:
            return tzip(data_batch, mask_batch, desc=text)
Example No. 17
def preprocess_data_test(
        ticker,
        column_scaler,
        n_steps=50,
        lookup_step=1,
        feature_columns=['low', 'high', 'open', 'volume', 'hour', 'minute']):
    total_df, dfs = load_data(train=False)
    # df = dfs[0]

    # this will contain all the elements we want to return from this function
    result = dict()
    result["column_scaler"] = column_scaler

    # we will also return the original dataframe itself
    result['total_df'] = total_df.copy()
    # result['dfs'] = [df.copy() for df in dfs]
    #
    # # # make sure that the passed feature_columns exist in the dataframe
    # # for col in feature_columns:
    # #     assert col in df.columns, f"'{col}' does not exist in the dataframe."
    # #
    # # # add date as a column
    # # if "date" not in df.columns:
    # #     df["date"] = df.index

    # # if scale:
    # total_df['low'] = np.log(total_df['low'])
    # total_df['high'] = np.log(total_df['high'])
    # total_df['open'] = np.log(total_df['open'])
    # total_df['close'] = np.log(total_df['close'])
    # total_df['volume'] = np.log(total_df['volume'])
    #
    # column_scaler = {}
    # # scale the data (prices) from 0 to 1
    # for column in feature_columns + ['close']:
    #     scaler = column_scaler[column]
    #     total_df[column] = scaler.fit_transform(np.expand_dims(total_df[column].values, axis=1))
    #
    # # add the MinMaxScaler instances to the result returned
    # result["column_scaler"] = column_scaler

    # all_sequence_data = list()
    print('Creating sequences...')
    # with tqdm(total=len(total_df)) as t:
    #     for df in dfs:
    total_df['low'] = np.log(total_df['low'])
    total_df['high'] = np.log(total_df['high'])
    total_df['open'] = np.log(total_df['open'])
    total_df['close'] = np.log(total_df['close'])
    total_df['volume'] = np.log(total_df['volume'])

    for column in feature_columns + ['close']:
        scaler = column_scaler[column]
        total_df[column] = scaler.transform(
            np.expand_dims(total_df[column].values, axis=1))

    # add the target column (label) by shifting by `lookup_step`
    total_df['future'] = total_df['close'].shift(-lookup_step)
    del total_df['close']

    # # last `lookup_step` columns contains NaN in future column
    # # get them before dropping NaNs
    # last_sequence = np.array(total_df[feature_columns].tail(lookup_step))

    # drop NaNs
    total_df.dropna(inplace=True)

    sequence_data = list()
    sequences = deque(maxlen=n_steps)

    for entry, target in tzip(total_df[feature_columns + ['time']].values,
                              total_df['future'].values):
        sequences.append(entry)
        if len(sequences) == n_steps:
            sequence_data.append([np.array(sequences), target])

    # all_sequence_data.extend(sequence_data)

    # # get the last sequence by appending the last `n_step` sequence with `lookup_step` sequence
    # # for instance, if n_steps=50 and lookup_step=10, last_sequence should be of 60 (that is 50+10) length
    # # this last_sequence will be used to predict future stock prices that are not available in the dataset
    # last_sequence = list([s[:len(feature_columns)] for s in sequences]) + list(last_sequence)
    # last_sequence = np.array(last_sequence).astype(np.float32)
    # # add to result
    # result['last_sequence'] = last_sequence

    # construct the X's and y's
    X, y = list(), list()
    print('Appending...')
    for seq, target in tqdm(sequence_data):
        X.append(seq)
        y.append(target)

    # convert to numpy arrays
    X = np.array(X)
    y = np.array(y)

    result['X'] = X
    result['y'] = y
    # if split_by_date:
    #     # split the dataset into training & testing sets by date (not randomly splitting)
    #     train_samples = int((1 - test_size) * len(X))
    #     result["X_train"] = X[:train_samples]
    #     result["y_train"] = y[:train_samples]
    #     result["X_test"] = X[train_samples:]
    #     result["y_test"] = y[train_samples:]
    #     if shuffle:
    #         # shuffle the datasets for training (if shuffle parameter is set)
    #         shuffle_in_unison(result["X_train"], result["y_train"])
    #         shuffle_in_unison(result["X_test"], result["y_test"])
    # else:
    #     # split the dataset randomly
    # #     result["X_train"], result["X_test"], result["y_train"], result["y_test"] = train_test_split(X, y,
    #                                                                                                 test_size=test_size,
    #                                                                                     shuffle=shuffle)

    # # # get the list of test set dates
    # dates = result["X_test"][:, -1, -1]
    # result['dates'] = result['X_test'][:, :, -1]
    # # [result['total_df'][result['total_df']['time'].isin(dates)] for dates in result['X_test'][:, :, -1]]
    # # # retrieve test features from the original dataframe
    # # result["test_df"] = result["df"].loc[dates]
    print('Creating test df...')
    dates = result["X"][:, -1, -1]
    result["test_df"] = result["total_df"][result["total_df"]['time'].isin(
        dates)]
    # result["test_df"] = result["total_df"][result["total_df"]['time'].isin(dates)]
    # # # remove duplicated dates in the testing dataframe
    # # result["test_df"] = result["test_df"][~result["test_df"].index.duplicated(keep='first')]
    # # remove dates from the training/testing sets & convert to float32
    result["X"] = result["X"][:, :, :len(feature_columns)].astype(np.float32)

    return result
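
The deque(maxlen=n_steps) pattern above is what produces the overlapping training windows; a small standalone illustration of that idiom (the numbers are made up):

from collections import deque

window = deque(maxlen=3)
for value in range(6):
    window.append(value)
    if len(window) == 3:            # a full window is available from here on
        print(list(window))         # [0, 1, 2], then [1, 2, 3], ..., up to [3, 4, 5]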
Example No. 18
def train(args, global_model, raw_data_train, raw_data_test):
    start_time = time.time()
    user_list = list(raw_data_train[2].keys())[:100]
    nusers = len(user_list)
    cluster_models = [copy.deepcopy(global_model)]
    del global_model
    cluster_models[0].to(device)
    cluster_assignments = [
        user_list.copy()
    ]  # all users assigned to single cluster_model in beginning

    if args.cfl_wsharing:
        shaccumulator = Accumulator()

    if args.frac == -1:
        m = args.cpr
        if m > nusers:
            raise ValueError(
                f"Clients Per Round: {args.cpr} is greater than number of users: {nusers}"
            )
    else:
        m = max(int(args.frac * nusers), 1)
    print(f"Training {m} users each round")
    print(f"Trying to split after every {args.cfl_split_every} rounds")

    train_loss, train_accuracy = [], []
    for epoch in range(args.epochs):
        # CFL
        if (epoch + 1) % args.cfl_split_every == 0:
            all_losses = []
            new_cluster_models, new_cluster_assignments = [], []
            for cidx, (cluster_model, assignments) in enumerate(
                    tzip(cluster_models,
                         cluster_assignments,
                         desc="Try to split each cluster")):
                # First, train all models in cluster
                local_weights = []
                for user in tqdm(assignments,
                                 desc="Train ALL users in the cluster",
                                 leave=False):
                    local_model = LocalUpdate(args=args,
                                              raw_data=raw_data_train,
                                              user=user)
                    w, loss = local_model.update_weights(
                        copy.deepcopy(cluster_model),
                        local_ep_override=args.cfl_local_epochs)
                    local_weights.append(copy.deepcopy(w))
                    all_losses.append(loss)

                # record shared weights so far
                if args.cfl_wsharing:
                    shaccumulator.add(local_weights)

                weight_updates = subtract_weights(local_weights,
                                                  cluster_model.state_dict(),
                                                  args)
                similarities = pairwise_cossim(weight_updates)

                max_norm = compute_max_update_norm(weight_updates)
                mean_norm = compute_mean_update_norm(weight_updates)

                # wandb.log({"mean_norm / eps1": mean_norm, "max_norm / eps2": max_norm}, commit=False)
                split = mean_norm < args.cfl_e1 and max_norm > args.cfl_e2 and len(
                    assignments) > args.cfl_min_size
                print(f"CIDX: {cidx}[{len(assignments)}] elem")
                print(
                    f"mean_norm: {(mean_norm):.4f}; max_norm: {(max_norm):.4f}"
                )
                print(f"split? {split}")
                if split:
                    c1, c2 = cluster_clients(similarities)
                    assignments1 = [assignments[i] for i in c1]
                    assignments2 = [assignments[i] for i in c2]
                    new_cluster_assignments += [assignments1, assignments2]
                    print(
                        f"Cluster[{cidx}][{len(assignments)}] -> ({len(assignments1)}, {len(assignments2)})"
                    )

                    local_weights1 = [local_weights[i] for i in c1]
                    local_weights2 = [local_weights[i] for i in c2]

                    cluster_model.load_state_dict(
                        average_weights(local_weights1))
                    new_cluster_models.append(cluster_model)

                    cluster_model2 = copy.deepcopy(cluster_model)
                    cluster_model2.load_state_dict(
                        average_weights(local_weights2))
                    new_cluster_models.append(cluster_model2)

                else:
                    cluster_model.load_state_dict(
                        average_weights(local_weights))
                    new_cluster_models.append(cluster_model)
                    new_cluster_assignments.append(assignments)

            # Write everything
            cluster_models = new_cluster_models
            if args.cfl_wsharing:
                shaccumulator.write(cluster_models)
                shaccumulator.flush()
            cluster_assignments = new_cluster_assignments
            train_loss.append(sum(all_losses) / len(all_losses))

        # Regular FedAvg
        else:
            all_losses = []

            # Do FedAvg for each cluster
            for cluster_model, assignments in tzip(
                    cluster_models,
                    cluster_assignments,
                    desc="Train each cluster through FedAvg"):
                if args.sample_dist == "uniform":
                    sampled_users = random.sample(assignments, m)
                else:
                    xs = np.linspace(-args.sigm_domain, args.sigm_domain,
                                     len(assignments))
                    sigmdist = 1 / (1 + np.exp(-xs))
                    sampled_users = np.random.choice(assignments,
                                                     m,
                                                     p=sigmdist /
                                                     sigmdist.sum())

                local_weights = []
                for user in tqdm(sampled_users,
                                 desc="Training Selected Users",
                                 leave=False):
                    local_model = LocalUpdate(args=args,
                                              raw_data=raw_data_train,
                                              user=user)
                    w, loss = local_model.update_weights(
                        copy.deepcopy(cluster_model))
                    local_weights.append(copy.deepcopy(w))
                    all_losses.append(loss)

                # update global and shared weights
                if args.cfl_wsharing:
                    shaccumulator.add(local_weights)
                new_cluster_weights = average_weights(local_weights)
                cluster_model.load_state_dict(new_cluster_weights)

            if args.cfl_wsharing:
                shaccumulator.write(cluster_models)
                shaccumulator.flush()
            train_loss.append(sum(all_losses) / len(all_losses))

        # Calculate avg training accuracy over all users at every epoch
        # regardless if it was a CFL step or not
        test_acc, test_loss = [], []
        for cluster_model, assignments in zip(cluster_models,
                                              cluster_assignments):
            for user in assignments:
                local_model = LocalUpdate(args=args,
                                          raw_data=raw_data_test,
                                          user=user)
                acc, loss = local_model.inference(model=cluster_model)
                test_acc.append(acc)
                test_loss.append(loss)
        train_accuracy.append(sum(test_acc) / len(test_acc))

        wandb.log({
            "Train Loss": train_loss[-1],
            "Test Accuracy": (100 * train_accuracy[-1]),
            "Clusters": len(cluster_models)
        })
        print(
            f"Train Loss: {train_loss[-1]:.4f}\t Test Accuracy: {(100 * train_accuracy[-1]):.2f}%"
        )

    print(f"Results after {args.epochs} global rounds of training:")
    print("Avg Train Accuracy: {:.2f}%".format(100 * train_accuracy[-1]))
    print(f"Total Run Time: {(time.time() - start_time):0.4f}")
Example No. 19
fake3 = torch.zeros(args.batch_size, 1, args.input_size // 4,
                    args.input_size // 4).to(device)

for epoch in range(args.train_epoch):
    epoch_start_time = time.time()
    G_e.eval()
    D_scheduler.step()
    D1_scheduler.step()
    D2_scheduler.step()
    D3_scheduler.step()

    Disc_losses = []
    Gen_losses = []
    Con_losses = []
    for (x, _), (y, _), (y1, _), (y2, _), (y3, _) in tzip(
            train_loader_src, train_loader_tgt, train_loader_tgt1,
            train_loader_tgt2, train_loader_tgt3):
        e = y[:, :, :, args.input_size:]
        y = y[:, :, :, :args.input_size]
        x, y, e, y1, y2, y3 = x.to(device), y.to(device), e.to(device), y1.to(
            device), y2.to(device), y3.to(device)

        # train D
        D_optimizer.zero_grad()
        D1_optimizer.zero_grad()
        D2_optimizer.zero_grad()
        D3_optimizer.zero_grad()

        D_real = D(y)
        D_real_loss = BCE_loss(D_real, real)
Example No. 20
    np.array([id_tr[i] for i in l_out_label]).astype(int),
)


if args.spk_utt_all_combinations:

    all_combination = list(itertools.product(u_out, l_out))
    all_combination_label = list(itertools.product(u_out_label, l_out_label))

    u_out = []
    l_out = []

    u_out_label = np.array([])
    l_out_label = np.array([])

    for (u, l), (u_label, l_label) in tzip(all_combination, all_combination_label):
        if u_label != l_label:
            continue
        if len(u_out) == 0:
            u_out = np.array([u])
            l_out = np.array([l])
        else:
            u_out = np.append(u_out, [u], axis=0)
            l_out = np.append(l_out, [l], axis=0)

        u_out_label = np.append(u_out_label, u_label)
        l_out_label = np.append(l_out_label, l_label)

    print("x_vector_l samples after all_combination:", len(l_out))
    print("x_vector_u samples after all_combination:", len(u_out))
Example No. 21
for cam in CAM_SETS:
    CAM_PATH = os.path.join(DATASET_PATH, cam)
    IMG_PATH = os.path.join(CAM_PATH, 'image_2')
    ANN_PATH = os.path.join(CAM_PATH, 'label_2')
    # CALIB_PATH = os.path.join(CAM_PATH, 'calib')
    # CALIB_FILE = os.path.join(CALIB_PATH,'000000.txt')
    # shutil.move(CALIB_FILE, OUT_CALIB_PATH)

    img_list = os.listdir(IMG_PATH)
    img_list.sort(key=lambda x: int(x[:-4]))  # sorting is needed because os.listdir returns entries in arbitrary order
    ann_list = os.listdir(ANN_PATH)
    ann_list.sort(key=lambda x: int(x[:-4]))
    print('moving {} imgs/anns from {} to {}'.format(len(img_list), IMG_PATH, OUT_PATH))
    print('with index start with *{}*'.format(count+1))

    for img, ann in tzip(img_list,ann_list):
        count += 1
        ann_ori_path = os.path.join(ANN_PATH, ann)
        ann_dst_path = os.path.join(OUT_ANN_PATH, '{:06d}.txt'.format(count))
        img_ori_path = os.path.join(IMG_PATH, img)
        img_dst_path = os.path.join(OUT_IMG_PATH, '{:06d}.png'.format(count))
        shutil.move(ann_ori_path, ann_dst_path)
        shutil.move(img_ori_path, img_dst_path)
Example No. 22
def create_csv(wav_list, csv_file):
    """
    Creates the csv file given a list of wav files.

    Arguments
    ---------
    wav_list : list of str
        The list of wav files.
    csv_file : str
        The path of the output json file
    """

    # Adding some Prints
    msg = f"Creating csv lists in {csv_file} ..."
    logger.info(msg)

    csv_lines = []

    # Start processing lines
    total_duration = 0.0

    # Starting index
    idx = 0

    for wav_file in tzip(wav_list):
        wav_file = wav_file[0]

        path_parts = wav_file.split(os.path.sep)
        file_name, wav_format = os.path.splitext(path_parts[-1])

        # Peeking at the signal (to retrieve duration in seconds)
        if os.path.isfile(wav_file):
            info = torchaudio.info(wav_file)
        else:
            msg = "\tError loading: %s" % (str(len(file_name)))
            logger.info(msg)
            continue

        audio_duration = info.num_frames / info.sample_rate
        total_duration += audio_duration

        # Actual name of the language
        language = path_parts[-4]

        # Create a row with whole utterences
        csv_line = [
            idx,  # ID
            wav_file,  # File name
            wav_format,  # File format
            str(info.num_frames / info.sample_rate),  # Duration (sec)
            language,  # Language
        ]

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

        # Increment index
        idx += 1

    # CSV column titles
    csv_header = ["ID", "wav", "wav_format", "duration", "language"]

    # Add titles to the list at index 0
    csv_lines.insert(0, csv_header)

    # Writing the csv lines
    with open(csv_file, mode="w", encoding="utf-8") as csv_f:
        csv_writer = csv.writer(csv_f,
                                delimiter=",",
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)

        for line in csv_lines:
            csv_writer.writerow(line)

    # Final prints
    msg = f"{csv_file} sucessfully created!"
    logger.info(msg)
    msg = f"Number of samples: {len(wav_list)}."
    logger.info(msg)
    msg = f"Total duration: {round(total_duration / 3600, 2)} hours."
    logger.info(msg)
Example No. 23
def create_csv(orig_tsv_file,
               csv_file,
               data_folder,
               accented_letters=False,
               language="en"):
    """
    Creates the csv file from the Common Voice tsv annotation file.

    Arguments
    ---------
    orig_tsv_file : str
        Path to the Common Voice tsv file (standard file).
    data_folder : str
        Path of the CommonVoice dataset.
    accented_letters : bool, optional
        Defines if accented letters will be kept as individual letters or
        transformed to the closest non-accented letters.

    Returns
    -------
    None
    """

    # Check if the given files exists
    if not os.path.isfile(orig_tsv_file):
        msg = "\t%s doesn't exist, verify your dataset!" % (orig_tsv_file)
        logger.info(msg)
        raise FileNotFoundError(msg)

    # We load and skip the header
    with open(orig_tsv_file, "r") as tsv_f:
        loaded_csv = tsv_f.readlines()[1:]
    nb_samples = len(loaded_csv)

    msg = "Preparing CSV files for %s samples ..." % (str(nb_samples))
    logger.info(msg)

    # Adding some Prints
    msg = "Creating csv lists in %s ..." % (csv_file)
    logger.info(msg)

    csv_lines = [["ID", "duration", "wav", "spk_id", "wrd"]]

    # Start processing lines
    total_duration = 0.0
    for line in tzip(loaded_csv):

        line = line[0]

        # Path is at indice 1 in Common Voice tsv files. And .mp3 files
        # are located in datasets/lang/clips/
        mp3_path = data_folder + "/clips/" + line.split("\t")[1]
        file_name = mp3_path.split(".")[-2].split("/")[-1]
        spk_id = line.split("\t")[0]
        snt_id = file_name

        # Setting torchaudio backend to sox-io (needed to read mp3 files)
        if torchaudio.get_audio_backend() != "sox_io":
            logger.warning(
                "This recipe needs the sox-io backend of torchaudio")
            logger.warning("The torchaudio backend is changed to sox_io")
            torchaudio.set_audio_backend("sox_io")

        # Reading the signal (to retrieve duration in seconds)
        if os.path.isfile(mp3_path):
            info = torchaudio.info(mp3_path)
        else:
            msg = "\tError loading: %s" % (str(len(file_name)))
            logger.info(msg)
            continue

        duration = info.num_frames / info.sample_rate
        total_duration += duration

        # Getting transcript
        words = line.split("\t")[2]

        # Unicode Normalization
        words = unicode_normalisation(words)

        # !! Language specific cleaning !!
        # Important: feel free to specify the text normalization
        # corresponding to your alphabet.

        if language in ["en", "fr", "it", "rw"]:
            words = re.sub("[^’'A-Za-z0-9À-ÖØ-öø-ÿЀ-ӿéæœâçèàûî]+", " ",
                           words).upper()

        if language == "fr":
            # Replace J'y D'hui etc by J_ D_hui
            words = words.replace("'", " ")
            words = words.replace("’", " ")

        elif language == "ar":
            HAMZA = "\u0621"
            ALEF_MADDA = "\u0622"
            ALEF_HAMZA_ABOVE = "\u0623"
            letters = ("ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ" + HAMZA +
                       ALEF_MADDA + ALEF_HAMZA_ABOVE)
            words = re.sub("[^" + letters + "]+", " ", words).upper()
        elif language == "ga-IE":
            # Irish lower() is complicated, but upper() is nondeterministic, so use lowercase
            def pfxuc(a):
                return len(a) >= 2 and a[0] in "tn" and a[1] in "AEIOUÁÉÍÓÚ"

            def galc(w):
                return w.lower(
                ) if not pfxuc(w) else w[0] + "-" + w[1:].lower()

            words = re.sub("[^-A-Za-z'ÁÉÍÓÚáéíóú]+", " ", words)
            words = " ".join(map(galc, words.split(" ")))

        # Remove accents if specified
        if not accented_letters:
            words = strip_accents(words)
            words = words.replace("'", " ")
            words = words.replace("’", " ")

        # Remove multiple spaces
        words = re.sub(" +", " ", words)

        # Remove spaces at the beginning and the end of the sentence
        words = words.lstrip().rstrip()

        # Getting chars
        chars = words.replace(" ", "_")
        chars = " ".join([char for char in chars][:])

        # Remove too short sentences (or empty):
        if len(words.split(" ")) < 3:
            continue

        # Composition of the csv_line
        csv_line = [snt_id, str(duration), mp3_path, spk_id, str(words)]

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

    # Writing the csv lines
    with open(csv_file, mode="w", encoding="utf-8") as csv_f:
        csv_writer = csv.writer(csv_f,
                                delimiter=",",
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)

        for line in csv_lines:
            csv_writer.writerow(line)

    # Final prints
    msg = "%s successfully created!" % (csv_file)
    logger.info(msg)
    msg = "Number of samples: %s " % (str(len(loaded_csv)))
    logger.info(msg)
    msg = "Total duration: %s Hours" % (str(round(total_duration / 3600, 2)))
    logger.info(msg)
from tqdm.contrib import tenumerate, tzip, tmap
import numpy as np

for _ in tenumerate(range(int(1e6)), desc="builtin enumerate"):
    pass

for _ in tenumerate(np.random.random((999, 999)), desc="numpy.ndenumerate"):
    pass

for _ in tzip(np.arange(1e6), np.arange(1e6) + 1, desc="builtin zip"):
    pass

mapped = tmap(lambda x: x + 1, np.arange(1e6), desc="builtin map")
assert (np.arange(1e6) + 1 == list(mapped)).all()