Example #1
 def __init__(self, Data_dir, stage, ratio=(1, 0, 0), seed=1000):
     random.seed(seed)
     self.Data_dir = Data_dir
     Data_list0 = read_txt('./lookuptxt/', 'ADNI_1.5T_GAN_NL.txt')
     #Data_list1 = read_txt('./lookuptxt/', 'ADNI_1.5T_GAN_MCI.txt')
     Data_list2 = read_txt('./lookuptxt/', 'ADNI_1.5T_GAN_AD.txt')
     Data_list3 = read_txt('./lookuptxt/', 'ADNI_3T_NL.txt')
     #Data_list4 = read_txt('./lookuptxt/', 'ADNI_3T_MCI.txt')
     Data_list5 = read_txt('./lookuptxt/', 'ADNI_3T_AD.txt')
     self.Data_list_lo = Data_list0 + Data_list2
     self.Data_list_hi = Data_list3 + Data_list5
     self.Label_list = [0] * len(Data_list0) + [1] * len(Data_list2)
     self.stage = stage
     self.length = len(self.Data_list_lo)
     self.patchsampler = PatchGenerator(patch_size=47)
     idxs = list(range(self.length))
     random.shuffle(idxs)
     split1, split2 = int(self.length * ratio[0]), int(
         self.length * (ratio[0] + ratio[1]))
     if self.stage == 'train_p':
         self.index_list = idxs[:split1]
     elif self.stage == 'train_w':
         self.index_list = idxs[:split1]
     elif self.stage == 'valid':
         self.index_list = idxs[split1:split2]
     elif self.stage == 'test':
         self.index_list = idxs[split2:]
     elif self.stage == 'all':
         self.index_list = idxs
     else:
         raise ValueError('invalid stage setting')
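Every example on this page delegates file parsing to a project-specific read_txt (or utils.read_txt) helper, and its exact behavior differs between projects: most callers expect a list of lines, while a few (e.g. Example #10) treat the return value as a single string. As a rough mental model only, a minimal sketch of the list-of-lines variant assumed by Examples #1 and #2 could look like the code below; the two-argument signature and the newline handling are assumptions for illustration, not code taken from any of the projects above.

import os


def read_txt(txt_dir, txt_name=None):
    # Minimal sketch of the assumed behavior: return the lines of a text file.
    # Some callers pass (directory, filename), others a single full path; note
    # that several examples strip the trailing '\n' themselves, so the real
    # helpers differ in whether they keep it.
    path = os.path.join(txt_dir, txt_name) if txt_name else txt_dir
    with open(path, 'r', encoding='utf8') as f:
        return [line.rstrip('\n') for line in f]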
Example #2
 def __init__(self,
              Data_dir,
              class1,
              class2,
              stage,
              ratio=(0.6, 0.2, 0.2),
              seed=1000,
              shuffle=True):
     random.seed(seed)
     self.Data_dir = Data_dir
     Data_list0 = read_txt('./lookuptxt/', class1 + '.txt')
     Data_list1 = read_txt('./lookuptxt/', class2 + '.txt')
     self.Data_list = Data_list0 + Data_list1
     self.Label_list = [0] * len(Data_list0) + [1] * len(Data_list1)
     self.stage = stage
     self.length = len(self.Data_list)
     idxs = list(range(self.length))
     if shuffle:
         random.shuffle(idxs)
     split1, split2 = int(self.length * ratio[0]), int(
         self.length * (ratio[0] + ratio[1]))
     if self.stage == 'train':
         self.index_list = idxs[:split1]
     elif self.stage == 'valid':
         self.index_list = idxs[split1:split2]
     elif self.stage == 'test':
         self.index_list = idxs[split2:]
     elif self.stage == 'all':
         self.index_list = idxs
     else:
         raise ValueError('invalid stage setting')
Example #3
def load_test_data(img_shape=(224, 224)):
    class_map = {'normal': 0, 'pneumonia': 1, 'COVID-19': 2}

    dataset_path = Paths.DATASET_BASE_PATH + 'data/test/'
    csv_content = utils.read_txt(Paths.DATASET_BASE_PATH +
                                 'COVID-Net/test_COVIDx.txt')

    _x_test_paths = []
    _y_test = []
    for c in csv_content:
        c = c.split(' ')
        _y_test.append(class_map[c[-1].replace('\n', '')])
        _x_test_paths.append(dataset_path + c[-2])

    dataset_path = '/media/share/pedro/2020-Covid/data/'
    csv_content = utils.read_txt('/media/share/pedro/2020-Covid/data/test.txt')

    for c in csv_content:
        _y_test.append(class_map['COVID-19'])
        _x_test_paths.append(dataset_path + c.replace('\n', ''))

    _y_test = np.asarray(_y_test)
    _x_test = utils.load_images(_x_test_paths, img_shape) / 255.

    return _x_test, _y_test
Example #4
def get_txt_content(ans_txt_path_list: list, pending_txt_dir: str) -> list:
    txt_content = []
    for a_txt_path in ans_txt_path_list:
        txt_name = os.path.basename(a_txt_path)
        p_txt_path = os.path.join(pending_txt_dir, txt_name)
        a_txt_content = utils.read_txt(a_txt_path)
        if os.path.exists(p_txt_path):
            p_txt_content = utils.read_txt(p_txt_path)
            txt_content.append((a_txt_content, p_txt_content))
        else:
            txt_content.append((a_txt_content, []))
            st.warning("{}不存在,请检查".format(a_txt_path))
    return txt_content
Example #5
def load_and_cache_examples(args, tokenizer, logger, mode="train"):
    """SemEval2010Task8 does'not have dev set"""
    assert mode in ["train", "test", "dev"]

    if not os.path.exists(args.data_cache_dir):
        os.mkdir(args.data_cache_dir)

    cached_examples_file = os.path.join(
        args.data_cache_dir,
        "cached_{}_{}_{}_{}".format(args.dataset, mode, args.entity_position_encoding, str(args.max_seq_length)),
    )
    if os.path.exists(cached_examples_file):
        logger.info("Loading features from cached file %s", cached_examples_file)
        examples = torch.load(cached_examples_file)
    else:
        logger.info("Creating features for %s %s set" % (args.dataset, mode))
        if args.dataset == 'kbp37':
            _, train_sentences, train_relations = read_txt(os.path.join(KBP37RawPath, "train.txt"))
            _, dev_sentences, dev_relations = read_txt(os.path.join(KBP37RawPath, "dev.txt"))
            _, test_sentences, test_relations = read_txt(os.path.join(KBP37RawPath, "test.txt"))
            if not args.kbp37_split_dev:
                train_sentences.extend(dev_sentences)
                train_relations.extend(dev_relations)
        else:
            _, train_sentences, train_relations = read_txt(os.path.join(SemEval2010Task8RawPath, "train.txt"))
            _, test_sentences, test_relations = read_txt(os.path.join(SemEval2010Task8RawPath, "test.txt"))
        relation2id_path = KBP37Relation2IdPath if args.dataset == "kbp37" else SemEval2010Relation2IdPath
        if os.path.exists(relation2id_path):
            with open(relation2id_path, 'r', encoding='utf8') as f:
                relation2id = json.load(f)
        else:
            relation2id, _ = static_relations(train_relations)
            with open(relation2id_path, 'w', encoding='utf8') as f:
                json.dump(relation2id, f)
        if mode == 'train':
            sentences, relations = train_sentences, train_relations
        elif mode == 'test':
            sentences, relations = test_sentences, test_relations
        else:
            if args.dataset == 'kbp37':
                sentences, relations = dev_sentences, dev_relations
            else:
                raise ValueError("SemEval2010Task8 does'not have dev set!")
        examples = create_examples_for_xlnet(sentences, relations, tokenizer, relation2id,
                                             args.entity_position_encoding == "entity_tag",
                                             args.entity_position_encoding == "token_type_ids",
                                             args.max_seq_length)
        torch.save(examples, cached_examples_file)
    return examples
Example #6
def _create_dataset(input_dir, filenames, output_path):
  count = 0
  writer = tf.python_io.TFRecordWriter(output_path+'.cache')
  random.shuffle(filenames)
  for i, filename in enumerate(filenames):
    wave_path = input_dir + filename[0]
    txt_path = input_dir + filename[1]
    stem = os.path.splitext(os.path.split(filename[0])[-1])[0]
    wave = utils.read_wave(wave_path)
    text = utils.read_txt(txt_path)
    if len(wave) >= len(text):
      data = tf.train.Example(features=tf.train.Features(feature={
        'uid': tf.train.Feature(bytes_list=tf.train.BytesList(value=[stem.encode('utf-8')])),
        'audio/data': tf.train.Feature(float_list=tf.train.FloatList(value=wave.reshape([-1]).tolist())),
        'audio/shape': tf.train.Feature(int64_list=tf.train.Int64List(value=wave.shape)),
        'text': tf.train.Feature(int64_list=tf.train.Int64List(value=text)),
      }))
      writer.write(data.SerializeToString())
    else:
      glog.error("length of label(%d) is greater than feature(%d) at %s." % (len(text), len(wave), stem))

    count = i + 1
    if count % 50 == 0:
      glog.info('processed %d/%d files.' % (count, len(filenames)))
  if count % 1000 != 0:
    glog.info('processed %d/%d files.' % (count, len(filenames)))
  if os.path.exists(output_path):
    os.remove(output_path)
  if os.path.exists(output_path+'.cache'):
    os.renames(output_path+'.cache', output_path)
Example #7
def run(args):
    doc = read_txt(args.path_to_doc)
    doc_tokens = [
        process_text(entry,
                     lower=not args.cased,
                     remove_stopwords=args.remove_stopwords,
                     remove_punctuation=args.remove_punctuation)
        for entry in doc
    ]

    all_tokens = []
    for entry_tokens in doc_tokens:
        all_tokens += entry_tokens

    rare_tokens, selected_tokens = get_rare_tokens(all_tokens,
                                                   args.min_freq,
                                                   args.max_tokens,
                                                   return_non_rare=True)
    if args.remove_rare:
        doc_tokens = [
            filter_tokens(entry_tokens, set(rare_tokens))
            for entry_tokens in doc_tokens
        ]

    gu = GloVeUtility(args.path_to_glove)

    vectorizer = CountVectorizer(ngram_range=(args.ngram_lower,
                                              args.ngram_upper),
                                 vocabulary=selected_tokens)
    count_vector = vectorizer.fit_transform(
        [" ".join(entry_tokens) for entry_tokens in doc_tokens])

    csr_mat = count_vector.T * count_vector
    csr_mat.setdiag(0)

    cooccur_ar = csr_mat.toarray()

    mittens_model = Mittens(n=gu.d, max_iter=args.iter)
    embeddings = mittens_model.fit(cooccur_ar,
                                   vocab=selected_tokens,
                                   initial_embedding_dict=gu.vector_dict)

    filename = args.path_to_glove.split(os.path.sep)[-1]
    os.makedirs(args.output, exist_ok=True)

    embeddings_dict = dict(zip(selected_tokens, embeddings))
    progress_bar.std_print("\nTrained on {} tokens.".format(
        len(embeddings_dict)))

    if args.save_new_only:
        savepath = os.path.join(args.output, "new_" + filename)
        embeddings_list = [
            " ".join([key] + [str(val) for val in embeddings_dict[key]])
            for key in embeddings_dict
        ]
        write_txt(savepath, embeddings_list)
    else:
        savepath = os.path.join(args.output, filename)
        gu.add_replace_vectors(embeddings_dict)
        gu.save_vectors(savepath)
Example #8
 def __init__(self, root_dir, txt_COVID, txt_NonCOVID, transform=None):
     """
     Args:
         txt_path (string): Path to the txt file with annotations.
         root_dir (string): Directory with all the images.
         transform (callable, optional): Optional transform to be applied
             on a sample.
     File structure:
     - root_dir
         - CT_COVID
             - img1.png
             - img2.png
             - ......
         - CT_NonCOVID
             - img1.png
             - img2.png
             - ......
     """
     self.root_dir = root_dir
     self.txt_path = [txt_COVID, txt_NonCOVID]
     self.classes = ['CT_COVID', 'CT_NonCOVID']
     self.num_cls = len(self.classes)
     self.img_list = []
     for c in range(self.num_cls):
         cls_list = [[
             os.path.join(self.root_dir, self.classes[c], item), c
         ] for item in read_txt(self.txt_path[c])]
         self.img_list += cls_list
     self.transform = transform
     print('samples = ', len(self.img_list))
Example #9
    def __init__(self, logging_groups=['Default', 'Cortex']):
        Cortex.__init__(self, 'SOUL', logging_groups)

        # The following is parsing the defines.txt to get the app specific data
        defines = utils.parse_defines(
            utils.read_txt("../database/cortex/soul.txt"))
        self.delay = eval(defines['delay'])
Example #10
def get_index(filename):
    if os.path.exists(filename):
        # the first line is expected to look like "movie_id,total"
        movie_id, total = map(int, utils.read_txt(filename).split('\n')[0].split(','))
    else:
        movie_id, total = 129406, 0
    print(movie_id, total)
    return [movie_id, total]
Example #11
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--feats",
                        default=None,
                        required=True,
                        help="name of the list of hdf5 files")
    parser.add_argument("--stats",
                        default=None,
                        required=True,
                        help="filename of hdf5 format")

    args = parser.parse_args()

    # read list and define scaler
    filenames = read_txt(args.feats)
    scaler = StandardScaler()
    print("number of training utterances =", len(filenames))

    # process all of the data
    for filename in filenames:
        feat = read_hdf5(filename, "/feat_org")
        scaler.partial_fit(feat[:, 1:])

    # add uv term
    mean = np.zeros((feat.shape[1]))
    scale = np.ones((feat.shape[1]))
    mean[1:] = scaler.mean_
    scale[1:] = scaler.scale_

    # write to hdf5
    write_hdf5(args.stats, "/mean", mean)
    write_hdf5(args.stats, "/scale", scale)
Example #12
def Nonlinear_Test(models):
    print("Load the validation data...")
    start_time = time.time()
    val_imgs, val_idxs = load_val_data(data_dir)
    print("{:.4f} seconds".format(time.time() - start_time))

    del val_imgs

    print("Extract the image features...")
    val_features = np.load('./val_bow.npy')

    print('Test the classifiers...')
    accuracy = 0
    for class_name in category:
        target_idxs = np.array([
            read_txt(os.path.join(data_dir, '{}_val.txt'.format(class_name)))
        ])
        target_labels = get_labels(val_idxs, target_idxs)

        val_accuracy = models[class_name].score(val_features, target_labels)
        print('{} Classifier validation accuracy:  {:.4f}'.format(
            class_name, val_accuracy))
        accuracy += val_accuracy

    del val_features, target_idxs, target_labels

    print('Average validation accuracy: {:.4f}'.format(accuracy /
                                                       len(category)))
Example #13
def Nonlinear_Trainer():
    print("Load the training data...")
    start_time = time.time()
    train_imgs, train_idxs = load_train_data(data_dir)
    del train_imgs
    print("{:.4f} seconds".format(time.time() - start_time))

    print("Extract the image features...")
    train_features = np.load('./train_bow.npy')

    print('Train the classifiers...')
    accuracy = 0
    models = {}

    for class_name in category:
        target_idxs = np.array([
            read_txt(os.path.join(data_dir, '{}_train.txt'.format(class_name)))
        ])
        target_labels = get_labels(train_idxs, target_idxs)

        models[class_name] = nonlinear_classifier(train_features,
                                                  target_labels)
        train_accuracy = models[class_name].score(train_features,
                                                  target_labels)
        print('{} Classifier train accuracy:  {:.4f}'.format(
            class_name, train_accuracy))
        accuracy += train_accuracy

    print('Average train accuracy: {:.4f}'.format(accuracy / len(category)))
    del train_features, target_labels, target_idxs

    return models
Example #14
    def __init__(self, config, mode):
        """
        Args:
            txt_path (string): Path to the txt file with annotations.
            root_dir (string): Directory with all the images.

        File structure:
        - root_dir
            - CT_COVID
                - img1.png
                - img2.png
                - ......
            - CT_NonCOVID
                - img1.png
                - img2.png
                - ......
        """
        self.config = config
        self.root = self.config.dataset.input_data
        if mode == 'train':
            self.txt_path = [train_txt_COVID, train_txt_NonCOVID]
        elif mode == 'val':
            self.txt_path = [val_txt_COVID, val_txt_NonCOVID]
        elif mode == 'test':
            self.txt_path = [test_txt_COVID, test_txt_NonCOVID]
        self.class_dict = {'CT_COVID': 0, 'CT_NonCOVID': 1}
        self.num_cls = len(self.class_dict)
        self.img_list = []
        for c, cls_name in enumerate(self.class_dict):  # class_dict keys are the folder names
            cls_list = [[os.path.join(self.root, cls_name, item), c]
                        for item in read_txt(self.txt_path[c])]
            self.img_list += cls_list

        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        train_transformer = transforms.Compose([
            transforms.Resize(256),
            transforms.RandomResizedCrop((224), scale=(0.8, 1.2)),
            transforms.RandomRotation(15),
            transforms.RandomHorizontalFlip(p=0.1),
            transforms.RandomVerticalFlip(p=0.1),
            transforms.ColorJitter(brightness=0.2,
                                   contrast=0.2,
                                   saturation=0.2,
                                   hue=0.2),
            transforms.ToTensor(), normalize
        ])

        val_transformer = transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(), normalize
        ])
        if mode == 'train':
            self.transform = train_transformer
        else:
            self.transform = val_transformer
        print('samples = ', len(self.img_list))
Example #15
def load_documents():
    txt_list = []
    for document in sys.argv[2:]:
        txt = read_txt(document)
        txt_list.append(txt)

    dic = {'filename': sys.argv[2:], 'raw_text': txt_list}
    return pd.DataFrame.from_dict(dic)
Example #16
def copy_covid_dataset():
    output_path = Paths.DATASET_BASE_PATH + 'data/augmented-orig/'
    dataset_path = Paths.DATASET_BASE_PATH + 'data/train/'

    for c in utils.read_txt(Paths.DATASET_BASE_PATH + 'COVID-Net/train_COVIDx.txt'):
        c = c.split(' ')
        if 'COVID-19' in c[-1]:
            img = cv2.imread(dataset_path + c[-2], cv2.IMREAD_COLOR)
            cv2.imwrite(output_path + c[-2], img)  # write png image
Example #17
def get_index(filename):
    if os.path.exists(filename):
        movie_id, total = map(
            int,
            utils.read_txt(filename).split('\n')[0].split(','))
    else:
        movie_id, total = 129406, 0
    print(movie_id, total)
    return [movie_id, total]
Example #18
def check_if_all_files_exist(data_type):
    csv_content = utils.read_txt(Paths.DATASET_BASE_PATH + 'COVID-Net/{}_COVIDx.txt'.format(data_type))
    imgs_path = Paths.DATASET_BASE_PATH + 'data/{}/'.format(data_type)

    for c in csv_content:
        img_path = c.split(' ')[-2]
        if not os.path.exists(imgs_path + img_path):
            print('The following image was not found [{}].'.format(img_path))
            shutil.move(imgs_path.replace('/test/', '/train/') + img_path, imgs_path + img_path)
Example #19
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--feats",
                        default=None,
                        required=True,
                        type=str,
                        help="name of the list of hdf5 files")
    parser.add_argument("--stats",
                        default=None,
                        required=True,
                        type=str,
                        help="filename of hdf5 format")
    parser.add_argument("--feature_type",
                        default="world",
                        choices=["world", "melspc", "mcep"],
                        type=str,
                        help="feature type")
    parser.add_argument("--verbose",
                        default=1,
                        type=int,
                        help="log message level")

    args = parser.parse_args()

    # set log level
    if args.verbose == 1:
        logging.basicConfig(
            level=logging.INFO,
            format=
            '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
            datefmt='%m/%d/%Y %I:%M:%S')
    elif args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format=
            '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
            datefmt='%m/%d/%Y %I:%M:%S')
    else:
        logging.basicConfig(
            level=logging.WARN,
            format=
            '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
            datefmt='%m/%d/%Y %I:%M:%S')
        logging.warn("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s" % (key, str(value)))

    # read file list
    file_list = read_txt(args.feats)
    logging.info("number of utterances = %d" % len(file_list))

    # calculate statistics
    calc_stats(file_list, args)
Example #20
    def __init__(self, screen, logging_groups=['Default', 'Cortex']):
        Cortex.__init__(self, 'Collector', logging_groups)

        # Current collection platform
        self.platform = GroupMe_Web(screen, logging_groups)

        # The following is parsing the defines.txt to get the app specific data
        defines = utils.parse_defines(
            utils.read_txt("../database/cortex/collector.txt"))
        self.alias_dict = eval(defines['member_alias'])
Example #21
    def __init__(self,
                 mode,
                 root_dir,
                 txt_COVID,
                 txt_NonCOVID,
                 transform=None):
        """
        Args:
            txt_path (string): Path to the txt file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        File structure:
        - root_dir
            - CT_COVID
                - img1.png
                - img2.png
                - ......
            - CT_NonCOVID
                - img1.png
                - img2.png
                - ......
        """
        self.root_dir = root_dir
        self.txt_path = [txt_COVID, txt_NonCOVID]
        self.classes = ['CT_COVID', 'CT_NonCOVID']
        self.num_cls = len(self.classes)
        self.img_list = []
        for c in range(self.num_cls):
            cls_list = [[
                os.path.join(self.root_dir, self.classes[c], item), c
            ] for item in read_txt(self.txt_path[c])]
            self.img_list += cls_list

        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        train_transformer = transforms.Compose([
            transforms.Resize(256),
            transforms.RandomResizedCrop((224), scale=(0.5, 1.0)),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(), normalize
        ])

        val_transformer = transforms.Compose([
            transforms.Resize(224),
            transforms.CenterCrop(224),
            transforms.ToTensor(), normalize
        ])
        if mode == 'train':
            self.transform = train_transformer
        else:
            self.transform = val_transformer
        print('samples = ', len(self.img_list))
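Examples #8, #14, and #21 are the constructors of PyTorch map-style datasets for the COVID-CT images; only __init__ is shown, so the class name (CovidCTDataset) and the split-file paths below are placeholders for illustration. A hedged usage sketch, assuming the class also defines __len__ and __getitem__ as usual, might look like this:

from torch.utils.data import DataLoader

# Hypothetical names: the real class name and txt split files are not shown above.
train_set = CovidCTDataset(mode='train',
                           root_dir='./Images-processed',
                           txt_COVID='./Data-split/COVID/train_COVID.txt',
                           txt_NonCOVID='./Data-split/NonCOVID/train_NonCOVID.txt')
train_loader = DataLoader(train_set, batch_size=16, shuffle=True, num_workers=2)

for batch in train_loader:
    # the batch structure depends on the (not shown) __getitem__,
    # typically an (image tensor, label) pair or a dict
    pass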
Example #22
def main():
    # parser arguments
    args = _get_arguments()
    # set log level
    if args.verbose == 1:
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    elif args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    else:
        logging.basicConfig(level=logging.WARN,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
        logging.warn("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s" % (key, str(value)))

    # check directory existence
    if not os.path.exists(os.path.dirname(args.writedir)):
        os.makedirs(os.path.dirname(args.writedir))

    # get file list
    if os.path.isdir(args.feats):
        feat_list = sorted(find_files(args.feats, "*.%s" % args.feature_format))
    elif os.path.isfile(args.feats):
        feat_list = read_txt(args.feats)
    else:
        logging.error("--feats should be directory or list.")
        sys.exit(1)
    feat_ids = [os.path.basename(f).replace(".%s" % args.feature_format, "") for f in feat_list]
    logging.info("number of utterances = %d" % len(feat_ids))

    # divide list
    feat_ids = np.array_split(feat_ids, args.n_jobs)
    feat_ids = [f_ids.tolist() for f_ids in feat_ids]

    # multi processing
    processes = []
    # for f in file_lists:
    for f in feat_ids:
        p = mp.Process(target=noise_shaping, args=(f, args,))
        p.start()
        processes.append(p)

    # wait for all process
    for p in processes:
        p.join()
Example #23
    def import_call(self, e):
        if setting_fftool.has_query:
            utils.showinfo("有任务正在执行,请稍后")
            return

        tup = tuple([])
        ft = self.file_types
        ft_tup = self.file_types_tup
        if e.widget == self.import_btn:
            tup = filedialog.askopenfilenames(
                filetypes=ft,
                title='导入文件',
                initialdir=setting_fftool.last_folder)

        elif e.widget == self.import_list_btn:
            if os.path.exists(setting_fftool.list_file):
                arr = utils.read_txt(setting_fftool.list_file)
                new_arr = []
                for f in arr:
                    if os.path.exists(f):
                        new_arr.append(f)
                if not len(new_arr):
                    utils.showinfo('txt中的地址都不正确' + setting_fftool.list_file)
                    return
                tup = tuple(new_arr)

        elif e.widget == self.import_dir_btn:
            folder = filedialog.askdirectory(
                title='选择目录', initialdir=setting_fftool.last_folder)
            if folder:
                folder = utils.pathlib_path(folder)
                setting_fftool.last_folder = folder
                arr = []
                new_arr = []
                # get all files under the directory
                utils.list_dir(folder, arr)
                # keep only files with the allowed extensions
                for f in arr:
                    suffix = str(Path(f).suffix)
                    for f_type in ft_tup:
                        if suffix == f_type:
                            new_arr.append(f)
                            break
                tup = tuple(new_arr)

        if len(tup):
            tup = utils.pathlib_path_tup(tup, True)
            self.tree.set_list(list(tup))
            # self.start.set_state(True)
            # self.clear_query()

            setting_fftool.last_folder = utils.pathlib_parent(tup[0])
Example #24
    def __getitem__(self, idx):
        if self.mode == 'train':
            filenames = self.train_filenames[idx]
        else:
            filenames = self.test_filenames[idx]
        wave_path = self.cfg.dataset + filenames[0]
        txt_path = self.cfg.dataset + filenames[1]
        try:
            text_tmp = utils.read_txt(txt_path)  # list
            wave_tmp = utils.read_wave(wave_path)  # numpy
        except OSError:
            print(txt_path)
            print(wave_path)
            return self.__getitem__(0)
        wave_tmp = torch.from_numpy(wave_tmp)
        wave = torch.zeros([40, self.max_wave])  # 512 may be too short; if error, fix it
        length_wave = wave_tmp.shape[1]
        # print(length_wave)
        wave[:, :length_wave] = wave_tmp
        # print(txt_path)

        while 27 in text_tmp:
            text_tmp.remove(27)

        length_text = len(text_tmp)
        text_tmp = torch.tensor(text_tmp)
        text = torch.zeros([self.max_text])  # 256 may be too short; fix it if error
        text[:length_text] = text_tmp
        name = filenames[0].split('/')[-1]

        if length_text >= length_wave:
            sample = {
                'name': name,
                'wave': torch.zeros([40, self.max_wave], dtype=torch.float),
                'text': torch.zeros([self.max_text], dtype=torch.float),
                'length_wave': self.max_wave,
                'length_text': self.max_text
            }
        else:
            sample = {
                'name': name,
                'wave': wave,
                'text': text,
                'length_wave': length_wave,
                'length_text': length_text
            }
        return sample
Example #25
def prepare_adjoint(config):
    # mkdir outputbase
    outputbase = config["outputdir"]
    clean_outputdir(outputbase)

    eventlist_file = config["eventlist"]
    eventlist = read_txt(eventlist_file)
    # split the eventlist and dump into separate files
    nevents_per_job = config["nevents_per_job"]
    cmtlist_per_job = split_job(eventlist, nevents_per_job)
    print("-"*10 + "\nJob list:\n%s" % cmtlist_per_job)

    print("="*20 + "\nPreparing jobs...")
    for job_id, cmtlist in cmtlist_per_job.iteritems():
        prepare_one_job(job_id, cmtlist, config)
Example #26
def find_end_curly_bracket(file_path, start_line, end_line):
    content = utils.read_txt(file_path)
    current_line = end_line - 1
    start_line = start_line - 1
    tag = False
    while (current_line >= start_line):
        if len(content[current_line]) > 0 and content[current_line][0] == '}':
            tag = True
            break
        current_line = current_line - 1
    if tag:
        return current_line + 1
    raise Exception(
        "ERROR: Cannot find the last curly bracket within [%d, %d]" %
        (start_line, end_line))
Example #27
def load_url_by_file(_fileNum) -> List[Tuple[str, str]]:
    """
    File로 부터 신문사들 URL가져옮

    :param _fileNum: 뉴스언론사 index
    :return: zip(naver url, origin url)
    """
    urldatas = read_txt(DATA_DIR + "{}.txt".format(_fileNum))  # naver @@@ origin
    # (naver, origin)
    urlList = list(zip(
        [url.split('@@@')[NAVER_IDX] for url in urldatas],
        [url.split('@@@')[ORIGIN_IDX] for url in urldatas]
    ))
    log('s', "{}.txt 파일에서 {}개 불러옴... Ex) {} ".format(_fileNum, len(urlList), urlList[0][ORIGIN_IDX]))
    return urlList
Example #28
def main():
    # parser arguments
    args = _get_arguments()
    # set log level
    if args.verbose == 1:
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    elif args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
    else:
        logging.basicConfig(level=logging.WARN,
                            format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S')
        logging.warn("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s" % (key, str(value)))
    # read list
    if os.path.isdir(args.waveforms):
        file_list = sorted(find_files(args.waveforms, "*.wav"))
    else:
        file_list = read_txt(args.waveforms)
    logging.info("number of utterances = %d" % len(file_list))
    wav_set = 'wav_%s_%s' % (args.feature_format, args.wavtype)
    # create file folders
    filepath_create(file_list, wav_set)
    # divide list
    file_lists = np.array_split(file_list, args.n_jobs)
    file_lists = [f_list.tolist() for f_list in file_lists]
    # multi processing
    processes = []
    # for f in file_lists:
    for f in file_lists:
        p = mp.Process(target=noise_shaping, args=(f, wav_set, args,))
        p.start()
        processes.append(p)

    # wait for all process
    for p in processes:
        p.join()
Example #29
def main():
    if (len(sys.argv)) != 3:
        print('Usage: python3 crawl_cnki.py start end')
        return
    else:
        start, end = int(sys.argv[1]), int(sys.argv[2])

    start_time = time.time()
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    # convert the journal names in the Excel file into a txt file, then read the journal names in the range [start, end]
    ori_src, new_src = './待爬取数据.xlsx', './journals.txt'
    utils.excel2txt(ori_src, new_src)
    journals = utils.read_txt(new_src, start, end)

    # create the directory for saving the output results
    output_dir = './publish_numbers'
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    start_year, end_year = 2012, 2020
    cnt, total = 0, end - start + 1
    succeed = failed = skipped = 0

    for i in range(end - start + 1):
        cnt += 1
        output_file = output_dir + '/' + journals[i] + '.xlsx'
        if not Path(output_file).is_file():
            if start_crawl(journals[i], start_year, end_year, output_file):
                succeed += 1
            else:
                failed += 1
            print(
                'Progress: {}/{}, succeed: {}, failed: {}, skipped: {}, used time: {}'
                .format(cnt, total, succeed, failed, skipped,
                        time.time() - start_time))
        else:
            skipped += 1

    print(
        'Finished crawl. Total succeed: {}, total failed: {}, total skipped: {}, total used time: {}'
        .format(succeed, failed, skipped,
                time.time() - start_time))
Example #30
def prepare_ami_mdm(ami_mdm_location, audio_path, text_path, lists_path,
                    processes):
    for f in ['dev', 'test', 'train']:
        dst_list = os.path.join(lists_path, f"ami-mdm-{f}.lst")
        dst_text = os.path.join(text_path, f"ami-mdm-{f}.txt")
        if not os.path.exists(dst_list):
            with Pool(processes) as p:
                to_list = partial(ami_mdm_to_list, audio_path,
                                  ami_mdm_location)
                rows = read_txt(os.path.join(ami_mdm_location, f))
                samples = list(tqdm(
                    p.imap(to_list, rows),
                    total=len(rows),
                ))
            with open(dst_list, "w") as list_f:
                list_f.writelines(samples)

            with open(dst_list, "r") as list_f, open(dst_text, "w") as text_f:
                for line in list_f:
                    text_f.write(" ".join(line.strip().split(" ")[3:]) + "\n")

        else:
            print(f"{dst_list} exists, doing verify")
            new_list = []
            with open(dst_list, "r") as list_f:
                for line in list_f:
                    filename = line.split(" ")[1]
                    text = " ".join(line.strip().split(" ")[3:])
                    params = " ".join(line.strip().split(" ")[:3])
                    text = remove_punct(text)
                    line = f"{params} {text}\n"
                    if not os.path.exists(filename) or len(
                            text) < 2 or not alpha.match(text):
                        print(
                            f"{filename} does not exists or text is empty, text: {text}"
                        )
                    else:
                        new_list.append(line)
            with open(dst_list, "w") as list_f:
                list_f.writelines(new_list)

    print("Prepared AMI MDM8", flush=True)
Example #31
File: train.py, Project: DarioArenas/NLP
def read_category(category):
    filename_list = []
    txt_list = []
    txt_len_list = []

    file_list = os.listdir(category)
    for file in file_list:
        filename = os.path.join(category, file)
        txt = read_txt(filename)
        filename_list.append(filename.replace('/', '_'))
        txt_list.append(txt)
        txt_len_list.append(len(txt))

    dic = {
        'category': [category] * len(filename_list),
        'filename': filename_list,
        'raw_text': txt_list,
        'raw_text_lenght': txt_len_list
    }
    return pd.DataFrame.from_dict(dic)
Example #32
def copy_missing_files(data_type):
    output_path = Paths.DATASET_BASE_PATH + 'data/{}/'.format(data_type)
    train_dataset_path = [
        Paths.DATASET_BASE_PATH + 'covid-chestxray-dataset/images/',
        Paths.DATASET_BASE_PATH + 'rsna-pneumonia-detection-challenge/stage_2_{}_images/'.format(data_type)
    ]

    csv_content = utils.read_txt(Paths.DATASET_BASE_PATH + 'COVID-Net/{}_COVIDx.txt'.format(data_type))

    _x_train_paths = []
    for c in csv_content:
        full_path = None
        img_path = c.split(' ')[-2]
        if not img_path.endswith('.dcm'):
            if os.path.exists(train_dataset_path[0] + img_path):
                full_path = train_dataset_path[0] + img_path
            elif os.path.exists(train_dataset_path[1] + img_path):
                full_path = train_dataset_path[1] + img_path
            if full_path is not None:
                img = cv2.imread(full_path)
                cv2.imwrite(output_path + img_path, img)  # write png image
Example #33
def prepare_proc_obsd(config):
    # mkdir outputbase
    outputbase = config["outputdir"]
    clean_outputdir(outputbase)

    # copy param files
    taglist = config["taglist"]
    copy_param_files(taglist, config["paramdir"], outputbase)

    # copy path files
    eventlist_file = config["eventlist"]
    eventlist = read_txt(eventlist_file)
    if len(eventlist) == 0:
        raise ValueError("No events found in file:" % eventlist_file)
    copy_path_files(eventlist, taglist, config["pathdir"], outputbase)

    # split the eventlist and dump into separate files
    nevents_per_job = config["nevents_per_job"]
    cmtlist_per_job = split_job(eventlist, nevents_per_job)
    dump_cmtlist(cmtlist_per_job, outputbase)

    # prepare job scripts
    prepare_job_scripts(cmtlist_per_job, config)
Example #34
File: show.py, Project: joseaccruz/xyz2b
FILE_VIDEO = "/home/jcruz/Desktop/BIG_DATA/fish_videos/ana_faustino/Tu15_A__Cond1_side_2.avi"

START_T = 7 * 60 * 1000

(X_SCALE, X_TRANS) = (21.88392008, 271.97335871)
(Y_SCALE, Y_TRANS) = (-21.9054242, 407.59388039)


def on_mouse(event, x, y, flags, k):
    if event == 1:
        print "MOUSE:", x, y, flags

#
#
#
data = utils.read_txt(FILE_DATA)
cap = cv2.VideoCapture(FILE_VIDEO)

(x, y) = (10.5128689995, -7.9942489268)


cv2.namedWindow("x")
cv2.setMouseCallback("x", on_mouse)

i = 0

fpre = None

while True:
    tf = int(cap.get(cv2.cv.CV_CAP_PROP_POS_MSEC))
    td = int(data[i][0] * 1000.0)
Example #35
files = utils.list_excel(DATA_DIR)

for name_xls in files:
    print name_xls

    # get the names of the result files
    name_txt = name_xls.replace(".xlsx", ".txt")
    name_sub = name_xls.replace(".xlsx", ".sub")
    name_rep = name_xls.replace(".xlsx", ".report")

    # convert from Excel to text
    if FORCE_XLS_TO_TXT or (not os.path.isfile(name_txt)):
        utils.excel_to_txt(name_xls, name_txt)

    # get the data from the text file
    data = np.array(utils.read_txt(name_txt))

    # round the data to clean up some noise
    data = np.round(data, 2)

    # trim the initial and final NaN values
    data = compute.trim(data, 1)
    data = compute.trim(data, 2)

    # interpolate some NaN values
    compute.interpolate(data, 1)
    compute.interpolate(data, 2)

    # smooth the position data
    data_smooth = data
    data_smooth[:, 1] = compute.smooth(data[:, 1], win_size=25)
Example #36
File: xyz2b.py, Project: joseaccruz/xyz2b
for (key, files) in fpairs.items():
    if len(files) != 2:
        print "Missing file ('top' or 'side') in pair for assay '%s'" % key
        continue

    print "*****\n%s\n*****" % key

    name_sub = "%s/%s.sub" % (DATA_DIR, key)
    name_rep = "%s/%s.report" % (DATA_DIR, key)
    name_fig1 = "%s/%s_fig1.%s" % (DATA_DIR, key, FIG_FORMAT)
    name_fig2 = "%s/%s_fig2.%s" % (DATA_DIR, key, FIG_FORMAT)

    (trim_start, trim_end) = (None, None)
    for ftxt in files:
        # round the data to clean up some noise
        data = np.round(np.array(utils.read_txt(ftxt)), 2)

        # trim the initial and final NaN values
        (start, end) = compute.trim3(data)

        trim_start = start if (trim_start < start or trim_start is None) else trim_start
        trim_end   = end if (trim_end > end or trim_end is None) else trim_end

        # interpolate some NaN values
        compute.interpolate3(data, start, end)

        if "_side" in ftxt:
            data_s = data
        elif "_top" in ftxt:
            data_t = data