Example #1
def main() -> None:
    """
    Entrypoint into the command-line interface.
    """
    parser = argparse.ArgumentParser(
        description="Lists files in a directory."
    )

    parser.add_argument(
        "path",
        type=str,
        help="The path to show files in."
    )

    # Parse and process commands

    args = parser.parse_args()

    if args.path:

        path = args.path
        files = list_files(path)
        for f in files:
            print(
                "d" if f.isdir else "f",
                f" {f.human_readable_bytes:<12}",
                f.path
            )
Example #2
def read_games(tournament_dir):
    game_files = list_files(tournament_dir, '.json')
    games = []
    for game_file in game_files:
        with open(game_file, 'r') as fin:
            games.append(json.load(fin))
    return games
Example #3
def read_data(data_path, max_games=None):
    game_files = list(sorted(list_files(data_path, '.json')))
    augmentations = list(Augmentation.iter_augmentations())

    if max_games is not None:
        game_files = list(game_files)[-max_games:]
        print('Using game files from %s to %s' % (game_files[0], game_files[-1]))

    x = []
    y_policy = []
    y_reward = []

    for game_path in tqdm(game_files):
        with open(game_path, 'r') as fin:
            game_data = json.load(fin)

        winner, starter, actions, policies = AlphaConnectSerializer.deserialize(game_data)

        state = State.empty()
        states = [state]
        for action in actions:
            state = state.take_action(action)
            states.append(state)
        states, final_state = states[:-1], states[-1]

        n_samples = min(len(states), len(augmentations))
        game_samples = sample(list(range(len(states))), n_samples)
        for augmentation, i in zip(augmentations, game_samples):
            augmented_action_order = sorted(Action.iter_actions(), key=lambda a: a.augment(augmentation).to_int())

            x.append(states[i].to_numpy(augmentation))
            y_policy.append([policies[i].get(action, 0.0) for action in augmented_action_order])
            y_reward.append(winner_value(final_state.winner, states[i]))

    return np.array(x), np.array(y_policy), np.array(y_reward)
Example #4
    def pageRank(self):
        sourceDirectory = settings.PAGERANK_RESOURCE_DIRECTORY
        destDirectory = PAGERANK_DESTINATION_DIRECTORY
        docs = []
        id2index = {}
        # print('start read files')
        # read files
        for file in map(lambda x: os.path.join(sourceDirectory,x),list_files(sourceDirectory, '*.json')):
            with open(file, 'r') as readFile:
                doc = json.load(readFile)
            id2index[doc['id']] = len(docs)
            self.progress_bar.next()
            docs.append(doc)
        # print('start calc page rank')
        # create links matrix
        n = len(docs)
        p = []
        for doc in docs:
            pp = [0] * n
            for linkID in filter(lambda x: x in id2index.keys() , (set(doc['cited_in']) |set(doc['refrences'])) ):
                pp[id2index[linkID]] = 1
            p.append(pp)

        # calculate page rank
        pr = self.pageRankMathCalculation(p,PAGERANK_ALFA,PAGERANK_ERROR)

        # print('start save files')
        # save docs
        os.makedirs(destDirectory, exist_ok=True)
        for doc,pagerank in zip(docs,pr):
            doc['pageRank'] = pagerank
            file_name = '{}.json'.format(doc['id'])
            with open(os.path.join(destDirectory , file_name), 'w') as outfile:
                json.dump(doc, outfile)
Example #5
def load_subspace_pairs(subspace_pairs):
    """Load the pairs used for gender subspace definition.

    Returns:
        Dict[str, List[Tuple[str, str]]]: Dictionary of gender word pairs. The
            value is a list of (male, female) words; the key is the name of
            that list.
    """
    filepairs = []
    for direction_file in list_files(DIRECTIONS_PATH):
        with open(direction_file) as fd:
            filepairs.extend(
                (os.path.basename(direction_file), tuple(line.strip().split()))
                for line in fd)
    if subspace_pairs == 'pair':
        keyfunc = (lambda filepair: '-'.join(filepair[1]))
    elif subspace_pairs == 'group':
        keyfunc = (lambda filepair: filepair[0])
    elif subspace_pairs == 'all':
        keyfunc = (lambda filepair: 'all')
    else:
        raise ValueError(
            'unrecognized subspace pair parameter: {}'.format(subspace_pairs))
    return {
        key: [filepair[1] for filepair in group]
        for key, group in groupby(filepairs, key=keyfunc)
    }
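A brief usage sketch of the loader above (not from the original project), assuming the module defining load_subspace_pairs is importable and DIRECTIONS_PATH holds whitespace-separated word-pair files; the file name and word pairs in the comments are hypothetical.

# Hypothetical illustration of the three grouping modes.
by_pair = load_subspace_pairs('pair')     # e.g. {'he-she': [('he', 'she')], ...}
by_file = load_subspace_pairs('group')    # e.g. {'pronouns.txt': [('he', 'she'), ('him', 'her')], ...}
everything = load_subspace_pairs('all')   # e.g. {'all': [('he', 'she'), ('him', 'her'), ...]}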
Example #6
    def __init__(self, path, x_sufix, page_size):
        self.path = path
        self.x_sufix = x_sufix
        self.v_sufix = None
        self.array_x_files = util.list_files(os.path.join(path, x_sufix), ext='png')
        self.pos = 0
        self.page_size = page_size
Example #7
def build_all_fasttext_models(model_type='skipgram'):
    if model_type not in ['skipgram', 'cbow']:
        raise ValueError('model_type must be "skipgram" or "cbow" but got "' +
                         str(model_type) + '"')
    for corpus_file in list_files(CORPORA_PATH):
        if not corpus_file.endswith('-swapped'):
            create_pronoun_swapped_corpus(corpus_file, 'swap-pairs/pronouns')
    for corpus_file in list_files(CORPORA_PATH):
        model_stub = os.path.join(
            MODELS_PATH,
            os.path.basename(corpus_file) + '.' + model_type,
        )
        if not os.path.exists(model_stub + '.bin'):
            subprocess.run(args=[
                'fasttext', model_type, '-input', corpus_file, '-output',
                model_stub
            ])
Example #8
def main():
    if len(sys.argv) < 3 or len(sys.argv) > 4:
        print "usage: indexer </path/to/files/for/indexing/> <indexer> [cutoff]"
        exit()
    use_cutoff_freq = False
    if len(sys.argv) == 4 and sys.argv[3].lower() == "cutoff":
        use_cutoff_freq = True
    idxr = getattr(sys.modules[__name__], sys.argv[2])()
    for inp_file in list_files(sys.argv[1]):
        idxr.index_file(inp_file, use_cutoff_freq)
Example #9
    def bulk_add_documents_in_directory(self, folder, index_name, document_type, pattern='*.json'):
        """
        add documents in directory that their name matches given pattern

        :param folder: directory address
        :param index_name: name of index (e.g. articles)
        :param document_type: type of documents (e.g. paper)
        :param pattern: file name pattern (wildcard). default: *.json
        :return:
        """
        return self.bulk_add_documents_file_list(
            map(lambda name: self.base_folder+name, list_files(folder, pattern=pattern)),
            index_name, document_type)
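A short usage sketch (not from the source project), assuming `indexer` is an instance of the surrounding class with base_folder pointing at the directory that holds the documents; the folder, index, and document-type names are hypothetical.

# Hypothetical call: bulk-index every *.json file found in the folder as documents of type 'paper'.
response = indexer.bulk_add_documents_in_directory(
    'json_docs/', index_name='articles', document_type='paper', pattern='*.json')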
Example #10
    def clusterDocs(self):
        api = TermVectorAPI(ELASTIC_URL)
        #print('start read files')
        for file in map(lambda x: os.path.join(CLUSTER_SOURCE_DIRECTORY,x),list_files(CLUSTER_SOURCE_DIRECTORY, '*.json')):
            with open(file, 'r') as readFile:
                doc = json.load(readFile)
            self.docsJson[doc['id']]= doc
            self.docVector[doc['id']] = Vector(api.get_term_vector(INDEX_NAME, DOCUMENT_TYPE, doc['id']))
        #print('read all files successfully')
        #print('start init centroid')
        self.initCentroid(CLUSTER_NUM)
        #print('end init centroid')

        while True:
            self.oldDocCluster = self.docCluster.copy()
            self.docCluster = {}
            for docID in self.docsJson.keys():
                self.docCluster[docID] = self.nearestCentroid(docID)
            self.updateCentroid()
            self.progress_bar.next()
            #print('one step clustring')
            if (self.terminateCondition()):
                self.progress_bar.finish()
                break

        #print('converge clustring')
        print('K = ',CLUSTER_NUM,' J = ',self.J())
        candids = self.findCandidateText(CLUSTER_CANDIDATE_TEXT_LEN)
        #print('calc candid')
        c = [[] for x in range(len(self.centroidList))]
        for d in self.docCluster.keys():
            c[self.docCluster[d]].append(d)
        #print('start save result')
        os.makedirs(CLUSTER_DESTINATION_DIRECTORY, exist_ok=True)
        os.makedirs(CLUSTER_CANDIDATE_TEXT_DIRECTORY, exist_ok=True)
        for i in range(len(self.centroidList)):
            res = {}
            res['id'] = i
            res['name'] = candids[i]
            res['pages'] = c[i]
            fileName = str(i) + '.json'
            print('Cluster {}: {}\tnumber of docs: {}'.format(i, ' '.join(candids[i]), len(c[i])))
            #print(res)
            with open(os.path.join(CLUSTER_CANDIDATE_TEXT_DIRECTORY, fileName), 'w') as outfile:
                json.dump(res, outfile)
        for id in self.docsJson.keys():
            self.docsJson[id]['cluster'] = self.docCluster[id]
            file_name = '{}.json'.format(id)
            with open(os.path.join(CLUSTER_DESTINATION_DIRECTORY , file_name), 'w') as outfile:
                json.dump(self.docsJson[id], outfile)
Example #11
    def __init__(self, dir_path):
        self._cache = {}
        _clases = {}
        # Scan each class subfolder and collect its image files
        for subfolder in list_dir(dir_path):
            class_dir = os.path.join(dir_path, subfolder)
            class_idx = len(_clases)
            _clases[class_dir] = []
            for filename in list_files(class_dir, '.JPEG'):
                _clases[class_dir].append({
                    'path': os.path.join(class_dir, filename),
                    'class_idx': class_idx,
                    'name': os.path.splitext(filename)[0],
                    'rotation': 0
                })
        self.class_list = list(_clases.items())
Example #12
    def __init__(self, dir_path):
        self._cache = {}
        _clases = {}
        for subfolder in list_dir(dir_path):
            for character in list_dir(os.path.join(dir_path, subfolder)):
                class_dir = os.path.join(dir_path, subfolder, character)
                class_idx = len(_clases)
                _clases[class_dir] = []
                for filename in list_files(class_dir, '.png'):
                    _clases[class_dir].append({
                        'path': os.path.join(class_dir, filename),
                        'class_idx': class_idx,
                        'name': character + '_' + os.path.splitext(filename)[0],
                        'rotation': 0
                    })
        self.class_list = list(_clases.items())
Example #13
def process_request(directory, request):
    if request == "get /":
        return util.list_files(directory)
    elif request.startswith("get /"):
        filename = request.replace("get /", "")
        return util.read_file(directory, filename)
    elif request.startswith("post /"):
        # obtain all request terms
        terms = request.split()
        # obtain filename
        filename = terms[1].replace("/", "")
        # obtain content
        content = ""
        content_terms = terms
        content_terms.pop(0)
        content_terms.pop(0)
        for t in content_terms:
            content = content + " " + t
        return util.overwrite_file(directory, filename, content)
    else:
        return "sorry, '%s' command does not exist" % request
Example #14
def list_players(model_dir):
    players = [
        (RandomPlayer, {}),
        (GreedyPlayer, {}),
        (MiniMaxPlayer, {
            'depth': 1
        }),
        (MiniMaxPlayer, {
            'depth': 2
        }),
        (MiniMaxPlayer, {
            'depth': 3
        }),
        (MonteCarloPlayer, {
            'budget': 400
        }),
        (MonteCarloPlayer, {
            'budget': 800
        }),
        (MonteCarloPlayer, {
            'budget': 1600
        }),
        (MonteCarloPlayer, {
            'budget': 3200
        }),
        (MonteCarloPlayer, {
            'budget': 6400
        }),
    ]
    model_files = list(sorted(list_files(model_dir, '.h5')))
    for model_file in model_files:
        if model_file.endswith('0.h5'):
            players += [
                (AlphaConnectPlayer, {
                    'model_path': model_file,
                    'search_budget': 1600
                }),
            ]
    return players
Example #15
def train(model):
    _train_epochs = train_conf['train_epochs']
    _train_data = train_conf['train_data']
    _batch_size = train_conf['batch_size']
    for n in range(_train_epochs):
        tf.logging.info('=' * 30 + ' START EPOCH {} '.format(n + 1) +
                        '=' * 30 + '\n')
        train_data_list = list_files(_train_data)  # dir to file list
        for f in train_data_list:
            t0 = time.time()
            tf.logging.info('<EPOCH {}>: Start training {}'.format(n + 1, f))
            model.train(
                input_fn=lambda: input_fn(f, ModeKeys.TRAIN, _batch_size),
                hooks=None,
                steps=None,
                max_steps=None,
                saving_listeners=None)

            tf.logging.info(
                '<EPOCH {}>: Finish training {}, take {} mins'.format(
                    n + 1, f, elapse_time(t0)))
            print('-' * 80)
Example #16
def author_cluster_admin():
    timer = Timer()
    timer.start()
    authors = list()
    for file in list_files(AUTHOR_CLUSTER_SOURCE_DIRECTORY, '*.json'):
        with open(os.path.join(AUTHOR_CLUSTER_SOURCE_DIRECTORY, file), 'r') as fp:
            author_data = json.load(fp)
            authors.append(Author(author_data))

    from clustering.authors_cluster import Dendogram
    clusters = Dendogram(authors)
    clusters.cluster()

    min_similarity = 0.375
    cluster_list = list(map(
        lambda cluster: list(map(lambda x: x.name, cluster)),
        map(
            lambda x: list(x.authors),
            clusters.get_clusters(min_similarity)
        )
    ))

    cluster_dict = dict()
    for cluster in cluster_list:
        for author in cluster:
            cluster_dict[author] = cluster

    with open(AUTHOR_CLUSTER_FILE, 'w') as fp:
        json.dump(cluster_dict, fp)

    timer.end()

    return render_template('indexing_result.html',
        duration=timer.get_time_taken_pretty(),
        elastic_response=json.dumps(cluster_list, indent=True),
        success=True,
        numdocs=len(cluster_list)
    )
Example #17
  def gen_train_input(self, inputs, decode_fn):
    #--------------------- train
    logging.info('train_input: %s'%FLAGS.train_input)
    print 'train_input: %s'%FLAGS.train_input
    trainset = util.list_files(FLAGS.train_input)
    logging.info('trainset:{} {}'.format(len(trainset), trainset[:2]))
    print 'trainset:{} {}'.format(len(trainset), trainset[:2])
    
    
    query, query_str, text, text_str = inputs(
      trainset, 
      decode_fn=decode_fn,
      batch_size=FLAGS.batch_size,
      num_epochs=FLAGS.num_epochs, 
      #seed=seed,
      num_threads=FLAGS.num_threads,
      batch_join=FLAGS.batch_join,
      shuffle_files=FLAGS.shuffle_files,
      fix_sequence=FLAGS.fix_sequence,
      num_prefetch_batches=FLAGS.num_prefetch_batches,
      min_after_dequeue=FLAGS.min_after_dequeue,
      name=self.input_train_name)

    return (query, query_str, text, text_str), trainset
Example #18
def extract_meta(dirname, subdir, args):
    source = args.get('source')
    datasets = args.get('datasets')

    # Make sure dirname doesn't end in '/', otherwise our poor basename is not going to work well
    dirname = strip_dirname(dirname)

    # Extract metadata from csv and create a record

    # First check if okay by looking into processed.txt
    processedFile = dirname + '/processed.txt'
    processed = None
    if util.is_non_zero_file(processedFile):
        processed = util.read_properties(processedFile, log)
        if not processed:
            if not args.get('includeAll'):
                return False  # NOT ACCEPTABLE
    else:
        if not args.get('includeAll'):
            return False  # NOT ACCEPTABLE

    # Process log
    times = None
    processLog = dirname + '/process.log'
    if os.path.isfile(processLog):
        times = timings.computeTimings(processLog)

    # Take dirname and extract the final string as the id
    id = os.path.basename(dirname)
    if subdir is None:
        subdir = id

    meta1 = {
        'fullId': source + '.' + id,
        'id': id,
        'source': source,
        'datasets': datasets,
        'path': subdir
    }
    if times is not None:
        total = timings.getTotal(times)
        meta1['totalProcessingSecs'] = total.get('secs')
        meta1['totalProcessingTime'] = total.get('time')
    # Extract create time from id matching: 2016-07-01_04-29-28
    date_match = DATE_RE.search(id)
    if date_match:
        createdAt = date_match.group(0)
        # Reformat to be in ISO8601 format
        meta1['createdAt'] = createdAt[0:10] + 'T' + createdAt[
            11:13] + ':' + createdAt[14:16] + ':' + createdAt[17:19]
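        # e.g. '2016-07-01_04-29-28' becomes '2016-07-01T04:29:28'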

    # Look for dirname/id.txt and extract fields into our meta
    # if there is txt file, read it and append to metadata record
    metafile = dirname + '/' + id + '.txt'
    if util.is_non_zero_file(metafile):
        # Read in txt as dictionary
        meta = util.read_properties(metafile, log)
    else:
        meta = {}
    # Go through the files and use the most recent modification time as the scan's updatedAt time

    # Take our basic info and merge it into meta (overwriting what maybe there)
    if processed:
        meta.update(processed)
    meta.update(meta1)

    # Check what files we have
    meta['files'] = util.list_files(dirname)
    lastModified = util.lastModified(meta['files'])
    if lastModified:
        meta['updatedAt'] = util.millisToIso(
            lastModified.get('modifiedAtMillis'))

    # Check what stage we are in
    # NOTE: This requires meta to be filled in with id and files!!!
    if args.get('stages'):
        check_stages(args.get('stages'), meta, times)

    # Check if we have a ply file and how big it is
    filebase = id + '_vh_clean_2' if args.get('checkCleaned') else id
    plyfile = dirname + '/' + filebase + '.ply'
    plyfileSize = util.filesize(plyfile)
    meta['hasCleaned'] = args.get('checkCleaned') and plyfileSize > 0
    if plyfileSize > 0:
        meta['fileType'] = 'ply'
        meta['fileSize'] = plyfileSize

    # Check if we have a png file
    pngfile = dirname + '/' + filebase + '.png'
    pngfileSize = util.filesize(pngfile)
    meta['hasScreenshot'] = pngfileSize > 0

    # Check if we have a thumbnail file
    pngfile = dirname + '/' + filebase + '_thumb.png'
    pngfileSize = util.filesize(pngfile)
    meta['hasThumbnail'] = pngfileSize > 0

    if source == 'nyuv2' and not meta.get('sceneType'):
        idParts = meta.get('id').split('_')
        meta['sceneType'] = '_'.join(idParts[0:len(idParts) - 1])

    if meta.get('sceneLabel'):
        # Derive sceneName from sceneLabel
        sceneLabel = meta.get('sceneLabel')
        match = SCENELABEL_RE.match(sceneLabel)
        meta['sceneName'] = match.group(1) if match else sceneLabel

    return meta
Example #19
def test_list_files_in_order(directory_hierarchy):
    files = list(list_files(directory_hierarchy))
    assert list(sorted(files)) == files
Example #20
def main() -> None:
    path = "."
    files = list_files(path)
    for f in files:
        print("d" if f.isdir else "f", f" {f.human_readable_bytes:<12}",
              f.path)
Example #21
def test_Given_validPath_When_listFiles_Then_listAllFiles(input, output):
    assert set(list_files(input)) == set(output)
Example #22
def new_model_path(model_dir):
    model_files = list(list_files(model_dir, '.h5'))
    model_iteration = len(model_files)
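    # model files are named by zero-padded iteration number, e.g. iteration 3 -> '000003.h5'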
    model_path = os.path.abspath(
        os.path.join(model_dir, '%6.6d.h5' % model_iteration))
    return model_iteration, model_path
Example #23
def train_and_eval(model):
    _train_epochs = train_conf['train_epochs']
    _train_data = train_conf['train_data']
    _eval_data = train_conf['eval_data']
    _test_data = train_conf['test_data']
    _batch_size = train_conf['batch_size']
    _epochs_per_eval = train_conf['epochs_per_eval']

    for n in range(_train_epochs):
        tf.logging.info('=' * 30 + ' START EPOCH {} '.format(n + 1) +
                        '=' * 30 + '\n')
        train_data_list = list_files(_train_data)  # dir to file list
        for f in train_data_list:
            t0 = time.time()
            tf.logging.info('<EPOCH {}>: Start training {}'.format(n + 1, f))
            model.train(
                input_fn=lambda: input_fn(f, ModeKeys.TRAIN, _batch_size),
                hooks=None,
                steps=None,
                max_steps=None,
                saving_listeners=None)

            tf.logging.info(
                '<EPOCH {}>: Finish training {}, take {} mins'.format(
                    n + 1, f, elapse_time(t0)))
            print('-' * 80)

            tf.logging.info('<EPOCH {}>: Start evaluating {}'.format(
                n + 1, _eval_data))
            t0 = time.time()

            results = model.evaluate(
                input_fn=lambda: input_fn(_eval_data, ModeKeys.EVAL, _batch_size),
                steps=None,  # Number of steps for which to evaluate model.
                hooks=None,
                checkpoint_path=None,  # latest checkpoint in model_dir is used.
                name=None)

            tf.logging.info(
                '<EPOCH {}>: Finish evaluation {}, take {} mins'.format(
                    n + 1, _eval_data, elapse_time(t0)))
            print('-' * 80)
            # Display evaluation metrics
            for key in sorted(results):
                print('{}: {}'.format(key, results[key]))

        # every epochs_per_eval test the model (use larger test dataset)
        if (n + 1) % _epochs_per_eval == 0:
            tf.logging.info('<EPOCH {}>: Start testing {}'.format(
                n + 1, _test_data))
            results = model.evaluate(
                input_fn=lambda: input_fn(_test_data, ModeKeys.EVAL, _batch_size),
                steps=None,  # Number of steps for which to evaluate model.
                hooks=None,
                checkpoint_path=None,  # If None, the latest checkpoint in model_dir is used.
                name=None)

            tf.logging.info(
                '<EPOCH {}>: Finish testing {}, take {} mins'.format(
                    n + 1, _test_data, elapse_time(t0)))
            print('-' * 80)
            # Display evaluation metrics
            for key in sorted(results):
                print('{}: {}'.format(key, results[key]))
Example #24
def is_first_model(model_dir):
    model_files = list(list_files(model_dir, '.h5'))
    return len(model_files) == 0
Example #25
# Load OpenCV’s Caffe-based deep learning face detector model
print("[EXEC] Loading face detector model....")
detector = cv2.dnn.readNetFromCaffe(
    "face_detection_model/deploy.prototxt",
    "face_detection_model/res10_300x300_ssd_iter_140000.caffemodel")


# Load the embedder model to extract a 128-D facial embedding vector
# It contains the OpenCV deep learning Torch embedding model.
print("[EXEC] Loading face recognizer model....")
embedder = cv2.dnn.readNetFromTorch("openface_nn4.small2.v1.t7")

print("[EXEC] Reading Image Paths.....")
# Collect each image path into a list
imagePaths = list(list_files(rootPath="my_dataset"))
print(imagePaths)

knownEmbeddings = []
knownNames = []

total = 0

# Iterate over every single image
for (i, imagePath) in enumerate(imagePaths):
    print("Processing image {} of {}".format(i + 1, len(imagePaths)))
    # Extract name of the image
    name = imagePath.split(os.path.sep)[-2]
    image = cv2.imread(imagePath)
    image = cv2.resize(image, dsize=(750, 600))
    # Height and Width
Example #26
def main():
	md_files = list_files("www", is_markdown_file, recur=True)
	for md_file in md_files:
		html_file = md_file[:-2] + "html"
		md_to_html(md_file, html_file)
Example #27
def test_list_files():
    ret = '\n'
    ret += './=' + str(util.list_files('./dir2')) + '\n'
    ret += './=' + str(util.list_files('./dir2', pattern=r'\d.+')) + '\n'
    ret += './=' + str(util.list_files('./dir2', pattern=r'lang_.*')) + '\n'
    return ret
Example #28
    def _get_files(self, folder):
        if os.path.isdir(folder):
            return sorted(list_files(folder, valid_exts=self.IMG_EXTS))
        else:
            raise RuntimeError('No folder named "{}" found.'.format(folder))
Example #29
def inputs(files, decode_fn, batch_size=64,
           num_epochs = None, num_threads=12, 
           shuffle_files=True, batch_join=True, shuffle_batch=True, 
           min_after_dequeue=None, seed=None, enqueue_many=False,
           fix_random=False, no_random=False, fix_sequence=False,
           allow_smaller_final_batch=False, 
           num_prefetch_batches=None, 
           dynamic_pad=False,
           bucket_boundaries=None,
           length_index=None,
           length_fn=None,
           name='input'):
  """Reads input data num_epochs times.
  for sparse input here will do:
  1. read serialized_example
  2. shuffle serialized_examples
  3. decdoe batch_serialized_examples
  notice read_sparse.inputs and also be used for dense inputs,but if you 
  only need to decode part from serialized_example, then read.inputs will 
  be better, less to put to suffle
  #--------decode example, can refer to libsvm-decode.py
  # def decode(batch_serialized_examples):
  #   features = tf.parse_example(
  #       batch_serialized_examples,
  #       features={
  #           'label' : tf.FixedLenFeature([], tf.int64),
  #           'index' : tf.VarLenFeature(tf.int64),
  #           'value' : tf.VarLenFeature(tf.float32),
  #       })

  #   label = features['label']
  #   index = features['index']
  #   value = features['value']

  #   return label, index, value 

  #string_input_producer will shuffle files
  #shuffle will read file by file and shuffle within a file (in the shuffle queue)
  #shuffle_batch_join will read multiple files and shuffle in the shuffle queue (from many files)

  To get a fixed sequence, either:
  shuffle=False   this way the sequence is exactly your input data, unchanged
  or
  shuffle=True
  seed=1024  # set a seed
  batch_join=False   this way the randomness is fixed, so you get the same result each run
  NOTICE: shuffle=True, seed=1024, batch_join=True will not give the same result, and neither will
  shuffle=False, seed=1024, batch_join=True; with batch_join the seed seems to only control the
  in-queue randomness, so a fixed result cannot be obtained.

  For no randomness -> fixed result: set shuffle=False, which forces batch_join=False and then uses batch.
  For fixed randomness -> shuffle=True with a seed set, or fix_random=True.
  read-records.py behaves as described above, but train-evaluate.py does not; there only shuffle=False
  gives a fixed result. @FIXME strange
  For train-evaluate.py it looks like you can set shuffle=True in string_input_producer, but then you must
  use batch; batch_join and shuffle_batch_join are both not fixed even with a seed set, possibly because
  the trainset is read by two inputs?
  For read-records.py batch_join is fixed, but shuffle_batch_join is not.

  Default params give maximum randomness...

  Args:
  decode_fn: user-defined decode function
  min_after_dequeue: set to >20k for production training; a suggested value is 0.4 * num_instances, but NOTICE: do not exceed available memory
  #--default params give the most randomness
  shuffle_files: whether to shuffle files
  shuffle_batch: use batch or shuffle_batch
  batch_join: whether to use multiple readers, or one reader with multiple threads
  fix_random: if True, use the most randomness that can still give a fixed (reproducible) result
  allow_smaller_final_batch: set True, useful if you want to verify on a small dataset
  """
  if isinstance(files, str):
    files = util.list_files(files)

  assert len(files) > 0

  if not min_after_dequeue : 
    min_after_dequeue = 100000
  if not num_epochs: 
    num_epochs = None

  if fix_random:
    if seed is None:
      seed = 1024
    shuffle_files = True  
    batch_join = False  #check can be True ?

    #to get fix_random 
    #shuffle_batch = True  and num_threads = 1 ok
    #shuffle_batch = False and num_threads >= 1 ok
    #from models/image-text-sim/read_records shuffle_batch = True will be quick, even single thread
    #and strangely num_threads = 1 will be quicker than 12
    
    shuffle_batch = True
    num_threads = 1

    #shuffle_batch = False

  if fix_sequence:
    no_random = True 
    allow_smaller_final_batch = True
    num_threads = 1

  if no_random:
    shuffle_files = False
    batch_join = False
    shuffle_batch = False 

  if dynamic_pad:
    #use tf.batch
    shuffle_batch = False

  #shuffle=True
  #batch_join = True #setting to False can get fixed result
  #seed = 1024
  
  with tf.name_scope(name):
    filename_queue = tf.train.string_input_producer(
      files, 
      num_epochs=num_epochs,
      shuffle=shuffle_files,
      seed=seed)
    
    # min_after_dequeue defines how big a buffer we will randomly sample
    #   from -- bigger means better shuffling but slower start up and more
    #   memory used.
    # capacity must be larger than min_after_dequeue and the amount larger
    #   determines the maximum we will prefetch.  Recommendation:
    #   min_after_dequeue + (num_threads + a small safety margin) * batch_size
    #@TODO cifar10 always uses num_prefetch_batches = 3, 3 * batch_size, check which is better
    if not num_prefetch_batches: 
      num_prefetch_batches = num_threads + 3
    capacity = min_after_dequeue + num_prefetch_batches * batch_size
    #@TODO diff between tf.batch_join and tf.batch, batch_join below means shuffle_batch_join.. TODO
    if batch_join:
      batch_list = [_read(filename_queue) for _ in xrange(num_threads)]
      #print batch_list
      if shuffle_batch:
        batch_serialized_examples = tf.train.shuffle_batch_join(
            batch_list, 
            batch_size=batch_size, 
            capacity=capacity,
            min_after_dequeue=min_after_dequeue,
            seed=seed,
            enqueue_many=enqueue_many,
            allow_smaller_final_batch=allow_smaller_final_batch,
            name='shuffle_batch_join_queue')
      else:
        batch_serialized_examples = tf.train.batch_join(
          batch_list, 
          batch_size=batch_size, 
          capacity=capacity,
          enqueue_many=enqueue_many,
          allow_smaller_final_batch=allow_smaller_final_batch,
          dynamic_pad=dynamic_pad,
          name='batch_join_queue')
    else:
      serialized_example = list(_read(filename_queue))
      #@FIXME... due to a bug this can not be made more random if you want fixed randomness, see D:\mine\tensorflow-exp\models\image-text-sim\train-evaluate-fixrandom.py
      if shuffle_batch:	      
        batch_serialized_examples = tf.train.shuffle_batch(	
            serialized_example, 
            batch_size=batch_size, 
            num_threads=num_threads,
            capacity=capacity,
            min_after_dequeue=min_after_dequeue,
            seed=seed,
            enqueue_many=enqueue_many,
            allow_smaller_final_batch=allow_smaller_final_batch,
            name='shuffle_batch_queue')		    
      else:	    
        batch_serialized_examples = tf.train.batch(
            serialized_example, 
            batch_size=batch_size, 
            #@TODO to make a really fixed result use num_threads=1; maybe shuffle_batch will be fixed-random?
            num_threads=num_threads,
            capacity=capacity,
            enqueue_many=enqueue_many,
            allow_smaller_final_batch=allow_smaller_final_batch,
            dynamic_pad=dynamic_pad,
            name='batch_queue')

    #print name
    #print decode_fn
    #print decode_fn(batch_serialized_examples)
    return decode_fn(batch_serialized_examples) if decode_fn is not None else batch_serialized_examples
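A minimal call sketch of the deterministic modes described in the docstring above (not from the source project); the TFRecord file and decode function are hypothetical, and the tf.train queue-runner setup needed to actually pull batches is omitted.

# Hypothetical: fully deterministic order (fix_sequence forces shuffle_files=False, batch_join=False, shuffle_batch=False).
batch = inputs(['./train.tfrecords'], decode_fn=my_decode, batch_size=64, fix_sequence=True)
# Hypothetical: reproducible shuffling (fix_random defaults seed to 1024 and forces batch_join=False, shuffle_batch=True, num_threads=1).
batch = inputs(['./train.tfrecords'], decode_fn=my_decode, batch_size=64, fix_random=True)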
Example #30
def latest_model_path(model_dir):
    model_files = list(sorted(list_files(model_dir, '.h5')))
    model_iteration = len(model_files)
    model_path = os.path.abspath(os.path.join(model_dir, model_files[-1]))
    return model_iteration - 1, model_path
Example #31
def main():
    md_files = list_files("www", is_markdown_file, recur=True)
    for md_file in md_files:
        html_file = md_file[:-2] + "html"
        md_to_html(md_file, html_file)
Example #32
File: main.py Project: yen3/snap2
def list_posts():
    posts = [Post.constructFromPath(pf) for pf in list_files(conf.POST_PATH)]
    posts = posts[::-1]
    return json.dumps([p.postStringMap() for p in posts])