Example #1
def main():
    """
    Iterates through the list of cyano datasets and colocalizes them with the specified environmental variables.
    Colocalized datasets are stored in the "COLOCALIZED_DIR" as csv files.
    """
    def saveColocalizedCSV(df):
        df.to_csv(f"{COLOCALIZED_DIR}{os.path.basename(cyanoFile)}", index=False) 

    cyanoFiles = cyano_csv_files(DATA_DIR)
    api = pycmap.API(token=API_KEY)
    makedir(COLOCALIZED_DIR)
    envs = environmental_datasets()        
    envs = add_env_temporal_coverage(api, envs)

    for cyanoFile in cyanoFiles:
        df = pd.read_csv(cyanoFile)
        df = add_env_columns(df, envs)
        dfs = [df.loc[i].to_frame().T for i in range(len(df))]
        colocalizedDF  = pd.DataFrame({})
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futureObjs = executor.map(match, dfs, [api] * len(dfs), [envs] * len(dfs), [cyanoFile] * len(dfs), [len(dfs)] * len(dfs))
            for fo in futureObjs:
                if len(colocalizedDF) < 1:
                    colocalizedDF = fo
                else:
                    colocalizedDF = pd.concat([colocalizedDF, fo], ignore_index=True)  
        saveColocalizedCSV(colocalizedDF)
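Note: executor.map already yields the per-row results in order, so the grow-by-concat loop above can be collapsed into a single pd.concat call. A minimal sketch, assuming match() returns one DataFrame per single-row input frame exactly as in Example #1:

import concurrent.futures

import pandas as pd


def colocalize_rows(dfs, api, envs, cyanoFile):
    # Run match() over all single-row frames, with the same arguments as in Example #1.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        frames = list(executor.map(match, dfs, [api] * len(dfs), [envs] * len(dfs),
                                   [cyanoFile] * len(dfs), [len(dfs)] * len(dfs)))
    # pd.concat accepts the whole list at once, so the len() < 1 branch is unnecessary.
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()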
Example #2
 def generate_summary(self, title):
     makedir(RESULTS_FOLDER)
     self._compare_action_against_performance(title)
     if HUMAN_DATA_COMPARE:
         self._compare_score_against_human_data(title)
         self._compare_action_against_human_data(title, 1)
     self._plot_learning_curve(title)
Example #3
def main():
    """
    Iterates through the list of colocalized cyano datasets and compiles them into a single csv file.
    The compiled file is stored in the "COMPILED_DIR" as a csv file.
    """
    print("""

            ##########################################################
            #                                                        #
            #                                                        #
            #         Compiling Colocalized Cyano Datasets           #
            #                                                        #
            #                                                        #
            ##########################################################

            
        """)
    cyanoFiles = glob.glob(f"{COLOCALIZED_DIR}*.csv")
    makedir(COMPILED_DIR)
    dfCompiled = pd.DataFrame({})
    for cyanoFile in cyanoFiles:
        print(f"Compiling {cyanoFile}")
        data = unify(cyanoFile)
        if len(dfCompiled) < 1:
            dfCompiled = data
        else:
            dfCompiled = pd.concat([dfCompiled, data], ignore_index=True)
    dfCompiled.to_csv(f"{COMPILED_DIR}compiled.csv", index=False)
Example #4
 def new_simulation(self):
     makedir(RESULTS_FOLDER)  # if the folder already exists, the function catches the exception
     self.current_df = pd.DataFrame(columns=COLUMNS)
     self.current_detail_df = pd.DataFrame(columns=DETAIL_COLUMNS)
     datetime_str = '{:%d-%H-%M-%S}'.format(datetime.datetime.now())
     self.file_name = (lambda x: RESULTS_FOLDER + x + ' ' + datetime_str)
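Note: the snippets on this page all rely on a small makedir helper that tolerates an already existing directory, which is what the comment above refers to. The helper itself is not shown here; a minimal sketch of what such a wrapper typically looks like (an assumption, not the exact implementation used by these projects):

import os


def makedir(path):
    """Create path if it is missing; swallow the error if it already exists."""
    try:
        os.makedirs(path)
    except OSError:
        # Directory already exists (or was created concurrently); nothing to do.
        pass

On Python 3, os.makedirs(path, exist_ok=True) achieves the same thing without the try/except.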
Example #5
def downloadBiduPicture(keyword, startpage, endpage):
    # Create the download folder
    common.makedir(str(keyword))
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.61 Safari/537.36'
    }
    # Loop over the requested pages
    while (int(startpage) < int(endpage)):
        url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ie=utf-8&oe=utf-8&word=' + str(
            keyword) + '&pn=' + str(startpage) + '&rn=60'
        text = requests.get(url=url, headers=headers).text
        # print(text)
        pictureList = getBaiduUrlList(text)
        for count in range(0, len(pictureList)):
            try:
                name = str(keyword) + '_' + str(startpage) + '_' + str(count)
                type = '.jpg'
                # Download in a separate thread
                mythread = threading.Thread(target=common.downloadPicture,
                                            args=(pictureList[count], name,
                                                  type))
                mythread.start()
                # Cap the number of threads at 64
                if threading.activeCount() >= 64:
                    mythread.join()
                print('thread name:' + str(mythread.name))
                print(str(threading.activeCount()) + ' active threads')
            except Exception:
                print('Picture ' + str(count) + ' timed out, downloading the next one')
        # Advance to the next page
        startpage += 1
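Note: the loop above caps concurrency by joining a thread once threading.activeCount() reaches 64. A ThreadPoolExecutor bounds the number of workers directly; a hedged sketch of the same per-page download step, where download_func stands in for the common.downloadPicture(url, name, type) call used above:

from concurrent.futures import ThreadPoolExecutor


def download_page(picture_list, keyword, startpage, download_func, max_workers=64):
    # The pool never runs more than max_workers downloads at the same time.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for count, url in enumerate(picture_list):
            name = str(keyword) + '_' + str(startpage) + '_' + str(count)
            pool.submit(download_func, url, name, '.jpg')
    # Leaving the with-block waits for all submitted downloads to finish.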
Example #6
    def run_fuse(self):
        """
        Run fusion.
        """

        assert os.path.exists(self.options.depth_dir)
        common.makedir(self.options.out_dir)

        files = self.read_directory(self.options.depth_dir)
        timer = common.WallTimer()
        Rs = self.get_views()

        for filepath in files:

            # As rendering might be slower, we wait for rendering to finish.
            # This allows to run rendering and fusing in parallel (more or less).

            depths = common.read_hdf5(filepath)

            timer.reset()
            tsdf = self.fusion(depths, Rs)
            tsdf = tsdf[0]

            vertices, triangles = libmcubes.marching_cubes(-tsdf, 0)
            vertices /= self.options.resolution
            vertices -= 0.5

            off_file = os.path.join(self.options.out_dir, ntpath.basename(filepath)[:-3])
            exporter.export_off(vertices, triangles, off_file)
            print('[Data] wrote %s (%f seconds)' % (off_file, timer.elapsed()))
Example #7
 def cross_mode_summary(self,
                        mode_lst=[mode for mode, _ in MODE_MAP.items()],
                        subject_lst=None,
                        subject_info=None):
     makedir(RESULTS_FOLDER)
     if subject_lst is not None:
         MODE_MAP[MODE_IDENTIFIER] = [None, None, 'black']
     self._plot_p_mb(mode_lst, subject_lst, subject_info)
Example #8
    def get_in_files(self):
        if self.options.in_dir is not None:
            assert os.path.exists(self.options.in_dir)
            common.makedir(self.options.out_dir)
            files = self.read_directory(self.options.in_dir)
        else:
            files = [self.options.in_file]

        return files
Example #9
def main(args):
  corpus_name = os.path.basename(args.corpus)
  if args.model:
    model_dir = args.model
  else:
    model_dir = os.path.join('.', corpus_name+'.model')

  makedir(model_dir)

  langs_path = os.path.join(model_dir, 'lang_index')
  domains_path = os.path.join(model_dir, 'domain_index')
  index_path = os.path.join(model_dir, 'paths')

  # display paths
  logging.info("corpus path: {0}".format(args.corpus))
  logging.info("model path: {0}".format(model_dir))
  logging.info("writing langs to: {0}".format(langs_path))
  logging.info("writing domains to: {0}".format(domains_path))
  logging.info("writing index to: {0}".format(index_path))

  indexer = CorpusIndexer(args.corpus, min_domain=args.min_domain, proportion=args.proportion,
                          langs = args.lang, domains = args.domain)

  # Compute mappings between files, languages and domains
  lang_dist = indexer.dist_lang
  lang_index = indexer.lang_index
  lang_info = ' '.join(("{0}({1})".format(k, lang_dist[v]) for k,v in lang_index.items()))
  logging.info("langs({0}): {1}".format(len(lang_dist), lang_info))

  domain_dist = indexer.dist_domain
  domain_index = indexer.domain_index
  domain_info = ' '.join(("{0}({1})".format(k, domain_dist[v]) for k,v in domain_index.items()))
  logging.info("domains({0}): {1}".format(len(domain_dist), domain_info))

  logging.info("identified {0} files".format(len(indexer.items)))

  # output the language index
  with open(langs_path,'w') as f:
    writer = csv.writer(f)
    writer.writerows((l, lang_dist[lang_index[l]]) 
        for l in sorted(lang_index.keys(), key=lang_index.get))

  # output the domain index
  with open(domains_path,'w') as f:
    writer = csv.writer(f)
    writer.writerows((d, domain_dist[domain_index[d]]) 
        for d in sorted(domain_index.keys(), key=domain_index.get))

  # output items found
  with open(index_path,'w') as f:
    writer = csv.writer(f)
    writer.writerows( (d,l,p) for (d,l,n,p) in indexer.items )
Example #10
    def run(self):
        """
        Run the tool, i.e. scale all found OFF files.
        """

        assert os.path.exists(self.options.in_dir)
        common.makedir(self.options.out_dir)
        common.makedir('1_s_t')
        files = self.read_directory(self.options.in_dir)

        for filepath in files:
            mesh = common.Mesh.from_off(filepath)

            # Get extents of model.
            min, max = mesh.extents()
            total_min = np.min(np.array(min))
            total_max = np.max(np.array(max))

            # Set the center (although this should usually be the origin already).
            centers = ((min[0] + max[0]) / 2, (min[1] + max[1]) / 2,
                       (min[2] + max[2]) / 2)
            # Scales all dimensions equally.
            sizes = (total_max - total_min, total_max - total_min,
                     total_max - total_min)
            translation = (-centers[0], -centers[1], -centers[2])
            scales = (1 / (sizes[0] + 2 * self.options.padding * sizes[0]),
                      1 / (sizes[1] + 2 * self.options.padding * sizes[1]),
                      1 / (sizes[2] + 2 * self.options.padding * sizes[2]))

            mesh.translate(translation)
            mesh.scale(scales)

            print(scales, translation)

            # print('[Data] %s extents before %f - %f, %f - %f, %f - %f' % (os.path.basename(filepath), min[0], max[0], min[1], max[1], min[2], max[2]))
            # min, max = mesh.extents()
            # print('[Data] %s extents after %f - %f, %f - %f, %f - %f' % (os.path.basename(filepath), min[0], max[0], min[1], max[1], min[2], max[2]))

            # May also switch axes if necessary.
            mesh.switch_axes(0, 2)

            mesh.to_off(
                os.path.join(self.options.out_dir, os.path.basename(filepath)))

            scipy.io.savemat(
                os.path.join('1_s_t', os.path.basename(filepath)).replace(
                    '.off', '.mat'), {
                        'translation': translation,
                        'scales': scales,
                        'sizes': sizes
                    })
Example #11
    def run(self):
        """
        Run simplification.
        """

        common.makedir(self.options.out_dir)
        files = self.get_in_files()

        for filepath in files:
            os.system(
                'LC_NUMERIC=C meshlabserver -i %s -o %s -s %s' %
                (filepath,
                 os.path.join(self.options.out_dir, ntpath.basename(filepath)),
                 self.simplification_script))
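Note: the meshlabserver calls above are built by interpolating file paths straight into an os.system string. A sketch of the same call through subprocess.run, which passes the arguments as a list and sets LC_NUMERIC=C via the environment (assuming the same out_dir and simplification script attributes as above):

import ntpath
import os
import subprocess


def simplify_mesh(filepath, out_dir, simplification_script):
    out_path = os.path.join(out_dir, ntpath.basename(filepath))
    env = dict(os.environ, LC_NUMERIC='C')
    # check=True raises CalledProcessError if meshlabserver exits with a non-zero status.
    subprocess.run(['meshlabserver', '-i', filepath, '-o', out_path,
                    '-s', simplification_script], env=env, check=True)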
Example #12
def main():
    """
    Iterates through the list of datasets containing measurements of cyanobacteria.
    The measurements are retrieved and stored in individual csv files on local disk.
    """
    api = pycmap.API(token=API_KEY)
    makedir(DATA_DIR)
    cyanos = cyano_datasets()
    for dataset in cyanos:
        print("\n********************************")
        print("Downloading ", dataset, " ...")
        print("********************************\n")
        data = retrieve(api, dataset, DEPTH1, DEPTH2)
        data.to_csv(f"{DATA_DIR}{dataset[0]}.csv", index=False)
Example #13
    def run(self):
        """
        Run simplification.
        """

        assert os.path.exists(self.options.in_dir)
        common.makedir(self.options.out_dir)
        files = self.read_directory(self.options.in_dir)

        for filepath in files:
            os.system(
                'meshlabserver -i %s -o %s -s %s' %
                (filepath,
                 os.path.join(self.options.out_dir, ntpath.basename(filepath)),
                 self.simplification_script))
Example #14
    def get_in_files(self):
        if self.options.in_dir is not None:
            assert os.path.exists(self.options.in_dir)
            common.makedir(self.options.out_dir)
            files = self.read_directory(self.options.in_dir)
        else:
            files = [self.options.in_file]

        if not self.options.overwrite:
            def file_filter(filepath):
                outpath = self.get_outpath(filepath)
                return not os.path.exists(outpath)
            files = list(filter(file_filter, files))

        return files
Example #15
 def _compare_action_against_human_data(self,
                                        title,
                                        num_comp=PCA_COMPONENTS):
     if not ACTION_COMPARE:
         return
     SAMPLE_ACTION_SEQUENCES = 50
     NUMBER_OF_SAMPLE_SUBJECTS = 10 if not HEAD_AND_TAIL_SUBJECTS else 9
     NUMBER_OF_TAIL_SUBJECTS = None if not HEAD_AND_TAIL_SUBJECTS else 9
     makedir(RESULTS_FOLDER + 'Action_Summary/')
     file_name = lambda x: self.file_name('Action_Summary/' + x)
     sample_df = self.human_data_df.copy()
     # self._aggregated_analysis(sample_df, lambda episode: self._get_entropy_series(episode), file_name, title, num_comp)
     sample_df = pd.DataFrame(columns=[
         'trial_' + str(trial_num)
         for trial_num in range(self.trial_separation)
     ])
     feature_seq = []
     sample_detail_data = self.current_detail  # random.sample(self.current_detail, NUMBER_OF_SAMPLE_SUBJECTS)
     for subject_index, detail_df in enumerate(sample_detail_data):
         for index, episode in enumerate(
                 range(len(self.current_data[0]))[-SAMPLE_ACTION_SEQUENCES:]
         ):  # extract the last SAMPLE_ACTION_SEQUENCES episode action sequences per subject
             action_sequence = list(
                 map(int, (detail_df['action']
                           )[episode * self.trial_separation:(episode + 1) *
                             self.trial_separation].tolist()))
             sample_df.loc[SAMPLE_ACTION_SEQUENCES * subject_index +
                           index] = action_sequence
             if HEAD_AND_TAIL_SUBJECTS:
                 feature_seq.append(
                     self.human_data_df['Performance'].loc[subject_index])
             else:
                 feature_seq.append(subject_index)
     feature_series_func = lambda dummy_var: feature_seq
     self._aggregated_analysis(
         sample_df,
         feature_series_func,
         file_name,
         title,
         num_comp,
         head_subjects=NUMBER_OF_SAMPLE_SUBJECTS,
         tail_subjects=NUMBER_OF_TAIL_SUBJECTS,
         num_sequences=SAMPLE_ACTION_SEQUENCES,
         feature_label='Subject ID' if not HEAD_AND_TAIL_SUBJECTS else
         'Negative Log Likelihood Performance',
         in_all_episodes=False,
         in_selected_episodes=False,
         simple_analysis=True)
Example #16
    def run(self):
        """
        Run rotation.
        """

        assert os.path.exists(self.options.in_dir)
        common.makedir(self.options.out_dir)
        files = self.read_directory(self.options.in_dir)

        for filepath in files:
            #added LC_NUMERIC=C
            os.system('LC_NUMERIC=C meshlabserver -i %s -o %s -s %s' % (
                filepath,
                os.path.join(self.options.out_dir, ntpath.basename(filepath)),
                self.rotation_script
            ))
Example #17
    def run(self):
        """
        Run the tool, i.e. scale all found OFF files.
        """
        common.makedir(self.options.out_dir)
        if self.options.t_dir is not None:
            common.makedir(self.options.t_dir)

        files = self.get_in_files()

        if self.options.n_proc == 0:
            for filepath in files:
                self.run_file(filepath)
        else:
            with Pool(self.options.n_proc) as p:
                p.map(self.run_file, files)
Example #18
    def run(self):
        """
        Run simplification.
        """

        assert os.path.exists(self.options.in_dir)
        common.makedir(self.options.out_dir)
        files = self.read_directory(self.options.in_dir)
        print(files)

        for filepath in files[0:1]:
            print(filepath)
            command = '/Applications/meshlab.app/Contents/MacOS/meshlabserver -i %s -o %s -s %s' % (
                filepath,
                os.path.join(self.options.out_dir, ntpath.basename(filepath)),
                self.simplification_script)
            print(command)
            os.system(command)
Example #19
def main(args):
    if args.temp:
        buckets_dir = args.temp
    else:
        buckets_dir = os.path.join(args.model, 'buckets')
    makedir(buckets_dir)

    bucketlist_path = os.path.join(args.model, 'bucketlist')
    index_path = os.path.join(args.model, 'paths')

    # display paths
    logger.info("index path: %s", index_path)
    logger.info("bucketlist path: %s", bucketlist_path)
    logger.info("buckets path: %s", buckets_dir)

    with open(index_path) as f:
        reader = csv.reader(f)
        items = list(reader)

    # Tokenize
    logger.info("will tokenize %d files" % len(items))
    if args.scanner:
        from scanner import Scanner
        tokenizer = Scanner.from_file(args.scanner)
        logger.info("using provided scanner: ", args.scanner)
    elif args.prager:
        tokenizer = PragerTokenizer(args.order, use_words=args.words)
        logger.info(
            "using Prager tokenization: order[{0}] use_words[{1}]".format(
                args.order, args.words))
    else:
        tokenizer = NGramTokenizer(args.min_order, args.max_order)
        logger.info("using n-gram tokenizer: order {0}-{1}".format(
            args.min_order, args.max_order))
    b_dirs = build_index(items, tokenizer, buckets_dir, args.buckets,
                         args.jobs, args.chunksize, args.sample_count,
                         args.sample_size)

    # output the paths to the buckets
    with open(bucketlist_path, 'w') as f:
        for d in b_dirs:
            f.write(d + '\n')
Example #20
    def run_render(self):
        """
        Run rendering.
        """

        assert os.path.exists(self.options.in_dir)
        common.makedir(self.options.depth_dir)

        files = self.read_directory(self.options.in_dir)
        timer = common.WallTimer()
        Rs = self.get_views()

        for filepath in files:
            timer.reset()
            mesh = common.Mesh.from_off(filepath)
            depths = self.render(mesh, Rs)

            depth_file = os.path.join(self.options.depth_dir, os.path.basename(filepath) + '.h5')
            common.write_hdf5(depth_file, np.array(depths))
            print('[Data] wrote %s (%f seconds)' % (depth_file, timer.elapsed()))
Example #21
    def _compare_score_against_human_data(self, title):
        if not SOCRE_COMPARE:
            return
        makedir(RESULTS_FOLDER + 'Score_Summary/')
        file_name = lambda x: self.file_name('Score_Summary/' + x)
        summary_df = self.human_data_df.copy()
        # create a target for CCA
        target_df = pd.DataFrame()
        target_df['score'] = [df['score'].mean() for df in self.current_data]
        cca = CCA(n_components=1)
        cca.fit(summary_df, target_df)

        # combine them for PCA
        for column_id in ANALYSIS_EXTRA_COLUMNS:
            summary_df[column_id] = [
                df[column_id].mean() for df in self.current_data
            ]
        pca = PCA(n_components=PCA_COMPONENTS)
        pca.fit(summary_df)
        with open(file_name('Score Statistics Summary ' + title), 'x') as f:
            self._write_pca_summary(pca, f)
            f.write('\nCCA:\n    X weights:\n')
            f.write('        ' + ' '.join(map(str, cca.x_weights_)))
            f.write('\n    Y weights\n')
            f.write('        ' + ' '.join(map(str, cca.y_weights_)))

        # generate historical CCA
        cca_trace_df = pd.DataFrame(columns=HUMAN_DATA_COLUMN)
        for index in range(self.current_data[0].shape[0])[3:]:
            target_df = pd.DataFrame()
            target_df['score'] = [
                df['score'].loc[:index].mean() for df in self.current_data
            ]
            cca.fit(self.human_data_df, target_df)
            cca_trace_df.loc[index] = [abs(x[0]) for x in cca.x_weights_]
        cca_trace_df.plot(figsize=FIG_SIZE,
                          grid=True,
                          title='CCA progression summary ' + title)
        save_plt_figure(file_name('CCA progression summary ' + title))
Example #22
def downloadGooglePicture(keyword, startpage, endpage):
    common.makedir(str(keyword))
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER"
    }
    for startpage in range(int(startpage), int(endpage)):
        url = 'https://chartsapi.gdgdocs.org/search?tbm=isch&q=' + str(
            keyword) + '&ijn=' + str(startpage)
        # print(url)
        text = requests.get(url=url, headers=headers).content
        # Decode: convert the binary response to str
        text = text.decode()
        # print(text)
        # JSON parsing fails here, so use a regex to extract all image URLs
        pattern = re.compile(r',"ou":"(.*?)","ow"', re.S)
        pictureList = re.findall(pattern, text)
        # print(pictureList)
        for count in range(0, len(pictureList)):
            # Fix the URL encoding issue: '=' comes back escaped as \u003d
            pictureList[count] = common.urlCode(pictureList[count])
            # print(pictureList[count])
            name = str(keyword) + '_' + str(startpage) + "_" + str(count)
            type = '.jpg'
            try:
                # Start the download thread
                mythread = threading.Thread(target=common.downloadPicture,
                                            args=(pictureList[count], name,
                                                  type))
                mythread.start()
                # Cap the number of threads
                if (threading.activeCount() >= 64):
                    mythread.join()
                # Print the thread name and the active thread count
                print('thread name:' + str(mythread.name))
                print(str(threading.activeCount()) + ' active threads')
            except Exception:
                print('Failed to create thread')
Example #23
    def run(self):
        """
        Run the tool.
        """
        common.makedir(self.options.out_dir)
        files = self.get_in_files()

        if self.options.mode == 'render':
            method = self.run_render
        elif self.options.mode == 'fuse':
            method = self.run_fuse
        elif self.options.mode == 'sample':
            method = self.run_sample
        else:
            print('Invalid mode, choose render, fuse or sample.')
            exit()

        if self.options.n_proc == 0:
            for filepath in files:
                method(filepath)
        else:
            with Pool(self.options.n_proc) as p:
                p.map(method, files)
Example #24
def main(args):
  if args.temp:
    buckets_dir = args.temp
  else:
    buckets_dir = os.path.join(args.model, 'buckets')
  makedir(buckets_dir)

  bucketlist_path = os.path.join(args.model, 'bucketlist')
  index_path = os.path.join(args.model, 'paths')

  # display paths
  logger.info("index path: %s", index_path)
  logger.info("bucketlist path: %s", bucketlist_path)
  logger.info("buckets path: %s", buckets_dir)

  with open(index_path) as f:
    reader = csv.reader(f)
    items = list(reader)

  # Tokenize
  logger.info("will tokenize %d files" % len(items))
  if args.scanner:
    from scanner import Scanner
    tokenizer = Scanner.from_file(args.scanner)
    logger.info("using provided scanner: ", args.scanner)
  elif args.prager:
    tokenizer = PragerTokenizer(args.order, use_words=args.words)
    logger.info("using Prager tokenization: order[{0}] use_words[{1}]".format(args.order, args.words))
  else:
    tokenizer = NGramTokenizer(args.min_order,args.max_order)
    logger.info("using n-gram tokenizer: order {0}-{1}".format(args.min_order, args.max_order))
  b_dirs = build_index(items, tokenizer, buckets_dir, args.buckets, args.jobs, args.chunksize, args.sample_count, args.sample_size)

  # output the paths to the buckets
  with open(bucketlist_path,'w') as f:
    for d in b_dirs:
      f.write(d+'\n')
Example #25
def translate(source_sentences, phrase_table_fnames, weights, translation_type,
              permutations_per_sentence, derivations_count, output_dir):
    sentence_fsts_dir = os.path.join(output_dir, 'sentence_fsts')
    common.makedir(sentence_fsts_dir)
    if 'monotone' == translation_type:
        encode_sentences_to_fsts_monotone(source_sentences, sentence_fsts_dir)
    else:
        encode_sentences_to_fsts_lattice(source_sentences, permutations_per_sentence, weights, sentence_fsts_dir)
    phrase_table_fsts_dir = os.path.join(output_dir, 'phrase_table_fsts')
    common.makedir(phrase_table_fsts_dir)
    encode_phrase_tables_to_fsts(source_sentences, phrase_table_fnames, weights, phrase_table_fsts_dir)
    translation_fsts_dir = os.path.join(output_dir, 'translation_fsts')
    common.makedir(translation_fsts_dir)
    make_translation_fsts(sentence_fsts_dir, phrase_table_fsts_dir, translation_type,
                          derivations_count, translation_fsts_dir)
    translations_dir = os.path.join(output_dir, 'translations')
    common.makedir(translations_dir)
    get_best_translations(translation_fsts_dir, translation_type, translations_dir)
Example #26
      help="use DOMAIN - can be specified multiple times (uses all domains found if not specified)")
  parser.add_argument("-l","--lang", metavar="LANG", action='append',
      help="use LANG - can be specified multiple times (uses all langs found if not specified)")
  parser.add_argument("--min_domain", type=int, default=MIN_DOMAIN,
      help="minimum number of domains a language must be present in" )
  parser.add_argument("corpus", help="read corpus from CORPUS_DIR", metavar="CORPUS_DIR")

  args = parser.parse_args()

  corpus_name = os.path.basename(args.corpus)
  if args.model:
    model_dir = args.model
  else:
    model_dir = os.path.join('.', corpus_name+'.model')

  makedir(model_dir)

  langs_path = os.path.join(model_dir, 'lang_index')
  domains_path = os.path.join(model_dir, 'domain_index')
  index_path = os.path.join(model_dir, 'paths')

  # display paths
  if not SILENT:
    print "corpus path:", args.corpus
    print "model path:", model_dir
    print "writing langs to:", langs_path
    print "writing domains to:", domains_path
    print "writing index to:", index_path

  indexer = CorpusIndexer(args.corpus, min_domain=args.min_domain, proportion=args.proportion,
                          langs = args.lang, domains = args.domain)
Example #27
    group.add_argument(
        "--sample_count",
        type=int,
        help="number of samples for sampling-based tokenization",
        default=None)

    args = parser.parse_args()

    if args.sample_count and args.line:
        parser.error("sampling in line mode is not implemented")

    if args.temp:
        tmp_dir = args.temp
    else:
        tmp_dir = os.path.join(args.model, 'buckets')
    makedir(tmp_dir)

    # We generate a new directory at each invocation, otherwise we run the
    # risk of conflicting with a previous run without warning.
    buckets_dir = tempfile.mkdtemp(suffix='tokenize', dir=tmp_dir)

    bucketlist_path = args.output if args.output else os.path.join(
        args.model, 'bucketlist')
    index_path = os.path.join(args.model, 'paths')

    # display paths
    print "index path:", index_path
    print "bucketlist path:", bucketlist_path
    print "buckets path:", buckets_dir

    if args.line:
Example #28
    tr_type = sys.argv[6]
    perms_fname = None
    if 'monotone' == tr_type:
        out_dir = sys.argv[7]
    elif len(sys.argv) < 9 or 'lattice' != tr_type:
        print(error)
        sys.exit()
    else:
        perms_fname = sys.argv[7]
        out_dir = sys.argv[8]
    return sys.argv[1], sys.argv[2], int(sys.argv[3]),\
        sys.argv[4], int(sys.argv[5]), tr_type, perms_fname, out_dir


if __name__ == '__main__':
    if len(sys.argv) < 8:
        print(error)
        sys.exit()

    source_sentences_fname, phrase_tables_dir, sentences_count, weights_fname, derivations_count,\
        translation_type, permutations_fname, output_dir = parse_arguments()

    source_sentences, phrase_table_fnames, weights, permutations_per_sentence =\
        ir.read_input(source_sentences_fname, phrase_tables_dir, sentences_count, weights_fname, permutations_fname)

    # Create output folder
    common.makedir(output_dir)

    tp.translate(source_sentences, phrase_table_fnames, weights, translation_type,
                 permutations_per_sentence, derivations_count, output_dir)
Example #29
  parser.add_argument("-s", "--scanner", metavar='SCANNER', help="use SCANNER for tokenizing")
  parser.add_argument("--buckets", type=int, metavar='N', help="distribute features into N buckets", default=NUM_BUCKETS)
  parser.add_argument("--max_order", type=int, help="highest n-gram order to use")
  parser.add_argument("--word", action='store_true', default=False, help="use 'word' tokenization (currently str.split)")
  parser.add_argument("--chunksize", type=int, help="max chunk size (number of files to tokenize at a time - smaller should reduce memory use)", default=CHUNKSIZE)
  parser.add_argument("-t", "--temp", metavar='TEMP_DIR', help="store buckets in TEMP_DIR instead of in MODEL_DIR/buckets")
  parser.add_argument("model", metavar='MODEL_DIR', help="read index and produce output in MODEL_DIR")
  
  args = parser.parse_args()
  

  if args.temp:
    buckets_dir = args.temp
  else:
    buckets_dir = os.path.join(args.model, 'buckets')
  makedir(buckets_dir)

  bucketlist_path = os.path.join(args.model, 'bucketlist')
  index_path = os.path.join(args.model, 'paths')

  # display paths
  print "index path:", index_path
  print "bucketlist path:", bucketlist_path
  print "buckets path:", buckets_dir

  with open(index_path) as f:
    reader = csv.reader(f)
    items = list(reader)

  if sum(map(bool,(args.scanner, args.max_order, args.word))) > 1:
    parser.error('can only specify one of --word, --scanner and --max_order')
Example #30
  group = parser.add_argument_group('sampling')
  group.add_argument("--sample_size", type=int, help="size of sample for sampling-based tokenization", default=140)
  group.add_argument("--sample_count", type=int, help="number of samples for sampling-based tokenization", default=None)
  
  args = parser.parse_args()

  if args.sample_count and args.line:
    parser.error("sampling in line mode is not implemented")
  

  if args.temp:
    tmp_dir = args.temp
  else:
    tmp_dir = os.path.join(args.model, 'buckets')
  makedir(tmp_dir)

  # We generate a new directory at each invocation, otherwise we run the 
  # risk of conflicting with a previous run without warning.
  buckets_dir = tempfile.mkdtemp(suffix='tokenize',dir=tmp_dir)

  bucketlist_path = args.output if args.output else os.path.join(args.model, 'bucketlist')
  index_path = os.path.join(args.model, 'paths')

  # display paths
  print "index path:", index_path
  print "bucketlist path:", bucketlist_path
  print "buckets path:", buckets_dir

  if args.line:
  	print "treating each LINE as a document"
Example #31
  # Try to determine the set of features to consider
  if args.features:
    # Use a pre-determined feature list
    feat_path = args.features
  elif os.path.exists(m_path('DFfeats')):
    # Use LDfeats
    feat_path = m_path('DFfeats')
  else:
    raise ValueError("no suitable feature list")

  # Where temp files go
  if args.temp:
    buckets_dir = args.temp
  else:
    buckets_dir = m_path('buckets')
  makedir(buckets_dir)

  all_langs = set()
  pairs = []
  for p in args.pairs:
    try:
      lang1, lang2 = p.split(',')
    except ValueError:
      # Did not unpack to two values
      parser.error("{0} is not a lang-pair".format(p))
    all_langs.add(lang1)
    all_langs.add(lang2)
    pairs.append((lang1, lang2))

  if args.output:
    makedir(args.output)
Example #32
    max_order = 4
    min_domain = 1
    model = None
    no_domain_ig = False
    proportion = 1.0
    sample_count = None
    sample_size = 140
    temp = None
    word = False


if __name__ == "__main__":
    data_path = "../data"
    corpus_name = os.path.basename(data_path)
    model_dir = os.path.join('.', corpus_name + '.model')
    makedir(model_dir)
    # Initialize the corpus indexer
    # Inputs: data path, min domains, proportion, languages, domains, line level
    print("Start indexing the corpus...")
    indexer = CorpusIndexer(data_path,
                            min_domain=1,
                            proportion=1.0,
                            langs=None,
                            domains=None,
                            line_level=False)

    # Compute the mappings between files, languages and domains
    lang_dist = indexer.dist_lang
    lang_index = indexer.lang_index
    lang_info = ' '.join(
        ("{0}({1})".format(k, lang_dist[v]) for k, v in lang_index.items()))
Example #33
    group.add_argument(
        "--sample_count",
        type=int,
        help="number of samples for sampling-based tokenization",
        default=None)

    args = parser.parse_args()

    if args.sample_count and args.line:
        parser.error("sampling in line mode is not implemented")

    if args.temp:
        buckets_dir = args.temp
    else:
        buckets_dir = os.path.join(args.model, 'buckets')
    makedir(buckets_dir)

    bucketlist_path = args.output if args.output else os.path.join(
        args.model, 'bucketlist')
    index_path = os.path.join(args.model, 'paths')

    # display paths
    print "index path:", index_path
    print "bucketlist path:", bucketlist_path
    print "buckets path:", buckets_dir

    if args.line:
        print "treating each LINE as a document"

    with open(index_path) as f:
        reader = csv.reader(f)
Example #34
    )

    parser.add_argument("corpus", help="read corpus from CORPUS_DIR", metavar="CORPUS_DIR")

    args = parser.parse_args()

    if args.df_feats and args.ld_feats:
        parser.error("--df_feats and --ld_feats are mutually exclusive")

    corpus_name = os.path.basename(args.corpus)
    if args.model:
        model_dir = args.model
    else:
        model_dir = os.path.join(".", corpus_name + ".model")

    makedir(model_dir)

    langs_path = os.path.join(model_dir, "lang_index")
    domains_path = os.path.join(model_dir, "domain_index")
    index_path = os.path.join(model_dir, "paths")

    # display paths
    print "corpus path:", args.corpus
    print "model path:", model_dir

    indexer = CorpusIndexer(
        args.corpus, min_domain=args.min_domain, proportion=args.proportion, langs=args.lang, domains=args.domain
    )

    # Compute mappings between files, languages and domains
    lang_dist = indexer.dist_lang
Example #35
        type=bool,
        default=True,
        help='If printing logging information for scales of meshes.')
    return parser


if __name__ == '__main__':

    parser = get_parser()
    options = parser.parse_args()

    scale_tools = Scale(options)
    fusion_tools = Fusion(options)

    assert os.path.exists(options.in_dir)
    common.makedir(options.scale_dir)
    common.makedir(options.depth_dir)
    common.makedir(options.out_dir)

    files_unfiltered = scale_tools.read_directory(options.in_dir)
    files = [file for file in files_unfiltered if '.off' in file]
    print('= Found %s OFFs in %s' % (len(files), options.in_dir))
    print(files)
    timer = common.Timer()
    Rs = fusion_tools.get_views()

    for idx, filepath in enumerate(files):
        print('=== Processing %d/%d OFFs...' % (idx + 1, len(files)))

        off_file_out = os.path.join(options.out_dir,
                                    ntpath.basename(filepath)).replace(
Example #36
  args = parser.parse_args()

  if args.sample_count and args.line:
    parser.error("sampling in line mode is not implemented")

  if args.df_feats and args.ld_feats:
    parser.error("--df_feats and --ld_feats are mutually exclusive")

  corpus_name = os.path.basename(args.corpus)
  if args.model:
    model_dir = args.model
  else:
    model_dir = os.path.join('.', corpus_name+'.model')

  makedir(model_dir)

  # display paths
  print "corpus path:", args.corpus
  print "model path:", model_dir

  indexer = CorpusIndexer(args.corpus, min_domain=args.min_domain, proportion=args.proportion,
                          langs = args.lang, domains = args.domain, line_level=args.line)

  # Compute mappings between files, languages and domains
  lang_dist = indexer.dist_lang
  lang_index = indexer.lang_index
  lang_info = ' '.join(("{0}({1})".format(k, lang_dist[v]) for k,v in lang_index.items()))
  print "langs({0}): {1}".format(len(lang_dist), lang_info)

  domain_dist = indexer.dist_domain
Example #38
    def run_fuse(self):
        """
        Run fusion.
        """

        assert os.path.exists(self.options.depth_dir)
        common.makedir(self.options.out_dir)
        common.makedir('4_tsdf')

        files = self.read_directory(self.options.depth_dir)
        timer = common.Timer()
        Rs = self.get_views()

        for filepath in files:

            # As rendering might be slower, we wait for rendering to finish.
            # This allows to run rendering and fusing in parallel (more or less).
            depths = common.read_hdf5(filepath)

            timer.reset()
            tsdf = self.fusion(depths, Rs)
            tsdf = tsdf[0]

            vertices, triangles = libmcubes.marching_cubes(-tsdf, 0)
            # vertices, triangles, _, _ = measure.marching_cubes_lewiner(-tsdf, 0)
            print(tsdf.shape)
            np.save(os.path.join('4_tsdf', ntpath.basename(filepath)[:-3]).replace('.off', ''), -tsdf)
            vertices /= self.options.resolution
            vertices -= 0.5

            off_file = os.path.join(self.options.out_dir, ntpath.basename(filepath)[:-3])
            libmcubes.export_off(vertices, triangles, off_file)
            print('[Data] wrote %s (%f seconds)' % (off_file, timer.elapsed()))

            mesh = common.Mesh.from_off(off_file)
            s_t = scipy.io.loadmat(off_file.replace('2_watertight', '1_s_t').replace('.off', '.mat'))
            # scales_ori = (1./s_t['scales'][0][0], 1./s_t['scales'][0][1], 1./s_t['scales'][0][2])
            # translation_ori = (-s_t['translation'][0][0], -s_t['translation'][0][1], -s_t['translation'][0][2])

            sizes_ori = (s_t['sizes'][0][0], s_t['sizes'][0][1], s_t['sizes'][0][2])

            # print scales, translation

            min, max = mesh.extents()
            total_min = np.min(np.array(min))
            total_max = np.max(np.array(max))

            # Set the center (although this should usually be the origin already).
            centers = (
                (min[0] + max[0]) / 2,
                (min[1] + max[1]) / 2,
                (min[2] + max[2]) / 2
            )
            # Scales all dimensions equally.
            sizes = (
                total_max - total_min,
                total_max - total_min,
                total_max - total_min
            )
            translation = (
                -centers[0],
                -centers[1],
                -centers[2]
            )

            mesh.translate(translation)
            mesh.scale((sizes_ori[0]/sizes[0], sizes_ori[1]/sizes[1], sizes_ori[2]/sizes[2]))
            mesh.to_off(off_file)