def _on_run_sampler(self, e):
        """Handle a click on the 'run sampler' button.

        Reads the confidence/precision values from the UI, samples files from
        the input directory, copies them (preserving the directory tree) into
        the output directory, and reveals the results tree.  Any failure is
        reported to the user in an error dialog instead of crashing the UI.
        """
        try:
            self.confidence_val = Decimal(self.confidence.GetValue()) / Decimal('100')
            self.precision_val = float(self.precision.GetValue()) / 100.0

            # Both directories must exist before any file work starts.
            if not (os.path.exists(self.dir_path) and os.path.exists(self.output_dir_path)):
                wx.MessageDialog(self, "Please enter a valid input/output directory",
                                 "Error", wx.ICON_ERROR).ShowModal()
                return

            file_list = find_files_in_folder(self.dir_path)
            self.SetStatusText('%d files found in %s.' % (len(file_list), self.dir_path))

            sampled_files = random_sampler(file_list, self.confidence_val,
                                           self.precision_val, self.SEED)
            self.SetStatusText('%d files are sampled out of %d files.'
                               % (len(sampled_files), len(file_list)))

            copy_files_with_dir_tree(self.dir_path, sampled_files, self.output_dir_path)
            self.SetStatusText('%d randomly sampled files (from %d files) are copied to the output folder.'
                               % (len(sampled_files), len(file_list)))

            # Reveal the tree list control showing the freshly copied samples.
            self.process_files_tree.on_changed_output_dir(self.output_folder_control.GetValue())
            self.process_files_tree.Show(True)
            self.GetSizer().Layout()
            self.Refresh()

        except Exception as err:
            # Surface the failure to the user in a modal dialog.
            wx.MessageDialog(self, str(err), "Error", wx.ICON_ERROR).ShowModal()
# ---- Example 2 (scrape-artifact separator) ----
def main():
    """Command-line test driver for the random sampler.

    Parses input-folder/confidence/precision/output-dir arguments, validates
    them, samples files from the input folder and copies the sample into a
    timestamped destination directory.

    Returns:
        list: the randomly sampled file paths.

    Raises:
        Exception: if the input folder is missing, or confidence/precision
            are not in the interval (0, 1].
    """
    timestamp = datetime.datetime.now()
    arg_parser = argparse.ArgumentParser('Random sample test function:')
    arg_parser.add_argument("-d", dest="input_folder", type=str,
                            help="the root directory for all the mails",
                            required=True)
    # NOTE: literal '%' must be written as '%%' in argparse help strings,
    # otherwise rendering --help raises a string-formatting error.
    arg_parser.add_argument("-c", dest="confidence", type=float,
                            help="The confidence interval eg. 0.95 for 95%%",
                            required=True)
    arg_parser.add_argument("-p", dest="precision", type=float,
                            help="The precision for the interval eg. 0.02 precision "
                                 "for .95 confidence gives 95%% +/-2%% error",
                            required=True)
    arg_parser.add_argument("-o", dest="output_dir", type=str,
                            help="Output directory of samples",
                            default="/home/abhiramj/code/temp/samples",
                            required=False)
    args = arg_parser.parse_args()

    if not os.path.isdir(args.output_dir):
        logger.debug("Making output directory" + args.output_dir)
        os.makedirs(args.output_dir)

    # Log this run to its own timestamped file inside the output directory.
    file_handle = logging.FileHandler(os.path.join(
        args.output_dir,
        'random_sampler_test_function--' + str(timestamp) + '.log'))
    file_handle.setLevel(logging.INFO)
    file_handle.setFormatter(formatter)
    logger.addHandler(file_handle)

    logger.info("Args are: ")
    logger.info("input_folder: " + args.input_folder)
    logger.info("confidence: " + str(args.confidence))
    logger.info("precision: " + str(args.precision))
    logger.info("output_dir: " + args.output_dir)

    # Validate inputs up front; the original used the Python-2-only
    # 'raise Exception, msg' form, which is a SyntaxError on Python 3.
    if not os.path.exists(args.input_folder):
        logger.error("Exiting with error: Input folder cannot be found")
        raise Exception("Input folder cannot be found")
    if args.confidence <= 0 or args.confidence > 1:
        logger.error("Exiting with error: Confidence is not valid, enter as a probability between 0 and 1")
        raise Exception("Confidence is not valid, enter as a probability between 0 and 1")
    if args.precision <= 0 or args.precision > 1:
        logger.error("Exiting with error: Precision is not valid, enter as a probability between 0 and 1")
        raise Exception("Precision is not valid, enter as a probability between 0 and 1")

    file_list = find_files_in_folder(args.input_folder)
    message_random_sample = random_sampler(file_list, args.confidence,
                                           args.precision, SEEDCONSTANT=0.5)

    # Suffix the destination with the timestamp so repeated runs never collide.
    file_destination_dir = args.output_dir + "--" + str(timestamp)
    copy_random_files(file_destination_dir, message_random_sample)
    return message_random_sample
 def __initialize(self):
     """Populate self.val_from_sexnameplr from the per-sex 'decided_set' files.

     For each tour ('atp', 'wta') every file in the decided_set stats folder
     is loaded as a {player_id: SizedValue} dict and flattened into
     self.val_from_sexnameplr keyed by (sex, file-stem, player_id).
     """
     for sex_name in ("atp", "wta"):
         folder = os.path.join(cfg_dir.stat_players_dir(sex_name),
                               "decided_set")
         for fpath in fu.find_files_in_folder(folder, filemask="*",
                                              recursive=False):
             # file stem, e.g. 'decided' from 'decided.txt'
             stem = os.path.basename(fpath).replace(".txt", "")
             loaded = dict_tools.load(
                 fpath,
                 createfun=lambda: defaultdict(rl.SizedValue),
                 keyfun=int,
                 valuefun=rl.SizedValue.create_from_text,
             )
             for player_id, value in loaded.items():
                 self.val_from_sexnameplr[(sex_name, stem, player_id)] = value
# ---- Example 4 (scrape-artifact separator) ----
def lucene_index(input_folder, output_folder):
    '''
    Indexes fresh text data using lucene 3.6.

    Doesn't support incremental generation of index as of now.
    Currently crashes on neo by running out of heap space.

    Arguments:
        input_folder: folder containing the text files to index.
        output_folder: folder where the index (and its log file) is written.
    Returns:
        void. The index is stored if generated.
    '''
    # Create the output directory BEFORE configuring the log file in it
    # (the original configured logging first, which fails for a missing dir).
    if not os.path.isdir(output_folder):
        logger.debug("Making output directory for index: " + output_folder)
        os.makedirs(output_folder)

    # Setting up the log file. 'filename=' is the correct basicConfig keyword;
    # the original passed the unrecognised 'file=' argument.
    logging.basicConfig(filename=os.path.join(output_folder, "lucene_index.log"))
    logging.info("Input directory for logging: " + input_folder)
    logging.info("Output directory of index: " + output_folder)

    # Setting up lucene's heap size for index and version of indexer
    lucene.initVM(initialheap='1024m', maxheap='2048m')
    index_folder = SimpleFSDirectory(File(output_folder))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(index_folder, analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    # Optimization to reduce heap space usage for generation of index.
    # Merges buffer with current index after 15 docs.
    writer.setMergeFactor(15)
    writer.setRAMBufferSizeMB(32.0)

    # Search to find the files to index
    files_to_index = find_files_in_folder(input_folder)
    for input_file in files_to_index:
        doc = Document()
        # 'with' closes each file promptly (the original leaked file handles).
        with open(input_file, 'r') as fh:
            content = fh.read()
        doc.add(Field("text", content, Field.Store.NO, Field.Index.ANALYZED))  # Do not store text. Only index.
        doc.add(Field("path", input_file, Field.Store.YES, Field.Index.NO))  # Store path to assist in retrieving the file
        writer.addDocument(doc)  # Index

    logger.info("Indexed lines from " + input_folder + " (%d documents in index)" % (writer.numDocs()))
    logger.info("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()  # Compress index
    logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()
    logger.info("Closed index")
def _last_file_date(sex: str, surface="all"):
    """Return the most recent date embedded in matching elo rating filenames.

    Scans the ratings directory for files named like
    '{sex}_{surface}_elo_yyyy.mm.dd.json' and returns the latest embedded
    date, or None when no dated files are found.
    """
    mask = "{}_{}_elo_*.json".format(sex, str(surface))
    found = []
    for fname in fu.find_files_in_folder(cfg_dir.ratings_dir(),
                                         filemask=mask,
                                         recursive=False):
        if len(fname) < 15:
            continue  # too short to carry a 'yyyy.mm.dd.json' suffix
        # slice out the 'yyyy.mm.dd' stamp just before the '.json' extension
        stamp = fname[-len(".json") - len("yyyy.mm.dd"):-len(".json")]
        found.append(datetime.date(int(stamp[0:4]),
                                   int(stamp[5:7]),
                                   int(stamp[8:10])))
    return max(found) if found else None
 def do_load(self, dialog):
     # Re-scan self.dir_path into self.file_list, repaint the widget,
     # then close the (presumably progress/loading) dialog that invoked us.
     # NOTE(review): assumes find_files_in_folder is fast enough to run on
     # the UI thread -- confirm for large directories.
     self.file_list = find_files_in_folder(self.dir_path)
     self.Refresh()
     dialog.Close()
    def __init__(self, parent):
        '''
        Constructor.

        Seeds the sampling defaults, builds the tag list and icon list,
        points the I/O widgets at the temp directory, and shows the frame.

        Args:
            parent: parent wx window, forwarded to the base-class constructor.
        '''
        # Some value needs to be initialized for this to run without exception
        self.confidence_val = DEFAULT_CONFIDENCE_LEVEL / Decimal('100')
        self.precision_val = DEFAULT_CONFIDENCE_INTERVAL

        # Calls the parent class's method (builds the widgets referenced below)
        super(RandomSampler, self).__init__(parent) 
        
        # Maps a file name to its list of (tag-name, status-string) pairs
        self.file_tag_dict = {}
        '''
        This is the tag format
        
        self.file_tag_dict(filename) = [('Reviewed','True'),('Accept', 'False'),('A1','False'),('A2','True')]
        '''
        
        # initialize the default list of tags and the current tag
        self.DEFAULT_TAGS_NUMBER = 2  # the two built-in tags inserted below
        self.REVIEWED_TAG_INDEX = 0   # row index of the 'Reviewed' tag
        self.ACCEPT_TAG_INDEX = 1     # row index of the 'Accept' tag
        self.current_file_selected = None
        self.default_tag = ('Default' , 'True')
        self.current_tag_list = self.make_default_tag_list()
        # Rebuild the two-column tag list; both default tags start as 'False'
        self._tag_list.ClearAll()
        self._tag_list.InsertColumn(0,'Tag')
        self._tag_list.InsertColumn(1,'Status')
        self._tag_list.InsertStringItem(self.REVIEWED_TAG_INDEX, 'Reviewed')
        self._tag_list.SetStringItem(self.REVIEWED_TAG_INDEX, 1, 'False')
        self._tag_list.InsertStringItem(self.ACCEPT_TAG_INDEX, 'Accept')
        self._tag_list.SetStringItem(self.ACCEPT_TAG_INDEX, 1, 'False')
        
        # Separator for splitting tags when (de)serializing them as text
        self.TAG_NAME_SEPARATOR = " , "
        self.TAG_PREFIX = 'tag :'
        
        # Maximum depth of folders expanded for display
        self.MAX_FOLDER_DEPTH = 2
        
        self._st_num_samples.Hide()
        self.dir_path = tempfile.gettempdir() # a cross-platform way of getting the path to the temp directory
        self.output_dir_path = tempfile.gettempdir()
        self.from_copy_files_dir = self.dir_path 
        self.to_copy_files_dir = self.output_dir_path

        # for the I/O tab: point every directory widget at the temp dir defaults
        self._tc_data_dir.SetValue(self.dir_path)
        self._tc_output_dir.SetValue(self.output_dir_path)
        self._tc_out_data_dir.SetValue(self.dir_path)
        self._tc_out_output_dir.SetValue(self.output_dir_path)
        self.file_list = find_files_in_folder(self.dir_path)
        self._st_num_data_dir_files.SetLabel('%d files found' % len(self.file_list))
        self._st_out_num_data_dir_files.SetLabel('%d files found' % len(self.file_list))

        
        # Defaults for random sample calculation (fixed seed => reproducible samples)
        self.SEED = 2013

        self._set_confidence_level_and_interval()
        self.confidence_val = Decimal(self._cbx_confidence_levels.GetValue()) / Decimal('100')
        self.get_precision_as_float()
        # NOTE(review): EVT_COMMAND_FIND_REPLACE_ALL is an unusual event to
        # drive the tag-list loader -- confirm this is the intended trigger.
        self.Bind(wx.EVT_COMMAND_FIND_REPLACE_ALL, self._on_load_tag_list)
        self._panel_samples.Show(False) # make the tree list control invisible
        
        # Icon defaults: 16x16 folder / open-folder / file icons for the results tree
        self.icon_size = (16,16)
        self.image_list = wx.ImageList(self.icon_size[0], self.icon_size[1])
        self._tc_results.SetImageList(self.image_list)
        self.folder_icon     = self.image_list.Add(wx.ArtProvider_GetBitmap(wx.ART_FOLDER,      wx.ART_OTHER, self.icon_size))
        self.folder_open_icon = self.image_list.Add(wx.ArtProvider_GetBitmap(wx.ART_FILE_OPEN,   wx.ART_OTHER, self.icon_size))
        self.file_icon     = self.image_list.Add(wx.ArtProvider_GetBitmap(wx.ART_NORMAL_FILE, wx.ART_OTHER, self.icon_size))  

        # Start on the first notebook page, centred on screen, and show the frame
        self._current_page = 0
        self.nb_config_sampler.ChangeSelection(self._current_page)

        self.Center()
        self.Show(True)
# ---- Example 8 (scrape-artifact separator) ----
    
    
    if search_algorithm == 'LDA':
    
        None
        # Process the query 
        
#        responsive_docs, non_responsive_docs = process_query(query, dictionary, lda, index, doc_paths, limit)
#        nrd = np.array(non_responsive_docs)
#        nrd_paths = [os.path.join(dir_path, nrd[idx,2]) for idx, dir_path in enumerate(nrd[:,1])] # looks like i'm not getting full file paths
        
    elif search_algorithm == 'Lucene':
#        None 
        responsive_docs = lucene_search(lucene_index_file, limit, query)
        non_responsive_docs = []
        for file_name in find_files_in_folder(DATA_PATH):
            if os.path.dirname(file_name) is not lucene_index_file:       # skipping index directory
                if file_name not in responsive_docs:
                    non_responsive_docs.append(file_name)
        nrd_paths=non_responsive_docs 
    
    print 'Number of responsive documents:', len(responsive_docs)
    print 'Number of non responsive documents:', len(non_responsive_docs) 
    
    print 'The responsive files are: '
    for f in responsive_docs:
        print f 
    
    
#    ## Enter confidence intervals to get samples  
#