def saveMissingFiles(self):
    """
    Saves a list of all missing files in the output directory
    """
    file_dir = os.path.join(self.exp["exp_dir"], "output")
    ensureDirExists(file_dir)
    file_path = os.path.join(file_dir, "missing_files.csv")

    with open(file_path, "w") as f:
        f.write("file_guid,match_guid,query,match_title,match_year,in_papers\n")
        for mfile in MISSING_FILES:
            for index, item in enumerate(mfile):
                try:
                    # strip embedded quotes so the CSV stays well-formed
                    f.write("\"" + six.text_type(item).replace("\"", "") + "\"")
                except UnicodeError:
                    f.write(u"<unicode error>")
                if index < len(mfile) - 1:
                    f.write(",")
                else:
                    f.write("\n")
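# Usage sketch ("Experiment" is a hypothetical stand-in for whatever class
# owns saveMissingFiles(); only the call pattern is illustrated):
#
#   exp = Experiment(...)      # hypothetical: must set exp["exp_dir"] first
#   exp.saveMissingFiles()     # writes <exp_dir>/output/missing_files.csv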
def setupExperimentDir(self):
    """
    Builds the experiment directory path and ensures it exists
    """
    self.exp["exp_dir"] = os.path.normpath(
        os.path.join(cp.Corpus.paths.experiments, self.exp["name"])) + os.sep
    ensureDirExists(self.exp["exp_dir"])
def initializeIndexer(self):
    """
    Initializes the Java VM, creates directories if needed
    """
    print("Initializing VM...")
    lucene.initVM(maxheap="768m")

    baseFullIndexDir = cp.Corpus.paths.fileLuceneIndex + os.sep
    ensureDirExists(baseFullIndexDir)
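# Usage sketch: PyLucene requires lucene.initVM() to run once per process
# before any Lucene class is touched, so call this early ("indexer" is a
# hypothetical instance of the owning class):
#
#   indexer.initializeIndexer()
#   writer = indexer.createIndexWriter(some_dir)  # safe once the VM is up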
def saveCachedJson(self, path, data):
    """
    Save anything as JSON
    """
    ensureDirExists(os.path.dirname(path))
    lines = json.dumps(data, indent=3)
    try:
        with codecs.open(path, "w", "utf-8") as f:
            f.write(lines)
    except Exception:
        print("Error saving JSON", path, "Exception in saveCachedJson():",
              sys.exc_info()[:2])
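# Usage sketch (path and payload are illustrative only; any JSON-serializable
# object works, and the "cache" subdirectory is an assumption):
#
#   self.saveCachedJson(
#       os.path.join(cp.Corpus.paths.fileDB, "cache", "doc1.json"),
#       {"guid": "doc1", "bow": ["term1", "term2"]})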
def __init__(self,
             numitems,
             results_file=True,
             dump_straight_to_disk=True,
             dump_filename="results.csv",
             message_text=None,
             start_counting_now=False,
             dot_every_xitems=None):
    """
    Accumulates per-query retrieval scores and optionally streams each
    result straight to a CSV file on disk
    """
    ProgressIndicator.__init__(self,
                               False,
                               numitems=numitems,
                               dot_every_xitems=dot_every_xitems)
    if message_text:
        self.message_text = message_text

    ## self.mrr=defaultdict(lambda:{})  # dict for Mean Reciprocal Rank scores
    ## self.dcg=defaultdict(lambda:{})  # Discounted Cumulative Gain
    ## self.ndcg=defaultdict(lambda:{})  # Normalized DCG
    ## self.precision=defaultdict(lambda:{})  # Exclusively precision score. 1 if right, 0 otherwise

    self.output_filename = dump_filename

    # stores every kind of score !TODO
    self.scores = defaultdict(
        lambda: defaultdict(lambda: defaultdict(lambda: 0)))
    self.num_data_points = defaultdict(lambda: defaultdict(lambda: 0))

    # old list to store results
    self.overall_results = []
    # new results in Pandas dataframe
    ## self.all_results=DataFrame()
    self.text_results = []
    self.total_citations = 0
    self.numchunks = 0

    self.report_file = None
    self.dump_file = None
    if results_file:
        ensureDirExists(cp.Corpus.paths.output)
        self.report_file = codecs.open(
            os.path.join(cp.Corpus.paths.output, "report.txt"), "w")

    ## self.citations_extra_info=["a94-1008-cit14","a94-1008-cit4", "a00-1014-cit27"]
    self.citations_extra_info = []
    self.full_citation_id = ""
    self.run_parameters = defaultdict(lambda: None)

    self.csv_columns = ("query_id file_guid citation_id doc_position "
                        "query_method doc_method precision_score rank "
                        "mrr_score ndcg_score az csc_ctype match_guid "
                        "first_result").split()

    self.dump_straight_to_disk = dump_straight_to_disk
    if dump_straight_to_disk:
        self.startResultsDump()
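# Usage sketch ("ResultsLogger" is a hypothetical name for this class, not
# confirmed by this file):
#
#   logger = ResultsLogger(numitems=len(queries), dump_filename="results.csv")
#   # scores accumulate in logger.scores; with dump_straight_to_disk=True
#   # each result is streamed to results.csv via startResultsDump()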
def loadListOrListAllFiles(self, inputdir, file_mask):
    """
    Either loads the existing file list or lists the contents of the
    input directory.
    """
    all_input_files_fn = os.path.join(cp.Corpus.paths.fileDB,
                                      "all_input_files.txt")
    ALL_INPUT_FILES = loadFileList(all_input_files_fn)
    if not ALL_INPUT_FILES:
        print("Listing all files...")
        ALL_INPUT_FILES = self.listAllFiles(inputdir, file_mask)
        ensureDirExists(cp.Corpus.paths.fileDB)
        saveFileList(ALL_INPUT_FILES, all_input_files_fn)
    return ALL_INPUT_FILES
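# Usage sketch (directory and mask are illustrative): the first call walks
# inputdir and saves the list; later calls reuse
# cp.Corpus.paths.fileDB/all_input_files.txt:
#
#   files = self.loadListOrListAllFiles("/data/inputs", "*.xml")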
def __init__(self, result_storer, cache_dir, res_ids=None,
             max_results=sys.maxsize):
    """
    Creates cache directory if it doesn't exist
    """
    # note: super(self.__class__, ...) recurses infinitely if a subclass
    # inherits this __init__; it assumes this class is never subclassed
    super(self.__class__, self).__init__(result_storer,
                                         res_ids=res_ids,
                                         max_results=max_results)
    self.cache_dir = cache_dir
    self.own_dir = os.path.join(cache_dir, self.result_storer.table_name)
    ensureDirExists(cache_dir)
    ensureDirExists(self.own_dir)
def __init__(self, table_name, cache_dir, res_ids=None,
             max_results=sys.maxsize):
    """
    Creates cache directory if it doesn't exist
    """
    if res_ids:
        self.res_ids = res_ids
    else:
        self.res_ids = []
    self.cache_dir = cache_dir
    self.table_name = table_name
    self.own_dir = os.path.join(cache_dir, table_name)
    ensureDirExists(cache_dir)
    ensureDirExists(self.own_dir)
    self.getResultList()
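# Usage sketch ("ResultsCache" is a hypothetical name for this class):
#
#   cache = ResultsCache("my_table", "/tmp/cache")
#   # creates /tmp/cache/ and /tmp/cache/my_table/, then loads the
#   # result list via getResultList()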
def createIndexWriter(self, actual_dir, max_field_length=20000000):
    """
    Returns an IndexWriter object created for the actual_dir specified
    """
    ensureDirExists(actual_dir)
    index = SimpleFSDirectory(File(actual_dir))
    analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)

    writerConfig = IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, analyzer)
    similarity = FieldAgnosticSimilarity()
    writerConfig.setSimilarity(similarity)
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    ## res= IndexWriter(index, analyzer, True, IndexWriter.MaxFieldLength(max_field_length))
    res = IndexWriter(index, writerConfig)
    res.deleteAll()
    return res
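# Usage sketch (index path illustrative; assumes initializeIndexer() above
# has already started the JVM):
#
#   writer = self.createIndexWriter(os.path.join(baseFullIndexDir, "idx1"))
#   # ... writer.addDocument(doc) for each document ...
#   writer.close()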
def createDefaultDirs(self):
    """
    Creates all necessary dirs
    """
    for path in self.paths:
        ensureDirExists(self.paths[path])