示例#1
0
    def saveMissingFiles(self):
        """
            Writes every entry of MISSING_FILES to
            <exp_dir>/output/missing_files.csv, one row per entry.

            Each field is converted to text, stripped of double quotes and
            wrapped in double quotes. A field that cannot be converted or
            written because of a Unicode error is replaced by the placeholder
            <unicode error>. An empty entry produces no output line.
        """
        file_dir = os.path.join(self.exp["exp_dir"], "output")
        ensureDirExists(file_dir)
        file_path = os.path.join(file_dir, "missing_files.csv")

        with open(file_path, "w") as f:
            f.write(
                "file_guid,match_guid,query,match_title,match_year,in_papers\n"
            )
            for mfile in MISSING_FILES:
                last_index = len(mfile) - 1
                for index, item in enumerate(mfile):
                    try:
                        f.write('"' + six.text_type(item).replace('"', "") +
                                '"')
                    except UnicodeError:
                        # Only encoding/decoding failures are expected here;
                        # a bare except would also hide KeyboardInterrupt and
                        # SystemExit.
                        f.write(u"<unicode error>")

                    # Comma between fields, newline after the last one
                    f.write("," if index < last_index else "\n")
示例#2
0
 def setupExperimentDir(self):
     """
         Computes the experiment directory from the corpus experiments path
         and the experiment name, stores it (normalized, with a trailing
         separator) in self.exp["exp_dir"] and makes sure it exists.
     """
     raw_path = os.path.join(cp.Corpus.paths.experiments, self.exp["name"])
     self.exp["exp_dir"] = os.path.normpath(raw_path) + os.sep
     ensureDirExists(self.exp["exp_dir"])
示例#3
0
    def initializeIndexer(self):
        """
            Starts the Lucene VM with a 768m max heap and makes sure the
            base directory for the full Lucene index exists.
        """
        print("Initializing VM...")
        lucene.initVM(maxheap="768m")

        # The index path is passed with a trailing separator
        ensureDirExists(cp.Corpus.paths.fileLuceneIndex + os.sep)
示例#4
0
    def saveCachedJson(self, path, data):
        """
            Saves `data` as JSON (3-space indent) to `path`, encoded as UTF-8.

            The parent directory of `path` is created first. Serialization
            errors from json.dumps propagate to the caller; errors while
            opening or writing the file are reported on stdout and swallowed.

            Args:
                path: destination file path
                data: any JSON-serializable object
        """
        ensureDirExists(os.path.dirname(path))
        # Serialize the argument actually passed in; the previous code
        # referenced `bow`, a name that is not defined in this method.
        lines = json.dumps(data, indent=3)
        try:
            # The context manager closes the file even if write() fails
            with codecs.open(path, "w", "utf-8") as f:
                f.write(lines)
        except Exception:
            print("Error saving JSON", path, "Exception in saveCachedJson():",
                  sys.exc_info()[:2])
示例#5
0
    def __init__(self,
                 numitems,
                 results_file=True,
                 dump_straight_to_disk=True,
                 dump_filename="results.csv",
                 message_text=None,
                 start_counting_now=False,
                 dot_every_xitems=None):
        """
            Sets up the score containers and, optionally, the report file
            and the on-disk results dump.

            Args:
                numitems: forwarded to ProgressIndicator.__init__
                results_file: if truthy, opens "report.txt" for writing in
                    cp.Corpus.paths.output (creating that directory)
                dump_straight_to_disk: if truthy, calls
                    self.startResultsDump() at the end of initialization
                dump_filename: stored as self.output_filename
                message_text: if truthy, stored as self.message_text
                start_counting_now: not read by this method (False is passed
                    positionally to ProgressIndicator.__init__)
                dot_every_xitems: forwarded to ProgressIndicator.__init__
        """
        ProgressIndicator.__init__(
            self, False, numitems=numitems, dot_every_xitems=dot_every_xitems)
        if message_text:
            self.message_text = message_text

        self.output_filename = dump_filename

        def _zero():
            return 0

        def _zero_map():
            return defaultdict(_zero)

        # stores every kind of score: three nested levels defaulting to 0
        self.scores = defaultdict(lambda: defaultdict(_zero_map))
        # two nested levels defaulting to 0
        self.num_data_points = defaultdict(_zero_map)

        # old list to store results
        self.overall_results = []

        self.text_results = []
        self.total_citations = 0
        self.numchunks = 0

        self.report_file = None
        self.dump_file = None
        if results_file:
            output_dir = cp.Corpus.paths.output
            ensureDirExists(output_dir)
            report_path = os.path.join(output_dir, "report.txt")
            self.report_file = codecs.open(report_path, "w")

        self.citations_extra_info = []
        self.full_citation_id = ""
        self.run_parameters = defaultdict(lambda: None)
        self.csv_columns = [
            "query_id",
            "file_guid",
            "citation_id",
            "doc_position",
            "query_method",
            "doc_method",
            "precision_score",
            "rank",
            "mrr_score",
            "ndcg_score",
            "az",
            "csc_ctype",
            "match_guid",
            "first_result",
        ]

        self.dump_straight_to_disk = dump_straight_to_disk
        if dump_straight_to_disk:
            self.startResultsDump()
示例#6
0
    def loadListOrListAllFiles(self, inputdir, file_mask):
        """
            Returns the list of input files.

            The list is loaded from "all_input_files.txt" in the corpus
            fileDB directory when that yields a non-empty result; otherwise
            the input directory is listed with self.listAllFiles() and the
            result is saved to that file before being returned.
        """
        list_path = os.path.join(cp.Corpus.paths.fileDB,
                                 "all_input_files.txt")
        file_list = loadFileList(list_path)
        if file_list:
            return file_list

        print("Listing all files...")
        file_list = self.listAllFiles(inputdir, file_mask)
        ensureDirExists(cp.Corpus.paths.fileDB)
        saveFileList(file_list, list_path)
        return file_list
示例#7
0
 def __init__(self,
              result_storer,
              cache_dir,
              res_ids=None,
              max_results=sys.maxsize):
     """
         Initializes the parent class, then makes sure that both the cache
         directory and this object's own sub-directory (named after the
         result storer's table) exist.
     """
     # NOTE(review): super(self.__class__, self) recurses forever if this
     # class is ever subclassed; naming the class explicitly would avoid
     # that, but the class name is not visible from here.
     super(self.__class__, self).__init__(result_storer,
                                          res_ids=res_ids,
                                          max_results=max_results)
     self.cache_dir = cache_dir
     table_name = self.result_storer.table_name
     self.own_dir = os.path.join(cache_dir, table_name)
     for directory in (cache_dir, self.own_dir):
         ensureDirExists(directory)
示例#8
0
    def __init__(self,
                 table_name,
                 cache_dir,
                 res_ids=None,
                 max_results=sys.maxsize):
        """
            Stores the table name and result ids, makes sure the cache
            directory and the per-table sub-directory exist, then calls
            self.getResultList().

            max_results is accepted but not read by this method.
        """
        # A falsy res_ids (None, empty) is replaced by a fresh empty list
        self.res_ids = res_ids if res_ids else []

        self.cache_dir = cache_dir
        self.table_name = table_name
        self.own_dir = os.path.join(cache_dir, table_name)
        for directory in (cache_dir, self.own_dir):
            ensureDirExists(directory)
        self.getResultList()
示例#9
0
    def createIndexWriter(self, actual_dir, max_field_length=20000000):
        """
            Builds and returns an IndexWriter for actual_dir.

            The directory is created if needed. The writer is configured
            with a StandardAnalyzer, a FieldAgnosticSimilarity and the CREATE
            open mode, and deleteAll() is called on it before it is returned.

            max_field_length is accepted but not read by this method.
        """
        ensureDirExists(actual_dir)
        directory = SimpleFSDirectory(File(actual_dir))
        version = LuceneVersion.LUCENE_CURRENT

        config = IndexWriterConfig(version, StandardAnalyzer(version))
        config.setSimilarity(FieldAgnosticSimilarity())
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        writer = IndexWriter(directory, config)
        writer.deleteAll()
        return writer
示例#10
0
    def createDefaultDirs(self):
        """
            Makes sure every directory listed in self.paths exists.
        """
        for key in self.paths:
            directory = self.paths[key]
            ensureDirExists(directory)