Example #1
0
    def canonical(self):
        """Compute new and deleted canonical bibcodes between start and end."""
        start_file = Filename.get(self.start, FileType.CANONICAL)
        end_file = Filename.get(self.end, FileType.CANONICAL)

        # bibcodes present at end but not at start
        new_file = Filename.get(self.end, FileType.CANONICAL, FileAdjective.NEW)
        self.values['new_canonical'] = comm(end_file, start_file, new_file)

        # bibcodes present at start but not at end
        deleted_file = Filename.get(self.end, FileType.CANONICAL, FileAdjective.DELETED)
        self.values['deleted_canonical'] = comm(start_file, end_file, deleted_file)

        self.values['canonical'] = lines_in_file(end_file)
Example #2
0
    def fulltext(self):
        """Compute the new and deleted bibcodes for each type of error from
        todays list of bibcodes compared with yesterdays list. Results stored
        in variables that are then used in report.py."""
        for error in conf['FULLTEXT_ERRORS'].keys():
            # normalize the error message into a key/filename suffix
            err_msg = "_" + ("_".join(error.split())).replace('-', '_')
            msg = err_msg + "_"

            ft_start = Filename.get(self.start, FileType.FULLTEXT, adjective=None, msg=msg)
            ft_end = Filename.get(self.end, FileType.FULLTEXT, adjective=None, msg=msg)

            ft_new = Filename.get(self.end, FileType.FULLTEXT, adjective=FileAdjective.NEW, msg=msg)
            self.values['new_ft' + err_msg] = comm(ft_end, ft_start, ft_new)

            ft_deleted = Filename.get(self.end, FileType.FULLTEXT, FileAdjective.DELETED, msg=msg)
            self.values['deleted_ft' + err_msg] = comm(ft_start, ft_end, ft_deleted)

            self.values['ft' + err_msg] = lines_in_file(ft_end)
Example #3
0
 def canonical(self):
     """create local copy of canonical bibcodes"""
     source = conf['CANONICAL_FILE']
     target = Filename.get(self.date, FileType.CANONICAL)
     logger.info('making local copy of canonical bibcodes file, from %s to %s',
                 source, target)
     shutil.copy(source, target)
     sort(target)
Example #4
0
    def solr_bibcodes_finish(self, jobid):
        """Poll solr for completion of an earlier submitted batch job,
        then fetch the resulting bibcodes and write them, sorted, to
        today's SOLR file.

        :param jobid: id of the batch job returned when it was submitted
        :return: True on success; False on a bad HTTP status or if the
            job has not finished within two hours
        """
        url = conf.get('SOLR_URL', 'http://localhost:9983/solr/collection1/')
        status = 'batch?command=status&wt=json&jobid='
        get_results = 'batch?command=get-results&wt=json&jobid='
        # now we wait for solr to process batch query
        finished = False
        startTime = datetime.now()
        while not finished:
            rStatus = requests.get(url + status + jobid)
            if rStatus.status_code != 200:
                logger.error('batch status check failed, status: %s, text: %s',
                             rStatus.status_code, rStatus.text)
                return False
            j = rStatus.json()
            if j['job-status'] == 'finished':
                finished = True
            else:
                sleep(10)
            # give up after two hours of polling
            if (datetime.now() - startTime).total_seconds() > 3600 * 2:
                logger.error(
                    'solr batch process taking too long, seconds: %s;',
                    (datetime.now() - startTime).total_seconds())
                return False

        # fixed typo in log message: 'bacth' -> 'batch'
        logger.info(
            'solr batch completed in %s seconds, now fetching bibcodes',
            (datetime.now() - startTime).total_seconds())
        rResults = requests.get(url + get_results + jobid)
        if rResults.status_code != 200:
            logger.error(
                'failed to obtain bibcodes from solr batch query, status: %s, text: %s,',
                rResults.status_code, rResults.text)
            return False

        # finally save bibcodes to file
        bibs = rResults.text  # all 12 million bibcodes are in this one text field
        # strip the json-ish wrapper to plain strings; response includes
        # newlines between bibcodes, which are kept
        bibs = re.sub(r'{"bibcode":"|,|"}', '', bibs)
        filename = Filename.get(self.date, FileType.SOLR)
        with open(filename, 'w') as f:
            f.write(bibs)
        sort(filename)

        return True
Example #5
0
    def solr(self):
        """compute missing, deleted, new, extra"""
        end_solr = Filename.get(self.end, FileType.SOLR)
        end_canonical = Filename.get(self.end, FileType.CANONICAL)

        missing = Filename.get(self.end, FileType.SOLR, FileAdjective.MISSING)
        self.values['missing_solr'] = comm(end_canonical, end_solr, missing)

        start_solr = Filename.get(self.start, FileType.SOLR)
        new = Filename.get(self.end, FileType.SOLR, FileAdjective.NEW)
        self.values['new_solr'] = comm(end_solr, start_solr, new)

        deleted = Filename.get(self.end, FileType.SOLR, FileAdjective.DELETED)
        self.values['deleted_solr'] = comm(start_solr, end_solr, deleted)

        extra = Filename.get(self.end, FileType.SOLR, FileAdjective.EXTRA)
        self.values['extra_solr'] = comm(end_solr, end_canonical, extra)

        self.values['solr'] = lines_in_file(end_solr)
Example #6
0
    def fulltext(self):
        """Get errors from todays fulltext logs and generate a list for each
        type of error of corresponding bibcodes and source directories. These
        lists are written to files that are further processed in compute.py"""

        # types of errors with corresponding file names
        errors = conf['FULLTEXT_ERRORS']

        # get todays date
        now = datetime.strftime(datetime.now(), "%Y-%m-%d")

        # loop through types of errors messages
        for err_msg in errors.keys():

            bibs = []
            dirs = []

            # location of bibcode and directory in message field
            """example log:
            {"asctime": "2019-08-26T11:38:34.201Z", "msecs": 201.6739845275879,
            "levelname": "ERROR", "process": 13411, "threadName": "MainThread",
            "filename": "checker.py", "lineno": 238, "message": "Bibcode '2019arXiv190105463B'
            is linked to a non-existent file '/some/directory/filename.xml'",
            "timestamp": "2019-08-26T11:38:34.201Z", "hostname": "adsvm05"}"""
            # indices into the line once split on single quotes: the default
            # message layout puts the bibcode in field 1 and the path in field 3
            # (see example log above); other messages embed them elsewhere
            loc_bib = 1
            loc_dir = 3

            if (err_msg == "No such file or directory"):
                loc_bib = 3
                loc_dir = 11
            elif (err_msg == "format not currently supported for extraction"):
                loc_bib = 7
                loc_dir = 23

            # loop through files
            for name in glob.glob(errors[err_msg]):

                # awk selects lines containing the error message, todays date
                # and ERROR; NOTE(review): err_msg is interpolated into the awk
                # regex unescaped — assumes config messages contain no regex
                # metacharacters or quotes
                command = "awk -F\: '/" + err_msg + "/ && /" + now + "/ && /ERROR/ {print $0}' " + name
                args = shlex.split(command)

                x = Popen(args, stdout=PIPE, stderr=STDOUT)

                # get bibcodes/directories from todays errors
                # NOTE(review): splits communicate()[0] as str — Python 2
                # semantics; under Python 3 this would be bytes. Confirm runtime.
                resp = x.communicate()[0].split("\n")

                for r in resp:
                    if r:
                        # split on single quotes and pick out the bibcode and
                        # directory fields at the positions chosen above
                        r = r.split("'")
                        bibs.append(r[loc_bib])
                        dirs.append(r[loc_dir])

            # create filename based on error message and date
            fname = Filename.get(
                self.date,
                FileType.FULLTEXT,
                adjective=None,
                msg="_" + ("_".join(err_msg.split())).replace('-', '_') + "_")

            # write bibcodes and directories for each error type to file
            # as tab-separated bibcode/directory pairs
            with open(fname, 'w') as f:
                writer = csv.writer(f, delimiter='\t')
                writer.writerows(zip(bibs, dirs))

            sort(fname)
Example #7
0
File: run-bpr.py  Project: Dellen/poi
import poi
from utils import Filename
from utils import setup_log
from utils import save_model 

if __name__ == "__main__":
    mdname = "bpr"
    fn = Filename("foursquare")
    setup_log(fn.log(mdname))

    # fix: the original leaked the open file handles; close them once the
    # check-ins are loaded (assumes load_checkins reads eagerly — it is
    # consumed fully before training starts)
    with open(fn.train) as f:
        train_cks = poi.load_checkins(f)
    with open(fn.test) as f:
        test_cks = poi.load_checkins(f)

    eva = poi.Evaluation(test_cks, full=False)

    def hook(model):
        """Assess the model and checkpoint it after each training round."""
        eva.assess(model)
        save_model(model, "./model/model_%s_%i.pkl" % (mdname, model.current))

    mf = poi.BPR(
        train_cks,
        learn_rate=0.1,
        reg_user=0.08,
        reg_item=0.08,
    )
    mf.train(after=hook)

Example #8
0
import poi
from utils import Filename
from utils import setup_log
from utils import save_model

if __name__ == "__main__":
    mdname = "bpr"
    fn = Filename("foursquare")
    setup_log(fn.log(mdname))

    # fix: the original leaked the open file handles; close them once the
    # check-ins are loaded (assumes load_checkins reads eagerly — it is
    # consumed fully before training starts)
    with open(fn.train) as f:
        train_cks = poi.load_checkins(f)
    with open(fn.test) as f:
        test_cks = poi.load_checkins(f)

    eva = poi.Evaluation(test_cks, full=False)

    def hook(model):
        """Assess the model and checkpoint it after each training round."""
        eva.assess(model)
        save_model(model, "./model/model_%s_%i.pkl" % (mdname, model.current))

    mf = poi.BPR(
        train_cks,
        learn_rate=0.1,
        reg_user=0.08,
        reg_item=0.08,
    )
    mf.train(after=hook)