Example #1
 def testWebSiteInfoProvider(self):
     localConfig = readConfig('../config_local.yaml')
     testConfig = readConfig('../config_test.yaml')
     wsInfo = WebSiteInfoProvider(user_agent=localConfig['user_agent'],
                                  max_wait_time_secs=10,
                                  default_crawl_delay=3)
     '''
     Information I need:
     1. the last time I visited it
     2. canFetch, which must be exposed
     '''
     wsInfo.canFetchOrWait("https://es.wikipedia.org/wiki/Wikipedia:Portada")
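The note above asks for the last visit time and the robots.txt fetch decision to be exposed. Below is a minimal sketch of how a provider could do that, assuming Python's standard urllib.robotparser; the class and method bodies are hypothetical, not the project's actual WebSiteInfoProvider implementation.

import time
from urllib import robotparser
from urllib.parse import urlparse

class WebSiteInfoProviderSketch:
    # Hypothetical sketch; the real WebSiteInfoProvider API is not shown in this example.
    def __init__(self, user_agent, max_wait_time_secs=10, default_crawl_delay=3):
        self.user_agent = user_agent
        self.max_wait_time_secs = max_wait_time_secs
        self.default_crawl_delay = default_crawl_delay
        self.lastVisit = {}  # host -> epoch seconds of the last visit (point 1 of the note)
        self.robots = {}     # host -> parsed robots.txt

    def canFetch(self, url):
        # Expose the robots.txt decision directly (point 2 of the note).
        host = urlparse(url).netloc
        if host not in self.robots:
            rp = robotparser.RobotFileParser()
            rp.set_url('https://{}/robots.txt'.format(host))
            rp.read()
            self.robots[host] = rp
        return self.robots[host].can_fetch(self.user_agent, url)

    def canFetchOrWait(self, url):
        # Honour the crawl delay since the last visit, then record this visit.
        host = urlparse(url).netloc
        if not self.canFetch(url):
            return False
        delay = self.robots[host].crawl_delay(self.user_agent) or self.default_crawl_delay
        elapsed = time.time() - self.lastVisit.get(host, 0)
        if elapsed < delay:
            time.sleep(min(delay - elapsed, self.max_wait_time_secs))
        self.lastVisit[host] = time.time()
        return True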
Example #2
 def testSentenceProcessor(self):
     localConfig = readConfig('../config_local.yaml')
     testConfig = readConfig('../config_test.yaml')
     db = getServer(localConfig)[localConfig["db"]["db"]]
     sp = SentenceProcessorCouch(db)
     sp.addSentence('https://es.wikipedia.org', 'This is a test',
                    ['this', 'is', 'a', 'test'])
     sp.addSentence('https://es.wikipedia.org', 'This is another test',
                    ['this', 'is', 'another', 'test'])
     self.assertTrue(
         sp.isSentence('https://es.wikipedia.org', 'This is a test'))
     self.assertTrue(
         sp.isSentence('https://es.wikipedia.org', 'This is another test'))
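For context, a plausible way addSentence/isSentence could be backed by CouchDB is a deterministic document id per (source, sentence) pair, which makes isSentence a single key lookup instead of a view query. A sketch under that assumption, using the couchdb package; the real SentenceProcessorCouch document layout is not shown here.

import hashlib

class SentenceProcessorCouchSketch:
    # Hypothetical storage scheme; the actual document layout is an assumption.
    def __init__(self, db):
        self.db = db  # a couchdb.Database

    def _docId(self, source, sentence):
        # Deterministic id, so membership tests need no view query.
        return hashlib.sha1('{}|{}'.format(source, sentence).encode('utf-8')).hexdigest()

    def addSentence(self, source, sentence, words):
        docId = self._docId(source, sentence)
        if docId not in self.db:
            self.db[docId] = {'source': source, 'sentence': sentence, 'words': words}

    def isSentence(self, source, sentence):
        return self._docId(source, sentence) in self.db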
Example #3
 def readConfig(self):
     for name, value in common.readConfig(Hud.config_name):
         if name == 'background':
             self.backgroundColor = common.hex_to_rgba(value)
         elif name == 'foreground':
             self.foregroundColor = common.hex_to_rgb(value)
         elif name == 'highlight':
             self.highlightColor = common.hex_to_rgb(value)
         else:
             raise Exception('Unknown config entry: {}'.format(name))
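The loop above consumes (name, value) pairs from common.readConfig, so the underlying file presumably maps those three names to hex colours. A file it could parse might look like the lines below (the exact format and values are assumptions; note background goes through hex_to_rgba, so it carries an alpha channel):

background #202020ff
foreground #d0d0d0
highlight #ffcc00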
Example #4
parser = argparse.ArgumentParser(
    description='Consume inbound mails, generating opens, clicks, OOBs and FBLs. '
    'Config file {} must be present in the current directory.'.format(configFileName()))
parser.add_argument(
    'directory',
    type=str,
    help='directory to ingest .msg files, process and delete them',
)
parser.add_argument(
    '-f',
    action='store_true',
    help='Keep looking for new files forever (like tail -f does)')
args = parser.parse_args()

cfg = readConfig(configFileName())
logger = createLogger(cfg['Sink'].get('Logfile', baseProgName() + '.log'),
                      cfg['Sink'].getint('Logfile_backupCount', 10))

if args.directory:
    if args.f:
        # Process the inbound directory forever
        while True:
            fnameList = glob.glob(os.path.join(args.directory, '*.msg'))
            if fnameList:
                consumeFiles(logger, fnameList, cfg)
            time.sleep(5)
            cfg = readConfig(configFileName())  # re-read config in case it changed
    else:
        # Process the inbound directory once, then exit
        fnameList = glob.glob(os.path.join(args.directory, '*.msg'))
        if fnameList:
            consumeFiles(logger, fnameList, cfg)
Example #5
 def setUp(self):
     self.localConfig = readConfig('../config_local.yaml')
     self.testConfig = readConfig('../config_test.yaml')
Example #6
    year, week, day = t.isocalendar()  # ISO year, week number, weekday (Monday=1)
    odd_week_offset = (week % nweeks) * 7  # offset of this week within the n-week cycle
    i = day - 1 + odd_week_offset  # zero-based day index into the cycle list d
    return d[i], i


# -----------------------------------------------------------------------------------------
# Main code
# -----------------------------------------------------------------------------------------
# This script should be run once per hour from crontab
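# For reference, an hourly crontab entry would look like the line below
# (the install path and script name are assumptions, not taken from this snippet):
#   0 * * * * cd /opt/consume-mail && python3 update-bounce-rate.py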

if __name__ == "__main__":
    logger = createLogger(baseProgName() + '.log', 10)
    try:
        t = datetime.utcnow()
        cfg = readConfig('consume-mail.ini')
        weekly_cycle_bounce_rate = cfg.get('Weekly_Cycle_Bounce_Rate', '3').split(',')
        today_bounce_rate, x = nWeeklyCycle(weekly_cycle_bounce_rate, t)
        logger.info(
            'Today is day {} (zero based) in the {}-day cycle. Bounce rate will be {}%'
            .format(x, len(weekly_cycle_bounce_rate), today_bounce_rate))
        filename = '/etc/pmta/config'
        logger.info('Changing line of file {}'.format(filename))
        with open(filename, "r+") as f:
            pmta_cfg = f.readlines()
            pmta_cfg_bounce_param = 'dummy-smtp-blacklist-bounce-percent'
            for i, s in enumerate(pmta_cfg):
                if pmta_cfg_bounce_param in s:
                    # Format arguments inferred from the log message above
                    pmta_cfg[i] = '{} {}\t# Updated by script {} on {} UTC\n'.format(
                        pmta_cfg_bounce_param, today_bounce_rate, baseProgName(), t)
Example #7
 def setUp(self):
     self.localConfig = readConfig('../config_local.yaml')
     self.testConfig = readConfig('../config_test.yaml')
     self.db = getServer(self.localConfig)[self.localConfig["db"]["db"]]
     self.up = UrlProcessorCouch(self.db)
Example #8
    data['sentences'] = []
    for sentenceId in wordData['sentences']:
        sentenceData = db[sentenceId]
        data['sentences'].append({
            'sentence': sentenceData['sentence'],
            'source': sentenceData['source'],
            'date': sentenceData['date']
        })
    file.write(json.dumps(data) + "\n")


if __name__ == '__main__':
    print("corpus extractor v.1.0")
    logging.info("corpus extractor v.1.0")
    (localConfigFile, configFile, loggingFile) = parseArguments(sys.argv[1:])
    setupLogger(loggingFile)
    localConfig = readConfig(localConfigFile)
    server = getServer(localConfig)
    db = getDatabaseConnection(server, localConfig['db']['db'])
    if 'sentence_threshold' in localConfig:
        sentenceThreshold = localConfig['sentence_threshold']
    else:
        sentenceThreshold = getMinimumSentenceThreshold(
            db, 'all_words/sentences_length', threshold=95)
    logging.info(
        f"Processing word entries with at least {sentenceThreshold} sentences; "
        f"discarded words will be written to {DISCARDED_WORDS_TXT}")
    processEntries(db, localConfig['corpus_result_dir'], sentenceThreshold)
    print("Finished")
    logging.info("finished")
Example #9
 def readConfig(self):
     for name, command in common.readConfig(Aliases.file_name):
         self.items.append((name, command))
Example #10
            urls_to_visit.append(child.attrs['href'])
    return urls_to_visit


def set_urls_as_not_visited(db: couchdb.Database, not_visited_view):
    # Walk the view in batches of 100 and clear the visited flag on each URL doc.
    for url in db.iterview(not_visited_view, 100):
        urlDoc = db[url.id]
        urlDoc['visited'] = False
        db.save(urlDoc)


if __name__ == '__main__':
    print("webcrawler v.1.0")
    (local_config_file, config_file, logging_file) = parseArguments(sys.argv[1:])
    local_config = readConfig(local_config_file)
    setupLogger(logging_file)
    server = getServer(local_config)
    db = server[local_config["db"]["db"]]
    sp = SentenceProcessorCouch(db)
    up = UrlProcessorCouch(db)
    config = readConfig(config_file)  # config_file is a path; read it before indexing
    exclusions = ExclusionRules(config['exclusion_rules'])
    webSiteInfoProvider = WebSiteInfoProvider(local_config['user_agent'],
                                              max_wait_time_secs=10,
                                              default_crawl_delay=5)
    engine = Engine(local_config['working_hours'], local_config['max_jobs'],
                    local_config['user_agent'],
                    UrlsProviderReal(db, "urls/not_visited"), sp, up,
                    webSiteInfoProvider, exclusions)
    engine.start()
    print("finished")