def _main(arguments=sys.argv):
    # if version is specified ignore any other arguments
    if '--version' in arguments or '-v' in arguments:
        return make_exit(message="hepdata-converter version: %s" % version.__version__)

    parser = argparse.ArgumentParser(description="CLI tools for converting between HEP data formats",
                                     add_help=True,
                                     formatter_class=argparse.RawTextHelpFormatter,
                                     epilog=generate_help_epilogue())
    parser.add_argument("--input-format", '-i', action='store', default='yaml',
                        help='format of the input file/s (default: yaml) [choose one option from Parsers section below]')
    parser.add_argument("--output-format", '-o', action='store', default='yaml',
                        help='format of the output file/s (default: yaml) [choose one option from Writers section below]')
    parser.add_argument("--version", '-v', action='store_const', const=True, default=False,
                        help='Show hepdata-converter version')
    parser.add_argument("--hepdata-doi", '-d', action='store', default='',
                        help='Pass HEPData DOI, e.g. "10.17182/hepdata.74247.v1"')
    parser.add_argument("input")
    parser.add_argument("output")

    if arguments == sys.argv:
        arguments = sys.argv[1:]

    program_args = vars(parser.parse_known_args(arguments)[0])
    input_format = program_args['input_format']
    output_format = program_args['output_format']

    Parser.get_concrete_class(input_format).register_cli_options(parser)
    Writer.get_concrete_class(output_format).register_cli_options(parser)

    # reparse arguments, now with added options from concrete parsers / writers
    program_args = vars(parser.parse_args(arguments))

    try:
        convert(program_args['input'], program_args['output'], program_args)
        return make_exit()
    except ValueError as e:
        return make_exit(message="Options error: %s" % str(e), code=1)
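# A minimal sketch of driving _main() from another script rather than the shell.
# The format names and file paths below are hypothetical examples (the oldhepdata
# and yaml formats are mentioned by convert() further down this section); _main()
# accepts any argv-style list and falls back to sys.argv[1:] when given none.
if __name__ == '__main__':
    sys.exit(_main(['--input-format', 'oldhepdata',
                    '--output-format', 'yaml',
                    'sample.oldhepdata', 'converted/']))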
def test_parse_index_row_returns_dict(self):
    p = Parser()
    elem = make_tree(SAMPLE_ROW)
    result = p.parse_index_row(elem)
    self.assertEqual(type(result), dict)
def __init__(self):
    self.r_helper = redis_helper()
    self.g = Graph(host="127.0.0.1", http_port=7474, user="******", password="******")
    self.num_limit = 20
    self.rela2Entity = {
        "rel_clauses": "Clauses",
        "rel_scope": "Scope",
        "rel_type": "Insur_type"
    }
    self.parse = Parser()
    self.conf = configparser.ConfigParser()
    self.conf.read(cur_dir + '/../config/kg_bot.conf')
    self.form_reg = re.compile(self.conf.get("askProperty", "form_reg"))
    self.term_reg = re.compile(self.conf.get("askProperty", "term_reg"))
    self.price_reg = re.compile(self.conf.get("askProperty", "price_reg"))
    self.sale_reg = re.compile(self.conf.get("askProperty", "sale_reg"))
    self.crowd_reg = re.compile(self.conf.get("askProperty", "crowd_reg"))
    self.url_reg = re.compile(self.conf.get("askProperty", "url_reg"))
    self.amount_reg = re.compile(self.conf.get("askProperty", "amount_reg"))
    self.info_reg = re.compile(self.conf.get("askProperty", "info_reg"))
    self.clause_reg = re.compile(self.conf.get("askEntity", "clause_reg"))
    self.scope_reg = re.compile(self.conf.get("askEntity", "scope_reg"))
    self.type_reg = re.compile(self.conf.get("askEntity", "type_reg"))
    self.entity_reg = re.compile(self.conf.get("askEntity", "entity_reg"))
    self.rela_reg = re.compile(self.conf.get("askRela", "rela_reg"))
def parser():
    global a_date
    parsed_tasks_links = []
    Parser.get_gdoc_config('1VGObmBB7RvgBtBUGW7lXVPvm6_m96BJpjFIH_qkZGBM')
    while True:
        a_date = datetime.date.today()
        set_file_logger()
        parsed_tasks = []
        for batch in Parser.parse_all():
            parsed_tasks.extend(batch)
        new_tasks = [task for task in parsed_tasks
                     if task['link'] not in parsed_tasks_links
                     and not db_handler.check_task_link(task['link'])]
        logger.debug(f"New tasks {[task['link'] for task in new_tasks]}")
        for task in new_tasks:
            task = format_task(task)
            print(f"{', '.join([task['title'], task['price'], task['currency'], task['price_format']])}")
            logger.debug(f"Sending task {task['link']} to db")
            db_handler.add_task(task)
        tasks_sender(new_tasks)
        parsed_tasks_links = [task['link'] for task in parsed_tasks]
        time.sleep(5)
def test_parse_index_table_returns_all_rows(self):
    row = '<tr class="tCenter hl-tr"></tr>'
    html = '<table id="tor-tbl">' + row * 10 + '</table>'
    p = Parser()
    rows = p.parse_index_table(html)
    self.assertEqual(len(rows), 10)
def parse():
    # parse gathered data and save as csv
    logger.info("parse")
    storage = Persistor()
    parser = Parser()
    raw_data = storage.read_raw_data()
    parsed_files = parser.parse_object(raw_data)
    storage.save_csv(parsed_files)
def test_pid(self):
    p = Parser()
    res = p.parse(self.f, None, False)
    for pid, trace in res.iteritems():
        for timestamp, data in trace.iteritems():
            self.assertTrue(pid == data['pid'])
            self.assertTrue(timestamp == data['timestamp'])
            for item in ('path', 'timestamp', 'type'):
                self.assertTrue(item in data)
def test_main(self):
    p = Parser()
    res = p.parse(self.f, None, False)
    self.assertTrue(len(res) == 6)
    for e in res:
        self.assertTrue(isinstance(e, dict))
        self.assertTrue(isinstance(e['path'], basestring))
        self.assertTrue(isinstance(e['version'], basestring))
def run(self):
    parser = Parser()
    while True:
        sourceStr = raw_input("Enter an infix expression: ")
        if sourceStr == "":
            break
        try:
            parser.parse(sourceStr)
            print parser.parseStatus()
        except Exception, e:
            print "Error:"
            print e
def run(self):
    parser = Parser()
    while True:
        sourceStr = input("Enter an arithmetic expression or just enter to quit: ")
        if sourceStr == "":
            break
        try:
            parser.parse(sourceStr)
            print(parser.parseStatus())
            print(parser.tree)
        except Exception as e:
            print("Error:")
            print(e)
def identifyCorpus(corpus, x=-1):
    """
    Update the corpus with MWE dictionaries (type, count, tokens), then train,
    predict and evaluate on it.
    """
    print(XPParams.use_extern_labels)
    if XPParams.use_extern_labels:
        Parser.parse(corpus, "")                # prediction
        scores = Evaluation.evaluate(corpus)    # evaluation
    else:
        corpus.update()
        clf = EmbeddingOracle.train(corpus, x)  # training
        Parser.parse(corpus, clf)               # prediction
        scores = Evaluation.evaluate(corpus)    # evaluation
    return scores
def run(self): parser = Parser() while True: sourceStr = input("Enter an infix expression: ") if sourceStr == "": break try: tree = parser.parse(sourceStr) print("Prefix:", tree.prefix()) print("Infix:", tree.infix()) print("Postfix:", tree.postfix()) print("Value:", tree.value()) except Exception as e: print("Error:") print(e)
def test_get_parser(self):
    p = Parser()
    fun = p.get_parser()
    self.assertTrue(fun == None)
    for ext, klass in PARSERS_CONFIG.iteritems():
        p = Parser()
        p.ext = ext
        fun = p.get_parser()
        self.assertTrue(fun != None)
        fun_class = load_by_name(klass)
        self.assertTrue(isinstance(fun, fun_class))
def main():
    parser = Parser(DATASET)

    src_prep = SrcPreprocessing(parser.src_parser())
    src_prep.preprocess()
    with open(DATASET.root / 'preprocessed_src.pickle', 'wb') as file:
        pickle.dump(src_prep.src_files, file, protocol=pickle.HIGHEST_PROTOCOL)

    report_prep = ReportPreprocessing(parser.report_parser())
    report_prep.preprocess()
    with open(DATASET.root / 'preprocessed_reports.pickle', 'wb') as file:
        pickle.dump(report_prep.bug_reports, file, protocol=pickle.HIGHEST_PROTOCOL)
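# A short companion sketch for reading the artifacts back in a later stage,
# assuming the same DATASET object; pickle.load() restores the src_files and
# bug_reports structures that main() serialized above.
with open(DATASET.root / 'preprocessed_src.pickle', 'rb') as file:
    src_files = pickle.load(file)
with open(DATASET.root / 'preprocessed_reports.pickle', 'rb') as file:
    bug_reports = pickle.load(file)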
def test_main(self):
    p = Parser()
    res = p.parse(self.f, None, False)
    self.assertTrue(len(res) == 9)
    for grp, data in res.iteritems():
        self.assertTrue(isinstance(data, list))
        for symbol in data:
            self.assertTrue(isinstance(symbol, dict))
            self.assertTrue(isinstance(symbol['name'], basestring))
            self.assertTrue(isinstance(symbol['offset'], basestring) or
                            isinstance(symbol['offset'], int))
class Interpreter(object):
    def __init__(self, w_m, profile):
        self.parser = Parser()
        self.profile = profile
        self.w_m = w_m
        self.k_b = KnowledgeBase()

    def interpret(self, stimulus):
        self.w_m.push_input()
        parsed_data = self.parser.parse(stimulus)
        classes = []
        for wrd in parsed_data:
            matched = self.k_b.match(wrd)
            if matched:
                classes.append(matched[0])
        for cls in classes:
            word_class = cls.get_word_class()
            if word_class == 'noun':
                self.w_m.percept.nouns.append(cls)
            elif word_class == 'verb':
                self.w_m.percept.verbs.append(cls)
            elif word_class == 'adjective':
                self.w_m.percept.adjectivs.append(cls)
            elif word_class == 'adverb':
                self.w_m.percept.adverbs.append(cls)
def __init__(self, url, raw_html, step, lang="en"): self.status = True self.url = url self.step = step self.lang = lang # title of the article self.title = None #text self.article = u"" self.cleaned_text = u"" # meta self.meta_description = u"" self.meta_lang = u"" self.meta_favicon = u"" self.meta_keywords = u"" #link and domain self.canonical_link = u"" self.domain = u"" # cleaned text self.top_node = None self.tags = set() self.final_url = url self.raw_html = raw_html # the lxml Document object self.parser = Parser() self.raw_doc = u"" self.publish_date = None self.additional_data = {} self.links = [] self.outlinks = [] self.inlinks = [] self.start_date = datetime.datetime.today()
def parse(self, path):
    parser = Parser.get(path)
    self.search_method = parser.search_method
    # todo: fix
    row = parser.next()
    while row:
        self.rows.append(row)
        row = parser.next()
def __init__(self, log, conf, name):
    Parser.__init__(self, log)
    self.name = name
    self.conf = conf
    self.sources = {}
    self.matches = 0
    if 'sources' not in self.conf:
        raise Exception('Invalid configuration for ' + self.name + ' match')
    if 'workers' not in self.conf:
        raise Exception('Field workers not specified in ' + self.name + ' match')
    if type(self.conf['workers']) != int:
        self.log.error('Invalid value for field workers in ' + self.name + ' match, setting to 1')
        self.conf['workers'] = 1
    if self.conf['workers'] < 1:
        self.log.error('Invalid value for field workers in ' + self.name + ' match, setting to 1')
        self.conf['workers'] = 1
    for s in self.conf['sources'].keys():
        self.add_source(s, self.conf['sources'][s])
class Crawler(object):
    def __init__(self):
        self.pres = Parser()
        self.proc = Processor()

    def start(self):
        """
        Start the crawler.
        :param urls: seed URLs
        :return: number of URLs crawled
        """
        self.getHy()
        self.getStockLHRank(2)

    # fetch the stocks bought most heavily on Eastmoney (1-hour window)
    def getStockLHRank(self, timePeriod):
        root_urls = 'https://simqry2.eastmoney.com/qry_tzzh_v2?type=spo_rank_tiger&plat=2&ver=web20&rankType=30001&timePeriod=' + str(timePeriod) + '&recIdx=1&recCnt=50'
        html = download(root_urls, 'utf8')
        print html
        data = self.pres.StockLHRank(html)
        self.proc.StockLHRank(data)
        #print data
        #print data[0]
        #https://simqry2.eastmoney.com/qry_tzzh_v2?type=spo_rank_tiger&plat=2&ver=web20&rankType=30001&timePeriod=3&recIdx=1&recCnt=50

    # fetch sector capital-inflow data
    def getHy(self):
        for p in range(1, 3):
            root_urls = 'http://nufm.dfcfw.com/EM_Finance2014NumericApplication/JS.aspx?cmd=C._BKHY&type=ct&st=BalFlowMain&sr=-1&p=' + str(p) + '&token=894050c76af8597a853f5b408b759f5d&sty=DCFFITABK&rt=51115543'
            html = download(root_urls, 'utf8')
            '''
            html = re.findall(r'[(](.*?)[)]', html)
            jsonObj = json.loads(html[0])
            print jsonObj[0].split(',')[14]
            '''
            data = self.pres.easey(html)
            self.proc.hyDb(data)
def worker(address, host):
    parser = Parser.new()
    parser.global_set_address(address)
    parser.global_set_host(host)
    if config["reporting"]["timestamp"] == "true":
        parser.global_set_timestamp()
    parser.global_pair("host", host["name"])

    # Identify all modules
    modules = [os.path.splitext(f)[0] for f in os.listdir(MODULES_PATH)
               if os.path.splitext(f)[1] == ".py" and f != "__init__.py"]

    # Worker function to run module
    module_threads = []

    def module_worker(module):
        mod = import_module("modules." + module)
        mod.run(parser)

    for module in modules:
        # Check if module is activated in host settings
        if module in host["modules"]:
            # Use host-specific settings
            setting_enabled = host["modules"][module][0]
            setting_tick = host["modules"][module][1]
        else:
            # Use global settings
            try:
                setting_enabled = config["global"]["modules"][module][0]
                setting_tick = config["global"]["modules"][module][1]
            except:
                continue  # TODO, add warning?

        # Check if module is active in global and in host settings
        if setting_enabled == "false" or ((tick % setting_tick) != 0):
            continue

        # Start thread
        thread = Thread(target=module_worker, args=(module,))
        thread.start()
        module_threads.append(thread)

    # Wait for all module threads to complete
    for thread in module_threads:
        thread.join()

    results.append(parser.get())
def test_parse(self):
    for f in self.files:
        p = Parser()
        res = p.parse(f, None, False)
        self.assertTrue(isinstance(res, dict) or
                        isinstance(res, list) or
                        res is None)
        res = p.parse(f, None, True)
        self.assertTrue(isinstance(res, tuple))
        self.assertTrue(len(res) == 2)
        res = p.parse(f, None, False)
        def_parser = PARSERS_CONFIG.get(p.ext, None)
        cond = (def_parser != None and res != {}) or \
               (def_parser == None and res == {})
        self.assertTrue(cond)
def __init__(self):
    super().__init__(parent=None, title='ChoseFile', size=(640, 535),
                     style=wx.SYSTEM_MENU | wx.CAPTION | wx.CLOSE_BOX | wx.CLIP_CHILDREN)
    # ui
    self.init_ui()
    # config
    self.setting = Setting()
    # default
    self.init_default(self.setting)
    # logger
    self.logger = Logger(self.console_text)
    # parser
    self.parser = Parser(self.logger)
    # processor
    self.processor = Processor(self.logger, self.setting)
def run():
    # add the project directory to the import path
    sys.path.append(settings.BASE_DIR)
    # connect to Redis and instantiate the spider, parser and checker
    redis_cli = redis.Redis(**settings.REDIS_PARAM)
    spider = Spider()
    parser = Parser(spider.queue, redis_cli)
    checker = Checker(redis_cli)
    # create and start the threads
    thread_list = [
        Thread(target=spider.run),
        Thread(target=parser.run),
        Thread(target=checker.run)
    ]
    for thread in thread_list:
        thread.start()
def generate_help_epilogue():
    margin = ' '
    r = 'Parsers:\n'
    r += '[use them as --input-format parameter]\n'
    r += '\n'
    for cls in Parser.get_all_subclasses():
        r += cls.get_help(margin)
    r += '\nWriters:\n'
    r += '[use them as --output-format parameter]\n'
    r += '\n'
    for cls in Writer.get_all_subclasses():
        r += cls.get_help(margin)
    return r
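# generate_help_epilogue() relies on Parser and Writer exposing a subclass
# registry. A minimal sketch of how such a registry could be built; everything
# here is an assumption for illustration, not the actual hepdata-converter code.
class Registry(object):
    @classmethod
    def get_all_subclasses(cls):
        # walk direct and nested subclasses depth-first
        for sub in cls.__subclasses__():
            yield sub
            for nested in sub.get_all_subclasses():
                yield nested

    @classmethod
    def get_concrete_class(cls, name):
        # resolve a format name like 'yaml' to its implementing class
        for sub in cls.get_all_subclasses():
            if getattr(sub, 'name', sub.__name__.lower()) == name:
                return sub
        raise ValueError("unknown format: %s" % name)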
def parse_doc_section(section, dom):
    parser = Parser(dom)
    unhandled_count = 0
    handled_count = 0
    for para in section["paragraphs"]:
        para['text'] = _para_text_content(para)
        if not para['text']:
            continue
        success = parser(para)
        if not success and para['text']:
            unhandled_count += 1
            print('unhandled para {}:'.format(para['index']), para, '\n', file=sys.stderr)
        elif success:
            handled_count += 1
    print('handled paras: {}'.format(handled_count), file=sys.stderr)
    print('unhandled paras: {}'.format(unhandled_count), file=sys.stderr)
def parse_doc_section(section, dom):
    def prep_para(para):
        para['text'] = _para_text_content(para)

        def next_para():
            paras = section['paragraphs']
            next_index = para['index'] - paras[0]['index'] + 1
            if next_index >= len(paras):
                return None
            next_p = prep_para(copy.deepcopy(paras[next_index]))
            if matchers.empty(next_p):
                next_p = next_p['next']()
            return next_p

        para['next'] = next_para
        return para

    parser = Parser(dom)
    unhandled_count = 0
    handled_count = 0
    for para in section["paragraphs"]:
        prep_para(para)
        if not para['text']:
            continue
        success = parser(para)
        if not success and para['text']:
            unhandled_count += 1
            print('unhandled para {}:'.format(para['index']), para, '\n', file=sys.stderr)
        elif success:
            handled_count += 1
    print('handled paras: {}'.format(handled_count), file=sys.stderr)
    print('unhandled paras: {}'.format(unhandled_count), file=sys.stderr)
def convert(input, output=None, options={}):
    """Converts a supported ``input_format`` (*oldhepdata*, *yaml*) to a supported
    ``output_format`` (*csv*, *root*, *yaml*, *yoda*).

    :param input: location of input file for *oldhepdata* format or input directory for *yaml* format
    :param output: location of output directory to which converted files will be written
    :param options: additional options such as ``input_format`` and ``output_format`` used for conversion
    :type input: str
    :type output: str
    :type options: dict
    :raise ValueError: raised if no ``input_format`` or ``output_format`` is specified
    """
    if 'input_format' not in options and 'output_format' not in options:
        raise ValueError("no input_format and output_format specified!")

    input_format = options.get('input_format', 'yaml')
    output_format = options.get('output_format', 'yaml')

    parser = Parser.get_concrete_class(input_format)(**options)
    writer = Writer.get_concrete_class(output_format)(**options)

    if not output and not writer.single_file_output:
        raise ValueError("this output_format requires specifying 'output' argument")

    # if no output was specified create proxy output to which writer can insert data
    _output = output
    if not _output:
        _output = StringIO.StringIO()

    writer.write(parser.parse(input), _output)

    # if no output was specified return output
    if not output:
        return _output.getvalue()
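# A usage sketch for convert(); the paths are hypothetical. When 'output' is
# omitted and the chosen writer supports single-file output, the converted
# document comes back as the return value instead of being written to disk.
yaml_text = convert('submission.oldhepdata',
                    options={'input_format': 'oldhepdata',
                             'output_format': 'yaml'})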
def __init__(self, filename): Parser.__init__(self, filename, ".date.csv")
def test_trace(self):
    p = Parser()
    res = p.parse(self.f, None, False)
    self.assertTrue(res['trace'] is not None)
    self.assertTrue(len(res['trace']) == 268)
log.info('Application logger initialized')

# Cache registration with Flask app
cache.init_app(app)
log.info('Cache initialized')

# Database registration with Flask app
db.app = app
db.init_app(app)
log.info('DB initialized')

# Parser initialization
parser = Parser()
counter = 0
log.info('Parser initialized')

# DB structure initialization and parser first run
with app.app_context():
    db.create_all()
    # running parser on first start of app
    if len(Rate.query.all()) == 0:
        parser.get_rates()

# Scheduled job
def run_schedule():
    global counter
def __init__(self, filename): Parser.__init__(self, filename, ".withdrawal.csv")
'''main.py main method.'''
from parsers import Parser

if __name__ == '__main__':
    parser = Parser('fallbackdfa.in', 'Lab2.in')
    dfa = parser.parse_dfa()
    test_outputs = []
    test_inputs = parser.parse_test_inputs()
    for test_input_name, test_input_data in test_inputs:
        print('--', test_input_name)
        test_output = dfa.run(test_input_data)
        test_outputs.append((test_input_name.replace('in', 'out'), test_output))
    with open('Lab2-2.out', 'w') as f:
        for test_output_name, test_output_data in test_outputs:
            # print('---', test_output_name)
            # print(test_output_data)
            formatted_test_output = ['%s,%s' % (state.lexical_category_name, lexeme)
                                     for state, lexeme in test_output_data]
            if formatted_test_output:
                formatted_test_output = '%s\n%s' % (test_output_name, ' '.join(formatted_test_output))
            else:
                formatted_test_output = test_output_name + "\nError: Input lexemes don't match the language!"
            f.write(formatted_test_output)
            f.write('\n')
def __init__(self, filename): Parser.__init__(self, filename, ".deposit.csv")
def __init__(self, filename): Parser.__init__(self, filename, ".balance.csv")
def run(self):  # it is necessary to get the qprocess because we need to send it back to the scheduler when we're done importing
    try:
        self.importProgressWidget.show()
        session = self.db.session()
        self.tsLog("Parsing nmap xml file: " + self.filename)
        starttime = time()

        try:
            parser = Parser(self.filename)
        except:
            self.tsLog('Giving up on import due to previous errors.')
            self.tsLog("Unexpected error: {0}".format(sys.exc_info()[0]))
            self.done.emit()
            return

        self.db.dbsemaphore.acquire()  # ensure that while this thread is running, no one else can write to the DB

        s = parser.get_session()  # nmap session info
        if s:
            n = nmap_session(self.filename, s.start_time, s.finish_time, s.nmap_version,
                             s.scan_args, s.total_hosts, s.up_hosts, s.down_hosts)
            session.add(n)

        hostCount = len(parser.all_hosts())
        if hostCount == 0:  # to fix a division by zero if we ran nmap on one host
            hostCount = 1
        totalprogress = 0
        self.importProgressWidget.setProgress(int(totalprogress))
        self.importProgressWidget.show()

        createProgress = 0
        createOsNodesProgress = 0
        createPortsProgress = 0

        for h in parser.all_hosts():  # create all the hosts that need to be created
            db_host = session.query(nmap_host).filter_by(ip=h.ip).first()
            if not db_host:  # if host doesn't exist in DB, create it first
                hid = nmap_host(os_match='', os_accuracy='', ip=h.ip, ipv4=h.ipv4, ipv6=h.ipv6,
                                macaddr=h.macaddr, status=h.status, hostname=h.hostname,
                                vendor=h.vendor, uptime=h.uptime, lastboot=h.lastboot,
                                distance=h.distance, state=h.state, count=h.count)
                self.tsLog("Adding db_host")
                session.add(hid)
                t_note = note(h.ip, 'Added by nmap')
                session.add(t_note)
            else:
                self.tsLog("Found db_host already in db")
            createProgress = createProgress + ((100.0 / hostCount) / 5)
            totalprogress = totalprogress + createProgress
            self.importProgressWidget.setProgress(int(totalprogress))
            self.importProgressWidget.show()

        session.commit()

        for h in parser.all_hosts():  # create all OS, service and port objects that need to be created
            self.tsLog("Processing h {ip}".format(ip=h.ip))
            db_host = session.query(nmap_host).filter_by(ip=h.ip).first()
            if db_host:
                self.tsLog("Found db_host during os/ports/service processing")
            else:
                self.log("Did not find db_host during os/ports/service processing")

            os_nodes = h.get_OS()  # parse and store all the OS nodes
            self.tsLog("'os_nodes' to process: {os_nodes}".format(os_nodes=str(len(os_nodes))))
            for os in os_nodes:
                self.tsLog("Processing os obj {os}".format(os=str(os.name)))
                db_os = session.query(nmap_os).filter_by(host_id=db_host.id).filter_by(name=os.name) \
                    .filter_by(family=os.family).filter_by(generation=os.generation) \
                    .filter_by(os_type=os.os_type).filter_by(vendor=os.vendor).first()
                if not db_os:
                    t_nmap_os = nmap_os(os.name, os.family, os.generation, os.os_type,
                                        os.vendor, os.accuracy, db_host.id)
                    session.add(t_nmap_os)

            createOsNodesProgress = createOsNodesProgress + ((100.0 / hostCount) / 5)
            totalprogress = totalprogress + createOsNodesProgress
            self.importProgressWidget.setProgress(int(totalprogress))
            self.importProgressWidget.show()

            session.commit()

            all_ports = h.all_ports()
            self.tsLog("'ports' to process: {all_ports}".format(all_ports=str(len(all_ports))))
            for p in all_ports:  # parse the ports
                self.tsLog("Processing port obj {port}".format(port=str(p.portId)))
                s = p.get_service()
                if not (s is None):  # check if service already exists to avoid adding duplicates
                    #db_service = session.query(nmap_service).filter_by(name=s.name).filter_by(product=s.product).filter_by(version=s.version).filter_by(extrainfo=s.extrainfo).filter_by(fingerprint=s.fingerprint).first()
                    db_service = session.query(nmap_service).filter_by(name=s.name).first()
                    if not db_service:
                        db_service = nmap_service(s.name, s.product, s.version, s.extrainfo, s.fingerprint)
                        session.add(db_service)
                else:  # else, there is no service info to parse
                    db_service = None
                # fetch the port
                db_port = session.query(nmap_port).filter_by(host_id=db_host.id) \
                    .filter_by(port_id=p.portId).filter_by(protocol=p.protocol).first()
                if not db_port:
                    if db_service:
                        db_port = nmap_port(p.portId, p.protocol, p.state, db_host.id, db_service.id)
                    else:
                        db_port = nmap_port(p.portId, p.protocol, p.state, db_host.id, '')
                    session.add(db_port)

            createPortsProgress = createPortsProgress + ((100.0 / hostCount) / 5)
            totalprogress = totalprogress + createPortsProgress
            self.importProgressWidget.setProgress(totalprogress)
            self.importProgressWidget.show()

        session.commit()

        for h in parser.all_hosts():  # create all script objects that need to be created
            db_host = session.query(nmap_host).filter_by(ip=h.ip).first()
            for p in h.all_ports():
                for scr in p.get_scripts():
                    self.tsLog("Processing script obj {scr}".format(scr=str(scr)))
                    db_port = session.query(nmap_port).filter_by(host_id=db_host.id) \
                        .filter_by(port_id=p.portId).filter_by(protocol=p.protocol).first()
                    db_script = session.query(nmap_script).filter_by(script_id=scr.scriptId) \
                        .filter_by(port_id=db_port.id).first()
                    cveResults = scr.get_cves()
                    for cveEntry in cveResults:
                        t_cve = cve(name=cveEntry.name, url=cveEntry.url, source=cveEntry.source,
                                    severity=cveEntry.severity, product=cveEntry.product,
                                    version=cveEntry.version, hostId=db_host.id)
                        session.add(t_cve)
                    if not db_script:  # if this script object doesn't exist, create it
                        t_nmap_script = nmap_script(scr.scriptId, scr.output, db_port.id, db_host.id)
                        self.tsLog("Adding nmap_script obj {script}".format(script=scr.scriptId))
                        session.add(t_nmap_script)
            for hs in h.get_hostscripts():
                db_script = session.query(nmap_script).filter_by(script_id=hs.scriptId) \
                    .filter_by(host_id=db_host.id).first()
                if not db_script:
                    t_nmap_script = nmap_script(hs.scriptId, hs.output, None, db_host.id)
                    session.add(t_nmap_script)

        session.commit()

        for h in parser.all_hosts():  # update everything
            db_host = session.query(nmap_host).filter_by(ip=h.ip).first()

            if db_host.ipv4 == '' and not h.ipv4 == '':
                db_host.ipv4 = h.ipv4
            if db_host.ipv6 == '' and not h.ipv6 == '':
                db_host.ipv6 = h.ipv6
            if db_host.macaddr == '' and not h.macaddr == '':
                db_host.macaddr = h.macaddr
            if not h.status == '':
                db_host.status = h.status
            if db_host.hostname == '' and not h.hostname == '':
                db_host.hostname = h.hostname
            if db_host.vendor == '' and not h.vendor == '':
                db_host.vendor = h.vendor
            if db_host.uptime == '' and not h.uptime == '':
                db_host.uptime = h.uptime
            if db_host.lastboot == '' and not h.lastboot == '':
                db_host.lastboot = h.lastboot
            if db_host.distance == '' and not h.distance == '':
                db_host.distance = h.distance
            if db_host.state == '' and not h.state == '':
                db_host.state = h.state
            if db_host.count == '' and not h.count == '':
                db_host.count = h.count
            session.add(db_host)

            tmp_name = ''
            tmp_accuracy = '0'  # TODO: check if better to convert to int for comparison

            os_nodes = h.get_OS()
            for os in os_nodes:
                db_os = session.query(nmap_os).filter_by(host_id=db_host.id).filter_by(name=os.name) \
                    .filter_by(family=os.family).filter_by(generation=os.generation) \
                    .filter_by(os_type=os.os_type).filter_by(vendor=os.vendor).first()
                db_os.os_accuracy = os.accuracy  # update the accuracy
                if not os.name == '':  # get the most accurate OS match/accuracy to store it in the host table for easier access
                    if os.accuracy > tmp_accuracy:
                        tmp_name = os.name
                        tmp_accuracy = os.accuracy

            if os_nodes:  # if there was operating system info to parse
                if not tmp_name == '' and not tmp_accuracy == '0':  # update the current host with the most accurate OS match
                    db_host.os_match = tmp_name
                    db_host.os_accuracy = tmp_accuracy
            session.add(db_host)

            for p in h.all_ports():
                s = p.get_service()
                if not (s is None):
                    #db_service = session.query(nmap_service).filter_by(name=s.name).filter_by(product=s.product).filter_by(version=s.version).filter_by(extrainfo=s.extrainfo).filter_by(fingerprint=s.fingerprint).first()
                    db_service = session.query(nmap_service).filter_by(name=s.name).first()
                else:
                    db_service = None
                # fetch the port
                db_port = session.query(nmap_port).filter_by(host_id=db_host.id) \
                    .filter_by(port_id=p.portId).filter_by(protocol=p.protocol).first()
                if db_port:
                    if db_port.state != p.state:
                        db_port.state = p.state
                        session.add(db_port)
                    if not (db_service is None) and db_port.service_id != db_service.id:  # if there is some new service information, update it
                        db_port.service_id = db_service.id
                        session.add(db_port)
                    for scr in p.get_scripts():  # store the script results (note that existing script outputs are also kept)
                        db_script = session.query(nmap_script).filter_by(script_id=scr.scriptId) \
                            .filter_by(port_id=db_port.id).first()
                        if not scr.output == '' and scr.output is not None:
                            db_script.output = scr.output
                            session.add(db_script)

        totalprogress = 100
        self.importProgressWidget.setProgress(int(totalprogress))
        self.importProgressWidget.show()

        session.commit()
        self.db.dbsemaphore.release()  # we are done with the DB
        self.tsLog('Finished in ' + str(time() - starttime) + ' seconds.')
        self.done.emit()
        self.importProgressWidget.hide()
        self.schedule.emit(parser, self.output == '')  # call the scheduler (if there is no terminal output it means we imported nmap)

    except Exception as e:
        self.tsLog('Something went wrong when parsing the nmap file..')
        self.tsLog("Unexpected error: {0}".format(sys.exc_info()[0]))
        self.tsLog(e)
        raise
        self.done.emit()
def identifyCorpus(corpus):
    corpus.update()
    clf = EmbeddingOracle.train(corpus)
    Parser.parse(corpus, clf)
    scores = Evaluation.evaluate(corpus)
    return scores
def test_summary(self):
    p = Parser()
    res = p.parse(self.f, None, False)
    self.assertTrue(res['summary'] is not None)
    self.assertTrue(len(res['summary']) == 46)
def __init__(self, filename): Parser.__init__(self, filename, ".check.csv")
def test_index_timestamp_returns_timestamp(self):
    p = Parser()
    html = '<tr><td></td><td><u>123456</u></td></tr>'
    ts = p.index_timestamp(make_tree(html))
    self.assertEqual(ts, 123456)
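# Index tests like the one above pass HTML fragments through a make_tree()
# helper. A minimal sketch of such a helper built on lxml.etree; this is an
# assumption about the test fixture, not the project's actual implementation.
from lxml import etree

def make_tree(fragment):
    # parse a well-formed snippet like '<tr>...</tr>' into an element
    return etree.fromstring(fragment)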
def run(self):  # it is necessary to get the qprocess because we need to send it back to the scheduler when we're done importing
    try:
        print "[+] Parsing nmap xml file: " + self.filename
        starttime = time.time()

        try:
            parser = Parser(self.filename)
        except:
            print '\t[-] Giving up on import due to previous errors.'
            print "\t[-] Unexpected error:", sys.exc_info()[0]
            self.done.emit()
            return

        self.db.dbsemaphore.acquire()  # ensure that while this thread is running, no one else can write to the DB

        s = parser.get_session()  # nmap session info
        if s:
            nmap_session(self.filename, s.start_time, s.finish_time, s.nmap_version, s.scan_args,
                         s.total_hosts, s.up_hosts, s.down_hosts)

        hostCount = len(parser.all_hosts())
        if hostCount == 0:  # to fix a division by zero if we ran nmap on one host
            hostCount = 1
        progress = 100.0 / hostCount
        totalprogress = 0
        self.tick.emit(int(totalprogress))

        for h in parser.all_hosts():  # create all the hosts that need to be created
            db_host = nmap_host.query.filter_by(ip=h.ip).first()
            if not db_host:  # if host doesn't exist in DB, create it first
                hid = nmap_host('', '', h.ip, h.ipv4, h.ipv6, h.macaddr, h.status, h.hostname,
                                h.vendor, h.uptime, h.lastboot, h.distance, h.state, h.count)
                note(hid, '')

        session.commit()

        for h in parser.all_hosts():  # create all OS, service and port objects that need to be created
            db_host = nmap_host.query.filter_by(ip=h.ip).first()  # fetch the host

            os_nodes = h.get_OS()  # parse and store all the OS nodes
            for os in os_nodes:
                db_os = nmap_os.query.filter_by(host_id=db_host.id).filter_by(name=os.name) \
                    .filter_by(family=os.family).filter_by(generation=os.generation) \
                    .filter_by(os_type=os.os_type).filter_by(vendor=os.vendor).first()
                if not db_os:
                    nmap_os(os.name, os.family, os.generation, os.os_type, os.vendor, os.accuracy, db_host)

            for p in h.all_ports():  # parse the ports
                s = p.get_service()
                if not (s is None):  # check if service already exists to avoid adding duplicates
                    db_service = nmap_service.query.filter_by(name=s.name).filter_by(product=s.product) \
                        .filter_by(version=s.version).filter_by(extrainfo=s.extrainfo) \
                        .filter_by(fingerprint=s.fingerprint).first()
                    if not db_service:
                        db_service = nmap_service(s.name, s.product, s.version, s.extrainfo, s.fingerprint)
                else:  # else, there is no service info to parse
                    db_service = None
                # fetch the port
                db_port = nmap_port.query.filter_by(host_id=db_host.id).filter_by(port_id=p.portId) \
                    .filter_by(protocol=p.protocol).first()
                if not db_port:
                    db_port = nmap_port(p.portId, p.protocol, p.state, db_host, db_service)

            session.commit()
            totalprogress += progress
            self.tick.emit(int(totalprogress))

        for h in parser.all_hosts():  # create all script objects that need to be created
            db_host = nmap_host.query.filter_by(ip=h.ip).first()
            for p in h.all_ports():
                for scr in p.get_scripts():
                    db_port = nmap_port.query.filter_by(host_id=db_host.id).filter_by(port_id=p.portId) \
                        .filter_by(protocol=p.protocol).first()
                    db_script = nmap_script.query.filter_by(script_id=scr.scriptId) \
                        .filter_by(port_id=db_port.id).first()
                    if not db_script:  # if this script object doesn't exist, create it
                        nmap_script(scr.scriptId, scr.output, db_port, db_host)
            for hs in h.get_hostscripts():
                db_script = nmap_script.query.filter_by(script_id=hs.scriptId) \
                    .filter_by(host_id=db_host.id).first()
                if not db_script:
                    nmap_script(hs.scriptId, hs.output, None, db_host)

        session.commit()

        for h in parser.all_hosts():  # update everything
            db_host = nmap_host.query.filter_by(ip=h.ip).first()  # get host from DB (if any with the same IP address)

            if db_host.ipv4 == '' and not h.ipv4 == '':
                db_host.ipv4 = h.ipv4
            if db_host.ipv6 == '' and not h.ipv6 == '':
                db_host.ipv6 = h.ipv6
            if db_host.macaddr == '' and not h.macaddr == '':
                db_host.macaddr = h.macaddr
            if not h.status == '':
                db_host.status = h.status
            if db_host.hostname == '' and not h.hostname == '':
                db_host.hostname = h.hostname
            if db_host.vendor == '' and not h.vendor == '':
                db_host.vendor = h.vendor
            if db_host.uptime == '' and not h.uptime == '':
                db_host.uptime = h.uptime
            if db_host.lastboot == '' and not h.lastboot == '':
                db_host.lastboot = h.lastboot
            if db_host.distance == '' and not h.distance == '':
                db_host.distance = h.distance
            if db_host.state == '' and not h.state == '':
                db_host.state = h.state
            if db_host.count == '' and not h.count == '':
                db_host.count = h.count

            tmp_name = ''
            tmp_accuracy = '0'  # TODO: check if better to convert to int for comparison

            os_nodes = h.get_OS()
            for os in os_nodes:
                db_os = nmap_os.query.filter_by(host_id=db_host.id).filter_by(name=os.name) \
                    .filter_by(family=os.family).filter_by(generation=os.generation) \
                    .filter_by(os_type=os.os_type).filter_by(vendor=os.vendor).first()
                db_os.os_accuracy = os.accuracy  # update the accuracy
                if not os.name == '':  # get the most accurate OS match/accuracy to store it in the host table for easier access
                    if os.accuracy > tmp_accuracy:
                        tmp_name = os.name
                        tmp_accuracy = os.accuracy

            if os_nodes:  # if there was operating system info to parse
                if not tmp_name == '' and not tmp_accuracy == '0':  # update the current host with the most accurate OS match
                    db_host.os_match = tmp_name
                    db_host.os_accuracy = tmp_accuracy

            for p in h.all_ports():
                s = p.get_service()
                if not (s is None):
                    # fetch the service for this port
                    db_service = nmap_service.query.filter_by(name=s.name).filter_by(product=s.product) \
                        .filter_by(version=s.version).filter_by(extrainfo=s.extrainfo) \
                        .filter_by(fingerprint=s.fingerprint).first()
                else:
                    db_service = None
                # fetch the port
                db_port = nmap_port.query.filter_by(host_id=db_host.id).filter_by(port_id=p.portId) \
                    .filter_by(protocol=p.protocol).first()
                db_port.state = p.state
                if not (db_service is None):  # if there is some new service information, update it
                    db_port.service_id = db_service.id
                for scr in p.get_scripts():  # store the script results (note that existing script outputs are also kept)
                    db_script = nmap_script.query.filter_by(script_id=scr.scriptId) \
                        .filter_by(port_id=db_port.id).first()
                    if not scr.output == '':
                        db_script.output = scr.output

            totalprogress += progress
            self.tick.emit(int(totalprogress))

        session.commit()
        self.db.dbsemaphore.release()  # we are done with the DB
        print '\t[+] Finished in ' + str(time.time() - starttime) + ' seconds.'
        self.done.emit()
        self.schedule.emit(parser, self.output == '')  # call the scheduler (if there is no terminal output it means we imported nmap)

    except:
        print '\t[-] Something went wrong when parsing the nmap file..'
        print "\t[-] Unexpected error:", sys.exc_info()[0]
        self.done.emit()
def test_index_nbytes_returns_nbytes(self):
    p = Parser()
    html = '<td class="tor-size"><u>123456</u></td>'
    nbytes = p.index_nbytes(make_tree(html))
    self.assertEqual(nbytes, 123456)
def train(model, sess):
    with sess:
        summary_writers = model.init_summaries(sess)
        loss_dict = model.fit(sess, summary_writers)
    return loss_dict


def test(model, sess):
    with sess:
        loss_dict = model.run_eval(sess, 'test')
    return loss_dict


if __name__ == '__main__':
    args = Parser().get_parser().parse_args()
    config = Config(args)
    model, sess = init_model(config)
    if config.load == True:
        print("\033[92m=>\033[0m Testing Model")
        test_loss, test_metrics = train(model, sess)
        output = "=> Test Loss : {}".format(test_loss)
    else:
        print("\033[92m=>\033[0m Training Model")
        loss_dict = train(model, sess)
        test_metrics = loss_dict['test_metrics']
        output = "=> Best Train Loss : {}, Test Loss : {}".format(
            loss_dict["train_loss"], loss_dict["test_loss"])
        # output += "\n=> Test : Coverage = {}, Average Precision = {}, Micro Precision = {}, Micro Recall = {}, Micro F Score = {}".format(metrics['coverage'], metrics['average_precision'], metrics['micro_precision'], metrics['micro_recall'], metrics['micro_f1'])
def test_main(self):
    p = Parser()
    res = p.parse(self.f, None, False)
    self.assertTrue(len(res) == 9)
def __init__(self, filename): Parser.__init__(self, filename, ".transaction.csv")
class Article(object):
    '''Article'''

    def __init__(self, url, raw_html, step, lang="en"):
        self.status = True
        self.url = url
        self.step = step
        self.lang = lang
        # title of the article
        self.title = None
        # text
        self.article = u""
        self.cleaned_text = u""
        # meta
        self.meta_description = u""
        self.meta_lang = u""
        self.meta_favicon = u""
        self.meta_keywords = u""
        # link and domain
        self.canonical_link = u""
        self.domain = u""
        # cleaned text
        self.top_node = None
        self.tags = set()
        self.final_url = url
        self.raw_html = raw_html
        # the lxml Document object
        self.parser = Parser()
        self.raw_doc = u""
        self.publish_date = None
        self.additional_data = {}
        self.links = []
        self.outlinks = []
        self.inlinks = []
        self.start_date = datetime.datetime.today()

    def get(self):
        try:
            self.doc = self.parser.fromstring(self.raw_html)
            # init extractor method
            extractor = StandardContentExtractor(self, "en")
            # init the document cleaner
            cleaner = StandardDocumentCleaner(self)
            # init the output formatter
            formatter = StandardOutputFormatter(self, stopwords_class="en")
            self.raw_doc = deepcopy(self.raw_html)
            self.title = extractor.get_title()
            # meta
            self.meta_lang = extractor.get_meta_lang()
            #self.meta_favicon = extractor.get_favicon()
            self.meta_description = extractor.get_meta_description()
            self.meta_description = self.meta_description.decode("utf-8")
            self.meta_keywords = extractor.get_meta_keywords()
            # domain and url
            self.canonical_link = extractor.get_canonical_link()
            self.domain = extractor.get_domain()
            # tags
            self.tags = extractor.extract_tags()
            # text
            self.doc = cleaner.clean()
            self.top_node = extractor.calculate_best_node()
            if self.top_node is not None:
                # post cleanup
                self.top_node = extractor.post_cleanup(self.top_node)
                # clean_text
                #self.cleaned_text = formatter.get_formatted_text()
            self.links = extractor.get_links()
            self.outlinks = [{"url": url, "step": self.step + 1} for url in extractor.get_outlinks()]
            try:
                self.content = formatter.get_formatted_text()
            except Exception as e:
                try:
                    self.content = bs(self.raw_html).getText()
                    self.content = nltk.clean_html(self.content)
                except Exception as e:
                    print e
                    self.content = re.sub(r'<.*?>', '', self.raw_html)
            #self.inlinks, self.inlinks_err = extractor.get_outlinks(self.links)
            # TODO
            # self.article.publish_date = self.extractor.get_pub_date(doc)
            # self.article.additional_data = self.extractor.more(doc)
            return True
        except Exception as e:
            self.status = {"url": self.url, "scope": "article extraction", "msg": e.args,
                           "status": False, "code": -2}
            return False

    def repr(self):
        self.status = {
            "url": self.canonical_link,
            "domain": self.domain,
            "title": self.title.encode("utf-8"),
            "content": self.content,
            "description": self.meta_description.encode("utf-8"),
            "outlinks": self.outlinks,
            "crawl_date": self.start_date,
            "raw_html": self.raw_html,
        }
        return

    def is_relevant(self, query):
        self.content = {"title": unicode(self.title), "content": unicode(self.content)}
        if query.match(self.content) is False:
            self.status = {"url": self.url, "code": -1, "msg": "Not Relevant", "status": False,
                           "title": self.title, "content": self.content}
            return False
        else:
            self.repr()
            return True