Example #1
	def __init__(self):
		self.__conf = Configure()
		self.__associations = Associations()
		self.__sites = Sites()

		xReader = XMLReader()
		xParser = XMLParser()

		# Search configuration: sites, parameters, and pages to search.
		confTree = xReader.getTree('xml/conf.xml')
		if confTree is None:
			exit()
		searchParams = xParser.getSearchParams(confTree)
		searchSites = xParser.getSearchSites(confTree)
		pagesToSearch = xParser.getPagesToSearch(confTree)
		self.masterInspectionPath = xParser.getMIXML(confTree)

		self.__conf.setParams(searchSites, searchParams, pagesToSearch)

		# Keyword and avoid lists from both keyword files.
		keywordTree = xReader.getTree('xml/keywords.xml')
		fKeywordTree = xReader.getTree('xml/f_keywords.xml')
		if keywordTree is None or fKeywordTree is None:
			exit()
		keywords = xParser.getKeywords(keywordTree)
		fKeywords = xParser.getKeywords(fKeywordTree)
		avoids = xParser.getAvoids(keywordTree)
		fAvoids = xParser.getAvoids(fKeywordTree)

		self.__associations.setParams(keywords, avoids, fKeywords, fAvoids)

		# Known good and bad site lists.
		sitesTree = xReader.getTree('xml/sites.xml')
		if sitesTree is None:
			exit()
		goodSites, badSites = xParser.getSites(sitesTree)

		self.__sites.setParams(goodSites, badSites)
Example #2
 def getInspectionsStr(self, xml):
     xReader = XMLReader()
     xParser = XMLParser()
     tree = xReader.getTree(xml)
     XMLInspections = []
     if tree is None:
         print(xml, "Failed to read.")
         # Return an empty list so callers can iterate over the result safely.
         return XMLInspections
     link, score, url, fil, ID = xParser.getInspectionDataWithId(tree)
     for i in range(len(link)):
         XMLInspections.append(Inspection(link[i], score[i], url[i], fil[i], ID[i]))
     return XMLInspections
Example #3
 def getInspections(self, xmls):
     xReader = XMLReader()
     xParser = XMLParser()
     XMLInspections = []
     for ind, xml in enumerate(xmls):
         tree = xReader.getTree(xml)
         if tree is None:
             print(ind, xml, "Failed to read.")
             continue
         link, score, url, fil = xParser.getInspectionData(tree)
         for i in range(len(link)):
             XMLInspections.append(Inspection(link[i], score[i], url[i], fil[i]))
     return XMLInspections
Example #4
	def __init__(self, vote, voteId, skip=False):
		if skip:
			return
		self.vote = vote
		self.voteId = voteId

		xReader = XMLReader()
		xParser = XMLParser()
		confTree = xReader.getTree('xml/conf.xml')
		if confTree is None:
			print('Abort. Failed to read xml/conf.xml')
			exit()
		self.masterInspectionPath = xParser.getMIXML(confTree)
Example #5
	def updateSitesXMl(self):
		xReader = XMLReader()
		xParser = XMLParser()
		xWriter = XMLWriter()
		tree = xReader.getTree('xml/sites.xml')
		if tree is None:
			return
		gdSites, bdSites = xParser.getSites(tree)

		# Find the inspection that matches the vote.
		data = None
		for obj in self.XMLInspections:
			if obj.ID == self.voteId:
				data = obj
				break
		if data is None:
			return

		if self.vote == "up":
			gdSites.append(data.url)
		else:
			bdSites.append(data.url)
		xWriter.writeSitesXML(gdSites, bdSites, 'xml/sites.xml')
Example #6
	def storeWords(self):
		self.wl = WordList()
		xReader = XMLReader()
		xParser = XMLParser()

		# Start from the stored word list if one exists.
		if xReader.checkIfExistsQuiet('xml/words.xml'):
			tree = xReader.getTree('xml/words.xml')
			self.wl = xParser.getWords(tree)

		# An up-vote counts words as useful, any other vote as useless.
		usf = 0
		usl = 0
		if self.vote == "up":
			usf = 1
		else:
			usl = 1

		for obj in self.XMLInspections:
			if obj.ID != self.voteId:
				continue

			pl = PageLoader(obj.fil)
			if not pl.isReadable():
				print('Abort. File not readable:', obj.fil)
				exit()
			pl.read()

			# Keep only purely alphanumeric words.
			patt = "^[a-zA-Z0-9]*$"
			pl.linkWords = self.removeListElesNotPatterned(patt, pl.linkWords)
			pl.titleWords = self.removeListElesNotPatterned(patt, pl.titleWords)
			pl.headerWords = self.removeListElesNotPatterned(patt, pl.headerWords)
			pl.specialWords = self.removeListElesNotPatterned(patt, pl.specialWords)
			pl.normalWords = self.removeListElesNotPatterned(patt, pl.normalWords)

			for words in (pl.linkWords, pl.titleWords, pl.headerWords,
					pl.specialWords, pl.normalWords):
				for word in words:
					self.wl.append(word, usf, usl)
			return
Example #7
class Parser:
    def __init__(self):
        self.__parser = XMLParser(global_define.XML_NODE_NAME,
                                  global_define.XML_TAG_NAME_LIST)

    def parse(self, file_path):
        xml_dicts = None
        try:
            xml_dicts = self.__parser.parse(file_path)
        except Exception as e:
            logger.error("%s file parse failed. [%s]" % (file_path, e))
        return xml_dicts
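
A minimal usage sketch for this wrapper, assuming global_define and a configured logger are provided by the surrounding project; the file path below is a placeholder:

p = Parser()
xml_dicts = p.parse('nodes.xml')  # placeholder path
if xml_dicts is None:
    print('parse failed; details were logged')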
Example #8
def time_process(data_file):
    curr_time = dt.datetime.now()
    # Step backwards from the current time to the oldest record in the file.
    fobj = XMLParser(data_file, curr_time)
    lim = fobj.find_oldest_time()
    while curr_time > lim:
        curr_time -= TIME_INCR
        print('running time analysis for ' + str(curr_time.date()))
        fobj.update_time(curr_time)
        d = fobj.parse_to_dict()
        if d:
            net = NetworkParser(d)
            print("Analyzing File " + data_file + ' at time ' +
                  str(curr_time.date()))
            na = NetworkAnalysis(net.G, os.path.basename(data_file),
                                 output_path, curr_time.date())

            basic = na.d3dump(public_out_path, str(curr_time.date()))

            public_data_output = public_data + na.fileName + "/"
            if generate_data:  # write out decentralized results
                na.write_permanent_data_json(public_data_output,
                                             str(curr_time.date()), basic)

    print("Completed Analyzing: " + data_file)
Example #9
def get_parser(filename):
    parsers = []
    parsers.append(PlaintextParser(filename))
    try:
        parsers.append(LineParser(filename))
    except ValueError:
        pass
    parsers.append(XMLParser(filename))
    parsers.append(CtmParser(filename))

    for parser in parsers:
        if parser.wants_this_file():
            return parser

    return None
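
A short usage sketch; the file name is a placeholder, and which parser claims the file depends entirely on each wants_this_file check:

parser = get_parser('session.xml')  # placeholder input file
if parser is None:
    raise SystemExit('no parser accepted the file')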
Example #10
    def main(arg):
        try:
            if len(arg) != 1:
                print("Must be only one parameter - path to xml-file.")
                return 1

            xml_file = arg[0]
            if not os.path.isfile(xml_file):
                print("Invalid file - {}.".format(xml_file))
                return 1

            with open(xml_file, 'r') as f:
                buffer = f.read()

            Main.print_xml_tree(
                XMLParser(buffer).get_iterator(), Main.FOUR_SPACES)
            return 0

        except Exception as exp:
            print("An error occurred: " + str(exp))
            return -1
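
A minimal entry-point sketch, assuming Main is the enclosing class and that main expects the argument list without the program name (both are assumptions here):

import sys

if __name__ == '__main__':
    sys.exit(Main.main(sys.argv[1:]))  # pass only the user-supplied arguments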
Example #11
    def process_file(data_file):
        curr_time = get_time()
        # Parse Into Network
        d = XMLParser(data_file, get_time()).parse_to_dict()
        net = NetworkParser(d)
        # Graph Analysis
        output("Analyzing File " + data_file)
        na = NetworkAnalysis(net.G, os.path.basename(data_file), output_path)
        na.outputBasicStats()
        na.outputNodesAndEdges()
        # na.nodeRemoval()

        basic = na.d3dump(public_out_path, str(curr_time))

        # Run Decentralized Search
        if decentralized_search_settings["run_decentralized_search"]:
            hierarchyG = net.G.copy()
            category_hierarchy = CategoryBasedHierarchicalModel(
                hierarchyG,
                similarity_matrix_type=category_hierarchical_model_settings[
                    "similarity_matrix_type"],
                max_branching_factor_root=category_hierarchical_model_settings[
                    "max_branching_factor_root"])
            category_hierarchy.build_hierarchical_model()
            decentralized_search_model = HierarchicalDecentralizedSearch(
                hierarchyG,
                category_hierarchy.hierarchy,
                na,
                detailed_print=decentralized_search_settings["detailed_print"],
                hierarchy_nodes_only=decentralized_search_settings[
                    "hierarchy_nodes_only"],
                apply_weighted_score=decentralized_search_settings[
                    "apply_weighted_score"],
            )
            n_found, n_missing, av_path_len, av_unique_nodes, path_lengths_deciles = decentralized_search_model.run_decentralized_search(
                1000, decentralized_search_settings["widen_search"],
                decentralized_search_settings["plots"])
            basic.update({
                "decentralized_num_paths_found": n_found,
                "decentralized_num_paths_missing": n_missing,
                "decentralized_average_decentralized_path_length": av_path_len,
                "decentralized_average_num_unique_nodes": av_unique_nodes,
                "hierarchy_num_nodes": (len(category_hierarchy.hierarchy.nodes()) -
                                        len(category_hierarchy.ranked_categories)),
                "hierarchy_num_cat_nodes": len(category_hierarchy.ranked_categories),
                "hierarchy_num_levels": category_hierarchy.num_hierarchy_levels
            })
            basic["hierarchy_ratio_cat_nodes"] = (
                basic["hierarchy_num_cat_nodes"] / basic["hierarchy_num_nodes"])

            path_lengths_deciles_dict = {}
            for i, decile in enumerate(path_lengths_deciles):
                key = "path_length_" + str((i + 1) * 10) + "_percentile"
                path_lengths_deciles_dict[key] = decile
            basic.update(path_lengths_deciles_dict)

            random_search_model = RandomSearch(net.G, na)
            n_found, n_missing, av_path_len, av_unique_nodes = random_search_model.run_search(
                1000, decentralized_search_settings["widen_search"],
                decentralized_search_settings["plots"])
            basic.update({
                "random_num_paths_found": n_found,
                "random_num_paths_missing": n_missing,
                "random_average_decentralized_path_length": av_path_len,
                "random_average_num_unique_nodes": av_unique_nodes
            })

        if generate_data:
            na.write_permanent_data_json(
                public_data, basic)  # write out decentralized results

        # na.generateDrawing()

        output("Completed Analyzing: " + data_file)
Example #12
    def time_process(data_file):
        curr_time = dt.datetime.now()
        # Step backwards from the current time to the oldest record in the file.
        fobj = XMLParser(data_file, curr_time)
        lim = fobj.find_oldest_time()
        while curr_time > lim:
            curr_time -= TIME_INCR
            print('running time analysis for ' + str(curr_time))
            fobj.update_time(curr_time)
            d = fobj.parse_to_dict()
            if d:
                net = NetworkParser(d)
                output("Analyzing File " + data_file + ' at time ' +
                       str(curr_time))
                na = NetworkAnalysis(net.G, os.path.basename(data_file),
                                     output_path, curr_time)

                basic = na.d3dump(public_out_path, str(curr_time))

                # Run Decentralized Search
                try:
                    if decentralized_search_settings["run_decentralized_search"]:
                        hierarchyG = net.G.copy()
                        category_hierarchy = CategoryBasedHierarchicalModel(
                            hierarchyG,
                            similarity_matrix_type=category_hierarchical_model_settings[
                                "similarity_matrix_type"],
                            max_branching_factor_root=category_hierarchical_model_settings[
                                "max_branching_factor_root"])
                        category_hierarchy.build_hierarchical_model()
                        decentralized_search_model = HierarchicalDecentralizedSearch(
                            hierarchyG,
                            category_hierarchy.hierarchy,
                            na,
                            detailed_print=decentralized_search_settings[
                                "detailed_print"],
                            hierarchy_nodes_only=decentralized_search_settings[
                                "hierarchy_nodes_only"],
                            apply_weighted_score=decentralized_search_settings[
                                "apply_weighted_score"],
                        )
                        n_found, n_missing, av_path_len, av_unique_nodes, path_lengths_deciles = decentralized_search_model.run_decentralized_search(
                            1000,
                            decentralized_search_settings["widen_search"],
                            decentralized_search_settings["plots"])
                        basic.update({
                            "decentralized_num_paths_found": n_found,
                            "decentralized_num_paths_missing": n_missing,
                            "decentralized_average_decentralized_path_length": av_path_len,
                            "decentralized_average_num_unique_nodes": av_unique_nodes,
                            "hierarchy_num_nodes": (len(category_hierarchy.hierarchy.nodes()) -
                                                    len(category_hierarchy.ranked_categories)),
                            "hierarchy_num_levels": category_hierarchy.num_hierarchy_levels
                        })

                        path_lengths_deciles_dict = {}
                        for i, decile in enumerate(path_lengths_deciles):
                            key = "path_length_" + str((i + 1) * 10) + "_percentile"
                            path_lengths_deciles_dict[key] = decile
                        basic.update(path_lengths_deciles_dict)

                        random_search_model = RandomSearch(net.G, na)
                        n_found, n_missing, av_path_len, av_unique_nodes = random_search_model.run_search(
                            1000,
                            decentralized_search_settings["widen_search"],
                            decentralized_search_settings["plots"])
                        basic.update({
                            "random_num_paths_found": n_found,
                            "random_num_paths_missing": n_missing,
                            "random_average_decentralized_path_length": av_path_len,
                            "random_average_num_unique_nodes": av_unique_nodes
                        })
                except Exception:
                    # Skip this time step on analysis errors; a bare except
                    # would also trap KeyboardInterrupt and SystemExit.
                    pass

                if generate_data:  # write out decentralized results
                    na.write_permanent_data_json(public_data, basic,
                                                 str(curr_time.date()))

        output("Completed Analyzing: " + data_file)
Example #13
from xml_parser import XML, XMLParser
import openpyxl

root_url = "/var/www/Ozone/Ozone/static/xml/"
filenames = [
    "mbt",
    "kbt"
]

ext = ".xml"


for filename in filenames:
    parser = XMLParser(root_url + filename + ext)

    offers = parser.ozon_offers_list()
    titles = parser.get_all_keys()
    categories = parser.get_all_categories()

    # Order columns by how often each key appears in the first 100 offers.
    titles.sort(key=lambda x: str(offers[0:100]).count(x), reverse=True)

    wb = openpyxl.Workbook()
    ws = wb.create_sheet("offers")

    # Header row: one column per key, starting at column 2.
    for col, key in enumerate(titles, start=2):
        ws.cell(1, col, key)

    row = 2
    for offer in offers:
        ws.cell(row, 1, str(categories[int(offer["categoryId"])]))
        col = 2
        # Presumed continuation (the original snippet is truncated here):
        # write each offer's value under its key column, then save the sheet.
        for key in titles:
            ws.cell(row, col, str(offer.get(key, "")))
            col += 1
        row += 1

    wb.save(root_url + filename + ".xlsx")