Example #1
import datetime as dt
import os


def time_process(data_file):
    curr_time = dt.datetime.now()
    # Walk backwards from now until the oldest timestamp in the file,
    # re-running the analysis at each step.
    fobj = XMLParser(data_file, curr_time)
    lim = fobj.find_oldest_time()
    while curr_time > lim:
        curr_time -= TIME_INCR
        print('running time analysis for ' + str(curr_time.date()))
        fobj.update_time(curr_time)
        d = fobj.parse_to_dict()
        if d:
            net = NetworkParser(d)
            print("Analyzing File " + data_file + ' at time ' +
                  str(curr_time.date()))
            na = NetworkAnalysis(net.G, os.path.basename(data_file),
                                 output_path, curr_time.date())

            basic = na.d3dump(public_out_path, str(curr_time.date()))

            public_data_output = public_data + na.fileName + "/"
            if generate_data:  # write out decentralized results
                na.write_permanent_data_json(public_data_output,
                                             str(curr_time.date()), basic)

    print("Completed Analyzing: " + data_file)
Example #2
def get_parser(filename):
    parsers = []
    parsers.append(PlaintextParser(filename))
    try:
        # LineParser may refuse a file by raising ValueError; skip it if so.
        parsers.append(LineParser(filename))
    except ValueError:
        pass
    parsers.append(XMLParser(filename))
    parsers.append(CtmParser(filename))

    for parser in parsers:
        if parser.wants_this_file():
            return parser

    return None
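A hypothetical caller for the function above; get_parser returns the first parser that claims the file, or None if no parser does:

parser = get_parser("meeting_notes.txt")  # hypothetical file name
if parser is None:
    raise ValueError("no registered parser wants this file")
print(type(parser).__name__ + " selected")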
Example #3
    def main(arg):
        try:
            if len(arg) != 1:
                print("Must be only one parameter - path to xml-file.")
                return 1

            xml_file = arg[0]
            if not os.path.isfile(xml_file):
                print("Invalid file - {}.".format(xml_file))
                return 1

            with open(xml_file, 'r') as f:
                buffer = f.read()

            Main.print_xml_tree(
                XMLParser(buffer).get_iterator(), Main.FOUR_SPACES)
            return 0

        except Exception as exp:
            print("An error occurred: " + str(exp))
            return -1
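A plausible way to invoke this method, assuming it is a static method on a class named Main (the wrapper below is hypothetical, not from the source):

if __name__ == "__main__":
    import sys
    sys.exit(Main.main(sys.argv[1:]))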
Example #4
    def process_file(data_file):
        curr_time = get_time()
        # Parse Into Network
        d = XMLParser(data_file, curr_time).parse_to_dict()
        net = NetworkParser(d)
        # Graph Analysis
        output("Analyzing File " + data_file)
        na = NetworkAnalysis(net.G, os.path.basename(data_file), output_path)
        na.outputBasicStats()
        na.outputNodesAndEdges()
        # na.nodeRemoval()

        basic = na.d3dump(public_out_path, str(curr_time))

        # Run Decentralized Search
        if decentralized_search_settings["run_decentralized_search"]:
            hierarchyG = net.G.copy()
            category_hierarchy = CategoryBasedHierarchicalModel(
                hierarchyG,
                similarity_matrix_type=category_hierarchical_model_settings[
                    "similarity_matrix_type"],
                max_branching_factor_root=category_hierarchical_model_settings[
                    "max_branching_factor_root"])
            category_hierarchy.build_hierarchical_model()
            decentralized_search_model = HierarchicalDecentralizedSearch(
                hierarchyG,
                category_hierarchy.hierarchy,
                na,
                detailed_print=decentralized_search_settings["detailed_print"],
                hierarchy_nodes_only=decentralized_search_settings[
                    "hierarchy_nodes_only"],
                apply_weighted_score=decentralized_search_settings[
                    "apply_weighted_score"],
            )
            (n_found, n_missing, av_path_len, av_unique_nodes,
             path_lengths_deciles) = decentralized_search_model.run_decentralized_search(
                 1000, decentralized_search_settings["widen_search"],
                 decentralized_search_settings["plots"])
            basic.update({
                "decentralized_num_paths_found": n_found,
                "decentralized_num_paths_missing": n_missing,
                "decentralized_average_decentralized_path_length": av_path_len,
                "decentralized_average_num_unique_nodes": av_unique_nodes,
                "hierarchy_num_nodes":
                    (len(category_hierarchy.hierarchy.nodes()) -
                     len(category_hierarchy.ranked_categories)),
                "hierarchy_num_cat_nodes":
                    len(category_hierarchy.ranked_categories),
                "hierarchy_num_levels": category_hierarchy.num_hierarchy_levels
            })
            basic["hierarchy_ratio_cat_nodes"] = basic[
                "hierarchy_num_cat_nodes"] / basic["hierarchy_num_nodes"]

            path_lengths_deciles_dict = {
                "path_length_{}_percentile".format((i + 1) * 10): decile
                for i, decile in enumerate(path_lengths_deciles)
            }
            basic.update(path_lengths_deciles_dict)

            random_search_model = RandomSearch(net.G, na)
            n_found, n_missing, av_path_len, av_unique_nodes = random_search_model.run_search(
                1000, decentralized_search_settings["widen_search"],
                decentralized_search_settings["plots"])
            basic.update({
                "random_num_paths_found": n_found,
                "random_num_paths_missing": n_missing,
                "random_average_decentralized_path_length": av_path_len,
                "random_average_num_unique_nodes": av_unique_nodes
            })

        if generate_data:  # write out decentralized results
            na.write_permanent_data_json(public_data, basic)

        # na.generateDrawing()

        output("Completed Analyzing: " + data_file)
Example #5
    def time_process(data_file):
        curr_time = dt.datetime.now()
        # Walk backwards from now until the oldest timestamp in the file,
        # re-running the analysis at each step.
        fobj = XMLParser(data_file, curr_time)
        lim = fobj.find_oldest_time()
        while curr_time > lim:
            curr_time -= TIME_INCR
            print('running time analysis for ' + str(curr_time))
            fobj.update_time(curr_time)
            d = fobj.parse_to_dict()
            if d:
                net = NetworkParser(d)
                output("Analyzing File " + data_file + ' at time ' +
                       str(curr_time))
                na = NetworkAnalysis(net.G, os.path.basename(data_file),
                                     output_path, curr_time)

                basic = na.d3dump(public_out_path, str(curr_time))

                # Run Decentralized Search
                try:
                    if decentralized_search_settings[
                            "run_decentralized_search"]:
                        hierarchyG = net.G.copy()
                        category_hierarchy = CategoryBasedHierarchicalModel(
                            hierarchyG,
                            similarity_matrix_type=(
                                category_hierarchical_model_settings[
                                    "similarity_matrix_type"]),
                            max_branching_factor_root=(
                                category_hierarchical_model_settings[
                                    "max_branching_factor_root"]))
                        category_hierarchy.build_hierarchical_model()
                        decentralized_search_model = HierarchicalDecentralizedSearch(
                            hierarchyG,
                            category_hierarchy.hierarchy,
                            na,
                            detailed_print=decentralized_search_settings[
                                "detailed_print"],
                            hierarchy_nodes_only=decentralized_search_settings[
                                "hierarchy_nodes_only"],
                            apply_weighted_score=decentralized_search_settings[
                                "apply_weighted_score"],
                        )
                        (n_found, n_missing, av_path_len, av_unique_nodes,
                         path_lengths_deciles) = decentralized_search_model.run_decentralized_search(
                             1000,
                             decentralized_search_settings["widen_search"],
                             decentralized_search_settings["plots"])
                        basic.update({
                            "decentralized_num_paths_found": n_found,
                            "decentralized_num_paths_missing": n_missing,
                            "decentralized_average_decentralized_path_length":
                                av_path_len,
                            "decentralized_average_num_unique_nodes":
                                av_unique_nodes,
                            "hierarchy_num_nodes":
                                (len(category_hierarchy.hierarchy.nodes()) -
                                 len(category_hierarchy.ranked_categories)),
                            "hierarchy_num_levels":
                                category_hierarchy.num_hierarchy_levels
                        })

                        path_lengths_deciles_dict = {
                            "path_length_{}_percentile".format((i + 1) * 10):
                                decile
                            for i, decile in enumerate(path_lengths_deciles)
                        }
                        basic.update(path_lengths_deciles_dict)

                        random_search_model = RandomSearch(net.G, na)
                        n_found, n_missing, av_path_len, av_unique_nodes = random_search_model.run_search(
                            1000,
                            decentralized_search_settings["widen_search"],
                            decentralized_search_settings["plots"])
                        basic.update({
                            "random_num_paths_found": n_found,
                            "random_num_paths_missing": n_missing,
                            "random_average_decentralized_path_length":
                                av_path_len,
                            "random_average_num_unique_nodes": av_unique_nodes
                        })
                except Exception:
                    # A failed search on one snapshot should not abort the
                    # whole time walk; skip it and continue.
                    pass

                if generate_data:  # write out decentralized results
                    na.write_permanent_data_json(public_data, basic,
                                                 str(curr_time.date()))

        output("Completed Analyzing: " + data_file)
Example #6
from xml_parser import XML, XMLParser
import openpyxl

root_url = "/var/www/Ozone/Ozone/static/xml/"
filenames = [
    "mbt",
    "kbt"
]

format = ".xml"


for filename in filenames:
    parser = XMLParser(root_url + filename + ext)

    offers = parser.ozon_offers_list()
    titles = parser.get_all_keys()
    categories = parser.get_all_categories()

    # Order columns by how often each key appears in the first 100 offers.
    titles.sort(key=lambda x: str(offers[0:100]).count(x), reverse=True)

    wb = openpyxl.Workbook()
    ws = wb.active  # reuse the default sheet rather than leaving it empty
    ws.title = "offers"

    for col, key in enumerate(titles, start=2):
        ws.cell(1, col, key)

    row = 2
    for offer in offers:
        ws.cell(row, 1, str(categories[int(offer["categoryId"])]))
        col = 2
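        # The snippet is truncated here. A plausible completion (assumed, not
        # from the source): fill one column per title for this offer, assuming
        # offers are dicts keyed by the same titles as the header row.
        for key in titles:
            ws.cell(row, col, str(offer.get(key, "")))
            col += 1
        row += 1

    # Hypothetical finish: one workbook per input file.
    wb.save(filename + ".xlsx")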