示例#1
0
	def parse_folder(path):
		"""
		Parses all .bib files in given folder.

		Every file is submitted to a process pool, where it is parsed by
		BibParser()._parse_file. The collected items are sorted by
		const.DEFAULT_ORDER_BY, finalized against a search index built
		from them, and the index is then updated with the finalized items.

		path -- folder that is scanned for "*.bib" files

		Returns a tuple (parsed_items, item_index) containing all items found.
		Raises Exception if path is not an existing folder.
		"""
		if not os.path.isdir(path):
			raise Exception("Path to folder expected")

		parsed_items = []
		files = utils.files_in_folder(path, "*.bib")
		# The with-block shuts the pool down even when a worker raises,
		# so a failed parse no longer leaves the executor open.
		with concurrent.futures.ProcessPoolExecutor(
			max_workers=multiprocessing.cpu_count()
		) as executor:
			futures = [
				executor.submit(
					BibParser()._parse_file,
					os.path.join(path, filename)
				)
				for filename in files
			]
			# results are gathered in submission order
			for future in futures:
				parsed_items += future.result()

		parsed_items = sorted(
			parsed_items,
			key=BibItem.key_to_key_func(const.DEFAULT_ORDER_BY)
		)
		item_index = search_index.Index(parsed_items)
		fin_ctx = FinalizingContext(item_index)
		for item in parsed_items:
			item.finalize_item_set(fin_ctx)
		# update is called again after finalization, as in the original flow
		item_index.update(parsed_items)
		return (parsed_items, item_index)
示例#2
0
def generate_output_template():
    """
    Render the prediction report to ./reports/index.html.

    Lists the entries of ./output/, ./dataset/test/raw/ and
    ./dataset/test/segmentation_mask/ via files_in_folder, combines them by
    position into PredictionResult objects and renders those with the
    'index.html' template loaded from ./templates. The ./reports folder is
    created when it does not exist yet.

    The number of predictions is driven by the ./output/ listing; the raw
    and mask listings are indexed by the same position, so a shorter listing
    makes the lookup fail instead of silently dropping entries.
    """
    output_images = files_in_folder("./output/")
    test_images = files_in_folder("./dataset/test/raw/")
    true_masks_images = files_in_folder("./dataset/test/segmentation_mask/")
    # Positional indexing (rather than zip) keeps a length mismatch loud.
    template_predictions = [
        PredictionResult(test_images[i], output_image, true_masks_images[i])
        for i, output_image in enumerate(output_images)
    ]
    loader = FileSystemLoader("./templates")
    env = Environment(loader=loader)
    template = env.get_template('index.html')
    template_output = template.render(predictions=template_predictions)
    reports_path = "./reports"
    # exist_ok removes the check-then-create race of exists() + makedirs()
    os.makedirs(reports_path, exist_ok=True)
    # the with-block closes the file even if the write fails
    with open(f"{reports_path}/index.html", "w") as text_file:
        text_file.write(template_output)
	def parse_folder(self, path: str) -> [BibItem]:
		"""
		Parse every "*.bib" file that utils.files_in_folder reports for
		the given folder and return all parsed items as one flat list.

		Raises Exception when path is not an existing folder.
		"""
		if not os.path.isdir(path):
			raise Exception("Path to folder expected")

		# flatten the per-file results of self.parse_file, in listing order
		return [
			item
			for bib_name in utils.files_in_folder(path, "*.bib")
			for item in self.parse_file(os.path.join(path, bib_name))
		]
示例#4
0
    def parse_folder(self, path):
        """
        Collect the items of all .bib files located in the given folder.

        Each file reported by utils.files_in_folder for "*.bib" is passed
        to self.parse_file; the results are concatenated into one list.
        Raises Exception when path is not an existing folder.
        """
        folder_exists = os.path.isdir(path)
        if not folder_exists:
            raise Exception("Path to folder expected")

        collected = []
        for bib_name in utils.files_in_folder(path, "*.bib"):
            bib_path = os.path.join(path, bib_name)
            collected.extend(self.parse_file(bib_path))
        return collected
示例#5
0
def main():
	"""
	Print every "*.pdf" path that utils.files_in_folder finds under
	config.www.elibrary_dir and that is not among the paths derived from
	the keys of item_index["filename"].

	Prints a message and exits with status 1 when the root is not a folder.
	"""
	library_root = config.www.elibrary_dir
	if not os.path.isdir(library_root):
		print("root folder '{elibrary_dir}' is inaccessible".format(
			elibrary_dir=library_root
		))
		sys.exit(1)

	# database filenames are relative but start with "/": drop that first
	# character before joining them to the root
	known_paths = {
		os.path.join(library_root, indexed_name[1:])
		for indexed_name in item_index["filename"].keys()
	}

	pdf_paths = utils.files_in_folder(library_root, "*.pdf", excludes=EXCLUDED_FOLDERS)
	unreferenced = [pdf_path for pdf_path in pdf_paths if pdf_path not in known_paths]

	for pdf_path in unreferenced:
		print(pdf_path)
示例#6
0
def main():
    """
    List the "*.pdf" files below config.www.elibrary_dir that have no
    matching key in item_index["filename"], one path per line.

    When the configured root is not a folder, a message is printed and the
    process exits with status 1.
    """
    root_dir = config.www.elibrary_dir
    if not os.path.isdir(root_dir):
        message = "root folder '{elibrary_dir}' is inaccessible".format(
            elibrary_dir=root_dir)
        print(message)
        sys.exit(1)

    # filename in database is relative, but begins from / -- the first
    # character is stripped so os.path.join keeps the root
    indexed = set()
    for db_filename in item_index["filename"].keys():
        indexed.add(os.path.join(root_dir, db_filename[1:]))

    found = utils.files_in_folder(root_dir,
                                  "*.pdf",
                                  excludes=EXCLUDED_FOLDERS)
    missing = [path for path in found if path not in indexed]

    for path in missing:
        print(path)
示例#7
0
def update_graphs(qrels, run1, run2, metric, top):
    """Build the four figures that compare two runs for one metric.

    Args:
        qrels: Name of the qrels file; it is looked up as ``"qrels/" + qrels``.
        run1: Base run. Passed as-is to ``trec_eval``, read from
            ``"runs/" + run1`` for the ranking, and used as the column label
            of its scores.
        run2: Alternative run, handled the same way as ``run1``.
        metric: Metric identifier. Its digits are joined and parsed as the
            cutoff ``n`` (e.g. 5 for "@5"), and it must be a key of
            ``pretty_metric``.
        top: Topic whose two rankings are shown in the ranking plot.

    Returns:
        Tuple ``(fig_a, fig_b, fig_box, fig_table)``: the per-topic bar
        chart of both runs, the top-``n`` ranking chart for ``top``, a box
        plot with one box per evaluable file in ``runs``, and a summary
        table of those runs.

    Raises:
        ValueError: If ``metric`` contains no digits (``int("")``).
        ZeroDivisionError: If the digits of ``metric`` evaluate to 0.
    """
    global qrels_dict  # only read in this function

    df_a = trec_eval(metric, "qrels/" + qrels, run1)
    df_b = trec_eval(metric, "qrels/" + qrels, run2)
    df = pd.concat([df_a, df_b], axis=1)
    df = df.sort_values(run1, ascending=False)

    # ranking
    ranking_1 = read_run("runs/" + run1)
    ranking_2 = read_run("runs/" + run2)

    # one frame per topic: ranked docids plus their relevance under the run name
    df_dict_a = {}
    for topic in ranking_1.keys():
        df_dict_a[topic] = pd.DataFrame(ranking_1[topic], columns=["docid"])
        df_dict_a[topic][run1] = df_dict_a[topic]["docid"].apply(
            lambda x: find_relevance(x, qrels_dict[topic]))

    df_dict_b = {}
    for topic in ranking_2.keys():
        df_dict_b[topic] = pd.DataFrame(ranking_2[topic], columns=["docid"])
        df_dict_b[topic][run2] = df_dict_b[topic]["docid"].apply(
            lambda x: find_relevance(x, qrels_dict[topic]))

    # merge dataframes (by rank position); the shared "docid" column becomes
    # "docid_x" (run1) and "docid_y" (run2)
    df_dict = {}
    df_dict[top] = df_dict_a[top].merge(df_dict_b[top],
                                        left_index=True,
                                        right_index=True)

    # set n to 5 if @5 or 10 if @10:
    total = 1
    n = int("".join(filter(str.isdigit, metric)))
    block_size = total / n

    # ndcg plot
    fig_a = df.plot.bar(barmode="group")
    fig_a.update_layout(
        plot_bgcolor="white",
        yaxis_title=pretty_metric[metric],
        xaxis_title="Topic",
        title=(f"{pretty_metric[metric]}: {df_a[run1].mean():.4f}"
               f"({df_a[run1].median():.4f}) & {df_b[run2].mean():.4f}"
               f" ({df_b[run2].median():.4f})"),
        title_x=0.5,
    )

    # ranking annotations docids (should be put in separate function)
    y1 = df_dict[top][run1].values
    y2 = df_dict[top][run2].values

    xcoord = df_dict[top].iloc[0:n].index  # [0,1,2]

    # only xi (the rank position) is used below; zip stops at len(xcoord)
    annotations1 = [
        dict(
            x=8,  # xi-0.2,
            y=xi - (total / 5) + (5 / 30),
            text=df_dict[top]["docid_x"].iloc[xi],
            xanchor="auto",
            yanchor="bottom",
            showarrow=False,
            font={
                "size": block_size * 75,
                "color": "grey"
            },
        ) for xi, yi in zip(xcoord, y1)
    ]

    annotations2 = [
        dict(
            x=8,
            y=xi + (total / 5) + (5 / 30),
            text=mark_new_text(
                df_dict[top]["docid_y"].iloc[xi],
                df_dict[top]["docid_x"].iloc[0:n],
            ),
            xanchor="auto",
            yanchor="bottom",
            showarrow=False,
            font={
                "size":
                block_size * 75,
                "color":
                mark_new(df_dict[top]["docid_y"].iloc[xi],
                         df_dict[top]["docid_x"].iloc[0:n]),
            },
        ) for xi, yi in zip(xcoord, y2)
    ]

    annotations = annotations1 + annotations2

    # ranking plot
    fig_b = df_dict[top].iloc[0:n].plot.barh(x=[run1, run2], barmode="group")
    fig_b.update_layout(
        plot_bgcolor="white",
        yaxis={
            "tickmode": "array",
            "tickvals": np.arange(n),
            "ticktext": np.arange(1, n + 1),
            "autorange": "reversed",
        },
        title=
        f"{pretty_metric[metric]}: {df_a[run1].loc[top]} & {df_b[run2].loc[top]}",
        title_x=0.5,
        yaxis_title="Rank",
        xaxis={
            "range": [0, 16],
            "fixedrange": True
        },
        xaxis_title="Relevance",
        annotations=annotations,
    )

    # evaluate every file in "runs"; files that fail evaluation are logged
    # and left out instead of aborting the whole update
    df_all = pd.DataFrame()
    for file in files_in_folder("runs"):
        try:
            df_tmp = trec_eval(metric, "qrels/" + qrels, file)
            df_all = pd.concat([df_all, df_tmp], axis=1)
        except Exception as e:
            logging.info(f"{file} is not a TREC run. --> {e}")
            continue

    medians = df_all.median().sort_values()
    medians_desc = df_all.median().sort_values(ascending=False)

    traces = []
    # DataFrame.iteritems() was removed in pandas 2.0; items() yields the
    # same (column name, Series) pairs on old and new versions.
    for run_name, run_data in df_all[medians.index].items():
        color = "rgb(0, 0, 0)"
        if run_name == run1:
            color = "rgb(99, 110, 250)"
        if run_name == run2:
            color = "rgb(239, 85, 59)"
        traces.append(
            go.Box(
                y=run_data,
                name=run_name,
                boxpoints="all",
                jitter=0.5,
                whiskerwidth=0.2,
                marker=dict(size=2, color=color),
                line=dict(width=1),
            ))

    fig_box = go.Figure(data=traces)
    fig_box.update_layout(
        plot_bgcolor="white",
        title="Overview",
        title_x=0.5,
        yaxis_title=pretty_metric[metric],
        showlegend=False,
    )

    headerColor = "grey"

    # run names ordered by descending median; shared by all table columns
    ranked_runs = df_all[medians_desc.index].columns

    fig_table = go.Figure(data=[
        go.Table(
            header=dict(
                values=[
                    "<b>run</b>",
                    f"<b>percentage new in top {n} (relative to base)</b>",
                    f"<b>{pretty_metric[metric]} (mean : median)</b>",
                    "<b>p-value (H0: Equal average with base)</b>",
                ],
                line_color="darkslategray",
                fill_color=headerColor,
                align=["left", "center"],
                font=dict(color="white", size=12),
            ),
            cells=dict(
                values=[
                    ranked_runs,
                    [
                        "{:.2f}%".format(
                            new_percentage(ranking_1, read_run("runs/" +
                                                               file), n))
                        for file in ranked_runs
                    ],
                    [(f"{round(np.mean(df_all[run]), 4):.4f} : "
                      f"{round(np.median(df_all[run]), 4):.4f}")
                     for run in ranked_runs],
                    [
                        # the base run is not tested against itself
                        f"{ttest_rel(df_a, df_all[col])[1][0]:.4f}"
                        if col != run1 else "X"
                        for col in ranked_runs
                    ],
                ],
                line_color="darkslategray",
                fill_color=[[
                    mark_current_runs(run1, run2, run)
                    for run in ranked_runs
                ]],
                align=["left", "center"],
                font=dict(color="darkslategray", size=11),
            ),
        )
    ])
    fig_table.update_layout(
        plot_bgcolor="white",
        title="Table",
        title_x=0.5,
        yaxis_title=pretty_metric[metric],
    )

    return fig_a, fig_b, fig_box, fig_table
示例#8
0
     style={
         "display": "flex",
         "justify-content": "center"
     },
 ),
 html.Div(
     [
         html.Div(
             [
                 html.P("base"),
                 dcc.Dropdown(
                     id="dropdown-run-1",
                     options=[{
                         "label": file,
                         "value": file
                     } for file in files_in_folder("runs")],
                     value=files_in_folder("runs")[0],
                     style={"width": 180},
                 ),
             ],
             style=style_block,
         ),
         html.Div(
             [
                 html.P("alternative"),
                 dcc.Dropdown(
                     id="dropdown-run-2",
                     options=[{
                         "label": file,
                         "value": file
                     } for file in files_in_folder("runs")],
def main(
	max_count=("c", 100, "Maximum count of filenames to display"),
	root=("r", "", "E-library root")
):
	"""
	Try to match the "*.pdf" files under root that are not referenced by
	item_index["filename"] with the entries of items, and print the result.

	For every unreferenced file a search is built from its metadata and
	applied to items: a single hit is remembered, no hit or several hits
	are reported. At most max_count files are processed. Afterwards a
	"filename = {...}" line is printed for each matched item and, when
	len(items) is below max_count, every entry of items is listed as not
	digitized.

	NOTE(review): the tuple defaults look like (short option, default,
	help) specs for a command-line wrapper outside this view -- confirm.
	"""
	if (len(root) == 0) or (not os.path.isdir(root)):
		print("Root folder is inaccessible")
		sys.exit(1)

	root = os.path.abspath(root)

	# filename in database is relative, but begins from /
	known_paths = {
		os.path.join(root, db_name[1:])
		for db_name in item_index["filename"].keys()
	}

	candidates = utils.files_in_folder(root, "*.pdf", excludes=EXCLUDED_FOLDERS)
	files = [path for path in candidates if path not in known_paths]

	print("Going to process {0} items".format(len(items)))
	print("Going to process {0} files".format(len(files)))
	processed = 0
	matches = dict()
	for pdf_path in files:
		relpath = "/" + os.path.relpath(pdf_path, root)

		metadata = utils.extract_metadata_from_file(pdf_path)
		item_search = utils.create_search_from_metadata(metadata)

		found_items = [entry for entry in items if item_search(entry)]
		if not found_items:
			print("Nothing found for file '{relpath}'".format(
				relpath=relpath,
			))
		elif len(found_items) == 1:
			# unique hit: collect all paths that resolve to the same item
			matches.setdefault(found_items[0], set()).add(relpath)
		else:
			print("Found multiple items for '{relpath}':\n\t{sources}".format(
				sources=[found.source() for found in found_items],
				relpath=relpath
			))

		# counts processed files, whether or not they produced a match
		processed += 1
		if processed >= max_count:
			print("Reached maxcount. Exiting")
			break

	for item, paths in sorted(matches.items(), key=lambda pair: pair[0].source()):
		print("Filename for {id} ({source}):".format(
			id=item.id(),
			source=item.source(),
		))
		separator = " {0} ".format(config.parser.list_sep)
		print("filename = {{{relpath}}}".format(
			relpath=separator.join(sorted(paths))
		))

	if len(items) < max_count:
		for item in sorted(items, key=lambda entry: entry.source()):
			print("Item isn't digitized: {id} ({source})".format(
				id=item.id(),
				source=item.source()))
示例#10
0
import utils

# Call the project's files_in_folder helper with the current directory and
# the string 'files.csv'; the returned value is not stored or used here.
utils.files_in_folder('.', 'files.csv')