def parse_folder(path):
    """
    Parses all .bib files in given folder.

    Returns a tuple (parsed_items, search_index) containing all items found.

    :param path: path to an existing directory containing ``*.bib`` files
    :raises Exception: if ``path`` is not an existing directory
    """
    if not os.path.isdir(path):
        raise Exception("Path to folder expected")

    parsed_items = []
    files = utils.files_in_folder(path, "*.bib")
    # Parse files in parallel. The context manager guarantees the pool is
    # shut down even if a worker raises (the original leaked workers then).
    with concurrent.futures.ProcessPoolExecutor(
        max_workers=multiprocessing.cpu_count()
    ) as executor:
        futures = [
            executor.submit(
                BibParser()._parse_file,
                os.path.join(path, filename)
            )
            for filename in files
        ]
        for future in futures:
            # future.result() re-raises any exception from the worker
            parsed_items += future.result()

    # In-place sort; the extra list(sorted(...)) copy was redundant.
    parsed_items.sort(key=BibItem.key_to_key_func(const.DEFAULT_ORDER_BY))

    item_index = search_index.Index(parsed_items)
    fin_ctx = FinalizingContext(item_index)
    for item in parsed_items:
        item.finalize_item_set(fin_ctx)
    # Re-index after finalization so the index sees the finalized items.
    item_index.update(parsed_items)
    return (parsed_items, item_index)
def generate_output_template():
    """
    Render an HTML report pairing each test image with its predicted and
    true segmentation masks, and write it to ./reports/index.html.

    Assumes ./output/, ./dataset/test/raw/ and
    ./dataset/test/segmentation_mask/ contain parallel, same-order file
    lists (indexing is positional) — TODO confirm with the pipeline.
    """
    output_images = files_in_folder("./output/")
    test_images = files_in_folder("./dataset/test/raw/")
    true_masks_images = files_in_folder("./dataset/test/segmentation_mask/")

    # One PredictionResult per output image, matched positionally.
    template_predictions = [
        PredictionResult(test_images[i], output_images[i], true_masks_images[i])
        for i in range(len(output_images))
    ]

    env = Environment(loader=FileSystemLoader("./templates"))
    template_output = env.get_template("index.html").render(
        predictions=template_predictions
    )

    reports_path = "./reports"
    # exist_ok avoids the check-then-create race of the original code.
    os.makedirs(reports_path, exist_ok=True)
    # 'with' guarantees the file handle is closed even if write() fails.
    with open(f"{reports_path}/index.html", "w") as text_file:
        text_file.write(template_output)
def parse_folder(self, path: str) -> "list[BibItem]":
    """
    Parses all .bib files in given folder.

    Returns list containing all items found.

    :param path: path to an existing directory containing ``*.bib`` files
    :returns: list of all BibItem objects parsed from the folder
    :raises Exception: if ``path`` is not an existing directory
    """
    # NOTE: the original annotation was ``-> [BibItem]`` — a list *literal*,
    # which is evaluated at def time and is not a valid type annotation.
    if not os.path.isdir(path):
        raise Exception("Path to folder expected")

    parsed_items = []
    for filename in utils.files_in_folder(path, "*.bib"):
        parsed_items += self.parse_file(os.path.join(path, filename))
    return parsed_items
def parse_folder(self, path):
    """
    Collect BibItem entries from every .bib file inside a folder.

    Raises if the given path does not point to an existing directory;
    otherwise returns a flat list of everything parse_file() produced.
    """
    if not os.path.isdir(path):
        raise Exception("Path to folder expected")

    collected = []
    bib_files = utils.files_in_folder(path, "*.bib")
    for bib_file in bib_files:
        collected.extend(self.parse_file(os.path.join(path, bib_file)))
    return collected
def main():
    """
    Print every PDF under the e-library root that is not referenced by
    any item in the database index; exit(1) if the root is inaccessible.
    """
    if not os.path.isdir(config.www.elibrary_dir):
        print("root folder '{elibrary_dir}' is inaccessible".format(
            elibrary_dir=config.www.elibrary_dir
        ))
        sys.exit(1)

    root = config.www.elibrary_dir
    # filename in database is relative, but begins from / — strip the
    # leading slash before joining with the root.
    # (set comprehension replaces the E731 lambda + map of the original)
    filenames = {
        os.path.join(root, file_[1:])
        for file_ in item_index["filename"].keys()
    }

    files = utils.files_in_folder(root, "*.pdf", excludes=EXCLUDED_FOLDERS)
    for file_ in files:
        if file_ not in filenames:
            print(file_)
def main():
    """List e-library PDFs that have no matching filename in the database."""
    root = config.www.elibrary_dir
    if not os.path.isdir(root):
        print("root folder '{elibrary_dir}' is inaccessible".format(
            elibrary_dir=root))
        sys.exit(1)

    #filename in database is relative, but begins from /
    known = set()
    for db_file in item_index["filename"].keys():
        known.add(os.path.join(root, db_file[1:]))

    pdf_files = utils.files_in_folder(root, "*.pdf",
                                      excludes=EXCLUDED_FOLDERS)
    unreferenced = [candidate for candidate in pdf_files
                    if candidate not in known]
    for candidate in unreferenced:
        print(candidate)
def update_graphs(qrels, run1, run2, metric, top):
    """
    Build four Plotly figures comparing two retrieval runs.

    Parameters (inferred from usage — confirm against the caller):
        qrels:  qrels file name, resolved under "qrels/"
        run1:   base run file name, resolved under "runs/"
        run2:   alternative run file name, resolved under "runs/"
        metric: trec_eval metric name; the cutoff n is parsed from ALL
                digits in the name (e.g. "ndcg_cut_10" -> 10), so a metric
                with no digits would raise ValueError here
        top:    topic id shown in the per-topic ranking plot

    Returns:
        (fig_a, fig_b, fig_box, fig_table) — per-topic metric bars,
        per-rank relevance bars for `top`, a box plot over every run found
        in "runs/", and a summary table.
    """
    # qrels_dict is a module-level mapping populated elsewhere;
    # presumably topic -> {docid: relevance} — verify against caller.
    global qrels_dict
    # Per-topic evaluation of both runs, concatenated side by side and
    # ordered by the base run's scores.
    df_a = trec_eval(metric, "qrels/" + qrels, run1)
    df_b = trec_eval(metric, "qrels/" + qrels, run2)
    df = pd.concat([df_a, df_b], axis=1)
    df = df.sort_values(run1, ascending=False)
    # ranking
    ranking_1 = read_run("runs/" + run1)
    ranking_2 = read_run("runs/" + run2)
    # For each topic, label every ranked docid with its qrels relevance.
    df_dict_a = {}
    for topic in ranking_1.keys():
        df_dict_a[topic] = pd.DataFrame(ranking_1[topic], columns=["docid"])
        df_dict_a[topic][run1] = df_dict_a[topic]["docid"].apply(
            lambda x: find_relevance(x, qrels_dict[topic]))
    df_dict_b = {}
    for topic in ranking_2.keys():
        df_dict_b[topic] = pd.DataFrame(ranking_2[topic], columns=["docid"])
        df_dict_b[topic][run2] = df_dict_b[topic]["docid"].apply(
            lambda x: find_relevance(x, qrels_dict[topic]))
    # merge dataframes
    # Positional merge on rank index; pandas suffixes the duplicate
    # "docid" column as docid_x (run1) / docid_y (run2), used below.
    df_dict = {}
    df_dict[top] = df_dict_a[top].merge(df_dict_b[top],
                                        left_index=True,
                                        right_index=True)
    # set n to 5 if @5 or 10 if @10:
    total = 1
    n = int("".join(filter(str.isdigit, metric)))
    # block_size scales the annotation font with the cutoff depth.
    block_size = total / n
    # ndcg plot
    # (pandas plotting with the plotly backend — barmode/update_layout
    # are plotly-only arguments)
    fig_a = df.plot.bar(barmode="group")
    fig_a.update_layout(
        plot_bgcolor="white",
        yaxis_title=pretty_metric[metric],
        xaxis_title="Topic",
        title=(f"{pretty_metric[metric]}: {df_a[run1].mean():.4f}"
               f"({df_a[run1].median():.4f}) & {df_b[run2].mean():.4f}"
               f" ({df_b[run2].median():.4f})"),
        title_x=0.5,
    )
    # ranking annotations docids (should be put in separate function)
    y1 = df_dict[top][run1].values
    y2 = df_dict[top][run2].values
    xcoord = df_dict[top].iloc[0:n].index  # [0,1,2]
    # One text annotation per rank for the base run (docid in grey).
    annotations1 = [
        dict(
            x=8,  # xi-0.2,
            y=xi - (total / 5) + (5 / 30),
            text=df_dict[top]["docid_x"].iloc[xi],
            xanchor="auto",
            yanchor="bottom",
            showarrow=False,
            font={
                "size": block_size * 75,
                "color": "grey"
            },
        ) for xi, yi in zip(xcoord, y1)
    ]
    # Same for the alternative run; mark_new_text/mark_new presumably
    # highlight docids absent from the base top-n — confirm helpers.
    annotations2 = [
        dict(
            x=8,
            y=xi + (total / 5) + (5 / 30),
            text=mark_new_text(
                df_dict[top]["docid_y"].iloc[xi],
                df_dict[top]["docid_x"].iloc[0:n],
            ),
            xanchor="auto",
            yanchor="bottom",
            showarrow=False,
            font={
                "size": block_size * 75,
                "color": mark_new(df_dict[top]["docid_y"].iloc[xi],
                                  df_dict[top]["docid_x"].iloc[0:n]),
            },
        ) for xi, yi in zip(xcoord, y2)
    ]
    annotations = annotations1 + annotations2
    # ranking plot
    fig_b = df_dict[top].iloc[0:n].plot.barh(x=[run1, run2], barmode="group")
    fig_b.update_layout(
        plot_bgcolor="white",
        yaxis={
            "tickmode": "array",
            "tickvals": np.arange(n),
            "ticktext": np.arange(1, n + 1),
            "autorange": "reversed",  # rank 1 at the top
        },
        title=f"{pretty_metric[metric]}: {df_a[run1].loc[top]} & {df_b[run2].loc[top]}",
        title_x=0.5,
        yaxis_title="Rank",
        xaxis={
            "range": [0, 16],
            "fixedrange": True
        },
        xaxis_title="Relevance",
        annotations=annotations,
    )
    # Evaluate every file in runs/; non-TREC files are logged and skipped
    # (best-effort by design).
    df_all = pd.DataFrame()
    for file in files_in_folder("runs"):
        try:
            df_tmp = trec_eval(metric, "qrels/" + qrels, file)
            df_all = pd.concat([df_all, df_tmp], axis=1)
        except Exception as e:
            logging.info(f"{file} is not a TREC run. --> {e}")
            continue
    # Runs ordered by median score (ascending for the box plot,
    # descending for the table).
    medians = df_all.median().sort_values()
    medians_desc = df_all.median().sort_values(ascending=False)
    traces = []
    # NOTE(review): DataFrame.iteritems() was removed in pandas 2.x;
    # items() is the forward-compatible spelling.
    for run_name, run_data in df_all[medians.index].iteritems():
        # Default black; the two selected runs get plotly's default
        # blue/red so they stand out.
        color = "rgb(0, 0, 0)"
        if run_name == run1:
            color = "rgb(99, 110, 250)"
        if run_name == run2:
            color = "rgb(239, 85, 59)"
        traces.append(
            go.Box(
                y=run_data,
                name=run_name,
                boxpoints="all",
                jitter=0.5,
                whiskerwidth=0.2,
                marker=dict(size=2, color=color),
                line=dict(width=1),
            ))
    fig_box = go.Figure(data=traces)
    fig_box.update_layout(
        plot_bgcolor="white",
        title="Overview",
        title_x=0.5,
        yaxis_title=pretty_metric[metric],
        showlegend=False,
    )
    headerColor = "grey"
    # Summary table: per-run novelty %, mean:median of the metric, and a
    # paired t-test p-value against the base run ("X" for the base itself).
    fig_table = go.Figure(data=[
        go.Table(
            header=dict(
                values=[
                    "<b>run</b>",
                    f"<b>percentage new in top {n} (relative to base)</b>",
                    f"<b>{pretty_metric[metric]} (mean : median)</b>",
                    "<b>p-value (H0: Equal average with base)</b>",
                ],
                line_color="darkslategray",
                fill_color=headerColor,
                align=["left", "center"],
                font=dict(color="white", size=12),
            ),
            cells=dict(
                values=[
                    df_all[medians_desc.index].columns,
                    [
                        "{:.2f}%".format(
                            new_percentage(ranking_1,
                                           read_run("runs/" + file), n))
                        for file in df_all[medians_desc.index].columns
                    ],
                    [(f"{round(np.mean(df_all[run]), 4):.4f} : "
                      f"{round(np.median(df_all[run]), 4):.4f}")
                     for run in df_all[medians_desc.index].columns],
                    [
                        f"{ttest_rel(df_a, df_all[col])[1][0]:.4f}"
                        if col != run1 else "X"
                        for col in df_all[medians_desc.index].columns
                    ],
                ],
                line_color="darkslategray",
                # mark_current_runs presumably returns a cell background
                # color highlighting run1/run2 rows — confirm helper.
                fill_color=[[
                    mark_current_runs(run1, run2, run)
                    for run in df_all[medians_desc.index].columns
                ]],
                align=["left", "center"],
                font=dict(color="darkslategray", size=11),
            ),
        )
    ])
    fig_table.update_layout(
        plot_bgcolor="white",
        title="Table",
        title_x=0.5,
        yaxis_title=pretty_metric[metric],
    )
    return fig_a, fig_b, fig_box, fig_table
style={ "display": "flex", "justify-content": "center" }, ), html.Div( [ html.Div( [ html.P("base"), dcc.Dropdown( id="dropdown-run-1", options=[{ "label": file, "value": file } for file in files_in_folder("runs")], value=files_in_folder("runs")[0], style={"width": 180}, ), ], style=style_block, ), html.Div( [ html.P("alternative"), dcc.Dropdown( id="dropdown-run-2", options=[{ "label": file, "value": file } for file in files_in_folder("runs")],
def main(
    # Tuple defaults look like opster/commandr-style CLI option
    # declarations (short flag, default, help) — confirm the decorator
    # that consumes them; the code below treats max_count as an int and
    # root as a str.
    max_count=("c", 100, "Maximum count of filenames to display"),
    root=("r", "", "E-library root")
):
    """
    Match PDF files under the e-library root against database items via
    extracted metadata, and print suggested ``filename = {...}`` entries.

    Files already referenced by the database are skipped; unmatched and
    ambiguously matched files are reported. Processing stops after
    max_count files.
    """
    if (len(root) == 0) or (not os.path.isdir(root)):
        print("Root folder is inaccessible")
        sys.exit(1)
    root = os.path.abspath(root)

    #filename in database is relative, but begins from /
    file_modifier = lambda file_, root=root: os.path.join(root, file_[1:])
    filenames = set(map(file_modifier, item_index["filename"].keys()))

    # Candidate files: every PDF under root not already in the database.
    files = utils.files_in_folder(root, "*.pdf", excludes=EXCLUDED_FOLDERS)
    files_filter = lambda file_: file_ not in filenames
    files = list(filter(files_filter, files))

    # NOTE(review): `items` is a module-level global (all database items);
    # these two near-duplicate prints may be leftover debugging.
    print("Going to process {0} items".format(len(items)))
    print("Going to process {0} files".format(len(files)))

    output_count = 0
    # item -> set of relative paths matched to that item
    output_dict = dict()
    for file_ in files:
        relpath = "/" + os.path.relpath(file_, root)
        # Build a search predicate from the file's embedded metadata and
        # run it over every database item.
        metadata = utils.extract_metadata_from_file(file_)
        item_search = utils.create_search_from_metadata(metadata)
        found_items = list(filter(item_search, items))
        found_count = len(found_items)
        if found_count == 0:
            print("Nothing found for file '{relpath}'".format(
                relpath=relpath,
            ))
        elif found_count == 1:
            # Unique match: accumulate the path under that item.
            item = found_items[0]
            if item in output_dict:
                output_dict[item].add(relpath)
            else:
                output_dict[item] = set([relpath])
        else:
            # Ambiguous match: report all candidate sources.
            source_getter = lambda item: item.source()
            print("Found multiple items for '{relpath}':\n\t{sources}".format(
                sources=list(map(source_getter, found_items)),
                relpath=relpath
            ))
        output_count += 1
        if output_count >= max_count:
            print("Reached maxcount. Exiting")
            break

    # Print uniquely matched items, ordered by source, as parser-ready
    # "filename = {...}" lines joined with the configured list separator.
    sort_key = lambda pair: pair[0].source()
    for item, paths in sorted(output_dict.items(), key=sort_key):
        print("Filename for {id} ({source}):".format(
            id=item.id(),
            source=item.source(),
        ))
        print("filename = {{{relpath}}}".format(
            relpath=" {0} ".format(config.parser.list_sep).join(sorted(paths))
        ))

    # NOTE(review): this compares the TOTAL item count against max_count
    # and then lists every item, matched or not — looks suspicious;
    # confirm whether it was meant to list only undigitized items.
    sort_key = lambda item: item.source()
    if len(items) < max_count:
        for item in sorted(items, key=sort_key):
            print("Item isn't digitized: {id} ({source})".format(
                id=item.id(), source=item.source()))
import utils

# Ad-hoc invocation: looks up entries matching the 'files.csv' pattern in
# the current directory via the project helper. NOTE(review): the return
# value is discarded — presumably run for a side effect or as a smoke
# test of utils.files_in_folder; confirm intent.
utils.files_in_folder('.', 'files.csv')