def print_sum_of_each_sheet(workbook_folder, file_name_contains=None, normalized=False):
    workbooks_paths = []
    if file_name_contains is None:
        workbooks_paths.extend(FileBrowser.returnFilePathsFromDirectory(workbook_folder))
    else:
        for e in file_name_contains:
            workbooks_paths.extend(FileBrowser.returnFilesWithEnding(workbook_folder, str(e + ".xls")))
    pre_sheets = get_pre_sheets_and_path(workbooks_paths)
    for path, pre_sheet in pre_sheets:
        if normalized:
            score_max, score_min = find_min_max(pre_sheet)
        pre_sheet_rows = pre_sheet.nrows
        total = 0
        for r in range(1, pre_sheet_rows):
            pre_row = pre_sheet.row_values(r)
            if normalized:
                score = normalize(float(pre_row[3]), score_max, score_min)
            else:
                score = float(pre_row[3])
            total += score
        print path + "\t" + str(total)
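# print_sum_of_each_sheet calls find_min_max() and normalize(), which are not defined
# in this snippet. A minimal sketch under the assumption that column 3 holds the
# similarity score and that normalize() performs plain min-max scaling; the real
# helpers may differ.
def find_min_max(pre_sheet):
    # scan every data row (row 0 is the header) and return (max, min) of the score column
    scores = [float(pre_sheet.row_values(r)[3]) for r in range(1, pre_sheet.nrows)]
    return max(scores), min(scores)


def normalize(score, score_max, score_min):
    # rescale a score into [0, 1]; guard against a constant column
    if score_max == score_min:
        return 0.0
    return (score - score_min) / (score_max - score_min)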
def generate(path_location, path_doc):
    pathDictionary = path_location + "/dictionary.dict"
    pathCorpus = path_location + "/corpus.mm"
    pathDoc = path_doc
    pathBinding = path_location + "/corpus-docs.binding"
    FileBrowser.create_folder_if_not_exists(path_location)

    from gensim import corpora
    from time import time
    tStart = time()

    # Generate a list of files and build the dictionary from their text
    listFiles = FileBrowser.returnFilePathsFromDirectory(pathDoc)
    iterText = ParseWikiText(listFiles)
    dictionary = corpora.Dictionary(text.lower().split() for text in iterText)

    # remove stop words and words that appear only once
    stoplist = set(nltk.corpus.stopwords.words("english"))
    stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
                if stopword in dictionary.token2id]
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1]
    dictionary.filter_tokens(stop_ids + once_ids)
    dictionary.compactify()
    dictionary.save(pathDictionary)

    corpus = GenerateWikiCorpus(listFiles, dictionary)
    corpora.MmCorpus.serialize(pathCorpus, corpus)

    # Save the corpus-to-document binding to file.
    # NOTE: listIDs is expected to be built elsewhere; it is not defined in this snippet.
    import pickle
    pickle.dump(listIDs, open(pathBinding, 'w'))

    tEnd = time()
    print "Running time: %f" % (tEnd - tStart)
def generate_similar_document_spreadsheets(topic_models, save_location):
    for model in topic_models:
        print model
        file_name = re.match(".*\/(.*)", model).group(1)
        save_path = save_location + "/" + file_name + ".xls"
        index_path = pathIndex + "/" + file_name + ".index"
        if not FileBrowser.file_exists(save_path) and FileBrowser.file_exists(index_path):
            current_sim = mergeFromEachTopicModel(model)
            if current_sim is not None:
                doc_ids = groupDocs(current_sim)
                # printRangeOfDocs(doc_ids)
                gen_spreadsheet(doc_ids, save_location, file_name)
def __init__(self):
    self.error_dialog = QtWidgets.QErrorMessage()
    self.verticalLayoutWidget = QtWidgets.QWidget(Dialog)
    self.verticalLayout = QtWidgets.QVBoxLayout(self.verticalLayoutWidget)
    self.pushButton = QtWidgets.QPushButton(self.verticalLayoutWidget)
    self.textBrowser = QtWidgets.QTextBrowser(self.verticalLayoutWidget)
    self.pushButton_2 = QtWidgets.QPushButton(self.verticalLayoutWidget)
    self.fb = FileBrowser.MyFileBrowser()
def list_all_from_spreadsheet(workbook_folder, file_name_contains=None):
    merged_spreadsheets = []
    workbooks_paths = []
    if file_name_contains is None:
        workbooks_paths.extend(FileBrowser.returnFilePathsFromDirectory(workbook_folder))
    else:
        for e in file_name_contains:
            workbooks_paths.extend(FileBrowser.returnFilesWithEnding(workbook_folder, str(e + ".xls")))
    pre_sheets = get_pre_sheets(workbooks_paths)
    for pre_sheet in pre_sheets:
        pre_sheet_rows = pre_sheet.nrows
        for r in range(1, pre_sheet_rows):
            pre_row = pre_sheet.row_values(r)
            row = [pre_row[0], int(pre_row[1]), pre_row[2], float(pre_row[3])]
            merged_spreadsheets.append(row)
    return merged_spreadsheets
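# A hypothetical usage sketch of list_all_from_spreadsheet; the folder path and the
# file-name filters below are placeholders, not from the original code. Each merged
# row appears to follow the column layout written by generateResults:
# title, page id, url, similarity score.
rows = list_all_from_spreadsheet("/path/to/spreadsheets", file_name_contains=["lsi", "lda"])
for title, page_id, url, score in rows:
    print "%s\t%d\t%s\t%f" % (title, page_id, url, score)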
def generateFromEachTopicModel(model):
    model_type = re.match(".*\.(.*)", model).group(1)
    corpus_type = re.match(".*model-(tfidf|bow)", model).group(1)
    file_name = re.match(".*\/(.*)", model).group(1)
    pathCurrentIndex = pathIndex + "/" + file_name + ".index"
    index = similarities.Similarity.load(pathCurrentIndex)
    save_location = pathFolder + "/spreadsheets-simple/" + file_name
    FileBrowser.create_folder_if_not_exists(save_location)
    print file_name
    if model_type == "lsi":
        lsi = models.LsiModel.load(model)
        generateResults(lsi, index, corpus_type, articles, save_location)
    elif model_type == "lda":
        lda = models.LdaModel.load(model)
        generateResults(lda, index, corpus_type, articles, save_location)
    elif model_type == "rp":
        rp = models.RpModel.load(model)
        generateResults(rp, index, corpus_type, articles, save_location)
    elif model_type == "hdp":
        hdp = models.HdpModel.load(model)
        generateResults(hdp, index, corpus_type, articles, save_location)
def OpenFileBrowser(self):
    index = 0
    for b in self.buttons:
        print(b.isFlat())
        if b.isFlat():
            index = self.buttons.index(b)
            print(index)
            fileBrowser = FileBrowser.FileDialog()
            path = fileBrowser.openFileNameDialog()
            self.sound[index] = path
            break
def LoadCurrentScreen(self, setScreen, state=0):
    if self.currentScreen is not None:
        print("New Screen")
        self.currentScreen.KillAll()
        self.currentScreen = None

    if setScreen == "Print":
        self.currentScreen = Printing.PrintScreen(
            self.screen, self.printingScreenLoader, self.beeCon, state)
    else:
        if setScreen == "PrinterInfo":
            self.currentScreen = PrinterInfo.PrinterInfoScreen(
                self.screen, self.printerInfoScreenLoader, self.beeCon)
        elif setScreen == "Jog":
            self.currentScreen = Jog.JogScreen(self.screen, self.jogLoader, self.beeCon)
        elif setScreen == "Calibration":
            self.currentScreen = Calibration.CalibrationScreen(
                self.screen, self.calibrationLoader, self.beeCon)
        elif setScreen == "FilamentChange":
            self.currentScreen = FilamentChange.FilamentChangeScreen(
                self.screen, self.filamentChangeLoader, self.beeCon)
        elif setScreen == "Settings":
            self.currentScreen = Settings.SettingsScreen(
                self.screen, self.settingsLoader, self.beeCon)
        elif setScreen == "FileBrowser":
            self.currentScreen = FileBrowser.FileBrowserScreen(
                self.screen, self.fileBrowserLoader, self.beeCon)
        elif setScreen == "About":
            self.currentScreen = About.AboutScreen(self.screen, self.aboutLoader, self.beeCon)

    self.currentScreenName = self.currentScreen.GetCurrentScreenName()
    return
def saveRenderedImageAs(self):
    FileBrowser.saveImageAs(self.view2D)
import pickle
binding = pickle.load(open(pathBinding, 'r'))

corpus = corpora.MmCorpus(pathCorpus)
dictionary = corpora.Dictionary.load(pathDictionary)
tfidf = models.TfidfModel.load(pathTFIDF)
corpus_tfidf = tfidf[corpus]

tot_sims = []

topic_models = []
topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel, "lda"))
topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel, "lsi"))
topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel, "rp"))
topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel, "hdp"))

result_location = pathFolder + "/relevant-docs"
save_location = pathFolder + "/relevant-docs"

#spreadsheets_paths = [str(result_location + "/relevant-docs-test (Responses).xlsx"), str(result_location + "/relevant-docs-normalized (Responses).xlsx")]
#articles = groupArticlesFromSpreadsheets(articles, get_pre_sheets(spreadsheets_paths))

generate_similar_document_spreadsheets(topic_models, save_location)
import xlwt
import FileBrowser
import sys
import re

if len(sys.argv) == 2:
    qrel_models_folder = sys.argv[1]
else:
    print "usage: qrel_models_folder"
    quit()

folders = FileBrowser.returnAllDirs(qrel_models_folder)

workbook = xlwt.Workbook()
sheet = workbook.add_sheet("Results")
row = 0
sheet.write(row, 0, "Model type")
sheet.write(row, 1, "Corpus type")
sheet.write(row, 2, "Topics")
sheet.write(row, 3, "MAP")
sheet.write(row, 4, "NDCG")

for folder in folders:
    '''
    if ending hdp:
        get bow/tfidf and topics "-"
    else:
        get bow/tfidf and topics
    '''
    # (a sketch of this loop body follows below)
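# A speculative sketch of the loop body described by the docstring above, assuming the
# qrel folders are named after the model files seen elsewhere in this project (e.g.
# "model-tfidf-500.lsi", with HDP models carrying no topic count). Extracting MAP and
# NDCG from the trec_eval output files, and saving the workbook, are not shown.
for folder in folders:
    file_name = re.match(r".*/(.*)", folder).group(1)
    model_type = re.match(r".*\.(.*)", file_name).group(1)
    corpus_type = re.match(r".*model-(tfidf|bow)", file_name).group(1)
    if model_type == "hdp":
        topics = "-"
    else:
        topics = re.match(r".*-(\d+)\.", file_name).group(1)
    row += 1
    sheet.write(row, 0, model_type)
    sheet.write(row, 1, corpus_type)
    sheet.write(row, 2, topics)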
"Iron fertilization", "Carbon cycle", "Carbon sink", "Oceanography", "Ocean chemistry", "Global warming", "Ocean acidification", "Oxygen minimum zone", "Marine ecosystem", "Aquatic ecosystem", "Biomass (ecology)", "Blue carbon" ] topic_models = [] topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel,"model-tfidf-500.lsi")) #topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel,"lda")) #topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel,"lsi")) #topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel,"rp")) #topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel,"hdp")) #for model in topic_models: # generateFromEachTopicModel(model) result_location = pathFolder + "/relevant-docs" save_location = pathFolder + "/relevant-docs-extended" spreadsheets_paths = [str(result_location+"/relevant-docs-test (Responses).xlsx"),str(result_location+"/relevant-docs-normalized (Responses).xlsx")] articles = groupArticlesFromSpreadsheets(articles, get_pre_sheets(spreadsheets_paths))
def getSimModels(sim_model_folder_path):
    return FileBrowser.returnFilePathsFromDirectory(sim_model_folder_path)
import sys

# add the tool's path to sys.path so that it can be imported
path = r'C:\MyGit\FileBrowser'
if path not in sys.path:
    sys.path.append(path)

# import (and reload) the module
import FileBrowser
reload(FileBrowser)

# create and show the file browser
fb = FileBrowser.MyQtApp(True)
fb.show()
def open_browser(self):
    browser = FileBrowser()
    filepath = browser.openfile()
    self.filepath_entry.insert(0, filepath)
def generateResults(model_type, index, corpus_type, articles, save_location):
    for article in articles:
        query_id = WikiAPI.get_pageid_from_article_title("en", article)
        query_id = str(query_id)
        title_query = WikiAPI.get_title_from_pageid("en", query_id)
        article_paths = returnArticlePaths(pathDoc)
        spreadsheet_location = save_location + "/" + query_id + '-' + str(title_query) + ".xls"
        if not FileBrowser.file_exists(spreadsheet_location):
            print str(article) + " - generating sim docs"
            location = getLocation(query_id)
            if location is None:
                print("\tArticle " + str(title_query) + " not found")
                continue
            article_text, index_query = getText(location, query_id)
            text_query = dictionary.doc2bow(article_text.lower().split())
            if corpus_type == "tfidf":
                text_query = tfidf[text_query]
            model_query = model_type[text_query]
            # perform a similarity query against the corpus and sort by decreasing similarity
            sims = index[model_query]
            sims = sorted(enumerate(sims), key=lambda item: -item[1])

            import xlwt
            excel_workbook = xlwt.Workbook(encoding='utf8')
            excel_sheet = excel_workbook.add_sheet("test")

            # write the top-ranked documents (title, id, url, similarity) to the spreadsheet
            limit = 1000
            pos_diff = 0
            i = 0
            while i != limit:
                id_article = binding[sims[i][0]][0]
                similarity_rate = sims[i][1]
                title = WikiAPI.get_title_from_pageid("en", id_article)
                url = WikiAPI.get_fullurl_from_pageid("en", id_article)
                excel_sheet.write(i - pos_diff, 0, title)
                excel_sheet.write(i - pos_diff, 1, id_article)
                excel_sheet.write(i - pos_diff, 2, url)
                excel_sheet.write(i - pos_diff, 3, float(similarity_rate))
                i += 1
            excel_workbook.save(save_location + "/" + query_id + '-' + str(title_query) + ".xls")
        else:
            print str(article) + " exists"
# (fragment: these close calls belong to the gen_qrel definition truncated above)
qrel_file.close()
result_file.close()


if len(sys.argv) == 5:
    rel_docs_ss_path = sys.argv[1]
    sim_models_folders_path = sys.argv[2]
    qrel_dest_folder_path = sys.argv[3]
    trec_eval_loc = sys.argv[4]
else:
    print "usage: rel_docs_ss_path sim_models_folders_path qrel_folder_path trec_eval_loc"
    quit()

rel_docs = getRelevantDocs(rel_docs_ss_path)

for sim_model_folder_path in FileBrowser.returnAllDirs(sim_models_folders_path):
    print sim_model_folder_path
    file_name = re.match(".*\/(.*)", sim_model_folder_path).group(1)
    qrel_folder_path = qrel_dest_folder_path + "/" + file_name
    FileBrowser.create_folder_if_not_exists(qrel_folder_path)
    qrel_file_path = qrel_folder_path + "/qrel.txt"
    result_file_path = qrel_folder_path + "/qrel-res.txt"
    output_file_path = qrel_folder_path + "/output.txt"
    gen_qrel(sim_model_folder_path, rel_docs, qrel_file_path, result_file_path)
    # run trec_eval on the generated qrel and result files, redirecting its report to output.txt
    system_command = trec_eval_loc + " -q -m all_trec " + qrel_file_path + " " + result_file_path + " > " + output_file_path
    print system_command
    os.system(system_command)
import FileBrowser
import SolrAPI
import xlAPI
import SimilarityMeasureAPI
import sys

if len(sys.argv) == 4:
    pathXml = sys.argv[1]
    pathDoc = sys.argv[2]
    pathMRNER = sys.argv[3]
    pathTextDocsFolder = sys.argv[3] + "/docs"
else:
    print "usage: pathXml pathDoc pathMRNER"
    quit()

FileBrowser.create_folder_if_not_exists(pathTextDocsFolder)

s = SolrAPI.SolrAPI('enwiki-20160113_no-text')
sheet = xlAPI.getSpreadsheetFile(pathXml)
col_ID = 1

# dump the plain text of every article listed in the spreadsheet to its own .txt file
for i in range(0, sheet.nrows):
    article_id = int(sheet.cell(i, col_ID).value)
    title = s.get_title_from_id(article_id).replace("/", "_")
    location = s.get_location_from_id(article_id)
    text = SimilarityMeasureAPI.getText(pathDoc, location, article_id)
    file_name = "/" + str(article_id) + " - " + str(title) + ".txt"
    print str(title)
    out_file = open(str(pathTextDocsFolder + file_name), "w")
    out_file.write(text)
    out_file.close()