def print_sum_of_each_sheet(workbook_folder, file_name_contains=None, normalized = False):


    workbooks_paths = []
    if file_name_contains == None:
        workbooks_paths.extend(FileBrowser.returnFilePathsFromDirectory(workbook_folder))
    else:
        for e in file_name_contains:
            workbooks_paths.extend(FileBrowser.returnFilesWithEnding(workbook_folder, str(e + ".xls")))

    pre_sheets = get_pre_sheets_and_path(workbooks_paths)
    '''
    for path, pre_sheet in pre_sheets:
        pre_sheet_rows = pre_sheet.nrows
        sum = 0
        for r in range(1, pre_sheet_rows):
            pre_row = pre_sheet.row_values(r)
            sum += float(pre_row[3])
        print path + " = "+sum
    '''
    for path, pre_sheet in pre_sheets:
        if normalized:
            score_max, score_min = find_min_max(pre_sheet)
        pre_sheet_rows = pre_sheet.nrows
        sum = 0
        for r in range(1, pre_sheet_rows):
            pre_row = pre_sheet.row_values(r)
            if normalized:
                score = normalize(float(pre_row[3]), score_max, score_min)
            else:
                score = float(pre_row[3])
            sum += score
        print path + "\t" + str(sum)
def generate(path_location, path_doc):

    pathDictionary = path_location + "/dictionary.dict"
    pathCorpus = path_location + "/corpus.mm"
    pathDoc = path_doc
    pathBinding = path_location + "/corpus-docs.binding"

    FileBrowser.create_folder_if_not_exists(path_location)

    from gensim import corpora

    # pathDictionary = '/Volumes/My Passport/gensim-wiki/dictionary.dict'
    # pathCorpus = '/Volumes/My Passport/gensim-wiki/corpus.mm'

    from time import time

    tStart = time()

    # Generate a list of files
    listFiles = FileBrowser.returnFilePathsFromDirectory(pathDoc)

    iterText = ParseWikiText(listFiles)
    dictionary = corpora.Dictionary(text.lower().split() for text in iterText)

    # remove stop words and words that appear only once
    stoplist = set(nltk.corpus.stopwords.words("english"))
    stop_ids = [
        dictionary.token2id[stopword] for stopword in stoplist
        if stopword in dictionary.token2id
    ]
    once_ids = [
        tokenid for tokenid, docfreq in dictionary.dfs.iteritems()
        if docfreq == 1
    ]
    dictionary.filter_tokens(
        stop_ids +
        once_ids)  # remove stop words and words that appear only once
    dictionary.compactify()
    dictionary.save(pathDictionary)

    corpus = GenerateWikiCorpus(listFiles, dictionary)
    corpora.MmCorpus.serialize(pathCorpus, corpus)

    # Save index to file

    import pickle
    pickle.dump(listIDs, open(pathBinding, 'w'))

    # for i in range(0,len(IDs)):
    #    print "{0}\t{1}".format(i,IDs[i])

    tEnd = time()

    print "Running time: %f" % (tEnd - tStart)
def generate_similar_document_spreadsheets(topic_models, save_location):
    for model in topic_models:
        print model
        file_name = re.match(".*\/(.*)", model).group(1)
        save_path = save_location + "/" + file_name + ".xls"
        index_path = pathIndex + "/" + file_name + ".index"
        if not FileBrowser.file_exists(save_path) and FileBrowser.file_exists(index_path):
            current_sim = mergeFromEachTopicModel(model)
            if current_sim != None:
                doc_ids = groupDocs(current_sim)
            # printRangeOfDocs(doc_ids)
            gen_spreadsheet(doc_ids, save_location, file_name)
def generate(path_location, path_doc):

    pathDictionary = path_location + "/dictionary.dict"
    pathCorpus = path_location + "/corpus.mm"
    pathDoc = path_doc
    pathBinding = path_location + "/corpus-docs.binding"

    FileBrowser.create_folder_if_not_exists(path_location)

    from gensim import corpora

    # pathDictionary = '/Volumes/My Passport/gensim-wiki/dictionary.dict'
    # pathCorpus = '/Volumes/My Passport/gensim-wiki/corpus.mm'


    from time import time

    tStart = time()

    # Generate a list of files
    listFiles = FileBrowser.returnFilePathsFromDirectory(pathDoc)

    iterText = ParseWikiText(listFiles)
    dictionary = corpora.Dictionary(text.lower().split() for text in iterText)


    # remove stop words and words that appear only once
    stoplist = set(nltk.corpus.stopwords.words("english"))
    stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1]
    dictionary.filter_tokens(stop_ids + once_ids)  # remove stop words and words that appear only once
    dictionary.compactify()
    dictionary.save(pathDictionary)

    corpus = GenerateWikiCorpus(listFiles, dictionary)
    corpora.MmCorpus.serialize(pathCorpus, corpus)

    # Save index to file

    import pickle
    pickle.dump(listIDs, open(pathBinding, 'w'));

    # for i in range(0,len(IDs)):
    #    print "{0}\t{1}".format(i,IDs[i])

    tEnd = time()

    print "Running time: %f" % (tEnd - tStart)
# 示例#5 ("Example #5" -- scrape artifact separating pasted snippets)
# 0
 def __init__(self):
     """Construct the dialog's widgets and the file-browser helper."""
     # NOTE(review): `Dialog` is not defined in this fragment -- presumably
     # the parent widget passed in from the surrounding UI code; confirm.
     self.error_dialog = QtWidgets.QErrorMessage()
     self.verticalLayoutWidget = QtWidgets.QWidget(Dialog)
     self.verticalLayout = QtWidgets.QVBoxLayout(self.verticalLayoutWidget)
     self.pushButton = QtWidgets.QPushButton(self.verticalLayoutWidget)
     self.textBrowser = QtWidgets.QTextBrowser(self.verticalLayoutWidget)
     self.pushButton_2 = QtWidgets.QPushButton(self.verticalLayoutWidget)
     # Project-local file browser helper.
     self.fb = FileBrowser.MyFileBrowser()
def list_all_from_spreadsheet(workbook_folder, file_name_contains=None):
    """Merge the data rows of every matching workbook into one list.

    workbook_folder -- directory containing the .xls workbooks
    file_name_contains -- optional iterable of name fragments; when given,
        only files ending in "<fragment>.xls" are processed

    Returns a list of [col0, int(col1), col2, float(col3)] rows; row 0 of
    each sheet is skipped (presumably a header -- TODO confirm).
    """
    merged_spreadsheets = []

    workbooks_paths = []
    if file_name_contains is None:  # fix: identity comparison with None
        workbooks_paths.extend(FileBrowser.returnFilePathsFromDirectory(workbook_folder))
    else:
        for fragment in file_name_contains:
            workbooks_paths.extend(FileBrowser.returnFilesWithEnding(workbook_folder, str(fragment + ".xls")))

    pre_sheets = get_pre_sheets(workbooks_paths)

    for pre_sheet in pre_sheets:
        for r in range(1, pre_sheet.nrows):
            pre_row = pre_sheet.row_values(r)
            merged_spreadsheets.append(
                [pre_row[0], int(pre_row[1]), pre_row[2], float(pre_row[3])])

    return merged_spreadsheets
def generateFromEachTopicModel(model):
    model_type = re.match(".*\.(.*)",model).group(1)
    corpus_type = re.match(".*model-(tfidf|bow)",model).group(1)
    file_name = re.match(".*\/(.*)",model).group(1)
    pathCurrentIndex = pathIndex+"/"+file_name+".index"
    index = similarities.Similarity.load(pathCurrentIndex)
    save_location = pathFolder+"/spreadsheets-simple/"+file_name
    FileBrowser.create_folder_if_not_exists(save_location)
    print file_name
    if model_type == "lsi":
        lsi = models.LsiModel.load(model)
        generateResults(lsi,index,corpus_type,articles,save_location)
    elif model_type == "lda":
        lda = models.LdaModel.load(model)
        generateResults(lda,index,corpus_type,articles,save_location)
    elif model_type == "rp":
        rp = models.RpModel.load(model)
        generateResults(rp,index,corpus_type,articles,save_location)
    elif model_type == "hdp":
        hdp = models.HdpModel.load(model)
        generateResults(hdp,index,corpus_type,articles,save_location)
# 示例#8 ("Example #8" -- scrape artifact separating pasted snippets)
# 0
    def OpenFileBrowser(self):
        """Open a file dialog for the first flat button and store the path.

        Scans the buttons in order; the first one reporting isFlat() gets a
        file-picker, and the chosen path is stored in self.sound at that
        button's position.
        """
        for position, button in enumerate(self.buttons):
            print(button.isFlat())
            if button.isFlat():
                print(position)
                dialog = FileBrowser.FileDialog()
                chosen = dialog.openFileNameDialog()

                self.sound[position] = chosen
                break
# 示例#9 ("Example #9" -- scrape artifact separating pasted snippets)
# 0
    def LoadCurrentScreen(self, setScreen, state=0):
        """Tear down the current screen and load the one named *setScreen*.

        setScreen -- screen name: "Print", "PrinterInfo", "Jog",
            "Calibration", "FilamentChange", "Settings", "FileBrowser",
            or "About"
        state -- forwarded only to the Printing screen
        """
        # Kill the active screen (if any) before constructing the new one.
        if (self.currentScreen is not None):
            print("New Screen")
            self.currentScreen.KillAll()
            self.currentScreen = None

        if (setScreen == "Print"):
            self.currentScreen = Printing.PrintScreen(
                self.screen, self.printingScreenLoader, self.beeCon, state)
        else:
            if setScreen == "PrinterInfo":
                self.currentScreen = PrinterInfo.PrinterInfoScreen(
                    self.screen, self.printerInfoScreenLoader, self.beeCon)
            elif setScreen == "Jog":
                self.currentScreen = Jog.JogScreen(self.screen, self.jogLoader,
                                                   self.beeCon)
            elif setScreen == "Calibration":
                self.currentScreen = Calibration.CalibrationScreen(
                    self.screen, self.calibrationLoader, self.beeCon)
            elif setScreen == "FilamentChange":
                self.currentScreen = FilamentChange.FilamentChangeScreen(
                    self.screen, self.filamentChangeLoader, self.beeCon)
            elif setScreen == "Settings":
                self.currentScreen = Settings.SettingsScreen(
                    self.screen, self.settingsLoader, self.beeCon)
            elif setScreen == "FileBrowser":
                self.currentScreen = FileBrowser.FileBrowserScreen(
                    self.screen, self.fileBrowserLoader, self.beeCon)
            elif setScreen == "About":
                self.currentScreen = About.AboutScreen(self.screen,
                                                       self.aboutLoader,
                                                       self.beeCon)

            # NOTE(review): currentScreenName is updated only in this else
            # branch -- the "Print" path above leaves it stale. Looks
            # intentional but confirm against the caller's expectations.
            self.currentScreenName = self.currentScreen.GetCurrentScreenName()

        return
# 示例#10 ("Example #10" -- scrape artifact separating pasted snippets)
# 0
    def saveRenderedImageAs(self):
        """Save the 2D view's rendered image via FileBrowser.saveImageAs."""
        FileBrowser.saveImageAs(self.view2D)

# --- Script: load corpus artifacts and generate similarity spreadsheets ---
# NOTE(review): pathBinding/pathCorpus/pathDictionary/pathTFIDF/pathModel/
# pathFolder are read here but not defined in this fragment -- presumably
# set earlier in the original file; confirm.
import pickle
with open(pathBinding, 'r') as binding_file:  # fix: close the file handle
    binding = pickle.load(binding_file)

corpus = corpora.MmCorpus(pathCorpus)
dictionary = corpora.Dictionary.load(pathDictionary)
tfidf = models.TfidfModel.load(pathTFIDF)

corpus_tfidf = tfidf[corpus]


tot_sims = []

# Collect every saved topic model of each supported type.
topic_models = []
topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel,"lda"))
topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel,"lsi"))
topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel,"rp"))
topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel,"hdp"))


result_location = pathFolder + "/relevant-docs"
save_location = pathFolder + "/relevant-docs"
#spreadsheets_paths = [str(result_location+"/relevant-docs-test (Responses).xlsx"),str(result_location+"/relevant-docs-normalized (Responses).xlsx")]
#articles = groupArticlesFromSpreadsheets(articles, get_pre_sheets(spreadsheets_paths))


generate_similar_document_spreadsheets(topic_models, save_location)

import xlwt
import FileBrowser
import sys
import re


# Script expects exactly one CLI argument: the folder of qrel model outputs.
if sys.argv.__len__() == 2:
    qrel_models_folder = sys.argv[1]
else:
    print "qrel_models_folder"
    quit()

folders = FileBrowser.returnAllDirs(qrel_models_folder)

# Results spreadsheet with a header row for the evaluation metrics.
workbook = xlwt.Workbook()
sheet = workbook.add_sheet("Results")
row = 0
sheet.write(row,0,"Model type")
sheet.write(row,1,"Corpus type")
sheet.write(row,2,"Topics")
sheet.write(row,3,"MAP")
sheet.write(row,4,"NDCG")

for folder in folders:
    # NOTE(review): the loop body below is only a string literal (a no-op
    # sketch of the intended parsing logic) followed by what looks like the
    # tail of a truncated `articles = [...]` list literal from a different
    # snippet -- the unmatched `]` is a syntax error. Restore from the
    # original source before running.
    '''
    if ending hdp
        get bow/tfidf and topics "-"
    else:
        get bow/tfidf and topics

    '''
    "Iron fertilization",
    "Carbon cycle",
    "Carbon sink",
    "Oceanography",
    "Ocean chemistry",
    "Global warming",
    "Ocean acidification",
    "Oxygen minimum zone",
    "Marine ecosystem",
    "Aquatic ecosystem",
    "Biomass (ecology)",
    "Blue carbon"
]

# Collect only the LSI model trained on the tfidf corpus with 500 topics.
topic_models = []
topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel,"model-tfidf-500.lsi"))

#topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel,"lda"))
#topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel,"lsi"))
#topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel,"rp"))
#topic_models.extend(FileBrowser.returnFilesWithEnding(pathModel,"hdp"))

#for model in topic_models:
#    generateFromEachTopicModel(model)


result_location = pathFolder + "/relevant-docs"
save_location = pathFolder + "/relevant-docs-extended"
spreadsheets_paths = [str(result_location+"/relevant-docs-test (Responses).xlsx"),str(result_location+"/relevant-docs-normalized (Responses).xlsx")]
# NOTE(review): `articles` is read here but its only visible definition in
# this chunk is a truncated list literal -- confirm it is fully defined in
# the original file.
articles = groupArticlesFromSpreadsheets(articles, get_pre_sheets(spreadsheets_paths))
# NOTE(review): this fragment duplicates the qrel spreadsheet set-up earlier
# in the file (differing only in whitespace); confirm which copy is canonical.
import xlwt
import FileBrowser
import sys
import re

# Script expects exactly one CLI argument: the folder of qrel model outputs.
if sys.argv.__len__() == 2:
    qrel_models_folder = sys.argv[1]
else:
    print "qrel_models_folder"
    quit()

folders = FileBrowser.returnAllDirs(qrel_models_folder)

# Results spreadsheet with a header row for the evaluation metrics.
workbook = xlwt.Workbook()
sheet = workbook.add_sheet("Results")
row = 0
sheet.write(row, 0, "Model type")
sheet.write(row, 1, "Corpus type")
sheet.write(row, 2, "Topics")
sheet.write(row, 3, "MAP")
sheet.write(row, 4, "NDCG")

for folder in folders:
    # NOTE(review): loop body is only a no-op string literal sketching the
    # intended logic -- the implementation is missing from this chunk.
    '''
    if ending hdp
        get bow/tfidf and topics "-"
    else:
        get bow/tfidf and topics

    '''
def getSimModels(sim_model_folder_path):
    """Return the paths of every similarity model file in the folder."""
    model_paths = FileBrowser.returnFilePathsFromDirectory(sim_model_folder_path)
    return model_paths
import sys

# Make the tool importable regardless of the current working directory.
path = r'C:\MyGit\FileBrowser'
if path not in sys.path:  # fix: idiomatic `not in` instead of `not path in`
    sys.path.append(path)

# import module
import FileBrowser
# NOTE(review): bare `reload` is a Python 2 builtin; on Python 3 this needs
# importlib.reload instead.
reload(FileBrowser)

# create the file browser
fb = FileBrowser.MyQtApp(True)
fb.show()
# 示例#17 ("Example #17" -- scrape artifact separating pasted snippets)
# 0
	def open_browser(self):
		"""Open the file browser and insert the chosen path into the entry."""
		browser = FileBrowser()
		filepath = browser.openfile()
		# Insert at position 0 of the (presumably Tkinter) entry widget.
		self.filepath_entry.insert(0, filepath)
def generateResults(model_type, index, corpus_type, articles, save_location):
    """For each article title, query the similarity index with the loaded
    topic model and write the top matches to a per-article .xls spreadsheet.

    model_type -- a loaded gensim model, applied as model_type[query]
    index -- gensim similarity index queried as index[model_query]
    corpus_type -- "tfidf" queries are tfidf-transformed first; anything
        else is treated as plain bag-of-words
    articles -- iterable of article titles to process
    save_location -- folder where the spreadsheets are written
    """
    for article in articles:
        query_id = WikiAPI.get_pageid_from_article_title("en",article)
        query_id = str(query_id)
        #print query_id
        title_query = WikiAPI.get_title_from_pageid("en",query_id)
        article_paths = returnArticlePaths(pathDoc)
        spreadsheet_location = save_location+"/"+query_id+'-'+str(title_query)+".xls"
        # Skip articles whose spreadsheet was already generated.
        if not FileBrowser.file_exists(spreadsheet_location):
            print str(article) + " - generating sim docs"
            location = getLocation(query_id)
            if location == None:
                print("\tArticle " + str(title_query) + " not found")
                continue
            article_text, index_query = getText(location, query_id)

            # Convert the article text to the model's query representation.
            text_query = dictionary.doc2bow(article_text.lower().split())
            if corpus_type == "tfidf":
                text_query = tfidf[text_query]


            model_query = model_type[text_query]


            sims = index[model_query] # perform a similarity query against the corpus
            # Sort by descending similarity, keeping the corpus position.
            sims = sorted(enumerate(sims), key=lambda item: -item[1])
            #print(sims)

            import xlwt

            #print "title\tid\turl\tsimilarity"
            #print "\n\n"
            #print "Similarity to query "+index_query+"\n"
            #print article_text
            excel_workbook = xlwt.Workbook(encoding='utf8')
            excel_sheet = excel_workbook.add_sheet("test")
            #excel.
            #excel.add_sheet("test")
            #limit = 101
            # Write the top `limit` matches; pos_diff is a leftover of the
            # commented-out self-match skipping logic and stays 0 here.
            limit = 1000
            pos_diff = 0
            i = 0

            # NOTE(review): if `sims` has fewer than `limit` entries this
            # loop raises IndexError -- confirm the corpus is large enough.
            while i != limit:
                id_article = binding[sims[i][0]][0]
                #if id_article == query_id:
                #    limit+=1
                #    pos_diff+=1
                #else:
                similarity_rate = sims[i][1]
                #solr_query = s.query('id:"'+id_article+'"')

                title = WikiAPI.get_title_from_pageid("en",id_article)
                #title = str(solr_query.results[0]['title'][0])
                #location = str(solr_query.results[0]['location'][0])
                url = WikiAPI.get_fullurl_from_pageid("en",id_article)
                #print "{0} -> {1} -> {2}".format(sims[i], id_article, title)
                #print "{0}\t{1}\t{2}\t{3}".format(title, id_article, url, similarity_rate)
                #print WikiAPI.get_summary_from_pageid("en",id_article)+'\n\n'
                #text = getText(location,id_article)[0]

                excel_sheet.write(i-pos_diff, 0, title)
                excel_sheet.write(i-pos_diff, 1, id_article)
                excel_sheet.write(i-pos_diff, 2, url)
                excel_sheet.write(i-pos_diff, 3, float(similarity_rate))
                i+=1
            excel_workbook.save(save_location+"/"+query_id+'-'+str(title_query)+".xls")
        else:
            print str(article) + " exists"
    # NOTE(review): qrel_file/result_file are not defined in this function --
    # these two closes look pasted in from a gen_qrel-style function and
    # would raise NameError; verify against the original source.
    qrel_file.close()
    result_file.close()


# Script expects four CLI arguments: the relevant-docs spreadsheet, the
# folder of similarity-model folders, the qrel destination folder, and the
# path to the trec_eval binary.
if sys.argv.__len__() == 5:
    rel_docs_ss_path = sys.argv[1]
    sim_models_folders_path = sys.argv[2]
    qrel_dest_folder_path = sys.argv[3]
    trec_eval_loc = sys.argv[4]
else:
    print "rel_docs_ss_path sim_models_folders_path qrel_folder_path trec_eval_loc"
    quit()

rel_docs = getRelevantDocs(rel_docs_ss_path)

# For each model folder: build qrel/result files and run trec_eval on them.
for sim_model_folder_path in FileBrowser.returnAllDirs(sim_models_folders_path):
    print sim_model_folder_path

    file_name = re.match(".*\/(.*)",sim_model_folder_path).group(1)
    qrel_folder_path = qrel_dest_folder_path + "/" + file_name
    FileBrowser.create_folder_if_not_exists(qrel_folder_path)
    qrel_file_path = qrel_folder_path +"/qrel.txt"
    result_file_path = qrel_folder_path +"/qrel-res.txt"
    output_file_path = qrel_folder_path +"/output.txt"
    gen_qrel(sim_model_folder_path, rel_docs, qrel_file_path, result_file_path)
    # NOTE(review): command built by string concatenation and run through a
    # shell -- paths containing spaces or metacharacters will break; consider
    # subprocess.call with an argument list.
    system_command = trec_eval_loc +" -q -m all_trec "+ qrel_file_path+" "+result_file_path +" > " +output_file_path
    print system_command
    os.system(system_command)


#"/Users/Mateusz/Desktop/spreadsheets-en-improved/ternary/qrel.txt"
    # NOTE(review): these two indented close() calls are orphaned from a
    # gen_qrel-style function body (unexpected indent at module level here)
    # and reference names undefined in this chunk -- scrape artifact; the
    # rest of this fragment duplicates the trec_eval script above.
    qrel_file.close()
    result_file.close()


# Script expects four CLI arguments (see the duplicate fragment above).
if sys.argv.__len__() == 5:
    rel_docs_ss_path = sys.argv[1]
    sim_models_folders_path = sys.argv[2]
    qrel_dest_folder_path = sys.argv[3]
    trec_eval_loc = sys.argv[4]
else:
    print "rel_docs_ss_path sim_models_folders_path qrel_folder_path trec_eval_loc"
    quit()

rel_docs = getRelevantDocs(rel_docs_ss_path)

# For each model folder: build qrel/result files and run trec_eval on them.
for sim_model_folder_path in FileBrowser.returnAllDirs(
        sim_models_folders_path):
    print sim_model_folder_path

    file_name = re.match(".*\/(.*)", sim_model_folder_path).group(1)
    qrel_folder_path = qrel_dest_folder_path + "/" + file_name
    FileBrowser.create_folder_if_not_exists(qrel_folder_path)
    qrel_file_path = qrel_folder_path + "/qrel.txt"
    result_file_path = qrel_folder_path + "/qrel-res.txt"
    output_file_path = qrel_folder_path + "/output.txt"
    gen_qrel(sim_model_folder_path, rel_docs, qrel_file_path, result_file_path)
    # NOTE(review): shell command built by concatenation -- fragile with
    # spaces/metacharacters in paths.
    system_command = trec_eval_loc + " -q -m all_trec " + qrel_file_path + " " + result_file_path + " > " + output_file_path
    print system_command
    os.system(system_command)

#"/Users/Mateusz/Desktop/spreadsheets-en-improved/ternary/qrel.txt"
#"/Users/Mateusz/Desktop/spreadsheets-en-improved/ternary/qrel-res.txt"
def getSimModels(sim_model_folder_path):
    """Return the paths of every similarity model file in the folder.

    NOTE(review): duplicates an identical definition earlier in this file.
    """
    return FileBrowser.returnFilePathsFromDirectory(sim_model_folder_path)
import FileBrowser
import SolrAPI
import xlAPI
import sys

if sys.argv.__len__() == 4:
    pathXml = sys.argv[1]
    pathDoc = sys.argv[2]
    pathMRNER = sys.argv[3]
    pathTextDocsFolder = sys.argv[3]+"/docs"
else:
    print "pathXml pathDoc pathMRNER"
    quit()


FileBrowser.create_folder_if_not_exists(pathTextDocsFolder)
s = SolrAPI.SolrAPI('enwiki-20160113_no-text')

sheet = xlAPI.getSpreadsheetFile(pathXml)
col_ID = 1

for i in range(0,sheet.nrows):
    article_id = int(sheet.cell(i,col_ID).value)
    title = s.get_title_from_id(article_id).replace("/","_")
    location = s.get_location_from_id(article_id)
    text = SimilarityMeasureAPI.getText(pathDoc,location,article_id)
    file_name = "/"+str(article_id)+" - "+str(title)+".txt"
    print str(title)
    file = open(str(pathTextDocsFolder+file_name),"w")
    file.write(text)
    file.close()