def processAllRecords(bibData):
    """Process every bibliography record stored in ``bibData``.

    For each value of the mapping the following calls are made, in order:

    1. ``functions.processBibRecord(memexPath, record)`` -- described in the
       original notes as "create folders, copy files".
    2. ``identifyLanguage(record, "eng")`` and then
       ``ocrPublication(memexPath, record["rCite"], language)`` -- the
       "OCR the file" step.

    Relies on the module-level name ``memexPath``.

    Args:
        bibData: Mapping whose values are records providing an ``"rCite"``
            entry. The keys are not used.

    Returns:
        None.
    """
    # Only the records are needed, so iterate the values directly instead of
    # unpacking an unused key from .items().
    for record in bibData.values():
        # 1. create folders, copy files
        functions.processBibRecord(memexPath, record)

        # 2. OCR the file
        language = identifyLanguage(record, "eng")
        ocrPublication(memexPath, record["rCite"], language)
def processAllRecords(bibData):
    """Run the per-record pipeline over all values of ``bibData``.

    Each record is first handed to ``functions.processBibRecord`` together
    with the module-level ``memexPath``; afterwards ``identifyLanguage`` is
    called with the record and ``"eng"``, and its result is passed on to
    ``ocrPublication`` along with ``memexPath`` and the record's ``"rCite"``
    value.

    Args:
        bibData: Mapping of bibliography records. Only the values are read;
            each must support ``record["rCite"]``.

    Returns:
        None.
    """
    # The dictionary keys were never used by the loop body, so the values
    # view is iterated directly.
    for bibRecord in bibData.values():
        # 1. create folders, copy files
        functions.processBibRecord(memexPath, bibRecord)

        # 2. OCR the file
        # "eng" is the second argument of identifyLanguage -- presumably a
        # fallback language; confirm against its definition.
        language = identifyLanguage(bibRecord, "eng")
        citeKey = bibRecord["rCite"]
        ocrPublication(memexPath, citeKey, language)
def processAllRecords(bibData):
    """Apply the folder-setup and OCR steps to each record of ``bibData``.

    Args:
        bibData: Mapping whose values are bibliography records; every record
            must expose an ``"rCite"`` entry. Keys are ignored.

    Returns:
        None. The work happens in ``functions.processBibRecord``,
        ``identifyLanguage`` and ``ocrPublication``.
    """
    # Iterate over the records themselves; the key of each pair is not needed.
    for entry in bibData.values():
        # 1. create folders, copy files
        functions.processBibRecord(memexPath, entry)

        # 2. OCR the file: the value returned by identifyLanguage is passed
        # straight through as the third argument of ocrPublication.
        language = identifyLanguage(entry, "eng")
        ocrPublication(memexPath, entry["rCite"], language)
def processAllRecords(bibData):
    """Process all records of ``bibData`` in a freshly shuffled order.

    The keys are copied into a list and shuffled in place with
    ``random.shuffle``. Each record is then passed to
    ``functions.processBibRecord``, ``identifyLanguage`` and
    ``ocrPublication``, using the module-level ``pathToMemex``.

    Args:
        bibData: Mapping from key to bibliography record; each record must
            provide an ``"rCite"`` entry.

    Returns:
        None.
    """
    # Iterating a mapping yields its keys, so this is the same key list.
    shuffledKeys = list(bibData)
    # presumably randomised so that parallel runs start on different records
    # -- confirm with the author.
    random.shuffle(shuffledKeys)

    for citeKey in shuffledKeys:
        record = bibData[citeKey]
        functions.processBibRecord(pathToMemex, record)
        detectedLanguage = identifyLanguage(record, "eng")
        ocrPublication(pathToMemex, record["rCite"], detectedLanguage)
def processAllRecords(bibData):
    """Walk through ``bibData`` in random order and process each record.

    Per the original notes, the shuffle makes every execution start with a
    different PDF, which lets several instances of the script run side by
    side.

    Args:
        bibData: Mapping from citation key to bibliography record. Records
            must provide an ``"rCite"`` entry.

    Returns:
        None.
    """
    processingOrder = list(bibData)  # all keys of the mapping
    random.shuffle(processingOrder)  # in-place shuffle

    for citeKey in processingOrder:
        record = bibData[citeKey]

        # 1. create folders, copy files (helper lives in functions.py)
        functions.processBibRecord(memexPath, record)

        # 2. OCR the file
        lang = identifyLanguage(record, "eng")
        ocrPublication(memexPath, record["rCite"], lang)
def processAllRecords(bibData):
    """Process each record of ``bibData`` once, in shuffled key order.

    Args:
        bibData: Mapping from key to bibliography record; each record must
            support ``record["rCite"]``.

    Returns:
        None.
    """

    def handleRecord(record):
        """Run both pipeline steps for a single record."""
        # 1. create folders, copy files
        functions.processBibRecord(memexPath, record)
        # 2. OCR the file
        language = identifyLanguage(record, "eng")
        ocrPublication(memexPath, record["rCite"], language)

    order = list(bibData)
    random.shuffle(order)
    for recordKey in order:
        handleRecord(bibData[recordKey])
def processAllRecords(bibData):
    """Process all PDFs referenced by ``bibData``.

    The keys are shuffled first; according to the original notes this lets
    more than one process work on the collection at the same time.

    Args:
        bibData: Mapping from key to bibliography record, each with an
            ``"rCite"`` entry.

    Returns:
        None.
    """
    randomisedKeys = list(bibData)
    random.shuffle(randomisedKeys)

    # Records are looked up lazily, one per iteration, exactly when needed.
    records = (bibData[k] for k in randomisedKeys)
    for record in records:
        functions.processBibRecord(memexPath, record)
        # The language is determined anew for every record.
        recordLanguage = identifyLanguage(record, "eng")
        ocrPublication(memexPath, record["rCite"], recordLanguage)
def processAllRecords(bibData):
    """Process every record of ``bibData``, visiting the keys in random order.

    Args:
        bibData: Mapping from key to bibliography record; records must
            provide ``"rCite"``.

    Returns:
        None.
    """
    allKeys = [*bibData]  # unpacking a mapping yields its keys
    random.shuffle(allKeys)  # reorder the list in place

    for currentKey in allKeys:
        # Plain lookup of the record that belongs to the current key.
        currentRecord = bibData[currentKey]

        # 1. create folders, copy files
        functions.processBibRecord(memexPath, currentRecord)

        # 2. OCR the file
        language = identifyLanguage(currentRecord, "eng")
        ocrPublication(memexPath, currentRecord["rCite"], language)
def processAllRecords(bibData):
    """Process the records of ``bibData`` in a different order on every run.

    The key list is shuffled before the loop, so each run of the script
    begins with another record (as stated in the original notes).

    Args:
        bibData: Mapping from key to bibliography record. Each record is
            indexed with ``"rCite"``.

    Returns:
        None.
    """
    keyList = list(bibData)
    random.shuffle(keyList)

    for entryKey in keyList:
        entry = bibData[entryKey]

        # 1. Create folders, copy files
        functions.processBibRecord(memexPath, entry)

        # 2. OCR the file
        entryLanguage = identifyLanguage(entry, "eng")
        citeKey = entry["rCite"]
        ocrPublication(memexPath, citeKey, entryLanguage)
def processAllRecords(bibDataFile):
    """Load a bibliography file, process its records, then update statuses.

    The file is loaded with ``functions.loadBib``; its keys are shuffled and
    every record is passed to ``functions.processBibRecord``,
    ``functions.identifyLanguage`` and ``ocrPublication``. Finally
    ``functions.memexStatusUpdates`` is called for the extensions ``.pdf``,
    ``.bib``, ``.png`` and ``.json``.

    Relies on the module-level ``settings`` mapping (key
    ``"path_to_memex"``).

    Args:
        bibDataFile: Value handed unchanged to ``functions.loadBib``.

    Returns:
        None.
    """
    bibData = functions.loadBib(bibDataFile)
    shuffledKeys = list(bibData)
    random.shuffle(shuffledKeys)

    for citeKey in shuffledKeys:
        record = bibData[citeKey]
        functions.processBibRecord(settings["path_to_memex"], record)
        # Note: the language helper receives the "rCite" value here, not the
        # whole record.
        language = functions.identifyLanguage(record["rCite"], "eng")
        ocrPublication(record["rCite"], language)

    # NOTE(review): the flattened source does not show whether the status
    # updates belong inside the record loop; they are placed after it here
    # -- confirm against the original layout.
    for extension in (".pdf", ".bib", ".png", ".json"):
        functions.memexStatusUpdates(settings["path_to_memex"], extension)
def processAllRecords(bibDataFile):
    """Load a bibliography file and process its records in random order.

    Relies on the module-level ``settings`` mapping (key
    ``"path_to_memex"``).

    Args:
        bibDataFile: Value handed unchanged to ``functions.loadBib``, which
            returns the dictionary of records.

    Returns:
        None. The shuffled key list is printed to standard output.
    """
    # Load the bib file as a dictionary.
    bibData = functions.loadBib(bibDataFile)

    # Save the keys of bibData as a list and shuffle it in place.
    keys = list(bibData.keys())
    random.shuffle(keys)

    # Trace the processing order. The bare ``print`` expression that used to
    # precede this call was a no-op in Python 3 and has been removed.
    print(str(keys))

    # Process each record by its citation key.
    for key in keys:
        bibRecord = bibData[key]

        # Per the original notes: creates a path with the pdf and bib files,
        # if not already there.
        functions.processBibRecord(settings["path_to_memex"], bibRecord)

        language = functions.identifyLanguage(bibRecord, "eng")

        # Per the original notes: saves the OCR-ed text as json files and
        # creates .png images for each page.
        ocrPublication(bibRecord["rCite"], language)
def processAllRecords(bibData):
    """Process all records of ``bibData`` in shuffled order.

    Args:
        bibData: Mapping from key to bibliography record; each record must
            provide an ``"rCite"`` entry.

    Returns:
        None.
    """
    # Collect the keys and reorder the whole list in place.
    pendingKeys = list(bibData)
    random.shuffle(pendingKeys)

    for pendingKey in pendingKeys:
        item = bibData[pendingKey]

        # 1. create folders, copy files
        functions.processBibRecord(memexPath, item)

        # 2. OCR the file. "eng" is described in the original notes as the
        # fallback language; the returned value goes on to ocrPublication.
        itemLanguage = identifyLanguage(item, "eng")
        ocrPublication(memexPath, item["rCite"], itemLanguage)
def processAllRecords(bibDataFile):
    """Load a bibliography file, OCR its records and update statuses.

    Relies on the module-level ``settings`` mapping (keys
    ``"path_to_memex"`` and ``"page_limit"``).

    Args:
        bibDataFile: Value handed unchanged to ``functions.loadBib``.

    Returns:
        None.
    """
    # Load the bibliography into a dictionary.
    bibData = functions.loadBib(bibDataFile)

    # Shuffle the keys so the records are visited in random order.
    recordKeys = list(bibData)
    random.shuffle(recordKeys)

    for recordKey in recordKeys:
        record = bibData[recordKey]  # look up the record for this key
        functions.processBibRecord(settings["path_to_memex"], record)

        # The language helper receives the "rCite" value and "eng".
        language = functions.identifyLanguage(record["rCite"], "eng")

        # The third argument is the configured page limit converted to int.
        pageLimit = int(settings["page_limit"])
        ocrPublication(record["rCite"], language, pageLimit)

    # One status-update call per file extension; presumably reports progress
    # per file type -- confirm in functions.py.
    # NOTE(review): the flattened source does not show whether these calls
    # belong inside the record loop; they are placed after it here -- confirm
    # against the original layout.
    for extension in (".pdf", ".bib", ".png", ".json"):
        functions.memexStatusUpdates(settings["path_to_memex"], extension)
def processAllRecords(bibData):
    """Call ``functions.processBibRecord`` for every record in ``bibData``.

    Relies on the module-level name ``memexPath``.

    Args:
        bibData: Mapping whose values are bibliography records. The keys are
            not used.

    Returns:
        None.
    """
    # The key of each pair was unused, so iterate the values view directly.
    for record in bibData.values():
        functions.processBibRecord(memexPath, record)