def featureParagraphNumericRatings(sample):
    #input(sample)
    Ex1FeatureSet = Extractor.extractAll(sample, IIEx1features)
    #input(Ex1FeatureSet)
    rating = [number for data, number in paragraphClassifier(Ex1FeatureSet)]
    #input(rating)
    return {"Food": rating[0], "Service": rating[1], "Venue": rating[2], "OverallP": rating[3]}
def SearchCategorymember(self, categoryname):
    try:
        self.subcategoryresultlist.DeleteAllItems()
        self.pageresultlist.DeleteAllItems()
        wikiextractor = Extractor.wikiextractor()
        query = configuration.api_url_zh + '&list=categorymembers&cmtitle=Category:%s&cmsort=timestamp&' \
                'cmdir=desc&cmlimit=max' % categoryname
        json_content = wikiextractor.getjson(query)
        members = json_content['query']['categorymembers']
        for member in members:
            # TODO: if there is no Category attribute, we cannot tell whether this is a subcategory (?)
            pageid = str(member['pageid'])
            if 'Category:' in member['title']:
                subcategory = member['title'].lstrip('Category:')
                index = self.subcategoryresultlist.InsertStringItem(sys.maxint, subcategory)
                self.subcategoryresultlist.SetStringItem(index, 0, subcategory)
            else:
                page = member['title']
                # TODO: to be improved
                # a ':' in the title means this is not a valid page
                if ':' in page:
                    continue
                index = self.pageresultlist.InsertStringItem(sys.maxint, pageid)
                self.pageresultlist.SetStringItem(index, 0, pageid)
                self.pageresultlist.SetStringItem(index, 1, page)
    except Exception as e:
        self.statusbar.SetStatusText(e.message, 0)
def predictTagless(classifier, featureExtractors, taglessTestSet):
    test_set = Extractor.extractAll(taglessTestSet, featureExtractors)
    predictions = [{"data": e[0], "features": e[1][0], "predicted_label": e[1][1]}
                   for e in zip(taglessTestSet, classifier(test_set))]
    #print("PREDICTIONS:")
    #for i in range(len(predictions)):
    #    print("Prediction #", i+1, ":", predictions[i])
    #    print("")
    return [e["predicted_label"] for e in predictions]
def predictTagged(classifier, featureExtractors, taggedTestSet):
    test_set = Extractor.extractAllTagged(taggedTestSet, featureExtractors)
    pLabels, acc = classifier(test_set, True)
    predictions = [{"data": e[0], "features": e[1][0], "predicted_label": e[1][1]}
                   for e in zip(taggedTestSet, pLabels)]
    print("PREDICTIONS:")
    for i in range(len(predictions)):
        print("Prediction #", i+1, ":",
              "Predicted Label:", predictions[i]["predicted_label"],
              "Actual Label:", predictions[i]["data"][1])
        #print("")
    return [e["predicted_label"] for e in predictions], acc
def OnGeoExtract(self, evt):
    try:
        wikiextractor = Extractor.wikiextractor()
        data_dict = {}
        members = self.GetGeoList()
        wikiextractor.get_data_dict_from_pageid(members, data_dict, 'f')
        filewriter = FileWriter.filewriter()
        filewriter.SaveToSQLite(data_dict)
        #filewriter.SaveToExcel(data_dict)
        self.statusbar.SetStatusText("保存成功,请检查excel文件", 0)  # "Saved successfully, please check the excel file"
    except Exception as e:
        self.statusbar.SetStatusText(e.message, 0)
def OnExtract(self, evt):
    try:
        categoryname = self.categoryname.GetValue()
        wikiextractor = Extractor.wikiextractor()
        data_dict = {}
        if self.extractsubcategoryck.Get3StateValue() == wx.CHK_CHECKED:
            wikiextractor.parse_members(categoryname, data_dict, 't')
        else:
            wikiextractor.parse_members(categoryname, data_dict, 'f')
        filewriter = FileWriter.filewriter()
        #filewriter.SaveToSQLite(data_dict)
        filewriter.SaveToExcel(data_dict)
        self.statusbar.SetStatusText(u"保存成功,请检查excel文件", 0)  # "Saved successfully, please check the excel file"
    except Exception as e:
        self.statusbar.SetStatusText(e.message, 0)
def SearchbyPrex(self, prex):
    try:
        self.subcategoryresultlist.DeleteAllItems()
        wikiextractor = Extractor.wikiextractor()
        query = configuration.api_url_zh + '&list=allcategories&acprefix=%s' % prex
        json_content = wikiextractor.getjson(query)
        members = json_content['query']['allcategories']
        for member in members:
            # TODO: if there is no Category attribute, we cannot tell whether this is a subcategory (?)
            category = member['*']
            index = self.subcategoryresultlist.InsertStringItem(sys.maxint, category)
            self.subcategoryresultlist.SetStringItem(index, 0, category)
    except Exception as e:
        self.statusbar.SetStatusText(e.message, 0)
def runPreExtractedNfoldCrossValidation(classifier, featureSets, n):
    print("Running", n, "fold validation...")
    classifiers = []
    print("Dividing into folds")
    folds = Extractor.getPreExtractedNfolds(featureSets, n)
    for i in range(len(folds)):
        training = []
        print("Validating with fold", i)
        for j in range(len(folds)):
            if not j == i:
                training.extend(folds[j])
        classifiers.append(classifier(training, folds[i]))
    classifiers.sort(key=lambda x: x[1], reverse=True)
    for i in range(n):
        print("Accuracy for classifier", i+1, ":", classifiers[i][1])
    return classifiers
def test_exported_functions(self):
    """ Testing exported functions extraction using a test conf file. """
    conf_file = "test_assets/extractor_confs/exported_functions_conf.yaml"
    in_folder = "test_assets/executables"
    out_folder = "test_assets/extracted_features/exported_functions"
    extractor = Extractor.new(conf_file, in_folder, out_folder)
    extractor.extract_batch()
    features_dict = extractor.features
    with open("test_assets/expected_features_dicts/exported_functions.json", "rb") as f1:
        expected_feature_dict = json.load(f1)
    with open(out_folder + "/json/0/071df5b74f08fb5a4ce13a6cd2e7f485.json", "rb") as f2:
        extracted_feature_dict = json.load(f2)
    self.assertEqual(extracted_feature_dict, expected_feature_dict,
                     "exported functions don't match")
def runNfoldCrossValidation(classifier, trainingSamples, featureExtractors, n, save=False):
    print("Running", n, "fold validation...")
    classifiers = []
    print("Dividing into folds")
    folds = Extractor.getNfolds(trainingSamples, featureExtractors, n, save)
    for i in range(len(folds)):
        training = []
        print("Validating with fold", i)
        for j in range(len(folds)):
            if not j == i:
                training.extend(folds[j])
        classifiers.append(classifier(training, folds[i]))
    classifiers.sort(key=lambda x: x[1], reverse=True)
    for i in range(n):
        print("Accuracy for classifier", i + 1, ":", classifiers[i][1])
    return classifiers
def genetic_alg(next_gen, num_generations=200, max_size_gen=1000, size_final_gen=100,
                mutations_per_solution_max=50, name=""):
    print('start solution:' + str(next_gen[0].get_score()))
    for times in xrange(num_generations):
        next_gen = next_generation(next_gen, max_size_gen, size_final_gen,
                                   mutations_per_solution_max)
        print('iter :' + str(times) + ', name: ' + name)
        print('best solution:' + str(next_gen[0].get_score()))
        print('worst solution:' + str(next_gen[size_final_gen - 1].get_score()))
        print('start solution:' + str(next_gen[0].get_score()))
        file_write = open(name + '.obj', 'w')
        pickle.dump(next_gen[0], file_write)
        ex = Extractor.Extractor(next_gen[0].cars, name)
        ex.write()
    return next_gen[0]
def main():
    # --------- Information extraction ---------
    extractor = Extractor()
    OS_Data = extractor.get_os_information()
    Server_Data = extractor.get_server_data()
    Processor_Data = extractor.get_processor_information()
    Processes_data = extractor.get_processes_information()
    Users_Data = extractor.get_users_information()
    # -------------------------------------------
    # ------ Send the information to the API ------
    dictonary_set = {
        'OS': OS_Data,
        'Proccesor': Processor_Data,
        'Server': Server_Data,
        'Users': Users_Data,
        'Processes': Processes_data
    }
    post_api(config.URL_API, dictonary_set)
def start_server():
    """ This function starts the server (single-process server) """
    host = 'localhost'  # Host name
    port = 5000  # Port number
    s = jpysocket.jpysocket()  # Create socket
    s.bind((host, port))  # Bind port and host
    s.listen(5)  # Listening
    print("Server started ... ")
    while True:
        connection, address = s.accept()  # Accept the connection
        msg_recv = connection.recv(SIMPLE_PACKET_SIZE)  # Receive msg
        msg_recv = jpysocket.jpydecode(msg_recv)  # Decrypt msg
        if msg_recv == "SendFile":
            print("\nRequest for extracting features")
            msg_recv = connection.recv(SIMPLE_PACKET_SIZE)
            size = int((int(jpysocket.jpydecode(msg_recv))) / FILE_PACKET_SIZE) + 1
            # Only .wav files are accepted!
            f = open("SongTemp.wav", 'wb')  # temporary file, opened in binary mode
            print("Download of the file ...")
            while size > 0:
                packet = connection.recv(FILE_PACKET_SIZE)
                f.write(packet)
                size -= 1
            f.close()
            print("Extracting features ...")
            result = Extractor.extract_feature("SongTemp.wav")
            print("Features extracted: " + result)
            # The "\r\n" is necessary for the client side, which needs to receive a complete line
            connection.send(bytes(result + "\r\n", 'UTF-8'))
            os.remove("SongTemp.wav")  # remove the temporary file
            connection.close()
    s.close()
def test_binary_image(self):
    """ Testing the binary image extraction using a test conf file. """
    from PIL import Image, ImageChops

    # Function that compares the differences of the two images.
    # @param1 image, @param2 image (extracted & expected images)
    # @return an image (difference between pixels); if the images are equal it returns a black image
    def assertImage(pic_1, pic_2):
        diff = ImageChops.difference(pic_1, pic_2)
        theDifferenceImage = diff.convert('RGB')
        theDifferenceImage.paste(pic_2, mask=diff)
        return theDifferenceImage

    conf_file = "test_assets/extractor_confs/binary_image_conf.yaml"
    in_folder = "test_assets/executables"
    out_folder = "test_assets/extracted_features/binary_image"
    extractor = Extractor.new(conf_file, in_folder, out_folder)
    extractor.extract_batch()
    extracted_image_features = extractor.features
    extracted_image = Image.open(
        "test_assets/expected_features_images/binary_image.png")
    expected_image = Image.open(
        out_folder + "/image/binary_image/0/071df5b74f08fb5a4ce13a6cd2e7f485.png")
    difference = assertImage(extracted_image, expected_image)
    # getbbox() returns None if all pixels are black (the images match);
    # otherwise it returns the bounding box of the pixels that differ.
    self.assertTrue(not difference.getbbox(), "Binary images don't match")
def mergeAuthors(finalAuthorsNames):
    import Extractor as ex

    # -------------------- BUILD THE TABLE OF CURRENT PANEL AUTHORS --------------------
    noDuplicates = list(dict.fromkeys(finalAuthorsNames))
    shared = ex.created_panels(noDuplicates, finalAuthorsNames)
    (authors_loves, authors_views) = ex.lovesAndViews(noDuplicates, shared)
    stars_list = ex.star_count(noDuplicates)
    avatars = ex.has_avatar(noDuplicates)
    bios = ex.has_bio(noDuplicates)
    nFollowers = ex.followers(noDuplicates)
    shared = list(map(int, shared))
    authors_loves = list(map(int, authors_loves))
    authors_views = list(map(int, authors_views))
    stars_list = list(map(int, stars_list))
    nFollowers = list(map(int, nFollowers))
    authors_ranking = calculateAuthorRanking(stars_list, authors_loves, authors_views,
                                             nFollowers, shared)
    authorsTable = createAuthorsTable(noDuplicates, stars_list, avatars, bios, nFollowers,
                                      authors_loves, authors_views, authors_ranking,
                                      finalAuthorsNames)
    authorsTable = authorsTable.rename(
        columns={
            'Authors': 'panel_author',
            'Stars': 'author_stars',
            'Has Avatar': 'has_avatar_author',
            'Has Bio': 'has_bio_author',
            'Followers': 'author_followers',
            'Tot loves': 'tot_loves_author',
            'Tot views': 'tot_views_author'
        })
    return authorsTable
from Extractor import *
from Wrapper import *
from Sender import *

if __name__ == '__main__':
    extractor = Extractor()
    wrapper = Wrapper()
    sender = Sender()
    rawData = extractor.get_site()
    dataModel = wrapper.packData()
    print(rawData)
import Extractor as ex
import question_generator as gen

# To speed up script, start servers:
##bash runStanfordParserServer.sh
##bash runSSTServer.sh

#Dish sample
#direct_path = "/Users/brandon/Documents/Northwestern Courses/Winter 2019/CS+Law Innovation Lab/Orrick, Harrington, & Sutcliffe/Documents/Dish_Sample.txt"
#Apple Brief
direct_path = '/Users/brandon/Documents/Northwestern Courses/Winter 2019/CS+Law Innovation Lab/Orrick, Harrington, & Sutcliffe/Documents/Test_Text.txt'

with open(direct_path, 'r') as file:
    brief = file.read()

test = ex.Extractor(brief)
qGen = gen.QuestionGenerator()
test.fix_pronouns(silence=1)
sentences = test.get_sentences()
print(sentences)
for sentence in sentences:
    flashcard = qGen.generate_question(sentence)
    if flashcard:
        #print(type(flashcard), type(flashcard[0]))
        print("Question: {}\n\nAnswer: {}'\n-------------".format(
            flashcard[0]['Q'], flashcard[0]['A']))
from Extractor import *
from Threads import *
from Curator import *
from gcamp_extractor import *

arguments = {
    'root': '/Users/stevenban/Documents/Data/20190917/binned',
    'numz': 20,
    'frames': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
    'offset': 0,
    't': 999,
    'gaussian': (25, 4, 3, 1),
    'quantile': 0.99,
    'reg_peak_dist': 40,
    'anisotropy': (6, 1, 1),
    'blob_merge_dist_thresh': 7,
    'mip_movie': True,
    'marker_movie': True,
    'infill': True,
    'save_threads': True,
    'save_timeseries': True,
    'suppress_output': False,
    'regen': False,
}

e = Extractor(**arguments)
e.calc_blob_threads()
e.quantify()
c = Curator(e)
def process(url):
    e = Extractor.Extractor()
    return e.get_out_data(url)
# pylint: disable=no-member
import time

import cv2
import numpy as np

import Extractor

W = 1920 // 2
H = 1080 // 2
F = 1

extrac = Extractor.Extractor(F, H, W)


class Process():
    def process_frame(self, img):
        self.img = cv2.resize(img, (W, H))
        # matches is a 2D array with the normalized, filtered points
        matches = extrac.extract(self.img)
        print("%d matches" % (len(matches)))
        for pt1, pt2 in matches:
            # Denormalize the coordinates of pt1 and pt2 (coming from the filtering step) so they can be displayed
            u1, v1 = extrac.denormalize(pt1)
            u2, v2 = extrac.denormalize(pt2)
            # Draw a green circle for each keypoint
'''
@author: Yang
@time: 17-11-14 2:57 PM
'''
'''
extract text lines from a text file

acceleration dataset:   training data: 2000 | test data: 100
underfitting dataset:   training data: 500  | test data: 1000
'''
import Extractor

extractor = Extractor.Extractor(filename='text.txt')
extractor.load_data(trainingNum=2000, testNum=100)
extractor.save_data(target='dataset/text/acceleration/')
extractor.load_data(trainingNum=500, testNum=1000)
extractor.save_data(target='dataset/text/underfitting/')
extractor.charset()

import os
'''
imageGen.py generates images for OCR
'''
os.system('python imageGen.py')
def run(self):
    while not self.quit:
        DatabaseManager.DeleteData()
        Extractor.ExtractData()
        self.delay(300000)
def mainloop(rut):
    return Extractor.http_request(rut, key)


counter = 0
finished = 0
print("Rut Checker")
start = int(input("Range Start:"))
end = int(input("Range End:"))
print("Cracking captcha key... " + '\r', end="")
updateKey()
start_time = time.time()
sys.stdout.flush()
print("Starting... " + '\r', end="")
for rut in range(start, end):
    if Uploader.checkrut(rut) == 0:
        print("Doing missing rut " + str(rut))
        data = Extractor.http_request(rut, key)
        if data == 0:
            print("Rut " + str(rut) + " timed out.")
            updatePercentage(rut)
        else:
            if Process.validate(data) == 1:
                key = Keymaker.generate()
                time.sleep(5)
                print("Rut " + str(rut) + " broken key.")
            else:
                Uploader.updata(rut, Process.process(data))
                updatePercentage(rut)
# First we compress 3 files.
# Note that the desired file should be added first,
# so here we want the final file to look like "aks.jpg".
import Compressor

sources = []
sources.append("aks.jpg")
sources.append("aks2.jpg")
sources.append("music.mp3")

Compressor.compress(sources)              # This generates the file "comped.jpg"
Compressor.compress(sources, "fileName")  # This generates the file "fileName.jpg"

# Then we extract them
import Extractor

Extractor.extractor(source="filename.jpg")                          # This extracts the files into the current folder
Extractor.extractor(source="filename.jpg", destination="./dest/")   # This extracts the files into the 'dest' folder
def runSingleFold(classifier, taggedSamples, featureExtractors, trainWeight=2, testWeight=1):
    print("Compiling training and test sets")
    test_set, training_set = Extractor.getTestandTraining(taggedSamples, featureExtractors,
                                                          trainWeight, testWeight, save=False)
    print("Running Classifier")
    return classifier(training_set, test_set)
def mainloop(rut):
    return Extractor.http_request(rut, key)
parser = argparse.ArgumentParser()
parser.add_argument('-s', '--secondary',
                    help='PDF Files have some secondary publications',
                    action='store_true')
args = parser.parse_args()
print(args)

if args.secondary:
    print("PDF Files are not all mains or are secondaries")
    pdfs_are_main_pubs = False
else:
    print("PDF Files are mains")
    pdfs_are_main_pubs = True

init_start = time.time()
is_ready_to_run = extractor.check_resources()
if is_ready_to_run:
    # Remove spaces from the names of PDFs, since spaces cause pdffigures2 to skip the pdf
    os.chdir(input_dir)
    for file in glob.glob("*.pdf"):
        extractor.remove_space(file)
    print("GROBID extracting text, metadata and references")
    try:
        extractor.data_extractor(input_dir, output_dir)
    except Exception as e:
        print(e)
        sys.exit("GROBID encountered an error")
    print("PDFFigures2.0 extracting figures and figure captions")
os.chdir(os.getcwd() + "/_Neural Networks_")
pickle.dump((network, batches), open("network.p", "wb"))

if test:
    correct = 0
    total = 0
    percent = 0
    os.chdir('..')
    answer_array = []
    answers = open("mnist-test-labels.txt", "r")
    index = 0
    for line in answers:
        answer_array.append(int(line.strip()))
    os.chdir(os.getcwd() + '/test_images')
    for filename in os.listdir(os.getcwd()):
        image = Extractor.getImage(filename)
        matrix = Extractor.ImageToMatrix(image)
        data = np.asarray(matrix).flatten()
        data = scale_data(data)
        #print(answer_array[total])
        if run_test(network, data) == answer_array[total]:
            correct += 1
        total += 1
        if total % ((10000) / 100) == 0:
            percent += 1
            print(str(percent) + "%")
            #network.show(2,2)
        if total == stop_at:
            break
    print(str(correct) + "/" + str(total))
solution = Solution.Solution(cars, rule_out_rides, bonus, steps)
if genetic:
    solutions = solutions + [solution]
    print('genetic algorithm:')
    solution = Solution.genetic_alg([solution], num_generations=1000, max_size_gen=500,
                                    size_final_gen=50, mutations_per_solution_max=50,
                                    name=file)
score_new = solution.get_score()
suma_total = suma_total + score_new
print('solution score ' + str(score_new))
if score_new > score:
    print('NUEVA MEJORA DE PUNTUACION EN EL FICHERO: ' + file)  # "New score improvement in file"
    best_scores[index_file] = score_new
    ex = Extractor.Extractor(solution.cars, file)
    ex.write()
    veces_mejorado = veces_mejorado + 1
index_file = index_file + 1
print('score: ' + str(suma_total / 1000000.0) + ' M')
if best_global < suma_total:
    print('Has mejorado el algoritmo!')  # "You have improved the algorithm!"
    print('llevas mejoradas veces: ' + str(veces_mejorado) + '/' + str(index_total))  # number of improvements so far
    print('mejora: ' + str(sum(best_scores) - sum(old_best_scores)))  # improvement delta
suma_total = 0
index_total = index_total + 1
class DataManager():
    def __init__(self):
        self.extractor = Extractor()
        self.rootDirectory = ''
        self.movieDb = None

    def setRootDirectory(self, rootDir):
        '''
        Utility to set the root directory used for extracting

        Args: Full path to root directory
        '''
        self.rootDirectory = rootDir

    def runExtraction(self):
        '''
        Runs the extractor on a file system located at the specified root
        directory to store the data.

        Args: None.
        '''
        self.movieDb = None
        if self.rootDirectory == '':
            return -1
        self.movieDb = self.extractor.extract(self.rootDirectory)

    def loadDatabase(self, dataFile=''):
        '''
        Loads a file containing movie information pulled out by the extractor
        and stores it into the DataManager. File must be of type .csv

        Args: dataFile - string. Full path to dataFile
        '''
        self.movieDb = None
        if os.path.isfile(dataFile) and dataFile.split('.')[1] == 'csv':
            if self.movieDb:
                self.movieDb.purgeDb()
            with open(dataFile, 'r') as csvFile:
                reader = csv.reader(csvFile)
                g = None
                for row in reader:
                    # Genre located
                    if row.find('[') > 0:
                        if g == None:
                            g = Genre()
                        else:
                            # save off current genre and start a new one
                            self.movieDb.addGenre(g)
                            g = Genre()
                    else:
                        if g:
                            movie = Movie(row[0], row[2], row[1], row[3])
                            g.addMovie(movie)

    def exportData(self, outputFile):
        '''
        Write extracted movie data to .csv file

        Args: outputFile - full path to desired .csv file
        '''
        if self.movieDb and outputFile and outputFile.split('.')[1] == "csv":
            with open(outputFile, 'w', newline='') as csvFile:
                writer = csv.writer(csvFile)
                # Write genreName to file
                for genreName, genreObject in self.movieDb.genres.items():
                    writer.writerow('[' + genreName + ']')
                    # Parse out and write movies data contained in genre
                    for movieTitle, movieObject in genreObject.movies.items():
                        writer.writerow([movieTitle, movieObject.fileType,
                                         movieObject.size, movieObject.length])
def download_show(self, url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    ep_range = self.ep_range
    links = []
    for link in soup.findAll('a', {'class': 'sonra'}):
        if link['href'] not in links:
            links.append(link['href'])
    if self.exclude is not None:
        excluded = [
            i for e in self.exclude for i in links if re.search(e, i)
        ]
        links = [item for item in links if item not in excluded]
    season = "season-" + self.season
    if self.update == True:
        links = links[0:1]
    if len(ep_range) == 1:
        ep_range = '{0}-{0}'.format(ep_range)
    if ep_range == 'l5' or ep_range == 'L5':  # L5 (Last five)
        links = links[:5]
        ep_range = 'All'
        season = 'season-All'
    if self.newest:
        links = links[0:1]
        ep_range = 'All'
        season = 'season-All'
    if season != "season-All" and ep_range != "All":
        episodes = [
            "episode-{0}".format(n)
            for n in range(int(ep_range[0]), int(ep_range[1]) + 1)
        ]
        if season == 'season-1':
            matching = [
                s for s in links if 'season' not in s or season in s
            ]
        else:
            matching = [s for s in links if season in s]
        matching = [
            s for s in matching for i in episodes
            if i == re.search(r'episode-[0-9]+', s).group(0)
        ]
    elif season != "season-All":
        if season == 'season-1':
            matching = [
                s for s in links if 'season' not in s or season in s
            ]
        else:
            matching = [s for s in links if season in s]
    elif ep_range != 'All':
        episodes = [
            "episode-{0}".format(n)
            for n in range(int(ep_range[0]), int(ep_range[1]) + 1)
        ]
        matching = [
            s for s in links for i in episodes
            if re.search("{0}-".format(i), s)
        ]
    else:
        matching = links
    if len(matching) < 1:
        matching.reverse()
    if (self.threads != None and self.threads != 0):
        if (len(matching) == 1):
            for item in matching:
                source_url, backup_url = self.find_download_link(item)
                hidden_url = self.find_hidden_url(item)
                if self.resolution == '480' or len(source_url[0]) > 2:
                    download_url = source_url[0][1]
                else:
                    try:
                        download_url = source_url[1][1]
                    except Exception:
                        download_url = source_url[0][1]
                show_info = self.info_extractor(item)
                output = self.check_output(show_info[0])
                Extractor(logger=self.logger,
                          download_url=download_url,
                          backup_url=backup_url,
                          hidden_url=hidden_url,
                          output=output,
                          header=self.header,
                          user_agent=self.user_agent,
                          show_info=show_info,
                          settings=self.settings,
                          quiet=self.quiet)
        else:
            count = 0
            while (True):
                processes_count = 0
                processes = []
                processes_url = []
                processes_extra = []
                if (int(self.threads) > len(matching)):
                    self.threads = 3
                procs = ProcessParallel(print('', end='\n\n'))
                for x in range(int(self.threads)):
                    try:
                        item = matching[count]
                        _, extra = self.is_valid(item)
                        processes.append(self.download_single)
                        processes_url.append(item)
                        processes_extra.append(extra)
                        count += 1
                    except Exception as e:
                        if self.logger == 'True':
                            print('Error: {0}'.format(e))
                        pass
                for x in processes:
                    procs.append_process(x,
                                         url=processes_url[processes_count],
                                         extra=processes_extra[processes_count])
                    processes_count += 1
                if ('' in processes_extra):
                    self.threads = None
                    self.download_show(url)
                    break
                procs.fork_processes()
                procs.start_all()
                procs.join_all()
                processes_url.clear()
                processes_extra.clear()
                processes.clear()
                self.threads = self.original_thread
                if (count >= len(matching)):
                    break
    else:
        for item in matching:
            source_url, backup_url = self.find_download_link(item)
            hidden_url = self.find_hidden_url(item)
            if self.resolution == '480' or len(source_url[0]) > 2:
                download_url = source_url[0][1]
            else:
                try:
                    download_url = source_url[1][1]
                except Exception:
                    download_url = source_url[0][1]
            show_info = self.info_extractor(item)
            output = self.check_output(show_info[0])
            Extractor(logger=self.logger,
                      download_url=download_url,
                      backup_url=backup_url,
                      hidden_url=hidden_url,
                      output=output,
                      header=self.header,
                      user_agent=self.user_agent,
                      show_info=show_info,
                      settings=self.settings,
                      quiet=self.quiet)
    if (self.original_thread != None and self.original_thread != 0):
        self.threads = self.original_thread
def main(argv):
    inputfile = ''
    outputfile = ''
    url = 'http://www.crummy.com/software/BeautifulSoup/'
    findthis = "www.crummy.com"
    do_config = False
    get_config = False
    do_get_url = False
    try:
        opts, args = getopt.getopt(argv, "hrcgi:o:u:k:", ["ifile=", "ofile=", "ufile=", "kfile="])
    except getopt.GetoptError:
        print 'url_extractor.py -h'
        print 'url_extractor.py -i <inputfile> -o <outputfile>'
        print 'url_extractor.py -u <base_url> -k <keyword>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'url_extractor.py -h'
            print 'url_extractor.py -i <inputfile> -o <outputfile>'
            print 'url_extractor.py -u <base_url> -k <keyword>'
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
        elif opt == '-c':
            do_config = True
        elif opt == '-g':
            get_config = True
        elif opt == '-r':
            do_get_url = True
        elif opt in ("-u", "--ufile"):
            url = arg
        elif opt in ("-k", "--kfile"):
            findthis = arg
    extract = Extractor(url)
    if do_config:
        # Config
        post = {"base_url": url, "keyword": findthis, "updated": datetime.datetime.utcnow()}
        extract.add_config(post)
    if get_config:
        criteria = {}
        config = extract.get_config(criteria)
        print config
        print config["_id"]
        print config["base_url"]
    if do_get_url:
        # Reading
        html_object = extract.get_html_from_url()
        #html_object = getUrl(url)
        html_doc = html_object.read()
        list_of_hrefs = extract.find_keyword_in_href(findthis, html_doc)
        if list_of_hrefs:
            for href in list_of_hrefs:
                #print href
                extract.add_url_to_collected(href)
        collected_urls = extract.get_collected_urls()
        for url in collected_urls:
            print url
    print 'Input file is "', inputfile
    print 'Output file is "', outputfile
    print 'Base url is "', url
    print 'Keyword is "', findthis
def SearchbyGeo(self, lat, lon, primay):
    try:
        self.geopageresultlist.DeleteAllItems()
        sourcelist = self.GetSourceList()
        wikiextractor = Extractor.wikiextractor()
        query = ""
        queries = {}
        pagedatalist = []
        for source in sourcelist:
            lastcount = -1
            geopagelist = []
            while(len(geopagelist) < 100):
                while True:
                    if primay:
                        query = source + '&list=geosearch&gscoord=%s|%s&gsradius=10000&gsglobe=earth&gsnamespace=0&gslimit=500&gsprop=dim&gsprimary=primary' % (lat, lon)
                    else:
                        query = source + '&list=geosearch&gscoord=%s|%s&gsradius=10000&gsglobe=earth&gsnamespace=0&gslimit=500&gsprop=dim&gsprimary=all' % (lat, lon)
                    json_content = wikiextractor.getjson(query)
                    queries = json_content['query']
                    nowcount = len(geopagelist)
                    if('geosearch' in queries.keys() and len(queries['geosearch']) > 0):
                        break
                    lat += 0.003
                    lon += 0.003
                if(nowcount == lastcount):
                    break
                lastcount = len(geopagelist)
                pages = queries['geosearch']
                for page in pages:
                    try:
                        # TODO: if there is no Category attribute, we cannot tell whether this is a subcategory (?)
                        lat = page['lat']
                        lon = page['lon']
                        pageid = page['pageid']
                        strpageid = str(pageid).decode('utf-8')
                        title = page['title'].decode('utf8')
                        strlat = str(page['lat']).decode('utf-8')
                        strlon = str(page['lon']).decode('utf-8')
                        strdim = str(page['dim']).decode('utf-8')
                        if(pageid in self.geopagelist):
                            continue
                        else:
                            geopagelist.append(pageid)
                        # record the query result info (for testing)
                        pagedata = {}
                        pagedata[u'文章ID'] = pageid   # article ID
                        pagedata[u'标题'] = title      # title
                        pagedata[u'经度'] = lon        # longitude
                        pagedata[u'纬度'] = lat        # latitude
                        pagedata[u'大小'] = strdim     # size (dim)
                        pagedatalist.append(pagedata)
                        index = self.geopageresultlist.InsertStringItem(sys.maxint, strpageid)
                        self.geopageresultlist.SetStringItem(index, 0, strpageid)
                        self.geopageresultlist.SetStringItem(index, 1, title)
                        self.geopageresultlist.SetStringItem(index, 2, strlat)
                        self.geopageresultlist.SetStringItem(index, 3, strlon)
                        self.geopageresultlist.SetStringItem(index, 4, strdim)
                    except Exception as e:
                        continue
        # finally, store the query results
        self.resultdict[u'查询结果'] = pagedatalist  # 'query results'
parser.add_argument("-l", "--log-file", help="Logging file", default="MR-extractor.log") return parser if __name__ == '__main__': arg_parser = create_arg_parser() args = arg_parser.parse_args() # Getting args from the parser conf_file = args.conf_file in_folder = args.input_dir out_folder = args.output_dir log_file = args.log_file # Making extraction log.basicConfig( filename=log_file, format='[%(levelname)s %(asctime)s] %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=log.DEBUG, ) log.info("Starting extraction") extractor = Extractor.new(conf_file, in_folder, out_folder) extractor.extract_batch() log.info("Extraction ended successfully")
def run_Extractor_RPD(self):
    self.output_file_name = 'ExtractorRPD'
    r = Extractor.RemovePageDuplicates(self.inputFilePath, self.output_file_path())
    #print(self.output_file_name)
    r.process()
def run_Extractor_MLP(self):
    self.update_file_names('ExtractorMLP')
    m = Extractor.MergeLetterPages(self.input_file_path(), self.output_file_path())
    #print(self.input_file_name, self.output_file_name)
    m.process()
def __init__(self):
    self.extractor = Extractor()
    self.rootDirectory = ''
    self.movieDb = None
                    filemode='a')


def buildSequence(frameList):
    sequence = []
    for image in frameList:
        features = model.extract(image)
        sequence.append(features)
    return np.array(sequence)


proc_csv = pd.read_pickle('dataset_with_file_list.pkl')
logging.info("Starting routine for features extraction with InceptionV3")
model = Extractor()
proc_csv.set_index('palavra', inplace=True)
folder = '/home/fabiana/Desktop/projeto-final-src/Classifier/InceptionV3_Features'

for n in ['5', '10', '15']:
    print('Number of keyframes: ' + n)
    for palavra, frameList in tqdm(proc_csv[f'files_list_{n}'].iteritems(), total=len(proc_csv)):
        if (len(frameList) < int(n)):
            # print(f"Word {palavra} got less than {n} key frames")
            logging.warning(f"Word {palavra} got less than {n} key frames")
        seq = buildSequence(frameList)
        np.save(f'{folder}/{n}/{palavra}', seq)