def runSitemapCrawler(self):
    startTime = time.time()
    headers = {
        'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Mobile Safari/537.36"
    }
    xmlQueue = set()
    xmlQueue.add(self.sitemapURL)
    htmlQueue = set()
    log('sitemap', 'Crawling XML Sitemap for ' + self.siteName)
    while len(xmlQueue) != 0:
        nextParse = requests.get(xmlQueue.pop(), headers=headers)
        newXMLLinks = self.findNewLinksXML(nextParse)
        for link in newXMLLinks:
            if '.xml' in link:
                if 'archive' not in link:
                    xmlQueue.add(link)
            else:
                htmlQueue.add(link)
    FileIO.deleteFileContents(self.crawledFile)
    FileIO.setToFile(htmlQueue, self.crawledFile)
    log('time', 'Finished crawling XML sitemap for ' + self.siteName + ' in ' +
        str(time.time() - startTime) + ' seconds')
def runParser(self):
    FileIO.deleteFileContents(self.indexFile)
    if not os.path.isfile(self.crawledFile):
        log('error', 'No crawled file.')
        return self
    self.links = FileIO.fileToSet(self.crawledFile)
    self.linksList = list(self.links)
    if not self.links:
        log('error', 'Crawled file is empty')
        return self
    threadPool = []
    for i in range(0, self.MAX_THREADS):
        newThread = Thread(name='parser_' + str(i), target=self.parserWorker)
        threadPool.append(newThread)
    for i in range(0, self.MAX_THREADS):
        threadPool[i].start()
    for i in range(0, self.MAX_THREADS):
        threadPool[i].join()
    self.saveLinkGraphs()
def __init__(self, domain, threads, mode='DEV'):
    self.domain = domain
    self.mode = mode
    self.MAX_THREADS = threads
    self.buildQueue = []
    self.readSemaphore = True
    self.invertedIndexSemaphore = True
    self.hubAuthFile = 'domains/' + self.domain + '/' + self.domain + "_HubAuth.json"
    self.hubAuthScores = FileIO.readJsonFile(self.hubAuthFile)
    self.pageRankFile = 'domains/' + self.domain + '/' + self.domain + "_pageRank.json"
    self.pageRanks = FileIO.readJsonFile(self.pageRankFile)
def saveConfigFile(self):
    ''' method when pressing Menu item for saving config file '''
    savefile = QFileDialog.getSaveFileName(
        self, 'Select where the config file should be saved', self.lastdir)
    if not savefile:
        return
    file_obj = FileIO(savefile)
    file_obj.writeFile(self, ParameterWrap)
    self.lastdir = self.dirs.getParentDir(str(savefile))
def setup():
    """ Read book info from file, if file exists. """
    global counter
    data = fileIO.readAsString(BOOKS_FILE_NAME)
    if len(data) > 0:
        make_book_list(data)
        counter = fileIO.readAsPosInt(COUNTER_FILE_NAME)
        if counter == -1:
            counter = len(book_list)
def loadConfigFile(self):
    ''' method for loading config-file (from Menu item) '''
    loadfile = QFileDialog.getOpenFileName(
        self, 'Select where the config file is located', self.lastdir)
    if not loadfile:
        return
    file_obj = FileIO(loadfile)
    file_obj.loadFile(self, ParameterWrap)
    self.dirs.inputdir = self.inputdirectory.text()
    self.dirs.initSinDirectory()
    self.lastdir = self.dirs.getParentDir(str(loadfile))
def __init__(self, siteName, baseURL):
    self.siteName = siteName
    self.baseURL = baseURL
    FileIO.createSiteFileSetup(self.siteName, self.baseURL)
    self.queueFile = 'domains/' + siteName + '/' + siteName + '_queue.txt'
    self.crawledFile = 'domains/' + siteName + '/' + siteName + '_crawled.txt'
    self.queue = set()
    self.crawled = set()
    self.numCrawled = 0
    self.outlinkGraph = Graph()
    self.inlinkGraph = Graph()
    self.inlinkGraphFile = 'domains/' + siteName + '/' + siteName + '_inlinks.json'
    self.outlinkGraphFile = 'domains/' + siteName + '/' + siteName + '_outlinks.json'
    self.sitemapURL = self.findXMLSitemap()
def build(self):
    filePath = 'domains/' + self.domain + '/' + self.domain + "_index.txt"
    pageRankFile = 'domains/' + self.domain + '/' + self.domain + "_pageRank.json"
    rawData = FileIO.readJsonFile(filePath)
    count = 0
    for entry in rawData.keys():
        count += 1
        doc = rawData[entry]
        if doc['title'] is None:
            doc['title'] = 'No Title'
        self.addDocumentToCollection(
            url=entry,
            title=doc['title'],
            body=doc['body'],
            description=doc['description'],
            pageRank=self.pageRanks[entry],
            hub=self.hubAuthScores[doc['title']][0],
            authority=self.hubAuthScores[doc['title']][1])
        self.buildInvertedIndex(doc['body'], entry)
        if self.mode == 'DEV' and count >= 5:
            break
def search(self):
    path = 'search_index'
    for file in os.listdir(path):
        if file.endswith(".json") or file.endswith(".txt"):
            indices = FileIO(path + '/' + file).read_file()["indices"]
            for object in indices:
                print object["word_index"]
                if self.check_words(object["word_index"]) > 0:
                    self.results.append(object)
def record(self):
    """Start or end recording
    """
    if self.camera.is_recording:
        self.camera.stop_recording()
        self.rec_button.setText('&Rec')
        self.rec_act.setText('&Record')
        self.write_text("save : {}".format(self.video_filename))
    else:
        self.video_filename = FileIO.get_filename(
            self.filename_rule, self.video_suffix, self.parent_dir)
        self.camera.start_recording(self.video_filename, self.video_codec)
        self.rec_button.setText('Stop rec')
        self.rec_act.setText('Stop record')
def __init__(self, siteName, baseURL, threads):
    self.siteName = siteName
    self.baseURL = baseURL
    self.crawledFile = 'domains/' + siteName + '/' + siteName + '_crawled.txt'
    self.indexFile = FileIO.createSiteIndexFile(self.siteName)
    self.links = set()
    self.linksList = None
    self.readSemaphore = True
    self.writeSemaphore = True
    self.MAX_THREADS = threads
    self.inlinkGraph = Graph()
    self.outlinkGraph = Graph()
    self.inlinkGraphFile = 'domains/' + siteName + '/' + siteName + '_inlinks.json'
    self.outlinkGraphFile = 'domains/' + siteName + '/' + siteName + '_outlinks.json'
def parserWorker(self):
    buffer = []
    while len(self.linksList) > 0:
        # claim a slice of links under the read flag (busy-wait "semaphore")
        while not self.readSemaphore:
            pass
        self.readSemaphore = False
        start = (len(self.linksList) - DataParser.MAX_BUFFER_LEN - 1
                 if len(self.linksList) > DataParser.MAX_BUFFER_LEN else 0)
        end = len(self.linksList)
        toParse = self.linksList[start:end]
        del self.linksList[start:end]
        self.readSemaphore = True
        for link in toParse:
            obj = extractData(link)
            self.addNewLinksToGraphs(obj['link'], obj['newLinks'])
            buffer.append('link: ' + link + '\n')
            title = obj['title'] if obj['title'] is not None else self.siteName
            buffer.append('title: ' + title + '\n')
            buffer.append('description: ' + obj['description'] + '\n')
            beforeCleanupBody = obj['body'].replace('\n', ' ')
            afterCleanupBody = ' '.join(beforeCleanupBody.split())
            buffer.append('body: ' + afterCleanupBody + '\n\n')
        # flush the buffer to the index file under the write flag
        while not self.writeSemaphore:
            pass
        self.writeSemaphore = False
        FileIO.writeToFile(self.indexFile, "".join(buffer))
        self.writeSemaphore = True
        buffer[:] = []
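# The readSemaphore / writeSemaphore attributes above are plain booleans spun on in a
# busy-wait loop; they are not atomic, so two threads started by runParser can both
# observe True and claim the same slice. A minimal sketch of the same claim-and-flush
# pattern using threading.Lock from the standard library instead; the readLock and
# writeLock attributes are assumptions (they would need to be created in __init__)
# and are not part of the original class.
import threading

def parserWorker(self):
    # assumes: self.readLock = threading.Lock(); self.writeLock = threading.Lock()
    buffer = []
    while True:
        with self.readLock:
            # claim a slice of links atomically, or stop when none remain
            if not self.linksList:
                break
            start = max(len(self.linksList) - DataParser.MAX_BUFFER_LEN, 0)
            toParse = self.linksList[start:]
            del self.linksList[start:]
        for link in toParse:
            obj = extractData(link)
            self.addNewLinksToGraphs(obj['link'], obj['newLinks'])
            buffer.append('link: ' + link + '\n')  # title/description/body as above
        with self.writeLock:
            # only one thread appends to the shared index file at a time
            FileIO.writeToFile(self.indexFile, ''.join(buffer))
        buffer = []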
def shutdown():
    """Save all data to a file - one for books, one for the current
    counter value, for persistent storage"""
    output_data = make_output_data()
    # Create data directory
    fileIO.mkdir(DATA_DIR)
    # write data to file
    fileIO.overwrite(BOOKS_FILE_NAME, output_data)
    # write counter to data
    fileIO.overwrite(COUNTER_FILE_NAME, counter)
def run(self):
    for url_object in self.url_list:
        extract_object = extraction.Extraction(url_object)
        url_object['word_index'] = extract_object.get_text()
        # try:
        #     # open('search_index/' + data.index_file, 'w')
        #     file = FileIO('search_index/' + data.index_file)
        #     print file.create_file(url_object)
        # except:
        #     file = FileIO('search_index/' + data.index_file)
        #     print file.create_file(url_object)
        # file = FileIO('search_index/' + data.get_index_file())
        # print file.create_file(url_object)
        # f = open(os.path.dirname(__file__) + '/../data.yml')
        if os.path.isfile('../search_index/' + data.index_file):
            file = FileIO('../search_index/' + data.index_file)
            file.update_file(url_object)
        else:
            file = FileIO('../search_index/' + data.index_file)
            file.create_file(url_object)
def save_frame(self):
    """Save the frame on the window as an image.
    """
    if self.filename_rule == "Manual":
        self.save_frame_manual()
        if not self.filename:
            return None
        prm = re.sub(r"\.(.*)", ".csv", str(self.filename))
    else:
        self.filename = FileIO.get_filename(
            self.filename_rule, self.image_suffix, self.parent_dir)
        prm = str(self.filename).replace(self.image_suffix, "csv")
    if not self.dst.exists():
        self.dst.mkdir(parents=True)
    im = Image.fromarray(self.camera.frame)
    im.save(self.filename)
    # make a parameter file
    with open(prm, "w") as f:
        for name, key in self.current_params.items():
            f.write("{},{}\n".format(name, self.current_params[name]["value"]))
    self.write_text("{:<10}: {}".format("save image", self.filename))
    self.write_text("{:<10}: {}".format("save param", prm))
def runParser(self):
    if not os.path.isfile(self.crawledFile):
        log('error', 'No crawled file.')
        return self
    self.links = FileIO.fileToSet(self.crawledFile)
    if not self.links:
        log('error', 'Crawled file is empty')
        return self
    data = FileIO.readJsonFile(self.indexFile)
    for link in self.links:
        if link not in data:
            obj = extractData(link)
            data[link] = {
                'docId': DataParser.docId,
                'title': obj['title'],
                'body': obj['body']
            }
            DataParser.docId += 1
    FileIO.deleteFileContents(self.indexFile)
    FileIO.writeJsonFile(data, self.indexFile)
def saveLinkGraphs(self):
    FileIO.writeJsonFile(self.outlinkGraph.nodes, self.outlinkGraphFile)
    FileIO.writeJsonFile(self.inlinkGraph.nodes, self.inlinkGraphFile)
def getUrls(file):
    fileio = FileIO()
    return fileio.fileRead(file)
def runSpider(self, iterations):
    startTime = time.time()
    for i in range(0, iterations):
        self.queue = FileIO.fileToSet(self.queueFile)
        self.crawled = FileIO.fileToSet(self.crawledFile)
        newLinks = set()
        newCrawledLinks = set()
        while len(self.queue) != 0:
            nextLink = self.queue.pop()
            res = self.crawlPage(nextLink)
            newCrawledLinks.add(nextLink)
            newLinks = newLinks.union(res)
        # persist the queue, crawled set, and link graphs for the next iteration
        FileIO.deleteFileContents(self.queueFile)
        FileIO.setToFile(newLinks, self.queueFile)
        FileIO.setToFile(newCrawledLinks, self.crawledFile)
        FileIO.writeJsonFile(self.outlinkGraph.nodes, self.outlinkGraphFile)
        FileIO.writeJsonFile(self.inlinkGraph.nodes, self.inlinkGraphFile)
    log('time', "Crawler for " + self.siteName + " execution Finished. Runtime: " +
        str(time.time() - startTime) + " seconds. Total links crawled: " +
        str(self.numCrawled))
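# A minimal usage sketch tying the crawler snippets above together. The module name
# 'spider' is hypothetical, and the assumption that the constructor, runSitemapCrawler,
# and runSpider all live on the same class is inferred from the shared attributes
# (sitemapURL, queueFile, crawledFile), not confirmed by the source.
from spider import Spider

crawler = Spider('exampleSite', 'https://www.example.com')
crawler.runSitemapCrawler()      # seed <site>_crawled.txt from the XML sitemap
crawler.runSpider(iterations=3)  # then run three crawl passes over the queue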
def multiple_search(self, file):
    path = 'search_index'
    indices = FileIO(path + '/' + file).read_file()["indices"]
    for object in indices:
        if self.check_words(object["word_index"]) > 0:
            self.results.append(object)
def __init__(self, *args, **kwargs):
    FileIO.__init__(self, *args, **kwargs)
    self.data_list = []
    self.stemmer = PorterStemmer()  # correct syntax?
    self.score_map = self.ranges = {}  # right-hand side missing in the original; empty initializer assumed
import sys
from os import path

intention = sys.argv[1]
midi_file_name = sys.argv[2]
name = None
if intention == "-hide":
    message_file_name = sys.argv[3]
    if path.isfile(midi_file_name):
        if path.isfile(message_file_name):
            parts = midi_file_name.split(".")
            extension = parts[-1]
            if extension != "mid":
                print "The file must have a .mid extension"
            else:
                fileIO = FileIO()
                message = fileIO.get_text_from(message_file_name)
                hider = Hider()
                hider.hide(midi_file_name, message)
                print "The output file name will be: " + "secret_in_" + midi_file_name
        else:
            print "You must put the message file in the same directory as midistegano.py"
    else:
        print "You must put the .mid file in the same directory as run.py"
elif intention == "-reveal":
    if path.isfile(midi_file_name):
        parts = midi_file_name.split(".")
        name = parts[0]
        extension = parts[-1]
def __init__(self, siteName):
    self.siteName = siteName
    self.crawledFile = 'domains/' + siteName + '/' + siteName + '_crawled.txt'
    self.indexFile = FileIO.createSiteIndexFile(self.siteName)
    self.links = set()
def __init__(self, *args, **kwargs):
    FileIO.__init__(self, *args, **kwargs)
    self.data_list = []
from book import Book
from fileIO import FileIO as fileIO
from datetime import date
from pprint import pprint  # debugging tool
import json

DATA_DIR = 'data'
BOOKS_FILE_NAME = str(fileIO.pathJoin(DATA_DIR, 'wishlist.txt'))
COUNTER_FILE_NAME = str(fileIO.pathJoin(DATA_DIR, 'counter.txt'))

separator = '^^^'  # a string probably not in any valid data relating to a book

book_list = []
counter = 0


def setup():
    """ Read book info from file, if file exists. """
    global counter
    data = fileIO.readAsString(BOOKS_FILE_NAME)
    if len(data) > 0:
        make_book_list(data)
        counter = fileIO.readAsPosInt(COUNTER_FILE_NAME)
        if counter == -1:
            counter = len(book_list)


def is_book(book_id):
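# The fileIO module imported above is project-specific and its implementation is not
# shown in these snippets. A minimal sketch of what the helpers called by setup() and
# shutdown() appear to do, inferred from the call sites and offered as an assumption
# rather than the project's actual code:
import os


class FileIO(object):

    @staticmethod
    def pathJoin(*parts):
        # join path components (thin wrapper around os.path.join)
        return os.path.join(*parts)

    @staticmethod
    def mkdir(directory):
        # create the directory if it does not already exist
        if not os.path.isdir(directory):
            os.makedirs(directory)

    @staticmethod
    def readAsString(filename):
        # return the file contents, or '' when the file is missing
        if not os.path.isfile(filename):
            return ''
        with open(filename) as f:
            return f.read()

    @staticmethod
    def readAsPosInt(filename):
        # return the file contents as a non-negative int, or -1 on any failure
        try:
            value = int(FileIO.readAsString(filename))
            return value if value >= 0 else -1
        except ValueError:
            return -1

    @staticmethod
    def overwrite(filename, data):
        # replace the file contents with str(data)
        with open(filename, 'w') as f:
            f.write(str(data))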
def main():
    print("Starting...")
    file = FileIO("./test.txt")
    vec = file.read_as_vector()
    print(vec)
from fileIO import FileIO
from preprocess import Preprocessing
from decisionTree import DecisionTree

if __name__ == '__main__':
    filename = 'house-votes-84.data.txt'
    fileio = FileIO()
    data = fileio.read_csv(filename)
    preprocessing = Preprocessing()
    preprocessing.assume_missing_values(data)
    for percent in range(3, 8):
        training_data, testing_data = preprocessing.split_into_training_and_testing(
            data, percent / float(10))
        attributes_number = len(training_data[0]) - 1
        decision_tree = DecisionTree()
        root_node = decision_tree.build(training_data)
        # decision_tree.print()
        # print("Classification: ")
        accuracy = 0
        for row in testing_data:
            classified = decision_tree.classify(row, decision_tree.root)
            classified.calc_percentages(len(testing_data))
            if classified.republicans_percent > 50.0 and row[0] == 'republican' or (
                    classified.democrats_percent > 50.0 and row[0] == 'democrat'):
                accuracy += 1
        accuracy = accuracy / float(len(testing_data))
        print("Accuracy using training data", percent / float(10) * 100, "% is: ", accuracy)
def __init__(self, *args, **kwargs):
    FileIO.__init__(self, *args, **kwargs)
    self.is_flat = False