Пример #1
0
    def runSitemapCrawler(self):
        """Walk the XML sitemap tree and store every page URL it lists.

        Starting from ``self.sitemapURL``, each sitemap is fetched and handed
        to ``self.findNewLinksXML``. Returned links that contain ``.xml`` are
        treated as nested sitemaps and queued (except those that contain
        ``archive``); every other link is collected as a page URL. Once the
        queue is empty, ``self.crawledFile`` is passed to
        ``FileIO.deleteFileContents`` and the collected URLs are written to
        it with ``FileIO.setToFile``; the elapsed time is then logged.
        """
        startTime = time.time()
        headers = {
            'User-Agent':
            "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Mobile Safari/537.36"
        }
        xmlQueue = {self.sitemapURL}
        # Every sitemap URL that has ever been queued. Without this record,
        # two sitemaps that reference each other would be re-queued forever.
        seenXML = {self.sitemapURL}
        htmlQueue = set()
        log('sitemap', 'Crawling XML Sitemap for ' + self.siteName)

        while xmlQueue:
            # NOTE(review): no timeout is passed, so an unresponsive server
            # would block this loop indefinitely -- consider adding one.
            nextParse = requests.get(xmlQueue.pop(), headers=headers)
            for link in self.findNewLinksXML(nextParse):
                if '.xml' not in link:
                    htmlQueue.add(link)
                elif 'archive' not in link and link not in seenXML:
                    seenXML.add(link)
                    xmlQueue.add(link)

        FileIO.deleteFileContents(self.crawledFile)
        FileIO.setToFile(htmlQueue, self.crawledFile)
        log(
            'time', 'Finished crawling XML sitemap for ' + self.siteName +
            ' in ' + str(time.time() - startTime) + ' seconds')
Пример #2
0
    def runParser(self):
        """Parse every crawled link using a pool of worker threads.

        The index file is passed to ``FileIO.deleteFileContents`` first. If
        the crawled-links file is missing, or yields no links, an error is
        logged and ``self`` is returned. Otherwise ``self.MAX_THREADS``
        threads running ``self.parserWorker`` are created, started and
        joined, and ``self.saveLinkGraphs`` is called afterwards.
        """
        FileIO.deleteFileContents(self.indexFile)

        crawledFileExists = os.path.isfile(self.crawledFile)
        if not crawledFileExists:
            log('error', 'No crawled file.')
            return self

        self.links = FileIO.fileToSet(self.crawledFile)
        self.linksList = list(self.links)

        if not self.links:
            log('error', 'Crawled file is empty')
            return self

        # Build the whole pool before starting anything.
        workers = [
            Thread(name='parser_' + str(index), target=self.parserWorker)
            for index in range(self.MAX_THREADS)
        ]

        for worker in workers:
            worker.start()

        # Wait for every worker before the graphs are saved.
        for worker in workers:
            worker.join()

        self.saveLinkGraphs()
Пример #3
0
 def __init__(self, domain, threads, mode='DEV'):
     """Store the settings and load the saved link-analysis scores.

     domain  -- name used to locate files under ``domains/<domain>/``
     threads -- stored as ``MAX_THREADS``
     mode    -- stored as ``mode``; defaults to ``'DEV'``
     """
     self.domain = domain
     self.mode = mode
     self.MAX_THREADS = threads
     self.buildQueue = []
     # Plain boolean flags, both initially True.
     self.readSemaphore = True
     self.invertedIndexSemaphore = True

     # Both score files share the ``domains/<domain>/<domain>`` prefix.
     filePrefix = 'domains/' + self.domain + '/' + self.domain

     self.hubAuthFile = filePrefix + "_HubAuth.json"
     self.hubAuthScores = FileIO.readJsonFile(self.hubAuthFile)

     self.pageRankFile = filePrefix + "_pageRank.json"
     self.pageRanks = FileIO.readJsonFile(self.pageRankFile)
Пример #4
0
 def saveConfigFile(self):
     '''
     Menu action: ask where to save the config file and write it there.

     Nothing happens if the dialog yields no file. Otherwise the file is
     written through FileIO and ``self.lastdir`` is set to the value that
     ``self.dirs.getParentDir`` returns for the chosen file.
     '''
     target = QFileDialog.getSaveFileName(
         self, 'Select where the config file should be saved', self.lastdir)
     if target:
         FileIO(target).writeFile(self, ParameterWrap)
         self.lastdir = self.dirs.getParentDir(str(target))
Пример #5
0
 def saveConfigFile(self):
     '''
     Handler for the "save config file" menu item.

     Opens a save dialog starting in ``self.lastdir``; when a file is
     chosen, writes the configuration to it via FileIO and stores the
     result of ``self.dirs.getParentDir`` for that file in
     ``self.lastdir``.
     '''
     destination = QFileDialog.getSaveFileName(
         self, 'Select where the config file should be saved', self.lastdir)
     if destination:
         writer = FileIO(destination)
         writer.writeFile(self, ParameterWrap)
         self.lastdir = self.dirs.getParentDir(str(destination))
Пример #6
0
def setup():
    """Load the stored books and counter, when there is anything stored."""

    global counter

    saved_books = fileIO.readAsString(BOOKS_FILE_NAME)
    if len(saved_books) != 0:
        make_book_list(saved_books)

    saved_counter = fileIO.readAsPosInt(COUNTER_FILE_NAME)
    # A stored value of -1 means there is no usable counter: fall back to
    # the number of books currently in the list.
    counter = len(book_list) if saved_counter == -1 else saved_counter
Пример #7
0
 def loadConfigFile(self):
     '''
     Menu action: ask the user for a config file and load it.

     Nothing happens if the dialog yields no file. Otherwise the file is
     loaded through FileIO, ``self.dirs.inputdir`` is refreshed from
     ``self.inputdirectory.text()`` and ``self.lastdir`` is set to the
     value ``self.dirs.getParentDir`` returns for the chosen file.
     '''
     chosen = QFileDialog.getOpenFileName(
         self, 'Select where the config file is located', self.lastdir)
     if chosen:
         FileIO(chosen).loadFile(self, ParameterWrap)

         self.dirs.inputdir = self.inputdirectory.text()
         self.dirs.initSinDirectory()
         self.lastdir = self.dirs.getParentDir(str(chosen))
Пример #8
0
 def __init__(self, siteName, baseURL):
     """Set up the crawler state and file locations for one site.

     siteName -- name used for the ``domains/<siteName>/`` file prefix
     baseURL  -- stored as ``self.baseURL``
     """
     self.siteName = siteName
     self.baseURL = baseURL
     FileIO.createSiteFileSetup(self.siteName, self.baseURL)

     # Every per-site file lives under ``domains/<siteName>/<siteName>_*``.
     filePrefix = 'domains/' + siteName + '/' + siteName
     self.queueFile = filePrefix + '_queue.txt'
     self.crawledFile = filePrefix + '_crawled.txt'

     self.queue = set()
     self.crawled = set()
     self.numCrawled = 0

     self.outlinkGraph = Graph()
     self.inlinkGraph = Graph()
     self.inlinkGraphFile = filePrefix + '_inlinks.json'
     self.outlinkGraphFile = filePrefix + '_outlinks.json'

     # Looked up last, once the attributes above are in place.
     self.sitemapURL = self.findXMLSitemap()
Пример #9
0
    def loadConfigFile(self):
        '''
        Handler for the "load config file" menu item.

        Opens a file dialog starting in ``self.lastdir``. When a file is
        chosen it is loaded via FileIO, the input directory is re-read from
        ``self.inputdirectory.text()`` and ``self.lastdir`` is updated with
        the result of ``self.dirs.getParentDir`` for that file.
        '''
        selected = QFileDialog.getOpenFileName(
            self, 'Select where the config file is located', self.lastdir)
        if selected:
            reader = FileIO(selected)
            reader.loadFile(self, ParameterWrap)

            self.dirs.inputdir = self.inputdirectory.text()
            self.dirs.initSinDirectory()
            self.lastdir = self.dirs.getParentDir(str(selected))
Пример #10
0
    def build(self):
        """Load the domain's index file and add its documents to the collection.

        Each entry of ``domains/<domain>/<domain>_index.txt`` (read with
        ``FileIO.readJsonFile``) is passed to ``addDocumentToCollection``
        together with its PageRank and hub/authority values, and its body is
        then fed to ``buildInvertedIndex``. A missing title is replaced by
        ``'No Title'``. In ``'DEV'`` mode only the first five entries are
        processed.
        """
        filePath = 'domains/' + self.domain + '/' + self.domain + "_index.txt"
        rawData = FileIO.readJsonFile(filePath)

        for count, (entry, doc) in enumerate(rawData.items(), start=1):
            if doc['title'] is None:
                doc['title'] = 'No Title'
            title = doc['title']

            # NOTE(review): PageRank is looked up by URL but the hub/authority
            # pair is looked up by title -- confirm that is the intended key.
            self.addDocumentToCollection(
                url=entry,
                title=title,
                body=doc['body'],
                description=doc['description'],
                pageRank=self.pageRanks[entry],
                hub=self.hubAuthScores[title][0],
                authority=self.hubAuthScores[title][1])
            self.buildInvertedIndex(doc['body'], entry)

            # Development runs stop after a small sample.
            if self.mode == 'DEV' and count >= 5:
                break
Пример #11
0
 def search(self):
     """Scan the index files in ``search_index`` and collect matching entries.

     Files ending in ``.json`` or ``.txt`` are read through ``FileIO``. For
     each element of their ``"indices"`` list the ``"word_index"`` value is
     printed, and the element is appended to ``self.results`` when
     ``self.check_words`` returns a value greater than zero for it.
     """
     path = 'search_index'
     for fileName in os.listdir(path):
         if not fileName.endswith((".json", ".txt")):
             continue
         indices = FileIO(path + '/' + fileName).read_file()["indices"]
         for entry in indices:
             # print() with a single argument behaves the same on Python 2
             # and Python 3; the bare print statement only parses on 2.
             print(entry["word_index"])
             if self.check_words(entry["word_index"]) > 0:
                 self.results.append(entry)
Пример #12
0
 def record(self):
     """Toggle video recording on the camera.

     While a recording is running it is stopped, the button and action
     captions are reset and the saved file name is reported through
     ``write_text``. Otherwise a new file name is requested from
     ``FileIO.get_filename`` and recording starts with
     ``self.video_codec``.
     """
     if not self.camera.is_recording:
         self.video_filename = FileIO.get_filename(
             self.filename_rule, self.video_suffix, self.parent_dir)
         self.camera.start_recording(self.video_filename, self.video_codec)
         self.rec_button.setText('Stop rec')
         self.rec_act.setText('Stop record')
         return

     # A recording is in progress: finish it and restore the captions.
     self.camera.stop_recording()
     self.rec_button.setText('&Rec')
     self.rec_act.setText('&Record')
     self.write_text("save : {}".format(self.video_filename))
Пример #13
0
 def __init__(self, siteName, baseURL, threads):
     """Set up the parser state and file locations for one site.

     siteName -- name used for the ``domains/<siteName>/`` file prefix
     baseURL  -- stored as ``self.baseURL``
     threads  -- stored as ``self.MAX_THREADS``
     """
     self.siteName = siteName
     self.baseURL = baseURL

     # Every per-site file lives under ``domains/<siteName>/<siteName>_*``.
     filePrefix = 'domains/' + siteName + '/' + siteName
     self.crawledFile = filePrefix + '_crawled.txt'
     self.indexFile = FileIO.createSiteIndexFile(self.siteName)

     self.links = set()
     self.linksList = None
     # Plain boolean flags, both initially True.
     self.readSemaphore = True
     self.writeSemaphore = True
     self.MAX_THREADS = threads

     self.inlinkGraph = Graph()
     self.outlinkGraph = Graph()
     self.inlinkGraphFile = filePrefix + '_inlinks.json'
     self.outlinkGraphFile = filePrefix + '_outlinks.json'
Пример #14
0
    def parserWorker(self):
        """Take batches of links off ``self.linksList``, extract them, and write them out.

        Repeats until ``self.linksList`` is empty: removes a batch from the
        end of the list, runs ``extractData`` on each link, records the
        discovered links through ``self.addNewLinksToGraphs``, and writes one
        text record per link (link, title, description, body) to
        ``self.indexFile`` through ``FileIO.writeToFile``.

        NOTE(review): ``readSemaphore`` and ``writeSemaphore`` are ordinary
        attributes that are tested in a loop and then set in a separate
        statement. If several threads run this method at once, two of them
        can leave the wait loop together; the busy-wait also spins without
        yielding. A ``threading.Lock`` created once and shared by the workers
        would make both sections exclusive.
        """
        buffer = []
        while (len(self.linksList) > 0):
            # Spin until the read flag is raised, then lower it for the
            # duration of the list manipulation below.
            while (not self.readSemaphore):
                pass

            self.readSemaphore = False
            # When the list is longer than MAX_BUFFER_LEN the slice covers the
            # last MAX_BUFFER_LEN + 1 entries; otherwise it covers the whole
            # list.
            start = len(self.linksList) - DataParser.MAX_BUFFER_LEN - 1 if len(
                self.linksList) > DataParser.MAX_BUFFER_LEN else 0
            end = len(self.linksList)
            toParse = self.linksList[start:end]
            del self.linksList[start:end]
            self.readSemaphore = True

            for link in toParse:
                # obj is read below through the keys 'link', 'newLinks',
                # 'title', 'description' and 'body'.
                obj = extractData(link)

                self.addNewLinksToGraphs(obj['link'], obj['newLinks'])

                buffer.append('link: ' + link + '\n')

                # A missing title falls back to the site name.
                title = obj['title'] if obj['title'] != None else self.siteName
                buffer.append('title: ' + title + '\n')

                # NOTE(review): unlike the title, a None description is not
                # handled here and would make this concatenation fail.
                buffer.append('description: ' + obj['description'] + '\n', )

                # Collapse newlines and runs of whitespace into single spaces
                # so the body occupies exactly one line of the record.
                beforeCleanupBody = obj['body'].replace('\n', ' ')
                afterCleanupBody = ' '.join(beforeCleanupBody.split())
                buffer.append('body: ' + afterCleanupBody + '\n\n')

            # Same flag protocol as above, this time around the file write.
            while (not self.writeSemaphore):
                pass
            self.writeSemaphore = False
            FileIO.writeToFile(self.indexFile, "".join(buffer))
            self.writeSemaphore = True
            # Empty the buffer in place before the next batch.
            buffer[:] = []
Пример #15
0
def shutdown():
    """Persist the books and the current counter value, each to its own file."""

    # Build the payload first, before anything on disk is touched.
    serialized_books = make_output_data()

    # The data directory is created before either file is written.
    fileIO.mkdir(DATA_DIR)

    # Books first, then the counter.
    fileIO.overwrite(BOOKS_FILE_NAME, serialized_books)
    fileIO.overwrite(COUNTER_FILE_NAME, counter)
Пример #16
0
    def run(self):
        """Extract the text of every URL object and store it in the search index.

        For each item of ``self.url_list`` the extracted text is saved under
        its ``'word_index'`` key. The item is then handed to a ``FileIO`` for
        the index file named by ``data.index_file``: ``update_file`` when
        that file already exists, ``create_file`` otherwise.
        """
        for url_object in self.url_list:
            url_object['word_index'] = extraction.Extraction(url_object).get_text()

            index_path = '../search_index/' + data.index_file
            # Check for the file before the FileIO object is constructed.
            already_exists = os.path.isfile(index_path)
            index_io = FileIO(index_path)
            if already_exists:
                index_io.update_file(url_object)
            else:
                index_io.create_file(url_object)
Пример #17
0
    def save_frame(self):
        """Save the frame on the window as an image, plus a CSV of parameters.

        With the ``"Manual"`` filename rule the name comes from
        ``save_frame_manual`` (nothing is saved when no name was chosen);
        otherwise ``FileIO.get_filename`` generates it. The parameter file
        uses the image path with its final extension replaced by ``.csv``
        and holds one ``name,value`` line per entry of
        ``self.current_params``.

        Returns None.
        """
        import os  # local import: only needed to derive the parameter file name

        if self.filename_rule == "Manual":
            self.save_frame_manual()
            if not self.filename:
                return None
        else:
            self.filename = FileIO.get_filename(self.filename_rule, self.image_suffix, self.parent_dir)

        # Swap only the final extension. Replacing from the first dot, or
        # replacing every occurrence of the suffix text, breaks on paths that
        # contain another dot or the suffix elsewhere, and leaves a name
        # without any extension unchanged -- so the CSV would overwrite the
        # image that was just saved.
        prm = os.path.splitext(str(self.filename))[0] + ".csv"

        if not self.dst.exists():
            self.dst.mkdir(parents=True)
        im = Image.fromarray(self.camera.frame)
        im.save(self.filename)

        # make a parameter file
        with open(prm, "w") as f:
            for name, param in self.current_params.items():
                f.write("{},{}\n".format(name, param["value"]))

        self.write_text("{:<10}: {}".format("save image", self.filename))
        self.write_text("{:<10}: {}".format("save param", prm))
Пример #18
0
 def runParser(self):
     """Extract data for crawled links that are not yet in the index file.

     Logs an error and returns ``self`` when the crawled-links file is
     missing or yields no links. Otherwise the index is read as JSON, every
     link without an entry gets one (with the next ``DataParser.docId``),
     and the index file is emptied and written again.
     """
     crawledFileMissing = not os.path.isfile(self.crawledFile)
     if crawledFileMissing:
         log('error', 'No crawled file.')
         return self

     self.links = FileIO.fileToSet(self.crawledFile)
     if not self.links:
         log('error', 'Crawled file is empty')
         return self

     data = FileIO.readJsonFile(self.indexFile)
     for link in self.links:
         if link in data:
             # Already indexed; nothing to extract.
             continue
         extracted = extractData(link)
         data[link] = {
             'docId': DataParser.docId,
             'title': extracted['title'],
             'body': extracted['body']
         }
         DataParser.docId += 1

     FileIO.deleteFileContents(self.indexFile)
     FileIO.writeJsonFile(data, self.indexFile)
Пример #19
0
 def saveLinkGraphs(self):
     """Write the nodes of the outlink graph, then the inlink graph, as JSON."""
     graphsAndFiles = (
         (self.outlinkGraph, self.outlinkGraphFile),
         (self.inlinkGraph, self.inlinkGraphFile),
     )
     for graph, targetFile in graphsAndFiles:
         FileIO.writeJsonFile(graph.nodes, targetFile)
Пример #20
0
def getUrls(file):
    """Return what ``fileRead`` of a fresh ``FileIO`` instance yields for *file*."""
    return FileIO().fileRead(file)
Пример #21
0
    def runSpider(self, iterations):
        """Crawl the queued links for a number of rounds and save the link graphs.

        Each round reloads ``self.queue`` and ``self.crawled`` from their
        files and crawls every queued link with ``self.crawlPage``. The queue
        file is then emptied and rewritten with the links returned in that
        round, and the links just crawled are passed to ``FileIO.setToFile``
        for the crawled file. After the last round both link graphs are
        written as JSON and the total runtime is logged.

        iterations -- number of rounds to run
        """
        startTime = time.time()
        for _ in range(iterations):
            self.queue = FileIO.fileToSet(self.queueFile)
            self.crawled = FileIO.fileToSet(self.crawledFile)

            newLinks = set()
            newCrawledLinks = set()

            while self.queue:
                nextLink = self.queue.pop()
                res = self.crawlPage(nextLink)
                newCrawledLinks.add(nextLink)
                # Grow the set in place instead of rebuilding it per link.
                newLinks.update(res)

            FileIO.deleteFileContents(self.queueFile)
            FileIO.setToFile(newLinks, self.queueFile)
            FileIO.setToFile(newCrawledLinks, self.crawledFile)

        FileIO.writeJsonFile(self.outlinkGraph.nodes, self.outlinkGraphFile)
        FileIO.writeJsonFile(self.inlinkGraph.nodes, self.inlinkGraphFile)

        # The unit is separated from the number by a space; the message used
        # to read "...<runtime>seconds.".
        log(
            'time', "Crawler for " + self.siteName +
            " execution Finished. Runtime: " + str(time.time() - startTime) +
            " seconds. Total links crawled: " + str(self.numCrawled))
Пример #22
0
 def multiple_search(self, file):
     """Collect the matching entries of one index file into ``self.results``.

     file -- name of a file inside the ``search_index`` directory

     An entry matches when ``self.check_words`` returns a value greater
     than zero for its ``"word_index"``.
     """
     index_path = 'search_index' + '/' + file
     for entry in FileIO(index_path).read_file()["indices"]:
         if self.check_words(entry["word_index"]) > 0:
             self.results.append(entry)
Пример #23
0
 def __init__(self, *args, **kwargs):
   FileIO.__init__(self, *args, **kwargs)
   self.data_list = []
   self.stemmer = PorterStemmer() # correct syntax?
   self.score_map = 
   self.ranges = 
from os import path

intention = sys.argv[1]
midi_file_name = sys.argv[2]
name = None

if intention == "-hide":
    message_file_name = sys.argv[3]
    if path.isfile(midi_file_name):
        if path.isfile(message_file_name):
            parts = midi_file_name.split(".")
            extension = parts[-1]
            if extension != "mid":
                print "The file must have a .mid extension"
            else:
                fileIO = FileIO()
                message = fileIO.get_text_from(message_file_name)
                hider = Hider()
                hider.hide(midi_file_name, message)
                print "The output file name will be: " "secret_in_" + midi_file_name
        else:
            print "You must put the message file in the same directory as midistegano.py"

    else:
        print "You must put the .mid file in the same directory as run.py"

elif intention == "-reveal":
    if path.isfile(midi_file_name):
        parts = midi_file_name.split(".")
        name = parts[0]
        extension = parts[-1]
Пример #25
0
 def __init__(self, siteName):
     """Remember the site name and work out its crawled/index file locations.

     siteName -- name used for the ``domains/<siteName>/`` file prefix
     """
     self.siteName = siteName
     siteDir = 'domains/' + siteName + '/'
     self.crawledFile = siteDir + siteName + '_crawled.txt'
     self.indexFile = FileIO.createSiteIndexFile(self.siteName)
     self.links = set()
Пример #26
0
 def __init__(self, *args, **kwargs):
     """Run FileIO's initialiser with the given arguments, then start with an empty ``data_list``."""
     FileIO.__init__(self, *args, **kwargs)
     self.data_list = list()
Пример #27
0
from book import Book
from fileIO import FileIO as fileIO
from datetime import date
from pprint import pprint  # debugging tool
import json

# Directory name passed to fileIO.pathJoin for both storage files below.
DATA_DIR = 'data'
# Storage locations for the book data and for the counter value.
BOOKS_FILE_NAME = str(fileIO.pathJoin(DATA_DIR, 'wishlist.txt'))
COUNTER_FILE_NAME = str(fileIO.pathJoin(DATA_DIR, 'counter.txt'))

separator = '^^^'  # a string probably not in any valid data relating to a book

# Module-level state. setup() replaces counter with the stored value, or with
# len(book_list) when the stored value is -1.
book_list = []
counter = 0


def setup():
    """Restore the book list and the counter from their files, if present."""

    global counter

    stored_text = fileIO.readAsString(BOOKS_FILE_NAME)
    if len(stored_text) != 0:
        make_book_list(stored_text)

    stored_counter = fileIO.readAsPosInt(COUNTER_FILE_NAME)
    if stored_counter == -1:
        # No usable counter was read: use the number of books instead.
        counter = len(book_list)
    else:
        counter = stored_counter


def is_book(book_id):
Пример #28
0
 def __init__(self, *args, **kwargs):
     """Forward every argument to FileIO's initialiser and create an empty ``data_list``."""
     FileIO.__init__(self, *args, **kwargs)
     self.data_list = list()
Пример #29
0
def main():
    """Print a start-up message, then the vector read from ``./test.txt``."""
    print("Iniciando...")
    vec = FileIO("./test.txt").read_as_vector()
    print(vec)
Пример #30
0
from fileIO import FileIO
from preprocess import Preprocessing
from decisionTree import DecisionTree


if __name__ == '__main__':
    # Build a decision tree on the voting-records file for split fractions
    # 0.3 through 0.7 and print the share of test rows whose party label
    # (column 0) matches the majority class of the node they end up in.
    filename = 'house-votes-84.data.txt'
    fileio = FileIO()
    data = fileio.read_csv(filename)

    preprocessing = Preprocessing()
    preprocessing.assume_missing_values(data)
    for percent in range(3, 8):
        fraction = percent / float(10)
        training_data, testing_data = preprocessing.split_into_training_and_testing(data, fraction)
        decision_tree = DecisionTree()
        # The tree is queried through decision_tree.root below, so the value
        # returned by build() is not kept.
        decision_tree.build(training_data)

        correct = 0
        for row in testing_data:
            classified = decision_tree.classify(row, decision_tree.root)
            classified.calc_percentages(len(testing_data))
            if (classified.republicans_percent > 50.0 and row[0] == 'republican') or (
                    classified.democrats_percent > 50.0 and row[0] == 'democrat'):
                correct += 1

        accuracy = correct / float(len(testing_data))
        print("Accuracy using training data", fraction * 100, "% is: ", accuracy)
Пример #31
0
 def __init__(self, *args, **kwargs):
     """Run FileIO's initialiser with the given arguments and set ``is_flat`` to False."""
     FileIO.__init__(self, *args, **kwargs)
     self.is_flat = False