def _get_and_link_resource(asset_path):
    """Store the fetched asset on disk and record which URL it came from.

    The response body is written to a file under ``asset_path`` whose name
    is a random integer below 1000000.  The ``url -> file path`` pair is
    then added to ``self.linker`` and that mapping is handed to
    ``dump_as_json`` together with ``self.linker_path``.

    ``self``, ``response`` and ``url`` are not parameters; they are free
    names that the surrounding scope must provide.

    NOTE(review): the random file name is not checked for collisions, so an
    existing asset file could be overwritten -- confirm this is acceptable.

    :param asset_path: path under which the asset file is created.
    :return: always the tuple ``("", [])``.
    """
    file_name = str(randrange(1000000))
    destination = join(asset_path, file_name)

    # The file is opened before the response is read, as in the original.
    with open(destination, "wb") as asset_file:
        asset_file.write(response.read())

    self.linker[url] = destination
    dump_as_json(self.linker, self.linker_path)
    return ("", [])
def handle_data(self, tag):
    """Accumulate one chunk of text under the current base URL.

    Chunks made up only of whitespace are ignored.  Any other chunk is
    stored in ``self.sites_content[self.base_url]``, joined to earlier text
    for the same URL with a single space.  When the size reported for the
    buffer exceeds 1000 bytes, the buffer is handed to ``dump_as_json``
    together with ``self.raw_data`` and replaced by a fresh empty dict.

    :param tag: text chunk to record.
    :return: None
    """
    # Raw-string pattern.  The previous "[\s+]" relied on an invalid string
    # escape and, being a character class, also removed literal "+"
    # characters, so a chunk such as "+" was wrongly treated as empty.
    if not re.sub(r"\s+", "", tag):
        return

    if self.base_url in self.sites_content:
        self.sites_content[self.base_url] += " " + tag
    else:
        self.sites_content[self.base_url] = tag

    # NOTE(review): sys.getsizeof is shallow -- it measures the dict object
    # itself, not the strings stored in it -- so this threshold follows the
    # number of URLs rather than the amount of buffered text.
    if sys.getsizeof(self.sites_content) > 10e2:
        dump_as_json(self.sites_content, self.raw_data)
        self.sites_content = {}
def set_freq(self):
    """Count, per website, the words that qualify for the frequency table.

    For every ``site -> text`` pair in ``self.words_by_site`` the text is
    split on single spaces.  Each word that is longer than three characters
    and contains no ``string.punctuation`` character is counted in
    ``self.word_frequency[site]``; any existing entry for the site is
    replaced by a fresh counter first.  The resulting mapping is passed to
    ``dump_as_json`` together with ``self.freqs_file_path``.

    :return: ``self.word_frequency``, mapping each site to a
        ``defaultdict(int)`` of word counts.
    """
    punctuation = frozenset(string.punctuation)

    for site, text in self.words_by_site.items():
        counts = defaultdict(int)
        self.word_frequency[site] = counts
        for token in text.split(" "):
            # Keep only longer words that are free of punctuation.
            if len(token) > 3 and punctuation.isdisjoint(token):
                counts[token] += 1

    dump_as_json(self.word_frequency, self.freqs_file_path)
    return self.word_frequency
def persist_reports(self, new_reports):
    """Merge new reports into the stored ones and write the validated result.

    The existing payload is read from 'data.json' with ``load_as_json``; if
    that raises ``ValueError`` an empty reports list is used instead.  The
    entries of ``new_reports['reports']`` are appended, the combined list is
    passed through ``self.validate`` and the result is handed to
    ``dump_as_json`` for 'data.json' with mode 'w'.

    NOTE(review): only ``ValueError`` is handled; whether ``load_as_json``
    raises a different error for a missing file is not visible here.
    NOTE(review): what gets written is whatever ``self.validate`` returns,
    while the next load indexes the payload with 'reports' -- confirm that
    ``validate`` returns a mapping of that shape.

    :param new_reports: mapping whose 'reports' key holds the reports to add.
    :return: None
    """
    store = 'data.json'

    try:
        stored = load_as_json(store)
    except ValueError:
        stored = {'reports': []}

    merged = stored['reports']
    merged.extend(new_reports['reports'])

    dump_as_json(self.validate(merged), store, 'w')
def persist_table(self):
    """Hand ``self.table`` to ``dump_as_json`` for 'league_table.json' in mode 'w'.

    :return: None
    """
    target = 'league_table.json'
    dump_as_json(self.table, target, 'w')