def write_articles(self): #t0 = datetime.datetime.now() if len(self.articles.keys()) > 10000: rows = [] try: for article_id, data in self.articles.iteritems(): data['article_id'] = article_id keys = data.keys() keys.sort() row = [] for key in keys: row.append(data[key]) if key == 'category' and data[key] != None: self.fh_article_meta.write( '%s\t%s\t%s\n' % (article_id, data[key], data['title'])) rows.append(row) file_utils.write_list_to_csv(rows, self.fh_articles, newline=False) except Exception, error: print '''Encountered the following error while writing article data to %s: %s''' % (self.fh_articles, error) self.articles = {} self.fh_articles.flush()
def write_training_dataset(self, fh):
    '''
    Assemble this instance's training-dataset row and append it as a
    single CSV row to the open filehandle *fh*.

    The row starts with name and verified flag, is extended by
    add_clock_data() and active() (which operate on self), and ends
    with the dt value.
    '''
    self.data = [self.name, self.verified]
    self.add_clock_data()
    self.active()
    self.data.append(self.dt)
    file_utils.write_list_to_csv(self.data, fh)
def write_comments(self): try: for revision_id, comment in self.comments.iteritems(): file_utils.write_list_to_csv([revision_id, comment, '\n'], self.fh_comments, newline=False) except Exception, error: print '''Encountered the following error while writing comment data to %s: %s''' % (self.fh_comments, error)
def write_sorted_file(sorted_data, filename, rts):
    '''
    Write *sorted_data* as CSV rows to *filename* inside the rts.sorted
    target directory.

    The filehandle is now closed in a finally clause so it is released
    even when write_list_to_csv raises (the original leaked it on error).
    '''
    fh = file_utils.create_txt_filehandle(rts.sorted, filename, 'w', 'utf-8')
    try:
        file_utils.write_list_to_csv(sorted_data, fh)
    finally:
        fh.close()
def merge_sorted_files(target, files): ''' Merges smaller sorted files in one big file, Only used for creating data competition file. ''' fh = file_utils.create_txt_filehandle(target, 'kaggle.csv', 'w', 'utf-8') lines = 0 for line in heapq.merge(*[readline(filename) for filename in files]): file_utils.write_list_to_csv(line, fh) lines += 1 fh.close() print 'Total number of edits: %s ' % lines return fh.name
def to_csv(self, filename):
    '''
    Export this dataset to a CSV file in the configured dataset
    location: a header row first, then the data rows.

    The data write is guarded by an RLock (passed through to
    write_list_to_csv) and formatted according to self.format.
    '''
    rows = data_converter.convert_dataset_to_lists(self, 'manage')
    header = data_converter.add_headers(self)
    write_lock = RLock()
    fh = file_utils.create_txt_filehandle(
        settings.dataset_location, filename, 'w', 'utf-8')
    file_utils.write_list_to_csv(header, fh, recursive=False, newline=True)
    file_utils.write_list_to_csv(rows, fh, recursive=False, newline=True,
                                 format=self.format, lock=write_lock)
    fh.close()
def write_revisions(self): #t0 = datetime.datetime.now() file_ids = self.revisions.keys() while len(file_ids) > 0: fh_id = self.fhd.assign_filehandle(self.process_id) revisions = self.revisions.get(fh_id, []) fh = self.filehandles[fh_id] for revision in revisions: try: file_utils.write_list_to_csv(revision, fh) except Exception, error: print '''Encountered the following error while writing revision data to %s: %s''' % (fh, error) fh.flush() self.fhd.return_filehandle(fh_id) try: del self.revisions[fh_id] file_ids.remove(fh_id) except KeyError: pass
def create_edgelist(project, collection):
    '''
    Build a co-editing edgelist for *project* and write it to
    <project>_edgelist.csv in the dataset location.

    For every ordered editor pair (i, j) with i > j, the number of
    articles both editors touched is computed; pairs with at least one
    common article are written as a row [i, j, count].

    NOTE(review): this is O(n^2) database lookups over all editors and
    re-fetches author_j per pair — acceptable for the original use, but
    worth caching if editor counts grow.

    Fixes: `author_i != None` replaced with `is not None` (PEP 8), a
    guard-clause `continue` flattens the nesting, and the output
    filehandle is closed in a finally clause so it is released even if
    a lookup or write raises (the original leaked it on error).
    '''
    db = storage.init_database('mongo', project, collection)
    ids = db.retrieve_distinct_keys('editor')
    ids.sort()
    fh = file_utils.create_txt_filehandle(
        settings.dataset_location, '%s_edgelist.csv' % project, 'w', 'utf-8')
    try:
        for i in ids:
            author_i = db.find_one({'editor': i})
            if author_i is None:
                continue
            article_i = create_articles_set(author_i['edits'])
            for j in ids:
                if i > j:
                    author_j = db.find_one({'editor': j})
                    article_j = create_articles_set(author_j['edits'])
                    common = article_i.intersection(article_j)
                    if len(common) > 0:
                        file_utils.write_list_to_csv(
                            [i, j, len(common)], fh,
                            recursive=False, newline=True)
    finally:
        fh.close()