示例#1
0
 def write_articles(self):
     '''
     Flushes the buffered article data to disk once the buffer holds more
     than 10,000 articles; a smaller buffer is left untouched.

     Each buffered article becomes one row whose values are ordered by
     sorted key name, after the article id has been stored in the article's
     dict under 'article_id'. For every article with a non-None 'category'
     a tab-separated (article_id, category, title) line is also written to
     self.fh_article_meta. All rows are handed to
     file_utils.write_list_to_csv in a single call.

     The buffer is emptied and self.fh_articles is flushed afterwards, even
     when writing failed: errors are reported on stdout, not raised.
     '''
     if len(self.articles) <= 10000:
         return
     rows = []
     try:
         for article_id, data in self.articles.items():
             data['article_id'] = article_id
             row = []
             # Sorted keys give every row the same column order.
             for key in sorted(data):
                 row.append(data[key])
                 if key == 'category' and data[key] is not None:
                     self.fh_article_meta.write(
                         '%s\t%s\t%s\n' %
                         (article_id, data[key], data['title']))
             rows.append(row)
         file_utils.write_list_to_csv(rows,
                                      self.fh_articles,
                                      newline=False)
     except Exception as error:
         # Deliberate best effort: report the failure and drop the batch.
         # The message text (including its line break) is kept verbatim.
         print('Encountered the following error while writing article \n'
               '                 data to %s: %s' % (self.fh_articles, error))
     self.articles = {}
     self.fh_articles.flush()
示例#2
0
 def write_training_dataset(self, fh):
     '''
     Rebuilds self.data from scratch and writes it to fh through
     file_utils.write_list_to_csv.

     The record starts with self.name and self.verified. add_clock_data()
     and active() are called next (presumably they extend self.data or
     update state used below -- confirm in their definitions), and self.dt
     is appended last.
     '''
     self.data = []
     self.data.extend((self.name, self.verified))
     self.add_clock_data()
     self.active()
     record = self.data
     record.append(self.dt)
     file_utils.write_list_to_csv(record, fh)
示例#3
0
 def write_comments(self):
     '''
     Writes every buffered revision comment to self.fh_comments.

     Each (revision_id, comment) pair from self.comments is passed to
     file_utils.write_list_to_csv as a three-item list that ends with a
     newline string, with newline=False.

     Errors are reported on stdout, not raised; the first failure stops
     the remaining comments from being written.
     '''
     try:
         for revision_id, comment in self.comments.items():
             file_utils.write_list_to_csv([revision_id, comment, '\n'],
                                          self.fh_comments,
                                          newline=False)
     except Exception as error:
         # Deliberate best effort; message text is kept verbatim.
         print('Encountered the following error while writing comment data \n'
               '             to %s: %s' % (self.fh_comments, error))
示例#4
0
def write_sorted_file(sorted_data, filename, rts):
    '''
    Writes the sorted data to the target file.

    The file is created in write mode with utf-8 encoding through
    file_utils.create_txt_filehandle, using rts.sorted as the location and
    filename as the name. The handle is closed even when writing fails.
    '''
    fh = file_utils.create_txt_filehandle(rts.sorted,
                                          filename,
                                          'w',
                                          'utf-8')
    try:
        file_utils.write_list_to_csv(sorted_data, fh)
    finally:
        # Always release the handle so a failed write does not leak it.
        fh.close()
示例#5
0
def merge_sorted_files(target, files):
    '''
    Merges smaller sorted files into one big file. Only used for creating
    the data competition file.

    Every file in files is read through readline() and the resulting
    streams are combined with heapq.merge, so each input must already be
    sorted. The merged lines are written to 'kaggle.csv' in target.

    Prints the number of lines written and returns the name of the output
    file. The output handle is closed even when merging or writing fails.
    '''
    fh = file_utils.create_txt_filehandle(target, 'kaggle.csv', 'w', 'utf-8')
    lines = 0
    try:
        for line in heapq.merge(*[readline(filename) for filename in files]):
            file_utils.write_list_to_csv(line, fh)
            lines += 1
    finally:
        # Always release the handle so a failed merge does not leak it.
        fh.close()
    print('Total number of edits: %s ' % lines)
    return fh.name
示例#6
0
 def to_csv(self, filename):
     '''
     Exports this dataset to a csv file named filename in
     settings.dataset_location.

     The headers produced by data_converter.add_headers are written first,
     followed by the rows produced by
     data_converter.convert_dataset_to_lists; the rows are written with
     self.format and a fresh RLock passed to the writer. The file handle is
     closed even when one of the writes fails.
     '''
     data = data_converter.convert_dataset_to_lists(self, 'manage')
     headers = data_converter.add_headers(self)
     lock = RLock()
     fh = file_utils.create_txt_filehandle(settings.dataset_location,
                                           filename, 'w', 'utf-8')
     try:
         file_utils.write_list_to_csv(headers,
                                      fh,
                                      recursive=False,
                                      newline=True)
         file_utils.write_list_to_csv(data,
                                      fh,
                                      recursive=False,
                                      newline=True,
                                      format=self.format,
                                      lock=lock)
     finally:
         # Always release the handle so a failed write does not leak it.
         fh.close()
示例#7
0
 def write_revisions(self):
     '''
     Writes all buffered revisions to their file handles and removes them
     from self.revisions.

     The loop keeps asking self.fhd for a file handle id until every id
     that was buffered when the call started has been written. For each
     assigned id, the buffered revisions (if any) are written to
     self.filehandles[id], the handle is flushed and then handed back to
     self.fhd. An id that has no buffered revisions is simply returned.

     A failure to write a single revision is reported on stdout and does
     not stop the remaining revisions from being written.
     '''
     # Snapshot of the pending ids; a real list so .remove() is available
     # regardless of what keys() returns.
     file_ids = list(self.revisions.keys())
     while file_ids:
         fh_id = self.fhd.assign_filehandle(self.process_id)
         try:
             revisions = self.revisions.get(fh_id, [])
             fh = self.filehandles[fh_id]
             for revision in revisions:
                 try:
                     file_utils.write_list_to_csv(revision, fh)
                 except Exception as error:
                     # Deliberate best effort; message text kept verbatim.
                     print('Encountered the following error while writing \n'
                           '                         revision data to %s: %s'
                           % (fh, error))
             fh.flush()
         finally:
             # Hand the handle back even if the lookup or flush failed, so
             # it is not left marked as in use.
             self.fhd.return_filehandle(fh_id)
         try:
             del self.revisions[fh_id]
             file_ids.remove(fh_id)
         except KeyError:
             # The assigned id had nothing buffered; keep looping.
             pass
示例#8
0
def create_edgelist(project, collection):
    '''
    Writes an edge list of editors that have edited at least one article
    in common to '<project>_edgelist.csv' in settings.dataset_location.

    Every pair of editor ids (i, j) with i > j is compared once. The
    article sets of both editors are built with create_articles_set from
    their 'edits' field, and a row [i, j, number of shared articles] is
    written when the sets overlap. Editors for which no document is found
    are skipped. The file handle is closed even when an error occurs.
    '''
    db = storage.init_database('mongo', project, collection)
    ids = sorted(db.retrieve_distinct_keys('editor'))
    fh = file_utils.create_txt_filehandle(settings.dataset_location,
                                          '%s_edgelist.csv' % project, 'w',
                                          'utf-8')
    try:
        for i in ids:
            author_i = db.find_one({'editor': i})
            if author_i is None:
                continue
            article_i = create_articles_set(author_i['edits'])
            for j in ids:
                if not i > j:
                    # Only the (larger, smaller) ordering of a pair is used,
                    # so each pair is compared exactly once.
                    continue
                # NOTE(review): editor j is looked up again for every i;
                # caching the article sets would avoid the repeated
                # queries if memory allows.
                author_j = db.find_one({'editor': j})
                if author_j is None:
                    # Same guard as for editor i: skip missing documents.
                    continue
                article_j = create_articles_set(author_j['edits'])
                common = article_i.intersection(article_j)
                if len(common) > 0:
                    file_utils.write_list_to_csv([i, j, len(common)],
                                                 fh,
                                                 recursive=False,
                                                 newline=True)
    finally:
        # Always release the handle so a failed run does not leak it.
        fh.close()