Example #1
    def run(self):
        '''
        Called by the launcher; pulls filenames from the task queue and
        sorts each file until a poison pill (None) is received.
        '''
        while True:
            try:
                filename = self.tasks.get(block=False)
                self.tasks.task_done()
                if filename is None:
                    # Poison pill: propagate it and stop.
                    self.result.put(None)
                    break
                elif filename.startswith('comments') or \
                        filename.startswith('article'):
                    # Skip the comments_* and article_* helper files.
                    continue
                fh = file_utils.create_txt_filehandle(self.rts.txt,
                                                      filename,
                                                      'r',
                                                      'utf-8')
                data = file_utils.read_unicode_text(fh)
                fh.close()
                # Split each line into its tab-separated fields.
                data = [d.strip().split('\t') for d in data]
                sorted_data = mergesort(data)
                write_sorted_file(sorted_data, filename, self.rts)
                self.result.put(True)
            except Empty:
                # The queue is momentarily empty; keep polling.
                pass
            except UnicodeDecodeError, error:
                print 'Error: %s, (%s)' % (error, filename)
            except MemoryError, error:
                print 'Error: %s, (%s)' % (error, filename)
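The write_sorted_file helper appears as Example #7 below; mergesort itself is not shown on this page. A minimal pure-Python sketch that sorts the split rows lexicographically might look like this:

def mergesort(data):
    # Classic top-down merge sort over a list of rows; rows (lists of
    # unicode fields) compare lexicographically, field by field.
    if len(data) <= 1:
        return data
    mid = len(data) // 2
    left = mergesort(data[:mid])
    right = mergesort(data[mid:])
    result = []
    i = j = 0
    while i < len(left) and j < len(right):
        if left[i] <= right[j]:
            result.append(left[i])
            i += 1
        else:
            result.append(right[j])
            j += 1
    result.extend(left[i:])
    result.extend(right[j:])
    return result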
Example #2
def store_json_diffs(rts):
    files = os.listdir(rts.diffs)
    db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
    buffer = cStringIO.StringIO()

    for filename in files:
        fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r',
                                              'utf-8')
        for line in fh:
            # A blank line or a line starting with 'Start' marks the end
            # of the current JSON record.
            if line.startswith('\n') or line.startswith('Start'):
                obj = buffer.getvalue()
                if obj != '':
                    obj = json.loads(obj)
                    obj[0]['article_id'] = int(obj[0]['article_id'])
                    # Coerce the timestamp string into a datetime instance.
                    for key, value in obj[0].iteritems():
                        if key == 'timestamp':
                            value = datetime.strptime(value,
                                                      '%Y-%m-%dT%H:%M:%S')
                        obj[0][key] = value
                    obj = obj[0]
                    try:
                        db.save(obj)
                    except bson.errors.InvalidDocument, error:
                        print error
                # Start accumulating the next record.
                buffer = cStringIO.StringIO()
            else:
                buffer.write(line)
        fh.close()
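The buffer-until-delimiter logic generalizes into a small generator; a sketch (the function name is illustrative) that yields one complete JSON record at a time:

def iter_json_records(fh):
    # Accumulate lines until a record delimiter (a blank line or a line
    # starting with 'Start') and yield each completed record.
    record = []
    for line in fh:
        if line.startswith('\n') or line.startswith('Start'):
            if record:
                yield ''.join(record)
                record = []
        else:
            record.append(line)
    if record:
        yield ''.join(record)

store_json_diffs could then loop over iter_json_records(fh) and json.loads each yielded string.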
Example #3
def assign_filehandle(fh, file_id, location, process_id, format):
    '''
    Returns a filehandle to write to, rotating to a new numbered file
    once the current one grows beyond 64 MB.
    '''
    if not fh:
        file_id = 0
        filename = '%s_%s.%s' % (file_id, process_id, format)
        fh = file_utils.create_txt_filehandle(location, filename, 'w', 'utf-8')
    else:
        size = fh.tell()
        max_size = 1024 * 1024 * 64  # rotate after 64 MB
        if size > max_size:
            fh.close()
            file_id += 1
            filename = '%s_%s.%s' % (file_id, process_id, format)
            fh = file_utils.create_txt_filehandle(location, filename, 'w',
                                                  'utf-8')

    return fh, file_id
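A caller is expected to thread fh and file_id through successive calls; a minimal usage sketch (rows, location and process_id are illustrative, not part of the original code):

fh, file_id = None, 0
for row in rows:  # any iterable of field lists
    fh, file_id = assign_filehandle(fh, file_id, location,
                                    process_id, 'csv')
    fh.write('\t'.join(row) + '\n')
if fh:
    fh.close()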
Example #4
    def run(self):
        db = storage.init_database(self.rts.storage, self.rts.dbname,
                                   self.rts.editors_raw)
        editor_cache = cache.EditorCache(db)
        while True:
            try:
                filename = self.tasks.get(block=False)
                self.tasks.task_done()
                if filename is None:
                    # Poison pill: propagate it and stop.
                    self.result.put(None)
                    break

                fh = file_utils.create_txt_filehandle(self.rts.sorted,
                                                      filename, 'r', 'utf-8')
                data = []
                for line in file_utils.read_raw_data(fh):
                    if len(line) == 1:  # or len(line) == 4:
                        continue
                    obs = prepare_data(line)
                    if obs != {}:
                        data.append(obs)
                    # Insert in batches of 10,000 observations.
                    if len(data) == 10000:
                        db.insert(data, safe=False)
                        data = []

                # Flush the final, partially filled batch.
                if data != []:
                    db.insert(data, safe=False)
                fh.close()
                self.result.put(True)
            except Empty:
                pass
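The batch-and-flush logic in run() is a generic pattern; pulled out into a standalone helper (hypothetical, not part of the original codebase) it reads:

def insert_in_batches(db, observations, batch_size=10000):
    # Hypothetical helper: inserts observations in fixed-size batches
    # and flushes the remainder, mirroring the loop in run() above.
    batch = []
    for obs in observations:
        batch.append(obs)
        if len(batch) == batch_size:
            db.insert(batch, safe=False)
            batch = []
    if batch:
        db.insert(batch, safe=False)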
Example #5
def store_articles(tasks, rts):
    db = storage.init_database(rts.storage, rts.dbname, rts.articles_raw)
    filename = None
    while True:
        try:
            filename = tasks.get(block=False)
            tasks.task_done()
            if filename is None:
                # Poison pill: stop consuming tasks.
                break
            print 'Processing %s...' % filename
            fh = file_utils.create_txt_filehandle(rts.txt, filename, 'r',
                                                  'utf-8')
            for line in fh:
                line = line.strip()
                line = line.split('\t')
                data = {}
                # Fields arrive as alternating key/value pairs.
                x, y = 0, 1
                while y < len(line):
                    key, value = line[x], line[y]
                    if key == 'ns' or key == 'id':
                        data[key] = int(value)
                    else:
                        data[key] = value
                    x += 2
                    y += 2
                db.insert(data)
            fh.close()
        except Empty:
            pass
    print 'Finished processing articles.'
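The alternating key/value walk over line can be expressed more compactly with slicing; this builds the same dictionary (zip drops a trailing key without a value, just as the while loop does), with the int coercion for 'ns' and 'id' still following:

data = dict(zip(line[0::2], line[1::2]))
for key in ('ns', 'id'):
    if key in data:
        data[key] = int(data[key])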
Example #6
    def __init__(self, process_id, rts, fhd):
        super(CSVBuffer, self).__init__(rts, process_id)
        self.fhd = fhd
        self.comments = {}
        self.articles = {}
        # Open a pool of append-mode filehandles, one per file_id.
        self.filehandles = [
            file_utils.create_txt_filehandle(self.rts.txt, file_id, 'a',
                                             'utf-8')
            for file_id in xrange(self.rts.max_filehandles)
        ]

        # Per-process output files for articles, comments and article
        # metadata.
        self.fh_articles = file_utils.create_txt_filehandle(
            self.rts.txt, 'articles_%s' % self.process_id, 'w', 'utf-8')
        self.fh_comments = file_utils.create_txt_filehandle(
            self.rts.txt, 'comments_%s' % self.process_id, 'w', 'utf-8')
        self.fh_article_meta = file_utils.create_txt_filehandle(
            self.rts.txt, 'articles_meta_%s' % self.process_id, 'w', 'utf-8')
Example #7
def write_sorted_file(sorted_data, filename, rts):
    '''
    Writes the sorted data to a file in the target directory (rts.sorted).
    '''
    fh = file_utils.create_txt_filehandle(rts.sorted,
                                          filename,
                                          'w',
                                          'utf-8')
    file_utils.write_list_to_csv(sorted_data, fh)
    fh.close()
Example #8
def store_diffs_debug(rts):
    db = storage.init_database(rts)
    db.drop_collection()
    files = os.listdir(rts.diffs)
    for filename in files:
        fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r',
                                              'utf-8')
        diffs = json.load(fh)
        db.insert(diffs)
        fh.close()
Example #9
def bot_training_dataset(bots):
    fh = file_utils.create_txt_filehandle(settings.csv_location,
                                          'training_bots.csv', 'w', 'utf-8')
    for bot in bots.itervalues():
        bot.hours_active()
        bot.avg_lag_between_edits()
        bot.write_training_dataset(fh)

    fh.close()
Example #10
def write_bot_list_to_csv(bots, keys):
    fh = file_utils.create_txt_filehandle(settings.csv_location,
                                          'bots_ids.csv', 'w', 'utf-8')
    bot_dict = convert_object_to_dict(bots, exclude=['time', 'written'])
    for bot in bot_dict.itervalues():
        file_utils.write_dict_to_csv(bot,
                                     fh,
                                     keys,
                                     write_key=False,
                                     newline=True)
    fh.close()
Example #11
def merge_sorted_files(target, files):
    '''
    Merges smaller sorted files into one big file. Only used for creating
    the data competition file.
    '''
    fh = file_utils.create_txt_filehandle(target, 'kaggle.csv', 'w', 'utf-8')
    lines = 0
    # heapq.merge lazily interleaves the already-sorted inputs.
    for line in heapq.merge(*[readline(filename) for filename in files]):
        file_utils.write_list_to_csv(line, fh)
        lines += 1
    fh.close()
    print 'Total number of edits: %s ' % lines
    return fh.name
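readline is not shown on this page; heapq.merge only requires each argument to yield rows in sorted order, so a minimal sketch (assuming tab-separated, already-sorted files) could be:

import codecs

def readline(filename):
    # Lazily yield one parsed row at a time so heapq.merge can
    # interleave the files without loading them into memory.
    fh = codecs.open(filename, 'r', 'utf-8')
    for line in fh:
        yield line.strip().split('\t')
    fh.close()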
Example #12
    def to_csv(self, filename):
        data = data_converter.convert_dataset_to_lists(self, 'manage')
        headers = data_converter.add_headers(self)
        lock = RLock()
        fh = file_utils.create_txt_filehandle(settings.dataset_location,
                                              filename, 'w', 'utf-8')
        file_utils.write_list_to_csv(headers,
                                     fh,
                                     recursive=False,
                                     newline=True)
        file_utils.write_list_to_csv(data,
                                     fh,
                                     recursive=False,
                                     newline=True,
                                     format=self.format,
                                     lock=lock)
        fh.close()
Example #13
def create_edgelist(project, collection):
    db = storage.init_database('mongo', project, collection)
    ids = db.retrieve_distinct_keys('editor')
    ids.sort()
    fh = file_utils.create_txt_filehandle(settings.dataset_location,
                                          '%s_edgelist.csv' % project, 'w',
                                          'utf-8')
    for i in ids:
        author_i = db.find_one({'editor': i})
        if author_i is not None:
            article_i = create_articles_set(author_i['edits'])
            # The i > j guard considers each unordered pair of editors
            # exactly once.
            for j in ids:
                if i > j:
                    author_j = db.find_one({'editor': j})
                    article_j = create_articles_set(author_j['edits'])
                    common = article_i.intersection(article_j)
                    if len(common) > 0:
                        file_utils.write_list_to_csv([i, j, len(common)],
                                                     fh,
                                                     recursive=False,
                                                     newline=True)
    fh.close()
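The same pair enumeration can be written with itertools.combinations; a sketch of the inner comparison (it gives up the caching of article_i across the inner loop, and omits the missing-author check for brevity):

import itertools

# ids is sorted, so combinations yields pairs (j, i) with j < i,
# matching the i > j guard above.
for j, i in itertools.combinations(ids, 2):
    article_i = create_articles_set(db.find_one({'editor': i})['edits'])
    article_j = create_articles_set(db.find_one({'editor': j})['edits'])
    common = article_i.intersection(article_j)
    if common:
        file_utils.write_list_to_csv([i, j, len(common)], fh,
                                     recursive=False, newline=True)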
Example #14
def download_wiki_file(task_queue, rts):
    '''
    This is a very simple replacement for wget and curl because Windows
    does not have these tools installed by default.
    '''
    success = True
    chunk = 1024 * 4

    while True:
        filename = task_queue.get(block=False)
        task_queue.task_done()
        if filename is None:
            print 'Swallowed a poison pill'
            break
        widgets = log.init_progressbar_widgets(filename)
        extension = os.path.splitext(filename)[1]
        filemode = file_utils.determine_file_mode(extension)
        filesize = http_utils.determine_remote_filesize(
            rts.wp_dump_location, rts.dump_relative_path, filename)

        mod_date = http_utils.determine_modified_date(rts.wp_dump_location,
                                                      rts.dump_relative_path,
                                                      filename)
        mod_date = text_utils.convert_timestamp_to_datetime_naive(
            mod_date, rts.timestamp_server)
        # Skip the download if a local copy with the same modification
        # date already exists and no forced re-download was requested.
        if file_utils.check_file_exists(rts.input_location, filename):
            mod_loc = file_utils.get_modified_date(rts.input_location,
                                                   filename)
            if mod_loc == mod_date and not rts.force:
                print 'You already have downloaded the most recent %s%s dumpfile.' % (
                    rts.language.code, rts.project.name)
                continue

        if filemode == 'w':
            fh = file_utils.create_txt_filehandle(rts.input_location, filename,
                                                  filemode, rts.encoding)
        else:
            fh = file_utils.create_binary_filehandle(rts.input_location,
                                                     filename, 'wb')

        if filesize != -1:
            pbar = progressbar.ProgressBar(widgets=widgets,
                                           maxval=filesize).start()
        else:
            pbar = progressbar.ProgressBar(widgets=widgets).start()
        try:
            path = '%s%s' % (rts.dump_absolute_path, filename)
            req = urllib2.Request(path)
            response = urllib2.urlopen(req)
            while True:
                data = response.read(chunk)
                if not data:
                    print 'Finished downloading %s.' % (path)
                    break
                fh.write(data)

                filesize -= chunk
                if filesize < 0:
                    # Final chunk: shrink the progress step to the
                    # remaining number of bytes.
                    chunk = chunk + filesize
                pbar.update(pbar.currval + chunk)

        # HTTPError is a subclass of URLError, so it must be caught first.
        except urllib2.HTTPError, error:
            print 'Error: %s' % error
        except urllib2.URLError, error:
            print 'Reason: %s' % error
        finally:
            fh.close()