def write_to_csv(self, output_file):
    """Write the run summary and a per-provider table to a CSV file.

    The file starts with one ``label, value`` row per aggregate counter
    read from ``self``, followed by two empty rows, a header row, and one
    row per provider taken from ``self.provider_success``,
    ``self.provider_failed`` and ``self.provider_running`` (in that order).

    Args:
        output_file: Path of the CSV file to create or overwrite.
    """
    import sys  # local import: only needed to choose the file mode below

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; 'wb' on Python 3 makes writerow() raise.
    if sys.version_info[0] >= 3:
        csvfile = open(output_file, 'w', newline='')
    else:
        csvfile = open(output_file, 'wb')

    # The context manager closes the file even if a row fails to serialize.
    with csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='|',
                               quoting=csv.QUOTE_MINIMAL)

        # Summary section: one "label, value" row per aggregate.
        csvwriter.writerow(['Provider Success Total', self.provider_success_count])
        csvwriter.writerow(['Provider Uploaded Total', self.provider_uploaded_count])
        csvwriter.writerow(['Provider Failed Total', self.provider_failed_count])
        csvwriter.writerow(['Provider Total', self.provider_total_count])
        csvwriter.writerow(['Percent Provider Success', self.percent_provider_success])
        csvwriter.writerow(['Provider Running Total', self.provider_running_count])
        csvwriter.writerow(['Courses Total ', self.course_total_count])
        csvwriter.writerow(['Time Spent', time_display(self.spent_time)])

        # Attribute read from each provider item, and the matching header.
        cols = ['provider_id', 'config_file_name', 'courses_total',
                'execution_time_display', 'status', 's3_info', 'message']
        cols_meta = ['PROVIDER_ID', 'CONFIG FILE NAME ', 'COURSES TOTAL ',
                     'TIME EXECUTION', 'STATUS', 'UPLOADED_S3', 'MESSAGE']

        # Two empty rows separate the summary from the provider table.
        csvwriter.writerow([])
        csvwriter.writerow([])
        csvwriter.writerow(cols_meta)

        provider_total = (self.provider_success + self.provider_failed
                          + self.provider_running)
        for item in provider_total:
            fields_value = []
            for col in cols:
                if col == 's3_info':
                    # Report the upload flag, or 'No' when there is no S3
                    # info. The original nested this check under a condition
                    # that already required s3_info to be truthy, so the
                    # 'No' branch could never run.
                    if item.s3_info:
                        fields_value.append(item.s3_info.uploaded)
                    else:
                        fields_value.append('No')
                else:
                    fields_value.append(getattr(item, col))
            csvwriter.writerow(fields_value)
def finish(self):
    """Finalize the run: record timings, null-field ratios and final status.

    Reads the stats collector from ``self.spider._crawler.stats``, stores a
    JSON dump of it, computes the execution time, the number of scraped
    items and the per-field null ratios, then sets ``self.status`` (and
    ``self.message`` for some failures).

    Raises:
        Exception: If ``self.spider`` is not set.
    """
    if not self.spider:
        raise Exception("Spider is not found")

    collector = self.spider._crawler.stats
    # Objects json cannot serialize are dumped via their __dict__, or as
    # null when they have none.
    self.scrapy_collector = json.dumps(
        collector,
        default=lambda o: o.__dict__ if hasattr(o, '__dict__') else None)

    self.end_time = int(time.time())
    self.execution_time = self.end_time - self.start_time
    self.execution_time_display = time_display(self.execution_time)
    self.courses_total = collector._stats.get('item_scraped_count', 0)
    self.suspect_requests_count = len(self.spider.suspect_requests)

    # Compute the null ratio of every collected field and their average.
    if self.courses_total:
        total_percent_null = 0
        for item in self.fields_collector:
            # float() forces true division: with integer counters on
            # Python 2 a plain "/" would truncate the ratio to 0 or 1.
            ratio = float(item.count_null) / item.count_total
            percent_null = float("{0:.2f}".format(ratio))
            item.percent_null = percent_null
            total_percent_null += percent_null
        if total_percent_null != 0:
            self.average_percent_null = float("{0:.2f}".format(
                total_percent_null / len(self.fields_collector)))

    # NOTE(review): average_percent_null is only assigned above when the
    # total is non-zero; this assumes it is initialised elsewhere - confirm.
    if self.courses_total > 0 and self.average_percent_null <= PERCENT_FIELD_NULL_MAX:
        self.status = SUCCESS_STATUS
    elif self.average_percent_null > PERCENT_FIELD_NULL_MAX:
        self.status = FAILED_STATUS
        self.message = 'Average percentage of null percent is %s' % (self.average_percent_null)
    elif self.status == ITEM_SCRAPED_NOT_GROW_UP:
        self.status = FAILED_STATUS
        self.message = "The web site is very slowly or there are multiple incorrect requests"
    else:
        self.status = FAILED_STATUS