def getMatch(url, i):
    req = urllib2.Request(url + str(i), headers={'User-Agent': "scraping bot"})
    con = urllib2.urlopen(req)
    print(url + str(i) + " " + str(con.getcode()))
    html = con.read()
    soup = BeautifulSoup(html, 'html.parser')
    if len(soup.select("body > main > section > h1")) > 0 and soup.select("body > main > section > h1")[0].text == '404':
        print("404, matchid = " + str(i))
        with open(csvpath, 'ab') as csvfile:
            writer = unicodecsv.writer(csvfile, quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow([i, "404"])
        return
    date = soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > div:nth-of-type(1) > div:nth-of-type(3)")[0]
    date = date['title'] + " " + date.text.strip()
    bo = soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > div:nth-of-type(1) > div:nth-of-type(2)")[0].text
    team_a = soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > a:nth-of-type(1) > span > b")[0].text
    team_b = soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > a:nth-of-type(2) > span > b")[0].text
    team_a_pct = soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > a:nth-of-type(1) > span")[0].i.text
    team_b_pct = soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > a:nth-of-type(2) > span")[0].i.text
    team_a_odds = re.findall(r'(\d+(\.\d+)? (to \d+(\.\d+)? )?for 1)', str(soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > div.full")[0].select('div.half')[0].find('div')))[0][0]
    team_b_odds = re.findall(r'(\d+(\.\d+)? (to \d+(\.\d+)? )?for 1)', str(soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > div.full")[0].select('div.half')[1].find('div')))[0][0]
    team_a_won = "(win)" in team_a.lower()
    team_b_won = "(win)" in team_b.lower()
    status = soup.select("body > main > section:nth-of-type(1) > div.box-shiny-alt > div:nth-of-type(2)")[0]
    if status.has_attr('class') and 'full' in status['class']:
        status = ""
    else:
        status = status.text.strip()
    winner = "a" if team_a_won else "b" if team_b_won else "none"
    with open(csvpath, 'ab') as csvfile:
        writer = unicodecsv.writer(csvfile, quoting=csv.QUOTE_NONNUMERIC)
        writer.writerow([i, date, bo, team_a, team_b, team_a_pct, team_b_pct,
                         team_a_odds, team_b_odds, winner, status])
def import_raw_xlsx():
    fields = [
        'la_code',
        'ba_ref',
        'prop_empty:boolean',
        'prop_empty_date:date~make_date_YYYY_MM_DD',
        'prop_occupied:boolean',
        'prop_occupied_date:date',
        'prop_ba_rates:numeric',
        'tenant',
    ]
    files = [f for f in listdir(DIR) if isfile(join(DIR, f))]
    with open('vacancy_errors.csv', 'w') as fe:
        with open('vacancy.csv', 'w') as fp:
            a = csv.writer(fp, delimiter=',')
            e = csv.writer(fe, delimiter=',')
            a.writerows([fields])
            e.writerows([['file', 'line', 'error'] + fields])
            for f in files:
                out, errors = process(f)
                a.writerows(out)
                e.writerows(errors)
def process_raw_msg(raw_msg, formatted_output_file, append=True):
    """
    Given a raw message string and an output CSV file to write to, write
    details of the message out to the CSV file in the format:

    <sender-domain>,<subject>,<message-text>
    """
    if append:
        mode = "ab"
    else:
        mode = "wb"
    mime_msg = email.message_from_string(raw_msg)
    text = remove_punctuation(html_to_text(concat_email_text(mime_msg)))
    subject = mime_msg.get("Subject")
    # Decode escaped character sets in the subject line
    subject = u" ".join([a[0].decode('utf-8', 'replace')
                         for a in email.header.decode_header(subject)])
    subject = remove_punctuation(subject.replace("\r", " ").replace("\n", " "))
    sender_domain = mime_msg.get("From").split("@")[1].split(">")[0]
    # Strip whitespace
    csv_line = [fix_spaces_cr_lf(s) for s in [sender_domain, subject, text]]
    # If any of our strings are empty, replace with a placeholder
    # to make sure each CSV line has three items.
    csv_line = map(lambda s: (u'' == s) and u"PLACEHOLDERNONE" or s, csv_line)
    if formatted_output_file == "STDOUT":
        writer = unicodecsv.writer(sys.stdout, quoting=unicodecsv.QUOTE_ALL)
        writer.writerow(csv_line)
    else:
        with open(formatted_output_file, mode) as handle:
            writer = unicodecsv.writer(handle, quoting=unicodecsv.QUOTE_ALL)
            writer.writerow(csv_line)
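# Usage sketch for process_raw_msg (a minimal example, assuming the module's
# helpers like remove_punctuation are importable; the message below is
# illustrative, not from the original corpus):
raw_msg = (
    "From: Alice <alice@example.com>\r\n"
    "Subject: =?utf-8?b?SGVsbG8=?=\r\n"
    "\r\n"
    "Hello world\r\n"
)
process_raw_msg(raw_msg, "STDOUT")                  # one fully quoted CSV line on stdout
process_raw_msg(raw_msg, "out.csv", append=False)   # overwrite out.csv instead of appending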
def processData():
    global manualIgnoreRecords
    global yesIgnoreRecords
    global manualProcessedRecords
    global yesProcessedRecords
    dirpath = parentdir + "/R3_profiles_YNNM_raw/"
    with open(dirpath + 'MANUAL_RAW.csv', 'r') as infile, open(processeddir + 'MANUAL_PROCESSED.csv', 'ab') as outfile:
        rows = unicodecsv.reader(infile, delimiter=';', encoding='utf-8')
        writer = unicodecsv.writer(outfile, delimiter=';', encoding='utf-8')
        for row in rows:
            if row[6] in manual_ignore_list:
                # Ignore it
                manualIgnoreRecords += 1
                continue
            else:
                manualProcessedRecords += 1
                writer.writerow(row)
    with open(dirpath + 'YES_RAW.csv', 'r') as infile, open(processeddir + 'YES_PROCESSED.csv', 'ab') as outfile:
        rows = unicodecsv.reader(infile, delimiter=';', encoding='utf-8')
        writer = unicodecsv.writer(outfile, delimiter=';', encoding='utf-8')
        for row in rows:
            if row[6] in yes_ignore_list:
                # Ignore it
                yesIgnoreRecords += 1
                continue
            else:
                # the original had a bare "yesProcessedRecords" expression here,
                # so the counter was never incremented
                yesProcessedRecords += 1
                writer.writerow(row)
def csvlist_to_string(csvlist):
    f = StringIO()
    csv.writer(f, quoting=csv.QUOTE_MINIMAL, encoding='utf-8').writerow(csvlist)
    string = f.getvalue()
    f.close()
    return string
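# Usage sketch for csvlist_to_string (assumes csv here is the unicodecsv
# module, which accepts the encoding keyword, and StringIO holds bytes):
line = csvlist_to_string([u'id', u'name, with comma', u'caf\xe9'])
# -> b'id,"name, with comma",caf\xc3\xa9\r\n' (UTF-8 encoded, minimally quoted)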
def export_as_csv(queryset, fields=None, header=None, filename=None, options=None, out=None):
    """
    Exports a queryset as csv with the given fields.

    :param queryset: queryset to export
    :param fields: list of field names to export. None for all fields
    :param header: if True, the exported file will have the first row as column names
    :param filename: name of the file
    :param options: CSVOptions() instance or None
    :param out: object that implements the File protocol. HttpResponse if None.
    :return: HttpResponse instance
    """
    if out is None:
        if filename is None:
            filename = filename or "%s.csv" % queryset.model._meta.verbose_name_plural.lower().replace(" ", "_")
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment;filename="%s"' % filename.encode('us-ascii', 'replace')
    else:
        response = out
    if options is None:
        config = csv_options_default
    else:
        config = csv_options_default.copy()
        config.update(options)
    if fields is None:
        fields = [f.name for f in queryset.model._meta.fields]
    dialect = config.get('dialect', None)
    if dialect is not None:
        writer = csv.writer(response, dialect=dialect)
    else:
        writer = csv.writer(response,
                            escapechar=str(config['escapechar']),
                            delimiter=str(config['delimiter']),
                            quotechar=str(config['quotechar']),
                            quoting=int(config['quoting']))
    if bool(header):
        if isinstance(header, (list, tuple)):
            writer.writerow(header)
        else:
            writer.writerow([f for f in fields])
    for obj in queryset:
        row = []
        for fieldname in fields:
            value = get_field_value(obj, fieldname)
            if isinstance(value, datetime.datetime):
                value = dateformat.format(value, config['datetime_format'])
            elif isinstance(value, datetime.date):
                value = dateformat.format(value, config['date_format'])
            elif isinstance(value, datetime.time):
                value = dateformat.format(value, config['time_format'])
            row.append(smart_str(value))
        writer.writerow(row)
    return response
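# Usage sketch for export_as_csv inside a Django view (the Invoice model and
# its field names are illustrative assumptions, not part of the original):
def export_invoices(request):
    qs = Invoice.objects.filter(paid=True)
    return export_as_csv(qs,
                         fields=['number', 'customer', 'created'],
                         header=True,
                         filename='invoices.csv')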
def export_list(self, queryset):
    headers = ['Mobile', 'Text', 'Direction', 'Created', 'Delivered']
    output = StringIO()
    writer = unicodecsv.writer(output, encoding='utf-8')
    writer.writerow([unicode(i) for i in headers])
    yield output.getvalue()
    output.close()

    for message in queryset:
        record = [
            message.sender if message.direction == 'IN' else message.recipient,
            message.text,
            message.direction,
            message.received.strftime('%Y-%m-%d %H:%M:%S') if message.received else '',
            message.delivered.strftime('%Y-%m-%d %H:%M:%S') if message.delivered else '',
        ]
        output = StringIO()
        writer = unicodecsv.writer(output, encoding='utf-8')
        writer.writerow([unicode(i) for i in record])
        yield output.getvalue()
        output.close()
def xml2csv(file):
    # open file, parse it and get root
    tree = et.parse(file)
    root = tree.getroot()

    # setup csv writer for our data output
    csvwriter = csv.writer(sys.stdout)

    # print header row
    #csvwriter.writerow([['CVE-ID'], ['CVSS Risk'], ['Summary']])

    for entry_node in tree.xpath('*[local-name()="entry"]'):
        # Declare and initialise variables for use.
        vuln_id = "Unknown"
        vuln_score = "Unknown"
        vuln_summary = "Unknown"

        # get cve_id value
        vuln_id = entry_node.get('id')

        # get vuln summary
        index = len(entry_node.getchildren())
        summarynode = entry_node.getchildren()[index - 1]
        vuln_summary = summarynode.text

        # get cvss risk score
        for n in entry_node.getchildren():
            if n.tag.find('cvss') != -1:
                cvss_node = n
                vuln_score = cvss_node.getchildren()[0].getchildren()[0].text

        row = [vuln_id, vuln_score, vuln_summary]
        csvwriter.writerows([row])
def augment_csv(src_csv, dest_csv):
    data = load(src_csv, with_keys=True)
    master_description = master_table(data[1:], DESCRIPTION)
    con_description = confidence_table(master_description)
    master_tags = master_table(data[1:], TAGS)
    con_tags = confidence_table(master_tags)
    master_title = master_table(data[1:], TITLE)
    con_title = confidence_table(master_title)
    data[0].append('guess_description')
    data[0].append('confidence')
    data[0].append('guess_tags')
    data[0].append('confidence')
    data[0].append('guess_title')
    data[0].append('confidence')
    for n in range(1, len(data)):
        theme, confidence = guess_for_row(data[n], con_description)
        data[n].append(theme)
        data[n].append(confidence)
        theme, confidence = guess_for_row(data[n], con_tags)
        data[n].append(theme)
        data[n].append(confidence)
        theme, confidence = guess_for_row(data[n], con_title)
        data[n].append(theme)
        data[n].append(confidence)
    with open(dest_csv, 'w') as f:
        unicodecsv.writer(f).writerows(data)
def writerawresults(data, columns, placeholderurl, filename):
    csvoutfile = open(sys.argv[2] + '.csv', 'wb')
    datawriter = csv.writer(csvoutfile, delimiter=',')

    # Not needed in the long term. This was for comparing the file-finding
    # capabilities of different methods.
    csvfilesoutfile = open(sys.argv[2] + '.files.csv', 'wb')
    filesdatawriter = csv.writer(csvfilesoutfile, delimiter=',')

    row = []
    extraitems = ['format', 'geo', 'groups', 'tags']
    row.extend(extraitems)
    columnsoffset = len(extraitems)
    for column in columns:
        row.append(column)
    datawriter.writerow(row)

    for package in data:
        row = []

        # All files, for analysis
        dict_string = package['data_dict']
        json_dict = json.loads(dict_string)
        for resource in json_dict['resources']:
            if 'url' in resource:
                frow = [resource['url']]
                filesdatawriter.writerow(frow)

        # Get resource formats
        if 'res_format' in package:
            [text, geo] = processListOfFormats(package['res_format'])
            row.extend([text, geo])
        else:
            # extend('', '') was a bug: extend takes a single iterable
            row.extend(['', ''])

        groups = u''
        tags = u''
        if 'groups' in package:
            row.append(arraytocsv(package['groups']))
        if 'tags' in package:
            row.append(arraytocsv(package['tags']))
        for column in columns:
            if column in package:
                row.append(package[column])
            else:
                row.append('')
        if row[columns.index('url') + columnsoffset] == '':
            row[columns.index('url') + columnsoffset] = placeholderurl + row[columns.index('id') + columnsoffset]
        datawriter.writerow(row)

    csvoutfile.close()
    csvfilesoutfile.close()
def __clean_and_fill_csv(self, file_path, headers, data_list):
    # One handle is enough; the original reopened the file in append mode
    # without closing the first handle.
    with open(file_path, 'wb') as myfile:
        wr = unicodecsv.writer(myfile, quoting=csv.QUOTE_NONE, delimiter='\t')
        wr.writerow(headers)
        wr.writerows(data_list)
def fetch_rolling():
    output = []
    min_date = connection.execute(text('SELECT MIN(date) FROM races')).first()[0].replace(day=1)
    for begin_date in list(rrule.rrule(rrule.MONTHLY, dtstart=min_date, until=date.today())):
        end_date = begin_date + relativedelta.relativedelta(months=6)
        sql = ('SELECT AVG(avg), STDDEV(avg), AVG(dev) FROM '
               '(SELECT AVG(ranking) avg, STDDEV(ranking) dev FROM rankings '
               'WHERE rank_date BETWEEN :begin_date AND :end_date GROUP BY _driver) avg')
        result = connection.execute(text(sql), begin_date=begin_date, end_date=end_date).first()
        output.append([end_date.strftime('%Y-%m')] + result.values())
    unicodecsv.writer(open('charts/rolling_averages.csv', 'w')).writerows(output)
def _split_batch_csv(self, data_file, f_values, f_ids):
    writer_values = unicodecsv.writer(f_values)
    writer_ids = unicodecsv.writer(f_ids)
    for row in unicodecsv.reader(data_file):
        writer_values.writerow(row[1:])
        # row[:1] is already a list; the original wrapped it again, which
        # wrote the list's repr into a single cell
        writer_ids.writerow(row[:1])
    f_values.seek(0)
    f_ids.seek(0)
    return f_values, f_ids
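# Usage sketch for _split_batch_csv (standalone for illustration; obj stands
# for whatever instance carries this method, and in context the file objects
# would typically be temp files rather than BytesIO):
import io
data = io.BytesIO(b"1,alice,30\r\n2,bob,25\r\n")
values, ids = io.BytesIO(), io.BytesIO()
obj._split_batch_csv(data, values, ids)
# values now holds "alice,30" / "bob,25"; ids holds "1" / "2",
# both rewound to the start, ready for re-reading.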
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        description="Parse program execution entries from the Amcache.hve Registry hive")
    parser.add_argument("registry_hive", type=str,
                        help="Path to the Amcache.hve hive to process")
    parser.add_argument("-v", action="store_true", dest="verbose",
                        help="Enable verbose output")
    parser.add_argument("-t", action="store_true", dest="do_timeline",
                        help="Output in simple timeline format")
    args = parser.parse_args(argv[1:])

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    if sys.platform == "win32":
        import os, msvcrt
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)

    r = Registry.Registry(args.registry_hive)
    try:
        ee = parse_execution_entries(r)
    except NotAnAmcacheHive:
        g_logging.error("doesn't appear to be an Amcache.hve hive")
        return

    if args.do_timeline:
        entries = []
        for e in ee:
            for t in ["first_run", "created_timestamp", "modified_timestamp",
                      "modified_timestamp2", "linker_timestamp"]:
                ts = getattr(e, t)
                if ts == UNIX_TIMESTAMP_ZERO:
                    continue
                if ts == WINDOWS_TIMESTAMP_ZERO:
                    continue
                if ts == datetime.datetime.min:
                    continue
                entries.append(TimelineEntry(ts, t, e))
        w = unicodecsv.writer(sys.stdout, delimiter="|", quotechar="\"",
                              quoting=unicodecsv.QUOTE_MINIMAL, encoding="utf-8")
        w.writerow(["timestamp", "timestamp_type", "path", "sha1"])
        for e in sorted(entries, key=lambda e: e.timestamp):
            w.writerow([e.timestamp, e.type, e.entry.path, e.entry.sha1])
    else:
        w = unicodecsv.writer(sys.stdout, delimiter="|", quotechar="\"",
                              quoting=unicodecsv.QUOTE_MINIMAL, encoding="utf-8")
        w.writerow(map(lambda e: e.name, FIELDS))
        for e in ee:
            w.writerow(map(lambda i: getattr(e, i.name), FIELDS))
def createTrainTestFiles(inputFileName, folderName, trainSize, testSize, totalSamples=6034196, tags=None):
    if not os.path.exists('data/' + folderName):
        os.makedirs('data/' + folderName)
    writerSampleTrain = unicodecsv.writer(open('data/' + folderName + '/TrainSamples.csv', 'w'))
    writerSampleTest = unicodecsv.writer(open('data/' + folderName + '/TestSamples.csv', 'w'))
    readerSample = unicodecsv.reader(open('data/parsed/' + inputFileName + 'Samples.csv', 'r'))
    writerIdsTrain = open('data/' + folderName + '/TrainIds', 'w')
    writerIdsTest = open('data/' + folderName + '/TestIds', 'w')
    readerIds = open('data/parsed/' + inputFileName + 'Ids', 'r')
    writerTagsTrain = open('data/' + folderName + '/TrainTags', 'w')
    writerTagsTest = open('data/' + folderName + '/TestTags', 'w')
    readerTags = open('data/parsed/' + inputFileName + 'Tags', 'r')
    if trainSize >= totalSamples:
        trainSize = totalSamples - testSize
    i = 0
    trainCount = 0
    testCount = 0
    for rowSample, rowId, rowTags in izip(readerSample, readerIds, readerTags):
        i += 1
        if tags is not None:
            toContinue = False
            for t in rowTags.split():
                if t not in tags:
                    toContinue = True
                    break
            if toContinue:
                continue
        x = randint(0, totalSamples)
        if x < testSize:
            writerSampleTest.writerow(rowSample)
            writerIdsTest.write(rowId)
            writerTagsTest.write(rowTags)
            testCount += 1
        else:
            if trainCount < trainSize:
                writerSampleTrain.writerow(rowSample)
                writerIdsTrain.write(rowId)
                writerTagsTrain.write(rowTags)
                trainCount += 1
            elif randint(0, 5) < 2:
                writerSampleTest.writerow(rowSample)
                writerIdsTest.write(rowId)
                writerTagsTest.write(rowTags)
                testCount += 1
        if testCount >= testSize and trainCount >= trainSize:
            break
    print 'train : ' + str(trainCount)
    print 'test : ' + str(testCount)
    print 'total : ' + str(i)
def writerow(self, row):
    row = [str(s) for s in row]
    try:
        queue = StringIO()
        self.writer = csv.writer(queue, lineterminator="")
        self.writer.writerow(row)
    except TypeError:
        queue = BytesIO()
        self.writer = csv.writer(queue, lineterminator="")
        self.writer.writerow(row)
    return queue.getvalue()
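# Usage sketch: this writerow formats a single row as a CSV string with no
# trailing line terminator. RowFormatter is an illustrative name for whatever
# class carries the method:
fmt = RowFormatter()
print(fmt.writerow([1, 'a,b', None]))  # -> 1,"a,b",None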
def process_tweets(self, dtype):
    self.fuser = open('users.csv', 'wb')
    self.ft = None
    self.csv_u = unicodecsv.writer(self.fuser, delimiter=',')
    if dtype == 'json':
        f = open('csm-tweets-nairobi.json', 'rb')
        raw_tweets = f.readlines()
        fjson_tweets = []
        count = 1
        print 'loading tweets from fake json\n'
        for raw_tweet in raw_tweets:
            tweet = ast.literal_eval(raw_tweet.rstrip("\n"))
            if count % 50000 == 1:
                if self.ft is not None:
                    self.ft.close()
                self.ft = open('tweets_%d.csv' % (count / 10000 + 1), 'wb')
                self.csv_t = unicodecsv.writer(self.ft, delimiter=',')
            self.createDicts(tweet)
            if count % 50000 == 0:
                print '%d tweets processed!\n' % (count)
                self.batchInsertDicts()
            count += 1
        self.insertMentions()
        f.close()
    elif dtype == 'pickle':
        count = 1
        for fn in glob.glob("*.pickle"):
            f = open(fn, 'rb')
            tweets = pickle.load(f)
            print 'tweets extracted from pickle!\n'
            for tweet in tweets:
                if count % 50000 == 1:
                    if self.ft is not None:
                        self.ft.close()
                    self.ft = open('tweets_%d.csv' % (count / 10000 + 1), 'wb')
                    self.csv_t = csv.writer(self.ft, delimiter=',')
                self.createDicts(tweet)
                count += 1
            self.batchInsertDicts()
            print 'another pickle processed!\n'
            f.close()
        self.insertMentions()
    else:
        print 'wrong data type!'
        return None
def export_list(self, queryset):
    headers = [
        u'Participant ID', u'Name', u'Partner', u'Role',
        u'Location ID', u'Supervisor ID', u'Gender',
        u'Email', u'Password',
        u'Phone Primary', u'Phone Secondary #1', u'Phone Secondary #2'
    ]
    if queryset.count():
        location_types = LocationType.objects(
            is_administrative=True, deployment=queryset.first().deployment)
        headers = headers[:5] + \
            map(lambda location_type: location_type.name, location_types) + \
            headers[5:]
        for extra_field in queryset.first().deployment.participant_extra_fields:
            headers.append(extra_field.label)
    output = StringIO()
    writer = unicodecsv.writer(output, encoding='utf-8')
    writer.writerow([unicode(i) for i in headers])
    yield output.getvalue()
    output.close()

    for participant in queryset:
        # limit to three numbers for export and pad if less than three
        phone_numbers = [i.number for i in participant.phones][:3]
        phone_numbers += [''] * (3 - len(phone_numbers))
        record = [
            participant.participant_id if participant.participant_id else '',
            participant.name if participant.name else '',
            participant.partner.name if participant.partner else '',
            participant.role.name if participant.role else '',
            participant.location.code if participant.location else ''] + \
            [participant.location_name_path.get(location_type.name, '')
             for location_type in location_types] + \
            [participant.supervisor.participant_id if participant.supervisor else '',
             participant.gender if participant.gender else '',
             participant.email if participant.email else '',
             participant.password if participant.password else '']
        record.extend(phone_numbers)
        for extra_field in participant.deployment.participant_extra_fields:
            record.append(getattr(participant, extra_field.name, ''))
        output = StringIO()
        writer = unicodecsv.writer(output, encoding='utf-8')
        writer.writerow([unicode(i) for i in record])
        yield output.getvalue()
        output.close()
def __init__(self, *args, **kwargs):
    self.dataset_file = tempfile.NamedTemporaryFile(delete=False)
    self.resource_file = tempfile.NamedTemporaryFile(delete=False)
    self.dataset_csv = csv.writer(self.dataset_file)
    self.resource_csv = csv.writer(self.resource_file)
    self.dataset_filename = self.dataset_file.name
    self.resource_filename = self.resource_file.name
    self.organization_cache = {}
    self.keys = []
def export_list(self, queryset):
    headers = []
    location_types = list(LocationTypesService().find().order_by("ancestors_ref"))
    for location_type in location_types:
        location_name = location_type.name.upper()
        headers.append("{}_N".format(location_name))
        headers.append("{}_ID".format(location_name))
        if location_type.has_political_code:
            headers.append("{}_PCODE".format(location_name))
        if location_type.has_registered_voters:
            headers.append("{}_RV".format(location_name))
        for metafield in location_type.metafields:
            headers.append("{}_{}".format(location_name, metafield.upper()))
    output = StringIO()
    writer = unicodecsv.writer(output, encoding="utf-8")
    writer.writerow([unicode(i) for i in headers])
    yield output.getvalue()
    output.close()

    if queryset.count() < 1:
        yield
    else:
        locations = queryset
        locations = locations.order_by("code")
        for location in locations:
            record = []
            for location_type in location_types:
                try:
                    this_location = filter(
                        lambda l: l.location_type == location_type.name,
                        location.ancestors_ref).pop()
                except IndexError:
                    if location.location_type == location_type.name:
                        this_location = location
                    else:
                        this_location = None
                record.append(this_location.name or "" if this_location else "")
                record.append(this_location.code or "" if this_location else "")
                if location_type.has_political_code:
                    record.append(this_location.political_code or "" if this_location else "")
                if location_type.has_registered_voters:
                    record.append(this_location.registered_voters or "" if this_location else "")
                for metafield in location_type.metafields:
                    record.append(getattr(this_location, metafield, "") if this_location else "")
            output = StringIO()
            writer = unicodecsv.writer(output, encoding="utf-8")
            writer.writerow([unicode(i) for i in record])
            yield output.getvalue()
            output.close()
def _init_writer(self, row):
    from sqlalchemy.engine.result import RowProxy

    # Four cases:
    #     Write header, or don't
    #     Write list, or dict
    row_is_dict = isinstance(row, dict) or isinstance(row, RowProxy)
    row_is_list = isinstance(row, (list, tuple))
    has_header = self.header is not None

    if not os.path.exists(self.path):
        if not os.path.exists(os.path.dirname(self.path)):
            os.makedirs(os.path.dirname(self.path))

    f = open(self.path, 'wb', buffering=self.buffer_size)
    self._f = f
    delimiter = self.delimiter

    if row_is_dict and has_header:
        self._writer = unicodecsv.DictWriter(f, self.header, delimiter=delimiter,
                                             escapechar=self.escapechar,
                                             encoding=self.encoding)
        if self.write_header:
            self._writer.writeheader()
        self._inserter = self._write_dict
    elif row_is_dict and not has_header:
        self.header = row.keys()
        self._writer = unicodecsv.DictWriter(f, self.header, delimiter=delimiter,
                                             escapechar=self.escapechar,
                                             encoding=self.encoding)
        if self.write_header:
            self._writer.writeheader()
        self._inserter = self._write_dict
    elif row_is_list and has_header:
        self._writer = unicodecsv.writer(f, delimiter=delimiter,
                                         escapechar=self.escapechar,
                                         encoding=self.encoding)
        if self.write_header:
            self._writer.writerow(self.header)
        self._inserter = self._write_list
    elif row_is_list and not has_header:
        self._writer = unicodecsv.writer(f, delimiter=delimiter,
                                         escapechar=self.escapechar,
                                         encoding=self.encoding)
        self._inserter = self._write_list
    else:
        raise Exception("Unexpected case for type {}".format(type(row)))
def yield_csv_catalog(datasets):
    '''Yield a dataset catalog line by line'''
    csvfile = StringIO.StringIO()
    writer = unicodecsv.writer(csvfile, encoding='utf-8', delimiter=b',', quotechar=b'"')

    # Generate header
    specs = Metric.get_for(Dataset)
    writer.writerow(header(specs))
    yield csvfile.getvalue()

    for dataset in datasets:
        csvfile = StringIO.StringIO()
        writer = unicodecsv.writer(csvfile, encoding='utf-8', delimiter=b',', quotechar=b'"')
        writer.writerow(to_row(dataset, specs))
        yield csvfile.getvalue()
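# Usage sketch: streaming the catalog generator as an HTTP download (hedged;
# assumes a Flask app object named app, and get_visible_datasets is an
# illustrative stand-in for however the datasets are fetched):
from flask import Response

@app.route('/catalog.csv')
def catalog_csv():
    return Response(yield_csv_catalog(get_visible_datasets()),
                    mimetype='text/csv')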
def solve():
    for init_file in os.listdir(FOLDER):
        start_file = datetime.datetime.now()
        util.write_log(LOGS, "%s start\n" % init_file)
        rows = extract_info(FOLDER + init_file)
        filename = init_file.split(".")[0]
        folder = RESULT + "/" + filename
        os.mkdir(folder)
        os.chdir(folder)

        # select (cut) a part of rows to match
        file_idx = 0
        length = len(rows)
        row_written = -1
        last_diff = 0
        last_time = int(MAX_NUM)
        i = 0
        time_i = rows[0][TIME]
        mileage_i = rows[0][MILE]
        j = 1
        while j < length:
            mileage_j = rows[j][MILE]
            time_j = rows[j][TIME]
            if mileage_i == mileage_j:
                if (row_written != -1) and (time_j - last_time >= THIRTY_MINUTES):
                    if row_written != last_diff:
                        match(row_written, last_diff, rows)
                        test_over_speed(row_written, last_diff, rows)
                        with open(str(file_idx) + ".csv", "wb") as output_csv:
                            writer = ucsv.writer(output_csv)
                            writer.writerows(rows[row_written:last_diff + 1])
                        file_idx += 1
                    row_written = -1
            else:
                last_diff = i
                last_time = time_i
                if row_written == -1:
                    row_written = last_diff
            i += 1
            time_i = time_j
            mileage_i = mileage_j
            j += 1
        if (row_written != -1) and (row_written < length - 1):
            match(row_written, length - 1, rows)
            test_over_speed(row_written, length - 1, rows)
            with open(str(file_idx) + ".csv", "wb") as output_csv:
                writer = ucsv.writer(output_csv)
                writer.writerows(rows[row_written:])
        os.chdir("../..")
        end_file = datetime.datetime.now()
        util.write_log(LOGS, "%s finish , total costs %s\n" % (init_file, str(end_file - start_file)))
def output(self):
    identidad = 0
    line = []
    loan = []
    for afiliado in self.afiliados:
        if afiliado.cardID is None or afiliado.cardID == '0':
            identidad += 1
            continue
        line.append((
            self.fecha.year,
            self.fecha.month,
            afiliado.cardID.replace('-', ''),
            11,
            afiliado.get_monthly(self.fecha)
        ))
        loan.append((
            self.fecha.year,
            self.fecha.month,
            afiliado.cardID.replace('-', ''),
            11,
            afiliado.get_prestamo()
        ))
    mode = 'wb'
    if self.append:
        mode = 'ab+'
    planilla = unicodecsv.writer(
        io.open(os.path.join(directory, 'INPREMA{0}.csv'.format(str(self.fecha))), mode),
        quoting=csv.QUOTE_ALL, encoding='utf-8')
    prestamos = unicodecsv.writer(
        io.open(os.path.join(directory, 'INPREMA{0}-prestamo.csv'.format(str(self.fecha))), mode),
        quoting=csv.QUOTE_ALL)
    print("Generando Colegiación")
    [planilla.writerow(l) for l in line]
    print("Generando Prestamos")
    [prestamos.writerow(l) for l in loan]
def parsePapersToCSV(papers_filename):
    global PAPERS_OUT
    global REF_OUT
    papers_file = open(papers_filename, 'r')
    dblp_papers = open(PAPERS_OUT, "w")
    dblp_ref = open(REF_OUT, "w")
    paper_wr = csv.writer(dblp_papers, quoting=csv.QUOTE_ALL)
    ref_wr = csv.writer(dblp_ref)
    INDEX = ""
    row = []
    ref_line = []
    for rline in tqdm(papers_file):
        line = rline.decode("utf-8").rstrip().replace("\n", "").replace("\"", "").replace('\\', "")
        if line.startswith("#index"):
            INDEX = line.lstrip("#index").strip()
            row.append(INDEX)
        if line.startswith("#*"):
            row.append(line.lstrip("#*"))
        if line.startswith("#c"):
            row.append(line.lstrip("#c"))
        if line.startswith("#t"):
            row.append(line.lstrip("#t"))
        if line.startswith("#%"):
            ref = line.lstrip("#%").strip()
            ref_line.append(ref)
        if line.startswith("#!"):
            abstract = line.lstrip("#!")
            row.append(abstract)
        if line == "":
            paper_wr.writerow(row)
            for r in ref_line:
                if r != "":
                    ref_wr.writerow([INDEX, r])
            row = []
            ref_line = []
    dblp_papers.close()
    dblp_ref.close()
    papers_file.close()
def write_csvs(data):
    for key, value in data.items():
        with open('output/TongjiCnzzGomeBusinessServiceCSV/' + key + '_page.csv', 'wb') as f:
            w = unicodecsv.writer(f)
            w.writerow(('日期', '相对国美UV', '相对国美PV'))
            w.writerow((str(datetime.datetime.today().date()),
                        '%.5g' % (value['uv'] / float(data['gome']['uv'])),
                        '%.5g' % (value['pv'] / float(data['gome']['pv']))))
            w.writerow(())
            w.writerow(('页面', '比例'))
            w.writerow(('商品详情页', '1'))
            w.writerow(('收藏次数', '%.5g' % (float(value['collection']) / value['pv'])))
            w.writerow(('加入购物车成功页', '%.5g' % (float(value['addcart']) / value['pv'])))
            w.writerow(('购物车页', '%.5g' % (float(value['cart']) / value['pv'])))
            w.writerow(('订单填写页', '%.5g' % (float(value['orderwrite']) / value['pv'])))
            w.writerow(('订单成功页', '%.5g' % (float(value['ordersuccess']) / value['pv'])))
            w.writerow(())
            w.writerow(('会员中心页面', '相对全站UV占比', '相对全站PV占比', '会员页面访问时长'))
            w.writerow(('-',
                        '%.5g' % (float(value['vip_uv']) / value['uv']),
                        '%.5g' % (float(value['vip_pv']) / value['pv']),
                        '-'))
        with open('output/TongjiCnzzGomeBusinessServiceCSV/' + key + '_search.csv', 'wb') as f:
            w = unicodecsv.writer(f)
            w.writerow(('TOP100搜索词', '占总体搜索次数百分比'))
            w.writerows([(i, '%.5g' % (j / float(value['total_keyword_count'])))
                         for i, j in value['top_100_keyword']])
        with open('output/TongjiCnzzGomeBusinessServiceCSV/' + key + '_category_and_product.csv', 'wb') as f:
            w = unicodecsv.writer(f)
            w.writerow(('流量TOP15品类名称', '占品类总流量的百分比'))
            w.writerows([(i, '%.5g' % (j / float(value['pv'])))
                         for (i, j) in value['top_15_category']])
        with open('output/TongjiCnzzGomeBusinessServiceCSV/' + key + '_Beijing.csv', 'wb') as f:
            w = unicodecsv.writer(f)
            w.writerow(('省份/城市', '相对国美UV', '相对国美PV'))
            w.writerow(('北京',
                        '%.5g' % (float(value['beijing_uv']) / data['gome']['beijing_uv']),
                        '%.5g' % (float(value['beijing_pv']) / data['gome']['beijing_pv'])))
            w.writerow(('省份', '流量TOP5品类', '占该省份品类总流量的占比'))
            w.writerows([('北京', i, '%.5g' % (float(j) / value['total_beijing_category']))
                         for i, j in value['top_5_beijing_category']])
            w.writerow(('省份', '流量TOP5商品', '占该省份品类总流量的占比', '商品url'))
            w.writerows([('北京', i,
                          ('%.5g' % (float(j['count']) / value['total_beijing_product'])),
                          j['url'])
                         for i, j in value['top_5_beijing_product']])
        with open('output/TongjiCnzzGomeBusinessServiceCSV/' + key + '_referer.csv', 'wb') as f:
            w = unicodecsv.writer(f)
            w.writerow(('站外渠道名称', '流量占比'))
            w.writerows((i, '%.5g' % j) for i, j in value['referer_percent'])
def download_tweets_csv(self, request, queryset):
    f = StringIO()
    w = unicodecsv.writer(f, encoding='utf-8')
    for tweet in queryset:
        w.writerow((
            tweet['data']['id'],
            tweet['data']['text'],
            tweet['data']['timestamp'],
            tweet['data']['retweet_count'],
            tweet['data']['favorite_count'],
            tweet['data']['in_reply_to_status_id'],
            tweet['data']['in_reply_to_user_id'],
            tweet['data']['retweeted_status_id'],
            tweet['data']['coords'],
            tweet['data']['user']['screen_name'],
            tweet['data']['user']['id'],
            tweet['data']['user']['name'],
        ))
    f.seek(0)
    response = HttpResponse(f.read(), content_type='text/csv')
    response['Content-Disposition'] = 'attachment;filename=export.csv'
    return response
def to_csv(self, hql, csv_filepath, schema='default', delimiter=',',
           lineterminator='\r\n', output_header=True):
    schema = schema or 'default'
    with self.get_conn() as conn:
        with conn.cursor() as cur:
            logging.info("Running query: " + hql)
            cur.execute(hql)
            schema = cur.description
            with open(csv_filepath, 'wb') as f:
                writer = csv.writer(f, delimiter=delimiter,
                                    lineterminator=lineterminator,
                                    encoding='utf-8')
                if output_header:
                    writer.writerow([c[0] for c in cur.description])
                i = 0
                while True:
                    rows = [row for row in cur.fetchmany() if row]
                    if not rows:
                        break
                    writer.writerows(rows)
                    i += len(rows)
                    logging.info("Written {0} rows so far.".format(i))
                logging.info("Done. Loaded a total of {0} rows.".format(i))
def main():
    jsonfile = urllib2.urlopen(BOARD_URL)
    jsons = json.loads(jsonfile.read())
    cards = jsons["cards"]
    lists = jsons["lists"]
    members = jsons["members"]

    # done or archived, actually
    donelist = filter(lambda x: x["name"] == "Done" or x["closed"] == True, lists)[0]
    donecards = filter(lambda x: x["idList"] == donelist["id"], cards)

    cardsWithMember = lambda m, x: m["id"] in x["idMembers"]
    membersCardCount = dict()
    for member in members:
        numDoneWithMember = len(filter(lambda x: cardsWithMember(member, x), donecards))
        membersCardCount[member["username"]] = numDoneWithMember
    for member in membersCardCount.keys():
        print member + "\t:", membersCardCount[member]

    idToUserName = dict()
    for member in members:
        idToUserName[member["id"]] = member["username"]

    with open("done.csv", "w") as donecsvfile:
        csvwriter = csv.writer(donecsvfile)
        csvwriter.writerows(map(lambda x: cardToRow(x, idToUserName), donecards))
def _extend(self, rows):
    rows = iter(rows)
    if sys.version_info[0] == 3:
        f = self.open(self.path, 'a', newline='')
    elif sys.version_info[0] == 2:
        f = self.open(self.path, 'ab')
    try:
        row = next(rows)
    except StopIteration:
        return
    if isinstance(row, dict):
        schema = dshape(self.schema)
        row = coerce_record_to_row(schema, row)
        rows = (coerce_record_to_row(schema, row) for row in rows)

    # Write all rows to file
    writer = csv.writer(f, **self.dialect)
    writer.writerow(row)
    writer.writerows(rows)

    try:
        f.close()
    except AttributeError:
        pass
def retrivalByHashTag(numOfTopic, numOfCluster, clusters, hashtags):
    lda_result_dir = LDA_DIR + str(numOfTopic) + "/"
    kmean_result_dir = lda_result_dir + "KMEAN_" + str(numOfCluster) + "/"
    plot_dir = kmean_result_dir + "plot/"
    if not path.exists(plot_dir):
        makedirs(plot_dir)
    assignment_file = kmean_result_dir + "assignment_sorted.csv"
    docs = []
    with open(assignment_file, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, encoding='utf-8')
        for row in spamreader:
            if int(row[0]) in clusters:
                docs.append(row[1])
    dates = []
    files = [f for f in listdir(LDA_DIR)
             if isfile(join(LDA_DIR, f)) and "docs" in f and ".csv" in f]
    for file in files:
        with open(LDA_DIR + file, 'rb') as csvfile:
            spamreader = csv.reader(csvfile, encoding='utf-8')
            for row in spamreader:
                if row[0] in docs:
                    doc = row[3].lower()
                    tokens = tokenizer.tokenize(doc)
                    if any(i in hashtags for i in tokens):
                        postDate = row[5].split("T")[0]
                        postMonth = postDate[:-3]
                        dates.append(postMonth)
    sortedDates = sorted(dates, key=lambda d: map(int, d.split('-')))
    dateDict = Counter(sortedDates)
    startYear = int(sortedDates[0].split("-")[0])
    startMonth = sortedDates[0].split("-")[1]
    endYear = int(sortedDates[-1].split("-")[0])
    endMonth = sortedDates[-1].split("-")[1]
    print startYear, endYear
    cfile = ""
    for c in clusters:
        cfile = cfile + str(c) + "_"
    cfile = cfile[:-1]
    totalPost = len(docs)
    relevantPost = len(dates)
    plotWriter = csv.writer(open(plot_dir + 'plot_' + cfile + '.csv', 'wb'),
                            encoding='utf-8')
    plotWriter.writerow([sortedDates[0], sortedDates[-1]])
    plotWriter.writerow([totalPost, relevantPost])
    months = ["01", "02", "03", "04", "05", "06",
              "07", "08", "09", "10", "11", "12"]
    index = 1
    for y in range(startYear, endYear + 1):
        if y == startYear:
            for m in months[months.index(startMonth):]:
                d = str(y) + "-" + m
                if d in dateDict:
                    plotWriter.writerow([index, dateDict[d]])
                else:
                    plotWriter.writerow([index, 0])
                index = index + 1
            continue
        if y == endYear:
            for m in months[:months.index(endMonth) + 1]:
                d = str(y) + "-" + m
                if d in dateDict:
                    plotWriter.writerow([index, dateDict[d]])
                else:
                    plotWriter.writerow([index, 0])
                index = index + 1
            continue
        for m in months:
            d = str(y) + "-" + m
            if d in dateDict:
                plotWriter.writerow([index, dateDict[d]])
            else:
                plotWriter.writerow([index, 0])
            index = index + 1
def getAnnoataion():
    annotation_file = LDA_DIR + "AnnotationTaskDrugRelated.csv"
    annotation = dict()
    with open(annotation_file, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, encoding='utf-8')
        spamreader.next()
        for row in spamreader:
            annotation[row[0]] = row[1]
    annotation_counter = Counter(annotation.values())
    for t in NUM_OF_TOPIC:
        plotWriter = csv.writer(open(LDA_DIR + str(t) + 'clusterResult.csv', 'wb'),
                                encoding='utf-8')
        for c in NUM_OF_CLUSTER:
            lda_result_dir = LDA_DIR + str(t) + "/"
            kmean_result_dir = lda_result_dir + "KMEAN_" + str(c) + "/"
            assignment_file = kmean_result_dir + "assignment_sorted.csv"
            clustering = dict()
            with open(assignment_file, 'rb') as csvfile:
                spamreader = csv.reader(csvfile, encoding='utf-8')
                for row in spamreader:
                    if row[0] in clustering:
                        clustering[row[0]]["count"] = clustering[row[0]]["count"] + 1
                        if row[1] in annotation:
                            label = annotation[row[1]]
                            if label in clustering[row[0]]:
                                clustering[row[0]][label] = clustering[row[0]][label] + 1
                            else:
                                clustering[row[0]][label] = 1
                    else:
                        clustering[row[0]] = dict()
                        clustering[row[0]]["count"] = 1
                        if row[1] in annotation:
                            label = annotation[row[1]]
                            clustering[row[0]][label] = 1
            plotWriter.writerow(["K=" + str(c)])
            for i in range(0, c):
                h = str(i)
                print i, clustering[h]["count"]
                plotWriter.writerow(["cluster" + str(i), clustering[h]["count"]])
                plotWriter.writerow(["MedicalUse", "DrugAbuse", "Not Related", "Not Sure"])
                docCount = []
                docPrecent = []
                for j in ['1', '2', '3', '4']:
                    num = 0
                    numPrecent = 0.0
                    if j in clustering[h]:
                        num = clustering[h][j]  # / float(annotation_counter[j])
                        numPrecent = clustering[h][j] / float(annotation_counter[j])
                    docCount.append(num)
                    docPrecent.append(numPrecent)
                docCount.append(docCount[1] - docCount[2])
                docCount.append(docPrecent[1] - docPrecent[2])
                plotWriter.writerow(docCount)
                plotWriter.writerow([])
"Danish": "Denmark", "Carinthian": "Trieste", "Schleswiger": "Kiel", "Polish": "Warsaw" } if __name__ == "__main__": for fname in glob.glob(foldername + "*.results"): gamename = basename(fname) gamename, _ = splitext(gamename) # splitext is more robust print "Processing ", gamename reader = unicodecsv.reader(open(fname, "rb"), encoding="utf8", lineterminator="\n") gswriter = unicodecsv.writer(open( gamestatefolder + gamename + ".gamestate", "wb"), encoding="utf8", lineterminator="\n") # skip header reader.next() try: countries = {} supplycenter = {} r = reader.next() if r[1].lower().count("starting") == 0: continue lines = r[2].split("\n") for line in lines: if line.count("Army") != 0 or line.count("Fleet") != 0: l = line[:-1].split() country = l[0][:-1]
def merge_unload(data, delimiter, file_name, table, stage_table,
                 upserts_dml_statements, unload_query, unload_bucket_name,
                 unload_key_name, unload_status_query='',
                 additional_copy_options='', additional_unload_options=''):
    """
    Refer to this link for more details about the Redshift merge feature:
    https://docs.aws.amazon.com/redshift/latest/dg/merge-examples.html

    This function handles merge and unload to s3 in one single transaction.
    It creates a temp stage table identical to the main table, copies data
    from memory to s3 as a file, loads the file from s3 using the Redshift
    COPY command, then applies merge queries to update / insert into the
    main table. The caller has to pass the merge queries, as well as the
    unload query and the query that marks the rows being copied to s3.
    """
    try:
        with tempfile.NamedTemporaryFile() as temp:
            fn = temp.name
            with open(fn, "w") as f:
                writer = csv.writer(f, delimiter=delimiter)
                writer.writerows(data)
            logger.info("Loading data to s3")
            session = boto3.Session(
                aws_access_key_id=os.environ['DATA_ETL_AWS_ACCESS_KEY_ID'],
                aws_secret_access_key=os.environ['DATA_ETL_AWS_SECRET_KEY_ID'])
            s3 = session.resource('s3')
            s3_bucket_load = S3_BUCKET_NAME
            s3_key_load = '{}/temp/{}.csv'.format(S3_PREFIX_KEY, file_name)
            object = s3.Object(s3_bucket_load, s3_key_load)
            object.upload_file(fn)
            upsert_statements = []
            create_stage_table_sql = "create temp table %s (like %s including defaults);" % (
                stage_table, table)
            sql_copy = '''COPY {} FROM 's3://{}/{}'
                CREDENTIALS 'aws_access_key_id={};aws_secret_access_key={}'
                REGION as 'us-west-2'
                STATUPDATE ON COMPUPDATE ON
                delimiter '{}' {};
                '''.format(stage_table, s3_bucket_load, s3_key_load,
                           os.environ['DATA_ETL_AWS_ACCESS_KEY_ID'],
                           os.environ['DATA_ETL_AWS_SECRET_KEY_ID'],
                           delimiter, additional_copy_options)
            sql_unload = '''UNLOAD($${}$$) TO 's3://{}/{}'
                CREDENTIALS 'aws_access_key_id={};aws_secret_access_key={}'
                delimiter ',' {};
                '''.format(unload_query, unload_bucket_name, unload_key_name,
                           os.environ['DATA_ETL_AWS_ACCESS_KEY_ID'],
                           os.environ['DATA_ETL_AWS_SECRET_KEY_ID'],
                           additional_unload_options)
            upsert_statements.extend([create_stage_table_sql, sql_copy])
            upsert_statements.extend(upserts_dml_statements)
            upsert_statements.append(sql_unload)
            if unload_status_query is not None:
                upsert_statements.append(unload_status_query)
            execute_queries(upsert_statements)
            object.delete()
    except:
        e = sys.exc_info()
        logger.error("Error occurred in merge_unload function: {}".format(e))
        raise
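# Usage sketch for merge_unload (the users table, its stage table, and the
# merge DML below are illustrative assumptions; the actual statements depend
# on your table's keys):
rows = [(1, 'alice'), (2, 'bob')]
merge_queries = [
    "begin transaction;",
    "delete from users using users_stage where users.id = users_stage.id;",
    "insert into users select * from users_stage;",
    "end transaction;",
]
merge_unload(rows, '|', 'users_batch', 'users', 'users_stage', merge_queries,
             unload_query="select * from users where exported = false",
             unload_bucket_name='my-unload-bucket',
             unload_key_name='exports/users_')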
# -*- coding: utf-8 -*-
import os
import re

import unicodecsv as csv
from google.cloud import translate

translate_client = translate.Client()

dirs = [d for d in os.walk("/Users/levisrael/sefaria/Aspaklaria/www.aspaklaria.info")]
fnames = [fname
          for (dirpath, dirnames, filenames) in dirs[1:]
          for fname in filenames[1:]]
clean_names = [re.sub("\s+", " ", f.replace(".html", "").strip()) for f in fnames]

with open("Aspaklaria-Headwords.csv", "w") as csvout:
    csvout = csv.writer(csvout)
    for tag in clean_names:
        translation = translate_client.translate(tag, target_language='en',
                                                 source_language='iw')
        csvout.writerow([tag, translation['translatedText']])
def generate_user_report(output_filename):
    import unicodecsv
    from django.core.files.storage import get_storage_class

    from kpi.deployment_backends.kc_access.shadow_models import (
        KobocatUser,
        KobocatUserProfile,
        KobocatXForm,
    )
    from hub.models import ExtraUserDetail

    def format_date(d):
        if hasattr(d, 'strftime'):
            return d.strftime('%F')
        else:
            return d

    def get_row_for_user(u: KobocatUser) -> list:
        row_ = []
        try:
            profile = KobocatUserProfile.objects.get(user=u)
        except KobocatUserProfile.DoesNotExist:
            profile = None
        try:
            extra_user_detail = ExtraUserDetail.objects.get(user_id=u.pk)
        except ExtraUserDetail.DoesNotExist:
            extra_details = None
        else:
            extra_details = extra_user_detail.data
        row_.append(u.username)
        row_.append(u.email)
        row_.append(u.pk)
        row_.append(u.first_name)
        row_.append(u.last_name)
        if extra_details:
            name = extra_details.get('name', '')
        else:
            name = ''
        if name:
            row_.append(name)
        elif profile:
            row_.append(profile.name)
        else:
            row_.append('')
        if extra_details:
            organization = extra_details.get('organization', '')
        else:
            organization = ''
        if organization:
            row_.append(organization)
        elif profile:
            row_.append(profile.organization)
        else:
            row_.append('')
        row_.append(KobocatXForm.objects.filter(user=u).count())
        if profile:
            row_.append(profile.num_of_submissions)
        else:
            row_.append(0)
        row_.append(format_date(u.date_joined))
        row_.append(format_date(u.last_login))
        return row_

    CHUNK_SIZE = 1000
    columns = [
        'username', 'email', 'pk', 'first_name', 'last_name', 'name',
        'organization', 'XForm count', 'num_of_submissions', 'date_joined',
        'last_login',
    ]
    default_storage = get_storage_class()()
    with default_storage.open(output_filename, 'wb') as output_file:
        writer = unicodecsv.writer(output_file)
        writer.writerow(columns)
        kc_users = KobocatUser.objects.exclude(
            pk=settings.ANONYMOUS_USER_ID
        ).order_by('pk')
        for kc_user in kc_users.iterator(CHUNK_SIZE):
            try:
                row = get_row_for_user(kc_user)
            except Exception as e:
                row = ['!FAILED!', 'User PK: {}'.format(kc_user.pk), repr(e)]
            writer.writerow(row)
def handle(self, *args, **options):
    # read xml from url
    file_url = ('http://webservicesri.swalekha.in/Service.asmx/GetExportSRIRegistrationData'
                + '?pUsername=%s&pPassword=%s' % (settings.JSLPS_USERNAME, settings.JSLPS_PASSWORD))
    url = urllib2.urlopen(file_url)
    contents = url.read()
    xml_file = open("jslps_data_integration_files/person.xml", 'w')
    xml_file.write(contents)
    xml_file.close()

    partner = Partner.objects.get(id=24)
    user_obj = User.objects.get(username="******")
    csv_file = open('jslps_data_integration_files/person_error.csv', 'wb')
    wtr = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
    tree = ET.parse('jslps_data_integration_files/person.xml')
    root = tree.getroot()
    for c in root.findall('SRIRegistrationData'):
        pc = c.find('MemID').text
        pn = unicode(c.find('MemberName').text)
        if c.find('FatherName') is not None:
            pfn = unicode(c.find('FatherName').text)
        else:
            pfn = ''
        if c.find('PhoneNo') is not None:
            phone = c.find('PhoneNo').text
        else:
            phone = ''
        if c.find('Age') is not None:
            age = int(c.find('Age').text)
        else:
            age = None
        if c.find('Gender') is not None:
            gender = c.find('Gender').text
        else:
            gender = 'F'
        vc = c.find('VillageCode').text
        gc = c.find('GroupCode').text
        try:
            village = JSLPS_Village.objects.get(village_code=vc)
        except JSLPS_Village.DoesNotExist as e:
            wtr.writerow(['JSLPS village not EXIST: ' + str(pc), vc, e])
            continue
        error = 0
        try:
            group = JSLPS_Persongroup.objects.get(group_code=gc)
        except JSLPS_Persongroup.DoesNotExist as e:
            wtr.writerow(['JSLPS group not EXIST: ' + str(pc), gc, e])
            error = 1
        if error == 1:
            try:
                person, created = Person.objects.get_or_create(
                    person_name=pn, father_name=pfn, partner=partner,
                    gender=gender, village=village.Village)
                person.age = age
                person.phone_no = phone
                person.user_created_id = user_obj.id
                person.save()
                jslps.new_count += 1
            except Exception as e:
                person = None
                if "Duplicate entry" not in str(e):
                    jslps.other_error_count += 1
                    wtr.writerow(['Error in save person group: ', pc, e])
                else:
                    jslps.duplicate_count += 1
        else:
            try:
                person, created = Person.objects.get_or_create(
                    person_name=pn, father_name=pfn, partner=partner,
                    gender=gender, village=village.Village)
                person.age = age
                person.phone_no = phone
                person.group = group.group
                person.user_created_id = user_obj.id
                person.save()
                jslps.new_count += 1
            except Exception as e:
                person = None
                if "Duplicate entry" not in str(e):
                    jslps.other_error_count += 1
                    wtr.writerow(['Error in save person (group(%s)): ' % (str(gc)), pc, e])
                else:
                    jslps.duplicate_count += 1
        if person != None:
            jslps_person_list = JSLPS_Person.objects.filter(person_code=pc)
            if jslps_person_list.count() == 0:
                if group is not None:
                    jslps_person, created = JSLPS_Person.objects.get_or_create(
                        person_code=pc, person=person,
                        user_created_id=user_obj.id, activity="LIVELIHOOD")
                    jslps_person.group = group
                    jslps_person.save()
                else:
                    jslps_person, created = JSLPS_Person.objects.get_or_create(
                        person_code=pc, person=person,
                        user_created_id=user_obj.id, activity="LIVELIHOOD")
            else:
                jslps_person = jslps_person_list[0]
                jslps_person.person = person
                jslps_person.save()
        else:
            person_list = Person.objects.filter(person_name=pn, father_name=pfn,
                                                village=village.Village)
            if person_list.count() != 0:
                person = person_list[0]
                jslps_person_list = JSLPS_Person.objects.filter(person_code=pc, person=person)
                if jslps_person_list.count() == 0:
                    if group is not None:
                        jslps_person, created = JSLPS_Person.objects.get_or_create(
                            person_code=pc, person=person,
                            user_created_id=user_obj.id, activity="LIVELIHOOD")
                        jslps_person.group = group
                        jslps_person.save()
                    else:
                        jslps_person, created = JSLPS_Person.objects.get_or_create(
                            person_code=pc, person=person,
                            user_created_id=user_obj.id, activity="LIVELIHOOD")
                else:
                    jslps_person = jslps_person_list[0]
                    if jslps_person.person == None:
                        jslps_person.person = person
                        jslps_person.save()
            else:
                wtr.writerow(['Person not saved and duplicate also not exist', pc, "not saved"])
    csv_file.close()
            col = td.text.strip()
            if tdC == 0:
                newCol = col.split('\n')
                try:
                    if len(newCol[1]) > 0:
                        rowData.append(newCol[1])
                except:
                    continue
            else:
                col.strip('"')  # takes quotes off wages
                if tdC < 3:
                    rowData.append(col)
                # ignoring 3rd column since it's vacant
                if tdC > 3:
                    col = col[1:].replace(",", "")  # removes commas from wages
                    try:
                        col = int(col)
                        rowData.append(int(col))
                    except:
                        continue
            tdC += 1
        if len(rowData) > 0:  # only append if there's actual data
            if type(rowData[-1]) is int:  # only accepting data with player wages
                rowData.append(clubName[i])
                data.append(rowData)

with open("playerData.csv", 'a') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerows(data)
            else:
                trimmed_record[column] = "Unknown"
        if "nominate" in record:
            for column in output_columns["nominate_columns"]:
                if record["nominate"] is not None and column in record["nominate"]:
                    trimmed_record[column] = record["nominate"][column]
                else:
                    trimmed_record[column] = "Unknown"
        for column in output_columns["record_columns"]:
            if column in record:
                trimmed_record[column] = record[column]
            else:
                trimmed_record[column] = "Unknown"
        output.append(trimmed_record)
    # except Exception as e:
    #     print e
    #     print "(moving on)"

print "Output compiled ({}/{} matched records included)".format(
    str(len(output)), str(len(matched)))

print "Writing JSON..."
with open(output_location, "w") as jsonout:
    json.dump(output, jsonout, indent=4)

if csv_location is not None:
    print "Writing CSV..."
    with open(csv_location, "w") as csvfile:
        writer = csv.writer(csvfile, encoding="utf-8")
        writer.writerow(output_columns["order"])
        for match in tqdm(matched, desc="write"):
            writer.writerow(generate_row(match))

print "Done!"
def handle(self, *args, **options):
    # read xml from url
    file_url = ('http://webservicesri.swalekha.in/Service.asmx/GetExportGroupMemberDataHnN'
                + '?pUsername=%s&pPassword=%s' % (settings.JSLPS_USERNAME, settings.JSLPS_PASSWORD))
    url = urllib2.urlopen(file_url)
    contents = url.read()
    xml_file = open("jslps_data_integration_files/jslps-hnn-person.xml", 'w')
    xml_file.write(contents)
    xml_file.close()

    partner = Partner.objects.get(id=24)
    user_obj = User.objects.get(username="******")
    csv_file = open('jslps_data_integration_files/jslps-hnn-person.csv', 'wb')
    wtr = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
    tree = ET.parse('jslps_data_integration_files/jslps-hnn-person.xml')
    root = tree.getroot()
    for c in root.findall('GroupMemberData'):
        district_code = c.find('DistrictCode').text
        block_code = c.find('BlockCode').text
        village_code = c.find('VillageCode').text
        group_code = c.find('GroupCode').text
        member_code = c.find('Group_M_Code').text
        member_name = c.find('MemberName').text
        father_name = c.find('FatherName').text if c.find('FatherName') else 'X'
        gender = 'F'
        try:
            village = JSLPS_Village.objects.get(village_code=village_code)
        except JSLPS_Village.DoesNotExist as e:
            wtr.writerow(['JSLPS village not EXIST: ' + str(member_code),
                          village_code, group_code, e])
            continue
        error = 0
        try:
            group = JSLPS_Persongroup.objects.get(group_code=group_code)
        except JSLPS_Persongroup.DoesNotExist as e:
            wtr.writerow(['JSLPS group not EXIST: ' + str(member_code), group_code, e])
            error = 1
        # where group is not present we are trying to save person
        if error == 1:
            try:
                person, created = Person.objects.get_or_create(
                    person_name=member_name, partner=partner,
                    gender=gender, village=village.Village)
                jslps.new_count += 1
            except Exception as e:
                person = None
                if "Duplicate entry" not in str(e):
                    jslps.other_error_count += 1
                    wtr.writerow(['Error in save person group: ', member_code, e])
                else:
                    jslps.duplicate_count += 1
        # where group is present we are trying to save person
        else:
            try:
                person, created = Person.objects.get_or_create(
                    person_name=member_name, partner=partner, gender=gender,
                    village=village.Village, father_name=father_name,
                    group=group.group, user_created_id=user_obj.id)
                # person.group = group.group
                # person.user_created_id = user_obj.id
                # person.save()
                jslps.new_count += 1
            except Exception as e:
                # import pdb; pdb.set_trace()
                person = None
                if "Duplicate entry" not in str(e):
                    jslps.other_error_count += 1
                    wtr.writerow(['Error in save person (group(%s)): ' % (str(group_code)),
                                  member_code, e])
                else:
                    jslps.duplicate_count += 1
        if person != None:
            jslps_person_list = JSLPS_Person.objects.filter(person_code=member_code)
            if len(jslps_person_list) == 0:
                if group is not None:
                    jslps_person, created = JSLPS_Person.objects.get_or_create(
                        person_code=member_code, person=person,
                        user_created_id=user_obj.id, activity="HNN")
                    jslps_person.group = group
                    jslps_person.save()
                else:
                    jslps_person, created = JSLPS_Person.objects.get_or_create(
                        person_code=member_code, person=person,
                        user_created_id=user_obj.id, activity="HNN")
            else:
                jslps_person = jslps_person_list[0]
                jslps_person.person = person
                jslps_person.save()
        else:
            person_list = Person.objects.filter(person_name=member_name,
                                                village=village.Village)
            if len(person_list) != 0:
                person = person_list[0]
                jslps_person_list = JSLPS_Person.objects.filter(
                    person_code=member_code, person=person)
                if len(jslps_person_list) == 0:
                    if group is not None:
                        jslps_person, created = JSLPS_Person.objects.get_or_create(
                            person_code=member_code, person=person,
                            user_created_id=user_obj.id, activity="HNN")
                        jslps_person.group = group
                        jslps_person.save()
                    else:
                        jslps_person, created = JSLPS_Person.objects.get_or_create(
                            person_code=member_code, person=person,
                            user_created_id=user_obj.id, activity="HNN")
                else:
                    jslps_person = jslps_person_list[0]
                    if jslps_person.person == None:
                        jslps_person.person = person
                        jslps_person.save()
            else:
                wtr.writerow(['Person not saved and duplicate also not exist',
                              member_code, "not saved"])
    csv_file.close()
def campaign_view_counts(campaign):
    timezone = g.user.timezone if g.user else 'UTC'
    viewdict = defaultdict(dict)
    interval = chart_interval_for(campaign)

    hourly_views = db.session.query('hour', 'count').from_statement(db.text(
        '''SELECT date_trunc(:interval, campaign_view.datetime AT TIME ZONE 'UTC' AT TIME ZONE :timezone) AS hour,
           COUNT(*) AS count FROM campaign_view WHERE campaign_id=:campaign_id
           GROUP BY hour ORDER BY hour;'''
    )).params(interval=interval, timezone=timezone, campaign_id=campaign.id)
    for hour, count in hourly_views:
        viewdict[hour]['_views'] = count

    hourly_views = db.session.query('hour', 'count').from_statement(db.text(
        '''SELECT date_trunc(:interval, campaign_anon_view.datetime AT TIME ZONE 'UTC' AT TIME ZONE :timezone) AS hour,
           COUNT(*) AS count FROM campaign_anon_view WHERE campaign_id=:campaign_id
           GROUP BY hour ORDER BY hour;'''
    )).params(interval=interval, timezone=timezone, campaign_id=campaign.id)
    for hour, count in hourly_views:
        viewdict[hour]['_views'] = viewdict[hour].setdefault('_views', 0) + count

    hourly_views = db.session.query('hour', 'count').from_statement(db.text(
        '''SELECT date_trunc(:interval, campaign_user_action.created_at AT TIME ZONE 'UTC' AT TIME ZONE :timezone) AS hour,
           COUNT(DISTINCT(user_id)) AS count FROM campaign_user_action
           WHERE action_id IN (SELECT id FROM campaign_action WHERE campaign_id = :campaign_id AND type != :dismiss_type)
           GROUP BY hour ORDER BY hour;'''
    )).params(interval=interval, timezone=timezone, campaign_id=campaign.id,
              dismiss_type=CAMPAIGN_ACTION.DISMISS)
    for hour, count in hourly_views:
        viewdict[hour]['_combined'] = count

    hourly_views = db.session.query('hour', 'count').from_statement(db.text(
        '''SELECT date_trunc(:interval, campaign_anon_user_action.created_at AT TIME ZONE 'UTC' AT TIME ZONE :timezone) AS hour,
           COUNT(DISTINCT(anon_user_id)) AS count FROM campaign_anon_user_action
           WHERE action_id IN (SELECT id FROM campaign_action WHERE campaign_id = :campaign_id AND type != :dismiss_type)
           GROUP BY hour ORDER BY hour;'''
    )).params(interval=interval, timezone=timezone, campaign_id=campaign.id,
              dismiss_type=CAMPAIGN_ACTION.DISMISS)
    for hour, count in hourly_views:
        viewdict[hour]['_combined'] = viewdict[hour].setdefault('_combined', 0) + count

    action_names = []
    for action in campaign.actions:
        action_names.append(action.name)
        hourly_views = db.session.query('hour', 'count').from_statement(db.text(
            '''SELECT date_trunc(:interval, campaign_user_action.created_at AT TIME ZONE 'UTC' AT TIME ZONE :timezone) AS hour,
               COUNT(*) AS count FROM campaign_user_action WHERE action_id=:action_id
               GROUP BY hour ORDER BY hour;'''
        )).params(interval=interval, timezone=timezone, action_id=action.id)
        for hour, count in hourly_views:
            viewdict[hour][action.name] = count
        hourly_views = db.session.query('hour', 'count').from_statement(db.text(
            '''SELECT date_trunc(:interval, campaign_anon_user_action.created_at AT TIME ZONE 'UTC' AT TIME ZONE :timezone) AS hour,
               COUNT(*) AS count FROM campaign_anon_user_action WHERE action_id=:action_id
               GROUP BY hour ORDER BY hour;'''
        )).params(interval=interval, timezone=timezone, action_id=action.id)
        for hour, count in hourly_views:
            viewdict[hour][action.name] = viewdict[hour].setdefault(action.name, 0) + count

    if viewdict:
        # Top-off with site-wide user presence (available since 31 Jan 2015 in user_active_at)
        minhour = g.user.tz.localize(min(viewdict.keys())).astimezone(UTC).replace(tzinfo=None)
        maxhour = g.user.tz.localize(max(viewdict.keys()) + timedelta(seconds=3599)).astimezone(UTC).replace(tzinfo=None)
        hourly_views = db.session.query('hour', 'count').from_statement(db.text(
            '''SELECT date_trunc(:interval, user_active_at.active_at AT TIME ZONE 'UTC' AT TIME ZONE :timezone) AS hour,
               COUNT(DISTINCT(user_active_at.user_id)) AS count FROM user_active_at
               WHERE user_active_at.active_at >= :min AND user_active_at.active_at <= :max
               GROUP BY hour ORDER BY hour;'''
        )).params(interval=interval, timezone=timezone, min=minhour, max=maxhour)
        for hour, count in hourly_views:
            viewdict[hour]['_site'] = count
        # wrapped in db.text for consistency with the other queries above
        hourly_views = db.session.query('hour', 'count').from_statement(db.text(
            '''SELECT DATE_TRUNC(:interval, event_session.created_at AT TIME ZONE 'UTC' AT TIME ZONE :timezone) AS hour,
               COUNT(DISTINCT(anon_user_id)) AS count FROM event_session
               WHERE event_session.anon_user_id IS NOT NULL
               AND event_session.created_at >= :min AND event_session.created_at <= :max
               GROUP BY hour ORDER BY hour'''
        )).params(interval=interval, timezone=timezone, min=minhour, max=maxhour)
        for hour, count in hourly_views:
            viewdict[hour]['_site'] = viewdict[hour].setdefault('_site', 0) + count

    viewlist = []
    for slot in viewdict:
        row = [slot,
               viewdict[slot].get('_site', 0),
               viewdict[slot].get('_views', 0),
               viewdict[slot].get('_combined', 0)]
        for name in action_names:
            row.append(viewdict[slot].get(name, 0))
        viewlist.append(row)
    viewlist.sort()  # Sorts by first column, the hour slot (datetime)
    for row in viewlist:
        row[0] = row[0].isoformat()

    outfile = StringIO()
    out = unicodecsv.writer(outfile, 'excel')
    out.writerow(['_hour', '_site', '_views', '_combined'] + action_names)
    out.writerows(viewlist)
    return outfile.getvalue(), 200, {'Content-Type': 'text/plain'}
def handle(self, *args, **options):
    # Saving videos
    file_url = 'http://webservicesri.swalekha.in/Service.asmx/GetExportVedioMasterData' \
        + '?pUsername=%s&pPassword=%s' % (settings.JSLPS_USERNAME, settings.JSLPS_PASSWORD)
    url = urllib2.urlopen(file_url)
    contents = url.read()
    xml_file = open("jslps_data_integration_files/video.xml", 'w')
    xml_file.write(contents)
    xml_file.close()
    partner = Partner.objects.get(id=24)
    csv_file = open('jslps_data_integration_files/videos_error.csv', 'wb')
    wtr = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
    tree = ET.parse('jslps_data_integration_files/video.xml')
    root = tree.getroot()
    for c in root.findall('VedioMasterData'):
        vdc = c.find('VideoID').text
        vn = c.find('VideoTitle').text
        vt = int(c.find('VideoType').text)
        if c.find('Category') is not None:
            cg = int(c.find('Category').text)
        else:
            cg = None
            jslps.other_error_count += 1
            wtr.writerow(['Can not save video without category', vdc,
                          'title', vn, 'Category missing in XML'])
            continue
        if c.find('SubCategory') is not None:
            scg = int(c.find('SubCategory').text)
        else:
            scg = None
            jslps.other_error_count += 1
            wtr.writerow(['Can not save video without subcategory', vdc,
                          'title', vn, 'SubCategory missing in XML'])
            continue
        if c.find('Practice') is not None:
            vp = int(c.find('Practice').text)
        else:
            vp = None
            jslps.other_error_count += 1
            wtr.writerow(['Can not save video without practice', vdc,
                          'title', vn, 'Practice missing in XML'])
            continue
        if c.find('YouTubeID') is not None:
            yid = c.find('YouTubeID').text
        else:
            yid = ''
        pd = datetime.strptime(c.find('ProductionDate').text, '%d/%m/%Y')
        if c.find('ApprovalDt') is not None:
            ad = datetime.strptime(c.find('ApprovalDt').text, '%d/%m/%Y')
        else:
            ad = None
        # sd = datetime.strptime(c.find('StartDt').text, '%d/%m/%Y')
        # ed = datetime.strptime(c.find('EndDt').text, '%d/%m/%Y')
        # Map the JSLPS language codes onto local Language ids
        ln = int(c.find('Video_Language').text)
        if ln == 2:
            ln = 18
        elif ln == 3:
            ln = 10
        if c.find('Benefit') is not None:
            benefit = unicode(c.find('Benefit').text)
        else:
            benefit = ''
        vc = c.find('VillageCode').text
        pro_team = c.find('ProductionTeam').text.split(',')
        error = 0
        try:
            village = JSLPS_Village.objects.get(village_code=vc)
            language = Language.objects.get(id=ln)
            # Fall back to animator code 4 when a team member is unknown
            try:
                facilitator = JSLPS_Animator.objects.get(animator_code=pro_team[0])
            except JSLPS_Animator.DoesNotExist:
                facilitator = JSLPS_Animator.objects.get(animator_code=str(4))
            try:
                camera_operator = JSLPS_Animator.objects.get(animator_code=pro_team[1])
            except JSLPS_Animator.DoesNotExist:
                camera_operator = JSLPS_Animator.objects.get(animator_code=str(4))
            try:
                category = Category.objects.get(id=cg)
            except Category.DoesNotExist as e:
                category = None
                jslps.other_error_count += 1
                wtr.writerow(['Can not save video without category', vdc, 'title', vn, e])
                continue
            try:
                subcategory = SubCategory.objects.get(id=scg)
            except SubCategory.DoesNotExist as e:
                subcategory = None
                jslps.other_error_count += 1
                wtr.writerow(['Can not save video without subcategory', vdc, 'title', vn, e])
                continue
            try:
                videopractice = VideoPractice.objects.get(id=vp)
            except VideoPractice.DoesNotExist as e:
                videopractice = None
                jslps.other_error_count += 1
                wtr.writerow(['Can not save video without practice', vdc, 'title', vn, e])
                continue
        except (JSLPS_Village.DoesNotExist, Language.DoesNotExist) as e:
            print e
            if "Duplicate entry" not in str(e):
                jslps.other_error_count += 1
                wtr.writerow(['village', vc, 'title', vn, e])
            error = 1
        if error == 0:
            # Skip videos that already exist for this village (title+village key)
            video_set = dict(Video.objects.filter(
                village_id=village.Village.id).values_list('title', 'village'))
            video_db = []
            video_xml = str(vn) + str(village.Village.id)
            for key, value in video_set.iteritems():
                name = str(key) + str(value)
                video_db.append(name)
            if video_xml not in video_db:
                try:
                    vid = Video(title=vn, video_type=vt, language=language,
                                benefit=benefit, production_date=pd,
                                village_id=village.Village.id, partner=partner,
                                approval_date=ad, youtubeid=yid,
                                category=category, subcategory=subcategory,
                                videopractice=videopractice)
                    vid.save()
                    jslps.new_count += 1
                    print "video saved"
                except Exception as e:
                    print vdc, e
                    if "Duplicate entry" in str(e):
                        jslps.duplicate_count += 1
                    else:
                        jslps.other_error_count += 1
                        wtr.writerow(['video save', vdc, e])
                try:
                    vid = Video.objects.get(title=vn,
                                            village_id=village.Village.id,
                                            partner_id=partner.id)
                    vid.production_team.add(facilitator.animator)
                    vid.save()
                    jslps.new_count += 1
                    vid.production_team.add(camera_operator.animator)
                    vid.save()
                    jslps.new_count += 1
                    print "production team saved"
                except Exception as e:
                    if "Duplicate entry" in str(e):
                        jslps.duplicate_count += 1
                    else:
                        jslps.other_error_count += 1
                        wtr.writerow(['production team save', e])
            # Record the JSLPS-side id for videos not yet mapped
            video_added = []
            video = None
            try:
                video = Video.objects.filter(title=vn,
                                             village_id=village.Village.id,
                                             partner_id=partner.id).get()
                video_added = JSLPS_Video.objects.values_list('vc', flat=True)
                # video_added = [i[0] for i in video_added]
            except Exception as e:
                print e
            try:
                if vdc not in video_added:
                    vj = JSLPS_Video(vc=vdc, video=video)
                    vj.save()
            except Exception as e:
                print vdc, e
                if "Duplicate entry" not in str(e):
                    jslps.other_error_count += 1
                    wtr.writerow(['JSLPS Video save', vdc, e])

    # Saving non-negotiables
    file_url = 'http://webservicesri.swalekha.in/Service.asmx/GetExportVedioNon_NegotiableMasterData' \
        + '?pUsername=%s&pPassword=%s' % (settings.JSLPS_USERNAME, settings.JSLPS_PASSWORD)
    url = urllib2.urlopen(file_url)
    contents = url.read()
    xml_file = open("jslps_data_integration_files/nn.xml", 'w')
    xml_file.write(contents)
    xml_file.close()
    partner = Partner.objects.get(id=24)
    tree = ET.parse('jslps_data_integration_files/nn.xml')
    root = tree.getroot()
    for c in root.findall('VedioNon_NegotiableMasterData'):
        nn_c = c.find('Non_NegotiablesID').text
        vdc = int(c.find('VideoId').text)
        nn_n = c.find('Non_NegotiablesName').text
        vr = c.find('Verification').text
        vr = False if vr == 'false' else True
        error = 0
        try:
            video = JSLPS_Video.objects.get(vc=vdc)
        except JSLPS_Video.DoesNotExist:
            error = 1
        if error == 0:
            try:
                nonnego_already_exist = NonNegotiable.objects.filter(
                    video_id=video.video_id, non_negotiable=nn_n,
                    physically_verifiable=vr)
                if len(nonnego_already_exist) == 0:
                    nn = NonNegotiable(video_id=video.video_id,
                                       non_negotiable=nn_n,
                                       physically_verifiable=vr)
                    nn.save()
                    jslps.new_count += 1
            except Exception as e:
                print e
                if "Duplicate entry" in str(e):
                    jslps.duplicate_count += 1
                else:
                    jslps.other_error_count += 1
                    wtr.writerow(['Non nego', nn_c, 'video', vdc, e])
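# The importer above repeats one pattern for every missing or unresolvable
# field: bump jslps.other_error_count and write a row to videos_error.csv.
# A small helper could factor that out. A sketch against the same wtr/jslps
# objects the command already has in scope (the helper name is hypothetical,
# not part of the original):
def log_video_error(wtr, reason, vdc, vn, detail=''):
    # One error row per skipped video, matching the columns written above.
    jslps.other_error_count += 1
    wtr.writerow([reason, vdc, 'title', vn, detail])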
ignored = {}
available = {}
# Tweets that are already labeled (present in FINAL) and should be skipped
for fname in os.listdir(FINAL):
    tid = fname.split("_")[1].split(".")[0]
    ignored[tid] = 0
# Get remaining unlabeled tweets
for fname in os.listdir(TWEETS):
    tid = fname.split("_")[1].split(".")[0]
    if tid not in ignored:
        p = os.path.join(TWEETS, fname)
        with codecs.open(p, "rb", encoding="utf-8") as f:
            tweet = json.load(f, encoding="utf-8")
        text = tweet["text"]
        # Remove all line breaks in a tweet
        text = text.replace('\n', ' ').replace('\r', '')
        available[tid] = text
        # Tabs would break the tab-separated output below, so flag them
        if "\t" in available[tid]:
            print "tab", tid
# Store in csv file, one tab-separated (id, text) row per tweet
with open(DST, "wb") as f:
    writer = csv.writer(f, dialect='excel', encoding='utf-8', delimiter="\t")
    for tid in available:
        text = available[tid]
        writer.writerow([tid, text])
def dump_csv(array, delimiter=','):
    f = io.BytesIO()
    writer = csv.writer(f, delimiter=delimiter, quoting=csv.QUOTE_ALL)
    writer.writerow(array)
    # The writer terminates the row with '\r\n'; slice it off before decoding
    return f.getvalue()[:-2].decode('utf-8')
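# Usage sketch for dump_csv(), assuming `csv` here is the unicodecsv module
# used throughout these snippets; QUOTE_ALL wraps every field in quotes:
#
#   dump_csv([u'a', u'b,c'])      # -> u'"a","b,c"'
#   dump_csv([u'x', u'y'], ';')   # -> u'"x";"y"'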
def contracts_csv(request, unit_slug, semester):
    hiring_semester = get_object_or_404(HiringSemester,
                                        semester__name=semester,
                                        unit__in=request.units,
                                        unit__label=unit_slug)
    contracts = TAContract.objects.signed(hiring_semester)
    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'inline; filename="%s.csv"' % (
        hiring_semester.semester.name)
    writer = csv.writer(response)
    writer.writerow([
        'Batch ID', 'Term ID', 'Contract Signed', 'Benefits Indicator',
        'EmplID', 'SIN', 'Last Name', 'First Name 1', 'First Name 2',
        'Payroll Start Date', 'Payroll End Date', 'Action', 'Action Reason',
        'Position Number', 'Job Code', 'Full_Part time', 'Pay Group',
        'Employee Class', 'Category', 'Fund', 'Dept ID (cost center)',
        'Project', 'Account', 'Prep Units', 'Base Units', 'Appt Comp Freq',
        'Semester Base Salary Rate', 'Biweekly Base Salary Pay Rate',
        'Hourly Rate', 'Standard Hours', 'Scholarship Rate Code',
        'Semester Scholarship Salary Pay Rate',
        'Biweekly Scholarship Salary Pay Rate', 'Lump Sum Amount',
        'Lump Sum Hours', 'Scholarship Lump Sum'
    ])

    seq = hiring_semester.next_export_seq()
    batchid = '%s_%s_%02i' % (hiring_semester.unit.label,
                              datetime.date.today().strftime("%Y%m%d"), seq)

    for c in contracts:
        bu = c.bu
        total_bu = c.total_bu
        prep_units = c.total_bu - c.bu
        signed = 'Y'
        benefits = 'Y'
        schol_rate = 'TSCH' if c.scholarship_per_bu > 0 else ''
        salary_total = c.total_pay
        schol_total = c.scholarship_pay
        if prep_units == 0:
            prep_units = ''

        row = []
        # Batch ID
        row.append(batchid)
        # Term ID
        row.append(hiring_semester.semester.name)
        # Contract Signed
        row.append(signed)
        # Benefits Indicator
        row.append(benefits)
        # EmplID
        row.append(c.person.emplid)
        # SIN
        row.append(c.sin)
        # Name
        row.extend([c.person.last_name, c.person.first_name, c.person.middle_name])
        # Payroll Start Date, Payroll End Date
        row.append(c.pay_start.strftime("%Y%m%d"))
        row.append(c.pay_end.strftime("%Y%m%d"))
        # Action, Action Reason
        row.append('REH')
        row.append('REH')
        # Position Number
        row.append("%08i" % c.category.account.position_number)
        # Job Code
        row.append('')
        # Full_Part time
        row.append('')
        # Pay Group
        row.append('TSU')
        # Employee Class
        row.append('')
        # Category
        row.append(c.category.code)
        # Fund
        row.append(11)
        # Dept ID (cost center)
        row.append(hiring_semester.unit.deptid())
        # Project
        row.append('')
        # Account
        row.append(c.category.account.account_number)
        # Prep Units
        row.append(prep_units)
        # Base Units
        row.append(bu)
        # Appt Comp Freq
        row.append('T')
        # Semester Base Salary Rate, two decimal places
        row.append("%.2f" % (salary_total,))
        # Biweekly Base Salary Rate, Hourly Rate, Standard Hours
        row.extend(['', '', ''])
        # Scholarship Rate Code
        row.append(schol_rate)
        # Semester Scholarship Salary Pay Rate
        row.append(schol_total)
        # Biweekly Scholarship Salary Pay Rate, Lump Sum Amount,
        # Lump Sum Hours, Scholarship Lump Sum
        row.extend(['', '', '', ''])
        writer.writerow(row)

    return response
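# The batch ID combines the unit label, the export date and a per-semester
# sequence number zero-padded to two digits, e.g. (made-up values):
#
#   '%s_%s_%02i' % ('CMPT', '20141104', 3)   # -> 'CMPT_20141104_03'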
def _save_id_map(self):
    import unicodecsv
    with open(self.id_map_file, 'wb') as handle:
        writer = unicodecsv.writer(handle)
        for key, value in self.id_map.items():
            writer.writerow([value, key])
def write_data(file_data, file_name):
    with open(file_name, 'a') as f:
        f_csv = unicodecsv.writer(f, encoding='utf-8-sig')
        f_csv.writerows(file_data)
    print 'output finished!'
def main():
    home_directory = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    with open(os.path.join(home_directory,
                           'extracted_ocr\\Salem_Athenaeum_1811.html'), 'r') as f:
        contents = f.read()
    file = html.unescape(bs(contents, 'lxml'))
    catalog = []
    final_catalog = []
    sum_of_heights = 0
    volume_sizes = ['folio', 'fol', 'quarto', '4to', 'octavo', 'oct', '8vo',
                    'duodecimo', 'duodec', '12mo']
    for index, page in enumerate(file.find_all('page')):
        # Count the words that go into the denominator for average word height
        counted_words = 0
        words = page.find_all('word')
        if len(words) == 0:
            continue
        words.sort(key=lambda x: float(x.get('xmin')))
        # First we sorted all of the words on the page from furthest left to
        # furthest right. Now we sort them all again by highest to lowest.
        # Assuming one column of text per page, this does an excellent job of
        # putting all of the words in normal reading order. Catalogs with two
        # columns per page will need a different method.
        words.sort(key=lambda x: float(x.get('ymin')))
        old_line_y = float(words[0].get('ymin'))
        line = []
        for word in words:
            # Ignore empty "words." Not sure where they come from; this
            # started happening after switching to ASCII encoding from the
            # problematic-for-other-reasons UTF-8.
            if word.text == '':
                continue
            # Ignore single lower-case letters; they tend to be OCR artifacts
            if re.match("[a-z]{1}$", word.text):
                continue
            # Ignore et cetera
            if word.text == '&c':
                continue
            # Ignore all-caps words because they're typically headers. Even
            # in catalogs where they're not (e.g. Ladies' Lib of Kalamazoo),
            # the OCR usually renders all-caps running text as normal text.
            if re.match("[A-Z]{2}", word.text):
                continue
            # Needed for Lib Co Boston 1830, where many 1s became Is in the
            # shelf number column:
            # if word.text == 'I':
            #     continue
            # Ignore random flecks turned into punctuation by OCR; now
            # combined with the number screen below.
            # if re.match("[_., %*:|'\"\^\-º“]$", word.text):
            #     continue
            # Ignore common column headings ("Mo." is a common OCR error for
            # an italicized "No.")
            if (re.match("Vol", word.text) or re.match("Wols", word.text)
                    or re.match("No.", word.text) or re.match("Mo.", word.text)
                    or re.match("Size", word.text)):
                continue
            # "Shelf" is such a pain it gets a line of its own.
            if edit_distance("shelf", word.text.rstrip('.,?!:;').casefold()) < 3:
                continue
            # Ignore page numbers: 1-3 digit numbers not followed by "nd",
            # "rd", "th", etc. This can't be done for Charleston, where the
            # numbers are needed to calculate indents correctly, and can be
            # adjusted to ignore shelf numbers (e.g. NY Mechanics 1844) and
            # OCR-eaten numbers. Not all of this punctuation is in the ASCII
            # code space, yet all of it has appeared in OCR extracted by
            # pdftotext with ASCII7 encoding. ('*' had to be removed from the
            # list for NY Society 1813.)
            if (re.match("[=%_.,:;#|*'\"\^\-º•§“{}\[\]&]{1,5}$", word.text)
                    or re.search("[0-9]{3,4}", word.text)):
                # if re.match("[0-9]{1,4}$", word.text):
                continue
            counted_words += 1
            sum_of_heights = sum_of_heights + (float(word.get('ymax'))
                                               - float(word.get('ymin')))
            line_y = float(word.get('ymin'))
            # We know we're on a new line of text when ymin increases by more
            # than the threshold. 7 pixels was chosen empirically from the
            # first several catalogs (possibly too large for very small-type
            # catalogs) and raised to 8 for Milwaukee YMA; the non-straight
            # lines in Charleston Lib Co needed 12.
            if (line_y - 8) > old_line_y:
                old_line_y = line_y
                if line:
                    line.sort(key=lambda x: float(x.get('xmin')))
                    catalog.append(line)
                line = [word]
            else:
                line.append(word)
        # Append the last line on the page
        if len(line) > 0:
            line.sort(key=lambda x: float(x.get('xmin')))
            catalog.append(line)

        # Process the page, merging split lines into single entries
        previous_line_xmin = None
        for entry in catalog:
            # Is the new line indented further than the old line? If so, it
            # needs special handling. For the first line on the page the
            # question is moot.
            if previous_line_xmin is None:
                indent = 0
            else:
                first_real_word = next(
                    (y for y in entry if y.text.rstrip('.,?!') != "do"), None)
                if first_real_word:
                    this_line_xmin = float(first_real_word.get('xmin'))
                    indent = previous_line_xmin - this_line_xmin
                else:
                    continue
            if (indent + 25) < 0:
                # Big indent: carry down everything from the previous line
                # with an xmin smaller than this word's xmin. Since this
                # propagates all relevant information from earlier lines, it
                # *should* work even in catalogs with multiple indent levels.
                # ('vocable' because 'word' was already taken in this script;
                # 10 pixels of slop again.)
                for vocable in final_catalog[-1]:
                    if (float(vocable.get('xmin')) + 10) < this_line_xmin:
                        entry.append(vocable)
                # Sort again because appending broke the x-order
                entry = sorted(entry, key=lambda x: float(x.get('xmin')))
            elif (indent + 5) < 0:
                # Little indent: this line continues the line above. Try to
                # reassemble hyphenated words: the OCR process ate the hyphens
                # at line ends, so guess that two fragments belong together
                # when the first word of the second line is not capitalized
                # and is not recognized by the spellchecker.
                last_word_of_carryover = final_catalog[-1][-1].string
                first_word_of_line = entry[0].string
                if first_word_of_line and (
                        re.match("[A-Z]", first_word_of_line)
                        or re.match("[0-9 \-.,]+", first_word_of_line)
                        or (first_word_of_line.rstrip('.,?!').casefold() in SpellChecker()
                            and last_word_of_carryover.rstrip('.,?!').casefold() in SpellChecker())):
                    final_catalog[-1] += entry
                    final_catalog_sorted = sorted(
                        final_catalog[-1], key=lambda x: float(x.get('xmin')))
                    previous_line_xmin = float(final_catalog_sorted[0].get('xmin'))
                    entry = None
                else:
                    final_catalog[-1][-1].string = final_catalog[-1][-1].text + entry[0].text
                    del entry[0]
                    final_catalog[-1] += entry
                    final_catalog_sorted = sorted(
                        final_catalog[-1], key=lambda x: float(x.get('xmin')))
                    previous_line_xmin = float(final_catalog_sorted[0].get('xmin'))
                    entry = None
            if entry:
                previous_line_xmin = float(entry[0].get('xmin'))
                final_catalog.append(entry)
        catalog = []

    average_line_height = sum_of_heights / counted_words
    # average_line_height = sum_of_heights / len(file.find_all('word'))
    with open(os.path.join(home_directory,
                           'replication\\Salem_Athenaeum_1811.csv'), 'wb+') as outfile:
        csvwriter = unicodecsv.writer(outfile, encoding='utf-8')
        for item in final_catalog:
            # Write catalog entries out to a CSV, omitting shelf numbers and
            # sizes. Also omit headers, to the extent they can be identified
            # by a line height more than 20% above average (tested on the
            # first word in the line only).
            if (float(item[0].get('ymax')) - float(item[0].get('ymin'))
                    < average_line_height * 1.2):
                final_entry = ''
                for vocable in item:
                    # Don't include shelf numbers etc. in the output, but do
                    # keep "1st", "12th", etc. Strip punctuation that confuses
                    # Solr (commas and periods interact badly with the fuzzy
                    # search ~ when they trail a word), including "'s" at the
                    # end of words.
                    stripped_vocable = re.sub(
                        r'[\+ \- & \| ! , \. ( ) \{ \} \[ \] \^ " ~ \* \? : \\ #”`]',
                        '', vocable.text)
                    if stripped_vocable == '':
                        continue
                    # 'o' is included because volume sizes must survive for a
                    # future step to work
                    if (stripped_vocable[0].isalpha()
                            or stripped_vocable[-1] == 'h'
                            or stripped_vocable[-1] == 'd'
                            or stripped_vocable[-1] == 't'
                            or stripped_vocable[-1] == 'o'):
                        stripped_vocable = stripped_vocable.replace("'s", '')
                    # Don't attach donor info from the ends of entries
                    if stripped_vocable == 'Gift':
                        break
                    # For NY Apprentice's Lib and NY Society 1813, everything
                    # after the volume size is junk, so stop adding words when
                    # one appears.
                    if stripped_vocable == 'vol' or stripped_vocable.casefold() in volume_sizes:
                        break
                    # if stripped_vocable != 'do' and stripped_vocable != 'ditto':
                    final_entry += ' ' + stripped_vocable
                # For NY Society 1813 and NY Mercantile 1825: these
                # translation notes break matching every time. "translated"
                # itself (which precedes these phrases in NY Merc 1825) is
                # already in the Solr matching script's ignore list.
                # final_entry = final_entry.replace("from the Latin", "")
                # final_entry = final_entry.replace("from the French", "")
                # final_entry = final_entry.replace("from the German", "")
                csvwriter.writerow([final_entry])
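# The hyphen-rejoin heuristic above builds a fresh SpellChecker() for every
# membership test. A sketch of the same test factored out with one shared
# checker (pyspellchecker supports `word in checker`); the condition mirrors
# main(), but the helper name is ours, not the original's:
import re
from spellchecker import SpellChecker

_CHECKER = SpellChecker()

def is_hyphen_fragment(prev_word, next_word):
    # True when the two fragments should be glued back together: the
    # continuation is neither capitalized nor numeric, and the pair is not
    # two dictionary words that merely happen to be adjacent.
    if re.match("[A-Z]", next_word) or re.match("[0-9 \-.,]+", next_word):
        return False
    return not (next_word.rstrip('.,?!').casefold() in _CHECKER
                and prev_word.rstrip('.,?!').casefold() in _CHECKER)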
import warc
import sys
import unicodecsv as csv

filetypes = ['.CSV', '.XLS', '.XLSX', '.JSON', '.RDF', '.ZIP']
geofiletypes = ('.GEOJSON', '.GML', '.GPX', '.GJSON', '.TIFF', '.SHP', '.KML',
                '.KMZ', '.WMS', '.WFS')
filetypes.extend(geofiletypes)

csvoutfile = open(sys.argv[1] + '.data.csv', 'a+b')
datawriter = csv.writer(csvoutfile, delimiter=',')
columns = ['Stadt_URL', 'URL_Datei', 'URL_Text', 'URL_Dateiname', 'Format',
           'geo', 'URL_PARENT', 'Title_Parent']
datawriter.writerow(columns)

f = warc.open(sys.argv[2])
domain = sys.argv[1]
blacklist = ('.jpg', '.gif', '.ico', '.txt', '.pdf', '.png', 'dns:', '.css', '.js')
for record in f:
    # Keep only metadata records for the target domain that aren't obviously
    # binary assets or DNS lookups
    if ('WARC-Target-URI' in record.header) and (
            domain in record['WARC-Target-URI']) and not any(
            x in record['WARC-Target-URI'] for x in blacklist) and \
            'metadata' in record['warc-type']:
        # for item in record.__dict__['header'].items():
        #     print item
        pass
def claro_sva_start():
    import MySQLdb
    import MySQLdb.cursors
    import unicodecsv as csv
    import gzip
    import urlparse
    from datetime import datetime, timedelta
    from tcm import app

    uri = urlparse.urlparse(app.config['SQLALCHEMY_DATABASE_URI'])
    conn = MySQLdb.connect(host=uri.hostname,
                           port=uri.port or 3306,
                           user=uri.username,
                           passwd=uri.password,
                           db=uri.path.lstrip('/'),
                           cursorclass=MySQLdb.cursors.SSCursor)
    dt = datetime.today()
    td = datetime.today() - timedelta(days=1)
    c = conn.cursor()
    c.execute(
        """SELECT C.created, C.id_product, CH.msisdn, C.activated, C.medium,
                  CH.charged, CH.value, C.cancelled, CH.charged, CH.updated,
                  C.msisdn, C.id_package, CH.code
           FROM contract_manager.charge as CH
           JOIN contract_manager.contract as C ON C.id = CH.id_contract
           WHERE C.id_product in (44, 45, 52)
             and CH.charged between '{}' and '{}';""".format(td, dt))
    date = td.strftime("%Y%m%d%H%M00")
    filename = 'SVA_10_CLARO_START_{date}.txt.gz'.format(date=date)
    i = 0  # row counter; stays 0 if the query returns nothing
    with gzip.open(filename, 'w') as f:
        writer = csv.writer(f, delimiter='|')
        writer.writerow([
            "Data_Criacao_Registro_Origem", "Hora_Criacao_Registro_Origem",
            "Codigo_servico", "Codigo_Categoria", "Numero_telefone",
            "Data_Ativacao", "Hora_Ativacao", "Codigo_canal", "Comando",
            "Data_Tarifacao", "Hora_Tarifacao", "Valor_Tarifacao",
            "codigo_assinatura", "Data_Cancelamento", "Hora_Cancelamento",
            "Data_Tentativa", "Hora_Tentativa", "Codigo_Status",
            "Data_Estimada_Churn", "Hora_Estimada_Churn",
            "Valor_Repasse_Parceiro", "Valor_Repasse_Claro", "Valor_Imposto",
            "Codigo_Parceiro", "codigo_cliente", "tipo_chave",
            "Codigo_Tipo_Serviço", "Duracao_Servico",
            "Codigo_detalhe_produto", "Codigo_Sharecode"
        ])
        for i, row in enumerate(c.fetchall(), 1):
            writer.writerow([
                '%s' % row[0].strftime("%Y%m%d %H:%M:%S"), '',
                '%s' % row[1], '',
                '%s' % row[2][2:],
                '%s' % row[3].strftime("%Y%m%d %H:%M:%S"), '',
                '%s' % row[4], '',
                '%s' % row[5].strftime("%Y%m%d %H:%M:%S"), '',
                '%s' % str(round(row[6], 2)).replace('.', ','), '',
                '%s' % row[7].strftime("%Y%m%d %H:%M:%S") if row[7] else '', '',
                '%s' % row[8].strftime("%Y%m%d %H:%M:%S") if row[8] else '',
                '%s' % row[9].strftime("%Y%m%d %H:%M:%S"), '',
                '', '', '40', '60', '', '10',
                '%s' % row[10][2:], '1', 'e', '',
                '%s' % row[11],
                '%s' % row[12]
            ])
    return {'total': i, 'filename': filename}
def subClustering(t, k, id, num):
    lda_result_dir = LDA_DIR + str(t) + "/"
    kmean_result_dir = lda_result_dir + "KMEAN_" + str(k) + "/"
    assignment_file = kmean_result_dir + "assignment_sorted.csv"
    kmean_result_subdir = kmean_result_dir + "sub/"

    # Document ids assigned to the cluster that is being re-clustered
    docs_id = []
    with open(assignment_file, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, encoding='utf-8')
        for row in spamreader:
            if row[0] == id:
                docs_id.append(row[1])

    # Topic distributions for just those documents
    docs = []
    distribution_file = lda_result_dir + 'document-topic-distribution.csv'
    with open(distribution_file, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, encoding='utf-8')
        next(spamreader)  # skip header
        i = 0
        for row in spamreader:
            if row[0] in docs_id:
                docs.append(row)
            i = i + 1
    print docs

    result = numpy.array(docs).astype('float')
    kmeans = KMeans(n_clusters=num, init='k-means++', n_init=10, max_iter=300,
                    tol=0.0001, precompute_distances='auto', verbose=0,
                    random_state=None, copy_x=True, n_jobs=1)
    kmeans.fit(result[:, 1:])  # first column is the document id

    label = str(t) + '_' + str(k) + '_' + str(num)
    centerWriter = csv.writer(
        open(kmean_result_subdir + 'subcenters_' + label + '.csv', 'wb'),
        encoding='utf-8')
    center_list = []
    for i, center in enumerate(kmeans.cluster_centers_):
        row = [int(i)]
        center_list.append([])
        for value in center:
            row.append(value)
        centerWriter.writerow(row)

    labels = kmeans.labels_
    assignmentWriter = csv.writer(
        open(kmean_result_subdir + 'assignment' + label + '.csv', 'wb'),
        encoding='utf-8')
    for i, index in enumerate(result[:, 0]):
        dist = numpy.linalg.norm(result[i, 1:] - kmeans.cluster_centers_[labels[i]])
        center_list[labels[i]].append((int(index), dist))
        assignmentWriter.writerow([int(index), labels[i], dist])

    sortedWriter = csv.writer(
        open(kmean_result_subdir + 'assignment_sorted' + label + '.csv', 'wb'),
        encoding='utf-8')
    for i, d in enumerate(center_list):
        sorted_d = sorted(d, key=lambda x: x[1])
        for item in sorted_d:
            sortedWriter.writerow([int(i), item[0], item[1]])
import unicodecsv
import requests
from BeautifulSoup import BeautifulSoup

headers = ['county', 'precinct', 'office', 'district', 'party', 'candidate', 'votes']
offices = ['United States President', 'US Senator', 'US Representative',
           'State Representative', 'Governor', 'State Treasurer',
           'Attorney General', 'Secretary of State',
           'United States President and VP']
office_lookup = {
    'US Senator': 'U.S. Senate',
    'US Representative': 'U.S. House',
    'Governor': 'Governor',
    'State Senator': 'State Senate',
    'State Representative': 'State House',
    'State Representative, 58th District': 'State House'
}

with open('20141104__or__general__union__precinct.csv', 'wb') as csvfile:
    w = unicodecsv.writer(csvfile, encoding='utf-8')
    w.writerow(headers)
    r = requests.get('http://union-county.org/wp-content/uploads/2013/08/EL301.htm')
    soup = BeautifulSoup(r.text)
    lines = soup.find('pre').text.split('\r\n')
    keys = []
    for line in lines:
        # Skip blank lines and report boilerplate
        if line.strip() == '\n':
            continue
        if "PRECINCT REPORT Union County, Oregon" in line:
            continue
        if line.strip() == '':
            continue
        if 'Run Date:' in line:
            continue
def KMeanClustering():
    for numOfTopic in NUM_OF_TOPIC:
        lda_result_dir = LDA_DIR + str(numOfTopic) + "/"
        distribution_file = lda_result_dir + 'document-topic-distribution.csv'
        for num in NUM_OF_CLUSTER:
            kmean_result_dir = lda_result_dir + "KMEAN_" + str(num) + "/"
            if not path.exists(kmean_result_dir):
                makedirs(kmean_result_dir)
            reader = csv.reader(open(distribution_file, "rb"), encoding='utf-8')
            next(reader, None)  # skip header
            x = list(reader)
            result = numpy.array(x).astype('float')
            kmeans = KMeans(n_clusters=num, init='k-means++', n_init=10,
                            max_iter=300, tol=0.0001,
                            precompute_distances='auto', verbose=0,
                            random_state=None, copy_x=True, n_jobs=1)
            kmeans.fit(result[:, 1:])  # first column is the document id

            centerWriter = csv.writer(open(kmean_result_dir + 'centers.csv', 'wb'),
                                      encoding='utf-8')
            header = []
            header.append("id")
            for i in range(numOfTopic):
                header.append("Topic " + str(i))
            centerWriter.writerow(header)
            center_list = []
            for i, center in enumerate(kmeans.cluster_centers_):
                row = [int(i)]
                center_list.append([])
                for value in center:
                    row.append(value)
                centerWriter.writerow(row)

            labels = kmeans.labels_
            assignmentWriter = csv.writer(
                open(kmean_result_dir + 'assignment.csv', 'wb'),
                encoding='utf-8')
            for i, index in enumerate(result[:, 0]):
                dist = numpy.linalg.norm(result[i, 1:] - kmeans.cluster_centers_[labels[i]])
                center_list[labels[i]].append((int(index), dist))
                assignmentWriter.writerow([int(index), labels[i], dist])

            sortedWriter = csv.writer(
                open(kmean_result_dir + 'assignment_sorted.csv', 'wb'),
                encoding='utf-8')
            docsWriter = csv.writer(
                open(kmean_result_dir + 'assignment_sorted_doc.csv', 'wb'),
                encoding='utf-8')
            for i, d in enumerate(center_list):
                sorted_d = sorted(d, key=lambda x: x[1])
                count = 0
                for item in sorted_d:
                    sortedWriter.writerow([int(i), item[0], item[1]])
                    # Keep only the 20 documents closest to each centre
                    if count < 20:
                        docsWriter.writerow([int(i), item[0], item[1]])
                    count = count + 1
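# Aside: the per-document distance loop above can be collapsed into one call.
# scikit-learn's KMeans.transform() returns each sample's distance to every
# cluster centre, so the distance to the assigned centre is a simple gather.
# A sketch using the same names as above:
#
#   dists = kmeans.transform(result[:, 1:])                    # (n_docs, num)
#   own = dists[numpy.arange(len(kmeans.labels_)), kmeans.labels_]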
def do(self):
    order_round = get_current_order_round()
    print("Order round: %s" % order_round)
    if order_round.is_open:
        print("Order round is still open")
        return
    if order_round.is_not_open_yet():
        print("Order round is not open yet")
        return
    if order_round.order_placed:
        print("Order has already been placed")
        return

    # To prevent mail loops
    order_round.order_placed = True
    order_round.save()

    for supplier in Supplier.objects.all():
        if not supplier.has_orders_in_current_order_round():
            print("No orders for supplier %s" % supplier)
            log_event(event="Supplier %s has no orders in current round, "
                            "so not sending order list." % supplier)
            continue

        print("Generating lists for supplier %s" % supplier)

        # Generate CSV
        in_mem_file = io.BytesIO()
        csv_writer = csv.writer(in_mem_file, delimiter=';', quotechar='|')
        # Write header row
        csv_writer.writerow([
            "Aantal", "Eenheid", "Product", "Omschrijving",
            "Inkoopprijs Euro", "Subtotaal"
        ])

        ordered_products = supplier.products. \
            exclude(orderproducts=None). \
            filter(orderproducts__order__paid=True). \
            filter(order_round=order_round). \
            annotate(amount_sum=Sum('orderproducts__amount'))

        if not ordered_products:
            log_event(event="Supplier %s has only STOCK orders "
                            "in current round, "
                            "so not sending order list." % supplier)
            continue

        # Write products and amounts
        for obj in ordered_products:
            csv_writer.writerow([
                fix_decimal_separator(obj.amount_sum),
                obj.unit_of_measurement,
                obj.name,
                # TODO make sure newlines aren't stored in our products
                obj.description.replace("\n", " ").replace("\r", " "),
                fix_decimal_separator(obj.base_price),
                fix_decimal_separator(obj.amount_sum * obj.base_price)
            ])

        # Write 'total' row
        total = fix_decimal_separator(
            sum([obj.amount_sum * obj.base_price for obj in ordered_products]))
        csv_writer.writerow([])
        csv_writer.writerow(["TOTAAL", "", "", "", "", total])
        # Rewind so the attachment read below starts at the top
        in_mem_file.seek(0)

        # Generate mail
        subject = ('VOKO Utrecht - Bestellijst voor %s'
                   % order_round.collect_datetime.strftime("%d %B %Y"))
        from_email = 'VOKO Utrecht Boerencontact <*****@*****.**>'
        to = '%s <%s>' % (supplier.name, supplier.email)
        text_content = """
Hoi %s,

Hierbij sturen we onze bestelling voor bestelronde %d voor ons
uitlevermoment op %s.

De bestelling is in CSV-formaat, dit is te openen in bijvoorbeeld Excel.

Dit is een geautomatiseerd bericht, maar reageren is gewoon mogelijk.

Vriendelijke groeten,
VOKO Utrecht
""" % (supplier, order_round.pk,
       order_round.collect_datetime.strftime("%d %B %Y"))

        msg = EmailMultiAlternatives(
            subject, text_content, from_email, [to],
            cc=["VOKO Utrecht Boerencontact <*****@*****.**>"])
        msg.attach('voko_utrecht_bestelling_ronde_%d.csv' % order_round.pk,
                   in_mem_file.read().decode("utf-8"),
                   'text/csv')
        msg.send()
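# do() leans on a fix_decimal_separator helper that isn't shown in this
# snippet. Given the ';'-delimited Dutch-locale sheet it produces, a
# plausible minimal version (an assumption, not the project's actual code):
def fix_decimal_separator(value):
    # Dutch spreadsheets expect ',' as the decimal separator.
    return str(value).replace('.', ',')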
def LDA():
    files = [f for f in listdir(LDA_DIR)
             if isfile(join(LDA_DIR, f)) and "docs" in f and ".csv" in f]
    texts = []
    index = []
    for file in files:
        print file
        with open(LDA_DIR + file, 'rb') as csvfile:
            spamreader = csv.reader(csvfile, encoding='utf-8')
            for row in spamreader:
                # Tokenize, drop stopwords, stem; keep documents with > 2 tokens
                doc = row[3].lower()
                tokens = tokenizer.tokenize(doc)
                stopped_tokens = [i for i in tokens if not i in en_stop]
                stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
                if len(stemmed_tokens) > 2:
                    texts.append(stemmed_tokens)
                    index.append(row[0])
    dictionary = corpora.Dictionary(texts)
    # save() requires a target path; the filename here is arbitrary
    dictionary.save(join(LDA_DIR, 'dictionary.dict'))
    corpus = [dictionary.doc2bow(text) for text in texts]
    print len(corpus)
    for num in NUM_OF_TOPIC:
        lda_result_dir = LDA_DIR + str(num) + "/"
        if not path.exists(lda_result_dir):
            makedirs(lda_result_dir)
        if isfile(join(lda_result_dir, "model.model")):
            ldamodel = models.ldamodel.LdaModel.load(
                join(lda_result_dir, "model.model"), mmap='r')
        else:
            ldamodel = models.ldamodel.LdaModel(corpus, num_topics=num,
                                                id2word=dictionary, passes=20)
            ldamodel.save(lda_result_dir + "model.model")
        print "-------------", ldamodel.alpha

        # Top 20 words per topic
        textWriter = csv.writer(open(lda_result_dir + 'topic.csv', 'wb'),
                                encoding='utf-8')
        for i in range(num):
            textWriter.writerow(["Topic " + str(i)])
            for word in ldamodel.show_topic(i, 20):
                textWriter.writerow([word[0], word[1]])
            textWriter.writerow([])

        # Per-document topic distribution, one column per topic
        disWriter = csv.writer(
            open(lda_result_dir + 'document-topic-distribution.csv', 'wb'),
            encoding='utf-8')
        header = []
        header.append("id")
        for i in range(num):
            header.append("Topic " + str(i))
        disWriter.writerow(header)
        for i, bow in enumerate(corpus):
            distribution = [None] * (num + 1)
            distribution[0] = index[i]
            for dis in ldamodel.get_document_topics(bow, minimum_probability=0):
                distribution[dis[0] + 1] = dis[1]
            disWriter.writerow(distribution)
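# The tokenize/stopword/stem pipeline inside LDA() could be factored into a
# helper so other entry points reuse it. A sketch against the same globals
# the function already assumes (tokenizer, en_stop, p_stemmer):
def preprocess(doc):
    tokens = tokenizer.tokenize(doc.lower())
    stopped = [t for t in tokens if t not in en_stop]
    return [p_stemmer.stem(t) for t in stopped]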
import requests
import simplejson
import unicodecsv as csv

API_KEY = 'vhqTHd5SvnkydORd2kwPa1CPUgN8bzHwNvTQu7KUdUgxGh88eF'
API_URL = 'http://api.tumblr.com/v2/blog/%s/posts?api_key=%s&offset=%d'
TUMBLR_NAME = 'wearethe99percent.tumblr.com'

# First request just to learn the total post count
r = requests.get(API_URL % (TUMBLR_NAME, API_KEY, 0))
json = simplejson.loads(r.content)
total_posts = json['response']['total_posts']
offset = 0

csv_file = open('tumblr.csv', 'w')
writer = csv.writer(csv_file)
writer.writerow(('id', 'post_url', 'timestamp', 'datetime', 'tags', 'body'))

# Page through the API, 20 posts at a time
while offset < total_posts:
    print 'Fetching with offset %d' % offset
    r = requests.get(API_URL % (TUMBLR_NAME, API_KEY, offset))
    offset += 20
    json = simplejson.loads(r.content)
    posts = json['response']['posts']
    for post in posts:
        if post['type'] == 'text':
            body = post['body']
        elif post['type'] == 'photo':
            body = post['caption']
        else:
            continue  # other post types carry no usable body
def write_to_csv(self, item):
    csv_write = unicodecsv.writer(self.csvfile, encoding='utf-8-sig',
                                  dialect='excel')
    csv_write.writerow(item)
from kafka import KafkaConsumer
import json
import unicodecsv as csv
from ast import literal_eval
import os.path
from os import path

# Connect to a particular topic and decode the messages
# (decode the raw bytes first, then parse the JSON)
consumer = KafkaConsumer(
    'Histcounts_Month',
    bootstrap_servers=['0.0.0.0:9092'],
    value_deserializer=lambda v: json.loads(v.decode('utf-8')))

# Write the CSV header once, on first run
if not path.exists("histclientcounts_month.csv"):
    f = open("histclientcounts_month.csv", "a")
    fWriter = csv.writer(f)
    # f.write('\n')
    fWriter.writerow(['DisplayName', 'id', 'CollectionTime', 'Subkey', 'type', 'MAC'])
    f.close()

# flag for spacing
a = 1
flag = False
f = open("histclientcounts_month.csv", "a")
f.write('\n')
f.close()

# loop for messages
for msg in consumer:
    f = open("histclientcounts_month.csv", "a")
    # new line after 6 entries
    if (a - 1) % 6 == 0 and flag:
        f.write('\n')
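# One plausible shape for the row-building step the truncated loop would
# perform: pull the header's fields out of each deserialized JSON object and
# append them as one CSV row. The field names mirror the header above; the
# sample payload is made up:
#
#   record = msg.value  # already a dict after the deserializer above
#   row = [record.get(k, '') for k in
#          ['DisplayName', 'id', 'CollectionTime', 'Subkey', 'type', 'MAC']]
#   csv.writer(f).writerow(row)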
def execute(self, context):
    vertica = VerticaHook(vertica_conn_id=self.vertica_conn_id)
    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
    tmpfile = None
    result = None
    selected_columns = []
    count = 0

    with closing(vertica.get_conn()) as conn:
        with closing(conn.cursor()) as cursor:
            cursor.execute(self.sql)
            selected_columns = [d.name for d in cursor.description]
            if self.bulk_load:
                # Stream rows into a tab-separated temp file for LOAD DATA
                tmpfile = NamedTemporaryFile("w")
                self.log.info("Selecting rows from Vertica to local file %s...",
                              tmpfile.name)
                self.log.info(self.sql)
                csv_writer = csv.writer(tmpfile, delimiter='\t', encoding='utf-8')
                for row in cursor.iterate():
                    csv_writer.writerow(row)
                    count += 1
                tmpfile.flush()
            else:
                self.log.info("Selecting rows from Vertica...")
                self.log.info(self.sql)
                result = cursor.fetchall()
                count = len(result)
            self.log.info("Selected rows from Vertica %s", count)

    if self.mysql_preoperator:
        self.log.info("Running MySQL preoperator...")
        mysql.run(self.mysql_preoperator)

    try:
        if self.bulk_load:
            self.log.info("Bulk inserting rows into MySQL...")
            with closing(mysql.get_conn()) as conn:
                with closing(conn.cursor()) as cursor:
                    cursor.execute(
                        "LOAD DATA LOCAL INFILE '%s' INTO "
                        "TABLE %s LINES TERMINATED BY '\r\n' (%s)" %
                        (tmpfile.name, self.mysql_table,
                         ", ".join(selected_columns)))
                    conn.commit()
            tmpfile.close()
        else:
            self.log.info("Inserting rows into MySQL...")
            mysql.insert_rows(table=self.mysql_table, rows=result,
                              target_fields=selected_columns)
        self.log.info("Inserted rows into MySQL %s", count)
    except (MySQLdb.Error, MySQLdb.Warning):
        self.log.info("Inserted rows into MySQL 0")
        raise

    if self.mysql_postoperator:
        self.log.info("Running MySQL postoperator...")
        mysql.run(self.mysql_postoperator)
    self.log.info("Done")
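# The bulk path feeds LOAD DATA a plain tab/newline-delimited file, so a
# field containing a literal tab or newline would shift columns. A guard
# that could run on each row before writerow (a hypothetical helper, not
# part of the operator above; '\N' is MySQL's LOAD DATA NULL marker):
def sanitize_for_load_data(row):
    return ['\\N' if col is None else
            str(col).replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')
            for col in row]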
@retry(wait_random_min=5000, wait_random_max=10000)
def parse_page(url):
    print url
    token = str(random.randint(100000, 999999))
    cat_page = html.parse(base + url)
    pages = cat_page.xpath('//select[@name="page"]/option')
    pages = int(pages[-1].attrib["value"]) if pages else 0
    # First page, then the paginated rest
    for thread in cat_page.xpath('//ul[@class="thread-list"]/li//h3/a'):
        scrape_thread(thread)
    for page in xrange(1, pages):
        cat_page = html.parse(base + url + "?p_token=" + token
                              + "&page=" + str(page))
        for thread in cat_page.xpath('//ul[@class="thread-list"]/li//h3/a'):
            scrape_thread(thread)

base = "http://patient.info"
start = html.parse("http://patient.info/forums")
f = open('patient.csv', 'w')
w = unicodecsv.writer(f, encoding='utf-8', lineterminator='\n')
w.writerow(['uniqueID', 'qid', 'localID', 'title', 'poster', 'date',
            'replyTo', 'content', 'infered_replies', 'category'])
for letter in string.lowercase:
    for cat in html.parse(base + "/forums/index-" + letter).xpath(
            '//table[@class="zebra-table"]//a'):
        subforum = cat.text
        parse_page(cat.attrib["href"])
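# Globals this scraper assumes but doesn't show: the @retry decorator matches
# the `retrying` package (wait_random_min/wait_random_max in milliseconds),
# html is lxml.html, and scrape_thread is defined elsewhere. Presumed imports:
#
#   from retrying import retry
#   from lxml import html
#   import random, string, unicodecsv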