def scrape_ri():
    # scrape inspection reports for Rhode Island
    numbers = range(1, 9)
    # was string.letters.upper(), which yields every letter twice; use the
    # uppercase alphabet once
    letters = string.ascii_uppercase
    numbers.extend(letters)
    f = open('ri.csv', 'wt')
    writer = ucsv.writer(f)
    for n in numbers:
        print 'starting %s' % n
        r = requests.get(
            'http://food.ri.digitalhealthdepartment.com/search.cfm?facletter=%s&search=active&maxrows=1000' % n
        )
        soup = BeautifulSoup(r.text)
        links = []
        table = soup.find('table', {'class': 'mainsearchclassmall'})
        id_list = [x.attrs[0][1].split('&id=')[1] for x in table('a', {'class': None})]
        for id in id_list:
            link = 'http://food.ri.digitalhealthdepartment.com/scores.cfm?id=%s' % id
            links.append(link)
            writer.writerow((link,))
            print link
        print 'ending %s, waiting 5sec' % n
        time.sleep(5)
    f.close()
def scrape_links(read_csv=False, export_csv=False, update=True):
    """
    Gets all course links from the PDGA site.

    :param export_csv: whether or not to export a csv file: pgda_courses.csv
    :returns: links, list of urls for courses
    """
    links = []
    if read_csv:
        # fetch the links from the csv file
        f = open('pgda_courses.csv', 'r')
        reader = ucsv.reader(f)
        for row in reader:
            links.append(row[3])
        f.close()
    else:
        # fetch the links from the site
        start_r = requests.get('http://www.pdga.com/course_directory/country/us')
        start_soup = BeautifulSoup(start_r.text)
        last_link = start_soup.find('a', {'class': 'pager-last active'}).attrs[0][1]
        last_page = int(re.search('page=[\d][\d]', last_link).group(0).split('=')[1])
        if export_csv:
            f = open('pgda_courses.csv', 'wt')
        try:
            if export_csv:
                writer = ucsv.writer(f)
            for page in range(0, last_page + 1):
                DIRECTORY_URL = 'http://www.pdga.com/course_directory/country/us'
                if page == 0:
                    link = DIRECTORY_URL
                else:
                    link = '%s&page=%s' % (DIRECTORY_URL, page)
                r = requests.get(link)
                soup = BeautifulSoup(r.text)
                for row in soup('table')[0].tbody('tr'):
                    tds = row.findAll('td')
                    course_link = "%s%s" % (BASE_URL, tds[0].a.attrs[0][1].split('?s=')[0])
                    name = tds[0].text
                    city = tds[1].text
                    province = tds[2].text
                    if export_csv:
                        writer.writerow((name, city, province, course_link))
                    links.append((name, course_link))
                    print('added %s' % course_link)
        finally:
            if export_csv:
                f.close()
    return links
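# A hypothetical invocation of scrape_links() above, shown as a minimal
# sketch: read the cached course links from pgda_courses.csv rather than
# re-crawling the PDGA directory, and print the first few URLs. Assumes the
# csv file already exists from an earlier export_csv=True run.
if __name__ == '__main__':
    course_links = scrape_links(read_csv=True)
    for url in course_links[:5]:
        print(url)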
def export_models_to_csv():
    """Exports course model info to a csv file."""
    courses = Course.objects.all()
    f = open('pgda_courses.csv', 'wt')
    try:
        writer = ucsv.writer(f)
        for c in courses:
            writer.writerow((
                c.title, c.slug, c.description, c.directions,
                c.year_established, c.number_of_holes, c.basket_type,
                c.tee_type, c.course_length, c.alt_length,
                c.holes_under_300, c.holes_300_to_400, c.holes_more_than_400,
                c.has_restrooms, c.has_tee_signs, c.handicap_accessible,
                c.has_camping, c.on_private_land,
                c.contact_name, c.contact_number_type, c.contact_number,
                c.contact_links, c.address, c.city, c.state, c.country,
                c.postcode, c.location, c.map_link,
                c.average_rating, c.total_votes, c.pgda_link,
                c.creator, c.created, c.updated, c.flagged, c.deleted
            ))
            print '.'
    finally:
        f.close()
def task__export(request):
    """ For task queue. Not for direct access. """
    ret = {}
    eid = request.POST.get('eid')
    try:
        el = get_object_or_404(ExportLog, id=eid)
        if el.status == ExportStatus.succeed:
            ret['OK'] = 'OK'
            return _render_json_repsonse(ret)
        exp = StringIO.StringIO()
        cw = csv.writer(exp)
        # CSV header (in Chinese): title, status, remark, link
        cw.writerow([u'书名', u'状态', u'备注', u'链接'])
        for i in el.user.bookownership_set.all():
            cw.writerow([
                i.book.title,
                i.get_status_display(),
                i.remark,
                'http://sichu.sinaapp.com/cabinet/book/%d/' % i.book.id
            ])
        # send email with the CSV attached
        t = Template(EXP_SUBJECT)
        c = Context({'user': el.user.get_nickname()})
        body = t.render(c)
        message = EmailMultiAlternatives(subject=EXP_TITLE.encode('utf-8'),
                                         body=body.encode('utf-8'),
                                         from_email=settings.EMAIL_HOST_USER,
                                         to=[el.email])
        message.attach('books.csv', exp.getvalue(), 'text/csv; charset=UTF-8')
        message.send(fail_silently=False)
        exp.close()
        el.status = ExportStatus.succeed.value
        el.save()
        ret['OK'] = 'OK'
    except Http404:
        set_errors(ret, u'6601')
    except Exception:
        logger.exception(str(sys._getframe().f_code.co_name))
        set_errors(ret, u'6003')
        el.status = ExportStatus.failed.value
        el.save()
    return _render_json_repsonse(ret)
def main(argv):
    (inputfile, outputfile) = parse_input(argv)
    phrases = []
    cleaned_file = ''
    try:
        cleaned_file = sanitize_file(inputfile)
        line_number = 1
        # Here we don't need codecs.open, as we use ucsv to read the file
        with open(cleaned_file, 'rb') as csvfile:
            for row in ucsv.DictReader(csvfile):
                line_number += 1
                pinyin_phrase = row['Pronunciation']
                try:
                    annotated_pinyin = annotate_phrase(pinyin_phrase)
                except ValueError:
                    print "There's a fishy pronunciation entry on line %d." % line_number
                    continue
                sort_value = calc_sort_value(annotated_pinyin[::-1], 1, 0)
                (first_syllable, _tone) = annotated_pinyin[0]
                hanzi_phrase = row['Word']
                phrases.append((sort_value, first_syllable, hanzi_phrase, pinyin_phrase))
        if cleaned_file.endswith('sanitized.csv'):
            os.remove(cleaned_file)
    except IOError:
        if cleaned_file.endswith('sanitized.csv'):
            os.remove(cleaned_file)
        print 'Bad input file: ', inputfile
    sorted_phrases = sorted(phrases, key=itemgetter(0, 1))
    output_ready_phrases = [phrase[2:4] for phrase in sorted_phrases]
    with open(outputfile, 'wb') as f:
        writer = ucsv.writer(f)
        writer.writerow(['Word', 'Pronunciation'])
        writer.writerows(output_ready_phrases)
)
args = parser.parse_args()

# Do we have games available to scrape?
# This lets us break out of our loop
games_available = True

current_game = 0   # Specific game we are ready to scrape. Used with start_game
games_scraped = 0  # Count of how many games we have scraped so far
vgchartz_page = 1  # Which VGChartz page we are on

# Open our CSV file and write the headers
now = datetime.datetime.now()
csvfilename = "gamedata-" + now.strftime("%Y%m%d-%H_%M_%S") + ".csv"
csvfile = open(csvfilename, "wb")
gamewriter = csv.writer(csvfile)
gamewriter.writerow(['DateTime:', str(now)])
gamewriter.writerow([
    'name', 'platform', 'release year', 'genre', 'publisher',
    'north america sales', 'europe sales', 'japan sales',
    'rest of world sales', 'global sales', 'release date',
    'critic score', 'critic outof', 'critic count',
    'user score', 'user count', 'developer', 'rating'
])

start_time = time.time()
while games_available:
    # Connect to the VGChartz table. There are 1000 results per page.
    sys.stdout.write("Connecting to VGChartz Page " + str(vgchartz_page) + "...")
    vgchartz_url = "http://www.vgchartz.com/gamedb/?page=" + str(
def write_rt_ratings_to_csv(csv_output, rt_user_id):
    with open(csv_output, 'w') as f:
        writer = ucsv.writer(f)
        n = 0  # keeps the log line correct even when there are no ratings
        for n, row in enumerate(scrape_rt(rt_user_id), 1):  # start at 1 so n is a count
            writer.writerow(row)
        logging.info('Wrote {0} ratings to {1}'.format(n, csv_output))
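# A hypothetical call of write_rt_ratings_to_csv() above. The output path and
# Rotten Tomatoes user id are placeholders; scrape_rt() and ucsv must already
# be importable in this module.
import logging

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    write_rt_ratings_to_csv('rt_ratings.csv', 'example_rt_user_id')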
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')

def cosine_sim(text1, text2):
    tfidf = vectorizer.fit_transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0, 1]

# Define the web browser as PhantomJS and make it pretend to be Google Chrome.
browser = defineBrowser()

with open('Main_Study_Comments.csv', 'rb') as fr:
    reader = csv.reader(fr)
    with open('Main_Study_Comments_Abstracts.csv', 'wb') as fw:
        writer = csv.writer(fw)
        pubResultRow = [
            "Firstname", "Lastname", "Specialization", "Affiliation", "location",
            "Phase 1", "Phase 2", "Phase 3", "High View", "Citation Benefit",
            "Private Comes First", "Acknowledgement", "Likely to Cite",
            "May Include Reference", "Might Refer to",
            "Likely to Cite + Acknowledgement",
            "May Include Reference + Acknowledgement",
            "Might Refer to + Acknowledgement",
            "Especially popular", "Highly visible", "Highly popular",
            "Manual Recommendation", "Track changed", "Inappropriate Comment",
            "Interested", "Withdrawal",
            "Email 1 Opened", "Email 2 Opened", "Email 3 Opened",
            "Econ Wikiproject Clicked", "Email Communication", "User Agent",
            "Publication", "Wikipage URL", "Edit Protection Level",
            "Quality Class", "Importance Class", "Page Length", "Watchers",
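# A minimal, self-contained sketch of the TF-IDF cosine-similarity idea used
# by cosine_sim() above. The custom `normalize` tokenizer is defined elsewhere
# in that project, so scikit-learn's default tokenizer stands in for it here;
# the example texts are made up.
from sklearn.feature_extraction.text import TfidfVectorizer

def cosine_sim_demo(text1, text2):
    vec = TfidfVectorizer(stop_words='english')
    tfidf = vec.fit_transform([text1, text2])
    # TF-IDF rows are L2-normalised, so their dot product is the cosine similarity.
    return (tfidf * tfidf.T).toarray()[0, 1]

if __name__ == '__main__':
    print(cosine_sim_demo('open access increases citations',
                          'citations increase with open access'))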
def main():
    # set up the argument parser
    parser = argparse.ArgumentParser(description='Convert price to ShopOS CSV import file.')
    # input XLS price list
    parser.add_argument('arguments', metavar='ARG', type=str, nargs=1, help='input XLS file')
    # include descriptions in the output CSV
    parser.add_argument('-d', dest='descr', action='store_true', help='enable description')
    # compare the price list with export.csv
    parser.add_argument('-c', dest='compare', action='store_true', help='compare prices')
    # download export.csv from the server
    parser.add_argument('-e', dest='download', action='store_true', help='download export.csv')
    # extract brands
    parser.add_argument('-b', dest='brands', action='store_true', help='filter brands')
    # create an additional -add.csv file that includes descriptions
    # (used for the equipment price list)
    parser.add_argument('-f', dest='descfilter', action='store_true', help='filter descriptions (for tools)')
    args = parser.parse_args()

    # if -e was given, download export.csv from the site
    if args.download:
        download_export()

    # xls_input - the input XLS price list
    xls_input = args.arguments[0]
    debug_print('Reading XLS ' + xls_input)
    # load the data from the input price list into table_input
    # table_input[category] = [int(SKU), unicode(name), unicode(description), int(wholesale), int(retail), str(availability marks)]
    table_input = xlsread(xls_input)

    # name of the output CSV file
    csv_output_fname = args.arguments[0][:-3] + 'csv'
    debug_print('Creating CSV ' + csv_output_fname)
    # create the CSV file, delimiter "|"
    file_csv_output = open(csv_output_fname, 'wb')
    csv_output = csv.writer(file_csv_output, delimiter='|')

    # if equipment filtering was requested, create a second CSV
    if args.descfilter:
        csv_output_fname_add = args.arguments[0][:-4] + '-add.csv'
        file_csv_output_add = open(csv_output_fname_add, 'wb')
        debug_print('Creating CSV ' + csv_output_fname_add)
        csv_output_add = csv.writer(file_csv_output_add, delimiter='|')
        # and enable comparison mode as well
        args.compare = True

    # if comparison is needed,
    if args.compare:
        # open the previously prepared export-out.csv
        # (export data from the site)
        csv_compare = csv.reader(open('export-out.csv', 'rb'), delimiter='|')
        # table holding all the product data from the site needed for comparison
        # table_compare[int(SKU)] = [int(wholesale), int(retail), reduced-price indicator, description]
        table_compare = {}
        for i, row in enumerate(csv_compare):
            if (i != 0) and (row[11][2:-5] != '') and (int(row[8]) != 0):
                table_compare[int(row[1])] = [int(row[11][2:-5]), int(row[8]), row[15], row[29]]  # opt, rozn, ean, descr

    # declare the output table
    table_output = []

    # write the CSV table headers
    if args.descfilter:
        table_output_add = []
        table_output_add.append(['XTSOL', 'p_model', 'p_stock', 'p_sorting', 'p_shipping', 'p_tpl',
                                 'p_manufacturer', 'p_fsk18', 'p_priceNoTax', 'p_priceNoTax.1',
                                 'p_priceNoTax.2', 'p_priceNoTax.3', 'p_tax', 'p_status', 'p_weight',
                                 'p_ean', 'p_disc', 'p_opttpl', 'p_vpe', 'p_vpe_status', 'p_vpe_value',
                                 'p_name.ru', 'p_desc.ru', 'p_shortdesc.ru', 'p_meta_title.ru',
                                 'p_meta_desc.ru', 'p_meta_key.ru', 'p_url.ru',
                                 'p_cat.0', 'p_cat.1', 'p_cat.2', 'p_cat.3', 'p_cat.4', 'p_cat.5'])
    if args.descr:
        header = ['XTSOL', 'p_model', 'p_stock', 'p_sorting', 'p_shipping', 'p_tpl',
                  'p_manufacturer', 'p_fsk18', 'p_priceNoTax', 'p_priceNoTax.1',
                  'p_priceNoTax.2', 'p_priceNoTax.3', 'p_tax', 'p_status', 'p_weight',
                  'p_ean', 'p_disc', 'p_opttpl', 'p_vpe', 'p_vpe_status', 'p_vpe_value',
                  'p_name.ru', 'p_desc.ru', 'p_shortdesc.ru', 'p_meta_title.ru',
                  'p_meta_desc.ru', 'p_meta_key.ru', 'p_url.ru',
                  'p_cat.0', 'p_cat.1', 'p_cat.2', 'p_cat.3', 'p_cat.4', 'p_cat.5']
    else:
        header = ['XTSOL', 'p_model', 'p_stock', 'p_sorting', 'p_shipping', 'p_tpl',
                  'p_manufacturer', 'p_fsk18', 'p_priceNoTax', 'p_priceNoTax.1',
                  'p_priceNoTax.2', 'p_priceNoTax.3', 'p_tax', 'p_status', 'p_weight',
                  'p_ean', 'p_disc', 'p_opttpl', 'p_vpe', 'p_vpe_status', 'p_vpe_value',
                  'p_name.ru', 'p_meta_title.ru', 'p_meta_desc.ru', 'p_meta_key.ru', 'p_url.ru',
                  'p_cat.0', 'p_cat.1', 'p_cat.2', 'p_cat.3', 'p_cat.4', 'p_cat.5']
    table_output.append(header)

    debug_print('Starting parsing catalog')
    # start parsing the price list one item at a time
    # category - the price-list category the item comes from
    for category in table_input:
        # item - the item itself with all its data
        for item in table_input[category]:
            # DEFAULTS {{
            p_status = '1'  # product status - always enabled
            # the remaining defaults do not really matter
            p_sorting = '0'   # sorting
            p_shipping = '0'  # shipping
            p_tpl = p_opttpl = 'default'  # templates
            p_fsk18 = p_tax = p_vpe = p_vpe_status = '0'
            p_priceNoTax_1 = p_priceNoTax_2 = p_url = ''
            p_weight = '0.00'
            p_disc = p_vpe_value = '0.0000'
            # }} // DEFAULTS

            p_model = str(item[0])            # SKU
            p_stock = config.QTY[item[5]]     # availability
            p_name = item[1].replace('/', '/ ')  # name
            if args.brands:
                [p_manufacturer, pricebrand] = brand(p_name, p_model)  # manufacturer
            else:
                [p_manufacturer, pricebrand] = ['', '']

            # short description
            p_shortdesc = short_desc(category, item[2])
            # templated SEO description - still a work in progress
            seo_description = seo_desc(p_name, category, brand(p_name, p_model)[1])
            if seo_description == config.LEAVE_OLD_DESC:
                if args.compare:
                    try:
                        p_desc = table_compare[int(p_model)][3]
                    except:
                        debug_print_pause(table_compare[11928])
                else:
                    pass
            else:
                if p_shortdesc != '':
                    p_desc = '<p>' + p_shortdesc + '</p><br/>' + seo_description  # description with extras
                else:
                    p_desc = seo_description  # description with extras
            # p_desc = p_shortdesc
            # use this simpler construct instead:
            # p_desc = ''
            # if short_desc != '':
            #     p_desc = '<p>' + p_shortdesc + '</p>'

            # filter availability/description for equipment items
            if args.descfilter:
                # by default, give the item the description from the price list
                is_add = True
                try:
                    # but if the site already has a description, or the price list has none,
                    if (table_compare[int(p_model)][3] != '') or (p_shortdesc == ''):
                        # then do not insert anything
                        is_add = False
                except KeyError, e:
                    pass

            # wholesale price
            p_priceNoTax_3 = str(int(math.ceil(item[3])))
            # retail price
            p_priceNoTax = str(int(math.ceil(item[4])))
            # categorize
            p_cat = getcat(category, p_name)
            # meta tags
            [temp_name, p_meta_title, p_meta_desc, p_meta_key] = metatags(p_name, p_shortdesc, p_cat, p_model)
            if temp_name != '':
                p_name = temp_name
            # reduced price
            p_ean = ''
            if args.compare:
                p_ean = ean(p_model, p_priceNoTax_3, p_priceNoTax, table_compare)

            # merge all the collected data into one row
            # and dump it into the csv
            if args.descfilter:
                # this is the equipment price list that has to be split in two,
                # so split it
                if is_add:
                    table_output_add.append(['XTSOL', p_model, p_stock, p_sorting, p_shipping, p_tpl,
                                             p_manufacturer, p_fsk18, p_priceNoTax + '.0000',
                                             p_priceNoTax_1, p_priceNoTax_2,
                                             '1:' + p_priceNoTax_3 + '.0000', p_tax, p_status,
                                             p_weight, p_ean, p_disc, p_opttpl, p_vpe, p_vpe_status,
                                             p_vpe_value, p_name, p_desc, p_shortdesc, p_meta_title,
                                             p_meta_desc, p_meta_key, p_url,
                                             p_cat[0], p_cat[1], p_cat[2], p_cat[3], p_cat[4], p_cat[5]])
                else:
                    table_output.append(['XTSOL', p_model, p_stock, p_sorting, p_shipping, p_tpl,
                                         p_manufacturer, p_fsk18, p_priceNoTax + '.0000',
                                         p_priceNoTax_1, p_priceNoTax_2,
                                         '1:' + p_priceNoTax_3 + '.0000', p_tax, p_status,
                                         p_weight, p_ean, p_disc, p_opttpl, p_vpe, p_vpe_status,
                                         p_vpe_value, p_name, p_meta_title, p_meta_desc, p_meta_key,
                                         p_url, p_cat[0], p_cat[1], p_cat[2], p_cat[3], p_cat[4], p_cat[5]])
            else:
                # all other cases go here
                if args.descr:
                    row = ['XTSOL', p_model, p_stock, p_sorting, p_shipping, p_tpl,
                           p_manufacturer, p_fsk18, p_priceNoTax + '.0000',
                           p_priceNoTax_1, p_priceNoTax_2,
                           '1:' + p_priceNoTax_3 + '.0000', p_tax, p_status, p_weight, p_ean,
                           p_disc, p_opttpl, p_vpe, p_vpe_status, p_vpe_value,
                           p_name, p_desc, p_shortdesc, p_meta_title, p_meta_desc, p_meta_key,
                           p_url, p_cat[0], p_cat[1], p_cat[2], p_cat[3], p_cat[4], p_cat[5]]
                else:
                    row = ['XTSOL', p_model, p_stock, p_sorting, p_shipping, p_tpl,
                           p_manufacturer, p_fsk18, p_priceNoTax + '.0000',
                           p_priceNoTax_1, p_priceNoTax_2,
                           '1:' + p_priceNoTax_3 + '.0000', p_tax, p_status, p_weight, p_ean,
                           p_disc, p_opttpl, p_vpe, p_vpe_status, p_vpe_value,
                           p_name, p_meta_title, p_meta_desc, p_meta_key,
                           p_url, p_cat[0], p_cat[1], p_cat[2], p_cat[3], p_cat[4], p_cat[5]]
                table_output.append(row)
parser.add_argument('-w', '--wait', type=int, default=0,
                    help='Number of seconds to wait before each request to MetaCritic (0 to disable).')
args = parser.parse_args()

# Do we have games available to scrape?
# This lets us break out of our loop
games_available = True

current_game = 0   # Specific game we are ready to scrape. Used with start_game
games_scraped = 0  # Count of how many games we have scraped so far
vgchartz_page = 1  # Which VGChartz page we are on

# Open our CSV file and write the headers
now = datetime.datetime.now()
csvfilename = "gamedata-" + now.strftime("%Y%m%d-%H_%M_%S") + ".csv"
csvfile = open(csvfilename, "wb")
gamewriter = csv.writer(csvfile)
gamewriter.writerow(['DateTime:', str(now)])
gamewriter.writerow(['name', 'platform', 'release year', 'genre', 'publisher',
                     'north america sales', 'europe sales', 'japan sales',
                     'rest of world sales', 'global sales', 'release date',
                     'critic score', 'critic outof', 'critic count',
                     'user score', 'user count', 'developer', 'rating'])

start_time = time.time()
while games_available:
    # Connect to the VGChartz table. There are 1000 results per page.
    sys.stdout.write("Connecting to VGChartz Page " + str(vgchartz_page) + "...")
    vgchartz_url = ("http://www.vgchartz.com/gamedb/?page=" + str(vgchartz_page) +
                    "&results=1000&name=&platform=&minSales=0&publisher=&genre=&sort=GL")
    # vgchartz_url = "file:vgchartz.htm"  # DEBUG: pull VGChartz data from the filesystem. Comment out for production.
    vgchartz_conn = urllib2.urlopen(vgchartz_url)
    vgchartz_html = vgchartz_conn.read()
    sys.stdout.write("connected.\n")

    vgsoup = BeautifulSoup(vgchartz_html)
    rows = vgsoup.find("table", class_="chart").find_all("tr")
if cmd_folder not in sys.path:
    sys.path.insert(0, cmd_folder)

from openpyxl import load_workbook

parser = argparse.ArgumentParser()
parser.add_argument("input_file")
args = parser.parse_args()

def xstr(s):
    if s is None:
        return ''
    return s

# workbook = load_workbook(args.input_file, use_iterators = True)
# sheet = workbook.get_active_sheet()
#
# with open((args.input_file + '.csv'), 'wb') as csvfile:
#     csvwriter = csv.writer(csvfile, delimiter=args.delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL)
#
#     for i, row in enumerate(sheet.iter_rows()):
#         row = [ c.internal_value for c in row ]
#         csvwriter.writerow(row)

wb = load_workbook(args.input_file)
sh = wb.get_active_sheet()

with open((args.input_file + '.csv'), 'wb') as f:
    c = csv.writer(f, delimiter="\t")
    for r in sh.rows:
        c.writerow([xstr(cell.value) for cell in r][:50])
sys.path.insert(0, cmd_folder)

from openpyxl import load_workbook

parser = argparse.ArgumentParser()
parser.add_argument("input_file")
args = parser.parse_args()

def xstr(s):
    if s is None:
        return ''
    return s

# workbook = load_workbook(args.input_file, use_iterators = True)
# sheet = workbook.get_active_sheet()
#
# with open((args.input_file + '.csv'), 'wb') as csvfile:
#     csvwriter = csv.writer(csvfile, delimiter=args.delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL)
#
#     for i, row in enumerate(sheet.iter_rows()):
#         row = [ c.internal_value for c in row ]
#         csvwriter.writerow(row)

wb = load_workbook(args.input_file)
sh = wb.get_active_sheet()

with open((args.input_file + '.csv'), 'wb') as f:
    c = csv.writer(f, delimiter="\t")
    for r in sh.rows:
        # c.writerow([xstr(cell.value) for cell in r][:50])
        c.writerow([xstr(cell.value) for cell in r])
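# A sketch of the same XLSX-to-CSV conversion written against a modern
# openpyxl and Python 3, assuming those versions are available: read_only=True
# replaces the old use_iterators flag, wb.active replaces get_active_sheet(),
# and iter_rows(values_only=True) yields cell values directly. The function
# name and tab delimiter mirror the script above.
import csv
from openpyxl import load_workbook

def xlsx_to_csv(input_file, delimiter="\t"):
    wb = load_workbook(input_file, read_only=True)
    ws = wb.active
    with open(input_file + '.csv', 'w', newline='') as f:
        writer = csv.writer(f, delimiter=delimiter)
        for row in ws.iter_rows(values_only=True):
            # Replace None cells with empty strings, as xstr() does above.
            writer.writerow(['' if value is None else value for value in row])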
print('usage: csql <query>')
print('example: csql "select first,last,phone from phones.csv"')
exit(1)

query = sys.argv.pop(0)

db = Database()

eof = 'Ctrl+D'
if os.name == 'nt':
    eof = 'Ctrl+Z'

words = re.findall(r'(?i)[a-z0-9.]+\.csv|stdin', query)
for word in words:
    if word.lower().endswith('.csv'):
        db.import_table(word, open(word))
        query = query.replace(word, db.get_table_name(word))
        continue
    if word.lower() == 'stdin':
        if 'stdin' not in db.tables:
            if sys.stdin.isatty():
                msg = ('Importing data from STDIN; '
                       '%s twice to conclude.') % eof
                print >>sys.stderr, msg
            db.import_table('stdin', sys.stdin)

results = csv.writer(sys.stdout, dialect='excel')
cursor = db.execute_query(query)
results.writerow([field[0] for field in cursor.description])
for row in cursor:
    results.writerow(row)
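# A minimal, self-contained sketch of the idea behind the csql fragment above:
# load a CSV file into an in-memory SQLite table and run SQL over it. The
# Database class used above is not shown in this collection, so sqlite3 + csv
# stand in for it; the table name 't' and the example query are hypothetical.
import csv
import sqlite3

def query_csv(path, sql, table='t'):
    conn = sqlite3.connect(':memory:')
    with open(path, 'r') as f:  # use 'rb' on Python 2, as the snippets above do
        reader = csv.reader(f)
        header = next(reader)
        cols = ', '.join('"%s"' % c for c in header)
        conn.execute('CREATE TABLE %s (%s)' % (table, cols))
        marks = ', '.join('?' * len(header))
        conn.executemany('INSERT INTO %s VALUES (%s)' % (table, marks), reader)
    return conn.execute(sql).fetchall()

# e.g. query_csv('phones.csv', 'select first, last, phone from t')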
return True, pubResultRow

viewedRecommendations = {}

print "Enter the name of the authors and publications dataset csv file without any suffix:"
datatsetFileName = raw_input()
if datatsetFileName == "":
    datatsetFileName = 'Ideas_Repec_Dataset_Pilot3_Clean'

with open(datatsetFileName + '.csv', 'rb') as fr:
    reader = csv.reader(fr)
    with open(datatsetFileName + '_Recommendations.csv', 'wb') as fw:
        writer = csv.writer(fw)
        pubResultRow = [
            'email',
            'publication1', 'Wikipage1', 'WikipageURL1',
            'publication2', 'Wikipage2', 'WikipageURL2',
            'publication3', 'Wikipage3', 'WikipageURL3',
            'publication4', 'Wikipage4', 'WikipageURL4',
            'publication5', 'Wikipage5', 'WikipageURL5',
            'publication6', 'Wikipage6', 'WikipageURL6',
            'publication7', 'Wikipage7', 'WikipageURL7'
        ]
        writer.writerow(pubResultRow)
        with open(datatsetFileName + '_Recommendations_Stats.csv', 'wb') as fw:
            writer_Stats = csv.writer(fw)
            wikipageResultRow = [
def econPapersCategoriesExtractor(ideasURL, startSpecialization, endSpecialization):
    with open('Ideas_Repec_Dataset.csv', 'wb') as fw:
        writer = csv.writer(fw)
        with open('Ideas_Repec_Affiliations.csv', 'wb') as fwAffiliations:
            writerAffiliations = csv.writer(fwAffiliations)
            resultRow = [
                'firstName', 'lastName', 'email', 'specialization',
                'EconPapers Profile', 'affiliation', 'location', 'homepage',
                'publication1', 'publicationYear1', 'citation1', 'firstKeyword1',
                'publication2', 'publicationYear2', 'citation2', 'firstKeyword2',
                'publication3', 'publicationYear3', 'citation3', 'firstKeyword3',
                'publication4', 'publicationYear4', 'citation4', 'firstKeyword4',
                'publication5', 'publicationYear5', 'citation5', 'firstKeyword5',
                'publication6', 'publicationYear6', 'citation6', 'firstKeyword6',
                'publication7', 'publicationYear7', 'citation7', 'firstKeyword7'
            ]
            writer.writerow(resultRow)
            resultRowAffiliations = [
                'firstName', 'lastName', 'email',
                'affiliation1', 'location1', 'affiliation2', 'location2',
                'affiliation3', 'location3', 'affiliation4', 'location4',
                'affiliation5', 'location5', 'affiliation6', 'location6',
                'affiliation7', 'location7'
            ]
            writerAffiliations.writerow(resultRowAffiliations)

            ideasSoup = soupStructure(ideasURL)
            # If the content of the page is returned in BeautifulSoup structure:
            if ideasSoup != '':
                # Find the main list.
                mainList = ideasSoup.body.find(
                    text=re.compile('.*Accounting & Auditing.*')).parent.parent
                # If there is a main list:
                if mainList != None:
                    # Set if the startSpecialization is observed.
                    startSpecializationObserved = False
                    # Find all li tags inside the main list.
                    liTags = mainList.findAll('li')
                    for i in range(len(liTags)):
                        # Find the hyperlink tag inside the list item.
                        aTag = liTags[i].find('a')
                        specialization = aTag.nextSibling[1:]
                        print str(specialization)
                        if specialization == startSpecialization:
                            startSpecializationObserved = True
                        if specialization != "All new papers" and specialization != "German Papers" and startSpecializationObserved:
                            econPapersAuthorListExtractor(
                                writer, writerAffiliations, specialization,
                                'https://ideas.repec.org' + aTag['href'])
                        # if foundTheLastSubject:
                        #     return
                        if specialization == endSpecialization:
                            return