Example #1
def scrape_ri():
    #scrape reports for rhode island
    numbers = range(1,9)
    letters = string.ascii_uppercase  # A-Z; string.letters.upper() would list every letter twice
    numbers.extend(letters)

    f = open('ri.csv', 'wt')
    writer = ucsv.writer(f)
    for n in numbers:
        print 'starting %s' % n
        r = requests.get(
            'http://food.ri.digitalhealthdepartment.com/search.cfm?facletter=%s&search=active&maxrows=1000' % n
        )
        soup = BeautifulSoup(r.text)

        links = []

        table = soup.find('table', {'class':'mainsearchclassmall'})
        id_list = [x.attrs[0][1].split('&id=')[1] for x in table('a', {'class':None})]
        for id in id_list:
            link = 'http://food.ri.digitalhealthdepartment.com/scores.cfm?id=%s' % id
            links.append(link)
            writer.writerow((link,))
            print link
        print 'ending %s, waiting 5sec' % n
        time.sleep(5)
    f.close()
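
The function above relies on module-level imports that the excerpt omits; a minimal sketch of them (the attrs[0][1] indexing suggests the old BeautifulSoup 3 API), plus a call:

import string
import time

import requests
import ucsv
from BeautifulSoup import BeautifulSoup

scrape_ri()  # writes one report link per row to ri.csv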
Example #2
def scrape_links(read_csv=False, export_csv=False, update=True):
    """
    gets all course links from the PDGA site

    :param read_csv: read the links from an existing pgda_courses.csv instead of scraping
    :param export_csv: whether or not to export a csv file: pgda_courses.csv
    :param update: currently unused

    :returns: links, list of urls for courses
    """
    links = []

    if read_csv:
        #fetch the links from the csv file
        f = open('pgda_courses.csv', 'r')
        reader = ucsv.reader(f)
        for row in reader:
            links.append(row[3])
        f.close()
    else:
        #fetch the links from the site
        start_r = requests.get('http://www.pdga.com/course_directory/country/us')
        start_soup = BeautifulSoup(start_r.text)

        last_link = start_soup.find('a', {'class':'pager-last active'}).attrs[0][1]
        last_page = int(re.search(r'page=(\d+)', last_link).group(1))

        if export_csv:
            f = open('pgda_courses.csv', 'wt')

        try:
            if export_csv:
                writer = ucsv.writer(f)

            for page in range(0, last_page+1):
                DIRECTORY_URL = 'http://www.pdga.com/course_directory/country/us'
                if page == 0:
                    link = DIRECTORY_URL
                else:
                    link = '%s?page=%s' % (DIRECTORY_URL, page)  # DIRECTORY_URL has no query string yet

                r = requests.get(link)
                soup = BeautifulSoup(r.text)
                for row in soup('table')[0].tbody('tr'):
                    tds = row.findAll('td')

                    course_link = "%s%s" % (BASE_URL, tds[0].a.attrs[0][1].split('?s=')[0])
                    name = tds[0].text
                    city = tds[1].text
                    province = tds[2].text

                    if export_csv:
                        writer.writerow( (name, city, province, course_link) )
                    links.append((name, course_link))

                    print('added %s' % course_link)
        finally:
            if export_csv:
                f.close()
    return links
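
scrape_links additionally assumes a module-level BASE_URL and similar imports, none of which appear in the excerpt; a hedged sketch of that setup and a typical call:

import re

import requests
import ucsv
from BeautifulSoup import BeautifulSoup

BASE_URL = 'http://www.pdga.com'  # assumption: course paths from the table are joined onto the site root

links = scrape_links(export_csv=True)  # scrape every directory page and also write pgda_courses.csv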
Example #3
def export_models_to_csv():
    """exports course model info to a csv file"""
    courses = Course.objects.all()

    f = open('pgda_courses.csv', 'wt')

    try:
        writer = ucsv.writer(f)
        for c in courses:
            writer.writerow(
                (
                    c.title,
                    c.slug,
                    c.description,
                    c.directions,
                    c.year_established,
                    c.number_of_holes,
                    c.basket_type,
                    c.tee_type,
                    c.course_length,
                    c.alt_length,
                    c.holes_under_300,
                    c.holes_300_to_400,
                    c.holes_more_than_400,
                    c.has_restrooms,
                    c.has_tee_signs,
                    c.handicap_accessible,
                    c.has_camping,
                    c.on_private_land,
                    c.contact_name,
                    c.contact_number_type,
                    c.contact_number,
                    c.contact_links,
                    c.address,
                    c.city,
                    c.state,
                    c.country,
                    c.postcode,
                    c.location,
                    c.map_link,
                    c.average_rating,
                    c.total_votes,
                    c.pgda_link,
                    c.creator,
                    c.created,
                    c.updated,
                    c.flagged,
                    c.deleted
                )
            )
            print '.'
    finally:
        f.close()
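
One way to run this export is from a Django shell; a sketch (the import path is hypothetical):

# python manage.py shell
from courses.utils import export_models_to_csv  # hypothetical module path
export_models_to_csv()  # writes pgda_courses.csv in the current working directory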
Example #4
def task__export(request):
    """
    For task queue. Not for direct access.
    """
    ret = {}
    eid = request.POST.get('eid')
    try:
        el = get_object_or_404(ExportLog, id=eid)
        if el.status == ExportStatus.succeed:
            ret['OK'] = 'OK'
            return _render_json_repsonse(ret)

        exp = StringIO.StringIO()
        cw = csv.writer(exp)
        cw.writerow([u'书名', u'状态', u'备注', u'链接'])  # header: book title, status, remark, link
        for i in el.user.bookownership_set.all():
            cw.writerow([
                i.book.title,
                i.get_status_display(), i.remark,
                'http://sichu.sinaapp.com/cabinet/book/%d/' % i.book.id
            ])
        # send email
        t = Template(EXP_SUBJECT)
        c = Context({'user': el.user.get_nickname()})
        body = t.render(c)
        message = EmailMultiAlternatives(subject=EXP_TITLE.encode('utf-8'),
                                         body=body.encode('utf-8'),
                                         from_email=settings.EMAIL_HOST_USER,
                                         to=[el.email])
        message.attach('books.csv', exp.getvalue(), 'text/csv; charset=UTF-8')
        message.send(fail_silently=False)
        exp.close()
        el.status = ExportStatus.succeed.value
        el.save()
        ret['OK'] = 'OK'
    except Http404:
        set_errors(ret, u'6601')
    except Exception:
        logger.exception(str(sys._getframe().f_code.co_name))
        set_errors(ret, u'6003')
        el.status = ExportStatus.failed.value
        el.save()

    return _render_json_repsonse(ret)
Example #5
def main(argv):
    (inputfile, outputfile) = parse_input(argv)

    phrases = []
    cleaned_file = ''
    try:
        cleaned_file = sanitize_file(inputfile)
        line_number = 1
        # Here we don't need the codecs.open as we use ucsv to read the file
        with open(cleaned_file, 'rb') as csvfile:
            for row in ucsv.DictReader(csvfile):
                line_number += 1
                pinyin_phrase = row['Pronunciation']
                try:
                    annotated_pinyin = annotate_phrase(pinyin_phrase)
                except ValueError:
                    print "There's a fishy pronunciation entry on line %d." % line_number
                    continue
                sort_value = calc_sort_value(annotated_pinyin[::-1], 1, 0)
                (first_syllable, _tone) = annotated_pinyin[0]

                hanzi_phrase = row['Word']
                phrases.append((sort_value, first_syllable,
                                hanzi_phrase, pinyin_phrase))
        if cleaned_file.endswith('sanitized.csv'):
            os.remove(cleaned_file)
    except IOError:
        if cleaned_file.endswith('sanitized.csv'):
            os.remove(cleaned_file)
        print 'Bad input file: ', inputfile 

    sorted_phrases = sorted(phrases, key = itemgetter(0, 1))

    output_ready_phrases = [phrase[2:4] for phrase in sorted_phrases]
    
    with open(outputfile, 'wb') as f:
        writer = ucsv.writer(f)
        writer.writerow(['Word', 'Pronunciation'])
        writer.writerows(output_ready_phrases)
Example #6
)
args = parser.parse_args()

# Do we have games available to scrape?
# This lets us break out of our loop
games_available = True

current_game = 0  # Specific game we are ready to scrape. Used with start_game
games_scraped = 0  # Count of how many games we have scraped so far
vgchartz_page = 1  # Which VGChartz Page are we on

# Open our CSV file and write the headers
now = datetime.datetime.now()
csvfilename = "gamedata-" + now.strftime("%Y%m%d-%H_%M_%S") + ".csv"
csvfile = open(csvfilename, "wb")
gamewriter = csv.writer(csvfile)
gamewriter.writerow(['DateTime:', str(now)])
gamewriter.writerow([
    'name', 'platform', 'release year', 'genre', 'publisher',
    'north america sales', 'europe sales', 'japan sales',
    'rest of world sales', 'global sales', 'release date', 'critic score',
    'critic outof', 'critic count', 'user score', 'user count', 'developer',
    'rating'
])

start_time = time.time()
while games_available:
    # Connect to the VGChartz table. There are 1000 results per page.
    sys.stdout.write("Connecting to VGChartz Page " + str(vgchartz_page) +
                     "...")
    vgchartz_url = "http://www.vgchartz.com/gamedb/?page=" + str(
Example #7
def write_rt_ratings_to_csv(csv_output, rt_user_id):
    count = 0
    with open(csv_output, 'w') as f:
        writer = ucsv.writer(f)
        for count, row in enumerate(scrape_rt(rt_user_id), 1):
            writer.writerow(row)
    logging.info('Wrote {0} ratings to {1}'.format(count, csv_output))
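
A possible invocation of the helper above (the Rotten Tomatoes user id is a placeholder; scrape_rt and ucsv come from elsewhere in the module):

import logging

logging.basicConfig(level=logging.INFO)
write_rt_ratings_to_csv('rt_ratings.csv', 'some_rt_user_id')  # placeholder output file and user id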
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')


def cosine_sim(text1, text2):
    tfidf = vectorizer.fit_transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0, 1]
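
A quick check of the similarity helper above (assumes the normalize tokenizer passed to TfidfVectorizer is defined earlier in the module):

score = cosine_sim('export game sales to csv', 'exporting game sales to a csv file')
print 'similarity: %.3f' % score  # cosine of the two tf-idf vectors, between 0 and 1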


# Define the web browser as PhantomJS and make it pretend to be Google Chrome.
browser = defineBrowser()

with open('Main_Study_Comments.csv', 'rb') as fr:
    reader = csv.reader(fr)

    with open('Main_Study_Comments_Abstracts.csv', 'wb') as fw:
        writer = csv.writer(fw)

        pubResultRow = [
            "Firstname", "Lastname", "Specialization", "Affiliation",
            "location", "Phase 1", "Phase 2", "Phase 3", "High View",
            "Citation Benefit", "Private Comes First", "Acknowledgement",
            "Likely to Cite", "May Include Reference", "Might Refer to",
            "Likely to Cite + Acknowledgement",
            "May Include Reference + Acknowledgement",
            "Might Refer to + Acknowledgement", "Especially popular",
            "Highly visible", "Highly popular", "Manual Recommendation",
            "Track changed", "Inappropriate Comment", "Interested",
            "Withdrawal", "Email 1 Opened", "Email 2 Opened", "Email 3 Opened",
            "Econ Wikiproject Clicked", "Email Communication", "User Agent",
            "Publication", "Wikipage URL", "Edit Protection Level",
            "Quality Class", "Importance Class", "Page Length", "Watchers",
Example #9
def main():
    # set up the argument parser
    parser = argparse.ArgumentParser(description='Convert price to ShopOS CSV import file.')

    # input XLS file (the price list)
    parser.add_argument('arguments', metavar='ARG', type=str, nargs=1, help='input XLS file')

    # include descriptions in the output CSV
    parser.add_argument('-d', dest='descr', action='store_true', help='enable description')

    # compare the price list with export.csv
    parser.add_argument('-c', dest='compare', action='store_true', help='compare prices')

    # download export.csv from the server
    parser.add_argument('-e', dest='download', action='store_true', help='download export.csv')

    # extract brands
    parser.add_argument('-b', dest='brands', action='store_true', help='filter brands')

    # create an additional -add.csv file that includes descriptions
    # (used for the equipment price list)
    parser.add_argument('-f', dest='descfilter', action='store_true', help='filter descriptions (for tools)')
    args = parser.parse_args()

    # if -e was given
    if args.download:
        # download export.csv from the site
        download_export()

    # xls_input - the input XLS price list
    xls_input = args.arguments[0]

    debug_print('Reading XLS ' + xls_input)
    
    # load the data from the input price list into table_input
    # table_input[category] = [int(SKU), unicode(name), unicode(description), int(wholesale), int(retail), str(availability marks)]
    table_input = xlsread(xls_input)

    # name of the output CSV file
    csv_output_fname = args.arguments[0][:-3] + 'csv'

    debug_print('Creating CSV ' + csv_output_fname)

    # create the CSV file with "|" as the delimiter
    file_csv_output = open(csv_output_fname, 'wb')
    csv_output = csv.writer(file_csv_output, delimiter='|')

    # if equipment filtering was requested, create a second CSV
    if args.descfilter:
        csv_output_fname_add = args.arguments[0][:-4] + '-add.csv'
        file_csv_output_add = open(csv_output_fname_add, 'wb')

        debug_print('Creating CSV ' + csv_output_fname_add)
        csv_output_add = csv.writer(file_csv_output_add, delimiter='|')

        # and force comparison mode as well
        args.compare = True

    # if comparison is actually needed,
    if args.compare:
        # open the previously prepared export-out.csv
        # (the export data from the site)
        csv_compare = csv.reader(open('export-out.csv', 'rb'), delimiter='|')

        # table holding all the product data from the site needed for comparison
        # table_compare[int(SKU)] = [int(wholesale), int(retail), reduced-price marker, description]
        table_compare = {}
        for i,row in enumerate(csv_compare):
            if (i != 0) and (row[11][2:-5] != '') and (int(row[8]) != 0):
                table_compare[int(row[1])] = [int(row[11][2:-5]), int(row[8]), row[15], row[29]]
                                            # wholesale, retail, ean, description

    # declare the output data table
    table_output = []

    # write the CSV table headers
    if args.descfilter:
        table_output_add = []
        table_output_add.append(['XTSOL', 'p_model', 'p_stock', 'p_sorting', 'p_shipping', 'p_tpl',
                  'p_manufacturer', 'p_fsk18', 'p_priceNoTax', 'p_priceNoTax.1',
                  'p_priceNoTax.2', 'p_priceNoTax.3', 'p_tax', 'p_status', 'p_weight',
                  'p_ean', 'p_disc', 'p_opttpl', 'p_vpe', 'p_vpe_status', 'p_vpe_value',
                  'p_name.ru', 'p_desc.ru', 'p_shortdesc.ru', 'p_meta_title.ru', 'p_meta_desc.ru',
                  'p_meta_key.ru', 'p_url.ru', 'p_cat.0', 'p_cat.1', 'p_cat.2', 'p_cat.3',
                  'p_cat.4', 'p_cat.5'])

    if args.descr:
        header = ['XTSOL', 'p_model', 'p_stock', 'p_sorting', 'p_shipping', 'p_tpl',
                  'p_manufacturer', 'p_fsk18', 'p_priceNoTax', 'p_priceNoTax.1',
                  'p_priceNoTax.2', 'p_priceNoTax.3', 'p_tax', 'p_status', 'p_weight',
                  'p_ean', 'p_disc', 'p_opttpl', 'p_vpe', 'p_vpe_status', 'p_vpe_value',
                  'p_name.ru', 'p_desc.ru', 'p_shortdesc.ru', 'p_meta_title.ru', 'p_meta_desc.ru',
                  'p_meta_key.ru', 'p_url.ru', 'p_cat.0', 'p_cat.1', 'p_cat.2', 'p_cat.3',
                  'p_cat.4', 'p_cat.5']
    else:
        header = ['XTSOL', 'p_model', 'p_stock', 'p_sorting', 'p_shipping', 'p_tpl',
                  'p_manufacturer', 'p_fsk18', 'p_priceNoTax', 'p_priceNoTax.1', 'p_priceNoTax.2',
                  'p_priceNoTax.3', 'p_tax', 'p_status', 'p_weight', 'p_ean', 'p_disc', 'p_opttpl', 'p_vpe',
                  'p_vpe_status', 'p_vpe_value', 'p_name.ru', 'p_meta_title.ru', 'p_meta_desc.ru', 'p_meta_key.ru',
                  'p_url.ru', 'p_cat.0', 'p_cat.1', 'p_cat.2', 'p_cat.3', 'p_cat.4', 'p_cat.5']
    table_output.append(header)

    debug_print('Starting parsing catalog')

    # start parsing the price list one item at a time
    # category - the price-list category the item is taken from
    for category in table_input:
        # item - the item itself with all of its data
        for item in table_input[category]:
            # DEFAULTS {{
            p_status = '1' # product status - always enabled

            # the remaining defaults do not matter
            p_sorting = '0' # sort order
            p_shipping = '0' # shipping
            p_tpl = p_opttpl = 'default' # templates
            p_fsk18 = p_tax = p_vpe = p_vpe_status = '0'
            p_priceNoTax_1 = p_priceNoTax_2 = p_url = ''
            p_weight = '0.00'
            p_disc = p_vpe_value = '0.0000'
            # }} // DEFAULTS

            p_model = str(item[0]) # SKU
            p_stock = config.QTY[item[5]] # availability
            p_name = item[1].replace('/', '/ ') # product name

            if args.brands:
                [p_manufacturer, pricebrand] = brand(p_name, p_model) # manufacturer
            else:
                [p_manufacturer, pricebrand] = ['', '']

            # short description
            p_shortdesc = short_desc(category, item[2])
            # templated SEO description - still a work in progress
            
            seo_description = seo_desc(p_name, category, brand(p_name, p_model)[1])

            if seo_description == config.LEAVE_OLD_DESC:
                if args.compare:
                    try:
                        p_desc = table_compare[int(p_model)][3]
                    except:
                        debug_print_pause(table_compare[11928])

                else:
                    pass
            else:
                if p_shortdesc != '':
                    p_desc = '<p>' + p_shortdesc + '</p><br/>' + seo_description # description with extras
                else:
                    p_desc = seo_description # description with extras
                #p_desc = p_shortdesc
        

            # instead, use this simple construct
            #p_desc = ''
            #if short_desc != '':
            #    p_desc = '<p>' + p_shortdesc + '</p>'

            # filter availability/description for equipment items
            if args.descfilter:
                # by default, add the description from the price list to the item
                is_add = True

                try:
                    # if the site already has a description, or the price list has none,
                    if (table_compare[int(p_model)][3] != '') or (p_shortdesc == ''):
                        # then do not insert anything
                        is_add = False
                except KeyError, e:
                    pass

            # wholesale price
            p_priceNoTax_3 = str(int(math.ceil(item[3])))
            # retail price
            p_priceNoTax = str(int(math.ceil(item[4])))

            # categorize it
            p_cat = getcat(category, p_name)

            # meta tags
            [temp_name, p_meta_title, p_meta_desc, p_meta_key] = metatags(p_name, p_shortdesc, p_cat, p_model)
            if temp_name != '':
                p_name = temp_name

            # reduced price
            p_ean = ''
            if args.compare:
                p_ean = ean(p_model, p_priceNoTax_3, p_priceNoTax, table_compare)

            # merge all the gathered data into a single row
            # and dump it to the csv
            if args.descfilter:
                # if this is an equipment price list that has to be split into two,
                # then split it
                if is_add:
                    table_output_add.append(['XTSOL', p_model, p_stock, p_sorting, p_shipping, p_tpl, p_manufacturer, p_fsk18, p_priceNoTax + '.0000', p_priceNoTax_1, p_priceNoTax_2, '1:' + p_priceNoTax_3 + '.0000', p_tax, p_status, p_weight, p_ean, p_disc, p_opttpl, p_vpe, p_vpe_status, p_vpe_value, p_name, p_desc, p_shortdesc, p_meta_title, p_meta_desc, p_meta_key, p_url, p_cat[0], p_cat[1], p_cat[2], p_cat[3], p_cat[4], p_cat[5]])
                else:
                    table_output.append(['XTSOL', p_model, p_stock, p_sorting, p_shipping, p_tpl, p_manufacturer, p_fsk18, p_priceNoTax + '.0000', p_priceNoTax_1, p_priceNoTax_2, '1:' + p_priceNoTax_3 + '.0000', p_tax, p_status, p_weight, p_ean, p_disc, p_opttpl, p_vpe, p_vpe_status, p_vpe_value, p_name, p_meta_title, p_meta_desc, p_meta_key, p_url, p_cat[0], p_cat[1], p_cat[2], p_cat[3], p_cat[4], p_cat[5]])
            else:
                # all other cases
                if args.descr:
                    row = ['XTSOL', p_model, p_stock, p_sorting, p_shipping, p_tpl, p_manufacturer, p_fsk18, p_priceNoTax + '.0000', p_priceNoTax_1, p_priceNoTax_2, '1:' + p_priceNoTax_3 + '.0000', p_tax, p_status, p_weight, p_ean, p_disc, p_opttpl, p_vpe, p_vpe_status, p_vpe_value, p_name, p_desc, p_shortdesc, p_meta_title, p_meta_desc, p_meta_key, p_url, p_cat[0], p_cat[1], p_cat[2], p_cat[3], p_cat[4], p_cat[5]]
                else:
                    row = ['XTSOL', p_model, p_stock, p_sorting, p_shipping, p_tpl, p_manufacturer, p_fsk18, p_priceNoTax + '.0000', p_priceNoTax_1, p_priceNoTax_2, '1:' + p_priceNoTax_3 + '.0000', p_tax, p_status, p_weight, p_ean, p_disc, p_opttpl, p_vpe, p_vpe_status, p_vpe_value, p_name, p_meta_title, p_meta_desc, p_meta_key, p_url, p_cat[0], p_cat[1], p_cat[2], p_cat[3], p_cat[4], p_cat[5]]

                table_output.append(row)
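
The excerpt stops before the collected rows are written out; a hedged sketch of the likely final step, reusing the writers created earlier in main() (this continuation is an assumption, not part of the original):

    # assumption: flush the accumulated rows and close the output files
    csv_output.writerows(table_output)
    file_csv_output.close()
    if args.descfilter:
        csv_output_add.writerows(table_output_add)
        file_csv_output_add.close()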
Example #10
parser.add_argument('-w', '--wait', type=int, default=0, help='Number of seconds to wait before each request to MetaCritc (0 to disable).')
args = parser.parse_args()

# Do we have games available to scrape?
# This lets us break out of our loop
games_available = True 

current_game = 0  # Specific game we are ready to scrape. Used with start_game
games_scraped = 0 # Count of how many games we have scraped so far
vgchartz_page = 1 # Which VGChartz Page are we on

# Open our CSV file and write the headers
now = datetime.datetime.now()
csvfilename = "gamedata-" + now.strftime("%Y%m%d-%H_%M_%S") + ".csv"
csvfile = open(csvfilename, "wb")
gamewriter = csv.writer(csvfile)
gamewriter.writerow(['DateTime:', str(now)])
gamewriter.writerow(['name', 'platform', 'release year', 'genre', 'publisher', 'north america sales', 'europe sales', 'japan sales', 'rest of world sales', 'global sales', 'release date', 'critic score', 'critic outof', 'critic count', 'user score', 'user count', 'developer', 'rating'])

start_time = time.time()
while games_available:
	# Connect to the VGChartz table. There are 1000 results per page.
	sys.stdout.write("Connecting to VGChartz Page " + str(vgchartz_page) + "...")
	vgchartz_url = "http://www.vgchartz.com/gamedb/?page=" + str(vgchartz_page) + "&results=1000&name=&platform=&minSales=0&publisher=&genre=&sort=GL"
	#vgchartz_url = "file:vgchartz.htm"	# This is a DEBUG line - pulling vgchartz data from filesystem. Comment it out for production.
	vgchartz_conn = urllib2.urlopen(vgchartz_url)
	vgchartz_html = vgchartz_conn.read()
	sys.stdout.write("connected.\n")
	
	vgsoup = BeautifulSoup(vgchartz_html)
	rows = vgsoup.find("table", class_="chart").find_all("tr")
Example #11
if cmd_folder not in sys.path:
    sys.path.insert(0, cmd_folder)

from openpyxl import load_workbook

parser = argparse.ArgumentParser()
parser.add_argument("input_file")
args = parser.parse_args()

def xstr(s):
    if s is None:
        return ''
    return s

# workbook = load_workbook(args.input_file, use_iterators = True)
# sheet = workbook.get_active_sheet()
# 
# with open((args.input_file + '.csv'), 'wb') as csvfile:
#   csvwriter = csv.writer(csvfile, delimiter=args.delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL)
# 
# for i, row in enumerate(sheet.iter_rows()):
#   row = [ c.internal_value for c in row ]
#   csvwriter.writerow(row)

wb = load_workbook(args.input_file)
sh = wb.get_active_sheet()
with open((args.input_file + '.csv'), 'wb') as f:
    c = csv.writer(f, delimiter="\t")  # tab-delimited output despite the .csv extension
    for r in sh.rows:
        c.writerow([xstr(cell.value) for cell in r][:50])  # keep only the first 50 columns
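
Note that get_active_sheet() is the older openpyxl spelling; newer openpyxl releases expose the same worksheet through the active property:

sh = wb.active  # equivalent to wb.get_active_sheet() on recent openpyxl versions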
Example #12
    sys.path.insert(0, cmd_folder)

from openpyxl import load_workbook

parser = argparse.ArgumentParser()
parser.add_argument("input_file")
args = parser.parse_args()

def xstr(s):
    if s is None:
        return ''
    return s

# workbook = load_workbook(args.input_file, use_iterators = True)
# sheet = workbook.get_active_sheet()
# 
# with open((args.input_file + '.csv'), 'wb') as csvfile:
#   csvwriter = csv.writer(csvfile, delimiter=args.delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL)
# 
# for i, row in enumerate(sheet.iter_rows()):
#   row = [ c.internal_value for c in row ]
#   csvwriter.writerow(row)

wb = load_workbook(args.input_file)
sh = wb.get_active_sheet()
with open((args.input_file + '.csv'), 'wb') as f:
    c = csv.writer(f, delimiter="\t")  # tab-delimited output despite the .csv extension
    for r in sh.rows:
        # c.writerow([xstr(cell.value) for cell in r][:50])
        c.writerow([xstr(cell.value) for cell in r])  # write every column (no 50-column cap)
Example #13
        print('usage:     csql   <query>')
        print('example:   csql   "select first,last,phone from phones.csv"')
        exit(1)

    query = sys.argv.pop(0)

    db = Database()
    eof = 'Ctrl+D'
    if os.name == 'nt':
        eof = 'Ctrl+Z'

    words = re.findall(r'(?i)[a-z0-9.]+\.csv|stdin', query)
    for word in words:
        if word.lower().endswith('.csv'):
            db.import_table(word, open(word))
            query = query.replace(word, db.get_table_name(word))
            continue
        if word.lower() == 'stdin':
            if 'stdin' not in db.tables:
                if sys.stdin.isatty():
                    msg = ('Importing data from STDIN; '+
                           '%s twice to conclude.') % eof
                    print >>sys.stderr, msg
                db.import_table('stdin', sys.stdin)

    results = csv.writer(sys.stdout, dialect='excel')
    cursor = db.execute_query(query)
    results.writerow([field[0] for field in cursor.description])
    for row in cursor:
        results.writerow(row)
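
The loop above also recognizes the literal word stdin, so data can be piped in instead of being named as a .csv file; an invocation sketch following the usage text printed above:

cat phones.csv | csql "select first,last from stdin"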
Example #14
        return True, pubResultRow


viewedRecommendations = {}

print "Enter the name of the authors and publications dataset csv file without any suffix:"
datatsetFileName = raw_input()

if datatsetFileName == "":
    datatsetFileName = 'Ideas_Repec_Dataset_Pilot3_Clean'

with open(datatsetFileName + '.csv', 'rb') as fr:
    reader = csv.reader(fr)

    with open(datatsetFileName + '_Recommendations.csv', 'wb') as fw:
        writer = csv.writer(fw)

        pubResultRow = [
            'email', 'publication1', 'Wikipage1', 'WikipageURL1',
            'publication2', 'Wikipage2', 'WikipageURL2', 'publication3',
            'Wikipage3', 'WikipageURL3', 'publication4', 'Wikipage4',
            'WikipageURL4', 'publication5', 'Wikipage5', 'WikipageURL5',
            'publication6', 'Wikipage6', 'WikipageURL6', 'publication7',
            'Wikipage7', 'WikipageURL7'
        ]
        writer.writerow(pubResultRow)

        with open(datatsetFileName + '_Recommendations_Stats.csv', 'wb') as fw:
            writer_Stats = csv.writer(fw)

            wikipageResultRow = [
Example #15
def econPapersCategoriesExtractor(ideasURL, startSpecialization,
                                  endSpecialization):

    with open('Ideas_Repec_Dataset.csv', 'wb') as fw:
        writer = csv.writer(fw)

        with open('Ideas_Repec_Affiliations.csv', 'wb') as fwAffiliations:
            writerAffiliations = csv.writer(fwAffiliations)

            resultRow = [
                'firstName', 'lastName', 'email', 'specialization',
                'EconPapers Profile', 'affiliation', 'location', 'homepage',
                'publication1', 'publicationYear1', 'citation1',
                'firstKeyword1', 'publication2', 'publicationYear2',
                'citation2', 'firstKeyword2', 'publication3',
                'publicationYear3', 'citation3', 'firstKeyword3',
                'publication4', 'publicationYear4', 'citation4',
                'firstKeyword4', 'publication5', 'publicationYear5',
                'citation5', 'firstKeyword5', 'publication6',
                'publicationYear6', 'citation6', 'firstKeyword6',
                'publication7', 'publicationYear7', 'citation7',
                'firstKeyword7'
            ]
            writer.writerow(resultRow)

            resultRowAffiliations = [
                'firstName', 'lastName', 'email', 'affiliation1', 'location1',
                'affiliation2', 'location2', 'affiliation3', 'location3',
                'affiliation4', 'location4', 'affiliation5', 'location5',
                'affiliation6', 'location6', 'affiliation7', 'location7'
            ]
            writerAffiliations.writerow(resultRowAffiliations)

            ideasSoup = soupStructure(ideasURL)

            # If the content of the page is returned in BeautifulSoup structure:
            if ideasSoup != '':

                # Find the main list.
                mainList = ideasSoup.body.find(
                    text=re.compile('.*Accounting & Auditing.*')).parent.parent

                # If there is a main list:
                if mainList is not None:

                    # Set if the startSpecialization is observed.
                    startSpecializationObserved = False

                    # Find all li tags inside the main list.
                    liTags = mainList.findAll('li')

                    for i in range(len(liTags)):

                        # Find the hyperlink tag inside the list item.
                        aTag = liTags[i].find('a')

                        specialization = aTag.nextSibling[1:]

                        print str(specialization)

                        if specialization == startSpecialization:
                            startSpecializationObserved = True

                        if specialization != "All new papers" and specialization != "German Papers" and startSpecializationObserved:

                            econPapersAuthorListExtractor(
                                writer, writerAffiliations, specialization,
                                'https://ideas.repec.org' + aTag['href'])

                            # if foundTheLastSubject:
                            # 	return
                        if specialization == endSpecialization:
                            return
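
A hypothetical invocation (the directory URL and the end specialization are placeholders; 'Accounting & Auditing' is the text the function itself searches for in the page's main list):

econPapersCategoriesExtractor('https://ideas.repec.org/...',  # placeholder: the IDEAS/RePEc categories page
                              'Accounting & Auditing',
                              'Econometrics')                  # placeholder end specialization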