Example No. 1
# Assumed imports (not shown in the original snippet); the AWS_* credential constants
# and the helpers add_data_headers(), add_data() and data() are defined elsewhere in the project.
from xlrd import open_workbook
from xlwt import Workbook
from amazon_scraper import AmazonScraper


def main():
    # User settings
    input_file_name = 'data/input.xlsx'
    output_file_name = 'data/output_data'
    input_sheet_name = 'product_list'
    output_sheet_name = 'processed_data'

    number_of_items = 100

    # Initialize from the given settings
    book_in = open_workbook(input_file_name)
    sheet_in = book_in.sheet_by_name(input_sheet_name)

    # Get the list of items from the Excel file (skip the header row)
    ids = sheet_in.col_values(0, 1)
    product_types = sheet_in.col_values(1, 1)

    io = int(input('starting point?'))
    i = io
    amzn = AmazonScraper(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_ASSOCIATE_TAG)
    book_out = Workbook()
    sheet_out = book_out.add_sheet(output_sheet_name)
    add_data_headers(sheet_out)
    p_count = 0

    # Iterate through the items, saving the workbook after every lookup
    while i < len(ids):
        p = amzn.lookup(ItemId=ids[i])
        p_count += 1
        print('Processing', p_count)
        p_data = data(amzn, p, product_types[i])
        add_data(sheet_out, p_count, p_data)
        book_out.save(output_file_name + '_' + product_types[i] + '3.xls')
        i += 1
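
Example No. 1 calls add_data_headers() and add_data() without showing them. A minimal sketch of what they might look like with xlwt, assuming data() returns a flat list of cell values in header order; the column names below are illustrative, not taken from the original project:

from xlwt import Workbook

# Hypothetical column layout; the original project defines its own headers.
HEADERS = ['ASIN', 'Title', 'Product type', 'Price']

def add_data_headers(sheet):
    # Write one header label per column into row 0.
    for col, name in enumerate(HEADERS):
        sheet.write(0, col, name)

def add_data(sheet, row, p_data):
    # Write one value per column; row 0 is reserved for the headers.
    for col, value in enumerate(p_data):
        sheet.write(row, col, value)

if __name__ == '__main__':
    book = Workbook()
    sheet = book.add_sheet('processed_data')
    add_data_headers(sheet)
    add_data(sheet, 1, ['B00EXAMPLE', 'Sample product', 'book', 9.99])
    book.save('sketch_output.xls')
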
Example No. 2
# Assumed imports (not shown in the original snippet); queries, aws_module,
# url_scrape, data_ingester and get_date() are project modules defined elsewhere.
import json
import os

import amazon.api
from amazon_scraper import AmazonScraper


def update_reviews(asin_list):
    # Load the AWS credentials once and build a single scraper for every ASIN.
    with open(os.path.dirname(os.path.realpath(__file__)) + "/keys/aws_keys.json") as f:
        configs = json.loads(f.read())
    amzn = AmazonScraper(configs["aws_public_key"], configs["aws_secret_key"], configs["product_api_tag"])

    for asin in asin_list:
        try:
            p = amzn.lookup(ItemId=asin)
        except amazon.api.AsinNotFound:
            continue
        reviews = p.reviews()
        dates = queries.find_date_for_review(asin)
        media_type = queries.find_type_by_id(asin)
        unix_dates = [get_date(date) for date in dates]
        date = max(unix_dates)

        # Check whether Amazon has any review newer than the latest stored date.
        update = False
        for review in reviews:
            if date < int(review.date):
                print("needs updating")
                update = True

        # If the product has new reviews, fetch the text of every review from Amazon.
        list_of_review_dicts = []
        product_api = aws_module.setup_product_api()
        if update:
            for review in list(reviews):
                comment_dict = dict()
                comment_dict["text"] = url_scrape.parser(review.url)
                comment_dict["unixtime"] = int(review.date)
                list_of_review_dicts.append(comment_dict)

        # Ingest the (possibly empty) list of new reviews for this ASIN.
        data_ingester.handleReview(asin, list_of_review_dicts, product_api, media_type)
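
Example No. 2 relies on a get_date() helper (not shown) to turn stored review dates into Unix timestamps comparable with int(review.date). A minimal sketch, assuming the stored dates are ISO-8601 strings (an assumption; the real storage format behind queries.find_date_for_review() is not shown):

from datetime import datetime, timezone

def get_date(date):
    # Hypothetical helper: convert a stored ISO-8601 date string (or a datetime)
    # into a Unix timestamp so it can be compared with int(review.date).
    if isinstance(date, str):
        date = datetime.fromisoformat(date)
    return int(date.replace(tzinfo=timezone.utc).timestamp())

if __name__ == '__main__':
    print(get_date("2016-03-01"))  # 1456790400
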
Example No. 3
# Assumed import (not shown in the original snippet); acess_key, secret_key,
# customer_tag and print_review() are defined elsewhere in the project.
from amazon_scraper import AmazonScraper


def initialize(prodId):
    amzn = AmazonScraper(acess_key, secret_key, customer_tag, Region='IN')
    p = amzn.lookup(ItemId=prodId)
    rs = amzn.reviews(ItemId=prodId)

    # Collect the full text and title of every review, printing each one as it arrives.
    reviews, reviews_title = [], []
    for i, r in enumerate(rs, start=1):
        fr = r.full_review()
        print_review(fr.title, fr.text, i)
        reviews.append(fr.text)
        reviews_title.append(fr.title)

    # Sanitize the product title so it can safely be used as a file name.
    prodName = p.title.replace('.', '-').replace('/', '-')
    return reviews, reviews_title, prodName
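
Example No. 3 prints each review through a print_review() helper that is not part of the snippet. A minimal hypothetical version, with sample strings used only for the demo call:

def print_review(title, text, index):
    # Hypothetical helper: print one numbered review with its title and body.
    print("Review #{}: {}".format(index, title))
    print(text)
    print("-" * 40)

if __name__ == '__main__':
    print_review("Great product", "Arrived quickly and works as described.", 1)
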
Example No. 4
# Amazon Associates account (final code), and then you need to sign up to use
# the Product Advertising API within the Associates account

# Assumed imports and scraper setup (not shown in the original snippet); the
# AWS_* credential constants are placeholders defined elsewhere.
import csv

from amazon.api import AsinNotFound
from amazon_scraper import AmazonScraper

amzn = AmazonScraper(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_ASSOCIATE_TAG)

filename = "reviews_allinfo.csv"
filename2 = "reviews_notext.csv"

save_path = 'c:/output/'

# Read the list of product ASINs, one per row, from the first CSV column.
with open('product_ids.csv', newline='') as f:
    csv_f = csv.reader(f)
    items = [row[0].strip() for row in csv_f]

for number in items:

    try:
        p = amzn.lookup(ItemId=number)
    except AsinNotFound:
        print("Product {} was not found".format(number))
        continue

    rs = p.reviews()
    counter = 0

    try:
        for review in rs:
            print(review.asin)
            print(review.url)
            print(review.soup)
            counter += 1

            if (counter % 80) == 0: