Example #1
from pyhdfs import HdfsClient

def update_csv():
    local = '/Users/constantine/PycharmProjects/test02/data.csv'
    tmpLocal = '/Users/constantine/PycharmProjects/test02/tmpdata.csv'
    remote = '/data/data.csv'
    host = '127.0.0.1:9870'
    user_name = 'host'
    client = HdfsClient(hosts=host, user_name=user_name)
    if client.exists(remote):
        # Download the existing remote file and remove it from HDFS
        client.copy_to_local(remote, tmpLocal)
        client.delete(remote)

        # Append the local rows to the downloaded copy
        with open(local, 'r') as fRead, open(tmpLocal, 'a') as fWrite:
            fWrite.writelines(fRead.readlines())

        # Deduplicate the merged rows, keeping their original order
        with open(tmpLocal, 'r') as fRead:
            lines = fRead.read().split('\n')
        with open(tmpLocal, 'w') as fWrite:
            fWrite.write('\n'.join(dict.fromkeys(lines)))

        # Upload the merged file back to HDFS
        client.copy_from_local(tmpLocal, remote)
    else:
        client.copy_from_local(local, remote)
Example #2
from pyhdfs import HdfsClient


def Copy_From_Local(file):
    '''
    Upload a file to Hadoop.
    '''
    h_file = ('/tmp/te/%s' % file)
    client = HdfsClient(hosts='localhost:50070')  # HDFS address; connect to the cluster
    # If the file already exists on HDFS, delete it first
    if client.exists(h_file):
        client.delete(h_file)
    client.copy_from_local(file, h_file)
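A brief usage sketch (the file name below is a placeholder, not part of the original example): uploading a local file places it under /tmp/te/ on HDFS.

Copy_From_Local('report.csv')  # placeholder file name; ends up as /tmp/te/report.csv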
Example #3
import urllib.parse

import requests
from lxml import html
from pyhdfs import HdfsClient


def crawler(word, products_list=[]):
    """ Crawl product data from Yihaodian (yhd.com) """
    word = urllib.parse.quote(word)

    url = 'https://search.yhd.com/c0-0/k{0}'.format(word)

    # Fetch the HTML source
    html_doc = requests.get(url).text

    # Build an lxml document for XPath queries
    selector = html.fromstring(html_doc)

    # Product list
    ul_list = selector.xpath('//div[@id="itemSearchList"]/div')

    # Parse the data
    for li in ul_list:

        # Title
        title = li.xpath('div//p[@class="proName clearfix"]/a/@title')

        # Link
        link = li.xpath('div//p[@class="proName clearfix"]/a/@href')

        # Price
        price = li.xpath('div//p[@class="proPrice"]/em/@yhdprice')

        # Append the prices to a local file (closed automatically by the with block)
        with open("p_price", "a", encoding="gbk") as f:
            for j in range(len(price)):
                f.write(price[j] + "\n")

        if len(title) > 0 and len(link) > 0 and len(price) > 0:
            products_list.append({
                'title': title[0],
                'price': price[0],
                'link': 'https:' + link[0],
                'referer': '1号店'
            })

    # Upload the collected price file to HDFS
    client = HdfsClient(hosts='222.27.166.209:50070', user_name='hadoop')
    client.copy_from_local('/home/hadoop/Downloads/PriceCompaer/p_price',
                           '/p_price.txt')
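An illustrative call of the function above; the search keyword and the final print are placeholders added here for context.

products = []
crawler('laptop', products)  # placeholder keyword
print('collected %d products' % len(products))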
Example #4
from pyhdfs import HdfsClient


def basic():
    client = HdfsClient(hosts='study:50070')
    print(client.list_status('/'))

    print('Check whether a path exists')
    print(client.exists("/test"))
    print(client.exists("/data/gz/thrift-0.9.2.tar.gz"))

    print(client.get_file_checksum("/data/gz/bison-2.5.1.tar.gz"))

    summary = client.get_content_summary("/")
    print(summary)

    # File copy: from HDFS to the local filesystem
    client.copy_to_local("/data/gz/pip-7.1.2.tar.gz", "/root/data/pip-7.1.2.tar.gz")
    # File copy: from the local filesystem to HDFS
    client.copy_from_local("/root/data/thrift-0.9.2.tar.gz", "/data/gz/thrift-0.9.2.tar.gz")

    print(client.get_home_directory())
Example #5
    #db.commit()

    cursor.execute(sql4)
    db.commit()

    cursor.execute(sql5)
    db.commit()
    #cursor.execute(sql6)
    #db.commit()

    mysql_time = time.asctime(time.localtime(time.time()))
    print "Data into Mysql: ", mysql_time
    cursor.execute(sql7)
    db.commit()

    # Upload the exported data file to HDFS
    client.copy_from_local('/tmp/data_5q.txt', '/data_5q.txt')

    #results = cursor.fetchall()
    #client.create('/data_1000.txt', '\0')
    #for row in results:
    #    f = row[0]
    #    name = row[1]
    #    score = row[2]
    #    s = str(f) + ',' + name + ',' + str(score) + '\n'
    #    client.append('/data_1000.txt', s)
    endtime = time.asctime(time.localtime(time.time()))
    print "Data into hdfs: ", endtime
except Exception as e:
    print("Error: " + str(e))
db.close()
Example #6
from pyhdfs import HdfsClient


def csv_to_hdfs(row):
    client = HdfsClient(hosts=HDFS_IP)
    # The local file is an absolute path; the HDFS destination must not already exist
    client.copy_from_local("%s%s%s.csv" % (local_csv_dir, tablename, row),
                           '/hivecsv-%s%s' % (tablename, row))
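If the HDFS destination might already exist, a guard like the one in Example #2 can be added before the upload; a minimal sketch reusing the names from the snippet above:

dest = '/hivecsv-%s%s' % (tablename, row)
if client.exists(dest):  # remove a stale copy so copy_from_local does not fail
    client.delete(dest)
client.copy_from_local("%s%s%s.csv" % (local_csv_dir, tablename, row), dest)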
Example #7
# Upload a local file to HDFS storage
from pyhdfs import HdfsClient

client = HdfsClient(hosts='ghym:50070', user_name='hadoop')
client.copy_from_local(
    'D:/programs/workspace/pythonworks/doubanuser/doubanuser/userdemo.txt',
    '/score.txt')
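A small follow-up check, not part of the original snippet, can confirm the upload using calls already shown on this page:

print(client.exists('/score.txt'))  # True once the upload has completed
print(client.list_status('/'))      # the root listing now contains an entry for score.txt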
Example #8
    #cursor.execute(sql3)
    #db.commit()

    #cursor.execute(sql4)
    #db.commit()
    cursor.execute(sql5)
    db.commit()
    #cursor.execute(sql6)
    #db.commit()

    mysql_time = time.asctime(time.localtime(time.time()))
    print "Data into Mysql: ", mysql_time
    cursor.execute(sql7)
    db.commit()

    # Upload the exported data file to HDFS
    client.copy_from_local('/tmp/data_1000m.txt', '/data_1000m.txt')

    #results = cursor.fetchall()
    #client.create('/data_1000.txt', '\0')
    #for row in results:
    #    f = row[0]
    #    name = row[1]
    #    score = row[2]
    #    s = str(f) + ',' + name + ',' + str(score) + '\n'
    #    client.append('/data_1000.txt', s)
    endtime = time.asctime(time.localtime(time.time()))
    print "Data into hdfs: ", endtime
except Exception as e:
    print("Error: " + str(e))
db.close()