def read_sentiment140(sentiment140Path="/data/sentiment140/sentiment140.csv"):
    """
    Get a generator for the sentiment140 dataset.

    @Arguments:
        sentiment140Path: local path to the sentiment140 CSV file;
            prompted from stdin when missing

    @Return:
        generator over cached (tweet, label) pairs
    """
    senti140Cache = join(cacheDir, "sentiment140.json")

    # build the JSON cache only on first use
    if not exists(senti140Cache):
        ensureCache()

        # fall back to asking the user when the path is absent or wrong
        if not sentiment140Path or not exists(sentiment140Path):
            print("Please provide the local path to the sentiment140 dataset: ")
            sentiment140Path = sys.stdin.readline().strip()

        with open(senti140Cache, "w") as cacheFile:
            # the sentiment140 CSV is latin-1 encoded; decoding at open time
            # replaces the old `line[5].decode("utf-8")`, which fails under
            # Python 3 because csv.reader already yields str (no .decode)
            with open(sentiment140Path, encoding="latin-1") as sentiPath:
                reader = csv.reader(sentiPath)
                for line in reader:
                    # column 5 holds the tweet text; column 0 the polarity
                    # ("4" = positive -> 1, anything else -> -1)
                    cacheFile.write(
                        json.dumps([preprocess.tweet(line[5]),
                                    1 if line[0] == "4" else -1]))
                    cacheFile.write("\n")
    return cacheMaker(senti140Cache)
def read_sentiment140(sentiment140Path="/data/sentiment140/sentiment140.csv"):
    """
    Get a generator for the sentiment140 dataset

    @Arguments:
        sentiment140Path: path to load (and/or save) file

    Process:
        Download file if not present
        Cache file as (tweet,label) pairs

    @Return:
        generator to cached file
    """
    senti140Cache = join(cacheDir, "sentiment140.json")

    # create cached file if necessary
    if not exists(senti140Cache):
        ensureCache()

        # request path to file if necessary
        if not sentiment140Path:
            print("Please provide the local path to the sentiment140 dataset: ")
            sentiment140Path = sys.stdin.readline().strip()

        # download the file if it doesn't exist
        if not exists(sentiment140Path):
            # download entire source zipfile from internet
            print("Downloading sentiment140 dataset from Stanford...")
            file_path = get_file(url_sentiment140)

            # save specified CSV from zipfile
            with ZipFile(file_path, 'r') as zp:
                zp.extract(csv_sentiment140, dir_tmp_sentiment140)
            shutil.move(
                os.path.join(dir_tmp_sentiment140, csv_sentiment140),
                sentiment140Path)

        # write to cache
        with open(senti140Cache, "w") as cacheFile:
            with open(sentiment140Path) as sentiPath:
                # enumerate over CSV entries
                reader = latin_csv_reader(sentiPath, delimiter=',')
                for i, line in enumerate(reader):
                    # format text — read from the TEXT column
                    # (BUG FIX: text/label indices were swapped, so the
                    # polarity digit was preprocessed as the tweet body)
                    text = preprocess.tweet(line[index_sentiment140_text])

                    # generate binary label from the polarity column
                    if line[index_sentiment140_label] == label_sentiment140_positive:
                        label = label_positive
                    else:
                        label = label_negative

                    # write (text,label) pairs
                    cacheFile.write(json.dumps([text, label]))
                    cacheFile.write("\n")
    return cacheMaker(senti140Cache)
def read_sentiment140(sentiment140Path="/data/sentiment140/sentiment140.csv"):
    """
    Get a generator for the sentiment140 dataset

    @Arguments:
        sentiment140Path: path to load (and/or save) file

    Process:
        Download file if not present
        Cache file as (tweet,label) pairs

    @Return:
        generator to cached file
    """
    senti140Cache = join(cacheDir, "sentiment140.json")

    # create cached file if necessary
    if not exists(senti140Cache):
        ensureCache()

        # request path to file if necessary
        if not sentiment140Path:
            print("Please provide the local path to the sentiment140 dataset: ")
            sentiment140Path = sys.stdin.readline().strip()

        # download the file if it doesn't exist
        if not exists(sentiment140Path):
            # download entire source zipfile from internet
            print("Downloading sentiment140 dataset from Stanford...")
            file_path = get_file(url_sentiment140)

            # save specified CSV from zipfile
            with ZipFile(file_path, 'r') as zp:
                zp.extract(csv_sentiment140, dir_tmp_sentiment140)
            shutil.move(os.path.join(dir_tmp_sentiment140, csv_sentiment140),
                        sentiment140Path)

        # write to cache
        with open(senti140Cache, "w") as cacheFile:
            with open(sentiment140Path) as sentiPath:
                # enumerate over CSV entries
                reader = latin_csv_reader(sentiPath, delimiter=',')
                for i, line in enumerate(reader):
                    # format text — read from the TEXT column
                    # (BUG FIX: text/label indices were swapped, so the
                    # polarity digit was preprocessed as the tweet body)
                    text = preprocess.tweet(line[index_sentiment140_text])

                    # generate binary label from the polarity column
                    if line[index_sentiment140_label] == label_sentiment140_positive:
                        label = label_positive
                    else:
                        label = label_negative

                    # write (text,label) pairs
                    cacheFile.write(json.dumps([text, label]))
                    cacheFile.write("\n")
    return cacheMaker(senti140Cache)