Пример #1
0
def read_sentiment140(sentiment140Path="/data/sentiment140/sentiment140.csv"):
    """
        Get a generator for the sentiment140 dataset
          may need to provide local path to data.

        @Arguments:
            sentiment140Path: local path to the sentiment140 CSV file; when it
                is empty or does not exist, the path is requested on stdin

        Process:
            On the first call, write every CSV row to the cache file as one
            JSON line [preprocessed tweet, label], where label is 1 when
            column 0 equals "4" and -1 otherwise.
            The cache is built under a temporary name and renamed only once
            it is complete, so a failed run never leaves a truncated cache.

        @Return:
            result of cacheMaker for the cache file path
    """
    import os  # local import: only needed for the temporary-file swap below

    senti140Cache = join(cacheDir, "sentiment140.json")
    if not exists(senti140Cache):
        ensureCache()
        if not sentiment140Path or not exists(sentiment140Path):
            print("Please provide the local path to the sentiment140 dataset: ")
            sentiment140Path = sys.stdin.readline().strip()

        # Write to a temporary file first: if the source cannot be opened or a
        # row is malformed, the real cache path must stay absent, otherwise the
        # next call would treat an empty/partial file as a complete cache.
        partialCache = senti140Cache + ".partial"
        try:
            with open(sentiment140Path) as sentiPath:
                with open(partialCache, "w") as cacheFile:
                    for line in csv.reader(sentiPath):
                        text = line[5]
                        # csv yields byte strings on Python 2 and text on
                        # Python 3; only byte strings need decoding
                        if isinstance(text, bytes):
                            text = text.decode("utf-8")
                        label = 1 if line[0] == "4" else -1
                        cacheFile.write(
                            json.dumps([preprocess.tweet(text), label])
                        )
                        cacheFile.write("\n")
            os.rename(partialCache, senti140Cache)
        finally:
            # after a successful rename the temporary file no longer exists;
            # this only fires when something above failed
            if exists(partialCache):
                os.remove(partialCache)

    return cacheMaker(senti140Cache)
Пример #2
0
def read_sentiment140(sentiment140Path="/data/sentiment140/sentiment140.csv"):
    """
        Get a generator for the sentiment140 dataset

        @Arguments:
            sentiment140Path: path to load (and/or save) file

        Process:
            Request the path on stdin if none was given
            Download file if not present
            Cache file as (tweet,label) pairs, one JSON list per line
            (the cache is written under a temporary name and renamed once
            complete, so a failed run never leaves a truncated cache)

        @Return:
            generator to cached file (the result of cacheMaker)
    """

    senti140Cache = join(cacheDir, "sentiment140.json")

    # create cached file if necessary
    if not exists(senti140Cache):
        ensureCache()

        # request path to file if necessary
        if not sentiment140Path:
            print(
                "Please provide the local path to the sentiment140 dataset: ")
            sentiment140Path = sys.stdin.readline().strip()

        # download the file if it doesn't exist
        if not exists(sentiment140Path):

            # download entire source zipfile from internet
            print("Downloading sentiment140 dataset from Stanford...")
            file_path = get_file(url_sentiment140)

            # the move below fails when the destination directory is
            # missing, so create it first
            target_dir = os.path.dirname(sentiment140Path)
            if target_dir and not exists(target_dir):
                os.makedirs(target_dir)

            # save specified CSV from zipfile
            with ZipFile(file_path, 'r') as zp:
                zp.extract(csv_sentiment140, dir_tmp_sentiment140)
                shutil.move(
                    os.path.join(dir_tmp_sentiment140, csv_sentiment140),
                    sentiment140Path)

        # write to a temporary file and rename it once complete: if reading
        # or parsing fails midway, the real cache path must stay absent,
        # otherwise the next call would accept a partial file as the cache
        partial_cache = senti140Cache + ".partial"
        try:
            with open(sentiment140Path) as sentiPath:
                with open(partial_cache, "w") as cacheFile:

                    # iterate over CSV entries
                    for line in latin_csv_reader(sentiPath, delimiter=','):

                        # format text
                        # NOTE(review): the tweet text is read through
                        # index_sentiment140_label and the label through
                        # index_sentiment140_text; presumably the constants
                        # are defined to match -- confirm against their
                        # definitions
                        text = preprocess.tweet(line[index_sentiment140_label])

                        # generate binary label
                        if line[index_sentiment140_text] == label_sentiment140_positive:
                            label = label_positive
                        else:
                            label = label_negative

                        # write (text,label) pairs
                        cacheFile.write(json.dumps([text, label]))
                        cacheFile.write("\n")

            os.rename(partial_cache, senti140Cache)
        finally:
            # after a successful rename the temporary file is gone; this
            # only removes leftovers from a failed run
            if exists(partial_cache):
                os.remove(partial_cache)

    return cacheMaker(senti140Cache)
Пример #3
0
def read_sentiment140(sentiment140Path="/data/sentiment140/sentiment140.csv"):
    """
        Get a generator for the sentiment140 dataset

        @Arguments:
            sentiment140Path: path to load (and/or save) file

        Process:
            Request the path on stdin if none was given
            Download file if not present
            Cache file as (tweet,label) pairs, one JSON list per line
            (the cache is written under a temporary name and renamed once
            complete, so a failed run never leaves a truncated cache)

        @Return:
            generator to cached file (the result of cacheMaker)
    """

    senti140Cache = join(cacheDir, "sentiment140.json")

    # create cached file if necessary
    if not exists(senti140Cache):
        ensureCache()

        # request path to file if necessary
        if not sentiment140Path:
            print("Please provide the local path to the sentiment140 dataset: ")
            sentiment140Path = sys.stdin.readline().strip()

        # download the file if it doesn't exist
        if not exists(sentiment140Path):

            # download entire source zipfile from internet
            print("Downloading sentiment140 dataset from Stanford...")
            file_path = get_file(url_sentiment140)

            # shutil.move cannot create the destination directory itself,
            # so make sure it is there before moving the extracted CSV
            destination_dir = os.path.dirname(sentiment140Path)
            if destination_dir and not exists(destination_dir):
                os.makedirs(destination_dir)

            # save specified CSV from zipfile
            with ZipFile(file_path, 'r') as zp:
                zp.extract(csv_sentiment140, dir_tmp_sentiment140)
                shutil.move(os.path.join(dir_tmp_sentiment140, csv_sentiment140), sentiment140Path)

        # build the cache under a temporary name and rename it only when it
        # is complete; a failure while reading or parsing must not leave a
        # truncated file that later calls would mistake for the full cache
        unfinished_cache = senti140Cache + ".partial"
        try:
            with open(sentiment140Path) as sentiPath:
                with open(unfinished_cache, "w") as cacheFile:

                    # iterate over CSV entries
                    for line in latin_csv_reader(sentiPath, delimiter=','):

                        # format text
                        # NOTE(review): the tweet text is read through
                        # index_sentiment140_label and the label through
                        # index_sentiment140_text; presumably the constants
                        # are defined to match -- confirm against their
                        # definitions
                        text = preprocess.tweet(line[index_sentiment140_label])

                        # generate binary label
                        if line[index_sentiment140_text] == label_sentiment140_positive:
                            label = label_positive
                        else:
                            label = label_negative

                        # write (text,label) pairs
                        cacheFile.write(json.dumps([text, label]))
                        cacheFile.write("\n")

            os.rename(unfinished_cache, senti140Cache)
        finally:
            # nothing to do after a successful rename; otherwise drop the
            # leftovers of the failed run
            if exists(unfinished_cache):
                os.remove(unfinished_cache)

    return cacheMaker(senti140Cache)