def filter(self):
        """Write filtered variants of every unfiltered per-category dictionary.

        Walks ``<SAVE_DICTIONARY_DIR>/Unfiltered`` (directories whose path
        contains "negative" or "positive" are skipped) and loads every file
        whose name contains ``_dict.dict``; the text before that marker is
        taken as the category name.  For each entry of
        ``FILTERED_DICTINARIES_CONF`` that defines both ``keep_n`` and
        ``no_below``, the following files are written to
        ``<SAVE_DICTIONARY_DIR>/Filtered/<category>/<conf_name>/``:

        * the filtered dictionary in binary and in text form, and
        * a text file with one ``<token> <document frequency>`` line per
          token, most frequent first.

        The category's positive and negative seed words that occur in the
        dictionary are passed to ``filter_extremes`` as ``keep_tokens``.

        The process exits (via ``exit``) when ``SAVE_DICTIONARY_DIR`` or
        ``FILTERED_DICTINARIES_CONF`` is missing or empty in the config.

        Raises:
            IOError: if the ``Unfiltered`` directory does not exist.
        """
        # Local imports: only this method needs them.
        import copy
        import io

        SAVE_DICTIONARY_DIR = config.get("SAVE_DICTIONARY_DIR", None)
        FILTERED_DICTINARIES_CONF = config.get("FILTERED_DICTINARIES_CONF",
                                               None)
        if not SAVE_DICTIONARY_DIR:
            print(
                "config keys - SAVE_DICTIONARY_DIR is not set correctly in the config file: socialconfig.py"
            )
            exit(0)

        if not FILTERED_DICTINARIES_CONF:
            print(
                "Configuration for filtering dictionaries is not defined in socialconfig.py"
            )
            exit()

        unfiltered_root = os.path.join(SAVE_DICTIONARY_DIR, "Unfiltered")
        # isdir() is False for missing paths as well as for regular files.
        if not os.path.isdir(unfiltered_root):
            raise IOError(
                "Directory {d} does not exist".format(d=unfiltered_root))

        filtered_root = os.path.join(SAVE_DICTIONARY_DIR, "Filtered")
        if not os.path.isdir(filtered_root):
            os.makedirs(filtered_root)

        for bdir, _subdirs, files in os.walk(unfiltered_root):
            if "negative" in bdir or "positive" in bdir:
                continue

            for file_name in files:
                if "_dict.dict" not in file_name:
                    print("{file} ignored, not of type dict".format(
                        file=file_name))
                    continue

                dictionary = Dictionary.load(os.path.join(bdir, file_name))
                yelp_category = file_name.split("_dict.dict")[0]

                # Build a fresh list each time so the module-level seed
                # lists are never modified.
                if yelp_category == "food":
                    seeds = list(FOOD_POSITIVE_SEEDS) + list(
                        FOOD_NEGATIVE_SEEDS)
                elif yelp_category == "bars":
                    seeds = list(BARS_POSITIVE_SEEDS) + list(
                        BARS_NEGATIVE_SEEDS)
                elif yelp_category == "leisure":
                    seeds = list(LEISURE_POSITIVE_SEEDS) + list(
                        LEISURE_NEGATIVE_SEEDS)
                elif yelp_category == "grooming":
                    seeds = list(GROOMING_POSITIVE_SEEDS) + list(
                        GROOMING_NEGATIVE_SEEDS)
                elif yelp_category == "health":
                    seeds = list(HEALTH_POSITIVE_SEEDS) + list(
                        HEALTH_NEGATIVE_SEEDS)
                elif yelp_category == "learn":
                    seeds = list(LEARN_POSITIVE_SEEDS) + list(
                        LEARN_NEGATIVE_SEEDS)
                elif yelp_category == "municipal":
                    seeds = list(MUNICIPAL_POSITIVE_SEEDS) + list(
                        MUNICIPAL_NEGATIVE_SEEDS)
                elif yelp_category == "sports":
                    seeds = list(SPORTS_POSITIVE_SEEDS) + list(
                        SPORTS_NEGATIVE_SEEDS)
                elif yelp_category == "planning":
                    seeds = list(PLANNING_POSITIVE_SEEDS) + list(
                        PLANNING_NEGATIVE_SEEDS)
                elif yelp_category == "services":
                    seeds = list(SERVICES_POSITIVE_SEEDS) + list(
                        SERVICES_NEGATIVE_SEEDS)
                elif yelp_category == "shopping":
                    seeds = list(SHOPPING_POSITIVE_SEEDS) + list(
                        SHOPPING_NEGATIVE_SEEDS)
                else:
                    seeds = []

                # filter_extremes() expects token strings in keep_tokens,
                # so keep the seed words themselves (those the dictionary
                # actually contains), not ids or bag-of-words tuples.
                keep_tokens = [
                    token for token in seeds if token in dictionary.token2id
                ]

                category_dir = os.path.join(filtered_root, yelp_category)
                if not os.path.isdir(category_dir):
                    os.makedirs(category_dir)

                for conf_name, conf in FILTERED_DICTINARIES_CONF.items():
                    keep_n = conf.get("keep_n", None)
                    no_below = conf.get("no_below", None)
                    if not (keep_n and no_below):
                        print(
                            "Dictionary config: `{configuration}` does not define values for keep_n and no_below, skipping."
                            .format(configuration=conf_name))
                        continue

                    save_in_dir = os.path.join(category_dir, conf_name)
                    if not os.path.isdir(save_in_dir):
                        os.makedirs(save_in_dir)

                    dict_path = os.path.join(
                        save_in_dir,
                        "{cat}_filtered_top_{keepn}_tokens_dict.dict".format(
                            cat=yelp_category, keepn=keep_n))
                    text_path = os.path.join(
                        save_in_dir,
                        "{cat}_filtered_top_{keepn}_tokens_dict.txt".format(
                            cat=yelp_category, keepn=keep_n))
                    doc_freqs_path = os.path.join(
                        save_in_dir,
                        "{cat}_top_{keepn}_token_doc_freqs.txt".format(
                            cat=yelp_category, keepn=keep_n))

                    # filter_extremes() works in place, so every
                    # configuration filters its own copy of the unfiltered
                    # dictionary instead of the previous config's result.
                    filtered = copy.deepcopy(dictionary)
                    filtered.filter_extremes(keep_n=keep_n,
                                             no_below=no_below,
                                             keep_tokens=keep_tokens)
                    filtered.save(dict_path)
                    filtered.save_as_text(text_path)

                    sorted_doc_freqs = sorted(filtered.dfs.items(),
                                              key=lambda item: item[1],
                                              reverse=True)

                    # Write text as UTF-8 directly; wrapping encoded bytes
                    # in str() would emit b'...' reprs on Python 3.
                    with io.open(doc_freqs_path, 'w',
                                 encoding='utf-8') as socialfile:
                        for token_id, doc_freq in sorted_doc_freqs:
                            socialfile.write(u"{token} {freq}\n".format(
                                token=filtered.get(token_id, "Unknown"),
                                freq=doc_freq))

                    # Report once per configuration, not once per token.
                    print(
                        "Saved filtered dictionary ({conf_type}) and token document frequencies for tokens in {category}-reviews in directory: {d}"
                        .format(category=yelp_category,
                                d=category_dir,
                                conf_type=conf_name))
from socialsent.polarity_induction_methods import graph_propagate
from socialsent.polarity_induction_methods import dist

from socialsent.evaluate_methods import binary_metrics
from socialsent.representations.representation_factory import create_representation
from textblob import TextBlob as tb
from textblob import Word as wd
import operator
import re
import traceback

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

if __name__ == "__main__":
    SAVE_WORD_EMBEDDINGS_DIR = config.get("SAVE_WORD_EMBEDDINGS_DIR", None)
    SAVE_DICTIONARY_DIR = config.get("SAVE_DICTIONARY_DIR", None)
    SAVE_POLARITIES_DIR = config.get("SAVE_POLARITIES_DIR", None)

    if not (SAVE_WORD_EMBEDDINGS_DIR and SAVE_DICTIONARY_DIR
            and SAVE_POLARITIES_DIR):
        print(
            "Keys (SAVE_WORD_EMBEDDINGS_DIR|SAVE_DICTIONARY_DIR|SAVE_POLARITIES_DIR) not set in the config file socialconfig.py"
        )
        exit()

    SAVE_FILTERED_DICTIONARY_DIR = os.path.join(SAVE_DICTIONARY_DIR,
                                                "Filtered")

    if not (os.path.exists(SAVE_WORD_EMBEDDINGS_DIR)
            and os.path.isdir(SAVE_WORD_EMBEDDINGS_DIR)):
# Example #3
# 0
    def process(self):
        """Re-write the raw Yelp reviews file as buckets of 50,000 reviews.

        Reads the file ``YELP_REVIEWS_FILE_NAME`` inside ``YELP_DATASET_DIR``
        line by line (one JSON document per line) and writes, for every
        review, a JSON line holding ``business_id``, ``review_text``,
        ``review_id`` and ``rating`` into ``SAVE_REVIEWS_DIRECTORY``.  The
        first output file is ``yelp_reviews_0_50000.json``; after every
        50,000 input lines a new ``yelp_reviews_<from>_<to>.json`` file is
        started.

        The process exits (via ``exit``) when the config keys are missing,
        the dataset directory does not exist, or the reviews file cannot be
        opened.
        """
        YELP_DATASET_DIR = config.get("YELP_DATASET_DIR", None)
        YELP_REVIEWS_FILE_NAME = config.get("YELP_REVIEWS_FILE_NAME", None)
        SAVE_REVIEWS_DIRECTORY = config.get("SAVE_REVIEWS_DIRECTORY", None)

        if not (YELP_DATASET_DIR and YELP_REVIEWS_FILE_NAME
                and SAVE_REVIEWS_DIRECTORY):
            print(
                "config keys are not set correctly in the config file: socialconfig.py"
            )
            exit(0)

        if not os.path.isdir(YELP_DATASET_DIR):
            print(
                "Either Yelp Dataset directory path is not set correctly in the socialconfig.py file \nOR"
                "\nThe directory does not exist. Please make sure you have downloaded the yelp dataset(in JSON format) and copied the `yelp_academic_dataset_business.json` and `yelp_academic_dataset_review.json` files into the yelp_dataset sub-directory of your project directory(socialsentRun)"
            )
            exit()

        YELP_REVIEWS_ABS_FILE_PATH = os.path.join(YELP_DATASET_DIR,
                                                  YELP_REVIEWS_FILE_NAME)

        # Fail early with a readable message if the reviews file cannot be
        # opened at all.
        try:
            with open(YELP_REVIEWS_ABS_FILE_PATH, 'r'):
                pass
        except IOError:
            msg = "Error opening file: {}".format((YELP_REVIEWS_FILE_NAME))
            print(msg)
            print(traceback.format_exc())
            exit()

        if not os.path.isdir(SAVE_REVIEWS_DIRECTORY):
            os.makedirs(SAVE_REVIEWS_DIRECTORY)
            print(
                "Created directory: {d} to save yelp reviews in buckets of 50,000 reviews per file"
                .format(d=SAVE_REVIEWS_DIRECTORY))

        current_file = os.path.join(SAVE_REVIEWS_DIRECTORY,
                                    'yelp_reviews_0_50000.json')

        wfile = open(current_file, 'w')
        i = 0
        # try/finally guarantees the bucket being written is flushed and
        # closed even if reading or parsing fails midway.
        try:
            reviews = get_reviews_iterable(YELP_REVIEWS_ABS_FILE_PATH)

            print("Reading Yelp Reviews")
            for line in reviews:
                i += 1
                json_dict = ujson.loads(line)
                review_text = json_dict.get("text", None)
                business_id = json_dict.get("business_id", None)
                review_id = json_dict.get("review_id", None)
                review_rating = json_dict.get("stars", None)

                # NOTE(review): this skips a record only when ALL four
                # fields are missing; records missing just some of them are
                # still written with null values -- confirm this is intended.
                if not (review_id or business_id or review_text
                        or review_rating):
                    continue

                write_dict = {
                    'business_id': business_id,
                    'review_text': review_text,
                    'review_id': review_id,
                    'rating': review_rating,
                }
                wfile.write(ujson.dumps(write_dict) + "\n")

                if i % 100 == 0:
                    print("{num}00 reviews processed".format(
                        num=str(int(i / 100))))
                if i % 50000 == 0:
                    new_file = os.path.join(
                        SAVE_REVIEWS_DIRECTORY,
                        'yelp_reviews_{from_num}_{to_num}.json'.format(
                            from_num=(i + 1), to_num=(i + 50000)))
                    print("Changing file from {old_file} to {new_file}".format(
                        old_file=current_file, new_file=new_file))
                    # Close the finished bucket before starting the next.
                    wfile.close()
                    wfile = open(new_file, 'w')
                    current_file = new_file
        finally:
            wfile.close()
        print("Processed {total} reviews".format(total=str(i)))
                                print("Excluded Category encountered: {cat}".
                                      format(cat=cat))

                        if review_counter % 100 == 0:
                            print("{num}00 reviews processed".format(
                                num=(str(int(review_counter / 100)))))

                        if review_counter >= PROCESS_N_REVIEWS_ONLY:
                            break

        print("{count} Reviews processed".format(count=review_counter))


if __name__ == "__main__":

    # Directory that holds the business_id -> business_category mapping
    # files; it must be configured and must already exist.
    SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY = config.get(
        "SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY", None)

    if not SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY:
        print(
            "Cannot load business_id to business_category mappings as the location(SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY) of the files that hold those mappings is not specified correctly in the config file socialconfig.py"
        )
        exit()

    if (not os.path.exists(SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY)
            or not os.path.isdir(SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY)):
        invalid_dir_template = "{SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY} is either not a valid directory or it does not exist"
        print(
            invalid_dir_template.format(
                SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY=
                SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY))
        exit()

    # Instantiate the review segregator.
    ps = segregate_reviews_by_category()
    def process(self):
        # directory where reviews are kept
        SAVE_REVIEWS_DIRECTORY = config.get("SAVE_REVIEWS_DIRECTORY", None)
        SAVE_REVIEWS_BY_CATEGORY_DIRECTORY = config.get(
            "SAVE_REVIEWS_BY_CATEGORY_DIRECTORY", None)
        PROCESS_N_REVIEWS_ONLY = int(
            config.get("PROCESS_N_REVIEWS_ONLY", 1000000))
        print(
            "Will process only {num} reviews as per the directive in the socialconfig.py"
            .format(num=str(PROCESS_N_REVIEWS_ONLY)))

        if not (SAVE_REVIEWS_DIRECTORY and SAVE_REVIEWS_BY_CATEGORY_DIRECTORY):
            print(
                "config keys are not set correctly in the config file: socialconfig.py"
            )
            exit(0)

        if not os.path.exists(
                SAVE_REVIEWS_BY_CATEGORY_DIRECTORY) and not os.path.isdir(
                    SAVE_REVIEWS_BY_CATEGORY_DIRECTORY):
            os.makedirs(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY)

        bars_file_counter, food_file_counter, grooming_file_counter, learn_file_counter, leisure_file_counter, municipal_file_counter, planning_file_counter, services_file_counter, shopping_file_counter, sports_file_counter, health_file_counter, other_file_counter = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        bars_dir = os.path.join(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY, "bars")
        if not (os.path.exists(bars_dir) and os.path.isdir(bars_dir)):
            os.makedirs(bars_dir)
        bars_file_path = os.path.join(bars_dir, "yelp_reviews_bars_1.txt")

        food_dir = os.path.join(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY, "food")
        if not (os.path.exists(food_dir) and os.path.isdir(food_dir)):
            os.makedirs(food_dir)
        food_file_path = os.path.join(food_dir, "yelp_reviews_food_1.txt")

        grooming_dir = os.path.join(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY,
                                    "grooming")
        if not (os.path.exists(grooming_dir) and os.path.isdir(grooming_dir)):
            os.makedirs(grooming_dir)
        grooming_file_path = os.path.join(grooming_dir,
                                          "yelp_reviews_grooming_1.txt")

        learn_dir = os.path.join(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY, "learn")
        if not (os.path.exists(learn_dir) and os.path.isdir(learn_dir)):
            os.makedirs(learn_dir)
        learn_file_path = os.path.join(learn_dir, "yelp_reviews_learn_1.txt")

        leisure_dir = os.path.join(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY,
                                   "leisure")
        if not (os.path.exists(leisure_dir) and os.path.isdir(leisure_dir)):
            os.makedirs(leisure_dir)
        leisure_file_path = os.path.join(leisure_dir,
                                         "yelp_reviews_leisure_1.txt")

        municipal_dir = os.path.join(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY,
                                     "municipal")
        if not (os.path.exists(municipal_dir)
                and os.path.isdir(municipal_dir)):
            os.makedirs(municipal_dir)
        municipal_file_path = os.path.join(municipal_dir,
                                           "yelp_reviews_municipal_1.txt")

        planning_dir = os.path.join(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY,
                                    "planning")
        if not (os.path.exists(planning_dir) and os.path.isdir(planning_dir)):
            os.makedirs(planning_dir)
        planning_file_path = os.path.join(planning_dir,
                                          "yelp_reviews_planning_1.txt")

        services_dir = os.path.join(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY,
                                    "services")
        if not (os.path.exists(services_dir) and os.path.isdir(services_dir)):
            os.makedirs(services_dir)
        services_file_path = os.path.join(services_dir,
                                          "yelp_reviews_services_1.txt")

        shopping_dir = os.path.join(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY,
                                    "shopping")
        if not (os.path.exists(shopping_dir) and os.path.isdir(shopping_dir)):
            os.makedirs(shopping_dir)
        shopping_file_path = os.path.join(shopping_dir,
                                          "yelp_reviews_shopping_1.txt")

        sports_dir = os.path.join(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY, "sports")
        if not (os.path.exists(sports_dir) and os.path.isdir(sports_dir)):
            os.makedirs(sports_dir)
        sports_file_path = os.path.join(sports_dir,
                                        "yelp_reviews_sports_1.txt")

        health_dir = os.path.join(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY, "health")
        if not (os.path.exists(health_dir) and os.path.isdir(health_dir)):
            os.makedirs(health_dir)
        health_file_path = os.path.join(health_dir,
                                        "yelp_reviews_health_1.txt")

        other_dir = os.path.join(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY, "other")
        if not (os.path.exists(other_dir) and os.path.isdir(other_dir)):
            os.makedirs(other_dir)
        other_file_path = os.path.join(other_dir, "yelp_reviews_other_1.txt")

        bars_file = open(bars_file_path, 'w')
        food_file = open(food_file_path, 'w')
        grooming_file = open(grooming_file_path, 'w')
        learn_file = open(learn_file_path, 'w')
        leisure_file = open(leisure_file_path, 'w')
        municipal_file = open(municipal_file_path, 'w')
        planning_file = open(planning_file_path, 'w')
        services_file = open(services_file_path, 'w')
        shopping_file = open(shopping_file_path, 'w')
        sports_file = open(sports_file_path, 'w')
        health_file = open(health_file_path, 'w')
        other_file = open(other_file_path, 'w')

        review_counter = 0
        for file in os.listdir(SAVE_REVIEWS_DIRECTORY):
            if not os.path.isdir(file):
                abs_file_path = os.path.join(SAVE_REVIEWS_DIRECTORY, file)
                if "yelp_reviews_" in file:
                    reviews = get_iterable(abs_file_path)
                    for review in reviews:
                        review_counter += 1
                        review_dict = ujson.loads(review)
                        business_id = review_dict.get("business_id", None)
                        review_text = review_dict.get("review_text", None)

                        wiki = tb(review_text)
                        tags = wiki.tags
                        adj_words_list = [
                            wd(fword).lemmatize("a") for fword in [
                                self.strip_special_chars(word.lower())
                                for word, tag in tags if
                                tag in ["JJ", "JJR", "JJS"] and len(word) > 2
                            ] if len(fword) > 2
                        ]
                        adj_list_string = " ".join(adj_words_list)
                        review_dict.update({"adjectives": adj_list_string})
                        write_line = ujson.dumps(review_dict)

                        cat_list = self.business_to_cat.get(business_id, [])
                        for cat in cat_list:
                            if cat == "bars":
                                bars_file.write(write_line + "\n")
                                bars_file_counter += 1
                                if bars_file_counter % 25000 == 0:
                                    del bars_file
                                    old_bars_file = bars_file_path
                                    new_bars_file = os.path.join(
                                        bars_dir,
                                        "yelp_reviews_bars_{c}.txt".format(
                                            c=str(
                                                int((bars_file_counter + 1) /
                                                    25000) + 1)))
                                    print(
                                        "25000 Reviews collected in `BARS` category, creating a new file to keep file sizes manageable"
                                    )
                                    print(
                                        'Changing file from : {old_bars_file} to {new_bars_file}'
                                        .format(old_bars_file=old_bars_file,
                                                new_bars_file=new_bars_file))
                                    bars_file = open(new_bars_file, 'w')
                            elif cat == "food":
                                food_file.write(write_line + "\n")
                                food_file_counter += 1
                                if food_file_counter % 25000 == 0:
                                    del food_file
                                    old_food_file = food_file_path
                                    new_food_file = os.path.join(
                                        food_dir,
                                        "yelp_reviews_food_{c}.txt".format(
                                            c=str(
                                                int((food_file_counter + 1) /
                                                    25000) + 1)))
                                    print(
                                        "25000 Reviews collected in `food` category, creating a new file to keep file sizes manageable"
                                    )
                                    print(
                                        'Changing file from : {old_food_file} to {new_food_file}'
                                        .format(old_food_file=old_food_file,
                                                new_food_file=new_food_file))
                                    food_file = open(new_food_file, 'w')
                            elif cat == "grooming":
                                grooming_file.write(write_line + "\n")
                                grooming_file_counter += 1
                                if grooming_file_counter % 25000 == 0:
                                    del grooming_file
                                    old_grooming_file = grooming_file_path
                                    new_grooming_file = os.path.join(
                                        grooming_dir,
                                        "yelp_reviews_grooming_{c}.txt".format(
                                            c=str(
                                                int((grooming_file_counter +
                                                     1) / 25000) + 1)))
                                    print(
                                        "25000 Reviews collected in `grooming` category, creating a new file to keep file sizes manageable"
                                    )
                                    print(
                                        'Changing file from : {old_grooming_file} to {new_grooming_file}'
                                        .format(
                                            old_grooming_file=old_grooming_file,
                                            new_grooming_file=new_grooming_file
                                        ))
                                    grooming_file = open(
                                        new_grooming_file, 'w')
                            elif cat == "learn":
                                learn_file.write(write_line + "\n")
                                learn_file_counter += 1
                                if learn_file_counter % 25000 == 0:
                                    del learn_file
                                    old_learn_file = learn_file_path
                                    new_learn_file = os.path.join(
                                        learn_dir,
                                        "yelp_reviews_learn_{c}.txt".format(
                                            c=str(
                                                int((learn_file_counter + 1) /
                                                    25000) + 1)))
                                    print(
                                        "25000 Reviews collected in `learn` category, creating a new file to keep file sizes manageable"
                                    )
                                    print(
                                        'Changing file from : {old_learn_file} to {new_learn_file}'
                                        .format(old_learn_file=old_learn_file,
                                                new_learn_file=new_learn_file))
                                    learn_file = open(new_learn_file, 'w')
                            elif cat == "leisure":
                                leisure_file.write(write_line + "\n")
                                leisure_file_counter += 1
                                if leisure_file_counter % 25000 == 0:
                                    del leisure_file
                                    old_leisure_file = leisure_file_path
                                    new_leisure_file = os.path.join(
                                        learn_dir,
                                        "yelp_reviews_leisure_{c}.txt".format(
                                            c=str(
                                                int((leisure_file_counter +
                                                     1) / 25000) + 1)))
                                    print(
                                        "25000 Reviews collected in `leisure` category, creating a new file to keep file sizes manageable"
                                    )
                                    print(
                                        'Changing file from : {old_leisure_file} to {new_leisure_file}'
                                        .format(
                                            old_leisure_file=old_leisure_file,
                                            new_leisure_file=new_leisure_file))
                                    leisure_file = open(new_leisure_file, 'w')
                            elif cat == "municipal":
                                municipal_file.write(write_line + "\n")
                                municipal_file_counter += 1
                                if municipal_file_counter % 25000 == 0:
                                    del municipal_file
                                    old_municipal_file = municipal_file_path
                                    new_municipal_file = os.path.join(
                                        municipal_dir,
                                        "yelp_reviews_municipal_{c}.txt".
                                        format(c=str(
                                            int((municipal_file_counter + 1) /
                                                25000) + 1)))
                                    print(
                                        "25000 Reviews collected in `municipal` category, creating a new file to keep file sizes manageable"
                                    )
                                    print(
                                        'Changing file from : {old_municipal_file} to {new_municipal_file}'
                                        .format(old_municipal_file=
                                                old_municipal_file,
                                                new_municipal_file=
                                                new_municipal_file))
                                    municipal_file = open(
                                        new_municipal_file, 'w')
                            elif cat == "planning":
                                planning_file.write(write_line + "\n")
                                planning_file_counter += 1
                                if planning_file_counter % 25000 == 0:
                                    del planning_file
                                    old_planning_file = planning_file_path
                                    new_planning_file = os.path.join(
                                        planning_dir,
                                        "yelp_reviews_planning_{c}.txt".format(
                                            c=str(
                                                int((planning_file_counter +
                                                     1) / 25000) + 1)))
                                    print(
                                        "25000 Reviews collected in `planning` category, creating a new file to keep file sizes manageable"
                                    )
                                    print(
                                        'Changing file from : {old_planning_file} to {new_planning_file}'
                                        .format(
                                            old_planning_file=old_planning_file,
                                            new_planning_file=new_planning_file
                                        ))
                                    planning_file = open(
                                        new_planning_file, 'w')
                            elif cat == "services":
                                services_file.write(write_line + "\n")
                                services_file_counter += 1
                                if services_file_counter % 25000 == 0:
                                    del services_file
                                    old_services_file = services_file_path
                                    new_services_file = os.path.join(
                                        services_dir,
                                        "yelp_reviews_services_{c}.txt".format(
                                            c=str(
                                                int((services_file_counter +
                                                     1) / 25000) + 1)))
                                    print(
                                        "25000 Reviews collected in `services` category, creating a new file to keep file sizes manageable"
                                    )
                                    print(
                                        'Changing file from : {old_services_file} to {new_services_file}'
                                        .format(
                                            old_services_file=old_services_file,
                                            new_services_file=new_services_file
                                        ))
                                    services_file = open(
                                        new_services_file, 'w')
                            elif cat == "shopping":
                                shopping_file.write(write_line + "\n")
                                shopping_file_counter += 1
                                if shopping_file_counter % 25000 == 0:
                                    del shopping_file
                                    old_shopping_file = shopping_file_path
                                    new_shopping_file = os.path.join(
                                        shopping_dir,
                                        "yelp_reviews_shopping_{c}.txt".format(
                                            c=str(
                                                int((shopping_file_counter +
                                                     1) / 25000) + 1)))
                                    print(
                                        "25000 Reviews collected in `shopping` category, creating a new file to keep file sizes manageable"
                                    )
                                    print(
                                        'Changing file from : {old_shopping_file} to {new_shopping_file}'
                                        .format(
                                            old_shopping_file=old_shopping_file,
                                            new_shopping_file=new_shopping_file
                                        ))
                                    shopping_file = open(
                                        new_shopping_file, 'w')
                            elif cat == "sports":
                                sports_file.write(write_line + "\n")
                                sports_file_counter += 1
                                if sports_file_counter % 25000 == 0:
                                    del sports_file
                                    old_sports_file = sports_file_path
                                    new_sports_file = os.path.join(
                                        sports_dir,
                                        "yelp_reviews_sports_{c}.txt".format(
                                            c=str(
                                                int((sports_file_counter + 1) /
                                                    25000) + 1)))
                                    print(
                                        "25000 Reviews collected in `sports` category, creating a new file to keep file sizes manageable"
                                    )
                                    print(
                                        'Changing file from : {old_sports_file} to {new_sports_file}'
                                        .format(
                                            old_sports_file=old_sports_file,
                                            new_sports_file=new_sports_file))
                                    sports_file = open(new_sports_file, 'w')
                            elif cat == "health":
                                health_file.write(write_line + "\n")
                                health_file_counter += 1
                                if health_file_counter % 25000 == 0:
                                    del health_file
                                    old_health_file = health_file_path
                                    new_health_file = os.path.join(
                                        health_dir,
                                        "yelp_reviews_health_{c}.txt".format(
                                            c=str(
                                                int((health_file_counter + 1) /
                                                    25000) + 1)))
                                    print(
                                        "25000 Reviews collected in `health` category, creating a new file to keep file sizes manageable"
                                    )
                                    print(
                                        'Changing file from : {old_health_file} to {new_health_file}'
                                        .format(
                                            old_health_file=old_health_file,
                                            new_health_file=new_health_file))
                                    health_file = open(new_health_file, 'w')
                            else:
                                other_file.write(write_line + "\n")
                                other_file_counter += 1
                                if other_file_counter % 25000 == 0:
                                    del other_file
                                    old_other_file = other_file_path
                                    new_other_file = os.path.join(
                                        other_dir,
                                        "yelp_reviews_other_{c}.txt".format(
                                            c=str(
                                                int((other_file_counter + 1) /
                                                    25000) + 1)))
                                    print(
                                        "25000 Reviews collected in `other` category, creating a new file to keep file sizes manageable"
                                    )
                                    print(
                                        'Changing file from : {old_other_file} to {new_other_file}'
                                        .format(old_other_file=old_other_file,
                                                new_other_file=new_other_file))
                                    other_file = open(new_other_file, 'w')
                                print("Excluded Category encountered: {cat}".
                                      format(cat=cat))

                        if review_counter % 100 == 0:
                            print("{num}00 reviews processed".format(
                                num=(str(int(review_counter / 100)))))

                        if review_counter >= PROCESS_N_REVIEWS_ONLY:
                            break

        print("{count} Reviews processed".format(count=review_counter))
    def process(self):
        # directory where stories are kept
        YELP_DATASET_DIR = config.get("YELP_DATASET_DIR", None)
        YELP_BUSINESSES_FILE_NAME = config.get("YELP_BUSINESSES_FILE_NAME",
                                               None)
        SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY = config.get(
            "SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY", None)
        SAVE_BUSINESSES_BY_STATE_DIRECTORY = config.get(
            "SAVE_BUSINESSES_BY_STATE_DIRECTORY", None)

        if not (YELP_DATASET_DIR and YELP_BUSINESSES_FILE_NAME
                and SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY
                and SAVE_BUSINESSES_BY_STATE_DIRECTORY):
            print(
                "config keys are not set correctly in the config file: socialconfig.py"
            )
            exit(0)

        if not (os.path.exists(YELP_DATASET_DIR)
                and os.path.isdir(YELP_DATASET_DIR)):
            print(
                "Either Yelp Dataset directory path is not set correctly in the socialconfig.py file \nOR\
					\nThe directory does not exist. Please make sure you have downloaded the yelp dataset(in JSON format) and copied the `yelp_academic_dataset_business.json` and `yelp_academic_dataset_business.json` files into the yelp_dataset sub-directory of your project directory(socialsentRun)"
            )
            exit()

        YELP_BUSINESSES_ABS_FILE_PATH = os.path.join(
            YELP_DATASET_DIR, YELP_BUSINESSES_FILE_NAME)
        try:
            f = open(YELP_BUSINESSES_ABS_FILE_PATH, 'r')
            if f:
                f.close()
        except IOError:
            msg = "Error opening file: {f}".format(
                f=YELP_BUSINESSES_ABS_FILE_PATH)
            print(msg)
            print(traceback.format_exc())
            exit()

        if not (os.path.exists(SAVE_BUSINESSES_BY_STATE_DIRECTORY)
                and os.path.isdir(SAVE_BUSINESSES_BY_STATE_DIRECTORY)):
            os.makedirs(SAVE_BUSINESSES_BY_STATE_DIRECTORY)
        if not (os.path.exists(SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY)
                and os.path.isdir(SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY)):
            os.makedirs(SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY)

        yelp_food_businesses_file = os.path.join(
            SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY,
            "yelp_businesses_category_food.json")
        yelp_travel_and_leisure_businesses_file = os.path.join(
            SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY,
            "yelp_businesses_category_travel_and_leisure.json")
        yelp_health_and_doctor_businesses_file = os.path.join(
            SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY,
            "yelp_businesses_category_health_and_doctor.json")
        yelp_sports_businesses_file = os.path.join(
            SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY,
            "yelp_businesses_category_sports.json")
        yelp_bars_businesses_file = os.path.join(
            SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY,
            "yelp_businesses_category_bars.json")
        yelp_shopping_businesses_file = os.path.join(
            SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY,
            "yelp_businesses_category_shopping.json")
        yelp_grooming_businesses_file = os.path.join(
            SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY,
            "yelp_businesses_category_grooming.json")
        yelp_learning_businesses_file = os.path.join(
            SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY,
            "yelp_businesses_category_learning.json")
        yelp_advice_businesses_file = os.path.join(
            SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY,
            "yelp_businesses_category_advice.json")
        yelp_services_businesses_file = os.path.join(
            SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY,
            "yelp_businesses_category_services.json")
        yelp_municipal_businesses_file = os.path.join(
            SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY,
            "yelp_businesses_category_municipal.json")

        businesses = get_businesses_iterable(YELP_BUSINESSES_ABS_FILE_PATH)
        i = 0
        for line in businesses:
            i += 1
            json_dict = ujson.loads(line)
            bus_id = json_dict.get("business_id", None)
            state = json_dict.get("state", None)
            city = json_dict.get("city", None)
            sub_cats = json_dict.get("categories", None)
            rev_count = json_dict.get("review_count", None)

            classified_categories = set()
            if not (sub_cats and bus_id):
                continue
            for sub_cat in sub_cats.split(","):
                sub_cat = sub_cat.strip()
                for classfied_category in category_dict:
                    for subcat in category_dict[classfied_category]:
                        if sub_cat == subcat:
                            classified_categories.add(classfied_category)
            write_files = []
            for final_cat in classified_categories:
                write_files.append(
                    os.path.join(
                        SAVE_BUSINESSES_BY_CATEGORY_DIRECTORY,
                        "yelp_businesses_in_category_{final_cat}.csv".format(
                            final_cat=final_cat)))

            if state:
                write_files.append(
                    os.path.join(
                        SAVE_BUSINESSES_BY_STATE_DIRECTORY,
                        "yelp_businesses_in_state_{state}.csv".format(
                            state=state)))

            for file in write_files:
                if os.path.exists(file) and os.path.isfile(file):
                    write_mode = 'a'
                else:
                    write_mode = 'w'

                if not state:
                    state = "NoStateName_EncodingIssue"
                if not city:
                    city = "NoCityName_EncodingIssue"
                if not rev_count:
                    rev_count = 0

                write_str = ujson.dumps(' '.join(
                    [bus_id, state, city, str(rev_count)]))

                with open(file, write_mode) as wfile:
                    try:
                        wfile.write(write_str + "\n")
                    except:
                        print("Cannot write business_id:{bus_id} to {file}".
                              format(bus_id=bus_id, file=file))
                        print(traceback.format_exc())
                        continue
            if i % 100 == 0:
                print("{num}00 businesses processed".format(
                    num=str(int(i / 100))))
        print("Processed {num} businesses".format(num=str(i)))
예제 #7
0
                        "!@#'\"%&()*,-./:;<=>?[\\]^_`{|}~"))
            return word.rstrip("@!#\"%&()*,-'./:;<=>?[\\]^_`{|}~").lstrip(
                "!@#'\"%&()*,-./:;<=>?[\\]^_`{|}~")
        else:
            return ""

    def has_special_chars(self, word):
        """Return True when `word` starts or ends with a punctuation character.

        Only the first and last characters are inspected; punctuation in the
        middle of the word is ignored. An empty `word` has no such character
        and yields False instead of raising IndexError.
        """
        special_chars = "@!#\"%&()*,-'./:;<=>?[\\]^_`{|}~"
        if not word:
            return False
        return word[0] in special_chars or word[-1] in special_chars


if __name__ == "__main__":

    SAVE_WORD_EMBEDDINGS_DIR = config.get("SAVE_WORD_EMBEDDINGS_DIR", None)
    SAVE_DICTIONARY_DIR = config.get("SAVE_DICTIONARY_DIR", None)
    SAVE_WORD2VEC_MODELS_DIR = config.get("SAVE_WORD2VEC_MODELS_DIR", None)
    REVIEWS_CATEGORIES_LIST = config.get("REVIEWS_CATEGORIES_LIST", None)
    WORD2VEC_MODELS_CONF = config.get("WORD2VEC_MODELS_CONF", None)

    if not (REVIEWS_CATEGORIES_LIST and len(REVIEWS_CATEGORIES_LIST) > 0):
        print(
            "Categories List is not defined in socialconfig.py, cannot determine the paths to category-specific word2vec stored models"
        )
        exit()

    if not (SAVE_WORD_EMBEDDINGS_DIR and SAVE_DICTIONARY_DIR
            and SAVE_WORD2VEC_MODELS_DIR):
        print(
            "config keys (SAVE_WORD_EMBEDDINGS_DIR and SAVE_DICTIONARY_DIR and SAVE_WORD2VEC_MODELS_DIR) are not set correctly in the config file: socialconfig.py"
import numpy as np
import ujson
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import re

import sys
from textblob import TextBlob as tb

from collections import defaultdict
from socialconfig import config

from textblob import TextBlob as tb
from textblob import Word as wd
# Scaler configured with an output range of -1 to 1.
scaler = MinMaxScaler(feature_range=(-1, 1))

# Directory settings come from socialconfig.py; a missing key yields None.
SAVE_POLARITIES_DIR, SAVE_REVIEWS_DIRECTORY, EVALUATE_SENTIMENTS_DIR = [
    config.get(_key, None)
    for _key in ("SAVE_POLARITIES_DIR", "SAVE_REVIEWS_DIRECTORY",
                 "EVALUATE_SENTIMENTS_DIR")
]

# Stop at the first required directory setting that is missing or empty.
for _setting, _complaint in (
        (SAVE_POLARITIES_DIR,
         "SAVE_POLARITIES_DIR is not defined in socialconfig.py, can not load embeddings"
         ),
        (SAVE_REVIEWS_DIRECTORY,
         "SAVE_REVIEWS_DIRECTORY is not defined in socialconfig.py, can not load documents to evaluate polarities"
         ),
):
    if not _setting:
        print(_complaint)
        exit()
# Drop the loop helpers so the module namespace matches the plain form.
del _key, _setting, _complaint

for category_entity in os.listdir(SAVE_POLARITIES_DIR):
    def create_dictionary(self):
        """Build bag-of-words files and gensim dictionaries per review category.

        Walks ``SAVE_REVIEWS_BY_CATEGORY_DIRECTORY``. Every directory whose
        path contains ``"category"`` and that holds ``yelp_reviews_*`` files
        is treated as one category, named after the directory. For each
        category the method:

        * writes the ``adjectives`` string of every review as one JSON line
          into ``<SAVE_BAG_OF_WORDS_DIR>/<category>/<category>_file_<n>.txt``,
          starting a new file every ``SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE``
          documents;
        * builds a ``Dictionary`` over all documents, plus one over reviews
          rated above 3 (``positive``) and one over reviews rated below 3
          (``negative``), and saves each in binary and text form together
          with a document-frequency listing sorted by descending frequency,
          under ``<SAVE_DICTIONARY_DIR>/Unfiltered/<category>``.

        Reviews that fail to parse are counted and skipped; reviews without
        adjectives are skipped; reviews without a rating contribute only to
        the overall dictionary.

        Raises:
            IOError: if ``SAVE_REVIEWS_BY_CATEGORY_DIRECTORY`` is not an
                existing directory.

        The process exits (after printing a message) when the configuration
        is incomplete.
        """
        YELP_DATASET_DIR = config.get("YELP_DATASET_DIR", None)
        SAVE_REVIEWS_BY_CATEGORY_DIRECTORY = config.get(
            "SAVE_REVIEWS_BY_CATEGORY_DIRECTORY", None)
        SAVE_DICTIONARY_DIR = config.get("SAVE_DICTIONARY_DIR", None)
        SAVE_BAG_OF_WORDS_DIR = config.get("SAVE_BAG_OF_WORDS_DIR", None)
        SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE = int(
            config.get("SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE", 25000))

        if not (YELP_DATASET_DIR and SAVE_REVIEWS_BY_CATEGORY_DIRECTORY
                and SAVE_DICTIONARY_DIR and SAVE_BAG_OF_WORDS_DIR):
            print(
                "config keys are not set correctly in the config file: socialconfig.py"
            )
            exit(0)

        SAVE_UNFILTERED_DICTIONARY_DIR = os.path.join(SAVE_DICTIONARY_DIR,
                                                      "Unfiltered")

        # Raise a real exception: raising a plain string is itself a
        # TypeError and hides the actual message.
        if not os.path.isdir(SAVE_REVIEWS_BY_CATEGORY_DIRECTORY):
            raise IOError("Directory {d} does not exist".format(
                d=SAVE_REVIEWS_BY_CATEGORY_DIRECTORY))

        for directory in (SAVE_BAG_OF_WORDS_DIR,
                          SAVE_UNFILTERED_DICTIONARY_DIR):
            if not os.path.isdir(directory):
                os.makedirs(directory)

        def bow_file_path(category, docs_written):
            # Integer division keeps the file index an int ("1", "2", ...)
            # on both Python 2 and Python 3.
            file_num = docs_written // SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE + 1
            return os.path.join(
                SAVE_BAG_OF_WORDS_DIR, category,
                "{cat}_file_{file_num}.txt".format(cat=category,
                                                   file_num=file_num))

        def save_dictionary(docs, target_dir, stem):
            # Persist one dictionary (binary + text) and its document
            # frequencies, most frequent token first.
            dictionary = Dictionary(docs)
            dictionary.save(os.path.join(target_dir, stem + "_dict.dict"))
            dictionary.save_as_text(
                os.path.join(target_dir, stem + "_dict.txt"))
            sorted_doc_freqs = sorted(dictionary.dfs.items(),
                                      key=lambda x: x[1],
                                      reverse=True)
            with open(
                    os.path.join(target_dir,
                                 stem + "_words_doc_frequencies.txt"),
                    'w') as df_file:
                for (token_id, doc_freq) in sorted_doc_freqs:
                    # NOTE(review): str(<bytes>) yields "b'...'" on Python 3;
                    # kept as-is because the target interpreter is not
                    # visible from here - confirm before changing.
                    df_file.write(
                        str(
                            dictionary.get(token_id, "Unknown").encode(
                                'utf-8')) + " " + str(doc_freq) + "\n")

        for pardir, sub_dirs, files in os.walk(
                SAVE_REVIEWS_BY_CATEGORY_DIRECTORY):
            review_files = [name for name in files if "yelp_reviews_" in name]
            # Directories without review files have no category to process;
            # skipping them avoids using an undefined or stale category name.
            if not review_files or "category" not in pardir:
                continue

            yelp_category = os.path.basename(os.path.normpath(pardir))
            category_bow_dir = os.path.join(SAVE_BAG_OF_WORDS_DIR,
                                            yelp_category)
            if not os.path.isdir(category_bow_dir):
                os.makedirs(category_bow_dir)

            error_count = 0
            doc_count = 0
            review_docs = []
            negative_docs = []
            positive_docs = []

            print(
                "Writing docs (in bag of words form) for {cat} to directory: {d}"
                .format(cat=yelp_category, d=category_bow_dir))

            # One output stream per category: reopening it with 'w' for each
            # review file would truncate what earlier files had written.
            bow_file = open(bow_file_path(yelp_category, doc_count), 'w')
            try:
                for review_file in review_files:
                    reviews = get_reviews_iterable(
                        os.path.join(pardir, review_file))
                    for review in reviews:
                        try:
                            review_dict = ujson.loads(review)
                        except (ValueError, TypeError):
                            # Unparseable line: count it and move on rather
                            # than reusing the previous review's data.
                            error_count += 1
                            continue

                        adjs = review_dict.get("adjectives", None)
                        if not adjs:
                            continue
                        try:
                            rating = int(review_dict.get("rating"))
                        except (TypeError, ValueError):
                            # Missing/invalid rating: the review is unrated
                            # and must not be filed as negative.
                            rating = None

                        tokens = adjs.strip().split()
                        doc_count += 1
                        # NOTE(review): passing bytes to ujson.dumps may be
                        # rejected by recent ujson releases on Python 3;
                        # kept as-is - confirm the target interpreter.
                        bow_file.write(
                            ujson.dumps(adjs.encode("utf-8")) + "\n")
                        review_docs.append(tokens)

                        if rating:
                            if rating > 3:
                                positive_docs.append(tokens)
                            elif rating < 3:
                                negative_docs.append(tokens)

                        if (doc_count %
                                SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE) == 0:
                            bow_file.close()
                            bow_file = open(
                                bow_file_path(yelp_category, doc_count), 'w')
            finally:
                bow_file.close()

            print("Wrote {total} docs in {cat} category".format(
                total=str(doc_count), cat=yelp_category))

            CATEGORY_SPECIFIC_DICT_DIR = os.path.join(
                SAVE_UNFILTERED_DICTIONARY_DIR, yelp_category)
            POSITIVE_SUB_DIR = os.path.join(CATEGORY_SPECIFIC_DICT_DIR,
                                            "positive")
            NEGATIVE_SUB_DIR = os.path.join(CATEGORY_SPECIFIC_DICT_DIR,
                                            "negative")
            # Check each directory on its own so a pre-existing category
            # directory without its sub-directories still works.
            for directory in (CATEGORY_SPECIFIC_DICT_DIR, POSITIVE_SUB_DIR,
                              NEGATIVE_SUB_DIR):
                if not os.path.isdir(directory):
                    os.makedirs(directory)

            # Release each document list as soon as its dictionary is saved
            # to keep peak memory down.
            save_dictionary(review_docs, CATEGORY_SPECIFIC_DICT_DIR,
                            yelp_category)
            del review_docs
            save_dictionary(positive_docs, POSITIVE_SUB_DIR,
                            yelp_category + "_pos")
            del positive_docs
            save_dictionary(negative_docs, NEGATIVE_SUB_DIR,
                            yelp_category + "_neg")
            del negative_docs

            print(
                "{count} {cat} reviews were discarded because of parsing errors"
                .format(count=error_count, cat=yelp_category))
            print("Created dictionary for {cat} tokens".format(
                cat=yelp_category))
예제 #10
0
                        "!@#'\"%&()*,-./:;<=>?[\\]^_`{|}~"))
            return word.rstrip("@!#\"%&()*,-'./:;<=>?[\\]^_`{|}~").lstrip(
                "!@#'\"%&()*,-./:;<=>?[\\]^_`{|}~")
        else:
            return ""

    def has_special_chars(self, word):
        """Return True when `word` starts or ends with a punctuation character.

        Only the first and last characters are inspected; punctuation in the
        middle of the word is ignored. An empty `word` has no such character
        and yields False instead of raising IndexError.
        """
        special_chars = "@!#\"%&()*,-'./:;<=>?[\\]^_`{|}~"
        if not word:
            return False
        return word[0] in special_chars or word[-1] in special_chars


if __name__ == "__main__":

    SAVE_BAG_OF_WORDS_DIR = config.get("SAVE_BAG_OF_WORDS_DIR", None)
    SAVE_WORD2VEC_MODELS_DIR = config.get("SAVE_WORD2VEC_MODELS_DIR", None)
    PROCESS_N_REVIEWS_ONLY_PER_CATEGORY = int(
        config.get("PROCESS_N_REVIEWS_ONLY_PER_CATEGORY", 25000))
    SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE = int(
        config.get("SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE", 25000))
    FILTERED_DICTINARIES_CONF = config.get("FILTERED_DICTINARIES_CONF", {})
    FILTER_DICTIONARY_TOP_N_WORDS_PER_CATEGORY = int(
        config.get("FILTER_DICTIONARY_TOP_N_WORDS_PER_CATEGORY", 10000))
    SAVE_DICTIONARY_DIR = config.get("SAVE_DICTIONARY_DIR", None)
    WORD2VEC_MODELS_CONF = config.get("WORD2VEC_MODELS_CONF", None)

    if not (WORD2VEC_MODELS_CONF and isinstance(WORD2VEC_MODELS_CONF, dict)
            and len(WORD2VEC_MODELS_CONF)):
        print(
            "No Model Configurations specified in the socialconfig.py file for training the Word2vec models"