Exemplo n.º 1
0
import file_save_load as fsl
import csv
import unicodedata
from unidecode import unidecode
import io

# Load the movie dataset through the project's file_save_load helper.
movies = fsl.read_from_file(
    "imdb_dataset_v7.1_6_actors_complete_wrong_genre.tsv",
    6,
)

# Maps each title to the list of every genre string found for it
# in the filtered genres file (one list entry per matching row).
movies_and_genres = {}

with open("IMDB_files_link/_filtered_data/genres.filtered") as data_file:
    # The reader is configured with '\r' as its delimiter, so every row comes
    # back as a list of string pieces; they are glued back with a space.
    for pieces in csv.reader(data_file, delimiter='\r'):
        record = " ".join(pieces)
        # Everything before the first tab is the title; the rest is the genre
        # field, from which any remaining tab characters are removed.
        title, _, remainder = record.partition("\t")
        genre = remainder.replace("\t", "")
        # Start a new list on the first sighting of a title, then append.
        movies_and_genres.setdefault(title, []).append(genre)

# print movies_and_genres["Den skaldede frisor (2012)"]

# for title in movies_and_genres:
#     if "skaldede" in title:
Exemplo n.º 2
0
import csv, wikipedia, unicodedata, wptools
import time
import file_save_load as fsl

# Name of the dataset file handed to the project loader.
fileName = 'datasetV_20161118-080950'
# Second argument for the loader — presumably the number of actors stored per
# movie; confirm against file_save_load.read_from_file.
actors_amount = 6

# Pass the named setting instead of repeating the literal 6, so that changing
# actors_amount above actually affects what is loaded.
movies = fsl.read_from_file(fileName, actors_amount)


def compute_jaccard_index(list_1, list_2):
    """Return the Jaccard index of the word sets of two strings.

    Despite their names, ``list_1`` and ``list_2`` are strings. Every "(" and
    ")" character is removed from each, the result is split on whitespace,
    and the two resulting word sets are compared.

    Args:
        list_1: First string to compare.
        list_2: Second string to compare.

    Returns:
        ``len(intersection) / len(union)`` of the two word sets as a float
        between 0.0 and 1.0. When neither string contains any word (so the
        union is empty) 0.0 is returned rather than dividing by zero.
    """
    words_1 = set(list_1.replace("(", "").replace(")", "").split())
    words_2 = set(list_2.replace("(", "").replace(")", "").split())

    union = words_1 | words_2
    if not union:
        # Both inputs were empty / whitespace / parentheses only: there is
        # nothing to compare, and the ratio below would divide by zero.
        return 0.0

    # float() keeps this a true division under Python 2 as well.
    return len(words_1 & words_2) / float(len(union))


# [title_without_year, full_title] pairs for movies whose budget is "no_info".
movies_with_no_budget = []
# Result containers, created empty here; nothing in this block fills them.
wiki_no_budget = []
wiki_budget_ok = []
different_budget_keys = set()

for title in movies:
    # Only movies explicitly marked as having no budget information matter.
    if movies[title]["budget"] != "no_info":
        continue
    # The short title is everything before the first "(" of the full title.
    full_title, title_no_year = title, title.partition("(")[0]
    movies_with_no_budget.append([title_no_year, full_title])

counter = 0
for title in movies_with_no_budget:
Exemplo n.º 3
0
import csv
import time
import ast
import file_save_load as fsl

######################################################
#               adding new budgets
######################################################

# Second argument for the project loader below.
actors_amount = 3

# Dataset name, and the budgets name built by prefixing it.
fileNameDataset = "datasetV_20161118-080950"
fileNameBudgets = "_wiki_budg_for_" + fileNameDataset

# Load the dataset through the project's file_save_load helper.
movies = fsl.read_from_file(fileNameDataset, actors_amount)

# with open('files/datasetV_20161116-203716') as csvfile:
#     reader = csv.DictReader(csvfile, delimiter = "\t")
#     for entry in reader:
#         movies[
#             entry["title"]
#         ] = {
#         "director":     entry["director"],
#         "rating":       entry["rating"],
#         "votes":        entry["votes"],
#         "year":         entry["year"],
#         "genre":        entry["genre"],
#         "gross":        entry["gross"],
#         "budget":       entry["budget"],
#         "run-time":     entry["run-time"] ,
#         "actor1":       entry["actor1"],
#         "actor1_rank":  entry["actor1_rank"],
Exemplo n.º 4
0
import wikipedia, unicodedata
import file_save_load as fsl
import time

# Second argument for the project loader below.
actors_amount = 6
# Name of the dataset handed to the loader.
fileName = "imdb_dataset_v7_no_plots"

# Load the dataset through the project's file_save_load helper.
movies = fsl.read_from_file(fileName, actors_amount)
# with open('files/datasetV_20161116-203716') as csvfile:
#     reader = csv.DictReader(csvfile, delimiter = "\t")
#     for entry in reader:
#         movies[
#             entry["title"]
#         ] = {
#         "director":     entry["director"],
#         "rating":       entry["rating"],
#         "votes":        entry["votes"],
#         "year":         entry["year"],
#         "genre":        entry["genre"],
#         "gross":        entry["gross"],
#         "budget":       entry["budget"],
#         "run-time":     entry["run-time"] ,
#         "actor1":       entry["actor1"],
#         "actor1_rank":  entry["actor1_rank"],
#         "actor1_sex":   entry["actor1_sex"],
#         "actor2":       entry["actor2"],
#         "actor2_rank":  entry["actor2_rank"],
#         "actor2_sex":   entry["actor2_sex"],
#         "actor3":       entry["actor3"],
#         "actor3_rank":  entry["actor3_rank"],
#         "actor3_sex":   entry["actor3_sex"],