Example #1
def cast_data(header, tablename, data):
    # build one type-caster per column based on the table's SQL type mapping
    typedict = get_typedict(tablename)
    type_casters = []
    for i in range(len(header)):
        sql_type = typedict[header[i]]
        if sql_type == text_type:
            type_casters.append(lambda value: value.encode('UTF-8'))
        elif sql_type == int_type:
            type_casters.append(int)
        elif sql_type == date_type:
            type_casters.append(timestamp_parser.parse)

    log('casting data for ' + str(len(data)) + " rows")

    def cast_line(dataln):
        # apply each column's caster to the matching value in the row
        cast_row = []
        for col_id in range(len(dataln)):
            cast_row.append(type_casters[col_id](dataln[col_id]))
        return cast_row

    # NOTE: a process-based Pool cannot pickle the nested cast_line function,
    # so a thread-backed Pool (multiprocessing.dummy) is assumed here.
    tpool = Pool(processes=6)
    ret = tpool.map(cast_line, data)
    tpool.close()
    return ret
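
The nested cast_line function is handed straight to Pool.map, which only works if Pool here is thread-backed: a process pool cannot pickle nested functions. A minimal, self-contained sketch of that assumption (not repo code):

# Sketch (assumption, not repo code): a thread-backed Pool can map a nested
# function, whereas a process-based multiprocessing.Pool would fail to pickle it.
from multiprocessing.dummy import Pool  # threads, same API as multiprocessing.Pool

def demo():
    def square(x):  # nested worker function, fine with threads
        return x * x

    with Pool(processes=4) as tpool:
        return tpool.map(square, [1, 2, 3])

print(demo())  # [1, 4, 9]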
 def get_titles_list(self, results_list):
     if verbose:
         log("scraping title")
     for result in results_list:
         self.scraped_dict["titles"].append(
             str(result.find("span", {
                 "class": "title"
             }).string))
Example #3
def main():
    try:
        log_return()
        log("starting")
        clear_downloaded_dir()
        run()
        compress_to_zip_and_upload()

    except Exception:
        log_error(traceback.format_exc())
        log_error(pformat(traceback.format_stack()))
        raise  # re-raise with the original traceback intact
 def get_good_games(self, merged_results, keys):
     n_rev_idx = keys['n_user_reviews']
     min_positive_idx = keys['percent_reviews_positive']
     ret = []
     before = len(merged_results)
     for result in merged_results:
         if (result[n_rev_idx] >= self.min_reviews
                 and result[min_positive_idx] >= self.min_positive):
             ret.append(result)
     log(
         str(len(ret)) + " out of " + str(before) +
         " had good enough reviews")
     return ret
 def get_highly_discounted(self, merged_results, keys):
     percents_index = keys["discount_percents"]
     # parameters for get_good_games
     # todo make configurable
     merged_results.sort(key=lambda p: p[percents_index], reverse=True)
     before = len(merged_results)
     # results are sorted by discount, so keep everything up to the first
     # one that falls below the minimum discount
     cutoff = len(merged_results)
     for i in range(len(merged_results)):
         if merged_results[i][percents_index] < self.minimum_discount:
             cutoff = i
             break
     merged_results = merged_results[:cutoff]
     log(
         str(len(merged_results)) + " out of " + str(before) +
         " had deep enough discount")
     return merged_results
 def clean_extra_dots(__str):
     # to deal with prices higher than 1000 (thousands separators)
     dots_idx = [m.start() for m in re.finditer(r"\.", __str)]
     if len(dots_idx) > 1:
         log("found multiple dots in price {}".format(__str))
         # drop every dot except the last one (the decimal separator)
         for dot_idx in reversed(dots_idx[:-1]):
             __str = __str[:dot_idx] + __str[dot_idx + 1:]
         log("cleaned multiple dots. is now {}".format(__str))
     return __str
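
A quick illustration (hypothetical values, not repo code) of what the dot-cleaning does to a thousands-separated price, assuming clean_extra_dots and its log helper are in scope:

# Illustrative check only: every dot except the last one is dropped.
assert clean_extra_dots("1.234.56") == "1234.56"
assert clean_extra_dots("9.99") == "9.99"  # a single dot is left untouched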
Example #7
def format_and_save_results(result, region_name):
    path = downloaded_dir + region_name + ".json"

    json_output = {
        "timestamp": str(datetime.datetime.now()),
        "filter settings": {
            "minimum_discount": 40,
            "min_reviews": 10,
            "min_positive": 40
        },
        "items": result
    }

    with open(path, "w", encoding='UTF-8') as json_file:
        json_file.write(json.dumps(json_output, indent=4))

    log("done saving json to disk")
    def get_user_reviews(self, results):
        # returns 2 lists
        # the first list is how many user reviews the result got
        # the second list is what percentage was positive
        n_user_reviews = []
        percent_reviews_positive = []
        found = 0
        if verbose:
            log("scraping reviews")
        for result in results:
            var = result.find("span", {"class": "search_review_summary"})
            if var is not None:  # if true it contains a review summary
                var = str(var)
                of_the_str = "% of the "
                of_the_start = var.find(of_the_str)
                of_the_end = of_the_start + len(of_the_str)
                # this part checks what percentage of the reviews were positive
                percent_positive_as_str = ""
                # only the (at most 3) digits right before "% of the" matter
                for char in var[of_the_start - 3:of_the_start]:
                    if char in ints_str:
                        percent_positive_as_str += char

                percent_reviews_positive.append(int(percent_positive_as_str))

                # this part gets how many reviews there are
                temp_n_reviews = ""
                for char in var[of_the_end:]:
                    if char == " ":
                        break
                    else:
                        if not char == "," and not char == ".":
                            temp_n_reviews += char
                # print("reviews " + temp_n_reviews)
                n_user_reviews.append(int(temp_n_reviews))

                found += 1
            else:
                n_user_reviews.append(0)
                percent_reviews_positive.append(0)
        for i in range(len(n_user_reviews)):
            self.scraped_dict['n_user_reviews'].append(n_user_reviews[i])
            self.scraped_dict['percent_reviews_positive'].append(
                percent_reviews_positive[i])
 def get_discount_percents(self, results_list):
     if verbose:
         log('scraping discount percents')
     discount_percents = []
     for r in results_list:
         string = str(
             r.find("div",
                    {"class": "col search_discount responsive_secondrow"}))
         span = "<span>"
         # for some reason not all results have a discount number
         if string.find(span) != -1:
             # the +1 and -1 are to cut off the - and the %
             start = string.find(span) + len(span) + 1
             end = string.find("</span>") - 1
             discount_percents.append(int(string[start:end]))
         else:
             discount_percents.append(0)
     for item in discount_percents:
         self.scraped_dict["discount_percents"].append(item)
Example #10
def run():
    original_db_cursor = create_connection(comments_db_path).cursor()
    new_db = create_connection(ROOTDIR + dir_sep + "dirty_bodies.db")
    new_db_cursor = new_db.cursor()

    # clear the target table and reset its autoincrement counter
    new_db_cursor.execute("DELETE FROM bodies")
    new_db_cursor.execute("delete from sqlite_sequence where name='bodies'")
    original_db_cursor.execute("select body from May2015")
    data = original_db_cursor.fetchmany(rows_per_loop)
    more_data = True
    while more_data:
        query = "insert into bodies (bodies) values (?)"
        log("inserting 50k rows")
        new_db_cursor.executemany(query, data)
        new_db.commit()

        data = original_db_cursor.fetchmany(rows_per_loop)
        if len(data) == 0:
            more_data = False
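
The loop above is the usual fetchmany/executemany batching idiom for copying a large table without loading it all into memory. A generic, hedged sketch of the same pattern (function and parameter names are placeholders, not from the repo):

# Generic sketch of the batch-copy idiom used above.
def copy_in_batches(src_cursor, dst_conn, batch_size=50000):
    src_cursor.execute("SELECT body FROM May2015")
    dst_cursor = dst_conn.cursor()
    while True:
        rows = src_cursor.fetchmany(batch_size)  # rows come back as 1-tuples
        if not rows:
            break
        dst_cursor.executemany("INSERT INTO bodies (bodies) VALUES (?)", rows)
        dst_conn.commit()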
Example #11
 def delete_duplicates(self, merged_results, keys):
     # the final results page seems to get padded with repeated entries
     # until it reaches 25 items, so duplicates can show up
     doubles_found = 0
     ret = []
     appid_key = keys['appids']
     is_bundle_key = keys['is_bundle']
     for line in merged_results:
         if len(ret) > 0:
             double = False
             for i in ret:
                 if (i[appid_key] == line[appid_key]
                         and i[is_bundle_key] == line[is_bundle_key]):
                     double = True
                     doubles_found += 1
                     break
             if not double:
                 ret.append(line)
         else:
             ret.append(line)
     log('removed ' + str(doubles_found) + ' duplicates')
     return ret
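
Design note: the inner loop makes this quadratic in the number of results. A hedged alternative sketch (not the author's code) that removes the same duplicates in one pass by remembering the (appid, is_bundle) pairs already seen:

def delete_duplicates_fast(merged_results, keys):
    # Same output as above, but O(n): track (appid, is_bundle) pairs in a set.
    seen = set()
    ret = []
    for line in merged_results:
        marker = (line[keys['appids']], line[keys['is_bundle']])
        if marker not in seen:
            seen.add(marker)
            ret.append(line)
    return ret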
Example #12
def run(is_test):
    for filename in os.listdir(tables_dir):
        if filename == 'COUNTRY_UNITED_STATES.csv' or not is_test:
            log('dumping data for ' + filename)
            with open(tables_dir + filename, 'r', encoding='UTF-8') as file:
                reader = csv.reader(file)
                data = []
                for row in reader:
                    data.append(row)

            header = data[0]
            data = data[1:]
            if is_test:
                data = data[:100]
            for i in range(len(header)):
                header[i] = header[i].replace(" ", "_")
            tablename = filename.replace(".csv", '').replace(" ", "_").replace(
                '-', '_').replace('.', "")
            data = cast_data(header, tablename, data)
            q = build_queries(header, data, tablename, False)
            run_queries(q)
Example #13
def compress_to_zip_and_upload():
    files = [
        f for f in listdir(downloaded_dir) if isfile(join(downloaded_dir, f))
    ]

    def json_filter(file_name):
        # keep only .json files
        regex = re.compile(r".*\.json$")
        return regex.match(file_name) is not None

    files = list(filter(json_filter, files))
    chdir(downloaded_dir)
    with ZipFile(downloaded_dir + zipfile_name,
                 mode="w",
                 compression=ZIP_DEFLATED) as zip_file:
        for file in files:
            zip_file.write(file)
    chdir(ROOTDIR)
    log("saving to s3")
    s3 = boto3.resource("s3")
    s3.meta.client.upload_file(downloaded_dir + zipfile_name, bucket_name,
                               zipfile_name)
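
The upload goes through s3.meta.client; an equivalent form uses the S3 client directly (same boto3 upload_file call, shown only as a usage note; downloaded_dir, bucket_name and zipfile_name come from the surrounding module):

s3_client = boto3.client("s3")
s3_client.upload_file(downloaded_dir + zipfile_name, bucket_name, zipfile_name)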
    def get_app_id(self, results_list):
        # The app id(s) are stored as strings for now, since nothing needs
        # them as ints (and there's no obvious reason it ever would).
        if verbose:
            log("scraping appids")
        for result in results_list:
            try:
                # multiple appids means it's an old-style bundle
                if ',' in result['data-ds-appid']:
                    self.scraped_dict["appids"].append(
                        result['data-ds-packageid'])
                    self.scraped_dict["is_bundle"].append(True)
                    self.scraped_dict["is_old_bundle"].append(True)
                    self.scraped_dict["new_cdn_id"].append("")

                else:
                    self.scraped_dict["appids"].append(result['data-ds-appid'])
                    self.scraped_dict["is_bundle"].append(False)
                    self.scraped_dict["is_old_bundle"].append(False)
                    self.scraped_dict["new_cdn_id"].append("")
            except KeyError:
                self.scraped_dict["appids"].append(result['data-ds-bundleid'])
                self.scraped_dict["is_bundle"].append(True)
                self.scraped_dict["is_old_bundle"].append(False)

                # The url for the thumbnail.
                url = result.find("div", {
                    "class": "search_capsule"
                }).find("img")["src"]
                cdn_id = re.search(r"/(\w+)/capsule", url)

                if cdn_id is None:
                    log_warning(
                        "could not find img_id in {} appending blank string".
                        format(url))
                    self.scraped_dict["new_cdn_id"].append("")
                else:
                    self.scraped_dict["new_cdn_id"].append(cdn_id.group(1))
Example #15
def do_scrape(region_dict, use_proxy=False):

    aws_region = list(region_dict.keys())[0]
    region_name = region_dict[aws_region]

    def scrape_and_save(_proxy):
        log("scraping {}".format(region_name))
        format_and_save_results(run_scrape(is_test=is_test, proxy=_proxy),
                                region_name)

    if use_proxy:
        log("scraping with proxy in aws region {}".format(aws_region))
        ec2 = boto3.client("ec2", region_name=aws_region)

        def get_proxy_instance(_ec2):
            resp = _ec2.describe_instances()
            for _inst in resp["Reservations"][0]["Instances"]:
                for tag in _inst["Tags"]:
                    if tag["Value"] == "steam_app":
                        return _inst
            return None

        inst = get_proxy_instance(ec2)
        if inst is None:
            log_warning("no instance found in {}".format(aws_region))
        else:
            inst_id = inst["InstanceId"]
            tries = 5
            while inst["State"]["Name"] != "running" and tries > 0:
                log("proxy not running in {}. starting now. {} tries left.".
                    format(aws_region, tries))
                ec2.start_instances(InstanceIds=[inst_id])
                time.sleep(45)
                inst = get_proxy_instance(ec2)
                tries -= 1

            proxy_ip = inst["PublicIpAddress"]
            proxy = "http://{}:{}".format(proxy_ip, proxy_port)
            scrape_and_save(proxy)
            log("saved results. now stopping proxy in {}".format(aws_region))
            ec2.stop_instances(InstanceIds=[inst_id])
    else:
        log("doing scrape without proxy in region {}".format(aws_region))
        proxy = None
        scrape_and_save(proxy)
Example #16
def run_queries(queries):
    for line in queries:
        log('executing: ' + line[1])
        if line[0] == 'one':
            db_cursor.execute(line[1])
        elif line[0] == 'many':
            db_cursor.executemany(line[1], line[2])
        else:
            log('could not execute query')
        log('Query ok')
    db.commit()
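
The shape of queries is only implied by the branches above; a hypothetical illustration of what run_queries expects (the placeholder style depends on the DB driver in use):

# Hypothetical example of the query list run_queries consumes:
# ("one", sql) for single statements, ("many", sql, rows) for executemany.
example_queries = [
    ("one", "CREATE TABLE IF NOT EXISTS demo (id INT, name TINYTEXT)"),
    ("many", "INSERT INTO demo (id, name) VALUES (%s, %s)",
     [(1, "a"), (2, "b")]),
]
# run_queries(example_queries)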
    def filter(dirty_data):
        log("starting filter")
        tpool = Pool(processes=cpus)
        ret = []
        log("filtering deleted and not english")
        for line in tpool.map(Filter.__is_not_deleted_or_not_non_english,
                              dirty_data):
            if line[1]:
                ret.append(line[0])

        def clean_links_and_punctuation(comment):
            words = comment.split(" ")
            words = list(map(Filter.__filter_links, words))
            return " ".join(words)

        log("filtering links and punctuation")
        ret = tpool.map(clean_links_and_punctuation, ret)
        tpool.close()
        log("filter done")
        return ret
Example #18
def run_scrape(is_test, proxy=None):
    log("running scrape")
    if proxy is None:
        http = urllib3.PoolManager()
    else:
        http = urllib3.ProxyManager(proxy_url=proxy)

    results_as_strs = []
    if is_test:
        num_pages = 3
    else:
        num_pages = get_number_pages(http)
    data_scraper = Data_Scraper()
    data_scraper.scraped_dict = collections.defaultdict(
        list)  # the old dict does not seem to get gc'd between runs, so recreate it

    # for testing
    # i = 1
    # page_results_as_bs4 = get_results_from_page_n(i, http)
    # log("got page " + str(i) + "/" + str(num_pages))
    # apply_data_scraping(page_results_as_bs4, data_scraper)
    # i = 28
    # page_results_as_bs4 = get_results_from_page_n(i, http)
    # log("got page " + str(i) + "/" + str(num_pages))
    # apply_data_scraping(page_results_as_bs4, data_scraper)

    for i in range(1, num_pages + 1):
        page_results_as_bs4 = get_results_from_page_n(i, http)
        log("got page " + str(i) + "/" + str(num_pages))
        apply_data_scraping(page_results_as_bs4, data_scraper)

    merged_results, keys = apply_filters(data_scraper.scraped_dict)
    log('scrape done')

    # formats results
    output = []
    for i in range(len(merged_results)):
        item = {}
        for key in keys:
            item[key] = merged_results[i][keys[key]]
        output.append(item)

    return output
def clean_data(dirty_db_path, clean_db_path):
    dirty_db_cursor = create_connection(dirty_db_path).cursor()
    clean_db = create_connection(clean_db_path)
    clean_db_cursor = clean_db.cursor()

    clean_db_cursor.execute("DELETE FROM bodies")
    clean_db_cursor.execute("delete from sqlite_sequence where name='bodies'")

    dirty_db_cursor.execute("select bodies from bodies")
    data = dirty_db_cursor.fetchmany(rows_per_loop)

    log("start")
    log("detected " + str(cpus) + " as cpu count")
    inserted = 0
    more_data = True
    while more_data:
        log("cleaning data")
        data = list(map(lambda i: i[0].replace("\n", " "), data))
        data = Filter.filter(data)
        data = list(map(lambda line: (line, ), data))

        log("inserting 100k rows")
        query = "insert into bodies (bodies) values (?)"
        clean_db_cursor.executemany(query, data)
        clean_db.commit()

        log("done loop, getting more data.")
        inserted += len(data)
        data = dirty_db_cursor.fetchmany(rows_per_loop)
        if len(data) < 1:
            more_data = False
    log("done")
    log("inserted " + str(inserted) + " rows")
Example #20
            db_cursor.execute(raw_input("Enter query "))
            r = db_cursor.fetchall()
            pprint(r)
            db.commit()
        except _mysql_exceptions.ProgrammingError as e:
            print(str(e))


tables_dir = ROOTDIR + dir_sep + 'csv_done' + dir_sep
# todo: make slightly faster by having separate query and preparation threads

int_type = "INT"
text_type = 'TINYTEXT'
date_type = 'DATETIME'

region_table_types = {
    'timestamp': date_type,
    'search': text_type,
    'tweet_volume': int_type,
    'is_promoted_contend': text_type
}
global_table_types = {'country_table': text_type}
country_table_types = {'region_table': text_type, 'woeid': int_type}

db_cursor, db = init_db()

if __name__ == '__main__':
    log_return()
    log('starting app')
    run(False)
    start_cli()
    def get_old_and_new_price(self, results_list):
        if verbose:
            log('scraping the old+new price')

        def set_curr_symbol(_price_str):
            for sym in self.curr_symbol_list:
                if sym in _price_str:
                    self.curr_symbol = sym
                    break

        def safe_to_float(_str):
            try:
                return float(_str)
            except Exception as e:
                log_error("error converting price to float:")
                log_error(traceback.format_exc())
                log_error(pformat(traceback.format_stack()))
                return float(0)

        for result in results_list:
            cont = result.find(
                'div',
                {'class': 'col search_price discounted responsive_secondrow'})
            if cont is not None:
                old_price = cont.find("strike").text
                new_price = cont.text.replace(old_price, "")

                if self.curr_symbol is None:
                    set_curr_symbol(old_price)

                new_price = new_price.replace('\t', '').replace('\n', '')

                def clean(_str):
                    def clean_extra_dots(__str):
                        # to deal with prices higher than 1000
                        # (thousands separators)
                        dots_idx = [
                            m.start() for m in re.finditer(r"\.", __str)
                        ]
                        if len(dots_idx) > 1:
                            log("found multiple dots in price {}".format(
                                __str))
                            # drop every dot except the last (decimal) one
                            for dot_idx in reversed(dots_idx[:-1]):
                                __str = __str[:dot_idx] + __str[dot_idx + 1:]
                            log("cleaned multiple dots. is now {}".format(
                                __str))
                        return __str

                    # a price with no decimal places is apparently shown
                    # with "--", so treat that as 0
                    return clean_extra_dots(
                        _str.replace(',', '.').replace(self.curr_symbol, "")
                        .replace('--', '0'))

                new_price = clean(new_price)
                old_price = clean(old_price)
                if "Free" in new_price:
                    new_price = 0

                self.scraped_dict["old_price"].append(safe_to_float(old_price))
                self.scraped_dict["new_price"].append(safe_to_float(new_price))

            else:
                log_warning(
                    "could not find price container, probably because the item isn't discounted. appending 0s"
                )
                self.scraped_dict["old_price"].append(float(0))
                self.scraped_dict["new_price"].append(float(0))
Example #22
            raise log_and_raise_exept(
                "No tweet_text.txt at the provided path: " + path)
    if unique_id is not None:
        crontab = sh.crontab("-l").stdout.decode("ascii")
        try:
            crontab.index(unique_id)
        except ValueError:
            log_and_raise_exept(
                "The crontab does not contain the unique string: " + unique_id)

    api = get_api()
    tweet_text = get_tweet_text(path)
    media_ids = get_media_ids(path, api)

    if not is_test:
        try:
            if media_ids is None:
                api.update_status(status=tweet_text)
            else:
                api.update_status(status=tweet_text, media_ids=media_ids)
        except Exception as e:
            log(str(e))
            raise e

    if unique_id is not None:
        delete_cron_line(crontab, unique_id)

    if not keep_files:
        shutil.rmtree(path)  # not tested

    log("Sucsessfully posted tweet!")
Example #23
 def scrape_and_save(_proxy):
     log("scraping {}".format(region_name))
     format_and_save_results(run_scrape(is_test=is_test, proxy=_proxy),
                             region_name)
def clean_data():
    rows_per_loop = 100000
    log("")
    log("starting")

    dirty_db_path = ROOTDIR + dir_sep + "stage_2_clean.db"
    clean_db_path = ROOTDIR + dir_sep + "stage_3_cleaner.db"
    dirty_db_cursor = create_connection(dirty_db_path).cursor()
    clean_db = create_connection(clean_db_path)
    clean_db_cursor = clean_db.cursor()

    clean_db_cursor.execute("DELETE FROM bodies")
    clean_db_cursor.execute("delete from sqlite_sequence where name='bodies'")

    dirty_db_cursor.execute("select bodies from bodies")
    data = dirty_db_cursor.fetchmany(rows_per_loop)

    tpool = Pool(processes=4)
    loop_n = 1
    log("detected " + str(cpus) + " as cpu count")
    inserted = 0
    more_data = True
    while more_data:
        log("cleaning data")
        data = tpool.map(clean_line, data)
        data = list(map(lambda line: (line, ), data))

        log("inserting 100k rows")
        query = "insert into bodies (bodies) values (?)"
        clean_db_cursor.executemany(query, data)
        clean_db.commit()

        log("done loop, getting more data.")
        inserted += len(data)
        data = dirty_db_cursor.fetchmany(rows_per_loop)
        #more_data = False
        if len(data) < 1:
            more_data = False
            log("end of data")
        log("done " + str(locp_n) + " loops")
        locp_n += 1
    log("done")
    log("inserted " + str(inserted) + " rows")
def count_words():  # TODO CLEAN THE PUNCTUATION BETTER
    db_cursor = create_connection(ROOTDIR + dir_sep +
                                  "stage_2_clean.db").cursor()
    db_cursor.execute("select bodies from bodies")
    loaded_data = db_cursor.fetchmany(rows_per_loop)

    log("")
    log("start")

    data = [{}]
    i = 1
    more_data = True
    while more_data:
        log("splitting data")
        for line in loaded_data:
            line = line[0]
            line = ''.join(char for char in line
                           if char not in string.punctuation)
            for word in line.split(" "):
                data.append(word)

        log("reducing data")
        data = [reduce(lambda x, y: add_word_to_dict(x, y), data)]
        log("done loop " + str(i) + ", getting more data.")
        i += 1

        loaded_data = db_cursor.fetchmany(rows_per_loop)
        #more_data = False
        if len(loaded_data) < 1:
            more_data = False
        if not more_data:
            log("end of data")
    log("done")
    log("list is  " + str(len(data[0])) + " words")
    dict_to_disk(data,
                 ROOTDIR + dir_sep + "word_count" + dir_sep + "test.json")
    print()
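
count_words relies on an add_word_to_dict helper that is not shown in these excerpts; a hypothetical sketch of what it presumably does (the accumulator is the word -> count dict seeded by the initial {} in data):

# Hypothetical sketch (assumption): fold one word into the running count dict.
def add_word_to_dict(counts, word):
    counts[word] = counts.get(word, 0) + 1
    return counts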