예제 #1
0
 def add_dicts_value(dict1, dict2):
     """Add the values of ``dict2`` onto ``dict1`` key by key, in place.

     Args:
         dict1: Accumulator mapping; it is mutated and returned.
         dict2: Mapping whose values are added onto ``dict1``.

     Returns:
         ``dict1`` (the same object) after the addition.

     Raises:
         SystemExit: With status 1 when the two mappings differ in length.
         KeyError: When ``dict2`` holds a key that ``dict1`` lacks.
     """
     if len(dict1) != len(dict2):
         logger1.error("Emotion list do not have the same length!")
         # Non-zero status: this is an error exit, not a normal termination.
         sys.exit(1)
     for key, value in dict2.items():
         dict1[key] += value
     return dict1
예제 #2
0
    def save_raw_data(self, save_folder, is_f=True, is_prediction=False):
        """Write every collected sample to ``<save_folder>/<sample>.csv``.

        Each output line is ``<attribute name>,<float value>``. Technical
        features always come first; fundamental features follow when ``is_f``
        is true. A sample is skipped (and logged) when its fundamental data
        is missing or when the number of values does not match the number of
        attribute names.

        Args:
            save_folder: Directory that receives one CSV file per sample.
            is_f: Whether to append the fundamental features of each sample.
            is_prediction: Accepted but not used by this method.
        """
        saved = 0
        for sample_name, tech_features in self.a_share_samples_t_dict.items():
            # (0.) technical features
            parts = [tech_features]

            # (1.) fundamental features
            if is_f:
                fundamental_features = self.a_share_samples_f_dict.get(
                    sample_name)
                if fundamental_features is None:
                    logger1.error(
                        'sample {} does not have any fundamental data'.format(
                            sample_name))
                    continue
                parts.append(fundamental_features)

            # fold all parts into one array, then force every value to float
            merged = np.array([])
            for part in parts:
                merged = self.integrate_tech_fundamental_feature(merged, part)
            values = list(merged.astype(float))

            names = self.t_attributors + self.f_attributors
            if len(names) != len(values):
                logger1.error(
                    'sample: {}, feature_list_final and attribitors are not the same length! {}, {}'
                    .format(sample_name, len(names), len(values)))
                continue

            target_path = os.path.join(save_folder, sample_name + '.csv')
            with open(target_path, 'w', encoding='utf-8') as out:
                out.writelines("%s,%s\n" % pair for pair in zip(names, values))
            saved += 1

        print("Save {} samples to {} succesfully!".format(saved, save_folder))
예제 #3
0
    def get_emotion_dict(self, file_path):
        """POST one image to the emotion-recognition endpoint and return the first result.

        The raw bytes of ``file_path`` are sent to
        ``westus.api.cognitive.microsoft.com/emotion/v1.0/recognize`` and the
        response body is parsed as JSON.

        Args:
            file_path: Path of the image file to upload.

        Returns:
            The first element of the parsed JSON response, or ``None`` when
            the response is an empty list (logged as "no faces") or when
            indexing it raises ``KeyError`` (presumably an error payload that
            is a JSON object rather than a list).

        Raises:
            OSError: When ``file_path`` cannot be read.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        # NOTE(review): the subscription key is hard-coded; it should be moved
        # to configuration / an environment variable and rotated.
        headers = {
            'Content-type': 'application/octet-stream',
            'Ocp-Apim-Subscription-Key': 'e1fc0e40c7464cebbc63317c3b0f5b26',
        }

        # No optional query parameters are sent, so this encodes to ''.
        params = urllib.parse.urlencode({})

        # Read the image through a context manager so the handle is closed.
        with open(file_path, "rb") as image_file:
            image_bytes = image_file.read()

        conn = None
        try:
            conn = http.client.HTTPSConnection(
                'westus.api.cognitive.microsoft.com')
            conn.request("POST", "/emotion/v1.0/recognize?%s" % params,
                         image_bytes, headers)
            print("send request")
            response = conn.getresponse()
            data = response.read().decode('utf-8')
            print("data: ", data)
            json_obj = json.loads(data)[0]
        except IndexError:
            logger1.info("{} does not contain any faces".format(file_path))
            return None
        except KeyError:
            logger1.info(
                "{} key Error! Maybe too big or too small".format(file_path))
            return None
        except Exception:
            # %s placeholder so the exception type is actually logged.
            logger1.error("Unexpected error: %s", sys.exc_info()[0])
            raise
        finally:
            # Close the connection on every path, not only on success.
            if conn is not None:
                conn.close()

        return json_obj
예제 #4
0
    def get_text_emotion_dict(self):
        """Score every text file in ``self.text_folder_path`` with the sentiment API.

        For each file the date is taken from the ``YYYY-MM-DD#`` part of its
        path, the alphabetic words are joined with '.', truncated to 80000
        characters and posted to the sentiment endpoint. The returned
        probabilities (plus ``emotion_value = pos - neg``) are stored in
        ``self.date_text_emotion_dict[date]`` and the posted text is appended
        to ``self.raw_data_dict[date]``. A date seen more than once is logged
        and overwritten.
        """
        for entry_name in os.listdir(self.text_folder_path):
            text_file = os.path.join(self.text_folder_path, entry_name)
            print("processing {}....".format(text_file))

            with open(text_file, 'r', encoding='utf-8') as handle:
                date_str = re.findall(r'([0-9]+-[0-9]+-[0-9]+)#',
                                      handle.name)[0]
                date_object = datetime.datetime.strptime(
                    date_str, '%Y-%m-%d').date()
                joined_lines = '.'.join(handle)

            # keep alphabetic words only and cap the request size
            words = re.findall(r'[A-Za-z]+', joined_lines)
            text_content = '.'.join(words)[0:80000]

            payload = {'language': "english", 'text': text_content}
            request_headers = {
                "X-Mashape-Key":
                "muMV4DdXyqmsh6hEQIryzApEFo4bp14Nb8ojsnQZdTCaEAUMxo",
                "Content-Type": "application/x-www-form-urlencoded",
                "Accept": "application/json"
            }
            response = requests.post(
                "https://japerk-text-processing.p.mashape.com/sentiment/",
                headers=request_headers,
                data=payload)
            print("status_code: ", response.status_code)

            probabilities = response.json()['probability']
            probabilities['emotion_value'] = (probabilities['pos'] -
                                              probabilities['neg'])

            if self.date_text_emotion_dict[date_object]:
                logger1.error("{} has mutilple copies".format(date_object))
            self.date_text_emotion_dict[date_object] = probabilities
            # keep the raw text that was sent for this date
            self.raw_data_dict[date_object].append(text_content)
예제 #5
0
 def input_list_length_check(p1_chbits, p2_chbits):
     """Abort the program when the two crossover parents differ in length.

     Args:
         p1_chbits: First parent sequence.
         p2_chbits: Second parent sequence.

     Returns:
         ``None`` when both parents have the same length.

     Raises:
         SystemExit: With status 1 when the lengths differ; both parents are
             logged first.
     """
     if len(p1_chbits) == len(p2_chbits):
         return
     logger1.error(
         "The length of the input parents for the crossover is not equal!!"
     )
     # %s placeholders are required: extra logging args without a placeholder
     # make the logging call fail to format and the lists are never shown.
     logger1.error("Error parent list: %s", p1_chbits)
     logger1.error("Error parent list: %s", p2_chbits)
     # Non-zero status: this is an error exit.
     sys.exit(1)
예제 #6
0
    def get_photo_emotion_dict(self):
        """Aggregate per-day average emotion scores for the photos in ``self.face_folder_path``.

        For every file in the folder the EXIF ``DateTimeDigitized`` tag gives
        the day, ``self.get_emotion_dict`` gives the emotion scores, and the
        scores are summed into ``self.date_photo_emotion_dict[day]['dict']``
        while ``['dict_num']`` counts the photos of that day. Finally every
        summed score is divided by the day's photo count and rounded to three
        decimals.

        Photos without the EXIF tag, with an unparseable date, or without an
        emotion result are skipped.

        NOTE(review): assumes ``self.date_photo_emotion_dict`` is a
        defaultdict whose fresh entries provide a falsy ``'dict'`` and a
        numeric ``'dict_num'`` - confirm against the class initialiser.
        """

        def add_dicts_value(dict1, dict2):
            """Add the values of ``dict2`` onto ``dict1`` in place and return ``dict1``."""
            if len(dict1) != len(dict2):
                logger1.error("Emotion list do not have the same length!")
                sys.exit(1)
            for key, value in dict2.items():
                dict1[key] += value
            return dict1

        for photo_name in os.listdir(self.face_folder_path):
            photo_file_path = os.path.join(self.face_folder_path, photo_name)

            # Read the EXIF tags inside a context manager so the handle is
            # closed on every path (including the skips below).
            with open(photo_file_path, 'rb') as photo_file:
                tags = exifread.process_file(photo_file)

            try:
                date_tag = tags['EXIF DateTimeDigitized']
            except KeyError:
                logger1.error(
                    "photo in {} has no meta data of digitized time!".format(
                        photo_file_path))
                continue

            # The tag looks like '2017:02:01 ...'; keep the date part only.
            date_matches = re.findall(r'([0-9]+:[0-9]+:[0-9]+)',
                                      str(date_tag))
            try:
                date_of_photo = datetime.datetime.strptime(
                    date_matches[0], '%Y:%m:%d').date()
            except (IndexError, ValueError):
                # e.g. an empty tag or '0000:00:00'; skip this photo
                # instead of aborting the whole folder.
                logger1.error(
                    "photo in {} has an invalid digitized time: {}".format(
                        photo_file_path, date_tag))
                continue
            print("date_of_photo", date_of_photo, type(date_of_photo))

            photo_emotion_dict = self.get_emotion_dict(photo_file_path)
            if not photo_emotion_dict:
                continue
            photo_emotion_dict = photo_emotion_dict['scores']

            day_entry = self.date_photo_emotion_dict[date_of_photo]
            if day_entry['dict']:
                day_entry['dict'] = add_dicts_value(day_entry['dict'],
                                                    photo_emotion_dict)
            else:
                day_entry['dict'] = photo_emotion_dict
            day_entry['dict_num'] += 1

        # Turn the per-day sums into per-day averages (3 decimals).
        for date_dict in self.date_photo_emotion_dict.values():
            dict_num = date_dict['dict_num']
            emotions = date_dict['dict']
            for emotion, emotion_value in emotions.items():
                emotions[emotion] = float(
                    "{:.3f}".format(emotion_value / dict_num))
예제 #7
0
    def read_tech_history_data(self, start_date, is_prediction=False):
        """Build the technical-feature array of every (date, stock) sample.

        ``self.a_share_samples_t_dict`` is reset and then filled with one
        numpy array per sample, keyed ``'<date>_<stock_id>'``. The feature
        order is the sorted attribute list stored in ``self.t_attributors``:
        the columns returned by ``ts.get_k_data`` (``ktype='W'``) without
        ``code``/``date``, plus three derived features:

        * ``priceChange``  - relative change between the ``open`` of the next
          row and the ``open`` of the row after it; ``"nan"`` in prediction
          mode.
        * ``candleLength`` - ``|close - open| / (high - low)``.
        * ``candlePos``    - ``|high - max(close, open)| / (high - low)``.

        Args:
            start_date: First date to request, formatted ``YYYY-MM-DD``.
            is_prediction: When ``False`` the last two rows of every stock
                are skipped because ``priceChange`` needs two later rows.

        Raises:
            ValueError: When ``start_date`` is not ``YYYY-MM-DD``.
        """
        # clear the result of any previous run
        self.a_share_samples_t_dict = collections.defaultdict(lambda: 0)

        # Fail fast on a malformed start date before any data is requested.
        datetime.datetime.strptime(start_date, '%Y-%m-%d')

        # Probe one fixed stock to learn which columns the data source returns.
        t_attributors_set = set(
            ts.get_k_data("600883", start="2017-05-09", ktype='W').keys())
        t_attributors_set -= {'code', 'date'}
        t_attributors_set.update(('priceChange', 'candleLength', 'candlePos'))
        t_attributors = sorted(t_attributors_set)
        self.t_attributors = t_attributors

        for stock_id in list(self.stock_set):
            fund_dict = ts.get_k_data(stock_id, start=start_date,
                                      ktype='W').to_dict()
            try:
                # date_items: [(29, '2016-08-05'), (30, '2016-08-12'), ...]
                date_items = sorted(fund_dict['date'].items(),
                                    key=lambda x: x[0])
            except KeyError:
                logger1.error("{} stock has no key data".format(stock_id))
                continue

            # TODO: verify that the next two rows really are the next two
            # weeks; holidays make the expected dates hard to predict.
            for i, (row_id, date_str) in enumerate(date_items):
                if i > len(date_items) - 3 and is_prediction is False:
                    print(
                        "Skip {} on {} because of reaching the end. The data of the rest date "
                        "cannot be fully presented".format(row_id, date_str))
                    continue

                feature_list = []

                for attributor in t_attributors:
                    if attributor == 'priceChange' and is_prediction is False:
                        nw_open = fund_dict['open'][date_items[i + 1][0]]
                        nnw_open = fund_dict['open'][date_items[i + 2][0]]
                        feature_list.append("{:.5f}".format(
                            (nnw_open - nw_open) / nw_open))

                    elif attributor == 'priceChange' and is_prediction is True:
                        feature_list.append("nan")

                    elif attributor in ('candleLength', 'candlePos'):
                        close_price = fund_dict['close'][row_id]
                        open_price = fund_dict['open'][row_id]
                        high_price = fund_dict['high'][row_id]
                        low_price = fund_dict['low'][row_id]
                        if attributor == 'candleLength':
                            numerator = close_price - open_price
                        else:
                            numerator = high_price - max(
                                close_price, open_price)
                        candle_range = high_price - low_price
                        if candle_range == 0:
                            # Flat candle (high == low): the ratio is
                            # undefined, so store "nan" instead of dividing
                            # by zero.
                            feature_list.append("nan")
                        else:
                            feature_list.append("{:.5f}".format(
                                abs(numerator / candle_range)))

                    else:
                        # plain column value
                        feature_list.append(fund_dict[attributor][row_id])

                sample_name = date_str + '_' + stock_id
                self.a_share_samples_t_dict[sample_name] = np.array(
                    feature_list)
            print("saving {} stock t features".format(stock_id))

        print("t_attributors: {}".format(t_attributors))
        print("a_share_samples_t_dict: {}".format(
            self.a_share_samples_t_dict.values()))
        # Only show an example value when at least one sample was collected.
        if self.a_share_samples_t_dict:
            print("a_share_samples_t_dict_value: {}".format(
                next(iter(self.a_share_samples_t_dict.values()))))
예제 #8
0
    def feature_engineering(self,
                            input_folder,
                            save_folder,
                            keep_stock_ids_path=None):
        """Derive week-over-week features for every raw sample file.

        Every ``<YYYY-MM-DD>_<stock id>.csv`` file in ``input_folder`` (one
        ``name,value`` pair per line) is combined with the same stock's file
        dated 7 to 12 days earlier. Relative-change features are added, a
        fixed set of raw features is removed, and the result is written to
        ``save_folder`` as ``<YYYY-MM-DD>_<stock id>.txt`` containing a single
        line ``name,value,name,value,...`` sorted by feature name.

        Files whose name does not match the expected pattern, and files with
        no previous-week file, are logged and skipped.

        Args:
            input_folder: Folder holding the raw ``.csv`` sample files.
            save_folder: Folder receiving the engineered ``.txt`` files.
            keep_stock_ids_path: Optional path of a text file listing one
                stock id per line; when given, other stocks are ignored.

        Raises:
            KeyError: When a sample file lacks a feature that is read or
                deleted here.
            ZeroDivisionError: When a previous-week price/volume (or the
                current open/low) is zero.
        """

        def read_feature_pairs(path):
            """Parse a ``name,value``-per-line file into ``{name: float}``."""
            pairs = {}
            with open(path, 'r', encoding='utf-8') as handle:
                for line in handle:
                    line_list = line.split(',')
                    pairs[line_list[0]] = float(line_list[1].strip())
            return pairs

        def relative_change(current, previous):
            """Return the change from ``previous`` to ``current`` as a ratio."""
            return (current - previous) / previous

        # A set gives O(1) membership tests for the stock filter.
        keep_stock_ids = None
        if keep_stock_ids_path:
            with open(keep_stock_ids_path, 'r') as keep_file:
                keep_stock_ids = {line.strip() for line in keep_file}

        file_name_list = os.listdir(input_folder)
        successful_save_count = 0
        original_data_count = len(file_name_list)

        # Raw features removed after the derived ones have been computed.
        delete_features_set = {
            'close', 'high', 'low', 'open', 'timeToMarket', 'liquidAssets',
            'fixedAssets', 'reserved', 'reservedPerShare', 'esp', 'bvps',
            'pb', 'undp', 'perundp', 'holders', 'totals', 'totalAssets',
            'outstanding'
        }

        for file_name in file_name_list:
            file_path = os.path.join(input_folder, file_name)

            stock_id_match = re.findall(r'_([0-9]+)\.csv$', file_name)
            if not stock_id_match:
                logger1.error(
                    "{} is not named <date>_<stock id>.csv, skipped".format(
                        file_name))
                continue
            stock_id = stock_id_match[0]

            # filter stock ids
            if keep_stock_ids is not None and stock_id not in keep_stock_ids:
                continue

            date_match = re.findall(r'([0-9]+-[0-9]+-[0-9]+)_', file_name)
            try:
                date_obj = datetime.datetime.strptime(date_match[0],
                                                      '%Y-%m-%d')
            except (IndexError, ValueError):
                logger1.error(
                    "{} does not carry a valid YYYY-MM-DD date, skipped".
                    format(file_name))
                continue

            # Find the previous week's file; the gap is not necessarily
            # exactly 7 days, so 7..12 days back are tried in order.
            previous_week_date_full_path = ''
            for days in range(7, 13):
                previous_day = date_obj - datetime.timedelta(days=days)
                candidate_path = os.path.join(
                    input_folder,
                    previous_day.strftime("%Y-%m-%d") + '_' + stock_id +
                    '.csv')
                # isfile() probes existence without leaking a file handle.
                if os.path.isfile(candidate_path):
                    previous_week_date_full_path = candidate_path
                    break

            if not previous_week_date_full_path:
                logger1.error(
                    "{} cannot find the previous week's data within 13 days".
                    format(file_name))
                continue

            previous_f_feature_pair_dict = read_feature_pairs(
                previous_week_date_full_path)
            feature_pair_dict = read_feature_pairs(file_path)

            # (1.)-(5.) week-over-week change of open/close/high/low/volume
            for field in ('open', 'close', 'high', 'low', 'volume'):
                feature_pair_dict[field + 'Change'] = "{:.5f}".format(
                    relative_change(feature_pair_dict[field],
                                    previous_f_feature_pair_dict[field]))

            # (6.) open -> close change within the week
            feature_pair_dict['openCloseChange'] = "{:.5f}".format(
                relative_change(feature_pair_dict['close'],
                                feature_pair_dict['open']))

            # (7.) low -> high change within the week
            feature_pair_dict['lowHighChange'] = "{:.5f}".format(
                relative_change(feature_pair_dict['high'],
                                feature_pair_dict['low']))

            # Fundamentals: a zero previous value falls back to "1.0".
            for attritubtor in ('pb', 'pe'):
                pre = previous_f_feature_pair_dict[attritubtor]
                this_week = feature_pair_dict[attritubtor]
                new_attributor_name = attritubtor + 'Change'
                try:
                    feature_pair_dict[new_attributor_name] = "{:.6f}".format(
                        relative_change(this_week, pre))
                except ZeroDivisionError:
                    set_value = "1.0"
                    feature_pair_dict[new_attributor_name] = set_value
                    logger1.error(
                        "New attributor {} has ZeroDivisionError! attritubtor: {}, temporal set value: {}"
                        .format(os.path.basename(previous_week_date_full_path),
                                new_attributor_name, set_value))

            # drop the raw features that must not reach the output
            for feature_name in delete_features_set:
                feature_pair_dict.pop(feature_name)

            # Swap only the extension (the name is known to end in '.csv').
            save_name = file_name[:-len('.csv')] + '.txt'
            save_file_path = os.path.join(save_folder, save_name)

            flat_pairs = []
            for feature_name, feature_value in sorted(
                    feature_pair_dict.items(), key=lambda x: x[0]):
                flat_pairs.append(str(feature_name))
                flat_pairs.append(str(feature_value))

            with open(save_file_path, 'w', encoding='utf-8') as out_file:
                out_file.write(','.join(flat_pairs))
            successful_save_count += 1

        print(
            "Succesfully engineered {} raw data! original count: {}, delete {} files"
            .format(successful_save_count, original_data_count,
                    original_data_count - successful_save_count))
예제 #9
0
    def read_fundamental_data(self, start_date, is_filter_new_stock=False):
        """Collect the fundamental-feature array of every (Friday, stock) sample.

        ``self.a_share_samples_f_dict`` is reset and then filled with one
        numpy array per sample, keyed ``'<date>_<stock_id>'``. For every
        Friday yielded by ``daterange(start_date, today)`` the data returned
        by ``ts.get_stock_basics`` is read; the ``name``, ``industry`` and
        ``area`` columns are ignored and the remaining values are stored in
        sorted column-name order. The expected column names are stored in
        ``self.f_attributors``.

        Args:
            start_date: First date to consider, formatted ``YYYY-MM-DD``.
            is_filter_new_stock: When true, stocks whose ``timeToMarket`` is
                at most 28 days before the sample date are dropped for that
                date.

        Raises:
            ValueError: When ``start_date`` is not ``YYYY-MM-DD``.
        """
        # clear the result of any previous run
        self.a_share_samples_f_dict = collections.defaultdict(lambda: 0)

        start_date_obj = datetime.datetime.strptime(start_date,
                                                    '%Y-%m-%d').date()
        today_obj = datetime.datetime.today().date()

        # Column names are typed in by hand rather than queried.
        f_attributors_set = {
            'holders', 'undp', 'gpr', 'pb', 'industry', 'bvps', 'timeToMarket',
            'rev', 'perundp', 'fixedAssets', 'name', 'reservedPerShare',
            'totals', 'outstanding', 'liquidAssets', 'profit', 'pe',
            'reserved', 'npr', 'area', 'totalAssets', 'esp'
        }
        filter_set = {'name', 'industry', 'area'}
        f_attributors = sorted(f_attributors_set - filter_set)
        self.f_attributors = f_attributors

        # A stock listed this recently counts as "new".
        new_stock_threshold = datetime.timedelta(days=28)

        for single_date in daterange(start_date_obj, today_obj):
            # only Fridays are sampled
            if single_date.weekday() != 4:
                continue
            date_str = single_date.strftime("%Y-%m-%d")
            print("date_str: ", date_str)

            try:
                ts_temp = ts.get_stock_basics(date=date_str)
            except urllib.error.HTTPError:
                ts_temp = None
            if ts_temp is None:
                logger1.error("{} not found any data!".format(date_str))
                continue
            fund_dict = ts_temp.to_dict()

            stock_features = collections.defaultdict(list)
            new_stock_ids = set()

            for key, stock_key_value_dict in sorted(fund_dict.items()):
                if key in filter_set:
                    continue
                check_listing_date = (is_filter_new_stock
                                      and key == "timeToMarket")

                for stock_id, value in stock_key_value_dict.items():
                    if check_listing_date:
                        timeToMarket = str(value)
                        try:
                            listing_date = datetime.datetime.strptime(
                                timeToMarket, '%Y%m%d').date()
                        except ValueError:
                            # The value is still kept as a feature below.
                            logger1.error(
                                "{} has invalid timeToMarket value!".format(
                                    stock_id))
                        else:
                            if single_date - listing_date <= new_stock_threshold:
                                print(
                                    "stock_id: {} is new stock for {}, release date: {}"
                                    .format(stock_id, single_date,
                                            timeToMarket))
                                new_stock_ids.add(stock_id)

                    stock_features[stock_id].append((key, value))

            # drop the stocks flagged as new (empty set when not filtering)
            for stock_id in new_stock_ids:
                stock_features.pop(stock_id, None)

            for stock_id, feature_list in stock_features.items():
                ordered_values = [
                    value
                    for _, value in sorted(feature_list, key=lambda x: x[0])
                ]
                sample_name = date_str + '_' + stock_id
                self.a_share_samples_f_dict[sample_name] = np.array(
                    ordered_values)
            print("saving {}'s stock feature to a_share_samples_f_dict".format(
                single_date))

        print("f_attributors: {}".format(f_attributors))
        # Only show an example value when at least one sample was collected.
        if self.a_share_samples_f_dict:
            print("a_share_samples_f_dict_value: {}".format(
                next(iter(self.a_share_samples_f_dict.values()))))