def test_Hungarian(self):
        lectEx = [[14, 5, 8, 7], [2, 12, 6, 5], [7, 8, 3, 9], [2, 4, 6, 10]]

        assign = DataFrame([
            [0, 1, 0, 0],
            [0, 0, 0, 1],
            [0, 0, 1, 0],
            [1, 0, 0, 0],
        ])
        assign.columns = assign.columns + 1
        assign.index = assign.index + 1
        reducedDF = DataFrame([
            [10, 0, 3, 0],
            [0, 9, 3, 0],
            [5, 5, 0, 4],
            [0, 1, 3, 5],
        ]).astype(float)
        reducedDF.columns = reducedDF.columns + 1
        reducedDF.index = reducedDF.index + 1

        h = OR.Hungarian.new(lectEx)
        j = h.solve(echo=False)

        assert j.reducedDF.equals(reducedDF)
        assert j.assignedDF.equals(assign)
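For a quick sanity check of the expected assignment above without going through the OR.Hungarian class, SciPy's linear_sum_assignment solves the same cost matrix; a minimal sketch, assuming SciPy is available:

import numpy as np
from scipy.optimize import linear_sum_assignment

cost = np.array([[14, 5, 8, 7], [2, 12, 6, 5], [7, 8, 3, 9], [2, 4, 6, 10]])
rows, cols = linear_sum_assignment(cost)  # minimizes the total assignment cost
print(cols + 1)                # [2 4 3 1], matching assignedDF above
print(cost[rows, cols].sum())  # 15, the optimal total cost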
Example #2
    def etl(self, test_data, agg_time, type):
        from numpy import nan
        if isinstance(test_data, list):
            data = DataFrame(test_data)

            if type == 0:
                data.columns = [
                    'serial_number', 'manufacturer', 'vendor', 'collect_time',
                    'mca_id', 'transaction'
                ]
            elif type == 1:
                data.columns = [
                    'collect_time', '1_hwerr_f', '1_hwerr_e', '2_hwerr_c',
                    '2_sel', '3_hwerr_n', '2_hwerr_s', '3_hwerr_m',
                    '1_hwerr_st', '1_hw_mem_c', '3_hwerr_p', '2_hwerr_ce',
                    '3_hwerr_as', '1_ke', '2_hwerr_p', '3_hwerr_kp',
                    '1_hwerr_fl', '3_hwerr_r', '_hwerr_cd', '3_sup_mce_note',
                    '3_cmci_sub', '3_cmci_det', '3_hwerr_pi', '3_hwerr_o',
                    '3_hwerr_mce_l', 'serial_number', 'manufacturer', 'vendor'
                ]
            elif type == 2:
                data.columns = [
                    'serial_number', 'manufacturer', 'vendor', 'memory',
                    'rankid', 'bankid', 'collect_time', 'row', 'col'
                ]

            # overwrite collect_time with its ceiling-bucketed window so the
            # groupby below aggregates one row per device and time window
            data['collect_time'] = pd.to_datetime(
                data['collect_time']).dt.ceil(agg_time)
            group_data = data.groupby(['serial_number', 'collect_time'],
                                      as_index=False).agg('sum')
            return group_data
        else:
            return None
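The heart of the transform above is bucketing timestamps with dt.ceil and then summing per device and window; a standalone sketch of that pattern with made-up data:

import pandas as pd

df = pd.DataFrame({
    'serial_number': ['s1', 's1', 's2'],
    'collect_time': ['2019-07-01 10:02', '2019-07-01 10:04', '2019-07-01 10:02'],
    'error_count': [1, 2, 5],
})
df['collect_time'] = pd.to_datetime(df['collect_time']).dt.ceil('5min')
# s1's two rows land in the same 10:05 bucket and sum to 3
print(df.groupby(['serial_number', 'collect_time'], as_index=False).agg('sum'))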
Example #3
    def _finalize_output(self, frame: DataFrame) -> DataFrame:
        """
        Processes data read in based on kwargs.

        Parameters
        ----------
        frame: DataFrame
            The DataFrame to process.

        Returns
        -------
        DataFrame
            The processed DataFrame.
        """
        num_cols = len(frame.columns)
        multi_index_named = True
        if self.header is None:
            if self.names is None:
                if self.prefix is not None:
                    self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
                elif self.header is None:
                    self.names = range(num_cols)
            if len(self.names) != num_cols:
                # usecols is passed through to pyarrow, we only handle index col here
                # The only way self.names is not the same length as number of cols is
                # if we have int index_col. We should just pad the names(they will get
                # removed anyways) to expected length then.
                self.names = list(
                    range(num_cols - len(self.names))) + self.names
                multi_index_named = False
            frame.columns = self.names
        # we only need the frame not the names
        # error: Incompatible types in assignment (expression has type
        # "Union[List[Union[Union[str, int, float, bool], Union[Period, Timestamp,
        # Timedelta, Any]]], Index]", variable has type "Index")  [assignment]
        frame.columns, frame = self._do_date_conversions(  # type: ignore[assignment]
            frame.columns, frame)
        if self.index_col is not None:
            for i, item in enumerate(self.index_col):
                if is_integer(item):
                    self.index_col[i] = frame.columns[item]
                else:
                    # String case
                    if item not in frame.columns:
                        raise ValueError(f"Index {item} invalid")
            frame.set_index(self.index_col, drop=True, inplace=True)
            # Clear names if headerless and no name given
            if self.header is None and not multi_index_named:
                frame.index.names = [None] * len(frame.index.names)

        if self.kwds.get("dtype") is not None:
            try:
                frame = frame.astype(self.kwds.get("dtype"))
            except TypeError as e:
                # GH#44901 reraise to keep api consistent
                raise ValueError(e)
        return frame
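The padding branch above is easy to misread; with hypothetical values, this is what it does when an integer index_col leaves self.names one name short:

num_cols = 4
names = ["a", "b", "c"]  # one short: the int index_col column has no name
names = list(range(num_cols - len(names))) + names
print(names)  # [0, 'a', 'b', 'c'] -- the pad is dropped again when the index is set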
Example #4
    def _finalize_output(self, frame: DataFrame) -> DataFrame:
        """
        Processes data read in based on kwargs.

        Parameters
        ----------
        frame: DataFrame
            The DataFrame to process.

        Returns
        -------
        DataFrame
            The processed DataFrame.
        """
        num_cols = len(frame.columns)
        multi_index_named = True
        if self.header is None:
            if self.names is None:
                if self.prefix is not None:
                    self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
                elif self.header is None:
                    self.names = range(num_cols)
            if len(self.names) != num_cols:
                # usecols is passed through to pyarrow, we only handle index col here
                # The only way self.names is not the same length as number of cols is
                # if we have int index_col. We should just pad the names(they will get
                # removed anyways) to expected length then.
                self.names = list(
                    range(num_cols - len(self.names))) + self.names
                multi_index_named = False
            frame.columns = self.names
        # we only need the frame not the names
        frame.columns, frame = self._do_date_conversions(frame.columns, frame)
        if self.index_col is not None:
            for i, item in enumerate(self.index_col):
                if is_integer(item):
                    self.index_col[i] = frame.columns[item]
                else:
                    # String case
                    if item not in frame.columns:
                        raise ValueError(f"Index {item} invalid")
            frame.set_index(self.index_col, drop=True, inplace=True)
            # Clear names if headerless and no name given
            if self.header is None and not multi_index_named:
                frame.index.names = [None] * len(frame.index.names)

        if self.kwds.get("dtype") is not None:
            frame = frame.astype(self.kwds.get("dtype"))
        return frame
Example #5
 def get_data_frame(self):
     data = DataFrame(self.data)
     data.columns = ['学校', '考试方式', '院系所', '', '专业',
                     '学习方式', '研究方向', '指导教师', '拟招生人数', '备注']
     data.drop(labels='', axis=1, inplace=True)
     data.to_csv(self.provinceName + "研究生招生信息.csv",
                 encoding="utf_8_sig", index=False)
Example #6
    def get_result(self):
        if self._is_series:
            if self.axis == 0:
                new_data = com._concat_compat([x.get_values() for x in self.objs])
                name = com._consensus_name_attr(self.objs)
                return Series(new_data, index=self.new_axes[0], name=name).__finalize__(self, method='concat')
            else:
                data = dict(zip(range(len(self.objs)), self.objs))
                index, columns = self.new_axes
                tmpdf = DataFrame(data, index=index)
                if columns is not None:
                    tmpdf.columns = columns
                return tmpdf.__finalize__(self, method='concat')
        else:
            mgrs_indexers = []
            for obj in self.objs:
                mgr = obj._data
                indexers = {}
                for ax, new_labels in enumerate(self.new_axes):
                    if ax == self.axis:
                        # Suppress reindexing on concat axis
                        continue

                    obj_labels = mgr.axes[ax]
                    if not new_labels.equals(obj_labels):
                        indexers[ax] = obj_labels.reindex(new_labels)[1]

                mgrs_indexers.append((obj._data, indexers))

            new_data = concatenate_block_managers(
                mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=True)

            return self.objs[0]._from_axes(new_data, self.new_axes).__finalize__(self, method='concat')
Example #7
    def get_result(self):
        if self._is_series:
            if self.axis == 0:
                new_data = com._concat_compat([x.get_values() for x in self.objs])
                name = com._consensus_name_attr(self.objs)
                return Series(new_data, index=self.new_axes[0], name=name).__finalize__(self, method='concat')
            else:
                data = dict(zip(range(len(self.objs)), self.objs))
                index, columns = self.new_axes
                tmpdf = DataFrame(data, index=index)
                if columns is not None:
                    tmpdf.columns = columns
                return tmpdf.__finalize__(self, method='concat')
        else:
            mgrs_indexers = []
            for obj in self.objs:
                mgr = obj._data
                indexers = {}
                for ax, new_labels in enumerate(self.new_axes):
                    if ax == self.axis:
                        # Suppress reindexing on concat axis
                        continue

                    obj_labels = mgr.axes[ax]
                    if not new_labels.equals(obj_labels):
                        indexers[ax] = obj_labels.reindex(new_labels)[1]

                mgrs_indexers.append((obj._data, indexers))

            new_data = concatenate_block_managers(
                mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy)
            if not self.copy:
                new_data._consolidate_inplace()

            return self.objs[0]._from_axes(new_data, self.new_axes).__finalize__(self, method='concat')
Example #8
    def get_trends(self, df: DataFrame):
        ans = {}
        df["date"] = pd.to_datetime(
            df["date"]).apply(lambda x: (datetime.today() - x).days)

        df.columns = df.columns.to_series().replace({"date": "days"})
        df = df[df["days"] <= 720]
        sizes = df.groupby("asin").size()
        drop_asins = sizes[sizes <= 60]

        df = df[~df["asin"].isin(drop_asins.index)]

        for current_category_id in df["category_id"].unique():
            sumcoef = []
            for asin in df[df["category_id"] ==
                           current_category_id]["asin"].unique():
                try:
                    sts = sm.tsa.seasonal_decompose(
                        df[df["asin"] == f"{asin}"]['ranking'], period=30)
                    X = sts.trend[sts.trend.notna()].index
                    y = sts.trend[sts.trend.notna()].values.astype("int")
                    model = linear_model.LinearRegression()
                    model.fit(np.array(X).reshape(-1, 1), y)
                    sumcoef.append(model.coef_)
                except Exception:
                    print(f"An exception occurred with asin {asin}")

            ans[current_category_id] = np.array(sumcoef).mean()
        finans = {
            k: v
            for k, v in sorted(ans.items(), key=lambda item: item[1])
        }
        return finans
Example #9
def processCSVMatrix(file):
    with open(file, 'r') as csvfile:
        dialect = Sniffer().sniff(csvfile.readline())

    df = DataFrame()
    for chunk in read_csv(file,
                          sep=dialect.delimiter,
                          mangle_dupe_cols=True,
                          index_col=False,
                          chunksize=1000):
        df = concat([df, chunk], ignore_index=True)

    nodes = df.columns.values.tolist()
    nodes.pop(0)
    df["Unnamed: 0"] = nodes
    df = df.rename(columns={'Unnamed: 0': 'name'})
    df = df.set_index(keys='name')

    # Remove underscores in names
    names = df.columns.tolist()
    names = [name.replace('_', ' ') for name in names]
    df.columns = names
    df.set_index([df.columns], inplace=True)

    return df
Example #10
def classifyTestData(testFilePath, modelRoot):
    """
    This method calls the traverseDecisionTreeModel() to classify the test data on the trained model and generate Confusion matrix and error at the given depth
    :param testFilePath: Path to the test file
    :param modelRoot: Root node of the decision tree of the trained model

    """
    correctlyClassifiedInstances=0
    incorrectlyClassifiedInstances=0
    testDataList=[]
    infile = open(testFilePath, 'r')  # 'rU' mode is gone in Python 3.11; don't shadow input()
    csvObject = csv.reader(infile)
    label = featureList[len(featureList) -1]
    classLabels = featureAndValueMapping.get(label)
    classLabelCount = len(classLabels)
    ConfusionMatrix = [[0 for x in range(int(classLabelCount))] for x in range(int(classLabelCount))]
    for row in csvObject:
        predictedLabel = traverseDecisionTreeModel(row, modelRoot)  # was an undefined 'root'
        ConfusionMatrix[int(row[len(row)- 1]) - 1][int(predictedLabel) - 1] += 1

        if predictedLabel==row[len(row)-1]:
            correctlyClassifiedInstances+=1
        else:
            incorrectlyClassifiedInstances+=1
    df = DataFrame(ConfusionMatrix)
    df.columns = classLabels
    df.index = classLabels

    print "Confusion Matrix :: \n"
    print df
    print "Correctly Classified Instance ",correctlyClassifiedInstances
    print "Incorrectly Classified Instance ",incorrectlyClassifiedInstances
Example #11
def excel_save(List, sheet_title):
    df = DataFrame(List)
    df.columns = ['排名', 'AV号', 'UP名', '标题', '综合评分', '总播放量', '投币数量', '弹幕总数']
    writer = pd.ExcelWriter('B站{0}综合排行榜前100视频.xlsx'.format(sheet_title))
    df.to_excel(excel_writer=writer, index=False, encoding='utf-8', sheet_name=sheet_title)
    writer.close()  # close() also saves; the extra save() call was redundant
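On recent pandas, ExcelWriter.save() has been removed and to_excel() no longer accepts an encoding argument; a sketch of the same export under that assumption:

import pandas as pd
from pandas import DataFrame

def excel_save(rows, sheet_title):
    df = DataFrame(rows)
    df.columns = ['排名', 'AV号', 'UP名', '标题', '综合评分', '总播放量', '投币数量', '弹幕总数']
    # the context manager saves and closes the workbook on exit
    with pd.ExcelWriter('B站{0}综合排行榜前100视频.xlsx'.format(sheet_title)) as writer:
        df.to_excel(writer, index=False, sheet_name=sheet_title)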
Example #12
    def get_result(self):

        # series only
        if self._is_series:

            # stack blocks
            if self.axis == 0:
                new_data = com._concat_compat([x._values for x in self.objs])
                name = com._consensus_name_attr(self.objs)
                return (Series(new_data, index=self.new_axes[0],
                               name=name,
                               dtype=new_data.dtype)
                        .__finalize__(self, method='concat'))

            # combine as columns in a frame
            else:
                data = dict(zip(range(len(self.objs)), self.objs))
                index, columns = self.new_axes
                tmpdf = DataFrame(data, index=index)
                # checks if the column variable already stores valid column
                # names (because set via the 'key' argument in the 'concat'
                # function call. If that's not the case, use the series names
                # as column names
                if (columns.equals(Index(np.arange(len(self.objs)))) and
                        not self.ignore_index):
                    columns = np.array([data[i].name
                                        for i in range(len(data))],
                                       dtype='object')
                    indexer = isnull(columns)
                    if indexer.any():
                        columns[indexer] = np.arange(len(indexer[indexer]))
                tmpdf.columns = columns
                return tmpdf.__finalize__(self, method='concat')

        # combine block managers
        else:
            mgrs_indexers = []
            for obj in self.objs:
                mgr = obj._data
                indexers = {}
                for ax, new_labels in enumerate(self.new_axes):
                    if ax == self.axis:
                        # Suppress reindexing on concat axis
                        continue

                    obj_labels = mgr.axes[ax]
                    if not new_labels.equals(obj_labels):
                        indexers[ax] = obj_labels.reindex(new_labels)[1]

                mgrs_indexers.append((obj._data, indexers))

            new_data = concatenate_block_managers(
                mgrs_indexers, self.new_axes,
                concat_axis=self.axis, copy=self.copy)
            if not self.copy:
                new_data._consolidate_inplace()

            return (self.objs[0]._from_axes(new_data, self.new_axes)
                    .__finalize__(self, method='concat'))
Example #13
def ListoDF(data):
    if isinstance(data, list):
        Df = DataFrame(data)  # convert the list to a DataFrame
        Df.columns = Df.iloc[0, :]  # promote the first row to column names
        Df.drop(0, axis=0, inplace=True)  # drop the header row
    else:
        Df = data
    return Df
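Usage sketch with hypothetical input: the first list element is treated as the header row.

rows = [['name', 'score'], ['alice', 90], ['bob', 82]]
df = ListoDF(rows)
print(df.columns.tolist())  # ['name', 'score']
print(len(df))              # 2 -- the header row itself was dropped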
Example #14
def output_keywords(wordlist):
    wordlist = list(map(list, wordlist))
    data = DataFrame(wordlist)
    data.columns = ['關鍵詞', '分數']
    data['分數'] = data['分數'] * 10000
    data.sort_values("分數", inplace=True, ascending=False)
    data = data.round({'分數': 2})
    return data
Example #15
def fix_tax(q):  # q = opts.i
    taxl = ['d__', 'p__', 'c__', 'o__', 'f__', 'g__', 's__']
    data1 = pd.read_csv(q, header=0, sep="\t")
    taxwz = data1.columns.tolist().index('taxonomy')  # position of the 'taxonomy' column
    # replace "__Unknown_*" variants with plain "__Unknown"
    data1.iloc[:, taxwz].replace("__Unknown_\w*",
                                 "__Unknown",
                                 regex=True,
                                 inplace=True)

    # split each taxonomy string into its ranks
    data2 = [i.split(";") for i in data1.iloc[:, taxwz]]
    bb = []
    for j in range(len(data2)):
        bb.append(data2[j] + [""] * (7 - len(data2[j])))
    data3 = DataFrame(bb)
    data3.columns = taxl
    regex = re.compile(r'(;s__[a-zA-Z0-9_-]*)', flags=re.IGNORECASE)
    data3['tax'] = data1.iloc[:, taxwz].replace(regex, "")
    data3['OTU_ID'] = data1.iloc[:, 0]

    # pick out the rows that need fixing
    re1 = re.compile(
        "uncultured|Incertae_Sedis$|Unknown|Subgroup_|norank|Family_",
        flags=re.IGNORECASE)  # rows whose phylum-to-genus ranks contain these tokens
    bl1 = data3['tax'].apply(lambda x: bool(re1.search(x)))
    re2 = re.compile(r'[dpcofgs]__', re.I)
    data4_1 = data3.loc[bl1, :].replace(re2, "").drop(['tax'], axis=1)
    #data4_1.to_csv("test.txt",header=0,index=0)
    for i in range(1, data4_1.shape[1] - 2):  # i = 4
        for k in range(data4_1.shape[0]):  # k = 278
            if bool(
                    re.search(data4_1.iloc[k, i] + "$", data4_1.iloc[k, i - 1],
                              re.I)):
                data4_1.iloc[k, i] = data4_1.iloc[k, i - 1]
            elif bool(re.search(re1, data4_1.iloc[k, i])):
                data4_1.iloc[k, i] = data4_1.iloc[k, i -
                                                  1] + "_" + data4_1.iloc[k, i]

    for i in range(0, data4_1.shape[1] - 1):  # i = 4
        for k in range(data4_1.shape[0]):  # k = 278
            if data4_1.iloc[k, i] != "":
                data4_1.iloc[k, i] = data4_1.columns[i] + data4_1.iloc[k, i]
    data4_1['tax'] = data4_1.iloc[:, :-1].apply(
        lambda row: ';'.join(row.values.astype(str)), axis=1)
    # merge the fixed and untouched rows back together
    data4_2 = data3.loc[~bl1, :]
    regex2 = re.compile(r'(_[0-9]*$)')
    data5 = pd.concat([data4_1[data4_2.columns.tolist()],
                       data4_2]).replace(regex2, "")
    data5.index = data5["OTU_ID"]
    data6 = np.array(data5.loc[data1.iloc[:, 0].tolist(), :]).tolist()
    data7 = DataFrame([[data6[pp][-1], ";".join(data6[pp][0:7])]
                       for pp in range(len(data6))])
    data7.replace(";+", ";", regex=True, inplace=True)
    data7.replace(";$", "", regex=True, inplace=True)
    data1.iloc[:, taxwz] = data7.iloc[:, 1]
    data1.to_csv("fix_" + q, sep='\t', na_rep='', index=0)  # ,header=0
Example #16
 def process(self, instrucction, table: DataFrame, name):
     try:
         if isinstance(self.condition, Relop) or isinstance(
                 self.condition, LogicalOperators):
             value = self.condition.process(instrucction)
             if isinstance(value, list):
                 list_alias = value[0]
                 table = self.create_temporal_tables(list_alias, value[2])
                 query = value[1]
                 table = table.query(query)
                 table.columns = self.change_name_column(
                     table.columns.tolist(), value[2])
             else:
                 table = table.query(value)
         elif isinstance(self.condition, LikeClause):
             value = self.condition.process(instrucction)
             table = table.query(value)
         elif isinstance(self.condition, Between):
             value = self.condition.process(instrucction)
             table = table.query(value)
         elif isinstance(self.condition, isClause):
             value = self.condition.process(instrucction)
             table = table.query(value)
         elif isinstance(self.condition, InClause):
             value = self.condition.process(instrucction)
             table = table.query(value)
         elif isinstance(self.condition, ExistsClause):
             value = self.condition.process(instrucction)
             try:
                 value_aux = value
                 result = table.columns.intersection(value_aux.columns)
                 list_col = list(result)
                 table = table[list_col].isin(value_aux[list_col])
             except Exception:
                 desc = "FATAL ERROR, murio porque usaste where con columnas de otra tabla, F"
                 ErrorController().add(34, 'Execution', desc, 0, 0)
         elif isinstance(self.condition, list):
             not_c = self.condition[0]
             condition = self.condition[1]
             value = condition.process(instrucction)
             try:
                 value_aux = value
                 result = table.columns.intersection(value_aux.columns)
                 list_col = list(result)
                 table = ~table[list_col].isin(value_aux[list_col])
             except Exception:
                 desc = "FATAL ERROR, murio porque usaste where con columnas de otra tabla, F"
                 ErrorController().add(34, 'Execution', desc, 0, 0)
         # finally, done
         print(table)
         storage_columns(table.values.tolist(), table.columns.tolist(), 0,
                         0)
         storage_table(table.values.tolist(), table.columns.tolist(), name,
                       0, 0)
         return table
     except Exception:
         desc = "FATAL ERROR, murio en Where, F"
         ErrorController().add(34, 'Execution', desc, 0, 0)
Example #17
def names_to_bigquery():  #upload 2008to2017
    for i in range(len(years)):
        data = pd.read_csv(f'./data/yob{years[i]}.txt')
        data = DataFrame(data)

        data.columns = ['name', 'gender', 'count']

        data.to_gbq(destination_table=f'mm.{years[i]}', project_id=project_id)
        print(f"uploaded{years[i]}")
Example #18
File: merge.py  Project: yaduart/pandas
    def get_result(self):

        # series only
        if self._is_series:

            # stack blocks
            if self.axis == 0:
                # concat Series with length to keep dtype as much
                non_empties = [x for x in self.objs if len(x) > 0]
                if len(non_empties) > 0:
                    values = [x._values for x in non_empties]
                else:
                    values = [x._values for x in self.objs]
                new_data = com._concat_compat(values)

                name = com._consensus_name_attr(self.objs)
                return (Series(new_data,
                               index=self.new_axes[0],
                               name=name,
                               dtype=new_data.dtype).__finalize__(
                                   self, method='concat'))

            # combine as columns in a frame
            else:
                data = dict(zip(range(len(self.objs)), self.objs))
                index, columns = self.new_axes
                tmpdf = DataFrame(data, index=index)
                tmpdf.columns = columns
                return tmpdf.__finalize__(self, method='concat')

        # combine block managers
        else:
            mgrs_indexers = []
            for obj in self.objs:
                mgr = obj._data
                indexers = {}
                for ax, new_labels in enumerate(self.new_axes):
                    if ax == self.axis:
                        # Suppress reindexing on concat axis
                        continue

                    obj_labels = mgr.axes[ax]
                    if not new_labels.equals(obj_labels):
                        indexers[ax] = obj_labels.reindex(new_labels)[1]

                mgrs_indexers.append((obj._data, indexers))

            new_data = concatenate_block_managers(mgrs_indexers,
                                                  self.new_axes,
                                                  concat_axis=self.axis,
                                                  copy=self.copy)
            if not self.copy:
                new_data._consolidate_inplace()

            return (self.objs[0]._from_axes(
                new_data, self.new_axes).__finalize__(self, method='concat'))
Example #19
File: strings.py  Project: t1c1/pandas
def str_extract(arr, pat, flags=0):
    """
    Find groups in each string using passed regular expression

    Parameters
    ----------
    pat : string
        Pattern or regular expression
    flags : int, default 0 (no flags)
        re module flags, e.g. re.IGNORECASE

    Returns
    -------
    extracted groups : Series (one group) or DataFrame (multiple groups)


    Notes
    -----
    Compare to the string method match, which returns re.match objects.
    """
    regex = re.compile(pat, flags=flags)

    # just to be safe, check this
    if regex.groups == 0:
        raise ValueError("This pattern contains no groups to capture.")
    elif regex.groups == 1:

        def f(x):
            if not isinstance(x, compat.string_types):
                return None
            m = regex.search(x)
            if m:
                return m.groups()[0]  # may be None
            else:
                return None
    else:
        empty_row = Series(regex.groups * [None])

        def f(x):
            if not isinstance(x, compat.string_types):
                return empty_row
            m = regex.search(x)
            if m:
                return Series(list(m.groups()))  # may contain None
            else:
                return empty_row

    result = arr.apply(f)
    result.replace({None: np.nan}, inplace=True)
    if regex.groups > 1:
        result = DataFrame(result)  # Don't rely on the wrapper; name columns.
        names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
        result.columns = [names.get(1 + i, i) for i in range(regex.groups)]
    else:
        result.name = regex.groupindex.get(0)
    return result
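For reference, equivalent calls on modern pandas, where the expand flag makes explicit the Series-or-DataFrame choice this older version made implicitly:

from pandas import Series

s = Series(['a1', 'b2', 'c3'])
# one capture group: 'c3' has no match and becomes NaN
print(s.str.extract(r'[ab](\d)', expand=False))
# named groups become the column names of the resulting DataFrame
print(s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)'))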
Example #20
 def getPercentile(self, df, trg_percentile):
     
     percentile = df.quantile(trg_percentile, axis=0)
     
     percentile_df = DataFrame(percentile)
     
     column_name = trg_percentile * 100
     percentile_df.columns = [str(column_name)]
     
     return percentile_df
Example #21
    def getPercentile(self, df, trg_percentile):

        percentile = df.quantile(trg_percentile, axis=0)

        percentile_df = DataFrame(percentile)

        column_name = trg_percentile * 100
        percentile_df.columns = [str(column_name)]

        return percentile_df
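Usage sketch for the helper above (Examples #20 and #21 are the same code): df.quantile returns a Series indexed by column name, and wrapping it in DataFrame() yields a one-column frame:

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [10, 20, 30, 40]})
percentile_df = pd.DataFrame(df.quantile(0.95, axis=0))
percentile_df.columns = [str(0.95 * 100)]  # column named '95.0'
print(percentile_df)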
Example #22
 def write_to_csv(self):
     nw_df = DataFrame(list(self.lst))
     nw_df.columns = ['Redirect count','ssl_classification','url_length','hostname_length','subdomain_count','at_sign_in_url','exe_extension_in_request_url','exe_extension_in_landing_url',
                         'ip_as_domain_name','no_of_slashes_in requst_url','no_of_slashes_in_landing_url','no_of_dots_in_request_url','no_of_dots_in_landing_url','tld_value','age_of_domain',
                         'age_of_last_modified','content_length','same_landing_and_request_ip','same_landing_and_request_url']
     frames = [self.df['label'],self.df2['label']]
     new_df = pd.concat(frames)
     new_df = new_df.reset_index()
     nw_df['label'] = new_df['label']
     nw_df.to_csv('dataset1.csv',sep=',', encoding='latin-1')
Example #23
    def get_result(self):

        # series only
        if self._is_series:

            # stack blocks
            if self.axis == 0:
                # concat Series with length to keep dtype as much
                non_empties = [x for x in self.objs if len(x) > 0]
                if len(non_empties) > 0:
                    values = [x._values for x in non_empties]
                else:
                    values = [x._values for x in self.objs]
                new_data = com._concat_compat(values)

                name = com._consensus_name_attr(self.objs)
                return (Series(new_data, index=self.new_axes[0],
                               name=name,
                               dtype=new_data.dtype)
                        .__finalize__(self, method='concat'))

            # combine as columns in a frame
            else:
                data = dict(zip(range(len(self.objs)), self.objs))
                index, columns = self.new_axes
                tmpdf = DataFrame(data, index=index)
                tmpdf.columns = columns
                return tmpdf.__finalize__(self, method='concat')

        # combine block managers
        else:
            mgrs_indexers = []
            for obj in self.objs:
                mgr = obj._data
                indexers = {}
                for ax, new_labels in enumerate(self.new_axes):
                    if ax == self.axis:
                        # Suppress reindexing on concat axis
                        continue

                    obj_labels = mgr.axes[ax]
                    if not new_labels.equals(obj_labels):
                        indexers[ax] = obj_labels.reindex(new_labels)[1]

                mgrs_indexers.append((obj._data, indexers))

            new_data = concatenate_block_managers(
                mgrs_indexers, self.new_axes,
                concat_axis=self.axis, copy=self.copy)
            if not self.copy:
                new_data._consolidate_inplace()

            return (self.objs[0]._from_axes(new_data, self.new_axes)
                    .__finalize__(self, method='concat'))
Example #24
def str_extract(arr, pat, flags=0):
    """
    Find groups in each string using passed regular expression

    Parameters
    ----------
    pat : string
        Pattern or regular expression
    flags : int, default 0 (no flags)
        re module flags, e.g. re.IGNORECASE

    Returns
    -------
    extracted groups : Series (one group) or DataFrame (multiple groups)


    Notes
    -----
    Compare to the string method match, which returns re.match objects.
    """
    regex = re.compile(pat, flags=flags)

    # just to be safe, check this
    if regex.groups == 0:
        raise ValueError("This pattern contains no groups to capture.")
    elif regex.groups == 1:
        def f(x):
            if not isinstance(x, compat.string_types):
                return None
            m = regex.search(x)
            if m:
                return m.groups()[0]  # may be None
            else:
                return None
    else:
        empty_row = Series(regex.groups * [None])

        def f(x):
            if not isinstance(x, compat.string_types):
                return empty_row
            m = regex.search(x)
            if m:
                return Series(list(m.groups()))  # may contain None
            else:
                return empty_row
    result = arr.apply(f)
    result.replace({None: np.nan}, inplace=True)
    if regex.groups > 1:
        result = DataFrame(result)  # Don't rely on the wrapper; name columns.
        names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
        result.columns = [names.get(1 + i, i) for i in range(regex.groups)]
    else:
        result.name = regex.groupindex.get(0)
    return result
Example #25
def Main_Mode_Function(df,GroupKey,JoinKey,Var,Leakage=False):
    tmp = copy.deepcopy(df)
    tmp_list = [list(tmp[JoinKey]),
                list(tmp[GroupKey]),
                list(tmp[Var].astype(str)),
                [0]*tmp.shape[0]
                ]
    data=[]
    for j in range(len(tmp_list[0])):
        data.append([tmp_list[i][j] for i in range(len(tmp_list))])

    GroupKey_bool = None
    start_point = 0
    for i in range(len(data)):
        if data[i][1] != GroupKey_bool:
            if Leakage and i > 0:
                save_point=i
                Leakage_Mode(data,mode_dic,start_point,save_point)
                start_point=save_point
            GroupKey_bool=data[i][1]
            mode_dic={}
            mode_dic[data[i][2]]=mode_dic.get(data[i][2],0)+1
            data[i][2] = [data[i][2]]
            data[i][3] = len([data[i][2]])
        else:
            mode_dic[data[i][2]]=mode_dic.get(data[i][2],0)+1
            mode_value = Get_Mode(mode_dic)
            data[i][2] = mode_value
            data[i][3] = len(mode_value)

    save_point=i+1
    Leakage_Mode(data,mode_dic,start_point,save_point)

    data=DataFrame(data)
    if not Leakage:
        data.columns = [str(JoinKey),str(GroupKey),Var+"_mode",Var+"_mode_count"]
    else:
        data.columns = [str(JoinKey),str(GroupKey),Var+"_mode",Var+"_mode_count",Var+"_mode_Leakage"]
    globals()[Var + "_ModeFrame" ] = data
    print("Done: "+Var+"_ModeFrame")
Example #26
 def getTraceData(self, reportShortName, conceptName, CIK, periodType):
     rs = FactDao.getFactValues2(reportShortName=reportShortName,
                                 conceptName=conceptName,
                                 CIK=CIK,
                                 periodType=periodType)
     rows = rs.fetchall()
     if (len(rows) != 0):
         df = DataFrame(rows)
         df.columns = rs.keys()
         trace = go.Scatter(x=df["date_"], y=df["value"], name=conceptName)
         return trace
     else:
         raise Exception("No data found " + conceptName)
Example #27
 def get_result(self):
     if self._is_series and self.axis == 0:
         new_data = com._concat_compat([x.values for x in self.objs])
         name = com._consensus_name_attr(self.objs)
         return Series(new_data, index=self.new_axes[0], name=name)
     elif self._is_series:
         data = dict(itertools.izip(xrange(len(self.objs)), self.objs))
         tmpdf = DataFrame(data, index=self.new_axes[0])
         tmpdf.columns = self.new_axes[1]
         return tmpdf
     else:
         new_data = self._get_concatenated_data()
         return self.objs[0]._from_axes(new_data, self.new_axes)
Example #28
File: merge.py  Project: da415/pandas
 def get_result(self):
     if self._is_series and self.axis == 0:
         new_data = com._concat_compat([x.values for x in self.objs])
         name = com._consensus_name_attr(self.objs)
         return Series(new_data, index=self.new_axes[0], name=name)
     elif self._is_series:
         data = dict(itertools.izip(xrange(len(self.objs)), self.objs))
         tmpdf = DataFrame(data, index=self.new_axes[0])
         tmpdf.columns = self.new_axes[1]
         return tmpdf
     else:
         new_data = self._get_concatenated_data()
         return self.objs[0]._from_axes(new_data, self.new_axes)
Example #29
def bingxing(filename):
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=ppPj2gyINjoYiqkhsjAnyYDC&client_secret=2Q6tsZrbGsE60pXuoxg5o5AOUDCSMaLP'
    header = {'Content-Type': 'application/json; charset=UTF-8'}
    r = requests.post(host, headers=header)
    r = json.loads(r.text)
    Access_token = r['access_token']

    f = open(filename, 'rb')
    img = base64.b64encode(f.read())
    data = {"image": img, "templateSign": "7dc32854acac2c3bac8d3bb599ceaeca"}
    ocr_host = 'https://aip.baidubce.com/rest/2.0/solution/v1/iocr/recognise?access_token=' + Access_token
    ocr_header = {
        'Content-Type': 'application/x-www-form-urlencoded',
        "apikey": "ppPj2gyINjoYiqkhsjAnyYDC"
    }
    img = requests.post(ocr_host, headers=ocr_header, data=data)
    img = json.loads(img.text)
    ocr_res = img["data"]["ret"]
    sim_res = [i['word'] for i in ocr_res]
    testdata = DataFrame(sim_res[1::2]).T
    testdata.columns = sim_res[0::2]
    testdata = testdata.rename(
        columns={
            '中性细胞比率': '中性粒细胞百分比',
            '淋巴细胞(%)': '淋巴细胞百分比',
            '嗜酸性粒细胞比': '嗜酸性粒细胞百分比',
            '中性细胞数': '中性粒细胞计数',
            '淋巴细胞值': '淋巴细胞数计数',
            '单核细胞百分比': '单核细胞',
            '嗜酸性粒细胞': '嗜酸性粒细胞计数',
            '嗜碱性粒细胞': '嗜碱性粒细胞计数',
            '红细胞平均体积': '平均红细胞体积',
            '平均血红蛋白量': '平均血红蛋白',
            '红细胞分布宽度': '红细胞分布宽度变异系数',
            '平均血小板体积': '血小板平均体积',
            '血小板分布宽度': '血小板平均分布宽度'
        })
    testdata = testdata.apply(pd.to_numeric, errors='ignore')
    xtest = testdata[np.array(rowname)[clf.feature_importances_ >= 0.03]]
    #print(xtest)
    prob = model.predict_proba(xtest).tolist()[0]
    if model.predict(xtest):
        print('该人得有肾病,概率为%f' % prob[1])
    else:
        print('该人未得肾病,概率为%f' % prob[0])
    end = time.perf_counter()
    print('运行时间为' + str(end - start) + '秒')
Example #30
def runModelOnTest(testFilePath):
    classLabels = featureAndValueMapping.get(featureList[len(featureList) -1])
    classLabelCount = len(classLabels)
    ConfusionMatrix = [[0 for x in range(int(classLabelCount))] for x in range(int(classLabelCount))]
    infile = open(testFilePath, 'r')  # 'rU' mode is gone in Python 3.11; don't shadow input()
    csvObject = csv.reader(infile)
    for row in csvObject:
        predictedLabel=classify(row[:len(row)-1])
        ConfusionMatrix[int(row[len(row)- 1])][int(predictedLabel)] += 1
        # print "Actual label : "+row[len(row)- 1]+"Class label : "+classify(row[:len(row)-1])
    df = DataFrame(ConfusionMatrix)
    df.columns = classLabels
    df.index = classLabels
    print(df)
Example #31
 def get_idList(self, bw_id=None):
     with open(self.filename, 'r', encoding='utf-8') as f:
         reader = csv.DictReader(f)
         idList = [row['bw_id'] for row in reader]
         if self.temp:
             # avoid re-crawling duplicates caused by out-of-order retweets
             # a set() would reorder the ids and break checkpoint resume
             df = DataFrame(idList)
             df.columns = ['bw_id']
             df = df.drop_duplicates(keep='last')
             idList = df['bw_id']
             idList = idList.tolist()
             if bw_id:
                 pos = idList.index(bw_id)  # bw_id must be a string
                 idList = idList[pos + 1:]
         return idList
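The comments above are the reason for drop_duplicates over set(): it deduplicates while preserving row order (keeping the last occurrence), so resuming the crawl at a given bw_id still works. A tiny demo:

from pandas import DataFrame

ids = ['a', 'b', 'a', 'c']
deduped = DataFrame(ids, columns=['bw_id']).drop_duplicates(keep='last')
print(deduped['bw_id'].tolist())  # ['b', 'a', 'c'] -- order kept, unlike set(ids)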
Example #32
def toDB_record(functionname, codes, remark):  # 'codes' was named 'list', shadowing the builtin
    try:
        start = datetime.datetime.now()
        if (functionname != 'get_rise'):
            _excutesql("delete from t_record where date=" + c.DATE +
                       " AND type=" + functionname)
    finally:
        if codes:
            df = DataFrame(codes)
            df.columns = ['code']
            df.insert(0, 'date', c.DATE)
            df.insert(2, 'type', functionname)
            df.insert(3, 'remark', remark)
            df.to_sql('t_record', c.ENGINE, if_exists='append')
        end = datetime.datetime.now()
        print("TODB: " + str(end - start))
Example #33
File: merge.py  Project: nitfer/pandas
 def get_result(self):
     if self._is_series and self.axis == 0:
         new_data = com._concat_compat([x.get_values() for x in self.objs])
         name = com._consensus_name_attr(self.objs)
         new_data = self._post_merge(new_data)
         return Series(new_data, index=self.new_axes[0], name=name)
     elif self._is_series:
         data = dict(zip(range(len(self.objs)), self.objs))
         index, columns = self.new_axes
         tmpdf = DataFrame(data, index=index)
         if columns is not None:
             tmpdf.columns = columns
         return tmpdf
     else:
         new_data = self._get_concatenated_data()
         new_data = self._post_merge(new_data)
         return self.objs[0]._from_axes(new_data, self.new_axes)
Example #34
File: merge.py  Project: weilinear/pandas
 def get_result(self):
     if self._is_series and self.axis == 0:
         new_data = com._concat_compat([x.get_values() for x in self.objs])
         name = com._consensus_name_attr(self.objs)
         new_data = self._post_merge(new_data)
         return Series(new_data, index=self.new_axes[0], name=name)
     elif self._is_series:
         data = dict(zip(range(len(self.objs)), self.objs))
         index, columns = self.new_axes
         tmpdf = DataFrame(data, index=index)
         if columns is not None:
             tmpdf.columns = columns
         return tmpdf
     else:
         new_data = self._get_concatenated_data()
         new_data = self._post_merge(new_data)
         return self.objs[0]._from_axes(new_data, self.new_axes)
Example #35
 def do_load(self):
     all_unimported = IncomingSalesforceRecord.get_unimported()
     object_types = all_unimported.select(IncomingSalesforceRecord.object_type).distinct()
     
     for obj in object_types:
         unimported_recs = all_unimported.select().where(IncomingSalesforceRecord.object_type==obj.object_type)
         unimported_dicts = [json.loads(rec.record) for rec in unimported_recs]
         for d in unimported_dicts:
             d['url'] = d['attributes']['url']
             del d['attributes']
             
             for k, v in d.items():  # iteritems() is Python 2 only
                 if isinstance(v, dict):
                     d[k] = json.dumps(v)
                     
         df = DataFrame(unimported_dicts)
         df.columns = [colname.lower() for colname in df.columns]
         table_name = 'sf_%s' % (obj.object_type.lower())
         logger.info('Writing records for Salesforce object %s to db table %s' % (obj.object_type,table_name))
         df.to_sql(table_name, self.engine, flavor='postgresql', if_exists='replace', index=False, index_label=None)
Example #36
def save2DB():
    with open(
            "/Users/admin/Desktop/doc/finance/multifactor/data/industry/sina_config_data.txt",
            'r') as f:
        configstr = f.read().replace("\\'", "'")
        ldict = json.loads(configstr)
        # SWS (Shenwan) level-2 industry classification
        ind = ldict[1][0][1][3][1]
        allNodes = []
        # for ind2 in ind:
        #     ind3 = ind2[1]
        makeTopNode(ind, allNodes, "热门概念")
        data = {'indcode'}
        # print(ldict)
        pdind = DataFrame(allNodes)
        pdind.columns = [
            'indcode', 'indname', 'level', 'par_indcode', 'par_indname',
            'classname'
        ]
        # pdmean.to_sql('statistic2', engine)
        pdind.to_sql('industry', engine, if_exists='append')
Example #37
def parse():
    df_data = pd.read_excel('bin.xlsx')
    # head() returns the first 5 rows by default
    data = df_data.head()
    print(data)
    svc_dict = list()
    for row in df_data.itertuples():
        name = getattr(row, '发卡行名称')
        length = getattr(row, '长度')
        val = getattr(row, '取值')
        c_type = getattr(row, '卡种')
        datepat = re.compile(r'\(.*?\)')
        b_name = re.sub(datepat, '1', name.replace('\n', ''))
        body = (b_name.replace('1', ''), length, val, c_type)
        svc_dict.append(body)

    df_list = DataFrame(svc_dict)
    df_list.columns = ['发卡行名称', '长度', '取值', '卡种']
    df_list.to_csv('bin.csv', encoding='utf_8_sig')

    print("finished")
Example #38
def parse():

    df_list = DataFrame()
    for url in urls:
        response = requests.get(url, headers=headers).text
        # parse the JSON response
        json_str = json.loads(response)
        # top-level title
        title = json_str['info']['title']
        print(title)
        service_path = json_str['paths']
        svc_dict = list()
        for svc, data in service_path.items():
            req = data.get('post')
            req_method = 'post'
            if req == '' or req is None:
                req = data.get('get')
                req_method = 'get'
            if req == '' or req is None:
                req = data.get('put')
                req_method = 'put'
            if req == '' or req is None:
                req = data.get('delete')
                req_method = 'delete'
            if req is not None:
                body = (title, svc, req.get('summary'), req_method)
                svc_dict.append(body)

        if df_list.empty:
            df_list = DataFrame(svc_dict)
        else:
            df_list = df_list.append(DataFrame(svc_dict))

    df_list.columns = ['title', 'url', 'description', 'method']
    if os.path.exists(file_name):
        os.remove(file_name)
    df_list.to_csv('svc.csv', encoding='utf_8_sig')

    print("finished")
Example #39
def ll_to_grid(ll_data_2g):
    """
    grid_num is numbered starting from 1
    :param ll_data_2g:
    :return:
    """

    # y_box_num = int((haversine(lb_Longitude, lb_Latitude, lb_Longitude, rt_Latitude))/per_len) + 1
    # X_box_num = int((haversine(lb_Longitude, lb_Latitude, rt_Longitude, lb_Latitude))/per_len) + 1
    # print(X_box_num)
    # print(y_box_num)
    # print(ll_data_2g)
    ll_data_2g_list = ll_data_2g.values.tolist()  # as_matrix() was removed in pandas 1.0
    for row in ll_data_2g_list:
        lon = row[2]
        lat = row[3]
        # grid_index = calculate_grid(lb_Latitude, lb_Longitude, lat, lon)
        y_length = haversine(lb_Longitude, lb_Latitude, lb_Longitude, lat)
        X_length = haversine(lb_Longitude, lb_Latitude, lon, lb_Latitude)

        y = int(y_length / per_len)
        X = int(X_length / per_len)

        if y_length % per_len != 0:
            y += 1
        if X_length % per_len != 0:
            X += 1

        grid_num = X + (y - 1) * X_box_num
        row.append(grid_num)

    indexs = ll_data_2g.columns.values.tolist()
    indexs.append('grid_num')
    train_data = DataFrame(ll_data_2g_list)
    train_data.columns = indexs

    # print(train_data)
    return train_data
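The grid id computed above is a 1-based, row-major index over X_box_num columns; a worked check with a hypothetical grid width:

X_box_num = 10  # hypothetical number of grid columns
X, y = 3, 2     # third cell in the second row
print(X + (y - 1) * X_box_num)  # 13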
Example #40
    def export_analyze_result(self, arg_symbol):

        if not os.path.exists(DEF_EXPORT_FOLDER_NAME):
            os.makedirs(DEF_EXPORT_FOLDER_NAME)

        main_data = DataFrame(list(self.__data))
        main_data.columns = DEF_STOCK_COULMN_NAME

        result_data = []
        result_data.append(main_data)
        result_data += self.__result

        df = pd.concat(result_data, axis=1)

        filename = datetime.today().strftime(arg_symbol + "_%Y%m%d-%H%M%S.csv")

        try:
            df.to_csv(DEF_EXPORT_FOLDER_NAME + os.sep + filename,
                      sep=',',
                      encoding='utf-8')
        except Exception:
            return None

        return filename
Example #41
File: strings.py  Project: jzwick/pandas
def str_extract(arr, pat, flags=0):
    """
    Find groups in each string using passed regular expression

    Parameters
    ----------
    pat : string
        Pattern or regular expression
    flags : int, default 0 (no flags)
        re module flags, e.g. re.IGNORECASE

    Returns
    -------
    extracted groups : Series (one group) or DataFrame (multiple groups)

    Examples
    --------
    A pattern with one group will return a Series. Non-matches will be NaN.

    >>> Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)')
    0      1
    1      2
    2    NaN
    dtype: object

    A pattern with more than one group will return a DataFrame.
    
    >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)')

    A pattern may contain optional groups.
    
    >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])?(\d)')

    Named groups will become column names in the result.
    
    >>> Series(['a1', 'b2', 'c3']).str.extract('(?P<letter>[ab])(?P<digit>\d)')
    """
    regex = re.compile(pat, flags=flags)

    # just to be safe, check this
    if regex.groups == 0:
        raise ValueError("This pattern contains no groups to capture.")
    elif regex.groups == 1:
        def f(x):
            if not isinstance(x, compat.string_types):
                return None
            m = regex.search(x)
            if m:
                return m.groups()[0]  # may be None
            else:
                return None
    else:
        empty_row = Series(regex.groups * [None])

        def f(x):
            if not isinstance(x, compat.string_types):
                return empty_row
            m = regex.search(x)
            if m:
                return Series(list(m.groups()))  # may contain None
            else:
                return empty_row
    result = arr.apply(f)
    result.replace({None: np.nan}, inplace=True)
    if regex.groups > 1:
        result = DataFrame(result)  # Don't rely on the wrapper; name columns.
        names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
        result.columns = [names.get(1 + i, i) for i in range(regex.groups)]
    else:
        result.name = regex.groupindex.get(0)
    return result
Example #42
File: _base.py  Project: josham/pandas
    def parse(self,
              sheet_name=0,
              header=0,
              names=None,
              index_col=None,
              usecols=None,
              squeeze=False,
              dtype=None,
              true_values=None,
              false_values=None,
              skiprows=None,
              nrows=None,
              na_values=None,
              verbose=False,
              parse_dates=False,
              date_parser=None,
              thousands=None,
              comment=None,
              skipfooter=0,
              convert_float=True,
              mangle_dupe_cols=True,
              **kwds):

        _validate_header_arg(header)

        ret_dict = False

        # Keep sheetname to maintain backwards compatibility.
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheet_name]

        # handle same-type duplicates.
        sheets = list(OrderedDict.fromkeys(sheets).keys())

        output = OrderedDict()

        for asheetname in sheets:
            if verbose:
                print("Reading sheet {sheet}".format(sheet=asheetname))

            if isinstance(asheetname, compat.string_types):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            data = self.get_sheet_data(sheet, convert_float)
            usecols = _maybe_convert_usecols(usecols)

            if sheet.nrows == 0:
                output[asheetname] = DataFrame()
                continue

            if is_list_like(header) and len(header) == 1:
                header = header[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        row += skiprows

                    data[row], control_row = _fill_mi_header(data[row],
                                                             control_row)

                    if index_col is not None:
                        header_name, _ = _pop_header_name(data[row], index_col)
                        header_names.append(header_name)

            if is_list_like(index_col):
                # Forward fill values for MultiIndex index.
                if not is_list_like(header):
                    offset = 1 + header
                else:
                    offset = 1 + max(header)

                # Check if we have an empty dataset
                # before trying to collect data.
                if offset < len(data):
                    for col in index_col:
                        last = data[offset][col]

                        for row in range(offset + 1, len(data)):
                            if data[row][col] == '' or data[row][col] is None:
                                data[row][col] = last
                            else:
                                last = data[row][col]

            has_index_names = is_list_like(header) and len(header) > 1

            # GH 12292 : error when read one empty column from excel file
            try:
                parser = TextParser(data,
                                    names=names,
                                    header=header,
                                    index_col=index_col,
                                    has_index_names=has_index_names,
                                    squeeze=squeeze,
                                    dtype=dtype,
                                    true_values=true_values,
                                    false_values=false_values,
                                    skiprows=skiprows,
                                    nrows=nrows,
                                    na_values=na_values,
                                    parse_dates=parse_dates,
                                    date_parser=date_parser,
                                    thousands=thousands,
                                    comment=comment,
                                    skipfooter=skipfooter,
                                    usecols=usecols,
                                    mangle_dupe_cols=mangle_dupe_cols,
                                    **kwds)

                output[asheetname] = parser.read(nrows=nrows)

                if not squeeze or isinstance(output[asheetname], DataFrame):
                    if header_names:
                        output[asheetname].columns = output[
                            asheetname].columns.set_names(header_names)
                    elif compat.PY2:
                        output[asheetname].columns = _maybe_convert_to_string(
                            output[asheetname].columns)

            except EmptyDataError:
                # No Data, return an empty DataFrame
                output[asheetname] = DataFrame()

        if ret_dict:
            return output
        else:
            return output[asheetname]
Example #43
def strat_maLong_maShort(
    df=readYahoo("SPY"),
    maLongDays=10,
    maShortDays=3,
    closeCol="Close",
    highCol="High",
    lowCol="Low",
    openCol="Open",
    signOfTrade=1,
    printit=True,
    block=False,
):
    """ execute strategy which enters and exit based on Moving Average crossovers
        Example:
            from pystrats.state_strats import strat_maLong_maShort as ss
            dfretfinal = ss() #strat_maLong_maShort()
            print dfretfinal
            print dfretfinal['ret'].mean()
        
    """
    close = np.array(df[closeCol])
    high = np.array(df[highCol])
    low = np.array(df[lowCol])
    open = np.array(df[openCol])
    date = np.array(df["Date"])

    ma10 = rolling_mean(close, maLongDays)
    ma9 = rolling_mean(close, maLongDays - 1)
    ma3 = rolling_mean(close, maShortDays)
    ma2 = rolling_mean(close, maShortDays - 1)

    n = len(df)
    nl = n - 1

    #     pMa10 = dsInsert(ma10[0:nl],0,None)
    #     pMa9 = dsInsert(ma9[0:nl],0,None)
    #     pMa3 = dsInsert(ma3[0:nl],0,None)
    #     pMa2 = dsInsert(ma2[0:nl],0,None)

    pMa10 = np.insert(ma10[0:nl], 0, None)
    pMa9 = np.insert(ma9[0:nl], 0, None)
    pMa3 = np.insert(ma3[0:nl], 0, None)
    pMa2 = np.insert(ma2[0:nl], 0, None)

    pClose = np.insert(close[0:nl], 0, None)
    pHigh = np.insert(high[0:nl], 0, None)
    pLow = np.insert(low[0:nl], 0, None)

    # initialize state vector
    state = np.array([1] * n)

    # loop
    start_i = maLongDays + 1
    for i in range(start_i, n):
        if (pClose[i] < pMa10[i]) & (state[i - 1] == 1) & (high[i] > pMa9[i]):
            state[i] = 2
        elif (state[i - 1] == 2) & (low[i] > pMa2[i]):
            state[i] = 2
        elif (state[i - 1] == 2) & (low[i] <= pMa2[i]):
            state[i] = 1

    pState = np.insert(state[0:nl], 0, 1)

    # create entry conditions
    # 1. initial entry (state 1 to state 2)
    e1_2 = np.array((pState == 1) & (state == 2))
    e2_2 = np.array((pState == 2) & (state == 2))
    e2_1 = np.array((pState == 2) & (state == 1))

    dfret = DataFrame([date, pHigh, pLow, pClose, pMa10, pMa9, pMa3, pMa2]).T
    dfret.columns = ["Date", "pHigh", "pLow", "pClose", "pMa10", "pMa9", "pMa3", "pMa2"]

    # create daily entry prices
    dailyEntryPrices = np.array([0] * n)
    # default entry
    dailyEntryPrices = asb(dailyEntryPrices, pMa9, e1_2)
    useCloseOnEntry = e1_2 & (low > pMa9)
    dailyEntryPrices = asb(dailyEntryPrices, close, useCloseOnEntry)
    dailyEntryPrices = asb(dailyEntryPrices, pClose, e2_2)
    dailyEntryPrices = asb(dailyEntryPrices, pClose, e2_1)
    dfret["entry"] = dailyEntryPrices

    # create DAILY settle prices, which are either 0 or the Close
    # dfret$Close <- close
    dailySettlePrices = np.array([0] * n)
    dailySettlePrices = asb(dailySettlePrices, close, e1_2)  # <- close[w1_2]
    dailySettlePrices = asb(dailySettlePrices, close, e2_2)  # dailySettlePrices[w2_2] <- close[w2_2]
    dailySettlePrices = asb(dailySettlePrices, pMa2, e2_1)  # dailySettlePrices[w2_1] <- pMa2[w2_1]

    # adjust for situations where the high is below the pMa2, so you get out at the close
    useCloseOnExit = e2_1 & (high < pMa2)
    dailySettlePrices = asb(
        dailySettlePrices, close, useCloseOnExit
    )  # dailySettlePrices[useCloseOnExit] <- close[useCloseOnExit]
    dfret["exit"] = dailySettlePrices
    dfret["ret"] = dfret["exit"] / dfret["entry"] - 1

    dfret["ret"].fillna(0)
    dfretfinal = dfret.dropna(0)  # dfretfinal <- dfret[-badrows(dfret),]

    if printit:
        retDf = DataFrame({"Date": dfretfinal["Date"], "ret": dfretfinal["ret"]})
        returnsPerformance(retDf, block=block)
    return dfretfinal
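rolling_mean was removed from pandas long ago; on current pandas the moving averages used above would be computed with Series.rolling, e.g. via a drop-in shim:

import numpy as np
import pandas as pd

def rolling_mean(values, window):
    # substitute for the long-removed pandas rolling_mean helper
    return pd.Series(values).rolling(window).mean().to_numpy()

close = np.array([100.0, 101.0, 102.0, 103.0, 104.0])
print(rolling_mean(close, 3))  # [ nan  nan 101. 102. 103.]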
Example #44
stream = (line.decode('cp1251').strip().encode('utf-8')
          for line in stdin)

# tee the stream to get the metadata for title
stream, stream_2 = tee(stream)

title = get_metadata(stream_2)['TITLE']

df = DataFrame()
for cur_data in iter_contextual_atom_data(stream):
    current = DataFrame.from_dict([cur_data])
    df = df.append(current, ignore_index=False)

index_cols = list(df.columns.values)
index_cols.remove('value')
df.set_index(index_cols, inplace=True)
df.columns = [title]

# create removable temp file for use with HDFStore
tmpfile = NamedTemporaryFile().name

store = HDFStore(tmpfile)
store['default'] = df
store.close()

# put h5 file to stdout
with open(tmpfile, 'rb') as f:
    print(f.read())

# temp file is automatically removed