def itemizedBatchDictionary(batch, betas):
    """Build a per-document TF-IDF matrix and vocabulary sub-dictionary.

    Parameters
    ----------
    batch : sequence of documents; each document is a sequence whose last
        element (``doc[-1]``) is its raw text.
    betas : unused; kept so the caller-facing signature is unchanged.

    Returns
    -------
    tuple ``(batch_dict, batch_mat)`` where ``batch_dict`` maps every
    in-vocabulary word seen in the batch to its global ``dictionary``
    entry, and ``batch_mat`` is a docs-by-words DataFrame holding
    ``(1 + log tf) * log(n_docs / df)`` weights.

    NOTE(review): relies on module-level ``dictionary``, ``n_docs`` and the
    ``f10X`` helper — confirm they are in scope at call time. Removed dead
    code from the original: three DataFrames (betas/Xs/X*beta) that were
    never read, and a per-item loop whose result (``itemList``) was
    discarded.
    """
    full_text = ''.join(doc[-1] for doc in batch)
    vocab = list(set(f10X.cleanText(full_text)).intersection(dictionary.keys()))
    row_names = ["doc" + str(i + 1) for i in range(len(batch))]
    batch_mat = pd.DataFrame(
        np.zeros((len(row_names), len(vocab))), index=row_names, columns=vocab)
    batch_dict = {}
    for i, doc in enumerate(batch, start=1):
        row = "doc" + str(i)
        doc_words = list(set(f10X.cleanText(doc[-1])).intersection(vocab))
        freq = collections.Counter(f10X.cleanText(doc[-1]))
        for w in doc_words:
            df = dictionary[w]['ndocs']
            # (1 + log tf) * idf; tf >= 1 here since w appears in this doc.
            batch_mat.loc[row, w] = (1 + np.log(freq[w])) * np.log(n_docs / df)
        batch_dict.update({w: dictionary[w] for w in doc_words})
    return batch_dict, batch_mat
def forwardPropagation(batch, batch_dict, batch_mat, W):
    """Single-linear-layer + softmax forward pass over a batch.

    For each document: gather (pos, neg) word contributions weighted by
    ``batch_mat``, sum them into a 2-vector, apply ``W`` and softmax.

    Returns
    -------
    tuple ``(y, y_hat, X)`` — labels column-stacked, softmax outputs
    column-stacked, and the summed 2-vectors row-stacked.
    """
    labels, predictions, features = [], [], []
    for idx, item in enumerate(batch, start=1):
        raw_text = item[-1]
        label = item[4]
        _price = item[5]  # read but unused; kept for parity with the data layout
        labels.append(label)
        words = list(set(f10X.cleanText(raw_text)))
        row = "doc" + str(idx)
        # Per-word (pos, neg) contributions, weighted by the batch matrix.
        contribs = np.array([
            [batch_mat.at[row, w] * batch_dict[w]['pos'],
             batch_mat.at[row, w] * batch_dict[w]['neg']]
            for w in words if w in batch_dict
        ])
        summed = contribs.sum(axis=0)  # 2-vector summary of the document
        features.append(summed)
        predictions.append(fNN.softmax(W.dot(summed)))
    return (np.column_stack(labels),
            np.column_stack(predictions),
            np.row_stack(features))
def batchDictionary(batch):
    """Build a per-document term-frequency matrix and vocabulary sub-dict.

    Parameters
    ----------
    batch : sequence of documents; each document's last element
        (``doc[-1]``) is its raw text.

    Returns
    -------
    tuple ``(batch_dict, batch_mat)`` — ``batch_dict`` maps each
    in-vocabulary word of the batch to its global ``dictionary`` entry;
    ``batch_mat`` is a docs-by-words DataFrame of raw counts.

    NOTE(review): relies on module-level ``dictionary`` and ``f10X``.
    Removed a dead ``d = {}`` assignment and hoisted the loop-invariant
    row label out of the inner loop.
    """
    full_text = ''.join(doc[-1] for doc in batch)
    vocab = list(set(f10X.cleanText(full_text)).intersection(dictionary.keys()))
    row_names = ["doc" + str(i + 1) for i in range(len(batch))]
    batch_mat = pd.DataFrame(
        np.zeros((len(row_names), len(vocab))), index=row_names, columns=vocab)
    batch_dict = {}
    for i, doc in enumerate(batch, start=1):
        row = "doc" + str(i)
        doc_words = list(set(f10X.cleanText(doc[-1])).intersection(vocab))
        freq = collections.Counter(f10X.cleanText(doc[-1]))
        for w in doc_words:
            batch_mat.loc[row, w] = freq[w]
        batch_dict.update({w: dictionary[w] for w in doc_words})
    return batch_dict, batch_mat
def getOmega(text, dictionary):
    """Compute simple sentiment features for *text*.

    Returns ``[]`` when no token of *text* appears in *dictionary*;
    otherwise a 5-element list:
    ``[token_count, n_sentiment_words, n_pos, n_neg, total_score]``.
    """
    tokens = f10X.cleanText(text)
    scores = [dictionary[w]['Score'] for w in tokens if w in dictionary]
    if not scores:
        return []
    score_arr = np.array(scores)
    return [
        len(tokens),           # total (cleaned) token count
        len(scores),           # tokens carrying a sentiment score
        nPos(score_arr),
        nNeg(score_arr),
        int(sum(score_arr)),   # aggregate sentiment score
    ]
def forwardPropagation(batch, batch_dict, batch_mat, coefficients):
    """Two-layer (linear -> tanh -> linear) forward pass with softmax output.

    Parameters
    ----------
    batch : sequence of items; ``item[-1]`` is raw text, ``item[4]`` the
        price-direction label, ``item[5]`` the price (read, unused).
    batch_dict, batch_mat : word dictionary / weight matrix as produced by
        the batch-dictionary builders in this file.
    coefficients : 4-sequence ``(betas, D, W, C)`` — ``betas`` is unused
        here; ``D`` hidden-layer weights, ``W`` output weights, ``C`` bias.

    Returns
    -------
    tuple ``(y, y_hat, X)`` of column-stacked labels, column-stacked
    softmax outputs, and row-stacked summed feature 2-vectors.

    NOTE(review): this redefines ``forwardPropagation`` (the single-layer
    variant earlier in the file); whichever definition comes last wins —
    confirm that is intended.
    """
    betas = coefficients[0]  # unused; kept for interface parity
    D = coefficients[1]
    W = coefficients[2]
    C = coefficients[3]
    y, y_hat, X = [], [], []
    for i, item in enumerate(batch, start=1):
        text = item[-1]
        price_boolean = item[4]
        price = item[5]  # read but unused; kept for parity with data layout
        y.append(price_boolean)
        words = list(set(f10X.cleanText(text)))
        doc = "doc" + str(i)
        values = np.array([
            [batch_mat.at[doc, w] * batch_dict[w]['pos'],
             batch_mat.at[doc, w] * batch_dict[w]['neg']]
            for w in words if w in batch_dict
        ])
        # Summation layer — 2-vector per document.
        sumValues = values.sum(axis=0)
        X.append(sumValues)
        linlayer1 = D.dot(sumValues)
        # Fix: np.tanh instead of (e^{2a}-1)/(e^{2a}+1). The explicit
        # exponential overflows to inf for large activations, yielding
        # inf/inf = nan; np.tanh is mathematically identical and stable.
        tanh = np.tanh(linlayer1)
        linlayer2 = W.dot(tanh) + C
        y_hat.append(fNN.softmax(linlayer2.T))
    return np.column_stack(y), np.column_stack(y_hat), np.row_stack(X)
def getOmega2(text, dictionary):
    """TF-IDF-weighted sentiment features for *text*.

    Like ``getOmega`` but each word's score is scaled by a
    frequency/IDF weight before aggregation. Returns ``[]`` when no
    sentiment-bearing word is found (or the text has no tokens);
    otherwise ``[len_text, n_sentWords, n_pos, n_neg, score1]``.

    Bug fixed: the original built the weighted ``values`` list and then
    immediately overwrote it with the unweighted comprehension from
    ``getOmega``, silently discarding the entire weighting loop. The
    weighted values are now actually used. Also guards against an empty
    token stream, which previously divided by zero computing the average
    count.
    """
    N = 276880  # corpus size used in the IDF term
    freq = collections.Counter(f10X.cleanText(text))
    if not freq:
        return []
    avg_count = sum(freq.values()) / len(freq)
    values = []
    for word, count in freq.items():
        if word in dictionary:
            score = dictionary[word]['Score']
            # NOTE(review): log(ndocs / N) is negative whenever
            # ndocs < N; a conventional IDF is log(N / ndocs) — confirm
            # the sign is intended before relying on these features.
            weight = ((1 + np.log(count)) / (1 + np.log(avg_count))
                      * np.log(dictionary[word]['ndocs'] / N))
            values.append(score * weight)
    if not values:
        return []
    len_text = len(text)  # NOTE(review): raw string length here, unlike
                          # getOmega which counts cleaned tokens — confirm.
    n_sentWords = len(values)
    score_arr = np.array(values)
    return [len_text, n_sentWords, nPos(score_arr), nNeg(score_arr),
            int(sum(score_arr))]