예제 #1
0
def itemizedBatchDictionary(batch, betas):
    """Build the batch vocabulary and its tf-idf weighted document-term matrix.

    The text of every document is read from ``doc[-1]`` and tokenized with
    ``f10X.cleanText``. Only tokens that are keys of the module-level
    ``dictionary`` are kept.

    Args:
        batch: Sequence of documents; the last element of each one is its text.
        betas: Not used by this function; kept so existing callers still work.

    Returns:
        A ``(batch_dict, batch_mat)`` tuple.

        ``batch_dict`` maps every kept token to its ``dictionary`` entry.

        ``batch_mat`` is a DataFrame with rows ``"doc1"`` .. ``"docN"`` (in
        batch order) and one column per kept token (sorted). A cell holds
        ``(1 + log(tf)) * log(n_docs / ndocs)``, where ``tf`` is the token
        count in that document, ``ndocs`` is read from the token's
        ``dictionary`` entry and ``n_docs`` is a module-level global. Cells of
        tokens that do not occur in a document stay 0.
    """
    # Tokenize each document exactly once; the Counter keys double as the
    # document's token set.
    doc_counts = [collections.Counter(f10X.cleanText(doc[-1])) for doc in batch]

    # Collect the vocabulary per document instead of tokenizing the
    # concatenated texts: joining texts without a separator can fuse the last
    # word of one document with the first word of the next, which would drop
    # real dictionary words from the columns.
    vocab = set()
    for counts in doc_counts:
        vocab.update(counts)
    # Sorted so the column order is reproducible between runs.
    colNames = sorted(vocab.intersection(dictionary.keys()))
    kept = set(colNames)

    rowNames = ["doc" + str(i + 1) for i in range(len(batch))]
    batch_mat = pd.DataFrame(
        np.zeros((len(rowNames), len(colNames))),
        index=rowNames,
        columns=colNames,
    )

    batch_dict = {}
    for rowStr, counts in zip(rowNames, doc_counts):
        for k in kept.intersection(counts):
            entry = dictionary[k]
            # counts[k] >= 1 here, so the log term is always defined.
            batch_mat.at[rowStr, k] = (
                (1 + np.log(counts[k])) * np.log(n_docs / entry['ndocs'])
            )
            batch_dict[k] = entry
    return batch_dict, batch_mat
예제 #2
0
def forwardPropagation(batch, batch_dict, batch_mat, W):
    """Run the summation + linear + softmax forward pass over a batch.

    For each document the distinct tokens of ``item[-1]`` (tokenized with
    ``f10X.cleanText``) that are keys of ``batch_dict`` contribute a pair
    ``[weight * pos, weight * neg]``, where ``weight`` is the document's cell
    in ``batch_mat`` and ``pos`` / ``neg`` come from the token's ``batch_dict``
    entry. The pairs are summed, multiplied by ``W`` and passed through
    ``fNN.softmax``.

    Args:
        batch: Sequence of documents; ``item[4]`` is used as the target value
            and ``item[-1]`` as the text.
        batch_dict: Mapping from token to an entry with ``'pos'`` and
            ``'neg'`` values.
        batch_mat: DataFrame indexed by ``"doc1"`` .. ``"docN"`` (batch order)
            with one column per token of ``batch_dict``.
        W: Weight matrix applied to the 2-element summed vector via
            ``W.dot``.

    Returns:
        A ``(y, y_hat, X)`` tuple: the targets stacked as columns, the softmax
        outputs stacked as columns, and the summed input vectors stacked as
        rows.
    """
    y = []
    y_hat = []
    X = []
    for i, item in enumerate(batch, start=1):
        y.append(item[4])
        words = set(f10X.cleanText(item[-1]))

        # From text to values
        doc = "doc" + str(i)
        values = []
        for w in words:
            if w in batch_dict:
                weight = batch_mat.at[doc, w]
                entry = batch_dict[w]
                values.append([weight * entry['pos'], weight * entry['neg']])

        # Summation layer. reshape(-1, 2) keeps the result a 2-element vector
        # even when no token matched; without it the sum of an empty array is
        # a scalar and the stacking below fails.
        sumValues = np.array(values).reshape(-1, 2).sum(axis=0)
        X.append(sumValues)

        # Linear layer followed by softmax
        y_hat.append(fNN.softmax(W.dot(sumValues)))
    y = np.column_stack(y)
    y_hat = np.column_stack(y_hat)
    # np.row_stack is a deprecated alias of np.vstack.
    X = np.vstack(X)
    return y, y_hat, X
예제 #3
0
def batchDictionary(batch):
    """Build the batch vocabulary and its raw-count document-term matrix.

    The text of every document is read from ``doc[-1]`` and tokenized with
    ``f10X.cleanText``. Only tokens that are keys of the module-level
    ``dictionary`` are kept.

    Args:
        batch: Sequence of documents; the last element of each one is its text.

    Returns:
        A ``(batch_dict, batch_mat)`` tuple.

        ``batch_dict`` maps every kept token to its ``dictionary`` entry.

        ``batch_mat`` is a DataFrame with rows ``"doc1"`` .. ``"docN"`` (in
        batch order) and one column per kept token (sorted); each cell is the
        number of times the token occurs in that document.
    """
    # Tokenize each document exactly once; the Counter keys double as the
    # document's token set.
    doc_counts = [collections.Counter(f10X.cleanText(doc[-1])) for doc in batch]

    # Collect the vocabulary per document instead of tokenizing the
    # concatenated texts: joining texts without a separator can fuse the last
    # word of one document with the first word of the next, which would drop
    # real dictionary words from the columns.
    vocab = set()
    for counts in doc_counts:
        vocab.update(counts)
    # Sorted so the column order is reproducible between runs.
    colNames = sorted(vocab.intersection(dictionary.keys()))
    kept = set(colNames)

    rowNames = ["doc" + str(i + 1) for i in range(len(batch))]
    batch_mat = pd.DataFrame(
        np.zeros((len(rowNames), len(colNames))),
        index=rowNames,
        columns=colNames,
    )

    batch_dict = {}
    for rowStr, counts in zip(rowNames, doc_counts):
        for k in kept.intersection(counts):
            batch_mat.at[rowStr, k] = counts[k]
            batch_dict[k] = dictionary[k]
    return batch_dict, batch_mat
예제 #4
0
def getOmega(text, dictionary):
    """Summarize the sentiment scores of the dictionary words found in a text.

    Args:
        text: Raw text; it is tokenized with ``f10X.cleanText``.
        dictionary: Mapping from token to an entry holding a ``'Score'`` value.

    Returns:
        An empty list when no token of the text is a key of ``dictionary``.
        Otherwise ``[len_text, n_sentWords, n_pos, n_neg, score1]``: the
        number of tokens, the number of tokens found in ``dictionary``, the
        results of ``nPos`` and ``nNeg`` on the array of their scores, and the
        sum of those scores truncated to ``int``.
    """
    tokens = f10X.cleanText(text)

    # One score per matching token occurrence (repeats are counted).
    scores = []
    for token in tokens:
        if token in dictionary:
            scores.append(dictionary[token]['Score'])

    if not scores:
        return []

    score_array = np.array(scores)
    return [
        len(tokens),
        len(scores),
        nPos(score_array),
        nNeg(score_array),
        int(sum(score_array)),
    ]
예제 #5
0
def forwardPropagation(batch, batch_dict, batch_mat, coefficients):
    """Run the summation + tanh hidden layer + softmax forward pass.

    For each document the distinct tokens of ``item[-1]`` (tokenized with
    ``f10X.cleanText``) that are keys of ``batch_dict`` contribute a pair
    ``[weight * pos, weight * neg]``, where ``weight`` is the document's cell
    in ``batch_mat`` and ``pos`` / ``neg`` come from the token's ``batch_dict``
    entry. The summed pair ``x`` is transformed as
    ``softmax((W . tanh(D . x) + C).T)`` using ``fNN.softmax``.

    Args:
        batch: Sequence of documents; ``item[4]`` is used as the target value
            and ``item[-1]`` as the text.
        batch_dict: Mapping from token to an entry with ``'pos'`` and
            ``'neg'`` values.
        batch_mat: DataFrame indexed by ``"doc1"`` .. ``"docN"`` (batch order)
            with one column per token of ``batch_dict``.
        coefficients: Indexable holding ``D`` at position 1, ``W`` at
            position 2 and ``C`` at position 3. Position 0 is not used here.

    Returns:
        A ``(y, y_hat, X)`` tuple: the targets stacked as columns, the softmax
        outputs stacked as columns, and the summed input vectors stacked as
        rows.
    """
    D = coefficients[1]
    W = coefficients[2]
    C = coefficients[3]
    y = []
    y_hat = []
    X = []
    for i, item in enumerate(batch, start=1):
        y.append(item[4])
        words = set(f10X.cleanText(item[-1]))

        # From text to values
        doc = "doc" + str(i)
        values = []
        for w in words:
            if w in batch_dict:
                weight = batch_mat.at[doc, w]
                entry = batch_dict[w]
                values.append([weight * entry['pos'], weight * entry['neg']])

        # Summation layer. reshape(-1, 2) keeps the result a 2-element vector
        # even when no token matched; without it the sum of an empty array is
        # a scalar and the stacking below fails.
        sumValues = np.array(values).reshape(-1, 2).sum(axis=0)
        X.append(sumValues)

        # NN layers. np.tanh is used instead of (e^{2x} - 1) / (e^{2x} + 1):
        # that form overflows to inf / inf = NaN for large activations.
        hidden = np.tanh(D.dot(sumValues))
        linlayer2 = W.dot(hidden) + C

        # Softmax
        y_hat.append(fNN.softmax(linlayer2.T))
    y = np.column_stack(y)
    y_hat = np.column_stack(y_hat)
    # np.row_stack is a deprecated alias of np.vstack.
    X = np.vstack(X)
    return y, y_hat, X
예제 #6
0
def getOmega2(text, dictionary, N=276880):
    """Summarize term-weighted sentiment scores of the dictionary words in a text.

    The text is tokenized with ``f10X.cleanText``. Every distinct token that
    is a key of ``dictionary`` contributes ``Score * weight`` with

        weight = (1 + log(tf)) / (1 + log(avg_tf)) * log(N / ndocs)

    where ``tf`` is the token's count in the text, ``avg_tf`` the mean count
    over all distinct tokens of the text, and ``Score`` / ``ndocs`` come from
    the token's ``dictionary`` entry.

    Args:
        text: Raw text to score.
        dictionary: Mapping from token to an entry with ``'Score'`` and
            ``'ndocs'`` values.
        N: Total number of documents used in the inverse-document-frequency
            term. Defaults to the previously hard-coded 276880.

    Returns:
        An empty list when the text has no tokens or none of them is a key of
        ``dictionary``. Otherwise
        ``[len_text, n_sentWords, n_pos, n_neg, score1]``: the number of
        tokens, the number of distinct tokens found in ``dictionary``, the
        results of ``nPos`` and ``nNeg`` on the array of weighted scores, and
        the sum of the weighted scores truncated to ``int``.
    """
    tokens = f10X.cleanText(text)
    freq = collections.Counter(tokens)
    if not freq:
        # No tokens at all: nothing to score (and avg_count would divide by 0).
        return []

    avg_count = sum(freq.values()) / len(freq)
    norm = 1 + np.log(avg_count)

    values = []
    for key, count in freq.items():
        if key in dictionary:
            entry = dictionary[key]
            # Inverse document frequency is log(N / ndocs). With the ratio the
            # other way round every weight is <= 0, which flips the sign of
            # the scores handed to nPos / nNeg.
            weight = (1 + np.log(count)) / norm * np.log(N / entry['ndocs'])
            values.append(entry['Score'] * weight)

    # The weighted values are used as-is; rebuilding the list from the raw
    # text would discard the weighting and iterate over single characters.
    if not values:
        return []

    values = np.array(values)
    return [
        len(tokens),
        len(values),
        nPos(values),
        nNeg(values),
        int(sum(values)),
    ]