def get_dataset(path):
    """Build a DataFrame of cleaned MD&A texts with sentiment features.

    Each file under *path* is read, all non-letter characters are collapsed
    to single spaces, and two feature columns are attached: the finance-
    dictionary match score and a TextBlob score (both computed by the
    project-local ``matcher`` module).

    Parameters
    ----------
    path : str
        Directory containing the MD&A text files.

    Returns
    -------
    pandas.DataFrame
        Columns: ``['Filename', 'MatchDico', 'TextBlob']``.

    NOTE(review): a second ``get_dataset`` defined later in this file
    shadows this one at import time — confirm which version is intended.
    """
    dataset = []
    for filename in os.listdir(path):
        # Context manager closes the handle; the original open(...).read()
        # leaked one file descriptor per file.
        with open(os.path.join(path, filename), "r") as fh:
            text = fh.read()
        # Keep letters only; everything else becomes a single space.
        dataset.append([filename, re.sub('[^a-zA-Z]+', ' ', text)])
    # dico is a column with the matching scores of the MDAs versus the
    # Finance Dictionary.
    dico = matcher.get_dico(dataset)
    df = pd.DataFrame(dataset)
    # Overwrites the cleaned-text column (index 1) with the match scores —
    # preserved from the original; get_blob then sees the scored frame.
    df[1] = pd.Series(dico)
    blob = matcher.get_blob(df)
    df[2] = pd.Series(blob)
    df.columns = ['Filename', 'MatchDico', 'TextBlob']
    return df
def get_dataset(path):
    """Load labelled MD&A files from *path*.

    Files whose names end in ``"pos"`` or ``"neg"`` are read; non-letter
    characters are collapsed to single spaces and the ``_pos``/``_neg``
    suffix is stripped from the filename. Files with neither suffix are
    skipped.

    Parameters
    ----------
    path : str
        Directory containing the labelled MD&A text files.

    Returns
    -------
    list of [cleaned_text, base_filename, label] rows, label in
    {"pos", "neg"}.

    NOTE(review): this definition shadows an earlier ``get_dataset`` in
    this file — confirm which version is intended.
    """
    dataset = []
    for filename in os.listdir(path):
        # Determine the label first; the original duplicated the whole
        # read/clean/append body in each branch.
        if filename.endswith("pos"):
            label = "pos"
        elif filename.endswith("neg"):
            label = "neg"
        else:
            continue
        # Context manager closes the handle; the original open(...).read()
        # leaked one file descriptor per file.
        with open(os.path.join(path, filename), "r") as fh:
            text = fh.read()
        dataset.append([
            re.sub('[^a-zA-Z]+', ' ', text),
            # Strip the trailing "_pos"/"_neg" label suffix from the name.
            re.sub(r"(?:_" + label + r")$", '', filename),
            label,
        ])
    return dataset


### Main function

# FEATURE 1 - Match with the McDonald Dictionary
dataset = get_dataset("../mdatest/")
# dico is a column with the matching scores of the MDAs versus the
# Finance Dictionary.
dico = matcher.get_dico(dataset)
df = pd.DataFrame(dataset)
df[3] = pd.Series(dico)
df.columns = ['MD&A_Text', 'Filename', 'Actual', 'MatchDico']

# FEATURE 2 and 3 - Match with the Compustat financial data to get the
# indices 'delta_sales' and 'delta_at'
compustat = pd.read_csv('compustat_filenames.csv', sep=',')
# Merge on the shared 'Filename' key; identical left_on/right_on collapse
# to on=, and the original unused locals de/dt are removed.
ds = pd.merge(df, compustat, on='Filename')

# We split the global matrix "result" into a training and a testing set
train, test = validator.split(ds, 0.5)
# We fit a Random Forest model (n_estimators default=10, min_samples_leaf default=1)