Example #1
def main():
    
  # to parse the arguments that are passed to main
  parser = argparse.ArgumentParser(description=__doc__, 
                                   formatter_class=argparse.RawDescriptionHelpFormatter)
    
  # this is the default input directory if nothing is passed
  INPUT_FILE = os.path.join("..", "..", "data", "iTrain.csv") 
  OUTPUT_FILE = os.path.join("..", "..", "data", "folds", "iTrain.csv") 

  parser.add_argument('-i', 
                      '--input-file', 
                      type=str, 
                      dest='input_file',
                      default=INPUT_FILE, 
                      help="File to be processed. Defaults to '%(default)s'")

  parser.add_argument('--ns', 
                      '--num-samples', 
                      dest='num_samples', 
                      default=None,
                      type=int, 
                      help='The number of samples')

  parser.add_argument('--nf', 
                      '--num-folds', 
                      dest='num_folds', 
                      default=5,
                      type=int, 
                      help='The number of folds')
     
  parser.add_argument('-o', 
                      '--output-file', 
                      type=str, 
                      dest='output_file', 
                      default=OUTPUT_FILE, 
                      help="File to be used to save the fold indices. Defaults to '%(default)s'")

    
  args = parser.parse_args()
      
  # read the data    
  if args.num_samples is None:
    data_in = pandas.read_csv(args.input_file, encoding="ISO-8859-1")
    num_samples = len(data_in)
  else:
    num_samples = args.num_samples  

  # generate the folds
  from random import shuffle
  indices_list = list(range(0, num_samples))
  shuffle(indices_list)
  folds_indices = [x % args.num_folds + 1 for x in indices_list]

  # write the indices to file
  helpers.ensure_dir(os.path.dirname(args.output_file))
  folds_indices_towrite = pandas.Series(numpy.array(folds_indices))
  folds_indices_towrite.to_csv(args.output_file, index=False) # write to file, but don't give row names
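A minimal standalone sketch of the fold-assignment rule used above: the row indices are shuffled and each index is mapped to a fold label in 1..num_folds via x % num_folds + 1, giving folds of near-equal size (the toy sizes below are illustrative).

# Standalone sketch of the fold-assignment rule (standard library only).
from random import shuffle

num_samples, num_folds = 10, 5
indices = list(range(num_samples))
shuffle(indices)                                # random permutation of the row indices
folds = [x % num_folds + 1 for x in indices]    # fold label in 1..num_folds for each row
print(folds)                                    # e.g. [3, 1, 5, 2, ...]; each fold gets ~num_samples/num_folds rows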
Example #2
def main():
    
  # to parse the arguments that are passed to main
  parser = argparse.ArgumentParser(description=__doc__, 
                                   formatter_class=argparse.RawDescriptionHelpFormatter)
  INPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchy-word-features.pkl")
  OUTPUT_FILE = os.path.join("..", "..", "data", "params", 'params-decisiontree.pkl')

  parser.add_argument('-i', 
                      '--input-file', 
                      type=str, 
                      dest='input_file',
                      default=INPUT_FILE, 
                      help="File with the list of item classes and features. Defaults to '%(default)s'")

  parser.add_argument('-c', 
                      '--classifier', 
                      type=str, 
                      dest='classifier',
                      default="decision-tree", 
                      help="The classifier to be used. Defaults to '%(default)s'")

  parser.add_argument('-o', 
                      '--output-file', 
                      type=str, 
                      dest='output_file', 
                      default=OUTPUT_FILE, 
                      help="A file to output the predicted labels. Defaults to '%(default)s'")

  

  args = parser.parse_args()

  # Read the input dictionary
  type_classes, source_classes, token_container = pickle.load(open(args.input_file, "rb"))
  type_dict = {'IGI': 0, 'IC': 1, 'IV': 2, 'IG': 3}
  # get all the label data
  labels_orig = [type_dict[x] for x in type_classes] 
  data_orig = token_container

  if args.classifier == 'decision-tree':
    clf = tree.DecisionTreeClassifier(criterion='entropy')
    parameters = {
      'clf__max_depth': [i for i in range(25, 200, 25)],
    }
  elif args.classifier == 'random-forest':
    clf = ensemble.RandomForestClassifier(criterion='entropy')
    parameters = {
      'clf__max_depth': [i for i in range(25, 200, 25)],
    }
  else: # 'logistic-regression'
    clf = linear_model.LogisticRegression()
    parameters = {
      'clf__C': [0.5, 1, 5, 10],
    }

  ppl = pipeline.Pipeline([
    ('vectorizer', feature_extraction.DictVectorizer(sparse=True)), #sparse=True
    ('clf', clf),
  ])
  

  gs = grid_search.GridSearchCV(ppl, parameters, verbose=1, cv=5)
  gs.fit(data_orig, labels_orig)

  print(gs.best_params_, gs.best_score_)

  helpers.ensure_dir(os.path.dirname(args.output_file))
  pickle.dump([gs.best_params_, gs.best_score_], open(args.output_file, "wb"))
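For readers unfamiliar with the 'clf__max_depth' keys above: in a scikit-learn Pipeline, grid-search parameters are addressed as <step name>__<parameter>. A hedged, self-contained sketch with toy data (note that newer scikit-learn exposes GridSearchCV from sklearn.model_selection rather than the sklearn.grid_search module used here):

# Toy sketch of the <step>__<param> naming convention used by GridSearchCV over a Pipeline.
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV   # sklearn.grid_search in older releases
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

data = [{"grants": 1, "donations": 2}, {"trading": 1}, {"grants": 3}, {"donations": 1}]
labels = [0, 1, 0, 1]

ppl = Pipeline([
    ("vectorizer", DictVectorizer(sparse=True)),
    ("clf", DecisionTreeClassifier(criterion="entropy")),
])
# "clf__max_depth" targets the max_depth parameter of the pipeline step named "clf".
gs = GridSearchCV(ppl, {"clf__max_depth": [2, 4, 8]}, cv=2)
gs.fit(data, labels)
print(gs.best_params_, gs.best_score_)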
Example #3
def main():
    
  # to parse the arguments that are passed to main
  parser = argparse.ArgumentParser(description=__doc__, 
                                   formatter_class=argparse.RawDescriptionHelpFormatter)
  INPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchy-word-features.pkl")
  OUTPUT_FILE = os.path.join("..", "..", "data", "params", 'params-decisiontree.pkl')

  parser.add_argument('-i', 
                      '--input-file', 
                      type=str, 
                      dest='input_file',
                      default=INPUT_FILE, 
                      help="File with the list of item classes and features. Defaults to '%(default)s'")

  parser.add_argument('-n',
                      '-ngrams',
                      dest = 'nGrams',
                      type = int,
                      default = 1, 
                      help = 'Defines how to split words by ngrams. Default is tokenized to one word ngrams'
                      )

  parser.add_argument('--ti', 
                      dest='tf_idf', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, TfIdf features will be used')

  parser.add_argument('-b', 
                      dest='binary', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, CountVectorizer will use binary counts instead of frequency counts')


  parser.add_argument('-c', 
                      '--classifier', 
                      type=str, 
                      dest='classifier',
                      default="decision-tree", 
                      help="The classifier to be used. Defaults to '%(default)s'")

  parser.add_argument('--cat',
                      type=str, 
                      dest='category', 
                      default='income-type',
                      choices=('income-type','income-source','expenditure-type'),
                      help="The type of categorization. Defaults to '%(default)s'")

  parser.add_argument('-o', 
                      '--output-file', 
                      type=str, 
                      dest='output_file', 
                      default=OUTPUT_FILE, 
                      help="A file to output the predicted labels. Defaults to '%(default)s'")

  

  args = parser.parse_args()

  # Read the input dictionary
  
  data_in = pandas.read_pickle(args.input_file)
  type_classes = list(data_in['type_class'])
  source_classes = list(data_in['source_class'])
  frID = list(data_in['frID'])
  data_orig = data_in['description']

  if args.category == 'income-type' or args.category == 'expenditure-type':
    labels_orig = [str(i) for i in type_classes] # converting them to strings if they are not strings already
  else:
    labels_orig = [str(i) for i in source_classes] # converting them to strings if they are not strings already

  if args.classifier == 'decision-tree':
    clf = tree.DecisionTreeClassifier(criterion='entropy')
    parameters = {
      'clf__max_depth': [i for i in range(25, 200, 25)],
    }
  elif args.classifier == 'random-forest':
    clf = ensemble.RandomForestClassifier(criterion='entropy')
    parameters = {
      'clf__max_depth': [i for i in range(25, 200, 25)],
    }
  elif args.classifier == 'logistic-regression':
    clf = linear_model.LogisticRegression()
    parameters = {
      'clf__C': [0.5, 1, 5, 10],
    }
  else: # SVM
    clf = svm.SVC()
    parameters = {
      'clf__C': [0.5, 1.0, 5.0, 10],
    }

  vectorizer = feature_extraction.text.CountVectorizer( analyzer='word', # whether features should be made of word or char n-grams
                 binary=args.binary, # if True all non-zero counts are set to one - used for probabilistic mapping
                 decode_error= 'strict', # Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given encoding
                 #dtype='numpy.int64', # Type of the matrix returned by fit_transform() or transform()
                 encoding="ISO-8859-15", # 
                 input='content', # can be 'file', 'filename' or 'content'
                 lowercase=False, #Convert all characters to lowercase before tokenizing. 
                 max_df=1.0, # When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None."
                 max_features=None, # If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus. This parameter is ignored if vocabulary is not None.
                 ngram_range=(1, args.nGrams), # The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used.
                 preprocessor=None, # Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps.
                 stop_words=None, #     
                 min_df=1,
                 strip_accents=None, 
                 token_pattern = '(?u)\\b\\w\\w+\\b',
                 tokenizer=None, 
                 vocabulary=None )

  if args.tf_idf:
    transformer = feature_extraction.text.TfidfTransformer()
    ppl = pipeline.Pipeline([
      ('vectorizer', vectorizer),
      ('transformer', transformer),
      ('clf', clf),
    ])
  else:
    ppl = pipeline.Pipeline([
      ('vectorizer', vectorizer),
      ('clf', clf),
    ])
  

  k_fold = cross_validation.StratifiedKFold(labels_orig, 5, shuffle=True)
  gs = grid_search.GridSearchCV(ppl, parameters, verbose=3, cv=k_fold)
  gs.fit(data_orig, labels_orig)

  print(gs.best_params_, gs.best_score_)

  helpers.ensure_dir(os.path.dirname(args.output_file))
  pickle.dump([gs.best_params_, gs.best_score_], open(args.output_file, "wb" ))
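As an aside on the pipeline above: when --ti is set, the CountVectorizer + TfidfTransformer pair is equivalent to scikit-learn's combined TfidfVectorizer with matching parameters. A small sketch with toy documents (not the project's data):

# Toy check that CountVectorizer followed by TfidfTransformer matches TfidfVectorizer.
import numpy
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline

docs = ["grants and donations", "trading income", "grants from government"]

two_stage = Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 1), lowercase=False)),
    ("transformer", TfidfTransformer()),
])
combined = TfidfVectorizer(ngram_range=(1, 1), lowercase=False)

print(numpy.allclose(two_stage.fit_transform(docs).toarray(),
                     combined.fit_transform(docs).toarray()))  # True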
Example #4
def main():

    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    INPUT_FILE = os.path.join("..", "..", "data", "iTrain.csv")
    OUTPUT_FILE = os.path.join("..", "..", "data", "features",
                               "data-frame.csv")

    parser.add_argument('--i',
                        '--input-file',
                        type=str,
                        dest='inputFile',
                        default=INPUT_FILE,
                        help='File to be processed to boolean matrix')

    parser.add_argument('-l',
                        '--lemmatize',
                        dest='lemmatize',
                        action='store_true',
                        default=False,
                        help='Boolean - If set, verbs will be lemmatized')

    parser.add_argument('-la',
                        '--lemmatizeall',
                        dest='lemmatizeall',
                        action='store_true',
                        default=False,
                        help='Boolean - If set, all words will be lemmatized')

    parser.add_argument(
        '-lc',
        '--lower-case',
        dest='lowerCase',
        action='store_true',
        default=True,
        help='Boolean - Defaults to converting all text to lower-case')

    parser.add_argument(
        '-rw',
        '--remove-words',
        dest='removeWords',
        action='store_true',
        default=None,
        help='Accepts a list of types of words to be removed e.g. ...')

    parser.add_argument('-s',
                        '--stematize',
                        dest='stematize',
                        action='store_true',
                        default=False,
                        help='Boolean - If set, all words will be stematized')

    parser.add_argument(
        '--sa',
        '--strip-accents',
        dest='stripAccents',
        action='store_true',
        default=False,
        help=
        "Removes accents on letters replacing them with just the letter itself"
    )

    parser.add_argument(
        '--sp',
        '--spelling-corrector',
        dest='spellCorrect',
        action='store_true',
        default=False,
        help=
        "Correct spelling mistakes word by word, just taking the most likely correction"
    )

    parser.add_argument(
        '--sw',
        '--stop-words',
        dest='stopWords',
        action='store_true',
        default=False,
        help='Removes the most common words, "stop words", from the text')

    parser.add_argument('-t',
                        '-tokenize',
                        dest='token',
                        action='store_true',
                        default=False,
                        help='Tokenizes text to individual words')

    parser.add_argument(
        '--ta',
        '--alpha-numeric',
        dest='alphaNumeric',
        action='store_false',
        default=True,
        help=
        'Boolean - If NOT set file will be tokenized and non alpha-numeric words left in. Default is TRUE'
    )

    parser.add_argument('--th',
                        '--token-hyphen',
                        dest='tokenHyphen',
                        action='store_true',
                        default=False,
                        help='Tokenizes text using the hierarchy structure')

    parser.add_argument(
        '-uc',
        '--upper-case',
        dest='upperCase',
        action='store_true',
        default=False,
        help='Boolean - If set, all words will be converted to upper-case')

    parser.add_argument(
        '-o',
        '--output-file',
        type=str,
        dest='output_file',
        default=OUTPUT_FILE,
        help=
        "Directory to be used to save the created master set. Filename will be automatically created based on input flags. Defaults to something needsto go here"
    )

    args = parser.parse_args()

    data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-1")

    #set column names?
    words = pandas.DataFrame({
        'type': data_in.type_class,
        'class': data_in.source_class,
        'description': data_in.description
    })

    #print(words.head())

    #=============================================================================
    # specify data frame of the required length
    #=============================================================================

    #==============================================================================
    #  # frame_len = 0
    #   #for x in range(1, len(words)):
    #
    #   #  temp_len = len(words.description[x].split())
    #    # if temp_len > frame_len:
    #         frame_len = temp_len
    #         print(frame_len)
    #         print(words.description[x].split())
    #
    #==============================================================================
    processed_data = pandas.DataFrame(data_in[['type_class', 'source_class']])

    # function calls need to be edited to send the relevant columns (excluding type and source).
    # it will require a restructuring of data types. Not sure if can use a dynamic data frame
    # as don't know how many words in each row - also will be different depending on processing

    # define word_list so the later processing steps always have input to work on
    word_list = words.description

    #tokenize the text either straight or keeping only alpha-numeric(default)
    if args.alphaNumeric:

        # keep just the alpha-numeric characters
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'\w+')
        word_list = list(map(tokenizer.tokenize, words.description))
        #print("alpha numeric only")
        #print(word_list[1:20])


#  else:
#   word_list = list(map(nltk.word_tokenize, words.description))
#print(word_list[1:20])

    if args.tokenHyphen:
        #print("STEMATIZE")
        word_list = list(
            map(text_processing.tokenize_on_hyphen, words.description))
        #print(word_list[1:20])
    # lower case
    if args.lowerCase:
        #print("LOWER CASE")
        word_list = list(map(text_processing.make_lower, word_list))
        #print(word_list[1:20])

    # Upper case
    if args.upperCase:
        #print("UPPER CASE")
        word_list = list(map(text_processing.make_upper, word_list))
        #print(word_list[1:20])

    if args.lemmatizeall:
        #print("LEMMATIZE ALL")
        word_list = list(map(text_processing.lemmatizeall, word_list))
    #print(word_list[1:20])

    if args.lemmatize:
        #print("LEMMATIZE")
        word_list = list(map(text_processing.lemmatize, word_list))
        #print(word_list[1:20])

    if args.removeWords:
        print("REMOVE WORDS")
        #needs function in text_processing

    if args.stematize:
        #print("STEMATIZE")
        word_list = list(map(text_processing.stematize, word_list))
        #print(word_list[1:20])

    if args.stopWords:
        #print("STOP WORDS")
        word_list = list(map(text_processing.exclude_stop_words, word_list))
        #print(word_list[1:20])

    wl_df = pandas.DataFrame(word_list)
    frames = [processed_data, wl_df]

    output_df = pandas.concat(frames, axis=1)

    #print(output_df[1:20])

    helpers.ensure_dir(os.path.dirname(args.output_file))
    output_df.to_csv(args.output_file,
                     index=False)  # write to file, but don't give row names
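A small sketch of what the alpha-numeric tokenization step above does to a description string (requires nltk; the input string is illustrative):

# Sketch of the RegexpTokenizer(r'\w+') step: keeps runs of word characters, drops punctuation.
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
print(tokenizer.tokenize("Grants & donations (2015-16)"))
# ['Grants', 'donations', '2015', '16']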
Example #5
def main():

    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    INPUT_FILE = os.path.join("..", "..", "data", "features",
                              "hierarchy-word-features.pkl")
    OUTPUT_DIR = os.path.join("..", "..", "data", "classifiers",
                              "decision_trees")
    FOLD_FILE = os.path.join("..", "..", "data", "folds", "newfolds.csv")

    parser.add_argument(
        '-i',
        '--input-file',
        type=str,
        dest='input_file',
        default=INPUT_FILE,
        help=
        "File with the list of item classes and features. Defaults to '%(default)s'"
    )

    parser.add_argument(
        '--fn',
        '--fold-number',
        dest='fold_number',
        default=None,
        type=int,
        help=
        "The fold number to be EXCLUDED when creating the master set (If there are N folds, N-1 folds are used for creating the master set). If 0, all the items will be considered. If None, will train the classifier for all the folds in a loop. Defaults to '%(default)s'"
    )

    parser.add_argument(
        '--ff',
        '--fold-file',
        type=str,
        dest='foldFile',
        default=FOLD_FILE,
        help=
        "Fold file containing the cross-fold validation indices. Defaults to '%(default)s'"
    )

    parser.add_argument(
        '-o',
        '--output-dir',
        type=str,
        dest='output_dir',
        default=OUTPUT_DIR,
        help="Directory to save the decision trees. Defaults to '%(default)s'")

    args = parser.parse_args()

    # Read the input dictionary
    type_classes, source_classes, token_container = pickle.load(
        open(args.input_file, "rb"))
    type_dict = {'IGI': 0, 'IC': 1, 'IV': 2, 'IG': 3}
    # get all the label data
    labels_orig = [type_dict[x] for x in type_classes]

    # Create output directory
    helpers.ensure_dir(args.output_dir)

    dtree = tree.DecisionTreeClassifier(random_state=0,
                                        max_depth=100,
                                        criterion='entropy')
    vectorizer = feature_extraction.DictVectorizer(sparse=True)

    if args.fold_number is None:  # loop over all folds and create the classifier
        print("Training will be done iteratively for all folds...\n")
        cross_fold_indices = pandas.read_csv(args.foldFile, header=None)[0]
        for fold in cross_fold_indices.unique():
            print("Training classifier for fold %d...\n" % fold)

            data_fold = [
                token_container[i] for i in range(len(cross_fold_indices))
                if cross_fold_indices[i] != fold
            ]
            labels_fold = [
                labels_orig[i] for i in range(len(cross_fold_indices))
                if cross_fold_indices[i] != fold
            ]
            data_matrix = vectorizer.fit_transform(data_fold)
            dtree.fit(data_matrix, labels_fold)
            joblib.dump(
                dtree, os.path.join(args.output_dir, 'tree-fold%d.pkl' % fold))
            joblib.dump(
                vectorizer,
                os.path.join(args.output_dir, 'vectorizer-fold%d.pkl' % fold))
            labels_pred = dtree.predict(data_matrix)
            score = metrics.accuracy_score(labels_fold, labels_pred)
            print("Accuracy on fold %d (train set): %.5f" % (fold, score))
            tree.export_graphviz(
                dtree,
                out_file=os.path.join(args.output_dir,
                                      'tree-fold%d.dot' % fold),
                max_depth=5)  #, feature_names = master_in.values)

    elif args.fold_number == 0:  # use all the data to train the classifier
        print("Training classifier for the full set...\n")
        data_matrix = vectorizer.fit_transform(token_container)
        dtree.fit(data_matrix, labels_orig)
        joblib.dump(dtree, os.path.join(args.output_dir, 'tree.pkl'))
        joblib.dump(vectorizer, os.path.join(args.output_dir,
                                             'vectorizer.pkl'))
        tree.export_graphviz(dtree,
                             out_file=os.path.join(args.output_dir,
                                                   'tree.dot'),
                             max_depth=5)  #, feature_names = master_in.values)

    else:  # create classifier for a particular fold
        fold = args.fold_number
        print("Training classifier for fold %d...\n" % fold)
        cross_fold_indices = pandas.read_csv(args.foldFile, header=None)[0]
        data_fold = [
            token_container[i] for i in range(len(cross_fold_indices))
            if cross_fold_indices[i] != fold
        ]
        labels_fold = [
            labels_orig[i] for i in range(len(cross_fold_indices))
            if cross_fold_indices[i] != fold
        ]
        data_matrix = vectorizer.fit_transform(data_fold)
        dtree.fit(data_matrix, labels_fold)
        joblib.dump(
            dtree,
            os.path.join(args.output_dir,
                         'tree-fold%d.pkl' % args.fold_number))
        joblib.dump(
            vectorizer,
            os.path.join(args.output_dir,
                         'vectorizer-fold%d.pkl' % args.fold_number))
        labels_pred = dtree.predict(data_matrix)
        score = metrics.accuracy_score(labels_fold, labels_pred)
        print("Accuracy on fold %d (train set): %.5f" % (fold, score))
Example #6
def main():

    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    INPUT_FILE = os.path.join("..", "..", "data",
                              "iTrain_extra_lastitem_mod_new.csv")  #iTrain.csv
    FOLD_FILE = os.path.join("..", "..", "data", "folds",
                             "iTrain_fold.csv")  #iTrain.csv
    OUTPUT_FILE = os.path.join("..", "..", "data", "features", "new",
                               "dframe_new_iTrain.csv")  #
    BAD_OUTPUT_FILE = os.path.join("..", "..", "data", "features",
                                   "welsh_iTrain.csv")  #
    SPELL_CHECKER_PATH = os.path.join("..", "..", "data", "big.txt")

    parser.add_argument(
        '-i',
        '-input-file',
        type=str,
        dest='inputFile',
        default=INPUT_FILE,
        help=
        'File to be processed to a master set. File must be saved in data and argument structured as - ../../data/yourfilename.csv'
    )

    parser.add_argument(
        '--lan',
        '--language',
        dest='language',
        action='store_true',
        default=False,
        help=
        'Boolean - If set, language will be determined and non-English items will be removed'
    )

    parser.add_argument('-l',
                        '-lemmatize',
                        dest='lemmatize',
                        action='store_true',
                        default=False,
                        help='Boolean - If set, verbs will be lemmatized')

    parser.add_argument('--la',
                        '--lemmatizeall',
                        dest='lemmatizeall',
                        action='store_true',
                        default=False,
                        help='Boolean - If set, all words will be lemmatized')

    parser.add_argument(
        '--lc',
        '--lower-case',
        dest='lowerCase',
        action='store_false',
        default=True,
        help='Boolean - Defaults to converting all text to lower-case')

    parser.add_argument(
        '--rw',
        '--remove-words',
        dest='removeTags',
        nargs='+',
        #action='store_true',
        default=None,
        help=
        'Accepts a list of types of words to be removed from list of ADJ, ADV, CNJ, DET, EX, FW, MOD, N, NP, NUM, PRO, P, TO, UH, V, VD, VG, VN, WH'
    )

    parser.add_argument('-s',
                        '-stematize',
                        dest='stematize',
                        action='store_true',
                        default=False,
                        help='Boolean - If set, all words will be stematized')

    parser.add_argument(
        '--sa',
        '--strip-accents',
        dest='stripAccents',
        action='store_false',
        default=True,
        help=
        "Removes accents on letters replacing them with just the letter itself"
    )

    parser.add_argument(
        '--sp',
        '--spelling-corrector',
        dest='spellCorrect',
        action='store_false',
        default=True,
        help=
        "Correct spelling mistakes word by word, just taking the most likely correction"
    )

    parser.add_argument(
        '--sd',
        '--spell-dictionary',
        dest='spell_dictionary',
        default=SPELL_CHECKER_PATH,
        help=
        "File containing the dictionary to be used for spell-check. Defaults to '%(default)s'"
    )

    parser.add_argument(
        '--sw',
        '--stop-words',
        dest='stopWords',
        action='store_false',
        default=True,
        help='Removes the most common words, "stop words", from the text')

    #  parser.add_argument('-t',
    #                      '-tokenize',
    #                      dest='tokenize',
    #                      action='store_true',
    #                      default=False,
    #                      help='Tokenizes text to individual words')

    parser.add_argument(
        '--ta',
        '--alpha-numeric',
        dest='alphaNumeric',
        action='store_true',
        default=True,
        help=
        'Boolean - If NOT set file will be tokenized and non alpha-numeric words left in. Default is TRUE'
    )

    parser.add_argument(
        '--th',
        '--token-hyphen',
        dest='tokenHyphen',
        action='store_true',
        default=False,
        help='Tokenizes text using the directory structure from input file')

    parser.add_argument(
        '--uc',
        '--upper-case',
        dest='upperCase',
        action='store_true',
        default=False,
        help='Boolean - If set, all words will be converted to upper-case')

    parser.add_argument(
        '--fn',
        '--fold-number',
        dest='fold_number',
        default=0,
        type=int,
        help=
        "The fold number to be EXCLUDED when creating the master set (If there are N folds, N-1 folds are used for creating the master set). If 0, all the items will be considered . Defaults to '%(default)s'"
    )

    parser.add_argument(
        '--ff',
        '--fold-file',
        type=str,
        dest='foldFile',
        default=FOLD_FILE,
        help=
        "Fold file containing the cross-fold validation indices. Defaults to '%(default)s'"
    )

    parser.add_argument(
        '-o',
        '-output-file',
        type=str,
        dest='output_file',
        default=OUTPUT_FILE,
        help=
        "Directory to be used to save the created master set. Filename will be automatically created based on input flags. Defaults to something needsto go here"
    )

    parser.add_argument(
        '--bo',
        '--bad-output-file',
        type=str,
        dest='bad_output_file',
        default=BAD_OUTPUT_FILE,
        help=
        "Directory to be used to save the created welsh data set. Filename will be automatically created based on input flags. Defaults to something needsto go here"
    )

    args = parser.parse_args()

    #  data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-1")
    data_in = pandas.read_csv(args.inputFile,
                              encoding="ISO-8859-15",
                              dtype=str)
    #set column names?
    words = pandas.DataFrame({
        'frID': data_in.frID,
        'type': data_in.type_class,
        'class': data_in.source_class,
        'description': data_in.description,
        'ICNPO_category': data_in.ICNPO_category,
        'nicename': data_in.nicename
    })

    processed_data = pandas.DataFrame(data_in[[
        'frID', 'type_class', 'source_class', 'ICNPO_category', 'nicename'
    ]])

    #Define word_list
    word_list = words.description

    #print(words.head())
    if args.language:
        #check that text is in english and separate
        print('Items that are more likely to be Welsh:')
        langval = text_processing.language(word_list)
        #    for x in range(0, len(word_list)):
        good = numpy.where([x >= 0.01 for x in langval])
        bad = numpy.where([x < 0.01 for x in langval])
        badwords = words.drop(words.index[good])
        words = words.drop(words.index[bad])
        word_list = words.description
        #
        helpers.ensure_dir(os.path.dirname(args.bad_output_file))
        badwords.to_csv(args.bad_output_file,
                        index=False)  # write to file, but don't give row names
        print('Only items with en < 0.01 are taken to be bad')

    #tokenize the text either straight or keeping only alpha-numeric(default)
    if args.alphaNumeric:

        # keep just the alpha-numeric characters
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'\w+')
        word_list = list(map(tokenizer.tokenize, word_list))
        #print("alpha numeric only")
        #print(word_list[1:20])

#  if args.tokenize:
#          word_list = list(map(tokenizer.tokenize, word_list))

#   word_list = list(map(nltk.word_tokenize, words.description))
#print(word_list[1:20])
    if args.tokenHyphen:
        #print("Tokenize on hyphen")
        word_list = list(map(text_processing.tokenize_on_hyphen, word_list))
        #print(word_list[1:20])

    # lower case
    if args.lowerCase:
        #print("LOWER CASE")
        word_list = list(map(text_processing.make_lower, word_list))
        #print(word_list[1:20])

    # Upper case
    if args.upperCase:
        #print("UPPER CASE")
        word_list = list(map(text_processing.make_upper, word_list))
        #print(word_list[1:20])

    if args.stripAccents:
        #print("CORRECT SPELLING")
        i = 0
        for x in range(len(word_list)):
            correctedWords = [
                text_processing.strip_accents(y) for y in word_list[x]
            ]
            word_list[x] = correctedWords
            i = i + 1


#       if i % 100 == 0 : print('row %d'% i)
#    word_list = list(map(text_processing.strip_accents, word_list))
#print(word_list[1:20])

    if args.spellCorrect:
        #print("CORRECT SPELLING")
        word_list = spell_checker.correctall(word_list, args.spell_dictionary)
        #print(word_list[1:20])

    if args.removeTags:
        #print("REMOVE WORDS")
        #needs function in text_processing
        word_list = list(
            map(text_processing.keep_only_specified_tags, word_list,
                args.removeTags))

    if args.stopWords:
        #print("STOP WORDS")
        word_list = list(map(text_processing.exclude_stop_words, word_list))
        #print(word_list[1:20])

    if args.lemmatizeall:
        #print("LEMMATIZE ALL")
        word_list = list(map(text_processing.lemmatizeall, word_list))
        #print(word_list[1:20])

    if args.lemmatize:
        #print("LEMMATIZE")
        word_list = list(map(text_processing.lemmatize, word_list))
        print(word_list[1:20])

    if args.stematize:
        #print("STEMATIZE")
        word_list = list(map(text_processing.stematize, word_list))
        #print(word_list[1:20])

    wl_df = pandas.DataFrame(word_list)
    frames = [processed_data, wl_df]

    output_df = pandas.concat(frames, axis=1)

    #print(output_df[1:20])

    helpers.ensure_dir(os.path.dirname(args.output_file))
    output_df.to_csv(args.output_file,
                     index=False)  # write to file, but don't give row names
Example #7
def main():
    
  # to parse the arguments that are passed to main
  parser = argparse.ArgumentParser(description=__doc__, 
                                   formatter_class=argparse.RawDescriptionHelpFormatter)
    
  # this is the default input directory if nothing is passed
  # this default file still contains I and IO values so needs updating
  INPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchical", "new", "dframe_iTrain.csv")
  OUTPUT_DIR = os.path.join("..", "..", "data", "features")
  OUTPUT_FILENAME = 'temp/data_stri_Please_rename_or_delete_me.p'
  
  parser.add_argument('--ad',
                      '--addData',
                      dest='addData',
                      action='store_false', 
                      default=True, 
                      help='adds charity type and name to description list')   
                      

  parser.add_argument('-i', 
                      '-input-file', 
                      type=str, 
                      dest='inputFile', 
                      default=INPUT_FILE, 
                      help="Dataframe to be processed to a Bag of Words. Defaults to '%(default)s'." )
 
  parser.add_argument('--lc',
                       '--lastCells',
                       dest='lastCells',
                       type = int,
                       default=0,
                       help='If set will extract the n last cells from the dataframe according to the input number. For use with source_class it needs a hierarchically tokenized dataframe'
                       )  
  
  parser.add_argument('--hi',
                      '--hierarchy',
                      dest='hierarchy',
                      action='store_true', 
                      default=False, 
                      help='If set spaces will be removed from within hierarchical items')    
  
  
  parser.add_argument('--wl',
                      '--wordsForLast',
                      dest='wordsForLast',
                      action='store_true', 
                      default=False, 
                      help='Do not concatenate the last hierarchical item so that individual words are kept')
                      
  parser.add_argument('--od', 
                      '--output-dir', 
                      type=str, 
                      dest='output_dir', 
                      default=OUTPUT_DIR, 
                      help="Directory to save the output")    

  parser.add_argument('-o', 
                      '-output-file', 
                      type=str, 
                      dest='outputFilename', 
                      default=OUTPUT_FILENAME, 
                      help="Filename of output file - must be a pickle.p format")                      
                      
  args = parser.parse_args()

  data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-15" )
  data_in_str = pandas.read_csv(args.inputFile, encoding="ISO-8859-15" ,dtype=str)

#  data_in = data_in[data_in.type_class != 'I']
#  data_in = data_in[data_in.type_class != 'IO']
 
  data_stri = data_in_str[['frID','type_class','source_class']]
  data_stri['description'] = 'Nan' 
  sep = " "
  if args.hierarchy:
    print ('in hierarchy separation mode')    
    token_data = data_in.iloc[0:,5:]
    for i in range(0, len(token_data)):
        row=token_data.iloc[i,0:]      
        for j in range(0, len(row)):
            token=row.iloc[j]
            if args.wordsForLast :
                if j < (len(row)-1) :
                    nextToken=row.iloc[j+1]
                    if isinstance(nextToken, float) : 
                        if not math.isnan(nextToken) :
                          if type(token) == str :
                            token=token.replace(" ", "")
                    else :
                        if type(token) == str :
                          token=token.replace(" ", "")
            else :    
                if type(token) == str :
                    token=token.replace(" ", "")
            row[j]=token
        token_data.iloc[i]=row
    data_in.iloc[0:,5:]=token_data#.iloc[0:,0:]

  if args.addData : 
      charity_type=data_in_str.iloc[0:,3]
      charity_name=data_in_str.iloc[0:,4]     
      
   
   
  if args.lastCells == 0: # effectively processing type_class
      print('last cells equals 0')
   
      token_data = data_in.iloc[0:,5:]
      i=0
      for row in range(0, len(token_data)):
          
          token_data_stri = sep.join(map(str, token_data.iloc[row,0:].dropna()))
          if args.addData :
              collapsedname = charity_name.iloc[row].replace(" ", "")
              collapsedtype = charity_type.iloc[row].replace(" ", "")
              all_description=sep.join([token_data_stri,collapsedname,collapsedtype])
              data_stri.iloc[row, 3] = all_description
              i=i+1
              if i % 100 == 0 : print('row %d'% i)
          else :
              
              data_stri.iloc[row, 3] = token_data_stri 

      
  else: # effectively processing for source class as this selects the last n 
        # segments of the hierarchy - needs to be passed the hierarchical dataframe
      #print('last cells equals ', lastCells)

      for row in range(0, len(data_in)):
          
          print(row)
          
          all_cells = list()
          for col in data_in.iloc[int(row), 5:]:
              #print("col: ", col)
              all_cells.append(col)
   
          all_cells = [x for x in all_cells if str(x) != 'nan']
        
          data_stri.iloc[int(row), 3] = sep.join(all_cells[len(all_cells)- args.lastCells:len(all_cells)])    

  helpers.ensure_dir(args.output_dir)

  data_stri.to_pickle(os.path.join(args.output_dir, args.outputFilename))
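A compact sketch of the hierarchy-collapsing rule applied above: NaN cells are dropped, spaces inside each hierarchical level are removed so the level becomes a single token, and the levels are then joined with single spaces into one description string (the row values below are illustrative):

# Sketch of collapsing one hierarchical row into a space-separated description string.
import math

row = ["Voluntary income", "Grants receivable", float("nan"), "Big Lottery Fund"]
sep = " "
tokens = [t.replace(" ", "") for t in row
          if not (isinstance(t, float) and math.isnan(t))]
print(sep.join(tokens))   # 'Voluntaryincome Grantsreceivable BigLotteryFund'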
Example #8
def main():

    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    INPUT_HS = os.path.join("..", "..", "data", "features", "data_frame_hierarchy.csv")
    INPUT_HT = os.path.join("..", "..", "data", "features", "hierarchical-tokens.csv")
    OUTPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchy-features.pkl")

    parser.add_argument(
        "--hs",
        "--hierarchical-split",
        type=str,
        dest="input_hs",
        default=INPUT_HS,
        help="A dataframe file containing the hierarchical level tokens for each item in the dataset. Defaults to '%(default)s'",
    )

    parser.add_argument(
        "--ht",
        "--hierarchical-tokens",
        type=str,
        dest="input_ht",
        default=INPUT_HT,
        help="A file containing the selected hierarchical level tokens. Defaults to '%(default)s'",
    )

    parser.add_argument(
        "-o",
        "-output-file",
        type=str,
        dest="output_file",
        default=OUTPUT_FILE,
        help="A file to output the dictionary. Defaults to '%(default)s'",
    )

    args = parser.parse_args()

    # Read the data
    data_hierarchy = pandas.read_csv(args.input_hs, encoding="ISO-8859-1")

    # Read the tokens
    hierarchy_tokens = pandas.read_csv(args.input_ht, header=None)[0]

    # Remove samples from classes which are noisy and not useful ("I", "IO")
    data_hierarchy = data_hierarchy[data_hierarchy.type_class != "I"]
    data_hierarchy = data_hierarchy[data_hierarchy.type_class != "IO"]

    columns_hierarchy = data_hierarchy.columns

    descr_hierarchy = data_hierarchy[columns_hierarchy[2:]]

    # import ipdb; ipdb.set_trace()
    hierarchy_token_container = []
    for row in descr_hierarchy.iterrows():
        row_dict = {}  # initialize the feature dict for this row
        rowlist = list(row[1])
        rowlist = [x for x in rowlist if str(x) != "nan"]
        rowlist = [x.strip() for x in rowlist]  # remove spaces
        rowlist = [x.strip(string.punctuation) for x in rowlist]  # remove punctuation
        for token in hierarchy_tokens:
            try:
                ind = rowlist.index(token)
                if ind <= 3:
                    row_dict[token] = rowlist.index(token)
                else:
                    row_dict[token] = 4
            except ValueError:
                pass
        hierarchy_token_container += [row_dict]

    # Extracting the sample classes into more convenient format
    type_class_list = list(data_hierarchy["type_class"])
    source_class_list = list(data_hierarchy["source_class"])

    # Save everything all together in a pkl file
    to_dump = [type_class_list, source_class_list, hierarchy_token_container]
    helpers.ensure_dir(os.path.dirname(args.output_file))
    pickle.dump(to_dump, open(args.output_file, "wb"))
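A small sketch of the per-row feature dictionary built in the loop above: each selected hierarchy token that occurs in the row is mapped to its level (0-3), with deeper occurrences capped at 4 (the token list and row values are illustrative):

# Sketch of one row_dict: token -> hierarchy level (capped at 4).
hierarchy_tokens = ["grants", "donations", "trading"]
rowlist = ["income", "donations", "small", "gifts", "aid", "grants"]

row_dict = {}
for token in hierarchy_tokens:
    if token in rowlist:
        ind = rowlist.index(token)
        row_dict[token] = ind if ind <= 3 else 4
print(row_dict)   # {'grants': 4, 'donations': 1}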
Example #9
def main():
    
  # to parse the arguments that are passed to main
  parser = argparse.ArgumentParser(description=__doc__, 
                                   formatter_class=argparse.RawDescriptionHelpFormatter)
    
  # this is the default input directory if nothing is passed
  INPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchy-word-features.pkl")
  OUTPUT_DIR = os.path.join("..", "..", "data", "classifiers", "decision_trees")
  FOLD_FILE = os.path.join("..", "..", "data", "folds", "newfolds.csv") 

  parser.add_argument('-i', 
                      '--input-file', 
                      type=str, 
                      dest='input_file',
                      default=INPUT_FILE, 
                      help="File with the list of item classes and features. Defaults to '%(default)s'")

  parser.add_argument('--fn', 
                      '--fold-number', 
                      dest='fold_number', 
                      default=None,
                      type=int, 
                      help="The fold number to be EXCLUDED when creating the master set (If there are N folds, N-1 folds are used for creating the master set). If 0, all the items will be considered. If None, will train the classifier for all the folds in a loop. Defaults to '%(default)s'")

  parser.add_argument('--ff', 
                      '--fold-file', 
                      type=str, 
                      dest='foldFile', 
                      default=FOLD_FILE, 
                      help="Fold file containing the cross-fold validation indices. Defaults to '%(default)s'")
   
  parser.add_argument('-o', 
                      '--output-dir', 
                      type=str, 
                      dest='output_dir', 
                      default=OUTPUT_DIR, 
                      help="Directory to save the decision trees. Defaults to '%(default)s'")


  args = parser.parse_args()

  # Read the input dictionary
  type_classes, source_classes, token_container = pickle.load(open(args.input_file, "rb"))
  type_dict = {'IGI': 0, 'IC': 1, 'IV': 2, 'IG': 3}
  # get all the label data
  labels_orig = [type_dict[x] for x in type_classes]    
  
  # Create output directory
  helpers.ensure_dir(args.output_dir)

  dtree = tree.DecisionTreeClassifier(random_state=0, max_depth=100, criterion='entropy')
  vectorizer = feature_extraction.DictVectorizer(sparse=True)

  if args.fold_number is None: # loop over all folds and create the classifier
    print("Training will be done iteratively for all folds...\n")
    cross_fold_indices = pandas.read_csv(args.foldFile, header=None)[0]
    for fold in cross_fold_indices.unique():
      print("Training classifier for fold %d...\n" % fold)
      
      data_fold = [token_container[i] for i in range(len(cross_fold_indices)) if cross_fold_indices[i] != fold] 
      labels_fold = [labels_orig[i] for i in range(len(cross_fold_indices)) if cross_fold_indices[i] != fold]
      data_matrix = vectorizer.fit_transform(data_fold)
      dtree.fit(data_matrix, labels_fold)
      joblib.dump(dtree, os.path.join(args.output_dir, 'tree-fold%d.pkl' % fold))
      joblib.dump(vectorizer, os.path.join(args.output_dir, 'vectorizer-fold%d.pkl' % fold))
      labels_pred = dtree.predict(data_matrix)
      score = metrics.accuracy_score(labels_fold, labels_pred)
      print("Accuracy on fold %d (train set): %.5f" % (fold, score))
      tree.export_graphviz(dtree, out_file= os.path.join(args.output_dir, 'tree-fold%d.dot' % fold), max_depth=5)#, feature_names = master_in.values)

  elif args.fold_number == 0: # use all the data to train the classifier
    print("Training classifier for the full set...\n")
    data_matrix = vectorizer.fit_transform(token_container)
    dtree.fit(data_matrix, labels_orig)
    joblib.dump(dtree, os.path.join(args.output_dir, 'tree.pkl')) 
    joblib.dump(vectorizer, os.path.join(args.output_dir, 'vectorizer.pkl')) 
    tree.export_graphviz(dtree, out_file= os.path.join(args.output_dir, 'tree.dot'), max_depth=5)#, feature_names = master_in.values)

  else: # create classifier for a particular fold
    fold = args.fold_number
    print("Training classifier for fold %d...\n" % fold)
    cross_fold_indices = pandas.read_csv(args.foldFile, header=None)[0]
    data_fold = [token_container[i] for i in range(len(cross_fold_indices)) if cross_fold_indices[i] != fold] 
    labels_fold = [labels_orig[i] for i in range(len(cross_fold_indices)) if cross_fold_indices[i] != fold]
    data_matrix = vectorizer.fit_transform(data_fold)
    dtree.fit(data_matrix, labels_fold)
    joblib.dump(dtree, os.path.join(args.output_dir, 'tree-fold%d.pkl' % args.fold_number)) 
    joblib.dump(vectorizer, os.path.join(args.output_dir, 'vectorizer-fold%d.pkl' % args.fold_number)) 
    labels_pred = dtree.predict(data_matrix)
    score = metrics.accuracy_score(labels_fold, labels_pred)
    print("Accuracy on fold %d (train set): %.5f" % (fold, score))
Example #10
def main():

    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    INPUT_HS = os.path.join("..", "..", "data", "features",
                            "data_frame_hierarchy.csv")
    INPUT_HT = os.path.join("..", "..", "data", "features",
                            "hierarchical-tokens.csv")
    OUTPUT_FILE = os.path.join("..", "..", "data", "features",
                               "hierarchy-features.pkl")

    parser.add_argument(
        '--hs',
        '--hierarchical-split',
        type=str,
        dest='input_hs',
        default=INPUT_HS,
        help=
        "A dataframe file containing the hierarchical level tokens for each item in the dataset. Defaults to '%(default)s'"
    )

    parser.add_argument(
        '--ht',
        '--hierarchical-tokens',
        type=str,
        dest='input_ht',
        default=INPUT_HT,
        help=
        "A file containing the selected hierarchical level tokens. Defaults to '%(default)s'"
    )

    parser.add_argument(
        '-o',
        '--output-file',
        type=str,
        dest='output_file',
        default=OUTPUT_FILE,
        help="A file to output the dictionary. Defaults to '%(default)s'")

    args = parser.parse_args()

    # Read the data
    data_hierarchy = pandas.read_csv(args.input_hs, encoding="ISO-8859-1")

    # Read the tokens
    hierarchy_tokens = pandas.read_csv(args.input_ht, header=None)[0]

    # Remove samples from classes which are noisy and not useful ("I", "IO")
    data_hierarchy = data_hierarchy[data_hierarchy.type_class != 'I']
    data_hierarchy = data_hierarchy[data_hierarchy.type_class != 'IO']

    columns_hierarchy = data_hierarchy.columns

    descr_hierarchy = data_hierarchy[columns_hierarchy[2:]]

    #import ipdb; ipdb.set_trace()
    hierarchy_token_container = []
    for row in descr_hierarchy.iterrows():
        row_dict = {}  # initialize the feature dict for this row
        rowlist = list(row[1])
        rowlist = [x for x in rowlist if str(x) != 'nan']
        rowlist = [x.strip() for x in rowlist]  #remove spaces
        rowlist = [x.strip(string.punctuation)
                   for x in rowlist]  # remove punctuation
        for token in hierarchy_tokens:
            try:
                ind = rowlist.index(token)
                if ind <= 3:
                    row_dict[token] = rowlist.index(token)
                else:
                    row_dict[token] = 4
            except ValueError:
                pass
        hierarchy_token_container += [row_dict]

    # Extracting the sample classes into more convenient format
    type_class_list = list(data_hierarchy['type_class'])
    source_class_list = list(data_hierarchy['source_class'])

    # Save everything all together in a pkl file
    to_dump = [type_class_list, source_class_list, hierarchy_token_container]
    helpers.ensure_dir(os.path.dirname(args.output_file))
    pickle.dump(to_dump, open(args.output_file, "wb"))
Example #11
def main():

    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    INPUT_FILE = os.path.join("..", "..", "data", "features", "bow_string_input_dframe.p")
    OUTPUT_FILE = os.path.join("..", "..", "data", "output", "predicted_labels_ensemble.pkl")
    OUTPUT_FILE_DESC = os.path.join("..", "..", "data", "output", "predicted_labels_ensemble.csv")

    parser.add_argument(
        "-i",
        "--input-file",
        type=str,
        dest="input_file",
        default=INPUT_FILE,
        help="File with the list of item classes and features. Defaults to '%(default)s'",
    )

    parser.add_argument(
        "-g",
        "--gridres-file",
        type=str,
        dest="gridres_file",
        default=None,
        nargs="+",
        help="Files with the best parameters of the classifiers in the ensemble. If None, default parameters will be used. The number of files should correspond to the number of classifiers.",
    )

    parser.add_argument(
        "-n",
        "-ngrams",
        dest="nGrams",
        type=int,
        default=1,
        nargs="+",
        help="Defines how to split words by ngrams. Default is tokenized to one word ngrams",
    )

    parser.add_argument(
        "--ti", dest="tf_idf", action="store_true", default=False, help="Boolean - If set, TfIdf features will be used"
    )

    parser.add_argument(
        "--cat",
        type=str,
        dest="category",
        default="income-type",
        choices=("income-type", "income-source", "expenditure-type"),
        help="The type of categorization. Defaults to '%(default)s'",
    )

    parser.add_argument(
        "-c",
        "--classifiers",
        type=str,
        dest="classifiers",
        nargs="+",
        default=["decision-tree", "logistic-regression"],
        help="The classifiers to be used. More than one classifier can be used. The number of classifier should correspond to the number of grid-search parameter files.",
    )

    parser.add_argument(
        "-o",
        "-output-file",
        type=str,
        dest="output_file",
        default=OUTPUT_FILE,
        help="A pickle file to output the predicted labels. Defaults to '%(default)s'",
    )

    parser.add_argument(
        "--od",
        "--output-file-desc",
        type=str,
        dest="output_file_desc",
        default=OUTPUT_FILE_DESC,
        help="A csv file to output the predicted labels. Defaults to '%(default)s'",
    )

    args = parser.parse_args()

    # Read the input dictionary

    data_in = pandas.read_pickle(args.input_file)
    type_classes = list(data_in["type_class"])
    source_classes = list(data_in["source_class"])
    frID = list(data_in["frID"])
    data_orig = data_in["description"]

    if args.category == "income-type" or args.category == "expenditure-type":
        labels_orig = [str(i) for i in type_classes]  # converting them to strings if they are not strings already
    else:
        labels_orig = [str(i) for i in source_classes]  # converting them to strings if they are not strings already

    clfs = []
    for i in range(len(args.classifiers)):
        # If no grid-search result files were given, fall back to default parameters.
        gridres_params = None
        if args.gridres_file is not None:
            gridres_params = pickle.load(open(args.gridres_file[i], "rb"))[0]
        if args.classifiers[i] == "decision-tree":
            max_depth = gridres_params["clf__max_depth"] if gridres_params is not None else 100
            clfs += [tree.DecisionTreeClassifier(max_depth=max_depth, criterion="entropy")]
            print("Include a decision tree with max depth=%d" % max_depth)
        if args.classifiers[i] == "random-forest":
            max_depth = gridres_params["clf__max_depth"] if gridres_params is not None else 100
            clfs += [ensemble.RandomForestClassifier(max_depth=max_depth, criterion="entropy")]
            print("Include a random forest with max depth=%d" % max_depth)
        if args.classifiers[i] == "logistic-regression":
            C = gridres_params["clf__C"] if gridres_params is not None else 1
            clfs += [linear_model.LogisticRegression(C=C)]
            print("Include a logistic regressor with C=%g" % C)

    clf = ensemble_classifier.EnsembleClassifier(clfs=clfs, voting="hard")

    vectorizer = feature_extraction.text.CountVectorizer(
        analyzer="word",  # whether should be made ofword or char n-grams
        binary=False,  # if True all non-zero counts are set to one - used for probabilistic mapping
        decode_error="strict",  # Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given encoding
        # dtype='numpy.int64', # Type of the matrix returned by fit_transform() or transform()
        encoding="ISO-8859-15",  #
        input="content",  # can be 'file', 'filename' or 'content'
        lowercase=False,  # Convert all characters to lowercase before tokenizing.
        max_df=1.0,  # When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None."
        max_features=None,  # If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus. This parameter is ignored if vocabulary is not None.
        ngram_range=(
            1,
            args.nGrams,
        ),  # The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used.
        preprocessor=None,  # Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps.
        stop_words=None,  #
        min_df=1,
        strip_accents=None,
        token_pattern="(?u)\\b\\w\\w+\\b",
        tokenizer=None,
        vocabulary=None,
    )

    if args.tf_idf:
        transformer = feature_extraction.text.TfidfTransformer()
        ppl = pipeline.Pipeline([("vectorizer", vectorizer), ("transformer", transformer), ("clf", clf)])
    else:
        ppl = pipeline.Pipeline([("vectorizer", vectorizer), ("clf", clf)])

    k_fold = cross_validation.StratifiedKFold(labels_orig, 5, shuffle=True)
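    # The old (pre-0.18) sklearn.cross_validation.StratifiedKFold API takes the label list
    # directly and keeps the class proportions roughly equal in each of the 5 folds, so the
    # per-fold accuracies averaged below stay comparable even for imbalanced classes.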

    labels_predicted = [-1] * len(labels_orig)

    accuracy = []

    for train_idx, dev_idx in k_fold:
        data_train = [data_orig[i] for i in train_idx]
        data_dev = [data_orig[i] for i in dev_idx]
        labels_train = [labels_orig[i] for i in train_idx]
        labels_dev = [labels_orig[i] for i in dev_idx]

        ppl.fit(data_train, labels_train)
        predicted_dev = ppl.predict(data_dev)
        labels_predicted = set_all_predicted(predicted_dev, labels_predicted, dev_idx)

        accuracy += [metrics.accuracy_score(labels_dev, predicted_dev)]

    print("Accuracy of the ensemble classifier: %.4f +- %.4f" % (numpy.mean(accuracy), numpy.std(accuracy)))

    # Save the predicted classes
    to_dump = [labels_orig, labels_predicted]
    helpers.ensure_dir(os.path.dirname(args.output_file))
    pickle.dump(to_dump, open(args.output_file, "wb"))

    # create a dataframe to output type class, predicted type class and description data
    if args.category == "income-type" or args.category == "expenditure-type":
        dump_op_desc = pandas.DataFrame(
            {
                "frID": frID,
                "type_class": labels_orig,
                "type_class_predicted": labels_predicted,
                "description": data_orig,
            }
        )
    else:
        dump_op_desc = pandas.DataFrame(
            {
                "frID": frID,
                "source_class": labels_orig,
                "source_class_predicted": labels_predicted,
                "description": data_orig,
            }
        )
    dump_op_desc.to_csv(args.output_file_desc)
Exemplo n.º 12
0
def main():

    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    INPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchy-word-features.pkl")
    CLASSIFIER_DIR = os.path.join("..", "..", "data", "classifiers", "decision_trees")
    FOLD_FILE = os.path.join("..", "..", "data", "folds", "newfolds.csv")
    OUTPUT_DIR = os.path.join("..", "..", "data", "scores", "decision_trees")

    parser.add_argument(
        "-i",
        "--input-file",
        type=str,
        dest="input_file",
        default=INPUT_FILE,
        help="File with the list of item classes and features. Defaults to '%(default)s'",
    )

    parser.add_argument(
        "--fn",
        "--fold-number",
        dest="fold_number",
        default=None,
        type=int,
        help="The fold number to be used for predicition. If 0, all the items will be considered. If None, will train the classifier for all the folds. Defaults to '%(default)s'",
    )

    parser.add_argument(
        "--ff",
        "--fold-file",
        type=str,
        dest="fold_file",
        default=FOLD_FILE,
        help="Fold file containing the cross-fold validation indices. Defaults to '%(default)s'",
    )

    parser.add_argument(
        "--cd",
        "--classifier-dir",
        type=str,
        dest="classifier_dir",
        default=CLASSIFIER_DIR,
        help="Directory where the trained decision trees are stored. Defaults to '%(default)s'",
    )

    # parser.add_argument('--sm',
    #                    '--sparse-matrix',
    #                    dest='sparse_matrix',
    #                    action='store_true',
    #                    default=False,
    #                    help='If set, the data will be transformed into a sparse matrix')

    parser.add_argument(
        "-o",
        "--output-dir",
        type=str,
        dest="output_dir",
        default=OUTPUT_DIR,
        help="Directory to save the scores. Defaults to '%(default)s'",
    )

    args = parser.parse_args()

    # Read the input dictionary
    type_classes, source_classes, token_container = pickle.load(open(args.input_file, "rb"))
    type_dict = {"IGI": 0, "IC": 1, "IV": 2, "IG": 3}

    # get all the label data
    labels_orig = [type_dict[x] for x in type_classes]

    # Create output directory
    helpers.ensure_dir(args.output_dir)

    # Read the fold indices
    cross_fold_indices = pandas.read_csv(args.fold_file, header=None)[0]
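    # The fold file is expected to be a single headerless column with one integer per
    # sample giving its fold number (1..num_folds), as produced by the fold-generation
    # script (x % num_folds + 1 over shuffled indices).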

    if args.fold_number is None:  # loop over all folds and evaluate each one
        print("Evaluation will be done iteratively for all folds...\n")
        pred_labels = {}

        for fold in cross_fold_indices.unique():
            print("Evaluating fold %d...\n" % fold)
            # read the vectorizer and the decision tree
            vectorizer = joblib.load(os.path.join(args.classifier_dir, "vectorizer-fold%d.pkl" % (fold)))
            dtree = joblib.load(os.path.join(args.classifier_dir, "tree-fold%d.pkl" % (fold)))

            # read and transform the data
            data_fold = [token_container[i] for i in range(len(cross_fold_indices)) if cross_fold_indices[i] == fold]
            data_matrix = vectorizer.transform(data_fold)

            pred_labels[fold] = dtree.predict(data_matrix)

    elif args.fold_number == 0:  # evaluate on the full data set
        print("Evaluation will be done on the full set...\n")
        # read the vectorizer and decision tree
        vectorizer = joblib.load(os.path.join(args.classifier_dir, "vectorizer.pkl"))
        dtree = joblib.load(os.path.join(args.classifier_dir, "tree.pkl"))

        # read the data
        data_matrix = vectorizer.transform(token_container)

        pred_labels = dtree.predict(data_matrix)

    else:  # evaluate a particular fold
        fold = args.fold_number
        print("Evaluation for fold %d...\n" % fold)

        # read the vectorizer and the decision tree
        vectorizer = joblib.load(os.path.join(args.classifier_dir, "vectorizer-fold%d.pkl" % (fold)))
        dtree = joblib.load(os.path.join(args.classifier_dir, "tree-fold%d.pkl" % (fold)))

        # read and transform the data
        data_fold = [token_container[i] for i in range(len(cross_fold_indices)) if cross_fold_indices[i] == fold]
        data_matrix = vectorizer.transform(data_fold)

        pred_labels = dtree.predict(data_matrix)

    # do the evaluation
    if args.fold_number is None:  # we iterate over all the folds
        for fold in cross_fold_indices.unique():
            labels_fold = [labels_orig[i] for i in range(len(cross_fold_indices)) if cross_fold_indices[i] == fold]
            pred_labels_fold = pred_labels[fold]
            score = metrics.accuracy_score(labels_fold, pred_labels_fold)
            print("Accuracy on fold %d: %.5f" % (fold, score))
    elif args.fold_number == 0:
        pass
    else:
        fold = args.fold_number
        cross_fold_indices = pandas.read_csv(args.fold_file, header=None)[0]
        labels_fold = [labels_orig[i] for i in range(len(cross_fold_indices)) if cross_fold_indices[i] == fold]
        score = metrics.accuracy_score(labels_fold, pred_labels)
        print("Accuracy on fold %d: %.5f" % (fold, score))

    print("Done!\n")
def main():
    
  # to parse the arguments that are passed to main
  parser = argparse.ArgumentParser(description=__doc__, 
                                   formatter_class=argparse.RawDescriptionHelpFormatter)
  INPUT_FILE = os.path.join("..", "..", "data", "features", "bow_string_input_dframe.p")
  OUTPUT_FILE_DESC = os.path.join("..", "..", "data", "output", 'predicted_labels_ensemble.csv')

  parser.add_argument('-i', 
                      '--input-file', 
                      type=str, 
                      dest='input_file',
                      default=INPUT_FILE, 
                      help="File with the list of item classes and features. Defaults to '%(default)s'")

  parser.add_argument('-g', 
                      '--gridres-file', 
                      type=str, 
                      dest='gridres_file',
                      default=None, 
                      help="File with the best parameters of the grid search. Defaults to '%(default)s'")

  parser.add_argument('-n',
                      '-ngrams',
                      dest = 'nGrams',
                      type = int,
                      default = 1, 
                      help = "Upper bound of the word n-gram range (n-grams of length 1 to n are extracted). Defaults to '%(default)s'"
                      )

  parser.add_argument('--ti', 
                      dest='tf_idf', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, TfIdf features will be used')

  parser.add_argument('-c', 
                      '--classifier', 
                      type=str, 
                      dest='classifier',
                      default="ensemble", 
                      help="The classifier to be used. Defaults to '%(default)s'")

  parser.add_argument('--cat',
                      type=str, 
                      dest='category', 
                      default='income-type',
                      choices=('income-type','income-source','expenditure-type'),
                      help="The type of categorization. Defaults to '%(default)s'")

  parser.add_argument('--od', 
                      '--output-file-desc', 
                      type=str, 
                      dest='output_file_desc', 
                      default=OUTPUT_FILE_DESC, 
                      help="A csv file to output the predicted labels. Defaults to '%(default)s'")
 

  args = parser.parse_args()

  # Read the input dataframe
  
  data_in = pandas.read_pickle(args.input_file)
  type_classes = list(data_in['type_class'])
  source_classes = list(data_in['source_class'])
  frID = list(data_in['frID'])
  data_orig = data_in['description']

  if args.category == 'income-type' or args.category == 'expenditure-type':
    labels_orig = [str(i) for i in type_classes] # converting them to strings if they are not strings already
  else:
    labels_orig = [str(i) for i in source_classes] # converting them to strings if they are not strings already
  
  if args.gridres_file is not None:
    gridres_params = pickle.load(open(args.gridres_file, "rb" ))[0]
  else:
    gridres_params = None

  if args.classifier == 'decision-tree':
    max_depth = gridres_params['clf__max_depth'] if gridres_params is not None else 100
    clf = tree.DecisionTreeClassifier(max_depth=max_depth, criterion='entropy')
    print("Will run a decision tree with max depth=%d" % max_depth) 
  elif args.classifier == 'random-forest':
    max_depth = gridres_params['clf__max_depth'] if gridres_params is not None else 100
    clf = ensemble.RandomForestClassifier(max_depth=max_depth, criterion='entropy')
    print("Will run a random forest with max depth=%d" % max_depth) 
  elif args.classifier == 'logistic-regression':
    C = gridres_params['clf__C'] if gridres_params is not None else 1
    clf = linear_model.LogisticRegression(C=C)
    print("Will run logistic regressor with C=%d" % C)
  else: # ensemble
    clf1 = tree.DecisionTreeClassifier(max_depth=100, criterion='entropy')
    clf2 = ensemble.RandomForestClassifier()
    clf3 = linear_model.LogisticRegression()
    clf = ensemble_classifier.EnsembleClassifier(clfs=[clf1, clf2, clf3], voting='hard')    

  vectorizer = feature_extraction.text.CountVectorizer( analyzer='word', # whether features should be made of word or char n-grams
                 binary=False, # if True all non-zero counts are set to one - used for probabilistic mapping
                 decode_error= 'strict', # Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given encoding
                 #dtype='numpy.int64', # Type of the matrix returned by fit_transform() or transform()
                 encoding="ISO-8859-15", # 
                 input='content', # can be 'file', 'filename' or 'content'
                 lowercase=False, #Convert all characters to lowercase before tokenizing. 
                 max_df=1.0, # When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None."
                 max_features=None, # If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus. This parameter is ignored if vocabulary is not None.
                 ngram_range=(1, args.nGrams), # The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used.
                 preprocessor=None, # Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps.
                 stop_words=None, #     
                 min_df=1,
                 strip_accents=None, 
                 token_pattern = '(?u)\\b\\w\\w+\\b',
                 tokenizer=None, 
                 vocabulary=None )

  if args.tf_idf:
    transformer = feature_extraction.text.TfidfTransformer()
    ppl = pipeline.Pipeline([
      ('vectorizer', vectorizer),
      ('transformer', transformer),
      ('clf', clf),
    ])
  else:
    ppl = pipeline.Pipeline([
      ('vectorizer', vectorizer),
      ('clf', clf),
    ])

  k_fold = cross_validation.StratifiedKFold(labels_orig, 5, shuffle=True)

  labels_predicted = [-1] * len(labels_orig) 

  accuracy = []

  for train_idx, dev_idx in k_fold: 
    data_train = [data_orig[i] for i in train_idx]
    data_dev =  [data_orig[i] for i in dev_idx]
    labels_train = [labels_orig[i] for i in train_idx]
    labels_dev =  [labels_orig[i] for i in dev_idx]
  
    ppl.fit(data_train, labels_train)
    predicted_dev = ppl.predict(data_dev)
    labels_predicted = set_all_predicted(predicted_dev, labels_predicted, dev_idx)

    accuracy += [metrics.accuracy_score(labels_dev, predicted_dev)]
  
  print("Accuracy of the %s classifier: %.4f +- %.4f" % (args.classifier, numpy.mean(accuracy), numpy.std(accuracy)))

  # Save the predicted classes  
  to_dump = [labels_orig, labels_predicted]
  helpers.ensure_dir(os.path.dirname(args.output_file_desc))

  #create a dataframe to output type class, predicted type class and description data
  if args.category == 'income-type' or args.category == 'expenditure-type':
    dump_op_desc = pandas.DataFrame({'frID': frID,
                                    'type_class': labels_orig, 
                                   'type_class_predicted': labels_predicted,
                                   'description': data_orig})
  else:
    dump_op_desc = pandas.DataFrame({'frID': frID, 
                                    'source_class': labels_orig, 
                                   'source_class_predicted': labels_predicted,
                                   'description': data_orig})
  dump_op_desc.to_csv(args.output_file_desc)
Exemplo n.º 14
0
def main():

    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    # this default file still contains I and IO values so needs updating
    INPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchical",
                              "new", "dframe_iTrain.csv")
    OUTPUT_DIR = os.path.join("..", "..", "data", "features")
    OUTPUT_FILENAME = 'temp/data_stri_Please_rename_or_delete_me.p'

    parser.add_argument('--ad',
                        '--addData',
                        dest='addData',
                        action='store_false',
                        default=True,
                        help='adds charity type and name to description list')

    parser.add_argument(
        '-i',
        '-input-file',
        type=str,
        dest='inputFile',
        default=INPUT_FILE,
        help=
        "Dataframe to be processed to a Bag of words. Defaults to '%(default)s'."
    )

    parser.add_argument(
        '--lc',
        '--lastCells',
        dest='lastCells',
        type=int,
        default=0,
        help=
        'If set will extract the n last cells from the dataframe according to the input number. For use with source_class it needs a hierarchically tokenized dataframe'
    )

    parser.add_argument(
        '--hi',
        '--hierarchy',
        dest='hierarchy',
        action='store_true',
        default=False,
        help='If set spaces will be removed from within hierarchical items')

    parser.add_argument(
        '--wl',
        '--wordsForLast',
        dest='wordsForLast',
        action='store_true',
        default=False,
        help=
        'Do not concatenate the last hierarchical item so that individual words are kept'
    )

    parser.add_argument('--od',
                        '-output-dir',
                        type=str,
                        dest='output_dir',
                        default=OUTPUT_DIR,
                        help="Directory to save the output")

    parser.add_argument(
        '-o',
        '-output-file',
        type=str,
        dest='outputFilename',
        default=OUTPUT_FILENAME,
        help="Filename of output file - must be a pickle.p format")

    args = parser.parse_args()

    data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-15")
    data_in_str = pandas.read_csv(args.inputFile,
                                  encoding="ISO-8859-15",
                                  dtype=str)

    #  data_in = data_in[data_in.type_class != 'I']
    #  data_in = data_in[data_in.type_class != 'IO']

    data_stri = data_in_str[['frID', 'type_class', 'source_class']]
    data_stri['description'] = 'Nan'
    sep = " "
    if args.hierarchy:
        print('in hierarchy separation mode')
        token_data = data_in.iloc[0:, 5:]
        for i in range(0, len(token_data)):
            row = token_data.iloc[i, 0:]
            for j in range(0, len(row)):
                token = row.iloc[j]
                if args.wordsForLast:
                    if j < (len(row) - 1):
                        nextToken = row.iloc[j + 1]
                        if isinstance(nextToken, float):
                            if not math.isnan(nextToken):
                                if type(token) == str:
                                    token = token.replace(" ", "")
                        else:
                            if type(token) == str:
                                token = token.replace(" ", "")
                else:
                    if type(token) == str:
                        token = token.replace(" ", "")
                row[j] = token
            token_data.iloc[i] = row
        data_in.iloc[0:, 5:] = token_data  #.iloc[0:,0:]
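    # The loop above removes the spaces inside each hierarchical segment (e.g.
    # "government grants" -> "governmentgrants") so that downstream word-level tokenizers
    # treat a whole segment as one token; with --wl the last populated segment keeps its
    # spaces so its individual words remain separate tokens.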

    if args.addData:
        charity_type = data_in_str.iloc[0:, 3]
        charity_name = data_in_str.iloc[0:, 4]

    if args.lastCells == 0:  # effectively processing type_class
        print('last cells equals 0')

        token_data = data_in.iloc[0:, 5:]
        i = 0
        for row in range(0, len(token_data)):

            token_data_stri = sep.join(
                map(str, token_data.iloc[row, 0:].dropna()))
            if args.addData:
                collapsedname = charity_name.iloc[row].replace(" ", "")
                collapsedtype = charity_type.iloc[row].replace(" ", "")
                all_description = sep.join(
                    [token_data_stri, collapsedname, collapsedtype])
                data_stri.iloc[row, 3] = all_description
                i = i + 1
                if i % 100 == 0: print('row %d' % i)
            else:

                data_stri.iloc[row, 3] = token_data_stri

    else:  # effectively processing for source class as this selects the last n
        # segments of the hierarchy - needs to be passed the hierarchical dataframe
        #print('last cells equals ', lastCells)

        for row in range(0, len(data_in)):

            print(row)

            all_cells = list()
            for col in data_in.iloc[int(row), 5:]:
                #print("col: ", col)
                all_cells.append(col)

            all_cells = [x for x in all_cells if str(x) != 'nan']

            data_stri.iloc[int(row), 3] = sep.join(
                all_cells[len(all_cells) - args.lastCells:len(all_cells)])
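            # i.e. keep only the last `lastCells` segments of the hierarchy: e.g. with
            # --lc 2 and cells ['income', 'grants', 'central government'] the description
            # becomes "grants central government".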

    helpers.ensure_dir(args.output_dir)

    data_stri.to_pickle(os.path.join(args.output_dir, args.outputFilename))
Exemplo n.º 15
0
def main():

    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    INPUT_FILE = os.path.join("..", "..", "data", "features",
                              "hierarchy-word-features.pkl")
    CLASSIFIER_DIR = os.path.join("..", "..", "data", "classifiers",
                                  "decision_trees")
    FOLD_FILE = os.path.join("..", "..", "data", "folds", "newfolds.csv")
    OUTPUT_DIR = os.path.join("..", "..", "data", "scores", "decision_trees")

    parser.add_argument(
        '-i',
        '--input-file',
        type=str,
        dest='input_file',
        default=INPUT_FILE,
        help=
        "File with the list of item classes and features. Defaults to '%(default)s'"
    )

    parser.add_argument(
        '--fn',
        '--fold-number',
        dest='fold_number',
        default=None,
        type=int,
        help=
        "The fold number to be used for predicition. If 0, all the items will be considered. If None, will train the classifier for all the folds. Defaults to '%(default)s'"
    )

    parser.add_argument(
        '--ff',
        '--fold-file',
        type=str,
        dest='fold_file',
        default=FOLD_FILE,
        help=
        "Fold file containing the cross-fold validation indices. Defaults to '%(default)s'"
    )

    parser.add_argument(
        '--cd',
        '--classifier-dir',
        type=str,
        dest='classifier_dir',
        default=CLASSIFIER_DIR,
        help=
        "Directory where the trained decision trees are stored. Defaults to '%(default)s'"
    )

    #parser.add_argument('--sm',
    #                    '--sparse-matrix',
    #                    dest='sparse_matrix',
    #                    action='store_true',
    #                    default=False,
    #                    help='If set, the data will be transformed into a sparse matrix')

    parser.add_argument(
        '-o',
        '--output-dir',
        type=str,
        dest='output_dir',
        default=OUTPUT_DIR,
        help="Directory to save the scores. Defaults to '%(default)s'")

    args = parser.parse_args()

    # Read the input dictionary
    type_classes, source_classes, token_container = pickle.load(
        open(args.input_file, "rb"))
    type_dict = {'IGI': 0, 'IC': 1, 'IV': 2, 'IG': 3}

    # get all the label data
    labels_orig = [type_dict[x] for x in type_classes]

    # Create output directory
    helpers.ensure_dir(args.output_dir)

    # Read the fold indices
    cross_fold_indices = pandas.read_csv(args.fold_file, header=None)[0]

    if args.fold_number is None:  # loop over all folds and evaluate each one
        print("Evaluation will be done iteratively for all folds...\n")
        pred_labels = {}

        for fold in cross_fold_indices.unique():
            print("Evaluating fold %d...\n" % fold)
            # read the vectorizer and the decision tree
            vectorizer = joblib.load(
                os.path.join(args.classifier_dir,
                             'vectorizer-fold%d.pkl' % (fold)))
            dtree = joblib.load(
                os.path.join(args.classifier_dir, 'tree-fold%d.pkl' % (fold)))

            # read and transform the data
            data_fold = [
                token_container[i] for i in range(len(cross_fold_indices))
                if cross_fold_indices[i] == fold
            ]
            data_matrix = vectorizer.transform(data_fold)

            pred_labels[fold] = dtree.predict(data_matrix)

    elif args.fold_number == 0:  # evaluate on the full data set
        print("Evaluation will be done on the full set...\n")
        # read the vectorizer and decision tree
        vectorizer = joblib.load(
            os.path.join(args.classifier_dir, 'vectorizer.pkl'))
        dtree = joblib.load(os.path.join(args.classifier_dir, 'tree.pkl'))

        #read the data
        data_matrix = vectorizer.transform(token_container)

        pred_labels = dtree.predict(data_matrix)

    else:  # evaluate a particular fold
        fold = args.fold_number
        print("Evaluation for fold %d...\n" % fold)

        # read the vectorizer and the decision tree
        vectorizer = joblib.load(
            os.path.join(args.classifier_dir,
                         'vectorizer-fold%d.pkl' % (fold)))
        dtree = joblib.load(
            os.path.join(args.classifier_dir, 'tree-fold%d.pkl' % (fold)))

        # read and transform the data
        data_fold = [
            token_container[i] for i in range(len(cross_fold_indices))
            if cross_fold_indices[i] == fold
        ]
        data_matrix = vectorizer.transform(data_fold)

        pred_labels = dtree.predict(data_matrix)

    # do the evaluation
    if args.fold_number is None:  # we iterate over all the folds
        for fold in cross_fold_indices.unique():
            labels_fold = [
                labels_orig[i] for i in range(len(cross_fold_indices))
                if cross_fold_indices[i] == fold
            ]
            pred_labels_fold = pred_labels[fold]
            score = metrics.accuracy_score(labels_fold, pred_labels_fold)
            print("Accuracy on fold %d: %.5f" % (fold, score))
    elif args.fold_number == 0:
        pass
    else:
        fold = args.fold_number
        cross_fold_indices = pandas.read_csv(args.fold_file, header=None)[0]
        labels_fold = [
            labels_orig[i] for i in range(len(cross_fold_indices))
            if cross_fold_indices[i] == fold
        ]
        score = metrics.accuracy_score(labels_fold, pred_labels)
        print("Accuracy on fold %d: %.5f" % (fold, score))

    print("Done!\n")
Exemplo n.º 16
0
def main():
    
  # to parse the arguments that are passed to main
  parser = argparse.ArgumentParser(description=__doc__, 
                                   formatter_class=argparse.RawDescriptionHelpFormatter)
    
  # this is the default input directory if nothing is passed
  INPUT_HT = os.path.join("..", "..", "data", "features", "data_frame_hierarchy.csv")
  INPUT_WT = os.path.join("..", "..", "data", "features", "data_frame_words.csv")
  OUTPUT_DIR = os.path.join("..", "..", "data", "features") 


  parser.add_argument('--ht', 
                      '--hierarchical-tokens-file', 
                      type=str, 
                      dest='input_ht', 
                      default=INPUT_HT, 
                      help="A dataframe file containing the hierarchical level tokens for each item in the dataset. Defaults to '%(default)s'")

  parser.add_argument('--wt', 
                      '--word-tokens-file', 
                      type=str, 
                      dest='input_wt', 
                      default=INPUT_WT, 
                      help="A dataframe file containing the word tokens for each item in the dataset. Defaults to '%(default)s'")

  parser.add_argument('--nh', 
                      '--num-hierarchy-tokens', 
                      dest='num_hierarchy_tokens', 
                      default=20,
                      type=int, 
                      help="The number of the most frequent hierarchical tokens to use per class. Defaults to '%(default)s'")

  parser.add_argument('--nw', 
                      '--num-word-tokens', 
                      dest='num_word_tokens', 
                      default=20,
                      type=int, 
                      help="The number of the most frequent hierarchical tokens to use per class. Defaults to '%(default)s'")

  parser.add_argument('-o', 
                      '--output-dir', 
                      type=str, 
                      dest='output_dir', 
                      default=OUTPUT_DIR, 
                      help="Directory to save the selected tokens. Defaults to '%(default)s'") 

  args = parser.parse_args()
      
  data_hierarchy  = pandas.read_csv(args.input_ht, encoding="ISO-8859-1")
  data_words  = pandas.read_csv(args.input_wt, encoding="ISO-8859-1")

  # Remove samples from classes which are noisy and not useful ("I", "IO")
  data_hierarchy = data_hierarchy[data_hierarchy.type_class != 'I']
  data_hierarchy = data_hierarchy[data_hierarchy.type_class != 'IO']
  data_words = data_words[data_words.type_class != 'I']
  data_words = data_words[data_words.type_class != 'IO']

  # Read the available classes and labels from the data
  labels_hierarchy = data_hierarchy.type_class.unique()
  labels_words = data_words.type_class.unique()
  columns_hierarchy = data_hierarchy.columns
  columns_words = data_words.columns

  hierarchy_tokens = []
  # Finding the N most frequent hierarchy tokens
  print("Processing hierarchical tokens...")
  for label in labels_hierarchy:
    print("Processing %s class..." % label)
    data_by_label = data_hierarchy[data_hierarchy.type_class == label] # filter only the items with the particular label
    descr = data_by_label[columns_hierarchy[2:]] # take only the columns that represent description
    hierarchy_tokens += find_most_common_tokens(descr, args.num_hierarchy_tokens)

  word_tokens = []
  # Finding the N most frequent word tokens
  print("Processing word tokens...")
  for label in labels_words:
    print("Processing %s class..." % label)
    data_by_label = data_words[data_words.type_class == label] # filter only the items with the particular label
    descr = data_by_label[columns_words[2:]] # take only the columns that represent description
    word_tokens += find_most_common_tokens(descr, args.num_word_tokens)  

  # Save the tokens
  helpers.ensure_dir(args.output_dir)
  hierarchy_tokens_towrite = pandas.Series(pandas.Series(numpy.array(hierarchy_tokens)).unique()) # take each token just once
  hierarchy_tokens_towrite.to_csv(os.path.join(args.output_dir, 'hierarchy_tokens.csv'), index=False) # write to file, but don't give row names
  word_tokens_towrite = pandas.Series(pandas.Series(numpy.array(word_tokens)).unique())
  word_tokens_towrite.to_csv(os.path.join(args.output_dir, 'word_tokens.csv'), index=False) # write to file, but don't give row names
Exemplo n.º 17
0
def main():

    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    INPUT_FILE = os.path.join("..", "..", "data", "features",
                              "hierarchy-word-features.pkl")
    OUTPUT_FILE = os.path.join("..", "..", "data", "params",
                               'params-decisiontree.pkl')

    parser.add_argument(
        '-i',
        '--input-file',
        type=str,
        dest='input_file',
        default=INPUT_FILE,
        help=
        "File with the list of item classes and features. Defaults to '%(default)s'"
    )

    parser.add_argument(
        '-c',
        '--classifier',
        type=str,
        dest='classifier',
        default="decision-tree",
        help="The classifier to be used. Defaults to '%(default)s'")

    parser.add_argument(
        '-o',
        '-output-file',
        type=str,
        dest='output_file',
        default=OUTPUT_FILE,
        help="A file to output the predicted labels. Defaults to '%(default)s'"
    )

    args = parser.parse_args()

    # Read the input dictionary
    type_classes, source_classes, token_container = pickle.load(
        open(args.input_file, "rb"))
    type_dict = {'IGI': 0, 'IC': 1, 'IV': 2, 'IG': 3}
    # get all the label data
    labels_orig = [type_dict[x] for x in type_classes]
    data_orig = token_container

    if args.classifier == 'decision-tree':
        clf = tree.DecisionTreeClassifier(criterion='entropy')
        parameters = {
            'clf__max_depth': [i for i in range(25, 200, 25)],
        }
    elif args.classifier == 'random-forest':
        clf = ensemble.RandomForestClassifier(criterion='entropy')
        parameters = {
            'clf__max_depth': [i for i in range(25, 200, 25)],
        }
    else:  #'logistic-regression':
        clf = linear_model.LogisticRegression()
        parameters = {
            'clf__C': [0.5, 1, 5, 10],
        }

    ppl = pipeline.Pipeline([
        ('vectorizer',
         feature_extraction.DictVectorizer(sparse=True)),  #sparse=True
        ('clf', clf),
    ])

    gs = grid_search.GridSearchCV(ppl, parameters, verbose=1, cv=5)
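    # GridSearchCV runs 5-fold cross-validation for every candidate value; the 'clf__'
    # prefix routes each parameter to the 'clf' step of the pipeline (sklearn's
    # <step_name>__<param_name> convention).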
    gs.fit(data_orig, labels_orig)

    print(gs.best_params_, gs.best_score_)

    helpers.ensure_dir(os.path.dirname(args.output_file))
    pickle.dump([gs.best_params_, gs.best_score_],
                open(args.output_file, "wb"))
Exemplo n.º 18
0
def main():
    
  # to parse the arguments that are passed to main
  parser = argparse.ArgumentParser(description=__doc__, 
                                   formatter_class=argparse.RawDescriptionHelpFormatter)
    
  # this is the default input directory if nothing is passed
  INPUT_FILE = os.path.join("..", "..", "data", "iTrain_extra_lastitem_mod_new.csv") #iTrain.csv
  FOLD_FILE = os.path.join("..", "..", "data", "folds", "iTrain_fold.csv") #iTrain.csv
  OUTPUT_FILE = os.path.join("..", "..", "data", "features","new", "dframe_new_iTrain.csv")#
  BAD_OUTPUT_FILE = os.path.join("..", "..", "data", "features","welsh_iTrain.csv")#
  SPELL_CHECKER_PATH = os.path.join("..","..","data","big.txt")

  parser.add_argument('-i', 
                      '-input-file', 
                      type=str, 
                      dest='inputFile', 
                      default=INPUT_FILE, 
                      help='File to be processed to a master set. File must be saved in data and argument structured as - ../../data/yourfilename.csv' )

  parser.add_argument('---lan', 
                      '---language', 
                      dest='language', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, language will be determined and non-English items will be removed')

  parser.add_argument('-l', 
                      '-lemmatize', 
                      dest='lemmatize', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, verbs will be lemmatized')
                      
  parser.add_argument('--la', 
                      '--lemmatizeall', 
                      dest='lemmatizeall', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, all words will be lemmatized')
                      
  parser.add_argument('--lc', 
                      '--lower-case', 
                      dest='lowerCase', 
                      action='store_false', 
                      default=True, 
                      help='Boolean - Defaults to converting all text to lower-case')
                      
  parser.add_argument('--rw', 
                      '--remove-words', 
                      dest='removeTags',
                      nargs= '+',
                      #action='store_true', 
                      default=None, 
                      help='Accepts a list of types of words to be removed from list of ADJ, ADV, CNJ, DET, EX, FW, MOD, N, NP, NUM, PRO, P, TO, UH, V, VD, VG, VN, WH')

  parser.add_argument('-s', 
                      '-stematize', 
                      dest='stematize', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, all words will be stematized')    
                      
  parser.add_argument('--sa', 
                      '--strip-accents',  
                      dest='stripAccents', 
                      action='store_false', 
                      default=True,  
                      help="Removes accents on letters replacing them with just the letter itself")

  parser.add_argument('--sp', 
                      '--spelling-corrector',  
                      dest='spellCorrect', 
                      action='store_false', 
                      default=True,  
                      help="Correct spelling mistakes word by word, just taking the most likely correction")

  parser.add_argument('--sd', 
                      '--spell-dictionary',  
                      dest='spell_dictionary', 
                      default=SPELL_CHECKER_PATH,
                      help="File containing the dictionary to be used for spell-check. Defaults to '%(default)s'")
  
  parser.add_argument('--sw', 
                      '--stop-words', 
                      dest='stopWords', 
                      action='store_false', 
                      default=True, 
                      help='Removes the most common words, "stop words", from the text')
                      
#  parser.add_argument('-t', 
#                      '-tokenize', 
#                      dest='tokenize', 
#                      action='store_true', 
#                      default=False, 
#                      help='Tokenizes text to individual words')                      

  parser.add_argument('--ta', 
                      '--alpha-numeric', 
                      dest='alphaNumeric', 
                      action='store_true', 
                      default=True, 
                      help="Boolean - If set, the text is tokenized keeping only alpha-numeric tokens. Defaults to '%(default)s'")
     
  parser.add_argument('--th', 
                      '--token-hyphen', 
                      dest='tokenHyphen', 
                      action='store_true', 
                      default=False, 
                      help='Tokenizes text using the directory structure from input file')

  parser.add_argument('--uc', 
                      '--upper-case', 
                      dest='upperCase', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, all words will be converted to upper-case')

  parser.add_argument('--fn', 
                      '--fold-number', 
                      dest='fold_number', 
                      default=0,
                      type=int, 
                      help="The fold number to be EXCLUDED when creating the master set (If there are N folds, N-1 folds are used for creating the master set). If 0, all the items will be considered . Defaults to '%(default)s'")

  parser.add_argument('--ff', 
                      '--fold-file', 
                      type=str, 
                      dest='foldFile', 
                      default=FOLD_FILE, 
                      help="Fold file containing the cross-fold validation indices. Defaults to '%(default)s'")
                      
  parser.add_argument('-o', 
                      '-output-file', 
                      type=str, 
                      dest='output_file', 
                      default=OUTPUT_FILE, 
                      help="Directory to be used to save the created master set. Filename will be automatically created based on input flags. Defaults to something needsto go here")                      

  parser.add_argument('--bo', 
                      '--bad-output-file', 
                      type=str, 
                      dest='bad_output_file', 
                      default=BAD_OUTPUT_FILE, 
                      help="Directory to be used to save the created welsh data set. Filename will be automatically created based on input flags. Defaults to something needsto go here")                      

    
  args = parser.parse_args()
      
  #  data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-1")
  data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-15", dtype=str)
  #set column names? 
  words = pandas.DataFrame({'frID':data_in.frID, 
                            'type':data_in.type_class, 
                           'class': data_in.source_class, 
                           'description': data_in.description,
                           'ICNPO_category':data_in.ICNPO_category,
                           'nicename':data_in.nicename})   


  processed_data = pandas.DataFrame(data_in[['frID', 'type_class', 'source_class', 'ICNPO_category', 'nicename']])

 
  #Define word_list
  word_list=words.description
  
  #print(words.head())
  if args.language:
      #check that text is in english and separate 
    print('Items that are more likely to be Welsh:')
    langval=text_processing.language(word_list)
#    for x in range(0, len(word_list)):
    good=numpy.where([x >= 0.01 for x in langval])
    bad=numpy.where([x < 0.01 for x in langval])
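    # text_processing.language presumably returns an English-likelihood score per item;
    # anything scoring below 0.01 is treated as Welsh and written to the bad-output file.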
    badwords = words.drop(words.index[good])
    words = words.drop(words.index[bad])
    word_list=words.description
    #
    helpers.ensure_dir(os.path.dirname(args.bad_output_file))
    badwords.to_csv(args.bad_output_file, index = False) # write to file, but don't give row names
    print('Only items with en < 0.01 are taken to be bad')
    

  #tokenize the text either straight or keeping only alpha-numeric(default)
  if args.alphaNumeric:
      
    # keep just the alpha-numeric characters
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    word_list = list(map(tokenizer.tokenize, word_list))
    #print("alpha numeric only")    
    #print(word_list[1:20])      
    
#  if args.tokenize:
#          word_list = list(map(tokenizer.tokenize, word_list))
      
 #   word_list = list(map(nltk.word_tokenize, words.description))
    #print(word_list[1:20])      
  if args.tokenHyphen:
  #print("Tokenize on hyphen")
    word_list = list(map(text_processing.tokenize_on_hyphen , word_list))
    #print(word_list[1:20])      

  # lower case 
  if args.lowerCase:
    #print("LOWER CASE")
    word_list = list(map(text_processing.make_lower, word_list))      
    #print(word_list[1:20])
  
  # Upper case      
  if args.upperCase:
    #print("UPPER CASE")
    word_list = list(map(text_processing.make_upper, word_list))      
    #print(word_list[1:20])
    
  if args.stripAccents:
      #print("CORRECT SPELLING")
    i=0
    for x in range(len(word_list)) :
       correctedWords  = [text_processing.strip_accents(y) for y in word_list[x]]
       word_list[x]= correctedWords
       i=i+1
#       if i % 100 == 0 : print('row %d'% i)
#    word_list = list(map(text_processing.strip_accents, word_list))
      #print(word_list[1:20])    

  if args.spellCorrect:
      #print("CORRECT SPELLING")
    word_list = spell_checker.correctall(word_list, args.spell_dictionary)
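    # spell_checker.correctall is a project-local helper; big.txt is presumably its
    # word-frequency corpus, and each word is replaced by the single most likely
    # correction (cf. the --sp help text).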
      #print(word_list[1:20])     

  if args.removeTags:
      #print("REMOVE WORDS")
      #needs function in text_processing 
      # assumed signature: keep_only_specified_tags(tokens, tags), applied item by item
      word_list = [text_processing.keep_only_specified_tags(x, args.removeTags) for x in word_list]
      
  if args.stopWords:
      #print("STOP WORDS")
      word_list = list(map(text_processing.exclude_stop_words, word_list))
      #print(word_list[1:20])      
          
  if args.lemmatizeall:    
      #print("LEMMATIZE ALL")
      word_list = list(map(text_processing.lemmatizeall, word_list))
      #print(word_list[1:20])      
   
  if args.lemmatize:    
      #print("LEMMATIZE")
      word_list = list(map(text_processing.lemmatize, word_list))
      #print(word_list[1:20])      

  if args.stematize:
      #print("STEMATIZE")
      word_list = list(map(text_processing.stematize, word_list))
      #print(word_list[1:20])       
 
  wl_df = pandas.DataFrame(word_list)
  frames = [processed_data, wl_df]
  
  output_df = pandas.concat(frames, axis = 1)

  #print(output_df[1:20])

  helpers.ensure_dir(os.path.dirname(args.output_file))
  output_df.to_csv(args.output_file, index = False) # write to file, but don't give row names
Exemplo n.º 19
0
def main():

    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # this is the default input directory if nothing is passed
    INPUT_FILE = os.path.join("..", "..", "data", "iTrain.csv")
    OUTPUT_FILE = os.path.join("..", "..", "data", "folds", "iTrain.csv")

    parser.add_argument('-i',
                        '--input-file',
                        type=str,
                        dest='input_file',
                        default=INPUT_FILE,
                        help="File to be processed. Defaults to '%(default)s'")

    parser.add_argument('--ns',
                        '--num-samples',
                        dest='num_samples',
                        default=None,
                        type=int,
                        help='The number of samples')

    parser.add_argument('--nf',
                        '--num-folds',
                        dest='num_folds',
                        default=5,
                        type=int,
                        help='The number of folds')

    parser.add_argument(
        '-o',
        '--output-file',
        type=str,
        dest='output_file',
        default=OUTPUT_FILE,
        help=
        "File to be used to save the fold indices. Defaults to '%(default)s'")

    args = parser.parse_args()

    # read the data
    if args.num_samples is None:
        data_in = pandas.read_csv(args.input_file, encoding="ISO-8859-1")
        num_samples = len(data_in)
    else:
        num_samples = args.num_samples

    # generate the folds
    from random import shuffle
    indices_list = list(range(0, num_samples))
    shuffle(indices_list)
    retval = []
    folds_indices = [x % args.num_folds + 1 for x in indices_list]
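    # Taking each shuffled index modulo num_folds (plus 1) yields fold labels 1..num_folds
    # of near-equal size, e.g. 10 samples and 5 folds give two samples per fold.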

    # write the indices to file
    helpers.ensure_dir(os.path.dirname(args.output_file))
    folds_indices_towrite = pandas.Series(numpy.array(folds_indices))
    folds_indices_towrite.to_csv(
        args.output_file,
        index=False)  # write to file, but don't give row names
Exemplo n.º 20
0
def main():
    
  # to parse the arguments that are passed to main
  parser = argparse.ArgumentParser(description=__doc__, 
                                   formatter_class=argparse.RawDescriptionHelpFormatter)
  INPUT_FILE = os.path.join("..", "..", "data", "features", "hierarchy-word-features.pkl")
  OUTPUT_FILE = os.path.join("..", "..", "data", "output", 'predicted_labels_ensemble.pkl')

  parser.add_argument('-i', 
                      '--input-file', 
                      type=str, 
                      dest='input_file',
                      default=INPUT_FILE, 
                      help="File with the list of item classes and features. Defaults to '%(default)s'")

  parser.add_argument('-c', 
                      '--classifier', 
                      type=str, 
                      dest='classifier',
                      default="ensemble", 
                      help="The classifier to be used. Defaults to '%(default)s'")

  parser.add_argument('--orig-labels', 
                      dest='orig_labels', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, the original data labels will be stored. Otherwise, they will be coded as integers')

  parser.add_argument('--cat',
                      type=str, 
                      dest='category', 
                      default='income-type',
                      choices=('income-type','income-source','expenditure-type'),
                      help="The type of categorization. Defaults to '%(default)s'")

  parser.add_argument('-o', 
                      '-output-file', 
                      type=str, 
                      dest='output_file', 
                      default=OUTPUT_FILE, 
                      help="A file to output the predicted labels. Defaults to '%(default)s'")

  

  args = parser.parse_args()

  # Read the input dictionary
  type_classes, source_classes, token_container = pickle.load(open(args.input_file, "rb"))
  
  # get all the label data
  if args.category == 'income-type' or args.category == 'expenditure-type':
    labels_orig = [str(i) for i in type_classes] # converting them to strings if they are not strings already
  else:
    labels_orig = [str(i) for i in source_classes] # converting them to strings if they are not strings already
  #labels_orig = [type_dict[x] for x in type_classes] 
  data_orig = token_container

  if args.classifier == 'decision-tree':
    clf = tree.DecisionTreeClassifier(max_depth=100, criterion='entropy')
  elif args.classifier == 'random-forest':
    clf = ensemble.RandomForestClassifier()
  elif args.classifier == 'logistic-regression':
    clf = linear_model.LogisticRegression()
  else: # ensemble
    clf1 = tree.DecisionTreeClassifier(max_depth=100, criterion='entropy')
    clf2 = ensemble.RandomForestClassifier()
    clf3 = linear_model.LogisticRegression()
    clf = ensemble_classifier.EnsembleClassifier(clfs=[clf1, clf2, clf3], voting='hard')    

  ppl = pipeline.Pipeline([
    ('vectorizer', feature_extraction.DictVectorizer(sparse=True)), #sparse=True
    ('clf', clf),
  ])

  k_fold = cross_validation.StratifiedKFold(labels_orig, 5, shuffle=True)

  #labels_predicted = numpy.array([-1] * len(labels_orig), dtype='int')
  labels_predicted = [-1] * len(labels_orig) 

  accuracy = []

  for train_idx, dev_idx in k_fold: 
    data_train = [data_orig[i] for i in train_idx]
    data_dev =  [data_orig[i] for i in dev_idx]
    labels_train = [labels_orig[i] for i in train_idx]
    labels_dev =  [labels_orig[i] for i in dev_idx]
  
    ppl.fit(data_train, labels_train)
    predicted_dev = ppl.predict(data_dev)
    #labels_predicted[dev_idx] = predicted_dev
    labels_predicted = set_all_predicted(predicted_dev, labels_predicted, dev_idx)

    accuracy += [metrics.accuracy_score(labels_dev, predicted_dev)]
  
  print("Accuracy of the %s classifier: %.4f +- %.4f" % (args.classifier, numpy.mean(accuracy), numpy.std(accuracy)))

  # Save the predicted classes  
  #inv_type_dict = {v: k for k, v in type_dict.items()}
  
  to_dump = [labels_orig, labels_predicted]
  helpers.ensure_dir(os.path.dirname(args.output_file))
  pickle.dump(to_dump, open(args.output_file, "wb"))

  '''
  predicted_type_classes = [inv_type_dict[x] for x in labels_predicted]
  if args.orig_labels: # save the original labels
    predicted_type_classes = [inv_type_dict[x] for x in labels_predicted]
    to_dump = [type_classes, predicted_type_classes]
  else: # save the labels codified with integer numbers, as well as the decoding dictionary
    to_dump = [labels_orig, labels_predicted, inv_type_dict]
  '''
Exemplo n.º 21
0
def main():
    
  # to parse the arguments that are passed to main
  parser = argparse.ArgumentParser(description=__doc__, 
                                   formatter_class=argparse.RawDescriptionHelpFormatter)
    
  # this is the default input directory if nothing is passed
  INPUT_FILE = os.path.join("..", "..", "data", "test.csv") #iTrain.csv
  FOLD_FILE = os.path.join("..", "..", "data", "folds", "iTrain_fold.csv") #iTrain.csv
  OUTPUT_DIR = os.path.join("..", "..", "data", "test") 
  SPELL_CHECKER_PATH = os.path.join("..","..","data","big.txt")

  parser.add_argument('-i', 
                      '-input-file', 
                      type=str, 
                      dest='inputFile', 
                      default=INPUT_FILE, 
                      help="File to be processed to a master set. Defaults to '%(default)s'. File must be saved in data and argument structured as - ../../data/yourfilename.csv" )

  parser.add_argument('---lan', 
                      '---language', 
                      dest='language', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, language will be determined and non-English items will be removed')

  parser.add_argument('-l', 
                      '-lemmatize', 
                      dest='lemmatize', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, verbs will be lemmatized')
                      
  parser.add_argument('--la', 
                      '--lemmatizeall', 
                      dest='lemmatizeall', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, all words will be lemmatized')
                      
  parser.add_argument('--lc', 
                      '--lower-case', 
                      dest='lowerCase', 
                      action='store_false', 
                      default=True, 
                      help='Boolean - Defaults to converting all text to lower-case')
                      
  parser.add_argument('--rw', 
                      '--remove-words', 
                      dest='removeTags',
                      nargs= '+',
                      #action='store_true', 
                      default=None, 
                      help='Accepts a list of types of words to be removed from list of ADJ, ADV, CNJ, DET, EX, FW, MOD, N, NP, NUM, PRO, P, TO, UH, V, VD, VG, VN, WH')

  parser.add_argument('-s', 
                      '-stematize', 
                      dest='stematize', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, all words will be stematized')    
                      
  parser.add_argument('--sa', 
                      '--strip-accents',  
                      dest='stripAccents', 
                      action='store_false', 
                      default=True,  
                      help="Removes accents on letters replacing them with just the letter itself")

  parser.add_argument('--sp', 
                      '--spelling-corrector',  
                      dest='spellCorrect', 
                      action='store_false', 
                      default=True,  
                      help="Correct spelling mistakes word by word, just taking the most likely correction")
  
  parser.add_argument('--sw', 
                      '--stop-words', 
                      dest='stopWords', 
                      action='store_false', 
                      default=True, 
                      help='Removes the most common words, "stop words", from the text')
                      
#  parser.add_argument('-t', 
#                      '-tokenize', 
#                      dest='tokenize', 
#                      action='store_true', 
#                      default=False, 
#                      help='Tokenizes text to individual words')                      

  parser.add_argument('--ta', 
                      '--alpha-numeric', 
                      dest='alphaNumeric', 
                      action='store_true', 
                      default=False, 
                      help="Boolean - If set, the text is tokenized keeping only alpha-numeric tokens. Defaults to '%(default)s'")
     
  parser.add_argument('--th', 
                      '--token-hyphen', 
                      dest='tokenHyphen', 
                      action='store_true', 
                      default=False, 
                      help='Tokenizes text using the directory structure from input file')

  parser.add_argument('--uc', 
                      '--upper-case', 
                      dest='upperCase', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, all words will be converted to upper-case')

  parser.add_argument('--fn', 
                      '--fold-number', 
                      dest='fold_number', 
                      default=0,
                      type=int, 
                      help="The fold number to be EXCLUDED when creating the master set (If there are N folds, N-1 folds are used for creating the master set). If 0, all the items will be considered . Defaults to '%(default)s'")

  parser.add_argument('--ff', 
                      '--fold-file', 
                      type=str, 
                      dest='foldFile', 
                      default=FOLD_FILE, 
                      help="Fold file containing the cross-fold validation indices. Defaults to '%(default)s'")
                      
  parser.add_argument('-o', 
                      '--output-dir', 
                      type=str, 
                      dest='output_dir', 
                      default=OUTPUT_DIR, 
                      help="Directory to be used to save the created master set. The filename is generated automatically from the input flags. Defaults to '%(default)s'")                      

    
  args = parser.parse_args()
      
  data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-15")
  words = pandas.DataFrame(data_in.description) # take just the description data
  
  # Filter the folds for master set training
  if args.fold_number != 0: # for creating the master set, keep just the words that DO NOT belong to the fold given as an argument
    cross_fold_indices = pandas.read_csv(args.foldFile, header=None)[0]
    words = words.loc[cross_fold_indices != args.fold_number]
 
  # define the working list of descriptions
  word_list = words.description
  
  if args.language:
    # keep only descriptions that look like English; rows scoring below 0.8
    # are set aside and written to a separate file (e.g. Welsh descriptions)
    langval = text_processing.language(word_list)
    good = numpy.where([x >= 0.8 for x in langval])
    bad = numpy.where([x < 0.8 for x in langval])
    badwords = words.drop(words.index[good])
    words = words.drop(words.index[bad])
    word_list = words.description

    # write the non-English descriptions to their own file
    bad_output_fileName = "welsh_set.csv"
    bad_output_file = os.path.join(args.output_dir, bad_output_fileName)    
    helpers.ensure_dir(args.output_dir)
    badwords.to_csv(bad_output_file, index=False) # write to file, but don't give row names
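    # Illustrative sketch (hypothetical): text_processing.language is assumed to
    # return one score per description, roughly the confidence that the text is
    # English (hence the 0.8 threshold above). A minimal version could be built
    # on the langdetect package, e.g.
    #   from langdetect import detect_langs
    #   def english_scores(descriptions):
    #     return [max((d.prob for d in detect_langs(str(t)) if d.lang == 'en'), default=0.0)
    #             for t in descriptions]
    # but the real helper may differ.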
    

  # tokenize the text, either plain or keeping only alpha-numeric tokens
  if args.alphaNumeric:
      
    # keep just the alpha-numeric characters
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    word_list = list(map(tokenizer.tokenize, word_list))
    #print("alpha numeric only")    
    #print(word_list[1:20])      
    
  if args.tokenize:
    # plain word tokenization with NLTK's default tokenizer
    import nltk
    word_list = list(map(nltk.word_tokenize, word_list))
    #print(word_list[1:20])      
  if args.tokenHyphen:
    # tokenize on hyphens
    word_list = list(map(text_processing.tokenize_on_hyphen, word_list))
    #print(word_list[1:20])      

  # lower case 
  if args.lowerCase:
    #print("LOWER CASE")
    word_list = list(map(text_processing.make_lower, word_list))      
    #print(word_list[1:20])
  
  # Upper case      
  if args.upperCase:
    #print("UPPER CASE")
    word_list = list(map(text_processing.make_upper, word_list))      
    #print(word_list[1:20])
    
  if args.stripAccents:
    # strip accents from every word, reporting progress every 100 rows
    for i in range(len(word_list)):
      word_list[i] = [text_processing.strip_accents(y) for y in word_list[i]]
      if (i + 1) % 100 == 0:
        print('row %d' % (i + 1))

  if args.spellCorrect:
    # correct spelling word by word, taking the most likely correction
    word_list = spell_checker.correctall(word_list, SPELL_CHECKER_PATH)
    #print(word_list[1:20])     

  if args.removeTags:
      # filter words by the POS types passed via --rw
      # (relies on text_processing.keep_only_specified_tags; see the
      # illustrative sketch at the end of this script)
      word_list = [text_processing.keep_only_specified_tags(w, args.removeTags) for w in word_list]
      
  if args.stopWords:
      #print("STOP WORDS")
      word_list = list(map(text_processing.exclude_stop_words, word_list))
      #print(word_list[1:20])      
          
  if args.lemmatizeall:    
      #print("LEMMATIZE ALL")
      word_list = list(map(text_processing.lemmatizeall, word_list))
      #print(word_list[1:20])      
   
  if args.lemmatize:    
      #print("LEMMATIZE")
      word_list = list(map(text_processing.lemmatize, word_list))
      #print(word_list[1:20])      

  if args.stematize:
      #print("STEMATIZE")
      word_list = list(map(text_processing.stematize, word_list))
      #print(word_list[1:20])       
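  # Illustrative sketch (hypothetical): the lemmatize/stematize helpers above are
  # assumed to map a list of tokens to a processed list of tokens. With NLTK a
  # stemmer could look roughly like
  #   from nltk.stem import PorterStemmer
  #   def stematize(tokens):
  #     stemmer = PorterStemmer()
  #     return [stemmer.stem(t) for t in tokens]
  # but the real text_processing implementations may differ.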

  # build the master set of unique words across all rows
  master_set = set()
  for tokens in word_list:
    master_set.update(tokens)
  master_set = sorted(master_set)

  helpers.ensure_dir(args.output_dir)

  if args.fold_number!=0:
      output_fileName = "master_set_fold_%d.csv" % args.fold_number
  else:
      output_fileName = "master_set.csv"
  output_path = os.path.join(args.output_dir, output_fileName)    
  master_set_towrite = pandas.Series(numpy.array(list(master_set)))
  master_set_towrite.to_csv(output_path, index=False) # write to file, but don't give row names
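
# Illustrative sketch (hypothetical): the --rw option above relies on a
# text_processing.keep_only_specified_tags helper that is not shown in these
# examples. A minimal version could POS-tag each token with NLTK and drop the
# tokens whose tag is in the supplied list; the real helper may well differ
# (for instance in the tagset it expects).
def keep_only_specified_tags_sketch(tokens, tags_to_remove):
  import nltk
  tagged = nltk.pos_tag(tokens)  # Penn Treebank tags, not the ADJ/ADV/... list from the help text
  return [tok for tok, tag in tagged if tag not in tags_to_remove]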
Exemplo n.º 22
0
def main():
    
  # to parse the arguments that are passed to main
  parser = argparse.ArgumentParser(description=__doc__, 
                                   formatter_class=argparse.RawDescriptionHelpFormatter)
    
  # this is the default input directory if nothing is passed
  INPUT_FILE = os.path.join("..", "..", "data", "iTrain.csv") 
  OUTPUT_FILE = os.path.join("..", "..", "data", "features", "data-frame.csv") 
    
  parser.add_argument('-i', 
                      '--input-file', 
                      type=str, 
                      dest='inputFile', 
                      default=INPUT_FILE, 
                      help="File to be processed to a boolean matrix. Defaults to '%(default)s'")

  parser.add_argument('-l', 
                      '--lemmatize', 
                      dest='lemmatize', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, verbs will be lemmatized')
                      
  parser.add_argument('-la', 
                      '--lemmatizeall', 
                      dest='lemmatizeall', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, all words will be lemmatized')
                      
  parser.add_argument('-lc', 
                      '--lower-case', 
                      dest='lowerCase', 
                      action='store_true', 
                      default=True, 
                      help='Boolean - Defaults to converting all text to lower-case')
                      
  parser.add_argument('-rw', 
                      '--remove-words', 
                      dest='removeWords', 
                      nargs='+', 
                      default=None, 
                      help='Accepts a list of types of words to be removed, e.g. ...')

  parser.add_argument('-s', 
                      '--stematize', 
                      dest='stematize', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, all words will be stematized')    
  
  parser.add_argument('--sa', 
                      '--strip-accents',  
                      dest='stripAccents', 
                      action='store_true', 
                      default=False,  
                      help="Removes accents on letters replacing them with just the letter itself")

  parser.add_argument('--sp', 
                      '--spelling-corrector',  
                      dest='spellCorrect', 
                      action='store_true', 
                      default=False,  
                      help="Correct spelling mistakes word by word, just taking the most likely correction")
      
  parser.add_argument('--sw', 
                      '--stop-words', 
                      dest='stopWords', 
                      action='store_true', 
                      default=False, 
                      help='Removes the most common words, "stop words", from the text')

  parser.add_argument('-t', 
                      '-tokenize', 
                      dest='token', 
                      action='store_true', 
                      default=False, 
                      help='Tokenizes text to individual words')                      

  parser.add_argument('--ta', 
                      '--alpha-numeric', 
                      dest='alphaNumeric', 
                      action='store_false', 
                      default=True, 
                      help='Boolean - Alpha-numeric-only tokenization is on by default; set this flag to keep non alpha-numeric tokens as well')
     
  parser.add_argument('--th', 
                      '--token-hyphen', 
                      dest='tokenHyphen', 
                      action='store_true', 
                      default=False, 
                      help='Tokenizes the text on hyphens (hierarchy structure)')
                      
  parser.add_argument('-uc', 
                      '--upper-case', 
                      dest='upperCase', 
                      action='store_true', 
                      default=False, 
                      help='Boolean - If set, all words will be converted to upper-case')
                      
  parser.add_argument('-o', 
                      '--output-file', 
                      type=str, 
                      dest='output_file', 
                      default=OUTPUT_FILE, 
                      help="Directory to be used to save the created master set. Filename will be automatically created based on input flags. Defaults to something needsto go here")                      

    
  args = parser.parse_args()
      
  data_in = pandas.read_csv(args.inputFile, encoding="ISO-8859-1")
  
  # build a frame with the class labels and the description text
  words = pandas.DataFrame({'type': data_in.type_class, 
                            'class': data_in.source_class, 
                            'description': data_in.description}) 
 
#==============================================================================
# The rows contain different numbers of words, so the width of the output frame
# is determined by the longest description; pandas pads the shorter rows when
# the word lists are turned into a data frame further down.
#==============================================================================
  processed_data = pandas.DataFrame(data_in[['type_class', 'source_class']])
  
# Function calls need to be edited to send only the relevant columns (excluding
# type and source). This will require some restructuring of the data types: the
# number of words per row varies and depends on the processing flags, but pandas
# can still build the frame by padding the shorter rows - see the note below.
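  # Illustrative note (values are made up): pandas.DataFrame() pads ragged rows
  # with NaN, so rows with different word counts are handled automatically, e.g.
  #   pandas.DataFrame([['oak', 'table'], ['red', 'leather', 'sofa']])
  # gives a 2x3 frame with NaN in the unused cell; the concatenation at the end
  # of this script relies on exactly this behaviour.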
  
  #tokenize the text either straight or keeping only alpha-numeric(default)
  if args.alphaNumeric:
      
    # keep just the alpha-numeric characters
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    word_list = list(map(tokenizer.tokenize, words.description))
    #print("alpha numeric only")    
    #print(word_list[1:20])      
    
  else:
    # fall back to plain word tokenization when --ta disables the alpha-numeric
    # tokenizer (otherwise word_list would be undefined below)
    import nltk
    word_list = list(map(nltk.word_tokenize, words.description))
    #print(word_list[1:20])   
   
  if args.tokenHyphen:
      #print("TOKENIZE ON HYPHEN")
      word_list = list(map(text_processing.tokenize_on_hyphen, words.description))
      #print(word_list[1:20])  
  # lower case 
  if args.lowerCase:
    #print("LOWER CASE")
    word_list = list(map(text_processing.make_lower, word_list))      
    #print(word_list[1:20])
  
  # Upper case      
  if args.upperCase:
    #print("UPPER CASE")
    word_list = list(map(text_processing.make_upper, word_list))      
    #print(word_list[1:20])
          
  if args.lemmatizeall:    
     #print("LEMMATIZE ALL")
      word_list = list(map(text_processing.lemmatizeall, word_list))
     #print(word_list[1:20])      
      
  if args.lemmatize:    
      #print("LEMMATIZE")
      word_list = list(map(text_processing.lemmatize, word_list))
      #print(word_list[1:20])      

  if args.removeWords:
      print("REMOVE WORDS")
      # not implemented yet - needs a tag-filtering function in text_processing
      # (see the illustrative sketch after the master-set script above)
      
  if args.stematize:
      #print("STEMATIZE")
      word_list = list(map(text_processing.stematize, word_list))
      #print(word_list[1:20])    
                
  if args.stopWords:
      #print("STOP WORDS")
      word_list = list(map(text_processing.exclude_stop_words, word_list))
      #print(word_list[1:20])      
      
    
    
  wl_df = pandas.DataFrame(word_list)
  frames = [processed_data, wl_df]
  
  output_df = pandas.concat(frames, axis = 1)

  #print(output_df[1:20])

  helpers.ensure_dir(os.path.dirname(args.output_file))
  output_df.to_csv(args.output_file, index = False) # write to file, but don't give row names
def main():
    
  # to parse the arguments that are passed to main
  parser = argparse.ArgumentParser(description=__doc__, 
                                   formatter_class=argparse.RawDescriptionHelpFormatter)
    
  # this is the default input directory if nothing is passed
  INPUT_FILE = os.path.join("..", "..", "data", "features", "data_frame.csv") 
  OUTPUT_DIR = os.path.join("..", "..", "data", "classifiers", "decision_trees")
  FOLD_FILE = os.path.join("..", "..", "data", "folds", "iTrain.csv") 
  INPUT_MS = os.path.join("..", "..", "data", "features", "master_set.csv")

  parser.add_argument('-i', 
                      '--input-file', 
                      type=str, 
                      dest='input_file',
                      default=INPUT_FILE, 
                      help="File with features. The classes will be taken from this file. Defaults to '%(default)s'")

  parser.add_argument('--ms', 
                      '--master_set', 
                      dest='master_set', 
                      default=INPUT_MS, 
                      help="The input file containing the master set. Defaults to '%(default)s'")

  parser.add_argument('--fn', 
                      '--fold-number', 
                      dest='fold_number', 
                      default=None,
                      type=int, 
                      help="The fold number to be EXCLUDED when creating the master set (If there are N folds, N-1 folds are used for creating the master set). If 0, all the items will be considered. If None, will train the classifier for all the folds in a loop. Defaults to '%(default)s'")

  parser.add_argument('--ff', 
                      '--fold-file', 
                      type=str, 
                      dest='foldFile', 
                      default=FOLD_FILE, 
                      help="Fold file containing the cross-fold validation indices. Defaults to '%(default)s'")
   
  parser.add_argument('-o', 
                      '--output-dir', 
                      type=str, 
                      dest='output_dir', 
                      default=OUTPUT_DIR, 
                      help="Directory to save the decision trees. Defaults to '%(default)s'")


  args = parser.parse_args()

  type_dict = {'IV':1, 'IG':2, 'IC':3, 'I':4, 'IGI':5, 'IO':6}
      
  # read the input matrix and the labels
  data_in = pandas.read_csv(args.input_file, encoding="ISO-8859-1")
  master_in = pandas.read_csv(args.master_set, encoding="ISO-8859-1", header=None)

  labels_orig=data_in.type_class
  labels=pandas.Series([type_dict[x] for x in labels_orig])

  # Create output directory
  helpers.ensure_dir(args.output_dir)

  dtree = tree.DecisionTreeClassifier(random_state=0, max_depth=100, criterion='entropy')
  
  if args.fold_number == None: # loop over all folds and create the classifier
    print("Training will be done iteratively for all folds...\n")
    cross_fold_indices = pandas.read_csv(args.foldFile, header=None)[0]
    for fold in cross_fold_indices.unique():
      print("Training classifier for fold %d...\n" % fold)
      data_fold = data_in.loc[cross_fold_indices != fold,:]
      data_matrix = data_manipulation.binary_sparse_matrix(data_fold, master_in)
      labels_fold = labels[cross_fold_indices != fold]
      dtree.fit(data_matrix, labels_fold)
      joblib.dump(dtree, os.path.join(args.output_dir, 'tree-fold%d.pkl' % fold)) 
      tree.export_graphviz(dtree, out_file=os.path.join(args.output_dir, 'tree-fold%d.dot' % fold), max_depth=5, feature_names=list(master_in[0]))

  elif args.fold_number == 0: # use all the data to train the classifier
    print("Training classifier for the full set...\n" % fold)
    data_matrix = data_manipulation.binary_sparse_matrix(data_in, master_in)
    dtree.fit(data_matrix, labels)
    joblib.dump(dtree, os.path.join(args.output_dir, 'tree.pkl')) 

  else: # create classifier for a particular fold
    print("Training classifier for fold %d...\n" % fold)
    cross_fold_indices = pandas.read_csv(args.foldFile, header=None)[0]
    data_fold = data_in.loc[cross_fold_indices != args.fold_number]
    data_matrix = data_manipulation.binary_sparse_matrix(data_fold, master_in)
    labels = labels[cross_fold_indices != args.fold_number]
    dtree.fit(data_matrix, labels)
    joblib.dump(dtree, os.path.join(args.output_dir, 'tree-fold%d.pkl' % args.fold_number)) 
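
# Illustrative sketch (hypothetical): data_manipulation.binary_sparse_matrix is
# called above but not shown in these examples. Based on the sparse-matrix
# construction in the last example on this page, it presumably builds a binary
# scipy COO matrix with one column per master-set word; the real helper may
# differ.
def binary_sparse_matrix_sketch(data_frame, master_frame):
  master_words = list(master_frame[0])
  word_to_col = {w: j for j, w in enumerate(master_words)}
  rowind, colind, vals = [], [], []
  for i in range(len(data_frame)):
    tokens = data_frame.iloc[i, 2:].dropna()  # the first two columns hold the class labels
    for token in set(tokens):
      if token in word_to_col:
        rowind.append(i)
        colind.append(word_to_col[token])
        vals.append(1)
  return scipy.sparse.coo_matrix(
      (numpy.array(vals), (numpy.array(rowind), numpy.array(colind))),
      shape=(len(data_frame), len(master_words)))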
def main():

    # to parse the arguments that are passed to main
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    INPUT_FILE = os.path.join("..", "..", "data", "features",
                              "bow_string_input_dframe.p")
    OUTPUT_FILE_DESC = os.path.join("..", "..", "data", "output",
                                    'predicted_labels_ensemble.csv')

    parser.add_argument(
        '-i',
        '--input-file',
        type=str,
        dest='input_file',
        default=INPUT_FILE,
        help=
        "File with the list of item classes and features. Defaults to '%(default)s'"
    )

    parser.add_argument(
        '-g',
        '--gridres-file',
        type=str,
        dest='gridres_file',
        default=None,
        help=
        "File with the best parameters of the grid search. Defaults to '%(default)s'"
    )

    parser.add_argument(
        '-n',
        '--ngrams',
        dest='nGrams',
        type=int,
        default=1,
        help=
        "Upper bound of the word n-gram range, i.e. ngram_range=(1, n). Defaults to '%(default)s' (unigrams only)"
    )

    parser.add_argument('--ti',
                        dest='tf_idf',
                        action='store_true',
                        default=False,
                        help='Boolean - If set, TfIdf features will be used')

    parser.add_argument(
        '-c',
        '--classifier',
        type=str,
        dest='classifier',
        default="ensemble",
        help="The classifier to be used. Defaults to '%(default)s'")

    parser.add_argument(
        '--cat',
        type=str,
        dest='category',
        default='income-type',
        choices=('income-type', 'income-source', 'expenditure-type'),
        help="The type of categorization. Defaults to '%(default)s'")

    parser.add_argument(
        '--od',
        '--output-file-desc',
        type=str,
        dest='output_file_desc',
        default=OUTPUT_FILE_DESC,
        help=
        "A csv file to output the predicted labels. Defaults to '%(default)s'")

    args = parser.parse_args()

    # Read the input dictionary

    data_in = pandas.read_pickle(args.input_file)
    type_classes = list(data_in['type_class'])
    source_classes = list(data_in['source_class'])
    frID = list(data_in['frID'])
    data_orig = data_in['description']

    if args.category == 'income-type' or args.category == 'expenditure-type':
        labels_orig = [
            str(i) for i in type_classes
        ]  # converting them to strings if they are not strings already
    else:
        labels_orig = [
            str(i) for i in source_classes
        ]  # converting them to strings if they are not strings already

    if args.gridres_file is not None:
        gridres_params = pickle.load(open(args.gridres_file, "rb"))[0]
    else:
        gridres_params = None

    if args.classifier == 'decision-tree':
        max_depth = gridres_params[
            'clf__max_depth'] if gridres_params is not None else 100
        clf = tree.DecisionTreeClassifier(max_depth=max_depth,
                                          criterion='entropy')
        print("Will run a decision tree with max depth=%d" % max_depth)
    elif args.classifier == 'random-forest':
        max_depth = gridres_params[
            'clf__max_depth'] if gridres_params is not None else 100
        clf = ensemble.RandomForestClassifier(max_depth=max_depth,
                                              criterion='entropy')
        print("Will run a random forest with max depth=%d" % max_depth)
    elif args.classifier == 'logistic-regression':
        C = gridres_params['clf__C'] if gridres_params is not None else 1
        clf = linear_model.LogisticRegression(C=C)
        print("Will run logistic regressor with C=%d" % C)
    else:  # ensemble
        clf1 = tree.DecisionTreeClassifier(max_depth=100, criterion='entropy')
        clf2 = ensemble.RandomForestClassifier()
        clf3 = linear_model.LogisticRegression()
        clf = ensemble_classifier.EnsembleClassifier(clfs=[clf1, clf2, clf3],
                                                     voting='hard')

    vectorizer = feature_extraction.text.CountVectorizer(
        analyzer='word',        # build features from word (not character) n-grams
        binary=False,           # if True, all non-zero counts would be set to one
        decode_error='strict',  # raise on byte sequences that do not match the encoding
        encoding="ISO-8859-15",
        input='content',        # the descriptions are passed in directly as strings
        lowercase=False,        # do not lower-case before tokenizing
        max_df=1.0,             # do not drop corpus-specific stop words
        max_features=None,      # keep the full vocabulary
        ngram_range=(1, args.nGrams),  # extract all n-grams with 1 <= n <= nGrams
        preprocessor=None,
        stop_words=None,
        min_df=1,
        strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None,
        vocabulary=None)
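    # For example (illustrative values): with --ngrams 2 the vectorizer extracts
    # unigrams and bigrams, so a description like "garden maintenance income"
    # contributes the features 'garden', 'maintenance', 'income',
    # 'garden maintenance' and 'maintenance income'.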

    if args.tf_idf:
        transformer = feature_extraction.text.TfidfTransformer()
        ppl = pipeline.Pipeline([
            ('vectorizer', vectorizer),
            ('transformer', transformer),
            ('clf', clf),
        ])
    else:
        ppl = pipeline.Pipeline([
            ('vectorizer', vectorizer),
            ('clf', clf),
        ])

    k_fold = cross_validation.StratifiedKFold(labels_orig, 5, shuffle=True)

    labels_predicted = [-1] * len(labels_orig)

    accuracy = []

    for train_idx, dev_idx in k_fold:
        data_train = [data_orig[i] for i in train_idx]
        data_dev = [data_orig[i] for i in dev_idx]
        labels_train = [labels_orig[i] for i in train_idx]
        labels_dev = [labels_orig[i] for i in dev_idx]

        ppl.fit(data_train, labels_train)
        predicted_dev = ppl.predict(data_dev)
        labels_predicted = set_all_predicted(predicted_dev, labels_predicted,
                                             dev_idx)

        accuracy += [metrics.accuracy_score(labels_dev, predicted_dev)]

    print("Accuracy of the %s classifier: %.4f +- %.4f" %
          (args.classifier, numpy.mean(accuracy), numpy.std(accuracy)))

    # Save the predicted classes
    to_dump = [labels_orig, labels_predicted]
    helpers.ensure_dir(os.path.dirname(args.output_file_desc))

    #create a dataframe to output type class, predicted type class and description data
    if args.category == 'income-type' or args.category == 'expenditure-type':
        dump_op_desc = pandas.DataFrame({
            'frID': frID,
            'type_class': labels_orig,
            'type_class_predicted': labels_predicted,
            'description': data_orig
        })
    else:
        dump_op_desc = pandas.DataFrame({
            'frID': frID,
            'source_class': labels_orig,
            'source_class_predicted': labels_predicted,
            'description': data_orig
        })
    dump_op_desc.to_csv(args.output_file_desc)
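
# Illustrative sketch (hypothetical): set_all_predicted is called in the
# cross-validation loop above but is not shown in these examples. Judging from
# how it is used, it presumably scatters each fold's predictions back into the
# full list of predicted labels; the real helper may differ.
def set_all_predicted_sketch(predicted_dev, labels_predicted, dev_idx):
    for pred, idx in zip(predicted_dev, dev_idx):
        labels_predicted[idx] = pred
    return labels_predicted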
Exemplo n.º 25
0
def main():
    
  # to parse the arguments that are passed to main
  parser = argparse.ArgumentParser(description=__doc__, 
                                   formatter_class=argparse.RawDescriptionHelpFormatter)
    
  # this is the default input directory if nothing is passed
  INPUT_MS = os.path.join("..", "..", "data", "features", "master_set.csv") 
  INPUT_DF = os.path.join("..", "..", "data", "features", "data_frame.csv") 
  OUTPUT_FILE = os.path.join("..", "..", "data", "features", "sparse_matrix.pkl") 


  parser.add_argument('--im', 
                      '--master-set-file', 
                      type=str, 
                      dest='inputMS', 
                      default=INPUT_MS, 
                      help='Master set of all words to be processed to boolean matrix')

  parser.add_argument('--id', 
                      '--data-frame-file', 
                      type=str, 
                      dest='inputDF', 
                      default=INPUT_DF, 
                      help='Data frame processed by "create_data_frame"')
   
  parser.add_argument('-o', 
                      '--output-file', 
                      type=str, 
                      dest='output_file', 
                      default=OUTPUT_FILE, 
                      help="File to save the sparse matrix. Defaults to '%(default)s'")


  args = parser.parse_args()
      
  master_in = pandas.read_csv(args.inputMS, encoding="ISO-8859-1", header=None)
  data_in = pandas.read_csv(args.inputDF, encoding="ISO-8859-1")

  
  numrows = len(data_in)
  numcols = len(master_in)

  rowind = []
  colind = []
  data=[]

  # loop over all the rows in the data frame
  for i in range(numrows):
    thisrow = data_in.iloc[i,2:]
    valid = thisrow.dropna()
    validset=set(list(valid))
    for setelem in validset:
       master_set_ind =  master_in.loc[master_in[0]==setelem][0].index[0]
       rowind.append(i)
       colind.append(master_set_ind)
       data.append(1)
  
  sparse_mat = scipy.sparse.coo_matrix((numpy.array(data),(numpy.array(rowind), numpy.array(colind))), shape=(numrows, numcols))
  
  # save the matrix
  helpers.ensure_dir(os.path.dirname(args.output_file))
  joblib.dump(sparse_mat, args.output_file)
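  # The saved matrix can later be re-loaded and densified if needed
  # (illustrative):
  #   sparse_mat = joblib.load(args.output_file)
  #   dense = sparse_mat.toarray()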
  
  # Dense alternative (equivalent to the sparse construction above): build a
  # boolean data frame with one column per master-set word.
  master_words = list(master_in[0])
  master_word_set = set(master_words)
  master_cols = pandas.DataFrame(0, index=range(numrows), columns=master_words)

  # loop through the rows
  for x in range(numrows):
      # loop through the word cells of the row (the first two columns hold the classes)
      for y in range(2, len(data_in.columns)):
          token = data_in.iloc[x, y]
          if pandas.notnull(token) and token in master_word_set:
              master_cols.loc[x, token] = 1