def reformat_data(conll_data_dir, text_only_data_dir, remove_pos):
    """
    Data formatting
    ===============

    `word2vec` produces vectors for words, such as `computer`, whereas the rest of my experiments
    assume they are augmented with a PoS tag, e.g. `computer/N`. To get around that, start with a
    directory of CoNLL-formatted files such as

    ```
    1   Anarchism    Anarchism    NNP  MISC  5  nsubj
    2   is           be           VBZ  O     5  cop
    3   a            a            DT   O     5  det
    4   political    political    JJ   O     5  amod
    5   philosophy   philosophy   NN   O     0  root
    ```

    and convert them to PoS-augmented format (using coarse tags like Petrov's):

    ```
    Anarchism/N is/V a/DET ....
    ```

    :param conll_data_dir: input directory in CoNLL format
    :param text_only_data_dir: output directory
    :param remove_pos: if True, do not append the coarse PoS tag to each token
    """
    mkdirs_if_not_exists(text_only_data_dir)
    Parallel(n_jobs=5)(delayed(_reformat_single_file)(conll_data_dir, filename,
                                                      text_only_data_dir, remove_pos)
                       for filename in os.listdir(conll_data_dir))
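
# Hedged usage sketch for reformat_data: both directory names below are hypothetical
# placeholders, not paths used elsewhere in this project.
def _example_reformat_data():
    reformat_data('data/wikipedia-conll', 'data/wikipedia-nopos', remove_pos=True)
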
def train_verb_tensors(svos_file, noun_vectors_file, output_filename):
    """
    Trains Verb-bar matrices, as described in Milajevs et al (EMNLP-14, §3)

    :param svos_file: file containing a list of all SVOs in the unlabelled data, one per line.
     May contain other document features too. Such a file is output by `find_all_NPs.py`,
     which is called from `observed_vectors.py`
    :param noun_vectors_file: a vector store containing noun vectors
    :param output_filename: name of output file; must identify the noun vectors and the unlabelled corpus
    """
    mkdirs_if_not_exists(os.path.dirname(output_filename))

    v = Vectors.from_tsv(noun_vectors_file)

    with open(svos_file) as infile:
        phrases = set()
        for line in infile:
            if DocumentFeature.from_string(line.strip()).type == 'SVO':
                phrases.add(tuple(line.strip().split('_')))
    phrases = [(subj, verb, obj) for subj, verb, obj in phrases if subj in v and obj in v]
    phrases = sorted(phrases, key=itemgetter(1))
    logging.info('Found %d SVOs in list', len(phrases))

    verb_tensors = dict()
    for verb, svos in groupby(phrases, itemgetter(1)):
        svos = list(svos)
        if len(svos) < MIN_SVO_PER_VERB:
            continue
        logging.info('Training matrix for %s from %d SVOs', verb, len(svos))
        # sum of outer products of subject and object vectors; use the builtin sum,
        # as np.sum over a generator is deprecated
        vt = sum(np.outer(v.get_vector(subj).A, v.get_vector(obj).A) for subj, _, obj in svos)
        verb_tensors[verb] = vt

    logging.info('Trained %d verb matrices, saving...', len(verb_tensors))
    for verb, tensor in verb_tensors.items():
        df = pd.DataFrame(tensor)
        df.to_hdf(output_filename, verb.split('/')[0], complevel=9, complib='zlib')
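
# Toy illustration (not project code) of the construction above: a verb's matrix is the
# sum of outer products of the subject and object noun vectors of every SVO it occurs in.
# The 2-d noun vectors and the SVO triples below are made up.
def _toy_verb_matrix_example():
    import numpy as np

    toy_nouns = {'dog/N': np.array([1.0, 0.0]),
                 'cat/N': np.array([0.0, 1.0]),
                 'police/N': np.array([2.0, 1.0]),
                 'thief/N': np.array([1.0, 3.0])}
    svos = [('dog/N', 'chase/V', 'cat/N'),
            ('police/N', 'chase/V', 'thief/N')]
    verb_matrix = sum(np.outer(toy_nouns[s], toy_nouns[o]) for s, _, o in svos)
    # outer([1,0],[0,1]) + outer([2,1],[1,3]) == [[2., 7.], [1., 3.]]
    return verb_matrix
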
def run_stanford_pipeline(data_dir, stanford_dir, java_threads=2, filelistdir=""):
    """
    Process a directory of text using the Stanford CoreNLP suite. Performs:
     - tokenisation
     - sentence segmentation
     - PoS tagging
     - lemmatisation
     - parsing
    Output is written in CoNLL format to "*data_dir*-tagged"
    """
    if not all([data_dir, stanford_dir]):
        raise ValueError("ERROR: Must specify path to data and stanford tools.")

    # Create output directory
    output_dir = "%s-tagged" % data_dir
    try:
        os.mkdir(output_dir)
    except OSError:
        pass  # directory already exists

    # Change working directory to stanford tools
    os.chdir(stanford_dir)

    logging.info("<%s> Beginning stanford pipeline..." % current_time())

    for data_sub_dir in [name for name in os.listdir(data_dir) if not name.startswith(".")]:
        # Set up output subdirectory
        output_sub_dir = os.path.join(output_dir, data_sub_dir)
        input_sub_dir = os.path.join(data_dir, data_sub_dir)
        mkdirs_if_not_exists(output_sub_dir)

        # Create list of files to be processed.
        filelist = os.path.join(filelistdir if filelistdir else stanford_dir,
                                "%s-filelist.txt" % data_sub_dir)
        _make_filelist_and_create_files(input_sub_dir, filelist, output_sub_dir)

        logging.info("<%s> Beginning stanford processing: %s" % (current_time(), input_sub_dir))

        # Construct the stanford java command.
        stanford_cmd = ['./corenlp.sh',
                        '-annotators', 'tokenize,ssplit,pos,lemma,parse',
                        '-filelist', filelist,
                        '-outputDirectory', output_sub_dir,
                        '-threads', str(java_threads),
                        '-outputFormat', 'conll',
                        '-outputExtension', '.tagged',
                        '-parse.maxlen', '50']

        logging.info("Running: \n" + str(stanford_cmd))

        # Run stanford script, block until complete.
        subprocess.call(stanford_cmd)
        logging.info("<%s> Stanford complete for path: %s" % (current_time(), output_sub_dir))

    logging.info("<%s> All stanford complete." % current_time())
    return output_dir
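
# Hedged usage sketch for run_stanford_pipeline. Both paths are hypothetical; the first
# must contain one sub-directory of text files per corpus chunk, the second must be a
# CoreNLP distribution containing corenlp.sh.
def _example_run_stanford_pipeline():
    tagged_dir = run_stanford_pipeline('data/wikipedia', '/opt/stanford-corenlp', java_threads=8)
    logging.info('CoNLL output written to %s', tagged_dir)
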
def build_full_composed_thesauri_with_baroni_and_svd(args):
    # SET UP A FEW REQUIRED PATHS
    byblo_opts, _ = parse_byblo_conf_file(args.conf)
    input_file_name = os.path.basename(byblo_opts.input)

    # INPUT 1: DIRECTORY. Must contain a single conf file
    unigram_vectors_dir = os.path.abspath(byblo_opts.output)
    mkdirs_if_not_exists(unigram_vectors_dir)
    unigram_vectors_dir_ppmi = '%s-ppmi' % os.path.dirname(byblo_opts.output)
    mkdirs_if_not_exists(unigram_vectors_dir_ppmi)
    unigram_vectors_dir_ppmi_svd = '%s-ppmi-svd' % os.path.dirname(byblo_opts.output)
    mkdirs_if_not_exists(unigram_vectors_dir_ppmi_svd)

    # INPUT 2: A FILE, TSV, underscore-separated observed vectors for ANs and NNs
    SVD_DIMS = 100
    ngram_vectors_dir = '%s-ppmi-svd-composed' % os.path.dirname(byblo_opts.output)
    mkdirs_if_not_exists(ngram_vectors_dir)

    composer_algos = [AdditiveComposer, MultiplicativeComposer,
                      LeftmostWordComposer, VerbComposer, RightmostWordComposer]

    # EXTRACT UNIGRAM VECTORS WITH BYBLO
    if 'unigrams' in args.stages:
        calculate_unigram_vectors(os.path.abspath(args.conf), os.path.abspath(args.byblo))
    else:
        logging.warning('Skipping unigrams stage. Assuming output is at %s', byblo_opts.output)

    # FEATURE REWEIGHTING (PPMI) - only if requested in args.stages
    if 'ppmi' in args.stages:
        _do_ppmi(_find_events_file(byblo_opts.output), unigram_vectors_dir_ppmi)

    # REDUCE DIMENSIONALITY
    # add in observed AN/NN vectors for SVD processing. Reduce both unigram vectors and observed
    # phrase vectors together and put the output into the same file
    unreduced_unigram_events_file = _find_events_file(unigram_vectors_dir_ppmi)
    # ...exp6-12/exp6.events.filtered.strings --> ...exp6-12/exp6
    reduced_file_prefix = join(unigram_vectors_dir_ppmi_svd, input_file_name)
    # only keep the most frequent types per PoS tag to speed things up
    counts = [('N', 200000), ('V', 200000), ('J', 100000), ('RB', 0), ('AN', 0), ('NN', 0)]
    if 'svd' in args.stages:
        # in this case the name exp%d-with-obs-phrases is massively misleading because
        # there aren't any obs phrase vectors
        # let's just do SVD on the unigram phrases so we can compose them simply later
        do_svd(unreduced_unigram_events_file, reduced_file_prefix,
               desired_counts_per_feature_type=counts, reduce_to=[SVD_DIMS])
    else:
        logging.warning('Skipping SVD stage. Assuming output is at %s-SVD*', reduced_file_prefix)

    # construct the names of files output by do_svd
    all_reduced_vectors = '%s-SVD%d.events.filtered.strings' % (reduced_file_prefix, SVD_DIMS)

    if 'compose' in args.stages:
        # it is OK for the first parameter to contain phrase vectors, there is explicit filtering coming up
        # the assumption is these are actually observed phrasal vectors
        compose_and_write_vectors(all_reduced_vectors,
                                  '%s-%s' % (input_file_name, SVD_DIMS),
                                  composer_algos,
                                  output_dir=ngram_vectors_dir,
                                  dense_hd5=True)
    else:
        logging.warning('Skipping composition stage. Assuming output is at %s', ngram_vectors_dir)
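
# Hedged sketch of the argparse-style namespace this function expects: a Byblo conf file,
# the path to the Byblo distribution and the list of stages to run. All paths are
# hypothetical placeholders; the stage names are the ones checked above.
def _example_build_full_composed_thesauri():
    from argparse import Namespace
    args = Namespace(conf='conf/exp10/exp10.conf',
                     byblo='/opt/Byblo-2.2.0',
                     stages=['unigrams', 'ppmi', 'svd', 'compose'])
    build_full_composed_thesauri_with_baroni_and_svd(args)
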
def run_glove():
    logging.info('Starting training')
    with temp_chdir(args.glove_dir):
        run_and_log_output('sh {} {}'.format(glove_script, unlabelled_data))

    # convert GloVe's output format to ours
    df = pd.read_csv(raw_vectors_file, sep=' ', index_col=0, header=None)
    logging.info('Done training, filtering junk and converting %d vectors to Byblo-compatible format', len(df))
    # remove any malformed-looking tokens, they'll get in the way later
    mask = [DocumentFeature.from_string(x).type != 'EMPTY' and 3 < len(x) < 20 for x in df.index]
    logging.info('Keeping %d entries', sum(mask))
    logging.info('Shape of vectors before filtering %r', df.shape)
    df = df[mask]
    logging.info('Shape of vectors after filtering %r', df.shape)
    cols = ['f%d' % i for i in range(df.shape[1])]
    mkdirs_if_not_exists(output_dir)
    write_vectors_to_hdf(df.values, df.index, cols, formatted_vectors_file)
def build_thesaurus_out_of_vectors(vectors_path, out_dir, threads=4, num_neighbours=100, sim_function='Cosine'):
    """
    Builds a Byblo thesaurus out of the provided vectors, however these were constructed.
    This function will make an uncompressed copy of the provided vectors file, which might
    be slow and use up a lot of extra disk space.

    :param vectors_path: input vectors in byblo format, compressed or not
    :param out_dir: where to put the thesaurus and all temp files
    :param threads: number of byblo threads
    :param num_neighbours: number of nearest neighbours per entry to output
    :param sim_function: similarity measure between vectors to use. See Byblo docs
    """
    from discoutils.thesaurus_loader import Vectors

    BYBLO_BASE_DIR = '/lustre/scratch/inf/mmb28/FeatureExtractionToolkit/Byblo-2.2.0'
    vectors_path = os.path.abspath(vectors_path)
    out_dir = os.path.abspath(out_dir)
    mkdirs_if_not_exists(out_dir)

    v = Vectors.from_tsv(vectors_path)

    # prepare the files that byblo expects (outf_basename is absolute, so these land in out_dir)
    outf_basename = os.path.join(out_dir, 'input')
    events_file = outf_basename + '.events.filtered.strings'
    entries_file = outf_basename + '.entries.filtered.strings'
    features_file = outf_basename + '.features.filtered.strings'

    v.to_plain_txt(events_file, entries_file, features_file)

    # write the byblo conf file
    conf = '--input {} --output {} --threads {} --similarity-min 0.01 -k {} ' \
           '--measure {} --stages allpairs,knn,unenumerate'.format(outf_basename, out_dir, threads,
                                                                   num_neighbours, sim_function)
    conf_path = os.path.join(out_dir, 'conf.txt')
    with open(conf_path, 'w') as outf:
        for line in conf.split():
            outf.write(line)
            outf.write('\n')

    # go baby go
    with temp_chdir(BYBLO_BASE_DIR):
        reindex_all_byblo_vectors(outf_basename)
        run_byblo(conf_path, touch_input_file=True)
        unindex_all_byblo_vectors(outf_basename)
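
# Hedged usage sketch for build_thesaurus_out_of_vectors; both paths are hypothetical.
# The input must be a Byblo-format vectors file and the output directory will hold the
# nearest-neighbour thesaurus plus Byblo's temporary files.
def _example_build_thesaurus():
    build_thesaurus_out_of_vectors('outputs/word2vec/wiki.unigr.strings',
                                   'outputs/word2vec/wiki-thesaurus',
                                   threads=8, num_neighbours=50)
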
def run_experiment(conf):
    start_time = datetime.now()
    mkdirs_if_not_exists(conf['output_dir'])

    test_path = ''
    tr_data = conf['training_data']
    if conf['test_data']:
        test_path = conf['test_data']

    # LOADING RAW TEXT
    x_tr, y_tr, x_test, y_test = get_tokenized_data(tr_data,
                                                    get_tokenizer_settings_from_conf(conf),
                                                    test_data=test_path)

    # CREATE CROSSVALIDATION ITERATOR
    cv_iterator, y_vals = _build_crossvalidation_iterator(conf['crossvalidation'],
                                                          y_tr, y_test)
    if x_test is not None:
        # concatenate all data, the CV iterator will make sure x_test is used for testing
        x_vals = list(x_tr)
        x_vals.extend(list(x_test))
    else:
        x_vals = x_tr

    all_scores = []
    params = []
    for i, (train_idx, test_idx) in enumerate(cv_iterator):
        params.append((conf, i, multiple_scores, test_idx, train_idx, x_vals, y_vals))
        if conf['crossvalidation']['break_after_first']:
            # only do one train/test split to save time
            logging.warning('Only using the first CV fold')
            logging.info('Exiting after first fold')
            break

    scores_over_cv = [_cv_loop(*foo) for foo in params]
    all_scores.extend([score for one_set_of_scores in scores_over_cv for score in one_set_of_scores])
    _store_scores(all_scores, conf['output_dir'], conf['name'])
    total_time = (datetime.now() - start_time).seconds / 60
    logging.info('MINUTES TAKEN %.2f' % total_time)
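
# Hedged sketch of the configuration keys that run_experiment reads directly. The real
# conf files (parsed elsewhere with parse_config_file) contain many more settings, e.g.
# for the tokenizer and the cross-validation iterator; all values below are placeholders.
_EXAMPLE_CONF_SUBSET = {
    'name': 'example-experiment',
    'output_dir': 'outputs/example-experiment',
    'training_data': 'data/example-tr',
    'test_data': '',  # empty/falsy means no separate test set
    'crossvalidation': {'break_after_first': False},  # plus iterator settings not shown here
}
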
def __init__(self, prefix, stage, cv_fold, n_replacements=3):
    self.token_counts = Counter()
    self.paraphrases = Counter()
    self.prefix = prefix  # store data here instead of in memory
    self.stage = stage
    self.cv_fold = cv_fold
    self.par_file = '%s.%s.csv.gz' % (self.prefix, 'par')  # paraphrases
    self.tc_file = '%s.%s.csv.gz' % (self.prefix, 'tc')  # term counts
    self.max_paraphrases = n_replacements

    mkdirs_if_not_exists(os.path.dirname(self.par_file))
    mkdirs_if_not_exists(os.path.dirname(self.tc_file))

    if cv_fold == 0 and stage == 'tr':
        # experiment just started, write header to output files
        with gzip.open(self.tc_file, 'wb') as outfile:
            outfile.write(bytes('# feature counts in labelled data\n', encoding='UTF8'))
            outfile.write(bytes('cv_fold,stage,feature,IV,IT,count\n', encoding='UTF8'))

        with gzip.open(self.par_file, 'wb') as outfile:
            outfile.write(bytes('# Replacements made at decode time\n', encoding='UTF8'))
            repl_header = ','.join('neigh{0},neigh{0}_sim'.format(i + 1) for i in range(n_replacements))
            header = 'cv_fold,stage,feature,available_replacements,%s,count\n' % repl_header
            outfile.write(bytes(header, encoding='UTF8'))
def compute_and_write_vectors(corpus_name, stages, percent, repeat, remove_pos):
    prefix = os.path.abspath(os.path.join(__file__, '..', '..'))
    output_dir = join(prefix, 'outputs', 'word2vec')
    mkdirs_if_not_exists(output_dir)

    # inputs
    conll_data_dir = join(prefix, 'data/%s-conll' % corpus_name)
    # outputs
    if remove_pos:
        text_only_data_dir = join(prefix, 'data/%s-nopos' % corpus_name)
        unigram_events_file = join(output_dir, '%s-nopos-%dperc.unigr.strings' % (corpus_name, percent))
    else:
        text_only_data_dir = join(prefix, 'data/%s' % corpus_name)
        unigram_events_file = join(output_dir, '%s-%dperc.unigr.strings' % (corpus_name, percent))

    if percent > 90 and repeat > 1:
        raise ValueError('Repeating with a different sample of the corpus only makes sense when '
                         'the samples are sufficiently distinct. This requires that the sample '
                         'size is fairly small to minimise overlap between samples.')

    if 'reformat' in stages:
        reformat_data(conll_data_dir, text_only_data_dir, remove_pos)

    if 'vectors' in stages:
        models = [_train_model(percent, text_only_data_dir, i, remove_pos) for i in range(repeat)]

        vectors = []
        # write the output of each run separately
        for i in range(repeat):
            output_path = unigram_events_file + '.rep%d' % i
            vectors.append(write_gensim_vectors_to_tsv(models[i], output_path))

        if 'average' in stages and repeat > 1:
            # average vectors and append to the list to be written
            shared_vocab = set.intersection(*[set(model.vocab.keys()) for model in models])
            output_path = unigram_events_file + '.avg%d' % repeat
            model = {}
            for k in shared_vocab:
                model[k] = reduce(np.add, [m[k] for m in models])
            vectors.append(write_gensim_vectors_to_tsv(model, output_path, vocab=shared_vocab))
    else:
        # let's just pretend something was written above. just need this so the loop below will run
        vectors = [None] * repeat + ([None] if 'average' in stages and repeat > 1 else [])

    if 'compose' in stages:
        for i, v in enumerate(vectors):
            # if we'll also be composing we don't have to write the unigram vectors to disk
            # just to read them back later.
            if 'average' in stages and i == (len(vectors) - 1) and len(vectors) > 1:
                # last set of vectors in the list- these are the averaged ones
                out_path = 'word2vec-%s_%dpercent-avg%d' % (corpus_name, percent, repeat)
                input_thing = v if 'vectors' in stages else unigram_events_file + '.avg%d' % repeat
            else:
                out_path = 'word2vec-%s_%dpercent-rep%d' % (corpus_name, percent, i)
                input_thing = v if 'vectors' in stages else unigram_events_file + '.rep%d' % i

            row_filter = default_row_filter_nopos if remove_pos else default_row_filter
            compose_and_write_vectors(input_thing,
                                      out_path,
                                      composer_algos,
                                      output_dir=output_dir,
                                      row_filter=row_filter,
                                      remove_pos=remove_pos,
                                      dense_hd5=True)
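
# Hedged usage sketch for the word2vec pipeline driver above. The corpus name and the
# sampling settings are hypothetical; the stage names are the ones the function checks for.
def _example_compute_and_write_vectors():
    compute_and_write_vectors('cwiki',
                              stages=['reformat', 'vectors', 'average', 'compose'],
                              percent=15, repeat=3, remove_pos=False)
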
# the two paths below need to point to the same thing
phrases_to_compose = os.path.join(prefix, '..', 'thesisgenerator',
                                  'features_in_labelled', 'socher.txt')
socher_input_file = os.path.join(socher_base_dir, 'parsed.txt')

plaintext_socher_input_file = os.path.join(prefix, '..', 'eval',
                                           'features_in_labelled', 'all_features.txt')
socher_output_phrases_file = os.path.join(socher_base_dir, 'phrases.txt')
socher_output_vectors_file = os.path.join(socher_base_dir, 'outVectors.txt')
socher_unigram_embedding_matlab = os.path.join(socher_base_dir, 'vars.normalized.100.mat')

# output of reformat stage
turian_unigram_vectors_file = os.path.join(socher_base_dir, 'turian_unigrams.h5')
output_dir = os.path.join(socher_base_dir, 'composed')
mkdirs_if_not_exists(output_dir)
socher_composed_vectors_file = os.path.join(output_dir, 'AN_NN_turian_Socher.events.filtered.strings')


def run_socher_code():
    # symlink the file Socher's code expects to the list of phrases I'm interested in
    force_symlink(phrases_to_compose, socher_input_file)
    with temp_chdir(socher_base_dir):
        run_and_log_output('./phrase2Vector.sh')  # this takes a while
    # output files are phrases.txt and outVectors.txt


def reformat_socher_vectors():
    """
    Formats the files output by Socher (2011)'s matlab code into byblo-compatible files.
def train_grefenstette_multistep_composer(all_vectors_file, root_dir):
    """
    Train Grefenstette et al.'s multistep regression VO/SVO model
    Adapted from dissect's ex19.py

    :param all_vectors_file: file containing N, V, VO and SVO vectors
    :param root_dir: where to write temp files and output
    """
    mkdirs_if_not_exists(root_dir)
    vo_composer_output_file = join(root_dir, 'vo_comp.pkl')
    svo_composer_output_file = join(root_dir, 'svo_comp.pkl')

    filename = basename(all_vectors_file)
    noun_events_file = join(root_dir, '%s-onlyN.tmp' % filename)
    # verb_events_file = join(root_dir, '%s-onlyV.tmp' % filename)
    # vo_events_file = join(root_dir, '%s-onlyVO.tmp' % filename)
    svo_events_file = join(root_dir, '%s-onlySVO.tmp' % filename)

    # this has unigrams and observed phrases
    thes = Vectors.from_tsv(all_vectors_file)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)
    # thes.to_tsv(verb_events_file,
    #             entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'V')
    # _translate_byblo_to_dissect(verb_events_file)
    # thes.to_tsv(vo_events_file,
    #             entry_filter=lambda x: x.type == 'VO')
    # _translate_byblo_to_dissect(vo_events_file)
    thes.to_tsv(svo_events_file,
                entry_filter=lambda x: x.type == 'SVO')
    _translate_byblo_to_dissect(svo_events_file)

    train_vo_data, train_v_data = [], []
    for phrase in thes.keys():
        df = DocumentFeature.from_string(phrase)
        if df.type == 'SVO':
            train_vo_data.append((str(df[1:]), str(df[0]), str(df)))
        if df.type == 'VO':
            train_v_data.append((str(df[0]), str(df[1]), str(df)))
    # logging.info('train_vo_data %r', len(train_vo_data))
    # logging.info('train_v_data %r', len(train_v_data))

    # load N and SVO spaces
    n_space = Space.build(data=noun_events_file + '.sm',
                          cols=noun_events_file + '.cols',
                          format="sm")
    svo_space = Space.build(data=svo_events_file + '.sm',
                            cols=svo_events_file + '.cols',
                            format="sm")

    logging.info("Input SVO training space:")
    logging.info(svo_space.id2row)
    # logging.info(svo_space.cooccurrence_matrix)

    # 1. train a model to learn VO functions on train data: VO N -> SVO
    logging.info("Step 1 training")
    vo_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)  # Gref et al 2013, §5 says 3
    vo_model.train(train_vo_data, n_space, svo_space)
    io_utils.save(vo_model, vo_composer_output_file)

    # 2. train a model to learn V functions on train data: V N -> VO
    # where VO space: function space learned in step 1
    logging.info("Step 2 training")
    vo_space = vo_model.function_space
    v_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)
    v_model.train(train_v_data, n_space, vo_space)
    io_utils.save(v_model, svo_composer_output_file)
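
# Hedged usage sketch: train the two-step Grefenstette et al. composers from a single
# Byblo-format vectors file containing N, V, VO and SVO entries. Both paths are
# hypothetical placeholders.
def _example_train_grefenstette():
    train_grefenstette_multistep_composer('outputs/observed/exp10.events.filtered.strings',
                                          'outputs/gref_multistep')
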
def train_baroni_guevara_composers(all_vectors,
                                   ROOT_DIR,
                                   baroni_output_path, guevara_output_path,
                                   baroni_threshold=10):
    """
    :type all_vectors: str; path to vectors file containing both N and observed AN vectors
    :type ROOT_DIR: str; where to write temp files
    :type baroni_output_path: str; where to write pickled baroni composer
    :type guevara_output_path: str
    :type baroni_threshold: int
    """
    SVD_DIMS = 100
    baroni_training_phrase_types = {'AN', 'NN'}  # what kind of NPs to train Baroni composer for

    # prepare the input files to be fed into Dissect
    mkdirs_if_not_exists(ROOT_DIR)

    filename = basename(all_vectors)
    noun_events_file = join(ROOT_DIR, '%s-onlyN-SVD%d.tmp' % (filename, SVD_DIMS))
    NPs_events_file = join(ROOT_DIR, '%s-onlyPhrases-SVD%d.tmp' % (filename, SVD_DIMS))

    thes = Vectors.from_tsv(all_vectors, lowercasing=False)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)

    thes.to_tsv(NPs_events_file,
                entry_filter=lambda x: x.type in baroni_training_phrase_types,
                row_transform=lambda x: str(x).replace(' ', '_'))
    _translate_byblo_to_dissect(NPs_events_file)

    my_space = Space.build(data="{}.sm".format(noun_events_file),
                           rows="{}.rows".format(noun_events_file),
                           cols="{}.cols".format(noun_events_file),
                           format="sm")
    logging.info('Each unigram vector has dimensionality %r', my_space.element_shape)

    # create a peripheral space
    my_per_space = PeripheralSpace.build(my_space,
                                         data="{}.sm".format(NPs_events_file),
                                         rows="{}.rows".format(NPs_events_file),
                                         # The columns of the peripheral space have to be identical to those
                                         # in the core space (including their order)!
                                         cols="{}.cols".format(NPs_events_file),
                                         format="sm")
    logging.info('Each phrase vector has dimensionality %r', my_per_space.element_shape)

    # use the model to compose words in my_space
    all_data = []
    for phrase in my_per_space._row2id:
        # make sure there are only NPs here
        if DocumentFeature.from_string(phrase.replace(' ', '_')).type in baroni_training_phrase_types:
            adj, noun = phrase.split('_')
            all_data.append((adj, noun, '%s_%s' % (adj, noun)))

    # train a composition model on the data and save it
    baroni = LexicalFunction(min_samples=baroni_threshold, learner=RidgeRegressionLearner())
    guevara = FullAdditive(learner=RidgeRegressionLearner())
    for composer, out_path in zip([baroni, guevara],
                                  [baroni_output_path, guevara_output_path]):
        composer.train(all_data, my_space, my_per_space)
        io_utils.save(composer, out_path)
        logging.info('Saved trained composer to %s', out_path)
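
# Hedged usage sketch for the Baroni and Guevara composers; all paths are hypothetical.
# The input vectors file must contain both noun unigram vectors and observed AN/NN vectors.
def _example_train_baroni_guevara():
    train_baroni_guevara_composers('outputs/observed/exp10-SVD100.events.filtered.strings',
                                   'outputs/baroni_guevara',
                                   'outputs/baroni_guevara/baroni.pkl',
                                   'outputs/baroni_guevara/guevara.pkl',
                                   baroni_threshold=10)
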
def _write_features_of_single_corpus_to_file(all_phrases, corpus_name):
    ALL_FEATURES_FILE = '%s/%s_all_features.txt' % (ROOT, corpus_name)
    NP_MODIFIERS_FILE = '%s/%s_np_modifiers.txt' % (ROOT, corpus_name)
    VERBS_FILE = '%s/%s_verbs.txt' % (ROOT, corpus_name)
    SOCHER_FILE = '%s/%s_socher.txt' % (ROOT, corpus_name)

    logging.info('Writing %d unique document features to files in %s', len(all_phrases), ROOT)

    # How the Stanford parser formats NPs and VPs
    # (ROOT
    #   (NP (NN acquisition) (NN pact)))
    #
    # (ROOT
    #   (NP (JJ pacific) (NN stock)))
    stanford_NP_pattern = '(ROOT\n (NP ({} {}) ({} {})))\n\n'

    # (ROOT
    #   (S
    #     (NP (NNS cats))
    #     (VP (VBP eat)
    #       (NP (NNS dogs)))))
    stanford_SVO_pattern = '(ROOT\n (S\n (NP (NN {}))\n (VP (VB {})\n (NP (NN {})))))\n\n'

    # (ROOT
    #   (S
    #     (VP (VB eat)
    #       (NP (NNS cats)))))
    stanford_VO_pattern = '(ROOT\n (S\n (VP (VB {})\n (NP (NN {})))))\n\n'

    # (ROOT
    #   (NP (NN roads)))
    # I checked that this extracts the neural word embedding for the word
    stanford_unigram_pattern = '(ROOT\n (NP ({} {})))\n\n'

    mkdirs_if_not_exists(ROOT)
    logging.info('Writing all document features to files')
    seen_modifiers, seen_verbs = set(), set()

    with open(SOCHER_FILE, 'w') as outf_socher, \
            open(NP_MODIFIERS_FILE, 'w') as outf_mods, \
            open(VERBS_FILE, 'w') as outf_verbs, \
            open(ALL_FEATURES_FILE, 'w') as outf_plain:
        for item in all_phrases:
            item = DocumentFeature.from_string(item)
            # write in my underscore-separated format
            outf_plain.write(str(item) + '\n')

            if item.type in {'AN', 'NN'}:
                # write the phrase in Socher's format
                string = stanford_NP_pattern.format(item.tokens[0].pos * 2, item.tokens[0].text,
                                                    item.tokens[1].pos * 2, item.tokens[1].text)
                outf_socher.write(string)

            if item.type in {'VO', 'SVO'}:
                verb = str(item.tokens[-2])
                if verb not in seen_verbs:
                    seen_verbs.add(verb)
                    outf_verbs.write(verb)
                    outf_verbs.write('\n')

            if item.type == 'VO':
                string = stanford_VO_pattern.format(*[x.tokens[0].text for x in item])
                outf_socher.write(string)

            if item.type == 'SVO':
                string = stanford_SVO_pattern.format(*[x.tokens[0].text for x in item])
                outf_socher.write(string)

            if item.type in {'AN', 'NN'}:
                # write just the modifier separately
                first = str(item.tokens[0])
                second = str(item.tokens[1])
                if first not in seen_modifiers:
                    outf_mods.write('%s\n' % first)
                    seen_modifiers.add(first)

            if item.type == '1-GRAM':
                string = stanford_unigram_pattern.format(item.tokens[0].pos * 2,
                                                         item.tokens[0].text)
                outf_socher.write(string)

            if item.type not in {'1-GRAM', 'AN', 'NN', 'VO', 'SVO'}:
                # there shouldn't be any other features
                raise ValueError('Item %r has the wrong feature type: %s' % (item, item.type))
    if not os.path.exists(arg):
        parser.error("The conf file %s does not exist!" % arg)
    else:
        return arg


if __name__ == '__main__':
    # parse command-line arguments (conf file only)
    parser = argparse.ArgumentParser(description='Evaluate vectors via document classification')
    parser.add_argument('conf_file', help='Conf file that defines the experiment', type=is_valid_file)
    args = parser.parse_args()

    conf, configspec_file = parse_config_file(args.conf_file)
    mkdirs_if_not_exists(conf['output_dir'])

    # set up logging to file
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s\t%(module)s.%(funcName)s (line %(lineno)d)\t%(levelname)s : %(message)s",
                        datefmt='%m-%d %H:%M',
                        filename=os.path.join(conf['output_dir'], 'log.txt'),
                        filemode='w')
    # define a Handler which writes INFO messages or higher to sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    # set a format which is simpler for console use
    formatter = logging.Formatter("%(asctime)s\t%(module)s.%(funcName)s (line %(lineno)d)\t%(levelname)s : %(message)s")
    # tell the handler to use this format
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)
def conf():
    config, _ = parse_config_file(conf_file)
    mkdirs_if_not_exists(config['output_dir'])
    return config