def transcribe_labels(results_fname):
    """
    Read the results file named by results_fname, stored under the
    crowdflower subdir of the data dir, and interpret the annotations into
    labels.  Then, write those labels out into the relational-nouns subdir of
    the data dir, within a subsubdir named after results_fname (minus its
    extension).  Record the results in three separate files -- based on what
    source the word was drawn from.
    """
    # Work out paths
    results_path = os.path.join(DATA_DIR, 'crowdflower', results_fname)
    result_fname_no_ext = results_fname.rsplit('.', 1)[0]
    labels_dir = os.path.join(
        DATA_DIR, 'relational-nouns', result_fname_no_ext)
    t4k.ensure_exists(labels_dir)

    # Read in the results, and interpret labels
    crowdflower_results = t4k.CrowdflowerResults(
        results_path, lambda x: x['data']['token'])
    word_labels = interpret_annotations_by_source(crowdflower_results)

    # Write labels to disk, with words coming from different sources put into
    # different files.
    for source in word_labels:
        source_label_file = open(
            os.path.join(labels_dir, source + '.tsv'), 'w')
        for word, label in word_labels[source].iteritems():
            source_label_file.write(word + '\t' + label + '\n')
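# Usage sketch (hypothetical filename): after downloading a Crowdflower
# results file to data/crowdflower/results2.json, the call below writes
# per-source label files (e.g. rand2.tsv) under data/relational-nouns/results2/.
#
#     transcribe_labels('results2.json')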
def extract_features_from_parc(
    num_processes=NUM_PROCESSES,
    limit=pd.MAX_ARTICLE_NUM,
    out_path=None
):
    # Resolve the path to the results file and open it for writing.
    if out_path is None:
        out_dir = os.path.join(DATA_DIR, 'parc-verifiability')
        t4k.ensure_exists(out_dir)
        out_path = os.path.join(out_dir, 'features.np')
    out_file = open(out_path, 'w')

    # Make a queue so that workers can send results back
    results_queue = iq.IterableQueue()

    # Start a bunch of workers
    for proc_num in range(num_processes):
        p = multiprocessing.Process(
            target=extract_features_from_parc_worker,
            args=(
                results_queue.get_producer(), proc_num, num_processes, limit)
        )
        p.start()

    # Get an endpoint to collect the work, then close the queue since we won't
    # make any more endpoints
    results_consumer = results_queue.get_consumer()
    results_queue.close()

    # Collect all the incoming work from the workers
    all_result_vectors = []
    for result_vectors in results_consumer:
        all_result_vectors.extend(result_vectors)

    # Turn all the result vectors into a single pandas dataframe, and save it
    use_headers = headers[:1] + headers[2:]
    data_frame = pandas.DataFrame(all_result_vectors, columns=use_headers)
    pickle.dump(data_frame, out_file)
def extract_features_from_parc_file(article_num):
    parc_features_dir = os.path.join(
        DATA_DIR, 'parc-verifiability', 'features')
    t4k.ensure_exists(parc_features_dir)
    corenlp_path, parc_path, raw_path = pd.get_article_paths(article_num)
    out_vector_path = os.path.join(
        parc_features_dir, pd.get_article_features_path(article_num))
    return extract_features(corenlp_path, parc_path, raw_path, out_vector_path)
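# The sketch below is not part of the original pipeline: it is a hypothetical
# convenience wrapper showing how extract_features_from_parc_file() can be
# driven serially over a range of article numbers (the multiprocessing version
# above parallelizes the same per-article work via
# extract_features_from_parc_worker).
def extract_parc_features_serially(start_article=0, stop_article=10):
    """
    Hypothetical helper: run feature extraction for articles in
    [start_article, stop_article) one at a time and return the results.
    """
    results = []
    for article_num in range(start_article, stop_article):
        results.append(extract_features_from_parc_file(article_num))
    return results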
def do_generate_candidates1():
    # Decide the output path and the number of positive candidates to find
    t4k.ensure_exists(CANDIDATES_DIR)
    out_path = os.path.join(CANDIDATES_DIR, 'candidates1.txt')
    num_to_generate = 1000

    # Read in the seed set, which is the basis for the model that selects new
    # candidates
    pos, neg, neut = utils.get_full_seed_set()

    # Don't keep any candidates that were already in the seed set
    exclude = pos | neg | neut

    generate_candidates.generate_candidates(
        num_to_generate, out_path, pos, neg, exclude)
def do_generate_candidates_iteration(iteration=2, kernel=None, features=None):
    # Work out the file names
    candidates_fname = 'candidates%d.txt' % iteration
    random_candidates_fname = 'random_candidates%d.txt' % iteration
    previous_results_fname = 'results%d.json' % (iteration-1)
    previous_labels_dirname = 'results%d' % (iteration-1)
    previous_task_fnames = ['task%d.csv' % j for j in range(1, iteration)]

    # Decide the output paths and the number of positive candidates to find
    t4k.ensure_exists(CANDIDATES_DIR)
    out_path = os.path.join(CANDIDATES_DIR, candidates_fname)
    random_out_path = os.path.join(CANDIDATES_DIR, random_candidates_fname)
    num_to_generate = 1000

    # Read in the seed set, which is the basis for the model that selects new
    # candidates
    pos, neg, neut = utils.get_full_seed_set()
    exclude = pos | neg | neut

    # Read in the labelled data from the previous round of results
    transcribe_labels(previous_results_fname)
    add_pos, add_neg, add_neut = utils.read_all_labels(os.path.join(
        DATA_DIR, 'relational-nouns', previous_labels_dirname))

    # Add these nouns to the seeds
    pos.update(add_pos)
    neg.update(add_neg)
    neut.update(add_neut)

    # Don't keep any candidates that were already in the seed set or in
    # previously loaded questions
    for task_fname in previous_task_fnames:
        task_path = os.path.join(CROWDFLOWER_DIR, task_fname)
        reader = csv.DictReader(open(task_path))
        exclude.update([row['token'] for row in reader])

    ## Generate the non-random candidates, enabling enrichment of positives
    #generate_candidates.generate_candidates_ordinal(
    #    num_to_generate, out_path, pos, neg, neut, exclude, kernel, features)

    # Generate random candidates, enabling exploration and model testing.
    generate_candidates.generate_random_candidates(2000, random_out_path)
def make_crowdflower_csv(iteration=2):
    # Seed randomness for reproducibility
    random.seed(0)

    # Open a file at which to write the csv file
    t4k.ensure_exists(CROWDFLOWER_DIR)
    task_fname = 'task%d.csv' % iteration
    csv_path = os.path.join(CROWDFLOWER_DIR, task_fname)
    csv_f = open(csv_path, 'w')

    # First read the scored candidates
    pos_common_candidates = []
    neg_common_candidates = []
    neut_common_candidates = []
    candidates_fname = 'candidates%d.txt' % iteration
    for line in open(os.path.join(CANDIDATES_DIR, candidates_fname)):
        token, class_ = line.split('\t')[:2]
        if class_ == '+':
            pos_common_candidates.append(token)
        elif class_ == '-':
            neg_common_candidates.append(token)
        elif class_ == '0':
            neut_common_candidates.append(token)
        else:
            raise ValueError(
                'Unexpected classification character: %s' % class_)

    # We'll only keep the first 1000 candidates from each class.
    positives = pos_common_candidates[:1000]
    neutrals = neut_common_candidates[:1000]
    negatives = neg_common_candidates[:1000]
    #num_neut = min(250, len(neut_common_candidates))
    #neg_common_candidates = neg_common_candidates[:500-num_neut]
    #neut_common_candidates = neut_common_candidates[:num_neut]

    # Next read the random candidates, keeping the first 2000.
    random_candidates_fname = 'random_candidates%d.txt' % iteration
    random_candidates_path = os.path.join(
        CANDIDATES_DIR, random_candidates_fname)
    random_candidates = open(
        random_candidates_path).read().strip().split('\n')
    random_candidates = random_candidates[:2000]

    # Collect all the candidate words together and eliminate dupes
    all_candidates = set(positives + negatives + neutrals + random_candidates)

    # Now keep track of why each word was included (i.e. was it a word
    # labelled by the classifier-to-date as positive? negative? or was it
    # randomly sampled?).  Note that a word could be both randomly drawn and
    # labelled.
    positives = set(positives)
    negatives = set(negatives)
    neutrals = set(neutrals)
    random_candidates = set(random_candidates)
    sourced_candidates = []
    for candidate in all_candidates:
        sources = []
        if candidate in positives:
            sources.append('pos2')
        if candidate in negatives:
            sources.append('neg2')
        if candidate in neutrals:
            sources.append('neut2')
        if candidate in random_candidates:
            sources.append('rand2')
        sourced_candidates.append((candidate, ':'.join(sources)))

    # Randomize the ordering
    random.shuffle(sourced_candidates)

    # Write a csv file with the candidate words in it
    writer = csv.writer(csv_f)
    writer.writerow(['token', 'source'])
    writer.writerows(sourced_candidates)
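# A minimal driver sketch (hypothetical, not a script that ships with this
# module): it strings together one bootstrapping iteration in the order the
# functions above describe -- transcribe the previous round's Crowdflower
# results, generate new candidates, then build the next Crowdflower task csv.
def run_bootstrap_iteration(iteration=2):
    """
    Hypothetical helper illustrating the intended call order for one
    candidate-generation / annotation iteration.
    """
    # transcribe_labels() for the previous round is called inside
    # do_generate_candidates_iteration(), so only two calls are needed here.
    do_generate_candidates_iteration(iteration=iteration)
    make_crowdflower_csv(iteration=iteration)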