def get_raw_tweets(query_dict):
    """Return raw tweets for *query_dict*, reusing an on-disk CSV cache.

    :param query_dict: search parameters, e.g.::

        query_string: 'datacamp lang:en'
        time_since: '2019-03-01'
        time_until: '2019-05-01'
        max_tweets: 0 for unlimited

    :return: dataframe of raw tweets
    """
    file_name = _convert_query_dict_to_str_as_filename(query_dict)
    save_raw_file_name = paths.raw_tweets / f"raw_{file_name}.csv"
    print(file_name)

    if save_raw_file_name.is_file():
        # A previous run already scraped this exact query: reload from disk.
        print(f"Raw file {repr(save_raw_file_name)} already exists, reload")
        tweet_df = load_csv(save_raw_file_name)
    else:
        # Fresh query: validate, scrape, convert, then persist for next time.
        _validate_query(query_dict)
        print(f"Getting raw tweets with query:\n{query_dict!r}")
        criteria = _create_search_criteria(**query_dict)
        tweet_objects = _get_tweet_object(criteria)
        tweet_df = _convert_tweets_to_dataframe(tweet_objects)
        print(f"Saving raw tweets to: {repr(save_raw_file_name)}")
        tweet_df.to_csv(save_raw_file_name, index=False)

    print("Done getting raw tweets.")
    return tweet_df
def __init__(self, splits: str):
    """Load every split named in the comma-separated *splits* string.

    Each CSV row becomes one datum dict with a globally unique integer id.
    """
    self.name = splits
    self.splits = splits.split(',')

    # Loading datasets
    self.data = []
    idx = 0
    for split in self.splits:
        csv_path = '{}{}{}.csv'.format(VCSD_DATA_ROOT, VCSD_FILE_BASE, split)
        for row in load_csv(csv_path, delimiter='\t'):
            self.data.append({
                'id': idx,
                'raw_image_id': row['raw_image_id'],
                'image_id': row['image_id'],
                'utterance': row['utterance'],
                'response': row['response'],
                'label': row['label'],
            })
            idx += 1
    print("Load %d data from split(s) %s." % (len(self.data), self.name))

    # Convert list to dict (for evaluation)
    self.id2datum = {datum['id']: datum for datum in self.data}
def read_csv(input_file):
    """Reads a tab separated value file.

    :param input_file: path forwarded to ``load_csv`` (first row is the header).
    :return: list of ``[label, fact]`` pairs; rows whose 'fact' cell is not a
        string (e.g. NaN produced by empty cells) are dropped.
    """
    df = load_csv(input_file, header=0)
    facts = df['fact'].tolist()
    labels = df['label'].tolist()
    # isinstance() instead of `type(...) == str`, and zip() instead of an
    # index-based loop over range(len(...)).
    return [[label, fact]
            for label, fact in zip(labels, facts)
            if isinstance(fact, str)]
def _read_csv(cls, input_file, content_col=None, label_col=None):
    """Reads a tab separated value file.

    :param input_file: path forwarded to ``load_csv`` (first row is the header).
    :param content_col: column holding the text; defaults to "content".
    :param label_col: column holding the label; defaults to 'label_out'.
    :return: list of ``[label, content]`` pairs; rows whose content cell is
        not a string are dropped.
    """
    # `is None` rather than `== None` (identity test for the None singleton).
    if content_col is None:
        content_col = "content"
    if label_col is None:
        label_col = 'label_out'
    df = load_csv(input_file, header=0)
    # Empty cells would otherwise surface as NaN floats downstream.
    df.fillna("|", inplace=True)
    job_fact = df[content_col].tolist()
    labels = df[label_col].to_list()
    # isinstance() + zip() replace the type()==str comparison and index loop.
    lines = [[label, fact]
             for label, fact in zip(labels, job_fact)
             if isinstance(fact, str)]
    print('Length of data:', len(lines))
    print("Fact example:", job_fact[0])
    print("Label example:", labels[0])
    return lines
def get_train_labels(self, data_dir, label_col):
    """Return the label ids for column *label_col* of the CSV at *data_dir*.

    :param data_dir: path to the training CSV file (first row is the header).
    :param label_col: name of the label column.
    :return: list of ids produced by ``self.change_label_to_id``.
    """
    df = load_csv(data_dir, header=0)
    label_series = df[label_col].apply(
        lambda x: self.change_label_to_id(x, label_col))
    # .tolist() replaces `[s[i] for i in range(len(s))]`, which indexed the
    # Series by *label* and silently assumed a default RangeIndex.
    return label_series.tolist()
def read_raw_csv(self, input_file):
    """Load *input_file* as a CSV whose first row is the header."""
    df = load_csv(input_file, header=0)
    return df
def main(verbose: bool):
    """Sweep text-similarity thresholds over the SITC->OENACE mapping and
    write the coverage/average-candidate-length statistics to a CSV.

    :param verbose: when True, print the enrichment applied to each SITC title.
    """
    import csv  # stdlib; kept function-local to match the original style

    oenace_codes = load_csv(OENACE_FILE_PATH)
    sitc_codes = load_csv(SITC2_FILE_PATH)
    sitc_enriched_codes = load_enriched_sitc_codes(SITC2_ENRICHED_FILE_PATH)

    results = []
    for use_enriched_sitc in [True, False]:
        # lower-cased loop variable: UPPER_SNAKE is reserved for constants
        for threshold in [40, 50, 60, 70, 75, 80, 90, 95]:
            print(f'Doing {threshold}')
            total = 0
            oenace_candidates = {}
            for sitc_code, sitc_title in sitc_codes.items():
                total += 1
                sitc_title_extended = [sitc_title]

                # if we want to use enriched version from the correspondence
                # tables, extend sitc titles
                if use_enriched_sitc:
                    extending = sitc_enriched_codes.get(sitc_code, [])
                    if extending and verbose:
                        print(f'Extending "{sitc_title}" with: {extending}')
                    sitc_title_extended += extending

                # hold a list of possible mapping candidates from oenace codes
                # for each method
                oenace_candidates[sitc_code] = {
                    'text_similarity': [],
                    'inverted_index': [],
                }

                # step1: try to do exact name matching
                for oenace_code, oenace_title in oenace_codes.items():
                    text_similarity = perform_text_similarity(
                        sitc_title=sitc_title,
                        oeance_title=oenace_title,
                        should_extend_sitc=use_enriched_sitc,
                        sitc_extensions=sitc_title_extended)
                    if text_similarity > threshold:
                        oenace_candidates[sitc_code]['text_similarity'].append({
                            'oenace_code': oenace_code,
                            'oenace_title': oenace_title,
                            'similarity': text_similarity,
                        })

            total_mapped = 0
            length_of_mapped_items = 0
            for candidates in oenace_candidates.values():
                if candidates.get('text_similarity'):
                    total_mapped += 1
                    length_of_mapped_items += len(candidates['text_similarity'])

            # Guard: at a high threshold nothing may map, which previously
            # raised ZeroDivisionError computing the average.
            avg_length = (round(length_of_mapped_items / total_mapped, 2)
                          if total_mapped else 0.0)
            results.append({
                'threshold': threshold,
                'enriched': use_enriched_sitc,
                'total_mapped_pct': round((total_mapped / len(sitc_codes)) * 100, 2),
                'avg_length': avg_length,
            })

    output_path = '../data/chart_results/text_similarity_thresholds.csv'
    # newline='' is required for file objects handed to the csv module,
    # otherwise blank lines appear between rows on Windows.
    with open(output_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=[
            'threshold', 'enriched', 'total_mapped_pct', 'avg_length'
        ])
        writer.writeheader()
        writer.writerows(results)
    print(23)
def main(use_enriched_sitc: bool, verbose: bool):
    """Map SITC codes to OENACE candidates via three methods (fuzzy text
    similarity, tf-idf inverted index, word embeddings), print the
    intersections, then hand the candidates to the review GUI.

    :param use_enriched_sitc: extend SITC titles with correspondence-table data.
    :param verbose: print per-code progress details.
    """
    oenace_codes = load_csv(OENACE_FILE_PATH)
    sitc_codes = load_csv(SITC2_FILE_PATH)
    sitc_enriched_codes = load_enriched_sitc_codes(SITC2_ENRICHED_FILE_PATH)
    method = 'stemming'
    metadata = load_metadata(method)

    counter = 0
    total = 0
    oenace_candidates = {}
    # Hoisted out of the per-code loop: construction is loop-invariant and
    # presumably loads the embedding model — TODO confirm it is stateless
    # across calls.
    word2vec_similarity = Word2VecSimilarity(verbose=verbose)

    print(f'Start time: {datetime.now().isoformat()}')
    for sitc_code, sitc_title in sitc_codes.items():
        total += 1
        if total > 1:
            # Change this to pass if you want to map all the codes
            # Change this to break if you want to stop and use mappings from
            # previous trainings
            break
        if verbose:
            print(f"Findind a mapping for: '{sitc_title}'")
        sitc_title_extendend = [sitc_title]

        # if we want to use enriched version from the correspondence tables,
        # extend sitc titles
        if use_enriched_sitc:
            extending = sitc_enriched_codes.get(sitc_code, [])
            if extending and verbose:
                print(f'Extending "{sitc_title}" with: {extending}')
            sitc_title_extendend += extending

        # hold a list of possible mapping candidates from oenace codes for
        # each method
        oenace_candidates[sitc_code] = {
            'text_similarity': [],
            'inverted_index': [],
            'word_embedding': [],
        }

        # step1: try to do exact name matching
        for oenace_code, oenace_title in oenace_codes.items():
            text_similarity = perform_text_similarity(
                sitc_title=sitc_title,
                oeance_title=oenace_title,
                should_extend_sitc=use_enriched_sitc,
                sitc_extensions=sitc_title_extendend
            )
            if text_similarity > TEXT_SIMILARITY_THRESHOLD:
                oenace_candidates[sitc_code]['text_similarity'].append({
                    'oenace_code': oenace_code,
                    'oenace_title': oenace_title,
                    'similarity': text_similarity,
                })

        # step2: perform search using tf-idf weighting
        tf_idf_results = perform_exploration(
            query='.'.join(sitc_title_extendend),
            method=method, metadata=metadata)
        if tf_idf_results:
            oenace_candidates[sitc_code]['inverted_index'].extend([{
                'oenace_code': item[0],
                'oenace_title': oenace_codes[item[0]],
                'similarity': item[1],
            } for item in tf_idf_results])

        # step 3: word embedding
        for oenace_code, oenace_title in oenace_codes.items():
            if use_enriched_sitc:
                word_embedding_similarity = word2vec_similarity.enriched_sitc_similarity(
                    sitc_title_extendend, oenace_title)
            else:
                word_embedding_similarity = word2vec_similarity.text_similarity(
                    sitc_title, oenace_title)
            # append all as possible candidates, but in the end choose only
            # top 3 with minimum distance
            oenace_candidates[sitc_code]['word_embedding'].append({
                'oenace_code': oenace_code,
                'oenace_title': oenace_title,
                'similarity': word_embedding_similarity,
            })

        # sort by similiarity descending
        # BUG FIX: the original reused the name `method` as this loop variable,
        # clobbering the outer `method = 'stemming'` that perform_exploration
        # reads — from the second SITC iteration onward the wrong method would
        # have been passed (masked today only by the `break` above).
        for candidate_kind in ['text_similarity', 'inverted_index']:
            if oenace_candidates[sitc_code][candidate_kind]:
                oenace_candidates[sitc_code][candidate_kind] = sort_by_similarity(
                    oenace_candidates[sitc_code][candidate_kind], descending=True
                )

        # word-embedding scores sort ascending (smaller distance = closer)
        oenace_candidates[sitc_code]['word_embedding'] = sort_by_similarity(
            oenace_candidates[sitc_code]['word_embedding'], descending=False
        )[:100]

        # find intersections from all steps
        intersections = find_matching_intersections(oenace_candidates[sitc_code])
        if intersections:
            counter += 1
            print(f"\nFindind a mapping for: '{sitc_title}'")
            for val in intersections:
                print(f' - {val}')
            print(end='\n\n')
        print(f'Done {total}/{len(sitc_codes)} so far')

    click.echo(f'Found a total of {counter} mappings from {total}')
    print(f'End time: {datetime.now().isoformat()}')
    start_gui(sitc_codes=sitc_codes,
              oeance_codes=oenace_codes,
              oenace_candidates=oenace_candidates)
def main(verbose: bool):
    """Compare lemmatizing vs stemming for the tf-idf inverted-index mapping
    and write the coverage/average-candidate-length statistics to a CSV.

    :param verbose: when True, print the enrichment applied to each SITC title.
    """
    import csv  # stdlib; kept function-local to match the original style

    oenace_codes = load_csv(OENACE_FILE_PATH)
    sitc_codes = load_csv(SITC2_FILE_PATH)
    sitc_enriched_codes = load_enriched_sitc_codes(SITC2_ENRICHED_FILE_PATH)

    results = []
    for use_enriched_sitc in [True, False]:
        for method in ['lemmatizing', 'stemming']:
            metadata = load_metadata(method)
            counter = 0
            total = 0
            oenace_candidates = {}
            for sitc_code, sitc_title in sitc_codes.items():
                total += 1
                sitc_title_extended = [sitc_title]

                # if we want to use enriched version from the correspondence
                # tables, extend sitc titles
                if use_enriched_sitc:
                    extending = sitc_enriched_codes.get(sitc_code, [])
                    if extending and verbose:
                        print(f'Extending "{sitc_title}" with: {extending}')
                    sitc_title_extended += extending

                # hold a list of possible mapping candidates from oenace codes
                # for each method
                oenace_candidates[sitc_code] = {
                    'text_similarity': [],
                    'inverted_index': [],
                }

                # step2: perform search using tf-idf weighting
                tf_idf_results = perform_exploration(
                    query='.'.join(sitc_title_extended),
                    method=method, metadata=metadata)
                if tf_idf_results:
                    oenace_candidates[sitc_code]['inverted_index'].extend([{
                        'oenace_code': item[0],
                        'oenace_title': oenace_codes[item[0]],
                        'similarity': item[1],
                    } for item in tf_idf_results])

            total_mapped = 0
            length_of_mapped_items = 0
            for candidates in oenace_candidates.values():
                if candidates.get('inverted_index'):
                    total_mapped += 1
                    length_of_mapped_items += len(candidates['inverted_index'])

            # Guard: nothing may map, which previously raised
            # ZeroDivisionError computing the average.
            avg_length = (round(length_of_mapped_items / total_mapped, 2)
                          if total_mapped else 0.0)
            results.append({
                'method': method,
                'enriched': use_enriched_sitc,
                'total_mapped_pct': round((total_mapped / len(sitc_codes)) * 100, 2),
                'avg_length': avg_length,
            })

    output_path = '../data/chart_results/lemavsstem.csv'
    # newline='' is required for file objects handed to the csv module,
    # otherwise blank lines appear between rows on Windows.
    with open(output_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=[
            'method', 'enriched', 'total_mapped_pct', 'avg_length'
        ])
        writer.writeheader()
        writer.writerows(results)
    print(23)