def load_data(filename, common_users):
    """
    generates a dictionary of user to all their (cs or monolingual) posts
    :param filename: csv file with user posts
    :param common_users: a list of users who have both cs and monolingual texts
    :return: user to posts map
    """
    texts = {}
    with open(filename, 'r') as fin:
        print('reading', filename)
        csv_reader = csv.reader(fin, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        header = csv_reader.__next__()
        for line in csv_reader:
            if len(line) < 8:
                continue
            if len(line[7].split()) < MIN_SENTENCE_LENGTH:
                continue
            author = line[0].strip()
            if author not in common_users:
                continue
            text_by_author = texts.get(author, [])
            text_by_author.append(' '.join(word_tokenize(line[7].strip().lower())))
            texts[author] = text_by_author
        # end for
    # end with
    object_name = '<cs or monolingual texts by author>'
    Serialization.save_obj(texts, object_name)
    return texts

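# Hedged usage sketch (illustration only, not part of the original pipeline): the function
# above assumes module-level imports roughly like
#   import csv
#   from nltk.tokenize import word_tokenize
# plus the project's Serialization helper and the MIN_SENTENCE_LENGTH constant. A minimal
# call pattern, reusing the 'common.users' object saved by extract_users_common_set below,
# might look like this; the csv path is a placeholder.
def _example_load_data_usage():
    common_users = Serialization.load_obj('common.users')
    texts = load_data('<a csv file with cs posts>', common_users)
    print('authors with qualifying posts:', len(texts))
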
def get_wikipedia_word_ranked_list():
    """ create and save two dictionaries: word to rank, and word to count """
    wordcount = {}
    filename = '<english wikipedia dump location>'
    with open(filename, 'r') as fin:
        for line in fin:
            for token in line.split():
                count = wordcount.get(token, 0)
                wordcount[token] = count + 1
            # end for
        # end for
        sorted_wordcount = sorted(wordcount, key=wordcount.get, reverse=True)
        ranks = {}
        for count, key in enumerate(sorted_wordcount):
            if count > 500000:
                continue
            ranks[key] = count
        # end for
    # end with
    Serialization.save_obj(wordcount, 'dict.counts.cs')
    Serialization.save_obj(ranks, 'dict.ranks.cs')

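# Hedged sketch (illustration only): Utils.remove_noncontent_words, used throughout this
# file, is not shown here. A plausible stand-in -- assuming it drops stopwords and keeps
# only words whose Wikipedia rank falls between MIN_WORD_RANK and MAX_WORD_RANK -- could
# look like the following; the exact filtering rule is an assumption.
def _sketch_remove_noncontent_words(data_words, stop_words, ranks):
    filtered = []
    for post in data_words:
        kept = [word for word in post
                if word not in stop_words
                and MIN_WORD_RANK <= ranks.get(word, MAX_WORD_RANK + 1) <= MAX_WORD_RANK]
        filtered.append(kept)
    return filtered
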
def topic_modelling(data_object_name):
    """
    perform topic modeling for a given set of posts (data object)
    :param data_object_name: raw data for topic modeling
    """
    data_words = Serialization.load_obj(data_object_name)
    stop_words = stopwords.words('english')
    print('removing stopwords and infrequent words...')
    ranks = Serialization.load_obj('dict.ranks')
    data_words = Utils.remove_noncontent_words(data_words, stop_words, ranks)
    id2word = corpora.Dictionary(data_words)
    corpus = [id2word.doc2bow(post) for post in data_words]
    topics = CS_TOPICS
    print('performing topic modeling with', topics, 'topics')
    ldamodel = LdaMallet(mallet_path, corpus=corpus, num_topics=topics,
                         id2word=id2word)
    pprint(malletmodel2ldamodel(ldamodel).top_topics(corpus, data_words, id2word))

def get_embeddings(data, title):
    try:
        embeddings = Serialization.load_obj(title)
    except FileNotFoundError:
        embeddings = model.encode(data, show_progress_bar=True)
        Serialization.save_obj(embeddings, title)
    return embeddings

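# Hedged sketch (illustration only): get_embeddings above relies on a module-level encoder
# named `model` exposing encode(texts, show_progress_bar=...). The sentence-transformers
# package provides that interface; the checkpoint name below is an assumption, not taken
# from the source.
def _sketch_embedding_model_setup():
    from sentence_transformers import SentenceTransformer
    return SentenceTransformer('all-MiniLM-L6-v2')  # assumed checkpoint name
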
def filter_out_non_english_posts(dataobject):
    """
    given a list of posts, keep only clean monolingual english posts
    :param dataobject: user to posts object
    :return: user to posts clean dictionary
    """
    clean_data = {}
    data = Serialization.load_obj(dataobject)
    for author in data:
        print('processing:', author)
        author_eng_posts = []
        for post in data[author]:
            sentences = []
            for sentence in re.split(r'\.|\! |\? |\n', post):
                if len(sentence.split()) < 10:
                    continue
                try:
                    detector = Detector(sentence)
                except:
                    continue
                if detector.languages[0].name == 'English' and \
                        detector.languages[0].confidence > DETECTOR_CONFIDENCE:
                    sentences.append(sentence)
                # end if
            # end for
            if len(sentences) == 0:
                continue
            author_eng_posts.append('. '.join(sentences))
        # end for
        if len(author_eng_posts) == 0:
            continue
        clean_data[author] = author_eng_posts
    # end for
    Serialization.save_obj(clean_data, dataobject + '.clean')
    for author in clean_data:
        print(author, len(clean_data[author]))

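# Hedged mini-example (illustration only) of the polyglot API used above: Detector exposes
# a ranked list of language guesses, each with a name and a 0-100 confidence score, which
# filter_out_non_english_posts thresholds at DETECTOR_CONFIDENCE.
def _example_language_detection():
    from polyglot.detect import Detector
    detector = Detector('This sentence should be recognised as English with high confidence.')
    best_guess = detector.languages[0]
    print(best_guess.name, best_guess.confidence)
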
def extract_proficiency_metrics(objname):
    """
    extract lexical and grammatical proficiency metrics given user to posts data
    :param objname: pickle object with user to posts data
    :return:
    """
    metrics = {}
    data = Serialization.load_obj(objname)
    for author in data:
        if len(data[author]) < MIN_POSTS_FOR_TEST:
            continue
        metrics[author] = Proficiency.compute_lexical_metrics(data[author])
        metrics[author].extend(Proficiency.compute_grammatical_metrics(data[author]))
        print(author, metrics[author])
        sys.stdout.flush()
    # end for
    Serialization.save_obj(metrics, objname.replace('data', 'metrics.lex.gramm.clean'))
    print(len(metrics))

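# Hedged sketch (illustration only): Proficiency.compute_lexical_metrics is not shown in
# this file. As a hypothetical stand-in, two of the simpler quantities later reported by
# estimate_average_and_significance -- mean word length and mean sentence length -- could
# be computed roughly as follows; the real metric set also covers AoA, concreteness,
# lexical density and syntactic measures.
def _sketch_simple_lexical_metrics(posts):
    import re  # local import to keep the sketch self-contained
    tokens = [token for post in posts for token in post.split()]
    mean_word_length = sum(len(token) for token in tokens) / max(len(tokens), 1)
    sentences = [s for post in posts
                 for s in re.split(r'\.|\! |\? |\n', post) if s.split()]
    mean_sentence_length = sum(len(s.split()) for s in sentences) / max(len(sentences), 1)
    return [mean_word_length, mean_sentence_length]
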
def load_data(file_cs, file_monolingual):
    """
    loads posts by code-switchers and non-code-switchers
    :param file_cs: a csv file with posts by frequent code-switching users
    :param file_monolingual: a csv file with posts by users who don't (or very rarely) code-switch
    :return:
    """
    data_cs, subreddits_cs = DataProcessing.read_data(file_cs)
    data_monolingual, subreddits_monolingual = DataProcessing.read_data(file_monolingual)
    subreddits = subreddits_cs
    subreddits.extend(subreddits_monolingual)
    subreddits = set(subreddits)
    Serialization.save_obj(data_cs, DATA_CS)
    Serialization.save_obj(data_monolingual, DATA_MONOLINGUAL)
    print('code-switchers:', len(data_cs),
          'non-code-switchers:', len(data_monolingual))
    print('total subreddits:', len(subreddits))

def init_vad():
    df_vad = pd.read_csv('/ais/hal9000/jai/lexicon.txt', delimiter='\t', header=0)
    df_vad = df_vad.dropna().reset_index(drop=True)
    df = df_vad[['Word', 'Valence']]
    valence = np.array(df['Valence'].tolist())
    vad_words = list(df_vad['Word'])
    vad_embeddings = LexicalAnalysis.get_embeddings(vad_words, "vad")
    print("LOADING VALENCE MODEL")
    try:
        valence_model = Serialization.load_obj('valence_model')
    except FileNotFoundError:
        valence_model = LexicalAnalysis.fit_beta_reg(valence, vad_embeddings, df, 'v_group')
        Serialization.save_obj(valence_model, 'valence_model')
    LexicalAnalysis.goodness_of_fit(valence_model, valence, vad_embeddings)
    return valence_model

def extract_users_common_set():
    """ extract the set of users with both code-switched and english monolingual posts """
    users_cs = []
    filename = '<a csv file with code-switched posts>'
    with open(filename, 'r') as fin:
        csv_reader = csv.reader(fin, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        header = csv_reader.__next__()
        for line in csv_reader:
            if len(line) < 8:
                continue
            users_cs.append(line[0].strip())
        # end for
    # end with
    users_non_cs = []
    filename = '<a csv file with monolingual english posts>'
    with open(filename, 'r') as fin:
        csv_reader = csv.reader(fin, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        header = csv_reader.__next__()
        for line in csv_reader:
            if len(line) < 8:
                continue
            users_non_cs.append(line[0].strip())
        # end for
    # end with
    common_users = set(users_cs).intersection(set(users_non_cs))
    print('total cs users, monolingual users, common users:',
          len(set(users_cs)), len(set(users_non_cs)), len(common_users))
    Serialization.save_obj(common_users, 'common.users')

def lemmatization_and_pos_filter(filename, common_users):
    """
    preprocessing data towards topic modeling
    :param filename: a csv file with code-switched or monolingual data
    :param common_users: a list of users with both types of posts
    """
    stop_words = stopwords.words('english')
    with open(filename, 'r') as fin:
        csv_reader = csv.reader(fin, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        header = csv_reader.__next__()
        data = []
        for line in csv_reader:
            if len(line) < 8:
                continue
            if len(line[7].split()) < MIN_SENTENCE_LENGTH:
                continue
            if line[0].strip() not in common_users:
                continue
            data.append(line[7])
        # end for
    # end with
    print('total of', len(data), 'posts')
    tokens = sum([len(post.split()) for post in data])
    print('average post length', float(tokens) / len(data))
    print('converting posts to words...')
    data_words = list(Utils.post_to_words(data))
    print('performing lemmatization and pos filtering...')
    data_words = Utils.lemmatization(data_words)
    print('removing stopwords and infrequent words...')
    ranks = Serialization.load_obj('dict.ranks')
    data_words = Utils.remove_noncontent_words(data_words, stop_words, ranks)
    Serialization.save_obj(data_words, current_mode + '.preprocessed')

def substitute_named_entities(filename, common_users):
    """
    true-case text and substitute named entities with their type (e.g., organization, person)
    true-casing precedes ner since ner is case-sensitive
    :param filename: file for processing
    :param common_users: the set of users common to code-switched and monolingual text
    """
    object_name = '<frequencies dictionary object>'
    frequencies = Serialization.load_obj(object_name)
    nlp = spacy.load('en_core_web_lg', disable=['tokenizer', 'parser', 'tagger'])
    with open(filename, 'r') as fin, \
            open(filename.replace('.csv', '_tc_ne.csv'), 'w') as fout:
        csv_reader = csv.reader(fin, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        csv_writer = csv.writer(fout, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(csv_reader.__next__())
        for line in csv_reader:
            if len(line) < 8:
                continue
            if line[0].strip() not in common_users:
                continue
            if len(line[7].split()) < 30:
                continue
            text_tc = Utils.true_case(line[7], frequencies)
            prev_end = 0
            line_with_entities = []
            for ent in nlp(text_tc).ents:
                line_with_entities.append(text_tc[prev_end:ent.start_char])
                line_with_entities.append(ent.label_)
                prev_end = ent.end_char
            # end for
            line_with_entities.append(text_tc[prev_end:])
            line[7] = (' '.join(line_with_entities)).strip()
            csv_writer.writerow(line)

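# Hedged mini-example (illustration only) of the entity substitution performed above:
# every recognised entity span is replaced by its spaCy label, so a post such as
# "John moved to Toronto in 2019" comes out roughly as "PERSON moved to GPE in DATE"
# (exact labels depend on the en_core_web_lg model version).
def _example_entity_substitution():
    nlp = spacy.load('en_core_web_lg')
    text = 'John moved to Toronto in 2019'
    pieces, prev_end = [], 0
    for ent in nlp(text).ents:
        pieces.append(text[prev_end:ent.start_char])
        pieces.append(ent.label_)
        prev_end = ent.end_char
    pieces.append(text[prev_end:])
    print(' '.join(pieces).strip())
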
def estimate_average_and_significance(metrics_obj):
    """
    extracts mean and standard error of users' proficiency metrics
    :param metrics_obj: a pickle object name with extracted proficiency metrics per user
    :return: an N*M matrix where N is the # of metrics and M is the # of users
    """
    values = []
    metrics = Serialization.load_obj(metrics_obj)
    for author in metrics:
        values.append(metrics[author])
    values = np.matrix(values)
    print('ntty, lexical density, mean AoA,', 'mean concreteness,',
          'mean word length,', 'mean clauses,', 'mean tree depth,',
          'mean sent length')
    flats = []
    for i in range(1, values.shape[1]):
        flat = []
        for val in values[:, i]:
            flat.append(float(val[0]))
        print('{0:.3f}'.format(np.mean(values[:, i])), '\t',
              '{0:.3f}'.format(sem(flat)))
        flats.append(flat)
    # end for
    return flats

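# Hedged sketch (illustration only): one way the per-metric lists returned above could be
# compared between the two user groups, reusing the METRICS_CS / METRICS_MONOLINGUAL
# object names defined later in this file and the same rank-sum test used in the topic
# analysis; the choice of test here is an assumption.
def _sketch_compare_metrics_across_groups():
    from scipy.stats import ranksums
    flats_cs = estimate_average_and_significance(METRICS_CS)
    flats_mono = estimate_average_and_significance(METRICS_MONOLINGUAL)
    for metric_cs, metric_mono in zip(flats_cs, flats_mono):
        _, pval = ranksums(metric_cs, metric_mono)
        print('{0:.4f}'.format(pval))
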
def extract_markers(cs_texts, non_cs_texts, markers):
    """
    extracts two lists of per-user frequencies: (in)formality markers in their cs and monolingual texts
    :param cs_texts: user to cs posts dictionary
    :param non_cs_texts: user to monolingual posts dictionary
    :param markers: list of (in)formality markers to consider
    :return: two lists of per-author frequencies
    """
    cs_markers_frequency = []
    non_cs_markers_frequency = []
    ranks = Serialization.load_obj('dict.ranks')
    for author in cs_texts:
        if len(cs_texts[author].split()) > MIN_POSTS_PER_USER and \
                len(non_cs_texts.get(author, '').split()) > MIN_POSTS_PER_USER:
            cs_markers_frequency.append(
                Formality.count_markers(cs_texts[author], markers, ranks))
            non_cs_markers_frequency.append(
                Formality.count_markers(non_cs_texts[author], markers, ranks))
        # end if
    # end for
    print('extracted informality markers', len(cs_markers_frequency))
    return cs_markers_frequency, non_cs_markers_frequency

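# Hedged sketch (illustration only): Formality.count_markers is not shown in this file. A
# hypothetical stand-in -- assuming it returns the frequency of (in)formality markers
# relative to the author's in-vocabulary token count -- could look like this; the
# normalisation choice is an assumption.
def _sketch_count_markers(text, markers, ranks):
    tokens = text.split()
    in_vocab = [token for token in tokens if token in ranks]
    marker_hits = sum(1 for token in tokens if token in markers)
    return float(marker_hits) / max(len(in_vocab), 1)
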
def test_formality_difference():
    """
    extracts two lists of per-user (in)formality marker frequencies and performs
    the Wilcoxon pair-wise significance test for difference
    """
    markers = Formality.load_formality_markers()
    cs_object_name = '<pickle object with map: author to cs texts>'
    non_cs_object_name = '<pickle object with map: author to monolingual english texts>'
    cs_texts = Serialization.load_obj(cs_object_name)
    non_cs_texts = Serialization.load_obj(non_cs_object_name)
    print('loaded', len(cs_texts), 'and', len(non_cs_texts),
          'cs and monolingual english texts by authors')
    for author in cs_texts:
        cs_texts[author] = ' '.join(cs_texts[author])
    for author in non_cs_texts:
        non_cs_texts[author] = ' '.join(non_cs_texts[author])
    cs_markers_by_authors, non_cs_markers_by_authors = \
        Formality.extract_markers(cs_texts, non_cs_texts, markers)
    # print(cs_markers_by_authors, non_cs_markers_by_authors)
    print('mean markers frequency in cs:', np.mean(cs_markers_by_authors),
          'in non-cs:', np.mean(non_cs_markers_by_authors))
    Serialization.save_obj(cs_markers_by_authors, 'formality.markers.cs')
    Serialization.save_obj(non_cs_markers_by_authors, 'formality.markers.non-cs')
    stat, pval = wilcoxon(cs_markers_by_authors, non_cs_markers_by_authors)
    print('wilcoxon signed-rank (paired) test pval:', pval, stat)
    mean1 = np.mean(cs_markers_by_authors)
    mean2 = np.mean(non_cs_markers_by_authors)
    std1 = np.std(cs_markers_by_authors)
    std2 = np.std(non_cs_markers_by_authors)
    r1, _ = spearmanr(cs_markers_by_authors, non_cs_markers_by_authors)
    r2, _ = pearsonr(cs_markers_by_authors, non_cs_markers_by_authors)
    print(mean1, mean2, std1, std2, r1, r2)

loader = DataLoader(data_dir, datasets_params, encoding='utf8')
dataset = loader.load(dataset_name, encoding='utf8', batch_size=params.batch_size,
                      to_tensor=True, to_cuda=params.cuda)
logger.info("- done.")

# add datasets parameters into params
params.update(datasets_params)

# create model, optimizer and so on.
model, optimizer, criterion, metrics = model_factory(params)

# restore model, optimizer
status = Serialization(checkpoint_dir=model_dir).restore(model=model, checkpoint=checkpoint)
if not status:
    logger.error("Restoring model from checkpoint {} failed".format(checkpoint))

logger.info("Starting to evaluate the model on the test dataset...")
metrics_result = evaluate(model, dataset, criterion, metrics)
logger.info("- done.")

logger.info("Saving metrics results...")
metrics_file = os.path.join(model_dir, metrics_filename.format(checkpoint))
dump_to_json(metrics_result, metrics_file)
logger.info("- done.")

def test_true_casing():
    frequencies = Serialization.load_obj('dict.counts.cs')
    text = 'what do you think about john? i believe he is from toronto!'
    tc = Utils.true_case(text, frequencies)
    print(tc)

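# Hedged sketch (illustration only): Utils.true_case is not shown in this file. A
# hypothetical frequency-based stand-in would, for each token, prefer the capitalised
# variant whenever it is more frequent than the lower-cased form in the corpus counts
# (e.g. 'john' -> 'John', 'toronto' -> 'Toronto').
def _sketch_true_case(text, frequencies):
    tokens = []
    for token in text.split():
        capitalized = token.capitalize()
        if frequencies.get(capitalized, 0) > frequencies.get(token, 0):
            tokens.append(capitalized)
        else:
            tokens.append(token)
    return ' '.join(tokens)
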
def topical_differences_sig_analysis():
    """
    testing code-switching and monolingual english posts for topical differences
    (1) partition code-switched posts into two random sets
    (2) perform topic modeling of each partition and compute the similarity between the
        two parts and their individual similarity to topics extracted from monolingual posts
    (3) test the multiple-experiment similarity scores for significance
    """
    data_object_name = 'monolingual.preprocessed'
    data_words = Serialization.load_obj(data_object_name)
    stop_words = stopwords.words('english')
    print('removing stopwords and infrequent words...')
    ranks = Serialization.load_obj('dict.ranks')
    data_words = Utils.remove_noncontent_words(data_words, stop_words, ranks)
    print('after pre-processing: total of', len(data_words), 'posts')
    topics = MONOLINGUAL_TOPICS
    for i in range(EXPERIMENTS):
        shuffle(data_words)
        part1 = data_words[:math.floor(len(data_words) / 2)]
        part2 = data_words[math.floor(len(data_words) / 2):]
        model = Utils.model_topic(part1, topics)
        Serialization.save_obj(model, 'lda.mallet.monolingual.part1.' + str(i))
        print('saved topic model: part1,', i)
        model = Utils.model_topic(part2, topics)
        Serialization.save_obj(model, 'lda.mallet.monolingual.part2.' + str(i))
        print('saved topic model: part2,', i)
        sys.stdout.flush()
    # end for
    inter = []
    intra = []
    ldamodel_cs = malletmodel2ldamodel(Serialization.load_obj('lda.mallet.cs'))
    for i in range(EXPERIMENTS):
        print('processing', i)
        ldamodel_mono1 = malletmodel2ldamodel(
            Serialization.load_obj('lda.mallet.monolingual.part1.' + str(i)))
        ldamodel_mono2 = malletmodel2ldamodel(
            Serialization.load_obj('lda.mallet.monolingual.part2.' + str(i)))
        diff_matrix1, _ = ldamodel_cs.diff(ldamodel_mono1, distance='jaccard')
        diff_matrix2, _ = ldamodel_cs.diff(ldamodel_mono2, distance='jaccard')
        # intra.append(np.mean([np.mean(np.matrix(diff_matrix1)), np.mean(np.matrix(diff_matrix2))]))
        intra.append(np.mean([np.min(np.matrix(diff_matrix1)),
                              np.min(np.matrix(diff_matrix2))]))
        diff_matrix3, _ = ldamodel_mono1.diff(ldamodel_mono2, distance='jaccard')
        # inter.append(np.mean(np.matrix(diff_matrix3)))
        inter.append(np.min(np.matrix(diff_matrix3)))
    # end for
    print(np.mean(intra), np.mean(inter))
    _, pval = ranksums(intra, inter)
    print('pval:', pval)

# end class


MAX_WORD_RANK = 10000
MIN_POSTS_FOR_TEST = 50
DATA_CS = 'data.cs.by.author'
DATA_MONOLINGUAL = 'data.monolingual.by.author'
DATA_CS_CLEAN = 'data.cs.by.author.clean'
DATA_MONOLINGUAL_CLEAN = 'data.monolingual.by.author.clean'
METRICS_CS = 'metrics.lex.gramm.clean.cs.by.author'
METRICS_MONOLINGUAL = 'metrics.lex.gramm.clean.mono.by.author'
DETECTOR_CONFIDENCE = 90

non_natives = DataProcessing.read_non_native_authors()
ranks = Serialization.load_obj('dict.ranks')
filename = '<a file with english words concreteness ratings>'
concreteness = DataProcessing.load_concreteness_scores(filename)
filename = '<a file with english words AoA ratings>'
aoa = DataProcessing.load_aoa_scores(filename)

if __name__ == '__main__':
    """
    assumes polyglot language detector and benepar parser installed
    https://polyglot.readthedocs.io/en/latest/Detection.html
    https://pypi.org/project/benepar/
    """
    file_cs = '<a csv file with cs posts>'
    file_monolingual = '<a csv file with monolingual english posts>'
    Proficiency.load_data(file_cs, file_monolingual)

checkpoint = args.checkpoint
input_file = args.input_file
output_file = args.output_file
encoding = args.encoding

msg = "Data directory does not exist: {}"
assert os.path.isdir(data_dir), msg.format(data_dir)
msg = "Model directory does not exist: {}"
assert os.path.isdir(model_dir), msg.format(model_dir)
msg = "Input file does not exist: {}"
assert os.path.isfile(input_file), msg.format(input_file)

datasets_params = Params(datasets_params_file)
word_vocab = Vocabulary(os.path.join(data_dir, words_txt))
tag_vocab = Vocabulary(os.path.join(data_dir, tags_txt))
unk_word = datasets_params.unk_word

params = Params(os.path.join(model_dir, params_filename))
params.update(datasets_params)
params.set('cuda', torch.cuda.is_available())

# restore model from the checkpoint
model, *others = model_factory(params)
Serialization(model_dir).restore(model, checkpoint=checkpoint)

# predict
predict(model, word_vocab, tag_vocab, unk_word, input_file, output_file,
        encoding, params.cuda)
print("It's done! Please check the output file:")
print(output_file)

EXPERIMENTS = 30
CS_TOPICS = 17
MONOLINGUAL_TOPICS = 21
MIN_WORD_RANK = 300
MAX_WORD_RANK = 10000
MIN_SENTENCE_LENGTH = 50
NAMED_ENTITIES = [
    'PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
    'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY',
    'QUANTITY', 'ORDINAL', 'CARDINAL'
]

mallet_path = '<path-to-mallet-topic-modeling-dir>mallet-2.0.8/bin/mallet'
current_mode = 'monolingual'  # cs

if __name__ == '__main__':
    DataProcessing.test_true_casing()
    DataProcessing.clean_and_prepare_data()
    common_users = Serialization.load_obj('common.users')
    filename = 'data/' + current_mode + '_corpus_clean.csv'
    Utils.substitute_named_entities(filename, common_users)
    filename = 'data/' + current_mode + '_corpus_clean_tc_ne.csv'
    Utils.lemmatization_and_pos_filter(filename, common_users)
    Utils.topical_differences_sig_analysis()
# end if