Example #1
    def load_data(filename, common_users):
        """
        generates a dictionary mapping each user to all their (cs or monolingual) posts
        :param filename: csv file with user posts
        :param common_users: a list of users who have both cs and monolingual texts
        :return: user to posts map
        """
        texts = {}
        with open(filename, 'r') as fin:
            print('reading', filename)
            csv_reader = csv.reader(fin,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
            header = csv_reader.__next__()
            for line in csv_reader:
                if len(line) < 8: continue
                if len(line[7].split()) < MIN_SENTENCE_LENGTH: continue
                author = line[0].strip()
                if author not in common_users: continue

                text_by_author = texts.get(author, [])
                text_by_author.append(' '.join(
                    word_tokenize(line[7].strip().lower())))
                texts[author] = text_by_author
            # end for
        # end with

        object_name = '<cs or monolingual texts by author>'
        Serialization.save_obj(texts, object_name)
        return texts
    def get_wikipedia_word_ranked_list():
        """
        create and save two dictionaries: word to rank, and word to count
        """
        wordcount = {}
        filename = '<english wikipedia dump location>'
        with open(filename, 'r') as fin:
            for line in fin:
                for token in line.split():
                    count = wordcount.get(token, 0)
                    wordcount[token] = count + 1
                # end for
            # end for
            sorted_wordcount = sorted(wordcount,
                                      key=wordcount.get,
                                      reverse=True)

            ranks = {}
            for count, key in enumerate(sorted_wordcount):
                if count > 500000: continue
                ranks[key] = count
            # end for
        # end with
        Serialization.save_obj(wordcount, 'dict.counts.cs')
        Serialization.save_obj(ranks, 'dict.ranks.cs')
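A minimal sketch of how such a rank dictionary can later filter out rare words; the ranks, tokens, and cutoff below are illustrative values only, not the ones computed above.

    # illustrative rank dictionary and cutoff; real values come from the Wikipedia counts above
    ranks = {'the': 0, 'language': 1200, 'codeswitching': 480000}
    MAX_WORD_RANK = 10000
    tokens = ['the', 'language', 'codeswitching', 'zzzunseen']
    # keep only tokens whose rank is known and frequent enough
    kept = [t for t in tokens if ranks.get(t, MAX_WORD_RANK + 1) <= MAX_WORD_RANK]
    print(kept)  # ['the', 'language']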
    def topic_modelling(data_object_name):
        """
        perform topic modeling for a given set of posts (data object)
        :param data_object_name: raw data for topic modeling
        """
        data_words = Serialization.load_obj(data_object_name)

        stop_words = stopwords.words('english')
        print('removing stopwords and infrequent words...')
        ranks = Serialization.load_obj('dict.ranks')
        data_words = Utils.remove_noncontent_words(data_words, stop_words,
                                                   ranks)

        id2word = corpora.Dictionary(data_words)
        corpus = [id2word.doc2bow(post) for post in data_words]

        topics = CS_TOPICS
        print('performing topic modeling with', topics, 'topics')
        ldamodel = LdaMallet(mallet_path,
                             corpus=corpus,
                             num_topics=topics,
                             id2word=id2word)
        pprint(
            malletmodel2ldamodel(ldamodel).top_topics(corpus, data_words,
                                                      id2word))
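LdaMallet requires a local Mallet installation and an older gensim; below is a minimal sketch of the same pipeline with gensim's built-in LdaModel on toy documents (the documents and topic count are illustrative only).

    # minimal sketch with gensim's built-in LdaModel instead of the Mallet wrapper
    from gensim import corpora
    from gensim.models import LdaModel

    docs = [['language', 'switch', 'reddit', 'post'],
            ['topic', 'model', 'language', 'corpus'],
            ['reddit', 'post', 'topic', 'switch']]
    id2word = corpora.Dictionary(docs)
    corpus = [id2word.doc2bow(doc) for doc in docs]
    lda = LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10, random_state=0)
    for topic_id, words in lda.show_topics(num_topics=2, num_words=4, formatted=False):
        print(topic_id, [w for w, _ in words])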
Example #4
 def get_embeddings(data, title):
     try:
         embeddings = Serialization.load_obj(title)
     except FileNotFoundError:
         embeddings = model.encode(data, show_progress_bar=True)
         Serialization.save_obj(embeddings, title)
     return embeddings
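The caching helper above assumes a sentence-transformers model bound to the name model; a minimal usage sketch follows (the model name 'all-MiniLM-L6-v2' is an arbitrary choice, not necessarily the one used here).

    # minimal sketch, assuming the sentence-transformers package
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer('all-MiniLM-L6-v2')  # arbitrary example model
    posts = ['first example post', 'second example post']
    embeddings = model.encode(posts, show_progress_bar=True)
    print(embeddings.shape)  # (2, embedding_dim)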
Example #5
    def filter_out_non_english_posts(dataobject):
        """
        given a user-to-posts mapping, keep only clean monolingual english posts
        :param dataobject: user to posts object
        :return: user to posts clean dictionary
        """
        clean_data = {}
        data = Serialization.load_obj(dataobject)
        for author in data:
            print('processing:', author)
            author_eng_posts = []
            for post in data[author]:
                sentences = []
                for sentence in re.split(r'\.|\! |\? |\n', post):
                    if len(sentence.split()) < 10: continue
                    try: detector = Detector(sentence)
                    except: continue

                    if detector.languages[0].name == 'English' and \
                            detector.languages[0].confidence > DETECTOR_CONFIDENCE:
                        sentences.append(sentence)
                    # end if
                # end for
                if len(sentences) == 0: continue
                author_eng_posts.append('. '.join(sentences))
            # end for
            if len(author_eng_posts) == 0: continue
            clean_data[author] = author_eng_posts
        # end for

        Serialization.save_obj(clean_data, dataobject+'.clean')
        for author in clean_data:
            print(author, len(clean_data[author]))
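The per-sentence check relies on polyglot's Detector; here is a small standalone sketch of that check, assuming polyglot is installed (the threshold of 90 mirrors the DETECTOR_CONFIDENCE constant defined later).

    # standalone sketch of the per-sentence language check
    from polyglot.detect import Detector

    sentence = 'This is clearly an English sentence about language detection on Reddit.'
    try:
        best = Detector(sentence).languages[0]
        is_clean_english = best.name == 'English' and best.confidence > 90
    except Exception:
        is_clean_english = False
    print(is_clean_english)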
Example #6
    def extract_proficiency_metrics(objname):
        """
        extract lexical and grammatical proficiency metrics given user to posts data
        :param objname: pickle object with user to posts data
        :return:
        """
        metrics = {}
        data = Serialization.load_obj(objname)

        for author in data:
            if len(data[author]) < MIN_POSTS_FOR_TEST: continue
            metrics[author] = Proficiency.compute_lexical_metrics(data[author])
            metrics[author].extend(Proficiency.compute_grammatical_metrics(data[author]))
            print(author, metrics[author]); sys.stdout.flush()
        # end for
        Serialization.save_obj(metrics, objname.replace('data', 'metrics.lex.gramm.clean'))
        print(len(metrics))
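Proficiency.compute_lexical_metrics is not shown in this snippet; below is a hypothetical sketch of two metrics of the kind it might cover (type-token ratio and mean word length), purely for illustration.

    # hypothetical lexical metrics; not the actual Proficiency implementation
    def type_token_ratio(posts):
        tokens = [t.lower() for post in posts for t in post.split()]
        return len(set(tokens)) / len(tokens) if tokens else 0.0

    def mean_word_length(posts):
        tokens = [t for post in posts for t in post.split()]
        return sum(len(t) for t in tokens) / len(tokens) if tokens else 0.0

    posts = ['I really enjoy switching between languages.', 'Reddit posts vary a lot in length.']
    print(type_token_ratio(posts), mean_word_length(posts))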
Example #7
    def load_data(file_cs, file_monolingual):
        """
        loads posts by code-switchers and non-code-switchers
        :param file_cs: a csv file with posts by frequent code-switching users
        :param file_monolingual: a csv file with posts by users who don't (or only very rarely) code-switch
        :return:
        """
        data_cs, subreddits_cs = DataProcessing.read_data(file_cs)
        data_monolingual, subreddits_monolingual = DataProcessing.read_data(file_monolingual)

        subreddits = subreddits_cs
        subreddits.extend(subreddits_monolingual)
        subreddits = set(subreddits)

        Serialization.save_obj(data_cs, DATA_CS)
        Serialization.save_obj(data_monolingual, DATA_MONOLINGUAL)
        print('code-switchers:', len(data_cs), 'non-code-switchers:', len(data_monolingual))
        print('total subreddits:', len(subreddits))
Example #8
    def init_vad():
        df_vad = pd.read_csv('/ais/hal9000/jai/lexicon.txt',
                             delimiter='\t',
                             header=0)
        df_vad = df_vad.dropna().reset_index(drop=True)
        df = df_vad[['Word', 'Valence']]
        valence = np.array(df['Valence'].tolist())

        vad_words = list(df_vad['Word'])

        vad_embeddings = LexicalAnalysis.get_embeddings(vad_words, "vad")

        print("LOADING VALENCE MODEL")
        try:
            valence_model = Serialization.load_obj('valence_model')
        except FileNotFoundError:
            valence_model = LexicalAnalysis.fit_beta_reg(
                valence, vad_embeddings, df, 'v_group')
            Serialization.save_obj(valence_model, 'valence_model')

        LexicalAnalysis.goodness_of_fit(valence_model, valence, vad_embeddings)

        return valence_model
    def extract_users_common_set():
        """
        extract the set of users with both code-switched and english monolingual posts
        """
        users_cs = []
        filename = '<a csv file with code-switched posts>'
        with open(filename, 'r') as fin:
            csv_reader = csv.reader(fin,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
            header = csv_reader.__next__()
            for line in csv_reader:
                if len(line) < 8: continue
                users_cs.append(line[0].strip())
            # end for
        # end with

        users_non_cs = []
        filename = '<a csv file with monolingual english posts>'
        with open(filename, 'r') as fin:
            csv_reader = csv.reader(fin,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
            header = csv_reader.__next__()
            for line in csv_reader:
                if len(line) < 8: continue
                users_non_cs.append(line[0].strip())
            # end for
        # end with

        common_users = set(users_cs).intersection(set(users_non_cs))
        print('total cs users, monolingual users, common users:',
              len(set(users_cs)), len(set(users_non_cs)), len(common_users))

        Serialization.save_obj(common_users, 'common.users')
    def lemmatization_and_pos_filter(filename, common_users):
        """
        preprocessing data towards topic modeling
        :param filename: a csv file with code-switched or monolingual data
        :param common_users: a list of users with both types of posts
        """
        stop_words = stopwords.words('english')
        with open(filename, 'r') as fin:
            csv_reader = csv.reader(fin,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
            header = csv_reader.__next__()

            data = []
            for line in csv_reader:
                if len(line) < 8: continue
                if len(line[7].split()) < MIN_SENTENCE_LENGTH: continue
                if line[0].strip() not in common_users: continue
                data.append(line[7])
            # end for
        # end with

        print('total of', len(data), 'posts')
        tokens = sum([len(post.split()) for post in data])
        print('average post length', float(tokens) / len(data))

        print('converting posts to words...')
        data_words = list(Utils.post_to_words(data))
        print('skipping (performing) lemmatization and pos filtering...')
        data_words = Utils.lemmatization(data_words)
        print('removing stopwords and infrequent words...')
        ranks = Serialization.load_obj('dict.ranks')
        data_words = Utils.remove_noncontent_words(data_words, stop_words,
                                                   ranks)

        Serialization.save_obj(data_words, current_mode + '.preprocessed')
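Utils.post_to_words and Utils.lemmatization are not shown; here is a hypothetical sketch of POS-filtered lemmatization with spaCy in the same spirit (the allowed POS set is an assumption).

    # hypothetical sketch of lemmatization with POS filtering; not the actual Utils code
    import spacy

    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    ALLOWED_POS = {'NOUN', 'ADJ', 'VERB', 'ADV'}  # assumed content-word tags

    def lemmatize_posts(posts):
        lemmatized = []
        for doc in nlp.pipe(posts):
            lemmatized.append([tok.lemma_ for tok in doc if tok.pos_ in ALLOWED_POS])
        return lemmatized

    print(lemmatize_posts(['The users were switching languages very frequently.']))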
    def substitute_named_entities(filename, common_users):
        """
        true-case text and substitute named entities with their type (e.g., organization, person)
        true-casing precedes NER since NER is case-sensitive
        :param filename: file for processing
        :param common_users: the set of user common to code-switched and monolingual text
        """
        object_name = '<frequencies dictionary object>'
        frequencies = Serialization.load_obj(object_name)
        nlp = spacy.load('en_core_web_lg',
                         disable=['tokenizer', 'parser', 'tagger'])
        with open(filename,
                  'r') as fin, open(filename.replace('.csv', '_tc_ne.csv'),
                                    'w') as fout:
            csv_reader = csv.reader(fin,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
            csv_writer = csv.writer(fout,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow(csv_reader.__next__())
            for line in csv_reader:
                if len(line) < 8: continue
                if line[0].strip() not in common_users: continue
                if len(line[7].split()) < 30: continue

                text_tc = Utils.true_case(line[7], frequencies)

                prev_end = 0
                line_with_entities = []
                for ent in nlp(text_tc).ents:
                    line_with_entities.append(''.join(
                        text_tc[prev_end:ent.start_char]))
                    line_with_entities.append(ent.label_)
                    prev_end = ent.end_char
                # end for
                line_with_entities.append(''.join(text_tc[prev_end:]))
                line[7] = (' '.join(line_with_entities)).strip()
                csv_writer.writerow(line)
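A self-contained sketch of the entity-substitution step on a single toy sentence, using the smaller en_core_web_sm model for illustration.

    # standalone sketch of replacing entity spans with their labels
    import spacy

    nlp = spacy.load('en_core_web_sm')
    text = 'John moved from Toronto to Google in March.'
    doc = nlp(text)

    pieces, prev_end = [], 0
    for ent in doc.ents:
        pieces.append(text[prev_end:ent.start_char])
        pieces.append(ent.label_)
        prev_end = ent.end_char
    pieces.append(text[prev_end:])
    # entity mentions replaced by labels such as PERSON, GPE, ORG, DATE
    print(' '.join(pieces).strip())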
Example #12
    def estimate_average_and_significance(metrics_obj):
        """
        extracts mean and standard error of users' proficiency metrics
        :param metrics_obj: a pickle object name with extracted proficiency metrics per user
        :return: an N*M matrix where N is the # of metrics and M is the # of users
        """
        values = []
        metrics = Serialization.load_obj(metrics_obj)
        for author in metrics: values.append(metrics[author])
        values = np.matrix(values)

        print('ntty, lexical density, mean AoA,', 'mean concreteness,', 'mean word length,',
              'mean clauses,', 'mean tree depth,', 'mean sent length')

        flats = []
        for i in range(1, values.shape[1]):
            flat = []
            for val in values[:, i]: flat.append(float((val)[0]))
            print('{0:.3f}'.format(np.mean(values[:, i])), '\t', '{0:.3f}'.format(sem(flat)))
            flats.append(flat)
        # end for
        return flats
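A tiny illustration of the per-column mean and standard-error computation above, with made-up values.

    # illustrative values only
    import numpy as np
    from scipy.stats import sem

    column = [3.1, 2.8, 3.4, 3.0, 2.9, 3.2]
    print('{0:.3f}'.format(np.mean(column)), '\t', '{0:.3f}'.format(sem(column)))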
Example #13
 def extract_markers(cs_texts, non_cs_texts, markers):
     """
     extracts two lists of per-user frequencies: (in)formality markers in their cs and monolingual texts
     :param cs_texts: user to cs posts dictionary
     :param non_cs_texts: user to monolingual posts dictionary
     :param markers: list of (in)formality markers to consider
     :return: two lists of per-author frequencies
     """
     cs_markers_frequency = []
     non_cs_markers_frequency = []
     ranks = Serialization.load_obj('dict.ranks')
     for author in cs_texts:
         if len(cs_texts[author].split()) > MIN_POSTS_PER_USER and \
                 len(non_cs_texts.get(author, '').split()) > MIN_POSTS_PER_USER:
             cs_markers_frequency.append(
                 Formality.count_markers(cs_texts[author], markers, ranks))
             non_cs_markers_frequency.append(
                 Formality.count_markers(non_cs_texts[author], markers,
                                         ranks))
         # end if
     # end for
     print('extracted informality markers', len(cs_markers_frequency))
     return cs_markers_frequency, non_cs_markers_frequency
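Formality.count_markers is not shown; below is a hypothetical per-user frequency count in the same spirit (marker hits normalized by token count), purely illustrative.

    # hypothetical sketch; not the actual Formality.count_markers implementation
    def count_markers_sketch(text, markers):
        tokens = text.lower().split()
        hits = sum(1 for tok in tokens if tok in markers)
        return hits / len(tokens) if tokens else 0.0

    markers = {'lol', 'btw', 'gonna'}
    print(count_markers_sketch('btw I am gonna post this later lol', markers))  # 0.375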
Example #14
    def test_formality_difference():
        """
        extracts two lists of per-user (in)formality markers frequency and
        performs Wilcoxon pair-wise significance test for difference
        """
        markers = Formality.load_formality_markers()
        cs_object_name = '<pickle object with map: author to cs texts>'
        non_cs_object_name = '<pickle object with map: author to monolingual english texts>'
        cs_texts = Serialization.load_obj(cs_object_name)
        non_cs_texts = Serialization.load_obj(non_cs_object_name)
        print('loaded', len(cs_texts), 'and', len(non_cs_texts),
              'cs and monolingual english by authors')
        for author in cs_texts:
            cs_texts[author] = ' '.join(cs_texts[author])
        for author in non_cs_texts:
            non_cs_texts[author] = ' '.join(non_cs_texts[author])

        cs_markers_by_authors, non_cs_markers_by_authors = Formality.extract_markers(
            cs_texts, non_cs_texts, markers)
        #print(cs_markers_by_authors, non_cs_markers_by_authors)

        print('mean markers frequency in cs:', np.mean(cs_markers_by_authors),
              'in non-cs:', np.mean(non_cs_markers_by_authors))

        Serialization.save_obj(cs_markers_by_authors, 'formality.markers.cs')
        Serialization.save_obj(non_cs_markers_by_authors,
                               'formality.markers.non-cs')
        stat, pval = wilcoxon(cs_markers_by_authors, non_cs_markers_by_authors)
        print('wilcoxon paired sig test pval:', pval, stat)

        mean1 = np.mean(cs_markers_by_authors)
        mean2 = np.mean(non_cs_markers_by_authors)
        std1 = np.std(cs_markers_by_authors)
        std2 = np.std(non_cs_markers_by_authors)
        r1, _ = spearmanr(cs_markers_by_authors, non_cs_markers_by_authors)
        r2, _ = pearsonr(cs_markers_by_authors, non_cs_markers_by_authors)
        print(mean1, mean2, std1, std2, r1, r2)
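A minimal sketch of the paired Wilcoxon test on two made-up per-author frequency lists.

    # illustrative paired frequencies only
    from scipy.stats import wilcoxon

    cs_freqs = [0.12, 0.08, 0.15, 0.11, 0.09, 0.14, 0.10, 0.13]
    mono_freqs = [0.10, 0.07, 0.14, 0.12, 0.08, 0.11, 0.09, 0.10]
    stat, pval = wilcoxon(cs_freqs, mono_freqs)
    print('wilcoxon stat:', stat, 'pval:', pval)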
Example #15
    loader = DataLoader(data_dir, datasets_params, encoding='utf8')
    dataset = loader.load(dataset_name,
                          encoding='utf8',
                          batch_size=params.batch_size,
                          to_tensor=True,
                          to_cuda=params.cuda)
    logger.info("- done.")

    # add datasets parameters into params
    params.update(datasets_params)

    # create model, optimizer and so on.
    model, optimizer, criterion, metrics = model_factory(params)

    # restore model, optimizer
    status = Serialization(checkpoint_dir=model_dir).restore(
        model=model, checkpoint=checkpoint)
    
    if not status:
        logger.error("Failed to restore model from checkpoint: {}".format(
            checkpoint))

    logger.info("Starting evaluate model on test dataset...")
    metrics_result = evaluate(model, dataset, criterion, metrics)
    logger.info("- done.")

    logger.info("Save metrics results...")
    metrics_file = os.path.join(model_dir, 
                                metrics_filename.format(checkpoint))
    dump_to_json(metrics_result, metrics_file)
    logger.info("- done.")
 def test_true_casing():
     frequencies = Serialization.load_obj('dict.counts.cs')
     text = 'what do you think about john? i believe he is from toronto!'
     tc = Utils.true_case(text, frequencies)
     print(tc)
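Utils.true_case is not shown; here is a hypothetical frequency-based variant that keeps whichever casing is more common in a reference count dictionary, purely as an illustration.

    # hypothetical sketch; not the actual Utils.true_case implementation
    def true_case_sketch(text, frequencies):
        restored = []
        for tok in text.split():
            cap = tok.capitalize()
            restored.append(cap if frequencies.get(cap, 0) > frequencies.get(tok, 0) else tok)
        return ' '.join(restored)

    counts = {'I': 500, 'i': 40, 'John': 120, 'john': 3, 'Toronto': 80, 'toronto': 2}
    print(true_case_sketch('i believe john is from toronto', counts))
    # I believe John is from Toronto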
    def topical_differences_sig_analysis():
        """
        testing code-switching and monolingual english posts for topical differences
        (1) partition code-switched posts into two random sets
        (2) perform topic modeling of each partition and compute the similarity between the two parts and
        their individual similarity to topics extracted from monolingual posts
        (3) test the multiple-experiment similarity scores for significance
        """
        data_object_name = 'monolingual.preprocessed'

        data_words = Serialization.load_obj(data_object_name)

        stop_words = stopwords.words('english')
        print('removing stopwords and infrequent words...')
        ranks = Serialization.load_obj('dict.ranks')
        data_words = Utils.remove_noncontent_words(data_words, stop_words,
                                                   ranks)
        print('after pre-processing: total of', len(data_words), 'posts')

        topics = MONOLINGUAL_TOPICS
        for i in range(EXPERIMENTS):
            shuffle(data_words)
            part1 = data_words[:math.floor(len(data_words) / 2)]
            part2 = data_words[math.floor(len(data_words) / 2):]

            model = Utils.model_topic(part1, topics)
            Serialization.save_obj(model,
                                   'lda.mallet.monolingual.part1.' + str(i))
            print('saved topic model: part1,', i)

            model = Utils.model_topic(part2, topics)
            Serialization.save_obj(model,
                                   'lda.mallet.monolingual.part2.' + str(i))
            print('saved topic model: part2,', i)
            sys.stdout.flush()

        # end for

        inter = []
        intra = []
        ldamodel_cs = malletmodel2ldamodel(
            Serialization.load_obj('lda.mallet.cs'))
        for i in range(30):
            print('processing', i)
            ldamodel_mono1 = malletmodel2ldamodel(
                Serialization.load_obj('lda.mallet.monolingual.part1.' +
                                       str(i)))
            ldamodel_mono2 = malletmodel2ldamodel(
                Serialization.load_obj('lda.mallet.monolingual.part2.' +
                                       str(i)))
            diff_matrix1, _ = ldamodel_cs.diff(ldamodel_mono1,
                                               distance='jaccard')
            diff_matrix2, _ = ldamodel_cs.diff(ldamodel_mono2,
                                               distance='jaccard')
            #intra.append(np.mean([np.mean(np.matrix(diff_matrix1)), np.mean(np.matrix(diff_matrix2))]))
            intra.append(
                np.mean([
                    np.min(np.matrix(diff_matrix1)),
                    np.min(np.matrix(diff_matrix2))
                ]))
            diff_matrix3, _ = ldamodel_mono1.diff(ldamodel_mono2,
                                                  distance='jaccard')
            #inter.append(np.mean(np.matrix(diff_matrix3)))
            inter.append(np.min(np.matrix(diff_matrix3)))
        # end for

        print(np.mean(intra), np.mean(inter))
        _, pval = ranksums(intra, inter)
        print('pval:', pval)
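A minimal sketch of the final rank-sum comparison on two made-up lists of divergence scores.

    # illustrative divergence scores only
    import numpy as np
    from scipy.stats import ranksums

    intra = [0.31, 0.28, 0.35, 0.30, 0.29, 0.33]
    inter = [0.42, 0.40, 0.45, 0.39, 0.44, 0.41]
    stat, pval = ranksums(intra, inter)
    print(np.mean(intra), np.mean(inter), 'pval:', pval)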
Example #18
# end class


MAX_WORD_RANK = 10000
MIN_POSTS_FOR_TEST = 50
DATA_CS = 'data.cs.by.author'
DATA_MONOLINGUAL = 'data.monolingual.by.author'
DATA_CS_CLEAN = 'data.cs.by.author.clean'
DATA_MONOLINGUAL_CLEAN = 'data.monolingual.by.author.clean'
METRICS_CS = 'metrics.lex.gramm.clean.cs.by.author'
METRICS_MONOLINGUAL = 'metrics.lex.gramm.clean.mono.by.author'

DETECTOR_CONFIDENCE = 90

non_natives = DataProcessing.read_non_native_authors()
ranks = Serialization.load_obj('dict.ranks')
filename = '<a file with english words concreteness ratings>'
concreteness = DataProcessing.load_concreteness_scores(filename)
filename = '<a file with english words AoA ratings>'
aoa = DataProcessing.load_aoa_scores(filename)

if __name__ == '__main__':
    """
    assumes the polyglot language detector and the benepar parser are installed
    https://polyglot.readthedocs.io/en/latest/Detection.html
    https://pypi.org/project/benepar/
    """

    file_cs = '<a csv file with cs posts>'
    file_monolingual = '<a csv file with monolingual english posts>'
    Proficiency.load_data(file_cs, file_monolingual)
Example #19
    checkpoint = args.checkpoint
    input_file = args.input_file
    output_file = args.output_file
    encoding = args.encoding

    msg = "Data directory not exists: {}"
    assert os.path.isdir(data_dir), msg.format(data_dir)
    msg = "Model directory not exists: {}"
    assert os.path.isdir(model_dir), msg.format(model_dir)
    msg = "Input file not exists: {}"
    assert os.path.isfile(input_file), msg.format(input_file)

    datasets_params = Params(datasets_params_file)
    word_vocab = Vocabulary(os.path.join(data_dir, words_txt))
    tag_vocab = Vocabulary(os.path.join(data_dir, tags_txt))
    unk_word = datasets_params.unk_word

    params = Params(os.path.join(model_dir, params_filename))
    params.update(datasets_params)
    params.set('cuda', torch.cuda.is_available())

    # restore model from the checkpoint
    model, *others = model_factory(params)
    Serialization(model_dir).restore(model, checkpoint=checkpoint)

    # predict
    predict(model, word_vocab, tag_vocab, unk_word, input_file, output_file,
            encoding, params.cuda)

    print("It's done! Please check the output file:")
    print(output_file)
EXPERIMENTS = 30
CS_TOPICS = 17
MONOLINGUAL_TOPICS = 21
MIN_WORD_RANK = 300
MAX_WORD_RANK = 10000
MIN_SENTENCE_LENGTH = 50
NAMED_ENTITIES = [
    'PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
    'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY',
    'QUANTITY', 'ORDINAL', 'CARDINAL'
]

mallet_path = '<path-to-mallet-topic-modeling-dir>mallet-2.0.8/bin/mallet'
current_mode = 'monolingual'  # cs

if __name__ == '__main__':

    DataProcessing.test_true_casing()
    DataProcessing.clean_and_prepare_data()
    common_users = Serialization.load_obj('common.users')

    filename = 'data/' + current_mode + '_corpus_clean.csv'
    Utils.substitute_named_entities(filename, common_users)
    filename = 'data/' + current_mode + '_corpus_clean_tc_ne.csv'
    Utils.lemmatization_and_pos_filter(filename, common_users)

    Utils.topical_differences_sig_analysis()

# end if